# Source code for data_juicer.ops.mapper.clean_html_mapper
# Some code here has been modified from:
# https://github.com/togethercomputer/RedPajama-Data/tree/rp_v1/
# --------------------------------------------------------

from data_juicer.utils.lazy_loader import LazyLoader

from ..base_op import OPERATORS, Mapper

# selectolax is bound through LazyLoader instead of a plain `import` statement.
selectolax = LazyLoader("selectolax")

# Name under which the operator defined in this module is registered.
OP_NAME = "clean_html_mapper"
@OPERATORS.register_module(OP_NAME)
class CleanHtmlMapper(Mapper):
    """Cleans HTML code from text samples, converting HTML to plain text.

    This operator processes text samples by removing HTML tags and converting
    HTML elements to a more readable format. Specifically, it replaces `<li>`
    and `<ol>` tags with newline and bullet points. The Selectolax HTML parser
    is used to extract the text content from the HTML. This operation is
    performed in a batched manner, making it efficient for large datasets.
    """

    # Class-level flag matching the "batched manner" described in the docstring.
    _batched_op = True

    def __init__(self, *args, **kwargs):
        """
        Initialization method.

        :param args: extra args
        :param kwargs: extra args
        """
        # No operator-specific options: every argument is forwarded unchanged
        # to the parent class initializer.
        super().__init__(*args, **kwargs)