Source code for data_juicer.ops.mapper.clean_html_mapper
# Some code here has been modified from:
# https://github.com/togethercomputer/RedPajama-Data/tree/rp_v1/
# --------------------------------------------------------

from data_juicer.utils.lazy_loader import LazyLoader

from ..base_op import OPERATORS, Mapper

# `selectolax` is bound through LazyLoader rather than a plain import
# (presumably so the package is only loaded on first use -- confirm against
# LazyLoader's implementation).
selectolax = LazyLoader('selectolax')

# Name passed to OPERATORS.register_module when registering the operator.
OP_NAME = 'clean_html_mapper'
@OPERATORS.register_module(OP_NAME)
class CleanHtmlMapper(Mapper):
    """Mapper to clean html code in text samples."""

    # NOTE(review): presumably tells the framework that this operator works on
    # batches of samples rather than single samples -- confirm against Mapper.
    _batched_op = True

    def __init__(self, *args, **kwargs):
        """
        Initialization method.

        All arguments are forwarded unchanged to the parent ``Mapper``
        initializer; this class adds no parameters of its own.

        :param args: extra args
        :param kwargs: extra args
        """
        super().__init__(*args, **kwargs)