Source code for data_juicer.ops.mapper.clean_copyright_mapper
# Some code here has been modified from:
# https://github.com/togethercomputer/RedPajama-Data/tree/rp_v1/
# --------------------------------------------------------

import regex as re

from ..base_op import OPERATORS, Mapper
@OPERATORS.register_module('clean_copyright_mapper')
class CleanCopyrightMapper(Mapper):
    """Mapper to clean copyright comments at the beginning of the text
    samples."""

    _batched_op = True

    def __init__(self, *args, **kwargs):
        """
        Initialization method.

        :param args: extra args
        :param kwargs: extra args
        """
        super().__init__(*args, **kwargs)
        # Matches one C-style block comment: '/*' ... '*/'.
        self.pat = re.compile('/\\*[^*]*\\*+(?:[^/*][^*]*\\*+)*/')
        # Case-insensitive probe for the word "copyright".
        self.cpat = re.compile('copyright', re.IGNORECASE)

    def _process_single_sample(self, sample):
        """
        Remove a copyright header from a single text.

        If the text contains a ``/* ... */`` block comment, the first one
        is removed when it mentions "copyright" and the text is returned
        without any further processing. Otherwise the leading run of line
        comments (``//``, ``#``, ``--``) and empty lines is dropped.

        :param sample: the text of one sample
        :return: the text with the detected header removed, or the
            unchanged text when nothing matched
        """
        block = self.pat.search(sample)
        if block:
            # Found a block comment; cut it only if it contains
            # "copyright". Either way the line-comment pass is skipped.
            if self.cpat.search(block.group()):
                start, end = block.span()
                sample = sample[:start] + sample[end:]
            return sample

        lines = sample.split('\n')

        # Greedily skip everything up to the first line that is neither
        # empty nor a line comment; most such leading blocks are
        # copyright headers. If no such line exists, skip all lines.
        skip = next(
            (idx for idx, line in enumerate(lines)
             if line and not line.startswith(('//', '#', '--'))),
            len(lines),
        )

        if skip:
            # we skipped, consume it
            sample = '\n'.join(lines[skip:])
        return sample