# Some code here has been modified from:
# https://github.com/togethercomputer/RedPajama-Data/tree/rp_v1/
# --------------------------------------------------------

import regex as re

from ..base_op import OPERATORS, Mapper


@OPERATORS.register_module("clean_copyright_mapper")
class CleanCopyrightMapper(Mapper):
    """Cleans copyright comments at the beginning of text samples.

    This operator removes copyright comments from text samples in one of
    two mutually exclusive ways:

    1. If the text contains a C-style block comment (``/* ... */``), only
       the first such comment is inspected. It is cut out when it contains
       the word "copyright" (case-insensitive); otherwise the text is
       returned unchanged. No line-comment stripping happens in this case.
    2. If the text contains no block comment at all, the leading run of
       lines that are empty or start with ``//``, ``#`` or ``--`` is
       greedily removed, as these are often part of copyright headers.
    """

    # Marks the operator as batch-capable. Presumably consumed by the
    # ``Mapper`` base class -- confirm against ``base_op``.
    _batched_op = True

    def __init__(self, *args, **kwargs):
        """
        Initialization method.

        :param args: extra args
        :param kwargs: extra args
        """
        super().__init__(*args, **kwargs)
        # Matches one C-style block comment: "/*", then any text that does
        # not close the comment, then the terminating "*/".
        self.pat = re.compile(r"/\*[^*]*\*+(?:[^/*][^*]*\*+)*/")
        # Case-insensitive probe used to decide whether a matched block
        # comment is a copyright notice.
        self.cpat = re.compile("copyright", re.IGNORECASE)

    def _process_single_sample(self, sample):
        """
        Strip a copyright header from a single text.

        :param sample: the text to clean
        :return: the text with the copyright block comment removed, or
            with its leading comment/blank lines removed, or unchanged
            when neither rule applies
        """
        block = self.pat.search(sample)
        if block:
            # A block comment exists: drop it only if it mentions
            # "copyright". Either way we stop here, so the line-comment
            # pass below never runs for texts containing "/* ... */".
            if self.cpat.search(block.group()):
                start, end = block.span()
                sample = sample[:start] + sample[end:]
            return sample

        lines = sample.split("\n")

        # Greedily count the leading lines that are blank or start with a
        # line-comment marker; most such prefixes are copyright headers.
        skip = 0
        for line in lines:
            if line and not line.startswith(("//", "#", "--")):
                break
            skip += 1

        if skip:
            # Something was skipped, so rebuild the text without it.
            sample = "\n".join(lines[skip:])
        return sample