# Source code for data_juicer.ops.mapper.clean_copyright_mapper

# Some code here has been modified from:
# https://github.com/togethercomputer/RedPajama-Data/tree/rp_v1/
# --------------------------------------------------------

import regex as re

from ..base_op import OPERATORS, Mapper


@OPERATORS.register_module('clean_copyright_mapper')
class CleanCopyrightMapper(Mapper):
    """Mapper to clean copyright comments at the beginning of the text
    samples.

    Two strategies are applied per sample, in order:

    1. If the text contains a C-style ``/* ... */`` block comment, only the
       first such comment is considered; it is cut out when it mentions
       "copyright" (case-insensitive), and the text is returned either way.
    2. Otherwise, the leading run of lines that are empty or start with
       ``//``, ``#`` or ``--`` is dropped.
    """

    _batched_op = True

    def __init__(self, *args, **kwargs):
        """
        Initialization method.

        :param args: extra args
        :param kwargs: extra args
        """
        super().__init__(*args, **kwargs)
        # One complete C-style block comment: "/*", anything, "*/".
        self.pat = re.compile('/\\*[^*]*\\*+(?:[^/*][^*]*\\*+)*/')
        # The word "copyright" in any letter case.
        self.cpat = re.compile('copyright', re.IGNORECASE)

    def _process_single_sample(self, sample):
        """
        Strip a copyright header from a single text.

        :param sample: the text to clean
        :return: the text with the copyright block comment or the leading
            comment lines removed; unchanged when neither is found
        """
        block = self.pat.search(sample)
        if block is not None:
            # A block comment exists somewhere in the text (search, not
            # match). Remove it only when it mentions "copyright"; in both
            # cases the line-based pass below is not attempted.
            start, end = block.span()
            if self.cpat.search(sample[start:end]):
                return sample[:start] + sample[end:]
            return sample

        # No block comment: greedily consume the leading comment/blank
        # lines, since most of those are copyright headers.
        rows = sample.split('\n')
        header_len = 0
        for row in rows:
            if row and not row.startswith(('//', '#', '--')):
                break
            header_len += 1

        if not header_len:
            return sample
        return '\n'.join(rows[header_len:])

    def process_batched(self, samples):
        """
        Clean every text of a batch.

        :param samples: batch mapping; the list stored under
            ``self.text_key`` is replaced by its cleaned counterpart
        :return: the same ``samples`` object, updated
        """
        texts = samples[self.text_key]
        samples[self.text_key] = [
            self._process_single_sample(text) for text in texts
        ]
        return samples