Source code for data_juicer.ops.mapper.remove_header_mapper

# Some code here has been modified from:
# https://github.com/togethercomputer/RedPajama-Data/tree/rp_v1/
# --------------------------------------------------------

import regex as re

from ..base_op import OPERATORS, Mapper


@OPERATORS.register_module('remove_header_mapper')
class RemoveHeaderMapper(Mapper):
    """Mapper to remove headers at the beginning of documents in Latex
    samples.

    Everything that precedes the first LaTeX sectioning command
    (``\\chapter``, ``\\part``, ``\\section``, ``\\subsection``,
    ``\\subsubsection``, ``\\paragraph`` or ``\\subparagraph``) is
    stripped; the sectioning command itself and all text after it are
    kept.
    """

    _batched_op = True

    def __init__(self, drop_no_head: bool = True, *args, **kwargs):
        """
        Initialization method.

        :param drop_no_head: whether to drop sample texts without
            headers. When True, a text containing none of the
            sectioning commands is replaced by an empty string; when
            False, it is left unchanged.
        :param args: extra args
        :param kwargs: extra args
        """
        super().__init__(*args, **kwargs)
        # Group 1: the leading text to discard (lazy, anchored at the
        # start of the document). Group 2: the first sectioning command,
        # with optional star, optional [short title] and {title}.
        self.pattern = r'^(.*?)('
        self.pattern += r'\\\bchapter\b\*?(?:\[(.*?)\])?\{(.*?)\}|'
        self.pattern += r'\\\bpart\b\*?(?:\[(.*?)\])?\{(.*?)\}|'
        self.pattern += r'\\\bsection\b\*?(?:\[(.*?)\])?\{(.*?)\}|'
        self.pattern += r'\\\bsubsection\b\*?(?:\[(.*?)\])?\{(.*?)\}|'
        self.pattern += r'\\\bsubsubsection\b\*?(?:\[(.*?)\])?\{(.*?)\}|'
        # The trailing '|' keeps \paragraph and \subparagraph as separate
        # alternatives; without it they fuse into one branch that only
        # matches "\paragraph{..}\subparagraph{..}" back to back.
        self.pattern += r'\\\bparagraph\b\*?(?:\[(.*?)\])?\{(.*?)\}|'
        self.pattern += r'\\\bsubparagraph\b\*?(?:\[(.*?)\])?\{(.*?)\}'
        self.pattern += r')'
        self.drop_no_head = drop_no_head

    def process_batched(self, samples):
        """
        Strip the leading header part from every text in the batch.

        :param samples: batch of samples; the list stored under
            ``self.text_key`` is updated in place.
        :return: the same ``samples`` object with processed texts.
        """
        for idx, text in enumerate(samples[self.text_key]):
            found = re.search(self.pattern, text, flags=re.DOTALL)
            if found is None:
                if self.drop_no_head:
                    # Write the empty text back into the batch; rebinding
                    # only the loop variable would leave the sample
                    # untouched.
                    samples[self.text_key][idx] = ''
                continue

            # The pattern is anchored at the start of the text, so
            # substituting the match with group 2 is the same as keeping
            # everything from the start of group 2 onwards.
            samples[self.text_key][idx] = text[found.start(2):]

        return samples