Source code for data_juicer.ops.mapper.remove_bibliography_mapper

# Some code here has been modified from:
# https://github.com/togethercomputer/RedPajama-Data/tree/rp_v1/
# --------------------------------------------------------

import regex as re

from ..base_op import OPERATORS, Mapper


@OPERATORS.register_module('remove_bibliography_mapper')
class RemoveBibliographyMapper(Mapper):
    """Mapper that strips the trailing bibliography from Latex samples.

    Everything from the first bibliography/appendix marker up to the end
    of the text is deleted.
    """

    _batched_op = True

    def __init__(self, *args, **kwargs):
        """
        Initialization method.

        :param args: extra args, forwarded unchanged to the base class
        :param kwargs: extra args, forwarded unchanged to the base class
        """
        super().__init__(*args, **kwargs)

        # Latex commands that mark where the part to drop begins.
        start_markers = (
            r'\\appendix',
            r'\\begin\{references\}',
            r'\\begin\{REFERENCES\}',
            r'\\begin\{thebibliography\}',
            r'\\bibliography\{.*\}',
        )
        # One alternation group followed by `.*$`, so a match extends from
        # the marker to the end of the text.
        self.pattern = '(' + '|'.join(start_markers) + r').*$'

    def process_batched(self, samples):
        """
        Remove the bibliography part from every text of a batch.

        :param samples: batch container; the entry stored under
            ``self.text_key`` is iterated as a sequence of texts
            (``text_key`` is not set in this class -- presumably it is
            provided by the ``Mapper`` base)
        :return: the same ``samples`` object, with the entry under
            ``self.text_key`` replaced by a list of the processed texts
        """
        stripped_texts = []
        for text in samples[self.text_key]:
            # DOTALL lets `.` match newlines, so the trailing `.*$` consumes
            # all remaining lines after the first marker.
            stripped_texts.append(
                re.sub(self.pattern, r'', text, flags=re.DOTALL))
        samples[self.text_key] = stripped_texts
        return samples