Source code for data_juicer.ops.mapper.remove_bibliography_mapper
# Some code here has been modified from:# https://github.com/togethercomputer/RedPajama-Data/tree/rp_v1/# --------------------------------------------------------importregexasrefrom..base_opimportOPERATORS,Mapper
@OPERATORS.register_module("remove_bibliography_mapper")
class RemoveBibliographyMapper(Mapper):
    """Mapper that strips the trailing bibliography from LaTeX samples.

    The instance exposes ``self.pattern``, a regular-expression source
    string that starts matching at the first bibliography/appendix marker
    and extends up to the ``$`` anchor.
    """

    # NOTE(review): presumably read by the ``Mapper`` base class to select
    # batched processing -- confirm against ``base_op``.
    _batched_op = True

    def __init__(self, *args, **kwargs):
        """
        Build the bibliography-matching pattern.

        :param args: extra positional args, forwarded to the parent class
        :param kwargs: extra keyword args, forwarded to the parent class
        """
        super().__init__(*args, **kwargs)

        # LaTeX commands that open the section to cut away. Each entry is
        # already a regex fragment (backslashes and braces are escaped).
        section_starts = (
            r"\\appendix",
            r"\\begin\{references\}",
            r"\\begin\{REFERENCES\}",
            r"\\begin\{thebibliography\}",
            r"\\bibliography\{.*\}",
        )

        # One capture group holding all alternatives, followed by ".*$" so
        # the match continues from the marker to the end anchor. How far
        # "." and "$" reach depends on the flags used where the pattern is
        # applied, which is outside this block.
        self.pattern = "(" + "|".join(section_starts) + r").*$"