Source code for data_juicer.ops.mapper.remove_bibliography_mapper
# Some code here has been modified from:
# https://github.com/togethercomputer/RedPajama-Data/tree/rp_v1/
# --------------------------------------------------------

import regex as re

from ..base_op import OPERATORS, Mapper
@OPERATORS.register_module("remove_bibliography_mapper")
class RemoveBibliographyMapper(Mapper):
    """Mapper that strips the trailing bibliography from LaTeX documents.

    A regular expression recognises the commands that commonly open a
    bibliography section -- \\appendix, \\begin{references},
    \\begin{REFERENCES}, \\begin{thebibliography} and \\bibliography{...} --
    and the matched section is removed from the text. Samples are handled
    in batch mode for efficiency.
    """

    # Marks this operator as working on batches of samples.
    _batched_op = True

    def __init__(self, *args, **kwargs):
        """
        Initialization method.

        :param args: extra args
        :param kwargs: extra args
        """
        super().__init__(*args, **kwargs)

        # LaTeX commands that open a bibliography-like section. Each entry
        # is already a regex fragment (backslashes and braces are escaped).
        section_openers = (
            r"\\appendix",
            r"\\begin\{references\}",
            r"\\begin\{REFERENCES\}",
            r"\\begin\{thebibliography\}",
            r"\\bibliography\{.*\}",
        )

        # A single capture group holding every opener as an alternative,
        # followed by ``.*$`` so the match extends from the opener onwards.
        # Kept as a plain string (not compiled) so users of ``self.pattern``
        # can still supply their own flags.
        self.pattern = "(" + "|".join(section_openers) + ").*$"