Source code for data_juicer.ops.mapper.expand_macro_mapper
# Some code here has been modified from:# https://github.com/togethercomputer/RedPajama-Data/blob/rp_v1/data_prep/arxiv/arxiv_cleaner.py# --------------------------------------------------------importregexasrefrom..base_opimportOPERATORS,Mapper
@OPERATORS.register_module("expand_macro_mapper")
class ExpandMacroMapper(Mapper):
    """Expands macro definitions in the document body of LaTeX samples.

    This operator processes LaTeX documents to expand user-defined macros in
    the text. It supports \\newcommand and \\def macros without arguments.
    Macros are identified and expanded in the text, ensuring they are not part
    of longer alphanumeric words. The operator currently does not support
    macros with arguments. The processed text is updated in the samples.
    """

    _batched_op = True

    # Matches argument-less \newcommand definitions that close at end of line:
    #   \newcommand{\macro_name}{macro_value}
    #   \newcommand*{\macro_name}{macro_value}
    # macro_name may only contain letters and digits; macro_value may contain
    # any character except a newline.
    _NON_ARG_NEWCOMMAND_RE = re.compile(
        pattern=r"\\\bnewcommand\b\*?\{(\\[a-zA-Z0-9]+?)\}\{(.*?)\}$",
        flags=re.MULTILINE,
    )

    # Matches argument-less \def definitions that close at end of line:
    #   \def\macro_name{macro_value}
    # with the same restrictions on macro_name / macro_value as above.
    _NON_ARG_DEF_RE = re.compile(
        pattern=r"\\def\s*(\\[a-zA-Z0-9]+?)\s*\{(.*?)\}$",
        flags=re.MULTILINE,
    )

    def __init__(self, *args, **kwargs):
        """
        Initialization method.

        :param args: extra args
        :param kwargs: extra args
        """
        super().__init__(*args, **kwargs)

    def _build_non_arg_macros_dict(self, file_content):
        """
        Collect the argument-less macros defined in a LaTeX document.

        :param file_content: full text of the LaTeX document
        :return: dict mapping each macro name to its value. Both are
            backslash-escaped (via ``unicode-escape``) so that they can be
            embedded directly in a regex pattern / replacement template.
            When a name is defined more than once, the last definition
            found wins, and any ``def`` definition overrides a
            ``newcommand`` one because ``def`` is scanned second.
        """
        macros = {}
        for definition_re in (self._NON_ARG_NEWCOMMAND_RE, self._NON_ARG_DEF_RE):
            for match in definition_re.finditer(file_content):
                # convert the macro name and value to a raw string that can
                # be used in re.sub
                macro_name = match.group(1).encode("unicode-escape").decode("utf-8")
                macro_val = match.group(2).encode("unicode-escape").decode("utf-8")
                macros[macro_name] = macro_val
        return macros

    def process_batched(self, samples):
        """
        Expand all argument-less macros in every text of the batch.

        :param samples: batch of samples; the list stored under
            ``self.text_key`` is updated in place
        :return: the same ``samples`` object with expanded texts
        """
        for idx, text in enumerate(samples[self.text_key]):
            non_arg_macros = self._build_non_arg_macros_dict(text)

            # inline-expand all non-arg macros
            # NOTE: the substitution runs over the whole text, so the macro
            # name inside its own definition line is rewritten as well.
            for macro_name, macro_value in non_arg_macros.items():
                text = re.sub(
                    # The negative lookahead makes sure the macro is not a
                    # prefix of a longer alphanumeric name, without consuming
                    # the following character. Consuming it would skip a
                    # macro at the very end of the text and would swallow
                    # the backslash of a directly adjacent occurrence.
                    pattern=r"(" + macro_name + r")(?![a-zA-Z0-9])",
                    repl=macro_value,
                    string=text,
                )

            # TODO: macros that take arguments are not supported yet

            samples[self.text_key][idx] = text

        return samples