Source code for data_juicer.ops.mapper.expand_macro_mapper
# Some code here has been modified from:# https://github.com/togethercomputer/RedPajama-Data/blob/rp_v1/data_prep/arxiv/arxiv_cleaner.py# --------------------------------------------------------importregexasrefrom..base_opimportOPERATORS,Mapper
@OPERATORS.register_module("expand_macro_mapper")
class ExpandMacroMapper(Mapper):
    r"""Mapper to expand macro definitions in the document body of Latex
    samples.

    Only macros without arguments are handled, i.e. definitions of the form
    ``\newcommand{\name}{value}``, ``\newcommand*{\name}{value}`` and
    ``\def\name{value}`` whose closing brace ends its line. Macros that take
    arguments are not supported yet.
    """

    _batched_op = True

    # The definition patterns are compiled once at class creation instead of
    # once per processed sample.
    #
    # This regex matches the following:
    #   \newcommand{\macro_name}{macro_value}
    #   \newcommand*{\macro_name}{macro_value}
    # where macro_name is only allowed to contain letters and numbers;
    # macro_value can contain any character except a line break.
    _NON_ARG_NEWCOMMAND_RE = re.compile(
        pattern=r"\\\bnewcommand\b\*?\{(\\[a-zA-Z0-9]+?)\}\{(.*?)\}$",
        flags=re.MULTILINE,
    )

    # This regex matches the following:
    #   \def\macro_name{macro_value}
    # where macro_name is only allowed to contain letters and numbers;
    # macro_value can contain any character except a line break.
    _NON_ARG_DEF_RE = re.compile(
        pattern=r"\\def\s*(\\[a-zA-Z0-9]+?)\s*\{(.*?)\}$",
        flags=re.MULTILINE,
    )

    def __init__(self, *args, **kwargs):
        """
        Initialization method.

        :param args: extra args
        :param kwargs: extra args
        """
        super().__init__(*args, **kwargs)

    def _build_non_arg_macros_dict(self, file_content):
        r"""
        Collect the argument-less macro definitions found in a text.

        :param file_content: text to scan for ``\newcommand`` and ``\def``
            definitions
        :return: dict mapping each macro name to its value, both
            ``unicode-escape``-encoded so that the name can be used as a
            regex pattern and the value as a ``re.sub`` replacement
            template. When a name is defined several times the last
            definition wins, and ``\def`` definitions take precedence over
            ``\newcommand`` ones because they are collected second.
        """
        macros = {}
        for reg in (self._NON_ARG_NEWCOMMAND_RE, self._NON_ARG_DEF_RE):
            for match in reg.finditer(file_content):
                # convert the macro name and value to a raw string that can
                # be used in re.sub (backslashes get doubled)
                macro_name = match.group(1).encode("unicode-escape").decode("utf-8")
                macro_val = match.group(2).encode("unicode-escape").decode("utf-8")
                macros[macro_name] = macro_val
        return macros

    def process_batched(self, samples):
        """
        Inline-expand every argument-less macro in each text of the batch.

        :param samples: batch whose ``self.text_key`` entry is an indexable
            sequence of texts; the entries are overwritten in place
        :return: the same ``samples`` object with the expanded texts
        """
        for idx, text in enumerate(samples[self.text_key]):
            non_arg_macros = self._build_non_arg_macros_dict(text)

            # inline-expand all non-arg macros
            for macro_name, macro_value in non_arg_macros.items():
                text = re.sub(
                    # The negative lookahead makes sure that the macro is not
                    # the prefix of a longer alphanumeric name. Because it is
                    # zero-width it does not consume the following character,
                    # so a macro at the very end of the text and a macro
                    # directly followed by another occurrence of itself are
                    # expanded as well.
                    pattern=macro_name + r"(?![a-zA-Z0-9])",
                    repl=macro_value,
                    string=text,
                )

            # TODO: macros that take arguments are not supported yet;
            # inline-expand them here once they are.

            samples[self.text_key][idx] = text

        return samples