Source code for data_juicer.ops.mapper.expand_macro_mapper
# Some code here has been modified from:# https://github.com/togethercomputer/RedPajama-Data/blob/rp_v1/data_prep/arxiv/arxiv_cleaner.py# --------------------------------------------------------importregexasrefrom..base_opimportOPERATORS,Mapper
@OPERATORS.register_module('expand_macro_mapper')
class ExpandMacroMapper(Mapper):
    """Mapper to expand macro definitions in the document body of Latex
    samples.

    Only argument-less macros declared with ``\\newcommand``,
    ``\\newcommand*`` or ``\\def`` are handled. Each definition has to sit
    on a single line that ends with its closing brace.
    """

    # NOTE(review): presumably tells the base class to feed whole batches to
    # `process_batched` -- confirm against `Mapper`.
    _batched_op = True

    # Matches argument-less \newcommand definitions:
    #     \newcommand{\macro_name}{macro_value}
    #     \newcommand*{\macro_name}{macro_value}
    # macro_name may only contain letters and digits; macro_value may
    # contain any character except a newline.
    _NON_ARG_NEWCOMMAND_RE = re.compile(
        pattern=r'\\\bnewcommand\b\*?\{(\\[a-zA-Z0-9]+?)\}\{(.*?)\}$',
        flags=re.MULTILINE)

    # Matches argument-less \def definitions:
    #     \def\macro_name{macro_value}
    # with the same restrictions on macro_name and macro_value as above.
    _NON_ARG_DEF_RE = re.compile(
        pattern=r'\\def\s*(\\[a-zA-Z0-9]+?)\s*\{(.*?)\}$',
        flags=re.MULTILINE)

    def __init__(self, *args, **kwargs):
        """
        Initialization method.

        :param args: extra args
        :param kwargs: extra args
        """
        super().__init__(*args, **kwargs)

    def _build_non_arg_macros_dict(self, file_content):
        """
        Collect all argument-less macro definitions found in a document.

        :param file_content: text of the Latex document to scan
        :return: dict mapping each macro name (leading backslash included)
            to its value. Both are passed through the ``unicode-escape``
            codec, which doubles every backslash, so the name can be used
            directly as a regex pattern and the value as an ``re.sub``
            replacement template. If a name is defined more than once, the
            last definition scanned wins (``\\newcommand`` definitions are
            scanned before ``\\def`` definitions).
        """
        macros = {}
        for reg in (self._NON_ARG_NEWCOMMAND_RE, self._NON_ARG_DEF_RE):
            for match in reg.finditer(file_content):
                # convert the macro name and value to a raw string that can
                # be used in re.sub
                macro_name = match.group(1).encode('unicode-escape').decode(
                    'utf-8')
                macro_val = match.group(2).encode('unicode-escape').decode(
                    'utf-8')
                macros[macro_name] = macro_val
        return macros

    def process_batched(self, samples):
        """
        Inline-expand the argument-less macros of every text in the batch.

        The macros are collected from each text itself and substituted
        everywhere in that same text, which includes the occurrence of the
        macro name inside its own definition line.

        :param samples: batch of samples; the texts stored under
            ``self.text_key`` are rewritten in place
        :return: the same ``samples`` object with expanded texts
        """
        for idx, text in enumerate(samples[self.text_key]):
            non_arg_macros = self._build_non_arg_macros_dict(text)

            # inline-expand all non-arg macros
            for macro_name, macro_value in non_arg_macros.items():
                text = re.sub(
                    # The negative lookahead makes sure the macro is not the
                    # prefix of a longer alphanumeric name, without consuming
                    # the following character. A consuming group would miss
                    # a macro at the very end of the text, as well as the
                    # second of two adjacent uses such as `\foo\foo`.
                    pattern=macro_name + r'(?![a-zA-Z0-9])',
                    repl=macro_value,
                    string=text)

            # TODO: macros that take arguments are not supported yet

            samples[self.text_key][idx] = text

        return samples