Source code for data_juicer.ops.mapper.expand_macro_mapper

# Some code here has been modified from:
# https://github.com/togethercomputer/RedPajama-Data/blob/rp_v1/data_prep/arxiv/arxiv_cleaner.py
# --------------------------------------------------------

import regex as re

from ..base_op import OPERATORS, Mapper


@OPERATORS.register_module('expand_macro_mapper')
class ExpandMacroMapper(Mapper):
    """Mapper to expand macro definitions in the document body of Latex
    samples.

    Only macros without arguments are expanded. Their definitions are
    looked up in the text of each sample itself.
    """

    _batched_op = True

    # regex for extracting \newcommand macros without arguments.
    # This regex matches the following:
    #   \newcommand{\macro_name}{macro_value}
    #   \newcommand*{\macro_name}{macro_value}
    # where macro_name is only allowed to contain letters and numbers.
    # macro_value must sit on a single line ('.' does not match a newline)
    # and its closing brace must be the last character of that line.
    _NON_ARG_NEWCOMMAND_REG = re.compile(
        pattern=r'\\\bnewcommand\b\*?\{(\\[a-zA-Z0-9]+?)\}\{(.*?)\}$',
        flags=re.MULTILINE)

    # regex for extracting \def macros without arguments.
    # This regex matches the following:
    #   \def\macro_name{macro_value}
    # with the same restrictions on macro_name and macro_value as above.
    _NON_ARG_DEF_REG = re.compile(
        pattern=r'\\def\s*(\\[a-zA-Z0-9]+?)\s*\{(.*?)\}$',
        flags=re.MULTILINE)

    def __init__(self, *args, **kwargs):
        """
        Initialization method.

        :param args: extra args
        :param kwargs: extra args
        """
        super().__init__(*args, **kwargs)

    def _build_non_arg_macros_dict(self, file_content):
        """
        Collect all argument-free macros defined in a Latex source.

        :param file_content: Latex source text to scan for definitions
        :return: dict mapping each macro name (with its leading
            backslash) to its macro value. Both are escaped with the
            'unicode-escape' codec so that they can be used as the
            pattern and the replacement template of re.sub. If a name
            is defined several times, the definition processed last is
            kept; def-style definitions are processed after
            newcommand-style ones.
        """
        macros = {}
        for reg in (self._NON_ARG_NEWCOMMAND_REG, self._NON_ARG_DEF_REG):
            for match in reg.finditer(file_content):
                # convert the macro name and value to a raw string that
                # can be used in re.sub
                macro_name = match.group(1).encode('unicode-escape').decode(
                    'utf-8')
                macro_val = match.group(2).encode('unicode-escape').decode(
                    'utf-8')
                macros[macro_name] = macro_val
        return macros

    def process_batched(self, samples):
        """
        Expand argument-free macros in every text of a batch.

        :param samples: batch of samples; the texts stored under
            ``self.text_key`` are rewritten in place
        :return: the same ``samples`` object, with macros expanded
        """
        for idx, text in enumerate(samples[self.text_key]):
            non_arg_macros = self._build_non_arg_macros_dict(text)

            # TODO: macros that take arguments are not supported yet

            # inline-expand all non-arg macros
            # NOTE: the macro name inside its own definition line is
            # followed by a non-alphanumeric character as well, so it is
            # replaced there too.
            for macro_name, macro_value in non_arg_macros.items():
                text = re.sub(
                    # The negative lookahead makes sure that the macro is
                    # not part of a longer alphanumeric word. Because it
                    # consumes nothing, a macro at the very end of the
                    # text and a macro directly followed by another
                    # backslash command (e.g. itself) are expanded too.
                    pattern=macro_name + r'(?![a-zA-Z0-9])',
                    repl=macro_value,
                    string=text)

            samples[self.text_key][idx] = text

        return samples