Source code for data_juicer.ops.mapper.remove_comments_mapper

# Some code here has been modified from:
# https://github.com/togethercomputer/RedPajama-Data/tree/rp_v1/
# --------------------------------------------------------

from typing import List, Union

import regex as re

from ..base_op import OPERATORS, Mapper


@OPERATORS.register_module('remove_comments_mapper')
class RemoveCommentsMapper(Mapper):
    """
    Mapper to remove comments in different kinds of documents.

    Only support 'tex' for now.
    """

    _batched_op = True

    # An unescaped '%' that follows some character on the same logical
    # position, up to the end of the line. The character before '%' is only
    # consumed when it is whitespace (including the newline that precedes a
    # whole-line comment, so no blank line -- a paragraph break in TeX -- is
    # left behind). A non-whitespace character before '%' is asserted with a
    # lookbehind and therefore kept: 'x=5% note' becomes 'x=5', not 'x='.
    # '.+' (not '.*') deliberately leaves a bare trailing '%' untouched.
    _INLINE_COMMENT = re.compile(r'(?:\s|(?<=[^\\\s]))%.+$',
                                 flags=re.MULTILINE)

    # A line that starts with '%', removed together with its line break.
    _FULL_LINE_COMMENT = re.compile(r'^%.*\n?', flags=re.MULTILINE)

    def __init__(self,
                 doc_type: Union[str, List[str]] = 'tex',
                 inline: bool = True,
                 multiline: bool = True,
                 *args,
                 **kwargs):
        """
        Initialization method.

        :param doc_type: Type of document to remove comments.
            NOTE: stored but not consulted yet; every sample is treated
            as tex.
        :param inline: Whether to remove inline comments.
        :param multiline: Whether to remove multiline comments.
        :param args: extra args
        :param kwargs: extra args
        """
        super().__init__(*args, **kwargs)
        self.doc_type = doc_type
        self.inline = inline
        self.multiline = multiline

    def process_batched(self, samples):
        """
        Strip tex comments from every text in the batch, in place.

        :param samples: batch indexed by field name; the entry under
            ``self.text_key`` is iterated and each element is rewritten
            (``text_key`` is presumably set by the ``Mapper`` base class).
        :return: the same ``samples`` object with comments removed.
        """
        # TODO: remove different comments by sample type
        texts = samples[self.text_key]
        for idx, text in enumerate(texts):
            if self.inline:
                # remove all inline comments (a '%' not escaped as '\%')
                text = self._INLINE_COMMENT.sub('', text)
            if self.multiline:
                # remove the remaining lines that consist of a comment only
                text = self._FULL_LINE_COMMENT.sub('', text)
            texts[idx] = text
        return samples