Source code for data_juicer.ops.mapper.remove_specific_chars_mapper

from typing import List, Union

import regex as re

from ..base_op import OPERATORS, Mapper


@OPERATORS.register_module('remove_specific_chars_mapper')
class RemoveSpecificCharsMapper(Mapper):
    """Mapper to clean specific chars in text samples."""

    # Declares that this operator works on batches via ``process_batched``.
    _batched_op = True

    def __init__(self,
                 chars_to_remove: Union[str, List[str]] = '◆●■►▼▲▴∆▻▷❖♡□',
                 *args,
                 **kwargs):
        """
        Initialization method.

        :param chars_to_remove: a list or a string including all
            characters that need to be removed from text. Every
            character of every entry is removed individually. Regex
            metacharacters (e.g. ``]``, ``^``, ``-``, ``\\``) are taken
            literally, and ``|`` is only removed when it is explicitly
            listed here.
        :param args: extra args
        :param kwargs: extra args
        """
        super().__init__(*args, **kwargs)
        # ``self.pattern`` is a regex character class (as a string), or
        # None when there is nothing to remove.
        self.pattern = self._build_pattern(chars_to_remove)

    @staticmethod
    def _build_pattern(chars_to_remove):
        """Build a character-class regex matching any requested char.

        :param chars_to_remove: a string, or a list of strings, holding
            the characters to match. May be empty or None.
        :return: the pattern string, or None if no characters remain.
        """
        if not chars_to_remove:
            return None
        # ``''.join`` flattens both a plain string and a list of strings
        # into one sequence of characters; ``dict.fromkeys`` drops
        # duplicates while keeping the original order.
        unique_chars = dict.fromkeys(''.join(chars_to_remove))
        if not unique_chars:
            # e.g. [''] -- truthy container but no actual characters.
            return None
        # Escape each character so that ']', '\\', '^' and '-' cannot
        # close, negate or turn the class into a range. No '|' separator
        # is used: inside '[...]' it would be a literal and would cause
        # '|' to be stripped from the text unintentionally.
        return '[' + ''.join(re.escape(ch) for ch in unique_chars) + ']'

    def process_batched(self, samples):
        """Remove the configured characters from every text in the batch.

        :param samples: a batch of samples; the list stored under
            ``self.text_key`` is replaced by the cleaned texts.
            NOTE(review): ``text_key`` is presumably set by the
            ``Mapper`` base class -- confirm.
        :return: the same ``samples`` object, updated in place.
        """
        if self.pattern is None:
            return samples

        samples[self.text_key] = [
            re.sub(pattern=self.pattern,
                   repl=r'',
                   string=text,
                   flags=re.DOTALL) for text in samples[self.text_key]
        ]
        return samples