Source code for data_juicer.ops.mapper.remove_non_chinese_character_mapper

import regex as re

from ..base_op import OPERATORS, Mapper


@OPERATORS.register_module('remove_non_chinese_character_mapper')
class RemoveNonChineseCharacterlMapper(Mapper):
    """Mapper that deletes characters outside a configurable keep-list.

    Characters in the range U+4E00 through U+9FA5 are always kept. ASCII
    letters, ASCII digits and a fixed list of punctuation marks can each
    be kept as well, depending on the constructor flags. Everything else
    is removed from the text.
    """

    _batched_op = True

    def __init__(self,
                 keep_alphabet: bool = True,
                 keep_number: bool = True,
                 keep_punc: bool = True,
                 *args,
                 **kwargs):
        """
        Build the negated character class used to strip unwanted text.

        :param keep_alphabet: whether to keep the letters A-Z and a-z
        :param keep_number: whether to keep the digits 0-9
        :param keep_punc: whether to keep the listed punctuation marks
        :param args: extra positional args forwarded to the parent class
        :param kwargs: extra keyword args forwarded to the parent class
        """
        super().__init__(*args, **kwargs)

        # The pattern is one negated set: whatever is listed inside it
        # survives, anything else matches and gets deleted.
        fragments = [u'[^\u4e00-\u9fa5']
        if keep_alphabet:
            fragments.append(u'A-Za-z')
        if keep_number:
            fragments.append(u'0-9')

        # Both closing fragments end the set; the punctuation variant
        # additionally carries a trailing ``+`` quantifier.
        closing = u'., ,\\-。%《*》/•、&&(—)(+):?!!“”·]+' if keep_punc else u']'
        fragments.append(closing)

        self.pattern = ''.join(fragments)

    def process_batched(self, samples):
        """
        Remove every match of ``self.pattern`` from the batch's texts.

        Entries that contain no match are left untouched (they are not
        reassigned).

        :param samples: batch mapping whose ``self.text_key`` entry is an
            indexable collection of strings; it is modified in place
        :return: the same ``samples`` object
        """
        unwanted = self.pattern
        for position, content in enumerate(samples[self.text_key]):
            if re.search(unwanted, content, flags=re.DOTALL):
                samples[self.text_key][position] = re.sub(
                    unwanted, '', content, flags=re.DOTALL)
        return samples