@OPERATORS.register_module('remove_non_chinese_character_mapper')
class RemoveNonChineseCharacterlMapper(Mapper):
    """Mapper to remove non-Chinese characters in text samples.

    The constructor only prepares ``self.pattern``, a regular-expression
    source string for a negated character class. Code that applies the
    pattern is not part of this section (presumably the op's processing
    method consumes it -- confirm against the rest of the class).
    """

    # Class-level flag read by the surrounding framework; judging by the
    # name it marks this op as working on batches -- TODO confirm.
    _batched_op = True

    def __init__(self,
                 keep_alphabet: bool = True,
                 keep_number: bool = True,
                 keep_punc: bool = True,
                 *args,
                 **kwargs):
        """
        Initialization method.

        Assembles ``self.pattern`` as a negated character class that always
        excludes the code points U+4E00 through U+9FA5 from matching, plus
        whichever optional groups the flags below request.

        :param keep_alphabet: whether to keep alphabet (adds ``A-Za-z`` to
            the set of characters the pattern does not match)
        :param keep_number: whether to keep number (adds ``0-9`` to the set
            of characters the pattern does not match)
        :param keep_punc: whether to keep punctuation (adds a fixed list of
            punctuation marks, including a space, to the set of characters
            the pattern does not match)
        :param args: extra args, forwarded to the parent initializer
        :param kwargs: extra args, forwarded to the parent initializer
        """
        super().__init__(*args, **kwargs)

        # Collect the fragments of the character class in order and join
        # them once at the end.
        fragments = [u'[^\u4e00-\u9fa5']
        if keep_alphabet:
            fragments.append(u'A-Za-z')
        if keep_number:
            fragments.append(u'0-9')

        # The final fragment also closes the bracket. Note the asymmetry:
        # with punctuation kept the class carries a ``+`` quantifier (one
        # match covers a whole run), without it the class matches exactly
        # one character per match.
        if keep_punc:
            closing = u'., ,\\-。%《*》/•、&&(—)(+):?!!“”·]+'
        else:
            closing = u']'
        fragments.append(closing)

        self.pattern = ''.join(fragments)