@OPERATORS.register_module('remove_repeat_sentences_mapper')
class RemoveRepeatSentencesMapper(Mapper):
    """Mapper to remove repeat sentences in text samples."""

    # NOTE(review): presumably read by the Mapper base class to select
    # batched processing -- confirm against the base class.
    _batched_op = True

    def __init__(self,
                 lowercase: bool = False,
                 ignore_special_character: bool = True,
                 min_repeat_sentence_length: int = 2,
                 *args,
                 **kwargs):
        """
        Initialization method.

        :param lowercase: Whether to convert sample text to lower case
        :param ignore_special_character: Whether to ignore special
            characters when judging repeated sentences. Special characters
            are all characters except Chinese characters, letters and
            numbers.
        :param min_repeat_sentence_length: Sentences shorter than this
            length will not be deduplicated. If ignore_special_character is
            set to True, then special characters are not included in this
            length.
        :param args: extra args
        :param kwargs: extra args
        """
        super().__init__(*args, **kwargs)
        self.lowercase = lowercase
        self.min_repeat_sentence_length = min_repeat_sentence_length
        # Pattern matching every "special" character, i.e. anything that is
        # not an ASCII letter, a digit, a character in U+4E00-U+9FA5, a
        # newline, a tab or a space. Left as None when special characters
        # should not be ignored.
        self.remove_regex = re.compile(
            r'[^a-zA-Z0-9\u4e00-\u9fa5\n\t ]'
        ) if ignore_special_character else None