@OPERATORS.register_module("remove_repeat_sentences_mapper")
class RemoveRepeatSentencesMapper(Mapper):
    """Mapper to remove repeat sentences in text samples."""

    # NOTE(review): presumably signals to the Mapper base class that this
    # operator works on batches of samples -- confirm against Mapper.
    _batched_op = True

    def __init__(
        self,
        lowercase: bool = False,
        ignore_special_character: bool = True,
        min_repeat_sentence_length: int = 2,
        *args,
        **kwargs,
    ):
        """
        Initialization method.

        :param lowercase: Whether to convert sample text to lower case.
        :param ignore_special_character: Whether to ignore special
            characters when judging repeated sentences. Special
            characters are all characters except Chinese characters
            (U+4E00 to U+9FA5), ASCII letters, digits, and the
            whitespace characters space, tab and newline.
        :param min_repeat_sentence_length: Sentences shorter than this
            length will not be deduplicated. If
            ignore_special_character is set to True, then special
            characters are not included in this length.
        :param args: extra args
        :param kwargs: extra args
        """
        super().__init__(*args, **kwargs)
        self.lowercase = lowercase
        self.min_repeat_sentence_length = min_repeat_sentence_length
        # Compiled once here so it is not rebuilt on every use. The pattern
        # matches each single "special" character; it is None when special
        # characters should be kept, so users of this attribute must check
        # for None before applying it.
        self.remove_regex = (
            re.compile(r"[^a-zA-Z0-9\u4e00-\u9fa5\n\t ]")
            if ignore_special_character
            else None
        )