@OPERATORS.register_module(OP_NAME)
class RayDocumentDeduplicator(RayBasicDeduplicator):
    """
    Deduplicator to deduplicate samples at document-level using exact
    matching.
    """

    def __init__(self,
                 backend: str = 'ray_actor',
                 redis_address: str = 'redis://localhost:6379',
                 lowercase: bool = False,
                 ignore_non_character: bool = False,
                 *args,
                 **kwargs):
        """
        Initialization method.

        :param backend: the backend for dedup, either 'ray_actor' or 'redis'
        :param redis_address: the address of redis server
        :param lowercase: Whether to convert sample text to lower case
        :param ignore_non_character: Whether to ignore non-alphabet
            characters, including whitespaces, digits, and punctuations
        :param args: extra args
        :param kwargs: extra args.
        """
        # Positional extras are unpacked before the keyword arguments so the
        # call is written in the same order Python binds the arguments.
        super().__init__(*args,
                         backend=backend,
                         redis_address=redis_address,
                         **kwargs)
        self.lowercase = lowercase

        if ignore_non_character:
            # Raw f-string: `\s` and `\d` reach the regex engine verbatim
            # without relying on invalid string escapes. Matches runs of
            # whitespace, runs of digits, or any single ASCII punctuation
            # character.
            self.remove_non_character_regex = re.compile(
                rf'\s+|\d+|[{re.escape(string.punctuation)}]')
        else:
            # No character filtering requested.
            self.remove_non_character_regex = None