@OPERATORS.register_module(OP_NAME)
class RayDocumentDeduplicator(RayBasicDeduplicator):
    """
    Deduplicator to deduplicate samples at document-level using exact matching.
    """

    def __init__(
        self,
        backend: str = "ray_actor",
        redis_address: str = "redis://localhost:6379",
        lowercase: bool = False,
        ignore_non_character: bool = False,
        *args,
        **kwargs,
    ):
        """
        Initialization method.

        :param backend: the backend for dedup, either 'ray_actor' or 'redis'
        :param redis_address: the address of redis server
        :param lowercase: Whether to convert sample text to lower case
        :param ignore_non_character: Whether to ignore non-alphabet
            characters, including whitespaces, digits, and punctuations
            (punctuation here means the ASCII set in ``string.punctuation``)
        :param args: extra args
        :param kwargs: extra args.
        """
        # `*args` is written first because Python always binds unpacked
        # positionals before keyword arguments, wherever they appear in the
        # call; this ordering just makes the actual binding order visible.
        super().__init__(
            *args,
            backend=backend,
            redis_address=redis_address,
            **kwargs,
        )
        self.lowercase = lowercase
        # Raw f-string: `\s` and `\d` must reach the regex engine untouched.
        # In a non-raw literal they are invalid string escapes, which newer
        # Python versions report as a SyntaxWarning. The resulting pattern
        # text is unchanged. The attribute stays None when stripping is
        # disabled.
        self.remove_non_character_regex = (
            re.compile(rf"\s+|\d+|[{re.escape(string.punctuation)}]")
            if ignore_non_character
            else None
        )