@OPERATORS.register_module(OP_NAME)
@LOADED_IMAGES.register_module(OP_NAME)
class RayImageDeduplicator(RayBasicDeduplicator):
    """Deduplicates samples at the document level using exact matching of
    images in Ray distributed mode.

    This operator uses a specified hash method to compute image hashes and
    identifies duplicates by comparing these hashes. It operates in Ray
    distributed mode, supporting 'ray_actor' or 'redis' backends for
    deduplication. The hash method can be set during initialization, with
    supported methods listed in `HASH_METHOD`. If a sample does not contain
    an image, it is assigned an empty hash value. The operator loads images
    from the specified keys and computes their combined hash for comparison.
    """

    def __init__(
        self,
        backend: str = "ray_actor",
        redis_address: str = "redis://localhost:6379",
        method: str = "phash",
        *args,
        **kwargs,
    ):
        """
        Initialization.

        :param backend: the backend for dedup, either 'ray_actor' or 'redis'
        :param redis_address: the address of redis server
        :param method: the hash method to use; must be one of the entries
            in `HASH_METHOD`
        :param args: extra args
        :param kwargs: extra args
        :raises ValueError: if `method` is not listed in `HASH_METHOD`
        """
        # Validate first so that an unsupported hash method is rejected
        # before the parent initializer runs.
        if method not in HASH_METHOD:
            raise ValueError(
                f"Hash method [{method}] is not supported. "
                f"Can only be one of {HASH_METHOD}."
            )

        # NOTE(review): Python binds the unpacked ``*args`` positionally
        # before the keyword arguments, so any extra positional argument
        # would land on the parent's leading parameters -- confirm against
        # the signature of RayBasicDeduplicator.__init__.
        super().__init__(
            backend=backend, redis_address=redis_address, *args, **kwargs
        )

        # get_hash_method(method) returns a callable; calling it produces
        # the hasher object stored on this operator.
        self.hasher = get_hash_method(method)()