@classmethod
def hash_default(cls, value: Any) -> str:
    """Hash ``value`` by way of its dill serialization.

    The object is first dumped to bytes with ``dill.dumps`` (dill is used
    to avoid serialization failures), and those bytes are then handed to
    ``cls.hash_bytes``.

    Args:
        value: The object to hash.

    Returns:
        Whatever ``cls.hash_bytes`` returns for the serialized payload.
    """
    serialized = dill.dumps(value)
    return cls.hash_bytes(serialized)
def _log_fingerprint_hash_failure(subject):
    """Log that ``subject`` could not be hashed and a random hash is used.

    The log level depends on module state:

    * caching disabled -> ``info`` (the failure has no effect on caching);
    * caching enabled, first failure -> ``warning`` with the full
      explanation, and the one-shot flag in ``fingerprint_warnings`` is set;
    * caching enabled, later failures -> short ``info`` message.

    Args:
        subject: Human-readable description of what failed to hash, used as
            the beginning of the log message.
    """
    base = (f"{subject} couldn't be hashed properly, "
            "a random hash was used instead.")

    if not _CACHING_ENABLED:
        logger.info(
            f"{base} This doesn't affect caching since it's disabled.")
        return

    if fingerprint_warnings.get('update_fingerprint_transform_hash_failed',
                                False):
        # The detailed warning was already emitted once; stay quiet.
        logger.info(base)
        return

    # Adjacent literals are concatenated by the parser, so no indentation
    # whitespace leaks into the emitted message.
    logger.warning(
        f"{base} Make sure your transforms and parameters are "
        "serializable with pickle or dill for the dataset fingerprinting "
        "and caching to work. If you reuse this transform, the caching "
        "mechanism will consider it to be different from the previous "
        "calls and recompute everything. This warning is only showed "
        "once. Subsequent hashing failures won't be showed.")
    fingerprint_warnings['update_fingerprint_transform_hash_failed'] = True


def update_fingerprint(fingerprint, transform, transform_args):
    """Combine a fingerprint, a transform and its arguments into a new hash.

    A fresh ``Hasher`` is fed, in order: ``fingerprint``, ``transform``,
    and then every key of ``transform_args`` (in sorted order) followed by
    its value.

    If hashing the transform or any argument value raises, the failure is
    logged and a random fingerprint is returned instead.

    Args:
        fingerprint: The current fingerprint to start from.
        transform: The transform to mix into the hash.
        transform_args: Mapping of argument name to argument value.

    Returns:
        ``hasher.hexdigest()`` on success, otherwise the result of
        ``generate_random_fingerprint()``.
    """
    hasher = Hasher()
    hasher.update(fingerprint)

    try:
        hasher.update(transform)
    except Exception:
        # Various errors might be raised here by pickle or dill.
        # ``Exception`` (rather than a bare ``except``) lets
        # KeyboardInterrupt / SystemExit propagate instead of being
        # silently turned into a random fingerprint.
        _log_fingerprint_hash_failure(f"Transform {transform}")
        return generate_random_fingerprint()

    for key in sorted(transform_args):
        hasher.update(key)
        try:
            hasher.update(transform_args[key])
        except Exception:
            # Same rationale as above: only swallow ordinary errors.
            _log_fingerprint_hash_failure(
                f"Parameter '{key}'={transform_args[key]} "
                f"of the transform {transform}")
            return generate_random_fingerprint()

    return hasher.hexdigest()
def generate_fingerprint(ds, *args, **kwargs):
    """Derive a new fingerprint for ``ds`` from the supplied arguments.

    The hashed arguments are ``ds`` itself (under ``'shard'``), the first
    positional argument if any (under ``'function'``; further positional
    arguments are not used), and all keyword arguments. They are formatted
    against ``ds._map_single`` and combined with ``ds._fingerprint``.

    Args:
        ds: Dataset object exposing ``_map_single`` and ``_fingerprint``.
        *args: Optional positional arguments; only ``args[0]`` is read.
        **kwargs: Extra entries merged into the hashed arguments.

    Returns:
        The new fingerprint, after it has been passed through
        ``validate_fingerprint``.
    """
    # Start from the shard and add the function only when one was given.
    dataset_kwargs = {'shard': ds}
    if args:
        dataset_kwargs['function'] = args[0]
    dataset_kwargs.update(kwargs)

    # Build a unique hash from the function, the current dataset file
    # and the mapping args.
    transform = format_transform_for_fingerprint(ds._map_single)
    fingerprint_kwargs = format_kwargs_for_fingerprint(
        ds._map_single, (), dataset_kwargs)
    fingerprint_kwargs['fingerprint_name'] = 'new_fingerprint'

    result = update_fingerprint(ds._fingerprint, transform,
                                fingerprint_kwargs)
    validate_fingerprint(result)
    return result