Source code for data_juicer.ops.deduplicator.document_deduplicator
# Some code here has been modified from:
# https://github.com/bigscience-workshop/data-preparation/blob/main/preprocessing/training/01a_catalogue_cleaning_and_filtering/clean_helpers/deduplication.py
# --------------------------------------------------------

import hashlib
import string
from collections import defaultdict
from typing import Dict, Set

import regex as re

from data_juicer.utils.constant import HashKeys

from ..base_op import OPERATORS, Deduplicator
@OPERATORS.register_module('document_deduplicator')
class DocumentDeduplicator(Deduplicator):
    """
    Deduplicator to deduplicate samples at document-level using exact matching.

    Using md5 hash to deduplicate samples.
    """
    def __init__(self,
                 lowercase: bool = False,
                 ignore_non_character: bool = False,
                 *args,
                 **kwargs):
        """
        Initialization method.

        :param lowercase: Whether to convert sample text to lower case
        :param ignore_non_character: Whether to ignore non-alphabet
            characters, including whitespaces, digits, and punctuations
        :param args: extra args
        :param kwargs: extra args.
        """
        super().__init__(*args, **kwargs)
        self.lowercase = lowercase
        self.remove_non_character_regex = re.compile(
            f'\s+|\d+|[{re.escape(string.punctuation)}]'  # noqa: W605
        ) if ignore_non_character else None
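    # Note (not part of the original source): the compiled pattern matches
    # whitespace runs, digit runs, and single ASCII punctuation characters;
    # compute_hash below substitutes these with '' before hashing when
    # ignore_non_character is enabled.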
    def compute_hash(self, sample):
        """
        Compute md5 hash values for the sample.

        :param sample: input sample
        :return: sample with md5 hash value.
        """
        # check if it's computed already
        if HashKeys.hash in sample:
            return sample

        text = sample[self.text_key]
        if self.lowercase:
            text = text.lower()
        if self.remove_non_character_regex:
            text = self.remove_non_character_regex.sub('', text)

        def _get_hash(txt):
            return hashlib.md5(txt.strip().encode('utf-8')).hexdigest()

        sample[HashKeys.hash] = _get_hash(text)
        return sample
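    # Note (not part of the original source): with lowercase=True and
    # ignore_non_character=True, texts such as 'Hello, World!' and
    # 'helloworld' normalize to the same string and therefore receive the
    # same md5 hash, so only the first occurrence survives `process` below.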
    def process(self, dataset, show_num=0):
        """
        For doc-level, dataset --> dataset.

        :param dataset: input dataset
        :param show_num: number of traced samples used when tracer is open.
        :return: deduplicated dataset and the sampled duplicate pairs.
        """
        # no need to deduplicate because too few samples
        if len(dataset) <= 1:
            return dataset, {}

        dup_hashes = None
        if show_num > 0:
            # sample duplicate pairs
            hash2ids: Dict[int, Set[int]] = defaultdict(set)
            for sid, hash_val in enumerate(dataset[HashKeys.hash]):
                hash2ids[hash_val].add(sid)
            dup_samples = sorted(list(hash2ids.items()),
                                 key=lambda x: len(x[1]),
                                 reverse=True)
            dup_hashes = set([
                item[0] for item in dup_samples if len(item[1]) > 1
            ][:show_num])

        def _filter_dup_helper(sample, hashes):
            hash = sample[HashKeys.hash]
            if show_num > 0 and hash in dup_hashes \
                    and len(dup_pairs[hash]) < 2:
                # tracer is open and not enough duplicate sample pairs
                dup_pairs[hash].append(sample)
            if hash in hashes:
                return False
            else:
                hashes.add(hash)
                return True

        hashes = set()
        dup_pairs = {hash_v: [] for hash_v in dup_hashes} if dup_hashes else {}
        dataset = dataset.filter(
            _filter_dup_helper,
            fn_kwargs=dict(hashes=hashes),
            load_from_cache_file=False if show_num > 0 else True)  # num_proc=1
        return dataset, dup_pairs
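
A minimal usage sketch, not taken from the data-juicer repository: it assumes the op can be constructed directly with its defaults (so `self.text_key` resolves to the 'text' column) and that the dataset is a Hugging Face `datasets.Dataset`, which is the kind of object `dataset.filter(...)` above operates on.

from datasets import Dataset

ds = Dataset.from_dict({'text': ['A cat.', 'a cat', 'a dog']})
op = DocumentDeduplicator(lowercase=True, ignore_non_character=True)
ds = ds.map(op.compute_hash)    # add the md5 hash column to each sample
ds, dup_pairs = op.process(ds)  # drop later samples that repeat a hash
print(len(ds))                  # 2: 'A cat.' and 'a cat' collapse to one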