@OPERATORS.register_module(OP_NAME)
class NlpcdaZhMapper(Mapper):
    """Mapper to simply augment samples in Chinese based on nlpcda library."""

    _batched_op = True

    def __init__(self,
                 sequential: bool = False,
                 aug_num: PositiveInt = 1,
                 keep_original_sample: bool = True,
                 replace_similar_word: bool = False,
                 replace_homophone_char: bool = False,
                 delete_random_char: bool = False,
                 swap_random_char: bool = False,
                 replace_equivalent_num: bool = False,
                 *args,
                 **kwargs):
        """
        Initialization method. All augmentation methods use default parameters
        in default. We recommend you to only use 1-3 augmentation methods at a
        time. Otherwise, the semantics of samples might be changed
        significantly. **Notice**: some augmentation method might not work for
        some special texts, so there might be no augmented texts generated.

        :param sequential: whether combine all augmentation methods to a
            sequence. If it's True, a sample will be augmented by all opened
            augmentation methods sequentially. If it's False, each opened
            augmentation method would generate its augmented samples
            independently.
        :param aug_num: number of augmented samples to be generated. If
            `sequential` is True, there will be total aug_num augmented samples
            generated. If it's False, there will be (aug_num *
            #opened_aug_method) augmented samples generated.
        :param keep_original_sample: whether to keep the original sample. If
            it's set to False, there will be only generated texts in the final
            datasets and the original texts will be removed. It's True in
            default.
        :param replace_similar_word: whether to open the augmentation method of
            replacing random words with their similar words in the original
            texts. e.g. "这里一共有5种不同的数据增强方法" -->
            "这边一共有5种不同的数据增强方法"
        :param replace_homophone_char: whether to open the augmentation method
            of replacing random characters with their homophones in the
            original texts. e.g. "这里一共有5种不同的数据增强方法" -->
            "这里一共有5种不同的濖据增强方法"
        :param delete_random_char: whether to open the augmentation method of
            deleting random characters from the original texts. e.g.
            "这里一共有5种不同的数据增强方法" --> "这里一共有5种不同的数据增强"
        :param swap_random_char: whether to open the augmentation method of
            swapping random contiguous characters in the original texts. e.g.
            "这里一共有5种不同的数据增强方法" --> "这里一共有5种不同的数据强增方法"
        :param replace_equivalent_num: whether to open the augmentation method
            of replacing random numbers with their equivalent representations
            in the original texts. **Notice**: Only for numbers for now. e.g.
            "这里一共有5种不同的数据增强方法" --> "这里一共有伍种不同的数据增强方法"
        :param args: extra args
        :param kwargs: extra args
        """
        super().__init__(*args, **kwargs)

        self.aug_num = aug_num
        if aug_num >= 10:
            logger.warning(f'Relatively large augmentation number [{aug_num}]'
                           f' might generate large number of new samples and '
                           f'requires more memory and disk space.')

        self.sequential = sequential
        self.keep_original_sample = keep_original_sample

        # hide the redundant outputs from nlpcda library
        with HiddenPrints():
            import warnings
            warnings.filterwarnings('ignore')

            # Augmenters are appended in a fixed order; in sequential mode
            # this is also the order in which they are chained.
            self.aug_pipeline = []

            # word level
            if replace_similar_word:
                self.aug_pipeline.append(
                    nlpcda.Similarword(create_num=self._next_create_num()))

            # char level
            if replace_homophone_char:
                self.aug_pipeline.append(
                    nlpcda.Homophone(create_num=self._next_create_num()))

            if delete_random_char:
                self.aug_pipeline.append(
                    nlpcda.RandomDeleteChar(
                        create_num=self._next_create_num()))

            if swap_random_char:
                # only use char_gram=1 for relatively minor changes
                self.aug_pipeline.append(
                    nlpcda.CharPositionExchange(
                        create_num=self._next_create_num(), char_gram=1))

            # only for numbers now
            if replace_equivalent_num:
                self.aug_pipeline.append(
                    nlpcda.EquivalentChar(create_num=self._next_create_num()))

    def _next_create_num(self):
        """Return the `create_num` for the next augmenter to be registered.

        The first item of an augmented list is the same as the original text,
        so one extra text has to be requested to end up with the expected
        number of augmented texts. In sequential mode only the first augmenter
        fans out to `aug_num` texts; every later one rewrites each of those
        texts once (original + 1 variant = 2).
        """
        if self.sequential and self.aug_pipeline:
            return 2
        return self.aug_num + 1

    def _augment_text(self, text):
        """Return the list of augmented variants of a single text.

        The list may be empty when no opened method was able to change the
        text.

        :param text: the original text to augment
        :return: list of augmented texts, without the original text
        """
        if not self.sequential:
            # apply each aug method independently and skip the leading
            # original text of every result list
            aug_texts = []
            for aug_method in self.aug_pipeline:
                aug_texts += aug_method.replace(text)[1:]
            return aug_texts

        # chain the aug methods: the outputs of one method are the inputs of
        # the next one
        aug_texts = [text]
        for aug_method in self.aug_pipeline:
            results = []
            for current_text in aug_texts:
                result = aug_method.replace(current_text)
                # skip the leading original text; if nothing else was
                # generated keep the input so later methods can still try
                results += result[1:] if len(result) > 1 else result
            aug_texts = results

        # nothing in the chain managed to change the text
        if len(aug_texts) == 1 and aug_texts[0] == text:
            return []
        return aug_texts

    def process_batched(self, samples):
        """Augment every text of a batch and replicate the other fields.

        Each sample of the batch is augmented on its own. The rows produced
        for one sample (the original text first when `keep_original_sample`
        is True, then its augmented texts) are contiguous in the output, and
        every other field of that sample is copied once per produced row so
        that all columns keep the same length. An empty batch yields empty
        columns.

        :param samples: batch as a dict mapping each field name to a list of
            per-sample values; texts are read from `self.text_key`
            (NOTE(review): `text_key` is not set in this class, presumably
            by the `Mapper` base class -- confirm)
        :return: the input batch itself when no augmentation method is opened
            and originals are kept; otherwise a new dict with the same keys
        """
        # no augmentation methods are opened
        if not self.aug_pipeline:
            if self.keep_original_sample:
                return samples
            return {key: [] for key in samples}

        res_samples = {key: [] for key in samples}
        for idx, text in enumerate(samples[self.text_key]):
            new_texts = self._augment_text(text)
            if self.keep_original_sample:
                new_texts = [text] + new_texts

            for key in samples:
                if key == self.text_key:
                    res_samples[key].extend(new_texts)
                else:
                    # replicate the other fields of this sample for each
                    # produced row; copies keep the input batch untouched
                    value = samples[key][idx]
                    res_samples[key].extend(
                        deepcopy(value) for _ in new_texts)
        return res_samples