# Some code here has been modified from:
# https://github.com/bigscience-workshop/data-preparation
# --------------------------------------------------------

from ..base_op import OPERATORS, Mapper
@OPERATORS.register_module('punctuation_normalization_mapper')
class PunctuationNormalizationMapper(Mapper):
    """Mapper to normalize unicode punctuations to English punctuations
    in text samples."""

    # NOTE(review): presumably tells the base class that this op consumes
    # batches of samples rather than single samples -- confirm against Mapper.
    _batched_op = True

    def __init__(self, *args, **kwargs):
        """
        Initialization method.

        Forwards all arguments to the parent initializer and builds
        ``self.punctuation_unicode``, a lookup table whose keys are
        non-ASCII punctuation characters and whose values are the ASCII
        strings that replace them.

        :param args: extra args
        :param kwargs: extra args
        """
        super().__init__(*args, **kwargs)
        # Full-width keys are written as \uXXXX escapes on purpose: they
        # look almost identical to their ASCII twins and are easily folded
        # into them by editors or text pipelines. Once folded, entries such
        # as '.' -> '. ' or '1' -> '"' would corrupt ordinary ASCII text
        # instead of normalizing full-width punctuation.
        self.punctuation_unicode = {
            '\uFF0C': ',',   # FULLWIDTH COMMA
            '。': '.',
            '、': ',',
            '„': '"',
            '”': '"',
            '“': '"',
            '«': '"',
            '»': '"',
            # FULLWIDTH DIGIT ONE -> double quote.
            # NOTE(review): looks inherited from the upstream
            # normalization table -- confirm it is intended.
            '\uFF11': '"',
            '」': '"',
            '「': '"',
            '《': '"',
            '》': '"',
            '´': "'",
            '∶': ':',
            '\uFF1A': ':',   # FULLWIDTH COLON
            '\uFF1F': '?',   # FULLWIDTH QUESTION MARK
            '\uFF01': '!',   # FULLWIDTH EXCLAMATION MARK
            '\uFF08': '(',   # FULLWIDTH LEFT PARENTHESIS
            '\uFF09': ')',   # FULLWIDTH RIGHT PARENTHESIS
            '\uFF1B': ';',   # FULLWIDTH SEMICOLON
            '–': '-',
            '—': ' - ',
            '\uFF0E': '. ',  # FULLWIDTH FULL STOP
            '\uFF5E': '~',   # FULLWIDTH TILDE
            '’': "'",
            '…': '...',
            '━': '-',
            '〈': '<',
            '〉': '>',
            '【': '[',
            '】': ']',
            '\uFF05': '%',   # FULLWIDTH PERCENT SIGN
            '►': '-',
        }