Source code for data_juicer.ops.mapper.punctuation_normalization_mapper
# Some code here has been modified from:# https://github.com/bigscience-workshop/data-preparation# --------------------------------------------------------from..base_opimportOPERATORS,Mapper
[docs]@OPERATORS.register_module("punctuation_normalization_mapper")classPunctuationNormalizationMapper(Mapper):"""Mapper to normalize unicode punctuations to English punctuations in text samples."""_batched_op=True
def __init__(self, *args, **kwargs):
    """
    Initialization method.

    Forwards all arguments to the parent initializer and builds
    ``self.punctuation_unicode``, a lookup table whose keys are
    non-ASCII punctuation characters and whose values are the ASCII
    text each one is normalized to.

    :param args: extra args
    :param kwargs: extra args
    """
    super().__init__(*args, **kwargs)
    # Keys taken from the Unicode "Halfwidth and Fullwidth Forms" block
    # are spelled as \uXXXX escapes on purpose: they render almost
    # identically to their ASCII twins and are easily folded to ASCII by
    # editors or text-normalization tooling. A folded key either becomes
    # a no-op ("," -> ",") or corrupts plain ASCII text ("1" -> '"',
    # "." -> ". "). Entry order is kept as in the original table.
    self.punctuation_unicode = {
        "\uff0c": ",",  # FULLWIDTH COMMA
        "。": ".",
        "、": ",",
        "„": '"',
        "”": '"',
        "“": '"',
        "«": '"',
        "»": '"',
        # NOTE(review): FULLWIDTH DIGIT ONE mapped to a double quote is
        # kept as-is from the original table; confirm against upstream.
        "\uff11": '"',  # FULLWIDTH DIGIT ONE
        "」": '"',
        "「": '"',
        "《": '"',
        "》": '"',
        "´": "'",
        "∶": ":",
        "\uff1a": ":",  # FULLWIDTH COLON
        "\uff1f": "?",  # FULLWIDTH QUESTION MARK
        "\uff01": "!",  # FULLWIDTH EXCLAMATION MARK
        "\uff08": "(",  # FULLWIDTH LEFT PARENTHESIS
        "\uff09": ")",  # FULLWIDTH RIGHT PARENTHESIS
        "\uff1b": ";",  # FULLWIDTH SEMICOLON
        "–": "-",
        "—": " - ",
        "\uff0e": ". ",  # FULLWIDTH FULL STOP
        "\uff5e": "~",  # FULLWIDTH TILDE
        "’": "'",
        "…": "...",
        "━": "-",
        "〈": "<",
        "〉": ">",
        "【": "[",
        "】": "]",
        "\uff05": "%",  # FULLWIDTH PERCENT SIGN
        "►": "-",
    }