Source code for data_juicer.ops.mapper.punctuation_normalization_mapper

# Some code here has been modified from:
# https://github.com/bigscience-workshop/data-preparation
# --------------------------------------------------------

from ..base_op import OPERATORS, Mapper


@OPERATORS.register_module('punctuation_normalization_mapper')
class PunctuationNormalizationMapper(Mapper):
    """Mapper to normalize unicode punctuations to English punctuations
    in text samples.

    Every character of a sample's text is looked up in
    ``punctuation_unicode``; a character found there is replaced by the
    mapped ASCII string, any other character is kept unchanged.
    """

    _batched_op = True

    def __init__(self, *args, **kwargs):
        """
        Initialization method.

        :param args: extra args
        :param kwargs: extra args
        """
        super().__init__(*args, **kwargs)
        # Single-character unicode punctuation -> ASCII replacement.
        # Fullwidth forms (U+FF01..U+FF5E) are spelled as explicit escapes:
        # they render almost identically to their ASCII counterparts, and
        # if they get folded to ASCII the entries become identity mappings
        # or, worse, rewrite plain ASCII '1' and '.' in every sample.
        self.punctuation_unicode = {
            '\uff0c': ',',  # fullwidth comma
            '。': '.',
            '、': ',',
            '„': '"',
            '”': '"',
            '“': '"',
            '«': '"',
            '»': '"',
            # fullwidth digit one -> double quote.
            # NOTE(review): presumably inherited from the upstream table
            # this file was adapted from -- confirm it is intended.
            '\uff11': '"',
            '」': '"',
            '「': '"',
            '《': '"',
            '》': '"',
            '´': "'",
            '∶': ':',
            '\uff1a': ':',  # fullwidth colon
            '\uff1f': '?',  # fullwidth question mark
            '\uff01': '!',  # fullwidth exclamation mark
            '\uff08': '(',  # fullwidth left parenthesis
            '\uff09': ')',  # fullwidth right parenthesis
            '\uff1b': ';',  # fullwidth semicolon
            '–': '-',
            '—': ' - ',
            '\uff0e': '. ',  # fullwidth full stop (adds a trailing space)
            '\uff5e': '~',  # fullwidth tilde
            '’': "'",
            '…': '...',
            '━': '-',
            '〈': '<',
            '〉': '>',
            '【': '[',
            '】': ']',
            '\uff05': '%',  # fullwidth percent sign
            '►': '-',
        }

    def process_batched(self, samples):
        """
        Normalize the punctuation of every text in a batch.

        :param samples: batch mapping; the entry under ``self.text_key``
            is iterated as a sequence of strings and replaced by a list
            of the normalized strings.
        :return: the same ``samples`` object, updated in place.
        """
        # Build the translation table once per batch from the current
        # mapping, then let str.translate do the per-character work in C.
        table = str.maketrans(self.punctuation_unicode)
        samples[self.text_key] = [
            text.translate(table) for text in samples[self.text_key]
        ]
        return samples