Source code for data_juicer.ops.mapper.whitespace_normalization_mapper

# Most of the code here has been modified from:
# https://github.com/bigscience-workshop/data-preparation
# --------------------------------------------------------

from ..base_op import OPERATORS, Mapper
from ..common.special_characters import VARIOUS_WHITESPACES


@OPERATORS.register_module('whitespace_normalization_mapper')
class WhitespaceNormalizationMapper(Mapper):
    """
    Mapper to normalize different kinds of whitespaces to whitespace ' '
    (0x20) in text samples.

    Different kinds of whitespaces can be found here:
    https://en.wikipedia.org/wiki/Whitespace_character
    """

    # Flag read by the operator framework; this op implements
    # ``process_batched`` rather than a per-sample method.
    _batched_op = True

    def __init__(self, *args, **kwargs):
        """
        Initialization method.

        :param args: extra args
        :param kwargs: extra args
        """
        super().__init__(*args, **kwargs)

    def process_batched(self, samples):
        """
        Normalize the whitespaces of every text in a batch, in place.

        Each text stored under ``self.text_key`` is first stripped with
        ``str.strip()``; afterwards every character that is a member of
        ``VARIOUS_WHITESPACES`` is replaced by a plain space ' '.

        :param samples: batch container holding the texts under
            ``self.text_key``
        :return: the same ``samples`` object, with its texts normalized
        """
        # Build the translation table once per batch instead of testing
        # set membership for every character of every text. Only
        # single-character entries can ever equal a character of the
        # text, so anything else is skipped.
        to_space = {
            ord(ws): ' '
            for ws in VARIOUS_WHITESPACES if len(ws) == 1
        }

        for idx, text in enumerate(samples[self.text_key]):
            # Remove whitespaces before and after the main content, then
            # replace all remaining kinds of whitespaces with ' ' in a
            # single C-level pass.
            # NOTE(review): str.strip() only removes what Python itself
            # treats as whitespace. If VARIOUS_WHITESPACES holds
            # characters outside that set, one of them at either end of
            # the text survives the strip and ends up as a leading or
            # trailing ' ' -- confirm whether that is intended.
            samples[self.text_key][idx] = text.strip().translate(to_space)

        return samples