Source code for data_juicer.ops.mapper.whitespace_normalization_mapper

# Most of the code here has been modified from:
# https://github.com/bigscience-workshop/data-preparation
# --------------------------------------------------------

from ..base_op import OPERATORS, Mapper
from ..common.special_characters import VARIOUS_WHITESPACES



[docs]
@OPERATORS.register_module('whitespace_normalization_mapper')
class WhitespaceNormalizationMapper(Mapper):
    """
    Mapper to normalize different kinds of whitespaces to whitespace ' ' (0x20)
    in text samples.

    Each text is first stripped of its leading and trailing whitespace; then
    every remaining character that is contained in ``VARIOUS_WHITESPACES`` is
    replaced by a single ' ' character. Characters are replaced one-for-one,
    so runs of whitespace are not collapsed.

    Different kinds of whitespaces can be found here:
    https://en.wikipedia.org/wiki/Whitespace_character
    """

    # NOTE(review): presumably tells the base operator machinery to call
    # `process_batched` with a batch of samples -- confirm against base_op.
    _batched_op = True


[docs]
    def __init__(self, *args, **kwargs):
        """
        Initialization method.

        This operator defines no parameters of its own; everything it
        receives is forwarded unchanged to the parent class initializer.

        :param args: extra positional args, passed through to the parent
            initializer
        :param kwargs: extra keyword args, passed through to the parent
            initializer
        """
        super().__init__(*args, **kwargs)



[docs]
    def process_batched(self, samples):
        """
        Normalize the whitespaces of every text in a batch of samples.

        For each text found under ``self.text_key``, the leading and trailing
        whitespace is stripped, and then each character that is a member of
        ``VARIOUS_WHITESPACES`` is swapped for a plain ' '.

        :param samples: batch of samples, indexed by ``self.text_key`` to
            reach the texts (presumably a dict of lists of str -- confirm
            against the caller); the texts are overwritten in place
        :return: the same ``samples`` object, holding the normalized texts
        """
        for position, raw_text in enumerate(samples[self.text_key]):
            # drop the whitespace surrounding the main content first
            trimmed = raw_text.strip()

            # map every known whitespace variant onto ' ', keep the rest
            normalized_chars = []
            for symbol in trimmed:
                if symbol in VARIOUS_WHITESPACES:
                    normalized_chars.append(' ')
                else:
                    normalized_chars.append(symbol)

            samples[self.text_key][position] = ''.join(normalized_chars)

        return samples