Source code for data_juicer.ops.mapper.whitespace_normalization_mapper
# Most of the code here has been modified from:# https://github.com/bigscience-workshop/data-preparation# --------------------------------------------------------from..base_opimportOPERATORS,Mapperfrom..common.special_charactersimportVARIOUS_WHITESPACES
@OPERATORS.register_module("whitespace_normalization_mapper")
class WhitespaceNormalizationMapper(Mapper):
    """Replace non-standard whitespace characters with a plain space.

    For every text in a batch, the mapper first trims the text with
    ``str.strip()`` and then substitutes each character that is a member of
    ``VARIOUS_WHITESPACES`` with the standard space character (' ', 0x20).
    All other characters are kept unchanged, so the length of the trimmed
    text is preserved.

    The set of characters treated as whitespace is defined by
    ``VARIOUS_WHITESPACES``; it is intended to follow the list at
    https://en.wikipedia.org/wiki/Whitespace_character.
    """

    # Flag set to True for this operator; the work is done in
    # ``process_batched`` below.
    _batched_op = True

    def __init__(self, *args, **kwargs):
        """
        Initialization method.

        All arguments are forwarded unchanged to the parent initializer.

        :param args: extra args
        :param kwargs: extra args
        """
        super().__init__(*args, **kwargs)

    def process_batched(self, samples):
        """
        Normalize the whitespace of every text in a batch, in place.

        :param samples: batch mapping; the entry under ``self.text_key``
            is iterated as a sequence of strings and updated by index.
        :return: the same ``samples`` object with each text rewritten.
        """
        for position, raw_text in enumerate(samples[self.text_key]):
            # Trim first, so only the remaining content is inspected
            # character by character.
            trimmed = raw_text.strip()
            # Swap each listed whitespace character for ' ' and keep
            # everything else unchanged.
            normalized_chars = (
                " " if symbol in VARIOUS_WHITESPACES else symbol
                for symbol in trimmed
            )
            samples[self.text_key][position] = "".join(normalized_chars)
        return samples