Source code for data_juicer.ops.mapper.whitespace_normalization_mapper
# Most of the code here has been modified from:# https://github.com/bigscience-workshop/data-preparation# --------------------------------------------------------from..base_opimportOPERATORS,Mapperfrom..common.special_charactersimportVARIOUS_WHITESPACES
@OPERATORS.register_module('whitespace_normalization_mapper')
class WhitespaceNormalizationMapper(Mapper):
    """
    Mapper that rewrites every kind of whitespace character in text
    samples as the plain space ' ' (0x20).

    Leading and trailing whitespace is stripped first; afterwards each
    remaining character that belongs to ``VARIOUS_WHITESPACES`` is
    replaced by a single ' '. Runs of whitespace are not collapsed: the
    replacement is strictly one character for one character.

    A catalogue of whitespace characters is available at:
    https://en.wikipedia.org/wiki/Whitespace_character
    """

    # Class-level flag; presumably consumed by the ``Mapper`` base class
    # (not visible here) to route whole batches to ``process_batched``.
    _batched_op = True

    def __init__(self, *args, **kwargs):
        """
        Initialization method.

        All arguments are forwarded unchanged to the parent initializer.

        :param args: extra args
        :param kwargs: extra args
        """
        super().__init__(*args, **kwargs)

    def process_batched(self, samples):
        """
        Normalize whitespace for every text in a batch.

        :param samples: batch container indexed by field name; the
            entry under ``self.text_key`` must be an indexable,
            item-assignable sequence of strings.
        :return: the same ``samples`` object, with the texts under
            ``self.text_key`` overwritten in place.
        """
        for position, raw_text in enumerate(samples[self.text_key]):
            # Drop surrounding whitespace before touching the interior.
            trimmed = raw_text.strip()
            # One-for-one substitution: any known whitespace variant
            # becomes ' ', every other character is kept as is.
            normalized_chars = [
                ' ' if ch in VARIOUS_WHITESPACES else ch for ch in trimmed
            ]
            samples[self.text_key][position] = ''.join(normalized_chars)
        return samples