Source code for data_juicer.ops.mapper.remove_long_words_mapper
# Some code here has been modified from:
# https://huggingface.co/spaces/huggingface/text-data-filtering
# --------------------------------------------------------

import sys

from ..base_op import OPERATORS, Mapper
from ..common import (SPECIAL_CHARACTERS, merge_on_whitespace_tab_newline,
                      split_on_newline_tab_whitespace, strip)
@OPERATORS.register_module('remove_long_words_mapper')
class RemoveLongWordsMapper(Mapper):
    """Mapper to remove long words within a specific range.

    The accepted word-length range is configured with ``min_len`` and
    ``max_len`` at construction time.
    """

    # NOTE(review): presumably tells the base ``Mapper`` to feed this op
    # batches of samples rather than single samples -- confirm against the
    # base class.
    _batched_op = True

    def __init__(self,
                 min_len: int = 1,
                 max_len: int = sys.maxsize,
                 *args,
                 **kwargs):
        """
        Initialization method.

        :param min_len: The min mapper word length in this op, words
            will be filtered if their length is below this parameter.
        :param max_len: The max mapper word length in this op, words
            will be filtered if their length exceeds this parameter.
        :param args: extra args
        :param kwargs: extra args
        """
        # Let the base operator consume the generic positional/keyword
        # arguments before the op-specific thresholds are stored.
        super().__init__(*args, **kwargs)
        self.min_len = min_len
        self.max_len = max_len