Source code for data_juicer.ops.mapper.remove_long_words_mapper
# Some code here has been modified from:# https://huggingface.co/spaces/huggingface/text-data-filtering# --------------------------------------------------------importsysfrom..base_opimportOPERATORS,Mapperfrom..commonimport(SPECIAL_CHARACTERS,merge_on_whitespace_tab_newline,split_on_newline_tab_whitespace,strip,)
[docs]@OPERATORS.register_module("remove_long_words_mapper")classRemoveLongWordsMapper(Mapper):"""Mapper to remove long words within a specific range. This operator filters out words in the text that are either shorter than the specified minimum length or longer than the specified maximum length. Words are first checked with their original length, and if they do not meet the criteria, they are stripped of special characters and re-evaluated. The key metric used is the character-based length of each word. The processed text retains only the words that fall within the defined length range. This operator processes text in batches for efficiency."""_batched_op=True
def __init__(self, min_len: int = 1, max_len: int = sys.maxsize, *args, **kwargs):
    """
    Initialization method.

    Stores the word-length bounds on the instance after delegating any
    remaining arguments to the parent initializer.

    :param min_len: The min mapper word length in this op, words
        will be filtered if their length is below this parameter.
        Defaults to 1.
    :param max_len: The max mapper word length in this op, words
        will be filtered if their length exceeds this parameter.
        Defaults to ``sys.maxsize``.
    :param args: extra positional args, forwarded to the parent
        initializer
    :param kwargs: extra keyword args, forwarded to the parent
        initializer
    """
    # Initialize the parent first so the base-class setup happens before
    # this op's own attributes are attached.
    super().__init__(*args, **kwargs)
    self.min_len = min_len
    self.max_len = max_len