# Source code for data_juicer.ops.mapper.remove_long_words_mapper

# Some code here has been modified from:
# https://huggingface.co/spaces/huggingface/text-data-filtering
# --------------------------------------------------------

import sys

from ..base_op import OPERATORS, Mapper
from ..common import (
    SPECIAL_CHARACTERS,
    merge_on_whitespace_tab_newline,
    split_on_newline_tab_whitespace,
    strip,
)


@OPERATORS.register_module("remove_long_words_mapper")
class RemoveLongWordsMapper(Mapper):
    """Mapper that drops words whose length falls outside a configured range.

    Each word is first tested with its raw character length; if that fails,
    the word is stripped of special characters and tested again. Only words
    passing one of the two checks are kept. Samples are processed in batches
    (``_batched_op = True``).
    """

    # Signals the executor that process_batched operates on whole batches.
    _batched_op = True

    def __init__(self, min_len: int = 1, max_len: int = sys.maxsize, *args, **kwargs):
        """
        Initialization method.

        :param min_len: The min mapper word length in this op, words
            will be filtered if their length is below this parameter.
        :param max_len: The max mapper word length in this op, words
            will be filtered if their length exceeds this parameter.
        :param args: extra args
        :param kwargs: extra args
        """
        super().__init__(*args, **kwargs)
        self.min_len = min_len
        self.max_len = max_len

    def should_keep_long_word(self, word):
        """Return True if ``word`` (raw, or after stripping special
        characters) has a length within [min_len, max_len]."""
        if self.min_len <= len(word) <= self.max_len:
            return True
        stripped = strip(word, SPECIAL_CHARACTERS)
        return self.min_len <= len(stripped) <= self.max_len

    def process_batched(self, samples):
        """Filter out-of-range words from every text in the batch, in place.

        :param samples: batch dict; ``samples[self.text_key]`` is a list of
            texts that is rewritten with the filtered results.
        :return: the same ``samples`` dict.
        """
        texts = samples[self.text_key]
        for idx, text in enumerate(texts):
            structure = split_on_newline_tab_whitespace(text)
            kept = []
            for sentence in structure:
                kept.append(
                    [
                        [w for w in subsentence if self.should_keep_long_word(w)]
                        for subsentence in sentence
                    ]
                )
            # Reassemble the text with the original whitespace structure.
            texts[idx] = merge_on_whitespace_tab_newline(kept)
        return samples