# Source code for data_juicer.ops.mapper.remove_long_words_mapper

# Some code here has been modified from:
# https://huggingface.co/spaces/huggingface/text-data-filtering
# --------------------------------------------------------

import sys

from ..base_op import OPERATORS, Mapper
from ..common import (
    SPECIAL_CHARACTERS,
    merge_on_whitespace_tab_newline,
    split_on_newline_tab_whitespace,
    strip,
)



# [docs]
@OPERATORS.register_module("remove_long_words_mapper")
class RemoveLongWordsMapper(Mapper):
    """Mapper to remove words whose length falls outside a given range.

    A word is kept when its character length lies within
    ``[min_len, max_len]`` (both bounds inclusive). A word that fails this
    check on its raw form gets a second chance: it is stripped of
    ``SPECIAL_CHARACTERS`` and its stripped length is tested against the same
    range. If either check passes, the *original* (unstripped) word is kept;
    otherwise the word is dropped from the text.

    This operator processes samples in batches (``_batched_op = True``).
    """

    _batched_op = True

    def __init__(self, min_len: int = 1, max_len: int = sys.maxsize, *args, **kwargs):
        """
        Initialization method.

        :param min_len: The min mapper word length in this op, words
            will be filtered if their length is below this parameter.
        :param max_len: The max mapper word length in this op, words
            will be filtered if their length exceeds this parameter.
        :param args: extra args
        :param kwargs: extra args
        """
        super().__init__(*args, **kwargs)
        self.min_len = min_len
        self.max_len = max_len

    def should_keep_long_word(self, word):
        """
        Decide whether a single word survives the length filter.

        :param word: the word to check.
        :return: True if the length of the word, either as given or after
            stripping ``SPECIAL_CHARACTERS`` from it, lies within
            ``[min_len, max_len]``; False otherwise.
        """
        # Fast path: the raw word already fits, no need to strip it.
        if self.min_len <= len(word) <= self.max_len:
            return True
        # Second chance: judge the word without its special characters, so
        # that e.g. attached punctuation alone does not push it out of range.
        stripped_word = strip(word, SPECIAL_CHARACTERS)
        return self.min_len <= len(stripped_word) <= self.max_len

    def process_batched(self, samples):
        """
        Remove out-of-range words from every text in the batch.

        The batch is modified in place: each entry under ``self.text_key``
        is replaced by its filtered text.

        :param samples: a batch of samples; ``samples[self.text_key]`` is
            iterated as a sequence of texts and assigned to by index.
        :return: the same ``samples`` object, with filtered texts.
        """
        for idx, text in enumerate(samples[self.text_key]):
            # The split helper yields three nesting levels that are walked
            # below as sentence -> subsentence -> word (presumably split on
            # newline, tab and whitespace, per the helper name -- confirm
            # against its definition).
            sentences = split_on_newline_tab_whitespace(text)
            # Filter at the innermost (word) level only; the outer two levels
            # are rebuilt unchanged so the merge helper receives the same
            # nesting it was given.
            sentences = [
                [[word for word in subsentence if self.should_keep_long_word(word)] for subsentence in sentence]
                for sentence in sentences
            ]
            samples[self.text_key][idx] = merge_on_whitespace_tab_newline(sentences)
        return samples