Source code for data_juicer.ops.mapper.remove_long_words_mapper

# Some code here has been modified from:
# https://huggingface.co/spaces/huggingface/text-data-filtering
# --------------------------------------------------------

import sys

from ..base_op import OPERATORS, Mapper
from ..common import (SPECIAL_CHARACTERS, merge_on_whitespace_tab_newline,
                      split_on_newline_tab_whitespace, strip)


@OPERATORS.register_module('remove_long_words_mapper')
class RemoveLongWordsMapper(Mapper):
    """Mapper that drops words whose length is outside ``[min_len, max_len]``.

    A word is kept when its raw length is inside the inclusive range, or
    when its length after ``strip(word, SPECIAL_CHARACTERS)`` is inside it.
    """

    # NOTE(review): presumably tells the base ``Mapper`` to dispatch to
    # ``process_batched`` -- confirm against ``base_op``.
    _batched_op = True

    def __init__(self,
                 min_len: int = 1,
                 max_len: int = sys.maxsize,
                 *args,
                 **kwargs):
        """
        Initialization method.

        :param min_len: Lower bound (inclusive) on word length; words
            shorter than this are filtered out.
        :param max_len: Upper bound (inclusive) on word length; words
            longer than this are filtered out.
        :param args: extra args
        :param kwargs: extra args
        """
        super().__init__(*args, **kwargs)
        self.min_len = min_len
        self.max_len = max_len

    def should_keep_long_word(self, word):
        """
        Decide whether a single word survives the length filter.

        :param word: the word to check.
        :return: True if the length of ``word``, or of ``word`` after
            ``strip(word, SPECIAL_CHARACTERS)``, lies within
            ``[min_len, max_len]``; False otherwise.
        """
        lower, upper = self.min_len, self.max_len
        if lower <= len(word) <= upper:
            return True
        # Second chance: measure the word again after stripping
        # SPECIAL_CHARACTERS from it.
        stripped = strip(word, SPECIAL_CHARACTERS)
        return lower <= len(stripped) <= upper

    def process_batched(self, samples):
        """
        Filter the words of every text in the batch, in place.

        :param samples: batch mapping; the entry at ``self.text_key`` is
            iterated as a sequence of texts and each item is overwritten
            with its filtered version.
        :return: the same ``samples`` object.
        """
        keep = self.should_keep_long_word
        for idx, text in enumerate(samples[self.text_key]):
            # The split helper yields a three-level nesting
            # (sentence -> subsentence -> word); only the innermost
            # level is filtered, the outer structure is preserved.
            filtered = []
            for sentence in split_on_newline_tab_whitespace(text):
                filtered.append([
                    list(filter(keep, subsentence))
                    for subsentence in sentence
                ])
            samples[self.text_key][idx] = merge_on_whitespace_tab_newline(
                filtered)
        return samples