# Source code for data_juicer.ops.mapper.sentence_split_mapper

from data_juicer.utils.model_utils import get_model, prepare_model

from ..base_op import OPERATORS, Mapper
from ..common import get_sentences_from_document

# Name under which the operator class below is registered in ``OPERATORS``.
OP_NAME = 'sentence_split_mapper'


@OPERATORS.register_module(OP_NAME)
class SentenceSplitMapper(Mapper):
    """Mapper to split text samples to sentences.

    An ``nltk`` model for the configured language is prepared at
    construction time via ``prepare_model``; only the returned key is kept
    on the instance, and the model itself is fetched with ``get_model`` each
    time a batch is processed.
    """

    # Marks this operator as batch-oriented (see ``process_batched``).
    # NOTE(review): presumably consumed by the ``Mapper`` base class --
    # confirm there.
    _batched_op = True

    def __init__(self, lang: str = 'en', *args, **kwargs):
        """
        Initialization method.

        :param lang: language of the text whose sentences are split;
            forwarded to ``prepare_model`` to select the nltk model.
        :param args: extra positional args passed to the base class.
        :param kwargs: extra keyword args passed to the base class.
        """
        super().__init__(*args, **kwargs)
        self.lang = lang
        # Store the key rather than the model object; the model is looked
        # up from this key inside ``process_batched``.
        self.model_key = prepare_model(model_type='nltk', lang=lang)

    def process_batched(self, samples):
        """
        Run sentence splitting over every text of a batch.

        :param samples: batch mapping; the entry stored under
            ``self.text_key`` is iterated as a sequence of texts.
        :return: the same ``samples`` object, with the ``self.text_key``
            entry replaced by a list holding the result of
            ``get_sentences_from_document`` for each text, in order.
        """
        nltk_model = get_model(self.model_key)
        # The tokenizer callable does not depend on the individual text, so
        # resolve it once per batch instead of once per sample. When no
        # model is available, ``None`` is passed through and
        # ``get_sentences_from_document`` decides how to handle it.
        model_func = nltk_model.tokenize if nltk_model else None
        samples[self.text_key] = [
            get_sentences_from_document(text, model_func=model_func)
            for text in samples[self.text_key]
        ]
        return samples