@OPERATORS.register_module(OP_NAME)
class SentenceSplitMapper(Mapper):
    """Splits text samples into individual sentences based on the specified
    language.

    This operator uses an NLTK-based tokenizer to split the input text into
    sentences. The language for the tokenizer is specified during
    initialization. The original text in each sample is replaced with a list
    of sentences. This operator processes samples in batches for efficiency.
    Ensure that the `lang` parameter is set to the appropriate language code
    (e.g., "en" for English) to achieve accurate sentence splitting.
    """

    _batched_op = True

    def __init__(self, lang: str = "en", *args, **kwargs):
        """
        Initialization method.

        :param lang: split sentence of text in which language.
        :param args: extra args
        :param kwargs: extra args
        """
        super().__init__(*args, **kwargs)
        self.lang = lang
        # Ensure NLTK pickle security patch is applied before any model load
        patch_nltk_pickle_security()
        # Prepare the sentence tokenizer model for the requested language
        self.model_key = prepare_model(model_type="nltk", lang=lang)

    def process_batched(self, samples):
        """Replace each text in the batch with its list of sentences."""
        # Resolve the sentence tokenizer model once for the whole batch
        nltk_model = get_model(self.model_key)
        # Hoist the tokenize callable out of the per-text loop; it is
        # invariant across the batch (None when no model is available)
        tokenize = nltk_model.tokenize if nltk_model else None
        samples[self.text_key] = [
            get_sentences_from_document(text, model_func=tokenize)
            for text in samples[self.text_key]
        ]
        return samples