@OPERATORS.register_module(OP_NAME)
class SentenceSplitMapper(Mapper):
    """Mapper to split text samples to sentences."""

    _batched_op = True
    def __init__(self, lang: str = 'en', *args, **kwargs):
        """
        Initialization method.

        :param lang: language of the text, used to select the sentence
            tokenizer.
        :param args: extra args
        :param kwargs: extra args
        """
        super().__init__(*args, **kwargs)
        self.lang = lang
        # Ensure NLTK pickle security patch is applied
        patch_nltk_pickle_security()
        # Prepare the sentence tokenizer model
        self.model_key = prepare_model(model_type='nltk', lang=lang)
    def process_batched(self, samples):
        # Get the sentence tokenizer model
        nltk_model = get_model(self.model_key)
        samples[self.text_key] = [
            get_sentences_from_document(
                text,
                model_func=nltk_model.tokenize if nltk_model else None)
            for text in samples[self.text_key]
        ]
        return samples
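
For reference, a minimal usage sketch of the operator. The import path, the default text_key of 'text', and the column-oriented batch layout are assumptions based on common data-juicer conventions, not taken from this listing:

# Minimal usage sketch (assumed import path and text_key='text').
from data_juicer.ops.mapper.sentence_split_mapper import SentenceSplitMapper

# Instantiate the mapper for English; this prepares the NLTK sentence
# tokenizer for the chosen language.
op = SentenceSplitMapper(lang='en')

# process_batched works on a column-oriented batch: a dict mapping field
# names to lists of values, one element per sample.
samples = {'text': ['Hello world. This is a test.',
                    'Another document? It has two sentences.']}

result = op.process_batched(samples)
# Each text is replaced by its sentence-split form (here assumed to be the
# detected sentences joined with newlines).
print(result['text'])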