@OPERATORS.register_module(OP_NAME)
class TextChunkMapper(Mapper):
    """Split input text to chunks."""

    # Operate on batched samples (a dict of column -> list of values).
    _batched_op = True

    def __init__(self,
                 max_len: Union[PositiveInt, None] = None,
                 split_pattern: Union[str, None] = r'\n\n',
                 overlap_len: NonNegativeInt = 0,
                 tokenizer: Union[str, None] = None,
                 trust_remote_code: bool = False,
                 *args,
                 **kwargs):
        """
        Initialization method.

        :param max_len: Split text into multi texts with this max len if not
            None.
        :param split_pattern: Make sure split in this pattern if it is not
            None and force cut if the length exceeds max_len.
        :param overlap_len: Overlap length of the split texts if not split in
            the split pattern.
        :param tokenizer: The tokenizer name of Hugging Face tokenizers. The
            text length will be calculated as the token num if it is offered.
            Otherwise, the text length equals to string length. Support
            tiktoken tokenizer (such as gpt-4o), dashscope tokenizer (such as
            qwen2.5-72b-instruct) and huggingface tokenizer.
        :param trust_remote_code: whether to trust remote code when loading
            the huggingface tokenizer.
        :param args: extra args
        :param kwargs: extra args
        """
        super().__init__(*args, **kwargs)

        # At least one chunking criterion must be configured.
        if max_len is None and split_pattern is None:
            raise ValueError('max_len and split_pattern cannot be both None')
        # An overlap >= max_len would make chunking never terminate / overlap
        # a whole chunk, so reject it up front.
        if max_len is not None and overlap_len >= max_len:
            raise ValueError('overlap_len must be less than max_len')

        self.max_len = max_len
        self.overlap_len = overlap_len
        self.split_pattern = split_pattern
        self.tokenizer_name = tokenizer
        if tokenizer is not None:
            # Register the tokenizer with the model hub; the stored key is
            # used later to fetch the prepared processor on demand.
            self.model_key = prepare_model(
                model_type='api',
                model=tokenizer,
                return_processor=True,
                processor_config={'trust_remote_code': trust_remote_code})