@OPERATORS.register_module('replace_content_mapper')
class ReplaceContentMapper(Mapper):
    """Mapper to replace all content in the text that matches a specific
    regular expression pattern with a designated replacement string.

    Several patterns may be given; they are applied to each text in order,
    each one operating on the output of the previous substitution.
    """

    _batched_op = True

    def __init__(self,
                 pattern: Union[str, List[str], None] = None,
                 repl: Union[str, List[str]] = '',
                 *args,
                 **kwargs):
        """
        Initialization method.

        :param pattern: regular expression pattern(s) to search for
            within text. A single string or a list of strings; when
            None, the mapper leaves samples untouched.
        :param repl: replacement string(s), default is empty string.
            A single string is used for every pattern; a list is
            matched to the patterns by position.
        :param args: extra args
        :param kwargs: extra args
        """
        super().__init__(*args, **kwargs)
        self.pattern = pattern
        self.repl = repl

        # Compile once here so process_batched never recompiles per sample.
        # Any other type (including None) leaves the list empty.
        if isinstance(pattern, str):
            self.compiled_patterns = [self._prepare_pattern(pattern)]
        elif isinstance(pattern, list):
            self.compiled_patterns = [
                self._prepare_pattern(p) for p in pattern
            ]
        else:
            self.compiled_patterns = []

    def _prepare_pattern(self, pattern: str) -> re.Pattern:
        """Prepare the regular expression pattern.

        A pattern written as the *text* of a Python raw-string literal
        (``r'...'`` or ``r"..."``, with matching quotes) has that wrapper
        stripped before compilation.

        :param pattern: the regular expression source.
        :return: the pattern compiled with ``re.DOTALL``.
        """
        if pattern is not None and len(pattern) > 2:
            single_quoted = (pattern.startswith("r'")
                             and pattern.endswith("'"))
            double_quoted = (pattern.startswith('r"')
                             and pattern.endswith('"'))
            if single_quoted or double_quoted:
                # Drop the leading r' / r" and the trailing quote.
                pattern = pattern[2:-1]
        return re.compile(pattern, flags=re.DOTALL)

    def _resolve_replacements(self):
        """Pair every compiled pattern with its replacement string.

        :return: list of ``(compiled_pattern, replacement)`` tuples in
            application order.
        :raises ValueError: if ``repl`` is a list with fewer entries than
            there are patterns.
        """
        if not isinstance(self.repl, list):
            # One replacement string shared by all patterns.
            return [(compiled, self.repl)
                    for compiled in self.compiled_patterns]

        if len(self.compiled_patterns) > len(self.repl):
            # Report the number of patterns, not len(self.pattern), which
            # would be a character count when a single string was given.
            raise ValueError(
                f'pattern length: {len(self.compiled_patterns)} '
                f'must be equal to '
                f'repl length: {len(self.repl)}')

        # Surplus replacements are tolerated and ignored (zip truncates).
        return list(zip(self.compiled_patterns, self.repl))

    def process_batched(self, samples):
        """Apply every configured substitution to each text in the batch.

        :param samples: batch mapping; the entry at ``self.text_key`` is
            iterated and its items are overwritten in place.
        :return: the same ``samples`` object.
        :raises ValueError: if ``repl`` is a list shorter than the number
            of patterns.
        """
        if self.pattern is None:
            return samples

        # The pairing does not depend on the sample, so resolve it once per
        # batch instead of once per (sample, pattern).
        substitutions = self._resolve_replacements()

        for idx, text in enumerate(samples[self.text_key]):
            for compiled, replacement in substitutions:
                text = compiled.sub(replacement, text)
            samples[self.text_key][idx] = text
        return samples