@OPERATORS.register_module("replace_content_mapper")
class ReplaceContentMapper(Mapper):
    """Replace text that matches regular expression pattern(s) with
    designated replacement string(s).

    Each entry of ``pattern`` is compiled once, at construction time, with
    the ``re.DOTALL`` flag so that ``.`` also matches newlines. During
    processing the compiled patterns are applied to every text in order,
    each one operating on the output of the previous one.

    ``repl`` may be a single string, which is used for every pattern, or a
    list, in which case the i-th pattern is replaced by the i-th entry.
    If ``repl`` is a list with fewer entries than there are patterns, a
    ``ValueError`` is raised while a sample is being processed. Extra
    entries in a ``repl`` list that is longer than the pattern list are
    ignored.

    This operator is batched: ``process_batched`` handles every text of a
    batch in a single call.
    """

    _batched_op = True

    def __init__(
        self,
        pattern: Union[str, List[str], None] = None,
        repl: Union[str, List[str]] = "",
        *args,
        **kwargs,
    ):
        """
        Initialization method.

        :param pattern: regular expression pattern(s) to search for
            within text. ``None`` turns the operator into a no-op.
        :param repl: replacement string(s), default is empty string
        :param args: extra args
        :param kwargs: extra args
        """
        super().__init__(*args, **kwargs)
        self.pattern = pattern
        self.repl = repl

        # Normalize the accepted input shapes to a list of raw patterns.
        # Anything that is neither a str nor a list yields no patterns.
        if isinstance(pattern, str):
            raw_patterns = [pattern]
        elif isinstance(pattern, list):
            raw_patterns = pattern
        else:
            raw_patterns = []

        # Compile once here so processing never recompiles.
        self.compiled_patterns = [
            self._prepare_pattern(raw) for raw in raw_patterns
        ]

    def _prepare_pattern(self, pattern: str) -> re.Pattern:
        """Compile ``pattern`` with ``re.DOTALL``.

        A pattern written as the text of a Python raw-string literal
        (an ``r`` prefix followed by a matching pair of single or double
        quotes) has that wrapper stripped before compilation.

        :param pattern: regular expression source text
        :return: the compiled pattern
        """
        is_raw_literal = (
            pattern is not None
            and len(pattern) > 2
            and (
                (pattern.startswith("r'") and pattern.endswith("'"))
                or (pattern.startswith('r"') and pattern.endswith('"'))
            )
        )
        if is_raw_literal:
            # Drop the leading r + quote and the trailing quote.
            pattern = pattern[2:-1]
        return re.compile(pattern, flags=re.DOTALL)

    def _get_replacement(self, index: int) -> str:
        """Return the replacement string for the pattern at ``index``.

        :param index: position of the pattern in ``compiled_patterns``
        :return: ``repl[index]`` when ``repl`` is a list, else ``repl``
        :raises ValueError: if ``repl`` is a list with no entry at
            ``index``
        """
        if not isinstance(self.repl, list):
            return self.repl
        if index >= len(self.repl):
            # Report the number of patterns, not len(self.pattern), which
            # would be a character count when pattern is a single string.
            raise ValueError(
                f"pattern length: {len(self.compiled_patterns)} "
                f"must be equal to "
                f"repl length: {len(self.repl)}"
            )
        return self.repl[index]

    def process_batched(self, samples):
        """Apply every compiled pattern, in order, to each text of the batch.

        The texts are read from and written back to
        ``samples[self.text_key]`` in place.
        NOTE(review): ``text_key`` is not set in this class; presumably it
        is provided by the ``Mapper`` base class -- confirm.

        :param samples: batch whose ``text_key`` entry is an indexable
            collection of strings
        :return: the same ``samples`` object, with replacements applied
        :raises ValueError: if ``repl`` is a list shorter than the number
            of patterns
        """
        if self.pattern is None:
            return samples

        for idx, text in enumerate(samples[self.text_key]):
            for i, compiled in enumerate(self.compiled_patterns):
                text = compiled.sub(self._get_replacement(i), text)
            samples[self.text_key][idx] = text

        return samples