Source code for data_juicer.ops.mapper.clean_links_mapper
# Some code here has been modified from:# https://github.com/kallewesterling/CleanText/# --------------------------------------------------------fromtypingimportOptionalimportregexasrefrom..base_opimportOPERATORS,Mapper
@OPERATORS.register_module("clean_links_mapper")
class CleanLinksMapper(Mapper):
    """Mapper to clean links like http/https/ftp in text samples."""

    _batched_op = True

    def __init__(self, pattern: Optional[str] = None, repl: str = "", *args, **kwargs):
        """
        Set up the link-matching pattern and its replacement text.

        :param pattern: regular expression pattern to search for within
            text. When ``None``, a built-in URL pattern is used. A value
            written as a raw-string literal (``r'...'`` or ``r"..."``)
            has that wrapper stripped before being stored.
        :param repl: replacement string, default is empty string.
        :param args: extra positional args forwarded to the base class
        :param kwargs: extra keyword args forwarded to the base class
        """
        super().__init__(*args, **kwargs)

        if pattern is None:
            # Built-in URL matcher, kept as separate fragments for
            # readability; joined they form one case-insensitive regex.
            default_fragments = (
                r"(?i)\b(",
                r"(?:[a-z][\w-]+:(?:\/{1,3}|",
                r"[a-z0-9%])|www\d{0,3}[.]|",
                r"[a-z0-9.\-]+[.][a-z]{2,4}\/)",
                r"(?:[^\s()<>]+|\(([^\s()<>]+|",
                r"(\([^\s()<>]+\)))*\))",
                r"+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|",
                r"[^\s`!()\[\]{};:\'\".,<>?«»“”‘’])",
                r")",
            )
            self.pattern = "".join(default_fragments)
        else:
            # A pattern given literally as r'...' or r"..." carries the
            # raw-string prefix and quotes as text; drop the two leading
            # characters and the trailing quote in that case.
            is_raw_literal = len(pattern) > 2 and (
                (pattern.startswith("r'") and pattern.endswith("'"))
                or (pattern.startswith('r"') and pattern.endswith('"'))
            )
            self.pattern = pattern[2:-1] if is_raw_literal else pattern

        self.repl = repl