Source code for data_juicer.ops.mapper.clean_links_mapper
# Some code here has been modified from:# https://github.com/kallewesterling/CleanText/# --------------------------------------------------------fromtypingimportOptionalimportregexasrefrom..base_opimportOPERATORS,Mapper
@OPERATORS.register_module('clean_links_mapper')
class CleanLinksMapper(Mapper):
    """Mapper to clean links like http/https/ftp in text samples."""

    _batched_op = True

    def __init__(self,
                 pattern: Optional[str] = None,
                 repl: str = '',
                 *args,
                 **kwargs):
        """
        Initialization method.

        Stores the link-matching regular expression on ``self.pattern``
        and the replacement string on ``self.repl``.

        :param pattern: regular expression pattern to search for within
            text. When it is None, a built-in link pattern is used. When
            it is longer than two characters and is written as ``r'...'``
            or ``r"..."``, the leading ``r`` + quote and the trailing
            quote are removed before it is stored.
        :param repl: replacement string, default is empty string.
        :param args: extra args
        :param kwargs: extra args
        """
        super().__init__(*args, **kwargs)

        if pattern is None:
            # Built-in link pattern, assembled from adjacent raw-string
            # literals (joined at compile time into a single string).
            self.pattern = (
                r'(?i)\b('
                r'(?:[a-z][\w-]+:(?:\/{1,3}|'
                r'[a-z0-9%])|www\d{0,3}[.]|'
                r'[a-z0-9.\-]+[.][a-z]{2,4}\/)'
                r'(?:[^\s()<>]+|\(([^\s()<>]+|'
                r'(\([^\s()<>]+\)))*\))'
                r'+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|'
                r'[^\s`!()\[\]{};:\'\".,<>?«»“”‘’])'
                r')'
            )
        else:
            # The pattern may arrive as the literal text of a raw-string
            # expression (e.g. "r'abc'"); in that case keep only the part
            # between the quotes. The length test comes first so that the
            # prefix/suffix checks are skipped for very short inputs.
            has_raw_wrapper = len(pattern) > 2 and (
                (pattern.startswith("r'") and pattern.endswith("'"))
                or (pattern.startswith('r"') and pattern.endswith('"'))
            )
            self.pattern = pattern[2:-1] if has_raw_wrapper else pattern

        self.repl = repl