Source code for data_juicer.ops.mapper.clean_links_mapper

# Some code here has been modified from:
# https://github.com/kallewesterling/CleanText/
# --------------------------------------------------------
from typing import Optional

import regex as re

from ..base_op import OPERATORS, Mapper


@OPERATORS.register_module("clean_links_mapper")
class CleanLinksMapper(Mapper):
    """Mapper to clean links like http/https/ftp in text samples."""

    _batched_op = True

    def __init__(self, pattern: Optional[str] = None, repl: str = "", *args, **kwargs):
        """
        Initialization method.

        :param pattern: regular expression pattern to search for within
            text. When ``None``, a built-in link-matching pattern is used.
            A value written like a Python raw-string literal (``r'...'``
            or ``r"..."``) has the ``r`` prefix and the surrounding
            quotes stripped before use.
        :param repl: replacement string, default is empty string.
        :param args: extra args
        :param kwargs: extra args
        """
        super().__init__(*args, **kwargs)
        if pattern is None:
            # Default link pattern, assembled from adjacent raw literals so
            # the resulting string is identical to the piecewise original.
            self.pattern = (
                r"(?i)\b("
                r"(?:[a-z][\w-]+:(?:\/{1,3}|"
                r"[a-z0-9%])|www\d{0,3}[.]|"
                r"[a-z0-9.\-]+[.][a-z]{2,4}\/)"
                r"(?:[^\s()<>]+|\(([^\s()<>]+|"
                r"(\([^\s()<>]+\)))*\))"
                r"+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|"
                r"[^\s`!()\[\]{};:\'\".,<>?«»“”‘’])"
                r")"
            )
        else:
            self.pattern = pattern
            # Unwrap patterns passed as the *text* of a raw-string literal,
            # i.e. r'...' or r"..." with matching opening and closing quotes.
            # The length guard rejects the degenerate two-character inputs
            # (r' and r") where the same quote would be both opener and closer.
            if (
                len(pattern) > 2
                and pattern[0] == "r"
                and pattern[1] in ("'", '"')
                and pattern[-1] == pattern[1]
            ):
                self.pattern = pattern[2:-1]
        self.repl = repl

    def process_batched(self, samples):
        """
        Replace every match of ``self.pattern`` in the batch's texts.

        :param samples: batch mapping whose ``self.text_key`` entry is an
            indexable, assignable sequence of strings (``text_key`` is not
            set in this class; presumably provided by ``Mapper``).
        :return: the same ``samples`` object; entries in which at least one
            match was found are overwritten in place with the substituted
            text, all other entries are left untouched.
        """
        # Compile once per batch instead of resolving the pattern string
        # twice (search + sub) for every sample.
        link_re = re.compile(self.pattern, flags=re.DOTALL)
        for idx, text in enumerate(samples[self.text_key]):
            # subn reports the number of replacements, so a single scan both
            # detects matches and produces the cleaned text.
            cleaned, n_subs = link_re.subn(self.repl, text)
            if n_subs:
                samples[self.text_key][idx] = cleaned
        return samples