Source code for data_juicer.ops.mapper.clean_ip_mapper

from typing import Optional

import regex as re

from ..base_op import OPERATORS, Mapper


@OPERATORS.register_module("clean_ip_mapper")
class CleanIpMapper(Mapper):
    """Mapper to clean ipv4 and ipv6 address in text samples."""

    _batched_op = True

    def __init__(self, pattern: Optional[str] = None, repl: str = "", *args, **kwargs):
        """
        Initialization method.

        :param pattern: regular expression pattern to search for within
            text. When None, a built-in pattern is used that matches
            dotted-quad ipv4 addresses (each octet 0-255) and uncompressed
            eight-group ipv6 addresses. If the given string is itself
            written as a raw-string literal (``r'...'`` or ``r"..."``),
            the leading ``r`` and the surrounding quotes are stripped.
        :param repl: replacement string, default is empty string.
        :param args: extra args
        :param kwargs: extra args
        """
        super().__init__(*args, **kwargs)
        if pattern is None:
            # Adjacent raw literals are concatenated at compile time; the
            # resulting pattern is: <ipv4 alternative>|<ipv6 alternative>.
            self.pattern = (
                r"(?:(?:1[0-9][0-9]\.)|(?:2[0-4][0-9]\.)|"
                r"(?:25[0-5]\.)|(?:[1-9][0-9]\.)|(?:[0-9]\.))"
                r"{3}(?:(?:1[0-9][0-9])|(?:2[0-4][0-9])|"
                r"(?:25[0-5])|(?:[1-9][0-9])|(?:[0-9]))|"
                r"([\da-fA-F]{1,4}:){7}[\da-fA-F]{1,4}"  # ipv6
            )
        else:
            self.pattern = pattern
            # A pattern that arrives as the literal text r'...' or r"..."
            # (e.g. copied verbatim from Python source into a config value)
            # is unwrapped so that only the regex itself remains.
            single_quoted = pattern.startswith("r'") and pattern.endswith("'")
            double_quoted = pattern.startswith('r"') and pattern.endswith('"')
            if len(pattern) > 2 and (single_quoted or double_quoted):
                self.pattern = pattern[2:-1]
        self.repl = repl

    def process_batched(self, samples):
        """
        Replace every match of ``self.pattern`` in a batch of texts.

        :param samples: batch container; the sequence stored under
            ``self.text_key`` is iterated and its matching entries are
            overwritten in place. ``self.text_key`` is not set in this
            class -- presumably provided by the ``Mapper`` base class.
        :return: the same ``samples`` object that was passed in.
        """
        # Compile once per batch instead of resolving the pattern string
        # for every text inside the loop.
        compiled = re.compile(self.pattern, flags=re.DOTALL)
        for idx, text in enumerate(samples[self.text_key]):
            # subn scans and substitutes in a single pass and reports how
            # many replacements were made, so no separate search is needed.
            cleaned, num_subs = compiled.subn(self.repl, text)
            if num_subs:
                # Only entries that actually matched are written back.
                samples[self.text_key][idx] = cleaned
        return samples