Source code for data_juicer.ops.mapper.remove_header_mapper
# Some code here has been modified from:
# https://github.com/togethercomputer/RedPajama-Data/tree/rp_v1/
# --------------------------------------------------------
import regex as re
from ..base_op import OPERATORS, Mapper
[docs]
@OPERATORS.register_module("remove_header_mapper")
class RemoveHeaderMapper(Mapper):
    """Mapper that strips the text preceding the first header in LaTeX
    samples.

    Everything before the first sectioning command (``\\chapter``,
    ``\\part``, ``\\section``, ``\\subsection``, ``\\subsubsection``,
    ``\\paragraph`` or ``\\subparagraph``, optionally starred and with an
    optional ``[short title]``) is removed. The sectioning command itself
    and all text after it are kept.
    """

    _batched_op = True

    # LaTeX sectioning commands that are recognised as document headers.
    _HEADER_COMMANDS = (
        "chapter",
        "part",
        "section",
        "subsection",
        "subsubsection",
        "paragraph",
        "subparagraph",
    )

    def __init__(self, drop_no_head: bool = True, *args, **kwargs):
        """
        Initialization method.

        :param drop_no_head: whether to drop sample texts without
            headers. If True, a text in which no header is found is
            replaced by an empty string; if False, it is left unchanged.
        :param args: extra args
        :param kwargs: extra args
        """
        super().__init__(*args, **kwargs)
        # Build one alternative per command and join them with "|" so that
        # no separator can be forgotten between two alternatives.
        header_alternatives = "|".join(
            r"\\\b" + command + r"\b\*?(?:\[(.*?)\])?\{(.*?)\}"
            for command in self._HEADER_COMMANDS
        )
        # Group 1: the (lazily matched) text before the first header.
        # Group 2: the header command itself, which is kept on substitution.
        self.pattern = r"^(.*?)(" + header_alternatives + r")"
        self.drop_no_head = drop_no_head

    def process_batched(self, samples):
        """
        Remove the text preceding the first header of every text in a batch.

        :param samples: batch mapping in which ``samples[self.text_key]``
            is an indexable sequence of strings; it is modified in place.
        :return: the same ``samples`` object with processed texts.
        """
        for idx, text in enumerate(samples[self.text_key]):
            # A single pass both detects the header and strips the prefix:
            # the substitution count is 0 exactly when no header exists.
            stripped, num_subs = re.subn(
                pattern=self.pattern,
                repl=r"\2",
                string=text,
                flags=re.DOTALL,
            )
            if num_subs:
                samples[self.text_key][idx] = stripped
            elif self.drop_no_head:
                # Write the empty text back into the batch; assigning to the
                # loop variable alone would leave the sample untouched.
                samples[self.text_key][idx] = ""
        return samples