Source code for data_juicer.format.text_formatter

import os
from multiprocessing import Pool

import pdfplumber
from datasets import Dataset, concatenate_datasets, load_dataset
from docx import Document
from loguru import logger

from data_juicer.utils.cache_utils import DATA_JUICER_CACHE_HOME
from data_juicer.utils.file_utils import find_files_with_suffix

from .formatter import FORMATTERS, LocalFormatter, add_suffixes, unify_format


def extract_txt_from_docx(fn, tgt_path):
    """
    Extract text from a docx file and save to target path.

    :param fn: path to input docx file
    :param tgt_path: path to save text file.
    """
    doc = Document(fn)
    text = [para.text for para in doc.paragraphs if para.text.strip()]
    base_fn = os.path.basename(fn).lower().replace('.docx', '.txt')
    with open(os.path.join(tgt_path, base_fn), 'w') as f:
        f.write('\n'.join(text))
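
# A minimal usage sketch for the docx extractor above (the paths are
# hypothetical; the target directory must already exist):
#
#   os.makedirs('/tmp/extracted', exist_ok=True)
#   extract_txt_from_docx('/path/to/report.docx', '/tmp/extracted')
#   # -> writes /tmp/extracted/report.txt, one non-empty paragraph per line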


def extract_txt_from_pdf(fn, tgt_path):
    """
    Extract text from a pdf file and save to target path.

    :param fn: path to input pdf file
    :param tgt_path: path to save text file.
    """
    with pdfplumber.open(fn) as pdf:
        text = []
        for page in pdf.pages:
            # remove table regions detected by pdfplumber from this page
            tables = page.find_tables()
            for table in tables:
                page = page.outside_bbox(table.bbox)
            # remove page number from the end of each page
            page_text = page.extract_text()
            page_num = str(page.page_number)
            if page_text.rstrip().endswith(page_num):
                page_text = page_text.rstrip()[:-len(page_num)]
            if page_text.strip():
                text.append(page_text)
        base_fn = os.path.basename(fn).lower().replace('.pdf', '.txt')
        with open(os.path.join(tgt_path, base_fn), 'w') as f:
            f.write('\n'.join(text))
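
# A similar sketch for the pdf extractor (hypothetical paths): table regions
# and trailing page numbers are dropped, and the remaining page texts are
# joined with newlines into a single .txt file.
#
#   os.makedirs('/tmp/extracted', exist_ok=True)
#   extract_txt_from_pdf('/path/to/paper.pdf', '/tmp/extracted')
#   # -> writes /tmp/extracted/paper.txt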


@FORMATTERS.register_module()
class TextFormatter(LocalFormatter):
    """
    The class is used to load and format text-type files.

    e.g. `['.txt', '.pdf', '.cpp', '.docx']`
    """

    SUFFIXES = [
        '.docx', '.pdf', '.txt', '.md', '.tex', '.asm', '.bat', '.cmd', '.c',
        '.h', '.cs', '.cpp', '.hpp', '.c++', '.h++', '.cc', '.hh', '.C', '.H',
        '.cmake', '.css', '.dockerfile', '.f90', '.f', '.f03', '.f08', '.f77',
        '.f95', '.for', '.fpp', '.go', '.hs', '.html', '.java', '.js', '.jl',
        '.lua', '.markdown', '.php', '.php3', '.php4', '.php5', '.phps',
        '.phpt', '.pl', '.pm', '.pod', '.perl', '.ps1', '.psd1', '.psm1',
        '.py', '.rb', '.rs', '.sql', '.scala', '.sh', '.bash', '.command',
        '.zsh', '.ts', '.tsx', '.vb', 'Dockerfile', 'Makefile', '.xml',
        '.rst', '.m', '.smali'
    ]
    def __init__(self,
                 dataset_path,
                 suffixes=None,
                 add_suffix=False,
                 **kwargs):
        """
        Initialization method.

        :param dataset_path: a dataset file or a dataset directory
        :param suffixes: files with specified suffixes to be processed
        :param add_suffix: whether to add the file suffix to dataset meta info
        :param kwargs: extra args
        """
        super().__init__(
            dataset_path=dataset_path,
            suffixes=suffixes if suffixes else self.SUFFIXES,
            type='text',
            add_suffix=add_suffix,
            **kwargs,
        )
        self.dataset_path = dataset_path
        self.add_suffix = add_suffix
    def load_dataset(self, num_proc: int = 1, global_cfg=None) -> Dataset:
        """
        Load a dataset from local text-type files.

        :param num_proc: number of processes when loading the dataset
        :param global_cfg: the global cfg used in subsequent processes
        :return: unified_format_dataset
        """
        # extract text to cache directory
        extracted_dataset_path = os.path.join(
            DATA_JUICER_CACHE_HOME,
            os.path.basename(os.path.abspath(self.dataset_path)))

        for file_type in self.data_files:

            # extract text from docx or pdf files, and save as txt type
            if file_type == '.docx' or file_type == '.pdf':
                extracted_filetype_path = os.path.join(
                    extracted_dataset_path, file_type.strip('.'))
                if not os.path.exists(extracted_filetype_path):
                    os.makedirs(extracted_filetype_path)
                logger.info('Extracting text from {} files...'.format(
                    file_type.strip('.')))

                extract_func = extract_txt_from_docx \
                    if file_type == '.docx' else extract_txt_from_pdf
                pool = Pool(num_proc)
                for data_file in self.data_files[file_type]:
                    pool.apply_async(func=extract_func,
                                     args=(
                                         data_file,
                                         extracted_filetype_path,
                                     ))
                pool.close()
                pool.join()
                logger.info(f'Extracted text files are stored in directory '
                            f'{extracted_filetype_path}')

                # look for extracted txt files
                self.data_files[file_type] = find_files_with_suffix(
                    extracted_filetype_path, '.txt')['.txt']

        # load text dataset, one text file as one sample
        datasets = load_dataset('text',
                                data_files={
                                    key.strip('.'): self.data_files[key]
                                    for key in self.data_files
                                },
                                sample_by='document',
                                num_proc=num_proc,
                                **self.kwargs)

        # whether to add the file suffix to dataset meta info
        if self.add_suffix:
            logger.info('Add suffix info into dataset...')
            datasets = add_suffixes(datasets, num_proc)
        else:
            datasets = concatenate_datasets(
                [ds for _, ds in datasets.items()])
        return unify_format(datasets,
                            text_keys=self.text_keys,
                            num_proc=num_proc,
                            global_cfg=global_cfg)
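
# A minimal end-to-end sketch, assuming a local directory containing a mix of
# the suffixes above (the directory path and num_proc value are illustrative):
#
#   formatter = TextFormatter('/path/to/dataset_dir', add_suffix=True)
#   dataset = formatter.load_dataset(num_proc=4)
#   # .docx/.pdf files are first converted to .txt under
#   # DATA_JUICER_CACHE_HOME; every file then becomes one sample, and the
#   # result is unified into the standard data-juicer format.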