Source code for data_juicer.format.formatter

import os
from typing import List, Union

from datasets import Dataset, DatasetDict, concatenate_datasets, load_dataset
from loguru import logger

from data_juicer.utils.constant import Fields
from data_juicer.utils.file_utils import (find_files_with_suffix,
                                          is_absolute_path)
from data_juicer.utils.registry import Registry

FORMATTERS = Registry('Formatters')
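
# Illustrative sketch (not part of the original module) of how a concrete
# formatter plugs into the ``FORMATTERS`` registry; the ``register_module``
# decorator usage and the ``JsonFormatter`` subclass below are assumptions
# for illustration:
#
#     >>> @FORMATTERS.register_module('json')
#     ... class JsonFormatter(LocalFormatter):
#     ...     SUFFIXES = ['.json', '.jsonl']
#     ...
#     ...     def __init__(self, dataset_path, **kwargs):
#     ...         super().__init__(dataset_path, type='json', **kwargs)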


class BaseFormatter:
    """Base class to load dataset."""

    def load_dataset(self, *args) -> Dataset:
        raise NotImplementedError


class LocalFormatter(BaseFormatter):
    """The class is used to load a dataset from local files or a local
    directory."""

    def __init__(
        self,
        dataset_path: str,
        type: str,
        suffixes: Union[str, List[str], None] = None,
        text_keys: List[str] = None,
        add_suffix=False,
        **kwargs,
    ):
        """
        Initialization method.

        :param dataset_path: path to a dataset file or a dataset directory
        :param type: a packaged dataset module type (json, csv, etc.)
        :param suffixes: files with specified suffixes to be processed
        :param text_keys: key names of fields that store sample text
        :param add_suffix: whether to add the file suffix to dataset meta
            info
        :param kwargs: extra args
        """
        self.type = type
        self.kwargs = kwargs
        self.text_keys = text_keys
        self.data_files = find_files_with_suffix(dataset_path, suffixes)
        self.add_suffix = add_suffix

    def load_dataset(self, num_proc: int = 1, global_cfg=None) -> Dataset:
        """
        Load a dataset from a dataset file or a dataset directory, and
        unify its format.

        :param num_proc: number of processes when loading the dataset
        :param global_cfg: global cfg used in consequent processes
        :return: formatted dataset
        """
        datasets = load_dataset(self.type,
                                data_files={
                                    key.strip('.'): self.data_files[key]
                                    for key in self.data_files
                                },
                                num_proc=num_proc,
                                **self.kwargs)
        if self.add_suffix:
            logger.info('Add suffix info into dataset...')
            datasets = add_suffixes(datasets, num_proc)
        else:
            from data_juicer.core.data import NestedDataset
            datasets = NestedDataset(
                concatenate_datasets([ds for _, ds in datasets.items()]))
        ds = unify_format(datasets,
                          text_keys=self.text_keys,
                          num_proc=num_proc,
                          global_cfg=global_cfg)
        return ds
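
# Minimal usage sketch for ``LocalFormatter`` (illustrative; the directory,
# suffixes, and field names below are hypothetical):
#
#     >>> formatter = LocalFormatter(dataset_path='./demo-data',
#     ...                            type='json',
#     ...                            suffixes=['.jsonl'],
#     ...                            text_keys=['text'])
#     >>> ds = formatter.load_dataset(num_proc=2)  # unified NestedDataset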


class RemoteFormatter(BaseFormatter):
    """The class is used to load a dataset from a repository of the
    HuggingFace Hub."""

    def __init__(self,
                 dataset_path: str,
                 text_keys: List[str] = None,
                 **kwargs):
        """
        Initialization method.

        :param dataset_path: a dataset file or a dataset directory
        :param text_keys: key names of fields that store sample text
        :param kwargs: extra args
        """
        self.path = dataset_path
        self.text_keys = text_keys
        self.kwargs = kwargs

    def load_dataset(self, num_proc: int = 1, global_cfg=None) -> Dataset:
        """
        Load a dataset from HuggingFace, and unify its format.

        :param num_proc: number of processes when loading the dataset
        :param global_cfg: the global cfg used in consequent processes
        :return: formatted dataset
        """
        ds = load_dataset(self.path,
                          split='train',
                          num_proc=num_proc,
                          **self.kwargs)
        ds = unify_format(ds,
                          text_keys=self.text_keys,
                          num_proc=num_proc,
                          global_cfg=global_cfg)
        return ds
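
# Minimal usage sketch for ``RemoteFormatter`` (illustrative; the repository
# name below is hypothetical):
#
#     >>> formatter = RemoteFormatter('some-org/some-dataset',
#     ...                             text_keys=['text'])
#     >>> ds = formatter.load_dataset(num_proc=2)  # loads the 'train' split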


def add_suffixes(datasets: DatasetDict, num_proc: int = 1) -> Dataset:
    """
    Add a suffix field to datasets.

    :param datasets: a DatasetDict object
    :param num_proc: number of processes to add suffixes
    :return: datasets with suffix features.
    """
    logger.info('Add suffix column for dataset')
    from data_juicer.core.data import add_same_content_to_new_column
    for key, ds in datasets.items():
        if Fields.suffix not in ds.features:
            datasets[key] = ds.map(add_same_content_to_new_column,
                                   fn_kwargs={
                                       'new_column_name': Fields.suffix,
                                       'initial_value': '.' + key
                                   },
                                   num_proc=num_proc,
                                   desc='Adding new column for suffix')
    datasets = concatenate_datasets([ds for _, ds in datasets.items()])
    from data_juicer.core.data import NestedDataset
    return NestedDataset(datasets)
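
# Sketch of the behavior of ``add_suffixes`` (illustrative): each split key
# of the ``DatasetDict`` is taken as a file suffix and recorded per sample:
#
#     >>> from datasets import Dataset, DatasetDict
#     >>> dd = DatasetDict({'jsonl': Dataset.from_dict({'text': ['hi']})})
#     >>> merged = add_suffixes(dd)
#     >>> merged[0][Fields.suffix]
#     '.jsonl'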


def unify_format(
    dataset: Dataset,
    text_keys: Union[List[str], str] = 'text',
    num_proc: int = 1,
    global_cfg=None,
) -> Dataset:
    """
    Get a unified internal format, conduct the following modifications.

    1. check keys of dataset

    2. filter out those samples with empty or None text

    :param dataset: input dataset
    :param text_keys: original text key(s) of dataset.
    :param num_proc: number of processes for mapping
    :param global_cfg: the global cfg used in consequent processes,
        since cfg.text_key may be modified after unifying
    :return: unified_format_dataset
    """
    from data_juicer.core.data import NestedDataset
    if isinstance(dataset, DatasetDict):
        datasets = list(dataset.values())
        assert len(datasets) == 1, 'Please make sure the passed datasets ' \
                                   'contains only 1 dataset'
        dataset = datasets[0]
    assert isinstance(dataset, Dataset) or \
           isinstance(dataset, NestedDataset), \
           'Currently we only support processing data ' \
           'with the huggingface-Dataset format'

    if text_keys is None:
        text_keys = []

    if isinstance(text_keys, str):
        text_keys = [text_keys]

    logger.info('Unifying the input dataset formats...')

    dataset = NestedDataset(dataset)

    # 1. check text related keys
    for key in text_keys:
        if key not in dataset.features:
            err_msg = f'There is no key [{key}] in dataset. You might set ' \
                      f'wrong text_key in the config file for your dataset. ' \
                      f'Please check and retry!'
            logger.error(err_msg)
            raise ValueError(err_msg)

    # 2. filter out those samples with empty or None text
    # TODO: optimize the filtering operation for better efficiency
    logger.info(f'There are {len(dataset)} sample(s) in the original dataset.')

    def non_empty_text(sample, target_keys):
        for target_key in target_keys:
            # TODO: case for CFT, in which the len(sample[target_key]) == 0
            if sample[target_key] is None:
                # we filter out samples that contain at least one None
                # column, since the ops cannot handle them now
                return False
        return True

    dataset = dataset.filter(non_empty_text,
                             num_proc=num_proc,
                             fn_kwargs={'target_keys': text_keys})
    logger.info(f'{len(dataset)} samples left after filtering empty text.')

    # 3. convert relative paths to absolute paths
    if global_cfg:
        ds_dir = global_cfg.dataset_dir
        image_key = global_cfg.image_key
        audio_key = global_cfg.audio_key
        video_key = global_cfg.video_key

        data_path_keys = []
        if image_key in dataset.features:
            data_path_keys.append(image_key)
        if audio_key in dataset.features:
            data_path_keys.append(audio_key)
        if video_key in dataset.features:
            data_path_keys.append(video_key)
        if len(data_path_keys) == 0:
            # no image/audio/video path list in dataset, no need to convert
            return dataset

        if ds_dir == '':
            return dataset

        logger.info('Converting relative paths in the dataset to their '
                    'absolute version. (Based on the directory of input '
                    'dataset file)')

        # function to convert relative paths to absolute paths
        def rel2abs(sample, path_keys, dataset_dir):
            for path_key in path_keys:
                if path_key not in sample:
                    continue
                paths = sample[path_key]
                if not paths:
                    continue
                new_paths = [
                    path if os.path.isabs(path) else os.path.join(
                        dataset_dir, path) for path in paths
                ]
                sample[path_key] = new_paths
            return sample

        dataset = dataset.map(rel2abs,
                              num_proc=num_proc,
                              fn_kwargs={
                                  'path_keys': data_path_keys,
                                  'dataset_dir': ds_dir
                              })
    else:
        logger.warning('No global config passed into unify_format function. '
                       'Relative paths in the dataset might not be converted '
                       'to their absolute versions. Data of other modalities '
                       'might not be found by Data-Juicer.')

    return dataset
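
# Minimal usage sketch for ``unify_format`` (illustrative; without a
# ``global_cfg``, the path-conversion step is skipped and a warning is
# logged):
#
#     >>> from datasets import Dataset
#     >>> raw = Dataset.from_dict({'text': ['hello', None]})
#     >>> ds = unify_format(raw, text_keys='text')
#     >>> len(ds)  # the sample with None text is filtered out
#     1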


def load_formatter(dataset_path,
                   text_keys=None,
                   suffixes=None,
                   add_suffix=False,
                   **kwargs) -> BaseFormatter:
    """
    Load the appropriate formatter for different types of data formats.

    :param dataset_path: Path to dataset file or dataset directory
    :param text_keys: key names of fields that store sample text.
        Default: None
    :param suffixes: the suffixes of files that will be read. Default: None
    :param add_suffix: whether to add the file suffix to dataset meta info.
        Default: False
    :param kwargs: extra args passed to the formatter
    :return: a dataset formatter.
    """
    if suffixes is None:
        suffixes = []
    ext_num = {}
    if os.path.isdir(dataset_path) or os.path.isfile(dataset_path):
        file_dict = find_files_with_suffix(dataset_path, suffixes)
        if not file_dict:
            raise IOError(
                'Unable to find files matching the suffix from {}'.format(
                    dataset_path))
        for ext in file_dict:
            ext_num[ext] = len(file_dict[ext])

    # local dataset
    if ext_num:
        formatter_num = {}
        for name, formatter in FORMATTERS.modules.items():
            formatter_num[name] = 0
            for ext in ext_num:
                if ext in formatter.SUFFIXES:
                    formatter_num[name] += ext_num[ext]
        formatter = max(formatter_num, key=lambda x: formatter_num[x])
        target_suffixes = set(ext_num.keys()).intersection(
            set(FORMATTERS.modules[formatter].SUFFIXES))
        return FORMATTERS.modules[formatter](dataset_path,
                                             text_keys=text_keys,
                                             suffixes=target_suffixes,
                                             add_suffix=add_suffix,
                                             **kwargs)

    # try huggingface dataset hub
    elif not is_absolute_path(dataset_path) and dataset_path.count('/') <= 1:
        return RemoteFormatter(dataset_path, text_keys=text_keys, **kwargs)

    # no data
    else:
        raise ValueError(f'Unable to load the dataset from [{dataset_path}]. '
                         f'It might be because Data-Juicer doesn\'t support '
                         f'the format of this dataset, or the path of this '
                         f'dataset is incorrect. Please check if it\'s a '
                         f'valid dataset path and retry.')
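
# Dispatch sketch for ``load_formatter`` (illustrative; the paths below are
# hypothetical): a local file or directory resolves to the registered
# formatter whose ``SUFFIXES`` cover the most matching files, while a short
# hub-style name falls back to ``RemoteFormatter``:
#
#     >>> local_fmt = load_formatter('./demo-data', suffixes=['.jsonl'])
#     >>> remote_fmt = load_formatter('some-org/some-dataset')
#     >>> isinstance(remote_fmt, RemoteFormatter)
#     True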