# Source code for data_juicer.format.load

from .formatter import BaseFormatter
from .mixture_formatter import MixtureFormatter


def load_formatter(dataset_path,
                   generated_dataset_config=None,
                   text_keys=None,
                   suffixes=None,
                   add_suffix=False,
                   **kwargs) -> BaseFormatter:
    """
    Load mixture formatter for multiple different data formats with an
    optional weight(default 1.0) according to their formats.

    When `generated_dataset_config` is provided (and truthy), the formatter
    class registered under its `type` entry is instantiated from the
    remaining config entries merged with `kwargs`; in that case
    `dataset_path`, `text_keys`, `suffixes` and `add_suffix` are not
    forwarded. Otherwise a `MixtureFormatter` is built from the arguments.

    :param dataset_path: path to a dataset file or a dataset directory
    :param generated_dataset_config: Configuration used to create a dataset.
        The dataset will be created from this configuration if provided.
        It must contain the `type` field to specify the dataset name.
    :param text_keys: key names of field that stores sample text.
        Default: None
    :param suffixes: files with specified suffixes to be processed.
        Default: None, which is treated as an empty list.
    :param add_suffix: whether to add the file suffix to dataset meta info
    :param kwargs: extra keyword arguments forwarded to the formatter that
        gets constructed. For a generated dataset they override entries of
        the same name in `generated_dataset_config`.
    :return: a dataset formatter.
    """
    if generated_dataset_config:
        assert isinstance(generated_dataset_config,
                          dict) and 'type' in generated_dataset_config, \
            'generated_dataset_config must be a dict with a `type` field'
        # Work on a copy so the caller's config dict is left untouched.
        args = generated_dataset_config.copy()
        obj_name = args.pop('type')
        args.update(kwargs)
        # Imported at call time rather than at module level, as in the
        # original code.
        from .formatter import FORMATTERS
        return FORMATTERS.modules[obj_name](**args)

    # Use a fresh list per call instead of a shared mutable default, so that
    # nothing downstream can leak state between calls.
    if suffixes is None:
        suffixes = []

    formatter = MixtureFormatter(dataset_path=dataset_path,
                                 text_keys=text_keys,
                                 suffixes=suffixes,
                                 add_suffix=add_suffix,
                                 **kwargs)
    return formatter