# Source code for data_juicer.format.load

from .formatter import BaseFormatter
from .mixture_formatter import MixtureFormatter



# [docs]
def load_formatter(dataset_path,
                   generated_dataset_config=None,
                   text_keys=None,
                   suffixes=None,
                   add_suffix=False,
                   **kwargs) -> BaseFormatter:
    """
    Load mixture formatter for multiple different data formats with an optional
    weight(default 1.0) according to their formats.

    The function works in one of two modes:

    * If ``generated_dataset_config`` is provided (truthy), the formatter
      class registered under its ``type`` entry is looked up in
      ``FORMATTERS`` and instantiated with the remaining config entries,
      overridden by ``kwargs``. In this mode ``dataset_path``, ``text_keys``,
      ``suffixes`` and ``add_suffix`` are not used.
    * Otherwise a ``MixtureFormatter`` is created from ``dataset_path`` and
      the other arguments.

    :param dataset_path: path to a dataset file or a dataset directory
    :param generated_dataset_config: Configuration used to create a dataset.
        The dataset will be created from this configuration if provided.
        It must contain the `type` field to specify the dataset name.
    :param text_keys: key names of field that stores sample text.
        Default: None
    :param suffixes: files with specified suffixes to be processed.
        Default: None, which is treated as an empty list.
    :param add_suffix: whether to add the file suffix to dataset meta
        info
    :param kwargs: extra keyword arguments forwarded to the formatter that
        is created.
    :return: a dataset formatter.
    :raises AssertionError: if ``generated_dataset_config`` is provided but
        is not a dict containing the `type` field.
    """
    if generated_dataset_config:
        # Explicit raise instead of an `assert` statement so the validation
        # is still performed when Python runs with -O. AssertionError is
        # kept so existing callers observe the same exception type.
        if not (isinstance(generated_dataset_config, dict)
                and 'type' in generated_dataset_config):
            raise AssertionError(
                'generated_dataset_config must be a dict that contains the '
                '`type` field')

        # Work on a shallow copy so the caller's config dict is not modified
        # by the pop/update below.
        args = generated_dataset_config.copy()
        obj_name = args.pop('type')
        # Explicit keyword arguments take precedence over config entries.
        args.update(kwargs)

        from .formatter import FORMATTERS
        return FORMATTERS.modules[obj_name](**args)

    # Create a fresh list on every call: a `[]` default in the signature
    # would be one shared object reused (and possibly mutated) across calls.
    if suffixes is None:
        suffixes = []

    formatter = MixtureFormatter(dataset_path=dataset_path,
                                 text_keys=text_keys,
                                 suffixes=suffixes,
                                 add_suffix=add_suffix,
                                 **kwargs)
    return formatter