Source code for data_juicer.format.load

import os

from data_juicer.format.formatter import FORMATTERS, BaseFormatter
from data_juicer.utils.file_utils import find_files_with_suffix


def load_formatter(dataset_path, text_keys=None, suffixes=None, add_suffix=False, **kwargs) -> BaseFormatter:
    """
    Load the appropriate formatter for different types of data formats.

    The files found under ``dataset_path`` are counted per extension, every
    formatter registered in ``FORMATTERS`` is scored by the number of files
    whose extension appears in its ``SUFFIXES``, and the highest-scoring
    formatter is instantiated with the extensions it can handle.

    :param dataset_path: Path to dataset file or dataset directory
    :param text_keys: key names of field that stores sample text.
        Default: None
    :param suffixes: the suffix of files that will be read. Default: None
    :param add_suffix: whether to add the file suffix to dataset meta.
        Default: False
    :param kwargs: extra keyword arguments forwarded unchanged to the
        selected formatter's constructor.
    :return: a dataset formatter.
    :raises IOError: if ``dataset_path`` exists but no file matching
        ``suffixes`` is found in it.
    :raises ValueError: if ``dataset_path`` is neither an existing file nor
        an existing directory, or if no registered formatter supports any
        of the extensions that were found.
    """
    if suffixes is None:
        suffixes = []

    # Only existing local files/directories can be handled here; reject
    # everything else up front instead of carrying an empty counter around.
    if not (os.path.isdir(dataset_path) or os.path.isfile(dataset_path)):
        raise ValueError(
            f"Unable to load the dataset from [{dataset_path}]. "
            f"It might be because Data-Juicer doesn't support "
            f"the format of this dataset, or the path of this "
            f"dataset is incorrect. Please check if it's a valid "
            f"dataset path and retry."
        )

    file_dict = find_files_with_suffix(dataset_path, suffixes)
    if not file_dict:
        raise IOError("Unable to find files matching the suffix from {}".format(dataset_path))

    # Number of files found for each extension.
    ext_num = {ext: len(file_dict[ext]) for ext in file_dict}

    # Score each registered formatter by how many of the found files it
    # declares support for.
    formatter_num = {
        name: sum(count for ext, count in ext_num.items() if ext in formatter.SUFFIXES)
        for name, formatter in FORMATTERS.modules.items()
    }

    # `default=None` keeps an empty registry from surfacing as the opaque
    # "max() arg is an empty sequence"; it is reported below instead.
    best_name = max(formatter_num, key=formatter_num.get, default=None)

    target_suffixes = set()
    if best_name is not None:
        target_suffixes = set(ext_num).intersection(FORMATTERS.modules[best_name].SUFFIXES)

    # The best score can still be zero, i.e. nobody supports these files.
    if not target_suffixes:
        raise ValueError(
            f"No suitable formatter found for {dataset_path}. "
            f"Supported extensions: "
            f"{[f.SUFFIXES for f in FORMATTERS.modules.values()]}"
        )

    return FORMATTERS.modules[best_name](
        dataset_path, text_keys=text_keys, suffixes=target_suffixes, add_suffix=add_suffix, **kwargs
    )