# Source code for data_juicer.format.load
import os
from data_juicer.format.formatter import FORMATTERS, BaseFormatter
from data_juicer.utils.file_utils import find_files_with_suffix
# [docs]
def load_formatter(dataset_path,
                   text_keys=None,
                   suffixes=None,
                   add_suffix=False,
                   **kwargs) -> BaseFormatter:
    """
    Load the appropriate formatter for different types of data formats.

    For every registered formatter, the files found under
    ``dataset_path`` whose extension is listed in that formatter's
    ``SUFFIXES`` are counted; the formatter covering the most files is
    selected (the first registered one wins a tie).

    :param dataset_path: Path to dataset file or dataset directory
    :param text_keys: key names of field that stores sample text.
        Default: None
    :param suffixes: the suffix of files that will be read.
        Default: None
    :param add_suffix: whether to add the file suffix to dataset meta.
        Default: False
    :param kwargs: extra keyword arguments forwarded unchanged to the
        selected formatter.
    :return: a dataset formatter.
    :raises IOError: if ``dataset_path`` is an existing file or
        directory but no file matching ``suffixes`` is found for it.
    :raises ValueError: if ``dataset_path`` is neither a file nor a
        directory, or if no registered formatter supports any of the
        file extensions that were found.
    """
    if suffixes is None:
        suffixes = []

    # Guard clause: only local files/directories can be loaded here.
    if not (os.path.isdir(dataset_path) or os.path.isfile(dataset_path)):
        raise ValueError(
            f'Unable to load the dataset from [{dataset_path}]. '
            f'It might be because Data-Juicer doesn\'t support '
            f'the format of this dataset, or the path of this '
            f'dataset is incorrect. Please check if it\'s a valid '
            f'dataset path and retry.')

    file_dict = find_files_with_suffix(dataset_path, suffixes)
    if not file_dict:
        raise IOError(
            f'Unable to find files matching the suffix from {dataset_path}')

    # Number of discovered files per extension.
    ext_num = {ext: len(file_dict[ext]) for ext in file_dict}

    # Number of discovered files each registered formatter could read.
    registry = FORMATTERS.modules
    match_counts = {
        name: sum(num for ext, num in ext_num.items()
                  if ext in formatter_cls.SUFFIXES)
        for name, formatter_cls in registry.items()
    }

    # ``default=None`` keeps an empty registry from surfacing as a bare
    # "max() arg is an empty sequence" error; it falls through to the
    # descriptive ValueError below instead.
    best_name = max(match_counts, key=match_counts.get, default=None)
    if best_name is None:
        target_suffixes = set()
    else:
        target_suffixes = set(ext_num).intersection(
            registry[best_name].SUFFIXES)

    if not target_suffixes:
        raise ValueError(
            f'No suitable formatter found for {dataset_path}. '
            f'Supported extensions: '
            f'{[f.SUFFIXES for f in FORMATTERS.modules.values()]}')

    return registry[best_name](dataset_path,
                               text_keys=text_keys,
                               suffixes=target_suffixes,
                               add_suffix=add_suffix,
                               **kwargs)