# Source code for data_juicer.utils.asset_utils

import json
import os

import requests
from loguru import logger

from .cache_utils import DATA_JUICER_ASSETS_CACHE

# Default directory to store auxiliary resources. This is an alias of
# DATA_JUICER_ASSETS_CACHE, which is imported from the sibling cache_utils
# module.
ASSET_DIR = DATA_JUICER_ASSETS_CACHE

# Download links of the default cached assets, keyed by asset name. Each
# value is the full URL of the JSON file for that asset.
ASSET_LINKS = dict(
    flagged_words=('https://dail-wlcb.oss-cn-wulanchabu.aliyuncs.com/'
                   'data_juicer/flagged_words.json'),
    stopwords=('https://dail-wlcb.oss-cn-wulanchabu.aliyuncs.com/'
               'data_juicer/stopwords.json'),
)


def load_words_asset(words_dir: str, words_type: str):
    """
    Load words from asset file(s) named `words_type`.

    Every ``.json`` file in `words_dir` whose name contains `words_type` is
    loaded and merged into one dict. If no words are found this way, the
    asset is downloaded from ASSET_LINKS (cached by data_juicer team) and
    saved as ``<words_dir>/<words_type>.json`` so later calls can load it
    locally.

    :param words_dir: directory that stores asset file(s); it is created
        if it does not exist
    :param words_type: name of target words assets
    :return: a dict that stores words assets, whose keys are language
        names, and the values are lists of words
    :raises KeyError: if a download is needed but `words_type` has no
        entry in ASSET_LINKS
    :raises requests.RequestException: if the download fails, times out
        or returns an HTTP error status
    """
    words_dict = {}
    os.makedirs(words_dir, exist_ok=True)

    # try to load words from `words_type` file(s); the listing is sorted
    # because os.listdir returns entries in arbitrary order and the order
    # of the merged word lists would otherwise vary between runs
    for filename in sorted(os.listdir(words_dir)):
        if not (filename.endswith('.json') and words_type in filename):
            continue
        # word lists are multilingual, so decode explicitly as UTF-8
        # instead of relying on the platform's default locale encoding
        with open(os.path.join(words_dir, filename), 'r',
                  encoding='utf-8') as file:
            loaded_words = json.load(file)
        for key, words in loaded_words.items():
            if key in words_dict:
                words_dict[key] += words
            else:
                words_dict[key] = words

    # if the asset file is not found, then download it from ASSET_LINKS
    if not words_dict:
        logger.info(f'Specified {words_dir} does not contain '
                    f'any {words_type} files in json format, now '
                    'download the one cached by data_juicer team')
        # a timeout keeps an unresponsive server from hanging the caller
        # forever; it bounds the wait for the connection and for each
        # chunk of data, not the total download time
        response = requests.get(ASSET_LINKS[words_type], timeout=60)
        # fail loudly on HTTP errors rather than trying to parse (and
        # then cache) an error page as if it were the asset
        response.raise_for_status()
        words_dict = response.json()

        # cache the asset file locally
        cache_path = os.path.join(words_dir, f'{words_type}.json')
        with open(cache_path, 'w', encoding='utf-8') as file:
            json.dump(words_dict, file)

    return words_dict