import json
import os

import requests
from loguru import logger

from .cache_utils import DATA_JUICER_ASSETS_CACHE

# Directory in which auxiliary resources are stored by default.
ASSET_DIR = DATA_JUICER_ASSETS_CACHE

# Download links of the remotely cached assets, keyed by asset name.
ASSET_LINKS = {
    'flagged_words': (
        'https://dail-wlcb.oss-cn-wulanchabu.aliyuncs.com/'
        'data_juicer/flagged_words.json'
    ),
    'stopwords': (
        'https://dail-wlcb.oss-cn-wulanchabu.aliyuncs.com/'
        'data_juicer/stopwords.json'
    ),
}
def load_words_asset(words_dir: str, words_type: str, timeout: float = 60.0):
    """
    Load words from the asset file(s) named after `words_type`. If no valid
    asset file is found, download it from ASSET_LINKS cached by the
    data_juicer team and store a local copy in `words_dir`.

    :param words_dir: directory that stores asset file(s); it is created
        if it does not exist yet
    :param words_type: name of target words assets; every `.json` file in
        `words_dir` whose name contains it is loaded and merged
    :param timeout: seconds to wait for the remote server when the asset
        has to be downloaded
    :return: a dict that stores words assets, whose keys are language
        names, and the values are lists of words
    :raises ValueError: if no local asset is found and `words_type` has no
        entry in ASSET_LINKS
    :raises requests.HTTPError: if the download answers with an error
        status code
    """
    words_dict = {}
    os.makedirs(words_dir, exist_ok=True)

    # try to load words from `words_type` file(s); the listing is sorted so
    # that the merge order does not depend on the file system
    for filename in sorted(os.listdir(words_dir)):
        if not (filename.endswith('.json') and words_type in filename):
            continue
        # assets contain words of many languages, so never rely on the
        # platform default encoding
        with open(os.path.join(words_dir, filename), 'r',
                  encoding='utf-8') as file:
            loaded_words = json.load(file)
        for key, words in loaded_words.items():
            if key in words_dict:
                words_dict[key] += words
            else:
                words_dict[key] = words

    # if the asset file is not found, then download it from ASSET_LINKS
    if not words_dict:
        logger.info(f'Specified {words_dir} does not contain '
                    f'any {words_type} files in json format, now '
                    'download the one cached by data_juicer team')
        if words_type not in ASSET_LINKS:
            raise ValueError(f'{words_type} is not in remote server.')
        response = requests.get(ASSET_LINKS[words_type], timeout=timeout)
        # fail loudly on 4xx/5xx instead of parsing or caching an error body
        response.raise_for_status()
        words_dict = response.json()

        # cache the asset file locally
        cache_path = os.path.join(words_dir, f'{words_type}.json')
        with open(cache_path, 'w', encoding='utf-8') as file:
            json.dump(words_dict, file)

    return words_dict