def stats_to_number(s, reverse=True):
    """
    Convert a stats value, which can be a string or a list, to a float.

    Strings are parsed with ``float``; any other value is converted with
    ``np.asarray`` and reduced to its mean.

    :param s: the stats value: a numeric string, a number, or a
        sequence/array of numbers.
    :param reverse: selects the fallback returned when ``s`` is None,
        empty, or cannot be converted: ``-sys.maxsize`` if True,
        ``sys.maxsize`` otherwise.
    :return: the converted float, or the fallback value.
    """
    fallback = -sys.maxsize if reverse else sys.maxsize
    if s is None:
        return fallback
    try:
        if isinstance(s, str):
            return float(s)
        values = np.asarray(s)
        # Test emptiness on the array itself: comparing with ``== []`` is
        # ambiguous (or raises) for numpy inputs and misses empty tuples
        # and empty arrays, whose mean would be nan.
        if values.size == 0:
            return fallback
        return float(values.mean())
    except Exception:
        # Deliberate best-effort: any unparsable value maps to the fallback.
        return fallback
def dict_to_hash(input_dict: dict, hash_length=None):
    """
    Hash a dict to a hexadecimal string.

    The dict items are sorted, rendered with ``str`` and hashed with
    SHA-256, so the insertion order of the dict does not affect the result.

    :param input_dict: the given dict.
    :param hash_length: if truthy, the hex digest is cut to this many
        leading characters; otherwise the full digest is returned.
    :return: the (possibly truncated) hex digest.
    """
    serialized = str(sorted(input_dict.items()))
    digest = hashlib.sha256(serialized.encode()).hexdigest()
    return digest[:hash_length] if hash_length else digest
def nested_access(data, path, digit_allowed=True):
    """
    Access nested data using a dot-separated path.

    :param data: A dictionary or a list to access the nested data from.
    :param path: A dot-separated string representing the path to access.
        This can include numeric indices when accessing list elements.
    :param digit_allowed: Allow converting numeric path segments to
        integers before they are used as keys.
    :return: The value located at the specified path, or None (after
        logging a warning) if any step of the path cannot be accessed.
    """
    for key in path.split('.'):
        # Use isdecimal() rather than isdigit(): isdigit() also accepts
        # characters such as superscript digits that int() cannot parse,
        # which would raise an uncaught ValueError here.
        if digit_allowed and key.isdecimal():
            key = int(key)
        try:
            data = data[key]
        except Exception:
            logger.warning(f'Unaccessible dot-separated path: {path}!')
            return None
    return data
def is_string_list(var):
    """
    Return whether the var is a list of strings.

    :param var: input variable.
    :return: True if ``var`` is a list whose items are all ``str``
        (an empty list counts), False otherwise.
    """
    if not isinstance(var, list):
        return False
    for item in var:
        if not isinstance(item, str):
            return False
    return True
def avg_split_string_list_under_limit(str_list: list,
                                      token_nums: list,
                                      max_token_num=None):
    """
    Split a string list into several sub lists, keeping the token total of
    each sub list under ``max_token_num`` where possible and the totals of
    the sub lists similar to each other.

    :param str_list: input string list.
    :param token_nums: token num of each string in ``str_list``.
    :param max_token_num: max token num of each sub string list. If None,
        no splitting is done.
    :return: a list of sub lists. ``[str_list]`` is returned unchanged when
        no limit is given, when the two input lengths differ, or when the
        total already fits in the limit.
    """
    if max_token_num is None:
        return [str_list]

    if len(str_list) != len(token_nums):
        logger.warning('The length of str_list and token_nums must be equal!')
        return [str_list]

    total_tokens = sum(token_nums)
    if total_tokens <= max_token_num:
        return [str_list]

    # Target size per group: the total spread evenly over the group count.
    target = total_tokens / (total_tokens // max_token_num + 1)

    groups = []
    bucket, bucket_tokens = [], 0
    for text, count in zip(str_list, token_nums):
        if count > max_token_num:
            logger.warning(
                'Token num is greater than max_token_num in one sample!')

        # Close the current group first if this item would push it past
        # the hard limit (an empty group always accepts the item).
        exceeds_limit = bucket_tokens + count > max_token_num
        if bucket and exceeds_limit:
            groups.append(bucket)
            bucket, bucket_tokens = [], 0

        bucket.append(text)
        bucket_tokens += count

        # Close the group once it has passed the average target size.
        if bucket_tokens > target:
            groups.append(bucket)
            bucket, bucket_tokens = [], 0

    if bucket:
        groups.append(bucket)
    return groups