class CheckpointManager:
    """
    Save the latest version of a dataset to a checkpoint directory, or load
    it back from there, a bit like cache management.

    Rerunning the same config reloads the checkpoint and skips the ops that
    were recorded with it. If the recorded ops are not an exact prefix of
    the configured process list (for example because an op or one of its
    args was changed), all ops are rerun from the beginning.
    """

    def __init__(self, ckpt_dir, original_process_list, num_proc=1):
        """
        Initialization method.

        :param ckpt_dir: path to save and load checkpoint
        :param original_process_list: process list in config
        :param num_proc: number of process workers when saving dataset
        """
        self.ckpt_dir = ckpt_dir
        # The dataset snapshot and the op record live side by side under
        # the checkpoint directory.
        self.ckpt_ds_dir = os.path.join(self.ckpt_dir, 'latest')
        self.ckpt_op_record = os.path.join(self.ckpt_dir, 'ckpt_op.json')
        self.process_list = original_process_list
        self.num_proc = num_proc
        self.op_record = []

        # Must run last: check_ckpt() may replace both self.op_record and
        # self.process_list when a usable checkpoint is found.
        self.ckpt_available = self.check_ckpt()

    def get_left_process_list(self):
        """
        Get the process list of ops that are still left to run.

        When a checkpoint is available, the ops recorded in it have already
        been removed from the process list; otherwise the list is the one
        passed to the constructor.

        :return: process list of left ops
        """
        return self.process_list

    def check_ckpt(self):
        """
        Check if checkpoint is available.

        When it is not, the checkpoint directory is created (if missing) so
        that a later save has somewhere to write.

        :return: True when checkpoint is available, else False
        """
        # isdir()/isfile() already imply existence, so no separate exists()
        # calls are needed. The op comparison only runs when both the
        # dataset directory and the op record file are present.
        if (os.path.isdir(self.ckpt_ds_dir)
                and os.path.isfile(self.ckpt_op_record)
                and self.check_ops_to_skip()):
            return True

        os.makedirs(self.ckpt_dir, exist_ok=True)
        return False

    def record(self, op_cfg: dict):
        """
        Append an op config (op name and args) to the op record.

        The record is what gets compared with the process list from config
        to decide whether a checkpoint can be reused.

        :param op_cfg: config of the op that has just been applied
        """
        self.op_record.append(op_cfg)

    def check_ops_to_skip(self):
        """
        Check which ops need to be skipped in the process list.

        If the op record list from the checkpoint is the same as the prefix
        part of the process list, these ops are removed from the process
        list and processing can start from the checkpoint. Otherwise the op
        record is cleared and the process list is kept unchanged, so the
        original dataset is processed from scratch. An op record file that
        cannot be read or does not hold a JSON list is treated the same way
        instead of raising.

        :return: whether to skip some ops or not
        """
        # load op records
        try:
            with open(self.ckpt_op_record, 'r', encoding='utf-8') as fin:
                loaded_record = json.load(fin)
        except (OSError, ValueError) as err:
            # json.JSONDecodeError and UnicodeDecodeError are both
            # ValueError subclasses; a truncated record (e.g. from an
            # interrupted save) must not abort the whole run.
            logger.warning(f'Failed to load checkpoint op record from '
                           f'[{self.ckpt_op_record}]: {err}. All ops will '
                           f'be processed from the beginning.')
            self.op_record = []
            return False

        if not isinstance(loaded_record, list):
            # record() appends to self.op_record, so anything but a list
            # would only fail later and further away from the cause.
            logger.warning(f'Checkpoint op record in '
                           f'[{self.ckpt_op_record}] is not a list. All '
                           f'ops will be processed from the beginning.')
            self.op_record = []
            return False

        self.op_record = loaded_record

        # check whether the op records are exactly the same
        # with prefix of process list
        # 1. same: remove these ops from process list
        # 2. different: cleanup op record, and keep process list unchanged
        recorded_op_num = len(self.op_record)
        process_op_num = len(self.process_list)
        if process_op_num < recorded_op_num:
            logger.warning(
                f'Current config ops ({process_op_num}) are fewer than '
                f'checkpoint ops ({recorded_op_num}). Cannot reuse '
                f'checkpoint; all ops will be processed from the '
                f'beginning.')
            self.op_record = []
            return False

        # First (recorded, configured) pair that differs, or None when the
        # record matches the prefix of the process list exactly.
        prefix_process = self.process_list[:recorded_op_num]
        first_diff = next(
            ((record_op, config_op)
             for record_op, config_op in zip(self.op_record, prefix_process)
             if record_op != config_op),
            None,
        )

        if first_diff is not None:
            dif1, dif2 = first_diff
            logger.warning(f'Processed ops of checkpoint are different from '
                           f'current configs: checkpoint-{dif1} vs. config-'
                           f'{dif2}. All ops will be processed from the '
                           f'beginning.')
            self.op_record = []
            return False

        for op in self.op_record:
            # The first key of an op config is used as the op name. This is
            # only used for the log message, so an empty config is logged
            # as None rather than raising.
            op_name = next(iter(op), None)
            logger.info(f'Skip op [{op_name}].')
        self.process_list = self.process_list[recorded_op_num:]
        return True

    def save_ckpt(self, ds):
        """
        Save dataset to checkpoint directory and dump processed ops list.

        :param ds: input dataset to save
        """
        left_sample_num = len(ds)
        # Never use more workers than samples, but always request at least
        # one so that an empty dataset does not ask for zero workers.
        num_workers = max(1, min(self.num_proc, left_sample_num))
        ds.save_to_disk(self.ckpt_ds_dir, num_proc=num_workers)

        with open(self.ckpt_op_record, 'w', encoding='utf-8') as fout:
            json.dump(self.op_record, fout)

    def load_ckpt(self):
        """
        Load dataset from the checkpoint directory.

        :return: the dataset stored in the checkpoint directory
        """
        # Imported here rather than at module level, as in the original.
        from data_juicer.core.data import NestedDataset
        ds = NestedDataset.load_from_disk(self.ckpt_ds_dir)
        return ds