def query_cuda_info(query_key):
    global NVSMI_REPORT
    # get cuda info using "nvidia-smi" command in MB
    try:
        nvidia_smi_output = subprocess.check_output([
            "nvidia-smi",
            f"--query-gpu={query_key}",
            "--format=csv,noheader,nounits",
        ]).decode("utf-8")
    except Exception as e:
        if "non-zero exit status 2" in str(e):
            err_msg = (f"The specified query_key [{query_key}] might not be "
                       f"supported by command nvidia-smi. Please check and "
                       f"retry!")
        elif "No such file or directory" in str(e):
            err_msg = ("Command nvidia-smi is not found. There might be no "
                       "GPUs on this machine.")
        else:
            err_msg = str(e)
        if NVSMI_REPORT:
            logger.warning(err_msg)
            NVSMI_REPORT = False
        return None

    cuda_info_list = []
    for line in nvidia_smi_output.strip().split("\n"):
        cuda_info_list.append(int(line))
    return cuda_info_list
def query_mem_info(query_key):
    mem = psutil.virtual_memory()
    if query_key not in mem._fields:
        logger.warning(f"No such query key [{query_key}] for memory info. "
                       f"Should be one of {mem._fields}")
        return None
    val = round(mem.__getattribute__(query_key) / (2**20), 2)  # in MB
    return val
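A minimal usage sketch for the two query helpers above, assuming the module-level subprocess, psutil, logger, and NVSMI_REPORT setup is in place. The query keys follow nvidia-smi's --query-gpu fields and psutil's virtual_memory() field names; adjust them to whatever you actually need.

# Usage sketch (illustration only): per-GPU used memory and host available memory.
used_per_gpu = query_cuda_info("memory.used")   # list of ints in MB, or None on failure
host_available = query_mem_info("available")    # float in MB, or None on a bad key

if used_per_gpu is not None:
    print(f"GPU memory used (MB): {used_per_gpu}")
print(f"Host memory available (MB): {host_available}")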
def _cuda_device_count():
    _torch_available = _is_package_available("torch")

    if check_and_initialize_ray():
        return int(ray_gpu_count())

    if _torch_available:
        return torch.cuda.device_count()

    try:
        nvidia_smi_output = subprocess.check_output(["nvidia-smi", "-L"],
                                                    text=True)
        all_devices = nvidia_smi_output.strip().split("\n")

        cuda_visible_devices = os.getenv("CUDA_VISIBLE_DEVICES")
        if cuda_visible_devices is not None:
            logger.warning(
                "CUDA_VISIBLE_DEVICES is ignored when torch is unavailable. "
                "All detected GPUs will be used.")

        return len(all_devices)
    except Exception:
        # nvidia-smi not found or other error
        return 0
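An illustrative check, not part of the module: the detection order above is Ray cluster first, then torch, then a raw "nvidia-smi -L" call, so the result reflects whichever backend is available in the current environment.

# Sketch only: report how many CUDA devices the fallback chain finds.
n_gpus = _cuda_device_count()
if n_gpus == 0:
    print("No CUDA devices detected; running on CPU only.")
else:
    print(f"Detected {n_gpus} CUDA device(s).")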
def available_memories() -> List[int]:
    """Available memory for each node in MB."""
    if check_and_initialize_ray():
        return ray_available_memories()
    return [int(psutil.virtual_memory().available / (1024**2))]
def available_gpu_memories() -> List[int]:
    """Available GPU memory of each GPU card for each alive node in MB."""
    if check_and_initialize_ray():
        return ray_available_gpu_memories()
    try:
        return query_cuda_info("memory.free")
    except Exception:
        return []
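A short sketch of how the two availability helpers might be combined, assuming they are importable from this module. On a Ray cluster the lists cover every alive node; otherwise they describe only the local machine.

# Sketch: gather per-node CPU and per-card GPU headroom before scheduling work.
cpu_free_mb = available_memories()        # one entry per node, in MB
gpu_free_mb = available_gpu_memories()    # one entry per GPU card, in MB

print(f"Free host memory per node (MB): {cpu_free_mb}")
if gpu_free_mb:
    print(f"Free GPU memory per card (MB): {gpu_free_mb}")
else:
    print("No GPU memory information available.")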