def query_cuda_info(query_key):
    """Query per-GPU info via ``nvidia-smi``; return a list with one value
    per GPU, or None if the query fails."""
    global NVSMI_REPORT
    # get cuda info using "nvidia-smi" command in MB
    try:
        nvidia_smi_output = subprocess.check_output([
            "nvidia-smi",
            f"--query-gpu={query_key}",
            "--format=csv,noheader,nounits",
        ]).decode("utf-8")
    except Exception as e:
        if "non-zero exit status 2" in str(e):
            err_msg = (f"The specified query_key [{query_key}] might not be "
                       f"supported by command nvidia-smi. Please check and "
                       f"retry!")
        elif "No such file or directory" in str(e):
            err_msg = ("Command nvidia-smi is not found. There might be no "
                       "GPUs on this machine.")
        else:
            err_msg = str(e)
        # report the nvidia-smi failure only once to avoid flooding the log
        if NVSMI_REPORT:
            logger.warning(err_msg)
            NVSMI_REPORT = False
        return None
    cuda_info_list = []
    for line in nvidia_smi_output.strip().split("\n"):
        cuda_info_list.append(int(line))
    return cuda_info_list
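
# Illustrative usage sketch (not part of the original module): assumes an
# NVIDIA driver with ``nvidia-smi`` on PATH; valid query keys are listed by
# ``nvidia-smi --help-query-gpu``.
#
#     free_mem = query_cuda_info("memory.free")   # e.g. [40536, 40536] (MB)
#     if free_mem is None:
#         pass  # nvidia-smi unavailable or the query_key is unsupported
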
def query_mem_info(query_key):
    """Query a field of system virtual memory via ``psutil``, in MB."""
    mem = psutil.virtual_memory()
    if query_key not in mem._fields:
        logger.warning(f"No such query key [{query_key}] for memory info. "
                       f"Should be one of {mem._fields}")
        return None
    val = round(getattr(mem, query_key) / (2**20), 2)  # in MB
    return val
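
# Illustrative usage sketch (not part of the original module): the valid
# query keys are the field names exposed by ``psutil.virtual_memory()``.
#
#     total_mb = query_mem_info("total")       # e.g. 64142.25 (MB)
#     avail_mb = query_mem_info("available")
#     bad = query_mem_info("not_a_field")      # logs a warning, returns None
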
def get_ray_gpu_count():
    """
    Get the number of available GPUs in the Ray cluster.

    Returns:
        int: Number of available GPUs, or 0 if no GPUs are available
            or Ray is not initialized.
    """
    try:
        if not ray.is_initialized():
            logger.warning("Ray is not initialized. Call ray.init() first.")
            return 0

        # Get available resources
        resources = ray.available_resources()
        gpu_count = int(resources.get("GPU", 0))

        if gpu_count == 0:
            logger.warning("No GPUs available in Ray cluster")
        else:
            logger.info(f"Found {gpu_count} GPUs in Ray cluster")

        return gpu_count
    except Exception as e:
        logger.error(f"Error getting Ray GPU count: {str(e)}")
        return 0
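
# Illustrative usage sketch (not part of the original module): assumes a
# local Ray runtime; on a machine without GPUs the call returns 0.
#
#     import ray
#     ray.init()
#     n_gpus = get_ray_gpu_count()   # e.g. 2
#     ray.shutdown()
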
def get_ray_gpu_memory():
    """
    Get the available GPU memory in the Ray cluster.

    Returns:
        dict: Dictionary mapping GPU indices to available memory in MB,
            or an empty dict if no GPUs are available.
    """
    try:
        if not ray.is_initialized():
            logger.warning("Ray is not initialized. Call ray.init() first.")
            return {}

        # Get available resources
        resources = ray.available_resources()
        gpu_count = int(resources.get("GPU", 0))

        if gpu_count == 0:
            logger.warning("No GPUs available in Ray cluster")
            return {}

        # Get memory info for each GPU
        gpu_memory = {}
        for i in range(gpu_count):
            # query_cuda_info returns one free-memory value per visible GPU
            memory = query_cuda_info("memory.free")
            if memory is not None and i < len(memory):
                gpu_memory[i] = memory[i]
                logger.info(f"GPU {i} has {memory[i]}MB free memory")
            else:
                logger.warning(f"Could not get memory info for GPU {i}")

        return gpu_memory
    except Exception as e:
        logger.error(f"Error getting Ray GPU memory: {str(e)}")
        return {}
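
# Illustrative usage sketch (not part of the original module): combines the
# Ray resource count with the nvidia-smi query above; assumes both Ray and
# nvidia-smi are available on this node.
#
#     import ray
#     ray.init()
#     free_by_gpu = get_ray_gpu_memory()   # e.g. {0: 40536, 1: 40536} (MB)
#     ray.shutdown()
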