检测GPU显卡是否空闲的排队程序

本程序额外支持检测“部分显卡空闲”的情况：只要有足够数量的显卡空闲，就可以直接使用这部分空闲显卡运行程序，而不必等待全部显卡空闲，更加实用。
测试GPU为3090，不同型号可能略有差别

import os
import sys
import time
from IPython import embed
# Device-selection prefix listing all eight GPUs.
# NOTE(review): no function in this file reads this module-level value.
CUDA_cmd = 'CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7'
# Shell command that narrow_setup() launches via os.system; replace with the real job.
cmd = 'echo hello world'
# Number of GPUs to inspect; default for gpu_info() and narrow_setup().
NUM_GPU = 8

def gpu_info(num_gpu=NUM_GPU):
    """Read current power draw and memory usage of the first ``num_gpu`` GPUs.

    Runs ``nvidia-smi | grep %`` and parses one table row per GPU.
    NOTE(review): the parsing assumes the classic nvidia-smi table layout in
    which the second '|'-separated cell ends with ``<used>W / <cap>W`` and the
    third starts with ``<used>MiB / <total>MiB``; other driver versions or
    GPU models may print a different layout.

    Args:
        num_gpu: Number of GPU rows to parse, starting from GPU 0.

    Returns:
        A ``(power, memory)`` tuple of two lists of ints, each of length
        ``num_gpu``: power draw in watts and used memory in MiB.

    Raises:
        IndexError: If nvidia-smi reported fewer GPU rows than ``num_gpu``
            (for example when nvidia-smi is unavailable).
        ValueError: If a row does not contain integer readings.
    """
    # Use a context manager so the pipe to the subprocess is always closed.
    with os.popen('nvidia-smi|grep %') as pipe:
        rows = [line for line in pipe.read().split('\n') if line.strip()]

    if len(rows) < num_gpu:
        raise IndexError(
            'nvidia-smi reported %d GPU row(s), expected at least %d'
            % (len(rows), num_gpu)
        )

    power = []
    memory = []
    for row in rows[:num_gpu]:
        cells = row.split('|')
        # Third cell looks like '      5MiB / 24268MiB'; keep the used part.
        used_memory = cells[2].split('/')[0].split('MiB')[0]
        # Second cell ends with '20W / 350W'; the last token before '/' is
        # the current draw, and the trailing 'W' is dropped.
        used_power = cells[1].split('/')[0].split()[-1][:-1]
        memory.append(int(used_memory))
        power.append(int(used_power))
    return power, memory

def _find_idle_gpus(power_list, memory_list, num_gpu, max_memory=1000, max_power=30):
    """Return indices of GPUs using less than max_memory MiB and max_power W."""
    idle = []
    for index in range(num_gpu):
        if memory_list[index] < max_memory and power_list[index] < max_power:
            print('gpu %d is empty' % index)
            idle.append(index)
    return idle


def narrow_setup(interval=2, num_gpu=NUM_GPU):
    """Wait until at least two GPUs are idle, then run ``cmd`` on them.

    GPU readings are polled through ``gpu_info``. A GPU counts as idle when
    it uses less than 1000 MiB of memory and draws less than 30 W. While too
    few GPUs are idle, a one-line status for each GPU is shown in turn on
    stdout. Once enough GPUs are idle, ``CUDA_VISIBLE_DEVICES`` is set to the
    idle indices in this process's environment and the module-level ``cmd``
    is run with ``os.system``, so the launched shell inherits the selection.

    Args:
        interval: Seconds each GPU's status line stays on screen while
            waiting.
        num_gpu: Number of GPUs to monitor, starting from GPU 0.

    Returns:
        None.
    """
    required_gpus = 2
    tick = 0
    warned = False

    while True:
        # Pass num_gpu through so a caller-supplied value is honoured.
        gpu_power_list, gpu_memory_list = gpu_info(num_gpu)
        available_gpu = _find_idle_gpus(gpu_power_list, gpu_memory_list, num_gpu)
        # Decide right after polling, while the readings are still fresh.
        if len(available_gpu) >= required_gpus:
            break

        if not warned:
            print('no enough empty gpus now')
            warned = True

        # Cycle through the GPUs, showing one status line per GPU; this
        # doubles as the delay between two polls.
        for j in range(num_gpu):
            tick = tick % 5
            symbol = 'monitoring: ' + '>' * tick + ' ' * (10 - tick - 1) + '|'
            gpu_power_str = f'gpu {j} power:{gpu_power_list[j]} W |'
            gpu_memory_str = f'gpu {j} memory:{gpu_memory_list[j]} MiB |'

            sys.stdout.write('\r' + gpu_memory_str + ' ' + gpu_power_str + ' ' + symbol)
            sys.stdout.flush()
            time.sleep(interval)
            tick += 1

    print('gpu', available_gpu, 'is empty, ready to run')

    # A variable assigned in one os.system() call does not survive into the
    # next, because each call starts its own shell. Export the device list
    # through this process's environment instead so that `cmd` inherits it.
    env_name, _, env_value = generate_cuda_visible_devices_string(available_gpu).partition('=')
    os.environ[env_name] = env_value

    print('\n' + cmd)
    os.system(cmd)

def generate_cuda_visible_devices_string(int_list):
    """Build a ``CUDA_VISIBLE_DEVICES=<i>,<j>,...`` assignment string.

    Args:
        int_list: Iterable of GPU indices; each is converted with ``str``.

    Returns:
        The text ``CUDA_VISIBLE_DEVICES=`` followed by the indices joined
        with commas (nothing after ``=`` when the iterable is empty).
    """
    device_ids = map(str, int_list)
    return 'CUDA_VISIBLE_DEVICES=' + ','.join(device_ids)

if __name__ == '__main__':
    # Script entry point: run the GPU availability check with default settings.
    narrow_setup()
    # Debug aid: call gpu_info() here instead to inspect the parsed readings.
    # gpu_info()
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69

相关阅读:
1355C - Count Triangles，2021CCPC桂林 C，D
结合Elementplus源码讲解BEM的使用
【性能测试】数据库索引问题定位/分析+ 架构优化+ SQL优化+ 代码优化（详全）
Redis Desktop Manager安装和使用
MFC Windows 程序设计[247]之多彩编辑框(附源码)
《系统架构设计师教程》第一章：绪论
selenium安装和python中基本使用
道可云元宇宙每日资讯｜第二届世界元宇宙大会将在嘉定安亭举行
Fedora 36 dnf 安装ModSecurity和 OWASP 核心规则集
HTTPS基础概念

原文地址：https://blog.csdn.net/bj_zhb/article/details/132898105