File size: 3,192 Bytes
c19ca42
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
# pynvml is treated as an optional dependency: if the import fails, nvml_ok is
# False and get_nvml() returns an empty list instead of raising.
try:
    import pynvml as nv
    nvml_ok = True
except ImportError:
    nvml_ok = False

# Tracks whether get_nvml() has already gone through its one-time
# nv.nvmlInit() step, so that step is not repeated on later calls.
nvml_initialized = False


def get_reason(val):
    """Describe a clock-throttle bitmask as human-readable text.

    Args:
        val: integer bitmask; every bit that is set and appears in the table
            below contributes its label to the result.

    Returns:
        The matching labels joined with ', ' in ascending bit order, or 'ok'
        when none of the known bits is set.
    """
    # (bit, label) pairs, kept in ascending bit order so the output is stable.
    flag_labels = (
        (1, 'gpu idle'),
        (2, 'applications clocks setting'),
        (4, 'sw power cap'),
        (8, 'hw slowdown'),
        (16, 'sync boost'),
        (32, 'sw thermal slowdown'),
        (64, 'hw thermal slowdown'),
        (128, 'hw power brake slowdown'),
        (256, 'display clock setting'),
    )
    active = []
    for bit, label in flag_labels:
        if bit & val:
            active.append(label)
    if not active:
        return 'ok'
    return ', '.join(active)

def get_nvml():
    """Collect a status snapshot for every GPU reported by NVML.

    NVML is initialized on the first call. Any exception raised while
    initializing or querying is swallowed: ``nvml_ok`` is set to False, an
    empty list is returned, and all later calls return an empty list without
    touching NVML again.

    Returns:
        A list with one dict per device, with the keys ``name``, ``version``
        (cuda, driver, vbios, rom, capabilities), ``pci`` (link, width, busid,
        deviceid), ``memory`` (total, free, used), ``clock`` (gpu, sm, memory;
        each a ``[current, max]`` pair), ``load`` (gpu, memory, temp, fan),
        ``power`` (``[usage, enforced limit]``) and ``state`` (text produced
        by ``get_reason``). The list is empty when pynvml is unavailable, a
        query failed, or no devices are present.
    """
    global nvml_initialized # pylint: disable=global-statement
    global nvml_ok # pylint: disable=global-statement
    if not nvml_ok:
        return []
    try:
        if not nvml_initialized:
            nv.nvmlInit()
            # Flip the flag only after nvmlInit() returned, so a failed init
            # never leaves the module claiming to be initialized.
            nvml_initialized = True
        device_count = nv.nvmlDeviceGetCount()
        if device_count == 0:
            return []
        # Driver-level versions are system-wide, so query them once rather
        # than once per device.
        cuda_version = nv.nvmlSystemGetCudaDriverVersion()
        driver_version = nv.nvmlSystemGetDriverVersion()
        devices = []
        for i in range(device_count):
            dev = nv.nvmlDeviceGetHandleByIndex(i)
            # Take each multi-field reading exactly once so the values that
            # are reported together come from the same query.
            pci = nv.nvmlDeviceGetPciInfo(dev)
            memory = nv.nvmlDeviceGetMemoryInfo(dev)
            load = nv.nvmlDeviceGetUtilizationRates(dev)
            devices.append({
                'name': nv.nvmlDeviceGetName(dev),
                'version': {
                    'cuda': cuda_version,
                    'driver': driver_version,
                    'vbios': nv.nvmlDeviceGetVbiosVersion(dev),
                    'rom': nv.nvmlDeviceGetInforomImageVersion(dev),
                    'capabilities': nv.nvmlDeviceGetCudaComputeCapability(dev),
                },
                'pci': {
                    'link': nv.nvmlDeviceGetCurrPcieLinkGeneration(dev),
                    'width': nv.nvmlDeviceGetCurrPcieLinkWidth(dev),
                    'busid': pci.busId,
                    'deviceid': pci.pciDeviceId,
                },
                # Raw values are divided by 1024*1024 (MiB, assuming NVML
                # reports bytes -- confirm against the NVML reference).
                'memory': {
                    'total': round(memory.total / 1024 / 1024, 2),
                    'free': round(memory.free / 1024 / 1024, 2),
                    'used': round(memory.used / 1024 / 1024, 2),
                },
                'clock': { # clock types 0, 1, 2 = gpu, sm, memory
                    'gpu': [nv.nvmlDeviceGetClockInfo(dev, 0), nv.nvmlDeviceGetMaxClockInfo(dev, 0)],
                    'sm': [nv.nvmlDeviceGetClockInfo(dev, 1), nv.nvmlDeviceGetMaxClockInfo(dev, 1)],
                    'memory': [nv.nvmlDeviceGetClockInfo(dev, 2), nv.nvmlDeviceGetMaxClockInfo(dev, 2)],
                },
                'load': {
                    'gpu': round(load.gpu),
                    'memory': round(load.memory),
                    'temp': nv.nvmlDeviceGetTemperature(dev, 0),
                    'fan': nv.nvmlDeviceGetFanSpeed(dev),
                },
                # Raw values are divided by 1000 (watts, assuming NVML reports
                # milliwatts -- confirm against the NVML reference).
                'power': [
                    round(nv.nvmlDeviceGetPowerUsage(dev) / 1000, 2),
                    round(nv.nvmlDeviceGetEnforcedPowerLimit(dev) / 1000, 2),
                ],
                'state': get_reason(nv.nvmlDeviceGetCurrentClocksThrottleReasons(dev)),
            })
        return devices
    except Exception:
        # Deliberate best-effort: monitoring must never break the caller, so
        # any NVML failure disables further queries instead of propagating.
        nvml_ok = False
        return []