-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathrecord_gpu_usage
More file actions
128 lines (101 loc) · 4.02 KB
/
record_gpu_usage
File metadata and controls
128 lines (101 loc) · 4.02 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
#!/usr/bin/env python
from __future__ import division
from pynvml import *
import os
import warnings
import mysql.connector
import json
import sys
if sys.version_info[0] < 3:
import ConfigParser
else:
import configparser as ConfigParser
default_conf = '/etc/slurm/record_gpu_usage.conf'
#
# Converts errors into string messages
#
def handleError(err):
    """Return a printable description of an NVML error.

    ``err`` must expose a ``value`` attribute holding the NVML status code.
    The "not supported" status is reported as ``"N/A"``; every other error
    is rendered through its own string conversion.
    """
    if err.value != NVML_ERROR_NOT_SUPPORTED:
        return str(err)
    return "N/A"
def available_devices():
    """Return an iterator over the device indices in CUDA_VISIBLE_DEVICES.

    The environment variable is read and split on commas immediately (so a
    missing variable raises ``KeyError`` at call time); each entry is
    converted with ``int`` lazily, as the returned generator is consumed.
    """
    visible = os.environ['CUDA_VISIBLE_DEVICES']
    entries = visible.split(',')
    return (int(entry) for entry in entries)
def ClearAccounting():
    """Clear the NVML accounting PID records of every visible device.

    Walks the indices produced by ``available_devices()``, resolves each to
    an NVML device handle and clears that device's accounting PIDs. NVML
    errors are not caught here and propagate to the caller.
    """
    for device_index in available_devices():
        nvmlDeviceClearAccountingPids(nvmlDeviceGetHandleByIndex(device_index))
def to_json(dataset):
    """Serialize accounting rows to a JSON array of objects.

    Each row in ``dataset`` is read by position (indices 0 through 6) and
    mapped, in this order, to the keys ``serial``, ``pid``,
    ``gpuUtilization``, ``memoryUtilization``, ``maxMemoryUsage``, ``time``
    and ``startTime``. Returns the text produced by ``json.dumps``.
    """
    field_names = (
        "serial",
        "pid",
        "gpuUtilization",
        "memoryUtilization",
        "maxMemoryUsage",
        "time",
        "startTime",
    )
    records = []
    for row in dataset:
        records.append(
            {name: row[position] for position, name in enumerate(field_names)}
        )
    return json.dumps(records)
#######
def DeviceQuery():
    """Yield one accounting record per process recorded on each visible GPU.

    This is a generator: NVML is initialised (``nvmlInit``) when iteration
    starts, then every index from ``available_devices()`` is queried for its
    serial number and its accounting PIDs.

    Yields:
        tuple: ``(serial, pid, gpuUtilization, memoryUtilization,
        maxMemoryUsage, time, startTime)``.

        * ``serial`` is the device serial as a string, or the
          ``handleError`` text if it cannot be read.
        * ``maxMemoryUsage`` is the NVML value divided by 1024 twice
          (presumably bytes to MiB -- confirm against the NVML docs), or
          ``None`` when NVML reports ``None``.
        * If the statistics of a PID cannot be read, all statistic fields
          of that record carry the ``handleError`` text instead.

    PIDs whose statistics lookup fails with ``NVML_ERROR_NOT_FOUND`` are
    skipped. Other NVML errors while listing a device's PIDs, or while
    initialising / resolving a handle, are printed and end that device's
    (respectively the whole) query without raising.
    """
    try:
        nvmlInit()
        for index in available_devices():
            handle = nvmlDeviceGetHandleByIndex(index)

            try:
                serial = str(nvmlDeviceGetSerial(handle))
            except NVMLError as err:
                serial = handleError(err)

            try:
                pids = nvmlDeviceGetAccountingPids(handle)
                for pid in pids:
                    try:
                        stats = nvmlDeviceGetAccountingStats(handle, pid)
                    except NVMLError as err:
                        if err.value == NVML_ERROR_NOT_FOUND:
                            # probably went away
                            continue
                        # Report the failure in every statistic field. Local
                        # values are used on purpose: there is no valid
                        # ``stats`` object for this pid, so it must not be
                        # read from or written to here.
                        message = handleError(err)
                        gpu_utilization = message
                        memory_utilization = message
                        max_memory_usage = message
                        run_time = message
                        start_time = message
                    else:
                        gpu_utilization = stats.gpuUtilization
                        memory_utilization = stats.memoryUtilization
                        if stats.maxMemoryUsage is None:
                            max_memory_usage = None
                        else:
                            max_memory_usage = stats.maxMemoryUsage / 1024 / 1024
                        run_time = stats.time
                        start_time = stats.startTime

                    yield (serial, pid, gpu_utilization, memory_utilization,
                           max_memory_usage, run_time, start_time)
            except NVMLError as err:
                print(err)
    except NVMLError as err:
        print(err)
def get_config(config):
    """Extract the cluster name and database settings from a parsed config.

    Reads option ``name`` of section ``CLUSTER`` and all options of section
    ``DATABASE``. Returns a ``(cluster_name, database_settings)`` tuple where
    the second element is a plain ``dict`` of the section's items.
    """
    cluster_name = config.get('CLUSTER', 'name')
    database_settings = dict(config.items('DATABASE'))
    return cluster_name, database_settings
def write_data_mysqldb_python(cluster, db, dataset):
    """Overwrite the job's ``admin_comment`` using the MySQLdb driver.

    Args:
        cluster: Cluster name; the target table is ``<cluster>_job_table``.
        db: Mapping with the keys ``host``, ``user``, ``password`` and
            ``database`` used to open the connection.
        dataset: Value stored in ``admin_comment`` (replacing any previous
            content) for the row matching the current job and user.

    The job and user ids are taken from the ``SLURM_JOB_ID`` and
    ``SLURM_JOB_UID`` environment variables (``KeyError`` if unset).
    """
    # MySQLdb is not imported at module level; import it here so the function
    # does not fail with NameError when it is called.
    import MySQLdb

    jobid = os.environ['SLURM_JOB_ID']
    userid = os.environ['SLURM_JOB_UID']

    # The table name cannot be passed as a query parameter, so it is spliced
    # into the statement; all values go through driver-side parameters.
    table = '`' + cluster + '_job_table' + '`'
    sql = "update " + table + """ set admin_comment=%s where id_job=%s and id_user=%s"""

    conn = MySQLdb.connect(db['host'], user=db['user'], passwd=db['password'], db=db['database'])
    try:
        cursor = conn.cursor()
        with warnings.catch_warnings():  # suppress mysql warnings like "out of range"
            warnings.simplefilter("ignore")
            cursor.execute(sql, (dataset, jobid, userid))
        conn.commit()
    finally:
        # Always release the connection, also when execute/commit raises.
        conn.close()
def write_data(cluster, db, dataset):
    """Append ``dataset`` to the job's ``admin_comment`` via mysql.connector.

    Args:
        cluster: Cluster name; the target table is ``<cluster>_job_table``.
        db: Mapping with the keys ``host``, ``user``, ``password`` and
            ``database`` used to open the connection.
        dataset: Text concatenated onto the existing ``admin_comment`` (an
            absent comment is treated as the empty string) for the row
            matching the current job and user.

    The job and user ids are taken from the ``SLURM_JOB_ID`` and
    ``SLURM_JOB_UID`` environment variables (``KeyError`` if unset).
    The database is the only output of this function.
    """
    jobid = os.environ['SLURM_JOB_ID']
    userid = os.environ['SLURM_JOB_UID']

    # The table name cannot be passed as a query parameter, so it is spliced
    # into the statement; all values go through driver-side parameters.
    table = '`' + cluster + '_job_table' + '`'
    sql = "update " + table + """ set admin_comment=concat(ifnull(admin_comment,''),%s) where id_job=%s and id_user=%s"""

    conn = mysql.connector.connect(host=db['host'], user=db['user'], passwd=db['password'], database=db['database'])
    try:
        cursor = conn.cursor()
        with warnings.catch_warnings():  # suppress mysql warnings like "out of range"
            warnings.simplefilter("ignore")
            cursor.execute(sql, (dataset, jobid, userid))
        conn.commit()
    finally:
        # Always release the connection, also when execute/commit raises.
        conn.close()
if __name__ == "__main__":
    # Load the cluster name and database settings from the config file.
    config = ConfigParser.ConfigParser()
    config.read(default_conf)
    cluster, db = get_config(config)

    nvmlInit()
    try:
        # Collect every accounting record first, then store them as JSON.
        rows = list(DeviceQuery())
        write_data(cluster, db, to_json(rows))
        # Accounting is cleared only after the records have been written.
        ClearAccounting()
    finally:
        # Release NVML even when the query, the DB write or the clear fails.
        nvmlShutdown()