-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathgpu_mem_tracker.py
More file actions
70 lines (64 loc) · 3.1 KB
/
gpu_mem_tracker.py
File metadata and controls
70 lines (64 loc) · 3.1 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
import subprocess
import re
from datetime import datetime
def get_num_of_devices():
    """Return the number of GPUs reported by ``rocm-smi -i``.

    Runs ``/opt/rocm/bin/rocm-smi -i`` through the shell and counts the
    output lines that contain the text ``GPU ID``.

    Returns:
        int: Number of stdout lines containing ``GPU ID``; 0 when the
        command prints no such line (e.g. when it produces no stdout
        output at all).
    """
    proc = subprocess.Popen('/opt/rocm/bin/rocm-smi -i', shell=True,
                            stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    # communicate() drains stdout and stderr while waiting for the child.
    # Calling wait() first (as before) can deadlock if the child fills a
    # pipe buffer, because nobody is reading the pipe yet.
    out, _ = proc.communicate()
    return sum(1 for line in out.decode().splitlines() if "GPU ID" in line)
class mem_tracker:
    """Compare per-GPU VRAM usage before and after a test to spot leaks.

    Typical use is ``start()`` before the test, ``stop()`` after it, then
    ``validate(test_name)`` to append a PASS/FAIL line per device to
    ``gpu_memory_leak_report.txt`` in the current working directory.

    Attributes:
        init_total_mem: "Total Memory" values captured by ``start()``.
        init_used_mem: "Total Used Memory" values captured by ``start()``.
        total_mem: "Total Memory" values captured by ``stop()``.
        used_mem: "Total Used Memory" values captured by ``stop()``.
        test_name: Initialised to an empty string; not used by the
            methods of this class.
        num_devices: Device count returned by ``get_num_of_devices()``.

    All memory values are kept as the numeric strings parsed from the
    rocm-smi output; ``validate()`` converts them with ``int()``.
    """

    # Command whose stdout carries the per-GPU VRAM figures.
    _MEMINFO_CMD = '/opt/rocm/bin/rocm-smi --showmeminfo vram'
    # Line selectors; the word boundaries keep "Total Memory" from matching
    # a "Total Used Memory" line.
    _TOTAL_RE = re.compile(r"\bTotal Memory\b")
    _USED_RE = re.compile(r"\bTotal Used Memory\b")
    # Generic number matcher (raw string: same pattern as before, without
    # the invalid-escape warning for "\d" in a normal string literal).
    _NUMBER_RE = re.compile(
        r"[-+]?[.]?[\d]+(?:,\d\d\d)*[\.]?\d*(?:[eE][-+]?\d+)?")

    def __init__(self):
        self.init_total_mem = []
        self.init_used_mem = []
        self.total_mem = []
        self.used_mem = []
        self.test_name = ""
        self.num_devices = get_num_of_devices()

    def _read_vram_usage(self):
        """Run rocm-smi once and return ``(totals, used)`` lists of strings.

        Each list holds, in output order, the second number found on every
        "Total Memory" / "Total Used Memory" line respectively.
        """
        proc = subprocess.Popen(self._MEMINFO_CMD, shell=True,
                                stdout=subprocess.PIPE,
                                stderr=subprocess.PIPE)
        # communicate() reads both pipes while waiting; wait() followed by
        # reading can deadlock once the child fills a pipe buffer.
        out, _ = proc.communicate()

        totals = []
        used = []
        for line in out.decode().splitlines():
            numbers = self._NUMBER_RE.findall(line)
            # The value of interest is the second number on the line.
            # NOTE(review): the first is presumably the GPU index from a
            # "GPU[n]" prefix -- confirm against actual rocm-smi output.
            if len(numbers) < 2:
                continue
            if self._TOTAL_RE.search(line):
                totals.append(numbers[1])
            if self._USED_RE.search(line):
                used.append(numbers[1])
        return totals, used

    def start(self):
        """Capture the baseline VRAM readings and print them.

        The baseline lists are replaced rather than extended, so a tracker
        reused for several tests never compares against stale readings
        from an earlier ``start()``.
        """
        self.init_total_mem, self.init_used_mem = self._read_vram_usage()
        print('total mem', self.init_total_mem, 'Used mem', self.init_used_mem)

    def stop(self):
        """Capture the post-test VRAM readings and print them.

        Like ``start()``, this replaces any readings from a previous call.
        """
        self.total_mem, self.used_mem = self._read_vram_usage()
        print('total mem', self.total_mem, 'Used mem', self.used_mem)

    def validate(self, test_name):
        """Append one PASS/FAIL report line per device to the report file.

        For each device index below ``num_devices``, the used-memory value
        from ``start()`` is compared with the one from ``stop()``. Equal
        values are reported as PASS; otherwise FAIL with the difference
        (after minus before) and the 1-based device number.

        Args:
            test_name: Label written into every report line.

        Raises:
            IndexError: If fewer readings than ``num_devices`` were
                captured by ``start()`` or ``stop()``.
            ValueError: If a captured reading is not a valid ``int``.
        """
        # The with-block closes the file; no explicit close() is needed.
        with open("gpu_memory_leak_report.txt", 'a') as f:
            print("self.init_used_mem %s self.used_mem %s"%(self.init_used_mem, self.used_mem))
            for i in range(self.num_devices):
                before = int(self.init_used_mem[i])
                after = int(self.used_mem[i])
                if before == after:
                    f.write("%s : %s, "%(test_name, "PASS"))
                    f.write("%s No memory leak %s \n"%(datetime.now(),test_name))
                else:
                    f.write("%s : %s, "%(test_name, "FAIL"))
                    leak_bytes = after - before
                    f.write("%s Total leak bytes for %s is %s on device %s \n"%(datetime.now(), test_name, leak_bytes, i+1))