-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathpbs_jobmonitor
More file actions
executable file
·113 lines (99 loc) · 2.42 KB
/
pbs_jobmonitor
File metadata and controls
executable file
·113 lines (99 loc) · 2.42 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
#!/usr/bin/env python
#
# Author:
# Willem Vermin, SARA, April 2012
#
# SVN Info:
# $Id#
# $URL: https://oss.trac.surfsara.nl/pbs_python/svn/tags/4.6.0/examples/pbs_jobmonitor $
#
# pbs_jobmonitor, pbs_joblogin <jobnr> [nodenr]
# jobnr: the number of the job
# nodenr: the rank of the node in the job
#
# depending on the name with which this script is called, it performs the
# following:
# called as pbs_jobmonitor:
# shows the output of top -u user on the node
# - one cycle of top
# - user: the user the job belongs to
#
# called as pbs_joblogin:
# logs in to the node as the user who invokes this script
# (os.getenv('USER'))
#
from PBSQuery import PBSQuery
import sys,os
def uniq(seq, idfun=None):
    """Return the items of ``seq`` with duplicates removed, keeping order.

    Based on http://www.peterbe.com/plog/uniqifiers-benchmark

    seq:   iterable of items to filter.
    idfun: optional callable mapping an item to the hashable key that
           decides whether two items count as duplicates.  When omitted,
           the item itself is the key.

    Returns a new list holding the first item seen for each distinct key,
    in the order the items occur in ``seq``.
    """
    seen = set()
    result = []
    for item in seq:
        marker = item if idfun is None else idfun(item)
        if marker in seen:
            continue
        seen.add(marker)
        result.append(item)
    return result
def usage(a):
    """Print a short help text for this script to standard output.

    a: the name the script was invoked as.  'pbs_jobmonitor' and
       'pbs_joblogin' each get a one-line description of what that mode
       does; for any other name only the generic usage lines are printed.
    """
    # Every print call takes exactly one pre-formatted string, so the
    # call parses and emits the same text under both Python 2 and 3.
    if a == 'pbs_jobmonitor':
        print('%s shows the system usage of a node where a job is running'
              % (a,))
    if a == 'pbs_joblogin':
        print('%s logs you in to a node where a job is running' % (a,))
    print('Usage:')
    print('%s <jobnumber> [nodenumber]' % (a,))
    print('where <jobnumber> is the number of the job')
    print(' nodenumber is the rank number of the node allocated to the job')
    print(' (default 0)')
# ----------------------------------------------------------------------
# Main script.
#
# Looks up the job given on the command line, lists the distinct nodes
# it runs on, and then either runs one cycle of top on the selected node
# (when called as pbs_jobmonitor) or opens an ssh session to that node
# (when called as pbs_joblogin).
#
# All print calls take one pre-formatted string so the same text is
# produced under Python 2 and Python 3.
# ----------------------------------------------------------------------

# The last path component of the invoked name selects the mode.
me = sys.argv[0].split('/')[-1]
print('[' + me + ']')

# Validate the command line before talking to the batch system.
if len(sys.argv) < 2:
    usage(me)
    sys.exit(1)
j = sys.argv[1]

num = 0
if len(sys.argv) > 2:
    try:
        num = int(sys.argv[2])
    except ValueError:
        usage(me)
        sys.exit(1)

p = PBSQuery()
job = p.getjob(j)
try:
    h = job['exec_host'][0]
except Exception:
    # The job record is an opaque PBSQuery object here, so any ordinary
    # lookup failure is treated as "job not found".  Unlike a bare
    # except, this lets SystemExit and KeyboardInterrupt propagate.
    print('No such job: %s' % (j,))
    sys.exit(1)

# exec_host is split on '+' into entries, and each entry on '/';
# only the part before the '/' (the node name) is kept, once per node,
# in order of first appearance.
nodes = uniq([entry.split('/')[0] for entry in h.split('+')])
print('Job %s is running on %d nodes:' % (j, len(nodes)))

# List the node names, eight per output line.
for start in range(0, len(nodes), 8):
    print(' '.join(nodes[start:start + 8]))

# Reject ranks outside the node list.  Negative values are refused too:
# they would otherwise silently index from the end of the list.
if num < 0 or num >= len(nodes):
    print('No node number %d' % (num,))
    sys.exit(1)

# NOTE(review): both ssh commands below are built by string
# concatenation and run through a shell via os.system; the node name and
# user come from the job record.  Consider subprocess with an argument
# list if those values cannot be fully trusted.
if me == 'pbs_jobmonitor':
    # Show the processes of the job's owner (the part before '@').
    user = job['Job_Owner'][0].split('@')[0]
    print('top for node # %d : %s user: %s' % (num, nodes[num], user))
    # Flush so our own output appears before that of the child process.
    sys.stdout.flush()
    os.system('ssh ' + nodes[num] + ' top -n1 -b -u ' + user)

if me == 'pbs_joblogin':
    # The login is done as the invoking user; USER is only reported here.
    user = os.getenv('USER')
    print('logging in to node # %d : %s user: %s' % (num, nodes[num], user))
    sys.stdout.flush()
    os.system('ssh -X ' + nodes[num])