-
Notifications
You must be signed in to change notification settings - Fork 3
Expand file tree
/
Copy pathtorque.py
More file actions
113 lines (105 loc) · 4.33 KB
/
torque.py
File metadata and controls
113 lines (105 loc) · 4.33 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
# PBS TORQUE plug-in
import os
import numpy as np
from crawldefs import Job
from identity import *
# Name of the executable used to submit jobs to the scheduler.
SUB = "qsub"

# %-format template for the head of a PBS batch script.  The placeholders are,
# in order: node count, processors per node, queue name, the "-m" mail option,
# walltime in whole hours, and the job name.  The module names come from the
# identity module.
_BATCHSCRIPT = "".join((
    "#!/bin/bash -l \n",
    "#PBS -l nodes=%d:ppn=%d \n",
    "#PBS -q %s \n",
    "#PBS -m %s \n",
    "#PBS -r n \n",
    "#PBS -l walltime=%02d:00:00 \n",
    "#PBS -N %s \n",
    "# EVERYTHING ABOVE THIS COMMENT IS NECESSARY, SHOULD ONLY CHANGE"
    " nodes,ppn,walltime and my_job_name VALUES \n",
    "cd $PBS_O_WORKDIR \n",
    "module load ", GCCMOD, " \n",
    "module load ", PYTHONMOD, " \n",
    "module load ", INTELMOD, " \n",
    "module load ", MPIMOD, " \n",
))
def BATCHSCRIPT(job,notify,wt=48):
    """Return the PBS batch-script header filled in for *job*.

    job    -- object providing ``ncores``, ``queue`` and ``name`` attributes
    notify -- value substituted into the ``#PBS -m`` line
    wt     -- walltime in hours (formatted with ``%02d``); defaults to 48

    The script always requests a single node, with ``job.ncores`` processors
    on it.
    """
    nodes = 1
    fields = (nodes, job.ncores, job.queue, notify, wt, job.name)
    return _BATCHSCRIPT % fields
def HOLD(jobs):
    """Return the submit command prefix that waits on the given jobs.

    jobs -- iterable of job-id strings; they are joined with ':' and appended
            to ``-W depend=afterany:`` after the submit executable (SUB).
    """
    dependency = ":".join(jobs)
    prefix = SUB + " -W depend=afterany:"
    return prefix + dependency
# Tasks per node for each supported model (1 workq node on Sunnyvale has
# 8 threads).  Here we use 'task' to mean a Sunnyvale job, as opposed to the
# HPC convention of a task being a thread or process.  This way our code is
# MPI/OpenMP-agnostic.
MODELS = {
    "plasim": 1,
    "exoplasim": 1,
    "sbdart": 8,
    "sbdart_earth": 8,
    "sbdart_locked": 8,
    "postprocess": 8,
    "postprocess_earth": 8,
    "postprocess_locked": 8,
    "lmdz": 8,
    "mitgcm": 6,
    "pipeline": 8,
}
def _qstat_lines(command, scratch):
    """Run *command* in a shell and return its output split on newlines.

    The output is redirected into the file *scratch* in the current working
    directory, read back, and the file is then removed with ``rm`` (also when
    reading fails).
    """
    os.system(command + " > " + scratch)
    try:
        with open(scratch, "r") as handle:
            return handle.read().split('\n')
    finally:
        os.system("rm " + scratch)


def _load_job(workdir):
    """Return the object stored in ``<workdir>/job.npy``, or None if unreadable."""
    if workdir is None:
        return None
    try:
        # job.npy holds a pickled object.  NumPy >= 1.16.3 refuses to load
        # object arrays unless allow_pickle=True is passed explicitly.
        # NOTE(review): unpickling can execute arbitrary code; this assumes
        # the work directories of USER's own jobs are trusted -- confirm.
        return np.load(workdir + "/job.npy", allow_pickle=True).item()
    except Exception:
        # Missing file, foreign job, or corrupt data: not one of our jobs.
        return None


def getjobs(rude=False):
    """Survey USER's queued jobs and tally the resources held per model.

    Runs ``qstat -u USER`` to list jobs, then ``qstat -f <id>`` for each one
    to find its work directory and processor request.  A job is counted only
    if ``<workdir>/job.npy`` can be loaded; the loaded object must provide
    ``model`` (a key of MODELS) and ``home`` (an integer slot index).

    rude -- when true, also return the summed share of counted jobs whose
            listing status is "R".

    Returns a dict mapping each model name to a NumPy array indexed by
    ``job.home`` holding ``ncpus/8.0`` for that job (arrays start at length
    256 and grow on demand).  With ``rude`` true, returns the tuple
    ``(resources, running)`` instead.
    """
    print("Checking jobs")
    # The first five lines of the listing and the trailing empty string are
    # skipped as header/footer.
    listing = _qstat_lines("qstat -u "+USER, "cjobs.tmp")[5:-1]

    resources = {}
    for model in MODELS:
        resources[model] = np.zeros(256)
    running = 0

    # This part may need changing depending on how job tags are handled.
    queued = []
    for row in listing:
        cols = row.split()
        if len(cols) < 2:
            continue  # blank or malformed row: no job id / status to read
        # First column is the job id; the second-to-last is the status.
        queued.append((cols[0], cols[-2]))

    for tag, status in queued:
        print("Looking up "+tag)
        jinfo = [ln for ln in _qstat_lines("qstat -f "+tag, "jinfo.tmp")[1:-2]
                 if ln != '']

        # Reset per job so a job without an init_work_dir entry can never
        # inherit the directory of the job examined before it.
        workdir = None
        wrapped = None
        ncpus = 1
        for pos, line in enumerate(jinfo):
            fields = line.split()
            if len(fields) < 3:
                continue
            if fields[0] == "init_work_dir":
                workdir = fields[2]
                # A long path may continue on the following line; keep the
                # joined form as a second candidate.
                following = jinfo[pos+1].split() if pos+1 < len(jinfo) else []
                wrapped = (workdir + following[0]) if following else None
            elif fields[0] == "Resource_List.nodes":
                # Value looks like "1:ppn=8"; take what follows the first '='.
                # (An earlier variant read Resource_List.ncpus instead.)
                try:
                    ncpus = int(fields[2].split("=")[1])
                except (IndexError, ValueError):
                    pass  # no usable ppn in the node spec; keep current value

        job = _load_job(workdir)
        if job is None and wrapped is not None:
            job = _load_job(wrapped)
        if job is None:
            continue  # not a job we manage

        # The divisor 8.0 is hard-coded; the per-model value in MODELS is
        # not used here.
        share = float(ncpus)/8.0
        slots = resources[job.model]
        jid = job.home
        if jid >= len(slots):
            grown = np.zeros(jid+100)
            grown[:len(slots)] = slots
            resources[job.model] = slots = grown
        slots[jid] = share
        # NOTE(review): block nesting was reconstructed; the running tally is
        # assumed to count only jobs recognised as ours -- confirm.
        if rude and status == "R":
            running += share

    if rude:
        return resources, running
    return resources