mirror of
https://gitlab.cs.uni-saarland.de/hpc/cc-condor-sync.git
synced 2024-12-25 13:09:06 +01:00
Start revamping to use htcondor EventLog not slurm
This commit is contained in:
parent
c9aa4095fe
commit
308df9907e
510
htcondor-clusercockpit-push.py
Executable file
510
htcondor-clusercockpit-push.py
Executable file
@ -0,0 +1,510 @@
|
||||
#!/usr/bin/python3
|
||||
# This script pushes HTCondor job events to the cluster cockpit backend. It
# follows the HTCondor event log (a stream of JSON objects) and talks to
# cluster cockpit via its REST api.
#
# For every job ad information event found in the log it either creates the
# job in cluster cockpit (execute event) or stops it there (evicted,
# terminated or aborted event).
|
||||
#
|
||||
# -- Michael Schwarz <schwarz@uni-paderborn.de>
|
||||
|
||||
import subprocess
|
||||
import json
|
||||
import time
|
||||
import requests
|
||||
import re
|
||||
import tailf
|
||||
|
||||
|
||||
class CCApi:
    """Small client for the ClusterCockpit (cc-backend) REST job API.

    All requests reuse the JSON/bearer-token headers that ``__init__`` builds
    from ``config['cc-backend']``.
    """

    # Class-level defaults; __init__ replaces every one of them per instance.
    config = {}
    apiurl = ''
    apikey = ''
    headers = {}

    def __init__(self, config, debug=False):
        """Derive the API base URL and request headers from ``config``.

        ``config['cc-backend']`` must provide ``host`` and ``apikey``.
        ``debug`` is accepted for call compatibility and is not used here.
        """
        self.config = config
        self.apiurl = "%s/api/" % config['cc-backend']['host']
        self.apikey = config['cc-backend']['apikey']
        self.headers = {'accept': 'application/ld+json',
                        'Content-Type': 'application/json',
                        'Authorization': 'Bearer %s' % self.config['cc-backend']['apikey']}

    @staticmethod
    def _responseBody(r):
        """Return the decoded JSON body of ``r``, or its raw text if not JSON.

        Error responses are not guaranteed to be JSON; decoding them blindly
        would raise while we are only trying to report a failure.
        """
        try:
            return r.json()
        except ValueError:
            return r.text

    def startJob(self, data):
        """POST ``data`` to ``jobs/start_job/``.

        Returns the decoded response on HTTP 201 and ``False`` otherwise.
        HTTP 422 is treated as a quiet failure; any other status is printed
        together with the payload for diagnosis.
        """
        url = self.apiurl+"jobs/start_job/"
        r = requests.post(url, headers=self.headers, json=data)
        if r.status_code == 201:
            return r.json()
        if r.status_code != 422:
            print(data)
            print(r)
            print(self._responseBody(r))
        return False

    def stopJob(self, data):
        """POST ``data`` to ``jobs/stop_job/``.

        Returns the decoded response on HTTP 200; otherwise prints the payload
        and the response and returns ``False``.
        """
        url = self.apiurl+"jobs/stop_job/"
        r = requests.post(url, headers=self.headers, json=data)
        if r.status_code == 200:
            return r.json()
        print(data)
        print(r)
        print(self._responseBody(r))
        return False

    def getJobs(self, filter_running=True):
        """GET the job list, restricted to running jobs unless told otherwise.

        Returns the decoded response on HTTP 200 and ``{'jobs': []}`` for any
        other status.
        """
        url = self.apiurl+"jobs/"
        if filter_running:
            url = url+"?state=running"
        r = requests.get(url, headers=self.headers)
        if r.status_code == 200:
            return r.json()
        return {'jobs': []}
|
||||
|
||||
|
||||
class CondorSync:
    """Push HTCondor job start/stop events to a ClusterCockpit backend.

    ``sync()`` follows the event log named by
    ``config['htcondor']['eventlog']`` and turns every JobAdInformationEvent
    into a ``start_job`` or ``stop_job`` request sent through ``CCApi``.

    ``_exec``, ``_getAccDataForJob``, ``_jobRunning``, ``_getACCIDsFromGRES``
    and ``_convertNodelist`` are left over from the Slurm version of this
    script; the event-log code path does not call them.
    """

    # Class-level defaults; __init__ gives every instance its own values.
    condorJobData = {}
    ccData = {}
    config = {}
    debug = False
    ccapi = None

    # event codes: https://htcondor.readthedocs.io/en/latest/codes-other-values/job-event-log-codes.html
    # TriggerEventTypeNumber -> jobState sent to ClusterCockpit.
    # NOTE(review): "cancelled" (double l) is the spelling I recall from the
    # cc-backend job schema; confirm against the backend you run.
    _JOBSTATE_BY_EVENT = {4: "cancelled", 5: "completed",
                          9: "failed", 10: "stopped", 12: "stopped"}
    _JOB_AD_INFORMATION_EVENT = 28
    _EXECUTE_EVENT = 1
    # NOTE(review): 10 and 12 are mapped above but not acted upon here,
    # exactly as before.
    _STOP_EVENTS = (4, 5, 9)

    def __init__(self, config, debug=False):
        """Validate the required config entries and create the API client.

        Raises ``KeyError`` when ``htcondor.eventlog``, ``cc-backend.host`` or
        ``cc-backend.apikey`` is missing.
        """
        self.config = config
        self.debug = debug
        self.condorJobData = {}
        self.ccData = {}

        # validate config TODO
        required = (("htcondor", ("eventlog",)),
                    ("cc-backend", ("host", "apikey")))
        for section, keys in required:
            if section not in config:
                raise KeyError("missing config section '%s'" % section)
            for key in keys:
                if key not in config[section]:
                    raise KeyError("missing config entry '%s.%s'" %
                                   (section, key))

        self.ccapi = CCApi(self.config, debug)

    def _exec(self, command):
        """Run ``command`` through the shell and return its stdout as text.

        On a non-zero exit status stderr is printed and ``""`` is returned.
        ``command`` is interpreted by the shell, so it must never contain
        untrusted text.
        """
        process = subprocess.Popen(
            command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True)
        output, error = process.communicate()
        if process.returncode == 0:
            return output.decode('utf-8')
        print("Error: ", error)
        return ""

    def _readCondorData(self):
        """Load the event log as one JSON document into ``condorJobData``."""
        if self.debug:
            print("DEBUG: _readCondorData called")
        # NOTE(review): json.load() accepts exactly one JSON document, while
        # sync() treats the same file as a stream of concatenated objects.
        # Confirm the expected format before calling this again.
        with open(self.config['htcondor']['eventlog']) as f:
            self.condorJobData = json.load(f)

    def _readCCData(self):
        """Fetch the jobs currently known to ClusterCockpit into ``ccData``."""
        if self.debug:
            print("DEBUG: _readCCBackendData called")
        self.ccData = self.ccapi.getJobs()
        if self.debug:
            print("DEBUG: ccData:", self.ccData)

    def _getAccDataForJob(self, jobid):
        """Placeholder for an accounting lookup; always raises."""
        raise NotImplementedError

    def _jobIdInCC(self, jobid):
        """Return True if ``ccData`` already lists a job with this ``jobId``."""
        return any(jobid == job['jobId'] for job in self.ccData['jobs'])

    def _jobIdToInt(self, globalJobId):
        """Pack a ``GlobalJobId`` ("<schedd>#<cluster>.<proc>#...") into one int.

        Layout: cluster id from bit 32 upwards, the low 30 bits of the proc id
        in bits 2..31, and the submit node number from
        ``config['htcondor']['submitnodes']`` (keyed by the short host name of
        the schedd) in the lowest bits.
        """
        job_id_parts = globalJobId.split('#')
        short_host = job_id_parts[0].split('.')[0]
        # NOTE(review): only two bits are left for ``submit``; values above 3
        # would overlap the proc id bits.
        submit = self.config['htcondor']['submitnodes'][short_host]
        cluster_id, proc_id = [int(part) for part in job_id_parts[1].split('.')]
        return (cluster_id << 32) | ((proc_id & 0x3FFFFFFF) << 2) | submit

    def _jobRunning(self, jobid):
        """Slurm leftover: True if ``slurmJobData`` lists ``jobid`` as RUNNING.

        NOTE(review): nothing in this class sets ``self.slurmJobData``, so
        calling this raises AttributeError until it is ported.
        """
        for job in self.slurmJobData['jobs']:
            if int(job['job_id']) == int(jobid):
                if job['job_state'] == 'RUNNING':
                    return True
        return False

    def _getACCIDsFromGRES(self, gres, nodename):
        """Slurm leftover: translate a GRES string into accelerator ids.

        ``config['accelerators']`` maps a node name prefix to a dict of
        index (as string) -> accelerator id. Returns ``[]`` when no prefix
        matches ``nodename`` or ``gres`` does not look like
        ``gpu:<type>:<n>(IDX:0,2-3)``.
        """
        ids = self.config['accelerators']

        nodetype = None
        for prefix in ids:
            if nodename.startswith(prefix):
                # no break: the last matching prefix wins, as before
                nodetype = prefix

        if not nodetype:
            print("WARNING: Can't find accelerator definition for node %s" %
                  nodename.strip())
            return []

        # the gres definition might be different on other clusters!
        m = re.match(r"(fpga|gpu):(\w+):(\d+)\(IDX:([\d,\-]+)\)", gres)
        if not m:
            return []

        # IDX might be: IDX:0,2-3
        # first split at , then expand the range to individual items
        acc_id_list = []
        for part in m.group(4).split(','):
            if not part:
                continue
            first, dash, last = part.partition('-')
            if not dash:
                # plain index of any width, e.g. "10"
                last = first
            for index in range(int(first), int(last) + 1):
                acc_id_list.append(ids[nodetype][str(index)])

        print(acc_id_list)
        return acc_id_list

    def _ccStartJob(self, job):
        """Register the job described by an execute event in ClusterCockpit.

        ``job`` is the decoded event dict. Nothing is sent when ``ccData``
        already contains the job id.
        """
        print("INFO: Create job %s, user %s, name %s" %
              (job['GlobalJobId'], job['Owner'], job.get('JobBatchName', '')))
        jobId = self._jobIdToInt(job['GlobalJobId'])
        if self._jobIdInCC(jobId):
            return

        # presumably "slot@host"; [-1] also copes with a bare host name
        nodelist = [job['RemoteHost'].split('@')[-1]]

        # TODO: the Slurm version derived exclusivity, job script and
        # environment from slurm state files; there is no HTCondor equivalent
        # here yet, so every job is reported as running on a shared node.
        exclusive = 0

        if job['Subproc'] > 0:
            print("WARNING: did not expect to see Subproc != 0")

        # build payload
        data = {'jobId': jobId,
                'user': job['Owner'],
                'cluster': self.config['cluster'],
                'numNodes': job['CurrentHosts'],
                'numHwthreads': job['CpusProvisioned'],
                'startTime': job['JobCurrentStartDate'],
                'project': job['AccountingGroup'],
                'partition': 'main',
                'exclusive': exclusive,
                'resources': [],
                'metadata': {
                    'jobName': job.get('JobBatchName', '')
                }
                }

        # is this part of an array job?
        if job['Cluster'] > 0:
            data.update({"arrayJobId": job['Cluster']})

        num_acc = 0
        for node in nodelist:
            resources = {'hostname': node.split('.')[0].strip()}

            # Get allocated GPUs if some are requested
            if 'AssignedGPUs' in job:
                # "".split(',') yields [''], so drop empty names explicitly
                gpu_names = [name.strip()
                             for name in job['AssignedGPUs'].split(',')]
                gpu_names = [name for name in gpu_names if name]
                if gpu_names:
                    node_accs = self.config['accelerators'][node]
                    num_acc = num_acc + len(gpu_names)
                    resources.update(
                        {"accelerators": [node_accs[name] for name in gpu_names]})

            data['resources'].append(resources)

        data.update({"numAcc": num_acc})

        if self.debug:
            print(data)

        self.ccapi.startJob(data)

    def _ccStopJob(self, job):
        """Mark the job described by a stop event as finished in ClusterCockpit.

        ``job`` is the decoded event dict; its ``TriggerEventTypeNumber``
        selects the reported state.
        """
        print("INFO: Stop job %s" % job['GlobalJobId'])
        jobId = self._jobIdToInt(job['GlobalJobId'])

        jobstate = self._JOBSTATE_BY_EVENT[job['TriggerEventTypeNumber']]

        # NOTE(review): assumes every stop event carries a 'ToE' record with a
        # 'When' field; a KeyError here ends sync(). Confirm for evicted and
        # aborted jobs.
        data = {
            'jobId': jobId,
            'cluster': self.config['cluster'],
            'startTime': job['JobCurrentStartDate'],
            'stopTime': job['ToE']['When'],
            'jobState': jobstate
        }

        self.ccapi.stopJob(data)

    def _convertNodelist(self, nodelist):
        """Slurm leftover: expand a ranged nodelist via ``scontrol``.

        Returns ``[]`` when ``nodelist`` does not match
        ``config['node_regex']``.
        """
        # Use slurm to convert a nodelist with ranges into a comma separated list of unique nodes
        if re.search(self.config['node_regex'], nodelist):
            command = "scontrol show hostname %s | paste -d, -s" % nodelist
            return self._exec(command).split(',')
        return []

    def _handleEvent(self, event):
        """Dispatch one decoded event-log entry to start/stop handling.

        Only JobAdInformationEvent entries are considered; all others are
        ignored.
        """
        if event['EventTypeNumber'] != self._JOB_AD_INFORMATION_EVENT:
            return
        trigger = event['TriggerEventTypeNumber']
        if trigger == self._EXECUTE_EVENT:
            self._ccStartJob(event)
        elif trigger in self._STOP_EVENTS:
            self._ccStopJob(event)

    @staticmethod
    def _decodeEvents(buffer):
        """Split ``buffer`` into complete JSON values plus the unparsed rest.

        Returns ``(events, remainder)``. ``remainder`` starts at the first
        value that could not be decoded (normally an event that has only been
        written partially) and is meant to be prepended to the next chunk.
        """
        decoder = json.JSONDecoder()
        events = []
        pos = 0
        end = len(buffer)
        while True:
            # raw_decode() does not skip leading whitespace by itself
            while pos < end and buffer[pos].isspace():
                pos += 1
            if pos >= end:
                return events, ""
            try:
                event, pos = decoder.raw_decode(buffer, pos)
            except json.JSONDecodeError:
                return events, buffer[pos:]
            events.append(event)

    def sync(self, limit=200, jobid=None, direction='both'):
        """Follow the HTCondor event log and forward job events, forever.

        The list of jobs known to ClusterCockpit is read once at start-up.
        This method only returns by raising.

        ``limit``, ``jobid`` and ``direction`` are kept from the Slurm
        version's interface; the event-log implementation does not use
        ``limit`` and ``direction``, and ``jobid`` is only echoed in debug
        output.
        """
        import codecs

        if self.debug:
            print("DEBUG: sync called")
            print("DEBUG: jobid %s" % jobid)

        self._readCCData()

        # A chunk may end in the middle of a multi-byte character, so decode
        # incrementally instead of chunk by chunk.
        utf8 = codecs.getincrementaldecoder("utf-8")()
        remaining = ""
        with tailf.Tail(self.config['htcondor']['eventlog']) as tail:
            while True:
                for chunk in tail:
                    if isinstance(chunk, bytes):
                        events, remaining = self._decodeEvents(
                            remaining + utf8.decode(chunk))
                        for event in events:
                            self._handleEvent(event)
                    elif chunk is tailf.Truncated:
                        print("File was truncated")
                        # whatever was buffered belongs to the old contents
                        utf8.reset()
                        remaining = ""
                    else:
                        # currently unreachable; more events may be introduced later
                        raise AssertionError("unreachable")
                time.sleep(5)  # save CPU cycles
|
||||
|
||||
|
||||
if __name__ == "__main__":
    import argparse

    # Command line entry point: load the JSON config file and start following
    # the HTCondor event log.
    about = """This script pushes HTCondor job events to the cluster cockpit backend. It
    follows the HTCondor event log and talks to cluster cockpit via its api.

    For every job ad information event in the log it either creates the job in
    cluster cockpit (execute event) or stops it there (evicted, terminated or
    aborted event).
    """
    parser = argparse.ArgumentParser(description=about)
    parser.add_argument(
        "-c", "--config", help="Read config file. Default: config.json", default="config.json")
    parser.add_argument(
        "-d", "--debug", help="Enable debug output", action="store_true")
    # NOTE(review): --jobid, --limit and --direction are still parsed and
    # passed on, but CondorSync.sync() does not act on them yet.
    parser.add_argument("-j", "--jobid", help="Sync this jobid")
    parser.add_argument(
        "-l", "--limit", help="Stop after n sync actions in each direction. Default: 200", default=200, type=int)
    parser.add_argument("--direction", help="Only sync in this direction",
                        default="both", choices=['both', 'start', 'stop'])
    args = parser.parse_args()

    # open config file
    if args.debug:
        print("DEBUG: load config file: %s" % args.config)
    with open(args.config, 'r', encoding='utf-8') as f:
        config = json.load(f)
    if args.debug:
        print("DEBUG: config file contents:")
        print(config)

    s = CondorSync(config, args.debug)
    s.sync(args.limit, args.jobid, args.direction)
|
47
map-gpu-ids.py
Normal file
47
map-gpu-ids.py
Normal file
@ -0,0 +1,47 @@
|
||||
#!/usr/bin/python3
|
||||
|
||||
from io import StringIO
|
||||
import pandas as pd
|
||||
import json
|
||||
import subprocess
|
||||
|
||||
def fetch_condor_machines(submit_host="conduit"):
    """Return the ``Machine`` column of ``condor_status -compact``.

    The command is run on ``submit_host`` via ssh. Raises
    ``subprocess.CalledProcessError`` when the ssh/condor_status call exits
    with a non-zero status.
    """
    compact_result = subprocess.run(
        ["ssh", submit_host, "condor_status", "-compact"],
        capture_output=True, text=True, check=True)
    # skipfooter=5 drops the trailing lines of the compact listing.
    # NOTE(review): presumably the totals table; confirm the footer length
    # against your condor_status version.
    data = pd.read_csv(StringIO(compact_result.stdout),
                       sep=r'\s+', skipfooter=5, engine="python")
    return data["Machine"]
|
||||
|
||||
def mapping_for_machine(host, submit_host="conduit"):
    """Return ``{GPU name: PCI bus id}`` for the GPUs detected on ``host``.

    Machine ads are fetched with ``condor_status -json <host>`` on
    ``submit_host`` via ssh. Only ads that have ``DetectedGPUs`` and no
    ``ParentSlotId`` are used. GPUs for which the ad carries no bus id
    attribute are left out. An empty dict is returned when condor_status
    prints nothing. Raises ``subprocess.CalledProcessError`` when the ssh
    call fails.
    """
    machineAds = subprocess.run(
        ["ssh", submit_host, "condor_status", "-json", host],
        capture_output=True, text=True, check=True)
    if not machineAds.stdout.strip():
        # nothing to decode, so there is nothing to map for this host
        return {}
    info = json.loads(machineAds.stdout)

    mapping = {}
    for slot in info:
        if 'DetectedGPUs' not in slot or 'ParentSlotId' in slot:
            continue
        for name in slot['DetectedGPUs'].split(','):
            name = name.strip()
            # ad attribute names use '_' where the GPU name has '-'
            snake = name.replace('-', '_')
            nested_key = 'GPUs_' + snake
            flat_key = snake + 'DevicePciBusId'
            if nested_key in slot:
                mapping[name] = slot[nested_key]['DevicePciBusId']
            elif flat_key in slot:
                mapping[name] = slot[flat_key]
    return mapping
|
||||
|
||||
if __name__ == "__main__":
    import argparse

    # Print the GPU name -> PCIe bus id mapping as JSON, either for the one
    # host given with --host or for every machine condor_status reports.
    about = """This script reads a map from "AssignedGPUs" names to the PCIe bus ids.
    """
    parser = argparse.ArgumentParser(description=about)
    parser.add_argument("--host", help="The host to map for.", default="")
    parser.add_argument("-d", "--debug", help="Enable debug output", action="store_true")
    args = parser.parse_args()

    if len(args.host) > 0:
        hosts = [args.host]
    else:
        hosts = fetch_condor_machines()

    mappings = {}
    for machine in hosts:
        mappings[machine] = mapping_for_machine(machine)
    print(json.dumps(mappings))
|
1
requirements.txt
Normal file
1
requirements.txt
Normal file
@ -0,0 +1 @@
|
||||
tailf==0.2.5
|
@ -1,430 +0,0 @@
|
||||
#!/usr/bin/python3
|
||||
# This script syncs the slurm jobs with the cluster cockpit backend. It uses
|
||||
# the slurm command line tools to gather the relevant slurm infos and reads
|
||||
# the corresponding info from cluster cockpit via its api.
|
||||
#
|
||||
# After reading the data, it stops all jobs in cluster cockpit which are
|
||||
# not running any more according to slurm and afterwards it creates all new
|
||||
# running jobs in cluster cockpit.
|
||||
#
|
||||
# -- Michael Schwarz <schwarz@uni-paderborn.de>
|
||||
|
||||
import subprocess
|
||||
import json
|
||||
import requests
|
||||
import re
|
||||
|
||||
class CCApi:
    """Thin wrapper around the cluster cockpit REST endpoints for jobs."""

    config = {}
    apiurl = ''
    apikey = ''
    headers = {}

    def __init__(self, config, debug=False):
        """Remember the config and prepare base URL and request headers."""
        backend = config['cc-backend']
        self.config = config
        self.apiurl = "%s/api/" % backend['host']
        self.apikey = backend['apikey']
        self.headers = {
            'accept': 'application/ld+json',
            'Content-Type': 'application/json',
            'Authorization': 'Bearer %s' % backend['apikey'],
        }

    def startJob(self, data):
        """POST ``data`` to start_job; decoded reply on 201, else False."""
        response = requests.post(self.apiurl+"jobs/start_job/",
                                 headers=self.headers, json=data)
        if response.status_code != 201:
            print(data)
            print(response)
            return False
        return response.json()

    def stopJob(self, data):
        """POST ``data`` to stop_job; decoded reply on 200, else False."""
        response = requests.post(self.apiurl+"jobs/stop_job/",
                                 headers=self.headers, json=data)
        if response.status_code != 200:
            print(data)
            print(response)
            return False
        return response.json()

    def getJobs(self, filter_running=True):
        """GET the job list (running jobs only by default).

        Any status other than 200 yields an empty job list.
        """
        query = "?state=running" if filter_running else ""
        response = requests.get(self.apiurl+"jobs/"+query, headers=self.headers)
        if response.status_code != 200:
            return {'jobs': []}
        return response.json()
|
||||
|
||||
class SlurmSync:
    """Synchronise running Slurm jobs with a cluster cockpit backend.

    ``sync()`` reads the current jobs from ``squeue`` and from cluster
    cockpit, stops the cluster cockpit jobs that slurm no longer reports as
    running and creates the running slurm jobs cluster cockpit does not know
    yet.
    """

    # Class-level defaults; __init__ gives every instance its own values.
    slurmJobData = {}
    ccData = {}
    config = {}
    debug = False
    ccapi = None

    def __init__(self, config, debug=False):
        """Validate the config, fill in tool defaults, create the API client.

        Raises ``KeyError`` when the ``slurm`` or ``cc-backend`` section or
        the backend ``host``/``apikey`` entry is missing.
        """
        self.config = config
        self.debug = debug
        self.slurmJobData = {}
        self.ccData = {}

        # validate config TODO
        if "slurm" not in config:
            raise KeyError("missing config section 'slurm'")
        # the defaults belong into the 'slurm' section, where they are read
        config['slurm'].setdefault('squeue', '/usr/bin/squeue')
        config['slurm'].setdefault('sacct', '/usr/bin/sacct')
        if "cc-backend" not in config:
            raise KeyError("missing config section 'cc-backend'")
        for key in ("host", "apikey"):
            if key not in config['cc-backend']:
                raise KeyError("missing config entry 'cc-backend.%s'" % key)

        self.ccapi = CCApi(self.config, debug)

    def _exec(self, command):
        """Run ``command`` through the shell and return its stdout as text.

        On a non-zero exit status stderr is printed and ``""`` is returned.
        ``command`` is interpreted by the shell, so it must never contain
        untrusted text.
        """
        process = subprocess.Popen(command, stdout=subprocess.PIPE,
                                   stderr=subprocess.PIPE, shell=True)
        output, error = process.communicate()
        if process.returncode == 0:
            return output.decode('utf-8')
        print("Error: ", error)
        return ""

    def _readSlurmData(self):
        """Load the output of ``squeue --json`` into ``slurmJobData``."""
        if self.debug:
            print("DEBUG: _readSlurmData called")
        command = "%s --json" % self.config['slurm']['squeue']
        self.slurmJobData = json.loads(self._exec(command))

    def _readCCData(self):
        """Fetch the jobs currently known to cluster cockpit into ``ccData``."""
        if self.debug:
            print("DEBUG: _readCCBackendData called")
        self.ccData = self.ccapi.getJobs()

    def _getAccDataForJob(self, jobid):
        """Return the decoded output of ``sacct -j <jobid> --json``."""
        command = "%s -j %s --json" % (self.config['slurm']['sacct'], jobid)
        return json.loads(self._exec(command))

    def _jobIdInCC(self, jobid):
        """Return True if ``ccData`` already lists a job with this ``jobId``."""
        return any(jobid == job['jobId'] for job in self.ccData['jobs'])

    def _jobRunning(self, jobid):
        """Return True if squeue lists ``jobid`` in state RUNNING."""
        for job in self.slurmJobData['jobs']:
            if int(job['job_id']) == int(jobid):
                if job['job_state'] == 'RUNNING':
                    return True
        return False

    def _getACCIDsFromGRES(self, gres, nodename):
        """Translate a GRES detail string into accelerator ids.

        ``config['accelerators']`` maps a node name prefix to a dict of
        index (as string) -> accelerator id. Returns ``[]`` when no prefix
        matches ``nodename`` or ``gres`` does not look like
        ``gpu:<type>:<n>(IDX:0,2-3)``.
        """
        ids = self.config['accelerators']

        nodetype = None
        for prefix in ids:
            if nodename.startswith(prefix):
                # no break: the last matching prefix wins, as before
                nodetype = prefix

        if not nodetype:
            print("WARNING: Can't find accelerator definition for node %s" % nodename.strip())
            return []

        # the gres definition might be different on other clusters!
        m = re.match(r"(fpga|gpu):(\w+):(\d+)\(IDX:([\d,\-]+)\)", gres)
        if not m:
            return []

        # IDX might be: IDX:0,2-3
        # first split at , then expand the range to individual items
        acc_id_list = []
        for part in m.group(4).split(','):
            if not part:
                continue
            first, dash, last = part.partition('-')
            if not dash:
                # plain index of any width, e.g. "10"
                last = first
            for index in range(int(first), int(last) + 1):
                acc_id_list.append(ids[nodetype][str(index)])

        print(acc_id_list)
        return acc_id_list

    def _readStateFile(self, job, name, missing):
        """Read ``<state_save_location>/hash.<d>/job.<id>/<name>`` as text.

        ``<d>`` is the last digit of the job id. Returns ``missing`` when the
        file does not exist.
        """
        hashdir = "hash.%s" % str(job['job_id'])[-1]
        filename = "%s/%s/job.%s/%s" % (
            self.config['slurm']['state_save_location'], hashdir, job['job_id'], name)
        try:
            with open(filename, 'r', encoding='utf-8') as f:
                return f.read()
        except FileNotFoundError:
            return missing

    def _ccStartJob(self, job):
        """Create the running slurm job ``job`` (a squeue record) in cluster cockpit."""
        print("INFO: Create job %s, user %s, name %s" % (job['job_id'], job['user_name'], job['name']))
        nodelist = self._convertNodelist(job['job_resources']['nodes'])

        # 1: exclusive job, 2: exclusive to user, 3: exclusive to mcs,
        # 0: shared node (default)
        exclusive = {"none": 1, "user": 2, "mcs": 3}.get(job['shared'], 0)

        # read job script and environment
        jobscript = self._readStateFile(job, 'script', 'NO JOBSCRIPT')
        # FIXME sometimes produces utf-8 conversion errors
        try:
            environment = self._readStateFile(job, 'environment', 'NO ENV')
        except UnicodeDecodeError:
            environment = 'UNICODE_DECODE_ERROR'

        # truncate environment to 50.000 chars. Otherwise it might not fit into the
        # Database field together with the job script etc.
        environment = environment[:50000]

        # get additional info from slurm and add environment
        slurminfo = self._exec("scontrol show job %s" % job['job_id'])
        slurminfo = slurminfo + "ENV:\n====\n" + environment

        # build payload
        data = {'jobId': job['job_id'],
                'user': job['user_name'],
                'cluster': job['cluster'],
                'numNodes': job['node_count'],
                'numHwthreads': job['cpus'],
                'startTime': job['start_time'],
                'walltime': int(job['time_limit']) * 60,
                'project': job['account'],
                'partition': job['partition'],
                'exclusive': exclusive,
                'resources': [],
                'metadata': {
                    'jobName': job['name'],
                    'jobScript': jobscript,
                    'slurmInfo': slurminfo
                }
                }

        # is this part of an array job?
        if job['array_job_id'] > 0:
            data.update({"arrayJobId": job['array_job_id']})

        num_acc = 0
        for i, node in enumerate(nodelist):
            resources = {'hostname': node.strip()}

            # if a job uses a node exclusive, there are some assigned cpus (used by this job)
            # and some unassigned cpus. In this case, the assigned_cpus are those which have
            # to be monitored, otherwise use the unassigned cpus.
            hwthreads = job['job_resources']['allocated_nodes'][str(i)]['cores']
            cores_assigned = [int(k) for k, v in hwthreads.items() if v == "assigned"]
            cores_unassigned = [int(k) for k, v in hwthreads.items() if v != "assigned"]
            resources.update({"hwthreads": cores_assigned or cores_unassigned})

            # Get allocated GPUs if some are requested
            if len(job['gres_detail']) > 0:
                acc_ids = self._getACCIDsFromGRES(job['gres_detail'][i], node)
                if len(acc_ids) > 0:
                    num_acc = num_acc + len(acc_ids)
                    resources.update({"accelerators": acc_ids})

            data['resources'].append(resources)

        # if the number of accelerators has changed in the meantime, update this field
        data.update({"numAcc": num_acc})

        if self.debug:
            print(data)

        self.ccapi.startJob(data)

    def _ccStopJob(self, jobid):
        """Stop job ``jobid`` in cluster cockpit.

        State and end time come from squeue while the job is still listed
        there and from sacct otherwise. Nothing is sent when sacct has no
        record whose first step started at the start time stored in cluster
        cockpit.
        """
        print("INFO: Stop job %s" % jobid)

        # search for the jobdata stored in CC (the last match wins)
        ccjob = {}
        for j in self.ccData['jobs']:
            if j['jobId'] == jobid:
                ccjob = j

        # check if job is still in squeue data
        squeue_job = None
        for job in self.slurmJobData['jobs']:
            if job['job_id'] == jobid:
                squeue_job = job

        if squeue_job is not None:
            source = "squeue"
            jobstate = squeue_job['job_state'].lower()
            endtime = squeue_job['end_time']
        else:
            source = "sacct"
            jobAcctData = None
            for j in self._getAccDataForJob(jobid)['jobs']:
                if len(j['steps']) > 0 and j['steps'][0]['time']['start'] == ccjob['startTime']:
                    jobAcctData = j
            if jobAcctData is None:
                print("WARNING: no accounting record found for job %s" % jobid)
                return
            jobstate = jobAcctData['state']['current'].lower()
            endtime = jobAcctData['time']['end']
            if jobstate == "node_fail":
                jobstate = "failed"

        if jobstate == "requeued":
            print("Requeued job")
            jobstate = "failed"

        if int(ccjob['startTime']) >= int(endtime):
            print("%s correction" % source)
            # For some reason (needs to get investigated), failed jobs sometimes report
            # an earlier end time than the starting time in CC. If this is the case,
            # take the starting time from CC and add one second as new end time.
            # Otherwise CC refuses to end the job.
            endtime = int(ccjob['startTime']) + 1

        data = {
            'jobId': jobid,
            'cluster': ccjob['cluster'],
            'startTime': ccjob['startTime'],
            'stopTime': endtime,
            'jobState': jobstate
        }

        self.ccapi.stopJob(data)

    def _convertNodelist(self, nodelist):
        """Expand a ranged slurm nodelist into a list of node names.

        Returns ``[]`` when ``nodelist`` does not match
        ``config['node_regex']``.
        """
        # Use slurm to convert a nodelist with ranges into a comma separated list of unique nodes
        if re.search(self.config['node_regex'], nodelist):
            command = "scontrol show hostname %s | paste -d, -s" % nodelist
            return self._exec(command).split(',')
        return []

    def sync(self, limit=200, jobid=None, direction='both'):
        """Run one synchronisation pass.

        ``limit`` caps the number of stop actions and, separately, the number
        of start actions. ``jobid`` restricts the pass to that job.
        ``direction`` is one of ``'both'``, ``'start'`` or ``'stop'``.
        """
        if self.debug:
            print("DEBUG: sync called")
            print("DEBUG: jobid %s" % jobid)
        self._readSlurmData()
        self._readCCData()

        # Abort after a defined count of sync actions. The intent is to restart this script after the
        # limit is reached. Otherwise, if many many jobs get stopped, the script might miss some new jobs.
        sync_count = 0

        # iterate over cc jobs and stop them if they have already ended
        if direction in ['both', 'stop']:
            for job in self.ccData['jobs']:
                selected = not jobid or int(job['jobId']) == int(jobid)
                if selected and not self._jobRunning(job['jobId']):
                    self._ccStopJob(job['jobId'])
                    sync_count = sync_count + 1
                if sync_count >= limit:
                    print("INFO: sync limit (%s) reached" % limit)
                    break

        sync_count = 0
        # iterate over running jobs and add them to cc if they are still missing there
        if direction in ['both', 'start']:
            for job in self.slurmJobData['jobs']:
                # Skip this job if the user does not want the metadata of this job to be submitted to ClusterCockpit
                # The text field admin_comment is used for this. We assume that this field contains a comma separated
                # list of flags.
                if "disable_cc_submission" in job['admin_comment'].split(','):
                    print("INFO: Job %s: disable_cc_sumbission is set. Continue with next job" % job['job_id'])
                    continue
                # consider only running jobs
                if job['job_state'] == "RUNNING":
                    selected = not jobid or int(job['job_id']) == int(jobid)
                    if selected and not self._jobIdInCC(job['job_id']):
                        self._ccStartJob(job)
                        sync_count = sync_count + 1
                if sync_count >= limit:
                    print("INFO: sync limit (%s) reached" % limit)
                    break
|
||||
|
||||
if __name__ == "__main__":
    import argparse

    # Command line entry point: parse the options, load the JSON config file
    # and run one synchronisation pass.
    about = """This script syncs the slurm jobs with the cluster cockpit backend. It uses
    the slurm command line tools to gather the relevant slurm infos and reads
    the corresponding info from cluster cockpit via its api.

    After reading the data, it stops all jobs in cluster cockpit which are
    not running any more according to slurm and afterwards it creates all new
    running jobs in cluster cockpit.
    """
    parser = argparse.ArgumentParser(description=about)
    parser.add_argument(
        "-c", "--config",
        default="config.json",
        help="Read config file. Default: config.json")
    parser.add_argument(
        "-d", "--debug",
        action="store_true",
        help="Enable debug output")
    parser.add_argument("-j", "--jobid", help="Sync this jobid")
    parser.add_argument(
        "-l", "--limit",
        type=int,
        default="200",
        help="Stop after n sync actions in each direction. Default: 200")
    parser.add_argument(
        "--direction",
        choices=['both', 'start', 'stop'],
        default="both",
        help="Only sync in this direction")
    args = parser.parse_args()
    verbose = args.debug

    # open config file
    if verbose:
        print("DEBUG: load config file: %s" % args.config)
    with open(args.config, 'r', encoding='utf-8') as f:
        config = json.load(f)
    if verbose:
        print("DEBUG: config file contents:")
        print(config)

    s = SlurmSync(config, verbose)
    s.sync(args.limit, args.jobid, args.direction)
|
Loading…
Reference in New Issue
Block a user