Restructure.

This commit is contained in:
Joachim Meyer
2022-12-15 16:20:26 +01:00
parent a3ca962d84
commit b530d9034e
14 changed files with 71 additions and 736 deletions

View File

@@ -0,0 +1,29 @@
#!/usr/bin/python3
import json


def build_slot_gpu_map(condor_status):
    """Build {machine: {GPU hash id: PCIe bus id}} from condor_status slot ads.

    condor_status: list of slot classad dicts as produced by `condor_status -json`.
    Slots without an "AssignedGPUs" attribute contribute an empty mapping for
    their machine instead of raising KeyError.
    """
    slot_gpu_map = {}
    for slot in condor_status:
        machine = slot["Machine"]
        # Several slots may belong to one machine; merge into a single map.
        gpu_map = slot_gpu_map.setdefault(machine, {})
        assigned = slot.get("AssignedGPUs", "")
        if not assigned:
            continue
        for gpu_id in assigned.split(','):
            # Per-GPU details live in an attribute named "GPUs_<id>" with '-' → '_'.
            gpu = slot["GPUs_" + gpu_id.strip().replace("-", "_")]
            gpu_map[gpu["Id"]] = gpu["DevicePciBusId"]
    return slot_gpu_map


if __name__ == "__main__":
    import argparse
    about = """This script parses a `condor_status -json` dump file and spits out a GPU hash ID to PCIe address map.
"""
    parser = argparse.ArgumentParser(description=about)
    # nargs='?' makes the positional optional so the default is actually used;
    # without it argparse requires the argument and ignores `default`.
    parser.add_argument("condor_status_file", nargs='?',
                        help="`condor_status -json` dump", default="condor_status.json")
    args = parser.parse_args()
    # open dump file
    with open(args.condor_status_file, 'r', encoding='utf-8') as f:
        condor_status = json.load(f)
    print(json.dumps(build_slot_gpu_map(condor_status)))

47
scripts/map-gpu-ids.py Normal file
View File

@@ -0,0 +1,47 @@
#!/usr/bin/python3
from io import StringIO
import pandas as pd
import json
import subprocess
def fetch_condor_machines():
    """Return the "Machine" column of `condor_status -compact` run via ssh on `conduit`.

    Returns a pandas Series of machine names.
    Raises subprocess.CalledProcessError if the remote command fails, instead of
    silently feeding empty/garbage output to the parser.
    """
    compact_result = subprocess.run(
        ["ssh", "conduit", "condor_status", "-compact"],
        capture_output=True, text=True, check=True)
    # skipfooter=5: condor_status -compact ends with a summary footer
    # (requires the python engine; the C engine does not support skipfooter).
    # r'\s+' avoids the invalid escape sequence '\s' in a plain string.
    data = pd.read_csv(StringIO(compact_result.stdout),
                       sep=r'\s+', skipfooter=5, engine="python")
    return data["Machine"]
def mapping_for_machine(host):
    """Return {detected GPU name: PCIe bus id} for one machine.

    Runs `condor_status -json <host>` via ssh on `conduit` and scans the slot
    ads for GPU attributes.
    """
    machine_ads = subprocess.run(
        ["ssh", "conduit", "condor_status", "-json", host],
        capture_output=True, text=True)
    ads = json.loads(machine_ads.stdout)
    gpu_to_pci = {}
    for ad in ads:
        # Skip ads without a GPU inventory and ads that have a ParentSlotId
        # — NOTE(review): presumably child/dynamic slots are skipped so each
        # GPU is reported once; confirm against the pool's slot configuration.
        if 'DetectedGPUs' not in ad or 'ParentSlotId' in ad:
            continue
        for raw_name in ad['DetectedGPUs'].split(','):
            gpu_name = raw_name.strip()
            attr = gpu_name.replace('-', '_')
            # Two attribute layouts are handled: a nested "GPUs_<id>" classad,
            # or a flat "<id>DevicePciBusId" attribute.
            if 'GPUs_' + attr in ad:
                gpu_to_pci[gpu_name] = ad['GPUs_' + attr]['DevicePciBusId']
            elif attr + 'DevicePciBusId' in ad:
                gpu_to_pci[gpu_name] = ad[attr + 'DevicePciBusId']
    return gpu_to_pci
if __name__ == "__main__":
    import argparse
    about = """This script reads a map from "AssignedGPUs" names to the PCIe bus ids.
"""
    parser = argparse.ArgumentParser(description=about)
    parser.add_argument("--host", help="The host to map for.", default="")
    parser.add_argument("-d", "--debug", help="Enable debug output",
                        action="store_true")
    args = parser.parse_args()
    if args.host:
        # Single-host mode: only map the requested machine.
        print(json.dumps({args.host: mapping_for_machine(args.host)}))
    else:
        # Pool mode: map every machine condor_status reports.
        mappings = {machine: mapping_for_machine(machine)
                    for machine in fetch_condor_machines()}
        print(json.dumps(mappings))

70
scripts/stop-jobs.py Normal file
View File

@@ -0,0 +1,70 @@
#!/usr/bin/python3
import time
from dateutil import parser as dateparser
import requests
import json
class CCApi:
    """Minimal client for the ClusterCockpit backend REST API.

    Expects a parsed config dict with a 'cc-backend' section containing
    'host' (base URL) and 'apikey' (bearer token).
    """

    # NOTE: the previous version declared class-level mutable attributes
    # (config = {}, headers = {}) that would be shared across instances;
    # all state is now set per-instance in __init__.

    def __init__(self, config, debug=False):
        self.config = config
        self.apiurl = "%s/api/" % config['cc-backend']['host']
        self.apikey = config['cc-backend']['apikey']
        self.headers = {'accept': 'application/ld+json',
                        'Content-Type': 'application/json',
                        'Authorization': 'Bearer %s' % self.apikey}
        # NOTE(review): debug is stored but not consulted anywhere in this class.
        self.debug = debug

    def stopJob(self, id, data):
        """POST `data` to jobs/stop_job/<id>.

        Returns the parsed JSON response on HTTP 200; otherwise prints the
        payload and the error response and returns False.
        """
        url = self.apiurl + "jobs/stop_job/%d" % id
        r = requests.post(url, headers=self.headers, json=data)
        if r.status_code == 200:
            return r.json()
        print(data)
        print(r.status_code, r.content)
        return False
if __name__ == "__main__":
    import argparse
    # Help text rewritten: the previous description was copy-pasted from a
    # slurm-sync script and did not describe what this script does.
    about = """This script stops jobs in the cluster cockpit backend. It reads the jobs
to stop from a JSON dump file and marks each of them as cancelled via the
cc-backend REST API.
"""
    parser = argparse.ArgumentParser(description=about)
    parser.add_argument(
        "-c", "--config", help="Read config file. Default: config.json", default="config.json")
    parser.add_argument(
        "-j", "--jobs", help="Read job file. Default: tobestopped.json", default="tobestopped.json")
    parser.add_argument(
        "-d", "--debug", help="Enable debug output", action="store_true")
    args = parser.parse_args()
    with open(args.config, 'r', encoding='utf-8') as f:
        config = json.load(f)
    cc = CCApi(config, args.debug)
    # Bug fix: honor --jobs; the file name was previously hardcoded to
    # "tobestopped.json", making the option a no-op.
    with open(args.jobs, 'r', encoding='utf-8') as f:
        jobs = json.load(f)['data']['jobs']['items']
    for job in jobs:
        startTime = int(time.mktime(dateparser.parse(job['startTime']).timetuple()))
        data = {
            # Mark the job cancelled one second after its start time.
            "jobState": "cancelled",
            "stopTime": startTime + 1,
            "cluster": job['cluster'],
            "jobId": job['jobId'],
            "startTime": startTime
        }
        cc.stopJob(job['id'], data)