Mirror of https://gitlab.cs.uni-saarland.de/hpc/cc-condor-sync.git (synced 2025-07-27 15:16:09 +02:00)
Restructure.
scripts/condor_status_to_gpu_map.py (new file, 29 lines)
@@ -0,0 +1,29 @@
#!/usr/bin/python3

import json

if __name__ == "__main__":
    import argparse
    about = """This script parses a `condor_status -json` dump file and spits out a GPU hash ID to PCIe address map.
    """
    parser = argparse.ArgumentParser(description=about)
    # nargs="?" makes the positional optional so that the default actually applies
    parser.add_argument("condor_status_file", nargs="?",
                        help="`condor_status -json` dump", default="condor_status.json")
    args = parser.parse_args()

    # open dump file
    with open(args.condor_status_file, 'r', encoding='utf-8') as f:
        condor_status = json.load(f)

    # collect, per machine, a map from GPU hash ID to PCIe bus address
    slot_gpu_map = {}
    for slot in condor_status:
        # skip slots that have no GPUs assigned
        if "AssignedGPUs" not in slot:
            continue
        machine = slot["Machine"]
        gpu_map = {}
        if machine in slot_gpu_map:
            gpu_map = slot_gpu_map[machine]
        gpus = slot["AssignedGPUs"].split(',')
        for gpu_id in gpus:
            # a GPU name "GPU-<hash>" is published as an attribute "GPUs_GPU_<hash>"
            gpu = slot["GPUs_" + gpu_id.strip().replace("-", "_")]
            gpu_map[gpu["Id"]] = gpu["DevicePciBusId"]
        slot_gpu_map[machine] = gpu_map
    print(json.dumps(slot_gpu_map))
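For reference, a minimal sketch of the lookup this script performs on a single slot ad. The attribute names (`Machine`, `AssignedGPUs`, `GPUs_<name>`, `Id`, `DevicePciBusId`) are the ones the script reads; the machine name, GPU hash, and PCIe address below are made-up example values.

import json

# one hypothetical slot ad from a `condor_status -json` dump
slot = {
    "Machine": "node01.example.org",
    "AssignedGPUs": "GPU-ddc998f9",
    "GPUs_GPU_ddc998f9": {"Id": "GPU-ddc998f9", "DevicePciBusId": "0000:3B:00.0"},
}

# "GPU-<hash>" in AssignedGPUs refers to the attribute "GPUs_GPU_<hash>"
gpu = slot["GPUs_" + slot["AssignedGPUs"].strip().replace("-", "_")]
print(json.dumps({slot["Machine"]: {gpu["Id"]: gpu["DevicePciBusId"]}}))
# prints: {"node01.example.org": {"GPU-ddc998f9": "0000:3B:00.0"}}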
scripts/map-gpu-ids.py (new file, 47 lines)
@@ -0,0 +1,47 @@
#!/usr/bin/python3

from io import StringIO
import pandas as pd
import json
import subprocess


def fetch_condor_machines():
    # `condor_status -compact` prints one row per machine plus a summary footer
    compact_result = subprocess.run(
        ["ssh", "conduit", "condor_status", "-compact"], capture_output=True, text=True)
    data = pd.read_csv(StringIO(compact_result.stdout),
                       sep=r'\s+', skipfooter=5, engine="python")
    return data["Machine"]


def mapping_for_machine(host):
    machineAds = subprocess.run(
        ["ssh", "conduit", "condor_status", "-json", host], capture_output=True, text=True)
    info = json.loads(machineAds.stdout)
    mapping = {}
    for slot in info:
        # only consider slots that detected GPUs; skip dynamic child slots
        if 'DetectedGPUs' in slot and 'ParentSlotId' not in slot:
            detected = [name.strip() for name in slot['DetectedGPUs'].split(',')]
            for name in detected:
                snake = name.replace('-', '_').strip()
                if 'GPUs_' + snake in slot:
                    mapping[name] = slot['GPUs_' + snake]['DevicePciBusId']
                elif snake + 'DevicePciBusId' in slot:
                    mapping[name] = slot[snake + 'DevicePciBusId']
    return mapping


if __name__ == "__main__":
    import argparse
    about = """This script builds a map from `AssignedGPUs` names to the PCIe bus IDs.
    """
    parser = argparse.ArgumentParser(description=about)
    parser.add_argument("--host", help="The host to map for.", default="")
    parser.add_argument("-d", "--debug", help="Enable debug output", action="store_true")
    args = parser.parse_args()

    if len(args.host) > 0:
        print(json.dumps({args.host: mapping_for_machine(args.host)}))
    else:
        machines = fetch_condor_machines()
        mappings = {}
        for machine in machines:
            mappings[machine] = mapping_for_machine(machine)
        print(json.dumps(mappings))
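The two lookups in `mapping_for_machine` appear to cover two layouts in which a slot ad can publish GPU properties: a sub-ad per GPU (`GPUs_<name>`) and flat attributes prefixed with the GPU name (`<name>DevicePciBusId`). This interpretation is inferred from the code alone; the values below are made up for illustration.

# nested GPU ad: one sub-ad per detected GPU
nested_slot = {
    "DetectedGPUs": "GPU-ddc998f9",
    "GPUs_GPU_ddc998f9": {"DevicePciBusId": "0000:3B:00.0"},
}

# flat attributes, prefixed with the GPU name
flat_slot = {
    "DetectedGPUs": "CUDA0",
    "CUDA0DevicePciBusId": "0000:3B:00.0",
}

Fed to `mapping_for_machine`, the first shape would yield {"GPU-ddc998f9": "0000:3B:00.0"} via the `GPUs_` branch and the second {"CUDA0": "0000:3B:00.0"} via the fallback branch.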
scripts/stop-jobs.py (new file, 70 lines)
@@ -0,0 +1,70 @@
#!/usr/bin/python3

import time
from dateutil import parser as dateparser
import requests
import json


class CCApi:
    config = {}
    apiurl = ''
    apikey = ''
    headers = {}
    debug = False

    def __init__(self, config, debug=False):
        self.config = config
        self.apiurl = "%s/api/" % config['cc-backend']['host']
        self.apikey = config['cc-backend']['apikey']
        self.headers = {'accept': 'application/ld+json',
                        'Content-Type': 'application/json',
                        'Authorization': 'Bearer %s' % self.config['cc-backend']['apikey']}
        self.debug = debug

    def stopJob(self, id, data):
        url = self.apiurl + "jobs/stop_job/%d" % id
        r = requests.post(url, headers=self.headers, json=data)
        if r.status_code == 200:
            return r.json()
        else:
            print(data)
            print(r.status_code, r.content)
            return False


if __name__ == "__main__":
    import argparse
    about = """This script stops jobs in the cluster cockpit backend. It reads the jobs
    to stop from a JSON dump and marks each of them as cancelled via the
    cluster cockpit REST API.
    """
    parser = argparse.ArgumentParser(description=about)
    parser.add_argument(
        "-c", "--config", help="Read config file. Default: config.json", default="config.json")
    parser.add_argument(
        "-j", "--jobs", help="Read job file. Default: tobestopped.json", default="tobestopped.json")
    parser.add_argument(
        "-d", "--debug", help="Enable debug output", action="store_true")
    args = parser.parse_args()

    with open(args.config, 'r', encoding='utf-8') as f:
        config = json.load(f)

    cc = CCApi(config, args.debug)
    # honor the --jobs argument instead of a hard-coded file name
    with open(args.jobs) as f:
        jobs = json.load(f)['data']['jobs']['items']

    for job in jobs:
        # the API expects Unix timestamps; parse the ISO start time
        startTime = int(time.mktime(dateparser.parse(job['startTime']).timetuple()))
        data = {
            "jobState": "cancelled",
            "stopTime": startTime + 1,
            "cluster": job['cluster'],
            "jobId": job['jobId'],
            "startTime": startTime
        }
        cc.stopJob(job['id'], data)
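The expected shapes of the two input files can be inferred from how the script indexes into them. The sketch below uses placeholder values (host, API key, and job fields are hypothetical); the `data/jobs/items` nesting suggests the job file is a dump of a cc-backend GraphQL jobs query, though the source does not say so explicitly.

# config.json, as read by CCApi.__init__
config = {
    "cc-backend": {
        "host": "https://cc.example.org",
        "apikey": "<jwt-api-key>",
    }
}

# tobestopped.json, as indexed by the main loop
tobestopped = {
    "data": {
        "jobs": {
            "items": [
                {
                    "id": 4711,
                    "jobId": 12345,
                    "cluster": "example-cluster",
                    "startTime": "2025-01-01T12:00:00+01:00",
                },
            ]
        }
    }
}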