From 523a688bed364727f206f2704ffc746f28b10e4a Mon Sep 17 00:00:00 2001 From: Aditya Ujeniya Date: Thu, 10 Oct 2024 15:04:47 +0200 Subject: [PATCH 01/25] Initial commit --- cc-metric-store/Dockerfile | 5 +- data/init.sh | 34 - data/ldap/users.ldif | 1027 -------------------------- data/mariadb/slurm.cnf | 5 - data/slurm/home/config/cgroup.conf | 4 - data/slurm/home/config/slurm.conf | 48 -- data/slurm/home/config/slurmdbd.conf | 31 - docker-compose.yml | 1 + migrateTimestamps.pl | 127 ++-- setupDev.sh | 33 +- slurm/base/Dockerfile | 2 +- slurm/controller/Dockerfile | 2 +- slurm/database/Dockerfile | 2 +- slurm/rest/Dockerfile | 2 +- slurm/worker/Dockerfile | 2 +- 15 files changed, 89 insertions(+), 1236 deletions(-) delete mode 100755 data/init.sh delete mode 100644 data/ldap/users.ldif delete mode 100644 data/mariadb/slurm.cnf delete mode 100644 data/slurm/home/config/cgroup.conf delete mode 100644 data/slurm/home/config/slurm.conf delete mode 100644 data/slurm/home/config/slurmdbd.conf diff --git a/cc-metric-store/Dockerfile b/cc-metric-store/Dockerfile index 4284c98..eb7aa48 100644 --- a/cc-metric-store/Dockerfile +++ b/cc-metric-store/Dockerfile @@ -1,10 +1,11 @@ -FROM golang:1.17 +FROM golang:1.22.4 RUN apt-get update RUN apt-get -y install git RUN git clone https://github.com/ClusterCockpit/cc-metric-store.git /cc-metric-store -RUN cd /cc-metric-store && go build +RUN ls +RUN cd /cc-metric-store && go build ./cmd/cc-metric-store # Reactivate when latest commit is available #RUN go get -d -v github.com/ClusterCockpit/cc-metric-store diff --git a/data/init.sh b/data/init.sh deleted file mode 100755 index 3bddade..0000000 --- a/data/init.sh +++ /dev/null @@ -1,34 +0,0 @@ -#!/usr/bin/env bash - -if [ -d symfony ]; then - echo "Data already initialized!" - echo -n "Perform a fresh initialisation? [yes to proceed / no to exit] " - read -r answer - if [ "$answer" == "yes" ]; then - echo "Cleaning directories ..." 
- rm -rf symfony - rm -rf job-archive - rm -rf influxdb/data/* - rm -rf sqldata/* - echo "done." - else - echo "Aborting ..." - exit - fi -fi - -mkdir symfony -wget https://hpc-mover.rrze.uni-erlangen.de/HPC-Data/0x7b58aefb/eig7ahyo6fo2bais0ephuf2aitohv1ai/job-archive_stable.tar.xz -tar xJf job-archive_stable.tar.xz -rm ./job-archive_stable.tar.xz - -# 101 is the uid and gid of the user and group www-data in the cc-php container running php-fpm. -# For a demo with no new jobs it is enough to give www read permissions on that directory. -# echo "This script needs to chown the job-archive directory so that the application can write to it:" -# sudo chown -R 82:82 ./job-archive - -mkdir -p influxdb/data -wget https://hpc-mover.rrze.uni-erlangen.de/HPC-Data/0x7b58aefb/eig7ahyo6fo2bais0ephuf2aitohv1ai/influxdbv2-data_stable.tar.xz -cd influxdb/data -tar xJf ../../influxdbv2-data_stable.tar.xz -rm ../../influxdbv2-data_stable.tar.xz diff --git a/data/ldap/users.ldif b/data/ldap/users.ldif deleted file mode 100644 index 79a390a..0000000 --- a/data/ldap/users.ldif +++ /dev/null @@ -1,1027 +0,0 @@ -# extended LDIF -# -# LDAPv3 -# base with scope subtree -# filter: (objectclass=*) -# requesting: ALL - -# people, hpc, rrze.uni-erlangen.de -dn: ou=hpc,dc=rrze,dc=uni-erlangen,dc=de -objectClass: organizationalUnit -objectClass: top -ou: hpc - -# emmyUser1, hpc, rrze.uni-erlangen.de -dn: uid=emmyUser1,ou=hpc,dc=rrze,dc=uni-erlangen,dc=de -loginShell: /bin/bash -gidNumber: 12000 -objectClass: account -objectClass: posixAccount -uid: emmyUser1 -uidNumber: 10000 -gecos: Ann Watson -cn: emmyUser1 -homeDirectory: /home/hpc/emmyUser1 -userPassword: emmyUser1 - -# emmyUser10, hpc, rrze.uni-erlangen.de -dn: uid=emmyUser10,ou=hpc,dc=rrze,dc=uni-erlangen,dc=de -loginShell: /bin/bash -gidNumber: 12000 -objectClass: account -objectClass: posixAccount -uid: emmyUser10 -uidNumber: 10001 -gecos: Kenneth Wallis -cn: emmyUser10 -homeDirectory: /home/hpc/emmyUser10 -userPassword: emmyUser10 - -# 
emmyUser2, hpc, rrze.uni-erlangen.de -dn: uid=emmyUser2,ou=hpc,dc=rrze,dc=uni-erlangen,dc=de -loginShell: /bin/bash -gidNumber: 12000 -objectClass: account -objectClass: posixAccount -uid: emmyUser2 -uidNumber: 10002 -gecos: Lewis Bennett -cn: emmyUser2 -homeDirectory: /home/hpc/emmyUser2 -userPassword: emmyUser2 - -# emmyUser3, hpc, rrze.uni-erlangen.de -dn: uid=emmyUser3,ou=hpc,dc=rrze,dc=uni-erlangen,dc=de -loginShell: /bin/bash -gidNumber: 12000 -objectClass: account -objectClass: posixAccount -uid: emmyUser3 -uidNumber: 10003 -gecos: Darren Jenkins -cn: emmyUser3 -homeDirectory: /home/hpc/emmyUser3 -userPassword: emmyUser3 - -# emmyUser4, hpc, rrze.uni-erlangen.de -dn: uid=emmyUser4,ou=hpc,dc=rrze,dc=uni-erlangen,dc=de -loginShell: /bin/bash -gidNumber: 12000 -objectClass: account -objectClass: posixAccount -uid: emmyUser4 -uidNumber: 10004 -gecos: Terry Johnson -cn: emmyUser4 -homeDirectory: /home/hpc/emmyUser4 -userPassword: emmyUser4 - -# emmyUser5, hpc, rrze.uni-erlangen.de -dn: uid=emmyUser5,ou=hpc,dc=rrze,dc=uni-erlangen,dc=de -loginShell: /bin/bash -gidNumber: 12000 -objectClass: account -objectClass: posixAccount -uid: emmyUser5 -uidNumber: 10005 -gecos: Shaun Hurst -cn: emmyUser5 -homeDirectory: /home/hpc/emmyUser5 -userPassword: emmyUser5 - -# emmyUser6, hpc, rrze.uni-erlangen.de -dn: uid=emmyUser6,ou=hpc,dc=rrze,dc=uni-erlangen,dc=de -loginShell: /bin/bash -gidNumber: 12000 -objectClass: account -objectClass: posixAccount -uid: emmyUser6 -uidNumber: 10006 -gecos: Peter Peters -cn: emmyUser6 -homeDirectory: /home/hpc/emmyUser6 -userPassword: emmyUser6 - -# emmyUser7, hpc, rrze.uni-erlangen.de -dn: uid=emmyUser7,ou=hpc,dc=rrze,dc=uni-erlangen,dc=de -loginShell: /bin/bash -gidNumber: 12000 -objectClass: account -objectClass: posixAccount -uid: emmyUser7 -uidNumber: 10007 -gecos: Sean Davies -cn: emmyUser7 -homeDirectory: /home/hpc/emmyUser7 -userPassword: emmyUser7 - -# emmyUser8, hpc, rrze.uni-erlangen.de -dn: 
uid=emmyUser8,ou=hpc,dc=rrze,dc=uni-erlangen,dc=de -loginShell: /bin/bash -gidNumber: 12000 -objectClass: account -objectClass: posixAccount -uid: emmyUser8 -uidNumber: 10008 -gecos: Kyle Lawrence -cn: emmyUser8 -homeDirectory: /home/hpc/emmyUser8 -userPassword: emmyUser8 - -# emmyUser9, hpc, rrze.uni-erlangen.de -dn: uid=emmyUser9,ou=hpc,dc=rrze,dc=uni-erlangen,dc=de -loginShell: /bin/bash -gidNumber: 12000 -objectClass: account -objectClass: posixAccount -uid: emmyUser9 -uidNumber: 10009 -gecos: Ryan Edwards -cn: emmyUser9 -homeDirectory: /home/hpc/emmyUser9 -userPassword: emmyUser9 - -# influxUser1, hpc, rrze.uni-erlangen.de -dn: uid=influxUser1,ou=hpc,dc=rrze,dc=uni-erlangen,dc=de -loginShell: /bin/bash -gidNumber: 12000 -objectClass: account -objectClass: posixAccount -uid: influxUser1 -uidNumber: 10010 -gecos: Dale Sharpe -cn: influxUser1 -homeDirectory: /home/hpc/influxUser1 -userPassword: influxUser1 - -# influxUser10, hpc, rrze.uni-erlangen.de -dn: uid=influxUser10,ou=hpc,dc=rrze,dc=uni-erlangen,dc=de -loginShell: /bin/bash -gidNumber: 12000 -objectClass: account -objectClass: posixAccount -uid: influxUser10 -uidNumber: 10011 -gecos: Tracey McCarthy -cn: influxUser10 -homeDirectory: /home/hpc/influxUser10 -userPassword: influxUser10 - -# influxUser11, hpc, rrze.uni-erlangen.de -dn: uid=influxUser11,ou=hpc,dc=rrze,dc=uni-erlangen,dc=de -loginShell: /bin/bash -gidNumber: 12000 -objectClass: account -objectClass: posixAccount -uid: influxUser11 -uidNumber: 10012 -gecos: Douglas Harrison -cn: influxUser11 -homeDirectory: /home/hpc/influxUser11 -userPassword: influxUser11 - -# influxUser12, hpc, rrze.uni-erlangen.de -dn: uid=influxUser12,ou=hpc,dc=rrze,dc=uni-erlangen,dc=de -loginShell: /bin/bash -gidNumber: 12000 -objectClass: account -objectClass: posixAccount -uid: influxUser12 -uidNumber: 10013 -gecos: Kimberley Powell -cn: influxUser12 -homeDirectory: /home/hpc/influxUser12 -userPassword: influxUser12 - -# influxUser13, hpc, rrze.uni-erlangen.de -dn: 
uid=influxUser13,ou=hpc,dc=rrze,dc=uni-erlangen,dc=de -loginShell: /bin/bash -gidNumber: 12000 -objectClass: account -objectClass: posixAccount -uid: influxUser13 -uidNumber: 10014 -gecos: Patrick Hill -cn: influxUser13 -homeDirectory: /home/hpc/influxUser13 -userPassword: influxUser13 - -# influxUser14, hpc, rrze.uni-erlangen.de -dn: uid=influxUser14,ou=hpc,dc=rrze,dc=uni-erlangen,dc=de -loginShell: /bin/bash -gidNumber: 12000 -objectClass: account -objectClass: posixAccount -uid: influxUser14 -uidNumber: 10015 -gecos: Harriet Chadwick -cn: influxUser14 -homeDirectory: /home/hpc/influxUser14 -userPassword: influxUser14 - -# influxUser15, hpc, rrze.uni-erlangen.de -dn: uid=influxUser15,ou=hpc,dc=rrze,dc=uni-erlangen,dc=de -loginShell: /bin/bash -gidNumber: 12000 -objectClass: account -objectClass: posixAccount -uid: influxUser15 -uidNumber: 10016 -gecos: Annette Parker -cn: influxUser15 -homeDirectory: /home/hpc/influxUser15 -userPassword: influxUser15 - -# influxUser16, hpc, rrze.uni-erlangen.de -dn: uid=influxUser16,ou=hpc,dc=rrze,dc=uni-erlangen,dc=de -loginShell: /bin/bash -gidNumber: 12000 -objectClass: account -objectClass: posixAccount -uid: influxUser16 -uidNumber: 10017 -gecos: Owen Price -cn: influxUser16 -homeDirectory: /home/hpc/influxUser16 -userPassword: influxUser16 - -# influxUser17, hpc, rrze.uni-erlangen.de -dn: uid=influxUser17,ou=hpc,dc=rrze,dc=uni-erlangen,dc=de -loginShell: /bin/bash -gidNumber: 12000 -objectClass: account -objectClass: posixAccount -uid: influxUser17 -uidNumber: 10018 -gecos: Kyle Patel -cn: influxUser17 -homeDirectory: /home/hpc/influxUser17 -userPassword: influxUser17 - -# influxUser18, hpc, rrze.uni-erlangen.de -dn: uid=influxUser18,ou=hpc,dc=rrze,dc=uni-erlangen,dc=de -loginShell: /bin/bash -gidNumber: 12000 -objectClass: account -objectClass: posixAccount -uid: influxUser18 -uidNumber: 10019 -gecos: Denis Barber -cn: influxUser18 -homeDirectory: /home/hpc/influxUser18 -userPassword: influxUser18 - -# influxUser19, hpc, 
rrze.uni-erlangen.de -dn: uid=influxUser19,ou=hpc,dc=rrze,dc=uni-erlangen,dc=de -loginShell: /bin/bash -gidNumber: 12000 -objectClass: account -objectClass: posixAccount -uid: influxUser19 -uidNumber: 10020 -gecos: Diane Birch -cn: influxUser19 -homeDirectory: /home/hpc/influxUser19 -userPassword: influxUser19 - -# influxUser2, hpc, rrze.uni-erlangen.de -dn: uid=influxUser2,ou=hpc,dc=rrze,dc=uni-erlangen,dc=de -loginShell: /bin/bash -gidNumber: 12000 -objectClass: account -objectClass: posixAccount -uid: influxUser2 -uidNumber: 10021 -gecos: Jordan Walker -cn: influxUser2 -homeDirectory: /home/hpc/influxUser2 -userPassword: influxUser2 - -# influxUser20, hpc, rrze.uni-erlangen.de -dn: uid=influxUser20,ou=hpc,dc=rrze,dc=uni-erlangen,dc=de -loginShell: /bin/bash -gidNumber: 12000 -objectClass: account -objectClass: posixAccount -uid: influxUser20 -uidNumber: 10022 -gecos: Brian Wilson -cn: influxUser20 -homeDirectory: /home/hpc/influxUser20 -userPassword: influxUser20 - -# influxUser21, hpc, rrze.uni-erlangen.de -dn: uid=influxUser21,ou=hpc,dc=rrze,dc=uni-erlangen,dc=de -loginShell: /bin/bash -gidNumber: 12000 -objectClass: account -objectClass: posixAccount -uid: influxUser21 -uidNumber: 10023 -gecos: Molly Miller -cn: influxUser21 -homeDirectory: /home/hpc/influxUser21 -userPassword: influxUser21 - -# influxUser22, hpc, rrze.uni-erlangen.de -dn: uid=influxUser22,ou=hpc,dc=rrze,dc=uni-erlangen,dc=de -loginShell: /bin/bash -gidNumber: 12000 -objectClass: account -objectClass: posixAccount -uid: influxUser22 -uidNumber: 10024 -gecos: Reece Godfrey -cn: influxUser22 -homeDirectory: /home/hpc/influxUser22 -userPassword: influxUser22 - -# influxUser23, hpc, rrze.uni-erlangen.de -dn: uid=influxUser23,ou=hpc,dc=rrze,dc=uni-erlangen,dc=de -loginShell: /bin/bash -gidNumber: 12000 -objectClass: account -objectClass: posixAccount -uid: influxUser23 -uidNumber: 10025 -gecos: Antony Cooper -cn: influxUser23 -homeDirectory: /home/hpc/influxUser23 -userPassword: influxUser23 - -# 
influxUser24, hpc, rrze.uni-erlangen.de -dn: uid=influxUser24,ou=hpc,dc=rrze,dc=uni-erlangen,dc=de -loginShell: /bin/bash -gidNumber: 12000 -objectClass: account -objectClass: posixAccount -uid: influxUser24 -uidNumber: 10026 -gecos: Mark Evans -cn: influxUser24 -homeDirectory: /home/hpc/influxUser24 -userPassword: influxUser24 - -# influxUser25, hpc, rrze.uni-erlangen.de -dn: uid=influxUser25,ou=hpc,dc=rrze,dc=uni-erlangen,dc=de -loginShell: /bin/bash -gidNumber: 12000 -objectClass: account -objectClass: posixAccount -uid: influxUser25 -uidNumber: 10027 -gecos: Edward Coleman -cn: influxUser25 -homeDirectory: /home/hpc/influxUser25 -userPassword: influxUser25 - -# influxUser26, hpc, rrze.uni-erlangen.de -dn: uid=influxUser26,ou=hpc,dc=rrze,dc=uni-erlangen,dc=de -loginShell: /bin/bash -gidNumber: 12000 -objectClass: account -objectClass: posixAccount -uid: influxUser26 -uidNumber: 10028 -gecos: Lucy Marsden -cn: influxUser26 -homeDirectory: /home/hpc/influxUser26 -userPassword: influxUser26 - -# influxUser27, hpc, rrze.uni-erlangen.de -dn: uid=influxUser27,ou=hpc,dc=rrze,dc=uni-erlangen,dc=de -loginShell: /bin/bash -gidNumber: 12000 -objectClass: account -objectClass: posixAccount -uid: influxUser27 -uidNumber: 10029 -gecos: Leonard King -cn: influxUser27 -homeDirectory: /home/hpc/influxUser27 -userPassword: influxUser27 - -# influxUser28, hpc, rrze.uni-erlangen.de -dn: uid=influxUser28,ou=hpc,dc=rrze,dc=uni-erlangen,dc=de -loginShell: /bin/bash -gidNumber: 12000 -objectClass: account -objectClass: posixAccount -uid: influxUser28 -uidNumber: 10030 -gecos: Marion Harvey -cn: influxUser28 -homeDirectory: /home/hpc/influxUser28 -userPassword: influxUser28 - -# influxUser29, hpc, rrze.uni-erlangen.de -dn: uid=influxUser29,ou=hpc,dc=rrze,dc=uni-erlangen,dc=de -loginShell: /bin/bash -gidNumber: 12000 -objectClass: account -objectClass: posixAccount -uid: influxUser29 -uidNumber: 10031 -gecos: Jean Phillips -cn: influxUser29 -homeDirectory: /home/hpc/influxUser29 
-userPassword: influxUser29 - -# influxUser3, hpc, rrze.uni-erlangen.de -dn: uid=influxUser3,ou=hpc,dc=rrze,dc=uni-erlangen,dc=de -loginShell: /bin/bash -gidNumber: 12000 -objectClass: account -objectClass: posixAccount -uid: influxUser3 -uidNumber: 10032 -gecos: Derek Sutton -cn: influxUser3 -homeDirectory: /home/hpc/influxUser3 -userPassword: influxUser3 - -# influxUser30, hpc, rrze.uni-erlangen.de -dn: uid=influxUser30,ou=hpc,dc=rrze,dc=uni-erlangen,dc=de -loginShell: /bin/bash -gidNumber: 12000 -objectClass: account -objectClass: posixAccount -uid: influxUser30 -uidNumber: 10033 -gecos: Marion Powell -cn: influxUser30 -homeDirectory: /home/hpc/influxUser30 -userPassword: influxUser30 - -# influxUser31, hpc, rrze.uni-erlangen.de -dn: uid=influxUser31,ou=hpc,dc=rrze,dc=uni-erlangen,dc=de -loginShell: /bin/bash -gidNumber: 12000 -objectClass: account -objectClass: posixAccount -uid: influxUser31 -uidNumber: 10034 -gecos: Laura Matthews -cn: influxUser31 -homeDirectory: /home/hpc/influxUser31 -userPassword: influxUser31 - -# influxUser32, hpc, rrze.uni-erlangen.de -dn: uid=influxUser32,ou=hpc,dc=rrze,dc=uni-erlangen,dc=de -loginShell: /bin/bash -gidNumber: 12000 -objectClass: account -objectClass: posixAccount -uid: influxUser32 -uidNumber: 10035 -gecos: Julie Bell -cn: influxUser32 -homeDirectory: /home/hpc/influxUser32 -userPassword: influxUser32 - -# influxUser33, hpc, rrze.uni-erlangen.de -dn: uid=influxUser33,ou=hpc,dc=rrze,dc=uni-erlangen,dc=de -loginShell: /bin/bash -gidNumber: 12000 -objectClass: account -objectClass: posixAccount -uid: influxUser33 -uidNumber: 10036 -gecos: Thomas Davies -cn: influxUser33 -homeDirectory: /home/hpc/influxUser33 -userPassword: influxUser33 - -# influxUser34, hpc, rrze.uni-erlangen.de -dn: uid=influxUser34,ou=hpc,dc=rrze,dc=uni-erlangen,dc=de -loginShell: /bin/bash -gidNumber: 12000 -objectClass: account -objectClass: posixAccount -uid: influxUser34 -uidNumber: 10037 -gecos: Robin Webster -cn: influxUser34 -homeDirectory: 
/home/hpc/influxUser34 -userPassword: influxUser34 - -# influxUser35, hpc, rrze.uni-erlangen.de -dn: uid=influxUser35,ou=hpc,dc=rrze,dc=uni-erlangen,dc=de -loginShell: /bin/bash -gidNumber: 12000 -objectClass: account -objectClass: posixAccount -uid: influxUser35 -uidNumber: 10038 -gecos: Josh Robinson -cn: influxUser35 -homeDirectory: /home/hpc/influxUser35 -userPassword: influxUser35 - -# influxUser36, hpc, rrze.uni-erlangen.de -dn: uid=influxUser36,ou=hpc,dc=rrze,dc=uni-erlangen,dc=de -loginShell: /bin/bash -gidNumber: 12000 -objectClass: account -objectClass: posixAccount -uid: influxUser36 -uidNumber: 10039 -gecos: Eileen Murphy -cn: influxUser36 -homeDirectory: /home/hpc/influxUser36 -userPassword: influxUser36 - -# influxUser37, hpc, rrze.uni-erlangen.de -dn: uid=influxUser37,ou=hpc,dc=rrze,dc=uni-erlangen,dc=de -loginShell: /bin/bash -gidNumber: 12000 -objectClass: account -objectClass: posixAccount -uid: influxUser37 -uidNumber: 10040 -gecos: Charlene Carter -cn: influxUser37 -homeDirectory: /home/hpc/influxUser37 -userPassword: influxUser37 - -# influxUser38, hpc, rrze.uni-erlangen.de -dn: uid=influxUser38,ou=hpc,dc=rrze,dc=uni-erlangen,dc=de -loginShell: /bin/bash -gidNumber: 12000 -objectClass: account -objectClass: posixAccount -uid: influxUser38 -uidNumber: 10041 -gecos: Declan Brown -cn: influxUser38 -homeDirectory: /home/hpc/influxUser38 -userPassword: influxUser38 - -# influxUser39, hpc, rrze.uni-erlangen.de -dn: uid=influxUser39,ou=hpc,dc=rrze,dc=uni-erlangen,dc=de -loginShell: /bin/bash -gidNumber: 12000 -objectClass: account -objectClass: posixAccount -uid: influxUser39 -uidNumber: 10042 -gecos: Lee Wilson -cn: influxUser39 -homeDirectory: /home/hpc/influxUser39 -userPassword: influxUser39 - -# influxUser4, hpc, rrze.uni-erlangen.de -dn: uid=influxUser4,ou=hpc,dc=rrze,dc=uni-erlangen,dc=de -loginShell: /bin/bash -gidNumber: 12000 -objectClass: account -objectClass: posixAccount -uid: influxUser4 -uidNumber: 10043 -gecos: Steven Collier -cn: 
influxUser4 -homeDirectory: /home/hpc/influxUser4 -userPassword: influxUser4 - -# influxUser40, hpc, rrze.uni-erlangen.de -dn: uid=influxUser40,ou=hpc,dc=rrze,dc=uni-erlangen,dc=de -loginShell: /bin/bash -gidNumber: 12000 -objectClass: account -objectClass: posixAccount -uid: influxUser40 -uidNumber: 10044 -gecos: Ashley Smith -cn: influxUser40 -homeDirectory: /home/hpc/influxUser40 -userPassword: influxUser40 - -# influxUser41, hpc, rrze.uni-erlangen.de -dn: uid=influxUser41,ou=hpc,dc=rrze,dc=uni-erlangen,dc=de -loginShell: /bin/bash -gidNumber: 12000 -objectClass: account -objectClass: posixAccount -uid: influxUser41 -uidNumber: 10045 -gecos: Alison Robinson -cn: influxUser41 -homeDirectory: /home/hpc/influxUser41 -userPassword: influxUser41 - -# influxUser42, hpc, rrze.uni-erlangen.de -dn: uid=influxUser42,ou=hpc,dc=rrze,dc=uni-erlangen,dc=de -loginShell: /bin/bash -gidNumber: 12000 -objectClass: account -objectClass: posixAccount -uid: influxUser42 -uidNumber: 10046 -gecos: Sandra Dunn -cn: influxUser42 -homeDirectory: /home/hpc/influxUser42 -userPassword: influxUser42 - -# influxUser43, hpc, rrze.uni-erlangen.de -dn: uid=influxUser43,ou=hpc,dc=rrze,dc=uni-erlangen,dc=de -loginShell: /bin/bash -gidNumber: 12000 -objectClass: account -objectClass: posixAccount -uid: influxUser43 -uidNumber: 10047 -gecos: Cheryl Price -cn: influxUser43 -homeDirectory: /home/hpc/influxUser43 -userPassword: influxUser43 - -# influxUser44, hpc, rrze.uni-erlangen.de -dn: uid=influxUser44,ou=hpc,dc=rrze,dc=uni-erlangen,dc=de -loginShell: /bin/bash -gidNumber: 12000 -objectClass: account -objectClass: posixAccount -uid: influxUser44 -uidNumber: 10048 -gecos: June Nicholson -cn: influxUser44 -homeDirectory: /home/hpc/influxUser44 -userPassword: influxUser44 - -# influxUser45, hpc, rrze.uni-erlangen.de -dn: uid=influxUser45,ou=hpc,dc=rrze,dc=uni-erlangen,dc=de -loginShell: /bin/bash -gidNumber: 12000 -objectClass: account -objectClass: posixAccount -uid: influxUser45 -uidNumber: 10049 
-gecos: Olivia Potter -cn: influxUser45 -homeDirectory: /home/hpc/influxUser45 -userPassword: influxUser45 - -# influxUser46, hpc, rrze.uni-erlangen.de -dn: uid=influxUser46,ou=hpc,dc=rrze,dc=uni-erlangen,dc=de -loginShell: /bin/bash -gidNumber: 12000 -objectClass: account -objectClass: posixAccount -uid: influxUser46 -uidNumber: 10050 -gecos: Melissa Welch -cn: influxUser46 -homeDirectory: /home/hpc/influxUser46 -userPassword: influxUser46 - -# influxUser47, hpc, rrze.uni-erlangen.de -dn: uid=influxUser47,ou=hpc,dc=rrze,dc=uni-erlangen,dc=de -loginShell: /bin/bash -gidNumber: 12000 -objectClass: account -objectClass: posixAccount -uid: influxUser47 -uidNumber: 10051 -gecos: Marc Sims -cn: influxUser47 -homeDirectory: /home/hpc/influxUser47 -userPassword: influxUser47 - -# influxUser48, hpc, rrze.uni-erlangen.de -dn: uid=influxUser48,ou=hpc,dc=rrze,dc=uni-erlangen,dc=de -loginShell: /bin/bash -gidNumber: 12000 -objectClass: account -objectClass: posixAccount -uid: influxUser48 -uidNumber: 10052 -gecos: Alan Harris -cn: influxUser48 -homeDirectory: /home/hpc/influxUser48 -userPassword: influxUser48 - -# influxUser49, hpc, rrze.uni-erlangen.de -dn: uid=influxUser49,ou=hpc,dc=rrze,dc=uni-erlangen,dc=de -loginShell: /bin/bash -gidNumber: 12000 -objectClass: account -objectClass: posixAccount -uid: influxUser49 -uidNumber: 10053 -gecos: Declan Harrison -cn: influxUser49 -homeDirectory: /home/hpc/influxUser49 -userPassword: influxUser49 - -# influxUser5, hpc, rrze.uni-erlangen.de -dn: uid=influxUser5,ou=hpc,dc=rrze,dc=uni-erlangen,dc=de -loginShell: /bin/bash -gidNumber: 12000 -objectClass: account -objectClass: posixAccount -uid: influxUser5 -uidNumber: 10054 -gecos: Maureen Hall -cn: influxUser5 -homeDirectory: /home/hpc/influxUser5 -userPassword: influxUser5 - -# influxUser50, hpc, rrze.uni-erlangen.de -dn: uid=influxUser50,ou=hpc,dc=rrze,dc=uni-erlangen,dc=de -loginShell: /bin/bash -gidNumber: 12000 -objectClass: account -objectClass: posixAccount -uid: influxUser50 
-uidNumber: 10055 -gecos: Daniel Wilson -cn: influxUser50 -homeDirectory: /home/hpc/influxUser50 -userPassword: influxUser50 - -# influxUser51, hpc, rrze.uni-erlangen.de -dn: uid=influxUser51,ou=hpc,dc=rrze,dc=uni-erlangen,dc=de -loginShell: /bin/bash -gidNumber: 12000 -objectClass: account -objectClass: posixAccount -uid: influxUser51 -uidNumber: 10056 -gecos: Ben Palmer -cn: influxUser51 -homeDirectory: /home/hpc/influxUser51 -userPassword: influxUser51 - -# influxUser52, hpc, rrze.uni-erlangen.de -dn: uid=influxUser52,ou=hpc,dc=rrze,dc=uni-erlangen,dc=de -loginShell: /bin/bash -gidNumber: 12000 -objectClass: account -objectClass: posixAccount -uid: influxUser52 -uidNumber: 10057 -gecos: Sarah Lyons -cn: influxUser52 -homeDirectory: /home/hpc/influxUser52 -userPassword: influxUser52 - -# influxUser53, hpc, rrze.uni-erlangen.de -dn: uid=influxUser53,ou=hpc,dc=rrze,dc=uni-erlangen,dc=de -loginShell: /bin/bash -gidNumber: 12000 -objectClass: account -objectClass: posixAccount -uid: influxUser53 -uidNumber: 10058 -gecos: Frank Hill -cn: influxUser53 -homeDirectory: /home/hpc/influxUser53 -userPassword: influxUser53 - -# influxUser54, hpc, rrze.uni-erlangen.de -dn: uid=influxUser54,ou=hpc,dc=rrze,dc=uni-erlangen,dc=de -loginShell: /bin/bash -gidNumber: 12000 -objectClass: account -objectClass: posixAccount -uid: influxUser54 -uidNumber: 10059 -gecos: Elliott Brown -cn: influxUser54 -homeDirectory: /home/hpc/influxUser54 -userPassword: influxUser54 - -# influxUser55, hpc, rrze.uni-erlangen.de -dn: uid=influxUser55,ou=hpc,dc=rrze,dc=uni-erlangen,dc=de -loginShell: /bin/bash -gidNumber: 12000 -objectClass: account -objectClass: posixAccount -uid: influxUser55 -uidNumber: 10060 -gecos: Shirley Pritchard -cn: influxUser55 -homeDirectory: /home/hpc/influxUser55 -userPassword: influxUser55 - -# influxUser56, hpc, rrze.uni-erlangen.de -dn: uid=influxUser56,ou=hpc,dc=rrze,dc=uni-erlangen,dc=de -loginShell: /bin/bash -gidNumber: 12000 -objectClass: account -objectClass: 
posixAccount -uid: influxUser56 -uidNumber: 10061 -gecos: Sylvia Morris -cn: influxUser56 -homeDirectory: /home/hpc/influxUser56 -userPassword: influxUser56 - -# influxUser57, hpc, rrze.uni-erlangen.de -dn: uid=influxUser57,ou=hpc,dc=rrze,dc=uni-erlangen,dc=de -loginShell: /bin/bash -gidNumber: 12000 -objectClass: account -objectClass: posixAccount -uid: influxUser57 -uidNumber: 10062 -gecos: Arthur Green -cn: influxUser57 -homeDirectory: /home/hpc/influxUser57 -userPassword: influxUser57 - -# influxUser58, hpc, rrze.uni-erlangen.de -dn: uid=influxUser58,ou=hpc,dc=rrze,dc=uni-erlangen,dc=de -loginShell: /bin/bash -gidNumber: 12000 -objectClass: account -objectClass: posixAccount -uid: influxUser58 -uidNumber: 10063 -gecos: Steven Begum -cn: influxUser58 -homeDirectory: /home/hpc/influxUser58 -userPassword: influxUser58 - -# influxUser59, hpc, rrze.uni-erlangen.de -dn: uid=influxUser59,ou=hpc,dc=rrze,dc=uni-erlangen,dc=de -loginShell: /bin/bash -gidNumber: 12000 -objectClass: account -objectClass: posixAccount -uid: influxUser59 -uidNumber: 10064 -gecos: Joanne Barber -cn: influxUser59 -homeDirectory: /home/hpc/influxUser59 -userPassword: influxUser59 - -# influxUser6, hpc, rrze.uni-erlangen.de -dn: uid=influxUser6,ou=hpc,dc=rrze,dc=uni-erlangen,dc=de -loginShell: /bin/bash -gidNumber: 12000 -objectClass: account -objectClass: posixAccount -uid: influxUser6 -uidNumber: 10065 -gecos: Mohamed Henderson -cn: influxUser6 -homeDirectory: /home/hpc/influxUser6 -userPassword: influxUser6 - -# influxUser60, hpc, rrze.uni-erlangen.de -dn: uid=influxUser60,ou=hpc,dc=rrze,dc=uni-erlangen,dc=de -loginShell: /bin/bash -gidNumber: 12000 -objectClass: account -objectClass: posixAccount -uid: influxUser60 -uidNumber: 10066 -gecos: Nicola James -cn: influxUser60 -homeDirectory: /home/hpc/influxUser60 -userPassword: influxUser60 - -# influxUser61, hpc, rrze.uni-erlangen.de -dn: uid=influxUser61,ou=hpc,dc=rrze,dc=uni-erlangen,dc=de -loginShell: /bin/bash -gidNumber: 12000 
-objectClass: account -objectClass: posixAccount -uid: influxUser61 -uidNumber: 10067 -gecos: Graham Cartwright -cn: influxUser61 -homeDirectory: /home/hpc/influxUser61 -userPassword: influxUser61 - -# influxUser62, hpc, rrze.uni-erlangen.de -dn: uid=influxUser62,ou=hpc,dc=rrze,dc=uni-erlangen,dc=de -loginShell: /bin/bash -gidNumber: 12000 -objectClass: account -objectClass: posixAccount -uid: influxUser62 -uidNumber: 10068 -gecos: Kirsty George -cn: influxUser62 -homeDirectory: /home/hpc/influxUser62 -userPassword: influxUser62 - -# influxUser63, hpc, rrze.uni-erlangen.de -dn: uid=influxUser63,ou=hpc,dc=rrze,dc=uni-erlangen,dc=de -loginShell: /bin/bash -gidNumber: 12000 -objectClass: account -objectClass: posixAccount -uid: influxUser63 -uidNumber: 10069 -gecos: Kelly Singh -cn: influxUser63 -homeDirectory: /home/hpc/influxUser63 -userPassword: influxUser63 - -# influxUser7, hpc, rrze.uni-erlangen.de -dn: uid=influxUser7,ou=hpc,dc=rrze,dc=uni-erlangen,dc=de -loginShell: /bin/bash -gidNumber: 12000 -objectClass: account -objectClass: posixAccount -uid: influxUser7 -uidNumber: 10070 -gecos: Rebecca Miles -cn: influxUser7 -homeDirectory: /home/hpc/influxUser7 -userPassword: influxUser7 - -# influxUser8, hpc, rrze.uni-erlangen.de -dn: uid=influxUser8,ou=hpc,dc=rrze,dc=uni-erlangen,dc=de -loginShell: /bin/bash -gidNumber: 12000 -objectClass: account -objectClass: posixAccount -uid: influxUser8 -uidNumber: 10071 -gecos: Katy Higgins -cn: influxUser8 -homeDirectory: /home/hpc/influxUser8 -userPassword: influxUser8 - -# influxUser9, hpc, rrze.uni-erlangen.de -dn: uid=influxUser9,ou=hpc,dc=rrze,dc=uni-erlangen,dc=de -loginShell: /bin/bash -gidNumber: 12000 -objectClass: account -objectClass: posixAccount -uid: influxUser9 -uidNumber: 10072 -gecos: Aimee Hill -cn: influxUser9 -homeDirectory: /home/hpc/influxUser9 -userPassword: influxUser9 - -# woodyUser1, hpc, rrze.uni-erlangen.de -dn: uid=woodyUser1,ou=hpc,dc=rrze,dc=uni-erlangen,dc=de -loginShell: /bin/bash -gidNumber: 
12000 -objectClass: account -objectClass: posixAccount -uid: woodyUser1 -uidNumber: 10073 -gecos: Jay Gordon -cn: woodyUser1 -homeDirectory: /home/hpc/woodyUser1 -userPassword: woodyUser1 - -# woodyUser2, hpc, rrze.uni-erlangen.de -dn: uid=woodyUser2,ou=hpc,dc=rrze,dc=uni-erlangen,dc=de -loginShell: /bin/bash -gidNumber: 12000 -objectClass: account -objectClass: posixAccount -uid: woodyUser2 -uidNumber: 10074 -gecos: Donna Kirby -cn: woodyUser2 -homeDirectory: /home/hpc/woodyUser2 -userPassword: woodyUser2 - -# woodyUser3, hpc, rrze.uni-erlangen.de -dn: uid=woodyUser3,ou=hpc,dc=rrze,dc=uni-erlangen,dc=de -loginShell: /bin/bash -gidNumber: 12000 -objectClass: account -objectClass: posixAccount -uid: woodyUser3 -uidNumber: 10075 -gecos: Marion Bevan -cn: woodyUser3 -homeDirectory: /home/hpc/woodyUser3 -userPassword: woodyUser3 - -# woodyUser4, hpc, rrze.uni-erlangen.de -dn: uid=woodyUser4,ou=hpc,dc=rrze,dc=uni-erlangen,dc=de -loginShell: /bin/bash -gidNumber: 12000 -objectClass: account -objectClass: posixAccount -uid: woodyUser4 -uidNumber: 10076 -gecos: Amber Harvey -cn: woodyUser4 -homeDirectory: /home/hpc/woodyUser4 -userPassword: woodyUser4 - -# woodyUser5, hpc, rrze.uni-erlangen.de -dn: uid=woodyUser5,ou=hpc,dc=rrze,dc=uni-erlangen,dc=de -loginShell: /bin/bash -gidNumber: 12000 -objectClass: account -objectClass: posixAccount -uid: woodyUser5 -uidNumber: 10077 -gecos: Ryan Hughes -cn: woodyUser5 -homeDirectory: /home/hpc/woodyUser5 -userPassword: woodyUser5 - diff --git a/data/mariadb/slurm.cnf b/data/mariadb/slurm.cnf deleted file mode 100644 index 512356a..0000000 --- a/data/mariadb/slurm.cnf +++ /dev/null @@ -1,5 +0,0 @@ -[mysqld] -innodb_buffer_pool_size=4096M -innodb_log_file_size=64M -innodb_lock_wait_timeout=900 -max_allowed_packet=16M diff --git a/data/slurm/home/config/cgroup.conf b/data/slurm/home/config/cgroup.conf deleted file mode 100644 index 728b80b..0000000 --- a/data/slurm/home/config/cgroup.conf +++ /dev/null @@ -1,4 +0,0 @@ 
-ConstrainCores=yes -ConstrainDevices=no -ConstrainRAMSpace=yes -ConstrainSwapSpace=yes diff --git a/data/slurm/home/config/slurm.conf b/data/slurm/home/config/slurm.conf deleted file mode 100644 index caa130b..0000000 --- a/data/slurm/home/config/slurm.conf +++ /dev/null @@ -1,48 +0,0 @@ -# slurm.conf file generated by configurator.html. -# Put this file on all nodes of your cluster. -# See the slurm.conf man page for more information. -# -ClusterName=snowflake -SlurmctldHost=slurmctld -SlurmUser=slurm -SlurmctldPort=6817 -SlurmdPort=6818 -MpiDefault=none -ProctrackType=proctrack/linuxproc -ReturnToService=1 -SlurmctldPidFile=/var/run/slurmctld.pid -SlurmdPidFile=/var/run/slurmd.pid -SlurmdSpoolDir=/var/spool/slurm/d -StateSaveLocation=/var/spool/slurm/ctld -SwitchType=switch/none -TaskPlugin=task/affinity -# -# TIMERS -InactiveLimit=0 -KillWait=30 -MinJobAge=300 -SlurmctldTimeout=120 -SlurmdTimeout=300 -Waittime=0 -# -# SCHEDULING -SchedulerType=sched/backfill -SelectType=select/cons_tres -# -# LOGGING AND ACCOUNTING -AccountingStorageHost=slurmdb -AccountingStoragePort=6819 -AccountingStorageType=accounting_storage/slurmdbd -AccountingStorageUser=slurm -AccountingStoreFlags=job_script,job_comment,job_env,job_extra -JobCompType=jobcomp/none -JobAcctGatherFrequency=30 -JobAcctGatherType=jobacct_gather/linux -SlurmctldDebug=info -SlurmctldLogFile=/var/log/slurmctld.log -SlurmdDebug=info -SlurmdLogFile=/var/log/slurmd.log -# -# COMPUTE NODES -NodeName=node0[1-2] CPUs=1 State=UNKNOWN -PartitionName=main Nodes=ALL Default=YES MaxTime=INFINITE State=UP diff --git a/data/slurm/home/config/slurmdbd.conf b/data/slurm/home/config/slurmdbd.conf deleted file mode 100644 index 6ee97ca..0000000 --- a/data/slurm/home/config/slurmdbd.conf +++ /dev/null @@ -1,31 +0,0 @@ -# Archive info -#ArchiveJobs=yes -#ArchiveDir="/tmp" -#ArchiveSteps=yes -#ArchiveScript= -#JobPurge=12 -#StepPurge=1 -# -# Authentication info -AuthType=auth/munge -AuthInfo=/var/run/munge/munge.socket.2 -# -# 
slurmDBD info -DbdAddr=slurmdb -DbdHost=slurmdb -DbdPort=6819 -SlurmUser=slurm -DebugLevel=4 -LogFile=/var/log/slurm/slurmdbd.log -PidFile=/var/run/slurmdbd.pid -#PluginDir=/usr/lib/slurm -#PrivateData=accounts,users,usage,jobs -#TrackWCKey=yes -# -# Database info -StorageType=accounting_storage/mysql -StorageHost=mariadb -StoragePort=3306 -StoragePass=demo -StorageUser=slurm -StorageLoc=slurm_acct_db diff --git a/docker-compose.yml b/docker-compose.yml index 345f60d..8c4d697 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -81,6 +81,7 @@ services: # - SYS_NICE slurm-controller: + image: cc-docker:22.05 container_name: slurmctld hostname: slurmctld build: diff --git a/migrateTimestamps.pl b/migrateTimestamps.pl index 5699c80..0ffa221 100755 --- a/migrateTimestamps.pl +++ b/migrateTimestamps.pl @@ -9,7 +9,6 @@ use File::Slurp; use Data::Dumper; use Time::Piece; use Sort::Versions; -use REST::Client; ### JOB-ARCHIVE my $localtime = localtime; @@ -19,80 +18,80 @@ my $archiveSrc = './data/job-archive-source'; my @ArchiveClusters; # Get clusters by job-archive/$subfolder -opendir my $dh, $archiveSrc or die "can't open directory: $!"; -while ( readdir $dh ) { - chomp; next if $_ eq '.' or $_ eq '..' or $_ eq 'job-archive'; +# opendir my $dh, $archiveSrc or die "can't open directory: $!"; +# while ( readdir $dh ) { +# chomp; next if $_ eq '.' or $_ eq '..' or $_ eq 'job-archive' or $_ eq 'version.txt'; - my $cluster = $_; - push @ArchiveClusters, $cluster; -} +# my $cluster = $_; +# push @ArchiveClusters, $cluster; +# } -# start for jobarchive -foreach my $cluster ( @ArchiveClusters ) { - print "Starting to update start- and stoptimes in job-archive for $cluster\n"; +# # start for jobarchive +# foreach my $cluster ( @ArchiveClusters ) { +# print "Starting to update start- and stoptimes in job-archive for $cluster\n"; - opendir my $dhLevel1, "$archiveSrc/$cluster" or die "can't open directory: $!"; - while ( readdir $dhLevel1 ) { - chomp; next if $_ eq '.' 
or $_ eq '..'; - my $level1 = $_; +# opendir my $dhLevel1, "$archiveSrc/$cluster" or die "can't open directory: $!"; +# while ( readdir $dhLevel1 ) { +# chomp; next if $_ eq '.' or $_ eq '..'; +# my $level1 = $_; - if ( -d "$archiveSrc/$cluster/$level1" ) { - opendir my $dhLevel2, "$archiveSrc/$cluster/$level1" or die "can't open directory: $!"; - while ( readdir $dhLevel2 ) { - chomp; next if $_ eq '.' or $_ eq '..'; - my $level2 = $_; - my $jobSource = "$archiveSrc/$cluster/$level1/$level2"; - my $jobTarget = "$archiveTarget/$cluster/$level1/$level2/"; - my $jobOrigin = $jobSource; - # check if files are directly accessible (old format) else get subfolders as file and update path - if ( ! -e "$jobSource/meta.json") { - my @folders = read_dir($jobSource); - if (!@folders) { - next; - } - # Only use first subfolder for now TODO - $jobSource = "$jobSource/".$folders[0]; - } - # check if subfolder contains file, else remove source and skip - if ( ! -e "$jobSource/meta.json") { - # rmtree $jobOrigin; - next; - } +# if ( -d "$archiveSrc/$cluster/$level1" ) { +# opendir my $dhLevel2, "$archiveSrc/$cluster/$level1" or die "can't open directory: $!"; +# while ( readdir $dhLevel2 ) { +# chomp; next if $_ eq '.' or $_ eq '..'; +# my $level2 = $_; +# my $jobSource = "$archiveSrc/$cluster/$level1/$level2"; +# my $jobTarget = "$archiveTarget/$cluster/$level1/$level2/"; +# my $jobOrigin = $jobSource; +# # check if files are directly accessible (old format) else get subfolders as file and update path +# if ( ! -e "$jobSource/meta.json") { +# my @folders = read_dir($jobSource); +# if (!@folders) { +# next; +# } +# # Only use first subfolder for now TODO +# $jobSource = "$jobSource/".$folders[0]; +# } +# # check if subfolder contains file, else remove source and skip +# if ( ! 
-e "$jobSource/meta.json") { +# # rmtree $jobOrigin; +# next; +# } - my $rawstr = read_file("$jobSource/meta.json"); - my $json = decode_json($rawstr); +# my $rawstr = read_file("$jobSource/meta.json"); +# my $json = decode_json($rawstr); - # NOTE Start meta.json iteration here - # my $random_number = int(rand(UPPERLIMIT)) + LOWERLIMIT; - # Set new startTime: Between 5 days and 1 day before now +# # NOTE Start meta.json iteration here +# # my $random_number = int(rand(UPPERLIMIT)) + LOWERLIMIT; +# # Set new startTime: Between 5 days and 1 day before now - # Remove id from attributes - $json->{startTime} = $epochtime - (int(rand(432000)) + 86400); - $json->{stopTime} = $json->{startTime} + $json->{duration}; +# # Remove id from attributes +# $json->{startTime} = $epochtime - (int(rand(432000)) + 86400); +# $json->{stopTime} = $json->{startTime} + $json->{duration}; - # Add starttime subfolder to target path - $jobTarget .= $json->{startTime}; +# # Add starttime subfolder to target path +# $jobTarget .= $json->{startTime}; - # target is not directory - if ( not -d $jobTarget ){ - # print "Writing files\n"; - # print "$cluster/$level1/$level2\n"; - make_path($jobTarget); +# # target is not directory +# if ( not -d $jobTarget ){ +# # print "Writing files\n"; +# # print "$cluster/$level1/$level2\n"; +# make_path($jobTarget); - my $outstr = encode_json($json); - write_file("$jobTarget/meta.json", $outstr); +# my $outstr = encode_json($json); +# write_file("$jobTarget/meta.json", $outstr); - my $datstr = read_file("$jobSource/data.json"); - write_file("$jobTarget/data.json", $datstr); - } else { - # rmtree $jobSource; - } - } - } - } -} -print "Done for job-archive\n"; -sleep(1); +# my $datstr = read_file("$jobSource/data.json.gz"); +# write_file("$jobTarget/data.json.gz", $datstr); +# } else { +# # rmtree $jobSource; +# } +# } +# } +# } +# } +# print "Done for job-archive\n"; +# sleep(1); ## CHECKPOINTS chomp(my $checkpointStart=`date --date 'TZ="Europe/Berlin" 0:00 7 
days ago' +%s`); diff --git a/setupDev.sh b/setupDev.sh index 90aa011..6146351 100755 --- a/setupDev.sh +++ b/setupDev.sh @@ -10,6 +10,7 @@ else if [ ! -d var ]; then mkdir var touch var/job.db + make else echo "'cc-backend/var' exists. Cautiously exiting." echo -n "Stopped." @@ -17,13 +18,13 @@ else fi fi - # Download unedited job-archive to ./data/job-archive-source if [ ! -d data/job-archive-source ]; then - cd data wget https://hpc-mover.rrze.uni-erlangen.de/HPC-Data/0x7b58aefb/eig7ahyo6fo2bais0ephuf2aitohv1ai/job-archive-demo.tar tar xf job-archive-demo.tar - mv ./job-archive ./job-archive-source + # mv ./var/job-archive ./job-archive-source + # mv -f ./var/job.db ./cc-backend/var/ + # rm -rf ./var rm ./job-archive-demo.tar cd .. else @@ -32,12 +33,12 @@ fi # Download unedited checkpoint files to ./data/cc-metric-store-source/checkpoints if [ ! -d data/cc-metric-store-source ]; then - mkdir -p data/cc-metric-store-source/checkpoints - cd data/cc-metric-store-source/checkpoints - wget https://hpc-mover.rrze.uni-erlangen.de/HPC-Data/0x7b58aefb/eig7ahyo6fo2bais0ephuf2aitohv1ai/cc-metric-store-checkpoints.tar.xz - tar xf cc-metric-store-checkpoints.tar.xz - rm cc-metric-store-checkpoints.tar.xz - cd ../../../ + mkdir -p data/cc-metric-store-source/checkpoints + cd data/cc-metric-store-source/checkpoints + wget https://hpc-mover.rrze.uni-erlangen.de/HPC-Data/0x7b58aefb/eig7ahyo6fo2bais0ephuf2aitohv1ai/cc-metric-store-checkpoints.tar.xz + tar xf cc-metric-store-checkpoints.tar.xz + rm cc-metric-store-checkpoints.tar.xz + cd ../../../ else echo "'data/cc-metric-store-source' already exists!" fi @@ -52,7 +53,7 @@ fi # cleanup sources # rm -r ./data/job-archive-source -# rm -r ./data/cc-metric-store-source +rm -r ./data/cc-metric-store-source # prepare folders for influxdb2 if [ ! -d data/influxdb ]; then @@ -67,7 +68,7 @@ if [ ! -d .env ]; then cp templates/env.default ./.env fi -if [ ! -d docker-compose.yml ]; then +if [ ! 
-f docker-compose.yml ]; then cp templates/docker-compose.yml.default ./docker-compose.yml fi @@ -75,8 +76,8 @@ docker-compose build ./cc-backend/cc-backend --init-db --add-user demo:admin:AdminDev docker-compose up -d -echo "" -echo "Setup complete, containers are up by default: Shut down with 'docker-compose down'." -echo "Use './cc-backend/cc-backend' to start cc-backend." -echo "Use scripts in /scripts to load data into influx or mariadb." -# ./cc-backend/cc-backend +# echo "" +# echo "Setup complete, containers are up by default: Shut down with 'docker-compose down'." +# echo "Use './cc-backend/cc-backend' to start cc-backend." +# echo "Use scripts in /scripts to load data into influx or mariadb." +# # ./cc-backend/cc-backend diff --git a/slurm/base/Dockerfile b/slurm/base/Dockerfile index a006cc2..61c9e3e 100644 --- a/slurm/base/Dockerfile +++ b/slurm/base/Dockerfile @@ -1,5 +1,5 @@ FROM rockylinux:8 -MAINTAINER Jan Eitzinger +LABEL org.opencontainers.image.authors="jan.eitzinger@fau.de" ENV SLURM_VERSION=22.05.6 ENV ARCH=aarch64 diff --git a/slurm/controller/Dockerfile b/slurm/controller/Dockerfile index b627826..b236b5b 100644 --- a/slurm/controller/Dockerfile +++ b/slurm/controller/Dockerfile @@ -1,5 +1,5 @@ FROM clustercockpit/slurm.base:22.05.6 -MAINTAINER Jan Eitzinger +LABEL org.opencontainers.image.authors="jan.eitzinger@fau.de" # clean up RUN rm -f /root/rpmbuild/RPMS/slurm-*.rpm \ diff --git a/slurm/database/Dockerfile b/slurm/database/Dockerfile index b627826..b236b5b 100644 --- a/slurm/database/Dockerfile +++ b/slurm/database/Dockerfile @@ -1,5 +1,5 @@ FROM clustercockpit/slurm.base:22.05.6 -MAINTAINER Jan Eitzinger +LABEL org.opencontainers.image.authors="jan.eitzinger@fau.de" # clean up RUN rm -f /root/rpmbuild/RPMS/slurm-*.rpm \ diff --git a/slurm/rest/Dockerfile b/slurm/rest/Dockerfile index b627826..b236b5b 100644 --- a/slurm/rest/Dockerfile +++ b/slurm/rest/Dockerfile @@ -1,5 +1,5 @@ FROM clustercockpit/slurm.base:22.05.6 -MAINTAINER Jan 
Eitzinger +LABEL org.opencontainers.image.authors="jan.eitzinger@fau.de" # clean up RUN rm -f /root/rpmbuild/RPMS/slurm-*.rpm \ diff --git a/slurm/worker/Dockerfile b/slurm/worker/Dockerfile index b615be5..2fb1c11 100644 --- a/slurm/worker/Dockerfile +++ b/slurm/worker/Dockerfile @@ -1,5 +1,5 @@ FROM clustercockpit/slurm.base:22.05.6 -MAINTAINER Jan Eitzinger +LABEL org.opencontainers.image.authors="jan.eitzinger@fau.de" # clean up RUN rm -f /root/rpmbuild/RPMS/slurm-*.rpm \ From e52321247a41057424a261625680f8c51d70e7c9 Mon Sep 17 00:00:00 2001 From: Aditya Ujeniya Date: Thu, 10 Oct 2024 22:09:01 +0200 Subject: [PATCH 02/25] Adapting docker files to generic arch --- docker-compose.yml | 2 +- setupDev.sh | 47 +++++++++++++-------------- slurm/base/Dockerfile | 39 ++++++++++------------ slurm/controller/docker-entrypoint.sh | 17 ++++++---- slurm/database/docker-entrypoint.sh | 11 ++++--- slurm/rest/docker-entrypoint.sh | 17 ++++++---- slurm/worker/docker-entrypoint.sh | 13 +++++--- 7 files changed, 76 insertions(+), 70 deletions(-) diff --git a/docker-compose.yml b/docker-compose.yml index 8c4d697..851ecb4 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -81,7 +81,7 @@ services: # - SYS_NICE slurm-controller: - image: cc-docker:22.05 + image: clustercockpit:22.05.6 container_name: slurmctld hostname: slurmctld build: diff --git a/setupDev.sh b/setupDev.sh index 6146351..b22b2a4 100755 --- a/setupDev.sh +++ b/setupDev.sh @@ -8,28 +8,22 @@ if [ ! -d cc-backend ]; then else cd cc-backend if [ ! -d var ]; then - mkdir var - touch var/job.db + wget https://hpc-mover.rrze.uni-erlangen.de/HPC-Data/0x7b58aefb/eig7ahyo6fo2bais0ephuf2aitohv1ai/job-archive-demo.tar + tar xf job-archive-demo.tar + rm ./job-archive-demo.tar make + ./cc-backend -migrate-db + ./cc-backend --init-db --add-user demo:admin:AdminDev + cd .. else - echo "'cc-backend/var' exists. Cautiously exiting." - echo -n "Stopped." - exit + cd .. + # echo "'cc-backend/var' exists. Cautiously exiting." 
+ # echo -n "Stopped." + # exit fi fi -# Download unedited job-archive to ./data/job-archive-source -if [ ! -d data/job-archive-source ]; then - wget https://hpc-mover.rrze.uni-erlangen.de/HPC-Data/0x7b58aefb/eig7ahyo6fo2bais0ephuf2aitohv1ai/job-archive-demo.tar - tar xf job-archive-demo.tar - # mv ./var/job-archive ./job-archive-source - # mv -f ./var/job.db ./cc-backend/var/ - # rm -rf ./var - rm ./job-archive-demo.tar - cd .. -else - echo "'data/job-archive-source' already exists!" -fi +ls # Download unedited checkpoint files to ./data/cc-metric-store-source/checkpoints if [ ! -d data/cc-metric-store-source ]; then @@ -53,7 +47,7 @@ fi # cleanup sources # rm -r ./data/job-archive-source -rm -r ./data/cc-metric-store-source +# rm -r ./data/cc-metric-store-source # prepare folders for influxdb2 if [ ! -d data/influxdb ]; then @@ -72,12 +66,17 @@ if [ ! -f docker-compose.yml ]; then cp templates/docker-compose.yml.default ./docker-compose.yml fi +docker-compose down + +cd slurm/base/ +make +cd ../.. + docker-compose build -./cc-backend/cc-backend --init-db --add-user demo:admin:AdminDev docker-compose up -d -# echo "" -# echo "Setup complete, containers are up by default: Shut down with 'docker-compose down'." -# echo "Use './cc-backend/cc-backend' to start cc-backend." -# echo "Use scripts in /scripts to load data into influx or mariadb." -# # ./cc-backend/cc-backend +echo "" +echo "Setup complete, containers are up by default: Shut down with 'docker-compose down'." +echo "Use './cc-backend/cc-backend' to start cc-backend." +echo "Use scripts in /scripts to load data into influx or mariadb." 
+# ./cc-backend/cc-backend diff --git a/slurm/base/Dockerfile b/slurm/base/Dockerfile index 61c9e3e..5f19b55 100644 --- a/slurm/base/Dockerfile +++ b/slurm/base/Dockerfile @@ -2,9 +2,8 @@ FROM rockylinux:8 LABEL org.opencontainers.image.authors="jan.eitzinger@fau.de" ENV SLURM_VERSION=22.05.6 -ENV ARCH=aarch64 -RUN yum install https://dl.fedoraproject.org/pub/epel/epel-release-latest-8.noarch.rpm -y +RUN yum install -y https://dl.fedoraproject.org/pub/epel/epel-release-latest-8.noarch.rpm RUN groupadd -g 981 munge \ && useradd -m -c "MUNGE Uid 'N' Gid Emporium" -d /var/lib/munge -u 981 -g munge -s /sbin/nologin munge \ @@ -13,29 +12,25 @@ RUN groupadd -g 981 munge \ && groupadd -g 1000 worker \ && useradd -m -c "Workflow user" -d /home/worker -u 1000 -g worker -s /bin/bash worker -RUN yum install -y munge munge-libs -RUN dnf --enablerepo=powertools install munge-devel -y -RUN yum install rng-tools -y +RUN yum install -y munge munge-libs rng-tools \ + python3 gcc openssl openssl-devel \ + openssh-server openssh-clients dbus-devel \ + pam-devel numactl numactl-devel hwloc sudo \ + lua readline-devel ncurses-devel man2html \ + libibmad libibumad rpm-build perl-ExtUtils-MakeMaker.noarch rpm-build make wget -RUN yum install -y python3 gcc openssl openssl-devel \ -openssh-server openssh-clients dbus-devel \ -pam-devel numactl numactl-devel hwloc sudo \ -lua readline-devel ncurses-devel man2html \ -libibmad libibumad rpm-build perl-ExtUtils-MakeMaker.noarch rpm-build make wget +RUN dnf --enablerepo=powertools install -y munge-devel rrdtool-devel lua-devel hwloc-devel mariadb-server mariadb-devel -RUN dnf --enablerepo=powertools install rrdtool-devel lua-devel hwloc-devel rpm-build -y -RUN dnf install mariadb-server mariadb-devel -y -RUN mkdir /usr/local/slurm-tmp -RUN cd /usr/local/slurm-tmp -RUN wget https://download.schedmd.com/slurm/slurm-${SLURM_VERSION}.tar.bz2 -RUN rpmbuild -ta slurm-${SLURM_VERSION}.tar.bz2 +RUN mkdir -p /usr/local/slurm-tmp \ + && cd 
/usr/local/slurm-tmp \ + && wget https://download.schedmd.com/slurm/slurm-${SLURM_VERSION}.tar.bz2 \ + && rpmbuild -ta slurm-${SLURM_VERSION}.tar.bz2 -WORKDIR /root/rpmbuild/RPMS/${ARCH} -RUN yum -y --nogpgcheck localinstall \ - slurm-${SLURM_VERSION}-1.el8.${ARCH}.rpm \ - slurm-perlapi-${SLURM_VERSION}-1.el8.${ARCH}.rpm \ - slurm-slurmctld-${SLURM_VERSION}-1.el8.${ARCH}.rpm -WORKDIR / +RUN ARCH=$(uname -m) \ + && yum -y --nogpgcheck localinstall \ + /root/rpmbuild/RPMS/$ARCH/slurm-${SLURM_VERSION}-1.el8.$ARCH.rpm \ + /root/rpmbuild/RPMS/$ARCH/slurm-perlapi-${SLURM_VERSION}-1.el8.$ARCH.rpm \ + /root/rpmbuild/RPMS/$ARCH/slurm-slurmctld-${SLURM_VERSION}-1.el8.$ARCH.rpm VOLUME ["/home", "/.secret"] # 22: SSH diff --git a/slurm/controller/docker-entrypoint.sh b/slurm/controller/docker-entrypoint.sh index 75e36db..1d59ee9 100755 --- a/slurm/controller/docker-entrypoint.sh +++ b/slurm/controller/docker-entrypoint.sh @@ -1,6 +1,9 @@ #!/usr/bin/env bash set -e +# Determine the system architecture dynamically +ARCH=$(uname -m) + # start sshd server _sshd_host() { if [ ! -d /var/run/sshd ]; then @@ -70,12 +73,12 @@ _copy_secrets() { # run slurmctld _slurmctld() { - cd /root/rpmbuild/RPMS/aarch64 - yum -y --nogpgcheck localinstall slurm-22.05.6-1.el8.aarch64.rpm \ - slurm-perlapi-22.05.6-1.el8.aarch64.rpm \ - slurm-slurmd-22.05.6-1.el8.aarch64.rpm \ - slurm-torque-22.05.6-1.el8.aarch64.rpm \ - slurm-slurmctld-22.05.6-1.el8.aarch64.rpm + cd /root/rpmbuild/RPMS/$ARCH + yum -y --nogpgcheck localinstall slurm-22.05.6-1.el8.$ARCH.rpm \ + slurm-perlapi-22.05.6-1.el8.$ARCH.rpm \ + slurm-slurmd-22.05.6-1.el8.$ARCH.rpm \ + slurm-torque-22.05.6-1.el8.$ARCH.rpm \ + slurm-slurmctld-22.05.6-1.el8.$ARCH.rpm echo "checking for slurmdbd.conf" while [ ! -f /.secret/slurmdbd.conf ]; do echo -n "." 
@@ -109,4 +112,4 @@ _munge_start _copy_secrets _slurmctld -tail -f /dev/null +tail -f /dev/null \ No newline at end of file diff --git a/slurm/database/docker-entrypoint.sh b/slurm/database/docker-entrypoint.sh index 97aff4e..4275384 100755 --- a/slurm/database/docker-entrypoint.sh +++ b/slurm/database/docker-entrypoint.sh @@ -1,6 +1,9 @@ #!/usr/bin/env bash set -e +# Determine the system architecture dynamically +ARCH=$(uname -m) + SLURM_ACCT_DB_SQL=/slurm_acct_db.sql # start sshd server @@ -48,10 +51,10 @@ _wait_for_worker() { # run slurmdbd _slurmdbd() { - cd /root/rpmbuild/RPMS/aarch64 - yum -y --nogpgcheck localinstall slurm-22.05.6-1.el8.aarch64.rpm \ - slurm-perlapi-22.05.6-1.el8.aarch64.rpm \ - slurm-slurmdbd-22.05.6-1.el8.aarch64.rpm + cd /root/rpmbuild/RPMS/$ARCH + yum -y --nogpgcheck localinstall slurm-22.05.6-1.el8.$ARCH.rpm \ + slurm-perlapi-22.05.6-1.el8.$ARCH.rpm \ + slurm-slurmdbd-22.05.6-1.el8.$ARCH.rpm mkdir -p /var/spool/slurm/d /var/log/slurm /etc/slurm chown slurm: /var/spool/slurm/d /var/log/slurm if [[ ! -f /home/config/slurmdbd.conf ]]; then diff --git a/slurm/rest/docker-entrypoint.sh b/slurm/rest/docker-entrypoint.sh index 6ef6bcb..549b92c 100755 --- a/slurm/rest/docker-entrypoint.sh +++ b/slurm/rest/docker-entrypoint.sh @@ -1,6 +1,9 @@ #!/usr/bin/env bash set -e +# Determine the system architecture dynamically +ARCH=$(uname -m) + # start sshd server _sshd_host() { if [ ! 
-d /var/run/sshd ]; then @@ -68,13 +71,13 @@ _copy_secrets() { # run slurmctld _slurmctld() { - cd /root/rpmbuild/RPMS/aarch64 - yum -y --nogpgcheck localinstall slurm-22.05.6-1.el8.aarch64.rpm \ - slurm-perlapi-22.05.6-1.el8.aarch64.rpm \ - slurm-slurmd-22.05.6-1.el8.aarch64.rpm \ - slurm-torque-22.05.6-1.el8.aarch64.rpm \ - slurm-slurmctld-22.05.6-1.el8.aarch64.rpm \ - slurm-slurmrestd-22.05.6-1.el8.aarch64.rpm + cd /root/rpmbuild/RPMS/$ARCH + yum -y --nogpgcheck localinstall slurm-22.05.6-1.el8.$ARCH.rpm \ + slurm-perlapi-22.05.6-1.el8.$ARCH.rpm \ + slurm-slurmd-22.05.6-1.el8.$ARCH.rpm \ + slurm-torque-22.05.6-1.el8.$ARCH.rpm \ + slurm-slurmctld-22.05.6-1.el8.$ARCH.rpm \ + slurm-slurmrestd-22.05.6-1.el8.$ARCH.rpm echo -n "checking for slurmdbd.conf" while [ ! -f /.secret/slurmdbd.conf ]; do echo -n "." diff --git a/slurm/worker/docker-entrypoint.sh b/slurm/worker/docker-entrypoint.sh index 12ecf3e..ab4f512 100755 --- a/slurm/worker/docker-entrypoint.sh +++ b/slurm/worker/docker-entrypoint.sh @@ -1,6 +1,9 @@ #!/usr/bin/env bash set -e +# Determine the system architecture dynamically +ARCH=$(uname -m) + # start sshd server _sshd_host() { if [ ! -d /var/run/sshd ]; then @@ -50,11 +53,11 @@ _start_dbus() { # run slurmd _slurmd() { - cd /root/rpmbuild/RPMS/aarch64 - yum -y --nogpgcheck localinstall slurm-22.05.6-1.el8.aarch64.rpm \ - slurm-perlapi-22.05.6-1.el8.aarch64.rpm \ - slurm-slurmd-22.05.6-1.el8.aarch64.rpm \ - slurm-torque-22.05.6-1.el8.aarch64.rpm + cd /root/rpmbuild/RPMS/$ARCH + yum -y --nogpgcheck localinstall slurm-22.05.6-1.el8.$ARCH.rpm \ + slurm-perlapi-22.05.6-1.el8.$ARCH.rpm \ + slurm-slurmd-22.05.6-1.el8.$ARCH.rpm \ + slurm-torque-22.05.6-1.el8.$ARCH.rpm if [ ! -f /.secret/slurm.conf ]; then echo -n "checking for slurm.conf" while [ ! 
-f /.secret/slurm.conf ]; do From 4be4456428c7baabde6efa65cd30f9b963830ea8 Mon Sep 17 00:00:00 2001 From: Aditya Ujeniya Date: Thu, 10 Oct 2024 23:56:44 +0200 Subject: [PATCH 03/25] Fix to influxdb service --- docker-compose.yml | 4 ++-- setupDev.sh | 11 +++++++---- 2 files changed, 9 insertions(+), 6 deletions(-) diff --git a/docker-compose.yml b/docker-compose.yml index 851ecb4..2947d8d 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -19,8 +19,8 @@ services: influxdb: container_name: influxdb - image: influxdb - command: ["--reporting-disabled"] + image: influxdb:latest + command: ["--reporting-disabled", "--log-level=debug"] environment: DOCKER_INFLUXDB_INIT_MODE: setup DOCKER_INFLUXDB_INIT_USERNAME: devel diff --git a/setupDev.sh b/setupDev.sh index b22b2a4..14e9d81 100755 --- a/setupDev.sh +++ b/setupDev.sh @@ -7,11 +7,16 @@ if [ ! -d cc-backend ]; then exit else cd cc-backend + make + if [ ! -d var ]; then wget https://hpc-mover.rrze.uni-erlangen.de/HPC-Data/0x7b58aefb/eig7ahyo6fo2bais0ephuf2aitohv1ai/job-archive-demo.tar tar xf job-archive-demo.tar rm ./job-archive-demo.tar - make + + cp ./configs/env-template.txt .env + cp ./configs/config-demo.json config.json + ./cc-backend -migrate-db ./cc-backend --init-db --add-user demo:admin:AdminDev cd .. @@ -23,8 +28,6 @@ else fi fi -ls - # Download unedited checkpoint files to ./data/cc-metric-store-source/checkpoints if [ ! -d data/cc-metric-store-source ]; then mkdir -p data/cc-metric-store-source/checkpoints @@ -52,7 +55,7 @@ fi # prepare folders for influxdb2 if [ ! -d data/influxdb ]; then mkdir -p data/influxdb/data - mkdir -p data/influxdb/config/influx-configs + mkdir -p data/influxdb/config else echo "'data/influxdb' already exists!" 
fi From cf13ee5e7eca0972bb62c3fd7b117e839cf92d3c Mon Sep 17 00:00:00 2001 From: Aditya Ujeniya Date: Fri, 11 Oct 2024 16:39:38 +0200 Subject: [PATCH 04/25] Generalizing setup --- docker-compose.yml | 41 ++++------ slurm/controller/docker-entrypoint.sh | 40 +++++----- slurm/controller/slurm.conf | 106 ++++++++++++++++++++++++++ slurm/database/docker-entrypoint.sh | 1 + slurm/database/slurmdbd.conf | 37 +++++++++ slurm/worker/docker-entrypoint.sh | 1 + 6 files changed, 182 insertions(+), 44 deletions(-) create mode 100644 slurm/controller/slurm.conf create mode 100644 slurm/database/slurmdbd.conf diff --git a/docker-compose.yml b/docker-compose.yml index 2947d8d..7620c04 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -63,58 +63,49 @@ services: cap_add: - SYS_NICE - # mysql: - # container_name: mysql - # image: mysql:8.0.22 - # command: ["--default-authentication-plugin=mysql_native_password"] - # environment: - # MYSQL_ROOT_PASSWORD: ${MYSQL_ROOT_PASSWORD} - # MYSQL_DATABASE: ${MYSQL_DATABASE} - # MYSQL_USER: ${MYSQL_USER} - # MYSQL_PASSWORD: ${MYSQL_PASSWORD} - # ports: - # - "127.0.0.1:${MYSQL_PORT}:3306" - # # volumes: - # # - ${DATADIR}/sql-init:/docker-entrypoint-initdb.d - # # - ${DATADIR}/sqldata:/var/lib/mysql - # cap_add: - # - SYS_NICE - - slurm-controller: - image: clustercockpit:22.05.6 + slurmctld: container_name: slurmctld hostname: slurmctld build: context: ./slurm/controller + depends_on: + - slurmdbd privileged: true + ports: + - "6817:6817" volumes: - ${DATADIR}/slurm/home:/home - ${DATADIR}/slurm/secret:/.secret + - ./slurm/controller/slurm.conf:/home/config/slurm.conf - slurm-database: - container_name: slurmdb - hostname: slurmdb + slurmdbd: + container_name: slurmdbd + hostname: slurmdbd build: context: ./slurm/database depends_on: - mariadb - - slurm-controller privileged: true + ports: + - "6819:6819" volumes: - ${DATADIR}/slurm/home:/home - ${DATADIR}/slurm/secret:/.secret + - 
./slurm/database/slurmdbd.conf:/home/config/slurmdbd.conf - slurm-worker01: + node01: container_name: node01 hostname: node01 build: context: ./slurm/worker depends_on: - - slurm-controller + - slurmctld privileged: true volumes: - ${DATADIR}/slurm/home:/home - ${DATADIR}/slurm/secret:/.secret + ports: + - "6818:6818" # slurm-worker02: # container_name: node02 diff --git a/slurm/controller/docker-entrypoint.sh b/slurm/controller/docker-entrypoint.sh index 1d59ee9..ce398a7 100755 --- a/slurm/controller/docker-entrypoint.sh +++ b/slurm/controller/docker-entrypoint.sh @@ -6,21 +6,21 @@ ARCH=$(uname -m) # start sshd server _sshd_host() { - if [ ! -d /var/run/sshd ]; then - mkdir /var/run/sshd - ssh-keygen -t rsa -f /etc/ssh/ssh_host_rsa_key -N '' - fi - echo "Starting sshd" - /usr/sbin/sshd + if [ ! -d /var/run/sshd ]; then + mkdir /var/run/sshd + ssh-keygen -t rsa -f /etc/ssh/ssh_host_rsa_key -N '' + fi + echo "Starting sshd" + /usr/sbin/sshd } # setup worker ssh to be passwordless _ssh_worker() { - if [[ ! -d /home/worker ]]; then + if [[ ! 
-d /home/worker ]]; then mkdir -p /home/worker chown -R worker:worker /home/worker fi - cat > /home/worker/setup-worker-ssh.sh </home/worker/setup-worker-ssh.sh < /etc/munge/munge.key" + sh -c "dd if=/dev/urandom bs=1 count=1024 > /etc/munge/munge.key" chown munge: /etc/munge/munge.key chmod 400 /etc/munge/munge.key sudo -u munge /sbin/munged @@ -64,11 +64,11 @@ _munge_start() { # copy secrets to /.secret directory for other nodes _copy_secrets() { - cp /home/worker/worker-secret.tar.gz /.secret/worker-secret.tar.gz - cp /home/worker/setup-worker-ssh.sh /.secret/setup-worker-ssh.sh - cp /etc/munge/munge.key /.secret/munge.key - rm -f /home/worker/worker-secret.tar.gz - rm -f /home/worker/setup-worker-ssh.sh + cp /home/worker/worker-secret.tar.gz /.secret/worker-secret.tar.gz + cp /home/worker/setup-worker-ssh.sh /.secret/setup-worker-ssh.sh + cp /etc/munge/munge.key /.secret/munge.key + rm -f /home/worker/worker-secret.tar.gz + rm -f /home/worker/setup-worker-ssh.sh } # run slurmctld @@ -85,8 +85,8 @@ _slurmctld() { sleep 1 done echo "" - mkdir -p /var/spool/slurm/ctld /var/spool/slurm/d /var/log/slurm /etc/slurm - chown -R slurm: /var/spool/slurm/ctld /var/spool/slurm/d /var/log/slurm + mkdir -p /var/spool/slurm/ctld /var/spool/slurm/d /var/log/slurm /etc/slurm + chown -R slurm: /var/spool/slurm/ctld /var/spool/slurm/d /var/log/slurm touch /var/log/slurmctld.log chown slurm: /var/log/slurmctld.log if [[ ! 
-f /home/config/slurm.conf ]]; then @@ -98,11 +98,13 @@ _slurmctld() { chown slurm: /etc/slurm/slurm.conf chmod 600 /etc/slurm/slurm.conf fi + sacctmgr -i add cluster "snowflake" sleep 2s - echo "Starting slurmctld" + echo "Starting slurmctld" cp -f /etc/slurm/slurm.conf /.secret/ /usr/sbin/slurmctld + echo "Started slurmctld" } ### main ### @@ -112,4 +114,4 @@ _munge_start _copy_secrets _slurmctld -tail -f /dev/null \ No newline at end of file +tail -f /dev/null diff --git a/slurm/controller/slurm.conf b/slurm/controller/slurm.conf new file mode 100644 index 0000000..63f48f8 --- /dev/null +++ b/slurm/controller/slurm.conf @@ -0,0 +1,106 @@ +# slurm.conf +# +# See the slurm.conf man page for more information. +# +ClusterName=linux +ControlMachine=slurmctld +ControlAddr=slurmctld +#BackupController= +#BackupAddr= +# +SlurmUser=slurm +#SlurmdUser=root +SlurmctldPort=6817 +SlurmdPort=6818 +AuthType=auth/munge +#JobCredentialPrivateKey= +#JobCredentialPublicCertificate= +StateSaveLocation=/var/lib/slurmd +SlurmdSpoolDir=/var/spool/slurmd +SwitchType=switch/none +MpiDefault=none +SlurmctldPidFile=/var/run/slurmd/slurmctld.pid +SlurmdPidFile=/var/run/slurmd/slurmd.pid +ProctrackType=proctrack/linuxproc +#PluginDir= +#CacheGroups=0 +#FirstJobId= +ReturnToService=0 +#MaxJobCount= +#PlugStackConfig= +#PropagatePrioProcess= +#PropagateResourceLimits= +#PropagateResourceLimitsExcept= +#Prolog= +#Epilog= +#SrunProlog= +#SrunEpilog= +#TaskProlog= +#TaskEpilog= +TaskPlugin=task/none +#TrackWCKey=no +#TreeWidth=50 +#TmpFS= +#UsePAM= +# +# TIMERS +SlurmctldTimeout=300 +SlurmdTimeout=300 +InactiveLimit=0 +MinJobAge=300 +KillWait=30 +Waittime=0 +# +# SCHEDULING +SchedulerType=sched/backfill +#SchedulerAuth= +#SchedulerPort= +#SchedulerRootFilter= +SelectType=select/cons_res +SelectTypeParameters=CR_CPU_Memory +FastSchedule=1 +#PriorityType=priority/multifactor +#PriorityDecayHalfLife=14-0 +#PriorityUsageResetPeriod=14-0 +#PriorityWeightFairshare=100000 +#PriorityWeightAge=1000 
+#PriorityWeightPartition=10000 +#PriorityWeightJobSize=1000 +#PriorityMaxAge=1-0 +# +# LOGGING +SlurmctldDebug=3 +SlurmctldLogFile=/var/log/slurm/slurmctld.log +SlurmdDebug=3 +SlurmdLogFile=/var/log/slurm/slurmd.log +JobCompType=jobcomp/filetxt +JobCompLoc=/var/log/slurm/jobcomp.log +# +# ACCOUNTING +#JobAcctGatherType=jobacct_gather/linux +JobAcctGatherType=jobacct_gather/cgroup +ProctrackType=proctrack/cgroup + +JobAcctGatherFrequency=30 +# +AccountingStorageType=accounting_storage/slurmdbd +AccountingStorageHost=slurmdbd +AccountingStoragePort=6819 +#AccountingStorageLoc=slurm_acct_db +#AccountingStoragePass= +#AccountingStorageUser= +# + +# COMPUTE NODES +PartitionName=DEFAULT Nodes=c[1-2] +PartitionName=debug Nodes=c[1-2] Default=YES MaxTime=INFINITE State=UP + +# # COMPUTE NODES +# NodeName=c[1-2] RealMemory=1000 State=UNKNOWN +NodeName=c[1-2] CPUs=12 Boards=1 SocketsPerBoard=2 CoresPerSocket=3 ThreadsPerCore=2 + +# # +# # PARTITIONS +# PartitionName=normal Default=yes Nodes=c[1-2] Priority=50 DefMemPerCPU=500 Shared=NO MaxNodes=2 MaxTime=5-00:00:00 DefaultTime=5-00:00:00 State=UP + +#PrEpPlugins=pika diff --git a/slurm/database/docker-entrypoint.sh b/slurm/database/docker-entrypoint.sh index 4275384..c314e94 100755 --- a/slurm/database/docker-entrypoint.sh +++ b/slurm/database/docker-entrypoint.sh @@ -69,6 +69,7 @@ _slurmdbd() { echo "Starting slurmdbd" cp /etc/slurm/slurmdbd.conf /.secret/slurmdbd.conf /usr/sbin/slurmdbd + echo "Started slurmdbd" } ### main ### diff --git a/slurm/database/slurmdbd.conf b/slurm/database/slurmdbd.conf new file mode 100644 index 0000000..f6d5a81 --- /dev/null +++ b/slurm/database/slurmdbd.conf @@ -0,0 +1,37 @@ +# +# Example slurmdbd.conf file. +# +# See the slurmdbd.conf man page for more information. 
+# +# Archive info +#ArchiveJobs=yes +#ArchiveDir="/tmp" +#ArchiveSteps=yes +#ArchiveScript= +#JobPurge=12 +#StepPurge=1 +# +# Authentication info +AuthType=auth/munge +#AuthInfo=/var/run/munge/munge.socket.2 +# +# slurmDBD info +DbdAddr=slurmdbd +DbdHost=slurmdbd +DbdPort=6819 +SlurmUser=slurm +#MessageTimeout=300 +DebugLevel=4 +#DefaultQOS=normal,standby +LogFile=/var/log/slurm/slurmdbd.log +PidFile=/var/run/slurmdbd/slurmdbd.pid +#PluginDir=/usr/lib/slurm +#PrivateData=accounts,users,usage,jobs +#TrackWCKey=yes +# +# Database info +StorageType=accounting_storage/mysql +StorageHost=mariadb +StorageUser=slurm +StoragePass=demo +StorageLoc=slurm_acct_db diff --git a/slurm/worker/docker-entrypoint.sh b/slurm/worker/docker-entrypoint.sh index ab4f512..299132b 100755 --- a/slurm/worker/docker-entrypoint.sh +++ b/slurm/worker/docker-entrypoint.sh @@ -78,6 +78,7 @@ _slurmd() { chown slurm: /var/log/slurmd.log echo -n "Starting slurmd" /usr/sbin/slurmd + echo -n "Started slurmd" } ### main ### From c646309a251092930408b389ea776d7562aa5ebf Mon Sep 17 00:00:00 2001 From: Aditya Ujeniya Date: Mon, 14 Oct 2024 23:00:44 +0200 Subject: [PATCH 05/25] Fixed docker-entrypoint.sh scripts --- .gitignore | 4 ++++ README.md | 0 docker-compose.yml | 2 ++ env-template.txt | 0 setupDev.sh | 25 +++++++++++++++++++++---- slurm/controller/docker-entrypoint.sh | 6 +++++- slurm/controller/slurm.conf | 4 ++-- slurm/database/docker-entrypoint.sh | 2 +- slurm/database/slurmdbd.conf | 2 +- slurm/worker/cgroup.conf | 5 +++++ 10 files changed, 41 insertions(+), 9 deletions(-) mode change 100644 => 100755 README.md mode change 100644 => 100755 docker-compose.yml mode change 100644 => 100755 env-template.txt create mode 100644 slurm/worker/cgroup.conf diff --git a/.gitignore b/.gitignore index 23c5b49..147c94d 100644 --- a/.gitignore +++ b/.gitignore @@ -3,6 +3,10 @@ data/job-archive/** data/influxdb data/sqldata data/cc-metric-store +data/cc-metric-store-source +data/ldap +data/mariadb +data/slurm 
cc-backend cc-backend/** .vscode diff --git a/README.md b/README.md old mode 100644 new mode 100755 diff --git a/docker-compose.yml b/docker-compose.yml old mode 100644 new mode 100755 index 7620c04..e45067c --- a/docker-compose.yml +++ b/docker-compose.yml @@ -104,6 +104,8 @@ services: volumes: - ${DATADIR}/slurm/home:/home - ${DATADIR}/slurm/secret:/.secret + - ./slurm/worker/cgroup.conf:/home/config/cgroup.conf + - ./slurm/controller/slurm.conf:/home/config/slurm.conf ports: - "6818:6818" diff --git a/env-template.txt b/env-template.txt old mode 100644 new mode 100755 diff --git a/setupDev.sh b/setupDev.sh index 14e9d81..81eeee6 100755 --- a/setupDev.sh +++ b/setupDev.sh @@ -1,4 +1,20 @@ #!/bin/bash +echo "" +echo "-----------------------------------------------------------------" +echo "Welcome to cc-docker automatic deployment script." +echo "Make sure you have sudo rights to run docker services" +echo "This script assumes that docker command is added to sudo group" +echo "This means that docker commands do not explicitly require" +echo "'sudo' keyword to run. You can use this following command:" +echo "" +echo "sudo groupadd docker" +echo "sudo usermod -aG docker $USER" +echo "" +echo "This will add docker to the sudo usergroup and all the docker" +echo "command will run as sudo by default without requiring" +echo "'sudo' keyword." +echo "-----------------------------------------------------------------" +echo "" # Check cc-backend, touch job.db if exists if [ ! -d cc-backend ]; then @@ -7,8 +23,6 @@ if [ ! -d cc-backend ]; then exit else cd cc-backend - make - if [ ! -d var ]; then wget https://hpc-mover.rrze.uni-erlangen.de/HPC-Data/0x7b58aefb/eig7ahyo6fo2bais0ephuf2aitohv1ai/job-archive-demo.tar tar xf job-archive-demo.tar @@ -17,6 +31,8 @@ else cp ./configs/env-template.txt .env cp ./configs/config-demo.json config.json + make + ./cc-backend -migrate-db ./cc-backend --init-db --add-user demo:admin:AdminDev cd .. 
@@ -28,6 +44,8 @@ else fi fi +mkdir -m777 data + # Download unedited checkpoint files to ./data/cc-metric-store-source/checkpoints if [ ! -d data/cc-metric-store-source ]; then mkdir -p data/cc-metric-store-source/checkpoints @@ -80,6 +98,5 @@ docker-compose up -d echo "" echo "Setup complete, containers are up by default: Shut down with 'docker-compose down'." -echo "Use './cc-backend/cc-backend' to start cc-backend." +echo "Use './cc-backend/cc-backend -server' to start cc-backend." echo "Use scripts in /scripts to load data into influx or mariadb." -# ./cc-backend/cc-backend diff --git a/slurm/controller/docker-entrypoint.sh b/slurm/controller/docker-entrypoint.sh index ce398a7..2faa507 100755 --- a/slurm/controller/docker-entrypoint.sh +++ b/slurm/controller/docker-entrypoint.sh @@ -55,7 +55,7 @@ _munge_start() { /usr/sbin/create-munge-key -r -f sh -c "dd if=/dev/urandom bs=1 count=1024 > /etc/munge/munge.key" chown munge: /etc/munge/munge.key - chmod 400 /etc/munge/munge.key + chmod 600 /etc/munge/munge.key sudo -u munge /sbin/munged munge -n munge -n | unmunge @@ -64,6 +64,10 @@ _munge_start() { # copy secrets to /.secret directory for other nodes _copy_secrets() { + while [ ! -f /home/worker/worker-secret.tar.gz ]; do + echo -n "." 
+ sleep 1 + done cp /home/worker/worker-secret.tar.gz /.secret/worker-secret.tar.gz cp /home/worker/setup-worker-ssh.sh /.secret/setup-worker-ssh.sh cp /etc/munge/munge.key /.secret/munge.key diff --git a/slurm/controller/slurm.conf b/slurm/controller/slurm.conf index 63f48f8..6a9a393 100644 --- a/slurm/controller/slurm.conf +++ b/slurm/controller/slurm.conf @@ -21,7 +21,7 @@ SwitchType=switch/none MpiDefault=none SlurmctldPidFile=/var/run/slurmd/slurmctld.pid SlurmdPidFile=/var/run/slurmd/slurmd.pid -ProctrackType=proctrack/linuxproc +# ProctrackType=proctrack/linuxproc #PluginDir= #CacheGroups=0 #FirstJobId= @@ -58,7 +58,7 @@ SchedulerType=sched/backfill #SchedulerRootFilter= SelectType=select/cons_res SelectTypeParameters=CR_CPU_Memory -FastSchedule=1 +# FastSchedule=1 #PriorityType=priority/multifactor #PriorityDecayHalfLife=14-0 #PriorityUsageResetPeriod=14-0 diff --git a/slurm/database/docker-entrypoint.sh b/slurm/database/docker-entrypoint.sh index c314e94..504ecd1 100755 --- a/slurm/database/docker-entrypoint.sh +++ b/slurm/database/docker-entrypoint.sh @@ -68,7 +68,7 @@ _slurmdbd() { fi echo "Starting slurmdbd" cp /etc/slurm/slurmdbd.conf /.secret/slurmdbd.conf - /usr/sbin/slurmdbd + /usr/sbin/slurmdbd -Dvv echo "Started slurmdbd" } diff --git a/slurm/database/slurmdbd.conf b/slurm/database/slurmdbd.conf index f6d5a81..d584535 100644 --- a/slurm/database/slurmdbd.conf +++ b/slurm/database/slurmdbd.conf @@ -24,7 +24,7 @@ SlurmUser=slurm DebugLevel=4 #DefaultQOS=normal,standby LogFile=/var/log/slurm/slurmdbd.log -PidFile=/var/run/slurmdbd/slurmdbd.pid +# PidFile=/var/run/slurmdbd/slurmdbd.pid #PluginDir=/usr/lib/slurm #PrivateData=accounts,users,usage,jobs #TrackWCKey=yes diff --git a/slurm/worker/cgroup.conf b/slurm/worker/cgroup.conf new file mode 100644 index 0000000..f24d9d7 --- /dev/null +++ b/slurm/worker/cgroup.conf @@ -0,0 +1,5 @@ +CgroupPlugin=cgroup/v1 +ConstrainCores=yes +ConstrainDevices=no +ConstrainRAMSpace=yes +ConstrainSwapSpace=yes From 
f7558779dad07734b69e7e2ba9ff560fdea698fe Mon Sep 17 00:00:00 2001 From: Aditya Ujeniya Date: Tue, 15 Oct 2024 16:15:37 +0200 Subject: [PATCH 06/25] Stable services --- docker-compose.yml | 23 +++------ slurm/controller/docker-entrypoint.sh | 36 ++++++++++++-- slurm/controller/slurm.conf | 14 +++--- slurm/worker/docker-entrypoint.sh | 72 ++++++++++++++++----------- 4 files changed, 90 insertions(+), 55 deletions(-) diff --git a/docker-compose.yml b/docker-compose.yml index e45067c..feab346 100755 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -68,8 +68,6 @@ services: hostname: slurmctld build: context: ./slurm/controller - depends_on: - - slurmdbd privileged: true ports: - "6817:6817" @@ -77,6 +75,8 @@ services: - ${DATADIR}/slurm/home:/home - ${DATADIR}/slurm/secret:/.secret - ./slurm/controller/slurm.conf:/home/config/slurm.conf + - /etc/timezone:/etc/timezone:ro + - /etc/localtime:/etc/localtime:ro slurmdbd: container_name: slurmdbd @@ -85,6 +85,7 @@ services: context: ./slurm/database depends_on: - mariadb + - slurmctld privileged: true ports: - "6819:6819" @@ -92,6 +93,8 @@ services: - ${DATADIR}/slurm/home:/home - ${DATADIR}/slurm/secret:/.secret - ./slurm/database/slurmdbd.conf:/home/config/slurmdbd.conf + - /etc/timezone:/etc/timezone:ro + - /etc/localtime:/etc/localtime:ro node01: container_name: node01 @@ -106,17 +109,7 @@ services: - ${DATADIR}/slurm/secret:/.secret - ./slurm/worker/cgroup.conf:/home/config/cgroup.conf - ./slurm/controller/slurm.conf:/home/config/slurm.conf + - /etc/timezone:/etc/timezone:ro + - /etc/localtime:/etc/localtime:ro ports: - - "6818:6818" - - # slurm-worker02: - # container_name: node02 - # hostname: node02 - # build: - # context: ./slurm/worker - # depends_on: - # - slurm-controller - # privileged: true - # volumes: - # - ${DATADIR}/slurm/home:/home - # - ${DATADIR}/slurm/secret:/.secret + - "6818:6818" \ No newline at end of file diff --git a/slurm/controller/docker-entrypoint.sh b/slurm/controller/docker-entrypoint.sh 
index 2faa507..3fc3d18 100755 --- a/slurm/controller/docker-entrypoint.sh +++ b/slurm/controller/docker-entrypoint.sh @@ -4,6 +4,18 @@ set -e # Determine the system architecture dynamically ARCH=$(uname -m) +_delete_secrets() { + if [ -f /.secret/munge.key ]; then + echo "Removing secrets" + sudo rm -rf /.secret/munge.key + sudo rm -rf /.secret/worker-secret.tar.gz + sudo rm -rf /.secret/setup-worker-ssh.sh + + echo "Done removing secrets" + ls /.secret/ + fi +} + # start sshd server _sshd_host() { if [ ! -d /var/run/sshd ]; then @@ -90,9 +102,17 @@ _slurmctld() { done echo "" mkdir -p /var/spool/slurm/ctld /var/spool/slurm/d /var/log/slurm /etc/slurm - chown -R slurm: /var/spool/slurm/ctld /var/spool/slurm/d /var/log/slurm + chown -R slurm: /var/spool/slurm/ctld /var/spool/slurm/d /var/log/slurm /var/spool /var/lib touch /var/log/slurmctld.log chown slurm: /var/log/slurmctld.log + touch /var/log/slurmd.log + chown slurm: /var/log/slurmd.log + + # touch /var/run/slurm/d/slurmctld.pid + # chown slurm: /var/run/slurm/d/slurmctld.pid + # touch /var/run/slurm/d/slurmd.pid + # chown slurm:/var/run/slurm/d/slurmd.pid + if [[ ! -f /home/config/slurm.conf ]]; then echo "### Missing slurm.conf ###" exit @@ -103,15 +123,25 @@ _slurmctld() { chmod 600 /etc/slurm/slurm.conf fi - sacctmgr -i add cluster "snowflake" + sudo yum install -y nc + sudo yum install -y procps + sudo yum install -y iputils + + while ! nc -z slurmdbd 6819; do + echo "Waiting for slurmdbd to be ready..." 
+ sleep 2 + done + + sacctmgr -i add cluster name=linux sleep 2s echo "Starting slurmctld" cp -f /etc/slurm/slurm.conf /.secret/ - /usr/sbin/slurmctld + /usr/sbin/slurmctld -Dvv echo "Started slurmctld" } ### main ### +_delete_secrets _sshd_host _ssh_worker _munge_start diff --git a/slurm/controller/slurm.conf b/slurm/controller/slurm.conf index 6a9a393..7a55ff6 100644 --- a/slurm/controller/slurm.conf +++ b/slurm/controller/slurm.conf @@ -15,13 +15,13 @@ SlurmdPort=6818 AuthType=auth/munge #JobCredentialPrivateKey= #JobCredentialPublicCertificate= -StateSaveLocation=/var/lib/slurmd -SlurmdSpoolDir=/var/spool/slurmd +StateSaveLocation=/var/lib/slurm/d +SlurmdSpoolDir=/var/spool/slurm/d SwitchType=switch/none MpiDefault=none -SlurmctldPidFile=/var/run/slurmd/slurmctld.pid -SlurmdPidFile=/var/run/slurmd/slurmd.pid -# ProctrackType=proctrack/linuxproc +SlurmctldPidFile=/var/run/slurm/d/slurmctld.pid +SlurmdPidFile=/var/run/slurm/d/slurmd.pid +ProctrackType=proctrack/linuxproc #PluginDir= #CacheGroups=0 #FirstJobId= @@ -79,7 +79,7 @@ JobCompLoc=/var/log/slurm/jobcomp.log # ACCOUNTING #JobAcctGatherType=jobacct_gather/linux JobAcctGatherType=jobacct_gather/cgroup -ProctrackType=proctrack/cgroup +# ProctrackType=proctrack/cgroup JobAcctGatherFrequency=30 # @@ -97,7 +97,7 @@ PartitionName=debug Nodes=c[1-2] Default=YES MaxTime=INFINITE State=UP # # COMPUTE NODES # NodeName=c[1-2] RealMemory=1000 State=UNKNOWN -NodeName=c[1-2] CPUs=12 Boards=1 SocketsPerBoard=2 CoresPerSocket=3 ThreadsPerCore=2 +NodeName=node01 CPUs=2 Boards=1 SocketsPerBoard=2 CoresPerSocket=1 ThreadsPerCore=1 # # # # PARTITIONS diff --git a/slurm/worker/docker-entrypoint.sh b/slurm/worker/docker-entrypoint.sh index 299132b..db691bc 100755 --- a/slurm/worker/docker-entrypoint.sh +++ b/slurm/worker/docker-entrypoint.sh @@ -15,6 +15,10 @@ _sshd_host() { # start munge using existing key _munge_start_using_key() { + sudo yum install -y nc + sudo yum install -y procps + sudo yum install -y iputils + echo -n 
"cheking for munge.key" while [ ! -f /.secret/munge.key ]; do echo -n "." @@ -35,50 +39,58 @@ _munge_start_using_key() { # wait for worker user in shared /home volume _wait_for_worker() { + echo "checking for id_rsa.pub" if [ ! -f /home/worker/.ssh/id_rsa.pub ]; then - echo -n "checking for id_rsa.pub" + echo "checking for id_rsa.pub" while [ ! -f /home/worker/.ssh/id_rsa.pub ]; do echo -n "." sleep 1 done echo "" fi + echo "done checking for id_rsa.pub" + } _start_dbus() { - dbus-uuidgen > /var/lib/dbus/machine-id - mkdir -p /var/run/dbus - dbus-daemon --config-file=/usr/share/dbus-1/system.conf --print-address + dbus-uuidgen >/var/lib/dbus/machine-id + mkdir -p /var/run/dbus + dbus-daemon --config-file=/usr/share/dbus-1/system.conf --print-address } # run slurmd _slurmd() { - cd /root/rpmbuild/RPMS/$ARCH - yum -y --nogpgcheck localinstall slurm-22.05.6-1.el8.$ARCH.rpm \ - slurm-perlapi-22.05.6-1.el8.$ARCH.rpm \ - slurm-slurmd-22.05.6-1.el8.$ARCH.rpm \ - slurm-torque-22.05.6-1.el8.$ARCH.rpm - if [ ! -f /.secret/slurm.conf ]; then - echo -n "checking for slurm.conf" - while [ ! -f /.secret/slurm.conf ]; do - echo -n "." - sleep 1 - done - echo "" - fi - mkdir -p /var/spool/slurm/d /etc/slurm - chown slurm: /var/spool/slurm/d - cp /home/config/cgroup.conf /etc/slurm/cgroup.conf - chown slurm: /etc/slurm/cgroup.conf - chmod 600 /etc/slurm/cgroup.conf - cp /home/config/slurm.conf /etc/slurm/slurm.conf - chown slurm: /etc/slurm/slurm.conf - chmod 600 /etc/slurm/slurm.conf - touch /var/log/slurmd.log - chown slurm: /var/log/slurmd.log - echo -n "Starting slurmd" - /usr/sbin/slurmd - echo -n "Started slurmd" + cd /root/rpmbuild/RPMS/$ARCH + yum -y --nogpgcheck localinstall slurm-22.05.6-1.el8.$ARCH.rpm \ + slurm-perlapi-22.05.6-1.el8.$ARCH.rpm \ + slurm-slurmd-22.05.6-1.el8.$ARCH.rpm \ + slurm-torque-22.05.6-1.el8.$ARCH.rpm + + echo "checking for slurm.conf" + if [ ! -f /.secret/slurm.conf ]; then + echo "checking for slurm.conf" + while [ ! 
-f /.secret/slurm.conf ]; do + echo -n "." + sleep 1 + done + echo "" + fi + echo "found slurm.conf" + + mkdir -p /var/spool/slurm/d /etc/slurm + chown slurm: /var/spool/slurm/d + cp /home/config/cgroup.conf /etc/slurm/cgroup.conf + chown slurm: /etc/slurm/cgroup.conf + chmod 600 /etc/slurm/cgroup.conf + cp /home/config/slurm.conf /etc/slurm/slurm.conf + chown slurm: /etc/slurm/slurm.conf + chmod 600 /etc/slurm/slurm.conf + touch /var/log/slurmd.log + chown slurm: /var/log/slurmd.log + + echo "Starting slurmd" + /usr/sbin/slurmd -Dvv + echo "Started slurmd" } ### main ### From 2d15d513c631d5397822ef2a07c699b9da130643 Mon Sep 17 00:00:00 2001 From: Aditya Ujeniya Date: Tue, 15 Oct 2024 17:02:25 +0200 Subject: [PATCH 07/25] Stable docker services --- docker-compose.yml | 1 + slurm/controller/docker-entrypoint.sh | 16 ++++++++++------ slurm/controller/slurm.conf | 4 ++-- slurm/worker/docker-entrypoint.sh | 12 ++++++++---- 4 files changed, 21 insertions(+), 12 deletions(-) diff --git a/docker-compose.yml b/docker-compose.yml index feab346..11db075 100755 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -77,6 +77,7 @@ services: - ./slurm/controller/slurm.conf:/home/config/slurm.conf - /etc/timezone:/etc/timezone:ro - /etc/localtime:/etc/localtime:ro + - ${DATADIR}/slurm/state:/var/lib/slurm/d slurmdbd: container_name: slurmdbd diff --git a/slurm/controller/docker-entrypoint.sh b/slurm/controller/docker-entrypoint.sh index 3fc3d18..279412b 100755 --- a/slurm/controller/docker-entrypoint.sh +++ b/slurm/controller/docker-entrypoint.sh @@ -101,17 +101,21 @@ _slurmctld() { sleep 1 done echo "" - mkdir -p /var/spool/slurm/ctld /var/spool/slurm/d /var/log/slurm /etc/slurm - chown -R slurm: /var/spool/slurm/ctld /var/spool/slurm/d /var/log/slurm /var/spool /var/lib + mkdir -p /var/spool/slurm/ctld /var/spool/slurm/d /var/log/slurm /etc/slurm /var/run/slurm/d /var/run/slurm/ctld /var/lib/slurm/d /var/lib/slurm/ctld + chown -R slurm: /var/spool/slurm/ctld /var/spool/slurm/d 
/var/log/slurm /var/spool /var/lib /var/run/slurm/d /var/run/slurm/ctld /var/lib/slurm/d /var/lib/slurm/ctld touch /var/log/slurmctld.log chown slurm: /var/log/slurmctld.log touch /var/log/slurmd.log chown slurm: /var/log/slurmd.log - # touch /var/run/slurm/d/slurmctld.pid - # chown slurm: /var/run/slurm/d/slurmctld.pid - # touch /var/run/slurm/d/slurmd.pid - # chown slurm:/var/run/slurm/d/slurmd.pid + touch /var/lib/slurm/d/job_state + chown slurm: /var/lib/slurm/d/job_state + touch /var/lib/slurm/d/fed_mgr_state + chown slurm: /var/lib/slurm/d/fed_mgr_state + touch /var/run/slurm/d/slurmctld.pid + chown slurm: /var/run/slurm/d/slurmctld.pid + touch /var/run/slurm/d/slurmd.pid + chown slurm: /var/run/slurm/d/slurmd.pid if [[ ! -f /home/config/slurm.conf ]]; then echo "### Missing slurm.conf ###" diff --git a/slurm/controller/slurm.conf b/slurm/controller/slurm.conf index 7a55ff6..ab5172a 100644 --- a/slurm/controller/slurm.conf +++ b/slurm/controller/slurm.conf @@ -92,8 +92,8 @@ AccountingStoragePort=6819 # # COMPUTE NODES -PartitionName=DEFAULT Nodes=c[1-2] -PartitionName=debug Nodes=c[1-2] Default=YES MaxTime=INFINITE State=UP +PartitionName=DEFAULT Nodes=node01 +PartitionName=debug Nodes=node01 Default=YES MaxTime=INFINITE State=UP # # COMPUTE NODES # NodeName=c[1-2] RealMemory=1000 State=UNKNOWN diff --git a/slurm/worker/docker-entrypoint.sh b/slurm/worker/docker-entrypoint.sh index db691bc..e8bc2a9 100755 --- a/slurm/worker/docker-entrypoint.sh +++ b/slurm/worker/docker-entrypoint.sh @@ -77,16 +77,20 @@ _slurmd() { fi echo "found slurm.conf" - mkdir -p /var/spool/slurm/d /etc/slurm - chown slurm: /var/spool/slurm/d + mkdir -p /var/spool/slurm/d /etc/slurm /var/run/slurm/d /var/log/slurm + chown slurm: /var/spool/slurm/d /var/run/slurm/d /var/log/slurm cp /home/config/cgroup.conf /etc/slurm/cgroup.conf chown slurm: /etc/slurm/cgroup.conf chmod 600 /etc/slurm/cgroup.conf cp /home/config/slurm.conf /etc/slurm/slurm.conf chown slurm: /etc/slurm/slurm.conf chmod 
600 /etc/slurm/slurm.conf - touch /var/log/slurmd.log - chown slurm: /var/log/slurmd.log + touch /var/log/slurm/slurmd.log + chown slurm: /var/log/slurm/slurmd.log + + touch /var/run/slurm/d/slurmd.pid + chmod 600 /var/run/slurm/d/slurmd.pid + chown slurm: /var/run/slurm/d/slurmd.pid echo "Starting slurmd" /usr/sbin/slurmd -Dvv From 2b6859772402d5a52eaace800c32413aa6fb98af Mon Sep 17 00:00:00 2001 From: Aditya Ujeniya Date: Tue, 22 Oct 2024 20:27:09 +0200 Subject: [PATCH 08/25] Slurm version update and rest service add --- docker-compose.yml | 29 +++++-- setupDev.sh | 1 + slurm/base/Dockerfile | 20 +++-- slurm/base/Makefile | 2 +- slurm/controller/Dockerfile | 2 +- slurm/controller/docker-entrypoint.sh | 12 +-- slurm/controller/slurm.conf | 2 +- slurm/database/Dockerfile | 2 +- slurm/database/docker-entrypoint.sh | 7 +- slurm/rest/Dockerfile | 2 +- slurm/rest/docker-entrypoint.sh | 104 +++++++++----------------- slurm/rest/slurmrestd.conf | 6 ++ slurm/worker/Dockerfile | 2 +- slurm/worker/docker-entrypoint.sh | 9 ++- 14 files changed, 99 insertions(+), 101 deletions(-) create mode 100644 slurm/rest/slurmrestd.conf diff --git a/docker-compose.yml b/docker-compose.yml index 11db075..70f0a1e 100755 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -69,8 +69,6 @@ services: build: context: ./slurm/controller privileged: true - ports: - - "6817:6817" volumes: - ${DATADIR}/slurm/home:/home - ${DATADIR}/slurm/secret:/.secret @@ -78,6 +76,8 @@ services: - /etc/timezone:/etc/timezone:ro - /etc/localtime:/etc/localtime:ro - ${DATADIR}/slurm/state:/var/lib/slurm/d + ports: + - "6817:6817" slurmdbd: container_name: slurmdbd @@ -88,14 +88,15 @@ services: - mariadb - slurmctld privileged: true - ports: - - "6819:6819" volumes: - ${DATADIR}/slurm/home:/home - ${DATADIR}/slurm/secret:/.secret - ./slurm/database/slurmdbd.conf:/home/config/slurmdbd.conf - /etc/timezone:/etc/timezone:ro - /etc/localtime:/etc/localtime:ro + - ${DATADIR}/slurm/state:/var/lib/slurm/d + ports: + - 
"6819:6819" node01: container_name: node01 @@ -113,4 +114,22 @@ services: - /etc/timezone:/etc/timezone:ro - /etc/localtime:/etc/localtime:ro ports: - - "6818:6818" \ No newline at end of file + - "6818:6818" + + slurmrestd: + container_name: slurmrestd + hostname: slurmrestd + build: + context: ./slurm/rest + depends_on: + - slurmctld + privileged: true + volumes: + - ${DATADIR}/slurm/home:/home + - ${DATADIR}/slurm/secret:/.secret + - ./slurm/controller/slurm.conf:/home/config/slurm.conf + - ./slurm/rest/slurmrestd.conf:/home/config/slurmrestd.conf + - /etc/timezone:/etc/timezone:ro + - /etc/localtime:/etc/localtime:ro + ports: + - "6820:6820" \ No newline at end of file diff --git a/setupDev.sh b/setupDev.sh index 81eeee6..1141138 100755 --- a/setupDev.sh +++ b/setupDev.sh @@ -88,6 +88,7 @@ if [ ! -f docker-compose.yml ]; then fi docker-compose down +docker-compose down --remove-orphans cd slurm/base/ make diff --git a/slurm/base/Dockerfile b/slurm/base/Dockerfile index 5f19b55..caa8fde 100644 --- a/slurm/base/Dockerfile +++ b/slurm/base/Dockerfile @@ -1,9 +1,11 @@ FROM rockylinux:8 LABEL org.opencontainers.image.authors="jan.eitzinger@fau.de" -ENV SLURM_VERSION=22.05.6 +ENV SLURM_VERSION=24.05.3 +ENV HTTP_PARSER_VERSION=2.8.0 RUN yum install -y https://dl.fedoraproject.org/pub/epel/epel-release-latest-8.noarch.rpm +RUN ARCH=$(uname -m) && yum install -y https://rpmfind.net/linux/almalinux/8.10/PowerTools/x86_64/os/Packages/http-parser-devel-2.8.0-9.el8.$ARCH.rpm RUN groupadd -g 981 munge \ && useradd -m -c "MUNGE Uid 'N' Gid Emporium" -d /var/lib/munge -u 981 -g munge -s /sbin/nologin munge \ @@ -17,20 +19,21 @@ RUN yum install -y munge munge-libs rng-tools \ openssh-server openssh-clients dbus-devel \ pam-devel numactl numactl-devel hwloc sudo \ lua readline-devel ncurses-devel man2html \ - libibmad libibumad rpm-build perl-ExtUtils-MakeMaker.noarch rpm-build make wget + autoconf automake json-c-devel \ + libibmad libibumad rpm-build 
perl-ExtUtils-MakeMaker.noarch rpm-build make wget RUN dnf --enablerepo=powertools install -y munge-devel rrdtool-devel lua-devel hwloc-devel mariadb-server mariadb-devel -RUN mkdir -p /usr/local/slurm-tmp \ +RUN mkdir -p /usr/local/slurm-tmp \ && cd /usr/local/slurm-tmp \ && wget https://download.schedmd.com/slurm/slurm-${SLURM_VERSION}.tar.bz2 \ - && rpmbuild -ta slurm-${SLURM_VERSION}.tar.bz2 + && rpmbuild -ta --with slurmrestd slurm-${SLURM_VERSION}.tar.bz2 RUN ARCH=$(uname -m) \ && yum -y --nogpgcheck localinstall \ - /root/rpmbuild/RPMS/$ARCH/slurm-${SLURM_VERSION}-1.el8.$ARCH.rpm \ - /root/rpmbuild/RPMS/$ARCH/slurm-perlapi-${SLURM_VERSION}-1.el8.$ARCH.rpm \ - /root/rpmbuild/RPMS/$ARCH/slurm-slurmctld-${SLURM_VERSION}-1.el8.$ARCH.rpm + /root/rpmbuild/RPMS/$ARCH/slurm-${SLURM_VERSION}*.$ARCH.rpm \ + /root/rpmbuild/RPMS/$ARCH/slurm-perlapi-${SLURM_VERSION}*.$ARCH.rpm \ + /root/rpmbuild/RPMS/$ARCH/slurm-slurmctld-${SLURM_VERSION}*.$ARCH.rpm VOLUME ["/home", "/.secret"] # 22: SSH @@ -38,4 +41,5 @@ VOLUME ["/home", "/.secret"] # 6817: SlurmCtlD # 6818: SlurmD # 6819: SlurmDBD -EXPOSE 22 6817 6818 6819 +# 6820: SlurmRestD +EXPOSE 22 6817 6818 6819 6820 diff --git a/slurm/base/Makefile b/slurm/base/Makefile index dc0dff3..01029b8 100644 --- a/slurm/base/Makefile +++ b/slurm/base/Makefile @@ -1,6 +1,6 @@ include ../../.env IMAGE = clustercockpit/slurm.base - +SLURM_VERSION = 24.05.3 .PHONY: build clean build: diff --git a/slurm/controller/Dockerfile b/slurm/controller/Dockerfile index b236b5b..a111d6b 100644 --- a/slurm/controller/Dockerfile +++ b/slurm/controller/Dockerfile @@ -1,4 +1,4 @@ -FROM clustercockpit/slurm.base:22.05.6 +FROM clustercockpit/slurm.base:24.05.3 LABEL org.opencontainers.image.authors="jan.eitzinger@fau.de" # clean up diff --git a/slurm/controller/docker-entrypoint.sh b/slurm/controller/docker-entrypoint.sh index 279412b..135f6c9 100755 --- a/slurm/controller/docker-entrypoint.sh +++ b/slurm/controller/docker-entrypoint.sh @@ -3,6 +3,7 @@ set 
-e # Determine the system architecture dynamically ARCH=$(uname -m) +SLURM_VERSION="24.05.3" _delete_secrets() { if [ -f /.secret/munge.key ]; then @@ -90,11 +91,12 @@ _copy_secrets() { # run slurmctld _slurmctld() { cd /root/rpmbuild/RPMS/$ARCH - yum -y --nogpgcheck localinstall slurm-22.05.6-1.el8.$ARCH.rpm \ - slurm-perlapi-22.05.6-1.el8.$ARCH.rpm \ - slurm-slurmd-22.05.6-1.el8.$ARCH.rpm \ - slurm-torque-22.05.6-1.el8.$ARCH.rpm \ - slurm-slurmctld-22.05.6-1.el8.$ARCH.rpm + + yum -y --nogpgcheck localinstall slurm-$SLURM_VERSION*.$ARCH.rpm \ + slurm-perlapi-$SLURM_VERSION*.$ARCH.rpm \ + slurm-slurmd-$SLURM_VERSION*.$ARCH.rpm \ + slurm-torque-$SLURM_VERSION*.$ARCH.rpm \ + slurm-slurmctld-$SLURM_VERSION*.$ARCH.rpm echo "checking for slurmdbd.conf" while [ ! -f /.secret/slurmdbd.conf ]; do echo -n "." diff --git a/slurm/controller/slurm.conf b/slurm/controller/slurm.conf index ab5172a..f41d0f5 100644 --- a/slurm/controller/slurm.conf +++ b/slurm/controller/slurm.conf @@ -56,7 +56,7 @@ SchedulerType=sched/backfill #SchedulerAuth= #SchedulerPort= #SchedulerRootFilter= -SelectType=select/cons_res +# SelectType=select/con_res SelectTypeParameters=CR_CPU_Memory # FastSchedule=1 #PriorityType=priority/multifactor diff --git a/slurm/database/Dockerfile b/slurm/database/Dockerfile index b236b5b..a111d6b 100644 --- a/slurm/database/Dockerfile +++ b/slurm/database/Dockerfile @@ -1,4 +1,4 @@ -FROM clustercockpit/slurm.base:22.05.6 +FROM clustercockpit/slurm.base:24.05.3 LABEL org.opencontainers.image.authors="jan.eitzinger@fau.de" # clean up diff --git a/slurm/database/docker-entrypoint.sh b/slurm/database/docker-entrypoint.sh index 504ecd1..3f74437 100755 --- a/slurm/database/docker-entrypoint.sh +++ b/slurm/database/docker-entrypoint.sh @@ -3,6 +3,7 @@ set -e # Determine the system architecture dynamically ARCH=$(uname -m) +SLURM_VERSION="24.05.3" SLURM_ACCT_DB_SQL=/slurm_acct_db.sql @@ -52,9 +53,9 @@ _wait_for_worker() { # run slurmdbd _slurmdbd() { cd 
/root/rpmbuild/RPMS/$ARCH - yum -y --nogpgcheck localinstall slurm-22.05.6-1.el8.$ARCH.rpm \ - slurm-perlapi-22.05.6-1.el8.$ARCH.rpm \ - slurm-slurmdbd-22.05.6-1.el8.$ARCH.rpm + yum -y --nogpgcheck localinstall slurm-$SLURM_VERSION*.$ARCH.rpm \ + slurm-perlapi-$SLURM_VERSION*.$ARCH.rpm \ + slurm-slurmdbd-$SLURM_VERSION*.$ARCH.rpm mkdir -p /var/spool/slurm/d /var/log/slurm /etc/slurm chown slurm: /var/spool/slurm/d /var/log/slurm if [[ ! -f /home/config/slurmdbd.conf ]]; then diff --git a/slurm/rest/Dockerfile b/slurm/rest/Dockerfile index b236b5b..a111d6b 100644 --- a/slurm/rest/Dockerfile +++ b/slurm/rest/Dockerfile @@ -1,4 +1,4 @@ -FROM clustercockpit/slurm.base:22.05.6 +FROM clustercockpit/slurm.base:24.05.3 LABEL org.opencontainers.image.authors="jan.eitzinger@fau.de" # clean up diff --git a/slurm/rest/docker-entrypoint.sh b/slurm/rest/docker-entrypoint.sh index 549b92c..0cdca57 100755 --- a/slurm/rest/docker-entrypoint.sh +++ b/slurm/rest/docker-entrypoint.sh @@ -3,109 +3,73 @@ set -e # Determine the system architecture dynamically ARCH=$(uname -m) +SLURM_VERSION="24.05.3" # start sshd server _sshd_host() { - if [ ! -d /var/run/sshd ]; then - mkdir /var/run/sshd - ssh-keygen -t rsa -f /etc/ssh/ssh_host_rsa_key -N '' - fi - /usr/sbin/sshd -} - -# setup worker ssh to be passwordless -_ssh_worker() { - if [[ ! -d /home/worker ]]; then - mkdir -p /home/worker - chown -R worker:worker /home/worker + if [ ! 
-d /var/run/sshd ]; then + mkdir /var/run/sshd + ssh-keygen -t rsa -f /etc/ssh/ssh_host_rsa_key -N '' fi - cat > /home/worker/setup-worker-ssh.sh < ~/.ssh/authorized_keys -chmod 0640 ~/.ssh/authorized_keys -cat >> ~/.ssh/config < /etc/munge/munge.key" - chown munge: /etc/munge/munge.key - chmod 400 /etc/munge/munge.key sudo -u munge /sbin/munged munge -n munge -n | unmunge remunge } -# copy secrets to /.secret directory for other nodes -_copy_secrets() { - cp /home/worker/worker-secret.tar.gz /.secret/worker-secret.tar.gz - cp thome/worker/setup-worker-ssh.sh /.secret/setup-worker-ssh.sh - cp /etc/munge/munge.key /.secret/munge.key - rm -f /home/worker/worker-secret.tar.gz - rm -f /home/worker/setup-worker-ssh.sh -} - -# run slurmctld -_slurmctld() { +# run slurmrestd +_slurmrestd() { cd /root/rpmbuild/RPMS/$ARCH - yum -y --nogpgcheck localinstall slurm-22.05.6-1.el8.$ARCH.rpm \ - slurm-perlapi-22.05.6-1.el8.$ARCH.rpm \ - slurm-slurmd-22.05.6-1.el8.$ARCH.rpm \ - slurm-torque-22.05.6-1.el8.$ARCH.rpm \ - slurm-slurmctld-22.05.6-1.el8.$ARCH.rpm \ - slurm-slurmrestd-22.05.6-1.el8.$ARCH.rpm + yum -y --nogpgcheck localinstall slurm-$SLURM_VERSION*.$ARCH.rpm \ + slurm-perlapi-$SLURM_VERSION*.$ARCH.rpm \ + slurm-slurmd-$SLURM_VERSION*.$ARCH.rpm \ + slurm-torque-$SLURM_VERSION*.$ARCH.rpm \ + slurm-slurmctld-$SLURM_VERSION*.$ARCH.rpm \ + slurm-slurmrestd-$SLURM_VERSION*.$ARCH.rpm echo -n "checking for slurmdbd.conf" while [ ! -f /.secret/slurmdbd.conf ]; do echo -n "." sleep 1 done echo "" - mkdir -p /var/spool/slurm/ctld /var/spool/slurm/d /var/log/slurm /etc/slurm - chown -R slurm: /var/spool/slurm/ctld /var/spool/slurm/d /var/log/slurm - touch /var/log/slurmctld.log - chown slurm: /var/log/slurmctld.log - if [[ ! 
-f /home/config/slurm.conf ]]; then + # mkdir -p /var/spool/slurm/ctld /var/spool/slurm/d /var/log/slurm /etc/slurm + # chown -R slurm: /var/spool/slurm/ctld /var/spool/slurm/d /var/log/slurm + touch /var/log/slurmrestd.log + chown slurm: /var/log/slurmrestd.log + if [[ ! -f /home/config/slurmrestd.conf ]]; then echo "### Missing slurm.conf ###" exit else - echo "### use provided slurm.conf ###" - cp /home/config/slurm.conf /etc/slurm/slurm.conf + echo "### use provided slurmrestd.conf ###" + cp /home/config/slurmrestd.conf /etc/config/slurmrestd.conf fi - sacctmgr -i add cluster "snowflake" sleep 2s - /usr/sbin/slurmctld - cp -f /etc/slurm/slurm.conf /.secret/ + /usr/sbin/slurmrestd -f /etc/config/slurmrestd.conf 0.0.0.0:6820 -Dvv } ### main ### _sshd_host -_ssh_worker -_munge_start -_copy_secrets -_slurmctld +_munge_start_using_key +_slurmrestd tail -f /dev/null diff --git a/slurm/rest/slurmrestd.conf b/slurm/rest/slurmrestd.conf new file mode 100644 index 0000000..a747d11 --- /dev/null +++ b/slurm/rest/slurmrestd.conf @@ -0,0 +1,6 @@ +# +# Example slurmdbd.conf file. 
+# +include /etc/slurm/slurm.conf + +AuthType=auth/munge \ No newline at end of file diff --git a/slurm/worker/Dockerfile b/slurm/worker/Dockerfile index 2fb1c11..556fcbc 100644 --- a/slurm/worker/Dockerfile +++ b/slurm/worker/Dockerfile @@ -1,4 +1,4 @@ -FROM clustercockpit/slurm.base:22.05.6 +FROM clustercockpit/slurm.base:24.05.3 LABEL org.opencontainers.image.authors="jan.eitzinger@fau.de" # clean up diff --git a/slurm/worker/docker-entrypoint.sh b/slurm/worker/docker-entrypoint.sh index e8bc2a9..dfadc46 100755 --- a/slurm/worker/docker-entrypoint.sh +++ b/slurm/worker/docker-entrypoint.sh @@ -3,6 +3,7 @@ set -e # Determine the system architecture dynamically ARCH=$(uname -m) +SLURM_VERSION="24.05.3" # start sshd server _sshd_host() { @@ -61,10 +62,10 @@ _start_dbus() { # run slurmd _slurmd() { cd /root/rpmbuild/RPMS/$ARCH - yum -y --nogpgcheck localinstall slurm-22.05.6-1.el8.$ARCH.rpm \ - slurm-perlapi-22.05.6-1.el8.$ARCH.rpm \ - slurm-slurmd-22.05.6-1.el8.$ARCH.rpm \ - slurm-torque-22.05.6-1.el8.$ARCH.rpm + yum -y --nogpgcheck localinstall slurm-$SLURM_VERSION*.$ARCH.rpm \ + slurm-perlapi-$SLURM_VERSION*.$ARCH.rpm \ + slurm-slurmd-$SLURM_VERSION*.$ARCH.rpm \ + slurm-torque-$SLURM_VERSION*.$ARCH.rpm echo "checking for slurm.conf" if [ ! 
-f /.secret/slurm.conf ]; then From f574568d76b1c18b9e0f24396bba1fb40ff36c32 Mon Sep 17 00:00:00 2001 From: Aditya Ujeniya Date: Wed, 23 Oct 2024 10:17:42 +0200 Subject: [PATCH 09/25] Working rest service --- slurm/base/Dockerfile | 2 +- slurm/rest/docker-entrypoint.sh | 9 ++++++++- slurm/rest/slurmrestd.conf | 4 +--- 3 files changed, 10 insertions(+), 5 deletions(-) diff --git a/slurm/base/Dockerfile b/slurm/base/Dockerfile index caa8fde..fb9ae62 100644 --- a/slurm/base/Dockerfile +++ b/slurm/base/Dockerfile @@ -27,7 +27,7 @@ RUN dnf --enablerepo=powertools install -y munge-devel rrdtool-devel lua-devel h RUN mkdir -p /usr/local/slurm-tmp \ && cd /usr/local/slurm-tmp \ && wget https://download.schedmd.com/slurm/slurm-${SLURM_VERSION}.tar.bz2 \ - && rpmbuild -ta --with slurmrestd slurm-${SLURM_VERSION}.tar.bz2 + && rpmbuild -ta --with slurmrestd --with jwt slurm-${SLURM_VERSION}.tar.bz2 RUN ARCH=$(uname -m) \ && yum -y --nogpgcheck localinstall \ diff --git a/slurm/rest/docker-entrypoint.sh b/slurm/rest/docker-entrypoint.sh index 0cdca57..f5ef0dd 100755 --- a/slurm/rest/docker-entrypoint.sh +++ b/slurm/rest/docker-entrypoint.sh @@ -54,6 +54,10 @@ _slurmrestd() { echo "" # mkdir -p /var/spool/slurm/ctld /var/spool/slurm/d /var/log/slurm /etc/slurm # chown -R slurm: /var/spool/slurm/ctld /var/spool/slurm/d /var/log/slurm + + mkdir -p /etc/config /var/spool/slurm /var/spool/slurm/restd /var/spool/slurm/restd/rest + chown -R slurm: /etc/config /var/spool/slurm /var/spool/slurm/restd /var/spool/slurm/restd/rest + touch /var/log/slurmrestd.log chown slurm: /var/log/slurmrestd.log if [[ ! 
-f /home/config/slurmrestd.conf ]]; then @@ -62,9 +66,12 @@ _slurmrestd() { else echo "### use provided slurmrestd.conf ###" cp /home/config/slurmrestd.conf /etc/config/slurmrestd.conf + cp /home/config/slurm.conf /etc/config/slurm.conf + fi sleep 2s - /usr/sbin/slurmrestd -f /etc/config/slurmrestd.conf 0.0.0.0:6820 -Dvv + export SLURMRESTD=/var/spool/slurm/restd/rest + /usr/sbin/slurmrestd -f /etc/config/slurmrestd.conf -s dbv0.0.39,v0.0.39 -vv -u slurm 0.0.0.0:6820 } ### main ### diff --git a/slurm/rest/slurmrestd.conf b/slurm/rest/slurmrestd.conf index a747d11..8755372 100644 --- a/slurm/rest/slurmrestd.conf +++ b/slurm/rest/slurmrestd.conf @@ -1,6 +1,4 @@ # # Example slurmdbd.conf file. # -include /etc/slurm/slurm.conf - -AuthType=auth/munge \ No newline at end of file +include /etc/config/slurm.conf From 0c1f6b446e845dc60b480755ea54bd27807b3e0a Mon Sep 17 00:00:00 2001 From: Aditya Ujeniya Date: Thu, 24 Oct 2024 14:54:40 +0200 Subject: [PATCH 10/25] Unix port based slurmrest service+ --- curl_slurmrestd.sh | 3 + docker-compose.yml | 6 ++ setupDev.sh | 46 +++++++++----- slurm/base/Dockerfile | 2 +- slurm/controller/docker-entrypoint.sh | 59 +++++++++++++++-- slurm/controller/slurm.conf | 2 + slurm/database/docker-entrypoint.sh | 36 +++++++++-- slurm/database/slurmdbd.conf | 3 +- slurm/rest/Dockerfile | 7 +- slurm/rest/docker-entrypoint.sh | 92 ++++++++++++++++++++++++--- slurm/worker/docker-entrypoint.sh | 4 ++ 11 files changed, 219 insertions(+), 41 deletions(-) create mode 100755 curl_slurmrestd.sh diff --git a/curl_slurmrestd.sh b/curl_slurmrestd.sh new file mode 100755 index 0000000..e3826ee --- /dev/null +++ b/curl_slurmrestd.sh @@ -0,0 +1,3 @@ 
+JWT="eyJhbGciOiJSUzI1NiIsICJ0eXAiOiJKV1QifQ.eyJpc3MiOiJzbHVybSJ9.dzAHf1Ojoa149uRCCWY1eP3vDyCIZCOZ3h554R-KJJ8-OP0CJ0ymvSkFISLcYcyd9vVKmaYdSN3tWEF6bNZEmyX7G560i1MbkNFvhkhNVSPLKEKNPs38h5ra3ZlTlLlxAlDzXRAAn6UEEgKdm5vx4Jhec7ptaRL_zeSFpTS5fJPc0QE1Cm7e7nU39-9e8l4WU4KpRMxT6ANFm22_G4-mSA-AgCAvKQFzj2FInKsXDUTGlliNJuAgFxf-9LQxoeAknOQhEqcTXii_yBy9DNcT03pdNcAu5Ru4_qlX62vroInU_eh5mWQyiUdXN9Wj_OfMmfLoYFkJeUFYexBMZnSBgg" + +curl -X 'GET' -v 'http://localhost:6820/slurm/v0.0.39/ping' -H "X-SLURM-USER-NAME:slurm" -H "X-SLURM-USER-TOKEN:$SLURM_JWT" diff --git a/docker-compose.yml b/docker-compose.yml index 70f0a1e..f04fd8b 100755 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -91,6 +91,7 @@ services: volumes: - ${DATADIR}/slurm/home:/home - ${DATADIR}/slurm/secret:/.secret + - ${DATADIR}/slurm/tmp:/tmp:rw - ./slurm/database/slurmdbd.conf:/home/config/slurmdbd.conf - /etc/timezone:/etc/timezone:ro - /etc/localtime:/etc/localtime:ro @@ -109,6 +110,7 @@ services: volumes: - ${DATADIR}/slurm/home:/home - ${DATADIR}/slurm/secret:/.secret + - ${DATADIR}/slurm/tmp:/tmp:rw - ./slurm/worker/cgroup.conf:/home/config/cgroup.conf - ./slurm/controller/slurm.conf:/home/config/slurm.conf - /etc/timezone:/etc/timezone:ro @@ -121,12 +123,16 @@ services: hostname: slurmrestd build: context: ./slurm/rest + args: + uid_u: ${UID_U} + gid_g: ${GID_G} depends_on: - slurmctld privileged: true volumes: - ${DATADIR}/slurm/home:/home - ${DATADIR}/slurm/secret:/.secret + - ${DATADIR}/slurm/tmp:/tmp:rw - ./slurm/controller/slurm.conf:/home/config/slurm.conf - ./slurm/rest/slurmrestd.conf:/home/config/slurmrestd.conf - /etc/timezone:/etc/timezone:ro diff --git a/setupDev.sh b/setupDev.sh index 1141138..3616787 100755 --- a/setupDev.sh +++ b/setupDev.sh @@ -1,21 +1,24 @@ #!/bin/bash echo "" -echo "-----------------------------------------------------------------" -echo "Welcome to cc-docker automatic deployment script." 
-echo "Make sure you have sudo rights to run docker services" -echo "This script assumes that docker command is added to sudo group" -echo "This means that docker commands do not explicitly require" -echo "'sudo' keyword to run. You can use this following command:" -echo "" -echo "sudo groupadd docker" -echo "sudo usermod -aG docker $USER" -echo "" -echo "This will add docker to the sudo usergroup and all the docker" -echo "command will run as sudo by default without requiring" -echo "'sudo' keyword." -echo "-----------------------------------------------------------------" +echo "|--------------------------------------------------------------------------------------|" +echo "| Welcome to cc-docker automatic deployment script. |" +echo "| Make sure you have sudo rights to run docker services |" +echo "| This script assumes that docker command is added to sudo group |" +echo "| This means that docker commands do not explicitly require |" +echo "| 'sudo' keyword to run. You can use this following command: |" +echo "| |" +echo "| > sudo groupadd docker |" +echo "| > sudo usermod -aG docker $USER |" +echo "| |" +echo "| This will add docker to the sudo usergroup and all the docker |" +echo "| command will run as sudo by default without requiring |" +echo "| 'sudo' keyword. |" +echo "|--------------------------------------------------------------------------------------|" echo "" +export UID_U=$(id -u $USER) +export GID_G=$(id -g $USER) + # Check cc-backend, touch job.db if exists if [ ! -d cc-backend ]; then echo "'cc-backend' not yet prepared! Please clone cc-backend repository before starting this script." @@ -98,6 +101,15 @@ docker-compose build docker-compose up -d echo "" -echo "Setup complete, containers are up by default: Shut down with 'docker-compose down'." -echo "Use './cc-backend/cc-backend -server' to start cc-backend." -echo "Use scripts in /scripts to load data into influx or mariadb." 
+echo "|--------------------------------------------------------------------------------------|" +echo "| Check logs for each slurm service by using these commands: |" +echo "| docker-compose logs slurmctld |" +echo "| docker-compose logs slurmdbd |" +echo "| docker-compose logs slurmrestd |" +echo "| docker-compose logs node01 |" +echo "|======================================================================================|" +echo "| Setup complete, containers are up by default: Shut down with 'docker-compose down'. |" +echo "| Use './cc-backend/cc-backend -server' to start cc-backend. |" +echo "| Use scripts in /scripts to load data into influx or mariadb. |" +echo "|--------------------------------------------------------------------------------------|" +echo "" diff --git a/slurm/base/Dockerfile b/slurm/base/Dockerfile index fb9ae62..f47588e 100644 --- a/slurm/base/Dockerfile +++ b/slurm/base/Dockerfile @@ -19,7 +19,7 @@ RUN yum install -y munge munge-libs rng-tools \ openssh-server openssh-clients dbus-devel \ pam-devel numactl numactl-devel hwloc sudo \ lua readline-devel ncurses-devel man2html \ - autoconf automake json-c-devel \ + autoconf automake json-c-devel libjwt-devel \ libibmad libibumad rpm-build perl-ExtUtils-MakeMaker.noarch rpm-build make wget RUN dnf --enablerepo=powertools install -y munge-devel rrdtool-devel lua-devel hwloc-devel mariadb-server mariadb-devel diff --git a/slurm/controller/docker-entrypoint.sh b/slurm/controller/docker-entrypoint.sh index 135f6c9..2871c8d 100755 --- a/slurm/controller/docker-entrypoint.sh +++ b/slurm/controller/docker-entrypoint.sh @@ -4,6 +4,8 @@ set -e # Determine the system architecture dynamically ARCH=$(uname -m) SLURM_VERSION="24.05.3" +SLURM_JWT=daemon +SLURMRESTD_SECURITY=disable_user_check _delete_secrets() { if [ -f /.secret/munge.key ]; then @@ -11,6 +13,9 @@ _delete_secrets() { sudo rm -rf /.secret/munge.key sudo rm -rf /.secret/worker-secret.tar.gz sudo rm -rf /.secret/setup-worker-ssh.sh + sudo rm 
-rf /.secret/jwt.key + sudo rm -rf /.secret/jwt_public.key + sudo rm -rf /.secret/jwt_token.key echo "Done removing secrets" ls /.secret/ @@ -88,6 +93,31 @@ _copy_secrets() { rm -f /home/worker/setup-worker-ssh.sh } +_openssl_jwt_key() { + cd /.secret + openssl rand -base64 32 > jwt.key + # openssl genpkey -algorithm RSA -out jwt.key -pkeyopt rsa_keygen_bits:2048 + # openssl rsa -pubout -in jwt.key -out jwt_public.key + cd .. +} + +_generate_jwt_token() { + PEM=$(cat /etc/config/jwt.key) + USER=\"slurm\" + NOW=$(date +%s) + IAT="${NOW}" + EXP=$((${NOW} + 3600000)) + HEADER_RAW='{"alg":"HS256", "typ":"JWT"}' + HEADER=$(echo -n "${HEADER_RAW}" | openssl base64 | tr -d '=' | tr '/+' '_-' | tr -d '\n') + PAYLOAD_RAW='{"iss":'${USER}'}' + PAYLOAD=$(echo -n "${PAYLOAD_RAW}" | openssl base64 | tr -d '=' | tr '/+' '_-' | tr -d '\n') + HEADER_PAYLOAD="${HEADER}"."${PAYLOAD}" + SIGNATURE=$(openssl dgst -sha256 -sign <(echo -n "${PEM}") <(echo -n "${HEADER_PAYLOAD}") | openssl base64 | tr -d '=' | tr '/+' '_-' | tr -d '\n') + JWT="${HEADER_PAYLOAD}"."${SIGNATURE}" + echo $JWT | cat >/.secret/jwt_token.txt + chmod 777 /.secret/jwt_token.txt +} + # run slurmctld _slurmctld() { cd /root/rpmbuild/RPMS/$ARCH @@ -105,19 +135,22 @@ _slurmctld() { echo "" mkdir -p /var/spool/slurm/ctld /var/spool/slurm/d /var/log/slurm /etc/slurm /var/run/slurm/d /var/run/slurm/ctld /var/lib/slurm/d /var/lib/slurm/ctld chown -R slurm: /var/spool/slurm/ctld /var/spool/slurm/d /var/log/slurm /var/spool /var/lib /var/run/slurm/d /var/run/slurm/ctld /var/lib/slurm/d /var/lib/slurm/ctld + mkdir -p /etc/config + chown -R slurm: /etc/config + touch /var/log/slurmctld.log - chown slurm: /var/log/slurmctld.log + chown -R slurm: /var/log/slurmctld.log touch /var/log/slurmd.log - chown slurm: /var/log/slurmd.log + chown -R slurm: /var/log/slurmd.log touch /var/lib/slurm/d/job_state - chown slurm: /var/lib/slurm/d/job_state + chown -R slurm: /var/lib/slurm/d/job_state touch /var/lib/slurm/d/fed_mgr_state - chown 
slurm: /var/lib/slurm/d/fed_mgr_state + chown -R slurm: /var/lib/slurm/d/fed_mgr_state touch /var/run/slurm/d/slurmctld.pid - chown slurm: /var/run/slurm/d/slurmctld.pid + chown -R slurm: /var/run/slurm/d/slurmctld.pid touch /var/run/slurm/d/slurmd.pid - chown slurm: /var/run/slurm/d/slurmd.pid + chown -R slurm: /var/run/slurm/d/slurmd.pid if [[ ! -f /home/config/slurm.conf ]]; then echo "### Missing slurm.conf ###" @@ -129,6 +162,19 @@ _slurmctld() { chmod 600 /etc/slurm/slurm.conf fi + _openssl_jwt_key + + if [ ! -f /.secret/jwt.key ]; then + echo "### Missing jwt.key ###" + exit 1 + else + cp /.secret/jwt.key /etc/config/jwt.key + chown slurm: /etc/config/jwt.key + chmod 0400 /etc/config/jwt.key + fi + + _generate_jwt_token + sudo yum install -y nc sudo yum install -y procps sudo yum install -y iputils @@ -149,6 +195,7 @@ _slurmctld() { ### main ### _delete_secrets _sshd_host + _ssh_worker _munge_start _copy_secrets diff --git a/slurm/controller/slurm.conf b/slurm/controller/slurm.conf index f41d0f5..83c9f24 100644 --- a/slurm/controller/slurm.conf +++ b/slurm/controller/slurm.conf @@ -22,6 +22,8 @@ MpiDefault=none SlurmctldPidFile=/var/run/slurm/d/slurmctld.pid SlurmdPidFile=/var/run/slurm/d/slurmd.pid ProctrackType=proctrack/linuxproc +AuthAltTypes=auth/jwt +AuthAltParameters=jwt_key=/etc/config/jwt.key #PluginDir= #CacheGroups=0 #FirstJobId= diff --git a/slurm/database/docker-entrypoint.sh b/slurm/database/docker-entrypoint.sh index 3f74437..2b968fb 100755 --- a/slurm/database/docker-entrypoint.sh +++ b/slurm/database/docker-entrypoint.sh @@ -4,7 +4,7 @@ set -e # Determine the system architecture dynamically ARCH=$(uname -m) SLURM_VERSION="24.05.3" - +SLURM_JWT=daemon SLURM_ACCT_DB_SQL=/slurm_acct_db.sql # start sshd server @@ -52,12 +52,16 @@ _wait_for_worker() { # run slurmdbd _slurmdbd() { - cd /root/rpmbuild/RPMS/$ARCH - yum -y --nogpgcheck localinstall slurm-$SLURM_VERSION*.$ARCH.rpm \ - slurm-perlapi-$SLURM_VERSION*.$ARCH.rpm \ - 
slurm-slurmdbd-$SLURM_VERSION*.$ARCH.rpm + cd /root/rpmbuild/RPMS/$ARCH + yum -y --nogpgcheck localinstall slurm-$SLURM_VERSION*.$ARCH.rpm \ + slurm-perlapi-$SLURM_VERSION*.$ARCH.rpm \ + slurm-slurmdbd-$SLURM_VERSION*.$ARCH.rpm mkdir -p /var/spool/slurm/d /var/log/slurm /etc/slurm - chown slurm: /var/spool/slurm/d /var/log/slurm + chown -R slurm: /var/spool/slurm/d /var/log/slurm + + mkdir -p /etc/config + chown -R slurm: /etc/config + if [[ ! -f /home/config/slurmdbd.conf ]]; then echo "### Missing slurmdbd.conf ###" exit @@ -67,8 +71,26 @@ _slurmdbd() { chown slurm: /etc/slurm/slurmdbd.conf chmod 600 /etc/slurm/slurmdbd.conf fi - echo "Starting slurmdbd" + + echo -n "checking for jwt.key" + while [ ! -f /.secret/jwt.key ]; do + echo -n "." + sleep 1 + done + + cp /.secret/jwt.key /etc/config/jwt.key + chown slurm: /etc/config/jwt.key + chmod 0400 /etc/config/jwt.key + + echo "" + + sudo yum install -y nc + sudo yum install -y procps + sudo yum install -y iputils + cp /etc/slurm/slurmdbd.conf /.secret/slurmdbd.conf + + echo "Starting slurmdbd" /usr/sbin/slurmdbd -Dvv echo "Started slurmdbd" } diff --git a/slurm/database/slurmdbd.conf b/slurm/database/slurmdbd.conf index d584535..1be920c 100644 --- a/slurm/database/slurmdbd.conf +++ b/slurm/database/slurmdbd.conf @@ -14,7 +14,8 @@ # Authentication info AuthType=auth/munge #AuthInfo=/var/run/munge/munge.socket.2 -# +AuthAltTypes=auth/jwt +AuthAltParameters=jwt_key=/etc/config/jwt.key # slurmDBD info DbdAddr=slurmdbd DbdHost=slurmdbd diff --git a/slurm/rest/Dockerfile b/slurm/rest/Dockerfile index a111d6b..664921d 100644 --- a/slurm/rest/Dockerfile +++ b/slurm/rest/Dockerfile @@ -1,10 +1,15 @@ FROM clustercockpit/slurm.base:24.05.3 LABEL org.opencontainers.image.authors="jan.eitzinger@fau.de" +ARG uid_u +ARG gid_g +ENV uid_u=${uid_u} +ENV gid_g=${gid_g} + # clean up RUN rm -f /root/rpmbuild/RPMS/slurm-*.rpm \ && yum clean all \ && rm -rf /var/cache/yum COPY docker-entrypoint.sh /docker-entrypoint.sh -ENTRYPOINT 
["/docker-entrypoint.sh"] +ENTRYPOINT /docker-entrypoint.sh $uid_u $gid_g diff --git a/slurm/rest/docker-entrypoint.sh b/slurm/rest/docker-entrypoint.sh index f5ef0dd..fc0f726 100755 --- a/slurm/rest/docker-entrypoint.sh +++ b/slurm/rest/docker-entrypoint.sh @@ -4,6 +4,18 @@ set -e # Determine the system architecture dynamically ARCH=$(uname -m) SLURM_VERSION="24.05.3" +SLURMRESTD="/tmp/slurmrestd.socket" +# SLURM_JWT=daemon + +uid_u="${1:-}" +gid_g="${2:-}" + +echo Your container args are: "$@" + +# Change the uid +# usermod -u "${uid_u}" slurm +# Change the gid +# groupmod -g "${gid_g}" slurm # start sshd server _sshd_host() { @@ -14,7 +26,6 @@ _sshd_host() { /usr/sbin/sshd } -# start munge and generate key # start munge using existing key _munge_start_using_key() { if [ ! -f /.secret/munge.key ]; then @@ -37,6 +48,48 @@ _munge_start_using_key() { remunge } +_enable_slurmrestd() { + + cd /tmp + mkdir statesave + dd if=/dev/random of=/tmp/statesave/jwt_hs256.key bs=32 count=1 + chown slurm:slurm /tmp/statesave/jwt_hs256.key + chmod 0600 /tmp/statesave/jwt_hs256.key + chown slurm:slurm /tmp/statesave + chmod 0755 /tmp/statesave + + cat >/usr/lib/systemd/system/slurmrestd.service < Date: Sun, 27 Oct 2024 20:17:56 +0100 Subject: [PATCH 11/25] fix: Fix dependency bugs and expose Unix socket --- curl_slurmrestd.sh | 3 ++- docker-compose.yml | 1 + slurm/controller/docker-entrypoint.sh | 5 +++-- slurm/database/docker-entrypoint.sh | 7 +++---- slurm/rest/docker-entrypoint.sh | 12 +++++++----- 5 files changed, 16 insertions(+), 12 deletions(-) diff --git a/curl_slurmrestd.sh b/curl_slurmrestd.sh index e3826ee..a52cd70 100755 --- a/curl_slurmrestd.sh +++ b/curl_slurmrestd.sh @@ -1,3 +1,4 @@ 
JWT="eyJhbGciOiJSUzI1NiIsICJ0eXAiOiJKV1QifQ.eyJpc3MiOiJzbHVybSJ9.dzAHf1Ojoa149uRCCWY1eP3vDyCIZCOZ3h554R-KJJ8-OP0CJ0ymvSkFISLcYcyd9vVKmaYdSN3tWEF6bNZEmyX7G560i1MbkNFvhkhNVSPLKEKNPs38h5ra3ZlTlLlxAlDzXRAAn6UEEgKdm5vx4Jhec7ptaRL_zeSFpTS5fJPc0QE1Cm7e7nU39-9e8l4WU4KpRMxT6ANFm22_G4-mSA-AgCAvKQFzj2FInKsXDUTGlliNJuAgFxf-9LQxoeAknOQhEqcTXii_yBy9DNcT03pdNcAu5Ru4_qlX62vroInU_eh5mWQyiUdXN9Wj_OfMmfLoYFkJeUFYexBMZnSBgg" -curl -X 'GET' -v 'http://localhost:6820/slurm/v0.0.39/ping' -H "X-SLURM-USER-NAME:slurm" -H "X-SLURM-USER-TOKEN:$SLURM_JWT" +# curl -X 'GET' -v 'http://localhost:6820/slurm/v0.0.39/ping' -H "X-SLURM-USER-NAME:slurm" -H "X-SLURM-USER-TOKEN:$SLURM_JWT" +curl -v --unix-socket data/slurm/tmp/slurmrestd.socket 'http://localhost:6820/slurm/v0.0.39/ping' \ No newline at end of file diff --git a/docker-compose.yml b/docker-compose.yml index f04fd8b..51e3354 100755 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -72,6 +72,7 @@ services: volumes: - ${DATADIR}/slurm/home:/home - ${DATADIR}/slurm/secret:/.secret + - ${DATADIR}/slurm/tmp:/tmp:rw - ./slurm/controller/slurm.conf:/home/config/slurm.conf - /etc/timezone:/etc/timezone:ro - /etc/localtime:/etc/localtime:ro diff --git a/slurm/controller/docker-entrypoint.sh b/slurm/controller/docker-entrypoint.sh index 2871c8d..6fca56d 100755 --- a/slurm/controller/docker-entrypoint.sh +++ b/slurm/controller/docker-entrypoint.sh @@ -129,7 +129,7 @@ _slurmctld() { slurm-slurmctld-$SLURM_VERSION*.$ARCH.rpm echo "checking for slurmdbd.conf" while [ ! -f /.secret/slurmdbd.conf ]; do - echo -n "." + echo "." sleep 1 done echo "" @@ -170,7 +170,7 @@ _slurmctld() { else cp /.secret/jwt.key /etc/config/jwt.key chown slurm: /etc/config/jwt.key - chmod 0400 /etc/config/jwt.key + chmod 0600 /etc/config/jwt.key fi _generate_jwt_token @@ -178,6 +178,7 @@ _slurmctld() { sudo yum install -y nc sudo yum install -y procps sudo yum install -y iputils + sudo yum install -y lsof while ! 
nc -z slurmdbd 6819; do echo "Waiting for slurmdbd to be ready..." diff --git a/slurm/database/docker-entrypoint.sh b/slurm/database/docker-entrypoint.sh index 2b968fb..c9227ae 100755 --- a/slurm/database/docker-entrypoint.sh +++ b/slurm/database/docker-entrypoint.sh @@ -70,11 +70,12 @@ _slurmdbd() { cp /home/config/slurmdbd.conf /etc/slurm/slurmdbd.conf chown slurm: /etc/slurm/slurmdbd.conf chmod 600 /etc/slurm/slurmdbd.conf + cp /etc/slurm/slurmdbd.conf /.secret/slurmdbd.conf fi - echo -n "checking for jwt.key" + echo "checking for jwt.key" while [ ! -f /.secret/jwt.key ]; do - echo -n "." + echo "." sleep 1 done @@ -87,8 +88,6 @@ _slurmdbd() { sudo yum install -y nc sudo yum install -y procps sudo yum install -y iputils - - cp /etc/slurm/slurmdbd.conf /.secret/slurmdbd.conf echo "Starting slurmdbd" /usr/sbin/slurmdbd -Dvv diff --git a/slurm/rest/docker-entrypoint.sh b/slurm/rest/docker-entrypoint.sh index fc0f726..3c0cddd 100755 --- a/slurm/rest/docker-entrypoint.sh +++ b/slurm/rest/docker-entrypoint.sh @@ -113,8 +113,8 @@ _slurmrestd() { touch /var/log/slurmrestd.log chown slurm: /var/log/slurmrestd.log - chown slurm: /tmp - chmod 777 /tmp + chown worker: /tmp + chmod 770 /tmp if [[ ! -f /home/config/slurmrestd.conf ]]; then echo "### Missing slurm.conf ###" @@ -125,15 +125,17 @@ _slurmrestd() { cp /home/config/slurm.conf /etc/config/slurm.conf fi - echo -n "checking for jwt.key" + echo "checking for jwt.key" while [ ! -f /.secret/jwt.key ]; do - echo -n "." + echo "." 
sleep 1 done sudo yum install -y nc sudo yum install -y procps sudo yum install -y iputils + sudo yum install -y lsof + sudo yum install -y socat cp /.secret/jwt.key /etc/config/jwt.key chown slurm: /etc/config/jwt.key @@ -146,7 +148,7 @@ _slurmrestd() { # _enable_slurmrestd # sudo ln -s /usr/lib/systemd/system/slurmrestd.service /etc/systemd/system/multi-user.target.wants/slurmrestd.service - /usr/sbin/slurmrestd -f /etc/config/slurmrestd.conf -vvvvvv -s dbv0.0.39,v0.0.39 -u slurm unix:$SLURMRESTD 0.0.0.0:6820 + /usr/sbin/slurmrestd -f /etc/config/slurmrestd.conf -vvvvvv -s dbv0.0.39,v0.0.39 -u worker unix:$SLURMRESTD 0.0.0.0:6820 echo "Started slurmrestd" } From 255f05bee7156087b8c7b8d99ac4dd09377a19eb Mon Sep 17 00:00:00 2001 From: Aditya Ujeniya Date: Sun, 27 Oct 2024 23:41:19 +0100 Subject: [PATCH 12/25] fix + feat: working JWT auth for slurm restd and other daemons --- curl_slurmrestd.sh | 7 +-- docker-compose.yml | 11 +--- jwt_verifier.py | 27 ++++++++ slurm/base/Dockerfile | 8 +-- slurm/controller/docker-entrypoint.sh | 89 +++++++++++++++++---------- slurm/controller/slurm.conf | 6 +- slurm/database/docker-entrypoint.sh | 11 ++-- slurm/database/slurmdbd.conf | 2 +- slurm/rest/Dockerfile | 7 +-- slurm/rest/docker-entrypoint.sh | 40 ++++-------- 10 files changed, 115 insertions(+), 93 deletions(-) create mode 100644 jwt_verifier.py diff --git a/curl_slurmrestd.sh b/curl_slurmrestd.sh index a52cd70..dc506ff 100755 --- a/curl_slurmrestd.sh +++ b/curl_slurmrestd.sh @@ -1,4 +1,3 @@ -JWT="eyJhbGciOiJSUzI1NiIsICJ0eXAiOiJKV1QifQ.eyJpc3MiOiJzbHVybSJ9.dzAHf1Ojoa149uRCCWY1eP3vDyCIZCOZ3h554R-KJJ8-OP0CJ0ymvSkFISLcYcyd9vVKmaYdSN3tWEF6bNZEmyX7G560i1MbkNFvhkhNVSPLKEKNPs38h5ra3ZlTlLlxAlDzXRAAn6UEEgKdm5vx4Jhec7ptaRL_zeSFpTS5fJPc0QE1Cm7e7nU39-9e8l4WU4KpRMxT6ANFm22_G4-mSA-AgCAvKQFzj2FInKsXDUTGlliNJuAgFxf-9LQxoeAknOQhEqcTXii_yBy9DNcT03pdNcAu5Ru4_qlX62vroInU_eh5mWQyiUdXN9Wj_OfMmfLoYFkJeUFYexBMZnSBgg" - -# curl -X 'GET' -v 'http://localhost:6820/slurm/v0.0.39/ping' -H 
"X-SLURM-USER-NAME:slurm" -H "X-SLURM-USER-TOKEN:$SLURM_JWT" -curl -v --unix-socket data/slurm/tmp/slurmrestd.socket 'http://localhost:6820/slurm/v0.0.39/ping' \ No newline at end of file +SLURM_JWT=$(cat data/slurm/secret/jwt_token.txt) +curl -X 'GET' -v 'http://localhost:6820/slurm/v0.0.39/ping' --location --silent --show-error -H "X-SLURM-USER-NAME: root" -H "X-SLURM-USER-TOKEN: $SLURM_JWT" +# curl -v --unix-socket data/slurm/tmp/slurmrestd.socket 'http://localhost:6820/slurm/v0.0.39/ping' \ No newline at end of file diff --git a/docker-compose.yml b/docker-compose.yml index 51e3354..f10d9ed 100755 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -72,7 +72,6 @@ services: volumes: - ${DATADIR}/slurm/home:/home - ${DATADIR}/slurm/secret:/.secret - - ${DATADIR}/slurm/tmp:/tmp:rw - ./slurm/controller/slurm.conf:/home/config/slurm.conf - /etc/timezone:/etc/timezone:ro - /etc/localtime:/etc/localtime:ro @@ -92,11 +91,9 @@ services: volumes: - ${DATADIR}/slurm/home:/home - ${DATADIR}/slurm/secret:/.secret - - ${DATADIR}/slurm/tmp:/tmp:rw - ./slurm/database/slurmdbd.conf:/home/config/slurmdbd.conf - /etc/timezone:/etc/timezone:ro - /etc/localtime:/etc/localtime:ro - - ${DATADIR}/slurm/state:/var/lib/slurm/d ports: - "6819:6819" @@ -111,7 +108,6 @@ services: volumes: - ${DATADIR}/slurm/home:/home - ${DATADIR}/slurm/secret:/.secret - - ${DATADIR}/slurm/tmp:/tmp:rw - ./slurm/worker/cgroup.conf:/home/config/cgroup.conf - ./slurm/controller/slurm.conf:/home/config/slurm.conf - /etc/timezone:/etc/timezone:ro @@ -124,16 +120,15 @@ services: hostname: slurmrestd build: context: ./slurm/rest - args: - uid_u: ${UID_U} - gid_g: ${GID_G} + environment: + - SLURM_JWT=daemon + - SLURMRESTD_DEBUG=9 depends_on: - slurmctld privileged: true volumes: - ${DATADIR}/slurm/home:/home - ${DATADIR}/slurm/secret:/.secret - - ${DATADIR}/slurm/tmp:/tmp:rw - ./slurm/controller/slurm.conf:/home/config/slurm.conf - ./slurm/rest/slurmrestd.conf:/home/config/slurmrestd.conf - 
/etc/timezone:/etc/timezone:ro diff --git a/jwt_verifier.py b/jwt_verifier.py new file mode 100644 index 0000000..e9ec78e --- /dev/null +++ b/jwt_verifier.py @@ -0,0 +1,27 @@ +#!/usr/bin/env python3 +import sys +import os +import pprint +import json +import time +from datetime import datetime, timedelta, timezone + +from jwt import JWT +from jwt.jwa import HS256 +from jwt.jwk import jwk_from_dict +from jwt.utils import b64decode,b64encode + +if len(sys.argv) != 2: + sys.exit("verify_jwt.py [JWT Token]"); + +with open("data/slurm/secret/jwt_hs256.key", "rb") as f: + priv_key = f.read() + +signing_key = jwk_from_dict({ + 'kty': 'oct', + 'k': b64encode(priv_key) +}) + +a = JWT() +b = a.decode(sys.argv[1], signing_key, algorithms=["HS256"]) +print(b) \ No newline at end of file diff --git a/slurm/base/Dockerfile b/slurm/base/Dockerfile index f47588e..ca6b27f 100644 --- a/slurm/base/Dockerfile +++ b/slurm/base/Dockerfile @@ -9,10 +9,10 @@ RUN ARCH=$(uname -m) && yum install -y https://rpmfind.net/linux/almalinux/8.10/ RUN groupadd -g 981 munge \ && useradd -m -c "MUNGE Uid 'N' Gid Emporium" -d /var/lib/munge -u 981 -g munge -s /sbin/nologin munge \ - && groupadd -g 982 slurm \ - && useradd -m -c "Slurm workload manager" -d /var/lib/slurm -u 982 -g slurm -s /bin/bash slurm \ - && groupadd -g 1000 worker \ - && useradd -m -c "Workflow user" -d /home/worker -u 1000 -g worker -s /bin/bash worker + && groupadd -g 1000 slurm \ + && useradd -m -c "Slurm workload manager" -d /var/lib/slurm -u 1000 -g slurm -s /bin/bash slurm \ + && groupadd -g 982 worker \ + && useradd -m -c "Workflow user" -d /home/worker -u 982 -g worker -s /bin/bash worker RUN yum install -y munge munge-libs rng-tools \ python3 gcc openssl openssl-devel \ diff --git a/slurm/controller/docker-entrypoint.sh b/slurm/controller/docker-entrypoint.sh index 6fca56d..72ac3c9 100755 --- a/slurm/controller/docker-entrypoint.sh +++ b/slurm/controller/docker-entrypoint.sh @@ -13,9 +13,8 @@ _delete_secrets() { sudo rm 
-rf /.secret/munge.key sudo rm -rf /.secret/worker-secret.tar.gz sudo rm -rf /.secret/setup-worker-ssh.sh - sudo rm -rf /.secret/jwt.key - sudo rm -rf /.secret/jwt_public.key - sudo rm -rf /.secret/jwt_token.key + sudo rm -rf /.secret/jwt_hs256.key + sudo rm -rf /.secret/jwt_token.txt echo "Done removing secrets" ls /.secret/ @@ -94,27 +93,48 @@ _copy_secrets() { } _openssl_jwt_key() { - cd /.secret - openssl rand -base64 32 > jwt.key - # openssl genpkey -algorithm RSA -out jwt.key -pkeyopt rsa_keygen_bits:2048 - # openssl rsa -pubout -in jwt.key -out jwt_public.key - cd .. + + mkdir -p /var/spool/slurm/statesave + dd if=/dev/random of=/var/spool/slurm/statesave/jwt_hs256.key bs=32 count=1 + chown slurm:slurm /var/spool/slurm/statesave/jwt_hs256.key + chmod 0600 /var/spool/slurm/statesave/jwt_hs256.key + chown slurm:slurm /var/spool/slurm/statesave + chmod 0755 /var/spool/slurm/statesave + cp /var/spool/slurm/statesave/jwt_hs256.key /.secret/jwt_hs256.key + chmod 777 /.secret/jwt_hs256.key } _generate_jwt_token() { - PEM=$(cat /etc/config/jwt.key) - USER=\"slurm\" - NOW=$(date +%s) - IAT="${NOW}" - EXP=$((${NOW} + 3600000)) - HEADER_RAW='{"alg":"HS256", "typ":"JWT"}' - HEADER=$(echo -n "${HEADER_RAW}" | openssl base64 | tr -d '=' | tr '/+' '_-' | tr -d '\n') - PAYLOAD_RAW='{"iss":'${USER}'}' - PAYLOAD=$(echo -n "${PAYLOAD_RAW}" | openssl base64 | tr -d '=' | tr '/+' '_-' | tr -d '\n') - HEADER_PAYLOAD="${HEADER}"."${PAYLOAD}" - SIGNATURE=$(openssl dgst -sha256 -sign <(echo -n "${PEM}") <(echo -n "${HEADER_PAYLOAD}") | openssl base64 | tr -d '=' | tr '/+' '_-' | tr -d '\n') - JWT="${HEADER_PAYLOAD}"."${SIGNATURE}" - echo $JWT | cat >/.secret/jwt_token.txt + + secret_key=$(cat /var/spool/slurm/statesave/jwt_hs256.key) + start_time=$(date +%s) + exp_time=$((start_time + 100000000)) + base64url() { + # Don't wrap, make URL-safe, delete trailer. 
+ base64 -w 0 | tr '+/' '-_' | tr -d '=' + } + + jwt_header=$(echo -n '{"alg":"HS256","typ":"JWT"}' | base64url) + + jwt_claims=$(cat < Monochrome output, compact output, join lines + + jwt_signature=$(echo -n "${jwt_header}.${jwt_claims}" | + openssl dgst -sha256 -hmac "$secret_key" -binary | base64url) + + # Use the same colours as jwt.io, more-or-less. + echo "$(tput setaf 1)${jwt_header}$(tput sgr0).$(tput setaf 5)${jwt_claims}$(tput sgr0).$(tput setaf 6)${jwt_signature}$(tput sgr0)" + + jwt="${jwt_header}.${jwt_claims}.${jwt_signature}" + + echo $jwt | cat >/.secret/jwt_token.txt chmod 777 /.secret/jwt_token.txt } @@ -162,23 +182,24 @@ _slurmctld() { chmod 600 /etc/slurm/slurm.conf fi - _openssl_jwt_key - - if [ ! -f /.secret/jwt.key ]; then - echo "### Missing jwt.key ###" - exit 1 - else - cp /.secret/jwt.key /etc/config/jwt.key - chown slurm: /etc/config/jwt.key - chmod 0600 /etc/config/jwt.key - fi - - _generate_jwt_token - sudo yum install -y nc sudo yum install -y procps sudo yum install -y iputils sudo yum install -y lsof + sudo yum install -y jq + + _openssl_jwt_key + + if [ ! -f /.secret/jwt_hs256.key ]; then + echo "### Missing jwt.key ###" + exit 1 + else + cp /.secret/jwt_hs256.key /etc/config/jwt_hs256.key + chown slurm: /etc/config/jwt_hs256.key + chmod 0600 /etc/config/jwt_hs256.key + fi + + _generate_jwt_token while ! nc -z slurmdbd 6819; do echo "Waiting for slurmdbd to be ready..." 
diff --git a/slurm/controller/slurm.conf b/slurm/controller/slurm.conf index 83c9f24..8fbaee4 100644 --- a/slurm/controller/slurm.conf +++ b/slurm/controller/slurm.conf @@ -23,7 +23,7 @@ SlurmctldPidFile=/var/run/slurm/d/slurmctld.pid SlurmdPidFile=/var/run/slurm/d/slurmd.pid ProctrackType=proctrack/linuxproc AuthAltTypes=auth/jwt -AuthAltParameters=jwt_key=/etc/config/jwt.key +AuthAltParameters=jwt_key=/var/spool/slurm/statesave/jwt_hs256.key #PluginDir= #CacheGroups=0 #FirstJobId= @@ -71,9 +71,9 @@ SelectTypeParameters=CR_CPU_Memory #PriorityMaxAge=1-0 # # LOGGING -SlurmctldDebug=3 +SlurmctldDebug=6 SlurmctldLogFile=/var/log/slurm/slurmctld.log -SlurmdDebug=3 +SlurmdDebug=6 SlurmdLogFile=/var/log/slurm/slurmd.log JobCompType=jobcomp/filetxt JobCompLoc=/var/log/slurm/jobcomp.log diff --git a/slurm/database/docker-entrypoint.sh b/slurm/database/docker-entrypoint.sh index c9227ae..62b967d 100755 --- a/slurm/database/docker-entrypoint.sh +++ b/slurm/database/docker-entrypoint.sh @@ -74,14 +74,17 @@ _slurmdbd() { fi echo "checking for jwt.key" - while [ ! -f /.secret/jwt.key ]; do + while [ ! -f /.secret/jwt_hs256.key ]; do echo "." 
sleep 1 done - cp /.secret/jwt.key /etc/config/jwt.key - chown slurm: /etc/config/jwt.key - chmod 0400 /etc/config/jwt.key + mkdir -p /var/spool/slurm/statesave + chown slurm:slurm /var/spool/slurm/statesave + chmod 0755 /var/spool/slurm/statesave + cp /.secret/jwt_hs256.key /var/spool/slurm/statesave/jwt_hs256.key + chown slurm: /var/spool/slurm/statesave/jwt_hs256.key + chmod 0600 /var/spool/slurm/statesave/jwt_hs256.key echo "" diff --git a/slurm/database/slurmdbd.conf b/slurm/database/slurmdbd.conf index 1be920c..884a988 100644 --- a/slurm/database/slurmdbd.conf +++ b/slurm/database/slurmdbd.conf @@ -15,7 +15,7 @@ AuthType=auth/munge #AuthInfo=/var/run/munge/munge.socket.2 AuthAltTypes=auth/jwt -AuthAltParameters=jwt_key=/etc/config/jwt.key +AuthAltParameters=jwt_key=/var/spool/slurm/statesave/jwt_hs256.key # slurmDBD info DbdAddr=slurmdbd DbdHost=slurmdbd diff --git a/slurm/rest/Dockerfile b/slurm/rest/Dockerfile index 664921d..a111d6b 100644 --- a/slurm/rest/Dockerfile +++ b/slurm/rest/Dockerfile @@ -1,15 +1,10 @@ FROM clustercockpit/slurm.base:24.05.3 LABEL org.opencontainers.image.authors="jan.eitzinger@fau.de" -ARG uid_u -ARG gid_g -ENV uid_u=${uid_u} -ENV gid_g=${gid_g} - # clean up RUN rm -f /root/rpmbuild/RPMS/slurm-*.rpm \ && yum clean all \ && rm -rf /var/cache/yum COPY docker-entrypoint.sh /docker-entrypoint.sh -ENTRYPOINT /docker-entrypoint.sh $uid_u $gid_g +ENTRYPOINT ["/docker-entrypoint.sh"] diff --git a/slurm/rest/docker-entrypoint.sh b/slurm/rest/docker-entrypoint.sh index 3c0cddd..146ceff 100755 --- a/slurm/rest/docker-entrypoint.sh +++ b/slurm/rest/docker-entrypoint.sh @@ -4,18 +4,8 @@ set -e # Determine the system architecture dynamically ARCH=$(uname -m) SLURM_VERSION="24.05.3" -SLURMRESTD="/tmp/slurmrestd.socket" -# SLURM_JWT=daemon - -uid_u="${1:-}" -gid_g="${2:-}" - -echo Your container args are: "$@" - -# Change the uid -# usermod -u "${uid_u}" slurm -# Change the gid -# groupmod -g "${gid_g}" slurm +# 
SLURMRESTD="/tmp/slurmrestd.socket" +SLURM_JWT=daemon # start sshd server _sshd_host() { @@ -50,14 +40,6 @@ _munge_start_using_key() { _enable_slurmrestd() { - cd /tmp - mkdir statesave - dd if=/dev/random of=/tmp/statesave/jwt_hs256.key bs=32 count=1 - chown slurm:slurm /tmp/statesave/jwt_hs256.key - chmod 0600 /tmp/statesave/jwt_hs256.key - chown slurm:slurm /tmp/statesave - chmod 0755 /tmp/statesave - cat >/usr/lib/systemd/system/slurmrestd.service < Date: Tue, 29 Oct 2024 16:58:43 +0100 Subject: [PATCH 13/25] update: adding CMD to all dockerfiles --- slurm/controller/Dockerfile | 1 + slurm/database/Dockerfile | 1 + slurm/rest/Dockerfile | 1 + slurm/worker/Dockerfile | 1 + 4 files changed, 4 insertions(+) diff --git a/slurm/controller/Dockerfile b/slurm/controller/Dockerfile index a111d6b..470748d 100644 --- a/slurm/controller/Dockerfile +++ b/slurm/controller/Dockerfile @@ -7,4 +7,5 @@ RUN rm -f /root/rpmbuild/RPMS/slurm-*.rpm \ && rm -rf /var/cache/yum COPY docker-entrypoint.sh /docker-entrypoint.sh +CMD ["/usr/sbin/init"] ENTRYPOINT ["/docker-entrypoint.sh"] diff --git a/slurm/database/Dockerfile b/slurm/database/Dockerfile index a111d6b..470748d 100644 --- a/slurm/database/Dockerfile +++ b/slurm/database/Dockerfile @@ -7,4 +7,5 @@ RUN rm -f /root/rpmbuild/RPMS/slurm-*.rpm \ && rm -rf /var/cache/yum COPY docker-entrypoint.sh /docker-entrypoint.sh +CMD ["/usr/sbin/init"] ENTRYPOINT ["/docker-entrypoint.sh"] diff --git a/slurm/rest/Dockerfile b/slurm/rest/Dockerfile index a111d6b..470748d 100644 --- a/slurm/rest/Dockerfile +++ b/slurm/rest/Dockerfile @@ -7,4 +7,5 @@ RUN rm -f /root/rpmbuild/RPMS/slurm-*.rpm \ && rm -rf /var/cache/yum COPY docker-entrypoint.sh /docker-entrypoint.sh +CMD ["/usr/sbin/init"] ENTRYPOINT ["/docker-entrypoint.sh"] diff --git a/slurm/worker/Dockerfile b/slurm/worker/Dockerfile index 556fcbc..85ed88b 100644 --- a/slurm/worker/Dockerfile +++ b/slurm/worker/Dockerfile @@ -8,4 +8,5 @@ RUN rm -f /root/rpmbuild/RPMS/slurm-*.rpm \ WORKDIR 
/home/worker COPY docker-entrypoint.sh /docker-entrypoint.sh +CMD ["/usr/sbin/init"] ENTRYPOINT ["/docker-entrypoint.sh"] From 325ab9b27d4e0d11566e90a027d447976161460b Mon Sep 17 00:00:00 2001 From: Aditya Date: Wed, 13 Nov 2024 13:14:41 +0000 Subject: [PATCH 14/25] Update to CgroupPlugin --- cc-metric-store/Dockerfile | 20 -------------------- cc-metric-store/config.json | 28 ---------------------------- curl_slurmrestd.sh | 2 +- slurm/controller/slurm.conf | 10 +++++----- slurm/worker/cgroup.conf | 2 +- slurm/worker/docker-entrypoint.sh | 7 ++++--- 6 files changed, 11 insertions(+), 58 deletions(-) delete mode 100644 cc-metric-store/Dockerfile delete mode 100644 cc-metric-store/config.json diff --git a/cc-metric-store/Dockerfile b/cc-metric-store/Dockerfile deleted file mode 100644 index eb7aa48..0000000 --- a/cc-metric-store/Dockerfile +++ /dev/null @@ -1,20 +0,0 @@ -FROM golang:1.22.4 - -RUN apt-get update -RUN apt-get -y install git - -RUN git clone https://github.com/ClusterCockpit/cc-metric-store.git /cc-metric-store -RUN ls -RUN cd /cc-metric-store && go build ./cmd/cc-metric-store - -# Reactivate when latest commit is available -#RUN go get -d -v github.com/ClusterCockpit/cc-metric-store -#RUN go install -v github.com/ClusterCockpit/cc-metric-store@latest - -RUN mv /cc-metric-store/cc-metric-store /go/bin -COPY config.json /go/bin - -VOLUME /data - -WORKDIR /go/bin -CMD ["./cc-metric-store"] diff --git a/cc-metric-store/config.json b/cc-metric-store/config.json deleted file mode 100644 index 674c67c..0000000 --- a/cc-metric-store/config.json +++ /dev/null @@ -1,28 +0,0 @@ -{ - "metrics": { - "clock": { "frequency": 60, "aggregation": null, "scope": "node" }, - "cpi": { "frequency": 60, "aggregation": null, "scope": "node" }, - "cpu_load": { "frequency": 60, "aggregation": null, "scope": "node" }, - "flops_any": { "frequency": 60, "aggregation": null, "scope": "node" }, - "flops_dp": { "frequency": 60, "aggregation": null, "scope": "node" }, - "flops_sp": { 
"frequency": 60, "aggregation": null, "scope": "node" }, - "ib_bw": { "frequency": 60, "aggregation": null, "scope": "node" }, - "lustre_bw": { "frequency": 60, "aggregation": null, "scope": "node" }, - "mem_bw": { "frequency": 60, "aggregation": null, "scope": "node" }, - "mem_used": { "frequency": 60, "aggregation": null, "scope": "node" }, - "rapl_power": { "frequency": 60, "aggregation": null, "scope": "node" } - }, - "checkpoints": { - "interval": 100000000000, - "directory": "/data/checkpoints", - "restore": 100000000000 - }, - "archive": { - "interval": 100000000000, - "directory": "/data/archive" - }, - "retention-in-memory": 100000000000, - "http-api-address": "0.0.0.0:8081", - "nats": "nats://cc-nats:4222", - "jwt-public-key": "kzfYrYy+TzpanWZHJ5qSdMj5uKUWgq74BWhQG6copP0=" -} diff --git a/curl_slurmrestd.sh b/curl_slurmrestd.sh index dc506ff..8168267 100755 --- a/curl_slurmrestd.sh +++ b/curl_slurmrestd.sh @@ -1,3 +1,3 @@ SLURM_JWT=$(cat data/slurm/secret/jwt_token.txt) -curl -X 'GET' -v 'http://localhost:6820/slurm/v0.0.39/ping' --location --silent --show-error -H "X-SLURM-USER-NAME: root" -H "X-SLURM-USER-TOKEN: $SLURM_JWT" +curl -X 'GET' -v 'http://localhost:6820/slurm/v0.0.39/node/node01' --location --silent --show-error -H "X-SLURM-USER-NAME: root" -H "X-SLURM-USER-TOKEN: $SLURM_JWT" # curl -v --unix-socket data/slurm/tmp/slurmrestd.socket 'http://localhost:6820/slurm/v0.0.39/ping' \ No newline at end of file diff --git a/slurm/controller/slurm.conf b/slurm/controller/slurm.conf index 8fbaee4..f5a34ef 100644 --- a/slurm/controller/slurm.conf +++ b/slurm/controller/slurm.conf @@ -39,7 +39,7 @@ ReturnToService=0 #SrunEpilog= #TaskProlog= #TaskEpilog= -TaskPlugin=task/none +TaskPlugin=task/affinity #TrackWCKey=no #TreeWidth=50 #TmpFS= @@ -79,9 +79,9 @@ JobCompType=jobcomp/filetxt JobCompLoc=/var/log/slurm/jobcomp.log # # ACCOUNTING -#JobAcctGatherType=jobacct_gather/linux -JobAcctGatherType=jobacct_gather/cgroup -# ProctrackType=proctrack/cgroup 
+JobAcctGatherType=jobacct_gather/linux +#JobAcctGatherType=jobacct_gather/cgroup +#ProctrackType=proctrack/cgroup JobAcctGatherFrequency=30 # @@ -99,7 +99,7 @@ PartitionName=debug Nodes=node01 Default=YES MaxTime=INFINITE State=UP # # COMPUTE NODES # NodeName=c[1-2] RealMemory=1000 State=UNKNOWN -NodeName=node01 CPUs=2 Boards=1 SocketsPerBoard=2 CoresPerSocket=1 ThreadsPerCore=1 +NodeName=node01 CPUs=1 Boards=1 SocketsPerBoard=1 CoresPerSocket=1 ThreadsPerCore=1 # # # # PARTITIONS diff --git a/slurm/worker/cgroup.conf b/slurm/worker/cgroup.conf index f24d9d7..1f930c7 100644 --- a/slurm/worker/cgroup.conf +++ b/slurm/worker/cgroup.conf @@ -1,4 +1,4 @@ -CgroupPlugin=cgroup/v1 +CgroupPlugin=disabled ConstrainCores=yes ConstrainDevices=no ConstrainRAMSpace=yes diff --git a/slurm/worker/docker-entrypoint.sh b/slurm/worker/docker-entrypoint.sh index 090c3a0..e254c0a 100755 --- a/slurm/worker/docker-entrypoint.sh +++ b/slurm/worker/docker-entrypoint.sh @@ -78,9 +78,9 @@ _slurmd() { fi echo "found slurm.conf" - sudo yum install -y nc - sudo yum install -y procps - sudo yum install -y iputils + # sudo yum install -y nc + # sudo yum install -y procps + # sudo yum install -y iputils mkdir -p /var/spool/slurm/d /etc/slurm /var/run/slurm/d /var/log/slurm chown slurm: /var/spool/slurm/d /var/run/slurm/d /var/log/slurm @@ -98,6 +98,7 @@ _slurmd() { chown slurm: /var/run/slurm/d/slurmd.pid echo "Starting slurmd" + /usr/sbin/slurmstepd infinity & /usr/sbin/slurmd -Dvv echo "Started slurmd" } From 9bcc1cf456c9355d8f86e3814054bbf78dda661f Mon Sep 17 00:00:00 2001 From: Aditya Date: Wed, 13 Nov 2024 13:18:01 +0000 Subject: [PATCH 15/25] Undo deleted files --- cc-metric-store/Dockerfile | 20 ++++++++++++++++++++ cc-metric-store/config.json | 28 ++++++++++++++++++++++++++++ 2 files changed, 48 insertions(+) create mode 100644 cc-metric-store/Dockerfile create mode 100644 cc-metric-store/config.json diff --git a/cc-metric-store/Dockerfile b/cc-metric-store/Dockerfile new file mode 
100644 index 0000000..eb7aa48 --- /dev/null +++ b/cc-metric-store/Dockerfile @@ -0,0 +1,20 @@ +FROM golang:1.22.4 + +RUN apt-get update +RUN apt-get -y install git + +RUN git clone https://github.com/ClusterCockpit/cc-metric-store.git /cc-metric-store +RUN ls +RUN cd /cc-metric-store && go build ./cmd/cc-metric-store + +# Reactivate when latest commit is available +#RUN go get -d -v github.com/ClusterCockpit/cc-metric-store +#RUN go install -v github.com/ClusterCockpit/cc-metric-store@latest + +RUN mv /cc-metric-store/cc-metric-store /go/bin +COPY config.json /go/bin + +VOLUME /data + +WORKDIR /go/bin +CMD ["./cc-metric-store"] diff --git a/cc-metric-store/config.json b/cc-metric-store/config.json new file mode 100644 index 0000000..674c67c --- /dev/null +++ b/cc-metric-store/config.json @@ -0,0 +1,28 @@ +{ + "metrics": { + "clock": { "frequency": 60, "aggregation": null, "scope": "node" }, + "cpi": { "frequency": 60, "aggregation": null, "scope": "node" }, + "cpu_load": { "frequency": 60, "aggregation": null, "scope": "node" }, + "flops_any": { "frequency": 60, "aggregation": null, "scope": "node" }, + "flops_dp": { "frequency": 60, "aggregation": null, "scope": "node" }, + "flops_sp": { "frequency": 60, "aggregation": null, "scope": "node" }, + "ib_bw": { "frequency": 60, "aggregation": null, "scope": "node" }, + "lustre_bw": { "frequency": 60, "aggregation": null, "scope": "node" }, + "mem_bw": { "frequency": 60, "aggregation": null, "scope": "node" }, + "mem_used": { "frequency": 60, "aggregation": null, "scope": "node" }, + "rapl_power": { "frequency": 60, "aggregation": null, "scope": "node" } + }, + "checkpoints": { + "interval": 100000000000, + "directory": "/data/checkpoints", + "restore": 100000000000 + }, + "archive": { + "interval": 100000000000, + "directory": "/data/archive" + }, + "retention-in-memory": 100000000000, + "http-api-address": "0.0.0.0:8081", + "nats": "nats://cc-nats:4222", + "jwt-public-key": 
"kzfYrYy+TzpanWZHJ5qSdMj5uKUWgq74BWhQG6copP0=" +} From 861036e864e3be7794a831f10e699aca8debf2ea Mon Sep 17 00:00:00 2001 From: Aditya Date: Wed, 13 Nov 2024 16:57:26 +0000 Subject: [PATCH 16/25] Update to CCMS config --- .gitignore | 1 + cc-metric-store/Dockerfile | 1 + cc-metric-store/config.json | 197 ++++++++++++++++++++++++++++++++---- 3 files changed, 179 insertions(+), 20 deletions(-) diff --git a/.gitignore b/.gitignore index 147c94d..28989ba 100644 --- a/.gitignore +++ b/.gitignore @@ -7,6 +7,7 @@ data/cc-metric-store-source data/ldap data/mariadb data/slurm +data cc-backend cc-backend/** .vscode diff --git a/cc-metric-store/Dockerfile b/cc-metric-store/Dockerfile index eb7aa48..a06d075 100644 --- a/cc-metric-store/Dockerfile +++ b/cc-metric-store/Dockerfile @@ -17,4 +17,5 @@ COPY config.json /go/bin VOLUME /data WORKDIR /go/bin +RUN mkdir -p ./var/checkpoints CMD ["./cc-metric-store"] diff --git a/cc-metric-store/config.json b/cc-metric-store/config.json index 674c67c..29d4d28 100644 --- a/cc-metric-store/config.json +++ b/cc-metric-store/config.json @@ -1,28 +1,185 @@ { "metrics": { - "clock": { "frequency": 60, "aggregation": null, "scope": "node" }, - "cpi": { "frequency": 60, "aggregation": null, "scope": "node" }, - "cpu_load": { "frequency": 60, "aggregation": null, "scope": "node" }, - "flops_any": { "frequency": 60, "aggregation": null, "scope": "node" }, - "flops_dp": { "frequency": 60, "aggregation": null, "scope": "node" }, - "flops_sp": { "frequency": 60, "aggregation": null, "scope": "node" }, - "ib_bw": { "frequency": 60, "aggregation": null, "scope": "node" }, - "lustre_bw": { "frequency": 60, "aggregation": null, "scope": "node" }, - "mem_bw": { "frequency": 60, "aggregation": null, "scope": "node" }, - "mem_used": { "frequency": 60, "aggregation": null, "scope": "node" }, - "rapl_power": { "frequency": 60, "aggregation": null, "scope": "node" } + "debug_metric": { + "frequency": 60, + "aggregation": "avg" + }, + "clock": { + "frequency": 
60, + "aggregation": "avg" + }, + "cpu_idle": { + "frequency": 60, + "aggregation": "avg" + }, + "cpu_iowait": { + "frequency": 60, + "aggregation": "avg" + }, + "cpu_irq": { + "frequency": 60, + "aggregation": "avg" + }, + "cpu_system": { + "frequency": 60, + "aggregation": "avg" + }, + "cpu_user": { + "frequency": 60, + "aggregation": "avg" + }, + "nv_mem_util": { + "frequency": 60, + "aggregation": "avg" + }, + "nv_temp": { + "frequency": 60, + "aggregation": "avg" + }, + "nv_sm_clock": { + "frequency": 60, + "aggregation": "avg" + }, + "acc_utilization": { + "frequency": 60, + "aggregation": "avg" + }, + "acc_mem_used": { + "frequency": 60, + "aggregation": "sum" + }, + "acc_power": { + "frequency": 60, + "aggregation": "sum" + }, + "flops_any": { + "frequency": 60, + "aggregation": "sum" + }, + "flops_dp": { + "frequency": 60, + "aggregation": "sum" + }, + "flops_sp": { + "frequency": 60, + "aggregation": "sum" + }, + "ib_recv": { + "frequency": 60, + "aggregation": "sum" + }, + "ib_xmit": { + "frequency": 60, + "aggregation": "sum" + }, + "ib_recv_pkts": { + "frequency": 60, + "aggregation": "sum" + }, + "ib_xmit_pkts": { + "frequency": 60, + "aggregation": "sum" + }, + "cpu_power": { + "frequency": 60, + "aggregation": "sum" + }, + "core_power": { + "frequency": 60, + "aggregation": "sum" + }, + "mem_power": { + "frequency": 60, + "aggregation": "sum" + }, + "ipc": { + "frequency": 60, + "aggregation": "avg" + }, + "cpu_load": { + "frequency": 60, + "aggregation": null + }, + "lustre_close": { + "frequency": 60, + "aggregation": null + }, + "lustre_open": { + "frequency": 60, + "aggregation": null + }, + "lustre_statfs": { + "frequency": 60, + "aggregation": null + }, + "lustre_read_bytes": { + "frequency": 60, + "aggregation": null + }, + "lustre_write_bytes": { + "frequency": 60, + "aggregation": null + }, + "net_bw": { + "frequency": 60, + "aggregation": null + }, + "file_bw": { + "frequency": 60, + "aggregation": null + }, + "mem_bw": { + "frequency": 
60, + "aggregation": "sum" + }, + "mem_cached": { + "frequency": 60, + "aggregation": null + }, + "mem_used": { + "frequency": 60, + "aggregation": null + }, + "net_bytes_in": { + "frequency": 60, + "aggregation": null + }, + "net_bytes_out": { + "frequency": 60, + "aggregation": null + }, + "nfs4_read": { + "frequency": 60, + "aggregation": null + }, + "nfs4_total": { + "frequency": 60, + "aggregation": null + }, + "nfs4_write": { + "frequency": 60, + "aggregation": null + }, + "vectorization_ratio": { + "frequency": 60, + "aggregation": "avg" + } }, "checkpoints": { - "interval": 100000000000, - "directory": "/data/checkpoints", - "restore": 100000000000 + "interval": "12h", + "directory": "./var/checkpoints", + "restore": "48h" }, "archive": { - "interval": 100000000000, - "directory": "/data/archive" + "interval": "50h", + "directory": "./var/archive" }, - "retention-in-memory": 100000000000, - "http-api-address": "0.0.0.0:8081", - "nats": "nats://cc-nats:4222", + "http-api": { + "address": "localhost:8082", + "https-cert-file": null, + "https-key-file": null + }, + "retention-in-memory": "48h", + "nats": null, "jwt-public-key": "kzfYrYy+TzpanWZHJ5qSdMj5uKUWgq74BWhQG6copP0=" -} +} \ No newline at end of file From 37dbd36c0f6736c8311adacf4123b077e88a7fd4 Mon Sep 17 00:00:00 2001 From: Aditya Ujeniya Date: Wed, 4 Dec 2024 14:42:08 +0100 Subject: [PATCH 17/25] Update to ccms docker config --- cc-metric-store/Dockerfile | 4 ++-- cc-metric-store/config.json | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/cc-metric-store/Dockerfile b/cc-metric-store/Dockerfile index a06d075..e7e6d1d 100644 --- a/cc-metric-store/Dockerfile +++ b/cc-metric-store/Dockerfile @@ -3,8 +3,9 @@ FROM golang:1.22.4 RUN apt-get update RUN apt-get -y install git +RUN rm -rf /cc-metric-store + RUN git clone https://github.com/ClusterCockpit/cc-metric-store.git /cc-metric-store -RUN ls RUN cd /cc-metric-store && go build ./cmd/cc-metric-store # Reactivate when latest commit 
is available @@ -17,5 +18,4 @@ COPY config.json /go/bin VOLUME /data WORKDIR /go/bin -RUN mkdir -p ./var/checkpoints CMD ["./cc-metric-store"] diff --git a/cc-metric-store/config.json b/cc-metric-store/config.json index 29d4d28..902053b 100644 --- a/cc-metric-store/config.json +++ b/cc-metric-store/config.json @@ -167,12 +167,12 @@ }, "checkpoints": { "interval": "12h", - "directory": "./var/checkpoints", + "directory": "/data/checkpoints", "restore": "48h" }, "archive": { "interval": "50h", - "directory": "./var/archive" + "directory": "/data/archive" }, "http-api": { "address": "localhost:8082", From 4ba14d8db75c5951743803691b547b3d3ce29747 Mon Sep 17 00:00:00 2001 From: Aditya Ujeniya Date: Thu, 5 Dec 2024 10:34:58 +0100 Subject: [PATCH 18/25] Updated README for latest collection of services --- README.md | 162 ++++++++++++++++++++++++++++++++++++++++-------------- 1 file changed, 122 insertions(+), 40 deletions(-) diff --git a/README.md b/README.md index 99ba46a..23698b1 100755 --- a/README.md +++ b/README.md @@ -1,30 +1,27 @@ # cc-docker This is a `docker-compose` setup which provides a quickly started environment for ClusterCockpit development and testing, using `cc-backend`. -A number of services is readily available as docker container (nats, cc-metric-store, InfluxDB, LDAP), or easily added by manual configuration (MySQL). +A number of services is readily available as docker container (nats, cc-metric-store, InfluxDB, LDAP, SLURM), or easily added by manual configuration (MariaDB). 
It includes the following containers: -* nats (Default) -* cc-metric-store (Default) -* influxdb (Default) -* openldap (Default) -* mysql (Optional) -* mariadb (Optional) -* phpmyadmin (Optional) +|Service full name|docker service name|port| +| --- | --- | --- | +|Slurm Controller service|slurmctld|6818| +|Slurm Database service|slurmdbd|6817| +|Slurm Rest service with JWT authentication|slurmrestd|6820| +|Slurm Worker|node01|6818| +|MariaDB service|mariadb|3306| +|InfluxDB service|influxdb|8086| +|NATS service|nats|4222| +|cc-metric-store service|cc-metric-store|8084| +|OpenLDAP|openldap|389, 636| -The setup comes with fixture data for a Job archive, cc-metric-store checkpoints, InfluxDB, MySQL, and a LDAP user directory. - -## Known Issues - -* `docker-compose` installed on Ubuntu (18.04, 20.04) via `apt-get` can not correctly parse `docker-compose.yml` due to version differences. Install latest version of `docker-compose` from https://docs.docker.com/compose/install/ instead. -* You need to ensure that no other web server is running on ports 8080 (cc-backend), 8081 (phpmyadmin), 8084 (cc-metric-store), 8086 (nfluxDB), 4222 and 8222 (Nats), or 3306 (MySQL). If one or more ports are already in use, you habe to adapt the related config accordingly. -* Existing VPN connections sometimes cause problems with docker. If `docker-compose` does not start up correctly, try disabling any active VPN connection. Refer to https://stackoverflow.com/questions/45692255/how-make-openvpn-work-with-docker for further information. +The setup comes with fixture data for a Job archive, cc-metric-store checkpoints, InfluxDB, MariaDB, and a LDAP user directory. ## Configuration Templates Located in `./templates` * `docker-compose.yml.default`: Docker-Compose file to setup cc-metric-store, InfluxDB, MariaDB, PhpMyadmin, and LDAP containers (Default). Used in `setupDev.sh`. -* `docker-compose.yml.mysql`: Docker-Compose configuration template if MySQL is desired instead of MariaDB.
* `env.default`: Environment variables for setup with cc-metric-store, InfluxDB, MariaDB, PhpMyadmin, and LDAP containers (Default). Used in `setupDev.sh`. * `env.mysql`: Additional environment variables required if MySQL is desired instead of MariaDB. @@ -32,43 +29,128 @@ Located in `./templates` 1. Clone `cc-backend` repository in chosen base folder: `$> git clone https://github.com/ClusterCockpit/cc-backend.git` -2. Run `$ ./setupDev.sh`: **NOTICE** The script will download files of a total size of 338MB (mostly for the InfluxDB data). +2. Run `$ ./setupDev.sh`: **NOTICE** The script will download files of a total size of 338MB (mostly for the cc-metric-store data). -3. The setup-script launches the supporting container stack in the background automatically if everything went well. Run `$> ./cc-backend/cc-backend` to start `cc-backend.` +3. The setup-script launches the supporting container stack in the background automatically if everything went well. Run `$> ./cc-backend/cc-backend -server -dev` to start `cc-backend`. 4. By default, you can access `cc-backend` in your browser at `http://localhost:8080`. You can shut down the cc-backend server by pressing `CTRL-C`, remember to also shut down all containers via `$> docker-compose down` afterwards. 5. You can restart the containers with: `$> docker-compose up -d`. 
-## Post-Setup Adjustment for using `influxdb` - -When using `influxdb` as a metric database, one must adjust the following files: -* `cc-backend/var/job-archive/emmy/cluster.json` -* `cc-backend/var/job-archive/woody/cluster.json` - -In the JSON, exchange the content of the `metricDataRepository`-Entry (By default configured for `cc-metric-store`) with: -``` -"metricDataRepository": { - "kind": "influxdb", - "url": "http://localhost:8086", - "token": "egLfcf7fx0FESqFYU3RpAAbj", - "bucket": "ClusterCockpit", - "org": "ClusterCockpit", - "skiptls": false -} -``` - - -## Usage +## Credentials for logging into clustercockpit Credentials for the preconfigured demo user are: * User: `demo` -* Password: `AdminDev` +* Password: `demo` You can also login as regular user using any credential in the LDAP user directory at `./data/ldap/users.ldif`. +## Post-Setup adjustment for using `cc-metric-store` + +When using `cc-metric-store` as a metric database, one must adjust the following files: +* `cc-backend/var/job-archive/fritz/cluster.json` +* `cc-backend/var/job-archive/alex/cluster.json` + +In the JSON (cc-backend/config.json), exchange the content of the `metricDataRepository`-Entry (By default configured for `cc-metric-store`) with: +``` +"metricDataRepository": +{ + "kind": "cc-metric-store", + "url": "http://localhost:8082", + "token": "eyJ0eXAiOiJKV1QiLCJhbGciOiJFZERTQSJ9.eyJ1c2VyIjoiYWRtaW4iLCJyb2xlcyI6WyJST0xFX0FETUlOIiwiUk9MRV9BTkFMWVNUIiwiUk9MRV9VU0VSIl19.d-3_3FZTsadPjDEdsWrrQ7nS0edMAR4zjl-eK7rJU3HziNBfI9PDHDIpJVHTNN5E5SlLGLFXctWyKAkwhXL-Dw" +} +``` + +## Docker commands to access the services + +> Note: You need to be in cc-docker directory in order to execute any docker command + +You can view all docker processes running on either of the VM instance by using this command: + +``` +$ docker ps +``` + +Now that you can see the docker services, and if you want to manually access the docker services, you have to run **`bash`** command in those running services.
+ +> **`Example`**: You want to run slurm commands like `sinfo` or `squeue` or `scontrol` on slurm controller, you cannot directly access it. + +You need to **`bash`** into the running service by using the following command: + +``` +$ docker exec -it <docker service name> bash + +#example +$ docker exec -it slurmctld bash + +#or +$ docker exec -it mariadb bash +``` + +Once you start a **`bash`** on any docker service, then you may execute any service related commands in that **`bash`**. + +But for Cluster Cockpit development, you only need ports to access these docker services. You have to use `localhost:<port>` when trying to access any docker service. You may need to configure the `cc-backend/config.json` based on these docker services and ports. + +## Slurm setup in cc-docker + +### 1. Slurm controller + +Currently slurm controller is aware of the 1 node that we have setup in our mini cluster i.e. node01. + +In order to execute slurm commands, you may need to **`bash`** into the **`slurmctld`** docker service. + +``` +$ docker exec -it slurmctld bash +``` + +Then you may be able to run slurm controller commands. A few examples without output are: + +``` +$ sinfo + +or + +$ squeue + +or + +$ scontrol show nodes +``` + +### 2. Slurm rest service + +You do not need to **`bash`** into the slurmrestd service but can directly access the rest API via localhost:6820. A simple example on how to CURL to the slurm rest API is given in the `curl_slurmrestd.sh`. + +You can directly use `curl_slurmrestd.sh` with a never expiring JWT token ( can be found in /data/slurm/secret/jwt_token.txt ) + +You may also use the never expiring token directly from the file for any of your custom CURL commands. + +## Known Issues + +* `docker-compose` installed on Ubuntu (18.04, 20.04) via `apt-get` can not correctly parse `docker-compose.yml` due to version differences. Install latest version of `docker-compose` from https://docs.docker.com/compose/install/ instead.
+* You need to ensure that no other web server is running on ports 8080 (cc-backend), 8082 (cc-metric-store), 8086 (InfluxDB), 4222 and 8222 (Nats), or 3306 (MariaDB). If one or more ports are already in use, you have to adapt the related config accordingly. +* Existing VPN connections sometimes cause problems with docker. If `docker-compose` does not start up correctly, try disabling any active VPN connection. Refer to https://stackoverflow.com/questions/45692255/how-make-openvpn-work-with-docker for further information. + +## Docker services and restarting the services + +You can find all the docker services in `docker-compose.yml`. Feel free to modify it. + +Whenever you modify it, please use + +``` +$ docker compose down +``` + +in order to shut down all the services in all the VM’s (maininstance, nodeinstance, nodeinstance2) and then start all the services by using + +``` +$ docker compose up +``` + + + TODO: Update job archive and all other metric data. The job archive with 1867 jobs originates from the second half of 2020. Roughly 2700 jobs from the first week of 2021 are loaded with data from InfluxDB. Some views of ClusterCockpit (e.g. the Users view) show the last week or month. -To show some data there you have to set the filter to time periods with jobs (August 2020 to January 2021). +To show some data there you have to set the filter to time periods with jobs (August 2020 to January 2021). 
\ No newline at end of file From 494fcaa9f3a2a295ed635bfc4a1cf9352760d7cf Mon Sep 17 00:00:00 2001 From: Aditya Ujeniya Date: Thu, 5 Dec 2024 11:19:18 +0100 Subject: [PATCH 19/25] feat: prerequisite installation script --- README.md | 26 +++++++++++++++++++ prerequisite_installation_script.sh | 40 +++++++++++++++++++++++++++++ 2 files changed, 66 insertions(+) create mode 100644 prerequisite_installation_script.sh diff --git a/README.md b/README.md index 23698b1..7cd28ec 100755 --- a/README.md +++ b/README.md @@ -18,6 +18,32 @@ It includes the following containers: The setup comes with fixture data for a Job archive, cc-metric-store checkpoints, InfluxDB, MariaDB, and a LDAP user directory. +## Prerequisites + +For all the docker services to work correctly, you will need the following tools installed: + +1. `docker` and `docker-compose` +2. `golang` (for compiling cc-metric-store) +3. `perl` (for migrateTimestamps.pl) with Cpanel::JSON::XS, Data::Dumper, Time::Piece, Sort::Versions and File::Slurp perl modules. +4. `npm` (for cc-backend) +5. `make` (for building slurm base image) + +It is also recommended to add your user to the docker group since the setupDev.sh script assumes sudo permissions for docker and docker-compose services. + +You can use: + +``` +sudo groupadd docker +sudo usermod -aG docker $USER + +# restart after adding your user to the docker group +sudo shutdown -r -t 0 +``` + +Note: You can install all these dependencies via predefined installation steps in `prerequisite_installation_script.sh`. + +If you are using different linux flavors, you will have to adapt `prerequisite_installation_script.sh` as well as `setupDev.sh`.
+ ## Configuration Templates Located in `./templates` diff --git a/prerequisite_installation_script.sh b/prerequisite_installation_script.sh new file mode 100644 index 0000000..e061a13 --- /dev/null +++ b/prerequisite_installation_script.sh @@ -0,0 +1,40 @@ +#!/bin/bash -l + +sudo apt-get update +sudo apt-get upgrade -f -y + +# Add Docker's official GPG key: +sudo apt-get update +sudo apt-get install ca-certificates curl +sudo install -m 0755 -d /etc/apt/keyrings +sudo curl -fsSL https://download.docker.com/linux/ubuntu/gpg -o /etc/apt/keyrings/docker.asc +sudo chmod a+r /etc/apt/keyrings/docker.asc + +# Add the repository to Apt sources: +echo \ + "deb [arch=$(dpkg --print-architecture) signed-by=/etc/apt/keyrings/docker.asc] https://download.docker.com/linux/ubuntu \ + $(. /etc/os-release && echo "$VERSION_CODENAME") stable" | \ + sudo tee /etc/apt/sources.list.d/docker.list > /dev/null +sudo apt-get update + +sudo apt-get install -f -y gcc +sudo apt-get install -f -y npm +sudo apt-get install -f -y make +sudo apt-get install -f -y gh +sudo apt-get install -f -y docker-ce docker-ce-cli containerd.io docker-buildx-plugin docker-compose-plugin +sudo apt-get install -f -y docker-compose +sudo apt install perl -f -y libdatetime-perl libjson-perl +sudo apt-get install -f -y golang-go + +sudo cpan Cpanel::JSON::XS +sudo cpan File::Slurp +sudo cpan Data::Dumper +sudo cpan Time::Piece +sudo cpan Sort::Versions + +sudo groupadd docker +sudo usermod -aG docker ubuntu + +sudo shutdown -r -t 0 + + From 84a96b131837ae2bf2e433b3e47b8e326e46523a Mon Sep 17 00:00:00 2001 From: Aditya Ujeniya Date: Thu, 5 Dec 2024 11:23:10 +0100 Subject: [PATCH 20/25] fix: removed templates from README --- README.md | 7 ------- 1 file changed, 7 deletions(-) diff --git a/README.md b/README.md index 7cd28ec..5f706f6 100755 --- a/README.md +++ b/README.md @@ -44,13 +44,6 @@ Note: You can install all these dependencies via predefined installation steps i If you are using different linux flavors, you 
will have to adapt `prerequisite_installation_script.sh` as well as `setupDev.sh`. -## Configuration Templates - -Located in `./templates` -* `docker-compose.yml.default`: Docker-Compose file to setup cc-metric-store, InfluxDB, MariaDB, PhpMyadmin, and LDAP containers (Default). Used in `setupDev.sh`. -* `env.default`: Environment variables for setup with cc-metric-store, InfluxDB, MariaDB, PhpMyadmin, and LDAP containers (Default). Used in `setupDev.sh`. -* `env.mysql`: Additional environment variables required if MySQL is desired instead of MariaDB. - ## Setup 1. Clone `cc-backend` repository in chosen base folder: `$> git clone https://github.com/ClusterCockpit/cc-backend.git` From 0edb94da94427f33163b7f09c2fefaf60bdeed7e Mon Sep 17 00:00:00 2001 From: Aditya Ujeniya Date: Tue, 28 Jan 2025 21:54:48 +0100 Subject: [PATCH 21/25] Update to mariad and openldap --- cc-metric-store/config.json | 2 +- config.json | 77 +++++++++++++++++++++++++++++++++++++ docker-compose.yml | 19 +++++---- setupDev.sh | 35 +++++++++++++++++ 4 files changed, 125 insertions(+), 8 deletions(-) create mode 100644 config.json diff --git a/cc-metric-store/config.json b/cc-metric-store/config.json index 902053b..2dc0b1b 100644 --- a/cc-metric-store/config.json +++ b/cc-metric-store/config.json @@ -175,7 +175,7 @@ "directory": "/data/archive" }, "http-api": { - "address": "localhost:8082", + "address": "0.0.0.0:8084", "https-cert-file": null, "https-key-file": null }, diff --git a/config.json b/config.json new file mode 100644 index 0000000..2977e72 --- /dev/null +++ b/config.json @@ -0,0 +1,77 @@ +{ + "addr": "127.0.0.1:8080", + "short-running-jobs-duration": 300, + "archive": { + "kind": "file", + "path": "./var/job-archive" + }, + "jwts": { + "max-age": "2000h" + }, + "db-driver": "mysql", + "db": "root:root@tcp(0.0.0.0:3306)/ccbackend", + "ldap": { + "url": "ldap://0.0.0.0", + "user_base": "ou=users,dc=example,dc=com", + "search_dn": "cn=admin,dc=example,dc=com", + "user_bind": 
"uid={username},ou=users,dc=example,dc=com", + "user_filter": "(&(objectclass=posixAccount))", + "syncUserOnLogin": true + }, + "enable-resampling": { + "trigger": 30, + "resolutions": [ + 600, + 300, + 120, + 60 + ] + }, + "emission-constant": 317, + "clusters": [ + { + "name": "fritz", + "metricDataRepository": { + "kind": "cc-metric-store", + "url": "http://0.0.0.0:8084", + "token": "eyJ0eXAiOiJKV1QiLCJhbGciOiJFZERTQSJ9.eyJ1c2VyIjoiYWRtaW4iLCJyb2xlcyI6WyJST0xFX0FETUlOIiwiUk9MRV9BTkFMWVNUIiwiUk9MRV9VU0VSIl19.d-3_3FZTsadPjDEdsWrrQ7nS0edMAR4zjl-eK7rJU3HziNBfI9PDHDIpJVHTNN5E5SlLGLFXctWyKAkwhXL-Dw" + }, + "filterRanges": { + "numNodes": { + "from": 1, + "to": 64 + }, + "duration": { + "from": 0, + "to": 86400 + }, + "startTime": { + "from": "2022-01-01T00:00:00Z", + "to": null + } + } + }, + { + "name": "alex", + "metricDataRepository": { + "kind": "cc-metric-store", + "url": "http://0.0.0.0:8084", + "token": "eyJ0eXAiOiJKV1QiLCJhbGciOiJFZERTQSJ9.eyJ1c2VyIjoiYWRtaW4iLCJyb2xlcyI6WyJST0xFX0FETUlOIiwiUk9MRV9BTkFMWVNUIiwiUk9MRV9VU0VSIl19.d-3_3FZTsadPjDEdsWrrQ7nS0edMAR4zjl-eK7rJU3HziNBfI9PDHDIpJVHTNN5E5SlLGLFXctWyKAkwhXL-Dw" + }, + "filterRanges": { + "numNodes": { + "from": 1, + "to": 64 + }, + "duration": { + "from": 0, + "to": 86400 + }, + "startTime": { + "from": "2022-01-01T00:00:00Z", + "to": null + } + } + } + ] +} \ No newline at end of file diff --git a/docker-compose.yml b/docker-compose.yml index f10d9ed..7227dda 100755 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -40,9 +40,15 @@ services: image: osixia/openldap:1.5.0 command: --copy-service --loglevel debug environment: - - LDAP_ADMIN_PASSWORD=${LDAP_ADMIN_PASSWORD} - - LDAP_ORGANISATION=${LDAP_ORGANISATION} - - LDAP_DOMAIN=${LDAP_DOMAIN} + - LDAP_ADMIN_PASSWORD=mashup + - LDAP_ORGANISATION=Example Organization + - LDAP_DOMAIN=example.com + - LDAP_LOGGING=true + - LDAP_CONNECTION=default + - LDAP_CONNECTIONS=default + - LDAP_DEFAULT_HOSTS=0.0.0.0 + ports: + - "0.0.0.0:389:389" volumes: - 
${DATADIR}/ldap:/container/service/slapd/assets/config/bootstrap/ldif/custom @@ -51,15 +57,14 @@ services: image: mariadb:latest command: ["--default-authentication-plugin=mysql_native_password"] environment: - MARIADB_ROOT_PASSWORD: ${MARIADB_ROOT_PASSWORD} + MARIADB_ROOT_PASSWORD: root MARIADB_DATABASE: slurm_acct_db MARIADB_USER: slurm MARIADB_PASSWORD: demo ports: - - "127.0.0.1:${MARIADB_PORT}:3306" + - "0.0.0.0:${MARIADB_PORT}:3306" volumes: - - ${DATADIR}/mariadb:/etc/mysql/conf.d - # - ${DATADIR}/sql-init:/docker-entrypoint-initdb.d + - ${DATADIR}/mariadb:/docker-entrypoint-initdb.d cap_add: - SYS_NICE diff --git a/setupDev.sh b/setupDev.sh index 3616787..616ba89 100755 --- a/setupDev.sh +++ b/setupDev.sh @@ -73,6 +73,39 @@ fi # rm -r ./data/job-archive-source # rm -r ./data/cc-metric-store-source +if [ ! -d data/mariadb ]; then + mkdir -p data/mariadb + cat > data/mariadb/01.databases.sql < data/ldap/add_users.ldif < Date: Wed, 29 Jan 2025 23:29:41 +0100 Subject: [PATCH 22/25] Update to nats auto generation script --- .env | 30 ----- cc-metric-store/config.json | 18 ++- dataGenerationScript.sh | 114 ++++++++++++++++++ docker-compose.yml | 14 ++- env-template.txt | 5 - config.json => misc/config.json | 0 curl_slurmrestd.sh => misc/curl_slurmrestd.sh | 0 jwt_verifier.py => misc/jwt_verifier.py | 0 .../prerequisite_installation_script.sh | 0 setupDev.sh | 85 ++----------- 10 files changed, 152 insertions(+), 114 deletions(-) create mode 100755 dataGenerationScript.sh delete mode 100755 env-template.txt rename config.json => misc/config.json (100%) rename curl_slurmrestd.sh => misc/curl_slurmrestd.sh (100%) rename jwt_verifier.py => misc/jwt_verifier.py (100%) rename prerequisite_installation_script.sh => scripts/prerequisite_installation_script.sh (100%) diff --git a/.env b/.env index 4e9aa63..04f530a 100644 --- a/.env +++ b/.env @@ -2,15 +2,6 @@ # CCBACKEND DEVEL DOCKER SETTINGS ######################################################################## 
-######################################################################## -# SLURM -######################################################################## -SLURM_VERSION=22.05.6 -ARCH=aarch64 -MUNGE_UID=981 -SLURM_UID=982 -WORKER_UID=1000 - ######################################################################## # INFLUXDB ######################################################################## @@ -22,27 +13,6 @@ INFLUXDB_BUCKET=ClusterCockpit # Whether or not to check SSL Cert in Symfony Client, Default: false INFLUXDB_SSL=false -######################################################################## -# MARIADB -######################################################################## -MARIADB_ROOT_PASSWORD=root -MARIADB_DATABASE=ClusterCockpit -MARIADB_USER=clustercockpit -MARIADB_PASSWORD=clustercockpit -MARIADB_PORT=3306 - -######################################### -# LDAP -######################################################################## -LDAP_ADMIN_PASSWORD=mashup -LDAP_ORGANISATION=NHR@FAU -LDAP_DOMAIN=rrze.uni-erlangen.de - -######################################################################## -# PHPMyAdmin -######################################################################## -PHPMYADMIN_PORT=8081 - ######################################################################## # INTERNAL SETTINGS ######################################################################## diff --git a/cc-metric-store/config.json b/cc-metric-store/config.json index 2dc0b1b..a7173b2 100644 --- a/cc-metric-store/config.json +++ b/cc-metric-store/config.json @@ -180,6 +180,22 @@ "https-key-file": null }, "retention-in-memory": "48h", - "nats": null, + "nats": [ + { + "address": "nats://nats:4222", + "username": "root", + "password": "root", + "subscriptions": [ + { + "subscribe-to": "hpc-nats", + "cluster-tag": "fritz" + }, + { + "subscribe-to": "hpc-nats", + "cluster-tag": "alex" + } + ] + } + ], "jwt-public-key": "kzfYrYy+TzpanWZHJ5qSdMj5uKUWgq74BWhQG6copP0=" } \ No 
newline at end of file diff --git a/dataGenerationScript.sh b/dataGenerationScript.sh new file mode 100755 index 0000000..b049bce --- /dev/null +++ b/dataGenerationScript.sh @@ -0,0 +1,114 @@ +#!/bin/bash +echo "" +echo "|--------------------------------------------------------------------------------------|" +echo "| This is Data generation script for docker services |" +echo "| Starting file required by docker services in data/ |" +echo "|--------------------------------------------------------------------------------------|" + +# Download unedited checkpoint files to ./data/cc-metric-store-source/checkpoints +if [ ! -d data/cc-metric-store-source ]; then + mkdir -p data/cc-metric-store-source/checkpoints + cd data/cc-metric-store-source/checkpoints + wget https://hpc-mover.rrze.uni-erlangen.de/HPC-Data/0x7b58aefb/eig7ahyo6fo2bais0ephuf2aitohv1ai/cc-metric-store-checkpoints.tar.xz + tar xf cc-metric-store-checkpoints.tar.xz + rm cc-metric-store-checkpoints.tar.xz + cd ../../../ +else + echo "'data/cc-metric-store-source' already exists!" +fi + +if [ ! 
-d data/mariadb ]; then + mkdir -p data/mariadb + cat > data/mariadb/01.databases.sql < data/ldap/add_users.ldif < data/nats/docker-entrypoint.sh <sample_alex.txt + done + done + + ./nats pub hpc-nats "\$(cat sample_alex.txt)" -s nats://0.0.0.0:4222 --user root --password root + + for metric in cpu_irq cpu_load mem_cached net_bytes_in cpu_user cpu_idle nfs4_read mem_used nfs4_write nfs4_total ib_xmit ib_xmit_pkts net_bytes_out cpu_iowait ib_recv cpu_system ib_recv_pkts; do + for hostname in f0201 f0202 f0203 f0204 f0205 f0206 f0207 f0208 f0209 f0210 f0211 f0212 f0213 f0214 f0215 f0217 f0218 f0219 f0220 f0221 f0222 f0223 f0224 f0225 f0226 f0227 f0228 f0229; do + echo "\$metric,cluster=fritz,hostname=\$hostname,type=node value=$((1 + RANDOM % 100)).0 \$timestamp" >sample_fritz.txt + done + done + + ./nats pub hpc-nats "\$(cat sample_fritz.txt)" -s nats://0.0.0.0:4222 --user root --password root + + sleep 1m + +done +EOF + +else + echo "'data/nats' already exists!" +fi + +# prepare folders for influxdb2 +if [ ! -d data/influxdb ]; then + mkdir -p data/influxdb/data + mkdir -p data/influxdb/config +else + echo "'data/influxdb' already exists!" 
+fi + +echo "" +echo "|--------------------------------------------------------------------------------------|" +echo "| Finished generating relevant files for docker services in data/ |" +echo "|--------------------------------------------------------------------------------------|" \ No newline at end of file diff --git a/docker-compose.yml b/docker-compose.yml index 7227dda..59f5891 100755 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -3,15 +3,19 @@ services: container_name: nats image: nats:alpine ports: - - "4222:4222" - - "8222:8222" + - "0.0.0.0:4222:4222" + - "0.0.0.0:8222:8222" + - "0.0.0.0:6222:6222" + volumes: + - ${DATADIR}/nats:/data + entrypoint: ["/bin/sh", "/data/docker-entrypoint.sh"] cc-metric-store: container_name: cc-metric-store build: context: ./cc-metric-store ports: - - "8084:8084" + - "0.0.0.0:8084:8084" volumes: - ${DATADIR}/cc-metric-store:/data depends_on: @@ -30,7 +34,7 @@ services: DOCKER_INFLUXDB_INIT_RETENTION: 100w DOCKER_INFLUXDB_INIT_ADMIN_TOKEN: ${INFLUXDB_ADMIN_TOKEN} ports: - - "127.0.0.1:${INFLUXDB_PORT}:8086" + - "0.0.0.0:8086:8086" volumes: - ${DATADIR}/influxdb/data:/var/lib/influxdb2 - ${DATADIR}/influxdb/config:/etc/influxdb2 @@ -62,7 +66,7 @@ services: MARIADB_USER: slurm MARIADB_PASSWORD: demo ports: - - "0.0.0.0:${MARIADB_PORT}:3306" + - "0.0.0.0:3306:3306" volumes: - ${DATADIR}/mariadb:/docker-entrypoint-initdb.d cap_add: diff --git a/env-template.txt b/env-template.txt deleted file mode 100755 index 3bdeb8f..0000000 --- a/env-template.txt +++ /dev/null @@ -1,5 +0,0 @@ -SLURM_VERSION=22.05.6 -ARCH=aarch64 -MUNGE_UID=981 -SLURM_UID=982 -WORKER_UID=1000 diff --git a/config.json b/misc/config.json similarity index 100% rename from config.json rename to misc/config.json diff --git a/curl_slurmrestd.sh b/misc/curl_slurmrestd.sh similarity index 100% rename from curl_slurmrestd.sh rename to misc/curl_slurmrestd.sh diff --git a/jwt_verifier.py b/misc/jwt_verifier.py similarity index 100% rename from jwt_verifier.py 
rename to misc/jwt_verifier.py diff --git a/prerequisite_installation_script.sh b/scripts/prerequisite_installation_script.sh similarity index 100% rename from prerequisite_installation_script.sh rename to scripts/prerequisite_installation_script.sh diff --git a/setupDev.sh b/setupDev.sh index 616ba89..0408c85 100755 --- a/setupDev.sh +++ b/setupDev.sh @@ -16,9 +16,6 @@ echo "| 'sudo' keyword. echo "|--------------------------------------------------------------------------------------|" echo "" -export UID_U=$(id -u $USER) -export GID_G=$(id -g $USER) - # Check cc-backend, touch job.db if exists if [ ! -d cc-backend ]; then echo "'cc-backend' not yet prepared! Please clone cc-backend repository before starting this script." @@ -32,35 +29,28 @@ else rm ./job-archive-demo.tar cp ./configs/env-template.txt .env - cp ./configs/config-demo.json config.json + cp -f ../misc/config.json config.json make ./cc-backend -migrate-db - ./cc-backend --init-db --add-user demo:admin:AdminDev + ./cc-backend --init-db --add-user demo:admin:demo cd .. else cd .. - # echo "'cc-backend/var' exists. Cautiously exiting." - # echo -n "Stopped." - # exit + echo "'cc-backend/var' exists. Cautiously exiting." + echo -n "Stopped." + exit fi fi -mkdir -m777 data - -# Download unedited checkpoint files to ./data/cc-metric-store-source/checkpoints -if [ ! -d data/cc-metric-store-source ]; then - mkdir -p data/cc-metric-store-source/checkpoints - cd data/cc-metric-store-source/checkpoints - wget https://hpc-mover.rrze.uni-erlangen.de/HPC-Data/0x7b58aefb/eig7ahyo6fo2bais0ephuf2aitohv1ai/cc-metric-store-checkpoints.tar.xz - tar xf cc-metric-store-checkpoints.tar.xz - rm cc-metric-store-checkpoints.tar.xz - cd ../../../ -else - echo "'data/cc-metric-store-source' already exists!" +if [ ! -d data ]; then + mkdir -m777 data fi +chmod u+x dataGenerationScript.sh +./dataGenerationScript.sh + # Update timestamps perl ./migrateTimestamps.pl @@ -70,57 +60,8 @@ if [ ! 
-d data/cc-metric-store/archive ]; then fi # cleanup sources -# rm -r ./data/job-archive-source -# rm -r ./data/cc-metric-store-source - -if [ ! -d data/mariadb ]; then - mkdir -p data/mariadb - cat > data/mariadb/01.databases.sql < data/ldap/add_users.ldif < Date: Thu, 30 Jan 2025 13:56:56 +0100 Subject: [PATCH 23/25] Finished testing docker services and updated README --- README.md | 28 ++++++++++++++-------------- dataGenerationScript.sh | 19 ++++++++++++++++++- setupDev.sh | 15 +++++++++++++-- 3 files changed, 45 insertions(+), 17 deletions(-) diff --git a/README.md b/README.md index 5f706f6..b196299 100755 --- a/README.md +++ b/README.md @@ -12,7 +12,7 @@ It includes the following containers: |Slurm Worker|node01|6818| |MariaDB service|mariadb|3306| |InfluxDB serice|influxdb|8086| -|NATS service|nats|4222| +|NATS service|nats|4222, 6222, 8222| |cc-metric-store service|cc-metric-store|8084| |OpenLDAP|openldap|389, 636| @@ -62,23 +62,23 @@ Credentials for the preconfigured demo user are: * User: `demo` * Password: `demo` +Credentials for the preconfigured LDAP user are: +* User: `ldapuser` +* Password: `ldapuser` + You can also login as regular user using any credential in the LDAP user directory at `./data/ldap/users.ldif`. -## Post-Setup adjustment for using `cc-metric-store` +## Preconfigured setup between docker services and ClusterCockpit components -When using `influxdb` as a metric database, one must adjust the following files: -* `cc-backend/var/job-archive/fritz/cluster.json` -* `cc-backend/var/job-archive/alex/cluster.json` +When you are done cloning the cc-backend repo and once you execute `setupDev.sh` file, it will copy a preconfigured `config.json` from `misc/config.json` and replace the `cc-backend/config.json`, which will be used by cc-backend, once you start the server. +The preconfigured config.json attaches to: +#### 1. MariaDB docker service on port 3306 (database: ccbackend) +#### 2. OpenLDAP docker service on port 389 +#### 3. 
cc-metric-store docker service on port 8084 -In the JSON (cc-backend/config.json), exchange the content of the `metricDataRepository`-Entry (By default configured for `cc-metric-store`) with: -``` -"metricDataRepository": -{ - "kind": "cc-metric-store", - "url": "http://localhost:8082", - "token": "eyJ0eXAiOiJKV1QiLCJhbGciOiJFZERTQSJ9.eyJ1c2VyIjoiYWRtaW4iLCJyb2xlcyI6WyJST0xFX0FETUlOIiwiUk9MRV9BTkFMWVNUIiwiUk9MRV9VU0VSIl19.d-3_3FZTsadPjDEdsWrrQ7nS0edMAR4zjl-eK7rJU3HziNBfI9PDHDIpJVHTNN5E5SlLGLFXctWyKAkwhXL-Dw" -} -``` +cc-metric-store also has a preconfigured `config.json` in `cc-metric-store/config.json` which attaches to NATS docker service on port 4222 and subscribes to topic 'hpc-nats'. + +Basically, all the ClusterCockpit components and the docker services attach to each other like lego pieces. ## Docker commands to access the services diff --git a/dataGenerationScript.sh b/dataGenerationScript.sh index b049bce..34b7271 100755 --- a/dataGenerationScript.sh +++ b/dataGenerationScript.sh @@ -6,6 +6,11 @@ echo "| Starting file required by docker services in data/ echo "|--------------------------------------------------------------------------------------|" # Download unedited checkpoint files to ./data/cc-metric-store-source/checkpoints +# After this, migrateTimestamp.pl will run from setupDev.sh. This will update the timestamps +# for all the checkpoint files, which then can be read by cc-metric-store. +# cc-metric-store reads only data upto certain time, like 48 hours of data. +# These checkpoint files have timestamp older than 48 hours and needs to be updated with +# migrateTimestamp.pl file, which will be automatically invoked from setupDev.sh. if [ ! -d data/cc-metric-store-source ]; then mkdir -p data/cc-metric-store-source/checkpoints cd data/cc-metric-store-source/checkpoints @@ -17,6 +22,10 @@ else echo "'data/cc-metric-store-source' already exists!" fi +# A simple configuration file for mariadb docker service. 
+# Required because you can specify only one database per docker service. +# This file mentions the database to be created for cc-backend. +# This file automatically picked by mariadb after the docker service starts. if [ ! -d data/mariadb ]; then mkdir -p data/mariadb cat > data/mariadb/01.databases.sql < data/ldap/add_users.ldif < data/nats/docker-entrypoint.sh < Date: Thu, 30 Jan 2025 14:19:17 +0100 Subject: [PATCH 24/25] Update to setupDev.sh with change in execution steps for cc-backend --- setupDev.sh | 44 ++++++++++++++++++++++---------------------- 1 file changed, 22 insertions(+), 22 deletions(-) diff --git a/setupDev.sh b/setupDev.sh index 25478cf..c0e6609 100755 --- a/setupDev.sh +++ b/setupDev.sh @@ -12,7 +12,7 @@ echo "| > sudo usermod -aG docker $USER echo "| |" echo "| This will add docker to the sudo usergroup and all the docker |" echo "| command will run as sudo by default without requiring |" -echo "| 'sudo' keyword. |" +echo "| 'sudo' keyword. |" echo "|--------------------------------------------------------------------------------------|" echo "" @@ -21,27 +21,6 @@ if [ ! -d cc-backend ]; then echo "'cc-backend' not yet prepared! Please clone cc-backend repository before starting this script." echo -n "Stopped." exit -else - cd cc-backend - if [ ! -d var ]; then - wget https://hpc-mover.rrze.uni-erlangen.de/HPC-Data/0x7b58aefb/eig7ahyo6fo2bais0ephuf2aitohv1ai/job-archive-demo.tar - tar xf job-archive-demo.tar - rm ./job-archive-demo.tar - - cp ./configs/env-template.txt .env - cp -f ../misc/config.json config.json - - make - - ./cc-backend -migrate-db - ./cc-backend --init-db --add-user demo:admin:demo - cd .. - else - cd .. - echo "'cc-backend/var' exists. Cautiously exiting." - echo -n "Stopped." - exit - fi fi # Creates data directory if it does not exists. @@ -85,6 +64,27 @@ cd ../.. docker-compose build docker-compose up -d +cd cc-backend +if [ ! 
-d var ]; then + wget https://hpc-mover.rrze.uni-erlangen.de/HPC-Data/0x7b58aefb/eig7ahyo6fo2bais0ephuf2aitohv1ai/job-archive-demo.tar + tar xf job-archive-demo.tar + rm ./job-archive-demo.tar + + cp ./configs/env-template.txt .env + cp -f ../misc/config.json config.json + + make + + ./cc-backend -migrate-db + ./cc-backend --init-db --add-user demo:admin:demo + cd .. +else + cd .. + echo "'cc-backend/var' exists. Cautiously exiting." + echo -n "Stopped." + exit +fi + echo "" echo "|--------------------------------------------------------------------------------------|" echo "| Check logs for each slurm service by using these commands: |" From 5a6912c1acdf0cdda0df74958267781209a18e56 Mon Sep 17 00:00:00 2001 From: Aditya Ujeniya Date: Thu, 30 Jan 2025 15:06:02 +0100 Subject: [PATCH 25/25] Update to dummy data generation script in nats --- dataGenerationScript.sh | 14 +++++++++++--- setupDev.sh | 2 -- 2 files changed, 11 insertions(+), 5 deletions(-) diff --git a/dataGenerationScript.sh b/dataGenerationScript.sh index 34b7271..72efcd1 100755 --- a/dataGenerationScript.sh +++ b/dataGenerationScript.sh @@ -87,27 +87,35 @@ echo "NATS is up and running. Executing custom script..." apk add curl curl -sf https://binaries.nats.dev/nats-io/natscli/nats@latest | sh -# Run your custom script +# This is a dummy data generation loop, that inserts data for given nodes at 1 min interval while true; do + # Timestamp in seconds timestamp="\$(date '+%s')" + # Generate data for alex cluster. 
Push to sample_alex.txt for metric in cpu_irq cpu_load mem_cached net_bytes_in cpu_user cpu_idle nfs4_read mem_used nfs4_write nfs4_total ib_xmit ib_xmit_pkts net_bytes_out cpu_iowait ib_recv cpu_system ib_recv_pkts; do for hostname in a0603 a0903 a0832 a0329 a0702 a0122 a1624 a0731 a0224 a0704 a0631 a0225 a0222 a0427 a0603 a0429 a0833 a0705 a0901 a0601 a0227 a0804 a0322 a0226 a0126 a0129 a0605 a0801 a0934; do - echo "\$metric,cluster=alex,hostname=\$hostname,type=node value=$((1 + RANDOM % 100)).0 \$timestamp" >sample_alex.txt + echo "\$metric,cluster=alex,hostname=\$hostname,type=node value=\$((1 + RANDOM % 100)).0 \$timestamp" >>sample_alex.txt done done + # Nats client will publish the data from sample_alex.txt to 'hpc-nats' subject on this nats server ./nats pub hpc-nats "\$(cat sample_alex.txt)" -s nats://0.0.0.0:4222 --user root --password root + # Generate data for fritz cluster. Push to sample_fritz.txt for metric in cpu_irq cpu_load mem_cached net_bytes_in cpu_user cpu_idle nfs4_read mem_used nfs4_write nfs4_total ib_xmit ib_xmit_pkts net_bytes_out cpu_iowait ib_recv cpu_system ib_recv_pkts; do for hostname in f0201 f0202 f0203 f0204 f0205 f0206 f0207 f0208 f0209 f0210 f0211 f0212 f0213 f0214 f0215 f0217 f0218 f0219 f0220 f0221 f0222 f0223 f0224 f0225 f0226 f0227 f0228 f0229; do - echo "\$metric,cluster=fritz,hostname=\$hostname,type=node value=$((1 + RANDOM % 100)).0 \$timestamp" >sample_fritz.txt + echo "\$metric,cluster=fritz,hostname=\$hostname,type=node value=\$((1 + RANDOM % 100)).0 \$timestamp" >>sample_fritz.txt done done + # Nats client will publish the data from sample_fritz.txt to 'hpc-nats' subject on this nats server ./nats pub hpc-nats "\$(cat sample_fritz.txt)" -s nats://0.0.0.0:4222 --user root --password root + rm sample_alex.txt + rm sample_fritz.txt + sleep 1m done diff --git a/setupDev.sh b/setupDev.sh index c0e6609..2549181 100755 --- a/setupDev.sh +++ b/setupDev.sh @@ -81,8 +81,6 @@ if [ ! -d var ]; then else cd .. 
echo "'cc-backend/var' exists. Cautiously exiting." - echo -n "Stopped." - exit fi echo ""