From f7558779dad07734b69e7e2ba9ff560fdea698fe Mon Sep 17 00:00:00 2001
From: Aditya Ujeniya
Date: Tue, 15 Oct 2024 16:15:37 +0200
Subject: [PATCH] Stable services

---
 docker-compose.yml                    | 23 +++------
 slurm/controller/docker-entrypoint.sh | 36 ++++++++++++--
 slurm/controller/slurm.conf           | 14 +++---
 slurm/worker/docker-entrypoint.sh     | 72 ++++++++++++++++-----------
 4 files changed, 90 insertions(+), 55 deletions(-)

diff --git a/docker-compose.yml b/docker-compose.yml
index e45067c..feab346 100755
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -68,8 +68,6 @@ services:
     hostname: slurmctld
     build:
       context: ./slurm/controller
-    depends_on:
-      - slurmdbd
     privileged: true
     ports:
       - "6817:6817"
@@ -77,6 +75,8 @@
       - ${DATADIR}/slurm/home:/home
       - ${DATADIR}/slurm/secret:/.secret
       - ./slurm/controller/slurm.conf:/home/config/slurm.conf
+      - /etc/timezone:/etc/timezone:ro
+      - /etc/localtime:/etc/localtime:ro
 
   slurmdbd:
     container_name: slurmdbd
@@ -85,6 +85,7 @@
     hostname: slurmdbd
     build:
       context: ./slurm/database
     depends_on:
       - mariadb
+      - slurmctld
     privileged: true
     ports:
       - "6819:6819"
@@ -92,6 +93,8 @@
       - ${DATADIR}/slurm/home:/home
       - ${DATADIR}/slurm/secret:/.secret
       - ./slurm/database/slurmdbd.conf:/home/config/slurmdbd.conf
+      - /etc/timezone:/etc/timezone:ro
+      - /etc/localtime:/etc/localtime:ro
 
   node01:
     container_name: node01
@@ -106,17 +109,7 @@
     hostname: node01
     build:
       context: ./slurm/worker
     depends_on:
       - slurmctld
     privileged: true
     volumes:
       - ${DATADIR}/slurm/home:/home
       - ${DATADIR}/slurm/secret:/.secret
       - ./slurm/worker/cgroup.conf:/home/config/cgroup.conf
       - ./slurm/controller/slurm.conf:/home/config/slurm.conf
+      - /etc/timezone:/etc/timezone:ro
+      - /etc/localtime:/etc/localtime:ro
     ports:
-      - "6818:6818"
-
-  # slurm-worker02:
-  #   container_name: node02
-  #   hostname: node02
-  #   build:
-  #     context: ./slurm/worker
-  #   depends_on:
-  #     - slurm-controller
-  #   privileged: true
-  #   volumes:
-  #     - ${DATADIR}/slurm/home:/home
-  #     - ${DATADIR}/slurm/secret:/.secret
+      - "6818:6818"
\ No newline at end of file
diff --git a/slurm/controller/docker-entrypoint.sh b/slurm/controller/docker-entrypoint.sh
index 2faa507..3fc3d18 100755
--- a/slurm/controller/docker-entrypoint.sh
+++ b/slurm/controller/docker-entrypoint.sh
@@ -4,6 +4,18 @@ set -e
 # Determine the system architecture dynamically
 ARCH=$(uname -m)
 
+_delete_secrets() {
+    if [ -f /.secret/munge.key ]; then
+        echo "Removing secrets"
+        sudo rm -rf /.secret/munge.key
+        sudo rm -rf /.secret/worker-secret.tar.gz
+        sudo rm -rf /.secret/setup-worker-ssh.sh
+
+        echo "Done removing secrets"
+        ls /.secret/
+    fi
+}
+
 # start sshd server
 _sshd_host() {
     if [ ! -d /var/run/sshd ]; then
@@ -90,9 +102,17 @@ _slurmctld() {
     done
     echo ""
     mkdir -p /var/spool/slurm/ctld /var/spool/slurm/d /var/log/slurm /etc/slurm
-    chown -R slurm: /var/spool/slurm/ctld /var/spool/slurm/d /var/log/slurm
+    chown -R slurm: /var/spool/slurm/ctld /var/spool/slurm/d /var/log/slurm /var/spool /var/lib
     touch /var/log/slurmctld.log
     chown slurm: /var/log/slurmctld.log
+    touch /var/log/slurmd.log
+    chown slurm: /var/log/slurmd.log
+
+    # touch /var/run/slurm/d/slurmctld.pid
+    # chown slurm: /var/run/slurm/d/slurmctld.pid
+    # touch /var/run/slurm/d/slurmd.pid
+    # chown slurm: /var/run/slurm/d/slurmd.pid
+
     if [[ ! -f /home/config/slurm.conf ]]; then
         echo "### Missing slurm.conf ###"
         exit
@@ -103,15 +123,25 @@
         chmod 600 /etc/slurm/slurm.conf
     fi
 
-    sacctmgr -i add cluster "snowflake"
+    sudo yum install -y nc
+    sudo yum install -y procps
+    sudo yum install -y iputils
+
+    while ! nc -z slurmdbd 6819; do
+        echo "Waiting for slurmdbd to be ready..."
+        sleep 2
+    done
+
+    sacctmgr -i add cluster name=linux
     sleep 2s
     echo "Starting slurmctld"
     cp -f /etc/slurm/slurm.conf /.secret/
-    /usr/sbin/slurmctld
+    /usr/sbin/slurmctld -Dvv
     echo "Started slurmctld"
 }
 
 ### main ###
+_delete_secrets
 _sshd_host
 _ssh_worker
 _munge_start
diff --git a/slurm/controller/slurm.conf b/slurm/controller/slurm.conf
index 6a9a393..7a55ff6 100644
--- a/slurm/controller/slurm.conf
+++ b/slurm/controller/slurm.conf
@@ -15,13 +15,13 @@ SlurmdPort=6818
 AuthType=auth/munge
 #JobCredentialPrivateKey=
 #JobCredentialPublicCertificate=
-StateSaveLocation=/var/lib/slurmd
-SlurmdSpoolDir=/var/spool/slurmd
+StateSaveLocation=/var/lib/slurm/d
+SlurmdSpoolDir=/var/spool/slurm/d
 SwitchType=switch/none
 MpiDefault=none
-SlurmctldPidFile=/var/run/slurmd/slurmctld.pid
-SlurmdPidFile=/var/run/slurmd/slurmd.pid
-# ProctrackType=proctrack/linuxproc
+SlurmctldPidFile=/var/run/slurm/d/slurmctld.pid
+SlurmdPidFile=/var/run/slurm/d/slurmd.pid
+ProctrackType=proctrack/linuxproc
 #PluginDir=
 #CacheGroups=0
 #FirstJobId=
@@ -79,7 +79,7 @@ JobCompLoc=/var/log/slurm/jobcomp.log
 # ACCOUNTING
 #JobAcctGatherType=jobacct_gather/linux
 JobAcctGatherType=jobacct_gather/cgroup
-ProctrackType=proctrack/cgroup
+# ProctrackType=proctrack/cgroup
 JobAcctGatherFrequency=30
 #
@@ -97,7 +97,7 @@ PartitionName=debug Nodes=c[1-2] Default=YES MaxTime=INFINITE State=UP
 #
 # COMPUTE NODES
 # NodeName=c[1-2] RealMemory=1000 State=UNKNOWN
-NodeName=c[1-2] CPUs=12 Boards=1 SocketsPerBoard=2 CoresPerSocket=3 ThreadsPerCore=2
+NodeName=node01 CPUs=2 Boards=1 SocketsPerBoard=2 CoresPerSocket=1 ThreadsPerCore=1
 #
 #
 #
 # PARTITIONS
diff --git a/slurm/worker/docker-entrypoint.sh b/slurm/worker/docker-entrypoint.sh
index 299132b..db691bc 100755
--- a/slurm/worker/docker-entrypoint.sh
+++ b/slurm/worker/docker-entrypoint.sh
@@ -15,6 +15,10 @@ _sshd_host() {
 
 # start munge using existing key
 _munge_start_using_key() {
+    sudo yum install -y nc
+    sudo yum install -y procps
+    sudo yum install -y iputils
+
     echo -n "checking for munge.key"
     while [ ! -f /.secret/munge.key ]; do
         echo -n "."
@@ -35,50 +39,58 @@
 
 # wait for worker user in shared /home volume
 _wait_for_worker() {
+    echo "checking for id_rsa.pub"
     if [ ! -f /home/worker/.ssh/id_rsa.pub ]; then
-        echo -n "checking for id_rsa.pub"
+        echo "checking for id_rsa.pub"
         while [ ! -f /home/worker/.ssh/id_rsa.pub ]; do
             echo -n "."
             sleep 1
         done
         echo ""
     fi
+    echo "done checking for id_rsa.pub"
+
 }
 
 _start_dbus() {
-    dbus-uuidgen > /var/lib/dbus/machine-id
-    mkdir -p /var/run/dbus
-    dbus-daemon --config-file=/usr/share/dbus-1/system.conf --print-address
+    dbus-uuidgen >/var/lib/dbus/machine-id
+    mkdir -p /var/run/dbus
+    dbus-daemon --config-file=/usr/share/dbus-1/system.conf --print-address
 }
 
 # run slurmd
 _slurmd() {
-    cd /root/rpmbuild/RPMS/$ARCH
-    yum -y --nogpgcheck localinstall slurm-22.05.6-1.el8.$ARCH.rpm \
-        slurm-perlapi-22.05.6-1.el8.$ARCH.rpm \
-        slurm-slurmd-22.05.6-1.el8.$ARCH.rpm \
-        slurm-torque-22.05.6-1.el8.$ARCH.rpm
-    if [ ! -f /.secret/slurm.conf ]; then
-        echo -n "checking for slurm.conf"
-        while [ ! -f /.secret/slurm.conf ]; do
-            echo -n "."
-            sleep 1
-        done
-        echo ""
-    fi
-    mkdir -p /var/spool/slurm/d /etc/slurm
-    chown slurm: /var/spool/slurm/d
-    cp /home/config/cgroup.conf /etc/slurm/cgroup.conf
-    chown slurm: /etc/slurm/cgroup.conf
-    chmod 600 /etc/slurm/cgroup.conf
-    cp /home/config/slurm.conf /etc/slurm/slurm.conf
-    chown slurm: /etc/slurm/slurm.conf
-    chmod 600 /etc/slurm/slurm.conf
-    touch /var/log/slurmd.log
-    chown slurm: /var/log/slurmd.log
-    echo -n "Starting slurmd"
-    /usr/sbin/slurmd
-    echo -n "Started slurmd"
+    cd /root/rpmbuild/RPMS/$ARCH
+    yum -y --nogpgcheck localinstall slurm-22.05.6-1.el8.$ARCH.rpm \
+        slurm-perlapi-22.05.6-1.el8.$ARCH.rpm \
+        slurm-slurmd-22.05.6-1.el8.$ARCH.rpm \
+        slurm-torque-22.05.6-1.el8.$ARCH.rpm
+
+    echo "checking for slurm.conf"
+    if [ ! -f /.secret/slurm.conf ]; then
+        echo "checking for slurm.conf"
+        while [ ! -f /.secret/slurm.conf ]; do
+            echo -n "."
+            sleep 1
+        done
+        echo ""
+    fi
+    echo "found slurm.conf"
+
+    mkdir -p /var/spool/slurm/d /etc/slurm
+    chown slurm: /var/spool/slurm/d
+    cp /home/config/cgroup.conf /etc/slurm/cgroup.conf
+    chown slurm: /etc/slurm/cgroup.conf
+    chmod 600 /etc/slurm/cgroup.conf
+    cp /home/config/slurm.conf /etc/slurm/slurm.conf
+    chown slurm: /etc/slurm/slurm.conf
+    chmod 600 /etc/slurm/slurm.conf
+    touch /var/log/slurmd.log
+    chown slurm: /var/log/slurmd.log
+
+    echo "Starting slurmd"
+    /usr/sbin/slurmd -Dvv
+    echo "Started slurmd"
 }
 
 ### main ###
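
Notes on the patch (below the diff, so they are not part of what gets applied):

The controller entrypoint change replaces the compose-level dependency of slurmctld
on slurmdbd with an explicit readiness wait: the controller polls slurmdbd's TCP
port with nc before registering the cluster, since depends_on only orders container
startup and says nothing about whether the daemon inside is ready. A minimal
standalone sketch of the same wait-then-register step (service name, port, and
sacctmgr call taken from the patch; the bounded retry count is an illustrative
addition, not part of the patch):

    #!/bin/bash
    # Poll until slurmdbd accepts TCP connections, then register the cluster.
    # The retry cap is hypothetical: it turns "slurmdbd never came up" into a
    # hard failure instead of an endless loop.
    tries=0
    until nc -z slurmdbd 6819; do
        tries=$((tries + 1))
        if [ "$tries" -ge 60 ]; then
            echo "slurmdbd unreachable after ${tries} attempts" >&2
            exit 1
        fi
        echo "Waiting for slurmdbd to be ready..."
        sleep 2
    done
    sacctmgr -i add cluster name=linux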
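
The slurm.conf changes move the state, spool, and PID paths under /var/lib/slurm/d,
/var/spool/slurm/d, and /var/run/slurm/d, and switch process tracking to
proctrack/linuxproc. One way to confirm that the running daemons actually picked up
the new values (a suggested check, not part of the patch):

    #!/bin/bash
    # Ask the live controller for its effective configuration and filter
    # the settings this patch touches.
    docker compose exec slurmctld scontrol show config |
        grep -Ei 'statesavelocation|slurmdspooldir|pidfile|proctracktype'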
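
After docker compose up, the reordered stack can be smoke-tested from the controller
container (service names as defined in docker-compose.yml above; these commands are
a suggested check, not part of the patch):

    #!/bin/bash
    # The controller should answer once slurmctld is up.
    docker compose exec slurmctld scontrol ping
    # node01 should be listed (and eventually go idle) once slurmd registers.
    docker compose exec slurmctld sinfo
    # The cluster registered by the entrypoint should be known to the dbd.
    docker compose exec slurmctld sacctmgr -n list cluster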