diff --git a/.env b/.env index 3fe251d..4e9aa63 100644 --- a/.env +++ b/.env @@ -2,6 +2,15 @@ # CCBACKEND DEVEL DOCKER SETTINGS ######################################################################## +######################################################################## +# SLURM +######################################################################## +SLURM_VERSION=22.05.6 +ARCH=aarch64 +MUNGE_UID=981 +SLURM_UID=982 +WORKER_UID=1000 + ######################################################################## # INFLUXDB ######################################################################## diff --git a/docker-compose.yml b/docker-compose.yml index 36c5c3d..345f60d 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -52,12 +52,13 @@ services: command: ["--default-authentication-plugin=mysql_native_password"] environment: MARIADB_ROOT_PASSWORD: ${MARIADB_ROOT_PASSWORD} - MARIADB_DATABASE: ${MARIADB_DATABASE} - MARIADB_USER: ${MARIADB_USER} - MARIADB_PASSWORD: ${MARIADB_PASSWORD} + MARIADB_DATABASE: slurm_acct_db + MARIADB_USER: slurm + MARIADB_PASSWORD: demo ports: - "127.0.0.1:${MARIADB_PORT}:3306" - # volumes: + volumes: + - ${DATADIR}/mariadb:/etc/mysql/conf.d # - ${DATADIR}/sql-init:/docker-entrypoint-initdb.d cap_add: - SYS_NICE @@ -81,72 +82,47 @@ services: slurm-controller: container_name: slurmctld + hostname: slurmctld build: context: ./slurm/controller privileged: true volumes: - - ./home:/home - - ./secret:/.secret - restart: always - environment: - USE_SLURMDBD: 'true' - CLUSTER_NAME: snowflake - CONTROL_MACHINE: controller - SLURMCTLD_PORT: 6817 - SLURMD_PORT: 6818 - ACCOUNTING_STORAGE_HOST: database - ACCOUNTING_STORAGE_PORT: 6819 - COMPUTE_NODES: worker01 worker02 - PARTITION_NAME: docker + - ${DATADIR}/slurm/home:/home + - ${DATADIR}/slurm/secret:/.secret slurm-database: container_name: slurmdb + hostname: slurmdb build: context: ./slurm/database depends_on: + - mariadb - slurm-controller privileged: true volumes: - - ./home:/home - - 
./secret:/.secret - restart: always - environment: - DBD_ADDR: database - DBD_HOST: database - DBD_PORT: 6819 - STORAGE_HOST: database.local.dev - STORAGE_PORT: 3306 - STORAGE_PASS: password - STORAGE_USER: slurm + - ${DATADIR}/slurm/home:/home + - ${DATADIR}/slurm/secret:/.secret slurm-worker01: container_name: node01 + hostname: node01 build: context: ./slurm/worker depends_on: - slurm-controller privileged: true volumes: - - ./home:/home - - ./secret:/.secret - restart: always - environment: - CONTROL_MACHINE: controller - ACCOUNTING_STORAGE_HOST: database - COMPUTE_NODES: worker01 worker02 + - ${DATADIR}/slurm/home:/home + - ${DATADIR}/slurm/secret:/.secret # slurm-worker02: - # container_name: slurm-worker02 + # container_name: node02 + # hostname: node02 # build: # context: ./slurm/worker # depends_on: # - slurm-controller # privileged: true # volumes: - # - ./home:/home - # - ./secret:/.secret - # restart: always - # environment: - # CONTROL_MACHINE: controller - # ACCOUNTING_STORAGE_HOST: database - # COMPUTE_NODES: worker01 worker02 + # - ${DATADIR}/slurm/home:/home + # - ${DATADIR}/slurm/secret:/.secret diff --git a/env-template.txt b/env-template.txt index 2cb9b12..3bdeb8f 100644 --- a/env-template.txt +++ b/env-template.txt @@ -1,5 +1,5 @@ -ENV SLURM_VERSION=19.05.1 \ - MUNGE_UID=981 \ - SLURM_UID=982 \ - WORKER_UID=1000 - +SLURM_VERSION=22.05.6 +ARCH=aarch64 +MUNGE_UID=981 +SLURM_UID=982 +WORKER_UID=1000 diff --git a/slurm/base/Dockerfile b/slurm/base/Dockerfile index 0e225c8..a006cc2 100644 --- a/slurm/base/Dockerfile +++ b/slurm/base/Dockerfile @@ -18,6 +18,7 @@ RUN dnf --enablerepo=powertools install munge-devel -y RUN yum install rng-tools -y RUN yum install -y python3 gcc openssl openssl-devel \ +openssh-server openssh-clients dbus-devel \ pam-devel numactl numactl-devel hwloc sudo \ lua readline-devel ncurses-devel man2html \ libibmad libibumad rpm-build perl-ExtUtils-MakeMaker.noarch rpm-build make wget @@ -42,4 +43,4 @@ VOLUME ["/home", 
"/.secret"] # 6817: SlurmCtlD # 6818: SlurmD # 6819: SlurmDBD -EXPOSE 22 3306 6817 6818 6819 +EXPOSE 22 6817 6818 6819 diff --git a/slurm/base/Makefile b/slurm/base/Makefile index a91b59a..dc0dff3 100644 --- a/slurm/base/Makefile +++ b/slurm/base/Makefile @@ -1,4 +1,5 @@ -IMAGE = scidas/slurm.base +include ../../.env +IMAGE = clustercockpit/slurm.base .PHONY: build clean diff --git a/slurm/controller/Dockerfile b/slurm/controller/Dockerfile index 192b554..b627826 100644 --- a/slurm/controller/Dockerfile +++ b/slurm/controller/Dockerfile @@ -1,4 +1,4 @@ -FROM clustercockpit/slurm.base:latest +FROM clustercockpit/slurm.base:22.05.6 MAINTAINER Jan Eitzinger # clean up diff --git a/slurm/controller/docker-entrypoint.sh b/slurm/controller/docker-entrypoint.sh index 301ff03..75e36db 100755 --- a/slurm/controller/docker-entrypoint.sh +++ b/slurm/controller/docker-entrypoint.sh @@ -7,6 +7,7 @@ _sshd_host() { mkdir /var/run/sshd ssh-keygen -t rsa -f /etc/ssh/ssh_host_rsa_key -N '' fi + echo "Starting sshd" /usr/sbin/sshd } @@ -40,6 +41,7 @@ EOF2 # start munge and generate key _munge_start() { + echo "Starting munge" chown -R munge: /etc/munge /var/lib/munge /var/log/munge /var/run/munge chmod 0700 /etc/munge chmod 0711 /var/lib/munge @@ -69,8 +71,12 @@ _copy_secrets() { # run slurmctld _slurmctld() { cd /root/rpmbuild/RPMS/aarch64 - yum -y --nogpgcheck localinstall slurm-22.05.6-1.el8.aarch64.rpm slurm-perlapi-22.05.6-1.el8.aarch64.rpm slurm-slurmctld-22.05.6-1.el8.aarch64.rpm - echo -n "checking for slurmdbd.conf" + yum -y --nogpgcheck localinstall slurm-22.05.6-1.el8.aarch64.rpm \ + slurm-perlapi-22.05.6-1.el8.aarch64.rpm \ + slurm-slurmd-22.05.6-1.el8.aarch64.rpm \ + slurm-torque-22.05.6-1.el8.aarch64.rpm \ + slurm-slurmctld-22.05.6-1.el8.aarch64.rpm + echo "checking for slurmdbd.conf" while [ ! -f /.secret/slurmdbd.conf ]; do echo -n "." 
sleep 1 @@ -86,11 +92,14 @@ _slurmctld() { else echo "### use provided slurm.conf ###" cp /home/config/slurm.conf /etc/slurm/slurm.conf + chown slurm: /etc/slurm/slurm.conf + chmod 600 /etc/slurm/slurm.conf fi sacctmgr -i add cluster "snowflake" sleep 2s - /usr/sbin/slurmctld + echo "Starting slurmctld" cp -f /etc/slurm/slurm.conf /.secret/ + /usr/sbin/slurmctld } ### main ### diff --git a/slurm/database/Dockerfile b/slurm/database/Dockerfile index 192b554..b627826 100644 --- a/slurm/database/Dockerfile +++ b/slurm/database/Dockerfile @@ -1,4 +1,4 @@ -FROM clustercockpit/slurm.base:latest +FROM clustercockpit/slurm.base:22.05.6 MAINTAINER Jan Eitzinger # clean up diff --git a/slurm/database/docker-entrypoint.sh b/slurm/database/docker-entrypoint.sh index 4ed8d94..97aff4e 100755 --- a/slurm/database/docker-entrypoint.sh +++ b/slurm/database/docker-entrypoint.sh @@ -12,32 +12,6 @@ _sshd_host() { /usr/sbin/sshd } -# slurm database user settings -_slurm_acct_db() { - { - echo "create database slurm_acct_db;" - echo "create user 'slurm'@slurmdb'';" - echo "set password for 'slurm'@'slurmdb' = password('demo');" - echo "grant usage on *.* to 'slurm'@'slurmdb';" - echo "grant all privileges on slurm_acct_db.* to 'slurm'@'slurmdb';" - echo "flush privileges;" - } >> $SLURM_ACCT_DB_SQL -} - -# start database -_mariadb_start() { - # mariadb somehow expects `resolveip` to be found under this path; see https://github.com/SciDAS/slurm-in-docker/issues/26 - ln -s /usr/bin/resolveip /usr/libexec/resolveip - mysql_install_db - chown -R mysql: /var/lib/mysql/ /var/log/mariadb/ /var/run/mariadb - cd /var/lib/mysql - mysqld_safe --user=mysql & - cd / - _slurm_acct_db - sleep 5s - mysql -uroot < $SLURM_ACCT_DB_SQL -} - # start munge using existing key _munge_start_using_key() { if [ ! 
-f /.secret/munge.key ]; then @@ -74,24 +48,28 @@ _wait_for_worker() { # run slurmdbd _slurmdbd() { - mkdir -p /var/spool/slurm/d \ - /var/log/slurm - chown slurm: /var/spool/slurm/d \ - /var/log/slurm + cd /root/rpmbuild/RPMS/aarch64 + yum -y --nogpgcheck localinstall slurm-22.05.6-1.el8.aarch64.rpm \ + slurm-perlapi-22.05.6-1.el8.aarch64.rpm \ + slurm-slurmdbd-22.05.6-1.el8.aarch64.rpm + mkdir -p /var/spool/slurm/d /var/log/slurm /etc/slurm + chown slurm: /var/spool/slurm/d /var/log/slurm if [[ ! -f /home/config/slurmdbd.conf ]]; then echo "### Missing slurmdbd.conf ###" exit else echo "### use provided slurmdbd.conf ###" cp /home/config/slurmdbd.conf /etc/slurm/slurmdbd.conf + chown slurm: /etc/slurm/slurmdbd.conf + chmod 600 /etc/slurm/slurmdbd.conf fi - /usr/sbin/slurmdbd + echo "Starting slurmdbd" cp /etc/slurm/slurmdbd.conf /.secret/slurmdbd.conf + /usr/sbin/slurmdbd } ### main ### _sshd_host -_mariadb_start _munge_start_using_key _wait_for_worker _slurmdbd diff --git a/slurm/rest/Dockerfile b/slurm/rest/Dockerfile new file mode 100644 index 0000000..b627826 --- /dev/null +++ b/slurm/rest/Dockerfile @@ -0,0 +1,10 @@ +FROM clustercockpit/slurm.base:22.05.6 +MAINTAINER Jan Eitzinger + +# clean up +RUN rm -f /root/rpmbuild/RPMS/slurm-*.rpm \ + && yum clean all \ + && rm -rf /var/cache/yum + +COPY docker-entrypoint.sh /docker-entrypoint.sh +ENTRYPOINT ["/docker-entrypoint.sh"] diff --git a/slurm/rest/docker-entrypoint.sh b/slurm/rest/docker-entrypoint.sh new file mode 100755 index 0000000..6ef6bcb --- /dev/null +++ b/slurm/rest/docker-entrypoint.sh @@ -0,0 +1,108 @@ +#!/usr/bin/env bash +set -e + +# start sshd server +_sshd_host() { + if [ ! -d /var/run/sshd ]; then + mkdir /var/run/sshd + ssh-keygen -t rsa -f /etc/ssh/ssh_host_rsa_key -N '' + fi + /usr/sbin/sshd +} + +# setup worker ssh to be passwordless +_ssh_worker() { + if [[ ! 
-d /home/worker ]]; then + mkdir -p /home/worker + chown -R worker:worker /home/worker + fi + cat > /home/worker/setup-worker-ssh.sh < ~/.ssh/authorized_keys +chmod 0640 ~/.ssh/authorized_keys +cat >> ~/.ssh/config < /etc/munge/munge.key" + chown munge: /etc/munge/munge.key + chmod 400 /etc/munge/munge.key + sudo -u munge /sbin/munged + munge -n + munge -n | unmunge + remunge +} + +# copy secrets to /.secret directory for other nodes +_copy_secrets() { + cp /home/worker/worker-secret.tar.gz /.secret/worker-secret.tar.gz + cp /home/worker/setup-worker-ssh.sh /.secret/setup-worker-ssh.sh + cp /etc/munge/munge.key /.secret/munge.key + rm -f /home/worker/worker-secret.tar.gz + rm -f /home/worker/setup-worker-ssh.sh +} + +# run slurmctld +_slurmctld() { + cd /root/rpmbuild/RPMS/aarch64 + yum -y --nogpgcheck localinstall slurm-22.05.6-1.el8.aarch64.rpm \ + slurm-perlapi-22.05.6-1.el8.aarch64.rpm \ + slurm-slurmd-22.05.6-1.el8.aarch64.rpm \ + slurm-torque-22.05.6-1.el8.aarch64.rpm \ + slurm-slurmctld-22.05.6-1.el8.aarch64.rpm \ + slurm-slurmrestd-22.05.6-1.el8.aarch64.rpm + echo -n "checking for slurmdbd.conf" + while [ ! -f /.secret/slurmdbd.conf ]; do + echo -n "." + sleep 1 + done + echo "" + mkdir -p /var/spool/slurm/ctld /var/spool/slurm/d /var/log/slurm /etc/slurm + chown -R slurm: /var/spool/slurm/ctld /var/spool/slurm/d /var/log/slurm + touch /var/log/slurmctld.log + chown slurm: /var/log/slurmctld.log + if [[ ! 
-f /home/config/slurm.conf ]]; then + echo "### Missing slurm.conf ###" + exit + else + echo "### use provided slurm.conf ###" + cp /home/config/slurm.conf /etc/slurm/slurm.conf + fi + sacctmgr -i add cluster "snowflake" + sleep 2s + /usr/sbin/slurmctld + cp -f /etc/slurm/slurm.conf /.secret/ +} + +### main ### +_sshd_host +_ssh_worker +_munge_start +_copy_secrets +_slurmctld + +tail -f /dev/null diff --git a/slurm/worker/Dockerfile b/slurm/worker/Dockerfile index 1a86561..b615be5 100644 --- a/slurm/worker/Dockerfile +++ b/slurm/worker/Dockerfile @@ -1,4 +1,4 @@ -FROM clustercockpit/slurm.base:latest +FROM clustercockpit/slurm.base:22.05.6 MAINTAINER Jan Eitzinger # clean up diff --git a/slurm/worker/docker-entrypoint.sh b/slurm/worker/docker-entrypoint.sh index a0fe425..12ecf3e 100755 --- a/slurm/worker/docker-entrypoint.sh +++ b/slurm/worker/docker-entrypoint.sh @@ -33,7 +33,7 @@ _munge_start_using_key() { # wait for worker user in shared /home volume _wait_for_worker() { if [ ! -f /home/worker/.ssh/id_rsa.pub ]; then - echo -n "cheking for id_rsa.pub" + echo -n "checking for id_rsa.pub" while [ ! -f /home/worker/.ssh/id_rsa.pub ]; do echo -n "." sleep 1 @@ -42,28 +42,46 @@ _wait_for_worker() { fi } +_start_dbus() { + dbus-uuidgen > /var/lib/dbus/machine-id + mkdir -p /var/run/dbus + dbus-daemon --config-file=/usr/share/dbus-1/system.conf --print-address +} + # run slurmd _slurmd() { - if [ ! -f /.secret/slurm.conf ]; then - echo -n "checking for slurm.conf" - while [ ! -f /.secret/slurm.conf ]; do - echo -n "." 
- sleep 1 - done - echo "" - fi - mkdir -p /var/spool/slurm/d - chown slurm: /var/spool/slurm/d - cp /.secret/slurm.conf /etc/slurm/slurm.conf - touch /var/log/slurmd.log - chown slurm: /var/log/slurmd.log - /usr/sbin/slurmd + cd /root/rpmbuild/RPMS/aarch64 + yum -y --nogpgcheck localinstall slurm-22.05.6-1.el8.aarch64.rpm \ + slurm-perlapi-22.05.6-1.el8.aarch64.rpm \ + slurm-slurmd-22.05.6-1.el8.aarch64.rpm \ + slurm-torque-22.05.6-1.el8.aarch64.rpm + if [ ! -f /.secret/slurm.conf ]; then + echo -n "checking for slurm.conf" + while [ ! -f /.secret/slurm.conf ]; do + echo -n "." + sleep 1 + done + echo "" + fi + mkdir -p /var/spool/slurm/d /etc/slurm + chown slurm: /var/spool/slurm/d + cp /home/config/cgroup.conf /etc/slurm/cgroup.conf + chown slurm: /etc/slurm/cgroup.conf + chmod 600 /etc/slurm/cgroup.conf + cp /home/config/slurm.conf /etc/slurm/slurm.conf + chown slurm: /etc/slurm/slurm.conf + chmod 600 /etc/slurm/slurm.conf + touch /var/log/slurmd.log + chown slurm: /var/log/slurmd.log + echo -n "Starting slurmd" + /usr/sbin/slurmd } ### main ### _sshd_host _munge_start_using_key _wait_for_worker +_start_dbus _slurmd tail -f /dev/null