diff --git a/docker-compose.yml b/docker-compose.yml index e5e811f..56c06e6 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -131,34 +131,34 @@ services: STORAGE_PASS: password STORAGE_USER: slurm - slurm-worker01: - container_name: slurm-worker01 - build: - context: ./slurm/worker - depends_on: - - slurm-controller - privileged: true - volumes: - - ./home:/home - - ./secret:/.secret - restart: always - environment: - CONTROL_MACHINE: controller - ACCOUNTING_STORAGE_HOST: database - COMPUTE_NODES: worker01 worker02 + # slurm-worker01: + # container_name: slurm-worker01 + # build: + # context: ./slurm/worker + # depends_on: + # - slurm-controller + # privileged: true + # volumes: + # - ./home:/home + # - ./secret:/.secret + # restart: always + # environment: + # CONTROL_MACHINE: controller + # ACCOUNTING_STORAGE_HOST: database + # COMPUTE_NODES: worker01 worker02 - slurm-worker02: - container_name: slurm-worker02 - build: - context: ./slurm/worker - depends_on: - - slurm-controller - privileged: true - volumes: - - ./home:/home - - ./secret:/.secret - restart: always - environment: - CONTROL_MACHINE: controller - ACCOUNTING_STORAGE_HOST: database - COMPUTE_NODES: worker01 worker02 + # slurm-worker02: + # container_name: slurm-worker02 + # build: + # context: ./slurm/worker + # depends_on: + # - slurm-controller + # privileged: true + # volumes: + # - ./home:/home + # - ./secret:/.secret + # restart: always + # environment: + # CONTROL_MACHINE: controller + # ACCOUNTING_STORAGE_HOST: database + # COMPUTE_NODES: worker01 worker02 diff --git a/slurm/base/Dockerfile b/slurm/base/Dockerfile deleted file mode 100644 index 6073bdb..0000000 --- a/slurm/base/Dockerfile +++ /dev/null @@ -1,46 +0,0 @@ -FROM krallin/centos-tini:7 -MAINTAINER Michael J. 
Stealey - -ENV SLURM_VERSION=19.05.1 \ - MUNGE_UID=981 \ - SLURM_UID=982 \ - WORKER_UID=1000 - -RUN groupadd -g $MUNGE_UID munge \ - && useradd -m -c "MUNGE Uid 'N' Gid Emporium" -d /var/lib/munge -u $MUNGE_UID -g munge -s /sbin/nologin munge \ - && groupadd -g $SLURM_UID slurm \ - && useradd -m -c "Slurm workload manager" -d /var/lib/slurm -u $SLURM_UID -g slurm -s /bin/bash slurm \ - && groupadd -g $WORKER_UID worker \ - && useradd -m -c "Workflow user" -d /home/worker -u $WORKER_UID -g worker -s /bin/bash worker - -# install packages for general functionality -RUN yum -y install \ - epel-release \ - && yum -y install \ - sudo \ - wget \ - which \ - tree \ - mariadb-server \ - mariadb-devel \ - munge \ - munge-libs \ - munge-devel \ - openssh-server \ - openssh-clients - -# install slurm 19.05.1 -COPY rpms /packages -# /usr/bin/mpiexec from slurm-torque conflicts with openmpi install -WORKDIR /packages -RUN yum -y localinstall $(ls | grep -v -e 'torque' -e 'openmpi') -WORKDIR / - -VOLUME ["/home", "/.secret"] - -# 22: SSH -# 3306: MariaDB -# 6817: Slurm Ctl D -# 6818: Slurm D -# 6819: Slurm DBD -EXPOSE 22 3306 6817 6818 6819 diff --git a/slurm/base/Makefile b/slurm/base/Makefile deleted file mode 100644 index ff5859c..0000000 --- a/slurm/base/Makefile +++ /dev/null @@ -1,17 +0,0 @@ - -SLURM_VERSION = 19.05.1 -IMAGE = scidas/slurm.base - -DIR := $(shell dirname $(realpath $(lastword $(MAKEFILE_LIST)))) - -.PHONY: all build clean test - -all: build - -build: - cp -r "$(DIR)/../packages/centos-7/rpms" . - docker build -t $(IMAGE):$(SLURM_VERSION) . 
- -clean: - @[ -z $(docker images -q $(IMAGE):$(SLURM_VERSION)) ] || docker rmi $(IMAGE):$(SLURM_VERSION) - rm -rf rpms diff --git a/slurm/base/README.md b/slurm/base/README.md deleted file mode 100644 index 4b21b62..0000000 --- a/slurm/base/README.md +++ /dev/null @@ -1,3 +0,0 @@ -# Slurm Base Image - -TODO diff --git a/slurm/controller/Dockerfile b/slurm/controller/Dockerfile index 31cb001..14d6066 100644 --- a/slurm/controller/Dockerfile +++ b/slurm/controller/Dockerfile @@ -1,31 +1,35 @@ -FROM scidas/slurm.base:19.05.1 -MAINTAINER Michael J. Stealey +FROM rockylinux:8 +LABEL maintainer="Jan Eitzinger" -# install openmpi 3.0.1 -RUN yum -y install \ - gcc-c++ \ - gcc-gfortran \ - && yum -y localinstall \ - /packages/openmpi-*.rpm +ENV SLURM_VERSION=22.05.6 \ + MUNGE_UID=981 \ + SLURM_UID=982 \ + WORKER_UID=1000 -# install Lmod 7.7 -RUN yum -y install \ - lua-posix \ - lua \ - lua-filesystem \ - lua-devel \ - wget \ - bzip2 \ - expectk \ - make \ - && wget https://sourceforge.net/projects/lmod/files/Lmod-7.7.tar.bz2 \ - && tar -xjvf Lmod-7.7.tar.bz2 -WORKDIR /Lmod-7.7 -RUN ./configure --prefix=/opt/apps \ - && make install \ - && ln -s /opt/apps/lmod/lmod/init/profile /etc/profile.d/z00_lmod.sh \ - && ln -s /opt/apps/lmod/lmod/init/cshrc /etc/profile.d/z00_lmod.csh -WORKDIR / +RUN yum install https://dl.fedoraproject.org/pub/epel/epel-release-latest-8.noarch.rpm -y + +RUN groupadd -g $MUNGE_UID munge \ + && useradd -m -c "MUNGE Uid 'N' Gid Emporium" -d /var/lib/munge -u $MUNGE_UID -g munge -s /sbin/nologin munge \ + && groupadd -g $SLURM_UID slurm \ + && useradd -m -c "Slurm workload manager" -d /var/lib/slurm -u $SLURM_UID -g slurm -s /bin/bash slurm \ + && groupadd -g $WORKER_UID worker \ + && useradd -m -c "Workflow user" -d /home/worker -u $WORKER_UID -g worker -s /bin/bash worker + +RUN yum install -y munge munge-libs +RUN dnf --enablerepo=powertools install munge-devel -y +RUN yum install rng-tools -y + +RUN yum install -y python3 gcc openssl openssl-devel \ 
+pam-devel numactl numactl-devel hwloc sudo \ +lua readline-devel ncurses-devel man2html \ +libibmad libibumad rpm-build perl-ExtUtils-MakeMaker.noarch make wget + +RUN dnf --enablerepo=powertools install rrdtool-devel lua-devel hwloc-devel rpm-build -y +RUN dnf install mariadb-server mariadb-devel -y +# WORKDIR replaces the no-op 'RUN cd': each RUN starts a fresh shell, so the cd never carried over +WORKDIR /usr/local/slurm-tmp +RUN wget https://download.schedmd.com/slurm/slurm-22.05.6.tar.bz2 && rpmbuild -ta slurm-22.05.6.tar.bz2 +WORKDIR / ENV USE_SLURMDBD=true \ CLUSTER_NAME=snowflake \ @@ -36,12 +40,12 @@ ENV USE_SLURMDBD=true \ ACCOUNTING_STORAGE_PORT=6819 \ PARTITION_NAME=docker -# clean up -RUN rm -f /packages/slurm-*.rpm /packages/openmpi-*.rpm \ - && yum clean all \ - && rm -rf /var/cache/yum \ - && rm -f /Lmod-7.7.tar.bz2 - COPY docker-entrypoint.sh /docker-entrypoint.sh - -ENTRYPOINT ["/usr/local/bin/tini", "--", "/docker-entrypoint.sh"] +VOLUME ["/home", "/.secret"] +# 22: SSH +# 3306: MariaDB +# 6817: SlurmCtlD +# 6818: SlurmD +# 6819: SlurmDBD +EXPOSE 22 3306 6817 6818 6819 +ENTRYPOINT ["/docker-entrypoint.sh"] diff --git a/slurm/controller/docker-entrypoint.sh b/slurm/controller/docker-entrypoint.sh index 3a58322..5aff60c 100755 --- a/slurm/controller/docker-entrypoint.sh +++ b/slurm/controller/docker-entrypoint.sh @@ -12,44 +12,49 @@ _sshd_host() { # setup worker ssh to be passwordless _ssh_worker() { - if [[ ! 
-d /home/worker ]]; then - mkdir -p /home/worker - chown -R worker:worker /home/worker - fi - cat > /home/worker/setup-worker-ssh.sh <<'EOF2' -mkdir -p ~/.ssh -chmod 0700 ~/.ssh -ssh-keygen -b 2048 -t rsa -f ~/.ssh/id_rsa -q -N "" -C "$(whoami)@$(hostname)-$(date -I)" -cat ~/.ssh/id_rsa.pub > ~/.ssh/authorized_keys -chmod 0640 ~/.ssh/authorized_keys -cat >> ~/.ssh/config < /home/worker/setup-worker-ssh.sh < ~/.ssh/authorized_keys + chmod 0640 ~/.ssh/authorized_keys + cat >> ~/.ssh/config < /etc/munge/munge.key" + chown munge: /etc/munge/munge.key + chmod 400 /etc/munge/munge.key + sudo -u munge /sbin/munged + munge -n + munge -n | unmunge + remunge } # copy secrets to /.secret directory for other nodes @@ -76,15 +81,10 @@ _generate_slurm_conf() { # ClusterName=$CLUSTER_NAME SlurmctldHost=$CONTROL_MACHINE -#SlurmctldHostr= -# SlurmUser=slurm -#SlurmdUser=root SlurmctldPort=$SLURMCTLD_PORT SlurmdPort=$SLURMD_PORT AuthType=auth/munge -#JobCredentialPrivateKey= -#JobCredentialPublicCertificate= StateSaveLocation=/var/spool/slurm/ctld SlurmdSpoolDir=/var/spool/slurm/d SwitchType=switch/none @@ -92,25 +92,7 @@ MpiDefault=none SlurmctldPidFile=/var/run/slurmctld.pid SlurmdPidFile=/var/run/slurmd.pid ProctrackType=proctrack/pgid -#PluginDir= -#FirstJobId= ReturnToService=0 -#MaxJobCount= -#PlugStackConfig= -#PropagatePrioProcess= -#PropagateResourceLimits= -#PropagateResourceLimitsExcept= -#Prolog= -#Epilog= -#SrunProlog= -#SrunEpilog= -#TaskProlog= -#TaskEpilog= -#TaskPlugin= -#TrackWCKey=no -#TreeWidth=50 -#TmpFS= -#UsePAM= # # TIMERS SlurmctldTimeout=300 @@ -122,17 +104,7 @@ Waittime=0 # # SCHEDULING SchedulerType=sched/backfill -#SchedulerAuth= -#SelectType=select/linear FastSchedule=1 -#PriorityType=priority/multifactor -#PriorityDecayHalfLife=14-0 -#PriorityUsageResetPeriod=14-0 -#PriorityWeightFairshare=100000 -#PriorityWeightAge=1000 -#PriorityWeightPartition=10000 -#PriorityWeightJobSize=1000 -#PriorityMaxAge=1-0 # # LOGGING SlurmctldDebug=3 @@ -161,33 +133,31 @@ 
EOF # run slurmctld _slurmctld() { - if $USE_SLURMDBD; then - echo -n "cheking for slurmdbd.conf" - while [ ! -f /.secret/slurmdbd.conf ]; do - echo -n "." - sleep 1 - done - echo "" - fi - mkdir -p /var/spool/slurm/ctld \ - /var/spool/slurm/d \ - /var/log/slurm - chown -R slurm: /var/spool/slurm/ctld \ - /var/spool/slurm/d \ - /var/log/slurm - touch /var/log/slurmctld.log - chown slurm: /var/log/slurmctld.log - if [[ ! -f /home/config/slurm.conf ]]; then - echo "### generate slurm.conf ###" - _generate_slurm_conf - else - echo "### use provided slurm.conf ###" - cp /home/config/slurm.conf /etc/slurm/slurm.conf - fi - sacctmgr -i add cluster "${CLUSTER_NAME}" - sleep 2s - /usr/sbin/slurmctld - cp -f /etc/slurm/slurm.conf /.secret/ + cd "/root/rpmbuild/RPMS/$(uname -m)" + yum -y --nogpgcheck localinstall slurm-22.05.6-1.el8.*.rpm slurm-perlapi-22.05.6-1.el8.*.rpm slurm-slurmctld-22.05.6-1.el8.*.rpm + if $USE_SLURMDBD; then + echo -n "checking for slurmdbd.conf" + while [ ! -f /.secret/slurmdbd.conf ]; do + echo -n "." + sleep 1 + done + echo "" + fi + mkdir -p /var/spool/slurm/ctld /var/spool/slurm/d /var/log/slurm /etc/slurm + chown -R slurm: /var/spool/slurm/ctld /var/spool/slurm/d /var/log/slurm + touch /var/log/slurmctld.log + chown slurm: /var/log/slurmctld.log + if [[ ! -f /home/config/slurm.conf ]]; then + echo "### generate slurm.conf ###" + _generate_slurm_conf + else + echo "### use provided slurm.conf ###" + cp /home/config/slurm.conf /etc/slurm/slurm.conf + fi + sacctmgr -i add cluster "${CLUSTER_NAME}" + sleep 2s + /usr/sbin/slurmctld + cp -f /etc/slurm/slurm.conf /.secret/ } ### main ### diff --git a/slurm/database/Dockerfile b/slurm/database/Dockerfile index a5c20f4..6e8db8d 100644 --- a/slurm/database/Dockerfile +++ b/slurm/database/Dockerfile @@ -1,5 +1,36 @@ -FROM scidas/slurm.base:19.05.1 -MAINTAINER Michael J. 
Stealey +FROM rockylinux:8 +LABEL maintainer="Jan Eitzinger" + +ENV SLURM_VERSION=22.05.6 \ + MUNGE_UID=981 \ + SLURM_UID=982 \ + WORKER_UID=1000 + +RUN yum install https://dl.fedoraproject.org/pub/epel/epel-release-latest-8.noarch.rpm -y + +RUN groupadd -g $MUNGE_UID munge \ + && useradd -m -c "MUNGE Uid 'N' Gid Emporium" -d /var/lib/munge -u $MUNGE_UID -g munge -s /sbin/nologin munge \ + && groupadd -g $SLURM_UID slurm \ + && useradd -m -c "Slurm workload manager" -d /var/lib/slurm -u $SLURM_UID -g slurm -s /bin/bash slurm \ + && groupadd -g $WORKER_UID worker \ + && useradd -m -c "Workflow user" -d /home/worker -u $WORKER_UID -g worker -s /bin/bash worker + +RUN yum install -y munge munge-libs +RUN dnf --enablerepo=powertools install munge-devel -y +RUN yum install rng-tools -y + +RUN yum install -y python3 gcc openssl openssl-devel \ +pam-devel numactl numactl-devel hwloc sudo \ +lua readline-devel ncurses-devel man2html \ +libibmad libibumad rpm-build perl-ExtUtils-MakeMaker.noarch make wget + +RUN dnf --enablerepo=powertools install rrdtool-devel lua-devel hwloc-devel rpm-build -y +RUN dnf install mariadb-server mariadb-devel -y +# WORKDIR replaces the no-op 'RUN cd': each RUN starts a fresh shell, so the cd never carried over +WORKDIR /usr/local/slurm-tmp +RUN wget https://download.schedmd.com/slurm/slurm-22.05.6.tar.bz2 && rpmbuild -ta slurm-22.05.6.tar.bz2 +WORKDIR / + ENV DBD_ADDR=database \ DBD_HOST=database \ @@ -9,11 +40,12 @@ ENV DBD_ADDR=database \ STORAGE_PASS=password \ STORAGE_USER=slurm -# clean up -RUN rm -f /packages/slurm-*.rpm /packages/openmpi-*.rpm \ - && yum clean all \ - && rm -rf /var/cache/yum - COPY docker-entrypoint.sh /docker-entrypoint.sh - -ENTRYPOINT ["/usr/local/bin/tini", "--", "/docker-entrypoint.sh"] +VOLUME ["/home", "/.secret"] +# 22: SSH +# 3306: MariaDB +# 6817: Slurm Ctl D +# 6818: Slurm D +# 6819: Slurm DBD +EXPOSE 22 3306 6817 6818 6819 +ENTRYPOINT ["/docker-entrypoint.sh"] diff --git a/slurm/database/docker-entrypoint.sh b/slurm/database/docker-entrypoint.sh index eb5c1a1..b011977 100755 --- 
a/slurm/database/docker-entrypoint.sh +++ b/slurm/database/docker-entrypoint.sh @@ -41,7 +41,7 @@ _mariadb_start() { # start munge using existing key _munge_start_using_key() { if [ ! -f /.secret/munge.key ]; then - echo -n "cheking for munge.key" + echo -n "checking for munge.key" while [ ! -f /.secret/munge.key ]; do echo -n "." sleep 1 @@ -63,7 +63,7 @@ _munge_start_using_key() { # wait for worker user in shared /home volume _wait_for_worker() { if [ ! -f /home/worker/.ssh/id_rsa.pub ]; then - echo -n "cheking for id_rsa.pub" + echo -n "checking for id_rsa.pub" while [ ! -f /home/worker/.ssh/id_rsa.pub ]; do echo -n "." sleep 1 diff --git a/slurm/slurm.conf b/slurm/slurm.conf new file mode 100644 index 0000000..d6c7dcb --- /dev/null +++ b/slurm/slurm.conf @@ -0,0 +1,48 @@ +# slurm.conf file generated by configurator.html. +# Put this file on all nodes of your cluster. +# See the slurm.conf man page for more information. +# +ClusterName=snowflake +SlurmctldHost=linux0 +SlurmUser=slurm +SlurmctldPort=6817 +SlurmdPort=6818 +MpiDefault=none +ProctrackType=proctrack/cgroup +ReturnToService=1 +SlurmctldPidFile=/var/run/slurmctld.pid +SlurmdPidFile=/var/run/slurmd.pid +SlurmdSpoolDir=/var/spool/slurmd +StateSaveLocation=/var/spool/slurmctld +SwitchType=switch/none +TaskPlugin=task/affinity,task/cgroup +# +# TIMERS +InactiveLimit=0 +KillWait=30 +MinJobAge=300 +SlurmctldTimeout=120 +SlurmdTimeout=300 +Waittime=0 +# +# SCHEDULING +SchedulerType=sched/backfill +SelectType=select/cons_tres +# +# LOGGING AND ACCOUNTING +AccountingStorageHost=slurm-db +AccountingStoragePort=6819 +AccountingStorageType=accounting_storage/slurmdbd +AccountingStorageUser=slurm +AccountingStoreFlags=job_script,job_comment,job_env,job_extra +JobCompType=jobcomp/none +JobAcctGatherFrequency=30 +JobAcctGatherType=jobacct_gather/cgroup +SlurmctldDebug=info +SlurmctldLogFile=/var/log/slurmctld.log +SlurmdDebug=info +SlurmdLogFile=/var/log/slurmd.log +# +# COMPUTE NODES +NodeName=linux[1-32] CPUs=1 
State=UNKNOWN +PartitionName=debug Nodes=ALL Default=YES MaxTime=INFINITE State=UP diff --git a/slurm/worker/Dockerfile b/slurm/worker/Dockerfile index cd4ca6d..7879c24 100644 --- a/slurm/worker/Dockerfile +++ b/slurm/worker/Dockerfile @@ -1,39 +1,43 @@ -FROM scidas/slurm.base:19.05.1 -MAINTAINER Michael J. Stealey +FROM rockylinux:8 +LABEL maintainer="Jan Eitzinger" -# install openmpi 3.0.1 -RUN yum -y install \ - gcc-c++ \ - gcc-gfortran \ - && yum -y localinstall \ - /packages/openmpi-*.rpm +ENV SLURM_VERSION=22.05.6 \ + MUNGE_UID=981 \ + SLURM_UID=982 \ + WORKER_UID=1000 -# install Lmod 7.7 -RUN yum -y install \ - lua-posix \ - lua \ - lua-filesystem \ - lua-devel \ - wget \ - bzip2 \ - expectk \ - make \ - && wget https://sourceforge.net/projects/lmod/files/Lmod-7.7.tar.bz2 \ - && tar -xjvf Lmod-7.7.tar.bz2 -WORKDIR /Lmod-7.7 -RUN ./configure --prefix=/opt/apps \ - && make install \ - && ln -s /opt/apps/lmod/lmod/init/profile /etc/profile.d/z00_lmod.sh \ - && ln -s /opt/apps/lmod/lmod/init/cshrc /etc/profile.d/z00_lmod.csh +RUN yum install https://dl.fedoraproject.org/pub/epel/epel-release-latest-8.noarch.rpm -y +RUN groupadd -g $MUNGE_UID munge \ + && useradd -m -c "MUNGE Uid 'N' Gid Emporium" -d /var/lib/munge -u $MUNGE_UID -g munge -s /sbin/nologin munge \ + && groupadd -g $SLURM_UID slurm \ + && useradd -m -c "Slurm workload manager" -d /var/lib/slurm -u $SLURM_UID -g slurm -s /bin/bash slurm \ + && groupadd -g $WORKER_UID worker \ + && useradd -m -c "Workflow user" -d /home/worker -u $WORKER_UID -g worker -s /bin/bash worker + +RUN yum install -y munge munge-libs +RUN dnf --enablerepo=powertools install munge-devel -y +RUN yum install rng-tools -y + +RUN yum install -y python3 gcc openssl openssl-devel \ +pam-devel numactl numactl-devel hwloc sudo \ +lua readline-devel ncurses-devel man2html \ +libibmad libibumad rpm-build perl-ExtUtils-MakeMaker.noarch make wget + +RUN dnf --enablerepo=powertools install rrdtool-devel lua-devel hwloc-devel rpm-build -y 
+RUN dnf install mariadb-server mariadb-devel -y +# WORKDIR replaces the no-op 'RUN cd': each RUN starts a fresh shell, so the cd never carried over +WORKDIR /usr/local/slurm-tmp +RUN wget https://download.schedmd.com/slurm/slurm-22.05.6.tar.bz2 && rpmbuild -ta slurm-22.05.6.tar.bz2 +WORKDIR / + +VOLUME ["/home", "/.secret"] +# 22: SSH +# 3306: MariaDB +# 6817: SlurmCtlD +# 6818: SlurmD +# 6819: SlurmDBD +EXPOSE 22 3306 6817 6818 6819 WORKDIR /home/worker - -# clean up -RUN rm -f /packages/slurm-*.rpm /packages/openmpi-*.rpm \ - && yum clean all \ - && rm -rf /var/cache/yum \ - && rm -f /Lmod-7.7.tar.bz2 - COPY docker-entrypoint.sh /docker-entrypoint.sh - -ENTRYPOINT ["/usr/local/bin/tini", "--", "/docker-entrypoint.sh"] +ENTRYPOINT ["/docker-entrypoint.sh"] diff --git a/slurm/worker/docker-entrypoint.sh b/slurm/worker/docker-entrypoint.sh index f18b9f7..a0fe425 100755 --- a/slurm/worker/docker-entrypoint.sh +++ b/slurm/worker/docker-entrypoint.sh @@ -45,7 +45,7 @@ _wait_for_worker() { # run slurmd _slurmd() { if [ ! -f /.secret/slurm.conf ]; then - echo -n "cheking for slurm.conf" + echo -n "checking for slurm.conf" while [ ! -f /.secret/slurm.conf ]; do echo -n "." sleep 1