diff --git a/docker-compose.yml b/docker-compose.yml index 56c06e6..36c5c3d 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -1,6 +1,6 @@ services: nats: - container_name: cc-nats + container_name: nats image: nats:alpine ports: - "4222:4222" @@ -18,7 +18,7 @@ services: - nats influxdb: - container_name: cc-influxdb + container_name: influxdb image: influxdb command: ["--reporting-disabled"] environment: @@ -36,7 +36,7 @@ services: - ${DATADIR}/influxdb/config:/etc/influxdb2 openldap: - container_name: cc-ldap + container_name: ldap image: osixia/openldap:1.5.0 command: --copy-service --loglevel debug environment: @@ -47,7 +47,7 @@ services: - ${DATADIR}/ldap:/container/service/slapd/assets/config/bootstrap/ldif/custom mariadb: - container_name: cc-db + container_name: mariadb image: mariadb:latest command: ["--default-authentication-plugin=mysql_native_password"] environment: @@ -63,7 +63,7 @@ services: - SYS_NICE # mysql: - # container_name: cc-mysql + # container_name: mysql # image: mysql:8.0.22 # command: ["--default-authentication-plugin=mysql_native_password"] # environment: @@ -79,20 +79,8 @@ services: # cap_add: # - SYS_NICE - phpmyadmin: - container_name: cc-phpmyadmin - image: phpmyadmin - environment: - - PMA_HOST=cc-db - - PMA_USER=root - - PMA_PASSWORD=${MARIADB_ROOT_PASSWORD} - ports: - - "127.0.0.1:${PHPMYADMIN_PORT}:80" - depends_on: - - mariadb - slurm-controller: - container_name: slurm-controller + container_name: slurmctld build: context: ./slurm/controller privileged: true @@ -112,7 +100,7 @@ services: PARTITION_NAME: docker slurm-database: - container_name: slurm-database + container_name: slurmdb build: context: ./slurm/database depends_on: @@ -131,21 +119,21 @@ services: STORAGE_PASS: password STORAGE_USER: slurm - # slurm-worker01: - # container_name: slurm-worker01 - # build: - # context: ./slurm/worker - # depends_on: - # - slurm-controller - # privileged: true - # volumes: - # - ./home:/home - # - ./secret:/.secret - # 
restart: always - # environment: - # CONTROL_MACHINE: controller - # ACCOUNTING_STORAGE_HOST: database - # COMPUTE_NODES: worker01 worker02 + slurm-worker01: + container_name: node01 + build: + context: ./slurm/worker + depends_on: + - slurm-controller + privileged: true + volumes: + - ./home:/home + - ./secret:/.secret + restart: always + environment: + CONTROL_MACHINE: controller + ACCOUNTING_STORAGE_HOST: database + COMPUTE_NODES: worker01 worker02 # slurm-worker02: # container_name: slurm-worker02 diff --git a/env-template.txt b/env-template.txt new file mode 100644 index 0000000..2cb9b12 --- /dev/null +++ b/env-template.txt @@ -0,0 +1,5 @@ +ENV SLURM_VERSION=19.05.1 \ + MUNGE_UID=981 \ + SLURM_UID=982 \ + WORKER_UID=1000 + diff --git a/slurm/slurm.conf b/home/config/slurm.conf similarity index 78% rename from slurm/slurm.conf rename to home/config/slurm.conf index d6c7dcb..b6059e0 100644 --- a/slurm/slurm.conf +++ b/home/config/slurm.conf @@ -3,7 +3,7 @@ # See the slurm.conf man page for more information. 
# ClusterName=snowflake -SlurmctldHost=linux0 +SlurmctldHost=slurmctld SlurmUser=slurm SlurmctldPort=6817 SlurmdPort=6818 @@ -12,8 +12,8 @@ ProctrackType=proctrack/cgroup ReturnToService=1 SlurmctldPidFile=/var/run/slurmctld.pid SlurmdPidFile=/var/run/slurmd.pid -SlurmdSpoolDir=/var/spool/slurmd -StateSaveLocation=/var/spool/slurmctld +SlurmdSpoolDir=/var/spool/slurm/d +StateSaveLocation=/var/spool/slurm/ctld SwitchType=switch/none TaskPlugin=task/affinity,task/cgroup # @@ -30,8 +30,8 @@ SchedulerType=sched/backfill SelectType=select/cons_tres # # LOGGING AND ACCOUNTING -AccountingStorageHost=slurm-db -AccountingStoragePort=6818 +AccountingStorageHost=slurmdb +AccountingStoragePort=6819 AccountingStorageType=accounting_storage/slurmdbd AccountingStorageUser=slurm AccountingStoreFlags=job_script,job_comment,job_env,job_extra @@ -44,5 +44,5 @@ SlurmdDebug=info SlurmdLogFile=/var/log/slurmd.log # # COMPUTE NODES -NodeName=linux[1-32] CPUs=1 State=UNKNOWN -PartitionName=debug Nodes=ALL Default=YES MaxTime=INFINITE State=UP +NodeName=node0[1-2] CPUs=1 State=UNKNOWN +PartitionName=main Nodes=ALL Default=YES MaxTime=INFINITE State=UP diff --git a/home/config/slurmdbd.conf b/home/config/slurmdbd.conf new file mode 100644 index 0000000..35838b9 --- /dev/null +++ b/home/config/slurmdbd.conf @@ -0,0 +1,31 @@ +# Archive info +#ArchiveJobs=yes +#ArchiveDir="/tmp" +#ArchiveSteps=yes +#ArchiveScript= +#JobPurge=12 +#StepPurge=1 +# +# Authentication info +AuthType=auth/munge +AuthInfo=/var/run/munge/munge.socket.2 +# +# slurmDBD info +DbdAddr=slurmdb +DbdHost=slurmdb +DbdPort=6819 +SlurmUser=slurm +DebugLevel=4 +LogFile=/var/log/slurm/slurmdbd.log +PidFile=/var/run/slurmdbd.pid +#PluginDir=/usr/lib/slurm +#PrivateData=accounts,users,usage,jobs +#TrackWCKey=yes +# +# Database info +StorageType=accounting_storage/mysql +StorageHost=slurmdb +StoragePort=3306 +StoragePass=demo +StorageUser=slurm +StorageLoc=slurm_acct_db diff --git a/slurm/base/Dockerfile b/slurm/base/Dockerfile new 
file mode 100644 index 0000000..0e225c8 --- /dev/null +++ b/slurm/base/Dockerfile @@ -0,0 +1,45 @@ +FROM rockylinux:8 +MAINTAINER Jan Eitzinger + +ENV SLURM_VERSION=22.05.6 +ENV ARCH=aarch64 + +RUN yum install https://dl.fedoraproject.org/pub/epel/epel-release-latest-8.noarch.rpm -y + +RUN groupadd -g 981 munge \ + && useradd -m -c "MUNGE Uid 'N' Gid Emporium" -d /var/lib/munge -u 981 -g munge -s /sbin/nologin munge \ + && groupadd -g 982 slurm \ + && useradd -m -c "Slurm workload manager" -d /var/lib/slurm -u 982 -g slurm -s /bin/bash slurm \ + && groupadd -g 1000 worker \ + && useradd -m -c "Workflow user" -d /home/worker -u 1000 -g worker -s /bin/bash worker + +RUN yum install -y munge munge-libs +RUN dnf --enablerepo=powertools install munge-devel -y +RUN yum install rng-tools -y + +RUN yum install -y python3 gcc openssl openssl-devel \ +pam-devel numactl numactl-devel hwloc sudo \ +lua readline-devel ncurses-devel man2html \ +libibmad libibumad rpm-build perl-ExtUtils-MakeMaker.noarch rpm-build make wget + +RUN dnf --enablerepo=powertools install rrdtool-devel lua-devel hwloc-devel rpm-build -y +RUN dnf install mariadb-server mariadb-devel -y +RUN mkdir /usr/local/slurm-tmp +WORKDIR /usr/local/slurm-tmp +RUN wget https://download.schedmd.com/slurm/slurm-${SLURM_VERSION}.tar.bz2 +RUN rpmbuild -ta slurm-${SLURM_VERSION}.tar.bz2 + +WORKDIR /root/rpmbuild/RPMS/${ARCH} +RUN yum -y --nogpgcheck localinstall \ + slurm-${SLURM_VERSION}-1.el8.${ARCH}.rpm \ + slurm-perlapi-${SLURM_VERSION}-1.el8.${ARCH}.rpm \ + slurm-slurmctld-${SLURM_VERSION}-1.el8.${ARCH}.rpm +WORKDIR / + +VOLUME ["/home", "/.secret"] +# 22: SSH +# 3306: MariaDB +# 6817: SlurmCtlD +# 6818: SlurmD +# 6819: SlurmDBD +EXPOSE 22 3306 6817 6818 6819 diff --git a/slurm/controller/Dockerfile b/slurm/controller/Dockerfile index 14d6066..192b554 100644 --- a/slurm/controller/Dockerfile +++ b/slurm/controller/Dockerfile @@ -1,51 +1,10 @@ -FROM rockylinux:8 +FROM clustercockpit/slurm.base:latest MAINTAINER Jan 
Eitzinger -ENV SLURM_VERSION=19.05.1 \ - MUNGE_UID=981 \ - SLURM_UID=982 \ - WORKER_UID=1000 - -RUN yum install https://dl.fedoraproject.org/pub/epel/epel-release-latest-8.noarch.rpm -y - -RUN groupadd -g $MUNGE_UID munge \ - && useradd -m -c "MUNGE Uid 'N' Gid Emporium" -d /var/lib/munge -u $MUNGE_UID -g munge -s /sbin/nologin munge \ - && groupadd -g $SLURM_UID slurm \ - && useradd -m -c "Slurm workload manager" -d /var/lib/slurm -u $SLURM_UID -g slurm -s /bin/bash slurm \ - && groupadd -g $WORKER_UID worker \ - && useradd -m -c "Workflow user" -d /home/worker -u $WORKER_UID -g worker -s /bin/bash worker - -RUN yum install -y munge munge-libs -RUN dnf --enablerepo=powertools install munge-devel -y -RUN yum install rng-tools -y - -RUN yum install -y python3 gcc openssl openssl-devel \ -pam-devel numactl numactl-devel hwloc sudo \ -lua readline-devel ncurses-devel man2html \ -libibmad libibumad rpm-build perl-ExtUtils-MakeMaker.noarch rpm-build make wget - -RUN dnf --enablerepo=powertools install rrdtool-devel lua-devel hwloc-devel rpm-build -y -RUN dnf install mariadb-server mariadb-devel -y -RUN mkdir /usr/local/slurm-tmp -RUN cd /usr/local/slurm-tmp -RUN wget https://download.schedmd.com/slurm/slurm-22.05.6.tar.bz2 -RUN rpmbuild -ta slurm-22.05.6.tar.bz2 - -ENV USE_SLURMDBD=true \ - CLUSTER_NAME=snowflake \ - CONTROL_MACHINE=controller \ - SLURMCTLD_PORT=6817 \ - SLURMD_PORT=6818 \ - ACCOUNTING_STORAGE_HOST=database \ - ACCOUNTING_STORAGE_PORT=6819 \ - PARTITION_NAME=docker +# clean up +RUN rm -f /root/rpmbuild/RPMS/slurm-*.rpm \ + && yum clean all \ + && rm -rf /var/cache/yum COPY docker-entrypoint.sh /docker-entrypoint.sh -VOLUME ["/home", "/.secret"] -# 22: SSH -# 3306: MariaDB -# 6817: SlurmCtlD -# 6818: SlurmD -# 6819: SlurmDBD -EXPOSE 22 3306 6817 6818 6819 ENTRYPOINT ["/docker-entrypoint.sh"] diff --git a/slurm/controller/docker-entrypoint.sh b/slurm/controller/docker-entrypoint.sh index 5aff60c..301ff03 100755 --- a/slurm/controller/docker-entrypoint.sh 
+++ b/slurm/controller/docker-entrypoint.sh @@ -17,22 +17,22 @@ _ssh_worker() { chown -R worker:worker /home/worker fi cat > /home/worker/setup-worker-ssh.sh < ~/.ssh/authorized_keys - chmod 0640 ~/.ssh/authorized_keys - cat >> ~/.ssh/config < ~/.ssh/authorized_keys +chmod 0640 ~/.ssh/authorized_keys +cat >> ~/.ssh/config < /etc/slurm/slurm.conf < -ENV SLURM_VERSION=19.05.1 \ - MUNGE_UID=981 \ - SLURM_UID=982 \ - WORKER_UID=1000 - -RUN yum install https://dl.fedoraproject.org/pub/epel/epel-release-latest-8.noarch.rpm -y - -RUN groupadd -g $MUNGE_UID munge \ - && useradd -m -c "MUNGE Uid 'N' Gid Emporium" -d /var/lib/munge -u $MUNGE_UID -g munge -s /sbin/nologin munge \ - && groupadd -g $SLURM_UID slurm \ - && useradd -m -c "Slurm workload manager" -d /var/lib/slurm -u $SLURM_UID -g slurm -s /bin/bash slurm \ - && groupadd -g $WORKER_UID worker \ - && useradd -m -c "Workflow user" -d /home/worker -u $WORKER_UID -g worker -s /bin/bash worker - -RUN yum install -y munge munge-libs -RUN dnf --enablerepo=powertools install munge-devel -y -RUN yum install rng-tools -y - -RUN yum install -y python3 gcc openssl openssl-devel \ -pam-devel numactl numactl-devel hwloc sudo \ -lua readline-devel ncurses-devel man2html \ -libibmad libibumad rpm-build perl-ExtUtils-MakeMaker.noarch rpm-build make wget - -RUN dnf --enablerepo=powertools install rrdtool-devel lua-devel hwloc-devel rpm-build -y -RUN dnf install mariadb-server mariadb-devel -y -RUN mkdir /usr/local/slurm-tmp -RUN cd /usr/local/slurm-tmp -RUN wget https://download.schedmd.com/slurm/slurm-22.05.6.tar.bz2 -RUN rpmbuild -ta slurm-22.05.6.tar.bz2 - - -ENV DBD_ADDR=database \ - DBD_HOST=database \ - DBD_PORT=6819 \ - STORAGE_HOST=database.local.dev \ - STORAGE_PORT=3306 \ - STORAGE_PASS=password \ - STORAGE_USER=slurm +# clean up +RUN rm -f /root/rpmbuild/RPMS/slurm-*.rpm \ + && yum clean all \ + && rm -rf /var/cache/yum COPY docker-entrypoint.sh /docker-entrypoint.sh -VOLUME ["/home", "/.secret"] -# 22: SSH -# 3306: 
MariaDB -# 6817: Slurm Ctl D -# 6818: Slurm D -# 6819: Slurm DBD -EXPOSE 22 3306 6817 6818 6819 ENTRYPOINT ["/docker-entrypoint.sh"] diff --git a/slurm/database/docker-entrypoint.sh b/slurm/database/docker-entrypoint.sh index b011977..4ed8d94 100755 --- a/slurm/database/docker-entrypoint.sh +++ b/slurm/database/docker-entrypoint.sh @@ -16,10 +16,10 @@ _sshd_host() { _slurm_acct_db() { { echo "create database slurm_acct_db;" - echo "create user '${STORAGE_USER}'@'${STORAGE_HOST}';" - echo "set password for '${STORAGE_USER}'@'${STORAGE_HOST}' = password('${STORAGE_PASS}');" - echo "grant usage on *.* to '${STORAGE_USER}'@'${STORAGE_HOST}';" - echo "grant all privileges on slurm_acct_db.* to '${STORAGE_USER}'@'${STORAGE_HOST}';" + echo "create user 'slurm'@'slurmdb';" + echo "set password for 'slurm'@'slurmdb' = password('demo');" + echo "grant usage on *.* to 'slurm'@'slurmdb';" + echo "grant all privileges on slurm_acct_db.* to 'slurm'@'slurmdb';" echo "flush privileges;" } >> $SLURM_ACCT_DB_SQL } @@ -72,50 +72,6 @@ _wait_for_worker() { fi } -# generate slurmdbd.conf -_generate_slurmdbd_conf() { - cat > /etc/slurm/slurmdbd.conf < -ENV SLURM_VERSION=19.05.1 \ - MUNGE_UID=981 \ - SLURM_UID=982 \ - WORKER_UID=1000 +# clean up +RUN rm -f /root/rpmbuild/RPMS/slurm-*.rpm \ + && yum clean all \ + && rm -rf /var/cache/yum -RUN yum install https://dl.fedoraproject.org/pub/epel/epel-release-latest-8.noarch.rpm -y - -RUN groupadd -g $MUNGE_UID munge \ - && useradd -m -c "MUNGE Uid 'N' Gid Emporium" -d /var/lib/munge -u $MUNGE_UID -g munge -s /sbin/nologin munge \ - && groupadd -g $SLURM_UID slurm \ - && useradd -m -c "Slurm workload manager" -d /var/lib/slurm -u $SLURM_UID -g slurm -s /bin/bash slurm \ - && groupadd -g $WORKER_UID worker \ - && useradd -m -c "Workflow user" -d /home/worker -u $WORKER_UID -g worker -s /bin/bash worker - -RUN yum install -y munge munge-libs -RUN dnf --enablerepo=powertools install munge-devel -y -RUN yum install rng-tools -y - -RUN yum install 
-y python3 gcc openssl openssl-devel \ -pam-devel numactl numactl-devel hwloc sudo \ -lua readline-devel ncurses-devel man2html \ -libibmad libibumad rpm-build perl-ExtUtils-MakeMaker.noarch rpm-build make wget - -RUN dnf --enablerepo=powertools install rrdtool-devel lua-devel hwloc-devel rpm-build -y -RUN dnf install mariadb-server mariadb-devel -y -RUN mkdir /usr/local/slurm-tmp -RUN cd /usr/local/slurm-tmp -RUN wget https://download.schedmd.com/slurm/slurm-22.05.6.tar.bz2 -RUN rpmbuild -ta slurm-22.05.6.tar.bz2 - -VOLUME ["/home", "/.secret"] -# 22: SSH -# 3306: MariaDB -# 6817: SlurmCtlD -# 6818: SlurmD -# 6819: SlurmDBD -EXPOSE 22 3306 6817 6818 6819 WORKDIR /home/worker COPY docker-entrypoint.sh /docker-entrypoint.sh ENTRYPOINT ["/docker-entrypoint.sh"]