Start converting to Rocky Linux base image

This commit is contained in:
Jan Eitzinger 2023-07-10 16:48:51 +02:00
parent fa2287c661
commit f0a6652fb0
11 changed files with 260 additions and 268 deletions

View File

@ -131,34 +131,34 @@ services:
STORAGE_PASS: password STORAGE_PASS: password
STORAGE_USER: slurm STORAGE_USER: slurm
slurm-worker01: # slurm-worker01:
container_name: slurm-worker01 # container_name: slurm-worker01
build: # build:
context: ./slurm/worker # context: ./slurm/worker
depends_on: # depends_on:
- slurm-controller # - slurm-controller
privileged: true # privileged: true
volumes: # volumes:
- ./home:/home # - ./home:/home
- ./secret:/.secret # - ./secret:/.secret
restart: always # restart: always
environment: # environment:
CONTROL_MACHINE: controller # CONTROL_MACHINE: controller
ACCOUNTING_STORAGE_HOST: database # ACCOUNTING_STORAGE_HOST: database
COMPUTE_NODES: worker01 worker02 # COMPUTE_NODES: worker01 worker02
slurm-worker02: # slurm-worker02:
container_name: slurm-worker02 # container_name: slurm-worker02
build: # build:
context: ./slurm/worker # context: ./slurm/worker
depends_on: # depends_on:
- slurm-controller # - slurm-controller
privileged: true # privileged: true
volumes: # volumes:
- ./home:/home # - ./home:/home
- ./secret:/.secret # - ./secret:/.secret
restart: always # restart: always
environment: # environment:
CONTROL_MACHINE: controller # CONTROL_MACHINE: controller
ACCOUNTING_STORAGE_HOST: database # ACCOUNTING_STORAGE_HOST: database
COMPUTE_NODES: worker01 worker02 # COMPUTE_NODES: worker01 worker02

View File

@ -1,46 +0,0 @@
# Base image shared by the Slurm controller, database and worker images.
# Provides tini (from the parent image), the munge/slurm/worker accounts,
# general tooling, and the locally built Slurm RPMs.
FROM krallin/centos-tini:7

# MAINTAINER is deprecated; a label carries the same contact information.
LABEL maintainer="Michael J. Stealey <stealey@renci.org>"

ENV SLURM_VERSION=19.05.1 \
    MUNGE_UID=981 \
    SLURM_UID=982 \
    WORKER_UID=1000

# Create the service accounts with fixed numeric ids (taken from the ENV
# block above) so every image derived from this base uses the same values.
RUN groupadd -g $MUNGE_UID munge \
    && useradd -m -c "MUNGE Uid 'N' Gid Emporium" -d /var/lib/munge -u $MUNGE_UID -g munge -s /sbin/nologin munge \
    && groupadd -g $SLURM_UID slurm \
    && useradd -m -c "Slurm workload manager" -d /var/lib/slurm -u $SLURM_UID -g slurm -s /bin/bash slurm \
    && groupadd -g $WORKER_UID worker \
    && useradd -m -c "Workflow user" -d /home/worker -u $WORKER_UID -g worker -s /bin/bash worker

# install packages for general functionality
# epel-release is installed first, in its own yum transaction, before the
# remaining packages. The yum cache is dropped in the same layer so it does
# not persist in the image.
RUN yum -y install \
        epel-release \
    && yum -y install \
        mariadb-devel \
        mariadb-server \
        munge \
        munge-devel \
        munge-libs \
        openssh-clients \
        openssh-server \
        sudo \
        tree \
        wget \
        which \
    && yum clean all \
    && rm -rf /var/cache/yum

# install slurm 19.05.1
COPY rpms /packages

# /usr/bin/mpiexec from slurm-torque conflicts with openmpi install,
# so both the torque and the openmpi RPMs are left out here.
WORKDIR /packages
RUN yum -y localinstall $(ls | grep -v -e 'torque' -e 'openmpi') \
    && yum clean all \
    && rm -rf /var/cache/yum
WORKDIR /

VOLUME ["/home", "/.secret"]

# 22: SSH
# 3306: MariaDB
# 6817: Slurm Ctl D
# 6818: Slurm D
# 6819: Slurm DBD
EXPOSE 22 3306 6817 6818 6819

View File

@ -1,17 +0,0 @@
# Build helper for the Slurm base image.
#
#   make / make build  - stage the CentOS 7 RPMs into the build context and
#                        build $(IMAGE):$(SLURM_VERSION)
#   make clean         - remove the image (if it exists) and the staged RPMs

SLURM_VERSION = 19.05.1
IMAGE = scidas/slurm.base

# Absolute path of the directory that contains this Makefile.
DIR := $(shell dirname $(realpath $(lastword $(MAKEFILE_LIST))))

.PHONY: all build clean test

all: build

build:
	cp -r "$(DIR)/../packages/centos-7/rpms" .
	docker build -t $(IMAGE):$(SLURM_VERSION) .

clean:
	@# "$$(" hands the command substitution to the shell. A single "$(" is
	@# expanded by make as an (undefined) variable, leaving "[ -z ]", which
	@# is always true, so the image would never be removed.
	@[ -z "$$(docker images -q $(IMAGE):$(SLURM_VERSION))" ] || docker rmi $(IMAGE):$(SLURM_VERSION)
	rm -rf rpms

View File

@ -1,3 +0,0 @@
# Slurm Base Image
TODO

View File

@ -1,31 +1,35 @@
FROM scidas/slurm.base:19.05.1 FROM rockylinux:8
MAINTAINER Michael J. Stealey <stealey@renci.org> MAINTAINER Jan Eitzinger <jan.eitzinger@fau.de>
# install openmpi 3.0.1 ENV SLURM_VERSION=19.05.1 \
RUN yum -y install \ MUNGE_UID=981 \
gcc-c++ \ SLURM_UID=982 \
gcc-gfortran \ WORKER_UID=1000
&& yum -y localinstall \
/packages/openmpi-*.rpm
# install Lmod 7.7 RUN yum install https://dl.fedoraproject.org/pub/epel/epel-release-latest-8.noarch.rpm -y
RUN yum -y install \
lua-posix \ RUN groupadd -g $MUNGE_UID munge \
lua \ && useradd -m -c "MUNGE Uid 'N' Gid Emporium" -d /var/lib/munge -u $MUNGE_UID -g munge -s /sbin/nologin munge \
lua-filesystem \ && groupadd -g $SLURM_UID slurm \
lua-devel \ && useradd -m -c "Slurm workload manager" -d /var/lib/slurm -u $SLURM_UID -g slurm -s /bin/bash slurm \
wget \ && groupadd -g $WORKER_UID worker \
bzip2 \ && useradd -m -c "Workflow user" -d /home/worker -u $WORKER_UID -g worker -s /bin/bash worker
expectk \
make \ RUN yum install -y munge munge-libs
&& wget https://sourceforge.net/projects/lmod/files/Lmod-7.7.tar.bz2 \ RUN dnf --enablerepo=powertools install munge-devel -y
&& tar -xjvf Lmod-7.7.tar.bz2 RUN yum install rng-tools -y
WORKDIR /Lmod-7.7
RUN ./configure --prefix=/opt/apps \ RUN yum install -y python3 gcc openssl openssl-devel \
&& make install \ pam-devel numactl numactl-devel hwloc sudo \
&& ln -s /opt/apps/lmod/lmod/init/profile /etc/profile.d/z00_lmod.sh \ lua readline-devel ncurses-devel man2html \
&& ln -s /opt/apps/lmod/lmod/init/cshrc /etc/profile.d/z00_lmod.csh libibmad libibumad rpm-build perl-ExtUtils-MakeMaker.noarch rpm-build make wget
WORKDIR /
# Development headers that are installed with the PowerTools repo enabled.
RUN dnf --enablerepo=powertools install -y rrdtool-devel lua-devel hwloc-devel rpm-build

# MariaDB server and client headers.
RUN dnf install -y mariadb-server mariadb-devel

# Download and build the Slurm RPMs inside the scratch directory.
# Every RUN starts a new shell, so a standalone "RUN cd ..." has no effect on
# later instructions; the cd has to share a RUN with the commands that rely
# on it. cd is used instead of WORKDIR so the image's working directory is
# left unchanged.
RUN mkdir -p /usr/local/slurm-tmp \
    && cd /usr/local/slurm-tmp \
    && wget https://download.schedmd.com/slurm/slurm-22.05.6.tar.bz2 \
    && rpmbuild -ta slurm-22.05.6.tar.bz2
ENV USE_SLURMDBD=true \ ENV USE_SLURMDBD=true \
CLUSTER_NAME=snowflake \ CLUSTER_NAME=snowflake \
@ -36,12 +40,12 @@ ENV USE_SLURMDBD=true \
ACCOUNTING_STORAGE_PORT=6819 \ ACCOUNTING_STORAGE_PORT=6819 \
PARTITION_NAME=docker PARTITION_NAME=docker
# clean up
RUN rm -f /packages/slurm-*.rpm /packages/openmpi-*.rpm \
&& yum clean all \
&& rm -rf /var/cache/yum \
&& rm -f /Lmod-7.7.tar.bz2
COPY docker-entrypoint.sh /docker-entrypoint.sh COPY docker-entrypoint.sh /docker-entrypoint.sh
VOLUME ["/home", "/.secret"]
ENTRYPOINT ["/usr/local/bin/tini", "--", "/docker-entrypoint.sh"] # 22: SSH
# 3306: MariaDB
# 6817: SlurmCtlD
# 6818: SlurmD
# 6819: SlurmDBD
EXPOSE 22 3306 6817 6818 6819
ENTRYPOINT ["/docker-entrypoint.sh"]

View File

@ -16,23 +16,23 @@ _ssh_worker() {
mkdir -p /home/worker mkdir -p /home/worker
chown -R worker:worker /home/worker chown -R worker:worker /home/worker
fi fi
cat > /home/worker/setup-worker-ssh.sh <<'EOF2' cat > /home/worker/setup-worker-ssh.sh <<EOF2
mkdir -p ~/.ssh mkdir -p ~/.ssh
chmod 0700 ~/.ssh chmod 0700 ~/.ssh
ssh-keygen -b 2048 -t rsa -f ~/.ssh/id_rsa -q -N "" -C "$(whoami)@$(hostname)-$(date -I)" ssh-keygen -b 2048 -t rsa -f ~/.ssh/id_rsa -q -N "" -C "$(whoami)@$(hostname)-$(date -I)"
cat ~/.ssh/id_rsa.pub > ~/.ssh/authorized_keys cat ~/.ssh/id_rsa.pub > ~/.ssh/authorized_keys
chmod 0640 ~/.ssh/authorized_keys chmod 0640 ~/.ssh/authorized_keys
cat >> ~/.ssh/config <<EOF cat >> ~/.ssh/config <<EOF
Host * Host *
StrictHostKeyChecking no StrictHostKeyChecking no
UserKnownHostsFile /dev/null UserKnownHostsFile /dev/null
LogLevel QUIET LogLevel QUIET
EOF
chmod 0644 ~/.ssh/config
cd ~/
tar -czvf ~/worker-secret.tar.gz .ssh
cd -
EOF2 EOF2
chmod 0644 ~/.ssh/config
cd ~/
tar -czvf ~/worker-secret.tar.gz .ssh
cd -
EOF2
chmod +x /home/worker/setup-worker-ssh.sh chmod +x /home/worker/setup-worker-ssh.sh
chown worker: /home/worker/setup-worker-ssh.sh chown worker: /home/worker/setup-worker-ssh.sh
sudo -u worker /home/worker/setup-worker-ssh.sh sudo -u worker /home/worker/setup-worker-ssh.sh
@ -46,6 +46,11 @@ _munge_start() {
chmod 0700 /var/log/munge chmod 0700 /var/log/munge
chmod 0755 /var/run/munge chmod 0755 /var/run/munge
/sbin/create-munge-key -f /sbin/create-munge-key -f
rngd -r /dev/urandom
/usr/sbin/create-munge-key -r -f
sh -c "dd if=/dev/urandom bs=1 count=1024 > /etc/munge/munge.key"
chown munge: /etc/munge/munge.key
chmod 400 /etc/munge/munge.key
sudo -u munge /sbin/munged sudo -u munge /sbin/munged
munge -n munge -n
munge -n | unmunge munge -n | unmunge
@ -76,15 +81,10 @@ _generate_slurm_conf() {
# #
ClusterName=$CLUSTER_NAME ClusterName=$CLUSTER_NAME
SlurmctldHost=$CONTROL_MACHINE SlurmctldHost=$CONTROL_MACHINE
#SlurmctldHostr=
#
SlurmUser=slurm SlurmUser=slurm
#SlurmdUser=root
SlurmctldPort=$SLURMCTLD_PORT SlurmctldPort=$SLURMCTLD_PORT
SlurmdPort=$SLURMD_PORT SlurmdPort=$SLURMD_PORT
AuthType=auth/munge AuthType=auth/munge
#JobCredentialPrivateKey=
#JobCredentialPublicCertificate=
StateSaveLocation=/var/spool/slurm/ctld StateSaveLocation=/var/spool/slurm/ctld
SlurmdSpoolDir=/var/spool/slurm/d SlurmdSpoolDir=/var/spool/slurm/d
SwitchType=switch/none SwitchType=switch/none
@ -92,25 +92,7 @@ MpiDefault=none
SlurmctldPidFile=/var/run/slurmctld.pid SlurmctldPidFile=/var/run/slurmctld.pid
SlurmdPidFile=/var/run/slurmd.pid SlurmdPidFile=/var/run/slurmd.pid
ProctrackType=proctrack/pgid ProctrackType=proctrack/pgid
#PluginDir=
#FirstJobId=
ReturnToService=0 ReturnToService=0
#MaxJobCount=
#PlugStackConfig=
#PropagatePrioProcess=
#PropagateResourceLimits=
#PropagateResourceLimitsExcept=
#Prolog=
#Epilog=
#SrunProlog=
#SrunEpilog=
#TaskProlog=
#TaskEpilog=
#TaskPlugin=
#TrackWCKey=no
#TreeWidth=50
#TmpFS=
#UsePAM=
# #
# TIMERS # TIMERS
SlurmctldTimeout=300 SlurmctldTimeout=300
@ -122,17 +104,7 @@ Waittime=0
# #
# SCHEDULING # SCHEDULING
SchedulerType=sched/backfill SchedulerType=sched/backfill
#SchedulerAuth=
#SelectType=select/linear
FastSchedule=1 FastSchedule=1
#PriorityType=priority/multifactor
#PriorityDecayHalfLife=14-0
#PriorityUsageResetPeriod=14-0
#PriorityWeightFairshare=100000
#PriorityWeightAge=1000
#PriorityWeightPartition=10000
#PriorityWeightJobSize=1000
#PriorityMaxAge=1-0
# #
# LOGGING # LOGGING
SlurmctldDebug=3 SlurmctldDebug=3
@ -161,20 +133,18 @@ EOF
# run slurmctld # run slurmctld
_slurmctld() { _slurmctld() {
cd /root/rpmbuild/RPMS/aarch64
yum -y --nogpgcheck localinstall slurm-22.05.6-1.el8.aarch64.rpm slurm-perlapi-22.05.6-1.el8.aarch64.rpm slurm-slurmctld-22.05.6-1.el8.aarch64.rpm
if $USE_SLURMDBD; then if $USE_SLURMDBD; then
echo -n "cheking for slurmdbd.conf" echo -n "checking for slurmdbd.conf"
while [ ! -f /.secret/slurmdbd.conf ]; do while [ ! -f /.secret/slurmdbd.conf ]; do
echo -n "." echo -n "."
sleep 1 sleep 1
done done
echo "" echo ""
fi fi
mkdir -p /var/spool/slurm/ctld \ mkdir -p /var/spool/slurm/ctld /var/spool/slurmd /var/log/slurm /etc/slurm
/var/spool/slurm/d \ chown -R slurm: /var/spool/slurm/ctld /var/spool/slurmd /var/log/slurm
/var/log/slurm
chown -R slurm: /var/spool/slurm/ctld \
/var/spool/slurm/d \
/var/log/slurm
touch /var/log/slurmctld.log touch /var/log/slurmctld.log
chown slurm: /var/log/slurmctld.log chown slurm: /var/log/slurmctld.log
if [[ ! -f /home/config/slurm.conf ]]; then if [[ ! -f /home/config/slurm.conf ]]; then

View File

@ -1,5 +1,36 @@
FROM scidas/slurm.base:19.05.1 FROM rockylinux:8
MAINTAINER Michael J. Stealey <stealey@renci.org> MAINTAINER Jan Eitzinger <jan.eitzinger@fau.de>
# NOTE(review): SLURM_VERSION still reads 19.05.1 while the build step at the
# end of this section fetches slurm-22.05.6. Confirm whether anything reads
# this variable before bringing the two in line.
ENV SLURM_VERSION=19.05.1 \
    MUNGE_UID=981 \
    SLURM_UID=982 \
    WORKER_UID=1000

# Enable the EPEL repository.
RUN yum install -y https://dl.fedoraproject.org/pub/epel/epel-release-latest-8.noarch.rpm

# Create the service accounts with the fixed numeric ids from the ENV block.
RUN groupadd -g $MUNGE_UID munge \
    && useradd -m -c "MUNGE Uid 'N' Gid Emporium" -d /var/lib/munge -u $MUNGE_UID -g munge -s /sbin/nologin munge \
    && groupadd -g $SLURM_UID slurm \
    && useradd -m -c "Slurm workload manager" -d /var/lib/slurm -u $SLURM_UID -g slurm -s /bin/bash slurm \
    && groupadd -g $WORKER_UID worker \
    && useradd -m -c "Workflow user" -d /home/worker -u $WORKER_UID -g worker -s /bin/bash worker

# MUNGE; munge-devel is installed with the PowerTools repo enabled.
RUN yum install -y munge munge-libs
RUN dnf --enablerepo=powertools install -y munge-devel
RUN yum install -y rng-tools

# Toolchain and libraries for building the Slurm RPMs
# (rpm-build was listed twice in the original package list; once is enough).
RUN yum install -y \
        gcc \
        hwloc \
        libibmad \
        libibumad \
        lua \
        make \
        man2html \
        ncurses-devel \
        numactl \
        numactl-devel \
        openssl \
        openssl-devel \
        pam-devel \
        perl-ExtUtils-MakeMaker.noarch \
        python3 \
        readline-devel \
        rpm-build \
        sudo \
        wget

# Development headers that are installed with the PowerTools repo enabled.
RUN dnf --enablerepo=powertools install -y rrdtool-devel lua-devel hwloc-devel

# MariaDB server and client headers.
RUN dnf install -y mariadb-server mariadb-devel

# Download and build the Slurm RPMs inside the scratch directory.
# Every RUN starts a new shell, so a standalone "RUN cd ..." has no effect on
# later instructions; the cd has to share a RUN with the commands that rely
# on it. cd is used instead of WORKDIR so the image's working directory is
# left unchanged.
RUN mkdir -p /usr/local/slurm-tmp \
    && cd /usr/local/slurm-tmp \
    && wget https://download.schedmd.com/slurm/slurm-22.05.6.tar.bz2 \
    && rpmbuild -ta slurm-22.05.6.tar.bz2
ENV DBD_ADDR=database \ ENV DBD_ADDR=database \
DBD_HOST=database \ DBD_HOST=database \
@ -9,11 +40,12 @@ ENV DBD_ADDR=database \
STORAGE_PASS=password \ STORAGE_PASS=password \
STORAGE_USER=slurm STORAGE_USER=slurm
# clean up
RUN rm -f /packages/slurm-*.rpm /packages/openmpi-*.rpm \
&& yum clean all \
&& rm -rf /var/cache/yum
COPY docker-entrypoint.sh /docker-entrypoint.sh COPY docker-entrypoint.sh /docker-entrypoint.sh
VOLUME ["/home", "/.secret"]
ENTRYPOINT ["/usr/local/bin/tini", "--", "/docker-entrypoint.sh"] # 22: SSH
# 3306: MariaDB
# 6817: Slurm Ctl D
# 6818: Slurm D
# 6819: Slurm DBD
EXPOSE 22 3306 6817 6818 6819
ENTRYPOINT ["/docker-entrypoint.sh"]

View File

@ -41,7 +41,7 @@ _mariadb_start() {
# start munge using existing key # start munge using existing key
_munge_start_using_key() { _munge_start_using_key() {
if [ ! -f /.secret/munge.key ]; then if [ ! -f /.secret/munge.key ]; then
echo -n "cheking for munge.key" echo -n "checking for munge.key"
while [ ! -f /.secret/munge.key ]; do while [ ! -f /.secret/munge.key ]; do
echo -n "." echo -n "."
sleep 1 sleep 1
@ -63,7 +63,7 @@ _munge_start_using_key() {
# wait for worker user in shared /home volume # wait for worker user in shared /home volume
_wait_for_worker() { _wait_for_worker() {
if [ ! -f /home/worker/.ssh/id_rsa.pub ]; then if [ ! -f /home/worker/.ssh/id_rsa.pub ]; then
echo -n "cheking for id_rsa.pub" echo -n "checking for id_rsa.pub"
while [ ! -f /home/worker/.ssh/id_rsa.pub ]; do while [ ! -f /home/worker/.ssh/id_rsa.pub ]; do
echo -n "." echo -n "."
sleep 1 sleep 1

48
slurm/slurm.conf Normal file
View File

@ -0,0 +1,48 @@
# slurm.conf file generated by configurator.html.
# Put this file on all nodes of your cluster.
# See the slurm.conf man page for more information.
#
ClusterName=snowflake
SlurmctldHost=linux0
SlurmUser=slurm
SlurmctldPort=6817
SlurmdPort=6818
MpiDefault=none
ProctrackType=proctrack/cgroup
ReturnToService=1
SlurmctldPidFile=/var/run/slurmctld.pid
SlurmdPidFile=/var/run/slurmd.pid
SlurmdSpoolDir=/var/spool/slurmd
StateSaveLocation=/var/spool/slurmctld
SwitchType=switch/none
TaskPlugin=task/affinity,task/cgroup
#
# TIMERS
InactiveLimit=0
KillWait=30
MinJobAge=300
SlurmctldTimeout=120
SlurmdTimeout=300
Waittime=0
#
# SCHEDULING
SchedulerType=sched/backfill
SelectType=select/cons_tres
#
# LOGGING AND ACCOUNTING
AccountingStorageHost=slurm-db
# slurmdbd port. This was 6818, which is the slurmd port (see SlurmdPort
# above); the images in this project document slurmdbd on 6819.
AccountingStoragePort=6819
AccountingStorageType=accounting_storage/slurmdbd
AccountingStorageUser=slurm
# NOTE(review): job_extra may not be accepted by the Slurm 22.05 release the
# images build - confirm against the slurm.conf man page for that version.
AccountingStoreFlags=job_script,job_comment,job_env,job_extra
JobCompType=jobcomp/none
JobAcctGatherFrequency=30
JobAcctGatherType=jobacct_gather/cgroup
SlurmctldDebug=info
SlurmctldLogFile=/var/log/slurmctld.log
SlurmdDebug=info
SlurmdLogFile=/var/log/slurmd.log
#
# COMPUTE NODES
NodeName=linux[1-32] CPUs=1 State=UNKNOWN
PartitionName=debug Nodes=ALL Default=YES MaxTime=INFINITE State=UP

View File

@ -1,39 +1,43 @@
FROM scidas/slurm.base:19.05.1 FROM rockylinux:8
MAINTAINER Michael J. Stealey <stealey@renci.org> MAINTAINER Jan Eitzinger <jan.eitzinger@fau.de>
# install openmpi 3.0.1 ENV SLURM_VERSION=19.05.1 \
RUN yum -y install \ MUNGE_UID=981 \
gcc-c++ \ SLURM_UID=982 \
gcc-gfortran \ WORKER_UID=1000
&& yum -y localinstall \
/packages/openmpi-*.rpm
# install Lmod 7.7 RUN yum install https://dl.fedoraproject.org/pub/epel/epel-release-latest-8.noarch.rpm -y
RUN yum -y install \
lua-posix \
lua \
lua-filesystem \
lua-devel \
wget \
bzip2 \
expectk \
make \
&& wget https://sourceforge.net/projects/lmod/files/Lmod-7.7.tar.bz2 \
&& tar -xjvf Lmod-7.7.tar.bz2
WORKDIR /Lmod-7.7
RUN ./configure --prefix=/opt/apps \
&& make install \
&& ln -s /opt/apps/lmod/lmod/init/profile /etc/profile.d/z00_lmod.sh \
&& ln -s /opt/apps/lmod/lmod/init/cshrc /etc/profile.d/z00_lmod.csh
# Create the service accounts; the numeric ids are read from the MUNGE_UID,
# SLURM_UID and WORKER_UID variables.
RUN groupadd -g $MUNGE_UID munge \
    && useradd -m -c "MUNGE Uid 'N' Gid Emporium" -d /var/lib/munge -u $MUNGE_UID -g munge -s /sbin/nologin munge \
    && groupadd -g $SLURM_UID slurm \
    && useradd -m -c "Slurm workload manager" -d /var/lib/slurm -u $SLURM_UID -g slurm -s /bin/bash slurm \
    && groupadd -g $WORKER_UID worker \
    && useradd -m -c "Workflow user" -d /home/worker -u $WORKER_UID -g worker -s /bin/bash worker

# MUNGE; munge-devel is installed with the PowerTools repo enabled.
RUN yum install -y munge munge-libs
RUN dnf --enablerepo=powertools install -y munge-devel
RUN yum install -y rng-tools

# Toolchain and libraries for building the Slurm RPMs
# (rpm-build was listed twice in the original package list; once is enough).
RUN yum install -y \
        gcc \
        hwloc \
        libibmad \
        libibumad \
        lua \
        make \
        man2html \
        ncurses-devel \
        numactl \
        numactl-devel \
        openssl \
        openssl-devel \
        pam-devel \
        perl-ExtUtils-MakeMaker.noarch \
        python3 \
        readline-devel \
        rpm-build \
        sudo \
        wget

# Development headers that are installed with the PowerTools repo enabled.
RUN dnf --enablerepo=powertools install -y rrdtool-devel lua-devel hwloc-devel

# MariaDB server and client headers.
RUN dnf install -y mariadb-server mariadb-devel

# Download and build the Slurm RPMs inside the scratch directory.
# Every RUN starts a new shell, so a standalone "RUN cd ..." has no effect on
# later instructions; the cd has to share a RUN with the commands that rely
# on it.
RUN mkdir -p /usr/local/slurm-tmp \
    && cd /usr/local/slurm-tmp \
    && wget https://download.schedmd.com/slurm/slurm-22.05.6.tar.bz2 \
    && rpmbuild -ta slurm-22.05.6.tar.bz2
VOLUME ["/home", "/.secret"]
# 22: SSH
# 3306: MariaDB
# 6817: SlurmCtlD
# 6818: SlurmD
# 6819: SlurmDBD
EXPOSE 22 3306 6817 6818 6819
WORKDIR /home/worker WORKDIR /home/worker
# clean up
RUN rm -f /packages/slurm-*.rpm /packages/openmpi-*.rpm \
&& yum clean all \
&& rm -rf /var/cache/yum \
&& rm -f /Lmod-7.7.tar.bz2
COPY docker-entrypoint.sh /docker-entrypoint.sh COPY docker-entrypoint.sh /docker-entrypoint.sh
ENTRYPOINT ["/docker-entrypoint.sh"]
ENTRYPOINT ["/usr/local/bin/tini", "--", "/docker-entrypoint.sh"]

View File

@ -45,7 +45,7 @@ _wait_for_worker() {
# run slurmd # run slurmd
_slurmd() { _slurmd() {
if [ ! -f /.secret/slurm.conf ]; then if [ ! -f /.secret/slurm.conf ]; then
echo -n "cheking for slurm.conf" echo -n "checking for slurm.conf"
while [ ! -f /.secret/slurm.conf ]; do while [ ! -f /.secret/slurm.conf ]; do
echo -n "." echo -n "."
sleep 1 sleep 1