Update slurm container setup

This commit is contained in:
2023-08-21 09:57:51 +02:00
parent 0a3a6e4752
commit a43b95b2bd
13 changed files with 213 additions and 103 deletions

View File

@@ -18,6 +18,7 @@ RUN dnf --enablerepo=powertools install munge-devel -y
RUN yum install rng-tools -y
RUN yum install -y python3 gcc openssl openssl-devel \
openssh-server openssh-clients dbus-devel \
pam-devel numactl numactl-devel hwloc sudo \
lua readline-devel ncurses-devel man2html \
libibmad libibumad rpm-build perl-ExtUtils-MakeMaker.noarch rpm-build make wget
@@ -42,4 +43,4 @@ VOLUME ["/home", "/.secret"]
# 6817: SlurmCtlD
# 6818: SlurmD
# 6819: SlurmDBD
EXPOSE 22 3306 6817 6818 6819
EXPOSE 22 6817 6818 6819

View File

@@ -1,4 +1,5 @@
IMAGE = scidas/slurm.base
include ../../.env
IMAGE = clustercockpit/slurm.base
.PHONY: build clean

View File

@@ -1,4 +1,4 @@
FROM clustercockpit/slurm.base:latest
FROM clustercockpit/slurm.base:22.05.6
MAINTAINER Jan Eitzinger <jan.eitzinger@fau.de>
# clean up

View File

@@ -7,6 +7,7 @@ _sshd_host() {
mkdir /var/run/sshd
ssh-keygen -t rsa -f /etc/ssh/ssh_host_rsa_key -N ''
fi
echo "Starting sshd"
/usr/sbin/sshd
}
@@ -40,6 +41,7 @@ EOF2
# start munge and generate key
_munge_start() {
echo "Starting munge"
chown -R munge: /etc/munge /var/lib/munge /var/log/munge /var/run/munge
chmod 0700 /etc/munge
chmod 0711 /var/lib/munge
@@ -69,8 +71,12 @@ _copy_secrets() {
# run slurmctld
_slurmctld() {
cd /root/rpmbuild/RPMS/aarch64
yum -y --nogpgcheck localinstall slurm-22.05.6-1.el8.aarch64.rpm slurm-perlapi-22.05.6-1.el8.aarch64.rpm slurm-slurmctld-22.05.6-1.el8.aarch64.rpm
echo -n "checking for slurmdbd.conf"
yum -y --nogpgcheck localinstall slurm-22.05.6-1.el8.aarch64.rpm \
slurm-perlapi-22.05.6-1.el8.aarch64.rpm \
slurm-slurmd-22.05.6-1.el8.aarch64.rpm \
slurm-torque-22.05.6-1.el8.aarch64.rpm \
slurm-slurmctld-22.05.6-1.el8.aarch64.rpm
echo "checking for slurmdbd.conf"
while [ ! -f /.secret/slurmdbd.conf ]; do
echo -n "."
sleep 1
@@ -86,11 +92,14 @@ _slurmctld() {
else
echo "### use provided slurm.conf ###"
cp /home/config/slurm.conf /etc/slurm/slurm.conf
chown slurm: /etc/slurm/slurm.conf
chmod 600 /etc/slurm/slurm.conf
fi
sacctmgr -i add cluster "snowflake"
sleep 2s
/usr/sbin/slurmctld
echo "Starting slurmctld"
cp -f /etc/slurm/slurm.conf /.secret/
/usr/sbin/slurmctld
}
### main ###

View File

@@ -1,4 +1,4 @@
FROM clustercockpit/slurm.base:latest
FROM clustercockpit/slurm.base:22.05.6
MAINTAINER Jan Eitzinger <jan.eitzinger@fau.de>
# clean up

View File

@@ -12,32 +12,6 @@ _sshd_host() {
/usr/sbin/sshd
}
# slurm database user settings
_slurm_acct_db() {
{
echo "create database slurm_acct_db;"
echo "create user 'slurm'@slurmdb'';"
echo "set password for 'slurm'@'slurmdb' = password('demo');"
echo "grant usage on *.* to 'slurm'@'slurmdb';"
echo "grant all privileges on slurm_acct_db.* to 'slurm'@'slurmdb';"
echo "flush privileges;"
} >> $SLURM_ACCT_DB_SQL
}
# start database
_mariadb_start() {
# mariadb somehow expects `resolveip` to be found under this path; see https://github.com/SciDAS/slurm-in-docker/issues/26
ln -s /usr/bin/resolveip /usr/libexec/resolveip
mysql_install_db
chown -R mysql: /var/lib/mysql/ /var/log/mariadb/ /var/run/mariadb
cd /var/lib/mysql
mysqld_safe --user=mysql &
cd /
_slurm_acct_db
sleep 5s
mysql -uroot < $SLURM_ACCT_DB_SQL
}
# start munge using existing key
_munge_start_using_key() {
if [ ! -f /.secret/munge.key ]; then
@@ -74,24 +48,28 @@ _wait_for_worker() {
# run slurmdbd
_slurmdbd() {
mkdir -p /var/spool/slurm/d \
/var/log/slurm
chown slurm: /var/spool/slurm/d \
/var/log/slurm
cd /root/rpmbuild/RPMS/aarch64
yum -y --nogpgcheck localinstall slurm-22.05.6-1.el8.aarch64.rpm \
slurm-perlapi-22.05.6-1.el8.aarch64.rpm \
slurm-slurmdbd-22.05.6-1.el8.aarch64.rpm
mkdir -p /var/spool/slurm/d /var/log/slurm /etc/slurm
chown slurm: /var/spool/slurm/d /var/log/slurm
if [[ ! -f /home/config/slurmdbd.conf ]]; then
echo "### Missing slurmdbd.conf ###"
exit
else
echo "### use provided slurmdbd.conf ###"
cp /home/config/slurmdbd.conf /etc/slurm/slurmdbd.conf
chown slurm: /etc/slurm/slurmdbd.conf
chmod 600 /etc/slurm/slurmdbd.conf
fi
/usr/sbin/slurmdbd
echo "Starting slurmdbd"
cp /etc/slurm/slurmdbd.conf /.secret/slurmdbd.conf
/usr/sbin/slurmdbd
}
### main ###
_sshd_host
_mariadb_start
_munge_start_using_key
_wait_for_worker
_slurmdbd

10
slurm/rest/Dockerfile Normal file
View File

@@ -0,0 +1,10 @@
FROM clustercockpit/slurm.base:22.05.6
# MAINTAINER is deprecated; record the author as an OCI image label instead.
LABEL org.opencontainers.image.authors="Jan Eitzinger <jan.eitzinger@fau.de>"

# clean up
# NOTE(review): docker-entrypoint.sh installs the Slurm RPMs from
# /root/rpmbuild/RPMS/aarch64 at container start, so this glob (one directory
# level above) must not be widened to cover that directory.
RUN rm -f /root/rpmbuild/RPMS/slurm-*.rpm \
    && yum clean all \
    && rm -rf /var/cache/yum

COPY docker-entrypoint.sh /docker-entrypoint.sh
ENTRYPOINT ["/docker-entrypoint.sh"]

108
slurm/rest/docker-entrypoint.sh Executable file
View File

@@ -0,0 +1,108 @@
#!/usr/bin/env bash
set -e
# start sshd server
# On first start (no /var/run/sshd yet) create the runtime directory and an
# RSA host key with an empty passphrase, then launch sshd.
_sshd_host() {
    local run_dir=/var/run/sshd
    local host_key=/etc/ssh/ssh_host_rsa_key

    # The runtime directory doubles as the "already initialised" marker.
    if [[ ! -d "$run_dir" ]]; then
        mkdir "$run_dir"
        ssh-keygen -t rsa -N '' -f "$host_key"
    fi

    /usr/sbin/sshd
}
# setup worker ssh to be passwordless
# Creates /home/worker if it is missing, writes a helper script
# (/home/worker/setup-worker-ssh.sh) and runs it as user "worker".
# The helper script generates an RSA key pair, authorises it for the same
# user, writes an ssh client config that disables host key checking, and packs
# ~/.ssh into ~/worker-secret.tar.gz.
_ssh_worker() {
if [[ ! -d /home/worker ]]; then
mkdir -p /home/worker
chown -R worker:worker /home/worker
fi
# NOTE(review): the EOF2 delimiter is unquoted, so the $(whoami), $(hostname)
# and $(date -I) substitutions below are expanded by the shell running this
# entrypoint while the file is written, not later when "worker" runs the
# script. "~" is not expanded in a here-document and stays literal.
# The inner "<<EOF ... EOF" is part of the generated script.
cat > /home/worker/setup-worker-ssh.sh <<EOF2
mkdir -p ~/.ssh
chmod 0700 ~/.ssh
ssh-keygen -b 2048 -t rsa -f ~/.ssh/id_rsa -q -N "" -C "$(whoami)@$(hostname)-$(date -I)"
cat ~/.ssh/id_rsa.pub > ~/.ssh/authorized_keys
chmod 0640 ~/.ssh/authorized_keys
cat >> ~/.ssh/config <<EOF
Host *
StrictHostKeyChecking no
UserKnownHostsFile /dev/null
LogLevel QUIET
EOF
chmod 0644 ~/.ssh/config
cd ~/
tar -czvf ~/worker-secret.tar.gz .ssh
cd -
EOF2
# Make the helper executable, hand it to the worker user and run it as that user.
chmod +x /home/worker/setup-worker-ssh.sh
chown worker: /home/worker/setup-worker-ssh.sh
sudo -u worker /home/worker/setup-worker-ssh.sh
}
# start munge and generate key
# Fixes ownership and permissions of the munge directories, generates a key,
# starts munged as user "munge" and runs the munge client tools once.
_munge_start() {
# Hand all munge directories to the munge user and set their modes.
chown -R munge: /etc/munge /var/lib/munge /var/log/munge /var/run/munge
chmod 0700 /etc/munge
chmod 0711 /var/lib/munge
chmod 0700 /var/log/munge
chmod 0755 /var/run/munge
# create-munge-key is invoked twice (-f, then -r -f) with rngd started in
# between to feed entropy from /dev/urandom.
/sbin/create-munge-key -f
rngd -r /dev/urandom
/usr/sbin/create-munge-key -r -f
# NOTE(review): this overwrites /etc/munge/munge.key with 1024 bytes from
# /dev/urandom, so the key that is finally used is the one written here;
# presumably the create-munge-key calls above are redundant - confirm.
sh -c "dd if=/dev/urandom bs=1 count=1024 > /etc/munge/munge.key"
chown munge: /etc/munge/munge.key
chmod 400 /etc/munge/munge.key
sudo -u munge /sbin/munged
# Presumably smoke tests of the running daemon; with `set -e` active in this
# script a failure of any of them aborts the entrypoint.
munge -n
munge -n | unmunge
remunge
}
# copy secrets to /.secret directory for other nodes
# Publishes the worker ssh archive, the ssh setup script and the munge key in
# /.secret, then removes the archive and the setup script from /home/worker.
_copy_secrets() {
    cp /home/worker/worker-secret.tar.gz /.secret/worker-secret.tar.gz
    # Source path must be absolute (/home/...); the former "thome/..." made cp
    # fail, and with `set -e` that aborted the whole entrypoint.
    cp /home/worker/setup-worker-ssh.sh /.secret/setup-worker-ssh.sh
    cp /etc/munge/munge.key /.secret/munge.key
    rm -f /home/worker/worker-secret.tar.gz
    rm -f /home/worker/setup-worker-ssh.sh
}
# run slurmctld
# Installs the locally built Slurm RPMs (including slurmrestd), waits until
# /.secret/slurmdbd.conf exists, prepares the spool and log locations, installs
# the provided slurm.conf, registers the cluster "snowflake" with sacctmgr and
# starts slurmctld.
# Exits with status 1 when /home/config/slurm.conf is missing.
_slurmctld() {
    cd /root/rpmbuild/RPMS/aarch64
    yum -y --nogpgcheck localinstall slurm-22.05.6-1.el8.aarch64.rpm \
        slurm-perlapi-22.05.6-1.el8.aarch64.rpm \
        slurm-slurmd-22.05.6-1.el8.aarch64.rpm \
        slurm-torque-22.05.6-1.el8.aarch64.rpm \
        slurm-slurmctld-22.05.6-1.el8.aarch64.rpm \
        slurm-slurmrestd-22.05.6-1.el8.aarch64.rpm

    # Block until slurmdbd.conf shows up in the shared /.secret volume
    # (presumably published by the slurmdbd container - confirm).
    echo -n "checking for slurmdbd.conf"
    while [ ! -f /.secret/slurmdbd.conf ]; do
        echo -n "."
        sleep 1
    done
    echo ""

    mkdir -p /var/spool/slurm/ctld /var/spool/slurm/d /var/log/slurm /etc/slurm
    chown -R slurm: /var/spool/slurm/ctld /var/spool/slurm/d /var/log/slurm
    touch /var/log/slurmctld.log
    chown slurm: /var/log/slurmctld.log

    if [[ ! -f /home/config/slurm.conf ]]; then
        echo "### Missing slurm.conf ###"
        # A bare `exit` returns the status of the echo above (0), which would
        # report a misconfigured container as a clean exit.
        exit 1
    fi
    echo "### use provided slurm.conf ###"
    cp /home/config/slurm.conf /etc/slurm/slurm.conf

    sacctmgr -i add cluster "snowflake"
    sleep 2s
    # Publish slurm.conf before starting the daemon so the copy does not depend
    # on slurmctld's exit status.
    cp -f /etc/slurm/slurm.conf /.secret/
    /usr/sbin/slurmctld
}
### main ###
# Run the startup steps in order; `set -e` aborts the script on the first
# step that fails.
for startup_step in _sshd_host _ssh_worker _munge_start _copy_secrets _slurmctld; do
    "$startup_step"
done

# Keep the script in the foreground after the startup steps have returned.
tail -f /dev/null

View File

@@ -1,4 +1,4 @@
FROM clustercockpit/slurm.base:latest
FROM clustercockpit/slurm.base:22.05.6
MAINTAINER Jan Eitzinger <jan.eitzinger@fau.de>
# clean up

View File

@@ -33,7 +33,7 @@ _munge_start_using_key() {
# wait for worker user in shared /home volume
_wait_for_worker() {
if [ ! -f /home/worker/.ssh/id_rsa.pub ]; then
echo -n "cheking for id_rsa.pub"
echo -n "checking for id_rsa.pub"
while [ ! -f /home/worker/.ssh/id_rsa.pub ]; do
echo -n "."
sleep 1
@@ -42,28 +42,46 @@ _wait_for_worker() {
fi
}
# start the system dbus daemon
# Writes a fresh machine id, makes sure the dbus runtime directory exists and
# launches dbus-daemon with the system configuration, printing its address.
_start_dbus() {
    local machine_id_file=/var/lib/dbus/machine-id
    local runtime_dir=/var/run/dbus
    local system_conf=/usr/share/dbus-1/system.conf

    dbus-uuidgen > "$machine_id_file"
    mkdir -p "$runtime_dir"
    dbus-daemon --config-file="$system_conf" --print-address
}
# run slurmd
_slurmd() {
if [ ! -f /.secret/slurm.conf ]; then
echo -n "checking for slurm.conf"
while [ ! -f /.secret/slurm.conf ]; do
echo -n "."
sleep 1
done
echo ""
fi
mkdir -p /var/spool/slurm/d
chown slurm: /var/spool/slurm/d
cp /.secret/slurm.conf /etc/slurm/slurm.conf
touch /var/log/slurmd.log
chown slurm: /var/log/slurmd.log
/usr/sbin/slurmd
cd /root/rpmbuild/RPMS/aarch64
yum -y --nogpgcheck localinstall slurm-22.05.6-1.el8.aarch64.rpm \
slurm-perlapi-22.05.6-1.el8.aarch64.rpm \
slurm-slurmd-22.05.6-1.el8.aarch64.rpm \
slurm-torque-22.05.6-1.el8.aarch64.rpm
if [ ! -f /.secret/slurm.conf ]; then
echo -n "checking for slurm.conf"
while [ ! -f /.secret/slurm.conf ]; do
echo -n "."
sleep 1
done
echo ""
fi
mkdir -p /var/spool/slurm/d /etc/slurm
chown slurm: /var/spool/slurm/d
cp /home/config/cgroup.conf /etc/slurm/cgroup.conf
chown slurm: /etc/slurm/cgroup.conf
chmod 600 /etc/slurm/cgroup.conf
cp /home/config/slurm.conf /etc/slurm/slurm.conf
chown slurm: /etc/slurm/slurm.conf
chmod 600 /etc/slurm/slurm.conf
touch /var/log/slurmd.log
chown slurm: /var/log/slurmd.log
echo -n "Starting slurmd"
/usr/sbin/slurmd
}
### main ###
_sshd_host
_munge_start_using_key
_wait_for_worker
_start_dbus
_slurmd
tail -f /dev/null