Slurm version update and rest service add

This commit is contained in:
Aditya Ujeniya
2024-10-22 20:27:09 +02:00
parent 2d15d513c6
commit 2b68597724
14 changed files with 99 additions and 101 deletions

View File

@@ -1,4 +1,4 @@
FROM clustercockpit/slurm.base:22.05.6
FROM clustercockpit/slurm.base:24.05.3
LABEL org.opencontainers.image.authors="jan.eitzinger@fau.de"
# clean up

View File

@@ -3,109 +3,73 @@ set -e
# Determine the system architecture dynamically
ARCH=$(uname -m)
SLURM_VERSION="24.05.3"
# start sshd server
_sshd_host() {
if [ ! -d /var/run/sshd ]; then
mkdir /var/run/sshd
ssh-keygen -t rsa -f /etc/ssh/ssh_host_rsa_key -N ''
fi
/usr/sbin/sshd
}
# setup worker ssh to be passwordless
_ssh_worker() {
if [[ ! -d /home/worker ]]; then
mkdir -p /home/worker
chown -R worker:worker /home/worker
if [ ! -d /var/run/sshd ]; then
mkdir /var/run/sshd
ssh-keygen -t rsa -f /etc/ssh/ssh_host_rsa_key -N ''
fi
cat > /home/worker/setup-worker-ssh.sh <<EOF2
mkdir -p ~/.ssh
chmod 0700 ~/.ssh
ssh-keygen -b 2048 -t rsa -f ~/.ssh/id_rsa -q -N "" -C "$(whoami)@$(hostname)-$(date -I)"
cat ~/.ssh/id_rsa.pub > ~/.ssh/authorized_keys
chmod 0640 ~/.ssh/authorized_keys
cat >> ~/.ssh/config <<EOF
Host *
StrictHostKeyChecking no
UserKnownHostsFile /dev/null
LogLevel QUIET
EOF
chmod 0644 ~/.ssh/config
cd ~/
tar -czvf ~/worker-secret.tar.gz .ssh
cd -
EOF2
chmod +x /home/worker/setup-worker-ssh.sh
chown worker: /home/worker/setup-worker-ssh.sh
sudo -u worker /home/worker/setup-worker-ssh.sh
/usr/sbin/sshd
}
# start munge and generate key
_munge_start() {
# start munge using existing key
_munge_start_using_key() {
if [ ! -f /.secret/munge.key ]; then
echo -n "checking for munge.key"
while [ ! -f /.secret/munge.key ]; do
echo -n "."
sleep 1
done
echo ""
fi
cp /.secret/munge.key /etc/munge/munge.key
chown -R munge: /etc/munge /var/lib/munge /var/log/munge /var/run/munge
chmod 0700 /etc/munge
chmod 0711 /var/lib/munge
chmod 0700 /var/log/munge
chmod 0755 /var/run/munge
/sbin/create-munge-key -f
rngd -r /dev/urandom
/usr/sbin/create-munge-key -r -f
sh -c "dd if=/dev/urandom bs=1 count=1024 > /etc/munge/munge.key"
chown munge: /etc/munge/munge.key
chmod 400 /etc/munge/munge.key
sudo -u munge /sbin/munged
munge -n
munge -n | unmunge
remunge
}
# copy secrets to /.secret directory for other nodes
_copy_secrets() {
cp /home/worker/worker-secret.tar.gz /.secret/worker-secret.tar.gz
cp thome/worker/setup-worker-ssh.sh /.secret/setup-worker-ssh.sh
cp /etc/munge/munge.key /.secret/munge.key
rm -f /home/worker/worker-secret.tar.gz
rm -f /home/worker/setup-worker-ssh.sh
}
# run slurmctld
_slurmctld() {
# run slurmrestd
_slurmrestd() {
cd /root/rpmbuild/RPMS/$ARCH
yum -y --nogpgcheck localinstall slurm-22.05.6-1.el8.$ARCH.rpm \
slurm-perlapi-22.05.6-1.el8.$ARCH.rpm \
slurm-slurmd-22.05.6-1.el8.$ARCH.rpm \
slurm-torque-22.05.6-1.el8.$ARCH.rpm \
slurm-slurmctld-22.05.6-1.el8.$ARCH.rpm \
slurm-slurmrestd-22.05.6-1.el8.$ARCH.rpm
yum -y --nogpgcheck localinstall slurm-$SLURM_VERSION*.$ARCH.rpm \
slurm-perlapi-$SLURM_VERSION*.$ARCH.rpm \
slurm-slurmd-$SLURM_VERSION*.$ARCH.rpm \
slurm-torque-$SLURM_VERSION*.$ARCH.rpm \
slurm-slurmctld-$SLURM_VERSION*.$ARCH.rpm \
slurm-slurmrestd-$SLURM_VERSION*.$ARCH.rpm
echo -n "checking for slurmdbd.conf"
while [ ! -f /.secret/slurmdbd.conf ]; do
echo -n "."
sleep 1
done
echo ""
mkdir -p /var/spool/slurm/ctld /var/spool/slurm/d /var/log/slurm /etc/slurm
chown -R slurm: /var/spool/slurm/ctld /var/spool/slurm/d /var/log/slurm
touch /var/log/slurmctld.log
chown slurm: /var/log/slurmctld.log
if [[ ! -f /home/config/slurm.conf ]]; then
# mkdir -p /var/spool/slurm/ctld /var/spool/slurm/d /var/log/slurm /etc/slurm
# chown -R slurm: /var/spool/slurm/ctld /var/spool/slurm/d /var/log/slurm
touch /var/log/slurmrestd.log
chown slurm: /var/log/slurmrestd.log
if [[ ! -f /home/config/slurmrestd.conf ]]; then
echo "### Missing slurm.conf ###"
exit
else
echo "### use provided slurm.conf ###"
cp /home/config/slurm.conf /etc/slurm/slurm.conf
echo "### use provided slurmrestd.conf ###"
cp /home/config/slurmrestd.conf /etc/config/slurmrestd.conf
fi
sacctmgr -i add cluster "snowflake"
sleep 2s
/usr/sbin/slurmctld
cp -f /etc/slurm/slurm.conf /.secret/
/usr/sbin/slurmrestd -f /etc/config/slurmrestd.conf 0.0.0.0:6820 -Dvv
}
### main ###
_sshd_host
_ssh_worker
_munge_start
_copy_secrets
_slurmctld
_munge_start_using_key
_slurmrestd
tail -f /dev/null

View File

@@ -0,0 +1,6 @@
#
# Example slurmdbd.conf file.
#
include /etc/slurm/slurm.conf
AuthType=auth/munge