Stable services

Aditya Ujeniya 2024-10-15 16:15:37 +02:00
parent c646309a25
commit f7558779da
4 changed files with 90 additions and 55 deletions


@@ -68,8 +68,6 @@ services:
hostname: slurmctld
build:
context: ./slurm/controller
depends_on:
- slurmdbd
privileged: true
ports:
- "6817:6817"
@@ -77,6 +75,8 @@ services:
- ${DATADIR}/slurm/home:/home
- ${DATADIR}/slurm/secret:/.secret
- ./slurm/controller/slurm.conf:/home/config/slurm.conf
- /etc/timezone:/etc/timezone:ro
- /etc/localtime:/etc/localtime:ro
slurmdbd:
container_name: slurmdbd
@@ -85,6 +85,7 @@ services:
context: ./slurm/database
depends_on:
- mariadb
- slurmctld
privileged: true
ports:
- "6819:6819"
@@ -92,6 +93,8 @@ services:
- ${DATADIR}/slurm/home:/home
- ${DATADIR}/slurm/secret:/.secret
- ./slurm/database/slurmdbd.conf:/home/config/slurmdbd.conf
- /etc/timezone:/etc/timezone:ro
- /etc/localtime:/etc/localtime:ro
node01:
container_name: node01
@@ -106,17 +109,7 @@ services:
- ${DATADIR}/slurm/secret:/.secret
- ./slurm/worker/cgroup.conf:/home/config/cgroup.conf
- ./slurm/controller/slurm.conf:/home/config/slurm.conf
- /etc/timezone:/etc/timezone:ro
- /etc/localtime:/etc/localtime:ro
ports:
- "6818:6818"
# slurm-worker02:
# container_name: node02
# hostname: node02
# build:
# context: ./slurm/worker
# depends_on:
# - slurm-controller
# privileged: true
# volumes:
# - ${DATADIR}/slurm/home:/home
# - ${DATADIR}/slurm/secret:/.secret
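The read-only /etc/timezone and /etc/localtime bind mounts that appear in these hunks share the host's timezone with the containers, so slurmctld, slurmdbd and the worker log and account with consistent clocks. A minimal sanity check, assuming Compose v2 (docker compose) and the service names used in this file:

# Host and container timestamps should report the same zone/offset
date
docker compose exec slurmctld date
docker compose exec slurmdbd date
docker compose exec node01 date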


@@ -4,6 +4,18 @@ set -e
# Determine the system architecture dynamically
ARCH=$(uname -m)
_delete_secrets() {
if [ -f /.secret/munge.key ]; then
echo "Removing secrets"
sudo rm -rf /.secret/munge.key
sudo rm -rf /.secret/worker-secret.tar.gz
sudo rm -rf /.secret/setup-worker-ssh.sh
echo "Done removing secrets"
ls /.secret/
fi
}
# start sshd server
_sshd_host() {
if [ ! -d /var/run/sshd ]; then
@@ -90,9 +102,17 @@ _slurmctld() {
done
echo ""
mkdir -p /var/spool/slurm/ctld /var/spool/slurm/d /var/log/slurm /etc/slurm
chown -R slurm: /var/spool/slurm/ctld /var/spool/slurm/d /var/log/slurm
chown -R slurm: /var/spool/slurm/ctld /var/spool/slurm/d /var/log/slurm /var/spool /var/lib
touch /var/log/slurmctld.log
chown slurm: /var/log/slurmctld.log
touch /var/log/slurmd.log
chown slurm: /var/log/slurmd.log
# touch /var/run/slurm/d/slurmctld.pid
# chown slurm: /var/run/slurm/d/slurmctld.pid
# touch /var/run/slurm/d/slurmd.pid
# chown slurm: /var/run/slurm/d/slurmd.pid
if [[ ! -f /home/config/slurm.conf ]]; then
echo "### Missing slurm.conf ###"
exit
@@ -103,15 +123,25 @@ _slurmctld() {
chmod 600 /etc/slurm/slurm.conf
fi
sacctmgr -i add cluster "snowflake"
sudo yum install -y nc
sudo yum install -y procps
sudo yum install -y iputils
while ! nc -z slurmdbd 6819; do
echo "Waiting for slurmdbd to be ready..."
sleep 2
done
sacctmgr -i add cluster name=linux
sleep 2s
echo "Starting slurmctld"
cp -f /etc/slurm/slurm.conf /.secret/
/usr/sbin/slurmctld
/usr/sbin/slurmctld -Dvv
echo "Started slurmctld"
}
### main ###
_delete_secrets
_sshd_host
_ssh_worker
_munge_start
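The nc -z loop above blocks slurmctld until slurmdbd is actually listening on port 6819, since depends_on in the compose file only orders container start-up, not service readiness. A possible variant with an upper bound, so the container exits instead of spinning forever (wait_for_port is a hypothetical helper, not part of this commit):

# Hypothetical bounded wait; nc -z only probes whether the TCP port accepts connections
wait_for_port() {
    local host=$1 port=$2 tries=${3:-30}
    while ! nc -z "$host" "$port"; do
        tries=$((tries - 1))
        if [ "$tries" -le 0 ]; then
            echo "Timed out waiting for $host:$port" >&2
            return 1
        fi
        echo "Waiting for $host:$port ..."
        sleep 2
    done
}

wait_for_port slurmdbd 6819 || exit 1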


@@ -15,13 +15,13 @@ SlurmdPort=6818
AuthType=auth/munge
#JobCredentialPrivateKey=
#JobCredentialPublicCertificate=
StateSaveLocation=/var/lib/slurmd
SlurmdSpoolDir=/var/spool/slurmd
StateSaveLocation=/var/lib/slurm/d
SlurmdSpoolDir=/var/spool/slurm/d
SwitchType=switch/none
MpiDefault=none
SlurmctldPidFile=/var/run/slurmd/slurmctld.pid
SlurmdPidFile=/var/run/slurmd/slurmd.pid
# ProctrackType=proctrack/linuxproc
SlurmctldPidFile=/var/run/slurm/d/slurmctld.pid
SlurmdPidFile=/var/run/slurm/d/slurmd.pid
ProctrackType=proctrack/linuxproc
#PluginDir=
#CacheGroups=0
#FirstJobId=
@@ -79,7 +79,7 @@ JobCompLoc=/var/log/slurm/jobcomp.log
# ACCOUNTING
#JobAcctGatherType=jobacct_gather/linux
JobAcctGatherType=jobacct_gather/cgroup
ProctrackType=proctrack/cgroup
# ProctrackType=proctrack/cgroup
JobAcctGatherFrequency=30
#
@@ -97,7 +97,7 @@ PartitionName=debug Nodes=c[1-2] Default=YES MaxTime=INFINITE State=UP
# # COMPUTE NODES
# NodeName=c[1-2] RealMemory=1000 State=UNKNOWN
NodeName=c[1-2] CPUs=12 Boards=1 SocketsPerBoard=2 CoresPerSocket=3 ThreadsPerCore=2
NodeName=node01 CPUs=2 Boards=1 SocketsPerBoard=2 CoresPerSocket=1 ThreadsPerCore=1
# #
# # PARTITIONS
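The NodeName=node01 line has to match what slurmd detects inside the worker container; if the configured sockets/cores exceed what the daemon finds, the node may be marked invalid or drained. slurmd -C prints the detected layout, so a quick comparison is possible (sketch, assuming the compose service names above and the slurm-slurmd RPM installed in the worker image):

# Print the topology slurmd sees on the worker and compare it with slurm.conf
docker compose exec node01 slurmd -C
# Illustrative output, roughly:
# NodeName=node01 CPUs=2 Boards=1 SocketsPerBoard=2 CoresPerSocket=1 ThreadsPerCore=1 ...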


@@ -15,6 +15,10 @@ _sshd_host() {
# start munge using existing key
_munge_start_using_key() {
sudo yum install -y nc
sudo yum install -y procps
sudo yum install -y iputils
echo -n "cheking for munge.key"
while [ ! -f /.secret/munge.key ]; do
echo -n "."
@@ -35,50 +39,58 @@ _munge_start_using_key() {
# wait for worker user in shared /home volume
_wait_for_worker() {
echo "checking for id_rsa.pub"
if [ ! -f /home/worker/.ssh/id_rsa.pub ]; then
echo -n "checking for id_rsa.pub"
echo "checking for id_rsa.pub"
while [ ! -f /home/worker/.ssh/id_rsa.pub ]; do
echo -n "."
sleep 1
done
echo ""
fi
echo "done checking for id_rsa.pub"
}
_start_dbus() {
dbus-uuidgen > /var/lib/dbus/machine-id
mkdir -p /var/run/dbus
dbus-daemon --config-file=/usr/share/dbus-1/system.conf --print-address
dbus-uuidgen >/var/lib/dbus/machine-id
mkdir -p /var/run/dbus
dbus-daemon --config-file=/usr/share/dbus-1/system.conf --print-address
}
# run slurmd
_slurmd() {
cd /root/rpmbuild/RPMS/$ARCH
yum -y --nogpgcheck localinstall slurm-22.05.6-1.el8.$ARCH.rpm \
slurm-perlapi-22.05.6-1.el8.$ARCH.rpm \
slurm-slurmd-22.05.6-1.el8.$ARCH.rpm \
slurm-torque-22.05.6-1.el8.$ARCH.rpm
if [ ! -f /.secret/slurm.conf ]; then
echo -n "checking for slurm.conf"
while [ ! -f /.secret/slurm.conf ]; do
echo -n "."
sleep 1
done
echo ""
fi
mkdir -p /var/spool/slurm/d /etc/slurm
chown slurm: /var/spool/slurm/d
cp /home/config/cgroup.conf /etc/slurm/cgroup.conf
chown slurm: /etc/slurm/cgroup.conf
chmod 600 /etc/slurm/cgroup.conf
cp /home/config/slurm.conf /etc/slurm/slurm.conf
chown slurm: /etc/slurm/slurm.conf
chmod 600 /etc/slurm/slurm.conf
touch /var/log/slurmd.log
chown slurm: /var/log/slurmd.log
echo -n "Starting slurmd"
/usr/sbin/slurmd
echo -n "Started slurmd"
cd /root/rpmbuild/RPMS/$ARCH
yum -y --nogpgcheck localinstall slurm-22.05.6-1.el8.$ARCH.rpm \
slurm-perlapi-22.05.6-1.el8.$ARCH.rpm \
slurm-slurmd-22.05.6-1.el8.$ARCH.rpm \
slurm-torque-22.05.6-1.el8.$ARCH.rpm
echo "checking for slurm.conf"
if [ ! -f /.secret/slurm.conf ]; then
echo "checking for slurm.conf"
while [ ! -f /.secret/slurm.conf ]; do
echo -n "."
sleep 1
done
echo ""
fi
echo "found slurm.conf"
mkdir -p /var/spool/slurm/d /etc/slurm
chown slurm: /var/spool/slurm/d
cp /home/config/cgroup.conf /etc/slurm/cgroup.conf
chown slurm: /etc/slurm/cgroup.conf
chmod 600 /etc/slurm/cgroup.conf
cp /home/config/slurm.conf /etc/slurm/slurm.conf
chown slurm: /etc/slurm/slurm.conf
chmod 600 /etc/slurm/slurm.conf
touch /var/log/slurmd.log
chown slurm: /var/log/slurmd.log
echo "Starting slurmd"
/usr/sbin/slurmd -Dvv
echo "Started slurmd"
}
### main ###
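Once the worker's slurmd registers with the controller, the cluster state can be checked from the controller container; a short sketch, assuming the slurmctld service name from the compose file above:

# List partitions/nodes and inspect the worker's registration state
docker compose exec slurmctld sinfo
docker compose exec slurmctld scontrol show node node01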