Stable services

This commit is contained in:
Aditya Ujeniya 2024-10-15 16:15:37 +02:00
parent c646309a25
commit f7558779da
4 changed files with 90 additions and 55 deletions

View File

@ -68,8 +68,6 @@ services:
hostname: slurmctld hostname: slurmctld
build: build:
context: ./slurm/controller context: ./slurm/controller
depends_on:
- slurmdbd
privileged: true privileged: true
ports: ports:
- "6817:6817" - "6817:6817"
@ -77,6 +75,8 @@ services:
- ${DATADIR}/slurm/home:/home - ${DATADIR}/slurm/home:/home
- ${DATADIR}/slurm/secret:/.secret - ${DATADIR}/slurm/secret:/.secret
- ./slurm/controller/slurm.conf:/home/config/slurm.conf - ./slurm/controller/slurm.conf:/home/config/slurm.conf
- /etc/timezone:/etc/timezone:ro
- /etc/localtime:/etc/localtime:ro
slurmdbd: slurmdbd:
container_name: slurmdbd container_name: slurmdbd
@ -85,6 +85,7 @@ services:
context: ./slurm/database context: ./slurm/database
depends_on: depends_on:
- mariadb - mariadb
- slurmctld
privileged: true privileged: true
ports: ports:
- "6819:6819" - "6819:6819"
@ -92,6 +93,8 @@ services:
- ${DATADIR}/slurm/home:/home - ${DATADIR}/slurm/home:/home
- ${DATADIR}/slurm/secret:/.secret - ${DATADIR}/slurm/secret:/.secret
- ./slurm/database/slurmdbd.conf:/home/config/slurmdbd.conf - ./slurm/database/slurmdbd.conf:/home/config/slurmdbd.conf
- /etc/timezone:/etc/timezone:ro
- /etc/localtime:/etc/localtime:ro
node01: node01:
container_name: node01 container_name: node01
@ -106,17 +109,7 @@ services:
- ${DATADIR}/slurm/secret:/.secret - ${DATADIR}/slurm/secret:/.secret
- ./slurm/worker/cgroup.conf:/home/config/cgroup.conf - ./slurm/worker/cgroup.conf:/home/config/cgroup.conf
- ./slurm/controller/slurm.conf:/home/config/slurm.conf - ./slurm/controller/slurm.conf:/home/config/slurm.conf
- /etc/timezone:/etc/timezone:ro
- /etc/localtime:/etc/localtime:ro
ports: ports:
- "6818:6818" - "6818:6818"
# slurm-worker02:
# container_name: node02
# hostname: node02
# build:
# context: ./slurm/worker
# depends_on:
# - slurm-controller
# privileged: true
# volumes:
# - ${DATADIR}/slurm/home:/home
# - ${DATADIR}/slurm/secret:/.secret

View File

@ -4,6 +4,18 @@ set -e
# Determine the system architecture dynamically # Determine the system architecture dynamically
ARCH=$(uname -m) ARCH=$(uname -m)
_delete_secrets() {
if [ -f /.secret/munge.key ]; then
echo "Removing secrets"
sudo rm -rf /.secret/munge.key
sudo rm -rf /.secret/worker-secret.tar.gz
sudo rm -rf /.secret/setup-worker-ssh.sh
echo "Done removing secrets"
ls /.secret/
fi
}
# start sshd server # start sshd server
_sshd_host() { _sshd_host() {
if [ ! -d /var/run/sshd ]; then if [ ! -d /var/run/sshd ]; then
@ -90,9 +102,17 @@ _slurmctld() {
done done
echo "" echo ""
mkdir -p /var/spool/slurm/ctld /var/spool/slurm/d /var/log/slurm /etc/slurm mkdir -p /var/spool/slurm/ctld /var/spool/slurm/d /var/log/slurm /etc/slurm
chown -R slurm: /var/spool/slurm/ctld /var/spool/slurm/d /var/log/slurm chown -R slurm: /var/spool/slurm/ctld /var/spool/slurm/d /var/log/slurm /var/spool /var/lib
touch /var/log/slurmctld.log touch /var/log/slurmctld.log
chown slurm: /var/log/slurmctld.log chown slurm: /var/log/slurmctld.log
touch /var/log/slurmd.log
chown slurm: /var/log/slurmd.log
# touch /var/run/slurm/d/slurmctld.pid
# chown slurm: /var/run/slurm/d/slurmctld.pid
# touch /var/run/slurm/d/slurmd.pid
# chown slurm:/var/run/slurm/d/slurmd.pid
if [[ ! -f /home/config/slurm.conf ]]; then if [[ ! -f /home/config/slurm.conf ]]; then
echo "### Missing slurm.conf ###" echo "### Missing slurm.conf ###"
exit exit
@ -103,15 +123,25 @@ _slurmctld() {
chmod 600 /etc/slurm/slurm.conf chmod 600 /etc/slurm/slurm.conf
fi fi
sacctmgr -i add cluster "snowflake" sudo yum install -y nc
sudo yum install -y procps
sudo yum install -y iputils
while ! nc -z slurmdbd 6819; do
echo "Waiting for slurmdbd to be ready..."
sleep 2
done
sacctmgr -i add cluster name=linux
sleep 2s sleep 2s
echo "Starting slurmctld" echo "Starting slurmctld"
cp -f /etc/slurm/slurm.conf /.secret/ cp -f /etc/slurm/slurm.conf /.secret/
/usr/sbin/slurmctld /usr/sbin/slurmctld -Dvv
echo "Started slurmctld" echo "Started slurmctld"
} }
### main ### ### main ###
_delete_secrets
_sshd_host _sshd_host
_ssh_worker _ssh_worker
_munge_start _munge_start

View File

@ -15,13 +15,13 @@ SlurmdPort=6818
AuthType=auth/munge AuthType=auth/munge
#JobCredentialPrivateKey= #JobCredentialPrivateKey=
#JobCredentialPublicCertificate= #JobCredentialPublicCertificate=
StateSaveLocation=/var/lib/slurmd StateSaveLocation=/var/lib/slurm/d
SlurmdSpoolDir=/var/spool/slurmd SlurmdSpoolDir=/var/spool/slurm/d
SwitchType=switch/none SwitchType=switch/none
MpiDefault=none MpiDefault=none
SlurmctldPidFile=/var/run/slurmd/slurmctld.pid SlurmctldPidFile=/var/run/slurm/d/slurmctld.pid
SlurmdPidFile=/var/run/slurmd/slurmd.pid SlurmdPidFile=/var/run/slurm/d/slurmd.pid
# ProctrackType=proctrack/linuxproc ProctrackType=proctrack/linuxproc
#PluginDir= #PluginDir=
#CacheGroups=0 #CacheGroups=0
#FirstJobId= #FirstJobId=
@ -79,7 +79,7 @@ JobCompLoc=/var/log/slurm/jobcomp.log
# ACCOUNTING # ACCOUNTING
#JobAcctGatherType=jobacct_gather/linux #JobAcctGatherType=jobacct_gather/linux
JobAcctGatherType=jobacct_gather/cgroup JobAcctGatherType=jobacct_gather/cgroup
ProctrackType=proctrack/cgroup # ProctrackType=proctrack/cgroup
JobAcctGatherFrequency=30 JobAcctGatherFrequency=30
# #
@ -97,7 +97,7 @@ PartitionName=debug Nodes=c[1-2] Default=YES MaxTime=INFINITE State=UP
# # COMPUTE NODES # # COMPUTE NODES
# NodeName=c[1-2] RealMemory=1000 State=UNKNOWN # NodeName=c[1-2] RealMemory=1000 State=UNKNOWN
NodeName=c[1-2] CPUs=12 Boards=1 SocketsPerBoard=2 CoresPerSocket=3 ThreadsPerCore=2 NodeName=node01 CPUs=2 Boards=1 SocketsPerBoard=2 CoresPerSocket=1 ThreadsPerCore=1
# # # #
# # PARTITIONS # # PARTITIONS

View File

@ -15,6 +15,10 @@ _sshd_host() {
# start munge using existing key # start munge using existing key
_munge_start_using_key() { _munge_start_using_key() {
sudo yum install -y nc
sudo yum install -y procps
sudo yum install -y iputils
echo -n "checking for munge.key" echo -n "checking for munge.key"
while [ ! -f /.secret/munge.key ]; do while [ ! -f /.secret/munge.key ]; do
echo -n "." echo -n "."
@ -35,14 +39,17 @@ _munge_start_using_key() {
# wait for worker user in shared /home volume # wait for worker user in shared /home volume
_wait_for_worker() { _wait_for_worker() {
echo "checking for id_rsa.pub"
if [ ! -f /home/worker/.ssh/id_rsa.pub ]; then if [ ! -f /home/worker/.ssh/id_rsa.pub ]; then
echo -n "checking for id_rsa.pub" echo "checking for id_rsa.pub"
while [ ! -f /home/worker/.ssh/id_rsa.pub ]; do while [ ! -f /home/worker/.ssh/id_rsa.pub ]; do
echo -n "." echo -n "."
sleep 1 sleep 1
done done
echo "" echo ""
fi fi
echo "done checking for id_rsa.pub"
} }
_start_dbus() { _start_dbus() {
@ -58,14 +65,18 @@ _slurmd() {
slurm-perlapi-22.05.6-1.el8.$ARCH.rpm \ slurm-perlapi-22.05.6-1.el8.$ARCH.rpm \
slurm-slurmd-22.05.6-1.el8.$ARCH.rpm \ slurm-slurmd-22.05.6-1.el8.$ARCH.rpm \
slurm-torque-22.05.6-1.el8.$ARCH.rpm slurm-torque-22.05.6-1.el8.$ARCH.rpm
echo "checking for slurm.conf"
if [ ! -f /.secret/slurm.conf ]; then if [ ! -f /.secret/slurm.conf ]; then
echo -n "checking for slurm.conf" echo "checking for slurm.conf"
while [ ! -f /.secret/slurm.conf ]; do while [ ! -f /.secret/slurm.conf ]; do
echo -n "." echo -n "."
sleep 1 sleep 1
done done
echo "" echo ""
fi fi
echo "found slurm.conf"
mkdir -p /var/spool/slurm/d /etc/slurm mkdir -p /var/spool/slurm/d /etc/slurm
chown slurm: /var/spool/slurm/d chown slurm: /var/spool/slurm/d
cp /home/config/cgroup.conf /etc/slurm/cgroup.conf cp /home/config/cgroup.conf /etc/slurm/cgroup.conf
@ -76,9 +87,10 @@ _slurmd() {
chmod 600 /etc/slurm/slurm.conf chmod 600 /etc/slurm/slurm.conf
touch /var/log/slurmd.log touch /var/log/slurmd.log
chown slurm: /var/log/slurmd.log chown slurm: /var/log/slurmd.log
echo -n "Starting slurmd"
/usr/sbin/slurmd echo "Starting slurmd"
echo -n "Started slurmd" /usr/sbin/slurmd -Dvv
echo "Started slurmd"
} }
### main ### ### main ###