Stable services

This commit is contained in:
Aditya Ujeniya
2024-10-15 16:15:37 +02:00
parent c646309a25
commit f7558779da
4 changed files with 90 additions and 55 deletions

View File

@@ -4,6 +4,18 @@ set -e
# Determine the system architecture dynamically
ARCH=$(uname -m)
#######################################
# Remove the one-time bootstrap secrets from the secret directory.
# Every known secret file is checked on its own, so a leftover from an
# earlier, interrupted cleanup is still deleted even when munge.key
# itself is already gone.
# Arguments: $1 - secret directory (optional, defaults to /.secret)
# Outputs:   progress messages and a listing of the directory on stdout;
#            nothing at all when no secret file is present
# Returns:   0 on success; non-zero if the removal or the listing fails
#######################################
_delete_secrets() {
  local secret_dir=${1:-/.secret}
  local -a secret_names=(munge.key worker-secret.tar.gz setup-worker-ssh.sh)
  local -a found=()
  local name

  # Collect only the secrets that actually exist in the directory.
  for name in "${secret_names[@]}"; do
    if [ -e "${secret_dir}/${name}" ]; then
      found+=("${secret_dir}/${name}")
    fi
  done

  # Nothing to clean up: stay silent and do not invoke sudo.
  if [ "${#found[@]}" -eq 0 ]; then
    return 0
  fi

  echo "Removing secrets"
  # '--' stops option parsing so a path can never be read as an rm flag.
  sudo rm -rf -- "${found[@]}"
  echo "Done removing secrets"
  # Show what is left in the directory after the cleanup.
  ls "${secret_dir}/"
}
# start sshd server
_sshd_host() {
if [ ! -d /var/run/sshd ]; then
@@ -90,9 +102,17 @@ _slurmctld() {
done
echo ""
mkdir -p /var/spool/slurm/ctld /var/spool/slurm/d /var/log/slurm /etc/slurm
chown -R slurm: /var/spool/slurm/ctld /var/spool/slurm/d /var/log/slurm
chown -R slurm: /var/spool/slurm/ctld /var/spool/slurm/d /var/log/slurm /var/spool /var/lib
touch /var/log/slurmctld.log
chown slurm: /var/log/slurmctld.log
touch /var/log/slurmd.log
chown slurm: /var/log/slurmd.log
# touch /var/run/slurm/d/slurmctld.pid
# chown slurm: /var/run/slurm/d/slurmctld.pid
# touch /var/run/slurm/d/slurmd.pid
# chown slurm: /var/run/slurm/d/slurmd.pid
if [[ ! -f /home/config/slurm.conf ]]; then
echo "### Missing slurm.conf ###"
exit
@@ -103,15 +123,25 @@ _slurmctld() {
chmod 600 /etc/slurm/slurm.conf
fi
sacctmgr -i add cluster "snowflake"
sudo yum install -y nc
sudo yum install -y procps
sudo yum install -y iputils
while ! nc -z slurmdbd 6819; do
echo "Waiting for slurmdbd to be ready..."
sleep 2
done
sacctmgr -i add cluster name=linux
sleep 2s
echo "Starting slurmctld"
cp -f /etc/slurm/slurm.conf /.secret/
/usr/sbin/slurmctld
/usr/sbin/slurmctld -Dvv
echo "Started slurmctld"
}
### main ###
# Startup steps visible here run in this order, each called with no arguments.
# _delete_secrets goes first, ahead of the sshd/ssh/munge helpers
# (presumably so stale secrets from a previous run are gone before the
# services come up — TODO confirm against the rest of the script).
_delete_secrets
_sshd_host
_ssh_worker
_munge_start

View File

@@ -15,13 +15,13 @@ SlurmdPort=6818
AuthType=auth/munge
#JobCredentialPrivateKey=
#JobCredentialPublicCertificate=
StateSaveLocation=/var/lib/slurmd
SlurmdSpoolDir=/var/spool/slurmd
StateSaveLocation=/var/lib/slurm/d
SlurmdSpoolDir=/var/spool/slurm/d
SwitchType=switch/none
MpiDefault=none
SlurmctldPidFile=/var/run/slurmd/slurmctld.pid
SlurmdPidFile=/var/run/slurmd/slurmd.pid
# ProctrackType=proctrack/linuxproc
SlurmctldPidFile=/var/run/slurm/d/slurmctld.pid
SlurmdPidFile=/var/run/slurm/d/slurmd.pid
ProctrackType=proctrack/linuxproc
#PluginDir=
#CacheGroups=0
#FirstJobId=
@@ -79,7 +79,7 @@ JobCompLoc=/var/log/slurm/jobcomp.log
# ACCOUNTING
#JobAcctGatherType=jobacct_gather/linux
JobAcctGatherType=jobacct_gather/cgroup
ProctrackType=proctrack/cgroup
# ProctrackType=proctrack/cgroup
JobAcctGatherFrequency=30
#
@@ -97,7 +97,7 @@ PartitionName=debug Nodes=c[1-2] Default=YES MaxTime=INFINITE State=UP
# # COMPUTE NODES
# NodeName=c[1-2] RealMemory=1000 State=UNKNOWN
NodeName=c[1-2] CPUs=12 Boards=1 SocketsPerBoard=2 CoresPerSocket=3 ThreadsPerCore=2
NodeName=node01 CPUs=2 Boards=1 SocketsPerBoard=2 CoresPerSocket=1 ThreadsPerCore=1
# #
# # PARTITIONS