Mirror of https://github.com/ClusterCockpit/cc-docker.git (synced 2025-03-15 11:25:56 +01:00)
Stable services

commit f7558779da
parent c646309a25
@@ -68,8 +68,6 @@ services:
     hostname: slurmctld
     build:
       context: ./slurm/controller
-    depends_on:
-      - slurmdbd
     privileged: true
     ports:
       - "6817:6817"
@@ -77,6 +75,8 @@ services:
       - ${DATADIR}/slurm/home:/home
       - ${DATADIR}/slurm/secret:/.secret
       - ./slurm/controller/slurm.conf:/home/config/slurm.conf
+      - /etc/timezone:/etc/timezone:ro
+      - /etc/localtime:/etc/localtime:ro

   slurmdbd:
     container_name: slurmdbd
@@ -85,6 +85,7 @@ services:
       context: ./slurm/database
     depends_on:
       - mariadb
+      - slurmctld
     privileged: true
     ports:
       - "6819:6819"
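Note on the reordered dependencies: slurmctld no longer waits for slurmdbd at the compose level; instead slurmdbd starts after mariadb and slurmctld, and the controller entrypoint (see the nc loop further down) polls port 6819 until slurmdbd actually accepts connections. A minimal sketch for checking the resulting start-up behaviour, assuming the service names used in this compose file:

    # Bring the database, controller and dbd up, then watch the controller
    # wait for slurmdbd on port 6819 before it registers the cluster.
    docker compose up -d mariadb slurmctld slurmdbd
    docker compose ps
    docker compose logs -f slurmctld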
@@ -92,6 +93,8 @@ services:
       - ${DATADIR}/slurm/home:/home
       - ${DATADIR}/slurm/secret:/.secret
       - ./slurm/database/slurmdbd.conf:/home/config/slurmdbd.conf
+      - /etc/timezone:/etc/timezone:ro
+      - /etc/localtime:/etc/localtime:ro

   node01:
     container_name: node01
@@ -106,17 +109,7 @@ services:
       - ${DATADIR}/slurm/secret:/.secret
       - ./slurm/worker/cgroup.conf:/home/config/cgroup.conf
       - ./slurm/controller/slurm.conf:/home/config/slurm.conf
+      - /etc/timezone:/etc/timezone:ro
+      - /etc/localtime:/etc/localtime:ro
     ports:
       - "6818:6818"
-
-  # slurm-worker02:
-  #   container_name: node02
-  #   hostname: node02
-  #   build:
-  #     context: ./slurm/worker
-  #   depends_on:
-  #     - slurm-controller
-  #   privileged: true
-  #   volumes:
-  #     - ${DATADIR}/slurm/home:/home
-  #     - ${DATADIR}/slurm/secret:/.secret
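The new read-only bind mounts of /etc/timezone and /etc/localtime keep the containers on the host's timezone, so job and accounting timestamps line up across slurmctld, slurmdbd and the worker. A quick sanity check, assuming the container names from this compose file and a host that actually has /etc/timezone (Debian/Ubuntu style):

    # Host and container should report the same timezone and wall-clock time.
    cat /etc/timezone
    docker exec slurmctld cat /etc/timezone
    docker exec slurmctld date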
@@ -4,6 +4,18 @@ set -e
 # Determine the system architecture dynamically
 ARCH=$(uname -m)

+_delete_secrets() {
+    if [ -f /.secret/munge.key ]; then
+        echo "Removing secrets"
+        sudo rm -rf /.secret/munge.key
+        sudo rm -rf /.secret/worker-secret.tar.gz
+        sudo rm -rf /.secret/setup-worker-ssh.sh
+
+        echo "Done removing secrets"
+        ls /.secret/
+    fi
+}
+
 # start sshd server
 _sshd_host() {
     if [ ! -d /var/run/sshd ]; then
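_delete_secrets clears stale key material from the shared /.secret volume before new secrets are generated, so a restarted stack cannot hand workers a munge.key left over from a previous run. A hedged sketch for verifying that behaviour; the ${DATADIR} value comes from this repo's environment file and is an assumption here:

    # After a controller restart the secret files should be freshly recreated,
    # not carried over from the previous run.
    docker compose restart slurmctld
    ls -l "${DATADIR}/slurm/secret/"
    # Expect new timestamps on munge.key, worker-secret.tar.gz and setup-worker-ssh.sh.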
@@ -90,9 +102,17 @@ _slurmctld() {
     done
     echo ""
     mkdir -p /var/spool/slurm/ctld /var/spool/slurm/d /var/log/slurm /etc/slurm
-    chown -R slurm: /var/spool/slurm/ctld /var/spool/slurm/d /var/log/slurm
+    chown -R slurm: /var/spool/slurm/ctld /var/spool/slurm/d /var/log/slurm /var/spool /var/lib
     touch /var/log/slurmctld.log
     chown slurm: /var/log/slurmctld.log
+    touch /var/log/slurmd.log
+    chown slurm: /var/log/slurmd.log
+
+    # touch /var/run/slurm/d/slurmctld.pid
+    # chown slurm: /var/run/slurm/d/slurmctld.pid
+    # touch /var/run/slurm/d/slurmd.pid
+    # chown slurm:/var/run/slurm/d/slurmd.pid
+
     if [[ ! -f /home/config/slurm.conf ]]; then
         echo "### Missing slurm.conf ###"
         exit
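The commented-out pid-file handling matches the SlurmctldPidFile/SlurmdPidFile paths set in slurm.conf below (/var/run/slurm/d/...). If those lines are ever re-enabled, the directory has to exist and be writable by the slurm user first; a minimal sketch under that assumption:

    # Create the pid-file directory that slurm.conf points at; the path is the
    # one used by SlurmctldPidFile/SlurmdPidFile in this commit.
    mkdir -p /var/run/slurm/d
    chown slurm: /var/run/slurm/d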
@@ -103,15 +123,25 @@ _slurmctld() {
         chmod 600 /etc/slurm/slurm.conf
     fi

-    sacctmgr -i add cluster "snowflake"
+    sudo yum install -y nc
+    sudo yum install -y procps
+    sudo yum install -y iputils
+
+    while ! nc -z slurmdbd 6819; do
+        echo "Waiting for slurmdbd to be ready..."
+        sleep 2
+    done
+
+    sacctmgr -i add cluster name=linux
     sleep 2s
     echo "Starting slurmctld"
     cp -f /etc/slurm/slurm.conf /.secret/
-    /usr/sbin/slurmctld
+    /usr/sbin/slurmctld -Dvv
     echo "Started slurmctld"
 }

 ### main ###
+_delete_secrets
 _sshd_host
 _ssh_worker
 _munge_start
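The nc loop is a plain TCP readiness probe: sacctmgr can only register the cluster once slurmdbd is listening on 6819, so the controller blocks until the port opens. A sketch of the same pattern with an upper bound, in case slurmdbd never comes up; the retry limit is an assumption, not part of this commit:

    # Bounded variant of the wait loop: give up after roughly two minutes.
    attempts=0
    until nc -z slurmdbd 6819; do
        attempts=$((attempts + 1))
        if [ "$attempts" -ge 60 ]; then
            echo "slurmdbd did not become reachable on port 6819" >&2
            exit 1
        fi
        echo "Waiting for slurmdbd to be ready..."
        sleep 2
    done
    sacctmgr -i add cluster name=linux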
@@ -15,13 +15,13 @@ SlurmdPort=6818
 AuthType=auth/munge
 #JobCredentialPrivateKey=
 #JobCredentialPublicCertificate=
-StateSaveLocation=/var/lib/slurmd
-SlurmdSpoolDir=/var/spool/slurmd
+StateSaveLocation=/var/lib/slurm/d
+SlurmdSpoolDir=/var/spool/slurm/d
 SwitchType=switch/none
 MpiDefault=none
-SlurmctldPidFile=/var/run/slurmd/slurmctld.pid
-SlurmdPidFile=/var/run/slurmd/slurmd.pid
-# ProctrackType=proctrack/linuxproc
+SlurmctldPidFile=/var/run/slurm/d/slurmctld.pid
+SlurmdPidFile=/var/run/slurm/d/slurmd.pid
+ProctrackType=proctrack/linuxproc
 #PluginDir=
 #CacheGroups=0
 #FirstJobId=
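The path changes keep slurm.conf consistent with the directories the entrypoints now create (/var/spool/slurm/d) and move the pid files under /var/run/slurm/d, while process tracking switches to proctrack/linuxproc. A quick consistency check against the copy that the compose file bind-mounts (the path below is the one used in the volumes section above):

    # The four path settings should all point below /var/spool/slurm,
    # /var/lib/slurm and /var/run/slurm, which should line up with the
    # mkdir/chown calls in the entrypoints.
    grep -E 'StateSaveLocation|SlurmdSpoolDir|SlurmctldPidFile|SlurmdPidFile' \
        slurm/controller/slurm.conf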
@@ -79,7 +79,7 @@ JobCompLoc=/var/log/slurm/jobcomp.log
 # ACCOUNTING
 #JobAcctGatherType=jobacct_gather/linux
 JobAcctGatherType=jobacct_gather/cgroup
-ProctrackType=proctrack/cgroup
+# ProctrackType=proctrack/cgroup

 JobAcctGatherFrequency=30
 #
@@ -97,7 +97,7 @@ PartitionName=debug Nodes=c[1-2] Default=YES MaxTime=INFINITE State=UP

 # # COMPUTE NODES
 # NodeName=c[1-2] RealMemory=1000 State=UNKNOWN
-NodeName=c[1-2] CPUs=12 Boards=1 SocketsPerBoard=2 CoresPerSocket=3 ThreadsPerCore=2
+NodeName=node01 CPUs=2 Boards=1 SocketsPerBoard=2 CoresPerSocket=1 ThreadsPerCore=1

 # #
 # # PARTITIONS
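The node definition now describes the single node01 container (2 CPUs as 2 sockets x 1 core x 1 thread) instead of the earlier c[1-2] layout; note that the partition line shown in the hunk header still lists Nodes=c[1-2]. If the node does not show up in sinfo, comparing the configured values with what slurmd detects inside the container is the usual check, for example:

    # slurmd -C prints the hardware as slurmd sees it; compare against the
    # NodeName line above. Container names come from the compose file.
    docker exec node01 slurmd -C
    docker exec slurmctld sinfo -Nel
    docker exec slurmctld scontrol show node node01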
@@ -15,6 +15,10 @@ _sshd_host() {

 # start munge using existing key
 _munge_start_using_key() {
+    sudo yum install -y nc
+    sudo yum install -y procps
+    sudo yum install -y iputils
+
     echo -n "cheking for munge.key"
     while [ ! -f /.secret/munge.key ]; do
         echo -n "."
@@ -35,50 +39,58 @@ _munge_start_using_key() {

 # wait for worker user in shared /home volume
 _wait_for_worker() {
+    echo "checking for id_rsa.pub"
     if [ ! -f /home/worker/.ssh/id_rsa.pub ]; then
-        echo -n "checking for id_rsa.pub"
+        echo "checking for id_rsa.pub"
         while [ ! -f /home/worker/.ssh/id_rsa.pub ]; do
             echo -n "."
             sleep 1
         done
         echo ""
     fi
+    echo "done checking for id_rsa.pub"
+
 }

 _start_dbus() {
-    dbus-uuidgen > /var/lib/dbus/machine-id
+    dbus-uuidgen >/var/lib/dbus/machine-id
     mkdir -p /var/run/dbus
     dbus-daemon --config-file=/usr/share/dbus-1/system.conf --print-address
 }

 # run slurmd
 _slurmd() {
     cd /root/rpmbuild/RPMS/$ARCH
     yum -y --nogpgcheck localinstall slurm-22.05.6-1.el8.$ARCH.rpm \
         slurm-perlapi-22.05.6-1.el8.$ARCH.rpm \
         slurm-slurmd-22.05.6-1.el8.$ARCH.rpm \
         slurm-torque-22.05.6-1.el8.$ARCH.rpm
-    if [ ! -f /.secret/slurm.conf ]; then
-        echo -n "checking for slurm.conf"
-        while [ ! -f /.secret/slurm.conf ]; do
-            echo -n "."
-            sleep 1
-        done
-        echo ""
-    fi
-    mkdir -p /var/spool/slurm/d /etc/slurm
-    chown slurm: /var/spool/slurm/d
-    cp /home/config/cgroup.conf /etc/slurm/cgroup.conf
-    chown slurm: /etc/slurm/cgroup.conf
-    chmod 600 /etc/slurm/cgroup.conf
-    cp /home/config/slurm.conf /etc/slurm/slurm.conf
-    chown slurm: /etc/slurm/slurm.conf
-    chmod 600 /etc/slurm/slurm.conf
-    touch /var/log/slurmd.log
-    chown slurm: /var/log/slurmd.log
-    echo -n "Starting slurmd"
-    /usr/sbin/slurmd
-    echo -n "Started slurmd"
+
+    echo "checking for slurm.conf"
+    if [ ! -f /.secret/slurm.conf ]; then
+        echo "checking for slurm.conf"
+        while [ ! -f /.secret/slurm.conf ]; do
+            echo -n "."
+            sleep 1
+        done
+        echo ""
+    fi
+    echo "found slurm.conf"
+
+    mkdir -p /var/spool/slurm/d /etc/slurm
+    chown slurm: /var/spool/slurm/d
+    cp /home/config/cgroup.conf /etc/slurm/cgroup.conf
+    chown slurm: /etc/slurm/cgroup.conf
+    chmod 600 /etc/slurm/cgroup.conf
+    cp /home/config/slurm.conf /etc/slurm/slurm.conf
+    chown slurm: /etc/slurm/slurm.conf
+    chmod 600 /etc/slurm/slurm.conf
+    touch /var/log/slurmd.log
+    chown slurm: /var/log/slurmd.log
+
+    echo "Starting slurmd"
+    /usr/sbin/slurmd -Dvv
+    echo "Started slurmd"
 }

 ### main ###
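Both daemons now run in the foreground with -Dvv, which keeps the containers alive and makes their verbose logs available through docker logs. A short sketch for confirming that the worker registered with the controller once the stack is up, using the container names from the compose file:

    # Check daemon logs and the controller's view of the cluster.
    docker compose logs --tail=50 slurmctld node01
    docker exec slurmctld scontrol ping
    docker exec slurmctld sinfo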
|
Loading…
x
Reference in New Issue
Block a user