Introduce slurm cluster

2023-06-23 08:38:15 +02:00
parent d6517a2797
commit fa2287c661
17 changed files with 1528 additions and 17 deletions


@@ -0,0 +1,47 @@
FROM scidas/slurm.base:19.05.1
MAINTAINER Michael J. Stealey <stealey@renci.org>
# install openmpi 3.0.1
RUN yum -y install \
gcc-c++ \
gcc-gfortran \
&& yum -y localinstall \
/packages/openmpi-*.rpm
# install Lmod 7.7
RUN yum -y install \
lua-posix \
lua \
lua-filesystem \
lua-devel \
wget \
bzip2 \
expectk \
make \
&& wget https://sourceforge.net/projects/lmod/files/Lmod-7.7.tar.bz2 \
&& tar -xjvf Lmod-7.7.tar.bz2
WORKDIR /Lmod-7.7
RUN ./configure --prefix=/opt/apps \
&& make install \
&& ln -s /opt/apps/lmod/lmod/init/profile /etc/profile.d/z00_lmod.sh \
&& ln -s /opt/apps/lmod/lmod/init/cshrc /etc/profile.d/z00_lmod.csh
WORKDIR /
ENV USE_SLURMDBD=true \
CLUSTER_NAME=snowflake \
CONTROL_MACHINE=controller \
SLURMCTLD_PORT=6817 \
SLURMD_PORT=6818 \
ACCOUNTING_STORAGE_HOST=database \
ACCOUNTING_STORAGE_PORT=6819 \
PARTITION_NAME=docker
# clean up
RUN rm -f /packages/slurm-*.rpm /packages/openmpi-*.rpm \
&& yum clean all \
&& rm -rf /var/cache/yum \
&& rm -f /Lmod-7.7.tar.bz2
COPY docker-entrypoint.sh /docker-entrypoint.sh
ENTRYPOINT ["/usr/local/bin/tini", "--", "/docker-entrypoint.sh"]
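The ENV block above only sets defaults; any of these values can be overridden when the container is started. A small illustration (the alternate values are hypothetical, and the volume and network flags needed for a full cluster are omitted here):

docker run -d --name controller \
  -e CLUSTER_NAME=mycluster \
  -e PARTITION_NAME=batch \
  scidas/slurm.controller:19.05.1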

slurm/controller/Makefile

@@ -0,0 +1,14 @@
SLURM_VERSION = 19.05.1
IMAGE = scidas/slurm.controller
.PHONY: all build clean test
all: build
build:
	docker build -t $(IMAGE):$(SLURM_VERSION) .
clean:
	@[ -z "$$(docker images -q $(IMAGE):$(SLURM_VERSION))" ] || docker rmi $(IMAGE):$(SLURM_VERSION)
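Typical usage: make (or make build) produces scidas/slurm.controller:19.05.1 from the Dockerfile above, while make clean removes that image only if it exists locally, for example:

make build
docker images scidas/slurm.controller   # verify the 19.05.1 tag is present
make clean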


@@ -0,0 +1,3 @@
# Slurm Controller
TODO
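A minimal sketch of how the controller is meant to be wired up, inferred from the ENV defaults in the Dockerfile and from the entrypoint script; the network name, the secret volume name, and the database/worker image tags are assumptions, not part of this commit:

docker network create slurm
docker volume create secret
# start slurmdbd first: the controller blocks until /.secret/slurmdbd.conf appears
docker run -d --name database --hostname database --network slurm -v secret:/.secret scidas/slurm.database:19.05.1     # assumed image name
docker run -d --name controller --hostname controller --network slurm -v secret:/.secret scidas/slurm.controller:19.05.1
docker run -d --name worker01 --hostname worker01 --network slurm -v secret:/.secret scidas/slurm.worker:19.05.1       # assumed image name
docker run -d --name worker02 --hostname worker02 --network slurm -v secret:/.secret scidas/slurm.worker:19.05.1       # assumed image name

The host names match CONTROL_MACHINE, ACCOUNTING_STORAGE_HOST, and the NodeName=worker[01-02] line in the generated slurm.conf. A hand-written slurm.conf can also be mounted at /home/config/slurm.conf, in which case the entrypoint uses it instead of generating one.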


@@ -0,0 +1,200 @@
#!/usr/bin/env bash
set -e
# start sshd server
_sshd_host() {
if [ ! -d /var/run/sshd ]; then
mkdir /var/run/sshd
ssh-keygen -t rsa -f /etc/ssh/ssh_host_rsa_key -N ''
fi
/usr/sbin/sshd
}
# setup worker ssh to be passwordless
_ssh_worker() {
if [[ ! -d /home/worker ]]; then
mkdir -p /home/worker
chown -R worker:worker /home/worker
fi
cat > /home/worker/setup-worker-ssh.sh <<'EOF2'
mkdir -p ~/.ssh
chmod 0700 ~/.ssh
ssh-keygen -b 2048 -t rsa -f ~/.ssh/id_rsa -q -N "" -C "$(whoami)@$(hostname)-$(date -I)"
cat ~/.ssh/id_rsa.pub > ~/.ssh/authorized_keys
chmod 0640 ~/.ssh/authorized_keys
cat >> ~/.ssh/config <<EOF
Host *
StrictHostKeyChecking no
UserKnownHostsFile /dev/null
LogLevel QUIET
EOF
chmod 0644 ~/.ssh/config
cd ~/
tar -czvf ~/worker-secret.tar.gz .ssh
cd -
EOF2
chmod +x /home/worker/setup-worker-ssh.sh
chown worker: /home/worker/setup-worker-ssh.sh
sudo -u worker /home/worker/setup-worker-ssh.sh
}
# start munge and generate key
_munge_start() {
chown -R munge: /etc/munge /var/lib/munge /var/log/munge /var/run/munge
chmod 0700 /etc/munge
chmod 0711 /var/lib/munge
chmod 0700 /var/log/munge
chmod 0755 /var/run/munge
/sbin/create-munge-key -f
sudo -u munge /sbin/munged
munge -n
munge -n | unmunge
remunge
}
# copy secrets to /.secret directory for other nodes
_copy_secrets() {
cp /home/worker/worker-secret.tar.gz /.secret/worker-secret.tar.gz
cp /home/worker/setup-worker-ssh.sh /.secret/setup-worker-ssh.sh
cp /etc/munge/munge.key /.secret/munge.key
rm -f /home/worker/worker-secret.tar.gz
rm -f /home/worker/setup-worker-ssh.sh
}
# generate slurm.conf
_generate_slurm_conf() {
cat > /etc/slurm/slurm.conf <<EOF
#
# Example slurm.conf file. Please run configurator.html
# (in doc/html) to build a configuration file customized
# for your environment.
#
#
# slurm.conf file generated by configurator.html.
#
# See the slurm.conf man page for more information.
#
ClusterName=$CLUSTER_NAME
SlurmctldHost=$CONTROL_MACHINE
#SlurmctldHost=
#
SlurmUser=slurm
#SlurmdUser=root
SlurmctldPort=$SLURMCTLD_PORT
SlurmdPort=$SLURMD_PORT
AuthType=auth/munge
#JobCredentialPrivateKey=
#JobCredentialPublicCertificate=
StateSaveLocation=/var/spool/slurm/ctld
SlurmdSpoolDir=/var/spool/slurm/d
SwitchType=switch/none
MpiDefault=none
SlurmctldPidFile=/var/run/slurmctld.pid
SlurmdPidFile=/var/run/slurmd.pid
ProctrackType=proctrack/pgid
#PluginDir=
#FirstJobId=
ReturnToService=0
#MaxJobCount=
#PlugStackConfig=
#PropagatePrioProcess=
#PropagateResourceLimits=
#PropagateResourceLimitsExcept=
#Prolog=
#Epilog=
#SrunProlog=
#SrunEpilog=
#TaskProlog=
#TaskEpilog=
#TaskPlugin=
#TrackWCKey=no
#TreeWidth=50
#TmpFS=
#UsePAM=
#
# TIMERS
SlurmctldTimeout=300
SlurmdTimeout=300
InactiveLimit=0
MinJobAge=300
KillWait=30
Waittime=0
#
# SCHEDULING
SchedulerType=sched/backfill
#SchedulerAuth=
#SelectType=select/linear
FastSchedule=1
#PriorityType=priority/multifactor
#PriorityDecayHalfLife=14-0
#PriorityUsageResetPeriod=14-0
#PriorityWeightFairshare=100000
#PriorityWeightAge=1000
#PriorityWeightPartition=10000
#PriorityWeightJobSize=1000
#PriorityMaxAge=1-0
#
# LOGGING
SlurmctldDebug=3
SlurmctldLogFile=/var/log/slurmctld.log
SlurmdDebug=3
SlurmdLogFile=/var/log/slurmd.log
JobCompType=jobcomp/none
#JobCompLoc=
#
# ACCOUNTING
JobAcctGatherType=jobacct_gather/linux
#JobAcctGatherFrequency=30
#
AccountingStorageType=accounting_storage/slurmdbd
AccountingStorageHost=$ACCOUNTING_STORAGE_HOST
AccountingStoragePort=$ACCOUNTING_STORAGE_PORT
#AccountingStorageLoc=
#AccountingStoragePass=
#AccountingStorageUser=
#
# COMPUTE NODES
NodeName=worker[01-02] RealMemory=1800 CPUs=1 State=UNKNOWN
PartitionName=$PARTITION_NAME Nodes=ALL Default=YES MaxTime=INFINITE State=UP
EOF
}
# run slurmctld
_slurmctld() {
if $USE_SLURMDBD; then
echo -n "cheking for slurmdbd.conf"
while [ ! -f /.secret/slurmdbd.conf ]; do
echo -n "."
sleep 1
done
echo ""
fi
mkdir -p /var/spool/slurm/ctld \
/var/spool/slurm/d \
/var/log/slurm
chown -R slurm: /var/spool/slurm/ctld \
/var/spool/slurm/d \
/var/log/slurm
touch /var/log/slurmctld.log
chown slurm: /var/log/slurmctld.log
if [[ ! -f /home/config/slurm.conf ]]; then
echo "### generate slurm.conf ###"
_generate_slurm_conf
else
echo "### use provided slurm.conf ###"
cp /home/config/slurm.conf /etc/slurm/slurm.conf
fi
sacctmgr -i add cluster "${CLUSTER_NAME}"
sleep 2s
/usr/sbin/slurmctld
cp -f /etc/slurm/slurm.conf /.secret/
}
### main ###
_sshd_host
_ssh_worker
_munge_start
_copy_secrets
_slurmctld
tail -f /dev/null
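
Once the database, controller, and worker containers are up, a quick sanity check of the cluster could look like the following; controller is the assumed container name (CONTROL_MACHINE), and the partition, node, and cluster names come from the slurm.conf generated above:

docker exec controller sinfo                    # the "docker" partition should list worker[01-02]
docker exec controller srun -N 2 hostname       # run a trivial two-node job
docker exec controller sacctmgr show cluster    # the "snowflake" cluster should be registered in slurmdbd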