# Patch overview (review commentary only; all text from the first
# "diff --git" header onward is unchanged).
#
# docker-compose.yml
#   - Deletes the commented-out mysql service.
#   - Renames services: slurm-controller -> slurmctld, slurm-database ->
#     slurmdbd, slurm-worker01 -> node01, and drops the controller's
#     "image: clustercockpit:22.05.6" line.
#   - Inverts the dependency: slurmctld now depends_on slurmdbd, and
#     slurmdbd no longer depends_on the controller (mariadb is kept).
#   - Publishes 6817 on slurmctld, 6819 on slurmdbd and 6818 on node01.
#   - Bind-mounts ./slurm/controller/slurm.conf and
#     ./slurm/database/slurmdbd.conf under /home/config/.
# slurm/controller/docker-entrypoint.sh
#   - Most -/+ pairs carry the same tokens, so they appear to be
#     whitespace-only changes (indentation is not recoverable here).
#   - _slurmctld now runs `sacctmgr -i add cluster "snowflake"` before
#     launching /usr/sbin/slurmctld and echoes "Started slurmctld" after.
#   - Adds the previously missing newline at end of file.
# slurm/controller/slurm.conf and slurm/database/slurmdbd.conf
#   - New files (hunk headers: +1,106 and +1,37).
# slurm/database/docker-entrypoint.sh, slurm/worker/docker-entrypoint.sh
#   - Echo "Started slurmdbd" / "Started slurmd" after starting the daemon.
#
# NOTE(review): slurm.conf sets ClusterName=linux, but the entrypoint
#   registers cluster "snowflake" via sacctmgr - confirm which is intended.
# NOTE(review): slurm.conf assigns ProctrackType twice
#   (proctrack/linuxproc, later proctrack/cgroup) - keep only one.
# NOTE(review): slurm.conf declares NodeName=c[1-2] and both partitions
#   use Nodes=c[1-2], while the compose worker's hostname is node01 -
#   confirm the worker can register under these names.
# NOTE(review): SlurmctldLogFile is /var/log/slurm/slurmctld.log, but
#   _slurmctld touches and chowns /var/log/slurmctld.log - confirm.
# NOTE(review): StateSaveLocation=/var/lib/slurmd is not among the
#   directories the visible _slurmctld lines create - TODO confirm it
#   exists in the image.
# NOTE(review): slurmdbd.conf commits StoragePass=demo in plain text;
#   presumably demo-only credentials - confirm before reuse.
# NOTE(review): the worker's new `echo -n "Started slurmd"` follows
#   `echo -n "Starting slurmd"`, so neither message ends with a newline.
# NOTE(review): the "-6,21 +6,21" hunk looks truncated around the
#   `cat > /home/worker/setup-worker-ssh.sh` heredoc in this copy;
#   presumably lost in extraction - re-export before applying.
diff --git a/docker-compose.yml b/docker-compose.yml index 2947d8d..7620c04 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -63,58 +63,49 @@ services: cap_add: - SYS_NICE - # mysql: - # container_name: mysql - # image: mysql:8.0.22 - # command: ["--default-authentication-plugin=mysql_native_password"] - # environment: - # MYSQL_ROOT_PASSWORD: ${MYSQL_ROOT_PASSWORD} - # MYSQL_DATABASE: ${MYSQL_DATABASE} - # MYSQL_USER: ${MYSQL_USER} - # MYSQL_PASSWORD: ${MYSQL_PASSWORD} - # ports: - # - "127.0.0.1:${MYSQL_PORT}:3306" - # # volumes: - # # - ${DATADIR}/sql-init:/docker-entrypoint-initdb.d - # # - ${DATADIR}/sqldata:/var/lib/mysql - # cap_add: - # - SYS_NICE - - slurm-controller: - image: clustercockpit:22.05.6 + slurmctld: container_name: slurmctld hostname: slurmctld build: context: ./slurm/controller + depends_on: + - slurmdbd privileged: true + ports: + - "6817:6817" volumes: - ${DATADIR}/slurm/home:/home - ${DATADIR}/slurm/secret:/.secret + - ./slurm/controller/slurm.conf:/home/config/slurm.conf - slurm-database: - container_name: slurmdb - hostname: slurmdb + slurmdbd: + container_name: slurmdbd + hostname: slurmdbd build: context: ./slurm/database depends_on: - mariadb - - slurm-controller privileged: true + ports: + - "6819:6819" volumes: - ${DATADIR}/slurm/home:/home - ${DATADIR}/slurm/secret:/.secret + - ./slurm/database/slurmdbd.conf:/home/config/slurmdbd.conf - slurm-worker01: + node01: container_name: node01 hostname: node01 build: context: ./slurm/worker depends_on: - - slurm-controller + - slurmctld privileged: true volumes: - ${DATADIR}/slurm/home:/home - ${DATADIR}/slurm/secret:/.secret + ports: + - "6818:6818" # slurm-worker02: # container_name: node02 diff --git a/slurm/controller/docker-entrypoint.sh b/slurm/controller/docker-entrypoint.sh index 1d59ee9..ce398a7 100755 --- a/slurm/controller/docker-entrypoint.sh +++ b/slurm/controller/docker-entrypoint.sh @@ -6,21 +6,21 @@ ARCH=$(uname -m) # start sshd server _sshd_host() { - if [ ! 
-d /var/run/sshd ]; then - mkdir /var/run/sshd - ssh-keygen -t rsa -f /etc/ssh/ssh_host_rsa_key -N '' - fi - echo "Starting sshd" - /usr/sbin/sshd + if [ ! -d /var/run/sshd ]; then + mkdir /var/run/sshd + ssh-keygen -t rsa -f /etc/ssh/ssh_host_rsa_key -N '' + fi + echo "Starting sshd" + /usr/sbin/sshd } # setup worker ssh to be passwordless _ssh_worker() { - if [[ ! -d /home/worker ]]; then + if [[ ! -d /home/worker ]]; then mkdir -p /home/worker chown -R worker:worker /home/worker fi - cat > /home/worker/setup-worker-ssh.sh </home/worker/setup-worker-ssh.sh < /etc/munge/munge.key" + sh -c "dd if=/dev/urandom bs=1 count=1024 > /etc/munge/munge.key" chown munge: /etc/munge/munge.key chmod 400 /etc/munge/munge.key sudo -u munge /sbin/munged @@ -64,11 +64,11 @@ _munge_start() { # copy secrets to /.secret directory for other nodes _copy_secrets() { - cp /home/worker/worker-secret.tar.gz /.secret/worker-secret.tar.gz - cp /home/worker/setup-worker-ssh.sh /.secret/setup-worker-ssh.sh - cp /etc/munge/munge.key /.secret/munge.key - rm -f /home/worker/worker-secret.tar.gz - rm -f /home/worker/setup-worker-ssh.sh + cp /home/worker/worker-secret.tar.gz /.secret/worker-secret.tar.gz + cp /home/worker/setup-worker-ssh.sh /.secret/setup-worker-ssh.sh + cp /etc/munge/munge.key /.secret/munge.key + rm -f /home/worker/worker-secret.tar.gz + rm -f /home/worker/setup-worker-ssh.sh } # run slurmctld @@ -85,8 +85,8 @@ _slurmctld() { sleep 1 done echo "" - mkdir -p /var/spool/slurm/ctld /var/spool/slurm/d /var/log/slurm /etc/slurm - chown -R slurm: /var/spool/slurm/ctld /var/spool/slurm/d /var/log/slurm + mkdir -p /var/spool/slurm/ctld /var/spool/slurm/d /var/log/slurm /etc/slurm + chown -R slurm: /var/spool/slurm/ctld /var/spool/slurm/d /var/log/slurm touch /var/log/slurmctld.log chown slurm: /var/log/slurmctld.log if [[ ! 
-f /home/config/slurm.conf ]]; then @@ -98,11 +98,13 @@ _slurmctld() { chown slurm: /etc/slurm/slurm.conf chmod 600 /etc/slurm/slurm.conf fi + sacctmgr -i add cluster "snowflake" sleep 2s - echo "Starting slurmctld" + echo "Starting slurmctld" cp -f /etc/slurm/slurm.conf /.secret/ /usr/sbin/slurmctld + echo "Started slurmctld" } ### main ### @@ -112,4 +114,4 @@ _munge_start _copy_secrets _slurmctld -tail -f /dev/null \ No newline at end of file +tail -f /dev/null diff --git a/slurm/controller/slurm.conf b/slurm/controller/slurm.conf new file mode 100644 index 0000000..63f48f8 --- /dev/null +++ b/slurm/controller/slurm.conf @@ -0,0 +1,106 @@ +# slurm.conf +# +# See the slurm.conf man page for more information. +# +ClusterName=linux +ControlMachine=slurmctld +ControlAddr=slurmctld +#BackupController= +#BackupAddr= +# +SlurmUser=slurm +#SlurmdUser=root +SlurmctldPort=6817 +SlurmdPort=6818 +AuthType=auth/munge +#JobCredentialPrivateKey= +#JobCredentialPublicCertificate= +StateSaveLocation=/var/lib/slurmd +SlurmdSpoolDir=/var/spool/slurmd +SwitchType=switch/none +MpiDefault=none +SlurmctldPidFile=/var/run/slurmd/slurmctld.pid +SlurmdPidFile=/var/run/slurmd/slurmd.pid +ProctrackType=proctrack/linuxproc +#PluginDir= +#CacheGroups=0 +#FirstJobId= +ReturnToService=0 +#MaxJobCount= +#PlugStackConfig= +#PropagatePrioProcess= +#PropagateResourceLimits= +#PropagateResourceLimitsExcept= +#Prolog= +#Epilog= +#SrunProlog= +#SrunEpilog= +#TaskProlog= +#TaskEpilog= +TaskPlugin=task/none +#TrackWCKey=no +#TreeWidth=50 +#TmpFS= +#UsePAM= +# +# TIMERS +SlurmctldTimeout=300 +SlurmdTimeout=300 +InactiveLimit=0 +MinJobAge=300 +KillWait=30 +Waittime=0 +# +# SCHEDULING +SchedulerType=sched/backfill +#SchedulerAuth= +#SchedulerPort= +#SchedulerRootFilter= +SelectType=select/cons_res +SelectTypeParameters=CR_CPU_Memory +FastSchedule=1 +#PriorityType=priority/multifactor +#PriorityDecayHalfLife=14-0 +#PriorityUsageResetPeriod=14-0 +#PriorityWeightFairshare=100000 +#PriorityWeightAge=1000 
+#PriorityWeightPartition=10000 +#PriorityWeightJobSize=1000 +#PriorityMaxAge=1-0 +# +# LOGGING +SlurmctldDebug=3 +SlurmctldLogFile=/var/log/slurm/slurmctld.log +SlurmdDebug=3 +SlurmdLogFile=/var/log/slurm/slurmd.log +JobCompType=jobcomp/filetxt +JobCompLoc=/var/log/slurm/jobcomp.log +# +# ACCOUNTING +#JobAcctGatherType=jobacct_gather/linux +JobAcctGatherType=jobacct_gather/cgroup +ProctrackType=proctrack/cgroup + +JobAcctGatherFrequency=30 +# +AccountingStorageType=accounting_storage/slurmdbd +AccountingStorageHost=slurmdbd +AccountingStoragePort=6819 +#AccountingStorageLoc=slurm_acct_db +#AccountingStoragePass= +#AccountingStorageUser= +# + +# COMPUTE NODES +PartitionName=DEFAULT Nodes=c[1-2] +PartitionName=debug Nodes=c[1-2] Default=YES MaxTime=INFINITE State=UP + +# # COMPUTE NODES +# NodeName=c[1-2] RealMemory=1000 State=UNKNOWN +NodeName=c[1-2] CPUs=12 Boards=1 SocketsPerBoard=2 CoresPerSocket=3 ThreadsPerCore=2 + +# # +# # PARTITIONS +# PartitionName=normal Default=yes Nodes=c[1-2] Priority=50 DefMemPerCPU=500 Shared=NO MaxNodes=2 MaxTime=5-00:00:00 DefaultTime=5-00:00:00 State=UP + +#PrEpPlugins=pika diff --git a/slurm/database/docker-entrypoint.sh b/slurm/database/docker-entrypoint.sh index 4275384..c314e94 100755 --- a/slurm/database/docker-entrypoint.sh +++ b/slurm/database/docker-entrypoint.sh @@ -69,6 +69,7 @@ _slurmdbd() { echo "Starting slurmdbd" cp /etc/slurm/slurmdbd.conf /.secret/slurmdbd.conf /usr/sbin/slurmdbd + echo "Started slurmdbd" } ### main ### diff --git a/slurm/database/slurmdbd.conf b/slurm/database/slurmdbd.conf new file mode 100644 index 0000000..f6d5a81 --- /dev/null +++ b/slurm/database/slurmdbd.conf @@ -0,0 +1,37 @@ +# +# Example slurmdbd.conf file. +# +# See the slurmdbd.conf man page for more information. 
+# +# Archive info +#ArchiveJobs=yes +#ArchiveDir="/tmp" +#ArchiveSteps=yes +#ArchiveScript= +#JobPurge=12 +#StepPurge=1 +# +# Authentication info +AuthType=auth/munge +#AuthInfo=/var/run/munge/munge.socket.2 +# +# slurmDBD info +DbdAddr=slurmdbd +DbdHost=slurmdbd +DbdPort=6819 +SlurmUser=slurm +#MessageTimeout=300 +DebugLevel=4 +#DefaultQOS=normal,standby +LogFile=/var/log/slurm/slurmdbd.log +PidFile=/var/run/slurmdbd/slurmdbd.pid +#PluginDir=/usr/lib/slurm +#PrivateData=accounts,users,usage,jobs +#TrackWCKey=yes +# +# Database info +StorageType=accounting_storage/mysql +StorageHost=mariadb +StorageUser=slurm +StoragePass=demo +StorageLoc=slurm_acct_db diff --git a/slurm/worker/docker-entrypoint.sh b/slurm/worker/docker-entrypoint.sh index ab4f512..299132b 100755 --- a/slurm/worker/docker-entrypoint.sh +++ b/slurm/worker/docker-entrypoint.sh @@ -78,6 +78,7 @@ _slurmd() { chown slurm: /var/log/slurmd.log echo -n "Starting slurmd" /usr/sbin/slurmd + echo -n "Started slurmd" } ### main ###