mirror of
https://github.com/ClusterCockpit/cc-docker.git
synced 2025-04-19 03:15:55 +02:00
Added new Docker Compose setup
This commit is contained in:
parent
cf7ddde758
commit
237c0ecd43
107
Dockerfile
Normal file
107
Dockerfile
Normal file
@ -0,0 +1,107 @@
|
|||||||
|
# Image that builds Slurm from the SLURM_TAG git tag on Rocky Linux 8.
# The same image is started as slurmdbd, slurmctld or slurmd depending on the
# argument handed to docker-entrypoint.sh (default: slurmdbd).
FROM rockylinux:8

LABEL org.opencontainers.image.source="https://github.com/giovtorres/slurm-docker-cluster" \
      org.opencontainers.image.title="slurm-docker-cluster" \
      org.opencontainers.image.description="Slurm Docker cluster on Rocky Linux 8" \
      org.label-schema.docker.cmd="docker-compose up -d" \
      maintainer="Giovanni Torres"

# Build-time knobs: the Slurm git tag to compile and the gosu release to fetch.
ARG SLURM_TAG=slurm-21-08-6-1
ARG GOSU_VERSION=1.11

# Toolchain, MUNGE, MariaDB and the -devel packages Slurm is configured against.
# The package cache is removed in the same layer so it never reaches the image;
# both cache paths are listed because yum is backed by dnf on this base image.
RUN set -ex \
    && yum makecache \
    && yum -y update \
    && yum -y install dnf-plugins-core \
    && yum config-manager --set-enabled powertools \
    && yum -y install \
       wget \
       bzip2 \
       cmake \
       perl \
       gcc \
       gcc-c++ \
       git \
       gnupg \
       make \
       munge \
       munge-devel \
       nano \
       python3-devel \
       python3-pip \
       python3 \
       mariadb-server \
       mariadb-devel \
       psmisc \
       bash-completion \
       vim-enhanced \
       http-parser-devel \
       json-c-devel \
    && yum clean all \
    && rm -rf /var/cache/yum /var/cache/dnf

# Make plain `python` resolve to python3.
RUN alternatives --set python /usr/bin/python3

# --no-cache-dir keeps pip's download cache out of the layer.
RUN pip3 install --no-cache-dir Cython nose

# Fetch gosu, verify its detached signature against the pinned key, and run a
# smoke test (`gosu nobody true`) so a broken binary fails the build.
RUN set -ex \
    && wget -O /usr/local/bin/gosu "https://github.com/tianon/gosu/releases/download/$GOSU_VERSION/gosu-amd64" \
    && wget -O /usr/local/bin/gosu.asc "https://github.com/tianon/gosu/releases/download/$GOSU_VERSION/gosu-amd64.asc" \
    && export GNUPGHOME="$(mktemp -d)" \
    && gpg --batch --keyserver hkps://keys.openpgp.org --recv-keys B42F6819007F00F88E364FD4036A9C25BF357DD4 \
    && gpg --batch --verify /usr/local/bin/gosu.asc /usr/local/bin/gosu \
    && rm -rf "${GNUPGHOME}" /usr/local/bin/gosu.asc \
    && chmod +x /usr/local/bin/gosu \
    && gosu nobody true

WORKDIR /home

# Clone the sources into /home, build and install Slurm, then create the slurm
# user, the runtime directories and the state files, and generate a MUNGE key.
# Plain `cd` replaces pushd/popd: RUN executes under /bin/sh by default and
# those builtins are bash-only. `mkdir -p` tolerates pre-existing directories.
RUN set -x \
    && git clone https://gitlab.hrz.tu-chemnitz.de/pika/pika-packages.git \
    && git clone https://github.com/nats-io/nats.c.git \
    && git clone -b "${SLURM_TAG}" --single-branch --depth=1 https://github.com/SchedMD/slurm.git \
    && cd slurm \
    && ./configure --enable-debug --prefix=/usr --sysconfdir=/etc/slurm \
        --with-mysql_config=/usr/bin --libdir=/usr/lib64 \
    && make install \
    && install -D -m644 etc/cgroup.conf.example /etc/slurm/cgroup.conf.example \
    && install -D -m644 etc/slurm.conf.example /etc/slurm/slurm.conf.example \
    && install -D -m644 etc/slurmdbd.conf.example /etc/slurm/slurmdbd.conf.example \
    && install -D -m644 contribs/slurm_completion_help/slurm_completion.sh /etc/profile.d/slurm_completion.sh \
    && cd .. \
    && cp -r slurm /opt \
    && groupadd -r --gid=990 slurm \
    && useradd -r -g slurm --uid=990 slurm \
    && mkdir -p /etc/sysconfig/slurm \
        /var/spool/slurmd \
        /var/run/slurmd \
        /var/run/slurmdbd \
        /var/lib/slurmd \
        /var/log/slurm \
        /data \
    && touch /var/lib/slurmd/node_state \
        /var/lib/slurmd/front_end_state \
        /var/lib/slurmd/job_state \
        /var/lib/slurmd/resv_state \
        /var/lib/slurmd/trigger_state \
        /var/lib/slurmd/assoc_mgr_state \
        /var/lib/slurmd/assoc_usage \
        /var/lib/slurmd/qos_usage \
        /var/lib/slurmd/fed_mgr_state \
    && chown -R slurm:slurm /var/*/slurm* \
    && /sbin/create-munge-key

# PrEp plugin source and its makefile; they are only copied here, no build
# step for them exists in this Dockerfile.
COPY slurm-prep-pika_v4.c /home/slurm-prep-pika_v4.c
COPY makefile /home/makefile

COPY slurm.conf /etc/slurm/slurm.conf
COPY slurmdbd.conf /etc/slurm/slurmdbd.conf
# slurmdbd.conf holds the database password, so restrict it to the slurm user.
RUN set -x \
    && chown slurm:slurm /etc/slurm/slurmdbd.conf \
    && chmod 600 /etc/slurm/slurmdbd.conf

COPY docker-entrypoint.sh /usr/local/bin/docker-entrypoint.sh
ENTRYPOINT ["/usr/local/bin/docker-entrypoint.sh"]

CMD ["slurmdbd"]
|
2
LICENSE
2
LICENSE
@ -1,6 +1,6 @@
|
|||||||
MIT License
|
MIT License
|
||||||
|
|
||||||
Copyright (c) 2021 ClusterCockpit
|
Copyright (c) 2019 Giovanni Torres
|
||||||
|
|
||||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||||
of this software and associated documentation files (the "Software"), to deal
|
of this software and associated documentation files (the "Software"), to deal
|
||||||
|
158
README.md
158
README.md
@ -1,74 +1,116 @@
|
|||||||
# cc-docker
|
# Slurm Docker Cluster
|
||||||
|
|
||||||
This is a `docker-compose` setup which provides a quickly started environment for ClusterCockpit development and testing, using `cc-backend`.
|
This is a multi-container Slurm cluster using docker-compose. The compose file
|
||||||
A number of services are readily available as Docker containers (nats, cc-metric-store, InfluxDB, LDAP) or can easily be added by manual configuration (MySQL).
|
creates named volumes for persistent storage of MySQL data files as well as
|
||||||
|
Slurm state and log directories.
|
||||||
|
|
||||||
It includes the following containers:
|
## Containers and Volumes
|
||||||
* nats (Default)
|
|
||||||
* cc-metric-store (Default)
|
|
||||||
* influxdb (Default)
|
|
||||||
* openldap (Default)
|
|
||||||
* mysql (Optional)
|
|
||||||
* mariadb (Optional)
|
|
||||||
* phpmyadmin (Optional)
|
|
||||||
|
|
||||||
The setup comes with fixture data for a Job archive, cc-metric-store checkpoints, InfluxDB, MySQL, and a LDAP user directory.
|
The compose file will run the following containers:
|
||||||
|
|
||||||
## Known Issues
|
* mysql
|
||||||
|
* slurmdbd
|
||||||
|
* slurmctld
|
||||||
|
* c1 (slurmd)
|
||||||
|
* c2 (slurmd)
|
||||||
|
|
||||||
* `docker-compose` installed on Ubuntu (18.04, 20.04) via `apt-get` can not correctly parse `docker-compose.yml` due to version differences. Install latest version of `docker-compose` from https://docs.docker.com/compose/install/ instead.
|
The compose file will create the following named volumes:
|
||||||
* You need to ensure that no other web server is running on ports 8080 (cc-backend), 8081 (phpmyadmin), 8084 (cc-metric-store), 8086 (InfluxDB), 4222 and 8222 (Nats), or 3306 (MySQL). If one or more ports are already in use, you have to adapt the related config accordingly.
|
|
||||||
* Existing VPN connections sometimes cause problems with docker. If `docker-compose` does not start up correctly, try disabling any active VPN connection. Refer to https://stackoverflow.com/questions/45692255/how-make-openvpn-work-with-docker for further information.
|
|
||||||
|
|
||||||
## Configuration Templates
|
* etc_munge ( -> /etc/munge )
|
||||||
|
* etc_slurm ( -> /etc/slurm )
|
||||||
|
* slurm_jobdir ( -> /data )
|
||||||
|
* var_lib_mysql ( -> /var/lib/mysql )
|
||||||
|
* var_log_slurm ( -> /var/log/slurm )
|
||||||
|
|
||||||
Located in `./templates`
|
## Building the Docker Image
|
||||||
* `docker-compose.yml.default`: Docker-Compose file to setup cc-metric-store, InfluxDB, MariaDB, PhpMyadmin, and LDAP containers (Default). Used in `setupDev.sh`.
|
|
||||||
* `docker-compose.yml.mysql`: Docker-Compose configuration template if MySQL is desired instead of MariaDB.
|
|
||||||
* `env.default`: Environment variables for setup with cc-metric-store, InfluxDB, MariaDB, PhpMyadmin, and LDAP containers (Default). Used in `setupDev.sh`.
|
|
||||||
* `env.mysql`: Additional environment variables required if MySQL is desired instead of MariaDB.
|
|
||||||
|
|
||||||
## Setup
|
Build the image locally:
|
||||||
|
|
||||||
1. Clone `cc-backend` repository in chosen base folder: `$> git clone https://github.com/ClusterCockpit/cc-backend.git`
|
```console
|
||||||
|
docker build -t slurm-docker-cluster:21.08.6 .
|
||||||
2. Run `$ ./setupDev.sh`: **NOTICE** The script will download files of a total size of 338MB (mostly for the InfluxDB data).
|
|
||||||
|
|
||||||
3. The setup-script launches the supporting container stack in the background automatically if everything went well. Run `$> ./cc-backend/cc-backend` to start `cc-backend`.
|
|
||||||
|
|
||||||
4. By default, you can access `cc-backend` in your browser at `http://localhost:8080`. You can shut down the cc-backend server by pressing `CTRL-C`, remember to also shut down all containers via `$> docker-compose down` afterwards.
|
|
||||||
|
|
||||||
5. You can restart the containers with: `$> docker-compose up -d`.
|
|
||||||
|
|
||||||
## Post-Setup Adjustment for using `influxdb`
|
|
||||||
|
|
||||||
When using `influxdb` as a metric database, one must adjust the following files:
|
|
||||||
* `cc-backend/var/job-archive/emmy/cluster.json`
|
|
||||||
* `cc-backend/var/job-archive/woody/cluster.json`
|
|
||||||
|
|
||||||
In the JSON, exchange the content of the `metricDataRepository`-Entry (By default configured for `cc-metric-store`) with:
|
|
||||||
```
|
```
|
||||||
"metricDataRepository": {
|
|
||||||
"kind": "influxdb",
|
Build a different version of Slurm using Docker build args and the Slurm Git
|
||||||
"url": "http://localhost:8086",
|
tag:
|
||||||
"token": "egLfcf7fx0FESqFYU3RpAAbj",
|
|
||||||
"bucket": "ClusterCockpit",
|
```console
|
||||||
"org": "ClusterCockpit",
|
docker build --build-arg SLURM_TAG="slurm-19-05-2-1" -t slurm-docker-cluster:19.05.2 .
|
||||||
"skiptls": false
|
```
|
||||||
}
|
|
||||||
|
Or equivalently using `docker-compose`:
|
||||||
|
|
||||||
|
```console
|
||||||
|
SLURM_TAG=slurm-19-05-2-1 IMAGE_TAG=19.05.2 docker-compose build
|
||||||
```
|
```
|
||||||
|
|
||||||
|
|
||||||
## Usage
|
## Starting the Cluster
|
||||||
|
|
||||||
Credentials for the preconfigured demo user are:
|
Run `docker-compose` to instantiate the cluster:
|
||||||
* User: `demo`
|
|
||||||
* Password: `AdminDev`
|
|
||||||
|
|
||||||
You can also login as regular user using any credential in the LDAP user directory at `./data/ldap/users.ldif`.
|
```console
|
||||||
|
IMAGE_TAG=19.05.2 docker-compose up -d
|
||||||
|
```
|
||||||
|
|
||||||
TODO: Update job archive and all other metric data.
|
## Register the Cluster with SlurmDBD
|
||||||
The job archive with 1867 jobs originates from the second half of 2020.
|
|
||||||
Roughly 2700 jobs from the first week of 2021 are loaded with data from InfluxDB.
|
To register the cluster to the slurmdbd daemon, run the `register_cluster.sh`
|
||||||
Some views of ClusterCockpit (e.g. the Users view) show the last week or month.
|
script:
|
||||||
To show some data there you have to set the filter to time periods with jobs (August 2020 to January 2021).
|
|
||||||
|
```console
|
||||||
|
./register_cluster.sh
|
||||||
|
```
|
||||||
|
|
||||||
|
> Note: You may have to wait a few seconds for the cluster daemons to become
|
||||||
|
> ready before registering the cluster. Otherwise, you may get an error such
|
||||||
|
> as **sacctmgr: error: Problem talking to the database: Connection refused**.
|
||||||
|
>
|
||||||
|
> You can check the status of the cluster by viewing the logs: `docker-compose
|
||||||
|
> logs -f`
|
||||||
|
|
||||||
|
## Accessing the Cluster
|
||||||
|
|
||||||
|
Use `docker exec` to run a bash shell on the controller container:
|
||||||
|
|
||||||
|
```console
|
||||||
|
docker exec -it slurmctld bash
|
||||||
|
```
|
||||||
|
|
||||||
|
From the shell, execute slurm commands, for example:
|
||||||
|
|
||||||
|
```console
|
||||||
|
[root@slurmctld /]# sinfo
|
||||||
|
PARTITION AVAIL TIMELIMIT NODES STATE NODELIST
|
||||||
|
normal* up 5-00:00:00 2 idle c[1-2]
|
||||||
|
```
|
||||||
|
|
||||||
|
## Submitting Jobs
|
||||||
|
|
||||||
|
The `slurm_jobdir` named volume is mounted on each Slurm container as `/data`.
|
||||||
|
Therefore, in order to see job output files while on the controller, change to
|
||||||
|
the `/data` directory when on the **slurmctld** container and then submit a job:
|
||||||
|
|
||||||
|
```console
|
||||||
|
[root@slurmctld /]# cd /data/
|
||||||
|
[root@slurmctld data]# sbatch --wrap="uptime"
|
||||||
|
Submitted batch job 2
|
||||||
|
[root@slurmctld data]# ls
|
||||||
|
slurm-2.out
|
||||||
|
```
|
||||||
|
|
||||||
|
## Stopping and Restarting the Cluster
|
||||||
|
|
||||||
|
```console
|
||||||
|
docker-compose stop
|
||||||
|
docker-compose start
|
||||||
|
```
|
||||||
|
|
||||||
|
## Deleting the Cluster
|
||||||
|
|
||||||
|
To remove all containers and volumes, run:
|
||||||
|
|
||||||
|
```console
|
||||||
|
docker-compose stop
|
||||||
|
docker-compose rm -f
|
||||||
|
docker volume rm slurm-docker-cluster_etc_munge slurm-docker-cluster_etc_slurm slurm-docker-cluster_slurm_jobdir slurm-docker-cluster_var_lib_mysql slurm-docker-cluster_var_log_slurm
|
||||||
|
```
|
||||||
|
@ -1,128 +1,84 @@
|
|||||||
|
version: "2.2"
|
||||||
|
|
||||||
services:
|
services:
|
||||||
nats:
|
mysql:
|
||||||
container_name: nats
|
image: mariadb:10.10
|
||||||
image: nats:alpine
|
hostname: mysql
|
||||||
ports:
|
container_name: mysql
|
||||||
- "4222:4222"
|
environment:
|
||||||
- "8222:8222"
|
MYSQL_RANDOM_ROOT_PASSWORD: "yes"
|
||||||
|
MYSQL_DATABASE: slurm_acct_db
|
||||||
|
MYSQL_USER: slurm
|
||||||
|
MYSQL_PASSWORD: password
|
||||||
|
volumes:
|
||||||
|
- var_lib_mysql:/var/lib/mysql
|
||||||
|
|
||||||
cc-metric-store:
|
slurmdbd:
|
||||||
container_name: cc-metric-store
|
image: slurm-docker-cluster:${IMAGE_TAG:-21.08}
|
||||||
build:
|
build:
|
||||||
context: ./cc-metric-store
|
context: .
|
||||||
ports:
|
args:
|
||||||
- "8084:8084"
|
SLURM_TAG: ${SLURM_TAG:-slurm-21-08-6-1}
|
||||||
|
command: ["slurmdbd"]
|
||||||
|
container_name: slurmdbd
|
||||||
|
hostname: slurmdbd
|
||||||
volumes:
|
volumes:
|
||||||
- ${DATADIR}/cc-metric-store:/data
|
- etc_munge:/etc/munge
|
||||||
|
- etc_slurm:/etc/slurm
|
||||||
|
- var_log_slurm:/var/log/slurm
|
||||||
|
expose:
|
||||||
|
- "6819"
|
||||||
depends_on:
|
depends_on:
|
||||||
- nats
|
- mysql
|
||||||
|
|
||||||
influxdb:
|
slurmctld:
|
||||||
container_name: influxdb
|
image: slurm-docker-cluster:${IMAGE_TAG:-21.08}
|
||||||
image: influxdb
|
command: ["slurmctld"]
|
||||||
command: ["--reporting-disabled"]
|
|
||||||
environment:
|
|
||||||
DOCKER_INFLUXDB_INIT_MODE: setup
|
|
||||||
DOCKER_INFLUXDB_INIT_USERNAME: devel
|
|
||||||
DOCKER_INFLUXDB_INIT_PASSWORD: ${INFLUXDB_PASSWORD}
|
|
||||||
DOCKER_INFLUXDB_INIT_ORG: ${INFLUXDB_ORG}
|
|
||||||
DOCKER_INFLUXDB_INIT_BUCKET: ${INFLUXDB_BUCKET}
|
|
||||||
DOCKER_INFLUXDB_INIT_RETENTION: 100w
|
|
||||||
DOCKER_INFLUXDB_INIT_ADMIN_TOKEN: ${INFLUXDB_ADMIN_TOKEN}
|
|
||||||
ports:
|
|
||||||
- "127.0.0.1:${INFLUXDB_PORT}:8086"
|
|
||||||
volumes:
|
|
||||||
- ${DATADIR}/influxdb/data:/var/lib/influxdb2
|
|
||||||
- ${DATADIR}/influxdb/config:/etc/influxdb2
|
|
||||||
|
|
||||||
openldap:
|
|
||||||
container_name: ldap
|
|
||||||
image: osixia/openldap:1.5.0
|
|
||||||
command: --copy-service --loglevel debug
|
|
||||||
environment:
|
|
||||||
- LDAP_ADMIN_PASSWORD=${LDAP_ADMIN_PASSWORD}
|
|
||||||
- LDAP_ORGANISATION=${LDAP_ORGANISATION}
|
|
||||||
- LDAP_DOMAIN=${LDAP_DOMAIN}
|
|
||||||
volumes:
|
|
||||||
- ${DATADIR}/ldap:/container/service/slapd/assets/config/bootstrap/ldif/custom
|
|
||||||
|
|
||||||
mariadb:
|
|
||||||
container_name: mariadb
|
|
||||||
image: mariadb:latest
|
|
||||||
command: ["--default-authentication-plugin=mysql_native_password"]
|
|
||||||
environment:
|
|
||||||
MARIADB_ROOT_PASSWORD: ${MARIADB_ROOT_PASSWORD}
|
|
||||||
MARIADB_DATABASE: slurm_acct_db
|
|
||||||
MARIADB_USER: slurm
|
|
||||||
MARIADB_PASSWORD: demo
|
|
||||||
ports:
|
|
||||||
- "127.0.0.1:${MARIADB_PORT}:3306"
|
|
||||||
volumes:
|
|
||||||
- ${DATADIR}/mariadb:/etc/mysql/conf.d
|
|
||||||
# - ${DATADIR}/sql-init:/docker-entrypoint-initdb.d
|
|
||||||
cap_add:
|
|
||||||
- SYS_NICE
|
|
||||||
|
|
||||||
# mysql:
|
|
||||||
# container_name: mysql
|
|
||||||
# image: mysql:8.0.22
|
|
||||||
# command: ["--default-authentication-plugin=mysql_native_password"]
|
|
||||||
# environment:
|
|
||||||
# MYSQL_ROOT_PASSWORD: ${MYSQL_ROOT_PASSWORD}
|
|
||||||
# MYSQL_DATABASE: ${MYSQL_DATABASE}
|
|
||||||
# MYSQL_USER: ${MYSQL_USER}
|
|
||||||
# MYSQL_PASSWORD: ${MYSQL_PASSWORD}
|
|
||||||
# ports:
|
|
||||||
# - "127.0.0.1:${MYSQL_PORT}:3306"
|
|
||||||
# # volumes:
|
|
||||||
# # - ${DATADIR}/sql-init:/docker-entrypoint-initdb.d
|
|
||||||
# # - ${DATADIR}/sqldata:/var/lib/mysql
|
|
||||||
# cap_add:
|
|
||||||
# - SYS_NICE
|
|
||||||
|
|
||||||
slurm-controller:
|
|
||||||
container_name: slurmctld
|
container_name: slurmctld
|
||||||
hostname: slurmctld
|
hostname: slurmctld
|
||||||
build:
|
|
||||||
context: ./slurm/controller
|
|
||||||
privileged: true
|
|
||||||
volumes:
|
volumes:
|
||||||
- ${DATADIR}/slurm/home:/home
|
- etc_munge:/etc/munge
|
||||||
- ${DATADIR}/slurm/secret:/.secret
|
- etc_slurm:/etc/slurm
|
||||||
|
- slurm_jobdir:/data
|
||||||
slurm-database:
|
- var_log_slurm:/var/log/slurm
|
||||||
container_name: slurmdb
|
expose:
|
||||||
hostname: slurmdb
|
- "6817"
|
||||||
build:
|
|
||||||
context: ./slurm/database
|
|
||||||
depends_on:
|
depends_on:
|
||||||
- mariadb
|
- "slurmdbd"
|
||||||
- slurm-controller
|
|
||||||
privileged: true
|
|
||||||
volumes:
|
|
||||||
- ${DATADIR}/slurm/home:/home
|
|
||||||
- ${DATADIR}/slurm/secret:/.secret
|
|
||||||
|
|
||||||
slurm-worker01:
|
c1:
|
||||||
container_name: node01
|
image: slurm-docker-cluster:${IMAGE_TAG:-21.08}
|
||||||
hostname: node01
|
command: ["slurmd"]
|
||||||
build:
|
hostname: c1
|
||||||
context: ./slurm/worker
|
container_name: c1
|
||||||
|
volumes:
|
||||||
|
- etc_munge:/etc/munge
|
||||||
|
- etc_slurm:/etc/slurm
|
||||||
|
- slurm_jobdir:/data
|
||||||
|
- var_log_slurm:/var/log/slurm
|
||||||
|
expose:
|
||||||
|
- "6818"
|
||||||
depends_on:
|
depends_on:
|
||||||
- slurm-controller
|
- "slurmctld"
|
||||||
privileged: true
|
|
||||||
volumes:
|
|
||||||
- ${DATADIR}/slurm/home:/home
|
|
||||||
- ${DATADIR}/slurm/secret:/.secret
|
|
||||||
|
|
||||||
# slurm-worker02:
|
c2:
|
||||||
# container_name: node02
|
image: slurm-docker-cluster:${IMAGE_TAG:-21.08}
|
||||||
# hostname: node02
|
command: ["slurmd"]
|
||||||
# build:
|
hostname: c2
|
||||||
# context: ./slurm/worker
|
container_name: c2
|
||||||
# depends_on:
|
volumes:
|
||||||
# - slurm-controller
|
- etc_munge:/etc/munge
|
||||||
# privileged: true
|
- etc_slurm:/etc/slurm
|
||||||
# volumes:
|
- slurm_jobdir:/data
|
||||||
# - ${DATADIR}/slurm/home:/home
|
- var_log_slurm:/var/log/slurm
|
||||||
# - ${DATADIR}/slurm/secret:/.secret
|
expose:
|
||||||
|
- "6818"
|
||||||
|
depends_on:
|
||||||
|
- "slurmctld"
|
||||||
|
|
||||||
|
volumes:
|
||||||
|
etc_munge:
|
||||||
|
etc_slurm:
|
||||||
|
slurm_jobdir:
|
||||||
|
var_lib_mysql:
|
||||||
|
var_log_slurm:
|
||||||
|
64
docker-entrypoint.sh
Executable file
64
docker-entrypoint.sh
Executable file
@ -0,0 +1,64 @@
|
|||||||
|
#!/bin/bash
# Container entrypoint. The first argument selects the role:
#   slurmdbd  - start munged, wait for the database, exec slurmdbd
#   slurmctld - start munged, wait for slurmdbd:6819, exec slurmctld
#   slurmd    - start munged, wait for slurmctld:6817, exec slurmd
# Any other command line is exec'd unchanged.
set -e

if [ "$1" = "slurmdbd" ]
then
    echo "---> Starting the MUNGE Authentication service (munged) ..."
    gosu munge /usr/sbin/munged

    echo "---> Starting the Slurm Database Daemon (slurmdbd) ..."

    {
        # slurmdbd.conf consists of KEY=value lines and # comments, so sourcing
        # it as shell exposes StorageHost/StorageUser/StoragePass as variables.
        . /etc/slurm/slurmdbd.conf
        # Redirect order matters: "> /dev/null 2>&1" silences both streams,
        # whereas "2>&1 > /dev/null" would still print every connection error.
        until echo "SELECT 1" | mysql -h "$StorageHost" -u"$StorageUser" -p"$StoragePass" > /dev/null 2>&1
        do
            echo "-- Waiting for database to become active ..."
            sleep 2
        done
    }
    echo "-- Database is now active ..."

    exec gosu slurm /usr/sbin/slurmdbd -Dvvv
fi

if [ "$1" = "slurmctld" ]
then
    echo "---> Starting the MUNGE Authentication service (munged) ..."
    gosu munge /usr/sbin/munged

    echo "---> Waiting for slurmdbd to become active before starting slurmctld ..."

    # Redirecting to bash's /dev/tcp/<host>/<port> succeeds only once a TCP
    # connection to that port can be opened.
    until 2>/dev/null >/dev/tcp/slurmdbd/6819
    do
        echo "-- slurmdbd is not available. Sleeping ..."
        sleep 2
    done
    echo "-- slurmdbd is now active ..."

    echo "---> Starting the Slurm Controller Daemon (slurmctld) ..."
    # Slurm 17.02 is started without -i; every other version gets -i.
    if /usr/sbin/slurmctld -V | grep -q '17.02' ; then
        exec gosu slurm /usr/sbin/slurmctld -Dvvv
    else
        exec gosu slurm /usr/sbin/slurmctld -i -Dvvv
    fi
fi

if [ "$1" = "slurmd" ]
then
    echo "---> Starting the MUNGE Authentication service (munged) ..."
    gosu munge /usr/sbin/munged

    echo "---> Waiting for slurmctld to become active before starting slurmd..."

    until 2>/dev/null >/dev/tcp/slurmctld/6817
    do
        echo "-- slurmctld is not available. Sleeping ..."
        sleep 2
    done
    echo "-- slurmctld is now active ..."

    echo "---> Starting the Slurm Node Daemon (slurmd) ..."
    # slurmd is exec'd directly, without dropping privileges via gosu.
    exec /usr/sbin/slurmd -Dvvv
fi

# Not one of the known roles: run the given command as-is.
exec "$@"
|
30
makefile
Normal file
30
makefile
Normal file
@ -0,0 +1,30 @@
|
|||||||
|
# Builds the Slurm PrEp plugin prep_pika.so from slurm-prep-pika_v4.c and
# installs it into the Slurm plugin directory.
SLURM_ROOT_DIR = /usr
SLURM_INC_DIR = /usr/include/slurm
SLURM_LIB_DIR = /usr/lib64/slurm
SLURM_BUILD = 21.08.6
SLURM_BUILD_DIR = /home/slurm

# The shared object is named <type>_<name>.so.
PLUGIN_TYPE = prep
PLUGIN_NAME = pika
PLUGIN_FILE = $(PLUGIN_TYPE)_$(PLUGIN_NAME).so

SRC_FILE = slurm-prep-pika_v4.c

CC = gcc
# Headers come from the installed Slurm includes and from the source tree in
# SLURM_BUILD_DIR.
CFLAGS ?= -Wall -fPIC -g -I$(SLURM_INC_DIR) -I$(SLURM_BUILD_DIR) -I/home/slurm/src/ -I/home/slurm
LDFLAGS ?= --shared -L.

# None of these targets produce a file of the same name; declaring them phony
# keeps a stray file called "clean" or "install" from masking the rule.
.PHONY: all default install clean mrproper

all: $(PLUGIN_FILE)

default: $(PLUGIN_FILE)

$(PLUGIN_FILE): $(SRC_FILE)
	$(CC) $(CFLAGS) $(LDFLAGS) $^ -o $@

install: $(PLUGIN_FILE)
	install -m 755 $(PLUGIN_FILE) $(SLURM_LIB_DIR)

clean:
	rm -f $(PLUGIN_FILE)

mrproper: clean
|
5
register_cluster.sh
Executable file
5
register_cluster.sh
Executable file
@ -0,0 +1,5 @@
|
|||||||
|
#!/bin/bash
# Adds the cluster "linux" to the accounting database by running sacctmgr
# inside the slurmctld container, then restarts the slurmdbd and slurmctld
# services.
set -e

# With set -e a failed registration stops the script here, so the restart
# below runs only after sacctmgr has succeeded.
docker exec slurmctld bash -c "/usr/bin/sacctmgr --immediate add cluster name=linux"

docker-compose restart slurmdbd slurmctld
|
1012
slurm-prep-pika_v4.c
Normal file
1012
slurm-prep-pika_v4.c
Normal file
File diff suppressed because it is too large
Load Diff
95
slurm.conf
Normal file
95
slurm.conf
Normal file
@ -0,0 +1,95 @@
|
|||||||
|
# slurm.conf
|
||||||
|
#
|
||||||
|
# See the slurm.conf man page for more information.
|
||||||
|
#
|
||||||
|
ClusterName=linux
|
||||||
|
ControlMachine=slurmctld
|
||||||
|
ControlAddr=slurmctld
|
||||||
|
#BackupController=
|
||||||
|
#BackupAddr=
|
||||||
|
#
|
||||||
|
SlurmUser=slurm
|
||||||
|
#SlurmdUser=root
|
||||||
|
SlurmctldPort=6817
|
||||||
|
SlurmdPort=6818
|
||||||
|
AuthType=auth/munge
|
||||||
|
#JobCredentialPrivateKey=
|
||||||
|
#JobCredentialPublicCertificate=
|
||||||
|
StateSaveLocation=/var/lib/slurmd
|
||||||
|
SlurmdSpoolDir=/var/spool/slurmd
|
||||||
|
SwitchType=switch/none
|
||||||
|
MpiDefault=none
|
||||||
|
SlurmctldPidFile=/var/run/slurmd/slurmctld.pid
|
||||||
|
SlurmdPidFile=/var/run/slurmd/slurmd.pid
|
||||||
|
ProctrackType=proctrack/linuxproc
|
||||||
|
#PluginDir=
|
||||||
|
#CacheGroups=0
|
||||||
|
#FirstJobId=
|
||||||
|
ReturnToService=0
|
||||||
|
#MaxJobCount=
|
||||||
|
#PlugStackConfig=
|
||||||
|
#PropagatePrioProcess=
|
||||||
|
#PropagateResourceLimits=
|
||||||
|
#PropagateResourceLimitsExcept=
|
||||||
|
#Prolog=
|
||||||
|
#Epilog=
|
||||||
|
#SrunProlog=
|
||||||
|
#SrunEpilog=
|
||||||
|
#TaskProlog=
|
||||||
|
#TaskEpilog=
|
||||||
|
#TaskPlugin=
|
||||||
|
#TrackWCKey=no
|
||||||
|
#TreeWidth=50
|
||||||
|
#TmpFS=
|
||||||
|
#UsePAM=
|
||||||
|
#
|
||||||
|
# TIMERS
|
||||||
|
SlurmctldTimeout=300
|
||||||
|
SlurmdTimeout=300
|
||||||
|
InactiveLimit=0
|
||||||
|
MinJobAge=300
|
||||||
|
KillWait=30
|
||||||
|
Waittime=0
|
||||||
|
#
|
||||||
|
# SCHEDULING
|
||||||
|
SchedulerType=sched/backfill
|
||||||
|
#SchedulerAuth=
|
||||||
|
#SchedulerPort=
|
||||||
|
#SchedulerRootFilter=
|
||||||
|
SelectType=select/cons_res
|
||||||
|
SelectTypeParameters=CR_CPU_Memory
|
||||||
|
# NOTE(review): FastSchedule is obsolete in Slurm 20.02 and later (this image
# builds 21.08 by default), where FastSchedule=1 is only reported as an
# ignored option. 1 was the default in older releases, so disabling the line
# should not change scheduling -- confirm against the release notes.
#FastSchedule=1
|
||||||
|
#PriorityType=priority/multifactor
|
||||||
|
#PriorityDecayHalfLife=14-0
|
||||||
|
#PriorityUsageResetPeriod=14-0
|
||||||
|
#PriorityWeightFairshare=100000
|
||||||
|
#PriorityWeightAge=1000
|
||||||
|
#PriorityWeightPartition=10000
|
||||||
|
#PriorityWeightJobSize=1000
|
||||||
|
#PriorityMaxAge=1-0
|
||||||
|
#
|
||||||
|
# LOGGING
|
||||||
|
SlurmctldDebug=3
|
||||||
|
SlurmctldLogFile=/var/log/slurm/slurmctld.log
|
||||||
|
SlurmdDebug=3
|
||||||
|
SlurmdLogFile=/var/log/slurm/slurmd.log
|
||||||
|
JobCompType=jobcomp/filetxt
|
||||||
|
JobCompLoc=/var/log/slurm/jobcomp.log
|
||||||
|
#
|
||||||
|
# ACCOUNTING
|
||||||
|
JobAcctGatherType=jobacct_gather/linux
|
||||||
|
JobAcctGatherFrequency=30
|
||||||
|
#
|
||||||
|
AccountingStorageType=accounting_storage/slurmdbd
|
||||||
|
AccountingStorageHost=slurmdbd
|
||||||
|
AccountingStoragePort=6819
|
||||||
|
#AccountingStorageLoc=slurm_acct_db
|
||||||
|
#AccountingStoragePass=
|
||||||
|
#AccountingStorageUser=
|
||||||
|
#
|
||||||
|
# COMPUTE NODES
|
||||||
|
NodeName=c[1-2] RealMemory=1000 State=UNKNOWN
|
||||||
|
#
|
||||||
|
# PARTITIONS
|
||||||
|
PartitionName=normal Default=yes Nodes=c[1-2] Priority=50 DefMemPerCPU=500 Shared=NO MaxNodes=2 MaxTime=5-00:00:00 DefaultTime=5-00:00:00 State=UP
|
||||||
|
#PrEpPlugins=pika
|
BIN
slurm/.DS_Store
vendored
Normal file
BIN
slurm/.DS_Store
vendored
Normal file
Binary file not shown.
37
slurmdbd.conf
Normal file
37
slurmdbd.conf
Normal file
@ -0,0 +1,37 @@
|
|||||||
|
#
|
||||||
|
# Example slurmdbd.conf file.
|
||||||
|
#
|
||||||
|
# See the slurmdbd.conf man page for more information.
|
||||||
|
#
|
||||||
|
# Archive info
|
||||||
|
#ArchiveJobs=yes
|
||||||
|
#ArchiveDir="/tmp"
|
||||||
|
#ArchiveSteps=yes
|
||||||
|
#ArchiveScript=
|
||||||
|
#JobPurge=12
|
||||||
|
#StepPurge=1
|
||||||
|
#
|
||||||
|
# Authentication info
|
||||||
|
AuthType=auth/munge
|
||||||
|
#AuthInfo=/var/run/munge/munge.socket.2
|
||||||
|
#
|
||||||
|
# slurmDBD info
|
||||||
|
DbdAddr=slurmdbd
|
||||||
|
DbdHost=slurmdbd
|
||||||
|
#DbdPort=6819
|
||||||
|
SlurmUser=slurm
|
||||||
|
#MessageTimeout=300
|
||||||
|
DebugLevel=4
|
||||||
|
#DefaultQOS=normal,standby
|
||||||
|
LogFile=/var/log/slurm/slurmdbd.log
|
||||||
|
PidFile=/var/run/slurmdbd/slurmdbd.pid
|
||||||
|
#PluginDir=/usr/lib/slurm
|
||||||
|
#PrivateData=accounts,users,usage,jobs
|
||||||
|
#TrackWCKey=yes
|
||||||
|
#
|
||||||
|
# Database info
|
||||||
|
StorageType=accounting_storage/mysql
|
||||||
|
StorageHost=mysql
|
||||||
|
StorageUser=slurm
|
||||||
|
StoragePass=password
|
||||||
|
#StorageLoc=slurm_acct_db
|
Loading…
x
Reference in New Issue
Block a user