From 325ab9b27d4e0d11566e90a027d447976161460b Mon Sep 17 00:00:00 2001 From: Aditya Date: Wed, 13 Nov 2024 13:14:41 +0000 Subject: [PATCH] Update to CgroupPlugin --- cc-metric-store/Dockerfile | 20 -------------------- cc-metric-store/config.json | 28 ---------------------------- curl_slurmrestd.sh | 2 +- slurm/controller/slurm.conf | 10 +++++----- slurm/worker/cgroup.conf | 2 +- slurm/worker/docker-entrypoint.sh | 7 ++++--- 6 files changed, 11 insertions(+), 58 deletions(-) delete mode 100644 cc-metric-store/Dockerfile delete mode 100644 cc-metric-store/config.json diff --git a/cc-metric-store/Dockerfile b/cc-metric-store/Dockerfile deleted file mode 100644 index eb7aa48..0000000 --- a/cc-metric-store/Dockerfile +++ /dev/null @@ -1,20 +0,0 @@ -FROM golang:1.22.4 - -RUN apt-get update -RUN apt-get -y install git - -RUN git clone https://github.com/ClusterCockpit/cc-metric-store.git /cc-metric-store -RUN ls -RUN cd /cc-metric-store && go build ./cmd/cc-metric-store - -# Reactivate when latest commit is available -#RUN go get -d -v github.com/ClusterCockpit/cc-metric-store -#RUN go install -v github.com/ClusterCockpit/cc-metric-store@latest - -RUN mv /cc-metric-store/cc-metric-store /go/bin -COPY config.json /go/bin - -VOLUME /data - -WORKDIR /go/bin -CMD ["./cc-metric-store"] diff --git a/cc-metric-store/config.json b/cc-metric-store/config.json deleted file mode 100644 index 674c67c..0000000 --- a/cc-metric-store/config.json +++ /dev/null @@ -1,28 +0,0 @@ -{ - "metrics": { - "clock": { "frequency": 60, "aggregation": null, "scope": "node" }, - "cpi": { "frequency": 60, "aggregation": null, "scope": "node" }, - "cpu_load": { "frequency": 60, "aggregation": null, "scope": "node" }, - "flops_any": { "frequency": 60, "aggregation": null, "scope": "node" }, - "flops_dp": { "frequency": 60, "aggregation": null, "scope": "node" }, - "flops_sp": { "frequency": 60, "aggregation": null, "scope": "node" }, - "ib_bw": { "frequency": 60, "aggregation": null, "scope": "node" }, - "lustre_bw": { "frequency": 60, "aggregation": null, "scope": "node" }, - "mem_bw": { "frequency": 60, "aggregation": null, "scope": "node" }, - "mem_used": { "frequency": 60, "aggregation": null, "scope": "node" }, - "rapl_power": { "frequency": 60, "aggregation": null, "scope": "node" } - }, - "checkpoints": { - "interval": 100000000000, - "directory": "/data/checkpoints", - "restore": 100000000000 - }, - "archive": { - "interval": 100000000000, - "directory": "/data/archive" - }, - "retention-in-memory": 100000000000, - "http-api-address": "0.0.0.0:8081", - "nats": "nats://cc-nats:4222", - "jwt-public-key": "kzfYrYy+TzpanWZHJ5qSdMj5uKUWgq74BWhQG6copP0=" -} diff --git a/curl_slurmrestd.sh b/curl_slurmrestd.sh index dc506ff..8168267 100755 --- a/curl_slurmrestd.sh +++ b/curl_slurmrestd.sh @@ -1,3 +1,3 @@ SLURM_JWT=$(cat data/slurm/secret/jwt_token.txt) -curl -X 'GET' -v 'http://localhost:6820/slurm/v0.0.39/ping' --location --silent --show-error -H "X-SLURM-USER-NAME: root" -H "X-SLURM-USER-TOKEN: $SLURM_JWT" +curl -X 'GET' -v 'http://localhost:6820/slurm/v0.0.39/node/node01' --location --silent --show-error -H "X-SLURM-USER-NAME: root" -H "X-SLURM-USER-TOKEN: $SLURM_JWT" # curl -v --unix-socket data/slurm/tmp/slurmrestd.socket 'http://localhost:6820/slurm/v0.0.39/ping' \ No newline at end of file diff --git a/slurm/controller/slurm.conf b/slurm/controller/slurm.conf index 8fbaee4..f5a34ef 100644 --- a/slurm/controller/slurm.conf +++ b/slurm/controller/slurm.conf @@ -39,7 +39,7 @@ ReturnToService=0 #SrunEpilog= #TaskProlog= #TaskEpilog= -TaskPlugin=task/none +TaskPlugin=task/affinity #TrackWCKey=no #TreeWidth=50 #TmpFS= @@ -79,9 +79,9 @@ JobCompType=jobcomp/filetxt JobCompLoc=/var/log/slurm/jobcomp.log # # ACCOUNTING -#JobAcctGatherType=jobacct_gather/linux -JobAcctGatherType=jobacct_gather/cgroup -# ProctrackType=proctrack/cgroup +JobAcctGatherType=jobacct_gather/linux +#JobAcctGatherType=jobacct_gather/cgroup +#ProctrackType=proctrack/cgroup JobAcctGatherFrequency=30 # @@ -99,7 +99,7 @@ PartitionName=debug Nodes=node01 Default=YES MaxTime=INFINITE State=UP # # COMPUTE NODES # NodeName=c[1-2] RealMemory=1000 State=UNKNOWN -NodeName=node01 CPUs=2 Boards=1 SocketsPerBoard=2 CoresPerSocket=1 ThreadsPerCore=1 +NodeName=node01 CPUs=1 Boards=1 SocketsPerBoard=1 CoresPerSocket=1 ThreadsPerCore=1 # # # # PARTITIONS diff --git a/slurm/worker/cgroup.conf b/slurm/worker/cgroup.conf index f24d9d7..1f930c7 100644 --- a/slurm/worker/cgroup.conf +++ b/slurm/worker/cgroup.conf @@ -1,4 +1,4 @@ -CgroupPlugin=cgroup/v1 +CgroupPlugin=disabled ConstrainCores=yes ConstrainDevices=no ConstrainRAMSpace=yes diff --git a/slurm/worker/docker-entrypoint.sh b/slurm/worker/docker-entrypoint.sh index 090c3a0..e254c0a 100755 --- a/slurm/worker/docker-entrypoint.sh +++ b/slurm/worker/docker-entrypoint.sh @@ -78,9 +78,9 @@ _slurmd() { fi echo "found slurm.conf" - sudo yum install -y nc - sudo yum install -y procps - sudo yum install -y iputils + # sudo yum install -y nc + # sudo yum install -y procps + # sudo yum install -y iputils mkdir -p /var/spool/slurm/d /etc/slurm /var/run/slurm/d /var/log/slurm chown slurm: /var/spool/slurm/d /var/run/slurm/d /var/log/slurm @@ -98,6 +98,7 @@ _slurmd() { chown slurm: /var/run/slurm/d/slurmd.pid echo "Starting slurmd" + /usr/sbin/slurmstepd infinity & /usr/sbin/slurmd -Dvv echo "Started slurmd" }