From 1ec3c7d80f53747bac96900c8a59ce8f91b830c7 Mon Sep 17 00:00:00 2001
From: Aditya Ujeniya
Date: Fri, 4 Jul 2025 11:22:03 +0200
Subject: [PATCH] Cleaning, stabilizing and testing

---
 README.md                                   |  20 ++--
 dataGenerationScript.sh                     |   2 +
 docker-compose.yml                          |   4 +-
 misc/config.json                            |   5 +-
 scripts/checkModules.sh                     |   2 +
 scripts/checkpointsToInflux.pl              | 100 --------------------
 scripts/checkpointsToInflux.sh              |  65 -------------
 scripts/prerequisite_installation_script.sh |   4 +-
 scripts/sqliteToMariadb.sh                  |  12 ---
 setupDev.sh                                 |   6 +-
 slurm/controller/docker-entrypoint.sh       |   2 +-
 slurm/controller/slurm.conf                 |   8 +-
 12 files changed, 28 insertions(+), 202 deletions(-)
 delete mode 100644 scripts/checkpointsToInflux.pl
 delete mode 100755 scripts/checkpointsToInflux.sh
 delete mode 100644 scripts/sqliteToMariadb.sh

diff --git a/README.md b/README.md
index b196299..3489a4a 100755
--- a/README.md
+++ b/README.md
@@ -10,13 +10,11 @@ It includes the following containers:
 |Slurm Database service|slurmdbd|6817|
 |Slurm Rest service with JWT authentication|slurmrestd|6820|
 |Slurm Worker|node01|6818|
-|MariaDB service|mariadb|3306|
-|InfluxDB serice|influxdb|8086|
 |NATS service|nats|4222, 6222, 8222|
 |cc-metric-store service|cc-metric-store|8084|
 |OpenLDAP|openldap|389, 636|
 
-The setup comes with fixture data for a Job archive, cc-metric-store checkpoints, InfluxDB, MariaDB, and a LDAP user directory.
+The setup comes with fixture data for a Job archive, cc-metric-store checkpoints, and an LDAP user directory.
 
 ## Prerequisites
@@ -44,11 +42,11 @@ Note: You can install all these dependencies via predefined installation steps i
 
 If you are using different linux flavors, you will have to adapt `prerequisite_installation_script.sh` as well as `setupDev.sh`.
 
-## Setup
+## Setup Procedure
 
 1. Clone `cc-backend` repository in chosen base folder: `$> git clone https://github.com/ClusterCockpit/cc-backend.git`
 
-2. Run `$ ./setupDev.sh`: **NOTICE** The script will download files of a total size of 338MB (mostly for the cc-metric-store data).
+2. Run the setup script `$> ./setupDev.sh`. **NOTICE**: The script will download files with a total size of 338MB (mostly for the cc-metric-store data).
 
 3. The setup-script launches the supporting container stack in the background automatically if everything went well. Run `$> ./cc-backend/cc-backend -server -dev` to start `cc-backend`.
@@ -72,9 +70,9 @@ You can also login as regular user using any credential in the LDAP user directo
 
 When you are done cloning the cc-backend repo and once you execute `setupDev.sh` file, it will copy a preconfigured `config.json` from `misc/config.json` and replace the `cc-backend/config.json`, which will be used by cc-backend, once you start the server. The preconfigured config.json attaches to:
 
-#### 1. MariaDB docker service on port 3306 (database: ccbackend)
-#### 2. OpenLDAP docker service on port 389
-#### 3. cc-metric-store docker service on port 8084
+#### 1. OpenLDAP docker service on port 389
+#### 2. cc-metric-store docker service on port 8084
+#### 3. cc-slurm-adapter running on the slurmctld docker service
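+
+For reference, the relevant part of the preconfigured `misc/config.json` looks like this (a trimmed excerpt based on the `misc/config.json` hunk later in this patch; the full file contains further options):
+
+```json
+{
+  "apiAllowedIPs": [
+    "*"
+  ],
+  "ldap": {
+    "url": "ldap://0.0.0.0",
+    "user_base": "ou=users,dc=example,dc=com"
+  }
+}
+```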
 
 cc-metric-store also has a preconfigured `config.json` in `cc-metric-store/config.json` which attaches to NATS docker service on port 4222 and subscribes to topic 'hpc-nats'.
 
@@ -94,7 +92,7 @@ Now that you can see the docker services, and if you want to manually access the
 > **`Example`**: You want to run slurm commands like `sinfo` or `squeue` or `scontrol` on slurm controller, you cannot directly access it.
 
-You need to **`bash`** into the running service by using the following command:
+You need to open a **`bash`** session in the running service using the following command:
 
 ```
 $ docker exec -it <docker service name> bash
@@ -103,7 +101,7 @@ $ docker exec -it <docker service name> bash
 $ docker exec -it slurmctld bash
 #or
-$ docker exec -it mariadb bash
+$ docker exec -it cc-metric-store bash
 ```
 
 Once you start a **`bash`** on any docker service, then you may execute any service related commands in that **`bash`**.
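+
+For example, to inspect the demo cluster from inside the `slurmctld` service (a minimal sketch; `sinfo` and `squeue` are standard Slurm commands and require the controller to be up):
+
+```
+$ docker exec -it slurmctld bash
+$ sinfo
+$ squeue
+```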
@@ -147,7 +145,7 @@ You may also use the never expiring token directly from the file for any of your
 ## Known Issues
 
 * `docker-compose` installed on Ubuntu (18.04, 20.04) via `apt-get` can not correctly parse `docker-compose.yml` due to version differences. Install latest version of `docker-compose` from https://docs.docker.com/compose/install/ instead.
-* You need to ensure that no other web server is running on ports 8080 (cc-backend), 8082 (cc-metric-store), 8086 (InfluxDB), 4222 and 8222 (Nats), or 3306 (MariaDB). If one or more ports are already in use, you have to adapt the related config accordingly.
+* You need to ensure that no other service is listening on ports 8080 (cc-backend), 8084 (cc-metric-store), or 4222 and 8222 (NATS). If one or more of these ports are already in use, you have to adapt the related configuration accordingly.
 * Existing VPN connections sometimes cause problems with docker. If `docker-compose` does not start up correctly, try disabling any active VPN connection. Refer to https://stackoverflow.com/questions/45692255/how-make-openvpn-work-with-docker for further information.
 
 ## Docker services and restarting the services
diff --git a/dataGenerationScript.sh b/dataGenerationScript.sh
index cf7796d..fac98a2 100755
--- a/dataGenerationScript.sh
+++ b/dataGenerationScript.sh
@@ -1,4 +1,6 @@
 #!/bin/bash
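+# Fail fast: -e aborts on the first failing command, -u treats unset variables
+# as errors, and -o pipefail makes a pipeline fail if any command in it fails.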
+set -euo pipefail
+
 echo ""
 echo "|--------------------------------------------------------------------------------------|"
 echo "| This is Data generation script for docker services                                   |"
diff --git a/docker-compose.yml b/docker-compose.yml
index 92d8fca..a9d19d7 100755
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -91,8 +91,8 @@ services:
       - "6819:6819"
 
   node01:
-    container_name: node01
-    hostname: node01
+    container_name: f0101
+    hostname: f0101
     build:
       context: ./slurm/worker
     depends_on:
diff --git a/misc/config.json b/misc/config.json
index 2977e72..dcb1745 100644
--- a/misc/config.json
+++ b/misc/config.json
@@ -8,8 +8,9 @@
     "jwts": {
         "max-age": "2000h"
     },
-    "db-driver": "mysql",
-    "db": "root:root@tcp(0.0.0.0:3306)/ccbackend",
+    "apiAllowedIPs": [
+        "*"
+    ],
     "ldap": {
         "url": "ldap://0.0.0.0",
         "user_base": "ou=users,dc=example,dc=com",
diff --git a/scripts/checkModules.sh b/scripts/checkModules.sh
index 6fdd04d..5b32d1f 100755
--- a/scripts/checkModules.sh
+++ b/scripts/checkModules.sh
@@ -1,5 +1,7 @@
 #!/bin/bash
 
+set -euo pipefail
+
 cd scripts
 
 # Check if required perl modules are installed
diff --git a/scripts/checkpointsToInflux.pl b/scripts/checkpointsToInflux.pl
deleted file mode 100644
index 144c90d..0000000
--- a/scripts/checkpointsToInflux.pl
+++ /dev/null
@@ -1,100 +0,0 @@
-#!/usr/bin/env perl
-use strict;
-use warnings;
-use utf8;
-
-use File::Path qw( make_path rmtree );
-use Cpanel::JSON::XS qw( decode_json encode_json );
-use File::Slurp;
-use Data::Dumper;
-use Time::Piece;
-use Sort::Versions;
-use REST::Client;
-
-### INFLUXDB
-my $newCheckpoints = './data/cc-metric-store/checkpoints';
-my @CheckpClusters;
-my $verbose = 1;
-my $restClient = REST::Client->new();
-$restClient->setHost('http://localhost:8086'); # Adapt port here!
-$restClient->addHeader('Authorization', "Token 74008ea2a8dad5e6f856838a90c6392e"); # compare .env file
-$restClient->addHeader('Content-Type', 'text/plain; charset=utf-8');
-$restClient->addHeader('Accept', 'application/json');
-$restClient->getUseragent()->ssl_opts(SSL_verify_mode => 0); # Temporary: Disable Cert Check
-$restClient->getUseragent()->ssl_opts(verify_hostname => 0); # Temporary: Disable Cert Check
-
-# Get clusters by cc-metric-store/$subfolder
-opendir my $dhc, $newCheckpoints or die "can't open directory: $!";
-while ( readdir $dhc ) {
-    chomp; next if $_ eq '.' or $_ eq '..' or $_ eq 'job-archive';
-
-    my $cluster = $_;
-    push @CheckpClusters, $cluster;
-}
-
-# start to read checkpoints for influx
-foreach my $cluster ( @CheckpClusters ) {
-    print "Starting to read updated checkpoint-files into influx for $cluster\n";
-
-    opendir my $dhLevel1, "$newCheckpoints/$cluster" or die "can't open directory: $!";
-    while ( readdir $dhLevel1 ) {
-        chomp; next if $_ eq '.' or $_ eq '..';
-        my $level1 = $_;
-
-        if ( -d "$newCheckpoints/$cluster/$level1" ) {
-            my $nodeSource = "$newCheckpoints/$cluster/$level1/";
-            my @files = read_dir($nodeSource);
-            my $length = @files;
-            if (!@files || $length != 14) { # needs 14 files == 7 days worth of data
-                next;
-            }
-            my @sortedFiles = sort { versioncmp($a,$b) } @files; # sort alphanumerically: _Really_ start with index == 0 == 1609459200.json
-            my $nodeMeasurement;
-
-            foreach my $file (@sortedFiles) {
-                # print "$file\n";
-                my $rawstr = read_file("$nodeSource/$file");
-                my $json = decode_json($rawstr);
-                my $fileMeasurement;
-
-                foreach my $metric (keys %{$json->{metrics}}) {
-                    my $start = $json->{metrics}->{$metric}->{start};
-                    my $timestep = $json->{metrics}->{$metric}->{frequency};
-                    my $data = $json->{metrics}->{$metric}->{data};
-                    my $length = @$data;
-                    my $measurement;
-
-                    while (my ($index, $value) = each(@$data)) {
-                        if ($value) {
-                            my $timestamp = $start + ($timestep * $index);
-                            $measurement .= "$metric,cluster=$cluster,hostname=$level1,type=node value=".$value." $timestamp"."\n";
-                        }
-                    }
-                    # Use v2 API for Influx2
-                    if ($measurement) {
-                        # print "Adding: #VALUES $length KEY $metric"."\n";
-                        $fileMeasurement .= $measurement;
-                    }
-                }
-                if ($fileMeasurement) {
-                    $nodeMeasurement .= $fileMeasurement;
-                }
-            }
-
-            $restClient->POST("/api/v2/write?org=ClusterCockpit&bucket=ClusterCockpit&precision=s", "$nodeMeasurement"); # compare .env for bucket and org
-            my $responseCode = $restClient->responseCode();
-
-            if ( $responseCode eq '204') {
-                if ( $verbose ) {
-                    print "INFLUX API WRITE: CLUSTER $cluster HOST $level1"."\n";
-                };
-            } else {
-                if ( $responseCode ne '422' ) { # Exclude High Frequency Error 422 - Temporary!
-                    my $response = $restClient->responseContent();
-                    print "INFLUX API WRITE ERROR CODE ".$responseCode.": ".$response."\n";
-                };
-            };
-        }
-    }
-}
-print "Done for influx\n";
diff --git a/scripts/checkpointsToInflux.sh b/scripts/checkpointsToInflux.sh
deleted file mode 100755
index 4097163..0000000
--- a/scripts/checkpointsToInflux.sh
+++ /dev/null
@@ -1,65 +0,0 @@
-#!/bin/bash
-
-set -euo pipefail
-
-NEW_CHECKPOINTS='../data/cc-metric-store/checkpoints'
-VERBOSE=1
-INFLUX_HOST='http://0.0.0.0:8181'
-
-HEADERS=(
-    -H "Content-Type: text/plain; charset=utf-8"
-    -H "Accept: application/json"
-)
-
-checkp_clusters=()
-while IFS= read -r -d '' dir; do
-    checkp_clusters+=("$(basename "$dir")")
-done < <(find "$NEW_CHECKPOINTS" -mindepth 1 -maxdepth 1 -type d \! -name 'job-archive' -print0)
-
-for cluster in "${checkp_clusters[@]}"; do
-    echo "Starting to read updated checkpoint-files into influx for $cluster"
-
-    while IFS= read -r -d '' level1_dir; do
-        level1=$(basename "$level1_dir")
-        node_source="$NEW_CHECKPOINTS/$cluster/$level1"
-
-        mapfile -t files < <(find "$node_source" -type f -name '*.json' | sort -V)
-        # if [[ ${#files[@]} -ne 14 ]]; then
-        #     continue
-        # fi
-
-        node_measurement=""
-        for file in "${files[@]}"; do
-            rawstr=$(<"$file")
-
-            while IFS= read -r metric; do
-                start=$(jq -r ".metrics[\"$metric\"].start" <<<"$rawstr")
-                timestep=$(jq -r ".metrics[\"$metric\"].frequency" <<<"$rawstr")
-
-                while IFS= read -r index_value; do
-                    index=$(awk -F: '{print $1}' <<<"$index_value")
-                    value=$(awk -F: '{print $2}' <<<"$index_value")
-
-                    if [[ -n "$value" && "$value" != "null" ]]; then
-                        timestamp=$((start + (timestep * index)))
-                        node_measurement+="$metric,cluster=$cluster,hostname=$level1,type=node value=$value $timestamp\n"
-                    fi
-                done < <(jq -r ".metrics[\"$metric\"].data | to_entries | map(\"\(.key):\(.value // \"null\")\") | .[]" <<<"$rawstr")
-            done < <(jq -r '.metrics | keys[]' <<<"$rawstr")
-        done
-
-        if [[ -n "$node_measurement" ]]; then
-            while IFS= read -r chunk; do
-                response_code=$(curl -s -o /dev/null -w "%{http_code}" "${HEADERS[@]}" --data-binary "$chunk" "$INFLUX_HOST/api/v2/write?bucket=mydb&precision=s")
-                if [[ "$response_code" == "204" ]]; then
-                    [[ "$VERBOSE" -eq 1 ]] && echo "INFLUX API WRITE: CLUSTER $cluster HOST $level1"
-                elif [[ "$response_code" != "422" ]]; then
-                    echo "INFLUX API WRITE ERROR CODE $response_code"
-                fi
-            done < <(echo -e "$node_measurement" | split -l 1000 --filter='cat')
-        fi
-        echo "Done for : "$node_source
-    done < <(find "$NEW_CHECKPOINTS/$cluster" -mindepth 1 -maxdepth 1 -type d -print0)
-done
-
-echo "Done for influx"
diff --git a/scripts/prerequisite_installation_script.sh b/scripts/prerequisite_installation_script.sh
index e061a13..b8d1cad 100644
--- a/scripts/prerequisite_installation_script.sh
+++ b/scripts/prerequisite_installation_script.sh
@@ -1,5 +1,7 @@
 #!/bin/bash -l
 
+set -euo pipefail
+
 sudo apt-get update
 
 sudo apt-get upgrade -f -y
@@ -33,7 +35,7 @@ sudo cpan Time::Piece
 sudo cpan Sort::Versions
 
 sudo groupadd docker
-sudo usermod -aG docker ubuntu
+sudo usermod -aG docker $USER
 
 sudo shutdown -r -t 0
diff --git a/scripts/sqliteToMariadb.sh b/scripts/sqliteToMariadb.sh
deleted file mode 100644
index 566fd03..0000000
--- a/scripts/sqliteToMariadb.sh
+++ /dev/null
@@ -1,12 +0,0 @@
-#!/bin/bash
-
-echo "Will run prerequisites 'apt install python3-pip' and 'pip install sqlite3-to-mysql'"
-
-sudo apt install python3-pip
-
-pip install sqlite3-to-mysql
-
-echo "'sqlite3mysql' requires running DB container, will fail otherwise."
-
-# -f FILE -d DBNAME -u USER -h HOST -P PORT
-~/.local/bin/sqlite3mysql -f job.db -d ClusterCockpit -u root --mysql-password root -h localhost -P 3306
diff --git a/setupDev.sh b/setupDev.sh
index 5165921..cd478ea 100755
--- a/setupDev.sh
+++ b/setupDev.sh
@@ -1,5 +1,5 @@
 #!/bin/bash
-set -eu
+set -euo pipefail
 echo ""
 echo "|--------------------------------------------------------------------------------------|"
 echo "| Welcome to cc-docker automatic deployment script.                                     |"
@@ -41,15 +41,13 @@
 chmod u+x dataGenerationScript.sh
 
 cd cc-backend
 
-rm -rf var
-
 if [ ! -d var ]; then
     wget https://hpc-mover.rrze.uni-erlangen.de/HPC-Data/0x7b58aefb/eig7ahyo6fo2bais0ephuf2aitohv1ai/job-archive-demo.tar
     tar xf job-archive-demo.tar
     rm ./job-archive-demo.tar
 
     cp ./configs/env-template.txt .env
-    cp ./configs/config-demo.json config.json
+    cp ../misc/config.json config.json
 
     sed -i 's/"addr": *"127\.0\.0\.1:8080"/"addr": "0.0.0.0:8080"/' config.json
diff --git a/slurm/controller/docker-entrypoint.sh b/slurm/controller/docker-entrypoint.sh
index fe264e8..335a573 100755
--- a/slurm/controller/docker-entrypoint.sh
+++ b/slurm/controller/docker-entrypoint.sh
@@ -208,7 +208,7 @@ _slurmctld() {
         sleep 2
     done
 
-    sacctmgr -i add cluster name=linux
+    sacctmgr -i add cluster name=fritz
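+    # The cluster name registered in accounting must match ClusterName=fritz in slurm.conf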
     sleep 2s
     echo "Starting slurmctld"
     cp -f /etc/slurm/slurm.conf /.secret/
diff --git a/slurm/controller/slurm.conf b/slurm/controller/slurm.conf
index f5a34ef..a72f17e 100644
--- a/slurm/controller/slurm.conf
+++ b/slurm/controller/slurm.conf
@@ -2,7 +2,7 @@
 #
 # See the slurm.conf man page for more information.
 #
-ClusterName=linux
+ClusterName=fritz
 ControlMachine=slurmctld
 ControlAddr=slurmctld
 #BackupController=
@@ -94,12 +94,12 @@ AccountingStoragePort=6819
 #
 # COMPUTE NODES
-PartitionName=DEFAULT Nodes=node01
-PartitionName=debug Nodes=node01 Default=YES MaxTime=INFINITE State=UP
+# PartitionName=DEFAULT Nodes=f0101
+PartitionName=main Nodes=f0101 Default=YES MaxTime=INFINITE State=UP
 #
 # COMPUTE NODES
 # NodeName=c[1-2] RealMemory=1000 State=UNKNOWN
-NodeName=node01 CPUs=1 Boards=1 SocketsPerBoard=1 CoresPerSocket=1 ThreadsPerCore=1
+NodeName=f0101 CPUs=1 Boards=1 SocketsPerBoard=1 CoresPerSocket=1 ThreadsPerCore=1
 #
 #
 # # PARTITIONS
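+# Note: NodeName must match the hostname of the worker container (f0101)
+# configured in docker-compose.yml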