From ce9995dac7acf19dec01615393938eb08898747c Mon Sep 17 00:00:00 2001 From: Christoph Kluge Date: Thu, 8 Aug 2024 12:29:45 +0200 Subject: [PATCH 01/12] fix: fix wrongly inserted gql request and import path error --- web/frontend/src/Job.root.svelte | 2 +- web/frontend/src/Node.root.svelte | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/web/frontend/src/Job.root.svelte b/web/frontend/src/Job.root.svelte index f991e4f..213898e 100644 --- a/web/frontend/src/Job.root.svelte +++ b/web/frontend/src/Job.root.svelte @@ -75,7 +75,7 @@ duration, numNodes, numHWThreads, numAcc, SMT, exclusive, partition, subCluster, arrayJobId, monitoringStatus, state, walltime, - tags { id, type, name, scope }, + tags { id, type, name }, resources { hostname, hwthreads, accelerators }, metaData, userData { name, email }, diff --git a/web/frontend/src/Node.root.svelte b/web/frontend/src/Node.root.svelte index ad6983b..2d58540 100644 --- a/web/frontend/src/Node.root.svelte +++ b/web/frontend/src/Node.root.svelte @@ -27,7 +27,7 @@ import { init, checkMetricDisabled, - } from "./utils.js"; + } from "./generic/utils.js"; import PlotTable from "./generic/PlotTable.svelte"; import MetricPlot from "./generic/plots/MetricPlot.svelte"; import TimeSelection from "./generic/select/TimeSelection.svelte"; From 561fd41d5d3b9877dc7be3980bda50e4fcb1f46a Mon Sep 17 00:00:00 2001 From: Christoph Kluge Date: Tue, 13 Aug 2024 17:49:28 +0200 Subject: [PATCH 02/12] fix: add accelerator scope to to-be archived scopes - if numAcc > 0 - fixes Add accelerator scope to archive requests #282 --- internal/metricdata/metricdata.go | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/internal/metricdata/metricdata.go b/internal/metricdata/metricdata.go index c826113..eba9dee 100644 --- a/internal/metricdata/metricdata.go +++ b/internal/metricdata/metricdata.go @@ -307,6 +307,10 @@ func ArchiveJob(job *schema.Job, ctx context.Context) (*schema.JobMeta, error) { scopes = append(scopes, schema.MetricScopeCore) } + if job.NumAcc > 0 { + scopes = append(scopes, schema.MetricScopeAccelerator) + } + jobData, err := LoadData(job, allMetrics, scopes, ctx) if err != nil { log.Error("Error wile loading job data for archiving") From 9b6db4684adde310b29edb56d298c3faacbcd1b3 Mon Sep 17 00:00:00 2001 From: Jan Eitzinger Date: Thu, 15 Aug 2024 08:53:49 +0200 Subject: [PATCH 03/12] Refactor: Remove redundant code --- cmd/cc-backend/server.go | 49 +++++++++++----------------------------- 1 file changed, 13 insertions(+), 36 deletions(-) diff --git a/cmd/cc-backend/server.go b/cmd/cc-backend/server.go index 5531415..d2b62e2 100644 --- a/cmd/cc-backend/server.go +++ b/cmd/cc-backend/server.go @@ -38,6 +38,15 @@ var ( apiHandle *api.RestApi ) +func onFailureResponse(rw http.ResponseWriter, r *http.Request, err error) { + rw.Header().Add("Content-Type", "application/json") + rw.WriteHeader(http.StatusUnauthorized) + json.NewEncoder(rw).Encode(map[string]string{ + "status": http.StatusText(http.StatusUnauthorized), + "error": err.Error(), + }) +} + func serverInit() { // Setup the http.Handler/Router used by the server graph.Init() @@ -166,64 +175,32 @@ func serverInit() { return authHandle.AuthApi( // On success; next, - // On failure: JSON Response - func(rw http.ResponseWriter, r *http.Request, err error) { - rw.Header().Add("Content-Type", "application/json") - rw.WriteHeader(http.StatusUnauthorized) - json.NewEncoder(rw).Encode(map[string]string{ - "status": http.StatusText(http.StatusUnauthorized), - "error": err.Error(), - }) - }) + onFailureResponse) }) userapi.Use(func(next http.Handler) http.Handler { return authHandle.AuthUserApi( // On success; next, - // On failure: JSON Response - func(rw http.ResponseWriter, r *http.Request, err error) { - rw.Header().Add("Content-Type", "application/json") - rw.WriteHeader(http.StatusUnauthorized) - json.NewEncoder(rw).Encode(map[string]string{ - "status": http.StatusText(http.StatusUnauthorized), - "error": err.Error(), - }) - }) + onFailureResponse) }) configapi.Use(func(next http.Handler) http.Handler { return authHandle.AuthConfigApi( // On success; next, - // On failure: JSON Response - func(rw http.ResponseWriter, r *http.Request, err error) { - rw.Header().Add("Content-Type", "application/json") - rw.WriteHeader(http.StatusUnauthorized) - json.NewEncoder(rw).Encode(map[string]string{ - "status": http.StatusText(http.StatusUnauthorized), - "error": err.Error(), - }) - }) + onFailureResponse) }) frontendapi.Use(func(next http.Handler) http.Handler { return authHandle.AuthFrontendApi( // On success; next, - // On failure: JSON Response - func(rw http.ResponseWriter, r *http.Request, err error) { - rw.Header().Add("Content-Type", "application/json") - rw.WriteHeader(http.StatusUnauthorized) - json.NewEncoder(rw).Encode(map[string]string{ - "status": http.StatusText(http.StatusUnauthorized), - "error": err.Error(), - }) - }) + onFailureResponse) }) } From ba2f406bc08b9ca75d9b03773269f8ada89c7e2b Mon Sep 17 00:00:00 2001 From: Jan Eitzinger Date: Thu, 15 Aug 2024 09:41:54 +0200 Subject: [PATCH 04/12] Extend sqlite db migration --- .../sqlite3/08_add-footprint.down.sql | 21 +++++++++++++++++++ .../sqlite3/08_add-footprint.up.sql | 10 +++++++++ pkg/schema/job.go | 2 +- 3 files changed, 32 insertions(+), 1 deletion(-) diff --git a/internal/repository/migrations/sqlite3/08_add-footprint.down.sql b/internal/repository/migrations/sqlite3/08_add-footprint.down.sql index e69de29..8c99eb5 100644 --- a/internal/repository/migrations/sqlite3/08_add-footprint.down.sql +++ b/internal/repository/migrations/sqlite3/08_add-footprint.down.sql @@ -0,0 +1,21 @@ +ALTER TABLE job DROP energy; +ALTER TABLE job DROP energy_footprint; +ALTER TABLE job ADD COLUMN flops_any_avg; +ALTER TABLE job ADD COLUMN mem_bw_avg; +ALTER TABLE job ADD COLUMN mem_used_max; +ALTER TABLE job ADD COLUMN load_avg; +ALTER TABLE job ADD COLUMN net_bw_avg; +ALTER TABLE job ADD COLUMN net_data_vol_total; +ALTER TABLE job ADD COLUMN file_bw_avg; +ALTER TABLE job ADD COLUMN file_data_vol_total; + +UPDATE job SET flops_any_avg = json_extract(footprint, '$.flops_any_avg'); +UPDATE job SET mem_bw_avg = json_extract(footprint, '$.mem_bw_avg'); +UPDATE job SET mem_used_max = json_extract(footprint, '$.mem_used_max'); +UPDATE job SET load_avg = json_extract(footprint, '$.cpu_load_avg'); +UPDATE job SET net_bw_avg = json_extract(footprint, '$.net_bw_avg'); +UPDATE job SET net_data_vol_total = json_extract(footprint, '$.net_data_vol_total'); +UPDATE job SET file_bw_avg = json_extract(footprint, '$.file_bw_avg'); +UPDATE job SET file_data_vol_total = json_extract(footprint, '$.file_data_vol_total'); + +ALTER TABLE job DROP footprint; diff --git a/internal/repository/migrations/sqlite3/08_add-footprint.up.sql b/internal/repository/migrations/sqlite3/08_add-footprint.up.sql index 643b87e..e5af149 100644 --- a/internal/repository/migrations/sqlite3/08_add-footprint.up.sql +++ b/internal/repository/migrations/sqlite3/08_add-footprint.up.sql @@ -1,4 +1,5 @@ ALTER TABLE job ADD COLUMN energy REAL NOT NULL DEFAULT 0.0; +ALTER TABLE job ADD COLUMN energy_footprint TEXT DEFAULT NULL; ALTER TABLE job ADD COLUMN footprint TEXT DEFAULT NULL; UPDATE job SET footprint = '{"flops_any_avg": 0.0}'; @@ -6,7 +7,16 @@ UPDATE job SET footprint = json_replace(footprint, '$.flops_any_avg', job.flops_ UPDATE job SET footprint = json_insert(footprint, '$.mem_bw_avg', job.mem_bw_avg); UPDATE job SET footprint = json_insert(footprint, '$.mem_used_max', job.mem_used_max); UPDATE job SET footprint = json_insert(footprint, '$.cpu_load_avg', job.load_avg); +UPDATE job SET footprint = json_insert(footprint, '$.net_bw_avg', job.net_bw_avg); +UPDATE job SET footprint = json_insert(footprint, '$.net_data_vol_total', job.net_data_vol_total); +UPDATE job SET footprint = json_insert(footprint, '$.file_bw_avg', job.file_bw_avg); +UPDATE job SET footprint = json_insert(footprint, '$.file_data_vol_total', job.file_data_vol_total); + ALTER TABLE job DROP flops_any_avg; ALTER TABLE job DROP mem_bw_avg; ALTER TABLE job DROP mem_used_max; ALTER TABLE job DROP load_avg; +ALTER TABLE job DROP net_bw_avg; +ALTER TABLE job DROP net_data_vol_total; +ALTER TABLE job DROP file_bw_avg; +ALTER TABLE job DROP file_data_vol_total; diff --git a/pkg/schema/job.go b/pkg/schema/job.go index 83064c7..2a2ea95 100644 --- a/pkg/schema/job.go +++ b/pkg/schema/job.go @@ -32,7 +32,7 @@ type BaseJob struct { Footprint map[string]float64 `json:"footprint"` MetaData map[string]string `json:"metaData"` ConcurrentJobs JobLinkResultList `json:"concurrentJobs"` - Energy float64 `json:"energy"` + Energy float64 `json:"energy" db:"energy"` ArrayJobId int64 `json:"arrayJobId,omitempty" db:"array_job_id" example:"123000"` Walltime int64 `json:"walltime,omitempty" db:"walltime" example:"86400" minimum:"1"` JobID int64 `json:"jobId" db:"job_id" example:"123000"` From e1faba0ff2334d7eabaa2d4aa566c55ca4c55f78 Mon Sep 17 00:00:00 2001 From: Jan Eitzinger Date: Thu, 15 Aug 2024 10:39:32 +0200 Subject: [PATCH 05/12] Update cluster json schema --- pkg/schema/schemas/cluster.schema.json | 597 +++++++++++++------------ 1 file changed, 316 insertions(+), 281 deletions(-) diff --git a/pkg/schema/schemas/cluster.schema.json b/pkg/schema/schemas/cluster.schema.json index e745f99..81b138a 100644 --- a/pkg/schema/schemas/cluster.schema.json +++ b/pkg/schema/schemas/cluster.schema.json @@ -1,284 +1,319 @@ { - "$schema": "http://json-schema.org/draft/2020-12/schema", - "$id": "embedfs://cluster.schema.json", - "title": "HPC cluster description", - "description": "Meta data information of a HPC cluster", - "type": "object", - "properties": { - "name": { - "description": "The unique identifier of a cluster", - "type": "string" - }, - "metricConfig": { - "description": "Metric specifications", - "type": "array", - "items": { - "type": "object", - "properties": { - "name": { - "description": "Metric name", - "type": "string" - }, - "unit": { - "description": "Metric unit", - "$ref": "embedfs://unit.schema.json" - }, - "scope": { - "description": "Native measurement resolution", - "type": "string" - }, - "timestep": { - "description": "Frequency of timeseries points", - "type": "integer" - }, - "aggregation": { - "description": "How the metric is aggregated", - "type": "string", - "enum": [ - "sum", - "avg" - ] - }, - "peak": { - "description": "Metric peak threshold (Upper metric limit)", - "type": "number" - }, - "normal": { - "description": "Metric normal threshold", - "type": "number" - }, - "caution": { - "description": "Metric caution threshold (Suspicious but does not require immediate action)", - "type": "number" - }, - "alert": { - "description": "Metric alert threshold (Requires immediate action)", - "type": "number" - }, - "subClusters": { - "description": "Array of cluster hardware partition metric thresholds", - "type": "array", - "items": { - "type": "object", - "properties": { - "name": { - "description": "Hardware partition name", - "type": "string" - }, - "peak": { - "type": "number" - }, - "normal": { - "type": "number" - }, - "caution": { - "type": "number" - }, - "alert": { - "type": "number" - }, - "remove": { - "type": "boolean" - } - }, - "required": [ - "name" - ] - } - } - }, - "required": [ - "name", - "unit", - "scope", - "timestep", - "aggregation", - "peak", - "normal", - "caution", - "alert" - ] - }, - "minItems": 1 - }, - "subClusters": { - "description": "Array of cluster hardware partitions", - "type": "array", - "items": { - "type": "object", - "properties": { - "name": { - "description": "Hardware partition name", - "type": "string" - }, - "processorType": { - "description": "Processor type", - "type": "string" - }, - "socketsPerNode": { - "description": "Number of sockets per node", - "type": "integer" - }, - "coresPerSocket": { - "description": "Number of cores per socket", - "type": "integer" - }, - "threadsPerCore": { - "description": "Number of SMT threads per core", - "type": "integer" - }, - "flopRateScalar": { - "description": "Theoretical node peak flop rate for scalar code in GFlops/s", - "type": "object", - "properties": { - "unit": { - "description": "Metric unit", - "$ref": "embedfs://unit.schema.json" - }, - "value": { - "type": "number" - } - } - }, - "flopRateSimd": { - "description": "Theoretical node peak flop rate for SIMD code in GFlops/s", - "type": "object", - "properties": { - "unit": { - "description": "Metric unit", - "$ref": "embedfs://unit.schema.json" - }, - "value": { - "type": "number" - } - } - }, - "memoryBandwidth": { - "description": "Theoretical node peak memory bandwidth in GB/s", - "type": "object", - "properties": { - "unit": { - "description": "Metric unit", - "$ref": "embedfs://unit.schema.json" - }, - "value": { - "type": "number" - } - } - }, - "nodes": { - "description": "Node list expression", - "type": "string" - }, - "topology": { - "description": "Node topology", - "type": "object", - "properties": { - "node": { - "description": "HwTread lists of node", - "type": "array", - "items": { - "type": "integer" - } - }, - "socket": { - "description": "HwTread lists of sockets", - "type": "array", - "items": { - "type": "array", - "items": { - "type": "integer" - } - } - }, - "memoryDomain": { - "description": "HwTread lists of memory domains", - "type": "array", - "items": { - "type": "array", - "items": { - "type": "integer" - } - } - }, - "die": { - "description": "HwTread lists of dies", - "type": "array", - "items": { - "type": "array", - "items": { - "type": "integer" - } - } - }, - "core": { - "description": "HwTread lists of cores", - "type": "array", - "items": { - "type": "array", - "items": { - "type": "integer" - } - } - }, - "accelerators": { - "type": "array", - "description": "List of of accelerator devices", - "items": { - "type": "object", - "properties": { - "id": { - "type": "string", - "description": "The unique device id" - }, - "type": { - "type": "string", - "description": "The accelerator type", - "enum": [ - "Nvidia GPU", - "AMD GPU", - "Intel GPU" - ] - }, - "model": { - "type": "string", - "description": "The accelerator model" - } - }, - "required": [ - "id", - "type", - "model" - ] - } - } - }, - "required": [ - "node", - "socket", - "memoryDomain" - ] - } - }, - "required": [ - "name", - "nodes", - "topology", - "processorType", - "socketsPerNode", - "coresPerSocket", - "threadsPerCore", - "flopRateScalar", - "flopRateSimd", - "memoryBandwidth" - ] - }, - "minItems": 1 - } + "$schema": "http://json-schema.org/draft/2020-12/schema", + "$id": "embedfs://cluster.schema.json", + "title": "HPC cluster description", + "description": "Meta data information of a HPC cluster", + "type": "object", + "properties": { + "name": { + "description": "The unique identifier of a cluster", + "type": "string" }, - "required": [ - "name", - "metricConfig", - "subClusters" - ] + "metricConfig": { + "description": "Metric specifications", + "type": "array", + "items": { + "type": "object", + "properties": { + "name": { + "description": "Metric name", + "type": "string" + }, + "unit": { + "description": "Metric unit", + "$ref": "embedfs://unit.schema.json" + }, + "scope": { + "description": "Native measurement resolution", + "type": "string" + }, + "timestep": { + "description": "Frequency of timeseries points", + "type": "integer" + }, + "aggregation": { + "description": "How the metric is aggregated", + "type": "string", + "enum": [ + "sum", + "avg" + ] + }, + "footprint": { + "description": "Is it a footprint metric and what type", + "type": "string", + "enum": [ + "avg", + "max", + "min" + ] + }, + "energy": { + "description": "Is it used to calculate job energy", + "type": "boolean" + }, + "lowerIsBetter": { + "description": "Is lower better.", + "type": "boolean" + }, + "peak": { + "description": "Metric peak threshold (Upper metric limit)", + "type": "number" + }, + "normal": { + "description": "Metric normal threshold", + "type": "number" + }, + "caution": { + "description": "Metric caution threshold (Suspicious but does not require immediate action)", + "type": "number" + }, + "alert": { + "description": "Metric alert threshold (Requires immediate action)", + "type": "number" + }, + "subClusters": { + "description": "Array of cluster hardware partition metric thresholds", + "type": "array", + "items": { + "type": "object", + "properties": { + "name": { + "description": "Hardware partition name", + "type": "string" + }, + "footprint": { + "description": "Is it a footprint metric and what type. Overwrite global setting", + "type": "string", + "enum": [ + "avg", + "max", + "min" + ] + }, + "energy": { + "description": "Is it used to calculate job energy. Overwrite global", + "type": "boolean" + }, + "lowerIsBetter": { + "description": "Is lower better. Overwrite global", + "type": "boolean" + }, + "peak": { + "type": "number" + }, + "normal": { + "type": "number" + }, + "caution": { + "type": "number" + }, + "alert": { + "type": "number" + }, + "remove": { + "description": "Remove this metric for this subcluster", + "type": "boolean" + } + }, + "required": [ + "name" + ] + } + } + }, + "required": [ + "name", + "unit", + "scope", + "timestep", + "aggregation", + "peak", + "normal", + "caution", + "alert" + ] + }, + "minItems": 1 + }, + "subClusters": { + "description": "Array of cluster hardware partitions", + "type": "array", + "items": { + "type": "object", + "properties": { + "name": { + "description": "Hardware partition name", + "type": "string" + }, + "processorType": { + "description": "Processor type", + "type": "string" + }, + "socketsPerNode": { + "description": "Number of sockets per node", + "type": "integer" + }, + "coresPerSocket": { + "description": "Number of cores per socket", + "type": "integer" + }, + "threadsPerCore": { + "description": "Number of SMT threads per core", + "type": "integer" + }, + "flopRateScalar": { + "description": "Theoretical node peak flop rate for scalar code in GFlops/s", + "type": "object", + "properties": { + "unit": { + "description": "Metric unit", + "$ref": "embedfs://unit.schema.json" + }, + "value": { + "type": "number" + } + } + }, + "flopRateSimd": { + "description": "Theoretical node peak flop rate for SIMD code in GFlops/s", + "type": "object", + "properties": { + "unit": { + "description": "Metric unit", + "$ref": "embedfs://unit.schema.json" + }, + "value": { + "type": "number" + } + } + }, + "memoryBandwidth": { + "description": "Theoretical node peak memory bandwidth in GB/s", + "type": "object", + "properties": { + "unit": { + "description": "Metric unit", + "$ref": "embedfs://unit.schema.json" + }, + "value": { + "type": "number" + } + } + }, + "nodes": { + "description": "Node list expression", + "type": "string" + }, + "topology": { + "description": "Node topology", + "type": "object", + "properties": { + "node": { + "description": "HwTread lists of node", + "type": "array", + "items": { + "type": "integer" + } + }, + "socket": { + "description": "HwTread lists of sockets", + "type": "array", + "items": { + "type": "array", + "items": { + "type": "integer" + } + } + }, + "memoryDomain": { + "description": "HwTread lists of memory domains", + "type": "array", + "items": { + "type": "array", + "items": { + "type": "integer" + } + } + }, + "die": { + "description": "HwTread lists of dies", + "type": "array", + "items": { + "type": "array", + "items": { + "type": "integer" + } + } + }, + "core": { + "description": "HwTread lists of cores", + "type": "array", + "items": { + "type": "array", + "items": { + "type": "integer" + } + } + }, + "accelerators": { + "type": "array", + "description": "List of of accelerator devices", + "items": { + "type": "object", + "properties": { + "id": { + "type": "string", + "description": "The unique device id" + }, + "type": { + "type": "string", + "description": "The accelerator type", + "enum": [ + "Nvidia GPU", + "AMD GPU", + "Intel GPU" + ] + }, + "model": { + "type": "string", + "description": "The accelerator model" + } + }, + "required": [ + "id", + "type", + "model" + ] + } + } + }, + "required": [ + "node", + "socket", + "memoryDomain" + ] + } + }, + "required": [ + "name", + "nodes", + "topology", + "processorType", + "socketsPerNode", + "coresPerSocket", + "threadsPerCore", + "flopRateScalar", + "flopRateSimd", + "memoryBandwidth" + ] + }, + "minItems": 1 + } + }, + "required": [ + "name", + "metricConfig", + "subClusters" + ] } From 5c99f5f8bbb44366dc13d0250bff1087dfa2e0cd Mon Sep 17 00:00:00 2001 From: Jan Eitzinger Date: Thu, 15 Aug 2024 12:35:11 +0200 Subject: [PATCH 06/12] Only add footprint columns if not 0 --- .../repository/migrations/sqlite3/08_add-footprint.up.sql | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/internal/repository/migrations/sqlite3/08_add-footprint.up.sql b/internal/repository/migrations/sqlite3/08_add-footprint.up.sql index e5af149..0ffbf37 100644 --- a/internal/repository/migrations/sqlite3/08_add-footprint.up.sql +++ b/internal/repository/migrations/sqlite3/08_add-footprint.up.sql @@ -7,10 +7,10 @@ UPDATE job SET footprint = json_replace(footprint, '$.flops_any_avg', job.flops_ UPDATE job SET footprint = json_insert(footprint, '$.mem_bw_avg', job.mem_bw_avg); UPDATE job SET footprint = json_insert(footprint, '$.mem_used_max', job.mem_used_max); UPDATE job SET footprint = json_insert(footprint, '$.cpu_load_avg', job.load_avg); -UPDATE job SET footprint = json_insert(footprint, '$.net_bw_avg', job.net_bw_avg); -UPDATE job SET footprint = json_insert(footprint, '$.net_data_vol_total', job.net_data_vol_total); -UPDATE job SET footprint = json_insert(footprint, '$.file_bw_avg', job.file_bw_avg); -UPDATE job SET footprint = json_insert(footprint, '$.file_data_vol_total', job.file_data_vol_total); +UPDATE job SET footprint = json_insert(footprint, '$.net_bw_avg', job.net_bw_avg) IF job.net_bw_avg != 0; +UPDATE job SET footprint = json_insert(footprint, '$.net_data_vol_total', job.net_data_vol_total) IF job.net_data_vol_total != 0; +UPDATE job SET footprint = json_insert(footprint, '$.file_bw_avg', job.file_bw_avg) IF job.file_bw_avg != 0; +UPDATE job SET footprint = json_insert(footprint, '$.file_data_vol_total', job.file_data_vol_total) IF job.file_data_vol_total != 0; ALTER TABLE job DROP flops_any_avg; ALTER TABLE job DROP mem_bw_avg; From d6a88896d059023eeac8dbad415a0ce065f328fe Mon Sep 17 00:00:00 2001 From: Jan Eitzinger Date: Thu, 15 Aug 2024 12:36:21 +0200 Subject: [PATCH 07/12] Refactor: Reduce struct memory size --- pkg/schema/cluster.go | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pkg/schema/cluster.go b/pkg/schema/cluster.go index a77bd32..e9aa178 100644 --- a/pkg/schema/cluster.go +++ b/pkg/schema/cluster.go @@ -47,11 +47,11 @@ type SubCluster struct { type SubClusterConfig struct { Name string `json:"name"` + Footprint string `json:"footprint,omitempty"` Peak float64 `json:"peak"` Normal float64 `json:"normal"` Caution float64 `json:"caution"` Alert float64 `json:"alert"` - Footprint string `json:"footprint,omitempty"` Remove bool `json:"remove"` LowerIsBetter bool `json:"lowerIsBetter"` Energy bool `json:"energy"` @@ -62,14 +62,14 @@ type MetricConfig struct { Name string `json:"name"` Scope MetricScope `json:"scope"` Aggregation string `json:"aggregation"` + Footprint string `json:"footprint,omitempty"` SubClusters []*SubClusterConfig `json:"subClusters,omitempty"` - Timestep int `json:"timestep"` Peak float64 `json:"peak"` Normal float64 `json:"normal"` Caution float64 `json:"caution"` Alert float64 `json:"alert"` + Timestep int `json:"timestep"` LowerIsBetter bool `json:"lowerIsBetter"` - Footprint string `json:"footprint,omitempty"` Energy bool `json:"energy"` } From 5e074dad1029062a39241daaa8319f20f5f36736 Mon Sep 17 00:00:00 2001 From: Jan Eitzinger Date: Thu, 15 Aug 2024 12:39:14 +0200 Subject: [PATCH 08/12] Resolve error in migration --- .../repository/migrations/sqlite3/08_add-footprint.up.sql | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/internal/repository/migrations/sqlite3/08_add-footprint.up.sql b/internal/repository/migrations/sqlite3/08_add-footprint.up.sql index 0ffbf37..93f0659 100644 --- a/internal/repository/migrations/sqlite3/08_add-footprint.up.sql +++ b/internal/repository/migrations/sqlite3/08_add-footprint.up.sql @@ -7,10 +7,10 @@ UPDATE job SET footprint = json_replace(footprint, '$.flops_any_avg', job.flops_ UPDATE job SET footprint = json_insert(footprint, '$.mem_bw_avg', job.mem_bw_avg); UPDATE job SET footprint = json_insert(footprint, '$.mem_used_max', job.mem_used_max); UPDATE job SET footprint = json_insert(footprint, '$.cpu_load_avg', job.load_avg); -UPDATE job SET footprint = json_insert(footprint, '$.net_bw_avg', job.net_bw_avg) IF job.net_bw_avg != 0; -UPDATE job SET footprint = json_insert(footprint, '$.net_data_vol_total', job.net_data_vol_total) IF job.net_data_vol_total != 0; -UPDATE job SET footprint = json_insert(footprint, '$.file_bw_avg', job.file_bw_avg) IF job.file_bw_avg != 0; -UPDATE job SET footprint = json_insert(footprint, '$.file_data_vol_total', job.file_data_vol_total) IF job.file_data_vol_total != 0; +UPDATE job SET footprint = json_insert(footprint, '$.net_bw_avg', job.net_bw_avg) WHERE job.net_bw_avg != 0; +UPDATE job SET footprint = json_insert(footprint, '$.net_data_vol_total', job.net_data_vol_total) WHERE job.net_data_vol_total != 0; +UPDATE job SET footprint = json_insert(footprint, '$.file_bw_avg', job.file_bw_avg) WHERE job.file_bw_avg != 0; +UPDATE job SET footprint = json_insert(footprint, '$.file_data_vol_total', job.file_data_vol_total) WHERE job.file_data_vol_total != 0; ALTER TABLE job DROP flops_any_avg; ALTER TABLE job DROP mem_bw_avg; From 49e0a2c0550c208264747c8dce3c5812b9a0f921 Mon Sep 17 00:00:00 2001 From: Christoph Kluge Date: Thu, 15 Aug 2024 14:33:04 +0200 Subject: [PATCH 09/12] fix: add compatibility for footprint metrics without config --- .../src/generic/helper/JobFootprint.svelte | 222 ++++++++++-------- 1 file changed, 129 insertions(+), 93 deletions(-) diff --git a/web/frontend/src/generic/helper/JobFootprint.svelte b/web/frontend/src/generic/helper/JobFootprint.svelte index 82818e3..4e1abb0 100644 --- a/web/frontend/src/generic/helper/JobFootprint.svelte +++ b/web/frontend/src/generic/helper/JobFootprint.svelte @@ -67,62 +67,74 @@ export let height = "310px"; const footprintData = job?.footprint?.map((jf) => { - // Unit const fmc = getContext("getMetricConfig")(job.cluster, job.subCluster, jf.name); - const unit = (fmc?.unit?.prefix ? fmc.unit.prefix : "") + (fmc?.unit?.base ? fmc.unit.base : "") + if (fmc) { + // Unit + const unit = (fmc?.unit?.prefix ? fmc.unit.prefix : "") + (fmc?.unit?.base ? fmc.unit.base : "") - // Threshold / -Differences - const fmt = findJobThresholds(job, fmc); - if (jf.name === "flops_any") fmt.peak = round(fmt.peak * 0.85, 0); + // Threshold / -Differences + const fmt = findJobThresholds(job, fmc); + if (jf.name === "flops_any") fmt.peak = round(fmt.peak * 0.85, 0); - // Define basic data -> Value: Use as Provided - const fmBase = { - name: jf.name + ' (' + jf.stat + ')', - avg: jf.value, - unit: unit, - max: fmt.peak, - dir: fmc.lowerIsBetter - }; + // Define basic data -> Value: Use as Provided + const fmBase = { + name: jf.name + ' (' + jf.stat + ')', + avg: jf.value, + unit: unit, + max: fmt.peak, + dir: fmc.lowerIsBetter + }; - if (evalFootprint(jf.value, fmt, fmc.lowerIsBetter, "alert")) { + if (evalFootprint(jf.value, fmt, fmc.lowerIsBetter, "alert")) { + return { + ...fmBase, + color: "danger", + message: `Metric average way ${fmc.lowerIsBetter ? "above" : "below"} expected normal thresholds.`, + impact: 3 + }; + } else if (evalFootprint(jf.value, fmt, fmc.lowerIsBetter, "caution")) { + return { + ...fmBase, + color: "warning", + message: `Metric average ${fmc.lowerIsBetter ? "above" : "below"} expected normal thresholds.`, + impact: 2, + }; + } else if (evalFootprint(jf.value, fmt, fmc.lowerIsBetter, "normal")) { + return { + ...fmBase, + color: "success", + message: "Metric average within expected thresholds.", + impact: 1, + }; + } else if (evalFootprint(jf.value, fmt, fmc.lowerIsBetter, "peak")) { + return { + ...fmBase, + color: "info", + message: + "Metric average above expected normal thresholds: Check for artifacts recommended.", + impact: 0, + }; + } else { + return { + ...fmBase, + color: "secondary", + message: + "Metric average above expected peak threshold: Check for artifacts!", + impact: -1, + }; + } + } else { // No matching metric config: display as single value return { - ...fmBase, - color: "danger", - message: `Metric average way ${fmc.lowerIsBetter ? "above" : "below"} expected normal thresholds.`, - impact: 3 - }; - } else if (evalFootprint(jf.value, fmt, fmc.lowerIsBetter, "caution")) { - return { - ...fmBase, - color: "warning", - message: `Metric average ${fmc.lowerIsBetter ? "above" : "below"} expected normal thresholds.`, - impact: 2, - }; - } else if (evalFootprint(jf.value, fmt, fmc.lowerIsBetter, "normal")) { - return { - ...fmBase, - color: "success", - message: "Metric average within expected thresholds.", - impact: 1, - }; - } else if (evalFootprint(jf.value, fmt, fmc.lowerIsBetter, "peak")) { - return { - ...fmBase, - color: "info", + name: jf.name + ' (' + jf.stat + ')', + avg: jf.value, message: - "Metric average above expected normal thresholds: Check for artifacts recommended.", - impact: 0, - }; - } else { - return { - ...fmBase, - color: "secondary", - message: - "Metric average above expected peak threshold: Check for artifacts!", - impact: -1, + `No config for metric ${jf.name} found.`, + impact: 4, }; } - }); + }).sort(function (a, b) { // Sort by impact value primarily, within impact sort name alphabetically + return a.impact - b.impact || ((a.name > b.name) ? 1 : ((b.name > a.name) ? -1 : 0)); + });; function evalFootprint(mean, thresholds, lowerIsBetter, level) { // Handle Metrics in which less value is better @@ -159,37 +171,76 @@ {/if} {#each footprintData as fpd, index} -
-
 {fpd.name}
- -
-
- - {#if fpd.impact === 3 || fpd.impact === -1} - - {:else if fpd.impact === 2} - - {/if} - - {#if fpd.impact === 3} - - {:else if fpd.impact === 2} - - {:else if fpd.impact === 1} - - {:else if fpd.impact === 0} - - {:else if fpd.impact === -1} - - {/if} + {#if fpd.impact !== 4} +
+
 {fpd.name}
+ +
+
+ + {#if fpd.impact === 3 || fpd.impact === -1} + + {:else if fpd.impact === 2} + + {/if} + + {#if fpd.impact === 3} + + {:else if fpd.impact === 2} + + {:else if fpd.impact === 1} + + {:else if fpd.impact === 0} + + {:else if fpd.impact === -1} + + {/if} +
+
+ + {fpd.avg} / {fpd.max} + {fpd.unit}   +
+ {fpd.message} +
+ + {#if fpd.dir} + + + + {/if} + + + + {#if !fpd.dir} + + + + {/if} + + {:else} +
- - {fpd.avg} / {fpd.max} - {fpd.unit}   +  {fpd.name} +
+
+
+ +
+
+ {fpd.avg}  +
{fpd.message} -
- - {#if fpd.dir} - - - - {/if} - - - - {#if !fpd.dir} - - - - {/if} - + {/if} {/each} {#if job?.metaData?.message}
From a8a27c9b51cb9002498634cf625423e0896c985c Mon Sep 17 00:00:00 2001 From: Jan Eitzinger Date: Mon, 19 Aug 2024 12:11:53 +0200 Subject: [PATCH 10/12] Add project index to job table --- .../repository/migrations/sqlite3/08_add-footprint.up.sql | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/internal/repository/migrations/sqlite3/08_add-footprint.up.sql b/internal/repository/migrations/sqlite3/08_add-footprint.up.sql index 93f0659..bcd6494 100644 --- a/internal/repository/migrations/sqlite3/08_add-footprint.up.sql +++ b/internal/repository/migrations/sqlite3/08_add-footprint.up.sql @@ -1,8 +1,12 @@ +CREATE INDEX IF NOT EXISTS job_by_project ON job (project); +CREATE INDEX IF NOT EXISTS job_list_projects ON job (project, job_state); + ALTER TABLE job ADD COLUMN energy REAL NOT NULL DEFAULT 0.0; ALTER TABLE job ADD COLUMN energy_footprint TEXT DEFAULT NULL; ALTER TABLE job ADD COLUMN footprint TEXT DEFAULT NULL; UPDATE job SET footprint = '{"flops_any_avg": 0.0}'; + UPDATE job SET footprint = json_replace(footprint, '$.flops_any_avg', job.flops_any_avg); UPDATE job SET footprint = json_insert(footprint, '$.mem_bw_avg', job.mem_bw_avg); UPDATE job SET footprint = json_insert(footprint, '$.mem_used_max', job.mem_used_max); From 1758275f115cef35c17ca255ade6fbd4d3db4c11 Mon Sep 17 00:00:00 2001 From: Christoph Kluge Date: Thu, 22 Aug 2024 14:01:27 +0200 Subject: [PATCH 11/12] fix: fix getMetricConfigDeep util function - threw error for mismatching metric availability between clusters --- web/frontend/src/generic/utils.js | 43 +++++++++++++++---------------- 1 file changed, 21 insertions(+), 22 deletions(-) diff --git a/web/frontend/src/generic/utils.js b/web/frontend/src/generic/utils.js index bb63a4f..3aaafa1 100644 --- a/web/frontend/src/generic/utils.js +++ b/web/frontend/src/generic/utils.js @@ -301,7 +301,7 @@ export function stickyHeader(datatableHeaderSelector, updatePading) { onDestroy(() => document.removeEventListener("scroll", onscroll)); } -export function checkMetricDisabled(m, c, s) { //[m]etric, [c]luster, [s]ubcluster +export function checkMetricDisabled(m, c, s) { // [m]etric, [c]luster, [s]ubcluster const metrics = getContext("globalMetrics"); const result = metrics?.find((gm) => gm.name === m)?.availability?.find((av) => av.cluster === c)?.subClusters?.includes(s) return !result @@ -309,23 +309,22 @@ export function checkMetricDisabled(m, c, s) { //[m]etric, [c]luster, [s]ubclust export function getStatsItems() { // console.time('stats') - // console.log('getStatsItems ...') const globalMetrics = getContext("globalMetrics") const result = globalMetrics.map((gm) => { if (gm?.footprint) { - // Footprint contains suffix naming the used stat-type // console.time('deep') - // console.log('Deep Config for', gm.name) const mc = getMetricConfigDeep(gm.name, null, null) // console.timeEnd('deep') - return { - field: gm.name + '_' + gm.footprint, - text: gm.name + ' (' + gm.footprint + ')', - metric: gm.name, - from: 0, - to: mc.peak, - peak: mc.peak, - enabled: false + if (mc) { + return { + field: gm.name + '_' + gm.footprint, + text: gm.name + ' (' + gm.footprint + ')', + metric: gm.name, + from: 0, + to: mc.peak, + peak: mc.peak, + enabled: false + } } } return null @@ -336,11 +335,9 @@ export function getStatsItems() { export function getSortItems() { //console.time('sort') - //console.log('getSortItems ...') const globalMetrics = getContext("globalMetrics") const result = globalMetrics.map((gm) => { if (gm?.footprint) { - // Footprint contains suffix naming the used stat-type return { field: gm.name + '_' + gm.footprint, type: 'foot', @@ -357,21 +354,22 @@ export function getSortItems() { function getMetricConfigDeep(metric, cluster, subCluster) { const clusters = getContext("clusters"); if (cluster != null) { - let c = clusters.find((c) => c.name == cluster); + const c = clusters.find((c) => c.name == cluster); if (subCluster != null) { - let sc = c.subClusters.find((sc) => sc.name == subCluster); + const sc = c.subClusters.find((sc) => sc.name == subCluster); return sc.metricConfig.find((mc) => mc.name == metric) } else { let result; for (let sc of c.subClusters) { const mc = sc.metricConfig.find((mc) => mc.name == metric) - if (result) { // If lowerIsBetter: Peak is still maximum value, no special case required + if (result && mc) { // update result; If lowerIsBetter: Peak is still maximum value, no special case required result.alert = (mc.alert > result.alert) ? mc.alert : result.alert result.caution = (mc.caution > result.caution) ? mc.caution : result.caution result.normal = (mc.normal > result.normal) ? mc.normal : result.normal result.peak = (mc.peak > result.peak) ? mc.peak : result.peak - } else { - if (mc) result = {...mc}; + } else if (mc) { + // start new result + result = {...mc}; } } return result @@ -381,13 +379,14 @@ function getMetricConfigDeep(metric, cluster, subCluster) { for (let c of clusters) { for (let sc of c.subClusters) { const mc = sc.metricConfig.find((mc) => mc.name == metric) - if (result) { // If lowerIsBetter: Peak is still maximum value, no special case required + if (result && mc) { // update result; If lowerIsBetter: Peak is still maximum value, no special case required result.alert = (mc.alert > result.alert) ? mc.alert : result.alert result.caution = (mc.caution > result.caution) ? mc.caution : result.caution result.normal = (mc.normal > result.normal) ? mc.normal : result.normal result.peak = (mc.peak > result.peak) ? mc.peak : result.peak - } else { - if (mc) result = {...mc}; + } else if (mc) { + // Start new result + result = {...mc}; } } } From 084f89fa32b55467030f452b227ffd96adaefd35 Mon Sep 17 00:00:00 2001 From: Christoph Kluge Date: Thu, 22 Aug 2024 14:46:27 +0200 Subject: [PATCH 12/12] fix: fix svelte source paths in makefile --- Makefile | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/Makefile b/Makefile index 508c9fb..b673e79 100644 --- a/Makefile +++ b/Makefile @@ -22,11 +22,21 @@ SVELTE_COMPONENTS = status \ header SVELTE_TARGETS = $(addprefix $(FRONTEND)/public/build/,$(addsuffix .js, $(SVELTE_COMPONENTS))) -SVELTE_SRC = $(wildcard $(FRONTEND)/src/*.svelte) \ - $(wildcard $(FRONTEND)/src/*.js) \ - $(wildcard $(FRONTEND)/src/filters/*.svelte) \ - $(wildcard $(FRONTEND)/src/plots/*.svelte) \ - $(wildcard $(FRONTEND)/src/joblist/*.svelte) +SVELTE_SRC = $(wildcard $(FRONTEND)/src/*.svelte) \ + $(wildcard $(FRONTEND)/src/*.js) \ + $(wildcard $(FRONTEND)/src/analysis/*.svelte) \ + $(wildcard $(FRONTEND)/src/config/*.svelte) \ + $(wildcard $(FRONTEND)/src/config/admin/*.svelte) \ + $(wildcard $(FRONTEND)/src/config/user/*.svelte) \ + $(wildcard $(FRONTEND)/src/generic/*.js) \ + $(wildcard $(FRONTEND)/src/generic/*.svelte) \ + $(wildcard $(FRONTEND)/src/generic/filters/*.svelte) \ + $(wildcard $(FRONTEND)/src/generic/plots/*.svelte) \ + $(wildcard $(FRONTEND)/src/generic/joblist/*.svelte) \ + $(wildcard $(FRONTEND)/src/generic/helper/*.svelte) \ + $(wildcard $(FRONTEND)/src/generic/select/*.svelte) \ + $(wildcard $(FRONTEND)/src/header/*.svelte) \ + $(wildcard $(FRONTEND)/src/job/*.svelte) .PHONY: clean distclean test tags frontend swagger graphql $(TARGET)