From 1031b3eb79caf6c6698f66e6c2c4b6edaec86e81 Mon Sep 17 00:00:00 2001 From: Christoph Kluge Date: Fri, 28 Feb 2025 13:06:40 +0100 Subject: [PATCH 01/27] fix: user and status view histogram selection - correctly loads selection for selected cluster - applies availability for selected cluster --- web/frontend/src/Status.root.svelte | 4 ++-- web/frontend/src/User.root.svelte | 4 ++-- .../generic/select/HistogramSelection.svelte | 22 +++++++++++++------ 3 files changed, 19 insertions(+), 11 deletions(-) diff --git a/web/frontend/src/Status.root.svelte b/web/frontend/src/Status.root.svelte index a44a962..63a69f5 100644 --- a/web/frontend/src/Status.root.svelte +++ b/web/frontend/src/Status.root.svelte @@ -76,8 +76,8 @@ let isHistogramSelectionOpen = false; $: metricsInHistograms = cluster - ? ccconfig[`user_view_histogramMetrics:${cluster}`] || [] - : ccconfig.user_view_histogramMetrics || []; + ? ccconfig[`user_view_histogramMetrics:${cluster}`] || ( ccconfig['user_view_histogramMetrics'] || [] ) + : ccconfig['user_view_histogramMetrics'] || []; const client = getContextClient(); // Note: nodeMetrics are requested on configured $timestep resolution diff --git a/web/frontend/src/User.root.svelte b/web/frontend/src/User.root.svelte index fae972b..77c4e01 100644 --- a/web/frontend/src/User.root.svelte +++ b/web/frontend/src/User.root.svelte @@ -69,8 +69,8 @@ let metricBinOptions = [10, 20, 50, 100]; $: metricsInHistograms = selectedCluster - ? ccconfig[`user_view_histogramMetrics:${selectedCluster}`] || [] - : ccconfig.user_view_histogramMetrics || []; + ? ccconfig[`user_view_histogramMetrics:${selectedCluster}`] || ( ccconfig['user_view_histogramMetrics'] || [] ) + : ccconfig['user_view_histogramMetrics'] || []; const client = getContextClient(); $: stats = queryStore({ diff --git a/web/frontend/src/generic/select/HistogramSelection.svelte b/web/frontend/src/generic/select/HistogramSelection.svelte index 4e38123..48971b0 100644 --- a/web/frontend/src/generic/select/HistogramSelection.svelte +++ b/web/frontend/src/generic/select/HistogramSelection.svelte @@ -27,15 +27,23 @@ const client = getContextClient(); const initialized = getContext("initialized"); - let availableMetrics = [] - function loadHistoMetrics(isInitialized) { - if (!isInitialized) return; - const rawAvailableMetrics = getContext("globalMetrics").filter((gm) => gm?.footprint).map((fgm) => { return fgm.name }) - availableMetrics = [...rawAvailableMetrics] + function loadHistoMetrics(isInitialized, thisCluster) { + if (!isInitialized) return []; + + if (!thisCluster) { + return getContext("globalMetrics") + .filter((gm) => gm?.footprint) + .map((fgm) => { return fgm.name }) + } else { + return getContext("globalMetrics") + .filter((gm) => gm?.availability.find((av) => av.cluster == thisCluster)) + .filter((agm) => agm?.footprint) + .map((afgm) => { return afgm.name }) + } } - let pendingMetrics = [...metricsInHistograms]; // Copy + $: pendingMetrics = [...metricsInHistograms]; // Copy on change from above const updateConfigurationMutation = ({ name, value }) => { return mutationStore({ @@ -71,7 +79,7 @@ }); } - $: loadHistoMetrics($initialized); + $: availableMetrics = loadHistoMetrics($initialized, cluster); From 5ce03c2db3f07a496ec7295c10a680542c87fa6d Mon Sep 17 00:00:00 2001 From: Christoph Kluge Date: Fri, 28 Feb 2025 13:08:32 +0100 Subject: [PATCH 02/27] add metric selection count info to job view --- web/frontend/src/Job.root.svelte | 6 ++++-- web/frontend/src/job/StatsTable.svelte | 23 +++++++++++++++-------- 2 files
changed, 19 insertions(+), 10 deletions(-) diff --git a/web/frontend/src/Job.root.svelte b/web/frontend/src/Job.root.svelte index a384e32..43d4f10 100644 --- a/web/frontend/src/Job.root.svelte +++ b/web/frontend/src/Job.root.svelte @@ -58,7 +58,8 @@ let plots = {}, statsTable - let missingMetrics = [], + let availableMetrics = new Set(), + missingMetrics = [], missingHosts = [], somethingMissing = false; @@ -293,7 +294,7 @@ {#if $initq.data} {/if} @@ -431,6 +432,7 @@ configName="job_view_selectedMetrics" bind:metrics={selectedMetrics} bind:isOpen={isMetricsSelectionOpen} + bind:allMetrics={availableMetrics} /> {/if} diff --git a/web/frontend/src/job/StatsTable.svelte b/web/frontend/src/job/StatsTable.svelte index d68d237..21d9b3b 100644 --- a/web/frontend/src/job/StatsTable.svelte +++ b/web/frontend/src/job/StatsTable.svelte @@ -18,6 +18,8 @@ InputGroup, InputGroupText, Icon, + Row, + Col } from "@sveltestrap/sveltestrap"; import { maxScope } from "../generic/utils.js"; import StatsTableEntry from "./StatsTableEntry.svelte"; @@ -26,7 +28,7 @@ export let job; export let jobMetrics; - const allMetrics = [...new Set(jobMetrics.map((m) => m.name))].sort() + const sortedJobMetrics = [...new Set(jobMetrics.map((m) => m.name))].sort() const scopesForMetric = (metric) => jobMetrics.filter((jm) => jm.name == metric).map((jm) => jm.scope); @@ -34,11 +36,12 @@ selectedScopes = {}, sorting = {}, isMetricSelectionOpen = false, + availableMetrics = new Set(), selectedMetrics = getContext("cc-config")[`job_view_nodestats_selectedMetrics:${job.cluster}`] || getContext("cc-config")["job_view_nodestats_selectedMetrics"]; - for (let metric of allMetrics) { + for (let metric of sortedJobMetrics) { // Not Exclusive or Multi-Node: get maxScope directly (mostly: node) // -> Else: Load smallest available granularity as default as per availability const availableScopes = scopesForMetric(metric); @@ -95,15 +98,19 @@ }; + + + + + +
- + {#if groupSelection.key == "user"} {:else} diff --git a/web/frontend/src/Status.root.svelte b/web/frontend/src/Status.root.svelte index 63a69f5..1249e0c 100644 --- a/web/frontend/src/Status.root.svelte +++ b/web/frontend/src/Status.root.svelte @@ -177,6 +177,7 @@ groupBy: USER ) { id + name totalJobs totalNodes totalCores @@ -518,7 +519,7 @@ From 0fe0461340aac88efb01d5ef9689856ebcdb2ab4 Mon Sep 17 00:00:00 2001 From: Christoph Kluge Date: Fri, 28 Feb 2025 14:00:27 +0100 Subject: [PATCH 06/27] remove conflicting variable layer in metric histo select --- web/frontend/src/Status.root.svelte | 12 ++++++------ web/frontend/src/User.root.svelte | 12 ++++++------ .../src/generic/select/HistogramSelection.svelte | 12 ++++-------- 3 files changed, 16 insertions(+), 20 deletions(-) diff --git a/web/frontend/src/Status.root.svelte b/web/frontend/src/Status.root.svelte index 1249e0c..a310421 100644 --- a/web/frontend/src/Status.root.svelte +++ b/web/frontend/src/Status.root.svelte @@ -75,7 +75,7 @@ ); let isHistogramSelectionOpen = false; - $: metricsInHistograms = cluster + $: selectedHistograms = cluster ? ccconfig[`user_view_histogramMetrics:${cluster}`] || ( ccconfig['user_view_histogramMetrics'] || [] ) : ccconfig['user_view_histogramMetrics'] || []; @@ -90,7 +90,7 @@ $metrics: [String!] $from: Time! $to: Time! - $metricsInHistograms: [String!] + $selectedHistograms: [String!] ) { nodeMetrics( cluster: $cluster @@ -116,7 +116,7 @@ } } - stats: jobsStatistics(filter: $filter, metrics: $metricsInHistograms) { + stats: jobsStatistics(filter: $filter, metrics: $selectedHistograms) { histDuration { count value @@ -157,7 +157,7 @@ from: from.toISOString(), to: to.toISOString(), filter: [{ state: ["running"] }, { cluster: { eq: cluster } }], - metricsInHistograms: metricsInHistograms, + selectedHistograms: selectedHistograms, }, }); @@ -653,7 +653,7 @@ - {#if metricsInHistograms} + {#if selectedHistograms} {#key $mainQuery.data.stats[0].histMetrics} diff --git a/web/frontend/src/User.root.svelte b/web/frontend/src/User.root.svelte index 77c4e01..0e6a5b8 100644 --- a/web/frontend/src/User.root.svelte +++ b/web/frontend/src/User.root.svelte @@ -68,7 +68,7 @@ let durationBinOptions = ["1m","10m","1h","6h","12h"]; let metricBinOptions = [10, 20, 50, 100]; - $: metricsInHistograms = selectedCluster + $: selectedHistograms = selectedCluster ? 
ccconfig[`user_view_histogramMetrics:${selectedCluster}`] || ( ccconfig['user_view_histogramMetrics'] || [] ) : ccconfig['user_view_histogramMetrics'] || []; @@ -76,8 +76,8 @@ $: stats = queryStore({ client: client, query: gql` - query ($jobFilters: [JobFilter!]!, $metricsInHistograms: [String!], $numDurationBins: String, $numMetricBins: Int) { - jobsStatistics(filter: $jobFilters, metrics: $metricsInHistograms, numDurationBins: $numDurationBins , numMetricBins: $numMetricBins ) { + query ($jobFilters: [JobFilter!]!, $selectedHistograms: [String!], $numDurationBins: String, $numMetricBins: Int) { + jobsStatistics(filter: $jobFilters, metrics: $selectedHistograms, numDurationBins: $numDurationBins , numMetricBins: $numMetricBins ) { totalJobs shortJobs totalWalltime @@ -104,7 +104,7 @@ } } `, - variables: { jobFilters, metricsInHistograms, numDurationBins, numMetricBins }, + variables: { jobFilters, selectedHistograms, numDurationBins, numMetricBins }, }); onMount(() => filterComponent.updateFilters()); @@ -290,7 +290,7 @@ -{#if metricsInHistograms?.length > 0} +{#if selectedHistograms?.length > 0} {#if $stats.error} @@ -357,6 +357,6 @@ diff --git a/web/frontend/src/generic/select/HistogramSelection.svelte b/web/frontend/src/generic/select/HistogramSelection.svelte index 48971b0..604fc95 100644 --- a/web/frontend/src/generic/select/HistogramSelection.svelte +++ b/web/frontend/src/generic/select/HistogramSelection.svelte @@ -3,7 +3,7 @@ Properties: - `cluster String`: Currently selected cluster - - `metricsInHistograms [String]`: The currently selected metrics to display as histogram + - `selectedHistograms [String]`: The currently selected metrics to display as histogram - ìsOpen Bool`: Is selection opened --> @@ -21,13 +21,12 @@ import { gql, getContextClient, mutationStore } from "@urql/svelte"; export let cluster; - export let metricsInHistograms; + export let selectedHistograms; export let isOpen; const client = getContextClient(); const initialized = getContext("initialized"); - function loadHistoMetrics(isInitialized, thisCluster) { if (!isInitialized) return []; @@ -43,8 +42,6 @@ } } - $: pendingMetrics = [...metricsInHistograms]; // Copy on change from above - const updateConfigurationMutation = ({ name, value }) => { return mutationStore({ client: client, @@ -69,13 +66,12 @@ } function closeAndApply() { - metricsInHistograms = [...pendingMetrics]; // Set for parent isOpen = !isOpen; updateConfiguration({ name: cluster ? 
`user_view_histogramMetrics:${cluster}` : "user_view_histogramMetrics", - value: metricsInHistograms, + value: selectedHistograms, }); } @@ -89,7 +85,7 @@ {#each availableMetrics as metric (metric)} - + {metric} {/each} From c661baf058e4de122d600b51ab7ecae7a6be13bf Mon Sep 17 00:00:00 2001 From: Jan Eitzinger Date: Fri, 28 Feb 2025 14:36:19 +0100 Subject: [PATCH 07/27] Load new default metrics config from working directory --- internal/config/default_metrics.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/internal/config/default_metrics.go b/internal/config/default_metrics.go index 83015d4..b0a0cc5 100644 --- a/internal/config/default_metrics.go +++ b/internal/config/default_metrics.go @@ -16,7 +16,7 @@ type DefaultMetricsConfig struct { } func LoadDefaultMetricsConfig() (*DefaultMetricsConfig, error) { - filePath := "configs/default_metrics.json" + filePath := "default_metrics.json" if _, err := os.Stat(filePath); os.IsNotExist(err) { return nil, nil } From b31aea7bc5492da69d58c00cc66742d04be17579 Mon Sep 17 00:00:00 2001 From: Christoph Kluge Date: Fri, 28 Feb 2025 14:40:27 +0100 Subject: [PATCH 08/27] revert back to using globalMetrics in jobView metric default select --- web/frontend/src/Job.root.svelte | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/web/frontend/src/Job.root.svelte b/web/frontend/src/Job.root.svelte index 43d4f10..b641a43 100644 --- a/web/frontend/src/Job.root.svelte +++ b/web/frontend/src/Job.root.svelte @@ -129,7 +129,12 @@ const pendingMetrics = [ ...(ccconfig[`job_view_selectedMetrics:${job.cluster}`] || - ccconfig[`job_view_selectedMetrics`] + $initq.data.globalMetrics.reduce((names, gm) => { + if (gm.availability.find((av) => av.cluster === job.cluster)) { + names.push(gm.name); + } + return names; + }, []) ), ...(ccconfig[`job_view_nodestats_selectedMetrics:${job.cluster}`] || ccconfig[`job_view_nodestats_selectedMetrics`] From d7aefe0cf0b206a288bff8330d0814114f55d025 Mon Sep 17 00:00:00 2001 From: Christoph Kluge Date: Fri, 28 Feb 2025 14:55:32 +0100 Subject: [PATCH 09/27] move user names in top lists to tooltip --- web/frontend/src/Analysis.root.svelte | 12 ++++++++++-- web/frontend/src/Status.root.svelte | 12 ++++++++++-- 2 files changed, 20 insertions(+), 4 deletions(-) diff --git a/web/frontend/src/Analysis.root.svelte b/web/frontend/src/Analysis.root.svelte index 1617ccd..861c0ec 100644 --- a/web/frontend/src/Analysis.root.svelte +++ b/web/frontend/src/Analysis.root.svelte @@ -20,6 +20,7 @@ Card, Table, Icon, + Tooltip } from "@sveltestrap/sveltestrap"; import { init, @@ -425,11 +426,18 @@ {#if groupSelection.key == "user"} - + {#if te?.name} + {te.name} + {/if} {:else} - + {#if tu?.name} + {tu.name} + {/if} {/each} From ec895e1d9e1b789e76f08da399922e7244b215d2 Mon Sep 17 00:00:00 2001 From: Christoph Kluge Date: Mon, 3 Mar 2025 09:36:37 +0100 Subject: [PATCH 10/27] Add fallback case to nodeInfo --- web/frontend/src/systems/nodelist/NodeInfo.svelte | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/web/frontend/src/systems/nodelist/NodeInfo.svelte b/web/frontend/src/systems/nodelist/NodeInfo.svelte index ad6c98e..6b14656 100644 --- a/web/frontend/src/systems/nodelist/NodeInfo.svelte +++ b/web/frontend/src/systems/nodelist/NodeInfo.svelte @@ -102,6 +102,19 @@ Shared + + {:else if nodeJobsData.jobs.count >= 1} + + + + + + Status + + + {:else} From c21d7cf101ad7e4a1e4d6780db519351c420b14a Mon Sep 17 00:00:00 2001 From: Christoph Kluge Date: Mon, 3 Mar 2025 11:21:54 +0100 Subject: [PATCH 11/27] fix 
and review quick starttime select handling --- web/frontend/src/generic/Filters.svelte | 26 ++-- .../src/generic/filters/StartTime.svelte | 113 +++++++++++------- 2 files changed, 86 insertions(+), 53 deletions(-) diff --git a/web/frontend/src/generic/Filters.svelte b/web/frontend/src/generic/Filters.svelte index 481211b..4a9be3e 100644 --- a/web/frontend/src/generic/Filters.svelte +++ b/web/frontend/src/generic/Filters.svelte @@ -45,6 +45,14 @@ export let startTimeQuickSelect = false; export let matchedJobs = -2; + const startTimeSelectOptions = [ + { range: "", rangeLabel: "No Selection"}, + { range: "last6h", rangeLabel: "Last 6hrs"}, + { range: "last24h", rangeLabel: "Last 24hrs"}, + { range: "last7d", rangeLabel: "Last 7 days"}, + { range: "last30d", rangeLabel: "Last 30 days"} + ]; + let filters = { projectMatch: filterPresets.projectMatch || "contains", userMatch: filterPresets.userMatch || "contains", @@ -56,7 +64,7 @@ filterPresets.states || filterPresets.state ? [filterPresets.state].flat() : allJobStates, - startTime: filterPresets.startTime || { from: null, to: null }, + startTime: filterPresets.startTime || { from: null, to: null, range: ""}, tags: filterPresets.tags || [], duration: filterPresets.duration || { lessThan: null, @@ -268,16 +276,17 @@ {#if startTimeQuickSelect} Start Time Quick Selection - {#each [{ text: "Last 6hrs", range: "last6h" }, { text: "Last 24hrs", range: "last24h" }, { text: "Last 7 days", range: "last7d" }, { text: "Last 30 days", range: "last30d" }] as { text, range }} + {#each startTimeSelectOptions.filter((stso) => stso.range !== "") as { rangeLabel, range }} { + filters.startTime.from = null + filters.startTime.to = null filters.startTime.range = range; - filters.startTime.text = text; updateFilters(); }} > - {text} + {rangeLabel} {/each} {/if} @@ -316,7 +325,7 @@ {#if filters.startTime.range} (isStartTimeOpen = true)}> - {filters?.startTime?.text ? filters.startTime.text : filters.startTime.range } + {startTimeSelectOptions.find((stso) => stso.range === filters.startTime.range).rangeLabel } {/if} @@ -414,11 +423,8 @@ bind:from={filters.startTime.from} bind:to={filters.startTime.to} bind:range={filters.startTime.range} - on:set-filter={() => { - delete filters.startTime["text"]; - delete filters.startTime["range"]; - updateFilters(); - }} + {startTimeSelectOptions} + on:set-filter={() => updateFilters()} /> (isOpen = !isOpen)}> @@ -92,52 +89,82 @@ {#if range !== ""}

[Hunk body garbled in extraction: the Svelte/HTML markup of StartTime.svelte was stripped. The surviving fragments show the modal being reworked as follows: inside the {#if range !== ""} block, the old hard-coded "Current Range" display is replaced by a select element iterating {#each startTimeSelectOptions as { rangeLabel, range }}; the "From" and "To" rows keep their date and time inputs; and the apply button branches on {#if pendingRange !== ""} to submit either the picked quick range or the manual from/to values.]
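The crux of the start-time rework above is that a named quick range and a manual from/to interval are mutually exclusive: picking one must clear the other, and the active filter chip label is looked up from the selected range. The following standalone JavaScript sketch illustrates that state handling; startTimeSelectOptions and the find() lookup mirror the diff in Filters.svelte, while the helper names applyQuickRange, applyManualInterval, and chipLabel are illustrative inventions, not names from the patch:

    // Mirrors the startTimeSelectOptions table added in Filters.svelte.
    const startTimeSelectOptions = [
      { range: "", rangeLabel: "No Selection" },
      { range: "last6h", rangeLabel: "Last 6hrs" },
      { range: "last24h", rangeLabel: "Last 24hrs" },
      { range: "last7d", rangeLabel: "Last 7 days" },
      { range: "last30d", rangeLabel: "Last 30 days" },
    ];

    // Hypothetical helper: selecting a quick range clears the manual interval.
    function applyQuickRange(startTime, range) {
      return { from: null, to: null, range };
    }

    // Hypothetical helper: entering a manual interval clears the quick range.
    function applyManualInterval(startTime, from, to) {
      return { from, to, range: "" };
    }

    // Chip label resolution, as in the Filters.svelte diff:
    function chipLabel(startTime) {
      return startTimeSelectOptions.find(
        (stso) => stso.range === startTime.range,
      ).rangeLabel;
    }

    console.log(chipLabel(applyQuickRange({}, "last7d"))); // "Last 7 days"
    console.log(chipLabel(applyManualInterval({}, "2025-03-01", "2025-03-02"))); // "No Selection"
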
From 3ab8973895441481c8ce5b2494b87a284d4bb7a5 Mon Sep 17 00:00:00 2001 From: Christoph Kluge Date: Mon, 3 Mar 2025 12:44:18 +0100 Subject: [PATCH 12/27] use extendedLegend in nodeList for all non-idle nodes - changed from "use for shared nodes only" --- web/frontend/src/systems/nodelist/NodeListRow.svelte | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/web/frontend/src/systems/nodelist/NodeListRow.svelte b/web/frontend/src/systems/nodelist/NodeListRow.svelte index a1e4a54..07e5556 100644 --- a/web/frontend/src/systems/nodelist/NodeListRow.svelte +++ b/web/frontend/src/systems/nodelist/NodeListRow.svelte @@ -98,8 +98,8 @@ let extendedLegendData = null; $: if ($nodeJobsData?.data) { - // Get Shared State of Node: Only Build extended Legend For Shared Nodes - if ($nodeJobsData.data.jobs.count >= 1 && !$nodeJobsData.data.jobs.items[0].exclusive) { + // Build Extended for allocated nodes [Commented: Only Build extended Legend For Shared Nodes] + if ($nodeJobsData.data.jobs.count >= 1) { // "&& !$nodeJobsData.data.jobs.items[0].exclusive)" const accSet = Array.from(new Set($nodeJobsData.data.jobs.items .map((i) => i.resources .filter((r) => r.hostname === nodeData.host) .map((r) => r.accelerators) ) From 419bc2747b9e774b5404c34f59b17d3609fe5299 Mon Sep 17 00:00:00 2001 From: Christoph Kluge Date: Mon, 3 Mar 2025 16:53:19 +0100 Subject: [PATCH 13/27] fix nodeInfo null error --- web/frontend/src/systems/nodelist/NodeListRow.svelte | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/web/frontend/src/systems/nodelist/NodeListRow.svelte b/web/frontend/src/systems/nodelist/NodeListRow.svelte index 07e5556..5e6e4ac 100644 --- a/web/frontend/src/systems/nodelist/NodeListRow.svelte +++ b/web/frontend/src/systems/nodelist/NodeListRow.svelte @@ -105,7 +105,7 @@ .filter((r) => r.hostname === nodeData.host) .map((r) => r.accelerators) ) - )).flat(2) + )).flat(2).filter(a => a) // Last filter(): Exclude Null, Undefined and empty Str extendedLegendData = {} for (const accId of accSet) { From 5c9d4ffa9a1f70ea4ff3ad2c3185ef9c7cc87168 Mon Sep 17 00:00:00 2001 From: Christoph Kluge Date: Mon, 3 Mar 2025 17:00:33 +0100 Subject: [PATCH 14/27] clarify and simplify earlier change --- web/frontend/src/systems/nodelist/NodeListRow.svelte | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/web/frontend/src/systems/nodelist/NodeListRow.svelte b/web/frontend/src/systems/nodelist/NodeListRow.svelte index 5e6e4ac..5202573 100644 --- a/web/frontend/src/systems/nodelist/NodeListRow.svelte +++ b/web/frontend/src/systems/nodelist/NodeListRow.svelte @@ -102,10 +102,10 @@ if ($nodeJobsData.data.jobs.count >= 1) { // "&& !$nodeJobsData.data.jobs.items[0].exclusive)" const accSet = Array.from(new Set($nodeJobsData.data.jobs.items .map((i) => i.resources - .filter((r) => r.hostname === nodeData.host) - .map((r) => r.accelerators) + .filter((r) => (r.hostname === nodeData.host) && r?.accelerators) + .map((r) => r?.accelerators) ) - )).flat(2).filter(a => a) // Last filter(): Exclude Null, Undefined and empty Str + )).flat(2) extendedLegendData = {} for (const accId of accSet) { From fcc9e17664ecf5d05206bda34e054c6fc48abccb Mon Sep 17 00:00:00 2001 From: Christoph Kluge Date: Mon, 3 Mar 2025 17:24:54 +0100 Subject: [PATCH 15/27] change: remove metrics from job view select if unavailable on subCluster --- web/frontend/src/Job.root.svelte | 3 ++- web/frontend/src/generic/select/MetricSelection.svelte | 7 ++++++- 2 files changed, 8 insertions(+), 2
deletions(-) diff --git a/web/frontend/src/Job.root.svelte b/web/frontend/src/Job.root.svelte index b641a43..f2df916 100644 --- a/web/frontend/src/Job.root.svelte +++ b/web/frontend/src/Job.root.svelte @@ -130,7 +130,7 @@ const pendingMetrics = [ ...(ccconfig[`job_view_selectedMetrics:${job.cluster}`] || $initq.data.globalMetrics.reduce((names, gm) => { - if (gm.availability.find((av) => av.cluster === job.cluster)) { + if (gm.availability.find((av) => av.cluster === job.cluster && av.subClusters.includes(job.subCluster))) { names.push(gm.name); } return names; @@ -434,6 +434,7 @@ {#if $initq.data} av.cluster === cluster)) allMetrics.add(gm.name); + if (subCluster == null) { + if (gm.availability.find((av) => av.cluster === cluster)) allMetrics.add(gm.name); + } else { + if (gm.availability.find((av) => av.cluster === cluster && av.subClusters.includes(subCluster))) allMetrics.add(gm.name); + } } } newMetricsOrder = [...allMetrics].filter((m) => !metrics.includes(m)); From e733688fd03b41ccc6bb6ffac47aa8c1abd69bad Mon Sep 17 00:00:00 2001 From: Christoph Kluge Date: Mon, 3 Mar 2025 17:54:34 +0100 Subject: [PATCH 16/27] add new subCluster prop to statsTable metric select --- web/frontend/src/job/StatsTable.svelte | 1 + 1 file changed, 1 insertion(+) diff --git a/web/frontend/src/job/StatsTable.svelte b/web/frontend/src/job/StatsTable.svelte index 21d9b3b..b6b0f85 100644 --- a/web/frontend/src/job/StatsTable.svelte +++ b/web/frontend/src/job/StatsTable.svelte @@ -169,6 +169,7 @@ Date: Thu, 27 Feb 2025 15:11:07 +0100 Subject: [PATCH 17/27] allow /start_job/ with 0 second duration Apparently it is possible to get this for very short jobs. --- internal/api/rest.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/internal/api/rest.go b/internal/api/rest.go index b76da0b..fd2f86d 100644 --- a/internal/api/rest.go +++ b/internal/api/rest.go @@ -1008,8 +1008,8 @@ func (api *RestApi) checkAndHandleStopJob(rw http.ResponseWriter, job *schema.Jo return } - if job == nil || job.StartTime.Unix() >= req.StopTime { - handleError(fmt.Errorf("jobId %d (id %d) on %s : stopTime %d must be larger than startTime %d", job.JobID, job.ID, job.Cluster, req.StopTime, job.StartTime.Unix()), http.StatusBadRequest, rw) + if job == nil || job.StartTime.Unix() > req.StopTime { + handleError(fmt.Errorf("jobId %d (id %d) on %s : stopTime %d must be larger/equal than startTime %d", job.JobID, job.ID, job.Cluster, req.StopTime, job.StartTime.Unix()), http.StatusBadRequest, rw) return } From 6454576417ca9048435390a6a3c30415d1a15951 Mon Sep 17 00:00:00 2001 From: Michael Panzlaff Date: Tue, 4 Mar 2025 17:39:38 +0100 Subject: [PATCH 18/27] add node_fail job state --- api/swagger.json | 6 ++++-- api/swagger.yaml | 2 ++ internal/api/docs.go | 6 ++++-- pkg/schema/job.go | 4 +++- 4 files changed, 13 insertions(+), 5 deletions(-) diff --git a/api/swagger.json b/api/swagger.json index 51b22c8..9035beb 100644 --- a/api/swagger.json +++ b/api/swagger.json @@ -1786,7 +1786,8 @@ "stopped", "timeout", "preempted", - "out_of_memory" + "out_of_memory", + "node_fail" ], "x-enum-varnames": [ "JobStateRunning", @@ -1796,7 +1797,8 @@ "JobStateStopped", "JobStateTimeout", "JobStatePreempted", - "JobStateOutOfMemory" + "JobStateOutOfMemory", + "JobStateNodeFail" ] }, "schema.JobStatistics": { diff --git a/api/swagger.yaml b/api/swagger.yaml index f5f0081..20fa031 100644 --- a/api/swagger.yaml +++ b/api/swagger.yaml @@ -395,6 +395,7 @@ definitions: - timeout - preempted - out_of_memory + - node_fail type: string 
x-enum-varnames: - JobStateRunning @@ -405,6 +406,7 @@ definitions: - JobStateTimeout - JobStatePreempted - JobStateOutOfMemory + - JobStateNodeFail schema.JobStatistics: description: Specification for job metric statistics. properties: diff --git a/internal/api/docs.go b/internal/api/docs.go index 642003f..6f034b4 100644 --- a/internal/api/docs.go +++ b/internal/api/docs.go @@ -1792,7 +1792,8 @@ const docTemplate = `{ "stopped", "timeout", "preempted", - "out_of_memory" + "out_of_memory", + "node_fail" ], "x-enum-varnames": [ "JobStateRunning", @@ -1802,7 +1803,8 @@ const docTemplate = `{ "JobStateStopped", "JobStateTimeout", "JobStatePreempted", - "JobStateOutOfMemory" + "JobStateOutOfMemory", + "JobStateNodeFail" ] }, "schema.JobStatistics": { diff --git a/pkg/schema/job.go b/pkg/schema/job.go index 5e3110b..b6ac44d 100644 --- a/pkg/schema/job.go +++ b/pkg/schema/job.go @@ -143,6 +143,7 @@ const ( JobStateTimeout JobState = "timeout" JobStatePreempted JobState = "preempted" JobStateOutOfMemory JobState = "out_of_memory" + JobStateNodeFail JobState = "node_fail" ) func (e *JobState) UnmarshalGQL(v interface{}) error { @@ -171,5 +172,6 @@ func (e JobState) Valid() bool { e == JobStateStopped || e == JobStateTimeout || e == JobStatePreempted || - e == JobStateOutOfMemory + e == JobStateOutOfMemory || + e == JobStateNodeFail } From 65d2698af4a104fbd5ff1faf0f462e6a50b6a466 Mon Sep 17 00:00:00 2001 From: Michael Panzlaff Date: Tue, 4 Mar 2025 17:47:49 +0100 Subject: [PATCH 19/27] add node_fail state to database schema --- internal/repository/migrations/mysql/01_init-schema.up.sql | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/internal/repository/migrations/mysql/01_init-schema.up.sql b/internal/repository/migrations/mysql/01_init-schema.up.sql index 3a6930c..16f7627 100644 --- a/internal/repository/migrations/mysql/01_init-schema.up.sql +++ b/internal/repository/migrations/mysql/01_init-schema.up.sql @@ -13,7 +13,7 @@ CREATE TABLE IF NOT EXISTS job ( walltime INT NOT NULL DEFAULT 0, job_state VARCHAR(255) NOT NULL CHECK(job_state IN ('running', 'completed', 'failed', 'cancelled', - 'stopped', 'timeout', 'preempted', 'out_of_memory')), + 'stopped', 'timeout', 'preempted', 'out_of_memory', 'node_fail')), meta_data TEXT, -- JSON resources TEXT NOT NULL, -- JSON From d4336b0dcb4e054a39033fc681634c285d08d4d8 Mon Sep 17 00:00:00 2001 From: Michael Panzlaff Date: Tue, 4 Mar 2025 18:00:02 +0100 Subject: [PATCH 20/27] add missing node_fail to db constraints --- .../repository/migrations/sqlite3/04_add-constraints.up.sql | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/internal/repository/migrations/sqlite3/04_add-constraints.up.sql b/internal/repository/migrations/sqlite3/04_add-constraints.up.sql index 06b1a9b..a6898c3 100644 --- a/internal/repository/migrations/sqlite3/04_add-constraints.up.sql +++ b/internal/repository/migrations/sqlite3/04_add-constraints.up.sql @@ -11,7 +11,7 @@ array_job_id BIGINT, duration INT NOT NULL, walltime INT NOT NULL, job_state VARCHAR(255) NOT NULL -CHECK(job_state IN ('running', 'completed', 'failed', 'cancelled', 'stopped', 'timeout', 'preempted', 'out_of_memory')), +CHECK(job_state IN ('running', 'completed', 'failed', 'cancelled', 'stopped', 'timeout', 'preempted', 'out_of_memory', 'node_fail')), meta_data TEXT, -- JSON resources TEXT NOT NULL, -- JSON num_nodes INT NOT NULL, From 0a3e678329bc7162bffde549a2e85ac69b63e11b Mon Sep 17 00:00:00 2001 From: Michael Panzlaff Date: Tue, 4 Mar 2025 18:03:01 +0100 Subject: [PATCH 21/27] 
add more missing node_fail states --- api/swagger.json | 6 ++++-- api/swagger.yaml | 2 ++ internal/api/docs.go | 6 ++++-- pkg/schema/job.go | 2 +- pkg/schema/schemas/job-meta.schema.json | 1 + web/frontend/src/generic/filters/JobStates.svelte | 1 + 6 files changed, 13 insertions(+), 5 deletions(-) diff --git a/api/swagger.json b/api/swagger.json index 9035beb..5cd4a5e 100644 --- a/api/swagger.json +++ b/api/swagger.json @@ -1512,7 +1512,8 @@ "cancelled", "stopped", "timeout", - "out_of_memory" + "out_of_memory", + "node_fail" ], "allOf": [ { @@ -1670,7 +1671,8 @@ "cancelled", "stopped", "timeout", - "out_of_memory" + "out_of_memory", + "node_fail" ], "allOf": [ { diff --git a/api/swagger.yaml b/api/swagger.yaml index 20fa031..3f188c2 100644 --- a/api/swagger.yaml +++ b/api/swagger.yaml @@ -201,6 +201,7 @@ definitions: - stopped - timeout - out_of_memory + - node_fail example: completed metaData: additionalProperties: @@ -314,6 +315,7 @@ definitions: - stopped - timeout - out_of_memory + - node_fail example: completed metaData: additionalProperties: diff --git a/internal/api/docs.go b/internal/api/docs.go index 6f034b4..99a8a14 100644 --- a/internal/api/docs.go +++ b/internal/api/docs.go @@ -1518,7 +1518,8 @@ const docTemplate = `{ "cancelled", "stopped", "timeout", - "out_of_memory" + "out_of_memory", + "node_fail" ], "allOf": [ { @@ -1676,7 +1677,8 @@ const docTemplate = `{ "cancelled", "stopped", "timeout", - "out_of_memory" + "out_of_memory", + "node_fail" ], "allOf": [ { diff --git a/pkg/schema/job.go b/pkg/schema/job.go index b6ac44d..7a2d950 100644 --- a/pkg/schema/job.go +++ b/pkg/schema/job.go @@ -21,7 +21,7 @@ type BaseJob struct { Partition string `json:"partition,omitempty" db:"cluster_partition" example:"main"` Project string `json:"project" db:"project" example:"abcd200"` User string `json:"user" db:"hpc_user" example:"abcd100h"` - State JobState `json:"jobState" db:"job_state" example:"completed" enums:"completed,failed,cancelled,stopped,timeout,out_of_memory"` + State JobState `json:"jobState" db:"job_state" example:"completed" enums:"completed,failed,cancelled,stopped,timeout,out_of_memory,node_fail"` Tags []*Tag `json:"tags,omitempty"` RawEnergyFootprint []byte `json:"-" db:"energy_footprint"` RawFootprint []byte `json:"-" db:"footprint"` diff --git a/pkg/schema/schemas/job-meta.schema.json b/pkg/schema/schemas/job-meta.schema.json index db7475c..a12057b 100644 --- a/pkg/schema/schemas/job-meta.schema.json +++ b/pkg/schema/schemas/job-meta.schema.json @@ -76,6 +76,7 @@ "cancelled", "stopped", "out_of_memory", + "node_fail", "timeout" ] }, diff --git a/web/frontend/src/generic/filters/JobStates.svelte b/web/frontend/src/generic/filters/JobStates.svelte index d903abc..b9a747d 100644 --- a/web/frontend/src/generic/filters/JobStates.svelte +++ b/web/frontend/src/generic/filters/JobStates.svelte @@ -23,6 +23,7 @@ "timeout", "preempted", "out_of_memory", + "node_fail", ]; From a61ff915ac0517261b8ac2be3e3cc3b8e7f40e7c Mon Sep 17 00:00:00 2001 From: Michael Panzlaff Date: Tue, 4 Mar 2025 18:15:39 +0100 Subject: [PATCH 22/27] Revert "add more missing node_fail states" This reverts commit 0a3e678329bc7162bffde549a2e85ac69b63e11b. 
--- api/swagger.json | 6 ++---- api/swagger.yaml | 2 -- internal/api/docs.go | 6 ++---- pkg/schema/job.go | 2 +- pkg/schema/schemas/job-meta.schema.json | 1 - web/frontend/src/generic/filters/JobStates.svelte | 1 - 6 files changed, 5 insertions(+), 13 deletions(-) diff --git a/api/swagger.json b/api/swagger.json index 5cd4a5e..9035beb 100644 --- a/api/swagger.json +++ b/api/swagger.json @@ -1512,8 +1512,7 @@ "cancelled", "stopped", "timeout", - "out_of_memory", - "node_fail" + "out_of_memory" ], "allOf": [ { @@ -1671,8 +1670,7 @@ "cancelled", "stopped", "timeout", - "out_of_memory", - "node_fail" + "out_of_memory" ], "allOf": [ { diff --git a/api/swagger.yaml b/api/swagger.yaml index 3f188c2..20fa031 100644 --- a/api/swagger.yaml +++ b/api/swagger.yaml @@ -201,7 +201,6 @@ definitions: - stopped - timeout - out_of_memory - - node_fail example: completed metaData: additionalProperties: @@ -315,7 +314,6 @@ definitions: - stopped - timeout - out_of_memory - - node_fail example: completed metaData: additionalProperties: diff --git a/internal/api/docs.go b/internal/api/docs.go index 99a8a14..6f034b4 100644 --- a/internal/api/docs.go +++ b/internal/api/docs.go @@ -1518,8 +1518,7 @@ const docTemplate = `{ "cancelled", "stopped", "timeout", - "out_of_memory", - "node_fail" + "out_of_memory" ], "allOf": [ { @@ -1677,8 +1676,7 @@ const docTemplate = `{ "cancelled", "stopped", "timeout", - "out_of_memory", - "node_fail" + "out_of_memory" ], "allOf": [ { diff --git a/pkg/schema/job.go b/pkg/schema/job.go index 7a2d950..b6ac44d 100644 --- a/pkg/schema/job.go +++ b/pkg/schema/job.go @@ -21,7 +21,7 @@ type BaseJob struct { Partition string `json:"partition,omitempty" db:"cluster_partition" example:"main"` Project string `json:"project" db:"project" example:"abcd200"` User string `json:"user" db:"hpc_user" example:"abcd100h"` - State JobState `json:"jobState" db:"job_state" example:"completed" enums:"completed,failed,cancelled,stopped,timeout,out_of_memory,node_fail"` + State JobState `json:"jobState" db:"job_state" example:"completed" enums:"completed,failed,cancelled,stopped,timeout,out_of_memory"` Tags []*Tag `json:"tags,omitempty"` RawEnergyFootprint []byte `json:"-" db:"energy_footprint"` RawFootprint []byte `json:"-" db:"footprint"` diff --git a/pkg/schema/schemas/job-meta.schema.json b/pkg/schema/schemas/job-meta.schema.json index a12057b..db7475c 100644 --- a/pkg/schema/schemas/job-meta.schema.json +++ b/pkg/schema/schemas/job-meta.schema.json @@ -76,7 +76,6 @@ "cancelled", "stopped", "out_of_memory", - "node_fail", "timeout" ] }, diff --git a/web/frontend/src/generic/filters/JobStates.svelte b/web/frontend/src/generic/filters/JobStates.svelte index b9a747d..d903abc 100644 --- a/web/frontend/src/generic/filters/JobStates.svelte +++ b/web/frontend/src/generic/filters/JobStates.svelte @@ -23,7 +23,6 @@ "timeout", "preempted", "out_of_memory", - "node_fail", ]; From aa3fe2b8726634800a36d6dc4153ab6c7c9f93f9 Mon Sep 17 00:00:00 2001 From: Michael Panzlaff Date: Tue, 4 Mar 2025 18:15:46 +0100 Subject: [PATCH 23/27] Revert "add missing node_fail to db constraints" This reverts commit d4336b0dcb4e054a39033fc681634c285d08d4d8. 
--- .../repository/migrations/sqlite3/04_add-constraints.up.sql | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/internal/repository/migrations/sqlite3/04_add-constraints.up.sql b/internal/repository/migrations/sqlite3/04_add-constraints.up.sql index a6898c3..06b1a9b 100644 --- a/internal/repository/migrations/sqlite3/04_add-constraints.up.sql +++ b/internal/repository/migrations/sqlite3/04_add-constraints.up.sql @@ -11,7 +11,7 @@ array_job_id BIGINT, duration INT NOT NULL, walltime INT NOT NULL, job_state VARCHAR(255) NOT NULL -CHECK(job_state IN ('running', 'completed', 'failed', 'cancelled', 'stopped', 'timeout', 'preempted', 'out_of_memory', 'node_fail')), +CHECK(job_state IN ('running', 'completed', 'failed', 'cancelled', 'stopped', 'timeout', 'preempted', 'out_of_memory')), meta_data TEXT, -- JSON resources TEXT NOT NULL, -- JSON num_nodes INT NOT NULL, From bd93b8be8efd2440d3eab6d5c43e9c4e7d4c164b Mon Sep 17 00:00:00 2001 From: Michael Panzlaff Date: Tue, 4 Mar 2025 18:15:53 +0100 Subject: [PATCH 24/27] Revert "add node_fail state to database schema" This reverts commit 65d2698af4a104fbd5ff1faf0f462e6a50b6a466. --- internal/repository/migrations/mysql/01_init-schema.up.sql | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/internal/repository/migrations/mysql/01_init-schema.up.sql b/internal/repository/migrations/mysql/01_init-schema.up.sql index 16f7627..3a6930c 100644 --- a/internal/repository/migrations/mysql/01_init-schema.up.sql +++ b/internal/repository/migrations/mysql/01_init-schema.up.sql @@ -13,7 +13,7 @@ CREATE TABLE IF NOT EXISTS job ( walltime INT NOT NULL DEFAULT 0, job_state VARCHAR(255) NOT NULL CHECK(job_state IN ('running', 'completed', 'failed', 'cancelled', - 'stopped', 'timeout', 'preempted', 'out_of_memory', 'node_fail')), + 'stopped', 'timeout', 'preempted', 'out_of_memory')), meta_data TEXT, -- JSON resources TEXT NOT NULL, -- JSON From 4b2d7068b334c99bca3b77cc6a34371d5cb4416e Mon Sep 17 00:00:00 2001 From: Michael Panzlaff Date: Tue, 4 Mar 2025 18:16:02 +0100 Subject: [PATCH 25/27] Revert "add node_fail job state" This reverts commit 6454576417ca9048435390a6a3c30415d1a15951. --- api/swagger.json | 6 ++---- api/swagger.yaml | 2 -- internal/api/docs.go | 6 ++---- pkg/schema/job.go | 4 +--- 4 files changed, 5 insertions(+), 13 deletions(-) diff --git a/api/swagger.json b/api/swagger.json index 9035beb..51b22c8 100644 --- a/api/swagger.json +++ b/api/swagger.json @@ -1786,8 +1786,7 @@ "stopped", "timeout", "preempted", - "out_of_memory", - "node_fail" + "out_of_memory" ], "x-enum-varnames": [ "JobStateRunning", @@ -1797,8 +1796,7 @@ "JobStateStopped", "JobStateTimeout", "JobStatePreempted", - "JobStateOutOfMemory", - "JobStateNodeFail" + "JobStateOutOfMemory" ] }, "schema.JobStatistics": { diff --git a/api/swagger.yaml b/api/swagger.yaml index 20fa031..f5f0081 100644 --- a/api/swagger.yaml +++ b/api/swagger.yaml @@ -395,7 +395,6 @@ definitions: - timeout - preempted - out_of_memory - - node_fail type: string x-enum-varnames: - JobStateRunning @@ -406,7 +405,6 @@ definitions: - JobStateTimeout - JobStatePreempted - JobStateOutOfMemory - - JobStateNodeFail schema.JobStatistics: description: Specification for job metric statistics. 
properties: diff --git a/internal/api/docs.go b/internal/api/docs.go index 6f034b4..642003f 100644 --- a/internal/api/docs.go +++ b/internal/api/docs.go @@ -1792,8 +1792,7 @@ const docTemplate = `{ "stopped", "timeout", "preempted", - "out_of_memory", - "node_fail" + "out_of_memory" ], "x-enum-varnames": [ "JobStateRunning", @@ -1803,8 +1802,7 @@ const docTemplate = `{ "JobStateStopped", "JobStateTimeout", "JobStatePreempted", - "JobStateOutOfMemory", - "JobStateNodeFail" + "JobStateOutOfMemory" ] }, "schema.JobStatistics": { diff --git a/pkg/schema/job.go b/pkg/schema/job.go index b6ac44d..5e3110b 100644 --- a/pkg/schema/job.go +++ b/pkg/schema/job.go @@ -143,7 +143,6 @@ const ( JobStateTimeout JobState = "timeout" JobStatePreempted JobState = "preempted" JobStateOutOfMemory JobState = "out_of_memory" - JobStateNodeFail JobState = "node_fail" ) func (e *JobState) UnmarshalGQL(v interface{}) error { @@ -172,6 +171,5 @@ func (e JobState) Valid() bool { e == JobStateStopped || e == JobStateTimeout || e == JobStatePreempted || - e == JobStateOutOfMemory || - e == JobStateNodeFail + e == JobStateOutOfMemory } From 2b56b40e6d2b69d49f666f0753e131f34a13aa83 Mon Sep 17 00:00:00 2001 From: Christoph Kluge Date: Thu, 6 Mar 2025 12:46:25 +0100 Subject: [PATCH 26/27] Review energyFootprint calculation, fix missing numNodes factor, add log --- internal/importer/handleImport.go | 24 ++++++++++++++++-------- internal/importer/initDB.go | 24 ++++++++++++++++-------- internal/repository/job.go | 24 +++++++++++++++--------- 3 files changed, 47 insertions(+), 25 deletions(-) diff --git a/internal/importer/handleImport.go b/internal/importer/handleImport.go index 01773a5..623291c 100644 --- a/internal/importer/handleImport.go +++ b/internal/importer/handleImport.go @@ -96,27 +96,35 @@ func HandleImportFlag(flag string) error { } job.EnergyFootprint = make(map[string]float64) - var totalEnergy float64 - var energy float64 + // Total Job Energy Outside Loop + totalEnergy := 0.0 for _, fp := range sc.EnergyFootprint { + // Always Init Metric Energy Inside Loop + metricEnergy := 0.0 if i, err := archive.MetricIndex(sc.MetricConfig, fp); err == nil { // Note: For DB data, calculate and save as kWh - // Energy: Power (in Watts) * Time (in Seconds) if sc.MetricConfig[i].Energy == "energy" { // this metric has energy as unit (Joules) + log.Warnf("Update EnergyFootprint for Job %d and Metric %s on cluster %s: Set to 'energy' in cluster.json: Not implemented, will return 0.0", job.JobID, job.Cluster, fp) + // FIXME: Needs sum as stats type } else if sc.MetricConfig[i].Energy == "power" { // this metric has power as unit (Watt) - // Unit: ( W * s ) / 3600 / 1000 = kWh ; Rounded to 2 nearest digits - energy = math.Round(((repository.LoadJobStat(&job, fp, "avg")*float64(job.Duration))/3600/1000)*100) / 100 + // Energy: Power (in Watts) * Time (in Seconds) + // Unit: (W * (s / 3600)) / 1000 = kWh + // Round 2 Digits: round(Energy * 100) / 100 + // Here: (All-Node Metric Average * Number of Nodes) * (Job Duration in Seconds / 3600) / 1000 + // Note: Shared Jobs handled correctly since "Node Average" is based on partial resources, while "numNodes" factor is 1 + rawEnergy := ((repository.LoadJobStat(&job, fp, "avg") * float64(job.NumNodes)) * (float64(job.Duration) / 3600.0)) / 1000.0 + metricEnergy = math.Round(rawEnergy*100.0) / 100.0 } } else { log.Warnf("Error while collecting energy metric %s for job, DB ID '%v', return '0.0'", fp, job.ID) } - job.EnergyFootprint[fp] = energy - totalEnergy += energy + 
job.EnergyFootprint[fp] = metricEnergy + totalEnergy += metricEnergy } - job.Energy = (math.Round(totalEnergy*100) / 100) + job.Energy = (math.Round(totalEnergy*100.0) / 100.0) if job.RawEnergyFootprint, err = json.Marshal(job.EnergyFootprint); err != nil { log.Warnf("Error while marshaling energy footprint for job INTO BYTES, DB ID '%v'", job.ID) return err diff --git a/internal/importer/initDB.go b/internal/importer/initDB.go index fa2ee6e..9a2ccdf 100644 --- a/internal/importer/initDB.go +++ b/internal/importer/initDB.go @@ -93,27 +93,35 @@ func InitDB() error { } job.EnergyFootprint = make(map[string]float64) - var totalEnergy float64 - var energy float64 + // Total Job Energy Outside Loop + totalEnergy := 0.0 for _, fp := range sc.EnergyFootprint { + // Always Init Metric Energy Inside Loop + metricEnergy := 0.0 if i, err := archive.MetricIndex(sc.MetricConfig, fp); err == nil { // Note: For DB data, calculate and save as kWh - // Energy: Power (in Watts) * Time (in Seconds) if sc.MetricConfig[i].Energy == "energy" { // this metric has energy as unit (Joules) + log.Warnf("Update EnergyFootprint for Job %d and Metric %s on cluster %s: Set to 'energy' in cluster.json: Not implemented, will return 0.0", jobMeta.JobID, jobMeta.Cluster, fp) + // FIXME: Needs sum as stats type } else if sc.MetricConfig[i].Energy == "power" { // this metric has power as unit (Watt) - // Unit: ( W * s ) / 3600 / 1000 = kWh ; Rounded to 2 nearest digits - energy = math.Round(((repository.LoadJobStat(jobMeta, fp, "avg")*float64(jobMeta.Duration))/3600/1000)*100) / 100 + // Energy: Power (in Watts) * Time (in Seconds) + // Unit: (W * (s / 3600)) / 1000 = kWh + // Round 2 Digits: round(Energy * 100) / 100 + // Here: (All-Node Metric Average * Number of Nodes) * (Job Duration in Seconds / 3600) / 1000 + // Note: Shared Jobs handled correctly since "Node Average" is based on partial resources, while "numNodes" factor is 1 + rawEnergy := ((repository.LoadJobStat(jobMeta, fp, "avg") * float64(jobMeta.NumNodes)) * (float64(jobMeta.Duration) / 3600.0)) / 1000.0 + metricEnergy = math.Round(rawEnergy*100.0) / 100.0 } } else { log.Warnf("Error while collecting energy metric %s for job, DB ID '%v', return '0.0'", fp, jobMeta.ID) } - job.EnergyFootprint[fp] = energy - totalEnergy += energy + job.EnergyFootprint[fp] = metricEnergy + totalEnergy += metricEnergy } - job.Energy = (math.Round(totalEnergy*100) / 100) + job.Energy = (math.Round(totalEnergy*100.0) / 100.0) if job.RawEnergyFootprint, err = json.Marshal(job.EnergyFootprint); err != nil { log.Warnf("Error while marshaling energy footprint for job INTO BYTES, DB ID '%v'", jobMeta.ID) return err diff --git a/internal/repository/job.go b/internal/repository/job.go index 020c3c2..84de6f7 100644 --- a/internal/repository/job.go +++ b/internal/repository/job.go @@ -590,28 +590,34 @@ func (r *JobRepository) UpdateEnergy( return stmt, err } energyFootprint := make(map[string]float64) - var totalEnergy float64 - var energy float64 + // Total Job Energy Outside Loop + totalEnergy := 0.0 for _, fp := range sc.EnergyFootprint { + // Always Init Metric Energy Inside Loop + metricEnergy := 0.0 if i, err := archive.MetricIndex(sc.MetricConfig, fp); err == nil { // Note: For DB data, calculate and save as kWh if sc.MetricConfig[i].Energy == "energy" { // this metric has energy as unit (Joules or Wh) + log.Warnf("Update EnergyFootprint for Job %d and Metric %s on cluster %s: Set to 'energy' in cluster.json: Not implemented, will return 0.0", jobMeta.JobID, jobMeta.Cluster, fp) // 
FIXME: Needs sum as stats type } else if sc.MetricConfig[i].Energy == "power" { // this metric has power as unit (Watt) // Energy: Power (in Watts) * Time (in Seconds) - // Unit: (( W * s ) / 3600) / 1000 = kWh ; Rounded to 2 nearest digits: (Energy * 100) / 100 - // Here: All-Node Metric Average * Number of Nodes * Job Runtime + // Unit: (W * (s / 3600)) / 1000 = kWh + // Round 2 Digits: round(Energy * 100) / 100 + // Here: (All-Node Metric Average * Number of Nodes) * (Job Duration in Seconds / 3600) / 1000 // Note: Shared Jobs handled correctly since "Node Average" is based on partial resources, while "numNodes" factor is 1 - metricNodeSum := LoadJobStat(jobMeta, fp, "avg") * float64(jobMeta.NumNodes) * float64(jobMeta.Duration) - energy = math.Round(((metricNodeSum/3600)/1000)*100) / 100 + rawEnergy := ((LoadJobStat(jobMeta, fp, "avg") * float64(jobMeta.NumNodes)) * (float64(jobMeta.Duration) / 3600.0)) / 1000.0 + metricEnergy = math.Round(rawEnergy*100.0) / 100.0 } } else { log.Warnf("Error while collecting energy metric %s for job, DB ID '%v', return '0.0'", fp, jobMeta.ID) } - energyFootprint[fp] = energy - totalEnergy += energy + energyFootprint[fp] = metricEnergy + totalEnergy += metricEnergy + + // log.Infof("Metric %s Average %f -> %f kWh | Job %d Total -> %f kWh", fp, LoadJobStat(jobMeta, fp, "avg"), energy, jobMeta.JobID, totalEnergy) } var rawFootprint []byte @@ -620,7 +626,7 @@ func (r *JobRepository) UpdateEnergy( return stmt, err } - return stmt.Set("energy_footprint", string(rawFootprint)).Set("energy", (math.Round(totalEnergy*100) / 100)), nil + return stmt.Set("energy_footprint", string(rawFootprint)).Set("energy", (math.Round(totalEnergy*100.0) / 100.0)), nil } func (r *JobRepository) UpdateFootprint( From d0af933b350d3e50cc64c648b5fbfd2fd4d1a0cf Mon Sep 17 00:00:00 2001 From: Christoph Kluge Date: Thu, 6 Mar 2025 15:39:15 +0100 Subject: [PATCH 27/27] feat: add subCluster level frontend keys for metric selections - applies to jobView and nodeList --- web/frontend/src/Job.root.svelte | 25 +++++++---- web/frontend/src/Jobs.root.svelte | 2 +- web/frontend/src/Systems.root.svelte | 10 +++-- web/frontend/src/User.root.svelte | 2 +- .../src/generic/select/MetricSelection.svelte | 42 ++++++++++++------- web/frontend/src/job/StatsTable.svelte | 7 ++-- web/frontend/src/systems/NodeList.svelte | 16 +++---- 7 files changed, 65 insertions(+), 39 deletions(-) diff --git a/web/frontend/src/Job.root.svelte b/web/frontend/src/Job.root.svelte index f2df916..6980230 100644 --- a/web/frontend/src/Job.root.svelte +++ b/web/frontend/src/Job.root.svelte @@ -128,15 +128,24 @@ if (!job) return; const pendingMetrics = [ - ...(ccconfig[`job_view_selectedMetrics:${job.cluster}`] || - $initq.data.globalMetrics.reduce((names, gm) => { - if (gm.availability.find((av) => av.cluster === job.cluster && av.subClusters.includes(job.subCluster))) { - names.push(gm.name); - } - return names; - }, []) + ...( + ( + ccconfig[`job_view_selectedMetrics:${job.cluster}:${job.subCluster}`] || + ccconfig[`job_view_selectedMetrics:${job.cluster}`] + ) || + $initq.data.globalMetrics + .reduce((names, gm) => { + if (gm.availability.find((av) => av.cluster === job.cluster && av.subClusters.includes(job.subCluster))) { + names.push(gm.name); + } + return names; + }, []) ), - ...(ccconfig[`job_view_nodestats_selectedMetrics:${job.cluster}`] || + ...( + ( + ccconfig[`job_view_nodestats_selectedMetrics:${job.cluster}:${job.subCluster}`] || + ccconfig[`job_view_nodestats_selectedMetrics:${job.cluster}`] + ) || 
ccconfig[`job_view_nodestats_selectedMetrics`] ), ]; diff --git a/web/frontend/src/Jobs.root.svelte b/web/frontend/src/Jobs.root.svelte index df928d0..7faa8b8 100644 --- a/web/frontend/src/Jobs.root.svelte +++ b/web/frontend/src/Jobs.root.svelte @@ -137,5 +137,5 @@ bind:metrics bind:isOpen={isMetricsSelectionOpen} bind:showFootprint - footprintSelect={true} + footprintSelect /> diff --git a/web/frontend/src/Systems.root.svelte b/web/frontend/src/Systems.root.svelte index 8089bbe..1589cac 100644 --- a/web/frontend/src/Systems.root.svelte +++ b/web/frontend/src/Systems.root.svelte @@ -29,8 +29,8 @@ import Refresher from "./generic/helper/Refresher.svelte"; export let displayType; - export let cluster; - export let subCluster = ""; + export let cluster = null; + export let subCluster = null; export let from = null; export let to = null; @@ -60,7 +60,10 @@ let hostnameFilter = ""; let pendingHostnameFilter = ""; let selectedMetric = ccconfig.system_view_selectedMetric || ""; - let selectedMetrics = ccconfig[`node_list_selectedMetrics:${cluster}`] || [ccconfig.system_view_selectedMetric]; + let selectedMetrics = ( + ccconfig[`node_list_selectedMetrics:${cluster}:${subCluster}`] || + ccconfig[`node_list_selectedMetrics:${cluster}`] + ) || [ccconfig.system_view_selectedMetric]; let isMetricsSelectionOpen = false; /* @@ -191,6 +194,7 @@ av.cluster === cluster)) allMetrics.add(gm.name); } else { if (gm.availability.find((av) => av.cluster === cluster && av.subClusters.includes(subCluster))) allMetrics.add(gm.name); @@ -67,7 +67,7 @@ function printAvailability(metric, cluster) { const avail = globalMetrics.find((gm) => gm.name === metric)?.availability - if (cluster == null) { + if (!cluster) { return avail.map((av) => av.cluster).join(',') } else { return avail.find((av) => av.cluster === cluster).subClusters.join(',') @@ -112,10 +112,17 @@ metrics = newMetricsOrder.filter((m) => unorderedMetrics.includes(m)); isOpen = false; - showFootprint = !!pendingShowFootprint; + let configKey; + if (cluster && subCluster) { + configKey = `${configName}:${cluster}:${subCluster}`; + } else if (cluster && !subCluster) { + configKey = `${configName}:${cluster}`; + } else { + configKey = `${configName}`; + } updateConfigurationMutation({ - name: cluster == null ? configName : `${configName}:${cluster}`, + name: configKey, value: JSON.stringify(metrics), }).subscribe((res) => { if (res.fetching === false && res.error) { @@ -123,17 +130,20 @@ } }); - updateConfigurationMutation({ - name: - cluster == null - ? "plot_list_showFootprint" - : `plot_list_showFootprint:${cluster}`, - value: JSON.stringify(showFootprint), - }).subscribe((res) => { - if (res.fetching === false && res.error) { - throw res.error; - } - }); + if (footprintSelect) { + showFootprint = !!pendingShowFootprint; + updateConfigurationMutation({ + name: + !cluster + ? 
"plot_list_showFootprint" + : `plot_list_showFootprint:${cluster}`, + value: JSON.stringify(showFootprint), + }).subscribe((res) => { + if (res.fetching === false && res.error) { + throw res.error; + } + }); + }; dispatch('update-metrics', metrics); } diff --git a/web/frontend/src/job/StatsTable.svelte b/web/frontend/src/job/StatsTable.svelte index b6b0f85..c8f12f2 100644 --- a/web/frontend/src/job/StatsTable.svelte +++ b/web/frontend/src/job/StatsTable.svelte @@ -37,9 +37,10 @@ sorting = {}, isMetricSelectionOpen = false, availableMetrics = new Set(), - selectedMetrics = - getContext("cc-config")[`job_view_nodestats_selectedMetrics:${job.cluster}`] || - getContext("cc-config")["job_view_nodestats_selectedMetrics"]; + selectedMetrics = ( + getContext("cc-config")[`job_view_nodestats_selectedMetrics:${job.cluster}:${job.subCluster}`] || + getContext("cc-config")[`job_view_nodestats_selectedMetrics:${job.cluster}`] + ) || getContext("cc-config")["job_view_nodestats_selectedMetrics"]; for (let metric of sortedJobMetrics) { // Not Exclusive or Multi-Node: get maxScope directly (mostly: node) diff --git a/web/frontend/src/systems/NodeList.svelte b/web/frontend/src/systems/NodeList.svelte index ad64a1f..ca22d57 100644 --- a/web/frontend/src/systems/NodeList.svelte +++ b/web/frontend/src/systems/NodeList.svelte @@ -217,13 +217,15 @@
- - {#each selectedMetrics as metric} @@ -163,7 +170,7 @@ From 38569f55c740fa92019cfb772f902cea073d653f Mon Sep 17 00:00:00 2001 From: Christoph Kluge Date: Fri, 28 Feb 2025 13:09:04 +0100 Subject: [PATCH 03/27] add title to roofline plot - Clarify that roofline is CPU only --- web/frontend/src/generic/plots/Roofline.svelte | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/web/frontend/src/generic/plots/Roofline.svelte b/web/frontend/src/generic/plots/Roofline.svelte index 558d8e8..2941ecb 100644 --- a/web/frontend/src/generic/plots/Roofline.svelte +++ b/web/frontend/src/generic/plots/Roofline.svelte @@ -179,7 +179,7 @@ function render(plotData) { if (plotData) { const opts = { - title: "", + title: "CPU Roofline Diagram", mode: 2, width: width, height: height, From 42135fd26ceb2c30b02800618723026ff2426064 Mon Sep 17 00:00:00 2001 From: Christoph Kluge Date: Fri, 28 Feb 2025 13:37:28 +0100 Subject: [PATCH 04/27] if disableClusterSelection is set, display info in cluster filter - instead of undocumented unresponsive cluster name select --- .../src/generic/filters/Cluster.svelte | 37 +++++++++++-------- 1 file changed, 21 insertions(+), 16 deletions(-) diff --git a/web/frontend/src/generic/filters/Cluster.svelte b/web/frontend/src/generic/filters/Cluster.svelte index 8606247..f886582 100644 --- a/web/frontend/src/generic/filters/Cluster.svelte +++ b/web/frontend/src/generic/filters/Cluster.svelte @@ -43,26 +43,31 @@ {#if $initialized}

Cluster

- - ((pendingCluster = null), (pendingPartition = null))} - > - Any Cluster - - {#each clusters as cluster} + {#if disableClusterSelection} + + + {:else} + ( - (pendingCluster = cluster.name), (pendingPartition = null) - )} + active={pendingCluster == null} + on:click={() => ((pendingCluster = null), (pendingPartition = null))} > - {cluster.name} + Any Cluster - {/each} - + {#each clusters as cluster} + ( + (pendingCluster = cluster.name), (pendingPartition = null) + )} + > + {cluster.name} + + {/each} + + {/if} {/if} {#if $initialized && pendingCluster != null}
From d5394c9e92de8d04a186ac18db05838c1c732c70 Mon Sep 17 00:00:00 2001 From: Christoph Kluge Date: Fri, 28 Feb 2025 13:37:59 +0100 Subject: [PATCH 05/27] fix: analysis view top links, add full names to top users --- web/frontend/src/Analysis.root.svelte | 9 ++++++--- web/frontend/src/Status.root.svelte | 3 ++- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/web/frontend/src/Analysis.root.svelte b/web/frontend/src/Analysis.root.svelte index 40757d3..1617ccd 100644 --- a/web/frontend/src/Analysis.root.svelte +++ b/web/frontend/src/Analysis.root.svelte @@ -70,6 +70,8 @@ ...new Set([...metricsInHistograms, ...metricsInScatterplots.flat()]), ]; + $: clusterName = cluster?.name ? cluster.name : cluster; + const sortOptions = [ { key: "totalWalltime", label: "Walltime" }, { key: "totalNodeHours", label: "Node Hours" }, @@ -159,6 +161,7 @@ groupBy: $groupBy ) { id + name totalWalltime totalNodeHours totalCoreHours @@ -423,14 +426,14 @@
[Table-cell hunk garbled in extraction: the th/anchor markup was stripped. The surviving fragments show the top-list link cells being rewritten so that the user/project anchors keep {te.id} (Analysis view) and {tu.id} (Status view) as link text and append the full name inline via {te?.name ? `(${te.name})` : ''} and {tu?.name ? `(${tu.name})` : ''}; the {tu[topUserSelection.key]} value cell is unchanged. The stripped href attributes are where the top-link fix (cluster vs. clusterName) applies. The NodeList.svelte footer hunk of PATCH 27/27 resumes below.]
-   [row markup stripped in extraction]
-     Loading nodes {nodes.length + 1} to
-     { matchedNodes
-       ? `${(nodes.length + paging.itemsPerPage) > matchedNodes ? matchedNodes : (nodes.length + paging.itemsPerPage)} of ${matchedNodes} total`
-       : (nodes.length + paging.itemsPerPage)
-     }
+   {#if !usePaging}
+     [row markup stripped in extraction]
+       Loading nodes {nodes.length + 1} to
+       { matchedNodes
+         ? `${(nodes.length + paging.itemsPerPage) > matchedNodes ? matchedNodes : (nodes.length + paging.itemsPerPage)} of ${matchedNodes} total`
+         : (nodes.length + paging.itemsPerPage)
+       }
+   {/if}
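The key mechanism of PATCH 27 is a three-level configuration-key hierarchy: a metric selection is written under the most specific key the current context provides (name:cluster:subCluster, then name:cluster, then the bare name), and read back with fallback toward the global key. A minimal JavaScript sketch of that resolution under assumed data (the metric, cluster, and subCluster names below are invented for illustration; the real values live in the cc-config context):

    // Hypothetical ccconfig contents; key shapes as written by MetricSelection.svelte.
    const ccconfig = {
      "job_view_selectedMetrics": ["flops_any", "mem_bw"],
      "job_view_selectedMetrics:fritz": ["flops_any", "mem_bw", "mem_used"],
      "job_view_selectedMetrics:fritz:spr": ["flops_any", "mem_bw", "cpu_power"],
    };

    // Write side: pick the most specific key available, as in closeAndApply().
    function configKey(configName, cluster, subCluster) {
      if (cluster && subCluster) return `${configName}:${cluster}:${subCluster}`;
      if (cluster) return `${configName}:${cluster}`;
      return configName;
    }

    // Read side: fall back from subCluster scope to cluster scope to global scope.
    function resolveMetrics(configName, cluster, subCluster) {
      return (
        ccconfig[`${configName}:${cluster}:${subCluster}`] ||
        ccconfig[`${configName}:${cluster}`] ||
        ccconfig[configName]
      );
    }

    console.log(resolveMetrics("job_view_selectedMetrics", "fritz", "spr"));
    // -> ["flops_any", "mem_bw", "cpu_power"] (subCluster key wins)
    console.log(resolveMetrics("job_view_selectedMetrics", "fritz", "icx"));
    // -> ["flops_any", "mem_bw", "mem_used"] (no subCluster key, cluster key wins)
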