From 5ea11a5ad256fe76218f70f40cc0cf25868bae31 Mon Sep 17 00:00:00 2001 From: Christoph Kluge Date: Fri, 10 Jan 2025 16:06:29 +0100 Subject: [PATCH] fix legends, add resolution, add statsseries, add simple healthcheck --- internal/metricDataDispatcher/dataLoader.go | 15 ++++++- internal/metricdata/cc-metric-store.go | 8 ++-- internal/metricdata/influxdb-v2.go | 2 +- internal/metricdata/metricdata.go | 2 +- internal/metricdata/prometheus.go | 2 +- internal/metricdata/utils.go | 2 +- web/frontend/src/Systems.root.svelte | 43 +++++++++++++++---- .../src/generic/plots/MetricPlot.svelte | 2 +- web/frontend/src/systems.entrypoint.js | 3 +- web/frontend/src/systems/NodeList.svelte | 21 +++++++-- .../src/systems/nodelist/NodeInfo.svelte | 32 +++++++++++++- .../src/systems/nodelist/NodeListRow.svelte | 11 +++-- web/templates/monitoring/systems.tmpl | 1 + 13 files changed, 116 insertions(+), 28 deletions(-) diff --git a/internal/metricDataDispatcher/dataLoader.go b/internal/metricDataDispatcher/dataLoader.go index 1d1605a..4642ffd 100644 --- a/internal/metricDataDispatcher/dataLoader.go +++ b/internal/metricDataDispatcher/dataLoader.go @@ -264,7 +264,7 @@ func LoadNodeListData( from, to time.Time, page *model.PageRequest, ctx context.Context, -) (map[string]map[string]map[schema.MetricScope]*schema.JobMetric, int, bool, error) { +) (map[string]schema.JobData, int, bool, error) { repo, err := metricdata.GetMetricDataRepo(cluster) if err != nil { return nil, 0, false, fmt.Errorf("METRICDATA/METRICDATA > no metric data repository configured for '%s'", cluster) @@ -286,6 +286,19 @@ func LoadNodeListData( } } + // NOTE: New StatsSeries will always be calculated as 'min/median/max' + const maxSeriesSize int = 15 + for _, jd := range data { + for _, scopes := range jd { + for _, jm := range scopes { + if jm.StatisticsSeries != nil || len(jm.Series) <= maxSeriesSize { + continue + } + jm.AddStatisticsSeries() + } + } + } + if data == nil { return nil, totalNodes, hasNextPage, fmt.Errorf("METRICDATA/METRICDATA > the metric data repository for '%s' does not support this query", cluster) } diff --git a/internal/metricdata/cc-metric-store.go b/internal/metricdata/cc-metric-store.go index f1bf278..8d2d3f5 100644 --- a/internal/metricdata/cc-metric-store.go +++ b/internal/metricdata/cc-metric-store.go @@ -701,7 +701,7 @@ func (ccms *CCMetricStore) LoadNodeListData( from, to time.Time, page *model.PageRequest, ctx context.Context, -) (map[string]map[string]map[schema.MetricScope]*schema.JobMetric, int, bool, error) { +) (map[string]schema.JobData, int, bool, error) { // 0) Init additional vars var totalNodes int = 0 @@ -747,6 +747,8 @@ func (ccms *CCMetricStore) LoadNodeListData( nodes = nodes[start:end] } + // Note: Order of node data is not guaranteed after this point, but contents match page and filter criteria + queries, assignedScope, err := ccms.buildNodeQueries(cluster, subCluster, nodes, metrics, scopes, resolution) if err != nil { log.Warn("Error while building queries") @@ -769,7 +771,7 @@ func (ccms *CCMetricStore) LoadNodeListData( } var errors []string - data := make(map[string]map[string]map[schema.MetricScope]*schema.JobMetric) + data := make(map[string]schema.JobData) for i, row := range resBody.Results { var query ApiQuery if resBody.Queries != nil { @@ -790,7 +792,7 @@ func (ccms *CCMetricStore) LoadNodeListData( // Init Nested Map Data Structures If Not Found hostData, ok := data[query.Hostname] if !ok { - hostData = make(map[string]map[schema.MetricScope]*schema.JobMetric) + hostData = make(schema.JobData) data[query.Hostname] = hostData } diff --git a/internal/metricdata/influxdb-v2.go b/internal/metricdata/influxdb-v2.go index c62998e..79c2d4a 100644 --- a/internal/metricdata/influxdb-v2.go +++ b/internal/metricdata/influxdb-v2.go @@ -322,7 +322,7 @@ func (idb *InfluxDBv2DataRepository) LoadNodeListData( from, to time.Time, page *model.PageRequest, ctx context.Context, -) (map[string]map[string]map[schema.MetricScope]*schema.JobMetric, int, bool, error) { +) (map[string]schema.JobData, int, bool, error) { var totalNodes int = 0 var hasNextPage bool = false diff --git a/internal/metricdata/metricdata.go b/internal/metricdata/metricdata.go index 23a2cc4..0fe94d1 100644 --- a/internal/metricdata/metricdata.go +++ b/internal/metricdata/metricdata.go @@ -31,7 +31,7 @@ type MetricDataRepository interface { LoadNodeData(cluster string, metrics, nodes []string, scopes []schema.MetricScope, from, to time.Time, ctx context.Context) (map[string]map[string][]*schema.JobMetric, error) // Return a map of hosts to a map of metrics to a map of scopes for multiple nodes. - LoadNodeListData(cluster, subCluster, nodeFilter string, metrics []string, scopes []schema.MetricScope, resolution int, from, to time.Time, page *model.PageRequest, ctx context.Context) (map[string]map[string]map[schema.MetricScope]*schema.JobMetric, int, bool, error) + LoadNodeListData(cluster, subCluster, nodeFilter string, metrics []string, scopes []schema.MetricScope, resolution int, from, to time.Time, page *model.PageRequest, ctx context.Context) (map[string]schema.JobData, int, bool, error) } var metricDataRepos map[string]MetricDataRepository = map[string]MetricDataRepository{} diff --git a/internal/metricdata/prometheus.go b/internal/metricdata/prometheus.go index c873650..cd849ce 100644 --- a/internal/metricdata/prometheus.go +++ b/internal/metricdata/prometheus.go @@ -456,7 +456,7 @@ func (pdb *PrometheusDataRepository) LoadNodeListData( from, to time.Time, page *model.PageRequest, ctx context.Context, -) (map[string]map[string]map[schema.MetricScope]*schema.JobMetric, int, bool, error) { +) (map[string]schema.JobData, int, bool, error) { var totalNodes int = 0 var hasNextPage bool = false diff --git a/internal/metricdata/utils.go b/internal/metricdata/utils.go index be5a3de..48dd237 100644 --- a/internal/metricdata/utils.go +++ b/internal/metricdata/utils.go @@ -59,7 +59,7 @@ func (tmdr *TestMetricDataRepository) LoadNodeListData( from, to time.Time, page *model.PageRequest, ctx context.Context, -) (map[string]map[string]map[schema.MetricScope]*schema.JobMetric, int, bool, error) { +) (map[string]schema.JobData, int, bool, error) { panic("TODO") } diff --git a/web/frontend/src/Systems.root.svelte b/web/frontend/src/Systems.root.svelte index 4ce6ece..d9caf90 100644 --- a/web/frontend/src/Systems.root.svelte +++ b/web/frontend/src/Systems.root.svelte @@ -9,7 +9,7 @@ --> - + {#if $initq.data} {#if !displayNodeOverview} @@ -91,7 +103,7 @@ Metrics - - + + + + + Resolution + + {#each resampleResolutions as res} + + {/each} + + + {/if} - + Find Node(s) @@ -159,7 +184,7 @@ {:else} - + {/if} {/if} diff --git a/web/frontend/src/generic/plots/MetricPlot.svelte b/web/frontend/src/generic/plots/MetricPlot.svelte index 3a0e1e4..f2bff02 100644 --- a/web/frontend/src/generic/plots/MetricPlot.svelte +++ b/web/frontend/src/generic/plots/MetricPlot.svelte @@ -142,7 +142,7 @@ if (useStatsSeries == null) useStatsSeries = statisticsSeries != null; if (useStatsSeries == false && series == null) useStatsSeries = true; - const usesMeanStatsSeries = (useStatsSeries && statisticsSeries.mean.length != 0) + const usesMeanStatsSeries = (useStatsSeries?.mean && statisticsSeries.mean.length != 0) const dispatch = createEventDispatcher(); const subClusterTopology = getContext("getHardwareTopology")(cluster, subCluster); const metricConfig = getContext("getMetricConfig")(cluster, subCluster, metric); diff --git a/web/frontend/src/systems.entrypoint.js b/web/frontend/src/systems.entrypoint.js index 9f504cd..7cee46f 100644 --- a/web/frontend/src/systems.entrypoint.js +++ b/web/frontend/src/systems.entrypoint.js @@ -11,6 +11,7 @@ new Systems({ to: infos.to }, context: new Map([ - ['cc-config', clusterCockpitConfig] + ['cc-config', clusterCockpitConfig], + ['resampling', resampleConfig] ]) }) diff --git a/web/frontend/src/systems/NodeList.svelte b/web/frontend/src/systems/NodeList.svelte index 6ef48b1..39319e5 100644 --- a/web/frontend/src/systems/NodeList.svelte +++ b/web/frontend/src/systems/NodeList.svelte @@ -20,6 +20,7 @@ export let subCluster = ""; export const ccconfig = null; export let selectedMetrics = []; + export let selectedResolution = 0; export let hostnameFilter = ""; export let systemUnits = null; export let from = null; @@ -39,7 +40,7 @@ const { query: initq } = init(); const client = getContextClient(); const nodeListQuery = gql` - query ($cluster: String!, $subCluster: String!, $nodeFilter: String!, $metrics: [String!], $scopes: [MetricScope!]!, $from: Time!, $to: Time!, $paging: PageRequest!) { + query ($cluster: String!, $subCluster: String!, $nodeFilter: String!, $metrics: [String!], $scopes: [MetricScope!]!, $from: Time!, $to: Time!, $paging: PageRequest!, $selectedResolution: Int) { nodeMetricsList( cluster: $cluster subCluster: $subCluster @@ -49,6 +50,7 @@ from: $from to: $to page: $paging + resolution: $selectedResolution ) { items { host @@ -63,12 +65,19 @@ prefix } series { + id + hostname + data statistics { min avg max } - data + } + statisticsSeries { + min + median + max } } } @@ -86,15 +95,19 @@ cluster: cluster, subCluster: subCluster, nodeFilter: hostnameFilter, - scopes: ["core", "accelerator"], + scopes: ["core", "socket", "accelerator"], metrics: selectedMetrics, from: from.toISOString(), to: to.toISOString(), paging: paging, + selectedResolution: selectedResolution, }, + requestPolicy: "network-only", // Resolution queries are cached, but how to access them? For now: reload on every change }); $: matchedNodes = $nodesQuery.data?.nodeMetricsList.totalNodes || 0; + $: orderedData = $nodesQuery.data?.nodeMetricsList.items.sort((a, b) => a.host.localeCompare(b.host)); + {#if $nodesQuery.error} @@ -135,7 +148,7 @@ - {#each $nodesQuery.data.nodeMetricsList.items as nodeData (nodeData.host)} + {#each orderedData as nodeData (nodeData.host)} {:else} diff --git a/web/frontend/src/systems/nodelist/NodeInfo.svelte b/web/frontend/src/systems/nodelist/NodeInfo.svelte index 4ff4e1c..825bca7 100644 --- a/web/frontend/src/systems/nodelist/NodeInfo.svelte +++ b/web/frontend/src/systems/nodelist/NodeInfo.svelte @@ -27,6 +27,7 @@ export let cluster; export let subCluster export let hostname; + export let dataHealth; const client = getContextClient(); const paging = { itemsPerPage: 50, page: 1 }; @@ -49,6 +50,11 @@ } `; + // Not at least one returned, selected metric: NodeHealth warning + const healthWarn = !dataHealth.includes(true); + // At least one non-returned selected metric: Metric config error? + const metricWarn = dataHealth.includes(false); + $: nodeJobsData = queryStore({ client: client, query: nodeJobsQuery, @@ -78,7 +84,31 @@ {:else if $nodeJobsData.data}

- {#if $nodeJobsData.data.jobs.count > 0} + {#if healthWarn} + + + + + + Status + + + + {:else if metricWarn} + + + + + + Status + + + + {:else if $nodeJobsData.data.jobs.count > 0} diff --git a/web/frontend/src/systems/nodelist/NodeListRow.svelte b/web/frontend/src/systems/nodelist/NodeListRow.svelte index 411ac2a..1ea7d1d 100644 --- a/web/frontend/src/systems/nodelist/NodeListRow.svelte +++ b/web/frontend/src/systems/nodelist/NodeListRow.svelte @@ -50,7 +50,7 @@ - + (m.metric.series.length > 0))}/> {#each sortAndSelectScope(nodeData?.metrics) as metricData (metricData.data.name)} @@ -63,11 +63,14 @@ {:else} {/if} diff --git a/web/templates/monitoring/systems.tmpl b/web/templates/monitoring/systems.tmpl index 635bf46..b5ee4b6 100644 --- a/web/templates/monitoring/systems.tmpl +++ b/web/templates/monitoring/systems.tmpl @@ -10,6 +10,7 @@ const displayType = {{ .Infos.displayType }}; const infos = {{ .Infos }}; const clusterCockpitConfig = {{ .Config }}; + const resampleConfig = {{ .Resampling }}; {{end}}