fix legends, add resolution, add statsseries, add simple healthcheck

This commit is contained in:
Christoph Kluge 2025-01-10 16:06:29 +01:00
parent 2a3383e9e6
commit 5ea11a5ad2
13 changed files with 116 additions and 28 deletions

View File

@ -264,7 +264,7 @@ func LoadNodeListData(
from, to time.Time,
page *model.PageRequest,
ctx context.Context,
) (map[string]map[string]map[schema.MetricScope]*schema.JobMetric, int, bool, error) {
) (map[string]schema.JobData, int, bool, error) {
repo, err := metricdata.GetMetricDataRepo(cluster)
if err != nil {
return nil, 0, false, fmt.Errorf("METRICDATA/METRICDATA > no metric data repository configured for '%s'", cluster)
@ -286,6 +286,19 @@ func LoadNodeListData(
}
}
// NOTE: New StatsSeries will always be calculated as 'min/median/max'
const maxSeriesSize int = 15
for _, jd := range data {
for _, scopes := range jd {
for _, jm := range scopes {
if jm.StatisticsSeries != nil || len(jm.Series) <= maxSeriesSize {
continue
}
jm.AddStatisticsSeries()
}
}
}
if data == nil {
return nil, totalNodes, hasNextPage, fmt.Errorf("METRICDATA/METRICDATA > the metric data repository for '%s' does not support this query", cluster)
}

View File

@ -701,7 +701,7 @@ func (ccms *CCMetricStore) LoadNodeListData(
from, to time.Time,
page *model.PageRequest,
ctx context.Context,
) (map[string]map[string]map[schema.MetricScope]*schema.JobMetric, int, bool, error) {
) (map[string]schema.JobData, int, bool, error) {
// 0) Init additional vars
var totalNodes int = 0
@ -747,6 +747,8 @@ func (ccms *CCMetricStore) LoadNodeListData(
nodes = nodes[start:end]
}
// Note: Order of node data is not guaranteed after this point, but contents match page and filter criteria
queries, assignedScope, err := ccms.buildNodeQueries(cluster, subCluster, nodes, metrics, scopes, resolution)
if err != nil {
log.Warn("Error while building queries")
@ -769,7 +771,7 @@ func (ccms *CCMetricStore) LoadNodeListData(
}
var errors []string
data := make(map[string]map[string]map[schema.MetricScope]*schema.JobMetric)
data := make(map[string]schema.JobData)
for i, row := range resBody.Results {
var query ApiQuery
if resBody.Queries != nil {
@ -790,7 +792,7 @@ func (ccms *CCMetricStore) LoadNodeListData(
// Init Nested Map Data Structures If Not Found
hostData, ok := data[query.Hostname]
if !ok {
hostData = make(map[string]map[schema.MetricScope]*schema.JobMetric)
hostData = make(schema.JobData)
data[query.Hostname] = hostData
}

View File

@ -322,7 +322,7 @@ func (idb *InfluxDBv2DataRepository) LoadNodeListData(
from, to time.Time,
page *model.PageRequest,
ctx context.Context,
) (map[string]map[string]map[schema.MetricScope]*schema.JobMetric, int, bool, error) {
) (map[string]schema.JobData, int, bool, error) {
var totalNodes int = 0
var hasNextPage bool = false

View File

@ -31,7 +31,7 @@ type MetricDataRepository interface {
LoadNodeData(cluster string, metrics, nodes []string, scopes []schema.MetricScope, from, to time.Time, ctx context.Context) (map[string]map[string][]*schema.JobMetric, error)
// Return a map of hosts to a map of metrics to a map of scopes for multiple nodes.
LoadNodeListData(cluster, subCluster, nodeFilter string, metrics []string, scopes []schema.MetricScope, resolution int, from, to time.Time, page *model.PageRequest, ctx context.Context) (map[string]map[string]map[schema.MetricScope]*schema.JobMetric, int, bool, error)
LoadNodeListData(cluster, subCluster, nodeFilter string, metrics []string, scopes []schema.MetricScope, resolution int, from, to time.Time, page *model.PageRequest, ctx context.Context) (map[string]schema.JobData, int, bool, error)
}
var metricDataRepos map[string]MetricDataRepository = map[string]MetricDataRepository{}

View File

@ -456,7 +456,7 @@ func (pdb *PrometheusDataRepository) LoadNodeListData(
from, to time.Time,
page *model.PageRequest,
ctx context.Context,
) (map[string]map[string]map[schema.MetricScope]*schema.JobMetric, int, bool, error) {
) (map[string]schema.JobData, int, bool, error) {
var totalNodes int = 0
var hasNextPage bool = false

View File

@ -59,7 +59,7 @@ func (tmdr *TestMetricDataRepository) LoadNodeListData(
from, to time.Time,
page *model.PageRequest,
ctx context.Context,
) (map[string]map[string]map[schema.MetricScope]*schema.JobMetric, int, bool, error) {
) (map[string]schema.JobData, int, bool, error) {
panic("TODO")
}

View File

@ -9,7 +9,7 @@
-->
<script>
import { getContext } from "svelte";
import { getContext, createEventDispatcher } from "svelte";
import {
Row,
Col,
@ -52,15 +52,22 @@
const globalMetrics = getContext("globalMetrics");
const displayNodeOverview = (displayType === 'OVERVIEW')
// Resampling setup read from the "resampling" context entry; null when the entry is absent.
const resampleConfig = getContext("resampling") || null;
// Copy of the configured resolutions; empty when resampling is not configured
// or the config carries no `resolutions` list.
const resampleResolutions = [...(resampleConfig?.resolutions ?? [])];
// Default to the largest configured resolution. Guard the empty case explicitly:
// Math.max() with no arguments returns -Infinity, which is not a usable resolution.
const resampleDefault = resampleResolutions.length > 0 ? Math.max(...resampleResolutions) : 0;
// Currently selected resolution (bound to the resolution <select>); 0 when no resolutions exist.
let selectedResolution = resampleDefault;
let hostnameFilter = "";
let pendingHostnameFilter = "";
let selectedMetric = ccconfig.system_view_selectedMetric || "";
let selectedMetrics = ccconfig[`node_list_selectedMetrics:${cluster}`] || [ccconfig.system_view_selectedMetric];
let isMetricsSelectionOpen = false;
/*
Note 1: Scope Selector or Auto-Scoped?
Note 2: "Sorting" as use-case ignored for now, probably default to alphanumerical on hostnames of cluster
Note 1: Scope Selector or Auto-Scoped? -> Useful auto scoping with stats view where applicable -> Check with JVe
Note 2: "Sorting" as use-case ignored for now, probably default to alphanumerical on hostnames of cluster (handled in frontend at the moment)
Note 3: Add Idle State Filter (== No allocated Jobs) [Frontend?] : Cannot be handled by CCMS, requires secondary job query and refiltering of visible nodes
Note 4: Resolution changes, as implemented, are only possible for all plots at once, not for individual metrics: The result list is built from the GQL result *including* metric series
*/
let systemMetrics = [];
@ -80,10 +87,15 @@
selectedMetrics = [selectedMetric]
}
// Debounce the hostname filter: the typed text is copied into `hostnameFilter`
// only after the input has been idle for 500 ms, so a burst of keystrokes
// updates `hostnameFilter` once instead of once per keystroke.
let hostnameFilterTimer = null;
function applyHostnameFilterDebounced(pendingValue) {
  // Cancel the update scheduled by the previous keystroke, if any.
  clearTimeout(hostnameFilterTimer);
  hostnameFilterTimer = setTimeout(function () {
    hostnameFilter = pendingValue;
  }, 500);
}
// Re-runs whenever `pendingHostnameFilter` changes.
$: applyHostnameFilterDebounced(pendingHostnameFilter);
</script>
<!-- ROW1: Tools-->
<Row cols={{ xs: 2, lg: 4 }} class="mb-3">
<Row cols={{ xs: 2, lg: !displayNodeOverview ? 5 : 4 }} class="mb-3">
{#if $initq.data}
<!-- List Metric Select Col-->
{#if !displayNodeOverview}
@ -91,7 +103,7 @@
<InputGroup>
<InputGroupText><Icon name="graph-up" /></InputGroupText>
<InputGroupText class="text-capitalize">Metrics</InputGroupText>
<Button
<Button
outline
color="primary"
on:click={() => (isMetricsSelectionOpen = true)}
@ -99,17 +111,30 @@
{selectedMetrics.length} selected
</Button>
</InputGroup>
</Col>
</Col>
<Col>
<InputGroup>
<InputGroupText><Icon name="plus-slash-minus" /></InputGroupText>
<InputGroupText>Resolution</InputGroupText>
<Input type="select" bind:value={selectedResolution}>
{#each resampleResolutions as res}
<option value={res}
>{res} sec</option
>
{/each}
</Input>
</InputGroup>
</Col>
{/if}
<!-- Node Col-->
<Col>
<Col class="mt-2 mt-lg-0">
<InputGroup>
<InputGroupText><Icon name="hdd" /></InputGroupText>
<InputGroupText>Find Node(s)</InputGroupText>
<Input
placeholder="Filter hostname ..."
type="text"
bind:value={hostnameFilter}
bind:value={pendingHostnameFilter}
/>
</InputGroup>
</Col>
@ -159,7 +184,7 @@
<NodeOverview {cluster} {subCluster} {ccconfig} {selectedMetrics} {from} {to} {hostnameFilter}/>
{:else}
<!-- ROW2-2: Node List (Grid Included)-->
<NodeList {cluster} {subCluster} {ccconfig} {selectedMetrics} {hostnameFilter} {from} {to} {systemUnits}/>
<NodeList {cluster} {subCluster} {ccconfig} {selectedMetrics} {selectedResolution} {hostnameFilter} {from} {to} {systemUnits}/>
{/if}
{/if}

View File

@ -142,7 +142,7 @@
if (useStatsSeries == null) useStatsSeries = statisticsSeries != null;
if (useStatsSeries == false && series == null) useStatsSeries = true;
const usesMeanStatsSeries = (useStatsSeries && statisticsSeries.mean.length != 0)
// True only when the statistics series is being plotted AND it carries a non-empty `mean` array.
// `useStatsSeries` is a boolean flag, so the `mean` lookup must go through `statisticsSeries`;
// the optional chain covers stats series that have no `mean` field or are null.
const usesMeanStatsSeries = (useStatsSeries && (statisticsSeries?.mean?.length ?? 0) !== 0)
const dispatch = createEventDispatcher();
const subClusterTopology = getContext("getHardwareTopology")(cluster, subCluster);
const metricConfig = getContext("getMetricConfig")(cluster, subCluster, metric);

View File

@ -11,6 +11,7 @@ new Systems({
to: infos.to
},
context: new Map([
['cc-config', clusterCockpitConfig]
['cc-config', clusterCockpitConfig],
['resampling', resampleConfig]
])
})

View File

@ -20,6 +20,7 @@
export let subCluster = "";
export const ccconfig = null;
export let selectedMetrics = [];
export let selectedResolution = 0;
export let hostnameFilter = "";
export let systemUnits = null;
export let from = null;
@ -39,7 +40,7 @@
const { query: initq } = init();
const client = getContextClient();
const nodeListQuery = gql`
query ($cluster: String!, $subCluster: String!, $nodeFilter: String!, $metrics: [String!], $scopes: [MetricScope!]!, $from: Time!, $to: Time!, $paging: PageRequest!) {
query ($cluster: String!, $subCluster: String!, $nodeFilter: String!, $metrics: [String!], $scopes: [MetricScope!]!, $from: Time!, $to: Time!, $paging: PageRequest!, $selectedResolution: Int) {
nodeMetricsList(
cluster: $cluster
subCluster: $subCluster
@ -49,6 +50,7 @@
from: $from
to: $to
page: $paging
resolution: $selectedResolution
) {
items {
host
@ -63,12 +65,19 @@
prefix
}
series {
id
hostname
data
statistics {
min
avg
max
}
data
}
statisticsSeries {
min
median
max
}
}
}
@ -86,15 +95,19 @@
cluster: cluster,
subCluster: subCluster,
nodeFilter: hostnameFilter,
scopes: ["core", "accelerator"],
scopes: ["core", "socket", "accelerator"],
metrics: selectedMetrics,
from: from.toISOString(),
to: to.toISOString(),
paging: paging,
selectedResolution: selectedResolution,
},
requestPolicy: "network-only", // Resolution queries are cached, but how to access them? For now: reload on every change
});
// `totalNodes` from the latest query result; 0 while no data is available.
$: matchedNodes = $nodesQuery.data?.nodeMetricsList.totalNodes || 0;
// Items ordered by hostname. Array.prototype.sort works in place, so sort a copy
// instead of mutating the array held by the query store; falls back to an empty
// list while no data is available.
$: orderedData = [...($nodesQuery.data?.nodeMetricsList.items ?? [])].sort((a, b) => a.host.localeCompare(b.host));
</script>
{#if $nodesQuery.error}
@ -135,7 +148,7 @@
</tr>
</thead>
<tbody>
{#each $nodesQuery.data.nodeMetricsList.items as nodeData (nodeData.host)}
{#each orderedData as nodeData (nodeData.host)}
<NodeListRow {nodeData} {cluster} {selectedMetrics}/>
{:else}
<tr>

View File

@ -27,6 +27,7 @@
export let cluster;
export let subCluster
export let hostname;
export let dataHealth;
const client = getContextClient();
const paging = { itemsPerPage: 50, page: 1 };
@ -49,6 +50,11 @@
}
`;
// Both flags are reactive (`$:`) so they are recomputed when the `dataHealth` prop
// changes; a missing prop is treated like an empty list instead of throwing.
// Not at least one returned, selected metric: NodeHealth warning
$: healthWarn = !(dataHealth ?? []).includes(true);
// At least one non-returned selected metric: Metric config error?
$: metricWarn = (dataHealth ?? []).includes(false);
$: nodeJobsData = queryStore({
client: client,
query: nodeJobsQuery,
@ -78,7 +84,31 @@
<Spinner />
{:else if $nodeJobsData.data}
<p>
{#if $nodeJobsData.data.jobs.count > 0}
{#if healthWarn}
<InputGroup>
<InputGroupText>
<Icon name="exclamation-circle"/>
</InputGroupText>
<InputGroupText>
Status
</InputGroupText>
<Button color="danger" disabled>
Unhealthy
</Button>
</InputGroup>
{:else if metricWarn}
<InputGroup>
<InputGroupText>
<Icon name="circle-half"/>
</InputGroupText>
<InputGroupText>
Status
</InputGroupText>
<Button color="warning" disabled>
Missing Metric
</Button>
</InputGroup>
{:else if $nodeJobsData.data.jobs.count > 0}
<InputGroup>
<InputGroupText>
<Icon name="circle-fill"/>

View File

@ -50,7 +50,7 @@
<tr>
<td>
<NodeInfo {cluster} subCluster={nodeData.subCluster} hostname={nodeData.host} />
<NodeInfo {cluster} subCluster={nodeData.subCluster} hostname={nodeData.host} dataHealth={nodeData?.metrics.map((m) => (m.metric.series.length > 0))}/>
</td>
{#each sortAndSelectScope(nodeData?.metrics) as metricData (metricData.data.name)}
<td>
@ -63,11 +63,14 @@
{:else}
<!-- "No Data"-Warning included in MetricPlot-Component -->
<MetricPlot
timestep={metricData.data.metric.timestep}
series={metricData.data.metric.series}
metric={metricData.data.name}
{cluster}
subCluster={nodeData.subCluster}
metric={metricData.data.name}
scope={metricData.data.scope}
timestep={metricData.data.metric.timestep}
series={metricData.data.metric.series}
statisticsSeries={metricData.data?.metric.statisticsSeries}
useStatsSeries={!!metricData.data?.metric.statisticsSeries}
forNode
/>
{/if}

View File

@ -10,6 +10,7 @@
const displayType = {{ .Infos.displayType }};
const infos = {{ .Infos }};
const clusterCockpitConfig = {{ .Config }};
const resampleConfig = {{ .Resampling }};
</script>
<script src='/build/systems.js'></script>
{{end}}