Streamline missing data warnings, review logging

This commit is contained in:
Christoph Kluge
2026-01-29 15:17:33 +01:00
parent 7101d2bb3b
commit f26cabbdf1
11 changed files with 147 additions and 88 deletions

View File

@@ -57,13 +57,13 @@ func (r *queryResolver) rooflineHeatmap(
jobdata, err := metricdispatch.LoadData(job, []string{"flops_any", "mem_bw"}, []schema.MetricScope{schema.MetricScopeNode}, ctx, 0)
if err != nil {
cclog.Errorf("Error while loading roofline metrics for job %d", job.ID)
cclog.Warnf("Error while loading roofline metrics for job %d", job.ID)
return nil, err
}
flops_, membw_ := jobdata["flops_any"], jobdata["mem_bw"]
if flops_ == nil && membw_ == nil {
cclog.Infof("rooflineHeatmap(): 'flops_any' or 'mem_bw' missing for job %d", job.ID)
cclog.Warnf("rooflineHeatmap(): 'flops_any' or 'mem_bw' missing for job %d", job.ID)
continue
// return nil, fmt.Errorf("GRAPH/UTIL > 'flops_any' or 'mem_bw' missing for job %d", job.ID)
}

View File

@@ -97,8 +97,8 @@ func LoadData(job *schema.Job,
ms, err := GetMetricDataRepo(job.Cluster, job.SubCluster)
if err != nil {
cclog.Errorf("failed to load job data from metric store for job %d (user: %s, project: %s): %s",
job.JobID, job.User, job.Project, err.Error())
cclog.Errorf("failed to access metricDataRepo for cluster %s-%s: %s",
job.Cluster, job.SubCluster, err.Error())
return err, 0, 0
}
@@ -116,11 +116,11 @@ func LoadData(job *schema.Job,
jd, err = ms.LoadData(job, metrics, scopes, ctx, resolution)
if err != nil {
if len(jd) != 0 {
cclog.Warnf("partial error loading metrics from store for job %d (user: %s, project: %s): %s",
job.JobID, job.User, job.Project, err.Error())
cclog.Warnf("partial error loading metrics from store for job %d (user: %s, project: %s, cluster: %s-%s): %s",
job.JobID, job.User, job.Project, job.Cluster, job.SubCluster, err.Error())
} else {
cclog.Errorf("failed to load job data from metric store for job %d (user: %s, project: %s): %s",
job.JobID, job.User, job.Project, err.Error())
cclog.Warnf("failed to load job data from metric store for job %d (user: %s, project: %s, cluster: %s-%s): %s",
job.JobID, job.User, job.Project, job.Cluster, job.SubCluster, err.Error())
return err, 0, 0
}
}
@@ -129,8 +129,8 @@ func LoadData(job *schema.Job,
var jdTemp schema.JobData
jdTemp, err = archive.GetHandle().LoadJobData(job)
if err != nil {
cclog.Errorf("failed to load job data from archive for job %d (user: %s, project: %s): %s",
job.JobID, job.User, job.Project, err.Error())
cclog.Warnf("failed to load job data from archive for job %d (user: %s, project: %s, cluster: %s-%s): %s",
job.JobID, job.User, job.Project, job.Cluster, job.SubCluster, err.Error())
return err, 0, 0
}
@@ -244,15 +244,15 @@ func LoadAverages(
ms, err := GetMetricDataRepo(job.Cluster, job.SubCluster)
if err != nil {
cclog.Errorf("failed to load job data from metric store for job %d (user: %s, project: %s): %s",
job.JobID, job.User, job.Project, err.Error())
cclog.Errorf("failed to access metricDataRepo for cluster %s-%s: %s",
job.Cluster, job.SubCluster, err.Error())
return err
}
stats, err := ms.LoadStats(job, metrics, ctx)
if err != nil {
cclog.Errorf("failed to load statistics from metric store for job %d (user: %s, project: %s): %s",
job.JobID, job.User, job.Project, err.Error())
cclog.Warnf("failed to load statistics from metric store for job %d (user: %s, project: %s, cluster: %s-%s): %s",
job.JobID, job.User, job.Project, job.Cluster, job.SubCluster, err.Error())
return err
}
@@ -288,15 +288,15 @@ func LoadScopedJobStats(
ms, err := GetMetricDataRepo(job.Cluster, job.SubCluster)
if err != nil {
cclog.Errorf("failed to load job data from metric store for job %d (user: %s, project: %s): %s",
job.JobID, job.User, job.Project, err.Error())
cclog.Errorf("failed to access metricDataRepo for cluster %s-%s: %s",
job.Cluster, job.SubCluster, err.Error())
return nil, err
}
scopedStats, err := ms.LoadScopedStats(job, metrics, scopes, ctx)
if err != nil {
cclog.Errorf("failed to load scoped statistics from metric store for job %d (user: %s, project: %s): %s",
job.JobID, job.User, job.Project, err.Error())
cclog.Warnf("failed to load scoped statistics from metric store for job %d (user: %s, project: %s, cluster: %s-%s): %s",
job.JobID, job.User, job.Project, job.Cluster, job.SubCluster, err.Error())
return nil, err
}
@@ -320,8 +320,8 @@ func LoadJobStats(
ms, err := GetMetricDataRepo(job.Cluster, job.SubCluster)
if err != nil {
cclog.Errorf("failed to load job data from metric store for job %d (user: %s, project: %s): %s",
job.JobID, job.User, job.Project, err.Error())
cclog.Errorf("failed to access metricDataRepo for cluster %s-%s: %s",
job.Cluster, job.SubCluster, err.Error())
return nil, err
}
@@ -329,8 +329,8 @@ func LoadJobStats(
stats, err := ms.LoadStats(job, metrics, ctx)
if err != nil {
cclog.Errorf("failed to load statistics from metric store for job %d (user: %s, project: %s): %s",
job.JobID, job.User, job.Project, err.Error())
cclog.Warnf("failed to load statistics from metric store for job %d (user: %s, project: %s, cluster: %s-%s): %s",
job.JobID, job.User, job.Project, job.Cluster, job.SubCluster, err.Error())
return data, err
}
@@ -379,8 +379,8 @@ func LoadNodeData(
ms, err := GetMetricDataRepo(cluster, "")
if err != nil {
cclog.Errorf("failed to load node data from metric store: %s",
err.Error())
cclog.Errorf("failed to access metricDataRepo for cluster %s: %s",
cluster, err.Error())
return nil, err
}
@@ -389,7 +389,7 @@ func LoadNodeData(
if len(data) != 0 {
cclog.Warnf("partial error loading node data from metric store for cluster %s: %s", cluster, err.Error())
} else {
cclog.Errorf("failed to load node data from metric store for cluster %s: %s", cluster, err.Error())
cclog.Warnf("failed to load node data from metric store for cluster %s: %s", cluster, err.Error())
return nil, err
}
}
@@ -423,8 +423,8 @@ func LoadNodeListData(
ms, err := GetMetricDataRepo(cluster, subCluster)
if err != nil {
cclog.Errorf("failed to load node data from metric store: %s",
err.Error())
cclog.Errorf("failed to access metricDataRepo for cluster %s-%s: %s",
cluster, subCluster, err.Error())
return nil, err
}
@@ -434,7 +434,7 @@ func LoadNodeListData(
cclog.Warnf("partial error loading node list data from metric store for cluster %s, subcluster %s: %s",
cluster, subCluster, err.Error())
} else {
cclog.Errorf("failed to load node list data from metric store for cluster %s, subcluster %s: %s",
cclog.Warnf("failed to load node list data from metric store for cluster %s, subcluster %s: %s",
cluster, subCluster, err.Error())
return nil, err
}

View File

@@ -329,7 +329,7 @@ func (ccms *CCMetricStore) LoadStats(
metric := query.Metric
data := res[0]
if data.Error != nil {
cclog.Errorf("fetching %s for node %s failed: %s", metric, query.Hostname, *data.Error)
cclog.Warnf("fetching %s for node %s failed: %s", metric, query.Hostname, *data.Error)
continue
}
@@ -556,7 +556,7 @@ func (ccms *CCMetricStore) LoadNodeListData(
) (map[string]schema.JobData, error) {
queries, assignedScope, err := ccms.buildNodeQueries(cluster, subCluster, nodes, metrics, scopes, resolution)
if err != nil {
cclog.Errorf("Error while building node queries for Cluster %s, SubCLuster %s, Metrics %v, Scopes %v: %s", cluster, subCluster, metrics, scopes, err.Error())
cclog.Errorf("Error while building node queries for Cluster %s, SubCluster %s, Metrics %v, Scopes %v: %s", cluster, subCluster, metrics, scopes, err.Error())
return nil, err
}

View File

@@ -68,8 +68,8 @@ func RegisterFootprintWorker() {
ms, err := metricdispatch.GetMetricDataRepo(job.Cluster, job.SubCluster)
if err != nil {
cclog.Errorf("failed to load job data from metric store for job %d (user: %s, project: %s): %s",
job.JobID, job.User, job.Project, err.Error())
cclog.Errorf("failed to access metricDataRepo for cluster %s-%s: %s",
job.Cluster, job.SubCluster, err.Error())
continue
}

View File

@@ -342,7 +342,7 @@
<b>Disabled Metric</b>
</CardHeader>
<CardBody>
<p>Metric <b>{item.metric}</b> is disabled for subcluster <b>{$initq.data.job.subCluster}</b>.</p>
<p>Metric <b>{item.metric}</b> is disabled for cluster <b>{$initq.data.job.cluster}:{$initq.data.job.subCluster}</b>.</p>
<p class="mb-1">To remove this card, open metric selection and press "Close and Apply".</p>
</CardBody>
</Card>
@@ -352,7 +352,8 @@
<b>Missing Metric</b>
</CardHeader>
<CardBody>
<p class="mb-1">No dataset returned for <b>{item.metric}</b>.</p>
<p>No dataset returned for <b>{item.metric}</b>.</p>
<p class="mb-1">Metric was not found in metric store for cluster <b>{$initq.data.job.cluster}</b>.</p>
</CardBody>
</Card>
{/if}

View File

@@ -22,6 +22,8 @@
Icon,
Spinner,
Card,
CardHeader,
CardBody
} from "@sveltestrap/sveltestrap";
import {
queryStore,
@@ -254,12 +256,15 @@
></Card
>
{:else}
<Card
style="margin-left: 2rem;margin-right: 2rem;"
body
color="warning"
>No dataset returned for <code>{item.name}</code></Card
>
<Card color="warning" class="mx-2">
<CardHeader class="mb-0">
<b>Missing Metric</b>
</CardHeader>
<CardBody>
<p>No dataset returned for <b>{item.name}</b>.</p>
<p class="mb-1">Metric was not found in metric store for cluster <b>{cluster}</b>.</p>
</CardBody>
</Card>
{/if}
{/snippet}

View File

@@ -229,7 +229,10 @@
></Card
>
{:else}
<Card body color="warning">No dataset returned</Card>
<Card body class="mx-2" color="warning">
<p>No dataset returned for <b>{metrics[i]}</b></p>
<p class="mb-1">Metric was not found in metric store for cluster <b>{job.cluster}</b>.</p>
</Card>
{/if}
</td>
{/each}

View File

@@ -27,7 +27,7 @@
import uPlot from "uplot";
import { formatNumber, formatDurationTime } from "../units.js";
import { getContext, onMount, onDestroy } from "svelte";
import { Card } from "@sveltestrap/sveltestrap";
import { Card, CardBody, CardHeader } from "@sveltestrap/sveltestrap";
/* Svelte 5 Props */
let {
@@ -633,7 +633,13 @@
style="background-color: {backgroundColor()};" class={forNode ? 'py-2 rounded' : 'rounded'}
></div>
{:else}
<Card body color="warning" class="mx-4"
>Cannot render plot: No series data returned for <code>{metric}</code></Card
>
<Card color="warning" class={forNode ? 'mx-2' : 'mt-2'}>
<CardHeader class="mb-0">
<b>Empty Metric</b>
</CardHeader>
<CardBody>
<p>Cannot render plot for <b>{metric}</b>.</p>
<p class="mb-1">Metric found but returned without timeseries data.</p>
</CardBody>
</Card>
{/if}

View File

@@ -55,6 +55,7 @@
function setupAvailable(data) {
let pendingAvailable = {};
if (data) {
// Returns Only For Available Metrics
for (let d of data) {
if (!pendingAvailable[d.name]) {
pendingAvailable[d.name] = [d.scope]
@@ -90,6 +91,8 @@
pendingTableData[host] = {};
};
for (const metric of sm) {
// Only Returned, Available Metrics
if (as[metric]) {
if (!pendingTableData[host][metric]) {
pendingTableData[host][metric] = {};
};
@@ -101,6 +104,7 @@
};
};
};
};
return pendingTableData;
}
@@ -136,6 +140,7 @@
<th></th>
{#each selectedMetrics as metric}
<!-- To Match Row-2 Header Field Count-->
{#if availableScopes[metric]}
<th colspan={selectedScopes[metric] == "node" ? 3 : 4}>
<InputGroup>
<InputGroupText>
@@ -148,12 +153,22 @@
</Input>
</InputGroup>
</th>
{:else}
<th>
<InputGroup>
<InputGroupText>
{metric}
</InputGroupText>
</InputGroup>
</th>
{/if}
{/each}
</tr>
<!-- Header Row 2: Fields -->
<tr>
<th>Node</th>
{#each selectedMetrics as metric}
{#if availableScopes[metric]}
{#if selectedScopes[metric] != "node"}
<th>Id</th>
{/if}
@@ -170,6 +185,11 @@
{/if}
</th>
{/each}
{:else}
<th class="table-warning">
Missing Metric
</th>
{/if}
{/each}
</tr>
</thead>
@@ -178,10 +198,17 @@
<tr>
<th scope="col">{host}</th>
{#each selectedMetrics as metric (metric)}
{#if tableData[host][metric]}
<StatsTableEntry
data={tableData[host][metric][selectedScopes[metric]]}
scope={selectedScopes[metric]}
/>
{:else}
<td class="table-warning" style="max-width:10rem;">
<p>No dataset returned for <b>{metric}</b>.</p>
<p>Metric was not found in metric store for cluster.</p>
</td>
{/if}
{/each}
</tr>
{/each}

View File

@@ -14,7 +14,7 @@
<script>
import { getContext } from "svelte";
import { queryStore, gql, getContextClient } from "@urql/svelte";
import { Row, Col, Card, Spinner, Badge } from "@sveltestrap/sveltestrap";
import { Row, Col, Card, CardHeader, CardBody, Spinner, Badge } from "@sveltestrap/sveltestrap";
import { checkMetricDisabled } from "../generic/utils.js";
import MetricPlot from "../generic/plots/MetricPlot.svelte";
@@ -189,4 +189,16 @@
{/each}
{/key}
</Row>
{:else}
<Row>
<Card color="warning">
<CardHeader class="mb-0">
<b>Missing Metric</b>
</CardHeader>
<CardBody>
<p>No dataset returned for <b>{selectedMetric}</b>.</p>
<p class="mb-1">Metric was not found in metric store for cluster <b>{cluster}</b>.</p>
</CardBody>
</Card>
</Row>
{/if}

View File

@@ -171,13 +171,18 @@
{#key metricData}
<td>
{#if metricData?.disabled}
<Card body class="mx-3" color="info"
>Metric disabled for subcluster <code
>{metricData?.data?.name ? metricData.data.name : `Metric Index ${i}`}:{nodeData.subCluster}</code
<Card body class="mx-2" color="info"
>Metric <b>{selectedMetrics[i]}</b> disabled for subcluster <code
>{nodeData.subCluster}</code
></Card
>
{:else if !metricData?.data}
<Card body class="mx-2" color="warning">
<p>No dataset returned for <b>{selectedMetrics[i]}</b></p>
<p class="mb-1">Metric was not found in metric store for cluster <b>{cluster}</b>.</p>
</Card>
{:else if !metricData?.data?.name}
<Card body class="mx-3" color="warning"
<Card body class="mx-2" color="warning"
>Metric without name for subcluster <code
>{`Metric Index ${i}`}:{nodeData.subCluster}</code
></Card