Streamline missing data warnings, review logging

This commit is contained in:
Christoph Kluge
2026-01-29 15:17:33 +01:00
parent 7101d2bb3b
commit f26cabbdf1
11 changed files with 147 additions and 88 deletions

View File

@@ -57,13 +57,13 @@ func (r *queryResolver) rooflineHeatmap(
jobdata, err := metricdispatch.LoadData(job, []string{"flops_any", "mem_bw"}, []schema.MetricScope{schema.MetricScopeNode}, ctx, 0) jobdata, err := metricdispatch.LoadData(job, []string{"flops_any", "mem_bw"}, []schema.MetricScope{schema.MetricScopeNode}, ctx, 0)
if err != nil { if err != nil {
cclog.Errorf("Error while loading roofline metrics for job %d", job.ID) cclog.Warnf("Error while loading roofline metrics for job %d", job.ID)
return nil, err return nil, err
} }
flops_, membw_ := jobdata["flops_any"], jobdata["mem_bw"] flops_, membw_ := jobdata["flops_any"], jobdata["mem_bw"]
if flops_ == nil && membw_ == nil { if flops_ == nil && membw_ == nil {
cclog.Infof("rooflineHeatmap(): 'flops_any' or 'mem_bw' missing for job %d", job.ID) cclog.Warnf("rooflineHeatmap(): 'flops_any' or 'mem_bw' missing for job %d", job.ID)
continue continue
// return nil, fmt.Errorf("GRAPH/UTIL > 'flops_any' or 'mem_bw' missing for job %d", job.ID) // return nil, fmt.Errorf("GRAPH/UTIL > 'flops_any' or 'mem_bw' missing for job %d", job.ID)
} }

View File

@@ -97,8 +97,8 @@ func LoadData(job *schema.Job,
ms, err := GetMetricDataRepo(job.Cluster, job.SubCluster) ms, err := GetMetricDataRepo(job.Cluster, job.SubCluster)
if err != nil { if err != nil {
cclog.Errorf("failed to load job data from metric store for job %d (user: %s, project: %s): %s", cclog.Errorf("failed to access metricDataRepo for cluster %s-%s: %s",
job.JobID, job.User, job.Project, err.Error()) job.Cluster, job.SubCluster, err.Error())
return err, 0, 0 return err, 0, 0
} }
@@ -116,11 +116,11 @@ func LoadData(job *schema.Job,
jd, err = ms.LoadData(job, metrics, scopes, ctx, resolution) jd, err = ms.LoadData(job, metrics, scopes, ctx, resolution)
if err != nil { if err != nil {
if len(jd) != 0 { if len(jd) != 0 {
cclog.Warnf("partial error loading metrics from store for job %d (user: %s, project: %s): %s", cclog.Warnf("partial error loading metrics from store for job %d (user: %s, project: %s, cluster: %s-%s): %s",
job.JobID, job.User, job.Project, err.Error()) job.JobID, job.User, job.Project, job.Cluster, job.SubCluster, err.Error())
} else { } else {
cclog.Errorf("failed to load job data from metric store for job %d (user: %s, project: %s): %s", cclog.Warnf("failed to load job data from metric store for job %d (user: %s, project: %s, cluster: %s-%s): %s",
job.JobID, job.User, job.Project, err.Error()) job.JobID, job.User, job.Project, job.Cluster, job.SubCluster, err.Error())
return err, 0, 0 return err, 0, 0
} }
} }
@@ -129,8 +129,8 @@ func LoadData(job *schema.Job,
var jdTemp schema.JobData var jdTemp schema.JobData
jdTemp, err = archive.GetHandle().LoadJobData(job) jdTemp, err = archive.GetHandle().LoadJobData(job)
if err != nil { if err != nil {
cclog.Errorf("failed to load job data from archive for job %d (user: %s, project: %s): %s", cclog.Warnf("failed to load job data from archive for job %d (user: %s, project: %s, cluster: %s-%s): %s",
job.JobID, job.User, job.Project, err.Error()) job.JobID, job.User, job.Project, job.Cluster, job.SubCluster, err.Error())
return err, 0, 0 return err, 0, 0
} }
@@ -244,15 +244,15 @@ func LoadAverages(
ms, err := GetMetricDataRepo(job.Cluster, job.SubCluster) ms, err := GetMetricDataRepo(job.Cluster, job.SubCluster)
if err != nil { if err != nil {
cclog.Errorf("failed to load job data from metric store for job %d (user: %s, project: %s): %s", cclog.Errorf("failed to access metricDataRepo for cluster %s-%s: %s",
job.JobID, job.User, job.Project, err.Error()) job.Cluster, job.SubCluster, err.Error())
return err return err
} }
stats, err := ms.LoadStats(job, metrics, ctx) stats, err := ms.LoadStats(job, metrics, ctx)
if err != nil { if err != nil {
cclog.Errorf("failed to load statistics from metric store for job %d (user: %s, project: %s): %s", cclog.Warnf("failed to load statistics from metric store for job %d (user: %s, project: %s, cluster: %s-%s): %s",
job.JobID, job.User, job.Project, err.Error()) job.JobID, job.User, job.Project, job.Cluster, job.SubCluster, err.Error())
return err return err
} }
@@ -288,15 +288,15 @@ func LoadScopedJobStats(
ms, err := GetMetricDataRepo(job.Cluster, job.SubCluster) ms, err := GetMetricDataRepo(job.Cluster, job.SubCluster)
if err != nil { if err != nil {
cclog.Errorf("failed to load job data from metric store for job %d (user: %s, project: %s): %s", cclog.Errorf("failed to access metricDataRepo for cluster %s-%s: %s",
job.JobID, job.User, job.Project, err.Error()) job.Cluster, job.SubCluster, err.Error())
return nil, err return nil, err
} }
scopedStats, err := ms.LoadScopedStats(job, metrics, scopes, ctx) scopedStats, err := ms.LoadScopedStats(job, metrics, scopes, ctx)
if err != nil { if err != nil {
cclog.Errorf("failed to load scoped statistics from metric store for job %d (user: %s, project: %s): %s", cclog.Warnf("failed to load scoped statistics from metric store for job %d (user: %s, project: %s, cluster: %s-%s): %s",
job.JobID, job.User, job.Project, err.Error()) job.JobID, job.User, job.Project, job.Cluster, job.SubCluster, err.Error())
return nil, err return nil, err
} }
@@ -320,8 +320,8 @@ func LoadJobStats(
ms, err := GetMetricDataRepo(job.Cluster, job.SubCluster) ms, err := GetMetricDataRepo(job.Cluster, job.SubCluster)
if err != nil { if err != nil {
cclog.Errorf("failed to load job data from metric store for job %d (user: %s, project: %s): %s", cclog.Errorf("failed to access metricDataRepo for cluster %s-%s: %s",
job.JobID, job.User, job.Project, err.Error()) job.Cluster, job.SubCluster, err.Error())
return nil, err return nil, err
} }
@@ -329,8 +329,8 @@ func LoadJobStats(
stats, err := ms.LoadStats(job, metrics, ctx) stats, err := ms.LoadStats(job, metrics, ctx)
if err != nil { if err != nil {
cclog.Errorf("failed to load statistics from metric store for job %d (user: %s, project: %s): %s", cclog.Warnf("failed to load statistics from metric store for job %d (user: %s, project: %s, cluster: %s-%s): %s",
job.JobID, job.User, job.Project, err.Error()) job.JobID, job.User, job.Project, job.Cluster, job.SubCluster, err.Error())
return data, err return data, err
} }
@@ -379,8 +379,8 @@ func LoadNodeData(
ms, err := GetMetricDataRepo(cluster, "") ms, err := GetMetricDataRepo(cluster, "")
if err != nil { if err != nil {
cclog.Errorf("failed to load node data from metric store: %s", cclog.Errorf("failed to access metricDataRepo for cluster %s: %s",
err.Error()) cluster, err.Error())
return nil, err return nil, err
} }
@@ -389,7 +389,7 @@ func LoadNodeData(
if len(data) != 0 { if len(data) != 0 {
cclog.Warnf("partial error loading node data from metric store for cluster %s: %s", cluster, err.Error()) cclog.Warnf("partial error loading node data from metric store for cluster %s: %s", cluster, err.Error())
} else { } else {
cclog.Errorf("failed to load node data from metric store for cluster %s: %s", cluster, err.Error()) cclog.Warnf("failed to load node data from metric store for cluster %s: %s", cluster, err.Error())
return nil, err return nil, err
} }
} }
@@ -423,8 +423,8 @@ func LoadNodeListData(
ms, err := GetMetricDataRepo(cluster, subCluster) ms, err := GetMetricDataRepo(cluster, subCluster)
if err != nil { if err != nil {
cclog.Errorf("failed to load node data from metric store: %s", cclog.Errorf("failed to access metricDataRepo for cluster %s-%s: %s",
err.Error()) cluster, subCluster, err.Error())
return nil, err return nil, err
} }
@@ -434,7 +434,7 @@ func LoadNodeListData(
cclog.Warnf("partial error loading node list data from metric store for cluster %s, subcluster %s: %s", cclog.Warnf("partial error loading node list data from metric store for cluster %s, subcluster %s: %s",
cluster, subCluster, err.Error()) cluster, subCluster, err.Error())
} else { } else {
cclog.Errorf("failed to load node list data from metric store for cluster %s, subcluster %s: %s", cclog.Warnf("failed to load node list data from metric store for cluster %s, subcluster %s: %s",
cluster, subCluster, err.Error()) cluster, subCluster, err.Error())
return nil, err return nil, err
} }

View File

@@ -329,7 +329,7 @@ func (ccms *CCMetricStore) LoadStats(
metric := query.Metric metric := query.Metric
data := res[0] data := res[0]
if data.Error != nil { if data.Error != nil {
cclog.Errorf("fetching %s for node %s failed: %s", metric, query.Hostname, *data.Error) cclog.Warnf("fetching %s for node %s failed: %s", metric, query.Hostname, *data.Error)
continue continue
} }
@@ -556,7 +556,7 @@ func (ccms *CCMetricStore) LoadNodeListData(
) (map[string]schema.JobData, error) { ) (map[string]schema.JobData, error) {
queries, assignedScope, err := ccms.buildNodeQueries(cluster, subCluster, nodes, metrics, scopes, resolution) queries, assignedScope, err := ccms.buildNodeQueries(cluster, subCluster, nodes, metrics, scopes, resolution)
if err != nil { if err != nil {
cclog.Errorf("Error while building node queries for Cluster %s, SubCLuster %s, Metrics %v, Scopes %v: %s", cluster, subCluster, metrics, scopes, err.Error()) cclog.Errorf("Error while building node queries for Cluster %s, SubCluster %s, Metrics %v, Scopes %v: %s", cluster, subCluster, metrics, scopes, err.Error())
return nil, err return nil, err
} }

View File

@@ -68,8 +68,8 @@ func RegisterFootprintWorker() {
ms, err := metricdispatch.GetMetricDataRepo(job.Cluster, job.SubCluster) ms, err := metricdispatch.GetMetricDataRepo(job.Cluster, job.SubCluster)
if err != nil { if err != nil {
cclog.Errorf("failed to load job data from metric store for job %d (user: %s, project: %s): %s", cclog.Errorf("failed to access metricDataRepo for cluster %s-%s: %s",
job.JobID, job.User, job.Project, err.Error()) job.Cluster, job.SubCluster, err.Error())
continue continue
} }

View File

@@ -342,7 +342,7 @@
<b>Disabled Metric</b> <b>Disabled Metric</b>
</CardHeader> </CardHeader>
<CardBody> <CardBody>
<p>Metric <b>{item.metric}</b> is disabled for subcluster <b>{$initq.data.job.subCluster}</b>.</p> <p>Metric <b>{item.metric}</b> is disabled for cluster <b>{$initq.data.job.cluster}:{$initq.data.job.subCluster}</b>.</p>
<p class="mb-1">To remove this card, open metric selection and press "Close and Apply".</p> <p class="mb-1">To remove this card, open metric selection and press "Close and Apply".</p>
</CardBody> </CardBody>
</Card> </Card>
@@ -352,7 +352,8 @@
<b>Missing Metric</b> <b>Missing Metric</b>
</CardHeader> </CardHeader>
<CardBody> <CardBody>
<p class="mb-1">No dataset returned for <b>{item.metric}</b>.</p> <p>No dataset returned for <b>{item.metric}</b>.</p>
<p class="mb-1">Metric was not found in metric store for cluster <b>{$initq.data.job.cluster}</b>.</p>
</CardBody> </CardBody>
</Card> </Card>
{/if} {/if}

View File

@@ -22,6 +22,8 @@
Icon, Icon,
Spinner, Spinner,
Card, Card,
CardHeader,
CardBody
} from "@sveltestrap/sveltestrap"; } from "@sveltestrap/sveltestrap";
import { import {
queryStore, queryStore,
@@ -254,12 +256,15 @@
></Card ></Card
> >
{:else} {:else}
<Card <Card color="warning" class="mx-2">
style="margin-left: 2rem;margin-right: 2rem;" <CardHeader class="mb-0">
body <b>Missing Metric</b>
color="warning" </CardHeader>
>No dataset returned for <code>{item.name}</code></Card <CardBody>
> <p>No dataset returned for <b>{item.name}</b>.</p>
<p class="mb-1">Metric was not found in metric store for cluster <b>{cluster}</b>.</p>
</CardBody>
</Card>
{/if} {/if}
{/snippet} {/snippet}

View File

@@ -229,7 +229,10 @@
></Card ></Card
> >
{:else} {:else}
<Card body color="warning">No dataset returned</Card> <Card body class="mx-2" color="warning">
<p>No dataset returned for <b>{metrics[i]}</b></p>
<p class="mb-1">Metric was not found in metric store for cluster <b>{job.cluster}</b>.</p>
</Card>
{/if} {/if}
</td> </td>
{/each} {/each}

View File

@@ -27,7 +27,7 @@
import uPlot from "uplot"; import uPlot from "uplot";
import { formatNumber, formatDurationTime } from "../units.js"; import { formatNumber, formatDurationTime } from "../units.js";
import { getContext, onMount, onDestroy } from "svelte"; import { getContext, onMount, onDestroy } from "svelte";
import { Card } from "@sveltestrap/sveltestrap"; import { Card, CardBody, CardHeader } from "@sveltestrap/sveltestrap";
/* Svelte 5 Props */ /* Svelte 5 Props */
let { let {
@@ -633,7 +633,13 @@
style="background-color: {backgroundColor()};" class={forNode ? 'py-2 rounded' : 'rounded'} style="background-color: {backgroundColor()};" class={forNode ? 'py-2 rounded' : 'rounded'}
></div> ></div>
{:else} {:else}
<Card body color="warning" class="mx-4" <Card color="warning" class={forNode ? 'mx-2' : 'mt-2'}>
>Cannot render plot: No series data returned for <code>{metric}</code></Card <CardHeader class="mb-0">
> <b>Empty Metric</b>
</CardHeader>
<CardBody>
<p>Cannot render plot for <b>{metric}</b>.</p>
<p class="mb-1">Metric found but returned without timeseries data.</p>
</CardBody>
</Card>
{/if} {/if}

View File

@@ -55,6 +55,7 @@
function setupAvailable(data) { function setupAvailable(data) {
let pendingAvailable = {}; let pendingAvailable = {};
if (data) { if (data) {
// Returns Only For Available Metrics
for (let d of data) { for (let d of data) {
if (!pendingAvailable[d.name]) { if (!pendingAvailable[d.name]) {
pendingAvailable[d.name] = [d.scope] pendingAvailable[d.name] = [d.scope]
@@ -90,6 +91,8 @@
pendingTableData[host] = {}; pendingTableData[host] = {};
}; };
for (const metric of sm) { for (const metric of sm) {
// Only Returned, Available Metrics
if (as[metric]) {
if (!pendingTableData[host][metric]) { if (!pendingTableData[host][metric]) {
pendingTableData[host][metric] = {}; pendingTableData[host][metric] = {};
}; };
@@ -101,6 +104,7 @@
}; };
}; };
}; };
};
return pendingTableData; return pendingTableData;
} }
@@ -136,6 +140,7 @@
<th></th> <th></th>
{#each selectedMetrics as metric} {#each selectedMetrics as metric}
<!-- To Match Row-2 Header Field Count--> <!-- To Match Row-2 Header Field Count-->
{#if availableScopes[metric]}
<th colspan={selectedScopes[metric] == "node" ? 3 : 4}> <th colspan={selectedScopes[metric] == "node" ? 3 : 4}>
<InputGroup> <InputGroup>
<InputGroupText> <InputGroupText>
@@ -148,12 +153,22 @@
</Input> </Input>
</InputGroup> </InputGroup>
</th> </th>
{:else}
<th>
<InputGroup>
<InputGroupText>
{metric}
</InputGroupText>
</InputGroup>
</th>
{/if}
{/each} {/each}
</tr> </tr>
<!-- Header Row 2: Fields --> <!-- Header Row 2: Fields -->
<tr> <tr>
<th>Node</th> <th>Node</th>
{#each selectedMetrics as metric} {#each selectedMetrics as metric}
{#if availableScopes[metric]}
{#if selectedScopes[metric] != "node"} {#if selectedScopes[metric] != "node"}
<th>Id</th> <th>Id</th>
{/if} {/if}
@@ -170,6 +185,11 @@
{/if} {/if}
</th> </th>
{/each} {/each}
{:else}
<th class="table-warning">
Missing Metric
</th>
{/if}
{/each} {/each}
</tr> </tr>
</thead> </thead>
@@ -178,10 +198,17 @@
<tr> <tr>
<th scope="col">{host}</th> <th scope="col">{host}</th>
{#each selectedMetrics as metric (metric)} {#each selectedMetrics as metric (metric)}
{#if tableData[host][metric]}
<StatsTableEntry <StatsTableEntry
data={tableData[host][metric][selectedScopes[metric]]} data={tableData[host][metric][selectedScopes[metric]]}
scope={selectedScopes[metric]} scope={selectedScopes[metric]}
/> />
{:else}
<td class="table-warning" style="max-width:10rem;">
<p>No dataset returned for <b>{metric}</b>.</p>
<p>Metric was not found in metric store for cluster.</p>
</td>
{/if}
{/each} {/each}
</tr> </tr>
{/each} {/each}

View File

@@ -14,7 +14,7 @@
<script> <script>
import { getContext } from "svelte"; import { getContext } from "svelte";
import { queryStore, gql, getContextClient } from "@urql/svelte"; import { queryStore, gql, getContextClient } from "@urql/svelte";
import { Row, Col, Card, Spinner, Badge } from "@sveltestrap/sveltestrap"; import { Row, Col, Card, CardHeader, CardBody, Spinner, Badge } from "@sveltestrap/sveltestrap";
import { checkMetricDisabled } from "../generic/utils.js"; import { checkMetricDisabled } from "../generic/utils.js";
import MetricPlot from "../generic/plots/MetricPlot.svelte"; import MetricPlot from "../generic/plots/MetricPlot.svelte";
@@ -189,4 +189,16 @@
{/each} {/each}
{/key} {/key}
</Row> </Row>
{:else}
<Row>
<Card color="warning">
<CardHeader class="mb-0">
<b>Missing Metric</b>
</CardHeader>
<CardBody>
<p>No dataset returned for <b>{selectedMetric}</b>.</p>
<p class="mb-1">Metric was not found in metric store for cluster <b>{cluster}</b>.</p>
</CardBody>
</Card>
</Row>
{/if} {/if}

View File

@@ -171,13 +171,18 @@
{#key metricData} {#key metricData}
<td> <td>
{#if metricData?.disabled} {#if metricData?.disabled}
<Card body class="mx-3" color="info" <Card body class="mx-2" color="info"
>Metric disabled for subcluster <code >Metric <b>{selectedMetrics[i]}</b> disabled for subcluster <code
>{metricData?.data?.name ? metricData.data.name : `Metric Index ${i}`}:{nodeData.subCluster}</code >{nodeData.subCluster}</code
></Card ></Card
> >
{:else if !metricData?.data}
<Card body class="mx-2" color="warning">
<p>No dataset returned for <b>{selectedMetrics[i]}</b></p>
<p class="mb-1">Metric was not found in metric store for cluster <b>{cluster}</b>.</p>
</Card>
{:else if !metricData?.data?.name} {:else if !metricData?.data?.name}
<Card body class="mx-3" color="warning" <Card body class="mx-2" color="warning"
>Metric without name for subcluster <code >Metric without name for subcluster <code
>{`Metric Index ${i}`}:{nodeData.subCluster}</code >{`Metric Index ${i}`}:{nodeData.subCluster}</code
></Card ></Card