diff --git a/internal/graph/schema.resolvers.go b/internal/graph/schema.resolvers.go index 9bc8811d..d7d6b675 100644 --- a/internal/graph/schema.resolvers.go +++ b/internal/graph/schema.resolvers.go @@ -23,6 +23,7 @@ import ( "github.com/ClusterCockpit/cc-backend/internal/repository" "github.com/ClusterCockpit/cc-backend/pkg/archive" cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger" + ccunit "github.com/ClusterCockpit/cc-lib/v2/ccUnits" "github.com/ClusterCockpit/cc-lib/v2/schema" ) @@ -938,15 +939,21 @@ func (r *queryResolver) ClusterMetrics(ctx context.Context, cluster string, metr } for metricName, data := range collectorData { - cu := collectorUnit[metricName] + // use ccUnits for backend normalization to "Tera" + p_old := ccunit.NewPrefix(collectorUnit[metricName].Prefix) + p_new := ccunit.NewPrefix("T") + convFunc := ccunit.GetPrefixPrefixFactor(p_old, p_new) + u_new := schema.Unit{Prefix: p_new.Prefix(), Base: collectorUnit[metricName].Base} + roundedData := make([]schema.Float, 0) - for _, val := range data { - roundedData = append(roundedData, schema.Float((math.Round(float64(val)*100.0) / 100.0))) + for _, v_old := range data { + v_new := math.Round(convFunc(float64(v_old)).(float64)*100.0) / 100.0 + roundedData = append(roundedData, schema.Float(v_new)) } cm := model.ClusterMetricWithName{ Name: metricName, - Unit: &cu, + Unit: &u_new, Timestep: collectorTimestep[metricName], Data: roundedData, } diff --git a/internal/graph/util.go b/internal/graph/util.go index 4135ca72..dd5e388f 100644 --- a/internal/graph/util.go +++ b/internal/graph/util.go @@ -57,13 +57,13 @@ func (r *queryResolver) rooflineHeatmap( jobdata, err := metricdispatch.LoadData(job, []string{"flops_any", "mem_bw"}, []schema.MetricScope{schema.MetricScopeNode}, ctx, 0) if err != nil { - cclog.Errorf("Error while loading roofline metrics for job %d", job.ID) + cclog.Warnf("Error while loading roofline metrics for job %d", job.ID) return nil, err } flops_, membw_ := jobdata["flops_any"], 
jobdata["mem_bw"] if flops_ == nil && membw_ == nil { - cclog.Infof("rooflineHeatmap(): 'flops_any' or 'mem_bw' missing for job %d", job.ID) + cclog.Warnf("rooflineHeatmap(): 'flops_any' or 'mem_bw' missing for job %d", job.ID) continue // return nil, fmt.Errorf("GRAPH/UTIL > 'flops_any' or 'mem_bw' missing for job %d", job.ID) } diff --git a/internal/metricdispatch/dataLoader.go b/internal/metricdispatch/dataLoader.go index 43a6d92b..09a8ac09 100644 --- a/internal/metricdispatch/dataLoader.go +++ b/internal/metricdispatch/dataLoader.go @@ -97,8 +97,8 @@ func LoadData(job *schema.Job, ms, err := GetMetricDataRepo(job.Cluster, job.SubCluster) if err != nil { - cclog.Errorf("failed to load job data from metric store for job %d (user: %s, project: %s): %s", - job.JobID, job.User, job.Project, err.Error()) + cclog.Errorf("failed to access metricDataRepo for cluster %s-%s: %s", + job.Cluster, job.SubCluster, err.Error()) return err, 0, 0 } @@ -116,11 +116,11 @@ func LoadData(job *schema.Job, jd, err = ms.LoadData(job, metrics, scopes, ctx, resolution) if err != nil { if len(jd) != 0 { - cclog.Warnf("partial error loading metrics from store for job %d (user: %s, project: %s): %s", - job.JobID, job.User, job.Project, err.Error()) + cclog.Warnf("partial error loading metrics from store for job %d (user: %s, project: %s, cluster: %s-%s): %s", + job.JobID, job.User, job.Project, job.Cluster, job.SubCluster, err.Error()) } else { - cclog.Errorf("failed to load job data from metric store for job %d (user: %s, project: %s): %s", - job.JobID, job.User, job.Project, err.Error()) + cclog.Warnf("failed to load job data from metric store for job %d (user: %s, project: %s, cluster: %s-%s): %s", + job.JobID, job.User, job.Project, job.Cluster, job.SubCluster, err.Error()) return err, 0, 0 } } @@ -129,8 +129,8 @@ func LoadData(job *schema.Job, var jdTemp schema.JobData jdTemp, err = archive.GetHandle().LoadJobData(job) if err != nil { - cclog.Errorf("failed to load job data from 
archive for job %d (user: %s, project: %s): %s", - job.JobID, job.User, job.Project, err.Error()) + cclog.Warnf("failed to load job data from archive for job %d (user: %s, project: %s, cluster: %s-%s): %s", + job.JobID, job.User, job.Project, job.Cluster, job.SubCluster, err.Error()) return err, 0, 0 } @@ -244,15 +244,15 @@ func LoadAverages( ms, err := GetMetricDataRepo(job.Cluster, job.SubCluster) if err != nil { - cclog.Errorf("failed to load job data from metric store for job %d (user: %s, project: %s): %s", - job.JobID, job.User, job.Project, err.Error()) + cclog.Errorf("failed to access metricDataRepo for cluster %s-%s: %s", + job.Cluster, job.SubCluster, err.Error()) return err } stats, err := ms.LoadStats(job, metrics, ctx) if err != nil { - cclog.Errorf("failed to load statistics from metric store for job %d (user: %s, project: %s): %s", - job.JobID, job.User, job.Project, err.Error()) + cclog.Warnf("failed to load statistics from metric store for job %d (user: %s, project: %s, cluster: %s-%s): %s", + job.JobID, job.User, job.Project, job.Cluster, job.SubCluster, err.Error()) return err } @@ -288,15 +288,15 @@ func LoadScopedJobStats( ms, err := GetMetricDataRepo(job.Cluster, job.SubCluster) if err != nil { - cclog.Errorf("failed to load job data from metric store for job %d (user: %s, project: %s): %s", - job.JobID, job.User, job.Project, err.Error()) + cclog.Errorf("failed to access metricDataRepo for cluster %s-%s: %s", + job.Cluster, job.SubCluster, err.Error()) return nil, err } scopedStats, err := ms.LoadScopedStats(job, metrics, scopes, ctx) if err != nil { - cclog.Errorf("failed to load scoped statistics from metric store for job %d (user: %s, project: %s): %s", - job.JobID, job.User, job.Project, err.Error()) + cclog.Warnf("failed to load scoped statistics from metric store for job %d (user: %s, project: %s, cluster: %s-%s): %s", + job.JobID, job.User, job.Project, job.Cluster, job.SubCluster, err.Error()) return nil, err } @@ -320,8 +320,8 @@ 
func LoadJobStats( ms, err := GetMetricDataRepo(job.Cluster, job.SubCluster) if err != nil { - cclog.Errorf("failed to load job data from metric store for job %d (user: %s, project: %s): %s", - job.JobID, job.User, job.Project, err.Error()) + cclog.Errorf("failed to access metricDataRepo for cluster %s-%s: %s", + job.Cluster, job.SubCluster, err.Error()) return nil, err } @@ -329,8 +329,8 @@ func LoadJobStats( stats, err := ms.LoadStats(job, metrics, ctx) if err != nil { - cclog.Errorf("failed to load statistics from metric store for job %d (user: %s, project: %s): %s", - job.JobID, job.User, job.Project, err.Error()) + cclog.Warnf("failed to load statistics from metric store for job %d (user: %s, project: %s, cluster: %s-%s): %s", + job.JobID, job.User, job.Project, job.Cluster, job.SubCluster, err.Error()) return data, err } @@ -379,8 +379,8 @@ func LoadNodeData( ms, err := GetMetricDataRepo(cluster, "") if err != nil { - cclog.Errorf("failed to load node data from metric store: %s", - err.Error()) + cclog.Errorf("failed to access metricDataRepo for cluster %s: %s", + cluster, err.Error()) return nil, err } @@ -389,7 +389,7 @@ func LoadNodeData( if len(data) != 0 { cclog.Warnf("partial error loading node data from metric store for cluster %s: %s", cluster, err.Error()) } else { - cclog.Errorf("failed to load node data from metric store for cluster %s: %s", cluster, err.Error()) + cclog.Warnf("failed to load node data from metric store for cluster %s: %s", cluster, err.Error()) return nil, err } } @@ -423,8 +423,8 @@ func LoadNodeListData( ms, err := GetMetricDataRepo(cluster, subCluster) if err != nil { - cclog.Errorf("failed to load node data from metric store: %s", - err.Error()) + cclog.Errorf("failed to access metricDataRepo for cluster %s-%s: %s", + cluster, subCluster, err.Error()) return nil, err } @@ -434,7 +434,7 @@ func LoadNodeListData( cclog.Warnf("partial error loading node list data from metric store for cluster %s, subcluster %s: %s", cluster, 
subCluster, err.Error()) } else { - cclog.Errorf("failed to load node list data from metric store for cluster %s, subcluster %s: %s", + cclog.Warnf("failed to load node list data from metric store for cluster %s, subcluster %s: %s", cluster, subCluster, err.Error()) return nil, err } diff --git a/internal/metricstoreclient/cc-metric-store.go b/internal/metricstoreclient/cc-metric-store.go index ded644ea..aadbe1b1 100644 --- a/internal/metricstoreclient/cc-metric-store.go +++ b/internal/metricstoreclient/cc-metric-store.go @@ -329,7 +329,7 @@ func (ccms *CCMetricStore) LoadStats( metric := query.Metric data := res[0] if data.Error != nil { - cclog.Errorf("fetching %s for node %s failed: %s", metric, query.Hostname, *data.Error) + cclog.Warnf("fetching %s for node %s failed: %s", metric, query.Hostname, *data.Error) continue } @@ -556,7 +556,7 @@ func (ccms *CCMetricStore) LoadNodeListData( ) (map[string]schema.JobData, error) { queries, assignedScope, err := ccms.buildNodeQueries(cluster, subCluster, nodes, metrics, scopes, resolution) if err != nil { - cclog.Errorf("Error while building node queries for Cluster %s, SubCLuster %s, Metrics %v, Scopes %v: %s", cluster, subCluster, metrics, scopes, err.Error()) + cclog.Errorf("Error while building node queries for Cluster %s, SubCluster %s, Metrics %v, Scopes %v: %s", cluster, subCluster, metrics, scopes, err.Error()) return nil, err } diff --git a/internal/repository/stats.go b/internal/repository/stats.go index 851a4ca1..af764d46 100644 --- a/internal/repository/stats.go +++ b/internal/repository/stats.go @@ -38,7 +38,7 @@ // - All queries use prepared statements via stmtCache // - Complex aggregations use SQL for efficiency // - Histogram pre-initialization ensures consistent bin ranges -// - Metric histogram queries limited to 500 jobs for running job analysis +// - Metric histogram queries limited to 5000 jobs for running job analysis package repository @@ -686,7 +686,7 @@ func (r *JobRepository) AddHistograms( // 
- Pre-initialized with zeros for consistent visualization // // Limitations: -// - Running jobs: Limited to 500 jobs for performance +// - Running jobs: Limited to 5000 jobs for performance // - Requires valid cluster configuration with metric peak values // - Uses footprint statistic (avg/max/min) configured per metric func (r *JobRepository) AddMetricHistograms( @@ -995,12 +995,12 @@ func (r *JobRepository) jobsMetricStatisticsHistogram( // Returns slice of MetricHistoPoints, one per metric. // // Limitations: -// - Maximum 500 jobs (returns nil if more jobs match) +// - Maximum 5000 jobs (returns nil if more jobs match) // - Requires metric backend availability // - Bins based on metric peak values from cluster configuration // // Algorithm: -// 1. Query first 501 jobs to check count limit +// 1. Query first 5001 jobs to check count limit // 2. Load metric averages for all jobs via metricdispatch // 3. For each metric, create bins based on peak value // 4. Iterate averages and count jobs per bin @@ -1011,13 +1011,13 @@ func (r *JobRepository) runningJobsMetricStatisticsHistogram( bins *int, ) []*model.MetricHistoPoints { // Get Jobs - jobs, err := r.QueryJobs(ctx, filters, &model.PageRequest{Page: 1, ItemsPerPage: 500 + 1}, nil) + jobs, err := r.QueryJobs(ctx, filters, &model.PageRequest{Page: 1, ItemsPerPage: 5000 + 1}, nil) if err != nil { cclog.Errorf("Error while querying jobs for footprint: %s", err) return nil } - if len(jobs) > 500 { - cclog.Errorf("too many jobs matched (max: %d)", 500) + if len(jobs) > 5000 { + cclog.Errorf("too many jobs matched (max: %d)", 5000) return nil } diff --git a/internal/taskmanager/updateFootprintService.go b/internal/taskmanager/updateFootprintService.go index 71bf4089..65f4c229 100644 --- a/internal/taskmanager/updateFootprintService.go +++ b/internal/taskmanager/updateFootprintService.go @@ -68,8 +68,8 @@ func RegisterFootprintWorker() { ms, err := metricdispatch.GetMetricDataRepo(job.Cluster, job.SubCluster) if err != 
nil { - cclog.Errorf("failed to load job data from metric store for job %d (user: %s, project: %s): %s", - job.JobID, job.User, job.Project, err.Error()) + cclog.Errorf("failed to access metricDataRepo for cluster %s-%s: %s", + job.Cluster, job.SubCluster, err.Error()) continue } diff --git a/pkg/metricstore/api.go b/pkg/metricstore/api.go index f0f3bb3c..21f8db0c 100644 --- a/pkg/metricstore/api.go +++ b/pkg/metricstore/api.go @@ -13,13 +13,14 @@ import ( "fmt" "math" + cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger" "github.com/ClusterCockpit/cc-lib/v2/schema" "github.com/ClusterCockpit/cc-lib/v2/util" ) var ( // ErrNoHostOrMetric is returned when the metric store does not find the host or the metric - ErrNoHostOrMetric error = errors.New("[METRICSTORE]> [METRICSTORE]> metric or host not found") + ErrNoHostOrMetric error = errors.New("[METRICSTORE]> metric or host not found") // ErrInvalidTimeRange is returned when a query has 'from' >= 'to' ErrInvalidTimeRange = errors.New("[METRICSTORE]> invalid time range: 'from' must be before 'to'") // ErrEmptyCluster is returned when a query with ForAllNodes has no cluster specified @@ -280,20 +281,16 @@ func FetchData(req APIQueryRequest) (*APIQueryResponse, error) { data.Data, data.From, data.To, data.Resolution, err = ms.Read(sel, query.Metric, req.From, req.To, query.Resolution) if err != nil { - // Check a special case where only the metric or host. - // Dont send errors, instead just send empty array - // where frontend already renders error for empty array. 
- if err == ErrNoHostOrMetric { - data.Data = make([]schema.Float, 0) - data.From = req.From - data.To = req.To - data.Resolution = query.Resolution - } else { + // Skip Error If Just Missing Host or Metric, Continue + // Empty Return For Metric Handled Gracefully By Frontend + if err != ErrNoHostOrMetric { msg := err.Error() data.Error = &msg res = append(res, data) - continue + } else { + cclog.Warnf("failed to fetch '%s' from host '%s' (cluster: %s): %s", query.Metric, query.Hostname, req.Cluster, err.Error()) } + continue } if req.WithStats { diff --git a/pkg/metricstore/query.go b/pkg/metricstore/query.go index 62216e59..e5a49af3 100644 --- a/pkg/metricstore/query.go +++ b/pkg/metricstore/query.go @@ -104,6 +104,11 @@ func (ccms *InternalMetricStore) LoadData( var errors []string jobData := make(schema.JobData) for i, row := range resBody.Results { + if len(row) == 0 { + // No Data Found For Metric, Logged in FetchData to Warn + continue + } + query := req.Queries[i] metric := query.Metric scope := assignedScope[i] @@ -229,7 +234,7 @@ func buildQueries( for _, metric := range metrics { mc := archive.GetMetricConfig(job.Cluster, metric) if mc == nil { - cclog.Infof("metric '%s' is not specified for cluster '%s'", metric, job.Cluster) + cclog.Warnf("metric '%s' is not specified for cluster '%s'", metric, job.Cluster) continue } @@ -535,11 +540,15 @@ func (ccms *InternalMetricStore) LoadStats( stats := make(map[string]map[string]schema.MetricStatistics, len(metrics)) for i, res := range resBody.Results { + if len(res) == 0 { + // No Data Found For Metric, Logged in FetchData to Warn + continue + } query := req.Queries[i] metric := query.Metric data := res[0] if data.Error != nil { - cclog.Errorf("fetching %s for node %s failed: %s", metric, query.Hostname, *data.Error) + cclog.Warnf("fetching %s for node %s failed: %s", metric, query.Hostname, *data.Error) continue } @@ -609,6 +618,10 @@ func (ccms *InternalMetricStore) LoadScopedStats( scopedJobStats := 
make(schema.ScopedJobStats) for i, row := range resBody.Results { + if len(row) == 0 { + // No Data Found For Metric, Logged in FetchData to Warn + continue + } query := req.Queries[i] metric := query.Metric scope := assignedScope[i] @@ -717,6 +730,11 @@ func (ccms *InternalMetricStore) LoadNodeData( var errors []string data := make(map[string]map[string][]*schema.JobMetric) for i, res := range resBody.Results { + if len(res) == 0 { + // No Data Found For Metric, Logged in FetchData to Warn + continue + } + var query APIQuery if resBody.Queries != nil { query = resBody.Queries[i] @@ -816,6 +834,10 @@ func (ccms *InternalMetricStore) LoadNodeListData( var errors []string data := make(map[string]schema.JobData) for i, row := range resBody.Results { + if len(row) == 0 { + // No Data Found For Metric, Logged in FetchData to Warn + continue + } var query APIQuery if resBody.Queries != nil { query = resBody.Queries[i] diff --git a/web/frontend/src/DashPublic.root.svelte b/web/frontend/src/DashPublic.root.svelte index 9c17e7d8..f66a6435 100644 --- a/web/frontend/src/DashPublic.root.svelte +++ b/web/frontend/src/DashPublic.root.svelte @@ -16,6 +16,7 @@ } from "./generic/utils.js"; import { formatNumber, + scaleNumber } from "./generic/units.js"; import { Row, @@ -222,8 +223,10 @@ else rawInfos['totalAccs'] += (subCluster?.numberOfNodes * subCluster?.topology?.accelerators?.length) || 0; // Units (Set Once) - if (!rawInfos['flopRateUnit']) rawInfos['flopRateUnit'] = subCluster.flopRateSimd.unit.prefix + subCluster.flopRateSimd.unit.base - if (!rawInfos['memBwRateUnit']) rawInfos['memBwRateUnit'] = subCluster.memoryBandwidth.unit.prefix + subCluster.memoryBandwidth.unit.base + if (!rawInfos['flopRateUnitBase']) rawInfos['flopRateUnitBase'] = subCluster.flopRateSimd.unit.base + if (!rawInfos['memBwRateUnitBase']) rawInfos['memBwRateUnitBase'] = subCluster.memoryBandwidth.unit.base + if (!rawInfos['flopRateUnitPrefix']) rawInfos['flopRateUnitPrefix'] = 
subCluster.flopRateSimd.unit.prefix + if (!rawInfos['memBwRateUnitPrefix']) rawInfos['memBwRateUnitPrefix'] = subCluster.memoryBandwidth.unit.prefix // Get Maxima For Roofline Knee Render if (!rawInfos['roofData']) { @@ -239,10 +242,14 @@ } } - // Get Idle Infos after Sums + // Get Simple Idle Infos after Sums by Diff if (!rawInfos['idleNodes']) rawInfos['idleNodes'] = rawInfos['totalNodes'] - rawInfos['allocatedNodes']; if (!rawInfos['idleCores']) rawInfos['idleCores'] = rawInfos['totalCores'] - rawInfos['allocatedCores']; if (!rawInfos['idleAccs']) rawInfos['idleAccs'] = rawInfos['totalAccs'] - rawInfos['allocatedAccs']; + // Cap at 0 (Negative hints towards Config <> Reality Mismatch!) + if (rawInfos['idleNodes'] < 0) rawInfos['idleNodes'] = 0; + if (rawInfos['idleCores'] < 0) rawInfos['idleCores'] = 0; + if (rawInfos['idleAccs'] < 0) rawInfos['idleAccs'] = 0; // Keymetrics (Data on Cluster-Scope) let rawFlops = $statusQuery?.data?.nodeMetrics?.reduce((sum, node) => @@ -262,20 +269,20 @@ 0, // Initial Value ) || 0; rawInfos['cpuPwr'] = Math.floor((rawCpuPwr * 100) / 100) - if (!rawInfos['cpuPwrUnit']) { - let rawCpuUnit = $statusQuery?.data?.nodeMetrics[0]?.metrics.find((m) => m.name == 'cpu_power')?.metric?.unit || null - rawInfos['cpuPwrUnit'] = rawCpuUnit ? rawCpuUnit.prefix + rawCpuUnit.base : '' - } + + let rawCpuUnit = $statusQuery?.data?.nodeMetrics[0]?.metrics.find((m) => m.name == 'cpu_power')?.metric?.unit || null + if (!rawInfos['cpuPwrUnitBase']) rawInfos['cpuPwrUnitBase'] = rawCpuUnit ? rawCpuUnit.base : '' + if (!rawInfos['cpuPwrUnitPrefix']) rawInfos['cpuPwrUnitPrefix'] = rawCpuUnit ? 
rawCpuUnit.prefix : '' let rawGpuPwr = $statusQuery?.data?.nodeMetrics?.reduce((sum, node) => sum + (node.metrics.find((m) => m.name == 'acc_power')?.metric?.series[0]?.statistics?.avg || 0), 0, // Initial Value ) || 0; rawInfos['gpuPwr'] = Math.floor((rawGpuPwr * 100) / 100) - if (!rawInfos['gpuPwrUnit']) { - let rawGpuUnit = $statusQuery?.data?.nodeMetrics[0]?.metrics.find((m) => m.name == 'acc_power')?.metric?.unit || null - rawInfos['gpuPwrUnit'] = rawGpuUnit ? rawGpuUnit.prefix + rawGpuUnit.base : '' - } + + let rawGpuUnit = $statusQuery?.data?.nodeMetrics[0]?.metrics.find((m) => m.name == 'acc_power')?.metric?.unit || null + if (!rawInfos['gpuPwrUnitBase']) rawInfos['gpuPwrUnitBase'] = rawGpuUnit ? rawGpuUnit.base : '' + if (!rawInfos['gpuPwrUnitPrefix']) rawInfos['gpuPwrUnitPrefix'] = rawGpuUnit ? rawGpuUnit.prefix : '' } return rawInfos; }); @@ -443,7 +450,7 @@ - {clusterInfo?.flopRate} {clusterInfo?.flopRateUnit} + {scaleNumber(clusterInfo?.flopRate, clusterInfo?.flopRateUnitPrefix)}{clusterInfo?.flopRateUnitBase}
Total Flop Rate @@ -451,7 +458,7 @@ - {clusterInfo?.memBwRate} {clusterInfo?.memBwRateUnit} + {scaleNumber(clusterInfo?.memBwRate, clusterInfo?.memBwRateUnitPrefix)}{clusterInfo?.memBwRateUnitBase}
Total Memory Bandwidth @@ -460,7 +467,7 @@ {#if clusterInfo?.totalAccs !== 0} - {clusterInfo?.gpuPwr} {clusterInfo?.gpuPwrUnit} + {scaleNumber(clusterInfo?.gpuPwr, clusterInfo?.gpuPwrUnitPrefix)}{clusterInfo?.gpuPwrUnitBase}
Total GPU Power @@ -469,7 +476,7 @@ {:else} - {clusterInfo?.cpuPwr} {clusterInfo?.cpuPwrUnit} + {scaleNumber(clusterInfo?.cpuPwr, clusterInfo?.cpuPwrUnitPrefix)}{clusterInfo?.cpuPwrUnitBase}
Total CPU Power diff --git a/web/frontend/src/Job.root.svelte b/web/frontend/src/Job.root.svelte index 8cfceb96..a393995f 100644 --- a/web/frontend/src/Job.root.svelte +++ b/web/frontend/src/Job.root.svelte @@ -342,7 +342,7 @@ Disabled Metric -

Metric {item.metric} is disabled for subcluster {$initq.data.job.subCluster}.

+

Metric {item.metric} is disabled for cluster {$initq.data.job.cluster}:{$initq.data.job.subCluster}.

To remove this card, open metric selection and press "Close and Apply".

@@ -352,7 +352,8 @@ Missing Metric -

No dataset returned for {item.metric}.

+

No dataset(s) returned for {item.metric}.

+

Metric was not found in metric store for cluster {$initq.data.job.cluster}.

{/if} @@ -386,17 +387,17 @@ {#if missingMetrics.length > 0}

- No data at all is available for the metrics: {missingMetrics.join( + No datasets were returned for the metrics: {missingMetrics.join( ", ", - )} + )}

{/if} {#if missingHosts.length > 0} -

Some metrics are missing for the following hosts:

+

Metrics are missing for the following hosts:

    {#each missingHosts as missing}
  • - {missing.hostname}: {missing.metrics.join(", ")} + {missing.hostname}: {missing.metrics.join(", ")}
  • {/each}
diff --git a/web/frontend/src/Jobs.root.svelte b/web/frontend/src/Jobs.root.svelte index a34e8727..02d41712 100644 --- a/web/frontend/src/Jobs.root.svelte +++ b/web/frontend/src/Jobs.root.svelte @@ -37,6 +37,7 @@ /* Const Init */ const { query: initq } = init(); const ccconfig = getContext("cc-config"); + const matchedJobCompareLimit = 500; /* State Init */ let filterComponent = $state(); // see why here: https://stackoverflow.com/questions/58287729/how-can-i-export-a-function-from-a-svelte-component-that-changes-a-value-in-the @@ -154,8 +155,9 @@ {#if !showCompare} filterComponent.updateFilters(filter)} /> {/if} @@ -169,12 +171,12 @@ {/if}
- @@ -254,12 +260,15 @@ > {:else} - No dataset returned for {item.name} + + + Missing Metric + + +

No dataset returned for {item.name}.

+

Metric was not found in metric store for cluster {cluster}.

+
+
{/if} {/snippet} diff --git a/web/frontend/src/Systems.root.svelte b/web/frontend/src/Systems.root.svelte index 5c62f390..7981afd3 100644 --- a/web/frontend/src/Systems.root.svelte +++ b/web/frontend/src/Systems.root.svelte @@ -65,10 +65,6 @@ let timeoutId = null; /* State Init */ - // svelte-ignore state_referenced_locally - let to = $state(presetTo || new Date(Date.now())); - // svelte-ignore state_referenced_locally - let from = $state(presetFrom || new Date(nowDate.setHours(nowDate.getHours() - 4))); let selectedResolution = $state(resampleConfig ? resampleDefault : 0); let hostnameFilter = $state(""); let hoststateFilter = $state("all"); @@ -76,6 +72,8 @@ let isMetricsSelectionOpen = $state(false); /* Derived States */ + let to = $derived(presetTo ? presetTo : new Date(Date.now())); + let from = $derived(presetFrom ? presetFrom : new Date(nowDate.setHours(nowDate.getHours() - 4))); const displayNodeOverview = $derived((displayType === 'OVERVIEW')); const systemMetrics = $derived($initialized ? 
[...globalMetrics.filter((gm) => gm?.availability.find((av) => av.cluster == cluster))] : []); const presetSystemUnits = $derived(loadUnits(systemMetrics)); diff --git a/web/frontend/src/generic/Filters.svelte b/web/frontend/src/generic/Filters.svelte index c79a56e4..74f55ca7 100644 --- a/web/frontend/src/generic/Filters.svelte +++ b/web/frontend/src/generic/Filters.svelte @@ -241,12 +241,15 @@ if (filters.project) opts.push(`project=${filters.project}`); if (filters.project && filters.projectMatch != "contains") // "contains" is default-case opts.push(`projectMatch=${filters.projectMatch}`); - if (filters.user.length != 0) - if (filters.userMatch != "in") { - opts.push(`user=${filters.user}`); - } else { - for (let singleUser of filters.user) opts.push(`user=${singleUser}`); + if (filters.user) { + if (filters.user.length != 0) { + if (filters.userMatch != "in") { + opts.push(`user=${filters.user}`); + } else { + for (let singleUser of filters.user) opts.push(`user=${singleUser}`); + } } + } if (filters.userMatch != "contains") // "contains" is default-case opts.push(`userMatch=${filters.userMatch}`); // Filter Modals diff --git a/web/frontend/src/generic/helper/TextFilter.svelte b/web/frontend/src/generic/helper/TextFilter.svelte index c38fe33d..6e263c4d 100644 --- a/web/frontend/src/generic/helper/TextFilter.svelte +++ b/web/frontend/src/generic/helper/TextFilter.svelte @@ -2,9 +2,10 @@ @component Search Field for Job-Lists with separate mode if project filter is active Properties: - - `presetProject String?`: Currently active project filter [Default: ''] + - `presetProject String?`: Currently active project filter preset [Default: ''] - `authlevel Number?`: The current users authentication level [Default: null] - `roles [Number]?`: Enum containing available roles [Default: null] + - `filterBuffer [Obj]?`: Currently active filters, if any. 
- `setFilter Func`: The callback function to apply current filter selection --> @@ -18,78 +19,69 @@ presetProject = "", authlevel = null, roles = null, + filterBuffer = [], setFilter } = $props(); /* Const Init*/ - const throttle = 500; + const throttle = 300; /* Var Init */ - let user = ""; - let jobName = ""; let timeoutId = null; - /* State Init */ - let term = $state(""); + /* Derived */ + const bufferProject = $derived.by(() => { + let bp = filterBuffer.find((fb) => + Object.keys(fb).includes("project") + ) + return bp?.project?.contains || null + }); - /* Derived */ - let project = $derived(presetProject ? presetProject : ""); - let mode = $derived(presetProject ? "jobName" : "project"); + const bufferUser = $derived.by(() => { + let bu = filterBuffer.find((fb) => + Object.keys(fb).includes("user") + ) + return bu?.user?.contains || null + }); + + const bufferJobName = $derived.by(() => { + let bjn = filterBuffer.find((fb) => + Object.keys(fb).includes("jobName") + ) + return bjn?.jobName?.contains || null + }); + + let mode = $derived.by(() => { + if (presetProject) return "jobName" // Search by jobName if presetProject set + else if (bufferUser) return "user" + else if (bufferJobName) return "jobName" + else return "project" + }); + + let term = $derived(bufferUser || bufferJobName || bufferProject || ""); /* Functions */ - function modeChanged() { + function inputChanged(sleep = throttle) { + if (timeoutId != null) clearTimeout(timeoutId); if (mode == "user") { - project = presetProject ? presetProject : ""; - jobName = ""; + timeoutId = setTimeout(() => { + setFilter({ user: term, project: (presetProject ? presetProject : null), jobName: null }); + }, sleep); } else if (mode == "project") { - user = ""; - jobName = ""; - } else { - project = presetProject ? 
presetProject : ""; - user = ""; - } - termChanged(0); - } - - // Compatibility: Handle "user role" and "no role" identically - function termChanged(sleep = throttle) { - if (roles && authlevel >= roles.manager) { - if (mode == "user") user = term; - else if (mode == "project") project = term; - else jobName = term; - - if (timeoutId != null) clearTimeout(timeoutId); - timeoutId = setTimeout(() => { - setFilter({ - user, - project, - jobName - }); - }, sleep); - } else { - if (mode == "project") project = term; - else jobName = term; - - if (timeoutId != null) clearTimeout(timeoutId); - + setFilter({ project: term, user: null, jobName: null }); + }, sleep); + } else if (mode == "jobName") { timeoutId = setTimeout(() => { - setFilter({ - project, - jobName - }); - }, sleep); + setFilter({ jobName: term, user: null, project: (presetProject ? presetProject : null) }); + }, sleep); } } function resetProject () { - mode = "project" - term = "" - presetProject = "" - project = "" - jobName = "" - user = "" - termChanged(0); + presetProject = ""; + term = ""; + inputChanged(0); } @@ -100,12 +92,12 @@ class="form-select w-auto" title="Search Mode" bind:value={mode} - onchange={modeChanged} + onchange={() => inputChanged()} > {#if !presetProject} {/if} - {#if roles && authlevel >= roles.manager} + {#if roles && authlevel >= roles?.manager} {/if} @@ -113,8 +105,8 @@ termChanged()} - onkeyup={(event) => termChanged(event.key == "Enter" ? 0 : throttle)} + onchange={() => inputChanged()} + onkeyup={(event) => inputChanged(event.key == "Enter" ? 0 : throttle)} placeholder={presetProject ? `Find in ${scrambleNames ? 
scramble(presetProject) : presetProject} ...` : `Find ${mode} ...`} /> {#if presetProject} diff --git a/web/frontend/src/generic/joblist/JobListRow.svelte b/web/frontend/src/generic/joblist/JobListRow.svelte index 4604883a..17a160e1 100644 --- a/web/frontend/src/generic/joblist/JobListRow.svelte +++ b/web/frontend/src/generic/joblist/JobListRow.svelte @@ -229,7 +229,12 @@ > {:else} - No dataset returned + +

No dataset(s) returned for {metrics[i]}

+

Metric or host was not found in metric store for cluster {job.cluster}:

+

Identical messages in {metrics[i]} column: Metric not found.

+

Identical messages in job {job.jobId} row: Host not found.

+
{/if} {/each} diff --git a/web/frontend/src/generic/plots/DoubleMetricPlot.svelte b/web/frontend/src/generic/plots/DoubleMetricPlot.svelte index e94e269d..10e01311 100644 --- a/web/frontend/src/generic/plots/DoubleMetricPlot.svelte +++ b/web/frontend/src/generic/plots/DoubleMetricPlot.svelte @@ -25,7 +25,7 @@ metricData, timestep, numNodes, - cluster, + cluster = "", forNode = true, enableFlip = false, publicMode = false, @@ -316,12 +316,14 @@
-{:else if cluster} - Cannot render plot: No series data returned for {cluster}. {:else} - Cannot render plot: No series data returned. + + + Empty Metrics + + +

Cannot render plot for cluster {cluster}.

+

Metrics found but returned without timeseries data.

+
+
{/if} diff --git a/web/frontend/src/generic/plots/MetricPlot.svelte b/web/frontend/src/generic/plots/MetricPlot.svelte index 7e48e8e1..063b43fb 100644 --- a/web/frontend/src/generic/plots/MetricPlot.svelte +++ b/web/frontend/src/generic/plots/MetricPlot.svelte @@ -27,7 +27,7 @@ import uPlot from "uplot"; import { formatNumber, formatDurationTime } from "../units.js"; import { getContext, onMount, onDestroy } from "svelte"; - import { Card } from "@sveltestrap/sveltestrap"; + import { Card, CardBody, CardHeader } from "@sveltestrap/sveltestrap"; /* Svelte 5 Props */ let { @@ -633,7 +633,13 @@ style="background-color: {backgroundColor()};" class={forNode ? 'py-2 rounded' : 'rounded'} >
{:else} - Cannot render plot: No series data returned for {metric} + + + Empty Metric + + +

Cannot render plot for {metric}.

+

Metric found but returned without timeseries data.

+
+
{/if} diff --git a/web/frontend/src/generic/plots/Pie.svelte b/web/frontend/src/generic/plots/Pie.svelte index 3cfb1821..331ae904 100644 --- a/web/frontend/src/generic/plots/Pie.svelte +++ b/web/frontend/src/generic/plots/Pie.svelte @@ -67,6 +67,11 @@ reserved: "rgba(255, 0, 255, 0.75)", mixed: "rgba(255, 215, 0, 0.75)", unknown: "rgba(0, 0, 0, 0.75)" + }, + healthStates: { + full: "rgba(0, 128, 0, 0.75)", + failed: "rgba(255, 0, 0, 0.75)", + partial: "rgba(255, 215, 0, 0.75)", } } diff --git a/web/frontend/src/generic/plots/Roofline.svelte b/web/frontend/src/generic/plots/Roofline.svelte index 20c91872..6500eb5c 100644 --- a/web/frontend/src/generic/plots/Roofline.svelte +++ b/web/frontend/src/generic/plots/Roofline.svelte @@ -46,6 +46,7 @@ /* Const Init */ const lineWidth = 2 // clusterCockpitConfig.plotConfiguration_lineWidth; const cbmode = clusterCockpitConfig?.plotConfiguration_colorblindMode || false; + const bubbleSizeMax = 50; /* Var Init */ let timeoutId = null; @@ -317,8 +318,13 @@ size = sizeBase + scaling // Nodes: Size based on Jobcount } else if (nodesData) { - size = sizeBase + (nodesData[i]?.numJobs * 1.5) // Max Jobs Scale: 8 * 1.5 = 12 + size = sizeBase + (nodesData[i]?.numJobs * 1.5) }; + + // Apply Size Capping + if (size >= bubbleSizeMax) { + size = bubbleSizeMax; + } if (xVal >= filtLft && xVal <= filtRgt && yVal >= filtBtm && yVal <= filtTop) { let cx = valToPosX(xVal, scaleX, xDim, xOff); diff --git a/web/frontend/src/generic/select/TimeSelection.svelte b/web/frontend/src/generic/select/TimeSelection.svelte index 7d3c0c84..05120ba7 100644 --- a/web/frontend/src/generic/select/TimeSelection.svelte +++ b/web/frontend/src/generic/select/TimeSelection.svelte @@ -46,13 +46,13 @@ /* Derived */ let timeRange = $derived.by(() => { if (presetTo && presetFrom) { - return ((presetTo.getTime() - presetFrom.getTime()) / 1000) + return Math.floor(((presetTo.getTime() - presetFrom.getTime()) / 1000)) } else { - return ((defaultTo.getTime() - 
defaultFrom.getTime()) / 1000) + return Math.floor(((defaultTo.getTime() - defaultFrom.getTime()) / 1000)) } }); let unknownRange = $derived(!Object.values(options).includes(timeRange)); - + /* Functions */ function updateTimeRange() { let now = Date.now(); diff --git a/web/frontend/src/generic/units.js b/web/frontend/src/generic/units.js index d7fe90fe..540a1a32 100644 --- a/web/frontend/src/generic/units.js +++ b/web/frontend/src/generic/units.js @@ -17,11 +17,26 @@ export function formatNumber(x) { } } +export function scaleNumber(x, p = '') { + if ( isNaN(x) || x == null) { + return `${x} ${p}` // Return if String or Null + } else { + const oldPower = power[prefix.indexOf(p)] + const rawValue = x * oldPower + for (let i = 0; i < prefix.length; i++) { + if (power[i] <= rawValue && rawValue < power[i+1]) { + return `${Math.round((rawValue / power[i]) * 100) / 100} ${prefix[i]}` + } + } + return `${x} ${p}` + } +} + export function roundTwoDigits(x) { return Math.round(x * 100) / 100 } -export function scaleNumbers(x, y , p = '') { +export function scaleNumbers(x, y, p = '') { const oldPower = power[prefix.indexOf(p)] const rawXValue = x * oldPower const rawYValue = y * oldPower diff --git a/web/frontend/src/job/statstab/StatsTable.svelte b/web/frontend/src/job/statstab/StatsTable.svelte index 06b2d105..d5fad2a8 100644 --- a/web/frontend/src/job/statstab/StatsTable.svelte +++ b/web/frontend/src/job/statstab/StatsTable.svelte @@ -55,6 +55,7 @@ function setupAvailable(data) { let pendingAvailable = {}; if (data) { + // Returns Only For Available Metrics for (let d of data) { if (!pendingAvailable[d.name]) { pendingAvailable[d.name] = [d.scope] @@ -90,13 +91,16 @@ pendingTableData[host] = {}; }; for (const metric of sm) { - if (!pendingTableData[host][metric]) { - pendingTableData[host][metric] = {}; - }; - for (const scope of as[metric]) { - pendingTableData[host][metric][scope] = js.find((d) => d.name == metric && d.scope == scope) - ?.stats.filter((st) => 
st.hostname == host && st.data != null) - ?.sort((a, b) => a.id - b.id) || [] + // Only Returned, Available Metrics + if (as[metric]) { + if (!pendingTableData[host][metric]) { + pendingTableData[host][metric] = {}; + }; + for (const scope of as[metric]) { + pendingTableData[host][metric][scope] = js.find((d) => d.name == metric && d.scope == scope) + ?.stats.filter((st) => st.hostname == host && st.data != null) + ?.sort((a, b) => a.id - b.id) || [] + }; }; }; }; @@ -136,40 +140,56 @@ {#each selectedMetrics as metric} - - - - {metric} - - - {#each (availableScopes[metric] || []) as scope} - - {/each} - - - + {#if availableScopes[metric]} + + + + {metric} + + + {#each (availableScopes[metric] || []) as scope} + + {/each} + + + + {:else} + + + + {metric} + + + + {/if} {/each} Node {#each selectedMetrics as metric} - {#if selectedScopes[metric] != "node"} - Id - {/if} - {#each ["min", "avg", "max"] as stat} - sortBy(metric, stat)}> - {stat} - {#if selectedScopes[metric] == "node"} - - {/if} + {#if availableScopes[metric]} + {#if selectedScopes[metric] != "node"} + Id + {/if} + {#each ["min", "avg", "max"] as stat} + sortBy(metric, stat)}> + {stat} + {#if selectedScopes[metric] == "node"} + + {/if} + + {/each} + {:else} + + Missing Metric - {/each} + {/if} {/each} @@ -178,10 +198,17 @@ {host} {#each selectedMetrics as metric (metric)} - + {#if tableData[host][metric]} + + {:else} + +

No dataset(s) returned for {metric}.

+

Metric was not found in metric store for host {host}.

+ + {/if} {/each} {/each} diff --git a/web/frontend/src/status/DashInternal.svelte b/web/frontend/src/status/DashInternal.svelte index c42c758e..145ac4dc 100644 --- a/web/frontend/src/status/DashInternal.svelte +++ b/web/frontend/src/status/DashInternal.svelte @@ -22,6 +22,7 @@ import { formatDurationTime, formatNumber, + scaleNumber } from "../generic/units.js"; import { Row, @@ -250,9 +251,11 @@ if (!rawInfos['totalAccs']) rawInfos['totalAccs'] = (subCluster?.numberOfNodes * subCluster?.topology?.accelerators?.length) || 0; else rawInfos['totalAccs'] += (subCluster?.numberOfNodes * subCluster?.topology?.accelerators?.length) || 0; - // Units (Set Once) - if (!rawInfos['flopRateUnit']) rawInfos['flopRateUnit'] = subCluster.flopRateSimd.unit.prefix + subCluster.flopRateSimd.unit.base - if (!rawInfos['memBwRateUnit']) rawInfos['memBwRateUnit'] = subCluster.memoryBandwidth.unit.prefix + subCluster.memoryBandwidth.unit.base + // Unit Parts (Set Once) + if (!rawInfos['flopRateUnitBase']) rawInfos['flopRateUnitBase'] = subCluster.flopRateSimd.unit.base + if (!rawInfos['memBwRateUnitBase']) rawInfos['memBwRateUnitBase'] = subCluster.memoryBandwidth.unit.base + if (!rawInfos['flopRateUnitPrefix']) rawInfos['flopRateUnitPrefix'] = subCluster.flopRateSimd.unit.prefix + if (!rawInfos['memBwRateUnitPrefix']) rawInfos['memBwRateUnitPrefix'] = subCluster.memoryBandwidth.unit.prefix // Get Maxima For Roofline Knee Render if (!rawInfos['roofData']) { @@ -268,10 +271,14 @@ } } - // Get Idle Infos after Sums + // Get Simple Idle Infos after Sums by Diff if (!rawInfos['idleNodes']) rawInfos['idleNodes'] = rawInfos['totalNodes'] - rawInfos['allocatedNodes']; if (!rawInfos['idleCores']) rawInfos['idleCores'] = rawInfos['totalCores'] - rawInfos['allocatedCores']; if (!rawInfos['idleAccs']) rawInfos['idleAccs'] = rawInfos['totalAccs'] - rawInfos['allocatedAccs']; + // Cap at 0 (Negative hints towards Config <> Reality Mismatch!) 
+ if (rawInfos['idleNodes'] < 0) rawInfos['idleNodes'] = 0; + if (rawInfos['idleCores'] < 0) rawInfos['idleCores'] = 0; + if (rawInfos['idleAccs'] < 0) rawInfos['idleAccs'] = 0; // Keymetrics (Data on Cluster-Scope) let rawFlops = $statusQuery?.data?.nodeMetrics?.reduce((sum, node) => @@ -418,12 +425,10 @@ - {clusterInfo?.flopRate} - {clusterInfo?.flopRateUnit} + {scaleNumber(clusterInfo?.flopRate, clusterInfo?.flopRateUnitPrefix)}{clusterInfo?.flopRateUnitBase} - {clusterInfo?.memBwRate} - {clusterInfo?.memBwRateUnit} + {scaleNumber(clusterInfo?.memBwRate, clusterInfo?.memBwRateUnitPrefix)}{clusterInfo?.memBwRateUnitBase}
diff --git a/web/frontend/src/status/dashdetails/StatusDash.svelte b/web/frontend/src/status/dashdetails/StatusDash.svelte index 9968a9fd..730ecdcc 100644 --- a/web/frontend/src/status/dashdetails/StatusDash.svelte +++ b/web/frontend/src/status/dashdetails/StatusDash.svelte @@ -23,7 +23,7 @@ gql, getContextClient, } from "@urql/svelte"; - import { formatDurationTime } from "../../generic/units.js"; + import { formatDurationTime, scaleNumber } from "../../generic/units.js"; import Refresher from "../../generic/helper/Refresher.svelte"; import TimeSelection from "../../generic/select/TimeSelection.svelte"; import Roofline from "../../generic/plots/Roofline.svelte"; @@ -418,7 +418,7 @@ {:else if $statesTimed.error} - {$statesTimed.error.message} + States Timed: {$statesTimed.error.message} {:else if $statesTimed.data} @@ -472,7 +472,7 @@ {:else if $statusQuery.error} - {$statesTimed.error.message} + Status Query (States): {$statusQuery.error.message} {:else if $statusQuery?.data?.nodeStates} @@ -484,7 +484,6 @@ Current {cluster.charAt(0).toUpperCase() + cluster.slice(1)} Node States sd.state, )} + fixColors={refinedStateData.map( + (sd) => colors['nodeStates'][sd.state], + )} /> {/key}
@@ -508,7 +510,7 @@ {#each refinedStateData as sd, i} - + {sd.state} {sd.count} @@ -524,15 +526,17 @@ Current {cluster.charAt(0).toUpperCase() + cluster.slice(1)} Node Health sd.count, + (hd) => hd.count, )} entities={refinedHealthData.map( - (sd) => sd.state, + (hd) => hd.state, + )} + fixColors={refinedHealthData.map( + (hd) => colors['healthStates'][hd.state], )} /> {/key} @@ -548,7 +552,7 @@ {#each refinedHealthData as hd, i} - + {hd.state} {hd.count} @@ -570,7 +574,7 @@ {:else if $statusQuery.error} - {$statusQuery.error.message} + Status Query (Details): {$statusQuery.error.message} {:else if $statusQuery.data} @@ -599,12 +603,10 @@ - {flopRate[subCluster.name]} - {flopRateUnitPrefix[subCluster.name]}{flopRateUnitBase[subCluster.name]} + {scaleNumber(flopRate[subCluster.name], flopRateUnitPrefix[subCluster.name])}{flopRateUnitBase[subCluster.name]} - {memBwRate[subCluster.name]} - {memBwRateUnitPrefix[subCluster.name]}{memBwRateUnitBase[subCluster.name]} + {scaleNumber(memBwRate[subCluster.name], memBwRateUnitPrefix[subCluster.name])}{memBwRateUnitBase[subCluster.name]}
diff --git a/web/frontend/src/systems/NodeOverview.svelte b/web/frontend/src/systems/NodeOverview.svelte index 6e893989..bb58b833 100644 --- a/web/frontend/src/systems/NodeOverview.svelte +++ b/web/frontend/src/systems/NodeOverview.svelte @@ -14,7 +14,7 @@