Merge pull request #479 from ClusterCockpit/dev

Dev
This commit is contained in:
Jan Eitzinger
2026-01-30 16:26:27 +01:00
committed by GitHub
27 changed files with 406 additions and 264 deletions

View File

@@ -23,6 +23,7 @@ import (
"github.com/ClusterCockpit/cc-backend/internal/repository" "github.com/ClusterCockpit/cc-backend/internal/repository"
"github.com/ClusterCockpit/cc-backend/pkg/archive" "github.com/ClusterCockpit/cc-backend/pkg/archive"
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger" cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
ccunit "github.com/ClusterCockpit/cc-lib/v2/ccUnits"
"github.com/ClusterCockpit/cc-lib/v2/schema" "github.com/ClusterCockpit/cc-lib/v2/schema"
) )
@@ -938,15 +939,21 @@ func (r *queryResolver) ClusterMetrics(ctx context.Context, cluster string, metr
} }
for metricName, data := range collectorData { for metricName, data := range collectorData {
cu := collectorUnit[metricName] // use ccUnits for backend normalization to "Tera"
p_old := ccunit.NewPrefix(collectorUnit[metricName].Prefix)
p_new := ccunit.NewPrefix("T")
convFunc := ccunit.GetPrefixPrefixFactor(p_old, p_new)
u_new := schema.Unit{Prefix: p_new.Prefix(), Base: collectorUnit[metricName].Base}
roundedData := make([]schema.Float, 0) roundedData := make([]schema.Float, 0)
for _, val := range data { for _, v_old := range data {
roundedData = append(roundedData, schema.Float((math.Round(float64(val)*100.0) / 100.0))) v_new := math.Round(convFunc(float64(v_old)).(float64)*100.0) / 100.0
roundedData = append(roundedData, schema.Float(v_new))
} }
cm := model.ClusterMetricWithName{ cm := model.ClusterMetricWithName{
Name: metricName, Name: metricName,
Unit: &cu, Unit: &u_new,
Timestep: collectorTimestep[metricName], Timestep: collectorTimestep[metricName],
Data: roundedData, Data: roundedData,
} }

View File

@@ -57,13 +57,13 @@ func (r *queryResolver) rooflineHeatmap(
jobdata, err := metricdispatch.LoadData(job, []string{"flops_any", "mem_bw"}, []schema.MetricScope{schema.MetricScopeNode}, ctx, 0) jobdata, err := metricdispatch.LoadData(job, []string{"flops_any", "mem_bw"}, []schema.MetricScope{schema.MetricScopeNode}, ctx, 0)
if err != nil { if err != nil {
cclog.Errorf("Error while loading roofline metrics for job %d", job.ID) cclog.Warnf("Error while loading roofline metrics for job %d", job.ID)
return nil, err return nil, err
} }
flops_, membw_ := jobdata["flops_any"], jobdata["mem_bw"] flops_, membw_ := jobdata["flops_any"], jobdata["mem_bw"]
if flops_ == nil && membw_ == nil { if flops_ == nil && membw_ == nil {
cclog.Infof("rooflineHeatmap(): 'flops_any' or 'mem_bw' missing for job %d", job.ID) cclog.Warnf("rooflineHeatmap(): 'flops_any' or 'mem_bw' missing for job %d", job.ID)
continue continue
// return nil, fmt.Errorf("GRAPH/UTIL > 'flops_any' or 'mem_bw' missing for job %d", job.ID) // return nil, fmt.Errorf("GRAPH/UTIL > 'flops_any' or 'mem_bw' missing for job %d", job.ID)
} }

View File

@@ -97,8 +97,8 @@ func LoadData(job *schema.Job,
ms, err := GetMetricDataRepo(job.Cluster, job.SubCluster) ms, err := GetMetricDataRepo(job.Cluster, job.SubCluster)
if err != nil { if err != nil {
cclog.Errorf("failed to load job data from metric store for job %d (user: %s, project: %s): %s", cclog.Errorf("failed to access metricDataRepo for cluster %s-%s: %s",
job.JobID, job.User, job.Project, err.Error()) job.Cluster, job.SubCluster, err.Error())
return err, 0, 0 return err, 0, 0
} }
@@ -116,11 +116,11 @@ func LoadData(job *schema.Job,
jd, err = ms.LoadData(job, metrics, scopes, ctx, resolution) jd, err = ms.LoadData(job, metrics, scopes, ctx, resolution)
if err != nil { if err != nil {
if len(jd) != 0 { if len(jd) != 0 {
cclog.Warnf("partial error loading metrics from store for job %d (user: %s, project: %s): %s", cclog.Warnf("partial error loading metrics from store for job %d (user: %s, project: %s, cluster: %s-%s): %s",
job.JobID, job.User, job.Project, err.Error()) job.JobID, job.User, job.Project, job.Cluster, job.SubCluster, err.Error())
} else { } else {
cclog.Errorf("failed to load job data from metric store for job %d (user: %s, project: %s): %s", cclog.Warnf("failed to load job data from metric store for job %d (user: %s, project: %s, cluster: %s-%s): %s",
job.JobID, job.User, job.Project, err.Error()) job.JobID, job.User, job.Project, job.Cluster, job.SubCluster, err.Error())
return err, 0, 0 return err, 0, 0
} }
} }
@@ -129,8 +129,8 @@ func LoadData(job *schema.Job,
var jdTemp schema.JobData var jdTemp schema.JobData
jdTemp, err = archive.GetHandle().LoadJobData(job) jdTemp, err = archive.GetHandle().LoadJobData(job)
if err != nil { if err != nil {
cclog.Errorf("failed to load job data from archive for job %d (user: %s, project: %s): %s", cclog.Warnf("failed to load job data from archive for job %d (user: %s, project: %s, cluster: %s-%s): %s",
job.JobID, job.User, job.Project, err.Error()) job.JobID, job.User, job.Project, job.Cluster, job.SubCluster, err.Error())
return err, 0, 0 return err, 0, 0
} }
@@ -244,15 +244,15 @@ func LoadAverages(
ms, err := GetMetricDataRepo(job.Cluster, job.SubCluster) ms, err := GetMetricDataRepo(job.Cluster, job.SubCluster)
if err != nil { if err != nil {
cclog.Errorf("failed to load job data from metric store for job %d (user: %s, project: %s): %s", cclog.Errorf("failed to access metricDataRepo for cluster %s-%s: %s",
job.JobID, job.User, job.Project, err.Error()) job.Cluster, job.SubCluster, err.Error())
return err return err
} }
stats, err := ms.LoadStats(job, metrics, ctx) stats, err := ms.LoadStats(job, metrics, ctx)
if err != nil { if err != nil {
cclog.Errorf("failed to load statistics from metric store for job %d (user: %s, project: %s): %s", cclog.Warnf("failed to load statistics from metric store for job %d (user: %s, project: %s, cluster: %s-%s): %s",
job.JobID, job.User, job.Project, err.Error()) job.JobID, job.User, job.Project, job.Cluster, job.SubCluster, err.Error())
return err return err
} }
@@ -288,15 +288,15 @@ func LoadScopedJobStats(
ms, err := GetMetricDataRepo(job.Cluster, job.SubCluster) ms, err := GetMetricDataRepo(job.Cluster, job.SubCluster)
if err != nil { if err != nil {
cclog.Errorf("failed to load job data from metric store for job %d (user: %s, project: %s): %s", cclog.Errorf("failed to access metricDataRepo for cluster %s-%s: %s",
job.JobID, job.User, job.Project, err.Error()) job.Cluster, job.SubCluster, err.Error())
return nil, err return nil, err
} }
scopedStats, err := ms.LoadScopedStats(job, metrics, scopes, ctx) scopedStats, err := ms.LoadScopedStats(job, metrics, scopes, ctx)
if err != nil { if err != nil {
cclog.Errorf("failed to load scoped statistics from metric store for job %d (user: %s, project: %s): %s", cclog.Warnf("failed to load scoped statistics from metric store for job %d (user: %s, project: %s, cluster: %s-%s): %s",
job.JobID, job.User, job.Project, err.Error()) job.JobID, job.User, job.Project, job.Cluster, job.SubCluster, err.Error())
return nil, err return nil, err
} }
@@ -320,8 +320,8 @@ func LoadJobStats(
ms, err := GetMetricDataRepo(job.Cluster, job.SubCluster) ms, err := GetMetricDataRepo(job.Cluster, job.SubCluster)
if err != nil { if err != nil {
cclog.Errorf("failed to load job data from metric store for job %d (user: %s, project: %s): %s", cclog.Errorf("failed to access metricDataRepo for cluster %s-%s: %s",
job.JobID, job.User, job.Project, err.Error()) job.Cluster, job.SubCluster, err.Error())
return nil, err return nil, err
} }
@@ -329,8 +329,8 @@ func LoadJobStats(
stats, err := ms.LoadStats(job, metrics, ctx) stats, err := ms.LoadStats(job, metrics, ctx)
if err != nil { if err != nil {
cclog.Errorf("failed to load statistics from metric store for job %d (user: %s, project: %s): %s", cclog.Warnf("failed to load statistics from metric store for job %d (user: %s, project: %s, cluster: %s-%s): %s",
job.JobID, job.User, job.Project, err.Error()) job.JobID, job.User, job.Project, job.Cluster, job.SubCluster, err.Error())
return data, err return data, err
} }
@@ -379,8 +379,8 @@ func LoadNodeData(
ms, err := GetMetricDataRepo(cluster, "") ms, err := GetMetricDataRepo(cluster, "")
if err != nil { if err != nil {
cclog.Errorf("failed to load node data from metric store: %s", cclog.Errorf("failed to access metricDataRepo for cluster %s: %s",
err.Error()) cluster, err.Error())
return nil, err return nil, err
} }
@@ -389,7 +389,7 @@ func LoadNodeData(
if len(data) != 0 { if len(data) != 0 {
cclog.Warnf("partial error loading node data from metric store for cluster %s: %s", cluster, err.Error()) cclog.Warnf("partial error loading node data from metric store for cluster %s: %s", cluster, err.Error())
} else { } else {
cclog.Errorf("failed to load node data from metric store for cluster %s: %s", cluster, err.Error()) cclog.Warnf("failed to load node data from metric store for cluster %s: %s", cluster, err.Error())
return nil, err return nil, err
} }
} }
@@ -423,8 +423,8 @@ func LoadNodeListData(
ms, err := GetMetricDataRepo(cluster, subCluster) ms, err := GetMetricDataRepo(cluster, subCluster)
if err != nil { if err != nil {
cclog.Errorf("failed to load node data from metric store: %s", cclog.Errorf("failed to access metricDataRepo for cluster %s-%s: %s",
err.Error()) cluster, subCluster, err.Error())
return nil, err return nil, err
} }
@@ -434,7 +434,7 @@ func LoadNodeListData(
cclog.Warnf("partial error loading node list data from metric store for cluster %s, subcluster %s: %s", cclog.Warnf("partial error loading node list data from metric store for cluster %s, subcluster %s: %s",
cluster, subCluster, err.Error()) cluster, subCluster, err.Error())
} else { } else {
cclog.Errorf("failed to load node list data from metric store for cluster %s, subcluster %s: %s", cclog.Warnf("failed to load node list data from metric store for cluster %s, subcluster %s: %s",
cluster, subCluster, err.Error()) cluster, subCluster, err.Error())
return nil, err return nil, err
} }

View File

@@ -329,7 +329,7 @@ func (ccms *CCMetricStore) LoadStats(
metric := query.Metric metric := query.Metric
data := res[0] data := res[0]
if data.Error != nil { if data.Error != nil {
cclog.Errorf("fetching %s for node %s failed: %s", metric, query.Hostname, *data.Error) cclog.Warnf("fetching %s for node %s failed: %s", metric, query.Hostname, *data.Error)
continue continue
} }
@@ -556,7 +556,7 @@ func (ccms *CCMetricStore) LoadNodeListData(
) (map[string]schema.JobData, error) { ) (map[string]schema.JobData, error) {
queries, assignedScope, err := ccms.buildNodeQueries(cluster, subCluster, nodes, metrics, scopes, resolution) queries, assignedScope, err := ccms.buildNodeQueries(cluster, subCluster, nodes, metrics, scopes, resolution)
if err != nil { if err != nil {
cclog.Errorf("Error while building node queries for Cluster %s, SubCLuster %s, Metrics %v, Scopes %v: %s", cluster, subCluster, metrics, scopes, err.Error()) cclog.Errorf("Error while building node queries for Cluster %s, SubCluster %s, Metrics %v, Scopes %v: %s", cluster, subCluster, metrics, scopes, err.Error())
return nil, err return nil, err
} }

View File

@@ -38,7 +38,7 @@
// - All queries use prepared statements via stmtCache // - All queries use prepared statements via stmtCache
// - Complex aggregations use SQL for efficiency // - Complex aggregations use SQL for efficiency
// - Histogram pre-initialization ensures consistent bin ranges // - Histogram pre-initialization ensures consistent bin ranges
// - Metric histogram queries limited to 500 jobs for running job analysis // - Metric histogram queries limited to 5000 jobs for running job analysis
package repository package repository
@@ -686,7 +686,7 @@ func (r *JobRepository) AddHistograms(
// - Pre-initialized with zeros for consistent visualization // - Pre-initialized with zeros for consistent visualization
// //
// Limitations: // Limitations:
// - Running jobs: Limited to 500 jobs for performance // - Running jobs: Limited to 5000 jobs for performance
// - Requires valid cluster configuration with metric peak values // - Requires valid cluster configuration with metric peak values
// - Uses footprint statistic (avg/max/min) configured per metric // - Uses footprint statistic (avg/max/min) configured per metric
func (r *JobRepository) AddMetricHistograms( func (r *JobRepository) AddMetricHistograms(
@@ -995,12 +995,12 @@ func (r *JobRepository) jobsMetricStatisticsHistogram(
// Returns slice of MetricHistoPoints, one per metric. // Returns slice of MetricHistoPoints, one per metric.
// //
// Limitations: // Limitations:
// - Maximum 500 jobs (returns nil if more jobs match) // - Maximum 5000 jobs (returns nil if more jobs match)
// - Requires metric backend availability // - Requires metric backend availability
// - Bins based on metric peak values from cluster configuration // - Bins based on metric peak values from cluster configuration
// //
// Algorithm: // Algorithm:
// 1. Query first 501 jobs to check count limit // 1. Query first 5001 jobs to check count limit
// 2. Load metric averages for all jobs via metricdispatch // 2. Load metric averages for all jobs via metricdispatch
// 3. For each metric, create bins based on peak value // 3. For each metric, create bins based on peak value
// 4. Iterate averages and count jobs per bin // 4. Iterate averages and count jobs per bin
@@ -1011,13 +1011,13 @@ func (r *JobRepository) runningJobsMetricStatisticsHistogram(
bins *int, bins *int,
) []*model.MetricHistoPoints { ) []*model.MetricHistoPoints {
// Get Jobs // Get Jobs
jobs, err := r.QueryJobs(ctx, filters, &model.PageRequest{Page: 1, ItemsPerPage: 500 + 1}, nil) jobs, err := r.QueryJobs(ctx, filters, &model.PageRequest{Page: 1, ItemsPerPage: 5000 + 1}, nil)
if err != nil { if err != nil {
cclog.Errorf("Error while querying jobs for footprint: %s", err) cclog.Errorf("Error while querying jobs for footprint: %s", err)
return nil return nil
} }
if len(jobs) > 500 { if len(jobs) > 5000 {
cclog.Errorf("too many jobs matched (max: %d)", 500) cclog.Errorf("too many jobs matched (max: %d)", 5000)
return nil return nil
} }

View File

@@ -68,8 +68,8 @@ func RegisterFootprintWorker() {
ms, err := metricdispatch.GetMetricDataRepo(job.Cluster, job.SubCluster) ms, err := metricdispatch.GetMetricDataRepo(job.Cluster, job.SubCluster)
if err != nil { if err != nil {
cclog.Errorf("failed to load job data from metric store for job %d (user: %s, project: %s): %s", cclog.Errorf("failed to access metricDataRepo for cluster %s-%s: %s",
job.JobID, job.User, job.Project, err.Error()) job.Cluster, job.SubCluster, err.Error())
continue continue
} }

View File

@@ -13,13 +13,14 @@ import (
"fmt" "fmt"
"math" "math"
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
"github.com/ClusterCockpit/cc-lib/v2/schema" "github.com/ClusterCockpit/cc-lib/v2/schema"
"github.com/ClusterCockpit/cc-lib/v2/util" "github.com/ClusterCockpit/cc-lib/v2/util"
) )
var ( var (
// ErrNoHostOrMetric is returned when the metric store does not find the host or the metric // ErrNoHostOrMetric is returned when the metric store does not find the host or the metric
ErrNoHostOrMetric error = errors.New("[METRICSTORE]> [METRICSTORE]> metric or host not found") ErrNoHostOrMetric error = errors.New("[METRICSTORE]> metric or host not found")
// ErrInvalidTimeRange is returned when a query has 'from' >= 'to' // ErrInvalidTimeRange is returned when a query has 'from' >= 'to'
ErrInvalidTimeRange = errors.New("[METRICSTORE]> invalid time range: 'from' must be before 'to'") ErrInvalidTimeRange = errors.New("[METRICSTORE]> invalid time range: 'from' must be before 'to'")
// ErrEmptyCluster is returned when a query with ForAllNodes has no cluster specified // ErrEmptyCluster is returned when a query with ForAllNodes has no cluster specified
@@ -280,20 +281,16 @@ func FetchData(req APIQueryRequest) (*APIQueryResponse, error) {
data.Data, data.From, data.To, data.Resolution, err = ms.Read(sel, query.Metric, req.From, req.To, query.Resolution) data.Data, data.From, data.To, data.Resolution, err = ms.Read(sel, query.Metric, req.From, req.To, query.Resolution)
if err != nil { if err != nil {
// Check a special case where only the metric or host. // Skip Error If Just Missing Host or Metric, Continue
// Dont send errors, instead just send empty array // Empty Return For Metric Handled Gracefully By Frontend
// where frontend already renders error for empty array. if err != ErrNoHostOrMetric {
if err == ErrNoHostOrMetric {
data.Data = make([]schema.Float, 0)
data.From = req.From
data.To = req.To
data.Resolution = query.Resolution
} else {
msg := err.Error() msg := err.Error()
data.Error = &msg data.Error = &msg
res = append(res, data) res = append(res, data)
continue } else {
cclog.Warnf("failed to fetch '%s' from host '%s' (cluster: %s): %s", query.Metric, query.Hostname, req.Cluster, err.Error())
} }
continue
} }
if req.WithStats { if req.WithStats {

View File

@@ -104,6 +104,11 @@ func (ccms *InternalMetricStore) LoadData(
var errors []string var errors []string
jobData := make(schema.JobData) jobData := make(schema.JobData)
for i, row := range resBody.Results { for i, row := range resBody.Results {
if len(row) == 0 {
// No Data Found For Metric, Logged in FetchData to Warn
continue
}
query := req.Queries[i] query := req.Queries[i]
metric := query.Metric metric := query.Metric
scope := assignedScope[i] scope := assignedScope[i]
@@ -229,7 +234,7 @@ func buildQueries(
for _, metric := range metrics { for _, metric := range metrics {
mc := archive.GetMetricConfig(job.Cluster, metric) mc := archive.GetMetricConfig(job.Cluster, metric)
if mc == nil { if mc == nil {
cclog.Infof("metric '%s' is not specified for cluster '%s'", metric, job.Cluster) cclog.Warnf("metric '%s' is not specified for cluster '%s'", metric, job.Cluster)
continue continue
} }
@@ -535,11 +540,15 @@ func (ccms *InternalMetricStore) LoadStats(
stats := make(map[string]map[string]schema.MetricStatistics, len(metrics)) stats := make(map[string]map[string]schema.MetricStatistics, len(metrics))
for i, res := range resBody.Results { for i, res := range resBody.Results {
if len(res) == 0 {
// No Data Found For Metric, Logged in FetchData to Warn
continue
}
query := req.Queries[i] query := req.Queries[i]
metric := query.Metric metric := query.Metric
data := res[0] data := res[0]
if data.Error != nil { if data.Error != nil {
cclog.Errorf("fetching %s for node %s failed: %s", metric, query.Hostname, *data.Error) cclog.Warnf("fetching %s for node %s failed: %s", metric, query.Hostname, *data.Error)
continue continue
} }
@@ -609,6 +618,10 @@ func (ccms *InternalMetricStore) LoadScopedStats(
scopedJobStats := make(schema.ScopedJobStats) scopedJobStats := make(schema.ScopedJobStats)
for i, row := range resBody.Results { for i, row := range resBody.Results {
if len(row) == 0 {
// No Data Found For Metric, Logged in FetchData to Warn
continue
}
query := req.Queries[i] query := req.Queries[i]
metric := query.Metric metric := query.Metric
scope := assignedScope[i] scope := assignedScope[i]
@@ -717,6 +730,11 @@ func (ccms *InternalMetricStore) LoadNodeData(
var errors []string var errors []string
data := make(map[string]map[string][]*schema.JobMetric) data := make(map[string]map[string][]*schema.JobMetric)
for i, res := range resBody.Results { for i, res := range resBody.Results {
if len(res) == 0 {
// No Data Found For Metric, Logged in FetchData to Warn
continue
}
var query APIQuery var query APIQuery
if resBody.Queries != nil { if resBody.Queries != nil {
query = resBody.Queries[i] query = resBody.Queries[i]
@@ -816,6 +834,10 @@ func (ccms *InternalMetricStore) LoadNodeListData(
var errors []string var errors []string
data := make(map[string]schema.JobData) data := make(map[string]schema.JobData)
for i, row := range resBody.Results { for i, row := range resBody.Results {
if len(row) == 0 {
// No Data Found For Metric, Logged in FetchData to Warn
continue
}
var query APIQuery var query APIQuery
if resBody.Queries != nil { if resBody.Queries != nil {
query = resBody.Queries[i] query = resBody.Queries[i]

View File

@@ -16,6 +16,7 @@
} from "./generic/utils.js"; } from "./generic/utils.js";
import { import {
formatNumber, formatNumber,
scaleNumber
} from "./generic/units.js"; } from "./generic/units.js";
import { import {
Row, Row,
@@ -222,8 +223,10 @@
else rawInfos['totalAccs'] += (subCluster?.numberOfNodes * subCluster?.topology?.accelerators?.length) || 0; else rawInfos['totalAccs'] += (subCluster?.numberOfNodes * subCluster?.topology?.accelerators?.length) || 0;
// Units (Set Once) // Units (Set Once)
if (!rawInfos['flopRateUnit']) rawInfos['flopRateUnit'] = subCluster.flopRateSimd.unit.prefix + subCluster.flopRateSimd.unit.base if (!rawInfos['flopRateUnitBase']) rawInfos['flopRateUnitBase'] = subCluster.flopRateSimd.unit.base
if (!rawInfos['memBwRateUnit']) rawInfos['memBwRateUnit'] = subCluster.memoryBandwidth.unit.prefix + subCluster.memoryBandwidth.unit.base if (!rawInfos['memBwRateUnitBase']) rawInfos['memBwRateUnitBase'] = subCluster.memoryBandwidth.unit.base
if (!rawInfos['flopRateUnitPrefix']) rawInfos['flopRateUnitPrefix'] = subCluster.flopRateSimd.unit.prefix
if (!rawInfos['memBwRateUnitPrefix']) rawInfos['memBwRateUnitPrefix'] = subCluster.memoryBandwidth.unit.prefix
// Get Maxima For Roofline Knee Render // Get Maxima For Roofline Knee Render
if (!rawInfos['roofData']) { if (!rawInfos['roofData']) {
@@ -239,10 +242,14 @@
} }
} }
// Get Idle Infos after Sums // Get Simple Idle Infos after Sums by Diff
if (!rawInfos['idleNodes']) rawInfos['idleNodes'] = rawInfos['totalNodes'] - rawInfos['allocatedNodes']; if (!rawInfos['idleNodes']) rawInfos['idleNodes'] = rawInfos['totalNodes'] - rawInfos['allocatedNodes'];
if (!rawInfos['idleCores']) rawInfos['idleCores'] = rawInfos['totalCores'] - rawInfos['allocatedCores']; if (!rawInfos['idleCores']) rawInfos['idleCores'] = rawInfos['totalCores'] - rawInfos['allocatedCores'];
if (!rawInfos['idleAccs']) rawInfos['idleAccs'] = rawInfos['totalAccs'] - rawInfos['allocatedAccs']; if (!rawInfos['idleAccs']) rawInfos['idleAccs'] = rawInfos['totalAccs'] - rawInfos['allocatedAccs'];
// Cap at 0 (Negative hints towards Config <> Reality Mismatch!)
if (rawInfos['idleNodes'] < 0) rawInfos['idleNodes'] = 0;
if (rawInfos['idleCores'] < 0) rawInfos['idleCores'] = 0;
if (rawInfos['idleAccs'] < 0) rawInfos['idleAccs'] = 0;
// Keymetrics (Data on Cluster-Scope) // Keymetrics (Data on Cluster-Scope)
let rawFlops = $statusQuery?.data?.nodeMetrics?.reduce((sum, node) => let rawFlops = $statusQuery?.data?.nodeMetrics?.reduce((sum, node) =>
@@ -262,20 +269,20 @@
0, // Initial Value 0, // Initial Value
) || 0; ) || 0;
rawInfos['cpuPwr'] = Math.floor((rawCpuPwr * 100) / 100) rawInfos['cpuPwr'] = Math.floor((rawCpuPwr * 100) / 100)
if (!rawInfos['cpuPwrUnit']) {
let rawCpuUnit = $statusQuery?.data?.nodeMetrics[0]?.metrics.find((m) => m.name == 'cpu_power')?.metric?.unit || null let rawCpuUnit = $statusQuery?.data?.nodeMetrics[0]?.metrics.find((m) => m.name == 'cpu_power')?.metric?.unit || null
rawInfos['cpuPwrUnit'] = rawCpuUnit ? rawCpuUnit.prefix + rawCpuUnit.base : '' if (!rawInfos['cpuPwrUnitBase']) rawInfos['cpuPwrUnitBase'] = rawCpuUnit ? rawCpuUnit.base : ''
} if (!rawInfos['cpuPwrUnitPrefix']) rawInfos['cpuPwrUnitPrefix'] = rawCpuUnit ? rawCpuUnit.prefix : ''
let rawGpuPwr = $statusQuery?.data?.nodeMetrics?.reduce((sum, node) => let rawGpuPwr = $statusQuery?.data?.nodeMetrics?.reduce((sum, node) =>
sum + (node.metrics.find((m) => m.name == 'acc_power')?.metric?.series[0]?.statistics?.avg || 0), sum + (node.metrics.find((m) => m.name == 'acc_power')?.metric?.series[0]?.statistics?.avg || 0),
0, // Initial Value 0, // Initial Value
) || 0; ) || 0;
rawInfos['gpuPwr'] = Math.floor((rawGpuPwr * 100) / 100) rawInfos['gpuPwr'] = Math.floor((rawGpuPwr * 100) / 100)
if (!rawInfos['gpuPwrUnit']) {
let rawGpuUnit = $statusQuery?.data?.nodeMetrics[0]?.metrics.find((m) => m.name == 'acc_power')?.metric?.unit || null let rawGpuUnit = $statusQuery?.data?.nodeMetrics[0]?.metrics.find((m) => m.name == 'acc_power')?.metric?.unit || null
rawInfos['gpuPwrUnit'] = rawGpuUnit ? rawGpuUnit.prefix + rawGpuUnit.base : '' if (!rawInfos['gpuPwrUnitBase']) rawInfos['gpuPwrUnitBase'] = rawGpuUnit ? rawGpuUnit.base : ''
} if (!rawInfos['gpuPwrUnitPrefix']) rawInfos['gpuPwrUnitPrefix'] = rawGpuUnit ? rawGpuUnit.prefix : ''
} }
return rawInfos; return rawInfos;
}); });
@@ -443,7 +450,7 @@
<Row class="mt-1 mb-2"> <Row class="mt-1 mb-2">
<Col xs={4} class="d-inline-flex align-items-center justify-content-center"> <Col xs={4} class="d-inline-flex align-items-center justify-content-center">
<Badge color="secondary" style="font-size:x-large;margin-right:0.25rem;"> <Badge color="secondary" style="font-size:x-large;margin-right:0.25rem;">
{clusterInfo?.flopRate} {clusterInfo?.flopRateUnit} {scaleNumber(clusterInfo?.flopRate, clusterInfo?.flopRateUnitPrefix)}{clusterInfo?.flopRateUnitBase}
</Badge> </Badge>
<div style="font-size:large;"> <div style="font-size:large;">
Total Flop Rate Total Flop Rate
@@ -451,7 +458,7 @@
</Col> </Col>
<Col xs={4} class="d-inline-flex align-items-center justify-content-center"> <Col xs={4} class="d-inline-flex align-items-center justify-content-center">
<Badge color="secondary" style="font-size:x-large;margin-right:0.25rem;"> <Badge color="secondary" style="font-size:x-large;margin-right:0.25rem;">
{clusterInfo?.memBwRate} {clusterInfo?.memBwRateUnit} {scaleNumber(clusterInfo?.memBwRate, clusterInfo?.memBwRateUnitPrefix)}{clusterInfo?.memBwRateUnitBase}
</Badge> </Badge>
<div style="font-size:large;"> <div style="font-size:large;">
Total Memory Bandwidth Total Memory Bandwidth
@@ -460,7 +467,7 @@
{#if clusterInfo?.totalAccs !== 0} {#if clusterInfo?.totalAccs !== 0}
<Col xs={4} class="d-inline-flex align-items-center justify-content-center"> <Col xs={4} class="d-inline-flex align-items-center justify-content-center">
<Badge color="secondary" style="font-size:x-large;margin-right:0.25rem;"> <Badge color="secondary" style="font-size:x-large;margin-right:0.25rem;">
{clusterInfo?.gpuPwr} {clusterInfo?.gpuPwrUnit} {scaleNumber(clusterInfo?.gpuPwr, clusterInfo?.gpuPwrUnitPrefix)}{clusterInfo?.gpuPwrUnitBase}
</Badge> </Badge>
<div style="font-size:large;"> <div style="font-size:large;">
Total GPU Power Total GPU Power
@@ -469,7 +476,7 @@
{:else} {:else}
<Col xs={4} class="d-inline-flex align-items-center justify-content-center"> <Col xs={4} class="d-inline-flex align-items-center justify-content-center">
<Badge color="secondary" style="font-size:x-large;margin-right:0.25rem;"> <Badge color="secondary" style="font-size:x-large;margin-right:0.25rem;">
{clusterInfo?.cpuPwr} {clusterInfo?.cpuPwrUnit} {scaleNumber(clusterInfo?.cpuPwr, clusterInfo?.cpuPwrUnitPrefix)}{clusterInfo?.cpuPwrUnitBase}
</Badge> </Badge>
<div style="font-size:large;"> <div style="font-size:large;">
Total CPU Power Total CPU Power

View File

@@ -342,7 +342,7 @@
<b>Disabled Metric</b> <b>Disabled Metric</b>
</CardHeader> </CardHeader>
<CardBody> <CardBody>
<p>Metric <b>{item.metric}</b> is disabled for subcluster <b>{$initq.data.job.subCluster}</b>.</p> <p>Metric <b>{item.metric}</b> is disabled for cluster <b>{$initq.data.job.cluster}:{$initq.data.job.subCluster}</b>.</p>
<p class="mb-1">To remove this card, open metric selection and press "Close and Apply".</p> <p class="mb-1">To remove this card, open metric selection and press "Close and Apply".</p>
</CardBody> </CardBody>
</Card> </Card>
@@ -352,7 +352,8 @@
<b>Missing Metric</b> <b>Missing Metric</b>
</CardHeader> </CardHeader>
<CardBody> <CardBody>
<p class="mb-1">No dataset returned for <b>{item.metric}</b>.</p> <p>No dataset(s) returned for <b>{item.metric}</b>.</p>
<p class="mb-1">Metric was not found in metric store for cluster <b>{$initq.data.job.cluster}</b>.</p>
</CardBody> </CardBody>
</Card> </Card>
{/if} {/if}
@@ -386,17 +387,17 @@
<CardBody> <CardBody>
{#if missingMetrics.length > 0} {#if missingMetrics.length > 0}
<p> <p>
No data at all is available for the metrics: {missingMetrics.join( No datasets were returned for the metrics: <b>{missingMetrics.join(
", ", ", ",
)} )}</b>
</p> </p>
{/if} {/if}
{#if missingHosts.length > 0} {#if missingHosts.length > 0}
<p>Some metrics are missing for the following hosts:</p> <p>Metrics are missing for the following hosts:</p>
<ul> <ul>
{#each missingHosts as missing} {#each missingHosts as missing}
<li> <li>
{missing.hostname}: {missing.metrics.join(", ")} <b>{missing.hostname}</b>: {missing.metrics.join(", ")}
</li> </li>
{/each} {/each}
</ul> </ul>

View File

@@ -37,6 +37,7 @@
/* Const Init */ /* Const Init */
const { query: initq } = init(); const { query: initq } = init();
const ccconfig = getContext("cc-config"); const ccconfig = getContext("cc-config");
const matchedJobCompareLimit = 500;
/* State Init */ /* State Init */
let filterComponent = $state(); // see why here: https://stackoverflow.com/questions/58287729/how-can-i-export-a-function-from-a-svelte-component-that-changes-a-value-in-the let filterComponent = $state(); // see why here: https://stackoverflow.com/questions/58287729/how-can-i-export-a-function-from-a-svelte-component-that-changes-a-value-in-the
@@ -154,8 +155,9 @@
{#if !showCompare} {#if !showCompare}
<TextFilter <TextFilter
{presetProject} {presetProject}
bind:authlevel {authlevel}
bind:roles {roles}
{filterBuffer}
setFilter={(filter) => filterComponent.updateFilters(filter)} setFilter={(filter) => filterComponent.updateFilters(filter)}
/> />
{/if} {/if}
@@ -169,12 +171,12 @@
{/if} {/if}
<div class="mx-1"></div> <div class="mx-1"></div>
<ButtonGroup class="w-50"> <ButtonGroup class="w-50">
<Button color="primary" disabled={(matchedListJobs >= 500 && !(selectedJobs.length != 0)) || $initq.fetching} onclick={() => { <Button color="primary" disabled={(matchedListJobs >= matchedJobCompareLimit && !(selectedJobs.length != 0)) || $initq.fetching} onclick={() => {
if (selectedJobs.length != 0) filterComponent.updateFilters({dbId: selectedJobs}, true) if (selectedJobs.length != 0) filterComponent.updateFilters({dbId: selectedJobs})
showCompare = !showCompare showCompare = !showCompare
}} > }} >
{showCompare ? 'Return to List' : {showCompare ? 'Return to List' :
matchedListJobs >= 500 && selectedJobs.length == 0 matchedListJobs >= matchedJobCompareLimit && selectedJobs.length == 0
? 'Compare Disabled' ? 'Compare Disabled'
: 'Compare' + (selectedJobs.length != 0 ? ` ${selectedJobs.length} ` : ' ') + 'Jobs' : 'Compare' + (selectedJobs.length != 0 ? ` ${selectedJobs.length} ` : ' ') + 'Jobs'
} }

View File

@@ -22,6 +22,8 @@
Icon, Icon,
Spinner, Spinner,
Card, Card,
CardHeader,
CardBody
} from "@sveltestrap/sveltestrap"; } from "@sveltestrap/sveltestrap";
import { import {
queryStore, queryStore,
@@ -174,7 +176,11 @@
<InputGroupText><Icon name="clipboard2-pulse" /></InputGroupText> <InputGroupText><Icon name="clipboard2-pulse" /></InputGroupText>
<InputGroupText>Node State</InputGroupText> <InputGroupText>Node State</InputGroupText>
<Button class="flex-grow-1 text-center" color={stateColors[thisNodeState]} disabled> <Button class="flex-grow-1 text-center" color={stateColors[thisNodeState]} disabled>
{thisNodeState} {#if $nodeMetricsData?.data}
{thisNodeState}
{:else}
<span><Spinner size="sm" secondary/></span>
{/if}
</Button> </Button>
</InputGroup> </InputGroup>
</Col> </Col>
@@ -254,12 +260,15 @@
></Card ></Card
> >
{:else} {:else}
<Card <Card color="warning" class="mx-2">
style="margin-left: 2rem;margin-right: 2rem;" <CardHeader class="mb-0">
body <b>Missing Metric</b>
color="warning" </CardHeader>
>No dataset returned for <code>{item.name}</code></Card <CardBody>
> <p>No dataset returned for <b>{item.name}</b>.</p>
<p class="mb-1">Metric was not found in metric store for cluster <b>{cluster}</b>.</p>
</CardBody>
</Card>
{/if} {/if}
{/snippet} {/snippet}

View File

@@ -65,10 +65,6 @@
let timeoutId = null; let timeoutId = null;
/* State Init */ /* State Init */
// svelte-ignore state_referenced_locally
let to = $state(presetTo || new Date(Date.now()));
// svelte-ignore state_referenced_locally
let from = $state(presetFrom || new Date(nowDate.setHours(nowDate.getHours() - 4)));
let selectedResolution = $state(resampleConfig ? resampleDefault : 0); let selectedResolution = $state(resampleConfig ? resampleDefault : 0);
let hostnameFilter = $state(""); let hostnameFilter = $state("");
let hoststateFilter = $state("all"); let hoststateFilter = $state("all");
@@ -76,6 +72,8 @@
let isMetricsSelectionOpen = $state(false); let isMetricsSelectionOpen = $state(false);
/* Derived States */ /* Derived States */
let to = $derived(presetTo ? presetTo : new Date(Date.now()));
let from = $derived(presetFrom ? presetFrom : new Date(nowDate.setHours(nowDate.getHours() - 4)));
const displayNodeOverview = $derived((displayType === 'OVERVIEW')); const displayNodeOverview = $derived((displayType === 'OVERVIEW'));
const systemMetrics = $derived($initialized ? [...globalMetrics.filter((gm) => gm?.availability.find((av) => av.cluster == cluster))] : []); const systemMetrics = $derived($initialized ? [...globalMetrics.filter((gm) => gm?.availability.find((av) => av.cluster == cluster))] : []);
const presetSystemUnits = $derived(loadUnits(systemMetrics)); const presetSystemUnits = $derived(loadUnits(systemMetrics));

View File

@@ -241,12 +241,15 @@
if (filters.project) opts.push(`project=${filters.project}`); if (filters.project) opts.push(`project=${filters.project}`);
if (filters.project && filters.projectMatch != "contains") // "contains" is default-case if (filters.project && filters.projectMatch != "contains") // "contains" is default-case
opts.push(`projectMatch=${filters.projectMatch}`); opts.push(`projectMatch=${filters.projectMatch}`);
if (filters.user.length != 0) if (filters.user) {
if (filters.userMatch != "in") { if (filters.user.length != 0) {
opts.push(`user=${filters.user}`); if (filters.userMatch != "in") {
} else { opts.push(`user=${filters.user}`);
for (let singleUser of filters.user) opts.push(`user=${singleUser}`); } else {
for (let singleUser of filters.user) opts.push(`user=${singleUser}`);
}
} }
}
if (filters.userMatch != "contains") // "contains" is default-case if (filters.userMatch != "contains") // "contains" is default-case
opts.push(`userMatch=${filters.userMatch}`); opts.push(`userMatch=${filters.userMatch}`);
// Filter Modals // Filter Modals

View File

@@ -2,9 +2,10 @@
@component Search Field for Job-Lists with separate mode if project filter is active @component Search Field for Job-Lists with separate mode if project filter is active
Properties: Properties:
- `presetProject String?`: Currently active project filter [Default: ''] - `presetProject String?`: Currently active project filter preset [Default: '']
- `authlevel Number?`: The current users authentication level [Default: null] - `authlevel Number?`: The current users authentication level [Default: null]
- `roles [Number]?`: Enum containing available roles [Default: null] - `roles [Number]?`: Enum containing available roles [Default: null]
- `filterBuffer [Obj]?`: Currently active filters, if any.
- `setFilter Func`: The callback function to apply current filter selection - `setFilter Func`: The callback function to apply current filter selection
--> -->
@@ -18,78 +19,69 @@
presetProject = "", presetProject = "",
authlevel = null, authlevel = null,
roles = null, roles = null,
filterBuffer = [],
setFilter setFilter
} = $props(); } = $props();
/* Const Init*/ /* Const Init*/
const throttle = 500; const throttle = 300;
/* Var Init */ /* Var Init */
let user = "";
let jobName = "";
let timeoutId = null; let timeoutId = null;
/* State Init */ /* Derived */
let term = $state(""); const bufferProject = $derived.by(() => {
let bp = filterBuffer.find((fb) =>
Object.keys(fb).includes("project")
)
return bp?.project?.contains || null
});
/* Derived */ const bufferUser = $derived.by(() => {
let project = $derived(presetProject ? presetProject : ""); let bu = filterBuffer.find((fb) =>
let mode = $derived(presetProject ? "jobName" : "project"); Object.keys(fb).includes("user")
)
return bu?.user?.contains || null
});
const bufferJobName = $derived.by(() => {
let bjn = filterBuffer.find((fb) =>
Object.keys(fb).includes("jobName")
)
return bjn?.jobName?.contains || null
});
let mode = $derived.by(() => {
if (presetProject) return "jobName" // Search by jobName if presetProject set
else if (bufferUser) return "user"
else if (bufferJobName) return "jobName"
else return "project"
});
let term = $derived(bufferUser || bufferJobName || bufferProject || "");
/* Functions */ /* Functions */
function modeChanged() { function inputChanged(sleep = throttle) {
if (timeoutId != null) clearTimeout(timeoutId);
if (mode == "user") { if (mode == "user") {
project = presetProject ? presetProject : ""; timeoutId = setTimeout(() => {
jobName = ""; setFilter({ user: term, project: (presetProject ? presetProject : null), jobName: null });
}, sleep);
} else if (mode == "project") { } else if (mode == "project") {
user = "";
jobName = "";
} else {
project = presetProject ? presetProject : "";
user = "";
}
termChanged(0);
}
// Compatibility: Handle "user role" and "no role" identically
function termChanged(sleep = throttle) {
if (roles && authlevel >= roles.manager) {
if (mode == "user") user = term;
else if (mode == "project") project = term;
else jobName = term;
if (timeoutId != null) clearTimeout(timeoutId);
timeoutId = setTimeout(() => { timeoutId = setTimeout(() => {
setFilter({ setFilter({ project: term, user: null, jobName: null });
user, }, sleep);
project, } else if (mode == "jobName") {
jobName
});
}, sleep);
} else {
if (mode == "project") project = term;
else jobName = term;
if (timeoutId != null) clearTimeout(timeoutId);
timeoutId = setTimeout(() => { timeoutId = setTimeout(() => {
setFilter({ setFilter({ jobName: term, user: null, project: (presetProject ? presetProject : null) });
project, }, sleep);
jobName
});
}, sleep);
} }
} }
function resetProject () { function resetProject () {
mode = "project" presetProject = "";
term = "" term = "";
presetProject = "" inputChanged(0);
project = ""
jobName = ""
user = ""
termChanged(0);
} }
</script> </script>
@@ -100,12 +92,12 @@
class="form-select w-auto" class="form-select w-auto"
title="Search Mode" title="Search Mode"
bind:value={mode} bind:value={mode}
onchange={modeChanged} onchange={() => inputChanged()}
> >
{#if !presetProject} {#if !presetProject}
<option value={"project"}>Project</option> <option value={"project"}>Project</option>
{/if} {/if}
{#if roles && authlevel >= roles.manager} {#if roles && authlevel >= roles?.manager}
<option value={"user"}>User</option> <option value={"user"}>User</option>
{/if} {/if}
<option value={"jobName"}>Jobname</option> <option value={"jobName"}>Jobname</option>
@@ -113,8 +105,8 @@
<Input <Input
type="text" type="text"
bind:value={term} bind:value={term}
onchange={() => termChanged()} onchange={() => inputChanged()}
onkeyup={(event) => termChanged(event.key == "Enter" ? 0 : throttle)} onkeyup={(event) => inputChanged(event.key == "Enter" ? 0 : throttle)}
placeholder={presetProject ? `Find in ${scrambleNames ? scramble(presetProject) : presetProject} ...` : `Find ${mode} ...`} placeholder={presetProject ? `Find in ${scrambleNames ? scramble(presetProject) : presetProject} ...` : `Find ${mode} ...`}
/> />
{#if presetProject} {#if presetProject}

View File

@@ -229,7 +229,12 @@
></Card ></Card
> >
{:else} {:else}
<Card body color="warning">No dataset returned</Card> <Card body class="mx-2" color="warning">
<p>No dataset(s) returned for <b>{metrics[i]}</b></p>
<p class="mb-1">Metric or host was not found in metric store for cluster <b>{job.cluster}</b>:</p>
<p class="mb-1">Identical messages in <i>{metrics[i]} column</i>: Metric not found.</p>
<p class="mb-1">Identical messages in <i>job {job.jobId} row</i>: Host not found.</p>
</Card>
{/if} {/if}
</td> </td>
{/each} {/each}

View File

@@ -25,7 +25,7 @@
metricData, metricData,
timestep, timestep,
numNodes, numNodes,
cluster, cluster = "",
forNode = true, forNode = true,
enableFlip = false, enableFlip = false,
publicMode = false, publicMode = false,
@@ -316,12 +316,14 @@
<div bind:this={plotWrapper} bind:clientWidth={width} <div bind:this={plotWrapper} bind:clientWidth={width}
class={forNode ? 'py-2 rounded' : 'rounded'} class={forNode ? 'py-2 rounded' : 'rounded'}
></div> ></div>
{:else if cluster}
<Card body color="warning" class="mx-4"
>Cannot render plot: No series data returned for <code>{cluster}</code>.</Card
>
{:else} {:else}
<Card body color="warning" class="mx-4" <Card color="warning" class="mx-2 mt-2">
>Cannot render plot: No series data returned.</Card <CardHeader class="mb-0">
> <b>Empty Metrics</b>
</CardHeader>
<CardBody>
<p>Cannot render plot for cluster <b>{cluster}</b>.</p>
<p class="mb-1">Metrics found but returned without timeseries data.</p>
</CardBody>
</Card>
{/if} {/if}

View File

@@ -27,7 +27,7 @@
import uPlot from "uplot"; import uPlot from "uplot";
import { formatNumber, formatDurationTime } from "../units.js"; import { formatNumber, formatDurationTime } from "../units.js";
import { getContext, onMount, onDestroy } from "svelte"; import { getContext, onMount, onDestroy } from "svelte";
import { Card } from "@sveltestrap/sveltestrap"; import { Card, CardBody, CardHeader } from "@sveltestrap/sveltestrap";
/* Svelte 5 Props */ /* Svelte 5 Props */
let { let {
@@ -633,7 +633,13 @@
style="background-color: {backgroundColor()};" class={forNode ? 'py-2 rounded' : 'rounded'} style="background-color: {backgroundColor()};" class={forNode ? 'py-2 rounded' : 'rounded'}
></div> ></div>
{:else} {:else}
<Card body color="warning" class="mx-4" <Card color="warning" class={forNode ? 'mx-2' : 'mt-2'}>
>Cannot render plot: No series data returned for <code>{metric}</code></Card <CardHeader class="mb-0">
> <b>Empty Metric</b>
</CardHeader>
<CardBody>
<p>Cannot render plot for <b>{metric}</b>.</p>
<p class="mb-1">Metric found but returned without timeseries data.</p>
</CardBody>
</Card>
{/if} {/if}

View File

@@ -67,6 +67,11 @@
reserved: "rgba(255, 0, 255, 0.75)", reserved: "rgba(255, 0, 255, 0.75)",
mixed: "rgba(255, 215, 0, 0.75)", mixed: "rgba(255, 215, 0, 0.75)",
unknown: "rgba(0, 0, 0, 0.75)" unknown: "rgba(0, 0, 0, 0.75)"
},
healthStates: {
full: "rgba(0, 128, 0, 0.75)",
failed: "rgba(255, 0, 0, 0.75)",
partial: "rgba(255, 215, 0, 0.75)",
} }
} }
</script> </script>

View File

@@ -46,6 +46,7 @@
/* Const Init */ /* Const Init */
const lineWidth = 2 // clusterCockpitConfig.plotConfiguration_lineWidth; const lineWidth = 2 // clusterCockpitConfig.plotConfiguration_lineWidth;
const cbmode = clusterCockpitConfig?.plotConfiguration_colorblindMode || false; const cbmode = clusterCockpitConfig?.plotConfiguration_colorblindMode || false;
const bubbleSizeMax = 50;
/* Var Init */ /* Var Init */
let timeoutId = null; let timeoutId = null;
@@ -317,8 +318,13 @@
size = sizeBase + scaling size = sizeBase + scaling
// Nodes: Size based on Jobcount // Nodes: Size based on Jobcount
} else if (nodesData) { } else if (nodesData) {
size = sizeBase + (nodesData[i]?.numJobs * 1.5) // Max Jobs Scale: 8 * 1.5 = 12 size = sizeBase + (nodesData[i]?.numJobs * 1.5)
}; };
// Apply Size Capping
if (size >= bubbleSizeMax) {
size = bubbleSizeMax;
}
if (xVal >= filtLft && xVal <= filtRgt && yVal >= filtBtm && yVal <= filtTop) { if (xVal >= filtLft && xVal <= filtRgt && yVal >= filtBtm && yVal <= filtTop) {
let cx = valToPosX(xVal, scaleX, xDim, xOff); let cx = valToPosX(xVal, scaleX, xDim, xOff);

View File

@@ -46,13 +46,13 @@
/* Derived */ /* Derived */
let timeRange = $derived.by(() => { let timeRange = $derived.by(() => {
if (presetTo && presetFrom) { if (presetTo && presetFrom) {
return ((presetTo.getTime() - presetFrom.getTime()) / 1000) return Math.floor(((presetTo.getTime() - presetFrom.getTime()) / 1000))
} else { } else {
return ((defaultTo.getTime() - defaultFrom.getTime()) / 1000) return Math.floor(((defaultTo.getTime() - defaultFrom.getTime()) / 1000))
} }
}); });
let unknownRange = $derived(!Object.values(options).includes(timeRange)); let unknownRange = $derived(!Object.values(options).includes(timeRange));
/* Functions */ /* Functions */
function updateTimeRange() { function updateTimeRange() {
let now = Date.now(); let now = Date.now();

View File

@@ -17,11 +17,26 @@ export function formatNumber(x) {
} }
} }
export function scaleNumber(x, p = '') {
if ( isNaN(x) || x == null) {
return `${x} ${p}` // Return if String or Null
} else {
const oldPower = power[prefix.indexOf(p)]
const rawValue = x * oldPower
for (let i = 0; i < prefix.length; i++) {
if (power[i] <= rawValue && rawValue < power[i+1]) {
return `${Math.round((rawValue / power[i]) * 100) / 100} ${prefix[i]}`
}
}
return `${x} ${p}`
}
}
export function roundTwoDigits(x) { export function roundTwoDigits(x) {
return Math.round(x * 100) / 100 return Math.round(x * 100) / 100
} }
export function scaleNumbers(x, y , p = '') { export function scaleNumbers(x, y, p = '') {
const oldPower = power[prefix.indexOf(p)] const oldPower = power[prefix.indexOf(p)]
const rawXValue = x * oldPower const rawXValue = x * oldPower
const rawYValue = y * oldPower const rawYValue = y * oldPower

View File

@@ -55,6 +55,7 @@
function setupAvailable(data) { function setupAvailable(data) {
let pendingAvailable = {}; let pendingAvailable = {};
if (data) { if (data) {
// Returns Only For Available Metrics
for (let d of data) { for (let d of data) {
if (!pendingAvailable[d.name]) { if (!pendingAvailable[d.name]) {
pendingAvailable[d.name] = [d.scope] pendingAvailable[d.name] = [d.scope]
@@ -90,13 +91,16 @@
pendingTableData[host] = {}; pendingTableData[host] = {};
}; };
for (const metric of sm) { for (const metric of sm) {
if (!pendingTableData[host][metric]) { // Only Returned, Available Metrics
pendingTableData[host][metric] = {}; if (as[metric]) {
}; if (!pendingTableData[host][metric]) {
for (const scope of as[metric]) { pendingTableData[host][metric] = {};
pendingTableData[host][metric][scope] = js.find((d) => d.name == metric && d.scope == scope) };
?.stats.filter((st) => st.hostname == host && st.data != null) for (const scope of as[metric]) {
?.sort((a, b) => a.id - b.id) || [] pendingTableData[host][metric][scope] = js.find((d) => d.name == metric && d.scope == scope)
?.stats.filter((st) => st.hostname == host && st.data != null)
?.sort((a, b) => a.id - b.id) || []
};
}; };
}; };
}; };
@@ -136,40 +140,56 @@
<th></th> <th></th>
{#each selectedMetrics as metric} {#each selectedMetrics as metric}
<!-- To Match Row-2 Header Field Count--> <!-- To Match Row-2 Header Field Count-->
<th colspan={selectedScopes[metric] == "node" ? 3 : 4}> {#if availableScopes[metric]}
<InputGroup> <th colspan={selectedScopes[metric] == "node" ? 3 : 4}>
<InputGroupText> <InputGroup>
{metric} <InputGroupText>
</InputGroupText> {metric}
<Input type="select" bind:value={selectedScopes[metric]} disabled={availableScopes[metric]?.length === 1}> </InputGroupText>
{#each (availableScopes[metric] || []) as scope} <Input type="select" bind:value={selectedScopes[metric]} disabled={availableScopes[metric]?.length === 1}>
<option value={scope}>{scope}</option> {#each (availableScopes[metric] || []) as scope}
{/each} <option value={scope}>{scope}</option>
</Input> {/each}
</InputGroup> </Input>
</th> </InputGroup>
</th>
{:else}
<th>
<InputGroup>
<InputGroupText>
{metric}
</InputGroupText>
</InputGroup>
</th>
{/if}
{/each} {/each}
</tr> </tr>
<!-- Header Row 2: Fields --> <!-- Header Row 2: Fields -->
<tr> <tr>
<th>Node</th> <th>Node</th>
{#each selectedMetrics as metric} {#each selectedMetrics as metric}
{#if selectedScopes[metric] != "node"} {#if availableScopes[metric]}
<th>Id</th> {#if selectedScopes[metric] != "node"}
{/if} <th>Id</th>
{#each ["min", "avg", "max"] as stat} {/if}
<th onclick={() => sortBy(metric, stat)}> {#each ["min", "avg", "max"] as stat}
{stat} <th onclick={() => sortBy(metric, stat)}>
{#if selectedScopes[metric] == "node"} {stat}
<Icon {#if selectedScopes[metric] == "node"}
name="caret-{sorting[metric][stat].dir}{sorting[metric][stat] <Icon
.active name="caret-{sorting[metric][stat].dir}{sorting[metric][stat]
? '-fill' .active
: ''}" ? '-fill'
/> : ''}"
{/if} />
{/if}
</th>
{/each}
{:else}
<th class="table-warning">
Missing Metric
</th> </th>
{/each} {/if}
{/each} {/each}
</tr> </tr>
</thead> </thead>
@@ -178,10 +198,17 @@
<tr> <tr>
<th scope="col">{host}</th> <th scope="col">{host}</th>
{#each selectedMetrics as metric (metric)} {#each selectedMetrics as metric (metric)}
<StatsTableEntry {#if tableData[host][metric]}
data={tableData[host][metric][selectedScopes[metric]]} <StatsTableEntry
scope={selectedScopes[metric]} data={tableData[host][metric][selectedScopes[metric]]}
/> scope={selectedScopes[metric]}
/>
{:else}
<td class="table-warning" style="max-width:10rem;">
<p>No dataset(s) returned for <b>{metric}</b>.</p>
<p>Metric was not found in metric store for host <b>{host}</b>.</p>
</td>
{/if}
{/each} {/each}
</tr> </tr>
{/each} {/each}

View File

@@ -22,6 +22,7 @@
import { import {
formatDurationTime, formatDurationTime,
formatNumber, formatNumber,
scaleNumber
} from "../generic/units.js"; } from "../generic/units.js";
import { import {
Row, Row,
@@ -250,9 +251,11 @@
if (!rawInfos['totalAccs']) rawInfos['totalAccs'] = (subCluster?.numberOfNodes * subCluster?.topology?.accelerators?.length) || 0; if (!rawInfos['totalAccs']) rawInfos['totalAccs'] = (subCluster?.numberOfNodes * subCluster?.topology?.accelerators?.length) || 0;
else rawInfos['totalAccs'] += (subCluster?.numberOfNodes * subCluster?.topology?.accelerators?.length) || 0; else rawInfos['totalAccs'] += (subCluster?.numberOfNodes * subCluster?.topology?.accelerators?.length) || 0;
// Units (Set Once) // Unit Parts (Set Once)
if (!rawInfos['flopRateUnit']) rawInfos['flopRateUnit'] = subCluster.flopRateSimd.unit.prefix + subCluster.flopRateSimd.unit.base if (!rawInfos['flopRateUnitBase']) rawInfos['flopRateUnitBase'] = subCluster.flopRateSimd.unit.base
if (!rawInfos['memBwRateUnit']) rawInfos['memBwRateUnit'] = subCluster.memoryBandwidth.unit.prefix + subCluster.memoryBandwidth.unit.base if (!rawInfos['memBwRateUnitBase']) rawInfos['memBwRateUnitBase'] = subCluster.memoryBandwidth.unit.base
if (!rawInfos['flopRateUnitPrefix']) rawInfos['flopRateUnitPrefix'] = subCluster.flopRateSimd.unit.prefix
if (!rawInfos['memBwRateUnitPrefix']) rawInfos['memBwRateUnitPrefix'] = subCluster.memoryBandwidth.unit.prefix
// Get Maxima For Roofline Knee Render // Get Maxima For Roofline Knee Render
if (!rawInfos['roofData']) { if (!rawInfos['roofData']) {
@@ -268,10 +271,14 @@
} }
} }
// Get Idle Infos after Sums // Get Simple Idle Infos after Sums by Diff
if (!rawInfos['idleNodes']) rawInfos['idleNodes'] = rawInfos['totalNodes'] - rawInfos['allocatedNodes']; if (!rawInfos['idleNodes']) rawInfos['idleNodes'] = rawInfos['totalNodes'] - rawInfos['allocatedNodes'];
if (!rawInfos['idleCores']) rawInfos['idleCores'] = rawInfos['totalCores'] - rawInfos['allocatedCores']; if (!rawInfos['idleCores']) rawInfos['idleCores'] = rawInfos['totalCores'] - rawInfos['allocatedCores'];
if (!rawInfos['idleAccs']) rawInfos['idleAccs'] = rawInfos['totalAccs'] - rawInfos['allocatedAccs']; if (!rawInfos['idleAccs']) rawInfos['idleAccs'] = rawInfos['totalAccs'] - rawInfos['allocatedAccs'];
// Cap at 0 (Negative hints towards Config <> Reality Mismatch!)
if (rawInfos['idleNodes'] < 0) rawInfos['idleNodes'] = 0;
if (rawInfos['idleCores'] < 0) rawInfos['idleCores'] = 0;
if (rawInfos['idleAccs'] < 0) rawInfos['idleAccs'] = 0;
// Keymetrics (Data on Cluster-Scope) // Keymetrics (Data on Cluster-Scope)
let rawFlops = $statusQuery?.data?.nodeMetrics?.reduce((sum, node) => let rawFlops = $statusQuery?.data?.nodeMetrics?.reduce((sum, node) =>
@@ -418,12 +425,10 @@
</tr> </tr>
<tr class="pb-2"> <tr class="pb-2">
<td style="font-size:x-large;"> <td style="font-size:x-large;">
{clusterInfo?.flopRate} {scaleNumber(clusterInfo?.flopRate, clusterInfo?.flopRateUnitPrefix)}{clusterInfo?.flopRateUnitBase}
{clusterInfo?.flopRateUnit}
</td> </td>
<td style="font-size:x-large;"> <td style="font-size:x-large;">
{clusterInfo?.memBwRate} {scaleNumber(clusterInfo?.memBwRate, clusterInfo?.memBwRateUnitPrefix)}{clusterInfo?.memBwRateUnitBase}
{clusterInfo?.memBwRateUnit}
</td> </td>
</tr> </tr>
<hr class="my-1"/> <hr class="my-1"/>

View File

@@ -23,7 +23,7 @@
gql, gql,
getContextClient, getContextClient,
} from "@urql/svelte"; } from "@urql/svelte";
import { formatDurationTime } from "../../generic/units.js"; import { formatDurationTime, scaleNumber } from "../../generic/units.js";
import Refresher from "../../generic/helper/Refresher.svelte"; import Refresher from "../../generic/helper/Refresher.svelte";
import TimeSelection from "../../generic/select/TimeSelection.svelte"; import TimeSelection from "../../generic/select/TimeSelection.svelte";
import Roofline from "../../generic/plots/Roofline.svelte"; import Roofline from "../../generic/plots/Roofline.svelte";
@@ -418,7 +418,7 @@
{:else if $statesTimed.error} {:else if $statesTimed.error}
<Row cols={1} class="text-center mt-3"> <Row cols={1} class="text-center mt-3">
<Col> <Col>
<Card body color="danger">{$statesTimed.error.message}</Card> <Card body color="danger">States Timed: {$statesTimed.error.message}</Card>
</Col> </Col>
</Row> </Row>
{:else if $statesTimed.data} {:else if $statesTimed.data}
@@ -472,7 +472,7 @@
{:else if $statusQuery.error} {:else if $statusQuery.error}
<Row cols={1} class="text-center mt-3"> <Row cols={1} class="text-center mt-3">
<Col> <Col>
<Card body color="danger">{$statesTimed.error.message}</Card> <Card body color="danger">Status Query (States): {$statesTimed.error.message}</Card>
</Col> </Col>
</Row> </Row>
{:else if $statusQuery?.data?.nodeStates} {:else if $statusQuery?.data?.nodeStates}
@@ -484,7 +484,6 @@
Current {cluster.charAt(0).toUpperCase() + cluster.slice(1)} Node States Current {cluster.charAt(0).toUpperCase() + cluster.slice(1)} Node States
</h4> </h4>
<Pie <Pie
{useAltColors}
canvasId="hpcpie-slurm" canvasId="hpcpie-slurm"
size={pieWidth * 0.55} size={pieWidth * 0.55}
sliceLabel="Nodes" sliceLabel="Nodes"
@@ -494,6 +493,9 @@
entities={refinedStateData.map( entities={refinedStateData.map(
(sd) => sd.state, (sd) => sd.state,
)} )}
fixColors={refinedStateData.map(
(sd) => colors['nodeStates'][sd.state],
)}
/> />
{/key} {/key}
</div> </div>
@@ -508,7 +510,7 @@
</tr> </tr>
{#each refinedStateData as sd, i} {#each refinedStateData as sd, i}
<tr> <tr>
<td><Icon name="circle-fill" style="color: {legendColors(i)};"/></td> <td><Icon name="circle-fill" style="color: {colors['nodeStates'][sd.state]};"/></td>
<td>{sd.state}</td> <td>{sd.state}</td>
<td>{sd.count}</td> <td>{sd.count}</td>
</tr> </tr>
@@ -524,15 +526,17 @@
Current {cluster.charAt(0).toUpperCase() + cluster.slice(1)} Node Health Current {cluster.charAt(0).toUpperCase() + cluster.slice(1)} Node Health
</h4> </h4>
<Pie <Pie
{useAltColors}
canvasId="hpcpie-health" canvasId="hpcpie-health"
size={pieWidth * 0.55} size={pieWidth * 0.55}
sliceLabel="Nodes" sliceLabel="Nodes"
quantities={refinedHealthData.map( quantities={refinedHealthData.map(
(sd) => sd.count, (hd) => hd.count,
)} )}
entities={refinedHealthData.map( entities={refinedHealthData.map(
(sd) => sd.state, (hd) => hd.state,
)}
fixColors={refinedHealthData.map(
(hd) => colors['healthStates'][hd.state],
)} )}
/> />
{/key} {/key}
@@ -548,7 +552,7 @@
</tr> </tr>
{#each refinedHealthData as hd, i} {#each refinedHealthData as hd, i}
<tr> <tr>
<td><Icon name="circle-fill" style="color: {legendColors(i)};" /></td> <td><Icon name="circle-fill"style="color: {colors['healthStates'][hd.state]};" /></td>
<td>{hd.state}</td> <td>{hd.state}</td>
<td>{hd.count}</td> <td>{hd.count}</td>
</tr> </tr>
@@ -570,7 +574,7 @@
{:else if $statusQuery.error} {:else if $statusQuery.error}
<Row cols={1} class="text-center mt-3"> <Row cols={1} class="text-center mt-3">
<Col> <Col>
<Card body color="danger">{$statusQuery.error.message}</Card> <Card body color="danger">Status Query (Details): {$statusQuery.error.message}</Card>
</Col> </Col>
</Row> </Row>
{:else if $statusQuery.data} {:else if $statusQuery.data}
@@ -599,12 +603,10 @@
</tr> </tr>
<tr class="pb-2"> <tr class="pb-2">
<td style="font-size:x-large;"> <td style="font-size:x-large;">
{flopRate[subCluster.name]} {scaleNumber(flopRate[subCluster.name], flopRateUnitPrefix[subCluster.name])}{flopRateUnitBase[subCluster.name]}
{flopRateUnitPrefix[subCluster.name]}{flopRateUnitBase[subCluster.name]}
</td> </td>
<td colspan="2" style="font-size:x-large;"> <td colspan="2" style="font-size:x-large;">
{memBwRate[subCluster.name]} {scaleNumber(memBwRate[subCluster.name], memBwRateUnitPrefix[subCluster.name])}{memBwRateUnitBase[subCluster.name]}
{memBwRateUnitPrefix[subCluster.name]}{memBwRateUnitBase[subCluster.name]}
</td> </td>
</tr> </tr>
<hr class="my-1"/> <hr class="my-1"/>

View File

@@ -14,7 +14,7 @@
<script> <script>
import { getContext } from "svelte"; import { getContext } from "svelte";
import { queryStore, gql, getContextClient } from "@urql/svelte"; import { queryStore, gql, getContextClient } from "@urql/svelte";
import { Row, Col, Card, Spinner, Badge } from "@sveltestrap/sveltestrap"; import { Row, Col, Card, CardHeader, CardBody, Spinner, Badge } from "@sveltestrap/sveltestrap";
import { checkMetricDisabled } from "../generic/utils.js"; import { checkMetricDisabled } from "../generic/utils.js";
import MetricPlot from "../generic/plots/MetricPlot.svelte"; import MetricPlot from "../generic/plots/MetricPlot.svelte";
@@ -156,37 +156,63 @@
> >
</h4> </h4>
<span style="margin-right: 0.5rem;"> <span style="margin-right: 0.5rem;">
<Badge color={stateColors[item?.state? item.state : 'notindb']}>{item?.state? item.state : 'notindb'}</Badge> <Badge color={stateColors[item?.state? item.state : 'notindb']}>
State: {item?.state? item.state.charAt(0).toUpperCase() + item.state.slice(1) : 'Not in DB'}
</Badge>
</span> </span>
</div> </div>
{#if item.disabled === true} {#if item?.data}
<Card body class="mx-3" color="info" {#if item.disabled === true}
>Metric disabled for subcluster <code <Card body class="mx-3" color="info"
>{selectedMetric}:{item.subCluster}</code >Metric disabled for subcluster <code
></Card >{selectedMetric}:{item.subCluster}</code
> ></Card
{:else if item.disabled === false} >
<!-- "No Data"-Warning included in MetricPlot-Component --> {:else if item.disabled === false}
<!-- #key: X-axis keeps last selected timerange otherwise --> <!-- "No Data"-Warning included in MetricPlot-Component -->
{#key item.data[0].metric.series[0].data.length} <!-- #key: X-axis keeps last selected timerange otherwise -->
<MetricPlot {#key item.data[0].metric.series[0].data.length}
timestep={item.data[0].metric.timestep} <MetricPlot
series={item.data[0].metric.series} timestep={item.data[0].metric.timestep}
metric={item.data[0].name} series={item.data[0].metric.series}
{cluster} metric={item.data[0].name}
subCluster={item.subCluster} {cluster}
forNode subCluster={item.subCluster}
enableFlip forNode
/> enableFlip
{/key} />
{:else if item.disabled === null} {/key}
<Card body class="mx-3" color="info"> {:else if item.disabled === null}
Global Metric List Not Initialized <Card body class="mx-3" color="info">
Can not determine {selectedMetric} availability: Please Reload Page Global Metric List Not Initialized
Can not determine {selectedMetric} availability: Please Reload Page
</Card>
{/if}
{:else}
<Card color="warning">
<CardHeader class="mb-0">
<b>Missing Metric</b>
</CardHeader>
<CardBody>
<p>No dataset(s) returned for <b>{selectedMetric}</b>.</p>
<p class="mb-1">Metric was not found in metric store for host <b>{item.host}</b>.</p>
</CardBody>
</Card> </Card>
{/if} {/if}
</Col> </Col>
{/each} {/each}
{/key} {/key}
</Row> </Row>
{:else}
<Row>
<Card color="warning">
<CardHeader class="mb-0">
<b>Missing Metric</b>
</CardHeader>
<CardBody>
<p>No datasets returned for <b>{selectedMetric}</b>.</p>
<p class="mb-1">Metric was not found in metric store for cluster <b>{cluster}</b>.</p>
</CardBody>
</Card>
</Row>
{/if} {/if}

View File

@@ -171,13 +171,18 @@
{#key metricData} {#key metricData}
<td> <td>
{#if metricData?.disabled} {#if metricData?.disabled}
<Card body class="mx-3" color="info" <Card body class="mx-2" color="info"
>Metric disabled for subcluster <code >Metric <b>{selectedMetrics[i]}</b> disabled for subcluster <code
>{metricData?.data?.name ? metricData.data.name : `Metric Index ${i}`}:{nodeData.subCluster}</code >{nodeData.subCluster}</code
></Card ></Card
> >
{:else if !metricData?.data}
<Card body class="mx-2" color="warning">
<p>No dataset(s) returned for <b>{selectedMetrics[i]}</b></p>
<p class="mb-1">Metric was not found in metric store for cluster <b>{cluster}</b>.</p>
</Card>
{:else if !metricData?.data?.name} {:else if !metricData?.data?.name}
<Card body class="mx-3" color="warning" <Card body class="mx-2" color="warning"
>Metric without name for subcluster <code >Metric without name for subcluster <code
>{`Metric Index ${i}`}:{nodeData.subCluster}</code >{`Metric Index ${i}`}:{nodeData.subCluster}</code
></Card ></Card