diff --git a/pkg/metricstore/healthcheck.go b/pkg/metricstore/healthcheck.go
index 59c84f79..ed1ff38e 100644
--- a/pkg/metricstore/healthcheck.go
+++ b/pkg/metricstore/healthcheck.go
@@ -6,9 +6,7 @@
 package metricstore
 
 import (
-	"cmp"
 	"fmt"
-	"slices"
 	"time"
 
 	cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
@@ -16,25 +14,18 @@ import (
 )
 
 // HealthCheckResponse represents the result of a health check operation.
-//
-// Status indicates the monitoring state (Full, Partial, Failed).
-// Error contains any error encountered during the health check.
 type HealthCheckResponse struct {
 	Status schema.MonitoringState
 	Error  error
 }
 
-// MaxMissingDataPoints is a threshold that allows a node to be healthy with certain number of data points missing.
-// Suppose a node does not receive last 5 data points, then healthCheck endpoint will still say a
-// node is healthy. Anything more than 5 missing points in metrics of the node will deem the node unhealthy.
+// MaxMissingDataPoints is the threshold for stale data detection.
+// A buffer is considered healthy if the gap between its last data point
+// and the current time is within MaxMissingDataPoints * frequency.
 const MaxMissingDataPoints int64 = 5
 
-// isBufferHealthy checks if a buffer has received data for the last MaxMissingDataPoints.
-//
-// Returns true if the buffer is healthy (recent data within threshold), false otherwise.
-// A nil buffer or empty buffer is considered unhealthy.
+// bufferExists returns true if the buffer is non-nil and contains data.
 func (b *buffer) bufferExists() bool {
-	// Check if the buffer is empty
 	if b == nil || b.data == nil || len(b.data) == 0 {
 		return false
 	}
@@ -42,233 +33,139 @@ func (b *buffer) bufferExists() bool {
 	return true
 }
 
-// isBufferHealthy checks if a buffer has received data for the last MaxMissingDataPoints.
-//
-// Returns true if the buffer is healthy (recent data within threshold), false otherwise.
-// A nil buffer or empty buffer is considered unhealthy.
+// isBufferHealthy returns true if the buffer has recent data within
+// MaxMissingDataPoints * frequency of the current time.
 func (b *buffer) isBufferHealthy() bool {
-	// Get the last endtime of the buffer
 	bufferEnd := b.start + b.frequency*int64(len(b.data))
 	t := time.Now().Unix()
 
-	// Check if the buffer has recent data (within MaxMissingDataPoints threshold)
-	if t-bufferEnd > MaxMissingDataPoints*b.frequency {
-		return false
-	}
-
-	return true
+	return t-bufferEnd <= MaxMissingDataPoints*b.frequency
 }
 
-// MergeUniqueSorted merges two lists, sorts them, and removes duplicates.
-// Requires 'cmp.Ordered' because we need to sort the data.
-func mergeList[string cmp.Ordered](list1, list2 []string) []string {
-	// 1. Combine both lists
-	result := append(list1, list2...)
-
-	// 2. Sort the combined list
-	slices.Sort(result)
-
-	// 3. Compact removes consecutive duplicates (standard in Go 1.21+)
-	// e.g. [1, 1, 2, 3, 3] -> [1, 2, 3]
-	result = slices.Compact(result)
-
-	return result
-}
-
-// getHealthyMetrics recursively collects healthy and degraded metrics at this level and below.
+// collectMetricStatus walks the subtree rooted at l and classifies each
+// expected metric into the healthy or degraded map.
 //
-// A metric is considered:
-// - Healthy: buffer has recent data within MaxMissingDataPoints threshold AND has few/no NaN values
-// - Degraded: buffer exists and has recent data, but contains more than MaxMissingDataPoints NaN values
-//
-// This routine walks the entire subtree starting from the current level.
-//
-// Parameters:
-// - m: MemoryStore containing the global metric configuration
-//
-// Returns:
-// - []string: Flat list of healthy metric names from this level and all children
-// - []string: Flat list of degraded metric names (exist but have too many missing values)
-// - error: Non-nil only for internal errors during recursion
-//
-// The routine mirrors healthCheck() but provides more granular classification:
-// - healthCheck() finds problems (stale/missing)
-// - getHealthyMetrics() separates healthy from degraded metrics
-func (l *Level) getHealthyMetrics(m *MemoryStore, expectedMetrics []string) ([]string, []string, error) {
+// Classification rules (evaluated per buffer, pessimistic):
+// - A single stale buffer marks the metric as degraded permanently.
+// - A healthy buffer only counts if no stale buffer has been seen.
+// - Metrics absent from the global config or without any buffer remain
+//   in neither map and are later reported as missing.
+func (l *Level) collectMetricStatus(m *MemoryStore, expectedMetrics []string, healthy, degraded map[string]bool) {
 	l.lock.RLock()
 	defer l.lock.RUnlock()
 
-	globalMetrics := m.Metrics
+	for _, metricName := range expectedMetrics {
+		if degraded[metricName] {
+			continue // already degraded, cannot improve
+		}
+		mc := m.Metrics[metricName]
+		b := l.metrics[mc.offset]
+		if b.bufferExists() {
+			if !b.isBufferHealthy() {
+				degraded[metricName] = true
+				delete(healthy, metricName)
+			} else if !degraded[metricName] {
+				healthy[metricName] = true
+			}
+		}
+	}
+
+	for _, lvl := range l.children {
+		lvl.collectMetricStatus(m, expectedMetrics, healthy, degraded)
+	}
+}
+
+// getHealthyMetrics walks the complete subtree rooted at l and classifies
+// each expected metric by comparing the collected status against the
+// expected list.
+//
+// Returns:
+// - degradedList: metrics with at least one stale buffer in the subtree
+// - missingList: metrics not found in the global config or without any buffer
+func (l *Level) getHealthyMetrics(m *MemoryStore, expectedMetrics []string) ([]string, []string) {
+	healthy := make(map[string]bool, len(expectedMetrics))
+	degraded := make(map[string]bool)
+
+	l.collectMetricStatus(m, expectedMetrics, healthy, degraded)
 
 	missingList := make([]string, 0)
 	degradedList := make([]string, 0)
 
-	// Phase 1: Check metrics at this level
 	for _, metricName := range expectedMetrics {
-		offset := globalMetrics[metricName].offset
-		b := l.metrics[offset]
+		if healthy[metricName] {
+			continue
+		}
 
-		if !b.bufferExists() {
-			missingList = append(missingList, metricName)
-		} else if !b.isBufferHealthy() {
+		if degraded[metricName] {
 			degradedList = append(degradedList, metricName)
+		} else {
+			missingList = append(missingList, metricName)
 		}
 	}
 
-	// Phase 2: Recursively check child levels
-	for _, lvl := range l.children {
-		childMissing, childDegraded, err := lvl.getHealthyMetrics(m, expectedMetrics)
-		if err != nil {
-			return nil, nil, err
-		}
-
-		missingList = mergeList(missingList, childMissing)
-		degradedList = mergeList(degradedList, childDegraded)
-	}
-
-	return missingList, degradedList, nil
+	return degradedList, missingList
 }
 
-// GetHealthyMetrics returns healthy and degraded metrics for a specific node as flat lists.
+// GetHealthyMetrics returns degraded and missing metric lists for a node.
 //
-// This routine walks the metric tree starting from the specified node selector
-// and collects all metrics that have received data within the last MaxMissingDataPoints
-// (default: 5 data points). Metrics are classified into two categories:
+// It walks the metric tree starting from the node identified by selector
+// and classifies each expected metric:
+// - Degraded: at least one stale buffer exists in the subtree
+// - Missing: no buffer anywhere in the subtree, or metric not in global config
 //
-// - Healthy: Buffer has recent data AND contains few/no NaN (missing) values
-// - Degraded: Buffer has recent data BUT contains more than MaxMissingDataPoints NaN values
-//
-// The returned lists include both node-level metrics (e.g., "load", "mem_used") and
-// hardware-level metrics (e.g., "cpu_user", "gpu_temp") in flat slices.
-//
-// Parameters:
-// - selector: Hierarchical path to the target node, typically []string{cluster, hostname}.
-//   Example: []string{"emmy", "node001"} navigates to the "node001" host in the "emmy" cluster.
-//   The selector must match the hierarchy used during metric ingestion.
-//
-// Returns:
-// - []string: Flat list of healthy metric names (recent data, few missing values)
-// - []string: Flat list of degraded metric names (recent data, many missing values)
-// - error: Non-nil if the node is not found or internal errors occur
-//
-// Example usage:
-//
-//	selector := []string{"emmy", "node001"}
-//	healthyMetrics, degradedMetrics, err := ms.GetHealthyMetrics(selector)
-//	if err != nil {
-//		// Node not found or internal error
-//		return err
-//	}
-//	fmt.Printf("Healthy metrics: %v\n", healthyMetrics)
-//	// Output: ["load", "mem_used", "cpu_user", ...]
-//	fmt.Printf("Degraded metrics: %v\n", degradedMetrics)
-//	// Output: ["gpu_temp", "network_rx", ...] (metrics with many NaN values)
-//
-// Note: This routine provides more granular classification than HealthCheck:
-// - HealthCheck reports stale/missing metrics (problems)
-// - GetHealthyMetrics separates fully healthy from degraded metrics (quality levels)
+// Metrics present in expectedMetrics but absent from both returned lists
+// are considered fully healthy.
 func (m *MemoryStore) GetHealthyMetrics(selector []string, expectedMetrics []string) ([]string, []string, error) {
 	lvl := m.root.findLevel(selector)
 	if lvl == nil {
-		return nil, nil, fmt.Errorf("[METRICSTORE]> error while GetHealthyMetrics, host not found: %#v", selector)
+		return nil, nil, fmt.Errorf("[METRICSTORE]> GetHealthyMetrics: host not found: %#v", selector)
 	}
 
-	missingList, degradedList, err := lvl.getHealthyMetrics(m, expectedMetrics)
-	if err != nil {
-		return nil, nil, err
-	}
-
-	return missingList, degradedList, nil
+	degradedList, missingList := lvl.getHealthyMetrics(m, expectedMetrics)
+	return degradedList, missingList, nil
 }
 
-// HealthCheck performs health checks on multiple nodes and returns their monitoring states.
+// HealthCheck evaluates multiple nodes against a set of expected metrics
+// and returns a monitoring state per node.
 //
-// This routine provides a batch health check interface that evaluates multiple nodes
-// against a specific set of expected metrics. For each node, it determines the overall
-// monitoring state based on which metrics are healthy, degraded, or missing.
-//
-// Health Status Classification:
-// - MonitoringStateFull: All expected metrics are healthy (recent data, few missing values)
-// - MonitoringStatePartial: Some metrics are degraded (many missing values) or missing
-// - MonitoringStateFailed: Node not found or all expected metrics are missing/stale
-//
-// Parameters:
-// - cluster: Cluster name (first element of selector path)
-// - nodes: List of node hostnames to check
-// - expectedMetrics: List of metric names that should be present on each node
-//
-// Returns:
-// - map[string]schema.MonitoringState: Map keyed by hostname containing monitoring state for each node
-// - error: Non-nil only for internal errors (individual node failures are captured as MonitoringStateFailed)
-//
-// Example usage:
-//
-//	cluster := "emmy"
-//	nodes := []string{"node001", "node002", "node003"}
-//	expectedMetrics := []string{"load", "mem_used", "cpu_user", "cpu_system"}
-//	healthStates, err := ms.HealthCheck(cluster, nodes, expectedMetrics)
-//	if err != nil {
-//		return err
-//	}
-//	for hostname, state := range healthStates {
-//		fmt.Printf("Node %s: %s\n", hostname, state)
-//	}
-//
-// Note: This routine is optimized for batch operations where you need to check
-// the same set of metrics across multiple nodes.
+// States:
+// - MonitoringStateFull: all expected metrics are healthy
+// - MonitoringStatePartial: some metrics are missing or degraded
+// - MonitoringStateFailed: node not found, or no healthy metrics at all
 func (m *MemoryStore) HealthCheck(cluster string, nodes []string, expectedMetrics []string,
 ) (map[string]schema.MonitoringState, error) {
 	results := make(map[string]schema.MonitoringState, len(nodes))
 
-	// Create a set of expected metrics for fast lookup
-	expectedSet := make(map[string]bool, len(expectedMetrics))
-	for _, metric := range expectedMetrics {
-		expectedSet[metric] = true
-	}
-
-	// Check each node
 	for _, hostname := range nodes {
 		selector := []string{cluster, hostname}
-		status := schema.MonitoringStateFull
-		healthyCount := 0
-		degradedCount := 0
-		missingCount := 0
 
-		// Get healthy and degraded metrics for this node
-		missingList, degradedList, err := m.GetHealthyMetrics(selector, expectedMetrics)
+		degradedList, missingList, err := m.GetHealthyMetrics(selector, expectedMetrics)
 		if err != nil {
-			// Node not found or internal error
 			results[hostname] = schema.MonitoringStateFailed
 			continue
 		}
 
-		missingCount = len(missingList)
-		degradedCount = len(degradedList)
-		uniqueList := mergeList(missingList, degradedList)
-		healthyCount = len(expectedMetrics) - len(uniqueList)
+		degradedCount := len(degradedList)
+		missingCount := len(missingList)
+
+		healthyCount := len(expectedMetrics) - degradedCount - missingCount
 
-		// Debug log missing and degraded metrics
-		if missingCount > 0 {
-			cclog.ComponentDebug("metricstore", "HealthCheck: node", hostname, "missing metrics:", missingList)
-		}
 		if degradedCount > 0 {
-			cclog.ComponentDebug("metricstore", "HealthCheck: node", hostname, "degraded metrics:", degradedList)
+			cclog.ComponentInfo("metricstore", "HealthCheck: node ", hostname, "degraded metrics:", degradedList)
+		}
+		if missingCount > 0 {
+			cclog.ComponentInfo("metricstore", "HealthCheck: node ", hostname, "missing metrics:", missingList)
 		}
 
-		// Determine overall health status
-		if missingCount > 0 || degradedCount > 0 {
-			if healthyCount == 0 {
-				// No healthy metrics at all
-				status = schema.MonitoringStateFailed
-			} else {
-				// Some healthy, some degraded/missing
-				status = schema.MonitoringStatePartial
-			}
+		switch {
+		case degradedCount == 0 && missingCount == 0:
+			results[hostname] = schema.MonitoringStateFull
+		case healthyCount == 0:
+			results[hostname] = schema.MonitoringStateFailed
+		default:
+			results[hostname] = schema.MonitoringStatePartial
 		}
-		// else: all metrics healthy, status remains MonitoringStateFull
-
-		results[hostname] = status
 	}
 
 	return results, nil
diff --git a/pkg/metricstore/metricstore_test.go b/pkg/metricstore/metricstore_test.go
index f96f49a2..4d68d76c 100644
--- a/pkg/metricstore/metricstore_test.go
+++ b/pkg/metricstore/metricstore_test.go
@@ -303,39 +303,39 @@ func TestGetHealthyMetrics(t *testing.T) {
 		name            string
 		selector        []string
 		expectedMetrics []string
-		wantMissing     []string
 		wantDegraded    []string
+		wantMissing     []string
 		wantErr         bool
 	}{
 		{
 			name:            "mixed health states",
 			selector:        []string{"testcluster", "testnode"},
 			expectedMetrics: []string{"load", "mem_used", "cpu_user"},
-			wantMissing:     []string{"cpu_user"},
 			wantDegraded:    []string{"mem_used"},
+			wantMissing:     []string{"cpu_user"},
 			wantErr:         false,
 		},
 		{
 			name:            "node not found",
 			selector:        []string{"testcluster", "nonexistent"},
 			expectedMetrics: []string{"load"},
-			wantMissing:     nil,
 			wantDegraded:    nil,
+			wantMissing:     nil,
 			wantErr:         true,
 		},
 		{
 			name:            "check only healthy metric",
 			selector:        []string{"testcluster", "testnode"},
 			expectedMetrics: []string{"load"},
-			wantMissing:     []string{},
 			wantDegraded:    []string{},
+			wantMissing:     []string{},
 			wantErr:         false,
 		},
 	}
 
 	for _, tt := range tests {
 		t.Run(tt.name, func(t *testing.T) {
-			missing, degraded, err := ms.GetHealthyMetrics(tt.selector, tt.expectedMetrics)
+			degraded, missing, err := ms.GetHealthyMetrics(tt.selector, tt.expectedMetrics)
 
 			if (err != nil) != tt.wantErr {
 				t.Errorf("GetHealthyMetrics() error = %v, wantErr %v", err, tt.wantErr)
@@ -346,17 +346,6 @@ func TestGetHealthyMetrics(t *testing.T) {
 				return
 			}
 
-			// Check missing list
-			if len(missing) != len(tt.wantMissing) {
-				t.Errorf("GetHealthyMetrics() missing = %v, want %v", missing, tt.wantMissing)
-			} else {
-				for i, m := range tt.wantMissing {
-					if missing[i] != m {
-						t.Errorf("GetHealthyMetrics() missing[%d] = %v, want %v", i, missing[i], m)
-					}
-				}
-			}
-
 			// Check degraded list
 			if len(degraded) != len(tt.wantDegraded) {
 				t.Errorf("GetHealthyMetrics() degraded = %v, want %v", degraded, tt.wantDegraded)
@@ -367,6 +356,17 @@ func TestGetHealthyMetrics(t *testing.T) {
 					}
 				}
 			}
+
+			// Check missing list
+			if len(missing) != len(tt.wantMissing) {
+				t.Errorf("GetHealthyMetrics() missing = %v, want %v", missing, tt.wantMissing)
+			} else {
+				for i, m := range tt.wantMissing {
+					if missing[i] != m {
+						t.Errorf("GetHealthyMetrics() missing[%d] = %v, want %v", i, missing[i], m)
+					}
+				}
+			}
 		})
 	}
 }
diff --git a/tools/dataGenerator.sh b/tools/dataGenerator.sh
index 6f488142..338fd190 100644
--- a/tools/dataGenerator.sh
+++ b/tools/dataGenerator.sh
@@ -12,6 +12,7 @@ API_USER="demo" # User for JWT generation # BASE NETWORK CONFIG SERVICE_ADDRESS="http://localhost:8080" NATS_SERVER="nats://0.0.0.0:4222" +REST_URL="${SERVICE_ADDRESS}/api/write" # NATS CREDENTIALS NATS_USER="root"
@@ -27,18 +28,22 @@ JWT_STATIC="eyJhbGciOiJFZERTQSIsInR5cCI6IkpXVCJ9.eyJleHAiOjE3NzU3Nzg4NDQsImlhdCI ALEX_HOSTS="a0603 a0903 a0832 a0329 a0702 a0122 a1624 a0731 a0224 a0704 a0631 a0225 a0222 a0427 a0603 a0429 a0833 a0705 a0901 a0601 a0227 a0804 a0322 a0226 a0126 a0129 a0605 a0801 a0934 a1622 a0902 a0428 a0537 a1623 a1722 a0228 a0701 a0326 a0327 a0123 a0321 a1621 a0323 a0124 a0534 a0931 a0324 a0933 a0424 a0905 a0128 a0532 a0805 a0521 a0535 a0932 a0127 a0325 a0633
a0831 a0803 a0426 a0425 a0229 a1721 a0602 a0632 a0223 a0422 a0423 a0536 a0328 a0703 anvme7 a0125 a0221 a0604 a0802 a0522 a0531 a0533 a0904" FRITZ_HOSTS="f0201 f0202 f0203 f0204 f0205 f0206 f0207 f0208 f0209 f0210 f0211 f0212 f0213 f0214 f0215 f0217 f0218 f0219 f0220 f0221 f0222 f0223 f0224 f0225 f0226 f0227 f0228 f0229 f0230 f0231 f0232 f0233 f0234 f0235 f0236 f0237 f0238 f0239 f0240 f0241 f0242 f0243 f0244 f0245 f0246 f0247 f0248 f0249 f0250 f0251 f0252 f0253 f0254 f0255 f0256 f0257 f0258 f0259 f0260 f0261 f0262 f0263 f0264 f0378" -METRICS_STD="cpu_load cpu_user flops_any cpu_irq cpu_system ipc cpu_idle cpu_iowait core_power clock" -METRICS_NODE="cpu_irq cpu_load mem_cached net_bytes_in cpu_user cpu_idle nfs4_read mem_used nfs4_write nfs4_total ib_xmit ib_xmit_pkts net_bytes_out cpu_iowait ib_recv cpu_system ib_recv_pkts" +ALEX_METRICS_HWTHREAD="cpu_user flops_any clock core_power ipc" +ALEX_METRICS_SOCKET="mem_bw cpu_power" +ALEX_METRICS_ACC="acc_utilization acc_mem_used acc_power nv_mem_util nv_temp nv_sm_clock" +ALEX_METRICS_NODE="cpu_load mem_used net_bytes_in net_bytes_out" + +FRITZ_METRICS_HWTHREAD="cpu_user flops_any flops_sp flops_dp clock ipc vectorization_ratio" +FRITZ_METRICS_SOCKET="mem_bw cpu_power mem_power" +FRITZ_METRICS_NODE="cpu_load mem_used ib_recv ib_xmit ib_recv_pkts ib_xmit_pkts nfs4_read nfs4_total" + ACCEL_IDS="00000000:49:00.0 00000000:0E:00.0 00000000:D1:00.0 00000000:90:00.0 00000000:13:00.0 00000000:96:00.0 00000000:CC:00.0 00000000:4F:00.0" # ========================================== # SETUP ENV (URL & TOKEN) # ========================================== -if [ "$CONNECTION_SCOPE" == "INTERNAL" ]; then - # 1. Set URL for Internal Mode - REST_URL="${SERVICE_ADDRESS}/metricstore/api/write" - +if [ "$CONNECTION_SCOPE" == "INTERNAL" ]; then # 2. Generate JWT dynamically echo "Setup: INTERNAL mode selected." echo "Generating JWT for user: $API_USER" @@ -48,10 +53,7 @@ if [ "$CONNECTION_SCOPE" == "INTERNAL" ]; then echo "Error: Failed to generate JWT from cc-backend." exit 1 fi -else - # 1. Set URL for External Mode - REST_URL="${SERVICE_ADDRESS}/api/write" - +else # 2. Use Static JWT echo "Setup: EXTERNAL mode selected." echo "Using static JWT." @@ -96,7 +98,7 @@ while [ true ]; do # 1. ALEX: HWTHREAD echo "Generating Alex: hwthread" { - for metric in $METRICS_STD; do + for metric in $ALEX_METRICS_HWTHREAD; do for hostname in $ALEX_HOSTS; do for id in {0..127}; do echo "$metric,cluster=alex,hostname=$hostname,type=hwthread,type-id=$id value=$((1 + RANDOM % 100)).0 $timestamp" @@ -109,7 +111,7 @@ while [ true ]; do # 2. FRITZ: HWTHREAD echo "Generating Fritz: hwthread" { - for metric in $METRICS_STD; do + for metric in $FRITZ_METRICS_HWTHREAD; do for hostname in $FRITZ_HOSTS; do for id in {0..71}; do echo "$metric,cluster=fritz,hostname=$hostname,type=hwthread,type-id=$id value=$((1 + RANDOM % 100)).0 $timestamp" @@ -122,7 +124,7 @@ while [ true ]; do # 3. ALEX: ACCELERATOR echo "Generating Alex: accelerator" { - for metric in $METRICS_STD; do + for metric in $ALEX_METRICS_ACC; do for hostname in $ALEX_HOSTS; do for id in $ACCEL_IDS; do echo "$metric,cluster=alex,hostname=$hostname,type=accelerator,type-id=$id value=$((1 + RANDOM % 100)).0 $timestamp" @@ -132,23 +134,10 @@ while [ true ]; do } > sample_alex.txt send_payload "sample_alex.txt" "alex" - # 4. 
ALEX: MEMORY DOMAIN - echo "Generating Alex: memoryDomain" - { - for metric in $METRICS_STD; do - for hostname in $ALEX_HOSTS; do - for id in {0..7}; do - echo "$metric,cluster=alex,hostname=$hostname,type=memoryDomain,type-id=$id value=$((1 + RANDOM % 100)).0 $timestamp" - done - done - done - } > sample_alex.txt - send_payload "sample_alex.txt" "alex" - # 5. ALEX: SOCKET echo "Generating Alex: socket" { - for metric in $METRICS_STD; do + for metric in $ALEX_METRICS_SOCKET; do for hostname in $ALEX_HOSTS; do for id in {0..1}; do echo "$metric,cluster=alex,hostname=$hostname,type=socket,type-id=$id value=$((1 + RANDOM % 100)).0 $timestamp" @@ -161,7 +150,7 @@ while [ true ]; do # 6. FRITZ: SOCKET echo "Generating Fritz: socket" { - for metric in $METRICS_STD; do + for metric in $FRITZ_METRICS_SOCKET; do for hostname in $FRITZ_HOSTS; do for id in {0..1}; do echo "$metric,cluster=fritz,hostname=$hostname,type=socket,type-id=$id value=$((1 + RANDOM % 100)).0 $timestamp" @@ -174,7 +163,7 @@ while [ true ]; do # 7. ALEX: NODE echo "Generating Alex: node" { - for metric in $METRICS_NODE; do + for metric in $ALEX_METRICS_NODE; do for hostname in $ALEX_HOSTS; do echo "$metric,cluster=alex,hostname=$hostname,type=node value=$((1 + RANDOM % 100)).0 $timestamp" done @@ -185,7 +174,7 @@ while [ true ]; do # 8. FRITZ: NODE echo "Generating Fritz: node" { - for metric in $METRICS_NODE; do + for metric in $FRITZ_METRICS_NODE; do for hostname in $FRITZ_HOSTS; do echo "$metric,cluster=fritz,hostname=$hostname,type=node value=$((1 + RANDOM % 100)).0 $timestamp" done diff --git a/web/frontend/src/Job.root.svelte b/web/frontend/src/Job.root.svelte index a393995f..50de27b5 100644 --- a/web/frontend/src/Job.root.svelte +++ b/web/frontend/src/Job.root.svelte @@ -68,12 +68,8 @@ energyFootprint { hardware, metric, value } } `); - const client = getContextClient(); - const ccconfig = getContext("cc-config"); - const showRoofline = !!ccconfig[`jobView_showRoofline`]; - const showStatsTable = !!ccconfig[`jobView_showStatTable`]; - /* Note: Actual metric data queried in Component, only require base infos here -> reduce backend load by requesting just stats */ + const client = getContextClient(); const query = gql` query ($dbid: ID!, $selectedMetrics: [String!]!, $selectedScopes: [MetricScope!]!) { scopedJobStats(id: $dbid, metrics: $selectedMetrics, scopes: $selectedScopes) { @@ -89,25 +85,68 @@ /* State Init */ let plots = $state({}); let isMetricsSelectionOpen = $state(false); - let selectedMetrics = $state([]); - let selectedScopes = $state([]); let totalMetrics = $state(0); - /* Derived */ - const showSummary = $derived((!!ccconfig[`jobView_showFootprint`] || !!ccconfig[`jobView_showPolarPlot`])) + /* Derived Init Return */ + const thisJob = $derived($initq?.data ? $initq.data.job : null); + + /* Derived Settings */ + const globalMetrics = $derived(thisJob ? getContext("globalMetrics") : null); + const clusterInfo = $derived(thisJob ? getContext("clusters") : null); + const ccconfig = $derived(thisJob ? getContext("cc-config") : null); + const showRoofline = $derived(ccconfig ? !!ccconfig[`jobView_showRoofline`] : false); + const showStatsTable = $derived(ccconfig ? !!ccconfig[`jobView_showStatTable`] : false); + const showSummary = $derived(ccconfig ? 
(!!ccconfig[`jobView_showFootprint`] || !!ccconfig[`jobView_showPolarPlot`]) : false) + + /* Derived Var Preprocessing*/ + let selectedMetrics = $derived.by(() => { + if(thisJob && ccconfig) { + if (thisJob.cluster) { + if (thisJob.subCluster) { + return ccconfig[`metricConfig_jobViewPlotMetrics:${thisJob.cluster}:${thisJob.subCluster}`] || + ccconfig[`metricConfig_jobViewPlotMetrics:${thisJob.cluster}`] || + ccconfig.metricConfig_jobViewPlotMetrics + } + return ccconfig[`metricConfig_jobViewPlotMetrics:${thisJob.cluster}`] || + ccconfig.metricConfig_jobViewPlotMetrics + } + return ccconfig.metricConfig_jobViewPlotMetrics + } + return []; + }); + + let selectedScopes = $derived.by(() => { + const pendingScopes = ["node"] + if (thisJob) { + const accScopeDefault = [...selectedMetrics].some(function (m) { + const thisCluster = clusterInfo.find((c) => c.name == thisJob.cluster); + const subCluster = thisCluster.subClusters.find((sc) => sc.name == thisJob.subCluster); + return subCluster.metricConfig.find((smc) => smc.name == m)?.scope === "accelerator"; + }); + + + if (accScopeDefault) pendingScopes.push("accelerator") + if (thisJob.numNodes === 1) { + pendingScopes.push("socket") + pendingScopes.push("core") + } + } + return[...new Set(pendingScopes)]; + }); + + /* Derived Query and Postprocessing*/ const jobMetrics = $derived(queryStore({ client: client, query: query, variables: { dbid, selectedMetrics, selectedScopes }, }) ); - + const missingMetrics = $derived.by(() => { - if ($initq?.data && $jobMetrics?.data) { - let job = $initq.data.job; + if (thisJob && $jobMetrics?.data) { let metrics = $jobMetrics.data.scopedJobStats; - let metricNames = $initq.data.globalMetrics.reduce((names, gm) => { - if (gm.availability.find((av) => av.cluster === job.cluster)) { + let metricNames = globalMetrics.reduce((names, gm) => { + if (gm.availability.find((av) => av.cluster === thisJob.cluster)) { names.push(gm.name); } return names; @@ -118,9 +157,10 @@ !metrics.some((jm) => jm.name == metric) && selectedMetrics.includes(metric) && !checkMetricDisabled( + globalMetrics, metric, - $initq.data.job.cluster, - $initq.data.job.subCluster, + thisJob.cluster, + thisJob.subCluster, ), ); } else { @@ -129,17 +169,16 @@ }); const missingHosts = $derived.by(() => { - if ($initq?.data && $jobMetrics?.data) { - let job = $initq.data.job; + if (thisJob && $jobMetrics?.data) { let metrics = $jobMetrics.data.scopedJobStats; - let metricNames = $initq.data.globalMetrics.reduce((names, gm) => { - if (gm.availability.find((av) => av.cluster === job.cluster)) { + let metricNames = globalMetrics.reduce((names, gm) => { + if (gm.availability.find((av) => av.cluster === thisJob.cluster)) { names.push(gm.name); } return names; }, []); - return job.resources + return thisJob.resources .map(({ hostname }) => ({ hostname: hostname, metrics: metricNames.filter( @@ -165,51 +204,19 @@ ? "Loading..." : $initq?.error ? 
"Error" - : `Job ${$initq.data.job.jobId} - ClusterCockpit`; - }); - - /* On Init */ - getContext("on-init")(() => { - let job = $initq.data.job; - if (!job) return; - const pendingMetrics = ( - ccconfig[`metricConfig_jobViewPlotMetrics:${job.cluster}:${job.subCluster}`] || - ccconfig[`metricConfig_jobViewPlotMetrics:${job.cluster}`] - ) || - $initq.data.globalMetrics.reduce((names, gm) => { - if (gm.availability.find((av) => av.cluster === job.cluster && av.subClusters.includes(job.subCluster))) { - names.push(gm.name); - } - return names; - }, []) - - // Select default Scopes to load: Check before if any metric has accelerator scope by default - const accScopeDefault = [...pendingMetrics].some(function (m) { - const cluster = $initq.data.clusters.find((c) => c.name == job.cluster); - const subCluster = cluster.subClusters.find((sc) => sc.name == job.subCluster); - return subCluster.metricConfig.find((smc) => smc.name == m)?.scope === "accelerator"; - }); - - const pendingScopes = ["node"] - if (accScopeDefault) pendingScopes.push("accelerator") - if (job.numNodes === 1) { - pendingScopes.push("socket") - pendingScopes.push("core") - } - - selectedMetrics = [...new Set(pendingMetrics)]; - selectedScopes = [...new Set(pendingScopes)]; + : `Job ${thisJob.jobId} - ClusterCockpit`; }); /* Functions */ - const orderAndMap = (grouped, selectedMetrics) => - selectedMetrics.map((metric) => ({ + const orderAndMap = (grouped, inputMetrics) => + inputMetrics.map((metric) => ({ metric: metric, data: grouped.find((group) => group[0].name == metric), disabled: checkMetricDisabled( + globalMetrics, metric, - $initq.data.job.cluster, - $initq.data.job.subCluster, + thisJob.cluster, + thisJob.subCluster, ), })); @@ -219,34 +226,34 @@ {#if $initq.error} {$initq.error.message} - {:else if $initq?.data} + {:else if thisJob} - {#if $initq.data?.job?.metaData?.message} + {#if thisJob?.metaData?.message} -
Job {$initq.data?.job?.jobId} ({$initq.data?.job?.cluster})
+
Job {thisJob?.jobId} ({thisJob?.cluster})
The following note was added by administrators:
- {@html $initq.data.job.metaData.message} + {@html thisJob.metaData.message}
{/if} - + - + - {#if $initq.data.job.concurrentJobs != null && $initq.data.job.concurrentJobs.items.length != 0} + {#if thisJob.concurrentJobs != null && thisJob.concurrentJobs.items.length != 0} - {$initq.data.job.concurrentJobs.items.length} Concurrent Jobs + {thisJob.concurrentJobs.items.length} Concurrent Jobs - roles.manager)}/> + roles.manager)}/> {/if} @@ -261,9 +268,9 @@ {#if $initq.error} {$initq.error.message} - {:else if $initq?.data} + {:else if thisJob} {#if showSummary} - + {/if} {:else} @@ -274,9 +281,9 @@ {#if $initq.error} {$initq.error.message} - {:else if $initq?.data} + {:else if thisJob} {#if showRoofline} - + {/if} {:else} @@ -285,10 +292,10 @@ -{#if $initq?.data && $initq.data.job.energyFootprint.length != 0} +{#if thisJob && thisJob?.energyFootprint?.length != 0} - + {/if} @@ -297,7 +304,7 @@ - {#if $initq?.data} + {#if thisJob} {#if job.numNodes > 1 && job.state === "running"} -