mirror of
https://github.com/ClusterCockpit/cc-backend
synced 2026-03-05 07:37:30 +01:00
258 lines
7.8 KiB
Go
258 lines
7.8 KiB
Go
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
|
// All rights reserved.
|
|
// Use of this source code is governed by a MIT-style
|
|
// license that can be found in the LICENSE file.
|
|
|
|
// Package metricstoreclient - Query Building
|
|
//
|
|
// This file contains the query construction and scope transformation logic for cc-metric-store queries.
|
|
// It handles the complex mapping between requested metric scopes and native hardware topology,
|
|
// automatically aggregating or filtering metrics as needed.
|
|
//
|
|
// # Scope Transformations
|
|
//
|
|
// The buildScopeQueries function implements the core scope transformation algorithm.
|
|
// It handles 25+ different transformation cases, mapping between:
|
|
// - Accelerator (GPU) scope
|
|
// - HWThread (hardware thread/SMT) scope
|
|
// - Core (CPU core) scope
|
|
// - Socket (CPU package) scope
|
|
// - MemoryDomain (NUMA domain) scope
|
|
// - Node (full system) scope
|
|
//
|
|
// Transformations follow these rules:
|
|
// - Same scope: Return data as-is (e.g., Core → Core)
|
|
// - Coarser scope: Aggregate data (e.g., Core → Socket with Aggregate=true)
|
|
// - Finer scope: Error - cannot increase granularity
|
|
//
|
|
// # Query Building
|
|
//
|
|
// buildQueries and buildNodeQueries are the main entry points, handling job-specific
|
|
// and node-specific query construction respectively. They:
|
|
// - Validate metric configurations
|
|
// - Handle subcluster-specific metric filtering
|
|
// - Detect and skip duplicate scope requests
|
|
// - Call buildScopeQueries for each metric/scope/host combination
|
|
package metricstoreclient
|
|
|
|
import (
|
|
"fmt"
|
|
|
|
"github.com/ClusterCockpit/cc-backend/pkg/archive"
|
|
"github.com/ClusterCockpit/cc-backend/pkg/metricstore"
|
|
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
|
|
"github.com/ClusterCockpit/cc-lib/v2/schema"
|
|
)
|
|
|
|
// buildQueries constructs API queries for job-specific metric data.
|
|
// It iterates through metrics, scopes, and job resources to build the complete query set.
|
|
//
|
|
// The function handles:
|
|
// - Metric configuration validation and subcluster filtering
|
|
// - Scope deduplication to avoid redundant queries
|
|
// - Hardware thread list resolution (job-allocated vs full node)
|
|
// - Delegation to buildScopeQueries for scope transformations
|
|
//
|
|
// Returns queries and their corresponding assigned scopes (which may differ from requested scopes).
|
|
func (ccms *CCMetricStore) buildQueries(
|
|
job *schema.Job,
|
|
metrics []string,
|
|
scopes []schema.MetricScope,
|
|
resolution int,
|
|
) ([]APIQuery, []schema.MetricScope, error) {
|
|
// Initialize both slices together
|
|
queries := make([]APIQuery, 0, len(metrics)*len(scopes)*len(job.Resources))
|
|
assignedScope := make([]schema.MetricScope, 0, len(metrics)*len(scopes)*len(job.Resources))
|
|
|
|
topology, err := ccms.getTopology(job.Cluster, job.SubCluster)
|
|
if err != nil {
|
|
cclog.Errorf("could not load cluster %s subCluster %s topology: %s", job.Cluster, job.SubCluster, err.Error())
|
|
return nil, nil, err
|
|
}
|
|
|
|
for _, metric := range metrics {
|
|
remoteName := metric
|
|
mc := archive.GetMetricConfig(job.Cluster, metric)
|
|
if mc == nil {
|
|
cclog.Warnf("metric '%s' is not specified for cluster '%s' - skipping", metric, job.Cluster)
|
|
continue
|
|
}
|
|
|
|
// Skip if metric is removed for subcluster
|
|
if len(mc.SubClusters) != 0 {
|
|
isRemoved := false
|
|
for _, scConfig := range mc.SubClusters {
|
|
if scConfig.Name == job.SubCluster && scConfig.Remove {
|
|
isRemoved = true
|
|
break
|
|
}
|
|
}
|
|
if isRemoved {
|
|
continue
|
|
}
|
|
}
|
|
|
|
// Avoid duplicates...
|
|
handledScopes := make([]schema.MetricScope, 0, 3)
|
|
|
|
scopesLoop:
|
|
for _, requestedScope := range scopes {
|
|
nativeScope := mc.Scope
|
|
if nativeScope == schema.MetricScopeAccelerator && job.NumAcc == 0 {
|
|
continue
|
|
}
|
|
|
|
scope := nativeScope.Max(requestedScope)
|
|
for _, s := range handledScopes {
|
|
if scope == s {
|
|
continue scopesLoop
|
|
}
|
|
}
|
|
handledScopes = append(handledScopes, scope)
|
|
|
|
for _, host := range job.Resources {
|
|
hwthreads := host.HWThreads
|
|
if hwthreads == nil {
|
|
hwthreads = topology.Node
|
|
}
|
|
|
|
scopeResults, ok := metricstore.BuildScopeQueries(
|
|
nativeScope, requestedScope,
|
|
remoteName, host.Hostname,
|
|
topology, hwthreads, host.Accelerators,
|
|
)
|
|
|
|
if !ok {
|
|
return nil, nil, fmt.Errorf("METRICDATA/EXTERNAL-CCMS > TODO: unhandled case: native-scope=%s, requested-scope=%s", nativeScope, requestedScope)
|
|
}
|
|
|
|
for _, sr := range scopeResults {
|
|
queries = append(queries, APIQuery{
|
|
Metric: sr.Metric,
|
|
Hostname: sr.Hostname,
|
|
Aggregate: sr.Aggregate,
|
|
Type: sr.Type,
|
|
TypeIds: sr.TypeIds,
|
|
Resolution: resolution,
|
|
})
|
|
assignedScope = append(assignedScope, sr.Scope)
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
return queries, assignedScope, nil
|
|
}
|
|
|
|
// buildNodeQueries constructs API queries for node-specific metric data (Systems View).
|
|
// Similar to buildQueries but uses full node topology instead of job-allocated resources.
|
|
//
|
|
// The function handles:
|
|
// - SubCluster topology resolution (either pre-loaded or per-node lookup)
|
|
// - Full node hardware thread lists (not job-specific subsets)
|
|
// - All accelerators on each node
|
|
// - Metric configuration validation with subcluster filtering
|
|
//
|
|
// Returns queries and their corresponding assigned scopes.
|
|
func (ccms *CCMetricStore) buildNodeQueries(
|
|
cluster string,
|
|
subCluster string,
|
|
nodes []string,
|
|
metrics []string,
|
|
scopes []schema.MetricScope,
|
|
resolution int,
|
|
) ([]APIQuery, []schema.MetricScope, error) {
|
|
// Initialize both slices together
|
|
queries := make([]APIQuery, 0, len(metrics)*len(scopes)*len(nodes))
|
|
assignedScope := make([]schema.MetricScope, 0, len(metrics)*len(scopes)*len(nodes))
|
|
|
|
for _, metric := range metrics {
|
|
remoteName := metric
|
|
mc := archive.GetMetricConfig(cluster, metric)
|
|
if mc == nil {
|
|
cclog.Warnf("metric '%s' is not specified for cluster '%s'", metric, cluster)
|
|
continue
|
|
}
|
|
|
|
// Skip if metric is removed for subcluster
|
|
if mc.SubClusters != nil {
|
|
isRemoved := false
|
|
for _, scConfig := range mc.SubClusters {
|
|
if scConfig.Name == subCluster && scConfig.Remove {
|
|
isRemoved = true
|
|
break
|
|
}
|
|
}
|
|
if isRemoved {
|
|
continue
|
|
}
|
|
}
|
|
|
|
// Avoid duplicates...
|
|
handledScopes := make([]schema.MetricScope, 0, 3)
|
|
|
|
scopesLoop:
|
|
for _, requestedScope := range scopes {
|
|
nativeScope := mc.Scope
|
|
|
|
scope := nativeScope.Max(requestedScope)
|
|
for _, s := range handledScopes {
|
|
if scope == s {
|
|
continue scopesLoop
|
|
}
|
|
}
|
|
handledScopes = append(handledScopes, scope)
|
|
|
|
for _, hostname := range nodes {
|
|
var topology *schema.Topology
|
|
var err error
|
|
|
|
// If no subCluster given, get it by node
|
|
if subCluster == "" {
|
|
topology, err = ccms.getTopologyByNode(cluster, hostname)
|
|
} else {
|
|
topology, err = ccms.getTopology(cluster, subCluster)
|
|
}
|
|
|
|
if err != nil {
|
|
return nil, nil, err
|
|
}
|
|
|
|
// Always full node hwthread id list, no partial queries expected -> Use "topology.Node" directly where applicable
|
|
// Always full accelerator id list, no partial queries expected -> Use "acceleratorIds" directly where applicable
|
|
acceleratorIds := topology.GetAcceleratorIDs()
|
|
|
|
// Moved check here if metric matches hardware specs
|
|
if nativeScope == schema.MetricScopeAccelerator && len(acceleratorIds) == 0 {
|
|
continue scopesLoop
|
|
}
|
|
|
|
scopeResults, ok := metricstore.BuildScopeQueries(
|
|
nativeScope, requestedScope,
|
|
remoteName, hostname,
|
|
topology, topology.Node, acceleratorIds,
|
|
)
|
|
|
|
if !ok {
|
|
return nil, nil, fmt.Errorf("METRICDATA/EXTERNAL-CCMS > TODO: unhandled case: native-scope=%s, requested-scope=%s", nativeScope, requestedScope)
|
|
}
|
|
|
|
for _, sr := range scopeResults {
|
|
queries = append(queries, APIQuery{
|
|
Metric: sr.Metric,
|
|
Hostname: sr.Hostname,
|
|
Aggregate: sr.Aggregate,
|
|
Type: sr.Type,
|
|
TypeIds: sr.TypeIds,
|
|
Resolution: resolution,
|
|
})
|
|
assignedScope = append(assignedScope, sr.Scope)
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
return queries, assignedScope, nil
|
|
}
|
|
|