Files
cc-backend/internal/metricstoreclient/cc-metric-store-queries.go
2026-03-04 16:43:05 +01:00

258 lines
7.8 KiB
Go

// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
// Package metricstoreclient - Query Building
//
// This file contains the query construction and scope transformation logic for cc-metric-store queries.
// It handles the complex mapping between requested metric scopes and native hardware topology,
// automatically aggregating or filtering metrics as needed.
//
// # Scope Transformations
//
// The buildScopeQueries function implements the core scope transformation algorithm.
// It handles 25+ different transformation cases, mapping between:
// - Accelerator (GPU) scope
// - HWThread (hardware thread/SMT) scope
// - Core (CPU core) scope
// - Socket (CPU package) scope
// - MemoryDomain (NUMA domain) scope
// - Node (full system) scope
//
// Transformations follow these rules:
// - Same scope: Return data as-is (e.g., Core → Core)
// - Coarser scope: Aggregate data (e.g., Core → Socket with Aggregate=true)
// - Finer scope: Error - cannot increase granularity
//
// # Query Building
//
// buildQueries and buildNodeQueries are the main entry points, handling job-specific
// and node-specific query construction respectively. They:
// - Validate metric configurations
// - Handle subcluster-specific metric filtering
// - Detect and skip duplicate scope requests
// - Call buildScopeQueries for each metric/scope/host combination
package metricstoreclient
import (
"fmt"
"github.com/ClusterCockpit/cc-backend/pkg/archive"
"github.com/ClusterCockpit/cc-backend/pkg/metricstore"
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
"github.com/ClusterCockpit/cc-lib/v2/schema"
)
// buildQueries constructs API queries for job-specific metric data.
// It iterates through metrics, scopes, and job resources to build the complete query set.
//
// The function handles:
// - Metric configuration validation and subcluster filtering
// - Scope deduplication to avoid redundant queries
// - Hardware thread list resolution (job-allocated vs full node)
// - Delegation to buildScopeQueries for scope transformations
//
// Returns queries and their corresponding assigned scopes (which may differ from requested scopes).
func (ccms *CCMetricStore) buildQueries(
job *schema.Job,
metrics []string,
scopes []schema.MetricScope,
resolution int,
) ([]APIQuery, []schema.MetricScope, error) {
// Initialize both slices together
queries := make([]APIQuery, 0, len(metrics)*len(scopes)*len(job.Resources))
assignedScope := make([]schema.MetricScope, 0, len(metrics)*len(scopes)*len(job.Resources))
topology, err := ccms.getTopology(job.Cluster, job.SubCluster)
if err != nil {
cclog.Errorf("could not load cluster %s subCluster %s topology: %s", job.Cluster, job.SubCluster, err.Error())
return nil, nil, err
}
for _, metric := range metrics {
remoteName := metric
mc := archive.GetMetricConfig(job.Cluster, metric)
if mc == nil {
cclog.Warnf("metric '%s' is not specified for cluster '%s' - skipping", metric, job.Cluster)
continue
}
// Skip if metric is removed for subcluster
if len(mc.SubClusters) != 0 {
isRemoved := false
for _, scConfig := range mc.SubClusters {
if scConfig.Name == job.SubCluster && scConfig.Remove {
isRemoved = true
break
}
}
if isRemoved {
continue
}
}
// Avoid duplicates...
handledScopes := make([]schema.MetricScope, 0, 3)
scopesLoop:
for _, requestedScope := range scopes {
nativeScope := mc.Scope
if nativeScope == schema.MetricScopeAccelerator && job.NumAcc == 0 {
continue
}
scope := nativeScope.Max(requestedScope)
for _, s := range handledScopes {
if scope == s {
continue scopesLoop
}
}
handledScopes = append(handledScopes, scope)
for _, host := range job.Resources {
hwthreads := host.HWThreads
if hwthreads == nil {
hwthreads = topology.Node
}
scopeResults, ok := metricstore.BuildScopeQueries(
nativeScope, requestedScope,
remoteName, host.Hostname,
topology, hwthreads, host.Accelerators,
)
if !ok {
return nil, nil, fmt.Errorf("METRICDATA/EXTERNAL-CCMS > TODO: unhandled case: native-scope=%s, requested-scope=%s", nativeScope, requestedScope)
}
for _, sr := range scopeResults {
queries = append(queries, APIQuery{
Metric: sr.Metric,
Hostname: sr.Hostname,
Aggregate: sr.Aggregate,
Type: sr.Type,
TypeIds: sr.TypeIds,
Resolution: resolution,
})
assignedScope = append(assignedScope, sr.Scope)
}
}
}
}
return queries, assignedScope, nil
}
// buildNodeQueries constructs API queries for node-specific metric data (Systems View).
// Similar to buildQueries but uses full node topology instead of job-allocated resources.
//
// The function handles:
// - SubCluster topology resolution (either pre-loaded or per-node lookup)
// - Full node hardware thread lists (not job-specific subsets)
// - All accelerators on each node
// - Metric configuration validation with subcluster filtering
//
// Returns queries and their corresponding assigned scopes.
func (ccms *CCMetricStore) buildNodeQueries(
cluster string,
subCluster string,
nodes []string,
metrics []string,
scopes []schema.MetricScope,
resolution int,
) ([]APIQuery, []schema.MetricScope, error) {
// Initialize both slices together
queries := make([]APIQuery, 0, len(metrics)*len(scopes)*len(nodes))
assignedScope := make([]schema.MetricScope, 0, len(metrics)*len(scopes)*len(nodes))
for _, metric := range metrics {
remoteName := metric
mc := archive.GetMetricConfig(cluster, metric)
if mc == nil {
cclog.Warnf("metric '%s' is not specified for cluster '%s'", metric, cluster)
continue
}
// Skip if metric is removed for subcluster
if mc.SubClusters != nil {
isRemoved := false
for _, scConfig := range mc.SubClusters {
if scConfig.Name == subCluster && scConfig.Remove {
isRemoved = true
break
}
}
if isRemoved {
continue
}
}
// Avoid duplicates...
handledScopes := make([]schema.MetricScope, 0, 3)
scopesLoop:
for _, requestedScope := range scopes {
nativeScope := mc.Scope
scope := nativeScope.Max(requestedScope)
for _, s := range handledScopes {
if scope == s {
continue scopesLoop
}
}
handledScopes = append(handledScopes, scope)
for _, hostname := range nodes {
var topology *schema.Topology
var err error
// If no subCluster given, get it by node
if subCluster == "" {
topology, err = ccms.getTopologyByNode(cluster, hostname)
} else {
topology, err = ccms.getTopology(cluster, subCluster)
}
if err != nil {
return nil, nil, err
}
// Always full node hwthread id list, no partial queries expected -> Use "topology.Node" directly where applicable
// Always full accelerator id list, no partial queries expected -> Use "acceleratorIds" directly where applicable
acceleratorIds := topology.GetAcceleratorIDs()
// Moved check here if metric matches hardware specs
if nativeScope == schema.MetricScopeAccelerator && len(acceleratorIds) == 0 {
continue scopesLoop
}
scopeResults, ok := metricstore.BuildScopeQueries(
nativeScope, requestedScope,
remoteName, hostname,
topology, topology.Node, acceleratorIds,
)
if !ok {
return nil, nil, fmt.Errorf("METRICDATA/EXTERNAL-CCMS > TODO: unhandled case: native-scope=%s, requested-scope=%s", nativeScope, requestedScope)
}
for _, sr := range scopeResults {
queries = append(queries, APIQuery{
Metric: sr.Metric,
Hostname: sr.Hostname,
Aggregate: sr.Aggregate,
Type: sr.Type,
TypeIds: sr.TypeIds,
Resolution: resolution,
})
assignedScope = append(assignedScope, sr.Scope)
}
}
}
}
return queries, assignedScope, nil
}