mirror of
https://github.com/ClusterCockpit/cc-backend
synced 2026-01-16 17:51:46 +01:00
1249 lines
37 KiB
Go
1249 lines
37 KiB
Go
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
|
// All rights reserved. This file is part of cc-backend.
|
|
// Use of this source code is governed by a MIT-style
|
|
// license that can be found in the LICENSE file.
|
|
|
|
// This file implements high-level query functions for loading job metric data
|
|
// with automatic scope transformation and aggregation.
|
|
//
|
|
// Key Concepts:
|
|
//
|
|
// Metric Scopes: Metrics are collected at different granularities (native scope):
|
|
// - HWThread: Per hardware thread
|
|
// - Core: Per CPU core
|
|
// - Socket: Per CPU socket
|
|
// - MemoryDomain: Per memory domain (NUMA)
|
|
// - Accelerator: Per GPU/accelerator
|
|
// - Node: Per compute node
|
|
//
|
|
// Scope Transformation: The buildQueries functions transform between native scope
|
|
// and requested scope by:
|
|
// - Aggregating finer-grained data (e.g., HWThread → Core → Socket → Node)
|
|
// - Rejecting requests for finer granularity than available
|
|
// - Handling special cases (e.g., Accelerator metrics)
|
|
//
|
|
// Query Building: Constructs APIQuery structures with proper selectors (Type, TypeIds)
|
|
// based on cluster topology and job resources.
|
|
package metricstore
|
|
|
|
import (
|
|
"context"
|
|
"fmt"
|
|
"strconv"
|
|
"strings"
|
|
"time"
|
|
|
|
"github.com/ClusterCockpit/cc-backend/pkg/archive"
|
|
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
|
|
"github.com/ClusterCockpit/cc-lib/v2/schema"
|
|
)
|
|
|
|
// TestLoadDataCallback allows tests to override LoadData behavior for testing purposes.
|
|
// When set to a non-nil function, LoadData will call this function instead of the default implementation.
|
|
var TestLoadDataCallback func(job *schema.Job, metrics []string, scopes []schema.MetricScope, ctx context.Context, resolution int) (schema.JobData, error)
|
|
|
|
// LoadData loads metric data for a specific job with automatic scope transformation.
|
|
//
|
|
// This is the primary function for retrieving job metric data. It handles:
|
|
// - Building queries with scope transformation via buildQueries
|
|
// - Fetching data from the metric store
|
|
// - Organizing results by metric and scope
|
|
// - Converting NaN statistics to 0 for JSON compatibility
|
|
// - Partial error handling (returns data for successful queries even if some fail)
|
|
//
|
|
// Parameters:
|
|
// - job: Job metadata including cluster, resources, and time range
|
|
// - metrics: List of metric names to load
|
|
// - scopes: Requested metric scopes (will be transformed to match native scopes)
|
|
// - ctx: Context for cancellation (currently unused but reserved for future use)
|
|
// - resolution: Data resolution in seconds (0 for native resolution)
|
|
//
|
|
// Returns:
|
|
// - JobData: Map of metric → scope → JobMetric with time-series data and statistics
|
|
// - Error: Returns error if query building or fetching fails, or partial error listing failed hosts
|
|
//
|
|
// Example:
|
|
//
|
|
// jobData, err := LoadData(job, []string{"cpu_load", "mem_used"}, []schema.MetricScope{schema.MetricScopeNode}, ctx, 60)
|
|
func LoadData(
|
|
job *schema.Job,
|
|
metrics []string,
|
|
scopes []schema.MetricScope,
|
|
ctx context.Context,
|
|
resolution int,
|
|
) (schema.JobData, error) {
|
|
if TestLoadDataCallback != nil {
|
|
return TestLoadDataCallback(job, metrics, scopes, ctx, resolution)
|
|
}
|
|
|
|
queries, assignedScope, err := buildQueries(job, metrics, scopes, int64(resolution))
|
|
if err != nil {
|
|
cclog.Errorf("Error while building queries for jobId %d, Metrics %v, Scopes %v: %s", job.JobID, metrics, scopes, err.Error())
|
|
return nil, err
|
|
}
|
|
|
|
req := APIQueryRequest{
|
|
Cluster: job.Cluster,
|
|
From: job.StartTime,
|
|
To: job.StartTime + int64(job.Duration),
|
|
Queries: queries,
|
|
WithStats: true,
|
|
WithData: true,
|
|
}
|
|
|
|
resBody, err := FetchData(req)
|
|
if err != nil {
|
|
cclog.Errorf("Error while fetching data : %s", err.Error())
|
|
return nil, err
|
|
}
|
|
|
|
var errors []string
|
|
jobData := make(schema.JobData)
|
|
for i, row := range resBody.Results {
|
|
query := req.Queries[i]
|
|
metric := query.Metric
|
|
scope := assignedScope[i]
|
|
mc := archive.GetMetricConfig(job.Cluster, metric)
|
|
if _, ok := jobData[metric]; !ok {
|
|
jobData[metric] = make(map[schema.MetricScope]*schema.JobMetric)
|
|
}
|
|
|
|
res := mc.Timestep
|
|
if len(row) > 0 {
|
|
res = int(row[0].Resolution)
|
|
}
|
|
|
|
jobMetric, ok := jobData[metric][scope]
|
|
if !ok {
|
|
jobMetric = &schema.JobMetric{
|
|
Unit: mc.Unit,
|
|
Timestep: res,
|
|
Series: make([]schema.Series, 0),
|
|
}
|
|
jobData[metric][scope] = jobMetric
|
|
}
|
|
|
|
for ndx, res := range row {
|
|
if res.Error != nil {
|
|
/* Build list for "partial errors", if any */
|
|
errors = append(errors, fmt.Sprintf("failed to fetch '%s' from host '%s': %s", query.Metric, query.Hostname, *res.Error))
|
|
continue
|
|
}
|
|
|
|
id := (*string)(nil)
|
|
if query.Type != nil {
|
|
id = new(string)
|
|
*id = query.TypeIds[ndx]
|
|
}
|
|
|
|
sanitizeStats(&res)
|
|
|
|
jobMetric.Series = append(jobMetric.Series, schema.Series{
|
|
Hostname: query.Hostname,
|
|
Id: id,
|
|
Statistics: schema.MetricStatistics{
|
|
Avg: float64(res.Avg),
|
|
Min: float64(res.Min),
|
|
Max: float64(res.Max),
|
|
},
|
|
Data: res.Data,
|
|
})
|
|
}
|
|
|
|
// So that one can later check len(jobData):
|
|
if len(jobMetric.Series) == 0 {
|
|
delete(jobData[metric], scope)
|
|
if len(jobData[metric]) == 0 {
|
|
delete(jobData, metric)
|
|
}
|
|
}
|
|
}
|
|
|
|
if len(errors) != 0 {
|
|
/* Returns list for "partial errors" */
|
|
return jobData, fmt.Errorf("METRICDATA/CCMS > Errors: %s", strings.Join(errors, ", "))
|
|
}
|
|
return jobData, nil
|
|
}
|
|
|
|
// Pre-converted scope strings avoid repeated string(MetricScope) allocations during
|
|
// query construction. These are used in APIQuery.Type field throughout buildQueries
|
|
// and buildNodeQueries functions. Converting once at package initialization improves
|
|
// performance for high-volume query building.
|
|
var (
|
|
hwthreadString = string(schema.MetricScopeHWThread)
|
|
coreString = string(schema.MetricScopeCore)
|
|
memoryDomainString = string(schema.MetricScopeMemoryDomain)
|
|
socketString = string(schema.MetricScopeSocket)
|
|
acceleratorString = string(schema.MetricScopeAccelerator)
|
|
)
|
|
|
|
// buildQueries constructs APIQuery structures with automatic scope transformation for a job.
|
|
//
|
|
// This function implements the core scope transformation logic, handling all combinations of
|
|
// native metric scopes and requested scopes. It uses the cluster topology to determine which
|
|
// hardware IDs to include in each query.
|
|
//
|
|
// Scope Transformation Rules:
|
|
// - If native scope >= requested scope: Aggregates data (Aggregate=true in APIQuery)
|
|
// - If native scope < requested scope: Returns error (cannot increase granularity)
|
|
// - Special handling for Accelerator scope (independent of CPU hierarchy)
|
|
//
|
|
// The function generates one or more APIQuery per (metric, scope, host) combination:
|
|
// - For non-aggregated queries: One query with all relevant IDs
|
|
// - For aggregated queries: May generate multiple queries (e.g., one per socket/core)
|
|
//
|
|
// Parameters:
|
|
// - job: Job metadata including cluster, subcluster, and resource allocation
|
|
// - metrics: List of metrics to query
|
|
// - scopes: Requested scopes for each metric
|
|
// - resolution: Data resolution in seconds
|
|
//
|
|
// Returns:
|
|
// - []APIQuery: List of queries to execute
|
|
// - []schema.MetricScope: Assigned scope for each query (after transformation)
|
|
// - error: Returns error if topology lookup fails or unhandled scope combination encountered
|
|
func buildQueries(
|
|
job *schema.Job,
|
|
metrics []string,
|
|
scopes []schema.MetricScope,
|
|
resolution int64,
|
|
) ([]APIQuery, []schema.MetricScope, error) {
|
|
if len(job.Resources) == 0 {
|
|
return nil, nil, fmt.Errorf("METRICDATA/CCMS > no resources allocated for job %d", job.JobID)
|
|
}
|
|
|
|
queries := make([]APIQuery, 0, len(metrics)*len(scopes)*len(job.Resources))
|
|
assignedScope := []schema.MetricScope{}
|
|
|
|
subcluster, scerr := archive.GetSubCluster(job.Cluster, job.SubCluster)
|
|
if scerr != nil {
|
|
return nil, nil, scerr
|
|
}
|
|
topology := subcluster.Topology
|
|
|
|
for _, metric := range metrics {
|
|
mc := archive.GetMetricConfig(job.Cluster, metric)
|
|
if mc == nil {
|
|
cclog.Infof("metric '%s' is not specified for cluster '%s'", metric, job.Cluster)
|
|
continue
|
|
}
|
|
|
|
// Skip if metric is removed for subcluster
|
|
if len(mc.SubClusters) != 0 {
|
|
isRemoved := false
|
|
for _, scConfig := range mc.SubClusters {
|
|
if scConfig.Name == job.SubCluster && scConfig.Remove {
|
|
isRemoved = true
|
|
break
|
|
}
|
|
}
|
|
if isRemoved {
|
|
continue
|
|
}
|
|
}
|
|
|
|
// Avoid duplicates using map for O(1) lookup
|
|
handledScopes := make(map[schema.MetricScope]bool, 3)
|
|
|
|
for _, requestedScope := range scopes {
|
|
nativeScope := mc.Scope
|
|
if nativeScope == schema.MetricScopeAccelerator && job.NumAcc == 0 {
|
|
continue
|
|
}
|
|
|
|
scope := nativeScope.Max(requestedScope)
|
|
if handledScopes[scope] {
|
|
continue
|
|
}
|
|
handledScopes[scope] = true
|
|
|
|
for _, host := range job.Resources {
|
|
hwthreads := host.HWThreads
|
|
if hwthreads == nil {
|
|
hwthreads = topology.Node
|
|
}
|
|
|
|
// Accelerator -> Accelerator (Use "accelerator" scope if requested scope is lower than node)
|
|
if nativeScope == schema.MetricScopeAccelerator && scope.LT(schema.MetricScopeNode) {
|
|
if scope != schema.MetricScopeAccelerator {
|
|
// Skip all other catched cases
|
|
continue
|
|
}
|
|
|
|
queries = append(queries, APIQuery{
|
|
Metric: metric,
|
|
Hostname: host.Hostname,
|
|
Aggregate: false,
|
|
Type: &acceleratorString,
|
|
TypeIds: host.Accelerators,
|
|
Resolution: resolution,
|
|
})
|
|
assignedScope = append(assignedScope, schema.MetricScopeAccelerator)
|
|
continue
|
|
}
|
|
|
|
// Accelerator -> Node
|
|
if nativeScope == schema.MetricScopeAccelerator && scope == schema.MetricScopeNode {
|
|
if len(host.Accelerators) == 0 {
|
|
continue
|
|
}
|
|
|
|
queries = append(queries, APIQuery{
|
|
Metric: metric,
|
|
Hostname: host.Hostname,
|
|
Aggregate: true,
|
|
Type: &acceleratorString,
|
|
TypeIds: host.Accelerators,
|
|
Resolution: resolution,
|
|
})
|
|
assignedScope = append(assignedScope, scope)
|
|
continue
|
|
}
|
|
|
|
// HWThread -> HWThread
|
|
if nativeScope == schema.MetricScopeHWThread && scope == schema.MetricScopeHWThread {
|
|
queries = append(queries, APIQuery{
|
|
Metric: metric,
|
|
Hostname: host.Hostname,
|
|
Aggregate: false,
|
|
Type: &hwthreadString,
|
|
TypeIds: intToStringSlice(hwthreads),
|
|
Resolution: resolution,
|
|
})
|
|
assignedScope = append(assignedScope, scope)
|
|
continue
|
|
}
|
|
|
|
// HWThread -> Core
|
|
if nativeScope == schema.MetricScopeHWThread && scope == schema.MetricScopeCore {
|
|
cores, _ := topology.GetCoresFromHWThreads(hwthreads)
|
|
for _, core := range cores {
|
|
queries = append(queries, APIQuery{
|
|
Metric: metric,
|
|
Hostname: host.Hostname,
|
|
Aggregate: true,
|
|
Type: &hwthreadString,
|
|
TypeIds: intToStringSlice(topology.Core[core]),
|
|
Resolution: resolution,
|
|
})
|
|
assignedScope = append(assignedScope, scope)
|
|
}
|
|
continue
|
|
}
|
|
|
|
// HWThread -> Socket
|
|
if nativeScope == schema.MetricScopeHWThread && scope == schema.MetricScopeSocket {
|
|
sockets, _ := topology.GetSocketsFromHWThreads(hwthreads)
|
|
for _, socket := range sockets {
|
|
queries = append(queries, APIQuery{
|
|
Metric: metric,
|
|
Hostname: host.Hostname,
|
|
Aggregate: true,
|
|
Type: &hwthreadString,
|
|
TypeIds: intToStringSlice(topology.Socket[socket]),
|
|
Resolution: resolution,
|
|
})
|
|
assignedScope = append(assignedScope, scope)
|
|
}
|
|
continue
|
|
}
|
|
|
|
// HWThread -> Node
|
|
if nativeScope == schema.MetricScopeHWThread && scope == schema.MetricScopeNode {
|
|
queries = append(queries, APIQuery{
|
|
Metric: metric,
|
|
Hostname: host.Hostname,
|
|
Aggregate: true,
|
|
Type: &hwthreadString,
|
|
TypeIds: intToStringSlice(hwthreads),
|
|
Resolution: resolution,
|
|
})
|
|
assignedScope = append(assignedScope, scope)
|
|
continue
|
|
}
|
|
|
|
// Core -> Core
|
|
if nativeScope == schema.MetricScopeCore && scope == schema.MetricScopeCore {
|
|
cores, _ := topology.GetCoresFromHWThreads(hwthreads)
|
|
queries = append(queries, APIQuery{
|
|
Metric: metric,
|
|
Hostname: host.Hostname,
|
|
Aggregate: false,
|
|
Type: &coreString,
|
|
TypeIds: intToStringSlice(cores),
|
|
Resolution: resolution,
|
|
})
|
|
assignedScope = append(assignedScope, scope)
|
|
continue
|
|
}
|
|
|
|
// Core -> Socket
|
|
if nativeScope == schema.MetricScopeCore && scope == schema.MetricScopeSocket {
|
|
sockets, _ := topology.GetSocketsFromCores(hwthreads)
|
|
for _, socket := range sockets {
|
|
queries = append(queries, APIQuery{
|
|
Metric: metric,
|
|
Hostname: host.Hostname,
|
|
Aggregate: true,
|
|
Type: &coreString,
|
|
TypeIds: intToStringSlice(topology.Socket[socket]),
|
|
Resolution: resolution,
|
|
})
|
|
assignedScope = append(assignedScope, scope)
|
|
}
|
|
continue
|
|
}
|
|
|
|
// Core -> Node
|
|
if nativeScope == schema.MetricScopeCore && scope == schema.MetricScopeNode {
|
|
cores, _ := topology.GetCoresFromHWThreads(hwthreads)
|
|
queries = append(queries, APIQuery{
|
|
Metric: metric,
|
|
Hostname: host.Hostname,
|
|
Aggregate: true,
|
|
Type: &coreString,
|
|
TypeIds: intToStringSlice(cores),
|
|
Resolution: resolution,
|
|
})
|
|
assignedScope = append(assignedScope, scope)
|
|
continue
|
|
}
|
|
|
|
// MemoryDomain -> MemoryDomain
|
|
if nativeScope == schema.MetricScopeMemoryDomain && scope == schema.MetricScopeMemoryDomain {
|
|
sockets, _ := topology.GetMemoryDomainsFromHWThreads(hwthreads)
|
|
queries = append(queries, APIQuery{
|
|
Metric: metric,
|
|
Hostname: host.Hostname,
|
|
Aggregate: false,
|
|
Type: &memoryDomainString,
|
|
TypeIds: intToStringSlice(sockets),
|
|
Resolution: resolution,
|
|
})
|
|
assignedScope = append(assignedScope, scope)
|
|
continue
|
|
}
|
|
|
|
// MemoryDomain -> Node
|
|
if nativeScope == schema.MetricScopeMemoryDomain && scope == schema.MetricScopeNode {
|
|
sockets, _ := topology.GetMemoryDomainsFromHWThreads(hwthreads)
|
|
queries = append(queries, APIQuery{
|
|
Metric: metric,
|
|
Hostname: host.Hostname,
|
|
Aggregate: true,
|
|
Type: &memoryDomainString,
|
|
TypeIds: intToStringSlice(sockets),
|
|
Resolution: resolution,
|
|
})
|
|
assignedScope = append(assignedScope, scope)
|
|
continue
|
|
}
|
|
|
|
// Socket -> Socket
|
|
if nativeScope == schema.MetricScopeSocket && scope == schema.MetricScopeSocket {
|
|
sockets, _ := topology.GetSocketsFromHWThreads(hwthreads)
|
|
queries = append(queries, APIQuery{
|
|
Metric: metric,
|
|
Hostname: host.Hostname,
|
|
Aggregate: false,
|
|
Type: &socketString,
|
|
TypeIds: intToStringSlice(sockets),
|
|
Resolution: resolution,
|
|
})
|
|
assignedScope = append(assignedScope, scope)
|
|
continue
|
|
}
|
|
|
|
// Socket -> Node
|
|
if nativeScope == schema.MetricScopeSocket && scope == schema.MetricScopeNode {
|
|
sockets, _ := topology.GetSocketsFromHWThreads(hwthreads)
|
|
queries = append(queries, APIQuery{
|
|
Metric: metric,
|
|
Hostname: host.Hostname,
|
|
Aggregate: true,
|
|
Type: &socketString,
|
|
TypeIds: intToStringSlice(sockets),
|
|
Resolution: resolution,
|
|
})
|
|
assignedScope = append(assignedScope, scope)
|
|
continue
|
|
}
|
|
|
|
// Node -> Node
|
|
if nativeScope == schema.MetricScopeNode && scope == schema.MetricScopeNode {
|
|
queries = append(queries, APIQuery{
|
|
Metric: metric,
|
|
Hostname: host.Hostname,
|
|
Resolution: resolution,
|
|
})
|
|
assignedScope = append(assignedScope, scope)
|
|
continue
|
|
}
|
|
|
|
return nil, nil, fmt.Errorf("METRICDATA/CCMS > TODO: unhandled case: native-scope=%s, requested-scope=%s", nativeScope, requestedScope)
|
|
}
|
|
}
|
|
}
|
|
|
|
return queries, assignedScope, nil
|
|
}
|
|
|
|
// LoadStats loads only metric statistics (avg/min/max) for a job at node scope.
|
|
//
|
|
// This is an optimized version of LoadData that fetches only statistics without
|
|
// time-series data, reducing bandwidth and memory usage. Always queries at node scope.
|
|
//
|
|
// Parameters:
|
|
// - job: Job metadata
|
|
// - metrics: List of metric names
|
|
// - ctx: Context (currently unused)
|
|
//
|
|
// Returns:
|
|
// - Map of metric → hostname → statistics
|
|
// - Error on query building or fetching failure
|
|
func LoadStats(
|
|
job *schema.Job,
|
|
metrics []string,
|
|
ctx context.Context,
|
|
) (map[string]map[string]schema.MetricStatistics, error) {
|
|
// TODO(#166): Add scope parameter for analysis view accelerator normalization
|
|
queries, _, err := buildQueries(job, metrics, []schema.MetricScope{schema.MetricScopeNode}, 0)
|
|
if err != nil {
|
|
cclog.Errorf("Error while building queries for jobId %d, Metrics %v: %s", job.JobID, metrics, err.Error())
|
|
return nil, err
|
|
}
|
|
|
|
req := APIQueryRequest{
|
|
Cluster: job.Cluster,
|
|
From: job.StartTime,
|
|
To: job.StartTime + int64(job.Duration),
|
|
Queries: queries,
|
|
WithStats: true,
|
|
WithData: false,
|
|
}
|
|
|
|
resBody, err := FetchData(req)
|
|
if err != nil {
|
|
cclog.Errorf("Error while fetching data : %s", err.Error())
|
|
return nil, err
|
|
}
|
|
|
|
stats := make(map[string]map[string]schema.MetricStatistics, len(metrics))
|
|
for i, res := range resBody.Results {
|
|
query := req.Queries[i]
|
|
metric := query.Metric
|
|
data := res[0]
|
|
if data.Error != nil {
|
|
cclog.Errorf("fetching %s for node %s failed: %s", metric, query.Hostname, *data.Error)
|
|
continue
|
|
}
|
|
|
|
metricdata, ok := stats[metric]
|
|
if !ok {
|
|
metricdata = make(map[string]schema.MetricStatistics, job.NumNodes)
|
|
stats[metric] = metricdata
|
|
}
|
|
|
|
if data.Avg.IsNaN() || data.Min.IsNaN() || data.Max.IsNaN() {
|
|
cclog.Warnf("fetching %s for node %s failed: one of avg/min/max is NaN", metric, query.Hostname)
|
|
continue
|
|
}
|
|
|
|
metricdata[query.Hostname] = schema.MetricStatistics{
|
|
Avg: float64(data.Avg),
|
|
Min: float64(data.Min),
|
|
Max: float64(data.Max),
|
|
}
|
|
}
|
|
|
|
return stats, nil
|
|
}
|
|
|
|
// LoadScopedStats loads metric statistics for a job with scope-aware grouping.
|
|
//
|
|
// Similar to LoadStats but supports multiple scopes and returns statistics grouped
|
|
// by scope with hardware IDs (e.g., per-core, per-socket statistics).
|
|
//
|
|
// Parameters:
|
|
// - job: Job metadata
|
|
// - metrics: List of metric names
|
|
// - scopes: Requested metric scopes
|
|
// - ctx: Context (currently unused)
|
|
//
|
|
// Returns:
|
|
// - ScopedJobStats: Map of metric → scope → []ScopedStats (with hostname and ID)
|
|
// - Error or partial error listing failed queries
|
|
func LoadScopedStats(
|
|
job *schema.Job,
|
|
metrics []string,
|
|
scopes []schema.MetricScope,
|
|
ctx context.Context,
|
|
) (schema.ScopedJobStats, error) {
|
|
queries, assignedScope, err := buildQueries(job, metrics, scopes, 0)
|
|
if err != nil {
|
|
cclog.Errorf("Error while building queries for jobId %d, Metrics %v, Scopes %v: %s", job.JobID, metrics, scopes, err.Error())
|
|
return nil, err
|
|
}
|
|
|
|
req := APIQueryRequest{
|
|
Cluster: job.Cluster,
|
|
From: job.StartTime,
|
|
To: job.StartTime + int64(job.Duration),
|
|
Queries: queries,
|
|
WithStats: true,
|
|
WithData: false,
|
|
}
|
|
|
|
resBody, err := FetchData(req)
|
|
if err != nil {
|
|
cclog.Errorf("Error while fetching data : %s", err.Error())
|
|
return nil, err
|
|
}
|
|
|
|
var errors []string
|
|
scopedJobStats := make(schema.ScopedJobStats)
|
|
|
|
for i, row := range resBody.Results {
|
|
query := req.Queries[i]
|
|
metric := query.Metric
|
|
scope := assignedScope[i]
|
|
|
|
if _, ok := scopedJobStats[metric]; !ok {
|
|
scopedJobStats[metric] = make(map[schema.MetricScope][]*schema.ScopedStats)
|
|
}
|
|
|
|
if _, ok := scopedJobStats[metric][scope]; !ok {
|
|
scopedJobStats[metric][scope] = make([]*schema.ScopedStats, 0)
|
|
}
|
|
|
|
for ndx, res := range row {
|
|
if res.Error != nil {
|
|
/* Build list for "partial errors", if any */
|
|
errors = append(errors, fmt.Sprintf("failed to fetch '%s' from host '%s': %s", query.Metric, query.Hostname, *res.Error))
|
|
continue
|
|
}
|
|
|
|
id := (*string)(nil)
|
|
if query.Type != nil {
|
|
id = new(string)
|
|
*id = query.TypeIds[ndx]
|
|
}
|
|
|
|
sanitizeStats(&res)
|
|
|
|
scopedJobStats[metric][scope] = append(scopedJobStats[metric][scope], &schema.ScopedStats{
|
|
Hostname: query.Hostname,
|
|
Id: id,
|
|
Data: &schema.MetricStatistics{
|
|
Avg: float64(res.Avg),
|
|
Min: float64(res.Min),
|
|
Max: float64(res.Max),
|
|
},
|
|
})
|
|
}
|
|
|
|
// So that one can later check len(scopedJobStats[metric][scope]): Remove from map if empty
|
|
if len(scopedJobStats[metric][scope]) == 0 {
|
|
delete(scopedJobStats[metric], scope)
|
|
if len(scopedJobStats[metric]) == 0 {
|
|
delete(scopedJobStats, metric)
|
|
}
|
|
}
|
|
}
|
|
|
|
if len(errors) != 0 {
|
|
/* Returns list for "partial errors" */
|
|
return scopedJobStats, fmt.Errorf("METRICDATA/CCMS > Errors: %s", strings.Join(errors, ", "))
|
|
}
|
|
return scopedJobStats, nil
|
|
}
|
|
|
|
// LoadNodeData loads metric data for specific nodes in a cluster over a time range.
|
|
//
|
|
// Unlike LoadData which operates on job resources, this function queries arbitrary nodes
|
|
// directly. Useful for system monitoring and node status views.
|
|
//
|
|
// Parameters:
|
|
// - cluster: Cluster name
|
|
// - metrics: List of metric names
|
|
// - nodes: List of node hostnames (nil = all nodes in cluster via ForAllNodes)
|
|
// - scopes: Requested metric scopes (currently unused - always node scope)
|
|
// - from, to: Time range
|
|
// - ctx: Context (currently unused)
|
|
//
|
|
// Returns:
|
|
// - Map of hostname → metric → []JobMetric
|
|
// - Error or partial error listing failed queries
|
|
func LoadNodeData(
|
|
cluster string,
|
|
metrics, nodes []string,
|
|
scopes []schema.MetricScope,
|
|
from, to time.Time,
|
|
ctx context.Context,
|
|
) (map[string]map[string][]*schema.JobMetric, error) {
|
|
req := APIQueryRequest{
|
|
Cluster: cluster,
|
|
From: from.Unix(),
|
|
To: to.Unix(),
|
|
WithStats: true,
|
|
WithData: true,
|
|
}
|
|
|
|
if nodes == nil {
|
|
req.ForAllNodes = append(req.ForAllNodes, metrics...)
|
|
} else {
|
|
for _, node := range nodes {
|
|
for _, metric := range metrics {
|
|
req.Queries = append(req.Queries, APIQuery{
|
|
Hostname: node,
|
|
Metric: metric,
|
|
Resolution: 0, // Default for Node Queries: Will return metric $Timestep Resolution
|
|
})
|
|
}
|
|
}
|
|
}
|
|
|
|
resBody, err := FetchData(req)
|
|
if err != nil {
|
|
cclog.Errorf("Error while fetching data : %s", err.Error())
|
|
return nil, err
|
|
}
|
|
|
|
var errors []string
|
|
data := make(map[string]map[string][]*schema.JobMetric)
|
|
for i, res := range resBody.Results {
|
|
var query APIQuery
|
|
if resBody.Queries != nil {
|
|
query = resBody.Queries[i]
|
|
} else {
|
|
query = req.Queries[i]
|
|
}
|
|
|
|
metric := query.Metric
|
|
qdata := res[0]
|
|
if qdata.Error != nil {
|
|
errors = append(errors, fmt.Sprintf("fetching %s for node %s failed: %s", metric, query.Hostname, *qdata.Error))
|
|
}
|
|
|
|
sanitizeStats(&qdata)
|
|
|
|
hostdata, ok := data[query.Hostname]
|
|
if !ok {
|
|
hostdata = make(map[string][]*schema.JobMetric)
|
|
data[query.Hostname] = hostdata
|
|
}
|
|
|
|
mc := archive.GetMetricConfig(cluster, metric)
|
|
hostdata[metric] = append(hostdata[metric], &schema.JobMetric{
|
|
Unit: mc.Unit,
|
|
Timestep: mc.Timestep,
|
|
Series: []schema.Series{
|
|
{
|
|
Hostname: query.Hostname,
|
|
Data: qdata.Data,
|
|
Statistics: schema.MetricStatistics{
|
|
Avg: float64(qdata.Avg),
|
|
Min: float64(qdata.Min),
|
|
Max: float64(qdata.Max),
|
|
},
|
|
},
|
|
},
|
|
})
|
|
}
|
|
|
|
if len(errors) != 0 {
|
|
/* Returns list of "partial errors" */
|
|
return data, fmt.Errorf("METRICDATA/CCMS > Errors: %s", strings.Join(errors, ", "))
|
|
}
|
|
|
|
return data, nil
|
|
}
|
|
|
|
// LoadNodeListData loads metric data for a list of nodes with full scope transformation support.
|
|
//
|
|
// This is the most flexible node data loading function, supporting arbitrary scopes and
|
|
// resolution. Uses buildNodeQueries for proper scope transformation based on topology.
|
|
//
|
|
// Parameters:
|
|
// - cluster: Cluster name
|
|
// - subCluster: SubCluster name (empty string to infer from node names)
|
|
// - nodes: List of node hostnames
|
|
// - metrics: List of metric names
|
|
// - scopes: Requested metric scopes
|
|
// - resolution: Data resolution in seconds
|
|
// - from, to: Time range
|
|
// - ctx: Context (currently unused)
|
|
//
|
|
// Returns:
|
|
// - Map of hostname → JobData (metric → scope → JobMetric)
|
|
// - Error or partial error listing failed queries
|
|
func LoadNodeListData(
|
|
cluster, subCluster string,
|
|
nodes []string,
|
|
metrics []string,
|
|
scopes []schema.MetricScope,
|
|
resolution int,
|
|
from, to time.Time,
|
|
ctx context.Context,
|
|
) (map[string]schema.JobData, error) {
|
|
// Note: Order of node data is not guaranteed after this point
|
|
queries, assignedScope, err := buildNodeQueries(cluster, subCluster, nodes, metrics, scopes, int64(resolution))
|
|
if err != nil {
|
|
cclog.Errorf("Error while building node queries for Cluster %s, SubCLuster %s, Metrics %v, Scopes %v: %s", cluster, subCluster, metrics, scopes, err.Error())
|
|
return nil, err
|
|
}
|
|
|
|
req := APIQueryRequest{
|
|
Cluster: cluster,
|
|
Queries: queries,
|
|
From: from.Unix(),
|
|
To: to.Unix(),
|
|
WithStats: true,
|
|
WithData: true,
|
|
}
|
|
|
|
resBody, err := FetchData(req)
|
|
if err != nil {
|
|
cclog.Errorf("Error while fetching data : %s", err.Error())
|
|
return nil, err
|
|
}
|
|
|
|
var errors []string
|
|
data := make(map[string]schema.JobData)
|
|
for i, row := range resBody.Results {
|
|
var query APIQuery
|
|
if resBody.Queries != nil {
|
|
query = resBody.Queries[i]
|
|
} else {
|
|
query = req.Queries[i]
|
|
}
|
|
|
|
metric := query.Metric
|
|
scope := assignedScope[i]
|
|
mc := archive.GetMetricConfig(cluster, metric)
|
|
|
|
res := mc.Timestep
|
|
if len(row) > 0 {
|
|
res = int(row[0].Resolution)
|
|
}
|
|
|
|
// Init Nested Map Data Structures If Not Found
|
|
hostData, ok := data[query.Hostname]
|
|
if !ok {
|
|
hostData = make(schema.JobData)
|
|
data[query.Hostname] = hostData
|
|
}
|
|
|
|
metricData, ok := hostData[metric]
|
|
if !ok {
|
|
metricData = make(map[schema.MetricScope]*schema.JobMetric)
|
|
data[query.Hostname][metric] = metricData
|
|
}
|
|
|
|
scopeData, ok := metricData[scope]
|
|
if !ok {
|
|
scopeData = &schema.JobMetric{
|
|
Unit: mc.Unit,
|
|
Timestep: res,
|
|
Series: make([]schema.Series, 0),
|
|
}
|
|
data[query.Hostname][metric][scope] = scopeData
|
|
}
|
|
|
|
for ndx, res := range row {
|
|
if res.Error != nil {
|
|
/* Build list for "partial errors", if any */
|
|
errors = append(errors, fmt.Sprintf("failed to fetch '%s' from host '%s': %s", query.Metric, query.Hostname, *res.Error))
|
|
continue
|
|
}
|
|
|
|
id := (*string)(nil)
|
|
if query.Type != nil {
|
|
id = new(string)
|
|
*id = query.TypeIds[ndx]
|
|
}
|
|
|
|
sanitizeStats(&res)
|
|
|
|
scopeData.Series = append(scopeData.Series, schema.Series{
|
|
Hostname: query.Hostname,
|
|
Id: id,
|
|
Statistics: schema.MetricStatistics{
|
|
Avg: float64(res.Avg),
|
|
Min: float64(res.Min),
|
|
Max: float64(res.Max),
|
|
},
|
|
Data: res.Data,
|
|
})
|
|
}
|
|
}
|
|
|
|
if len(errors) != 0 {
|
|
/* Returns list of "partial errors" */
|
|
return data, fmt.Errorf("METRICDATA/CCMS > Errors: %s", strings.Join(errors, ", "))
|
|
}
|
|
|
|
return data, nil
|
|
}
|
|
|
|
// buildNodeQueries constructs APIQuery structures for node-based queries with scope transformation.
|
|
//
|
|
// Similar to buildQueries but operates on node lists rather than job resources.
|
|
// Supports dynamic subcluster lookup when subCluster parameter is empty.
|
|
//
|
|
// Parameters:
|
|
// - cluster: Cluster name
|
|
// - subCluster: SubCluster name (empty = infer from node hostnames)
|
|
// - nodes: List of node hostnames
|
|
// - metrics: List of metric names
|
|
// - scopes: Requested metric scopes
|
|
// - resolution: Data resolution in seconds
|
|
//
|
|
// Returns:
|
|
// - []APIQuery: List of queries to execute
|
|
// - []schema.MetricScope: Assigned scope for each query
|
|
// - error: Returns error if topology lookup fails or unhandled scope combination
|
|
func buildNodeQueries(
|
|
cluster string,
|
|
subCluster string,
|
|
nodes []string,
|
|
metrics []string,
|
|
scopes []schema.MetricScope,
|
|
resolution int64,
|
|
) ([]APIQuery, []schema.MetricScope, error) {
|
|
if len(nodes) == 0 {
|
|
return nil, nil, fmt.Errorf("METRICDATA/CCMS > no nodes specified for query")
|
|
}
|
|
|
|
queries := make([]APIQuery, 0, len(metrics)*len(scopes)*len(nodes))
|
|
assignedScope := []schema.MetricScope{}
|
|
|
|
// Get Topol before loop if subCluster given
|
|
var subClusterTopol *schema.SubCluster
|
|
var scterr error
|
|
if subCluster != "" {
|
|
subClusterTopol, scterr = archive.GetSubCluster(cluster, subCluster)
|
|
if scterr != nil {
|
|
cclog.Errorf("could not load cluster %s subCluster %s topology: %s", cluster, subCluster, scterr.Error())
|
|
return nil, nil, scterr
|
|
}
|
|
}
|
|
|
|
for _, metric := range metrics {
|
|
mc := archive.GetMetricConfig(cluster, metric)
|
|
if mc == nil {
|
|
cclog.Warnf("metric '%s' is not specified for cluster '%s'", metric, cluster)
|
|
continue
|
|
}
|
|
|
|
// Skip if metric is removed for subcluster
|
|
if mc.SubClusters != nil {
|
|
isRemoved := false
|
|
for _, scConfig := range mc.SubClusters {
|
|
if scConfig.Name == subCluster && scConfig.Remove {
|
|
isRemoved = true
|
|
break
|
|
}
|
|
}
|
|
if isRemoved {
|
|
continue
|
|
}
|
|
}
|
|
|
|
// Avoid duplicates using map for O(1) lookup
|
|
handledScopes := make(map[schema.MetricScope]bool, 3)
|
|
|
|
for _, requestedScope := range scopes {
|
|
nativeScope := mc.Scope
|
|
|
|
scope := nativeScope.Max(requestedScope)
|
|
if handledScopes[scope] {
|
|
continue
|
|
}
|
|
handledScopes[scope] = true
|
|
|
|
for _, hostname := range nodes {
|
|
|
|
// If no subCluster given, get it by node
|
|
if subCluster == "" {
|
|
subClusterName, scnerr := archive.GetSubClusterByNode(cluster, hostname)
|
|
if scnerr != nil {
|
|
return nil, nil, scnerr
|
|
}
|
|
subClusterTopol, scterr = archive.GetSubCluster(cluster, subClusterName)
|
|
if scterr != nil {
|
|
return nil, nil, scterr
|
|
}
|
|
}
|
|
|
|
// Always full node hwthread id list, no partial queries expected -> Use "topology.Node" directly where applicable
|
|
// Always full accelerator id list, no partial queries expected -> Use "acceleratorIds" directly where applicable
|
|
topology := subClusterTopol.Topology
|
|
acceleratorIds := topology.GetAcceleratorIDs()
|
|
|
|
// Moved check here if metric matches hardware specs
|
|
if nativeScope == schema.MetricScopeAccelerator && len(acceleratorIds) == 0 {
|
|
continue
|
|
}
|
|
|
|
// Accelerator -> Accelerator (Use "accelerator" scope if requested scope is lower than node)
|
|
if nativeScope == schema.MetricScopeAccelerator && scope.LT(schema.MetricScopeNode) {
|
|
if scope != schema.MetricScopeAccelerator {
|
|
// Skip all other catched cases
|
|
continue
|
|
}
|
|
|
|
queries = append(queries, APIQuery{
|
|
Metric: metric,
|
|
Hostname: hostname,
|
|
Aggregate: false,
|
|
Type: &acceleratorString,
|
|
TypeIds: acceleratorIds,
|
|
Resolution: resolution,
|
|
})
|
|
assignedScope = append(assignedScope, schema.MetricScopeAccelerator)
|
|
continue
|
|
}
|
|
|
|
// Accelerator -> Node
|
|
if nativeScope == schema.MetricScopeAccelerator && scope == schema.MetricScopeNode {
|
|
if len(acceleratorIds) == 0 {
|
|
continue
|
|
}
|
|
|
|
queries = append(queries, APIQuery{
|
|
Metric: metric,
|
|
Hostname: hostname,
|
|
Aggregate: true,
|
|
Type: &acceleratorString,
|
|
TypeIds: acceleratorIds,
|
|
Resolution: resolution,
|
|
})
|
|
assignedScope = append(assignedScope, scope)
|
|
continue
|
|
}
|
|
|
|
// HWThread -> HWThread
|
|
if nativeScope == schema.MetricScopeHWThread && scope == schema.MetricScopeHWThread {
|
|
queries = append(queries, APIQuery{
|
|
Metric: metric,
|
|
Hostname: hostname,
|
|
Aggregate: false,
|
|
Type: &hwthreadString,
|
|
TypeIds: intToStringSlice(topology.Node),
|
|
Resolution: resolution,
|
|
})
|
|
assignedScope = append(assignedScope, scope)
|
|
continue
|
|
}
|
|
|
|
// HWThread -> Core
|
|
if nativeScope == schema.MetricScopeHWThread && scope == schema.MetricScopeCore {
|
|
cores, _ := topology.GetCoresFromHWThreads(topology.Node)
|
|
for _, core := range cores {
|
|
queries = append(queries, APIQuery{
|
|
Metric: metric,
|
|
Hostname: hostname,
|
|
Aggregate: true,
|
|
Type: &hwthreadString,
|
|
TypeIds: intToStringSlice(topology.Core[core]),
|
|
Resolution: resolution,
|
|
})
|
|
assignedScope = append(assignedScope, scope)
|
|
}
|
|
continue
|
|
}
|
|
|
|
// HWThread -> Socket
|
|
if nativeScope == schema.MetricScopeHWThread && scope == schema.MetricScopeSocket {
|
|
sockets, _ := topology.GetSocketsFromHWThreads(topology.Node)
|
|
for _, socket := range sockets {
|
|
queries = append(queries, APIQuery{
|
|
Metric: metric,
|
|
Hostname: hostname,
|
|
Aggregate: true,
|
|
Type: &hwthreadString,
|
|
TypeIds: intToStringSlice(topology.Socket[socket]),
|
|
Resolution: resolution,
|
|
})
|
|
assignedScope = append(assignedScope, scope)
|
|
}
|
|
continue
|
|
}
|
|
|
|
// HWThread -> Node
|
|
if nativeScope == schema.MetricScopeHWThread && scope == schema.MetricScopeNode {
|
|
queries = append(queries, APIQuery{
|
|
Metric: metric,
|
|
Hostname: hostname,
|
|
Aggregate: true,
|
|
Type: &hwthreadString,
|
|
TypeIds: intToStringSlice(topology.Node),
|
|
Resolution: resolution,
|
|
})
|
|
assignedScope = append(assignedScope, scope)
|
|
continue
|
|
}
|
|
|
|
// Core -> Core
|
|
if nativeScope == schema.MetricScopeCore && scope == schema.MetricScopeCore {
|
|
cores, _ := topology.GetCoresFromHWThreads(topology.Node)
|
|
queries = append(queries, APIQuery{
|
|
Metric: metric,
|
|
Hostname: hostname,
|
|
Aggregate: false,
|
|
Type: &coreString,
|
|
TypeIds: intToStringSlice(cores),
|
|
Resolution: resolution,
|
|
})
|
|
assignedScope = append(assignedScope, scope)
|
|
continue
|
|
}
|
|
|
|
// Core -> Socket
|
|
if nativeScope == schema.MetricScopeCore && scope == schema.MetricScopeSocket {
|
|
sockets, _ := topology.GetSocketsFromCores(topology.Node)
|
|
for _, socket := range sockets {
|
|
queries = append(queries, APIQuery{
|
|
Metric: metric,
|
|
Hostname: hostname,
|
|
Aggregate: true,
|
|
Type: &coreString,
|
|
TypeIds: intToStringSlice(topology.Socket[socket]),
|
|
Resolution: resolution,
|
|
})
|
|
assignedScope = append(assignedScope, scope)
|
|
}
|
|
continue
|
|
}
|
|
|
|
// Core -> Node
|
|
if nativeScope == schema.MetricScopeCore && scope == schema.MetricScopeNode {
|
|
cores, _ := topology.GetCoresFromHWThreads(topology.Node)
|
|
queries = append(queries, APIQuery{
|
|
Metric: metric,
|
|
Hostname: hostname,
|
|
Aggregate: true,
|
|
Type: &coreString,
|
|
TypeIds: intToStringSlice(cores),
|
|
Resolution: resolution,
|
|
})
|
|
assignedScope = append(assignedScope, scope)
|
|
continue
|
|
}
|
|
|
|
// MemoryDomain -> MemoryDomain
|
|
if nativeScope == schema.MetricScopeMemoryDomain && scope == schema.MetricScopeMemoryDomain {
|
|
sockets, _ := topology.GetMemoryDomainsFromHWThreads(topology.Node)
|
|
queries = append(queries, APIQuery{
|
|
Metric: metric,
|
|
Hostname: hostname,
|
|
Aggregate: false,
|
|
Type: &memoryDomainString,
|
|
TypeIds: intToStringSlice(sockets),
|
|
Resolution: resolution,
|
|
})
|
|
assignedScope = append(assignedScope, scope)
|
|
continue
|
|
}
|
|
|
|
// MemoryDomain -> Node
|
|
if nativeScope == schema.MetricScopeMemoryDomain && scope == schema.MetricScopeNode {
|
|
sockets, _ := topology.GetMemoryDomainsFromHWThreads(topology.Node)
|
|
queries = append(queries, APIQuery{
|
|
Metric: metric,
|
|
Hostname: hostname,
|
|
Aggregate: true,
|
|
Type: &memoryDomainString,
|
|
TypeIds: intToStringSlice(sockets),
|
|
Resolution: resolution,
|
|
})
|
|
assignedScope = append(assignedScope, scope)
|
|
continue
|
|
}
|
|
|
|
// Socket -> Socket
|
|
if nativeScope == schema.MetricScopeSocket && scope == schema.MetricScopeSocket {
|
|
sockets, _ := topology.GetSocketsFromHWThreads(topology.Node)
|
|
queries = append(queries, APIQuery{
|
|
Metric: metric,
|
|
Hostname: hostname,
|
|
Aggregate: false,
|
|
Type: &socketString,
|
|
TypeIds: intToStringSlice(sockets),
|
|
Resolution: resolution,
|
|
})
|
|
assignedScope = append(assignedScope, scope)
|
|
continue
|
|
}
|
|
|
|
// Socket -> Node
|
|
if nativeScope == schema.MetricScopeSocket && scope == schema.MetricScopeNode {
|
|
sockets, _ := topology.GetSocketsFromHWThreads(topology.Node)
|
|
queries = append(queries, APIQuery{
|
|
Metric: metric,
|
|
Hostname: hostname,
|
|
Aggregate: true,
|
|
Type: &socketString,
|
|
TypeIds: intToStringSlice(sockets),
|
|
Resolution: resolution,
|
|
})
|
|
assignedScope = append(assignedScope, scope)
|
|
continue
|
|
}
|
|
|
|
// Node -> Node
|
|
if nativeScope == schema.MetricScopeNode && scope == schema.MetricScopeNode {
|
|
queries = append(queries, APIQuery{
|
|
Metric: metric,
|
|
Hostname: hostname,
|
|
Resolution: resolution,
|
|
})
|
|
assignedScope = append(assignedScope, scope)
|
|
continue
|
|
}
|
|
|
|
return nil, nil, fmt.Errorf("METRICDATA/CCMS > TODO: unhandled case: native-scope=%s, requested-scope=%s", nativeScope, requestedScope)
|
|
}
|
|
}
|
|
}
|
|
|
|
return queries, assignedScope, nil
|
|
}
|
|
|
|
// sanitizeStats converts NaN statistics to zero for JSON compatibility.
|
|
//
|
|
// schema.Float with NaN values cannot be properly JSON-encoded, so we convert
|
|
// NaN to 0. This loses the distinction between "no data" and "zero value",
|
|
// but maintains API compatibility.
|
|
func sanitizeStats(data *APIMetricData) {
|
|
if data.Avg.IsNaN() {
|
|
data.Avg = schema.Float(0)
|
|
}
|
|
if data.Min.IsNaN() {
|
|
data.Min = schema.Float(0)
|
|
}
|
|
if data.Max.IsNaN() {
|
|
data.Max = schema.Float(0)
|
|
}
|
|
}
|
|
|
|
// intToStringSlice renders every integer in is as its base-10 string form,
// preserving order.
//
// It bridges topology IDs (hardware threads, cores, sockets, ...), which are
// ints, to APIQuery TypeIds, which are strings. An empty or nil input yields
// a nil slice rather than an empty one.
func intToStringSlice(is []int) []string {
	n := len(is)
	if n == 0 {
		return nil
	}

	out := make([]string, 0, n)
	for _, id := range is {
		out = append(out, strconv.Itoa(id))
	}
	return out
}
|