migrate changes from cc-backend PR#364

This commit is contained in:
Christoph Kluge
2026-02-20 15:10:02 +01:00
parent e1c1148160
commit bae7ec11b4
4 changed files with 266 additions and 53 deletions

View File

@@ -70,14 +70,15 @@ func (ccms *CCMetricStore) buildQueries(
scopes []schema.MetricScope,
resolution int,
) ([]APIQuery, []schema.MetricScope, error) {
// Initialize both slices together
queries := make([]APIQuery, 0, len(metrics)*len(scopes)*len(job.Resources))
assignedScope := []schema.MetricScope{}
assignedScope := make([]schema.MetricScope, 0, len(metrics)*len(scopes)*len(job.Resources))
subcluster, scerr := archive.GetSubCluster(job.Cluster, job.SubCluster)
if scerr != nil {
return nil, nil, scerr
topology, err := ccms.getTopology(job.Cluster, job.SubCluster)
if err != nil {
cclog.Errorf("could not load cluster %s subCluster %s topology: %s", job.Cluster, job.SubCluster, err.Error())
return nil, nil, err
}
topology := subcluster.Topology
for _, metric := range metrics {
remoteName := metric
@@ -128,7 +129,7 @@ func (ccms *CCMetricStore) buildQueries(
hostQueries, hostScopes := buildScopeQueries(
nativeScope, requestedScope,
remoteName, host.Hostname,
&topology, hwthreads, host.Accelerators,
topology, hwthreads, host.Accelerators,
resolution,
)
@@ -163,19 +164,9 @@ func (ccms *CCMetricStore) buildNodeQueries(
scopes []schema.MetricScope,
resolution int,
) ([]APIQuery, []schema.MetricScope, error) {
// Initialize both slices together
queries := make([]APIQuery, 0, len(metrics)*len(scopes)*len(nodes))
assignedScope := []schema.MetricScope{}
// Get Topol before loop if subCluster given
var subClusterTopol *schema.SubCluster
var scterr error
if subCluster != "" {
subClusterTopol, scterr = archive.GetSubCluster(cluster, subCluster)
if scterr != nil {
cclog.Errorf("could not load cluster %s subCluster %s topology: %s", cluster, subCluster, scterr.Error())
return nil, nil, scterr
}
}
assignedScope := make([]schema.MetricScope, 0, len(metrics)*len(scopes)*len(nodes))
for _, metric := range metrics {
remoteName := metric
@@ -215,22 +206,22 @@ func (ccms *CCMetricStore) buildNodeQueries(
handledScopes = append(handledScopes, scope)
for _, hostname := range nodes {
var topology *schema.Topology
var err error
// If no subCluster given, get it by node
if subCluster == "" {
subClusterName, scnerr := archive.GetSubClusterByNode(cluster, hostname)
if scnerr != nil {
return nil, nil, scnerr
}
subClusterTopol, scterr = archive.GetSubCluster(cluster, subClusterName)
if scterr != nil {
return nil, nil, scterr
}
topology, err = ccms.getTopologyByNode(cluster, hostname)
} else {
topology, err = ccms.getTopology(cluster, subCluster)
}
if err != nil {
return nil, nil, err
}
// Always full node hwthread id list, no partial queries expected -> Use "topology.Node" directly where applicable
// Always full accelerator id list, no partial queries expected -> Use "acceleratorIds" directly where applicable
topology := subClusterTopol.Topology
acceleratorIds := topology.GetAcceleratorIDs()
// Moved check here if metric matches hardware specs
@@ -241,7 +232,7 @@ func (ccms *CCMetricStore) buildNodeQueries(
nodeQueries, nodeScopes := buildScopeQueries(
nativeScope, requestedScope,
remoteName, hostname,
&topology, topology.Node, acceleratorIds,
topology, topology.Node, acceleratorIds,
resolution,
)
@@ -278,7 +269,6 @@ func buildScopeQueries(
// Accelerator -> Accelerator (Use "accelerator" scope if requested scope is lower than node)
if nativeScope == schema.MetricScopeAccelerator && scope.LT(schema.MetricScopeNode) {
if scope != schema.MetricScopeAccelerator {
// Skip all other caught cases
return queries, scopes
}
@@ -451,6 +441,31 @@ func buildScopeQueries(
return queries, scopes
}
// MemoryDomain -> Socket
if nativeScope == schema.MetricScopeMemoryDomain && scope == schema.MetricScopeSocket {
memDomains, _ := topology.GetMemoryDomainsFromHWThreads(hwthreads)
socketToDomains, err := topology.GetMemoryDomainsBySocket(memDomains)
if err != nil {
cclog.Errorf("Error mapping memory domains to sockets, return unchanged: %v", err)
return queries, scopes
}
// Create a query for each socket
for _, domains := range socketToDomains {
queries = append(queries, APIQuery{
Metric: metric,
Hostname: hostname,
Aggregate: true,
Type: &memoryDomainString,
TypeIds: intToStringSlice(domains),
Resolution: resolution,
})
// Add scope for each query, not just once
scopes = append(scopes, scope)
}
return queries, scopes
}
// Socket -> Socket
if nativeScope == schema.MetricScopeSocket && scope == schema.MetricScopeSocket {
sockets, _ := topology.GetSocketsFromHWThreads(hwthreads)

View File

@@ -71,10 +71,11 @@ import (
// CCMetricStore is the HTTP client for communicating with cc-metric-store.
// It manages connection details, authentication, and provides methods for querying metrics.
type CCMetricStore struct {
client http.Client // HTTP client with 10-second timeout
jwt string // JWT Bearer token for authentication
url string // Base URL of cc-metric-store instance
queryEndpoint string // Full URL to query API endpoint
client http.Client // HTTP client with 10-second timeout
jwt string // JWT Bearer token for authentication
url string // Base URL of cc-metric-store instance
queryEndpoint string // Full URL to query API endpoint
topologyCache map[string]*schema.Topology // cluster -> topology cache
}
// APIQueryRequest represents a request to the cc-metric-store query API.
@@ -133,6 +134,7 @@ func NewCCMetricStore(url string, token string) *CCMetricStore {
client: http.Client{
Timeout: 10 * time.Second,
},
topologyCache: make(map[string]*schema.Topology),
}
}
@@ -185,6 +187,32 @@ func (ccms *CCMetricStore) doRequest(
return &resBody, nil
}
// getTopology returns the topology for a given cluster and subcluster, caching it if not already present
func (ccms *CCMetricStore) getTopology(cluster, subCluster string) (*schema.Topology, error) {
cacheKey := fmt.Sprintf("%s:%s", cluster, subCluster)
if topology, ok := ccms.topologyCache[cacheKey]; ok {
return topology, nil
}
subcluster, err := archive.GetSubCluster(cluster, subCluster)
if err != nil {
return nil, err
}
ccms.topologyCache[cacheKey] = &subcluster.Topology
return &subcluster.Topology, nil
}
// getTopologyByNode returns the topology for a given cluster and node, caching it if not already present
func (ccms *CCMetricStore) getTopologyByNode(cluster, node string) (*schema.Topology, error) {
subCluster, err := archive.GetSubClusterByNode(cluster, node)
if err != nil {
return nil, err
}
return ccms.getTopology(cluster, subCluster)
}
// LoadData retrieves time series data and statistics for the specified job and metrics.
// It queries data for the job's time range and resources, handling scope transformations automatically.
//
@@ -210,6 +238,12 @@ func (ccms *CCMetricStore) LoadData(
return nil, err
}
// Verify assignment is correct - log any inconsistencies for debugging
if len(queries) != len(assignedScope) {
cclog.Errorf("Critical error: queries and assignedScope have different lengths after buildQueries: %d vs %d",
len(queries), len(assignedScope))
}
req := APIQueryRequest{
Cluster: job.Cluster,
From: job.StartTime,
@@ -227,11 +261,37 @@ func (ccms *CCMetricStore) LoadData(
var errors []string
jobData := make(schema.JobData)
// Add safety check for potential index out of range errors
if len(resBody.Results) != len(req.Queries) || len(assignedScope) != len(req.Queries) {
cclog.Warnf("Mismatch in query results count: queries=%d, results=%d, assignedScope=%d",
len(req.Queries), len(resBody.Results), len(assignedScope))
if len(resBody.Results) > len(req.Queries) {
resBody.Results = resBody.Results[:len(req.Queries)]
}
if len(assignedScope) > len(req.Queries) {
assignedScope = assignedScope[:len(req.Queries)]
}
}
for i, row := range resBody.Results {
// Safety check to prevent index out of range errors
if i >= len(req.Queries) || i >= len(assignedScope) {
cclog.Warnf("Index out of range prevented: i=%d, queries=%d, assignedScope=%d",
i, len(req.Queries), len(assignedScope))
continue
}
query := req.Queries[i]
metric := query.Metric
scope := assignedScope[i]
mc := archive.GetMetricConfig(job.Cluster, metric)
if mc == nil {
cclog.Warnf("Metric config not found for %s on cluster %s", metric, job.Cluster)
continue
}
if _, ok := jobData[metric]; !ok {
jobData[metric] = make(map[schema.MetricScope]*schema.JobMetric)
}
@@ -260,8 +320,15 @@ func (ccms *CCMetricStore) LoadData(
id := (*string)(nil)
if query.Type != nil {
id = new(string)
*id = query.TypeIds[ndx]
// Check if ndx is within the bounds of TypeIds slice
if ndx < len(query.TypeIds) {
id = new(string)
*id = query.TypeIds[ndx]
} else {
// Log the error but continue processing
cclog.Warnf("TypeIds index out of range: %d with length %d for metric %s on host %s",
ndx, len(query.TypeIds), query.Metric, query.Hostname)
}
}
sanitizeStats(&res.Avg, &res.Min, &res.Max)
@@ -412,8 +479,15 @@ func (ccms *CCMetricStore) LoadScopedStats(
id := (*string)(nil)
if query.Type != nil {
id = new(string)
*id = query.TypeIds[ndx]
// Check if ndx is within the bounds of TypeIds slice
if ndx < len(query.TypeIds) {
id = new(string)
*id = query.TypeIds[ndx]
} else {
// Log the error but continue processing
cclog.Warnf("TypeIds index out of range: %d with length %d for metric %s on host %s",
ndx, len(query.TypeIds), query.Metric, query.Hostname)
}
}
sanitizeStats(&res.Avg, &res.Min, &res.Max)
@@ -561,6 +635,12 @@ func (ccms *CCMetricStore) LoadNodeListData(
return nil, err
}
// Verify assignment is correct - log any inconsistencies for debugging
if len(queries) != len(assignedScope) {
cclog.Errorf("Critical error: queries and assignedScope have different lengths after buildNodeQueries: %d vs %d",
len(queries), len(assignedScope))
}
req := APIQueryRequest{
Cluster: cluster,
Queries: queries,
@@ -578,17 +658,47 @@ func (ccms *CCMetricStore) LoadNodeListData(
var errors []string
data := make(map[string]schema.JobData)
// Add safety check for index out of range issues
if len(resBody.Results) != len(req.Queries) || len(assignedScope) != len(req.Queries) {
cclog.Warnf("Mismatch in query results count: queries=%d, results=%d, assignedScope=%d",
len(req.Queries), len(resBody.Results), len(assignedScope))
if len(resBody.Results) > len(req.Queries) {
resBody.Results = resBody.Results[:len(req.Queries)]
}
if len(assignedScope) > len(req.Queries) {
assignedScope = assignedScope[:len(req.Queries)]
}
}
for i, row := range resBody.Results {
// Safety check to prevent index out of range errors
if i >= len(req.Queries) || i >= len(assignedScope) {
cclog.Warnf("Index out of range prevented: i=%d, queries=%d, assignedScope=%d",
i, len(req.Queries), len(assignedScope))
continue
}
var query APIQuery
if resBody.Queries != nil {
query = resBody.Queries[i]
if i < len(resBody.Queries) {
query = resBody.Queries[i]
} else {
cclog.Warnf("Index out of range prevented for resBody.Queries: i=%d, len=%d",
i, len(resBody.Queries))
continue
}
} else {
query = req.Queries[i]
}
// qdata := res[0]
metric := query.Metric
scope := assignedScope[i]
mc := archive.GetMetricConfig(cluster, metric)
if mc == nil {
cclog.Warnf("Metric config not found for %s on cluster %s", metric, cluster)
continue
}
res := mc.Timestep
if len(row) > 0 {
@@ -627,8 +737,15 @@ func (ccms *CCMetricStore) LoadNodeListData(
id := (*string)(nil)
if query.Type != nil {
id = new(string)
*id = query.TypeIds[ndx]
// Check if ndx is within the bounds of TypeIds slice
if ndx < len(query.TypeIds) {
id = new(string)
*id = query.TypeIds[ndx]
} else {
// Log the error but continue processing
cclog.Warnf("TypeIds index out of range: %d with length %d for metric %s on host %s",
ndx, len(query.TypeIds), query.Metric, query.Hostname)
}
}
sanitizeStats(&res.Avg, &res.Min, &res.Max)

View File

@@ -93,6 +93,12 @@ func (ccms *InternalMetricStore) LoadData(
return nil, err
}
// Verify assignment is correct - log any inconsistencies for debugging
if len(queries) != len(assignedScope) {
cclog.Errorf("Critical error: queries and assignedScope have different lengths after buildQueries: %d vs %d",
len(queries), len(assignedScope))
}
req := APIQueryRequest{
Cluster: job.Cluster,
From: job.StartTime,
@@ -110,9 +116,24 @@ func (ccms *InternalMetricStore) LoadData(
var errors []string
jobData := make(schema.JobData)
// Add safety check for potential index out of range errors
if len(resBody.Results) != len(req.Queries) || len(assignedScope) != len(req.Queries) {
cclog.Warnf("Mismatch in query results count: queries=%d, results=%d, assignedScope=%d",
len(req.Queries), len(resBody.Results), len(assignedScope))
if len(resBody.Results) > len(req.Queries) {
resBody.Results = resBody.Results[:len(req.Queries)]
}
if len(assignedScope) > len(req.Queries) {
assignedScope = assignedScope[:len(req.Queries)]
}
}
for i, row := range resBody.Results {
if len(row) == 0 {
// No Data Found For Metric, Logged in FetchData to Warn
// Safety check to prevent index out of range errors
if i >= len(req.Queries) || i >= len(assignedScope) {
cclog.Warnf("Index out of range prevented: i=%d, queries=%d, assignedScope=%d",
i, len(req.Queries), len(assignedScope))
continue
}
@@ -120,6 +141,12 @@ func (ccms *InternalMetricStore) LoadData(
metric := query.Metric
scope := assignedScope[i]
mc := archive.GetMetricConfig(job.Cluster, metric)
if mc == nil {
cclog.Warnf("Metric config not found for %s on cluster %s", metric, job.Cluster)
continue
}
if _, ok := jobData[metric]; !ok {
jobData[metric] = make(map[schema.MetricScope]*schema.JobMetric)
}
@@ -148,8 +175,15 @@ func (ccms *InternalMetricStore) LoadData(
id := (*string)(nil)
if query.Type != nil {
id = new(string)
*id = query.TypeIds[ndx]
// Check if ndx is within the bounds of TypeIds slice
if ndx < len(query.TypeIds) {
id = new(string)
*id = query.TypeIds[ndx]
} else {
// Log the error but continue processing
cclog.Warnf("TypeIds index out of range: %d with length %d for metric %s on host %s",
ndx, len(query.TypeIds), query.Metric, query.Hostname)
}
}
sanitizeStats(&res)
@@ -650,8 +684,15 @@ func (ccms *InternalMetricStore) LoadScopedStats(
id := (*string)(nil)
if query.Type != nil {
id = new(string)
*id = query.TypeIds[ndx]
// Check if ndx is within the bounds of TypeIds slice
if ndx < len(query.TypeIds) {
id = new(string)
*id = query.TypeIds[ndx]
} else {
// Log the error but continue processing
cclog.Warnf("TypeIds index out of range: %d with length %d for metric %s on host %s",
ndx, len(query.TypeIds), query.Metric, query.Hostname)
}
}
sanitizeStats(&res)
@@ -823,6 +864,12 @@ func (ccms *InternalMetricStore) LoadNodeListData(
return nil, err
}
// Verify assignment is correct - log any inconsistencies for debugging
if len(queries) != len(assignedScope) {
cclog.Errorf("Critical error: queries and assignedScope have different lengths after buildNodeQueries: %d vs %d",
len(queries), len(assignedScope))
}
req := APIQueryRequest{
Cluster: cluster,
Queries: queries,
@@ -840,14 +887,36 @@ func (ccms *InternalMetricStore) LoadNodeListData(
var errors []string
data := make(map[string]schema.JobData)
// Add safety check for index out of range issues
if len(resBody.Results) != len(req.Queries) || len(assignedScope) != len(req.Queries) {
cclog.Warnf("Mismatch in query results count: queries=%d, results=%d, assignedScope=%d",
len(req.Queries), len(resBody.Results), len(assignedScope))
if len(resBody.Results) > len(req.Queries) {
resBody.Results = resBody.Results[:len(req.Queries)]
}
if len(assignedScope) > len(req.Queries) {
assignedScope = assignedScope[:len(req.Queries)]
}
}
for i, row := range resBody.Results {
if len(row) == 0 {
// No Data Found For Metric, Logged in FetchData to Warn
// Safety check to prevent index out of range errors
if i >= len(req.Queries) || i >= len(assignedScope) {
cclog.Warnf("Index out of range prevented: i=%d, queries=%d, assignedScope=%d",
i, len(req.Queries), len(assignedScope))
continue
}
var query APIQuery
if resBody.Queries != nil {
query = resBody.Queries[i]
if i < len(resBody.Queries) {
query = resBody.Queries[i]
} else {
cclog.Warnf("Index out of range prevented for resBody.Queries: i=%d, len=%d",
i, len(resBody.Queries))
continue
}
} else {
query = req.Queries[i]
}
@@ -855,6 +924,10 @@ func (ccms *InternalMetricStore) LoadNodeListData(
metric := query.Metric
scope := assignedScope[i]
mc := archive.GetMetricConfig(cluster, metric)
if mc == nil {
cclog.Warnf("Metric config not found for %s on cluster %s", metric, cluster)
continue
}
res := mc.Timestep
if len(row) > 0 {
@@ -893,8 +966,15 @@ func (ccms *InternalMetricStore) LoadNodeListData(
id := (*string)(nil)
if query.Type != nil {
id = new(string)
*id = query.TypeIds[ndx]
// Check if ndx is within the bounds of TypeIds slice
if ndx < len(query.TypeIds) {
id = new(string)
*id = query.TypeIds[ndx]
} else {
// Log the error but continue processing
cclog.Warnf("TypeIds index out of range: %d with length %d for metric %s on host %s",
ndx, len(query.TypeIds), query.Metric, query.Hostname)
}
}
sanitizeStats(&res)

View File

@@ -60,6 +60,7 @@ export function init(extraInitQuery = "") {
topology {
node
socket
memoryDomain
core
accelerators { id }
}
@@ -238,7 +239,7 @@ export function groupByScope(jobMetrics) {
const scopeGranularity = {
node: 10,
socket: 5,
memorydomain: 4,
memoryDomain: 4,
core: 3,
hwthread: 2,
accelerator: 1