Merge pull request #500 from ClusterCockpit/dev

Dev
This commit is contained in:
Jan Eitzinger
2026-02-24 06:46:49 +01:00
committed by GitHub
8 changed files with 324 additions and 83 deletions

2
go.mod
View File

@@ -9,7 +9,7 @@ tool (
require (
github.com/99designs/gqlgen v0.17.86
github.com/ClusterCockpit/cc-lib/v2 v2.5.1
github.com/ClusterCockpit/cc-lib/v2 v2.6.0
github.com/Masterminds/squirrel v1.5.4
github.com/aws/aws-sdk-go-v2 v1.41.1
github.com/aws/aws-sdk-go-v2/config v1.32.8

2
go.sum
View File

@@ -6,6 +6,8 @@ github.com/Azure/go-ntlmssp v0.1.0 h1:DjFo6YtWzNqNvQdrwEyr/e4nhU3vRiwenz5QX7sFz+
github.com/Azure/go-ntlmssp v0.1.0/go.mod h1:NYqdhxd/8aAct/s4qSYZEerdPuH1liG2/X9DiVTbhpk=
github.com/ClusterCockpit/cc-lib/v2 v2.5.1 h1:s6M9tyPDty+4zTdQGJYKpGJM9Nz7N6ITMdjPvNSLX5g=
github.com/ClusterCockpit/cc-lib/v2 v2.5.1/go.mod h1:DZ8OIHPUZJpWqErLITt0B8P6/Q7CBW2IQSQ5YiFFaG0=
github.com/ClusterCockpit/cc-lib/v2 v2.6.0 h1:Q7zvRAVhfYA9PDB18pfY9A/6Ws4oWpnv8+P9MBRUDzg=
github.com/ClusterCockpit/cc-lib/v2 v2.6.0/go.mod h1:DZ8OIHPUZJpWqErLITt0B8P6/Q7CBW2IQSQ5YiFFaG0=
github.com/DATA-DOG/go-sqlmock v1.5.2 h1:OcvFkGmslmlZibjAjaHm3L//6LiuBgolP7OputlJIzU=
github.com/DATA-DOG/go-sqlmock v1.5.2/go.mod h1:88MAG/4G7SMwSE3CeA0ZKzrT5CiOU3OJ+JlNzwDqpNU=
github.com/KyleBanks/depth v1.2.1 h1:5h8fQADFrWtarTdtDudMmGsC7GPbOAu6RVB3ffsVFHc=

View File

@@ -70,14 +70,15 @@ func (ccms *CCMetricStore) buildQueries(
scopes []schema.MetricScope,
resolution int,
) ([]APIQuery, []schema.MetricScope, error) {
// Initialize both slices together
queries := make([]APIQuery, 0, len(metrics)*len(scopes)*len(job.Resources))
assignedScope := []schema.MetricScope{}
assignedScope := make([]schema.MetricScope, 0, len(metrics)*len(scopes)*len(job.Resources))
subcluster, scerr := archive.GetSubCluster(job.Cluster, job.SubCluster)
if scerr != nil {
return nil, nil, scerr
topology, err := ccms.getTopology(job.Cluster, job.SubCluster)
if err != nil {
cclog.Errorf("could not load cluster %s subCluster %s topology: %s", job.Cluster, job.SubCluster, err.Error())
return nil, nil, err
}
topology := subcluster.Topology
for _, metric := range metrics {
remoteName := metric
@@ -128,7 +129,7 @@ func (ccms *CCMetricStore) buildQueries(
hostQueries, hostScopes := buildScopeQueries(
nativeScope, requestedScope,
remoteName, host.Hostname,
&topology, hwthreads, host.Accelerators,
topology, hwthreads, host.Accelerators,
resolution,
)
@@ -163,19 +164,9 @@ func (ccms *CCMetricStore) buildNodeQueries(
scopes []schema.MetricScope,
resolution int,
) ([]APIQuery, []schema.MetricScope, error) {
// Initialize both slices together
queries := make([]APIQuery, 0, len(metrics)*len(scopes)*len(nodes))
assignedScope := []schema.MetricScope{}
// Get Topol before loop if subCluster given
var subClusterTopol *schema.SubCluster
var scterr error
if subCluster != "" {
subClusterTopol, scterr = archive.GetSubCluster(cluster, subCluster)
if scterr != nil {
cclog.Errorf("could not load cluster %s subCluster %s topology: %s", cluster, subCluster, scterr.Error())
return nil, nil, scterr
}
}
assignedScope := make([]schema.MetricScope, 0, len(metrics)*len(scopes)*len(nodes))
for _, metric := range metrics {
remoteName := metric
@@ -215,22 +206,22 @@ func (ccms *CCMetricStore) buildNodeQueries(
handledScopes = append(handledScopes, scope)
for _, hostname := range nodes {
var topology *schema.Topology
var err error
// If no subCluster given, get it by node
if subCluster == "" {
subClusterName, scnerr := archive.GetSubClusterByNode(cluster, hostname)
if scnerr != nil {
return nil, nil, scnerr
}
subClusterTopol, scterr = archive.GetSubCluster(cluster, subClusterName)
if scterr != nil {
return nil, nil, scterr
}
topology, err = ccms.getTopologyByNode(cluster, hostname)
} else {
topology, err = ccms.getTopology(cluster, subCluster)
}
if err != nil {
return nil, nil, err
}
// Always full node hwthread id list, no partial queries expected -> Use "topology.Node" directly where applicable
// Always full accelerator id list, no partial queries expected -> Use "acceleratorIds" directly where applicable
topology := subClusterTopol.Topology
acceleratorIds := topology.GetAcceleratorIDs()
// Moved check here if metric matches hardware specs
@@ -241,7 +232,7 @@ func (ccms *CCMetricStore) buildNodeQueries(
nodeQueries, nodeScopes := buildScopeQueries(
nativeScope, requestedScope,
remoteName, hostname,
&topology, topology.Node, acceleratorIds,
topology, topology.Node, acceleratorIds,
resolution,
)
@@ -278,7 +269,6 @@ func buildScopeQueries(
// Accelerator -> Accelerator (Use "accelerator" scope if requested scope is lower than node)
if nativeScope == schema.MetricScopeAccelerator && scope.LT(schema.MetricScopeNode) {
if scope != schema.MetricScopeAccelerator {
// Skip all other caught cases
return queries, scopes
}
@@ -451,6 +441,31 @@ func buildScopeQueries(
return queries, scopes
}
// MemoryDomain -> Socket
if nativeScope == schema.MetricScopeMemoryDomain && scope == schema.MetricScopeSocket {
memDomains, _ := topology.GetMemoryDomainsFromHWThreads(hwthreads)
socketToDomains, err := topology.GetMemoryDomainsBySocket(memDomains)
if err != nil {
cclog.Errorf("Error mapping memory domains to sockets, return unchanged: %v", err)
return queries, scopes
}
// Create a query for each socket
for _, domains := range socketToDomains {
queries = append(queries, APIQuery{
Metric: metric,
Hostname: hostname,
Aggregate: true,
Type: &memoryDomainString,
TypeIds: intToStringSlice(domains),
Resolution: resolution,
})
// Add scope for each query, not just once
scopes = append(scopes, scope)
}
return queries, scopes
}
// Socket -> Socket
if nativeScope == schema.MetricScopeSocket && scope == schema.MetricScopeSocket {
sockets, _ := topology.GetSocketsFromHWThreads(hwthreads)

View File

@@ -71,10 +71,11 @@ import (
// CCMetricStore is the HTTP client for communicating with cc-metric-store.
// It manages connection details, authentication, and provides methods for querying metrics.
type CCMetricStore struct {
client http.Client // HTTP client with 10-second timeout
jwt string // JWT Bearer token for authentication
url string // Base URL of cc-metric-store instance
queryEndpoint string // Full URL to query API endpoint
client http.Client // HTTP client with 10-second timeout
jwt string // JWT Bearer token for authentication
url string // Base URL of cc-metric-store instance
queryEndpoint string // Full URL to query API endpoint
topologyCache map[string]*schema.Topology // cluster -> topology cache
}
// APIQueryRequest represents a request to the cc-metric-store query API.
@@ -133,6 +134,7 @@ func NewCCMetricStore(url string, token string) *CCMetricStore {
client: http.Client{
Timeout: 10 * time.Second,
},
topologyCache: make(map[string]*schema.Topology),
}
}
@@ -185,6 +187,32 @@ func (ccms *CCMetricStore) doRequest(
return &resBody, nil
}
// getTopology returns the topology for a given cluster and subcluster, caching it if not already present.
//
// The cache key is "cluster:subCluster". On a miss, the subcluster is loaded
// via archive.GetSubCluster and its Topology is stored by pointer, so all
// callers share one cached instance.
//
// NOTE(review): topologyCache is read and written without any lock here.
// If CCMetricStore methods are called from multiple goroutines (typical for
// an HTTP client), this is a data race on the map — confirm single-goroutine
// use or guard the cache with a sync.RWMutex.
func (ccms *CCMetricStore) getTopology(cluster, subCluster string) (*schema.Topology, error) {
cacheKey := fmt.Sprintf("%s:%s", cluster, subCluster)
// Fast path: return the previously cached topology.
if topology, ok := ccms.topologyCache[cacheKey]; ok {
return topology, nil
}
subcluster, err := archive.GetSubCluster(cluster, subCluster)
if err != nil {
// Propagate lookup failure unchanged; nothing is cached on error.
return nil, err
}
// Cache a pointer into the subcluster record; callers must treat the
// returned topology as read-only since it is shared.
ccms.topologyCache[cacheKey] = &subcluster.Topology
return &subcluster.Topology, nil
}
// getTopologyByNode returns the topology for a given cluster and node,
// caching it if not already present. It resolves the node's subcluster
// first and then delegates to getTopology, which owns the cache.
func (ccms *CCMetricStore) getTopologyByNode(cluster, node string) (*schema.Topology, error) {
subClusterName, lookupErr := archive.GetSubClusterByNode(cluster, node)
if lookupErr != nil {
return nil, lookupErr
}
return ccms.getTopology(cluster, subClusterName)
}
// LoadData retrieves time series data and statistics for the specified job and metrics.
// It queries data for the job's time range and resources, handling scope transformations automatically.
//
@@ -210,6 +238,12 @@ func (ccms *CCMetricStore) LoadData(
return nil, err
}
// Verify assignment is correct - log any inconsistencies for debugging
if len(queries) != len(assignedScope) {
cclog.Errorf("Critical error: queries and assignedScope have different lengths after buildQueries: %d vs %d",
len(queries), len(assignedScope))
}
req := APIQueryRequest{
Cluster: job.Cluster,
From: job.StartTime,
@@ -227,11 +261,37 @@ func (ccms *CCMetricStore) LoadData(
var errors []string
jobData := make(schema.JobData)
// Add safety check for potential index out of range errors
if len(resBody.Results) != len(req.Queries) || len(assignedScope) != len(req.Queries) {
cclog.Warnf("Mismatch in query results count: queries=%d, results=%d, assignedScope=%d",
len(req.Queries), len(resBody.Results), len(assignedScope))
if len(resBody.Results) > len(req.Queries) {
resBody.Results = resBody.Results[:len(req.Queries)]
}
if len(assignedScope) > len(req.Queries) {
assignedScope = assignedScope[:len(req.Queries)]
}
}
for i, row := range resBody.Results {
// Safety check to prevent index out of range errors
if i >= len(req.Queries) || i >= len(assignedScope) {
cclog.Warnf("Index out of range prevented: i=%d, queries=%d, assignedScope=%d",
i, len(req.Queries), len(assignedScope))
continue
}
query := req.Queries[i]
metric := query.Metric
scope := assignedScope[i]
mc := archive.GetMetricConfig(job.Cluster, metric)
if mc == nil {
cclog.Warnf("Metric config not found for %s on cluster %s", metric, job.Cluster)
continue
}
if _, ok := jobData[metric]; !ok {
jobData[metric] = make(map[schema.MetricScope]*schema.JobMetric)
}
@@ -260,8 +320,15 @@ func (ccms *CCMetricStore) LoadData(
id := (*string)(nil)
if query.Type != nil {
id = new(string)
*id = query.TypeIds[ndx]
// Check if ndx is within the bounds of TypeIds slice
if ndx < len(query.TypeIds) {
id = new(string)
*id = query.TypeIds[ndx]
} else {
// Log the error but continue processing
cclog.Warnf("TypeIds index out of range: %d with length %d for metric %s on host %s",
ndx, len(query.TypeIds), query.Metric, query.Hostname)
}
}
sanitizeStats(&res.Avg, &res.Min, &res.Max)
@@ -412,8 +479,15 @@ func (ccms *CCMetricStore) LoadScopedStats(
id := (*string)(nil)
if query.Type != nil {
id = new(string)
*id = query.TypeIds[ndx]
// Check if ndx is within the bounds of TypeIds slice
if ndx < len(query.TypeIds) {
id = new(string)
*id = query.TypeIds[ndx]
} else {
// Log the error but continue processing
cclog.Warnf("TypeIds index out of range: %d with length %d for metric %s on host %s",
ndx, len(query.TypeIds), query.Metric, query.Hostname)
}
}
sanitizeStats(&res.Avg, &res.Min, &res.Max)
@@ -561,6 +635,12 @@ func (ccms *CCMetricStore) LoadNodeListData(
return nil, err
}
// Verify assignment is correct - log any inconsistencies for debugging
if len(queries) != len(assignedScope) {
cclog.Errorf("Critical error: queries and assignedScope have different lengths after buildNodeQueries: %d vs %d",
len(queries), len(assignedScope))
}
req := APIQueryRequest{
Cluster: cluster,
Queries: queries,
@@ -578,17 +658,47 @@ func (ccms *CCMetricStore) LoadNodeListData(
var errors []string
data := make(map[string]schema.JobData)
// Add safety check for index out of range issues
if len(resBody.Results) != len(req.Queries) || len(assignedScope) != len(req.Queries) {
cclog.Warnf("Mismatch in query results count: queries=%d, results=%d, assignedScope=%d",
len(req.Queries), len(resBody.Results), len(assignedScope))
if len(resBody.Results) > len(req.Queries) {
resBody.Results = resBody.Results[:len(req.Queries)]
}
if len(assignedScope) > len(req.Queries) {
assignedScope = assignedScope[:len(req.Queries)]
}
}
for i, row := range resBody.Results {
// Safety check to prevent index out of range errors
if i >= len(req.Queries) || i >= len(assignedScope) {
cclog.Warnf("Index out of range prevented: i=%d, queries=%d, assignedScope=%d",
i, len(req.Queries), len(assignedScope))
continue
}
var query APIQuery
if resBody.Queries != nil {
query = resBody.Queries[i]
if i < len(resBody.Queries) {
query = resBody.Queries[i]
} else {
cclog.Warnf("Index out of range prevented for resBody.Queries: i=%d, len=%d",
i, len(resBody.Queries))
continue
}
} else {
query = req.Queries[i]
}
// qdata := res[0]
metric := query.Metric
scope := assignedScope[i]
mc := archive.GetMetricConfig(cluster, metric)
if mc == nil {
cclog.Warnf("Metric config not found for %s on cluster %s", metric, cluster)
continue
}
res := mc.Timestep
if len(row) > 0 {
@@ -627,8 +737,15 @@ func (ccms *CCMetricStore) LoadNodeListData(
id := (*string)(nil)
if query.Type != nil {
id = new(string)
*id = query.TypeIds[ndx]
// Check if ndx is within the bounds of TypeIds slice
if ndx < len(query.TypeIds) {
id = new(string)
*id = query.TypeIds[ndx]
} else {
// Log the error but continue processing
cclog.Warnf("TypeIds index out of range: %d with length %d for metric %s on host %s",
ndx, len(query.TypeIds), query.Metric, query.Hostname)
}
}
sanitizeStats(&res.Avg, &res.Min, &res.Max)

View File

@@ -190,6 +190,8 @@ func (t *JobClassTagger) EventCallback() {
cclog.Fatal(err)
}
t.rules = make(map[string]ruleInfo)
parametersFile := filepath.Join(t.cfgPath, parametersFileName)
if util.CheckFileExists(parametersFile) {
cclog.Info("Merge parameters")
@@ -301,17 +303,21 @@ func (t *JobClassTagger) Register() error {
// - Shared parameters defined in parameters.json
// - Computed variables from the rule definition
//
// Rules are evaluated in arbitrary order. If multiple rules match, only the first
// encountered match is applied (FIXME: this should handle multiple matches).
// Rules are evaluated in arbitrary order. Multiple rules can match and apply
// their tags to the same job. Hint messages from all matching rules are collected
// and stored as a combined message in the job metadata.
func (t *JobClassTagger) Match(job *schema.Job) {
jobStats, err := t.getStatistics(job)
metricsList := t.getMetricConfig(job.Cluster, job.SubCluster)
cclog.Infof("Enter match rule with %d rules for job %d", len(t.rules), job.JobID)
cclog.Infof("Enter match rule with %d rules for job %d", len(t.rules), job.JobID)
if err != nil {
cclog.Errorf("job classification failed for job %d: %#v", job.JobID, err)
cclog.Errorf("job classification failed for job %d: %#v", job.JobID, err)
return
}
id := *job.ID
var messages []string
for tag, ri := range t.rules {
env := make(map[string]any)
maps.Copy(env, ri.env)
@@ -329,11 +335,13 @@ func (t *JobClassTagger) Match(job *schema.Job) {
}
// add metrics to env
skipRule := false
for _, m := range ri.metrics {
stats, ok := jobStats[m]
if !ok {
cclog.Errorf("job classification failed for job %d: missing metric '%s'", job.JobID, m)
return
cclog.Errorf("job classification: missing metric '%s' for rule %s on job %d", m, tag, job.JobID)
skipRule = true
break
}
env[m] = map[string]any{
"min": stats.Min,
@@ -347,44 +355,55 @@ func (t *JobClassTagger) Match(job *schema.Job) {
},
}
}
if skipRule {
continue
}
// check rule requirements apply
requirementsMet := true
for _, r := range ri.requirements {
ok, err := expr.Run(r, env)
if err != nil {
cclog.Errorf("error running requirement for rule %s: %#v", tag, err)
return
requirementsMet = false
break
}
if !ok.(bool) {
cclog.Infof("requirement for rule %s not met", tag)
return
requirementsMet = false
break
}
}
if !requirementsMet {
continue
}
// validate rule expression
// evaluate rule variables
varError := false
for _, v := range ri.variables {
value, err := expr.Run(v.expr, env)
if err != nil {
cclog.Errorf("error running rule %s: %#v", tag, err)
return
cclog.Errorf("error evaluating variable %s for rule %s: %#v", v.name, tag, err)
varError = true
break
}
env[v.name] = value
}
// dump.P(env)
if varError {
continue
}
match, err := expr.Run(ri.rule, env)
if err != nil {
cclog.Errorf("error running rule %s: %#v", tag, err)
return
continue
}
if match.(bool) {
cclog.Info("Rule matches!")
id := *job.ID
if !t.repo.HasTag(id, t.tagType, tag) {
_, err := t.repo.AddTagOrCreateDirect(id, t.tagType, tag)
if err != nil {
return
if _, err := t.repo.AddTagOrCreateDirect(id, t.tagType, tag); err != nil {
cclog.Errorf("failed to add tag '%s' to job %d: %v", tag, id, err)
continue
}
}
@@ -392,17 +411,18 @@ func (t *JobClassTagger) Match(job *schema.Job) {
var msg bytes.Buffer
if err := ri.hint.Execute(&msg, env); err != nil {
cclog.Errorf("Template error: %s", err.Error())
return
}
// FIXME: Handle case where multiple tags apply
// FIXME: Handle case where multiple tags apply
err = t.repo.UpdateMetadata(job, "message", msg.String())
if err != nil {
return
continue
}
messages = append(messages, msg.String())
} else {
cclog.Info("Rule does not match!")
}
}
if len(messages) > 0 {
combined := strings.Join(messages, "\n")
if err := t.repo.UpdateMetadata(job, "message", combined); err != nil {
cclog.Errorf("failed to update metadata for job %d: %v", *job.ID, err)
}
}
}

View File

@@ -98,6 +98,8 @@ func (t *AppTagger) EventCallback() {
cclog.Fatal(err)
}
t.apps = make([]appInfo, 0)
for _, fn := range files {
if fn.IsDir() {
continue
@@ -163,7 +165,7 @@ func (t *AppTagger) Register() error {
// It fetches the job metadata, extracts the job script, and matches it against
// all configured application patterns using regular expressions.
// If a match is found, the corresponding application tag is added to the job.
// Only the first matching application is tagged.
// Multiple application tags can be applied if patterns for different apps match.
func (t *AppTagger) Match(job *schema.Job) {
r := repository.GetJobRepository()
@@ -199,6 +201,7 @@ func (t *AppTagger) Match(job *schema.Job) {
jobscriptLower := strings.ToLower(jobscript)
cclog.Debugf("AppTagger: matching job %d (script length: %d) against %d apps", id, len(jobscriptLower), len(t.apps))
matched := false
for _, a := range t.apps {
for _, re := range a.patterns {
if re.MatchString(jobscriptLower) {
@@ -210,10 +213,13 @@ func (t *AppTagger) Match(job *schema.Job) {
cclog.Errorf("AppTagger: failed to add tag '%s' to job %d: %v", a.tag, id, err)
}
}
return
matched = true
break // matched this app, move to next app
}
}
}
cclog.Debugf("AppTagger: no pattern matched for job %d on %s", id, job.Cluster)
if !matched {
cclog.Debugf("AppTagger: no pattern matched for job %d on %s", id, job.Cluster)
}
}

View File

@@ -93,6 +93,12 @@ func (ccms *InternalMetricStore) LoadData(
return nil, err
}
// Verify assignment is correct - log any inconsistencies for debugging
if len(queries) != len(assignedScope) {
cclog.Errorf("Critical error: queries and assignedScope have different lengths after buildQueries: %d vs %d",
len(queries), len(assignedScope))
}
req := APIQueryRequest{
Cluster: job.Cluster,
From: job.StartTime,
@@ -110,9 +116,24 @@ func (ccms *InternalMetricStore) LoadData(
var errors []string
jobData := make(schema.JobData)
// Add safety check for potential index out of range errors
if len(resBody.Results) != len(req.Queries) || len(assignedScope) != len(req.Queries) {
cclog.Warnf("Mismatch in query results count: queries=%d, results=%d, assignedScope=%d",
len(req.Queries), len(resBody.Results), len(assignedScope))
if len(resBody.Results) > len(req.Queries) {
resBody.Results = resBody.Results[:len(req.Queries)]
}
if len(assignedScope) > len(req.Queries) {
assignedScope = assignedScope[:len(req.Queries)]
}
}
for i, row := range resBody.Results {
if len(row) == 0 {
// No Data Found For Metric, Logged in FetchData to Warn
// Safety check to prevent index out of range errors
if i >= len(req.Queries) || i >= len(assignedScope) {
cclog.Warnf("Index out of range prevented: i=%d, queries=%d, assignedScope=%d",
i, len(req.Queries), len(assignedScope))
continue
}
@@ -120,6 +141,12 @@ func (ccms *InternalMetricStore) LoadData(
metric := query.Metric
scope := assignedScope[i]
mc := archive.GetMetricConfig(job.Cluster, metric)
if mc == nil {
cclog.Warnf("Metric config not found for %s on cluster %s", metric, job.Cluster)
continue
}
if _, ok := jobData[metric]; !ok {
jobData[metric] = make(map[schema.MetricScope]*schema.JobMetric)
}
@@ -148,8 +175,15 @@ func (ccms *InternalMetricStore) LoadData(
id := (*string)(nil)
if query.Type != nil {
id = new(string)
*id = query.TypeIds[ndx]
// Check if ndx is within the bounds of TypeIds slice
if ndx < len(query.TypeIds) {
id = new(string)
*id = query.TypeIds[ndx]
} else {
// Log the error but continue processing
cclog.Warnf("TypeIds index out of range: %d with length %d for metric %s on host %s",
ndx, len(query.TypeIds), query.Metric, query.Hostname)
}
}
sanitizeStats(&res)
@@ -650,8 +684,15 @@ func (ccms *InternalMetricStore) LoadScopedStats(
id := (*string)(nil)
if query.Type != nil {
id = new(string)
*id = query.TypeIds[ndx]
// Check if ndx is within the bounds of TypeIds slice
if ndx < len(query.TypeIds) {
id = new(string)
*id = query.TypeIds[ndx]
} else {
// Log the error but continue processing
cclog.Warnf("TypeIds index out of range: %d with length %d for metric %s on host %s",
ndx, len(query.TypeIds), query.Metric, query.Hostname)
}
}
sanitizeStats(&res)
@@ -823,6 +864,12 @@ func (ccms *InternalMetricStore) LoadNodeListData(
return nil, err
}
// Verify assignment is correct - log any inconsistencies for debugging
if len(queries) != len(assignedScope) {
cclog.Errorf("Critical error: queries and assignedScope have different lengths after buildNodeQueries: %d vs %d",
len(queries), len(assignedScope))
}
req := APIQueryRequest{
Cluster: cluster,
Queries: queries,
@@ -840,14 +887,36 @@ func (ccms *InternalMetricStore) LoadNodeListData(
var errors []string
data := make(map[string]schema.JobData)
// Add safety check for index out of range issues
if len(resBody.Results) != len(req.Queries) || len(assignedScope) != len(req.Queries) {
cclog.Warnf("Mismatch in query results count: queries=%d, results=%d, assignedScope=%d",
len(req.Queries), len(resBody.Results), len(assignedScope))
if len(resBody.Results) > len(req.Queries) {
resBody.Results = resBody.Results[:len(req.Queries)]
}
if len(assignedScope) > len(req.Queries) {
assignedScope = assignedScope[:len(req.Queries)]
}
}
for i, row := range resBody.Results {
if len(row) == 0 {
// No Data Found For Metric, Logged in FetchData to Warn
// Safety check to prevent index out of range errors
if i >= len(req.Queries) || i >= len(assignedScope) {
cclog.Warnf("Index out of range prevented: i=%d, queries=%d, assignedScope=%d",
i, len(req.Queries), len(assignedScope))
continue
}
var query APIQuery
if resBody.Queries != nil {
query = resBody.Queries[i]
if i < len(resBody.Queries) {
query = resBody.Queries[i]
} else {
cclog.Warnf("Index out of range prevented for resBody.Queries: i=%d, len=%d",
i, len(resBody.Queries))
continue
}
} else {
query = req.Queries[i]
}
@@ -855,6 +924,10 @@ func (ccms *InternalMetricStore) LoadNodeListData(
metric := query.Metric
scope := assignedScope[i]
mc := archive.GetMetricConfig(cluster, metric)
if mc == nil {
cclog.Warnf("Metric config not found for %s on cluster %s", metric, cluster)
continue
}
res := mc.Timestep
if len(row) > 0 {
@@ -893,8 +966,15 @@ func (ccms *InternalMetricStore) LoadNodeListData(
id := (*string)(nil)
if query.Type != nil {
id = new(string)
*id = query.TypeIds[ndx]
// Check if ndx is within the bounds of TypeIds slice
if ndx < len(query.TypeIds) {
id = new(string)
*id = query.TypeIds[ndx]
} else {
// Log the error but continue processing
cclog.Warnf("TypeIds index out of range: %d with length %d for metric %s on host %s",
ndx, len(query.TypeIds), query.Metric, query.Hostname)
}
}
sanitizeStats(&res)

View File

@@ -60,6 +60,7 @@ export function init(extraInitQuery = "") {
topology {
node
socket
memoryDomain
core
accelerators { id }
}
@@ -238,7 +239,7 @@ export function groupByScope(jobMetrics) {
const scopeGranularity = {
node: 10,
socket: 5,
memorydomain: 4,
memoryDomain: 4,
core: 3,
hwthread: 2,
accelerator: 1