diff --git a/Makefile b/Makefile index 3246538a..5829beae 100644 --- a/Makefile +++ b/Makefile @@ -84,4 +84,4 @@ $(VAR): $(SVELTE_TARGETS): $(SVELTE_SRC) $(info ===> BUILD frontend) - cd web/frontend && npm install && npm run build + cd web/frontend && npm ci && npm run build diff --git a/configs/tagger/jobclasses/highMemoryUsage.json b/configs/tagger/jobclasses/highMemoryUsage.json index 3c10b06f..f241457d 100644 --- a/configs/tagger/jobclasses/highMemoryUsage.json +++ b/configs/tagger/jobclasses/highMemoryUsage.json @@ -11,15 +11,11 @@ "job.duration > job_min_duration_seconds" ], "variables": [ - { - "name": "memory_threshold", - "expr": "mem_used.limits.peak * highmemoryusage_threshold_factor" - }, { "name": "memory_usage_pct", "expr": "mem_used.max / mem_used.limits.peak * 100.0" } ], - "rule": "mem_used.max > memory_threshold", + "rule": "mem_used.max > mem_used.limits.alert", "hint": "This job used high memory: peak memory usage {{.mem_used.max}} GB ({{.memory_usage_pct}}% of {{.mem_used.limits.peak}} GB node capacity), exceeding the {{.highmemoryusage_threshold_factor}} utilization threshold. Risk of out-of-memory conditions." 
} diff --git a/configs/tagger/jobclasses/lowload.json b/configs/tagger/jobclasses/lowload.json index 7fa3ca3b..767d8f45 100644 --- a/configs/tagger/jobclasses/lowload.json +++ b/configs/tagger/jobclasses/lowload.json @@ -1,10 +1,7 @@ { "name": "Low CPU load", "tag": "lowload", - "parameters": [ - "lowcpuload_threshold_factor", - "job_min_duration_seconds" - ], + "parameters": ["lowcpuload_threshold_factor", "job_min_duration_seconds"], "metrics": ["cpu_load"], "requirements": [ "job.shared == \"none\"", @@ -13,9 +10,9 @@ "variables": [ { "name": "load_threshold", - "expr": "job.numCores * lowcpuload_threshold_factor" + "expr": "cpu_load.limits.peak * lowcpuload_threshold_factor" } ], "rule": "cpu_load.avg < load_threshold", - "hint": "This job was detected as low CPU load: average cpu load {{.cpu_load.avg}} is below the threshold {{.load_threshold}} ({{.lowcpuload_threshold_factor}} \u00d7 {{.job.numCores}} allocated cores)." + "hint": "This job was detected as low CPU load: average cpu load {{.cpu_load.avg}} is below the threshold {{.load_threshold}} ({{.lowcpuload_threshold_factor}})." 
} diff --git a/go.mod b/go.mod index 067ec05d..c561f627 100644 --- a/go.mod +++ b/go.mod @@ -9,7 +9,7 @@ tool ( require ( github.com/99designs/gqlgen v0.17.86 - github.com/ClusterCockpit/cc-lib/v2 v2.5.1 + github.com/ClusterCockpit/cc-lib/v2 v2.6.0 github.com/Masterminds/squirrel v1.5.4 github.com/aws/aws-sdk-go-v2 v1.41.1 github.com/aws/aws-sdk-go-v2/config v1.32.8 diff --git a/go.sum b/go.sum index c4835106..67e03188 100644 --- a/go.sum +++ b/go.sum @@ -6,6 +6,8 @@ github.com/Azure/go-ntlmssp v0.1.0 h1:DjFo6YtWzNqNvQdrwEyr/e4nhU3vRiwenz5QX7sFz+ github.com/Azure/go-ntlmssp v0.1.0/go.mod h1:NYqdhxd/8aAct/s4qSYZEerdPuH1liG2/X9DiVTbhpk= github.com/ClusterCockpit/cc-lib/v2 v2.5.1 h1:s6M9tyPDty+4zTdQGJYKpGJM9Nz7N6ITMdjPvNSLX5g= github.com/ClusterCockpit/cc-lib/v2 v2.5.1/go.mod h1:DZ8OIHPUZJpWqErLITt0B8P6/Q7CBW2IQSQ5YiFFaG0= +github.com/ClusterCockpit/cc-lib/v2 v2.6.0 h1:Q7zvRAVhfYA9PDB18pfY9A/6Ws4oWpnv8+P9MBRUDzg= +github.com/ClusterCockpit/cc-lib/v2 v2.6.0/go.mod h1:DZ8OIHPUZJpWqErLITt0B8P6/Q7CBW2IQSQ5YiFFaG0= github.com/DATA-DOG/go-sqlmock v1.5.2 h1:OcvFkGmslmlZibjAjaHm3L//6LiuBgolP7OputlJIzU= github.com/DATA-DOG/go-sqlmock v1.5.2/go.mod h1:88MAG/4G7SMwSE3CeA0ZKzrT5CiOU3OJ+JlNzwDqpNU= github.com/KyleBanks/depth v1.2.1 h1:5h8fQADFrWtarTdtDudMmGsC7GPbOAu6RVB3ffsVFHc= diff --git a/internal/metricstoreclient/cc-metric-store-queries.go b/internal/metricstoreclient/cc-metric-store-queries.go index acd8c979..d42c9355 100644 --- a/internal/metricstoreclient/cc-metric-store-queries.go +++ b/internal/metricstoreclient/cc-metric-store-queries.go @@ -70,14 +70,15 @@ func (ccms *CCMetricStore) buildQueries( scopes []schema.MetricScope, resolution int, ) ([]APIQuery, []schema.MetricScope, error) { + // Initialize both slices together queries := make([]APIQuery, 0, len(metrics)*len(scopes)*len(job.Resources)) - assignedScope := []schema.MetricScope{} + assignedScope := make([]schema.MetricScope, 0, len(metrics)*len(scopes)*len(job.Resources)) - subcluster, scerr := 
archive.GetSubCluster(job.Cluster, job.SubCluster) - if scerr != nil { - return nil, nil, scerr + topology, err := ccms.getTopology(job.Cluster, job.SubCluster) + if err != nil { + cclog.Errorf("could not load cluster %s subCluster %s topology: %s", job.Cluster, job.SubCluster, err.Error()) + return nil, nil, err } - topology := subcluster.Topology for _, metric := range metrics { remoteName := metric @@ -128,7 +129,7 @@ func (ccms *CCMetricStore) buildQueries( hostQueries, hostScopes := buildScopeQueries( nativeScope, requestedScope, remoteName, host.Hostname, - &topology, hwthreads, host.Accelerators, + topology, hwthreads, host.Accelerators, resolution, ) @@ -163,19 +164,9 @@ func (ccms *CCMetricStore) buildNodeQueries( scopes []schema.MetricScope, resolution int, ) ([]APIQuery, []schema.MetricScope, error) { + // Initialize both slices together queries := make([]APIQuery, 0, len(metrics)*len(scopes)*len(nodes)) - assignedScope := []schema.MetricScope{} - - // Get Topol before loop if subCluster given - var subClusterTopol *schema.SubCluster - var scterr error - if subCluster != "" { - subClusterTopol, scterr = archive.GetSubCluster(cluster, subCluster) - if scterr != nil { - cclog.Errorf("could not load cluster %s subCluster %s topology: %s", cluster, subCluster, scterr.Error()) - return nil, nil, scterr - } - } + assignedScope := make([]schema.MetricScope, 0, len(metrics)*len(scopes)*len(nodes)) for _, metric := range metrics { remoteName := metric @@ -215,22 +206,22 @@ func (ccms *CCMetricStore) buildNodeQueries( handledScopes = append(handledScopes, scope) for _, hostname := range nodes { + var topology *schema.Topology + var err error // If no subCluster given, get it by node if subCluster == "" { - subClusterName, scnerr := archive.GetSubClusterByNode(cluster, hostname) - if scnerr != nil { - return nil, nil, scnerr - } - subClusterTopol, scterr = archive.GetSubCluster(cluster, subClusterName) - if scterr != nil { - return nil, nil, scterr - } + topology, 
err = ccms.getTopologyByNode(cluster, hostname) + } else { + topology, err = ccms.getTopology(cluster, subCluster) + } + + if err != nil { + return nil, nil, err } // Always full node hwthread id list, no partial queries expected -> Use "topology.Node" directly where applicable // Always full accelerator id list, no partial queries expected -> Use "acceleratorIds" directly where applicable - topology := subClusterTopol.Topology acceleratorIds := topology.GetAcceleratorIDs() // Moved check here if metric matches hardware specs @@ -241,7 +232,7 @@ func (ccms *CCMetricStore) buildNodeQueries( nodeQueries, nodeScopes := buildScopeQueries( nativeScope, requestedScope, remoteName, hostname, - &topology, topology.Node, acceleratorIds, + topology, topology.Node, acceleratorIds, resolution, ) @@ -278,7 +269,6 @@ func buildScopeQueries( // Accelerator -> Accelerator (Use "accelerator" scope if requested scope is lower than node) if nativeScope == schema.MetricScopeAccelerator && scope.LT(schema.MetricScopeNode) { if scope != schema.MetricScopeAccelerator { - // Skip all other caught cases return queries, scopes } @@ -451,6 +441,31 @@ func buildScopeQueries( return queries, scopes } + // MemoryDomain -> Socket + if nativeScope == schema.MetricScopeMemoryDomain && scope == schema.MetricScopeSocket { + memDomains, _ := topology.GetMemoryDomainsFromHWThreads(hwthreads) + socketToDomains, err := topology.GetMemoryDomainsBySocket(memDomains) + if err != nil { + cclog.Errorf("Error mapping memory domains to sockets, return unchanged: %v", err) + return queries, scopes + } + + // Create a query for each socket + for _, domains := range socketToDomains { + queries = append(queries, APIQuery{ + Metric: metric, + Hostname: hostname, + Aggregate: true, + Type: &memoryDomainString, + TypeIds: intToStringSlice(domains), + Resolution: resolution, + }) + // Add scope for each query, not just once + scopes = append(scopes, scope) + } + return queries, scopes + } + // Socket -> Socket if 
nativeScope == schema.MetricScopeSocket && scope == schema.MetricScopeSocket { sockets, _ := topology.GetSocketsFromHWThreads(hwthreads) diff --git a/internal/metricstoreclient/cc-metric-store.go b/internal/metricstoreclient/cc-metric-store.go index 81add789..7bf7d146 100644 --- a/internal/metricstoreclient/cc-metric-store.go +++ b/internal/metricstoreclient/cc-metric-store.go @@ -71,10 +71,11 @@ import ( // CCMetricStore is the HTTP client for communicating with cc-metric-store. // It manages connection details, authentication, and provides methods for querying metrics. type CCMetricStore struct { - client http.Client // HTTP client with 10-second timeout - jwt string // JWT Bearer token for authentication - url string // Base URL of cc-metric-store instance - queryEndpoint string // Full URL to query API endpoint + client http.Client // HTTP client with 10-second timeout + jwt string // JWT Bearer token for authentication + url string // Base URL of cc-metric-store instance + queryEndpoint string // Full URL to query API endpoint + topologyCache map[string]*schema.Topology // cluster -> topology cache } // APIQueryRequest represents a request to the cc-metric-store query API. 
@@ -133,6 +134,7 @@ func NewCCMetricStore(url string, token string) *CCMetricStore { client: http.Client{ Timeout: 10 * time.Second, }, + topologyCache: make(map[string]*schema.Topology), } } @@ -185,6 +187,32 @@ func (ccms *CCMetricStore) doRequest( return &resBody, nil } +// getTopology returns the topology for a given cluster and subcluster, caching it if not already present +func (ccms *CCMetricStore) getTopology(cluster, subCluster string) (*schema.Topology, error) { + cacheKey := fmt.Sprintf("%s:%s", cluster, subCluster) + if topology, ok := ccms.topologyCache[cacheKey]; ok { + return topology, nil + } + + subcluster, err := archive.GetSubCluster(cluster, subCluster) + if err != nil { + return nil, err + } + + ccms.topologyCache[cacheKey] = &subcluster.Topology + return &subcluster.Topology, nil +} + +// getTopologyByNode returns the topology for a given cluster and node, caching it if not already present +func (ccms *CCMetricStore) getTopologyByNode(cluster, node string) (*schema.Topology, error) { + subCluster, err := archive.GetSubClusterByNode(cluster, node) + if err != nil { + return nil, err + } + + return ccms.getTopology(cluster, subCluster) +} + // LoadData retrieves time series data and statistics for the specified job and metrics. // It queries data for the job's time range and resources, handling scope transformations automatically. 
// @@ -210,6 +238,12 @@ func (ccms *CCMetricStore) LoadData( return nil, err } + // Verify assignment is correct - log any inconsistencies for debugging + if len(queries) != len(assignedScope) { + cclog.Errorf("Critical error: queries and assignedScope have different lengths after buildQueries: %d vs %d", + len(queries), len(assignedScope)) + } + req := APIQueryRequest{ Cluster: job.Cluster, From: job.StartTime, @@ -227,11 +261,37 @@ func (ccms *CCMetricStore) LoadData( var errors []string jobData := make(schema.JobData) + + // Add safety check for potential index out of range errors + if len(resBody.Results) != len(req.Queries) || len(assignedScope) != len(req.Queries) { + cclog.Warnf("Mismatch in query results count: queries=%d, results=%d, assignedScope=%d", + len(req.Queries), len(resBody.Results), len(assignedScope)) + if len(resBody.Results) > len(req.Queries) { + resBody.Results = resBody.Results[:len(req.Queries)] + } + if len(assignedScope) > len(req.Queries) { + assignedScope = assignedScope[:len(req.Queries)] + } + } + for i, row := range resBody.Results { + // Safety check to prevent index out of range errors + if i >= len(req.Queries) || i >= len(assignedScope) { + cclog.Warnf("Index out of range prevented: i=%d, queries=%d, assignedScope=%d", + i, len(req.Queries), len(assignedScope)) + continue + } + query := req.Queries[i] metric := query.Metric scope := assignedScope[i] mc := archive.GetMetricConfig(job.Cluster, metric) + + if mc == nil { + cclog.Warnf("Metric config not found for %s on cluster %s", metric, job.Cluster) + continue + } + if _, ok := jobData[metric]; !ok { jobData[metric] = make(map[schema.MetricScope]*schema.JobMetric) } @@ -260,8 +320,15 @@ func (ccms *CCMetricStore) LoadData( id := (*string)(nil) if query.Type != nil { - id = new(string) - *id = query.TypeIds[ndx] + // Check if ndx is within the bounds of TypeIds slice + if ndx < len(query.TypeIds) { + id = new(string) + *id = query.TypeIds[ndx] + } else { + // Log the error but 
continue processing + cclog.Warnf("TypeIds index out of range: %d with length %d for metric %s on host %s", + ndx, len(query.TypeIds), query.Metric, query.Hostname) + } } sanitizeStats(&res.Avg, &res.Min, &res.Max) @@ -412,8 +479,15 @@ func (ccms *CCMetricStore) LoadScopedStats( id := (*string)(nil) if query.Type != nil { - id = new(string) - *id = query.TypeIds[ndx] + // Check if ndx is within the bounds of TypeIds slice + if ndx < len(query.TypeIds) { + id = new(string) + *id = query.TypeIds[ndx] + } else { + // Log the error but continue processing + cclog.Warnf("TypeIds index out of range: %d with length %d for metric %s on host %s", + ndx, len(query.TypeIds), query.Metric, query.Hostname) + } } sanitizeStats(&res.Avg, &res.Min, &res.Max) @@ -561,6 +635,12 @@ func (ccms *CCMetricStore) LoadNodeListData( return nil, err } + // Verify assignment is correct - log any inconsistencies for debugging + if len(queries) != len(assignedScope) { + cclog.Errorf("Critical error: queries and assignedScope have different lengths after buildNodeQueries: %d vs %d", + len(queries), len(assignedScope)) + } + req := APIQueryRequest{ Cluster: cluster, Queries: queries, @@ -578,17 +658,47 @@ func (ccms *CCMetricStore) LoadNodeListData( var errors []string data := make(map[string]schema.JobData) + + // Add safety check for index out of range issues + if len(resBody.Results) != len(req.Queries) || len(assignedScope) != len(req.Queries) { + cclog.Warnf("Mismatch in query results count: queries=%d, results=%d, assignedScope=%d", + len(req.Queries), len(resBody.Results), len(assignedScope)) + if len(resBody.Results) > len(req.Queries) { + resBody.Results = resBody.Results[:len(req.Queries)] + } + if len(assignedScope) > len(req.Queries) { + assignedScope = assignedScope[:len(req.Queries)] + } + } + for i, row := range resBody.Results { + // Safety check to prevent index out of range errors + if i >= len(req.Queries) || i >= len(assignedScope) { + cclog.Warnf("Index out of range 
prevented: i=%d, queries=%d, assignedScope=%d", + i, len(req.Queries), len(assignedScope)) + continue + } + var query APIQuery if resBody.Queries != nil { - query = resBody.Queries[i] + if i < len(resBody.Queries) { + query = resBody.Queries[i] + } else { + cclog.Warnf("Index out of range prevented for resBody.Queries: i=%d, len=%d", + i, len(resBody.Queries)) + continue + } } else { query = req.Queries[i] } - // qdata := res[0] + metric := query.Metric scope := assignedScope[i] mc := archive.GetMetricConfig(cluster, metric) + if mc == nil { + cclog.Warnf("Metric config not found for %s on cluster %s", metric, cluster) + continue + } res := mc.Timestep if len(row) > 0 { @@ -627,8 +737,15 @@ func (ccms *CCMetricStore) LoadNodeListData( id := (*string)(nil) if query.Type != nil { - id = new(string) - *id = query.TypeIds[ndx] + // Check if ndx is within the bounds of TypeIds slice + if ndx < len(query.TypeIds) { + id = new(string) + *id = query.TypeIds[ndx] + } else { + // Log the error but continue processing + cclog.Warnf("TypeIds index out of range: %d with length %d for metric %s on host %s", + ndx, len(query.TypeIds), query.Metric, query.Hostname) + } } sanitizeStats(&res.Avg, &res.Min, &res.Max) diff --git a/internal/tagger/classifyJob.go b/internal/tagger/classifyJob.go index b5f30949..f8751047 100644 --- a/internal/tagger/classifyJob.go +++ b/internal/tagger/classifyJob.go @@ -190,6 +190,8 @@ func (t *JobClassTagger) EventCallback() { cclog.Fatal(err) } + t.rules = make(map[string]ruleInfo) + parametersFile := filepath.Join(t.cfgPath, parametersFileName) if util.CheckFileExists(parametersFile) { cclog.Info("Merge parameters") @@ -301,17 +303,21 @@ func (t *JobClassTagger) Register() error { // - Shared parameters defined in parameters.json // - Computed variables from the rule definition // -// Rules are evaluated in arbitrary order. If multiple rules match, only the first -// encountered match is applied (FIXME: this should handle multiple matches). 
+// Rules are evaluated in arbitrary order. Multiple rules can match and apply +// their tags to the same job. Hint messages from all matching rules are collected +// and stored as a combined message in the job metadata. func (t *JobClassTagger) Match(job *schema.Job) { jobStats, err := t.getStatistics(job) metricsList := t.getMetricConfig(job.Cluster, job.SubCluster) - cclog.Infof("Enter match rule with %d rules for job %d", len(t.rules), job.JobID) + cclog.Infof("Enter match rule with %d rules for job %d", len(t.rules), job.JobID) if err != nil { - cclog.Errorf("job classification failed for job %d: %#v", job.JobID, err) + cclog.Errorf("job classification failed for job %d: %#v", job.JobID, err) return } + id := *job.ID + var messages []string + for tag, ri := range t.rules { env := make(map[string]any) maps.Copy(env, ri.env) @@ -329,11 +335,13 @@ func (t *JobClassTagger) Match(job *schema.Job) { } // add metrics to env + skipRule := false for _, m := range ri.metrics { stats, ok := jobStats[m] if !ok { - cclog.Errorf("job classification failed for job %d: missing metric '%s'", job.JobID, m) - return + cclog.Errorf("job classification: missing metric '%s' for rule %s on job %d", m, tag, job.JobID) + skipRule = true + break } env[m] = map[string]any{ "min": stats.Min, @@ -347,44 +355,55 @@ func (t *JobClassTagger) Match(job *schema.Job) { }, } } + if skipRule { + continue + } // check rule requirements apply + requirementsMet := true for _, r := range ri.requirements { ok, err := expr.Run(r, env) if err != nil { cclog.Errorf("error running requirement for rule %s: %#v", tag, err) - return + requirementsMet = false + break } if !ok.(bool) { cclog.Infof("requirement for rule %s not met", tag) - return + requirementsMet = false + break } } + if !requirementsMet { + continue + } - // validate rule expression + // evaluate rule variables + varError := false for _, v := range ri.variables { value, err := expr.Run(v.expr, env) if err != nil { - cclog.Errorf("error 
running rule %s: %#v", tag, err) - return + cclog.Errorf("error evaluating variable %s for rule %s: %#v", v.name, tag, err) + varError = true + break } env[v.name] = value } - - // dump.P(env) + if varError { + continue + } match, err := expr.Run(ri.rule, env) if err != nil { cclog.Errorf("error running rule %s: %#v", tag, err) - return + continue } if match.(bool) { cclog.Info("Rule matches!") - id := *job.ID if !t.repo.HasTag(id, t.tagType, tag) { - _, err := t.repo.AddTagOrCreateDirect(id, t.tagType, tag) - if err != nil { - return + if _, err := t.repo.AddTagOrCreateDirect(id, t.tagType, tag); err != nil { + cclog.Errorf("failed to add tag '%s' to job %d: %v", tag, id, err) + continue } } @@ -392,17 +411,18 @@ func (t *JobClassTagger) Match(job *schema.Job) { var msg bytes.Buffer if err := ri.hint.Execute(&msg, env); err != nil { cclog.Errorf("Template error: %s", err.Error()) - return - } - - // FIXME: Handle case where multiple tags apply - // FIXME: Handle case where multiple tags apply - err = t.repo.UpdateMetadata(job, "message", msg.String()) - if err != nil { - return + continue } + messages = append(messages, msg.String()) } else { cclog.Info("Rule does not match!") } } + + if len(messages) > 0 { + combined := strings.Join(messages, "\n") + if err := t.repo.UpdateMetadata(job, "message", combined); err != nil { + cclog.Errorf("failed to update metadata for job %d: %v", *job.ID, err) + } + } } diff --git a/internal/tagger/detectApp.go b/internal/tagger/detectApp.go index c82c87bc..f86dcb6c 100644 --- a/internal/tagger/detectApp.go +++ b/internal/tagger/detectApp.go @@ -98,6 +98,8 @@ func (t *AppTagger) EventCallback() { cclog.Fatal(err) } + t.apps = make([]appInfo, 0) + for _, fn := range files { if fn.IsDir() { continue @@ -163,7 +165,7 @@ func (t *AppTagger) Register() error { // It fetches the job metadata, extracts the job script, and matches it against // all configured application patterns using regular expressions. 
// If a match is found, the corresponding application tag is added to the job. -// Only the first matching application is tagged. +// Multiple application tags can be applied if patterns for different apps match. func (t *AppTagger) Match(job *schema.Job) { r := repository.GetJobRepository() @@ -199,6 +201,7 @@ func (t *AppTagger) Match(job *schema.Job) { jobscriptLower := strings.ToLower(jobscript) cclog.Debugf("AppTagger: matching job %d (script length: %d) against %d apps", id, len(jobscriptLower), len(t.apps)) + matched := false for _, a := range t.apps { for _, re := range a.patterns { if re.MatchString(jobscriptLower) { @@ -210,10 +213,13 @@ func (t *AppTagger) Match(job *schema.Job) { cclog.Errorf("AppTagger: failed to add tag '%s' to job %d: %v", a.tag, id, err) } } - return + matched = true + break // matched this app, move to next app } } } - cclog.Debugf("AppTagger: no pattern matched for job %d on %s", id, job.Cluster) + if !matched { + cclog.Debugf("AppTagger: no pattern matched for job %d on %s", id, job.Cluster) + } } diff --git a/pkg/metricstore/query.go b/pkg/metricstore/query.go index 7dce5dcd..0a61efaa 100644 --- a/pkg/metricstore/query.go +++ b/pkg/metricstore/query.go @@ -93,6 +93,12 @@ func (ccms *InternalMetricStore) LoadData( return nil, err } + // Verify assignment is correct - log any inconsistencies for debugging + if len(queries) != len(assignedScope) { + cclog.Errorf("Critical error: queries and assignedScope have different lengths after buildQueries: %d vs %d", + len(queries), len(assignedScope)) + } + req := APIQueryRequest{ Cluster: job.Cluster, From: job.StartTime, @@ -110,9 +116,24 @@ func (ccms *InternalMetricStore) LoadData( var errors []string jobData := make(schema.JobData) + + // Add safety check for potential index out of range errors + if len(resBody.Results) != len(req.Queries) || len(assignedScope) != len(req.Queries) { + cclog.Warnf("Mismatch in query results count: queries=%d, results=%d, assignedScope=%d", + 
len(req.Queries), len(resBody.Results), len(assignedScope)) + if len(resBody.Results) > len(req.Queries) { + resBody.Results = resBody.Results[:len(req.Queries)] + } + if len(assignedScope) > len(req.Queries) { + assignedScope = assignedScope[:len(req.Queries)] + } + } + for i, row := range resBody.Results { - if len(row) == 0 { - // No Data Found For Metric, Logged in FetchData to Warn + // Safety check to prevent index out of range errors + if i >= len(req.Queries) || i >= len(assignedScope) { + cclog.Warnf("Index out of range prevented: i=%d, queries=%d, assignedScope=%d", + i, len(req.Queries), len(assignedScope)) continue } @@ -120,6 +141,12 @@ func (ccms *InternalMetricStore) LoadData( metric := query.Metric scope := assignedScope[i] mc := archive.GetMetricConfig(job.Cluster, metric) + + if mc == nil { + cclog.Warnf("Metric config not found for %s on cluster %s", metric, job.Cluster) + continue + } + if _, ok := jobData[metric]; !ok { jobData[metric] = make(map[schema.MetricScope]*schema.JobMetric) } @@ -148,8 +175,15 @@ func (ccms *InternalMetricStore) LoadData( id := (*string)(nil) if query.Type != nil { - id = new(string) - *id = query.TypeIds[ndx] + // Check if ndx is within the bounds of TypeIds slice + if ndx < len(query.TypeIds) { + id = new(string) + *id = query.TypeIds[ndx] + } else { + // Log the error but continue processing + cclog.Warnf("TypeIds index out of range: %d with length %d for metric %s on host %s", + ndx, len(query.TypeIds), query.Metric, query.Hostname) + } } sanitizeStats(&res) @@ -650,8 +684,15 @@ func (ccms *InternalMetricStore) LoadScopedStats( id := (*string)(nil) if query.Type != nil { - id = new(string) - *id = query.TypeIds[ndx] + // Check if ndx is within the bounds of TypeIds slice + if ndx < len(query.TypeIds) { + id = new(string) + *id = query.TypeIds[ndx] + } else { + // Log the error but continue processing + cclog.Warnf("TypeIds index out of range: %d with length %d for metric %s on host %s", + ndx, len(query.TypeIds), 
query.Metric, query.Hostname) + } } sanitizeStats(&res) @@ -823,6 +864,12 @@ func (ccms *InternalMetricStore) LoadNodeListData( return nil, err } + // Verify assignment is correct - log any inconsistencies for debugging + if len(queries) != len(assignedScope) { + cclog.Errorf("Critical error: queries and assignedScope have different lengths after buildNodeQueries: %d vs %d", + len(queries), len(assignedScope)) + } + req := APIQueryRequest{ Cluster: cluster, Queries: queries, @@ -840,14 +887,36 @@ func (ccms *InternalMetricStore) LoadNodeListData( var errors []string data := make(map[string]schema.JobData) + + // Add safety check for index out of range issues + if len(resBody.Results) != len(req.Queries) || len(assignedScope) != len(req.Queries) { + cclog.Warnf("Mismatch in query results count: queries=%d, results=%d, assignedScope=%d", + len(req.Queries), len(resBody.Results), len(assignedScope)) + if len(resBody.Results) > len(req.Queries) { + resBody.Results = resBody.Results[:len(req.Queries)] + } + if len(assignedScope) > len(req.Queries) { + assignedScope = assignedScope[:len(req.Queries)] + } + } + for i, row := range resBody.Results { - if len(row) == 0 { - // No Data Found For Metric, Logged in FetchData to Warn + // Safety check to prevent index out of range errors + if i >= len(req.Queries) || i >= len(assignedScope) { + cclog.Warnf("Index out of range prevented: i=%d, queries=%d, assignedScope=%d", + i, len(req.Queries), len(assignedScope)) continue } + var query APIQuery if resBody.Queries != nil { - query = resBody.Queries[i] + if i < len(resBody.Queries) { + query = resBody.Queries[i] + } else { + cclog.Warnf("Index out of range prevented for resBody.Queries: i=%d, len=%d", + i, len(resBody.Queries)) + continue + } } else { query = req.Queries[i] } @@ -855,6 +924,10 @@ func (ccms *InternalMetricStore) LoadNodeListData( metric := query.Metric scope := assignedScope[i] mc := archive.GetMetricConfig(cluster, metric) + if mc == nil { + cclog.Warnf("Metric 
config not found for %s on cluster %s", metric, cluster) + continue + } res := mc.Timestep if len(row) > 0 { @@ -893,8 +966,15 @@ func (ccms *InternalMetricStore) LoadNodeListData( id := (*string)(nil) if query.Type != nil { - id = new(string) - *id = query.TypeIds[ndx] + // Check if ndx is within the bounds of TypeIds slice + if ndx < len(query.TypeIds) { + id = new(string) + *id = query.TypeIds[ndx] + } else { + // Log the error but continue processing + cclog.Warnf("TypeIds index out of range: %d with length %d for metric %s on host %s", + ndx, len(query.TypeIds), query.Metric, query.Hostname) + } } sanitizeStats(&res) diff --git a/web/frontend/src/List.root.svelte b/web/frontend/src/List.root.svelte index 239bf5f1..e8af8f44 100644 --- a/web/frontend/src/List.root.svelte +++ b/web/frontend/src/List.root.svelte @@ -61,6 +61,9 @@ } return colbase }) + const sortedRows = $derived( + $stats.data ? sort($stats.data.rows, sorting, nameFilter) : [] + ); let stats = $derived( queryStore({ @@ -87,6 +90,40 @@ ); /* Functions */ + function exportCsv() { + const isUser = type === "USER"; + const header = [ + isUser ? "Username" : "Project", + ...(isUser ? ["Name"] : []), + "Total Jobs", + "Short Jobs", + ...(fetchRunning ? ["Total Cores", "Total Accelerators"] : []), + "Total Walltime", + "Total Core Hours", + "Total Accelerator Hours", + ]; + const rows = sortedRows.map((row) => [ + row.id, + ...(isUser ? [row?.name ?? ""] : []), + row.totalJobs, + row.shortJobs, + ...(fetchRunning ? [row.totalCores, row.totalAccs] : []), + row.totalWalltime, + row.totalCoreHours, + row.totalAccHours, + ]); + const csv = [header, ...rows] + .map((row) => row.map((v) => `"${String(v ?? 
"").replace(/"/g, '""')}"`).join(",")) + .join("\n"); + const blob = new Blob([csv], { type: "text/csv" }); + const url = URL.createObjectURL(blob); + const a = document.createElement("a"); + a.href = url; + a.download = `${type.toLowerCase()}s.csv`; + a.click(); + URL.revokeObjectURL(url); + } + function changeSorting(newField) { if (sorting.field == newField) { // Same Field, Change Direction @@ -137,6 +174,14 @@ PROJECT: 'project', }[type]}" /> + diff --git a/web/frontend/src/generic/utils.js b/web/frontend/src/generic/utils.js index b012843d..09239ec8 100644 --- a/web/frontend/src/generic/utils.js +++ b/web/frontend/src/generic/utils.js @@ -60,6 +60,7 @@ export function init(extraInitQuery = "") { topology { node socket + memoryDomain core accelerators { id } } @@ -238,7 +239,7 @@ export function groupByScope(jobMetrics) { const scopeGranularity = { node: 10, socket: 5, - memorydomain: 4, + memoryDomain: 4, core: 3, hwthread: 2, accelerator: 1 diff --git a/web/frontend/src/status/dashdetails/HealthDash.svelte b/web/frontend/src/status/dashdetails/HealthDash.svelte index b4063309..2730642b 100644 --- a/web/frontend/src/status/dashdetails/HealthDash.svelte +++ b/web/frontend/src/status/dashdetails/HealthDash.svelte @@ -107,6 +107,12 @@ return pendingTableData }); + const refinedStateData = $derived.by(() => { + return $statusQuery?.data?.nodeStates. + filter((e) => ['allocated', 'reserved', 'idle', 'mixed','down', 'unknown'].includes(e.state)). + sort((a, b) => b.count - a.count) + }); + const refinedHealthData = $derived.by(() => { return $statusQuery?.data?.nodeStates. filter((e) => ['full', 'partial', 'failed'].includes(e.state)).