6 Commits

Author SHA1 Message Date
01ec70baa8 Iterate over subCluster MetricConfig directly so that removed metrics are not included
Entire-Checkpoint: efb6f0a96069
2026-03-20 11:39:34 +01:00
fb176c5afb Remove static linkage for helper tools 2026-03-20 09:34:49 +01:00
999d93efc3 Fix goreleaser config. Cleanup. 2026-03-20 09:19:13 +01:00
359962d166 Fix typo 2026-03-20 08:23:46 +01:00
60554896d5 Update ReleaseNote for upcoming release
Entire-Checkpoint: 30099a746fc7
2026-03-20 08:21:16 +01:00
bf48389aeb Optimize sortby in stats queries
Entire-Checkpoint: 9b5b833472e1
2026-03-20 05:39:22 +01:00
7 changed files with 110 additions and 35 deletions

View File

@@ -5,6 +5,7 @@ before:
builds: builds:
- env: - env:
- CGO_ENABLED=1 - CGO_ENABLED=1
- CC=x86_64-linux-musl-gcc
goos: goos:
- linux - linux
goarch: goarch:

View File

@@ -10,7 +10,10 @@ If you are upgrading from v1.5.0 you need to do another DB migration. This
should not take long. For optimal database performance after the migration it is should not take long. For optimal database performance after the migration it is
recommended to apply the new `optimize-db` flag, which runs the sqlite `ANALYZE` recommended to apply the new `optimize-db` flag, which runs the sqlite `ANALYZE`
and `VACUUM` commands. Depending on your database size (more than 40GB) the and `VACUUM` commands. Depending on your database size (more than 40GB) the
`VACUUM` may take up to 2h. `VACUUM` may take up to 2h. You can also run the `ANALYZE` command manually.
While we are confident that the memory issue with the metricstore cleanup move
policy is fixed, it is still recommended to use the delete policy for cleanup.
This is also the default.
## Changes in 1.5.2 ## Changes in 1.5.2
@@ -19,6 +22,14 @@ and `VACUUM` commands. Depending on your database size (more then 40GB) the
- **Memory spike in parquet writer**: Fixed memory spikes when using the - **Memory spike in parquet writer**: Fixed memory spikes when using the
metricstore move (archive) policy with the parquet writer. The writer now metricstore move (archive) policy with the parquet writer. The writer now
processes data in a streaming fashion to avoid accumulating large allocations. processes data in a streaming fashion to avoid accumulating large allocations.
- **Top list query fixes**: Fixed top list queries in analysis and dashboard
views.
- **Exclude down nodes from HealthCheck**: Down nodes are now excluded from
health checks in both the REST and NATS handlers.
- **Node state priority order**: Node state determination now enforces a
priority order. Exception: idle+down results in idle.
- **Blocking ReceiveNats call**: Fixed a blocking NATS receive call in the
metricstore.
### Database performance ### Database performance
@@ -33,6 +44,16 @@ and `VACUUM` commands. Depending on your database size (more then 40GB) the
write load. write load.
- **Increased default SQLite timeout**: The default SQLite connection timeout - **Increased default SQLite timeout**: The default SQLite connection timeout
has been raised to reduce spurious timeout errors under load. has been raised to reduce spurious timeout errors under load.
- **Optimized stats queries**: Improved sortby handling in stats queries, fixed
cache key passing, and simplified a stats query condition that caused an
expensive unnecessary subquery.
### MetricStore performance
- **Sharded WAL consumer**: The WAL consumer is now sharded for significantly
higher write throughput.
- **NATS contention fix**: Fixed contention in the metricstore NATS ingestion
path.
### NATS API ### NATS API
@@ -52,6 +73,24 @@ and `VACUUM` commands. Depending on your database size (more then 40GB) the
operation. operation.
- **Checkpoint archiving log**: Added an informational log message when the - **Checkpoint archiving log**: Added an informational log message when the
metricstore checkpoint archiving process runs. metricstore checkpoint archiving process runs.
- **Auth failure context**: Auth failure log messages now include more context
information.
### Behavior changes
- **DB-based metricHealth**: Replaced heuristic-based metric health with
DB-based metric health for the node view, providing more accurate health
status information.
- **Removed minRunningFor filter remnants**: Cleaned up remaining `minRunningFor`
references from the GraphQL schema and query builder.
### Frontend
- **Streamlined statsSeries**: Unified stats series calculation and rendering
across plot components.
- **Clarified plot titles**: Improved titles in dashboard and health views.
- **Bumped frontend dependencies**: Updated frontend dependencies to latest
versions.
### Dependencies ### Dependencies
@@ -67,7 +106,7 @@ and `VACUUM` commands. Depending on your database size (more then 40GB) the
running has to be allowed to execute the journalctl command. running has to be allowed to execute the journalctl command.
- The user configuration keys for the ui have changed. Therefore old user - The user configuration keys for the ui have changed. Therefore old user
configuration persisted in the database is not used anymore. It is recommended configuration persisted in the database is not used anymore. It is recommended
to configure the metrics shown in the ui-config sestion and remove all records to configure the metrics shown in the ui-config section and remove all records
in the table after the update. in the table after the update.
- Currently energy footprint metrics of type energy are ignored for calculating - Currently energy footprint metrics of type energy are ignored for calculating
total energy. total energy.

4
go.sum
View File

@@ -4,10 +4,6 @@ github.com/99designs/gqlgen v0.17.88 h1:neMQDgehMwT1vYIOx/w5ZYPUU/iMNAJzRO44I5In
github.com/99designs/gqlgen v0.17.88/go.mod h1:qeqYFEgOeSKqWedOjogPizimp2iu4E23bdPvl4jTYic= github.com/99designs/gqlgen v0.17.88/go.mod h1:qeqYFEgOeSKqWedOjogPizimp2iu4E23bdPvl4jTYic=
github.com/Azure/go-ntlmssp v0.1.0 h1:DjFo6YtWzNqNvQdrwEyr/e4nhU3vRiwenz5QX7sFz+A= github.com/Azure/go-ntlmssp v0.1.0 h1:DjFo6YtWzNqNvQdrwEyr/e4nhU3vRiwenz5QX7sFz+A=
github.com/Azure/go-ntlmssp v0.1.0/go.mod h1:NYqdhxd/8aAct/s4qSYZEerdPuH1liG2/X9DiVTbhpk= github.com/Azure/go-ntlmssp v0.1.0/go.mod h1:NYqdhxd/8aAct/s4qSYZEerdPuH1liG2/X9DiVTbhpk=
github.com/ClusterCockpit/cc-lib/v2 v2.8.2 h1:rCLZk8wz8yq8xBnBEdVKigvA2ngR8dPmHbEFwxxb3jw=
github.com/ClusterCockpit/cc-lib/v2 v2.8.2/go.mod h1:FwD8vnTIbBM3ngeLNKmCvp9FoSjQZm7xnuaVxEKR23o=
github.com/ClusterCockpit/cc-lib/v2 v2.9.0 h1:mzUYakcjwb+UP5II4jOvr36rSYct90gXBbtUg+nvm9c=
github.com/ClusterCockpit/cc-lib/v2 v2.9.0/go.mod h1:FwD8vnTIbBM3ngeLNKmCvp9FoSjQZm7xnuaVxEKR23o=
github.com/ClusterCockpit/cc-lib/v2 v2.9.1 h1:eplKhXQyGAElBGCEGdmxwj7fLv26Op16uK0KxUePDak= github.com/ClusterCockpit/cc-lib/v2 v2.9.1 h1:eplKhXQyGAElBGCEGdmxwj7fLv26Op16uK0KxUePDak=
github.com/ClusterCockpit/cc-lib/v2 v2.9.1/go.mod h1:FwD8vnTIbBM3ngeLNKmCvp9FoSjQZm7xnuaVxEKR23o= github.com/ClusterCockpit/cc-lib/v2 v2.9.1/go.mod h1:FwD8vnTIbBM3ngeLNKmCvp9FoSjQZm7xnuaVxEKR23o=
github.com/ClusterCockpit/cc-line-protocol/v2 v2.4.0 h1:hIzxgTBWcmCIHtoDKDkSCsKCOCOwUC34sFsbD2wcW0Q= github.com/ClusterCockpit/cc-line-protocol/v2 v2.4.0 h1:hIzxgTBWcmCIHtoDKDkSCsKCOCOwUC34sFsbD2wcW0Q=

View File

@@ -676,6 +676,11 @@ func (r *queryResolver) JobsStatistics(ctx context.Context, filter []*model.JobF
// Use request-scoped cache: multiple aliases with same (filter, groupBy) // Use request-scoped cache: multiple aliases with same (filter, groupBy)
// but different sortBy/page hit the DB only once. // but different sortBy/page hit the DB only once.
if cache := getStatsGroupCache(ctx); cache != nil { if cache := getStatsGroupCache(ctx); cache != nil {
// Ensure the sort field is computed even if not in the GraphQL selection,
// because sortAndPageStats will sort by it in memory.
if sortBy != nil {
reqFields[sortByFieldName(*sortBy)] = true
}
key := statsCacheKey(filter, groupBy, reqFields) key := statsCacheKey(filter, groupBy, reqFields)
var allStats []*model.JobsStatistics var allStats []*model.JobsStatistics
allStats, err = cache.getOrCompute(key, func() ([]*model.JobsStatistics, error) { allStats, err = cache.getOrCompute(key, func() ([]*model.JobsStatistics, error) {

View File

@@ -107,6 +107,33 @@ func sortAndPageStats(allStats []*model.JobsStatistics, sortBy *model.SortByAggr
return sorted return sorted
} }
// sortByFieldName maps a SortByAggregate enum to the corresponding reqFields key.
// This ensures the DB computes the column that sortAndPageStats will sort by.
func sortByFieldName(sortBy model.SortByAggregate) string {
switch sortBy {
case model.SortByAggregateTotaljobs:
return "totalJobs"
case model.SortByAggregateTotalusers:
return "totalUsers"
case model.SortByAggregateTotalwalltime:
return "totalWalltime"
case model.SortByAggregateTotalnodes:
return "totalNodes"
case model.SortByAggregateTotalnodehours:
return "totalNodeHours"
case model.SortByAggregateTotalcores:
return "totalCores"
case model.SortByAggregateTotalcorehours:
return "totalCoreHours"
case model.SortByAggregateTotalaccs:
return "totalAccs"
case model.SortByAggregateTotalacchours:
return "totalAccHours"
default:
return "totalJobs"
}
}
// statsFieldGetter returns a function that extracts the sortable int field // statsFieldGetter returns a function that extracts the sortable int field
// from a JobsStatistics struct for the given sort key. // from a JobsStatistics struct for the given sort key.
func statsFieldGetter(sortBy model.SortByAggregate) func(*model.JobsStatistics) int { func statsFieldGetter(sortBy model.SortByAggregate) func(*model.JobsStatistics) int {

View File

@@ -198,25 +198,12 @@ func GetSubCluster(cluster, subcluster string) (*schema.SubCluster, error) {
func GetMetricConfigSubCluster(cluster, subcluster string) map[string]*schema.Metric { func GetMetricConfigSubCluster(cluster, subcluster string) map[string]*schema.Metric {
metrics := make(map[string]*schema.Metric) metrics := make(map[string]*schema.Metric)
for _, c := range Clusters { sc, err := GetSubCluster(cluster, subcluster)
if c.Name == cluster { if err != nil {
for _, m := range c.MetricConfig { return metrics
for _, s := range m.SubClusters {
if s.Name == subcluster {
metrics[m.Name] = &schema.Metric{
Name: m.Name,
Unit: s.Unit,
Peak: s.Peak,
Normal: s.Normal,
Caution: s.Caution,
Alert: s.Alert,
}
break
}
} }
_, ok := metrics[m.Name] for _, m := range sc.MetricConfig {
if !ok {
metrics[m.Name] = &schema.Metric{ metrics[m.Name] = &schema.Metric{
Name: m.Name, Name: m.Name,
Unit: m.Unit, Unit: m.Unit,
@@ -226,10 +213,6 @@ func GetMetricConfigSubCluster(cluster, subcluster string) map[string]*schema.Me
Alert: m.Alert, Alert: m.Alert,
} }
} }
}
break
}
}
return metrics return metrics
} }

View File

@@ -37,3 +37,27 @@ func TestClusterConfig(t *testing.T) {
// spew.Dump(archive.GlobalMetricList) // spew.Dump(archive.GlobalMetricList)
// t.Fail() // t.Fail()
} }
func TestGetMetricConfigSubClusterRespectsRemovedMetrics(t *testing.T) {
if err := archive.Init(json.RawMessage(`{"kind": "file","path": "testdata/archive"}`)); err != nil {
t.Fatal(err)
}
sc, err := archive.GetSubCluster("fritz", "spr2tb")
if err != nil {
t.Fatal(err)
}
metrics := archive.GetMetricConfigSubCluster("fritz", "spr2tb")
if len(metrics) != len(sc.MetricConfig) {
t.Fatalf("GetMetricConfigSubCluster() returned %d metrics, want %d", len(metrics), len(sc.MetricConfig))
}
if _, ok := metrics["flops_any"]; ok {
t.Fatalf("GetMetricConfigSubCluster() returned removed metric flops_any for subcluster spr2tb")
}
if _, ok := metrics["cpu_power"]; !ok {
t.Fatalf("GetMetricConfigSubCluster() missing active metric cpu_power for subcluster spr2tb")
}
}