6 Commits

Author SHA1 Message Date
01ec70baa8 Iterate over subCluster MetricConfig directly so that removed metrics are not included
Entire-Checkpoint: efb6f0a96069
2026-03-20 11:39:34 +01:00
fb176c5afb Remove static linkage for helper tools 2026-03-20 09:34:49 +01:00
999d93efc3 Fix goreleaser config. Cleanup. 2026-03-20 09:19:13 +01:00
359962d166 Fix typo 2026-03-20 08:23:46 +01:00
60554896d5 Update ReleaseNote for upcoming release
Entire-Checkpoint: 30099a746fc7
2026-03-20 08:21:16 +01:00
bf48389aeb Optimize sortby in stats queries
Entire-Checkpoint: 9b5b833472e1
2026-03-20 05:39:22 +01:00
7 changed files with 110 additions and 35 deletions

View File

@@ -5,6 +5,7 @@ before:
builds: builds:
- env: - env:
- CGO_ENABLED=1 - CGO_ENABLED=1
- CC=x86_64-linux-musl-gcc
goos: goos:
- linux - linux
goarch: goarch:

View File

@@ -10,7 +10,10 @@ If you are upgrading from v1.5.0 you need to do another DB migration. This
should not take long. For optimal database performance after the migration it is should not take long. For optimal database performance after the migration it is
recommended to apply the new `optimize-db` flag, which runs the sqlite `ANALYZE` recommended to apply the new `optimize-db` flag, which runs the sqlite `ANALYZE`
and `VACUUM` commands. Depending on your database size (more than 40GB) the and `VACUUM` commands. Depending on your database size (more than 40GB) the
`VACUUM` may take up to 2h. `VACUUM` may take up to 2h. You can also run the `ANALYZE` command manually.
While we are confident that the memory issue with the metricstore cleanup move
policy is fixed, it is still recommended to use the delete policy for cleanup.
This is also the default.
## Changes in 1.5.2 ## Changes in 1.5.2
@@ -19,6 +22,14 @@ and `VACUUM` commands. Depending on your database size (more then 40GB) the
- **Memory spike in parquet writer**: Fixed memory spikes when using the - **Memory spike in parquet writer**: Fixed memory spikes when using the
metricstore move (archive) policy with the parquet writer. The writer now metricstore move (archive) policy with the parquet writer. The writer now
processes data in a streaming fashion to avoid accumulating large allocations. processes data in a streaming fashion to avoid accumulating large allocations.
- **Top list query fixes**: Fixed top list queries in analysis and dashboard
views.
- **Exclude down nodes from HealthCheck**: Down nodes are now excluded from
health checks in both the REST and NATS handlers.
- **Node state priority order**: Node state determination now enforces a
priority order. Exception: idle+down results in idle.
- **Blocking ReceiveNats call**: Fixed a blocking NATS receive call in the
metricstore.
### Database performance ### Database performance
@@ -33,6 +44,16 @@ and `VACUUM` commands. Depending on your database size (more then 40GB) the
write load. write load.
- **Increased default SQLite timeout**: The default SQLite connection timeout - **Increased default SQLite timeout**: The default SQLite connection timeout
has been raised to reduce spurious timeout errors under load. has been raised to reduce spurious timeout errors under load.
- **Optimized stats queries**: Improved sortby handling in stats queries, fixed
cache key passing, and simplified a stats query condition that caused an
expensive unnecessary subquery.
### MetricStore performance
- **Sharded WAL consumer**: The WAL consumer is now sharded for significantly
higher write throughput.
- **NATS contention fix**: Fixed contention in the metricstore NATS ingestion
path.
### NATS API ### NATS API
@@ -52,6 +73,24 @@ and `VACUUM` commands. Depending on your database size (more then 40GB) the
operation. operation.
- **Checkpoint archiving log**: Added an informational log message when the - **Checkpoint archiving log**: Added an informational log message when the
metricstore checkpoint archiving process runs. metricstore checkpoint archiving process runs.
- **Auth failure context**: Auth failure log messages now include more context
information.
### Behavior changes
- **DB-based metricHealth**: Replaced heuristic-based metric health with
DB-based metric health for the node view, providing more accurate health
status information.
- **Removed minRunningFor filter remnants**: Cleaned up remaining `minRunningFor`
references from the GraphQL schema and query builder.
### Frontend
- **Streamlined statsSeries**: Unified stats series calculation and rendering
across plot components.
- **Clarified plot titles**: Improved titles in dashboard and health views.
- **Bumped frontend dependencies**: Updated frontend dependencies to latest
versions.
### Dependencies ### Dependencies
@@ -67,7 +106,7 @@ and `VACUUM` commands. Depending on your database size (more then 40GB) the
running has to be allowed to execute the journalctl command. running has to be allowed to execute the journalctl command.
- The user configuration keys for the ui have changed. Therefore old user - The user configuration keys for the ui have changed. Therefore old user
configuration persisted in the database is not used anymore. It is recommended configuration persisted in the database is not used anymore. It is recommended
to configure the metrics shown in the ui-config sestion and remove all records to configure the metrics shown in the ui-config section and remove all records
in the table after the update. in the table after the update.
- Currently energy footprint metrics of type energy are ignored for calculating - Currently energy footprint metrics of type energy are ignored for calculating
total energy. total energy.

4
go.sum
View File

@@ -4,10 +4,6 @@ github.com/99designs/gqlgen v0.17.88 h1:neMQDgehMwT1vYIOx/w5ZYPUU/iMNAJzRO44I5In
github.com/99designs/gqlgen v0.17.88/go.mod h1:qeqYFEgOeSKqWedOjogPizimp2iu4E23bdPvl4jTYic= github.com/99designs/gqlgen v0.17.88/go.mod h1:qeqYFEgOeSKqWedOjogPizimp2iu4E23bdPvl4jTYic=
github.com/Azure/go-ntlmssp v0.1.0 h1:DjFo6YtWzNqNvQdrwEyr/e4nhU3vRiwenz5QX7sFz+A= github.com/Azure/go-ntlmssp v0.1.0 h1:DjFo6YtWzNqNvQdrwEyr/e4nhU3vRiwenz5QX7sFz+A=
github.com/Azure/go-ntlmssp v0.1.0/go.mod h1:NYqdhxd/8aAct/s4qSYZEerdPuH1liG2/X9DiVTbhpk= github.com/Azure/go-ntlmssp v0.1.0/go.mod h1:NYqdhxd/8aAct/s4qSYZEerdPuH1liG2/X9DiVTbhpk=
github.com/ClusterCockpit/cc-lib/v2 v2.8.2 h1:rCLZk8wz8yq8xBnBEdVKigvA2ngR8dPmHbEFwxxb3jw=
github.com/ClusterCockpit/cc-lib/v2 v2.8.2/go.mod h1:FwD8vnTIbBM3ngeLNKmCvp9FoSjQZm7xnuaVxEKR23o=
github.com/ClusterCockpit/cc-lib/v2 v2.9.0 h1:mzUYakcjwb+UP5II4jOvr36rSYct90gXBbtUg+nvm9c=
github.com/ClusterCockpit/cc-lib/v2 v2.9.0/go.mod h1:FwD8vnTIbBM3ngeLNKmCvp9FoSjQZm7xnuaVxEKR23o=
github.com/ClusterCockpit/cc-lib/v2 v2.9.1 h1:eplKhXQyGAElBGCEGdmxwj7fLv26Op16uK0KxUePDak= github.com/ClusterCockpit/cc-lib/v2 v2.9.1 h1:eplKhXQyGAElBGCEGdmxwj7fLv26Op16uK0KxUePDak=
github.com/ClusterCockpit/cc-lib/v2 v2.9.1/go.mod h1:FwD8vnTIbBM3ngeLNKmCvp9FoSjQZm7xnuaVxEKR23o= github.com/ClusterCockpit/cc-lib/v2 v2.9.1/go.mod h1:FwD8vnTIbBM3ngeLNKmCvp9FoSjQZm7xnuaVxEKR23o=
github.com/ClusterCockpit/cc-line-protocol/v2 v2.4.0 h1:hIzxgTBWcmCIHtoDKDkSCsKCOCOwUC34sFsbD2wcW0Q= github.com/ClusterCockpit/cc-line-protocol/v2 v2.4.0 h1:hIzxgTBWcmCIHtoDKDkSCsKCOCOwUC34sFsbD2wcW0Q=

View File

@@ -676,6 +676,11 @@ func (r *queryResolver) JobsStatistics(ctx context.Context, filter []*model.JobF
// Use request-scoped cache: multiple aliases with same (filter, groupBy) // Use request-scoped cache: multiple aliases with same (filter, groupBy)
// but different sortBy/page hit the DB only once. // but different sortBy/page hit the DB only once.
if cache := getStatsGroupCache(ctx); cache != nil { if cache := getStatsGroupCache(ctx); cache != nil {
// Ensure the sort field is computed even if not in the GraphQL selection,
// because sortAndPageStats will sort by it in memory.
if sortBy != nil {
reqFields[sortByFieldName(*sortBy)] = true
}
key := statsCacheKey(filter, groupBy, reqFields) key := statsCacheKey(filter, groupBy, reqFields)
var allStats []*model.JobsStatistics var allStats []*model.JobsStatistics
allStats, err = cache.getOrCompute(key, func() ([]*model.JobsStatistics, error) { allStats, err = cache.getOrCompute(key, func() ([]*model.JobsStatistics, error) {

View File

@@ -107,6 +107,33 @@ func sortAndPageStats(allStats []*model.JobsStatistics, sortBy *model.SortByAggr
return sorted return sorted
} }
// sortByFieldName maps a SortByAggregate enum to the corresponding reqFields key.
// This ensures the DB computes the column that sortAndPageStats will sort by.
func sortByFieldName(sortBy model.SortByAggregate) string {
switch sortBy {
case model.SortByAggregateTotaljobs:
return "totalJobs"
case model.SortByAggregateTotalusers:
return "totalUsers"
case model.SortByAggregateTotalwalltime:
return "totalWalltime"
case model.SortByAggregateTotalnodes:
return "totalNodes"
case model.SortByAggregateTotalnodehours:
return "totalNodeHours"
case model.SortByAggregateTotalcores:
return "totalCores"
case model.SortByAggregateTotalcorehours:
return "totalCoreHours"
case model.SortByAggregateTotalaccs:
return "totalAccs"
case model.SortByAggregateTotalacchours:
return "totalAccHours"
default:
return "totalJobs"
}
}
// statsFieldGetter returns a function that extracts the sortable int field // statsFieldGetter returns a function that extracts the sortable int field
// from a JobsStatistics struct for the given sort key. // from a JobsStatistics struct for the given sort key.
func statsFieldGetter(sortBy model.SortByAggregate) func(*model.JobsStatistics) int { func statsFieldGetter(sortBy model.SortByAggregate) func(*model.JobsStatistics) int {

View File

@@ -198,25 +198,12 @@ func GetSubCluster(cluster, subcluster string) (*schema.SubCluster, error) {
func GetMetricConfigSubCluster(cluster, subcluster string) map[string]*schema.Metric { func GetMetricConfigSubCluster(cluster, subcluster string) map[string]*schema.Metric {
metrics := make(map[string]*schema.Metric) metrics := make(map[string]*schema.Metric)
for _, c := range Clusters { sc, err := GetSubCluster(cluster, subcluster)
if c.Name == cluster { if err != nil {
for _, m := range c.MetricConfig { return metrics
for _, s := range m.SubClusters {
if s.Name == subcluster {
metrics[m.Name] = &schema.Metric{
Name: m.Name,
Unit: s.Unit,
Peak: s.Peak,
Normal: s.Normal,
Caution: s.Caution,
Alert: s.Alert,
}
break
}
} }
_, ok := metrics[m.Name] for _, m := range sc.MetricConfig {
if !ok {
metrics[m.Name] = &schema.Metric{ metrics[m.Name] = &schema.Metric{
Name: m.Name, Name: m.Name,
Unit: m.Unit, Unit: m.Unit,
@@ -226,10 +213,6 @@ func GetMetricConfigSubCluster(cluster, subcluster string) map[string]*schema.Me
Alert: m.Alert, Alert: m.Alert,
} }
} }
}
break
}
}
return metrics return metrics
} }

View File

@@ -37,3 +37,27 @@ func TestClusterConfig(t *testing.T) {
// spew.Dump(archive.GlobalMetricList) // spew.Dump(archive.GlobalMetricList)
// t.Fail() // t.Fail()
} }
func TestGetMetricConfigSubClusterRespectsRemovedMetrics(t *testing.T) {
if err := archive.Init(json.RawMessage(`{"kind": "file","path": "testdata/archive"}`)); err != nil {
t.Fatal(err)
}
sc, err := archive.GetSubCluster("fritz", "spr2tb")
if err != nil {
t.Fatal(err)
}
metrics := archive.GetMetricConfigSubCluster("fritz", "spr2tb")
if len(metrics) != len(sc.MetricConfig) {
t.Fatalf("GetMetricConfigSubCluster() returned %d metrics, want %d", len(metrics), len(sc.MetricConfig))
}
if _, ok := metrics["flops_any"]; ok {
t.Fatalf("GetMetricConfigSubCluster() returned removed metric flops_any for subcluster spr2tb")
}
if _, ok := metrics["cpu_power"]; !ok {
t.Fatalf("GetMetricConfigSubCluster() missing active metric cpu_power for subcluster spr2tb")
}
}