mirror of
https://github.com/ClusterCockpit/cc-backend
synced 2026-03-21 15:27:30 +01:00
Compare commits
6 Commits
feature/52
...
hotfix
| Author | SHA1 | Date | |
|---|---|---|---|
| 01ec70baa8 | |||
| fb176c5afb | |||
| 999d93efc3 | |||
| 359962d166 | |||
| 60554896d5 | |||
| bf48389aeb |
@@ -5,6 +5,7 @@ before:
|
|||||||
builds:
|
builds:
|
||||||
- env:
|
- env:
|
||||||
- CGO_ENABLED=1
|
- CGO_ENABLED=1
|
||||||
|
- CC=x86_64-linux-musl-gcc
|
||||||
goos:
|
goos:
|
||||||
- linux
|
- linux
|
||||||
goarch:
|
goarch:
|
||||||
|
|||||||
@@ -10,7 +10,10 @@ If you are upgrading from v1.5.0 you need to do another DB migration. This
|
|||||||
should not take long. For optimal database performance after the migration it is
|
should not take long. For optimal database performance after the migration it is
|
||||||
recommended to apply the new `optimize-db` flag, which runs the sqlite `ANALYZE`
|
recommended to apply the new `optimize-db` flag, which runs the sqlite `ANALYZE`
|
||||||
and `VACUUM` commands. Depending on your database size (more than 40GB) the
|
and `VACUUM` commands. Depending on your database size (more than 40GB) the
|
||||||
`VACUUM` may take up to 2h.
|
`VACUUM` may take up to 2h. You can also run the `ANALYZE` command manually.
|
||||||
|
While we are confident that the memory issue with the metricstore cleanup move
|
||||||
|
policy is fixed, it is still recommended to use the delete policy for cleanup.
|
||||||
|
This is also the default.
|
||||||
|
|
||||||
## Changes in 1.5.2
|
## Changes in 1.5.2
|
||||||
|
|
||||||
@@ -19,6 +22,14 @@ and `VACUUM` commands. Depending on your database size (more then 40GB) the
|
|||||||
- **Memory spike in parquet writer**: Fixed memory spikes when using the
|
- **Memory spike in parquet writer**: Fixed memory spikes when using the
|
||||||
metricstore move (archive) policy with the parquet writer. The writer now
|
metricstore move (archive) policy with the parquet writer. The writer now
|
||||||
processes data in a streaming fashion to avoid accumulating large allocations.
|
processes data in a streaming fashion to avoid accumulating large allocations.
|
||||||
|
- **Top list query fixes**: Fixed top list queries in analysis and dashboard
|
||||||
|
views.
|
||||||
|
- **Exclude down nodes from HealthCheck**: Down nodes are now excluded from
|
||||||
|
health checks in both the REST and NATS handlers.
|
||||||
|
- **Node state priority order**: Node state determination now enforces a
|
||||||
|
priority order. Exception: idle+down results in idle.
|
||||||
|
- **Blocking ReceiveNats call**: Fixed a blocking NATS receive call in the
|
||||||
|
metricstore.
|
||||||
|
|
||||||
### Database performance
|
### Database performance
|
||||||
|
|
||||||
@@ -33,6 +44,16 @@ and `VACUUM` commands. Depending on your database size (more then 40GB) the
|
|||||||
write load.
|
write load.
|
||||||
- **Increased default SQLite timeout**: The default SQLite connection timeout
|
- **Increased default SQLite timeout**: The default SQLite connection timeout
|
||||||
has been raised to reduce spurious timeout errors under load.
|
has been raised to reduce spurious timeout errors under load.
|
||||||
|
- **Optimized stats queries**: Improved sortby handling in stats queries, fixed
|
||||||
|
cache key passing, and simplified a stats query condition that caused an
|
||||||
|
expensive unnecessary subquery.
|
||||||
|
|
||||||
|
### MetricStore performance
|
||||||
|
|
||||||
|
- **Sharded WAL consumer**: The WAL consumer is now sharded for significantly
|
||||||
|
higher write throughput.
|
||||||
|
- **NATS contention fix**: Fixed contention in the metricstore NATS ingestion
|
||||||
|
path.
|
||||||
|
|
||||||
### NATS API
|
### NATS API
|
||||||
|
|
||||||
@@ -52,6 +73,24 @@ and `VACUUM` commands. Depending on your database size (more then 40GB) the
|
|||||||
operation.
|
operation.
|
||||||
- **Checkpoint archiving log**: Added an informational log message when the
|
- **Checkpoint archiving log**: Added an informational log message when the
|
||||||
metricstore checkpoint archiving process runs.
|
metricstore checkpoint archiving process runs.
|
||||||
|
- **Auth failure context**: Auth failure log messages now include more context
|
||||||
|
information.
|
||||||
|
|
||||||
|
### Behavior changes
|
||||||
|
|
||||||
|
- **DB-based metricHealth**: Replaced heuristic-based metric health with
|
||||||
|
DB-based metric health for the node view, providing more accurate health
|
||||||
|
status information.
|
||||||
|
- **Removed minRunningFor filter remnants**: Cleaned up remaining `minRunningFor`
|
||||||
|
references from the GraphQL schema and query builder.
|
||||||
|
|
||||||
|
### Frontend
|
||||||
|
|
||||||
|
- **Streamlined statsSeries**: Unified stats series calculation and rendering
|
||||||
|
across plot components.
|
||||||
|
- **Clarified plot titles**: Improved titles in dashboard and health views.
|
||||||
|
- **Bumped frontend dependencies**: Updated frontend dependencies to latest
|
||||||
|
versions.
|
||||||
|
|
||||||
### Dependencies
|
### Dependencies
|
||||||
|
|
||||||
@@ -67,7 +106,7 @@ and `VACUUM` commands. Depending on your database size (more then 40GB) the
|
|||||||
running has to be allowed to execute the journalctl command.
|
running has to be allowed to execute the journalctl command.
|
||||||
- The user configuration keys for the ui have changed. Therefore old user
|
- The user configuration keys for the ui have changed. Therefore old user
|
||||||
configuration persisted in the database is not used anymore. It is recommended
|
configuration persisted in the database is not used anymore. It is recommended
|
||||||
to configure the metrics shown in the ui-config sestion and remove all records
|
to configure the metrics shown in the ui-config section and remove all records
|
||||||
in the table after the update.
|
in the table after the update.
|
||||||
- Currently energy footprint metrics of type energy are ignored for calculating
|
- Currently energy footprint metrics of type energy are ignored for calculating
|
||||||
total energy.
|
total energy.
|
||||||
|
|||||||
4
go.sum
4
go.sum
@@ -4,10 +4,6 @@ github.com/99designs/gqlgen v0.17.88 h1:neMQDgehMwT1vYIOx/w5ZYPUU/iMNAJzRO44I5In
|
|||||||
github.com/99designs/gqlgen v0.17.88/go.mod h1:qeqYFEgOeSKqWedOjogPizimp2iu4E23bdPvl4jTYic=
|
github.com/99designs/gqlgen v0.17.88/go.mod h1:qeqYFEgOeSKqWedOjogPizimp2iu4E23bdPvl4jTYic=
|
||||||
github.com/Azure/go-ntlmssp v0.1.0 h1:DjFo6YtWzNqNvQdrwEyr/e4nhU3vRiwenz5QX7sFz+A=
|
github.com/Azure/go-ntlmssp v0.1.0 h1:DjFo6YtWzNqNvQdrwEyr/e4nhU3vRiwenz5QX7sFz+A=
|
||||||
github.com/Azure/go-ntlmssp v0.1.0/go.mod h1:NYqdhxd/8aAct/s4qSYZEerdPuH1liG2/X9DiVTbhpk=
|
github.com/Azure/go-ntlmssp v0.1.0/go.mod h1:NYqdhxd/8aAct/s4qSYZEerdPuH1liG2/X9DiVTbhpk=
|
||||||
github.com/ClusterCockpit/cc-lib/v2 v2.8.2 h1:rCLZk8wz8yq8xBnBEdVKigvA2ngR8dPmHbEFwxxb3jw=
|
|
||||||
github.com/ClusterCockpit/cc-lib/v2 v2.8.2/go.mod h1:FwD8vnTIbBM3ngeLNKmCvp9FoSjQZm7xnuaVxEKR23o=
|
|
||||||
github.com/ClusterCockpit/cc-lib/v2 v2.9.0 h1:mzUYakcjwb+UP5II4jOvr36rSYct90gXBbtUg+nvm9c=
|
|
||||||
github.com/ClusterCockpit/cc-lib/v2 v2.9.0/go.mod h1:FwD8vnTIbBM3ngeLNKmCvp9FoSjQZm7xnuaVxEKR23o=
|
|
||||||
github.com/ClusterCockpit/cc-lib/v2 v2.9.1 h1:eplKhXQyGAElBGCEGdmxwj7fLv26Op16uK0KxUePDak=
|
github.com/ClusterCockpit/cc-lib/v2 v2.9.1 h1:eplKhXQyGAElBGCEGdmxwj7fLv26Op16uK0KxUePDak=
|
||||||
github.com/ClusterCockpit/cc-lib/v2 v2.9.1/go.mod h1:FwD8vnTIbBM3ngeLNKmCvp9FoSjQZm7xnuaVxEKR23o=
|
github.com/ClusterCockpit/cc-lib/v2 v2.9.1/go.mod h1:FwD8vnTIbBM3ngeLNKmCvp9FoSjQZm7xnuaVxEKR23o=
|
||||||
github.com/ClusterCockpit/cc-line-protocol/v2 v2.4.0 h1:hIzxgTBWcmCIHtoDKDkSCsKCOCOwUC34sFsbD2wcW0Q=
|
github.com/ClusterCockpit/cc-line-protocol/v2 v2.4.0 h1:hIzxgTBWcmCIHtoDKDkSCsKCOCOwUC34sFsbD2wcW0Q=
|
||||||
|
|||||||
@@ -676,6 +676,11 @@ func (r *queryResolver) JobsStatistics(ctx context.Context, filter []*model.JobF
|
|||||||
// Use request-scoped cache: multiple aliases with same (filter, groupBy)
|
// Use request-scoped cache: multiple aliases with same (filter, groupBy)
|
||||||
// but different sortBy/page hit the DB only once.
|
// but different sortBy/page hit the DB only once.
|
||||||
if cache := getStatsGroupCache(ctx); cache != nil {
|
if cache := getStatsGroupCache(ctx); cache != nil {
|
||||||
|
// Ensure the sort field is computed even if not in the GraphQL selection,
|
||||||
|
// because sortAndPageStats will sort by it in memory.
|
||||||
|
if sortBy != nil {
|
||||||
|
reqFields[sortByFieldName(*sortBy)] = true
|
||||||
|
}
|
||||||
key := statsCacheKey(filter, groupBy, reqFields)
|
key := statsCacheKey(filter, groupBy, reqFields)
|
||||||
var allStats []*model.JobsStatistics
|
var allStats []*model.JobsStatistics
|
||||||
allStats, err = cache.getOrCompute(key, func() ([]*model.JobsStatistics, error) {
|
allStats, err = cache.getOrCompute(key, func() ([]*model.JobsStatistics, error) {
|
||||||
|
|||||||
@@ -107,6 +107,33 @@ func sortAndPageStats(allStats []*model.JobsStatistics, sortBy *model.SortByAggr
|
|||||||
return sorted
|
return sorted
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// sortByFieldName maps a SortByAggregate enum to the corresponding reqFields key.
|
||||||
|
// This ensures the DB computes the column that sortAndPageStats will sort by.
|
||||||
|
func sortByFieldName(sortBy model.SortByAggregate) string {
|
||||||
|
switch sortBy {
|
||||||
|
case model.SortByAggregateTotaljobs:
|
||||||
|
return "totalJobs"
|
||||||
|
case model.SortByAggregateTotalusers:
|
||||||
|
return "totalUsers"
|
||||||
|
case model.SortByAggregateTotalwalltime:
|
||||||
|
return "totalWalltime"
|
||||||
|
case model.SortByAggregateTotalnodes:
|
||||||
|
return "totalNodes"
|
||||||
|
case model.SortByAggregateTotalnodehours:
|
||||||
|
return "totalNodeHours"
|
||||||
|
case model.SortByAggregateTotalcores:
|
||||||
|
return "totalCores"
|
||||||
|
case model.SortByAggregateTotalcorehours:
|
||||||
|
return "totalCoreHours"
|
||||||
|
case model.SortByAggregateTotalaccs:
|
||||||
|
return "totalAccs"
|
||||||
|
case model.SortByAggregateTotalacchours:
|
||||||
|
return "totalAccHours"
|
||||||
|
default:
|
||||||
|
return "totalJobs"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// statsFieldGetter returns a function that extracts the sortable int field
|
// statsFieldGetter returns a function that extracts the sortable int field
|
||||||
// from a JobsStatistics struct for the given sort key.
|
// from a JobsStatistics struct for the given sort key.
|
||||||
func statsFieldGetter(sortBy model.SortByAggregate) func(*model.JobsStatistics) int {
|
func statsFieldGetter(sortBy model.SortByAggregate) func(*model.JobsStatistics) int {
|
||||||
|
|||||||
@@ -198,25 +198,12 @@ func GetSubCluster(cluster, subcluster string) (*schema.SubCluster, error) {
|
|||||||
func GetMetricConfigSubCluster(cluster, subcluster string) map[string]*schema.Metric {
|
func GetMetricConfigSubCluster(cluster, subcluster string) map[string]*schema.Metric {
|
||||||
metrics := make(map[string]*schema.Metric)
|
metrics := make(map[string]*schema.Metric)
|
||||||
|
|
||||||
for _, c := range Clusters {
|
sc, err := GetSubCluster(cluster, subcluster)
|
||||||
if c.Name == cluster {
|
if err != nil {
|
||||||
for _, m := range c.MetricConfig {
|
return metrics
|
||||||
for _, s := range m.SubClusters {
|
|
||||||
if s.Name == subcluster {
|
|
||||||
metrics[m.Name] = &schema.Metric{
|
|
||||||
Name: m.Name,
|
|
||||||
Unit: s.Unit,
|
|
||||||
Peak: s.Peak,
|
|
||||||
Normal: s.Normal,
|
|
||||||
Caution: s.Caution,
|
|
||||||
Alert: s.Alert,
|
|
||||||
}
|
|
||||||
break
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
_, ok := metrics[m.Name]
|
for _, m := range sc.MetricConfig {
|
||||||
if !ok {
|
|
||||||
metrics[m.Name] = &schema.Metric{
|
metrics[m.Name] = &schema.Metric{
|
||||||
Name: m.Name,
|
Name: m.Name,
|
||||||
Unit: m.Unit,
|
Unit: m.Unit,
|
||||||
@@ -226,10 +213,6 @@ func GetMetricConfigSubCluster(cluster, subcluster string) map[string]*schema.Me
|
|||||||
Alert: m.Alert,
|
Alert: m.Alert,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
|
||||||
break
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
return metrics
|
return metrics
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -37,3 +37,27 @@ func TestClusterConfig(t *testing.T) {
|
|||||||
// spew.Dump(archive.GlobalMetricList)
|
// spew.Dump(archive.GlobalMetricList)
|
||||||
// t.Fail()
|
// t.Fail()
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestGetMetricConfigSubClusterRespectsRemovedMetrics(t *testing.T) {
|
||||||
|
if err := archive.Init(json.RawMessage(`{"kind": "file","path": "testdata/archive"}`)); err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
|
}
|
||||||
|
|
||||||
|
sc, err := archive.GetSubCluster("fritz", "spr2tb")
|
||||||
|
if err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
|
}
|
||||||
|
|
||||||
|
metrics := archive.GetMetricConfigSubCluster("fritz", "spr2tb")
|
||||||
|
if len(metrics) != len(sc.MetricConfig) {
|
||||||
|
t.Fatalf("GetMetricConfigSubCluster() returned %d metrics, want %d", len(metrics), len(sc.MetricConfig))
|
||||||
|
}
|
||||||
|
|
||||||
|
if _, ok := metrics["flops_any"]; ok {
|
||||||
|
t.Fatalf("GetMetricConfigSubCluster() returned removed metric flops_any for subcluster spr2tb")
|
||||||
|
}
|
||||||
|
|
||||||
|
if _, ok := metrics["cpu_power"]; !ok {
|
||||||
|
t.Fatalf("GetMetricConfigSubCluster() missing active metric cpu_power for subcluster spr2tb")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user