From cc21e0e62cde3c628cd4b7529ae28ad4e27612ec Mon Sep 17 00:00:00 2001 From: Jan Eitzinger Date: Wed, 25 Feb 2026 07:38:19 +0100 Subject: [PATCH 01/20] Make json the default checkpoint format --- pkg/metricstore/config.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pkg/metricstore/config.go b/pkg/metricstore/config.go index 69ee3563..1efee61a 100644 --- a/pkg/metricstore/config.go +++ b/pkg/metricstore/config.go @@ -144,7 +144,7 @@ type MetricStoreConfig struct { // Accessed by Init(), Checkpointing(), and other lifecycle functions. var Keys MetricStoreConfig = MetricStoreConfig{ Checkpoints: Checkpoints{ - FileFormat: "avro", + FileFormat: "json", RootDir: "./var/checkpoints", }, Cleanup: &Cleanup{ From df3bc111a47043b4413a07254037f315a62b4a21 Mon Sep 17 00:00:00 2001 From: Christoph Kluge Date: Wed, 25 Feb 2026 13:23:44 +0100 Subject: [PATCH 02/20] sort healthTable onMount --- web/frontend/src/status/dashdetails/HealthDash.svelte | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/web/frontend/src/status/dashdetails/HealthDash.svelte b/web/frontend/src/status/dashdetails/HealthDash.svelte index 2730642b..aa6539ae 100644 --- a/web/frontend/src/status/dashdetails/HealthDash.svelte +++ b/web/frontend/src/status/dashdetails/HealthDash.svelte @@ -6,6 +6,7 @@ --> From 0a0db36433ea18ea0d3c4ddcf3d88fda3b566aa1 Mon Sep 17 00:00:00 2001 From: Christoph Kluge Date: Wed, 25 Feb 2026 19:12:18 +0100 Subject: [PATCH 03/20] load statusDetails GQL on tab change --- web/frontend/src/status/DashDetails.svelte | 15 +++++---- .../src/status/dashdetails/HealthDash.svelte | 19 ++++++------ .../status/dashdetails/StatisticsDash.svelte | 13 ++++---- .../src/status/dashdetails/StatusDash.svelte | 27 ++++++++-------- .../src/status/dashdetails/UsageDash.svelte | 31 ++++++++++--------- 5 files changed, 56 insertions(+), 49 deletions(-) diff --git a/web/frontend/src/status/DashDetails.svelte b/web/frontend/src/status/DashDetails.svelte index b46d0935..72c411b2 100644 --- a/web/frontend/src/status/DashDetails.svelte +++ b/web/frontend/src/status/DashDetails.svelte @@ -36,6 +36,9 @@ const { query: initq } = init(); const useCbColors = getContext("cc-config")?.plotConfiguration_colorblindMode || false + /* State Init */ + let activeTab = $state(""); + /* Derived */ const subClusters = $derived($initq?.data?.clusters?.find((c) => c.name == presetCluster)?.subClusters || []); @@ -63,22 +66,22 @@ {:else} - + (activeTab = e.detail)}> - + - + - + @@ -86,7 +89,7 @@ {#each subClusters.map(sc => sc.name) as scn} - + {/each} @@ -94,7 +97,7 @@ - + diff --git a/web/frontend/src/status/dashdetails/HealthDash.svelte b/web/frontend/src/status/dashdetails/HealthDash.svelte index aa6539ae..a30552b1 100644 --- a/web/frontend/src/status/dashdetails/HealthDash.svelte +++ b/web/frontend/src/status/dashdetails/HealthDash.svelte @@ -29,6 +29,7 @@ /* Svelte 5 Props */ let { presetCluster, + loadMe = false, } = $props(); /* Const Init */ @@ -55,7 +56,7 @@ /* Derived */ let cluster = $derived(presetCluster); - const statusQuery = $derived(queryStore({ + const statusQuery = $derived(loadMe ? queryStore({ client: client, query: gql` query ( @@ -85,7 +86,7 @@ sorting: querySorting, }, requestPolicy: "network-only" - })); + }) : null); let healthTableData = $derived.by(() => { if ($statusQuery?.data) { @@ -161,16 +162,16 @@
-{#if $statusQuery.fetching} +{#if $statusQuery?.fetching} -{:else if $statusQuery.error} +{:else if $statusQuery?.error} - Status Query (States): {$statusQuery.error.message} + Status Query (States): {$statusQuery?.error?.message} {:else if $statusQuery?.data?.nodeStates} @@ -264,19 +265,19 @@
-{#if $statusQuery.fetching} +{#if $statusQuery?.fetching} -{:else if $statusQuery.error} +{:else if $statusQuery?.error} - Status Query (Details): {$statusQuery.error.message} + Status Query (Details): {$statusQuery?.error?.message} -{:else if $statusQuery.data} +{:else if $statusQuery?.data} diff --git a/web/frontend/src/status/dashdetails/StatisticsDash.svelte b/web/frontend/src/status/dashdetails/StatisticsDash.svelte index 2cf8621e..d83adc15 100644 --- a/web/frontend/src/status/dashdetails/StatisticsDash.svelte +++ b/web/frontend/src/status/dashdetails/StatisticsDash.svelte @@ -30,7 +30,8 @@ /* Svelte 5 Props */ let { - presetCluster + presetCluster, + loadMe = false, } = $props(); /* Const Init */ @@ -49,7 +50,7 @@ : ccconfig['statusView_selectedHistograms'] || []); // Note: nodeMetrics are requested on configured $timestep resolution - const metricStatusQuery = $derived(queryStore({ + const metricStatusQuery = $derived(loadMe ? queryStore({ client: client, query: gql` query ( @@ -75,7 +76,7 @@ selectedHistograms: selectedHistograms }, requestPolicy: "network-only" - })); + }) : null); @@ -100,18 +101,18 @@ - {#if $metricStatusQuery.fetching} + {#if $metricStatusQuery?.fetching} - {:else if $metricStatusQuery.error} + {:else if $metricStatusQuery?.error} {$metricStatusQuery.error.message} {/if} -{#if $metricStatusQuery.data} +{#if $metricStatusQuery?.data} {#if selectedHistograms} diff --git a/web/frontend/src/status/dashdetails/StatusDash.svelte b/web/frontend/src/status/dashdetails/StatusDash.svelte index 8d108964..0c2626d0 100644 --- a/web/frontend/src/status/dashdetails/StatusDash.svelte +++ b/web/frontend/src/status/dashdetails/StatusDash.svelte @@ -32,6 +32,7 @@ let { clusters, presetCluster, + loadMe = false, } = $props(); /* Const Init */ @@ -59,7 +60,7 @@ /* Derived */ let cluster = $derived(presetCluster); // States for Stacked charts - const statesTimed = $derived(queryStore({ + const statesTimed = $derived(loadMe ? queryStore({ client: client, query: gql` query ($filter: [NodeFilter!], $typeNode: String!, $typeHealth: String!) { @@ -81,11 +82,11 @@ typeHealth: "health" }, requestPolicy: "network-only" - })); + }) : null); // Note: nodeMetrics are requested on configured $timestep resolution // Result: The latest 5 minutes (datapoints) for each node independent of job - const statusQuery = $derived(queryStore({ + const statusQuery = $derived(loadMe ? queryStore({ client: client, query: gql` query ( @@ -184,11 +185,11 @@ sorting: { field: "startTime", type: "col", order: "DESC" } }, requestPolicy: "network-only" - })); + }) : null); /* Effects */ $effect(() => { - if ($statusQuery.data) { + if ($statusQuery?.data) { let subClusters = clusters.find( (c) => c.name == cluster, ).subClusters; @@ -374,19 +375,19 @@
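+<!-- $statesTimed is null until this tab is first activated via loadMe, so the template below uses optional chaining throughout to guard the pre-activation state. -->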
-{#if $statesTimed.fetching} +{#if $statesTimed?.fetching} -{:else if $statesTimed.error} +{:else if $statesTimed?.error} - States Timed: {$statesTimed.error.message} + States Timed: {$statesTimed?.error?.message} -{:else if $statesTimed.data} +{:else if $statesTimed?.data}
@@ -427,19 +428,19 @@
-{#if $statusQuery.fetching} +{#if $statusQuery?.fetching} -{:else if $statusQuery.error} +{:else if $statusQuery?.error} - Status Query (Details): {$statusQuery.error.message} + Status Query (Details): {$statusQuery?.error?.message} -{:else if $statusQuery.data} +{:else if $statusQuery?.data} {#each clusters.find((c) => c.name == cluster).subClusters as subCluster, i} diff --git a/web/frontend/src/status/dashdetails/UsageDash.svelte b/web/frontend/src/status/dashdetails/UsageDash.svelte index 3fa197ae..2a9b3037 100644 --- a/web/frontend/src/status/dashdetails/UsageDash.svelte +++ b/web/frontend/src/status/dashdetails/UsageDash.svelte @@ -40,7 +40,8 @@ presetCluster, presetSubCluster = null, useCbColors = false, - useAltColors = false + useAltColors = false, + loadMe = false, } = $props(); /* Const Init */ @@ -62,7 +63,7 @@ ? [{ state: ["running"] }, { cluster: { eq: presetCluster} }, { subCluster: { eq: presetSubCluster } }] : [{ state: ["running"] }, { cluster: { eq: presetCluster} }] ); - const topJobsQuery = $derived(queryStore({ + const topJobsQuery = $derived(loadMe ? queryStore({ client: client, query: gql` query ( @@ -95,9 +96,9 @@ paging: pagingState // Top 10 }, requestPolicy: "network-only" - })); + }) : null); - const topNodesQuery = $derived(queryStore({ + const topNodesQuery = $derived(loadMe ? queryStore({ client: client, query: gql` query ( @@ -130,9 +131,9 @@ paging: pagingState }, requestPolicy: "network-only" - })); + }) : null); - const topAccsQuery = $derived(queryStore({ + const topAccsQuery = $derived(loadMe ? queryStore({ client: client, query: gql` query ( @@ -165,10 +166,10 @@ paging: pagingState }, requestPolicy: "network-only" - })); + }): null); // Note: nodeMetrics are requested on configured $timestep resolution - const nodeStatusQuery = $derived(queryStore({ + const nodeStatusQuery = $derived(loadMe ? queryStore({ client: client, query: gql` query ( @@ -198,7 +199,7 @@ numDurationBins: numDurationBins, }, requestPolicy: "network-only" - })); + }) : null); /* Functions */ function legendColors(targetIdx) { @@ -246,9 +247,9 @@
-{#if $topJobsQuery.fetching || $nodeStatusQuery.fetching} +{#if $topJobsQuery?.fetching || $nodeStatusQuery?.fetching} -{:else if $topJobsQuery.data && $nodeStatusQuery.data} +{:else if $topJobsQuery?.data && $nodeStatusQuery?.data} {#key $nodeStatusQuery.data.jobsStatistics[0].histDuration} @@ -354,9 +355,9 @@
-{#if $topNodesQuery.fetching || $nodeStatusQuery.fetching} +{#if $topNodesQuery?.fetching || $nodeStatusQuery?.fetching} -{:else if $topNodesQuery.data && $nodeStatusQuery.data} +{:else if $topNodesQuery?.data && $nodeStatusQuery?.data} -{#if $topAccsQuery.fetching || $nodeStatusQuery.fetching} +{#if $topAccsQuery?.fetching || $nodeStatusQuery?.fetching} -{:else if $topAccsQuery.data && $nodeStatusQuery.data} +{:else if $topAccsQuery?.data && $nodeStatusQuery?.data} Date: Thu, 26 Feb 2026 10:08:40 +0100 Subject: [PATCH 04/20] Introduce metric store binary checkpoints with write ahead log --- go.mod | 2 - go.sum | 11 - pkg/metricstore/avroCheckpoint.go | 481 ------------------ pkg/metricstore/avroHelper.go | 130 ----- pkg/metricstore/avroStruct.go | 167 ------- pkg/metricstore/checkpoint.go | 369 +++++--------- pkg/metricstore/config.go | 7 +- pkg/metricstore/configSchema.go | 2 +- pkg/metricstore/lineprotocol.go | 4 +- pkg/metricstore/metricstore.go | 22 +- pkg/metricstore/walCheckpoint.go | 787 ++++++++++++++++++++++++++++++ 11 files changed, 920 insertions(+), 1062 deletions(-) delete mode 100644 pkg/metricstore/avroCheckpoint.go delete mode 100644 pkg/metricstore/avroHelper.go delete mode 100644 pkg/metricstore/avroStruct.go create mode 100644 pkg/metricstore/walCheckpoint.go diff --git a/go.mod b/go.mod index e244062c..c561f627 100644 --- a/go.mod +++ b/go.mod @@ -28,7 +28,6 @@ require ( github.com/influxdata/line-protocol/v2 v2.2.1 github.com/jmoiron/sqlx v1.4.0 github.com/joho/godotenv v1.5.1 - github.com/linkedin/goavro/v2 v2.15.0 github.com/mattn/go-sqlite3 v1.14.34 github.com/parquet-go/parquet-go v0.27.0 github.com/qustavo/sqlhooks/v2 v2.1.0 @@ -80,7 +79,6 @@ require ( github.com/go-openapi/swag/yamlutils v0.25.4 // indirect github.com/go-viper/mapstructure/v2 v2.5.0 // indirect github.com/goccy/go-yaml v1.19.2 // indirect - github.com/golang/snappy v1.0.0 // indirect github.com/google/uuid v1.6.0 // indirect github.com/gorilla/securecookie v1.1.2 // indirect github.com/gorilla/websocket v1.5.3 // indirect diff --git a/go.sum b/go.sum index f2929454..5586b9c5 100644 --- a/go.sum +++ b/go.sum @@ -4,8 +4,6 @@ github.com/99designs/gqlgen v0.17.86 h1:C8N3UTa5heXX6twl+b0AJyGkTwYL6dNmFrgZNLRc github.com/99designs/gqlgen v0.17.86/go.mod h1:KTrPl+vHA1IUzNlh4EYkl7+tcErL3MgKnhHrBcV74Fw= github.com/Azure/go-ntlmssp v0.1.0 h1:DjFo6YtWzNqNvQdrwEyr/e4nhU3vRiwenz5QX7sFz+A= github.com/Azure/go-ntlmssp v0.1.0/go.mod h1:NYqdhxd/8aAct/s4qSYZEerdPuH1liG2/X9DiVTbhpk= -github.com/ClusterCockpit/cc-lib/v2 v2.5.1 h1:s6M9tyPDty+4zTdQGJYKpGJM9Nz7N6ITMdjPvNSLX5g= -github.com/ClusterCockpit/cc-lib/v2 v2.5.1/go.mod h1:DZ8OIHPUZJpWqErLITt0B8P6/Q7CBW2IQSQ5YiFFaG0= github.com/ClusterCockpit/cc-lib/v2 v2.6.0 h1:Q7zvRAVhfYA9PDB18pfY9A/6Ws4oWpnv8+P9MBRUDzg= github.com/ClusterCockpit/cc-lib/v2 v2.6.0/go.mod h1:DZ8OIHPUZJpWqErLITt0B8P6/Q7CBW2IQSQ5YiFFaG0= github.com/DATA-DOG/go-sqlmock v1.5.2 h1:OcvFkGmslmlZibjAjaHm3L//6LiuBgolP7OputlJIzU= @@ -151,9 +149,6 @@ github.com/golang-jwt/jwt/v5 v5.3.1 h1:kYf81DTWFe7t+1VvL7eS+jKFVWaUnK9cB1qbwn63Y github.com/golang-jwt/jwt/v5 v5.3.1/go.mod h1:fxCRLWMO43lRc8nhHWY6LGqRcf+1gQWArsqaEUEa5bE= github.com/golang-migrate/migrate/v4 v4.19.1 h1:OCyb44lFuQfYXYLx1SCxPZQGU7mcaZ7gH9yH4jSFbBA= github.com/golang-migrate/migrate/v4 v4.19.1/go.mod h1:CTcgfjxhaUtsLipnLoQRWCrjYXycRz/g5+RWDuYgPrE= -github.com/golang/snappy v0.0.1/go.mod h1:/XxbfmMg8lxefKM7IXC3fBNl/7bRcc72aCRzEWrmP2Q= -github.com/golang/snappy v1.0.0 h1:Oy607GVXHs7RtbggtPBnr2RmDArIsAefDwvrdWvRhGs= -github.com/golang/snappy 
v1.0.0/go.mod h1:/XxbfmMg8lxefKM7IXC3fBNl/7bRcc72aCRzEWrmP2Q= github.com/google/go-cmp v0.5.2/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= github.com/google/go-cmp v0.5.5/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= github.com/google/go-cmp v0.7.0 h1:wk8382ETsv4JYUZwIsn6YpYiWiBsYLSJiTsyBybVuN8= @@ -226,8 +221,6 @@ github.com/lann/ps v0.0.0-20150810152359-62de8c46ede0/go.mod h1:vmVJ0l/dxyfGW6Fm github.com/lib/pq v1.2.0/go.mod h1:5WUZQaWbwv1U+lTReE5YruASi9Al49XbQIvNi/34Woo= github.com/lib/pq v1.10.9 h1:YXG7RB+JIjhP29X+OtkiDnYaXQwpS4JEWq7dtCCRUEw= github.com/lib/pq v1.10.9/go.mod h1:AlVN5x4E4T544tWzH6hKfbfQvm3HdbOxrmggDNAPY9o= -github.com/linkedin/goavro/v2 v2.15.0 h1:pDj1UrjUOO62iXhgBiE7jQkpNIc5/tA5eZsgolMjgVI= -github.com/linkedin/goavro/v2 v2.15.0/go.mod h1:KXx+erlq+RPlGSPmLF7xGo6SAbh8sCQ53x064+ioxhk= github.com/mattn/go-sqlite3 v1.10.0/go.mod h1:FPy6KqzDD04eiIsT53CuJW3U88zkxoIYsOqkbpncsNc= github.com/mattn/go-sqlite3 v1.14.22/go.mod h1:Uh1q+B4BYcTPb+yiD3kU8Ct7aC0hY9fxUwlHK0RXw+Y= github.com/mattn/go-sqlite3 v1.14.34 h1:3NtcvcUnFBPsuRcno8pUtupspG/GM+9nZ88zgJcp6Zk= @@ -289,14 +282,11 @@ github.com/spkg/bom v0.0.0-20160624110644-59b7046e48ad/go.mod h1:qLr4V1qq6nMqFKk github.com/stmcginnis/gofish v0.21.1 h1:sutDvBhmLh4RDOZ1DN8GUyYRu7f1ggvKMMnSaiqhwn4= github.com/stmcginnis/gofish v0.21.1/go.mod h1:PzF5i8ecRG9A2ol8XT64npKUunyraJ+7t0kYMpQAtqU= github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= -github.com/stretchr/objx v0.4.0/go.mod h1:YvHI0jy2hoMjB+UWwv71VJQ9isScKT/TqJzVSSt89Yw= github.com/stretchr/objx v0.5.2 h1:xuMeJ0Sdp5ZMRXx/aWO6RZxdr3beISkG5/G/aIRr3pY= github.com/stretchr/objx v0.5.2/go.mod h1:FRsXN1f5AsAjCGJKqEizvkpNtU+EGNCLh3NxZ/8L+MA= github.com/stretchr/testify v1.2.2/go.mod h1:a8OnRcib4nhh0OaRAV+Yts87kKdq0PP7pXfy6kDkUVs= github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI= github.com/stretchr/testify v1.4.0/go.mod h1:j7eGeouHqKxXV5pUuKE4zz7dFj8WfuZ+81PSLYec5m4= -github.com/stretchr/testify v1.7.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= -github.com/stretchr/testify v1.7.5/go.mod h1:yNjHg4UonilssWZ8iaSj1OCr/vHnekPRkoO+kdMU+MU= github.com/stretchr/testify v1.11.1 h1:7s2iGBzp5EwR7/aIZr8ao5+dra3wiQyKjjFuvgVKu7U= github.com/stretchr/testify v1.11.1/go.mod h1:wZwfW3scLgRK+23gO65QZefKpKQRnfz6sD981Nm4B6U= github.com/swaggo/files v1.0.1 h1:J1bVJ4XHZNq0I46UU90611i9/YzdrF7x92oX1ig5IdE= @@ -378,7 +368,6 @@ gopkg.in/check.v1 v1.0.0-20200227125254-8fa46927fb4f/go.mod h1:Co6ibVJAznAaIkqp8 gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c h1:Hei/4ADfdWqJk1ZMxUNpqntNwaWcugrBjAiHlqqRiVk= gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c/go.mod h1:JHkPIbrfpd72SG/EVd6muEfDQjcINNoR0C8j2r3qZ4Q= gopkg.in/yaml.v2 v2.2.2/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= -gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= gopkg.in/yaml.v3 v3.0.0-20200615113413-eeeca48fe776/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= diff --git a/pkg/metricstore/avroCheckpoint.go b/pkg/metricstore/avroCheckpoint.go deleted file mode 100644 index 14898186..00000000 --- a/pkg/metricstore/avroCheckpoint.go +++ /dev/null @@ -1,481 +0,0 @@ -// Copyright (C) NHR@FAU, University Erlangen-Nuremberg. -// All rights reserved. This file is part of cc-backend. 
-// Use of this source code is governed by a MIT-style -// license that can be found in the LICENSE file. - -package metricstore - -import ( - "bufio" - "encoding/json" - "errors" - "fmt" - "os" - "path" - "sort" - "strconv" - "strings" - "sync" - "sync/atomic" - "time" - - cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger" - "github.com/ClusterCockpit/cc-lib/v2/schema" - "github.com/linkedin/goavro/v2" -) - -var ( - NumAvroWorkers int = DefaultAvroWorkers - startUp bool = true -) - -func (as *AvroStore) ToCheckpoint(dir string, dumpAll bool) (int, error) { - levels := make([]*AvroLevel, 0) - selectors := make([][]string, 0) - as.root.lock.RLock() - // Cluster - for sel1, l1 := range as.root.children { - l1.lock.RLock() - // Node - for sel2, l2 := range l1.children { - l2.lock.RLock() - // Frequency - for sel3, l3 := range l2.children { - levels = append(levels, l3) - selectors = append(selectors, []string{sel1, sel2, sel3}) - } - l2.lock.RUnlock() - } - l1.lock.RUnlock() - } - as.root.lock.RUnlock() - - type workItem struct { - level *AvroLevel - dir string - selector []string - } - - n, errs := int32(0), int32(0) - - var wg sync.WaitGroup - wg.Add(NumAvroWorkers) - work := make(chan workItem, NumAvroWorkers*2) - for range NumAvroWorkers { - go func() { - defer wg.Done() - - for workItem := range work { - from := getTimestamp(workItem.dir) - - if err := workItem.level.toCheckpoint(workItem.dir, from, dumpAll); err != nil { - if err == ErrNoNewArchiveData { - continue - } - - cclog.Errorf("error while checkpointing %#v: %s", workItem.selector, err.Error()) - atomic.AddInt32(&errs, 1) - } else { - atomic.AddInt32(&n, 1) - } - } - }() - } - - for i := range len(levels) { - dir := path.Join(dir, path.Join(selectors[i]...)) - work <- workItem{ - level: levels[i], - dir: dir, - selector: selectors[i], - } - } - - close(work) - wg.Wait() - - if errs > 0 { - return int(n), fmt.Errorf("%d errors happend while creating avro checkpoints (%d successes)", errs, n) - } - - startUp = false - - return int(n), nil -} - -// getTimestamp returns the timestamp from the directory name -func getTimestamp(dir string) int64 { - // Extract the resolution and timestamp from the directory name - // The existing avro file will be in epoch timestamp format - // iterate over all the files in the directory and find the maximum timestamp - // and return it - - resolution := path.Base(dir) - dir = path.Dir(dir) - - files, err := os.ReadDir(dir) - if err != nil { - return 0 - } - var maxTS int64 = 0 - - if len(files) == 0 { - return 0 - } - - for _, file := range files { - if file.IsDir() { - continue - } - name := file.Name() - - if len(name) < 5 || !strings.HasSuffix(name, ".avro") || !strings.HasPrefix(name, resolution+"_") { - continue - } - - ts, err := strconv.ParseInt(name[strings.Index(name, "_")+1:len(name)-5], 10, 64) - if err != nil { - fmt.Printf("error while parsing timestamp: %s\n", err.Error()) - continue - } - - if ts > maxTS { - maxTS = ts - } - } - - interval, _ := time.ParseDuration(Keys.Checkpoints.Interval) - updateTime := time.Unix(maxTS, 0).Add(interval).Add(time.Duration(CheckpointBufferMinutes-1) * time.Minute).Unix() - - if startUp { - return 0 - } - - if updateTime < time.Now().Unix() { - return 0 - } - - return maxTS -} - -func (l *AvroLevel) toCheckpoint(dir string, from int64, dumpAll bool) error { - l.lock.Lock() - defer l.lock.Unlock() - - // fmt.Printf("Checkpointing directory: %s\n", dir) - // filepath contains the resolution - intRes, _ := strconv.Atoi(path.Base(dir)) - - // find 
smallest overall timestamp in l.data map and delete it from l.data - minTS := int64(1<<63 - 1) - for ts, dat := range l.data { - if ts < minTS && len(dat) != 0 { - minTS = ts - } - } - - if from == 0 && minTS != int64(1<<63-1) { - from = minTS - } - - if from == 0 { - return ErrNoNewArchiveData - } - - var schema string - var codec *goavro.Codec - recordList := make([]map[string]any, 0) - - var f *os.File - - filePath := dir + fmt.Sprintf("_%d.avro", from) - - var err error - - fp_, err_ := os.Stat(filePath) - if errors.Is(err_, os.ErrNotExist) { - err = os.MkdirAll(path.Dir(dir), 0o755) - if err != nil { - return fmt.Errorf("failed to create directory: %v", err) - } - } else if fp_.Size() != 0 { - f, err = os.Open(filePath) - if err != nil { - return fmt.Errorf("failed to open existing avro file: %v", err) - } - defer f.Close() - - br := bufio.NewReader(f) - - reader, err := goavro.NewOCFReader(br) - if err != nil { - return fmt.Errorf("failed to create OCF reader: %v", err) - } - codec = reader.Codec() - schema = codec.Schema() - } - - timeRef := time.Now().Add(time.Duration(-CheckpointBufferMinutes+1) * time.Minute).Unix() - - if dumpAll { - timeRef = time.Now().Unix() - } - - // Empty values - if len(l.data) == 0 { - // we checkpoint avro files every 60 seconds - repeat := 60 / intRes - - for range repeat { - recordList = append(recordList, make(map[string]any)) - } - } - - readFlag := true - - for ts := range l.data { - flag := false - if ts < timeRef { - data := l.data[ts] - - schemaGen, err := generateSchema(data) - if err != nil { - return err - } - - flag, schema, err = compareSchema(schema, schemaGen) - if err != nil { - return fmt.Errorf("failed to compare read and generated schema: %v", err) - } - if flag && readFlag && !errors.Is(err_, os.ErrNotExist) { - // Use closure to ensure file is closed even on error - err := func() error { - f2, err := os.Open(filePath) - if err != nil { - return fmt.Errorf("failed to open Avro file: %v", err) - } - defer f2.Close() - - br := bufio.NewReader(f2) - - ocfReader, err := goavro.NewOCFReader(br) - if err != nil { - return fmt.Errorf("failed to create OCF reader while changing schema: %v", err) - } - - for ocfReader.Scan() { - record, err := ocfReader.Read() - if err != nil { - return fmt.Errorf("failed to read record: %v", err) - } - - recordList = append(recordList, record.(map[string]any)) - } - - return nil - }() - if err != nil { - return err - } - - err = os.Remove(filePath) - if err != nil { - return fmt.Errorf("failed to delete file: %v", err) - } - - readFlag = false - } - codec, err = goavro.NewCodec(schema) - if err != nil { - return fmt.Errorf("failed to create codec after merged schema: %v", err) - } - - recordList = append(recordList, generateRecord(data)) - delete(l.data, ts) - } - } - - if len(recordList) == 0 { - return ErrNoNewArchiveData - } - - f, err = os.OpenFile(filePath, os.O_CREATE|os.O_APPEND|os.O_RDWR, 0o644) - if err != nil { - return fmt.Errorf("failed to append new avro file: %v", err) - } - defer f.Close() - - // fmt.Printf("Codec : %#v\n", codec) - - writer, err := goavro.NewOCFWriter(goavro.OCFConfig{ - W: f, - Codec: codec, - CompressionName: goavro.CompressionDeflateLabel, - }) - if err != nil { - return fmt.Errorf("failed to create OCF writer: %v", err) - } - - // Append the new record - if err := writer.Append(recordList); err != nil { - return fmt.Errorf("failed to append record: %v", err) - } - - return nil -} - -func compareSchema(schemaRead, schemaGen string) (bool, string, error) { - var genSchema, 
readSchema AvroSchema - - if schemaRead == "" { - return false, schemaGen, nil - } - - // Unmarshal the schema strings into AvroSchema structs - if err := json.Unmarshal([]byte(schemaGen), &genSchema); err != nil { - return false, "", fmt.Errorf("failed to parse generated schema: %v", err) - } - if err := json.Unmarshal([]byte(schemaRead), &readSchema); err != nil { - return false, "", fmt.Errorf("failed to parse read schema: %v", err) - } - - sort.Slice(genSchema.Fields, func(i, j int) bool { - return genSchema.Fields[i].Name < genSchema.Fields[j].Name - }) - - sort.Slice(readSchema.Fields, func(i, j int) bool { - return readSchema.Fields[i].Name < readSchema.Fields[j].Name - }) - - // Check if schemas are identical - schemasEqual := true - if len(genSchema.Fields) <= len(readSchema.Fields) { - - for i := range genSchema.Fields { - if genSchema.Fields[i].Name != readSchema.Fields[i].Name { - schemasEqual = false - break - } - } - - // If schemas are identical, return the read schema - if schemasEqual { - return false, schemaRead, nil - } - } - - // Create a map to hold unique fields from both schemas - fieldMap := make(map[string]AvroField) - - // Add fields from the read schema - for _, field := range readSchema.Fields { - fieldMap[field.Name] = field - } - - // Add or update fields from the generated schema - for _, field := range genSchema.Fields { - fieldMap[field.Name] = field - } - - // Create a union schema by collecting fields from the map - var mergedFields []AvroField - for _, field := range fieldMap { - mergedFields = append(mergedFields, field) - } - - // Sort fields by name for consistency - sort.Slice(mergedFields, func(i, j int) bool { - return mergedFields[i].Name < mergedFields[j].Name - }) - - // Create the merged schema - mergedSchema := AvroSchema{ - Type: "record", - Name: genSchema.Name, - Fields: mergedFields, - } - - // Check if schemas are identical - schemasEqual = len(mergedSchema.Fields) == len(readSchema.Fields) - if schemasEqual { - for i := range mergedSchema.Fields { - if mergedSchema.Fields[i].Name != readSchema.Fields[i].Name { - schemasEqual = false - break - } - } - - if schemasEqual { - return false, schemaRead, nil - } - } - - // Marshal the merged schema back to JSON - mergedSchemaJSON, err := json.Marshal(mergedSchema) - if err != nil { - return false, "", fmt.Errorf("failed to marshal merged schema: %v", err) - } - - return true, string(mergedSchemaJSON), nil -} - -func generateSchema(data map[string]schema.Float) (string, error) { - // Define the Avro schema structure - schema := map[string]any{ - "type": "record", - "name": "DataRecord", - "fields": []map[string]any{}, - } - - fieldTracker := make(map[string]struct{}) - - for key := range data { - if _, exists := fieldTracker[key]; !exists { - key = correctKey(key) - - field := map[string]any{ - "name": key, - "type": "double", - "default": -1.0, - } - schema["fields"] = append(schema["fields"].([]map[string]any), field) - fieldTracker[key] = struct{}{} - } - } - - schemaString, err := json.Marshal(schema) - if err != nil { - return "", fmt.Errorf("failed to marshal schema: %v", err) - } - - return string(schemaString), nil -} - -func generateRecord(data map[string]schema.Float) map[string]any { - record := make(map[string]any) - - // Iterate through each map in data - for key, value := range data { - key = correctKey(key) - - // Set the value in the record - // avro only accepts basic types - record[key] = value.Double() - } - - return record -} - -func correctKey(key string) string { - key = 
strings.ReplaceAll(key, "_", "_0x5F_") - key = strings.ReplaceAll(key, ":", "_0x3A_") - key = strings.ReplaceAll(key, ".", "_0x2E_") - return key -} - -func ReplaceKey(key string) string { - key = strings.ReplaceAll(key, "_0x2E_", ".") - key = strings.ReplaceAll(key, "_0x3A_", ":") - key = strings.ReplaceAll(key, "_0x5F_", "_") - return key -} diff --git a/pkg/metricstore/avroHelper.go b/pkg/metricstore/avroHelper.go deleted file mode 100644 index f6bef36e..00000000 --- a/pkg/metricstore/avroHelper.go +++ /dev/null @@ -1,130 +0,0 @@ -// Copyright (C) NHR@FAU, University Erlangen-Nuremberg. -// All rights reserved. This file is part of cc-backend. -// Use of this source code is governed by a MIT-style -// license that can be found in the LICENSE file. - -package metricstore - -import ( - "context" - "slices" - "strconv" - "strings" - "sync" - - cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger" -) - -func DataStaging(wg *sync.WaitGroup, ctx context.Context) { - wg.Add(1) - go func() { - defer wg.Done() - - if Keys.Checkpoints.FileFormat == "json" { - return - } - - ms := GetMemoryStore() - var avroLevel *AvroLevel - oldSelector := make([]string, 0) - - for { - select { - case <-ctx.Done(): - // Drain any remaining messages in channel before exiting - for { - select { - case val, ok := <-LineProtocolMessages: - if !ok { - // Channel closed - return - } - // Process remaining message - freq, err := ms.GetMetricFrequency(val.MetricName) - if err != nil { - continue - } - - var metricName strings.Builder - for _, selectorName := range val.Selector { - metricName.WriteString(selectorName + SelectorDelimiter) - } - metricName.WriteString(val.MetricName) - - var selector []string - selector = append(selector, val.Cluster, val.Node, strconv.FormatInt(freq, 10)) - - if !stringSlicesEqual(oldSelector, selector) { - avroLevel = avroStore.root.findAvroLevelOrCreate(selector) - if avroLevel == nil { - cclog.Errorf("Error creating or finding the level with cluster : %s, node : %s, metric : %s\n", val.Cluster, val.Node, val.MetricName) - } - oldSelector = slices.Clone(selector) - } - - if avroLevel != nil { - avroLevel.addMetric(metricName.String(), val.Value, val.Timestamp, int(freq)) - } - default: - // No more messages, exit - return - } - } - case val, ok := <-LineProtocolMessages: - if !ok { - // Channel closed, exit gracefully - return - } - - // Fetch the frequency of the metric from the global configuration - freq, err := ms.GetMetricFrequency(val.MetricName) - if err != nil { - cclog.Errorf("Error fetching metric frequency: %s\n", err) - continue - } - - var metricName strings.Builder - - for _, selectorName := range val.Selector { - metricName.WriteString(selectorName + SelectorDelimiter) - } - - metricName.WriteString(val.MetricName) - - // Create a new selector for the Avro level - // The selector is a slice of strings that represents the path to the - // Avro level. It is created by appending the cluster, node, and metric - // name to the selector. 
- var selector []string - selector = append(selector, val.Cluster, val.Node, strconv.FormatInt(freq, 10)) - - if !stringSlicesEqual(oldSelector, selector) { - // Get the Avro level for the metric - avroLevel = avroStore.root.findAvroLevelOrCreate(selector) - - // If the Avro level is nil, create a new one - if avroLevel == nil { - cclog.Errorf("Error creating or finding the level with cluster : %s, node : %s, metric : %s\n", val.Cluster, val.Node, val.MetricName) - } - oldSelector = slices.Clone(selector) - } - - if avroLevel != nil { - avroLevel.addMetric(metricName.String(), val.Value, val.Timestamp, int(freq)) - } - } - } - }() -} - -func stringSlicesEqual(a, b []string) bool { - if len(a) != len(b) { - return false - } - for i := range a { - if a[i] != b[i] { - return false - } - } - return true -} diff --git a/pkg/metricstore/avroStruct.go b/pkg/metricstore/avroStruct.go deleted file mode 100644 index 78a8d137..00000000 --- a/pkg/metricstore/avroStruct.go +++ /dev/null @@ -1,167 +0,0 @@ -// Copyright (C) NHR@FAU, University Erlangen-Nuremberg. -// All rights reserved. This file is part of cc-backend. -// Use of this source code is governed by a MIT-style -// license that can be found in the LICENSE file. - -package metricstore - -import ( - "sync" - - "github.com/ClusterCockpit/cc-lib/v2/schema" -) - -var ( - LineProtocolMessages = make(chan *AvroStruct) - // SelectorDelimiter separates hierarchical selector components in metric names for Avro encoding - SelectorDelimiter = "_SEL_" -) - -var CheckpointBufferMinutes = DefaultCheckpointBufferMin - -type AvroStruct struct { - MetricName string - Cluster string - Node string - Selector []string - Value schema.Float - Timestamp int64 -} - -type AvroStore struct { - root AvroLevel -} - -var avroStore AvroStore - -type AvroLevel struct { - children map[string]*AvroLevel - data map[int64]map[string]schema.Float - lock sync.RWMutex -} - -type AvroField struct { - Name string `json:"name"` - Type any `json:"type"` - Default any `json:"default,omitempty"` -} - -type AvroSchema struct { - Type string `json:"type"` - Name string `json:"name"` - Fields []AvroField `json:"fields"` -} - -func (l *AvroLevel) findAvroLevelOrCreate(selector []string) *AvroLevel { - if len(selector) == 0 { - return l - } - - // Allow concurrent reads: - l.lock.RLock() - var child *AvroLevel - var ok bool - if l.children == nil { - // Children map needs to be created... - l.lock.RUnlock() - } else { - child, ok := l.children[selector[0]] - l.lock.RUnlock() - if ok { - return child.findAvroLevelOrCreate(selector[1:]) - } - } - - // The level does not exist, take write lock for unique access: - l.lock.Lock() - // While this thread waited for the write lock, another thread - // could have created the child node. 
- if l.children != nil { - child, ok = l.children[selector[0]] - if ok { - l.lock.Unlock() - return child.findAvroLevelOrCreate(selector[1:]) - } - } - - child = &AvroLevel{ - data: make(map[int64]map[string]schema.Float, 0), - children: nil, - } - - if l.children != nil { - l.children[selector[0]] = child - } else { - l.children = map[string]*AvroLevel{selector[0]: child} - } - l.lock.Unlock() - return child.findAvroLevelOrCreate(selector[1:]) -} - -func (l *AvroLevel) addMetric(metricName string, value schema.Float, timestamp int64, Freq int) { - l.lock.Lock() - defer l.lock.Unlock() - - KeyCounter := int(CheckpointBufferMinutes * 60 / Freq) - - // Create keys in advance for the given amount of time - if len(l.data) != KeyCounter { - if len(l.data) == 0 { - for i := range KeyCounter { - l.data[timestamp+int64(i*Freq)] = make(map[string]schema.Float, 0) - } - } else { - // Get the last timestamp - var lastTS int64 - for ts := range l.data { - if ts > lastTS { - lastTS = ts - } - } - // Create keys for the next KeyCounter timestamps - l.data[lastTS+int64(Freq)] = make(map[string]schema.Float, 0) - } - } - - closestTS := int64(0) - minDiff := int64(Freq) + 1 // Start with diff just outside the valid range - found := false - - // Iterate over timestamps and choose the one which is within range. - // Since its epoch time, we check if the difference is less than 60 seconds. - for ts, dat := range l.data { - // Check if timestamp is within range - diff := timestamp - ts - if diff < -int64(Freq) || diff > int64(Freq) { - continue - } - - // Metric already present at this timestamp — skip - if _, ok := dat[metricName]; ok { - continue - } - - // Check if this is the closest timestamp so far - if Abs(diff) < minDiff { - minDiff = Abs(diff) - closestTS = ts - found = true - } - } - - if found { - l.data[closestTS][metricName] = value - } -} - -func GetAvroStore() *AvroStore { - return &avroStore -} - -// Abs returns the absolute value of x. -func Abs(x int64) int64 { - if x < 0 { - return -x - } - return x -} diff --git a/pkg/metricstore/checkpoint.go b/pkg/metricstore/checkpoint.go index b4097ff2..590197e3 100644 --- a/pkg/metricstore/checkpoint.go +++ b/pkg/metricstore/checkpoint.go @@ -6,15 +6,15 @@ // This file implements checkpoint persistence for the in-memory metric store. // // Checkpoints enable graceful restarts by periodically saving in-memory metric -// data to disk in either JSON or Avro format. The checkpoint system: +// data to disk in JSON or binary format. 
The checkpoint system: // // Key Features: // - Periodic background checkpointing via the Checkpointing() worker -// - Two formats: JSON (human-readable) and Avro (compact, efficient) +// - Two format families: JSON (human-readable) and WAL+binary (compact, crash-safe) // - Parallel checkpoint creation and loading using worker pools -// - Hierarchical file organization: checkpoint_dir/cluster/host/timestamp.{json|avro} +// - Hierarchical file organization: checkpoint_dir/cluster/host/timestamp.{json|bin} +// - WAL file: checkpoint_dir/cluster/host/current.wal (append-only, per-entry) // - Only saves unarchived data (archived data is already persisted elsewhere) -// - Automatic format detection and fallback during loading // - GC optimization during loading to prevent excessive heap growth // // Checkpoint Workflow: @@ -27,8 +27,9 @@ // checkpoints/ // cluster1/ // host001/ -// 1234567890.json (timestamp = checkpoint start time) -// 1234567950.json +// 1234567890.json (JSON format: full subtree snapshot) +// 1234567890.bin (binary format: full subtree snapshot) +// current.wal (WAL format: append-only per-entry log) // host002/ // ... package metricstore @@ -52,7 +53,6 @@ import ( cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger" "github.com/ClusterCockpit/cc-lib/v2/schema" - "github.com/linkedin/goavro/v2" ) const ( @@ -86,47 +86,58 @@ var ( // Checkpointing starts a background worker that periodically saves metric data to disk. // -// The behavior depends on the configured file format: -// - JSON: Periodic checkpointing based on Keys.Checkpoints.Interval -// - Avro: Initial delay + periodic checkpointing at DefaultAvroCheckpointInterval -// -// The worker respects context cancellation and signals completion via the WaitGroup. +// Format behaviour: +// - "json": Periodic checkpointing based on Keys.Checkpoints.Interval +// - "wal": Periodic binary snapshots + WAL rotation at Keys.Checkpoints.Interval func Checkpointing(wg *sync.WaitGroup, ctx context.Context) { lastCheckpointMu.Lock() lastCheckpoint = time.Now() lastCheckpointMu.Unlock() - if Keys.Checkpoints.FileFormat == "json" { - ms := GetMemoryStore() + ms := GetMemoryStore() - wg.Add(1) - go func() { - defer wg.Done() - d, err := time.ParseDuration(Keys.Checkpoints.Interval) - if err != nil { - cclog.Fatalf("[METRICSTORE]> invalid checkpoint interval '%s': %s", Keys.Checkpoints.Interval, err.Error()) - } - if d <= 0 { - cclog.Warnf("[METRICSTORE]> checkpoint interval is zero or negative (%s), checkpointing disabled", d) + wg.Add(1) + go func() { + defer wg.Done() + + d, err := time.ParseDuration(Keys.Checkpoints.Interval) + if err != nil { + cclog.Fatalf("[METRICSTORE]> invalid checkpoint interval '%s': %s", Keys.Checkpoints.Interval, err.Error()) + } + if d <= 0 { + cclog.Warnf("[METRICSTORE]> checkpoint interval is zero or negative (%s), checkpointing disabled", d) + return + } + + ticker := time.NewTicker(d) + defer ticker.Stop() + + for { + select { + case <-ctx.Done(): return - } + case <-ticker.C: + lastCheckpointMu.Lock() + from := lastCheckpoint + lastCheckpointMu.Unlock() - ticker := time.NewTicker(d) - defer ticker.Stop() + now := time.Now() + cclog.Infof("[METRICSTORE]> start checkpointing (starting at %s)...", from.Format(time.RFC3339)) - for { - select { - case <-ctx.Done(): - return - case <-ticker.C: - lastCheckpointMu.Lock() - from := lastCheckpoint - lastCheckpointMu.Unlock() - - cclog.Infof("[METRICSTORE]> start checkpointing (starting at %s)...", from.Format(time.RFC3339)) - now := time.Now() - n, err := 
ms.ToCheckpoint(Keys.Checkpoints.RootDir, - from.Unix(), now.Unix()) + if Keys.Checkpoints.FileFormat == "wal" { + n, hostDirs, err := ms.ToCheckpointWAL(Keys.Checkpoints.RootDir, from.Unix(), now.Unix()) + if err != nil { + cclog.Errorf("[METRICSTORE]> binary checkpointing failed: %s", err.Error()) + } else { + cclog.Infof("[METRICSTORE]> done: %d binary snapshot files created", n) + lastCheckpointMu.Lock() + lastCheckpoint = now + lastCheckpointMu.Unlock() + // Rotate WAL files for successfully checkpointed hosts. + RotateWALFiles(hostDirs) + } + } else { + n, err := ms.ToCheckpoint(Keys.Checkpoints.RootDir, from.Unix(), now.Unix()) if err != nil { cclog.Errorf("[METRICSTORE]> checkpointing failed: %s", err.Error()) } else { @@ -137,32 +148,8 @@ func Checkpointing(wg *sync.WaitGroup, ctx context.Context) { } } } - }() - } else { - wg.Add(1) - go func() { - defer wg.Done() - - select { - case <-ctx.Done(): - return - case <-time.After(time.Duration(CheckpointBufferMinutes) * time.Minute): - GetAvroStore().ToCheckpoint(Keys.Checkpoints.RootDir, false) - } - - ticker := time.NewTicker(DefaultAvroCheckpointInterval) - defer ticker.Stop() - - for { - select { - case <-ctx.Done(): - return - case <-ticker.C: - GetAvroStore().ToCheckpoint(Keys.Checkpoints.RootDir, false) - } - } - }() - } + } + }() } // MarshalJSON provides optimized JSON encoding for CheckpointMetrics. @@ -190,7 +177,7 @@ func (cm *CheckpointMetrics) MarshalJSON() ([]byte, error) { return buf, nil } -// ToCheckpoint writes metric data to checkpoint files in parallel. +// ToCheckpoint writes metric data to checkpoint files in parallel (JSON format). // // Metrics at root and cluster levels are skipped. One file per host is created. // Uses worker pool (Keys.NumWorkers) for parallel processing. Only locks one host @@ -378,7 +365,6 @@ func enqueueCheckpointHosts(dir string, work chan<- [2]string) error { return err } - gcCounter := 0 for _, clusterDir := range clustersDir { if !clusterDir.IsDir() { return errors.New("[METRICSTORE]> expected only directories at first level of checkpoints/ directory") @@ -394,16 +380,6 @@ func enqueueCheckpointHosts(dir string, work chan<- [2]string) error { return errors.New("[METRICSTORE]> expected only directories at second level of checkpoints/ directory") } - gcCounter++ - // if gcCounter%GCTriggerInterval == 0 { - // Forcing garbage collection runs here regulary during the loading of checkpoints - // will decrease the total heap size after loading everything back to memory is done. - // While loading data, the heap will grow fast, so the GC target size will double - // almost always. By forcing GCs here, we can keep it growing more slowly so that - // at the end, less memory is wasted. - // runtime.GC() - // } - work <- [2]string{clusterDir.Name(), hostDir.Name()} } } @@ -413,8 +389,8 @@ func enqueueCheckpointHosts(dir string, work chan<- [2]string) error { // FromCheckpoint loads checkpoint files from disk into memory in parallel. // -// Uses worker pool to load cluster/host combinations. Periodically triggers GC -// to prevent excessive heap growth. Returns number of files loaded and any errors. +// Uses worker pool to load cluster/host combinations. Returns number of files +// loaded and any errors. 
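+// Missing or empty checkpoint directories are not treated as errors and
+// simply yield zero loaded files.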
func (m *MemoryStore) FromCheckpoint(dir string, from int64) (int, error) { var wg sync.WaitGroup work := make(chan [2]string, Keys.NumWorkers*4) @@ -452,13 +428,11 @@ func (m *MemoryStore) FromCheckpoint(dir string, from int64) (int, error) { // FromCheckpointFiles is the main entry point for loading checkpoints at startup. // -// Automatically detects checkpoint format (JSON vs Avro) and falls back if needed. // Creates checkpoint directory if it doesn't exist. This function must be called // before any writes or reads, and can only be called once. func (m *MemoryStore) FromCheckpointFiles(dir string, from int64) (int, error) { if _, err := os.Stat(dir); os.IsNotExist(err) { - // The directory does not exist, so create it using os.MkdirAll() - err := os.MkdirAll(dir, CheckpointDirPerms) // CheckpointDirPerms sets the permissions for the directory + err := os.MkdirAll(dir, CheckpointDirPerms) if err != nil { cclog.Fatalf("[METRICSTORE]> Error creating directory: %#v\n", err) } @@ -468,146 +442,6 @@ func (m *MemoryStore) FromCheckpointFiles(dir string, from int64) (int, error) { return m.FromCheckpoint(dir, from) } -func (l *Level) loadAvroFile(m *MemoryStore, f *os.File, from int64) error { - br := bufio.NewReader(f) - - fileName := f.Name()[strings.LastIndex(f.Name(), "/")+1:] - resolution, err := strconv.ParseInt(fileName[0:strings.Index(fileName, "_")], 10, 64) - if err != nil { - return fmt.Errorf("[METRICSTORE]> error while reading avro file (resolution parsing) : %s", err) - } - - fromTimestamp, err := strconv.ParseInt(fileName[strings.Index(fileName, "_")+1:len(fileName)-5], 10, 64) - - // Same logic according to lineprotocol - fromTimestamp -= (resolution / 2) - - if err != nil { - return fmt.Errorf("[METRICSTORE]> error converting timestamp from the avro file : %s", err) - } - - // fmt.Printf("File : %s with resolution : %d\n", fileName, resolution) - - var recordCounter int64 = 0 - - // Create a new OCF reader from the buffered reader - ocfReader, err := goavro.NewOCFReader(br) - if err != nil { - return fmt.Errorf("[METRICSTORE]> error creating OCF reader: %w", err) - } - - metricsData := make(map[string]schema.FloatArray) - - for ocfReader.Scan() { - datum, err := ocfReader.Read() - if err != nil { - return fmt.Errorf("[METRICSTORE]> error while reading avro file : %s", err) - } - - record, ok := datum.(map[string]any) - if !ok { - return fmt.Errorf("[METRICSTORE]> failed to assert datum as map[string]interface{}") - } - - for key, value := range record { - metricsData[key] = append(metricsData[key], schema.ConvertToFloat(value.(float64))) - } - - recordCounter += 1 - } - - to := (fromTimestamp + (recordCounter / (60 / resolution) * 60)) - if to < from { - return nil - } - - for key, floatArray := range metricsData { - metricName := ReplaceKey(key) - - if strings.Contains(metricName, SelectorDelimiter) { - subString := strings.Split(metricName, SelectorDelimiter) - - lvl := l - - for i := 0; i < len(subString)-1; i++ { - - sel := subString[i] - - if lvl.children == nil { - lvl.children = make(map[string]*Level) - } - - child, ok := lvl.children[sel] - if !ok { - child = &Level{ - metrics: make([]*buffer, len(m.Metrics)), - children: nil, - } - lvl.children[sel] = child - } - lvl = child - } - - leafMetricName := subString[len(subString)-1] - err = lvl.createBuffer(m, leafMetricName, floatArray, fromTimestamp, resolution) - if err != nil { - return fmt.Errorf("[METRICSTORE]> error while creating buffers from avroReader : %s", err) - } - } else { - err = l.createBuffer(m, 
metricName, floatArray, fromTimestamp, resolution) - if err != nil { - return fmt.Errorf("[METRICSTORE]> error while creating buffers from avroReader : %s", err) - } - } - - } - - return nil -} - -func (l *Level) createBuffer(m *MemoryStore, metricName string, floatArray schema.FloatArray, from int64, resolution int64) error { - n := len(floatArray) - b := &buffer{ - frequency: resolution, - start: from, - data: floatArray[0:n:n], - prev: nil, - next: nil, - archived: true, - } - - minfo, ok := m.Metrics[metricName] - if !ok { - return nil - } - - prev := l.metrics[minfo.offset] - if prev == nil { - l.metrics[minfo.offset] = b - } else { - if prev.start > b.start { - return fmt.Errorf("[METRICSTORE]> buffer start time %d is before previous buffer start %d", b.start, prev.start) - } - - b.prev = prev - prev.next = b - - missingCount := ((int(b.start) - int(prev.start)) - len(prev.data)*int(b.frequency)) - if missingCount > 0 { - missingCount /= int(b.frequency) - - for range missingCount { - prev.data = append(prev.data, schema.NaN) - } - - prev.data = prev.data[0:len(prev.data):len(prev.data)] - } - } - l.metrics[minfo.offset] = b - - return nil -} - func (l *Level) loadJSONFile(m *MemoryStore, f *os.File, from int64) error { br := bufio.NewReader(f) cf := &CheckpointFile{} @@ -679,37 +513,37 @@ func (l *Level) loadFile(cf *CheckpointFile, m *MemoryStore) error { return nil } +// fromCheckpoint loads all checkpoint files (JSON, binary snapshot, WAL) for a +// single host directory. Snapshot files are loaded first (sorted by timestamp), +// then current.wal is replayed on top. func (l *Level) fromCheckpoint(m *MemoryStore, dir string, from int64) (int, error) { direntries, err := os.ReadDir(dir) if err != nil { if os.IsNotExist(err) { return 0, nil } - return 0, err } allFiles := make([]fs.DirEntry, 0) + var walEntry fs.DirEntry filesLoaded := 0 + for _, e := range direntries { if e.IsDir() { - child := &Level{ - metrics: make([]*buffer, len(m.Metrics)), - children: make(map[string]*Level), - } - - files, err := child.fromCheckpoint(m, path.Join(dir, e.Name()), from) - filesLoaded += files - if err != nil { - return filesLoaded, err - } - - l.children[e.Name()] = child - } else if strings.HasSuffix(e.Name(), ".json") || strings.HasSuffix(e.Name(), ".avro") { - allFiles = append(allFiles, e) - } else { + // Legacy: skip subdirectories (only used by old Avro format). + // These are ignored; their data is not loaded. + cclog.Debugf("[METRICSTORE]> skipping subdirectory %s in checkpoint dir %s", e.Name(), dir) continue } + + name := e.Name() + if strings.HasSuffix(name, ".json") || strings.HasSuffix(name, ".bin") { + allFiles = append(allFiles, e) + } else if name == "current.wal" { + walEntry = e + } + // Silently ignore other files (e.g., .tmp, .bin.tmp from interrupted writes). 
} files, err := findFiles(allFiles, from, true) @@ -719,54 +553,81 @@ func (l *Level) fromCheckpoint(m *MemoryStore, dir string, from int64) (int, err loaders := map[string]func(*MemoryStore, *os.File, int64) error{ ".json": l.loadJSONFile, - ".avro": l.loadAvroFile, + ".bin": l.loadBinaryFile, } for _, filename := range files { ext := filepath.Ext(filename) loader := loaders[ext] if loader == nil { - cclog.Warnf("Unknown extension for file %s", filename) + cclog.Warnf("[METRICSTORE]> unknown extension for checkpoint file %s", filename) continue } - // Use a closure to ensure file is closed immediately after use err := func() error { f, err := os.Open(path.Join(dir, filename)) if err != nil { return err } defer f.Close() - return loader(m, f, from) }() if err != nil { return filesLoaded, err } + filesLoaded++ + } - filesLoaded += 1 + // Replay WAL after all snapshot files so it fills in data since the last snapshot. + if walEntry != nil { + err := func() error { + f, err := os.Open(path.Join(dir, walEntry.Name())) + if err != nil { + return err + } + defer f.Close() + return l.loadWALFile(m, f, from) + }() + if err != nil { + // WAL errors are non-fatal: the snapshot already loaded the bulk of data. + cclog.Warnf("[METRICSTORE]> WAL replay error for %s: %v (data since last snapshot may be missing)", dir, err) + } else { + filesLoaded++ + } } return filesLoaded, nil } -// This will probably get very slow over time! -// A solution could be some sort of an index file in which all other files -// and the timespan they contain is listed. -// NOTE: This now assumes that you have distinct timestamps for json and avro files -// Also, it assumes that the timestamps are not overlapping/self-modified. +// parseTimestampFromFilename extracts a Unix timestamp from a checkpoint filename. +// Supports ".json" (format: ".json") and ".bin" (format: ".bin"). +func parseTimestampFromFilename(name string) (int64, error) { + switch { + case strings.HasSuffix(name, ".json"): + return strconv.ParseInt(name[:len(name)-5], 10, 64) + case strings.HasSuffix(name, ".bin"): + return strconv.ParseInt(name[:len(name)-4], 10, 64) + default: + return 0, fmt.Errorf("unknown checkpoint extension for file %q", name) + } +} + +// findFiles returns filenames from direntries whose timestamps satisfy the filter. +// If findMoreRecentFiles is true, returns files with timestamps >= t (plus the +// last file before t if t falls between two files). 
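+//
+// For example, given (hypothetical) snapshots {100.bin, 200.bin, 300.bin} and
+// t = 250 with findMoreRecentFiles set, the result is {200.bin, 300.bin}:
+// 200.bin is kept because it may still cover samples taken shortly before t.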
func findFiles(direntries []fs.DirEntry, t int64, findMoreRecentFiles bool) ([]string, error) { nums := map[string]int64{} for _, e := range direntries { - if !strings.HasSuffix(e.Name(), ".json") && !strings.HasSuffix(e.Name(), ".avro") { + name := e.Name() + if !strings.HasSuffix(name, ".json") && !strings.HasSuffix(name, ".bin") { continue } - ts, err := strconv.ParseInt(e.Name()[strings.Index(e.Name(), "_")+1:len(e.Name())-5], 10, 64) + ts, err := parseTimestampFromFilename(name) if err != nil { return nil, err } - nums[e.Name()] = ts + nums[name] = ts } sort.Slice(direntries, func(i, j int) bool { @@ -783,16 +644,12 @@ func findFiles(direntries []fs.DirEntry, t int64, findMoreRecentFiles bool) ([]s for i, e := range direntries { ts1 := nums[e.Name()] - // Logic to look for files in forward or direction - // If logic: All files greater than or after - // the given timestamp will be selected - // Else If logic: All files less than or before - // the given timestamp will be selected if findMoreRecentFiles && t <= ts1 { filenames = append(filenames, e.Name()) } else if !findMoreRecentFiles && ts1 <= t && ts1 != 0 { filenames = append(filenames, e.Name()) } + if i == len(direntries)-1 { continue } diff --git a/pkg/metricstore/config.go b/pkg/metricstore/config.go index 1efee61a..53716967 100644 --- a/pkg/metricstore/config.go +++ b/pkg/metricstore/config.go @@ -14,7 +14,7 @@ // ├─ RetentionInMemory: How long to keep data in RAM // ├─ MemoryCap: Memory limit in bytes (triggers forceFree) // ├─ Checkpoints: Persistence configuration -// │ ├─ FileFormat: "avro" or "json" +// │ ├─ FileFormat: "json" or "wal" // │ ├─ Interval: How often to save (e.g., "1h") // │ └─ RootDir: Checkpoint storage path // ├─ Cleanup: Long-term storage configuration @@ -55,16 +55,13 @@ const ( DefaultMaxWorkers = 10 DefaultBufferCapacity = 512 DefaultGCTriggerInterval = 100 - DefaultAvroWorkers = 4 - DefaultCheckpointBufferMin = 3 - DefaultAvroCheckpointInterval = time.Minute DefaultMemoryUsageTrackerInterval = 1 * time.Hour ) // Checkpoints configures periodic persistence of in-memory metric data. // // Fields: -// - FileFormat: "avro" (default, binary, compact) or "json" (human-readable, slower) +// - FileFormat: "json" (human-readable, periodic) or "wal" (binary snapshot + WAL, crash-safe) // - Interval: Duration string (e.g., "1h", "30m") between checkpoint saves // - RootDir: Filesystem path for checkpoint files (created if missing) type Checkpoints struct { diff --git a/pkg/metricstore/configSchema.go b/pkg/metricstore/configSchema.go index 6a748be0..67f30976 100644 --- a/pkg/metricstore/configSchema.go +++ b/pkg/metricstore/configSchema.go @@ -18,7 +18,7 @@ const configSchema = `{ "type": "object", "properties": { "file-format": { - "description": "Specify the format for checkpoint files. There are 2 variants: 'avro' and 'json'. If nothing is specified, 'avro' is default.", + "description": "Specify the format for checkpoint files. Two variants: 'json' (human-readable, periodic) and 'wal' (binary snapshot + Write-Ahead Log, crash-safe). 
Default is 'json'.", "type": "string" }, "interval": { diff --git a/pkg/metricstore/lineprotocol.go b/pkg/metricstore/lineprotocol.go index bfbbef2d..1e04bba0 100644 --- a/pkg/metricstore/lineprotocol.go +++ b/pkg/metricstore/lineprotocol.go @@ -244,8 +244,8 @@ func DecodeLine(dec *lineprotocol.Decoder, time := t.Unix() - if Keys.Checkpoints.FileFormat != "json" { - LineProtocolMessages <- &AvroStruct{ + if Keys.Checkpoints.FileFormat == "wal" { + WALMessages <- &WALMessage{ MetricName: string(metricBuf), Cluster: cluster, Node: host, diff --git a/pkg/metricstore/metricstore.go b/pkg/metricstore/metricstore.go index 789c6d07..3fe64d55 100644 --- a/pkg/metricstore/metricstore.go +++ b/pkg/metricstore/metricstore.go @@ -172,7 +172,7 @@ func Init(rawConfig json.RawMessage, metrics map[string]MetricConfig, wg *sync.W Retention(wg, ctx) Checkpointing(wg, ctx) CleanUp(wg, ctx) - DataStaging(wg, ctx) + WALStaging(wg, ctx) MemoryUsageTracker(wg, ctx) // Note: Signal handling has been removed from this function. @@ -264,7 +264,7 @@ func (ms *MemoryStore) SetNodeProvider(provider NodeProvider) { // // The function will: // 1. Cancel the context to stop all background workers -// 2. Close NATS message channels if using Avro format +// 2. Close the WAL messages channel if using WAL format // 3. Write a final checkpoint to preserve in-memory data // 4. Log any errors encountered during shutdown // @@ -276,8 +276,8 @@ func Shutdown() { shutdownFunc() } - if Keys.Checkpoints.FileFormat != "json" { - close(LineProtocolMessages) + if Keys.Checkpoints.FileFormat == "wal" { + close(WALMessages) } cclog.Infof("[METRICSTORE]> Writing to '%s'...\n", Keys.Checkpoints.RootDir) @@ -286,10 +286,18 @@ func Shutdown() { ms := GetMemoryStore() - if Keys.Checkpoints.FileFormat == "json" { - files, err = ms.ToCheckpoint(Keys.Checkpoints.RootDir, lastCheckpoint.Unix(), time.Now().Unix()) + lastCheckpointMu.Lock() + from := lastCheckpoint + lastCheckpointMu.Unlock() + + if Keys.Checkpoints.FileFormat == "wal" { + var hostDirs []string + files, hostDirs, err = ms.ToCheckpointWAL(Keys.Checkpoints.RootDir, from.Unix(), time.Now().Unix()) + if err == nil { + RotateWALFiles(hostDirs) + } } else { - files, err = GetAvroStore().ToCheckpoint(Keys.Checkpoints.RootDir, true) + files, err = ms.ToCheckpoint(Keys.Checkpoints.RootDir, from.Unix(), time.Now().Unix()) } if err != nil { diff --git a/pkg/metricstore/walCheckpoint.go b/pkg/metricstore/walCheckpoint.go new file mode 100644 index 00000000..e8a71ce2 --- /dev/null +++ b/pkg/metricstore/walCheckpoint.go @@ -0,0 +1,787 @@ +// Copyright (C) NHR@FAU, University Erlangen-Nuremberg. +// All rights reserved. This file is part of cc-backend. +// Use of this source code is governed by a MIT-style +// license that can be found in the LICENSE file. + +// Package metricstore provides walCheckpoint.go: WAL-based checkpoint implementation. +// +// This replaces the Avro shadow tree with an append-only Write-Ahead Log (WAL) +// per host, eliminating the extra memory overhead of the AvroStore and providing +// truly continuous (per-write) crash safety. +// +// # Architecture +// +// Metric write (DecodeLine) +// │ +// ├─► WriteToLevel() → main MemoryStore (unchanged) +// │ +// └─► WALMessages channel +// │ +// ▼ +// WALStaging goroutine +// │ +// ▼ +// checkpoints/cluster/host/current.wal (append-only, binary) +// +// Periodic checkpoint (Checkpointing goroutine): +// 1. Write .bin snapshot (column-oriented, from main tree) +// 2. 
Signal WALStaging to truncate current.wal per host +// +// On restart (FromCheckpoint): +// 1. Load most recent .bin snapshot +// 2. Replay current.wal (overwrite-safe: buffer.write handles duplicate timestamps) +// +// # WAL Record Format +// +// [4B magic 0xCC1DA7A1][4B payload_len][payload][4B CRC32] +// +// payload: +// [8B timestamp int64] +// [2B metric_name_len uint16][N metric name bytes] +// [1B selector_count uint8] +// per selector: [1B selector_len uint8][M selector bytes] +// [4B value float32 bits] +// +// # Binary Snapshot Format +// +// [4B magic 0xCC5B0001][8B from int64][8B to int64] +// Level tree (recursive): +// [4B num_metrics uint32] +// per metric: +// [2B name_len uint16][N name bytes] +// [8B frequency int64][8B start int64] +// [4B num_values uint32][num_values × 4B float32] +// [4B num_children uint32] +// per child: [2B name_len uint16][N name bytes] + Level (recursive) +package metricstore + +import ( + "bufio" + "context" + "encoding/binary" + "fmt" + "hash/crc32" + "io" + "math" + "os" + "path" + "sync" + "sync/atomic" + + cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger" + "github.com/ClusterCockpit/cc-lib/v2/schema" +) + +// Magic numbers for binary formats. +const ( + walFileMagic = uint32(0xCC1DA701) // WAL file header magic + walRecordMagic = uint32(0xCC1DA7A1) // WAL record magic + snapFileMagic = uint32(0xCC5B0001) // Binary snapshot magic +) + +// WALMessages is the channel for sending metric writes to the WAL staging goroutine. +// Buffered to allow burst writes without blocking the metric ingestion path. +var WALMessages = make(chan *WALMessage, 4096) + +// walRotateCh is used by the checkpoint goroutine to request WAL file rotation +// (close, delete, reopen) after a binary snapshot has been written. +var walRotateCh = make(chan walRotateReq, 256) + +// WALMessage represents a single metric write to be appended to the WAL. +// Cluster and Node are NOT stored in the WAL record (inferred from file path). +type WALMessage struct { + MetricName string + Cluster string + Node string + Selector []string + Value schema.Float + Timestamp int64 +} + +// walRotateReq requests WAL file rotation for a specific host directory. +// The done channel is closed by the WAL goroutine when rotation is complete. +type walRotateReq struct { + hostDir string + done chan struct{} +} + +// walFileState holds an open WAL file handle for one host directory. +type walFileState struct { + f *os.File +} + +// WALStaging starts a background goroutine that receives WALMessage items +// and appends binary WAL records to per-host current.wal files. +// Also handles WAL rotation requests from the checkpoint goroutine. +func WALStaging(wg *sync.WaitGroup, ctx context.Context) { + wg.Add(1) + go func() { + defer wg.Done() + + if Keys.Checkpoints.FileFormat == "json" { + return + } + + hostFiles := make(map[string]*walFileState) + + defer func() { + for _, ws := range hostFiles { + if ws.f != nil { + ws.f.Close() + } + } + }() + + getOrOpenWAL := func(hostDir string) *os.File { + ws, ok := hostFiles[hostDir] + if ok { + return ws.f + } + + if err := os.MkdirAll(hostDir, CheckpointDirPerms); err != nil { + cclog.Errorf("[METRICSTORE]> WAL: mkdir %s: %v", hostDir, err) + return nil + } + + walPath := path.Join(hostDir, "current.wal") + f, err := os.OpenFile(walPath, os.O_CREATE|os.O_APPEND|os.O_WRONLY, CheckpointFilePerms) + if err != nil { + cclog.Errorf("[METRICSTORE]> WAL: open %s: %v", walPath, err) + return nil + } + + // Write file header magic if file is new (empty). 
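+			// The header magic lets loadWALFile reject files that were not
+			// produced by this writer before any records are replayed.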
+ info, err := f.Stat() + if err == nil && info.Size() == 0 { + var hdr [4]byte + binary.LittleEndian.PutUint32(hdr[:], walFileMagic) + if _, err := f.Write(hdr[:]); err != nil { + cclog.Errorf("[METRICSTORE]> WAL: write header %s: %v", walPath, err) + f.Close() + return nil + } + } + + hostFiles[hostDir] = &walFileState{f: f} + return f + } + + processMsg := func(msg *WALMessage) { + hostDir := path.Join(Keys.Checkpoints.RootDir, msg.Cluster, msg.Node) + f := getOrOpenWAL(hostDir) + if f == nil { + return + } + if err := writeWALRecord(f, msg); err != nil { + cclog.Errorf("[METRICSTORE]> WAL: write record: %v", err) + } + } + + processRotate := func(req walRotateReq) { + ws, ok := hostFiles[req.hostDir] + if ok && ws.f != nil { + ws.f.Close() + walPath := path.Join(req.hostDir, "current.wal") + if err := os.Remove(walPath); err != nil && !os.IsNotExist(err) { + cclog.Errorf("[METRICSTORE]> WAL: remove %s: %v", walPath, err) + } + delete(hostFiles, req.hostDir) + } + close(req.done) + } + + drain := func() { + for { + select { + case msg, ok := <-WALMessages: + if !ok { + return + } + processMsg(msg) + case req := <-walRotateCh: + processRotate(req) + default: + return + } + } + } + + for { + select { + case <-ctx.Done(): + drain() + return + case msg, ok := <-WALMessages: + if !ok { + return + } + processMsg(msg) + case req := <-walRotateCh: + processRotate(req) + } + } + }() +} + +// RotateWALFiles sends rotation requests for the given host directories +// and blocks until all rotations complete. +func RotateWALFiles(hostDirs []string) { + dones := make([]chan struct{}, len(hostDirs)) + for i, dir := range hostDirs { + dones[i] = make(chan struct{}) + walRotateCh <- walRotateReq{hostDir: dir, done: dones[i]} + } + for _, done := range dones { + <-done + } +} + +// buildWALPayload encodes a WALMessage into a binary payload (without magic/length/CRC). +func buildWALPayload(msg *WALMessage) []byte { + size := 8 + 2 + len(msg.MetricName) + 1 + 4 + for _, s := range msg.Selector { + size += 1 + len(s) + } + + buf := make([]byte, 0, size) + + // Timestamp (8 bytes, little-endian int64) + var ts [8]byte + binary.LittleEndian.PutUint64(ts[:], uint64(msg.Timestamp)) + buf = append(buf, ts[:]...) + + // Metric name (2-byte length prefix + bytes) + var mLen [2]byte + binary.LittleEndian.PutUint16(mLen[:], uint16(len(msg.MetricName))) + buf = append(buf, mLen[:]...) + buf = append(buf, msg.MetricName...) + + // Selector count (1 byte) + buf = append(buf, byte(len(msg.Selector))) + + // Selectors (1-byte length prefix + bytes each) + for _, sel := range msg.Selector { + buf = append(buf, byte(len(sel))) + buf = append(buf, sel...) + } + + // Value (4 bytes, float32 bit representation) + var val [4]byte + binary.LittleEndian.PutUint32(val[:], math.Float32bits(float32(msg.Value))) + buf = append(buf, val[:]...) + + return buf +} + +// writeWALRecord appends a binary WAL record to the file. +// Format: [4B magic][4B payload_len][payload][4B CRC32] +func writeWALRecord(f *os.File, msg *WALMessage) error { + payload := buildWALPayload(msg) + crc := crc32.ChecksumIEEE(payload) + + record := make([]byte, 0, 4+4+len(payload)+4) + + var magic [4]byte + binary.LittleEndian.PutUint32(magic[:], walRecordMagic) + record = append(record, magic[:]...) + + var pLen [4]byte + binary.LittleEndian.PutUint32(pLen[:], uint32(len(payload))) + record = append(record, pLen[:]...) + + record = append(record, payload...) 
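+
+	// Note that the CRC covers only the payload: a write torn inside the
+	// payload is caught by the checksum on replay, while damage to the magic
+	// or length fields is caught by readWALRecord's other checks.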
+ + var crcBytes [4]byte + binary.LittleEndian.PutUint32(crcBytes[:], crc) + record = append(record, crcBytes[:]...) + + _, err := f.Write(record) + return err +} + +// readWALRecord reads one WAL record from the reader. +// Returns (nil, nil) on clean EOF. Returns error on data corruption. +// A CRC mismatch indicates a truncated trailing record (expected on crash). +func readWALRecord(r io.Reader) (*WALMessage, error) { + var magic uint32 + if err := binary.Read(r, binary.LittleEndian, &magic); err != nil { + if err == io.EOF { + return nil, nil // Clean EOF + } + return nil, fmt.Errorf("read record magic: %w", err) + } + + if magic != walRecordMagic { + return nil, fmt.Errorf("invalid record magic 0x%08X (expected 0x%08X)", magic, walRecordMagic) + } + + var payloadLen uint32 + if err := binary.Read(r, binary.LittleEndian, &payloadLen); err != nil { + return nil, fmt.Errorf("read payload length: %w", err) + } + + if payloadLen > 1<<20 { // 1 MB sanity limit + return nil, fmt.Errorf("record payload too large: %d bytes", payloadLen) + } + + payload := make([]byte, payloadLen) + if _, err := io.ReadFull(r, payload); err != nil { + return nil, fmt.Errorf("read payload: %w", err) + } + + var storedCRC uint32 + if err := binary.Read(r, binary.LittleEndian, &storedCRC); err != nil { + return nil, fmt.Errorf("read CRC: %w", err) + } + + if crc32.ChecksumIEEE(payload) != storedCRC { + return nil, fmt.Errorf("CRC mismatch (truncated write or corruption)") + } + + return parseWALPayload(payload) +} + +// parseWALPayload decodes a binary payload into a WALMessage. +func parseWALPayload(payload []byte) (*WALMessage, error) { + if len(payload) < 8+2+1+4 { + return nil, fmt.Errorf("payload too short: %d bytes", len(payload)) + } + + offset := 0 + + // Timestamp (8 bytes) + ts := int64(binary.LittleEndian.Uint64(payload[offset : offset+8])) + offset += 8 + + // Metric name (2-byte length + bytes) + if offset+2 > len(payload) { + return nil, fmt.Errorf("metric name length overflows payload") + } + mLen := int(binary.LittleEndian.Uint16(payload[offset : offset+2])) + offset += 2 + + if offset+mLen > len(payload) { + return nil, fmt.Errorf("metric name overflows payload") + } + metricName := string(payload[offset : offset+mLen]) + offset += mLen + + // Selector count (1 byte) + if offset >= len(payload) { + return nil, fmt.Errorf("selector count overflows payload") + } + selCount := int(payload[offset]) + offset++ + + selectors := make([]string, selCount) + for i := range selCount { + if offset >= len(payload) { + return nil, fmt.Errorf("selector[%d] length overflows payload", i) + } + sLen := int(payload[offset]) + offset++ + + if offset+sLen > len(payload) { + return nil, fmt.Errorf("selector[%d] data overflows payload", i) + } + selectors[i] = string(payload[offset : offset+sLen]) + offset += sLen + } + + // Value (4 bytes, float32 bits) + if offset+4 > len(payload) { + return nil, fmt.Errorf("value overflows payload") + } + bits := binary.LittleEndian.Uint32(payload[offset : offset+4]) + value := schema.Float(math.Float32frombits(bits)) + + return &WALMessage{ + MetricName: metricName, + Timestamp: ts, + Selector: selectors, + Value: value, + }, nil +} + +// loadWALFile reads a WAL file and replays all valid records into the Level tree. +// l is the host-level node. Corrupt or partial trailing records are silently skipped +// (expected on crash). Records older than 'from' are skipped. 
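+//
+// Recovery sketch, mirroring the restart flow described in the file header
+// (snapshot first, then WAL replay; variable names are illustrative):
+//
+//	lvl.loadBinaryFile(m, snapFile, from) // 1. restore latest .bin snapshot
+//	lvl.loadWALFile(m, walFile, from)     // 2. replay writes made since then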
+func (l *Level) loadWALFile(m *MemoryStore, f *os.File, from int64) error { + br := bufio.NewReader(f) + + // Verify file header magic. + var fileMagic uint32 + if err := binary.Read(br, binary.LittleEndian, &fileMagic); err != nil { + if err == io.EOF { + return nil // Empty file, no data + } + return fmt.Errorf("[METRICSTORE]> WAL: read file header: %w", err) + } + + if fileMagic != walFileMagic { + return fmt.Errorf("[METRICSTORE]> WAL: invalid file magic 0x%08X (expected 0x%08X)", fileMagic, walFileMagic) + } + + // Cache level lookups to avoid repeated tree traversal. + lvlCache := make(map[string]*Level) + + for { + msg, err := readWALRecord(br) + if err != nil { + // Truncated trailing record is expected after a crash; stop replaying. + cclog.Debugf("[METRICSTORE]> WAL: stopping replay at corrupted/partial record: %v", err) + break + } + if msg == nil { + break // Clean EOF + } + + if msg.Timestamp < from { + continue // Older than retention window + } + + minfo, ok := m.Metrics[msg.MetricName] + if !ok { + continue // Unknown metric (config may have changed) + } + + // Cache key is the null-separated selector path. + cacheKey := joinSelector(msg.Selector) + lvl, ok := lvlCache[cacheKey] + if !ok { + lvl = l.findLevelOrCreate(msg.Selector, len(m.Metrics)) + lvlCache[cacheKey] = lvl + } + + // Write directly to the buffer, same as WriteToLevel but without the + // global level lookup (we already have the right level). + lvl.lock.Lock() + b := lvl.metrics[minfo.offset] + if b == nil { + b = newBuffer(msg.Timestamp, minfo.Frequency) + lvl.metrics[minfo.offset] = b + } + nb, writeErr := b.write(msg.Timestamp, msg.Value) + if writeErr == nil && b != nb { + lvl.metrics[minfo.offset] = nb + } + // Ignore write errors for timestamps before buffer start (can happen when + // replaying WAL entries that predate a loaded snapshot's start time). + lvl.lock.Unlock() + } + + return nil +} + +// joinSelector builds a cache key from a selector slice using null bytes as separators. +func joinSelector(sel []string) string { + if len(sel) == 0 { + return "" + } + result := sel[0] + for i := 1; i < len(sel); i++ { + result += "\x00" + sel[i] + } + return result +} + +// ToCheckpointWAL writes binary snapshot files for all hosts in parallel. +// Returns the number of files written, the list of host directories that were +// successfully checkpointed (for WAL rotation), and any errors. +func (m *MemoryStore) ToCheckpointWAL(dir string, from, to int64) (int, []string, error) { + // Collect all cluster/host pairs. 
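+	// Two read-locked passes: the first only counts hosts so the slices
+	// below can be allocated at their final size, the second collects the
+	// host-level nodes together with their cluster/host selectors.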
+ m.root.lock.RLock() + totalHosts := 0 + for _, l1 := range m.root.children { + l1.lock.RLock() + totalHosts += len(l1.children) + l1.lock.RUnlock() + } + m.root.lock.RUnlock() + + levels := make([]*Level, 0, totalHosts) + selectors := make([][]string, 0, totalHosts) + + m.root.lock.RLock() + for sel1, l1 := range m.root.children { + l1.lock.RLock() + for sel2, l2 := range l1.children { + levels = append(levels, l2) + selectors = append(selectors, []string{sel1, sel2}) + } + l1.lock.RUnlock() + } + m.root.lock.RUnlock() + + type workItem struct { + level *Level + hostDir string + selector []string + } + + n, errs := int32(0), int32(0) + var successDirs []string + var successMu sync.Mutex + + var wg sync.WaitGroup + wg.Add(Keys.NumWorkers) + work := make(chan workItem, Keys.NumWorkers*2) + + for range Keys.NumWorkers { + go func() { + defer wg.Done() + for wi := range work { + err := wi.level.toCheckpointBinary(wi.hostDir, from, to, m) + if err != nil { + if err == ErrNoNewArchiveData { + continue + } + cclog.Errorf("[METRICSTORE]> binary checkpoint error for %s: %v", wi.hostDir, err) + atomic.AddInt32(&errs, 1) + } else { + atomic.AddInt32(&n, 1) + successMu.Lock() + successDirs = append(successDirs, wi.hostDir) + successMu.Unlock() + } + } + }() + } + + for i := range levels { + hostDir := path.Join(dir, path.Join(selectors[i]...)) + work <- workItem{ + level: levels[i], + hostDir: hostDir, + selector: selectors[i], + } + } + close(work) + wg.Wait() + + if errs > 0 { + return int(n), successDirs, fmt.Errorf("[METRICSTORE]> %d errors during binary checkpoint (%d successes)", errs, n) + } + return int(n), successDirs, nil +} + +// toCheckpointBinary writes a binary snapshot file for a single host-level node. +// Uses atomic rename (write to .tmp then rename) to avoid partial reads on crash. +func (l *Level) toCheckpointBinary(dir string, from, to int64, m *MemoryStore) error { + cf, err := l.toCheckpointFile(from, to, m) + if err != nil { + return err + } + if cf == nil { + return ErrNoNewArchiveData + } + + if err := os.MkdirAll(dir, CheckpointDirPerms); err != nil { + return fmt.Errorf("mkdir %s: %w", dir, err) + } + + // Write to a temp file first, then rename (atomic on POSIX). + tmpPath := path.Join(dir, fmt.Sprintf("%d.bin.tmp", from)) + finalPath := path.Join(dir, fmt.Sprintf("%d.bin", from)) + + f, err := os.OpenFile(tmpPath, os.O_CREATE|os.O_WRONLY|os.O_TRUNC, CheckpointFilePerms) + if err != nil { + return fmt.Errorf("open binary snapshot %s: %w", tmpPath, err) + } + + bw := bufio.NewWriter(f) + if err := writeBinarySnapshotFile(bw, cf); err != nil { + f.Close() + os.Remove(tmpPath) + return fmt.Errorf("write binary snapshot: %w", err) + } + if err := bw.Flush(); err != nil { + f.Close() + os.Remove(tmpPath) + return err + } + f.Close() + + return os.Rename(tmpPath, finalPath) +} + +// writeBinarySnapshotFile writes the binary snapshot file header and level tree. +func writeBinarySnapshotFile(w io.Writer, cf *CheckpointFile) error { + if err := binary.Write(w, binary.LittleEndian, snapFileMagic); err != nil { + return err + } + if err := binary.Write(w, binary.LittleEndian, cf.From); err != nil { + return err + } + if err := binary.Write(w, binary.LittleEndian, cf.To); err != nil { + return err + } + return writeBinaryLevel(w, cf) +} + +// writeBinaryLevel recursively writes a CheckpointFile level in binary format. 
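+//
+// Size example: a level holding one metric "flops" with 3 values and no
+// children occupies 4 + (2+5) + 8 + 8 + 4 + 3*4 + 4 = 47 bytes.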
+func writeBinaryLevel(w io.Writer, cf *CheckpointFile) error { + if err := binary.Write(w, binary.LittleEndian, uint32(len(cf.Metrics))); err != nil { + return err + } + + for name, metric := range cf.Metrics { + if err := writeString16(w, name); err != nil { + return err + } + if err := binary.Write(w, binary.LittleEndian, metric.Frequency); err != nil { + return err + } + if err := binary.Write(w, binary.LittleEndian, metric.Start); err != nil { + return err + } + if err := binary.Write(w, binary.LittleEndian, uint32(len(metric.Data))); err != nil { + return err + } + for _, v := range metric.Data { + if err := binary.Write(w, binary.LittleEndian, math.Float32bits(float32(v))); err != nil { + return err + } + } + } + + if err := binary.Write(w, binary.LittleEndian, uint32(len(cf.Children))); err != nil { + return err + } + + for name, child := range cf.Children { + if err := writeString16(w, name); err != nil { + return err + } + if err := writeBinaryLevel(w, child); err != nil { + return err + } + } + + return nil +} + +// writeString16 writes a 2-byte length-prefixed string to w. +func writeString16(w io.Writer, s string) error { + if err := binary.Write(w, binary.LittleEndian, uint16(len(s))); err != nil { + return err + } + _, err := io.WriteString(w, s) + return err +} + +// loadBinaryFile reads a binary snapshot file and loads data into the Level tree. +// The retention check (from) is applied to the file's 'to' timestamp. +func (l *Level) loadBinaryFile(m *MemoryStore, f *os.File, from int64) error { + br := bufio.NewReader(f) + + var magic uint32 + if err := binary.Read(br, binary.LittleEndian, &magic); err != nil { + return fmt.Errorf("[METRICSTORE]> binary snapshot: read magic: %w", err) + } + if magic != snapFileMagic { + return fmt.Errorf("[METRICSTORE]> binary snapshot: invalid magic 0x%08X (expected 0x%08X)", magic, snapFileMagic) + } + + var fileFrom, fileTo int64 + if err := binary.Read(br, binary.LittleEndian, &fileFrom); err != nil { + return fmt.Errorf("[METRICSTORE]> binary snapshot: read from: %w", err) + } + if err := binary.Read(br, binary.LittleEndian, &fileTo); err != nil { + return fmt.Errorf("[METRICSTORE]> binary snapshot: read to: %w", err) + } + + if fileTo != 0 && fileTo < from { + return nil // File is older than retention window, skip it + } + + cf, err := readBinaryLevel(br) + if err != nil { + return fmt.Errorf("[METRICSTORE]> binary snapshot: read level tree: %w", err) + } + cf.From = fileFrom + cf.To = fileTo + + return l.loadFile(cf, m) +} + +// readBinaryLevel recursively reads a level from the binary snapshot format. 
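+// It is the exact inverse of writeBinaryLevel; malformed or truncated input
+// surfaces as an error from the failing read rather than a panic.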
+func readBinaryLevel(r io.Reader) (*CheckpointFile, error) {
+	cf := &CheckpointFile{
+		Metrics:  make(map[string]*CheckpointMetrics),
+		Children: make(map[string]*CheckpointFile),
+	}
+
+	var numMetrics uint32
+	if err := binary.Read(r, binary.LittleEndian, &numMetrics); err != nil {
+		return nil, fmt.Errorf("read num_metrics: %w", err)
+	}
+
+	for range numMetrics {
+		name, err := readString16(r)
+		if err != nil {
+			return nil, fmt.Errorf("read metric name: %w", err)
+		}
+
+		var freq, start int64
+		if err := binary.Read(r, binary.LittleEndian, &freq); err != nil {
+			return nil, fmt.Errorf("read frequency for %s: %w", name, err)
+		}
+		if err := binary.Read(r, binary.LittleEndian, &start); err != nil {
+			return nil, fmt.Errorf("read start for %s: %w", name, err)
+		}
+
+		var numValues uint32
+		if err := binary.Read(r, binary.LittleEndian, &numValues); err != nil {
+			return nil, fmt.Errorf("read num_values for %s: %w", name, err)
+		}
+
+		data := make([]schema.Float, numValues)
+		for i := range numValues {
+			var bits uint32
+			if err := binary.Read(r, binary.LittleEndian, &bits); err != nil {
+				return nil, fmt.Errorf("read value[%d] for %s: %w", i, name, err)
+			}
+			data[i] = schema.Float(math.Float32frombits(bits))
+		}
+
+		cf.Metrics[name] = &CheckpointMetrics{
+			Frequency: freq,
+			Start:     start,
+			Data:      data,
+		}
+	}
+
+	var numChildren uint32
+	if err := binary.Read(r, binary.LittleEndian, &numChildren); err != nil {
+		return nil, fmt.Errorf("read num_children: %w", err)
+	}
+
+	for range numChildren {
+		childName, err := readString16(r)
+		if err != nil {
+			return nil, fmt.Errorf("read child name: %w", err)
+		}
+
+		child, err := readBinaryLevel(r)
+		if err != nil {
+			return nil, fmt.Errorf("read child %s: %w", childName, err)
+		}
+		cf.Children[childName] = child
+	}
+
+	return cf, nil
+}
+
+// readString16 reads a 2-byte length-prefixed string from r.
+func readString16(r io.Reader) (string, error) {
+	var sLen uint16
+	if err := binary.Read(r, binary.LittleEndian, &sLen); err != nil {
+		return "", err
+	}
+	buf := make([]byte, sLen)
+	if _, err := io.ReadFull(r, buf); err != nil {
+		return "", err
+	}
+	return string(buf), nil
+}

From 348b6010e8eeeb4c80d9c1c4b5a1e737374dd229 Mon Sep 17 00:00:00 2001
From: Christoph Kluge
Date: Thu, 26 Feb 2026 15:09:01 +0100
Subject: [PATCH 05/20] fix typo preventing template condition from working

---
 internal/auth/auth.go    | 2 +-
 web/templates/login.tmpl | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/internal/auth/auth.go b/internal/auth/auth.go
index 9b1e2121..69f4f078 100644
--- a/internal/auth/auth.go
+++ b/internal/auth/auth.go
@@ -263,7 +263,7 @@ func GetAuthInstance() *Authentication {
 }
 
 // handleUserSync syncs or updates a user in the database based on configuration.
-// This is used for both JWT and OIDC authentication when syncUserOnLogin or updateUserOnLogin is enabled.
+// This is used for LDAP, JWT, and OIDC authentication when syncUserOnLogin or updateUserOnLogin is enabled.
 func handleUserSync(user *schema.User, syncUserOnLogin, updateUserOnLogin bool) {
 	r := repository.GetUserRepository()
 	dbUser, err := r.GetUser(user.Username)
diff --git a/web/templates/login.tmpl b/web/templates/login.tmpl
index cd139261..4c4d9be8 100644
--- a/web/templates/login.tmpl
+++ b/web/templates/login.tmpl
@@ -38,8 +38,8 @@
- {{- if .Infos.hasOpenIDConnect}} - OpenID Connect Login + {{if .Infos.hasOpenIDConnect}} + OpenID Connect Login {{end}} From 6ecb9349677d5d2a399aa26b86d12c987a7cb3ef Mon Sep 17 00:00:00 2001 From: Jan Eitzinger Date: Fri, 27 Feb 2026 08:55:33 +0100 Subject: [PATCH 06/20] Switch to CC line-protocol package. Update cc-lib. --- go.mod | 14 +++++----- go.sum | 48 +++++++++++---------------------- internal/api/metricstore.go | 2 +- internal/api/nats.go | 2 +- pkg/metricstore/lineprotocol.go | 2 +- 5 files changed, 26 insertions(+), 42 deletions(-) diff --git a/go.mod b/go.mod index e244062c..afc21f2a 100644 --- a/go.mod +++ b/go.mod @@ -9,7 +9,8 @@ tool ( require ( github.com/99designs/gqlgen v0.17.86 - github.com/ClusterCockpit/cc-lib/v2 v2.6.0 + github.com/ClusterCockpit/cc-lib/v2 v2.7.0 + github.com/ClusterCockpit/cc-line-protocol/v2 v2.4.0 github.com/Masterminds/squirrel v1.5.4 github.com/aws/aws-sdk-go-v2 v1.41.1 github.com/aws/aws-sdk-go-v2/config v1.32.8 @@ -25,7 +26,6 @@ require ( github.com/golang-migrate/migrate/v4 v4.19.1 github.com/google/gops v0.3.29 github.com/gorilla/sessions v1.4.0 - github.com/influxdata/line-protocol/v2 v2.2.1 github.com/jmoiron/sqlx v1.4.0 github.com/joho/godotenv v1.5.1 github.com/linkedin/goavro/v2 v2.15.0 @@ -92,10 +92,10 @@ require ( github.com/kr/pretty v0.3.1 // indirect github.com/lann/builder v0.0.0-20180802200727-47ae307949d0 // indirect github.com/lann/ps v0.0.0-20150810152359-62de8c46ede0 // indirect - github.com/nats-io/nats.go v1.48.0 // indirect + github.com/nats-io/nats.go v1.49.0 // indirect github.com/nats-io/nkeys v0.4.15 // indirect github.com/nats-io/nuid v1.0.1 // indirect - github.com/oapi-codegen/runtime v1.1.2 // indirect + github.com/oapi-codegen/runtime v1.2.0 // indirect github.com/parquet-go/bitpack v1.0.0 // indirect github.com/parquet-go/jsonlite v1.4.0 // indirect github.com/pierrec/lz4/v4 v4.1.25 // indirect @@ -104,7 +104,7 @@ require ( github.com/rogpeppe/go-internal v1.10.0 // indirect github.com/russross/blackfriday/v2 v2.1.0 // indirect github.com/sosodev/duration v1.3.1 // indirect - github.com/stmcginnis/gofish v0.21.1 // indirect + github.com/stmcginnis/gofish v0.21.3 // indirect github.com/stretchr/objx v0.5.2 // indirect github.com/swaggo/files v1.0.1 // indirect github.com/twpayne/go-geom v1.6.1 // indirect @@ -113,9 +113,9 @@ require ( github.com/xrash/smetrics v0.0.0-20250705151800-55b8f293f342 // indirect go.yaml.in/yaml/v2 v2.4.3 // indirect go.yaml.in/yaml/v3 v3.0.4 // indirect - golang.org/x/exp v0.0.0-20260212183809-81e46e3db34a // indirect + golang.org/x/exp v0.0.0-20260218203240-3dfff04db8fa // indirect golang.org/x/mod v0.33.0 // indirect - golang.org/x/net v0.50.0 // indirect + golang.org/x/net v0.51.0 // indirect golang.org/x/sync v0.19.0 // indirect golang.org/x/sys v0.41.0 // indirect golang.org/x/text v0.34.0 // indirect diff --git a/go.sum b/go.sum index f2929454..cedddd62 100644 --- a/go.sum +++ b/go.sum @@ -4,10 +4,10 @@ github.com/99designs/gqlgen v0.17.86 h1:C8N3UTa5heXX6twl+b0AJyGkTwYL6dNmFrgZNLRc github.com/99designs/gqlgen v0.17.86/go.mod h1:KTrPl+vHA1IUzNlh4EYkl7+tcErL3MgKnhHrBcV74Fw= github.com/Azure/go-ntlmssp v0.1.0 h1:DjFo6YtWzNqNvQdrwEyr/e4nhU3vRiwenz5QX7sFz+A= github.com/Azure/go-ntlmssp v0.1.0/go.mod h1:NYqdhxd/8aAct/s4qSYZEerdPuH1liG2/X9DiVTbhpk= -github.com/ClusterCockpit/cc-lib/v2 v2.5.1 h1:s6M9tyPDty+4zTdQGJYKpGJM9Nz7N6ITMdjPvNSLX5g= -github.com/ClusterCockpit/cc-lib/v2 v2.5.1/go.mod h1:DZ8OIHPUZJpWqErLITt0B8P6/Q7CBW2IQSQ5YiFFaG0= -github.com/ClusterCockpit/cc-lib/v2 v2.6.0 
h1:Q7zvRAVhfYA9PDB18pfY9A/6Ws4oWpnv8+P9MBRUDzg= -github.com/ClusterCockpit/cc-lib/v2 v2.6.0/go.mod h1:DZ8OIHPUZJpWqErLITt0B8P6/Q7CBW2IQSQ5YiFFaG0= +github.com/ClusterCockpit/cc-lib/v2 v2.7.0 h1:EMTShk6rMTR1wlfmQ8SVCawH1OdltUbD3kVQmaW+5pE= +github.com/ClusterCockpit/cc-lib/v2 v2.7.0/go.mod h1:0Etx8WMs0lYZ4tiOQizY18CQop+2i3WROvU9rMUxHA4= +github.com/ClusterCockpit/cc-line-protocol/v2 v2.4.0 h1:hIzxgTBWcmCIHtoDKDkSCsKCOCOwUC34sFsbD2wcW0Q= +github.com/ClusterCockpit/cc-line-protocol/v2 v2.4.0/go.mod h1:y42qUu+YFmu5fdNuUAS4VbbIKxVjxCvbVqFdpdh8ahY= github.com/DATA-DOG/go-sqlmock v1.5.2 h1:OcvFkGmslmlZibjAjaHm3L//6LiuBgolP7OputlJIzU= github.com/DATA-DOG/go-sqlmock v1.5.2/go.mod h1:88MAG/4G7SMwSE3CeA0ZKzrT5CiOU3OJ+JlNzwDqpNU= github.com/KyleBanks/depth v1.2.1 h1:5h8fQADFrWtarTdtDudMmGsC7GPbOAu6RVB3ffsVFHc= @@ -95,8 +95,6 @@ github.com/dgryski/trifles v0.0.0-20230903005119-f50d829f2e54 h1:SG7nF6SRlWhcT7c github.com/dgryski/trifles v0.0.0-20230903005119-f50d829f2e54/go.mod h1:if7Fbed8SFyPtHLHbg49SI7NAdJiC5WIA09pe59rfAA= github.com/expr-lang/expr v1.17.8 h1:W1loDTT+0PQf5YteHSTpju2qfUfNoBt4yw9+wOEU9VM= github.com/expr-lang/expr v1.17.8/go.mod h1:8/vRC7+7HBzESEqt5kKpYXxrxkr31SaO8r40VO/1IT4= -github.com/frankban/quicktest v1.11.0/go.mod h1:K+q6oSqb0W0Ininfk863uOk1lMy69l/P6txr3mVT54s= -github.com/frankban/quicktest v1.11.2/go.mod h1:K+q6oSqb0W0Ininfk863uOk1lMy69l/P6txr3mVT54s= github.com/frankban/quicktest v1.13.0 h1:yNZif1OkDfNoDfb9zZa9aXIpejNR4F23Wely0c+Qdqk= github.com/frankban/quicktest v1.13.0/go.mod h1:qLE0fzW0VuyUAJgPU19zByoIr0HtCHN/r/VLSOOIySU= github.com/fsnotify/fsnotify v1.9.0 h1:2Ml+OJNzbYCTzsxtv8vKSFD9PbJjmhYF14k/jKC7S9k= @@ -154,8 +152,6 @@ github.com/golang-migrate/migrate/v4 v4.19.1/go.mod h1:CTcgfjxhaUtsLipnLoQRWCrjY github.com/golang/snappy v0.0.1/go.mod h1:/XxbfmMg8lxefKM7IXC3fBNl/7bRcc72aCRzEWrmP2Q= github.com/golang/snappy v1.0.0 h1:Oy607GVXHs7RtbggtPBnr2RmDArIsAefDwvrdWvRhGs= github.com/golang/snappy v1.0.0/go.mod h1:/XxbfmMg8lxefKM7IXC3fBNl/7bRcc72aCRzEWrmP2Q= -github.com/google/go-cmp v0.5.2/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= -github.com/google/go-cmp v0.5.5/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= github.com/google/go-cmp v0.7.0 h1:wk8382ETsv4JYUZwIsn6YpYiWiBsYLSJiTsyBybVuN8= github.com/google/go-cmp v0.7.0/go.mod h1:pXiqmnSA92OHEEa9HXL2W4E7lf9JzCmGVUdgjX3N/iU= github.com/google/go-tpm v0.9.7 h1:u89J4tUUeDTlH8xxC3CTW7OHZjbjKoHdQ9W7gCUhtxA= @@ -184,13 +180,8 @@ github.com/influxdata/influxdb-client-go/v2 v2.14.0 h1:AjbBfJuq+QoaXNcrova8smSjw github.com/influxdata/influxdb-client-go/v2 v2.14.0/go.mod h1:Ahpm3QXKMJslpXl3IftVLVezreAUtBOTZssDrjZEFHI= github.com/influxdata/line-protocol v0.0.0-20210922203350-b1ad95c89adf h1:7JTmneyiNEwVBOHSjoMxiWAqB992atOeepeFYegn5RU= github.com/influxdata/line-protocol v0.0.0-20210922203350-b1ad95c89adf/go.mod h1:xaLFMmpvUxqXtVkUJfg9QmT88cDaCJ3ZKgdZ78oO8Qo= -github.com/influxdata/line-protocol-corpus v0.0.0-20210519164801-ca6fa5da0184/go.mod h1:03nmhxzZ7Xk2pdG+lmMd7mHDfeVOYFyhOgwO61qWU98= github.com/influxdata/line-protocol-corpus v0.0.0-20210922080147-aa28ccfb8937 h1:MHJNQ+p99hFATQm6ORoLmpUCF7ovjwEFshs/NHzAbig= github.com/influxdata/line-protocol-corpus v0.0.0-20210922080147-aa28ccfb8937/go.mod h1:BKR9c0uHSmRgM/se9JhFHtTT7JTO67X23MtKMHtZcpo= -github.com/influxdata/line-protocol/v2 v2.0.0-20210312151457-c52fdecb625a/go.mod h1:6+9Xt5Sq1rWx+glMgxhcg2c0DUaehK+5TDcPZ76GypY= -github.com/influxdata/line-protocol/v2 v2.1.0/go.mod h1:QKw43hdUBg3GTk2iC3iyCxksNj7PX9aUSeYOYE/ceHY= -github.com/influxdata/line-protocol/v2 v2.2.1 
h1:EAPkqJ9Km4uAxtMRgUubJyqAr6zgWM0dznKMLRauQRE= -github.com/influxdata/line-protocol/v2 v2.2.1/go.mod h1:DmB3Cnh+3oxmG6LOBIxce4oaL4CPj3OmMPgvauXh+tM= github.com/jcmturner/aescts/v2 v2.0.0 h1:9YKLH6ey7H4eDBXW8khjYslgyqG2xZikXP0EQFKrle8= github.com/jcmturner/aescts/v2 v2.0.0/go.mod h1:AiaICIRyfYg35RUkr8yESTqvSy7csK90qZ5xfvvsoNs= github.com/jcmturner/dnsutils/v2 v2.0.0 h1:lltnkeZGL0wILNvrNiVCR6Ro5PGU/SeBvVO/8c/iPbo= @@ -212,11 +203,8 @@ github.com/jonboulle/clockwork v0.5.0/go.mod h1:3mZlmanh0g2NDKO5TWZVJAfofYk64M7X github.com/juju/gnuflag v0.0.0-20171113085948-2ce1bb71843d/go.mod h1:2PavIy+JPciBPrBUjwbNvtwB6RQlve+hkpll6QSNmOE= github.com/klauspost/compress v1.18.4 h1:RPhnKRAQ4Fh8zU2FY/6ZFDwTVTxgJ/EMydqSTzE9a2c= github.com/klauspost/compress v1.18.4/go.mod h1:R0h/fSBs8DE4ENlcrlib3PsXS61voFxhIs2DeRhCvJ4= -github.com/kr/pretty v0.2.1/go.mod h1:ipq/a2n7PKx3OHsz4KJII5eveXtPO4qwEXGdVfWzfnI= github.com/kr/pretty v0.3.1 h1:flRD4NNwYAUpkphVc1HcthR4KEIFJ65n8Mw5qdRn3LE= github.com/kr/pretty v0.3.1/go.mod h1:hoEshYVHaxMs3cyo3Yncou5ZscifuDolrwPKZanG3xk= -github.com/kr/pty v1.1.1/go.mod h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ= -github.com/kr/text v0.1.0/go.mod h1:4Jbv+DJW3UT/LiOwJeYQe1efqtUx/iVham/4vfdArNI= github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY= github.com/kr/text v0.2.0/go.mod h1:eLer722TekiGuMkidMxC/pM04lWEeraHUUmBw8l2grE= github.com/lann/builder v0.0.0-20180802200727-47ae307949d0 h1:SOEGU9fKiNWd/HOJuq6+3iTQz8KNCLtVX6idSoTLdUw= @@ -240,15 +228,14 @@ github.com/nats-io/jwt/v2 v2.8.0 h1:K7uzyz50+yGZDO5o772eRE7atlcSEENpL7P+b74JV1g= github.com/nats-io/jwt/v2 v2.8.0/go.mod h1:me11pOkwObtcBNR8AiMrUbtVOUGkqYjMQZ6jnSdVUIA= github.com/nats-io/nats-server/v2 v2.12.3 h1:KRv+1n7lddMVgkJPQer+pt36TcO0ENxjilBmeWdjcHs= github.com/nats-io/nats-server/v2 v2.12.3/go.mod h1:MQXjG9WjyXKz9koWzUc3jYUMKD8x3CLmTNy91IQQz3Y= -github.com/nats-io/nats.go v1.48.0 h1:pSFyXApG+yWU/TgbKCjmm5K4wrHu86231/w84qRVR+U= -github.com/nats-io/nats.go v1.48.0/go.mod h1:iRWIPokVIFbVijxuMQq4y9ttaBTMe0SFdlZfMDd+33g= +github.com/nats-io/nats.go v1.49.0 h1:yh/WvY59gXqYpgl33ZI+XoVPKyut/IcEaqtsiuTJpoE= +github.com/nats-io/nats.go v1.49.0/go.mod h1:fDCn3mN5cY8HooHwE2ukiLb4p4G4ImmzvXyJt+tGwdw= github.com/nats-io/nkeys v0.4.15 h1:JACV5jRVO9V856KOapQ7x+EY8Jo3qw1vJt/9Jpwzkk4= github.com/nats-io/nkeys v0.4.15/go.mod h1:CpMchTXC9fxA5zrMo4KpySxNjiDVvr8ANOSZdiNfUrs= github.com/nats-io/nuid v1.0.1 h1:5iA8DT8V7q8WK2EScv2padNa/rTESc1KdnPw4TC2paw= github.com/nats-io/nuid v1.0.1/go.mod h1:19wcPz3Ph3q0Jbyiqsd0kePYG7A95tJPxeL+1OSON2c= -github.com/niemeyer/pretty v0.0.0-20200227124842-a10e7caefd8e/go.mod h1:zD1mROLANZcx1PVRCS0qkT7pwLkGfwJo4zjcN/Tysno= -github.com/oapi-codegen/runtime v1.1.2 h1:P2+CubHq8fO4Q6fV1tqDBZHCwpVpvPg7oKiYzQgXIyI= -github.com/oapi-codegen/runtime v1.1.2/go.mod h1:SK9X900oXmPWilYR5/WKPzt3Kqxn/uS/+lbpREv+eCg= +github.com/oapi-codegen/runtime v1.2.0 h1:RvKc1CVS1QeKSNzO97FBQbSMZyQ8s6rZd+LpmzwHMP4= +github.com/oapi-codegen/runtime v1.2.0/go.mod h1:Y7ZhmmlE8ikZOmuHRRndiIm7nf3xcVv+YMweKgG1DT0= github.com/opentracing/opentracing-go v1.1.0/go.mod h1:UkNAQd3GIcIGf0SeVgPpRdFStlNbqXla1AfSYxPUl2o= github.com/parquet-go/bitpack v1.0.0 h1:AUqzlKzPPXf2bCdjfj4sTeacrUwsT7NlcYDMUQxPcQA= github.com/parquet-go/bitpack v1.0.0/go.mod h1:XnVk9TH+O40eOOmvpAVZ7K2ocQFrQwysLMnc6M/8lgs= @@ -268,8 +255,8 @@ github.com/prometheus/client_model v0.6.2 h1:oBsgwpGs7iVziMvrGhE53c/GrLUsZdHnqNw github.com/prometheus/client_model v0.6.2/go.mod h1:y3m2F6Gdpfy6Ut/GBsUqTWZqCUvMVzSfMLjcu6wAwpE= github.com/prometheus/common v0.67.5 
h1:pIgK94WWlQt1WLwAC5j2ynLaBRDiinoAb86HZHTUGI4= github.com/prometheus/common v0.67.5/go.mod h1:SjE/0MzDEEAyrdr5Gqc6G+sXI67maCxzaT3A2+HqjUw= -github.com/prometheus/procfs v0.19.2 h1:zUMhqEW66Ex7OXIiDkll3tl9a1ZdilUOd/F6ZXw4Vws= -github.com/prometheus/procfs v0.19.2/go.mod h1:M0aotyiemPhBCM0z5w87kL22CxfcH05ZpYlu+b4J7mw= +github.com/prometheus/procfs v0.20.0 h1:AA7aCvjxwAquZAlonN7888f2u4IN8WVeFgBi4k82M4Q= +github.com/prometheus/procfs v0.20.0/go.mod h1:o9EMBZGRyvDrSPH1RqdxhojkuXstoe4UlK79eF5TGGo= github.com/qustavo/sqlhooks/v2 v2.1.0 h1:54yBemHnGHp/7xgT+pxwmIlMSDNYKx5JW5dfRAiCZi0= github.com/qustavo/sqlhooks/v2 v2.1.0/go.mod h1:aMREyKo7fOKTwiLuWPsaHRXEmtqG4yREztO0idF83AU= github.com/robfig/cron/v3 v3.0.1 h1:WdRxkvbJztn8LMz/QEvLN5sBU+xKpSqwwUO1Pjr4qDs= @@ -286,8 +273,8 @@ github.com/sergi/go-diff v1.3.1/go.mod h1:aMJSSKb2lpPvRNec0+w3fl7LP9IOFzdc9Pa4NF github.com/sosodev/duration v1.3.1 h1:qtHBDMQ6lvMQsL15g4aopM4HEfOaYuhWBw3NPTtlqq4= github.com/sosodev/duration v1.3.1/go.mod h1:RQIBBX0+fMLc/D9+Jb/fwvVmo0eZvDDEERAikUR6SDg= github.com/spkg/bom v0.0.0-20160624110644-59b7046e48ad/go.mod h1:qLr4V1qq6nMqFKkMo8ZTx3f+BZEkzsRUY10Xsm2mwU0= -github.com/stmcginnis/gofish v0.21.1 h1:sutDvBhmLh4RDOZ1DN8GUyYRu7f1ggvKMMnSaiqhwn4= -github.com/stmcginnis/gofish v0.21.1/go.mod h1:PzF5i8ecRG9A2ol8XT64npKUunyraJ+7t0kYMpQAtqU= +github.com/stmcginnis/gofish v0.21.3 h1:EBLCHfORnbx7MPw7lplOOVe9QAD1T3XRVz6+a1Z4z5Q= +github.com/stmcginnis/gofish v0.21.3/go.mod h1:PzF5i8ecRG9A2ol8XT64npKUunyraJ+7t0kYMpQAtqU= github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= github.com/stretchr/objx v0.4.0/go.mod h1:YvHI0jy2hoMjB+UWwv71VJQ9isScKT/TqJzVSSt89Yw= github.com/stretchr/objx v0.5.2 h1:xuMeJ0Sdp5ZMRXx/aWO6RZxdr3beISkG5/G/aIRr3pY= @@ -328,8 +315,8 @@ golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACk golang.org/x/crypto v0.0.0-20210921155107-089bfa567519/go.mod h1:GvvjBRRGRdwPK5ydBHafDWAxML/pGHZbMvKqRZ5+Abc= golang.org/x/crypto v0.48.0 h1:/VRzVqiRSggnhY7gNRxPauEQ5Drw9haKdM0jqfcCFts= golang.org/x/crypto v0.48.0/go.mod h1:r0kV5h3qnFPlQnBSrULhlsRfryS2pmewsg+XfMgkVos= -golang.org/x/exp v0.0.0-20260212183809-81e46e3db34a h1:ovFr6Z0MNmU7nH8VaX5xqw+05ST2uO1exVfZPVqRC5o= -golang.org/x/exp v0.0.0-20260212183809-81e46e3db34a/go.mod h1:K79w1Vqn7PoiZn+TkNpx3BUWUQksGO3JcVX6qIjytmA= +golang.org/x/exp v0.0.0-20260218203240-3dfff04db8fa h1:Zt3DZoOFFYkKhDT3v7Lm9FDMEV06GpzjG2jrqW+QTE0= +golang.org/x/exp v0.0.0-20260218203240-3dfff04db8fa/go.mod h1:K79w1Vqn7PoiZn+TkNpx3BUWUQksGO3JcVX6qIjytmA= golang.org/x/mod v0.6.0-dev.0.20220419223038-86c51ed26bb4/go.mod h1:jJ57K6gSWd91VN4djpZkiMVwK6gcyfeH4XE8wZrZaV4= golang.org/x/mod v0.33.0 h1:tHFzIWbBifEmbwtGz65eaWyGiGZatSrT9prnU8DbVL8= golang.org/x/mod v0.33.0/go.mod h1:swjeQEj+6r7fODbD2cqrnje9PnziFuw4bmLbBZFrQ5w= @@ -337,8 +324,8 @@ golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLL golang.org/x/net v0.0.0-20210226172049-e18ecbb05110/go.mod h1:m0MpNAwzfU5UDzcl9v0D8zg8gWTRqZa9RBIspLL5mdg= golang.org/x/net v0.0.0-20220722155237-a158d28d115b/go.mod h1:XRhObCWvk6IyKnWLug+ECip1KBveYUHfp+8e9klMJ9c= golang.org/x/net v0.7.0/go.mod h1:2Tu9+aMcznHK/AK1HMvgo6xiTLG5rD5rZLDS+rp2Bjs= -golang.org/x/net v0.50.0 h1:ucWh9eiCGyDR3vtzso0WMQinm2Dnt8cFMuQa9K33J60= -golang.org/x/net v0.50.0/go.mod h1:UgoSli3F/pBgdJBHCTc+tp3gmrU4XswgGRgtnwWTfyM= +golang.org/x/net v0.51.0 h1:94R/GTO7mt3/4wIKpcR5gkGmRLOuE/2hNGeWq/GBIFo= +golang.org/x/net v0.51.0/go.mod h1:aamm+2QF5ogm02fjy5Bb7CQ0WMt1/WVM7FtyaTLlA9Y= golang.org/x/oauth2 v0.35.0 
h1:Mv2mzuHuZuY2+bkyWXIHMfhNdJAdwW3FuWeCPYN5GVQ= golang.org/x/oauth2 v0.35.0/go.mod h1:lzm5WQJQwKZ3nwavOZ3IS5Aulzxi68dUSgRHujetwEA= golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= @@ -370,16 +357,13 @@ golang.org/x/tools v0.1.12/go.mod h1:hNGJHUnrk76NpqgfD5Aqm5Crs+Hm0VOH/i9J2+nxYbc golang.org/x/tools v0.42.0 h1:uNgphsn75Tdz5Ji2q36v/nsFSfR/9BRFvqhGBaJGd5k= golang.org/x/tools v0.42.0/go.mod h1:Ma6lCIwGZvHK6XtgbswSoWroEkhugApmsXyrUmBhfr0= golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= -golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= google.golang.org/protobuf v1.36.11 h1:fV6ZwhNocDyBLK0dj+fg8ektcVegBBuEolpbTQyBNVE= google.golang.org/protobuf v1.36.11/go.mod h1:HTf+CrKn2C3g5S8VImy6tdcUvCska2kB7j23XfzDpco= gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= -gopkg.in/check.v1 v1.0.0-20200227125254-8fa46927fb4f/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c h1:Hei/4ADfdWqJk1ZMxUNpqntNwaWcugrBjAiHlqqRiVk= gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c/go.mod h1:JHkPIbrfpd72SG/EVd6muEfDQjcINNoR0C8j2r3qZ4Q= gopkg.in/yaml.v2 v2.2.2/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= -gopkg.in/yaml.v3 v3.0.0-20200615113413-eeeca48fe776/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= sigs.k8s.io/yaml v1.6.0 h1:G8fkbMSAFqgEFgh4b1wmtzDnioxFCUgTZhlbj5P9QYs= diff --git a/internal/api/metricstore.go b/internal/api/metricstore.go index 5c15bb2c..ff4deb6a 100644 --- a/internal/api/metricstore.go +++ b/internal/api/metricstore.go @@ -18,7 +18,7 @@ import ( "github.com/ClusterCockpit/cc-backend/pkg/metricstore" cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger" - "github.com/influxdata/line-protocol/v2/lineprotocol" + "github.com/ClusterCockpit/cc-line-protocol/v2/lineprotocol" ) // handleFree godoc diff --git a/internal/api/nats.go b/internal/api/nats.go index 02a03fae..efa4ab6f 100644 --- a/internal/api/nats.go +++ b/internal/api/nats.go @@ -21,7 +21,7 @@ import ( "github.com/ClusterCockpit/cc-lib/v2/nats" "github.com/ClusterCockpit/cc-lib/v2/receivers" "github.com/ClusterCockpit/cc-lib/v2/schema" - influx "github.com/influxdata/line-protocol/v2/lineprotocol" + influx "github.com/ClusterCockpit/cc-line-protocol/v2/lineprotocol" ) // NatsAPI provides NATS subscription-based handlers for Job and Node operations. diff --git a/pkg/metricstore/lineprotocol.go b/pkg/metricstore/lineprotocol.go index bfbbef2d..ed30dec7 100644 --- a/pkg/metricstore/lineprotocol.go +++ b/pkg/metricstore/lineprotocol.go @@ -14,7 +14,7 @@ import ( cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger" "github.com/ClusterCockpit/cc-lib/v2/nats" "github.com/ClusterCockpit/cc-lib/v2/schema" - "github.com/influxdata/line-protocol/v2/lineprotocol" + "github.com/ClusterCockpit/cc-line-protocol/v2/lineprotocol" ) func ReceiveNats(ms *MemoryStore, From a1db8263d72b9727347ea69e0cc832ec67bd1235 Mon Sep 17 00:00:00 2001 From: Jan Eitzinger Date: Fri, 27 Feb 2026 12:30:27 +0100 Subject: [PATCH 07/20] Document line protocol. 
Optimize REST writeMetric path --- internal/api/metricstore.go | 30 ++++-- pkg/metricstore/lineprotocol.go | 163 ++++++++++++++++++++++++++------ 2 files changed, 155 insertions(+), 38 deletions(-) diff --git a/internal/api/metricstore.go b/internal/api/metricstore.go index ff4deb6a..325b26ba 100644 --- a/internal/api/metricstore.go +++ b/internal/api/metricstore.go @@ -10,7 +10,6 @@ import ( "encoding/json" "errors" "fmt" - "io" "net/http" "strconv" "strings" @@ -90,16 +89,17 @@ func freeMetrics(rw http.ResponseWriter, r *http.Request) { // @security ApiKeyAuth // @router /write/ [post] func writeMetrics(rw http.ResponseWriter, r *http.Request) { - bytes, err := io.ReadAll(r.Body) rw.Header().Add("Content-Type", "application/json") - if err != nil { - handleError(err, http.StatusInternalServerError, rw) - return - } + // Extract the "cluster" query parameter without allocating a url.Values map. + cluster := queryParam(r.URL.RawQuery, "cluster") + + // Stream directly from the request body instead of copying it into a + // temporary buffer via io.ReadAll. The line-protocol decoder supports + // io.Reader natively, so this avoids the largest heap allocation. ms := metricstore.GetMemoryStore() - dec := lineprotocol.NewDecoderWithBytes(bytes) - if err := metricstore.DecodeLine(dec, ms, r.URL.Query().Get("cluster")); err != nil { + dec := lineprotocol.NewDecoder(r.Body) + if err := metricstore.DecodeLine(dec, ms, cluster); err != nil { cclog.Errorf("/api/write error: %s", err.Error()) handleError(err, http.StatusBadRequest, rw) return @@ -107,6 +107,20 @@ func writeMetrics(rw http.ResponseWriter, r *http.Request) { rw.WriteHeader(http.StatusOK) } +// queryParam extracts a single query-parameter value from a raw query string +// without allocating a url.Values map. Returns "" if the key is not present. +func queryParam(raw, key string) string { + for raw != "" { + var kv string + kv, raw, _ = strings.Cut(raw, "&") + k, v, _ := strings.Cut(kv, "=") + if k == key { + return v + } + } + return "" +} + // handleDebug godoc // @summary Debug endpoint // @tags debug diff --git a/pkg/metricstore/lineprotocol.go b/pkg/metricstore/lineprotocol.go index f8c83e31..ecae3df1 100644 --- a/pkg/metricstore/lineprotocol.go +++ b/pkg/metricstore/lineprotocol.go @@ -3,9 +3,23 @@ // Use of this source code is governed by a MIT-style // license that can be found in the LICENSE file. +// This file implements ingestion of InfluxDB line-protocol metric data received +// over NATS. Each line encodes one metric sample with the following structure: +// +// [,cluster=][,hostname=][,type=][,type-id=][,subtype=][,stype-id=] value= [] +// +// The measurement name identifies the metric (e.g. "cpu_load"). Tags provide +// routing information (cluster, host) and optional sub-device selectors (type, +// subtype). Only one field is expected per line: "value". +// +// After decoding, each sample is: +// 1. Written to the in-memory store via ms.WriteToLevel. +// 2. If the checkpoint format is "wal", also forwarded to the WAL staging +// goroutine via the WALMessages channel for durable write-ahead logging. package metricstore import ( + "bytes" "context" "fmt" "sync" @@ -17,6 +31,16 @@ import ( "github.com/ClusterCockpit/cc-line-protocol/v2/lineprotocol" ) +// ReceiveNats subscribes to all configured NATS subjects and feeds incoming +// line-protocol messages into the MemoryStore. +// +// When workers > 1 a pool of goroutines drains a shared channel so that +// multiple messages can be decoded in parallel. 
With workers == 1 the NATS +// callback decodes inline (no channel overhead, lower latency). +// +// The function blocks until ctx is cancelled and all worker goroutines have +// finished. It returns nil when the NATS client is not configured; callers +// should treat that as a no-op rather than an error. func ReceiveNats(ms *MemoryStore, workers int, ctx context.Context, @@ -75,8 +99,13 @@ func ReceiveNats(ms *MemoryStore, return nil } -// Place `prefix` in front of `buf` but if possible, -// do that inplace in `buf`. +// reorder prepends prefix to buf in-place when buf has enough spare capacity, +// avoiding an allocation. Falls back to a regular append otherwise. +// +// It is used to assemble the "type" and "subtype" selector +// strings when the type tag arrives before the type-id tag in the line, so the +// two byte slices need to be concatenated in tag-declaration order regardless +// of wire order. func reorder(buf, prefix []byte) []byte { n := len(prefix) m := len(buf) @@ -94,17 +123,83 @@ func reorder(buf, prefix []byte) []byte { } } -// Decode lines using dec and make write calls to the MemoryStore. -// If a line is missing its cluster tag, use clusterDefault as default. +// decodeState holds the per-call scratch buffers used by DecodeLine. +// Instances are recycled via decodeStatePool to avoid repeated allocations +// during high-throughput ingestion. +type decodeState struct { + // metricBuf holds a copy of the current measurement name (line-protocol + // measurement field). Copied because dec.Measurement() returns a slice + // that is invalidated by the next decoder call. + metricBuf []byte + + // selector is the sub-device path passed to WriteToLevel and WALMessage + // (e.g. ["socket0"] or ["socket0", "memctrl1"]). Reused across lines. + selector []string + + // typeBuf accumulates the concatenated "type"+"type-id" tag value for the + // current line. Reset at the start of each line's tag-decode loop. + typeBuf []byte + + // subTypeBuf accumulates the concatenated "subtype"+"stype-id" tag value. + // Reset at the start of each line's tag-decode loop. + subTypeBuf []byte + + // prevTypeBytes / prevTypeStr cache the last seen typeBuf content and its + // string conversion. Because consecutive lines in a batch typically address + // the same sub-device, the cache hit rate is very high and avoids + // repeated []byte→string allocations. + prevTypeBytes []byte + prevTypeStr string + + // prevSubTypeBytes / prevSubTypeStr are the same cache for the subtype. + prevSubTypeBytes []byte + prevSubTypeStr string +} + +// decodeStatePool recycles decodeState values across DecodeLine calls to +// reduce GC pressure during sustained metric ingestion. +var decodeStatePool = sync.Pool{ + New: func() any { + return &decodeState{ + metricBuf: make([]byte, 0, 16), + selector: make([]string, 0, 4), + typeBuf: make([]byte, 0, 16), + subTypeBuf: make([]byte, 0, 16), + } + }, +} + +// DecodeLine reads all lines from dec (InfluxDB line-protocol) and writes each +// decoded metric sample into ms. +// +// clusterDefault is used as the cluster name for lines that do not carry a +// "cluster" tag. Callers typically supply the ClusterTag value from the NATS +// subscription configuration. +// +// Performance notes: +// - A decodeState is obtained from decodeStatePool to reuse scratch buffers. +// - The Level pointer (host-level node in the metric tree) is cached across +// consecutive lines that share the same cluster+host pair to avoid +// repeated lock acquisitions on the root and cluster levels. 
+// - []byte→string conversions for type/subtype selectors are cached via +// prevType*/prevSubType* fields because batches typically repeat the same +// sub-device identifiers. +// - Timestamp parsing tries Second precision first; if that fails it retries +// Millisecond, Microsecond, and Nanosecond in turn. A missing timestamp +// falls back to time.Now(). +// +// When the checkpoint format is "wal" each successfully decoded sample is also +// sent to WALMessages so the WAL staging goroutine can persist it durably +// before the next binary snapshot. func DecodeLine(dec *lineprotocol.Decoder, ms *MemoryStore, clusterDefault string, ) error { // Reduce allocations in loop: t := time.Now() - metric, metricBuf := Metric{}, make([]byte, 0, 16) - selector := make([]string, 0, 4) - typeBuf, subTypeBuf := make([]byte, 0, 16), make([]byte, 0) + metric := Metric{} + st := decodeStatePool.Get().(*decodeState) + defer decodeStatePool.Put(st) // Optimize for the case where all lines in a "batch" are about the same // cluster and host. By using `WriteToLevel` (level = host), we do not need @@ -121,7 +216,7 @@ func DecodeLine(dec *lineprotocol.Decoder, // Needs to be copied because another call to dec.* would // invalidate the returned slice. - metricBuf = append(metricBuf[:0], rawmeasurement...) + st.metricBuf = append(st.metricBuf[:0], rawmeasurement...) // The go compiler optimizes map[string(byteslice)] lookups: metric.MetricConfig, ok = ms.Metrics[string(rawmeasurement)] @@ -129,7 +224,7 @@ func DecodeLine(dec *lineprotocol.Decoder, continue } - typeBuf, subTypeBuf := typeBuf[:0], subTypeBuf[:0] + st.typeBuf, st.subTypeBuf = st.typeBuf[:0], st.subTypeBuf[:0] cluster, host := clusterDefault, "" for { key, val, err := dec.NextTag() @@ -162,41 +257,49 @@ func DecodeLine(dec *lineprotocol.Decoder, } // We cannot be sure that the "type" tag comes before the "type-id" tag: - if len(typeBuf) == 0 { - typeBuf = append(typeBuf, val...) + if len(st.typeBuf) == 0 { + st.typeBuf = append(st.typeBuf, val...) } else { - typeBuf = reorder(typeBuf, val) + st.typeBuf = reorder(st.typeBuf, val) } case "type-id": - typeBuf = append(typeBuf, val...) + st.typeBuf = append(st.typeBuf, val...) case "subtype": // We cannot be sure that the "subtype" tag comes before the "stype-id" tag: - if len(subTypeBuf) == 0 { - subTypeBuf = append(subTypeBuf, val...) + if len(st.subTypeBuf) == 0 { + st.subTypeBuf = append(st.subTypeBuf, val...) } else { - subTypeBuf = reorder(subTypeBuf, val) - // subTypeBuf = reorder(typeBuf, val) + st.subTypeBuf = reorder(st.subTypeBuf, val) } case "stype-id": - subTypeBuf = append(subTypeBuf, val...) + st.subTypeBuf = append(st.subTypeBuf, val...) default: } } // If the cluster or host changed, the lvl was set to nil if lvl == nil { - selector = selector[:2] - selector[0], selector[1] = cluster, host - lvl = ms.GetLevel(selector) + st.selector = st.selector[:2] + st.selector[0], st.selector[1] = cluster, host + lvl = ms.GetLevel(st.selector) prevCluster, prevHost = cluster, host } - // subtypes: - selector = selector[:0] - if len(typeBuf) > 0 { - selector = append(selector, string(typeBuf)) // <- Allocation :( - if len(subTypeBuf) > 0 { - selector = append(selector, string(subTypeBuf)) + // subtypes: cache []byte→string conversions; messages in a batch typically + // share the same type/subtype so the hit rate is very high. + st.selector = st.selector[:0] + if len(st.typeBuf) > 0 { + if !bytes.Equal(st.typeBuf, st.prevTypeBytes) { + st.prevTypeBytes = append(st.prevTypeBytes[:0], st.typeBuf...) 
+ st.prevTypeStr = string(st.typeBuf) + } + st.selector = append(st.selector, st.prevTypeStr) + if len(st.subTypeBuf) > 0 { + if !bytes.Equal(st.subTypeBuf, st.prevSubTypeBytes) { + st.prevSubTypeBytes = append(st.prevSubTypeBytes[:0], st.subTypeBuf...) + st.prevSubTypeStr = string(st.subTypeBuf) + } + st.selector = append(st.selector, st.prevSubTypeStr) } } @@ -246,16 +349,16 @@ func DecodeLine(dec *lineprotocol.Decoder, if Keys.Checkpoints.FileFormat == "wal" { WALMessages <- &WALMessage{ - MetricName: string(metricBuf), + MetricName: string(st.metricBuf), Cluster: cluster, Node: host, - Selector: append([]string{}, selector...), + Selector: append([]string{}, st.selector...), Value: metric.Value, Timestamp: time, } } - if err := ms.WriteToLevel(lvl, selector, time, []Metric{metric}); err != nil { + if err := ms.WriteToLevel(lvl, st.selector, time, []Metric{metric}); err != nil { return err } } From a418abc7d5ccfc806318caa9effa99d9f955fbcb Mon Sep 17 00:00:00 2001 From: Jan Eitzinger Date: Fri, 27 Feb 2026 14:40:26 +0100 Subject: [PATCH 08/20] Run go fix --- internal/api/rest.go | 2 +- pkg/archive/fsBackend.go | 6 ++---- pkg/archive/s3Backend.go | 6 ++---- pkg/archive/sqliteBackend.go | 6 ++---- pkg/metricstore/archive.go | 6 ++---- pkg/metricstore/checkpoint.go | 6 ++---- pkg/metricstore/metricstore.go | 12 ++++-------- pkg/metricstore/walCheckpoint.go | 14 +++++++------- 8 files changed, 22 insertions(+), 36 deletions(-) diff --git a/internal/api/rest.go b/internal/api/rest.go index 4d2385e3..613867a8 100644 --- a/internal/api/rest.go +++ b/internal/api/rest.go @@ -302,7 +302,7 @@ func (api *RestAPI) runTagger(rw http.ResponseWriter, r *http.Request) { rw.Header().Set("Content-Type", "text/plain") rw.WriteHeader(http.StatusOK) - if _, err := rw.Write([]byte(fmt.Sprintf("Tagger %s started", name))); err != nil { + if _, err := rw.Write(fmt.Appendf(nil, "Tagger %s started", name)); err != nil { cclog.Errorf("Failed to write response: %v", err) } } diff --git a/pkg/archive/fsBackend.go b/pkg/archive/fsBackend.go index 07b86e2b..dfc870b4 100644 --- a/pkg/archive/fsBackend.go +++ b/pkg/archive/fsBackend.go @@ -501,9 +501,7 @@ func (fsa *FsArchive) Iter(loadMetricData bool) <-chan JobContainer { var wg sync.WaitGroup for range numWorkers { - wg.Add(1) - go func() { - defer wg.Done() + wg.Go(func() { for jobPath := range jobPaths { job, err := loadJobMeta(filepath.Join(jobPath, "meta.json")) if err != nil && !errors.Is(err, &jsonschema.ValidationError{}) { @@ -529,7 +527,7 @@ func (fsa *FsArchive) Iter(loadMetricData bool) <-chan JobContainer { ch <- JobContainer{Meta: job, Data: nil} } } - }() + }) } clustersDir, err := os.ReadDir(fsa.path) diff --git a/pkg/archive/s3Backend.go b/pkg/archive/s3Backend.go index 84abd713..7b82d309 100644 --- a/pkg/archive/s3Backend.go +++ b/pkg/archive/s3Backend.go @@ -821,9 +821,7 @@ func (s3a *S3Archive) Iter(loadMetricData bool) <-chan JobContainer { var wg sync.WaitGroup for range numWorkers { - wg.Add(1) - go func() { - defer wg.Done() + wg.Go(func() { for metaKey := range metaKeys { result, err := s3a.client.GetObject(ctx, &s3.GetObjectInput{ Bucket: aws.String(s3a.bucket), @@ -859,7 +857,7 @@ func (s3a *S3Archive) Iter(loadMetricData bool) <-chan JobContainer { ch <- JobContainer{Meta: job, Data: nil} } } - }() + }) } for _, cluster := range s3a.clusters { diff --git a/pkg/archive/sqliteBackend.go b/pkg/archive/sqliteBackend.go index 50821367..3f214136 100644 --- a/pkg/archive/sqliteBackend.go +++ b/pkg/archive/sqliteBackend.go @@ -576,9 +576,7 @@ func 
(sa *SqliteArchive) Iter(loadMetricData bool) <-chan JobContainer { var wg sync.WaitGroup for range numWorkers { - wg.Add(1) - go func() { - defer wg.Done() + wg.Go(func() { for row := range jobRows { job, err := DecodeJobMeta(bytes.NewReader(row.metaBlob)) if err != nil { @@ -617,7 +615,7 @@ func (sa *SqliteArchive) Iter(loadMetricData bool) <-chan JobContainer { ch <- JobContainer{Meta: job, Data: nil} } } - }() + }) } for { diff --git a/pkg/metricstore/archive.go b/pkg/metricstore/archive.go index 784348b5..d3617f2c 100644 --- a/pkg/metricstore/archive.go +++ b/pkg/metricstore/archive.go @@ -49,9 +49,7 @@ func CleanUp(wg *sync.WaitGroup, ctx context.Context) { // runWorker takes simple values to configure what it does func cleanUpWorker(wg *sync.WaitGroup, ctx context.Context, interval string, mode string, cleanupDir string, delete bool) { - wg.Add(1) - go func() { - defer wg.Done() + wg.Go(func() { d, err := time.ParseDuration(interval) if err != nil { @@ -85,7 +83,7 @@ func cleanUpWorker(wg *sync.WaitGroup, ctx context.Context, interval string, mod } } } - }() + }) } var ErrNoNewArchiveData error = errors.New("all data already archived") diff --git a/pkg/metricstore/checkpoint.go b/pkg/metricstore/checkpoint.go index 590197e3..45b2bc2a 100644 --- a/pkg/metricstore/checkpoint.go +++ b/pkg/metricstore/checkpoint.go @@ -96,9 +96,7 @@ func Checkpointing(wg *sync.WaitGroup, ctx context.Context) { ms := GetMemoryStore() - wg.Add(1) - go func() { - defer wg.Done() + wg.Go(func() { d, err := time.ParseDuration(Keys.Checkpoints.Interval) if err != nil { @@ -149,7 +147,7 @@ func Checkpointing(wg *sync.WaitGroup, ctx context.Context) { } } } - }() + }) } // MarshalJSON provides optimized JSON encoding for CheckpointMetrics. diff --git a/pkg/metricstore/metricstore.go b/pkg/metricstore/metricstore.go index 3fe64d55..d46c0d15 100644 --- a/pkg/metricstore/metricstore.go +++ b/pkg/metricstore/metricstore.go @@ -320,9 +320,7 @@ func Shutdown() { func Retention(wg *sync.WaitGroup, ctx context.Context) { ms := GetMemoryStore() - wg.Add(1) - go func() { - defer wg.Done() + wg.Go(func() { d, err := time.ParseDuration(Keys.RetentionInMemory) if err != nil { cclog.Fatal(err) @@ -361,7 +359,7 @@ func Retention(wg *sync.WaitGroup, ctx context.Context) { state.mu.Unlock() } } - }() + }) } // MemoryUsageTracker starts a background goroutine that monitors memory usage. @@ -382,9 +380,7 @@ func Retention(wg *sync.WaitGroup, ctx context.Context) { func MemoryUsageTracker(wg *sync.WaitGroup, ctx context.Context) { ms := GetMemoryStore() - wg.Add(1) - go func() { - defer wg.Done() + wg.Go(func() { d := DefaultMemoryUsageTrackerInterval if d <= 0 { @@ -470,7 +466,7 @@ func MemoryUsageTracker(wg *sync.WaitGroup, ctx context.Context) { } } } - }() + }) } // Free removes metric data older than the given time while preserving data for active nodes. diff --git a/pkg/metricstore/walCheckpoint.go b/pkg/metricstore/walCheckpoint.go index e8a71ce2..685a8388 100644 --- a/pkg/metricstore/walCheckpoint.go +++ b/pkg/metricstore/walCheckpoint.go @@ -65,6 +65,7 @@ import ( "math" "os" "path" + "strings" "sync" "sync/atomic" @@ -114,9 +115,7 @@ type walFileState struct { // and appends binary WAL records to per-host current.wal files. // Also handles WAL rotation requests from the checkpoint goroutine. 
func WALStaging(wg *sync.WaitGroup, ctx context.Context) { - wg.Add(1) - go func() { - defer wg.Done() + wg.Go(func() { if Keys.Checkpoints.FileFormat == "json" { return @@ -220,7 +219,7 @@ func WALStaging(wg *sync.WaitGroup, ctx context.Context) { processRotate(req) } } - }() + }) } // RotateWALFiles sends rotation requests for the given host directories @@ -478,11 +477,12 @@ func joinSelector(sel []string) string { if len(sel) == 0 { return "" } - result := sel[0] + var result strings.Builder + result.WriteString(sel[0]) for i := 1; i < len(sel); i++ { - result += "\x00" + sel[i] + result.WriteString("\x00" + sel[i]) } - return result + return result.String() } // ToCheckpointWAL writes binary snapshot files for all hosts in parallel. From 07b989cb81538bdce1e6dce50222c3cc7d76ab58 Mon Sep 17 00:00:00 2001 From: Aditya Ujeniya Date: Fri, 27 Feb 2026 14:44:32 +0100 Subject: [PATCH 09/20] Add new bufferPool implementation --- pkg/metricstore/buffer.go | 101 ++++++++++++++++++++++++++-- pkg/metricstore/level.go | 2 + pkg/metricstore/metricstore.go | 6 ++ pkg/metricstore/metricstore_test.go | 50 ++++++++++++++ 4 files changed, 155 insertions(+), 4 deletions(-) diff --git a/pkg/metricstore/buffer.go b/pkg/metricstore/buffer.go index 665d8012..f486e645 100644 --- a/pkg/metricstore/buffer.go +++ b/pkg/metricstore/buffer.go @@ -43,6 +43,7 @@ package metricstore import ( "errors" "sync" + "time" "github.com/ClusterCockpit/cc-lib/v2/schema" ) @@ -53,12 +54,102 @@ import ( // of data or reallocation needs to happen on writes. const BufferCap int = DefaultBufferCapacity -var bufferPool sync.Pool = sync.Pool{ - New: func() any { +// BufferPool is the global instance. +// It is initialized immediately when the package loads. +var bufferPool = NewPersistentBufferPool() + +type PersistentBufferPool struct { + pool []*buffer + mu sync.Mutex +} + +// NewPersistentBufferPool creates a dynamic pool for buffers. +func NewPersistentBufferPool() *PersistentBufferPool { + return &PersistentBufferPool{ + pool: make([]*buffer, 0), + } +} + +func (p *PersistentBufferPool) Get() *buffer { + p.mu.Lock() + defer p.mu.Unlock() + + n := len(p.pool) + if n == 0 { + // Pool is empty, allocate a new one return &buffer{ data: make([]schema.Float, 0, BufferCap), } - }, + } + + // Reuse existing buffer from the pool + b := p.pool[n-1] + p.pool[n-1] = nil // Avoid memory leak + p.pool = p.pool[:n-1] + return b +} + +func (p *PersistentBufferPool) Put(b *buffer) { + // Reset the buffer before putting it back + b.data = b.data[:0] + + p.mu.Lock() + defer p.mu.Unlock() + p.pool = append(p.pool, b) +} + +// GetSize returns the exact number of buffers currently sitting in the pool. +func (p *PersistentBufferPool) GetSize() int { + p.mu.Lock() + defer p.mu.Unlock() + return len(p.pool) +} + +// Clear drains all buffers currently in the pool, allowing the GC to collect them. +func (p *PersistentBufferPool) Clear() { + p.mu.Lock() + defer p.mu.Unlock() + for i := range p.pool { + p.pool[i] = nil + } + p.pool = p.pool[:0] +} + +// Clean removes buffers from the pool that haven't been used in the given duration. +// It uses a simple LRU approach based on the lastUsed timestamp. 
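+//
+// Illustrative call (the real caller, Retention(), passes state.lastRetentionTime;
+// the one-hour threshold here is only an example):
+//
+//	bufferPool.Clean(time.Now().Add(-1 * time.Hour).Unix())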
+func (p *PersistentBufferPool) Clean(threshold int64) { + p.mu.Lock() + defer p.mu.Unlock() + + // Filter in place + active := p.pool[:0] + for _, b := range p.pool { + if b.lastUsed >= threshold { + active = append(active, b) + } else { + // Buffer is older than the threshold, let it be collected by GC + } + } + + // Nullify the rest to prevent memory leaks + for i := len(active); i < len(p.pool); i++ { + p.pool[i] = nil + } + + p.pool = active +} + +// CleanAll removes all buffers from the pool. +func (p *PersistentBufferPool) CleanAll() { + p.mu.Lock() + defer p.mu.Unlock() + + // Nullify all buffers to prevent memory leaks + for i := range p.pool { + p.pool[i] = nil + } + + p.pool = p.pool[:0] } var ( @@ -94,10 +185,11 @@ type buffer struct { start int64 archived bool closed bool + lastUsed int64 } func newBuffer(ts, freq int64) *buffer { - b := bufferPool.Get().(*buffer) + b := bufferPool.Get() b.frequency = freq b.start = ts - (freq / 2) b.prev = nil @@ -240,6 +332,7 @@ func (b *buffer) free(t int64) (delme bool, n int) { if cap(b.prev.data) != BufferCap { b.prev.data = make([]schema.Float, 0, BufferCap) } + b.prev.lastUsed = time.Now().Unix() bufferPool.Put(b.prev) b.prev = nil } diff --git a/pkg/metricstore/level.go b/pkg/metricstore/level.go index 85c2ba7b..ef082579 100644 --- a/pkg/metricstore/level.go +++ b/pkg/metricstore/level.go @@ -42,6 +42,7 @@ package metricstore import ( "sync" + "time" "unsafe" "github.com/ClusterCockpit/cc-lib/v2/schema" @@ -192,6 +193,7 @@ func (l *Level) free(t int64) (int, error) { if cap(b.data) != BufferCap { b.data = make([]schema.Float, 0, BufferCap) } + b.lastUsed = time.Now().Unix() bufferPool.Put(b) l.metrics[i] = nil } diff --git a/pkg/metricstore/metricstore.go b/pkg/metricstore/metricstore.go index d46c0d15..db3e4357 100644 --- a/pkg/metricstore/metricstore.go +++ b/pkg/metricstore/metricstore.go @@ -357,6 +357,9 @@ func Retention(wg *sync.WaitGroup, ctx context.Context) { } state.mu.Unlock() + + // Clean up the buffer pool + bufferPool.Clean(state.lastRetentionTime) } } }) @@ -425,6 +428,9 @@ func MemoryUsageTracker(wg *sync.WaitGroup, ctx context.Context) { runtime.ReadMemStats(&mem) actualMemoryGB = float64(mem.Alloc) / 1e9 + bufferPool.CleanAll() + cclog.Infof("[METRICSTORE]> Cleaned up bufferPool\n") + if actualMemoryGB > float64(Keys.MemoryCap) { cclog.Warnf("[METRICSTORE]> memory usage %.2f GB exceeds cap %d GB, starting emergency buffer freeing", actualMemoryGB, Keys.MemoryCap) diff --git a/pkg/metricstore/metricstore_test.go b/pkg/metricstore/metricstore_test.go index eb1aff15..55c97e60 100644 --- a/pkg/metricstore/metricstore_test.go +++ b/pkg/metricstore/metricstore_test.go @@ -464,3 +464,53 @@ func TestBufferHealthChecks(t *testing.T) { }) } } + +func TestBufferPoolClean(t *testing.T) { + // Use a fresh pool for testing + pool := NewPersistentBufferPool() + + now := time.Now().Unix() + + // Create some buffers and put them in the pool with different lastUsed times + b1 := &buffer{lastUsed: now - 3600, data: make([]schema.Float, 0)} // 1 hour ago + b2 := &buffer{lastUsed: now - 7200, data: make([]schema.Float, 0)} // 2 hours ago + b3 := &buffer{lastUsed: now - 180000, data: make([]schema.Float, 0)} // 50 hours ago + b4 := &buffer{lastUsed: now - 200000, data: make([]schema.Float, 0)} // 55 hours ago + b5 := &buffer{lastUsed: now, data: make([]schema.Float, 0)} + + pool.Put(b1) + pool.Put(b2) + pool.Put(b3) + pool.Put(b4) + pool.Put(b5) + + if pool.GetSize() != 5 { + t.Fatalf("Expected pool size 5, got %d", pool.GetSize()) + } + 
+
+	// Clean buffers older than 48 hours
+	timeUpdate := time.Now().Add(48 * time.Hour).Unix()
+	pool.Clean(timeUpdate)
+
+	// Expected: b1, b2, b5 should remain. b3, b4 should be cleaned.
+	if pool.GetSize() != 3 {
+		t.Fatalf("Expected pool size 3 after clean, got %d", pool.GetSize())
+	}
+
+	validBufs := map[int64]bool{
+		b1.lastUsed: true,
+		b2.lastUsed: true,
+		b5.lastUsed: true,
+	}
+
+	for i := 0; i < 3; i++ {
+		b := pool.Get()
+		if !validBufs[b.lastUsed] {
+			t.Errorf("Found unexpected buffer with lastUsed %d", b.lastUsed)
+		}
+	}
+
+	if pool.GetSize() != 0 {
+		t.Fatalf("Expected pool to be empty, got %d", pool.GetSize())
+	}
+}

From 2e5d85c2231342dcac0034b93933f0bd969b1b13 Mon Sep 17 00:00:00 2001
From: Aditya Ujeniya
Date: Fri, 27 Feb 2026 15:09:06 +0100
Subject: [PATCH 10/20] Update testcase

---
 pkg/metricstore/metricstore_test.go | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pkg/metricstore/metricstore_test.go b/pkg/metricstore/metricstore_test.go
index 55c97e60..772fd7ea 100644
--- a/pkg/metricstore/metricstore_test.go
+++ b/pkg/metricstore/metricstore_test.go
@@ -489,7 +489,7 @@ func TestBufferPoolClean(t *testing.T) {
 	}
 
 	// Clean buffers older than 48 hours
-	timeUpdate := time.Now().Add(48 * time.Hour).Unix()
+	timeUpdate := time.Now().Add(-48 * time.Hour).Unix()
 	pool.Clean(timeUpdate)
 
 	// Expected: b1, b2, b5 should remain. b3, b4 should be cleaned.

From d00aa2666dd2a5da67291638105a74012cc0062e Mon Sep 17 00:00:00 2001
From: Christoph Kluge
Date: Fri, 27 Feb 2026 15:20:09 +0100
Subject: [PATCH 11/20] activate update of roles and projects if updateUserOnLogin is set

---
 internal/repository/user.go | 70 ++++++++++++++++++++++++++++++++-----
 1 file changed, 61 insertions(+), 9 deletions(-)

diff --git a/internal/repository/user.go b/internal/repository/user.go
index 966646dd..38a4980b 100644
--- a/internal/repository/user.go
+++ b/internal/repository/user.go
@@ -10,6 +10,7 @@ import (
 	"encoding/json"
 	"errors"
 	"fmt"
+	"reflect"
 	"strings"
 	"sync"
 
@@ -187,8 +188,8 @@ func (r *UserRepository) AddUser(user *schema.User) error {
 }
 
 func (r *UserRepository) UpdateUser(dbUser *schema.User, user *schema.User) error {
-	// user contains updated info, apply to dbuser
-	// TODO: Discuss updatable fields
+	// user contains updated info -> Apply to dbUser
+	// --- Simple Name Update ---
 	if dbUser.Name != user.Name {
 		if _, err := sq.Update("hpc_user").Set("name", user.Name).Where("hpc_user.username = ?", dbUser.Username).RunWith(r.DB).Exec(); err != nil {
 			cclog.Errorf("error while updating name of user '%s'", user.Username)
@@ -196,13 +197,64 @@ func (r *UserRepository) UpdateUser(dbUser *schema.User, user *schema.User) erro
 		}
 	}
 
-	// Toggled until greenlit
-	// if dbUser.HasRole(schema.RoleManager) && !reflect.DeepEqual(dbUser.Projects, user.Projects) {
-	// 	projects, _ := json.Marshal(user.Projects)
-	// 	if _, err := sq.Update("hpc_user").Set("projects", projects).Where("hpc_user.username = ?", dbUser.Username).RunWith(r.DB).Exec(); err != nil {
-	// 		return err
-	// 	}
-	// }
+	// --- Def Helpers ---
+	// Helper to update roles
+	updateRoles := func(roles []string) error {
+		rolesJSON, _ := json.Marshal(roles)
+		_, err := sq.Update("hpc_user").Set("roles", rolesJSON).Where("hpc_user.username = ?", dbUser.Username).RunWith(r.DB).Exec()
+		return err
+	}
+
+	// Helper to update projects
+	updateProjects := func(projects []string) error {
+		projectsJSON, _ := json.Marshal(projects)
+		_, err := sq.Update("hpc_user").Set("projects", projectsJSON).Where("hpc_user.username = ?",
dbUser.Username).RunWith(r.DB).Exec() + return err + } + + // Helper to clear projects + clearProjects := func() error { + _, err := sq.Update("hpc_user").Set("projects", "[]").Where("hpc_user.username = ?", dbUser.Username).RunWith(r.DB).Exec() + return err + } + + // --- Manager Role Handling --- + if dbUser.HasRole(schema.RoleManager) && user.HasRole(schema.RoleManager) && !reflect.DeepEqual(dbUser.Projects, user.Projects) { + // Existing Manager: update projects + if err := updateProjects(user.Projects); err != nil { + return err + } + } else if dbUser.HasRole(schema.RoleUser) && user.HasRole(schema.RoleManager) && user.HasNotRoles([]schema.Role{schema.RoleAdmin}) { + // New Manager: update roles and projects + if err := updateRoles(user.Roles); err != nil { + return err + } + if err := updateProjects(user.Projects); err != nil { + return err + } + } else if dbUser.HasRole(schema.RoleManager) && user.HasNotRoles([]schema.Role{schema.RoleAdmin, schema.RoleManager}) { + // Remove Manager: update roles and clear projects + if err := updateRoles(user.Roles); err != nil { + return err + } + if err := clearProjects(); err != nil { + return err + } + } + + // --- Support Role Handling --- + if dbUser.HasRole(schema.RoleUser) && dbUser.HasNotRoles([]schema.Role{schema.RoleSupport}) && + user.HasRole(schema.RoleSupport) && user.HasNotRoles([]schema.Role{schema.RoleAdmin}) { + // New Support: update roles + if err := updateRoles(user.Roles); err != nil { + return err + } + } else if dbUser.HasRole(schema.RoleSupport) && user.HasNotRoles([]schema.Role{schema.RoleAdmin, schema.RoleSupport}) { + // Remove Support: update roles + if err := updateRoles(user.Roles); err != nil { + return err + } + } return nil } From adebffd2515541da99098dea0bf03fd5ad789935 Mon Sep 17 00:00:00 2001 From: Jan Eitzinger Date: Fri, 27 Feb 2026 17:40:32 +0100 Subject: [PATCH 12/20] Replace the old zip archive options for the metricstore node data by parquet files --- pkg/metricstore/archive.go | 221 +++++++++++++-------- pkg/metricstore/parquetArchive.go | 213 +++++++++++++++++++++ pkg/metricstore/parquetArchive_test.go | 255 +++++++++++++++++++++++++ 3 files changed, 606 insertions(+), 83 deletions(-) create mode 100644 pkg/metricstore/parquetArchive.go create mode 100644 pkg/metricstore/parquetArchive_test.go diff --git a/pkg/metricstore/archive.go b/pkg/metricstore/archive.go index d3617f2c..77f4264a 100644 --- a/pkg/metricstore/archive.go +++ b/pkg/metricstore/archive.go @@ -6,12 +6,9 @@ package metricstore import ( - "archive/zip" - "bufio" "context" "errors" "fmt" - "io" "os" "path/filepath" "sync" @@ -47,7 +44,7 @@ func CleanUp(wg *sync.WaitGroup, ctx context.Context) { } } -// runWorker takes simple values to configure what it does +// cleanUpWorker takes simple values to configure what it does func cleanUpWorker(wg *sync.WaitGroup, ctx context.Context, interval string, mode string, cleanupDir string, delete bool) { wg.Go(func() { @@ -75,10 +72,10 @@ func cleanUpWorker(wg *sync.WaitGroup, ctx context.Context, interval string, mod if err != nil { cclog.Errorf("[METRICSTORE]> %s failed: %s", mode, err.Error()) } else { - if delete && cleanupDir == "" { + if delete { cclog.Infof("[METRICSTORE]> done: %d checkpoints deleted", n) } else { - cclog.Infof("[METRICSTORE]> done: %d files zipped and moved to archive", n) + cclog.Infof("[METRICSTORE]> done: %d checkpoint files archived to parquet", n) } } } @@ -88,17 +85,26 @@ func cleanUpWorker(wg *sync.WaitGroup, ctx context.Context, interval string, mod var 
ErrNoNewArchiveData error = errors.New("all data already archived") -// Delete or ZIP all checkpoint files older than `from` together and write them to the `cleanupDir`, -// deleting/moving them from the `checkpointsDir`. +// CleanupCheckpoints deletes or archives all checkpoint files older than `from`. +// When archiving, consolidates all hosts per cluster into a single Parquet file. func CleanupCheckpoints(checkpointsDir, cleanupDir string, from int64, deleteInstead bool) (int, error) { + if deleteInstead { + return deleteCheckpoints(checkpointsDir, from) + } + + return archiveCheckpoints(checkpointsDir, cleanupDir, from) +} + +// deleteCheckpoints removes checkpoint files older than `from` across all clusters/hosts. +func deleteCheckpoints(checkpointsDir string, from int64) (int, error) { entries1, err := os.ReadDir(checkpointsDir) if err != nil { return 0, err } type workItem struct { - cdir, adir string - cluster, host string + dir string + cluster, host string } var wg sync.WaitGroup @@ -109,13 +115,29 @@ func CleanupCheckpoints(checkpointsDir, cleanupDir string, from int64, deleteIns for worker := 0; worker < Keys.NumWorkers; worker++ { go func() { defer wg.Done() - for workItem := range work { - m, err := cleanupCheckpoints(workItem.cdir, workItem.adir, from, deleteInstead) + for item := range work { + entries, err := os.ReadDir(item.dir) if err != nil { - cclog.Errorf("error while archiving %s/%s: %s", workItem.cluster, workItem.host, err.Error()) + cclog.Errorf("error reading %s/%s: %s", item.cluster, item.host, err.Error()) atomic.AddInt32(&errs, 1) + continue + } + + files, err := findFiles(entries, from, false) + if err != nil { + cclog.Errorf("error finding files in %s/%s: %s", item.cluster, item.host, err.Error()) + atomic.AddInt32(&errs, 1) + continue + } + + for _, checkpoint := range files { + if err := os.Remove(filepath.Join(item.dir, checkpoint)); err != nil { + cclog.Errorf("error deleting %s/%s/%s: %s", item.cluster, item.host, checkpoint, err.Error()) + atomic.AddInt32(&errs, 1) + } else { + atomic.AddInt32(&n, 1) + } } - atomic.AddInt32(&n, int32(m)) } }() } @@ -124,14 +146,14 @@ func CleanupCheckpoints(checkpointsDir, cleanupDir string, from int64, deleteIns entries2, e := os.ReadDir(filepath.Join(checkpointsDir, de1.Name())) if e != nil { err = e + continue } for _, de2 := range entries2 { - cdir := filepath.Join(checkpointsDir, de1.Name(), de2.Name()) - adir := filepath.Join(cleanupDir, de1.Name(), de2.Name()) work <- workItem{ - adir: adir, cdir: cdir, - cluster: de1.Name(), host: de2.Name(), + dir: filepath.Join(checkpointsDir, de1.Name(), de2.Name()), + cluster: de1.Name(), + host: de2.Name(), } } } @@ -142,85 +164,118 @@ func CleanupCheckpoints(checkpointsDir, cleanupDir string, from int64, deleteIns if err != nil { return int(n), err } - if errs > 0 { - return int(n), fmt.Errorf("%d errors happened while archiving (%d successes)", errs, n) + return int(n), fmt.Errorf("%d errors happened while deleting (%d successes)", errs, n) } return int(n), nil } -// Helper function for `CleanupCheckpoints`. -func cleanupCheckpoints(dir string, cleanupDir string, from int64, deleteInstead bool) (int, error) { - entries, err := os.ReadDir(dir) +// archiveCheckpoints archives checkpoint files to Parquet format. 
+// Produces one Parquet file per cluster: <cleanupDir>/<cluster>/<from>.parquet
+func archiveCheckpoints(checkpointsDir, cleanupDir string, from int64) (int, error) {
+	clusterEntries, err := os.ReadDir(checkpointsDir)
+	if err != nil {
+		return 0, err
+	}
+
+	totalFiles := 0
+
+	for _, clusterEntry := range clusterEntries {
+		if !clusterEntry.IsDir() {
+			continue
+		}
+
+		cluster := clusterEntry.Name()
+		hostEntries, err := os.ReadDir(filepath.Join(checkpointsDir, cluster))
+		if err != nil {
+			return totalFiles, err
+		}
+
+		// Collect rows from all hosts in this cluster using worker pool
+		type hostResult struct {
+			rows  []ParquetMetricRow
+			files []string // checkpoint filenames to delete after successful write
+			dir   string   // checkpoint directory for this host
+		}
+
+		results := make(chan hostResult, len(hostEntries))
+		work := make(chan struct {
+			dir, host string
+		}, Keys.NumWorkers)
+
+		var wg sync.WaitGroup
+		errs := int32(0)
+
+		wg.Add(Keys.NumWorkers)
+		for w := 0; w < Keys.NumWorkers; w++ {
+			go func() {
+				defer wg.Done()
+				for item := range work {
+					rows, files, err := archiveCheckpointsToParquet(item.dir, cluster, item.host, from)
+					if err != nil {
+						cclog.Errorf("[METRICSTORE]> error reading checkpoints for %s/%s: %s", cluster, item.host, err.Error())
+						atomic.AddInt32(&errs, 1)
+						continue
+					}
+					if len(rows) > 0 {
+						results <- hostResult{rows: rows, files: files, dir: item.dir}
+					}
+				}
+			}()
+		}
+
+		go func() {
+			for _, hostEntry := range hostEntries {
+				if !hostEntry.IsDir() {
+					continue
+				}
+				dir := filepath.Join(checkpointsDir, cluster, hostEntry.Name())
+				work <- struct {
+					dir, host string
+				}{dir: dir, host: hostEntry.Name()}
+			}
+			close(work)
+			wg.Wait()
+			close(results)
+		}()
+
+		// Collect all rows and file info
+		var allRows []ParquetMetricRow
+		var allResults []hostResult
+		for r := range results {
+			allRows = append(allRows, r.rows...)
+			allResults = append(allResults, r)
+		}
+
+		if errs > 0 {
+			return totalFiles, fmt.Errorf("%d errors reading checkpoints for cluster %s", errs, cluster)
+		}
+
+		if len(allRows) == 0 {
+			continue
+		}
+
+		// Write one Parquet file per cluster
+		parquetFile := filepath.Join(cleanupDir, cluster, fmt.Sprintf("%d.parquet", from))
+		if err := writeParquetArchive(parquetFile, allRows); err != nil {
+			return totalFiles, fmt.Errorf("writing parquet archive for cluster %s: %w", cluster, err)
+		}
+
+		// Delete archived checkpoint files
+		for _, result := range allResults {
+			for _, file := range result.files {
+				filename := filepath.Join(result.dir, file)
+				if err := os.Remove(filename); err != nil {
+					cclog.Warnf("[METRICSTORE]> could not remove archived checkpoint %s: %v", filename, err)
+				} else {
+					totalFiles++
+				}
+			}
+		}
+
+		cclog.Infof("[METRICSTORE]> archived %d rows from %d files for cluster %s to %s",
+			len(allRows), totalFiles, cluster, parquetFile)
+	}
+
+	return totalFiles, nil
+}
diff --git a/pkg/metricstore/parquetArchive.go b/pkg/metricstore/parquetArchive.go
new file mode 100644
index 00000000..420ee4e5
--- /dev/null
+++ b/pkg/metricstore/parquetArchive.go
@@ -0,0 +1,213 @@
+// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
+// All rights reserved. This file is part of cc-backend.
+// Use of this source code is governed by a MIT-style
+// license that can be found in the LICENSE file.
+
+package metricstore
+
+import (
+	"bufio"
+	"encoding/binary"
+	"encoding/json"
+	"fmt"
+	"os"
+	"path/filepath"
+
+	cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
+	pq "github.com/parquet-go/parquet-go"
+)
+
+// ParquetMetricRow is the long-format schema for archived metric data.
+// One row per (host, metric, scope, scope_id, timestamp) data point.
+// Sorted by (cluster, hostname, metric, timestamp) for optimal compression.
+type ParquetMetricRow struct {
+	Cluster   string  `parquet:"cluster"`
+	Hostname  string  `parquet:"hostname"`
+	Metric    string  `parquet:"metric"`
+	Scope     string  `parquet:"scope"`
+	ScopeID   string  `parquet:"scope_id"`
+	Timestamp int64   `parquet:"timestamp"`
+	Frequency int64   `parquet:"frequency"`
+	Value     float32 `parquet:"value"`
+}
+
+// flattenCheckpointFile recursively converts a CheckpointFile tree into Parquet rows.
+// The scope path is built from the hierarchy: host level is "node", then child names
+// map to scope/scope_id (e.g., "socket0" → scope="socket", scope_id="0").
+func flattenCheckpointFile(cf *CheckpointFile, cluster, hostname, scope, scopeID string, rows []ParquetMetricRow) []ParquetMetricRow {
+	for metricName, cm := range cf.Metrics {
+		ts := cm.Start
+		for _, v := range cm.Data {
+			if !v.IsNaN() {
+				rows = append(rows, ParquetMetricRow{
+					Cluster:   cluster,
+					Hostname:  hostname,
+					Metric:    metricName,
+					Scope:     scope,
+					ScopeID:   scopeID,
+					Timestamp: ts,
+					Frequency: cm.Frequency,
+					Value:     float32(v),
+				})
+			}
+			ts += cm.Frequency
+		}
+	}
+
+	for childName, childCf := range cf.Children {
+		childScope, childScopeID := parseScopeFromName(childName)
+		rows = flattenCheckpointFile(childCf, cluster, hostname, childScope, childScopeID, rows)
+	}
+
+	return rows
+}
+
+// parseScopeFromName infers scope and scope_id from a child level name.
+// Examples: "socket0" → ("socket", "0"), "core12" → ("core", "12"),
+// "accelerator0" → ("accelerator", "0").
+// If the name doesn't match known patterns, it's used as-is for scope with empty scope_id.
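+//
+// Illustrative calls (these cases are mirrored in TestParseScopeFromName):
+//
+//	parseScopeFromName("socket1") // → ("socket", "1")
+//	parseScopeFromName("cpu0")    // → ("hwthread", "0")
+//	parseScopeFromName("socketX") // → ("socketX", "") — suffix is not numeric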
+func parseScopeFromName(name string) (string, string) { + prefixes := []struct { + prefix string + scope string + }{ + {"socket", "socket"}, + {"memoryDomain", "memoryDomain"}, + {"core", "core"}, + {"hwthread", "hwthread"}, + {"cpu", "hwthread"}, + {"accelerator", "accelerator"}, + } + + for _, p := range prefixes { + if len(name) > len(p.prefix) && name[:len(p.prefix)] == p.prefix { + id := name[len(p.prefix):] + if len(id) > 0 && id[0] >= '0' && id[0] <= '9' { + return p.scope, id + } + } + } + + return name, "" +} + +// writeParquetArchive writes rows to a Parquet file with Zstd compression. +func writeParquetArchive(filename string, rows []ParquetMetricRow) error { + if err := os.MkdirAll(filepath.Dir(filename), CheckpointDirPerms); err != nil { + return fmt.Errorf("creating archive directory: %w", err) + } + + f, err := os.OpenFile(filename, os.O_CREATE|os.O_WRONLY|os.O_TRUNC, CheckpointFilePerms) + if err != nil { + return fmt.Errorf("creating parquet file: %w", err) + } + defer f.Close() + + bw := bufio.NewWriterSize(f, 1<<20) // 1MB write buffer + + writer := pq.NewGenericWriter[ParquetMetricRow](bw, + pq.Compression(&pq.Zstd), + pq.SortingWriterConfig(pq.SortingColumns( + pq.Ascending("cluster"), + pq.Ascending("hostname"), + pq.Ascending("metric"), + pq.Ascending("timestamp"), + )), + ) + + if _, err := writer.Write(rows); err != nil { + return fmt.Errorf("writing parquet rows: %w", err) + } + + if err := writer.Close(); err != nil { + return fmt.Errorf("closing parquet writer: %w", err) + } + + if err := bw.Flush(); err != nil { + return fmt.Errorf("flushing parquet file: %w", err) + } + + return nil +} + +// loadCheckpointFileFromDisk reads a JSON or binary checkpoint file and returns +// a CheckpointFile. Used by the Parquet archiver to read checkpoint data +// before converting it to Parquet format. +func loadCheckpointFileFromDisk(filename string) (*CheckpointFile, error) { + f, err := os.Open(filename) + if err != nil { + return nil, err + } + defer f.Close() + + ext := filepath.Ext(filename) + switch ext { + case ".json": + cf := &CheckpointFile{} + br := bufio.NewReader(f) + if err := json.NewDecoder(br).Decode(cf); err != nil { + return nil, fmt.Errorf("decoding JSON checkpoint %s: %w", filename, err) + } + return cf, nil + + case ".bin": + br := bufio.NewReader(f) + var magic uint32 + if err := binary.Read(br, binary.LittleEndian, &magic); err != nil { + return nil, fmt.Errorf("reading magic from %s: %w", filename, err) + } + if magic != snapFileMagic { + return nil, fmt.Errorf("invalid snapshot magic in %s: 0x%08X", filename, magic) + } + var fileFrom, fileTo int64 + if err := binary.Read(br, binary.LittleEndian, &fileFrom); err != nil { + return nil, fmt.Errorf("reading from-timestamp from %s: %w", filename, err) + } + if err := binary.Read(br, binary.LittleEndian, &fileTo); err != nil { + return nil, fmt.Errorf("reading to-timestamp from %s: %w", filename, err) + } + cf, err := readBinaryLevel(br) + if err != nil { + return nil, fmt.Errorf("reading binary level from %s: %w", filename, err) + } + cf.From = fileFrom + cf.To = fileTo + return cf, nil + + default: + return nil, fmt.Errorf("unsupported checkpoint extension: %s", ext) + } +} + +// archiveCheckpointsToParquet reads checkpoint files for a host directory, +// converts them to Parquet rows. Returns the rows and filenames that were processed. 
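+//
+// The resulting files can be read back with parquet-go; a minimal sketch,
+// mirroring the round-trip test (paths illustrative, error handling omitted):
+//
+//	f, _ := os.Open("archive/testcluster/1000.parquet")
+//	stat, _ := f.Stat()
+//	pf, _ := pq.OpenFile(f, stat.Size())
+//	reader := pq.NewGenericReader[ParquetMetricRow](pf)
+//	rows := make([]ParquetMetricRow, 1024)
+//	n, _ := reader.Read(rows) // rows[:n] holds decoded data points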
+func archiveCheckpointsToParquet(dir, cluster, host string, from int64) ([]ParquetMetricRow, []string, error) { + entries, err := os.ReadDir(dir) + if err != nil { + return nil, nil, err + } + + files, err := findFiles(entries, from, false) + if err != nil { + return nil, nil, err + } + + if len(files) == 0 { + return nil, nil, nil + } + + var rows []ParquetMetricRow + + for _, checkpoint := range files { + filename := filepath.Join(dir, checkpoint) + cf, err := loadCheckpointFileFromDisk(filename) + if err != nil { + cclog.Warnf("[METRICSTORE]> skipping unreadable checkpoint %s: %v", filename, err) + continue + } + + rows = flattenCheckpointFile(cf, cluster, host, "node", "", rows) + } + + return rows, files, nil +} diff --git a/pkg/metricstore/parquetArchive_test.go b/pkg/metricstore/parquetArchive_test.go new file mode 100644 index 00000000..d3d70c02 --- /dev/null +++ b/pkg/metricstore/parquetArchive_test.go @@ -0,0 +1,255 @@ +// Copyright (C) NHR@FAU, University Erlangen-Nuremberg. +// All rights reserved. This file is part of cc-backend. +// Use of this source code is governed by a MIT-style +// license that can be found in the LICENSE file. + +package metricstore + +import ( + "encoding/json" + "os" + "path/filepath" + "testing" + + "github.com/ClusterCockpit/cc-lib/v2/schema" + pq "github.com/parquet-go/parquet-go" +) + +func TestParseScopeFromName(t *testing.T) { + tests := []struct { + name string + wantScope string + wantID string + }{ + {"socket0", "socket", "0"}, + {"socket12", "socket", "12"}, + {"core0", "core", "0"}, + {"core127", "core", "127"}, + {"cpu0", "hwthread", "0"}, + {"hwthread5", "hwthread", "5"}, + {"memoryDomain0", "memoryDomain", "0"}, + {"accelerator0", "accelerator", "0"}, + {"unknown", "unknown", ""}, + {"socketX", "socketX", ""}, // not numeric suffix + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + scope, id := parseScopeFromName(tt.name) + if scope != tt.wantScope || id != tt.wantID { + t.Errorf("parseScopeFromName(%q) = (%q, %q), want (%q, %q)", + tt.name, scope, id, tt.wantScope, tt.wantID) + } + }) + } +} + +func TestFlattenCheckpointFile(t *testing.T) { + cf := &CheckpointFile{ + From: 1000, + To: 1060, + Metrics: map[string]*CheckpointMetrics{ + "cpu_load": { + Frequency: 60, + Start: 1000, + Data: []schema.Float{0.5, 0.7, schema.NaN}, + }, + }, + Children: map[string]*CheckpointFile{ + "socket0": { + Metrics: map[string]*CheckpointMetrics{ + "mem_bw": { + Frequency: 60, + Start: 1000, + Data: []schema.Float{100.0, schema.NaN, 200.0}, + }, + }, + Children: make(map[string]*CheckpointFile), + }, + }, + } + + rows := flattenCheckpointFile(cf, "fritz", "node001", "node", "", nil) + + // cpu_load: 2 non-NaN values at node scope + // mem_bw: 2 non-NaN values at socket0 scope + if len(rows) != 4 { + t.Fatalf("expected 4 rows, got %d", len(rows)) + } + + // Verify a node-scope row + found := false + for _, r := range rows { + if r.Metric == "cpu_load" && r.Timestamp == 1000 { + found = true + if r.Cluster != "fritz" || r.Hostname != "node001" || r.Scope != "node" || r.Value != 0.5 { + t.Errorf("unexpected row: %+v", r) + } + } + } + if !found { + t.Error("expected cpu_load row at timestamp 1000") + } + + // Verify a socket-scope row + found = false + for _, r := range rows { + if r.Metric == "mem_bw" && r.Scope == "socket" && r.ScopeID == "0" { + found = true + } + } + if !found { + t.Error("expected mem_bw row with scope=socket, scope_id=0") + } +} + +func TestParquetArchiveRoundtrip(t *testing.T) { + tmpDir := t.TempDir() + + // 
Create checkpoint files on disk (JSON format) + cpDir := filepath.Join(tmpDir, "checkpoints", "testcluster", "node001") + if err := os.MkdirAll(cpDir, 0o755); err != nil { + t.Fatal(err) + } + + cf := &CheckpointFile{ + From: 1000, + To: 1180, + Metrics: map[string]*CheckpointMetrics{ + "cpu_load": { + Frequency: 60, + Start: 1000, + Data: []schema.Float{0.5, 0.7, 0.9}, + }, + "mem_used": { + Frequency: 60, + Start: 1000, + Data: []schema.Float{45.0, 46.0, 47.0}, + }, + }, + Children: map[string]*CheckpointFile{ + "socket0": { + Metrics: map[string]*CheckpointMetrics{ + "mem_bw": { + Frequency: 60, + Start: 1000, + Data: []schema.Float{100.0, 110.0, 120.0}, + }, + }, + Children: make(map[string]*CheckpointFile), + }, + }, + } + + // Write JSON checkpoint + cpFile := filepath.Join(cpDir, "1000.json") + data, err := json.Marshal(cf) + if err != nil { + t.Fatal(err) + } + if err := os.WriteFile(cpFile, data, 0o644); err != nil { + t.Fatal(err) + } + + // Archive to Parquet + archiveDir := filepath.Join(tmpDir, "archive") + rows, files, err := archiveCheckpointsToParquet(cpDir, "testcluster", "node001", 2000) + if err != nil { + t.Fatal(err) + } + if len(files) != 1 || files[0] != "1000.json" { + t.Fatalf("expected 1 file, got %v", files) + } + + parquetFile := filepath.Join(archiveDir, "testcluster", "1000.parquet") + if err := writeParquetArchive(parquetFile, rows); err != nil { + t.Fatal(err) + } + + // Read back and verify + f, err := os.Open(parquetFile) + if err != nil { + t.Fatal(err) + } + defer f.Close() + + stat, _ := f.Stat() + pf, err := pq.OpenFile(f, stat.Size()) + if err != nil { + t.Fatal(err) + } + + reader := pq.NewGenericReader[ParquetMetricRow](pf) + readRows := make([]ParquetMetricRow, 100) + n, err := reader.Read(readRows) + if err != nil && n == 0 { + t.Fatal(err) + } + readRows = readRows[:n] + reader.Close() + + // We expect: cpu_load(3) + mem_used(3) + mem_bw(3) = 9 rows + if n != 9 { + t.Fatalf("expected 9 rows in parquet file, got %d", n) + } + + // Verify cluster and hostname are set correctly + for _, r := range readRows { + if r.Cluster != "testcluster" { + t.Errorf("expected cluster=testcluster, got %s", r.Cluster) + } + if r.Hostname != "node001" { + t.Errorf("expected hostname=node001, got %s", r.Hostname) + } + } + + // Verify parquet file is smaller than JSON (compression working) + if stat.Size() == 0 { + t.Error("parquet file is empty") + } + + t.Logf("Parquet file size: %d bytes for %d rows", stat.Size(), n) +} + +func TestLoadCheckpointFileFromDisk_JSON(t *testing.T) { + tmpDir := t.TempDir() + + cf := &CheckpointFile{ + From: 1000, + To: 1060, + Metrics: map[string]*CheckpointMetrics{ + "test_metric": { + Frequency: 60, + Start: 1000, + Data: []schema.Float{1.0, 2.0, 3.0}, + }, + }, + Children: make(map[string]*CheckpointFile), + } + + filename := filepath.Join(tmpDir, "1000.json") + data, err := json.Marshal(cf) + if err != nil { + t.Fatal(err) + } + if err := os.WriteFile(filename, data, 0o644); err != nil { + t.Fatal(err) + } + + loaded, err := loadCheckpointFileFromDisk(filename) + if err != nil { + t.Fatal(err) + } + + if loaded.From != 1000 || loaded.To != 1060 { + t.Errorf("expected From=1000, To=1060, got From=%d, To=%d", loaded.From, loaded.To) + } + + m, ok := loaded.Metrics["test_metric"] + if !ok { + t.Fatal("expected test_metric in loaded checkpoint") + } + if m.Frequency != 60 || m.Start != 1000 || len(m.Data) != 3 { + t.Errorf("unexpected metric data: %+v", m) + } +} From 1ec41d8389e81e7c517960dda251ec6a8a53ad39 Mon Sep 17 00:00:00 2001 
From: Jan Eitzinger
Date: Sat, 28 Feb 2026 19:34:33 +0100
Subject: [PATCH 13/20] Review and improve buffer pool implementation. Add unit tests.

---
 pkg/metricstore/buffer.go           |  75 ++--
 pkg/metricstore/level.go            |   9 +-
 pkg/metricstore/metricstore.go      |   2 +-
 pkg/metricstore/metricstore_test.go | 520 ++++++++++++++++++++++++++++
 4 files changed, 566 insertions(+), 40 deletions(-)

diff --git a/pkg/metricstore/buffer.go b/pkg/metricstore/buffer.go
index f486e645..557a941c 100644
--- a/pkg/metricstore/buffer.go
+++ b/pkg/metricstore/buffer.go
@@ -54,6 +54,10 @@ import (
 // of data or reallocation needs to happen on writes.
 const BufferCap int = DefaultBufferCapacity
 
+// maxPoolSize caps the number of buffers held in the pool at any time.
+// Prevents unbounded memory growth after large retention-cleanup bursts.
+const maxPoolSize = 4096
+
 // BufferPool is the global instance.
 // It is initialized immediately when the package loads.
 var bufferPool = NewPersistentBufferPool()
@@ -89,12 +93,18 @@ func (p *PersistentBufferPool) Get() *buffer {
 	return b
 }
 
+// Put returns b to the pool. The caller must set b.lastUsed = time.Now().Unix()
+// before calling Put so that Clean() can evict idle entries correctly.
 func (p *PersistentBufferPool) Put(b *buffer) {
 	// Reset the buffer before putting it back
 	b.data = b.data[:0]
 
 	p.mu.Lock()
 	defer p.mu.Unlock()
+	if len(p.pool) >= maxPoolSize {
+		// Pool is full; drop the buffer and let GC collect it.
+		return
+	}
 	p.pool = append(p.pool, b)
 }
 
@@ -121,13 +131,11 @@ func (p *PersistentBufferPool) Clean(threshold int64) {
 	p.mu.Lock()
 	defer p.mu.Unlock()
 
-	// Filter in place
+	// Filter in place, retaining only buffers returned to the pool recently enough.
 	active := p.pool[:0]
 	for _, b := range p.pool {
 		if b.lastUsed >= threshold {
 			active = append(active, b)
-		} else {
-			// Buffer is older than the threshold, let it be collected by GC
 		}
 	}
 
@@ -139,19 +147,6 @@ func (p *PersistentBufferPool) Clean(threshold int64) {
 	p.pool = active
 }
 
-// CleanAll removes all buffers from the pool.
-func (p *PersistentBufferPool) CleanAll() {
-	p.mu.Lock()
-	defer p.mu.Unlock()
-
-	// Nullify all buffers to prevent memory leaks
-	for i := range p.pool {
-		p.pool[i] = nil
-	}
-
-	p.pool = p.pool[:0]
-}
-
 var (
 	// ErrNoData indicates no time-series data exists for the requested metric/level.
 	ErrNoData error = errors.New("[METRICSTORE]> no data for this metric/level")
@@ -276,11 +271,13 @@ func (b *buffer) firstWrite() int64 {
 //
 // Panics if 'data' slice is too small to hold all values in [from, to).
 func (b *buffer) read(from, to int64, data []schema.Float) ([]schema.Float, int64, int64, error) {
-	if from < b.firstWrite() {
-		if b.prev != nil {
-			return b.prev.read(from, to, data)
+	// Walk back to the buffer that covers 'from', adjusting if we hit the oldest.
+	for from < b.firstWrite() {
+		if b.prev == nil {
+			from = b.firstWrite()
+			break
 		}
-		from = b.firstWrite()
+		b = b.prev
 	}
 
 	i := 0
@@ -292,16 +289,17 @@ func (b *buffer) read(from, to int64, data []schema.Float) ([]schema.Float, int6
 			break
 		}
 		b = b.next
-		idx = 0
+		// Recalculate idx in the new buffer; a gap between buffers may exist.
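+		// e.g. t=25, b.start=0, frequency=10 → idx = (25-0)/10 = 2
+		// (the case exercised by TestBufferReadIdxAfterSwitch below).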
+ idx = int((t - b.start) / b.frequency) } if idx >= len(b.data) { if b.next == nil || to <= b.next.start { break } - data[i] += schema.NaN + data[i] += schema.NaN // NaN + anything = NaN; propagates missing data } else if t < b.start { - data[i] += schema.NaN + data[i] += schema.NaN // gap before this buffer's first write } else { data[i] += b.data[idx] } @@ -359,11 +357,12 @@ func (b *buffer) forceFreeOldest() (delme bool, n int) { // If the previous buffer signals it should be deleted: if delPrev { - // Clear links on the dying buffer to prevent leaks b.prev.next = nil - b.prev.data = nil // Release the underlying float slice immediately - - // Remove the link from the current buffer + if cap(b.prev.data) != BufferCap { + b.prev.data = make([]schema.Float, 0, BufferCap) + } + b.prev.lastUsed = time.Now().Unix() + bufferPool.Put(b.prev) b.prev = nil } return false, freed @@ -392,21 +391,27 @@ func (b *buffer) iterFromTo(from, to int64, callback func(b *buffer) error) erro return nil } - if err := b.prev.iterFromTo(from, to, callback); err != nil { - return err + // Collect overlapping buffers walking backwards (newest → oldest). + var matching []*buffer + for cur := b; cur != nil; cur = cur.prev { + if from <= cur.end() && cur.start <= to { + matching = append(matching, cur) + } } - if from <= b.end() && b.start <= to { - return callback(b) + // Invoke callback in chronological order (oldest → newest). + for i := len(matching) - 1; i >= 0; i-- { + if err := callback(matching[i]); err != nil { + return err + } } - return nil } func (b *buffer) count() int64 { - res := int64(len(b.data)) - if b.prev != nil { - res += b.prev.count() + var res int64 + for ; b != nil; b = b.prev { + res += int64(len(b.data)) } return res } diff --git a/pkg/metricstore/level.go b/pkg/metricstore/level.go index ef082579..2b24a2ea 100644 --- a/pkg/metricstore/level.go +++ b/pkg/metricstore/level.go @@ -238,12 +238,13 @@ func (l *Level) forceFree() (int, error) { // If delme is true, it means 'b' itself (the head) was the oldest // and needs to be removed from the slice. if delme { - // Nil out fields to ensure no hanging references - b.next = nil b.prev = nil - b.data = nil - + if cap(b.data) != BufferCap { + b.data = make([]schema.Float, 0, BufferCap) + } + b.lastUsed = time.Now().Unix() + bufferPool.Put(b) l.metrics[i] = nil } } diff --git a/pkg/metricstore/metricstore.go b/pkg/metricstore/metricstore.go index db3e4357..b5b1a528 100644 --- a/pkg/metricstore/metricstore.go +++ b/pkg/metricstore/metricstore.go @@ -428,7 +428,7 @@ func MemoryUsageTracker(wg *sync.WaitGroup, ctx context.Context) { runtime.ReadMemStats(&mem) actualMemoryGB = float64(mem.Alloc) / 1e9 - bufferPool.CleanAll() + bufferPool.Clear() cclog.Infof("[METRICSTORE]> Cleaned up bufferPool\n") if actualMemoryGB > float64(Keys.MemoryCap) { diff --git a/pkg/metricstore/metricstore_test.go b/pkg/metricstore/metricstore_test.go index 772fd7ea..9087df2a 100644 --- a/pkg/metricstore/metricstore_test.go +++ b/pkg/metricstore/metricstore_test.go @@ -12,6 +12,526 @@ import ( "github.com/ClusterCockpit/cc-lib/v2/schema" ) +// ─── Buffer pool ───────────────────────────────────────────────────────────── + +// TestBufferPoolGetReuse verifies that Get() returns pooled buffers before +// allocating new ones, and that an empty pool allocates a fresh BufferCap buffer. 
+func TestBufferPoolGetReuse(t *testing.T) { + pool := NewPersistentBufferPool() + + original := &buffer{data: make([]schema.Float, 0, BufferCap), lastUsed: time.Now().Unix()} + pool.Put(original) + + reused := pool.Get() + if reused != original { + t.Error("Get() should return the previously pooled buffer") + } + if pool.GetSize() != 0 { + t.Errorf("pool size after Get() = %d, want 0", pool.GetSize()) + } + + // Empty pool must allocate a fresh buffer with the standard capacity. + fresh := pool.Get() + if fresh == nil { + t.Fatal("Get() from empty pool returned nil") + } + if cap(fresh.data) != BufferCap { + t.Errorf("fresh buffer cap = %d, want %d", cap(fresh.data), BufferCap) + } +} + +// TestBufferPoolClear verifies that Clear() drains all entries. +func TestBufferPoolClear(t *testing.T) { + pool := NewPersistentBufferPool() + for i := 0; i < 10; i++ { + pool.Put(&buffer{data: make([]schema.Float, 0), lastUsed: time.Now().Unix()}) + } + pool.Clear() + if pool.GetSize() != 0 { + t.Errorf("pool size after Clear() = %d, want 0", pool.GetSize()) + } +} + +// TestBufferPoolMaxSize verifies that Put() silently drops buffers once the +// pool reaches maxPoolSize, preventing unbounded memory growth. +func TestBufferPoolMaxSize(t *testing.T) { + pool := NewPersistentBufferPool() + for i := 0; i < maxPoolSize; i++ { + pool.Put(&buffer{data: make([]schema.Float, 0, BufferCap), lastUsed: time.Now().Unix()}) + } + if pool.GetSize() != maxPoolSize { + t.Fatalf("pool size = %d, want %d", pool.GetSize(), maxPoolSize) + } + + pool.Put(&buffer{data: make([]schema.Float, 0, BufferCap), lastUsed: time.Now().Unix()}) + if pool.GetSize() != maxPoolSize { + t.Errorf("pool size after overflow Put = %d, want %d (should not grow)", pool.GetSize(), maxPoolSize) + } +} + +// ─── Buffer helpers ─────────────────────────────────────────────────────────── + +// TestBufferEndFirstWrite verifies the end() and firstWrite() calculations. +func TestBufferEndFirstWrite(t *testing.T) { + // start=90, freq=10 → firstWrite = 90+5 = 95 + b := &buffer{data: make([]schema.Float, 4, BufferCap), frequency: 10, start: 90} + if fw := b.firstWrite(); fw != 95 { + t.Errorf("firstWrite() = %d, want 95", fw) + } + // end = firstWrite + len(data)*freq = 95 + 4*10 = 135 + if e := b.end(); e != 135 { + t.Errorf("end() = %d, want 135", e) + } +} + +// ─── Buffer write ───────────────────────────────────────────────────────────── + +// TestBufferWriteNaNFill verifies that skipped timestamps are filled with NaN. +func TestBufferWriteNaNFill(t *testing.T) { + b := newBuffer(100, 10) + b.write(100, schema.Float(1.0)) + // skip 110 and 120 + b.write(130, schema.Float(4.0)) + + if len(b.data) != 4 { + t.Fatalf("len(data) = %d, want 4 (1 value + 2 NaN + 1 value)", len(b.data)) + } + if b.data[0] != schema.Float(1.0) { + t.Errorf("data[0] = %v, want 1.0", b.data[0]) + } + if !b.data[1].IsNaN() { + t.Errorf("data[1] should be NaN (gap), got %v", b.data[1]) + } + if !b.data[2].IsNaN() { + t.Errorf("data[2] should be NaN (gap), got %v", b.data[2]) + } + if b.data[3] != schema.Float(4.0) { + t.Errorf("data[3] = %v, want 4.0", b.data[3]) + } +} + +// TestBufferWriteCapacityOverflow verifies that exceeding capacity creates and +// links a new buffer rather than panicking or silently dropping data. +func TestBufferWriteCapacityOverflow(t *testing.T) { + // Cap=2 so the third write must overflow into a new buffer. 
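+	// newBuffer() always hands out BufferCap-sized buffers from the pool,
+	// so the small-capacity buffer is constructed by hand.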
+ b := &buffer{data: make([]schema.Float, 0, 2), frequency: 10, start: 95} + + nb, _ := b.write(100, schema.Float(1.0)) + nb, _ = nb.write(110, schema.Float(2.0)) + nb, err := nb.write(120, schema.Float(3.0)) + if err != nil { + t.Fatalf("write() error = %v", err) + } + if nb == b { + t.Fatal("write() should have returned a new buffer after overflow") + } + if nb.prev != b { + t.Error("new buffer should link back to old via prev") + } + if b.next != nb { + t.Error("old buffer should link forward to new via next") + } + if len(b.data) != 2 { + t.Errorf("old buffer len = %d, want 2 (full)", len(b.data)) + } + if nb.data[0] != schema.Float(3.0) { + t.Errorf("new buffer data[0] = %v, want 3.0", nb.data[0]) + } +} + +// TestBufferWriteOverwrite verifies that writing to an already-occupied index +// replaces the value rather than appending. +func TestBufferWriteOverwrite(t *testing.T) { + b := newBuffer(100, 10) + b.write(100, schema.Float(1.0)) + b.write(110, schema.Float(2.0)) + + // Overwrite the first slot. + b.write(100, schema.Float(99.0)) + if len(b.data) != 2 { + t.Errorf("len(data) after overwrite = %d, want 2 (no append)", len(b.data)) + } + if b.data[0] != schema.Float(99.0) { + t.Errorf("data[0] after overwrite = %v, want 99.0", b.data[0]) + } +} + +// ─── Buffer read ────────────────────────────────────────────────────────────── + +// TestBufferReadBeforeFirstWrite verifies that 'from' is clamped to firstWrite +// when the requested range starts before any data in the chain. +func TestBufferReadBeforeFirstWrite(t *testing.T) { + b := newBuffer(100, 10) // firstWrite = 100 + b.write(100, schema.Float(1.0)) + b.write(110, schema.Float(2.0)) + + data := make([]schema.Float, 10) + result, adjustedFrom, _, err := b.read(50, 120, data) + if err != nil { + t.Fatalf("read() error = %v", err) + } + if adjustedFrom != 100 { + t.Errorf("adjustedFrom = %d, want 100 (clamped to firstWrite)", adjustedFrom) + } + if len(result) != 2 { + t.Errorf("len(result) = %d, want 2", len(result)) + } +} + +// TestBufferReadChain verifies that read() traverses a multi-buffer chain and +// returns contiguous values from both buffers. +// +// The switch to b.next in read() triggers on idx >= cap(b.data), so b1 must +// be full (len == cap) for the loop to advance to b2 without producing NaN. +func TestBufferReadChain(t *testing.T) { + // b1: cap=3, covers t=100..120. b2: covers t=130..150. b2 is head. + b1 := &buffer{data: make([]schema.Float, 0, 3), frequency: 10, start: 95} + b1.data = append(b1.data, 1.0, 2.0, 3.0) // fills b1: len=cap=3 + + b2 := &buffer{data: make([]schema.Float, 0, 3), frequency: 10, start: 125} + b2.data = append(b2.data, 4.0, 5.0, 6.0) // t=130,140,150 + b2.prev = b1 + b1.next = b2 + + data := make([]schema.Float, 6) + result, from, to, err := b2.read(100, 160, data) + if err != nil { + t.Fatalf("read() error = %v", err) + } + if from != 100 || to != 160 { + t.Errorf("read() from/to = %d/%d, want 100/160", from, to) + } + if len(result) != 6 { + t.Fatalf("len(result) = %d, want 6", len(result)) + } + for i, want := range []schema.Float{1, 2, 3, 4, 5, 6} { + if result[i] != want { + t.Errorf("result[%d] = %v, want %v", i, result[i], want) + } + } +} + +// TestBufferReadIdxAfterSwitch is a regression test for the index recalculation +// bug after switching to b.next during a read. 
+// +// When both buffers share the same start time (can happen with checkpoint-loaded +// chains), the old code hardcoded idx=0 after the switch, causing reads at time t +// to return the wrong element from the next buffer. +func TestBufferReadIdxAfterSwitch(t *testing.T) { + // b1: cap=2, both buffers start at 0 (firstWrite=5). + // b1 carries t=5 and t=15; b2 carries t=5,15,25,35 with the same start. + // When reading reaches t=25 the loop overflows b1 (idx=2 >= cap=2) and + // switches to b2. The correct index in b2 is (25-0)/10=2 → b2.data[2]=30.0. + // The old code set idx=0 → b2.data[0]=10.0 (wrong). + b1 := &buffer{data: make([]schema.Float, 0, 2), frequency: 10, start: 0} + b1.data = append(b1.data, schema.Float(1.0), schema.Float(2.0)) // t=5, t=15 + + b2 := &buffer{data: make([]schema.Float, 0, 10), frequency: 10, start: 0} + b2.data = append(b2.data, + schema.Float(10.0), schema.Float(20.0), + schema.Float(30.0), schema.Float(40.0)) // t=5,15,25,35 + b2.prev = b1 + b1.next = b2 + + // from=0 triggers the walkback to b1 (from < b2.firstWrite=5). + // After clamping, the loop runs t=5,15,25,35. + data := make([]schema.Float, 4) + result, _, _, err := b2.read(0, 36, data) + if err != nil { + t.Fatalf("read() error = %v", err) + } + if len(result) < 3 { + t.Fatalf("len(result) = %d, want >= 3", len(result)) + } + if result[0] != schema.Float(1.0) { + t.Errorf("result[0] (t=5) = %v, want 1.0 (from b1)", result[0]) + } + if result[1] != schema.Float(2.0) { + t.Errorf("result[1] (t=15) = %v, want 2.0 (from b1)", result[1]) + } + // This is the critical assertion: old code returned 10.0 (b2.data[0]). + if result[2] != schema.Float(30.0) { + t.Errorf("result[2] (t=25) = %v, want 30.0 (idx recalculation fix)", result[2]) + } +} + +// TestBufferReadNaNValues verifies that NaN slots written to the buffer are +// returned as NaN during read. +func TestBufferReadNaNValues(t *testing.T) { + b := newBuffer(100, 10) + b.write(100, schema.Float(1.0)) + b.write(110, schema.NaN) + b.write(120, schema.Float(3.0)) + + data := make([]schema.Float, 3) + result, _, _, err := b.read(100, 130, data) + if err != nil { + t.Fatalf("read() error = %v", err) + } + if len(result) != 3 { + t.Fatalf("len(result) = %d, want 3", len(result)) + } + if result[0] != schema.Float(1.0) { + t.Errorf("result[0] = %v, want 1.0", result[0]) + } + if !result[1].IsNaN() { + t.Errorf("result[1] should be NaN, got %v", result[1]) + } + if result[2] != schema.Float(3.0) { + t.Errorf("result[2] = %v, want 3.0", result[2]) + } +} + +// TestBufferReadAccumulation verifies the += accumulation pattern used for +// aggregation: values are added to whatever was already in the data slice. +func TestBufferReadAccumulation(t *testing.T) { + b := newBuffer(100, 10) + b.write(100, schema.Float(3.0)) + b.write(110, schema.Float(5.0)) + + // Pre-populate data slice (simulates a second metric being summed in). + data := []schema.Float{2.0, 1.0, 0.0} + result, _, _, err := b.read(100, 120, data) + if err != nil { + t.Fatalf("read() error = %v", err) + } + // 2.0+3.0=5.0, 1.0+5.0=6.0 + if result[0] != schema.Float(5.0) { + t.Errorf("result[0] = %v, want 5.0 (2+3)", result[0]) + } + if result[1] != schema.Float(6.0) { + t.Errorf("result[1] = %v, want 6.0 (1+5)", result[1]) + } +} + +// ─── Buffer free ───────────────────────────────────────────────────────────── + +// newTestPool swaps out the package-level bufferPool for a fresh isolated one +// and returns a cleanup function that restores the original. 
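+//
+// Because the package-level bufferPool is swapped, tests that use newTestPool
+// must not call t.Parallel().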
+func newTestPool(t *testing.T) *PersistentBufferPool { + t.Helper() + pool := NewPersistentBufferPool() + saved := bufferPool + bufferPool = pool + t.Cleanup(func() { bufferPool = saved }) + return pool +} + +// TestBufferFreeRetention verifies that free() removes buffers whose entire +// time range falls before the retention threshold and returns them to the pool. +func TestBufferFreeRetention(t *testing.T) { + pool := newTestPool(t) + + // b1: firstWrite=5, end=25 b2: firstWrite=25, end=45 b3: firstWrite=45, end=65 + b1 := &buffer{data: make([]schema.Float, 0, BufferCap), frequency: 10, start: 0} + b1.data = append(b1.data, 1.0, 2.0) + + b2 := &buffer{data: make([]schema.Float, 0, BufferCap), frequency: 10, start: 20} + b2.data = append(b2.data, 3.0, 4.0) + b2.prev = b1 + b1.next = b2 + + b3 := &buffer{data: make([]schema.Float, 0, BufferCap), frequency: 10, start: 40} + b3.data = append(b3.data, 5.0, 6.0) + b3.prev = b2 + b2.next = b3 + + // Threshold=30: b1.end()=25 < 30 → freed; b2.end()=45 >= 30 → kept. + delme, n := b3.free(30) + if delme { + t.Error("head buffer b3 should not be marked for deletion") + } + if n != 1 { + t.Errorf("freed count = %d, want 1", n) + } + if b2.prev != nil { + t.Error("b1 should have been unlinked from b2.prev") + } + if b3.prev != b2 { + t.Error("b3 should still reference b2") + } + if pool.GetSize() != 1 { + t.Errorf("pool size = %d, want 1 (b1 returned)", pool.GetSize()) + } +} + +// TestBufferFreeAll verifies that free() removes all buffers and signals the +// caller to delete the head when the entire chain is older than the threshold. +func TestBufferFreeAll(t *testing.T) { + pool := newTestPool(t) + + b1 := &buffer{data: make([]schema.Float, 0, BufferCap), frequency: 10, start: 0} + b1.data = append(b1.data, 1.0, 2.0) // end=25 + + b2 := &buffer{data: make([]schema.Float, 0, BufferCap), frequency: 10, start: 20} + b2.data = append(b2.data, 3.0, 4.0) // end=45 + b2.prev = b1 + b1.next = b2 + + // Threshold=100 > both ends → both should be freed. + delme, n := b2.free(100) + if !delme { + t.Error("head buffer b2 should be marked for deletion when all data is stale") + } + if n != 2 { + t.Errorf("freed count = %d, want 2", n) + } + // b1 was freed inside free(); b2 is returned with delme=true for the caller. + if pool.GetSize() != 1 { + t.Errorf("pool size = %d, want 1 (b1 returned; b2 returned by caller)", pool.GetSize()) + } +} + +// ─── forceFreeOldest ───────────────────────────────────────────────────────── + +// TestForceFreeOldestPoolReturn verifies that forceFreeOldest() returns the +// freed buffer to the pool (regression: previously it was just dropped). 
+func TestForceFreeOldestPoolReturn(t *testing.T) { + pool := newTestPool(t) + + b1 := &buffer{data: make([]schema.Float, 0, BufferCap), frequency: 10, start: 0} + b2 := &buffer{data: make([]schema.Float, 0, BufferCap), frequency: 10, start: 20} + b3 := &buffer{data: make([]schema.Float, 0, BufferCap), frequency: 10, start: 40} + b1.data = append(b1.data, 1.0) + b2.data = append(b2.data, 2.0) + b3.data = append(b3.data, 3.0) + b2.prev = b1 + b1.next = b2 + b3.prev = b2 + b2.next = b3 + + delme, n := b3.forceFreeOldest() + if delme { + t.Error("head b3 should not be marked for deletion (chain has 3 buffers)") + } + if n != 1 { + t.Errorf("freed count = %d, want 1", n) + } + if b2.prev != nil { + t.Error("b1 should have been unlinked from b2.prev after forceFreeOldest") + } + if b3.prev != b2 { + t.Error("b3 should still link to b2") + } + if pool.GetSize() != 1 { + t.Errorf("pool size = %d, want 1 (b1 returned to pool)", pool.GetSize()) + } +} + +// TestForceFreeOldestSingleBuffer verifies that forceFreeOldest() returns +// delme=true when the buffer is the only one in the chain. +func TestForceFreeOldestSingleBuffer(t *testing.T) { + b := newBuffer(100, 10) + b.write(100, schema.Float(1.0)) + + delme, n := b.forceFreeOldest() + if !delme { + t.Error("single-buffer chain: expected delme=true (the buffer IS the oldest)") + } + if n != 1 { + t.Errorf("freed count = %d, want 1", n) + } +} + +// ─── iterFromTo ─────────────────────────────────────────────────────────────── + +// TestBufferIterFromToOrder verifies that iterFromTo invokes the callback in +// chronological order (oldest → newest). +func TestBufferIterFromToOrder(t *testing.T) { + // Each buffer has 2 data points so end() = firstWrite + 2*freq. + b1 := &buffer{data: make([]schema.Float, 2, BufferCap), frequency: 10, start: 0} // end=25 + b2 := &buffer{data: make([]schema.Float, 2, BufferCap), frequency: 10, start: 20} // end=45 + b3 := &buffer{data: make([]schema.Float, 2, BufferCap), frequency: 10, start: 40} // end=65 + b2.prev = b1 + b1.next = b2 + b3.prev = b2 + b2.next = b3 + + var order []*buffer + err := b3.iterFromTo(0, 100, func(b *buffer) error { + order = append(order, b) + return nil + }) + if err != nil { + t.Fatalf("iterFromTo() error = %v", err) + } + if len(order) != 3 { + t.Fatalf("callback count = %d, want 3", len(order)) + } + if order[0] != b1 || order[1] != b2 || order[2] != b3 { + t.Error("iterFromTo() did not call callbacks in chronological (oldest→newest) order") + } +} + +// TestBufferIterFromToFiltered verifies that iterFromTo only calls the callback +// for buffers whose time range overlaps [from, to]. +func TestBufferIterFromToFiltered(t *testing.T) { + // b1: end=25 b2: start=20, end=45 b3: start=40, end=65 + b1 := &buffer{data: make([]schema.Float, 2, BufferCap), frequency: 10, start: 0} + b2 := &buffer{data: make([]schema.Float, 2, BufferCap), frequency: 10, start: 20} + b3 := &buffer{data: make([]schema.Float, 2, BufferCap), frequency: 10, start: 40} + b2.prev = b1 + b1.next = b2 + b3.prev = b2 + b2.next = b3 + + // [30,50]: b1.end=25 < 30 → excluded; b2 and b3 overlap → included. + var visited []*buffer + b3.iterFromTo(30, 50, func(b *buffer) error { + visited = append(visited, b) + return nil + }) + if len(visited) != 2 { + t.Fatalf("visited count = %d, want 2 (b2 and b3)", len(visited)) + } + if visited[0] != b2 || visited[1] != b3 { + t.Errorf("visited = %v, want [b2, b3]", visited) + } +} + +// TestBufferIterFromToNilBuffer verifies that iterFromTo on a nil buffer is a +// safe no-op. 
+func TestBufferIterFromToNilBuffer(t *testing.T) { + var b *buffer + called := false + err := b.iterFromTo(0, 100, func(_ *buffer) error { + called = true + return nil + }) + if err != nil { + t.Errorf("iterFromTo(nil) error = %v, want nil", err) + } + if called { + t.Error("callback should not be called for a nil buffer") + } +} + +// ─── count ──────────────────────────────────────────────────────────────────── + +// TestBufferCount verifies that count() sums data-point lengths across the +// entire chain, including all prev links. +func TestBufferCount(t *testing.T) { + b1 := &buffer{data: make([]schema.Float, 3, BufferCap), frequency: 10, start: 0} + b2 := &buffer{data: make([]schema.Float, 2, BufferCap), frequency: 10, start: 35} + b3 := &buffer{data: make([]schema.Float, 5, BufferCap), frequency: 10, start: 60} + b2.prev = b1 + b1.next = b2 + b3.prev = b2 + b2.next = b3 + + if got := b3.count(); got != 10 { + t.Errorf("count() = %d, want 10 (3+2+5)", got) + } + + // Single buffer. + lone := &buffer{data: make([]schema.Float, 7, BufferCap)} + if got := lone.count(); got != 7 { + t.Errorf("count() single buffer = %d, want 7", got) + } +} + +// ─── Existing tests below ──────────────────────────────────────────────────── + func TestAssignAggregationStrategy(t *testing.T) { tests := []struct { name string From 3d5a124321763c3b89323f3e02fcc3584245392a Mon Sep 17 00:00:00 2001 From: Jan Eitzinger Date: Mon, 2 Mar 2026 15:01:44 +0100 Subject: [PATCH 14/20] Refine patterns. Do not match commented lines. --- configs/tagger/apps/caracal.txt | 1 - configs/tagger/apps/lammps.txt | 2 +- internal/tagger/detectApp.go | 6 ++++-- 3 files changed, 5 insertions(+), 4 deletions(-) diff --git a/configs/tagger/apps/caracal.txt b/configs/tagger/apps/caracal.txt index ed615121..5c5311f7 100644 --- a/configs/tagger/apps/caracal.txt +++ b/configs/tagger/apps/caracal.txt @@ -2,6 +2,5 @@ calc_rate qmdffgen dynamic evbopt -explore black_box poly_qmdff diff --git a/configs/tagger/apps/lammps.txt b/configs/tagger/apps/lammps.txt index d254f82f..38d3aa5d 100644 --- a/configs/tagger/apps/lammps.txt +++ b/configs/tagger/apps/lammps.txt @@ -1 +1 @@ -lmp +\blmp\s+ diff --git a/internal/tagger/detectApp.go b/internal/tagger/detectApp.go index f86dcb6c..54626eff 100644 --- a/internal/tagger/detectApp.go +++ b/internal/tagger/detectApp.go @@ -64,9 +64,11 @@ func (t *AppTagger) scanApp(f *os.File, fns string) { if line == "" { continue } - re, err := regexp.Compile(line) + // Wrap pattern to skip comment lines: match only if not preceded by # on the same line + wrapped := `(?m)^[^#]*` + line + re, err := regexp.Compile(wrapped) if err != nil { - cclog.Errorf("invalid regex pattern '%s' in %s: %v", line, fns, err) + cclog.Errorf("invalid regex pattern '%s' (wrapped: '%s') in %s: %v", line, wrapped, fns, err) continue } ai.patterns = append(ai.patterns, re) From a243e1749921abe59480c7b171a6c43cbcbbf09a Mon Sep 17 00:00:00 2001 From: Aditya Ujeniya Date: Mon, 2 Mar 2026 15:27:06 +0100 Subject: [PATCH 15/20] Update to shutdown worker for WAL checkpointing mode --- configs/config-demo.json | 1 + pkg/metricstore/metricstore.go | 2 +- pkg/metricstore/walCheckpoint.go | 12 +++++++++++- 3 files changed, 13 insertions(+), 2 deletions(-) diff --git a/configs/config-demo.json b/configs/config-demo.json index 509c8f18..50dff298 100644 --- a/configs/config-demo.json +++ b/configs/config-demo.json @@ -21,6 +21,7 @@ ], "metric-store": { "checkpoints": { + "file-format": "wal", "interval": "12h" }, "retention-in-memory": "48h", diff --git 
a/pkg/metricstore/metricstore.go b/pkg/metricstore/metricstore.go index b5b1a528..6d49624a 100644 --- a/pkg/metricstore/metricstore.go +++ b/pkg/metricstore/metricstore.go @@ -294,7 +294,7 @@ func Shutdown() { var hostDirs []string files, hostDirs, err = ms.ToCheckpointWAL(Keys.Checkpoints.RootDir, from.Unix(), time.Now().Unix()) if err == nil { - RotateWALFiles(hostDirs) + RotateWALFilesAfterShutdown(hostDirs) } } else { files, err = ms.ToCheckpoint(Keys.Checkpoints.RootDir, from.Unix(), time.Now().Unix()) diff --git a/pkg/metricstore/walCheckpoint.go b/pkg/metricstore/walCheckpoint.go index 685a8388..07414d98 100644 --- a/pkg/metricstore/walCheckpoint.go +++ b/pkg/metricstore/walCheckpoint.go @@ -116,7 +116,6 @@ type walFileState struct { // Also handles WAL rotation requests from the checkpoint goroutine. func WALStaging(wg *sync.WaitGroup, ctx context.Context) { wg.Go(func() { - if Keys.Checkpoints.FileFormat == "json" { return } @@ -235,6 +234,17 @@ func RotateWALFiles(hostDirs []string) { } } +// RotateWALFilesAfterShutdown removes the current WAL file in each given +// host directory; at shutdown the WALStaging worker that would service rotation requests has already exited. +func RotateWALFilesAfterShutdown(hostDirs []string) { + for _, dir := range hostDirs { + walPath := path.Join(dir, "current.wal") + if err := os.Remove(walPath); err != nil && !os.IsNotExist(err) { + cclog.Errorf("[METRICSTORE]> WAL: remove %s: %v", walPath, err) + } + } +} + // buildWALPayload encodes a WALMessage into a binary payload (without magic/length/CRC). func buildWALPayload(msg *WALMessage) []byte { size := 8 + 2 + len(msg.MetricName) + 1 + 4 From 718ff60221028881d01d3b7dd05f21991cc84018 Mon Sep 17 00:00:00 2001 From: Christoph Kluge Date: Mon, 2 Mar 2026 14:10:28 +0100 Subject: [PATCH 16/20] clarify ccms logs --- cmd/cc-backend/main.go | 2 +- internal/metricdispatch/metricdata.go | 4 ++-- .../metricstoreclient/cc-metric-store-queries.go | 4 ++-- internal/metricstoreclient/cc-metric-store.go | 10 +++++----- pkg/metricstore/query.go | 14 +++++++------- 5 files changed, 17 insertions(+), 17 deletions(-) diff --git a/cmd/cc-backend/main.go b/cmd/cc-backend/main.go index 81d397d2..5b51b963 100644 --- a/cmd/cc-backend/main.go +++ b/cmd/cc-backend/main.go @@ -339,7 +339,7 @@ func runServer(ctx context.Context) error { err := metricdispatch.Init(mscfg) if err != nil { - cclog.Debugf("initializing metricdispatch: %v", err) + cclog.Debugf("error while initializing external metricdispatch: %v", err) } else { haveMetricstore = true } diff --git a/internal/metricdispatch/metricdata.go b/internal/metricdispatch/metricdata.go index 36a10004..3f03234e 100755 --- a/internal/metricdispatch/metricdata.go +++ b/internal/metricdispatch/metricdata.go @@ -74,11 +74,11 @@ func Init(rawConfig json.RawMessage) error { dec := json.NewDecoder(bytes.NewReader(rawConfig)) dec.DisallowUnknownFields() if err := dec.Decode(&configs); err != nil { - return fmt.Errorf("[METRICDISPATCH]> Metric Store Config Init: Could not decode config file '%s' Error: %s", rawConfig, err.Error()) + return fmt.Errorf("[METRICDISPATCH]> External Metric Store Config Init: Could not decode config file '%s' Error: %s", rawConfig, err.Error()) } if len(configs) == 0 { - return fmt.Errorf("[METRICDISPATCH]> No metric store configurations found in config file") + return fmt.Errorf("[METRICDISPATCH]> No external metric store configurations found in config file") } for _, config := range configs { diff --git a/internal/metricstoreclient/cc-metric-store-queries.go b/internal/metricstoreclient/cc-metric-store-queries.go index
d42c9355..949efa10 100644 --- a/internal/metricstoreclient/cc-metric-store-queries.go +++ b/internal/metricstoreclient/cc-metric-store-queries.go @@ -134,7 +134,7 @@ func (ccms *CCMetricStore) buildQueries( ) if len(hostQueries) == 0 && len(hostScopes) == 0 { - return nil, nil, fmt.Errorf("METRICDATA/CCMS > TODO: unhandled case: native-scope=%s, requested-scope=%s", nativeScope, requestedScope) + return nil, nil, fmt.Errorf("METRICDATA/INTERNAL-CCMS > TODO: unhandled case: native-scope=%s, requested-scope=%s", nativeScope, requestedScope) } queries = append(queries, hostQueries...) @@ -237,7 +237,7 @@ func (ccms *CCMetricStore) buildNodeQueries( ) if len(nodeQueries) == 0 && len(nodeScopes) == 0 { - return nil, nil, fmt.Errorf("METRICDATA/CCMS > TODO: unhandled case: native-scope=%s, requested-scope=%s", nativeScope, requestedScope) + return nil, nil, fmt.Errorf("METRICDATA/EXTERNAL-CCMS > TODO: unhandled case: native-scope=%s, requested-scope=%s", nativeScope, requestedScope) } queries = append(queries, nodeQueries...) diff --git a/internal/metricstoreclient/cc-metric-store.go b/internal/metricstoreclient/cc-metric-store.go index 7bf7d146..39c028d5 100644 --- a/internal/metricstoreclient/cc-metric-store.go +++ b/internal/metricstoreclient/cc-metric-store.go @@ -123,7 +123,7 @@ type APIMetricData struct { Max schema.Float `json:"max"` // Maximum value in time range } -// NewCCMetricStore creates and initializes a new CCMetricStore client. +// NewCCMetricStore creates and initializes a new (external) CCMetricStore client. // The url parameter should include the protocol and port (e.g., "http://localhost:8080"). // The token parameter is a JWT used for Bearer authentication; pass empty string if auth is disabled. func NewCCMetricStore(url string, token string) *CCMetricStore { @@ -356,7 +356,7 @@ func (ccms *CCMetricStore) LoadData( if len(errors) != 0 { /* Returns list for "partial errors" */ - return jobData, fmt.Errorf("METRICDATA/CCMS > Errors: %s", strings.Join(errors, ", ")) + return jobData, fmt.Errorf("METRICDATA/EXTERNAL-CCMS > Errors: %s", strings.Join(errors, ", ")) } return jobData, nil } @@ -514,7 +514,7 @@ func (ccms *CCMetricStore) LoadScopedStats( if len(errors) != 0 { /* Returns list for "partial errors" */ - return scopedJobStats, fmt.Errorf("METRICDATA/CCMS > Errors: %s", strings.Join(errors, ", ")) + return scopedJobStats, fmt.Errorf("METRICDATA/EXTERNAL-CCMS > Errors: %s", strings.Join(errors, ", ")) } return scopedJobStats, nil } @@ -604,7 +604,7 @@ func (ccms *CCMetricStore) LoadNodeData( if len(errors) != 0 { /* Returns list of "partial errors" */ - return data, fmt.Errorf("METRICDATA/CCMS > Errors: %s", strings.Join(errors, ", ")) + return data, fmt.Errorf("METRICDATA/EXTERNAL-CCMS > Errors: %s", strings.Join(errors, ", ")) } return data, nil @@ -765,7 +765,7 @@ func (ccms *CCMetricStore) LoadNodeListData( if len(errors) != 0 { /* Returns list of "partial errors" */ - return data, fmt.Errorf("METRICDATA/CCMS > Errors: %s", strings.Join(errors, ", ")) + return data, fmt.Errorf("METRICDATA/EXTERNAL-CCMS > Errors: %s", strings.Join(errors, ", ")) } return data, nil diff --git a/pkg/metricstore/query.go b/pkg/metricstore/query.go index 0a61efaa..735c45d6 100644 --- a/pkg/metricstore/query.go +++ b/pkg/metricstore/query.go @@ -211,7 +211,7 @@ func (ccms *InternalMetricStore) LoadData( if len(errors) != 0 { /* Returns list for "partial errors" */ - return jobData, fmt.Errorf("METRICDATA/CCMS > Errors: %s", strings.Join(errors, ", ")) + return jobData, 
fmt.Errorf("METRICDATA/INTERNAL-CCMS > Errors: %s", strings.Join(errors, ", ")) } return jobData, nil } @@ -260,7 +260,7 @@ func buildQueries( resolution int64, ) ([]APIQuery, []schema.MetricScope, error) { if len(job.Resources) == 0 { - return nil, nil, fmt.Errorf("METRICDATA/CCMS > no resources allocated for job %d", job.JobID) + return nil, nil, fmt.Errorf("METRICDATA/INTERNAL-CCMS > no resources allocated for job %d", job.JobID) } queries := make([]APIQuery, 0, len(metrics)*len(scopes)*len(job.Resources)) @@ -531,7 +531,7 @@ func buildQueries( continue } - return nil, nil, fmt.Errorf("METRICDATA/CCMS > TODO: unhandled case: native-scope=%s, requested-scope=%s", nativeScope, requestedScope) + return nil, nil, fmt.Errorf("METRICDATA/INTERNAL-CCMS > TODO: unhandled case: native-scope=%s, requested-scope=%s", nativeScope, requestedScope) } } } @@ -719,7 +719,7 @@ func (ccms *InternalMetricStore) LoadScopedStats( if len(errors) != 0 { /* Returns list for "partial errors" */ - return scopedJobStats, fmt.Errorf("METRICDATA/CCMS > Errors: %s", strings.Join(errors, ", ")) + return scopedJobStats, fmt.Errorf("METRICDATA/INTERNAL-CCMS > Errors: %s", strings.Join(errors, ", ")) } return scopedJobStats, nil } @@ -824,7 +824,7 @@ func (ccms *InternalMetricStore) LoadNodeData( if len(errors) != 0 { /* Returns list of "partial errors" */ - return data, fmt.Errorf("METRICDATA/CCMS > Errors: %s", strings.Join(errors, ", ")) + return data, fmt.Errorf("METRICDATA/INTERNAL-CCMS > Errors: %s", strings.Join(errors, ", ")) } return data, nil @@ -994,7 +994,7 @@ func (ccms *InternalMetricStore) LoadNodeListData( if len(errors) != 0 { /* Returns list of "partial errors" */ - return data, fmt.Errorf("METRICDATA/CCMS > Errors: %s", strings.Join(errors, ", ")) + return data, fmt.Errorf("METRICDATA/INTERNAL-CCMS > Errors: %s", strings.Join(errors, ", ")) } return data, nil @@ -1313,7 +1313,7 @@ func buildNodeQueries( continue } - return nil, nil, fmt.Errorf("METRICDATA/CCMS > TODO: unhandled case: native-scope=%s, requested-scope=%s", nativeScope, requestedScope) + return nil, nil, fmt.Errorf("METRICDATA/INTERNAL-CCMS > TODO: unhandled case: native-scope=%s, requested-scope=%s", nativeScope, requestedScope) } } } From 32fd18543a150fb8ca9436091dfa37d97d0ffd10 Mon Sep 17 00:00:00 2001 From: Christoph Kluge Date: Mon, 2 Mar 2026 15:35:07 +0100 Subject: [PATCH 17/20] differentiate between expected and unexpected cases in external ccms queryBuilder --- .../metricstoreclient/cc-metric-store-queries.go | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/internal/metricstoreclient/cc-metric-store-queries.go b/internal/metricstoreclient/cc-metric-store-queries.go index 949efa10..7a04efc4 100644 --- a/internal/metricstoreclient/cc-metric-store-queries.go +++ b/internal/metricstoreclient/cc-metric-store-queries.go @@ -126,6 +126,7 @@ func (ccms *CCMetricStore) buildQueries( hwthreads = topology.Node } + // Note: Expected exceptions will return as empty slices -> Continue hostQueries, hostScopes := buildScopeQueries( nativeScope, requestedScope, remoteName, host.Hostname, @@ -133,8 +134,9 @@ func (ccms *CCMetricStore) buildQueries( resolution, ) - if len(hostQueries) == 0 && len(hostScopes) == 0 { - return nil, nil, fmt.Errorf("METRICDATA/INTERNAL-CCMS > TODO: unhandled case: native-scope=%s, requested-scope=%s", nativeScope, requestedScope) + // Note: Unexpected errors, such as unhandled cases, will return as nils -> Error + if hostQueries == nil && hostScopes == nil { + return nil, nil, 
fmt.Errorf("METRICDATA/EXTERNAL-CCMS > TODO: unhandled case: native-scope=%s, requested-scope=%s", nativeScope, requestedScope) } queries = append(queries, hostQueries...) @@ -269,6 +271,7 @@ func buildScopeQueries( // Accelerator -> Accelerator (Use "accelerator" scope if requested scope is lower than node) if nativeScope == schema.MetricScopeAccelerator && scope.LT(schema.MetricScopeNode) { if scope != schema.MetricScopeAccelerator { + // Expected Exception -> Continue -> Return Empty Slices return queries, scopes } @@ -287,6 +290,7 @@ func buildScopeQueries( // Accelerator -> Node if nativeScope == schema.MetricScopeAccelerator && scope == schema.MetricScopeNode { if len(accelerators) == 0 { + // Expected Exception -> Continue -> Return Empty Slices return queries, scopes } @@ -447,6 +451,7 @@ func buildScopeQueries( socketToDomains, err := topology.GetMemoryDomainsBySocket(memDomains) if err != nil { cclog.Errorf("Error mapping memory domains to sockets, return unchanged: %v", err) + // Rare Error Case -> Still Continue -> Return Empty Slices return queries, scopes } @@ -507,8 +512,8 @@ func buildScopeQueries( return queries, scopes } - // Unhandled case - return empty slices - return queries, scopes + // Unhandled Case -> Error -> Return nils + return nil, nil } // intToStringSlice converts a slice of integers to a slice of strings. From 38bb2dd4ec9953dadc271bcb30abd87374c1d601 Mon Sep 17 00:00:00 2001 From: Christoph Kluge Date: Mon, 2 Mar 2026 16:24:27 +0100 Subject: [PATCH 18/20] add out-of-index checks to external ccms codebase --- internal/metricstoreclient/cc-metric-store.go | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/internal/metricstoreclient/cc-metric-store.go b/internal/metricstoreclient/cc-metric-store.go index 39c028d5..e2a84466 100644 --- a/internal/metricstoreclient/cc-metric-store.go +++ b/internal/metricstoreclient/cc-metric-store.go @@ -393,6 +393,10 @@ func (ccms *CCMetricStore) LoadStats( stats := make(map[string]map[string]schema.MetricStatistics, len(metrics)) for i, res := range resBody.Results { + if len(res) == 0 { + // No Data Found For Metric, Logged in FetchData to Warn + continue + } query := req.Queries[i] metric := query.Metric data := res[0] @@ -562,6 +566,11 @@ func (ccms *CCMetricStore) LoadNodeData( var errors []string data := make(map[string]map[string][]*schema.JobMetric) for i, res := range resBody.Results { + if len(res) == 0 { + // No Data Found For Metric, Logged in FetchData to Warn + continue + } + var query APIQuery if resBody.Queries != nil { query = resBody.Queries[i] @@ -572,7 +581,6 @@ func (ccms *CCMetricStore) LoadNodeData( metric := query.Metric qdata := res[0] if qdata.Error != nil { - /* Build list for "partial errors", if any */ errors = append(errors, fmt.Sprintf("fetching %s for node %s failed: %s", metric, query.Hostname, *qdata.Error)) } From 22c442db5bfb6cb6d074797e105adb6813cef30d Mon Sep 17 00:00:00 2001 From: Jan Eitzinger Date: Mon, 2 Mar 2026 18:47:45 +0100 Subject: [PATCH 19/20] Enable entire integration --- .claude/settings.json | 84 +++++++++++++++++++++++++++++++++++++++++++ .entire/.gitignore | 4 +++ .entire/settings.json | 4 +++ 3 files changed, 92 insertions(+) create mode 100644 .claude/settings.json create mode 100644 .entire/.gitignore create mode 100644 .entire/settings.json diff --git a/.claude/settings.json b/.claude/settings.json new file mode 100644 index 00000000..5cfa5854 --- /dev/null +++ b/.claude/settings.json @@ -0,0 +1,84 @@ +{ + "hooks": { + "PostToolUse": [ + {
"matcher": "Task", + "hooks": [ + { + "type": "command", + "command": "entire hooks claude-code post-task" + } + ] + }, + { + "matcher": "TodoWrite", + "hooks": [ + { + "type": "command", + "command": "entire hooks claude-code post-todo" + } + ] + } + ], + "PreToolUse": [ + { + "matcher": "Task", + "hooks": [ + { + "type": "command", + "command": "entire hooks claude-code pre-task" + } + ] + } + ], + "SessionEnd": [ + { + "matcher": "", + "hooks": [ + { + "type": "command", + "command": "entire hooks claude-code session-end" + } + ] + } + ], + "SessionStart": [ + { + "matcher": "", + "hooks": [ + { + "type": "command", + "command": "entire hooks claude-code session-start" + } + ] + } + ], + "Stop": [ + { + "matcher": "", + "hooks": [ + { + "type": "command", + "command": "entire hooks claude-code stop" + } + ] + } + ], + "UserPromptSubmit": [ + { + "matcher": "", + "hooks": [ + { + "type": "command", + "command": "entire hooks claude-code user-prompt-submit" + } + ] + } + ] + }, + "permissions": { + "deny": [ + "Read(./.entire/metadata/**)" + ] + } +} diff --git a/.entire/.gitignore b/.entire/.gitignore new file mode 100644 index 00000000..2cffdefa --- /dev/null +++ b/.entire/.gitignore @@ -0,0 +1,4 @@ +tmp/ +settings.local.json +metadata/ +logs/ diff --git a/.entire/settings.json b/.entire/settings.json new file mode 100644 index 00000000..7cce5590 --- /dev/null +++ b/.entire/settings.json @@ -0,0 +1,4 @@ +{ + "enabled": true, + "telemetry": true +} From 15be664ad806a470d57e6675454338077bf937f0 Mon Sep 17 00:00:00 2001 From: Jan Eitzinger Date: Tue, 3 Mar 2026 06:58:03 +0100 Subject: [PATCH 20/20] Add entire gitignore --- .entire/.gitignore | 4 ++++ 1 file changed, 4 insertions(+) create mode 100644 .entire/.gitignore diff --git a/.entire/.gitignore b/.entire/.gitignore new file mode 100644 index 00000000..2cffdefa --- /dev/null +++ b/.entire/.gitignore @@ -0,0 +1,4 @@ +tmp/ +settings.local.json +metadata/ +logs/