From cc21e0e62cde3c628cd4b7529ae28ad4e27612ec Mon Sep 17 00:00:00 2001 From: Jan Eitzinger Date: Wed, 25 Feb 2026 07:38:19 +0100 Subject: [PATCH 01/20] Make json the default checkpoint format --- pkg/metricstore/config.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pkg/metricstore/config.go b/pkg/metricstore/config.go index 69ee3563..1efee61a 100644 --- a/pkg/metricstore/config.go +++ b/pkg/metricstore/config.go @@ -144,7 +144,7 @@ type MetricStoreConfig struct { // Accessed by Init(), Checkpointing(), and other lifecycle functions. var Keys MetricStoreConfig = MetricStoreConfig{ Checkpoints: Checkpoints{ - FileFormat: "avro", + FileFormat: "json", RootDir: "./var/checkpoints", }, Cleanup: &Cleanup{ From df3bc111a47043b4413a07254037f315a62b4a21 Mon Sep 17 00:00:00 2001 From: Christoph Kluge Date: Wed, 25 Feb 2026 13:23:44 +0100 Subject: [PATCH 02/20] sort healthTable onMount --- web/frontend/src/status/dashdetails/HealthDash.svelte | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/web/frontend/src/status/dashdetails/HealthDash.svelte b/web/frontend/src/status/dashdetails/HealthDash.svelte index 2730642b..aa6539ae 100644 --- a/web/frontend/src/status/dashdetails/HealthDash.svelte +++ b/web/frontend/src/status/dashdetails/HealthDash.svelte @@ -6,6 +6,7 @@ --> From 0a0db36433ea18ea0d3c4ddcf3d88fda3b566aa1 Mon Sep 17 00:00:00 2001 From: Christoph Kluge Date: Wed, 25 Feb 2026 19:12:18 +0100 Subject: [PATCH 03/20] load statusDetails GQL on tab change --- web/frontend/src/status/DashDetails.svelte | 15 +++++---- .../src/status/dashdetails/HealthDash.svelte | 19 ++++++------ .../status/dashdetails/StatisticsDash.svelte | 13 ++++---- .../src/status/dashdetails/StatusDash.svelte | 27 ++++++++-------- .../src/status/dashdetails/UsageDash.svelte | 31 ++++++++++--------- 5 files changed, 56 insertions(+), 49 deletions(-) diff --git a/web/frontend/src/status/DashDetails.svelte b/web/frontend/src/status/DashDetails.svelte index b46d0935..72c411b2 100644 --- a/web/frontend/src/status/DashDetails.svelte +++ b/web/frontend/src/status/DashDetails.svelte @@ -36,6 +36,9 @@ const { query: initq } = init(); const useCbColors = getContext("cc-config")?.plotConfiguration_colorblindMode || false + /* State Init */ + let activeTab = $state(""); + /* Derived */ const subClusters = $derived($initq?.data?.clusters?.find((c) => c.name == presetCluster)?.subClusters || []); @@ -63,22 +66,22 @@ {:else} - + (activeTab = e.detail)}> - + - + - + @@ -86,7 +89,7 @@ {#each subClusters.map(sc => sc.name) as scn} - + {/each} @@ -94,7 +97,7 @@ - + diff --git a/web/frontend/src/status/dashdetails/HealthDash.svelte b/web/frontend/src/status/dashdetails/HealthDash.svelte index aa6539ae..a30552b1 100644 --- a/web/frontend/src/status/dashdetails/HealthDash.svelte +++ b/web/frontend/src/status/dashdetails/HealthDash.svelte @@ -29,6 +29,7 @@ /* Svelte 5 Props */ let { presetCluster, + loadMe = false, } = $props(); /* Const Init */ @@ -55,7 +56,7 @@ /* Derived */ let cluster = $derived(presetCluster); - const statusQuery = $derived(queryStore({ + const statusQuery = $derived(loadMe ? queryStore({ client: client, query: gql` query ( @@ -85,7 +86,7 @@ sorting: querySorting, }, requestPolicy: "network-only" - })); + }) : null); let healthTableData = $derived.by(() => { if ($statusQuery?.data) { @@ -161,16 +162,16 @@
-{#if $statusQuery.fetching} +{#if $statusQuery?.fetching} -{:else if $statusQuery.error} +{:else if $statusQuery?.error} - Status Query (States): {$statusQuery.error.message} + Status Query (States): {$statusQuery?.error?.message} {:else if $statusQuery?.data?.nodeStates} @@ -264,19 +265,19 @@
-{#if $statusQuery.fetching} +{#if $statusQuery?.fetching} -{:else if $statusQuery.error} +{:else if $statusQuery?.error} - Status Query (Details): {$statusQuery.error.message} + Status Query (Details): {$statusQuery?.error?.message} -{:else if $statusQuery.data} +{:else if $statusQuery?.data} diff --git a/web/frontend/src/status/dashdetails/StatisticsDash.svelte b/web/frontend/src/status/dashdetails/StatisticsDash.svelte index 2cf8621e..d83adc15 100644 --- a/web/frontend/src/status/dashdetails/StatisticsDash.svelte +++ b/web/frontend/src/status/dashdetails/StatisticsDash.svelte @@ -30,7 +30,8 @@ /* Svelte 5 Props */ let { - presetCluster + presetCluster, + loadMe = false, } = $props(); /* Const Init */ @@ -49,7 +50,7 @@ : ccconfig['statusView_selectedHistograms'] || []); // Note: nodeMetrics are requested on configured $timestep resolution - const metricStatusQuery = $derived(queryStore({ + const metricStatusQuery = $derived(loadMe ? queryStore({ client: client, query: gql` query ( @@ -75,7 +76,7 @@ selectedHistograms: selectedHistograms }, requestPolicy: "network-only" - })); + }) : null); @@ -100,18 +101,18 @@ - {#if $metricStatusQuery.fetching} + {#if $metricStatusQuery?.fetching} - {:else if $metricStatusQuery.error} + {:else if $metricStatusQuery?.error} {$metricStatusQuery.error.message} {/if} -{#if $metricStatusQuery.data} +{#if $metricStatusQuery?.data} {#if selectedHistograms} diff --git a/web/frontend/src/status/dashdetails/StatusDash.svelte b/web/frontend/src/status/dashdetails/StatusDash.svelte index 8d108964..0c2626d0 100644 --- a/web/frontend/src/status/dashdetails/StatusDash.svelte +++ b/web/frontend/src/status/dashdetails/StatusDash.svelte @@ -32,6 +32,7 @@ let { clusters, presetCluster, + loadMe = false, } = $props(); /* Const Init */ @@ -59,7 +60,7 @@ /* Derived */ let cluster = $derived(presetCluster); // States for Stacked charts - const statesTimed = $derived(queryStore({ + const statesTimed = $derived(loadMe ? queryStore({ client: client, query: gql` query ($filter: [NodeFilter!], $typeNode: String!, $typeHealth: String!) { @@ -81,11 +82,11 @@ typeHealth: "health" }, requestPolicy: "network-only" - })); + }) : null); // Note: nodeMetrics are requested on configured $timestep resolution // Result: The latest 5 minutes (datapoints) for each node independent of job - const statusQuery = $derived(queryStore({ + const statusQuery = $derived(loadMe ? queryStore({ client: client, query: gql` query ( @@ -184,11 +185,11 @@ sorting: { field: "startTime", type: "col", order: "DESC" } }, requestPolicy: "network-only" - })); + }) : null); /* Effects */ $effect(() => { - if ($statusQuery.data) { + if ($statusQuery?.data) { let subClusters = clusters.find( (c) => c.name == cluster, ).subClusters; @@ -374,19 +375,19 @@
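+<!-- $statesTimed is null until this tab is first activated via loadMe, so the template below uses optional chaining throughout to guard the pre-activation state. -->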
-{#if $statesTimed.fetching} +{#if $statesTimed?.fetching} -{:else if $statesTimed.error} +{:else if $statesTimed?.error} - States Timed: {$statesTimed.error.message} + States Timed: {$statesTimed?.error?.message} -{:else if $statesTimed.data} +{:else if $statesTimed?.data}
@@ -427,19 +428,19 @@
-{#if $statusQuery.fetching} +{#if $statusQuery?.fetching} -{:else if $statusQuery.error} +{:else if $statusQuery?.error} - Status Query (Details): {$statusQuery.error.message} + Status Query (Details): {$statusQuery?.error?.message} -{:else if $statusQuery.data} +{:else if $statusQuery?.data} {#each clusters.find((c) => c.name == cluster).subClusters as subCluster, i} diff --git a/web/frontend/src/status/dashdetails/UsageDash.svelte b/web/frontend/src/status/dashdetails/UsageDash.svelte index 3fa197ae..2a9b3037 100644 --- a/web/frontend/src/status/dashdetails/UsageDash.svelte +++ b/web/frontend/src/status/dashdetails/UsageDash.svelte @@ -40,7 +40,8 @@ presetCluster, presetSubCluster = null, useCbColors = false, - useAltColors = false + useAltColors = false, + loadMe = false, } = $props(); /* Const Init */ @@ -62,7 +63,7 @@ ? [{ state: ["running"] }, { cluster: { eq: presetCluster} }, { subCluster: { eq: presetSubCluster } }] : [{ state: ["running"] }, { cluster: { eq: presetCluster} }] ); - const topJobsQuery = $derived(queryStore({ + const topJobsQuery = $derived(loadMe ? queryStore({ client: client, query: gql` query ( @@ -95,9 +96,9 @@ paging: pagingState // Top 10 }, requestPolicy: "network-only" - })); + }) : null); - const topNodesQuery = $derived(queryStore({ + const topNodesQuery = $derived(loadMe ? queryStore({ client: client, query: gql` query ( @@ -130,9 +131,9 @@ paging: pagingState }, requestPolicy: "network-only" - })); + }) : null); - const topAccsQuery = $derived(queryStore({ + const topAccsQuery = $derived(loadMe ? queryStore({ client: client, query: gql` query ( @@ -165,10 +166,10 @@ paging: pagingState }, requestPolicy: "network-only" - })); + }): null); // Note: nodeMetrics are requested on configured $timestep resolution - const nodeStatusQuery = $derived(queryStore({ + const nodeStatusQuery = $derived(loadMe ? queryStore({ client: client, query: gql` query ( @@ -198,7 +199,7 @@ numDurationBins: numDurationBins, }, requestPolicy: "network-only" - })); + }) : null); /* Functions */ function legendColors(targetIdx) { @@ -246,9 +247,9 @@
-{#if $topJobsQuery.fetching || $nodeStatusQuery.fetching} +{#if $topJobsQuery?.fetching || $nodeStatusQuery?.fetching} -{:else if $topJobsQuery.data && $nodeStatusQuery.data} +{:else if $topJobsQuery?.data && $nodeStatusQuery?.data} {#key $nodeStatusQuery.data.jobsStatistics[0].histDuration} @@ -354,9 +355,9 @@
-{#if $topNodesQuery.fetching || $nodeStatusQuery.fetching} +{#if $topNodesQuery?.fetching || $nodeStatusQuery?.fetching} -{:else if $topNodesQuery.data && $nodeStatusQuery.data} +{:else if $topNodesQuery?.data && $nodeStatusQuery?.data} -{#if $topAccsQuery.fetching || $nodeStatusQuery.fetching} +{#if $topAccsQuery?.fetching || $nodeStatusQuery?.fetching} -{:else if $topAccsQuery.data && $nodeStatusQuery.data} +{:else if $topAccsQuery?.data && $nodeStatusQuery?.data} Date: Thu, 26 Feb 2026 10:08:40 +0100 Subject: [PATCH 04/20] Introduce metric store binary checkpoints with write ahead log --- go.mod | 2 - go.sum | 11 - pkg/metricstore/avroCheckpoint.go | 481 ------------------ pkg/metricstore/avroHelper.go | 130 ----- pkg/metricstore/avroStruct.go | 167 ------- pkg/metricstore/checkpoint.go | 369 +++++--------- pkg/metricstore/config.go | 7 +- pkg/metricstore/configSchema.go | 2 +- pkg/metricstore/lineprotocol.go | 4 +- pkg/metricstore/metricstore.go | 22 +- pkg/metricstore/walCheckpoint.go | 787 ++++++++++++++++++++++++++++++ 11 files changed, 920 insertions(+), 1062 deletions(-) delete mode 100644 pkg/metricstore/avroCheckpoint.go delete mode 100644 pkg/metricstore/avroHelper.go delete mode 100644 pkg/metricstore/avroStruct.go create mode 100644 pkg/metricstore/walCheckpoint.go diff --git a/go.mod b/go.mod index e244062c..c561f627 100644 --- a/go.mod +++ b/go.mod @@ -28,7 +28,6 @@ require ( github.com/influxdata/line-protocol/v2 v2.2.1 github.com/jmoiron/sqlx v1.4.0 github.com/joho/godotenv v1.5.1 - github.com/linkedin/goavro/v2 v2.15.0 github.com/mattn/go-sqlite3 v1.14.34 github.com/parquet-go/parquet-go v0.27.0 github.com/qustavo/sqlhooks/v2 v2.1.0 @@ -80,7 +79,6 @@ require ( github.com/go-openapi/swag/yamlutils v0.25.4 // indirect github.com/go-viper/mapstructure/v2 v2.5.0 // indirect github.com/goccy/go-yaml v1.19.2 // indirect - github.com/golang/snappy v1.0.0 // indirect github.com/google/uuid v1.6.0 // indirect github.com/gorilla/securecookie v1.1.2 // indirect github.com/gorilla/websocket v1.5.3 // indirect diff --git a/go.sum b/go.sum index f2929454..5586b9c5 100644 --- a/go.sum +++ b/go.sum @@ -4,8 +4,6 @@ github.com/99designs/gqlgen v0.17.86 h1:C8N3UTa5heXX6twl+b0AJyGkTwYL6dNmFrgZNLRc github.com/99designs/gqlgen v0.17.86/go.mod h1:KTrPl+vHA1IUzNlh4EYkl7+tcErL3MgKnhHrBcV74Fw= github.com/Azure/go-ntlmssp v0.1.0 h1:DjFo6YtWzNqNvQdrwEyr/e4nhU3vRiwenz5QX7sFz+A= github.com/Azure/go-ntlmssp v0.1.0/go.mod h1:NYqdhxd/8aAct/s4qSYZEerdPuH1liG2/X9DiVTbhpk= -github.com/ClusterCockpit/cc-lib/v2 v2.5.1 h1:s6M9tyPDty+4zTdQGJYKpGJM9Nz7N6ITMdjPvNSLX5g= -github.com/ClusterCockpit/cc-lib/v2 v2.5.1/go.mod h1:DZ8OIHPUZJpWqErLITt0B8P6/Q7CBW2IQSQ5YiFFaG0= github.com/ClusterCockpit/cc-lib/v2 v2.6.0 h1:Q7zvRAVhfYA9PDB18pfY9A/6Ws4oWpnv8+P9MBRUDzg= github.com/ClusterCockpit/cc-lib/v2 v2.6.0/go.mod h1:DZ8OIHPUZJpWqErLITt0B8P6/Q7CBW2IQSQ5YiFFaG0= github.com/DATA-DOG/go-sqlmock v1.5.2 h1:OcvFkGmslmlZibjAjaHm3L//6LiuBgolP7OputlJIzU= @@ -151,9 +149,6 @@ github.com/golang-jwt/jwt/v5 v5.3.1 h1:kYf81DTWFe7t+1VvL7eS+jKFVWaUnK9cB1qbwn63Y github.com/golang-jwt/jwt/v5 v5.3.1/go.mod h1:fxCRLWMO43lRc8nhHWY6LGqRcf+1gQWArsqaEUEa5bE= github.com/golang-migrate/migrate/v4 v4.19.1 h1:OCyb44lFuQfYXYLx1SCxPZQGU7mcaZ7gH9yH4jSFbBA= github.com/golang-migrate/migrate/v4 v4.19.1/go.mod h1:CTcgfjxhaUtsLipnLoQRWCrjYXycRz/g5+RWDuYgPrE= -github.com/golang/snappy v0.0.1/go.mod h1:/XxbfmMg8lxefKM7IXC3fBNl/7bRcc72aCRzEWrmP2Q= -github.com/golang/snappy v1.0.0 h1:Oy607GVXHs7RtbggtPBnr2RmDArIsAefDwvrdWvRhGs= -github.com/golang/snappy 
v1.0.0/go.mod h1:/XxbfmMg8lxefKM7IXC3fBNl/7bRcc72aCRzEWrmP2Q= github.com/google/go-cmp v0.5.2/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= github.com/google/go-cmp v0.5.5/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= github.com/google/go-cmp v0.7.0 h1:wk8382ETsv4JYUZwIsn6YpYiWiBsYLSJiTsyBybVuN8= @@ -226,8 +221,6 @@ github.com/lann/ps v0.0.0-20150810152359-62de8c46ede0/go.mod h1:vmVJ0l/dxyfGW6Fm github.com/lib/pq v1.2.0/go.mod h1:5WUZQaWbwv1U+lTReE5YruASi9Al49XbQIvNi/34Woo= github.com/lib/pq v1.10.9 h1:YXG7RB+JIjhP29X+OtkiDnYaXQwpS4JEWq7dtCCRUEw= github.com/lib/pq v1.10.9/go.mod h1:AlVN5x4E4T544tWzH6hKfbfQvm3HdbOxrmggDNAPY9o= -github.com/linkedin/goavro/v2 v2.15.0 h1:pDj1UrjUOO62iXhgBiE7jQkpNIc5/tA5eZsgolMjgVI= -github.com/linkedin/goavro/v2 v2.15.0/go.mod h1:KXx+erlq+RPlGSPmLF7xGo6SAbh8sCQ53x064+ioxhk= github.com/mattn/go-sqlite3 v1.10.0/go.mod h1:FPy6KqzDD04eiIsT53CuJW3U88zkxoIYsOqkbpncsNc= github.com/mattn/go-sqlite3 v1.14.22/go.mod h1:Uh1q+B4BYcTPb+yiD3kU8Ct7aC0hY9fxUwlHK0RXw+Y= github.com/mattn/go-sqlite3 v1.14.34 h1:3NtcvcUnFBPsuRcno8pUtupspG/GM+9nZ88zgJcp6Zk= @@ -289,14 +282,11 @@ github.com/spkg/bom v0.0.0-20160624110644-59b7046e48ad/go.mod h1:qLr4V1qq6nMqFKk github.com/stmcginnis/gofish v0.21.1 h1:sutDvBhmLh4RDOZ1DN8GUyYRu7f1ggvKMMnSaiqhwn4= github.com/stmcginnis/gofish v0.21.1/go.mod h1:PzF5i8ecRG9A2ol8XT64npKUunyraJ+7t0kYMpQAtqU= github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= -github.com/stretchr/objx v0.4.0/go.mod h1:YvHI0jy2hoMjB+UWwv71VJQ9isScKT/TqJzVSSt89Yw= github.com/stretchr/objx v0.5.2 h1:xuMeJ0Sdp5ZMRXx/aWO6RZxdr3beISkG5/G/aIRr3pY= github.com/stretchr/objx v0.5.2/go.mod h1:FRsXN1f5AsAjCGJKqEizvkpNtU+EGNCLh3NxZ/8L+MA= github.com/stretchr/testify v1.2.2/go.mod h1:a8OnRcib4nhh0OaRAV+Yts87kKdq0PP7pXfy6kDkUVs= github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI= github.com/stretchr/testify v1.4.0/go.mod h1:j7eGeouHqKxXV5pUuKE4zz7dFj8WfuZ+81PSLYec5m4= -github.com/stretchr/testify v1.7.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= -github.com/stretchr/testify v1.7.5/go.mod h1:yNjHg4UonilssWZ8iaSj1OCr/vHnekPRkoO+kdMU+MU= github.com/stretchr/testify v1.11.1 h1:7s2iGBzp5EwR7/aIZr8ao5+dra3wiQyKjjFuvgVKu7U= github.com/stretchr/testify v1.11.1/go.mod h1:wZwfW3scLgRK+23gO65QZefKpKQRnfz6sD981Nm4B6U= github.com/swaggo/files v1.0.1 h1:J1bVJ4XHZNq0I46UU90611i9/YzdrF7x92oX1ig5IdE= @@ -378,7 +368,6 @@ gopkg.in/check.v1 v1.0.0-20200227125254-8fa46927fb4f/go.mod h1:Co6ibVJAznAaIkqp8 gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c h1:Hei/4ADfdWqJk1ZMxUNpqntNwaWcugrBjAiHlqqRiVk= gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c/go.mod h1:JHkPIbrfpd72SG/EVd6muEfDQjcINNoR0C8j2r3qZ4Q= gopkg.in/yaml.v2 v2.2.2/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= -gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= gopkg.in/yaml.v3 v3.0.0-20200615113413-eeeca48fe776/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= diff --git a/pkg/metricstore/avroCheckpoint.go b/pkg/metricstore/avroCheckpoint.go deleted file mode 100644 index 14898186..00000000 --- a/pkg/metricstore/avroCheckpoint.go +++ /dev/null @@ -1,481 +0,0 @@ -// Copyright (C) NHR@FAU, University Erlangen-Nuremberg. -// All rights reserved. This file is part of cc-backend. 
-// Use of this source code is governed by a MIT-style -// license that can be found in the LICENSE file. - -package metricstore - -import ( - "bufio" - "encoding/json" - "errors" - "fmt" - "os" - "path" - "sort" - "strconv" - "strings" - "sync" - "sync/atomic" - "time" - - cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger" - "github.com/ClusterCockpit/cc-lib/v2/schema" - "github.com/linkedin/goavro/v2" -) - -var ( - NumAvroWorkers int = DefaultAvroWorkers - startUp bool = true -) - -func (as *AvroStore) ToCheckpoint(dir string, dumpAll bool) (int, error) { - levels := make([]*AvroLevel, 0) - selectors := make([][]string, 0) - as.root.lock.RLock() - // Cluster - for sel1, l1 := range as.root.children { - l1.lock.RLock() - // Node - for sel2, l2 := range l1.children { - l2.lock.RLock() - // Frequency - for sel3, l3 := range l2.children { - levels = append(levels, l3) - selectors = append(selectors, []string{sel1, sel2, sel3}) - } - l2.lock.RUnlock() - } - l1.lock.RUnlock() - } - as.root.lock.RUnlock() - - type workItem struct { - level *AvroLevel - dir string - selector []string - } - - n, errs := int32(0), int32(0) - - var wg sync.WaitGroup - wg.Add(NumAvroWorkers) - work := make(chan workItem, NumAvroWorkers*2) - for range NumAvroWorkers { - go func() { - defer wg.Done() - - for workItem := range work { - from := getTimestamp(workItem.dir) - - if err := workItem.level.toCheckpoint(workItem.dir, from, dumpAll); err != nil { - if err == ErrNoNewArchiveData { - continue - } - - cclog.Errorf("error while checkpointing %#v: %s", workItem.selector, err.Error()) - atomic.AddInt32(&errs, 1) - } else { - atomic.AddInt32(&n, 1) - } - } - }() - } - - for i := range len(levels) { - dir := path.Join(dir, path.Join(selectors[i]...)) - work <- workItem{ - level: levels[i], - dir: dir, - selector: selectors[i], - } - } - - close(work) - wg.Wait() - - if errs > 0 { - return int(n), fmt.Errorf("%d errors happend while creating avro checkpoints (%d successes)", errs, n) - } - - startUp = false - - return int(n), nil -} - -// getTimestamp returns the timestamp from the directory name -func getTimestamp(dir string) int64 { - // Extract the resolution and timestamp from the directory name - // The existing avro file will be in epoch timestamp format - // iterate over all the files in the directory and find the maximum timestamp - // and return it - - resolution := path.Base(dir) - dir = path.Dir(dir) - - files, err := os.ReadDir(dir) - if err != nil { - return 0 - } - var maxTS int64 = 0 - - if len(files) == 0 { - return 0 - } - - for _, file := range files { - if file.IsDir() { - continue - } - name := file.Name() - - if len(name) < 5 || !strings.HasSuffix(name, ".avro") || !strings.HasPrefix(name, resolution+"_") { - continue - } - - ts, err := strconv.ParseInt(name[strings.Index(name, "_")+1:len(name)-5], 10, 64) - if err != nil { - fmt.Printf("error while parsing timestamp: %s\n", err.Error()) - continue - } - - if ts > maxTS { - maxTS = ts - } - } - - interval, _ := time.ParseDuration(Keys.Checkpoints.Interval) - updateTime := time.Unix(maxTS, 0).Add(interval).Add(time.Duration(CheckpointBufferMinutes-1) * time.Minute).Unix() - - if startUp { - return 0 - } - - if updateTime < time.Now().Unix() { - return 0 - } - - return maxTS -} - -func (l *AvroLevel) toCheckpoint(dir string, from int64, dumpAll bool) error { - l.lock.Lock() - defer l.lock.Unlock() - - // fmt.Printf("Checkpointing directory: %s\n", dir) - // filepath contains the resolution - intRes, _ := strconv.Atoi(path.Base(dir)) - - // find 
smallest overall timestamp in l.data map and delete it from l.data - minTS := int64(1<<63 - 1) - for ts, dat := range l.data { - if ts < minTS && len(dat) != 0 { - minTS = ts - } - } - - if from == 0 && minTS != int64(1<<63-1) { - from = minTS - } - - if from == 0 { - return ErrNoNewArchiveData - } - - var schema string - var codec *goavro.Codec - recordList := make([]map[string]any, 0) - - var f *os.File - - filePath := dir + fmt.Sprintf("_%d.avro", from) - - var err error - - fp_, err_ := os.Stat(filePath) - if errors.Is(err_, os.ErrNotExist) { - err = os.MkdirAll(path.Dir(dir), 0o755) - if err != nil { - return fmt.Errorf("failed to create directory: %v", err) - } - } else if fp_.Size() != 0 { - f, err = os.Open(filePath) - if err != nil { - return fmt.Errorf("failed to open existing avro file: %v", err) - } - defer f.Close() - - br := bufio.NewReader(f) - - reader, err := goavro.NewOCFReader(br) - if err != nil { - return fmt.Errorf("failed to create OCF reader: %v", err) - } - codec = reader.Codec() - schema = codec.Schema() - } - - timeRef := time.Now().Add(time.Duration(-CheckpointBufferMinutes+1) * time.Minute).Unix() - - if dumpAll { - timeRef = time.Now().Unix() - } - - // Empty values - if len(l.data) == 0 { - // we checkpoint avro files every 60 seconds - repeat := 60 / intRes - - for range repeat { - recordList = append(recordList, make(map[string]any)) - } - } - - readFlag := true - - for ts := range l.data { - flag := false - if ts < timeRef { - data := l.data[ts] - - schemaGen, err := generateSchema(data) - if err != nil { - return err - } - - flag, schema, err = compareSchema(schema, schemaGen) - if err != nil { - return fmt.Errorf("failed to compare read and generated schema: %v", err) - } - if flag && readFlag && !errors.Is(err_, os.ErrNotExist) { - // Use closure to ensure file is closed even on error - err := func() error { - f2, err := os.Open(filePath) - if err != nil { - return fmt.Errorf("failed to open Avro file: %v", err) - } - defer f2.Close() - - br := bufio.NewReader(f2) - - ocfReader, err := goavro.NewOCFReader(br) - if err != nil { - return fmt.Errorf("failed to create OCF reader while changing schema: %v", err) - } - - for ocfReader.Scan() { - record, err := ocfReader.Read() - if err != nil { - return fmt.Errorf("failed to read record: %v", err) - } - - recordList = append(recordList, record.(map[string]any)) - } - - return nil - }() - if err != nil { - return err - } - - err = os.Remove(filePath) - if err != nil { - return fmt.Errorf("failed to delete file: %v", err) - } - - readFlag = false - } - codec, err = goavro.NewCodec(schema) - if err != nil { - return fmt.Errorf("failed to create codec after merged schema: %v", err) - } - - recordList = append(recordList, generateRecord(data)) - delete(l.data, ts) - } - } - - if len(recordList) == 0 { - return ErrNoNewArchiveData - } - - f, err = os.OpenFile(filePath, os.O_CREATE|os.O_APPEND|os.O_RDWR, 0o644) - if err != nil { - return fmt.Errorf("failed to append new avro file: %v", err) - } - defer f.Close() - - // fmt.Printf("Codec : %#v\n", codec) - - writer, err := goavro.NewOCFWriter(goavro.OCFConfig{ - W: f, - Codec: codec, - CompressionName: goavro.CompressionDeflateLabel, - }) - if err != nil { - return fmt.Errorf("failed to create OCF writer: %v", err) - } - - // Append the new record - if err := writer.Append(recordList); err != nil { - return fmt.Errorf("failed to append record: %v", err) - } - - return nil -} - -func compareSchema(schemaRead, schemaGen string) (bool, string, error) { - var genSchema, 
readSchema AvroSchema - - if schemaRead == "" { - return false, schemaGen, nil - } - - // Unmarshal the schema strings into AvroSchema structs - if err := json.Unmarshal([]byte(schemaGen), &genSchema); err != nil { - return false, "", fmt.Errorf("failed to parse generated schema: %v", err) - } - if err := json.Unmarshal([]byte(schemaRead), &readSchema); err != nil { - return false, "", fmt.Errorf("failed to parse read schema: %v", err) - } - - sort.Slice(genSchema.Fields, func(i, j int) bool { - return genSchema.Fields[i].Name < genSchema.Fields[j].Name - }) - - sort.Slice(readSchema.Fields, func(i, j int) bool { - return readSchema.Fields[i].Name < readSchema.Fields[j].Name - }) - - // Check if schemas are identical - schemasEqual := true - if len(genSchema.Fields) <= len(readSchema.Fields) { - - for i := range genSchema.Fields { - if genSchema.Fields[i].Name != readSchema.Fields[i].Name { - schemasEqual = false - break - } - } - - // If schemas are identical, return the read schema - if schemasEqual { - return false, schemaRead, nil - } - } - - // Create a map to hold unique fields from both schemas - fieldMap := make(map[string]AvroField) - - // Add fields from the read schema - for _, field := range readSchema.Fields { - fieldMap[field.Name] = field - } - - // Add or update fields from the generated schema - for _, field := range genSchema.Fields { - fieldMap[field.Name] = field - } - - // Create a union schema by collecting fields from the map - var mergedFields []AvroField - for _, field := range fieldMap { - mergedFields = append(mergedFields, field) - } - - // Sort fields by name for consistency - sort.Slice(mergedFields, func(i, j int) bool { - return mergedFields[i].Name < mergedFields[j].Name - }) - - // Create the merged schema - mergedSchema := AvroSchema{ - Type: "record", - Name: genSchema.Name, - Fields: mergedFields, - } - - // Check if schemas are identical - schemasEqual = len(mergedSchema.Fields) == len(readSchema.Fields) - if schemasEqual { - for i := range mergedSchema.Fields { - if mergedSchema.Fields[i].Name != readSchema.Fields[i].Name { - schemasEqual = false - break - } - } - - if schemasEqual { - return false, schemaRead, nil - } - } - - // Marshal the merged schema back to JSON - mergedSchemaJSON, err := json.Marshal(mergedSchema) - if err != nil { - return false, "", fmt.Errorf("failed to marshal merged schema: %v", err) - } - - return true, string(mergedSchemaJSON), nil -} - -func generateSchema(data map[string]schema.Float) (string, error) { - // Define the Avro schema structure - schema := map[string]any{ - "type": "record", - "name": "DataRecord", - "fields": []map[string]any{}, - } - - fieldTracker := make(map[string]struct{}) - - for key := range data { - if _, exists := fieldTracker[key]; !exists { - key = correctKey(key) - - field := map[string]any{ - "name": key, - "type": "double", - "default": -1.0, - } - schema["fields"] = append(schema["fields"].([]map[string]any), field) - fieldTracker[key] = struct{}{} - } - } - - schemaString, err := json.Marshal(schema) - if err != nil { - return "", fmt.Errorf("failed to marshal schema: %v", err) - } - - return string(schemaString), nil -} - -func generateRecord(data map[string]schema.Float) map[string]any { - record := make(map[string]any) - - // Iterate through each map in data - for key, value := range data { - key = correctKey(key) - - // Set the value in the record - // avro only accepts basic types - record[key] = value.Double() - } - - return record -} - -func correctKey(key string) string { - key = 
strings.ReplaceAll(key, "_", "_0x5F_") - key = strings.ReplaceAll(key, ":", "_0x3A_") - key = strings.ReplaceAll(key, ".", "_0x2E_") - return key -} - -func ReplaceKey(key string) string { - key = strings.ReplaceAll(key, "_0x2E_", ".") - key = strings.ReplaceAll(key, "_0x3A_", ":") - key = strings.ReplaceAll(key, "_0x5F_", "_") - return key -} diff --git a/pkg/metricstore/avroHelper.go b/pkg/metricstore/avroHelper.go deleted file mode 100644 index f6bef36e..00000000 --- a/pkg/metricstore/avroHelper.go +++ /dev/null @@ -1,130 +0,0 @@ -// Copyright (C) NHR@FAU, University Erlangen-Nuremberg. -// All rights reserved. This file is part of cc-backend. -// Use of this source code is governed by a MIT-style -// license that can be found in the LICENSE file. - -package metricstore - -import ( - "context" - "slices" - "strconv" - "strings" - "sync" - - cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger" -) - -func DataStaging(wg *sync.WaitGroup, ctx context.Context) { - wg.Add(1) - go func() { - defer wg.Done() - - if Keys.Checkpoints.FileFormat == "json" { - return - } - - ms := GetMemoryStore() - var avroLevel *AvroLevel - oldSelector := make([]string, 0) - - for { - select { - case <-ctx.Done(): - // Drain any remaining messages in channel before exiting - for { - select { - case val, ok := <-LineProtocolMessages: - if !ok { - // Channel closed - return - } - // Process remaining message - freq, err := ms.GetMetricFrequency(val.MetricName) - if err != nil { - continue - } - - var metricName strings.Builder - for _, selectorName := range val.Selector { - metricName.WriteString(selectorName + SelectorDelimiter) - } - metricName.WriteString(val.MetricName) - - var selector []string - selector = append(selector, val.Cluster, val.Node, strconv.FormatInt(freq, 10)) - - if !stringSlicesEqual(oldSelector, selector) { - avroLevel = avroStore.root.findAvroLevelOrCreate(selector) - if avroLevel == nil { - cclog.Errorf("Error creating or finding the level with cluster : %s, node : %s, metric : %s\n", val.Cluster, val.Node, val.MetricName) - } - oldSelector = slices.Clone(selector) - } - - if avroLevel != nil { - avroLevel.addMetric(metricName.String(), val.Value, val.Timestamp, int(freq)) - } - default: - // No more messages, exit - return - } - } - case val, ok := <-LineProtocolMessages: - if !ok { - // Channel closed, exit gracefully - return - } - - // Fetch the frequency of the metric from the global configuration - freq, err := ms.GetMetricFrequency(val.MetricName) - if err != nil { - cclog.Errorf("Error fetching metric frequency: %s\n", err) - continue - } - - var metricName strings.Builder - - for _, selectorName := range val.Selector { - metricName.WriteString(selectorName + SelectorDelimiter) - } - - metricName.WriteString(val.MetricName) - - // Create a new selector for the Avro level - // The selector is a slice of strings that represents the path to the - // Avro level. It is created by appending the cluster, node, and metric - // name to the selector. 
- var selector []string - selector = append(selector, val.Cluster, val.Node, strconv.FormatInt(freq, 10)) - - if !stringSlicesEqual(oldSelector, selector) { - // Get the Avro level for the metric - avroLevel = avroStore.root.findAvroLevelOrCreate(selector) - - // If the Avro level is nil, create a new one - if avroLevel == nil { - cclog.Errorf("Error creating or finding the level with cluster : %s, node : %s, metric : %s\n", val.Cluster, val.Node, val.MetricName) - } - oldSelector = slices.Clone(selector) - } - - if avroLevel != nil { - avroLevel.addMetric(metricName.String(), val.Value, val.Timestamp, int(freq)) - } - } - } - }() -} - -func stringSlicesEqual(a, b []string) bool { - if len(a) != len(b) { - return false - } - for i := range a { - if a[i] != b[i] { - return false - } - } - return true -} diff --git a/pkg/metricstore/avroStruct.go b/pkg/metricstore/avroStruct.go deleted file mode 100644 index 78a8d137..00000000 --- a/pkg/metricstore/avroStruct.go +++ /dev/null @@ -1,167 +0,0 @@ -// Copyright (C) NHR@FAU, University Erlangen-Nuremberg. -// All rights reserved. This file is part of cc-backend. -// Use of this source code is governed by a MIT-style -// license that can be found in the LICENSE file. - -package metricstore - -import ( - "sync" - - "github.com/ClusterCockpit/cc-lib/v2/schema" -) - -var ( - LineProtocolMessages = make(chan *AvroStruct) - // SelectorDelimiter separates hierarchical selector components in metric names for Avro encoding - SelectorDelimiter = "_SEL_" -) - -var CheckpointBufferMinutes = DefaultCheckpointBufferMin - -type AvroStruct struct { - MetricName string - Cluster string - Node string - Selector []string - Value schema.Float - Timestamp int64 -} - -type AvroStore struct { - root AvroLevel -} - -var avroStore AvroStore - -type AvroLevel struct { - children map[string]*AvroLevel - data map[int64]map[string]schema.Float - lock sync.RWMutex -} - -type AvroField struct { - Name string `json:"name"` - Type any `json:"type"` - Default any `json:"default,omitempty"` -} - -type AvroSchema struct { - Type string `json:"type"` - Name string `json:"name"` - Fields []AvroField `json:"fields"` -} - -func (l *AvroLevel) findAvroLevelOrCreate(selector []string) *AvroLevel { - if len(selector) == 0 { - return l - } - - // Allow concurrent reads: - l.lock.RLock() - var child *AvroLevel - var ok bool - if l.children == nil { - // Children map needs to be created... - l.lock.RUnlock() - } else { - child, ok := l.children[selector[0]] - l.lock.RUnlock() - if ok { - return child.findAvroLevelOrCreate(selector[1:]) - } - } - - // The level does not exist, take write lock for unique access: - l.lock.Lock() - // While this thread waited for the write lock, another thread - // could have created the child node. 
- if l.children != nil { - child, ok = l.children[selector[0]] - if ok { - l.lock.Unlock() - return child.findAvroLevelOrCreate(selector[1:]) - } - } - - child = &AvroLevel{ - data: make(map[int64]map[string]schema.Float, 0), - children: nil, - } - - if l.children != nil { - l.children[selector[0]] = child - } else { - l.children = map[string]*AvroLevel{selector[0]: child} - } - l.lock.Unlock() - return child.findAvroLevelOrCreate(selector[1:]) -} - -func (l *AvroLevel) addMetric(metricName string, value schema.Float, timestamp int64, Freq int) { - l.lock.Lock() - defer l.lock.Unlock() - - KeyCounter := int(CheckpointBufferMinutes * 60 / Freq) - - // Create keys in advance for the given amount of time - if len(l.data) != KeyCounter { - if len(l.data) == 0 { - for i := range KeyCounter { - l.data[timestamp+int64(i*Freq)] = make(map[string]schema.Float, 0) - } - } else { - // Get the last timestamp - var lastTS int64 - for ts := range l.data { - if ts > lastTS { - lastTS = ts - } - } - // Create keys for the next KeyCounter timestamps - l.data[lastTS+int64(Freq)] = make(map[string]schema.Float, 0) - } - } - - closestTS := int64(0) - minDiff := int64(Freq) + 1 // Start with diff just outside the valid range - found := false - - // Iterate over timestamps and choose the one which is within range. - // Since its epoch time, we check if the difference is less than 60 seconds. - for ts, dat := range l.data { - // Check if timestamp is within range - diff := timestamp - ts - if diff < -int64(Freq) || diff > int64(Freq) { - continue - } - - // Metric already present at this timestamp — skip - if _, ok := dat[metricName]; ok { - continue - } - - // Check if this is the closest timestamp so far - if Abs(diff) < minDiff { - minDiff = Abs(diff) - closestTS = ts - found = true - } - } - - if found { - l.data[closestTS][metricName] = value - } -} - -func GetAvroStore() *AvroStore { - return &avroStore -} - -// Abs returns the absolute value of x. -func Abs(x int64) int64 { - if x < 0 { - return -x - } - return x -} diff --git a/pkg/metricstore/checkpoint.go b/pkg/metricstore/checkpoint.go index b4097ff2..590197e3 100644 --- a/pkg/metricstore/checkpoint.go +++ b/pkg/metricstore/checkpoint.go @@ -6,15 +6,15 @@ // This file implements checkpoint persistence for the in-memory metric store. // // Checkpoints enable graceful restarts by periodically saving in-memory metric -// data to disk in either JSON or Avro format. The checkpoint system: +// data to disk in JSON or binary format. 
The checkpoint system: // // Key Features: // - Periodic background checkpointing via the Checkpointing() worker -// - Two formats: JSON (human-readable) and Avro (compact, efficient) +// - Two format families: JSON (human-readable) and WAL+binary (compact, crash-safe) // - Parallel checkpoint creation and loading using worker pools -// - Hierarchical file organization: checkpoint_dir/cluster/host/timestamp.{json|avro} +// - Hierarchical file organization: checkpoint_dir/cluster/host/timestamp.{json|bin} +// - WAL file: checkpoint_dir/cluster/host/current.wal (append-only, per-entry) // - Only saves unarchived data (archived data is already persisted elsewhere) -// - Automatic format detection and fallback during loading // - GC optimization during loading to prevent excessive heap growth // // Checkpoint Workflow: @@ -27,8 +27,9 @@ // checkpoints/ // cluster1/ // host001/ -// 1234567890.json (timestamp = checkpoint start time) -// 1234567950.json +// 1234567890.json (JSON format: full subtree snapshot) +// 1234567890.bin (binary format: full subtree snapshot) +// current.wal (WAL format: append-only per-entry log) // host002/ // ... package metricstore @@ -52,7 +53,6 @@ import ( cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger" "github.com/ClusterCockpit/cc-lib/v2/schema" - "github.com/linkedin/goavro/v2" ) const ( @@ -86,47 +86,58 @@ var ( // Checkpointing starts a background worker that periodically saves metric data to disk. // -// The behavior depends on the configured file format: -// - JSON: Periodic checkpointing based on Keys.Checkpoints.Interval -// - Avro: Initial delay + periodic checkpointing at DefaultAvroCheckpointInterval -// -// The worker respects context cancellation and signals completion via the WaitGroup. +// Format behaviour: +// - "json": Periodic checkpointing based on Keys.Checkpoints.Interval +// - "wal": Periodic binary snapshots + WAL rotation at Keys.Checkpoints.Interval func Checkpointing(wg *sync.WaitGroup, ctx context.Context) { lastCheckpointMu.Lock() lastCheckpoint = time.Now() lastCheckpointMu.Unlock() - if Keys.Checkpoints.FileFormat == "json" { - ms := GetMemoryStore() + ms := GetMemoryStore() - wg.Add(1) - go func() { - defer wg.Done() - d, err := time.ParseDuration(Keys.Checkpoints.Interval) - if err != nil { - cclog.Fatalf("[METRICSTORE]> invalid checkpoint interval '%s': %s", Keys.Checkpoints.Interval, err.Error()) - } - if d <= 0 { - cclog.Warnf("[METRICSTORE]> checkpoint interval is zero or negative (%s), checkpointing disabled", d) + wg.Add(1) + go func() { + defer wg.Done() + + d, err := time.ParseDuration(Keys.Checkpoints.Interval) + if err != nil { + cclog.Fatalf("[METRICSTORE]> invalid checkpoint interval '%s': %s", Keys.Checkpoints.Interval, err.Error()) + } + if d <= 0 { + cclog.Warnf("[METRICSTORE]> checkpoint interval is zero or negative (%s), checkpointing disabled", d) + return + } + + ticker := time.NewTicker(d) + defer ticker.Stop() + + for { + select { + case <-ctx.Done(): return - } + case <-ticker.C: + lastCheckpointMu.Lock() + from := lastCheckpoint + lastCheckpointMu.Unlock() - ticker := time.NewTicker(d) - defer ticker.Stop() + now := time.Now() + cclog.Infof("[METRICSTORE]> start checkpointing (starting at %s)...", from.Format(time.RFC3339)) - for { - select { - case <-ctx.Done(): - return - case <-ticker.C: - lastCheckpointMu.Lock() - from := lastCheckpoint - lastCheckpointMu.Unlock() - - cclog.Infof("[METRICSTORE]> start checkpointing (starting at %s)...", from.Format(time.RFC3339)) - now := time.Now() - n, err := 
ms.ToCheckpoint(Keys.Checkpoints.RootDir, - from.Unix(), now.Unix()) + if Keys.Checkpoints.FileFormat == "wal" { + n, hostDirs, err := ms.ToCheckpointWAL(Keys.Checkpoints.RootDir, from.Unix(), now.Unix()) + if err != nil { + cclog.Errorf("[METRICSTORE]> binary checkpointing failed: %s", err.Error()) + } else { + cclog.Infof("[METRICSTORE]> done: %d binary snapshot files created", n) + lastCheckpointMu.Lock() + lastCheckpoint = now + lastCheckpointMu.Unlock() + // Rotate WAL files for successfully checkpointed hosts. + RotateWALFiles(hostDirs) + } + } else { + n, err := ms.ToCheckpoint(Keys.Checkpoints.RootDir, from.Unix(), now.Unix()) if err != nil { cclog.Errorf("[METRICSTORE]> checkpointing failed: %s", err.Error()) } else { @@ -137,32 +148,8 @@ func Checkpointing(wg *sync.WaitGroup, ctx context.Context) { } } } - }() - } else { - wg.Add(1) - go func() { - defer wg.Done() - - select { - case <-ctx.Done(): - return - case <-time.After(time.Duration(CheckpointBufferMinutes) * time.Minute): - GetAvroStore().ToCheckpoint(Keys.Checkpoints.RootDir, false) - } - - ticker := time.NewTicker(DefaultAvroCheckpointInterval) - defer ticker.Stop() - - for { - select { - case <-ctx.Done(): - return - case <-ticker.C: - GetAvroStore().ToCheckpoint(Keys.Checkpoints.RootDir, false) - } - } - }() - } + } + }() } // MarshalJSON provides optimized JSON encoding for CheckpointMetrics. @@ -190,7 +177,7 @@ func (cm *CheckpointMetrics) MarshalJSON() ([]byte, error) { return buf, nil } -// ToCheckpoint writes metric data to checkpoint files in parallel. +// ToCheckpoint writes metric data to checkpoint files in parallel (JSON format). // // Metrics at root and cluster levels are skipped. One file per host is created. // Uses worker pool (Keys.NumWorkers) for parallel processing. Only locks one host @@ -378,7 +365,6 @@ func enqueueCheckpointHosts(dir string, work chan<- [2]string) error { return err } - gcCounter := 0 for _, clusterDir := range clustersDir { if !clusterDir.IsDir() { return errors.New("[METRICSTORE]> expected only directories at first level of checkpoints/ directory") @@ -394,16 +380,6 @@ func enqueueCheckpointHosts(dir string, work chan<- [2]string) error { return errors.New("[METRICSTORE]> expected only directories at second level of checkpoints/ directory") } - gcCounter++ - // if gcCounter%GCTriggerInterval == 0 { - // Forcing garbage collection runs here regulary during the loading of checkpoints - // will decrease the total heap size after loading everything back to memory is done. - // While loading data, the heap will grow fast, so the GC target size will double - // almost always. By forcing GCs here, we can keep it growing more slowly so that - // at the end, less memory is wasted. - // runtime.GC() - // } - work <- [2]string{clusterDir.Name(), hostDir.Name()} } } @@ -413,8 +389,8 @@ func enqueueCheckpointHosts(dir string, work chan<- [2]string) error { // FromCheckpoint loads checkpoint files from disk into memory in parallel. // -// Uses worker pool to load cluster/host combinations. Periodically triggers GC -// to prevent excessive heap growth. Returns number of files loaded and any errors. +// Uses worker pool to load cluster/host combinations. Returns number of files +// loaded and any errors. 
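+// Missing or empty checkpoint directories are not treated as errors and
+// simply yield zero loaded files.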
func (m *MemoryStore) FromCheckpoint(dir string, from int64) (int, error) { var wg sync.WaitGroup work := make(chan [2]string, Keys.NumWorkers*4) @@ -452,13 +428,11 @@ func (m *MemoryStore) FromCheckpoint(dir string, from int64) (int, error) { // FromCheckpointFiles is the main entry point for loading checkpoints at startup. // -// Automatically detects checkpoint format (JSON vs Avro) and falls back if needed. // Creates checkpoint directory if it doesn't exist. This function must be called // before any writes or reads, and can only be called once. func (m *MemoryStore) FromCheckpointFiles(dir string, from int64) (int, error) { if _, err := os.Stat(dir); os.IsNotExist(err) { - // The directory does not exist, so create it using os.MkdirAll() - err := os.MkdirAll(dir, CheckpointDirPerms) // CheckpointDirPerms sets the permissions for the directory + err := os.MkdirAll(dir, CheckpointDirPerms) if err != nil { cclog.Fatalf("[METRICSTORE]> Error creating directory: %#v\n", err) } @@ -468,146 +442,6 @@ func (m *MemoryStore) FromCheckpointFiles(dir string, from int64) (int, error) { return m.FromCheckpoint(dir, from) } -func (l *Level) loadAvroFile(m *MemoryStore, f *os.File, from int64) error { - br := bufio.NewReader(f) - - fileName := f.Name()[strings.LastIndex(f.Name(), "/")+1:] - resolution, err := strconv.ParseInt(fileName[0:strings.Index(fileName, "_")], 10, 64) - if err != nil { - return fmt.Errorf("[METRICSTORE]> error while reading avro file (resolution parsing) : %s", err) - } - - fromTimestamp, err := strconv.ParseInt(fileName[strings.Index(fileName, "_")+1:len(fileName)-5], 10, 64) - - // Same logic according to lineprotocol - fromTimestamp -= (resolution / 2) - - if err != nil { - return fmt.Errorf("[METRICSTORE]> error converting timestamp from the avro file : %s", err) - } - - // fmt.Printf("File : %s with resolution : %d\n", fileName, resolution) - - var recordCounter int64 = 0 - - // Create a new OCF reader from the buffered reader - ocfReader, err := goavro.NewOCFReader(br) - if err != nil { - return fmt.Errorf("[METRICSTORE]> error creating OCF reader: %w", err) - } - - metricsData := make(map[string]schema.FloatArray) - - for ocfReader.Scan() { - datum, err := ocfReader.Read() - if err != nil { - return fmt.Errorf("[METRICSTORE]> error while reading avro file : %s", err) - } - - record, ok := datum.(map[string]any) - if !ok { - return fmt.Errorf("[METRICSTORE]> failed to assert datum as map[string]interface{}") - } - - for key, value := range record { - metricsData[key] = append(metricsData[key], schema.ConvertToFloat(value.(float64))) - } - - recordCounter += 1 - } - - to := (fromTimestamp + (recordCounter / (60 / resolution) * 60)) - if to < from { - return nil - } - - for key, floatArray := range metricsData { - metricName := ReplaceKey(key) - - if strings.Contains(metricName, SelectorDelimiter) { - subString := strings.Split(metricName, SelectorDelimiter) - - lvl := l - - for i := 0; i < len(subString)-1; i++ { - - sel := subString[i] - - if lvl.children == nil { - lvl.children = make(map[string]*Level) - } - - child, ok := lvl.children[sel] - if !ok { - child = &Level{ - metrics: make([]*buffer, len(m.Metrics)), - children: nil, - } - lvl.children[sel] = child - } - lvl = child - } - - leafMetricName := subString[len(subString)-1] - err = lvl.createBuffer(m, leafMetricName, floatArray, fromTimestamp, resolution) - if err != nil { - return fmt.Errorf("[METRICSTORE]> error while creating buffers from avroReader : %s", err) - } - } else { - err = l.createBuffer(m, 
metricName, floatArray, fromTimestamp, resolution) - if err != nil { - return fmt.Errorf("[METRICSTORE]> error while creating buffers from avroReader : %s", err) - } - } - - } - - return nil -} - -func (l *Level) createBuffer(m *MemoryStore, metricName string, floatArray schema.FloatArray, from int64, resolution int64) error { - n := len(floatArray) - b := &buffer{ - frequency: resolution, - start: from, - data: floatArray[0:n:n], - prev: nil, - next: nil, - archived: true, - } - - minfo, ok := m.Metrics[metricName] - if !ok { - return nil - } - - prev := l.metrics[minfo.offset] - if prev == nil { - l.metrics[minfo.offset] = b - } else { - if prev.start > b.start { - return fmt.Errorf("[METRICSTORE]> buffer start time %d is before previous buffer start %d", b.start, prev.start) - } - - b.prev = prev - prev.next = b - - missingCount := ((int(b.start) - int(prev.start)) - len(prev.data)*int(b.frequency)) - if missingCount > 0 { - missingCount /= int(b.frequency) - - for range missingCount { - prev.data = append(prev.data, schema.NaN) - } - - prev.data = prev.data[0:len(prev.data):len(prev.data)] - } - } - l.metrics[minfo.offset] = b - - return nil -} - func (l *Level) loadJSONFile(m *MemoryStore, f *os.File, from int64) error { br := bufio.NewReader(f) cf := &CheckpointFile{} @@ -679,37 +513,37 @@ func (l *Level) loadFile(cf *CheckpointFile, m *MemoryStore) error { return nil } +// fromCheckpoint loads all checkpoint files (JSON, binary snapshot, WAL) for a +// single host directory. Snapshot files are loaded first (sorted by timestamp), +// then current.wal is replayed on top. func (l *Level) fromCheckpoint(m *MemoryStore, dir string, from int64) (int, error) { direntries, err := os.ReadDir(dir) if err != nil { if os.IsNotExist(err) { return 0, nil } - return 0, err } allFiles := make([]fs.DirEntry, 0) + var walEntry fs.DirEntry filesLoaded := 0 + for _, e := range direntries { if e.IsDir() { - child := &Level{ - metrics: make([]*buffer, len(m.Metrics)), - children: make(map[string]*Level), - } - - files, err := child.fromCheckpoint(m, path.Join(dir, e.Name()), from) - filesLoaded += files - if err != nil { - return filesLoaded, err - } - - l.children[e.Name()] = child - } else if strings.HasSuffix(e.Name(), ".json") || strings.HasSuffix(e.Name(), ".avro") { - allFiles = append(allFiles, e) - } else { + // Legacy: skip subdirectories (only used by old Avro format). + // These are ignored; their data is not loaded. + cclog.Debugf("[METRICSTORE]> skipping subdirectory %s in checkpoint dir %s", e.Name(), dir) continue } + + name := e.Name() + if strings.HasSuffix(name, ".json") || strings.HasSuffix(name, ".bin") { + allFiles = append(allFiles, e) + } else if name == "current.wal" { + walEntry = e + } + // Silently ignore other files (e.g., .tmp, .bin.tmp from interrupted writes). 
} files, err := findFiles(allFiles, from, true) @@ -719,54 +553,81 @@ func (l *Level) fromCheckpoint(m *MemoryStore, dir string, from int64) (int, err loaders := map[string]func(*MemoryStore, *os.File, int64) error{ ".json": l.loadJSONFile, - ".avro": l.loadAvroFile, + ".bin": l.loadBinaryFile, } for _, filename := range files { ext := filepath.Ext(filename) loader := loaders[ext] if loader == nil { - cclog.Warnf("Unknown extension for file %s", filename) + cclog.Warnf("[METRICSTORE]> unknown extension for checkpoint file %s", filename) continue } - // Use a closure to ensure file is closed immediately after use err := func() error { f, err := os.Open(path.Join(dir, filename)) if err != nil { return err } defer f.Close() - return loader(m, f, from) }() if err != nil { return filesLoaded, err } + filesLoaded++ + } - filesLoaded += 1 + // Replay WAL after all snapshot files so it fills in data since the last snapshot. + if walEntry != nil { + err := func() error { + f, err := os.Open(path.Join(dir, walEntry.Name())) + if err != nil { + return err + } + defer f.Close() + return l.loadWALFile(m, f, from) + }() + if err != nil { + // WAL errors are non-fatal: the snapshot already loaded the bulk of data. + cclog.Warnf("[METRICSTORE]> WAL replay error for %s: %v (data since last snapshot may be missing)", dir, err) + } else { + filesLoaded++ + } } return filesLoaded, nil } -// This will probably get very slow over time! -// A solution could be some sort of an index file in which all other files -// and the timespan they contain is listed. -// NOTE: This now assumes that you have distinct timestamps for json and avro files -// Also, it assumes that the timestamps are not overlapping/self-modified. +// parseTimestampFromFilename extracts a Unix timestamp from a checkpoint filename. +// Supports ".json" (format: ".json") and ".bin" (format: ".bin"). +func parseTimestampFromFilename(name string) (int64, error) { + switch { + case strings.HasSuffix(name, ".json"): + return strconv.ParseInt(name[:len(name)-5], 10, 64) + case strings.HasSuffix(name, ".bin"): + return strconv.ParseInt(name[:len(name)-4], 10, 64) + default: + return 0, fmt.Errorf("unknown checkpoint extension for file %q", name) + } +} + +// findFiles returns filenames from direntries whose timestamps satisfy the filter. +// If findMoreRecentFiles is true, returns files with timestamps >= t (plus the +// last file before t if t falls between two files). 
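+//
+// For example, given (hypothetical) snapshots {100.bin, 200.bin, 300.bin} and
+// t = 250 with findMoreRecentFiles set, the result is {200.bin, 300.bin}:
+// 200.bin is kept because it may still cover samples taken shortly before t.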
func findFiles(direntries []fs.DirEntry, t int64, findMoreRecentFiles bool) ([]string, error) { nums := map[string]int64{} for _, e := range direntries { - if !strings.HasSuffix(e.Name(), ".json") && !strings.HasSuffix(e.Name(), ".avro") { + name := e.Name() + if !strings.HasSuffix(name, ".json") && !strings.HasSuffix(name, ".bin") { continue } - ts, err := strconv.ParseInt(e.Name()[strings.Index(e.Name(), "_")+1:len(e.Name())-5], 10, 64) + ts, err := parseTimestampFromFilename(name) if err != nil { return nil, err } - nums[e.Name()] = ts + nums[name] = ts } sort.Slice(direntries, func(i, j int) bool { @@ -783,16 +644,12 @@ func findFiles(direntries []fs.DirEntry, t int64, findMoreRecentFiles bool) ([]s for i, e := range direntries { ts1 := nums[e.Name()] - // Logic to look for files in forward or direction - // If logic: All files greater than or after - // the given timestamp will be selected - // Else If logic: All files less than or before - // the given timestamp will be selected if findMoreRecentFiles && t <= ts1 { filenames = append(filenames, e.Name()) } else if !findMoreRecentFiles && ts1 <= t && ts1 != 0 { filenames = append(filenames, e.Name()) } + if i == len(direntries)-1 { continue } diff --git a/pkg/metricstore/config.go b/pkg/metricstore/config.go index 1efee61a..53716967 100644 --- a/pkg/metricstore/config.go +++ b/pkg/metricstore/config.go @@ -14,7 +14,7 @@ // ├─ RetentionInMemory: How long to keep data in RAM // ├─ MemoryCap: Memory limit in bytes (triggers forceFree) // ├─ Checkpoints: Persistence configuration -// │ ├─ FileFormat: "avro" or "json" +// │ ├─ FileFormat: "json" or "wal" // │ ├─ Interval: How often to save (e.g., "1h") // │ └─ RootDir: Checkpoint storage path // ├─ Cleanup: Long-term storage configuration @@ -55,16 +55,13 @@ const ( DefaultMaxWorkers = 10 DefaultBufferCapacity = 512 DefaultGCTriggerInterval = 100 - DefaultAvroWorkers = 4 - DefaultCheckpointBufferMin = 3 - DefaultAvroCheckpointInterval = time.Minute DefaultMemoryUsageTrackerInterval = 1 * time.Hour ) // Checkpoints configures periodic persistence of in-memory metric data. // // Fields: -// - FileFormat: "avro" (default, binary, compact) or "json" (human-readable, slower) +// - FileFormat: "json" (human-readable, periodic) or "wal" (binary snapshot + WAL, crash-safe) // - Interval: Duration string (e.g., "1h", "30m") between checkpoint saves // - RootDir: Filesystem path for checkpoint files (created if missing) type Checkpoints struct { diff --git a/pkg/metricstore/configSchema.go b/pkg/metricstore/configSchema.go index 6a748be0..67f30976 100644 --- a/pkg/metricstore/configSchema.go +++ b/pkg/metricstore/configSchema.go @@ -18,7 +18,7 @@ const configSchema = `{ "type": "object", "properties": { "file-format": { - "description": "Specify the format for checkpoint files. There are 2 variants: 'avro' and 'json'. If nothing is specified, 'avro' is default.", + "description": "Specify the format for checkpoint files. Two variants: 'json' (human-readable, periodic) and 'wal' (binary snapshot + Write-Ahead Log, crash-safe). 
Default is 'json'.", "type": "string" }, "interval": { diff --git a/pkg/metricstore/lineprotocol.go b/pkg/metricstore/lineprotocol.go index bfbbef2d..1e04bba0 100644 --- a/pkg/metricstore/lineprotocol.go +++ b/pkg/metricstore/lineprotocol.go @@ -244,8 +244,8 @@ func DecodeLine(dec *lineprotocol.Decoder, time := t.Unix() - if Keys.Checkpoints.FileFormat != "json" { - LineProtocolMessages <- &AvroStruct{ + if Keys.Checkpoints.FileFormat == "wal" { + WALMessages <- &WALMessage{ MetricName: string(metricBuf), Cluster: cluster, Node: host, diff --git a/pkg/metricstore/metricstore.go b/pkg/metricstore/metricstore.go index 789c6d07..3fe64d55 100644 --- a/pkg/metricstore/metricstore.go +++ b/pkg/metricstore/metricstore.go @@ -172,7 +172,7 @@ func Init(rawConfig json.RawMessage, metrics map[string]MetricConfig, wg *sync.W Retention(wg, ctx) Checkpointing(wg, ctx) CleanUp(wg, ctx) - DataStaging(wg, ctx) + WALStaging(wg, ctx) MemoryUsageTracker(wg, ctx) // Note: Signal handling has been removed from this function. @@ -264,7 +264,7 @@ func (ms *MemoryStore) SetNodeProvider(provider NodeProvider) { // // The function will: // 1. Cancel the context to stop all background workers -// 2. Close NATS message channels if using Avro format +// 2. Close the WAL messages channel if using WAL format // 3. Write a final checkpoint to preserve in-memory data // 4. Log any errors encountered during shutdown // @@ -276,8 +276,8 @@ func Shutdown() { shutdownFunc() } - if Keys.Checkpoints.FileFormat != "json" { - close(LineProtocolMessages) + if Keys.Checkpoints.FileFormat == "wal" { + close(WALMessages) } cclog.Infof("[METRICSTORE]> Writing to '%s'...\n", Keys.Checkpoints.RootDir) @@ -286,10 +286,18 @@ func Shutdown() { ms := GetMemoryStore() - if Keys.Checkpoints.FileFormat == "json" { - files, err = ms.ToCheckpoint(Keys.Checkpoints.RootDir, lastCheckpoint.Unix(), time.Now().Unix()) + lastCheckpointMu.Lock() + from := lastCheckpoint + lastCheckpointMu.Unlock() + + if Keys.Checkpoints.FileFormat == "wal" { + var hostDirs []string + files, hostDirs, err = ms.ToCheckpointWAL(Keys.Checkpoints.RootDir, from.Unix(), time.Now().Unix()) + if err == nil { + RotateWALFiles(hostDirs) + } } else { - files, err = GetAvroStore().ToCheckpoint(Keys.Checkpoints.RootDir, true) + files, err = ms.ToCheckpoint(Keys.Checkpoints.RootDir, from.Unix(), time.Now().Unix()) } if err != nil { diff --git a/pkg/metricstore/walCheckpoint.go b/pkg/metricstore/walCheckpoint.go new file mode 100644 index 00000000..e8a71ce2 --- /dev/null +++ b/pkg/metricstore/walCheckpoint.go @@ -0,0 +1,787 @@ +// Copyright (C) NHR@FAU, University Erlangen-Nuremberg. +// All rights reserved. This file is part of cc-backend. +// Use of this source code is governed by a MIT-style +// license that can be found in the LICENSE file. + +// Package metricstore provides walCheckpoint.go: WAL-based checkpoint implementation. +// +// This replaces the Avro shadow tree with an append-only Write-Ahead Log (WAL) +// per host, eliminating the extra memory overhead of the AvroStore and providing +// truly continuous (per-write) crash safety. +// +// # Architecture +// +// Metric write (DecodeLine) +// │ +// ├─► WriteToLevel() → main MemoryStore (unchanged) +// │ +// └─► WALMessages channel +// │ +// ▼ +// WALStaging goroutine +// │ +// ▼ +// checkpoints/cluster/host/current.wal (append-only, binary) +// +// Periodic checkpoint (Checkpointing goroutine): +// 1. Write .bin snapshot (column-oriented, from main tree) +// 2. 
Signal WALStaging to truncate current.wal per host +// +// On restart (FromCheckpoint): +// 1. Load most recent .bin snapshot +// 2. Replay current.wal (overwrite-safe: buffer.write handles duplicate timestamps) +// +// # WAL Record Format +// +// [4B magic 0xCC1DA7A1][4B payload_len][payload][4B CRC32] +// +// payload: +// [8B timestamp int64] +// [2B metric_name_len uint16][N metric name bytes] +// [1B selector_count uint8] +// per selector: [1B selector_len uint8][M selector bytes] +// [4B value float32 bits] +// +// # Binary Snapshot Format +// +// [4B magic 0xCC5B0001][8B from int64][8B to int64] +// Level tree (recursive): +// [4B num_metrics uint32] +// per metric: +// [2B name_len uint16][N name bytes] +// [8B frequency int64][8B start int64] +// [4B num_values uint32][num_values × 4B float32] +// [4B num_children uint32] +// per child: [2B name_len uint16][N name bytes] + Level (recursive) +package metricstore + +import ( + "bufio" + "context" + "encoding/binary" + "fmt" + "hash/crc32" + "io" + "math" + "os" + "path" + "sync" + "sync/atomic" + + cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger" + "github.com/ClusterCockpit/cc-lib/v2/schema" +) + +// Magic numbers for binary formats. +const ( + walFileMagic = uint32(0xCC1DA701) // WAL file header magic + walRecordMagic = uint32(0xCC1DA7A1) // WAL record magic + snapFileMagic = uint32(0xCC5B0001) // Binary snapshot magic +) + +// WALMessages is the channel for sending metric writes to the WAL staging goroutine. +// Buffered to allow burst writes without blocking the metric ingestion path. +var WALMessages = make(chan *WALMessage, 4096) + +// walRotateCh is used by the checkpoint goroutine to request WAL file rotation +// (close, delete, reopen) after a binary snapshot has been written. +var walRotateCh = make(chan walRotateReq, 256) + +// WALMessage represents a single metric write to be appended to the WAL. +// Cluster and Node are NOT stored in the WAL record (inferred from file path). +type WALMessage struct { + MetricName string + Cluster string + Node string + Selector []string + Value schema.Float + Timestamp int64 +} + +// walRotateReq requests WAL file rotation for a specific host directory. +// The done channel is closed by the WAL goroutine when rotation is complete. +type walRotateReq struct { + hostDir string + done chan struct{} +} + +// walFileState holds an open WAL file handle for one host directory. +type walFileState struct { + f *os.File +} + +// WALStaging starts a background goroutine that receives WALMessage items +// and appends binary WAL records to per-host current.wal files. +// Also handles WAL rotation requests from the checkpoint goroutine. +func WALStaging(wg *sync.WaitGroup, ctx context.Context) { + wg.Add(1) + go func() { + defer wg.Done() + + if Keys.Checkpoints.FileFormat == "json" { + return + } + + hostFiles := make(map[string]*walFileState) + + defer func() { + for _, ws := range hostFiles { + if ws.f != nil { + ws.f.Close() + } + } + }() + + getOrOpenWAL := func(hostDir string) *os.File { + ws, ok := hostFiles[hostDir] + if ok { + return ws.f + } + + if err := os.MkdirAll(hostDir, CheckpointDirPerms); err != nil { + cclog.Errorf("[METRICSTORE]> WAL: mkdir %s: %v", hostDir, err) + return nil + } + + walPath := path.Join(hostDir, "current.wal") + f, err := os.OpenFile(walPath, os.O_CREATE|os.O_APPEND|os.O_WRONLY, CheckpointFilePerms) + if err != nil { + cclog.Errorf("[METRICSTORE]> WAL: open %s: %v", walPath, err) + return nil + } + + // Write file header magic if file is new (empty). 
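+			// The header magic lets loadWALFile reject files that were not
+			// produced by this writer before any records are replayed.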
+ info, err := f.Stat() + if err == nil && info.Size() == 0 { + var hdr [4]byte + binary.LittleEndian.PutUint32(hdr[:], walFileMagic) + if _, err := f.Write(hdr[:]); err != nil { + cclog.Errorf("[METRICSTORE]> WAL: write header %s: %v", walPath, err) + f.Close() + return nil + } + } + + hostFiles[hostDir] = &walFileState{f: f} + return f + } + + processMsg := func(msg *WALMessage) { + hostDir := path.Join(Keys.Checkpoints.RootDir, msg.Cluster, msg.Node) + f := getOrOpenWAL(hostDir) + if f == nil { + return + } + if err := writeWALRecord(f, msg); err != nil { + cclog.Errorf("[METRICSTORE]> WAL: write record: %v", err) + } + } + + processRotate := func(req walRotateReq) { + ws, ok := hostFiles[req.hostDir] + if ok && ws.f != nil { + ws.f.Close() + walPath := path.Join(req.hostDir, "current.wal") + if err := os.Remove(walPath); err != nil && !os.IsNotExist(err) { + cclog.Errorf("[METRICSTORE]> WAL: remove %s: %v", walPath, err) + } + delete(hostFiles, req.hostDir) + } + close(req.done) + } + + drain := func() { + for { + select { + case msg, ok := <-WALMessages: + if !ok { + return + } + processMsg(msg) + case req := <-walRotateCh: + processRotate(req) + default: + return + } + } + } + + for { + select { + case <-ctx.Done(): + drain() + return + case msg, ok := <-WALMessages: + if !ok { + return + } + processMsg(msg) + case req := <-walRotateCh: + processRotate(req) + } + } + }() +} + +// RotateWALFiles sends rotation requests for the given host directories +// and blocks until all rotations complete. +func RotateWALFiles(hostDirs []string) { + dones := make([]chan struct{}, len(hostDirs)) + for i, dir := range hostDirs { + dones[i] = make(chan struct{}) + walRotateCh <- walRotateReq{hostDir: dir, done: dones[i]} + } + for _, done := range dones { + <-done + } +} + +// buildWALPayload encodes a WALMessage into a binary payload (without magic/length/CRC). +func buildWALPayload(msg *WALMessage) []byte { + size := 8 + 2 + len(msg.MetricName) + 1 + 4 + for _, s := range msg.Selector { + size += 1 + len(s) + } + + buf := make([]byte, 0, size) + + // Timestamp (8 bytes, little-endian int64) + var ts [8]byte + binary.LittleEndian.PutUint64(ts[:], uint64(msg.Timestamp)) + buf = append(buf, ts[:]...) + + // Metric name (2-byte length prefix + bytes) + var mLen [2]byte + binary.LittleEndian.PutUint16(mLen[:], uint16(len(msg.MetricName))) + buf = append(buf, mLen[:]...) + buf = append(buf, msg.MetricName...) + + // Selector count (1 byte) + buf = append(buf, byte(len(msg.Selector))) + + // Selectors (1-byte length prefix + bytes each) + for _, sel := range msg.Selector { + buf = append(buf, byte(len(sel))) + buf = append(buf, sel...) + } + + // Value (4 bytes, float32 bit representation) + var val [4]byte + binary.LittleEndian.PutUint32(val[:], math.Float32bits(float32(msg.Value))) + buf = append(buf, val[:]...) + + return buf +} + +// writeWALRecord appends a binary WAL record to the file. +// Format: [4B magic][4B payload_len][payload][4B CRC32] +func writeWALRecord(f *os.File, msg *WALMessage) error { + payload := buildWALPayload(msg) + crc := crc32.ChecksumIEEE(payload) + + record := make([]byte, 0, 4+4+len(payload)+4) + + var magic [4]byte + binary.LittleEndian.PutUint32(magic[:], walRecordMagic) + record = append(record, magic[:]...) + + var pLen [4]byte + binary.LittleEndian.PutUint32(pLen[:], uint32(len(payload))) + record = append(record, pLen[:]...) + + record = append(record, payload...) 
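+
+	// Note that the CRC covers only the payload: a write torn inside the
+	// payload is caught by the checksum on replay, while damage to the magic
+	// or length fields is caught by readWALRecord's other checks.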
+ + var crcBytes [4]byte + binary.LittleEndian.PutUint32(crcBytes[:], crc) + record = append(record, crcBytes[:]...) + + _, err := f.Write(record) + return err +} + +// readWALRecord reads one WAL record from the reader. +// Returns (nil, nil) on clean EOF. Returns error on data corruption. +// A CRC mismatch indicates a truncated trailing record (expected on crash). +func readWALRecord(r io.Reader) (*WALMessage, error) { + var magic uint32 + if err := binary.Read(r, binary.LittleEndian, &magic); err != nil { + if err == io.EOF { + return nil, nil // Clean EOF + } + return nil, fmt.Errorf("read record magic: %w", err) + } + + if magic != walRecordMagic { + return nil, fmt.Errorf("invalid record magic 0x%08X (expected 0x%08X)", magic, walRecordMagic) + } + + var payloadLen uint32 + if err := binary.Read(r, binary.LittleEndian, &payloadLen); err != nil { + return nil, fmt.Errorf("read payload length: %w", err) + } + + if payloadLen > 1<<20 { // 1 MB sanity limit + return nil, fmt.Errorf("record payload too large: %d bytes", payloadLen) + } + + payload := make([]byte, payloadLen) + if _, err := io.ReadFull(r, payload); err != nil { + return nil, fmt.Errorf("read payload: %w", err) + } + + var storedCRC uint32 + if err := binary.Read(r, binary.LittleEndian, &storedCRC); err != nil { + return nil, fmt.Errorf("read CRC: %w", err) + } + + if crc32.ChecksumIEEE(payload) != storedCRC { + return nil, fmt.Errorf("CRC mismatch (truncated write or corruption)") + } + + return parseWALPayload(payload) +} + +// parseWALPayload decodes a binary payload into a WALMessage. +func parseWALPayload(payload []byte) (*WALMessage, error) { + if len(payload) < 8+2+1+4 { + return nil, fmt.Errorf("payload too short: %d bytes", len(payload)) + } + + offset := 0 + + // Timestamp (8 bytes) + ts := int64(binary.LittleEndian.Uint64(payload[offset : offset+8])) + offset += 8 + + // Metric name (2-byte length + bytes) + if offset+2 > len(payload) { + return nil, fmt.Errorf("metric name length overflows payload") + } + mLen := int(binary.LittleEndian.Uint16(payload[offset : offset+2])) + offset += 2 + + if offset+mLen > len(payload) { + return nil, fmt.Errorf("metric name overflows payload") + } + metricName := string(payload[offset : offset+mLen]) + offset += mLen + + // Selector count (1 byte) + if offset >= len(payload) { + return nil, fmt.Errorf("selector count overflows payload") + } + selCount := int(payload[offset]) + offset++ + + selectors := make([]string, selCount) + for i := range selCount { + if offset >= len(payload) { + return nil, fmt.Errorf("selector[%d] length overflows payload", i) + } + sLen := int(payload[offset]) + offset++ + + if offset+sLen > len(payload) { + return nil, fmt.Errorf("selector[%d] data overflows payload", i) + } + selectors[i] = string(payload[offset : offset+sLen]) + offset += sLen + } + + // Value (4 bytes, float32 bits) + if offset+4 > len(payload) { + return nil, fmt.Errorf("value overflows payload") + } + bits := binary.LittleEndian.Uint32(payload[offset : offset+4]) + value := schema.Float(math.Float32frombits(bits)) + + return &WALMessage{ + MetricName: metricName, + Timestamp: ts, + Selector: selectors, + Value: value, + }, nil +} + +// loadWALFile reads a WAL file and replays all valid records into the Level tree. +// l is the host-level node. Corrupt or partial trailing records are silently skipped +// (expected on crash). Records older than 'from' are skipped. 
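+//
+// Recovery sketch, mirroring the restart flow described in the file header
+// (snapshot first, then WAL replay; variable names are illustrative):
+//
+//	lvl.loadBinaryFile(m, snapFile, from) // 1. restore latest .bin snapshot
+//	lvl.loadWALFile(m, walFile, from)     // 2. replay writes made since then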
+func (l *Level) loadWALFile(m *MemoryStore, f *os.File, from int64) error { + br := bufio.NewReader(f) + + // Verify file header magic. + var fileMagic uint32 + if err := binary.Read(br, binary.LittleEndian, &fileMagic); err != nil { + if err == io.EOF { + return nil // Empty file, no data + } + return fmt.Errorf("[METRICSTORE]> WAL: read file header: %w", err) + } + + if fileMagic != walFileMagic { + return fmt.Errorf("[METRICSTORE]> WAL: invalid file magic 0x%08X (expected 0x%08X)", fileMagic, walFileMagic) + } + + // Cache level lookups to avoid repeated tree traversal. + lvlCache := make(map[string]*Level) + + for { + msg, err := readWALRecord(br) + if err != nil { + // Truncated trailing record is expected after a crash; stop replaying. + cclog.Debugf("[METRICSTORE]> WAL: stopping replay at corrupted/partial record: %v", err) + break + } + if msg == nil { + break // Clean EOF + } + + if msg.Timestamp < from { + continue // Older than retention window + } + + minfo, ok := m.Metrics[msg.MetricName] + if !ok { + continue // Unknown metric (config may have changed) + } + + // Cache key is the null-separated selector path. + cacheKey := joinSelector(msg.Selector) + lvl, ok := lvlCache[cacheKey] + if !ok { + lvl = l.findLevelOrCreate(msg.Selector, len(m.Metrics)) + lvlCache[cacheKey] = lvl + } + + // Write directly to the buffer, same as WriteToLevel but without the + // global level lookup (we already have the right level). + lvl.lock.Lock() + b := lvl.metrics[minfo.offset] + if b == nil { + b = newBuffer(msg.Timestamp, minfo.Frequency) + lvl.metrics[minfo.offset] = b + } + nb, writeErr := b.write(msg.Timestamp, msg.Value) + if writeErr == nil && b != nb { + lvl.metrics[minfo.offset] = nb + } + // Ignore write errors for timestamps before buffer start (can happen when + // replaying WAL entries that predate a loaded snapshot's start time). + lvl.lock.Unlock() + } + + return nil +} + +// joinSelector builds a cache key from a selector slice using null bytes as separators. +func joinSelector(sel []string) string { + if len(sel) == 0 { + return "" + } + result := sel[0] + for i := 1; i < len(sel); i++ { + result += "\x00" + sel[i] + } + return result +} + +// ToCheckpointWAL writes binary snapshot files for all hosts in parallel. +// Returns the number of files written, the list of host directories that were +// successfully checkpointed (for WAL rotation), and any errors. +func (m *MemoryStore) ToCheckpointWAL(dir string, from, to int64) (int, []string, error) { + // Collect all cluster/host pairs. 
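+	// Two read-locked passes: the first only counts hosts so the slices
+	// below can be allocated at their final size, the second collects the
+	// host-level nodes together with their cluster/host selectors.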
+ m.root.lock.RLock() + totalHosts := 0 + for _, l1 := range m.root.children { + l1.lock.RLock() + totalHosts += len(l1.children) + l1.lock.RUnlock() + } + m.root.lock.RUnlock() + + levels := make([]*Level, 0, totalHosts) + selectors := make([][]string, 0, totalHosts) + + m.root.lock.RLock() + for sel1, l1 := range m.root.children { + l1.lock.RLock() + for sel2, l2 := range l1.children { + levels = append(levels, l2) + selectors = append(selectors, []string{sel1, sel2}) + } + l1.lock.RUnlock() + } + m.root.lock.RUnlock() + + type workItem struct { + level *Level + hostDir string + selector []string + } + + n, errs := int32(0), int32(0) + var successDirs []string + var successMu sync.Mutex + + var wg sync.WaitGroup + wg.Add(Keys.NumWorkers) + work := make(chan workItem, Keys.NumWorkers*2) + + for range Keys.NumWorkers { + go func() { + defer wg.Done() + for wi := range work { + err := wi.level.toCheckpointBinary(wi.hostDir, from, to, m) + if err != nil { + if err == ErrNoNewArchiveData { + continue + } + cclog.Errorf("[METRICSTORE]> binary checkpoint error for %s: %v", wi.hostDir, err) + atomic.AddInt32(&errs, 1) + } else { + atomic.AddInt32(&n, 1) + successMu.Lock() + successDirs = append(successDirs, wi.hostDir) + successMu.Unlock() + } + } + }() + } + + for i := range levels { + hostDir := path.Join(dir, path.Join(selectors[i]...)) + work <- workItem{ + level: levels[i], + hostDir: hostDir, + selector: selectors[i], + } + } + close(work) + wg.Wait() + + if errs > 0 { + return int(n), successDirs, fmt.Errorf("[METRICSTORE]> %d errors during binary checkpoint (%d successes)", errs, n) + } + return int(n), successDirs, nil +} + +// toCheckpointBinary writes a binary snapshot file for a single host-level node. +// Uses atomic rename (write to .tmp then rename) to avoid partial reads on crash. +func (l *Level) toCheckpointBinary(dir string, from, to int64, m *MemoryStore) error { + cf, err := l.toCheckpointFile(from, to, m) + if err != nil { + return err + } + if cf == nil { + return ErrNoNewArchiveData + } + + if err := os.MkdirAll(dir, CheckpointDirPerms); err != nil { + return fmt.Errorf("mkdir %s: %w", dir, err) + } + + // Write to a temp file first, then rename (atomic on POSIX). + tmpPath := path.Join(dir, fmt.Sprintf("%d.bin.tmp", from)) + finalPath := path.Join(dir, fmt.Sprintf("%d.bin", from)) + + f, err := os.OpenFile(tmpPath, os.O_CREATE|os.O_WRONLY|os.O_TRUNC, CheckpointFilePerms) + if err != nil { + return fmt.Errorf("open binary snapshot %s: %w", tmpPath, err) + } + + bw := bufio.NewWriter(f) + if err := writeBinarySnapshotFile(bw, cf); err != nil { + f.Close() + os.Remove(tmpPath) + return fmt.Errorf("write binary snapshot: %w", err) + } + if err := bw.Flush(); err != nil { + f.Close() + os.Remove(tmpPath) + return err + } + f.Close() + + return os.Rename(tmpPath, finalPath) +} + +// writeBinarySnapshotFile writes the binary snapshot file header and level tree. +func writeBinarySnapshotFile(w io.Writer, cf *CheckpointFile) error { + if err := binary.Write(w, binary.LittleEndian, snapFileMagic); err != nil { + return err + } + if err := binary.Write(w, binary.LittleEndian, cf.From); err != nil { + return err + } + if err := binary.Write(w, binary.LittleEndian, cf.To); err != nil { + return err + } + return writeBinaryLevel(w, cf) +} + +// writeBinaryLevel recursively writes a CheckpointFile level in binary format. 
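+//
+// Size example: a level holding one metric "flops" with 3 values and no
+// children occupies 4 + (2+5) + 8 + 8 + 4 + 3*4 + 4 = 47 bytes.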
+func writeBinaryLevel(w io.Writer, cf *CheckpointFile) error { + if err := binary.Write(w, binary.LittleEndian, uint32(len(cf.Metrics))); err != nil { + return err + } + + for name, metric := range cf.Metrics { + if err := writeString16(w, name); err != nil { + return err + } + if err := binary.Write(w, binary.LittleEndian, metric.Frequency); err != nil { + return err + } + if err := binary.Write(w, binary.LittleEndian, metric.Start); err != nil { + return err + } + if err := binary.Write(w, binary.LittleEndian, uint32(len(metric.Data))); err != nil { + return err + } + for _, v := range metric.Data { + if err := binary.Write(w, binary.LittleEndian, math.Float32bits(float32(v))); err != nil { + return err + } + } + } + + if err := binary.Write(w, binary.LittleEndian, uint32(len(cf.Children))); err != nil { + return err + } + + for name, child := range cf.Children { + if err := writeString16(w, name); err != nil { + return err + } + if err := writeBinaryLevel(w, child); err != nil { + return err + } + } + + return nil +} + +// writeString16 writes a 2-byte length-prefixed string to w. +func writeString16(w io.Writer, s string) error { + if err := binary.Write(w, binary.LittleEndian, uint16(len(s))); err != nil { + return err + } + _, err := io.WriteString(w, s) + return err +} + +// loadBinaryFile reads a binary snapshot file and loads data into the Level tree. +// The retention check (from) is applied to the file's 'to' timestamp. +func (l *Level) loadBinaryFile(m *MemoryStore, f *os.File, from int64) error { + br := bufio.NewReader(f) + + var magic uint32 + if err := binary.Read(br, binary.LittleEndian, &magic); err != nil { + return fmt.Errorf("[METRICSTORE]> binary snapshot: read magic: %w", err) + } + if magic != snapFileMagic { + return fmt.Errorf("[METRICSTORE]> binary snapshot: invalid magic 0x%08X (expected 0x%08X)", magic, snapFileMagic) + } + + var fileFrom, fileTo int64 + if err := binary.Read(br, binary.LittleEndian, &fileFrom); err != nil { + return fmt.Errorf("[METRICSTORE]> binary snapshot: read from: %w", err) + } + if err := binary.Read(br, binary.LittleEndian, &fileTo); err != nil { + return fmt.Errorf("[METRICSTORE]> binary snapshot: read to: %w", err) + } + + if fileTo != 0 && fileTo < from { + return nil // File is older than retention window, skip it + } + + cf, err := readBinaryLevel(br) + if err != nil { + return fmt.Errorf("[METRICSTORE]> binary snapshot: read level tree: %w", err) + } + cf.From = fileFrom + cf.To = fileTo + + return l.loadFile(cf, m) +} + +// readBinaryLevel recursively reads a level from the binary snapshot format. 
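+// It is the exact inverse of writeBinaryLevel; malformed or truncated input
+// surfaces as an error from the failing read rather than a panic.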
+func readBinaryLevel(r io.Reader) (*CheckpointFile, error) {
+	cf := &CheckpointFile{
+		Metrics:  make(map[string]*CheckpointMetrics),
+		Children: make(map[string]*CheckpointFile),
+	}
+
+	var numMetrics uint32
+	if err := binary.Read(r, binary.LittleEndian, &numMetrics); err != nil {
+		return nil, fmt.Errorf("read num_metrics: %w", err)
+	}
+
+	for range numMetrics {
+		name, err := readString16(r)
+		if err != nil {
+			return nil, fmt.Errorf("read metric name: %w", err)
+		}
+
+		var freq, start int64
+		if err := binary.Read(r, binary.LittleEndian, &freq); err != nil {
+			return nil, fmt.Errorf("read frequency for %s: %w", name, err)
+		}
+		if err := binary.Read(r, binary.LittleEndian, &start); err != nil {
+			return nil, fmt.Errorf("read start for %s: %w", name, err)
+		}
+
+		var numValues uint32
+		if err := binary.Read(r, binary.LittleEndian, &numValues); err != nil {
+			return nil, fmt.Errorf("read num_values for %s: %w", name, err)
+		}
+
+		data := make([]schema.Float, numValues)
+		for i := range numValues {
+			var bits uint32
+			if err := binary.Read(r, binary.LittleEndian, &bits); err != nil {
+				return nil, fmt.Errorf("read value[%d] for %s: %w", i, name, err)
+			}
+			data[i] = schema.Float(math.Float32frombits(bits))
+		}
+
+		cf.Metrics[name] = &CheckpointMetrics{
+			Frequency: freq,
+			Start:     start,
+			Data:      data,
+		}
+	}
+
+	var numChildren uint32
+	if err := binary.Read(r, binary.LittleEndian, &numChildren); err != nil {
+		return nil, fmt.Errorf("read num_children: %w", err)
+	}
+
+	for range numChildren {
+		childName, err := readString16(r)
+		if err != nil {
+			return nil, fmt.Errorf("read child name: %w", err)
+		}
+
+		child, err := readBinaryLevel(r)
+		if err != nil {
+			return nil, fmt.Errorf("read child %s: %w", childName, err)
+		}
+		cf.Children[childName] = child
+	}
+
+	return cf, nil
+}
+
+// readString16 reads a 2-byte length-prefixed string from r.
+func readString16(r io.Reader) (string, error) {
+	var sLen uint16
+	if err := binary.Read(r, binary.LittleEndian, &sLen); err != nil {
+		return "", err
+	}
+	buf := make([]byte, sLen)
+	if _, err := io.ReadFull(r, buf); err != nil {
+		return "", err
+	}
+	return string(buf), nil
+}

From 348b6010e8eeeb4c80d9c1c4b5a1e737374dd229 Mon Sep 17 00:00:00 2001
From: Christoph Kluge
Date: Thu, 26 Feb 2026 15:09:01 +0100
Subject: [PATCH 05/20] fix typo preventing template condition from working

---
 internal/auth/auth.go    | 2 +-
 web/templates/login.tmpl | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/internal/auth/auth.go b/internal/auth/auth.go
index 9b1e2121..69f4f078 100644
--- a/internal/auth/auth.go
+++ b/internal/auth/auth.go
@@ -263,7 +263,7 @@ func GetAuthInstance() *Authentication {
 }
 
 // handleUserSync syncs or updates a user in the database based on configuration.
-// This is used for both JWT and OIDC authentication when syncUserOnLogin or updateUserOnLogin is enabled.
+// This is used for LDAP, JWT, and OIDC authentication when syncUserOnLogin or updateUserOnLogin is enabled.
 func handleUserSync(user *schema.User, syncUserOnLogin, updateUserOnLogin bool) {
 	r := repository.GetUserRepository()
 	dbUser, err := r.GetUser(user.Username)
diff --git a/web/templates/login.tmpl b/web/templates/login.tmpl
index cd139261..4c4d9be8 100644
--- a/web/templates/login.tmpl
+++ b/web/templates/login.tmpl
@@ -38,8 +38,8 @@
- {{- if .Infos.hasOpenIDConnect}} - OpenID Connect Login + {{if .Infos.hasOpenIDConnect}} + OpenID Connect Login {{end}} From 6ecb9349677d5d2a399aa26b86d12c987a7cb3ef Mon Sep 17 00:00:00 2001 From: Jan Eitzinger Date: Fri, 27 Feb 2026 08:55:33 +0100 Subject: [PATCH 06/20] Switch to CC line-protocol package. Update cc-lib. --- go.mod | 14 +++++----- go.sum | 48 +++++++++++---------------------- internal/api/metricstore.go | 2 +- internal/api/nats.go | 2 +- pkg/metricstore/lineprotocol.go | 2 +- 5 files changed, 26 insertions(+), 42 deletions(-) diff --git a/go.mod b/go.mod index e244062c..afc21f2a 100644 --- a/go.mod +++ b/go.mod @@ -9,7 +9,8 @@ tool ( require ( github.com/99designs/gqlgen v0.17.86 - github.com/ClusterCockpit/cc-lib/v2 v2.6.0 + github.com/ClusterCockpit/cc-lib/v2 v2.7.0 + github.com/ClusterCockpit/cc-line-protocol/v2 v2.4.0 github.com/Masterminds/squirrel v1.5.4 github.com/aws/aws-sdk-go-v2 v1.41.1 github.com/aws/aws-sdk-go-v2/config v1.32.8 @@ -25,7 +26,6 @@ require ( github.com/golang-migrate/migrate/v4 v4.19.1 github.com/google/gops v0.3.29 github.com/gorilla/sessions v1.4.0 - github.com/influxdata/line-protocol/v2 v2.2.1 github.com/jmoiron/sqlx v1.4.0 github.com/joho/godotenv v1.5.1 github.com/linkedin/goavro/v2 v2.15.0 @@ -92,10 +92,10 @@ require ( github.com/kr/pretty v0.3.1 // indirect github.com/lann/builder v0.0.0-20180802200727-47ae307949d0 // indirect github.com/lann/ps v0.0.0-20150810152359-62de8c46ede0 // indirect - github.com/nats-io/nats.go v1.48.0 // indirect + github.com/nats-io/nats.go v1.49.0 // indirect github.com/nats-io/nkeys v0.4.15 // indirect github.com/nats-io/nuid v1.0.1 // indirect - github.com/oapi-codegen/runtime v1.1.2 // indirect + github.com/oapi-codegen/runtime v1.2.0 // indirect github.com/parquet-go/bitpack v1.0.0 // indirect github.com/parquet-go/jsonlite v1.4.0 // indirect github.com/pierrec/lz4/v4 v4.1.25 // indirect @@ -104,7 +104,7 @@ require ( github.com/rogpeppe/go-internal v1.10.0 // indirect github.com/russross/blackfriday/v2 v2.1.0 // indirect github.com/sosodev/duration v1.3.1 // indirect - github.com/stmcginnis/gofish v0.21.1 // indirect + github.com/stmcginnis/gofish v0.21.3 // indirect github.com/stretchr/objx v0.5.2 // indirect github.com/swaggo/files v1.0.1 // indirect github.com/twpayne/go-geom v1.6.1 // indirect @@ -113,9 +113,9 @@ require ( github.com/xrash/smetrics v0.0.0-20250705151800-55b8f293f342 // indirect go.yaml.in/yaml/v2 v2.4.3 // indirect go.yaml.in/yaml/v3 v3.0.4 // indirect - golang.org/x/exp v0.0.0-20260212183809-81e46e3db34a // indirect + golang.org/x/exp v0.0.0-20260218203240-3dfff04db8fa // indirect golang.org/x/mod v0.33.0 // indirect - golang.org/x/net v0.50.0 // indirect + golang.org/x/net v0.51.0 // indirect golang.org/x/sync v0.19.0 // indirect golang.org/x/sys v0.41.0 // indirect golang.org/x/text v0.34.0 // indirect diff --git a/go.sum b/go.sum index f2929454..cedddd62 100644 --- a/go.sum +++ b/go.sum @@ -4,10 +4,10 @@ github.com/99designs/gqlgen v0.17.86 h1:C8N3UTa5heXX6twl+b0AJyGkTwYL6dNmFrgZNLRc github.com/99designs/gqlgen v0.17.86/go.mod h1:KTrPl+vHA1IUzNlh4EYkl7+tcErL3MgKnhHrBcV74Fw= github.com/Azure/go-ntlmssp v0.1.0 h1:DjFo6YtWzNqNvQdrwEyr/e4nhU3vRiwenz5QX7sFz+A= github.com/Azure/go-ntlmssp v0.1.0/go.mod h1:NYqdhxd/8aAct/s4qSYZEerdPuH1liG2/X9DiVTbhpk= -github.com/ClusterCockpit/cc-lib/v2 v2.5.1 h1:s6M9tyPDty+4zTdQGJYKpGJM9Nz7N6ITMdjPvNSLX5g= -github.com/ClusterCockpit/cc-lib/v2 v2.5.1/go.mod h1:DZ8OIHPUZJpWqErLITt0B8P6/Q7CBW2IQSQ5YiFFaG0= -github.com/ClusterCockpit/cc-lib/v2 v2.6.0 
h1:Q7zvRAVhfYA9PDB18pfY9A/6Ws4oWpnv8+P9MBRUDzg= -github.com/ClusterCockpit/cc-lib/v2 v2.6.0/go.mod h1:DZ8OIHPUZJpWqErLITt0B8P6/Q7CBW2IQSQ5YiFFaG0= +github.com/ClusterCockpit/cc-lib/v2 v2.7.0 h1:EMTShk6rMTR1wlfmQ8SVCawH1OdltUbD3kVQmaW+5pE= +github.com/ClusterCockpit/cc-lib/v2 v2.7.0/go.mod h1:0Etx8WMs0lYZ4tiOQizY18CQop+2i3WROvU9rMUxHA4= +github.com/ClusterCockpit/cc-line-protocol/v2 v2.4.0 h1:hIzxgTBWcmCIHtoDKDkSCsKCOCOwUC34sFsbD2wcW0Q= +github.com/ClusterCockpit/cc-line-protocol/v2 v2.4.0/go.mod h1:y42qUu+YFmu5fdNuUAS4VbbIKxVjxCvbVqFdpdh8ahY= github.com/DATA-DOG/go-sqlmock v1.5.2 h1:OcvFkGmslmlZibjAjaHm3L//6LiuBgolP7OputlJIzU= github.com/DATA-DOG/go-sqlmock v1.5.2/go.mod h1:88MAG/4G7SMwSE3CeA0ZKzrT5CiOU3OJ+JlNzwDqpNU= github.com/KyleBanks/depth v1.2.1 h1:5h8fQADFrWtarTdtDudMmGsC7GPbOAu6RVB3ffsVFHc= @@ -95,8 +95,6 @@ github.com/dgryski/trifles v0.0.0-20230903005119-f50d829f2e54 h1:SG7nF6SRlWhcT7c github.com/dgryski/trifles v0.0.0-20230903005119-f50d829f2e54/go.mod h1:if7Fbed8SFyPtHLHbg49SI7NAdJiC5WIA09pe59rfAA= github.com/expr-lang/expr v1.17.8 h1:W1loDTT+0PQf5YteHSTpju2qfUfNoBt4yw9+wOEU9VM= github.com/expr-lang/expr v1.17.8/go.mod h1:8/vRC7+7HBzESEqt5kKpYXxrxkr31SaO8r40VO/1IT4= -github.com/frankban/quicktest v1.11.0/go.mod h1:K+q6oSqb0W0Ininfk863uOk1lMy69l/P6txr3mVT54s= -github.com/frankban/quicktest v1.11.2/go.mod h1:K+q6oSqb0W0Ininfk863uOk1lMy69l/P6txr3mVT54s= github.com/frankban/quicktest v1.13.0 h1:yNZif1OkDfNoDfb9zZa9aXIpejNR4F23Wely0c+Qdqk= github.com/frankban/quicktest v1.13.0/go.mod h1:qLE0fzW0VuyUAJgPU19zByoIr0HtCHN/r/VLSOOIySU= github.com/fsnotify/fsnotify v1.9.0 h1:2Ml+OJNzbYCTzsxtv8vKSFD9PbJjmhYF14k/jKC7S9k= @@ -154,8 +152,6 @@ github.com/golang-migrate/migrate/v4 v4.19.1/go.mod h1:CTcgfjxhaUtsLipnLoQRWCrjY github.com/golang/snappy v0.0.1/go.mod h1:/XxbfmMg8lxefKM7IXC3fBNl/7bRcc72aCRzEWrmP2Q= github.com/golang/snappy v1.0.0 h1:Oy607GVXHs7RtbggtPBnr2RmDArIsAefDwvrdWvRhGs= github.com/golang/snappy v1.0.0/go.mod h1:/XxbfmMg8lxefKM7IXC3fBNl/7bRcc72aCRzEWrmP2Q= -github.com/google/go-cmp v0.5.2/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= -github.com/google/go-cmp v0.5.5/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= github.com/google/go-cmp v0.7.0 h1:wk8382ETsv4JYUZwIsn6YpYiWiBsYLSJiTsyBybVuN8= github.com/google/go-cmp v0.7.0/go.mod h1:pXiqmnSA92OHEEa9HXL2W4E7lf9JzCmGVUdgjX3N/iU= github.com/google/go-tpm v0.9.7 h1:u89J4tUUeDTlH8xxC3CTW7OHZjbjKoHdQ9W7gCUhtxA= @@ -184,13 +180,8 @@ github.com/influxdata/influxdb-client-go/v2 v2.14.0 h1:AjbBfJuq+QoaXNcrova8smSjw github.com/influxdata/influxdb-client-go/v2 v2.14.0/go.mod h1:Ahpm3QXKMJslpXl3IftVLVezreAUtBOTZssDrjZEFHI= github.com/influxdata/line-protocol v0.0.0-20210922203350-b1ad95c89adf h1:7JTmneyiNEwVBOHSjoMxiWAqB992atOeepeFYegn5RU= github.com/influxdata/line-protocol v0.0.0-20210922203350-b1ad95c89adf/go.mod h1:xaLFMmpvUxqXtVkUJfg9QmT88cDaCJ3ZKgdZ78oO8Qo= -github.com/influxdata/line-protocol-corpus v0.0.0-20210519164801-ca6fa5da0184/go.mod h1:03nmhxzZ7Xk2pdG+lmMd7mHDfeVOYFyhOgwO61qWU98= github.com/influxdata/line-protocol-corpus v0.0.0-20210922080147-aa28ccfb8937 h1:MHJNQ+p99hFATQm6ORoLmpUCF7ovjwEFshs/NHzAbig= github.com/influxdata/line-protocol-corpus v0.0.0-20210922080147-aa28ccfb8937/go.mod h1:BKR9c0uHSmRgM/se9JhFHtTT7JTO67X23MtKMHtZcpo= -github.com/influxdata/line-protocol/v2 v2.0.0-20210312151457-c52fdecb625a/go.mod h1:6+9Xt5Sq1rWx+glMgxhcg2c0DUaehK+5TDcPZ76GypY= -github.com/influxdata/line-protocol/v2 v2.1.0/go.mod h1:QKw43hdUBg3GTk2iC3iyCxksNj7PX9aUSeYOYE/ceHY= -github.com/influxdata/line-protocol/v2 v2.2.1 
h1:EAPkqJ9Km4uAxtMRgUubJyqAr6zgWM0dznKMLRauQRE= -github.com/influxdata/line-protocol/v2 v2.2.1/go.mod h1:DmB3Cnh+3oxmG6LOBIxce4oaL4CPj3OmMPgvauXh+tM= github.com/jcmturner/aescts/v2 v2.0.0 h1:9YKLH6ey7H4eDBXW8khjYslgyqG2xZikXP0EQFKrle8= github.com/jcmturner/aescts/v2 v2.0.0/go.mod h1:AiaICIRyfYg35RUkr8yESTqvSy7csK90qZ5xfvvsoNs= github.com/jcmturner/dnsutils/v2 v2.0.0 h1:lltnkeZGL0wILNvrNiVCR6Ro5PGU/SeBvVO/8c/iPbo= @@ -212,11 +203,8 @@ github.com/jonboulle/clockwork v0.5.0/go.mod h1:3mZlmanh0g2NDKO5TWZVJAfofYk64M7X github.com/juju/gnuflag v0.0.0-20171113085948-2ce1bb71843d/go.mod h1:2PavIy+JPciBPrBUjwbNvtwB6RQlve+hkpll6QSNmOE= github.com/klauspost/compress v1.18.4 h1:RPhnKRAQ4Fh8zU2FY/6ZFDwTVTxgJ/EMydqSTzE9a2c= github.com/klauspost/compress v1.18.4/go.mod h1:R0h/fSBs8DE4ENlcrlib3PsXS61voFxhIs2DeRhCvJ4= -github.com/kr/pretty v0.2.1/go.mod h1:ipq/a2n7PKx3OHsz4KJII5eveXtPO4qwEXGdVfWzfnI= github.com/kr/pretty v0.3.1 h1:flRD4NNwYAUpkphVc1HcthR4KEIFJ65n8Mw5qdRn3LE= github.com/kr/pretty v0.3.1/go.mod h1:hoEshYVHaxMs3cyo3Yncou5ZscifuDolrwPKZanG3xk= -github.com/kr/pty v1.1.1/go.mod h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ= -github.com/kr/text v0.1.0/go.mod h1:4Jbv+DJW3UT/LiOwJeYQe1efqtUx/iVham/4vfdArNI= github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY= github.com/kr/text v0.2.0/go.mod h1:eLer722TekiGuMkidMxC/pM04lWEeraHUUmBw8l2grE= github.com/lann/builder v0.0.0-20180802200727-47ae307949d0 h1:SOEGU9fKiNWd/HOJuq6+3iTQz8KNCLtVX6idSoTLdUw= @@ -240,15 +228,14 @@ github.com/nats-io/jwt/v2 v2.8.0 h1:K7uzyz50+yGZDO5o772eRE7atlcSEENpL7P+b74JV1g= github.com/nats-io/jwt/v2 v2.8.0/go.mod h1:me11pOkwObtcBNR8AiMrUbtVOUGkqYjMQZ6jnSdVUIA= github.com/nats-io/nats-server/v2 v2.12.3 h1:KRv+1n7lddMVgkJPQer+pt36TcO0ENxjilBmeWdjcHs= github.com/nats-io/nats-server/v2 v2.12.3/go.mod h1:MQXjG9WjyXKz9koWzUc3jYUMKD8x3CLmTNy91IQQz3Y= -github.com/nats-io/nats.go v1.48.0 h1:pSFyXApG+yWU/TgbKCjmm5K4wrHu86231/w84qRVR+U= -github.com/nats-io/nats.go v1.48.0/go.mod h1:iRWIPokVIFbVijxuMQq4y9ttaBTMe0SFdlZfMDd+33g= +github.com/nats-io/nats.go v1.49.0 h1:yh/WvY59gXqYpgl33ZI+XoVPKyut/IcEaqtsiuTJpoE= +github.com/nats-io/nats.go v1.49.0/go.mod h1:fDCn3mN5cY8HooHwE2ukiLb4p4G4ImmzvXyJt+tGwdw= github.com/nats-io/nkeys v0.4.15 h1:JACV5jRVO9V856KOapQ7x+EY8Jo3qw1vJt/9Jpwzkk4= github.com/nats-io/nkeys v0.4.15/go.mod h1:CpMchTXC9fxA5zrMo4KpySxNjiDVvr8ANOSZdiNfUrs= github.com/nats-io/nuid v1.0.1 h1:5iA8DT8V7q8WK2EScv2padNa/rTESc1KdnPw4TC2paw= github.com/nats-io/nuid v1.0.1/go.mod h1:19wcPz3Ph3q0Jbyiqsd0kePYG7A95tJPxeL+1OSON2c= -github.com/niemeyer/pretty v0.0.0-20200227124842-a10e7caefd8e/go.mod h1:zD1mROLANZcx1PVRCS0qkT7pwLkGfwJo4zjcN/Tysno= -github.com/oapi-codegen/runtime v1.1.2 h1:P2+CubHq8fO4Q6fV1tqDBZHCwpVpvPg7oKiYzQgXIyI= -github.com/oapi-codegen/runtime v1.1.2/go.mod h1:SK9X900oXmPWilYR5/WKPzt3Kqxn/uS/+lbpREv+eCg= +github.com/oapi-codegen/runtime v1.2.0 h1:RvKc1CVS1QeKSNzO97FBQbSMZyQ8s6rZd+LpmzwHMP4= +github.com/oapi-codegen/runtime v1.2.0/go.mod h1:Y7ZhmmlE8ikZOmuHRRndiIm7nf3xcVv+YMweKgG1DT0= github.com/opentracing/opentracing-go v1.1.0/go.mod h1:UkNAQd3GIcIGf0SeVgPpRdFStlNbqXla1AfSYxPUl2o= github.com/parquet-go/bitpack v1.0.0 h1:AUqzlKzPPXf2bCdjfj4sTeacrUwsT7NlcYDMUQxPcQA= github.com/parquet-go/bitpack v1.0.0/go.mod h1:XnVk9TH+O40eOOmvpAVZ7K2ocQFrQwysLMnc6M/8lgs= @@ -268,8 +255,8 @@ github.com/prometheus/client_model v0.6.2 h1:oBsgwpGs7iVziMvrGhE53c/GrLUsZdHnqNw github.com/prometheus/client_model v0.6.2/go.mod h1:y3m2F6Gdpfy6Ut/GBsUqTWZqCUvMVzSfMLjcu6wAwpE= github.com/prometheus/common v0.67.5 
h1:pIgK94WWlQt1WLwAC5j2ynLaBRDiinoAb86HZHTUGI4= github.com/prometheus/common v0.67.5/go.mod h1:SjE/0MzDEEAyrdr5Gqc6G+sXI67maCxzaT3A2+HqjUw= -github.com/prometheus/procfs v0.19.2 h1:zUMhqEW66Ex7OXIiDkll3tl9a1ZdilUOd/F6ZXw4Vws= -github.com/prometheus/procfs v0.19.2/go.mod h1:M0aotyiemPhBCM0z5w87kL22CxfcH05ZpYlu+b4J7mw= +github.com/prometheus/procfs v0.20.0 h1:AA7aCvjxwAquZAlonN7888f2u4IN8WVeFgBi4k82M4Q= +github.com/prometheus/procfs v0.20.0/go.mod h1:o9EMBZGRyvDrSPH1RqdxhojkuXstoe4UlK79eF5TGGo= github.com/qustavo/sqlhooks/v2 v2.1.0 h1:54yBemHnGHp/7xgT+pxwmIlMSDNYKx5JW5dfRAiCZi0= github.com/qustavo/sqlhooks/v2 v2.1.0/go.mod h1:aMREyKo7fOKTwiLuWPsaHRXEmtqG4yREztO0idF83AU= github.com/robfig/cron/v3 v3.0.1 h1:WdRxkvbJztn8LMz/QEvLN5sBU+xKpSqwwUO1Pjr4qDs= @@ -286,8 +273,8 @@ github.com/sergi/go-diff v1.3.1/go.mod h1:aMJSSKb2lpPvRNec0+w3fl7LP9IOFzdc9Pa4NF github.com/sosodev/duration v1.3.1 h1:qtHBDMQ6lvMQsL15g4aopM4HEfOaYuhWBw3NPTtlqq4= github.com/sosodev/duration v1.3.1/go.mod h1:RQIBBX0+fMLc/D9+Jb/fwvVmo0eZvDDEERAikUR6SDg= github.com/spkg/bom v0.0.0-20160624110644-59b7046e48ad/go.mod h1:qLr4V1qq6nMqFKkMo8ZTx3f+BZEkzsRUY10Xsm2mwU0= -github.com/stmcginnis/gofish v0.21.1 h1:sutDvBhmLh4RDOZ1DN8GUyYRu7f1ggvKMMnSaiqhwn4= -github.com/stmcginnis/gofish v0.21.1/go.mod h1:PzF5i8ecRG9A2ol8XT64npKUunyraJ+7t0kYMpQAtqU= +github.com/stmcginnis/gofish v0.21.3 h1:EBLCHfORnbx7MPw7lplOOVe9QAD1T3XRVz6+a1Z4z5Q= +github.com/stmcginnis/gofish v0.21.3/go.mod h1:PzF5i8ecRG9A2ol8XT64npKUunyraJ+7t0kYMpQAtqU= github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= github.com/stretchr/objx v0.4.0/go.mod h1:YvHI0jy2hoMjB+UWwv71VJQ9isScKT/TqJzVSSt89Yw= github.com/stretchr/objx v0.5.2 h1:xuMeJ0Sdp5ZMRXx/aWO6RZxdr3beISkG5/G/aIRr3pY= @@ -328,8 +315,8 @@ golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACk golang.org/x/crypto v0.0.0-20210921155107-089bfa567519/go.mod h1:GvvjBRRGRdwPK5ydBHafDWAxML/pGHZbMvKqRZ5+Abc= golang.org/x/crypto v0.48.0 h1:/VRzVqiRSggnhY7gNRxPauEQ5Drw9haKdM0jqfcCFts= golang.org/x/crypto v0.48.0/go.mod h1:r0kV5h3qnFPlQnBSrULhlsRfryS2pmewsg+XfMgkVos= -golang.org/x/exp v0.0.0-20260212183809-81e46e3db34a h1:ovFr6Z0MNmU7nH8VaX5xqw+05ST2uO1exVfZPVqRC5o= -golang.org/x/exp v0.0.0-20260212183809-81e46e3db34a/go.mod h1:K79w1Vqn7PoiZn+TkNpx3BUWUQksGO3JcVX6qIjytmA= +golang.org/x/exp v0.0.0-20260218203240-3dfff04db8fa h1:Zt3DZoOFFYkKhDT3v7Lm9FDMEV06GpzjG2jrqW+QTE0= +golang.org/x/exp v0.0.0-20260218203240-3dfff04db8fa/go.mod h1:K79w1Vqn7PoiZn+TkNpx3BUWUQksGO3JcVX6qIjytmA= golang.org/x/mod v0.6.0-dev.0.20220419223038-86c51ed26bb4/go.mod h1:jJ57K6gSWd91VN4djpZkiMVwK6gcyfeH4XE8wZrZaV4= golang.org/x/mod v0.33.0 h1:tHFzIWbBifEmbwtGz65eaWyGiGZatSrT9prnU8DbVL8= golang.org/x/mod v0.33.0/go.mod h1:swjeQEj+6r7fODbD2cqrnje9PnziFuw4bmLbBZFrQ5w= @@ -337,8 +324,8 @@ golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLL golang.org/x/net v0.0.0-20210226172049-e18ecbb05110/go.mod h1:m0MpNAwzfU5UDzcl9v0D8zg8gWTRqZa9RBIspLL5mdg= golang.org/x/net v0.0.0-20220722155237-a158d28d115b/go.mod h1:XRhObCWvk6IyKnWLug+ECip1KBveYUHfp+8e9klMJ9c= golang.org/x/net v0.7.0/go.mod h1:2Tu9+aMcznHK/AK1HMvgo6xiTLG5rD5rZLDS+rp2Bjs= -golang.org/x/net v0.50.0 h1:ucWh9eiCGyDR3vtzso0WMQinm2Dnt8cFMuQa9K33J60= -golang.org/x/net v0.50.0/go.mod h1:UgoSli3F/pBgdJBHCTc+tp3gmrU4XswgGRgtnwWTfyM= +golang.org/x/net v0.51.0 h1:94R/GTO7mt3/4wIKpcR5gkGmRLOuE/2hNGeWq/GBIFo= +golang.org/x/net v0.51.0/go.mod h1:aamm+2QF5ogm02fjy5Bb7CQ0WMt1/WVM7FtyaTLlA9Y= golang.org/x/oauth2 v0.35.0 
h1:Mv2mzuHuZuY2+bkyWXIHMfhNdJAdwW3FuWeCPYN5GVQ= golang.org/x/oauth2 v0.35.0/go.mod h1:lzm5WQJQwKZ3nwavOZ3IS5Aulzxi68dUSgRHujetwEA= golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= @@ -370,16 +357,13 @@ golang.org/x/tools v0.1.12/go.mod h1:hNGJHUnrk76NpqgfD5Aqm5Crs+Hm0VOH/i9J2+nxYbc golang.org/x/tools v0.42.0 h1:uNgphsn75Tdz5Ji2q36v/nsFSfR/9BRFvqhGBaJGd5k= golang.org/x/tools v0.42.0/go.mod h1:Ma6lCIwGZvHK6XtgbswSoWroEkhugApmsXyrUmBhfr0= golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= -golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= google.golang.org/protobuf v1.36.11 h1:fV6ZwhNocDyBLK0dj+fg8ektcVegBBuEolpbTQyBNVE= google.golang.org/protobuf v1.36.11/go.mod h1:HTf+CrKn2C3g5S8VImy6tdcUvCska2kB7j23XfzDpco= gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= -gopkg.in/check.v1 v1.0.0-20200227125254-8fa46927fb4f/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c h1:Hei/4ADfdWqJk1ZMxUNpqntNwaWcugrBjAiHlqqRiVk= gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c/go.mod h1:JHkPIbrfpd72SG/EVd6muEfDQjcINNoR0C8j2r3qZ4Q= gopkg.in/yaml.v2 v2.2.2/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= -gopkg.in/yaml.v3 v3.0.0-20200615113413-eeeca48fe776/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= sigs.k8s.io/yaml v1.6.0 h1:G8fkbMSAFqgEFgh4b1wmtzDnioxFCUgTZhlbj5P9QYs= diff --git a/internal/api/metricstore.go b/internal/api/metricstore.go index 5c15bb2c..ff4deb6a 100644 --- a/internal/api/metricstore.go +++ b/internal/api/metricstore.go @@ -18,7 +18,7 @@ import ( "github.com/ClusterCockpit/cc-backend/pkg/metricstore" cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger" - "github.com/influxdata/line-protocol/v2/lineprotocol" + "github.com/ClusterCockpit/cc-line-protocol/v2/lineprotocol" ) // handleFree godoc diff --git a/internal/api/nats.go b/internal/api/nats.go index 02a03fae..efa4ab6f 100644 --- a/internal/api/nats.go +++ b/internal/api/nats.go @@ -21,7 +21,7 @@ import ( "github.com/ClusterCockpit/cc-lib/v2/nats" "github.com/ClusterCockpit/cc-lib/v2/receivers" "github.com/ClusterCockpit/cc-lib/v2/schema" - influx "github.com/influxdata/line-protocol/v2/lineprotocol" + influx "github.com/ClusterCockpit/cc-line-protocol/v2/lineprotocol" ) // NatsAPI provides NATS subscription-based handlers for Job and Node operations. diff --git a/pkg/metricstore/lineprotocol.go b/pkg/metricstore/lineprotocol.go index bfbbef2d..ed30dec7 100644 --- a/pkg/metricstore/lineprotocol.go +++ b/pkg/metricstore/lineprotocol.go @@ -14,7 +14,7 @@ import ( cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger" "github.com/ClusterCockpit/cc-lib/v2/nats" "github.com/ClusterCockpit/cc-lib/v2/schema" - "github.com/influxdata/line-protocol/v2/lineprotocol" + "github.com/ClusterCockpit/cc-line-protocol/v2/lineprotocol" ) func ReceiveNats(ms *MemoryStore, From a1db8263d72b9727347ea69e0cc832ec67bd1235 Mon Sep 17 00:00:00 2001 From: Jan Eitzinger Date: Fri, 27 Feb 2026 12:30:27 +0100 Subject: [PATCH 07/20] Document line protocol. 
Optimize REST writeMetric path --- internal/api/metricstore.go | 30 ++++-- pkg/metricstore/lineprotocol.go | 163 ++++++++++++++++++++++++++------ 2 files changed, 155 insertions(+), 38 deletions(-) diff --git a/internal/api/metricstore.go b/internal/api/metricstore.go index ff4deb6a..325b26ba 100644 --- a/internal/api/metricstore.go +++ b/internal/api/metricstore.go @@ -10,7 +10,6 @@ import ( "encoding/json" "errors" "fmt" - "io" "net/http" "strconv" "strings" @@ -90,16 +89,17 @@ func freeMetrics(rw http.ResponseWriter, r *http.Request) { // @security ApiKeyAuth // @router /write/ [post] func writeMetrics(rw http.ResponseWriter, r *http.Request) { - bytes, err := io.ReadAll(r.Body) rw.Header().Add("Content-Type", "application/json") - if err != nil { - handleError(err, http.StatusInternalServerError, rw) - return - } + // Extract the "cluster" query parameter without allocating a url.Values map. + cluster := queryParam(r.URL.RawQuery, "cluster") + + // Stream directly from the request body instead of copying it into a + // temporary buffer via io.ReadAll. The line-protocol decoder supports + // io.Reader natively, so this avoids the largest heap allocation. ms := metricstore.GetMemoryStore() - dec := lineprotocol.NewDecoderWithBytes(bytes) - if err := metricstore.DecodeLine(dec, ms, r.URL.Query().Get("cluster")); err != nil { + dec := lineprotocol.NewDecoder(r.Body) + if err := metricstore.DecodeLine(dec, ms, cluster); err != nil { cclog.Errorf("/api/write error: %s", err.Error()) handleError(err, http.StatusBadRequest, rw) return @@ -107,6 +107,20 @@ func writeMetrics(rw http.ResponseWriter, r *http.Request) { rw.WriteHeader(http.StatusOK) } +// queryParam extracts a single query-parameter value from a raw query string +// without allocating a url.Values map. Returns "" if the key is not present. +func queryParam(raw, key string) string { + for raw != "" { + var kv string + kv, raw, _ = strings.Cut(raw, "&") + k, v, _ := strings.Cut(kv, "=") + if k == key { + return v + } + } + return "" +} + // handleDebug godoc // @summary Debug endpoint // @tags debug diff --git a/pkg/metricstore/lineprotocol.go b/pkg/metricstore/lineprotocol.go index f8c83e31..ecae3df1 100644 --- a/pkg/metricstore/lineprotocol.go +++ b/pkg/metricstore/lineprotocol.go @@ -3,9 +3,23 @@ // Use of this source code is governed by a MIT-style // license that can be found in the LICENSE file. +// This file implements ingestion of InfluxDB line-protocol metric data received +// over NATS. Each line encodes one metric sample with the following structure: +// +// [,cluster=][,hostname=][,type=][,type-id=][,subtype=][,stype-id=] value= [] +// +// The measurement name identifies the metric (e.g. "cpu_load"). Tags provide +// routing information (cluster, host) and optional sub-device selectors (type, +// subtype). Only one field is expected per line: "value". +// +// After decoding, each sample is: +// 1. Written to the in-memory store via ms.WriteToLevel. +// 2. If the checkpoint format is "wal", also forwarded to the WAL staging +// goroutine via the WALMessages channel for durable write-ahead logging. package metricstore import ( + "bytes" "context" "fmt" "sync" @@ -17,6 +31,16 @@ import ( "github.com/ClusterCockpit/cc-line-protocol/v2/lineprotocol" ) +// ReceiveNats subscribes to all configured NATS subjects and feeds incoming +// line-protocol messages into the MemoryStore. +// +// When workers > 1 a pool of goroutines drains a shared channel so that +// multiple messages can be decoded in parallel. 
With workers == 1 the NATS +// callback decodes inline (no channel overhead, lower latency). +// +// The function blocks until ctx is cancelled and all worker goroutines have +// finished. It returns nil when the NATS client is not configured; callers +// should treat that as a no-op rather than an error. func ReceiveNats(ms *MemoryStore, workers int, ctx context.Context, @@ -75,8 +99,13 @@ func ReceiveNats(ms *MemoryStore, return nil } -// Place `prefix` in front of `buf` but if possible, -// do that inplace in `buf`. +// reorder prepends prefix to buf in-place when buf has enough spare capacity, +// avoiding an allocation. Falls back to a regular append otherwise. +// +// It is used to assemble the "type" and "subtype" selector +// strings when the type tag arrives before the type-id tag in the line, so the +// two byte slices need to be concatenated in tag-declaration order regardless +// of wire order. func reorder(buf, prefix []byte) []byte { n := len(prefix) m := len(buf) @@ -94,17 +123,83 @@ func reorder(buf, prefix []byte) []byte { } } -// Decode lines using dec and make write calls to the MemoryStore. -// If a line is missing its cluster tag, use clusterDefault as default. +// decodeState holds the per-call scratch buffers used by DecodeLine. +// Instances are recycled via decodeStatePool to avoid repeated allocations +// during high-throughput ingestion. +type decodeState struct { + // metricBuf holds a copy of the current measurement name (line-protocol + // measurement field). Copied because dec.Measurement() returns a slice + // that is invalidated by the next decoder call. + metricBuf []byte + + // selector is the sub-device path passed to WriteToLevel and WALMessage + // (e.g. ["socket0"] or ["socket0", "memctrl1"]). Reused across lines. + selector []string + + // typeBuf accumulates the concatenated "type"+"type-id" tag value for the + // current line. Reset at the start of each line's tag-decode loop. + typeBuf []byte + + // subTypeBuf accumulates the concatenated "subtype"+"stype-id" tag value. + // Reset at the start of each line's tag-decode loop. + subTypeBuf []byte + + // prevTypeBytes / prevTypeStr cache the last seen typeBuf content and its + // string conversion. Because consecutive lines in a batch typically address + // the same sub-device, the cache hit rate is very high and avoids + // repeated []byte→string allocations. + prevTypeBytes []byte + prevTypeStr string + + // prevSubTypeBytes / prevSubTypeStr are the same cache for the subtype. + prevSubTypeBytes []byte + prevSubTypeStr string +} + +// decodeStatePool recycles decodeState values across DecodeLine calls to +// reduce GC pressure during sustained metric ingestion. +var decodeStatePool = sync.Pool{ + New: func() any { + return &decodeState{ + metricBuf: make([]byte, 0, 16), + selector: make([]string, 0, 4), + typeBuf: make([]byte, 0, 16), + subTypeBuf: make([]byte, 0, 16), + } + }, +} + +// DecodeLine reads all lines from dec (InfluxDB line-protocol) and writes each +// decoded metric sample into ms. +// +// clusterDefault is used as the cluster name for lines that do not carry a +// "cluster" tag. Callers typically supply the ClusterTag value from the NATS +// subscription configuration. +// +// Performance notes: +// - A decodeState is obtained from decodeStatePool to reuse scratch buffers. +// - The Level pointer (host-level node in the metric tree) is cached across +// consecutive lines that share the same cluster+host pair to avoid +// repeated lock acquisitions on the root and cluster levels. 
+// - []byte→string conversions for type/subtype selectors are cached via +// prevType*/prevSubType* fields because batches typically repeat the same +// sub-device identifiers. +// - Timestamp parsing tries Second precision first; if that fails it retries +// Millisecond, Microsecond, and Nanosecond in turn. A missing timestamp +// falls back to time.Now(). +// +// When the checkpoint format is "wal" each successfully decoded sample is also +// sent to WALMessages so the WAL staging goroutine can persist it durably +// before the next binary snapshot. func DecodeLine(dec *lineprotocol.Decoder, ms *MemoryStore, clusterDefault string, ) error { // Reduce allocations in loop: t := time.Now() - metric, metricBuf := Metric{}, make([]byte, 0, 16) - selector := make([]string, 0, 4) - typeBuf, subTypeBuf := make([]byte, 0, 16), make([]byte, 0) + metric := Metric{} + st := decodeStatePool.Get().(*decodeState) + defer decodeStatePool.Put(st) // Optimize for the case where all lines in a "batch" are about the same // cluster and host. By using `WriteToLevel` (level = host), we do not need @@ -121,7 +216,7 @@ func DecodeLine(dec *lineprotocol.Decoder, // Needs to be copied because another call to dec.* would // invalidate the returned slice. - metricBuf = append(metricBuf[:0], rawmeasurement...) + st.metricBuf = append(st.metricBuf[:0], rawmeasurement...) // The go compiler optimizes map[string(byteslice)] lookups: metric.MetricConfig, ok = ms.Metrics[string(rawmeasurement)] @@ -129,7 +224,7 @@ func DecodeLine(dec *lineprotocol.Decoder, continue } - typeBuf, subTypeBuf := typeBuf[:0], subTypeBuf[:0] + st.typeBuf, st.subTypeBuf = st.typeBuf[:0], st.subTypeBuf[:0] cluster, host := clusterDefault, "" for { key, val, err := dec.NextTag() @@ -162,41 +257,49 @@ func DecodeLine(dec *lineprotocol.Decoder, } // We cannot be sure that the "type" tag comes before the "type-id" tag: - if len(typeBuf) == 0 { - typeBuf = append(typeBuf, val...) + if len(st.typeBuf) == 0 { + st.typeBuf = append(st.typeBuf, val...) } else { - typeBuf = reorder(typeBuf, val) + st.typeBuf = reorder(st.typeBuf, val) } case "type-id": - typeBuf = append(typeBuf, val...) + st.typeBuf = append(st.typeBuf, val...) case "subtype": // We cannot be sure that the "subtype" tag comes before the "stype-id" tag: - if len(subTypeBuf) == 0 { - subTypeBuf = append(subTypeBuf, val...) + if len(st.subTypeBuf) == 0 { + st.subTypeBuf = append(st.subTypeBuf, val...) } else { - subTypeBuf = reorder(subTypeBuf, val) - // subTypeBuf = reorder(typeBuf, val) + st.subTypeBuf = reorder(st.subTypeBuf, val) } case "stype-id": - subTypeBuf = append(subTypeBuf, val...) + st.subTypeBuf = append(st.subTypeBuf, val...) default: } } // If the cluster or host changed, the lvl was set to nil if lvl == nil { - selector = selector[:2] - selector[0], selector[1] = cluster, host - lvl = ms.GetLevel(selector) + st.selector = st.selector[:2] + st.selector[0], st.selector[1] = cluster, host + lvl = ms.GetLevel(st.selector) prevCluster, prevHost = cluster, host } - // subtypes: - selector = selector[:0] - if len(typeBuf) > 0 { - selector = append(selector, string(typeBuf)) // <- Allocation :( - if len(subTypeBuf) > 0 { - selector = append(selector, string(subTypeBuf)) + // subtypes: cache []byte→string conversions; messages in a batch typically + // share the same type/subtype so the hit rate is very high. + st.selector = st.selector[:0] + if len(st.typeBuf) > 0 { + if !bytes.Equal(st.typeBuf, st.prevTypeBytes) { + st.prevTypeBytes = append(st.prevTypeBytes[:0], st.typeBuf...) 
+ st.prevTypeStr = string(st.typeBuf) + } + st.selector = append(st.selector, st.prevTypeStr) + if len(st.subTypeBuf) > 0 { + if !bytes.Equal(st.subTypeBuf, st.prevSubTypeBytes) { + st.prevSubTypeBytes = append(st.prevSubTypeBytes[:0], st.subTypeBuf...) + st.prevSubTypeStr = string(st.subTypeBuf) + } + st.selector = append(st.selector, st.prevSubTypeStr) } } @@ -246,16 +349,16 @@ func DecodeLine(dec *lineprotocol.Decoder, if Keys.Checkpoints.FileFormat == "wal" { WALMessages <- &WALMessage{ - MetricName: string(metricBuf), + MetricName: string(st.metricBuf), Cluster: cluster, Node: host, - Selector: append([]string{}, selector...), + Selector: append([]string{}, st.selector...), Value: metric.Value, Timestamp: time, } } - if err := ms.WriteToLevel(lvl, selector, time, []Metric{metric}); err != nil { + if err := ms.WriteToLevel(lvl, st.selector, time, []Metric{metric}); err != nil { return err } } From a418abc7d5ccfc806318caa9effa99d9f955fbcb Mon Sep 17 00:00:00 2001 From: Jan Eitzinger Date: Fri, 27 Feb 2026 14:40:26 +0100 Subject: [PATCH 08/20] Run go fix --- internal/api/rest.go | 2 +- pkg/archive/fsBackend.go | 6 ++---- pkg/archive/s3Backend.go | 6 ++---- pkg/archive/sqliteBackend.go | 6 ++---- pkg/metricstore/archive.go | 6 ++---- pkg/metricstore/checkpoint.go | 6 ++---- pkg/metricstore/metricstore.go | 12 ++++-------- pkg/metricstore/walCheckpoint.go | 14 +++++++------- 8 files changed, 22 insertions(+), 36 deletions(-) diff --git a/internal/api/rest.go b/internal/api/rest.go index 4d2385e3..613867a8 100644 --- a/internal/api/rest.go +++ b/internal/api/rest.go @@ -302,7 +302,7 @@ func (api *RestAPI) runTagger(rw http.ResponseWriter, r *http.Request) { rw.Header().Set("Content-Type", "text/plain") rw.WriteHeader(http.StatusOK) - if _, err := rw.Write([]byte(fmt.Sprintf("Tagger %s started", name))); err != nil { + if _, err := rw.Write(fmt.Appendf(nil, "Tagger %s started", name)); err != nil { cclog.Errorf("Failed to write response: %v", err) } } diff --git a/pkg/archive/fsBackend.go b/pkg/archive/fsBackend.go index 07b86e2b..dfc870b4 100644 --- a/pkg/archive/fsBackend.go +++ b/pkg/archive/fsBackend.go @@ -501,9 +501,7 @@ func (fsa *FsArchive) Iter(loadMetricData bool) <-chan JobContainer { var wg sync.WaitGroup for range numWorkers { - wg.Add(1) - go func() { - defer wg.Done() + wg.Go(func() { for jobPath := range jobPaths { job, err := loadJobMeta(filepath.Join(jobPath, "meta.json")) if err != nil && !errors.Is(err, &jsonschema.ValidationError{}) { @@ -529,7 +527,7 @@ func (fsa *FsArchive) Iter(loadMetricData bool) <-chan JobContainer { ch <- JobContainer{Meta: job, Data: nil} } } - }() + }) } clustersDir, err := os.ReadDir(fsa.path) diff --git a/pkg/archive/s3Backend.go b/pkg/archive/s3Backend.go index 84abd713..7b82d309 100644 --- a/pkg/archive/s3Backend.go +++ b/pkg/archive/s3Backend.go @@ -821,9 +821,7 @@ func (s3a *S3Archive) Iter(loadMetricData bool) <-chan JobContainer { var wg sync.WaitGroup for range numWorkers { - wg.Add(1) - go func() { - defer wg.Done() + wg.Go(func() { for metaKey := range metaKeys { result, err := s3a.client.GetObject(ctx, &s3.GetObjectInput{ Bucket: aws.String(s3a.bucket), @@ -859,7 +857,7 @@ func (s3a *S3Archive) Iter(loadMetricData bool) <-chan JobContainer { ch <- JobContainer{Meta: job, Data: nil} } } - }() + }) } for _, cluster := range s3a.clusters { diff --git a/pkg/archive/sqliteBackend.go b/pkg/archive/sqliteBackend.go index 50821367..3f214136 100644 --- a/pkg/archive/sqliteBackend.go +++ b/pkg/archive/sqliteBackend.go @@ -576,9 +576,7 @@ func 
(sa *SqliteArchive) Iter(loadMetricData bool) <-chan JobContainer { var wg sync.WaitGroup for range numWorkers { - wg.Add(1) - go func() { - defer wg.Done() + wg.Go(func() { for row := range jobRows { job, err := DecodeJobMeta(bytes.NewReader(row.metaBlob)) if err != nil { @@ -617,7 +615,7 @@ func (sa *SqliteArchive) Iter(loadMetricData bool) <-chan JobContainer { ch <- JobContainer{Meta: job, Data: nil} } } - }() + }) } for { diff --git a/pkg/metricstore/archive.go b/pkg/metricstore/archive.go index 784348b5..d3617f2c 100644 --- a/pkg/metricstore/archive.go +++ b/pkg/metricstore/archive.go @@ -49,9 +49,7 @@ func CleanUp(wg *sync.WaitGroup, ctx context.Context) { // runWorker takes simple values to configure what it does func cleanUpWorker(wg *sync.WaitGroup, ctx context.Context, interval string, mode string, cleanupDir string, delete bool) { - wg.Add(1) - go func() { - defer wg.Done() + wg.Go(func() { d, err := time.ParseDuration(interval) if err != nil { @@ -85,7 +83,7 @@ func cleanUpWorker(wg *sync.WaitGroup, ctx context.Context, interval string, mod } } } - }() + }) } var ErrNoNewArchiveData error = errors.New("all data already archived") diff --git a/pkg/metricstore/checkpoint.go b/pkg/metricstore/checkpoint.go index 590197e3..45b2bc2a 100644 --- a/pkg/metricstore/checkpoint.go +++ b/pkg/metricstore/checkpoint.go @@ -96,9 +96,7 @@ func Checkpointing(wg *sync.WaitGroup, ctx context.Context) { ms := GetMemoryStore() - wg.Add(1) - go func() { - defer wg.Done() + wg.Go(func() { d, err := time.ParseDuration(Keys.Checkpoints.Interval) if err != nil { @@ -149,7 +147,7 @@ func Checkpointing(wg *sync.WaitGroup, ctx context.Context) { } } } - }() + }) } // MarshalJSON provides optimized JSON encoding for CheckpointMetrics. diff --git a/pkg/metricstore/metricstore.go b/pkg/metricstore/metricstore.go index 3fe64d55..d46c0d15 100644 --- a/pkg/metricstore/metricstore.go +++ b/pkg/metricstore/metricstore.go @@ -320,9 +320,7 @@ func Shutdown() { func Retention(wg *sync.WaitGroup, ctx context.Context) { ms := GetMemoryStore() - wg.Add(1) - go func() { - defer wg.Done() + wg.Go(func() { d, err := time.ParseDuration(Keys.RetentionInMemory) if err != nil { cclog.Fatal(err) @@ -361,7 +359,7 @@ func Retention(wg *sync.WaitGroup, ctx context.Context) { state.mu.Unlock() } } - }() + }) } // MemoryUsageTracker starts a background goroutine that monitors memory usage. @@ -382,9 +380,7 @@ func Retention(wg *sync.WaitGroup, ctx context.Context) { func MemoryUsageTracker(wg *sync.WaitGroup, ctx context.Context) { ms := GetMemoryStore() - wg.Add(1) - go func() { - defer wg.Done() + wg.Go(func() { d := DefaultMemoryUsageTrackerInterval if d <= 0 { @@ -470,7 +466,7 @@ func MemoryUsageTracker(wg *sync.WaitGroup, ctx context.Context) { } } } - }() + }) } // Free removes metric data older than the given time while preserving data for active nodes. diff --git a/pkg/metricstore/walCheckpoint.go b/pkg/metricstore/walCheckpoint.go index e8a71ce2..685a8388 100644 --- a/pkg/metricstore/walCheckpoint.go +++ b/pkg/metricstore/walCheckpoint.go @@ -65,6 +65,7 @@ import ( "math" "os" "path" + "strings" "sync" "sync/atomic" @@ -114,9 +115,7 @@ type walFileState struct { // and appends binary WAL records to per-host current.wal files. // Also handles WAL rotation requests from the checkpoint goroutine. 
func WALStaging(wg *sync.WaitGroup, ctx context.Context) { - wg.Add(1) - go func() { - defer wg.Done() + wg.Go(func() { if Keys.Checkpoints.FileFormat == "json" { return @@ -220,7 +219,7 @@ func WALStaging(wg *sync.WaitGroup, ctx context.Context) { processRotate(req) } } - }() + }) } // RotateWALFiles sends rotation requests for the given host directories @@ -478,11 +477,12 @@ func joinSelector(sel []string) string { if len(sel) == 0 { return "" } - result := sel[0] + var result strings.Builder + result.WriteString(sel[0]) for i := 1; i < len(sel); i++ { - result += "\x00" + sel[i] + result.WriteString("\x00" + sel[i]) } - return result + return result.String() } // ToCheckpointWAL writes binary snapshot files for all hosts in parallel. From 07b989cb81538bdce1e6dce50222c3cc7d76ab58 Mon Sep 17 00:00:00 2001 From: Aditya Ujeniya Date: Fri, 27 Feb 2026 14:44:32 +0100 Subject: [PATCH 09/20] Add new bufferPool implementation --- pkg/metricstore/buffer.go | 101 ++++++++++++++++++++++++++-- pkg/metricstore/level.go | 2 + pkg/metricstore/metricstore.go | 6 ++ pkg/metricstore/metricstore_test.go | 50 ++++++++++++++ 4 files changed, 155 insertions(+), 4 deletions(-) diff --git a/pkg/metricstore/buffer.go b/pkg/metricstore/buffer.go index 665d8012..f486e645 100644 --- a/pkg/metricstore/buffer.go +++ b/pkg/metricstore/buffer.go @@ -43,6 +43,7 @@ package metricstore import ( "errors" "sync" + "time" "github.com/ClusterCockpit/cc-lib/v2/schema" ) @@ -53,12 +54,102 @@ import ( // of data or reallocation needs to happen on writes. const BufferCap int = DefaultBufferCapacity -var bufferPool sync.Pool = sync.Pool{ - New: func() any { +// BufferPool is the global instance. +// It is initialized immediately when the package loads. +var bufferPool = NewPersistentBufferPool() + +type PersistentBufferPool struct { + pool []*buffer + mu sync.Mutex +} + +// NewPersistentBufferPool creates a dynamic pool for buffers. +func NewPersistentBufferPool() *PersistentBufferPool { + return &PersistentBufferPool{ + pool: make([]*buffer, 0), + } +} + +func (p *PersistentBufferPool) Get() *buffer { + p.mu.Lock() + defer p.mu.Unlock() + + n := len(p.pool) + if n == 0 { + // Pool is empty, allocate a new one return &buffer{ data: make([]schema.Float, 0, BufferCap), } - }, + } + + // Reuse existing buffer from the pool + b := p.pool[n-1] + p.pool[n-1] = nil // Avoid memory leak + p.pool = p.pool[:n-1] + return b +} + +func (p *PersistentBufferPool) Put(b *buffer) { + // Reset the buffer before putting it back + b.data = b.data[:0] + + p.mu.Lock() + defer p.mu.Unlock() + p.pool = append(p.pool, b) +} + +// GetSize returns the exact number of buffers currently sitting in the pool. +func (p *PersistentBufferPool) GetSize() int { + p.mu.Lock() + defer p.mu.Unlock() + return len(p.pool) +} + +// Clear drains all buffers currently in the pool, allowing the GC to collect them. +func (p *PersistentBufferPool) Clear() { + p.mu.Lock() + defer p.mu.Unlock() + for i := range p.pool { + p.pool[i] = nil + } + p.pool = p.pool[:0] +} + +// Clean removes buffers from the pool that haven't been used in the given duration. +// It uses a simple LRU approach based on the lastUsed timestamp. 
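+//
+// Illustrative call (the real caller, Retention(), passes state.lastRetentionTime;
+// the one-hour threshold here is only an example):
+//
+//	bufferPool.Clean(time.Now().Add(-1 * time.Hour).Unix())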
+func (p *PersistentBufferPool) Clean(threshold int64) { + p.mu.Lock() + defer p.mu.Unlock() + + // Filter in place + active := p.pool[:0] + for _, b := range p.pool { + if b.lastUsed >= threshold { + active = append(active, b) + } else { + // Buffer is older than the threshold, let it be collected by GC + } + } + + // Nullify the rest to prevent memory leaks + for i := len(active); i < len(p.pool); i++ { + p.pool[i] = nil + } + + p.pool = active +} + +// CleanAll removes all buffers from the pool. +func (p *PersistentBufferPool) CleanAll() { + p.mu.Lock() + defer p.mu.Unlock() + + // Nullify all buffers to prevent memory leaks + for i := range p.pool { + p.pool[i] = nil + } + + p.pool = p.pool[:0] } var ( @@ -94,10 +185,11 @@ type buffer struct { start int64 archived bool closed bool + lastUsed int64 } func newBuffer(ts, freq int64) *buffer { - b := bufferPool.Get().(*buffer) + b := bufferPool.Get() b.frequency = freq b.start = ts - (freq / 2) b.prev = nil @@ -240,6 +332,7 @@ func (b *buffer) free(t int64) (delme bool, n int) { if cap(b.prev.data) != BufferCap { b.prev.data = make([]schema.Float, 0, BufferCap) } + b.prev.lastUsed = time.Now().Unix() bufferPool.Put(b.prev) b.prev = nil } diff --git a/pkg/metricstore/level.go b/pkg/metricstore/level.go index 85c2ba7b..ef082579 100644 --- a/pkg/metricstore/level.go +++ b/pkg/metricstore/level.go @@ -42,6 +42,7 @@ package metricstore import ( "sync" + "time" "unsafe" "github.com/ClusterCockpit/cc-lib/v2/schema" @@ -192,6 +193,7 @@ func (l *Level) free(t int64) (int, error) { if cap(b.data) != BufferCap { b.data = make([]schema.Float, 0, BufferCap) } + b.lastUsed = time.Now().Unix() bufferPool.Put(b) l.metrics[i] = nil } diff --git a/pkg/metricstore/metricstore.go b/pkg/metricstore/metricstore.go index d46c0d15..db3e4357 100644 --- a/pkg/metricstore/metricstore.go +++ b/pkg/metricstore/metricstore.go @@ -357,6 +357,9 @@ func Retention(wg *sync.WaitGroup, ctx context.Context) { } state.mu.Unlock() + + // Clean up the buffer pool + bufferPool.Clean(state.lastRetentionTime) } } }) @@ -425,6 +428,9 @@ func MemoryUsageTracker(wg *sync.WaitGroup, ctx context.Context) { runtime.ReadMemStats(&mem) actualMemoryGB = float64(mem.Alloc) / 1e9 + bufferPool.CleanAll() + cclog.Infof("[METRICSTORE]> Cleaned up bufferPool\n") + if actualMemoryGB > float64(Keys.MemoryCap) { cclog.Warnf("[METRICSTORE]> memory usage %.2f GB exceeds cap %d GB, starting emergency buffer freeing", actualMemoryGB, Keys.MemoryCap) diff --git a/pkg/metricstore/metricstore_test.go b/pkg/metricstore/metricstore_test.go index eb1aff15..55c97e60 100644 --- a/pkg/metricstore/metricstore_test.go +++ b/pkg/metricstore/metricstore_test.go @@ -464,3 +464,53 @@ func TestBufferHealthChecks(t *testing.T) { }) } } + +func TestBufferPoolClean(t *testing.T) { + // Use a fresh pool for testing + pool := NewPersistentBufferPool() + + now := time.Now().Unix() + + // Create some buffers and put them in the pool with different lastUsed times + b1 := &buffer{lastUsed: now - 3600, data: make([]schema.Float, 0)} // 1 hour ago + b2 := &buffer{lastUsed: now - 7200, data: make([]schema.Float, 0)} // 2 hours ago + b3 := &buffer{lastUsed: now - 180000, data: make([]schema.Float, 0)} // 50 hours ago + b4 := &buffer{lastUsed: now - 200000, data: make([]schema.Float, 0)} // 55 hours ago + b5 := &buffer{lastUsed: now, data: make([]schema.Float, 0)} + + pool.Put(b1) + pool.Put(b2) + pool.Put(b3) + pool.Put(b4) + pool.Put(b5) + + if pool.GetSize() != 5 { + t.Fatalf("Expected pool size 5, got %d", pool.GetSize()) + } + 
+
+	// Clean buffers older than 48 hours
+	timeUpdate := time.Now().Add(48 * time.Hour).Unix()
+	pool.Clean(timeUpdate)
+
+	// Expected: b1, b2, b5 should remain. b3, b4 should be cleaned.
+	if pool.GetSize() != 3 {
+		t.Fatalf("Expected pool size 3 after clean, got %d", pool.GetSize())
+	}
+
+	validBufs := map[int64]bool{
+		b1.lastUsed: true,
+		b2.lastUsed: true,
+		b5.lastUsed: true,
+	}
+
+	for i := 0; i < 3; i++ {
+		b := pool.Get()
+		if !validBufs[b.lastUsed] {
+			t.Errorf("Found unexpected buffer with lastUsed %d", b.lastUsed)
+		}
+	}
+
+	if pool.GetSize() != 0 {
+		t.Fatalf("Expected pool to be empty, got %d", pool.GetSize())
+	}
+}

From 2e5d85c2231342dcac0034b93933f0bd969b1b13 Mon Sep 17 00:00:00 2001
From: Aditya Ujeniya
Date: Fri, 27 Feb 2026 15:09:06 +0100
Subject: [PATCH 10/20] Update testcase

---
 pkg/metricstore/metricstore_test.go | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pkg/metricstore/metricstore_test.go b/pkg/metricstore/metricstore_test.go
index 55c97e60..772fd7ea 100644
--- a/pkg/metricstore/metricstore_test.go
+++ b/pkg/metricstore/metricstore_test.go
@@ -489,7 +489,7 @@ func TestBufferPoolClean(t *testing.T) {
 	}
 
 	// Clean buffers older than 48 hours
-	timeUpdate := time.Now().Add(48 * time.Hour).Unix()
+	timeUpdate := time.Now().Add(-48 * time.Hour).Unix()
 	pool.Clean(timeUpdate)
 
 	// Expected: b1, b2, b5 should remain. b3, b4 should be cleaned.

From d00aa2666dd2a5da67291638105a74012cc0062e Mon Sep 17 00:00:00 2001
From: Christoph Kluge
Date: Fri, 27 Feb 2026 15:20:09 +0100
Subject: [PATCH 11/20] activate update of roles and projects if updateUserOnLogin is set

---
 internal/repository/user.go | 70 ++++++++++++++++++++++++++++++++-----
 1 file changed, 61 insertions(+), 9 deletions(-)

diff --git a/internal/repository/user.go b/internal/repository/user.go
index 966646dd..38a4980b 100644
--- a/internal/repository/user.go
+++ b/internal/repository/user.go
@@ -10,6 +10,7 @@ import (
 	"encoding/json"
 	"errors"
 	"fmt"
+	"reflect"
 	"strings"
 	"sync"
 
@@ -187,8 +188,8 @@ func (r *UserRepository) AddUser(user *schema.User) error {
 }
 
 func (r *UserRepository) UpdateUser(dbUser *schema.User, user *schema.User) error {
-	// user contains updated info, apply to dbuser
-	// TODO: Discuss updatable fields
+	// user contains updated info -> Apply to dbUser
+	// --- Simple Name Update ---
 	if dbUser.Name != user.Name {
 		if _, err := sq.Update("hpc_user").Set("name", user.Name).Where("hpc_user.username = ?", dbUser.Username).RunWith(r.DB).Exec(); err != nil {
 			cclog.Errorf("error while updating name of user '%s'", user.Username)
@@ -196,13 +197,64 @@ func (r *UserRepository) UpdateUser(dbUser *schema.User, user *schema.User) erro
 		}
 	}
 
-	// Toggled until greenlit
-	// if dbUser.HasRole(schema.RoleManager) && !reflect.DeepEqual(dbUser.Projects, user.Projects) {
-	// 	projects, _ := json.Marshal(user.Projects)
-	// 	if _, err := sq.Update("hpc_user").Set("projects", projects).Where("hpc_user.username = ?", dbUser.Username).RunWith(r.DB).Exec(); err != nil {
-	// 		return err
-	// 	}
-	// }
+	// --- Def Helpers ---
+	// Helper to update roles
+	updateRoles := func(roles []string) error {
+		rolesJSON, _ := json.Marshal(roles)
+		_, err := sq.Update("hpc_user").Set("roles", rolesJSON).Where("hpc_user.username = ?", dbUser.Username).RunWith(r.DB).Exec()
+		return err
+	}
+
+	// Helper to update projects
+	updateProjects := func(projects []string) error {
+		projectsJSON, _ := json.Marshal(projects)
+		_, err := sq.Update("hpc_user").Set("projects", projectsJSON).Where("hpc_user.username = ?",
dbUser.Username).RunWith(r.DB).Exec() + return err + } + + // Helper to clear projects + clearProjects := func() error { + _, err := sq.Update("hpc_user").Set("projects", "[]").Where("hpc_user.username = ?", dbUser.Username).RunWith(r.DB).Exec() + return err + } + + // --- Manager Role Handling --- + if dbUser.HasRole(schema.RoleManager) && user.HasRole(schema.RoleManager) && !reflect.DeepEqual(dbUser.Projects, user.Projects) { + // Existing Manager: update projects + if err := updateProjects(user.Projects); err != nil { + return err + } + } else if dbUser.HasRole(schema.RoleUser) && user.HasRole(schema.RoleManager) && user.HasNotRoles([]schema.Role{schema.RoleAdmin}) { + // New Manager: update roles and projects + if err := updateRoles(user.Roles); err != nil { + return err + } + if err := updateProjects(user.Projects); err != nil { + return err + } + } else if dbUser.HasRole(schema.RoleManager) && user.HasNotRoles([]schema.Role{schema.RoleAdmin, schema.RoleManager}) { + // Remove Manager: update roles and clear projects + if err := updateRoles(user.Roles); err != nil { + return err + } + if err := clearProjects(); err != nil { + return err + } + } + + // --- Support Role Handling --- + if dbUser.HasRole(schema.RoleUser) && dbUser.HasNotRoles([]schema.Role{schema.RoleSupport}) && + user.HasRole(schema.RoleSupport) && user.HasNotRoles([]schema.Role{schema.RoleAdmin}) { + // New Support: update roles + if err := updateRoles(user.Roles); err != nil { + return err + } + } else if dbUser.HasRole(schema.RoleSupport) && user.HasNotRoles([]schema.Role{schema.RoleAdmin, schema.RoleSupport}) { + // Remove Support: update roles + if err := updateRoles(user.Roles); err != nil { + return err + } + } return nil } From adebffd2515541da99098dea0bf03fd5ad789935 Mon Sep 17 00:00:00 2001 From: Jan Eitzinger Date: Fri, 27 Feb 2026 17:40:32 +0100 Subject: [PATCH 12/20] Replace the old zip archive options for the metricstore node data by parquet files --- pkg/metricstore/archive.go | 221 +++++++++++++-------- pkg/metricstore/parquetArchive.go | 213 +++++++++++++++++++++ pkg/metricstore/parquetArchive_test.go | 255 +++++++++++++++++++++++++ 3 files changed, 606 insertions(+), 83 deletions(-) create mode 100644 pkg/metricstore/parquetArchive.go create mode 100644 pkg/metricstore/parquetArchive_test.go diff --git a/pkg/metricstore/archive.go b/pkg/metricstore/archive.go index d3617f2c..77f4264a 100644 --- a/pkg/metricstore/archive.go +++ b/pkg/metricstore/archive.go @@ -6,12 +6,9 @@ package metricstore import ( - "archive/zip" - "bufio" "context" "errors" "fmt" - "io" "os" "path/filepath" "sync" @@ -47,7 +44,7 @@ func CleanUp(wg *sync.WaitGroup, ctx context.Context) { } } -// runWorker takes simple values to configure what it does +// cleanUpWorker takes simple values to configure what it does func cleanUpWorker(wg *sync.WaitGroup, ctx context.Context, interval string, mode string, cleanupDir string, delete bool) { wg.Go(func() { @@ -75,10 +72,10 @@ func cleanUpWorker(wg *sync.WaitGroup, ctx context.Context, interval string, mod if err != nil { cclog.Errorf("[METRICSTORE]> %s failed: %s", mode, err.Error()) } else { - if delete && cleanupDir == "" { + if delete { cclog.Infof("[METRICSTORE]> done: %d checkpoints deleted", n) } else { - cclog.Infof("[METRICSTORE]> done: %d files zipped and moved to archive", n) + cclog.Infof("[METRICSTORE]> done: %d checkpoint files archived to parquet", n) } } } @@ -88,17 +85,26 @@ func cleanUpWorker(wg *sync.WaitGroup, ctx context.Context, interval string, mod var 
ErrNoNewArchiveData error = errors.New("all data already archived") -// Delete or ZIP all checkpoint files older than `from` together and write them to the `cleanupDir`, -// deleting/moving them from the `checkpointsDir`. +// CleanupCheckpoints deletes or archives all checkpoint files older than `from`. +// When archiving, consolidates all hosts per cluster into a single Parquet file. func CleanupCheckpoints(checkpointsDir, cleanupDir string, from int64, deleteInstead bool) (int, error) { + if deleteInstead { + return deleteCheckpoints(checkpointsDir, from) + } + + return archiveCheckpoints(checkpointsDir, cleanupDir, from) +} + +// deleteCheckpoints removes checkpoint files older than `from` across all clusters/hosts. +func deleteCheckpoints(checkpointsDir string, from int64) (int, error) { entries1, err := os.ReadDir(checkpointsDir) if err != nil { return 0, err } type workItem struct { - cdir, adir string - cluster, host string + dir string + cluster, host string } var wg sync.WaitGroup @@ -109,13 +115,29 @@ func CleanupCheckpoints(checkpointsDir, cleanupDir string, from int64, deleteIns for worker := 0; worker < Keys.NumWorkers; worker++ { go func() { defer wg.Done() - for workItem := range work { - m, err := cleanupCheckpoints(workItem.cdir, workItem.adir, from, deleteInstead) + for item := range work { + entries, err := os.ReadDir(item.dir) if err != nil { - cclog.Errorf("error while archiving %s/%s: %s", workItem.cluster, workItem.host, err.Error()) + cclog.Errorf("error reading %s/%s: %s", item.cluster, item.host, err.Error()) atomic.AddInt32(&errs, 1) + continue + } + + files, err := findFiles(entries, from, false) + if err != nil { + cclog.Errorf("error finding files in %s/%s: %s", item.cluster, item.host, err.Error()) + atomic.AddInt32(&errs, 1) + continue + } + + for _, checkpoint := range files { + if err := os.Remove(filepath.Join(item.dir, checkpoint)); err != nil { + cclog.Errorf("error deleting %s/%s/%s: %s", item.cluster, item.host, checkpoint, err.Error()) + atomic.AddInt32(&errs, 1) + } else { + atomic.AddInt32(&n, 1) + } } - atomic.AddInt32(&n, int32(m)) } }() } @@ -124,14 +146,14 @@ func CleanupCheckpoints(checkpointsDir, cleanupDir string, from int64, deleteIns entries2, e := os.ReadDir(filepath.Join(checkpointsDir, de1.Name())) if e != nil { err = e + continue } for _, de2 := range entries2 { - cdir := filepath.Join(checkpointsDir, de1.Name(), de2.Name()) - adir := filepath.Join(cleanupDir, de1.Name(), de2.Name()) work <- workItem{ - adir: adir, cdir: cdir, - cluster: de1.Name(), host: de2.Name(), + dir: filepath.Join(checkpointsDir, de1.Name(), de2.Name()), + cluster: de1.Name(), + host: de2.Name(), } } } @@ -142,85 +164,118 @@ func CleanupCheckpoints(checkpointsDir, cleanupDir string, from int64, deleteIns if err != nil { return int(n), err } - if errs > 0 { - return int(n), fmt.Errorf("%d errors happened while archiving (%d successes)", errs, n) + return int(n), fmt.Errorf("%d errors happened while deleting (%d successes)", errs, n) } return int(n), nil } -// Helper function for `CleanupCheckpoints`. -func cleanupCheckpoints(dir string, cleanupDir string, from int64, deleteInstead bool) (int, error) { - entries, err := os.ReadDir(dir) +// archiveCheckpoints archives checkpoint files to Parquet format. 
+// Produces one Parquet file per cluster: <cleanupDir>/<cluster>/<from>.parquet
+func archiveCheckpoints(checkpointsDir, cleanupDir string, from int64) (int, error) {
+	clusterEntries, err := os.ReadDir(checkpointsDir)
+	if err != nil {
+		return 0, err
+	}
+
+	totalFiles := 0
+
+	for _, clusterEntry := range clusterEntries {
+		if !clusterEntry.IsDir() {
+			continue
+		}
+
+		cluster := clusterEntry.Name()
+		hostEntries, err := os.ReadDir(filepath.Join(checkpointsDir, cluster))
+		if err != nil {
+			return totalFiles, err
+		}
+
+		// Collect rows from all hosts in this cluster using worker pool
+		type hostResult struct {
+			rows  []ParquetMetricRow
+			files []string // checkpoint filenames to delete after successful write
+			dir   string   // checkpoint directory for this host
+		}
+
+		results := make(chan hostResult, len(hostEntries))
+		work := make(chan struct {
+			dir, host string
+		}, Keys.NumWorkers)
+
+		var wg sync.WaitGroup
+		errs := int32(0)
+
+		wg.Add(Keys.NumWorkers)
+		for w := 0; w < Keys.NumWorkers; w++ {
+			go func() {
+				defer wg.Done()
+				for item := range work {
+					rows, files, err := archiveCheckpointsToParquet(item.dir, cluster, item.host, from)
+					if err != nil {
+						cclog.Errorf("[METRICSTORE]> error reading checkpoints for %s/%s: %s", cluster, item.host, err.Error())
+						atomic.AddInt32(&errs, 1)
+						continue
+					}
+					if len(rows) > 0 {
+						results <- hostResult{rows: rows, files: files, dir: item.dir}
+					}
+				}
+			}()
+		}
+
+		go func() {
+			for _, hostEntry := range hostEntries {
+				if !hostEntry.IsDir() {
+					continue
+				}
+				dir := filepath.Join(checkpointsDir, cluster, hostEntry.Name())
+				work <- struct {
+					dir, host string
+				}{dir: dir, host: hostEntry.Name()}
+			}
+			close(work)
+			wg.Wait()
+			close(results)
+		}()
+
+		// Collect all rows and file info
+		var allRows []ParquetMetricRow
+		var allResults []hostResult
+		for r := range results {
+			allRows = append(allRows, r.rows...)
+			allResults = append(allResults, r)
+		}
+
+		if errs > 0 {
+			return totalFiles, fmt.Errorf("%d errors reading checkpoints for cluster %s", errs, cluster)
+		}
+
+		if len(allRows) == 0 {
+			continue
+		}
+
+		// Write one Parquet file per cluster
+		parquetFile := filepath.Join(cleanupDir, cluster, fmt.Sprintf("%d.parquet", from))
+		if err := writeParquetArchive(parquetFile, allRows); err != nil {
+			return totalFiles, fmt.Errorf("writing parquet archive for cluster %s: %w", cluster, err)
+		}
+
+		// Delete archived checkpoint files
+		for _, result := range allResults {
+			for _, file := range result.files {
+				filename := filepath.Join(result.dir, file)
+				if err := os.Remove(filename); err != nil {
+					cclog.Warnf("[METRICSTORE]> could not remove archived checkpoint %s: %v", filename, err)
+				} else {
+					totalFiles++
+				}
+			}
+		}
+
+		cclog.Infof("[METRICSTORE]> archived %d rows from %d files for cluster %s to %s",
+			len(allRows), totalFiles, cluster, parquetFile)
+	}
+
+	return totalFiles, nil
+}
diff --git a/pkg/metricstore/parquetArchive.go b/pkg/metricstore/parquetArchive.go
new file mode 100644
index 00000000..420ee4e5
--- /dev/null
+++ b/pkg/metricstore/parquetArchive.go
@@ -0,0 +1,213 @@
+// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
+// All rights reserved. This file is part of cc-backend.
+// Use of this source code is governed by a MIT-style
+// license that can be found in the LICENSE file.
+
+package metricstore
+
+import (
+	"bufio"
+	"encoding/binary"
+	"encoding/json"
+	"fmt"
+	"os"
+	"path/filepath"
+
+	cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
+	pq "github.com/parquet-go/parquet-go"
+)
+
+// ParquetMetricRow is the long-format schema for archived metric data.
+// One row per (host, metric, scope, scope_id, timestamp) data point.
+// Sorted by (cluster, hostname, metric, timestamp) for optimal compression.
+type ParquetMetricRow struct {
+	Cluster   string  `parquet:"cluster"`
+	Hostname  string  `parquet:"hostname"`
+	Metric    string  `parquet:"metric"`
+	Scope     string  `parquet:"scope"`
+	ScopeID   string  `parquet:"scope_id"`
+	Timestamp int64   `parquet:"timestamp"`
+	Frequency int64   `parquet:"frequency"`
+	Value     float32 `parquet:"value"`
+}
+
+// flattenCheckpointFile recursively converts a CheckpointFile tree into Parquet rows.
+// The scope path is built from the hierarchy: host level is "node", then child names
+// map to scope/scope_id (e.g., "socket0" → scope="socket", scope_id="0").
+func flattenCheckpointFile(cf *CheckpointFile, cluster, hostname, scope, scopeID string, rows []ParquetMetricRow) []ParquetMetricRow {
+	for metricName, cm := range cf.Metrics {
+		ts := cm.Start
+		for _, v := range cm.Data {
+			if !v.IsNaN() {
+				rows = append(rows, ParquetMetricRow{
+					Cluster:   cluster,
+					Hostname:  hostname,
+					Metric:    metricName,
+					Scope:     scope,
+					ScopeID:   scopeID,
+					Timestamp: ts,
+					Frequency: cm.Frequency,
+					Value:     float32(v),
+				})
+			}
+			ts += cm.Frequency
+		}
+	}
+
+	for childName, childCf := range cf.Children {
+		childScope, childScopeID := parseScopeFromName(childName)
+		rows = flattenCheckpointFile(childCf, cluster, hostname, childScope, childScopeID, rows)
+	}
+
+	return rows
+}
+
+// parseScopeFromName infers scope and scope_id from a child level name.
+// Examples: "socket0" → ("socket", "0"), "core12" → ("core", "12"),
+// "accelerator0" → ("accelerator", "0").
+// If the name doesn't match known patterns, it's used as-is for scope with empty scope_id.
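+//
+// Illustrative calls (these cases are mirrored in TestParseScopeFromName):
+//
+//	parseScopeFromName("socket1") // → ("socket", "1")
+//	parseScopeFromName("cpu0")    // → ("hwthread", "0")
+//	parseScopeFromName("socketX") // → ("socketX", "") — suffix is not numeric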
+func parseScopeFromName(name string) (string, string) { + prefixes := []struct { + prefix string + scope string + }{ + {"socket", "socket"}, + {"memoryDomain", "memoryDomain"}, + {"core", "core"}, + {"hwthread", "hwthread"}, + {"cpu", "hwthread"}, + {"accelerator", "accelerator"}, + } + + for _, p := range prefixes { + if len(name) > len(p.prefix) && name[:len(p.prefix)] == p.prefix { + id := name[len(p.prefix):] + if len(id) > 0 && id[0] >= '0' && id[0] <= '9' { + return p.scope, id + } + } + } + + return name, "" +} + +// writeParquetArchive writes rows to a Parquet file with Zstd compression. +func writeParquetArchive(filename string, rows []ParquetMetricRow) error { + if err := os.MkdirAll(filepath.Dir(filename), CheckpointDirPerms); err != nil { + return fmt.Errorf("creating archive directory: %w", err) + } + + f, err := os.OpenFile(filename, os.O_CREATE|os.O_WRONLY|os.O_TRUNC, CheckpointFilePerms) + if err != nil { + return fmt.Errorf("creating parquet file: %w", err) + } + defer f.Close() + + bw := bufio.NewWriterSize(f, 1<<20) // 1MB write buffer + + writer := pq.NewGenericWriter[ParquetMetricRow](bw, + pq.Compression(&pq.Zstd), + pq.SortingWriterConfig(pq.SortingColumns( + pq.Ascending("cluster"), + pq.Ascending("hostname"), + pq.Ascending("metric"), + pq.Ascending("timestamp"), + )), + ) + + if _, err := writer.Write(rows); err != nil { + return fmt.Errorf("writing parquet rows: %w", err) + } + + if err := writer.Close(); err != nil { + return fmt.Errorf("closing parquet writer: %w", err) + } + + if err := bw.Flush(); err != nil { + return fmt.Errorf("flushing parquet file: %w", err) + } + + return nil +} + +// loadCheckpointFileFromDisk reads a JSON or binary checkpoint file and returns +// a CheckpointFile. Used by the Parquet archiver to read checkpoint data +// before converting it to Parquet format. +func loadCheckpointFileFromDisk(filename string) (*CheckpointFile, error) { + f, err := os.Open(filename) + if err != nil { + return nil, err + } + defer f.Close() + + ext := filepath.Ext(filename) + switch ext { + case ".json": + cf := &CheckpointFile{} + br := bufio.NewReader(f) + if err := json.NewDecoder(br).Decode(cf); err != nil { + return nil, fmt.Errorf("decoding JSON checkpoint %s: %w", filename, err) + } + return cf, nil + + case ".bin": + br := bufio.NewReader(f) + var magic uint32 + if err := binary.Read(br, binary.LittleEndian, &magic); err != nil { + return nil, fmt.Errorf("reading magic from %s: %w", filename, err) + } + if magic != snapFileMagic { + return nil, fmt.Errorf("invalid snapshot magic in %s: 0x%08X", filename, magic) + } + var fileFrom, fileTo int64 + if err := binary.Read(br, binary.LittleEndian, &fileFrom); err != nil { + return nil, fmt.Errorf("reading from-timestamp from %s: %w", filename, err) + } + if err := binary.Read(br, binary.LittleEndian, &fileTo); err != nil { + return nil, fmt.Errorf("reading to-timestamp from %s: %w", filename, err) + } + cf, err := readBinaryLevel(br) + if err != nil { + return nil, fmt.Errorf("reading binary level from %s: %w", filename, err) + } + cf.From = fileFrom + cf.To = fileTo + return cf, nil + + default: + return nil, fmt.Errorf("unsupported checkpoint extension: %s", ext) + } +} + +// archiveCheckpointsToParquet reads checkpoint files for a host directory, +// converts them to Parquet rows. Returns the rows and filenames that were processed. 
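+//
+// The resulting files can be read back with parquet-go; a minimal sketch,
+// mirroring the round-trip test (paths illustrative, error handling omitted):
+//
+//	f, _ := os.Open("archive/testcluster/1000.parquet")
+//	stat, _ := f.Stat()
+//	pf, _ := pq.OpenFile(f, stat.Size())
+//	reader := pq.NewGenericReader[ParquetMetricRow](pf)
+//	rows := make([]ParquetMetricRow, 1024)
+//	n, _ := reader.Read(rows) // rows[:n] holds decoded data points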
+func archiveCheckpointsToParquet(dir, cluster, host string, from int64) ([]ParquetMetricRow, []string, error) { + entries, err := os.ReadDir(dir) + if err != nil { + return nil, nil, err + } + + files, err := findFiles(entries, from, false) + if err != nil { + return nil, nil, err + } + + if len(files) == 0 { + return nil, nil, nil + } + + var rows []ParquetMetricRow + + for _, checkpoint := range files { + filename := filepath.Join(dir, checkpoint) + cf, err := loadCheckpointFileFromDisk(filename) + if err != nil { + cclog.Warnf("[METRICSTORE]> skipping unreadable checkpoint %s: %v", filename, err) + continue + } + + rows = flattenCheckpointFile(cf, cluster, host, "node", "", rows) + } + + return rows, files, nil +} diff --git a/pkg/metricstore/parquetArchive_test.go b/pkg/metricstore/parquetArchive_test.go new file mode 100644 index 00000000..d3d70c02 --- /dev/null +++ b/pkg/metricstore/parquetArchive_test.go @@ -0,0 +1,255 @@ +// Copyright (C) NHR@FAU, University Erlangen-Nuremberg. +// All rights reserved. This file is part of cc-backend. +// Use of this source code is governed by a MIT-style +// license that can be found in the LICENSE file. + +package metricstore + +import ( + "encoding/json" + "os" + "path/filepath" + "testing" + + "github.com/ClusterCockpit/cc-lib/v2/schema" + pq "github.com/parquet-go/parquet-go" +) + +func TestParseScopeFromName(t *testing.T) { + tests := []struct { + name string + wantScope string + wantID string + }{ + {"socket0", "socket", "0"}, + {"socket12", "socket", "12"}, + {"core0", "core", "0"}, + {"core127", "core", "127"}, + {"cpu0", "hwthread", "0"}, + {"hwthread5", "hwthread", "5"}, + {"memoryDomain0", "memoryDomain", "0"}, + {"accelerator0", "accelerator", "0"}, + {"unknown", "unknown", ""}, + {"socketX", "socketX", ""}, // not numeric suffix + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + scope, id := parseScopeFromName(tt.name) + if scope != tt.wantScope || id != tt.wantID { + t.Errorf("parseScopeFromName(%q) = (%q, %q), want (%q, %q)", + tt.name, scope, id, tt.wantScope, tt.wantID) + } + }) + } +} + +func TestFlattenCheckpointFile(t *testing.T) { + cf := &CheckpointFile{ + From: 1000, + To: 1060, + Metrics: map[string]*CheckpointMetrics{ + "cpu_load": { + Frequency: 60, + Start: 1000, + Data: []schema.Float{0.5, 0.7, schema.NaN}, + }, + }, + Children: map[string]*CheckpointFile{ + "socket0": { + Metrics: map[string]*CheckpointMetrics{ + "mem_bw": { + Frequency: 60, + Start: 1000, + Data: []schema.Float{100.0, schema.NaN, 200.0}, + }, + }, + Children: make(map[string]*CheckpointFile), + }, + }, + } + + rows := flattenCheckpointFile(cf, "fritz", "node001", "node", "", nil) + + // cpu_load: 2 non-NaN values at node scope + // mem_bw: 2 non-NaN values at socket0 scope + if len(rows) != 4 { + t.Fatalf("expected 4 rows, got %d", len(rows)) + } + + // Verify a node-scope row + found := false + for _, r := range rows { + if r.Metric == "cpu_load" && r.Timestamp == 1000 { + found = true + if r.Cluster != "fritz" || r.Hostname != "node001" || r.Scope != "node" || r.Value != 0.5 { + t.Errorf("unexpected row: %+v", r) + } + } + } + if !found { + t.Error("expected cpu_load row at timestamp 1000") + } + + // Verify a socket-scope row + found = false + for _, r := range rows { + if r.Metric == "mem_bw" && r.Scope == "socket" && r.ScopeID == "0" { + found = true + } + } + if !found { + t.Error("expected mem_bw row with scope=socket, scope_id=0") + } +} + +func TestParquetArchiveRoundtrip(t *testing.T) { + tmpDir := t.TempDir() + + // 
Create checkpoint files on disk (JSON format) + cpDir := filepath.Join(tmpDir, "checkpoints", "testcluster", "node001") + if err := os.MkdirAll(cpDir, 0o755); err != nil { + t.Fatal(err) + } + + cf := &CheckpointFile{ + From: 1000, + To: 1180, + Metrics: map[string]*CheckpointMetrics{ + "cpu_load": { + Frequency: 60, + Start: 1000, + Data: []schema.Float{0.5, 0.7, 0.9}, + }, + "mem_used": { + Frequency: 60, + Start: 1000, + Data: []schema.Float{45.0, 46.0, 47.0}, + }, + }, + Children: map[string]*CheckpointFile{ + "socket0": { + Metrics: map[string]*CheckpointMetrics{ + "mem_bw": { + Frequency: 60, + Start: 1000, + Data: []schema.Float{100.0, 110.0, 120.0}, + }, + }, + Children: make(map[string]*CheckpointFile), + }, + }, + } + + // Write JSON checkpoint + cpFile := filepath.Join(cpDir, "1000.json") + data, err := json.Marshal(cf) + if err != nil { + t.Fatal(err) + } + if err := os.WriteFile(cpFile, data, 0o644); err != nil { + t.Fatal(err) + } + + // Archive to Parquet + archiveDir := filepath.Join(tmpDir, "archive") + rows, files, err := archiveCheckpointsToParquet(cpDir, "testcluster", "node001", 2000) + if err != nil { + t.Fatal(err) + } + if len(files) != 1 || files[0] != "1000.json" { + t.Fatalf("expected 1 file, got %v", files) + } + + parquetFile := filepath.Join(archiveDir, "testcluster", "1000.parquet") + if err := writeParquetArchive(parquetFile, rows); err != nil { + t.Fatal(err) + } + + // Read back and verify + f, err := os.Open(parquetFile) + if err != nil { + t.Fatal(err) + } + defer f.Close() + + stat, _ := f.Stat() + pf, err := pq.OpenFile(f, stat.Size()) + if err != nil { + t.Fatal(err) + } + + reader := pq.NewGenericReader[ParquetMetricRow](pf) + readRows := make([]ParquetMetricRow, 100) + n, err := reader.Read(readRows) + if err != nil && n == 0 { + t.Fatal(err) + } + readRows = readRows[:n] + reader.Close() + + // We expect: cpu_load(3) + mem_used(3) + mem_bw(3) = 9 rows + if n != 9 { + t.Fatalf("expected 9 rows in parquet file, got %d", n) + } + + // Verify cluster and hostname are set correctly + for _, r := range readRows { + if r.Cluster != "testcluster" { + t.Errorf("expected cluster=testcluster, got %s", r.Cluster) + } + if r.Hostname != "node001" { + t.Errorf("expected hostname=node001, got %s", r.Hostname) + } + } + + // Verify parquet file is smaller than JSON (compression working) + if stat.Size() == 0 { + t.Error("parquet file is empty") + } + + t.Logf("Parquet file size: %d bytes for %d rows", stat.Size(), n) +} + +func TestLoadCheckpointFileFromDisk_JSON(t *testing.T) { + tmpDir := t.TempDir() + + cf := &CheckpointFile{ + From: 1000, + To: 1060, + Metrics: map[string]*CheckpointMetrics{ + "test_metric": { + Frequency: 60, + Start: 1000, + Data: []schema.Float{1.0, 2.0, 3.0}, + }, + }, + Children: make(map[string]*CheckpointFile), + } + + filename := filepath.Join(tmpDir, "1000.json") + data, err := json.Marshal(cf) + if err != nil { + t.Fatal(err) + } + if err := os.WriteFile(filename, data, 0o644); err != nil { + t.Fatal(err) + } + + loaded, err := loadCheckpointFileFromDisk(filename) + if err != nil { + t.Fatal(err) + } + + if loaded.From != 1000 || loaded.To != 1060 { + t.Errorf("expected From=1000, To=1060, got From=%d, To=%d", loaded.From, loaded.To) + } + + m, ok := loaded.Metrics["test_metric"] + if !ok { + t.Fatal("expected test_metric in loaded checkpoint") + } + if m.Frequency != 60 || m.Start != 1000 || len(m.Data) != 3 { + t.Errorf("unexpected metric data: %+v", m) + } +} From 1ec41d8389e81e7c517960dda251ec6a8a53ad39 Mon Sep 17 00:00:00 2001 
From: Jan Eitzinger
Date: Sat, 28 Feb 2026 19:34:33 +0100
Subject: [PATCH 13/20] Review and improve buffer pool implementation. Add unit tests.

---
 pkg/metricstore/buffer.go           |  75 ++--
 pkg/metricstore/level.go            |   9 +-
 pkg/metricstore/metricstore.go      |   2 +-
 pkg/metricstore/metricstore_test.go | 520 ++++++++++++++++++++++++++++
 4 files changed, 566 insertions(+), 40 deletions(-)

diff --git a/pkg/metricstore/buffer.go b/pkg/metricstore/buffer.go
index f486e645..557a941c 100644
--- a/pkg/metricstore/buffer.go
+++ b/pkg/metricstore/buffer.go
@@ -54,6 +54,10 @@ import (
 // of data or reallocation needs to happen on writes.
 const BufferCap int = DefaultBufferCapacity
 
+// maxPoolSize caps the number of buffers held in the pool at any time.
+// Prevents unbounded memory growth after large retention-cleanup bursts.
+const maxPoolSize = 4096
+
 // BufferPool is the global instance.
 // It is initialized immediately when the package loads.
 var bufferPool = NewPersistentBufferPool()
@@ -89,12 +93,18 @@ func (p *PersistentBufferPool) Get() *buffer {
 	return b
 }
 
+// Put returns b to the pool. The caller must set b.lastUsed = time.Now().Unix()
+// before calling Put so that Clean() can evict idle entries correctly.
 func (p *PersistentBufferPool) Put(b *buffer) {
 	// Reset the buffer before putting it back
 	b.data = b.data[:0]
 
 	p.mu.Lock()
 	defer p.mu.Unlock()
+	if len(p.pool) >= maxPoolSize {
+		// Pool is full; drop the buffer and let GC collect it.
+		return
+	}
 	p.pool = append(p.pool, b)
 }
 
@@ -121,13 +131,11 @@ func (p *PersistentBufferPool) Clean(threshold int64) {
 	p.mu.Lock()
 	defer p.mu.Unlock()
 
-	// Filter in place
+	// Filter in place, retaining only buffers returned to the pool recently enough.
 	active := p.pool[:0]
 	for _, b := range p.pool {
 		if b.lastUsed >= threshold {
 			active = append(active, b)
-		} else {
-			// Buffer is older than the threshold, let it be collected by GC
 		}
 	}
 
@@ -139,19 +147,6 @@ func (p *PersistentBufferPool) Clean(threshold int64) {
 	p.pool = active
 }
 
-// CleanAll removes all buffers from the pool.
-func (p *PersistentBufferPool) CleanAll() {
-	p.mu.Lock()
-	defer p.mu.Unlock()
-
-	// Nullify all buffers to prevent memory leaks
-	for i := range p.pool {
-		p.pool[i] = nil
-	}
-
-	p.pool = p.pool[:0]
-}
-
 var (
 	// ErrNoData indicates no time-series data exists for the requested metric/level.
 	ErrNoData error = errors.New("[METRICSTORE]> no data for this metric/level")
@@ -276,11 +271,13 @@ func (b *buffer) firstWrite() int64 {
 //
 // Panics if 'data' slice is too small to hold all values in [from, to).
 func (b *buffer) read(from, to int64, data []schema.Float) ([]schema.Float, int64, int64, error) {
-	if from < b.firstWrite() {
-		if b.prev != nil {
-			return b.prev.read(from, to, data)
+	// Walk back to the buffer that covers 'from', adjusting if we hit the oldest.
+	for from < b.firstWrite() {
+		if b.prev == nil {
+			from = b.firstWrite()
+			break
 		}
-		from = b.firstWrite()
+		b = b.prev
 	}
 
 	i := 0
@@ -292,16 +289,17 @@ func (b *buffer) read(from, to int64, data []schema.Float) ([]schema.Float, int6
 			break
 		}
 		b = b.next
-		idx = 0
+		// Recalculate idx in the new buffer; a gap between buffers may exist.
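+		// e.g. t=25, b.start=0, frequency=10 → idx = (25-0)/10 = 2
+		// (the case exercised by TestBufferReadIdxAfterSwitch below).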
+ idx = int((t - b.start) / b.frequency) } if idx >= len(b.data) { if b.next == nil || to <= b.next.start { break } - data[i] += schema.NaN + data[i] += schema.NaN // NaN + anything = NaN; propagates missing data } else if t < b.start { - data[i] += schema.NaN + data[i] += schema.NaN // gap before this buffer's first write } else { data[i] += b.data[idx] } @@ -359,11 +357,12 @@ func (b *buffer) forceFreeOldest() (delme bool, n int) { // If the previous buffer signals it should be deleted: if delPrev { - // Clear links on the dying buffer to prevent leaks b.prev.next = nil - b.prev.data = nil // Release the underlying float slice immediately - - // Remove the link from the current buffer + if cap(b.prev.data) != BufferCap { + b.prev.data = make([]schema.Float, 0, BufferCap) + } + b.prev.lastUsed = time.Now().Unix() + bufferPool.Put(b.prev) b.prev = nil } return false, freed @@ -392,21 +391,27 @@ func (b *buffer) iterFromTo(from, to int64, callback func(b *buffer) error) erro return nil } - if err := b.prev.iterFromTo(from, to, callback); err != nil { - return err + // Collect overlapping buffers walking backwards (newest → oldest). + var matching []*buffer + for cur := b; cur != nil; cur = cur.prev { + if from <= cur.end() && cur.start <= to { + matching = append(matching, cur) + } } - if from <= b.end() && b.start <= to { - return callback(b) + // Invoke callback in chronological order (oldest → newest). + for i := len(matching) - 1; i >= 0; i-- { + if err := callback(matching[i]); err != nil { + return err + } } - return nil } func (b *buffer) count() int64 { - res := int64(len(b.data)) - if b.prev != nil { - res += b.prev.count() + var res int64 + for ; b != nil; b = b.prev { + res += int64(len(b.data)) } return res } diff --git a/pkg/metricstore/level.go b/pkg/metricstore/level.go index ef082579..2b24a2ea 100644 --- a/pkg/metricstore/level.go +++ b/pkg/metricstore/level.go @@ -238,12 +238,13 @@ func (l *Level) forceFree() (int, error) { // If delme is true, it means 'b' itself (the head) was the oldest // and needs to be removed from the slice. if delme { - // Nil out fields to ensure no hanging references - b.next = nil b.prev = nil - b.data = nil - + if cap(b.data) != BufferCap { + b.data = make([]schema.Float, 0, BufferCap) + } + b.lastUsed = time.Now().Unix() + bufferPool.Put(b) l.metrics[i] = nil } } diff --git a/pkg/metricstore/metricstore.go b/pkg/metricstore/metricstore.go index db3e4357..b5b1a528 100644 --- a/pkg/metricstore/metricstore.go +++ b/pkg/metricstore/metricstore.go @@ -428,7 +428,7 @@ func MemoryUsageTracker(wg *sync.WaitGroup, ctx context.Context) { runtime.ReadMemStats(&mem) actualMemoryGB = float64(mem.Alloc) / 1e9 - bufferPool.CleanAll() + bufferPool.Clear() cclog.Infof("[METRICSTORE]> Cleaned up bufferPool\n") if actualMemoryGB > float64(Keys.MemoryCap) { diff --git a/pkg/metricstore/metricstore_test.go b/pkg/metricstore/metricstore_test.go index 772fd7ea..9087df2a 100644 --- a/pkg/metricstore/metricstore_test.go +++ b/pkg/metricstore/metricstore_test.go @@ -12,6 +12,526 @@ import ( "github.com/ClusterCockpit/cc-lib/v2/schema" ) +// ─── Buffer pool ───────────────────────────────────────────────────────────── + +// TestBufferPoolGetReuse verifies that Get() returns pooled buffers before +// allocating new ones, and that an empty pool allocates a fresh BufferCap buffer. 
+func TestBufferPoolGetReuse(t *testing.T) { + pool := NewPersistentBufferPool() + + original := &buffer{data: make([]schema.Float, 0, BufferCap), lastUsed: time.Now().Unix()} + pool.Put(original) + + reused := pool.Get() + if reused != original { + t.Error("Get() should return the previously pooled buffer") + } + if pool.GetSize() != 0 { + t.Errorf("pool size after Get() = %d, want 0", pool.GetSize()) + } + + // Empty pool must allocate a fresh buffer with the standard capacity. + fresh := pool.Get() + if fresh == nil { + t.Fatal("Get() from empty pool returned nil") + } + if cap(fresh.data) != BufferCap { + t.Errorf("fresh buffer cap = %d, want %d", cap(fresh.data), BufferCap) + } +} + +// TestBufferPoolClear verifies that Clear() drains all entries. +func TestBufferPoolClear(t *testing.T) { + pool := NewPersistentBufferPool() + for i := 0; i < 10; i++ { + pool.Put(&buffer{data: make([]schema.Float, 0), lastUsed: time.Now().Unix()}) + } + pool.Clear() + if pool.GetSize() != 0 { + t.Errorf("pool size after Clear() = %d, want 0", pool.GetSize()) + } +} + +// TestBufferPoolMaxSize verifies that Put() silently drops buffers once the +// pool reaches maxPoolSize, preventing unbounded memory growth. +func TestBufferPoolMaxSize(t *testing.T) { + pool := NewPersistentBufferPool() + for i := 0; i < maxPoolSize; i++ { + pool.Put(&buffer{data: make([]schema.Float, 0, BufferCap), lastUsed: time.Now().Unix()}) + } + if pool.GetSize() != maxPoolSize { + t.Fatalf("pool size = %d, want %d", pool.GetSize(), maxPoolSize) + } + + pool.Put(&buffer{data: make([]schema.Float, 0, BufferCap), lastUsed: time.Now().Unix()}) + if pool.GetSize() != maxPoolSize { + t.Errorf("pool size after overflow Put = %d, want %d (should not grow)", pool.GetSize(), maxPoolSize) + } +} + +// ─── Buffer helpers ─────────────────────────────────────────────────────────── + +// TestBufferEndFirstWrite verifies the end() and firstWrite() calculations. +func TestBufferEndFirstWrite(t *testing.T) { + // start=90, freq=10 → firstWrite = 90+5 = 95 + b := &buffer{data: make([]schema.Float, 4, BufferCap), frequency: 10, start: 90} + if fw := b.firstWrite(); fw != 95 { + t.Errorf("firstWrite() = %d, want 95", fw) + } + // end = firstWrite + len(data)*freq = 95 + 4*10 = 135 + if e := b.end(); e != 135 { + t.Errorf("end() = %d, want 135", e) + } +} + +// ─── Buffer write ───────────────────────────────────────────────────────────── + +// TestBufferWriteNaNFill verifies that skipped timestamps are filled with NaN. +func TestBufferWriteNaNFill(t *testing.T) { + b := newBuffer(100, 10) + b.write(100, schema.Float(1.0)) + // skip 110 and 120 + b.write(130, schema.Float(4.0)) + + if len(b.data) != 4 { + t.Fatalf("len(data) = %d, want 4 (1 value + 2 NaN + 1 value)", len(b.data)) + } + if b.data[0] != schema.Float(1.0) { + t.Errorf("data[0] = %v, want 1.0", b.data[0]) + } + if !b.data[1].IsNaN() { + t.Errorf("data[1] should be NaN (gap), got %v", b.data[1]) + } + if !b.data[2].IsNaN() { + t.Errorf("data[2] should be NaN (gap), got %v", b.data[2]) + } + if b.data[3] != schema.Float(4.0) { + t.Errorf("data[3] = %v, want 4.0", b.data[3]) + } +} + +// TestBufferWriteCapacityOverflow verifies that exceeding capacity creates and +// links a new buffer rather than panicking or silently dropping data. +func TestBufferWriteCapacityOverflow(t *testing.T) { + // Cap=2 so the third write must overflow into a new buffer. 
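+	// newBuffer() always hands out BufferCap-sized buffers from the pool,
+	// so the small-capacity buffer is constructed by hand.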
+ b := &buffer{data: make([]schema.Float, 0, 2), frequency: 10, start: 95} + + nb, _ := b.write(100, schema.Float(1.0)) + nb, _ = nb.write(110, schema.Float(2.0)) + nb, err := nb.write(120, schema.Float(3.0)) + if err != nil { + t.Fatalf("write() error = %v", err) + } + if nb == b { + t.Fatal("write() should have returned a new buffer after overflow") + } + if nb.prev != b { + t.Error("new buffer should link back to old via prev") + } + if b.next != nb { + t.Error("old buffer should link forward to new via next") + } + if len(b.data) != 2 { + t.Errorf("old buffer len = %d, want 2 (full)", len(b.data)) + } + if nb.data[0] != schema.Float(3.0) { + t.Errorf("new buffer data[0] = %v, want 3.0", nb.data[0]) + } +} + +// TestBufferWriteOverwrite verifies that writing to an already-occupied index +// replaces the value rather than appending. +func TestBufferWriteOverwrite(t *testing.T) { + b := newBuffer(100, 10) + b.write(100, schema.Float(1.0)) + b.write(110, schema.Float(2.0)) + + // Overwrite the first slot. + b.write(100, schema.Float(99.0)) + if len(b.data) != 2 { + t.Errorf("len(data) after overwrite = %d, want 2 (no append)", len(b.data)) + } + if b.data[0] != schema.Float(99.0) { + t.Errorf("data[0] after overwrite = %v, want 99.0", b.data[0]) + } +} + +// ─── Buffer read ────────────────────────────────────────────────────────────── + +// TestBufferReadBeforeFirstWrite verifies that 'from' is clamped to firstWrite +// when the requested range starts before any data in the chain. +func TestBufferReadBeforeFirstWrite(t *testing.T) { + b := newBuffer(100, 10) // firstWrite = 100 + b.write(100, schema.Float(1.0)) + b.write(110, schema.Float(2.0)) + + data := make([]schema.Float, 10) + result, adjustedFrom, _, err := b.read(50, 120, data) + if err != nil { + t.Fatalf("read() error = %v", err) + } + if adjustedFrom != 100 { + t.Errorf("adjustedFrom = %d, want 100 (clamped to firstWrite)", adjustedFrom) + } + if len(result) != 2 { + t.Errorf("len(result) = %d, want 2", len(result)) + } +} + +// TestBufferReadChain verifies that read() traverses a multi-buffer chain and +// returns contiguous values from both buffers. +// +// The switch to b.next in read() triggers on idx >= cap(b.data), so b1 must +// be full (len == cap) for the loop to advance to b2 without producing NaN. +func TestBufferReadChain(t *testing.T) { + // b1: cap=3, covers t=100..120. b2: covers t=130..150. b2 is head. + b1 := &buffer{data: make([]schema.Float, 0, 3), frequency: 10, start: 95} + b1.data = append(b1.data, 1.0, 2.0, 3.0) // fills b1: len=cap=3 + + b2 := &buffer{data: make([]schema.Float, 0, 3), frequency: 10, start: 125} + b2.data = append(b2.data, 4.0, 5.0, 6.0) // t=130,140,150 + b2.prev = b1 + b1.next = b2 + + data := make([]schema.Float, 6) + result, from, to, err := b2.read(100, 160, data) + if err != nil { + t.Fatalf("read() error = %v", err) + } + if from != 100 || to != 160 { + t.Errorf("read() from/to = %d/%d, want 100/160", from, to) + } + if len(result) != 6 { + t.Fatalf("len(result) = %d, want 6", len(result)) + } + for i, want := range []schema.Float{1, 2, 3, 4, 5, 6} { + if result[i] != want { + t.Errorf("result[%d] = %v, want %v", i, result[i], want) + } + } +} + +// TestBufferReadIdxAfterSwitch is a regression test for the index recalculation +// bug after switching to b.next during a read. 
+// +// When both buffers share the same start time (can happen with checkpoint-loaded +// chains), the old code hardcoded idx=0 after the switch, causing reads at time t +// to return the wrong element from the next buffer. +func TestBufferReadIdxAfterSwitch(t *testing.T) { + // b1: cap=2, both buffers start at 0 (firstWrite=5). + // b1 carries t=5 and t=15; b2 carries t=5,15,25,35 with the same start. + // When reading reaches t=25 the loop overflows b1 (idx=2 >= cap=2) and + // switches to b2. The correct index in b2 is (25-0)/10=2 → b2.data[2]=30.0. + // The old code set idx=0 → b2.data[0]=10.0 (wrong). + b1 := &buffer{data: make([]schema.Float, 0, 2), frequency: 10, start: 0} + b1.data = append(b1.data, schema.Float(1.0), schema.Float(2.0)) // t=5, t=15 + + b2 := &buffer{data: make([]schema.Float, 0, 10), frequency: 10, start: 0} + b2.data = append(b2.data, + schema.Float(10.0), schema.Float(20.0), + schema.Float(30.0), schema.Float(40.0)) // t=5,15,25,35 + b2.prev = b1 + b1.next = b2 + + // from=0 triggers the walkback to b1 (from < b2.firstWrite=5). + // After clamping, the loop runs t=5,15,25,35. + data := make([]schema.Float, 4) + result, _, _, err := b2.read(0, 36, data) + if err != nil { + t.Fatalf("read() error = %v", err) + } + if len(result) < 3 { + t.Fatalf("len(result) = %d, want >= 3", len(result)) + } + if result[0] != schema.Float(1.0) { + t.Errorf("result[0] (t=5) = %v, want 1.0 (from b1)", result[0]) + } + if result[1] != schema.Float(2.0) { + t.Errorf("result[1] (t=15) = %v, want 2.0 (from b1)", result[1]) + } + // This is the critical assertion: old code returned 10.0 (b2.data[0]). + if result[2] != schema.Float(30.0) { + t.Errorf("result[2] (t=25) = %v, want 30.0 (idx recalculation fix)", result[2]) + } +} + +// TestBufferReadNaNValues verifies that NaN slots written to the buffer are +// returned as NaN during read. +func TestBufferReadNaNValues(t *testing.T) { + b := newBuffer(100, 10) + b.write(100, schema.Float(1.0)) + b.write(110, schema.NaN) + b.write(120, schema.Float(3.0)) + + data := make([]schema.Float, 3) + result, _, _, err := b.read(100, 130, data) + if err != nil { + t.Fatalf("read() error = %v", err) + } + if len(result) != 3 { + t.Fatalf("len(result) = %d, want 3", len(result)) + } + if result[0] != schema.Float(1.0) { + t.Errorf("result[0] = %v, want 1.0", result[0]) + } + if !result[1].IsNaN() { + t.Errorf("result[1] should be NaN, got %v", result[1]) + } + if result[2] != schema.Float(3.0) { + t.Errorf("result[2] = %v, want 3.0", result[2]) + } +} + +// TestBufferReadAccumulation verifies the += accumulation pattern used for +// aggregation: values are added to whatever was already in the data slice. +func TestBufferReadAccumulation(t *testing.T) { + b := newBuffer(100, 10) + b.write(100, schema.Float(3.0)) + b.write(110, schema.Float(5.0)) + + // Pre-populate data slice (simulates a second metric being summed in). + data := []schema.Float{2.0, 1.0, 0.0} + result, _, _, err := b.read(100, 120, data) + if err != nil { + t.Fatalf("read() error = %v", err) + } + // 2.0+3.0=5.0, 1.0+5.0=6.0 + if result[0] != schema.Float(5.0) { + t.Errorf("result[0] = %v, want 5.0 (2+3)", result[0]) + } + if result[1] != schema.Float(6.0) { + t.Errorf("result[1] = %v, want 6.0 (1+5)", result[1]) + } +} + +// ─── Buffer free ───────────────────────────────────────────────────────────── + +// newTestPool swaps out the package-level bufferPool for a fresh isolated one +// and returns a cleanup function that restores the original. 
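+//
+// Because the package-level bufferPool is swapped, tests that use newTestPool
+// must not call t.Parallel().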
+func newTestPool(t *testing.T) *PersistentBufferPool { + t.Helper() + pool := NewPersistentBufferPool() + saved := bufferPool + bufferPool = pool + t.Cleanup(func() { bufferPool = saved }) + return pool +} + +// TestBufferFreeRetention verifies that free() removes buffers whose entire +// time range falls before the retention threshold and returns them to the pool. +func TestBufferFreeRetention(t *testing.T) { + pool := newTestPool(t) + + // b1: firstWrite=5, end=25 b2: firstWrite=25, end=45 b3: firstWrite=45, end=65 + b1 := &buffer{data: make([]schema.Float, 0, BufferCap), frequency: 10, start: 0} + b1.data = append(b1.data, 1.0, 2.0) + + b2 := &buffer{data: make([]schema.Float, 0, BufferCap), frequency: 10, start: 20} + b2.data = append(b2.data, 3.0, 4.0) + b2.prev = b1 + b1.next = b2 + + b3 := &buffer{data: make([]schema.Float, 0, BufferCap), frequency: 10, start: 40} + b3.data = append(b3.data, 5.0, 6.0) + b3.prev = b2 + b2.next = b3 + + // Threshold=30: b1.end()=25 < 30 → freed; b2.end()=45 >= 30 → kept. + delme, n := b3.free(30) + if delme { + t.Error("head buffer b3 should not be marked for deletion") + } + if n != 1 { + t.Errorf("freed count = %d, want 1", n) + } + if b2.prev != nil { + t.Error("b1 should have been unlinked from b2.prev") + } + if b3.prev != b2 { + t.Error("b3 should still reference b2") + } + if pool.GetSize() != 1 { + t.Errorf("pool size = %d, want 1 (b1 returned)", pool.GetSize()) + } +} + +// TestBufferFreeAll verifies that free() removes all buffers and signals the +// caller to delete the head when the entire chain is older than the threshold. +func TestBufferFreeAll(t *testing.T) { + pool := newTestPool(t) + + b1 := &buffer{data: make([]schema.Float, 0, BufferCap), frequency: 10, start: 0} + b1.data = append(b1.data, 1.0, 2.0) // end=25 + + b2 := &buffer{data: make([]schema.Float, 0, BufferCap), frequency: 10, start: 20} + b2.data = append(b2.data, 3.0, 4.0) // end=45 + b2.prev = b1 + b1.next = b2 + + // Threshold=100 > both ends → both should be freed. + delme, n := b2.free(100) + if !delme { + t.Error("head buffer b2 should be marked for deletion when all data is stale") + } + if n != 2 { + t.Errorf("freed count = %d, want 2", n) + } + // b1 was freed inside free(); b2 is returned with delme=true for the caller. + if pool.GetSize() != 1 { + t.Errorf("pool size = %d, want 1 (b1 returned; b2 returned by caller)", pool.GetSize()) + } +} + +// ─── forceFreeOldest ───────────────────────────────────────────────────────── + +// TestForceFreeOldestPoolReturn verifies that forceFreeOldest() returns the +// freed buffer to the pool (regression: previously it was just dropped). 
+func TestForceFreeOldestPoolReturn(t *testing.T) { + pool := newTestPool(t) + + b1 := &buffer{data: make([]schema.Float, 0, BufferCap), frequency: 10, start: 0} + b2 := &buffer{data: make([]schema.Float, 0, BufferCap), frequency: 10, start: 20} + b3 := &buffer{data: make([]schema.Float, 0, BufferCap), frequency: 10, start: 40} + b1.data = append(b1.data, 1.0) + b2.data = append(b2.data, 2.0) + b3.data = append(b3.data, 3.0) + b2.prev = b1 + b1.next = b2 + b3.prev = b2 + b2.next = b3 + + delme, n := b3.forceFreeOldest() + if delme { + t.Error("head b3 should not be marked for deletion (chain has 3 buffers)") + } + if n != 1 { + t.Errorf("freed count = %d, want 1", n) + } + if b2.prev != nil { + t.Error("b1 should have been unlinked from b2.prev after forceFreeOldest") + } + if b3.prev != b2 { + t.Error("b3 should still link to b2") + } + if pool.GetSize() != 1 { + t.Errorf("pool size = %d, want 1 (b1 returned to pool)", pool.GetSize()) + } +} + +// TestForceFreeOldestSingleBuffer verifies that forceFreeOldest() returns +// delme=true when the buffer is the only one in the chain. +func TestForceFreeOldestSingleBuffer(t *testing.T) { + b := newBuffer(100, 10) + b.write(100, schema.Float(1.0)) + + delme, n := b.forceFreeOldest() + if !delme { + t.Error("single-buffer chain: expected delme=true (the buffer IS the oldest)") + } + if n != 1 { + t.Errorf("freed count = %d, want 1", n) + } +} + +// ─── iterFromTo ─────────────────────────────────────────────────────────────── + +// TestBufferIterFromToOrder verifies that iterFromTo invokes the callback in +// chronological order (oldest → newest). +func TestBufferIterFromToOrder(t *testing.T) { + // Each buffer has 2 data points so end() = firstWrite + 2*freq. + b1 := &buffer{data: make([]schema.Float, 2, BufferCap), frequency: 10, start: 0} // end=25 + b2 := &buffer{data: make([]schema.Float, 2, BufferCap), frequency: 10, start: 20} // end=45 + b3 := &buffer{data: make([]schema.Float, 2, BufferCap), frequency: 10, start: 40} // end=65 + b2.prev = b1 + b1.next = b2 + b3.prev = b2 + b2.next = b3 + + var order []*buffer + err := b3.iterFromTo(0, 100, func(b *buffer) error { + order = append(order, b) + return nil + }) + if err != nil { + t.Fatalf("iterFromTo() error = %v", err) + } + if len(order) != 3 { + t.Fatalf("callback count = %d, want 3", len(order)) + } + if order[0] != b1 || order[1] != b2 || order[2] != b3 { + t.Error("iterFromTo() did not call callbacks in chronological (oldest→newest) order") + } +} + +// TestBufferIterFromToFiltered verifies that iterFromTo only calls the callback +// for buffers whose time range overlaps [from, to]. +func TestBufferIterFromToFiltered(t *testing.T) { + // b1: end=25 b2: start=20, end=45 b3: start=40, end=65 + b1 := &buffer{data: make([]schema.Float, 2, BufferCap), frequency: 10, start: 0} + b2 := &buffer{data: make([]schema.Float, 2, BufferCap), frequency: 10, start: 20} + b3 := &buffer{data: make([]schema.Float, 2, BufferCap), frequency: 10, start: 40} + b2.prev = b1 + b1.next = b2 + b3.prev = b2 + b2.next = b3 + + // [30,50]: b1.end=25 < 30 → excluded; b2 and b3 overlap → included. + var visited []*buffer + b3.iterFromTo(30, 50, func(b *buffer) error { + visited = append(visited, b) + return nil + }) + if len(visited) != 2 { + t.Fatalf("visited count = %d, want 2 (b2 and b3)", len(visited)) + } + if visited[0] != b2 || visited[1] != b3 { + t.Errorf("visited = %v, want [b2, b3]", visited) + } +} + +// TestBufferIterFromToNilBuffer verifies that iterFromTo on a nil buffer is a +// safe no-op. 
+func TestBufferIterFromToNilBuffer(t *testing.T) { + var b *buffer + called := false + err := b.iterFromTo(0, 100, func(_ *buffer) error { + called = true + return nil + }) + if err != nil { + t.Errorf("iterFromTo(nil) error = %v, want nil", err) + } + if called { + t.Error("callback should not be called for a nil buffer") + } +} + +// ─── count ──────────────────────────────────────────────────────────────────── + +// TestBufferCount verifies that count() sums data-point lengths across the +// entire chain, including all prev links. +func TestBufferCount(t *testing.T) { + b1 := &buffer{data: make([]schema.Float, 3, BufferCap), frequency: 10, start: 0} + b2 := &buffer{data: make([]schema.Float, 2, BufferCap), frequency: 10, start: 35} + b3 := &buffer{data: make([]schema.Float, 5, BufferCap), frequency: 10, start: 60} + b2.prev = b1 + b1.next = b2 + b3.prev = b2 + b2.next = b3 + + if got := b3.count(); got != 10 { + t.Errorf("count() = %d, want 10 (3+2+5)", got) + } + + // Single buffer. + lone := &buffer{data: make([]schema.Float, 7, BufferCap)} + if got := lone.count(); got != 7 { + t.Errorf("count() single buffer = %d, want 7", got) + } +} + +// ─── Existing tests below ──────────────────────────────────────────────────── + func TestAssignAggregationStrategy(t *testing.T) { tests := []struct { name string From 3d5a124321763c3b89323f3e02fcc3584245392a Mon Sep 17 00:00:00 2001 From: Jan Eitzinger Date: Mon, 2 Mar 2026 15:01:44 +0100 Subject: [PATCH 14/20] Refine patterns. Do not match commented lines. --- configs/tagger/apps/caracal.txt | 1 - configs/tagger/apps/lammps.txt | 2 +- internal/tagger/detectApp.go | 6 ++++-- 3 files changed, 5 insertions(+), 4 deletions(-) diff --git a/configs/tagger/apps/caracal.txt b/configs/tagger/apps/caracal.txt index ed615121..5c5311f7 100644 --- a/configs/tagger/apps/caracal.txt +++ b/configs/tagger/apps/caracal.txt @@ -2,6 +2,5 @@ calc_rate qmdffgen dynamic evbopt -explore black_box poly_qmdff diff --git a/configs/tagger/apps/lammps.txt b/configs/tagger/apps/lammps.txt index d254f82f..38d3aa5d 100644 --- a/configs/tagger/apps/lammps.txt +++ b/configs/tagger/apps/lammps.txt @@ -1 +1 @@ -lmp +\blmp\s+ diff --git a/internal/tagger/detectApp.go b/internal/tagger/detectApp.go index f86dcb6c..54626eff 100644 --- a/internal/tagger/detectApp.go +++ b/internal/tagger/detectApp.go @@ -64,9 +64,11 @@ func (t *AppTagger) scanApp(f *os.File, fns string) { if line == "" { continue } - re, err := regexp.Compile(line) + // Wrap pattern to skip comment lines: match only if not preceded by # on the same line + wrapped := `(?m)^[^#]*` + line + re, err := regexp.Compile(wrapped) if err != nil { - cclog.Errorf("invalid regex pattern '%s' in %s: %v", line, fns, err) + cclog.Errorf("invalid regex pattern '%s' (wrapped: '%s') in %s: %v", line, wrapped, fns, err) continue } ai.patterns = append(ai.patterns, re) From a243e1749921abe59480c7b171a6c43cbcbbf09a Mon Sep 17 00:00:00 2001 From: Aditya Ujeniya Date: Mon, 2 Mar 2026 15:27:06 +0100 Subject: [PATCH 15/20] Update to shutdown worker for WAL checkpointing mode --- configs/config-demo.json | 1 + pkg/metricstore/metricstore.go | 2 +- pkg/metricstore/walCheckpoint.go | 12 +++++++++++- 3 files changed, 13 insertions(+), 2 deletions(-) diff --git a/configs/config-demo.json b/configs/config-demo.json index 509c8f18..50dff298 100644 --- a/configs/config-demo.json +++ b/configs/config-demo.json @@ -21,6 +21,7 @@ ], "metric-store": { "checkpoints": { + "file-format": "wal", "interval": "12h" }, "retention-in-memory": "48h", diff --git 
a/pkg/metricstore/metricstore.go b/pkg/metricstore/metricstore.go index b5b1a528..6d49624a 100644 --- a/pkg/metricstore/metricstore.go +++ b/pkg/metricstore/metricstore.go @@ -294,7 +294,7 @@ func Shutdown() { var hostDirs []string files, hostDirs, err = ms.ToCheckpointWAL(Keys.Checkpoints.RootDir, from.Unix(), time.Now().Unix()) if err == nil { - RotateWALFiles(hostDirs) + RotateWALFilesAfterShutdown(hostDirs) } } else { files, err = ms.ToCheckpoint(Keys.Checkpoints.RootDir, from.Unix(), time.Now().Unix()) diff --git a/pkg/metricstore/walCheckpoint.go b/pkg/metricstore/walCheckpoint.go index 685a8388..07414d98 100644 --- a/pkg/metricstore/walCheckpoint.go +++ b/pkg/metricstore/walCheckpoint.go @@ -116,7 +116,6 @@ type walFileState struct { // Also handles WAL rotation requests from the checkpoint goroutine. func WALStaging(wg *sync.WaitGroup, ctx context.Context) { wg.Go(func() { - if Keys.Checkpoints.FileFormat == "json" { return } @@ -235,6 +234,17 @@ func RotateWALFiles(hostDirs []string) { } } +// RotateWALFilesAfterShutdown removes the current WAL file in each given +// host directory; at shutdown the WALStaging worker that would service rotation requests has already exited. +func RotateWALFilesAfterShutdown(hostDirs []string) { + for _, dir := range hostDirs { + walPath := path.Join(dir, "current.wal") + if err := os.Remove(walPath); err != nil && !os.IsNotExist(err) { + cclog.Errorf("[METRICSTORE]> WAL: remove %s: %v", walPath, err) + } + } +} + // buildWALPayload encodes a WALMessage into a binary payload (without magic/length/CRC). func buildWALPayload(msg *WALMessage) []byte { size := 8 + 2 + len(msg.MetricName) + 1 + 4 From 718ff60221028881d01d3b7dd05f21991cc84018 Mon Sep 17 00:00:00 2001 From: Christoph Kluge Date: Mon, 2 Mar 2026 14:10:28 +0100 Subject: [PATCH 16/20] clarify ccms logs --- cmd/cc-backend/main.go | 2 +- internal/metricdispatch/metricdata.go | 4 ++-- .../metricstoreclient/cc-metric-store-queries.go | 4 ++-- internal/metricstoreclient/cc-metric-store.go | 10 +++++----- pkg/metricstore/query.go | 14 +++++++------- 5 files changed, 17 insertions(+), 17 deletions(-) diff --git a/cmd/cc-backend/main.go b/cmd/cc-backend/main.go index 81d397d2..5b51b963 100644 --- a/cmd/cc-backend/main.go +++ b/cmd/cc-backend/main.go @@ -339,7 +339,7 @@ func runServer(ctx context.Context) error { err := metricdispatch.Init(mscfg) if err != nil { - cclog.Debugf("initializing metricdispatch: %v", err) + cclog.Debugf("error while initializing external metricdispatch: %v", err) } else { haveMetricstore = true } diff --git a/internal/metricdispatch/metricdata.go b/internal/metricdispatch/metricdata.go index 36a10004..3f03234e 100755 --- a/internal/metricdispatch/metricdata.go +++ b/internal/metricdispatch/metricdata.go @@ -74,11 +74,11 @@ func Init(rawConfig json.RawMessage) error { dec := json.NewDecoder(bytes.NewReader(rawConfig)) dec.DisallowUnknownFields() if err := dec.Decode(&configs); err != nil { - return fmt.Errorf("[METRICDISPATCH]> Metric Store Config Init: Could not decode config file '%s' Error: %s", rawConfig, err.Error()) + return fmt.Errorf("[METRICDISPATCH]> External Metric Store Config Init: Could not decode config file '%s' Error: %s", rawConfig, err.Error()) } if len(configs) == 0 { - return fmt.Errorf("[METRICDISPATCH]> No metric store configurations found in config file") + return fmt.Errorf("[METRICDISPATCH]> No external metric store configurations found in config file") } for _, config := range configs { diff --git a/internal/metricstoreclient/cc-metric-store-queries.go b/internal/metricstoreclient/cc-metric-store-queries.go index
d42c9355..949efa10 100644 --- a/internal/metricstoreclient/cc-metric-store-queries.go +++ b/internal/metricstoreclient/cc-metric-store-queries.go @@ -134,7 +134,7 @@ func (ccms *CCMetricStore) buildQueries( ) if len(hostQueries) == 0 && len(hostScopes) == 0 { - return nil, nil, fmt.Errorf("METRICDATA/CCMS > TODO: unhandled case: native-scope=%s, requested-scope=%s", nativeScope, requestedScope) + return nil, nil, fmt.Errorf("METRICDATA/INTERNAL-CCMS > TODO: unhandled case: native-scope=%s, requested-scope=%s", nativeScope, requestedScope) } queries = append(queries, hostQueries...) @@ -237,7 +237,7 @@ func (ccms *CCMetricStore) buildNodeQueries( ) if len(nodeQueries) == 0 && len(nodeScopes) == 0 { - return nil, nil, fmt.Errorf("METRICDATA/CCMS > TODO: unhandled case: native-scope=%s, requested-scope=%s", nativeScope, requestedScope) + return nil, nil, fmt.Errorf("METRICDATA/EXTERNAL-CCMS > TODO: unhandled case: native-scope=%s, requested-scope=%s", nativeScope, requestedScope) } queries = append(queries, nodeQueries...) diff --git a/internal/metricstoreclient/cc-metric-store.go b/internal/metricstoreclient/cc-metric-store.go index 7bf7d146..39c028d5 100644 --- a/internal/metricstoreclient/cc-metric-store.go +++ b/internal/metricstoreclient/cc-metric-store.go @@ -123,7 +123,7 @@ type APIMetricData struct { Max schema.Float `json:"max"` // Maximum value in time range } -// NewCCMetricStore creates and initializes a new CCMetricStore client. +// NewCCMetricStore creates and initializes a new (external) CCMetricStore client. // The url parameter should include the protocol and port (e.g., "http://localhost:8080"). // The token parameter is a JWT used for Bearer authentication; pass empty string if auth is disabled. func NewCCMetricStore(url string, token string) *CCMetricStore { @@ -356,7 +356,7 @@ func (ccms *CCMetricStore) LoadData( if len(errors) != 0 { /* Returns list for "partial errors" */ - return jobData, fmt.Errorf("METRICDATA/CCMS > Errors: %s", strings.Join(errors, ", ")) + return jobData, fmt.Errorf("METRICDATA/EXTERNAL-CCMS > Errors: %s", strings.Join(errors, ", ")) } return jobData, nil } @@ -514,7 +514,7 @@ func (ccms *CCMetricStore) LoadScopedStats( if len(errors) != 0 { /* Returns list for "partial errors" */ - return scopedJobStats, fmt.Errorf("METRICDATA/CCMS > Errors: %s", strings.Join(errors, ", ")) + return scopedJobStats, fmt.Errorf("METRICDATA/EXTERNAL-CCMS > Errors: %s", strings.Join(errors, ", ")) } return scopedJobStats, nil } @@ -604,7 +604,7 @@ func (ccms *CCMetricStore) LoadNodeData( if len(errors) != 0 { /* Returns list of "partial errors" */ - return data, fmt.Errorf("METRICDATA/CCMS > Errors: %s", strings.Join(errors, ", ")) + return data, fmt.Errorf("METRICDATA/EXTERNAL-CCMS > Errors: %s", strings.Join(errors, ", ")) } return data, nil @@ -765,7 +765,7 @@ func (ccms *CCMetricStore) LoadNodeListData( if len(errors) != 0 { /* Returns list of "partial errors" */ - return data, fmt.Errorf("METRICDATA/CCMS > Errors: %s", strings.Join(errors, ", ")) + return data, fmt.Errorf("METRICDATA/EXTERNAL-CCMS > Errors: %s", strings.Join(errors, ", ")) } return data, nil diff --git a/pkg/metricstore/query.go b/pkg/metricstore/query.go index 0a61efaa..735c45d6 100644 --- a/pkg/metricstore/query.go +++ b/pkg/metricstore/query.go @@ -211,7 +211,7 @@ func (ccms *InternalMetricStore) LoadData( if len(errors) != 0 { /* Returns list for "partial errors" */ - return jobData, fmt.Errorf("METRICDATA/CCMS > Errors: %s", strings.Join(errors, ", ")) + return jobData, 
fmt.Errorf("METRICDATA/INTERNAL-CCMS > Errors: %s", strings.Join(errors, ", ")) } return jobData, nil } @@ -260,7 +260,7 @@ func buildQueries( resolution int64, ) ([]APIQuery, []schema.MetricScope, error) { if len(job.Resources) == 0 { - return nil, nil, fmt.Errorf("METRICDATA/CCMS > no resources allocated for job %d", job.JobID) + return nil, nil, fmt.Errorf("METRICDATA/INTERNAL-CCMS > no resources allocated for job %d", job.JobID) } queries := make([]APIQuery, 0, len(metrics)*len(scopes)*len(job.Resources)) @@ -531,7 +531,7 @@ func buildQueries( continue } - return nil, nil, fmt.Errorf("METRICDATA/CCMS > TODO: unhandled case: native-scope=%s, requested-scope=%s", nativeScope, requestedScope) + return nil, nil, fmt.Errorf("METRICDATA/INTERNAL-CCMS > TODO: unhandled case: native-scope=%s, requested-scope=%s", nativeScope, requestedScope) } } } @@ -719,7 +719,7 @@ func (ccms *InternalMetricStore) LoadScopedStats( if len(errors) != 0 { /* Returns list for "partial errors" */ - return scopedJobStats, fmt.Errorf("METRICDATA/CCMS > Errors: %s", strings.Join(errors, ", ")) + return scopedJobStats, fmt.Errorf("METRICDATA/INTERNAL-CCMS > Errors: %s", strings.Join(errors, ", ")) } return scopedJobStats, nil } @@ -824,7 +824,7 @@ func (ccms *InternalMetricStore) LoadNodeData( if len(errors) != 0 { /* Returns list of "partial errors" */ - return data, fmt.Errorf("METRICDATA/CCMS > Errors: %s", strings.Join(errors, ", ")) + return data, fmt.Errorf("METRICDATA/INTERNAL-CCMS > Errors: %s", strings.Join(errors, ", ")) } return data, nil @@ -994,7 +994,7 @@ func (ccms *InternalMetricStore) LoadNodeListData( if len(errors) != 0 { /* Returns list of "partial errors" */ - return data, fmt.Errorf("METRICDATA/CCMS > Errors: %s", strings.Join(errors, ", ")) + return data, fmt.Errorf("METRICDATA/INTERNAL-CCMS > Errors: %s", strings.Join(errors, ", ")) } return data, nil @@ -1313,7 +1313,7 @@ func buildNodeQueries( continue } - return nil, nil, fmt.Errorf("METRICDATA/CCMS > TODO: unhandled case: native-scope=%s, requested-scope=%s", nativeScope, requestedScope) + return nil, nil, fmt.Errorf("METRICDATA/INTERNAL-CCMS > TODO: unhandled case: native-scope=%s, requested-scope=%s", nativeScope, requestedScope) } } } From 32fd18543a150fb8ca9436091dfa37d97d0ffd10 Mon Sep 17 00:00:00 2001 From: Christoph Kluge Date: Mon, 2 Mar 2026 15:35:07 +0100 Subject: [PATCH 17/20] differentiate between expected and unexpected cases in external ccms queryBuilder --- .../metricstoreclient/cc-metric-store-queries.go | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/internal/metricstoreclient/cc-metric-store-queries.go b/internal/metricstoreclient/cc-metric-store-queries.go index 949efa10..7a04efc4 100644 --- a/internal/metricstoreclient/cc-metric-store-queries.go +++ b/internal/metricstoreclient/cc-metric-store-queries.go @@ -126,6 +126,7 @@ func (ccms *CCMetricStore) buildQueries( hwthreads = topology.Node } + // Note: Expected exceptions will return as empty slices -> Continue hostQueries, hostScopes := buildScopeQueries( nativeScope, requestedScope, remoteName, host.Hostname, @@ -133,8 +134,9 @@ func (ccms *CCMetricStore) buildQueries( resolution, ) - if len(hostQueries) == 0 && len(hostScopes) == 0 { - return nil, nil, fmt.Errorf("METRICDATA/INTERNAL-CCMS > TODO: unhandled case: native-scope=%s, requested-scope=%s", nativeScope, requestedScope) + // Note: Unexpected errors, such as unhandled cases, will return as nils -> Error + if hostQueries == nil && hostScopes == nil { + return nil, nil, 
fmt.Errorf("METRICDATA/EXTERNAL-CCMS > TODO: unhandled case: native-scope=%s, requested-scope=%s", nativeScope, requestedScope) } queries = append(queries, hostQueries...) @@ -269,6 +271,7 @@ func buildScopeQueries( // Accelerator -> Accelerator (Use "accelerator" scope if requested scope is lower than node) if nativeScope == schema.MetricScopeAccelerator && scope.LT(schema.MetricScopeNode) { if scope != schema.MetricScopeAccelerator { + // Expected Exception -> Continue -> Return Empty Slices return queries, scopes } @@ -287,6 +290,7 @@ func buildScopeQueries( // Accelerator -> Node if nativeScope == schema.MetricScopeAccelerator && scope == schema.MetricScopeNode { if len(accelerators) == 0 { + // Expected Exception -> Continue -> Return Empty Slices return queries, scopes } @@ -447,6 +451,7 @@ func buildScopeQueries( socketToDomains, err := topology.GetMemoryDomainsBySocket(memDomains) if err != nil { cclog.Errorf("Error mapping memory domains to sockets, return unchanged: %v", err) + // Rare Error Case -> Still Continue -> Return Empty Slices return queries, scopes } @@ -507,8 +512,8 @@ func buildScopeQueries( return queries, scopes } - // Unhandled case - return empty slices - return queries, scopes + // Unhandled Case -> Error -> Return nils + return nil, nil } // intToStringSlice converts a slice of integers to a slice of strings. From 38bb2dd4ec9953dadc271bcb30abd87374c1d601 Mon Sep 17 00:00:00 2001 From: Christoph Kluge Date: Mon, 2 Mar 2026 16:24:27 +0100 Subject: [PATCH 18/20] add out-of-index checks to external ccms codebase --- internal/metricstoreclient/cc-metric-store.go | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/internal/metricstoreclient/cc-metric-store.go b/internal/metricstoreclient/cc-metric-store.go index 39c028d5..e2a84466 100644 --- a/internal/metricstoreclient/cc-metric-store.go +++ b/internal/metricstoreclient/cc-metric-store.go @@ -393,6 +393,10 @@ func (ccms *CCMetricStore) LoadStats( stats := make(map[string]map[string]schema.MetricStatistics, len(metrics)) for i, res := range resBody.Results { + if len(res) == 0 { + // No Data Found For Metric, Logged in FetchData to Warn + continue + } query := req.Queries[i] metric := query.Metric data := res[0] @@ -562,6 +566,11 @@ func (ccms *CCMetricStore) LoadNodeData( var errors []string data := make(map[string]map[string][]*schema.JobMetric) for i, res := range resBody.Results { + if len(res) == 0 { + // No Data Found For Metric, Logged in FetchData to Warn + continue + } + var query APIQuery if resBody.Queries != nil { query = resBody.Queries[i] @@ -572,7 +581,6 @@ func (ccms *CCMetricStore) LoadNodeData( metric := query.Metric qdata := res[0] if qdata.Error != nil { - /* Build list for "partial errors", if any */ errors = append(errors, fmt.Sprintf("fetching %s for node %s failed: %s", metric, query.Hostname, *qdata.Error)) } From 22c442db5bfb6cb6d074797e105adb6813cef30d Mon Sep 17 00:00:00 2001 From: Jan Eitzinger Date: Mon, 2 Mar 2026 18:47:45 +0100 Subject: [PATCH 19/20] Enable entire integration --- .claude/settings.json | 84 +++++++++++++++++++++++++++++++++++++++++++ .entire/.gitignore | 4 +++ .entire/settings.json | 4 +++ 3 files changed, 92 insertions(+) create mode 100644 .claude/settings.json create mode 100644 .entire/.gitignore create mode 100644 .entire/settings.json diff --git a/.claude/settings.json b/.claude/settings.json new file mode 100644 index 00000000..5cfa5854 --- /dev/null +++ b/.claude/settings.json @@ -0,0 +1,84 @@ +{ + "hooks": { + "PostToolUse": [ + {
"matcher": "Task", + "hooks": [ + { + "type": "command", + "command": "entire hooks claude-code post-task" + } + ] + }, + { + "matcher": "TodoWrite", + "hooks": [ + { + "type": "command", + "command": "entire hooks claude-code post-todo" + } + ] + } + ], + "PreToolUse": [ + { + "matcher": "Task", + "hooks": [ + { + "type": "command", + "command": "entire hooks claude-code pre-task" + } + ] + } + ], + "SessionEnd": [ + { + "matcher": "", + "hooks": [ + { + "type": "command", + "command": "entire hooks claude-code session-end" + } + ] + } + ], + "SessionStart": [ + { + "matcher": "", + "hooks": [ + { + "type": "command", + "command": "entire hooks claude-code session-start" + } + ] + } + ], + "Stop": [ + { + "matcher": "", + "hooks": [ + { + "type": "command", + "command": "entire hooks claude-code stop" + } + ] + } + ], + "UserPromptSubmit": [ + { + "matcher": "", + "hooks": [ + { + "type": "command", + "command": "entire hooks claude-code user-prompt-submit" + } + ] + } + ] + }, + "permissions": { + "deny": [ + "Read(./.entire/metadata/**)" + ] + } +} diff --git a/.entire/.gitignore b/.entire/.gitignore new file mode 100644 index 00000000..2cffdefa --- /dev/null +++ b/.entire/.gitignore @@ -0,0 +1,4 @@ +tmp/ +settings.local.json +metadata/ +logs/ diff --git a/.entire/settings.json b/.entire/settings.json new file mode 100644 index 00000000..7cce5590 --- /dev/null +++ b/.entire/settings.json @@ -0,0 +1,4 @@ +{ + "enabled": true, + "telemetry": true +} From 15be664ad806a470d57e6675454338077bf937f0 Mon Sep 17 00:00:00 2001 From: Jan Eitzinger Date: Tue, 3 Mar 2026 06:58:03 +0100 Subject: [PATCH 20/20] Add entire gitignore --- .entire/.gitignore | 4 ++++ 1 file changed, 4 insertions(+) create mode 100644 .entire/.gitignore diff --git a/.entire/.gitignore b/.entire/.gitignore new file mode 100644 index 00000000..2cffdefa --- /dev/null +++ b/.entire/.gitignore @@ -0,0 +1,4 @@ +tmp/ +settings.local.json +metadata/ +logs/