Merge branch 'dev' into optimize-checkpoint-loading

Attempt to minimize heap allocations by using caching
2026-03-01 05:47:29 +01:00 · 2026-02-27 09:31:50 +01:00 · 2026-02-25 07:52:02 +01:00 · 2026-02-25 07:39:38 +01:00 · 2026-02-23 14:21:32 +01:00 · 2026-02-23 14:21:17 +01:00
18 changed files with 831 additions and 2429 deletions
--- a/internal/api/metricstore.go
+++ b/internal/api/metricstore.go
@@ -10,6 +10,7 @@ import (
 	"encoding/json"
 	"errors"
 	"fmt"
+	"io"
 	"net/http"
 	"strconv"
 	"strings"
@@ -89,17 +90,16 @@ func freeMetrics(rw http.ResponseWriter, r *http.Request) {
 // @security    ApiKeyAuth
 // @router      /write/ [post]
 func writeMetrics(rw http.ResponseWriter, r *http.Request) {
+	bytes, err := io.ReadAll(r.Body)
 	rw.Header().Add("Content-Type", "application/json")
+	if err != nil {
+		handleError(err, http.StatusInternalServerError, rw)
+		return
+	}

-	// Extract the "cluster" query parameter without allocating a url.Values map.
-	cluster := queryParam(r.URL.RawQuery, "cluster")
-
-	// Stream directly from the request body instead of copying it into a
-	// temporary buffer via io.ReadAll. The line-protocol decoder supports
-	// io.Reader natively, so this avoids the largest heap allocation.
 	ms := metricstore.GetMemoryStore()
-	dec := lineprotocol.NewDecoder(r.Body)
-	if err := metricstore.DecodeLine(dec, ms, cluster); err != nil {
+	dec := lineprotocol.NewDecoderWithBytes(bytes)
+	if err := metricstore.DecodeLine(dec, ms, r.URL.Query().Get("cluster")); err != nil {
 		cclog.Errorf("/api/write error: %s", err.Error())
 		handleError(err, http.StatusBadRequest, rw)
 		return
@@ -107,20 +107,6 @@ func writeMetrics(rw http.ResponseWriter, r *http.Request) {
 	rw.WriteHeader(http.StatusOK)
 }

-// queryParam extracts a single query-parameter value from a raw query string
-// without allocating a url.Values map. Returns "" if the key is not present.
-func queryParam(raw, key string) string {
-	for raw != "" {
-		var kv string
-		kv, raw, _ = strings.Cut(raw, "&")
-		k, v, _ := strings.Cut(kv, "=")
-		if k == key {
-			return v
-		}
-	}
-	return ""
-}
-
 // handleDebug godoc
 // @summary Debug endpoint
 // @tags debug
--- a/internal/api/rest.go
+++ b/internal/api/rest.go
@@ -302,7 +302,7 @@ func (api *RestAPI) runTagger(rw http.ResponseWriter, r *http.Request) {

 	rw.Header().Set("Content-Type", "text/plain")
 	rw.WriteHeader(http.StatusOK)
-	if _, err := rw.Write(fmt.Appendf(nil, "Tagger %s started", name)); err != nil {
+	if _, err := rw.Write([]byte(fmt.Sprintf("Tagger %s started", name))); err != nil {
 		cclog.Errorf("Failed to write response: %v", err)
 	}
 }
--- a/pkg/archive/fsBackend.go
+++ b/pkg/archive/fsBackend.go
@@ -501,7 +501,9 @@ func (fsa *FsArchive) Iter(loadMetricData bool) <-chan JobContainer {
 		var wg sync.WaitGroup

 		for range numWorkers {
-			wg.Go(func() {
+			wg.Add(1)
+			go func() {
+				defer wg.Done()
 				for jobPath := range jobPaths {
 					job, err := loadJobMeta(filepath.Join(jobPath, "meta.json"))
 					if err != nil && !errors.Is(err, &jsonschema.ValidationError{}) {
@@ -527,7 +529,7 @@ func (fsa *FsArchive) Iter(loadMetricData bool) <-chan JobContainer {
 						ch <- JobContainer{Meta: job, Data: nil}
 					}
 				}
-			})
+			}()
 		}

 		clustersDir, err := os.ReadDir(fsa.path)
--- a/pkg/archive/s3Backend.go
+++ b/pkg/archive/s3Backend.go
@@ -821,7 +821,9 @@ func (s3a *S3Archive) Iter(loadMetricData bool) <-chan JobContainer {
 		var wg sync.WaitGroup

 		for range numWorkers {
-			wg.Go(func() {
+			wg.Add(1)
+			go func() {
+				defer wg.Done()
 				for metaKey := range metaKeys {
 					result, err := s3a.client.GetObject(ctx, &s3.GetObjectInput{
 						Bucket: aws.String(s3a.bucket),
@@ -857,7 +859,7 @@ func (s3a *S3Archive) Iter(loadMetricData bool) <-chan JobContainer {
 						ch <- JobContainer{Meta: job, Data: nil}
 					}
 				}
-			})
+			}()
 		}

 		for _, cluster := range s3a.clusters {
--- a/pkg/archive/sqliteBackend.go
+++ b/pkg/archive/sqliteBackend.go
@@ -576,7 +576,9 @@ func (sa *SqliteArchive) Iter(loadMetricData bool) <-chan JobContainer {
 		var wg sync.WaitGroup

 		for range numWorkers {
-			wg.Go(func() {
+			wg.Add(1)
+			go func() {
+				defer wg.Done()
 				for row := range jobRows {
 					job, err := DecodeJobMeta(bytes.NewReader(row.metaBlob))
 					if err != nil {
@@ -615,7 +617,7 @@ func (sa *SqliteArchive) Iter(loadMetricData bool) <-chan JobContainer {
 						ch <- JobContainer{Meta: job, Data: nil}
 					}
 				}
-			})
+			}()
 		}

 		for {
--- a/pkg/metricstore/archive.go
+++ b/pkg/metricstore/archive.go
@@ -6,9 +6,12 @@
 package metricstore

 import (
+	"archive/zip"
+	"bufio"
 	"context"
 	"errors"
 	"fmt"
+	"io"
 	"os"
 	"path/filepath"
 	"sync"
@@ -44,9 +47,11 @@ func CleanUp(wg *sync.WaitGroup, ctx context.Context) {
 	}
 }

-// cleanUpWorker takes simple values to configure what it does
+// runWorker takes simple values to configure what it does
 func cleanUpWorker(wg *sync.WaitGroup, ctx context.Context, interval string, mode string, cleanupDir string, delete bool) {
-	wg.Go(func() {
+	wg.Add(1)
+	go func() {
+		defer wg.Done()

 		d, err := time.ParseDuration(interval)
 		if err != nil {
@@ -72,38 +77,29 @@ func cleanUpWorker(wg *sync.WaitGroup, ctx context.Context, interval string, mod
 				if err != nil {
 					cclog.Errorf("[METRICSTORE]> %s failed: %s", mode, err.Error())
 				} else {
-					if delete {
+					if delete && cleanupDir == "" {
 						cclog.Infof("[METRICSTORE]> done: %d checkpoints deleted", n)
 					} else {
-						cclog.Infof("[METRICSTORE]> done: %d checkpoint files archived to parquet", n)
+						cclog.Infof("[METRICSTORE]> done: %d files zipped and moved to archive", n)
 					}
 				}
 			}
 		}
-	})
+	}()
 }

 var ErrNoNewArchiveData error = errors.New("all data already archived")

-// CleanupCheckpoints deletes or archives all checkpoint files older than `from`.
-// When archiving, consolidates all hosts per cluster into a single Parquet file.
+// Delete or ZIP all checkpoint files older than `from` together and write them to the `cleanupDir`,
+// deleting/moving them from the `checkpointsDir`.
 func CleanupCheckpoints(checkpointsDir, cleanupDir string, from int64, deleteInstead bool) (int, error) {
-	if deleteInstead {
-		return deleteCheckpoints(checkpointsDir, from)
-	}
-
-	return archiveCheckpoints(checkpointsDir, cleanupDir, from)
-}
-
-// deleteCheckpoints removes checkpoint files older than `from` across all clusters/hosts.
-func deleteCheckpoints(checkpointsDir string, from int64) (int, error) {
 	entries1, err := os.ReadDir(checkpointsDir)
 	if err != nil {
 		return 0, err
 	}

 	type workItem struct {
-		dir            string
+		cdir, adir    string
 		cluster, host string
 	}

@@ -115,29 +111,13 @@ func deleteCheckpoints(checkpointsDir string, from int64) (int, error) {
 	for worker := 0; worker < Keys.NumWorkers; worker++ {
 		go func() {
 			defer wg.Done()
-			for item := range work {
-				entries, err := os.ReadDir(item.dir)
+			for workItem := range work {
+				m, err := cleanupCheckpoints(workItem.cdir, workItem.adir, from, deleteInstead)
 				if err != nil {
-					cclog.Errorf("error reading %s/%s: %s", item.cluster, item.host, err.Error())
+					cclog.Errorf("error while archiving %s/%s: %s", workItem.cluster, workItem.host, err.Error())
 					atomic.AddInt32(&errs, 1)
-					continue
-				}
-
-				files, err := findFiles(entries, from, false)
-				if err != nil {
-					cclog.Errorf("error finding files in %s/%s: %s", item.cluster, item.host, err.Error())
-					atomic.AddInt32(&errs, 1)
-					continue
-				}
-
-				for _, checkpoint := range files {
-					if err := os.Remove(filepath.Join(item.dir, checkpoint)); err != nil {
-						cclog.Errorf("error deleting %s/%s/%s: %s", item.cluster, item.host, checkpoint, err.Error())
-						atomic.AddInt32(&errs, 1)
-					} else {
-						atomic.AddInt32(&n, 1)
-					}
 				}
+				atomic.AddInt32(&n, int32(m))
 			}
 		}()
 	}
@@ -146,14 +126,14 @@ func deleteCheckpoints(checkpointsDir string, from int64) (int, error) {
 		entries2, e := os.ReadDir(filepath.Join(checkpointsDir, de1.Name()))
 		if e != nil {
 			err = e
-			continue
 		}

 		for _, de2 := range entries2 {
+			cdir := filepath.Join(checkpointsDir, de1.Name(), de2.Name())
+			adir := filepath.Join(cleanupDir, de1.Name(), de2.Name())
 			work <- workItem{
-				dir:     filepath.Join(checkpointsDir, de1.Name(), de2.Name()),
-				cluster: de1.Name(),
-				host:    de2.Name(),
+				adir: adir, cdir: cdir,
+				cluster: de1.Name(), host: de2.Name(),
 			}
 		}
 	}
@@ -164,118 +144,85 @@ func deleteCheckpoints(checkpointsDir string, from int64) (int, error) {
 	if err != nil {
 		return int(n), err
 	}
+
 	if errs > 0 {
-		return int(n), fmt.Errorf("%d errors happened while deleting (%d successes)", errs, n)
+		return int(n), fmt.Errorf("%d errors happened while archiving (%d successes)", errs, n)
 	}
 	return int(n), nil
 }

-// archiveCheckpoints archives checkpoint files to Parquet format.
-// Produces one Parquet file per cluster: <cleanupDir>/<cluster>/<timestamp>.parquet
-func archiveCheckpoints(checkpointsDir, cleanupDir string, from int64) (int, error) {
-	clusterEntries, err := os.ReadDir(checkpointsDir)
+// Helper function for `CleanupCheckpoints`.
+func cleanupCheckpoints(dir string, cleanupDir string, from int64, deleteInstead bool) (int, error) {
+	entries, err := os.ReadDir(dir)
 	if err != nil {
 		return 0, err
 	}

-	totalFiles := 0
-
-	for _, clusterEntry := range clusterEntries {
-		if !clusterEntry.IsDir() {
-			continue
-		}
-
-		cluster := clusterEntry.Name()
-		hostEntries, err := os.ReadDir(filepath.Join(checkpointsDir, cluster))
+	files, err := findFiles(entries, from, false)
 	if err != nil {
-			return totalFiles, err
+		return 0, err
 	}

-		// Collect rows from all hosts in this cluster using worker pool
-		type hostResult struct {
-			rows  []ParquetMetricRow
-			files []string // checkpoint filenames to delete after successful write
-			dir   string   // checkpoint directory for this host
+	if deleteInstead {
+		n := 0
+		for _, checkpoint := range files {
+			filename := filepath.Join(dir, checkpoint)
+			if err = os.Remove(filename); err != nil {
+				return n, err
+			}
+			n += 1
+		}
+		return n, nil
 	}

-		results := make(chan hostResult, len(hostEntries))
-		work := make(chan struct {
-			dir, host string
-		}, Keys.NumWorkers)
-
-		var wg sync.WaitGroup
-		errs := int32(0)
-
-		wg.Add(Keys.NumWorkers)
-		for w := 0; w < Keys.NumWorkers; w++ {
-			go func() {
-				defer wg.Done()
-				for item := range work {
-					rows, files, err := archiveCheckpointsToParquet(item.dir, cluster, item.host, from)
+	filename := filepath.Join(cleanupDir, fmt.Sprintf("%d.zip", from))
+	f, err := os.OpenFile(filename, os.O_CREATE|os.O_WRONLY, CheckpointFilePerms)
+	if err != nil && os.IsNotExist(err) {
+		err = os.MkdirAll(cleanupDir, CheckpointDirPerms)
+		if err == nil {
+			f, err = os.OpenFile(filename, os.O_CREATE|os.O_WRONLY, CheckpointFilePerms)
+		}
+	}
 	if err != nil {
-						cclog.Errorf("[METRICSTORE]> error reading checkpoints for %s/%s: %s", cluster, item.host, err.Error())
-						atomic.AddInt32(&errs, 1)
-						continue
+		return 0, err
 	}
-					if len(rows) > 0 {
-						results <- hostResult{rows: rows, files: files, dir: item.dir}
+	defer f.Close()
+	bw := bufio.NewWriter(f)
+	defer bw.Flush()
+	zw := zip.NewWriter(bw)
+	defer zw.Close()
+
+	n := 0
+	for _, checkpoint := range files {
+		// Use closure to ensure file is closed immediately after use,
+		// avoiding file descriptor leak from defer in loop
+		err := func() error {
+			filename := filepath.Join(dir, checkpoint)
+			r, err := os.Open(filename)
+			if err != nil {
+				return err
 			}
+			defer r.Close()
+
+			w, err := zw.Create(checkpoint)
+			if err != nil {
+				return err
 			}
+
+			if _, err = io.Copy(w, r); err != nil {
+				return err
+			}
+
+			if err = os.Remove(filename); err != nil {
+				return err
+			}
+			return nil
 		}()
+		if err != nil {
+			return n, err
+		}
+		n += 1
 	}

-		go func() {
-			for _, hostEntry := range hostEntries {
-				if !hostEntry.IsDir() {
-					continue
-				}
-				dir := filepath.Join(checkpointsDir, cluster, hostEntry.Name())
-				work <- struct {
-					dir, host string
-				}{dir: dir, host: hostEntry.Name()}
-			}
-			close(work)
-			wg.Wait()
-			close(results)
-		}()
-
-		// Collect all rows and file info
-		var allRows []ParquetMetricRow
-		var allResults []hostResult
-		for r := range results {
-			allRows = append(allRows, r.rows...)
-			allResults = append(allResults, r)
-		}
-
-		if errs > 0 {
-			return totalFiles, fmt.Errorf("%d errors reading checkpoints for cluster %s", errs, cluster)
-		}
-
-		if len(allRows) == 0 {
-			continue
-		}
-
-		// Write one Parquet file per cluster
-		parquetFile := filepath.Join(cleanupDir, cluster, fmt.Sprintf("%d.parquet", from))
-		if err := writeParquetArchive(parquetFile, allRows); err != nil {
-			return totalFiles, fmt.Errorf("writing parquet archive for cluster %s: %w", cluster, err)
-		}
-
-		// Delete archived checkpoint files
-		for _, result := range allResults {
-			for _, file := range result.files {
-				filename := filepath.Join(result.dir, file)
-				if err := os.Remove(filename); err != nil {
-					cclog.Warnf("[METRICSTORE]> could not remove archived checkpoint %s: %v", filename, err)
-				} else {
-					totalFiles++
-				}
-			}
-		}
-
-		cclog.Infof("[METRICSTORE]> archived %d rows from %d files for cluster %s to %s",
-			len(allRows), totalFiles, cluster, parquetFile)
-	}
-
-	return totalFiles, nil
+	return n, nil
 }
--- a/pkg/metricstore/binaryCheckpoint.go
+++ b/pkg/metricstore/binaryCheckpoint.go
@@ -0,0 +1,274 @@
+// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
+// All rights reserved. This file is part of cc-backend.
+// Use of this source code is governed by a MIT-style
+// license that can be found in the LICENSE file.
+
+// This file implements the binary checkpoint format for fast loading.
+//
+// The binary format stores metric data in column-oriented layout (per-metric
+// float64 arrays) for maximum load speed. Float32 arrays are read/written
+// as raw bytes, avoiding per-element parsing overhead.
+//
+// File format:
+//
+//	Header (28 bytes):
+//	  magic:    [4]byte  "CCMS"
+//	  version:  uint32   LE
+//	  from:     int64    LE
+//	  to:       int64    LE
+//
+//	Body (recursive):
+//	  nmetrics: uint32   LE
+//	  Per metric:
+//	    name_len: uint16   LE
+//	    name:     []byte
+//	    freq:     int64    LE
+//	    start:    int64    LE
+//	    nvalues:  uint32   LE
+//	    data:     []float64 LE  (NaN = missing)
+//	  nchildren: uint32   LE
+//	  Per child:
+//	    name_len: uint16   LE
+//	    name:     []byte
+//	    (recursive body)
+package metricstore
+
+import (
+	"bufio"
+	"encoding/binary"
+	"fmt"
+	"io"
+	"os"
+	"path"
+	"unsafe"
+
+	"github.com/ClusterCockpit/cc-lib/v2/schema"
+)
+
+var (
+	binaryMagic     = [4]byte{'C', 'C', 'M', 'S'}
+	binaryVersion   = uint32(1)
+	binaryByteOrder = binary.LittleEndian
+	floatSize       = int(unsafe.Sizeof(schema.Float(0))) // schema.Float is float64
+)
+
+// writeBinaryCheckpoint writes a CheckpointFile to a binary checkpoint file on disk.
+func writeBinaryCheckpoint(filePath string, cf *CheckpointFile) error {
+	f, err := os.OpenFile(filePath, os.O_CREATE|os.O_WRONLY|os.O_TRUNC, CheckpointFilePerms)
+	if err != nil && os.IsNotExist(err) {
+		if err2 := os.MkdirAll(path.Dir(filePath), CheckpointDirPerms); err2 != nil {
+			return err2
+		}
+		f, err = os.OpenFile(filePath, os.O_CREATE|os.O_WRONLY|os.O_TRUNC, CheckpointFilePerms)
+	}
+	if err != nil {
+		return err
+	}
+	defer f.Close()
+
+	bw := bufio.NewWriter(f)
+
+	// Write header
+	if _, err := bw.Write(binaryMagic[:]); err != nil {
+		return err
+	}
+	if err := binary.Write(bw, binaryByteOrder, binaryVersion); err != nil {
+		return err
+	}
+	if err := binary.Write(bw, binaryByteOrder, cf.From); err != nil {
+		return err
+	}
+	if err := binary.Write(bw, binaryByteOrder, cf.To); err != nil {
+		return err
+	}
+
+	// Write body (metrics + children recursively)
+	if err := writeBinaryBody(bw, cf); err != nil {
+		return err
+	}
+
+	return bw.Flush()
+}
+
+// writeBinaryBody writes the metrics and children of a CheckpointFile.
+func writeBinaryBody(w io.Writer, cf *CheckpointFile) error {
+	if err := binary.Write(w, binaryByteOrder, uint32(len(cf.Metrics))); err != nil {
+		return err
+	}
+
+	for name, metric := range cf.Metrics {
+		nameBytes := []byte(name)
+		if err := binary.Write(w, binaryByteOrder, uint16(len(nameBytes))); err != nil {
+			return err
+		}
+		if _, err := w.Write(nameBytes); err != nil {
+			return err
+		}
+		if err := binary.Write(w, binaryByteOrder, metric.Frequency); err != nil {
+			return err
+		}
+		if err := binary.Write(w, binaryByteOrder, metric.Start); err != nil {
+			return err
+		}
+		if err := binary.Write(w, binaryByteOrder, uint32(len(metric.Data))); err != nil {
+			return err
+		}
+		if err := writeFloatArray(w, metric.Data); err != nil {
+			return err
+		}
+	}
+
+	if err := binary.Write(w, binaryByteOrder, uint32(len(cf.Children))); err != nil {
+		return err
+	}
+
+	for name, child := range cf.Children {
+		nameBytes := []byte(name)
+		if err := binary.Write(w, binaryByteOrder, uint16(len(nameBytes))); err != nil {
+			return err
+		}
+		if _, err := w.Write(nameBytes); err != nil {
+			return err
+		}
+		if err := writeBinaryBody(w, child); err != nil {
+			return err
+		}
+	}
+
+	return nil
+}
+
+// writeFloatArray writes a schema.Float slice as raw little-endian float64 bytes.
+func writeFloatArray(w io.Writer, data []schema.Float) error {
+	if len(data) == 0 {
+		return nil
+	}
+	buf := unsafe.Slice((*byte)(unsafe.Pointer(&data[0])), len(data)*floatSize)
+	_, err := w.Write(buf)
+	return err
+}
+
+// loadBinaryFile reads a binary checkpoint file into a CheckpointFile.
+func loadBinaryFile(filePath string) (*CheckpointFile, error) {
+	f, err := os.Open(filePath)
+	if err != nil {
+		return nil, err
+	}
+	defer f.Close()
+
+	br := bufio.NewReader(f)
+
+	var magic [4]byte
+	if _, err := io.ReadFull(br, magic[:]); err != nil {
+		return nil, fmt.Errorf("reading magic: %w", err)
+	}
+	if magic != binaryMagic {
+		return nil, fmt.Errorf("[METRICSTORE]> invalid binary checkpoint magic in %s", filePath)
+	}
+
+	var version uint32
+	if err := binary.Read(br, binaryByteOrder, &version); err != nil {
+		return nil, fmt.Errorf("reading version: %w", err)
+	}
+	if version != binaryVersion {
+		return nil, fmt.Errorf("[METRICSTORE]> unsupported binary checkpoint version %d in %s", version, filePath)
+	}
+
+	cf := &CheckpointFile{}
+	if err := binary.Read(br, binaryByteOrder, &cf.From); err != nil {
+		return nil, fmt.Errorf("reading from: %w", err)
+	}
+	if err := binary.Read(br, binaryByteOrder, &cf.To); err != nil {
+		return nil, fmt.Errorf("reading to: %w", err)
+	}
+
+	if err := readBinaryBody(br, cf); err != nil {
+		return nil, err
+	}
+
+	return cf, nil
+}
+
+// readBinaryBody reads the metrics and children of a CheckpointFile.
+func readBinaryBody(r io.Reader, cf *CheckpointFile) error {
+	var nmetrics uint32
+	if err := binary.Read(r, binaryByteOrder, &nmetrics); err != nil {
+		return fmt.Errorf("reading metric count: %w", err)
+	}
+
+	cf.Metrics = make(map[string]*CheckpointMetrics, nmetrics)
+
+	for range nmetrics {
+		var nameLen uint16
+		if err := binary.Read(r, binaryByteOrder, &nameLen); err != nil {
+			return fmt.Errorf("reading metric name length: %w", err)
+		}
+		nameBytes := make([]byte, nameLen)
+		if _, err := io.ReadFull(r, nameBytes); err != nil {
+			return fmt.Errorf("reading metric name: %w", err)
+		}
+
+		cm := &CheckpointMetrics{}
+		if err := binary.Read(r, binaryByteOrder, &cm.Frequency); err != nil {
+			return fmt.Errorf("reading frequency: %w", err)
+		}
+		if err := binary.Read(r, binaryByteOrder, &cm.Start); err != nil {
+			return fmt.Errorf("reading start: %w", err)
+		}
+
+		var nvalues uint32
+		if err := binary.Read(r, binaryByteOrder, &nvalues); err != nil {
+			return fmt.Errorf("reading value count: %w", err)
+		}
+
+		var err error
+		cm.Data, err = readFloatArray(r, int(nvalues))
+		if err != nil {
+			return fmt.Errorf("reading data for %s: %w", string(nameBytes), err)
+		}
+
+		cf.Metrics[string(nameBytes)] = cm
+	}
+
+	var nchildren uint32
+	if err := binary.Read(r, binaryByteOrder, &nchildren); err != nil {
+		return fmt.Errorf("reading children count: %w", err)
+	}
+
+	cf.Children = make(map[string]*CheckpointFile, nchildren)
+
+	for range nchildren {
+		var nameLen uint16
+		if err := binary.Read(r, binaryByteOrder, &nameLen); err != nil {
+			return fmt.Errorf("reading child name length: %w", err)
+		}
+		nameBytes := make([]byte, nameLen)
+		if _, err := io.ReadFull(r, nameBytes); err != nil {
+			return fmt.Errorf("reading child name: %w", err)
+		}
+
+		child := &CheckpointFile{}
+		if err := readBinaryBody(r, child); err != nil {
+			return fmt.Errorf("reading child %s: %w", string(nameBytes), err)
+		}
+
+		cf.Children[string(nameBytes)] = child
+	}
+
+	return nil
+}
+
+// readFloatArray reads n float32 values from raw little-endian bytes.
+func readFloatArray(r io.Reader, n int) ([]schema.Float, error) {
+	if n == 0 {
+		return nil, nil
+	}
+
+	data := make([]schema.Float, n)
+	buf := unsafe.Slice((*byte)(unsafe.Pointer(&data[0])), n*floatSize)
+	if _, err := io.ReadFull(r, buf); err != nil {
+		return nil, err
+	}
+
+	return data, nil
+}
--- a/pkg/metricstore/buffer.go
+++ b/pkg/metricstore/buffer.go
@@ -43,7 +43,6 @@ package metricstore
 import (
 	"errors"
 	"sync"
-	"time"

 	"github.com/ClusterCockpit/cc-lib/v2/schema"
 )
@@ -54,97 +53,12 @@ import (
 // of data or reallocation needs to happen on writes.
 const BufferCap int = DefaultBufferCapacity

-// maxPoolSize caps the number of buffers held in the pool at any time.
-// Prevents unbounded memory growth after large retention-cleanup bursts.
-const maxPoolSize = 4096
-
-// BufferPool is the global instance.
-// It is initialized immediately when the package loads.
-var bufferPool = NewPersistentBufferPool()
-
-type PersistentBufferPool struct {
-	pool []*buffer
-	mu   sync.Mutex
-}
-
-// NewPersistentBufferPool creates a dynamic pool for buffers.
-func NewPersistentBufferPool() *PersistentBufferPool {
-	return &PersistentBufferPool{
-		pool: make([]*buffer, 0),
-	}
-}
-
-func (p *PersistentBufferPool) Get() *buffer {
-	p.mu.Lock()
-	defer p.mu.Unlock()
-
-	n := len(p.pool)
-	if n == 0 {
-		// Pool is empty, allocate a new one
+var bufferPool sync.Pool = sync.Pool{
+	New: func() any {
 		return &buffer{
 			data: make([]schema.Float, 0, BufferCap),
 		}
-	}
-
-	// Reuse existing buffer from the pool
-	b := p.pool[n-1]
-	p.pool[n-1] = nil // Avoid memory leak
-	p.pool = p.pool[:n-1]
-	return b
-}
-
-// Put returns b to the pool. The caller must set b.lastUsed = time.Now().Unix()
-// before calling Put so that Clean() can evict idle entries correctly.
-func (p *PersistentBufferPool) Put(b *buffer) {
-	// Reset the buffer before putting it back
-	b.data = b.data[:0]
-
-	p.mu.Lock()
-	defer p.mu.Unlock()
-	if len(p.pool) >= maxPoolSize {
-		// Pool is full; drop the buffer and let GC collect it.
-		return
-	}
-	p.pool = append(p.pool, b)
-}
-
-// GetSize returns the exact number of buffers currently sitting in the pool.
-func (p *PersistentBufferPool) GetSize() int {
-	p.mu.Lock()
-	defer p.mu.Unlock()
-	return len(p.pool)
-}
-
-// Clear drains all buffers currently in the pool, allowing the GC to collect them.
-func (p *PersistentBufferPool) Clear() {
-	p.mu.Lock()
-	defer p.mu.Unlock()
-	for i := range p.pool {
-		p.pool[i] = nil
-	}
-	p.pool = p.pool[:0]
-}
-
-// Clean removes buffers from the pool that haven't been used in the given duration.
-// It uses a simple LRU approach based on the lastUsed timestamp.
-func (p *PersistentBufferPool) Clean(threshold int64) {
-	p.mu.Lock()
-	defer p.mu.Unlock()
-
-	// Filter in place, retaining only buffers returned to the pool recently enough.
-	active := p.pool[:0]
-	for _, b := range p.pool {
-		if b.lastUsed >= threshold {
-			active = append(active, b)
-		}
-	}
-
-	// Nullify the rest to prevent memory leaks
-	for i := len(active); i < len(p.pool); i++ {
-		p.pool[i] = nil
-	}
-
-	p.pool = active
+	},
 }

 var (
@@ -180,11 +94,10 @@ type buffer struct {
 	start     int64
 	archived  bool
 	closed    bool
-	lastUsed  int64
 }

 func newBuffer(ts, freq int64) *buffer {
-	b := bufferPool.Get()
+	b := bufferPool.Get().(*buffer)
 	b.frequency = freq
 	b.start = ts - (freq / 2)
 	b.prev = nil
@@ -271,13 +184,11 @@ func (b *buffer) firstWrite() int64 {
 //
 // Panics if 'data' slice is too small to hold all values in [from, to).
 func (b *buffer) read(from, to int64, data []schema.Float) ([]schema.Float, int64, int64, error) {
-	// Walk back to the buffer that covers 'from', adjusting if we hit the oldest.
-	for from < b.firstWrite() {
-		if b.prev == nil {
-			from = b.firstWrite()
-			break
+	if from < b.firstWrite() {
+		if b.prev != nil {
+			return b.prev.read(from, to, data)
 		}
-		b = b.prev
+		from = b.firstWrite()
 	}

 	i := 0
@@ -289,17 +200,16 @@ func (b *buffer) read(from, to int64, data []schema.Float) ([]schema.Float, int6
 				break
 			}
 			b = b.next
-			// Recalculate idx in the new buffer; a gap between buffers may exist.
-			idx = int((t - b.start) / b.frequency)
+			idx = 0
 		}

 		if idx >= len(b.data) {
 			if b.next == nil || to <= b.next.start {
 				break
 			}
-			data[i] += schema.NaN // NaN + anything = NaN; propagates missing data
+			data[i] += schema.NaN
 		} else if t < b.start {
-			data[i] += schema.NaN // gap before this buffer's first write
+			data[i] += schema.NaN
 		} else {
 			data[i] += b.data[idx]
 		}
@@ -330,7 +240,6 @@ func (b *buffer) free(t int64) (delme bool, n int) {
 			if cap(b.prev.data) != BufferCap {
 				b.prev.data = make([]schema.Float, 0, BufferCap)
 			}
-			b.prev.lastUsed = time.Now().Unix()
 			bufferPool.Put(b.prev)
 			b.prev = nil
 		}
@@ -357,12 +266,11 @@ func (b *buffer) forceFreeOldest() (delme bool, n int) {

 		// If the previous buffer signals it should be deleted:
 		if delPrev {
+			// Clear links on the dying buffer to prevent leaks
 			b.prev.next = nil
-			if cap(b.prev.data) != BufferCap {
-				b.prev.data = make([]schema.Float, 0, BufferCap)
-			}
-			b.prev.lastUsed = time.Now().Unix()
-			bufferPool.Put(b.prev)
+			b.prev.data = nil // Release the underlying float slice immediately
+
+			// Remove the link from the current buffer
 			b.prev = nil
 		}
 		return false, freed
@@ -391,27 +299,21 @@ func (b *buffer) iterFromTo(from, to int64, callback func(b *buffer) error) erro
 		return nil
 	}

-	// Collect overlapping buffers walking backwards (newest → oldest).
-	var matching []*buffer
-	for cur := b; cur != nil; cur = cur.prev {
-		if from <= cur.end() && cur.start <= to {
-			matching = append(matching, cur)
-		}
-	}
-
-	// Invoke callback in chronological order (oldest → newest).
-	for i := len(matching) - 1; i >= 0; i-- {
-		if err := callback(matching[i]); err != nil {
+	if err := b.prev.iterFromTo(from, to, callback); err != nil {
 		return err
 	}
+
+	if from <= b.end() && b.start <= to {
+		return callback(b)
 	}
+
 	return nil
 }

 func (b *buffer) count() int64 {
-	var res int64
-	for ; b != nil; b = b.prev {
-		res += int64(len(b.data))
+	res := int64(len(b.data))
+	if b.prev != nil {
+		res += b.prev.count()
 	}
 	return res
 }
--- a/pkg/metricstore/checkpoint.go
+++ b/pkg/metricstore/checkpoint.go
@@ -6,15 +6,16 @@
 // This file implements checkpoint persistence for the in-memory metric store.
 //
 // Checkpoints enable graceful restarts by periodically saving in-memory metric
-// data to disk in JSON or binary format. The checkpoint system:
+// data to disk. The checkpoint system supports two write formats:
+//   - binary (default): fast loading via raw float32 arrays
+//   - json: human-readable, slightly slower to load
 //
 // Key Features:
 //   - Periodic background checkpointing via the Checkpointing() worker
-//   - Two format families: JSON (human-readable) and WAL+binary (compact, crash-safe)
 //   - Parallel checkpoint creation and loading using worker pools
-//   - Hierarchical file organization: checkpoint_dir/cluster/host/timestamp.{json|bin}
-//   - WAL file: checkpoint_dir/cluster/host/current.wal (append-only, per-entry)
+//   - Hierarchical file organization: checkpoint_dir/cluster/host/timestamp.{bin|json}
 //   - Only saves unarchived data (archived data is already persisted elsewhere)
+//   - Automatic format detection during loading (supports bin, json, and legacy avro)
 //   - GC optimization during loading to prevent excessive heap growth
 //
 // Checkpoint Workflow:
@@ -27,9 +28,8 @@
 //	checkpoints/
 //	  cluster1/
 //	    host001/
-//	      1234567890.json  (JSON format: full subtree snapshot)
-//	      1234567890.bin   (binary format: full subtree snapshot)
-//	      current.wal      (WAL format: append-only per-entry log)
+//	      1234567890.bin  (timestamp = checkpoint start time)
+//	      1234567950.bin
 //	    host002/
 //	      ...
 package metricstore
@@ -58,7 +58,6 @@ import (
 const (
 	CheckpointFilePerms = 0o644 // File permissions for checkpoint files
 	CheckpointDirPerms  = 0o755 // Directory permissions for checkpoint directories
-	GCTriggerInterval   = DefaultGCTriggerInterval // Interval for triggering GC during checkpoint loading
 )

 // CheckpointMetrics represents metric data in a checkpoint file.
@@ -86,9 +85,9 @@ var (

 // Checkpointing starts a background worker that periodically saves metric data to disk.
 //
-// Format behaviour:
-//   - "json": Periodic checkpointing based on Keys.Checkpoints.Interval
-//   - "wal":  Periodic binary snapshots + WAL rotation at Keys.Checkpoints.Interval
+// Checkpoints are written at the configured interval (Keys.Checkpoints.Interval) in
+// either binary or JSON format. The worker respects context cancellation and signals
+// completion via the WaitGroup.
 func Checkpointing(wg *sync.WaitGroup, ctx context.Context) {
 	lastCheckpointMu.Lock()
 	lastCheckpoint = time.Now()
@@ -97,7 +96,6 @@ func Checkpointing(wg *sync.WaitGroup, ctx context.Context) {
 	ms := GetMemoryStore()

 	wg.Go(func() {
-
 		d, err := time.ParseDuration(Keys.Checkpoints.Interval)
 		if err != nil {
 			cclog.Fatalf("[METRICSTORE]> invalid checkpoint interval '%s': %s", Keys.Checkpoints.Interval, err.Error())
@@ -119,23 +117,10 @@ func Checkpointing(wg *sync.WaitGroup, ctx context.Context) {
 				from := lastCheckpoint
 				lastCheckpointMu.Unlock()

-				now := time.Now()
 				cclog.Infof("[METRICSTORE]> start checkpointing (starting at %s)...", from.Format(time.RFC3339))
-
-				if Keys.Checkpoints.FileFormat == "wal" {
-					n, hostDirs, err := ms.ToCheckpointWAL(Keys.Checkpoints.RootDir, from.Unix(), now.Unix())
-					if err != nil {
-						cclog.Errorf("[METRICSTORE]> binary checkpointing failed: %s", err.Error())
-					} else {
-						cclog.Infof("[METRICSTORE]> done: %d binary snapshot files created", n)
-						lastCheckpointMu.Lock()
-						lastCheckpoint = now
-						lastCheckpointMu.Unlock()
-						// Rotate WAL files for successfully checkpointed hosts.
-						RotateWALFiles(hostDirs)
-					}
-				} else {
-					n, err := ms.ToCheckpoint(Keys.Checkpoints.RootDir, from.Unix(), now.Unix())
+				now := time.Now()
+				n, err := ms.ToCheckpoint(Keys.Checkpoints.RootDir,
+					from.Unix(), now.Unix())
 				if err != nil {
 					cclog.Errorf("[METRICSTORE]> checkpointing failed: %s", err.Error())
 				} else {
@@ -146,10 +131,156 @@ func Checkpointing(wg *sync.WaitGroup, ctx context.Context) {
 				}
 			}
 		}
-		}
 	})
 }

+// UnmarshalJSON provides optimized JSON decoding for CheckpointMetrics.
+//
+// Mirrors the optimized MarshalJSON by manually parsing JSON to avoid
+// per-element interface dispatch and allocation overhead of the generic
+// json.Unmarshal path for []schema.Float.
+func (cm *CheckpointMetrics) UnmarshalJSON(input []byte) error {
+	// Minimal manual JSON parsing for the known structure:
+	// {"frequency":N,"start":N,"data":[...]}
+	// Field order may vary, so we parse field names.
+
+	if len(input) < 2 || input[0] != '{' {
+		return fmt.Errorf("expected JSON object")
+	}
+
+	i := 1 // skip '{'
+	for i < len(input) {
+		// Skip whitespace
+		for i < len(input) && (input[i] == ' ' || input[i] == '\t' || input[i] == '\n' || input[i] == '\r') {
+			i++
+		}
+		if i >= len(input) || input[i] == '}' {
+			break
+		}
+		if input[i] == ',' {
+			i++
+			continue
+		}
+
+		// Parse field name
+		if input[i] != '"' {
+			return fmt.Errorf("expected field name at pos %d", i)
+		}
+		i++
+		nameStart := i
+		for i < len(input) && input[i] != '"' {
+			i++
+		}
+		fieldName := string(input[nameStart:i])
+		i++ // skip closing '"'
+
+		// Skip ':'
+		for i < len(input) && (input[i] == ' ' || input[i] == ':') {
+			i++
+		}
+
+		switch fieldName {
+		case "frequency":
+			numStart := i
+			for i < len(input) && input[i] != ',' && input[i] != '}' {
+				i++
+			}
+			v, err := strconv.ParseInt(string(input[numStart:i]), 10, 64)
+			if err != nil {
+				return fmt.Errorf("invalid frequency: %w", err)
+			}
+			cm.Frequency = v
+
+		case "start":
+			numStart := i
+			for i < len(input) && input[i] != ',' && input[i] != '}' {
+				i++
+			}
+			v, err := strconv.ParseInt(string(input[numStart:i]), 10, 64)
+			if err != nil {
+				return fmt.Errorf("invalid start: %w", err)
+			}
+			cm.Start = v
+
+		case "data":
+			if input[i] != '[' {
+				return fmt.Errorf("expected '[' for data array at pos %d", i)
+			}
+			i++ // skip '['
+
+			cm.Data = make([]schema.Float, 0, 256)
+			for i < len(input) {
+				// Skip whitespace
+				for i < len(input) && (input[i] == ' ' || input[i] == '\t' || input[i] == '\n' || input[i] == '\r') {
+					i++
+				}
+				if i >= len(input) {
+					break
+				}
+				if input[i] == ']' {
+					i++
+					break
+				}
+				if input[i] == ',' {
+					i++
+					continue
+				}
+
+				// Parse value: number or null
+				if input[i] == 'n' {
+					// "null"
+					cm.Data = append(cm.Data, schema.NaN)
+					i += 4
+				} else {
+					numStart := i
+					for i < len(input) && input[i] != ',' && input[i] != ']' && input[i] != ' ' {
+						i++
+					}
+					v, err := strconv.ParseFloat(string(input[numStart:i]), 64)
+					if err != nil {
+						return fmt.Errorf("invalid data value: %w", err)
+					}
+					cm.Data = append(cm.Data, schema.Float(v))
+				}
+			}
+
+		default:
+			// Skip unknown field value
+			depth := 0
+			inStr := false
+			for i < len(input) {
+				if inStr {
+					if input[i] == '\\' {
+						i++
+					} else if input[i] == '"' {
+						inStr = false
+					}
+				} else {
+					switch input[i] {
+					case '"':
+						inStr = true
+					case '{', '[':
+						depth++
+					case '}', ']':
+						if depth == 0 {
+							goto doneSkip
+						}
+						depth--
+					case ',':
+						if depth == 0 {
+							goto doneSkip
+						}
+					}
+				}
+				i++
+			}
+		doneSkip:
+		}
+	}
+
+	return nil
+}
+
 // MarshalJSON provides optimized JSON encoding for CheckpointMetrics.
 //
 // Since schema.Float has custom MarshalJSON, serializing []Float has significant overhead.
@@ -175,7 +306,7 @@ func (cm *CheckpointMetrics) MarshalJSON() ([]byte, error) {
 	return buf, nil
 }

-// ToCheckpoint writes metric data to checkpoint files in parallel (JSON format).
+// ToCheckpoint writes metric data to checkpoint files in parallel.
 //
 // Metrics at root and cluster levels are skipped. One file per host is created.
 // Uses worker pool (Keys.NumWorkers) for parallel processing. Only locks one host
@@ -322,7 +453,8 @@ func (l *Level) toCheckpointFile(from, to int64, m *MemoryStore) (*CheckpointFil
 	return retval, nil
 }

-// toCheckpoint writes a Level's data to a JSON checkpoint file.
+// toCheckpoint writes a Level's data to a checkpoint file.
+// The format (binary or JSON) is determined by Keys.Checkpoints.FileFormat.
 // Creates directory if needed. Returns ErrNoNewArchiveData if nothing to save.
 func (l *Level) toCheckpoint(dir string, from, to int64, m *MemoryStore) error {
 	cf, err := l.toCheckpointFile(from, to, m)
@@ -334,12 +466,23 @@ func (l *Level) toCheckpoint(dir string, from, to int64, m *MemoryStore) error {
 		return ErrNoNewArchiveData
 	}

-	filepath := path.Join(dir, fmt.Sprintf("%d.json", from))
-	f, err := os.OpenFile(filepath, os.O_CREATE|os.O_WRONLY|os.O_TRUNC, CheckpointFilePerms)
+	if Keys.Checkpoints.FileFormat == "json" {
+		return writeJSONCheckpoint(dir, from, cf)
+	}
+
+	// Default: binary format
+	filePath := path.Join(dir, fmt.Sprintf("%d.bin", from))
+	return writeBinaryCheckpoint(filePath, cf)
+}
+
+// writeJSONCheckpoint writes a CheckpointFile in JSON format.
+func writeJSONCheckpoint(dir string, from int64, cf *CheckpointFile) error {
+	filePath := path.Join(dir, fmt.Sprintf("%d.json", from))
+	f, err := os.OpenFile(filePath, os.O_CREATE|os.O_WRONLY|os.O_TRUNC, CheckpointFilePerms)
 	if err != nil && os.IsNotExist(err) {
 		err = os.MkdirAll(dir, CheckpointDirPerms)
 		if err == nil {
-			f, err = os.OpenFile(filepath, os.O_CREATE|os.O_WRONLY|os.O_TRUNC, CheckpointFilePerms)
+			f, err = os.OpenFile(filePath, os.O_CREATE|os.O_WRONLY|os.O_TRUNC, CheckpointFilePerms)
 		}
 	}
 	if err != nil {
@@ -356,40 +499,56 @@ func (l *Level) toCheckpoint(dir string, from, to int64, m *MemoryStore) error {
 }

 // enqueueCheckpointHosts traverses checkpoint directory and enqueues cluster/host pairs.
-// Returns error if directory structure is invalid.
-func enqueueCheckpointHosts(dir string, work chan<- [2]string) error {
+// Returns the set of cluster names found and any error if directory structure is invalid.
+func enqueueCheckpointHosts(dir string, work chan<- [2]string) (map[string]struct{}, error) {
 	clustersDir, err := os.ReadDir(dir)
 	if err != nil {
-		return err
+		return nil, err
 	}

+	clusters := make(map[string]struct{}, len(clustersDir))
+
 	for _, clusterDir := range clustersDir {
 		if !clusterDir.IsDir() {
-			return errors.New("[METRICSTORE]> expected only directories at first level of checkpoints/ directory")
+			return nil, errors.New("[METRICSTORE]> expected only directories at first level of checkpoints/ directory")
 		}

+		clusters[clusterDir.Name()] = struct{}{}
+
 		hostsDir, err := os.ReadDir(filepath.Join(dir, clusterDir.Name()))
 		if err != nil {
-			return err
+			return nil, err
 		}

 		for _, hostDir := range hostsDir {
 			if !hostDir.IsDir() {
-				return errors.New("[METRICSTORE]> expected only directories at second level of checkpoints/ directory")
+				return nil, errors.New("[METRICSTORE]> expected only directories at second level of checkpoints/ directory")
 			}

 			work <- [2]string{clusterDir.Name(), hostDir.Name()}
 		}
 	}

-	return nil
+	return clusters, nil
 }

 // FromCheckpoint loads checkpoint files from disk into memory in parallel.
 //
-// Uses worker pool to load cluster/host combinations. Returns number of files
-// loaded and any errors.
+// Pre-creates cluster-level entries to reduce lock contention during parallel loading.
+// Uses worker pool to load cluster/host combinations. Returns number of files loaded and any errors.
 func (m *MemoryStore) FromCheckpoint(dir string, from int64) (int, error) {
+	// Pre-create cluster-level entries to eliminate write-lock contention on m.root
+	// during parallel loading. Workers only contend at the cluster level (independent).
+	clusterDirs, err := os.ReadDir(dir)
+	if err != nil && !os.IsNotExist(err) {
+		return 0, err
+	}
+	for _, d := range clusterDirs {
+		if d.IsDir() {
+			m.root.findLevelOrCreate([]string{d.Name()}, len(m.Metrics))
+		}
+	}
+
 	var wg sync.WaitGroup
 	work := make(chan [2]string, Keys.NumWorkers*4)
 	n, errs := int32(0), int32(0)
@@ -410,7 +569,7 @@ func (m *MemoryStore) FromCheckpoint(dir string, from int64) (int, error) {
 		}()
 	}

-	err := enqueueCheckpointHosts(dir, work)
+	_, err = enqueueCheckpointHosts(dir, work)
 	close(work)
 	wg.Wait()

@@ -426,11 +585,13 @@ func (m *MemoryStore) FromCheckpoint(dir string, from int64) (int, error) {

 // FromCheckpointFiles is the main entry point for loading checkpoints at startup.
 //
+// Automatically detects checkpoint format (binary, JSON, or legacy Avro).
 // Creates checkpoint directory if it doesn't exist. This function must be called
 // before any writes or reads, and can only be called once.
 func (m *MemoryStore) FromCheckpointFiles(dir string, from int64) (int, error) {
 	if _, err := os.Stat(dir); os.IsNotExist(err) {
-		err := os.MkdirAll(dir, CheckpointDirPerms)
+		// The directory does not exist, so create it using os.MkdirAll()
+		err := os.MkdirAll(dir, CheckpointDirPerms) // CheckpointDirPerms sets the permissions for the directory
 		if err != nil {
 			cclog.Fatalf("[METRICSTORE]> Error creating directory: %#v\n", err)
 		}
@@ -440,10 +601,11 @@ func (m *MemoryStore) FromCheckpointFiles(dir string, from int64) (int, error) {
 	return m.FromCheckpoint(dir, from)
 }

-func (l *Level) loadJSONFile(m *MemoryStore, f *os.File, from int64) error {
-	br := bufio.NewReader(f)
-	cf := &CheckpointFile{}
-	if err := json.NewDecoder(br).Decode(cf); err != nil {
+// loadBinaryCheckpointFile loads a binary checkpoint file into the Level tree.
+// Binary files are decoded in the same way as JSON files (via loadFile).
+func (l *Level) loadBinaryCheckpointFile(m *MemoryStore, filePath string, from int64) error {
+	cf, err := loadBinaryFile(filePath)
+	if err != nil {
 		return err
 	}

@@ -451,11 +613,7 @@ func (l *Level) loadJSONFile(m *MemoryStore, f *os.File, from int64) error {
 		return nil
 	}

-	if err := l.loadFile(cf, m); err != nil {
-		return err
-	}
-
-	return nil
+	return l.loadFile(cf, m)
 }

 func (l *Level) loadFile(cf *CheckpointFile, m *MemoryStore) error {
@@ -511,37 +669,25 @@ func (l *Level) loadFile(cf *CheckpointFile, m *MemoryStore) error {
 	return nil
 }

-// fromCheckpoint loads all checkpoint files (JSON, binary snapshot, WAL) for a
-// single host directory. Snapshot files are loaded first (sorted by timestamp),
-// then current.wal is replayed on top.
 func (l *Level) fromCheckpoint(m *MemoryStore, dir string, from int64) (int, error) {
 	direntries, err := os.ReadDir(dir)
 	if err != nil {
 		if os.IsNotExist(err) {
 			return 0, nil
 		}
+
 		return 0, err
 	}

-	allFiles := make([]fs.DirEntry, 0)
-	var walEntry fs.DirEntry
+	allFiles := make([]fs.DirEntry, 0, len(direntries))
 	filesLoaded := 0
-
 	for _, e := range direntries {
 		if e.IsDir() {
-			// Legacy: skip subdirectories (only used by old Avro format).
-			// These are ignored; their data is not loaded.
-			cclog.Debugf("[METRICSTORE]> skipping subdirectory %s in checkpoint dir %s", e.Name(), dir)
+			cclog.Warnf("[METRICSTORE]> unexpected subdirectory '%s' in checkpoint dir '%s', skipping", e.Name(), dir)
 			continue
-		}
-
-		name := e.Name()
-		if strings.HasSuffix(name, ".json") || strings.HasSuffix(name, ".bin") {
+		} else if strings.HasSuffix(e.Name(), ".bin") || strings.HasSuffix(e.Name(), ".json") {
 			allFiles = append(allFiles, e)
-		} else if name == "current.wal" {
-			walEntry = e
 		}
-		// Silently ignore other files (e.g., .tmp, .bin.tmp from interrupted writes).
 	}

 	files, err := findFiles(allFiles, from, true)
@@ -549,47 +695,109 @@ func (l *Level) fromCheckpoint(m *MemoryStore, dir string, from int64) (int, err
 		return filesLoaded, err
 	}

-	loaders := map[string]func(*MemoryStore, *os.File, int64) error{
-		".json": l.loadJSONFile,
-		".bin":  l.loadBinaryFile,
+	if len(files) == 0 {
+		return 0, nil
 	}

+	// Separate files by type
+	var binFiles, jsonFiles []string
 	for _, filename := range files {
-		ext := filepath.Ext(filename)
-		loader := loaders[ext]
-		if loader == nil {
-			cclog.Warnf("[METRICSTORE]> unknown extension for checkpoint file %s", filename)
+		switch filepath.Ext(filename) {
+		case ".bin":
+			binFiles = append(binFiles, filename)
+		case ".json":
+			jsonFiles = append(jsonFiles, filename)
+		default:
+			cclog.Warnf("[METRICSTORE]> unknown extension for file %s", filename)
+		}
+	}
+
+	// Parallel binary decoding: decode files concurrently, then apply sequentially
+	if len(binFiles) > 0 {
+		type decodedFile struct {
+			cf  *CheckpointFile
+			err error
+		}
+
+		decoded := make([]decodedFile, len(binFiles))
+		var decodeWg sync.WaitGroup
+
+		for i, filename := range binFiles {
+			decodeWg.Add(1)
+			go func(idx int, fname string) {
+				defer decodeWg.Done()
+				cf, err := loadBinaryFile(path.Join(dir, fname))
+				if err != nil {
+					decoded[idx] = decodedFile{err: fmt.Errorf("decoding %s: %w", fname, err)}
+					return
+				}
+				decoded[idx] = decodedFile{cf: cf}
+			}(i, filename)
+		}
+
+		decodeWg.Wait()
+
+		for i, d := range decoded {
+			if d.err != nil {
+				return filesLoaded, d.err
+			}
+
+			if d.cf.To != 0 && d.cf.To < from {
 				continue
 			}

-		err := func() error {
-			f, err := os.Open(path.Join(dir, filename))
-			if err != nil {
-				return err
-			}
-			defer f.Close()
-			return loader(m, f, from)
-		}()
-		if err != nil {
-			return filesLoaded, err
+			if err := l.loadFile(d.cf, m); err != nil {
+				return filesLoaded, fmt.Errorf("loading %s: %w", binFiles[i], err)
 			}
 			filesLoaded++
 		}
+	}

-	// Replay WAL after all snapshot files so it fills in data since the last snapshot.
-	if walEntry != nil {
-		err := func() error {
-			f, err := os.Open(path.Join(dir, walEntry.Name()))
+	// Parallel JSON decoding: decode files concurrently, then apply sequentially
+	if len(jsonFiles) > 0 {
+		type decodedFile struct {
+			cf  *CheckpointFile
+			err error
+		}
+
+		decoded := make([]decodedFile, len(jsonFiles))
+		var decodeWg sync.WaitGroup
+
+		for i, filename := range jsonFiles {
+			decodeWg.Add(1)
+			go func(idx int, fname string) {
+				defer decodeWg.Done()
+				f, err := os.Open(path.Join(dir, fname))
 				if err != nil {
-				return err
+					decoded[idx] = decodedFile{err: err}
+					return
 				}
 				defer f.Close()
-			return l.loadWALFile(m, f, from)
-		}()
-		if err != nil {
-			// WAL errors are non-fatal: the snapshot already loaded the bulk of data.
-			cclog.Warnf("[METRICSTORE]> WAL replay error for %s: %v (data since last snapshot may be missing)", dir, err)
-		} else {
+
+				cf := &CheckpointFile{}
+				if err := json.NewDecoder(bufio.NewReader(f)).Decode(cf); err != nil {
+					decoded[idx] = decodedFile{err: fmt.Errorf("decoding %s: %w", fname, err)}
+					return
+				}
+
+				decoded[idx] = decodedFile{cf: cf}
+			}(i, filename)
+		}
+
+		decodeWg.Wait()
+
+		for i, d := range decoded {
+			if d.err != nil {
+				return filesLoaded, d.err
+			}
+
+			if d.cf.To != 0 && d.cf.To < from {
+				continue
+			}
+
+			if err := l.loadFile(d.cf, m); err != nil {
+				return filesLoaded, fmt.Errorf("loading %s: %w", jsonFiles[i], err)
+			}
 			filesLoaded++
 		}
 	}
@@ -597,69 +805,79 @@ func (l *Level) fromCheckpoint(m *MemoryStore, dir string, from int64) (int, err
 	return filesLoaded, nil
 }

-// parseTimestampFromFilename extracts a Unix timestamp from a checkpoint filename.
-// Supports ".json" (format: "<ts>.json") and ".bin" (format: "<ts>.bin").
-func parseTimestampFromFilename(name string) (int64, error) {
-	switch {
-	case strings.HasSuffix(name, ".json"):
-		return strconv.ParseInt(name[:len(name)-5], 10, 64)
-	case strings.HasSuffix(name, ".bin"):
-		return strconv.ParseInt(name[:len(name)-4], 10, 64)
-	default:
-		return 0, fmt.Errorf("unknown checkpoint extension for file %q", name)
-	}
+// findFiles filters and sorts checkpoint files by timestamp.
+//
+// When findMoreRecentFiles is true, returns files with timestamp >= t (for loading),
+// plus the immediately preceding file if it straddles the boundary.
+// When false, returns files with timestamp <= t (for cleanup).
+//
+// Filters before sorting so only relevant files are sorted, keeping performance
+// stable regardless of total directory size.
+func findFiles(direntries []fs.DirEntry, t int64, findMoreRecentFiles bool) ([]string, error) {
+	type fileEntry struct {
+		name string
+		ts   int64
 	}

-// findFiles returns filenames from direntries whose timestamps satisfy the filter.
-// If findMoreRecentFiles is true, returns files with timestamps >= t (plus the
-// last file before t if t falls between two files).
-func findFiles(direntries []fs.DirEntry, t int64, findMoreRecentFiles bool) ([]string, error) {
-	nums := map[string]int64{}
+	// Parse timestamps and pre-filter in a single pass
+	var candidates []fileEntry
+	var bestPreceding *fileEntry // Track the file just before the cutoff (for boundary straddling)
+
 	for _, e := range direntries {
 		name := e.Name()
-		if !strings.HasSuffix(name, ".json") && !strings.HasSuffix(name, ".bin") {
+		ext := filepath.Ext(name)
+		if ext != ".bin" && ext != ".json" {
 			continue
 		}

-		ts, err := parseTimestampFromFilename(name)
+		// Parse timestamp from filename: for .bin and .json it's just "TIMESTAMP.ext"
+		baseName := name[:len(name)-len(ext)]
+		// Handle legacy format with prefix (e.g., "60_TIMESTAMP.avro")
+		if idx := strings.Index(baseName, "_"); idx >= 0 {
+			baseName = baseName[idx+1:]
+		}
+		ts, err := strconv.ParseInt(baseName, 10, 64)
 		if err != nil {
 			return nil, err
 		}
-		nums[name] = ts
+
+		if findMoreRecentFiles {
+			if ts >= t {
+				candidates = append(candidates, fileEntry{name, ts})
+			} else {
+				// Track the most recent file before the cutoff for boundary straddling
+				if bestPreceding == nil || ts > bestPreceding.ts {
+					bestPreceding = &fileEntry{name, ts}
+				}
+			}
+		} else {
+			if ts <= t && ts != 0 {
+				candidates = append(candidates, fileEntry{name, ts})
+			}
+		}
 	}

-	sort.Slice(direntries, func(i, j int) bool {
-		a, b := direntries[i], direntries[j]
-		return nums[a.Name()] < nums[b.Name()]
-	})
+	// Include the boundary-straddling file if we found one and there are also files after the cutoff
+	if findMoreRecentFiles && bestPreceding != nil && len(candidates) > 0 {
+		candidates = append(candidates, *bestPreceding)
+	}

-	if len(nums) == 0 {
+	if len(candidates) == 0 {
+		// If searching for recent files and we only have a preceding file, include it
+		if findMoreRecentFiles && bestPreceding != nil {
+			return []string{bestPreceding.name}, nil
+		}
 		return nil, nil
 	}

-	filenames := make([]string, 0)
+	// Sort only the filtered candidates
+	sort.Slice(candidates, func(i, j int) bool {
+		return candidates[i].ts < candidates[j].ts
+	})

-	for i, e := range direntries {
-		ts1 := nums[e.Name()]
-
-		if findMoreRecentFiles && t <= ts1 {
-			filenames = append(filenames, e.Name())
-		} else if !findMoreRecentFiles && ts1 <= t && ts1 != 0 {
-			filenames = append(filenames, e.Name())
-		}
-
-		if i == len(direntries)-1 {
-			continue
-		}
-
-		enext := direntries[i+1]
-		ts2 := nums[enext.Name()]
-
-		if findMoreRecentFiles {
-			if ts1 < t && t < ts2 {
-				filenames = append(filenames, e.Name())
-			}
-		}
+	filenames := make([]string, len(candidates))
+	for i, c := range candidates {
+		filenames[i] = c.name
 	}

 	return filenames, nil
--- a/pkg/metricstore/config.go
+++ b/pkg/metricstore/config.go
@@ -14,7 +14,7 @@
 //	├─ RetentionInMemory: How long to keep data in RAM
 //	├─ MemoryCap: Memory limit in bytes (triggers forceFree)
 //	├─ Checkpoints: Persistence configuration
-//	│  ├─ FileFormat: "json" or "wal"
+//	│  ├─ FileFormat: "binary" or "json"
 //	│  ├─ Interval: How often to save (e.g., "1h")
 //	│  └─ RootDir: Checkpoint storage path
 //	├─ Cleanup: Long-term storage configuration
@@ -54,14 +54,13 @@ import (
 const (
 	DefaultMaxWorkers                 = 10
 	DefaultBufferCapacity             = 512
-	DefaultGCTriggerInterval          = 100
 	DefaultMemoryUsageTrackerInterval = 1 * time.Hour
 )

 // Checkpoints configures periodic persistence of in-memory metric data.
 //
 // Fields:
-//   - FileFormat: "json" (human-readable, periodic) or "wal" (binary snapshot + WAL, crash-safe)
+//   - FileFormat: "binary" (default, fast loading) or "json" (human-readable)
 //   - Interval:   Duration string (e.g., "1h", "30m") between checkpoint saves
 //   - RootDir:    Filesystem path for checkpoint files (created if missing)
 type Checkpoints struct {
@@ -141,7 +140,7 @@ type MetricStoreConfig struct {
 // Accessed by Init(), Checkpointing(), and other lifecycle functions.
 var Keys MetricStoreConfig = MetricStoreConfig{
 	Checkpoints: Checkpoints{
-		FileFormat: "json",
+		FileFormat: "binary",
 		RootDir:    "./var/checkpoints",
 	},
 	Cleanup: &Cleanup{
--- a/pkg/metricstore/configSchema.go
+++ b/pkg/metricstore/configSchema.go
@@ -18,8 +18,9 @@ const configSchema = `{
      "type": "object",
      "properties": {
        "file-format": {
-          "description": "Specify the format for checkpoint files. Two variants: 'json' (human-readable, periodic) and 'wal' (binary snapshot + Write-Ahead Log, crash-safe). Default is 'json'.",
-          "type": "string"
+          "description": "Specify the format for checkpoint files: 'binary' (default, fast loading) or 'json' (human-readable).",
+          "type": "string",
+          "enum": ["binary", "json"]
        },
        "interval": {
          "description": "Interval at which the metrics should be checkpointed.",
--- a/pkg/metricstore/level.go
+++ b/pkg/metricstore/level.go
@@ -42,7 +42,6 @@ package metricstore

 import (
 	"sync"
-	"time"
 	"unsafe"

 	"github.com/ClusterCockpit/cc-lib/v2/schema"
@@ -193,7 +192,6 @@ func (l *Level) free(t int64) (int, error) {
 				if cap(b.data) != BufferCap {
 					b.data = make([]schema.Float, 0, BufferCap)
 				}
-				b.lastUsed = time.Now().Unix()
 				bufferPool.Put(b)
 				l.metrics[i] = nil
 			}
@@ -238,13 +236,12 @@ func (l *Level) forceFree() (int, error) {
 			// If delme is true, it means 'b' itself (the head) was the oldest
 			// and needs to be removed from the slice.
 			if delme {
+				// Nil out fields to ensure no hanging references
+
 				b.next = nil
 				b.prev = nil
-				if cap(b.data) != BufferCap {
-					b.data = make([]schema.Float, 0, BufferCap)
-				}
-				b.lastUsed = time.Now().Unix()
-				bufferPool.Put(b)
+				b.data = nil
+
 				l.metrics[i] = nil
 			}
 		}
--- a/pkg/metricstore/lineprotocol.go
+++ b/pkg/metricstore/lineprotocol.go
@@ -3,19 +3,6 @@
 // Use of this source code is governed by a MIT-style
 // license that can be found in the LICENSE file.

-// This file implements ingestion of InfluxDB line-protocol metric data received
-// over NATS. Each line encodes one metric sample with the following structure:
-//
-//	<measurement>[,cluster=<c>][,hostname=<h>][,type=<t>][,type-id=<id>][,subtype=<s>][,stype-id=<id>] value=<v> [<timestamp>]
-//
-// The measurement name identifies the metric (e.g. "cpu_load"). Tags provide
-// routing information (cluster, host) and optional sub-device selectors (type,
-// subtype). Only one field is expected per line: "value".
-//
-// After decoding, each sample is:
-//  1. Written to the in-memory store via ms.WriteToLevel.
-//  2. If the checkpoint format is "wal", also forwarded to the WAL staging
-//     goroutine via the WALMessages channel for durable write-ahead logging.
 package metricstore

 import (
@@ -31,16 +18,6 @@ import (
 	"github.com/ClusterCockpit/cc-line-protocol/v2/lineprotocol"
 )

-// ReceiveNats subscribes to all configured NATS subjects and feeds incoming
-// line-protocol messages into the MemoryStore.
-//
-// When workers > 1 a pool of goroutines drains a shared channel so that
-// multiple messages can be decoded in parallel. With workers == 1 the NATS
-// callback decodes inline (no channel overhead, lower latency).
-//
-// The function blocks until ctx is cancelled and all worker goroutines have
-// finished. It returns nil when the NATS client is not configured; callers
-// should treat that as a no-op rather than an error.
 func ReceiveNats(ms *MemoryStore,
 	workers int,
 	ctx context.Context,
@@ -99,13 +76,8 @@ func ReceiveNats(ms *MemoryStore,
 	return nil
 }

-// reorder prepends prefix to buf in-place when buf has enough spare capacity,
-// avoiding an allocation. Falls back to a regular append otherwise.
-//
-// It is used to assemble the "type<type-id>" and "subtype<stype-id>" selector
-// strings when the type tag arrives before the type-id tag in the line, so the
-// two byte slices need to be concatenated in tag-declaration order regardless
-// of wire order.
+// Place `prefix` in front of `buf` but if possible,
+// do that inplace in `buf`.
 func reorder(buf, prefix []byte) []byte {
 	n := len(prefix)
 	m := len(buf)
@@ -123,41 +95,17 @@ func reorder(buf, prefix []byte) []byte {
 	}
 }

-// decodeState holds the per-call scratch buffers used by DecodeLine.
-// Instances are recycled via decodeStatePool to avoid repeated allocations
-// during high-throughput ingestion.
 type decodeState struct {
-	// metricBuf holds a copy of the current measurement name (line-protocol
-	// measurement field). Copied because dec.Measurement() returns a slice
-	// that is invalidated by the next decoder call.
 	metricBuf        []byte
-
-	// selector is the sub-device path passed to WriteToLevel and WALMessage
-	// (e.g. ["socket0"] or ["socket0", "memctrl1"]). Reused across lines.
 	selector         []string
-
-	// typeBuf accumulates the concatenated "type"+"type-id" tag value for the
-	// current line. Reset at the start of each line's tag-decode loop.
 	typeBuf          []byte
-
-	// subTypeBuf accumulates the concatenated "subtype"+"stype-id" tag value.
-	// Reset at the start of each line's tag-decode loop.
 	subTypeBuf       []byte
-
-	// prevTypeBytes / prevTypeStr cache the last seen typeBuf content and its
-	// string conversion. Because consecutive lines in a batch typically address
-	// the same sub-device, the cache hit rate is very high and avoids
-	// repeated []byte→string allocations.
 	prevTypeBytes    []byte
 	prevTypeStr      string
-
-	// prevSubTypeBytes / prevSubTypeStr are the same cache for the subtype.
 	prevSubTypeBytes []byte
 	prevSubTypeStr   string
 }

-// decodeStatePool recycles decodeState values across DecodeLine calls to
-// reduce GC pressure during sustained metric ingestion.
 var decodeStatePool = sync.Pool{
 	New: func() any {
 		return &decodeState{
@@ -169,28 +117,8 @@ var decodeStatePool = sync.Pool{
 	},
 }

-// DecodeLine reads all lines from dec (InfluxDB line-protocol) and writes each
-// decoded metric sample into ms.
-//
-// clusterDefault is used as the cluster name for lines that do not carry a
-// "cluster" tag. Callers typically supply the ClusterTag value from the NATS
-// subscription configuration.
-//
-// Performance notes:
-//   - A decodeState is obtained from decodeStatePool to reuse scratch buffers.
-//   - The Level pointer (host-level node in the metric tree) is cached across
-//     consecutive lines that share the same cluster+host pair to avoid
-//     repeated lock acquisitions on the root and cluster levels.
-//   - []byte→string conversions for type/subtype selectors are cached via
-//     prevType*/prevSubType* fields because batches typically repeat the same
-//     sub-device identifiers.
-//   - Timestamp parsing tries Second precision first; if that fails it retries
-//     Millisecond, Microsecond, and Nanosecond in turn. A missing timestamp
-//     falls back to time.Now().
-//
-// When the checkpoint format is "wal" each successfully decoded sample is also
-// sent to WALMessages so the WAL staging goroutine can persist it durably
-// before the next binary snapshot.
+// Decode lines using dec and make write calls to the MemoryStore.
+// If a line is missing its cluster tag, use clusterDefault as default.
 func DecodeLine(dec *lineprotocol.Decoder,
 	ms *MemoryStore,
 	clusterDefault string,
@@ -347,17 +275,6 @@ func DecodeLine(dec *lineprotocol.Decoder,

 		time := t.Unix()

-		if Keys.Checkpoints.FileFormat == "wal" {
-			WALMessages <- &WALMessage{
-				MetricName: string(st.metricBuf),
-				Cluster:    cluster,
-				Node:       host,
-				Selector:   append([]string{}, st.selector...),
-				Value:      metric.Value,
-				Timestamp:  time,
-			}
-		}
-
 		if err := ms.WriteToLevel(lvl, st.selector, time, []Metric{metric}); err != nil {
 			return err
 		}
--- a/pkg/metricstore/metricstore.go
+++ b/pkg/metricstore/metricstore.go
@@ -8,7 +8,7 @@
 //
 // The package organizes metrics in a tree structure (cluster → host → component) and
 // provides concurrent read/write access to metric data with configurable aggregation strategies.
-// Background goroutines handle periodic checkpointing (JSON or Avro format), archiving old data,
+// Background goroutines handle periodic checkpointing (binary or JSON format), archiving old data,
 // and enforcing retention policies.
 //
 // Key features:
@@ -151,6 +151,12 @@ func Init(rawConfig json.RawMessage, metrics map[string]MetricConfig, wg *sync.W

 	restoreFrom := startupTime.Add(-d)
 	cclog.Infof("[METRICSTORE]> Loading checkpoints newer than %s\n", restoreFrom.Format(time.RFC3339))
+
+	// Lower GC target during loading to prevent excessive heap growth.
+	// During checkpoint loading the heap grows rapidly, causing the GC to
+	// double its target repeatedly. A lower percentage keeps it tighter.
+	oldGCPercent := debug.SetGCPercent(20)
+
 	files, err := ms.FromCheckpointFiles(Keys.Checkpoints.RootDir, restoreFrom.Unix())
 	loadedData := ms.SizeInBytes() / 1024 / 1024 // In MB
 	if err != nil {
@@ -159,20 +165,16 @@ func Init(rawConfig json.RawMessage, metrics map[string]MetricConfig, wg *sync.W
 		cclog.Infof("[METRICSTORE]> Checkpoints loaded (%d files, %d MB, that took %fs)\n", files, loadedData, time.Since(startupTime).Seconds())
 	}

-	// Try to use less memory by forcing a GC run here and then
-	// lowering the target percentage. The default of 100 means
-	// that only once the ratio of new allocations execeds the
-	// previously active heap, a GC is triggered.
-	// Forcing a GC here will set the "previously active heap"
-	// to a minumum.
-	// runtime.GC()
+	// Restore GC target and force a collection to set a tight baseline
+	// for the "previously active heap" size, reducing long-term memory waste.
+	debug.SetGCPercent(oldGCPercent)
+	runtime.GC()

 	ctx, shutdown := context.WithCancel(context.Background())

 	Retention(wg, ctx)
 	Checkpointing(wg, ctx)
 	CleanUp(wg, ctx)
-	WALStaging(wg, ctx)
 	MemoryUsageTracker(wg, ctx)

 	// Note: Signal handling has been removed from this function.
@@ -264,7 +266,7 @@ func (ms *MemoryStore) SetNodeProvider(provider NodeProvider) {
 //
 // The function will:
 //  1. Cancel the context to stop all background workers
-//  2. Close the WAL messages channel if using WAL format
+//  2. Close NATS message channels if using Avro format
 //  3. Write a final checkpoint to preserve in-memory data
 //  4. Log any errors encountered during shutdown
 //
@@ -276,30 +278,10 @@ func Shutdown() {
 		shutdownFunc()
 	}

-	if Keys.Checkpoints.FileFormat == "wal" {
-		close(WALMessages)
-	}
-
 	cclog.Infof("[METRICSTORE]> Writing to '%s'...\n", Keys.Checkpoints.RootDir)
-	var files int
-	var err error
-
 	ms := GetMemoryStore()

-	lastCheckpointMu.Lock()
-	from := lastCheckpoint
-	lastCheckpointMu.Unlock()
-
-	if Keys.Checkpoints.FileFormat == "wal" {
-		var hostDirs []string
-		files, hostDirs, err = ms.ToCheckpointWAL(Keys.Checkpoints.RootDir, from.Unix(), time.Now().Unix())
-		if err == nil {
-			RotateWALFiles(hostDirs)
-		}
-	} else {
-		files, err = ms.ToCheckpoint(Keys.Checkpoints.RootDir, from.Unix(), time.Now().Unix())
-	}
-
+	files, err := ms.ToCheckpoint(Keys.Checkpoints.RootDir, lastCheckpoint.Unix(), time.Now().Unix())
 	if err != nil {
 		cclog.Errorf("[METRICSTORE]> Writing checkpoint failed: %s\n", err.Error())
 	}
@@ -320,7 +302,9 @@ func Shutdown() {
 func Retention(wg *sync.WaitGroup, ctx context.Context) {
 	ms := GetMemoryStore()

-	wg.Go(func() {
+	wg.Add(1)
+	go func() {
+		defer wg.Done()
 		d, err := time.ParseDuration(Keys.RetentionInMemory)
 		if err != nil {
 			cclog.Fatal(err)
@@ -357,12 +341,9 @@ func Retention(wg *sync.WaitGroup, ctx context.Context) {
 				}

 				state.mu.Unlock()
-
-				// Clean up the buffer pool
-				bufferPool.Clean(state.lastRetentionTime)
 			}
 		}
-	})
+	}()
 }

 // MemoryUsageTracker starts a background goroutine that monitors memory usage.
@@ -383,7 +364,9 @@ func Retention(wg *sync.WaitGroup, ctx context.Context) {
 func MemoryUsageTracker(wg *sync.WaitGroup, ctx context.Context) {
 	ms := GetMemoryStore()

-	wg.Go(func() {
+	wg.Add(1)
+	go func() {
+		defer wg.Done()
 		d := DefaultMemoryUsageTrackerInterval

 		if d <= 0 {
@@ -428,9 +411,6 @@ func MemoryUsageTracker(wg *sync.WaitGroup, ctx context.Context) {
 				runtime.ReadMemStats(&mem)
 				actualMemoryGB = float64(mem.Alloc) / 1e9

-				bufferPool.Clear()
-				cclog.Infof("[METRICSTORE]> Cleaned up bufferPool\n")
-
 				if actualMemoryGB > float64(Keys.MemoryCap) {
 					cclog.Warnf("[METRICSTORE]> memory usage %.2f GB exceeds cap %d GB, starting emergency buffer freeing", actualMemoryGB, Keys.MemoryCap)

@@ -472,7 +452,7 @@ func MemoryUsageTracker(wg *sync.WaitGroup, ctx context.Context) {
 				}
 			}
 		}
-	})
+	}()
 }

 // Free removes metric data older than the given time while preserving data for active nodes.
--- a/pkg/metricstore/metricstore_test.go
+++ b/pkg/metricstore/metricstore_test.go
@@ -12,526 +12,6 @@ import (
 	"github.com/ClusterCockpit/cc-lib/v2/schema"
 )

-// ─── Buffer pool ─────────────────────────────────────────────────────────────
-
-// TestBufferPoolGetReuse verifies that Get() returns pooled buffers before
-// allocating new ones, and that an empty pool allocates a fresh BufferCap buffer.
-func TestBufferPoolGetReuse(t *testing.T) {
-	pool := NewPersistentBufferPool()
-
-	original := &buffer{data: make([]schema.Float, 0, BufferCap), lastUsed: time.Now().Unix()}
-	pool.Put(original)
-
-	reused := pool.Get()
-	if reused != original {
-		t.Error("Get() should return the previously pooled buffer")
-	}
-	if pool.GetSize() != 0 {
-		t.Errorf("pool size after Get() = %d, want 0", pool.GetSize())
-	}
-
-	// Empty pool must allocate a fresh buffer with the standard capacity.
-	fresh := pool.Get()
-	if fresh == nil {
-		t.Fatal("Get() from empty pool returned nil")
-	}
-	if cap(fresh.data) != BufferCap {
-		t.Errorf("fresh buffer cap = %d, want %d", cap(fresh.data), BufferCap)
-	}
-}
-
-// TestBufferPoolClear verifies that Clear() drains all entries.
-func TestBufferPoolClear(t *testing.T) {
-	pool := NewPersistentBufferPool()
-	for i := 0; i < 10; i++ {
-		pool.Put(&buffer{data: make([]schema.Float, 0), lastUsed: time.Now().Unix()})
-	}
-	pool.Clear()
-	if pool.GetSize() != 0 {
-		t.Errorf("pool size after Clear() = %d, want 0", pool.GetSize())
-	}
-}
-
-// TestBufferPoolMaxSize verifies that Put() silently drops buffers once the
-// pool reaches maxPoolSize, preventing unbounded memory growth.
-func TestBufferPoolMaxSize(t *testing.T) {
-	pool := NewPersistentBufferPool()
-	for i := 0; i < maxPoolSize; i++ {
-		pool.Put(&buffer{data: make([]schema.Float, 0, BufferCap), lastUsed: time.Now().Unix()})
-	}
-	if pool.GetSize() != maxPoolSize {
-		t.Fatalf("pool size = %d, want %d", pool.GetSize(), maxPoolSize)
-	}
-
-	pool.Put(&buffer{data: make([]schema.Float, 0, BufferCap), lastUsed: time.Now().Unix()})
-	if pool.GetSize() != maxPoolSize {
-		t.Errorf("pool size after overflow Put = %d, want %d (should not grow)", pool.GetSize(), maxPoolSize)
-	}
-}
-
-// ─── Buffer helpers ───────────────────────────────────────────────────────────
-
-// TestBufferEndFirstWrite verifies the end() and firstWrite() calculations.
-func TestBufferEndFirstWrite(t *testing.T) {
-	// start=90, freq=10 → firstWrite = 90+5 = 95
-	b := &buffer{data: make([]schema.Float, 4, BufferCap), frequency: 10, start: 90}
-	if fw := b.firstWrite(); fw != 95 {
-		t.Errorf("firstWrite() = %d, want 95", fw)
-	}
-	// end = firstWrite + len(data)*freq = 95 + 4*10 = 135
-	if e := b.end(); e != 135 {
-		t.Errorf("end() = %d, want 135", e)
-	}
-}
-
-// ─── Buffer write ─────────────────────────────────────────────────────────────
-
-// TestBufferWriteNaNFill verifies that skipped timestamps are filled with NaN.
-func TestBufferWriteNaNFill(t *testing.T) {
-	b := newBuffer(100, 10)
-	b.write(100, schema.Float(1.0))
-	// skip 110 and 120
-	b.write(130, schema.Float(4.0))
-
-	if len(b.data) != 4 {
-		t.Fatalf("len(data) = %d, want 4 (1 value + 2 NaN + 1 value)", len(b.data))
-	}
-	if b.data[0] != schema.Float(1.0) {
-		t.Errorf("data[0] = %v, want 1.0", b.data[0])
-	}
-	if !b.data[1].IsNaN() {
-		t.Errorf("data[1] should be NaN (gap), got %v", b.data[1])
-	}
-	if !b.data[2].IsNaN() {
-		t.Errorf("data[2] should be NaN (gap), got %v", b.data[2])
-	}
-	if b.data[3] != schema.Float(4.0) {
-		t.Errorf("data[3] = %v, want 4.0", b.data[3])
-	}
-}
-
-// TestBufferWriteCapacityOverflow verifies that exceeding capacity creates and
-// links a new buffer rather than panicking or silently dropping data.
-func TestBufferWriteCapacityOverflow(t *testing.T) {
-	// Cap=2 so the third write must overflow into a new buffer.
-	b := &buffer{data: make([]schema.Float, 0, 2), frequency: 10, start: 95}
-
-	nb, _ := b.write(100, schema.Float(1.0))
-	nb, _ = nb.write(110, schema.Float(2.0))
-	nb, err := nb.write(120, schema.Float(3.0))
-	if err != nil {
-		t.Fatalf("write() error = %v", err)
-	}
-	if nb == b {
-		t.Fatal("write() should have returned a new buffer after overflow")
-	}
-	if nb.prev != b {
-		t.Error("new buffer should link back to old via prev")
-	}
-	if b.next != nb {
-		t.Error("old buffer should link forward to new via next")
-	}
-	if len(b.data) != 2 {
-		t.Errorf("old buffer len = %d, want 2 (full)", len(b.data))
-	}
-	if nb.data[0] != schema.Float(3.0) {
-		t.Errorf("new buffer data[0] = %v, want 3.0", nb.data[0])
-	}
-}
-
-// TestBufferWriteOverwrite verifies that writing to an already-occupied index
-// replaces the value rather than appending.
-func TestBufferWriteOverwrite(t *testing.T) {
-	b := newBuffer(100, 10)
-	b.write(100, schema.Float(1.0))
-	b.write(110, schema.Float(2.0))
-
-	// Overwrite the first slot.
-	b.write(100, schema.Float(99.0))
-	if len(b.data) != 2 {
-		t.Errorf("len(data) after overwrite = %d, want 2 (no append)", len(b.data))
-	}
-	if b.data[0] != schema.Float(99.0) {
-		t.Errorf("data[0] after overwrite = %v, want 99.0", b.data[0])
-	}
-}
-
-// ─── Buffer read ──────────────────────────────────────────────────────────────
-
-// TestBufferReadBeforeFirstWrite verifies that 'from' is clamped to firstWrite
-// when the requested range starts before any data in the chain.
-func TestBufferReadBeforeFirstWrite(t *testing.T) {
-	b := newBuffer(100, 10) // firstWrite = 100
-	b.write(100, schema.Float(1.0))
-	b.write(110, schema.Float(2.0))
-
-	data := make([]schema.Float, 10)
-	result, adjustedFrom, _, err := b.read(50, 120, data)
-	if err != nil {
-		t.Fatalf("read() error = %v", err)
-	}
-	if adjustedFrom != 100 {
-		t.Errorf("adjustedFrom = %d, want 100 (clamped to firstWrite)", adjustedFrom)
-	}
-	if len(result) != 2 {
-		t.Errorf("len(result) = %d, want 2", len(result))
-	}
-}
-
-// TestBufferReadChain verifies that read() traverses a multi-buffer chain and
-// returns contiguous values from both buffers.
-//
-// The switch to b.next in read() triggers on idx >= cap(b.data), so b1 must
-// be full (len == cap) for the loop to advance to b2 without producing NaN.
-func TestBufferReadChain(t *testing.T) {
-	// b1: cap=3, covers t=100..120.  b2: covers t=130..150.  b2 is head.
-	b1 := &buffer{data: make([]schema.Float, 0, 3), frequency: 10, start: 95}
-	b1.data = append(b1.data, 1.0, 2.0, 3.0) // fills b1: len=cap=3
-
-	b2 := &buffer{data: make([]schema.Float, 0, 3), frequency: 10, start: 125}
-	b2.data = append(b2.data, 4.0, 5.0, 6.0) // t=130,140,150
-	b2.prev = b1
-	b1.next = b2
-
-	data := make([]schema.Float, 6)
-	result, from, to, err := b2.read(100, 160, data)
-	if err != nil {
-		t.Fatalf("read() error = %v", err)
-	}
-	if from != 100 || to != 160 {
-		t.Errorf("read() from/to = %d/%d, want 100/160", from, to)
-	}
-	if len(result) != 6 {
-		t.Fatalf("len(result) = %d, want 6", len(result))
-	}
-	for i, want := range []schema.Float{1, 2, 3, 4, 5, 6} {
-		if result[i] != want {
-			t.Errorf("result[%d] = %v, want %v", i, result[i], want)
-		}
-	}
-}
-
-// TestBufferReadIdxAfterSwitch is a regression test for the index recalculation
-// bug after switching to b.next during a read.
-//
-// When both buffers share the same start time (can happen with checkpoint-loaded
-// chains), the old code hardcoded idx=0 after the switch, causing reads at time t
-// to return the wrong element from the next buffer.
-func TestBufferReadIdxAfterSwitch(t *testing.T) {
-	// b1: cap=2, both buffers start at 0 (firstWrite=5).
-	// b1 carries t=5 and t=15; b2 carries t=5,15,25,35 with the same start.
-	// When reading reaches t=25 the loop overflows b1 (idx=2 >= cap=2) and
-	// switches to b2. The correct index in b2 is (25-0)/10=2 → b2.data[2]=30.0.
-	// The old code set idx=0 → b2.data[0]=10.0 (wrong).
-	b1 := &buffer{data: make([]schema.Float, 0, 2), frequency: 10, start: 0}
-	b1.data = append(b1.data, schema.Float(1.0), schema.Float(2.0)) // t=5, t=15
-
-	b2 := &buffer{data: make([]schema.Float, 0, 10), frequency: 10, start: 0}
-	b2.data = append(b2.data,
-		schema.Float(10.0), schema.Float(20.0),
-		schema.Float(30.0), schema.Float(40.0)) // t=5,15,25,35
-	b2.prev = b1
-	b1.next = b2
-
-	// from=0 triggers the walkback to b1 (from < b2.firstWrite=5).
-	// After clamping, the loop runs t=5,15,25,35.
-	data := make([]schema.Float, 4)
-	result, _, _, err := b2.read(0, 36, data)
-	if err != nil {
-		t.Fatalf("read() error = %v", err)
-	}
-	if len(result) < 3 {
-		t.Fatalf("len(result) = %d, want >= 3", len(result))
-	}
-	if result[0] != schema.Float(1.0) {
-		t.Errorf("result[0] (t=5) = %v, want 1.0 (from b1)", result[0])
-	}
-	if result[1] != schema.Float(2.0) {
-		t.Errorf("result[1] (t=15) = %v, want 2.0 (from b1)", result[1])
-	}
-	// This is the critical assertion: old code returned 10.0 (b2.data[0]).
-	if result[2] != schema.Float(30.0) {
-		t.Errorf("result[2] (t=25) = %v, want 30.0 (idx recalculation fix)", result[2])
-	}
-}
-
-// TestBufferReadNaNValues verifies that NaN slots written to the buffer are
-// returned as NaN during read.
-func TestBufferReadNaNValues(t *testing.T) {
-	b := newBuffer(100, 10)
-	b.write(100, schema.Float(1.0))
-	b.write(110, schema.NaN)
-	b.write(120, schema.Float(3.0))
-
-	data := make([]schema.Float, 3)
-	result, _, _, err := b.read(100, 130, data)
-	if err != nil {
-		t.Fatalf("read() error = %v", err)
-	}
-	if len(result) != 3 {
-		t.Fatalf("len(result) = %d, want 3", len(result))
-	}
-	if result[0] != schema.Float(1.0) {
-		t.Errorf("result[0] = %v, want 1.0", result[0])
-	}
-	if !result[1].IsNaN() {
-		t.Errorf("result[1] should be NaN, got %v", result[1])
-	}
-	if result[2] != schema.Float(3.0) {
-		t.Errorf("result[2] = %v, want 3.0", result[2])
-	}
-}
-
-// TestBufferReadAccumulation verifies the += accumulation pattern used for
-// aggregation: values are added to whatever was already in the data slice.
-func TestBufferReadAccumulation(t *testing.T) {
-	b := newBuffer(100, 10)
-	b.write(100, schema.Float(3.0))
-	b.write(110, schema.Float(5.0))
-
-	// Pre-populate data slice (simulates a second metric being summed in).
-	data := []schema.Float{2.0, 1.0, 0.0}
-	result, _, _, err := b.read(100, 120, data)
-	if err != nil {
-		t.Fatalf("read() error = %v", err)
-	}
-	// 2.0+3.0=5.0, 1.0+5.0=6.0
-	if result[0] != schema.Float(5.0) {
-		t.Errorf("result[0] = %v, want 5.0 (2+3)", result[0])
-	}
-	if result[1] != schema.Float(6.0) {
-		t.Errorf("result[1] = %v, want 6.0 (1+5)", result[1])
-	}
-}
-
-// ─── Buffer free ─────────────────────────────────────────────────────────────
-
-// newTestPool swaps out the package-level bufferPool for a fresh isolated one
-// and returns a cleanup function that restores the original.
-func newTestPool(t *testing.T) *PersistentBufferPool {
-	t.Helper()
-	pool := NewPersistentBufferPool()
-	saved := bufferPool
-	bufferPool = pool
-	t.Cleanup(func() { bufferPool = saved })
-	return pool
-}
-
-// TestBufferFreeRetention verifies that free() removes buffers whose entire
-// time range falls before the retention threshold and returns them to the pool.
-func TestBufferFreeRetention(t *testing.T) {
-	pool := newTestPool(t)
-
-	// b1: firstWrite=5, end=25  b2: firstWrite=25, end=45  b3: firstWrite=45, end=65
-	b1 := &buffer{data: make([]schema.Float, 0, BufferCap), frequency: 10, start: 0}
-	b1.data = append(b1.data, 1.0, 2.0)
-
-	b2 := &buffer{data: make([]schema.Float, 0, BufferCap), frequency: 10, start: 20}
-	b2.data = append(b2.data, 3.0, 4.0)
-	b2.prev = b1
-	b1.next = b2
-
-	b3 := &buffer{data: make([]schema.Float, 0, BufferCap), frequency: 10, start: 40}
-	b3.data = append(b3.data, 5.0, 6.0)
-	b3.prev = b2
-	b2.next = b3
-
-	// Threshold=30: b1.end()=25 < 30 → freed; b2.end()=45 >= 30 → kept.
-	delme, n := b3.free(30)
-	if delme {
-		t.Error("head buffer b3 should not be marked for deletion")
-	}
-	if n != 1 {
-		t.Errorf("freed count = %d, want 1", n)
-	}
-	if b2.prev != nil {
-		t.Error("b1 should have been unlinked from b2.prev")
-	}
-	if b3.prev != b2 {
-		t.Error("b3 should still reference b2")
-	}
-	if pool.GetSize() != 1 {
-		t.Errorf("pool size = %d, want 1 (b1 returned)", pool.GetSize())
-	}
-}
-
-// TestBufferFreeAll verifies that free() removes all buffers and signals the
-// caller to delete the head when the entire chain is older than the threshold.
-func TestBufferFreeAll(t *testing.T) {
-	pool := newTestPool(t)
-
-	b1 := &buffer{data: make([]schema.Float, 0, BufferCap), frequency: 10, start: 0}
-	b1.data = append(b1.data, 1.0, 2.0) // end=25
-
-	b2 := &buffer{data: make([]schema.Float, 0, BufferCap), frequency: 10, start: 20}
-	b2.data = append(b2.data, 3.0, 4.0) // end=45
-	b2.prev = b1
-	b1.next = b2
-
-	// Threshold=100 > both ends → both should be freed.
-	delme, n := b2.free(100)
-	if !delme {
-		t.Error("head buffer b2 should be marked for deletion when all data is stale")
-	}
-	if n != 2 {
-		t.Errorf("freed count = %d, want 2", n)
-	}
-	// b1 was freed inside free(); b2 is returned with delme=true for the caller.
-	if pool.GetSize() != 1 {
-		t.Errorf("pool size = %d, want 1 (b1 returned; b2 returned by caller)", pool.GetSize())
-	}
-}
-
-// ─── forceFreeOldest ─────────────────────────────────────────────────────────
-
-// TestForceFreeOldestPoolReturn verifies that forceFreeOldest() returns the
-// freed buffer to the pool (regression: previously it was just dropped).
-func TestForceFreeOldestPoolReturn(t *testing.T) {
-	pool := newTestPool(t)
-
-	b1 := &buffer{data: make([]schema.Float, 0, BufferCap), frequency: 10, start: 0}
-	b2 := &buffer{data: make([]schema.Float, 0, BufferCap), frequency: 10, start: 20}
-	b3 := &buffer{data: make([]schema.Float, 0, BufferCap), frequency: 10, start: 40}
-	b1.data = append(b1.data, 1.0)
-	b2.data = append(b2.data, 2.0)
-	b3.data = append(b3.data, 3.0)
-	b2.prev = b1
-	b1.next = b2
-	b3.prev = b2
-	b2.next = b3
-
-	delme, n := b3.forceFreeOldest()
-	if delme {
-		t.Error("head b3 should not be marked for deletion (chain has 3 buffers)")
-	}
-	if n != 1 {
-		t.Errorf("freed count = %d, want 1", n)
-	}
-	if b2.prev != nil {
-		t.Error("b1 should have been unlinked from b2.prev after forceFreeOldest")
-	}
-	if b3.prev != b2 {
-		t.Error("b3 should still link to b2")
-	}
-	if pool.GetSize() != 1 {
-		t.Errorf("pool size = %d, want 1 (b1 returned to pool)", pool.GetSize())
-	}
-}
-
-// TestForceFreeOldestSingleBuffer verifies that forceFreeOldest() returns
-// delme=true when the buffer is the only one in the chain.
-func TestForceFreeOldestSingleBuffer(t *testing.T) {
-	b := newBuffer(100, 10)
-	b.write(100, schema.Float(1.0))
-
-	delme, n := b.forceFreeOldest()
-	if !delme {
-		t.Error("single-buffer chain: expected delme=true (the buffer IS the oldest)")
-	}
-	if n != 1 {
-		t.Errorf("freed count = %d, want 1", n)
-	}
-}
-
-// ─── iterFromTo ───────────────────────────────────────────────────────────────
-
-// TestBufferIterFromToOrder verifies that iterFromTo invokes the callback in
-// chronological order (oldest → newest).
-func TestBufferIterFromToOrder(t *testing.T) {
-	// Each buffer has 2 data points so end() = firstWrite + 2*freq.
-	b1 := &buffer{data: make([]schema.Float, 2, BufferCap), frequency: 10, start: 0}  // end=25
-	b2 := &buffer{data: make([]schema.Float, 2, BufferCap), frequency: 10, start: 20} // end=45
-	b3 := &buffer{data: make([]schema.Float, 2, BufferCap), frequency: 10, start: 40} // end=65
-	b2.prev = b1
-	b1.next = b2
-	b3.prev = b2
-	b2.next = b3
-
-	var order []*buffer
-	err := b3.iterFromTo(0, 100, func(b *buffer) error {
-		order = append(order, b)
-		return nil
-	})
-	if err != nil {
-		t.Fatalf("iterFromTo() error = %v", err)
-	}
-	if len(order) != 3 {
-		t.Fatalf("callback count = %d, want 3", len(order))
-	}
-	if order[0] != b1 || order[1] != b2 || order[2] != b3 {
-		t.Error("iterFromTo() did not call callbacks in chronological (oldest→newest) order")
-	}
-}
-
-// TestBufferIterFromToFiltered verifies that iterFromTo only calls the callback
-// for buffers whose time range overlaps [from, to].
-func TestBufferIterFromToFiltered(t *testing.T) {
-	// b1: end=25  b2: start=20, end=45  b3: start=40, end=65
-	b1 := &buffer{data: make([]schema.Float, 2, BufferCap), frequency: 10, start: 0}
-	b2 := &buffer{data: make([]schema.Float, 2, BufferCap), frequency: 10, start: 20}
-	b3 := &buffer{data: make([]schema.Float, 2, BufferCap), frequency: 10, start: 40}
-	b2.prev = b1
-	b1.next = b2
-	b3.prev = b2
-	b2.next = b3
-
-	// [30,50]: b1.end=25 < 30 → excluded; b2 and b3 overlap → included.
-	var visited []*buffer
-	b3.iterFromTo(30, 50, func(b *buffer) error {
-		visited = append(visited, b)
-		return nil
-	})
-	if len(visited) != 2 {
-		t.Fatalf("visited count = %d, want 2 (b2 and b3)", len(visited))
-	}
-	if visited[0] != b2 || visited[1] != b3 {
-		t.Errorf("visited = %v, want [b2, b3]", visited)
-	}
-}
-
-// TestBufferIterFromToNilBuffer verifies that iterFromTo on a nil buffer is a
-// safe no-op.
-func TestBufferIterFromToNilBuffer(t *testing.T) {
-	var b *buffer
-	called := false
-	err := b.iterFromTo(0, 100, func(_ *buffer) error {
-		called = true
-		return nil
-	})
-	if err != nil {
-		t.Errorf("iterFromTo(nil) error = %v, want nil", err)
-	}
-	if called {
-		t.Error("callback should not be called for a nil buffer")
-	}
-}
-
-// ─── count ────────────────────────────────────────────────────────────────────
-
-// TestBufferCount verifies that count() sums data-point lengths across the
-// entire chain, including all prev links.
-func TestBufferCount(t *testing.T) {
-	b1 := &buffer{data: make([]schema.Float, 3, BufferCap), frequency: 10, start: 0}
-	b2 := &buffer{data: make([]schema.Float, 2, BufferCap), frequency: 10, start: 35}
-	b3 := &buffer{data: make([]schema.Float, 5, BufferCap), frequency: 10, start: 60}
-	b2.prev = b1
-	b1.next = b2
-	b3.prev = b2
-	b2.next = b3
-
-	if got := b3.count(); got != 10 {
-		t.Errorf("count() = %d, want 10 (3+2+5)", got)
-	}
-
-	// Single buffer.
-	lone := &buffer{data: make([]schema.Float, 7, BufferCap)}
-	if got := lone.count(); got != 7 {
-		t.Errorf("count() single buffer = %d, want 7", got)
-	}
-}
-
-// ─── Existing tests below ────────────────────────────────────────────────────
-
 func TestAssignAggregationStrategy(t *testing.T) {
 	tests := []struct {
 		name     string
@@ -984,53 +464,3 @@ func TestBufferHealthChecks(t *testing.T) {
 		})
 	}
 }
-
-func TestBufferPoolClean(t *testing.T) {
-	// Use a fresh pool for testing
-	pool := NewPersistentBufferPool()
-
-	now := time.Now().Unix()
-
-	// Create some buffers and put them in the pool with different lastUsed times
-	b1 := &buffer{lastUsed: now - 3600, data: make([]schema.Float, 0)}   // 1 hour ago
-	b2 := &buffer{lastUsed: now - 7200, data: make([]schema.Float, 0)}   // 2 hours ago
-	b3 := &buffer{lastUsed: now - 180000, data: make([]schema.Float, 0)} // 50 hours ago
-	b4 := &buffer{lastUsed: now - 200000, data: make([]schema.Float, 0)} // 55 hours ago
-	b5 := &buffer{lastUsed: now, data: make([]schema.Float, 0)}
-
-	pool.Put(b1)
-	pool.Put(b2)
-	pool.Put(b3)
-	pool.Put(b4)
-	pool.Put(b5)
-
-	if pool.GetSize() != 5 {
-		t.Fatalf("Expected pool size 5, got %d", pool.GetSize())
-	}
-
-	// Clean buffers older than 48 hours
-	timeUpdate := time.Now().Add(-48 * time.Hour).Unix()
-	pool.Clean(timeUpdate)
-
-	// Expected: b1, b2, b5 should remain. b3, b4 should be cleaned.
-	if pool.GetSize() != 3 {
-		t.Fatalf("Expected pool size 3 after clean, got %d", pool.GetSize())
-	}
-
-	validBufs := map[int64]bool{
-		b1.lastUsed: true,
-		b2.lastUsed: true,
-		b5.lastUsed: true,
-	}
-
-	for i := 0; i < 3; i++ {
-		b := pool.Get()
-		if !validBufs[b.lastUsed] {
-			t.Errorf("Found unexpected buffer with lastUsed %d", b.lastUsed)
-		}
-	}
-
-	if pool.GetSize() != 0 {
-		t.Fatalf("Expected pool to be empty, got %d", pool.GetSize())
-	}
-}
--- a/pkg/metricstore/parquetArchive.go
+++ b/pkg/metricstore/parquetArchive.go
@@ -1,213 +0,0 @@
-// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
-// All rights reserved. This file is part of cc-backend.
-// Use of this source code is governed by a MIT-style
-// license that can be found in the LICENSE file.
-
-package metricstore
-
-import (
-	"bufio"
-	"encoding/binary"
-	"encoding/json"
-	"fmt"
-	"os"
-	"path/filepath"
-
-	cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
-	pq "github.com/parquet-go/parquet-go"
-)
-
-// ParquetMetricRow is the long-format schema for archived metric data.
-// One row per (host, metric, scope, scope_id, timestamp) data point.
-// Sorted by (cluster, hostname, metric, timestamp) for optimal compression.
-type ParquetMetricRow struct {
-	Cluster   string  `parquet:"cluster"`
-	Hostname  string  `parquet:"hostname"`
-	Metric    string  `parquet:"metric"`
-	Scope     string  `parquet:"scope"`
-	ScopeID   string  `parquet:"scope_id"`
-	Timestamp int64   `parquet:"timestamp"`
-	Frequency int64   `parquet:"frequency"`
-	Value     float32 `parquet:"value"`
-}
-
-// flattenCheckpointFile recursively converts a CheckpointFile tree into Parquet rows.
-// The scope path is built from the hierarchy: host level is "node", then child names
-// map to scope/scope_id (e.g., "socket0" → scope="socket", scope_id="0").
-func flattenCheckpointFile(cf *CheckpointFile, cluster, hostname, scope, scopeID string, rows []ParquetMetricRow) []ParquetMetricRow {
-	for metricName, cm := range cf.Metrics {
-		ts := cm.Start
-		for _, v := range cm.Data {
-			if !v.IsNaN() {
-				rows = append(rows, ParquetMetricRow{
-					Cluster:   cluster,
-					Hostname:  hostname,
-					Metric:    metricName,
-					Scope:     scope,
-					ScopeID:   scopeID,
-					Timestamp: ts,
-					Frequency: cm.Frequency,
-					Value:     float32(v),
-				})
-			}
-			ts += cm.Frequency
-		}
-	}
-
-	for childName, childCf := range cf.Children {
-		childScope, childScopeID := parseScopeFromName(childName)
-		rows = flattenCheckpointFile(childCf, cluster, hostname, childScope, childScopeID, rows)
-	}
-
-	return rows
-}
-
-// parseScopeFromName infers scope and scope_id from a child level name.
-// Examples: "socket0" → ("socket", "0"), "core12" → ("core", "12"),
-// "a0" (accelerator) → ("accelerator", "0").
-// If the name doesn't match known patterns, it's used as-is for scope with empty scope_id.
-func parseScopeFromName(name string) (string, string) {
-	prefixes := []struct {
-		prefix string
-		scope  string
-	}{
-		{"socket", "socket"},
-		{"memoryDomain", "memoryDomain"},
-		{"core", "core"},
-		{"hwthread", "hwthread"},
-		{"cpu", "hwthread"},
-		{"accelerator", "accelerator"},
-	}
-
-	for _, p := range prefixes {
-		if len(name) > len(p.prefix) && name[:len(p.prefix)] == p.prefix {
-			id := name[len(p.prefix):]
-			if len(id) > 0 && id[0] >= '0' && id[0] <= '9' {
-				return p.scope, id
-			}
-		}
-	}
-
-	return name, ""
-}
-
-// writeParquetArchive writes rows to a Parquet file with Zstd compression.
-func writeParquetArchive(filename string, rows []ParquetMetricRow) error {
-	if err := os.MkdirAll(filepath.Dir(filename), CheckpointDirPerms); err != nil {
-		return fmt.Errorf("creating archive directory: %w", err)
-	}
-
-	f, err := os.OpenFile(filename, os.O_CREATE|os.O_WRONLY|os.O_TRUNC, CheckpointFilePerms)
-	if err != nil {
-		return fmt.Errorf("creating parquet file: %w", err)
-	}
-	defer f.Close()
-
-	bw := bufio.NewWriterSize(f, 1<<20) // 1MB write buffer
-
-	writer := pq.NewGenericWriter[ParquetMetricRow](bw,
-		pq.Compression(&pq.Zstd),
-		pq.SortingWriterConfig(pq.SortingColumns(
-			pq.Ascending("cluster"),
-			pq.Ascending("hostname"),
-			pq.Ascending("metric"),
-			pq.Ascending("timestamp"),
-		)),
-	)
-
-	if _, err := writer.Write(rows); err != nil {
-		return fmt.Errorf("writing parquet rows: %w", err)
-	}
-
-	if err := writer.Close(); err != nil {
-		return fmt.Errorf("closing parquet writer: %w", err)
-	}
-
-	if err := bw.Flush(); err != nil {
-		return fmt.Errorf("flushing parquet file: %w", err)
-	}
-
-	return nil
-}
-
-// loadCheckpointFileFromDisk reads a JSON or binary checkpoint file and returns
-// a CheckpointFile. Used by the Parquet archiver to read checkpoint data
-// before converting it to Parquet format.
-func loadCheckpointFileFromDisk(filename string) (*CheckpointFile, error) {
-	f, err := os.Open(filename)
-	if err != nil {
-		return nil, err
-	}
-	defer f.Close()
-
-	ext := filepath.Ext(filename)
-	switch ext {
-	case ".json":
-		cf := &CheckpointFile{}
-		br := bufio.NewReader(f)
-		if err := json.NewDecoder(br).Decode(cf); err != nil {
-			return nil, fmt.Errorf("decoding JSON checkpoint %s: %w", filename, err)
-		}
-		return cf, nil
-
-	case ".bin":
-		br := bufio.NewReader(f)
-		var magic uint32
-		if err := binary.Read(br, binary.LittleEndian, &magic); err != nil {
-			return nil, fmt.Errorf("reading magic from %s: %w", filename, err)
-		}
-		if magic != snapFileMagic {
-			return nil, fmt.Errorf("invalid snapshot magic in %s: 0x%08X", filename, magic)
-		}
-		var fileFrom, fileTo int64
-		if err := binary.Read(br, binary.LittleEndian, &fileFrom); err != nil {
-			return nil, fmt.Errorf("reading from-timestamp from %s: %w", filename, err)
-		}
-		if err := binary.Read(br, binary.LittleEndian, &fileTo); err != nil {
-			return nil, fmt.Errorf("reading to-timestamp from %s: %w", filename, err)
-		}
-		cf, err := readBinaryLevel(br)
-		if err != nil {
-			return nil, fmt.Errorf("reading binary level from %s: %w", filename, err)
-		}
-		cf.From = fileFrom
-		cf.To = fileTo
-		return cf, nil
-
-	default:
-		return nil, fmt.Errorf("unsupported checkpoint extension: %s", ext)
-	}
-}
-
-// archiveCheckpointsToParquet reads checkpoint files for a host directory,
-// converts them to Parquet rows. Returns the rows and filenames that were processed.
-func archiveCheckpointsToParquet(dir, cluster, host string, from int64) ([]ParquetMetricRow, []string, error) {
-	entries, err := os.ReadDir(dir)
-	if err != nil {
-		return nil, nil, err
-	}
-
-	files, err := findFiles(entries, from, false)
-	if err != nil {
-		return nil, nil, err
-	}
-
-	if len(files) == 0 {
-		return nil, nil, nil
-	}
-
-	var rows []ParquetMetricRow
-
-	for _, checkpoint := range files {
-		filename := filepath.Join(dir, checkpoint)
-		cf, err := loadCheckpointFileFromDisk(filename)
-		if err != nil {
-			cclog.Warnf("[METRICSTORE]> skipping unreadable checkpoint %s: %v", filename, err)
-			continue
-		}
-
-		rows = flattenCheckpointFile(cf, cluster, host, "node", "", rows)
-	}
-
-	return rows, files, nil
-}
--- a/pkg/metricstore/parquetArchive_test.go
+++ b/pkg/metricstore/parquetArchive_test.go
@@ -1,255 +0,0 @@
-// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
-// All rights reserved. This file is part of cc-backend.
-// Use of this source code is governed by a MIT-style
-// license that can be found in the LICENSE file.
-
-package metricstore
-
-import (
-	"encoding/json"
-	"os"
-	"path/filepath"
-	"testing"
-
-	"github.com/ClusterCockpit/cc-lib/v2/schema"
-	pq "github.com/parquet-go/parquet-go"
-)
-
-func TestParseScopeFromName(t *testing.T) {
-	tests := []struct {
-		name      string
-		wantScope string
-		wantID    string
-	}{
-		{"socket0", "socket", "0"},
-		{"socket12", "socket", "12"},
-		{"core0", "core", "0"},
-		{"core127", "core", "127"},
-		{"cpu0", "hwthread", "0"},
-		{"hwthread5", "hwthread", "5"},
-		{"memoryDomain0", "memoryDomain", "0"},
-		{"accelerator0", "accelerator", "0"},
-		{"unknown", "unknown", ""},
-		{"socketX", "socketX", ""}, // not numeric suffix
-	}
-
-	for _, tt := range tests {
-		t.Run(tt.name, func(t *testing.T) {
-			scope, id := parseScopeFromName(tt.name)
-			if scope != tt.wantScope || id != tt.wantID {
-				t.Errorf("parseScopeFromName(%q) = (%q, %q), want (%q, %q)",
-					tt.name, scope, id, tt.wantScope, tt.wantID)
-			}
-		})
-	}
-}
-
-func TestFlattenCheckpointFile(t *testing.T) {
-	cf := &CheckpointFile{
-		From: 1000,
-		To:   1060,
-		Metrics: map[string]*CheckpointMetrics{
-			"cpu_load": {
-				Frequency: 60,
-				Start:     1000,
-				Data:      []schema.Float{0.5, 0.7, schema.NaN},
-			},
-		},
-		Children: map[string]*CheckpointFile{
-			"socket0": {
-				Metrics: map[string]*CheckpointMetrics{
-					"mem_bw": {
-						Frequency: 60,
-						Start:     1000,
-						Data:      []schema.Float{100.0, schema.NaN, 200.0},
-					},
-				},
-				Children: make(map[string]*CheckpointFile),
-			},
-		},
-	}
-
-	rows := flattenCheckpointFile(cf, "fritz", "node001", "node", "", nil)
-
-	// cpu_load: 2 non-NaN values at node scope
-	// mem_bw: 2 non-NaN values at socket0 scope
-	if len(rows) != 4 {
-		t.Fatalf("expected 4 rows, got %d", len(rows))
-	}
-
-	// Verify a node-scope row
-	found := false
-	for _, r := range rows {
-		if r.Metric == "cpu_load" && r.Timestamp == 1000 {
-			found = true
-			if r.Cluster != "fritz" || r.Hostname != "node001" || r.Scope != "node" || r.Value != 0.5 {
-				t.Errorf("unexpected row: %+v", r)
-			}
-		}
-	}
-	if !found {
-		t.Error("expected cpu_load row at timestamp 1000")
-	}
-
-	// Verify a socket-scope row
-	found = false
-	for _, r := range rows {
-		if r.Metric == "mem_bw" && r.Scope == "socket" && r.ScopeID == "0" {
-			found = true
-		}
-	}
-	if !found {
-		t.Error("expected mem_bw row with scope=socket, scope_id=0")
-	}
-}
-
-func TestParquetArchiveRoundtrip(t *testing.T) {
-	tmpDir := t.TempDir()
-
-	// Create checkpoint files on disk (JSON format)
-	cpDir := filepath.Join(tmpDir, "checkpoints", "testcluster", "node001")
-	if err := os.MkdirAll(cpDir, 0o755); err != nil {
-		t.Fatal(err)
-	}
-
-	cf := &CheckpointFile{
-		From: 1000,
-		To:   1180,
-		Metrics: map[string]*CheckpointMetrics{
-			"cpu_load": {
-				Frequency: 60,
-				Start:     1000,
-				Data:      []schema.Float{0.5, 0.7, 0.9},
-			},
-			"mem_used": {
-				Frequency: 60,
-				Start:     1000,
-				Data:      []schema.Float{45.0, 46.0, 47.0},
-			},
-		},
-		Children: map[string]*CheckpointFile{
-			"socket0": {
-				Metrics: map[string]*CheckpointMetrics{
-					"mem_bw": {
-						Frequency: 60,
-						Start:     1000,
-						Data:      []schema.Float{100.0, 110.0, 120.0},
-					},
-				},
-				Children: make(map[string]*CheckpointFile),
-			},
-		},
-	}
-
-	// Write JSON checkpoint
-	cpFile := filepath.Join(cpDir, "1000.json")
-	data, err := json.Marshal(cf)
-	if err != nil {
-		t.Fatal(err)
-	}
-	if err := os.WriteFile(cpFile, data, 0o644); err != nil {
-		t.Fatal(err)
-	}
-
-	// Archive to Parquet
-	archiveDir := filepath.Join(tmpDir, "archive")
-	rows, files, err := archiveCheckpointsToParquet(cpDir, "testcluster", "node001", 2000)
-	if err != nil {
-		t.Fatal(err)
-	}
-	if len(files) != 1 || files[0] != "1000.json" {
-		t.Fatalf("expected 1 file, got %v", files)
-	}
-
-	parquetFile := filepath.Join(archiveDir, "testcluster", "1000.parquet")
-	if err := writeParquetArchive(parquetFile, rows); err != nil {
-		t.Fatal(err)
-	}
-
-	// Read back and verify
-	f, err := os.Open(parquetFile)
-	if err != nil {
-		t.Fatal(err)
-	}
-	defer f.Close()
-
-	stat, _ := f.Stat()
-	pf, err := pq.OpenFile(f, stat.Size())
-	if err != nil {
-		t.Fatal(err)
-	}
-
-	reader := pq.NewGenericReader[ParquetMetricRow](pf)
-	readRows := make([]ParquetMetricRow, 100)
-	n, err := reader.Read(readRows)
-	if err != nil && n == 0 {
-		t.Fatal(err)
-	}
-	readRows = readRows[:n]
-	reader.Close()
-
-	// We expect: cpu_load(3) + mem_used(3) + mem_bw(3) = 9 rows
-	if n != 9 {
-		t.Fatalf("expected 9 rows in parquet file, got %d", n)
-	}
-
-	// Verify cluster and hostname are set correctly
-	for _, r := range readRows {
-		if r.Cluster != "testcluster" {
-			t.Errorf("expected cluster=testcluster, got %s", r.Cluster)
-		}
-		if r.Hostname != "node001" {
-			t.Errorf("expected hostname=node001, got %s", r.Hostname)
-		}
-	}
-
-	// Verify parquet file is smaller than JSON (compression working)
-	if stat.Size() == 0 {
-		t.Error("parquet file is empty")
-	}
-
-	t.Logf("Parquet file size: %d bytes for %d rows", stat.Size(), n)
-}
-
-func TestLoadCheckpointFileFromDisk_JSON(t *testing.T) {
-	tmpDir := t.TempDir()
-
-	cf := &CheckpointFile{
-		From: 1000,
-		To:   1060,
-		Metrics: map[string]*CheckpointMetrics{
-			"test_metric": {
-				Frequency: 60,
-				Start:     1000,
-				Data:      []schema.Float{1.0, 2.0, 3.0},
-			},
-		},
-		Children: make(map[string]*CheckpointFile),
-	}
-
-	filename := filepath.Join(tmpDir, "1000.json")
-	data, err := json.Marshal(cf)
-	if err != nil {
-		t.Fatal(err)
-	}
-	if err := os.WriteFile(filename, data, 0o644); err != nil {
-		t.Fatal(err)
-	}
-
-	loaded, err := loadCheckpointFileFromDisk(filename)
-	if err != nil {
-		t.Fatal(err)
-	}
-
-	if loaded.From != 1000 || loaded.To != 1060 {
-		t.Errorf("expected From=1000, To=1060, got From=%d, To=%d", loaded.From, loaded.To)
-	}
-
-	m, ok := loaded.Metrics["test_metric"]
-	if !ok {
-		t.Fatal("expected test_metric in loaded checkpoint")
-	}
-	if m.Frequency != 60 || m.Start != 1000 || len(m.Data) != 3 {
-		t.Errorf("unexpected metric data: %+v", m)
-	}
-}
--- a/pkg/metricstore/walCheckpoint.go
+++ b/pkg/metricstore/walCheckpoint.go
@@ -1,787 +0,0 @@
-// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
-// All rights reserved. This file is part of cc-backend.
-// Use of this source code is governed by a MIT-style
-// license that can be found in the LICENSE file.
-
-// Package metricstore provides walCheckpoint.go: WAL-based checkpoint implementation.
-//
-// This replaces the Avro shadow tree with an append-only Write-Ahead Log (WAL)
-// per host, eliminating the extra memory overhead of the AvroStore and providing
-// truly continuous (per-write) crash safety.
-//
-// # Architecture
-//
-//	Metric write (DecodeLine)
-//	      │
-//	      ├─► WriteToLevel()    → main MemoryStore (unchanged)
-//	      │
-//	      └─► WALMessages channel
-//	               │
-//	               ▼
-//	        WALStaging goroutine
-//	               │
-//	               ▼
-//	        checkpoints/cluster/host/current.wal  (append-only, binary)
-//
-//	Periodic checkpoint (Checkpointing goroutine):
-//	  1. Write <timestamp>.bin snapshot (column-oriented, from main tree)
-//	  2. Signal WALStaging to truncate current.wal per host
-//
-//	On restart (FromCheckpoint):
-//	  1. Load most recent <timestamp>.bin snapshot
-//	  2. Replay current.wal (overwrite-safe: buffer.write handles duplicate timestamps)
-//
-// # WAL Record Format
-//
-//	[4B magic 0xCC1DA7A1][4B payload_len][payload][4B CRC32]
-//
-//	payload:
-//	  [8B timestamp int64]
-//	  [2B metric_name_len uint16][N metric name bytes]
-//	  [1B selector_count uint8]
-//	  per selector: [1B selector_len uint8][M selector bytes]
-//	  [4B value float32 bits]
-//
-// # Binary Snapshot Format
-//
-//	[4B magic 0xCC5B0001][8B from int64][8B to int64]
-//	Level tree (recursive):
-//	  [4B num_metrics uint32]
-//	  per metric:
-//	    [2B name_len uint16][N name bytes]
-//	    [8B frequency int64][8B start int64]
-//	    [4B num_values uint32][num_values × 4B float32]
-//	  [4B num_children uint32]
-//	  per child: [2B name_len uint16][N name bytes] + Level (recursive)
-package metricstore
-
-import (
-	"bufio"
-	"context"
-	"encoding/binary"
-	"fmt"
-	"hash/crc32"
-	"io"
-	"math"
-	"os"
-	"path"
-	"strings"
-	"sync"
-	"sync/atomic"
-
-	cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
-	"github.com/ClusterCockpit/cc-lib/v2/schema"
-)
-
-// Magic numbers for binary formats.
-const (
-	walFileMagic   = uint32(0xCC1DA701) // WAL file header magic
-	walRecordMagic = uint32(0xCC1DA7A1) // WAL record magic
-	snapFileMagic  = uint32(0xCC5B0001) // Binary snapshot magic
-)
-
-// WALMessages is the channel for sending metric writes to the WAL staging goroutine.
-// Buffered to allow burst writes without blocking the metric ingestion path.
-var WALMessages = make(chan *WALMessage, 4096)
-
-// walRotateCh is used by the checkpoint goroutine to request WAL file rotation
-// (close, delete, reopen) after a binary snapshot has been written.
-var walRotateCh = make(chan walRotateReq, 256)
-
-// WALMessage represents a single metric write to be appended to the WAL.
-// Cluster and Node are NOT stored in the WAL record (inferred from file path).
-type WALMessage struct {
-	MetricName string
-	Cluster    string
-	Node       string
-	Selector   []string
-	Value      schema.Float
-	Timestamp  int64
-}
-
-// walRotateReq requests WAL file rotation for a specific host directory.
-// The done channel is closed by the WAL goroutine when rotation is complete.
-type walRotateReq struct {
-	hostDir string
-	done    chan struct{}
-}
-
-// walFileState holds an open WAL file handle for one host directory.
-type walFileState struct {
-	f *os.File
-}
-
-// WALStaging starts a background goroutine that receives WALMessage items
-// and appends binary WAL records to per-host current.wal files.
-// Also handles WAL rotation requests from the checkpoint goroutine.
-func WALStaging(wg *sync.WaitGroup, ctx context.Context) {
-	wg.Go(func() {
-
-		if Keys.Checkpoints.FileFormat == "json" {
-			return
-		}
-
-		hostFiles := make(map[string]*walFileState)
-
-		defer func() {
-			for _, ws := range hostFiles {
-				if ws.f != nil {
-					ws.f.Close()
-				}
-			}
-		}()
-
-		getOrOpenWAL := func(hostDir string) *os.File {
-			ws, ok := hostFiles[hostDir]
-			if ok {
-				return ws.f
-			}
-
-			if err := os.MkdirAll(hostDir, CheckpointDirPerms); err != nil {
-				cclog.Errorf("[METRICSTORE]> WAL: mkdir %s: %v", hostDir, err)
-				return nil
-			}
-
-			walPath := path.Join(hostDir, "current.wal")
-			f, err := os.OpenFile(walPath, os.O_CREATE|os.O_APPEND|os.O_WRONLY, CheckpointFilePerms)
-			if err != nil {
-				cclog.Errorf("[METRICSTORE]> WAL: open %s: %v", walPath, err)
-				return nil
-			}
-
-			// Write file header magic if file is new (empty).
-			info, err := f.Stat()
-			if err == nil && info.Size() == 0 {
-				var hdr [4]byte
-				binary.LittleEndian.PutUint32(hdr[:], walFileMagic)
-				if _, err := f.Write(hdr[:]); err != nil {
-					cclog.Errorf("[METRICSTORE]> WAL: write header %s: %v", walPath, err)
-					f.Close()
-					return nil
-				}
-			}
-
-			hostFiles[hostDir] = &walFileState{f: f}
-			return f
-		}
-
-		processMsg := func(msg *WALMessage) {
-			hostDir := path.Join(Keys.Checkpoints.RootDir, msg.Cluster, msg.Node)
-			f := getOrOpenWAL(hostDir)
-			if f == nil {
-				return
-			}
-			if err := writeWALRecord(f, msg); err != nil {
-				cclog.Errorf("[METRICSTORE]> WAL: write record: %v", err)
-			}
-		}
-
-		processRotate := func(req walRotateReq) {
-			ws, ok := hostFiles[req.hostDir]
-			if ok && ws.f != nil {
-				ws.f.Close()
-				walPath := path.Join(req.hostDir, "current.wal")
-				if err := os.Remove(walPath); err != nil && !os.IsNotExist(err) {
-					cclog.Errorf("[METRICSTORE]> WAL: remove %s: %v", walPath, err)
-				}
-				delete(hostFiles, req.hostDir)
-			}
-			close(req.done)
-		}
-
-		drain := func() {
-			for {
-				select {
-				case msg, ok := <-WALMessages:
-					if !ok {
-						return
-					}
-					processMsg(msg)
-				case req := <-walRotateCh:
-					processRotate(req)
-				default:
-					return
-				}
-			}
-		}
-
-		for {
-			select {
-			case <-ctx.Done():
-				drain()
-				return
-			case msg, ok := <-WALMessages:
-				if !ok {
-					return
-				}
-				processMsg(msg)
-			case req := <-walRotateCh:
-				processRotate(req)
-			}
-		}
-	})
-}
-
-// RotateWALFiles sends rotation requests for the given host directories
-// and blocks until all rotations complete.
-func RotateWALFiles(hostDirs []string) {
-	dones := make([]chan struct{}, len(hostDirs))
-	for i, dir := range hostDirs {
-		dones[i] = make(chan struct{})
-		walRotateCh <- walRotateReq{hostDir: dir, done: dones[i]}
-	}
-	for _, done := range dones {
-		<-done
-	}
-}
-
-// buildWALPayload encodes a WALMessage into a binary payload (without magic/length/CRC).
-func buildWALPayload(msg *WALMessage) []byte {
-	size := 8 + 2 + len(msg.MetricName) + 1 + 4
-	for _, s := range msg.Selector {
-		size += 1 + len(s)
-	}
-
-	buf := make([]byte, 0, size)
-
-	// Timestamp (8 bytes, little-endian int64)
-	var ts [8]byte
-	binary.LittleEndian.PutUint64(ts[:], uint64(msg.Timestamp))
-	buf = append(buf, ts[:]...)
-
-	// Metric name (2-byte length prefix + bytes)
-	var mLen [2]byte
-	binary.LittleEndian.PutUint16(mLen[:], uint16(len(msg.MetricName)))
-	buf = append(buf, mLen[:]...)
-	buf = append(buf, msg.MetricName...)
-
-	// Selector count (1 byte)
-	buf = append(buf, byte(len(msg.Selector)))
-
-	// Selectors (1-byte length prefix + bytes each)
-	for _, sel := range msg.Selector {
-		buf = append(buf, byte(len(sel)))
-		buf = append(buf, sel...)
-	}
-
-	// Value (4 bytes, float32 bit representation)
-	var val [4]byte
-	binary.LittleEndian.PutUint32(val[:], math.Float32bits(float32(msg.Value)))
-	buf = append(buf, val[:]...)
-
-	return buf
-}
-
-// writeWALRecord appends a binary WAL record to the file.
-// Format: [4B magic][4B payload_len][payload][4B CRC32]
-func writeWALRecord(f *os.File, msg *WALMessage) error {
-	payload := buildWALPayload(msg)
-	crc := crc32.ChecksumIEEE(payload)
-
-	record := make([]byte, 0, 4+4+len(payload)+4)
-
-	var magic [4]byte
-	binary.LittleEndian.PutUint32(magic[:], walRecordMagic)
-	record = append(record, magic[:]...)
-
-	var pLen [4]byte
-	binary.LittleEndian.PutUint32(pLen[:], uint32(len(payload)))
-	record = append(record, pLen[:]...)
-
-	record = append(record, payload...)
-
-	var crcBytes [4]byte
-	binary.LittleEndian.PutUint32(crcBytes[:], crc)
-	record = append(record, crcBytes[:]...)
-
-	_, err := f.Write(record)
-	return err
-}
-
-// readWALRecord reads one WAL record from the reader.
-// Returns (nil, nil) on clean EOF. Returns error on data corruption.
-// A CRC mismatch indicates a truncated trailing record (expected on crash).
-func readWALRecord(r io.Reader) (*WALMessage, error) {
-	var magic uint32
-	if err := binary.Read(r, binary.LittleEndian, &magic); err != nil {
-		if err == io.EOF {
-			return nil, nil // Clean EOF
-		}
-		return nil, fmt.Errorf("read record magic: %w", err)
-	}
-
-	if magic != walRecordMagic {
-		return nil, fmt.Errorf("invalid record magic 0x%08X (expected 0x%08X)", magic, walRecordMagic)
-	}
-
-	var payloadLen uint32
-	if err := binary.Read(r, binary.LittleEndian, &payloadLen); err != nil {
-		return nil, fmt.Errorf("read payload length: %w", err)
-	}
-
-	if payloadLen > 1<<20 { // 1 MB sanity limit
-		return nil, fmt.Errorf("record payload too large: %d bytes", payloadLen)
-	}
-
-	payload := make([]byte, payloadLen)
-	if _, err := io.ReadFull(r, payload); err != nil {
-		return nil, fmt.Errorf("read payload: %w", err)
-	}
-
-	var storedCRC uint32
-	if err := binary.Read(r, binary.LittleEndian, &storedCRC); err != nil {
-		return nil, fmt.Errorf("read CRC: %w", err)
-	}
-
-	if crc32.ChecksumIEEE(payload) != storedCRC {
-		return nil, fmt.Errorf("CRC mismatch (truncated write or corruption)")
-	}
-
-	return parseWALPayload(payload)
-}
-
-// parseWALPayload decodes a binary payload into a WALMessage.
-func parseWALPayload(payload []byte) (*WALMessage, error) {
-	if len(payload) < 8+2+1+4 {
-		return nil, fmt.Errorf("payload too short: %d bytes", len(payload))
-	}
-
-	offset := 0
-
-	// Timestamp (8 bytes)
-	ts := int64(binary.LittleEndian.Uint64(payload[offset : offset+8]))
-	offset += 8
-
-	// Metric name (2-byte length + bytes)
-	if offset+2 > len(payload) {
-		return nil, fmt.Errorf("metric name length overflows payload")
-	}
-	mLen := int(binary.LittleEndian.Uint16(payload[offset : offset+2]))
-	offset += 2
-
-	if offset+mLen > len(payload) {
-		return nil, fmt.Errorf("metric name overflows payload")
-	}
-	metricName := string(payload[offset : offset+mLen])
-	offset += mLen
-
-	// Selector count (1 byte)
-	if offset >= len(payload) {
-		return nil, fmt.Errorf("selector count overflows payload")
-	}
-	selCount := int(payload[offset])
-	offset++
-
-	selectors := make([]string, selCount)
-	for i := range selCount {
-		if offset >= len(payload) {
-			return nil, fmt.Errorf("selector[%d] length overflows payload", i)
-		}
-		sLen := int(payload[offset])
-		offset++
-
-		if offset+sLen > len(payload) {
-			return nil, fmt.Errorf("selector[%d] data overflows payload", i)
-		}
-		selectors[i] = string(payload[offset : offset+sLen])
-		offset += sLen
-	}
-
-	// Value (4 bytes, float32 bits)
-	if offset+4 > len(payload) {
-		return nil, fmt.Errorf("value overflows payload")
-	}
-	bits := binary.LittleEndian.Uint32(payload[offset : offset+4])
-	value := schema.Float(math.Float32frombits(bits))
-
-	return &WALMessage{
-		MetricName: metricName,
-		Timestamp:  ts,
-		Selector:   selectors,
-		Value:      value,
-	}, nil
-}
-
-// loadWALFile reads a WAL file and replays all valid records into the Level tree.
-// l is the host-level node. Corrupt or partial trailing records are silently skipped
-// (expected on crash). Records older than 'from' are skipped.
-func (l *Level) loadWALFile(m *MemoryStore, f *os.File, from int64) error {
-	br := bufio.NewReader(f)
-
-	// Verify file header magic.
-	var fileMagic uint32
-	if err := binary.Read(br, binary.LittleEndian, &fileMagic); err != nil {
-		if err == io.EOF {
-			return nil // Empty file, no data
-		}
-		return fmt.Errorf("[METRICSTORE]> WAL: read file header: %w", err)
-	}
-
-	if fileMagic != walFileMagic {
-		return fmt.Errorf("[METRICSTORE]> WAL: invalid file magic 0x%08X (expected 0x%08X)", fileMagic, walFileMagic)
-	}
-
-	// Cache level lookups to avoid repeated tree traversal.
-	lvlCache := make(map[string]*Level)
-
-	for {
-		msg, err := readWALRecord(br)
-		if err != nil {
-			// Truncated trailing record is expected after a crash; stop replaying.
-			cclog.Debugf("[METRICSTORE]> WAL: stopping replay at corrupted/partial record: %v", err)
-			break
-		}
-		if msg == nil {
-			break // Clean EOF
-		}
-
-		if msg.Timestamp < from {
-			continue // Older than retention window
-		}
-
-		minfo, ok := m.Metrics[msg.MetricName]
-		if !ok {
-			continue // Unknown metric (config may have changed)
-		}
-
-		// Cache key is the null-separated selector path.
-		cacheKey := joinSelector(msg.Selector)
-		lvl, ok := lvlCache[cacheKey]
-		if !ok {
-			lvl = l.findLevelOrCreate(msg.Selector, len(m.Metrics))
-			lvlCache[cacheKey] = lvl
-		}
-
-		// Write directly to the buffer, same as WriteToLevel but without the
-		// global level lookup (we already have the right level).
-		lvl.lock.Lock()
-		b := lvl.metrics[minfo.offset]
-		if b == nil {
-			b = newBuffer(msg.Timestamp, minfo.Frequency)
-			lvl.metrics[minfo.offset] = b
-		}
-		nb, writeErr := b.write(msg.Timestamp, msg.Value)
-		if writeErr == nil && b != nb {
-			lvl.metrics[minfo.offset] = nb
-		}
-		// Ignore write errors for timestamps before buffer start (can happen when
-		// replaying WAL entries that predate a loaded snapshot's start time).
-		lvl.lock.Unlock()
-	}
-
-	return nil
-}
-
-// joinSelector builds a cache key from a selector slice using null bytes as separators.
-func joinSelector(sel []string) string {
-	if len(sel) == 0 {
-		return ""
-	}
-	var result strings.Builder
-	result.WriteString(sel[0])
-	for i := 1; i < len(sel); i++ {
-		result.WriteString("\x00" + sel[i])
-	}
-	return result.String()
-}
-
-// ToCheckpointWAL writes binary snapshot files for all hosts in parallel.
-// Returns the number of files written, the list of host directories that were
-// successfully checkpointed (for WAL rotation), and any errors.
-func (m *MemoryStore) ToCheckpointWAL(dir string, from, to int64) (int, []string, error) {
-	// Collect all cluster/host pairs.
-	m.root.lock.RLock()
-	totalHosts := 0
-	for _, l1 := range m.root.children {
-		l1.lock.RLock()
-		totalHosts += len(l1.children)
-		l1.lock.RUnlock()
-	}
-	m.root.lock.RUnlock()
-
-	levels := make([]*Level, 0, totalHosts)
-	selectors := make([][]string, 0, totalHosts)
-
-	m.root.lock.RLock()
-	for sel1, l1 := range m.root.children {
-		l1.lock.RLock()
-		for sel2, l2 := range l1.children {
-			levels = append(levels, l2)
-			selectors = append(selectors, []string{sel1, sel2})
-		}
-		l1.lock.RUnlock()
-	}
-	m.root.lock.RUnlock()
-
-	type workItem struct {
-		level    *Level
-		hostDir  string
-		selector []string
-	}
-
-	n, errs := int32(0), int32(0)
-	var successDirs []string
-	var successMu sync.Mutex
-
-	var wg sync.WaitGroup
-	wg.Add(Keys.NumWorkers)
-	work := make(chan workItem, Keys.NumWorkers*2)
-
-	for range Keys.NumWorkers {
-		go func() {
-			defer wg.Done()
-			for wi := range work {
-				err := wi.level.toCheckpointBinary(wi.hostDir, from, to, m)
-				if err != nil {
-					if err == ErrNoNewArchiveData {
-						continue
-					}
-					cclog.Errorf("[METRICSTORE]> binary checkpoint error for %s: %v", wi.hostDir, err)
-					atomic.AddInt32(&errs, 1)
-				} else {
-					atomic.AddInt32(&n, 1)
-					successMu.Lock()
-					successDirs = append(successDirs, wi.hostDir)
-					successMu.Unlock()
-				}
-			}
-		}()
-	}
-
-	for i := range levels {
-		hostDir := path.Join(dir, path.Join(selectors[i]...))
-		work <- workItem{
-			level:    levels[i],
-			hostDir:  hostDir,
-			selector: selectors[i],
-		}
-	}
-	close(work)
-	wg.Wait()
-
-	if errs > 0 {
-		return int(n), successDirs, fmt.Errorf("[METRICSTORE]> %d errors during binary checkpoint (%d successes)", errs, n)
-	}
-	return int(n), successDirs, nil
-}
-
-// toCheckpointBinary writes a binary snapshot file for a single host-level node.
-// Uses atomic rename (write to .tmp then rename) to avoid partial reads on crash.
-func (l *Level) toCheckpointBinary(dir string, from, to int64, m *MemoryStore) error {
-	cf, err := l.toCheckpointFile(from, to, m)
-	if err != nil {
-		return err
-	}
-	if cf == nil {
-		return ErrNoNewArchiveData
-	}
-
-	if err := os.MkdirAll(dir, CheckpointDirPerms); err != nil {
-		return fmt.Errorf("mkdir %s: %w", dir, err)
-	}
-
-	// Write to a temp file first, then rename (atomic on POSIX).
-	tmpPath := path.Join(dir, fmt.Sprintf("%d.bin.tmp", from))
-	finalPath := path.Join(dir, fmt.Sprintf("%d.bin", from))
-
-	f, err := os.OpenFile(tmpPath, os.O_CREATE|os.O_WRONLY|os.O_TRUNC, CheckpointFilePerms)
-	if err != nil {
-		return fmt.Errorf("open binary snapshot %s: %w", tmpPath, err)
-	}
-
-	bw := bufio.NewWriter(f)
-	if err := writeBinarySnapshotFile(bw, cf); err != nil {
-		f.Close()
-		os.Remove(tmpPath)
-		return fmt.Errorf("write binary snapshot: %w", err)
-	}
-	if err := bw.Flush(); err != nil {
-		f.Close()
-		os.Remove(tmpPath)
-		return err
-	}
-	f.Close()
-
-	return os.Rename(tmpPath, finalPath)
-}
-
-// writeBinarySnapshotFile writes the binary snapshot file header and level tree.
-func writeBinarySnapshotFile(w io.Writer, cf *CheckpointFile) error {
-	if err := binary.Write(w, binary.LittleEndian, snapFileMagic); err != nil {
-		return err
-	}
-	if err := binary.Write(w, binary.LittleEndian, cf.From); err != nil {
-		return err
-	}
-	if err := binary.Write(w, binary.LittleEndian, cf.To); err != nil {
-		return err
-	}
-	return writeBinaryLevel(w, cf)
-}
-
-// writeBinaryLevel recursively writes a CheckpointFile level in binary format.
-func writeBinaryLevel(w io.Writer, cf *CheckpointFile) error {
-	if err := binary.Write(w, binary.LittleEndian, uint32(len(cf.Metrics))); err != nil {
-		return err
-	}
-
-	for name, metric := range cf.Metrics {
-		if err := writeString16(w, name); err != nil {
-			return err
-		}
-		if err := binary.Write(w, binary.LittleEndian, metric.Frequency); err != nil {
-			return err
-		}
-		if err := binary.Write(w, binary.LittleEndian, metric.Start); err != nil {
-			return err
-		}
-		if err := binary.Write(w, binary.LittleEndian, uint32(len(metric.Data))); err != nil {
-			return err
-		}
-		for _, v := range metric.Data {
-			if err := binary.Write(w, binary.LittleEndian, math.Float32bits(float32(v))); err != nil {
-				return err
-			}
-		}
-	}
-
-	if err := binary.Write(w, binary.LittleEndian, uint32(len(cf.Children))); err != nil {
-		return err
-	}
-
-	for name, child := range cf.Children {
-		if err := writeString16(w, name); err != nil {
-			return err
-		}
-		if err := writeBinaryLevel(w, child); err != nil {
-			return err
-		}
-	}
-
-	return nil
-}
-
-// writeString16 writes a 2-byte length-prefixed string to w.
-func writeString16(w io.Writer, s string) error {
-	if err := binary.Write(w, binary.LittleEndian, uint16(len(s))); err != nil {
-		return err
-	}
-	_, err := io.WriteString(w, s)
-	return err
-}
-
-// loadBinaryFile reads a binary snapshot file and loads data into the Level tree.
-// The retention check (from) is applied to the file's 'to' timestamp.
-func (l *Level) loadBinaryFile(m *MemoryStore, f *os.File, from int64) error {
-	br := bufio.NewReader(f)
-
-	var magic uint32
-	if err := binary.Read(br, binary.LittleEndian, &magic); err != nil {
-		return fmt.Errorf("[METRICSTORE]> binary snapshot: read magic: %w", err)
-	}
-	if magic != snapFileMagic {
-		return fmt.Errorf("[METRICSTORE]> binary snapshot: invalid magic 0x%08X (expected 0x%08X)", magic, snapFileMagic)
-	}
-
-	var fileFrom, fileTo int64
-	if err := binary.Read(br, binary.LittleEndian, &fileFrom); err != nil {
-		return fmt.Errorf("[METRICSTORE]> binary snapshot: read from: %w", err)
-	}
-	if err := binary.Read(br, binary.LittleEndian, &fileTo); err != nil {
-		return fmt.Errorf("[METRICSTORE]> binary snapshot: read to: %w", err)
-	}
-
-	if fileTo != 0 && fileTo < from {
-		return nil // File is older than retention window, skip it
-	}
-
-	cf, err := readBinaryLevel(br)
-	if err != nil {
-		return fmt.Errorf("[METRICSTORE]> binary snapshot: read level tree: %w", err)
-	}
-	cf.From = fileFrom
-	cf.To = fileTo
-
-	return l.loadFile(cf, m)
-}
-
-// readBinaryLevel recursively reads a level from the binary snapshot format.
-func readBinaryLevel(r io.Reader) (*CheckpointFile, error) {
-	cf := &CheckpointFile{
-		Metrics:  make(map[string]*CheckpointMetrics),
-		Children: make(map[string]*CheckpointFile),
-	}
-
-	var numMetrics uint32
-	if err := binary.Read(r, binary.LittleEndian, &numMetrics); err != nil {
-		return nil, fmt.Errorf("read num_metrics: %w", err)
-	}
-
-	for range numMetrics {
-		name, err := readString16(r)
-		if err != nil {
-			return nil, fmt.Errorf("read metric name: %w", err)
-		}
-
-		var freq, start int64
-		if err := binary.Read(r, binary.LittleEndian, &freq); err != nil {
-			return nil, fmt.Errorf("read frequency for %s: %w", name, err)
-		}
-		if err := binary.Read(r, binary.LittleEndian, &start); err != nil {
-			return nil, fmt.Errorf("read start for %s: %w", name, err)
-		}
-
-		var numValues uint32
-		if err := binary.Read(r, binary.LittleEndian, &numValues); err != nil {
-			return nil, fmt.Errorf("read num_values for %s: %w", name, err)
-		}
-
-		data := make([]schema.Float, numValues)
-		for i := range numValues {
-			var bits uint32
-			if err := binary.Read(r, binary.LittleEndian, &bits); err != nil {
-				return nil, fmt.Errorf("read value[%d] for %s: %w", i, name, err)
-			}
-			data[i] = schema.Float(math.Float32frombits(bits))
-		}
-
-		cf.Metrics[name] = &CheckpointMetrics{
-			Frequency: freq,
-			Start:     start,
-			Data:      data,
-		}
-	}
-
-	var numChildren uint32
-	if err := binary.Read(r, binary.LittleEndian, &numChildren); err != nil {
-		return nil, fmt.Errorf("read num_children: %w", err)
-	}
-
-	for range numChildren {
-		childName, err := readString16(r)
-		if err != nil {
-			return nil, fmt.Errorf("read child name: %w", err)
-		}
-
-		child, err := readBinaryLevel(r)
-		if err != nil {
-			return nil, fmt.Errorf("read child %s: %w", childName, err)
-		}
-		cf.Children[childName] = child
-	}
-
-	return cf, nil
-}
-
-// readString16 reads a 2-byte length-prefixed string from r.
-func readString16(r io.Reader) (string, error) {
-	var sLen uint16
-	if err := binary.Read(r, binary.LittleEndian, &sLen); err != nil {
-		return "", err
-	}
-	buf := make([]byte, sLen)
-	if _, err := io.ReadFull(r, buf); err != nil {
-		return "", err
-	}
-	return string(buf), nil
-}
Author	SHA1	Message	Date
Jan Eitzinger	23ce1722a9	Merge branch 'dev' into optimize-checkpoint-loading	2026-02-27 09:31:50 +01:00
Jan Eitzinger	9c0104a252	Attempt to minimize heap allocations by using caching	2026-02-25 07:52:02 +01:00
Jan Eitzinger	3555fb6255	Merge branch 'dev' into optimize-checkpoint-loading	2026-02-25 07:39:38 +01:00
Jan Eitzinger	62d2143979	Merge branch 'dev' into optimize-checkpoint-loading	2026-02-23 14:21:32 +01:00
Jan Eitzinger	9fc1836c30	Replace avro checkpoints with native binary format	2026-02-23 14:21:17 +01:00
Jan Eitzinger	b6c574c7ec	Merge branch 'dev' into optimize-checkpoint-loading	2026-02-23 08:53:26 +01:00
Jan Eitzinger	8b7cb587c5	Modernize code	2026-02-19 09:01:54 +01:00
Jan Eitzinger	415467967d	Apply optimizations to checkpoint loading	2026-02-19 08:40:37 +01:00