Merge pull request #525 from ClusterCockpit/hotfix

Hotfix
2026-05-24 13:47:30 +02:00 · 2026-03-18 19:31:05 +01:00
parent a7e5ecaf6c 09501df3c2
commit d4a0ae173f
8 changed files with 251 additions and 152 deletions
--- a/CLAUDE.md
+++ b/CLAUDE.md
@@ -229,6 +229,7 @@ The backend supports a NATS-based API as an alternative to the REST API for job
 ### Setup
 1. Configure NATS client connection in `config.json`:
   ```json
   {
     "nats": {
@@ -240,6 +241,7 @@ The backend supports a NATS-based API as an alternative to the REST API for job
   ```
 2. Configure API subjects in `config.json` under `main`:
   ```json
   {
     "main": {
@@ -252,6 +254,7 @@ The backend supports a NATS-based API as an alternative to the REST API for job
     }
   }
   ```
   - `subject-job-event` (required): NATS subject for job start/stop events
   - `subject-node-state` (required): NATS subject for node state updates
   - `job-concurrency` (optional, default: 8): Number of concurrent worker goroutines for job events
@@ -264,19 +267,23 @@ Messages use **InfluxDB line protocol** format with the following structure:
 #### Job Events
 **Start Job:**
 ```
 job,function=start_job event="{\"jobId\":123,\"user\":\"alice\",\"cluster\":\"test\", ...}" 1234567890000000000
 ```
 **Stop Job:**
 ```
 job,function=stop_job event="{\"jobId\":123,\"cluster\":\"test\",\"startTime\":1234567890,\"stopTime\":1234571490,\"jobState\":\"completed\"}" 1234571490000000000
 ```
 **Tags:**
 - `function`: Either `start_job` or `stop_job`
 **Fields:**
 - `event`: JSON payload containing job data (see REST API documentation for schema)
 #### Node State Updates
@@ -307,9 +314,31 @@ job,function=stop_job event="{\"jobId\":123,\"cluster\":\"test\",\"startTime\":1
 - Messages are logged; no responses are sent back to publishers
 - If NATS client is unavailable, API subscriptions are skipped (logged as warning)
 ## Development Guidelines
 ### Performance
 This application processes large volumes of HPC monitoring data (metrics, job
 records, archives) at scale. All code changes must prioritize maximum throughput
 and minimal latency. Avoid unnecessary allocations, prefer streaming over
 buffering, and be mindful of lock contention. When in doubt, benchmark.
 ### Change Impact Analysis
 For any significant change, you MUST:
 1. **Check all call paths**: Trace every caller of modified functions to ensure
   correctness is preserved throughout the call chain.
 2. **Evaluate side effects**: Identify and verify all side effects — database
   writes, cache invalidations, channel sends, goroutine lifecycle changes, file
   I/O, and external API calls.
 3. **Consider concurrency implications**: This codebase uses goroutines and
   channels extensively. Verify that changes do not introduce races, deadlocks,
   or contention bottlenecks.
 ## Dependencies
- Go 1.24.0+ (check go.mod for exact version)
+- Go 1.25.0+ (check go.mod for exact version)
 - Node.js (for frontend builds)
 - SQLite 3 (only supported database)
 - Optional: NATS server for NATS API integration
--- a/internal/api/nats.go
+++ b/internal/api/nats.go
@@ -402,12 +402,21 @@ func (api *NatsAPI) processNodestateEvent(msg lp.CCMessage) {
 	repo := repository.GetNodeRepository()
 	requestReceived := time.Now().Unix()
-	// Build nodeList per subcluster for health check
+	// Pre-compute node states; only include non-down nodes in health check
 	nodeStates := make(map[string]schema.SchedulerState, len(req.Nodes))
 	for _, node := range req.Nodes {
 		nodeStates[node.Hostname] = determineState(node.States)
 	}
 	// Build nodeList per subcluster for health check, skipping down nodes
 	m := make(map[string][]string)
 	metricNames := make(map[string][]string)
 	healthResults := make(map[string]metricstore.HealthCheckResult)
 	for _, node := range req.Nodes {
 		if nodeStates[node.Hostname] == schema.NodeStateDown {
 			continue
 		}
 		if sc, err := archive.GetSubClusterByNode(req.Cluster, node.Hostname); err == nil {
 			m[sc] = append(m[sc], node.Hostname)
 		}
@@ -436,12 +445,17 @@ func (api *NatsAPI) processNodestateEvent(msg lp.CCMessage) {
 	updates := make([]repository.NodeStateUpdate, 0, len(req.Nodes))
 	for _, node := range req.Nodes {
-		state := determineState(node.States)
+		state := nodeStates[node.Hostname]
-		healthState := schema.MonitoringStateFailed
+		var healthState schema.MonitoringState
 		var healthMetrics string
-		if result, ok := healthResults[node.Hostname]; ok {
+		if state == schema.NodeStateDown {
-			healthState = result.State
+			healthState = schema.MonitoringStateFull
-			healthMetrics = result.HealthMetrics
+		} else {
 			healthState = schema.MonitoringStateFailed
 			if result, ok := healthResults[node.Hostname]; ok {
 				healthState = result.State
 				healthMetrics = result.HealthMetrics
 			}
 		}
 		nodeState := schema.NodeStateDB{
 			TimeStamp:       requestReceived,
--- a/internal/api/node.go
+++ b/internal/api/node.go
@@ -34,21 +34,28 @@ func metricListToNames(metricList map[string]*schema.Metric) []string {
 	return names
 }
-// this routine assumes that only one of them exists per node
+// determineState resolves multiple states to a single state using priority order:
 // allocated > reserved > idle > down > mixed.
 // Exception: if both idle and down are present, down is returned.
 func determineState(states []string) schema.SchedulerState {
-	for _, state := range states {
+	stateSet := make(map[string]bool, len(states))
-		switch strings.ToLower(state) {
+	for _, s := range states {
-		case "allocated":
+		stateSet[strings.ToLower(s)] = true
-			return schema.NodeStateAllocated
+	}
-		case "reserved":
+
-			return schema.NodeStateReserved
+	switch {
-		case "idle":
+	case stateSet["allocated"]:
-			return schema.NodeStateIdle
+		return schema.NodeStateAllocated
-		case "down":
+	case stateSet["reserved"]:
-			return schema.NodeStateDown
+		return schema.NodeStateReserved
-		case "mixed":
+	case stateSet["idle"] && stateSet["down"]:
-			return schema.NodeStateMixed
+		return schema.NodeStateDown
-		}
+	case stateSet["idle"]:
 		return schema.NodeStateIdle
 	case stateSet["down"]:
 		return schema.NodeStateDown
 	case stateSet["mixed"]:
 		return schema.NodeStateMixed
 	}
 	return schema.NodeStateUnknown
@@ -79,14 +86,23 @@ func (api *RestAPI) updateNodeStates(rw http.ResponseWriter, r *http.Request) {
 	requestReceived := time.Now().Unix()
 	repo := repository.GetNodeRepository()
 	// Step 1: Pre-compute node states; only include non-down nodes in health check
 	nodeStates := make(map[string]schema.SchedulerState, len(req.Nodes))
 	for _, node := range req.Nodes {
 		nodeStates[node.Hostname] = determineState(node.States)
 	}
 	m := make(map[string][]string)
 	metricNames := make(map[string][]string)
 	healthResults := make(map[string]metricstore.HealthCheckResult)
 	startMs := time.Now()
-	// Step 1: Build nodeList and metricList per subcluster
+	// Step 2: Build nodeList and metricList per subcluster, skipping down nodes
 	for _, node := range req.Nodes {
 		if nodeStates[node.Hostname] == schema.NodeStateDown {
 			continue
 		}
 		if sc, err := archive.GetSubClusterByNode(req.Cluster, node.Hostname); err == nil {
 			m[sc] = append(m[sc], node.Hostname)
 		}
@@ -99,7 +115,7 @@ func (api *RestAPI) updateNodeStates(rw http.ResponseWriter, r *http.Request) {
 		}
 	}
-	// Step 2: Determine which metric store to query and perform health check
+	// Step 3: Determine which metric store to query and perform health check
 	healthRepo, err := metricdispatch.GetHealthCheckRepo(req.Cluster)
 	if err != nil {
 		cclog.Warnf("updateNodeStates: no metric store for cluster %s, skipping health check: %v", req.Cluster, err)
@@ -118,12 +134,17 @@ func (api *RestAPI) updateNodeStates(rw http.ResponseWriter, r *http.Request) {
 	updates := make([]repository.NodeStateUpdate, 0, len(req.Nodes))
 	for _, node := range req.Nodes {
-		state := determineState(node.States)
+		state := nodeStates[node.Hostname]
-		healthState := schema.MonitoringStateFailed
+		var healthState schema.MonitoringState
 		var healthMetrics string
-		if result, ok := healthResults[node.Hostname]; ok {
+		if state == schema.NodeStateDown {
-			healthState = result.State
+			healthState = schema.MonitoringStateFull
-			healthMetrics = result.HealthMetrics
+		} else {
 			healthState = schema.MonitoringStateFailed
 			if result, ok := healthResults[node.Hostname]; ok {
 				healthState = result.State
 				healthMetrics = result.HealthMetrics
 			}
 		}
 		nodeState := schema.NodeStateDB{
 			TimeStamp:       requestReceived,
--- a/internal/repository/stats.go
+++ b/internal/repository/stats.go
@@ -156,7 +156,7 @@ func (r *JobRepository) buildStatsQuery(
 	columns = append(columns, "COUNT(*) as totalJobs")
-	if need("totalUsers") && col != "job.hpc_user" {
+	if need("totalUsers") {
 		columns = append(columns, "COUNT(DISTINCT job.hpc_user) AS totalUsers")
 	} else {
 		columns = append(columns, "0 AS totalUsers")
@@ -360,7 +360,7 @@ func (r *JobRepository) JobsStats(
 	var jobs, users, walltime, nodes, nodeHours, cores, coreHours, accs, accHours, runningJobs, shortJobs sql.NullInt64
 	if err := row.Scan(&jobs, &users, &walltime, &nodes, &nodeHours, &cores, &coreHours, &accs, &accHours, &runningJobs, &shortJobs); err != nil {
-		cclog.Warn("Error while scanning rows")
+		cclog.Warnf("Error scanning job statistics row: %v", err)
 		return nil, err
 	}
--- a/pkg/metricstore/archive.go
+++ b/pkg/metricstore/archive.go
@@ -168,8 +168,9 @@ func deleteCheckpoints(checkpointsDir string, from int64) (int, error) {
 // archiveCheckpoints archives checkpoint files to Parquet format.
 // Produces one Parquet file per cluster: <cleanupDir>/<cluster>/<timestamp>.parquet
-// Each host's rows are written as a separate row group to avoid accumulating
+// Workers load checkpoint files from disk and send CheckpointFile trees on a
-// all data in memory at once.
+// back-pressured channel. The main thread streams each tree directly to Parquet
 // rows without materializing all rows in memory.
 func archiveCheckpoints(checkpointsDir, cleanupDir string, from int64) (int, error) {
 	cclog.Info("[METRICSTORE]> start archiving checkpoints to parquet")
 	startTime := time.Now()
@@ -192,14 +193,16 @@ func archiveCheckpoints(checkpointsDir, cleanupDir string, from int64) (int, err
 			return totalFiles, err
 		}
-		// Stream per-host rows to parquet writer via worker pool
+		// Workers load checkpoint files from disk; main thread writes to parquet.
 		type hostResult struct {
-			rows  []ParquetMetricRow
+			checkpoints []*CheckpointFile
-			files []string // checkpoint filenames to delete after successful write
+			hostname    string
-			dir   string   // checkpoint directory for this host
+			files       []string // checkpoint filenames to delete after successful write
 			dir         string   // checkpoint directory for this host
 		}
-		results := make(chan hostResult, len(hostEntries))
+		// Small buffer provides back-pressure: at most NumWorkers+2 results in flight.
 		results := make(chan hostResult, 2)
 		work := make(chan struct {
 			dir, host string
 		}, Keys.NumWorkers)
@@ -212,14 +215,19 @@ func archiveCheckpoints(checkpointsDir, cleanupDir string, from int64) (int, err
 			go func() {
 				defer wg.Done()
 				for item := range work {
-					rows, files, err := archiveCheckpointsToParquet(item.dir, cluster, item.host, from)
+					checkpoints, files, err := loadCheckpointFiles(item.dir, from)
 					if err != nil {
 						cclog.Errorf("[METRICSTORE]> error reading checkpoints for %s/%s: %s", cluster, item.host, err.Error())
 						atomic.AddInt32(&errs, 1)
 						continue
 					}
-					if len(rows) > 0 {
+					if len(checkpoints) > 0 {
-						results <- hostResult{rows: rows, files: files, dir: item.dir}
+						results <- hostResult{
 							checkpoints: checkpoints,
 							hostname:    item.host,
 							files:       files,
 							dir:         item.dir,
 						}
 					}
 				}
 			}()
@@ -240,7 +248,7 @@ func archiveCheckpoints(checkpointsDir, cleanupDir string, from int64) (int, err
 			close(results)
 		}()
-		// Open streaming writer and write each host's rows as a row group
+		// Open streaming writer; each checkpoint file of each host is written as its own row group
 		parquetFile := filepath.Join(cleanupDir, cluster, fmt.Sprintf("%d.parquet", from))
 		writer, err := newParquetArchiveWriter(parquetFile)
 		if err != nil {
@@ -259,9 +267,13 @@ func archiveCheckpoints(checkpointsDir, cleanupDir string, from int64) (int, err
 		for r := range results {
 			if writeErr == nil {
-				sortParquetRows(r.rows)
+				// Stream each checkpoint file directly to parquet rows.
-				if err := writer.WriteHostRows(r.rows); err != nil {
+				// Each checkpoint is processed and discarded before the next.
-					writeErr = err
+				for _, cf := range r.checkpoints {
 					if err := writer.WriteCheckpointFile(cf, cluster, r.hostname, "node", ""); err != nil {
 						writeErr = err
 						break
 					}
 				}
 			}
 			// Always track files for deletion (even if write failed, we still drain)
--- a/pkg/metricstore/parquetArchive.go
+++ b/pkg/metricstore/parquetArchive.go
@@ -14,7 +14,6 @@ import (
 	"path/filepath"
 	"sort"
 	cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
 	pq "github.com/parquet-go/parquet-go"
 )
@@ -32,37 +31,6 @@ type ParquetMetricRow struct {
 	Value     float32 `parquet:"value"`
 }
 // flattenCheckpointFile walks a CheckpointFile tree and appends one Parquet row
 // per non-NaN sample to rows, returning the extended slice. Metrics stored
 // directly on cf are tagged with the scope/scopeID passed in; for every child the
 // scope and scope ID are derived from the child's name via parseScopeFromName
 // (e.g., "socket0" → scope="socket", scope_id="0").
 func flattenCheckpointFile(cf *CheckpointFile, cluster, hostname, scope, scopeID string, rows []ParquetMetricRow) []ParquetMetricRow {
 	for name, series := range cf.Metrics {
 		stamp := series.Start
 		for _, sample := range series.Data {
 			if sample.IsNaN() {
 				// A missing sample emits no row but still occupies its time slot.
 				stamp += series.Frequency
 				continue
 			}
 			row := ParquetMetricRow{
 				Cluster:   cluster,
 				Hostname:  hostname,
 				Metric:    name,
 				Scope:     scope,
 				ScopeID:   scopeID,
 				Timestamp: stamp,
 				Frequency: series.Frequency,
 				Value:     float32(sample),
 			}
 			rows = append(rows, row)
 			stamp += series.Frequency
 		}
 	}
 	// Descend into the children, each with the scope derived from its own name.
 	for name, sub := range cf.Children {
 		subScope, subScopeID := parseScopeFromName(name)
 		rows = flattenCheckpointFile(sub, cluster, hostname, subScope, subScopeID, rows)
 	}
 	return rows
 }
 // parseScopeFromName infers scope and scope_id from a child level name.
 // Examples: "socket0" → ("socket", "0"), "core12" → ("core", "12"),
 // "a0" (accelerator) → ("accelerator", "0").
@@ -93,15 +61,17 @@ func parseScopeFromName(name string) (string, string) {
 }
 // parquetArchiveWriter supports incremental writes to a Parquet file.
-// Each call to WriteHostRows writes one row group (typically one host's data),
+// Uses streaming writes to avoid accumulating all rows in memory.
 // avoiding accumulation of all rows in memory.
 type parquetArchiveWriter struct {
 	writer *pq.GenericWriter[ParquetMetricRow]
 	bw     *bufio.Writer
 	f      *os.File
 	batch  []ParquetMetricRow // reusable batch buffer
 	count  int
 }
 const parquetBatchSize = 1024
 // newParquetArchiveWriter creates a streaming Parquet writer with Zstd compression.
 func newParquetArchiveWriter(filename string) (*parquetArchiveWriter, error) {
 	if err := os.MkdirAll(filepath.Dir(filename), CheckpointDirPerms); err != nil {
@@ -119,31 +89,85 @@ func newParquetArchiveWriter(filename string) (*parquetArchiveWriter, error) {
 		pq.Compression(&pq.Zstd),
 	)
-	return &parquetArchiveWriter{writer: writer, bw: bw, f: f}, nil
+	return &parquetArchiveWriter{
 		writer: writer,
 		bw:     bw,
 		f:      f,
 		batch:  make([]ParquetMetricRow, 0, parquetBatchSize),
 	}, nil
 }
-// WriteHostRows sorts rows by (metric, timestamp) in-place, writes them,
+// WriteCheckpointFile streams a CheckpointFile tree directly to Parquet rows,
-// and flushes to create a separate row group.
+// writing metrics in sorted order without materializing all rows in memory.
-func (w *parquetArchiveWriter) WriteHostRows(rows []ParquetMetricRow) error {
+// Produces one row group per call, i.e. one row group per checkpoint file.
-	sort.Slice(rows, func(i, j int) bool {
+func (w *parquetArchiveWriter) WriteCheckpointFile(cf *CheckpointFile, cluster, hostname, scope, scopeID string) error {
-		if rows[i].Metric != rows[j].Metric {
+	w.writeLevel(cf, cluster, hostname, scope, scopeID)
 			return rows[i].Metric < rows[j].Metric
 		}
 		return rows[i].Timestamp < rows[j].Timestamp
 	})
-	if _, err := w.writer.Write(rows); err != nil {
+	// Flush remaining batch
-		return fmt.Errorf("writing parquet rows: %w", err)
+	if len(w.batch) > 0 {
 		if _, err := w.writer.Write(w.batch); err != nil {
 			return fmt.Errorf("writing parquet rows: %w", err)
 		}
 		w.count += len(w.batch)
 		w.batch = w.batch[:0]
 	}
 	if err := w.writer.Flush(); err != nil {
 		return fmt.Errorf("flushing parquet row group: %w", err)
 	}
 	w.count += len(rows)
 	return nil
 }
 // writeLevel recursively converts one CheckpointFile level into Parquet rows.
 // Rows are appended to the reusable batch buffer, which is handed to the Parquet
 // writer each time it reaches parquetBatchSize rows; rows left in the buffer when
 // the recursion ends must be written by the caller.
 // Metric names and child names are sorted for deterministic, compression-friendly output.
 // NaN samples produce no row but still advance the timestamp by the metric's frequency.
 //
 // It returns the first error reported by the Parquet writer and stops descending
 // at that point. On failure the batch is left untouched and its rows are not
 // added to the written-row count. Callers should propagate the returned error.
 func (w *parquetArchiveWriter) writeLevel(cf *CheckpointFile, cluster, hostname, scope, scopeID string) error {
 	// Sort metric names for deterministic order
 	metricNames := make([]string, 0, len(cf.Metrics))
 	for name := range cf.Metrics {
 		metricNames = append(metricNames, name)
 	}
 	sort.Strings(metricNames)
 	for _, metricName := range metricNames {
 		cm := cf.Metrics[metricName]
 		ts := cm.Start
 		for _, v := range cm.Data {
 			if !v.IsNaN() {
 				w.batch = append(w.batch, ParquetMetricRow{
 					Cluster:   cluster,
 					Hostname:  hostname,
 					Metric:    metricName,
 					Scope:     scope,
 					ScopeID:   scopeID,
 					Timestamp: ts,
 					Frequency: cm.Frequency,
 					Value:     float32(v),
 				})
 				if len(w.batch) >= parquetBatchSize {
 					// A failed write must not be counted or silently dropped.
 					if _, err := w.writer.Write(w.batch); err != nil {
 						return fmt.Errorf("writing parquet rows: %w", err)
 					}
 					w.count += len(w.batch)
 					w.batch = w.batch[:0]
 				}
 			}
 			ts += cm.Frequency
 		}
 	}
 	// Sort child names for deterministic order
 	childNames := make([]string, 0, len(cf.Children))
 	for name := range cf.Children {
 		childNames = append(childNames, name)
 	}
 	sort.Strings(childNames)
 	for _, childName := range childNames {
 		childScope, childScopeID := parseScopeFromName(childName)
 		if err := w.writeLevel(cf.Children[childName], cluster, hostname, childScope, childScopeID); err != nil {
 			return err
 		}
 	}
 	return nil
 }
 // Close finalises the Parquet file (footer, buffered I/O, file handle).
 func (w *parquetArchiveWriter) Close() error {
 	if err := w.writer.Close(); err != nil {
@@ -159,16 +183,6 @@ func (w *parquetArchiveWriter) Close() error {
 	return w.f.Close()
 }
 // sortParquetRows reorders rows in place: ascending by metric name, and by
 // ascending timestamp among rows that share a metric.
 func sortParquetRows(rows []ParquetMetricRow) {
 	byMetricThenTime := func(a, b int) bool {
 		left, right := &rows[a], &rows[b]
 		if left.Metric == right.Metric {
 			return left.Timestamp < right.Timestamp
 		}
 		return left.Metric < right.Metric
 	}
 	sort.Slice(rows, byMetricThenTime)
 }
 // loadCheckpointFileFromDisk reads a JSON or binary checkpoint file and returns
 // a CheckpointFile. Used by the Parquet archiver to read checkpoint data
 // before converting it to Parquet format.
@@ -218,22 +232,10 @@ func loadCheckpointFileFromDisk(filename string) (*CheckpointFile, error) {
 	}
 }
-// estimateRowCount estimates the number of Parquet rows a CheckpointFile will produce.
+// loadCheckpointFiles reads checkpoint files for a host directory and returns
-// Used for pre-allocating the rows slice to avoid repeated append doubling.
+// the loaded CheckpointFiles and their filenames. Unreadable files are skipped
-func estimateRowCount(cf *CheckpointFile) int {
+// with a warning; all loaded checkpoints of the host are returned together.
-	n := 0
+func loadCheckpointFiles(dir string, from int64) ([]*CheckpointFile, []string, error) {
 	for _, cm := range cf.Metrics {
 		n += len(cm.Data)
 	}
 	for _, child := range cf.Children {
 		n += estimateRowCount(child)
 	}
 	return n
 }
 // archiveCheckpointsToParquet reads checkpoint files for a host directory,
 // converts them to Parquet rows. Returns the rows and filenames that were processed.
 func archiveCheckpointsToParquet(dir, cluster, host string, from int64) ([]ParquetMetricRow, []string, error) {
 	entries, err := os.ReadDir(dir)
 	if err != nil {
 		return nil, nil, err
@@ -248,36 +250,18 @@ func archiveCheckpointsToParquet(dir, cluster, host string, from int64) ([]Parqu
 		return nil, nil, nil
 	}
-	// First pass: load checkpoints and estimate total rows for pre-allocation.
+	var checkpoints []*CheckpointFile
-	type loaded struct {
+	var processedFiles []string
 		cf       *CheckpointFile
 		filename string
 	}
 	var checkpoints []loaded
 	totalEstimate := 0
 	for _, checkpoint := range files {
 		filename := filepath.Join(dir, checkpoint)
 		cf, err := loadCheckpointFileFromDisk(filename)
 		if err != nil {
 			cclog.Warnf("[METRICSTORE]> skipping unreadable checkpoint %s: %v", filename, err)
 			continue
 		}
-		totalEstimate += estimateRowCount(cf)
+		checkpoints = append(checkpoints, cf)
-		checkpoints = append(checkpoints, loaded{cf: cf, filename: checkpoint})
+		processedFiles = append(processedFiles, checkpoint)
 	}
-	if len(checkpoints) == 0 {
+	return checkpoints, processedFiles, nil
 		return nil, nil, nil
 	}
 	rows := make([]ParquetMetricRow, 0, totalEstimate)
 	processedFiles := make([]string, 0, len(checkpoints))
 	for _, cp := range checkpoints {
 		rows = flattenCheckpointFile(cp.cf, cluster, host, "node", "", rows)
 		processedFiles = append(processedFiles, cp.filename)
 	}
 	return rows, processedFiles, nil
 }
--- a/pkg/metricstore/parquetArchive_test.go
+++ b/pkg/metricstore/parquetArchive_test.go
@@ -44,7 +44,7 @@ func TestParseScopeFromName(t *testing.T) {
 	}
 }
-func TestFlattenCheckpointFile(t *testing.T) {
+func TestWriteCheckpointFile(t *testing.T) {
 	cf := &CheckpointFile{
 		From: 1000,
 		To:   1060,
@@ -69,17 +69,55 @@ func TestFlattenCheckpointFile(t *testing.T) {
 		},
 	}
-	rows := flattenCheckpointFile(cf, "fritz", "node001", "node", "", nil)
+	tmpDir := t.TempDir()
 	parquetFile := filepath.Join(tmpDir, "test.parquet")
 	writer, err := newParquetArchiveWriter(parquetFile)
 	if err != nil {
 		t.Fatal(err)
 	}
 	if err := writer.WriteCheckpointFile(cf, "fritz", "node001", "node", ""); err != nil {
 		t.Fatal(err)
 	}
 	if err := writer.Close(); err != nil {
 		t.Fatal(err)
 	}
 	// cpu_load: 2 non-NaN values at node scope
 	// mem_bw: 2 non-NaN values at socket0 scope
-	if len(rows) != 4 {
+	if writer.count != 4 {
-		t.Fatalf("expected 4 rows, got %d", len(rows))
+		t.Fatalf("expected 4 rows written, got %d", writer.count)
 	}
 	// Read back and verify
 	f, err := os.Open(parquetFile)
 	if err != nil {
 		t.Fatal(err)
 	}
 	defer f.Close()
 	stat, _ := f.Stat()
 	pf, err := pq.OpenFile(f, stat.Size())
 	if err != nil {
 		t.Fatal(err)
 	}
 	reader := pq.NewGenericReader[ParquetMetricRow](pf)
 	readRows := make([]ParquetMetricRow, 100)
 	n, err := reader.Read(readRows)
 	if err != nil && n == 0 {
 		t.Fatal(err)
 	}
 	readRows = readRows[:n]
 	reader.Close()
 	if n != 4 {
 		t.Fatalf("expected 4 rows, got %d", n)
 	}
 	// Verify a node-scope row
 	found := false
-	for _, r := range rows {
+	for _, r := range readRows {
 		if r.Metric == "cpu_load" && r.Timestamp == 1000 {
 			found = true
 			if r.Cluster != "fritz" || r.Hostname != "node001" || r.Scope != "node" || r.Value != 0.5 {
@@ -93,7 +131,7 @@ func TestFlattenCheckpointFile(t *testing.T) {
 	// Verify a socket-scope row
 	found = false
-	for _, r := range rows {
+	for _, r := range readRows {
 		if r.Metric == "mem_bw" && r.Scope == "socket" && r.ScopeID == "0" {
 			found = true
 		}
@@ -153,7 +191,7 @@ func TestParquetArchiveRoundtrip(t *testing.T) {
 	// Archive to Parquet
 	archiveDir := filepath.Join(tmpDir, "archive")
-	rows, files, err := archiveCheckpointsToParquet(cpDir, "testcluster", "node001", 2000)
+	checkpoints, files, err := loadCheckpointFiles(cpDir, 2000)
 	if err != nil {
 		t.Fatal(err)
 	}
@@ -166,9 +204,10 @@ func TestParquetArchiveRoundtrip(t *testing.T) {
 	if err != nil {
 		t.Fatal(err)
 	}
-	sortParquetRows(rows)
+	for _, cp := range checkpoints {
-	if err := writer.WriteHostRows(rows); err != nil {
+		if err := writer.WriteCheckpointFile(cp, "testcluster", "node001", "node", ""); err != nil {
-		t.Fatal(err)
+			t.Fatal(err)
 		}
 	}
 	if err := writer.Close(); err != nil {
 		t.Fatal(err)
--- a/web/frontend/src/status/dashdetails/HealthDash.svelte
+++ b/web/frontend/src/status/dashdetails/HealthDash.svelte
@@ -161,7 +161,7 @@
 <hr/>
-<!-- Node Health Pis, later Charts -->
+<!-- Node State and Metric Health Pies -->
 {#if $statusQuery?.fetching}
  <Row cols={1} class="text-center mt-3">
    <Col>
@@ -222,7 +222,7 @@
      <div bind:clientWidth={pieWidth}>
        {#key refinedHealthData}
          <h4 class="text-center">
-            Current {cluster.charAt(0).toUpperCase() + cluster.slice(1)} Node Health
+            Current {cluster.charAt(0).toUpperCase() + cluster.slice(1)} Metric Health
          </h4>
          <Pie
            canvasId="hpcpie-health"