From 937984d11f7b9503b58a29afbe3536fe40a2675c Mon Sep 17 00:00:00 2001 From: Jan Eitzinger Date: Sat, 28 Mar 2026 06:55:45 +0100 Subject: [PATCH] fix: WAL rotation skipped for all nodes due to non-blocking send on small channel RotateWALFiles used a non-blocking send (select/default) on rotation channels buffered at 64. With thousands of nodes and few shards, the channel fills instantly and nearly all hosts are skipped, leaving WAL files unrotated indefinitely. Replace with a blocking send using a shared 2-minute deadline so the checkpoint goroutine waits for the staging goroutine to drain the channel instead of immediately giving up. Co-Authored-By: Claude Opus 4.6 Entire-Checkpoint: a1ec897216fa --- pkg/metricstore/walCheckpoint.go | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/pkg/metricstore/walCheckpoint.go b/pkg/metricstore/walCheckpoint.go index 35a5d6d5..749ff756 100644 --- a/pkg/metricstore/walCheckpoint.go +++ b/pkg/metricstore/walCheckpoint.go @@ -358,6 +358,7 @@ func RotateWALFiles(hostDirs []string) { if walShardRotateChs == nil || walShuttingDown.Load() { return } + deadline := time.After(2 * time.Minute) dones := make([]chan struct{}, 0, len(hostDirs)) for _, dir := range hostDirs { done := make(chan struct{}) @@ -365,16 +366,18 @@ func RotateWALFiles(hostDirs []string) { select { case walShardRotateChs[shard] <- walRotateReq{hostDir: dir, done: done}: dones = append(dones, done) - default: - // Channel full or goroutine not consuming — skip this host. - cclog.Warnf("[METRICSTORE]> WAL rotation skipped for %s (channel full)", dir) + case <-deadline: + cclog.Warnf("[METRICSTORE]> WAL rotation send timed out, %d of %d hosts remaining", + len(hostDirs)-len(dones), len(hostDirs)) + goto waitDones } } +waitDones: for _, done := range dones { select { case <-done: case <-time.After(30 * time.Second): - cclog.Warn("[METRICSTORE]> WAL rotation timed out, continuing") + cclog.Warn("[METRICSTORE]> WAL rotation completion timed out, continuing") return } }