fix: WAL rotation skipped for all nodes due to non-blocking send on small channel

RotateWALFiles used a non-blocking send (select/default) on rotation
channels buffered at 64. With thousands of nodes and few shards, the
channel fills instantly and nearly all hosts are skipped, leaving WAL
files unrotated indefinitely.

Replace with a blocking send using a shared 2-minute deadline so the
checkpoint goroutine waits for the staging goroutine to drain the
channel instead of immediately giving up.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
Entire-Checkpoint: a1ec897216fa
This commit is contained in:
2026-03-28 06:55:45 +01:00
parent 3d99aec185
commit 937984d11f

View File

@@ -358,6 +358,7 @@ func RotateWALFiles(hostDirs []string) {
if walShardRotateChs == nil || walShuttingDown.Load() {
return
}
deadline := time.After(2 * time.Minute)
dones := make([]chan struct{}, 0, len(hostDirs))
for _, dir := range hostDirs {
done := make(chan struct{})
@@ -365,16 +366,18 @@ func RotateWALFiles(hostDirs []string) {
select {
case walShardRotateChs[shard] <- walRotateReq{hostDir: dir, done: done}:
dones = append(dones, done)
default:
// Channel full or goroutine not consuming — skip this host.
cclog.Warnf("[METRICSTORE]> WAL rotation skipped for %s (channel full)", dir)
case <-deadline:
cclog.Warnf("[METRICSTORE]> WAL rotation send timed out, %d of %d hosts remaining",
len(hostDirs)-len(dones), len(hostDirs))
goto waitDones
}
}
waitDones:
for _, done := range dones {
select {
case <-done:
case <-time.After(30 * time.Second):
cclog.Warn("[METRICSTORE]> WAL rotation timed out, continuing")
cclog.Warn("[METRICSTORE]> WAL rotation completion timed out, continuing")
return
}
}