fix: Pause WAL writes during binary checkpoint to prevent message drops

WAL writes during checkpoint are redundant since the binary snapshot captures all in-memory data. Pausing eliminates channel saturation (1.4M+ dropped messages) caused by disk I/O contention between checkpoint writes and WAL staging. Also removes direct WAL file deletion in checkpoint workers that raced with the staging goroutine. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com> Entire-Checkpoint: 34d698f40bac
2026-05-16 09:57:30 +02:00 · 2026-03-29 11:13:39 +02:00
parent 937984d11f
commit fc47b12fed
2 changed files with 14 additions and 5 deletions
--- a/pkg/metricstore/checkpoint.go
+++ b/pkg/metricstore/checkpoint.go
@@ -126,6 +126,10 @@ func Checkpointing(wg *sync.WaitGroup, ctx context.Context) {
 				cclog.Infof("[METRICSTORE]> start checkpointing (starting at %s)...", from.Format(time.RFC3339))

 				if Keys.Checkpoints.FileFormat == "wal" {
+					// Pause WAL writes: the binary snapshot captures all in-memory
+					// data, so WAL records written during checkpoint are redundant
+					// and would be deleted during rotation anyway.
+					walCheckpointActive.Store(true)
 					n, hostDirs, err := ms.ToCheckpointWAL(Keys.Checkpoints.RootDir, from.Unix(), now.Unix())
 					if err != nil {
 						cclog.Errorf("[METRICSTORE]> binary checkpointing failed: %s", err.Error())
@@ -138,6 +142,8 @@ func Checkpointing(wg *sync.WaitGroup, ctx context.Context) {
 						// Rotate WAL files for successfully checkpointed hosts.
 						RotateWALFiles(hostDirs)
 					}
+					walCheckpointActive.Store(false)
+					walDropped.Store(0)
 				} else {
 					n, err := ms.ToCheckpoint(Keys.Checkpoints.RootDir, from.Unix(), now.Unix())
 					if err != nil {