fix: Pause WAL writes during binary checkpoint to prevent message drops

WAL writes during checkpoint are redundant since the binary snapshot captures all in-memory data. Pausing them eliminates the channel saturation (1.4M+ dropped messages) caused by disk I/O contention between checkpoint writes and WAL staging. Also removes the direct WAL file deletion in checkpoint workers, which raced with the staging goroutine.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
Entire-Checkpoint: 34d698f40bac
2026-07-18 13:20:38 +02:00 · 2026-03-29 11:13:39 +02:00
parent 937984d11f
commit fc47b12fed
2 changed files with 14 additions and 5 deletions
@@ -99,6 +99,11 @@ var walStagingWg sync.WaitGroup
 // SendWALMessage from sending on a closed channel (which panics in Go).
 var walShuttingDown atomic.Bool

+// walCheckpointActive is set while a binary checkpoint is being written.
+// While it is set, SendWALMessage discards the message and still returns true:
+// the snapshot captures all in-memory data, so the WAL write is redundant.
+var walCheckpointActive atomic.Bool
+
 // WALMessage represents a single metric write to be appended to the WAL.
 // Cluster and Node are NOT stored in the WAL record (inferred from file path).
 type WALMessage struct {
@@ -146,6 +151,9 @@ func SendWALMessage(msg *WALMessage) bool {
 	if walShardChs == nil || walShuttingDown.Load() {
 		return false
 	}
+	if walCheckpointActive.Load() {
+		return true // Data safe in memory; snapshot will capture it
+	}
 	shard := walShardIndex(msg.Cluster, msg.Node)
 	select {
 	case walShardChs[shard] <- msg:
@@ -727,11 +735,6 @@ func (m *MemoryStore) ToCheckpointWAL(dir string, from, to int64) (int, []string
 					atomic.AddInt32(&errs, 1)
 				} else {
 					atomic.AddInt32(&n, 1)
-					// Delete WAL immediately after successful snapshot.
-					walPath := path.Join(wi.hostDir, "current.wal")
-					if err := os.Remove(walPath); err != nil && !os.IsNotExist(err) {
-						cclog.Errorf("[METRICSTORE]> WAL remove %s: %v", walPath, err)
-					}
 					successMu.Lock()
 					successDirs = append(successDirs, wi.hostDir)
 					successMu.Unlock()