mirror of
https://github.com/ClusterCockpit/cc-backend
synced 2026-03-30 12:37:30 +02:00
fix: WAL rotation skipped for all nodes due to non-blocking send on small channel
RotateWALFiles used a non-blocking send (select/default) on rotation channels buffered at 64. With thousands of nodes and few shards, the channel fills instantly and nearly all hosts are skipped, leaving WAL files unrotated indefinitely. Replace with a blocking send using a shared 2-minute deadline so the checkpoint goroutine waits for the staging goroutine to drain the channel instead of immediately giving up. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com> Entire-Checkpoint: a1ec897216fa
This commit is contained in:
@@ -358,6 +358,7 @@ func RotateWALFiles(hostDirs []string) {
|
||||
if walShardRotateChs == nil || walShuttingDown.Load() {
|
||||
return
|
||||
}
|
||||
deadline := time.After(2 * time.Minute)
|
||||
dones := make([]chan struct{}, 0, len(hostDirs))
|
||||
for _, dir := range hostDirs {
|
||||
done := make(chan struct{})
|
||||
@@ -365,16 +366,18 @@ func RotateWALFiles(hostDirs []string) {
|
||||
select {
|
||||
case walShardRotateChs[shard] <- walRotateReq{hostDir: dir, done: done}:
|
||||
dones = append(dones, done)
|
||||
default:
|
||||
// Channel full or goroutine not consuming — skip this host.
|
||||
cclog.Warnf("[METRICSTORE]> WAL rotation skipped for %s (channel full)", dir)
|
||||
case <-deadline:
|
||||
cclog.Warnf("[METRICSTORE]> WAL rotation send timed out, %d of %d hosts remaining",
|
||||
len(hostDirs)-len(dones), len(hostDirs))
|
||||
goto waitDones
|
||||
}
|
||||
}
|
||||
waitDones:
|
||||
for _, done := range dones {
|
||||
select {
|
||||
case <-done:
|
||||
case <-time.After(30 * time.Second):
|
||||
cclog.Warn("[METRICSTORE]> WAL rotation timed out, continuing")
|
||||
cclog.Warn("[METRICSTORE]> WAL rotation completion timed out, continuing")
|
||||
return
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user