mirror of
https://github.com/ClusterCockpit/cc-backend
synced 2026-03-30 12:37:30 +02:00
fix: WAL rotation skipped for all nodes due to non-blocking send on small channel
RotateWALFiles used a non-blocking send (select/default) on rotation channels buffered at 64. With thousands of nodes and few shards, the channel fills instantly and nearly all hosts are skipped, leaving WAL files unrotated indefinitely.

Replace with a blocking send using a shared 2-minute deadline so the checkpoint goroutine waits for the staging goroutine to drain the channel instead of immediately giving up.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
Entire-Checkpoint: a1ec897216fa
This commit is contained in:
@@ -358,6 +358,7 @@ func RotateWALFiles(hostDirs []string) {
|
|||||||
if walShardRotateChs == nil || walShuttingDown.Load() {
|
if walShardRotateChs == nil || walShuttingDown.Load() {
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
deadline := time.After(2 * time.Minute)
|
||||||
dones := make([]chan struct{}, 0, len(hostDirs))
|
dones := make([]chan struct{}, 0, len(hostDirs))
|
||||||
for _, dir := range hostDirs {
|
for _, dir := range hostDirs {
|
||||||
done := make(chan struct{})
|
done := make(chan struct{})
|
||||||
@@ -365,16 +366,18 @@ func RotateWALFiles(hostDirs []string) {
|
|||||||
select {
|
select {
|
||||||
case walShardRotateChs[shard] <- walRotateReq{hostDir: dir, done: done}:
|
case walShardRotateChs[shard] <- walRotateReq{hostDir: dir, done: done}:
|
||||||
dones = append(dones, done)
|
dones = append(dones, done)
|
||||||
default:
|
case <-deadline:
|
||||||
// Channel full or goroutine not consuming — skip this host.
|
cclog.Warnf("[METRICSTORE]> WAL rotation send timed out, %d of %d hosts remaining",
|
||||||
cclog.Warnf("[METRICSTORE]> WAL rotation skipped for %s (channel full)", dir)
|
len(hostDirs)-len(dones), len(hostDirs))
|
||||||
|
goto waitDones
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
waitDones:
|
||||||
for _, done := range dones {
|
for _, done := range dones {
|
||||||
select {
|
select {
|
||||||
case <-done:
|
case <-done:
|
||||||
case <-time.After(30 * time.Second):
|
case <-time.After(30 * time.Second):
|
||||||
cclog.Warn("[METRICSTORE]> WAL rotation timed out, continuing")
|
cclog.Warn("[METRICSTORE]> WAL rotation completion timed out, continuing")
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user