mirror of
https://github.com/ClusterCockpit/cc-backend
synced 2026-03-15 04:17:30 +01:00
fix: Fix metricstore memory explosion from broken emergency free and batch aborts
- Fix MemoryUsageTracker: remove premature bufferPool.Clear() that prevented mem.Alloc from decreasing, replace broken ForceFree loop (100 iterations with no GC) with progressive time-based Free at 75%/50%/25% retention, add bufferPool.Clear()+GC between steps so memory stats update correctly
- Enable debug.FreeOSMemory() after emergency freeing to return memory to OS
- Add adaptive ticker: 30s checks when memory >80% of cap, normal otherwise
- Reduce default memory check interval from 1h to 5min
- Don't abort entire NATS batch on single write error (out-of-order timestamp), log warning and continue processing remaining lines
- Prune empty levels from tree after free() to reduce overhead
- Include buffer struct overhead in sizeInBytes() for more accurate reporting

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
Entire-Checkpoint: 7ce28627fc1d
This commit is contained in:
@@ -200,17 +200,33 @@ func (l *Level) free(t int64) (int, error) {
|
||||
}
|
||||
}
|
||||
|
||||
for _, l := range l.children {
|
||||
m, err := l.free(t)
|
||||
for key, child := range l.children {
|
||||
m, err := child.free(t)
|
||||
n += m
|
||||
if err != nil {
|
||||
return n, err
|
||||
}
|
||||
if child.isEmpty() {
|
||||
delete(l.children, key)
|
||||
}
|
||||
}
|
||||
|
||||
return n, nil
|
||||
}
|
||||
|
||||
// isEmpty returns true if this level has no metrics and no children.
|
||||
func (l *Level) isEmpty() bool {
|
||||
l.lock.RLock()
|
||||
defer l.lock.RUnlock()
|
||||
|
||||
for _, b := range l.metrics {
|
||||
if b != nil {
|
||||
return false
|
||||
}
|
||||
}
|
||||
return len(l.children) == 0
|
||||
}
|
||||
|
||||
// forceFree removes the oldest buffer from each metric chain in the subtree.
|
||||
//
|
||||
// Unlike free(), which removes based on time threshold, this unconditionally removes
|
||||
@@ -278,6 +294,7 @@ func (l *Level) sizeInBytes() int64 {
|
||||
for _, b := range l.metrics {
|
||||
if b != nil {
|
||||
size += b.count() * int64(unsafe.Sizeof(schema.Float(0)))
|
||||
size += b.bufferCount() * int64(unsafe.Sizeof(buffer{}))
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user