fix: Fix metricstore memory explosion from broken emergency free and batch aborts

- Fix MemoryUsageTracker: remove premature bufferPool.Clear() that prevented
  mem.Alloc from decreasing, replace broken ForceFree loop (100 iterations
  with no GC) with progressive time-based Free at 75%/50%/25% retention,
  add bufferPool.Clear()+GC between steps so memory stats update correctly
- Enable debug.FreeOSMemory() after emergency freeing to return memory to OS
- Add adaptive ticker: check every 30s when memory is >80% of cap, at the normal interval otherwise
- Reduce default memory check interval from 1h to 5min
- Don't abort entire NATS batch on single write error (out-of-order timestamp),
  log warning and continue processing remaining lines
- Prune empty levels from tree after free() to reduce overhead
- Include buffer struct overhead in sizeInBytes() for more accurate reporting

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
Entire-Checkpoint: 7ce28627fc1d
This commit is contained in:
2026-03-13 07:57:35 +01:00
parent 126f65879a
commit 8234ad3126
5 changed files with 71 additions and 31 deletions

View File

@@ -200,17 +200,33 @@ func (l *Level) free(t int64) (int, error) {
}
}
for _, l := range l.children {
m, err := l.free(t)
for key, child := range l.children {
m, err := child.free(t)
n += m
if err != nil {
return n, err
}
if child.isEmpty() {
delete(l.children, key)
}
}
return n, nil
}
// isEmpty reports whether this level holds nothing: it has no child levels
// and every entry in its metrics collection is nil.
//
// The level's read lock is held for the duration of the check. Only this
// level is inspected; children are not descended into.
func (l *Level) isEmpty() bool {
	l.lock.RLock()
	defer l.lock.RUnlock()

	// Any remaining child means the level is still in use.
	if len(l.children) != 0 {
		return false
	}

	// A single non-nil metric buffer is enough to keep the level alive.
	for _, buf := range l.metrics {
		if buf != nil {
			return false
		}
	}
	return true
}
// forceFree removes the oldest buffer from each metric chain in the subtree.
//
// Unlike free(), which removes based on time threshold, this unconditionally removes
@@ -278,6 +294,7 @@ func (l *Level) sizeInBytes() int64 {
for _, b := range l.metrics {
if b != nil {
size += b.count() * int64(unsafe.Sizeof(schema.Float(0)))
size += b.bufferCount() * int64(unsafe.Sizeof(buffer{}))
}
}