fix: checkpoint initialization gap on restarts

Entire-Checkpoint: 3f4d366b037c
This commit is contained in:
2026-03-27 06:59:58 +01:00
parent 71fc9efec7
commit 0ce2fa2fbe
3 changed files with 33 additions and 11 deletions

View File

@@ -86,14 +86,16 @@ var (
// Checkpointing starts a background worker that periodically saves metric data to disk.
//
// Checkpoints are written every 12 hours (hardcoded).
// restoreFrom is the earliest timestamp of data loaded from checkpoint files at startup.
// The first periodic checkpoint after restart will cover [restoreFrom, now], ensuring that
// loaded data is re-persisted before old checkpoint files are cleaned up.
//
// Format behaviour:
// - "json": Periodic checkpointing every checkpointInterval
// - "wal": Periodic binary snapshots + WAL rotation every checkpointInterval
func Checkpointing(wg *sync.WaitGroup, ctx context.Context) {
func Checkpointing(wg *sync.WaitGroup, ctx context.Context, restoreFrom time.Time) {
lastCheckpointMu.Lock()
lastCheckpoint = time.Now()
lastCheckpoint = restoreFrom
lastCheckpointMu.Unlock()
ms := GetMemoryStore()
@@ -337,25 +339,35 @@ func (l *Level) toCheckpoint(dir string, from, to int64, m *MemoryStore) error {
return ErrNoNewArchiveData
}
filepath := path.Join(dir, fmt.Sprintf("%d.json", from))
f, err := os.OpenFile(filepath, os.O_CREATE|os.O_WRONLY|os.O_TRUNC, CheckpointFilePerms)
finalPath := path.Join(dir, fmt.Sprintf("%d.json", from))
tmpPath := finalPath + ".tmp"
f, err := os.OpenFile(tmpPath, os.O_CREATE|os.O_WRONLY|os.O_TRUNC, CheckpointFilePerms)
if err != nil && os.IsNotExist(err) {
err = os.MkdirAll(dir, CheckpointDirPerms)
if err == nil {
f, err = os.OpenFile(filepath, os.O_CREATE|os.O_WRONLY|os.O_TRUNC, CheckpointFilePerms)
f, err = os.OpenFile(tmpPath, os.O_CREATE|os.O_WRONLY|os.O_TRUNC, CheckpointFilePerms)
}
}
if err != nil {
return err
}
defer f.Close()
bw := bufio.NewWriter(f)
if err = json.NewEncoder(bw).Encode(cf); err != nil {
f.Close()
os.Remove(tmpPath)
return err
}
return bw.Flush()
if err = bw.Flush(); err != nil {
f.Close()
os.Remove(tmpPath)
return err
}
f.Close()
return os.Rename(tmpPath, finalPath)
}
// enqueueCheckpointHosts traverses the checkpoint directory and enqueues cluster/host pairs.
@@ -470,7 +482,7 @@ func (l *Level) loadFile(cf *CheckpointFile, m *MemoryStore) error {
data: metric.Data[0:n:n],
prev: nil,
next: nil,
archived: true,
archived: false,
}
minfo, ok := m.Metrics[name]