mirror of
https://github.com/ClusterCockpit/cc-backend
synced 2026-02-28 21:37:31 +01:00
Replace the old zip archive options for the metricstore node data by parquet files
This commit is contained in:
@@ -6,12 +6,9 @@
|
||||
package metricstore
|
||||
|
||||
import (
|
||||
"archive/zip"
|
||||
"bufio"
|
||||
"context"
|
||||
"errors"
|
||||
"fmt"
|
||||
"io"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"sync"
|
||||
@@ -47,7 +44,7 @@ func CleanUp(wg *sync.WaitGroup, ctx context.Context) {
|
||||
}
|
||||
}
|
||||
|
||||
// runWorker takes simple values to configure what it does
|
||||
// cleanUpWorker takes simple values to configure what it does
|
||||
func cleanUpWorker(wg *sync.WaitGroup, ctx context.Context, interval string, mode string, cleanupDir string, delete bool) {
|
||||
wg.Go(func() {
|
||||
|
||||
@@ -75,10 +72,10 @@ func cleanUpWorker(wg *sync.WaitGroup, ctx context.Context, interval string, mod
|
||||
if err != nil {
|
||||
cclog.Errorf("[METRICSTORE]> %s failed: %s", mode, err.Error())
|
||||
} else {
|
||||
if delete && cleanupDir == "" {
|
||||
if delete {
|
||||
cclog.Infof("[METRICSTORE]> done: %d checkpoints deleted", n)
|
||||
} else {
|
||||
cclog.Infof("[METRICSTORE]> done: %d files zipped and moved to archive", n)
|
||||
cclog.Infof("[METRICSTORE]> done: %d checkpoint files archived to parquet", n)
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -88,17 +85,26 @@ func cleanUpWorker(wg *sync.WaitGroup, ctx context.Context, interval string, mod
|
||||
|
||||
var ErrNoNewArchiveData error = errors.New("all data already archived")
|
||||
|
||||
// Delete or ZIP all checkpoint files older than `from` together and write them to the `cleanupDir`,
|
||||
// deleting/moving them from the `checkpointsDir`.
|
||||
// CleanupCheckpoints deletes or archives all checkpoint files older than `from`.
|
||||
// When archiving, consolidates all hosts per cluster into a single Parquet file.
|
||||
func CleanupCheckpoints(checkpointsDir, cleanupDir string, from int64, deleteInstead bool) (int, error) {
|
||||
if deleteInstead {
|
||||
return deleteCheckpoints(checkpointsDir, from)
|
||||
}
|
||||
|
||||
return archiveCheckpoints(checkpointsDir, cleanupDir, from)
|
||||
}
|
||||
|
||||
// deleteCheckpoints removes checkpoint files older than `from` across all clusters/hosts.
|
||||
func deleteCheckpoints(checkpointsDir string, from int64) (int, error) {
|
||||
entries1, err := os.ReadDir(checkpointsDir)
|
||||
if err != nil {
|
||||
return 0, err
|
||||
}
|
||||
|
||||
type workItem struct {
|
||||
cdir, adir string
|
||||
cluster, host string
|
||||
dir string
|
||||
cluster, host string
|
||||
}
|
||||
|
||||
var wg sync.WaitGroup
|
||||
@@ -109,13 +115,29 @@ func CleanupCheckpoints(checkpointsDir, cleanupDir string, from int64, deleteIns
|
||||
for worker := 0; worker < Keys.NumWorkers; worker++ {
|
||||
go func() {
|
||||
defer wg.Done()
|
||||
for workItem := range work {
|
||||
m, err := cleanupCheckpoints(workItem.cdir, workItem.adir, from, deleteInstead)
|
||||
for item := range work {
|
||||
entries, err := os.ReadDir(item.dir)
|
||||
if err != nil {
|
||||
cclog.Errorf("error while archiving %s/%s: %s", workItem.cluster, workItem.host, err.Error())
|
||||
cclog.Errorf("error reading %s/%s: %s", item.cluster, item.host, err.Error())
|
||||
atomic.AddInt32(&errs, 1)
|
||||
continue
|
||||
}
|
||||
|
||||
files, err := findFiles(entries, from, false)
|
||||
if err != nil {
|
||||
cclog.Errorf("error finding files in %s/%s: %s", item.cluster, item.host, err.Error())
|
||||
atomic.AddInt32(&errs, 1)
|
||||
continue
|
||||
}
|
||||
|
||||
for _, checkpoint := range files {
|
||||
if err := os.Remove(filepath.Join(item.dir, checkpoint)); err != nil {
|
||||
cclog.Errorf("error deleting %s/%s/%s: %s", item.cluster, item.host, checkpoint, err.Error())
|
||||
atomic.AddInt32(&errs, 1)
|
||||
} else {
|
||||
atomic.AddInt32(&n, 1)
|
||||
}
|
||||
}
|
||||
atomic.AddInt32(&n, int32(m))
|
||||
}
|
||||
}()
|
||||
}
|
||||
@@ -124,14 +146,14 @@ func CleanupCheckpoints(checkpointsDir, cleanupDir string, from int64, deleteIns
|
||||
entries2, e := os.ReadDir(filepath.Join(checkpointsDir, de1.Name()))
|
||||
if e != nil {
|
||||
err = e
|
||||
continue
|
||||
}
|
||||
|
||||
for _, de2 := range entries2 {
|
||||
cdir := filepath.Join(checkpointsDir, de1.Name(), de2.Name())
|
||||
adir := filepath.Join(cleanupDir, de1.Name(), de2.Name())
|
||||
work <- workItem{
|
||||
adir: adir, cdir: cdir,
|
||||
cluster: de1.Name(), host: de2.Name(),
|
||||
dir: filepath.Join(checkpointsDir, de1.Name(), de2.Name()),
|
||||
cluster: de1.Name(),
|
||||
host: de2.Name(),
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -142,85 +164,118 @@ func CleanupCheckpoints(checkpointsDir, cleanupDir string, from int64, deleteIns
|
||||
if err != nil {
|
||||
return int(n), err
|
||||
}
|
||||
|
||||
if errs > 0 {
|
||||
return int(n), fmt.Errorf("%d errors happened while archiving (%d successes)", errs, n)
|
||||
return int(n), fmt.Errorf("%d errors happened while deleting (%d successes)", errs, n)
|
||||
}
|
||||
return int(n), nil
|
||||
}
|
||||
|
||||
// Helper function for `CleanupCheckpoints`.
|
||||
func cleanupCheckpoints(dir string, cleanupDir string, from int64, deleteInstead bool) (int, error) {
|
||||
entries, err := os.ReadDir(dir)
|
||||
// archiveCheckpoints archives checkpoint files to Parquet format.
|
||||
// Produces one Parquet file per cluster: <cleanupDir>/<cluster>/<timestamp>.parquet
|
||||
func archiveCheckpoints(checkpointsDir, cleanupDir string, from int64) (int, error) {
|
||||
clusterEntries, err := os.ReadDir(checkpointsDir)
|
||||
if err != nil {
|
||||
return 0, err
|
||||
}
|
||||
|
||||
files, err := findFiles(entries, from, false)
|
||||
if err != nil {
|
||||
return 0, err
|
||||
}
|
||||
totalFiles := 0
|
||||
|
||||
if deleteInstead {
|
||||
n := 0
|
||||
for _, checkpoint := range files {
|
||||
filename := filepath.Join(dir, checkpoint)
|
||||
if err = os.Remove(filename); err != nil {
|
||||
return n, err
|
||||
}
|
||||
n += 1
|
||||
for _, clusterEntry := range clusterEntries {
|
||||
if !clusterEntry.IsDir() {
|
||||
continue
|
||||
}
|
||||
return n, nil
|
||||
}
|
||||
|
||||
filename := filepath.Join(cleanupDir, fmt.Sprintf("%d.zip", from))
|
||||
f, err := os.OpenFile(filename, os.O_CREATE|os.O_WRONLY, CheckpointFilePerms)
|
||||
if err != nil && os.IsNotExist(err) {
|
||||
err = os.MkdirAll(cleanupDir, CheckpointDirPerms)
|
||||
if err == nil {
|
||||
f, err = os.OpenFile(filename, os.O_CREATE|os.O_WRONLY, CheckpointFilePerms)
|
||||
}
|
||||
}
|
||||
if err != nil {
|
||||
return 0, err
|
||||
}
|
||||
defer f.Close()
|
||||
bw := bufio.NewWriter(f)
|
||||
defer bw.Flush()
|
||||
zw := zip.NewWriter(bw)
|
||||
defer zw.Close()
|
||||
|
||||
n := 0
|
||||
for _, checkpoint := range files {
|
||||
// Use closure to ensure file is closed immediately after use,
|
||||
// avoiding file descriptor leak from defer in loop
|
||||
err := func() error {
|
||||
filename := filepath.Join(dir, checkpoint)
|
||||
r, err := os.Open(filename)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
defer r.Close()
|
||||
|
||||
w, err := zw.Create(checkpoint)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
if _, err = io.Copy(w, r); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
if err = os.Remove(filename); err != nil {
|
||||
return err
|
||||
}
|
||||
return nil
|
||||
}()
|
||||
cluster := clusterEntry.Name()
|
||||
hostEntries, err := os.ReadDir(filepath.Join(checkpointsDir, cluster))
|
||||
if err != nil {
|
||||
return n, err
|
||||
return totalFiles, err
|
||||
}
|
||||
n += 1
|
||||
|
||||
// Collect rows from all hosts in this cluster using worker pool
|
||||
type hostResult struct {
|
||||
rows []ParquetMetricRow
|
||||
files []string // checkpoint filenames to delete after successful write
|
||||
dir string // checkpoint directory for this host
|
||||
}
|
||||
|
||||
results := make(chan hostResult, len(hostEntries))
|
||||
work := make(chan struct {
|
||||
dir, host string
|
||||
}, Keys.NumWorkers)
|
||||
|
||||
var wg sync.WaitGroup
|
||||
errs := int32(0)
|
||||
|
||||
wg.Add(Keys.NumWorkers)
|
||||
for w := 0; w < Keys.NumWorkers; w++ {
|
||||
go func() {
|
||||
defer wg.Done()
|
||||
for item := range work {
|
||||
rows, files, err := archiveCheckpointsToParquet(item.dir, cluster, item.host, from)
|
||||
if err != nil {
|
||||
cclog.Errorf("[METRICSTORE]> error reading checkpoints for %s/%s: %s", cluster, item.host, err.Error())
|
||||
atomic.AddInt32(&errs, 1)
|
||||
continue
|
||||
}
|
||||
if len(rows) > 0 {
|
||||
results <- hostResult{rows: rows, files: files, dir: item.dir}
|
||||
}
|
||||
}
|
||||
}()
|
||||
}
|
||||
|
||||
go func() {
|
||||
for _, hostEntry := range hostEntries {
|
||||
if !hostEntry.IsDir() {
|
||||
continue
|
||||
}
|
||||
dir := filepath.Join(checkpointsDir, cluster, hostEntry.Name())
|
||||
work <- struct {
|
||||
dir, host string
|
||||
}{dir: dir, host: hostEntry.Name()}
|
||||
}
|
||||
close(work)
|
||||
wg.Wait()
|
||||
close(results)
|
||||
}()
|
||||
|
||||
// Collect all rows and file info
|
||||
var allRows []ParquetMetricRow
|
||||
var allResults []hostResult
|
||||
for r := range results {
|
||||
allRows = append(allRows, r.rows...)
|
||||
allResults = append(allResults, r)
|
||||
}
|
||||
|
||||
if errs > 0 {
|
||||
return totalFiles, fmt.Errorf("%d errors reading checkpoints for cluster %s", errs, cluster)
|
||||
}
|
||||
|
||||
if len(allRows) == 0 {
|
||||
continue
|
||||
}
|
||||
|
||||
// Write one Parquet file per cluster
|
||||
parquetFile := filepath.Join(cleanupDir, cluster, fmt.Sprintf("%d.parquet", from))
|
||||
if err := writeParquetArchive(parquetFile, allRows); err != nil {
|
||||
return totalFiles, fmt.Errorf("writing parquet archive for cluster %s: %w", cluster, err)
|
||||
}
|
||||
|
||||
// Delete archived checkpoint files
|
||||
for _, result := range allResults {
|
||||
for _, file := range result.files {
|
||||
filename := filepath.Join(result.dir, file)
|
||||
if err := os.Remove(filename); err != nil {
|
||||
cclog.Warnf("[METRICSTORE]> could not remove archived checkpoint %s: %v", filename, err)
|
||||
} else {
|
||||
totalFiles++
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
cclog.Infof("[METRICSTORE]> archived %d rows from %d files for cluster %s to %s",
|
||||
len(allRows), totalFiles, cluster, parquetFile)
|
||||
}
|
||||
|
||||
return n, nil
|
||||
return totalFiles, nil
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user