mirror of
https://github.com/ClusterCockpit/cc-backend
synced 2026-03-21 07:17:30 +01:00
Patch parquet archive writer to reduce high memory usage
This commit is contained in:
@@ -43,7 +43,6 @@ func CleanUp(wg *sync.WaitGroup, ctx context.Context) {
|
||||
// cleanUpWorker takes simple values to configure what it does
|
||||
func cleanUpWorker(wg *sync.WaitGroup, ctx context.Context, interval string, mode string, cleanupDir string, delete bool) {
|
||||
wg.Go(func() {
|
||||
|
||||
d, err := time.ParseDuration(interval)
|
||||
if err != nil {
|
||||
cclog.Fatalf("[METRICSTORE]> error parsing %s interval duration: %v\n", mode, err)
|
||||
@@ -99,8 +98,8 @@ func deleteCheckpoints(checkpointsDir string, from int64) (int, error) {
|
||||
}
|
||||
|
||||
type workItem struct {
|
||||
dir string
|
||||
cluster, host string
|
||||
dir string
|
||||
cluster, host string
|
||||
}
|
||||
|
||||
var wg sync.WaitGroup
|
||||
@@ -187,9 +186,8 @@ func archiveCheckpoints(checkpointsDir, cleanupDir string, from int64) (int, err
|
||||
return totalFiles, err
|
||||
}
|
||||
|
||||
// Collect rows from all hosts in this cluster using worker pool
|
||||
// Collect files to delete from all hosts in this cluster using worker pool
|
||||
type hostResult struct {
|
||||
rows []ParquetMetricRow
|
||||
files []string // checkpoint filenames to delete after successful write
|
||||
dir string // checkpoint directory for this host
|
||||
}
|
||||
@@ -199,6 +197,8 @@ func archiveCheckpoints(checkpointsDir, cleanupDir string, from int64) (int, err
|
||||
dir, host string
|
||||
}, Keys.NumWorkers)
|
||||
|
||||
rowChan := make(chan *ParquetMetricRow, 10000)
|
||||
|
||||
var wg sync.WaitGroup
|
||||
errs := int32(0)
|
||||
|
||||
@@ -207,19 +207,20 @@ func archiveCheckpoints(checkpointsDir, cleanupDir string, from int64) (int, err
|
||||
go func() {
|
||||
defer wg.Done()
|
||||
for item := range work {
|
||||
rows, files, err := archiveCheckpointsToParquet(item.dir, cluster, item.host, from)
|
||||
files, err := archiveCheckpointsToParquet(item.dir, cluster, item.host, from, rowChan)
|
||||
if err != nil {
|
||||
cclog.Errorf("[METRICSTORE]> error reading checkpoints for %s/%s: %s", cluster, item.host, err.Error())
|
||||
atomic.AddInt32(&errs, 1)
|
||||
continue
|
||||
}
|
||||
if len(rows) > 0 {
|
||||
results <- hostResult{rows: rows, files: files, dir: item.dir}
|
||||
if len(files) > 0 {
|
||||
results <- hostResult{files: files, dir: item.dir}
|
||||
}
|
||||
}
|
||||
}()
|
||||
}
|
||||
|
||||
// Produce work items
|
||||
go func() {
|
||||
for _, hostEntry := range hostEntries {
|
||||
if !hostEntry.IsDir() {
|
||||
@@ -231,15 +232,22 @@ func archiveCheckpoints(checkpointsDir, cleanupDir string, from int64) (int, err
|
||||
}{dir: dir, host: hostEntry.Name()}
|
||||
}
|
||||
close(work)
|
||||
}()
|
||||
|
||||
// Wait for all workers and close rowChan and results
|
||||
go func() {
|
||||
wg.Wait()
|
||||
close(rowChan)
|
||||
close(results)
|
||||
}()
|
||||
|
||||
// Collect all rows and file info
|
||||
var allRows []ParquetMetricRow
|
||||
// Concurrently write from rowChan to Parquet
|
||||
parquetFile := filepath.Join(cleanupDir, cluster, fmt.Sprintf("%d.parquet", from))
|
||||
rowCount, writerErr := writeParquetArchiveStream(parquetFile, rowChan)
|
||||
|
||||
// Collect all file info
|
||||
var allResults []hostResult
|
||||
for r := range results {
|
||||
allRows = append(allRows, r.rows...)
|
||||
allResults = append(allResults, r)
|
||||
}
|
||||
|
||||
@@ -247,17 +255,18 @@ func archiveCheckpoints(checkpointsDir, cleanupDir string, from int64) (int, err
|
||||
return totalFiles, fmt.Errorf("%d errors reading checkpoints for cluster %s", errs, cluster)
|
||||
}
|
||||
|
||||
if len(allRows) == 0 {
|
||||
if writerErr != nil {
|
||||
return totalFiles, fmt.Errorf("writing parquet archive for cluster %s: %w", cluster, writerErr)
|
||||
}
|
||||
|
||||
if rowCount == 0 {
|
||||
// Clean up the empty parquet file if one was created
|
||||
os.Remove(parquetFile)
|
||||
continue
|
||||
}
|
||||
|
||||
// Write one Parquet file per cluster
|
||||
parquetFile := filepath.Join(cleanupDir, cluster, fmt.Sprintf("%d.parquet", from))
|
||||
if err := writeParquetArchive(parquetFile, allRows); err != nil {
|
||||
return totalFiles, fmt.Errorf("writing parquet archive for cluster %s: %w", cluster, err)
|
||||
}
|
||||
|
||||
// Delete archived checkpoint files
|
||||
totalFilesCluster := 0
|
||||
for _, result := range allResults {
|
||||
for _, file := range result.files {
|
||||
filename := filepath.Join(result.dir, file)
|
||||
@@ -265,12 +274,13 @@ func archiveCheckpoints(checkpointsDir, cleanupDir string, from int64) (int, err
|
||||
cclog.Warnf("[METRICSTORE]> could not remove archived checkpoint %s: %v", filename, err)
|
||||
} else {
|
||||
totalFiles++
|
||||
totalFilesCluster++
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
cclog.Infof("[METRICSTORE]> archived %d rows from %d files for cluster %s to %s",
|
||||
len(allRows), totalFiles, cluster, parquetFile)
|
||||
rowCount, totalFilesCluster, cluster, parquetFile)
|
||||
}
|
||||
|
||||
return totalFiles, nil
|
||||
|
||||
Reference in New Issue
Block a user