Patch Parquet archive writer to reduce high memory usage

2026-03-21 07:17:30 +01:00 · 2026-03-17 18:41:47 +01:00
parent 032d1e0692
commit 0a4c4d8e57
3 changed files with 112 additions and 52 deletions
--- a/pkg/metricstore/archive.go
+++ b/pkg/metricstore/archive.go
@@ -43,7 +43,6 @@ func CleanUp(wg *sync.WaitGroup, ctx context.Context) {
 // cleanUpWorker takes simple values to configure what it does
 func cleanUpWorker(wg *sync.WaitGroup, ctx context.Context, interval string, mode string, cleanupDir string, delete bool) {
 	wg.Go(func() {
-
 		d, err := time.ParseDuration(interval)
 		if err != nil {
 			cclog.Fatalf("[METRICSTORE]> error parsing %s interval duration: %v\n", mode, err)
@@ -99,8 +98,8 @@ func deleteCheckpoints(checkpointsDir string, from int64) (int, error) {
 	}

 	type workItem struct {
-		dir            string
-		cluster, host  string
+		dir           string
+		cluster, host string
 	}

 	var wg sync.WaitGroup
@@ -187,9 +186,8 @@ func archiveCheckpoints(checkpointsDir, cleanupDir string, from int64) (int, err
 			return totalFiles, err
 		}

-		// Collect rows from all hosts in this cluster using worker pool
+		// Collect the checkpoint files to delete for every host in this cluster, using a worker pool
 		type hostResult struct {
-			rows  []ParquetMetricRow
 			files []string // checkpoint filenames to delete after successful write
 			dir   string   // checkpoint directory for this host
 		}
@@ -199,6 +197,8 @@ func archiveCheckpoints(checkpointsDir, cleanupDir string, from int64) (int, err
 			dir, host string
 		}, Keys.NumWorkers)

+		rowChan := make(chan *ParquetMetricRow, 10000)
+
 		var wg sync.WaitGroup
 		errs := int32(0)

@@ -207,19 +207,20 @@ func archiveCheckpoints(checkpointsDir, cleanupDir string, from int64) (int, err
 			go func() {
 				defer wg.Done()
 				for item := range work {
-					rows, files, err := archiveCheckpointsToParquet(item.dir, cluster, item.host, from)
+					files, err := archiveCheckpointsToParquet(item.dir, cluster, item.host, from, rowChan)
 					if err != nil {
 						cclog.Errorf("[METRICSTORE]> error reading checkpoints for %s/%s: %s", cluster, item.host, err.Error())
 						atomic.AddInt32(&errs, 1)
 						continue
 					}
-					if len(rows) > 0 {
-						results <- hostResult{rows: rows, files: files, dir: item.dir}
+					if len(files) > 0 {
+						results <- hostResult{files: files, dir: item.dir}
 					}
 				}
 			}()
 		}

+		// Produce work items
 		go func() {
 			for _, hostEntry := range hostEntries {
 				if !hostEntry.IsDir() {
@@ -231,15 +232,22 @@ func archiveCheckpoints(checkpointsDir, cleanupDir string, from int64) (int, err
 				}{dir: dir, host: hostEntry.Name()}
 			}
 			close(work)
+		}()
+
+		// Wait for all workers and close rowChan and results
+		go func() {
 			wg.Wait()
+			close(rowChan)
 			close(results)
 		}()

-		// Collect all rows and file info
-		var allRows []ParquetMetricRow
+		// Concurrently write from rowChan to Parquet
+		parquetFile := filepath.Join(cleanupDir, cluster, fmt.Sprintf("%d.parquet", from))
+		rowCount, writerErr := writeParquetArchiveStream(parquetFile, rowChan)
+
+		// Collect all file info
 		var allResults []hostResult
 		for r := range results {
-			allRows = append(allRows, r.rows...)
 			allResults = append(allResults, r)
 		}

@@ -247,17 +255,18 @@ func archiveCheckpoints(checkpointsDir, cleanupDir string, from int64) (int, err
 			return totalFiles, fmt.Errorf("%d errors reading checkpoints for cluster %s", errs, cluster)
 		}

-		if len(allRows) == 0 {
+		if writerErr != nil {
+			return totalFiles, fmt.Errorf("writing parquet archive for cluster %s: %w", cluster, writerErr)
+		}
+
+		if rowCount == 0 {
+			// Clean up the empty Parquet file if one was created
+			os.Remove(parquetFile)
 			continue
 		}

-		// Write one Parquet file per cluster
-		parquetFile := filepath.Join(cleanupDir, cluster, fmt.Sprintf("%d.parquet", from))
-		if err := writeParquetArchive(parquetFile, allRows); err != nil {
-			return totalFiles, fmt.Errorf("writing parquet archive for cluster %s: %w", cluster, err)
-		}
-
 		// Delete archived checkpoint files
+		totalFilesCluster := 0
 		for _, result := range allResults {
 			for _, file := range result.files {
 				filename := filepath.Join(result.dir, file)
@@ -265,12 +274,13 @@ func archiveCheckpoints(checkpointsDir, cleanupDir string, from int64) (int, err
 					cclog.Warnf("[METRICSTORE]> could not remove archived checkpoint %s: %v", filename, err)
 				} else {
 					totalFiles++
+					totalFilesCluster++
 				}
 			}
 		}

 		cclog.Infof("[METRICSTORE]> archived %d rows from %d files for cluster %s to %s",
-			len(allRows), totalFiles, cluster, parquetFile)
+			rowCount, totalFilesCluster, cluster, parquetFile)
 	}

 	return totalFiles, nil