mirror of
https://github.com/ClusterCockpit/cc-backend
synced 2026-03-21 07:17:30 +01:00
Patching parquet archive writer for high memory usage
This commit is contained in:
@@ -31,15 +31,15 @@ type ParquetMetricRow struct {
|
||||
Value float32 `parquet:"value"`
|
||||
}
|
||||
|
||||
// flattenCheckpointFile recursively converts a CheckpointFile tree into Parquet rows.
|
||||
// flattenCheckpointFile recursively walks a CheckpointFile tree and sends one Parquet row per non-NaN value on rowChan.
|
||||
// The scope path is built from the hierarchy: host level is "node", then child names
|
||||
// map to scope/scope_id (e.g., "socket0" → scope="socket", scope_id="0").
|
||||
func flattenCheckpointFile(cf *CheckpointFile, cluster, hostname, scope, scopeID string, rows []ParquetMetricRow) []ParquetMetricRow {
|
||||
func flattenCheckpointFile(cf *CheckpointFile, cluster, hostname, scope, scopeID string, rowChan chan<- *ParquetMetricRow) {
|
||||
for metricName, cm := range cf.Metrics {
|
||||
ts := cm.Start
|
||||
for _, v := range cm.Data {
|
||||
if !v.IsNaN() {
|
||||
rows = append(rows, ParquetMetricRow{
|
||||
rowChan <- &ParquetMetricRow{
|
||||
Cluster: cluster,
|
||||
Hostname: hostname,
|
||||
Metric: metricName,
|
||||
@@ -48,7 +48,7 @@ func flattenCheckpointFile(cf *CheckpointFile, cluster, hostname, scope, scopeID
|
||||
Timestamp: ts,
|
||||
Frequency: cm.Frequency,
|
||||
Value: float32(v),
|
||||
})
|
||||
}
|
||||
}
|
||||
ts += cm.Frequency
|
||||
}
|
||||
@@ -56,10 +56,8 @@ func flattenCheckpointFile(cf *CheckpointFile, cluster, hostname, scope, scopeID
|
||||
|
||||
for childName, childCf := range cf.Children {
|
||||
childScope, childScopeID := parseScopeFromName(childName)
|
||||
rows = flattenCheckpointFile(childCf, cluster, hostname, childScope, childScopeID, rows)
|
||||
flattenCheckpointFile(childCf, cluster, hostname, childScope, childScopeID, rowChan)
|
||||
}
|
||||
|
||||
return rows
|
||||
}
|
||||
|
||||
// parseScopeFromName infers scope and scope_id from a child level name.
|
||||
@@ -91,15 +89,23 @@ func parseScopeFromName(name string) (string, string) {
|
||||
return name, ""
|
||||
}
|
||||
|
||||
// writeParquetArchive writes rows to a Parquet file with Zstd compression.
|
||||
func writeParquetArchive(filename string, rows []ParquetMetricRow) error {
|
||||
// writeParquetArchiveStream writes rows from a channel to a Parquet file with Zstd compression in batches.
|
||||
func writeParquetArchiveStream(filename string, rowChan <-chan *ParquetMetricRow) (int, error) {
|
||||
if err := os.MkdirAll(filepath.Dir(filename), CheckpointDirPerms); err != nil {
|
||||
return fmt.Errorf("creating archive directory: %w", err)
|
||||
go func() {
|
||||
for range rowChan {
|
||||
}
|
||||
}()
|
||||
return 0, fmt.Errorf("creating archive directory: %w", err)
|
||||
}
|
||||
|
||||
f, err := os.OpenFile(filename, os.O_CREATE|os.O_WRONLY|os.O_TRUNC, CheckpointFilePerms)
|
||||
if err != nil {
|
||||
return fmt.Errorf("creating parquet file: %w", err)
|
||||
go func() {
|
||||
for range rowChan {
|
||||
}
|
||||
}()
|
||||
return 0, fmt.Errorf("creating parquet file: %w", err)
|
||||
}
|
||||
defer f.Close()
|
||||
|
||||
@@ -115,19 +121,45 @@ func writeParquetArchive(filename string, rows []ParquetMetricRow) error {
|
||||
)),
|
||||
)
|
||||
|
||||
if _, err := writer.Write(rows); err != nil {
|
||||
return fmt.Errorf("writing parquet rows: %w", err)
|
||||
batchSize := 4096
|
||||
batch := make([]ParquetMetricRow, 0, batchSize)
|
||||
rowCount := 0
|
||||
var writeErr error
|
||||
|
||||
for rowPtr := range rowChan {
|
||||
if writeErr != nil {
|
||||
continue // Drain the channel to prevent worker deadlock
|
||||
}
|
||||
batch = append(batch, *rowPtr)
|
||||
if len(batch) >= batchSize {
|
||||
if _, err := writer.Write(batch); err != nil {
|
||||
writeErr = fmt.Errorf("writing parquet batch: %w", err)
|
||||
}
|
||||
rowCount += len(batch)
|
||||
batch = batch[:0]
|
||||
}
|
||||
}
|
||||
|
||||
if writeErr != nil {
|
||||
return rowCount, writeErr
|
||||
}
|
||||
|
||||
if len(batch) > 0 {
|
||||
if _, err := writer.Write(batch); err != nil {
|
||||
return rowCount, fmt.Errorf("writing remaining parquet batch: %w", err)
|
||||
}
|
||||
rowCount += len(batch)
|
||||
}
|
||||
|
||||
if err := writer.Close(); err != nil {
|
||||
return fmt.Errorf("closing parquet writer: %w", err)
|
||||
return rowCount, fmt.Errorf("closing parquet writer: %w", err)
|
||||
}
|
||||
|
||||
if err := bw.Flush(); err != nil {
|
||||
return fmt.Errorf("flushing parquet file: %w", err)
|
||||
return rowCount, fmt.Errorf("flushing parquet file: %w", err)
|
||||
}
|
||||
|
||||
return nil
|
||||
return rowCount, nil
|
||||
}
|
||||
|
||||
// loadCheckpointFileFromDisk reads a JSON or binary checkpoint file and returns
|
||||
@@ -180,24 +212,22 @@ func loadCheckpointFileFromDisk(filename string) (*CheckpointFile, error) {
|
||||
}
|
||||
|
||||
// archiveCheckpointsToParquet reads checkpoint files for a host directory,
|
||||
// converts them to Parquet rows. Returns the rows and filenames that were processed.
|
||||
func archiveCheckpointsToParquet(dir, cluster, host string, from int64) ([]ParquetMetricRow, []string, error) {
|
||||
// and sends the resulting Parquet rows on rowChan. Returns the filenames that were processed.
|
||||
func archiveCheckpointsToParquet(dir, cluster, host string, from int64, rowChan chan<- *ParquetMetricRow) ([]string, error) {
|
||||
entries, err := os.ReadDir(dir)
|
||||
if err != nil {
|
||||
return nil, nil, err
|
||||
return nil, err
|
||||
}
|
||||
|
||||
files, err := findFiles(entries, from, false)
|
||||
if err != nil {
|
||||
return nil, nil, err
|
||||
return nil, err
|
||||
}
|
||||
|
||||
if len(files) == 0 {
|
||||
return nil, nil, nil
|
||||
return nil, nil
|
||||
}
|
||||
|
||||
var rows []ParquetMetricRow
|
||||
|
||||
for _, checkpoint := range files {
|
||||
filename := filepath.Join(dir, checkpoint)
|
||||
cf, err := loadCheckpointFileFromDisk(filename)
|
||||
@@ -206,8 +236,8 @@ func archiveCheckpointsToParquet(dir, cluster, host string, from int64) ([]Parqu
|
||||
continue
|
||||
}
|
||||
|
||||
rows = flattenCheckpointFile(cf, cluster, host, "node", "", rows)
|
||||
flattenCheckpointFile(cf, cluster, host, "node", "", rowChan)
|
||||
}
|
||||
|
||||
return rows, files, nil
|
||||
return files, nil
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user