Add nodestate retention and archiving

2026-02-12 09:21:44 +01:00
parent 865cd3db54
commit 54ea5d7900
7 changed files with 400 additions and 0 deletions

View File

@@ -71,6 +71,23 @@ type ProgramConfig struct {
    // If exists, will enable dynamic zoom in frontend metric plots using the configured values
    EnableResampling *ResampleConfig `json:"resampling"`

    // Node state retention configuration
    NodeStateRetention *NodeStateRetention `json:"nodestate-retention"`
}
// NodeStateRetention configures cleanup and optional parquet archiving of old node_state rows.
type NodeStateRetention struct {
    Policy             string `json:"policy"`      // "delete" or "parquet"
    Age                int    `json:"age"`         // hours, default 24
    TargetKind         string `json:"target-kind"` // "file" or "s3"
    TargetPath         string `json:"target-path"`
    TargetEndpoint     string `json:"target-endpoint"`
    TargetBucket       string `json:"target-bucket"`
    TargetAccessKey    string `json:"target-access-key"`
    TargetSecretKey    string `json:"target-secret-key"`
    TargetRegion       string `json:"target-region"`
    TargetUsePathStyle bool   `json:"target-use-path-style"`
    MaxFileSizeMB      int    `json:"max-file-size-mb"`
}

type ResampleConfig struct {

View File

@@ -130,6 +130,59 @@ var configSchema = `
        }
      },
      "required": ["subject-job-event", "subject-node-state"]
    },
    "nodestate-retention": {
      "description": "Node state retention configuration for cleaning up old node_state rows.",
      "type": "object",
      "properties": {
        "policy": {
          "description": "Retention policy: 'delete' to remove old rows, 'parquet' to archive then delete.",
          "type": "string",
          "enum": ["delete", "parquet"]
        },
        "age": {
          "description": "Retention age in hours (default: 24).",
          "type": "integer"
        },
        "target-kind": {
          "description": "Target kind for parquet archiving: 'file' or 's3'.",
          "type": "string",
          "enum": ["file", "s3"]
        },
        "target-path": {
          "description": "Filesystem path for parquet file target.",
          "type": "string"
        },
        "target-endpoint": {
          "description": "S3 endpoint URL.",
          "type": "string"
        },
        "target-bucket": {
          "description": "S3 bucket name.",
          "type": "string"
        },
        "target-access-key": {
          "description": "S3 access key.",
          "type": "string"
        },
        "target-secret-key": {
          "description": "S3 secret key.",
          "type": "string"
        },
        "target-region": {
          "description": "S3 region.",
          "type": "string"
        },
        "target-use-path-style": {
          "description": "Use path-style S3 addressing.",
          "type": "boolean"
        },
        "max-file-size-mb": {
          "description": "Maximum parquet file size in MB (default: 128).",
          "type": "integer"
        }
      },
      "required": ["policy"]
    }
  }
}`
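For reference, a matching config.json section might look like the following. This is a sketch only: all values are illustrative placeholders, only "policy" is required by the schema, and age / max-file-size-mb fall back to 24 and 128 when omitted.

"nodestate-retention": {
  "policy": "parquet",
  "age": 48,
  "target-kind": "s3",
  "target-endpoint": "https://s3.example.com",
  "target-bucket": "cc-nodestate-archive",
  "target-access-key": "<access-key>",
  "target-secret-key": "<secret-key>",
  "target-region": "us-east-1",
  "target-use-path-style": true,
  "max-file-size-mb": 128
}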

View File

@@ -225,6 +225,71 @@ func (r *NodeRepository) UpdateNodeState(hostname string, cluster string, nodeSt
//     return nil
// }
// NodeStateWithNode combines a node state row with denormalized node info.
type NodeStateWithNode struct {
    ID              int64  `db:"id"`
    TimeStamp       int64  `db:"time_stamp"`
    NodeState       string `db:"node_state"`
    HealthState     string `db:"health_state"`
    HealthMetrics   string `db:"health_metrics"`
    CpusAllocated   int    `db:"cpus_allocated"`
    MemoryAllocated int64  `db:"memory_allocated"`
    GpusAllocated   int    `db:"gpus_allocated"`
    JobsRunning     int    `db:"jobs_running"`
    Hostname        string `db:"hostname"`
    Cluster         string `db:"cluster"`
    SubCluster      string `db:"subcluster"`
}

// FindNodeStatesBefore returns node_state rows with time_stamp < cutoff,
// joined with node info for denormalized archiving. The latest row per
// node_id is always excluded, so a node's current state is never archived.
func (r *NodeRepository) FindNodeStatesBefore(cutoff int64) ([]NodeStateWithNode, error) {
    rows, err := sq.Select(
        "node_state.id", "node_state.time_stamp", "node_state.node_state",
        "node_state.health_state", "node_state.health_metrics",
        "node_state.cpus_allocated", "node_state.memory_allocated",
        "node_state.gpus_allocated", "node_state.jobs_running",
        "node.hostname", "node.cluster", "node.subcluster",
    ).
        From("node_state").
        Join("node ON node_state.node_id = node.id").
        Where(sq.Lt{"node_state.time_stamp": cutoff}).
        Where("node_state.id NOT IN (SELECT MAX(id) FROM node_state GROUP BY node_id)").
        OrderBy("node_state.time_stamp ASC").
        RunWith(r.DB).Query()
    if err != nil {
        return nil, err
    }
    defer rows.Close()

    var result []NodeStateWithNode
    for rows.Next() {
        var ns NodeStateWithNode
        if err := rows.Scan(&ns.ID, &ns.TimeStamp, &ns.NodeState,
            &ns.HealthState, &ns.HealthMetrics,
            &ns.CpusAllocated, &ns.MemoryAllocated,
            &ns.GpusAllocated, &ns.JobsRunning,
            &ns.Hostname, &ns.Cluster, &ns.SubCluster); err != nil {
            return nil, err
        }
        result = append(result, ns)
    }
    // Surface any error encountered during row iteration.
    if err := rows.Err(); err != nil {
        return nil, err
    }
    return result, nil
}

// DeleteNodeStatesBefore removes node_state rows with time_stamp < cutoff,
// but always preserves the latest row per node_id.
func (r *NodeRepository) DeleteNodeStatesBefore(cutoff int64) (int64, error) {
    res, err := r.DB.Exec(
        `DELETE FROM node_state WHERE time_stamp < ? AND id NOT IN (SELECT MAX(id) FROM node_state GROUP BY node_id)`,
        cutoff,
    )
    if err != nil {
        return 0, err
    }
    return res.RowsAffected()
}
func (r *NodeRepository) DeleteNode(id int64) error {
    _, err := r.DB.Exec(`DELETE FROM node WHERE node.id = ?`, id)
    if err != nil {

View File

@@ -0,0 +1,120 @@
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved. This file is part of cc-backend.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
package taskmanager

import (
    "time"

    "github.com/ClusterCockpit/cc-backend/internal/config"
    "github.com/ClusterCockpit/cc-backend/internal/repository"
    pqarchive "github.com/ClusterCockpit/cc-backend/pkg/archive/parquet"
    cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
    "github.com/go-co-op/gocron/v2"
)

// RegisterNodeStateRetentionDeleteService schedules an hourly job that
// deletes node_state rows older than ageHours.
func RegisterNodeStateRetentionDeleteService(ageHours int) {
    cclog.Info("Register node state retention delete service")

    s.NewJob(gocron.DurationJob(1*time.Hour),
        gocron.NewTask(
            func() {
                cutoff := time.Now().Unix() - int64(ageHours*3600)
                nodeRepo := repository.GetNodeRepository()
                cnt, err := nodeRepo.DeleteNodeStatesBefore(cutoff)
                if err != nil {
                    cclog.Errorf("NodeState retention: error deleting old rows: %v", err)
                } else if cnt > 0 {
                    cclog.Infof("NodeState retention: deleted %d old rows", cnt)
                }
            }))
}

// RegisterNodeStateRetentionParquetService schedules an hourly job that
// archives node_state rows older than the configured age to parquet files,
// then deletes them from the database.
func RegisterNodeStateRetentionParquetService(cfg *config.NodeStateRetention) {
    cclog.Info("Register node state retention parquet service")

    maxFileSizeMB := cfg.MaxFileSizeMB
    if maxFileSizeMB <= 0 {
        maxFileSizeMB = 128
    }
    ageHours := cfg.Age
    if ageHours <= 0 {
        ageHours = 24
    }

    var target pqarchive.ParquetTarget
    var err error
    switch cfg.TargetKind {
    case "s3":
        target, err = pqarchive.NewS3Target(pqarchive.S3TargetConfig{
            Endpoint:     cfg.TargetEndpoint,
            Bucket:       cfg.TargetBucket,
            AccessKey:    cfg.TargetAccessKey,
            SecretKey:    cfg.TargetSecretKey,
            Region:       cfg.TargetRegion,
            UsePathStyle: cfg.TargetUsePathStyle,
        })
    default:
        target, err = pqarchive.NewFileTarget(cfg.TargetPath)
    }
    if err != nil {
        cclog.Errorf("NodeState parquet retention: failed to create target: %v", err)
        return
    }

    s.NewJob(gocron.DurationJob(1*time.Hour),
        gocron.NewTask(
            func() {
                cutoff := time.Now().Unix() - int64(ageHours*3600)
                nodeRepo := repository.GetNodeRepository()
                rows, err := nodeRepo.FindNodeStatesBefore(cutoff)
                if err != nil {
                    cclog.Errorf("NodeState parquet retention: error finding rows: %v", err)
                    return
                }
                if len(rows) == 0 {
                    return
                }
                cclog.Infof("NodeState parquet retention: archiving %d rows", len(rows))

                pw := pqarchive.NewNodeStateParquetWriter(target, maxFileSizeMB)
                for _, ns := range rows {
                    row := pqarchive.ParquetNodeStateRow{
                        TimeStamp:       ns.TimeStamp,
                        NodeState:       ns.NodeState,
                        HealthState:     ns.HealthState,
                        HealthMetrics:   ns.HealthMetrics,
                        CpusAllocated:   int32(ns.CpusAllocated),
                        MemoryAllocated: ns.MemoryAllocated,
                        GpusAllocated:   int32(ns.GpusAllocated),
                        JobsRunning:     int32(ns.JobsRunning),
                        Hostname:        ns.Hostname,
                        Cluster:         ns.Cluster,
                        SubCluster:      ns.SubCluster,
                    }
                    if err := pw.AddRow(row); err != nil {
                        cclog.Errorf("NodeState parquet retention: add row: %v", err)
                        continue
                    }
                }
                if err := pw.Close(); err != nil {
                    cclog.Errorf("NodeState parquet retention: close writer: %v", err)
                    return
                }

                cnt, err := nodeRepo.DeleteNodeStatesBefore(cutoff)
                if err != nil {
                    cclog.Errorf("NodeState parquet retention: error deleting rows: %v", err)
                } else {
                    cclog.Infof("NodeState parquet retention: deleted %d rows from db", cnt)
                }
            }))
}

View File

@@ -144,9 +144,30 @@ func Start(cronCfg, archiveConfig json.RawMessage) {
    RegisterUpdateDurationWorker()
    RegisterCommitJobService()

    if config.Keys.NodeStateRetention != nil && config.Keys.NodeStateRetention.Policy != "" {
        initNodeStateRetention()
    }

    s.Start()
}
// initNodeStateRetention wires the configured retention policy into the scheduler.
func initNodeStateRetention() {
    cfg := config.Keys.NodeStateRetention
    age := cfg.Age
    if age <= 0 {
        age = 24
    }
    switch cfg.Policy {
    case "delete":
        RegisterNodeStateRetentionDeleteService(age)
    case "parquet":
        RegisterNodeStateRetentionParquetService(cfg)
    default:
        cclog.Warnf("Unknown nodestate-retention policy: %s", cfg.Policy)
    }
}
// Shutdown stops the task manager and its scheduler.
func Shutdown() {
    if s != nil {

View File

@@ -0,0 +1,20 @@
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved. This file is part of cc-backend.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
package parquet

// ParquetNodeStateRow is the denormalized record schema written to
// node state parquet archives.
type ParquetNodeStateRow struct {
    TimeStamp       int64  `parquet:"time_stamp"`
    NodeState       string `parquet:"node_state"`
    HealthState     string `parquet:"health_state"`
    HealthMetrics   string `parquet:"health_metrics,optional"`
    CpusAllocated   int32  `parquet:"cpus_allocated"`
    MemoryAllocated int64  `parquet:"memory_allocated"`
    GpusAllocated   int32  `parquet:"gpus_allocated"`
    JobsRunning     int32  `parquet:"jobs_running"`
    Hostname        string `parquet:"hostname"`
    Cluster         string `parquet:"cluster"`
    SubCluster      string `parquet:"subcluster"`
}
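Because the struct uses plain parquet field tags, archived files can be read back with the same library. A minimal read-back sketch (the file name is illustrative, following the writer's naming scheme shown below; pq.ReadFile is parquet-go's generic reader):

package main

import (
    "fmt"
    "log"

    pqarchive "github.com/ClusterCockpit/cc-backend/pkg/archive/parquet"
    pq "github.com/parquet-go/parquet-go"
)

func main() {
    // Read all rows of one archived file back into memory.
    rows, err := pq.ReadFile[pqarchive.ParquetNodeStateRow]("cc-nodestate-2026-02-12-001.parquet")
    if err != nil {
        log.Fatal(err)
    }
    fmt.Printf("read %d archived node states\n", len(rows))
}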

View File

@@ -0,0 +1,104 @@
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved. This file is part of cc-backend.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
package parquet

import (
    "bytes"
    "fmt"
    "time"

    cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
    pq "github.com/parquet-go/parquet-go"
)

// NodeStateParquetWriter batches ParquetNodeStateRows and flushes them to a
// target when the estimated size exceeds maxSizeBytes.
type NodeStateParquetWriter struct {
    target       ParquetTarget
    maxSizeBytes int64
    rows         []ParquetNodeStateRow
    currentSize  int64
    fileCounter  int
    datePrefix   string
}

// NewNodeStateParquetWriter creates a new writer for node state parquet files.
func NewNodeStateParquetWriter(target ParquetTarget, maxSizeMB int) *NodeStateParquetWriter {
    return &NodeStateParquetWriter{
        target:       target,
        maxSizeBytes: int64(maxSizeMB) * 1024 * 1024,
        datePrefix:   time.Now().Format("2006-01-02"),
    }
}

// AddRow adds a row to the current batch. If the estimated batch size would
// exceed the configured maximum, the batch is flushed first.
func (pw *NodeStateParquetWriter) AddRow(row ParquetNodeStateRow) error {
    rowSize := estimateNodeStateRowSize(&row)
    if pw.currentSize+rowSize > pw.maxSizeBytes && len(pw.rows) > 0 {
        if err := pw.Flush(); err != nil {
            return err
        }
    }
    pw.rows = append(pw.rows, row)
    pw.currentSize += rowSize
    return nil
}

// Flush writes the current batch to a parquet file on the target.
func (pw *NodeStateParquetWriter) Flush() error {
    if len(pw.rows) == 0 {
        return nil
    }
    pw.fileCounter++
    fileName := fmt.Sprintf("cc-nodestate-%s-%03d.parquet", pw.datePrefix, pw.fileCounter)

    data, err := writeNodeStateParquetBytes(pw.rows)
    if err != nil {
        return fmt.Errorf("write parquet buffer: %w", err)
    }
    if err := pw.target.WriteFile(fileName, data); err != nil {
        return fmt.Errorf("write parquet file %q: %w", fileName, err)
    }
    cclog.Infof("NodeState retention: wrote %s (%d rows, %d bytes)", fileName, len(pw.rows), len(data))

    pw.rows = pw.rows[:0]
    pw.currentSize = 0
    return nil
}

// Close flushes any remaining rows and finalizes the writer.
func (pw *NodeStateParquetWriter) Close() error {
    return pw.Flush()
}

func writeNodeStateParquetBytes(rows []ParquetNodeStateRow) ([]byte, error) {
    var buf bytes.Buffer
    writer := pq.NewGenericWriter[ParquetNodeStateRow](&buf,
        pq.Compression(&pq.Snappy),
    )
    if _, err := writer.Write(rows); err != nil {
        return nil, err
    }
    if err := writer.Close(); err != nil {
        return nil, err
    }
    return buf.Bytes(), nil
}

// estimateNodeStateRowSize gives a rough in-memory size estimate used for
// batching; the actual parquet output is smaller after encoding and compression.
func estimateNodeStateRowSize(row *ParquetNodeStateRow) int64 {
    size := int64(100) // fixed numeric fields plus per-row overhead
    size += int64(len(row.NodeState) + len(row.HealthState) + len(row.HealthMetrics))
    size += int64(len(row.Hostname) + len(row.Cluster) + len(row.SubCluster))
    return size
}
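Note: the ParquetTarget interface and the NewFileTarget / NewS3Target constructors referenced above are defined in a file not shown in this view. Inferred from the call sites, the interface needs only a single WriteFile method; the following is a minimal file-backed sketch of that contract (the committed implementation, in particular its S3 side, may differ):

package parquet

import (
    "os"
    "path/filepath"
)

// ParquetTarget abstracts where finished parquet files end up.
// WriteFile is the only method the writer above relies on.
type ParquetTarget interface {
    WriteFile(name string, data []byte) error
}

// FileTarget writes parquet files into a local directory.
type FileTarget struct {
    dir string
}

// NewFileTarget creates the target directory if needed.
func NewFileTarget(path string) (*FileTarget, error) {
    if err := os.MkdirAll(path, 0o755); err != nil {
        return nil, err
    }
    return &FileTarget{dir: path}, nil
}

// WriteFile stores one finished parquet file under the target directory.
func (t *FileTarget) WriteFile(name string, data []byte) error {
    return os.WriteFile(filepath.Join(t.dir, name), data, 0o644)
}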
}