mirror of https://github.com/ClusterCockpit/cc-backend
synced 2026-02-18 00:41:46 +01:00
Update job archive retention to a uniform policy with JSON and Parquet target formats
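
The services in this commit read everything from a single Retention config value. Its struct definition is outside this hunk; the sketch below merely collects the fields the new code references, with meanings inferred from usage. Treat the names and comments as a reconstruction, not the shipped declaration:

// Reconstructed from field accesses in this diff (cfg.Age, cfg.Format,
// cfg.TargetKind, ...); the real declaration lives elsewhere in the package.
type Retention struct {
	Age           int    // act on jobs older than Age days
	IncludeDB     bool   // also delete job rows from the database
	OmitTagged    bool   // skip jobs that carry tags
	Format        string // "parquet", or JSON by default
	MaxFileSizeMB int    // parquet file size cap, 512 when unset

	TargetKind         string // "s3", or a file target by default
	TargetPath         string // file target: destination directory
	TargetEndpoint     string // s3 target settings follow
	TargetBucket       string
	TargetAccessKey    string
	TargetSecretKey    string
	TargetRegion       string
	TargetUsePathStyle bool
}
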
@@ -6,157 +6,329 @@
package taskmanager

import (
	"encoding/json"
	"fmt"
	"os"
	"path/filepath"
	"strconv"
	"strings"
	"time"

	"github.com/ClusterCockpit/cc-backend/pkg/archive"
	pqarchive "github.com/ClusterCockpit/cc-backend/pkg/archive/parquet"
	cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
	"github.com/ClusterCockpit/cc-lib/v2/schema"
	"github.com/go-co-op/gocron/v2"
)

// createParquetTarget creates a ParquetTarget (file or S3) from the retention config.
func createParquetTarget(cfg Retention) (pqarchive.ParquetTarget, error) {
	switch cfg.TargetKind {
	case "s3":
		return pqarchive.NewS3Target(pqarchive.S3TargetConfig{
			Endpoint:     cfg.TargetEndpoint,
			Bucket:       cfg.TargetBucket,
			AccessKey:    cfg.TargetAccessKey,
			SecretKey:    cfg.TargetSecretKey,
			Region:       cfg.TargetRegion,
			UsePathStyle: cfg.TargetUsePathStyle,
		})
	default:
		return pqarchive.NewFileTarget(cfg.TargetPath)
	}
}

// createTargetBackend creates a secondary archive backend (file or S3) for JSON copy/move.
func createTargetBackend(cfg Retention) (archive.ArchiveBackend, error) {
	var raw json.RawMessage
	var err error

	switch cfg.TargetKind {
	case "s3":
		raw, err = json.Marshal(map[string]interface{}{
			"kind":           "s3",
			"endpoint":       cfg.TargetEndpoint,
			"bucket":         cfg.TargetBucket,
			"access-key":     cfg.TargetAccessKey,
			"secret-key":     cfg.TargetSecretKey,
			"region":         cfg.TargetRegion,
			"use-path-style": cfg.TargetUsePathStyle,
		})
	default:
		raw, err = json.Marshal(map[string]string{
			"kind": "file",
			"path": cfg.TargetPath,
		})
	}
	if err != nil {
		return nil, fmt.Errorf("marshal target config: %w", err)
	}
	return archive.InitBackend(raw)
}
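
As a usage note, createTargetBackend reuses the primary archive's backend factory by synthesizing the same JSON configuration document the backend would otherwise read from the server config. A hypothetical illustration of what archive.InitBackend receives for a file target (the path is invented; the s3 case carries endpoint/bucket/credential keys instead):

// Hypothetical illustration, not part of this commit.
func fileBackendExample() (archive.ArchiveBackend, error) {
	raw, _ := json.Marshal(map[string]string{
		"kind": "file",
		"path": "/data/cc-cold-archive", // invented destination directory
	})
	// raw is `{"kind":"file","path":"/data/cc-cold-archive"}`
	return archive.InitBackend(raw)
}
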
// transferJobsJSON copies job data from source archive to target backend in JSON format.
func transferJobsJSON(jobs []*schema.Job, src archive.ArchiveBackend, dst archive.ArchiveBackend) error {
	// Transfer cluster configs for all clusters referenced by jobs
	clustersDone := make(map[string]bool)
	for _, job := range jobs {
		if clustersDone[job.Cluster] {
			continue
		}
		clusterCfg, err := src.LoadClusterCfg(job.Cluster)
		if err != nil {
			cclog.Warnf("Retention: load cluster config %q: %v", job.Cluster, err)
		} else {
			if err := dst.StoreClusterCfg(job.Cluster, clusterCfg); err != nil {
				cclog.Warnf("Retention: store cluster config %q: %v", job.Cluster, err)
			}
		}
		clustersDone[job.Cluster] = true
	}

	for _, job := range jobs {
		meta, err := src.LoadJobMeta(job)
		if err != nil {
			cclog.Warnf("Retention: load meta for job %d: %v", job.JobID, err)
			continue
		}
		data, err := src.LoadJobData(job)
		if err != nil {
			cclog.Warnf("Retention: load data for job %d: %v", job.JobID, err)
			continue
		}
		if err := dst.ImportJob(meta, &data); err != nil {
			cclog.Warnf("Retention: import job %d: %v", job.JobID, err)
			continue
		}
	}
	return nil
}

// transferJobsParquet converts jobs to Parquet format, organized by cluster.
func transferJobsParquet(jobs []*schema.Job, src archive.ArchiveBackend, target pqarchive.ParquetTarget, maxSizeMB int) error {
	cw := pqarchive.NewClusterAwareParquetWriter(target, maxSizeMB)

	// Set cluster configs for all clusters referenced by jobs
	clustersDone := make(map[string]bool)
	for _, job := range jobs {
		if clustersDone[job.Cluster] {
			continue
		}
		clusterCfg, err := src.LoadClusterCfg(job.Cluster)
		if err != nil {
			cclog.Warnf("Retention: load cluster config %q: %v", job.Cluster, err)
		} else {
			cw.SetClusterConfig(job.Cluster, clusterCfg)
		}
		clustersDone[job.Cluster] = true
	}

	for _, job := range jobs {
		meta, err := src.LoadJobMeta(job)
		if err != nil {
			cclog.Warnf("Retention: load meta for job %d: %v", job.JobID, err)
			continue
		}
		data, err := src.LoadJobData(job)
		if err != nil {
			cclog.Warnf("Retention: load data for job %d: %v", job.JobID, err)
			continue
		}
		row, err := pqarchive.JobToParquetRow(meta, &data)
		if err != nil {
			cclog.Warnf("Retention: convert job %d: %v", job.JobID, err)
			continue
		}
		if err := cw.AddJob(*row); err != nil {
			cclog.Errorf("Retention: add job %d to writer: %v", job.JobID, err)
			continue
		}
	}

	return cw.Close()
}
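
Taken together, the two transfer helpers make ad-hoc exports cheap. A hypothetical one-off that writes all jobs older than 90 days to a local Parquet directory while leaving the source archive untouched (the function name, path, and values are invented for illustration):

// Hypothetical helper, not part of this commit.
func exportOldJobsParquet() error {
	cfg := Retention{TargetKind: "file", TargetPath: "/tmp/cc-parquet"} // assumed values
	cutoff := time.Now().Unix() - int64(90*24*3600)
	jobs, err := jobRepo.FindJobsBetween(0, cutoff, false)
	if err != nil {
		return err
	}
	target, err := createParquetTarget(cfg)
	if err != nil {
		return err
	}
	return transferJobsParquet(jobs, archive.GetHandle(), target, 512)
}
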
// cleanupAfterTransfer removes jobs from archive and optionally from DB.
func cleanupAfterTransfer(jobs []*schema.Job, startTime int64, includeDB bool, omitTagged bool) {
	archive.GetHandle().CleanUp(jobs)

	if includeDB {
		cnt, err := jobRepo.DeleteJobsBefore(startTime, omitTagged)
		if err != nil {
			cclog.Errorf("Retention: delete jobs from db: %v", err)
		} else {
			cclog.Infof("Retention: removed %d jobs from db", cnt)
		}
		if err = jobRepo.Optimize(); err != nil {
			cclog.Errorf("Retention: db optimization error: %v", err)
		}
	}
}

// readCopyMarker reads the last-processed timestamp from a copy marker file.
func readCopyMarker(cfg Retention) int64 {
	var data []byte
	var err error

	switch cfg.TargetKind {
	case "s3":
		// For S3 we store the marker locally alongside the config
		data, err = os.ReadFile(copyMarkerPath(cfg))
	default:
		data, err = os.ReadFile(filepath.Join(cfg.TargetPath, ".copy-marker"))
	}
	if err != nil {
		return 0
	}
	ts, err := strconv.ParseInt(strings.TrimSpace(string(data)), 10, 64)
	if err != nil {
		return 0
	}
	return ts
}

// writeCopyMarker writes the last-processed timestamp to a copy marker file.
func writeCopyMarker(cfg Retention, ts int64) {
	content := []byte(strconv.FormatInt(ts, 10))
	var err error

	switch cfg.TargetKind {
	case "s3":
		err = os.WriteFile(copyMarkerPath(cfg), content, 0o640)
	default:
		err = os.WriteFile(filepath.Join(cfg.TargetPath, ".copy-marker"), content, 0o640)
	}
	if err != nil {
		cclog.Warnf("Retention: write copy marker: %v", err)
	}
}

func copyMarkerPath(cfg Retention) string {
	// For S3 targets, store the marker in a local temp-style path derived from the bucket name
	return filepath.Join(os.TempDir(), fmt.Sprintf("cc-copy-marker-%s", cfg.TargetBucket))
}
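
Since the marker is just a unix timestamp in a plain file, its behavior is easy to check in isolation. A throwaway sketch (the directory is invented and must already exist; for s3 targets the marker lands under os.TempDir() instead):

// Hypothetical round-trip check, not part of this commit.
func markerRoundTrip() bool {
	cfg := Retention{TargetKind: "file", TargetPath: "/tmp/cc-archive-target"}
	writeCopyMarker(cfg, 1700000000) // logs a warning if the directory is missing
	return readCopyMarker(cfg) == 1700000000
}
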
func RegisterRetentionDeleteService(cfg Retention) {
	cclog.Info("Register retention delete service")

	s.NewJob(gocron.DailyJob(1, gocron.NewAtTimes(gocron.NewAtTime(3, 0, 0))),
		gocron.NewTask(
			func() {
				startTime := time.Now().Unix() - int64(cfg.Age*24*3600)
				jobs, err := jobRepo.FindJobsBetween(0, startTime, cfg.OmitTagged)
				if err != nil {
					cclog.Warnf("Retention delete: error finding jobs: %v", err)
					return
				}
				if len(jobs) == 0 {
					return
				}

				cclog.Infof("Retention delete: processing %d jobs", len(jobs))
				cleanupAfterTransfer(jobs, startTime, cfg.IncludeDB, cfg.OmitTagged)
			}))
}

func RegisterRetentionCopyService(cfg Retention) {
	cclog.Infof("Register retention copy service (format=%s, target=%s)", cfg.Format, cfg.TargetKind)

	maxFileSizeMB := cfg.MaxFileSizeMB
	if maxFileSizeMB <= 0 {
		maxFileSizeMB = 512
	}

	s.NewJob(gocron.DailyJob(1, gocron.NewAtTimes(gocron.NewAtTime(4, 0, 0))),
		gocron.NewTask(
			func() {
				cutoff := time.Now().Unix() - int64(cfg.Age*24*3600)
				lastProcessed := readCopyMarker(cfg)

				jobs, err := jobRepo.FindJobsBetween(lastProcessed, cutoff, cfg.OmitTagged)
				if err != nil {
					cclog.Warnf("Retention copy: error finding jobs: %v", err)
					return
				}
				if len(jobs) == 0 {
					return
				}

				cclog.Infof("Retention copy: processing %d jobs", len(jobs))
				ar := archive.GetHandle()

				switch cfg.Format {
				case "parquet":
					target, err := createParquetTarget(cfg)
					if err != nil {
						cclog.Errorf("Retention copy: create parquet target: %v", err)
						return
					}
					if err := transferJobsParquet(jobs, ar, target, maxFileSizeMB); err != nil {
						cclog.Errorf("Retention copy: parquet transfer: %v", err)
						return
					}
				default: // json
					dst, err := createTargetBackend(cfg)
					if err != nil {
						cclog.Errorf("Retention copy: create target backend: %v", err)
						return
					}
					if err := transferJobsJSON(jobs, ar, dst); err != nil {
						cclog.Errorf("Retention copy: json transfer: %v", err)
						return
					}
				}

				writeCopyMarker(cfg, cutoff)
			}))
}
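
One design point worth calling out: unlike delete and move, which always scan everything older than the cutoff, the copy service is incremental. Each run fetches only jobs in the window between the persisted marker and the new cutoff, then advances the marker, so jobs copied on a previous run are not re-read (assuming FindJobsBetween treats its first two arguments as a start-time window):

// Successive runs with cfg.Age = 180, illustrated (unix seconds):
//   run 1: FindJobsBetween(0,        now-180d)  -> full backfill, marker := now-180d
//   run 2: FindJobsBetween(now-180d, now'-180d) -> only newly expired jobs
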
func RegisterRetentionMoveService(cfg Retention) {
	cclog.Infof("Register retention move service (format=%s, target=%s)", cfg.Format, cfg.TargetKind)

	maxFileSizeMB := cfg.MaxFileSizeMB
	if maxFileSizeMB <= 0 {
		maxFileSizeMB = 512
	}

	s.NewJob(gocron.DailyJob(1, gocron.NewAtTimes(gocron.NewAtTime(5, 0, 0))),
		gocron.NewTask(
			func() {
				startTime := time.Now().Unix() - int64(cfg.Age*24*3600)
				jobs, err := jobRepo.FindJobsBetween(0, startTime, cfg.OmitTagged)
				if err != nil {
					cclog.Warnf("Retention move: error finding jobs: %v", err)
					return
				}
				if len(jobs) == 0 {
					return
				}

				cclog.Infof("Retention move: processing %d jobs", len(jobs))
				ar := archive.GetHandle()

				switch cfg.Format {
				case "parquet":
					target, err := createParquetTarget(cfg)
					if err != nil {
						cclog.Errorf("Retention move: create parquet target: %v", err)
						return
					}
					if err := transferJobsParquet(jobs, ar, target, maxFileSizeMB); err != nil {
						cclog.Errorf("Retention move: parquet transfer: %v", err)
						return
					}
				default: // json
					dst, err := createTargetBackend(cfg)
					if err != nil {
						cclog.Errorf("Retention move: create target backend: %v", err)
						return
					}
					if err := transferJobsJSON(jobs, ar, dst); err != nil {
						cclog.Errorf("Retention move: json transfer: %v", err)
						return
					}
				}

				cleanupAfterTransfer(jobs, startTime, cfg.IncludeDB, cfg.OmitTagged)
			}))
}