Merge pull request #534 from ClusterCockpit/hotfix

feat: Add command line switch to trigger manual metricstore checkpoin…
This commit is contained in:
Jan Eitzinger
2026-03-23 19:28:06 +01:00
committed by GitHub
3 changed files with 61 additions and 7 deletions

View File

@@ -11,7 +11,8 @@ import "flag"
var (
flagReinitDB, flagInit, flagServer, flagSyncLDAP, flagGops, flagMigrateDB, flagRevertDB,
flagForceDB, flagDev, flagVersion, flagLogDateTime, flagApplyTags, flagOptimizeDB bool
flagForceDB, flagDev, flagVersion, flagLogDateTime, flagApplyTags, flagOptimizeDB,
flagCleanupCheckpoints bool
flagNewUser, flagDelUser, flagGenJWT, flagConfigFile, flagImportJob, flagLogLevel string
)
@@ -28,6 +29,7 @@ func cliInit() {
flag.BoolVar(&flagApplyTags, "apply-tags", false, "Run taggers on all completed jobs and exit")
flag.BoolVar(&flagForceDB, "force-db", false, "Force database version, clear dirty flag and exit")
flag.BoolVar(&flagOptimizeDB, "optimize-db", false, "Optimize database: run VACUUM to reclaim space, then ANALYZE to update query planner statistics")
flag.BoolVar(&flagCleanupCheckpoints, "cleanup-checkpoints", false, "Clean up old checkpoint files (delete or archive) based on retention settings, then exit")
flag.BoolVar(&flagLogDateTime, "logdate", false, "Set this flag to add date and time to log messages")
flag.StringVar(&flagConfigFile, "config", "./config.json", "Specify alternative path to `config.json`")
flag.StringVar(&flagNewUser, "add-user", "", "Add a new user. Argument format: <username>:[admin,support,manager,api,user]:<password>")

View File

@@ -14,6 +14,7 @@ import (
"fmt"
"os"
"os/signal"
goruntime "runtime"
"runtime/debug"
"strings"
"sync"
@@ -536,6 +537,43 @@ func run() error {
return err
}
// Handle checkpoint cleanup
if flagCleanupCheckpoints {
mscfg := ccconf.GetPackageConfig("metric-store")
if mscfg == nil {
return fmt.Errorf("metric-store configuration required for checkpoint cleanup")
}
if err := json.Unmarshal(mscfg, &metricstore.Keys); err != nil {
return fmt.Errorf("decoding metric-store config: %w", err)
}
if metricstore.Keys.NumWorkers <= 0 {
metricstore.Keys.NumWorkers = min(goruntime.NumCPU()/2+1, metricstore.DefaultMaxWorkers)
}
d, err := time.ParseDuration(metricstore.Keys.RetentionInMemory)
if err != nil {
return fmt.Errorf("parsing retention-in-memory: %w", err)
}
from := time.Now().Add(-d)
deleteMode := metricstore.Keys.Cleanup == nil || metricstore.Keys.Cleanup.Mode != "archive"
cleanupDir := ""
if !deleteMode {
cleanupDir = metricstore.Keys.Cleanup.RootDir
}
cclog.Infof("Cleaning up checkpoints older than %s...", from.Format(time.RFC3339))
n, err := metricstore.CleanupCheckpoints(
metricstore.Keys.Checkpoints.RootDir, cleanupDir, from.Unix(), deleteMode)
if err != nil {
return fmt.Errorf("checkpoint cleanup: %w", err)
}
if deleteMode {
cclog.Exitf("Cleanup done: %d checkpoint files deleted.", n)
} else {
cclog.Exitf("Cleanup done: %d checkpoint files archived to parquet.", n)
}
}
// Exit if start server is not requested
if !flagServer {
cclog.Exit("No errors, server flag not set. Exiting cc-backend.")

View File

@@ -54,11 +54,16 @@
const paging = { itemsPerPage: 50, page: 1 };
const sorting = { field: "startTime", type: "col", order: "DESC" };
const nodeMetricsQuery = gql`
query ($cluster: String!, $nodes: [String!], $from: Time!, $to: Time!) {
query (
$cluster: String!,
$nodes: [String!],
$from: Time!,
$to: Time!,
$nodeFilter: [NodeFilter!]!,
$sorting: OrderByInput!
) {
nodeMetrics(cluster: $cluster, nodes: $nodes, from: $from, to: $to) {
host
nodeState
metricHealth
subCluster
metrics {
name
@@ -79,6 +84,13 @@
}
}
}
},
nodeStatus: nodes(filter: $nodeFilter, order: $sorting) {
count
items {
schedulerState
healthState
}
}
}
`;
@@ -146,6 +158,8 @@
nodes: [hostname],
from: from?.toISOString(),
to: to?.toISOString(),
nodeFilter: { hostname: { eq: hostname }},
sorting // $sorting unused in backend: Use placeholder
},
})
);
@@ -157,8 +171,8 @@
})
);
const thisNodeState = $derived($nodeMetricsData?.data?.nodeMetrics[0]?.nodeState || 'notindb');
const thisMetricHealth = $derived($nodeMetricsData?.data?.nodeMetrics[0]?.metricHealth || 'unknown');
const thisNodeState = $derived($nodeMetricsData?.data?.nodeStatus?.items[0]?.schedulerState || 'notindb');
const thisMetricHealth = $derived($nodeMetricsData?.data?.nodeStatus?.items[0]?.healthState || 'unknown');
</script>
<Row cols={{ xs: 2, lg: 3}}>