Merge pull request #534 from ClusterCockpit/hotfix

feat: Add command line switch to trigger manual metricstore checkpoint cleanup
This commit is contained in:
Jan Eitzinger
2026-03-23 19:28:06 +01:00
committed by GitHub
3 changed files with 61 additions and 7 deletions

View File

@@ -11,7 +11,8 @@ import "flag"
var ( var (
flagReinitDB, flagInit, flagServer, flagSyncLDAP, flagGops, flagMigrateDB, flagRevertDB, flagReinitDB, flagInit, flagServer, flagSyncLDAP, flagGops, flagMigrateDB, flagRevertDB,
flagForceDB, flagDev, flagVersion, flagLogDateTime, flagApplyTags, flagOptimizeDB bool flagForceDB, flagDev, flagVersion, flagLogDateTime, flagApplyTags, flagOptimizeDB,
flagCleanupCheckpoints bool
flagNewUser, flagDelUser, flagGenJWT, flagConfigFile, flagImportJob, flagLogLevel string flagNewUser, flagDelUser, flagGenJWT, flagConfigFile, flagImportJob, flagLogLevel string
) )
@@ -28,6 +29,7 @@ func cliInit() {
flag.BoolVar(&flagApplyTags, "apply-tags", false, "Run taggers on all completed jobs and exit") flag.BoolVar(&flagApplyTags, "apply-tags", false, "Run taggers on all completed jobs and exit")
flag.BoolVar(&flagForceDB, "force-db", false, "Force database version, clear dirty flag and exit") flag.BoolVar(&flagForceDB, "force-db", false, "Force database version, clear dirty flag and exit")
flag.BoolVar(&flagOptimizeDB, "optimize-db", false, "Optimize database: run VACUUM to reclaim space, then ANALYZE to update query planner statistics") flag.BoolVar(&flagOptimizeDB, "optimize-db", false, "Optimize database: run VACUUM to reclaim space, then ANALYZE to update query planner statistics")
flag.BoolVar(&flagCleanupCheckpoints, "cleanup-checkpoints", false, "Clean up old checkpoint files (delete or archive) based on retention settings, then exit")
flag.BoolVar(&flagLogDateTime, "logdate", false, "Set this flag to add date and time to log messages") flag.BoolVar(&flagLogDateTime, "logdate", false, "Set this flag to add date and time to log messages")
flag.StringVar(&flagConfigFile, "config", "./config.json", "Specify alternative path to `config.json`") flag.StringVar(&flagConfigFile, "config", "./config.json", "Specify alternative path to `config.json`")
flag.StringVar(&flagNewUser, "add-user", "", "Add a new user. Argument format: <username>:[admin,support,manager,api,user]:<password>") flag.StringVar(&flagNewUser, "add-user", "", "Add a new user. Argument format: <username>:[admin,support,manager,api,user]:<password>")

View File

@@ -14,6 +14,7 @@ import (
"fmt" "fmt"
"os" "os"
"os/signal" "os/signal"
goruntime "runtime"
"runtime/debug" "runtime/debug"
"strings" "strings"
"sync" "sync"
@@ -536,6 +537,43 @@ func run() error {
return err return err
} }
// Handle checkpoint cleanup
if flagCleanupCheckpoints {
mscfg := ccconf.GetPackageConfig("metric-store")
if mscfg == nil {
return fmt.Errorf("metric-store configuration required for checkpoint cleanup")
}
if err := json.Unmarshal(mscfg, &metricstore.Keys); err != nil {
return fmt.Errorf("decoding metric-store config: %w", err)
}
if metricstore.Keys.NumWorkers <= 0 {
metricstore.Keys.NumWorkers = min(goruntime.NumCPU()/2+1, metricstore.DefaultMaxWorkers)
}
d, err := time.ParseDuration(metricstore.Keys.RetentionInMemory)
if err != nil {
return fmt.Errorf("parsing retention-in-memory: %w", err)
}
from := time.Now().Add(-d)
deleteMode := metricstore.Keys.Cleanup == nil || metricstore.Keys.Cleanup.Mode != "archive"
cleanupDir := ""
if !deleteMode {
cleanupDir = metricstore.Keys.Cleanup.RootDir
}
cclog.Infof("Cleaning up checkpoints older than %s...", from.Format(time.RFC3339))
n, err := metricstore.CleanupCheckpoints(
metricstore.Keys.Checkpoints.RootDir, cleanupDir, from.Unix(), deleteMode)
if err != nil {
return fmt.Errorf("checkpoint cleanup: %w", err)
}
if deleteMode {
cclog.Exitf("Cleanup done: %d checkpoint files deleted.", n)
} else {
cclog.Exitf("Cleanup done: %d checkpoint files archived to parquet.", n)
}
}
// Exit if start server is not requested // Exit if start server is not requested
if !flagServer { if !flagServer {
cclog.Exit("No errors, server flag not set. Exiting cc-backend.") cclog.Exit("No errors, server flag not set. Exiting cc-backend.")

View File

@@ -54,11 +54,16 @@
const paging = { itemsPerPage: 50, page: 1 }; const paging = { itemsPerPage: 50, page: 1 };
const sorting = { field: "startTime", type: "col", order: "DESC" }; const sorting = { field: "startTime", type: "col", order: "DESC" };
const nodeMetricsQuery = gql` const nodeMetricsQuery = gql`
query ($cluster: String!, $nodes: [String!], $from: Time!, $to: Time!) { query (
$cluster: String!,
$nodes: [String!],
$from: Time!,
$to: Time!,
$nodeFilter: [NodeFilter!]!,
$sorting: OrderByInput!
) {
nodeMetrics(cluster: $cluster, nodes: $nodes, from: $from, to: $to) { nodeMetrics(cluster: $cluster, nodes: $nodes, from: $from, to: $to) {
host host
nodeState
metricHealth
subCluster subCluster
metrics { metrics {
name name
@@ -79,7 +84,14 @@
} }
} }
} }
} },
nodeStatus: nodes(filter: $nodeFilter, order: $sorting) {
count
items {
schedulerState
healthState
}
}
} }
`; `;
const nodeJobsQuery = gql` const nodeJobsQuery = gql`
@@ -146,6 +158,8 @@
nodes: [hostname], nodes: [hostname],
from: from?.toISOString(), from: from?.toISOString(),
to: to?.toISOString(), to: to?.toISOString(),
nodeFilter: { hostname: { eq: hostname }},
sorting // $sorting unused in backend: Use placeholder
}, },
}) })
); );
@@ -157,8 +171,8 @@
}) })
); );
const thisNodeState = $derived($nodeMetricsData?.data?.nodeMetrics[0]?.nodeState || 'notindb'); const thisNodeState = $derived($nodeMetricsData?.data?.nodeStatus?.items[0]?.schedulerState || 'notindb');
const thisMetricHealth = $derived($nodeMetricsData?.data?.nodeMetrics[0]?.metricHealth || 'unknown'); const thisMetricHealth = $derived($nodeMetricsData?.data?.nodeStatus?.items[0]?.healthState || 'unknown');
</script> </script>
<Row cols={{ xs: 2, lg: 3}}> <Row cols={{ xs: 2, lg: 3}}>