Merge branch 'main' into feature/526-average-resample

This commit is contained in:
2026-06-17 06:30:54 +02:00
69 changed files with 5681 additions and 5268 deletions

View File

@@ -11,7 +11,8 @@ import "flag"
var (
flagReinitDB, flagInit, flagServer, flagSyncLDAP, flagGops, flagMigrateDB, flagRevertDB,
flagForceDB, flagDev, flagVersion, flagLogDateTime, flagApplyTags, flagOptimizeDB bool
flagForceDB, flagDev, flagVersion, flagLogDateTime, flagApplyTags, flagOptimizeDB,
flagCleanupCheckpoints bool
flagNewUser, flagDelUser, flagGenJWT, flagConfigFile, flagImportJob, flagLogLevel string
)
@@ -28,6 +29,7 @@ func cliInit() {
flag.BoolVar(&flagApplyTags, "apply-tags", false, "Run taggers on all completed jobs and exit")
flag.BoolVar(&flagForceDB, "force-db", false, "Force database version, clear dirty flag and exit")
flag.BoolVar(&flagOptimizeDB, "optimize-db", false, "Optimize database: run VACUUM to reclaim space, then ANALYZE to update query planner statistics")
flag.BoolVar(&flagCleanupCheckpoints, "cleanup-checkpoints", false, "Clean up old checkpoint files (delete or archive) based on retention settings, then exit")
flag.BoolVar(&flagLogDateTime, "logdate", false, "Set this flag to add date and time to log messages")
flag.StringVar(&flagConfigFile, "config", "./config.json", "Specify alternative path to `config.json`")
flag.StringVar(&flagNewUser, "add-user", "", "Add a new user. Argument format: <username>:[admin,support,manager,api,user]:<password>")

View File

@@ -14,6 +14,7 @@ import (
"fmt"
"os"
"os/signal"
goruntime "runtime"
"runtime/debug"
"strings"
"sync"
@@ -171,14 +172,20 @@ func handleUserCommands() error {
return fmt.Errorf("--add-user and --del-user can only be used if authentication is enabled")
}
if !config.Keys.DisableAuthentication {
if cfg := ccconf.GetPackageConfig("auth"); cfg != nil {
auth.Init(&cfg)
} else {
cclog.Warn("Authentication disabled due to missing configuration")
auth.Init(nil)
// Always initialize the auth subsystem so the HTTP server and REST API have a
// valid (non-nil) auth instance, even when authentication is disabled. With
// authentication disabled, Init only sets up an ephemeral session store and
// registers no authenticators (see auth.Init).
if cfg := ccconf.GetPackageConfig("auth"); cfg != nil {
auth.Init(&cfg)
} else {
if !config.Keys.DisableAuthentication {
cclog.Warn("Authentication enabled but no auth configuration found")
}
auth.Init(nil)
}
if !config.Keys.DisableAuthentication {
// Check for default security keys
checkDefaultSecurityKeys()
@@ -336,6 +343,12 @@ func initSubsystems() error {
}
func runServer(ctx context.Context) error {
// Derive a cancelable context so the startup-error path below can trigger the
// same graceful-shutdown sequence as a signal (via the signal handler that
// waits on ctx.Done()).
ctx, cancel := context.WithCancel(ctx)
defer cancel()
var wg sync.WaitGroup
// Initialize metric store if configuration is provided
@@ -437,26 +450,32 @@ func runServer(ctx context.Context) error {
// Wait for either:
// 1. An error from server startup
// 2. Completion of all goroutines (normal shutdown or crash)
var runErr error
select {
case err := <-errChan:
case runErr = <-errChan:
// errChan will be closed when waitDone is closed, which happens
// when all goroutines complete (either from normal shutdown or error)
if err != nil {
return err
}
case <-time.After(100 * time.Millisecond):
// Give the server 100ms to start and report any immediate startup errors.
// After that, just wait for normal shutdown completion.
select {
case err := <-errChan:
if err != nil {
return err
}
case runErr = <-errChan:
case <-waitDone:
// Normal shutdown completed
}
}
if runErr != nil {
// A subsystem failed (e.g. the HTTP server could not bind). Trigger the
// graceful-shutdown path for the subsystems that were already started
// (metricstore checkpoint, archiver flush, taskmanager) by cancelling the
// context the signal handler waits on, then wait for it to finish so we
// don't exit before the final checkpoint is written.
cancel()
<-waitDone
return runErr
}
cclog.Print("Graceful shutdown completed!")
return nil
}
@@ -536,6 +555,43 @@ func run() error {
return err
}
// Handle checkpoint cleanup
if flagCleanupCheckpoints {
mscfg := ccconf.GetPackageConfig("metric-store")
if mscfg == nil {
return fmt.Errorf("metric-store configuration required for checkpoint cleanup")
}
if err := json.Unmarshal(mscfg, &metricstore.Keys); err != nil {
return fmt.Errorf("decoding metric-store config: %w", err)
}
if metricstore.Keys.NumWorkers <= 0 {
metricstore.Keys.NumWorkers = min(goruntime.NumCPU()/2+1, metricstore.DefaultMaxWorkers)
}
d, err := time.ParseDuration(metricstore.Keys.RetentionInMemory)
if err != nil {
return fmt.Errorf("parsing retention-in-memory: %w", err)
}
from := time.Now().Add(-d)
deleteMode := metricstore.Keys.Cleanup == nil || metricstore.Keys.Cleanup.Mode != "archive"
cleanupDir := ""
if !deleteMode {
cleanupDir = metricstore.Keys.Cleanup.RootDir
}
cclog.Infof("Cleaning up checkpoints older than %s...", from.Format(time.RFC3339))
n, err := metricstore.CleanupCheckpoints(
metricstore.Keys.Checkpoints.RootDir, cleanupDir, from.Unix(), deleteMode)
if err != nil {
return fmt.Errorf("checkpoint cleanup: %w", err)
}
if deleteMode {
cclog.Exitf("Cleanup done: %d checkpoint files deleted.", n)
} else {
cclog.Exitf("Cleanup done: %d checkpoint files archived to parquet.", n)
}
}
// Exit if start server is not requested
if !flagServer {
cclog.Exit("No errors, server flag not set. Exiting cc-backend.")

View File

@@ -18,10 +18,12 @@ import (
"net/http"
"os"
"strings"
"sync"
"time"
"github.com/99designs/gqlgen/graphql"
"github.com/99designs/gqlgen/graphql/handler"
"github.com/99designs/gqlgen/graphql/handler/extension"
"github.com/99designs/gqlgen/graphql/handler/transport"
"github.com/99designs/gqlgen/graphql/playground"
"github.com/ClusterCockpit/cc-backend/internal/api"
@@ -49,6 +51,12 @@ const (
envDebug = "DEBUG"
)
// maxQueryComplexity is the upper bound on the computed cost of one GraphQL
// query; it is the value handed to extension.FixedComplexityLimit when the
// GraphQL server is set up. Capping the cost guards against denial-of-service
// through deeply nested or heavily aliased queries. Each field costs 1 by
// default, so the limit still leaves generous room for legitimate dashboard
// queries and only rejects pathological ones.
const maxQueryComplexity = 5_000
// Server encapsulates the HTTP server state and dependencies
type Server struct {
router chi.Router
@@ -89,6 +97,7 @@ func (s *Server) init() error {
generated.NewExecutableSchema(generated.Config{Resolvers: resolver}))
graphQLServer.AddTransport(transport.POST{})
graphQLServer.Use(extension.FixedComplexityLimit(maxQueryComplexity))
// Inject a per-request stats cache so that grouped statistics queries
// sharing the same (filter, groupBy) pair are executed only once.
@@ -128,11 +137,49 @@ func (s *Server) init() error {
s.router.Use(middleware.Compress(5))
s.router.Use(middleware.Recoverer)
s.router.Use(cors.Handler(cors.Options{
AllowCredentials: true,
AllowCredentials: false,
AllowedHeaders: []string{"X-Requested-With", "Content-Type", "Authorization", "Origin"},
AllowedMethods: []string{"GET", "POST", "HEAD", "OPTIONS"},
AllowedOrigins: []string{"*"},
}))
s.router.Use(func(next http.Handler) http.Handler {
return http.HandlerFunc(func(rw http.ResponseWriter, r *http.Request) {
h := rw.Header()
h.Set("X-Content-Type-Options", "nosniff")
h.Set("X-Frame-Options", "DENY")
h.Set("Referrer-Policy", "same-origin")
// Conservative CSP: blocks clickjacking (frame-ancestors), plugin
// content (object-src) and <base> injection (base-uri) without
// restricting scripts/styles, so it cannot break the self-hosted
// SPA which relies on inline scripts. A full script-src policy needs
// per-template nonces and should be added separately.
h.Set("Content-Security-Policy", "frame-ancestors 'none'; object-src 'none'; base-uri 'self'")
if r.TLS != nil || r.Header.Get("X-Forwarded-Proto") == "https" {
h.Set("Strict-Transport-Security", "max-age=31536000; includeSubDomains")
}
next.ServeHTTP(rw, r)
})
})
// CSRF defense-in-depth on top of the SameSite=Lax session cookie: reject
// cross-site state-changing requests. Modern browsers set Sec-Fetch-Site on
// every request, so this stops a malicious site from driving cookie-
// authenticated POST/PUT/DELETE/PATCH calls. It fails open when the header is
// absent or not "cross-site", so non-browser API clients and the same-origin
// SPA are unaffected.
s.router.Use(func(next http.Handler) http.Handler {
return http.HandlerFunc(func(rw http.ResponseWriter, r *http.Request) {
switch r.Method {
case http.MethodGet, http.MethodHead, http.MethodOptions, http.MethodTrace:
default:
if r.Header.Get("Sec-Fetch-Site") == "cross-site" {
http.Error(rw, "cross-site request blocked", http.StatusForbidden)
return
}
}
next.ServeHTTP(rw, r)
})
})
s.restAPIHandle = api.New()
@@ -344,20 +391,20 @@ func (s *Server) init() error {
// Server timeout defaults (in seconds)
const (
defaultReadTimeout = 20
defaultWriteTimeout = 20
defaultReadHeaderTimeout = 20
defaultWriteTimeout = 20
)
func (s *Server) Start(ctx context.Context) error {
// Use configurable timeouts with defaults
readTimeout := time.Duration(defaultReadTimeout) * time.Second
readHeaderTimeout := time.Duration(defaultReadHeaderTimeout) * time.Second
writeTimeout := time.Duration(defaultWriteTimeout) * time.Second
s.server = &http.Server{
ReadTimeout: readTimeout,
WriteTimeout: writeTimeout,
Handler: s.router,
Addr: config.Keys.Addr,
ReadHeaderTimeout: readHeaderTimeout,
WriteTimeout: writeTimeout,
Handler: s.router,
Addr: config.Keys.Addr,
}
// Start http or https server
@@ -368,7 +415,9 @@ func (s *Server) Start(ctx context.Context) error {
if !strings.HasSuffix(config.Keys.Addr, ":80") && config.Keys.RedirectHTTPTo != "" {
go func() {
http.ListenAndServe(":80", http.RedirectHandler(config.Keys.RedirectHTTPTo, http.StatusMovedPermanently))
if err := http.ListenAndServe(":80", http.RedirectHandler(config.Keys.RedirectHTTPTo, http.StatusMovedPermanently)); err != nil {
cclog.Errorf("HTTP-to-HTTPS redirect listener on :80 failed: %v", err)
}
}()
}
@@ -399,16 +448,6 @@ func (s *Server) Start(ctx context.Context) error {
return fmt.Errorf("dropping privileges: %w", err)
}
// Handle context cancellation for graceful shutdown
go func() {
<-ctx.Done()
shutdownCtx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
defer cancel()
if err := s.server.Shutdown(shutdownCtx); err != nil {
cclog.Errorf("Server shutdown error: %v", err)
}
}()
if err = s.server.Serve(listener); err != nil && err != http.ErrServerClosed {
return fmt.Errorf("server failed: %w", err)
}
@@ -416,29 +455,58 @@ func (s *Server) Start(ctx context.Context) error {
}
func (s *Server) Shutdown(ctx context.Context) {
// Create a shutdown context with timeout
shutdownCtx, cancel := context.WithTimeout(ctx, 30*time.Second)
defer cancel()
shutdownStart := time.Now()
natsStart := time.Now()
nc := nats.GetClient()
if nc != nil {
nc.Close()
}
// Stop the NATS API worker goroutines after the client is closed (no more
// subscription callbacks can enqueue once the connection is down).
if s.natsAPIHandle != nil {
s.natsAPIHandle.Shutdown()
}
cclog.Infof("Shutdown: NATS closed (%v)", time.Since(natsStart))
// First shut down the server gracefully (waiting for all ongoing requests)
httpStart := time.Now()
shutdownCtx, cancel := context.WithTimeout(ctx, 60*time.Second)
defer cancel()
if err := s.server.Shutdown(shutdownCtx); err != nil {
cclog.Errorf("Server shutdown error: %v", err)
}
cclog.Infof("Shutdown: HTTP server stopped (%v)", time.Since(httpStart))
// Archive all the metric store data
ms := metricstore.GetMemoryStore()
// Run metricstore and archiver shutdown concurrently.
// They are independent: metricstore writes .bin snapshots,
// archiver flushes pending job archives.
storeStart := time.Now()
done := make(chan struct{})
go func() {
defer close(done)
var wg sync.WaitGroup
if ms != nil {
metricstore.Shutdown()
if ms := metricstore.GetMemoryStore(); ms != nil {
wg.Go(func() {
metricstore.Shutdown()
})
}
wg.Go(func() {
if err := archiver.Shutdown(60 * time.Second); err != nil {
cclog.Warnf("Archiver shutdown: %v", err)
}
})
wg.Wait()
}()
select {
case <-done:
cclog.Infof("Shutdown: metricstore + archiver completed (%v)", time.Since(storeStart))
case <-time.After(60 * time.Second):
cclog.Warnf("Shutdown deadline exceeded after %v, forcing exit", time.Since(shutdownStart))
}
// Shutdown archiver with 10 second timeout for fast shutdown
if err := archiver.Shutdown(10 * time.Second); err != nil {
cclog.Warnf("Archiver shutdown: %v", err)
}
cclog.Infof("Shutdown: total time %v", time.Since(shutdownStart))
}