mirror of
https://github.com/ClusterCockpit/cc-backend
synced 2026-06-19 09:47:29 +02:00
Merge branch 'main' into feature/526-average-resample
This commit is contained in:
@@ -11,7 +11,8 @@ import "flag"
|
||||
|
||||
var (
|
||||
flagReinitDB, flagInit, flagServer, flagSyncLDAP, flagGops, flagMigrateDB, flagRevertDB,
|
||||
flagForceDB, flagDev, flagVersion, flagLogDateTime, flagApplyTags, flagOptimizeDB bool
|
||||
flagForceDB, flagDev, flagVersion, flagLogDateTime, flagApplyTags, flagOptimizeDB,
|
||||
flagCleanupCheckpoints bool
|
||||
flagNewUser, flagDelUser, flagGenJWT, flagConfigFile, flagImportJob, flagLogLevel string
|
||||
)
|
||||
|
||||
@@ -28,6 +29,7 @@ func cliInit() {
|
||||
flag.BoolVar(&flagApplyTags, "apply-tags", false, "Run taggers on all completed jobs and exit")
|
||||
flag.BoolVar(&flagForceDB, "force-db", false, "Force database version, clear dirty flag and exit")
|
||||
flag.BoolVar(&flagOptimizeDB, "optimize-db", false, "Optimize database: run VACUUM to reclaim space, then ANALYZE to update query planner statistics")
|
||||
flag.BoolVar(&flagCleanupCheckpoints, "cleanup-checkpoints", false, "Clean up old checkpoint files (delete or archive) based on retention settings, then exit")
|
||||
flag.BoolVar(&flagLogDateTime, "logdate", false, "Set this flag to add date and time to log messages")
|
||||
flag.StringVar(&flagConfigFile, "config", "./config.json", "Specify alternative path to `config.json`")
|
||||
flag.StringVar(&flagNewUser, "add-user", "", "Add a new user. Argument format: <username>:[admin,support,manager,api,user]:<password>")
|
||||
|
||||
@@ -14,6 +14,7 @@ import (
|
||||
"fmt"
|
||||
"os"
|
||||
"os/signal"
|
||||
goruntime "runtime"
|
||||
"runtime/debug"
|
||||
"strings"
|
||||
"sync"
|
||||
@@ -171,14 +172,20 @@ func handleUserCommands() error {
|
||||
return fmt.Errorf("--add-user and --del-user can only be used if authentication is enabled")
|
||||
}
|
||||
|
||||
if !config.Keys.DisableAuthentication {
|
||||
if cfg := ccconf.GetPackageConfig("auth"); cfg != nil {
|
||||
auth.Init(&cfg)
|
||||
} else {
|
||||
cclog.Warn("Authentication disabled due to missing configuration")
|
||||
auth.Init(nil)
|
||||
// Always initialize the auth subsystem so the HTTP server and REST API have a
|
||||
// valid (non-nil) auth instance, even when authentication is disabled. With
|
||||
// authentication disabled, Init only sets up an ephemeral session store and
|
||||
// registers no authenticators (see auth.Init).
|
||||
if cfg := ccconf.GetPackageConfig("auth"); cfg != nil {
|
||||
auth.Init(&cfg)
|
||||
} else {
|
||||
if !config.Keys.DisableAuthentication {
|
||||
cclog.Warn("Authentication enabled but no auth configuration found")
|
||||
}
|
||||
auth.Init(nil)
|
||||
}
|
||||
|
||||
if !config.Keys.DisableAuthentication {
|
||||
// Check for default security keys
|
||||
checkDefaultSecurityKeys()
|
||||
|
||||
@@ -336,6 +343,12 @@ func initSubsystems() error {
|
||||
}
|
||||
|
||||
func runServer(ctx context.Context) error {
|
||||
// Derive a cancelable context so the startup-error path below can trigger the
|
||||
// same graceful-shutdown sequence as a signal (via the signal handler that
|
||||
// waits on ctx.Done()).
|
||||
ctx, cancel := context.WithCancel(ctx)
|
||||
defer cancel()
|
||||
|
||||
var wg sync.WaitGroup
|
||||
|
||||
// Initialize metric store if configuration is provided
|
||||
@@ -437,26 +450,32 @@ func runServer(ctx context.Context) error {
|
||||
// Wait for either:
|
||||
// 1. An error from server startup
|
||||
// 2. Completion of all goroutines (normal shutdown or crash)
|
||||
var runErr error
|
||||
select {
|
||||
case err := <-errChan:
|
||||
case runErr = <-errChan:
|
||||
// errChan will be closed when waitDone is closed, which happens
|
||||
// when all goroutines complete (either from normal shutdown or error)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
case <-time.After(100 * time.Millisecond):
|
||||
// Give the server 100ms to start and report any immediate startup errors
|
||||
// After that, just wait for normal shutdown completion
|
||||
select {
|
||||
case err := <-errChan:
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
case runErr = <-errChan:
|
||||
case <-waitDone:
|
||||
// Normal shutdown completed
|
||||
}
|
||||
}
|
||||
|
||||
if runErr != nil {
|
||||
// A subsystem failed (e.g. the HTTP server could not bind). Trigger the
|
||||
// graceful-shutdown path for the subsystems that were already started
|
||||
// (metricstore checkpoint, archiver flush, taskmanager) by cancelling the
|
||||
// context the signal handler waits on, then wait for it to finish so we
|
||||
// don't exit before the final checkpoint is written.
|
||||
cancel()
|
||||
<-waitDone
|
||||
return runErr
|
||||
}
|
||||
|
||||
cclog.Print("Graceful shutdown completed!")
|
||||
return nil
|
||||
}
|
||||
@@ -536,6 +555,43 @@ func run() error {
|
||||
return err
|
||||
}
|
||||
|
||||
// Handle checkpoint cleanup
|
||||
if flagCleanupCheckpoints {
|
||||
mscfg := ccconf.GetPackageConfig("metric-store")
|
||||
if mscfg == nil {
|
||||
return fmt.Errorf("metric-store configuration required for checkpoint cleanup")
|
||||
}
|
||||
if err := json.Unmarshal(mscfg, &metricstore.Keys); err != nil {
|
||||
return fmt.Errorf("decoding metric-store config: %w", err)
|
||||
}
|
||||
if metricstore.Keys.NumWorkers <= 0 {
|
||||
metricstore.Keys.NumWorkers = min(goruntime.NumCPU()/2+1, metricstore.DefaultMaxWorkers)
|
||||
}
|
||||
|
||||
d, err := time.ParseDuration(metricstore.Keys.RetentionInMemory)
|
||||
if err != nil {
|
||||
return fmt.Errorf("parsing retention-in-memory: %w", err)
|
||||
}
|
||||
from := time.Now().Add(-d)
|
||||
deleteMode := metricstore.Keys.Cleanup == nil || metricstore.Keys.Cleanup.Mode != "archive"
|
||||
cleanupDir := ""
|
||||
if !deleteMode {
|
||||
cleanupDir = metricstore.Keys.Cleanup.RootDir
|
||||
}
|
||||
|
||||
cclog.Infof("Cleaning up checkpoints older than %s...", from.Format(time.RFC3339))
|
||||
n, err := metricstore.CleanupCheckpoints(
|
||||
metricstore.Keys.Checkpoints.RootDir, cleanupDir, from.Unix(), deleteMode)
|
||||
if err != nil {
|
||||
return fmt.Errorf("checkpoint cleanup: %w", err)
|
||||
}
|
||||
if deleteMode {
|
||||
cclog.Exitf("Cleanup done: %d checkpoint files deleted.", n)
|
||||
} else {
|
||||
cclog.Exitf("Cleanup done: %d checkpoint files archived to parquet.", n)
|
||||
}
|
||||
}
|
||||
|
||||
// Exit if start server is not requested
|
||||
if !flagServer {
|
||||
cclog.Exit("No errors, server flag not set. Exiting cc-backend.")
|
||||
|
||||
@@ -18,10 +18,12 @@ import (
|
||||
"net/http"
|
||||
"os"
|
||||
"strings"
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
"github.com/99designs/gqlgen/graphql"
|
||||
"github.com/99designs/gqlgen/graphql/handler"
|
||||
"github.com/99designs/gqlgen/graphql/handler/extension"
|
||||
"github.com/99designs/gqlgen/graphql/handler/transport"
|
||||
"github.com/99designs/gqlgen/graphql/playground"
|
||||
"github.com/ClusterCockpit/cc-backend/internal/api"
|
||||
@@ -49,6 +51,12 @@ const (
|
||||
envDebug = "DEBUG"
|
||||
)
|
||||
|
||||
// maxQueryComplexity bounds the cost of a single GraphQL query to mitigate
|
||||
// denial-of-service via deeply nested or heavily aliased queries. The default
|
||||
// per-field cost is 1, so this leaves ample headroom for legitimate dashboard
|
||||
// queries while rejecting pathological ones.
|
||||
const maxQueryComplexity = 5000
|
||||
|
||||
// Server encapsulates the HTTP server state and dependencies
|
||||
type Server struct {
|
||||
router chi.Router
|
||||
@@ -89,6 +97,7 @@ func (s *Server) init() error {
|
||||
generated.NewExecutableSchema(generated.Config{Resolvers: resolver}))
|
||||
|
||||
graphQLServer.AddTransport(transport.POST{})
|
||||
graphQLServer.Use(extension.FixedComplexityLimit(maxQueryComplexity))
|
||||
|
||||
// Inject a per-request stats cache so that grouped statistics queries
|
||||
// sharing the same (filter, groupBy) pair are executed only once.
|
||||
@@ -128,11 +137,49 @@ func (s *Server) init() error {
|
||||
s.router.Use(middleware.Compress(5))
|
||||
s.router.Use(middleware.Recoverer)
|
||||
s.router.Use(cors.Handler(cors.Options{
|
||||
AllowCredentials: true,
|
||||
AllowCredentials: false,
|
||||
AllowedHeaders: []string{"X-Requested-With", "Content-Type", "Authorization", "Origin"},
|
||||
AllowedMethods: []string{"GET", "POST", "HEAD", "OPTIONS"},
|
||||
AllowedOrigins: []string{"*"},
|
||||
}))
|
||||
s.router.Use(func(next http.Handler) http.Handler {
|
||||
return http.HandlerFunc(func(rw http.ResponseWriter, r *http.Request) {
|
||||
h := rw.Header()
|
||||
h.Set("X-Content-Type-Options", "nosniff")
|
||||
h.Set("X-Frame-Options", "DENY")
|
||||
h.Set("Referrer-Policy", "same-origin")
|
||||
// Conservative CSP: blocks clickjacking (frame-ancestors), plugin
|
||||
// content (object-src) and <base> injection (base-uri) without
|
||||
// restricting scripts/styles, so it cannot break the self-hosted
|
||||
// SPA which relies on inline scripts. A full script-src policy needs
|
||||
// per-template nonces and should be added separately.
|
||||
h.Set("Content-Security-Policy", "frame-ancestors 'none'; object-src 'none'; base-uri 'self'")
|
||||
if r.TLS != nil || r.Header.Get("X-Forwarded-Proto") == "https" {
|
||||
h.Set("Strict-Transport-Security", "max-age=31536000; includeSubDomains")
|
||||
}
|
||||
next.ServeHTTP(rw, r)
|
||||
})
|
||||
})
|
||||
|
||||
// CSRF defense-in-depth on top of the SameSite=Lax session cookie: reject
|
||||
// cross-site state-changing requests. Modern browsers set Sec-Fetch-Site on
|
||||
// every request, so this stops a malicious site from driving cookie-
|
||||
// authenticated POST/PUT/DELETE/PATCH calls. It fails open when the header is
|
||||
// absent or not "cross-site", so non-browser API clients and the same-origin
|
||||
// SPA are unaffected.
|
||||
s.router.Use(func(next http.Handler) http.Handler {
|
||||
return http.HandlerFunc(func(rw http.ResponseWriter, r *http.Request) {
|
||||
switch r.Method {
|
||||
case http.MethodGet, http.MethodHead, http.MethodOptions, http.MethodTrace:
|
||||
default:
|
||||
if r.Header.Get("Sec-Fetch-Site") == "cross-site" {
|
||||
http.Error(rw, "cross-site request blocked", http.StatusForbidden)
|
||||
return
|
||||
}
|
||||
}
|
||||
next.ServeHTTP(rw, r)
|
||||
})
|
||||
})
|
||||
|
||||
s.restAPIHandle = api.New()
|
||||
|
||||
@@ -344,20 +391,20 @@ func (s *Server) init() error {
|
||||
|
||||
// Server timeout defaults (in seconds)
|
||||
const (
|
||||
defaultReadTimeout = 20
|
||||
defaultWriteTimeout = 20
|
||||
defaultReadHeaderTimeout = 20
|
||||
defaultWriteTimeout = 20
|
||||
)
|
||||
|
||||
func (s *Server) Start(ctx context.Context) error {
|
||||
// Use configurable timeouts with defaults
|
||||
readTimeout := time.Duration(defaultReadTimeout) * time.Second
|
||||
readHeaderTimeout := time.Duration(defaultReadHeaderTimeout) * time.Second
|
||||
writeTimeout := time.Duration(defaultWriteTimeout) * time.Second
|
||||
|
||||
s.server = &http.Server{
|
||||
ReadTimeout: readTimeout,
|
||||
WriteTimeout: writeTimeout,
|
||||
Handler: s.router,
|
||||
Addr: config.Keys.Addr,
|
||||
ReadHeaderTimeout: readHeaderTimeout,
|
||||
WriteTimeout: writeTimeout,
|
||||
Handler: s.router,
|
||||
Addr: config.Keys.Addr,
|
||||
}
|
||||
|
||||
// Start http or https server
|
||||
@@ -368,7 +415,9 @@ func (s *Server) Start(ctx context.Context) error {
|
||||
|
||||
if !strings.HasSuffix(config.Keys.Addr, ":80") && config.Keys.RedirectHTTPTo != "" {
|
||||
go func() {
|
||||
http.ListenAndServe(":80", http.RedirectHandler(config.Keys.RedirectHTTPTo, http.StatusMovedPermanently))
|
||||
if err := http.ListenAndServe(":80", http.RedirectHandler(config.Keys.RedirectHTTPTo, http.StatusMovedPermanently)); err != nil {
|
||||
cclog.Errorf("HTTP-to-HTTPS redirect listener on :80 failed: %v", err)
|
||||
}
|
||||
}()
|
||||
}
|
||||
|
||||
@@ -399,16 +448,6 @@ func (s *Server) Start(ctx context.Context) error {
|
||||
return fmt.Errorf("dropping privileges: %w", err)
|
||||
}
|
||||
|
||||
// Handle context cancellation for graceful shutdown
|
||||
go func() {
|
||||
<-ctx.Done()
|
||||
shutdownCtx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
|
||||
defer cancel()
|
||||
if err := s.server.Shutdown(shutdownCtx); err != nil {
|
||||
cclog.Errorf("Server shutdown error: %v", err)
|
||||
}
|
||||
}()
|
||||
|
||||
if err = s.server.Serve(listener); err != nil && err != http.ErrServerClosed {
|
||||
return fmt.Errorf("server failed: %w", err)
|
||||
}
|
||||
@@ -416,29 +455,58 @@ func (s *Server) Start(ctx context.Context) error {
|
||||
}
|
||||
|
||||
func (s *Server) Shutdown(ctx context.Context) {
|
||||
// Create a shutdown context with timeout
|
||||
shutdownCtx, cancel := context.WithTimeout(ctx, 30*time.Second)
|
||||
defer cancel()
|
||||
shutdownStart := time.Now()
|
||||
|
||||
natsStart := time.Now()
|
||||
nc := nats.GetClient()
|
||||
if nc != nil {
|
||||
nc.Close()
|
||||
}
|
||||
// Stop the NATS API worker goroutines after the client is closed (no more
|
||||
// subscription callbacks can enqueue once the connection is down).
|
||||
if s.natsAPIHandle != nil {
|
||||
s.natsAPIHandle.Shutdown()
|
||||
}
|
||||
cclog.Infof("Shutdown: NATS closed (%v)", time.Since(natsStart))
|
||||
|
||||
// First shut down the server gracefully (waiting for all ongoing requests)
|
||||
httpStart := time.Now()
|
||||
shutdownCtx, cancel := context.WithTimeout(ctx, 60*time.Second)
|
||||
defer cancel()
|
||||
if err := s.server.Shutdown(shutdownCtx); err != nil {
|
||||
cclog.Errorf("Server shutdown error: %v", err)
|
||||
}
|
||||
cclog.Infof("Shutdown: HTTP server stopped (%v)", time.Since(httpStart))
|
||||
|
||||
// Archive all the metric store data
|
||||
ms := metricstore.GetMemoryStore()
|
||||
// Run metricstore and archiver shutdown concurrently.
|
||||
// They are independent: metricstore writes .bin snapshots,
|
||||
// archiver flushes pending job archives.
|
||||
storeStart := time.Now()
|
||||
done := make(chan struct{})
|
||||
go func() {
|
||||
defer close(done)
|
||||
var wg sync.WaitGroup
|
||||
|
||||
if ms != nil {
|
||||
metricstore.Shutdown()
|
||||
if ms := metricstore.GetMemoryStore(); ms != nil {
|
||||
wg.Go(func() {
|
||||
metricstore.Shutdown()
|
||||
})
|
||||
}
|
||||
|
||||
wg.Go(func() {
|
||||
if err := archiver.Shutdown(60 * time.Second); err != nil {
|
||||
cclog.Warnf("Archiver shutdown: %v", err)
|
||||
}
|
||||
})
|
||||
|
||||
wg.Wait()
|
||||
}()
|
||||
|
||||
select {
|
||||
case <-done:
|
||||
cclog.Infof("Shutdown: metricstore + archiver completed (%v)", time.Since(storeStart))
|
||||
case <-time.After(60 * time.Second):
|
||||
cclog.Warnf("Shutdown deadline exceeded after %v, forcing exit", time.Since(shutdownStart))
|
||||
}
|
||||
|
||||
// Shutdown archiver with 10 second timeout for fast shutdown
|
||||
if err := archiver.Shutdown(10 * time.Second); err != nil {
|
||||
cclog.Warnf("Archiver shutdown: %v", err)
|
||||
}
|
||||
cclog.Infof("Shutdown: total time %v", time.Since(shutdownStart))
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user