mirror of
https://github.com/ClusterCockpit/cc-backend
synced 2026-06-08 04:37:29 +02:00
Fix critical/severe issues in init, startup and shutdown
- auth: do not abort the server when authentication is disabled. auth.Init is now always called; with disable-authentication it sets up an ephemeral session store (SESSION_KEY not required) and registers no authenticators, so the unconditional auth.GetAuthInstance() callers (server init, api.New()) always get a valid instance.
- main: run the graceful-shutdown sequence on the startup-error path. runServer derives a cancelable context and, on a server-start failure, cancels it and waits so the metricstore final checkpoint / WAL rotation, archiver flush and taskmanager shutdown actually run before exit.
- server: log the :80 HTTP->HTTPS redirect listener error instead of dropping it.
- archiver: guard Shutdown against being called when Start never ran (avoids close(nil) panic / blocking on a nil workerDone).
- nats API: stop worker goroutines on shutdown via a stop channel + idempotent Shutdown(); workers and subscription callbacks select on stop and the channels are never closed, so no send-on-closed-channel can occur. Wired into Server.Shutdown after the NATS client is closed.
- metricstore: make Shutdown idempotent (nil shutdownFunc, early return) and release shutdownFuncMu before the checkpoint write.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
Entire-Checkpoint: 3c179f9caa8f
This commit is contained in:
@@ -172,14 +172,20 @@ func handleUserCommands() error {
|
||||
return fmt.Errorf("--add-user and --del-user can only be used if authentication is enabled")
|
||||
}
|
||||
|
||||
if !config.Keys.DisableAuthentication {
|
||||
if cfg := ccconf.GetPackageConfig("auth"); cfg != nil {
|
||||
auth.Init(&cfg)
|
||||
} else {
|
||||
cclog.Warn("Authentication disabled due to missing configuration")
|
||||
auth.Init(nil)
|
||||
// Always initialize the auth subsystem so the HTTP server and REST API have a
|
||||
// valid (non-nil) auth instance, even when authentication is disabled. With
|
||||
// authentication disabled, Init only sets up an ephemeral session store and
|
||||
// registers no authenticators (see auth.Init).
|
||||
if cfg := ccconf.GetPackageConfig("auth"); cfg != nil {
|
||||
auth.Init(&cfg)
|
||||
} else {
|
||||
if !config.Keys.DisableAuthentication {
|
||||
cclog.Warn("Authentication enabled but no auth configuration found")
|
||||
}
|
||||
auth.Init(nil)
|
||||
}
|
||||
|
||||
if !config.Keys.DisableAuthentication {
|
||||
// Check for default security keys
|
||||
checkDefaultSecurityKeys()
|
||||
|
||||
@@ -337,6 +343,12 @@ func initSubsystems() error {
|
||||
}
|
||||
|
||||
func runServer(ctx context.Context) error {
|
||||
// Derive a cancelable context so the startup-error path below can trigger the
|
||||
// same graceful-shutdown sequence as a signal (via the signal handler that
|
||||
// waits on ctx.Done()).
|
||||
ctx, cancel := context.WithCancel(ctx)
|
||||
defer cancel()
|
||||
|
||||
var wg sync.WaitGroup
|
||||
|
||||
// Initialize metric store if configuration is provided
|
||||
@@ -438,26 +450,32 @@ func runServer(ctx context.Context) error {
|
||||
// Wait for either:
|
||||
// 1. An error from server startup
|
||||
// 2. Completion of all goroutines (normal shutdown or crash)
|
||||
var runErr error
|
||||
select {
|
||||
case err := <-errChan:
|
||||
case runErr = <-errChan:
|
||||
// errChan will be closed when waitDone is closed, which happens
|
||||
// when all goroutines complete (either from normal shutdown or error)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
case <-time.After(100 * time.Millisecond):
|
||||
// Give the server 100ms to start and report any immediate startup errors
|
||||
// After that, just wait for normal shutdown completion
|
||||
select {
|
||||
case err := <-errChan:
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
case runErr = <-errChan:
|
||||
case <-waitDone:
|
||||
// Normal shutdown completed
|
||||
}
|
||||
}
|
||||
|
||||
if runErr != nil {
|
||||
// A subsystem failed (e.g. the HTTP server could not bind). Trigger the
|
||||
// graceful-shutdown path for the subsystems that were already started
|
||||
// (metricstore checkpoint, archiver flush, taskmanager) by cancelling the
|
||||
// context the signal handler waits on, then wait for it to finish so we
|
||||
// don't exit before the final checkpoint is written.
|
||||
cancel()
|
||||
<-waitDone
|
||||
return runErr
|
||||
}
|
||||
|
||||
cclog.Print("Graceful shutdown completed!")
|
||||
return nil
|
||||
}
|
||||
|
||||
@@ -415,7 +415,9 @@ func (s *Server) Start(ctx context.Context) error {
|
||||
|
||||
if !strings.HasSuffix(config.Keys.Addr, ":80") && config.Keys.RedirectHTTPTo != "" {
|
||||
go func() {
|
||||
http.ListenAndServe(":80", http.RedirectHandler(config.Keys.RedirectHTTPTo, http.StatusMovedPermanently))
|
||||
if err := http.ListenAndServe(":80", http.RedirectHandler(config.Keys.RedirectHTTPTo, http.StatusMovedPermanently)); err != nil {
|
||||
cclog.Errorf("HTTP-to-HTTPS redirect listener on :80 failed: %v", err)
|
||||
}
|
||||
}()
|
||||
}
|
||||
|
||||
@@ -460,6 +462,11 @@ func (s *Server) Shutdown(ctx context.Context) {
|
||||
if nc != nil {
|
||||
nc.Close()
|
||||
}
|
||||
// Stop the NATS API worker goroutines after the client is closed (no more
|
||||
// subscription callbacks can enqueue once the connection is down).
|
||||
if s.natsAPIHandle != nil {
|
||||
s.natsAPIHandle.Shutdown()
|
||||
}
|
||||
cclog.Infof("Shutdown: NATS closed (%v)", time.Since(natsStart))
|
||||
|
||||
httpStart := time.Now()
|
||||
|
||||
Reference in New Issue
Block a user