mirror of
https://github.com/ClusterCockpit/cc-backend
synced 2026-06-06 03:37:29 +02:00
Fix critical/severe issues in init, startup and shutdown
- auth: do not abort the server when authentication is disabled. auth.Init is now always called; with disable-authentication it sets up an ephemeral session store (SESSION_KEY not required) and registers no authenticators, so the unconditional auth.GetAuthInstance() callers (server init, api.New()) always get a valid instance.
- main: run the graceful-shutdown sequence on the startup-error path. runServer derives a cancelable context and, on a server-start failure, cancels it and waits so the metricstore final checkpoint / WAL rotation, archiver flush and taskmanager shutdown actually run before exit.
- server: log the :80 HTTP->HTTPS redirect listener error instead of dropping it.
- archiver: guard Shutdown against being called when Start never ran (avoids close(nil) panic / blocking on a nil workerDone).
- nats API: stop worker goroutines on shutdown via a stop channel + idempotent Shutdown(); workers and subscription callbacks select on stop and the channels are never closed, so no send-on-closed-channel can occur. Wired into Server.Shutdown after the NATS client is closed.
- metricstore: make Shutdown idempotent (nil shutdownFunc, early return) and release shutdownFuncMu before the checkpoint write.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
Entire-Checkpoint: 3c179f9caa8f
This commit is contained in:
@@ -78,6 +78,12 @@ type NatsAPI struct {
|
||||
jobCh chan natsMessage
|
||||
// nodeCh receives node state messages for processing by worker goroutines.
|
||||
nodeCh chan natsMessage
|
||||
// stop signals worker goroutines and subscription callbacks to stop.
|
||||
// Closing it (via Shutdown) makes workers exit and callbacks drop further
|
||||
// messages instead of blocking; the channels are never closed so in-flight
|
||||
// callbacks can never send on a closed channel.
|
||||
stop chan struct{}
|
||||
stopOnce sync.Once
|
||||
}
|
||||
|
||||
// NewNatsAPI creates a new NatsAPI instance with channel-based worker pools.
|
||||
@@ -99,6 +105,7 @@ func NewNatsAPI() *NatsAPI {
|
||||
JobRepository: repository.GetJobRepository(),
|
||||
jobCh: make(chan natsMessage, jobConc),
|
||||
nodeCh: make(chan natsMessage, nodeConc),
|
||||
stop: make(chan struct{}),
|
||||
}
|
||||
|
||||
// Start worker goroutines
|
||||
@@ -112,17 +119,36 @@ func NewNatsAPI() *NatsAPI {
|
||||
return api
|
||||
}
|
||||
|
||||
// Shutdown stops the worker goroutines and tells subscription callbacks to stop
|
||||
// enqueueing. It is safe to call multiple times. Callers must ensure the NATS
|
||||
// client is closed first so no new callbacks are invoked.
|
||||
func (api *NatsAPI) Shutdown() {
|
||||
api.stopOnce.Do(func() {
|
||||
close(api.stop)
|
||||
})
|
||||
}
|
||||
|
||||
// jobWorker processes job event messages from the job channel.
|
||||
func (api *NatsAPI) jobWorker() {
|
||||
for msg := range api.jobCh {
|
||||
api.handleJobEvent(msg.subject, msg.data)
|
||||
for {
|
||||
select {
|
||||
case <-api.stop:
|
||||
return
|
||||
case msg := <-api.jobCh:
|
||||
api.handleJobEvent(msg.subject, msg.data)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// nodeWorker processes node state messages from the node channel.
|
||||
func (api *NatsAPI) nodeWorker() {
|
||||
for msg := range api.nodeCh {
|
||||
api.handleNodeState(msg.subject, msg.data)
|
||||
for {
|
||||
select {
|
||||
case <-api.stop:
|
||||
return
|
||||
case msg := <-api.nodeCh:
|
||||
api.handleNodeState(msg.subject, msg.data)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -140,13 +166,19 @@ func (api *NatsAPI) StartSubscriptions() error {
|
||||
s := config.Keys.APISubjects
|
||||
|
||||
if err := client.Subscribe(s.SubjectJobEvent, func(subject string, data []byte) {
|
||||
api.jobCh <- natsMessage{subject: subject, data: data}
|
||||
select {
|
||||
case api.jobCh <- natsMessage{subject: subject, data: data}:
|
||||
case <-api.stop:
|
||||
}
|
||||
}); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
if err := client.Subscribe(s.SubjectNodeState, func(subject string, data []byte) {
|
||||
api.nodeCh <- natsMessage{subject: subject, data: data}
|
||||
select {
|
||||
case api.nodeCh <- natsMessage{subject: subject, data: data}:
|
||||
case <-api.stop:
|
||||
}
|
||||
}); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user