Fix critical/severe issues in init, startup and shutdown

- auth: do not abort the server when authentication is disabled. auth.Init
  is now always called; with disable-authentication it sets up an ephemeral
  session store (SESSION_KEY not required) and registers no authenticators,
  so the unconditional auth.GetAuthInstance() callers (server init,
  api.New()) always get a valid instance.
- main: run the graceful-shutdown sequence on the startup-error path. runServer
  derives a cancelable context and, on a server-start failure, cancels it and
  waits so the metricstore final checkpoint / WAL rotation, archiver flush and
  taskmanager shutdown actually run before exit.
- server: log the :80 HTTP->HTTPS redirect listener error instead of dropping it.
- archiver: guard Shutdown against being called when Start never ran
  (avoids close(nil) panic / blocking on a nil workerDone).
- nats API: stop worker goroutines on shutdown via a stop channel + idempotent
  Shutdown(); workers and subscription callbacks select on stop, and the
  message channels (jobCh, nodeCh) are never closed, so no
  send-on-closed-channel can occur. Wired into Server.Shutdown after the NATS
  client is closed.
- metricstore: make Shutdown idempotent (nil shutdownFunc, early return) and
  release shutdownFuncMu before the checkpoint write.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
Entire-Checkpoint: 3c179f9caa8f
This commit is contained in:
2026-06-05 10:16:28 +02:00
parent 56ae1de011
commit 1b72b0b5ad
6 changed files with 119 additions and 32 deletions

View File

@@ -78,6 +78,12 @@ type NatsAPI struct {
jobCh chan natsMessage
// nodeCh receives node state messages for processing by worker goroutines.
nodeCh chan natsMessage
// stop signals worker goroutines and subscription callbacks to stop.
// Closing it (via Shutdown) makes workers exit and callbacks drop further
// messages instead of blocking; jobCh and nodeCh themselves are never closed,
// so in-flight callbacks can never send on a closed channel.
stop chan struct{}
stopOnce sync.Once
}
// NewNatsAPI creates a new NatsAPI instance with channel-based worker pools.
@@ -99,6 +105,7 @@ func NewNatsAPI() *NatsAPI {
JobRepository: repository.GetJobRepository(),
jobCh: make(chan natsMessage, jobConc),
nodeCh: make(chan natsMessage, nodeConc),
stop: make(chan struct{}),
}
// Start worker goroutines
@@ -112,17 +119,36 @@ func NewNatsAPI() *NatsAPI {
return api
}
// Shutdown signals the worker goroutines to exit and tells subscription
// callbacks to stop enqueueing by closing the stop channel. It only signals;
// it does not wait for the workers to return.
//
// It is safe to call multiple times. It is also a no-op, rather than a panic,
// when called on a nil receiver or on a NatsAPI whose stop channel was never
// created (for example a zero value not built by NewNatsAPI). Callers must
// ensure the NATS client is closed first so no new callbacks are invoked.
func (api *NatsAPI) Shutdown() {
	if api == nil {
		return
	}
	api.stopOnce.Do(func() {
		// close(nil) panics, so only close a channel that actually exists.
		if api.stop != nil {
			close(api.stop)
		}
	})
}
// jobWorker processes job event messages from the job channel.
func (api *NatsAPI) jobWorker() {
for msg := range api.jobCh {
api.handleJobEvent(msg.subject, msg.data)
for {
select {
case <-api.stop:
return
case msg := <-api.jobCh:
api.handleJobEvent(msg.subject, msg.data)
}
}
}
// nodeWorker processes node state messages from the node channel.
func (api *NatsAPI) nodeWorker() {
for msg := range api.nodeCh {
api.handleNodeState(msg.subject, msg.data)
for {
select {
case <-api.stop:
return
case msg := <-api.nodeCh:
api.handleNodeState(msg.subject, msg.data)
}
}
}
@@ -140,13 +166,19 @@ func (api *NatsAPI) StartSubscriptions() error {
s := config.Keys.APISubjects
if err := client.Subscribe(s.SubjectJobEvent, func(subject string, data []byte) {
api.jobCh <- natsMessage{subject: subject, data: data}
select {
case api.jobCh <- natsMessage{subject: subject, data: data}:
case <-api.stop:
}
}); err != nil {
return err
}
if err := client.Subscribe(s.SubjectNodeState, func(subject string, data []byte) {
api.nodeCh <- natsMessage{subject: subject, data: data}
select {
case api.nodeCh <- natsMessage{subject: subject, data: data}:
case <-api.stop:
}
}); err != nil {
return err
}