mirror of
https://github.com/ClusterCockpit/cc-backend
synced 2025-12-13 02:46:16 +01:00
251 lines
8.4 KiB
Go
251 lines
8.4 KiB
Go
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
|
// All rights reserved. This file is part of cc-backend.
|
|
// Use of this source code is governed by a MIT-style
|
|
// license that can be found in the LICENSE file.
|
|
|
|
// Package archiver provides asynchronous job archiving functionality for ClusterCockpit.
|
|
//
|
|
// The archiver runs a background worker goroutine that processes job archiving requests
|
|
// from a buffered channel. When jobs complete, their metric data is archived from the
|
|
// metric store to the configured archive backend (filesystem, S3, etc.).
|
|
//
|
|
// # Architecture
|
|
//
|
|
// The archiver uses a producer-consumer pattern:
|
|
// - Producer: TriggerArchiving() sends jobs to archiveChannel
|
|
// - Consumer: archivingWorker() processes jobs from the channel
|
|
// - Coordination: sync.WaitGroup tracks pending archive operations
|
|
//
|
|
// # Lifecycle
|
|
//
|
|
// 1. Start(repo, ctx) - Initialize worker with context for cancellation
|
|
// 2. TriggerArchiving(job) - Queue job for archiving (called when job stops)
|
|
// 3. archivingWorker() - Background goroutine processes jobs
|
|
// 4. Shutdown(timeout) - Graceful shutdown with timeout
|
|
//
|
|
// # Graceful Shutdown
|
|
//
|
|
// The archiver supports graceful shutdown with configurable timeout:
|
|
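//   - Closes the job channel so no further jobs can be queued (TriggerArchiving must not be called afterwards: sending on a closed channel panics)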
|
|
// - Waits for pending jobs to complete (up to timeout)
|
|
// - Cancels context if timeout exceeded
|
|
// - Ensures worker goroutine exits cleanly
|
|
//
|
|
// # Example Usage
|
|
//
|
|
// // Initialize archiver
|
|
// ctx, cancel := context.WithCancel(context.Background())
|
|
// defer cancel()
|
|
// archiver.Start(jobRepository, ctx)
|
|
//
|
|
// // Trigger archiving when job completes
|
|
// archiver.TriggerArchiving(job)
|
|
//
|
|
// // Graceful shutdown with 10 second timeout
|
|
// if err := archiver.Shutdown(10 * time.Second); err != nil {
|
|
// log.Printf("Archiver shutdown timeout: %v", err)
|
|
// }
|
|
package archiver
|
|
|
|
import (
|
|
"context"
|
|
"fmt"
|
|
"sync"
|
|
"time"
|
|
|
|
"github.com/ClusterCockpit/cc-backend/internal/repository"
|
|
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
|
|
"github.com/ClusterCockpit/cc-lib/schema"
|
|
sq "github.com/Masterminds/squirrel"
|
|
)
|
|
|
|
var (
|
|
archivePending sync.WaitGroup
|
|
archiveChannel chan *schema.Job
|
|
jobRepo *repository.JobRepository
|
|
shutdownCtx context.Context
|
|
shutdownCancel context.CancelFunc
|
|
workerDone chan struct{}
|
|
)
|
|
|
|
// Start initializes the archiver and starts the background worker goroutine.
|
|
//
|
|
// The archiver processes job archiving requests asynchronously via a buffered channel.
|
|
// Jobs are sent to the channel using TriggerArchiving() and processed by the worker.
|
|
//
|
|
// Parameters:
|
|
// - r: JobRepository instance for database operations
|
|
// - ctx: Context for cancellation (shutdown signal propagation)
|
|
//
|
|
// The worker goroutine will run until:
|
|
// - ctx is cancelled (via parent shutdown)
|
|
// - archiveChannel is closed (via Shutdown())
|
|
//
|
|
// Must be called before TriggerArchiving(). Safe to call only once.
|
|
func Start(r *repository.JobRepository, ctx context.Context) {
|
|
shutdownCtx, shutdownCancel = context.WithCancel(ctx)
|
|
archiveChannel = make(chan *schema.Job, 128)
|
|
workerDone = make(chan struct{})
|
|
jobRepo = r
|
|
|
|
go archivingWorker()
|
|
}
|
|
|
|
// archivingWorker is the background goroutine that processes job archiving requests.
|
|
//
|
|
// The worker loop:
|
|
// 1. Blocks waiting for jobs on archiveChannel or shutdown signal
|
|
// 2. Fetches job metadata from repository
|
|
// 3. Archives job data to configured backend (calls ArchiveJob)
|
|
// 4. Updates job footprint and energy metrics in database
|
|
// 5. Marks job as successfully archived
|
|
// 6. Calls job stop hooks
|
|
//
|
|
// The worker exits when:
|
|
// - shutdownCtx is cancelled (timeout during shutdown)
|
|
// - archiveChannel is closed (normal shutdown)
|
|
//
|
|
// Errors during archiving are logged and the job is marked as failed,
|
|
// but the worker continues processing other jobs.
|
|
func archivingWorker() {
|
|
defer close(workerDone)
|
|
|
|
for {
|
|
select {
|
|
case <-shutdownCtx.Done():
|
|
cclog.Info("Archive worker received shutdown signal")
|
|
return
|
|
|
|
case job, ok := <-archiveChannel:
|
|
if !ok {
|
|
cclog.Info("Archive channel closed, worker exiting")
|
|
return
|
|
}
|
|
|
|
start := time.Now()
|
|
// not using meta data, called to load JobMeta into Cache?
|
|
// will fail if job meta not in repository
|
|
if _, err := jobRepo.FetchMetadata(job); err != nil {
|
|
cclog.Errorf("archiving job (dbid: %d) failed at check metadata step: %s", job.ID, err.Error())
|
|
jobRepo.UpdateMonitoringStatus(*job.ID, schema.MonitoringStatusArchivingFailed)
|
|
archivePending.Done()
|
|
continue
|
|
}
|
|
|
|
// ArchiveJob will fetch all the data from a MetricDataRepository and push into configured archive backend
|
|
// Use shutdown context to allow cancellation
|
|
jobMeta, err := ArchiveJob(job, shutdownCtx)
|
|
if err != nil {
|
|
cclog.Errorf("archiving job (dbid: %d) failed at archiving job step: %s", job.ID, err.Error())
|
|
jobRepo.UpdateMonitoringStatus(*job.ID, schema.MonitoringStatusArchivingFailed)
|
|
archivePending.Done()
|
|
continue
|
|
}
|
|
|
|
stmt := sq.Update("job").Where("job.id = ?", job.ID)
|
|
|
|
if stmt, err = jobRepo.UpdateFootprint(stmt, jobMeta); err != nil {
|
|
cclog.Errorf("archiving job (dbid: %d) failed at update Footprint step: %s", job.ID, err.Error())
|
|
archivePending.Done()
|
|
continue
|
|
}
|
|
if stmt, err = jobRepo.UpdateEnergy(stmt, jobMeta); err != nil {
|
|
cclog.Errorf("archiving job (dbid: %d) failed at update Energy step: %s", job.ID, err.Error())
|
|
archivePending.Done()
|
|
continue
|
|
}
|
|
// Update the jobs database entry one last time:
|
|
stmt = jobRepo.MarkArchived(stmt, schema.MonitoringStatusArchivingSuccessful)
|
|
if err := jobRepo.Execute(stmt); err != nil {
|
|
cclog.Errorf("archiving job (dbid: %d) failed at db execute: %s", job.ID, err.Error())
|
|
archivePending.Done()
|
|
continue
|
|
}
|
|
cclog.Debugf("archiving job %d took %s", job.JobID, time.Since(start))
|
|
cclog.Infof("archiving job (dbid: %d) successful", job.ID)
|
|
|
|
repository.CallJobStopHooks(job)
|
|
archivePending.Done()
|
|
}
|
|
}
|
|
}
|
|
|
|
// TriggerArchiving queues a job for asynchronous archiving.
|
|
//
|
|
// This function should be called when a job completes (stops) to archive its
|
|
// metric data from the metric store to the configured archive backend.
|
|
//
|
|
// The function:
|
|
// 1. Increments the pending job counter (WaitGroup)
|
|
// 2. Sends the job to the archiving channel (buffered, capacity 128)
|
|
// 3. Returns immediately (non-blocking unless channel is full)
|
|
//
|
|
// The actual archiving is performed asynchronously by the worker goroutine.
|
|
// Upon completion, the worker will decrement the pending counter.
|
|
//
|
|
// Panics if Start() has not been called first.
|
|
func TriggerArchiving(job *schema.Job) {
|
|
if archiveChannel == nil {
|
|
cclog.Fatal("Cannot archive without archiving channel. Did you Start the archiver?")
|
|
}
|
|
|
|
archivePending.Add(1)
|
|
archiveChannel <- job
|
|
}
|
|
|
|
// Shutdown performs a graceful shutdown of the archiver with a configurable timeout.
|
|
//
|
|
// The shutdown process:
|
|
// 1. Closes archiveChannel - no new jobs will be accepted
|
|
// 2. Waits for pending jobs to complete (up to timeout duration)
|
|
// 3. If timeout is exceeded:
|
|
// - Cancels shutdownCtx to interrupt ongoing ArchiveJob operations
|
|
// - Returns error indicating timeout
|
|
// 4. Waits for worker goroutine to exit cleanly
|
|
//
|
|
// Parameters:
|
|
// - timeout: Maximum duration to wait for pending jobs to complete
|
|
// (recommended: 10-30 seconds for production)
|
|
//
|
|
// Returns:
|
|
// - nil if all jobs completed within timeout
|
|
// - error if timeout was exceeded (some jobs may not have been archived)
|
|
//
|
|
// Jobs that don't complete within the timeout will be marked as failed.
|
|
// The function always ensures the worker goroutine exits before returning.
|
|
//
|
|
// Example:
|
|
//
|
|
// if err := archiver.Shutdown(10 * time.Second); err != nil {
|
|
// log.Printf("Some jobs did not complete: %v", err)
|
|
// }
|
|
func Shutdown(timeout time.Duration) error {
|
|
cclog.Info("Initiating archiver shutdown...")
|
|
|
|
// Close channel to signal no more jobs will be accepted
|
|
close(archiveChannel)
|
|
|
|
// Create a channel to signal when all jobs are done
|
|
done := make(chan struct{})
|
|
go func() {
|
|
archivePending.Wait()
|
|
close(done)
|
|
}()
|
|
|
|
// Wait for jobs to complete or timeout
|
|
select {
|
|
case <-done:
|
|
cclog.Info("All archive jobs completed successfully")
|
|
// Wait for worker to exit
|
|
<-workerDone
|
|
return nil
|
|
case <-time.After(timeout):
|
|
cclog.Warn("Archiver shutdown timeout exceeded, cancelling remaining operations")
|
|
// Cancel any ongoing operations
|
|
shutdownCancel()
|
|
// Wait for worker to exit
|
|
<-workerDone
|
|
return fmt.Errorf("archiver shutdown timeout after %v", timeout)
|
|
}
|
|
}
|