fix: Always transfer cached jobs to the main job table before archiving

2026-02-11 21:41:46 +01:00 · 2026-02-09 19:57:46 +01:00
parent 51e9d33f9f
commit ac7eb93141
4 changed files with 91 additions and 58 deletions
--- a/internal/api/job.go
+++ b/internal/api/job.go
@@ -754,6 +754,7 @@ func (api *RestAPI) stopJobByRequest(rw http.ResponseWriter, r *http.Request) {
 		return
 	}

+	isCached := false
 	job, err = api.JobRepository.Find(req.JobID, req.Cluster, req.StartTime)
 	if err != nil {
 		// Try cached jobs if not found in main repository
@@ -764,9 +765,10 @@ func (api *RestAPI) stopJobByRequest(rw http.ResponseWriter, r *http.Request) {
 			return
 		}
 		job = cachedJob
+		isCached = true
 	}

-	api.checkAndHandleStopJob(rw, job, req)
+	api.checkAndHandleStopJob(rw, job, req, isCached)
 }

 // deleteJobByID godoc
@@ -923,7 +925,7 @@ func (api *RestAPI) deleteJobBefore(rw http.ResponseWriter, r *http.Request) {
 	}
 }

-func (api *RestAPI) checkAndHandleStopJob(rw http.ResponseWriter, job *schema.Job, req StopJobAPIRequest) {
+func (api *RestAPI) checkAndHandleStopJob(rw http.ResponseWriter, job *schema.Job, req StopJobAPIRequest, isCached bool) {
 	// Sanity checks
 	if job.State != schema.JobStateRunning {
 		handleError(fmt.Errorf("jobId %d (id %d) on %s : job has already been stopped (state is: %s)", job.JobID, *job.ID, job.Cluster, job.State), http.StatusUnprocessableEntity, rw)
@@ -948,11 +950,21 @@ func (api *RestAPI) checkAndHandleStopJob(rw http.ResponseWriter, job *schema.Jo
 	api.JobRepository.Mutex.Lock()
 	defer api.JobRepository.Mutex.Unlock()

-	if err := api.JobRepository.Stop(*job.ID, job.Duration, job.State, job.MonitoringStatus); err != nil {
-		if err := api.JobRepository.StopCached(*job.ID, job.Duration, job.State, job.MonitoringStatus); err != nil {
-			handleError(fmt.Errorf("jobId %d (id %d) on %s : marking job as '%s' (duration: %d) in DB failed: %w", job.JobID, *job.ID, job.Cluster, job.State, job.Duration, err), http.StatusInternalServerError, rw)
+	// If the job is still in job_cache, transfer it to the job table first
+	// so that job.ID always points to the job table for downstream code
+	if isCached {
+		newID, err := api.JobRepository.TransferCachedJobToMain(*job.ID)
+		if err != nil {
+			handleError(fmt.Errorf("jobId %d (id %d) on %s : transferring cached job failed: %w", job.JobID, *job.ID, job.Cluster, err), http.StatusInternalServerError, rw)
 			return
 		}
+		cclog.Infof("transferred cached job to main table: old id %d -> new id %d (jobId=%d)", *job.ID, newID, job.JobID)
+		job.ID = &newID
+	}
+
+	if err := api.JobRepository.Stop(*job.ID, job.Duration, job.State, job.MonitoringStatus); err != nil {
+		handleError(fmt.Errorf("jobId %d (id %d) on %s : marking job as '%s' (duration: %d) in DB failed: %w", job.JobID, *job.ID, job.Cluster, job.State, job.Duration, err), http.StatusInternalServerError, rw)
+		return
 	}

 	cclog.Infof("archiving job... (dbid: %d): cluster=%s, jobId=%d, user=%s, startTime=%d, duration=%d, state=%s", *job.ID, job.Cluster, job.JobID, job.User, job.StartTime, job.Duration, job.State)
--- a/internal/api/nats.go
+++ b/internal/api/nats.go
@@ -251,6 +251,7 @@ func (api *NatsAPI) handleStopJob(payload string) {
 		return
 	}

+	isCached := false
 	job, err := api.JobRepository.Find(req.JobID, req.Cluster, req.StartTime)
 	if err != nil {
 		cachedJob, cachedErr := api.JobRepository.FindCached(req.JobID, req.Cluster, req.StartTime)
@@ -260,6 +261,7 @@ func (api *NatsAPI) handleStopJob(payload string) {
 			return
 		}
 		job = cachedJob
+		isCached = true
 	}

 	if job.State != schema.JobStateRunning {
@@ -287,16 +289,26 @@ func (api *NatsAPI) handleStopJob(payload string) {
 	api.JobRepository.Mutex.Lock()
 	defer api.JobRepository.Mutex.Unlock()

-	if err := api.JobRepository.Stop(*job.ID, job.Duration, job.State, job.MonitoringStatus); err != nil {
-		if err := api.JobRepository.StopCached(*job.ID, job.Duration, job.State, job.MonitoringStatus); err != nil {
-			cclog.Errorf("NATS job stop: jobId %d (id %d) on %s: marking job as '%s' failed: %v",
-				job.JobID, job.ID, job.Cluster, job.State, err)
+	// If the job is still in job_cache, transfer it to the job table first
+	if isCached {
+		newID, err := api.JobRepository.TransferCachedJobToMain(*job.ID)
+		if err != nil {
+			cclog.Errorf("NATS job stop: jobId %d (id %d) on %s: transferring cached job failed: %v",
+				job.JobID, *job.ID, job.Cluster, err)
 			return
 		}
+		cclog.Infof("NATS: transferred cached job to main table: old id %d -> new id %d (jobId=%d)", *job.ID, newID, job.JobID)
+		job.ID = &newID
+	}
+
+	if err := api.JobRepository.Stop(*job.ID, job.Duration, job.State, job.MonitoringStatus); err != nil {
+		cclog.Errorf("NATS job stop: jobId %d (id %d) on %s: marking job as '%s' failed: %v",
+			job.JobID, *job.ID, job.Cluster, job.State, err)
+		return
 	}

 	cclog.Infof("NATS: archiving job (dbid: %d): cluster=%s, jobId=%d, user=%s, startTime=%d, duration=%d, state=%s",
-		job.ID, job.Cluster, job.JobID, job.User, job.StartTime, job.Duration, job.State)
+		*job.ID, job.Cluster, job.JobID, job.User, job.StartTime, job.Duration, job.State)

 	if job.MonitoringStatus == schema.MonitoringStatusDisabled {
 		return