Added support for extracting Resources and MetaData

2026-02-06 11:11:45 +01:00 · 2023-09-26 15:54:33 +02:00
parent 41cd1171fb
commit 36b0f33208
1 changed files with 102 additions and 52 deletions
--- a/internal/scheduler/slurmRest.go
+++ b/internal/scheduler/slurmRest.go
@@ -8,11 +8,11 @@ import (
 	"database/sql"
 	"encoding/json"
 	"fmt"
 	"math"
 	"net/http"
 	"net/url"
 	"os"
 	"os/exec"
 	"regexp"
 	"strconv"
 	"time"
@@ -141,25 +141,7 @@ func queryAllJobs() ([]interface{}, error) {
 	return jobs, nil
 }
-func printSlurmInfo(job map[string]interface{}) string {
+func printSlurmInfo(job openapi.V0038JobResponseProperties) string {
 	cpusPerTask := "1"
 	tasksStr, ok := job["tasks"].(string)
 	if !ok {
 		tasksInt, _ := job["tasks"].(int)
 		tasksStr = strconv.Itoa(tasksInt)
 	}
 	cpusStr, ok := job["cpus"].(string)
 	if !ok {
 		cpusInt, _ := job["cpus"].(int)
 		cpusStr = strconv.Itoa(cpusInt)
 	}
 	tasks, _ := strconv.Atoi(tasksStr)
 	cpus, _ := strconv.Atoi(cpusStr)
 	if tasks > 0 {
 		cpusPerTask = strconv.Itoa(int(math.Round(float64(cpus) / float64(tasks))))
 	}
 	text := fmt.Sprintf(`
 	    JobId=%v JobName=%v
@@ -178,22 +160,22 @@ func printSlurmInfo(job map[string]interface{}) string {
 		WorkDir=%v
 		StdErr=%v
 		StdOut=%v`,
-		job["job_id"], job["name"],
+		job.JobId, job.Name,
-		job["user_name"], job["user_id"], job["group_id"],
+		job.UserName, job.UserId, job.GroupId,
-		job["account"], job["qos"],
+		job.Account, job.Qos,
-		job["requeue"], job["restart_cnt"], job["batch_flag"],
+		job.Requeue, job.RestartCnt, job.BatchFlag,
-		job["time_limit"],
+		job.TimeLimit, job.SubmitTime,
-		time.Unix(int64(job["submit_time"].(float64)), 0).Format(time.RFC3338),
+		//time.Unix(int64(*.(float64)), 0).Format(time.RFC1123),
-		job["partition"],
+		job.Partition,
-		job["nodes"],
+		job.Nodes,
-		job["node_count"], cpus, tasks, cpusPerTask,
+		job.NodeCount, job.Cpus, job.Tasks, job.CpusPerTask,
-		job["tasks_per_node"], job["tasks_per_socket"], job["tasks_per_core"],
+		job.TasksPerBoard, job.TasksPerSocket, job.TasksPerCore,
-		job["tres_req_str"],
+		job.TresAllocStr,
-		job["tres_alloc_str"],
+		job.TresAllocStr,
-		job["command"],
+		job.Command,
-		job["current_working_directory"],
+		job.CurrentWorkingDirectory,
-		job["standard_error"],
+		job.StandardError,
-		job["standard_output"],
+		job.StandardOutput,
 	)
 	return text
@@ -251,6 +233,40 @@ func (sd *SlurmRestScheduler) Init(rawConfig json.RawMessage) error {
 	return err
 }
 // checkAndHandleStopJob validates a stop request against the given job, stores
 // the resulting state and duration through the job repository, and finally
 // triggers archiving unless monitoring is disabled for the job. Problems are
 // reported through the logger; nothing is returned to the caller.
 func (sd *SlurmRestScheduler) checkAndHandleStopJob(job *schema.Job, req *StopJobRequest) {
 	// A job can only be stopped if it exists, its stop time lies after its
 	// start time, and it is currently in the running state.
 	stoppable := job != nil && job.StartTime.Unix() < req.StopTime && job.State == schema.JobStateRunning
 	if !stoppable {
 		log.Errorf("stopTime must be larger than startTime and only running jobs can be stopped")
 		return
 	}
 	// An empty requested state falls back to "completed"; any non-empty
 	// state has to pass validation.
 	switch {
 	case req.State == "":
 		req.State = schema.JobStateCompleted
 	case !req.State.Valid():
 		log.Errorf("invalid job state: %#v", req.State)
 		return
 	}
 	// Update state and duration on the job and persist both in the database.
 	job.Duration = int32(req.StopTime - job.StartTime.Unix())
 	job.State = req.State
 	err := sd.JobRepository.Stop(job.ID, job.Duration, job.State, job.MonitoringStatus)
 	if err != nil {
 		log.Errorf("marking job as stopped failed: %s", err.Error())
 		return
 	}
 	log.Printf("archiving job... (dbid: %d): cluster=%s, jobId=%d, user=%s, startTime=%s", job.ID, job.Cluster, job.JobID, job.User, job.StartTime)
 	// Archiving is only triggered for jobs whose monitoring is not disabled.
 	if job.MonitoringStatus != schema.MonitoringStatusDisabled {
 		sd.JobRepository.TriggerArchiving(job)
 	}
 }
 func (sd *SlurmRestScheduler) HandleJobsResponse(jobsResponse openapi.V0038JobsResponse) {
 	// Iterate over the Jobs slice
@@ -269,10 +285,7 @@ func (sd *SlurmRestScheduler) HandleJobsResponse(jobsResponse openapi.V0038JobsR
 		// is "running" one of JSON state?
 		if *job.JobState == "running" {
-			// Check if combination of (job_id, cluster_id, start_time) already exists:
+			jobs, err := sd.JobRepository.FindRunningJobs(*job.Cluster)
 			// jobs, err := sd.JobRepository.FindRunningJobs(job.Cluster, job.StartTime)
 			jobs, err := sd.FindRunningJobs(job.Cluster)
 			if err != nil {
 				log.Fatalf("Failed to find running jobs: %v", err)
 			}
@@ -298,6 +311,42 @@ func (sd *SlurmRestScheduler) HandleJobsResponse(jobsResponse openapi.V0038JobsR
 						log.Fatalf("JSON marshaling failed: %s", err)
 					}
 					var resources []*schema.Resource
 					// Define a regular expression to match "gres/gpu=x"
 					regex := regexp.MustCompile(`gres/gpu=(\d+)`)
 					// Find all matches in the input string
 					matches := regex.FindAllStringSubmatch(*job.TresAllocStr, -1)
 					// Initialize a variable to store the total number of GPUs
 					var totalGPUs int32
 					// Iterate through the matches
 					match := matches[0]
 					if len(match) == 2 {
 						gpuCount, _ := strconv.Atoi(match[1])
 						totalGPUs += int32(gpuCount)
 					}
 					for _, node := range job.JobResources.AllocatedNodes {
 						var res schema.Resource
 						res.Hostname = *node.Nodename
 						for k, v := range node.Sockets.Cores {
 							fmt.Printf("core id[%s] value[%s]\n", k, v)
 							threadID, _ := strconv.Atoi(k)
 							res.HWThreads = append(res.HWThreads, threadID)
 						}
 						res.Accelerators = append(res.Accelerators, *job.TresAllocStr)
 						// cpu=512,mem=1875G,node=4,billing=512,gres\/gpu=32,gres\/gpu:a40=32
 						resources = append(resources, &res)
 					}
 					var metaData map[string]string
 					metaData["jobName"] = *job.Name
 					metaData["slurmInfo"] = printSlurmInfo(job)
 					// metaData["jobScript"] = "What to put here?"
 					metaDataInBytes, err := json.Marshal(metaData)
 					var defaultJob schema.BaseJob = schema.BaseJob{
 						JobID:     int64(*job.JobId),
 						User:      *job.UserName,
@@ -308,8 +357,8 @@ func (sd *SlurmRestScheduler) HandleJobsResponse(jobsResponse openapi.V0038JobsR
 						ArrayJobId:   int64(*job.ArrayJobId),
 						NumNodes:     *job.NodeCount,
 						NumHWThreads: *job.Cpus,
-						// NumAcc:       job.NumAcc,
+						NumAcc:       totalGPUs,
-						Exclusive: exclusive,
+						Exclusive:    exclusive,
 						// MonitoringStatus: job.MonitoringStatus,
 						// SMT:            *job.TasksPerCore,
 						State: schema.JobState(*job.JobState),
@@ -321,10 +370,10 @@ func (sd *SlurmRestScheduler) HandleJobsResponse(jobsResponse openapi.V0038JobsR
 						RawResources: jobResourcesInBytes,
 						// "job_resources": "allocated_nodes" "sockets":
 						// very important; has to be right
-						Resources: job.JobResources,
+						Resources:   resources,
-						// RawMetaData:    job.RawMetaData,
+						RawMetaData: metaDataInBytes,
 						// optional metadata with 'jobScript', 'jobName', 'slurmInfo':
-						// MetaData:       job.MetaData,
+						MetaData: metaData,
 						// ConcurrentJobs: job.ConcurrentJobs,
 					}
 					req := &schema.JobMeta{
@@ -342,23 +391,24 @@ func (sd *SlurmRestScheduler) HandleJobsResponse(jobsResponse openapi.V0038JobsR
 			}
 		} else {
 			// Check if completed job with combination of (job_id, cluster_id, start_time) already exists:
-			existingJob, err := sd.JobRepository.Find(job.JobID, &job.Cluster, job.StartTime)
+			var jobID int64
 			jobID = int64(*job.JobId)
 			existingJob, err := sd.JobRepository.Find(&jobID, job.Cluster, job.StartTime)
 			if err == nil {
-				existingJob.BaseJob.Duration = *job.EndTime - *job.StartTime
+				existingJob.BaseJob.Duration = int32(*job.EndTime - *job.StartTime)
 				existingJob.BaseJob.State = schema.JobState(*job.JobState)
-				existingJob.BaseJob.Walltime = job.StartTime
+				existingJob.BaseJob.Walltime = *job.StartTime
-				var jobID int64
+
 				jobID = int64(*job.JobId)
 				req := &StopJobRequest{
 					Cluster:   job.Cluster,
 					JobId:     &jobID,
 					State:     schema.JobState(*job.JobState),
-					StartTime: existingJob.StartTime,
+					StartTime: &existingJob.StartTimeUnix,
 					StopTime:  *job.EndTime,
 				}
 				// req := new(schema.JobMeta)
-				id, err := sd.JobRepository.checkAndHandleStopJob(job, req)
+				sd.checkAndHandleStopJob(existingJob, req)
 			}
 		}