Added support for extracting Resources and MetaData

Bole Ma 2023-09-26 15:54:33 +02:00
parent 41cd1171fb
commit 36b0f33208


@@ -8,11 +8,11 @@ import (
 	"database/sql"
 	"encoding/json"
 	"fmt"
-	"math"
 	"net/http"
 	"net/url"
 	"os"
 	"os/exec"
+	"regexp"
 	"strconv"
 	"time"
@@ -141,25 +141,7 @@ func queryAllJobs() ([]interface{}, error) {
 	return jobs, nil
 }
 
-func printSlurmInfo(job map[string]interface{}) string {
-	cpusPerTask := "1"
-	tasksStr, ok := job["tasks"].(string)
-	if !ok {
-		tasksInt, _ := job["tasks"].(int)
-		tasksStr = strconv.Itoa(tasksInt)
-	}
-	cpusStr, ok := job["cpus"].(string)
-	if !ok {
-		cpusInt, _ := job["cpus"].(int)
-		cpusStr = strconv.Itoa(cpusInt)
-	}
-	tasks, _ := strconv.Atoi(tasksStr)
-	cpus, _ := strconv.Atoi(cpusStr)
-	if tasks > 0 {
-		cpusPerTask = strconv.Itoa(int(math.Round(float64(cpus) / float64(tasks))))
-	}
+func printSlurmInfo(job openapi.V0038JobResponseProperties) string {
 
 	text := fmt.Sprintf(`
 JobId=%v JobName=%v
@@ -178,22 +160,22 @@ func printSlurmInfo(job map[string]interface{}) string {
 WorkDir=%v
 StdErr=%v
 StdOut=%v`,
-		job["job_id"], job["name"],
-		job["user_name"], job["user_id"], job["group_id"],
-		job["account"], job["qos"],
-		job["requeue"], job["restart_cnt"], job["batch_flag"],
-		job["time_limit"],
-		time.Unix(int64(job["submit_time"].(float64)), 0).Format(time.RFC3339),
-		job["partition"],
-		job["nodes"],
-		job["node_count"], cpus, tasks, cpusPerTask,
-		job["tasks_per_node"], job["tasks_per_socket"], job["tasks_per_core"],
-		job["tres_req_str"],
-		job["tres_alloc_str"],
-		job["command"],
-		job["current_working_directory"],
-		job["standard_error"],
-		job["standard_output"],
+		job.JobId, job.Name,
+		job.UserName, job.UserId, job.GroupId,
+		job.Account, job.Qos,
+		job.Requeue, job.RestartCnt, job.BatchFlag,
+		job.TimeLimit, job.SubmitTime,
+		//time.Unix(int64(*.(float64)), 0).Format(time.RFC1123),
+		job.Partition,
+		job.Nodes,
+		job.NodeCount, job.Cpus, job.Tasks, job.CpusPerTask,
+		job.TasksPerBoard, job.TasksPerSocket, job.TasksPerCore,
+		job.TresAllocStr,
+		job.TresAllocStr,
+		job.Command,
+		job.CurrentWorkingDirectory,
+		job.StandardError,
+		job.StandardOutput,
 	)
 	return text
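
Note on the commented-out timestamp line above: the old code rendered the Unix submit time explicitly, while the new code passes job.SubmitTime through unformatted. As a minimal, self-contained sketch (the timestamp value is made up), formatting a Unix submit time with Go's time package looks like this:

package main

import (
	"fmt"
	"time"
)

func main() {
	// Hypothetical submit time in Unix seconds.
	var submitTime int64 = 1695735273

	t := time.Unix(submitTime, 0)
	fmt.Println(t.Format(time.RFC3339)) // e.g. 2023-09-26T13:34:33Z (output depends on the local time zone)
	fmt.Println(t.Format(time.RFC1123)) // e.g. Tue, 26 Sep 2023 13:34:33 UTC
}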
@@ -251,6 +233,40 @@ func (sd *SlurmRestScheduler) Init(rawConfig json.RawMessage) error {
 	return err
 }
 
+func (sd *SlurmRestScheduler) checkAndHandleStopJob(job *schema.Job, req *StopJobRequest) {
+
+	// Sanity checks
+	if job == nil || job.StartTime.Unix() >= req.StopTime || job.State != schema.JobStateRunning {
+		log.Errorf("stopTime must be larger than startTime and only running jobs can be stopped")
+		return
+	}
+
+	if req.State != "" && !req.State.Valid() {
+		log.Errorf("invalid job state: %#v", req.State)
+		return
+	} else if req.State == "" {
+		req.State = schema.JobStateCompleted
+	}
+
+	// Mark job as stopped in the database (update state and duration)
+	job.Duration = int32(req.StopTime - job.StartTime.Unix())
+	job.State = req.State
+	if err := sd.JobRepository.Stop(job.ID, job.Duration, job.State, job.MonitoringStatus); err != nil {
+		log.Errorf("marking job as stopped failed: %s", err.Error())
+		return
+	}
+
+	log.Printf("archiving job... (dbid: %d): cluster=%s, jobId=%d, user=%s, startTime=%s", job.ID, job.Cluster, job.JobID, job.User, job.StartTime)
+
+	// Monitoring is disabled...
+	if job.MonitoringStatus == schema.MonitoringStatusDisabled {
+		return
+	}
+
+	// Trigger async archiving
+	sd.JobRepository.TriggerArchiving(job)
+}
+
 func (sd *SlurmRestScheduler) HandleJobsResponse(jobsResponse openapi.V0038JobsResponse) {
 	// Iterate over the Jobs slice
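
The sanity check at the top of the new checkAndHandleStopJob rejects requests whose stop time is not after the job's start time and jobs that are not in the running state. A minimal sketch of that guard, using simplified stand-in types rather than the real schema.Job and StopJobRequest (the field names below are assumptions for illustration only):

package main

import (
	"fmt"
	"time"
)

// Simplified stand-ins; the real types live in the schema package.
type fakeJob struct {
	StartTime time.Time
	Running   bool
}

type fakeStopRequest struct {
	StopTime int64 // Unix seconds
}

// mayStop mirrors the guard above: the job must exist, the stop time must lie
// strictly after the start time, and only running jobs may be stopped.
func mayStop(j *fakeJob, req *fakeStopRequest) bool {
	return j != nil && j.StartTime.Unix() < req.StopTime && j.Running
}

func main() {
	j := &fakeJob{StartTime: time.Unix(1695735273, 0), Running: true}
	fmt.Println(mayStop(j, &fakeStopRequest{StopTime: 1695738873})) // true
	fmt.Println(mayStop(j, &fakeStopRequest{StopTime: 1695731673})) // false: stop time before start time
}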
@@ -269,10 +285,7 @@ func (sd *SlurmRestScheduler) HandleJobsResponse(jobsResponse openapi.V0038JobsR
 		// is "running" one of JSON state?
 		if *job.JobState == "running" {
-			// Check if combination of (job_id, cluster_id, start_time) already exists:
-			// jobs, err := sd.JobRepository.FindRunningJobs(job.Cluster, job.StartTime)
-			jobs, err := sd.FindRunningJobs(job.Cluster)
+			jobs, err := sd.JobRepository.FindRunningJobs(*job.Cluster)
 			if err != nil {
 				log.Fatalf("Failed to find running jobs: %v", err)
 			}
@@ -298,6 +311,42 @@ func (sd *SlurmRestScheduler) HandleJobsResponse(jobsResponse openapi.V0038JobsR
 				log.Fatalf("JSON marshaling failed: %s", err)
 			}
 
+			var resources []*schema.Resource
+
+			// Define a regular expression to match "gres/gpu=x"
+			regex := regexp.MustCompile(`gres/gpu=(\d+)`)
+
+			// Find all matches in the input string
+			matches := regex.FindAllStringSubmatch(*job.TresAllocStr, -1)
+
+			// Initialize a variable to store the total number of GPUs
+			var totalGPUs int32
+			// Iterate through the matches
+			match := matches[0]
+			if len(match) == 2 {
+				gpuCount, _ := strconv.Atoi(match[1])
+				totalGPUs += int32(gpuCount)
+			}
+
+			for _, node := range job.JobResources.AllocatedNodes {
+				var res schema.Resource
+				res.Hostname = *node.Nodename
+				for k, v := range node.Sockets.Cores {
+					fmt.Printf("core id[%s] value[%s]\n", k, v)
+					threadID, _ := strconv.Atoi(k)
+					res.HWThreads = append(res.HWThreads, threadID)
+				}
+
+				res.Accelerators = append(res.Accelerators, *job.TresAllocStr)
+				// cpu=512,mem=1875G,node=4,billing=512,gres/gpu=32,gres/gpu:a40=32
+				resources = append(resources, &res)
+			}
+
+			var metaData map[string]string
+			metaData["jobName"] = *job.Name
+			metaData["slurmInfo"] = printSlurmInfo(job)
+			// metaData["jobScript"] = "What to put here?"
+			metaDataInBytes, err := json.Marshal(metaData)
+
 			var defaultJob schema.BaseJob = schema.BaseJob{
 				JobID: int64(*job.JobId),
 				User:  *job.UserName,
@@ -308,8 +357,8 @@ func (sd *SlurmRestScheduler) HandleJobsResponse(jobsResponse openapi.V0038JobsR
 				ArrayJobId:   int64(*job.ArrayJobId),
 				NumNodes:     *job.NodeCount,
 				NumHWThreads: *job.Cpus,
-				// NumAcc:       job.NumAcc,
+				NumAcc:       totalGPUs,
 				Exclusive:    exclusive,
 				// MonitoringStatus: job.MonitoringStatus,
 				// SMT:          *job.TasksPerCore,
 				State:        schema.JobState(*job.JobState),
@@ -321,10 +370,10 @@ func (sd *SlurmRestScheduler) HandleJobsResponse(jobsResponse openapi.V0038JobsR
 				RawResources: jobResourcesInBytes,
 				// "job_resources": "allocated_nodes" "sockets":
 				// very important; has to be right
-				Resources:   job.JobResources,
-				// RawMetaData: job.RawMetaData,
+				Resources:   resources,
+				RawMetaData: metaDataInBytes,
 				// optional metadata with 'jobScript', 'jobName', 'slurmInfo':
-				// MetaData: job.MetaData,
+				MetaData:    metaData,
 				// ConcurrentJobs: job.ConcurrentJobs,
 			}
 			req := &schema.JobMeta{
@@ -342,23 +391,24 @@ func (sd *SlurmRestScheduler) HandleJobsResponse(jobsResponse openapi.V0038JobsR
 			}
 		} else {
 			// Check if completed job with combination of (job_id, cluster_id, start_time) already exists:
-			existingJob, err := sd.JobRepository.Find(job.JobID, &job.Cluster, job.StartTime)
+			var jobID int64
+			jobID = int64(*job.JobId)
+			existingJob, err := sd.JobRepository.Find(&jobID, job.Cluster, job.StartTime)
 
 			if err == nil {
-				existingJob.BaseJob.Duration = *job.EndTime - *job.StartTime
+				existingJob.BaseJob.Duration = int32(*job.EndTime - *job.StartTime)
 				existingJob.BaseJob.State = schema.JobState(*job.JobState)
-				existingJob.BaseJob.Walltime = job.StartTime
-				var jobID int64
-				jobID = int64(*job.JobId)
+				existingJob.BaseJob.Walltime = *job.StartTime
 				req := &StopJobRequest{
 					Cluster:   job.Cluster,
 					JobId:     &jobID,
 					State:     schema.JobState(*job.JobState),
-					StartTime: existingJob.StartTime,
+					StartTime: &existingJob.StartTimeUnix,
 					StopTime:  *job.EndTime,
 				}
 				// req := new(schema.JobMeta)
-				id, err := sd.JobRepository.checkAndHandleStopJob(job, req)
+				sd.checkAndHandleStopJob(existingJob, req)
 			}
 		}
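
For reference, the GPU counting added in this commit can be exercised on its own. The sketch below applies the same gres/gpu=(\d+) pattern to the TRES example string quoted in the comment above; countAllocatedGPUs is a hypothetical helper name, and unlike the hunk above it iterates over every match and checks the Atoi error:

package main

import (
	"fmt"
	"regexp"
	"strconv"
)

// countAllocatedGPUs sums every "gres/gpu=<n>" entry in a TRES allocation string.
func countAllocatedGPUs(tresAllocStr string) int32 {
	regex := regexp.MustCompile(`gres/gpu=(\d+)`)
	var totalGPUs int32
	for _, match := range regex.FindAllStringSubmatch(tresAllocStr, -1) {
		if len(match) == 2 {
			if gpuCount, err := strconv.Atoi(match[1]); err == nil {
				totalGPUs += int32(gpuCount)
			}
		}
	}
	return totalGPUs
}

func main() {
	tres := "cpu=512,mem=1875G,node=4,billing=512,gres/gpu=32,gres/gpu:a40=32"
	fmt.Println(countAllocatedGPUs(tres)) // 32; the typed "gres/gpu:a40=32" entry does not match the pattern

	// Metadata map as sketched here; a map must be initialized (with a literal
	// or make) before keys are assigned, since writing to a nil map panics.
	// The values below are placeholders.
	metaData := map[string]string{
		"jobName":   "example_job",
		"slurmInfo": "JobId=42 JobName=example_job",
	}
	fmt.Println(len(metaData)) // 2
}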