mirror of https://github.com/ClusterCockpit/cc-backend
Added support for extracting Resources and MetaData
This commit is contained in:
parent 41cd1171fb
commit 36b0f33208
@@ -8,11 +8,11 @@ import (
     "database/sql"
     "encoding/json"
     "fmt"
-    "math"
     "net/http"
     "net/url"
     "os"
     "os/exec"
+    "regexp"
     "strconv"
     "time"
 
@@ -141,25 +141,7 @@ func queryAllJobs() ([]interface{}, error) {
     return jobs, nil
 }
 
-func printSlurmInfo(job map[string]interface{}) string {
-    cpusPerTask := "1"
-    tasksStr, ok := job["tasks"].(string)
-    if !ok {
-        tasksInt, _ := job["tasks"].(int)
-        tasksStr = strconv.Itoa(tasksInt)
-    }
-
-    cpusStr, ok := job["cpus"].(string)
-    if !ok {
-        cpusInt, _ := job["cpus"].(int)
-        cpusStr = strconv.Itoa(cpusInt)
-    }
-
-    tasks, _ := strconv.Atoi(tasksStr)
-    cpus, _ := strconv.Atoi(cpusStr)
-    if tasks > 0 {
-        cpusPerTask = strconv.Itoa(int(math.Round(float64(cpus) / float64(tasks))))
-    }
+func printSlurmInfo(job openapi.V0038JobResponseProperties) string {
 
     text := fmt.Sprintf(`
 JobId=%v JobName=%v
@@ -178,22 +160,22 @@ func printSlurmInfo(job map[string]interface{}) string {
 WorkDir=%v
 StdErr=%v
 StdOut=%v`,
-        job["job_id"], job["name"],
-        job["user_name"], job["user_id"], job["group_id"],
-        job["account"], job["qos"],
-        job["requeue"], job["restart_cnt"], job["batch_flag"],
-        job["time_limit"],
-        time.Unix(int64(job["submit_time"].(float64)), 0).Format(time.RFC3339),
-        job["partition"],
-        job["nodes"],
-        job["node_count"], cpus, tasks, cpusPerTask,
-        job["tasks_per_node"], job["tasks_per_socket"], job["tasks_per_core"],
-        job["tres_req_str"],
-        job["tres_alloc_str"],
-        job["command"],
-        job["current_working_directory"],
-        job["standard_error"],
-        job["standard_output"],
+        job.JobId, job.Name,
+        job.UserName, job.UserId, job.GroupId,
+        job.Account, job.Qos,
+        job.Requeue, job.RestartCnt, job.BatchFlag,
+        job.TimeLimit, job.SubmitTime,
+        //time.Unix(int64(*.(float64)), 0).Format(time.RFC1123),
+        job.Partition,
+        job.Nodes,
+        job.NodeCount, job.Cpus, job.Tasks, job.CpusPerTask,
+        job.TasksPerBoard, job.TasksPerSocket, job.TasksPerCore,
+        job.TresAllocStr,
+        job.TresAllocStr,
+        job.Command,
+        job.CurrentWorkingDirectory,
+        job.StandardError,
+        job.StandardOutput,
     )
 
     return text
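For context on the printSlurmInfo change above: the old signature took the job as a generic map[string]interface{}, so numeric fields had to be type-asserted (string or int) and converted before formatting, while the generated openapi.V0038JobResponseProperties struct exposes typed fields directly. A minimal, self-contained sketch of the two access patterns, using a hypothetical two-field jobProps type as a stand-in for the generated struct:

package main

import (
    "fmt"
    "strconv"
)

// jobProps is a hypothetical stand-in for the generated
// openapi.V0038JobResponseProperties type: typed, optional fields.
type jobProps struct {
    Tasks *int32
    Cpus  *int32
}

// fromMap mirrors the removed code path: values arrive as
// interface{} and may be either strings or ints.
func fromMap(job map[string]interface{}) (tasks, cpus int) {
    tasksStr, ok := job["tasks"].(string)
    if !ok {
        tasksInt, _ := job["tasks"].(int)
        tasksStr = strconv.Itoa(tasksInt)
    }
    cpusStr, ok := job["cpus"].(string)
    if !ok {
        cpusInt, _ := job["cpus"].(int)
        cpusStr = strconv.Itoa(cpusInt)
    }
    tasks, _ = strconv.Atoi(tasksStr)
    cpus, _ = strconv.Atoi(cpusStr)
    return tasks, cpus
}

// fromStruct mirrors the new code path: fields are already typed,
// only nil checks remain.
func fromStruct(job jobProps) (tasks, cpus int32) {
    if job.Tasks != nil {
        tasks = *job.Tasks
    }
    if job.Cpus != nil {
        cpus = *job.Cpus
    }
    return tasks, cpus
}

func main() {
    t, c := fromMap(map[string]interface{}{"tasks": 4, "cpus": "16"})
    fmt.Println(t, c) // 4 16

    n := int32(4)
    m := int32(16)
    t2, c2 := fromStruct(jobProps{Tasks: &n, Cpus: &m})
    fmt.Println(t2, c2) // 4 16
}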
@@ -251,6 +233,40 @@ func (sd *SlurmRestScheduler) Init(rawConfig json.RawMessage) error {
     return err
 }
 
+func (sd *SlurmRestScheduler) checkAndHandleStopJob(job *schema.Job, req *StopJobRequest) {
+
+    // Sanity checks
+    if job == nil || job.StartTime.Unix() >= req.StopTime || job.State != schema.JobStateRunning {
+        log.Errorf("stopTime must be larger than startTime and only running jobs can be stopped")
+        return
+    }
+
+    if req.State != "" && !req.State.Valid() {
+        log.Errorf("invalid job state: %#v", req.State)
+        return
+    } else if req.State == "" {
+        req.State = schema.JobStateCompleted
+    }
+
+    // Mark job as stopped in the database (update state and duration)
+    job.Duration = int32(req.StopTime - job.StartTime.Unix())
+    job.State = req.State
+    if err := sd.JobRepository.Stop(job.ID, job.Duration, job.State, job.MonitoringStatus); err != nil {
+        log.Errorf("marking job as stopped failed: %s", err.Error())
+        return
+    }
+
+    log.Printf("archiving job... (dbid: %d): cluster=%s, jobId=%d, user=%s, startTime=%s", job.ID, job.Cluster, job.JobID, job.User, job.StartTime)
+
+    // Monitoring is disabled...
+    if job.MonitoringStatus == schema.MonitoringStatusDisabled {
+        return
+    }
+
+    // Trigger async archiving
+    sd.JobRepository.TriggerArchiving(job)
+}
+
 func (sd *SlurmRestScheduler) HandleJobsResponse(jobsResponse openapi.V0038JobsResponse) {
 
     // Iterate over the Jobs slice
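A quick standalone check of the guard and the duration arithmetic in the new checkAndHandleStopJob helper, with hypothetical timestamps (StopTime is treated as Unix seconds, matching req.StopTime in the diff):

package main

import (
    "fmt"
    "time"
)

func main() {
    // Hypothetical values: a job that started at 10:00 and stopped at 10:30.
    startTime := time.Date(2023, 7, 1, 10, 0, 0, 0, time.UTC)
    stopTime := startTime.Add(30 * time.Minute).Unix()

    // Mirrors the guard in checkAndHandleStopJob: a stop time at or
    // before the start time is rejected.
    if startTime.Unix() >= stopTime {
        fmt.Println("stopTime must be larger than startTime")
        return
    }

    // Mirrors job.Duration = int32(req.StopTime - job.StartTime.Unix())
    duration := int32(stopTime - startTime.Unix())
    fmt.Println(duration) // 1800 (seconds)
}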
@@ -269,10 +285,7 @@ func (sd *SlurmRestScheduler) HandleJobsResponse(jobsResponse openapi.V0038JobsR
         // is "running" one of JSON state?
         if *job.JobState == "running" {
 
-            // Check if combination of (job_id, cluster_id, start_time) already exists:
-            // jobs, err := sd.JobRepository.FindRunningJobs(job.Cluster, job.StartTime)
-
-            jobs, err := sd.FindRunningJobs(job.Cluster)
+            jobs, err := sd.JobRepository.FindRunningJobs(*job.Cluster)
             if err != nil {
                 log.Fatalf("Failed to find running jobs: %v", err)
             }
@@ -298,6 +311,42 @@ func (sd *SlurmRestScheduler) HandleJobsResponse(jobsResponse openapi.V0038JobsR
                 log.Fatalf("JSON marshaling failed: %s", err)
             }
 
+            var resources []*schema.Resource
+
+            // Define a regular expression to match "gres/gpu=x"
+            regex := regexp.MustCompile(`gres/gpu=(\d+)`)
+
+            // Find all matches in the input string
+            matches := regex.FindAllStringSubmatch(*job.TresAllocStr, -1)
+
+            // Initialize a variable to store the total number of GPUs
+            var totalGPUs int32
+            // Iterate through the matches
+            match := matches[0]
+            if len(match) == 2 {
+                gpuCount, _ := strconv.Atoi(match[1])
+                totalGPUs += int32(gpuCount)
+            }
+
+            for _, node := range job.JobResources.AllocatedNodes {
+                var res schema.Resource
+                res.Hostname = *node.Nodename
+                for k, v := range node.Sockets.Cores {
+                    fmt.Printf("core id[%s] value[%s]\n", k, v)
+                    threadID, _ := strconv.Atoi(k)
+                    res.HWThreads = append(res.HWThreads, threadID)
+                }
+                res.Accelerators = append(res.Accelerators, *job.TresAllocStr)
+                // cpu=512,mem=1875G,node=4,billing=512,gres\/gpu=32,gres\/gpu:a40=32
+                resources = append(resources, &res)
+            }
+
+            var metaData map[string]string
+            metaData["jobName"] = *job.Name
+            metaData["slurmInfo"] = printSlurmInfo(job)
+            // metaData["jobScript"] = "What to put here?"
+            metaDataInBytes, err := json.Marshal(metaData)
+
             var defaultJob schema.BaseJob = schema.BaseJob{
                 JobID: int64(*job.JobId),
                 User: *job.UserName,
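The GPU count for NumAcc is recovered from Slurm's TRES allocation string via the gres/gpu=(\d+) pattern. A standalone sketch of that extraction, using the sample string quoted in the code comment above and an added guard against an empty match list (the committed code reads matches[0] directly):

package main

import (
    "fmt"
    "regexp"
    "strconv"
)

func main() {
    // Example TRES allocation string taken from the comment in the diff.
    tresAllocStr := "cpu=512,mem=1875G,node=4,billing=512,gres/gpu=32,gres/gpu:a40=32"

    // Same pattern as in HandleJobsResponse: capture the digits after "gres/gpu=".
    regex := regexp.MustCompile(`gres/gpu=(\d+)`)
    matches := regex.FindAllStringSubmatch(tresAllocStr, -1)

    var totalGPUs int32
    if len(matches) > 0 && len(matches[0]) == 2 {
        gpuCount, _ := strconv.Atoi(matches[0][1])
        totalGPUs += int32(gpuCount)
    }

    fmt.Println(totalGPUs) // 32; the typed "gres/gpu:a40=32" entry is not matched by this pattern
}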
@@ -308,7 +357,7 @@ func (sd *SlurmRestScheduler) HandleJobsResponse(jobsResponse openapi.V0038JobsR
                 ArrayJobId: int64(*job.ArrayJobId),
                 NumNodes: *job.NodeCount,
                 NumHWThreads: *job.Cpus,
-                // NumAcc: job.NumAcc,
+                NumAcc: totalGPUs,
                 Exclusive: exclusive,
                 // MonitoringStatus: job.MonitoringStatus,
                 // SMT: *job.TasksPerCore,
@@ -321,10 +370,10 @@ func (sd *SlurmRestScheduler) HandleJobsResponse(jobsResponse openapi.V0038JobsR
                 RawResources: jobResourcesInBytes,
                 // "job_resources": "allocated_nodes" "sockets":
                 // very important; has to be right
-                Resources: job.JobResources,
-                // RawMetaData: job.RawMetaData,
+                Resources: resources,
+                RawMetaData: metaDataInBytes,
                 // optional metadata with'jobScript 'jobName': 'slurmInfo':
-                // MetaData: job.MetaData,
+                MetaData: metaData,
                 // ConcurrentJobs: job.ConcurrentJobs,
             }
             req := &schema.JobMeta{
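The job metadata is attached twice: as the decoded map in MetaData and JSON-encoded in RawMetaData, mirroring the Resources/RawResources pair above. A small sketch of that encoding step (the map is initialized with a literal here; names and values are illustrative):

package main

import (
    "encoding/json"
    "fmt"
)

func main() {
    // Build the metadata map as the commit does (initialized here,
    // whereas the diff declares it with var and no make).
    metaData := map[string]string{
        "jobName":   "example_job",
        "slurmInfo": "JobId=42 JobName=example_job",
    }

    // RawMetaData is the JSON-encoded form that ends up in the database.
    rawMetaData, err := json.Marshal(metaData)
    if err != nil {
        fmt.Println("JSON marshaling failed:", err)
        return
    }

    fmt.Println(string(rawMetaData))
    // {"jobName":"example_job","slurmInfo":"JobId=42 JobName=example_job"}
}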
@@ -342,23 +391,24 @@ func (sd *SlurmRestScheduler) HandleJobsResponse(jobsResponse openapi.V0038JobsR
             }
         } else {
             // Check if completed job with combination of (job_id, cluster_id, start_time) already exists:
-            existingJob, err := sd.JobRepository.Find(job.JobID, &job.Cluster, job.StartTime)
-
-            if err == nil {
-                existingJob.BaseJob.Duration = *job.EndTime - *job.StartTime
-                existingJob.BaseJob.State = schema.JobState(*job.JobState)
-                existingJob.BaseJob.Walltime = job.StartTime
             var jobID int64
             jobID = int64(*job.JobId)
+            existingJob, err := sd.JobRepository.Find(&jobID, job.Cluster, job.StartTime)
+
+            if err == nil {
+                existingJob.BaseJob.Duration = int32(*job.EndTime - *job.StartTime)
+                existingJob.BaseJob.State = schema.JobState(*job.JobState)
+                existingJob.BaseJob.Walltime = *job.StartTime
+
             req := &StopJobRequest{
                 Cluster: job.Cluster,
                 JobId: &jobID,
                 State: schema.JobState(*job.JobState),
-                StartTime: existingJob.StartTime,
+                StartTime: &existingJob.StartTimeUnix,
                 StopTime: *job.EndTime,
             }
             // req := new(schema.JobMeta)
-            id, err := sd.JobRepository.checkAndHandleStopJob(job, req)
+            sd.checkAndHandleStopJob(existingJob, req)
             }
 
     }
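For the completed-job branch, the duration is again derived from Unix seconds, this time by dereferencing the scheduler's pointer-valued timestamps before narrowing to int32 (assuming the generated client exposes start and end time as integer pointers, as the dereferences in the diff suggest). A minimal sketch with hypothetical values:

package main

import "fmt"

func main() {
    // Hypothetical pointer-valued timestamps as returned by the Slurm REST API.
    start := int64(1_688_205_600) // job start, Unix seconds
    end := int64(1_688_209_200)   // job end, Unix seconds
    startTime := &start
    endTime := &end

    // Mirrors existingJob.BaseJob.Duration = int32(*job.EndTime - *job.StartTime)
    duration := int32(*endTime - *startTime)
    fmt.Println(duration) // 3600
}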