cc-backend/internal/scheduler/slurmRest.go

// Copyright (C) 2023 NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
package scheduler

import (
	"database/sql"
	"encoding/json"
	"fmt"
	"io"
	"math"
	"net/http"
	"net/url"
	"os"
	"os/exec"
	"strconv"
	"sync"
	"time"

	"github.com/ClusterCockpit/cc-backend/internal/repository"
	"github.com/ClusterCockpit/cc-backend/pkg/log"
	"github.com/ClusterCockpit/cc-backend/pkg/schema"
)

// Response maps the slurmrestd jobs response.
// NOTE: slurmrestd (v0.0.38) returns the job list under the "jobs" key.
type Response struct {
	Name string `json:"name"`
	Jobs []Job  `json:"jobs"`
}

type SlurmRestSchedulerConfig struct {
	URL string `json:"url"`
}

type SlurmRestScheduler struct {
	url string

	// JobRepository and RepositoryMutex are referenced by Sync below;
	// their initialization is assumed to happen outside this file.
	JobRepository   *repository.JobRepository
	RepositoryMutex sync.Mutex
}

var client *http.Client
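
// queryDB queries the slurmdbd REST endpoint for job records. The qtime and
// clusterName arguments are not used yet; the user list and submit_time
// filters below are hardcoded placeholders.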
func queryDB(qtime int64, clusterName string) ([]interface{}, error) {
	apiEndpoint := "/slurmdb/v0.0.38/jobs"

	// Construct the query parameters (placeholder values for now)
	queryParams := url.Values{}
	queryParams.Set("users", "user1,user2")
	queryParams.Set("submit_time", "2023-01-01T00:00:00")

	// Add the query parameters to the API endpoint
	apiEndpoint += "?" + queryParams.Encode()

	// Create a new HTTP GET request
	req, err := http.NewRequest("GET", apiEndpoint, nil)
	if err != nil {
		return nil, fmt.Errorf("error creating request: %w", err)
	}

	// Send the request
	resp, err := client.Do(req)
	if err != nil {
		return nil, fmt.Errorf("error sending request: %w", err)
	}
	defer resp.Body.Close()

	// Check the response status code
	if resp.StatusCode != http.StatusOK {
		return nil, fmt.Errorf("API request failed with status: %s", resp.Status)
	}

	// Read the whole response body; resp.Body.Read into a nil slice
	// would read zero bytes.
	dbOutput, err := io.ReadAll(resp.Body)
	if err != nil {
		return nil, fmt.Errorf("error reading response body: %w", err)
	}
	log.Debugf("API response: %s", string(dbOutput))

	dataJobs := make(map[string]interface{})
	err = json.Unmarshal(dbOutput, &dataJobs)
	if err != nil {
		return nil, fmt.Errorf("error parsing JSON response: %w", err)
	}

	if _, ok := dataJobs["jobs"]; !ok {
		return nil, fmt.Errorf("jobs not found - response incomplete")
	}

	jobs, _ := dataJobs["jobs"].([]interface{})
	return jobs, nil
}
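
// queryAllJobs fetches the current job list from the slurmrestd REST
// endpoint and decodes it into the Response struct defined above.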
func queryAllJobs() ([]Job, error) {
	apiEndpoint := "/slurm/v0.0.38/jobs"

	// Create a new HTTP GET request
	req, err := http.NewRequest("GET", apiEndpoint, nil)
	if err != nil {
		return nil, fmt.Errorf("error creating request: %w", err)
	}

	// Send the request
	resp, err := client.Do(req)
	if err != nil {
		return nil, fmt.Errorf("error sending request: %w", err)
	}
	defer resp.Body.Close()

	// Check the response status code
	if resp.StatusCode != http.StatusOK {
		return nil, fmt.Errorf("API request failed with status: %s", resp.Status)
	}

	ctlOutput, err := io.ReadAll(resp.Body)
	if err != nil {
		return nil, fmt.Errorf("error reading response body: %w", err)
	}

	var jobsResponse Response
	err = json.Unmarshal(ctlOutput, &jobsResponse)
	if err != nil {
		return nil, fmt.Errorf("error parsing JSON response: %w", err)
	}

	return jobsResponse.Jobs, nil
}
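
// printSlurmInfo renders an scontrol-style textual summary of a single job
// as returned by slurmrestd.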
func printSlurmInfo(job map[string]interface{}) string {
	cpusPerTask := "1"
	// JSON numbers decode to float64, never int, so fall back to float64
	// if the value is not a string.
	tasksStr, ok := job["tasks"].(string)
	if !ok {
		tasksFloat, _ := job["tasks"].(float64)
		tasksStr = strconv.Itoa(int(tasksFloat))
	}
	cpusStr, ok := job["cpus"].(string)
	if !ok {
		cpusFloat, _ := job["cpus"].(float64)
		cpusStr = strconv.Itoa(int(cpusFloat))
	}
	tasks, _ := strconv.Atoi(tasksStr)
	cpus, _ := strconv.Atoi(cpusStr)
	if tasks > 0 {
		cpusPerTask = strconv.Itoa(int(math.Round(float64(cpus) / float64(tasks))))
	}

	text := fmt.Sprintf(`
JobId=%v JobName=%v
UserId=%v(%v) GroupId=%v
Account=%v QOS=%v
Requeue=%v Restarts=%v BatchFlag=%v
TimeLimit=%v
SubmitTime=%v
Partition=%v
NodeList=%v
NumNodes=%v NumCPUs=%v NumTasks=%v CPUs/Task=%v
NTasksPerNode:Socket:Core=%v:%v:%v
TRES_req=%v
TRES_alloc=%v
Command=%v
WorkDir=%v
StdErr=%v
StdOut=%v`,
		job["job_id"], job["name"],
		job["user_name"], job["user_id"], job["group_id"],
		job["account"], job["qos"],
		job["requeue"], job["restart_cnt"], job["batch_flag"],
		job["time_limit"],
		time.Unix(int64(job["submit_time"].(float64)), 0).Format(time.RFC3339),
		job["partition"],
		job["nodes"],
		job["node_count"], cpus, tasks, cpusPerTask,
		job["tasks_per_node"], job["tasks_per_socket"], job["tasks_per_core"],
		job["tres_req_str"],
		job["tres_alloc_str"],
		job["command"],
		job["current_working_directory"],
		job["standard_error"],
		job["standard_output"],
	)
	return text
}
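
// exitWithError reports the given error (including the slurmrestd output
// for exec errors) and terminates the process.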
func exitWithError(err error, output []byte) {
	if exitError, ok := err.(*exec.ExitError); ok {
		if exitError.ExitCode() == 28 {
			fmt.Fprintf(os.Stderr, "ERROR: API call failed with timeout; check slurmrestd.\nOutput:\n%s\n", output)
		} else {
			fmt.Fprintf(os.Stderr, "ERROR: API call failed with code %d;\nOutput:\n%s\n", exitError.ExitCode(), output)
		}
	} else {
		log.Errorf("ERROR: %v", err)
	}
	os.Exit(1)
}
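
// loadClusterConfig reads the cluster topology (cores/GPU ids) from a
// JSON file.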
func loadClusterConfig(filename string) (map[string]interface{}, error) {
	clusterConfigData := make(map[string]interface{})

	file, err := os.Open(filename)
	if err != nil {
		log.Errorf("Cluster config file not found. No cores/GPU ids available.")
		return clusterConfigData, err
	}
	defer file.Close()

	decoder := json.NewDecoder(file)
	err = decoder.Decode(&clusterConfigData)
	if err != nil {
		log.Errorf("Error decoding cluster config file: %v", err)
	}

	return clusterConfigData, err
}
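
// Init loads the cluster configuration and prepares the HTTP client used
// for all slurmrestd requests.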
func (sd *SlurmRestScheduler) Init(rawConfig json.RawMessage) error {
	clusterConfigData, err := loadClusterConfig("cluster-fritz.json")

	// Debug output of the loaded cluster configuration
	for k, v := range clusterConfigData {
		switch c := v.(type) {
		case string:
			fmt.Printf("Item %q is a string, containing %q\n", k, c)
		case float64:
			fmt.Printf("Item %q is a number, specifically %f\n", k, c)
		default:
			fmt.Printf("Item %q has unexpected type %T\n", k, c)
		}
	}

	// Create an HTTP client
	client = &http.Client{}

	return err
}
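
// Sync fetches all jobs from slurmrestd and reconciles them with the job
// repository: running jobs not yet in the database are started, and jobs
// that are no longer running are stopped.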
func (sd *SlurmRestScheduler) Sync() {
	jobs, err := queryAllJobs()
	if err != nil {
		log.Fatal(err.Error())
	}

	// Acquire the lock once for the whole sync to avoid races between
	// concurrent API calls and repository access. (Locking per iteration
	// with a deferred unlock would deadlock, since defers only run on
	// function exit.)
	sd.RepositoryMutex.Lock()
	defer sd.RepositoryMutex.Unlock()

	// Process each job
	for _, job := range jobs {
		fmt.Printf("Job ID: %s\n", job.JobID)
		fmt.Printf("Job Name: %s\n", *job.Name)
		fmt.Printf("Job State: %s\n", *job.JobState)
		fmt.Println("Job StartTime:", *job.StartTime)

		// NOTE: verify that "running" matches the job_state values slurmrestd returns
		if *job.JobState == "running" {
			// Check if the combination of (job_id, cluster_id, start_time) already exists
			existing, err := sd.JobRepository.FindAll(job.JobID, &job.Cluster, job.StartTime)
			if err != nil && err != sql.ErrNoRows {
				log.Errorf("checking for duplicate failed: %s", err.Error())
				return
			}

			if len(existing) == 0 {
				defaultJob := schema.BaseJob{
					JobID:            job.JobID,
					User:             job.User,
					Project:          job.Project,
					Cluster:          job.Cluster,
					SubCluster:       job.SubCluster,
					Partition:        job.Partition,
					ArrayJobId:       job.ArrayJobId,
					NumNodes:         job.NumNodes,
					NumHWThreads:     job.NumHWThreads,
					NumAcc:           job.NumAcc,
					Exclusive:        job.Exclusive,
					MonitoringStatus: job.MonitoringStatus,
					SMT:              job.SMT,
					State:            job.State,
					Duration:         job.Duration,
					Walltime:         job.Walltime,
					Tags:             job.Tags,
					RawResources:     job.RawResources,
					Resources:        job.Resources,
					RawMetaData:      job.RawMetaData,
					MetaData:         job.MetaData,
					ConcurrentJobs:   job.ConcurrentJobs,
				}
				req := &schema.JobMeta{
					BaseJob:    defaultJob,
					StartTime:  job.StartTime,
					Statistics: make(map[string]schema.JobStatistics),
				}
				id, err := sd.JobRepository.Start(req)
				if err != nil {
					log.Errorf("starting job failed: %s", err.Error())
				} else {
					log.Debugf("started job with dbid: %d", id)
				}
			} else {
				for _, dup := range existing {
					log.Errorf("a job with that jobId, cluster and startTime already exists: dbid: %d", dup.ID)
				}
			}
		} else {
			// Check if a completed job with (job_id, cluster_id, start_time) already exists
			existingJob, err := sd.JobRepository.Find(job.JobID, &job.Cluster, job.StartTime)
			if err == nil {
				existingJob.BaseJob.Duration = job.EndTime - job.StartTime
				existingJob.BaseJob.State = job.State
				existingJob.BaseJob.Walltime = job.Walltime
				req := &StopJobRequest{
					Cluster:   job.Cluster,
					JobId:     job.JobID,
					State:     job.State,
					StartTime: existingJob.StartTime,
					StopTime:  job.EndTime,
				}
				if _, err := sd.JobRepository.checkAndHandleStopJob(job, req); err != nil {
					log.Errorf("stopping job failed: %s", err.Error())
				}
			}
		}
	}
}