// Copyright (C) 2023 NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
package scheduler

import (
	"database/sql"
	"encoding/json"
	"fmt"
	"io"
	"math"
	"net/http"
	"net/url"
	"os"
	"os/exec"
	"strconv"
	"sync"
	"time"

	"github.com/ClusterCockpit/cc-backend/internal/repository"
	"github.com/ClusterCockpit/cc-backend/pkg/log"
	"github.com/ClusterCockpit/cc-backend/pkg/schema"
)

// A Response struct to map the entire response
type Response struct {
	Name string `json:"name"`
	Jobs []Job  `json:"job_entries"`
}

type SlurmRestSchedulerConfig struct {
	URL string `json:"url"`
}

type SlurmRestScheduler struct {
	url string

	// JobRepository and RepositoryMutex are assumed to be injected by the
	// caller; Sync below relies on both.
	JobRepository   *repository.JobRepository
	RepositoryMutex sync.Mutex
}

var (
	client  *http.Client
	baseURL string // slurmrestd base URL, set from the scheduler config in Init
)

// queryDB queries the slurmdbd REST endpoint for job records.
// NOTE: qtime and clusterName are not wired into the query yet; the
// users and submit_time parameters below are placeholder values.
func queryDB(qtime int64, clusterName string) ([]interface{}, error) {
	apiEndpoint := baseURL + "/slurmdb/v0.0.38/jobs"

	// Construct the query parameters
	queryParams := url.Values{}
	queryParams.Set("users", "user1,user2")
	queryParams.Set("submit_time", "2023-01-01T00:00:00")

	// Add the query parameters to the API endpoint
	apiEndpoint += "?" + queryParams.Encode()

	// Create a new HTTP GET request
	req, err := http.NewRequest("GET", apiEndpoint, nil)
	if err != nil {
		log.Errorf("Error creating request: %v", err)
		return nil, err
	}

	// Send the request
	resp, err := client.Do(req)
	if err != nil {
		log.Errorf("Error sending request: %v", err)
		return nil, err
	}
	defer resp.Body.Close()

	// Check the response status code
	if resp.StatusCode != http.StatusOK {
		log.Errorf("API request failed with status: %s", resp.Status)
		return nil, fmt.Errorf("API request failed with status: %s", resp.Status)
	}

	// Read the entire response body
	dbOutput, err := io.ReadAll(resp.Body)
	if err != nil {
		log.Errorf("Error reading response body: %v", err)
		return nil, err
	}
	log.Debugf("API response: %s", string(dbOutput))

	dataJobs := make(map[string]interface{})
	err = json.Unmarshal(dbOutput, &dataJobs)
	if err != nil {
		log.Errorf("Error parsing JSON response: %v", err)
		return nil, err
	}

	if _, ok := dataJobs["jobs"]; !ok {
		log.Errorf("ERROR: jobs not found - response incomplete")
		return nil, fmt.Errorf("jobs not found - response incomplete")
	}

	jobs, _ := dataJobs["jobs"].([]interface{})
	return jobs, nil
}

// queryAllJobs queries the slurmrestd endpoint for all current jobs and
// decodes them into the package's Job type.
func queryAllJobs() ([]Job, error) {
	apiEndpoint := baseURL + "/slurm/v0.0.38/jobs"

	// Create a new HTTP GET request
	req, err := http.NewRequest("GET", apiEndpoint, nil)
	if err != nil {
		log.Errorf("Error creating request: %v", err)
		return nil, err
	}

	// Send the request
	resp, err := client.Do(req)
	if err != nil {
		log.Errorf("Error sending request: %v", err)
		return nil, err
	}
	defer resp.Body.Close()

	// Check the response status code
	if resp.StatusCode != http.StatusOK {
		log.Errorf("API request failed with status: %s", resp.Status)
		return nil, fmt.Errorf("API request failed with status: %s", resp.Status)
	}

	// Read the entire response body
	ctlOutput, err := io.ReadAll(resp.Body)
	if err != nil {
		log.Errorf("Error reading response body: %v", err)
		return nil, err
	}

	var jobsResponse struct {
		Jobs []Job `json:"jobs"`
	}
	err = json.Unmarshal(ctlOutput, &jobsResponse)
	if err != nil {
		log.Errorf("Error parsing JSON response: %v", err)
		return nil, err
	}

	if jobsResponse.Jobs == nil {
		log.Errorf("ERROR: jobs not found - response incomplete")
		return nil, fmt.Errorf("jobs not found - response incomplete")
	}

	return jobsResponse.Jobs, nil
}
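// NOTE: The Job type referenced by Response and queryAllJobs above is assumed
// to be defined elsewhere in this package, mirroring a slurmrestd v0.0.38 job
// record. A minimal sketch covering only the fields this file relies on could
// look like the following; the field names and types here are assumptions,
// not the actual definition:
//
//	type Job struct {
//		JobID     int64   `json:"job_id"`
//		Name      *string `json:"name"`
//		JobState  *string `json:"job_state"`
//		StartTime *int64  `json:"start_time"`
//		EndTime   int64   `json:"end_time"`
//		Cluster   string  `json:"cluster"`
//		Partition string  `json:"partition"`
//		// ... further fields used in Sync (User, Project, NumNodes, ...)
//	}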
func printSlurmInfo(job map[string]interface{}) string {
	cpusPerTask := "1"

	// Values may arrive as strings or as JSON numbers (which decode to float64)
	tasksStr, ok := job["tasks"].(string)
	if !ok {
		tasksFloat, _ := job["tasks"].(float64)
		tasksStr = strconv.Itoa(int(tasksFloat))
	}
	cpusStr, ok := job["cpus"].(string)
	if !ok {
		cpusFloat, _ := job["cpus"].(float64)
		cpusStr = strconv.Itoa(int(cpusFloat))
	}

	tasks, _ := strconv.Atoi(tasksStr)
	cpus, _ := strconv.Atoi(cpusStr)
	if tasks > 0 {
		cpusPerTask = strconv.Itoa(int(math.Round(float64(cpus) / float64(tasks))))
	}

	text := fmt.Sprintf(`
	JobId=%v JobName=%v
	UserId=%v(%v) GroupId=%v
	Account=%v QOS=%v
	Requeue=%v Restarts=%v BatchFlag=%v
	TimeLimit=%v SubmitTime=%v
	Partition=%v NodeList=%v
	NumNodes=%v NumCPUs=%v NumTasks=%v CPUs/Task=%v
	NTasksPerNode:Socket:Core=%v:%v:%v
	TRES_req=%v TRES_alloc=%v
	Command=%v
	WorkDir=%v
	StdErr=%v
	StdOut=%v`,
		job["job_id"], job["name"],
		job["user_name"], job["user_id"], job["group_id"],
		job["account"], job["qos"],
		job["requeue"], job["restart_cnt"], job["batch_flag"],
		job["time_limit"],
		time.Unix(int64(job["submit_time"].(float64)), 0).Format(time.RFC3339),
		job["partition"], job["nodes"],
		job["node_count"], cpus, tasks, cpusPerTask,
		job["tasks_per_node"], job["tasks_per_socket"], job["tasks_per_core"],
		job["tres_req_str"], job["tres_alloc_str"],
		job["command"], job["current_working_directory"],
		job["standard_error"], job["standard_output"],
	)

	return text
}

// exitWithError reports a failed API call and terminates the process.
// Exit code 28 corresponds to a timeout in the curl-based call this
// helper was originally written for.
func exitWithError(err error, output []byte) {
	if exitError, ok := err.(*exec.ExitError); ok {
		if exitError.ExitCode() == 28 {
			fmt.Fprintf(os.Stderr, "ERROR: API call failed with timeout; check slurmrestd.\nOutput:\n%s\n", output)
		} else {
			fmt.Fprintf(os.Stderr, "ERROR: API call failed with code %d;\nOutput:\n%s\n", exitError.ExitCode(), output)
		}
	} else {
		log.Errorf("ERROR: %v", err)
	}
	os.Exit(1)
}

func loadClusterConfig(filename string) (map[string]interface{}, error) {
	clusterConfigData := make(map[string]interface{})

	file, err := os.Open(filename)
	if err != nil {
		log.Errorf("Cluster config file not found. No cores/GPU ids available.")
		return clusterConfigData, err
	}
	defer file.Close()

	decoder := json.NewDecoder(file)
	err = decoder.Decode(&clusterConfigData)
	if err != nil {
		log.Errorf("Error decoding cluster config file: %v", err)
	}

	return clusterConfigData, err
}

func (sd *SlurmRestScheduler) Init(rawConfig json.RawMessage) error {
	// Parse the scheduler configuration to obtain the slurmrestd URL
	var config SlurmRestSchedulerConfig
	if err := json.Unmarshal(rawConfig, &config); err != nil {
		log.Errorf("Error unmarshaling scheduler config: %v", err)
		return err
	}
	sd.url = config.URL
	baseURL = config.URL

	clusterConfigData, err := loadClusterConfig("cluster-fritz.json")
	for k, v := range clusterConfigData {
		switch c := v.(type) {
		case string:
			fmt.Printf("Item %q is a string, containing %q\n", k, c)
		case float64:
			fmt.Printf("Looks like item %q is a number, specifically %f\n", k, c)
		default:
			fmt.Printf("Not sure what type item %q is, but I think it might be %T\n", k, c)
		}
	}

	// Create an HTTP client
	client = &http.Client{}

	return err
}
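// Sketch (assumption, not wired into the requests above): production
// slurmrestd deployments typically require JWT authentication via the
// X-SLURM-USER-NAME and X-SLURM-USER-TOKEN headers, with the token usually
// exported as SLURM_JWT by `scontrol token`. A helper like this could be
// called on each request built in queryDB and queryAllJobs; the environment
// variable names are conventions, not part of this package.
func addSlurmAuth(req *http.Request) {
	if user, ok := os.LookupEnv("SLURM_USER"); ok {
		req.Header.Set("X-SLURM-USER-NAME", user)
	}
	if token, ok := os.LookupEnv("SLURM_JWT"); ok {
		req.Header.Set("X-SLURM-USER-TOKEN", token)
	}
}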
func (sd *SlurmRestScheduler) Sync() {
	jobs, err := queryAllJobs()
	if err != nil {
		log.Fatal(err.Error())
	}

	// Iterate over the fetched jobs
	for _, job := range jobs {
		fmt.Printf("Job ID: %v\n", job.JobID)
		fmt.Printf("Job Name: %s\n", *job.Name)
		fmt.Printf("Job State: %s\n", *job.JobState)
		fmt.Println("Job StartTime:", *job.StartTime)

		// Acquire the lock to avoid a race condition between API calls.
		// The sync.Once makes the deferred unlock a no-op once the lock
		// has been released at the end of the iteration.
		var unlockOnce sync.Once
		sd.RepositoryMutex.Lock()
		defer unlockOnce.Do(sd.RepositoryMutex.Unlock)

		// slurmrestd reports job states in upper case, e.g. "RUNNING"
		if *job.JobState == "RUNNING" {
			// Check whether the combination of (job_id, cluster_id, start_time) already exists
			dbJobs, err := sd.JobRepository.FindAll(job.JobID, &job.Cluster, job.StartTime)
			if err != nil && err != sql.ErrNoRows {
				log.Errorf("checking for duplicate failed: %s", err.Error())
				return
			}

			if len(dbJobs) == 0 {
				defaultJob := schema.BaseJob{
					JobID:            job.JobID,
					User:             job.User,
					Project:          job.Project,
					Cluster:          job.Cluster,
					SubCluster:       job.SubCluster,
					Partition:        job.Partition,
					ArrayJobId:       job.ArrayJobId,
					NumNodes:         job.NumNodes,
					NumHWThreads:     job.NumHWThreads,
					NumAcc:           job.NumAcc,
					Exclusive:        job.Exclusive,
					MonitoringStatus: job.MonitoringStatus,
					SMT:              job.SMT,
					State:            job.State,
					Duration:         job.Duration,
					Walltime:         job.Walltime,
					Tags:             job.Tags,
					RawResources:     job.RawResources,
					Resources:        job.Resources,
					RawMetaData:      job.RawMetaData,
					MetaData:         job.MetaData,
					ConcurrentJobs:   job.ConcurrentJobs,
				}
				req := &schema.JobMeta{
					BaseJob:    defaultJob,
					StartTime:  job.StartTime,
					Statistics: make(map[string]schema.JobStatistics),
				}

				id, err := sd.JobRepository.Start(req)
				if err != nil {
					log.Errorf("starting job failed: %s", err.Error())
				} else {
					log.Debugf("started job with dbid: %d", id)
				}
			} else {
				for _, dbJob := range dbJobs {
					log.Errorf("a job with that jobId, cluster and startTime already exists: dbid: %d", dbJob.ID)
				}
			}
		} else {
			// Check whether a completed job with the combination of
			// (job_id, cluster_id, start_time) already exists
			existingJob, err := sd.JobRepository.Find(job.JobID, &job.Cluster, job.StartTime)
			if err == nil {
				existingJob.BaseJob.Duration = job.EndTime - job.StartTime
				existingJob.BaseJob.State = job.State
				existingJob.BaseJob.Walltime = job.StartTime

				// StopJobRequest and checkAndHandleStopJob are assumed to be
				// provided elsewhere in this draft package
				req := &StopJobRequest{
					Cluster:   job.Cluster,
					JobId:     job.JobID,
					State:     job.State,
					StartTime: existingJob.StartTime,
					StopTime:  job.StartTime,
				}

				id, err := sd.JobRepository.checkAndHandleStopJob(job, req)
				if err != nil {
					log.Errorf("stopping job failed: %s", err.Error())
				} else {
					log.Debugf("stopped job with dbid: %d", id)
				}
			}
		}

		// Release the lock for this iteration; the deferred unlock is now a no-op
		unlockOnce.Do(sd.RepositoryMutex.Unlock)
	}
}
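// Usage sketch (assumption, not part of the upstream wiring): how a caller
// might drive this scheduler. The config literal and the 30-second interval
// are illustrative only; a real caller would also inject sd.JobRepository
// before calling Sync.
func runSchedulerSketch() {
	sd := &SlurmRestScheduler{}
	if err := sd.Init(json.RawMessage(`{"url": "http://localhost:6820"}`)); err != nil {
		log.Fatal(err.Error())
	}

	// Poll slurmrestd periodically and reconcile the job database
	ticker := time.NewTicker(30 * time.Second)
	defer ticker.Stop()
	for range ticker.C {
		sd.Sync()
	}
}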