mirror of
https://github.com/ClusterCockpit/cc-backend
synced 2024-12-26 05:19:05 +01:00
Added JSON Payload Converter
This commit is contained in:
parent
59f6658344
commit
a3fbdbcf90
2
go.mod
2
go.mod
@ -33,7 +33,7 @@ require (
|
|||||||
require (
|
require (
|
||||||
github.com/Azure/go-ntlmssp v0.0.0-20221128193559-754e69321358 // indirect
|
github.com/Azure/go-ntlmssp v0.0.0-20221128193559-754e69321358 // indirect
|
||||||
github.com/ClusterCockpit/slurm-rest-client-0_0_37 v0.0.0-20230901125459-dc653ac37420 // indirect
|
github.com/ClusterCockpit/slurm-rest-client-0_0_37 v0.0.0-20230901125459-dc653ac37420 // indirect
|
||||||
github.com/ClusterCockpit/slurm-rest-client-0_0_38 v0.0.0-20230906120742-0f15562ea666 // indirect
|
github.com/ClusterCockpit/slurm-rest-client-0_0_38 v0.0.0-20231010134848-707e8b20bde7 // indirect
|
||||||
github.com/KyleBanks/depth v1.2.1 // indirect
|
github.com/KyleBanks/depth v1.2.1 // indirect
|
||||||
github.com/agnivade/levenshtein v1.1.1 // indirect
|
github.com/agnivade/levenshtein v1.1.1 // indirect
|
||||||
github.com/apapsch/go-jsonmerge/v2 v2.0.0 // indirect
|
github.com/apapsch/go-jsonmerge/v2 v2.0.0 // indirect
|
||||||
|
2
go.sum
2
go.sum
@ -86,6 +86,8 @@ github.com/ClusterCockpit/slurm-rest-client-0_0_37 v0.0.0-20230901125459-dc653ac
|
|||||||
github.com/ClusterCockpit/slurm-rest-client-0_0_37 v0.0.0-20230901125459-dc653ac37420/go.mod h1:oNgVG2puNj9cNw/KgqLbgE1pPOn8jXORX3ErP58LcAA=
|
github.com/ClusterCockpit/slurm-rest-client-0_0_37 v0.0.0-20230901125459-dc653ac37420/go.mod h1:oNgVG2puNj9cNw/KgqLbgE1pPOn8jXORX3ErP58LcAA=
|
||||||
github.com/ClusterCockpit/slurm-rest-client-0_0_38 v0.0.0-20230906120742-0f15562ea666 h1:8PofHcOwEMmeAFqJjvAEgnu7rbRHAwJhd2XJ9u/YxiU=
|
github.com/ClusterCockpit/slurm-rest-client-0_0_38 v0.0.0-20230906120742-0f15562ea666 h1:8PofHcOwEMmeAFqJjvAEgnu7rbRHAwJhd2XJ9u/YxiU=
|
||||||
github.com/ClusterCockpit/slurm-rest-client-0_0_38 v0.0.0-20230906120742-0f15562ea666/go.mod h1:vxaj1my0GNoCXx4bYyOTA/IZP/IOZImtdOIn4T7pCa4=
|
github.com/ClusterCockpit/slurm-rest-client-0_0_38 v0.0.0-20230906120742-0f15562ea666/go.mod h1:vxaj1my0GNoCXx4bYyOTA/IZP/IOZImtdOIn4T7pCa4=
|
||||||
|
github.com/ClusterCockpit/slurm-rest-client-0_0_38 v0.0.0-20231010134848-707e8b20bde7 h1:YY/qDtFsp1DOJw/jyobiIBiIh1/yD2IVOdcK7EVEIKs=
|
||||||
|
github.com/ClusterCockpit/slurm-rest-client-0_0_38 v0.0.0-20231010134848-707e8b20bde7/go.mod h1:vxaj1my0GNoCXx4bYyOTA/IZP/IOZImtdOIn4T7pCa4=
|
||||||
github.com/KyleBanks/depth v1.2.1 h1:5h8fQADFrWtarTdtDudMmGsC7GPbOAu6RVB3ffsVFHc=
|
github.com/KyleBanks/depth v1.2.1 h1:5h8fQADFrWtarTdtDudMmGsC7GPbOAu6RVB3ffsVFHc=
|
||||||
github.com/KyleBanks/depth v1.2.1/go.mod h1:jzSb9d0L43HxTQfT+oSA1EEp2q+ne2uh6XgeJcm8brE=
|
github.com/KyleBanks/depth v1.2.1/go.mod h1:jzSb9d0L43HxTQfT+oSA1EEp2q+ne2uh6XgeJcm8brE=
|
||||||
github.com/Masterminds/squirrel v1.5.3 h1:YPpoceAcxuzIljlr5iWpNKaql7hLeG1KLSrhvdHpkZc=
|
github.com/Masterminds/squirrel v1.5.3 h1:YPpoceAcxuzIljlr5iWpNKaql7hLeG1KLSrhvdHpkZc=
|
||||||
|
278
internal/scheduler/payloadConverter.go
Normal file
278
internal/scheduler/payloadConverter.go
Normal file
@ -0,0 +1,278 @@
|
|||||||
|
// Copyright (C) 2023 NHR@FAU, University Erlangen-Nuremberg.
|
||||||
|
// All rights reserved.
|
||||||
|
// Use of this source code is governed by a MIT-style
|
||||||
|
// license that can be found in the LICENSE file.
|
||||||
|
|
||||||
|
package scheduler
|
||||||
|
|
||||||
|
import (
|
||||||
|
"encoding/json"
|
||||||
|
"fmt"
|
||||||
|
"os"
|
||||||
|
|
||||||
|
"github.com/ClusterCockpit/cc-backend/pkg/log"
|
||||||
|
)
|
||||||
|
|
||||||
|
// MetricConfig is the JSON shape of a single entry in the "metricConfig"
// list of a cluster configuration file (see ClusterConfig.MetricConfig).
//
// Peak, Normal, Caution and Alert are decoded as plain numbers; their unit is
// presumably the one named in Unit.Base — confirm against the configuration
// schema. The unit of Timestep is not stated in this file.
type MetricConfig struct {
	Name string `json:"name"`
	Unit struct {
		Base string `json:"base"`
	} `json:"unit"`
	Scope       string  `json:"scope"`
	Aggregation string  `json:"aggregation"`
	Timestep    int     `json:"timestep"`
	Peak        float64 `json:"peak"`
	Normal      float64 `json:"normal"`
	Caution     float64 `json:"caution"`
	Alert       float64 `json:"alert"`
}
|
||||||
|
|
||||||
|
// SubCluster is the JSON shape of a single entry in the "subClusters" list of
// a cluster configuration file (see ClusterConfig.SubClusters).
//
// Nodes is kept as the raw string from the file; it is not expanded here.
// FlopRateScalar, FlopRateSimd and MemoryBandwidth all share the same layout:
// a numeric Value plus a Unit split into Base and Prefix.
// Topology holds ID lists per level (node, socket, memory domain, core) and
// the list of accelerators attached to the node.
type SubCluster struct {
	Name           string `json:"name"`
	Nodes          string `json:"nodes"`
	ProcessorType  string `json:"processorType"`
	SocketsPerNode int    `json:"socketsPerNode"`
	CoresPerSocket int    `json:"coresPerSocket"`
	ThreadsPerCore int    `json:"threadsPerCore"`
	FlopRateScalar struct {
		Unit struct {
			Base   string `json:"base"`
			Prefix string `json:"prefix"`
		} `json:"unit"`
		Value float64 `json:"value"`
	} `json:"flopRateScalar"`
	FlopRateSimd struct {
		Unit struct {
			Base   string `json:"base"`
			Prefix string `json:"prefix"`
		} `json:"unit"`
		Value float64 `json:"value"`
	} `json:"flopRateSimd"`
	MemoryBandwidth struct {
		Unit struct {
			Base   string `json:"base"`
			Prefix string `json:"prefix"`
		} `json:"unit"`
		Value float64 `json:"value"`
	} `json:"memoryBandwidth"`
	Topology struct {
		Node         []int   `json:"node"`
		Socket       [][]int `json:"socket"`
		MemoryDomain [][]int `json:"memoryDomain"`
		Core         [][]int `json:"core"`
		Accelerators []struct {
			ID    string `json:"id"`
			Type  string `json:"type"`
			Model string `json:"model"`
		} `json:"accelerators"`
	} `json:"topology"`
}
|
||||||
|
|
||||||
|
type ClusterConfig struct {
|
||||||
|
Name string `json:"name"`
|
||||||
|
MetricConfig []MetricConfig `json:"metricConfig"`
|
||||||
|
SubClusters []SubCluster `json:"subClusters"`
|
||||||
|
}
|
||||||
|
|
||||||
|
// Meta is the JSON shape of the "meta" object of a Slurm payload (see
// SlurmData.Meta): the plugin that produced the payload and the Slurm version.
//
// Note that the Slurm object is tagged with a capitalised key ("Slurm"),
// unlike every other key in this file.
type Meta struct {
	Plugin struct {
		Type string `json:"type"`
		Name string `json:"name"`
	} `json:"plugin"`
	Slurm struct {
		Version struct {
			Major int `json:"major"`
			Micro int `json:"micro"`
			Minor int `json:"minor"`
		} `json:"version"`
		Release string `json:"release"`
	} `json:"Slurm"`
}
|
||||||
|
|
||||||
|
// Job is the JSON shape of a single entry in the "jobs" array of a Slurm
// payload (see SlurmData.Jobs). Field names follow the snake_case keys of the
// payload one-to-one.
//
// Fields declared as interface{} are decoded without a fixed Go type, so
// callers must type-switch on them before use. Presumably Slurm emits these
// keys with varying shapes (number, null or object) — confirm against the
// Slurm REST API schema for the version in use.
//
// The *Time fields are plain ints; NOTE(review): they look like Unix
// timestamps, but nothing in this file establishes that.
type Job struct {
	Account                  string      `json:"account"`
	AccrueTime               int         `json:"accrue_time"`
	AdminComment             string      `json:"admin_comment"`
	ArrayJobID               int         `json:"array_job_id"`
	ArrayTaskID              interface{} `json:"array_task_id"`
	ArrayMaxTasks            int         `json:"array_max_tasks"`
	ArrayTaskString          string      `json:"array_task_string"`
	AssociationID            int         `json:"association_id"`
	BatchFeatures            string      `json:"batch_features"`
	BatchFlag                bool        `json:"batch_flag"`
	BatchHost                string      `json:"batch_host"`
	Flags                    []string    `json:"flags"`
	BurstBuffer              string      `json:"burst_buffer"`
	BurstBufferState         string      `json:"burst_buffer_state"`
	Cluster                  string      `json:"cluster"`
	ClusterFeatures          string      `json:"cluster_features"`
	Command                  string      `json:"command"`
	Comment                  string      `json:"comment"`
	Container                string      `json:"container"`
	Contiguous               bool        `json:"contiguous"`
	CoreSpec                 interface{} `json:"core_spec"`
	ThreadSpec               interface{} `json:"thread_spec"`
	CoresPerSocket           interface{} `json:"cores_per_socket"`
	BillableTres             interface{} `json:"billable_tres"`
	CPUPerTask               interface{} `json:"cpus_per_task"`
	CPUFrequencyMinimum      interface{} `json:"cpu_frequency_minimum"`
	CPUFrequencyMaximum      interface{} `json:"cpu_frequency_maximum"`
	CPUFrequencyGovernor     interface{} `json:"cpu_frequency_governor"`
	CPUPerTres               string      `json:"cpus_per_tres"`
	Deadline                 int         `json:"deadline"`
	DelayBoot                int         `json:"delay_boot"`
	Dependency               string      `json:"dependency"`
	DerivedExitCode          int         `json:"derived_exit_code"`
	EligibleTime             int         `json:"eligible_time"`
	EndTime                  int         `json:"end_time"`
	ExcludedNodes            string      `json:"excluded_nodes"`
	ExitCode                 int         `json:"exit_code"`
	Features                 string      `json:"features"`
	FederationOrigin         string      `json:"federation_origin"`
	FederationSiblingsActive string      `json:"federation_siblings_active"`
	FederationSiblingsViable string      `json:"federation_siblings_viable"`
	GresDetail               []string    `json:"gres_detail"`
	GroupID                  int         `json:"group_id"`
	GroupName                string      `json:"group_name"`
	JobID                    int         `json:"job_id"`
	JobState                 string      `json:"job_state"`
	LastSchedEvaluation      int         `json:"last_sched_evaluation"`
	Licenses                 string      `json:"licenses"`
	MaxCPUs                  int         `json:"max_cpus"`
	MaxNodes                 int         `json:"max_nodes"`
	MCSLabel                 string      `json:"mcs_label"`
	MemoryPerTres            string      `json:"memory_per_tres"`
	Name                     string      `json:"name"`
	Nodes                    string      `json:"nodes"`
	Nice                     interface{} `json:"nice"`
	TasksPerCore             interface{} `json:"tasks_per_core"`
	TasksPerNode             int         `json:"tasks_per_node"`
	TasksPerSocket           interface{} `json:"tasks_per_socket"`
	TasksPerBoard            int         `json:"tasks_per_board"`
	CPUs                     int         `json:"cpus"`
	NodeCount                int         `json:"node_count"`
	Tasks                    int         `json:"tasks"`
	HETJobID                 int         `json:"het_job_id"`
	HETJobIDSet              string      `json:"het_job_id_set"`
	HETJobOffset             int         `json:"het_job_offset"`
	Partition                string      `json:"partition"`
	MemoryPerNode            interface{} `json:"memory_per_node"`
	MemoryPerCPU             int         `json:"memory_per_cpu"`
	MinimumCPUsPerNode       int         `json:"minimum_cpus_per_node"`
	MinimumTmpDiskPerNode    int         `json:"minimum_tmp_disk_per_node"`
	PreemptTime              int         `json:"preempt_time"`
	PreSUSTime               int         `json:"pre_sus_time"`
	Priority                 int         `json:"priority"`
	Profile                  interface{} `json:"profile"`
	QOS                      string      `json:"qos"`
	Reboot                   bool        `json:"reboot"`
	RequiredNodes            string      `json:"required_nodes"`
	Requeue                  bool        `json:"requeue"`
	ResizeTime               int         `json:"resize_time"`
	RestartCnt               int         `json:"restart_cnt"`
	ResvName                 string      `json:"resv_name"`
	Shared                   interface{} `json:"shared"`
	ShowFlags                []string    `json:"show_flags"`
	SocketsPerBoard          int         `json:"sockets_per_board"`
	SocketsPerNode           interface{} `json:"sockets_per_node"`
	StartTime                int         `json:"start_time"`
	StateDescription         string      `json:"state_description"`
	StateReason              string      `json:"state_reason"`
	StandardError            string      `json:"standard_error"`
	StandardInput            string      `json:"standard_input"`
	StandardOutput           string      `json:"standard_output"`
	SubmitTime               int         `json:"submit_time"`
	SuspendTime              int         `json:"suspend_time"`
	SystemComment            string      `json:"system_comment"`
	TimeLimit                int         `json:"time_limit"`
	TimeMinimum              int         `json:"time_minimum"`
	ThreadsPerCore           interface{} `json:"threads_per_core"`
	TresBind                 string      `json:"tres_bind"`
	TresFreq                 string      `json:"tres_freq"`
	TresPerJob               string      `json:"tres_per_job"`
	TresPerNode              string      `json:"tres_per_node"`
	TresPerSocket            string      `json:"tres_per_socket"`
	TresPerTask              string      `json:"tres_per_task"`
	TresReqStr               string      `json:"tres_req_str"`
	TresAllocStr             string      `json:"tres_alloc_str"`
	UserID                   int         `json:"user_id"`
	UserName                 string      `json:"user_name"`
	Wckey                    string      `json:"wckey"`
	CurrentWorkingDirectory  string      `json:"current_working_directory"`
}
|
||||||
|
|
||||||
|
type SlurmData struct {
|
||||||
|
Meta Meta `json:"meta"`
|
||||||
|
Errors []interface{} `json:"errors"`
|
||||||
|
Jobs []Job `json:"jobs"`
|
||||||
|
}
|
||||||
|
|
||||||
|
func DecodeClusterConfig(filename string) (ClusterConfig, error) {
|
||||||
|
var clusterConfig ClusterConfig
|
||||||
|
|
||||||
|
file, err := os.Open(filename)
|
||||||
|
if err != nil {
|
||||||
|
log.Errorf("Cluster config file not found. No cores/GPU ids available.")
|
||||||
|
return clusterConfig, err
|
||||||
|
}
|
||||||
|
defer file.Close()
|
||||||
|
|
||||||
|
decoder := json.NewDecoder(file)
|
||||||
|
err = decoder.Decode(&clusterConfig)
|
||||||
|
if err != nil {
|
||||||
|
log.Errorf("Error decoding cluster config file: %v", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
log.Printf("Name: %s\n", clusterConfig.Name)
|
||||||
|
log.Printf("MetricConfig:\n")
|
||||||
|
for _, metric := range clusterConfig.MetricConfig {
|
||||||
|
log.Printf(" Name: %s\n", metric.Name)
|
||||||
|
log.Printf(" Unit Base: %s\n", metric.Unit.Base)
|
||||||
|
log.Printf(" Scope: %s\n", metric.Scope)
|
||||||
|
log.Printf(" Aggregation: %s\n", metric.Aggregation)
|
||||||
|
log.Printf(" Timestep: %d\n", metric.Timestep)
|
||||||
|
log.Printf(" Peak: %f\n", metric.Peak)
|
||||||
|
log.Printf(" Normal: %f\n", metric.Normal)
|
||||||
|
log.Printf(" Caution: %f\n", metric.Caution)
|
||||||
|
log.Printf(" Alert: %f\n", metric.Alert)
|
||||||
|
}
|
||||||
|
log.Printf("SubClusters:\n")
|
||||||
|
for _, subCluster := range clusterConfig.SubClusters {
|
||||||
|
log.Printf(" Name: %s\n", subCluster.Name)
|
||||||
|
log.Printf(" Nodes: %s\n", subCluster.Nodes)
|
||||||
|
log.Printf(" Processor Type: %s\n", subCluster.ProcessorType)
|
||||||
|
log.Printf(" Sockets Per Node: %d\n", subCluster.SocketsPerNode)
|
||||||
|
log.Printf(" Cores Per Socket: %d\n", subCluster.CoresPerSocket)
|
||||||
|
log.Printf(" Threads Per Core: %d\n", subCluster.ThreadsPerCore)
|
||||||
|
log.Printf(" Flop Rate Scalar Unit Base: %s\n", subCluster.FlopRateScalar.Unit.Base)
|
||||||
|
log.Printf(" Flop Rate Scalar Unit Prefix: %s\n", subCluster.FlopRateScalar.Unit.Prefix)
|
||||||
|
log.Printf(" Flop Rate Scalar Value: %f\n", subCluster.FlopRateScalar.Value)
|
||||||
|
log.Printf(" Flop Rate Simd Unit Base: %s\n", subCluster.FlopRateSimd.Unit.Base)
|
||||||
|
log.Printf(" Flop Rate Simd Unit Prefix: %s\n", subCluster.FlopRateSimd.Unit.Prefix)
|
||||||
|
log.Printf(" Flop Rate Simd Value: %f\n", subCluster.FlopRateSimd.Value)
|
||||||
|
log.Printf(" Memory Bandwidth Unit Base: %s\n", subCluster.MemoryBandwidth.Unit.Base)
|
||||||
|
log.Printf(" Memory Bandwidth Unit Prefix: %s\n", subCluster.MemoryBandwidth.Unit.Prefix)
|
||||||
|
log.Printf(" Memory Bandwidth Value: %f\n", subCluster.MemoryBandwidth.Value)
|
||||||
|
log.Printf(" Topology Node: %v\n", subCluster.Topology.Node)
|
||||||
|
log.Printf(" Topology Socket: %v\n", subCluster.Topology.Socket)
|
||||||
|
log.Printf(" Topology Memory Domain: %v\n", subCluster.Topology.MemoryDomain)
|
||||||
|
log.Printf(" Topology Core: %v\n", subCluster.Topology.Core)
|
||||||
|
log.Printf(" Topology Accelerators:\n")
|
||||||
|
for _, accelerator := range subCluster.Topology.Accelerators {
|
||||||
|
log.Printf(" ID: %s\n", accelerator.ID)
|
||||||
|
log.Printf(" Type: %s\n", accelerator.Type)
|
||||||
|
log.Printf(" Model: %s\n", accelerator.Model)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return clusterConfig, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func UnmarshalSlurmPayload(jsonPayload string) (SlurmData, error) {
|
||||||
|
var slurmData SlurmData
|
||||||
|
err := json.Unmarshal([]byte(jsonPayload), &slurmData)
|
||||||
|
if err != nil {
|
||||||
|
return slurmData, fmt.Errorf("failed to unmarshal JSON data: %v", err)
|
||||||
|
}
|
||||||
|
return slurmData, nil
|
||||||
|
}
|
@ -5,7 +5,6 @@
|
|||||||
package scheduler
|
package scheduler
|
||||||
|
|
||||||
import (
|
import (
|
||||||
"database/sql"
|
|
||||||
"encoding/json"
|
"encoding/json"
|
||||||
"fmt"
|
"fmt"
|
||||||
"net/http"
|
"net/http"
|
||||||
@ -16,6 +15,11 @@ import (
|
|||||||
"strconv"
|
"strconv"
|
||||||
"time"
|
"time"
|
||||||
|
|
||||||
|
"fmt"
|
||||||
|
"regexp"
|
||||||
|
"strconv"
|
||||||
|
"strings"
|
||||||
|
|
||||||
"github.com/ClusterCockpit/cc-backend/internal/repository"
|
"github.com/ClusterCockpit/cc-backend/internal/repository"
|
||||||
"github.com/ClusterCockpit/cc-backend/pkg/log"
|
"github.com/ClusterCockpit/cc-backend/pkg/log"
|
||||||
"github.com/ClusterCockpit/cc-backend/pkg/schema"
|
"github.com/ClusterCockpit/cc-backend/pkg/schema"
|
||||||
@ -28,7 +32,7 @@ type SlurmRestSchedulerConfig struct {
|
|||||||
|
|
||||||
JobRepository *repository.JobRepository
|
JobRepository *repository.JobRepository
|
||||||
|
|
||||||
clusterConfig map[string]interface{}
|
clusterConfig ClusterConfig
|
||||||
}
|
}
|
||||||
|
|
||||||
var client *http.Client
|
var client *http.Client
|
||||||
@ -48,19 +52,19 @@ func queryDB(qtime int64, clusterName string) ([]interface{}, error) {
|
|||||||
// Create a new HTTP GET request
|
// Create a new HTTP GET request
|
||||||
req, err := http.NewRequest("GET", apiEndpoint, nil)
|
req, err := http.NewRequest("GET", apiEndpoint, nil)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
log.Errorf("Error creating request:", err)
|
log.Errorf("Error creating request: %v", err)
|
||||||
}
|
}
|
||||||
|
|
||||||
// Send the request
|
// Send the request
|
||||||
resp, err := client.Do(req)
|
resp, err := client.Do(req)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
log.Errorf("Error sending request:", err)
|
log.Errorf("Error sending request: %v", err)
|
||||||
}
|
}
|
||||||
defer resp.Body.Close()
|
defer resp.Body.Close()
|
||||||
|
|
||||||
// Check the response status code
|
// Check the response status code
|
||||||
if resp.StatusCode != http.StatusOK {
|
if resp.StatusCode != http.StatusOK {
|
||||||
log.Errorf("API request failed with status:", resp.Status)
|
log.Errorf("API request failed with status: %v", resp.Status)
|
||||||
}
|
}
|
||||||
|
|
||||||
// Read the response body
|
// Read the response body
|
||||||
@ -69,15 +73,15 @@ func queryDB(qtime int64, clusterName string) ([]interface{}, error) {
|
|||||||
var dbOutput []byte
|
var dbOutput []byte
|
||||||
_, err = resp.Body.Read(dbOutput)
|
_, err = resp.Body.Read(dbOutput)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
log.Errorf("Error reading response body:", err)
|
log.Errorf("Error reading response body: %v", err)
|
||||||
}
|
}
|
||||||
|
|
||||||
log.Errorf("API response:", string(dbOutput))
|
log.Errorf("API response: %v", string(dbOutput))
|
||||||
|
|
||||||
dataJobs := make(map[string]interface{})
|
dataJobs := make(map[string]interface{})
|
||||||
err = json.Unmarshal(dbOutput, &dataJobs)
|
err = json.Unmarshal(dbOutput, &dataJobs)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
log.Errorf("Error parsing JSON response:", err)
|
log.Errorf("Error parsing JSON response: %v", err)
|
||||||
os.Exit(1)
|
os.Exit(1)
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -93,34 +97,53 @@ func queryDB(qtime int64, clusterName string) ([]interface{}, error) {
|
|||||||
func queryAllJobs() (openapi.V0038JobsResponse, error) {
|
func queryAllJobs() (openapi.V0038JobsResponse, error) {
|
||||||
var ctlOutput []byte
|
var ctlOutput []byte
|
||||||
|
|
||||||
apiEndpoint := "/slurm/v0.0.38/jobs"
|
apiEndpoint := "http://:8080/slurm/v0.0.38/jobs"
|
||||||
// Create a new HTTP GET request with query parameters
|
// Create a new HTTP GET request with query parameters
|
||||||
req, err := http.NewRequest("GET", apiEndpoint, nil)
|
req, err := http.NewRequest("GET", apiEndpoint, nil)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
log.Errorf("Error creating request:", err)
|
log.Errorf("Error creating request: %v", err)
|
||||||
}
|
}
|
||||||
|
|
||||||
// Send the request
|
// Send the request
|
||||||
resp, err := client.Do(req)
|
resp, err := client.Do(req)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
log.Errorf("Error sending request:", err)
|
log.Errorf("Error sending request: %v", err)
|
||||||
}
|
}
|
||||||
defer resp.Body.Close()
|
defer resp.Body.Close()
|
||||||
|
|
||||||
// Check the response status code
|
// Check the response status code
|
||||||
if resp.StatusCode != http.StatusOK {
|
if resp.StatusCode != http.StatusOK {
|
||||||
log.Errorf("API request failed with status:", resp.Status)
|
log.Errorf("API request failed with status: %v", resp.Status)
|
||||||
}
|
}
|
||||||
|
|
||||||
_, err = resp.Body.Read(ctlOutput)
|
_, err = resp.Body.Read(ctlOutput)
|
||||||
|
log.Printf("Received JSON Data: %v", ctlOutput)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
log.Errorf("Error reading response body:", err)
|
log.Errorf("Error reading response body: %v", err)
|
||||||
}
|
}
|
||||||
|
|
||||||
var jobsResponse openapi.V0038JobsResponse
|
var jobsResponse openapi.V0038JobsResponse
|
||||||
err = json.Unmarshal(ctlOutput, &jobsResponse)
|
err = json.Unmarshal(ctlOutput, &jobsResponse)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
log.Errorf("Error parsing JSON response:", err)
|
log.Errorf("Error parsing JSON response: %v", err)
|
||||||
|
return jobsResponse, err
|
||||||
|
}
|
||||||
|
|
||||||
|
return jobsResponse, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func queryAllJobsLocal() (openapi.V0038JobsResponse, error) {
|
||||||
|
// Read the JSON file
|
||||||
|
jobsData, err := os.ReadFile("slurm_0038.json")
|
||||||
|
|
||||||
|
if err != nil {
|
||||||
|
fmt.Println("Error reading JSON file:", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
var jobsResponse openapi.V0038JobsResponse
|
||||||
|
err = json.Unmarshal(jobsData, &jobsResponse)
|
||||||
|
if err != nil {
|
||||||
|
log.Errorf("Error parsing JSON response: %v", err)
|
||||||
return jobsResponse, err
|
return jobsResponse, err
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -179,39 +202,23 @@ func exitWithError(err error, output []byte) {
|
|||||||
os.Exit(1)
|
os.Exit(1)
|
||||||
}
|
}
|
||||||
|
|
||||||
func loadClusterConfig(filename string) (map[string]interface{}, error) {
|
func (cfg *SlurmRestSchedulerConfig) Init() error {
|
||||||
clusterConfigData := make(map[string]interface{})
|
|
||||||
|
|
||||||
file, err := os.Open(filename)
|
|
||||||
if err != nil {
|
|
||||||
log.Errorf("Cluster config file not found. No cores/GPU ids available.")
|
|
||||||
return clusterConfigData, err
|
|
||||||
}
|
|
||||||
defer file.Close()
|
|
||||||
|
|
||||||
decoder := json.NewDecoder(file)
|
|
||||||
err = decoder.Decode(&clusterConfigData)
|
|
||||||
if err != nil {
|
|
||||||
log.Errorf("Error decoding cluster config file:", err)
|
|
||||||
}
|
|
||||||
|
|
||||||
return clusterConfigData, err
|
|
||||||
}
|
|
||||||
|
|
||||||
func (cfg *SlurmRestSchedulerConfig) Init(rawConfig json.RawMessage) error {
|
|
||||||
var err error
|
var err error
|
||||||
cfg.clusterConfig, err = loadClusterConfig("cluster-fritz.json")
|
|
||||||
|
|
||||||
for k, v := range cfg.clusterConfig {
|
cfg.clusterConfig, err = DecodeClusterConfig("cluster-alex.json")
|
||||||
switch c := v.(type) {
|
|
||||||
case string:
|
// for k, v := range cfg.clusterConfig {
|
||||||
fmt.Printf("Item %q is a string, containing %q\n", k, c)
|
// fmt.Printf("Entry %q with value %x loaded\n", k, v)
|
||||||
case float64:
|
// // switch c := v.(type) {
|
||||||
fmt.Printf("Looks like item %q is a number, specifically %f\n", k, c)
|
// // case string:
|
||||||
default:
|
// // fmt.Printf("Item %q is a string, containing %q\n", k, c)
|
||||||
fmt.Printf("Not sure what type item %q is, but I think it might be %T\n", k, c)
|
// // case float64:
|
||||||
}
|
// // fmt.Printf("Looks like item %q is a number, specifically %f\n", k, c)
|
||||||
}
|
// // default:
|
||||||
|
// // fmt.Printf("Not sure what type item %q is, but I think it might be %T\n", k, c)
|
||||||
|
// // }
|
||||||
|
// }
|
||||||
|
// fmt.Printf("Cluster Name: %q\n", cfg.clusterConfig["name"])
|
||||||
|
|
||||||
// Create an HTTP client
|
// Create an HTTP client
|
||||||
client = &http.Client{}
|
client = &http.Client{}
|
||||||
@ -253,15 +260,49 @@ func (cfg *SlurmRestSchedulerConfig) checkAndHandleStopJob(job *schema.Job, req
|
|||||||
cfg.JobRepository.TriggerArchiving(job)
|
cfg.JobRepository.TriggerArchiving(job)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Node-name patterns used by ConstructNodeAcceleratorMap. They are compiled
// once at package level instead of on every loop iteration.
var (
	// acceleratorNodeRangeRe matches a node range such as "a[0001-0004]".
	acceleratorNodeRangeRe = regexp.MustCompile(`a\[(\d+)-(\d+)\]`)
	// acceleratorNodeRe matches a single node such as "a0123".
	acceleratorNodeRe = regexp.MustCompile(`a(\d+)`)
)

// ConstructNodeAcceleratorMap expands a comma-separated node list into a map
// from node name to accelerator.
//
// Each comma-separated group is trimmed of surrounding whitespace and is
// either
//   - a range "a[START-END]", expanded to the nodes "a" + zero-padded
//     four-digit number for every number from START to END inclusive, or
//   - a single node "a<digits>", stored under the group text as written.
//
// Groups matching neither form, ranges with START > END and ranges whose
// bounds do not fit in an int are skipped. The result is never nil.
//
// NOTE(review): the input is split on every comma, so a bracket holding
// several ranges ("a[0001-0002,0005-0006]") is not expanded completely —
// confirm whether that form occurs in the cluster configuration.
func ConstructNodeAcceleratorMap(input string, accelerator string) map[string]string {
	numberMap := make(map[string]string)

	for _, group := range strings.Split(input, ",") {
		group = strings.TrimSpace(group)

		if matches := acceleratorNodeRangeRe.FindStringSubmatch(group); matches != nil {
			// \d+ guarantees digits, so Atoi can only fail on overflow.
			start, errStart := strconv.Atoi(matches[1])
			end, errEnd := strconv.Atoi(matches[2])
			if errStart != nil || errEnd != nil {
				continue
			}
			for i := start; i <= end; i++ {
				// Build the key from the "a" prefix, not from matches[0]:
				// matches[0] is the whole "a[START-END]" text and produced
				// keys like "a[0001-0003]0001".
				numberMap[fmt.Sprintf("a%04d", i)] = accelerator
			}
			continue
		}

		if acceleratorNodeRe.MatchString(group) {
			numberMap[group] = accelerator
		}
	}

	return numberMap
}
|
||||||
|
|
||||||
func (cfg *SlurmRestSchedulerConfig) HandleJobsResponse(jobsResponse openapi.V0038JobsResponse) {
|
func (cfg *SlurmRestSchedulerConfig) HandleJobsResponse(jobsResponse openapi.V0038JobsResponse) {
|
||||||
|
|
||||||
// Iterate over the Jobs slice
|
// Iterate over the Jobs slice
|
||||||
for _, job := range jobsResponse.Jobs {
|
for _, job := range jobsResponse.Jobs {
|
||||||
// Process each job
|
// Process each job
|
||||||
fmt.Printf("Job ID: %s\n", job.JobId)
|
fmt.Printf("Job ID: %d\n", *job.JobId)
|
||||||
fmt.Printf("Job Name: %s\n", *job.Name)
|
fmt.Printf("Job Name: %s\n", *job.Name)
|
||||||
fmt.Printf("Job State: %s\n", *job.JobState)
|
fmt.Printf("Job State: %s\n", *job.JobState)
|
||||||
fmt.Println("Job StartTime:", *job.StartTime)
|
fmt.Println("Job StartTime:", *job.StartTime)
|
||||||
|
fmt.Println("Job Cluster:", *job.Cluster)
|
||||||
|
|
||||||
// aquire lock to avoid race condition between API calls
|
// aquire lock to avoid race condition between API calls
|
||||||
// var unlockOnce sync.Once
|
// var unlockOnce sync.Once
|
||||||
@ -269,22 +310,22 @@ func (cfg *SlurmRestSchedulerConfig) HandleJobsResponse(jobsResponse openapi.V00
|
|||||||
// defer unlockOnce.Do(cfg.RepositoryMutex.Unlock)
|
// defer unlockOnce.Do(cfg.RepositoryMutex.Unlock)
|
||||||
|
|
||||||
// is "running" one of JSON state?
|
// is "running" one of JSON state?
|
||||||
if *job.JobState == "running" {
|
if *job.JobState == "RUNNING" {
|
||||||
|
|
||||||
jobs, err := cfg.JobRepository.FindRunningJobs(*job.Cluster)
|
// jobs, err := cfg.JobRepository.FindRunningJobs(*job.Cluster)
|
||||||
if err != nil {
|
// if err != nil {
|
||||||
log.Fatalf("Failed to find running jobs: %v", err)
|
// log.Fatalf("Failed to find running jobs: %v", err)
|
||||||
}
|
// }
|
||||||
|
|
||||||
for id, job := range jobs {
|
// for id, job := range jobs {
|
||||||
fmt.Printf("Job ID: %d, Job: %+v\n", id, job)
|
// fmt.Printf("Job ID: %d, Job: %+v\n", id, job)
|
||||||
}
|
// }
|
||||||
|
|
||||||
if err != nil || err != sql.ErrNoRows {
|
// if err != nil || err != sql.ErrNoRows {
|
||||||
log.Errorf("checking for duplicate failed: %s", err.Error())
|
// log.Errorf("checking for duplicate failed: %s", err.Error())
|
||||||
return
|
// return
|
||||||
} else if err == nil {
|
// } else if err == nil {
|
||||||
if len(jobs) == 0 {
|
// if len(jobs) == 0 {
|
||||||
var exclusive int32
|
var exclusive int32
|
||||||
if job.Shared == nil {
|
if job.Shared == nil {
|
||||||
exclusive = 1
|
exclusive = 1
|
||||||
@ -294,7 +335,7 @@ func (cfg *SlurmRestSchedulerConfig) HandleJobsResponse(jobsResponse openapi.V00
|
|||||||
|
|
||||||
jobResourcesInBytes, err := json.Marshal(*job.JobResources)
|
jobResourcesInBytes, err := json.Marshal(*job.JobResources)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
log.Fatalf("JSON marshaling failed: %s", err)
|
log.Fatalf("JobResources JSON marshaling failed: %s", err)
|
||||||
}
|
}
|
||||||
|
|
||||||
var resources []*schema.Resource
|
var resources []*schema.Resource
|
||||||
@ -317,32 +358,43 @@ func (cfg *SlurmRestSchedulerConfig) HandleJobsResponse(jobsResponse openapi.V00
|
|||||||
for _, node := range job.JobResources.AllocatedNodes {
|
for _, node := range job.JobResources.AllocatedNodes {
|
||||||
var res schema.Resource
|
var res schema.Resource
|
||||||
res.Hostname = *node.Nodename
|
res.Hostname = *node.Nodename
|
||||||
|
|
||||||
|
log.Debugf("Node %s V0038NodeAllocationSockets.Cores map size: %d\n", *node.Nodename, len(node.Sockets.Cores))
|
||||||
|
|
||||||
|
if node.Cpus == nil || node.Memory == nil {
|
||||||
|
log.Fatalf("Either node.Cpus or node.Memory is nil\n")
|
||||||
|
}
|
||||||
|
|
||||||
for k, v := range node.Sockets.Cores {
|
for k, v := range node.Sockets.Cores {
|
||||||
fmt.Printf("core id[%s] value[%s]\n", k, v)
|
fmt.Printf("core id[%s] value[%s]\n", k, v)
|
||||||
threadID, _ := strconv.Atoi(k)
|
threadID, _ := strconv.Atoi(k)
|
||||||
res.HWThreads = append(res.HWThreads, threadID)
|
res.HWThreads = append(res.HWThreads, threadID)
|
||||||
}
|
}
|
||||||
|
|
||||||
// cpu=512,mem=1875G,node=4,billing=512,gres\/gpu=32,gres\/gpu:a40=32
|
// cpu=512,mem=1875G,node=4,billing=512,gres\/gpu=32,gres\/gpu:a40=32
|
||||||
// For core/GPU id mapping, need to query from cluster config file
|
// For core/GPU id mapping, need to query from cluster config file
|
||||||
res.Accelerators = append(res.Accelerators, *job.TresAllocStr)
|
res.Accelerators = append(res.Accelerators, *job.Comment)
|
||||||
resources = append(resources, &res)
|
resources = append(resources, &res)
|
||||||
}
|
}
|
||||||
|
|
||||||
var metaData map[string]string
|
metaData := make(map[string]string)
|
||||||
metaData["jobName"] = *job.Name
|
metaData["jobName"] = *job.Name
|
||||||
metaData["slurmInfo"] = printSlurmInfo(job)
|
metaData["slurmInfo"] = printSlurmInfo(job)
|
||||||
|
|
||||||
switch slurmPath := cfg.clusterConfig["slurm_path"].(type) {
|
// switch slurmPath := cfg.clusterConfig["slurm_path"].(type) {
|
||||||
case string:
|
// case string:
|
||||||
commandCtlScriptTpl := fmt.Sprintf("%sscontrol -M %%s write batch_script %%s -", slurmPath)
|
// commandCtlScriptTpl := fmt.Sprintf("%sscontrol -M %%s write batch_script %%s -", slurmPath)
|
||||||
queryJobScript := fmt.Sprintf(commandCtlScriptTpl, job.Cluster, job.JobId)
|
// queryJobScript := fmt.Sprintf(commandCtlScriptTpl, job.Cluster, job.JobId)
|
||||||
metaData["jobScript"] = queryJobScript
|
// metaData["jobScript"] = queryJobScript
|
||||||
default:
|
// default:
|
||||||
// Type assertion failed
|
// // Type assertion failed
|
||||||
fmt.Println("Conversion of slurm_path to string failed")
|
// fmt.Println("Conversion of slurm_path to string failed", cfg.clusterConfig["slurm_path"])
|
||||||
}
|
// }
|
||||||
|
|
||||||
metaDataInBytes, err := json.Marshal(metaData)
|
metaDataInBytes, err := json.Marshal(metaData)
|
||||||
|
if err != nil {
|
||||||
|
log.Fatalf("metaData JSON marshaling failed: %s", err)
|
||||||
|
}
|
||||||
|
|
||||||
var defaultJob schema.BaseJob = schema.BaseJob{
|
var defaultJob schema.BaseJob = schema.BaseJob{
|
||||||
JobID: int64(*job.JobId),
|
JobID: int64(*job.JobId),
|
||||||
@ -373,24 +425,32 @@ func (cfg *SlurmRestSchedulerConfig) HandleJobsResponse(jobsResponse openapi.V00
|
|||||||
MetaData: metaData,
|
MetaData: metaData,
|
||||||
// ConcurrentJobs: job.ConcurrentJobs,
|
// ConcurrentJobs: job.ConcurrentJobs,
|
||||||
}
|
}
|
||||||
|
log.Debugf("Generated BaseJob with Resources=%v", defaultJob.Resources[0])
|
||||||
|
|
||||||
req := &schema.JobMeta{
|
req := &schema.JobMeta{
|
||||||
BaseJob: defaultJob,
|
BaseJob: defaultJob,
|
||||||
StartTime: *job.StartTime,
|
StartTime: *job.StartTime,
|
||||||
Statistics: make(map[string]schema.JobStatistics),
|
Statistics: make(map[string]schema.JobStatistics),
|
||||||
}
|
}
|
||||||
|
log.Debugf("Generated JobMeta %v", req.BaseJob.JobID)
|
||||||
|
|
||||||
// req := new(schema.JobMeta)
|
// req := new(schema.JobMeta)
|
||||||
id, err := cfg.JobRepository.Start(req)
|
// id, err := cfg.JobRepository.Start(req)
|
||||||
log.Debugf("Added %v", id)
|
// log.Debugf("Added %v", id)
|
||||||
} else {
|
// } else {
|
||||||
for _, job := range jobs {
|
// for _, job := range jobs {
|
||||||
log.Errorf("a job with that jobId, cluster and startTime already exists: dbid: %d", job.ID)
|
// log.Errorf("a job with that jobId, cluster and startTime already exists: dbid: %d", job.ID)
|
||||||
}
|
// }
|
||||||
}
|
// }
|
||||||
}
|
// }
|
||||||
} else {
|
} else {
|
||||||
// Check if completed job with combination of (job_id, cluster_id, start_time) already exists:
|
// Check if completed job with combination of (job_id, cluster_id, start_time) already exists:
|
||||||
var jobID int64
|
var jobID int64
|
||||||
jobID = int64(*job.JobId)
|
jobID = int64(*job.JobId)
|
||||||
|
log.Debugf("jobID: %v Cluster: %v StartTime: %v", jobID, *job.Cluster, *job.StartTime)
|
||||||
|
// commented out as it will cause panic
|
||||||
|
// note down params invoked
|
||||||
|
|
||||||
existingJob, err := cfg.JobRepository.Find(&jobID, job.Cluster, job.StartTime)
|
existingJob, err := cfg.JobRepository.Find(&jobID, job.Cluster, job.StartTime)
|
||||||
|
|
||||||
if err == nil {
|
if err == nil {
|
||||||
@ -416,7 +476,7 @@ func (cfg *SlurmRestSchedulerConfig) HandleJobsResponse(jobsResponse openapi.V00
|
|||||||
func (cfg *SlurmRestSchedulerConfig) Sync() {
|
func (cfg *SlurmRestSchedulerConfig) Sync() {
|
||||||
|
|
||||||
// Fetch an instance of V0037JobsResponse
|
// Fetch an instance of V0037JobsResponse
|
||||||
jobsResponse, err := queryAllJobs()
|
jobsResponse, err := queryAllJobsLocal()
|
||||||
if err != nil {
|
if err != nil {
|
||||||
log.Fatal(err.Error())
|
log.Fatal(err.Error())
|
||||||
}
|
}
|
||||||
|
@ -110,10 +110,10 @@ func injectPayload() {
|
|||||||
|
|
||||||
// Start the HTTP server on port 8080
|
// Start the HTTP server on port 8080
|
||||||
fmt.Println("Listening on :8080...")
|
fmt.Println("Listening on :8080...")
|
||||||
http.ListenAndServe(":8080", nil)
|
go http.ListenAndServe(":8080", nil)
|
||||||
}
|
}
|
||||||
|
|
||||||
func main() {
|
func loadSlurmNatsScheduler() {
|
||||||
cfgData := []byte(`{"target": "localhost"}`)
|
cfgData := []byte(`{"target": "localhost"}`)
|
||||||
|
|
||||||
var sch scheduler.SlurmNatsScheduler
|
var sch scheduler.SlurmNatsScheduler
|
||||||
@ -122,5 +122,16 @@ func main() {
|
|||||||
|
|
||||||
// go injectPayload()
|
// go injectPayload()
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
func main() {
|
||||||
|
|
||||||
|
var sch scheduler.SlurmRestSchedulerConfig
|
||||||
|
sch.Init()
|
||||||
|
|
||||||
|
// injectPayload()
|
||||||
|
|
||||||
|
sch.Sync()
|
||||||
|
|
||||||
os.Exit(0)
|
os.Exit(0)
|
||||||
}
|
}
|
||||||
|
Loading…
Reference in New Issue
Block a user