mirror of
https://github.com/ClusterCockpit/cc-backend
synced 2025-01-14 13:39:08 +01:00
480 lines
18 KiB
Go
480 lines
18 KiB
Go
// Copyright (C) 2023 NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
|
|
|
|
package scheduler
|
|
|
|
import (
|
|
"encoding/json"
|
|
"fmt"
|
|
"os"
|
|
|
|
"github.com/ClusterCockpit/cc-backend/pkg/log"
|
|
)
|
|
|
|
// MetricConfig is the JSON-decodable description of one metric. It is the
// element type of ClusterConfig.MetricConfig (JSON key "metricConfig").
//
// Peak, Normal, Caution and Alert are decoded as plain float64 values; their
// interpretation is not defined in this file.
type MetricConfig struct {
	Name string `json:"name"`
	Unit struct {
		Base string `json:"base"`
	} `json:"unit"`
	Scope       string  `json:"scope"`
	Aggregation string  `json:"aggregation"`
	Timestep    int     `json:"timestep"`
	Peak        float64 `json:"peak"`
	Normal      float64 `json:"normal"`
	Caution     float64 `json:"caution"`
	Alert       float64 `json:"alert"`
}
|
|
// SubCluster is the JSON-decodable description of one sub-cluster. It is the
// element type of ClusterConfig.SubClusters (JSON key "subClusters").
//
// FlopRateScalar, FlopRateSimd and MemoryBandwidth share the same shape: a
// numeric value together with a unit given as base and prefix strings.
// Topology carries integer id lists per level plus a list of accelerators.
type SubCluster struct {
	Name           string `json:"name"`
	Nodes          string `json:"nodes"`
	ProcessorType  string `json:"processorType"`
	SocketsPerNode int    `json:"socketsPerNode"`
	CoresPerSocket int    `json:"coresPerSocket"`
	ThreadsPerCore int    `json:"threadsPerCore"`
	FlopRateScalar struct {
		Unit struct {
			Base   string `json:"base"`
			Prefix string `json:"prefix"`
		} `json:"unit"`
		Value float64 `json:"value"`
	} `json:"flopRateScalar"`
	FlopRateSimd struct {
		Unit struct {
			Base   string `json:"base"`
			Prefix string `json:"prefix"`
		} `json:"unit"`
		Value float64 `json:"value"`
	} `json:"flopRateSimd"`
	MemoryBandwidth struct {
		Unit struct {
			Base   string `json:"base"`
			Prefix string `json:"prefix"`
		} `json:"unit"`
		Value float64 `json:"value"`
	} `json:"memoryBandwidth"`
	Topology struct {
		Node         []int   `json:"node"`
		Socket       [][]int `json:"socket"`
		MemoryDomain [][]int `json:"memoryDomain"`
		Core         [][]int `json:"core"`
		Accelerators []struct {
			ID    string `json:"id"`
			Type  string `json:"type"`
			Model string `json:"model"`
		} `json:"accelerators"`
	} `json:"topology"`
}
|
|
|
|
type ClusterConfig struct {
|
|
Name string `json:"name"`
|
|
MetricConfig []MetricConfig `json:"metricConfig"`
|
|
SubClusters []SubCluster `json:"subClusters"`
|
|
}
|
|
|
|
// Metadata is the "meta" object shared by SlurmPayload and SlurmDBPayload.
// It names the plugin that produced the payload and carries a version and
// release string under the "Slurm" key.
//
// Note that the key for the Slurm field is spelled with a capital "S".
type Metadata struct {
	Plugin struct {
		Type string `json:"type"`
		Name string `json:"name"`
	} `json:"plugin"`
	Slurm struct {
		Version struct {
			Major int `json:"major"`
			Micro int `json:"micro"`
			Minor int `json:"minor"`
		} `json:"version"`
		Release string `json:"release"`
	} `json:"Slurm"`
}
|
|
|
|
type JobResource struct {
|
|
Nodes string `json:"nodes"`
|
|
AllocatedCores int `json:"allocated_cores"`
|
|
AllocatedHosts int `json:"allocated_hosts"`
|
|
AllocatedNodes []AllocatedNode `json:"allocated_nodes"`
|
|
}
|
|
|
|
type AllocatedNode struct {
|
|
Sockets map[string]Socket `json:"sockets"`
|
|
Nodename string `json:"nodename"`
|
|
CPUsUsed *int `json:"cpus_used"`
|
|
MemoryUsed *int `json:"memory_used"`
|
|
MemoryAllocated *int `json:"memory_allocated"`
|
|
}
|
|
|
|
// Socket is the value type of AllocatedNode.Sockets. Cores maps a string key
// to a string value exactly as found under the "cores" JSON object.
type Socket struct {
	Cores map[string]string `json:"cores"`
}
|
|
|
|
type Job struct {
|
|
Account string `json:"account"`
|
|
AccrueTime int `json:"accrue_time"`
|
|
AdminComment string `json:"admin_comment"`
|
|
ArrayJobID int64 `json:"array_job_id"`
|
|
ArrayTaskID interface{} `json:"array_task_id"`
|
|
ArrayMaxTasks int `json:"array_max_tasks"`
|
|
ArrayTaskString string `json:"array_task_string"`
|
|
AssociationID int `json:"association_id"`
|
|
BatchFeatures string `json:"batch_features"`
|
|
BatchFlag bool `json:"batch_flag"`
|
|
BatchHost string `json:"batch_host"`
|
|
Flags []string `json:"flags"`
|
|
BurstBuffer string `json:"burst_buffer"`
|
|
BurstBufferState string `json:"burst_buffer_state"`
|
|
Cluster string `json:"cluster"`
|
|
ClusterFeatures string `json:"cluster_features"`
|
|
Command string `json:"command"`
|
|
Comment string `json:"comment"`
|
|
Container string `json:"container"`
|
|
Contiguous bool `json:"contiguous"`
|
|
CoreSpec interface{} `json:"core_spec"`
|
|
ThreadSpec interface{} `json:"thread_spec"`
|
|
CoresPerSocket interface{} `json:"cores_per_socket"`
|
|
BillableTres interface{} `json:"billable_tres"`
|
|
CPUPerTask interface{} `json:"cpus_per_task"`
|
|
CPUFrequencyMinimum interface{} `json:"cpu_frequency_minimum"`
|
|
CPUFrequencyMaximum interface{} `json:"cpu_frequency_maximum"`
|
|
CPUFrequencyGovernor interface{} `json:"cpu_frequency_governor"`
|
|
CPUPerTres string `json:"cpus_per_tres"`
|
|
Deadline int `json:"deadline"`
|
|
DelayBoot int `json:"delay_boot"`
|
|
Dependency string `json:"dependency"`
|
|
DerivedExitCode int `json:"derived_exit_code"`
|
|
EligibleTime int `json:"eligible_time"`
|
|
EndTime int64 `json:"end_time"`
|
|
ExcludedNodes string `json:"excluded_nodes"`
|
|
ExitCode int `json:"exit_code"`
|
|
Features string `json:"features"`
|
|
FederationOrigin string `json:"federation_origin"`
|
|
FederationSiblingsActive string `json:"federation_siblings_active"`
|
|
FederationSiblingsViable string `json:"federation_siblings_viable"`
|
|
GresDetail []string `json:"gres_detail"`
|
|
GroupID int `json:"group_id"`
|
|
GroupName string `json:"group_name"`
|
|
JobID int64 `json:"job_id"`
|
|
JobResources JobResource `json:"job_resources"`
|
|
JobState string `json:"job_state"`
|
|
LastSchedEvaluation int `json:"last_sched_evaluation"`
|
|
Licenses string `json:"licenses"`
|
|
MaxCPUs int `json:"max_cpus"`
|
|
MaxNodes int `json:"max_nodes"`
|
|
MCSLabel string `json:"mcs_label"`
|
|
MemoryPerTres string `json:"memory_per_tres"`
|
|
Name string `json:"name"`
|
|
Nodes string `json:"nodes"`
|
|
Nice interface{} `json:"nice"`
|
|
TasksPerCore interface{} `json:"tasks_per_core"`
|
|
TasksPerNode int `json:"tasks_per_node"`
|
|
TasksPerSocket interface{} `json:"tasks_per_socket"`
|
|
TasksPerBoard int `json:"tasks_per_board"`
|
|
CPUs int32 `json:"cpus"`
|
|
NodeCount int32 `json:"node_count"`
|
|
Tasks int `json:"tasks"`
|
|
HETJobID int `json:"het_job_id"`
|
|
HETJobIDSet string `json:"het_job_id_set"`
|
|
HETJobOffset int `json:"het_job_offset"`
|
|
Partition string `json:"partition"`
|
|
MemoryPerNode interface{} `json:"memory_per_node"`
|
|
MemoryPerCPU int `json:"memory_per_cpu"`
|
|
MinimumCPUsPerNode int `json:"minimum_cpus_per_node"`
|
|
MinimumTmpDiskPerNode int `json:"minimum_tmp_disk_per_node"`
|
|
PreemptTime int `json:"preempt_time"`
|
|
PreSUSTime int `json:"pre_sus_time"`
|
|
Priority int `json:"priority"`
|
|
Profile interface{} `json:"profile"`
|
|
QoS string `json:"qos"`
|
|
Reboot bool `json:"reboot"`
|
|
RequiredNodes string `json:"required_nodes"`
|
|
Requeue bool `json:"requeue"`
|
|
ResizeTime int `json:"resize_time"`
|
|
RestartCnt int `json:"restart_cnt"`
|
|
ResvName string `json:"resv_name"`
|
|
Shared *string `json:"shared"`
|
|
ShowFlags []string `json:"show_flags"`
|
|
SocketsPerBoard int `json:"sockets_per_board"`
|
|
SocketsPerNode interface{} `json:"sockets_per_node"`
|
|
StartTime int64 `json:"start_time"`
|
|
StateDescription string `json:"state_description"`
|
|
StateReason string `json:"state_reason"`
|
|
StandardError string `json:"standard_error"`
|
|
StandardInput string `json:"standard_input"`
|
|
StandardOutput string `json:"standard_output"`
|
|
SubmitTime int `json:"submit_time"`
|
|
SuspendTime int `json:"suspend_time"`
|
|
SystemComment string `json:"system_comment"`
|
|
TimeLimit int `json:"time_limit"`
|
|
TimeMinimum int `json:"time_minimum"`
|
|
ThreadsPerCore interface{} `json:"threads_per_core"`
|
|
TresBind string `json:"tres_bind"`
|
|
TresFreq string `json:"tres_freq"`
|
|
TresPerJob string `json:"tres_per_job"`
|
|
TresPerNode string `json:"tres_per_node"`
|
|
TresPerSocket string `json:"tres_per_socket"`
|
|
TresPerTask string `json:"tres_per_task"`
|
|
TresReqStr string `json:"tres_req_str"`
|
|
TresAllocStr string `json:"tres_alloc_str"`
|
|
UserID int `json:"user_id"`
|
|
UserName string `json:"user_name"`
|
|
Wckey string `json:"wckey"`
|
|
CurrentWorkingDirectory string `json:"current_working_directory"`
|
|
}
|
|
|
|
type SlurmPayload struct {
|
|
Meta Metadata `json:"meta"`
|
|
Errors []interface{} `json:"errors"`
|
|
Jobs []Job `json:"jobs"`
|
|
}
|
|
|
|
// DumpedComment is the "comment" object of a DumpedJob. All three entries are
// decoded without a fixed schema, so each field holds whatever encoding/json
// produces for the incoming value (nil for JSON null).
type DumpedComment struct {
	Administrator interface{} `json:"administrator"`
	Job           interface{} `json:"job"`
	System        interface{} `json:"system"`
}
|
|
|
|
// MaxLimits is the "limits" object of ArrayInfo.
//
// NOTE(review): the Go field is named Running but is bound to the JSON key
// "max", so a task count is read from limits.max.tasks. Confirm against a
// real payload that this is the intended path.
type MaxLimits struct {
	Running struct {
		Tasks int `json:"tasks"`
	} `json:"max"`
}
|
|
|
|
type ArrayInfo struct {
|
|
JobID int `json:"job_id"`
|
|
Limits MaxLimits `json:"limits"`
|
|
Task interface{} `json:"task"`
|
|
TaskID interface{} `json:"task_id"`
|
|
}
|
|
|
|
// Association is the "association" object of a DumpedJob. Partition is
// decoded without a fixed schema; the other entries are plain strings.
type Association struct {
	Account   string      `json:"account"`
	Cluster   string      `json:"cluster"`
	Partition interface{} `json:"partition"`
	User      string      `json:"user"`
}
|
|
|
|
// TimeInfo is the "time" object used by DumpedJob and by each of its steps.
//
// System, Total and User share the same shape: a seconds count plus a
// microseconds count. The remaining entries are plain integers.
type TimeInfo struct {
	Elapsed    int64 `json:"elapsed"`
	Eligible   int64 `json:"eligible"`
	End        int64 `json:"end"`
	Start      int64 `json:"start"`
	Submission int64 `json:"submission"`
	Suspended  int64 `json:"suspended"`
	System     struct {
		Seconds      int `json:"seconds"`
		Microseconds int `json:"microseconds"`
	} `json:"system"`
	Limit int `json:"limit"`
	Total struct {
		Seconds      int `json:"seconds"`
		Microseconds int `json:"microseconds"`
	} `json:"total"`
	User struct {
		Seconds      int `json:"seconds"`
		Microseconds int `json:"microseconds"`
	} `json:"user"`
}
|
|
|
|
// ExitCode pairs a status string with a numeric return code. It is used for
// the "exit_code" and "derived_exit_code" objects of a DumpedJob and for the
// "exit_code" object of each step.
type ExitCode struct {
	Status     string `json:"status"`
	ReturnCode int    `json:"return_code"`
}
|
|
|
|
type DumpedJob struct {
|
|
Account string `json:"account"`
|
|
Comment DumpedComment `json:"comment"`
|
|
AllocationNodes int `json:"allocation_nodes"`
|
|
Array ArrayInfo `json:"array"`
|
|
Association Association `json:"association"`
|
|
Cluster string `json:"cluster"`
|
|
Constraints string `json:"constraints"`
|
|
Container interface{} `json:"container"`
|
|
DerivedExitCode ExitCode `json:"derived_exit_code"`
|
|
Time TimeInfo `json:"time"`
|
|
ExitCode ExitCode `json:"exit_code"`
|
|
Flags []string `json:"flags"`
|
|
Group string `json:"group"`
|
|
Het struct {
|
|
JobID int `json:"job_id"`
|
|
JobOffset interface{} `json:"job_offset"`
|
|
} `json:"het"`
|
|
JobID int64 `json:"job_id"`
|
|
Name string `json:"name"`
|
|
MCS struct {
|
|
Label string `json:"label"`
|
|
} `json:"mcs"`
|
|
Nodes string `json:"nodes"`
|
|
Partition string `json:"partition"`
|
|
Priority int `json:"priority"`
|
|
QoS string `json:"qos"`
|
|
Required struct {
|
|
CPUs int `json:"CPUs"`
|
|
Memory int `json:"memory"`
|
|
} `json:"required"`
|
|
KillRequestUser interface{} `json:"kill_request_user"`
|
|
Reservation struct {
|
|
ID int `json:"id"`
|
|
Name int `json:"name"`
|
|
} `json:"reservation"`
|
|
State struct {
|
|
Current string `json:"current"`
|
|
Reason string `json:"reason"`
|
|
} `json:"state"`
|
|
Steps []struct {
|
|
Nodes struct {
|
|
List []string `json:"list"`
|
|
Count int `json:"count"`
|
|
Range string `json:"range"`
|
|
} `json:"nodes"`
|
|
Tres struct {
|
|
Requested struct {
|
|
Max []interface{} `json:"max"`
|
|
Min []interface{} `json:"min"`
|
|
Average []interface{} `json:"average"`
|
|
Total []interface{} `json:"total"`
|
|
} `json:"requested"`
|
|
Consumed struct {
|
|
Max []interface{} `json:"max"`
|
|
Min []interface{} `json:"min"`
|
|
Average []interface{} `json:"average"`
|
|
Total []interface{} `json:"total"`
|
|
} `json:"consumed"`
|
|
Allocated []struct {
|
|
Type string `json:"type"`
|
|
Name interface{} `json:"name"`
|
|
ID int `json:"id"`
|
|
Count int `json:"count"`
|
|
} `json:"allocated"`
|
|
} `json:"tres"`
|
|
Time TimeInfo `json:"time"`
|
|
ExitCode ExitCode `json:"exit_code"`
|
|
Tasks struct {
|
|
Count int `json:"count"`
|
|
} `json:"tasks"`
|
|
PID interface{} `json:"pid"`
|
|
CPU struct {
|
|
RequestedFrequency struct {
|
|
Min int `json:"min"`
|
|
Max int `json:"max"`
|
|
} `json:"requested_frequency"`
|
|
Governor []interface{} `json:"governor"`
|
|
} `json:"CPU"`
|
|
KillRequestUser interface{} `json:"kill_request_user"`
|
|
State string `json:"state"`
|
|
Statistics struct {
|
|
CPU struct {
|
|
ActualFrequency int `json:"actual_frequency"`
|
|
} `json:"CPU"`
|
|
Energy struct {
|
|
Consumed int `json:"consumed"`
|
|
} `json:"energy"`
|
|
} `json:"statistics"`
|
|
Step struct {
|
|
JobID int `json:"job_id"`
|
|
Het struct {
|
|
Component interface{} `json:"component"`
|
|
} `json:"het"`
|
|
ID string `json:"id"`
|
|
Name string `json:"name"`
|
|
} `json:"step"`
|
|
Task struct {
|
|
Distribution string `json:"distribution"`
|
|
} `json:"task"`
|
|
} `json:"steps"`
|
|
Tres struct {
|
|
Allocated []struct {
|
|
Type string `json:"type"`
|
|
Name interface{} `json:"name"`
|
|
ID int `json:"id"`
|
|
Count int `json:"count"`
|
|
} `json:"allocated"`
|
|
Requested []struct {
|
|
Type string `json:"type"`
|
|
Name interface{} `json:"name"`
|
|
ID int `json:"id"`
|
|
Count int `json:"count"`
|
|
} `json:"requested"`
|
|
} `json:"tres"`
|
|
User string `json:"user"`
|
|
Wckey struct {
|
|
Wckey string `json:"wckey"`
|
|
Flags []string `json:"flags"`
|
|
} `json:"wckey"`
|
|
WorkingDirectory string `json:"working_directory"`
|
|
}
|
|
|
|
type SlurmDBPayload struct {
|
|
Meta Metadata `json:"meta"`
|
|
Errors []string `json:"errors"`
|
|
Jobs []DumpedJob `json:"jobs"`
|
|
}
|
|
|
|
func DecodeClusterConfig(filename string) (ClusterConfig, error) {
|
|
var clusterConfig ClusterConfig
|
|
|
|
file, err := os.Open(filename)
|
|
if err != nil {
|
|
log.Errorf("Cluster config file not found. No cores/GPU ids available.")
|
|
return clusterConfig, err
|
|
}
|
|
defer file.Close()
|
|
|
|
decoder := json.NewDecoder(file)
|
|
err = decoder.Decode(&clusterConfig)
|
|
if err != nil {
|
|
log.Errorf("Error decoding cluster config file: %v", err)
|
|
}
|
|
|
|
log.Printf("Name: %s\n", clusterConfig.Name)
|
|
log.Printf("MetricConfig:\n")
|
|
for _, metric := range clusterConfig.MetricConfig {
|
|
log.Printf(" Name: %s\n", metric.Name)
|
|
log.Printf(" Unit Base: %s\n", metric.Unit.Base)
|
|
log.Printf(" Scope: %s\n", metric.Scope)
|
|
log.Printf(" Aggregation: %s\n", metric.Aggregation)
|
|
log.Printf(" Timestep: %d\n", metric.Timestep)
|
|
log.Printf(" Peak: %f\n", metric.Peak)
|
|
log.Printf(" Normal: %f\n", metric.Normal)
|
|
log.Printf(" Caution: %f\n", metric.Caution)
|
|
log.Printf(" Alert: %f\n", metric.Alert)
|
|
}
|
|
log.Printf("SubClusters:\n")
|
|
for _, subCluster := range clusterConfig.SubClusters {
|
|
log.Printf(" Name: %s\n", subCluster.Name)
|
|
log.Printf(" Nodes: %s\n", subCluster.Nodes)
|
|
log.Printf(" Processor Type: %s\n", subCluster.ProcessorType)
|
|
log.Printf(" Sockets Per Node: %d\n", subCluster.SocketsPerNode)
|
|
log.Printf(" Cores Per Socket: %d\n", subCluster.CoresPerSocket)
|
|
log.Printf(" Threads Per Core: %d\n", subCluster.ThreadsPerCore)
|
|
log.Printf(" Flop Rate Scalar Unit Base: %s\n", subCluster.FlopRateScalar.Unit.Base)
|
|
log.Printf(" Flop Rate Scalar Unit Prefix: %s\n", subCluster.FlopRateScalar.Unit.Prefix)
|
|
log.Printf(" Flop Rate Scalar Value: %f\n", subCluster.FlopRateScalar.Value)
|
|
log.Printf(" Flop Rate Simd Unit Base: %s\n", subCluster.FlopRateSimd.Unit.Base)
|
|
log.Printf(" Flop Rate Simd Unit Prefix: %s\n", subCluster.FlopRateSimd.Unit.Prefix)
|
|
log.Printf(" Flop Rate Simd Value: %f\n", subCluster.FlopRateSimd.Value)
|
|
log.Printf(" Memory Bandwidth Unit Base: %s\n", subCluster.MemoryBandwidth.Unit.Base)
|
|
log.Printf(" Memory Bandwidth Unit Prefix: %s\n", subCluster.MemoryBandwidth.Unit.Prefix)
|
|
log.Printf(" Memory Bandwidth Value: %f\n", subCluster.MemoryBandwidth.Value)
|
|
log.Printf(" Topology Node: %v\n", subCluster.Topology.Node)
|
|
log.Printf(" Topology Socket: %v\n", subCluster.Topology.Socket)
|
|
log.Printf(" Topology Memory Domain: %v\n", subCluster.Topology.MemoryDomain)
|
|
log.Printf(" Topology Core: %v\n", subCluster.Topology.Core)
|
|
log.Printf(" Topology Accelerators:\n")
|
|
for _, accelerator := range subCluster.Topology.Accelerators {
|
|
log.Printf(" ID: %s\n", accelerator.ID)
|
|
log.Printf(" Type: %s\n", accelerator.Type)
|
|
log.Printf(" Model: %s\n", accelerator.Model)
|
|
}
|
|
}
|
|
|
|
return clusterConfig, nil
|
|
}
|
|
|
|
func UnmarshalSlurmPayload(jsonPayload string) (SlurmPayload, error) {
|
|
var slurmData SlurmPayload
|
|
err := json.Unmarshal([]byte(jsonPayload), &slurmData)
|
|
if err != nil {
|
|
return slurmData, fmt.Errorf("failed to unmarshal JSON data: %v", err)
|
|
}
|
|
return slurmData, nil
|
|
}
|