// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
package archive
|
|
|
|
|
|
|
|
import (
|
|
|
|
"errors"
|
|
|
|
"fmt"
|
|
|
|
|
2023-01-31 18:28:44 +01:00
|
|
|
"github.com/ClusterCockpit/cc-backend/pkg/log"
|
2023-12-01 13:22:01 +01:00
|
|
|
"github.com/ClusterCockpit/cc-backend/pkg/schema"
|
2022-09-05 17:46:38 +02:00
|
|
|
)
|
|
|
|
|
2024-07-02 10:13:11 +02:00
|
|
|
var (
|
2024-07-11 11:09:14 +02:00
|
|
|
Clusters []*schema.Cluster
|
|
|
|
GlobalMetricList []*schema.GlobalMetricListItem
|
|
|
|
nodeLists map[string]map[string]NodeList
|
2024-07-02 10:13:11 +02:00
|
|
|
)
|
2022-09-05 17:46:38 +02:00
|
|
|
|
|
|
|
func initClusterConfig() error {
|
2022-09-07 12:24:45 +02:00
|
|
|
Clusters = []*schema.Cluster{}
|
2022-09-05 17:46:38 +02:00
|
|
|
nodeLists = map[string]map[string]NodeList{}
|
2024-07-11 11:09:14 +02:00
|
|
|
metricLookup := make(map[string]schema.GlobalMetricListItem)
|
2022-09-05 17:46:38 +02:00
|
|
|
|
2022-09-06 08:57:38 +02:00
|
|
|
for _, c := range ar.GetClusters() {
|
2022-09-05 17:46:38 +02:00
|
|
|
|
2022-09-06 08:57:38 +02:00
|
|
|
cluster, err := ar.LoadClusterCfg(c)
|
2022-09-05 17:46:38 +02:00
|
|
|
if err != nil {
|
2023-02-01 11:58:27 +01:00
|
|
|
log.Warnf("Error while loading cluster config for cluster '%v'", c)
|
2022-09-05 17:46:38 +02:00
|
|
|
return err
|
|
|
|
}
|
|
|
|
|
2022-09-07 12:24:45 +02:00
|
|
|
if len(cluster.Name) == 0 ||
|
|
|
|
len(cluster.MetricConfig) == 0 ||
|
|
|
|
len(cluster.SubClusters) == 0 {
|
2022-09-05 17:46:38 +02:00
|
|
|
return errors.New("cluster.name, cluster.metricConfig and cluster.SubClusters should not be empty")
|
|
|
|
}
|
|
|
|
|
|
|
|
for _, mc := range cluster.MetricConfig {
|
|
|
|
if len(mc.Name) == 0 {
|
|
|
|
return errors.New("cluster.metricConfig.name should not be empty")
|
|
|
|
}
|
|
|
|
if mc.Timestep < 1 {
|
|
|
|
return errors.New("cluster.metricConfig.timestep should not be smaller than one")
|
|
|
|
}
|
|
|
|
|
|
|
|
// For backwards compability...
|
|
|
|
if mc.Scope == "" {
|
|
|
|
mc.Scope = schema.MetricScopeNode
|
|
|
|
}
|
|
|
|
if !mc.Scope.Valid() {
|
|
|
|
return errors.New("cluster.metricConfig.scope must be a valid scope ('node', 'scocket', ...)")
|
|
|
|
}
|
2024-07-02 10:13:11 +02:00
|
|
|
|
2024-07-11 11:09:14 +02:00
|
|
|
ml, ok := metricLookup[mc.Name]
|
|
|
|
if !ok {
|
2024-07-12 13:20:54 +02:00
|
|
|
metricLookup[mc.Name] = schema.GlobalMetricListItem{
|
|
|
|
Name: mc.Name, Scope: mc.Scope, Unit: mc.Unit, Footprint: mc.Footprint,
|
|
|
|
}
|
2024-07-11 11:09:14 +02:00
|
|
|
ml = metricLookup[mc.Name]
|
|
|
|
}
|
|
|
|
availability := schema.ClusterSupport{Cluster: cluster.Name}
|
2024-07-02 10:13:11 +02:00
|
|
|
scLookup := make(map[string]*schema.SubClusterConfig)
|
|
|
|
|
|
|
|
for _, scc := range mc.SubClusters {
|
|
|
|
scLookup[scc.Name] = scc
|
|
|
|
}
|
|
|
|
|
|
|
|
for _, sc := range cluster.SubClusters {
|
|
|
|
newMetric := mc
|
2024-07-04 06:49:59 +02:00
|
|
|
newMetric.SubClusters = nil
|
2024-07-02 10:13:11 +02:00
|
|
|
|
|
|
|
if cfg, ok := scLookup[sc.Name]; ok {
|
|
|
|
if !cfg.Remove {
|
2024-07-11 11:09:14 +02:00
|
|
|
availability.SubClusters = append(availability.SubClusters, sc.Name)
|
2024-07-02 10:13:11 +02:00
|
|
|
newMetric.Peak = cfg.Peak
|
|
|
|
newMetric.Normal = cfg.Normal
|
|
|
|
newMetric.Caution = cfg.Caution
|
|
|
|
newMetric.Alert = cfg.Alert
|
|
|
|
newMetric.Footprint = cfg.Footprint
|
2024-07-11 16:58:12 +02:00
|
|
|
newMetric.Energy = cfg.Energy
|
|
|
|
newMetric.LowerIsBetter = cfg.LowerIsBetter
|
2024-07-02 10:13:11 +02:00
|
|
|
sc.MetricConfig = append(sc.MetricConfig, *newMetric)
|
2024-07-04 06:49:59 +02:00
|
|
|
|
2024-07-20 08:59:07 +02:00
|
|
|
if newMetric.Footprint != "" {
|
2024-07-04 06:49:59 +02:00
|
|
|
sc.Footprint = append(sc.Footprint, newMetric.Name)
|
2024-07-20 08:59:07 +02:00
|
|
|
ml.Footprint = newMetric.Footprint
|
2024-07-04 06:49:59 +02:00
|
|
|
}
|
2024-08-27 17:43:48 +02:00
|
|
|
if newMetric.Energy != "" {
|
2024-07-11 11:09:14 +02:00
|
|
|
sc.EnergyFootprint = append(sc.EnergyFootprint, newMetric.Name)
|
|
|
|
}
|
2024-07-02 10:13:11 +02:00
|
|
|
}
|
2024-07-04 06:49:59 +02:00
|
|
|
} else {
|
2024-07-11 11:09:14 +02:00
|
|
|
availability.SubClusters = append(availability.SubClusters, sc.Name)
|
2024-07-04 06:49:59 +02:00
|
|
|
sc.MetricConfig = append(sc.MetricConfig, *newMetric)
|
2024-07-02 10:13:11 +02:00
|
|
|
|
2024-07-20 08:59:07 +02:00
|
|
|
if newMetric.Footprint != "" {
|
2024-07-04 06:49:59 +02:00
|
|
|
sc.Footprint = append(sc.Footprint, newMetric.Name)
|
|
|
|
}
|
2024-08-27 17:43:48 +02:00
|
|
|
if newMetric.Energy != "" {
|
2024-07-11 11:09:14 +02:00
|
|
|
sc.EnergyFootprint = append(sc.EnergyFootprint, newMetric.Name)
|
|
|
|
}
|
2024-07-02 10:13:11 +02:00
|
|
|
}
|
|
|
|
}
|
2024-07-11 11:09:14 +02:00
|
|
|
ml.Availability = append(metricLookup[mc.Name].Availability, availability)
|
|
|
|
metricLookup[mc.Name] = ml
|
2022-09-05 17:46:38 +02:00
|
|
|
}
|
|
|
|
|
2022-09-13 07:37:48 +02:00
|
|
|
Clusters = append(Clusters, cluster)
|
2022-09-05 17:46:38 +02:00
|
|
|
|
|
|
|
nodeLists[cluster.Name] = make(map[string]NodeList)
|
|
|
|
for _, sc := range cluster.SubClusters {
|
2023-03-24 15:21:31 +01:00
|
|
|
if sc.Nodes == "*" {
|
2022-09-05 17:46:38 +02:00
|
|
|
continue
|
|
|
|
}
|
|
|
|
|
|
|
|
nl, err := ParseNodeList(sc.Nodes)
|
|
|
|
if err != nil {
|
2023-01-19 16:59:14 +01:00
|
|
|
return fmt.Errorf("ARCHIVE/CLUSTERCONFIG > in %s/cluster.json: %w", cluster.Name, err)
|
2022-09-05 17:46:38 +02:00
|
|
|
}
|
|
|
|
nodeLists[cluster.Name][sc.Name] = nl
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2024-07-11 11:09:14 +02:00
|
|
|
for _, ml := range metricLookup {
|
|
|
|
GlobalMetricList = append(GlobalMetricList, &ml)
|
|
|
|
}
|
|
|
|
|
2022-09-05 17:46:38 +02:00
|
|
|
return nil
|
|
|
|
}
|
|
|
|
|
2022-09-07 12:24:45 +02:00
|
|
|
func GetCluster(cluster string) *schema.Cluster {
|
2022-09-05 17:46:38 +02:00
|
|
|
for _, c := range Clusters {
|
|
|
|
if c.Name == cluster {
|
|
|
|
return c
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return nil
|
|
|
|
}
|
|
|
|
|
2023-03-29 10:39:31 +02:00
|
|
|
func GetSubCluster(cluster, subcluster string) (*schema.SubCluster, error) {
|
2022-09-05 17:46:38 +02:00
|
|
|
for _, c := range Clusters {
|
|
|
|
if c.Name == cluster {
|
|
|
|
for _, p := range c.SubClusters {
|
|
|
|
if p.Name == subcluster {
|
2023-03-29 10:39:31 +02:00
|
|
|
return p, nil
|
2022-09-05 17:46:38 +02:00
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
2024-07-02 10:13:11 +02:00
|
|
|
return nil, fmt.Errorf("subcluster '%v' not found for cluster '%v', or cluster '%v' not configured", subcluster, cluster, cluster)
|
2022-09-05 17:46:38 +02:00
|
|
|
}
|
|
|
|
|
2022-09-07 12:24:45 +02:00
|
|
|
func GetMetricConfig(cluster, metric string) *schema.MetricConfig {
|
2022-09-05 17:46:38 +02:00
|
|
|
for _, c := range Clusters {
|
|
|
|
if c.Name == cluster {
|
|
|
|
for _, m := range c.MetricConfig {
|
|
|
|
if m.Name == metric {
|
|
|
|
return m
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return nil
|
|
|
|
}
|
|
|
|
|
|
|
|
// AssignSubCluster sets the `job.subcluster` property of the job based
|
|
|
|
// on its cluster and resources.
|
|
|
|
func AssignSubCluster(job *schema.BaseJob) error {
|
|
|
|
cluster := GetCluster(job.Cluster)
|
|
|
|
if cluster == nil {
|
2023-02-01 11:58:27 +01:00
|
|
|
return fmt.Errorf("ARCHIVE/CLUSTERCONFIG > unkown cluster: %v", job.Cluster)
|
2022-09-05 17:46:38 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
if job.SubCluster != "" {
|
|
|
|
for _, sc := range cluster.SubClusters {
|
|
|
|
if sc.Name == job.SubCluster {
|
|
|
|
return nil
|
|
|
|
}
|
|
|
|
}
|
2023-02-01 11:58:27 +01:00
|
|
|
return fmt.Errorf("ARCHIVE/CLUSTERCONFIG > already assigned subcluster %v unkown (cluster: %v)", job.SubCluster, job.Cluster)
|
2022-09-05 17:46:38 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
if len(job.Resources) == 0 {
|
2023-01-19 16:59:14 +01:00
|
|
|
return fmt.Errorf("ARCHIVE/CLUSTERCONFIG > job without any resources/hosts")
|
2022-09-05 17:46:38 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
host0 := job.Resources[0].Hostname
|
|
|
|
for sc, nl := range nodeLists[job.Cluster] {
|
|
|
|
if nl != nil && nl.Contains(host0) {
|
|
|
|
job.SubCluster = sc
|
|
|
|
return nil
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2023-03-24 15:56:27 +01:00
|
|
|
if cluster.SubClusters[0].Nodes == "*" {
|
2022-09-05 17:46:38 +02:00
|
|
|
job.SubCluster = cluster.SubClusters[0].Name
|
|
|
|
return nil
|
|
|
|
}
|
|
|
|
|
2023-02-01 11:58:27 +01:00
|
|
|
return fmt.Errorf("ARCHIVE/CLUSTERCONFIG > no subcluster found for cluster %v and host %v", job.Cluster, host0)
|
2022-09-05 17:46:38 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
func GetSubClusterByNode(cluster, hostname string) (string, error) {
|
|
|
|
for sc, nl := range nodeLists[cluster] {
|
|
|
|
if nl != nil && nl.Contains(hostname) {
|
|
|
|
return sc, nil
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
c := GetCluster(cluster)
|
|
|
|
if c == nil {
|
2023-02-01 11:58:27 +01:00
|
|
|
return "", fmt.Errorf("ARCHIVE/CLUSTERCONFIG > unkown cluster: %v", cluster)
|
2022-09-05 17:46:38 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
if c.SubClusters[0].Nodes == "" {
|
|
|
|
return c.SubClusters[0].Name, nil
|
|
|
|
}
|
|
|
|
|
2023-02-01 11:58:27 +01:00
|
|
|
return "", fmt.Errorf("ARCHIVE/CLUSTERCONFIG > no subcluster found for cluster %v and host %v", cluster, hostname)
|
2022-09-05 17:46:38 +02:00
|
|
|
}
|
2024-08-29 07:26:49 +02:00
|
|
|
|
|
|
|
func MetricIndex(mc []schema.MetricConfig, name string) (int, error) {
|
|
|
|
for i, m := range mc {
|
|
|
|
if m.Name == name {
|
|
|
|
return i, nil
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2024-09-27 13:45:44 +02:00
|
|
|
return 0, fmt.Errorf("unknown metric name %s", name)
|
2024-08-29 07:26:49 +02:00
|
|
|
}
|