From bd89ce7cc92ebb520efc3e13e33e7461e6529822 Mon Sep 17 00:00:00 2001 From: Jan Eitzinger Date: Tue, 2 Jul 2024 10:13:11 +0200 Subject: [PATCH] Extend schema and start Unit test implementation Does not compile and work yet --- pkg/archive/clusterConfig.go | 39 +- pkg/archive/clusterConfig_test.go | 15 + pkg/archive/fsBackend_test.go | 5 +- .../testdata/archive/alex/cluster.json | 2774 +++++++++++++++++ .../testdata/archive/fritz/cluster.json | 746 +++++ pkg/schema/cluster.go | 22 +- 6 files changed, 3580 insertions(+), 21 deletions(-) create mode 100644 pkg/archive/clusterConfig_test.go create mode 100644 pkg/archive/testdata/archive/alex/cluster.json create mode 100644 pkg/archive/testdata/archive/fritz/cluster.json diff --git a/pkg/archive/clusterConfig.go b/pkg/archive/clusterConfig.go index d0bf397..14c9fd9 100644 --- a/pkg/archive/clusterConfig.go +++ b/pkg/archive/clusterConfig.go @@ -12,11 +12,12 @@ import ( "github.com/ClusterCockpit/cc-backend/pkg/schema" ) -var Clusters []*schema.Cluster -var nodeLists map[string]map[string]NodeList +var ( + Clusters []*schema.Cluster + nodeLists map[string]map[string]NodeList +) func initClusterConfig() error { - Clusters = []*schema.Cluster{} nodeLists = map[string]map[string]NodeList{} @@ -49,6 +50,32 @@ func initClusterConfig() error { if !mc.Scope.Valid() { return errors.New("cluster.metricConfig.scope must be a valid scope ('node', 'scocket', ...)") } + + scLookup := make(map[string]*schema.SubClusterConfig) + + for _, scc := range mc.SubClusters { + scLookup[scc.Name] = scc + } + + for _, sc := range cluster.SubClusters { + newMetric := mc + + if cfg, ok := scLookup[sc.Name]; ok { + if !cfg.Remove { + newMetric.Peak = cfg.Peak + newMetric.Peak = cfg.Peak + newMetric.Normal = cfg.Normal + newMetric.Caution = cfg.Caution + newMetric.Alert = cfg.Alert + newMetric.Footprint = cfg.Footprint + sc.MetricConfig = append(sc.MetricConfig, *newMetric) + } + } + + if newMetric.Footprint { + sc.Footprint = append(sc.Footprint, newMetric.Name) + } + } } Clusters = append(Clusters, cluster) @@ -71,7 +98,6 @@ func initClusterConfig() error { } func GetCluster(cluster string) *schema.Cluster { - for _, c := range Clusters { if c.Name == cluster { return c @@ -90,11 +116,10 @@ func GetSubCluster(cluster, subcluster string) (*schema.SubCluster, error) { } } } - return nil, fmt.Errorf("Subcluster '%v' not found for cluster '%v', or cluster '%v' not configured!", subcluster, cluster, cluster) + return nil, fmt.Errorf("subcluster '%v' not found for cluster '%v', or cluster '%v' not configured", subcluster, cluster, cluster) } func GetMetricConfig(cluster, metric string) *schema.MetricConfig { - for _, c := range Clusters { if c.Name == cluster { for _, m := range c.MetricConfig { @@ -110,7 +135,6 @@ func GetMetricConfig(cluster, metric string) *schema.MetricConfig { // AssignSubCluster sets the `job.subcluster` property of the job based // on its cluster and resources. func AssignSubCluster(job *schema.BaseJob) error { - cluster := GetCluster(job.Cluster) if cluster == nil { return fmt.Errorf("ARCHIVE/CLUSTERCONFIG > unkown cluster: %v", job.Cluster) @@ -146,7 +170,6 @@ func AssignSubCluster(job *schema.BaseJob) error { } func GetSubClusterByNode(cluster, hostname string) (string, error) { - for sc, nl := range nodeLists[cluster] { if nl != nil && nl.Contains(hostname) { return sc, nil diff --git a/pkg/archive/clusterConfig_test.go b/pkg/archive/clusterConfig_test.go new file mode 100644 index 0000000..b4dc6ef --- /dev/null +++ b/pkg/archive/clusterConfig_test.go @@ -0,0 +1,15 @@ +// Copyright (C) NHR@FAU, University Erlangen-Nuremberg. +// All rights reserved. +// Use of this source code is governed by a MIT-style +// license that can be found in the LICENSE file. +package archive + +import ( + "encoding/json" + "testing" +) + +func TestClusterConfig(t *testing.T) { + var fsa FsArchive + version, err := fsa.Init(json.RawMessage("{\"path\":\"testdata/archive\"}")) +} diff --git a/pkg/archive/fsBackend_test.go b/pkg/archive/fsBackend_test.go index 5e0a06c..78d634a 100644 --- a/pkg/archive/fsBackend_test.go +++ b/pkg/archive/fsBackend_test.go @@ -30,6 +30,7 @@ func TestInitNoJson(t *testing.T) { t.Fatal(err) } } + func TestInitNotExists(t *testing.T) { var fsa FsArchive _, err := fsa.Init(json.RawMessage("{\"path\":\"testdata/job-archive\"}")) @@ -50,7 +51,7 @@ func TestInit(t *testing.T) { if version != 1 { t.Fail() } - if len(fsa.clusters) != 1 || fsa.clusters[0] != "emmy" { + if len(fsa.clusters) != 3 || fsa.clusters[0] != "emmy" { t.Fail() } } @@ -133,7 +134,6 @@ func TestLoadJobData(t *testing.T) { } func BenchmarkLoadJobData(b *testing.B) { - tmpdir := b.TempDir() jobarchive := filepath.Join(tmpdir, "job-archive") util.CopyDir("./testdata/archive/", jobarchive) @@ -157,7 +157,6 @@ func BenchmarkLoadJobData(b *testing.B) { } func BenchmarkLoadJobDataCompressed(b *testing.B) { - tmpdir := b.TempDir() jobarchive := filepath.Join(tmpdir, "job-archive") util.CopyDir("./testdata/archive/", jobarchive) diff --git a/pkg/archive/testdata/archive/alex/cluster.json b/pkg/archive/testdata/archive/alex/cluster.json new file mode 100644 index 0000000..5a04a78 --- /dev/null +++ b/pkg/archive/testdata/archive/alex/cluster.json @@ -0,0 +1,2774 @@ +{ + "name": "alex", + "metricConfig": [ + { + "name": "cpu_load", + "unit": { + "base": "" + }, + "scope": "node", + "aggregation": "avg", + "timestep": 60, + "peak": 128, + "normal": 128, + "caution": 10, + "alert": 5 + }, + { + "name": "cpu_user", + "unit": { + "base": "" + }, + "scope": "hwthread", + "aggregation": "avg", + "timestep": 60, + "peak": 100, + "normal": 50, + "caution": 20, + "alert": 10 + }, + { + "name": "mem_used", + "unit": { + "base": "B", + "prefix": "G" + }, + "scope": "node", + "aggregation": "sum", + "timestep": 60, + "peak": 512, + "normal": 128, + "caution": 200, + "alert": 240 + }, + { + "name": "flops_any", + "unit": { + "base": "F/s", + "prefix": "G" + }, + "scope": "hwthread", + "aggregation": "sum", + "timestep": 60, + "peak": 9216, + "normal": 1000, + "caution": 200, + "alert": 50 + }, + { + "name": "mem_bw", + "unit": { + "base": "B/s", + "prefix": "G" + }, + "scope": "socket", + "aggregation": "sum", + "timestep": 60, + "peak": 350, + "normal": 100, + "caution": 50, + "alert": 10 + }, + { + "name": "clock", + "unit": { + "base": "Hz", + "prefix": "M" + }, + "scope": "hwthread", + "aggregation": "avg", + "timestep": 60, + "peak": 3000, + "normal": 2400, + "caution": 1800, + "alert": 1200 + }, + { + "name": "core_power", + "unit": { + "base": "W" + }, + "scope": "hwthread", + "aggregation": "sum", + "timestep": 60, + "peak": 500, + "normal": 250, + "caution": 100, + "alert": 50 + }, + { + "name": "acc_utilization", + "unit": { + "base": "" + }, + "scope": "accelerator", + "aggregation": "avg", + "timestep": 60, + "peak": 100, + "normal": 80, + "caution": 50, + "alert": 20 + }, + { + "name": "acc_mem_used", + "unit": { + "base": "B", + "prefix": "G" + }, + "scope": "accelerator", + "aggregation": "sum", + "timestep": 60, + "peak": 40, + "normal": 20, + "caution": 10, + "alert": 5 + }, + { + "name": "acc_power", + "unit": { + "base": "W" + }, + "scope": "accelerator", + "aggregation": "sum", + "timestep": 60, + "peak": 400, + "normal": 200, + "caution": 50, + "alert": 20 + }, + { + "name": "nv_mem_util", + "unit": { + "base": "" + }, + "scope": "accelerator", + "aggregation": "avg", + "timestep": 60, + "peak": 100, + "normal": 80, + "caution": 20, + "alert": 10 + }, + { + "name": "nv_temp", + "unit": { + "base": "B", + "prefix": "G" + }, + "scope": "accelerator", + "aggregation": "sum", + "timestep": 60, + "peak": 40, + "normal": 20, + "caution": 5, + "alert": 2 + }, + { + "name": "nv_sm_clock", + "unit": { + "base": "Hz", + "prefix": "M" + }, + "scope": "accelerator", + "aggregation": "sum", + "timestep": 60, + "peak": 1400, + "normal": 1200, + "caution": 100, + "alert": 50 + }, + { + "name": "cpu_power", + "unit": { + "base": "W" + }, + "scope": "socket", + "aggregation": "sum", + "timestep": 60, + "peak": 500, + "normal": 250, + "caution": 100, + "alert": 50 + }, + { + "name": "ipc", + "unit": { + "base": "IPC" + }, + "scope": "hwthread", + "aggregation": "avg", + "timestep": 60, + "peak": 4, + "normal": 2, + "caution": 1, + "alert": 0.5 + } + ], + "subClusters": [ + { + "name": "a40", + "nodes": "a[0121-0129],a[0221-0229],a[0321-0329],a[0421-0429],a[0521-0522]", + "processorType": "AMD Milan", + "socketsPerNode": 2, + "coresPerSocket": 64, + "threadsPerCore": 1, + "flopRateScalar": { + "unit": { + "base": "F/s", + "prefix": "G" + }, + "value": 432 + }, + "flopRateSimd": { + "unit": { + "base": "F/s", + "prefix": "G" + }, + "value": 9216 + }, + "memoryBandwidth": { + "unit": { + "base": "B/s", + "prefix": "G" + }, + "value": 400 + }, + "topology": { + "node": [ + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 11, + 12, + 13, + 14, + 15, + 16, + 17, + 18, + 19, + 20, + 21, + 22, + 23, + 24, + 25, + 26, + 27, + 28, + 29, + 30, + 31, + 32, + 33, + 34, + 35, + 36, + 37, + 38, + 39, + 40, + 41, + 42, + 43, + 44, + 45, + 46, + 47, + 48, + 49, + 50, + 51, + 52, + 53, + 54, + 55, + 56, + 57, + 58, + 59, + 60, + 61, + 62, + 63, + 64, + 65, + 66, + 67, + 68, + 69, + 70, + 71, + 72, + 73, + 74, + 75, + 76, + 77, + 78, + 79, + 80, + 81, + 82, + 83, + 84, + 85, + 86, + 87, + 88, + 89, + 90, + 91, + 92, + 93, + 94, + 95, + 96, + 97, + 98, + 99, + 100, + 101, + 102, + 103, + 104, + 105, + 106, + 107, + 108, + 109, + 110, + 111, + 112, + 113, + 114, + 115, + 116, + 117, + 118, + 119, + 120, + 121, + 122, + 123, + 124, + 125, + 126, + 127 + ], + "socket": [ + [ + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 11, + 12, + 13, + 14, + 15, + 16, + 17, + 18, + 19, + 20, + 21, + 22, + 23, + 24, + 25, + 26, + 27, + 28, + 29, + 30, + 31, + 32, + 33, + 34, + 35, + 36, + 37, + 38, + 39, + 40, + 41, + 42, + 43, + 44, + 45, + 46, + 47, + 48, + 49, + 50, + 51, + 52, + 53, + 54, + 55, + 56, + 57, + 58, + 59, + 60, + 61, + 62, + 63 + ], + [ + 64, + 65, + 66, + 67, + 68, + 69, + 70, + 71, + 72, + 73, + 74, + 75, + 76, + 77, + 78, + 79, + 80, + 81, + 82, + 83, + 84, + 85, + 86, + 87, + 88, + 89, + 90, + 91, + 92, + 93, + 94, + 95, + 96, + 97, + 98, + 99, + 100, + 101, + 102, + 103, + 104, + 105, + 106, + 107, + 108, + 109, + 110, + 111, + 112, + 113, + 114, + 115, + 116, + 117, + 118, + 119, + 120, + 121, + 122, + 123, + 124, + 125, + 126, + 127 + ] + ], + "memoryDomain": [ + [ + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 11, + 12, + 13, + 14, + 15, + 16, + 17, + 18, + 19, + 20, + 21, + 22, + 23, + 24, + 25, + 26, + 27, + 28, + 29, + 30, + 31, + 32, + 33, + 34, + 35, + 36, + 37, + 38, + 39, + 40, + 41, + 42, + 43, + 44, + 45, + 46, + 47, + 48, + 49, + 50, + 51, + 52, + 53, + 54, + 55, + 56, + 57, + 58, + 59, + 60, + 61, + 62, + 63, + 64, + 65, + 66, + 67, + 68, + 69, + 70, + 71, + 72, + 73, + 74, + 75, + 76, + 77, + 78, + 79, + 80, + 81, + 82, + 83, + 84, + 85, + 86, + 87, + 88, + 89, + 90, + 91, + 92, + 93, + 94, + 95, + 96, + 97, + 98, + 99, + 100, + 101, + 102, + 103, + 104, + 105, + 106, + 107, + 108, + 109, + 110, + 111, + 112, + 113, + 114, + 115, + 116, + 117, + 118, + 119, + 120, + 121, + 122, + 123, + 124, + 125, + 126, + 127 + ] + ], + "core": [ + [ + 0 + ], + [ + 1 + ], + [ + 2 + ], + [ + 3 + ], + [ + 4 + ], + [ + 5 + ], + [ + 6 + ], + [ + 7 + ], + [ + 8 + ], + [ + 9 + ], + [ + 10 + ], + [ + 11 + ], + [ + 12 + ], + [ + 13 + ], + [ + 14 + ], + [ + 15 + ], + [ + 16 + ], + [ + 17 + ], + [ + 18 + ], + [ + 19 + ], + [ + 20 + ], + [ + 21 + ], + [ + 22 + ], + [ + 23 + ], + [ + 24 + ], + [ + 25 + ], + [ + 26 + ], + [ + 27 + ], + [ + 28 + ], + [ + 29 + ], + [ + 30 + ], + [ + 31 + ], + [ + 32 + ], + [ + 33 + ], + [ + 34 + ], + [ + 35 + ], + [ + 36 + ], + [ + 37 + ], + [ + 38 + ], + [ + 39 + ], + [ + 40 + ], + [ + 41 + ], + [ + 42 + ], + [ + 43 + ], + [ + 44 + ], + [ + 45 + ], + [ + 46 + ], + [ + 47 + ], + [ + 48 + ], + [ + 49 + ], + [ + 50 + ], + [ + 51 + ], + [ + 52 + ], + [ + 53 + ], + [ + 54 + ], + [ + 55 + ], + [ + 56 + ], + [ + 57 + ], + [ + 58 + ], + [ + 59 + ], + [ + 60 + ], + [ + 61 + ], + [ + 62 + ], + [ + 63 + ], + [ + 64 + ], + [ + 65 + ], + [ + 66 + ], + [ + 67 + ], + [ + 68 + ], + [ + 69 + ], + [ + 70 + ], + [ + 71 + ], + [ + 72 + ], + [ + 73 + ], + [ + 74 + ], + [ + 75 + ], + [ + 76 + ], + [ + 77 + ], + [ + 78 + ], + [ + 79 + ], + [ + 80 + ], + [ + 81 + ], + [ + 82 + ], + [ + 83 + ], + [ + 84 + ], + [ + 85 + ], + [ + 86 + ], + [ + 87 + ], + [ + 88 + ], + [ + 89 + ], + [ + 90 + ], + [ + 91 + ], + [ + 92 + ], + [ + 93 + ], + [ + 94 + ], + [ + 95 + ], + [ + 96 + ], + [ + 97 + ], + [ + 98 + ], + [ + 99 + ], + [ + 100 + ], + [ + 101 + ], + [ + 102 + ], + [ + 103 + ], + [ + 104 + ], + [ + 105 + ], + [ + 106 + ], + [ + 107 + ], + [ + 108 + ], + [ + 109 + ], + [ + 110 + ], + [ + 111 + ], + [ + 112 + ], + [ + 113 + ], + [ + 114 + ], + [ + 115 + ], + [ + 116 + ], + [ + 117 + ], + [ + 118 + ], + [ + 119 + ], + [ + 120 + ], + [ + 121 + ], + [ + 122 + ], + [ + 123 + ], + [ + 124 + ], + [ + 125 + ], + [ + 126 + ], + [ + 127 + ] + ], + "accelerators": [ + { + "id": "00000000:01:00.0", + "type": "Nvidia GPU", + "model": "A40" + }, + { + "id": "00000000:25:00.0", + "type": "Nvidia GPU", + "model": "A40" + }, + { + "id": "00000000:41:00.0", + "type": "Nvidia GPU", + "model": "A40" + }, + { + "id": "00000000:61:00.0", + "type": "Nvidia GPU", + "model": "A40" + }, + { + "id": "00000000:81:00.0", + "type": "Nvidia GPU", + "model": "A40" + }, + { + "id": "00000000:A1:00.0", + "type": "Nvidia GPU", + "model": "A40" + }, + { + "id": "00000000:C1:00.0", + "type": "Nvidia GPU", + "model": "A40" + }, + { + "id": "00000000:E1:00.0", + "type": "Nvidia GPU", + "model": "A40" + } + ] + } + }, + { + "name": "a100", + "nodes": "a[0601-0605],a[0701-0705],a[0801-0805],a[0901-0905]", + "processorType": "AMD Milan", + "socketsPerNode": 2, + "coresPerSocket": 64, + "threadsPerCore": 1, + "flopRateScalar": { + "unit": { + "base": "F/s", + "prefix": "G" + }, + "value": 432 + }, + "flopRateSimd": { + "unit": { + "base": "F/s", + "prefix": "G" + }, + "value": 9216 + }, + "memoryBandwidth": { + "unit": { + "base": "B/s", + "prefix": "G" + }, + "value": 400 + }, + "topology": { + "node": [ + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 11, + 12, + 13, + 14, + 15, + 16, + 17, + 18, + 19, + 20, + 21, + 22, + 23, + 24, + 25, + 26, + 27, + 28, + 29, + 30, + 31, + 32, + 33, + 34, + 35, + 36, + 37, + 38, + 39, + 40, + 41, + 42, + 43, + 44, + 45, + 46, + 47, + 48, + 49, + 50, + 51, + 52, + 53, + 54, + 55, + 56, + 57, + 58, + 59, + 60, + 61, + 62, + 63, + 64, + 65, + 66, + 67, + 68, + 69, + 70, + 71, + 72, + 73, + 74, + 75, + 76, + 77, + 78, + 79, + 80, + 81, + 82, + 83, + 84, + 85, + 86, + 87, + 88, + 89, + 90, + 91, + 92, + 93, + 94, + 95, + 96, + 97, + 98, + 99, + 100, + 101, + 102, + 103, + 104, + 105, + 106, + 107, + 108, + 109, + 110, + 111, + 112, + 113, + 114, + 115, + 116, + 117, + 118, + 119, + 120, + 121, + 122, + 123, + 124, + 125, + 126, + 127 + ], + "socket": [ + [ + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 11, + 12, + 13, + 14, + 15, + 16, + 17, + 18, + 19, + 20, + 21, + 22, + 23, + 24, + 25, + 26, + 27, + 28, + 29, + 30, + 31, + 32, + 33, + 34, + 35, + 36, + 37, + 38, + 39, + 40, + 41, + 42, + 43, + 44, + 45, + 46, + 47, + 48, + 49, + 50, + 51, + 52, + 53, + 54, + 55, + 56, + 57, + 58, + 59, + 60, + 61, + 62, + 63 + ], + [ + 64, + 65, + 66, + 67, + 68, + 69, + 70, + 71, + 72, + 73, + 74, + 75, + 76, + 77, + 78, + 79, + 80, + 81, + 82, + 83, + 84, + 85, + 86, + 87, + 88, + 89, + 90, + 91, + 92, + 93, + 94, + 95, + 96, + 97, + 98, + 99, + 100, + 101, + 102, + 103, + 104, + 105, + 106, + 107, + 108, + 109, + 110, + 111, + 112, + 113, + 114, + 115, + 116, + 117, + 118, + 119, + 120, + 121, + 122, + 123, + 124, + 125, + 126, + 127 + ] + ], + "memoryDomain": [ + [ + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 11, + 12, + 13, + 14, + 15, + 16, + 17, + 18, + 19, + 20, + 21, + 22, + 23, + 24, + 25, + 26, + 27, + 28, + 29, + 30, + 31, + 32, + 33, + 34, + 35, + 36, + 37, + 38, + 39, + 40, + 41, + 42, + 43, + 44, + 45, + 46, + 47, + 48, + 49, + 50, + 51, + 52, + 53, + 54, + 55, + 56, + 57, + 58, + 59, + 60, + 61, + 62, + 63, + 64, + 65, + 66, + 67, + 68, + 69, + 70, + 71, + 72, + 73, + 74, + 75, + 76, + 77, + 78, + 79, + 80, + 81, + 82, + 83, + 84, + 85, + 86, + 87, + 88, + 89, + 90, + 91, + 92, + 93, + 94, + 95, + 96, + 97, + 98, + 99, + 100, + 101, + 102, + 103, + 104, + 105, + 106, + 107, + 108, + 109, + 110, + 111, + 112, + 113, + 114, + 115, + 116, + 117, + 118, + 119, + 120, + 121, + 122, + 123, + 124, + 125, + 126, + 127 + ] + ], + "core": [ + [ + 0 + ], + [ + 1 + ], + [ + 2 + ], + [ + 3 + ], + [ + 4 + ], + [ + 5 + ], + [ + 6 + ], + [ + 7 + ], + [ + 8 + ], + [ + 9 + ], + [ + 10 + ], + [ + 11 + ], + [ + 12 + ], + [ + 13 + ], + [ + 14 + ], + [ + 15 + ], + [ + 16 + ], + [ + 17 + ], + [ + 18 + ], + [ + 19 + ], + [ + 20 + ], + [ + 21 + ], + [ + 22 + ], + [ + 23 + ], + [ + 24 + ], + [ + 25 + ], + [ + 26 + ], + [ + 27 + ], + [ + 28 + ], + [ + 29 + ], + [ + 30 + ], + [ + 31 + ], + [ + 32 + ], + [ + 33 + ], + [ + 34 + ], + [ + 35 + ], + [ + 36 + ], + [ + 37 + ], + [ + 38 + ], + [ + 39 + ], + [ + 40 + ], + [ + 41 + ], + [ + 42 + ], + [ + 43 + ], + [ + 44 + ], + [ + 45 + ], + [ + 46 + ], + [ + 47 + ], + [ + 48 + ], + [ + 49 + ], + [ + 50 + ], + [ + 51 + ], + [ + 52 + ], + [ + 53 + ], + [ + 54 + ], + [ + 55 + ], + [ + 56 + ], + [ + 57 + ], + [ + 58 + ], + [ + 59 + ], + [ + 60 + ], + [ + 61 + ], + [ + 62 + ], + [ + 63 + ], + [ + 64 + ], + [ + 65 + ], + [ + 66 + ], + [ + 67 + ], + [ + 68 + ], + [ + 69 + ], + [ + 70 + ], + [ + 71 + ], + [ + 72 + ], + [ + 73 + ], + [ + 74 + ], + [ + 75 + ], + [ + 76 + ], + [ + 77 + ], + [ + 78 + ], + [ + 79 + ], + [ + 80 + ], + [ + 81 + ], + [ + 82 + ], + [ + 83 + ], + [ + 84 + ], + [ + 85 + ], + [ + 86 + ], + [ + 87 + ], + [ + 88 + ], + [ + 89 + ], + [ + 90 + ], + [ + 91 + ], + [ + 92 + ], + [ + 93 + ], + [ + 94 + ], + [ + 95 + ], + [ + 96 + ], + [ + 97 + ], + [ + 98 + ], + [ + 99 + ], + [ + 100 + ], + [ + 101 + ], + [ + 102 + ], + [ + 103 + ], + [ + 104 + ], + [ + 105 + ], + [ + 106 + ], + [ + 107 + ], + [ + 108 + ], + [ + 109 + ], + [ + 110 + ], + [ + 111 + ], + [ + 112 + ], + [ + 113 + ], + [ + 114 + ], + [ + 115 + ], + [ + 116 + ], + [ + 117 + ], + [ + 118 + ], + [ + 119 + ], + [ + 120 + ], + [ + 121 + ], + [ + 122 + ], + [ + 123 + ], + [ + 124 + ], + [ + 125 + ], + [ + 126 + ], + [ + 127 + ] + ], + "accelerators": [ + { + "id": "00000000:0E:00.0", + "type": "Nvidia GPU", + "model": "A100" + }, + { + "id": "00000000:13:00.0", + "type": "Nvidia GPU", + "model": "A100" + }, + { + "id": "00000000:49:00.0", + "type": "Nvidia GPU", + "model": "A100" + }, + { + "id": "00000000:4F:00.0", + "type": "Nvidia GPU", + "model": "A100" + }, + { + "id": "00000000:90:00.0", + "type": "Nvidia GPU", + "model": "A100" + }, + { + "id": "00000000:96:00.0", + "type": "Nvidia GPU", + "model": "A100" + }, + { + "id": "00000000:CC:00.0", + "type": "Nvidia GPU", + "model": "A100" + }, + { + "id": "00000000:D1:00.0", + "type": "Nvidia GPU", + "model": "A100" + } + ] + } + }, + { + "name": "a100m80", + "nodes": "a[0531-0537],a0831,a[0931-0934]", + "processorType": "AMD Milan", + "socketsPerNode": 2, + "coresPerSocket": 64, + "threadsPerCore": 1, + "flopRateScalar": { + "unit": { + "base": "F/s", + "prefix": "G" + }, + "value": 432 + }, + "flopRateSimd": { + "unit": { + "base": "F/s", + "prefix": "G" + }, + "value": 9216 + }, + "memoryBandwidth": { + "unit": { + "base": "B/s", + "prefix": "G" + }, + "value": 400 + }, + "topology": { + "node": [ + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 11, + 12, + 13, + 14, + 15, + 16, + 17, + 18, + 19, + 20, + 21, + 22, + 23, + 24, + 25, + 26, + 27, + 28, + 29, + 30, + 31, + 32, + 33, + 34, + 35, + 36, + 37, + 38, + 39, + 40, + 41, + 42, + 43, + 44, + 45, + 46, + 47, + 48, + 49, + 50, + 51, + 52, + 53, + 54, + 55, + 56, + 57, + 58, + 59, + 60, + 61, + 62, + 63, + 64, + 65, + 66, + 67, + 68, + 69, + 70, + 71, + 72, + 73, + 74, + 75, + 76, + 77, + 78, + 79, + 80, + 81, + 82, + 83, + 84, + 85, + 86, + 87, + 88, + 89, + 90, + 91, + 92, + 93, + 94, + 95, + 96, + 97, + 98, + 99, + 100, + 101, + 102, + 103, + 104, + 105, + 106, + 107, + 108, + 109, + 110, + 111, + 112, + 113, + 114, + 115, + 116, + 117, + 118, + 119, + 120, + 121, + 122, + 123, + 124, + 125, + 126, + 127 + ], + "socket": [ + [ + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 11, + 12, + 13, + 14, + 15, + 16, + 17, + 18, + 19, + 20, + 21, + 22, + 23, + 24, + 25, + 26, + 27, + 28, + 29, + 30, + 31, + 32, + 33, + 34, + 35, + 36, + 37, + 38, + 39, + 40, + 41, + 42, + 43, + 44, + 45, + 46, + 47, + 48, + 49, + 50, + 51, + 52, + 53, + 54, + 55, + 56, + 57, + 58, + 59, + 60, + 61, + 62, + 63 + ], + [ + 64, + 65, + 66, + 67, + 68, + 69, + 70, + 71, + 72, + 73, + 74, + 75, + 76, + 77, + 78, + 79, + 80, + 81, + 82, + 83, + 84, + 85, + 86, + 87, + 88, + 89, + 90, + 91, + 92, + 93, + 94, + 95, + 96, + 97, + 98, + 99, + 100, + 101, + 102, + 103, + 104, + 105, + 106, + 107, + 108, + 109, + 110, + 111, + 112, + 113, + 114, + 115, + 116, + 117, + 118, + 119, + 120, + 121, + 122, + 123, + 124, + 125, + 126, + 127 + ] + ], + "memoryDomain": [ + [ + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 11, + 12, + 13, + 14, + 15, + 16, + 17, + 18, + 19, + 20, + 21, + 22, + 23, + 24, + 25, + 26, + 27, + 28, + 29, + 30, + 31, + 32, + 33, + 34, + 35, + 36, + 37, + 38, + 39, + 40, + 41, + 42, + 43, + 44, + 45, + 46, + 47, + 48, + 49, + 50, + 51, + 52, + 53, + 54, + 55, + 56, + 57, + 58, + 59, + 60, + 61, + 62, + 63, + 64, + 65, + 66, + 67, + 68, + 69, + 70, + 71, + 72, + 73, + 74, + 75, + 76, + 77, + 78, + 79, + 80, + 81, + 82, + 83, + 84, + 85, + 86, + 87, + 88, + 89, + 90, + 91, + 92, + 93, + 94, + 95, + 96, + 97, + 98, + 99, + 100, + 101, + 102, + 103, + 104, + 105, + 106, + 107, + 108, + 109, + 110, + 111, + 112, + 113, + 114, + 115, + 116, + 117, + 118, + 119, + 120, + 121, + 122, + 123, + 124, + 125, + 126, + 127 + ] + ], + "core": [ + [ + 0 + ], + [ + 1 + ], + [ + 2 + ], + [ + 3 + ], + [ + 4 + ], + [ + 5 + ], + [ + 6 + ], + [ + 7 + ], + [ + 8 + ], + [ + 9 + ], + [ + 10 + ], + [ + 11 + ], + [ + 12 + ], + [ + 13 + ], + [ + 14 + ], + [ + 15 + ], + [ + 16 + ], + [ + 17 + ], + [ + 18 + ], + [ + 19 + ], + [ + 20 + ], + [ + 21 + ], + [ + 22 + ], + [ + 23 + ], + [ + 24 + ], + [ + 25 + ], + [ + 26 + ], + [ + 27 + ], + [ + 28 + ], + [ + 29 + ], + [ + 30 + ], + [ + 31 + ], + [ + 32 + ], + [ + 33 + ], + [ + 34 + ], + [ + 35 + ], + [ + 36 + ], + [ + 37 + ], + [ + 38 + ], + [ + 39 + ], + [ + 40 + ], + [ + 41 + ], + [ + 42 + ], + [ + 43 + ], + [ + 44 + ], + [ + 45 + ], + [ + 46 + ], + [ + 47 + ], + [ + 48 + ], + [ + 49 + ], + [ + 50 + ], + [ + 51 + ], + [ + 52 + ], + [ + 53 + ], + [ + 54 + ], + [ + 55 + ], + [ + 56 + ], + [ + 57 + ], + [ + 58 + ], + [ + 59 + ], + [ + 60 + ], + [ + 61 + ], + [ + 62 + ], + [ + 63 + ], + [ + 64 + ], + [ + 65 + ], + [ + 66 + ], + [ + 67 + ], + [ + 68 + ], + [ + 69 + ], + [ + 70 + ], + [ + 71 + ], + [ + 72 + ], + [ + 73 + ], + [ + 74 + ], + [ + 75 + ], + [ + 76 + ], + [ + 77 + ], + [ + 78 + ], + [ + 79 + ], + [ + 80 + ], + [ + 81 + ], + [ + 82 + ], + [ + 83 + ], + [ + 84 + ], + [ + 85 + ], + [ + 86 + ], + [ + 87 + ], + [ + 88 + ], + [ + 89 + ], + [ + 90 + ], + [ + 91 + ], + [ + 92 + ], + [ + 93 + ], + [ + 94 + ], + [ + 95 + ], + [ + 96 + ], + [ + 97 + ], + [ + 98 + ], + [ + 99 + ], + [ + 100 + ], + [ + 101 + ], + [ + 102 + ], + [ + 103 + ], + [ + 104 + ], + [ + 105 + ], + [ + 106 + ], + [ + 107 + ], + [ + 108 + ], + [ + 109 + ], + [ + 110 + ], + [ + 111 + ], + [ + 112 + ], + [ + 113 + ], + [ + 114 + ], + [ + 115 + ], + [ + 116 + ], + [ + 117 + ], + [ + 118 + ], + [ + 119 + ], + [ + 120 + ], + [ + 121 + ], + [ + 122 + ], + [ + 123 + ], + [ + 124 + ], + [ + 125 + ], + [ + 126 + ], + [ + 127 + ] + ], + "accelerators": [ + { + "id": "00000000:0E:00.0", + "type": "Nvidia GPU", + "model": "A100" + }, + { + "id": "00000000:13:00.0", + "type": "Nvidia GPU", + "model": "A100" + }, + { + "id": "00000000:49:00.0", + "type": "Nvidia GPU", + "model": "A100" + }, + { + "id": "00000000:4F:00.0", + "type": "Nvidia GPU", + "model": "A100" + }, + { + "id": "00000000:90:00.0", + "type": "Nvidia GPU", + "model": "A100" + }, + { + "id": "00000000:96:00.0", + "type": "Nvidia GPU", + "model": "A100" + }, + { + "id": "00000000:CC:00.0", + "type": "Nvidia GPU", + "model": "A100" + }, + { + "id": "00000000:D1:00.0", + "type": "Nvidia GPU", + "model": "A100" + } + ] + } + } + ] +} diff --git a/pkg/archive/testdata/archive/fritz/cluster.json b/pkg/archive/testdata/archive/fritz/cluster.json new file mode 100644 index 0000000..23a8343 --- /dev/null +++ b/pkg/archive/testdata/archive/fritz/cluster.json @@ -0,0 +1,746 @@ +{ + "name": "fritz", + "metricConfig": [ + { + "name": "cpu_load", + "unit": { + "base": "" + }, + "scope": "node", + "aggregation": "avg", + "timestep": 60, + "peak": 72, + "normal": 72, + "caution": 36, + "alert": 20 + }, + { + "name": "cpu_user", + "unit": { + "base": "" + }, + "scope": "hwthread", + "aggregation": "avg", + "timestep": 60, + "peak": 100, + "normal": 50, + "caution": 20, + "alert": 10 + }, + { + "name": "mem_used", + "unit": { + "base": "B", + "prefix": "G" + }, + "scope": "node", + "aggregation": "sum", + "timestep": 60, + "peak": 256, + "normal": 128, + "caution": 200, + "alert": 240 + }, + { + "name": "flops_any", + "unit": { + "base": "F/s", + "prefix": "G" + }, + "scope": "hwthread", + "aggregation": "sum", + "timestep": 60, + "peak": 5600, + "normal": 1000, + "caution": 200, + "alert": 50 + }, + { + "name": "flops_sp", + "unit": { + "base": "F/s", + "prefix": "G" + }, + "scope": "hwthread", + "aggregation": "sum", + "timestep": 60, + "peak": 5600, + "normal": 1000, + "caution": 200, + "alert": 50 + }, + { + "name": "flops_dp", + "unit": { + "base": "F/s", + "prefix": "G" + }, + "scope": "hwthread", + "aggregation": "sum", + "timestep": 60, + "peak": 2300, + "normal": 500, + "caution": 100, + "alert": 50 + }, + { + "name": "mem_bw", + "unit": { + "base": "B/s", + "prefix": "G" + }, + "scope": "socket", + "aggregation": "sum", + "timestep": 60, + "peak": 350, + "normal": 100, + "caution": 50, + "alert": 10 + }, + { + "name": "clock", + "unit": { + "base": "Hz", + "prefix": "M" + }, + "scope": "hwthread", + "aggregation": "avg", + "timestep": 60, + "peak": 3000, + "normal": 2400, + "caution": 1800, + "alert": 1200 + }, + { + "name": "cpu_power", + "unit": { + "base": "W" + }, + "scope": "socket", + "aggregation": "sum", + "timestep": 60, + "peak": 500, + "normal": 250, + "caution": 100, + "alert": 50 + }, + { + "name": "mem_power", + "unit": { + "base": "W" + }, + "scope": "socket", + "aggregation": "sum", + "timestep": 60, + "peak": 100, + "normal": 50, + "caution": 20, + "alert": 10 + }, + { + "name": "ipc", + "unit": { + "base": "IPC" + }, + "scope": "hwthread", + "aggregation": "avg", + "timestep": 60, + "peak": 4, + "normal": 2, + "caution": 1, + "alert": 0.5 + }, + { + "name": "vectorization_ratio", + "unit": { + "base": "" + }, + "scope": "hwthread", + "aggregation": "avg", + "timestep": 60, + "peak": 100, + "normal": 60, + "caution": 40, + "alert": 10 + }, + { + "name": "ib_recv", + "unit": { + "base": "B/s" + }, + "scope": "node", + "aggregation": "sum", + "timestep": 60, + "peak": 1250000, + "normal": 6000000, + "caution": 200, + "alert": 1 + }, + { + "name": "ib_xmit", + "unit": { + "base": "B/s" + }, + "scope": "node", + "aggregation": "sum", + "timestep": 60, + "peak": 1250000, + "normal": 6000000, + "caution": 200, + "alert": 1 + }, + { + "name": "ib_recv_pkts", + "unit": { + "base": "" + }, + "scope": "node", + "aggregation": "sum", + "timestep": 60, + "peak": 6, + "normal": 4, + "caution": 2, + "alert": 1 + }, + { + "name": "ib_xmit_pkts", + "unit": { + "base": "" + }, + "scope": "node", + "aggregation": "sum", + "timestep": 60, + "peak": 6, + "normal": 4, + "caution": 2, + "alert": 1 + }, + { + "name": "nfs4_read", + "unit": { + "base": "B/s", + "prefix": "M" + }, + "scope": "node", + "aggregation": "sum", + "timestep": 60, + "peak": 6, + "normal": 4, + "caution": 2, + "alert": 1 + }, + { + "name": "nfs4_write", + "unit": { + "base": "B/s", + "prefix": "M" + }, + "scope": "node", + "aggregation": "sum", + "timestep": 60, + "peak": 6, + "normal": 4, + "caution": 2, + "alert": 1 + }, + { + "name": "nfs4_total", + "unit": { + "base": "B/s", + "prefix": "M" + }, + "scope": "node", + "aggregation": "sum", + "timestep": 60, + "peak": 6, + "normal": 4, + "caution": 2, + "alert": 1 + } + ], + "subClusters": [ + { + "name": "main", + "nodes": "f01[01-88],f02[01-88],f03[01-88],f03[01-88],f04[01-88],f05[01-88],f06[01-88],f07[01-88],f08[01-88],f09[01-88],f10[01-88],f11[01-56],f12[01-56]", + "processorType": "Intel Icelake", + "socketsPerNode": 2, + "coresPerSocket": 36, + "threadsPerCore": 1, + "flopRateScalar": { + "unit": { + "base": "F/s", + "prefix": "G" + }, + "value": 432 + }, + "flopRateSimd": { + "unit": { + "base": "F/s", + "prefix": "G" + }, + "value": 9216 + }, + "memoryBandwidth": { + "unit": { + "base": "B/s", + "prefix": "G" + }, + "value": 350 + }, + "topology": { + "node": [ + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 11, + 12, + 13, + 14, + 15, + 16, + 17, + 18, + 19, + 20, + 21, + 22, + 23, + 24, + 25, + 26, + 27, + 28, + 29, + 30, + 31, + 32, + 33, + 34, + 35, + 36, + 37, + 38, + 39, + 40, + 41, + 42, + 43, + 44, + 45, + 46, + 47, + 48, + 49, + 50, + 51, + 52, + 53, + 54, + 55, + 56, + 57, + 58, + 59, + 60, + 61, + 62, + 63, + 64, + 65, + 66, + 67, + 68, + 69, + 70, + 71 + ], + "socket": [ + [ + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 11, + 12, + 13, + 14, + 15, + 16, + 17, + 18, + 19, + 20, + 21, + 22, + 23, + 24, + 25, + 26, + 27, + 28, + 29, + 30, + 31, + 32, + 33, + 34, + 35 + ], + [ + 36, + 37, + 38, + 39, + 40, + 41, + 42, + 43, + 44, + 45, + 46, + 47, + 48, + 49, + 50, + 51, + 52, + 53, + 54, + 55, + 56, + 57, + 58, + 59, + 60, + 61, + 62, + 63, + 64, + 65, + 66, + 67, + 68, + 69, + 70, + 71 + ] + ], + "memoryDomain": [ + [ + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 11, + 12, + 13, + 14, + 15, + 16, + 17 + ], + [ + 18, + 19, + 20, + 21, + 22, + 23, + 24, + 25, + 26, + 27, + 28, + 29, + 30, + 31, + 32, + 33, + 34, + 35 + ], + [ + 36, + 37, + 38, + 39, + 40, + 41, + 42, + 43, + 44, + 45, + 46, + 47, + 48, + 49, + 50, + 51, + 52, + 53 + ], + [ + 54, + 55, + 56, + 57, + 58, + 59, + 60, + 61, + 62, + 63, + 64, + 65, + 66, + 67, + 68, + 69, + 70, + 71 + ] + ], + "core": [ + [ + 0 + ], + [ + 1 + ], + [ + 2 + ], + [ + 3 + ], + [ + 4 + ], + [ + 5 + ], + [ + 6 + ], + [ + 7 + ], + [ + 8 + ], + [ + 9 + ], + [ + 10 + ], + [ + 11 + ], + [ + 12 + ], + [ + 13 + ], + [ + 14 + ], + [ + 15 + ], + [ + 16 + ], + [ + 17 + ], + [ + 18 + ], + [ + 19 + ], + [ + 20 + ], + [ + 21 + ], + [ + 22 + ], + [ + 23 + ], + [ + 24 + ], + [ + 25 + ], + [ + 26 + ], + [ + 27 + ], + [ + 28 + ], + [ + 29 + ], + [ + 30 + ], + [ + 31 + ], + [ + 32 + ], + [ + 33 + ], + [ + 34 + ], + [ + 35 + ], + [ + 36 + ], + [ + 37 + ], + [ + 38 + ], + [ + 39 + ], + [ + 40 + ], + [ + 41 + ], + [ + 42 + ], + [ + 43 + ], + [ + 44 + ], + [ + 45 + ], + [ + 46 + ], + [ + 47 + ], + [ + 48 + ], + [ + 49 + ], + [ + 50 + ], + [ + 51 + ], + [ + 52 + ], + [ + 53 + ], + [ + 54 + ], + [ + 55 + ], + [ + 56 + ], + [ + 57 + ], + [ + 58 + ], + [ + 59 + ], + [ + 60 + ], + [ + 61 + ], + [ + 62 + ], + [ + 63 + ], + [ + 64 + ], + [ + 65 + ], + [ + 66 + ], + [ + 67 + ], + [ + 68 + ], + [ + 69 + ], + [ + 70 + ], + [ + 71 + ] + ] + } + } + ] +} diff --git a/pkg/schema/cluster.go b/pkg/schema/cluster.go index 6edd830..3bd05d9 100644 --- a/pkg/schema/cluster.go +++ b/pkg/schema/cluster.go @@ -30,16 +30,18 @@ type MetricValue struct { } type SubCluster struct { - Name string `json:"name"` - Nodes string `json:"nodes"` - ProcessorType string `json:"processorType"` - Topology Topology `json:"topology"` - FlopRateScalar MetricValue `json:"flopRateScalar"` - FlopRateSimd MetricValue `json:"flopRateSimd"` - MemoryBandwidth MetricValue `json:"memoryBandwidth"` - SocketsPerNode int `json:"socketsPerNode"` - CoresPerSocket int `json:"coresPerSocket"` - ThreadsPerCore int `json:"threadsPerCore"` + Name string `json:"name"` + Nodes string `json:"nodes"` + ProcessorType string `json:"processorType"` + Topology Topology `json:"topology"` + FlopRateScalar MetricValue `json:"flopRateScalar"` + FlopRateSimd MetricValue `json:"flopRateSimd"` + MemoryBandwidth MetricValue `json:"memoryBandwidth"` + MetricConfig []MetricConfig `json:"metricConfig,omitempty"` + Footprint []string `json:"footprint,omitempty"` + SocketsPerNode int `json:"socketsPerNode"` + CoresPerSocket int `json:"coresPerSocket"` + ThreadsPerCore int `json:"threadsPerCore"` } type SubClusterConfig struct {