cc-examples/nhr@fau/job-archive/cluster-alex.json

2810 lines
43 KiB
JSON
Raw Permalink Normal View History

2024-11-29 11:30:52 +01:00
{
"name": "alex",
"metricConfig": [
{
"name": "cpu_load",
"unit": {
"base": ""
},
"scope": "node",
"aggregation": "avg",
"footprint": "avg",
"timestep": 60,
"peak": 128,
"normal": 128,
"caution": 10,
"alert": 5
},
{
"name": "cpu_user",
"unit": {
"base": ""
},
"scope": "hwthread",
"aggregation": "avg",
"timestep": 60,
"peak": 100,
"normal": 50,
"caution": 20,
"alert": 10
},
{
"name": "mem_used",
"unit": {
"base": "B",
"prefix": "G"
},
"scope": "node",
"aggregation": "sum",
"footprint": "max",
"timestep": 60,
"peak": 512,
"normal": 128,
"caution": 200,
"alert": 240
},
{
"name": "flops_any",
"unit": {
"base": "Flops/s",
"prefix": "G"
},
"scope": "hwthread",
"aggregation": "sum",
"footprint": "avg",
"timestep": 60,
"peak": 9216,
"normal": 1000,
"caution": 200,
"alert": 50
},
{
"name": "net_bytes_in",
"unit": {
"base": "B/s"
},
"scope": "node",
"aggregation": "sum",
"timestep": 60,
"peak": 125000000,
"normal": 125000000,
"caution": 200,
"alert": 240
},
{
"name": "net_bytes_out",
"unit": {
"base": "B/s"
},
"scope": "node",
"aggregation": "sum",
"timestep": 60,
"peak": 125000000,
"normal": 125000000,
"caution": 200,
"alert": 240
},
{
"name": "mem_bw",
"unit": {
"base": "B/s",
"prefix": "G"
},
"scope": "socket",
"aggregation": "sum",
"footprint": "avg",
"timestep": 60,
"peak": 350,
"normal": 100,
"caution": 50,
"alert": 10
},
{
"name": "clock",
"unit": {
"base": "Hz",
"prefix": "M"
},
"scope": "hwthread",
"aggregation": "avg",
"timestep": 60,
"peak": 3000,
"normal": 2400,
"caution": 1800,
"alert": 1200
},
{
"name": "core_power",
"unit": {
"base": "W"
},
"scope": "hwthread",
"aggregation": "sum",
"energy": "power",
"timestep": 60,
"peak": 500,
"normal": 250,
"caution": 100,
"alert": 50
},
{
"name": "acc_utilization",
"unit": {
"base": ""
},
"scope": "accelerator",
"aggregation": "avg",
"footprint": "avg",
"timestep": 60,
"peak": 100,
"normal": 80,
"caution": 50,
"alert": 20
},
{
"name": "acc_mem_used",
"unit": {
"base": "B",
"prefix": "M"
},
"scope": "accelerator",
"aggregation": "sum",
"footprint": "max",
"timestep": 60,
"peak": 320000,
"normal": 160000,
"caution": 80000,
"alert": 40000,
"subClusters": [
{
"name": "a100m80",
"peak": 640000,
"normal": 320000,
"caution": 160000,
"alert": 80000,
"footprint": "max"
}
]
},
{
"name": "acc_power",
"unit": {
"base": "W"
},
"scope": "accelerator",
"aggregation": "sum",
"energy": "power",
"timestep": 60,
"peak": 3200,
"normal": 1600,
"caution": 400,
"alert": 160
},
{
"name": "nv_mem_util",
"unit": {
"base": ""
},
"scope": "accelerator",
"aggregation": "avg",
"timestep": 60,
"peak": 100,
"normal": 80,
"caution": 20,
"alert": 10
},
{
"name": "nv_temp",
"unit": {
"base": "°C"
},
"scope": "accelerator",
"aggregation": "avg",
"timestep": 60,
"peak": 40,
"normal": 20,
"caution": 5,
"alert": 2
},
{
"name": "nv_sm_clock",
"unit": {
"base": "Hz",
"prefix": "M"
},
"scope": "accelerator",
"aggregation": "avg",
"timestep": 60,
"peak": 1400,
"normal": 1200,
"caution": 100,
"alert": 50
},
{
"name": "cpu_power",
"unit": {
"base": "W"
},
"scope": "socket",
"aggregation": "sum",
"energy": "power",
"timestep": 60,
"peak": 500,
"normal": 250,
"caution": 100,
"alert": 50
},
{
"name": "ipc",
"unit": {
"base": "IPC"
},
"scope": "hwthread",
"aggregation": "avg",
"timestep": 60,
"peak": 4,
"normal": 2,
"caution": 1,
"alert": 0.5
}
],
"subClusters": [
{
"name": "a40",
"nodes": "a[0121-0129],a[0221-0229],a[0321-0329],a[0421-0429],a[0521-0522],a[1621-1624],a[1721-1722]",
"processorType": "AMD Milan",
"socketsPerNode": 2,
"coresPerSocket": 64,
"threadsPerCore": 1,
"flopRateScalar": {
"unit": {
"base": "F/s",
"prefix": "G"
},
"value": 432
},
"flopRateSimd": {
"unit": {
"base": "F/s",
"prefix": "G"
},
"value": 9216
},
"memoryBandwidth": {
"unit": {
"base": "B/s",
"prefix": "G"
},
"value": 400
},
"topology": {
"node": [
0,
1,
2,
3,
4,
5,
6,
7,
8,
9,
10,
11,
12,
13,
14,
15,
16,
17,
18,
19,
20,
21,
22,
23,
24,
25,
26,
27,
28,
29,
30,
31,
32,
33,
34,
35,
36,
37,
38,
39,
40,
41,
42,
43,
44,
45,
46,
47,
48,
49,
50,
51,
52,
53,
54,
55,
56,
57,
58,
59,
60,
61,
62,
63,
64,
65,
66,
67,
68,
69,
70,
71,
72,
73,
74,
75,
76,
77,
78,
79,
80,
81,
82,
83,
84,
85,
86,
87,
88,
89,
90,
91,
92,
93,
94,
95,
96,
97,
98,
99,
100,
101,
102,
103,
104,
105,
106,
107,
108,
109,
110,
111,
112,
113,
114,
115,
116,
117,
118,
119,
120,
121,
122,
123,
124,
125,
126,
127
],
"socket": [
[
0,
1,
2,
3,
4,
5,
6,
7,
8,
9,
10,
11,
12,
13,
14,
15,
16,
17,
18,
19,
20,
21,
22,
23,
24,
25,
26,
27,
28,
29,
30,
31,
32,
33,
34,
35,
36,
37,
38,
39,
40,
41,
42,
43,
44,
45,
46,
47,
48,
49,
50,
51,
52,
53,
54,
55,
56,
57,
58,
59,
60,
61,
62,
63
],
[
64,
65,
66,
67,
68,
69,
70,
71,
72,
73,
74,
75,
76,
77,
78,
79,
80,
81,
82,
83,
84,
85,
86,
87,
88,
89,
90,
91,
92,
93,
94,
95,
96,
97,
98,
99,
100,
101,
102,
103,
104,
105,
106,
107,
108,
109,
110,
111,
112,
113,
114,
115,
116,
117,
118,
119,
120,
121,
122,
123,
124,
125,
126,
127
]
],
"memoryDomain": [
[
0,
1,
2,
3,
4,
5,
6,
7,
8,
9,
10,
11,
12,
13,
14,
15,
16,
17,
18,
19,
20,
21,
22,
23,
24,
25,
26,
27,
28,
29,
30,
31,
32,
33,
34,
35,
36,
37,
38,
39,
40,
41,
42,
43,
44,
45,
46,
47,
48,
49,
50,
51,
52,
53,
54,
55,
56,
57,
58,
59,
60,
61,
62,
63,
64,
65,
66,
67,
68,
69,
70,
71,
72,
73,
74,
75,
76,
77,
78,
79,
80,
81,
82,
83,
84,
85,
86,
87,
88,
89,
90,
91,
92,
93,
94,
95,
96,
97,
98,
99,
100,
101,
102,
103,
104,
105,
106,
107,
108,
109,
110,
111,
112,
113,
114,
115,
116,
117,
118,
119,
120,
121,
122,
123,
124,
125,
126,
127
]
],
"core": [
[
0
],
[
1
],
[
2
],
[
3
],
[
4
],
[
5
],
[
6
],
[
7
],
[
8
],
[
9
],
[
10
],
[
11
],
[
12
],
[
13
],
[
14
],
[
15
],
[
16
],
[
17
],
[
18
],
[
19
],
[
20
],
[
21
],
[
22
],
[
23
],
[
24
],
[
25
],
[
26
],
[
27
],
[
28
],
[
29
],
[
30
],
[
31
],
[
32
],
[
33
],
[
34
],
[
35
],
[
36
],
[
37
],
[
38
],
[
39
],
[
40
],
[
41
],
[
42
],
[
43
],
[
44
],
[
45
],
[
46
],
[
47
],
[
48
],
[
49
],
[
50
],
[
51
],
[
52
],
[
53
],
[
54
],
[
55
],
[
56
],
[
57
],
[
58
],
[
59
],
[
60
],
[
61
],
[
62
],
[
63
],
[
64
],
[
65
],
[
66
],
[
67
],
[
68
],
[
69
],
[
70
],
[
71
],
[
73
],
[
74
],
[
75
],
[
76
],
[
77
],
[
78
],
[
79
],
[
80
],
[
81
],
[
82
],
[
83
],
[
84
],
[
85
],
[
86
],
[
87
],
[
88
],
[
89
],
[
90
],
[
91
],
[
92
],
[
93
],
[
94
],
[
95
],
[
96
],
[
97
],
[
98
],
[
99
],
[
100
],
[
101
],
[
102
],
[
103
],
[
104
],
[
105
],
[
106
],
[
107
],
[
108
],
[
109
],
[
110
],
[
111
],
[
112
],
[
113
],
[
114
],
[
115
],
[
116
],
[
117
],
[
118
],
[
119
],
[
120
],
[
121
],
[
122
],
[
123
],
[
124
],
[
125
],
[
126
],
[
127
]
],
"accelerators": [
{
"id": "00000000:01:00.0",
"type": "Nvidia GPU",
"model": "A40"
},
{
"id": "00000000:25:00.0",
"type": "Nvidia GPU",
"model": "A40"
},
{
"id": "00000000:41:00.0",
"type": "Nvidia GPU",
"model": "A40"
},
{
"id": "00000000:61:00.0",
"type": "Nvidia GPU",
"model": "A40"
},
{
"id": "00000000:81:00.0",
"type": "Nvidia GPU",
"model": "A40"
},
{
"id": "00000000:A1:00.0",
"type": "Nvidia GPU",
"model": "A40"
},
{
"id": "00000000:C1:00.0",
"type": "Nvidia GPU",
"model": "A40"
},
{
"id": "00000000:E1:00.0",
"type": "Nvidia GPU",
"model": "A40"
}
]
}
},
{
"name": "a100",
"nodes": "a[0601-0605],a[0701-0705],a[0801-0805],a[0901-0905]",
"processorType": "AMD Milan",
"socketsPerNode": 2,
"coresPerSocket": 64,
"threadsPerCore": 1,
"flopRateScalar": {
"unit": {
"base": "F/s",
"prefix": "G"
},
"value": 432
},
"flopRateSimd": {
"unit": {
"base": "F/s",
"prefix": "G"
},
"value": 9216
},
"memoryBandwidth": {
"unit": {
"base": "B/s",
"prefix": "G"
},
"value": 400
},
"topology": {
"node": [
0,
1,
2,
3,
4,
5,
6,
7,
8,
9,
10,
11,
12,
13,
14,
15,
16,
17,
18,
19,
20,
21,
22,
23,
24,
25,
26,
27,
28,
29,
30,
31,
32,
33,
34,
35,
36,
37,
38,
39,
40,
41,
42,
43,
44,
45,
46,
47,
48,
49,
50,
51,
52,
53,
54,
55,
56,
57,
58,
59,
60,
61,
62,
63,
64,
65,
66,
67,
68,
69,
70,
71,
72,
73,
74,
75,
76,
77,
78,
79,
80,
81,
82,
83,
84,
85,
86,
87,
88,
89,
90,
91,
92,
93,
94,
95,
96,
97,
98,
99,
100,
101,
102,
103,
104,
105,
106,
107,
108,
109,
110,
111,
112,
113,
114,
115,
116,
117,
118,
119,
120,
121,
122,
123,
124,
125,
126,
127
],
"socket": [
[
0,
1,
2,
3,
4,
5,
6,
7,
8,
9,
10,
11,
12,
13,
14,
15,
16,
17,
18,
19,
20,
21,
22,
23,
24,
25,
26,
27,
28,
29,
30,
31,
32,
33,
34,
35,
36,
37,
38,
39,
40,
41,
42,
43,
44,
45,
46,
47,
48,
49,
50,
51,
52,
53,
54,
55,
56,
57,
58,
59,
60,
61,
62,
63
],
[
64,
65,
66,
67,
68,
69,
70,
71,
72,
73,
74,
75,
76,
77,
78,
79,
80,
81,
82,
83,
84,
85,
86,
87,
88,
89,
90,
91,
92,
93,
94,
95,
96,
97,
98,
99,
100,
101,
102,
103,
104,
105,
106,
107,
108,
109,
110,
111,
112,
113,
114,
115,
116,
117,
118,
119,
120,
121,
122,
123,
124,
125,
126,
127
]
],
"memoryDomain": [
[
0,
1,
2,
3,
4,
5,
6,
7,
8,
9,
10,
11,
12,
13,
14,
15,
16,
17,
18,
19,
20,
21,
22,
23,
24,
25,
26,
27,
28,
29,
30,
31,
32,
33,
34,
35,
36,
37,
38,
39,
40,
41,
42,
43,
44,
45,
46,
47,
48,
49,
50,
51,
52,
53,
54,
55,
56,
57,
58,
59,
60,
61,
62,
63,
64,
65,
66,
67,
68,
69,
70,
71,
72,
73,
74,
75,
76,
77,
78,
79,
80,
81,
82,
83,
84,
85,
86,
87,
88,
89,
90,
91,
92,
93,
94,
95,
96,
97,
98,
99,
100,
101,
102,
103,
104,
105,
106,
107,
108,
109,
110,
111,
112,
113,
114,
115,
116,
117,
118,
119,
120,
121,
122,
123,
124,
125,
126,
127
]
],
"core": [
[
0
],
[
1
],
[
2
],
[
3
],
[
4
],
[
5
],
[
6
],
[
7
],
[
8
],
[
9
],
[
10
],
[
11
],
[
12
],
[
13
],
[
14
],
[
15
],
[
16
],
[
17
],
[
18
],
[
19
],
[
20
],
[
21
],
[
22
],
[
23
],
[
24
],
[
25
],
[
26
],
[
27
],
[
28
],
[
29
],
[
30
],
[
31
],
[
32
],
[
33
],
[
34
],
[
35
],
[
36
],
[
37
],
[
38
],
[
39
],
[
40
],
[
41
],
[
42
],
[
43
],
[
44
],
[
45
],
[
46
],
[
47
],
[
48
],
[
49
],
[
50
],
[
51
],
[
52
],
[
53
],
[
54
],
[
55
],
[
56
],
[
57
],
[
58
],
[
59
],
[
60
],
[
61
],
[
62
],
[
63
],
[
64
],
[
65
],
[
66
],
[
67
],
[
68
],
[
69
],
[
70
],
[
71
],
[
73
],
[
74
],
[
75
],
[
76
],
[
77
],
[
78
],
[
79
],
[
80
],
[
81
],
[
82
],
[
83
],
[
84
],
[
85
],
[
86
],
[
87
],
[
88
],
[
89
],
[
90
],
[
91
],
[
92
],
[
93
],
[
94
],
[
95
],
[
96
],
[
97
],
[
98
],
[
99
],
[
100
],
[
101
],
[
102
],
[
103
],
[
104
],
[
105
],
[
106
],
[
107
],
[
108
],
[
109
],
[
110
],
[
111
],
[
112
],
[
113
],
[
114
],
[
115
],
[
116
],
[
117
],
[
118
],
[
119
],
[
120
],
[
121
],
[
122
],
[
123
],
[
124
],
[
125
],
[
126
],
[
127
]
],
"accelerators": [
{
"id": "00000000:0E:00.0",
"type": "Nvidia GPU",
"model": "A100"
},
{
"id": "00000000:13:00.0",
"type": "Nvidia GPU",
"model": "A100"
},
{
"id": "00000000:49:00.0",
"type": "Nvidia GPU",
"model": "A100"
},
{
"id": "00000000:4F:00.0",
"type": "Nvidia GPU",
"model": "A100"
},
{
"id": "00000000:90:00.0",
"type": "Nvidia GPU",
"model": "A100"
},
{
"id": "00000000:96:00.0",
"type": "Nvidia GPU",
"model": "A100"
},
{
"id": "00000000:CC:00.0",
"type": "Nvidia GPU",
"model": "A100"
},
{
"id": "00000000:D1:00.0",
"type": "Nvidia GPU",
"model": "A100"
}
]
}
},
{
"name": "a100m80",
"nodes": "a[0531-0537],a[0631-0633],a0731,a[0831-0833],a[0931-0934]",
"processorType": "AMD Milan",
"socketsPerNode": 2,
"coresPerSocket": 64,
"threadsPerCore": 1,
"flopRateScalar": {
"unit": {
"base": "F/s",
"prefix": "G"
},
"value": 432
},
"flopRateSimd": {
"unit": {
"base": "F/s",
"prefix": "G"
},
"value": 9216
},
"memoryBandwidth": {
"unit": {
"base": "B/s",
"prefix": "G"
},
"value": 400
},
"topology": {
"node": [
0,
1,
2,
3,
4,
5,
6,
7,
8,
9,
10,
11,
12,
13,
14,
15,
16,
17,
18,
19,
20,
21,
22,
23,
24,
25,
26,
27,
28,
29,
30,
31,
32,
33,
34,
35,
36,
37,
38,
39,
40,
41,
42,
43,
44,
45,
46,
47,
48,
49,
50,
51,
52,
53,
54,
55,
56,
57,
58,
59,
60,
61,
62,
63,
64,
65,
66,
67,
68,
69,
70,
71,
72,
73,
74,
75,
76,
77,
78,
79,
80,
81,
82,
83,
84,
85,
86,
87,
88,
89,
90,
91,
92,
93,
94,
95,
96,
97,
98,
99,
100,
101,
102,
103,
104,
105,
106,
107,
108,
109,
110,
111,
112,
113,
114,
115,
116,
117,
118,
119,
120,
121,
122,
123,
124,
125,
126,
127
],
"socket": [
[
0,
1,
2,
3,
4,
5,
6,
7,
8,
9,
10,
11,
12,
13,
14,
15,
16,
17,
18,
19,
20,
21,
22,
23,
24,
25,
26,
27,
28,
29,
30,
31,
32,
33,
34,
35,
36,
37,
38,
39,
40,
41,
42,
43,
44,
45,
46,
47,
48,
49,
50,
51,
52,
53,
54,
55,
56,
57,
58,
59,
60,
61,
62,
63
],
[
64,
65,
66,
67,
68,
69,
70,
71,
72,
73,
74,
75,
76,
77,
78,
79,
80,
81,
82,
83,
84,
85,
86,
87,
88,
89,
90,
91,
92,
93,
94,
95,
96,
97,
98,
99,
100,
101,
102,
103,
104,
105,
106,
107,
108,
109,
110,
111,
112,
113,
114,
115,
116,
117,
118,
119,
120,
121,
122,
123,
124,
125,
126,
127
]
],
"memoryDomain": [
[
0,
1,
2,
3,
4,
5,
6,
7,
8,
9,
10,
11,
12,
13,
14,
15,
16,
17,
18,
19,
20,
21,
22,
23,
24,
25,
26,
27,
28,
29,
30,
31,
32,
33,
34,
35,
36,
37,
38,
39,
40,
41,
42,
43,
44,
45,
46,
47,
48,
49,
50,
51,
52,
53,
54,
55,
56,
57,
58,
59,
60,
61,
62,
63,
64,
65,
66,
67,
68,
69,
70,
71,
72,
73,
74,
75,
76,
77,
78,
79,
80,
81,
82,
83,
84,
85,
86,
87,
88,
89,
90,
91,
92,
93,
94,
95,
96,
97,
98,
99,
100,
101,
102,
103,
104,
105,
106,
107,
108,
109,
110,
111,
112,
113,
114,
115,
116,
117,
118,
119,
120,
121,
122,
123,
124,
125,
126,
127
]
],
"core": [
[
0
],
[
1
],
[
2
],
[
3
],
[
4
],
[
5
],
[
6
],
[
7
],
[
8
],
[
9
],
[
10
],
[
11
],
[
12
],
[
13
],
[
14
],
[
15
],
[
16
],
[
17
],
[
18
],
[
19
],
[
20
],
[
21
],
[
22
],
[
23
],
[
24
],
[
25
],
[
26
],
[
27
],
[
28
],
[
29
],
[
30
],
[
31
],
[
32
],
[
33
],
[
34
],
[
35
],
[
36
],
[
37
],
[
38
],
[
39
],
[
40
],
[
41
],
[
42
],
[
43
],
[
44
],
[
45
],
[
46
],
[
47
],
[
48
],
[
49
],
[
50
],
[
51
],
[
52
],
[
53
],
[
54
],
[
55
],
[
56
],
[
57
],
[
58
],
[
59
],
[
60
],
[
61
],
[
62
],
[
63
],
[
64
],
[
65
],
[
66
],
[
67
],
[
68
],
[
69
],
[
70
],
[
71
],
[
73
],
[
74
],
[
75
],
[
76
],
[
77
],
[
78
],
[
79
],
[
80
],
[
81
],
[
82
],
[
83
],
[
84
],
[
85
],
[
86
],
[
87
],
[
88
],
[
89
],
[
90
],
[
91
],
[
92
],
[
93
],
[
94
],
[
95
],
[
96
],
[
97
],
[
98
],
[
99
],
[
100
],
[
101
],
[
102
],
[
103
],
[
104
],
[
105
],
[
106
],
[
107
],
[
108
],
[
109
],
[
110
],
[
111
],
[
112
],
[
113
],
[
114
],
[
115
],
[
116
],
[
117
],
[
118
],
[
119
],
[
120
],
[
121
],
[
122
],
[
123
],
[
124
],
[
125
],
[
126
],
[
127
]
],
"accelerators": [
{
"id": "00000000:0E:00.0",
"type": "Nvidia GPU",
"model": "A100"
},
{
"id": "00000000:13:00.0",
"type": "Nvidia GPU",
"model": "A100"
},
{
"id": "00000000:49:00.0",
"type": "Nvidia GPU",
"model": "A100"
},
{
"id": "00000000:4F:00.0",
"type": "Nvidia GPU",
"model": "A100"
},
{
"id": "00000000:90:00.0",
"type": "Nvidia GPU",
"model": "A100"
},
{
"id": "00000000:96:00.0",
"type": "Nvidia GPU",
"model": "A100"
},
{
"id": "00000000:CC:00.0",
"type": "Nvidia GPU",
"model": "A100"
},
{
"id": "00000000:D1:00.0",
"type": "Nvidia GPU",
"model": "A100"
}
]
}
}
]
}