cc-examples/nhr@fau/job-archive/cluster-tinygpu.json

2487 lines
39 KiB
JSON

{
"name": "tinygpu",
"metricConfig": [
{
"name": "cpu_load",
"unit": {
"base": ""
},
"scope": "node",
"aggregation": "avg",
"footprint": "avg",
"timestep": 60,
"peak": 72,
"normal": 72,
"caution": 36,
"alert": 20
},
{
"name": "cpu_user",
"unit": {
"base": ""
},
"scope": "hwthread",
"aggregation": "avg",
"timestep": 60,
"peak": 100,
"normal": 50,
"caution": 20,
"alert": 10
},
{
"name": "mem_used",
"unit": {
"base": "B",
"prefix": "G"
},
"scope": "node",
"aggregation": "sum",
"footprint": "max",
"timestep": 60,
"peak": 256,
"normal": 128,
"caution": 200,
"alert": 240
},
{
"name": "flops_any",
"unit": {
"base": "Flops/s",
"prefix": "G"
},
"scope": "hwthread",
"aggregation": "sum",
"footprint": "avg",
"timestep": 60,
"peak": 5600,
"normal": 1000,
"caution": 200,
"alert": 50
},
{
"name": "flops_sp",
"unit": {
"base": "Flops/s",
"prefix": "G"
},
"scope": "hwthread",
"aggregation": "sum",
"timestep": 60,
"peak": 5600,
"normal": 1000,
"caution": 200,
"alert": 50
},
{
"name": "flops_dp",
"unit": {
"base": "Flops/s",
"prefix": "G"
},
"scope": "hwthread",
"aggregation": "sum",
"timestep": 60,
"peak": 2300,
"normal": 500,
"caution": 100,
"alert": 50
},
{
"name": "mem_bw",
"unit": {
"base": "B/s",
"prefix": "G"
},
"scope": "socket",
"aggregation": "sum",
"footprint": "avg",
"timestep": 60,
"peak": 350,
"normal": 100,
"caution": 50,
"alert": 10
},
{
"name": "clock",
"unit": {
"base": "Hz",
"prefix": "M"
},
"scope": "hwthread",
"aggregation": "avg",
"timestep": 60,
"peak": 3000,
"normal": 2400,
"caution": 1800,
"alert": 1200
},
{
"name": "cpu_power",
"unit": {
"base": "W"
},
"scope": "socket",
"aggregation": "sum",
"energy": "power",
"timestep": 60,
"peak": 500,
"normal": 250,
"caution": 100,
"alert": 50
},
{
"name": "mem_power",
"unit": {
"base": "W"
},
"scope": "socket",
"aggregation": "sum",
"energy": "power",
"timestep": 60,
"peak": 100,
"normal": 50,
"caution": 20,
"alert": 10
},
{
"name": "ipc",
"unit": {
"base": "IPC"
},
"scope": "hwthread",
"aggregation": "avg",
"timestep": 60,
"peak": 4,
"normal": 2,
"caution": 1,
"alert": 0.5
},
{
"name": "vectorization_ratio",
"unit": {
"base": "%"
},
"scope": "hwthread",
"aggregation": "avg",
"timestep": 60,
"peak": 100,
"normal": 60,
"caution": 40,
"alert": 10
},
{
"name": "acc_utilization",
"unit": {
"base": ""
},
"scope": "accelerator",
"aggregation": "avg",
"footprint": "avg",
"timestep": 60,
"peak": 100,
"normal": 80,
"caution": 50,
"alert": 20
},
{
"name": "acc_mem_used",
"unit": {
"base": "B",
"prefix": "M"
},
"scope": "accelerator",
"aggregation": "sum",
"timestep": 60,
"peak": 40000,
"normal": 20000,
"caution": 10000,
"alert": 5000,
"subClusters": [
{
"name": "a100",
"peak": 160000,
"normal": 120000,
"caution": 80000,
"alert": 40000
},
{
"name": "v100",
"peak": 128000,
"normal": 96000,
"caution": 64000,
"alert": 32000
},
{
"name": "rtx3080",
"peak": 80000,
"normal": 60000,
"caution": 30000,
"alert": 10000
},
{
"name": "rtx2080",
"peak": 44000,
"normal": 33000,
"caution": 22000,
"alert": 11000
}
]
},
{
"name": "acc_power",
"unit": {
"base": "W"
},
"scope": "accelerator",
"aggregation": "sum",
"timestep": 60,
"peak": 400,
"normal": 200,
"caution": 50,
"alert": 20
},
{
"name": "nv_mem_util",
"unit": {
"base": ""
},
"scope": "accelerator",
"aggregation": "avg",
"timestep": 60,
"peak": 100,
"normal": 80,
"caution": 20,
"alert": 10
},
{
"name": "nv_temp",
"unit": {
"base": "B",
"prefix": "G"
},
"scope": "accelerator",
"aggregation": "sum",
"timestep": 60,
"peak": 40,
"normal": 20,
"caution": 5,
"alert": 2
},
{
"name": "nv_sm_clock",
"unit": {
"base": "Hz",
"prefix": "M"
},
"scope": "accelerator",
"aggregation": "sum",
"timestep": 60,
"peak": 1400,
"normal": 1200,
"caution": 100,
"alert": 50
},
{
"name": "nfs4_read",
"unit": {
"base": "IOP",
"prefix": ""
},
"scope": "node",
"aggregation": "sum",
"timestep": 60,
"peak": 6,
"normal": 4,
"caution": 2,
"alert": 1
},
{
"name": "nfs4_write",
"unit": {
"base": "IOP",
"prefix": ""
},
"scope": "node",
"aggregation": "sum",
"timestep": 60,
"peak": 6,
"normal": 4,
"caution": 2,
"alert": 1
},
{
"name": "nfs4_total",
"unit": {
"base": "IOP",
"prefix": ""
},
"scope": "node",
"aggregation": "sum",
"timestep": 60,
"peak": 6,
"normal": 4,
"caution": 2,
"alert": 1
}
],
"subClusters": [
{
"name": "rtx3080",
"nodes": "tg0[80-86]",
"processorType": "Intel(R) Xeon(R) Gold 6226R CPU @ 2.90GHz",
"socketsPerNode": 2,
"coresPerSocket": 16,
"threadsPerCore": 2,
"flopRateScalar": {
"unit": {
"base": "F/s",
"prefix": "G"
},
"value": 111
},
"flopRateSimd": {
"unit": {
"base": "F/s",
"prefix": "G"
},
"value": 787
},
"memoryBandwidth": {
"unit": {
"base": "B/s",
"prefix": "G"
},
"value": 229
},
"topology": {
"node": [
0,
32,
1,
33,
2,
34,
3,
35,
4,
36,
5,
37,
6,
38,
7,
39,
8,
40,
9,
41,
10,
42,
11,
43,
12,
44,
13,
45,
14,
46,
15,
47,
16,
48,
17,
49,
18,
50,
19,
51,
20,
52,
21,
53,
22,
54,
23,
55,
24,
56,
25,
57,
26,
58,
27,
59,
28,
60,
29,
61,
30,
62,
31,
63
],
"socket": [
[
0,
32,
1,
33,
2,
34,
3,
35,
4,
36,
5,
37,
6,
38,
7,
39,
8,
40,
9,
41,
10,
42,
11,
43,
12,
44,
13,
45,
14,
46,
15,
47
],
[
16,
48,
17,
49,
18,
50,
19,
51,
20,
52,
21,
53,
22,
54,
23,
55,
24,
56,
25,
57,
26,
58,
27,
59,
28,
60,
29,
61,
30,
62,
31,
63
]
],
"memoryDomain": [
[
0,
32,
1,
33,
2,
34,
3,
35,
4,
36,
5,
37,
6,
38,
7,
39,
8,
40,
9,
41,
10,
42,
11,
43,
12,
44,
13,
45,
14,
46,
15,
47
],
[
16,
48,
17,
49,
18,
50,
19,
51,
20,
52,
21,
53,
22,
54,
23,
55,
24,
56,
25,
57,
26,
58,
27,
59,
28,
60,
29,
61,
30,
62,
31,
63
]
],
"core": [
[
0,
32
],
[
1,
33
],
[
2,
34
],
[
3,
35
],
[
4,
36
],
[
5,
37
],
[
6,
38
],
[
7,
39
],
[
8,
40
],
[
9,
41
],
[
10,
42
],
[
11,
43
],
[
12,
44
],
[
13,
45
],
[
14,
46
],
[
15,
47
],
[
16,
48
],
[
17,
49
],
[
18,
50
],
[
19,
51
],
[
20,
52
],
[
21,
53
],
[
22,
54
],
[
23,
55
],
[
24,
56
],
[
25,
57
],
[
26,
58
],
[
27,
59
],
[
28,
60
],
[
29,
61
],
[
30,
62
],
[
31,
63
]
],
"accelerators": [
{
"id": "00000000:1a:00.0",
"type": "Nvidia GPU",
"model": "NVIDIA GeForce RTX 3080"
},
{
"id": "00000000:1b:00.0",
"type": "Nvidia GPU",
"model": "NVIDIA GeForce RTX 3080"
},
{
"id": "00000000:3d:00.0",
"type": "Nvidia GPU",
"model": "NVIDIA GeForce RTX 3080"
},
{
"id": "00000000:3e:00.0",
"type": "Nvidia GPU",
"model": "NVIDIA GeForce RTX 3080"
},
{
"id": "00000000:b1:00.0",
"type": "Nvidia GPU",
"model": "NVIDIA GeForce RTX 3080"
},
{
"id": "00000000:b2:00.0",
"type": "Nvidia GPU",
"model": "NVIDIA GeForce RTX 3080"
},
{
"id": "00000000:da:00.0",
"type": "Nvidia GPU",
"model": "NVIDIA GeForce RTX 3080"
},
{
"id": "00000000:db:00.0",
"type": "Nvidia GPU",
"model": "NVIDIA GeForce RTX 3080"
}
]
}
},
{
"name": "rtx2080",
"nodes": "tg060,tg0[65-69],tg06a,tg06b",
"processorType": "Intel(R) Xeon(R) Gold 6134 CPU @ 3.20GHz",
"socketsPerNode": 2,
"coresPerSocket": 8,
"threadsPerCore": 2,
"flopRateScalar": {
"unit": {
"base": "F/s",
"prefix": "G"
},
"value": 47
},
"flopRateSimd": {
"unit": {
"base": "F/s",
"prefix": "G"
},
"value": 326
},
"memoryBandwidth": {
"unit": {
"base": "B/s",
"prefix": "G"
},
"value": 137
},
"topology": {
"node": [
0,
19,
1,
17,
2,
18,
3,
16,
4,
20,
5,
21,
6,
22,
7,
23,
8,
24,
9,
25,
10,
26,
11,
27,
12,
28,
13,
29,
14,
30,
15,
31
],
"socket": [
[
0,
19,
1,
17,
2,
18,
3,
16,
4,
20,
5,
21,
6,
22,
7,
23
],
[
8,
24,
9,
25,
10,
26,
11,
27,
12,
28,
13,
29,
14,
30,
15,
31
]
],
"memoryDomain": [
[
0,
19,
1,
17,
2,
18,
3,
16,
4,
20,
5,
21,
6,
22,
7,
23
],
[
8,
24,
9,
25,
10,
26,
11,
27,
12,
28,
13,
29,
14,
30,
15,
31
]
],
"core": [
[
0,
19
],
[
1,
17
],
[
2,
18
],
[
3,
16
],
[
4,
20
],
[
5,
21
],
[
6,
22
],
[
7,
23
],
[
8,
24
],
[
9,
25
],
[
10,
26
],
[
11,
27
],
[
12,
28
],
[
13,
29
],
[
14,
30
],
[
15,
31
]
],
"accelerators": [
{
"id": "00000000:18:00.0",
"type": "Nvidia GPU",
"model": "NVIDIA GeForce RTX 2080 Ti"
},
{
"id": "00000000:3b:00.0",
"type": "Nvidia GPU",
"model": "NVIDIA GeForce RTX 2080 Ti"
},
{
"id": "00000000:86:00.0",
"type": "Nvidia GPU",
"model": "NVIDIA GeForce RTX 2080 Ti"
},
{
"id": "00000000:af:00.0",
"type": "Nvidia GPU",
"model": "NVIDIA GeForce RTX 2080 Ti"
}
]
}
},
{
"name": "a100",
"nodes": "tg0[90-97]",
"processorType": "AMD EPYC 7662 64-Core Processor ",
"socketsPerNode": 2,
"coresPerSocket": 64,
"threadsPerCore": 1,
"flopRateScalar": {
"unit": {
"base": "F/s",
"prefix": "G"
},
"value": 987
},
"flopRateSimd": {
"unit": {
"base": "F/s",
"prefix": "G"
},
"value": 5660
},
"memoryBandwidth": {
"unit": {
"base": "B/s",
"prefix": "G"
},
"value": 306
},
"topology": {
"node": [
0,
1,
2,
3,
4,
5,
6,
7,
8,
9,
10,
11,
12,
13,
14,
15,
16,
17,
18,
19,
20,
21,
22,
23,
24,
25,
26,
27,
28,
29,
30,
31,
32,
33,
34,
35,
36,
37,
38,
39,
40,
41,
42,
43,
44,
45,
46,
47,
48,
49,
50,
51,
52,
53,
54,
55,
56,
57,
58,
59,
60,
61,
62,
63,
64,
65,
66,
67,
68,
69,
70,
71,
72,
73,
74,
75,
76,
77,
78,
79,
80,
81,
82,
83,
84,
85,
86,
87,
88,
89,
90,
91,
92,
93,
94,
95,
96,
97,
98,
99,
100,
101,
102,
103,
104,
105,
106,
107,
108,
109,
110,
111,
112,
113,
114,
115,
116,
117,
118,
119,
120,
121,
122,
123,
124,
125,
126,
127
],
"socket": [
[
0,
1,
2,
3,
4,
5,
6,
7,
8,
9,
10,
11,
12,
13,
14,
15,
16,
17,
18,
19,
20,
21,
22,
23,
24,
25,
26,
27,
28,
29,
30,
31,
32,
33,
34,
35,
36,
37,
38,
39,
40,
41,
42,
43,
44,
45,
46,
47,
48,
49,
50,
51,
52,
53,
54,
55,
56,
57,
58,
59,
60,
61,
62,
63
],
[
64,
65,
66,
67,
68,
69,
70,
71,
72,
73,
74,
75,
76,
77,
78,
79,
80,
81,
82,
83,
84,
85,
86,
87,
88,
89,
90,
91,
92,
93,
94,
95,
96,
97,
98,
99,
100,
101,
102,
103,
104,
105,
106,
107,
108,
109,
110,
111,
112,
113,
114,
115,
116,
117,
118,
119,
120,
121,
122,
123,
124,
125,
126,
127
]
],
"memoryDomain": [
[
0,
1,
2,
3,
4,
5,
6,
7,
8,
9,
10,
11,
12,
13,
14,
15,
16,
17,
18,
19,
20,
21,
22,
23,
24,
25,
26,
27,
28,
29,
30,
31
],
[
32,
33,
34,
35,
36,
37,
38,
39,
40,
41,
42,
43,
44,
45,
46,
47,
48,
49,
50,
51,
52,
53,
54,
55,
56,
57,
58,
59,
60,
61,
62,
63
],
[
64,
65,
66,
67,
68,
69,
70,
71,
72,
73,
74,
75,
76,
77,
78,
79,
80,
81,
82,
83,
84,
85,
86,
87,
88,
89,
90,
91,
92,
93,
94,
95
],
[
96,
97,
98,
99,
100,
101,
102,
103,
104,
105,
106,
107,
108,
109,
110,
111,
112,
113,
114,
115,
116,
117,
118,
119,
120,
121,
122,
123,
124,
125,
126,
127
]
],
"core": [
[
0
],
[
1
],
[
2
],
[
3
],
[
4
],
[
5
],
[
6
],
[
7
],
[
8
],
[
9
],
[
10
],
[
11
],
[
12
],
[
13
],
[
14
],
[
15
],
[
16
],
[
17
],
[
18
],
[
19
],
[
20
],
[
21
],
[
22
],
[
23
],
[
24
],
[
25
],
[
26
],
[
27
],
[
28
],
[
29
],
[
30
],
[
31
],
[
32
],
[
33
],
[
34
],
[
35
],
[
36
],
[
37
],
[
38
],
[
39
],
[
40
],
[
41
],
[
42
],
[
43
],
[
44
],
[
45
],
[
46
],
[
47
],
[
48
],
[
49
],
[
50
],
[
51
],
[
52
],
[
53
],
[
54
],
[
55
],
[
56
],
[
57
],
[
58
],
[
59
],
[
60
],
[
61
],
[
62
],
[
63
],
[
64
],
[
65
],
[
66
],
[
67
],
[
68
],
[
69
],
[
70
],
[
71
],
[
72
],
[
73
],
[
74
],
[
75
],
[
76
],
[
77
],
[
78
],
[
79
],
[
80
],
[
81
],
[
82
],
[
83
],
[
84
],
[
85
],
[
86
],
[
87
],
[
88
],
[
89
],
[
90
],
[
91
],
[
92
],
[
93
],
[
94
],
[
95
],
[
96
],
[
97
],
[
98
],
[
99
],
[
100
],
[
101
],
[
102
],
[
103
],
[
104
],
[
105
],
[
106
],
[
107
],
[
108
],
[
109
],
[
110
],
[
111
],
[
112
],
[
113
],
[
114
],
[
115
],
[
116
],
[
117
],
[
118
],
[
119
],
[
120
],
[
121
],
[
122
],
[
123
],
[
124
],
[
125
],
[
126
],
[
127
]
],
"accelerators": [
{
"id": "00000000:01:00.0",
"type": "Nvidia GPU",
"model": "NVIDIA A100-SXM4-40GB"
},
{
"id": "00000000:41:00.0",
"type": "Nvidia GPU",
"model": "NVIDIA A100-SXM4-40GB"
},
{
"id": "00000000:81:00.0",
"type": "Nvidia GPU",
"model": "NVIDIA A100-SXM4-40GB"
},
{
"id": "00000000:c1:00.0",
"type": "Nvidia GPU",
"model": "NVIDIA A100-SXM4-40GB"
}
]
}
},
{
"name": "v100",
"nodes": "tg0[71-74]",
"processorType": "Intel(R) Xeon(R) Gold 6134 CPU @ 3.20GHz",
"socketsPerNode": 2,
"coresPerSocket": 8,
"threadsPerCore": 2,
"flopRateScalar": {
"unit": {
"base": "F/s",
"prefix": "G"
},
"value": 59
},
"flopRateSimd": {
"unit": {
"base": "F/s",
"prefix": "G"
},
"value": 430
},
"memoryBandwidth": {
"unit": {
"base": "B/s",
"prefix": "G"
},
"value": 177
},
"topology": {
"node": [
0,
16,
1,
17,
2,
18,
3,
19,
4,
20,
5,
21,
6,
22,
7,
23,
8,
24,
9,
25,
10,
26,
11,
27,
12,
28,
13,
29,
14,
30,
15,
31
],
"socket": [
[
0,
16,
1,
17,
2,
18,
3,
19,
4,
20,
5,
21,
6,
22,
7,
23
],
[
8,
24,
9,
25,
10,
26,
11,
27,
12,
28,
13,
29,
14,
30,
15,
31
]
],
"memoryDomain": [
[
0,
16,
1,
17,
2,
18,
3,
19,
4,
20,
5,
21,
6,
22,
7,
23
],
[
8,
24,
9,
25,
10,
26,
11,
27,
12,
28,
13,
29,
14,
30,
15,
31
]
],
"core": [
[
0,
16
],
[
1,
17
],
[
2,
18
],
[
3,
19
],
[
4,
20
],
[
5,
21
],
[
6,
22
],
[
7,
23
],
[
8,
24
],
[
9,
25
],
[
10,
26
],
[
11,
27
],
[
12,
28
],
[
13,
29
],
[
14,
30
],
[
15,
31
]
],
"accelerators": [
{
"id": "00000000:18:00.0",
"type": "Nvidia GPU",
"model": "Tesla V100-PCIE-32GB"
},
{
"id": "00000000:3b:00.0",
"type": "Nvidia GPU",
"model": "Tesla V100-PCIE-32GB"
},
{
"id": "00000000:86:00.0",
"type": "Nvidia GPU",
"model": "Tesla V100-PCIE-32GB"
},
{
"id": "00000000:af:00.0",
"type": "Nvidia GPU",
"model": "Tesla V100-PCIE-32GB"
}
]
}
},
{
"name": "h100",
"nodes": "tg1[00-02]",
"processorType": "AMD EPYC 9354 32-Core Processor",
"socketsPerNode": 2,
"coresPerSocket": 32,
"threadsPerCore": 1,
"flopRateScalar": {
"unit": {
"base": "F/s",
"prefix": "G"
},
"value": 524
},
"flopRateSimd": {
"unit": {
"base": "F/s",
"prefix": "G"
},
"value": 3174
},
"memoryBandwidth": {
"unit": {
"base": "B/s",
"prefix": "G"
},
"value": 659
},
"topology": {
"node": [
0,
1,
2,
3,
4,
5,
6,
7,
8,
9,
10,
11,
12,
13,
14,
15,
16,
17,
18,
19,
20,
21,
22,
23,
24,
25,
26,
27,
28,
29,
30,
31,
32,
33,
34,
35,
36,
37,
38,
39,
40,
41,
42,
43,
44,
45,
46,
47,
48,
49,
50,
51,
52,
53,
54,
55,
56,
57,
58,
59,
60,
61,
62,
63
],
"socket": [
[
0,
1,
2,
3,
4,
5,
6,
7,
8,
9,
10,
11,
12,
13,
14,
15,
16,
17,
18,
19,
20,
21,
22,
23,
24,
25,
26,
27,
28,
29,
30,
31
],
[
32,
33,
34,
35,
36,
37,
38,
39,
40,
41,
42,
43,
44,
45,
46,
47,
48,
49,
50,
51,
52,
53,
54,
55,
56,
57,
58,
59,
60,
61,
62,
63
]
],
"memoryDomain": [
[
0,
1,
2,
3,
4,
5,
6,
7,
8,
9,
10,
11,
12,
13,
14,
15,
16,
17,
18,
19,
20,
21,
22,
23,
24,
25,
26,
27,
28,
29,
30,
31
],
[
32,
33,
34,
35,
36,
37,
38,
39,
40,
41,
42,
43,
44,
45,
46,
47,
48,
49,
50,
51,
52,
53,
54,
55,
56,
57,
58,
59,
60,
61,
62,
63
]
],
"core": [
[
0
],
[
1
],
[
2
],
[
3
],
[
4
],
[
5
],
[
6
],
[
7
],
[
8
],
[
9
],
[
10
],
[
11
],
[
12
],
[
13
],
[
14
],
[
15
],
[
16
],
[
17
],
[
18
],
[
19
],
[
20
],
[
21
],
[
22
],
[
23
],
[
24
],
[
25
],
[
26
],
[
27
],
[
28
],
[
29
],
[
30
],
[
31
],
[
32
],
[
33
],
[
34
],
[
35
],
[
36
],
[
37
],
[
38
],
[
39
],
[
40
],
[
41
],
[
42
],
[
43
],
[
44
],
[
45
],
[
46
],
[
47
],
[
48
],
[
49
],
[
50
],
[
51
],
[
52
],
[
53
],
[
54
],
[
55
],
[
56
],
[
57
],
[
58
],
[
59
],
[
60
],
[
61
],
[
62
],
[
63
]
],
"accelerators": [
{
"id": "00000000:03:00.0",
"type": "Nvidia GPU",
"model": "NVIDIA H100 80GB HBM3"
},
{
"id": "00000000:04:00.0",
"type": "Nvidia GPU",
"model": "NVIDIA H100 80GB HBM3"
},
{
"id": "00000000:E3:00.0",
"type": "Nvidia GPU",
"model": "NVIDIA H100 80GB HBM3"
},
{
"id": "00000000:E4:00.0",
"type": "Nvidia GPU",
"model": "NVIDIA H100 80GB HBM3"
}
]
}
}
]
}