cc-examples/fau-systems/job-archive/cluster-tinygpu.json

621 lines
22 KiB
JSON
Raw Normal View History

2023-06-13 07:26:59 +02:00
{
"name": "tinygpu",
"metricConfig": [
{
"name": "cpu_load",
"unit": {
"base": ""
},
"scope": "node",
"aggregation": "avg",
"timestep": 60,
"peak": 64,
"normal": 32,
"caution": 30,
"alert": 20,
"subClusters": [
{
"name": "rtx2080",
"peak": 32,
"normal": 16,
"caution": 14,
"alert": 6
},
{
"name": "a100",
"peak": 128,
"normal": 64,
"caution": 60,
"alert": 20
},
{
"name": "v100",
"peak": 32,
"normal": 16,
"caution": 14,
"alert": 6
}
]
},
{
"name": "cpu_user",
"unit": {
"base": ""
},
"scope": "hwthread",
"aggregation": "avg",
"timestep": 60,
"peak": 100,
"normal": 50,
"caution": 20,
"alert": 10
},
{
"name": "mem_used",
"unit": {
"base": "B",
"prefix": "G"
},
"scope": "node",
"aggregation": "sum",
"timestep": 60,
"peak": 96,
"normal": 45,
"caution": 80,
"alert": 90,
"subClusters": [
{
"name": "rtx3080",
"peak": 384,
"normal": 192,
"caution": 320,
"alert": 375
},
{
"name": "a100",
"peak": 512,
"normal": 256,
"caution": 480,
"alert": 500
}
]
},
{
"name": "flops_any",
"unit": {
"base": "F/s",
"prefix": "G"
},
"scope": "hwthread",
"aggregation": "sum",
"timestep": 60,
"peak": 5600,
"normal": 1000,
"caution": 200,
"alert": 50
},
{
"name": "flops_sp",
"unit": {
"base": "F/s",
"prefix": "G"
},
"scope": "hwthread",
"aggregation": "sum",
"timestep": 60,
"peak": 5600,
"normal": 1000,
"caution": 200,
"alert": 50
},
{
"name": "flops_dp",
"unit": {
"base": "F/s",
"prefix": "G"
},
"scope": "hwthread",
"aggregation": "sum",
"timestep": 60,
"peak": 2300,
"normal": 500,
"caution": 100,
"alert": 50
},
{
"name": "mem_bw",
"unit": {
"base": "B/s",
"prefix": "G"
},
"scope": "socket",
"aggregation": "sum",
"timestep": 60,
"peak": 350,
"normal": 100,
"caution": 50,
"alert": 10
},
{
"name": "clock",
"unit": {
"base": "Hz",
"prefix": "M"
},
"scope": "hwthread",
"aggregation": "avg",
"timestep": 60,
"peak": 3000,
"normal": 2400,
"caution": 1800,
"alert": 1200
},
{
"name": "cpu_power",
"unit": {
"base": "W"
},
"scope": "socket",
"aggregation": "sum",
"timestep": 60,
"peak": 500,
"normal": 250,
"caution": 100,
"alert": 50
},
{
"name": "mem_power",
"unit": {
"base": "W"
},
"scope": "socket",
"aggregation": "sum",
"timestep": 60,
"peak": 100,
"normal": 50,
"caution": 20,
"alert": 10
},
{
"name": "ipc",
"unit": {
"base": "IPC"
},
"scope": "hwthread",
"aggregation": "avg",
"timestep": 60,
"peak": 4,
"normal": 2,
"caution": 1,
"alert": 0.5
},
{
"name": "vectorization_ratio",
"unit": {
"base": ""
},
"scope": "hwthread",
"aggregation": "avg",
"timestep": 60,
"peak": 100,
"normal": 60,
"caution": 40,
"alert": 10
},
{
"name": "acc_utilization",
"unit": {
"base": ""
},
"scope": "accelerator",
"aggregation": "avg",
"timestep": 60,
"peak": 100,
"normal": 80,
"caution": 50,
"alert": 20
},
{
"name": "acc_mem_used",
"unit": {
"base": "B",
"prefix": "G"
},
"scope": "accelerator",
"aggregation": "sum",
"timestep": 60,
"peak": 40,
"normal": 20,
"caution": 10,
"alert": 5
},
{
"name": "acc_power",
"unit": {
"base": "W"
},
"scope": "accelerator",
"aggregation": "sum",
"timestep": 60,
"peak": 400,
"normal": 200,
"caution": 50,
"alert": 20
},
{
"name": "nv_mem_util",
"unit": {
"base": ""
},
"scope": "accelerator",
"aggregation": "avg",
"timestep": 60,
"peak": 100,
"normal": 80,
"caution": 20,
"alert": 10
},
{
"name": "nv_temp",
"unit": {
"base": "°C"
},
"scope": "accelerator",
"aggregation": "avg",
"timestep": 60,
"peak": 40,
"normal": 20,
"caution": 5,
"alert": 2
},
{
"name": "nv_sm_clock",
"unit": {
"base": "Hz",
"prefix": "M"
},
"scope": "accelerator",
"aggregation": "avg",
"timestep": 60,
"peak": 1400,
"normal": 1200,
"caution": 100,
"alert": 50
},
{
"name": "nfs4_read",
"unit": {
"base": "B/s",
"prefix": "M"
},
"scope": "node",
"aggregation": "sum",
"timestep": 60,
"peak": 6,
"normal": 4,
"caution": 2,
"alert": 1
},
{
"name": "nfs4_write",
"unit": {
"base": "B/s",
"prefix": "M"
},
"scope": "node",
"aggregation": "sum",
"timestep": 60,
"peak": 6,
"normal": 4,
"caution": 2,
"alert": 1
},
{
"name": "nfs4_total",
"unit": {
"base": "B/s",
"prefix": "M"
},
"scope": "node",
"aggregation": "sum",
"timestep": 60,
"peak": 6,
"normal": 4,
"caution": 2,
"alert": 1
}
],
"subClusters": [
{
"name": "rtx3080",
"nodes": "tg0[80-86]",
"processorType": "Intel(R) Xeon(R) Gold 6226R CPU @ 2.90GHz",
"socketsPerNode": 2,
"coresPerSocket": 16,
"threadsPerCore": 2,
"flopRateScalar": {
"unit": {
"base": "F/s",
"prefix": "G"
},
"value": 111
},
"flopRateSimd": {
"unit": {
"base": "F/s",
"prefix": "G"
},
"value": 787
},
"memoryBandwidth": {
"unit": {
"base": "B/s",
"prefix": "G"
},
"value": 229
},
"topology": {
"node": [
0, 32, 1, 33, 2, 34, 3, 35, 4, 36, 5, 37, 6, 38, 7, 39, 8, 40, 9, 41, 10, 42, 11, 43, 12, 44, 13, 45, 14, 46, 15, 47, 16, 48, 17, 49, 18, 50, 19, 51, 20, 52, 21, 53, 22, 54, 23, 55, 24, 56, 25, 57, 26, 58, 27, 59, 28, 60, 29, 61, 30, 62, 31, 63
],
"socket": [
[ 0, 32, 1, 33, 2, 34, 3, 35, 4, 36, 5, 37, 6, 38, 7, 39, 8, 40, 9, 41, 10, 42, 11, 43, 12, 44, 13, 45, 14, 46, 15, 47 ],
[ 16, 48, 17, 49, 18, 50, 19, 51, 20, 52, 21, 53, 22, 54, 23, 55, 24, 56, 25, 57, 26, 58, 27, 59, 28, 60, 29, 61, 30, 62, 31, 63 ]
],
"memoryDomain": [
[ 0, 32, 1, 33, 2, 34, 3, 35, 4, 36, 5, 37, 6, 38, 7, 39, 8, 40, 9, 41, 10, 42, 11, 43, 12, 44, 13, 45, 14, 46, 15, 47 ],
[ 16, 48, 17, 49, 18, 50, 19, 51, 20, 52, 21, 53, 22, 54, 23, 55, 24, 56, 25, 57, 26, 58, 27, 59, 28, 60, 29, 61, 30, 62, 31, 63 ]
],
"core": [
[ 0, 32 ], [ 1, 33 ], [ 2, 34 ], [ 3, 35 ], [ 4, 36 ], [ 5, 37 ], [ 6, 38 ], [ 7, 39 ], [ 8, 40 ], [ 9, 41 ], [ 10, 42 ], [ 11, 43 ], [ 12, 44 ], [ 13, 45 ], [ 14, 46 ], [ 15, 47 ], [ 16, 48 ], [ 17, 49 ], [ 18, 50 ], [ 19, 51 ], [ 20, 52 ], [ 21, 53 ], [ 22, 54 ], [ 23, 55 ], [ 24, 56 ], [ 25, 57 ], [ 26, 58 ], [ 27, 59 ], [ 28, 60 ], [ 29, 61 ], [ 30, 62 ], [ 31, 63 ]
],
"accelerators": [
{
"id": "00000000:1a:00.0",
"type": "Nvidia GPU",
"model": "NVIDIA GeForce RTX 3080"
},
{
"id": "00000000:1b:00.0",
"type": "Nvidia GPU",
"model": "NVIDIA GeForce RTX 3080"
},
{
"id": "00000000:3d:00.0",
"type": "Nvidia GPU",
"model": "NVIDIA GeForce RTX 3080"
},
{
"id": "00000000:3e:00.0",
"type": "Nvidia GPU",
"model": "NVIDIA GeForce RTX 3080"
},
{
"id": "00000000:b1:00.0",
"type": "Nvidia GPU",
"model": "NVIDIA GeForce RTX 3080"
},
{
"id": "00000000:b2:00.0",
"type": "Nvidia GPU",
"model": "NVIDIA GeForce RTX 3080"
},
{
"id": "00000000:da:00.0",
"type": "Nvidia GPU",
"model": "NVIDIA GeForce RTX 3080"
},
{
"id": "00000000:db:00.0",
"type": "Nvidia GPU",
"model": "NVIDIA GeForce RTX 3080"
}
]
}
},
{
"name": "rtx2080",
"nodes": "tg0[60-69],tg06a,tg06b",
"processorType": "Intel(R) Xeon(R) Gold 6134 CPU @ 3.20GHz",
"socketsPerNode": 2,
"coresPerSocket": 8,
"threadsPerCore": 2,
"flopRateScalar": {
"unit": {
"base": "F/s",
"prefix": "G"
},
"value": 47
},
"flopRateSimd": {
"unit": {
"base": "F/s",
"prefix": "G"
},
"value": 326
},
"memoryBandwidth": {
"unit": {
"base": "B/s",
"prefix": "G"
},
"value": 137
},
"topology": {
"node": [
0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31
],
"socket": [
[ 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23 ],
[ 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31 ]
],
"memoryDomain": [
[ 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23 ],
[ 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31 ]
],
"core": [
[ 0, 16 ], [ 1, 17 ], [ 2, 18 ], [ 3, 19 ], [ 4, 20 ], [ 5, 21 ], [ 6, 22 ], [ 7, 23 ], [ 8, 24 ], [ 9, 25 ], [ 10, 26 ], [ 11, 27 ], [ 12, 28 ], [ 13, 29 ], [ 14, 30 ], [ 15, 31 ]
],
"accelerators": [
{
"id": "00000000:18:00.0",
"type": "Nvidia GPU",
"model": "NVIDIA GeForce RTX 2080 Ti"
},
{
"id": "00000000:3b:00.0",
"type": "Nvidia GPU",
"model": "NVIDIA GeForce RTX 2080 Ti"
},
{
"id": "00000000:86:00.0",
"type": "Nvidia GPU",
"model": "NVIDIA GeForce RTX 2080 Ti"
},
{
"id": "00000000:af:00.0",
"type": "Nvidia GPU",
"model": "NVIDIA GeForce RTX 2080 Ti"
}
]
}
},
{
"name": "a100",
"nodes": "tg0[90-97]",
"processorType": "AMD EPYC 7662 64-Core Processor",
"socketsPerNode": 2,
"coresPerSocket": 64,
"threadsPerCore": 1,
"flopRateScalar": {
"unit": {
"base": "F/s",
"prefix": "G"
},
"value": 987
},
"flopRateSimd": {
"unit": {
"base": "F/s",
"prefix": "G"
},
"value": 5660
},
"memoryBandwidth": {
"unit": {
"base": "B/s",
"prefix": "G"
},
"value": 306
},
"topology": {
"node": [
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127
],
"socket": [
[ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63 ],
[ 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127 ]
],
"memoryDomain": [
[ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 ],
[ 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63 ],
[ 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95 ],
[ 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127 ]
],
"core": [
[ 0 ], [ 1 ], [ 2 ], [ 3 ], [ 4 ], [ 5 ], [ 6 ], [ 7 ], [ 8 ], [ 9 ], [ 10 ], [ 11 ], [ 12 ], [ 13 ], [ 14 ], [ 15 ], [ 16 ], [ 17 ], [ 18 ], [ 19 ], [ 20 ], [ 21 ], [ 22 ], [ 23 ], [ 24 ], [ 25 ], [ 26 ], [ 27 ], [ 28 ], [ 29 ], [ 30 ], [ 31 ], [ 32 ], [ 33 ], [ 34 ], [ 35 ], [ 36 ], [ 37 ], [ 38 ], [ 39 ], [ 40 ], [ 41 ], [ 42 ], [ 43 ], [ 44 ], [ 45 ], [ 46 ], [ 47 ], [ 48 ], [ 49 ], [ 50 ], [ 51 ], [ 52 ], [ 53 ], [ 54 ], [ 55 ], [ 56 ], [ 57 ], [ 58 ], [ 59 ], [ 60 ], [ 61 ], [ 62 ], [ 63 ], [ 64 ], [ 65 ], [ 66 ], [ 67 ], [ 68 ], [ 69 ], [ 70 ], [ 71 ], [ 72 ], [ 73 ], [ 74 ], [ 75 ], [ 76 ], [ 77 ], [ 78 ], [ 79 ], [ 80 ], [ 81 ], [ 82 ], [ 83 ], [ 84 ], [ 85 ], [ 86 ], [ 87 ], [ 88 ], [ 89 ], [ 90 ], [ 91 ], [ 92 ], [ 93 ], [ 94 ], [ 95 ], [ 96 ], [ 97 ], [ 98 ], [ 99 ], [ 100 ], [ 101 ], [ 102 ], [ 103 ], [ 104 ], [ 105 ], [ 106 ], [ 107 ], [ 108 ], [ 109 ], [ 110 ], [ 111 ], [ 112 ], [ 113 ], [ 114 ], [ 115 ], [ 116 ], [ 117 ], [ 118 ], [ 119 ], [ 120 ], [ 121 ], [ 122 ], [ 123 ], [ 124 ], [ 125 ], [ 126 ], [ 127 ]
],
"accelerators": [
{
"id": "00000000:01:00.0",
"type": "Nvidia GPU",
"model": "NVIDIA A100-SXM4-40GB"
},
{
"id": "00000000:41:00.0",
"type": "Nvidia GPU",
"model": "NVIDIA A100-SXM4-40GB"
},
{
"id": "00000000:81:00.0",
"type": "Nvidia GPU",
"model": "NVIDIA A100-SXM4-40GB"
},
{
"id": "00000000:c1:00.0",
"type": "Nvidia GPU",
"model": "NVIDIA A100-SXM4-40GB"
}
]
}
},
{
"name": "v100",
"nodes": "tg0[71-74]",
"processorType": "Intel(R) Xeon(R) Gold 6134 CPU @ 3.20GHz",
"socketsPerNode": 2,
"coresPerSocket": 8,
"threadsPerCore": 2,
"flopRateScalar": {
"unit": {
"base": "F/s",
"prefix": "G"
},
"value": 59
},
"flopRateSimd": {
"unit": {
"base": "F/s",
"prefix": "G"
},
"value": 430
},
"memoryBandwidth": {
"unit": {
"base": "B/s",
"prefix": "G"
},
"value": 177
},
"topology": {
"node": [
0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31
],
"socket": [
[ 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23 ],
[ 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31 ]
],
"memoryDomain": [
[ 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23 ],
[ 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31 ]
],
"core": [
[ 0, 16 ], [ 1, 17 ], [ 2, 18 ], [ 3, 19 ], [ 4, 20 ], [ 5, 21 ], [ 6, 22 ], [ 7, 23 ], [ 8, 24 ], [ 9, 25 ], [ 10, 26 ], [ 11, 27 ], [ 12, 28 ], [ 13, 29 ], [ 14, 30 ], [ 15, 31 ]
],
"accelerators": [
{
"id": "00000000:18:00.0",
"type": "Nvidia GPU",
"model": "Tesla V100-PCIE-32GB"
},
{
"id": "00000000:3b:00.0",
"type": "Nvidia GPU",
"model": "Tesla V100-PCIE-32GB"
},
{
"id": "00000000:86:00.0",
"type": "Nvidia GPU",
"model": "Tesla V100-PCIE-32GB"
},
{
"id": "00000000:af:00.0",
"type": "Nvidia GPU",
"model": "Tesla V100-PCIE-32GB"
}
]
}
}
]
}