{ "name": "alex", "metricConfig": [ { "name": "cpu_load", "unit": { "base": "" }, "scope": "node", "aggregation": "avg", "footprint": "avg", "timestep": 60, "peak": 128, "normal": 128, "caution": 10, "alert": 5 }, { "name": "cpu_user", "unit": { "base": "" }, "scope": "hwthread", "aggregation": "avg", "timestep": 60, "peak": 100, "normal": 50, "caution": 20, "alert": 10 }, { "name": "mem_used", "unit": { "base": "B", "prefix": "G" }, "scope": "node", "aggregation": "sum", "footprint": "max", "timestep": 60, "peak": 512, "normal": 128, "caution": 200, "alert": 240 }, { "name": "flops_any", "unit": { "base": "Flops/s", "prefix": "G" }, "scope": "hwthread", "aggregation": "sum", "footprint": "avg", "timestep": 60, "peak": 9216, "normal": 1000, "caution": 200, "alert": 50 }, { "name": "net_bytes_in", "unit": { "base": "B/s" }, "scope": "node", "aggregation": "sum", "timestep": 60, "peak": 125000000, "normal": 125000000, "caution": 200, "alert": 240 }, { "name": "net_bytes_out", "unit": { "base": "B/s" }, "scope": "node", "aggregation": "sum", "timestep": 60, "peak": 125000000, "normal": 125000000, "caution": 200, "alert": 240 }, { "name": "mem_bw", "unit": { "base": "B/s", "prefix": "G" }, "scope": "socket", "aggregation": "sum", "footprint": "avg", "timestep": 60, "peak": 350, "normal": 100, "caution": 50, "alert": 10 }, { "name": "clock", "unit": { "base": "Hz", "prefix": "M" }, "scope": "hwthread", "aggregation": "avg", "timestep": 60, "peak": 3000, "normal": 2400, "caution": 1800, "alert": 1200 }, { "name": "core_power", "unit": { "base": "W" }, "scope": "hwthread", "aggregation": "sum", "energy": "power", "timestep": 60, "peak": 500, "normal": 250, "caution": 100, "alert": 50 }, { "name": "acc_utilization", "unit": { "base": "" }, "scope": "accelerator", "aggregation": "avg", "footprint": "avg", "timestep": 60, "peak": 100, "normal": 80, "caution": 50, "alert": 20 }, { "name": "acc_mem_used", "unit": { "base": "B", "prefix": "M" }, "scope": "accelerator", 
"aggregation": "sum", "footprint": "max", "timestep": 60, "peak": 320000, "normal": 160000, "caution": 80000, "alert": 40000, "subClusters": [ { "name": "a100m80", "peak": 640000, "normal": 320000, "caution": 160000, "alert": 80000, "footprint": "max" } ] }, { "name": "acc_power", "unit": { "base": "W" }, "scope": "accelerator", "aggregation": "sum", "energy": "power", "timestep": 60, "peak": 3200, "normal": 1600, "caution": 400, "alert": 160 }, { "name": "nv_mem_util", "unit": { "base": "" }, "scope": "accelerator", "aggregation": "avg", "timestep": 60, "peak": 100, "normal": 80, "caution": 20, "alert": 10 }, { "name": "nv_temp", "unit": { "base": "°C" }, "scope": "accelerator", "aggregation": "avg", "timestep": 60, "peak": 40, "normal": 20, "caution": 5, "alert": 2 }, { "name": "nv_sm_clock", "unit": { "base": "Hz", "prefix": "M" }, "scope": "accelerator", "aggregation": "avg", "timestep": 60, "peak": 1400, "normal": 1200, "caution": 100, "alert": 50 }, { "name": "cpu_power", "unit": { "base": "W" }, "scope": "socket", "aggregation": "sum", "energy": "power", "timestep": 60, "peak": 500, "normal": 250, "caution": 100, "alert": 50 }, { "name": "ipc", "unit": { "base": "IPC" }, "scope": "hwthread", "aggregation": "avg", "timestep": 60, "peak": 4, "normal": 2, "caution": 1, "alert": 0.5 } ], "subClusters": [ { "name": "a40", "nodes": "a[0121-0129],a[0221-0229],a[0321-0329],a[0421-0429],a[0521-0522],a[1621-1624],a[1721-1722]", "processorType": "AMD Milan", "socketsPerNode": 2, "coresPerSocket": 64, "threadsPerCore": 1, "flopRateScalar": { "unit": { "base": "F/s", "prefix": "G" }, "value": 432 }, "flopRateSimd": { "unit": { "base": "F/s", "prefix": "G" }, "value": 9216 }, "memoryBandwidth": { "unit": { "base": "B/s", "prefix": "G" }, "value": 400 }, "topology": { "node": [ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 
52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127 ], "socket": [ [ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63 ], [ 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127 ] ], "memoryDomain": [ [ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127 ] ], "core": [ [ 0 ], [ 1 ], [ 2 ], [ 3 ], [ 4 ], [ 5 ], [ 6 ], [ 7 ], [ 8 ], [ 9 ], [ 10 ], [ 11 ], [ 12 ], [ 13 ], [ 14 ], [ 15 ], [ 16 ], [ 17 ], [ 18 ], [ 19 ], [ 20 ], [ 21 ], [ 22 ], [ 23 ], [ 24 ], [ 25 ], [ 26 ], [ 27 ], [ 28 ], [ 29 ], [ 30 ], [ 31 ], [ 32 ], [ 33 ], [ 34 ], [ 35 ], [ 36 ], [ 37 ], [ 38 ], [ 39 ], [ 40 ], [ 41 ], [ 42 ], [ 43 ], [ 44 ], [ 45 ], [ 46 ], [ 47 ], [ 48 ], [ 49 ], [ 50 ], [ 51 ], [ 52 ], [ 53 ], [ 54 ], [ 55 ], [ 56 ], [ 57 ], [ 58 ], [ 59 ], [ 60 ], [ 61 ], [ 62 ], [ 63 ], [ 64 ], [ 65 ], [ 66 ], [ 67 ], [ 68 ], [ 69 ], 
[ 70 ], [ 71 ], [ 72 ], [ 73 ], [ 74 ], [ 75 ], [ 76 ], [ 77 ], [ 78 ], [ 79 ], [ 80 ], [ 81 ], [ 82 ], [ 83 ], [ 84 ], [ 85 ], [ 86 ], [ 87 ], [ 88 ], [ 89 ], [ 90 ], [ 91 ], [ 92 ], [ 93 ], [ 94 ], [ 95 ], [ 96 ], [ 97 ], [ 98 ], [ 99 ], [ 100 ], [ 101 ], [ 102 ], [ 103 ], [ 104 ], [ 105 ], [ 106 ], [ 107 ], [ 108 ], [ 109 ], [ 110 ], [ 111 ], [ 112 ], [ 113 ], [ 114 ], [ 115 ], [ 116 ], [ 117 ], [ 118 ], [ 119 ], [ 120 ], [ 121 ], [ 122 ], [ 123 ], [ 124 ], [ 125 ], [ 126 ], [ 127 ] ], "accelerators": [ { "id": "00000000:01:00.0", "type": "Nvidia GPU", "model": "A40" }, { "id": "00000000:25:00.0", "type": "Nvidia GPU", "model": "A40" }, { "id": "00000000:41:00.0", "type": "Nvidia GPU", "model": "A40" }, { "id": "00000000:61:00.0", "type": "Nvidia GPU", "model": "A40" }, { "id": "00000000:81:00.0", "type": "Nvidia GPU", "model": "A40" }, { "id": "00000000:A1:00.0", "type": "Nvidia GPU", "model": "A40" }, { "id": "00000000:C1:00.0", "type": "Nvidia GPU", "model": "A40" }, { "id": "00000000:E1:00.0", "type": "Nvidia GPU", "model": "A40" } ] } }, { "name": "a100", "nodes": "a[0601-0605],a[0701-0705],a[0801-0805],a[0901-0905]", "processorType": "AMD Milan", "socketsPerNode": 2, "coresPerSocket": 64, "threadsPerCore": 1, "flopRateScalar": { "unit": { "base": "F/s", "prefix": "G" }, "value": 432 }, "flopRateSimd": { "unit": { "base": "F/s", "prefix": "G" }, "value": 9216 }, "memoryBandwidth": { "unit": { "base": "B/s", "prefix": "G" }, "value": 400 }, "topology": { "node": [ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 
124, 125, 126, 127 ], "socket": [ [ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63 ], [ 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127 ] ], "memoryDomain": [ [ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127 ] ], "core": [ [ 0 ], [ 1 ], [ 2 ], [ 3 ], [ 4 ], [ 5 ], [ 6 ], [ 7 ], [ 8 ], [ 9 ], [ 10 ], [ 11 ], [ 12 ], [ 13 ], [ 14 ], [ 15 ], [ 16 ], [ 17 ], [ 18 ], [ 19 ], [ 20 ], [ 21 ], [ 22 ], [ 23 ], [ 24 ], [ 25 ], [ 26 ], [ 27 ], [ 28 ], [ 29 ], [ 30 ], [ 31 ], [ 32 ], [ 33 ], [ 34 ], [ 35 ], [ 36 ], [ 37 ], [ 38 ], [ 39 ], [ 40 ], [ 41 ], [ 42 ], [ 43 ], [ 44 ], [ 45 ], [ 46 ], [ 47 ], [ 48 ], [ 49 ], [ 50 ], [ 51 ], [ 52 ], [ 53 ], [ 54 ], [ 55 ], [ 56 ], [ 57 ], [ 58 ], [ 59 ], [ 60 ], [ 61 ], [ 62 ], [ 63 ], [ 64 ], [ 65 ], [ 66 ], [ 67 ], [ 68 ], [ 69 ], [ 70 ], [ 71 ], [ 72 ], [ 73 ], [ 74 ], [ 75 ], [ 76 ], [ 77 ], [ 78 ], [ 79 ], [ 80 ], [ 81 ], [ 82 ], [ 83 ], [ 84 ], [ 85 ], [ 86 ], [ 87 ], [ 88 ], [ 89 ], [ 90 ], [ 91 ], [ 92 ], [ 93 ], [ 94 ], [ 95 ], [ 96 ], [ 97 ], [ 98 ], [ 99 ], [ 100 ], [ 101 ], [ 102 ], [ 103 ], [ 104 ], [ 105 ], [ 106 ], [ 107 ], [ 108 
], [ 109 ], [ 110 ], [ 111 ], [ 112 ], [ 113 ], [ 114 ], [ 115 ], [ 116 ], [ 117 ], [ 118 ], [ 119 ], [ 120 ], [ 121 ], [ 122 ], [ 123 ], [ 124 ], [ 125 ], [ 126 ], [ 127 ] ], "accelerators": [ { "id": "00000000:0E:00.0", "type": "Nvidia GPU", "model": "A100" }, { "id": "00000000:13:00.0", "type": "Nvidia GPU", "model": "A100" }, { "id": "00000000:49:00.0", "type": "Nvidia GPU", "model": "A100" }, { "id": "00000000:4F:00.0", "type": "Nvidia GPU", "model": "A100" }, { "id": "00000000:90:00.0", "type": "Nvidia GPU", "model": "A100" }, { "id": "00000000:96:00.0", "type": "Nvidia GPU", "model": "A100" }, { "id": "00000000:CC:00.0", "type": "Nvidia GPU", "model": "A100" }, { "id": "00000000:D1:00.0", "type": "Nvidia GPU", "model": "A100" } ] } }, { "name": "a100m80", "nodes": "a[0531-0537],a[0631-0633],a0731,a[0831-0833],a[0931-0934]", "processorType": "AMD Milan", "socketsPerNode": 2, "coresPerSocket": 64, "threadsPerCore": 1, "flopRateScalar": { "unit": { "base": "F/s", "prefix": "G" }, "value": 432 }, "flopRateSimd": { "unit": { "base": "F/s", "prefix": "G" }, "value": 9216 }, "memoryBandwidth": { "unit": { "base": "B/s", "prefix": "G" }, "value": 400 }, "topology": { "node": [ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127 ], "socket": [ [ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63 ], [ 64, 65, 
66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127 ] ], "memoryDomain": [ [ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127 ] ], "core": [ [ 0 ], [ 1 ], [ 2 ], [ 3 ], [ 4 ], [ 5 ], [ 6 ], [ 7 ], [ 8 ], [ 9 ], [ 10 ], [ 11 ], [ 12 ], [ 13 ], [ 14 ], [ 15 ], [ 16 ], [ 17 ], [ 18 ], [ 19 ], [ 20 ], [ 21 ], [ 22 ], [ 23 ], [ 24 ], [ 25 ], [ 26 ], [ 27 ], [ 28 ], [ 29 ], [ 30 ], [ 31 ], [ 32 ], [ 33 ], [ 34 ], [ 35 ], [ 36 ], [ 37 ], [ 38 ], [ 39 ], [ 40 ], [ 41 ], [ 42 ], [ 43 ], [ 44 ], [ 45 ], [ 46 ], [ 47 ], [ 48 ], [ 49 ], [ 50 ], [ 51 ], [ 52 ], [ 53 ], [ 54 ], [ 55 ], [ 56 ], [ 57 ], [ 58 ], [ 59 ], [ 60 ], [ 61 ], [ 62 ], [ 63 ], [ 64 ], [ 65 ], [ 66 ], [ 67 ], [ 68 ], [ 69 ], [ 70 ], [ 71 ], [ 72 ], [ 73 ], [ 74 ], [ 75 ], [ 76 ], [ 77 ], [ 78 ], [ 79 ], [ 80 ], [ 81 ], [ 82 ], [ 83 ], [ 84 ], [ 85 ], [ 86 ], [ 87 ], [ 88 ], [ 89 ], [ 90 ], [ 91 ], [ 92 ], [ 93 ], [ 94 ], [ 95 ], [ 96 ], [ 97 ], [ 98 ], [ 99 ], [ 100 ], [ 101 ], [ 102 ], [ 103 ], [ 104 ], [ 105 ], [ 106 ], [ 107 ], [ 108 ], [ 109 ], [ 110 ], [ 111 ], [ 112 ], [ 113 ], [ 114 ], [ 115 ], [ 116 ], [ 117 ], [ 118 ], [ 119 ], [ 120 ], [ 121 ], [ 122 ], [ 123 ], [ 124 ], [ 125 ], [ 126 ], [ 127 ] ], "accelerators": [ { "id": "00000000:0E:00.0", "type": "Nvidia GPU", "model": "A100" }, { "id": "00000000:13:00.0", 
"type": "Nvidia GPU", "model": "A100" }, { "id": "00000000:49:00.0", "type": "Nvidia GPU", "model": "A100" }, { "id": "00000000:4F:00.0", "type": "Nvidia GPU", "model": "A100" }, { "id": "00000000:90:00.0", "type": "Nvidia GPU", "model": "A100" }, { "id": "00000000:96:00.0", "type": "Nvidia GPU", "model": "A100" }, { "id": "00000000:CC:00.0", "type": "Nvidia GPU", "model": "A100" }, { "id": "00000000:D1:00.0", "type": "Nvidia GPU", "model": "A100" } ] } } ] }