diff --git a/README.md b/README.md index 4d1ecd7..7851939 100644 --- a/README.md +++ b/README.md @@ -3,9 +3,11 @@ This is a collection of overall ClusterCockpit configurations that are used in production. -We are aware that the configuration of ClusterOCkcpit is currently tedious +We are aware that the configuration of ClusterCockpit is currently tedious involving to edit many different files with partly redundant information. Hopefully we can provide something simpler in the future. -Please note that the subcluster sections in the job archive cluster config files -can (and should) be generated with [this](https://github.com/ClusterCockpit/cc-backend/blob/master/configs/generate-subcluster.pl) provided perl skript. +Please note that the `subcluster` sections in the job archive cluster.json files +can (and should) be generated with +[this](https://github.com/ClusterCockpit/cc-backend/blob/master/configs/generate-subcluster.pl) +provided Perl script. diff --git a/fau-systems/cc-backend/config.json b/fau-systems/cc-backend/config.json deleted file mode 100644 index c0d1587..0000000 --- a/fau-systems/cc-backend/config.json +++ /dev/null @@ -1,158 +0,0 @@ -{ - "addr": "0.0.0.0:443", - "stop-jobs-exceeding-walltime": 288000, - "ldap": { - "url": "ldaps://hpcldap.rrze.uni-erlangen.de", - "user_base": "ou=people,ou=hpc,dc=rrze,dc=uni-erlangen,dc=de", - "search_dn": "cn=hpcmonitoring,ou=roadm,ou=profile,ou=hpc,dc=rrze,dc=uni-erlangen,dc=de", - "user_bind": "uid={username},ou=people,ou=hpc,dc=rrze,dc=uni-erlangen,dc=de", - "user_filter": "(&(objectclass=posixAccount)(uid=*))", - "sync_interval": "24h" - }, - "https-cert-file": "/etc/letsencrypt/live/monitoring.nhr.fau.de/fullchain.pem", - "https-key-file": "/etc/letsencrypt/live/monitoring.nhr.fau.de/privkey.pem", - "user": "clustercockpit", - "group": "clustercockpit", - "archive": { - "kind": "file", - "path": "./var/job-archive", - "compression": 7, - "retention": { - "policy": "none" - } - }, - "clusters": [ - { - 
"name": "fritz", - "metricDataRepository": { - "kind": "cc-metric-store", - "url": "http://localhost:8082", - "token": "XZY" - }, - "filterRanges": { - "numNodes": { - "from": 1, - "to": 64 - }, - "duration": { - "from": 0, - "to": 86400 - }, - "startTime": { - "from": "2022-01-01T00:00:00Z", - "to": null - } - } - }, - { - "name": "alex", - "metricDataRepository": { - "kind": "cc-metric-store", - "url": "http://localhost:8082", - "token": "XZY" - }, - "filterRanges": { - "numNodes": { - "from": 1, - "to": 64 - }, - "duration": { - "from": 0, - "to": 86400 - }, - "startTime": { - "from": "2022-01-01T00:00:00Z", - "to": null - } - } - }, - { - "name": "woody", - "metricDataRepository": { - "kind": "cc-metric-store", - "url": "http://localhost:8082", - "token": "XZY" - }, - "filterRanges": { - "numNodes": { - "from": 1, - "to": 1 - }, - "duration": { - "from": 0, - "to": 172800 - }, - "startTime": { - "from": "2020-01-01T00:00:00Z", - "to": null - } - } - }, - { - "name": "tinyfat", - "metricDataRepository": { - "kind": "cc-metric-store", - "url": "http://localhost:8082", - "token": "XZY" - }, - "filterRanges": { - "numNodes": { - "from": 1, - "to": 1 - }, - "duration": { - "from": 0, - "to": 172800 - }, - "startTime": { - "from": "2020-01-01T00:00:00Z", - "to": null - } - } - }, - { - "name": "tinygpu", - "metricDataRepository": { - "kind": "cc-metric-store", - "url": "http://localhost:8082", - "token": "XZY" - }, - "filterRanges": { - "numNodes": { - "from": 1, - "to": 1 - }, - "duration": { - "from": 0, - "to": 172800 - }, - "startTime": { - "from": "2020-01-01T00:00:00Z", - "to": null - } - } - }, - { - "name": "meggie", - "metricDataRepository": { - "kind": "cc-metric-store", - "url": "http://localhost:8082", - "token": "XZY" - }, - "filterRanges": { - "numNodes": { - "from": 1, - "to": 64 - }, - "duration": { - "from": 0, - "to": 86400 - }, - "startTime": { - "from": "2018-01-01T00:00:00Z", - "to": null - } - } - } - ] -} diff --git 
a/fau-systems/cc-metric-store/config.json b/fau-systems/cc-metric-store/config.json deleted file mode 100644 index 272a790..0000000 --- a/fau-systems/cc-metric-store/config.json +++ /dev/null @@ -1,180 +0,0 @@ -{ - "metrics": { - "clock": { - "frequency": 60, - "aggregation": "avg" - }, - "cpu_idle": { - "frequency": 60, - "aggregation": "avg" - }, - "cpu_iowait": { - "frequency": 60, - "aggregation": "avg" - }, - "cpu_irq": { - "frequency": 60, - "aggregation": "avg" - }, - "cpu_system": { - "frequency": 60, - "aggregation": "avg" - }, - "cpu_user": { - "frequency": 60, - "aggregation": "avg" - }, - "nv_mem_util": { - "frequency": 60, - "aggregation": "avg" - }, - "nv_temp": { - "frequency": 60, - "aggregation": "avg" - }, - "nv_sm_clock": { - "frequency": 60, - "aggregation": "avg" - }, - "acc_utilization": { - "frequency": 60, - "aggregation": "avg" - }, - "acc_mem_used": { - "frequency": 60, - "aggregation": "sum" - }, - "acc_power": { - "frequency": 60, - "aggregation": "sum" - }, - "flops_any": { - "frequency": 60, - "aggregation": "sum" - }, - "flops_dp": { - "frequency": 60, - "aggregation": "sum" - }, - "flops_sp": { - "frequency": 60, - "aggregation": "sum" - }, - "ib_recv": { - "frequency": 60, - "aggregation": "sum" - }, - "ib_xmit": { - "frequency": 60, - "aggregation": "sum" - }, - "ib_recv_pkts": { - "frequency": 60, - "aggregation": "sum" - }, - "ib_xmit_pkts": { - "frequency": 60, - "aggregation": "sum" - }, - "cpu_power": { - "frequency": 60, - "aggregation": "sum" - }, - "core_power": { - "frequency": 60, - "aggregation": "sum" - }, - "mem_power": { - "frequency": 60, - "aggregation": "sum" - }, - "ipc": { - "frequency": 60, - "aggregation": "avg" - }, - "cpu_load": { - "frequency": 60, - "aggregation": null - }, - "lustre_close": { - "frequency": 60, - "aggregation": null - }, - "lustre_open": { - "frequency": 60, - "aggregation": null - }, - "lustre_statfs": { - "frequency": 60, - "aggregation": null - }, - "lustre_read_bytes": { - "frequency": 
60, - "aggregation": null - }, - "lustre_write_bytes": { - "frequency": 60, - "aggregation": null - }, - "net_bw": { - "frequency": 60, - "aggregation": null - }, - "file_bw": { - "frequency": 60, - "aggregation": null - }, - "mem_bw": { - "frequency": 60, - "aggregation": "sum" - }, - "mem_cached": { - "frequency": 60, - "aggregation": null - }, - "mem_used": { - "frequency": 60, - "aggregation": null - }, - "net_bytes_in": { - "frequency": 60, - "aggregation": null - }, - "net_bytes_out": { - "frequency": 60, - "aggregation": null - }, - "nfs4_read": { - "frequency": 60, - "aggregation": null - }, - "nfs4_total": { - "frequency": 60, - "aggregation": null - }, - "nfs4_write": { - "frequency": 60, - "aggregation": null - }, - "vectorization_ratio": { - "frequency": 60, - "aggregation": "avg" - } - }, - "checkpoints": { - "interval": "12h", - "directory": "/opt/monitoring/cc-metric-store/fritz/checkpoints", - "restore": "48h" - }, - "archive": { - "interval": "50h", - "directory": "/opt/monitoring/cc-metric-store/fritz/archive" - }, - "http-api": { - "address": "0.0.0.0:8082", - "https-cert-file": null, - "https-key-file": null - }, - "retention-in-memory": "48h", - "jwt-public-key": "XZY" -} diff --git a/fau-systems/job-archive/cluster-alex.json b/fau-systems/job-archive/cluster-alex.json deleted file mode 100644 index a669cba..0000000 --- a/fau-systems/job-archive/cluster-alex.json +++ /dev/null @@ -1,484 +0,0 @@ -{ - "name": "alex", - "metricConfig": [ - { - "name": "cpu_load", - "unit": { - "base": "" - }, - "scope": "node", - "aggregation": "avg", - "timestep": 60, - "peak": 128, - "normal": 128, - "caution": 10, - "alert": 5 - }, - { - "name": "cpu_user", - "unit": { - "base": "" - }, - "scope": "hwthread", - "aggregation": "avg", - "timestep": 60, - "peak": 100, - "normal": 50, - "caution": 20, - "alert": 10 - }, - { - "name": "mem_used", - "unit": { - "base": "B", - "prefix": "G" - }, - "scope": "node", - "aggregation": "sum", - "timestep": 60, - "peak": 
512, - "normal": 128, - "caution": 200, - "alert": 240 - }, - { - "name": "flops_any", - "unit": { - "base": "Flops/s", - "prefix": "G" - }, - "scope": "hwthread", - "aggregation": "sum", - "timestep": 60, - "peak": 9216, - "normal": 1000, - "caution": 200, - "alert": 50 - }, - { - "name": "mem_bw", - "unit": { - "base": "B/s", - "prefix": "G" - }, - "scope": "socket", - "aggregation": "sum", - "timestep": 60, - "peak": 350, - "normal": 100, - "caution": 50, - "alert": 10 - }, - { - "name": "clock", - "unit": { - "base": "Hz", - "prefix": "M" - }, - "scope": "hwthread", - "aggregation": "avg", - "timestep": 60, - "peak": 3000, - "normal": 2400, - "caution": 1800, - "alert": 1200 - }, - { - "name": "core_power", - "unit": { - "base": "W" - }, - "scope": "hwthread", - "aggregation": "sum", - "timestep": 60, - "peak": 500, - "normal": 250, - "caution": 100, - "alert": 50 - }, - { - "name": "acc_utilization", - "unit": { - "base": "" - }, - "scope": "accelerator", - "aggregation": "avg", - "timestep": 60, - "peak": 100, - "normal": 80, - "caution": 50, - "alert": 20 - }, - { - "name": "acc_mem_used", - "unit": { - "base": "B", - "prefix": "G" - }, - "scope": "accelerator", - "aggregation": "sum", - "timestep": 60, - "peak": 40, - "normal": 20, - "caution": 10, - "alert": 5 - }, - { - "name": "acc_power", - "unit": { - "base": "W" - }, - "scope": "accelerator", - "aggregation": "sum", - "timestep": 60, - "peak": 400, - "normal": 200, - "caution": 50, - "alert": 20 - }, - { - "name": "nv_mem_util", - "unit": { - "base": "" - }, - "scope": "accelerator", - "aggregation": "avg", - "timestep": 60, - "peak": 100, - "normal": 80, - "caution": 20, - "alert": 10 - }, - { - "name": "nv_temp", - "unit": { - "base": "°C" - }, - "scope": "accelerator", - "aggregation": "avg", - "timestep": 60, - "peak": 40, - "normal": 20, - "caution": 5, - "alert": 2 - }, - { - "name": "nv_sm_clock", - "unit": { - "base": "Hz", - "prefix": "M" - }, - "scope": "accelerator", - "aggregation": "avg", 
- "timestep": 60, - "peak": 1400, - "normal": 1200, - "caution": 100, - "alert": 50 - }, - { - "name": "cpu_power", - "unit": { - "base": "W" - }, - "scope": "socket", - "aggregation": "sum", - "timestep": 60, - "peak": 500, - "normal": 250, - "caution": 100, - "alert": 50 - }, - { - "name": "ipc", - "unit": { - "base": "IPC" - }, - "scope": "hwthread", - "aggregation": "avg", - "timestep": 60, - "peak": 4, - "normal": 2, - "caution": 1, - "alert": 0.5 - } - ], - "subClusters": [ - { - "name": "a40", - "nodes": "a[0121-0129],a[0221-0229],a[0321-0329],a[0421-0429],a[0521-0522]", - "processorType": "AMD Milan", - "socketsPerNode": 2, - "coresPerSocket": 64, - "threadsPerCore": 1, - "flopRateScalar": { - "unit": { - "base": "F/s", - "prefix": "G" - }, - "value": 432 - }, - "flopRateSimd": { - "unit": { - "base": "F/s", - "prefix": "G" - }, - "value": 9216 - }, - "memoryBandwidth": { - "unit": { - "base": "B/s", - "prefix": "G" - }, - "value": 400 - }, - "topology": { - "node": [ - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127 - ], - "socket": [ - [ - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63 - ], - [ - 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 
110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127 - ] - ], - "memoryDomain": [ - [ - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127 - ] - ], - "core": [ - [ 0 ], [ 1 ], [ 2 ], [ 3 ], [ 4 ], [ 5 ], [ 6 ], [ 7 ], [ 8 ], [ 9 ], [ 10 ], [ 11 ], [ 12 ], [ 13 ], [ 14 ], [ 15 ], [ 16 ], [ 17 ], [ 18 ], [ 19 ], [ 20 ], [ 21 ], [ 22 ], [ 23 ], [ 24 ], [ 25 ], [ 26 ], [ 27 ], [ 28 ], [ 29 ], [ 30 ], [ 31 ], [ 32 ], [ 33 ], [ 34 ], [ 35 ], [ 36 ], [ 37 ], [ 38 ], [ 39 ], [ 40 ], [ 41 ], [ 42 ], [ 43 ], [ 44 ], [ 45 ], [ 46 ], [ 47 ], [ 48 ], [ 49 ], [ 50 ], [ 51 ], [ 52 ], [ 53 ], [ 54 ], [ 55 ], [ 56 ], [ 57 ], [ 58 ], [ 59 ], [ 60 ], [ 61 ], [ 62 ], [ 63 ], [ 64 ], [ 65 ], [ 66 ], [ 67 ], [ 68 ], [ 69 ], [ 70 ], [ 71 ], [ 73 ], [ 74 ], [ 75 ], [ 76 ], [ 77 ], [ 78 ], [ 79 ], [ 80 ], [ 81 ], [ 82 ], [ 83 ], [ 84 ], [ 85 ], [ 86 ], [ 87 ], [ 88 ], [ 89 ], [ 90 ], [ 91 ], [ 92 ], [ 93 ], [ 94 ], [ 95 ], [ 96 ], [ 97 ], [ 98 ], [ 99 ], [ 100 ], [ 101 ], [ 102 ], [ 103 ], [ 104 ], [ 105 ], [ 106 ], [ 107 ], [ 108 ], [ 109 ], [ 110 ], [ 111 ], [ 112 ], [ 113 ], [ 114 ], [ 115 ], [ 116 ], [ 117 ], [ 118 ], [ 119 ], [ 120 ], [ 121 ], [ 122 ], [ 123 ], [ 124 ], [ 125 ], [ 126 ], [ 127 ] - ], - "accelerators": [ - { - "id": "00000000:01:00.0", - "type": "Nvidia GPU", - "model": "A40" - }, - { - "id": "00000000:25:00.0", - "type": "Nvidia GPU", - "model": "A40" - }, - { - "id": "00000000:41:00.0", - "type": "Nvidia GPU", - "model": "A40" - }, - { - "id": "00000000:61:00.0", 
- "type": "Nvidia GPU", - "model": "A40" - }, - { - "id": "00000000:81:00.0", - "type": "Nvidia GPU", - "model": "A40" - }, - { - "id": "00000000:A1:00.0", - "type": "Nvidia GPU", - "model": "A40" - }, - { - "id": "00000000:C1:00.0", - "type": "Nvidia GPU", - "model": "A40" - }, - { - "id": "00000000:E1:00.0", - "type": "Nvidia GPU", - "model": "A40" - } - ] - } - }, - { - "name": "a100", - "nodes": "a[0601-0605],a[0701-0705],a[0801-0805],a[0901-0905]", - "processorType": "AMD Milan", - "socketsPerNode": 2, - "coresPerSocket": 64, - "threadsPerCore": 1, - "flopRateScalar": { - "unit": { - "base": "F/s", - "prefix": "G" - }, - "value": 432 - }, - "flopRateSimd": { - "unit": { - "base": "F/s", - "prefix": "G" - }, - "value": 9216 - }, - "memoryBandwidth": { - "unit": { - "base": "B/s", - "prefix": "G" - }, - "value": 400 - }, - "topology": { - "node": [ - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127 - ], - "socket": [ - [ - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63 - ], - [ - 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127 - ] - ], - "memoryDomain": [ - [ - 0, 1, 
2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127 - ] - ], - "core": [ - [ 0 ], [ 1 ], [ 2 ], [ 3 ], [ 4 ], [ 5 ], [ 6 ], [ 7 ], [ 8 ], [ 9 ], [ 10 ], [ 11 ], [ 12 ], [ 13 ], [ 14 ], [ 15 ], [ 16 ], [ 17 ], [ 18 ], [ 19 ], [ 20 ], [ 21 ], [ 22 ], [ 23 ], [ 24 ], [ 25 ], [ 26 ], [ 27 ], [ 28 ], [ 29 ], [ 30 ], [ 31 ], [ 32 ], [ 33 ], [ 34 ], [ 35 ], [ 36 ], [ 37 ], [ 38 ], [ 39 ], [ 40 ], [ 41 ], [ 42 ], [ 43 ], [ 44 ], [ 45 ], [ 46 ], [ 47 ], [ 48 ], [ 49 ], [ 50 ], [ 51 ], [ 52 ], [ 53 ], [ 54 ], [ 55 ], [ 56 ], [ 57 ], [ 58 ], [ 59 ], [ 60 ], [ 61 ], [ 62 ], [ 63 ], [ 64 ], [ 65 ], [ 66 ], [ 67 ], [ 68 ], [ 69 ], [ 70 ], [ 71 ], [ 73 ], [ 74 ], [ 75 ], [ 76 ], [ 77 ], [ 78 ], [ 79 ], [ 80 ], [ 81 ], [ 82 ], [ 83 ], [ 84 ], [ 85 ], [ 86 ], [ 87 ], [ 88 ], [ 89 ], [ 90 ], [ 91 ], [ 92 ], [ 93 ], [ 94 ], [ 95 ], [ 96 ], [ 97 ], [ 98 ], [ 99 ], [ 100 ], [ 101 ], [ 102 ], [ 103 ], [ 104 ], [ 105 ], [ 106 ], [ 107 ], [ 108 ], [ 109 ], [ 110 ], [ 111 ], [ 112 ], [ 113 ], [ 114 ], [ 115 ], [ 116 ], [ 117 ], [ 118 ], [ 119 ], [ 120 ], [ 121 ], [ 122 ], [ 123 ], [ 124 ], [ 125 ], [ 126 ], [ 127 ] - ], - "accelerators": [ - { - "id": "00000000:0E:00.0", - "type": "Nvidia GPU", - "model": "A100" - }, - { - "id": "00000000:13:00.0", - "type": "Nvidia GPU", - "model": "A100" - }, - { - "id": "00000000:49:00.0", - "type": "Nvidia GPU", - "model": "A100" - }, - { - "id": "00000000:4F:00.0", - "type": "Nvidia GPU", - "model": "A100" - }, - { - "id": "00000000:90:00.0", - "type": "Nvidia GPU", - "model": "A100" - }, 
- { - "id": "00000000:96:00.0", - "type": "Nvidia GPU", - "model": "A100" - }, - { - "id": "00000000:CC:00.0", - "type": "Nvidia GPU", - "model": "A100" - }, - { - "id": "00000000:D1:00.0", - "type": "Nvidia GPU", - "model": "A100" - } - ] - } - }, - { - "name": "a100m80", - "nodes": "a[0531-0537],a[0631-0633],a0831,a[0931-0934]", - "processorType": "AMD Milan", - "socketsPerNode": 2, - "coresPerSocket": 64, - "threadsPerCore": 1, - "flopRateScalar": { - "unit": { - "base": "F/s", - "prefix": "G" - }, - "value": 432 - }, - "flopRateSimd": { - "unit": { - "base": "F/s", - "prefix": "G" - }, - "value": 9216 - }, - "memoryBandwidth": { - "unit": { - "base": "B/s", - "prefix": "G" - }, - "value": 400 - }, - "topology": { - "node": [ - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127 - ], - "socket": [ - [ - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63 - ], - [ - 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127 - ] - ], - "memoryDomain": [ - [ - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 
35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127 - ] - ], - "core": [ - [ 0 ], [ 1 ], [ 2 ], [ 3 ], [ 4 ], [ 5 ], [ 6 ], [ 7 ], [ 8 ], [ 9 ], [ 10 ], [ 11 ], [ 12 ], [ 13 ], [ 14 ], [ 15 ], [ 16 ], [ 17 ], [ 18 ], [ 19 ], [ 20 ], [ 21 ], [ 22 ], [ 23 ], [ 24 ], [ 25 ], [ 26 ], [ 27 ], [ 28 ], [ 29 ], [ 30 ], [ 31 ], [ 32 ], [ 33 ], [ 34 ], [ 35 ], [ 36 ], [ 37 ], [ 38 ], [ 39 ], [ 40 ], [ 41 ], [ 42 ], [ 43 ], [ 44 ], [ 45 ], [ 46 ], [ 47 ], [ 48 ], [ 49 ], [ 50 ], [ 51 ], [ 52 ], [ 53 ], [ 54 ], [ 55 ], [ 56 ], [ 57 ], [ 58 ], [ 59 ], [ 60 ], [ 61 ], [ 62 ], [ 63 ], [ 64 ], [ 65 ], [ 66 ], [ 67 ], [ 68 ], [ 69 ], [ 70 ], [ 71 ], [ 73 ], [ 74 ], [ 75 ], [ 76 ], [ 77 ], [ 78 ], [ 79 ], [ 80 ], [ 81 ], [ 82 ], [ 83 ], [ 84 ], [ 85 ], [ 86 ], [ 87 ], [ 88 ], [ 89 ], [ 90 ], [ 91 ], [ 92 ], [ 93 ], [ 94 ], [ 95 ], [ 96 ], [ 97 ], [ 98 ], [ 99 ], [ 100 ], [ 101 ], [ 102 ], [ 103 ], [ 104 ], [ 105 ], [ 106 ], [ 107 ], [ 108 ], [ 109 ], [ 110 ], [ 111 ], [ 112 ], [ 113 ], [ 114 ], [ 115 ], [ 116 ], [ 117 ], [ 118 ], [ 119 ], [ 120 ], [ 121 ], [ 122 ], [ 123 ], [ 124 ], [ 125 ], [ 126 ], [ 127 ] - ], - "accelerators": [ - { - "id": "00000000:0E:00.0", - "type": "Nvidia GPU", - "model": "A100" - }, - { - "id": "00000000:13:00.0", - "type": "Nvidia GPU", - "model": "A100" - }, - { - "id": "00000000:49:00.0", - "type": "Nvidia GPU", - "model": "A100" - }, - { - "id": "00000000:4F:00.0", - "type": "Nvidia GPU", - "model": "A100" - }, - { - "id": "00000000:90:00.0", - "type": "Nvidia GPU", - "model": "A100" - }, - { - "id": "00000000:96:00.0", - "type": "Nvidia GPU", - "model": "A100" - }, - { - "id": "00000000:CC:00.0", - "type": 
"Nvidia GPU", - "model": "A100" - }, - { - "id": "00000000:D1:00.0", - "type": "Nvidia GPU", - "model": "A100" - } - ] - } - } - ] -} \ No newline at end of file diff --git a/fau-systems/job-archive/cluster-emmy.json b/fau-systems/job-archive/cluster-emmy.json deleted file mode 100644 index 59502e5..0000000 --- a/fau-systems/job-archive/cluster-emmy.json +++ /dev/null @@ -1,164 +0,0 @@ -{ - "name": "emmy", - "subClusters": [ - { - "name": "main", - "numberOfNode": 560, - "processorType": "Intel IvyBridge", - "socketsPerNode": 2, - "coresPerSocket": 10, - "threadsPerCore": 2, - "flopRateScalar": 88, - "flopRateSimd": 704, - "memoryBandwidth": 80, - "topology": { - "node": [0,20,1,21,2,22,3,23,4,24,5,25,6,26,7,27,8,28,9,29,10,30,11,31,12,32,13,33,14,34,15,35,16,36,17,37,18,38,19,39], - "socket": [ - [0,20,1,21,2,22,3,23,4,24,5,25,6,26,7,27,8,28,9,29], - [10,30,11,31,12,32,13,33,14,34,15,35,16,36,17,37,18,38,19,39] - ], - "memoryDomain": [ - [0,20,1,21,2,22,3,23,4,24,5,25,6,26,7,27,8,28,9,29], - [10,30,11,31,12,32,13,33,14,34,15,35,16,36,17,37,18,38,19,39] - ], - "core": [ - [0,20],[1,21],[2,22],[3,23],[4,24],[5,25],[6,26],[7,27],[8,28],[9,29],[10,30],[11,31],[12,32],[13,33],[14,34],[15,35],[16,36],[17,37],[18,38],[19,39] - ] - } - } - ], - "metricConfig": [ - { - "name": "cpu_load", - "scope": "node", - "unit": "load", - "timestep": 60, - "aggregation": null, - "peak": 40, - "normal": 20, - "caution": 15, - "alert": 10, - "measurement": "data" - }, - { - "name": "mem_used", - "scope": "node", - "unit": "GB", - "timestep": 60, - "aggregation": null, - "peak": 64, - "normal": 20, - "caution": 40, - "alert": 55, - "measurement": "data" - }, - { - "name": "flops_any", - "scope": "node", - "unit": "GF/s", - "timestep": 60, - "aggregation": "sum", - "peak": 704, - "normal": 100, - "caution": 20, - "alert": 2, - "measurement": "data" - }, - { - "name": "flops_sp", - "scope": "node", - "unit": "GF/s", - "timestep": 60, - "aggregation": "sum", - "peak": 704, - "normal": 100, 
- "caution": 20, - "alert": 2, - "measurement": "data" - }, - { - "name": "flops_dp", - "scope": "node", - "unit": "GF/s", - "timestep": 60, - "aggregation": "sum", - "peak": 350, - "normal": 50, - "caution": 10, - "alert": 2, - "measurement": "data" - }, - { - "name": "mem_bw", - "scope": "node", - "unit": "GB/s", - "timestep": 60, - "aggregation": "sum", - "peak": 80, - "normal": 30, - "caution": 10, - "alert": 5, - "measurement": "data" - }, - { - "name": "ipc", - "scope": "node", - "unit": "IPC", - "timestep": 60, - "aggregation": "avg", - "peak": 4, - "normal": 2, - "caution": 1, - "alert": 0.5, - "measurement": "data" - }, - { - "name": "clock", - "scope": "node", - "unit": "MHz", - "timestep": 60, - "aggregation": "avg", - "peak": 3000, - "normal": 2200, - "caution": 1800, - "alert": 1200, - "measurement": "data" - }, - { - "name": "rapl_power", - "scope": "node", - "unit": "W", - "timestep": 60, - "aggregation": "sum", - "peak": 160, - "normal": 120, - "caution": 45, - "alert": 10, - "measurement": "data" - }, - { - "name": "ib_bw", - "scope": "node", - "unit": "GB/s", - "timestep": 60, - "aggregation": null, - "peak": 6, - "normal": 2, - "caution": 1, - "alert": 0.5, - "measurement": "data" - }, - { - "name": "lustre_bw", - "scope": "node", - "unit": "GB/s", - "timestep": 60, - "aggregation": null, - "peak": 4, - "normal": 2, - "caution": 1, - "alert": 0.5, - "measurement": "data" - } - ] -} diff --git a/fau-systems/job-archive/cluster-fritz.json b/fau-systems/job-archive/cluster-fritz.json deleted file mode 100644 index 9437a8d..0000000 --- a/fau-systems/job-archive/cluster-fritz.json +++ /dev/null @@ -1,540 +0,0 @@ -{ - "name": "fritz", - "metricConfig": [ - { - "name": "cpu_load", - "unit": { - "base": "" - }, - "scope": "node", - "aggregation": "avg", - "timestep": 60, - "peak": 72, - "normal": 72, - "caution": 36, - "alert": 20, - "subClusters": [ - { - "name": "spr1tb", - "peak": 104, - "normal": 104, - "caution": 52, - "alert": 20 - }, - { - "name": 
"spr2tb", - "peak": 104, - "normal": 104, - "caution": 52, - "alert": 20 - } - ] - }, - { - "name": "cpu_user", - "unit": { - "base": "" - }, - "scope": "hwthread", - "aggregation": "avg", - "timestep": 60, - "peak": 100, - "normal": 50, - "caution": 20, - "alert": 10 - }, - { - "name": "mem_used", - "unit": { - "base": "B", - "prefix": "G" - }, - "scope": "node", - "aggregation": "sum", - "timestep": 60, - "peak": 256, - "normal": 128, - "caution": 200, - "alert": 240, - "subClusters": [ - { - "name": "spr1tb", - "peak": 1024, - "normal": 512, - "caution": 900, - "alert": 1000 - }, - { - "name": "spr2tb", - "peak": 2048, - "normal": 1024, - "caution": 1800, - "alert": 2000 - } - ] - }, - { - "name": "flops_any", - "unit": { - "base": "Flops/s", - "prefix": "G" - }, - "scope": "hwthread", - "aggregation": "sum", - "timestep": 60, - "peak": 5600, - "normal": 1000, - "caution": 200, - "alert": 50, - "subClusters": [ - { - "name": "spr1tb", - "peak": 6656, - "normal": 1500, - "caution": 400, - "alert": 50, - "remove": true - }, - { - "name": "spr2tb", - "peak": 6656, - "normal": 1500, - "caution": 400, - "alert": 50, - "remove": true - } - ] - }, - { - "name": "flops_sp", - "unit": { - "base": "Flops/s", - "prefix": "G" - }, - "scope": "hwthread", - "aggregation": "sum", - "timestep": 60, - "peak": 5600, - "normal": 1000, - "caution": 200, - "alert": 50, - "subClusters": [ - { - "name": "spr1tb", - "peak": 6656, - "normal": 1500, - "caution": 400, - "alert": 50, - "remove": true - }, - { - "name": "spr2tb", - "peak": 6656, - "normal": 1500, - "caution": 400, - "alert": 50, - "remove": true - } - ] - }, - { - "name": "flops_dp", - "unit": { - "base": "Flops/s", - "prefix": "G" - }, - "scope": "hwthread", - "aggregation": "sum", - "timestep": 60, - "peak": 2300, - "normal": 500, - "caution": 100, - "alert": 50, - "subClusters": [ - { - "name": "spr1tb", - "peak": 3300, - "normal": 750, - "caution": 200, - "alert": 50, - "remove": true - }, - { - "name": "spr2tb", - 
"peak": 3300, - "normal": 750, - "caution": 200, - "alert": 50, - "remove": true - } - ] - }, - { - "name": "mem_bw", - "unit": { - "base": "B/s", - "prefix": "G" - }, - "scope": "socket", - "aggregation": "sum", - "timestep": 60, - "peak": 350, - "normal": 100, - "caution": 50, - "alert": 10, - "subClusters": [ - { - "name": "spr1tb", - "peak": 549, - "normal": 200, - "caution": 100, - "alert": 20, - "remove": true - }, - { - "name": "spr2tb", - "peak": 520, - "normal": 200, - "caution": 100, - "alert": 20, - "remove": true - } - ] - }, - { - "name": "clock", - "unit": { - "base": "Hz", - "prefix": "M" - }, - "scope": "hwthread", - "aggregation": "avg", - "timestep": 60, - "peak": 3000, - "normal": 2400, - "caution": 1800, - "alert": 1200, - "subClusters": [ - { - "name": "spr1tb", - "peak": 549, - "normal": 2000, - "caution": 1600, - "alert": 1200, - "remove": true - }, - { - "name": "spr2tb", - "peak": 520, - "normal": 2000, - "caution": 1600, - "alert": 1200, - "remove": true - } - ] - }, - { - "name": "cpu_power", - "unit": { - "base": "W" - }, - "scope": "socket", - "aggregation": "sum", - "timestep": 60, - "peak": 500, - "normal": 250, - "caution": 100, - "alert": 50 - }, - { - "name": "mem_power", - "unit": { - "base": "W" - }, - "scope": "socket", - "aggregation": "sum", - "timestep": 60, - "peak": 100, - "normal": 50, - "caution": 20, - "alert": 10 - }, - { - "name": "ipc", - "unit": { - "base": "IPC" - }, - "scope": "hwthread", - "aggregation": "avg", - "timestep": 60, - "peak": 4, - "normal": 2, - "caution": 1, - "alert": 0.5 - }, - { - "name": "vectorization_ratio", - "unit": { - "base": "" - }, - "scope": "hwthread", - "aggregation": "avg", - "timestep": 60, - "peak": 100, - "normal": 60, - "caution": 40, - "alert": 10 - }, - { - "name": "ib_recv", - "unit": { - "base": "B/s" - }, - "scope": "node", - "aggregation": "sum", - "timestep": 60, - "peak": 1250000, - "normal": 6000000, - "caution": 200, - "alert": 1 - }, - { - "name": "ib_xmit", - "unit": { 
- "base": "B/s" - }, - "scope": "node", - "aggregation": "sum", - "timestep": 60, - "peak": 1250000, - "normal": 6000000, - "caution": 200, - "alert": 1 - }, - { - "name": "ib_recv_pkts", - "unit": { - "base": "packets/s" - }, - "scope": "node", - "aggregation": "sum", - "timestep": 60, - "peak": 6, - "normal": 4, - "caution": 2, - "alert": 1 - }, - { - "name": "ib_xmit_pkts", - "unit": { - "base": "packets/s" - }, - "scope": "node", - "aggregation": "sum", - "timestep": 60, - "peak": 6, - "normal": 4, - "caution": 2, - "alert": 1 - }, - { - "name": "nfs4_read", - "unit": { - "base": "B/s", - "prefix": "M" - }, - "scope": "node", - "aggregation": "sum", - "timestep": 60, - "peak": 6, - "normal": 4, - "caution": 2, - "alert": 1 - }, - { - "name": "nfs4_write", - "unit": { - "base": "B/s", - "prefix": "M" - }, - "scope": "node", - "aggregation": "sum", - "timestep": 60, - "peak": 6, - "normal": 4, - "caution": 2, - "alert": 1 - }, - { - "name": "nfs4_total", - "unit": { - "base": "B/s", - "prefix": "M" - }, - "scope": "node", - "aggregation": "sum", - "timestep": 60, - "peak": 6, - "normal": 4, - "caution": 2, - "alert": 1 - } - ], - "subClusters": [ - { - "name": "main", - "nodes": "f[0101-0188,0201-0288,0301-0388,0401-0488,0501-0588,0601-0688,0701-0788,0801-0888,0901-0988,1001-1088,1101-1156,1201-1256]", - "processorType": "Intel Icelake", - "socketsPerNode": 2, - "coresPerSocket": 36, - "threadsPerCore": 1, - "flopRateScalar": { - "unit": { - "base": "F/s", - "prefix": "G" - }, - "value": 432 - }, - "flopRateSimd": { - "unit": { - "base": "F/s", - "prefix": "G" - }, - "value": 9216 - }, - "memoryBandwidth": { - "unit": { - "base": "B/s", - "prefix": "G" - }, - "value": 350 - }, - "topology": { - "node": [ - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 
68, 69, 70, 71 - ], - "socket": [ - [ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35 ], - [ 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71 ] - ], - "memoryDomain": [ - [ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17 ], - [ 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35 ], - [ 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53 ], - [ 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71 ] - ], - "core": [ - [ 0 ], [ 1 ], [ 2 ], [ 3 ], [ 4 ], [ 5 ], [ 6 ], [ 7 ], [ 8 ], [ 9 ], [ 10 ], [ 11 ], [ 12 ], [ 13 ], [ 14 ], [ 15 ], [ 16 ], [ 17 ], [ 18 ], [ 19 ], [ 20 ], [ 21 ], [ 22 ], [ 23 ], [ 24 ], [ 25 ], [ 26 ], [ 27 ], [ 28 ], [ 29 ], [ 30 ], [ 31 ], [ 32 ], [ 33 ], [ 34 ], [ 35 ], [ 36 ], [ 37 ], [ 38 ], [ 39 ], [ 40 ], [ 41 ], [ 42 ], [ 43 ], [ 44 ], [ 45 ], [ 46 ], [ 47 ], [ 48 ], [ 49 ], [ 50 ], [ 51 ], [ 52 ], [ 53 ], [ 54 ], [ 55 ], [ 56 ], [ 57 ], [ 58 ], [ 59 ], [ 60 ], [ 61 ], [ 62 ], [ 63 ], [ 64 ], [ 65 ], [ 66 ], [ 67 ], [ 68 ], [ 69 ], [ 70 ], [ 71 ] - ] - } - }, - { - "name": "spr1tb", - "processorType": "Intel(R) Xeon(R) Platinum 8470", - "socketsPerNode": 2, - "coresPerSocket": 52, - "threadsPerCore": 1, - "flopRateScalar": { - "unit": { - "base": "F/s", - "prefix": "G" - }, - "value": 695 - }, - "flopRateSimd": { - "unit": { - "base": "F/s", - "prefix": "G" - }, - "value": 9216 - }, - "memoryBandwidth": { - "unit": { - "base": "B/s", - "prefix": "G" - }, - "value": 549 - }, - "nodes": "f[2157-2188,2257-2288]", - "topology": { - "node": - [ - 
0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,5152,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,101,102,103 - ], - "socket": - [ - [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51], - [52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,101,102,103] - ], - "memoryDomain": [ - [0,1,2,3,4,5,6,7,8,9,10,11,12], - [13,14,15,16,17,18,19,20,21,22,23,24,25], - [26,27,28,29,30,31,32,33,34,35,36,37,38], - [39,40,41,42,43,44,45,46,47,48,49,50,51], - [52,53,54,55,56,57,58,59,60,61,62,63,64], - [65,66,67,68,69,70,71,72,73,74,75,76,77], - [78,79,80,81,82,83,84,85,86,87,88,89,90], - [91,92,93,94,95,96,97,98,99,100,101,102,103] - ], - "core": [ - [0],[1],[2],[3],[4],[5],[6],[7],[8],[9],[10],[11],[12],[13],[14],[15],[16],[17],[18],[19],[20],[21],[22],[23],[24],[25],[26],[27],[28],[29],[30],[31],[32],[33],[34],[35],[36],[37],[38],[39],[40],[41],[42],[43],[44],[45],[46],[47],[48],[49],[50],[51],[52],[53],[54],[55],[56],[57],[58],[59],[60],[61],[62],[63],[64],[65],[66],[67],[68],[69],[70],[71],[72],[73],[74],[75],[76],[77],[78],[79],[80],[81],[82],[83],[84],[85],[86],[87],[88],[89],[90],[91],[92],[93],[94],[95],[96],[97],[98],[99],[100],[101],[102],[103] - ] - } - }, - { - "name": "spr2tb", - "processorType": "Intel(R) Xeon(R) Platinum 8470", - "socketsPerNode": 2, - "coresPerSocket": 52, - "threadsPerCore": 1, - "flopRateScalar": { - "unit": { - "base": "F/s", - "prefix": "G" - }, - "value": 695 - }, - "flopRateSimd": { - "unit": { - "base": "F/s", - "prefix": "G" - }, - "value": 9216 - }, - "memoryBandwidth": { - "unit": { - "base": "B/s", - "prefix": "G" - }, - "value": 515 - }, - "nodes": 
"f[2181-2188,2281-2288]", - "topology": { - "node": [ - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103 - ], - "socket": [ - [ - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51 - ], - [ - 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103 - ] - ], - "memoryDomain": [ - [ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12 ], - [ 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25 ], - [ 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38 ], - [ 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51 ], - [ 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64 ], - [ 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77 ], - [ 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90 ], - [ 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103 ] - ], - "core": [ - [ 0 ], [ 1 ], [ 2 ], [ 3 ], [ 4 ], [ 5 ], [ 6 ], [ 7 ], [ 8 ], [ 9 ], [ 10 ], [ 11 ], [ 12 ], [ 13 ], [ 14 ], [ 15 ], [ 16 ], [ 17 ], [ 18 ], [ 19 ], [ 20 ], [ 21 ], [ 22 ], [ 23 ], [ 24 ], [ 25 ], [ 26 ], [ 27 ], [ 28 ], [ 29 ], [ 30 ], [ 31 ], [ 32 ], [ 33 ], [ 34 ], [ 35 ], [ 36 ], [ 37 ], [ 38 ], [ 39 ], [ 40 ], [ 41 ], [ 42 ], [ 43 ], [ 44 ], [ 45 ], [ 46 ], [ 47 ], [ 48 ], [ 49 ], [ 50 ], [ 51 ], [ 52 ], [ 53 ], [ 54 ], [ 55 ], [ 56 ], [ 57 ], [ 58 ], [ 59 ], [ 60 ], [ 61 ], [ 62 ], [ 63 ], [ 64 ], [ 65 ], [ 66 ], [ 67 ], [ 68 ], [ 69 ], [ 70 ], [ 71 ], [ 72 ], [ 73 ], [ 
74 ], [ 75 ], [ 76 ], [ 77 ], [ 78 ], [ 79 ], [ 80 ], [ 81 ], [ 82 ], [ 83 ], [ 84 ], [ 85 ], [ 86 ], [ 87 ], [ 88 ], [ 89 ], [ 90 ], [ 91 ], [ 92 ], [ 93 ], [ 94 ], [ 95 ], [ 96 ], [ 97 ], [ 98 ], [ 99 ], [ 100 ], [ 101 ], [ 102 ], [ 103 ] - ] - } - } - ] -} \ No newline at end of file diff --git a/fau-systems/job-archive/cluster-meggie.json b/fau-systems/job-archive/cluster-meggie.json deleted file mode 100644 index bfaeb99..0000000 --- a/fau-systems/job-archive/cluster-meggie.json +++ /dev/null @@ -1,243 +0,0 @@ -{ - "name": "meggie", - "metricConfig": [ - { - "name": "cpu_load", - "unit": { - "base": "load" - }, - "scope": "node", - "aggregation": "avg", - "timestep": 60, - "peak": 40, - "normal": 20, - "caution": 15, - "alert": 10 - }, - { - "name": "mem_used", - "unit": { - "base": "B", - "prefix": "G" - }, - "scope": "node", - "aggregation": "sum", - "timestep": 60, - "peak": 64, - "normal": 20, - "caution": 40, - "alert": 55 - }, - { - "name": "flops_any", - "unit": { - "base": "Flops/s", - "prefix": "G" - }, - "scope": "node", - "aggregation": "sum", - "timestep": 60, - "peak": 1536, - "normal": 200, - "caution": 40, - "alert": 4 - }, - { - "name": "flops_sp", - "unit": { - "base": "Flops/s", - "prefix": "G" - }, - "scope": "node", - "aggregation": "sum", - "timestep": 60, - "peak": 1536, - "normal": 100, - "caution": 20, - "alert": 2 - }, - { - "name": "flops_dp", - "unit": { - "base": "Flops/s", - "prefix": "G" - }, - "scope": "node", - "aggregation": "sum", - "timestep": 60, - "peak": 768, - "normal": 50, - "caution": 10, - "alert": 2 - }, - { - "name": "mem_bw", - "unit": { - "base": "B/s", - "prefix": "G" - }, - "scope": "node", - "aggregation": "sum", - "timestep": 60, - "peak": 140, - "normal": 70, - "caution": 20, - "alert": 5 - }, - { - "name": "clock", - "unit": { - "base": "Hz", - "prefix": "M" - }, - "scope": "node", - "aggregation": "avg", - "timestep": 60, - "peak": 3000, - "normal": 2400, - "caution": 1800, - "alert": 1200 - }, - { - "name": 
"cpu_power", - "unit": { - "base": "W" - }, - "scope": "socket", - "aggregation": "sum", - "timestep": 60, - "peak": 80, - "normal": 30, - "caution": 10, - "alert": 5 - }, - { - "name": "mem_power", - "unit": { - "base": "W" - }, - "scope": "socket", - "aggregation": "sum", - "timestep": 60, - "peak": 100, - "normal": 50, - "caution": 20, - "alert": 10 - }, - { - "name": "ipc", - "unit": { - "base": "IPC" - }, - "scope": "node", - "aggregation": "avg", - "timestep": 60, - "peak": 4, - "normal": 2, - "caution": 1, - "alert": 0.5 - }, - { - "name": "vectorization_ratio", - "unit": { - "base": "" - }, - "scope": "hwthread", - "aggregation": "avg", - "timestep": 60, - "peak": 100, - "normal": 60, - "caution": 40, - "alert": 10 - }, - { - "name": "nfs4_read", - "unit": { - "base": "B/s", - "prefix": "M" - }, - "scope": "node", - "aggregation": "sum", - "timestep": 60, - "peak": 6, - "normal": 4, - "caution": 2, - "alert": 1 - }, - { - "name": "nfs4_write", - "unit": { - "base": "B/s", - "prefix": "M" - }, - "scope": "node", - "aggregation": "sum", - "timestep": 60, - "peak": 6, - "normal": 4, - "caution": 2, - "alert": 1 - }, - { - "name": "nfs4_total", - "unit": { - "base": "B/s", - "prefix": "M" - }, - "scope": "node", - "aggregation": "sum", - "timestep": 60, - "peak": 6, - "normal": 4, - "caution": 2, - "alert": 1 - } - ], - "subClusters": [ - { - "name": "main", - "nodes": "m[0101-0164,0201-0264,0301-0364,0401-0464,0601-0676,0701-0776,0801-0872,0901-0972,1001-1072,1101-1172]", - "processorType": "Intel Broadwell", - "socketsPerNode": 2, - "coresPerSocket": 10, - "threadsPerCore": 1, - "flopRateScalar": { - "unit": { - "base": "F/s", - "prefix": "G" - }, - "value": 96 - }, - "flopRateSimd": { - "unit": { - "base": "F/s", - "prefix": "G" - }, - "value": 1536 - }, - "memoryBandwidth": { - "unit": { - "base": "B/s", - "prefix": "G" - }, - "value": 140 - }, - "topology": { - "node": [ - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19 - ], - 
"socket": [ - [ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 ], - [ 10, 11, 12, 13, 14, 15, 16, 17, 18, 19 ] - ], - "memoryDomain": [ - [ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 ], - [ 10, 11, 12, 13, 14, 15, 16, 17, 18, 19 ] - ], - "core": [ - [ 0 ], [ 1 ], [ 2 ], [ 3 ], [ 4 ], [ 5 ], [ 6 ], [ 7 ], [ 8 ], [ 9 ], [ 10 ], [ 11 ], [ 12 ], [ 13 ], [ 14 ], [ 15 ], [ 16 ], [ 17 ], [ 18 ], [ 19 ] - ] - } - } - ] -} \ No newline at end of file diff --git a/fau-systems/job-archive/cluster-tinyfat.json b/fau-systems/job-archive/cluster-tinyfat.json deleted file mode 100644 index 2d22b11..0000000 --- a/fau-systems/job-archive/cluster-tinyfat.json +++ /dev/null @@ -1,466 +0,0 @@ -{ - "name": "tinyfat", - "metricConfig": [ - { - "name": "cpu_load", - "unit": { - "base": "" - }, - "scope": "node", - "aggregation": "avg", - "timestep": 60, - "peak": 128, - "normal": 64, - "caution": 60, - "alert": 20, - "subClusters": [ - { - "name": "broadwell_256gb", - "peak": 24, - "normal": 24, - "caution": 12, - "alert": 10 - }, - { - "name": "broadwell_512gb", - "peak": 56, - "normal": 56, - "caution": 28, - "alert": 20 - } - ] - }, - { - "name": "cpu_user", - "unit": { - "base": "" - }, - "scope": "hwthread", - "aggregation": "avg", - "timestep": 60, - "peak": 100, - "normal": 50, - "caution": 20, - "alert": 10 - }, - { - "name": "mem_used", - "unit": { - "base": "B", - "prefix": "G" - }, - "scope": "node", - "aggregation": "sum", - "timestep": 60, - "peak": 512, - "normal": 256, - "caution": 480, - "alert": 500, - "subClusters": [ - { - "name": "broadwell_256gb", - "peak": 256, - "normal": 128, - "caution": 200, - "alert": 240 - }, - { - "name": "broadwell_512gb", - "peak": 512, - "normal": 256, - "caution": 480, - "alert": 500 - } - ] - }, - { - "name": "flops_any", - "unit": { - "base": "Flops/s", - "prefix": "G" - }, - "scope": "hwthread", - "aggregation": "sum", - "timestep": 60, - "peak": 2560, - "normal": 800, - "caution": 100, - "alert": 20, - "subClusters": [ - { - "name": "broadwell_256gb", - "peak": 653, 
- "normal": 200, - "caution": 50, - "alert": 10 - }, - { - "name": "broadwell_512gb", - "peak": 1075, - "normal": 500, - "caution": 80, - "alert": 20 - } - ] - }, - { - "name": "flops_sp", - "unit": { - "base": "Flops/s", - "prefix": "G" - }, - "scope": "hwthread", - "aggregation": "sum", - "timestep": 60, - "peak": 5600, - "normal": 1000, - "caution": 200, - "alert": 50, - "subClusters": [ - { - "name": "broadwell_256gb", - "peak": 653, - "normal": 200, - "caution": 50, - "alert": 10 - }, - { - "name": "broadwell_512gb", - "peak": 1075, - "normal": 500, - "caution": 80, - "alert": 20 - } - ] - }, - { - "name": "flops_dp", - "unit": { - "base": "Flops/s", - "prefix": "G" - }, - "scope": "hwthread", - "aggregation": "sum", - "timestep": 60, - "peak": 2300, - "normal": 500, - "caution": 100, - "alert": 50, - "subClusters": [ - { - "name": "broadwell_256gb", - "peak": 325, - "normal": 100, - "caution": 25, - "alert": 10 - }, - { - "name": "broadwell_512gb", - "peak": 500, - "normal": 250, - "caution": 40, - "alert": 10 - } - ] - }, - { - "name": "mem_bw", - "unit": { - "base": "B/s", - "prefix": "G" - }, - "scope": "socket", - "aggregation": "sum", - "timestep": 60, - "peak": 278, - "normal": 100, - "caution": 50, - "alert": 10, - "subClusters": [ - { - "name": "broadwell_256gb", - "peak": 114, - "normal": 50, - "caution": 25, - "alert": 10 - }, - { - "name": "broadwell_512gb", - "peak": 128, - "normal": 50, - "caution": 25, - "alert": 10 - } - ] - }, - { - "name": "clock", - "unit": { - "base": "Hz", - "prefix": "M" - }, - "scope": "hwthread", - "aggregation": "avg", - "timestep": 60, - "peak": 3000, - "normal": 2500, - "caution": 1800, - "alert": 1200, - "subClusters": [ - { - "name": "broadwell_256gb", - "peak": 3800, - "normal": 3400, - "caution": 2000, - "alert": 1200 - }, - { - "name": "broadwell_512gb", - "peak": 3000, - "normal": 2400, - "caution": 1800, - "alert": 1200 - } - ] - }, - { - "name": "cpu_power", - "unit": { - "base": "W" - }, - "scope": "socket", 
- "aggregation": "sum", - "timestep": 60, - "peak": 500, - "normal": 250, - "caution": 100, - "alert": 50 - }, - { - "name": "mem_power", - "unit": { - "base": "W" - }, - "scope": "socket", - "aggregation": "sum", - "timestep": 60, - "peak": 100, - "normal": 50, - "caution": 20, - "alert": 10 - }, - { - "name": "ipc", - "unit": { - "base": "IPC" - }, - "scope": "hwthread", - "aggregation": "avg", - "timestep": 60, - "peak": 4, - "normal": 2, - "caution": 1, - "alert": 0.5 - }, - { - "name": "vectorization_ratio", - "unit": { - "base": "" - }, - "scope": "hwthread", - "aggregation": "avg", - "timestep": 60, - "peak": 100, - "normal": 60, - "caution": 40, - "alert": 10 - }, - { - "name": "nfs4_read", - "unit": { - "base": "B/s", - "prefix": "M" - }, - "scope": "node", - "aggregation": "sum", - "timestep": 60, - "peak": 6, - "normal": 4, - "caution": 2, - "alert": 1 - }, - { - "name": "nfs4_write", - "unit": { - "base": "B/s", - "prefix": "M" - }, - "scope": "node", - "aggregation": "sum", - "timestep": 60, - "peak": 6, - "normal": 4, - "caution": 2, - "alert": 1 - }, - { - "name": "nfs4_total", - "unit": { - "base": "B/s", - "prefix": "M" - }, - "scope": "node", - "aggregation": "sum", - "timestep": 60, - "peak": 6, - "normal": 4, - "caution": 2, - "alert": 1 - } - ], - "subClusters": [ - { - "name": "broadwell_512gb", - "nodes": "tf040,tf041,tf042", - "processorType": "Intel(R) Xeon(R) CPU E5-2680 v4 @ 2.40GHz", - "socketsPerNode": 2, - "coresPerSocket": 14, - "threadsPerCore": 2, - "flopRateScalar": { - "unit": { - "base": "F/s", - "prefix": "G" - }, - "value": 158 - }, - "flopRateSimd": { - "unit": { - "base": "F/s", - "prefix": "G" - }, - "value": 1236 - }, - "memoryBandwidth": { - "unit": { - "base": "B/s", - "prefix": "G" - }, - "value": 128 - }, - "topology": { - "node": [ - 0, 28, 1, 29, 2, 30, 3, 31, 4, 32, 5, 33, 6, 34, 7, 35, 8, 36, 9, 37, 10, 38, 11, 39, 12, 40, 13, 41, 14, 42, 15, 43, 16, 44, 17, 45, 18, 46, 19, 47, 20, 48, 21, 49, 22, 50, 23, 51, 24, 
52, 25, 53, 26, 54, 27, 55 - ], - "socket": [ - [ 0, 28, 1, 29, 2, 30, 3, 31, 4, 32, 5, 33, 6, 34, 7, 35, 8, 36, 9, 37, 10, 38, 11, 39, 12, 40, 13, 41 ], - [ 14, 42, 15, 43, 16, 44, 17, 45, 18, 46, 19, 47, 20, 48, 21, 49, 22, 50, 23, 51, 24, 52, 25, 53, 26, 54, 27, 55 ] - ], - "memoryDomain": [ - [ 0, 28, 1, 29, 2, 30, 3, 31, 4, 32, 5, 33, 6, 34 ], - [ 7, 35, 8, 36, 9, 37, 10, 38, 11, 39, 12, 40, 13, 41 ], - [ 14, 42, 15, 43, 16, 44, 17, 45, 18, 46, 19, 47, 20, 48 ], - [ 21, 49, 22, 50, 23, 51, 24, 52, 25, 53, 26, 54, 27, 55 ] - ], - "core": [ - [ 0, 28 ], [ 1, 29 ], [ 2, 30 ], [ 3, 31 ], [ 4, 32 ], [ 5, 33 ], [ 6, 34 ], [ 7, 35 ], [ 8, 36 ], [ 9, 37 ], [ 10, 38 ], [ 11, 39 ], [ 12, 40 ], [ 13, 41 ], [ 14, 42 ], [ 15, 43 ], [ 16, 44 ], [ 17, 45 ], [ 18, 46 ], [ 19, 47 ], [ 20, 48 ], [ 21, 49 ], [ 22, 50 ], [ 23, 51 ], [ 24, 52 ], [ 25, 53 ], [ 26, 54 ], [ 27, 55 ] - ] - } - }, - { - "name": "broadwell_256gb", - "nodes": "tf0[50-57]", - "processorType": "Intel(R) Xeon(R) CPU E5-2643 v4 @ 3.40GHz", - "socketsPerNode": 2, - "coresPerSocket": 6, - "threadsPerCore": 2, - "flopRateScalar": { - "unit": { - "base": "F/s", - "prefix": "G" - }, - "value": 85 - }, - "flopRateSimd": { - "unit": { - "base": "F/s", - "prefix": "G" - }, - "value": 672 - }, - "memoryBandwidth": { - "unit": { - "base": "B/s", - "prefix": "G" - }, - "value": 114 - }, - "topology": { - "node": [ - 0, 12, 1, 13, 2, 14, 3, 15, 4, 16, 5, 17, 6, 18, 7, 19, 8, 20, 9, 21, 10, 22, 11, 23 - ], - "socket": [ - [ 0, 12, 1, 13, 2, 14, 3, 15, 4, 16, 5, 17 ], - [ 6, 18, 7, 19, 8, 20, 9, 21, 10, 22, 11, 23 ] - ], - "memoryDomain": [ - [ 0, 12, 1, 13, 2, 14, 3, 15, 4, 16, 5, 17 ], - [ 6, 18, 7, 19, 8, 20, 9, 21, 10, 22, 11, 23 ] - ], - "core": [ - [ 0, 12 ], [ 1, 13 ], [ 2, 14 ], [ 3, 15 ], [ 4, 16 ], [ 5, 17 ], [ 6, 18 ], [ 7, 19 ], [ 8, 20 ], [ 9, 21 ], [ 10, 22 ], [ 11, 23 ] - ] - } - }, - { - "name": "rome_512gb", - "nodes": "tf0[60-95]", - "processorType": "AMD EPYC 7502 32-Core Processor ", - 
"socketsPerNode": 2, - "coresPerSocket": 32, - "threadsPerCore": 2, - "flopRateScalar": { - "unit": { - "base": "F/s", - "prefix": "G" - }, - "value": 553 - }, - "flopRateSimd": { - "unit": { - "base": "F/s", - "prefix": "G" - }, - "value": 3198 - }, - "memoryBandwidth": { - "unit": { - "base": "B/s", - "prefix": "G" - }, - "value": 278 - }, - "topology": { - "node": [ - 0, 64, 1, 65, 2, 66, 3, 67, 4, 68, 5, 69, 6, 70, 7, 71, 8, 72, 9, 73, 10, 74, 11, 75, 12, 76, 13, 77, 14, 78, 15, 79, 16, 80, 17, 81, 18, 82, 19, 83, 20, 84, 21, 85, 22, 86, 23, 87, 24, 88, 25, 89, 26, 90, 27, 91, 28, 92, 29, 93, 30, 94, 31, 95, 32, 96, 33, 97, 34, 98, 35, 99, 36, 100, 37, 101, 38, 102, 39, 103, 40, 104, 41, 105, 42, 106, 43, 107, 44, 108, 45, 109, 46, 110, 47, 111, 48, 112, 49, 113, 50, 114, 51, 115, 52, 116, 53, 117, 54, 118, 55, 119, 56, 120, 57, 121, 58, 122, 59, 123, 60, 124, 61, 125, 62, 126, 63, 127 - ], - "socket": [ - [ 0, 64, 1, 65, 2, 66, 3, 67, 4, 68, 5, 69, 6, 70, 7, 71, 8, 72, 9, 73, 10, 74, 11, 75, 12, 76, 13, 77, 14, 78, 15, 79, 16, 80, 17, 81, 18, 82, 19, 83, 20, 84, 21, 85, 22, 86, 23, 87, 24, 88, 25, 89, 26, 90, 27, 91, 28, 92, 29, 93, 30, 94, 31, 95 ], - [ 32, 96, 33, 97, 34, 98, 35, 99, 36, 100, 37, 101, 38, 102, 39, 103, 40, 104, 41, 105, 42, 106, 43, 107, 44, 108, 45, 109, 46, 110, 47, 111, 48, 112, 49, 113, 50, 114, 51, 115, 52, 116, 53, 117, 54, 118, 55, 119, 56, 120, 57, 121, 58, 122, 59, 123, 60, 124, 61, 125, 62, 126, 63, 127 ] - ], - "memoryDomain": [ - [ 0, 64, 1, 65, 2, 66, 3, 67, 4, 68, 5, 69, 6, 70, 7, 71 ], - [ 8, 72, 9, 73, 10, 74, 11, 75, 12, 76, 13, 77, 14, 78, 15, 79 ], - [ 16, 80, 17, 81, 18, 82, 19, 83, 20, 84, 21, 85, 22, 86, 23, 87 ], - [ 24, 88, 25, 89, 26, 90, 27, 91, 28, 92, 29, 93, 30, 94, 31, 95 ], - [ 32, 96, 33, 97, 34, 98, 35, 99, 36, 100, 37, 101, 38, 102, 39, 103 ], - [ 40, 104, 41, 105, 42, 106, 43, 107, 44, 108, 45, 109, 46, 110, 47, 111 ], - [ 48, 112, 49, 113, 50, 114, 51, 115, 52, 116, 53, 117, 54, 118, 55, 119 ], - [ 56, 
120, 57, 121, 58, 122, 59, 123, 60, 124, 61, 125, 62, 126, 63, 127 ] - ], - "core": [ - [ 0, 64 ], [ 1, 65 ], [ 2, 66 ], [ 3, 67 ], [ 4, 68 ], [ 5, 69 ], [ 6, 70 ], [ 7, 71 ], [ 8, 72 ], [ 9, 73 ], [ 10, 74 ], [ 11, 75 ], [ 12, 76 ], [ 13, 77 ], [ 14, 78 ], [ 15, 79 ], [ 16, 80 ], [ 17, 81 ], [ 18, 82 ], [ 19, 83 ], [ 20, 84 ], [ 21, 85 ], [ 22, 86 ], [ 23, 87 ], [ 24, 88 ], [ 25, 89 ], [ 26, 90 ], [ 27, 91 ], [ 28, 92 ], [ 29, 93 ], [ 30, 94 ], [ 31, 95 ], [ 32, 96 ], [ 33, 97 ], [ 34, 98 ], [ 35, 99 ], [ 36, 100 ], [ 37, 101 ], [ 38, 102 ], [ 39, 103 ], [ 40, 104 ], [ 41, 105 ], [ 42, 106 ], [ 43, 107 ], [ 44, 108 ], [ 45, 109 ], [ 46, 110 ], [ 47, 111 ], [ 48, 112 ], [ 49, 113 ], [ 50, 114 ], [ 51, 115 ], [ 52, 116 ], [ 53, 117 ], [ 54, 118 ], [ 55, 119 ], [ 56, 120 ], [ 57, 121 ], [ 58, 122 ], [ 59, 123 ], [ 60, 124 ], [ 61, 125 ], [ 62, 126 ], [ 63, 127 ] - ] - } - } - ] -} \ No newline at end of file diff --git a/fau-systems/job-archive/cluster-tinygpu.json b/fau-systems/job-archive/cluster-tinygpu.json deleted file mode 100644 index 534b4fe..0000000 --- a/fau-systems/job-archive/cluster-tinygpu.json +++ /dev/null @@ -1,621 +0,0 @@ -{ - "name": "tinygpu", - "metricConfig": [ - { - "name": "cpu_load", - "unit": { - "base": "" - }, - "scope": "node", - "aggregation": "avg", - "timestep": 60, - "peak": 64, - "normal": 32, - "caution": 30, - "alert": 20, - "subClusters": [ - { - "name": "rtx2080", - "peak": 32, - "normal": 16, - "caution": 14, - "alert": 6 - }, - { - "name": "a100", - "peak": 128, - "normal": 128, - "caution": 60, - "alert": 20 - }, - { - "name": "v100", - "peak": 32, - "normal": 16, - "caution": 14, - "alert": 6 - } - ] - }, - { - "name": "cpu_user", - "unit": { - "base": "" - }, - "scope": "hwthread", - "aggregation": "avg", - "timestep": 60, - "peak": 100, - "normal": 50, - "caution": 20, - "alert": 10 - }, - { - "name": "mem_used", - "unit": { - "base": "B", - "prefix": "G" - }, - "scope": "node", - "aggregation": "sum", - "timestep": 60, - 
"peak": 96, - "normal": 45, - "caution": 80, - "alert": 90, - "subClusters": [ - { - "name": "rtx3080", - "peak": 384, - "normal": 192, - "caution": 320, - "alert": 375 - }, - { - "name": "a100", - "peak": 512, - "normal": 256, - "caution": 480, - "alert": 500 - } - ] - }, - { - "name": "flops_any", - "unit": { - "base": "Flops/s", - "prefix": "G" - }, - "scope": "hwthread", - "aggregation": "sum", - "timestep": 60, - "peak": 5600, - "normal": 1000, - "caution": 200, - "alert": 50 - }, - { - "name": "flops_sp", - "unit": { - "base": "Flops/s", - "prefix": "G" - }, - "scope": "hwthread", - "aggregation": "sum", - "timestep": 60, - "peak": 5600, - "normal": 1000, - "caution": 200, - "alert": 50 - }, - { - "name": "flops_dp", - "unit": { - "base": "Flops/s", - "prefix": "G" - }, - "scope": "hwthread", - "aggregation": "sum", - "timestep": 60, - "peak": 2300, - "normal": 500, - "caution": 100, - "alert": 50 - }, - { - "name": "mem_bw", - "unit": { - "base": "B/s", - "prefix": "G" - }, - "scope": "socket", - "aggregation": "sum", - "timestep": 60, - "peak": 350, - "normal": 100, - "caution": 50, - "alert": 10 - }, - { - "name": "clock", - "unit": { - "base": "Hz", - "prefix": "M" - }, - "scope": "hwthread", - "aggregation": "avg", - "timestep": 60, - "peak": 3000, - "normal": 2400, - "caution": 1800, - "alert": 1200 - }, - { - "name": "cpu_power", - "unit": { - "base": "W" - }, - "scope": "socket", - "aggregation": "sum", - "timestep": 60, - "peak": 500, - "normal": 250, - "caution": 100, - "alert": 50 - }, - { - "name": "mem_power", - "unit": { - "base": "W" - }, - "scope": "socket", - "aggregation": "sum", - "timestep": 60, - "peak": 100, - "normal": 50, - "caution": 20, - "alert": 10 - }, - { - "name": "ipc", - "unit": { - "base": "IPC" - }, - "scope": "hwthread", - "aggregation": "avg", - "timestep": 60, - "peak": 4, - "normal": 2, - "caution": 1, - "alert": 0.5 - }, - { - "name": "vectorization_ratio", - "unit": { - "base": "" - }, - "scope": "hwthread", - 
"aggregation": "avg", - "timestep": 60, - "peak": 100, - "normal": 60, - "caution": 40, - "alert": 10 - }, - { - "name": "acc_utilization", - "unit": { - "base": "" - }, - "scope": "accelerator", - "aggregation": "avg", - "timestep": 60, - "peak": 100, - "normal": 80, - "caution": 50, - "alert": 20 - }, - { - "name": "acc_mem_used", - "unit": { - "base": "B", - "prefix": "G" - }, - "scope": "accelerator", - "aggregation": "sum", - "timestep": 60, - "peak": 40, - "normal": 20, - "caution": 10, - "alert": 5 - }, - { - "name": "acc_power", - "unit": { - "base": "W" - }, - "scope": "accelerator", - "aggregation": "sum", - "timestep": 60, - "peak": 400, - "normal": 200, - "caution": 50, - "alert": 20 - }, - { - "name": "nv_mem_util", - "unit": { - "base": "" - }, - "scope": "accelerator", - "aggregation": "avg", - "timestep": 60, - "peak": 100, - "normal": 80, - "caution": 20, - "alert": 10 - }, - { - "name": "nv_temp", - "unit": { - "base": "B", - "prefix": "G" - }, - "scope": "accelerator", - "aggregation": "sum", - "timestep": 60, - "peak": 40, - "normal": 20, - "caution": 5, - "alert": 2 - }, - { - "name": "nv_sm_clock", - "unit": { - "base": "Hz", - "prefix": "M" - }, - "scope": "accelerator", - "aggregation": "sum", - "timestep": 60, - "peak": 1400, - "normal": 1200, - "caution": 100, - "alert": 50 - }, - { - "name": "nfs4_read", - "unit": { - "base": "B/s", - "prefix": "M" - }, - "scope": "node", - "aggregation": "sum", - "timestep": 60, - "peak": 6, - "normal": 4, - "caution": 2, - "alert": 1 - }, - { - "name": "nfs4_write", - "unit": { - "base": "B/s", - "prefix": "M" - }, - "scope": "node", - "aggregation": "sum", - "timestep": 60, - "peak": 6, - "normal": 4, - "caution": 2, - "alert": 1 - }, - { - "name": "nfs4_total", - "unit": { - "base": "B/s", - "prefix": "M" - }, - "scope": "node", - "aggregation": "sum", - "timestep": 60, - "peak": 6, - "normal": 4, - "caution": 2, - "alert": 1 - } - ], - "subClusters": [ - { - "name": "rtx3080", - "nodes": 
"tg0[80-86]", - "processorType": "Intel(R) Xeon(R) Gold 6226R CPU @ 2.90GHz", - "socketsPerNode": 2, - "coresPerSocket": 16, - "threadsPerCore": 2, - "flopRateScalar": { - "unit": { - "base": "F/s", - "prefix": "G" - }, - "value": 111 - }, - "flopRateSimd": { - "unit": { - "base": "F/s", - "prefix": "G" - }, - "value": 787 - }, - "memoryBandwidth": { - "unit": { - "base": "B/s", - "prefix": "G" - }, - "value": 229 - }, - "topology": { - "node": [ - 0, 32, 1, 33, 2, 34, 3, 35, 4, 36, 5, 37, 6, 38, 7, 39, 8, 40, 9, 41, 10, 42, 11, 43, 12, 44, 13, 45, 14, 46, 15, 47, 16, 48, 17, 49, 18, 50, 19, 51, 20, 52, 21, 53, 22, 54, 23, 55, 24, 56, 25, 57, 26, 58, 27, 59, 28, 60, 29, 61, 30, 62, 31, 63 - ], - "socket": [ - [ 0, 32, 1, 33, 2, 34, 3, 35, 4, 36, 5, 37, 6, 38, 7, 39, 8, 40, 9, 41, 10, 42, 11, 43, 12, 44, 13, 45, 14, 46, 15, 47 ], - [ 16, 48, 17, 49, 18, 50, 19, 51, 20, 52, 21, 53, 22, 54, 23, 55, 24, 56, 25, 57, 26, 58, 27, 59, 28, 60, 29, 61, 30, 62, 31, 63 ] - ], - "memoryDomain": [ - [ 0, 32, 1, 33, 2, 34, 3, 35, 4, 36, 5, 37, 6, 38, 7, 39, 8, 40, 9, 41, 10, 42, 11, 43, 12, 44, 13, 45, 14, 46, 15, 47 ], - [ 16, 48, 17, 49, 18, 50, 19, 51, 20, 52, 21, 53, 22, 54, 23, 55, 24, 56, 25, 57, 26, 58, 27, 59, 28, 60, 29, 61, 30, 62, 31, 63 ] - ], - "core": [ - [ 0, 32 ], [ 1, 33 ], [ 2, 34 ], [ 3, 35 ], [ 4, 36 ], [ 5, 37 ], [ 6, 38 ], [ 7, 39 ], [ 8, 40 ], [ 9, 41 ], [ 10, 42 ], [ 11, 43 ], [ 12, 44 ], [ 13, 45 ], [ 14, 46 ], [ 15, 47 ], [ 16, 48 ], [ 17, 49 ], [ 18, 50 ], [ 19, 51 ], [ 20, 52 ], [ 21, 53 ], [ 22, 54 ], [ 23, 55 ], [ 24, 56 ], [ 25, 57 ], [ 26, 58 ], [ 27, 59 ], [ 28, 60 ], [ 29, 61 ], [ 30, 62 ], [ 31, 63 ] - ], - "accelerators": [ - { - "id": "00000000:1a:00.0", - "type": "Nvidia GPU", - "model": "NVIDIA GeForce RTX 3080" - }, - { - "id": "00000000:1b:00.0", - "type": "Nvidia GPU", - "model": "NVIDIA GeForce RTX 3080" - }, - { - "id": "00000000:3d:00.0", - "type": "Nvidia GPU", - "model": "NVIDIA GeForce RTX 3080" - }, - { - "id": "00000000:3e:00.0", 
- "type": "Nvidia GPU", - "model": "NVIDIA GeForce RTX 3080" - }, - { - "id": "00000000:b1:00.0", - "type": "Nvidia GPU", - "model": "NVIDIA GeForce RTX 3080" - }, - { - "id": "00000000:b2:00.0", - "type": "Nvidia GPU", - "model": "NVIDIA GeForce RTX 3080" - }, - { - "id": "00000000:da:00.0", - "type": "Nvidia GPU", - "model": "NVIDIA GeForce RTX 3080" - }, - { - "id": "00000000:db:00.0", - "type": "Nvidia GPU", - "model": "NVIDIA GeForce RTX 3080" - } - ] - } - }, - { - "name": "rtx2080", - "nodes": "tg0[60-69],tg06a,tg06b", - "processorType": "Intel(R) Xeon(R) Gold 6134 CPU @ 3.20GHz", - "socketsPerNode": 2, - "coresPerSocket": 8, - "threadsPerCore": 2, - "flopRateScalar": { - "unit": { - "base": "F/s", - "prefix": "G" - }, - "value": 47 - }, - "flopRateSimd": { - "unit": { - "base": "F/s", - "prefix": "G" - }, - "value": 326 - }, - "memoryBandwidth": { - "unit": { - "base": "B/s", - "prefix": "G" - }, - "value": 137 - }, - "topology": { - "node": [ - 0, 19, 1, 17, 2, 18, 3, 16, 4, 20, 5, 21, 6, 22, 7, 23, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31 - ], - "socket": [ - [ 0, 19, 1, 17, 2, 18, 3, 16, 4, 20, 5, 21, 6, 22, 7, 23 ], - [ 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31 ] - ], - "memoryDomain": [ - [ 0, 19, 1, 17, 2, 18, 3, 16, 4, 20, 5, 21, 6, 22, 7, 23 ], - [ 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31 ] - ], - "core": [ - [ 0, 19 ], [ 1, 17 ], [ 2, 18 ], [ 3, 16 ], [ 4, 20 ], [ 5, 21 ], [ 6, 22 ], [ 7, 23 ], [ 8, 24 ], [ 9, 25 ], [ 10, 26 ], [ 11, 27 ], [ 12, 28 ], [ 13, 29 ], [ 14, 30 ], [ 15, 31 ] - ], - "accelerators": [ - { - "id": "00000000:18:00.0", - "type": "Nvidia GPU", - "model": "NVIDIA GeForce RTX 2080 Ti" - }, - { - "id": "00000000:3b:00.0", - "type": "Nvidia GPU", - "model": "NVIDIA GeForce RTX 2080 Ti" - }, - { - "id": "00000000:86:00.0", - "type": "Nvidia GPU", - "model": "NVIDIA GeForce RTX 2080 Ti" - }, - { - "id": "00000000:af:00.0", - "type": "Nvidia GPU", - "model": "NVIDIA GeForce 
RTX 2080 Ti" - } - ] - } - }, - { - "name": "a100", - "nodes": "tg0[90-97]", - "processorType": "AMD EPYC 7662 64-Core Processor", - "socketsPerNode": 2, - "coresPerSocket": 64, - "threadsPerCore": 1, - "flopRateScalar": { - "unit": { - "base": "F/s", - "prefix": "G" - }, - "value": 987 - }, - "flopRateSimd": { - "unit": { - "base": "F/s", - "prefix": "G" - }, - "value": 5660 - }, - "memoryBandwidth": { - "unit": { - "base": "B/s", - "prefix": "G" - }, - "value": 306 - }, - "topology": { - "node": [ - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127 - ], - "socket": [ - [ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63 ], - [ 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127 ] - ], - "memoryDomain": [ - [ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 ], - [ 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63 ], - [ 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 
93, 94, 95 ], - [ 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127 ] - ], - "core": [ - [ 0 ], [ 1 ], [ 2 ], [ 3 ], [ 4 ], [ 5 ], [ 6 ], [ 7 ], [ 8 ], [ 9 ], [ 10 ], [ 11 ], [ 12 ], [ 13 ], [ 14 ], [ 15 ], [ 16 ], [ 17 ], [ 18 ], [ 19 ], [ 20 ], [ 21 ], [ 22 ], [ 23 ], [ 24 ], [ 25 ], [ 26 ], [ 27 ], [ 28 ], [ 29 ], [ 30 ], [ 31 ], [ 32 ], [ 33 ], [ 34 ], [ 35 ], [ 36 ], [ 37 ], [ 38 ], [ 39 ], [ 40 ], [ 41 ], [ 42 ], [ 43 ], [ 44 ], [ 45 ], [ 46 ], [ 47 ], [ 48 ], [ 49 ], [ 50 ], [ 51 ], [ 52 ], [ 53 ], [ 54 ], [ 55 ], [ 56 ], [ 57 ], [ 58 ], [ 59 ], [ 60 ], [ 61 ], [ 62 ], [ 63 ], [ 64 ], [ 65 ], [ 66 ], [ 67 ], [ 68 ], [ 69 ], [ 70 ], [ 71 ], [ 72 ], [ 73 ], [ 74 ], [ 75 ], [ 76 ], [ 77 ], [ 78 ], [ 79 ], [ 80 ], [ 81 ], [ 82 ], [ 83 ], [ 84 ], [ 85 ], [ 86 ], [ 87 ], [ 88 ], [ 89 ], [ 90 ], [ 91 ], [ 92 ], [ 93 ], [ 94 ], [ 95 ], [ 96 ], [ 97 ], [ 98 ], [ 99 ], [ 100 ], [ 101 ], [ 102 ], [ 103 ], [ 104 ], [ 105 ], [ 106 ], [ 107 ], [ 108 ], [ 109 ], [ 110 ], [ 111 ], [ 112 ], [ 113 ], [ 114 ], [ 115 ], [ 116 ], [ 117 ], [ 118 ], [ 119 ], [ 120 ], [ 121 ], [ 122 ], [ 123 ], [ 124 ], [ 125 ], [ 126 ], [ 127 ] - ], - "accelerators": [ - { - "id": "00000000:01:00.0", - "type": "Nvidia GPU", - "model": "NVIDIA A100-SXM4-40GB" - }, - { - "id": "00000000:41:00.0", - "type": "Nvidia GPU", - "model": "NVIDIA A100-SXM4-40GB" - }, - { - "id": "00000000:81:00.0", - "type": "Nvidia GPU", - "model": "NVIDIA A100-SXM4-40GB" - }, - { - "id": "00000000:c1:00.0", - "type": "Nvidia GPU", - "model": "NVIDIA A100-SXM4-40GB" - } - ] - } - }, - { - "name": "v100", - "nodes": "tg0[71-74]", - "processorType": "Intel(R) Xeon(R) Gold 6134 CPU @ 3.20GHz", - "socketsPerNode": 2, - "coresPerSocket": 8, - "threadsPerCore": 2, - "flopRateScalar": { - "unit": { - "base": "F/s", - "prefix": "G" - }, - "value": 59 - }, - "flopRateSimd": { - "unit": { - "base": "F/s", - "prefix": "G" - }, - 
"value": 430 - }, - "memoryBandwidth": { - "unit": { - "base": "B/s", - "prefix": "G" - }, - "value": 177 - }, - "topology": { - "node": [ - 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31 - ], - "socket": [ - [ 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23 ], - [ 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31 ] - ], - "memoryDomain": [ - [ 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23 ], - [ 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31 ] - ], - "core": [ - [ 0, 16 ], [ 1, 17 ], [ 2, 18 ], [ 3, 19 ], [ 4, 20 ], [ 5, 21 ], [ 6, 22 ], [ 7, 23 ], [ 8, 24 ], [ 9, 25 ], [ 10, 26 ], [ 11, 27 ], [ 12, 28 ], [ 13, 29 ], [ 14, 30 ], [ 15, 31 ] - ], - "accelerators": [ - { - "id": "00000000:18:00.0", - "type": "", - "model": "Tesla V100-PCIE-32GB" - }, - { - "id": "00000000:3b:00.0", - "type": "", - "model": "Tesla V100-PCIE-32GB" - }, - { - "id": "00000000:86:00.0", - "type": "", - "model": "Tesla V100-PCIE-32GB" - }, - { - "id": "00000000:af:00.0", - "type": "", - "model": "Tesla V100-PCIE-32GB" - } - ] - } - } - ] -} \ No newline at end of file diff --git a/fau-systems/job-archive/cluster-woody.json b/fau-systems/job-archive/cluster-woody.json deleted file mode 100644 index 50a4045..0000000 --- a/fau-systems/job-archive/cluster-woody.json +++ /dev/null @@ -1,415 +0,0 @@ -{ - "name": "woody", - "metricConfig": [ - { - "name": "cpu_load", - "unit": { - "base": "" - }, - "scope": "node", - "aggregation": "avg", - "timestep": 60, - "peak": 4, - "normal": 4, - "caution": 4, - "alert": 1, - "subClusters": [ - { - "name": "icelake", - "peak": 32, - "normal": 32, - "caution": 30, - "alert": 10 - } - ] - }, - { - "name": "cpu_user", - "unit": { - "base": "" - }, - "scope": "hwthread", - "aggregation": "avg", - "timestep": 60, - "peak": 100, - "normal": 50, - "caution": 20, - "alert": 10 - }, - { - "name": "ipc", - "unit": { - "base": "IPC" - }, - "scope": "node", - 
"aggregation": "avg", - "timestep": 60, - "peak": 4, - "normal": 2, - "caution": 1, - "alert": 0.25 - }, - { - "name": "mem_used", - "unit": { - "base": "B", - "prefix": "G" - }, - "scope": "node", - "aggregation": "sum", - "timestep": 60, - "peak": 32, - "normal": 16, - "caution": 28, - "alert": 30, - "subClusters": [ - { - "name": "icelake", - "peak": 256, - "normal": 128, - "caution": 200, - "alert": 240 - } - ] - }, - { - "name": "flops_any", - "unit": { - "base": "Flops/s", - "prefix": "G" - }, - "scope": "hwthread", - "aggregation": "sum", - "timestep": 60, - "peak": 112, - "normal": 50, - "caution": 20, - "alert": 10, - "subClusters": [ - { - "name": "icelake", - "peak": 2970, - "normal": 1000, - "caution": 100, - "alert": 50 - } - ] - }, - { - "name": "flops_dp", - "unit": { - "base": "Flops/s", - "prefix": "G" - }, - "scope": "hwthread", - "aggregation": "sum", - "timestep": 60, - "peak": 56, - "normal": 30, - "caution": 15, - "alert": 5, - "subClusters": [ - { - "name": "icelake", - "peak": 1450, - "normal": 700, - "caution": 100, - "alert": 50 - } - ] - }, - { - "name": "flops_sp", - "unit": { - "base": "Flops/s", - "prefix": "G" - }, - "scope": "hwthread", - "aggregation": "sum", - "timestep": 60, - "peak": 112, - "normal": 50, - "caution": 20, - "alert": 10, - "subClusters": [ - { - "name": "icelake", - "peak": 2970, - "normal": 1000, - "caution": 100, - "alert": 50 - } - ] - }, - { - "name": "mem_bw", - "unit": { - "base": "B/s", - "prefix": "G" - }, - "scope": "socket", - "aggregation": "sum", - "timestep": 60, - "peak": 24, - "normal": 10, - "caution": 5, - "alert": 2, - "subClusters": [ - { - "name": "icelake", - "peak": 350, - "normal": 100, - "caution": 50, - "alert": 20 - } - ] - }, - { - "name": "clock", - "unit": { - "base": "Hz", - "prefix": "M" - }, - "scope": "hwthread", - "aggregation": "avg", - "timestep": 60, - "peak": 3000, - "normal": 2000, - "caution": 1500, - "alert": 1200 - }, - { - "name": "vectorization_ratio", - "unit": { - 
"base": "" - }, - "scope": "hwthread", - "aggregation": "avg", - "timestep": 60, - "peak": 100, - "normal": 60, - "caution": 40, - "alert": 10 - }, - { - "name": "nfs4_read", - "unit": { - "base": "B/s", - "prefix": "M" - }, - "scope": "node", - "aggregation": "sum", - "timestep": 60, - "peak": 6, - "normal": 4, - "caution": 2, - "alert": 1 - }, - { - "name": "nfs4_write", - "unit": { - "base": "B/s", - "prefix": "M" - }, - "scope": "node", - "aggregation": "sum", - "timestep": 60, - "peak": 6, - "normal": 4, - "caution": 2, - "alert": 1 - }, - { - "name": "nfs4_total", - "unit": { - "base": "B/s", - "prefix": "M" - }, - "scope": "node", - "aggregation": "sum", - "timestep": 60, - "peak": 6, - "normal": 4, - "caution": 2, - "alert": 1 - } - ], - "subClusters": [ - { - "name": "haswell", - "nodes": "w11[27-45,49-63,69-72]", - "processorType": "Intel Xeon E3-1240 v3", - "socketsPerNode": 1, - "coresPerSocket": 4, - "threadsPerCore": 1, - "flopRateScalar": { - "unit": { - "base": "F/s", - "prefix": "G" - }, - "value": 14 - }, - "flopRateSimd": { - "unit": { - "base": "F/s", - "prefix": "G" - }, - "value": 112 - }, - "memoryBandwidth": { - "unit": { - "base": "B/s", - "prefix": "G" - }, - "value": 24 - }, - "topology": { - "node": [ - 0, 1, 2, 3 - ], - "socket": [ - [ 0, 1, 2, 3 ] - ], - "memoryDomain": [ - [ 0, 1, 2, 3 ] - ], - "core": [ - [ 0 ], [ 1 ], [ 2 ], [ 3 ] - ] - } - }, - { - "name": "skylake", - "nodes": "w12[01-08],w13[01-31,33-56]", - "processorType": "Intel Xeon E3-1240 v5 ", - "socketsPerNode": 1, - "coresPerSocket": 4, - "threadsPerCore": 1, - "flopRateScalar": { - "unit": { - "base": "F/s", - "prefix": "G" - }, - "value": 14 - }, - "flopRateSimd": { - "unit": { - "base": "F/s", - "prefix": "G" - }, - "value": 112 - }, - "memoryBandwidth": { - "unit": { - "base": "B/s", - "prefix": "G" - }, - "value": 24 - }, - "topology": { - "node": [ - 0, 1, 2, 3 - ], - "socket": [ - [ 0, 1, 2, 3 ] - ], - "memoryDomain": [ - [ 0, 1, 2, 3 ] - ], - "core": [ - [ 0 ], [ 
1 ], [ 2 ], [ 3 ] - ] - } - }, - { - "name": "kabylake", - "nodes": "w14[01-56],w15[01-05,07-56]", - "processorType": "Intel Xeon E3-1240 v6", - "socketsPerNode": 1, - "coresPerSocket": 4, - "threadsPerCore": 1, - "flopRateScalar": { - "unit": { - "base": "F/s", - "prefix": "G" - }, - "value": 14 - }, - "flopRateSimd": { - "unit": { - "base": "F/s", - "prefix": "G" - }, - "value": 112 - }, - "memoryBandwidth": { - "unit": { - "base": "B/s", - "prefix": "G" - }, - "value": 24 - }, - "topology": { - "node": [ - 0, 1, 2, 3 - ], - "socket": [ - [ 0, 1, 2, 3 ] - ], - "memoryDomain": [ - [ 0, 1, 2, 3 ] - ], - "core": [ - [ 0 ], [ 1 ], [ 2 ], [ 3 ] - ] - } - }, - { - "name": "icelake", - "nodes": "w22[01-35],w23[01-35]", - "processorType": "Intel Xeon Gold 6326", - "socketsPerNode": 2, - "coresPerSocket": 16, - "threadsPerCore": 1, - "flopRateScalar": { - "unit": { - "base": "F/s", - "prefix": "G" - }, - "value": 432 - }, - "flopRateSimd": { - "unit": { - "base": "F/s", - "prefix": "G" - }, - "value": 9216 - }, - "memoryBandwidth": { - "unit": { - "base": "B/s", - "prefix": "G" - }, - "value": 350 - }, - "topology": { - "node": [ - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71 - ], - "socket": [ - [ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35 ], - [ 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71 ] - ], - "memoryDomain": [ - [ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17 ], - [ 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35 ], - [ 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53 ], - [ 54, 55, 56, 57, 
58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71 ] - ], - "core": [ - [ 0 ], [ 1 ], [ 2 ], [ 3 ], [ 4 ], [ 5 ], [ 6 ], [ 7 ], [ 8 ], [ 9 ], [ 10 ], [ 11 ], [ 12 ], [ 13 ], [ 14 ], [ 15 ], [ 16 ], [ 17 ], [ 18 ], [ 19 ], [ 20 ], [ 21 ], [ 22 ], [ 23 ], [ 24 ], [ 25 ], [ 26 ], [ 27 ], [ 28 ], [ 29 ], [ 30 ], [ 31 ], [ 32 ], [ 33 ], [ 34 ], [ 35 ], [ 36 ], [ 37 ], [ 38 ], [ 39 ], [ 40 ], [ 41 ], [ 42 ], [ 43 ], [ 44 ], [ 45 ], [ 46 ], [ 47 ], [ 48 ], [ 49 ], [ 50 ], [ 51 ], [ 52 ], [ 53 ], [ 54 ], [ 55 ], [ 56 ], [ 57 ], [ 58 ], [ 59 ], [ 60 ], [ 61 ], [ 62 ], [ 63 ], [ 64 ], [ 65 ], [ 66 ], [ 67 ], [ 68 ], [ 69 ], [ 70 ], [ 71 ] - ] - } - } - ] -} \ No newline at end of file diff --git a/fau-systems/README.md b/nhr@fau/README.md similarity index 65% rename from fau-systems/README.md rename to nhr@fau/README.md index ee45a22..c7858f8 100644 --- a/fau-systems/README.md +++ b/nhr@fau/README.md @@ -1,28 +1,34 @@ # ClusterCockpit at NHR@FAU -NHR@FAU provides a production instance of ClusterCockpit for support personel +NHR@FAU provides a production instance of ClusterCockpit for support personnel and users. Authentication is via an LDAP directory as well as via our HPC Portal (homegrown account management platform) using JWT tokens. You can find an overview about all clusters -[here](https://hpc.fau.de/systems-services/documentation-instructions/). +[here](https://doc.nhr.fau.de/clusters/overview/). -Some systems run with exclusive nodes, others have node sharing enabled. -There are CPU systems (Fritz, Meggie, Woody, TinyuFat) as well as GPU enabled -clusters (Alex, TinyGPUs). +Some systems run with job exclusive nodes, others have node sharing enabled. +There are CPU systems (Fritz, Meggie, Woody, TinyFat) as well as GPU accelerated +clusters (Alex, TinyGPU). NHR@FAU uses the following stack: + * `cc-metric-collector` as node agent -* `cc-metric-store` as temporal metric timeseries cache. We use one instance for all clusters. 
+* `cc-metric-store` as temporal metric time series cache. We use one instance +for all clusters. * `cc-backend` -* A homegrown python script running on the management nodes for providing job meta data from Slurm -* Builtin sqlite database for job meta and user data (currently 11GB) +* A homegrown Python script running on the management nodes for providing job +meta data from Slurm +* Builtin sqlite database for job meta and user data (currently 50GB) * Job Archive without retention using compressed data.json files (around 700GB) +Currently all APIs use the regular HTTP protocol, but we plan to switch to NATS +for all communication. We also push the metric data to an InfluxDB instance for debugging purposes. The backend and metric store run on the same dedicated Dell server running Ubuntu Linux: + * Two Intel Xeon(R) Platinum 8352Y with 32 cores each * 512 GB Main memory capacity * A NVMe Raid with two 7TB disks diff --git a/fau-systems/cc-backend/clustercockpit.service b/nhr@fau/cc-backend/clustercockpit.service similarity index 76% rename from fau-systems/cc-backend/clustercockpit.service rename to nhr@fau/cc-backend/clustercockpit.service index cf06726..89bdd42 100644 --- a/fau-systems/cc-backend/clustercockpit.service +++ b/nhr@fau/cc-backend/clustercockpit.service @@ -1,6 +1,6 @@ [Unit] -Description=ClusterCockpit Web Server (Go edition) -Documentation=https://github.com/ClusterCockpit/cc-backend +Description=ClusterCockpit Web Server +Documentation=https://clustercockpit.org Wants=network-online.target After=network-online.target After=mariadb.service mysql.service diff --git a/nhr@fau/cc-backend/config.json b/nhr@fau/cc-backend/config.json new file mode 100644 index 0000000..062a98e --- /dev/null +++ b/nhr@fau/cc-backend/config.json @@ -0,0 +1,241 @@ +{ + "addr": "0.0.0.0:443", + "stop-jobs-exceeding-walltime": 288000, + "short-running-jobs-duration": 300, + "ldap": { + "url": "ldaps://hpcldap.rrze.uni-erlangen.de", + "user_base": 
"ou=people,ou=hpc,dc=rrze,dc=uni-erlangen,dc=de", + "search_dn": "cn=hpcmonitoring,ou=roadm,ou=profile,ou=hpc,dc=rrze,dc=uni-erlangen,dc=de", + "user_bind": "uid={username},ou=people,ou=hpc,dc=rrze,dc=uni-erlangen,dc=de", + "user_filter": "(&(objectclass=posixAccount))", + "sync_interval": "24h" + }, + "jwts": { + "syncUserOnLogin": true, + "updateUserOnLogin": true, + "trustedIssuer": "https://portal.hpc.fau.de/", + "validateUser": false, + "max-age": "168h" + }, + "https-cert-file": "/etc/letsencrypt/live/monitoring.nhr.fau.de/fullchain.pem", + "https-key-file": "/etc/letsencrypt/live/monitoring.nhr.fau.de/privkey.pem", + "user": "clustercockpit", + "group": "clustercockpit", + "archive": { + "kind": "file", + "path": "./var/job-archive", + "compression": 7, + "retention": { + "policy": "none" + } + }, + "enable-resampling": { + "trigger": 30, + "resolutions": [ + 600, + 300, + 120, + 60 + ] + }, + "emission-constant": 317, + "ui-defaults": { + "analysis_view_histogramMetrics": [ + "flops_any", + "mem_bw", + "mem_used" + ], + "analysis_view_scatterPlotMetrics": [ + [ + "flops_any", + "mem_bw" + ], + [ + "flops_any", + "cpu_load" + ], + [ + "cpu_load", + "mem_bw" + ] + ], + "job_view_nodestats_selectedMetrics": [ + "flops_any", + "mem_bw", + "mem_used" + ], + "job_view_polarPlotMetrics": [ + "flops_any", + "mem_bw", + "mem_used" + ], + "job_view_selectedMetrics": [ + "flops_any", + "mem_bw", + "mem_used" + ], + "job_view_showFootprint": true, + "job_list_usePaging": false, + "plot_general_colorBackground": true, + "plot_general_colorscheme": [ + "#00bfff", + "#0000ff", + "#ff00ff", + "#ff0000", + "#ff8000", + "#ffff00", + "#80ff00" + ], + "plot_general_lineWidth": 3, + "plot_list_jobsPerPage": 10, + "plot_list_selectedMetrics": [ + "cpu_load", + "mem_used", + "flops_any", + "mem_bw" + ], + "plot_view_plotsPerRow": 3, + "plot_view_showPolarplot": true, + "plot_view_showRoofline": true, + "plot_view_showStatTable": true, + "system_view_selectedMetric": "cpu_load", + 
"analysis_view_selectedTopEntity": "user", + "analysis_view_selectedTopCategory": "totalWalltime", + "status_view_selectedTopUserCategory": "totalJobs", + "status_view_selectedTopProjectCategory": "totalJobs" + }, + "clusters": [ + { + "name": "fritz", + "metricDataRepository": { + "kind": "cc-metric-store", + "url": "http://localhost:8082", + "token": "-" + }, + "filterRanges": { + "numNodes": { + "from": 1, + "to": 64 + }, + "duration": { + "from": 0, + "to": 86400 + }, + "startTime": { + "from": "2022-01-01T00:00:00Z", + "to": null + } + } + }, + { + "name": "alex", + "metricDataRepository": { + "kind": "cc-metric-store", + "url": "http://localhost:8082", + "token": "-" + }, + "filterRanges": { + "numNodes": { + "from": 1, + "to": 64 + }, + "duration": { + "from": 0, + "to": 86400 + }, + "startTime": { + "from": "2022-01-01T00:00:00Z", + "to": null + } + } + }, + { + "name": "woody", + "metricDataRepository": { + "kind": "cc-metric-store", + "url": "http://localhost:8082", + "token": "-" + }, + "filterRanges": { + "numNodes": { + "from": 1, + "to": 1 + }, + "duration": { + "from": 0, + "to": 172800 + }, + "startTime": { + "from": "2020-01-01T00:00:00Z", + "to": null + } + } + }, + { + "name": "tinyfat", + "metricDataRepository": { + "kind": "cc-metric-store", + "url": "http://localhost:8082", + "token": "-" + }, + "filterRanges": { + "numNodes": { + "from": 1, + "to": 1 + }, + "duration": { + "from": 0, + "to": 172800 + }, + "startTime": { + "from": "2020-01-01T00:00:00Z", + "to": null + } + } + }, + { + "name": "tinygpu", + "metricDataRepository": { + "kind": "cc-metric-store", + "url": "http://localhost:8082", + "token": "-" + }, + "filterRanges": { + "numNodes": { + "from": 1, + "to": 1 + }, + "duration": { + "from": 0, + "to": 172800 + }, + "startTime": { + "from": "2020-01-01T00:00:00Z", + "to": null + } + } + }, + { + "name": "meggie", + "metricDataRepository": { + "kind": "cc-metric-store", + "url": "http://localhost:8082", + "token": "-" + }, + 
"filterRanges": { + "numNodes": { + "from": 1, + "to": 64 + }, + "duration": { + "from": 0, + "to": 86400 + }, + "startTime": { + "from": "2018-01-01T00:00:00Z", + "to": null + } + } + } + ] +} diff --git a/fau-systems/cc-metric-collector/alex/collectors.json b/nhr@fau/cc-metric-collector/alex/collectors.json similarity index 100% rename from fau-systems/cc-metric-collector/alex/collectors.json rename to nhr@fau/cc-metric-collector/alex/collectors.json diff --git a/fau-systems/cc-metric-collector/alex/config.json b/nhr@fau/cc-metric-collector/alex/config.json similarity index 100% rename from fau-systems/cc-metric-collector/alex/config.json rename to nhr@fau/cc-metric-collector/alex/config.json diff --git a/fau-systems/cc-metric-collector/alex/receivers.json b/nhr@fau/cc-metric-collector/alex/receivers.json similarity index 100% rename from fau-systems/cc-metric-collector/alex/receivers.json rename to nhr@fau/cc-metric-collector/alex/receivers.json diff --git a/fau-systems/cc-metric-collector/alex/router.json b/nhr@fau/cc-metric-collector/alex/router.json similarity index 100% rename from fau-systems/cc-metric-collector/alex/router.json rename to nhr@fau/cc-metric-collector/alex/router.json diff --git a/fau-systems/cc-metric-collector/alex/sinks.json b/nhr@fau/cc-metric-collector/alex/sinks.json similarity index 100% rename from fau-systems/cc-metric-collector/alex/sinks.json rename to nhr@fau/cc-metric-collector/alex/sinks.json diff --git a/fau-systems/cc-metric-collector/alex/sinks_debug.json b/nhr@fau/cc-metric-collector/alex/sinks_debug.json similarity index 100% rename from fau-systems/cc-metric-collector/alex/sinks_debug.json rename to nhr@fau/cc-metric-collector/alex/sinks_debug.json diff --git a/fau-systems/cc-metric-collector/fritz.spr/collectors.json b/nhr@fau/cc-metric-collector/fritz.spr/collectors.json similarity index 100% rename from fau-systems/cc-metric-collector/fritz.spr/collectors.json rename to 
nhr@fau/cc-metric-collector/fritz.spr/collectors.json diff --git a/fau-systems/cc-metric-collector/fritz.spr/config.json b/nhr@fau/cc-metric-collector/fritz.spr/config.json similarity index 100% rename from fau-systems/cc-metric-collector/fritz.spr/config.json rename to nhr@fau/cc-metric-collector/fritz.spr/config.json diff --git a/fau-systems/cc-metric-collector/fritz.spr/receivers.json b/nhr@fau/cc-metric-collector/fritz.spr/receivers.json similarity index 100% rename from fau-systems/cc-metric-collector/fritz.spr/receivers.json rename to nhr@fau/cc-metric-collector/fritz.spr/receivers.json diff --git a/fau-systems/cc-metric-collector/fritz.spr/router.json b/nhr@fau/cc-metric-collector/fritz.spr/router.json similarity index 100% rename from fau-systems/cc-metric-collector/fritz.spr/router.json rename to nhr@fau/cc-metric-collector/fritz.spr/router.json diff --git a/fau-systems/cc-metric-collector/fritz.spr/sinks.json b/nhr@fau/cc-metric-collector/fritz.spr/sinks.json similarity index 100% rename from fau-systems/cc-metric-collector/fritz.spr/sinks.json rename to nhr@fau/cc-metric-collector/fritz.spr/sinks.json diff --git a/fau-systems/cc-metric-collector/fritz/collectors.json b/nhr@fau/cc-metric-collector/fritz/collectors.json similarity index 100% rename from fau-systems/cc-metric-collector/fritz/collectors.json rename to nhr@fau/cc-metric-collector/fritz/collectors.json diff --git a/fau-systems/cc-metric-collector/fritz/config.json b/nhr@fau/cc-metric-collector/fritz/config.json similarity index 100% rename from fau-systems/cc-metric-collector/fritz/config.json rename to nhr@fau/cc-metric-collector/fritz/config.json diff --git a/fau-systems/cc-metric-collector/fritz/receivers.json b/nhr@fau/cc-metric-collector/fritz/receivers.json similarity index 100% rename from fau-systems/cc-metric-collector/fritz/receivers.json rename to nhr@fau/cc-metric-collector/fritz/receivers.json diff --git a/fau-systems/cc-metric-collector/fritz/router.json 
b/nhr@fau/cc-metric-collector/fritz/router.json similarity index 100% rename from fau-systems/cc-metric-collector/fritz/router.json rename to nhr@fau/cc-metric-collector/fritz/router.json diff --git a/fau-systems/cc-metric-collector/fritz/sinks.json b/nhr@fau/cc-metric-collector/fritz/sinks.json similarity index 100% rename from fau-systems/cc-metric-collector/fritz/sinks.json rename to nhr@fau/cc-metric-collector/fritz/sinks.json diff --git a/fau-systems/cc-metric-collector/fritz/sinks_debug.json b/nhr@fau/cc-metric-collector/fritz/sinks_debug.json similarity index 100% rename from fau-systems/cc-metric-collector/fritz/sinks_debug.json rename to nhr@fau/cc-metric-collector/fritz/sinks_debug.json diff --git a/fau-systems/cc-metric-collector/meggie-ng/.gitkeep b/nhr@fau/cc-metric-collector/meggie-ng/.gitkeep similarity index 100% rename from fau-systems/cc-metric-collector/meggie-ng/.gitkeep rename to nhr@fau/cc-metric-collector/meggie-ng/.gitkeep diff --git a/fau-systems/cc-metric-collector/meggie-ng/collectors.json b/nhr@fau/cc-metric-collector/meggie-ng/collectors.json similarity index 100% rename from fau-systems/cc-metric-collector/meggie-ng/collectors.json rename to nhr@fau/cc-metric-collector/meggie-ng/collectors.json diff --git a/fau-systems/cc-metric-collector/meggie-ng/config.json b/nhr@fau/cc-metric-collector/meggie-ng/config.json similarity index 100% rename from fau-systems/cc-metric-collector/meggie-ng/config.json rename to nhr@fau/cc-metric-collector/meggie-ng/config.json diff --git a/fau-systems/cc-metric-collector/meggie-ng/receivers.json b/nhr@fau/cc-metric-collector/meggie-ng/receivers.json similarity index 100% rename from fau-systems/cc-metric-collector/meggie-ng/receivers.json rename to nhr@fau/cc-metric-collector/meggie-ng/receivers.json diff --git a/fau-systems/cc-metric-collector/meggie-ng/router.json b/nhr@fau/cc-metric-collector/meggie-ng/router.json similarity index 100% rename from fau-systems/cc-metric-collector/meggie-ng/router.json 
rename to nhr@fau/cc-metric-collector/meggie-ng/router.json diff --git a/fau-systems/cc-metric-collector/meggie-ng/sinks.json b/nhr@fau/cc-metric-collector/meggie-ng/sinks.json similarity index 100% rename from fau-systems/cc-metric-collector/meggie-ng/sinks.json rename to nhr@fau/cc-metric-collector/meggie-ng/sinks.json diff --git a/fau-systems/cc-metric-collector/tinyfat/collectors.bdw.json b/nhr@fau/cc-metric-collector/tinyfat/collectors.bdw.json similarity index 100% rename from fau-systems/cc-metric-collector/tinyfat/collectors.bdw.json rename to nhr@fau/cc-metric-collector/tinyfat/collectors.bdw.json diff --git a/fau-systems/cc-metric-collector/tinyfat/collectors.rome.json b/nhr@fau/cc-metric-collector/tinyfat/collectors.rome.json similarity index 100% rename from fau-systems/cc-metric-collector/tinyfat/collectors.rome.json rename to nhr@fau/cc-metric-collector/tinyfat/collectors.rome.json diff --git a/fau-systems/cc-metric-collector/tinyfat/config.json b/nhr@fau/cc-metric-collector/tinyfat/config.json similarity index 100% rename from fau-systems/cc-metric-collector/tinyfat/config.json rename to nhr@fau/cc-metric-collector/tinyfat/config.json diff --git a/fau-systems/cc-metric-collector/tinyfat/receivers.json b/nhr@fau/cc-metric-collector/tinyfat/receivers.json similarity index 100% rename from fau-systems/cc-metric-collector/tinyfat/receivers.json rename to nhr@fau/cc-metric-collector/tinyfat/receivers.json diff --git a/fau-systems/cc-metric-collector/tinyfat/router.json b/nhr@fau/cc-metric-collector/tinyfat/router.json similarity index 100% rename from fau-systems/cc-metric-collector/tinyfat/router.json rename to nhr@fau/cc-metric-collector/tinyfat/router.json diff --git a/fau-systems/cc-metric-collector/tinyfat/sinks.json b/nhr@fau/cc-metric-collector/tinyfat/sinks.json similarity index 100% rename from fau-systems/cc-metric-collector/tinyfat/sinks.json rename to nhr@fau/cc-metric-collector/tinyfat/sinks.json diff --git 
a/fau-systems/cc-metric-collector/tinyfat/sinks_debug.json b/nhr@fau/cc-metric-collector/tinyfat/sinks_debug.json similarity index 100% rename from fau-systems/cc-metric-collector/tinyfat/sinks_debug.json rename to nhr@fau/cc-metric-collector/tinyfat/sinks_debug.json diff --git a/fau-systems/cc-metric-collector/tinygpu/collectors.rome.a100.json b/nhr@fau/cc-metric-collector/tinygpu/collectors.rome.a100.json similarity index 100% rename from fau-systems/cc-metric-collector/tinygpu/collectors.rome.a100.json rename to nhr@fau/cc-metric-collector/tinygpu/collectors.rome.a100.json diff --git a/fau-systems/cc-metric-collector/tinygpu/collectors.skx.2080.json b/nhr@fau/cc-metric-collector/tinygpu/collectors.skx.2080.json similarity index 100% rename from fau-systems/cc-metric-collector/tinygpu/collectors.skx.2080.json rename to nhr@fau/cc-metric-collector/tinygpu/collectors.skx.2080.json diff --git a/fau-systems/cc-metric-collector/tinygpu/collectors.skx.3080.json b/nhr@fau/cc-metric-collector/tinygpu/collectors.skx.3080.json similarity index 100% rename from fau-systems/cc-metric-collector/tinygpu/collectors.skx.3080.json rename to nhr@fau/cc-metric-collector/tinygpu/collectors.skx.3080.json diff --git a/fau-systems/cc-metric-collector/tinygpu/collectors.skx.v100.json b/nhr@fau/cc-metric-collector/tinygpu/collectors.skx.v100.json similarity index 100% rename from fau-systems/cc-metric-collector/tinygpu/collectors.skx.v100.json rename to nhr@fau/cc-metric-collector/tinygpu/collectors.skx.v100.json diff --git a/fau-systems/cc-metric-collector/tinygpu/config.json b/nhr@fau/cc-metric-collector/tinygpu/config.json similarity index 100% rename from fau-systems/cc-metric-collector/tinygpu/config.json rename to nhr@fau/cc-metric-collector/tinygpu/config.json diff --git a/fau-systems/cc-metric-collector/tinygpu/receivers.json b/nhr@fau/cc-metric-collector/tinygpu/receivers.json similarity index 100% rename from fau-systems/cc-metric-collector/tinygpu/receivers.json rename to 
nhr@fau/cc-metric-collector/tinygpu/receivers.json diff --git a/fau-systems/cc-metric-collector/tinygpu/router.json b/nhr@fau/cc-metric-collector/tinygpu/router.json similarity index 100% rename from fau-systems/cc-metric-collector/tinygpu/router.json rename to nhr@fau/cc-metric-collector/tinygpu/router.json diff --git a/fau-systems/cc-metric-collector/tinygpu/sinks.json b/nhr@fau/cc-metric-collector/tinygpu/sinks.json similarity index 100% rename from fau-systems/cc-metric-collector/tinygpu/sinks.json rename to nhr@fau/cc-metric-collector/tinygpu/sinks.json diff --git a/fau-systems/cc-metric-collector/tinygpu/sinks_debug.json b/nhr@fau/cc-metric-collector/tinygpu/sinks_debug.json similarity index 100% rename from fau-systems/cc-metric-collector/tinygpu/sinks_debug.json rename to nhr@fau/cc-metric-collector/tinygpu/sinks_debug.json diff --git a/fau-systems/cc-metric-collector/woody-ng/collectors.icx.json b/nhr@fau/cc-metric-collector/woody-ng/collectors.icx.json similarity index 100% rename from fau-systems/cc-metric-collector/woody-ng/collectors.icx.json rename to nhr@fau/cc-metric-collector/woody-ng/collectors.icx.json diff --git a/fau-systems/cc-metric-collector/woody-ng/collectors.skl.json b/nhr@fau/cc-metric-collector/woody-ng/collectors.skl.json similarity index 100% rename from fau-systems/cc-metric-collector/woody-ng/collectors.skl.json rename to nhr@fau/cc-metric-collector/woody-ng/collectors.skl.json diff --git a/fau-systems/cc-metric-collector/woody-ng/config.json b/nhr@fau/cc-metric-collector/woody-ng/config.json similarity index 100% rename from fau-systems/cc-metric-collector/woody-ng/config.json rename to nhr@fau/cc-metric-collector/woody-ng/config.json diff --git a/fau-systems/cc-metric-collector/woody-ng/receivers.json b/nhr@fau/cc-metric-collector/woody-ng/receivers.json similarity index 100% rename from fau-systems/cc-metric-collector/woody-ng/receivers.json rename to nhr@fau/cc-metric-collector/woody-ng/receivers.json diff --git 
a/fau-systems/cc-metric-collector/woody-ng/router.json b/nhr@fau/cc-metric-collector/woody-ng/router.json similarity index 100% rename from fau-systems/cc-metric-collector/woody-ng/router.json rename to nhr@fau/cc-metric-collector/woody-ng/router.json diff --git a/fau-systems/cc-metric-collector/woody-ng/sinks.json b/nhr@fau/cc-metric-collector/woody-ng/sinks.json similarity index 100% rename from fau-systems/cc-metric-collector/woody-ng/sinks.json rename to nhr@fau/cc-metric-collector/woody-ng/sinks.json diff --git a/fau-systems/cc-metric-store/cc-metric-store.service b/nhr@fau/cc-metric-store/cc-metric-store.service similarity index 100% rename from fau-systems/cc-metric-store/cc-metric-store.service rename to nhr@fau/cc-metric-store/cc-metric-store.service diff --git a/nhr@fau/cc-metric-store/config.json b/nhr@fau/cc-metric-store/config.json new file mode 100644 index 0000000..1915608 --- /dev/null +++ b/nhr@fau/cc-metric-store/config.json @@ -0,0 +1,180 @@ +{ + "metrics": { + "clock": { + "frequency": 60, + "aggregation": "avg" + }, + "cpu_idle": { + "frequency": 60, + "aggregation": "avg" + }, + "cpu_iowait": { + "frequency": 60, + "aggregation": "avg" + }, + "cpu_irq": { + "frequency": 60, + "aggregation": "avg" + }, + "cpu_system": { + "frequency": 60, + "aggregation": "avg" + }, + "cpu_user": { + "frequency": 60, + "aggregation": "avg" + }, + "nv_mem_util": { + "frequency": 60, + "aggregation": "avg" + }, + "nv_temp": { + "frequency": 60, + "aggregation": "avg" + }, + "nv_sm_clock": { + "frequency": 60, + "aggregation": "avg" + }, + "acc_utilization": { + "frequency": 60, + "aggregation": "avg" + }, + "acc_mem_used": { + "frequency": 60, + "aggregation": "sum" + }, + "acc_power": { + "frequency": 60, + "aggregation": "sum" + }, + "flops_any": { + "frequency": 60, + "aggregation": "sum" + }, + "flops_dp": { + "frequency": 60, + "aggregation": "sum" + }, + "flops_sp": { + "frequency": 60, + "aggregation": "sum" + }, + "ib_recv": { + "frequency": 60, + 
"aggregation": "sum" + }, + "ib_xmit": { + "frequency": 60, + "aggregation": "sum" + }, + "ib_recv_pkts": { + "frequency": 60, + "aggregation": "sum" + }, + "ib_xmit_pkts": { + "frequency": 60, + "aggregation": "sum" + }, + "cpu_power": { + "frequency": 60, + "aggregation": "sum" + }, + "core_power": { + "frequency": 60, + "aggregation": "sum" + }, + "mem_power": { + "frequency": 60, + "aggregation": "sum" + }, + "ipc": { + "frequency": 60, + "aggregation": "avg" + }, + "cpu_load": { + "frequency": 60, + "aggregation": null + }, + "lustre_close": { + "frequency": 60, + "aggregation": null + }, + "lustre_open": { + "frequency": 60, + "aggregation": null + }, + "lustre_statfs": { + "frequency": 60, + "aggregation": null + }, + "lustre_read_bytes": { + "frequency": 60, + "aggregation": null + }, + "lustre_write_bytes": { + "frequency": 60, + "aggregation": null + }, + "net_bw": { + "frequency": 60, + "aggregation": null + }, + "file_bw": { + "frequency": 60, + "aggregation": null + }, + "mem_bw": { + "frequency": 60, + "aggregation": "sum" + }, + "mem_cached": { + "frequency": 60, + "aggregation": null + }, + "mem_used": { + "frequency": 60, + "aggregation": null + }, + "net_bytes_in": { + "frequency": 60, + "aggregation": null + }, + "net_bytes_out": { + "frequency": 60, + "aggregation": null + }, + "nfs4_read": { + "frequency": 60, + "aggregation": null + }, + "nfs4_total": { + "frequency": 60, + "aggregation": null + }, + "nfs4_write": { + "frequency": 60, + "aggregation": null + }, + "vectorization_ratio": { + "frequency": 60, + "aggregation": "avg" + } + }, + "checkpoints": { + "interval": "12h", + "directory": "/opt/monitoring/cc-metric-store/fritz/checkpoints", + "restore": "48h" + }, + "archive": { + "interval": "50h", + "directory": "/opt/monitoring/cc-metric-store/fritz/archive" + }, + "http-api": { + "address": "0.0.0.0:8082", + "https-cert-file": null, + "https-key-file": null + }, + "retention-in-memory": "48h", + "jwt-public-key": "-" +} diff --git 
a/nhr@fau/job-archive/cluster-alex.json b/nhr@fau/job-archive/cluster-alex.json new file mode 100644 index 0000000..0356e28 --- /dev/null +++ b/nhr@fau/job-archive/cluster-alex.json @@ -0,0 +1,2809 @@ +{ + "name": "alex", + "metricConfig": [ + { + "name": "cpu_load", + "unit": { + "base": "" + }, + "scope": "node", + "aggregation": "avg", + "footprint": "avg", + "timestep": 60, + "peak": 128, + "normal": 128, + "caution": 10, + "alert": 5 + }, + { + "name": "cpu_user", + "unit": { + "base": "" + }, + "scope": "hwthread", + "aggregation": "avg", + "timestep": 60, + "peak": 100, + "normal": 50, + "caution": 20, + "alert": 10 + }, + { + "name": "mem_used", + "unit": { + "base": "B", + "prefix": "G" + }, + "scope": "node", + "aggregation": "sum", + "footprint": "max", + "timestep": 60, + "peak": 512, + "normal": 128, + "caution": 200, + "alert": 240 + }, + { + "name": "flops_any", + "unit": { + "base": "Flops/s", + "prefix": "G" + }, + "scope": "hwthread", + "aggregation": "sum", + "footprint": "avg", + "timestep": 60, + "peak": 9216, + "normal": 1000, + "caution": 200, + "alert": 50 + }, + { + "name": "net_bytes_in", + "unit": { + "base": "B/s" + }, + "scope": "node", + "aggregation": "sum", + "timestep": 60, + "peak": 125000000, + "normal": 125000000, + "caution": 200, + "alert": 240 + }, + { + "name": "net_bytes_out", + "unit": { + "base": "B/s" + }, + "scope": "node", + "aggregation": "sum", + "timestep": 60, + "peak": 125000000, + "normal": 125000000, + "caution": 200, + "alert": 240 + }, + { + "name": "mem_bw", + "unit": { + "base": "B/s", + "prefix": "G" + }, + "scope": "socket", + "aggregation": "sum", + "footprint": "avg", + "timestep": 60, + "peak": 350, + "normal": 100, + "caution": 50, + "alert": 10 + }, + { + "name": "clock", + "unit": { + "base": "Hz", + "prefix": "M" + }, + "scope": "hwthread", + "aggregation": "avg", + "timestep": 60, + "peak": 3000, + "normal": 2400, + "caution": 1800, + "alert": 1200 + }, + { + "name": "core_power", + "unit": { + 
"base": "W" + }, + "scope": "hwthread", + "aggregation": "sum", + "energy": "power", + "timestep": 60, + "peak": 500, + "normal": 250, + "caution": 100, + "alert": 50 + }, + { + "name": "acc_utilization", + "unit": { + "base": "" + }, + "scope": "accelerator", + "aggregation": "avg", + "footprint": "avg", + "timestep": 60, + "peak": 100, + "normal": 80, + "caution": 50, + "alert": 20 + }, + { + "name": "acc_mem_used", + "unit": { + "base": "B", + "prefix": "M" + }, + "scope": "accelerator", + "aggregation": "sum", + "footprint": "max", + "timestep": 60, + "peak": 320000, + "normal": 160000, + "caution": 80000, + "alert": 40000, + "subClusters": [ + { + "name": "a100m80", + "peak": 640000, + "normal": 320000, + "caution": 160000, + "alert": 80000, + "footprint": "max" + } + ] + }, + { + "name": "acc_power", + "unit": { + "base": "W" + }, + "scope": "accelerator", + "aggregation": "sum", + "energy": "power", + "timestep": 60, + "peak": 3200, + "normal": 1600, + "caution": 400, + "alert": 160 + }, + { + "name": "nv_mem_util", + "unit": { + "base": "" + }, + "scope": "accelerator", + "aggregation": "avg", + "timestep": 60, + "peak": 100, + "normal": 80, + "caution": 20, + "alert": 10 + }, + { + "name": "nv_temp", + "unit": { + "base": "°C" + }, + "scope": "accelerator", + "aggregation": "avg", + "timestep": 60, + "peak": 40, + "normal": 20, + "caution": 5, + "alert": 2 + }, + { + "name": "nv_sm_clock", + "unit": { + "base": "Hz", + "prefix": "M" + }, + "scope": "accelerator", + "aggregation": "avg", + "timestep": 60, + "peak": 1400, + "normal": 1200, + "caution": 100, + "alert": 50 + }, + { + "name": "cpu_power", + "unit": { + "base": "W" + }, + "scope": "socket", + "aggregation": "sum", + "energy": "power", + "timestep": 60, + "peak": 500, + "normal": 250, + "caution": 100, + "alert": 50 + }, + { + "name": "ipc", + "unit": { + "base": "IPC" + }, + "scope": "hwthread", + "aggregation": "avg", + "timestep": 60, + "peak": 4, + "normal": 2, + "caution": 1, + "alert": 0.5 
+ } + ], + "subClusters": [ + { + "name": "a40", + "nodes": "a[0121-0129],a[0221-0229],a[0321-0329],a[0421-0429],a[0521-0522],a[1621-1624],a[1721-1722]", + "processorType": "AMD Milan", + "socketsPerNode": 2, + "coresPerSocket": 64, + "threadsPerCore": 1, + "flopRateScalar": { + "unit": { + "base": "F/s", + "prefix": "G" + }, + "value": 432 + }, + "flopRateSimd": { + "unit": { + "base": "F/s", + "prefix": "G" + }, + "value": 9216 + }, + "memoryBandwidth": { + "unit": { + "base": "B/s", + "prefix": "G" + }, + "value": 400 + }, + "topology": { + "node": [ + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 11, + 12, + 13, + 14, + 15, + 16, + 17, + 18, + 19, + 20, + 21, + 22, + 23, + 24, + 25, + 26, + 27, + 28, + 29, + 30, + 31, + 32, + 33, + 34, + 35, + 36, + 37, + 38, + 39, + 40, + 41, + 42, + 43, + 44, + 45, + 46, + 47, + 48, + 49, + 50, + 51, + 52, + 53, + 54, + 55, + 56, + 57, + 58, + 59, + 60, + 61, + 62, + 63, + 64, + 65, + 66, + 67, + 68, + 69, + 70, + 71, + 72, + 73, + 74, + 75, + 76, + 77, + 78, + 79, + 80, + 81, + 82, + 83, + 84, + 85, + 86, + 87, + 88, + 89, + 90, + 91, + 92, + 93, + 94, + 95, + 96, + 97, + 98, + 99, + 100, + 101, + 102, + 103, + 104, + 105, + 106, + 107, + 108, + 109, + 110, + 111, + 112, + 113, + 114, + 115, + 116, + 117, + 118, + 119, + 120, + 121, + 122, + 123, + 124, + 125, + 126, + 127 + ], + "socket": [ + [ + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 11, + 12, + 13, + 14, + 15, + 16, + 17, + 18, + 19, + 20, + 21, + 22, + 23, + 24, + 25, + 26, + 27, + 28, + 29, + 30, + 31, + 32, + 33, + 34, + 35, + 36, + 37, + 38, + 39, + 40, + 41, + 42, + 43, + 44, + 45, + 46, + 47, + 48, + 49, + 50, + 51, + 52, + 53, + 54, + 55, + 56, + 57, + 58, + 59, + 60, + 61, + 62, + 63 + ], + [ + 64, + 65, + 66, + 67, + 68, + 69, + 70, + 71, + 72, + 73, + 74, + 75, + 76, + 77, + 78, + 79, + 80, + 81, + 82, + 83, + 84, + 85, + 86, + 87, + 88, + 89, + 90, + 91, + 92, + 93, + 94, + 95, + 96, + 97, + 98, + 99, + 100, + 101, + 102, + 103, + 
104, + 105, + 106, + 107, + 108, + 109, + 110, + 111, + 112, + 113, + 114, + 115, + 116, + 117, + 118, + 119, + 120, + 121, + 122, + 123, + 124, + 125, + 126, + 127 + ] + ], + "memoryDomain": [ + [ + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 11, + 12, + 13, + 14, + 15, + 16, + 17, + 18, + 19, + 20, + 21, + 22, + 23, + 24, + 25, + 26, + 27, + 28, + 29, + 30, + 31, + 32, + 33, + 34, + 35, + 36, + 37, + 38, + 39, + 40, + 41, + 42, + 43, + 44, + 45, + 46, + 47, + 48, + 49, + 50, + 51, + 52, + 53, + 54, + 55, + 56, + 57, + 58, + 59, + 60, + 61, + 62, + 63, + 64, + 65, + 66, + 67, + 68, + 69, + 70, + 71, + 72, + 73, + 74, + 75, + 76, + 77, + 78, + 79, + 80, + 81, + 82, + 83, + 84, + 85, + 86, + 87, + 88, + 89, + 90, + 91, + 92, + 93, + 94, + 95, + 96, + 97, + 98, + 99, + 100, + 101, + 102, + 103, + 104, + 105, + 106, + 107, + 108, + 109, + 110, + 111, + 112, + 113, + 114, + 115, + 116, + 117, + 118, + 119, + 120, + 121, + 122, + 123, + 124, + 125, + 126, + 127 + ] + ], + "core": [ + [ + 0 + ], + [ + 1 + ], + [ + 2 + ], + [ + 3 + ], + [ + 4 + ], + [ + 5 + ], + [ + 6 + ], + [ + 7 + ], + [ + 8 + ], + [ + 9 + ], + [ + 10 + ], + [ + 11 + ], + [ + 12 + ], + [ + 13 + ], + [ + 14 + ], + [ + 15 + ], + [ + 16 + ], + [ + 17 + ], + [ + 18 + ], + [ + 19 + ], + [ + 20 + ], + [ + 21 + ], + [ + 22 + ], + [ + 23 + ], + [ + 24 + ], + [ + 25 + ], + [ + 26 + ], + [ + 27 + ], + [ + 28 + ], + [ + 29 + ], + [ + 30 + ], + [ + 31 + ], + [ + 32 + ], + [ + 33 + ], + [ + 34 + ], + [ + 35 + ], + [ + 36 + ], + [ + 37 + ], + [ + 38 + ], + [ + 39 + ], + [ + 40 + ], + [ + 41 + ], + [ + 42 + ], + [ + 43 + ], + [ + 44 + ], + [ + 45 + ], + [ + 46 + ], + [ + 47 + ], + [ + 48 + ], + [ + 49 + ], + [ + 50 + ], + [ + 51 + ], + [ + 52 + ], + [ + 53 + ], + [ + 54 + ], + [ + 55 + ], + [ + 56 + ], + [ + 57 + ], + [ + 58 + ], + [ + 59 + ], + [ + 60 + ], + [ + 61 + ], + [ + 62 + ], + [ + 63 + ], + [ + 64 + ], + [ + 65 + ], + [ + 66 + ], + [ + 67 + ], + [ + 68 + ], + [ + 69 + ], + [ + 70 + ], + [ + 71 + 
], + [ + 72 + ], + [ + 73 + ], + [ + 74 + ], + [ + 75 + ], + [ + 76 + ], + [ + 77 + ], + [ + 78 + ], + [ + 79 + ], + [ + 80 + ], + [ + 81 + ], + [ + 82 + ], + [ + 83 + ], + [ + 84 + ], + [ + 85 + ], + [ + 86 + ], + [ + 87 + ], + [ + 88 + ], + [ + 89 + ], + [ + 90 + ], + [ + 91 + ], + [ + 92 + ], + [ + 93 + ], + [ + 94 + ], + [ + 95 + ], + [ + 96 + ], + [ + 97 + ], + [ + 98 + ], + [ + 99 + ], + [ + 100 + ], + [ + 101 + ], + [ + 102 + ], + [ + 103 + ], + [ + 104 + ], + [ + 105 + ], + [ + 106 + ], + [ + 107 + ], + [ + 108 + ], + [ + 109 + ], + [ + 110 + ], + [ + 111 + ], + [ + 112 + ], + [ + 113 + ], + [ + 114 + ], + [ + 115 + ], + [ + 116 + ], + [ + 117 + ], + [ + 118 + ], + [ + 119 + ], + [ + 120 + ], + [ + 121 + ], + [ + 122 + ], + [ + 123 + ], + [ + 124 + ], + [ + 125 + ], + [ + 126 + ], + [ + 127 + ] + ], + "accelerators": [ + { + "id": "00000000:01:00.0", + "type": "Nvidia GPU", + "model": "A40" + }, + { + "id": "00000000:25:00.0", + "type": "Nvidia GPU", + "model": "A40" + }, + { + "id": "00000000:41:00.0", + "type": "Nvidia GPU", + "model": "A40" + }, + { + "id": "00000000:61:00.0", + "type": "Nvidia GPU", + "model": "A40" + }, + { + "id": "00000000:81:00.0", + "type": "Nvidia GPU", + "model": "A40" + }, + { + "id": "00000000:A1:00.0", + "type": "Nvidia GPU", + "model": "A40" + }, + { + "id": "00000000:C1:00.0", + "type": "Nvidia GPU", + "model": "A40" + }, + { + "id": "00000000:E1:00.0", + "type": "Nvidia GPU", + "model": "A40" + } + ] + } + }, + { + "name": "a100", + "nodes": "a[0601-0605],a[0701-0705],a[0801-0805],a[0901-0905]", + "processorType": "AMD Milan", + "socketsPerNode": 2, + "coresPerSocket": 64, + "threadsPerCore": 1, + "flopRateScalar": { + "unit": { + "base": "F/s", + "prefix": "G" + }, + "value": 432 + }, + "flopRateSimd": { + "unit": { + "base": "F/s", + "prefix": "G" + }, + "value": 9216 + }, + "memoryBandwidth": { + "unit": { + "base": "B/s", + "prefix": "G" + }, + "value": 400 + }, + "topology": { + "node": [ + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8,
+ 9, + 10, + 11, + 12, + 13, + 14, + 15, + 16, + 17, + 18, + 19, + 20, + 21, + 22, + 23, + 24, + 25, + 26, + 27, + 28, + 29, + 30, + 31, + 32, + 33, + 34, + 35, + 36, + 37, + 38, + 39, + 40, + 41, + 42, + 43, + 44, + 45, + 46, + 47, + 48, + 49, + 50, + 51, + 52, + 53, + 54, + 55, + 56, + 57, + 58, + 59, + 60, + 61, + 62, + 63, + 64, + 65, + 66, + 67, + 68, + 69, + 70, + 71, + 72, + 73, + 74, + 75, + 76, + 77, + 78, + 79, + 80, + 81, + 82, + 83, + 84, + 85, + 86, + 87, + 88, + 89, + 90, + 91, + 92, + 93, + 94, + 95, + 96, + 97, + 98, + 99, + 100, + 101, + 102, + 103, + 104, + 105, + 106, + 107, + 108, + 109, + 110, + 111, + 112, + 113, + 114, + 115, + 116, + 117, + 118, + 119, + 120, + 121, + 122, + 123, + 124, + 125, + 126, + 127 + ], + "socket": [ + [ + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 11, + 12, + 13, + 14, + 15, + 16, + 17, + 18, + 19, + 20, + 21, + 22, + 23, + 24, + 25, + 26, + 27, + 28, + 29, + 30, + 31, + 32, + 33, + 34, + 35, + 36, + 37, + 38, + 39, + 40, + 41, + 42, + 43, + 44, + 45, + 46, + 47, + 48, + 49, + 50, + 51, + 52, + 53, + 54, + 55, + 56, + 57, + 58, + 59, + 60, + 61, + 62, + 63 + ], + [ + 64, + 65, + 66, + 67, + 68, + 69, + 70, + 71, + 72, + 73, + 74, + 75, + 76, + 77, + 78, + 79, + 80, + 81, + 82, + 83, + 84, + 85, + 86, + 87, + 88, + 89, + 90, + 91, + 92, + 93, + 94, + 95, + 96, + 97, + 98, + 99, + 100, + 101, + 102, + 103, + 104, + 105, + 106, + 107, + 108, + 109, + 110, + 111, + 112, + 113, + 114, + 115, + 116, + 117, + 118, + 119, + 120, + 121, + 122, + 123, + 124, + 125, + 126, + 127 + ] + ], + "memoryDomain": [ + [ + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 11, + 12, + 13, + 14, + 15, + 16, + 17, + 18, + 19, + 20, + 21, + 22, + 23, + 24, + 25, + 26, + 27, + 28, + 29, + 30, + 31, + 32, + 33, + 34, + 35, + 36, + 37, + 38, + 39, + 40, + 41, + 42, + 43, + 44, + 45, + 46, + 47, + 48, + 49, + 50, + 51, + 52, + 53, + 54, + 55, + 56, + 57, + 58, + 59, + 60, + 61, + 62, + 63, + 64, + 65, + 66, + 67, + 68, + 69, 
+ 70, + 71, + 72, + 73, + 74, + 75, + 76, + 77, + 78, + 79, + 80, + 81, + 82, + 83, + 84, + 85, + 86, + 87, + 88, + 89, + 90, + 91, + 92, + 93, + 94, + 95, + 96, + 97, + 98, + 99, + 100, + 101, + 102, + 103, + 104, + 105, + 106, + 107, + 108, + 109, + 110, + 111, + 112, + 113, + 114, + 115, + 116, + 117, + 118, + 119, + 120, + 121, + 122, + 123, + 124, + 125, + 126, + 127 + ] + ], + "core": [ + [ + 0 + ], + [ + 1 + ], + [ + 2 + ], + [ + 3 + ], + [ + 4 + ], + [ + 5 + ], + [ + 6 + ], + [ + 7 + ], + [ + 8 + ], + [ + 9 + ], + [ + 10 + ], + [ + 11 + ], + [ + 12 + ], + [ + 13 + ], + [ + 14 + ], + [ + 15 + ], + [ + 16 + ], + [ + 17 + ], + [ + 18 + ], + [ + 19 + ], + [ + 20 + ], + [ + 21 + ], + [ + 22 + ], + [ + 23 + ], + [ + 24 + ], + [ + 25 + ], + [ + 26 + ], + [ + 27 + ], + [ + 28 + ], + [ + 29 + ], + [ + 30 + ], + [ + 31 + ], + [ + 32 + ], + [ + 33 + ], + [ + 34 + ], + [ + 35 + ], + [ + 36 + ], + [ + 37 + ], + [ + 38 + ], + [ + 39 + ], + [ + 40 + ], + [ + 41 + ], + [ + 42 + ], + [ + 43 + ], + [ + 44 + ], + [ + 45 + ], + [ + 46 + ], + [ + 47 + ], + [ + 48 + ], + [ + 49 + ], + [ + 50 + ], + [ + 51 + ], + [ + 52 + ], + [ + 53 + ], + [ + 54 + ], + [ + 55 + ], + [ + 56 + ], + [ + 57 + ], + [ + 58 + ], + [ + 59 + ], + [ + 60 + ], + [ + 61 + ], + [ + 62 + ], + [ + 63 + ], + [ + 64 + ], + [ + 65 + ], + [ + 66 + ], + [ + 67 + ], + [ + 68 + ], + [ + 69 + ], + [ + 70 + ], + [ + 71 + ], + [ + 72 + ], + [ + 73 + ], + [ + 74 + ], + [ + 75 + ], + [ + 76 + ], + [ + 77 + ], + [ + 78 + ], + [ + 79 + ], + [ + 80 + ], + [ + 81 + ], + [ + 82 + ], + [ + 83 + ], + [ + 84 + ], + [ + 85 + ], + [ + 86 + ], + [ + 87 + ], + [ + 88 + ], + [ + 89 + ], + [ + 90 + ], + [ + 91 + ], + [ + 92 + ], + [ + 93 + ], + [ + 94 + ], + [ + 95 + ], + [ + 96 + ], + [ + 97 + ], + [ + 98 + ], + [ + 99 + ], + [ + 100 + ], + [ + 101 + ], + [ + 102 + ], + [ + 103 + ], + [ + 104 + ], + [ + 105 + ], + [ + 106 + ], + [ + 107 + ], + [ + 108 + ], + [ + 109 + ], + [ + 110 + ], + [ + 111 + ], + [ + 112 + ], + [ + 113 + ], + [ + 114 + ], +
[ + 115 + ], + [ + 116 + ], + [ + 117 + ], + [ + 118 + ], + [ + 119 + ], + [ + 120 + ], + [ + 121 + ], + [ + 122 + ], + [ + 123 + ], + [ + 124 + ], + [ + 125 + ], + [ + 126 + ], + [ + 127 + ] + ], + "accelerators": [ + { + "id": "00000000:0E:00.0", + "type": "Nvidia GPU", + "model": "A100" + }, + { + "id": "00000000:13:00.0", + "type": "Nvidia GPU", + "model": "A100" + }, + { + "id": "00000000:49:00.0", + "type": "Nvidia GPU", + "model": "A100" + }, + { + "id": "00000000:4F:00.0", + "type": "Nvidia GPU", + "model": "A100" + }, + { + "id": "00000000:90:00.0", + "type": "Nvidia GPU", + "model": "A100" + }, + { + "id": "00000000:96:00.0", + "type": "Nvidia GPU", + "model": "A100" + }, + { + "id": "00000000:CC:00.0", + "type": "Nvidia GPU", + "model": "A100" + }, + { + "id": "00000000:D1:00.0", + "type": "Nvidia GPU", + "model": "A100" + } + ] + } + }, + { + "name": "a100m80", + "nodes": "a[0531-0537],a[0631-0633],a0731,a[0831-0833],a[0931-0934]", + "processorType": "AMD Milan", + "socketsPerNode": 2, + "coresPerSocket": 64, + "threadsPerCore": 1, + "flopRateScalar": { + "unit": { + "base": "F/s", + "prefix": "G" + }, + "value": 432 + }, + "flopRateSimd": { + "unit": { + "base": "F/s", + "prefix": "G" + }, + "value": 9216 + }, + "memoryBandwidth": { + "unit": { + "base": "B/s", + "prefix": "G" + }, + "value": 400 + }, + "topology": { + "node": [ + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 11, + 12, + 13, + 14, + 15, + 16, + 17, + 18, + 19, + 20, + 21, + 22, + 23, + 24, + 25, + 26, + 27, + 28, + 29, + 30, + 31, + 32, + 33, + 34, + 35, + 36, + 37, + 38, + 39, + 40, + 41, + 42, + 43, + 44, + 45, + 46, + 47, + 48, + 49, + 50, + 51, + 52, + 53, + 54, + 55, + 56, + 57, + 58, + 59, + 60, + 61, + 62, + 63, + 64, + 65, + 66, + 67, + 68, + 69, + 70, + 71, + 72, + 73, + 74, + 75, + 76, + 77, + 78, + 79, + 80, + 81, + 82, + 83, + 84, + 85, + 86, + 87, + 88, + 89, + 90, + 91, + 92, + 93, + 94, + 95, + 96, + 97, + 98, + 99, + 100, + 101, + 102, + 103, + 104, + 105, + 
106, + 107, + 108, + 109, + 110, + 111, + 112, + 113, + 114, + 115, + 116, + 117, + 118, + 119, + 120, + 121, + 122, + 123, + 124, + 125, + 126, + 127 + ], + "socket": [ + [ + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 11, + 12, + 13, + 14, + 15, + 16, + 17, + 18, + 19, + 20, + 21, + 22, + 23, + 24, + 25, + 26, + 27, + 28, + 29, + 30, + 31, + 32, + 33, + 34, + 35, + 36, + 37, + 38, + 39, + 40, + 41, + 42, + 43, + 44, + 45, + 46, + 47, + 48, + 49, + 50, + 51, + 52, + 53, + 54, + 55, + 56, + 57, + 58, + 59, + 60, + 61, + 62, + 63 + ], + [ + 64, + 65, + 66, + 67, + 68, + 69, + 70, + 71, + 72, + 73, + 74, + 75, + 76, + 77, + 78, + 79, + 80, + 81, + 82, + 83, + 84, + 85, + 86, + 87, + 88, + 89, + 90, + 91, + 92, + 93, + 94, + 95, + 96, + 97, + 98, + 99, + 100, + 101, + 102, + 103, + 104, + 105, + 106, + 107, + 108, + 109, + 110, + 111, + 112, + 113, + 114, + 115, + 116, + 117, + 118, + 119, + 120, + 121, + 122, + 123, + 124, + 125, + 126, + 127 + ] + ], + "memoryDomain": [ + [ + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 11, + 12, + 13, + 14, + 15, + 16, + 17, + 18, + 19, + 20, + 21, + 22, + 23, + 24, + 25, + 26, + 27, + 28, + 29, + 30, + 31, + 32, + 33, + 34, + 35, + 36, + 37, + 38, + 39, + 40, + 41, + 42, + 43, + 44, + 45, + 46, + 47, + 48, + 49, + 50, + 51, + 52, + 53, + 54, + 55, + 56, + 57, + 58, + 59, + 60, + 61, + 62, + 63, + 64, + 65, + 66, + 67, + 68, + 69, + 70, + 71, + 72, + 73, + 74, + 75, + 76, + 77, + 78, + 79, + 80, + 81, + 82, + 83, + 84, + 85, + 86, + 87, + 88, + 89, + 90, + 91, + 92, + 93, + 94, + 95, + 96, + 97, + 98, + 99, + 100, + 101, + 102, + 103, + 104, + 105, + 106, + 107, + 108, + 109, + 110, + 111, + 112, + 113, + 114, + 115, + 116, + 117, + 118, + 119, + 120, + 121, + 122, + 123, + 124, + 125, + 126, + 127 + ] + ], + "core": [ + [ + 0 + ], + [ + 1 + ], + [ + 2 + ], + [ + 3 + ], + [ + 4 + ], + [ + 5 + ], + [ + 6 + ], + [ + 7 + ], + [ + 8 + ], + [ + 9 + ], + [ + 10 + ], + [ + 11 + ], + [ + 12 + ], + [ + 13 + ], + [ + 
14 + ], + [ + 15 + ], + [ + 16 + ], + [ + 17 + ], + [ + 18 + ], + [ + 19 + ], + [ + 20 + ], + [ + 21 + ], + [ + 22 + ], + [ + 23 + ], + [ + 24 + ], + [ + 25 + ], + [ + 26 + ], + [ + 27 + ], + [ + 28 + ], + [ + 29 + ], + [ + 30 + ], + [ + 31 + ], + [ + 32 + ], + [ + 33 + ], + [ + 34 + ], + [ + 35 + ], + [ + 36 + ], + [ + 37 + ], + [ + 38 + ], + [ + 39 + ], + [ + 40 + ], + [ + 41 + ], + [ + 42 + ], + [ + 43 + ], + [ + 44 + ], + [ + 45 + ], + [ + 46 + ], + [ + 47 + ], + [ + 48 + ], + [ + 49 + ], + [ + 50 + ], + [ + 51 + ], + [ + 52 + ], + [ + 53 + ], + [ + 54 + ], + [ + 55 + ], + [ + 56 + ], + [ + 57 + ], + [ + 58 + ], + [ + 59 + ], + [ + 60 + ], + [ + 61 + ], + [ + 62 + ], + [ + 63 + ], + [ + 64 + ], + [ + 65 + ], + [ + 66 + ], + [ + 67 + ], + [ + 68 + ], + [ + 69 + ], + [ + 70 + ], + [ + 71 + ], + [ + 72 + ], + [ + 73 + ], + [ + 74 + ], + [ + 75 + ], + [ + 76 + ], + [ + 77 + ], + [ + 78 + ], + [ + 79 + ], + [ + 80 + ], + [ + 81 + ], + [ + 82 + ], + [ + 83 + ], + [ + 84 + ], + [ + 85 + ], + [ + 86 + ], + [ + 87 + ], + [ + 88 + ], + [ + 89 + ], + [ + 90 + ], + [ + 91 + ], + [ + 92 + ], + [ + 93 + ], + [ + 94 + ], + [ + 95 + ], + [ + 96 + ], + [ + 97 + ], + [ + 98 + ], + [ + 99 + ], + [ + 100 + ], + [ + 101 + ], + [ + 102 + ], + [ + 103 + ], + [ + 104 + ], + [ + 105 + ], + [ + 106 + ], + [ + 107 + ], + [ + 108 + ], + [ + 109 + ], + [ + 110 + ], + [ + 111 + ], + [ + 112 + ], + [ + 113 + ], + [ + 114 + ], + [ + 115 + ], + [ + 116 + ], + [ + 117 + ], + [ + 118 + ], + [ + 119 + ], + [ + 120 + ], + [ + 121 + ], + [ + 122 + ], + [ + 123 + ], + [ + 124 + ], + [ + 125 + ], + [ + 126 + ], + [ + 127 + ] + ], + "accelerators": [ + { + "id": "00000000:0E:00.0", + "type": "Nvidia GPU", + "model": "A100" + }, + { + "id": "00000000:13:00.0", + "type": "Nvidia GPU", + "model": "A100" + }, + { + "id": "00000000:49:00.0", + "type": "Nvidia GPU", + "model": "A100" + }, + { + "id": "00000000:4F:00.0", + "type": "Nvidia GPU", + "model": "A100" + }, + { + "id": "00000000:90:00.0", + "type": "Nvidia GPU",
+ "model": "A100" + }, + { + "id": "00000000:96:00.0", + "type": "Nvidia GPU", + "model": "A100" + }, + { + "id": "00000000:CC:00.0", + "type": "Nvidia GPU", + "model": "A100" + }, + { + "id": "00000000:D1:00.0", + "type": "Nvidia GPU", + "model": "A100" + } + ] + } + } + ] +} diff --git a/nhr@fau/job-archive/cluster-fritz.json b/nhr@fau/job-archive/cluster-fritz.json new file mode 100644 index 0000000..201d6b3 --- /dev/null +++ b/nhr@fau/job-archive/cluster-fritz.json @@ -0,0 +1,2293 @@ +{ + "name": "fritz", + "metricConfig": [ + { + "name": "cpu_load", + "unit": { + "base": "" + }, + "scope": "node", + "aggregation": "avg", + "footprint": "avg", + "timestep": 60, + "peak": 72, + "normal": 72, + "caution": 36, + "alert": 20, + "subClusters": [ + { + "name": "spr1tb", + "footprint": "avg", + "peak": 104, + "normal": 104, + "caution": 52, + "alert": 20 + }, + { + "name": "spr2tb", + "footprint": "avg", + "peak": 104, + "normal": 104, + "caution": 52, + "alert": 20 + } + ] + }, + { + "name": "cpu_user", + "unit": { + "base": "" + }, + "scope": "hwthread", + "aggregation": "avg", + "timestep": 60, + "peak": 100, + "normal": 50, + "caution": 20, + "alert": 10 + }, + { + "name": "mem_used", + "unit": { + "base": "B", + "prefix": "G" + }, + "scope": "node", + "aggregation": "sum", + "footprint": "max", + "timestep": 60, + "peak": 256, + "normal": 128, + "caution": 200, + "alert": 240, + "subClusters": [ + { + "name": "spr1tb", + "footprint": "max", + "peak": 1024, + "normal": 512, + "caution": 900, + "alert": 1000 + }, + { + "name": "spr2tb", + "footprint": "max", + "peak": 2048, + "normal": 1024, + "caution": 1800, + "alert": 2000 + } + ] + }, + { + "name": "flops_any", + "unit": { + "base": "Flops/s", + "prefix": "G" + }, + "scope": "hwthread", + "aggregation": "sum", + "footprint": "avg", + "timestep": 60, + "peak": 5600, + "normal": 1000, + "caution": 200, + "alert": 50, + "subClusters": [ + { + "name": "spr1tb", + "peak": 6656, + "normal": 1500, + "caution": 400, + 
"alert": 50, + "footprint": "avg" + }, + { + "name": "spr2tb", + "peak": 6656, + "normal": 1500, + "caution": 400, + "alert": 50, + "footprint": "avg" + } + ] + }, + { + "name": "flops_sp", + "unit": { + "base": "Flops/s", + "prefix": "G" + }, + "scope": "hwthread", + "aggregation": "sum", + "timestep": 60, + "peak": 5600, + "normal": 1000, + "caution": 200, + "alert": 50, + "subClusters": [ + { + "name": "spr1tb", + "peak": 6656, + "normal": 1500, + "caution": 400, + "alert": 50 + }, + { + "name": "spr2tb", + "peak": 6656, + "normal": 1500, + "caution": 400, + "alert": 50 + } + ] + }, + { + "name": "flops_dp", + "unit": { + "base": "Flops/s", + "prefix": "G" + }, + "scope": "hwthread", + "aggregation": "sum", + "timestep": 60, + "peak": 2300, + "normal": 500, + "caution": 100, + "alert": 50, + "subClusters": [ + { + "name": "spr1tb", + "peak": 3300, + "normal": 750, + "caution": 200, + "alert": 50 + }, + { + "name": "spr2tb", + "peak": 3300, + "normal": 750, + "caution": 200, + "alert": 50 + } + ] + }, + { + "name": "mem_bw", + "unit": { + "base": "B/s", + "prefix": "G" + }, + "scope": "socket", + "aggregation": "sum", + "footprint": "avg", + "timestep": 60, + "peak": 350, + "normal": 100, + "caution": 50, + "alert": 10, + "subClusters": [ + { + "name": "spr1tb", + "footprint": "avg", + "peak": 549, + "normal": 200, + "caution": 100, + "alert": 20 + }, + { + "name": "spr2tb", + "footprint": "avg", + "peak": 520, + "normal": 200, + "caution": 100, + "alert": 20 + } + ] + }, + { + "name": "clock", + "unit": { + "base": "Hz", + "prefix": "M" + }, + "scope": "hwthread", + "aggregation": "avg", + "timestep": 60, + "peak": 3000, + "normal": 2400, + "caution": 1800, + "alert": 1200, + "subClusters": [ + { + "name": "spr1tb", + "peak": 3000, + "normal": 2000, + "caution": 1600, + "alert": 1200, + "remove": false + }, + { + "name": "spr2tb", + "peak": 3000, + "normal": 2000, + "caution": 1600, + "alert": 1200, + "remove": false + } + ] + }, + { + "name": "cpu_power", + 
"unit": { + "base": "W" + }, + "scope": "socket", + "aggregation": "sum", + "energy": "power", + "timestep": 60, + "peak": 500, + "normal": 250, + "caution": 100, + "alert": 50, + "subClusters": [ + { + "name": "spr1tb", + "peak": 700, + "energy": "power", + "normal": 350, + "caution": 150, + "alert": 50 + }, + { + "name": "spr2tb", + "peak": 700, + "energy": "power", + "normal": 350, + "caution": 150, + "alert": 50 + } + ] + }, + { + "name": "mem_power", + "unit": { + "base": "W" + }, + "scope": "socket", + "aggregation": "sum", + "energy": "power", + "timestep": 60, + "peak": 100, + "normal": 50, + "caution": 20, + "alert": 10, + "subClusters": [ + { + "name": "spr1tb", + "peak": 400, + "energy": "power", + "normal": 200, + "caution": 80, + "alert": 40 + }, + { + "name": "spr2tb", + "peak": 800, + "energy": "power", + "normal": 400, + "caution": 160, + "alert": 80 + } + ] + }, + { + "name": "ipc", + "unit": { + "base": "IPC" + }, + "scope": "hwthread", + "aggregation": "avg", + "timestep": 60, + "peak": 4, + "normal": 2, + "caution": 1, + "alert": 0.5, + "subClusters": [ + { + "name": "spr1tb", + "peak": 6, + "normal": 2, + "caution": 1, + "alert": 0.5 + }, + { + "name": "spr2tb", + "peak": 6, + "normal": 2, + "caution": 1, + "alert": 0.5 + } + ] + }, + { + "name": "vectorization_ratio", + "unit": { + "base": "" + }, + "scope": "hwthread", + "aggregation": "avg", + "timestep": 60, + "peak": 100, + "normal": 60, + "caution": 40, + "alert": 10, + "subClusters": [ + { + "name": "spr1tb", + "peak": 100, + "normal": 60, + "caution": 40, + "alert": 10 + }, + { + "name": "spr2tb", + "peak": 100, + "normal": 60, + "caution": 40, + "alert": 10 + } + ] + }, + { + "name": "ib_recv", + "unit": { + "base": "B/s" + }, + "scope": "node", + "aggregation": "sum", + "timestep": 60, + "peak": 1250000, + "normal": 6000000, + "caution": 200, + "alert": 1 + }, + { + "name": "ib_xmit", + "unit": { + "base": "B/s" + }, + "scope": "node", + "aggregation": "sum", + "timestep": 60, + 
"peak": 1250000, + "normal": 6000000, + "caution": 200, + "alert": 1 + }, + { + "name": "ib_recv_pkts", + "unit": { + "base": "packets/s" + }, + "scope": "node", + "aggregation": "sum", + "timestep": 60, + "peak": 6, + "normal": 4, + "caution": 2, + "alert": 1 + }, + { + "name": "ib_xmit_pkts", + "unit": { + "base": "packets/s" + }, + "scope": "node", + "aggregation": "sum", + "timestep": 60, + "peak": 6, + "normal": 4, + "caution": 2, + "alert": 1 + }, + { + "name": "nfs4_read", + "unit": { + "base": "IOP", + "prefix": "" + }, + "scope": "node", + "aggregation": "sum", + "timestep": 60, + "peak": 1000, + "normal": 50, + "caution": 200, + "alert": 500 + }, + { + "name": "nfs4_total", + "unit": { + "base": "IOP", + "prefix": "" + }, + "scope": "node", + "aggregation": "sum", + "timestep": 60, + "peak": 1000, + "normal": 50, + "caution": 200, + "alert": 500 + } + ], + "subClusters": [ + { + "name": "main", + "nodes": "f[0101-0188,0201-0288,0301-0388,0401-0488,0501-0588,0601-0688,0701-0788,0801-0888,0901-0988,1001-1088,1101-1156,1201-1256]", + "processorType": "Intel Icelake", + "socketsPerNode": 2, + "coresPerSocket": 36, + "threadsPerCore": 1, + "flopRateScalar": { + "unit": { + "base": "F/s", + "prefix": "G" + }, + "value": 432 + }, + "flopRateSimd": { + "unit": { + "base": "F/s", + "prefix": "G" + }, + "value": 9216 + }, + "memoryBandwidth": { + "unit": { + "base": "B/s", + "prefix": "G" + }, + "value": 350 + }, + "topology": { + "node": [ + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 11, + 12, + 13, + 14, + 15, + 16, + 17, + 18, + 19, + 20, + 21, + 22, + 23, + 24, + 25, + 26, + 27, + 28, + 29, + 30, + 31, + 32, + 33, + 34, + 35, + 36, + 37, + 38, + 39, + 40, + 41, + 42, + 43, + 44, + 45, + 46, + 47, + 48, + 49, + 50, + 51, + 52, + 53, + 54, + 55, + 56, + 57, + 58, + 59, + 60, + 61, + 62, + 63, + 64, + 65, + 66, + 67, + 68, + 69, + 70, + 71 + ], + "socket": [ + [ + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 11, + 12, + 13, + 14, + 15, + 
16, + 17, + 18, + 19, + 20, + 21, + 22, + 23, + 24, + 25, + 26, + 27, + 28, + 29, + 30, + 31, + 32, + 33, + 34, + 35 + ], + [ + 36, + 37, + 38, + 39, + 40, + 41, + 42, + 43, + 44, + 45, + 46, + 47, + 48, + 49, + 50, + 51, + 52, + 53, + 54, + 55, + 56, + 57, + 58, + 59, + 60, + 61, + 62, + 63, + 64, + 65, + 66, + 67, + 68, + 69, + 70, + 71 + ] + ], + "memoryDomain": [ + [ + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 11, + 12, + 13, + 14, + 15, + 16, + 17 + ], + [ + 18, + 19, + 20, + 21, + 22, + 23, + 24, + 25, + 26, + 27, + 28, + 29, + 30, + 31, + 32, + 33, + 34, + 35 + ], + [ + 36, + 37, + 38, + 39, + 40, + 41, + 42, + 43, + 44, + 45, + 46, + 47, + 48, + 49, + 50, + 51, + 52, + 53 + ], + [ + 54, + 55, + 56, + 57, + 58, + 59, + 60, + 61, + 62, + 63, + 64, + 65, + 66, + 67, + 68, + 69, + 70, + 71 + ] + ], + "core": [ + [ + 0 + ], + [ + 1 + ], + [ + 2 + ], + [ + 3 + ], + [ + 4 + ], + [ + 5 + ], + [ + 6 + ], + [ + 7 + ], + [ + 8 + ], + [ + 9 + ], + [ + 10 + ], + [ + 11 + ], + [ + 12 + ], + [ + 13 + ], + [ + 14 + ], + [ + 15 + ], + [ + 16 + ], + [ + 17 + ], + [ + 18 + ], + [ + 19 + ], + [ + 20 + ], + [ + 21 + ], + [ + 22 + ], + [ + 23 + ], + [ + 24 + ], + [ + 25 + ], + [ + 26 + ], + [ + 27 + ], + [ + 28 + ], + [ + 29 + ], + [ + 30 + ], + [ + 31 + ], + [ + 32 + ], + [ + 33 + ], + [ + 34 + ], + [ + 35 + ], + [ + 36 + ], + [ + 37 + ], + [ + 38 + ], + [ + 39 + ], + [ + 40 + ], + [ + 41 + ], + [ + 42 + ], + [ + 43 + ], + [ + 44 + ], + [ + 45 + ], + [ + 46 + ], + [ + 47 + ], + [ + 48 + ], + [ + 49 + ], + [ + 50 + ], + [ + 51 + ], + [ + 52 + ], + [ + 53 + ], + [ + 54 + ], + [ + 55 + ], + [ + 56 + ], + [ + 57 + ], + [ + 58 + ], + [ + 59 + ], + [ + 60 + ], + [ + 61 + ], + [ + 62 + ], + [ + 63 + ], + [ + 64 + ], + [ + 65 + ], + [ + 66 + ], + [ + 67 + ], + [ + 68 + ], + [ + 69 + ], + [ + 70 + ], + [ + 71 + ] + ] + } + }, + { + "name": "spr1tb", + "processorType": "Intel(R) Xeon(R) Platinum 8470", + "socketsPerNode": 2, + "coresPerSocket": 52, + "threadsPerCore": 1, + 
"flopRateScalar": { + "unit": { + "base": "F/s", + "prefix": "G" + }, + "value": 695 + }, + "flopRateSimd": { + "unit": { + "base": "F/s", + "prefix": "G" + }, + "value": 9216 + }, + "memoryBandwidth": { + "unit": { + "base": "B/s", + "prefix": "G" + }, + "value": 549 + }, + "nodes": "f[2157-2180,2257-2280]", + "topology": { + "node": [ + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 11, + 12, + 13, + 14, + 15, + 16, + 17, + 18, + 19, + 20, + 21, + 22, + 23, + 24, + 25, + 26, + 27, + 28, + 29, + 30, + 31, + 32, + 33, + 34, + 35, + 36, + 37, + 38, + 39, + 40, + 41, + 42, + 43, + 44, + 45, + 46, + 47, + 48, + 49, + 50, + 5152, + 53, + 54, + 55, + 56, + 57, + 58, + 59, + 60, + 61, + 62, + 63, + 64, + 65, + 66, + 67, + 68, + 69, + 70, + 71, + 72, + 73, + 74, + 75, + 76, + 77, + 78, + 79, + 80, + 81, + 82, + 83, + 84, + 85, + 86, + 87, + 88, + 89, + 90, + 91, + 92, + 93, + 94, + 95, + 96, + 97, + 98, + 99, + 100, + 101, + 102, + 103 + ], + "socket": [ + [ + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 11, + 12, + 13, + 14, + 15, + 16, + 17, + 18, + 19, + 20, + 21, + 22, + 23, + 24, + 25, + 26, + 27, + 28, + 29, + 30, + 31, + 32, + 33, + 34, + 35, + 36, + 37, + 38, + 39, + 40, + 41, + 42, + 43, + 44, + 45, + 46, + 47, + 48, + 49, + 50, + 51 + ], + [ + 52, + 53, + 54, + 55, + 56, + 57, + 58, + 59, + 60, + 61, + 62, + 63, + 64, + 65, + 66, + 67, + 68, + 69, + 70, + 71, + 72, + 73, + 74, + 75, + 76, + 77, + 78, + 79, + 80, + 81, + 82, + 83, + 84, + 85, + 86, + 87, + 88, + 89, + 90, + 91, + 92, + 93, + 94, + 95, + 96, + 97, + 98, + 99, + 100, + 101, + 102, + 103 + ] + ], + "memoryDomain": [ + [ + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 11, + 12 + ], + [ + 13, + 14, + 15, + 16, + 17, + 18, + 19, + 20, + 21, + 22, + 23, + 24, + 25 + ], + [ + 26, + 27, + 28, + 29, + 30, + 31, + 32, + 33, + 34, + 35, + 36, + 37, + 38 + ], + [ + 39, + 40, + 41, + 42, + 43, + 44, + 45, + 46, + 47, + 48, + 49, + 50, + 51 + ], + [ + 52, + 53, + 54, + 55, + 56, + 
57, + 58, + 59, + 60, + 61, + 62, + 63, + 64 + ], + [ + 65, + 66, + 67, + 68, + 69, + 70, + 71, + 72, + 73, + 74, + 75, + 76, + 77 + ], + [ + 78, + 79, + 80, + 81, + 82, + 83, + 84, + 85, + 86, + 87, + 88, + 89, + 90 + ], + [ + 91, + 92, + 93, + 94, + 95, + 96, + 97, + 98, + 99, + 100, + 101, + 102, + 103 + ] + ], + "core": [ + [ + 0 + ], + [ + 1 + ], + [ + 2 + ], + [ + 3 + ], + [ + 4 + ], + [ + 5 + ], + [ + 6 + ], + [ + 7 + ], + [ + 8 + ], + [ + 9 + ], + [ + 10 + ], + [ + 11 + ], + [ + 12 + ], + [ + 13 + ], + [ + 14 + ], + [ + 15 + ], + [ + 16 + ], + [ + 17 + ], + [ + 18 + ], + [ + 19 + ], + [ + 20 + ], + [ + 21 + ], + [ + 22 + ], + [ + 23 + ], + [ + 24 + ], + [ + 25 + ], + [ + 26 + ], + [ + 27 + ], + [ + 28 + ], + [ + 29 + ], + [ + 30 + ], + [ + 31 + ], + [ + 32 + ], + [ + 33 + ], + [ + 34 + ], + [ + 35 + ], + [ + 36 + ], + [ + 37 + ], + [ + 38 + ], + [ + 39 + ], + [ + 40 + ], + [ + 41 + ], + [ + 42 + ], + [ + 43 + ], + [ + 44 + ], + [ + 45 + ], + [ + 46 + ], + [ + 47 + ], + [ + 48 + ], + [ + 49 + ], + [ + 50 + ], + [ + 51 + ], + [ + 52 + ], + [ + 53 + ], + [ + 54 + ], + [ + 55 + ], + [ + 56 + ], + [ + 57 + ], + [ + 58 + ], + [ + 59 + ], + [ + 60 + ], + [ + 61 + ], + [ + 62 + ], + [ + 63 + ], + [ + 64 + ], + [ + 65 + ], + [ + 66 + ], + [ + 67 + ], + [ + 68 + ], + [ + 69 + ], + [ + 70 + ], + [ + 71 + ], + [ + 72 + ], + [ + 73 + ], + [ + 74 + ], + [ + 75 + ], + [ + 76 + ], + [ + 77 + ], + [ + 78 + ], + [ + 79 + ], + [ + 80 + ], + [ + 81 + ], + [ + 82 + ], + [ + 83 + ], + [ + 84 + ], + [ + 85 + ], + [ + 86 + ], + [ + 87 + ], + [ + 88 + ], + [ + 89 + ], + [ + 90 + ], + [ + 91 + ], + [ + 92 + ], + [ + 93 + ], + [ + 94 + ], + [ + 95 + ], + [ + 96 + ], + [ + 97 + ], + [ + 98 + ], + [ + 99 + ], + [ + 100 + ], + [ + 101 + ], + [ + 102 + ], + [ + 103 + ] + ] + } + }, + { + "name": "spr2tb", + "processorType": "Intel(R) Xeon(R) Platinum 8470", + "socketsPerNode": 2, + "coresPerSocket": 52, + "threadsPerCore": 1, + "flopRateScalar": { + "unit": { + "base": "F/s", + "prefix": 
"G" + }, + "value": 695 + }, + "flopRateSimd": { + "unit": { + "base": "F/s", + "prefix": "G" + }, + "value": 9216 + }, + "memoryBandwidth": { + "unit": { + "base": "B/s", + "prefix": "G" + }, + "value": 515 + }, + "nodes": "f[2181-2188,2281-2288]", + "topology": { + "node": [ + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 11, + 12, + 13, + 14, + 15, + 16, + 17, + 18, + 19, + 20, + 21, + 22, + 23, + 24, + 25, + 26, + 27, + 28, + 29, + 30, + 31, + 32, + 33, + 34, + 35, + 36, + 37, + 38, + 39, + 40, + 41, + 42, + 43, + 44, + 45, + 46, + 47, + 48, + 49, + 50, + 51, + 52, + 53, + 54, + 55, + 56, + 57, + 58, + 59, + 60, + 61, + 62, + 63, + 64, + 65, + 66, + 67, + 68, + 69, + 70, + 71, + 72, + 73, + 74, + 75, + 76, + 77, + 78, + 79, + 80, + 81, + 82, + 83, + 84, + 85, + 86, + 87, + 88, + 89, + 90, + 91, + 92, + 93, + 94, + 95, + 96, + 97, + 98, + 99, + 100, + 101, + 102, + 103 + ], + "socket": [ + [ + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 11, + 12, + 13, + 14, + 15, + 16, + 17, + 18, + 19, + 20, + 21, + 22, + 23, + 24, + 25, + 26, + 27, + 28, + 29, + 30, + 31, + 32, + 33, + 34, + 35, + 36, + 37, + 38, + 39, + 40, + 41, + 42, + 43, + 44, + 45, + 46, + 47, + 48, + 49, + 50, + 51 + ], + [ + 52, + 53, + 54, + 55, + 56, + 57, + 58, + 59, + 60, + 61, + 62, + 63, + 64, + 65, + 66, + 67, + 68, + 69, + 70, + 71, + 72, + 73, + 74, + 75, + 76, + 77, + 78, + 79, + 80, + 81, + 82, + 83, + 84, + 85, + 86, + 87, + 88, + 89, + 90, + 91, + 92, + 93, + 94, + 95, + 96, + 97, + 98, + 99, + 100, + 101, + 102, + 103 + ] + ], + "memoryDomain": [ + [ + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 11, + 12 + ], + [ + 13, + 14, + 15, + 16, + 17, + 18, + 19, + 20, + 21, + 22, + 23, + 24, + 25 + ], + [ + 26, + 27, + 28, + 29, + 30, + 31, + 32, + 33, + 34, + 35, + 36, + 37, + 38 + ], + [ + 39, + 40, + 41, + 42, + 43, + 44, + 45, + 46, + 47, + 48, + 49, + 50, + 51 + ], + [ + 52, + 53, + 54, + 55, + 56, + 57, + 58, + 59, + 60, + 61, + 62, + 63, + 64 + ], + [ + 
65, + 66, + 67, + 68, + 69, + 70, + 71, + 72, + 73, + 74, + 75, + 76, + 77 + ], + [ + 78, + 79, + 80, + 81, + 82, + 83, + 84, + 85, + 86, + 87, + 88, + 89, + 90 + ], + [ + 91, + 92, + 93, + 94, + 95, + 96, + 97, + 98, + 99, + 100, + 101, + 102, + 103 + ] + ], + "core": [ + [ + 0 + ], + [ + 1 + ], + [ + 2 + ], + [ + 3 + ], + [ + 4 + ], + [ + 5 + ], + [ + 6 + ], + [ + 7 + ], + [ + 8 + ], + [ + 9 + ], + [ + 10 + ], + [ + 11 + ], + [ + 12 + ], + [ + 13 + ], + [ + 14 + ], + [ + 15 + ], + [ + 16 + ], + [ + 17 + ], + [ + 18 + ], + [ + 19 + ], + [ + 20 + ], + [ + 21 + ], + [ + 22 + ], + [ + 23 + ], + [ + 24 + ], + [ + 25 + ], + [ + 26 + ], + [ + 27 + ], + [ + 28 + ], + [ + 29 + ], + [ + 30 + ], + [ + 31 + ], + [ + 32 + ], + [ + 33 + ], + [ + 34 + ], + [ + 35 + ], + [ + 36 + ], + [ + 37 + ], + [ + 38 + ], + [ + 39 + ], + [ + 40 + ], + [ + 41 + ], + [ + 42 + ], + [ + 43 + ], + [ + 44 + ], + [ + 45 + ], + [ + 46 + ], + [ + 47 + ], + [ + 48 + ], + [ + 49 + ], + [ + 50 + ], + [ + 51 + ], + [ + 52 + ], + [ + 53 + ], + [ + 54 + ], + [ + 55 + ], + [ + 56 + ], + [ + 57 + ], + [ + 58 + ], + [ + 59 + ], + [ + 60 + ], + [ + 61 + ], + [ + 62 + ], + [ + 63 + ], + [ + 64 + ], + [ + 65 + ], + [ + 66 + ], + [ + 67 + ], + [ + 68 + ], + [ + 69 + ], + [ + 70 + ], + [ + 71 + ], + [ + 72 + ], + [ + 73 + ], + [ + 74 + ], + [ + 75 + ], + [ + 76 + ], + [ + 77 + ], + [ + 78 + ], + [ + 79 + ], + [ + 80 + ], + [ + 81 + ], + [ + 82 + ], + [ + 83 + ], + [ + 84 + ], + [ + 85 + ], + [ + 86 + ], + [ + 87 + ], + [ + 88 + ], + [ + 89 + ], + [ + 90 + ], + [ + 91 + ], + [ + 92 + ], + [ + 93 + ], + [ + 94 + ], + [ + 95 + ], + [ + 96 + ], + [ + 97 + ], + [ + 98 + ], + [ + 99 + ], + [ + 100 + ], + [ + 101 + ], + [ + 102 + ], + [ + 103 + ] + ] + } + } + ] +} diff --git a/nhr@fau/job-archive/cluster-meggie.json b/nhr@fau/job-archive/cluster-meggie.json new file mode 100644 index 0000000..8894aa2 --- /dev/null +++ b/nhr@fau/job-archive/cluster-meggie.json @@ -0,0 +1,357 @@ +{ + "name": "meggie", + "metricConfig": [ 
+ { + "name": "cpu_load", + "unit": { + "base": "load" + }, + "scope": "node", + "aggregation": "avg", + "footprint": "avg", + "timestep": 60, + "peak": 40, + "normal": 20, + "caution": 15, + "alert": 10 + }, + { + "name": "mem_used", + "unit": { + "base": "B", + "prefix": "G" + }, + "scope": "node", + "aggregation": "sum", + "footprint": "max", + "timestep": 60, + "peak": 64, + "normal": 20, + "caution": 40, + "alert": 55 + }, + { + "name": "flops_any", + "unit": { + "base": "Flops/s", + "prefix": "G" + }, + "scope": "node", + "aggregation": "sum", + "footprint": "avg", + "timestep": 60, + "peak": 1536, + "normal": 200, + "caution": 40, + "alert": 4 + }, + { + "name": "flops_sp", + "unit": { + "base": "Flops/s", + "prefix": "G" + }, + "scope": "node", + "aggregation": "sum", + "timestep": 60, + "peak": 1536, + "normal": 100, + "caution": 20, + "alert": 2 + }, + { + "name": "flops_dp", + "unit": { + "base": "Flops/s", + "prefix": "G" + }, + "scope": "node", + "aggregation": "sum", + "timestep": 60, + "peak": 768, + "normal": 50, + "caution": 10, + "alert": 2 + }, + { + "name": "mem_bw", + "unit": { + "base": "B/s", + "prefix": "G" + }, + "scope": "node", + "aggregation": "sum", + "footprint": "avg", + "timestep": 60, + "peak": 140, + "normal": 70, + "caution": 20, + "alert": 5 + }, + { + "name": "clock", + "unit": { + "base": "Hz", + "prefix": "M" + }, + "scope": "node", + "aggregation": "avg", + "timestep": 60, + "peak": 3000, + "normal": 2400, + "caution": 1800, + "alert": 1200 + }, + { + "name": "cpu_power", + "unit": { + "base": "W" + }, + "scope": "socket", + "aggregation": "sum", + "energy": "power", + "timestep": 60, + "peak": 80, + "normal": 30, + "caution": 10, + "alert": 5 + }, + { + "name": "mem_power", + "unit": { + "base": "W" + }, + "scope": "socket", + "aggregation": "sum", + "energy": "power", + "timestep": 60, + "peak": 100, + "normal": 50, + "caution": 20, + "alert": 10 + }, + { + "name": "ipc", + "unit": { + "base": "IPC" + }, + "scope": "node", 
+ "aggregation": "avg", + "timestep": 60, + "peak": 4, + "normal": 2, + "caution": 1, + "alert": 0.5 + }, + { + "name": "vectorization_ratio", + "unit": { + "base": "" + }, + "scope": "hwthread", + "aggregation": "avg", + "timestep": 60, + "peak": 100, + "normal": 60, + "caution": 40, + "alert": 10 + }, + { + "name": "nfs4_read", + "unit": { + "base": "IOP", + "prefix": "" + }, + "scope": "node", + "aggregation": "sum", + "timestep": 60, + "peak": 6, + "normal": 4, + "caution": 2, + "alert": 1 + }, + { + "name": "nfs4_total", + "unit": { + "base": "IOP", + "prefix": "" + }, + "scope": "node", + "aggregation": "sum", + "timestep": 60, + "peak": 6, + "normal": 4, + "caution": 2, + "alert": 1 + } + ], + "subClusters": [ + { + "name": "main", + "nodes": "m[0101-0164,0201-0264,0301-0364,0401-0464,0601-0676,0701-0776,0801-0872,0901-0972,1001-1072,1101-1172]", + "processorType": "Intel Broadwell", + "socketsPerNode": 2, + "coresPerSocket": 10, + "threadsPerCore": 1, + "flopRateScalar": { + "unit": { + "base": "F/s", + "prefix": "G" + }, + "value": 96 + }, + "flopRateSimd": { + "unit": { + "base": "F/s", + "prefix": "G" + }, + "value": 1536 + }, + "memoryBandwidth": { + "unit": { + "base": "B/s", + "prefix": "G" + }, + "value": 140 + }, + "topology": { + "node": [ + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 11, + 12, + 13, + 14, + 15, + 16, + 17, + 18, + 19 + ], + "socket": [ + [ + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9 + ], + [ + 10, + 11, + 12, + 13, + 14, + 15, + 16, + 17, + 18, + 19 + ] + ], + "memoryDomain": [ + [ + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9 + ], + [ + 10, + 11, + 12, + 13, + 14, + 15, + 16, + 17, + 18, + 19 + ] + ], + "core": [ + [ + 0 + ], + [ + 1 + ], + [ + 2 + ], + [ + 3 + ], + [ + 4 + ], + [ + 5 + ], + [ + 6 + ], + [ + 7 + ], + [ + 8 + ], + [ + 9 + ], + [ + 10 + ], + [ + 11 + ], + [ + 12 + ], + [ + 13 + ], + [ + 14 + ], + [ + 15 + ], + [ + 16 + ], + [ + 17 + ], + [ + 18 + ], + [ + 19 + ] + ] + } + } + ] +} diff --git 
a/nhr@fau/job-archive/cluster-tinyfat.json b/nhr@fau/job-archive/cluster-tinyfat.json new file mode 100644 index 0000000..a2a4bf9 --- /dev/null +++ b/nhr@fau/job-archive/cluster-tinyfat.json @@ -0,0 +1,1411 @@ +{ + "name": "tinyfat", + "metricConfig": [ + { + "name": "cpu_load", + "unit": { + "base": "" + }, + "scope": "node", + "aggregation": "avg", + "footprint": "avg", + "timestep": 60, + "peak": 72, + "normal": 72, + "caution": 36, + "alert": 20 + }, + { + "name": "cpu_user", + "unit": { + "base": "" + }, + "scope": "hwthread", + "aggregation": "avg", + "timestep": 60, + "peak": 100, + "normal": 50, + "caution": 20, + "alert": 10 + }, + { + "name": "mem_used", + "unit": { + "base": "B", + "prefix": "G" + }, + "scope": "node", + "aggregation": "sum", + "footprint": "max", + "timestep": 60, + "peak": 256, + "normal": 128, + "caution": 200, + "alert": 240 + }, + { + "name": "flops_any", + "unit": { + "base": "Flops/s", + "prefix": "G" + }, + "scope": "hwthread", + "aggregation": "sum", + "footprint": "avg", + "timestep": 60, + "peak": 5600, + "normal": 1000, + "caution": 200, + "alert": 50 + }, + { + "name": "flops_sp", + "unit": { + "base": "Flops/s", + "prefix": "G" + }, + "scope": "hwthread", + "aggregation": "sum", + "timestep": 60, + "peak": 5600, + "normal": 1000, + "caution": 200, + "alert": 50 + }, + { + "name": "flops_dp", + "unit": { + "base": "Flops/s", + "prefix": "G" + }, + "scope": "hwthread", + "aggregation": "sum", + "timestep": 60, + "peak": 2300, + "normal": 500, + "caution": 100, + "alert": 50 + }, + { + "name": "mem_bw", + "unit": { + "base": "B/s", + "prefix": "G" + }, + "scope": "socket", + "aggregation": "sum", + "footprint": "avg", + "timestep": 60, + "peak": 350, + "normal": 100, + "caution": 50, + "alert": 10 + }, + { + "name": "clock", + "unit": { + "base": "Hz", + "prefix": "M" + }, + "scope": "hwthread", + "aggregation": "avg", + "timestep": 60, + "peak": 3000, + "normal": 2400, + "caution": 1800, + "alert": 1200 + }, + { + "name": 
"cpu_power", + "unit": { + "base": "W" + }, + "scope": "socket", + "aggregation": "sum", + "energy": "power", + "timestep": 60, + "peak": 500, + "normal": 250, + "caution": 100, + "alert": 50 + }, + { + "name": "mem_power", + "unit": { + "base": "W" + }, + "scope": "socket", + "aggregation": "sum", + "energy": "power", + "timestep": 60, + "peak": 100, + "normal": 50, + "caution": 20, + "alert": 10 + }, + { + "name": "ipc", + "unit": { + "base": "IPC" + }, + "scope": "hwthread", + "aggregation": "avg", + "timestep": 60, + "peak": 4, + "normal": 2, + "caution": 1, + "alert": 0.5 + }, + { + "name": "vectorization_ratio", + "unit": { + "base": "%" + }, + "scope": "hwthread", + "aggregation": "avg", + "timestep": 60, + "peak": 100, + "normal": 60, + "caution": 40, + "alert": 10 + }, + { + "name": "nfs4_read", + "unit": { + "base": "IOP", + "prefix": "" + }, + "scope": "node", + "aggregation": "sum", + "timestep": 60, + "peak": 6, + "normal": 4, + "caution": 2, + "alert": 1 + }, + { + "name": "nfs4_write", + "unit": { + "base": "IOP", + "prefix": "" + }, + "scope": "node", + "aggregation": "sum", + "timestep": 60, + "peak": 6, + "normal": 4, + "caution": 2, + "alert": 1 + }, + { + "name": "nfs4_total", + "unit": { + "base": "IOP", + "prefix": "" + }, + "scope": "node", + "aggregation": "sum", + "timestep": 60, + "peak": 6, + "normal": 4, + "caution": 2, + "alert": 1 + } + ], + "subClusters": [ + { + "name": "broadwell_512gb", + "nodes": "tf040,tf041,tf042", + "processorType": "Intel(R) Xeon(R) CPU E5-2680 v4 @ 2.40GHz", + "socketsPerNode": 2, + "coresPerSocket": 14, + "threadsPerCore": 2, + "flopRateScalar": { + "unit": { + "base": "F/s", + "prefix": "G" + }, + "value": 158 + }, + "flopRateSimd": { + "unit": { + "base": "F/s", + "prefix": "G" + }, + "value": 1236 + }, + "memoryBandwidth": { + "unit": { + "base": "B/s", + "prefix": "G" + }, + "value": 128 + }, + "topology": { + "node": [ + 0, + 28, + 1, + 29, + 2, + 30, + 3, + 31, + 4, + 32, + 5, + 33, + 6, + 34, + 7, + 
35, + 8, + 36, + 9, + 37, + 10, + 38, + 11, + 39, + 12, + 40, + 13, + 41, + 14, + 42, + 15, + 43, + 16, + 44, + 17, + 45, + 18, + 46, + 19, + 47, + 20, + 48, + 21, + 49, + 22, + 50, + 23, + 51, + 24, + 52, + 25, + 53, + 26, + 54, + 27, + 55 + ], + "socket": [ + [ + 0, + 28, + 1, + 29, + 2, + 30, + 3, + 31, + 4, + 32, + 5, + 33, + 6, + 34, + 7, + 35, + 8, + 36, + 9, + 37, + 10, + 38, + 11, + 39, + 12, + 40, + 13, + 41 + ], + [ + 14, + 42, + 15, + 43, + 16, + 44, + 17, + 45, + 18, + 46, + 19, + 47, + 20, + 48, + 21, + 49, + 22, + 50, + 23, + 51, + 24, + 52, + 25, + 53, + 26, + 54, + 27, + 55 + ] + ], + "memoryDomain": [ + [ + 0, + 28, + 1, + 29, + 2, + 30, + 3, + 31, + 4, + 32, + 5, + 33, + 6, + 34 + ], + [ + 7, + 35, + 8, + 36, + 9, + 37, + 10, + 38, + 11, + 39, + 12, + 40, + 13, + 41 + ], + [ + 14, + 42, + 15, + 43, + 16, + 44, + 17, + 45, + 18, + 46, + 19, + 47, + 20, + 48 + ], + [ + 21, + 49, + 22, + 50, + 23, + 51, + 24, + 52, + 25, + 53, + 26, + 54, + 27, + 55 + ] + ], + "core": [ + [ + 0, + 28 + ], + [ + 1, + 29 + ], + [ + 2, + 30 + ], + [ + 3, + 31 + ], + [ + 4, + 32 + ], + [ + 5, + 33 + ], + [ + 6, + 34 + ], + [ + 7, + 35 + ], + [ + 8, + 36 + ], + [ + 9, + 37 + ], + [ + 10, + 38 + ], + [ + 11, + 39 + ], + [ + 12, + 40 + ], + [ + 13, + 41 + ], + [ + 14, + 42 + ], + [ + 15, + 43 + ], + [ + 16, + 44 + ], + [ + 17, + 45 + ], + [ + 18, + 46 + ], + [ + 19, + 47 + ], + [ + 20, + 48 + ], + [ + 21, + 49 + ], + [ + 22, + 50 + ], + [ + 23, + 51 + ], + [ + 24, + 52 + ], + [ + 25, + 53 + ], + [ + 26, + 54 + ], + [ + 27, + 55 + ] + ] + } + }, + { + "name": "broadwell_256gb", + "nodes": "tf0[50-57]", + "processorType": "Intel(R) Xeon(R) CPU E5-2643 v4 @ 3.40GHz", + "socketsPerNode": 2, + "coresPerSocket": 6, + "threadsPerCore": 2, + "flopRateScalar": { + "unit": { + "base": "F/s", + "prefix": "G" + }, + "value": 85 + }, + "flopRateSimd": { + "unit": { + "base": "F/s", + "prefix": "G" + }, + "value": 672 + }, + "memoryBandwidth": { + "unit": { + "base": "B/s", + "prefix": "G" +
}, + "value": 114 + }, + "topology": { + "node": [ + 0, + 12, + 1, + 13, + 2, + 14, + 3, + 15, + 4, + 16, + 5, + 17, + 6, + 18, + 7, + 19, + 8, + 20, + 9, + 21, + 10, + 22, + 11, + 23 + ], + "socket": [ + [ + 0, + 12, + 1, + 13, + 2, + 14, + 3, + 15, + 4, + 16, + 5, + 17 + ], + [ + 6, + 18, + 7, + 19, + 8, + 20, + 9, + 21, + 10, + 22, + 11, + 23 + ] + ], + "memoryDomain": [ + [ + 0, + 12, + 1, + 13, + 2, + 14, + 3, + 15, + 4, + 16, + 5, + 17 + ], + [ + 6, + 18, + 7, + 19, + 8, + 20, + 9, + 21, + 10, + 22, + 11, + 23 + ] + ], + "core": [ + [ + 0, + 12 + ], + [ + 1, + 13 + ], + [ + 2, + 14 + ], + [ + 3, + 15 + ], + [ + 4, + 16 + ], + [ + 5, + 17 + ], + [ + 6, + 18 + ], + [ + 7, + 19 + ], + [ + 8, + 20 + ], + [ + 9, + 21 + ], + [ + 10, + 22 + ], + [ + 11, + 23 + ] + ] + } + }, + { + "name": "rome_512gb", + "nodes": "tf0[60-95]", + "processorType": "AMD EPYC 7502 32-Core Processor ", + "socketsPerNode": 2, + "coresPerSocket": 32, + "threadsPerCore": 2, + "flopRateScalar": { + "unit": { + "base": "F/s", + "prefix": "G" + }, + "value": 553 + }, + "flopRateSimd": { + "unit": { + "base": "F/s", + "prefix": "G" + }, + "value": 3198 + }, + "memoryBandwidth": { + "unit": { + "base": "B/s", + "prefix": "G" + }, + "value": 278 + }, + "topology": { + "node": [ + 0, + 64, + 1, + 65, + 2, + 66, + 3, + 67, + 4, + 68, + 5, + 69, + 6, + 70, + 7, + 71, + 8, + 72, + 9, + 73, + 10, + 74, + 11, + 75, + 12, + 76, + 13, + 77, + 14, + 78, + 15, + 79, + 16, + 80, + 17, + 81, + 18, + 82, + 19, + 83, + 20, + 84, + 21, + 85, + 22, + 86, + 23, + 87, + 24, + 88, + 25, + 89, + 26, + 90, + 27, + 91, + 28, + 92, + 29, + 93, + 30, + 94, + 31, + 95, + 32, + 96, + 33, + 97, + 34, + 98, + 35, + 99, + 36, + 100, + 37, + 101, + 38, + 102, + 39, + 103, + 40, + 104, + 41, + 105, + 42, + 106, + 43, + 107, + 44, + 108, + 45, + 109, + 46, + 110, + 47, + 111, + 48, + 112, + 49, + 113, + 50, + 114, + 51, + 115, + 52, + 116, + 53, + 117, + 54, + 118, + 55, + 119, + 56, + 120, + 57, + 121, + 58, + 122, + 59, + 123, + 60,
+ 124, + 61, + 125, + 62, + 126, + 63, + 127 + ], + "socket": [ + [ + 0, + 64, + 1, + 65, + 2, + 66, + 3, + 67, + 4, + 68, + 5, + 69, + 6, + 70, + 7, + 71, + 8, + 72, + 9, + 73, + 10, + 74, + 11, + 75, + 12, + 76, + 13, + 77, + 14, + 78, + 15, + 79, + 16, + 80, + 17, + 81, + 18, + 82, + 19, + 83, + 20, + 84, + 21, + 85, + 22, + 86, + 23, + 87, + 24, + 88, + 25, + 89, + 26, + 90, + 27, + 91, + 28, + 92, + 29, + 93, + 30, + 94, + 31, + 95 + ], + [ + 32, + 96, + 33, + 97, + 34, + 98, + 35, + 99, + 36, + 100, + 37, + 101, + 38, + 102, + 39, + 103, + 40, + 104, + 41, + 105, + 42, + 106, + 43, + 107, + 44, + 108, + 45, + 109, + 46, + 110, + 47, + 111, + 48, + 112, + 49, + 113, + 50, + 114, + 51, + 115, + 52, + 116, + 53, + 117, + 54, + 118, + 55, + 119, + 56, + 120, + 57, + 121, + 58, + 122, + 59, + 123, + 60, + 124, + 61, + 125, + 62, + 126, + 63, + 127 + ] + ], + "memoryDomain": [ + [ + 0, + 64, + 1, + 65, + 2, + 66, + 3, + 67, + 4, + 68, + 5, + 69, + 6, + 70, + 7, + 71 + ], + [ + 8, + 72, + 9, + 73, + 10, + 74, + 11, + 75, + 12, + 76, + 13, + 77, + 14, + 78, + 15, + 79 + ], + [ + 16, + 80, + 17, + 81, + 18, + 82, + 19, + 83, + 20, + 84, + 21, + 85, + 22, + 86, + 23, + 87 + ], + [ + 24, + 88, + 25, + 89, + 26, + 90, + 27, + 91, + 28, + 92, + 29, + 93, + 30, + 94, + 31, + 95 + ], + [ + 32, + 96, + 33, + 97, + 34, + 98, + 35, + 99, + 36, + 100, + 37, + 101, + 38, + 102, + 39, + 103 + ], + [ + 40, + 104, + 41, + 105, + 42, + 106, + 43, + 107, + 44, + 108, + 45, + 109, + 46, + 110, + 47, + 111 + ], + [ + 48, + 112, + 49, + 113, + 50, + 114, + 51, + 115, + 52, + 116, + 53, + 117, + 54, + 118, + 55, + 119 + ], + [ + 56, + 120, + 57, + 121, + 58, + 122, + 59, + 123, + 60, + 124, + 61, + 125, + 62, + 126, + 63, + 127 + ] + ], + "core": [ + [ + 0, + 64 + ], + [ + 1, + 65 + ], + [ + 2, + 66 + ], + [ + 3, + 67 + ], + [ + 4, + 68 + ], + [ + 5, + 69 + ], + [ + 6, + 70 + ], + [ + 7, + 71 + ], + [ + 8, + 72 + ], + [ + 9, + 73 + ], + [ + 10, + 74 + ], + [ + 11, + 75 + ], + [ + 12, + 
76 + ], + [ + 13, + 77 + ], + [ + 14, + 78 + ], + [ + 15, + 79 + ], + [ + 16, + 80 + ], + [ + 17, + 81 + ], + [ + 18, + 82 + ], + [ + 19, + 83 + ], + [ + 20, + 84 + ], + [ + 21, + 85 + ], + [ + 22, + 86 + ], + [ + 23, + 87 + ], + [ + 24, + 88 + ], + [ + 25, + 89 + ], + [ + 26, + 90 + ], + [ + 27, + 91 + ], + [ + 28, + 92 + ], + [ + 29, + 93 + ], + [ + 30, + 94 + ], + [ + 31, + 95 + ], + [ + 32, + 96 + ], + [ + 33, + 97 + ], + [ + 34, + 98 + ], + [ + 35, + 99 + ], + [ + 36, + 100 + ], + [ + 37, + 101 + ], + [ + 38, + 102 + ], + [ + 39, + 103 + ], + [ + 40, + 104 + ], + [ + 41, + 105 + ], + [ + 42, + 106 + ], + [ + 43, + 107 + ], + [ + 44, + 108 + ], + [ + 45, + 109 + ], + [ + 46, + 110 + ], + [ + 47, + 111 + ], + [ + 48, + 112 + ], + [ + 49, + 113 + ], + [ + 50, + 114 + ], + [ + 51, + 115 + ], + [ + 52, + 116 + ], + [ + 53, + 117 + ], + [ + 54, + 118 + ], + [ + 55, + 119 + ], + [ + 56, + 120 + ], + [ + 57, + 121 + ], + [ + 58, + 122 + ], + [ + 59, + 123 + ], + [ + 60, + 124 + ], + [ + 61, + 125 + ], + [ + 62, + 126 + ], + [ + 63, + 127 + ] + ] + } + } + ] +} diff --git a/nhr@fau/job-archive/cluster-tinygpu.json b/nhr@fau/job-archive/cluster-tinygpu.json new file mode 100644 index 0000000..4e44c5d --- /dev/null +++ b/nhr@fau/job-archive/cluster-tinygpu.json @@ -0,0 +1,2486 @@ +{ + "name": "tinygpu", + "metricConfig": [ + { + "name": "cpu_load", + "unit": { + "base": "" + }, + "scope": "node", + "aggregation": "avg", + "footprint": "avg", + "timestep": 60, + "peak": 72, + "normal": 72, + "caution": 36, + "alert": 20 + }, + { + "name": "cpu_user", + "unit": { + "base": "" + }, + "scope": "hwthread", + "aggregation": "avg", + "timestep": 60, + "peak": 100, + "normal": 50, + "caution": 20, + "alert": 10 + }, + { + "name": "mem_used", + "unit": { + "base": "B", + "prefix": "G" + }, + "scope": "node", + "aggregation": "sum", + "footprint": "max", + "timestep": 60, + "peak": 256, + "normal": 128, + "caution": 200, + "alert": 240 + }, + { + "name": "flops_any", + "unit": { + 
"base": "Flops/s", + "prefix": "G" + }, + "scope": "hwthread", + "aggregation": "sum", + "footprint": "avg", + "timestep": 60, + "peak": 5600, + "normal": 1000, + "caution": 200, + "alert": 50 + }, + { + "name": "flops_sp", + "unit": { + "base": "Flops/s", + "prefix": "G" + }, + "scope": "hwthread", + "aggregation": "sum", + "timestep": 60, + "peak": 5600, + "normal": 1000, + "caution": 200, + "alert": 50 + }, + { + "name": "flops_dp", + "unit": { + "base": "Flops/s", + "prefix": "G" + }, + "scope": "hwthread", + "aggregation": "sum", + "timestep": 60, + "peak": 2300, + "normal": 500, + "caution": 100, + "alert": 50 + }, + { + "name": "mem_bw", + "unit": { + "base": "B/s", + "prefix": "G" + }, + "scope": "socket", + "aggregation": "sum", + "footprint": "avg", + "timestep": 60, + "peak": 350, + "normal": 100, + "caution": 50, + "alert": 10 + }, + { + "name": "clock", + "unit": { + "base": "Hz", + "prefix": "M" + }, + "scope": "hwthread", + "aggregation": "avg", + "timestep": 60, + "peak": 3000, + "normal": 2400, + "caution": 1800, + "alert": 1200 + }, + { + "name": "cpu_power", + "unit": { + "base": "W" + }, + "scope": "socket", + "aggregation": "sum", + "energy": "power", + "timestep": 60, + "peak": 500, + "normal": 250, + "caution": 100, + "alert": 50 + }, + { + "name": "mem_power", + "unit": { + "base": "W" + }, + "scope": "socket", + "aggregation": "sum", + "energy": "power", + "timestep": 60, + "peak": 100, + "normal": 50, + "caution": 20, + "alert": 10 + }, + { + "name": "ipc", + "unit": { + "base": "IPC" + }, + "scope": "hwthread", + "aggregation": "avg", + "timestep": 60, + "peak": 4, + "normal": 2, + "caution": 1, + "alert": 0.5 + }, + { + "name": "vectorization_ratio", + "unit": { + "base": "%" + }, + "scope": "hwthread", + "aggregation": "avg", + "timestep": 60, + "peak": 100, + "normal": 60, + "caution": 40, + "alert": 10 + }, + { + "name": "acc_utilization", + "unit": { + "base": "" + }, + "scope": "accelerator", + "aggregation": "avg", + "footprint": 
"avg", + "timestep": 60, + "peak": 100, + "normal": 80, + "caution": 50, + "alert": 20 + }, + { + "name": "acc_mem_used", + "unit": { + "base": "B", + "prefix": "M" + }, + "scope": "accelerator", + "aggregation": "sum", + "timestep": 60, + "peak": 40000, + "normal": 20000, + "caution": 10000, + "alert": 5000, + "subClusters": [ + { + "name": "a100", + "peak": 160000, + "normal": 120000, + "caution": 80000, + "alert": 40000 + }, + { + "name": "v100", + "peak": 128000, + "normal": 96000, + "caution": 64000, + "alert": 32000 + }, + { + "name": "rtx3080", + "peak": 80000, + "normal": 60000, + "caution": 30000, + "alert": 10000 + }, + { + "name": "rtx2080", + "peak": 44000, + "normal": 33000, + "caution": 22000, + "alert": 11000 + } + ] + }, + { + "name": "acc_power", + "unit": { + "base": "W" + }, + "scope": "accelerator", + "aggregation": "sum", + "timestep": 60, + "peak": 400, + "normal": 200, + "caution": 50, + "alert": 20 + }, + { + "name": "nv_mem_util", + "unit": { + "base": "" + }, + "scope": "accelerator", + "aggregation": "avg", + "timestep": 60, + "peak": 100, + "normal": 80, + "caution": 20, + "alert": 10 + }, + { + "name": "nv_temp", + "unit": { + "base": "B", + "prefix": "G" + }, + "scope": "accelerator", + "aggregation": "sum", + "timestep": 60, + "peak": 40, + "normal": 20, + "caution": 5, + "alert": 2 + }, + { + "name": "nv_sm_clock", + "unit": { + "base": "Hz", + "prefix": "M" + }, + "scope": "accelerator", + "aggregation": "sum", + "timestep": 60, + "peak": 1400, + "normal": 1200, + "caution": 100, + "alert": 50 + }, + { + "name": "nfs4_read", + "unit": { + "base": "IOP", + "prefix": "" + }, + "scope": "node", + "aggregation": "sum", + "timestep": 60, + "peak": 6, + "normal": 4, + "caution": 2, + "alert": 1 + }, + { + "name": "nfs4_write", + "unit": { + "base": "IOP", + "prefix": "" + }, + "scope": "node", + "aggregation": "sum", + "timestep": 60, + "peak": 6, + "normal": 4, + "caution": 2, + "alert": 1 + }, + { + "name": "nfs4_total", + "unit": { +
"base": "IOP", + "prefix": "" + }, + "scope": "node", + "aggregation": "sum", + "timestep": 60, + "peak": 6, + "normal": 4, + "caution": 2, + "alert": 1 + } + ], + "subClusters": [ + { + "name": "rtx3080", + "nodes": "tg0[80-86]", + "processorType": "Intel(R) Xeon(R) Gold 6226R CPU @ 2.90GHz", + "socketsPerNode": 2, + "coresPerSocket": 16, + "threadsPerCore": 2, + "flopRateScalar": { + "unit": { + "base": "F/s", + "prefix": "G" + }, + "value": 111 + }, + "flopRateSimd": { + "unit": { + "base": "F/s", + "prefix": "G" + }, + "value": 787 + }, + "memoryBandwidth": { + "unit": { + "base": "B/s", + "prefix": "G" + }, + "value": 229 + }, + "topology": { + "node": [ + 0, + 32, + 1, + 33, + 2, + 34, + 3, + 35, + 4, + 36, + 5, + 37, + 6, + 38, + 7, + 39, + 8, + 40, + 9, + 41, + 10, + 42, + 11, + 43, + 12, + 44, + 13, + 45, + 14, + 46, + 15, + 47, + 16, + 48, + 17, + 49, + 18, + 50, + 19, + 51, + 20, + 52, + 21, + 53, + 22, + 54, + 23, + 55, + 24, + 56, + 25, + 57, + 26, + 58, + 27, + 59, + 28, + 60, + 29, + 61, + 30, + 62, + 31, + 63 + ], + "socket": [ + [ + 0, + 32, + 1, + 33, + 2, + 34, + 3, + 35, + 4, + 36, + 5, + 37, + 6, + 38, + 7, + 39, + 8, + 40, + 9, + 41, + 10, + 42, + 11, + 43, + 12, + 44, + 13, + 45, + 14, + 46, + 15, + 47 + ], + [ + 16, + 48, + 17, + 49, + 18, + 50, + 19, + 51, + 20, + 52, + 21, + 53, + 22, + 54, + 23, + 55, + 24, + 56, + 25, + 57, + 26, + 58, + 27, + 59, + 28, + 60, + 29, + 61, + 30, + 62, + 31, + 63 + ] + ], + "memoryDomain": [ + [ + 0, + 32, + 1, + 33, + 2, + 34, + 3, + 35, + 4, + 36, + 5, + 37, + 6, + 38, + 7, + 39, + 8, + 40, + 9, + 41, + 10, + 42, + 11, + 43, + 12, + 44, + 13, + 45, + 14, + 46, + 15, + 47 + ], + [ + 16, + 48, + 17, + 49, + 18, + 50, + 19, + 51, + 20, + 52, + 21, + 53, + 22, + 54, + 23, + 55, + 24, + 56, + 25, + 57, + 26, + 58, + 27, + 59, + 28, + 60, + 29, + 61, + 30, + 62, + 31, + 63 + ] + ], + "core": [ + [ + 0, + 32 + ], + [ + 1, + 33 + ], + [ + 2, + 34 + ], + [ + 3, + 35 + ], + [ + 4, + 36 + ], + [ + 5, + 37 + ], + [ + 6,
+ 38 + ], + [ + 7, + 39 + ], + [ + 8, + 40 + ], + [ + 9, + 41 + ], + [ + 10, + 42 + ], + [ + 11, + 43 + ], + [ + 12, + 44 + ], + [ + 13, + 45 + ], + [ + 14, + 46 + ], + [ + 15, + 47 + ], + [ + 16, + 48 + ], + [ + 17, + 49 + ], + [ + 18, + 50 + ], + [ + 19, + 51 + ], + [ + 20, + 52 + ], + [ + 21, + 53 + ], + [ + 22, + 54 + ], + [ + 23, + 55 + ], + [ + 24, + 56 + ], + [ + 25, + 57 + ], + [ + 26, + 58 + ], + [ + 27, + 59 + ], + [ + 28, + 60 + ], + [ + 29, + 61 + ], + [ + 30, + 62 + ], + [ + 31, + 63 + ] + ], + "accelerators": [ + { + "id": "00000000:1a:00.0", + "type": "Nvidia GPU", + "model": "NVIDIA GeForce RTX 3080" + }, + { + "id": "00000000:1b:00.0", + "type": "Nvidia GPU", + "model": "NVIDIA GeForce RTX 3080" + }, + { + "id": "00000000:3d:00.0", + "type": "Nvidia GPU", + "model": "NVIDIA GeForce RTX 3080" + }, + { + "id": "00000000:3e:00.0", + "type": "Nvidia GPU", + "model": "NVIDIA GeForce RTX 3080" + }, + { + "id": "00000000:b1:00.0", + "type": "Nvidia GPU", + "model": "NVIDIA GeForce RTX 3080" + }, + { + "id": "00000000:b2:00.0", + "type": "Nvidia GPU", + "model": "NVIDIA GeForce RTX 3080" + }, + { + "id": "00000000:da:00.0", + "type": "Nvidia GPU", + "model": "NVIDIA GeForce RTX 3080" + }, + { + "id": "00000000:db:00.0", + "type": "Nvidia GPU", + "model": "NVIDIA GeForce RTX 3080" + } + ] + } + }, + { + "name": "rtx2080", + "nodes": "tg060,tg0[65-69],tg06a,tg06b", + "processorType": "Intel(R) Xeon(R) Gold 6134 CPU @ 3.20GHz", + "socketsPerNode": 2, + "coresPerSocket": 8, + "threadsPerCore": 2, + "flopRateScalar": { + "unit": { + "base": "F/s", + "prefix": "G" + }, + "value": 47 + }, + "flopRateSimd": { + "unit": { + "base": "F/s", + "prefix": "G" + }, + "value": 326 + }, + "memoryBandwidth": { + "unit": { + "base": "B/s", + "prefix": "G" + }, + "value": 137 + }, + "topology": { + "node": [ + 0, + 19, + 1, + 17, + 2, + 18, + 3, + 16, + 4, + 20, + 5, + 21, + 6, + 22, + 7, + 23, + 8, + 24, + 9, + 25, + 10, + 26, + 11, + 27, + 12, + 28, + 13, + 29, + 14, + 30, +
15, + 31 + ], + "socket": [ + [ + 0, + 19, + 1, + 17, + 2, + 18, + 3, + 16, + 4, + 20, + 5, + 21, + 6, + 22, + 7, + 23 + ], + [ + 8, + 24, + 9, + 25, + 10, + 26, + 11, + 27, + 12, + 28, + 13, + 29, + 14, + 30, + 15, + 31 + ] + ], + "memoryDomain": [ + [ + 0, + 19, + 1, + 17, + 2, + 18, + 3, + 16, + 4, + 20, + 5, + 21, + 6, + 22, + 7, + 23 + ], + [ + 8, + 24, + 9, + 25, + 10, + 26, + 11, + 27, + 12, + 28, + 13, + 29, + 14, + 30, + 15, + 31 + ] + ], + "core": [ + [ + 0, + 19 + ], + [ + 1, + 17 + ], + [ + 2, + 18 + ], + [ + 3, + 16 + ], + [ + 4, + 20 + ], + [ + 5, + 21 + ], + [ + 6, + 22 + ], + [ + 7, + 23 + ], + [ + 8, + 24 + ], + [ + 9, + 25 + ], + [ + 10, + 26 + ], + [ + 11, + 27 + ], + [ + 12, + 28 + ], + [ + 13, + 29 + ], + [ + 14, + 30 + ], + [ + 15, + 31 + ] + ], + "accelerators": [ + { + "id": "00000000:18:00.0", + "type": "Nvidia GPU", + "model": "NVIDIA GeForce RTX 2080 Ti" + }, + { + "id": "00000000:3b:00.0", + "type": "Nvidia GPU", + "model": "NVIDIA GeForce RTX 2080 Ti" + }, + { + "id": "00000000:86:00.0", + "type": "Nvidia GPU", + "model": "NVIDIA GeForce RTX 2080 Ti" + }, + { + "id": "00000000:af:00.0", + "type": "Nvidia GPU", + "model": "NVIDIA GeForce RTX 2080 Ti" + } + ] + } + }, + { + "name": "a100", + "nodes": "tg0[90-97]", + "processorType": "AMD EPYC 7662 64-Core Processor ", + "socketsPerNode": 2, + "coresPerSocket": 64, + "threadsPerCore": 1, + "flopRateScalar": { + "unit": { + "base": "F/s", + "prefix": "G" + }, + "value": 987 + }, + "flopRateSimd": { + "unit": { + "base": "F/s", + "prefix": "G" + }, + "value": 5660 + }, + "memoryBandwidth": { + "unit": { + "base": "B/s", + "prefix": "G" + }, + "value": 306 + }, + "topology": { + "node": [ + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 11, + 12, + 13, + 14, + 15, + 16, + 17, + 18, + 19, + 20, + 21, + 22, + 23, + 24, + 25, + 26, + 27, + 28, + 29, + 30, + 31, + 32, + 33, + 34, + 35, + 36, + 37, + 38, + 39, + 40, + 41, + 42, + 43, + 44, + 45, + 46, + 47, + 48, + 49, + 50, + 51, + 52, + 
53, + 54, + 55, + 56, + 57, + 58, + 59, + 60, + 61, + 62, + 63, + 64, + 65, + 66, + 67, + 68, + 69, + 70, + 71, + 72, + 73, + 74, + 75, + 76, + 77, + 78, + 79, + 80, + 81, + 82, + 83, + 84, + 85, + 86, + 87, + 88, + 89, + 90, + 91, + 92, + 93, + 94, + 95, + 96, + 97, + 98, + 99, + 100, + 101, + 102, + 103, + 104, + 105, + 106, + 107, + 108, + 109, + 110, + 111, + 112, + 113, + 114, + 115, + 116, + 117, + 118, + 119, + 120, + 121, + 122, + 123, + 124, + 125, + 126, + 127 + ], + "socket": [ + [ + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 11, + 12, + 13, + 14, + 15, + 16, + 17, + 18, + 19, + 20, + 21, + 22, + 23, + 24, + 25, + 26, + 27, + 28, + 29, + 30, + 31, + 32, + 33, + 34, + 35, + 36, + 37, + 38, + 39, + 40, + 41, + 42, + 43, + 44, + 45, + 46, + 47, + 48, + 49, + 50, + 51, + 52, + 53, + 54, + 55, + 56, + 57, + 58, + 59, + 60, + 61, + 62, + 63 + ], + [ + 64, + 65, + 66, + 67, + 68, + 69, + 70, + 71, + 72, + 73, + 74, + 75, + 76, + 77, + 78, + 79, + 80, + 81, + 82, + 83, + 84, + 85, + 86, + 87, + 88, + 89, + 90, + 91, + 92, + 93, + 94, + 95, + 96, + 97, + 98, + 99, + 100, + 101, + 102, + 103, + 104, + 105, + 106, + 107, + 108, + 109, + 110, + 111, + 112, + 113, + 114, + 115, + 116, + 117, + 118, + 119, + 120, + 121, + 122, + 123, + 124, + 125, + 126, + 127 + ] + ], + "memoryDomain": [ + [ + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 11, + 12, + 13, + 14, + 15, + 16, + 17, + 18, + 19, + 20, + 21, + 22, + 23, + 24, + 25, + 26, + 27, + 28, + 29, + 30, + 31 + ], + [ + 32, + 33, + 34, + 35, + 36, + 37, + 38, + 39, + 40, + 41, + 42, + 43, + 44, + 45, + 46, + 47, + 48, + 49, + 50, + 51, + 52, + 53, + 54, + 55, + 56, + 57, + 58, + 59, + 60, + 61, + 62, + 63 + ], + [ + 64, + 65, + 66, + 67, + 68, + 69, + 70, + 71, + 72, + 73, + 74, + 75, + 76, + 77, + 78, + 79, + 80, + 81, + 82, + 83, + 84, + 85, + 86, + 87, + 88, + 89, + 90, + 91, + 92, + 93, + 94, + 95 + ], + [ + 96, + 97, + 98, + 99, + 100, + 101, + 102, + 103, + 104, + 105, + 106, + 107, + 108, +
109, + 110, + 111, + 112, + 113, + 114, + 115, + 116, + 117, + 118, + 119, + 120, + 121, + 122, + 123, + 124, + 125, + 126, + 127 + ] + ], + "core": [ + [ + 0 + ], + [ + 1 + ], + [ + 2 + ], + [ + 3 + ], + [ + 4 + ], + [ + 5 + ], + [ + 6 + ], + [ + 7 + ], + [ + 8 + ], + [ + 9 + ], + [ + 10 + ], + [ + 11 + ], + [ + 12 + ], + [ + 13 + ], + [ + 14 + ], + [ + 15 + ], + [ + 16 + ], + [ + 17 + ], + [ + 18 + ], + [ + 19 + ], + [ + 20 + ], + [ + 21 + ], + [ + 22 + ], + [ + 23 + ], + [ + 24 + ], + [ + 25 + ], + [ + 26 + ], + [ + 27 + ], + [ + 28 + ], + [ + 29 + ], + [ + 30 + ], + [ + 31 + ], + [ + 32 + ], + [ + 33 + ], + [ + 34 + ], + [ + 35 + ], + [ + 36 + ], + [ + 37 + ], + [ + 38 + ], + [ + 39 + ], + [ + 40 + ], + [ + 41 + ], + [ + 42 + ], + [ + 43 + ], + [ + 44 + ], + [ + 45 + ], + [ + 46 + ], + [ + 47 + ], + [ + 48 + ], + [ + 49 + ], + [ + 50 + ], + [ + 51 + ], + [ + 52 + ], + [ + 53 + ], + [ + 54 + ], + [ + 55 + ], + [ + 56 + ], + [ + 57 + ], + [ + 58 + ], + [ + 59 + ], + [ + 60 + ], + [ + 61 + ], + [ + 62 + ], + [ + 63 + ], + [ + 64 + ], + [ + 65 + ], + [ + 66 + ], + [ + 67 + ], + [ + 68 + ], + [ + 69 + ], + [ + 70 + ], + [ + 71 + ], + [ + 72 + ], + [ + 73 + ], + [ + 74 + ], + [ + 75 + ], + [ + 76 + ], + [ + 77 + ], + [ + 78 + ], + [ + 79 + ], + [ + 80 + ], + [ + 81 + ], + [ + 82 + ], + [ + 83 + ], + [ + 84 + ], + [ + 85 + ], + [ + 86 + ], + [ + 87 + ], + [ + 88 + ], + [ + 89 + ], + [ + 90 + ], + [ + 91 + ], + [ + 92 + ], + [ + 93 + ], + [ + 94 + ], + [ + 95 + ], + [ + 96 + ], + [ + 97 + ], + [ + 98 + ], + [ + 99 + ], + [ + 100 + ], + [ + 101 + ], + [ + 102 + ], + [ + 103 + ], + [ + 104 + ], + [ + 105 + ], + [ + 106 + ], + [ + 107 + ], + [ + 108 + ], + [ + 109 + ], + [ + 110 + ], + [ + 111 + ], + [ + 112 + ], + [ + 113 + ], + [ + 114 + ], + [ + 115 + ], + [ + 116 + ], + [ + 117 + ], + [ + 118 + ], + [ + 119 + ], + [ + 120 + ], + [ + 121 + ], + [ + 122 + ], + [ + 123 + ], + [ + 124 + ], + [ + 125 + ], + [ + 126 + ], + [ + 127 + ] + ], + "accelerators": [ + { + "id": 
"00000000:01:00.0", + "type": "Nvidia GPU", + "model": "NVIDIA A100-SXM4-40GB" + }, + { + "id": "00000000:41:00.0", + "type": "Nvidia GPU", + "model": "NVIDIA A100-SXM4-40GB" + }, + { + "id": "00000000:81:00.0", + "type": "Nvidia GPU", + "model": "NVIDIA A100-SXM4-40GB" + }, + { + "id": "00000000:c1:00.0", + "type": "Nvidia GPU", + "model": "NVIDIA A100-SXM4-40GB" + } + ] + } + }, + { + "name": "v100", + "nodes": "tg0[71-74]", + "processorType": "Intel(R) Xeon(R) Gold 6134 CPU @ 3.20GHz", + "socketsPerNode": 2, + "coresPerSocket": 8, + "threadsPerCore": 2, + "flopRateScalar": { + "unit": { + "base": "F/s", + "prefix": "G" + }, + "value": 59 + }, + "flopRateSimd": { + "unit": { + "base": "F/s", + "prefix": "G" + }, + "value": 430 + }, + "memoryBandwidth": { + "unit": { + "base": "B/s", + "prefix": "G" + }, + "value": 177 + }, + "topology": { + "node": [ + 0, + 16, + 1, + 17, + 2, + 18, + 3, + 19, + 4, + 20, + 5, + 21, + 6, + 22, + 7, + 23, + 8, + 24, + 9, + 25, + 10, + 26, + 11, + 27, + 12, + 28, + 13, + 29, + 14, + 30, + 15, + 31 + ], + "socket": [ + [ + 0, + 16, + 1, + 17, + 2, + 18, + 3, + 19, + 4, + 20, + 5, + 21, + 6, + 22, + 7, + 23 + ], + [ + 8, + 24, + 9, + 25, + 10, + 26, + 11, + 27, + 12, + 28, + 13, + 29, + 14, + 30, + 15, + 31 + ] + ], + "memoryDomain": [ + [ + 0, + 16, + 1, + 17, + 2, + 18, + 3, + 19, + 4, + 20, + 5, + 21, + 6, + 22, + 7, + 23 + ], + [ + 8, + 24, + 9, + 25, + 10, + 26, + 11, + 27, + 12, + 28, + 13, + 29, + 14, + 30, + 15, + 31 + ] + ], + "core": [ + [ + 0, + 16 + ], + [ + 1, + 17 + ], + [ + 2, + 18 + ], + [ + 3, + 19 + ], + [ + 4, + 20 + ], + [ + 5, + 21 + ], + [ + 6, + 22 + ], + [ + 7, + 23 + ], + [ + 8, + 24 + ], + [ + 9, + 25 + ], + [ + 10, + 26 + ], + [ + 11, + 27 + ], + [ + 12, + 28 + ], + [ + 13, + 29 + ], + [ + 14, + 30 + ], + [ + 15, + 31 + ] + ], + "accelerators": [ + { + "id": "00000000:18:00.0", + "type": "", + "model": "Tesla V100-PCIE-32GB" + }, + { + "id": "00000000:3b:00.0", + "type": "", + "model": "Tesla V100-PCIE-32GB" +
}, + { + "id": "00000000:86:00.0", + "type": "", + "model": "Tesla V100-PCIE-32GB" + }, + { + "id": "00000000:af:00.0", + "type": "", + "model": "Tesla V100-PCIE-32GB" + } + ] + } + }, + { + "name": "h100", + "nodes": "tg1[00-02]", + "processorType": "AMD EPYC 9354 32-Core Processor", + "socketsPerNode": 2, + "coresPerSocket": 32, + "threadsPerCore": 1, + "flopRateScalar": { + "unit": { + "base": "F/s", + "prefix": "G" + }, + "value": 524 + }, + "flopRateSimd": { + "unit": { + "base": "F/s", + "prefix": "G" + }, + "value": 3174 + }, + "memoryBandwidth": { + "unit": { + "base": "B/s", + "prefix": "G" + }, + "value": 659 + }, + "topology": { + "node": [ + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 11, + 12, + 13, + 14, + 15, + 16, + 17, + 18, + 19, + 20, + 21, + 22, + 23, + 24, + 25, + 26, + 27, + 28, + 29, + 30, + 31, + 32, + 33, + 34, + 35, + 36, + 37, + 38, + 39, + 40, + 41, + 42, + 43, + 44, + 45, + 46, + 47, + 48, + 49, + 50, + 51, + 52, + 53, + 54, + 55, + 56, + 57, + 58, + 59, + 60, + 61, + 62, + 63 + ], + "socket": [ + [ + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 11, + 12, + 13, + 14, + 15, + 16, + 17, + 18, + 19, + 20, + 21, + 22, + 23, + 24, + 25, + 26, + 27, + 28, + 29, + 30, + 31 + ], + [ + 32, + 33, + 34, + 35, + 36, + 37, + 38, + 39, + 40, + 41, + 42, + 43, + 44, + 45, + 46, + 47, + 48, + 49, + 50, + 51, + 52, + 53, + 54, + 55, + 56, + 57, + 58, + 59, + 60, + 61, + 62, + 63 + ] + ], + "memoryDomain": [ + [ + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 11, + 12, + 13, + 14, + 15, + 16, + 17, + 18, + 19, + 20, + 21, + 22, + 23, + 24, + 25, + 26, + 27, + 28, + 29, + 30, + 31 + ], + [ + 32, + 33, + 34, + 35, + 36, + 37, + 38, + 39, + 40, + 41, + 42, + 43, + 44, + 45, + 46, + 47, + 48, + 49, + 50, + 51, + 52, + 53, + 54, + 55, + 56, + 57, + 58, + 59, + 60, + 61, + 62, + 63 + ] + ], + "core": [ + [ + 0 + ], + [ + 1 + ], + [ + 2 + ], + [ + 3 + ], + [ + 4 + ], + [ + 5 + ], + [ + 6 + ], + [ + 7 + ], + [ + 8 + ], + [ + 9 + 
], + [ + 10 + ], + [ + 11 + ], + [ + 12 + ], + [ + 13 + ], + [ + 14 + ], + [ + 15 + ], + [ + 16 + ], + [ + 17 + ], + [ + 18 + ], + [ + 19 + ], + [ + 20 + ], + [ + 21 + ], + [ + 22 + ], + [ + 23 + ], + [ + 24 + ], + [ + 25 + ], + [ + 26 + ], + [ + 27 + ], + [ + 28 + ], + [ + 29 + ], + [ + 30 + ], + [ + 31 + ], + [ + 32 + ], + [ + 33 + ], + [ + 34 + ], + [ + 35 + ], + [ + 36 + ], + [ + 37 + ], + [ + 38 + ], + [ + 39 + ], + [ + 40 + ], + [ + 41 + ], + [ + 42 + ], + [ + 43 + ], + [ + 44 + ], + [ + 45 + ], + [ + 46 + ], + [ + 47 + ], + [ + 48 + ], + [ + 49 + ], + [ + 50 + ], + [ + 51 + ], + [ + 52 + ], + [ + 53 + ], + [ + 54 + ], + [ + 55 + ], + [ + 56 + ], + [ + 57 + ], + [ + 58 + ], + [ + 59 + ], + [ + 60 + ], + [ + 61 + ], + [ + 62 + ], + [ + 63 + ] + ], + "accelerators": [ + { + "id": "00000000:03:00.0", + "type": "Nvidia GPU", + "model": "NVIDIA H100 80GB HBM3" + }, + { + "id": "00000000:04:00.0", + "type": "Nvidia GPU", + "model": "NVIDIA H100 80GB HBM3" + }, + { + "id": "00000000:E3:00.0", + "type": "Nvidia GPU", + "model": "NVIDIA H100 80GB HBM3" + }, + { + "id": "00000000:E4:00.0", + "type": "Nvidia GPU", + "model": "NVIDIA H100 80GB HBM3" + } + ] + } + } + ] +} diff --git a/nhr@fau/job-archive/cluster-woody.json b/nhr@fau/job-archive/cluster-woody.json new file mode 100644 index 0000000..ac66988 --- /dev/null +++ b/nhr@fau/job-archive/cluster-woody.json @@ -0,0 +1,943 @@ +{ + "name": "woody", + "metricConfig": [ + { + "name": "cpu_load", + "unit": { + "base": "" + }, + "scope": "node", + "aggregation": "avg", + "timestep": 60, + "peak": 4, + "normal": 4, + "caution": 4, + "alert": 1, + "footprint": "avg", + "subClusters": [ + { + "name": "icelake", + "peak": 32, + "normal": 32, + "caution": 30, + "footprint": "avg", + "alert": 10 + } + ] + }, + { + "name": "cpu_user", + "unit": { + "base": "" + }, + "scope": "hwthread", + "aggregation": "avg", + "timestep": 60, + "peak": 100, + "normal": 50, + "caution": 20, + "alert": 10 + }, + { + "name": "ipc", + "unit": { 
+ "base": "IPC" + }, + "scope": "node", + "aggregation": "avg", + "timestep": 60, + "peak": 4, + "normal": 2, + "caution": 1, + "alert": 0.25 + }, + { + "name": "mem_used", + "unit": { + "base": "B", + "prefix": "G" + }, + "scope": "node", + "aggregation": "sum", + "footprint": "max", + "timestep": 60, + "peak": 32, + "normal": 16, + "caution": 28, + "alert": 30, + "subClusters": [ + { + "name": "icelake", + "peak": 256, + "footprint": "max", + "normal": 128, + "caution": 200, + "alert": 240 + } + ] + }, + { + "name": "flops_any", + "unit": { + "base": "Flops/s", + "prefix": "G" + }, + "scope": "hwthread", + "aggregation": "sum", + "timestep": 60, + "peak": 112, + "normal": 50, + "caution": 20, + "alert": 10, + "footprint": "avg", + "subClusters": [ + { + "name": "icelake", + "footprint": "avg", + "peak": 2970, + "normal": 1000, + "caution": 100, + "alert": 50 + } + ] + }, + { + "name": "net_bytes_in", + "unit": { + "base": "B/s" + }, + "scope": "node", + "aggregation": "sum", + "timestep": 60, + "peak": 125000000, + "normal": 125000000, + "caution": 200, + "alert": 240 + }, + { + "name": "net_bytes_out", + "unit": { + "base": "B/s" + }, + "scope": "node", + "aggregation": "sum", + "timestep": 60, + "peak": 125000000, + "normal": 125000000, + "caution": 200, + "alert": 240 + }, + { + "name": "flops_dp", + "unit": { + "base": "Flops/s", + "prefix": "G" + }, + "scope": "hwthread", + "aggregation": "sum", + "timestep": 60, + "peak": 56, + "normal": 30, + "caution": 15, + "alert": 5, + "subClusters": [ + { + "name": "icelake", + "peak": 1450, + "normal": 700, + "caution": 100, + "alert": 50 + } + ] + }, + { + "name": "flops_sp", + "unit": { + "base": "Flops/s", + "prefix": "G" + }, + "scope": "hwthread", + "aggregation": "sum", + "timestep": 60, + "peak": 112, + "normal": 50, + "caution": 20, + "alert": 10, + "subClusters": [ + { + "name": "icelake", + "peak": 2970, + "normal": 1000, + "caution": 100, + "alert": 50 + } + ] + }, + { + "name": "mem_bw", + "unit": { + 
"base": "B/s", + "prefix": "G" + }, + "scope": "socket", + "aggregation": "sum", + "timestep": 60, + "peak": 24, + "normal": 10, + "caution": 5, + "alert": 2, + "footprint": "avg", + "subClusters": [ + { + "name": "icelake", + "peak": 350, + "footprint": "avg", + "normal": 100, + "caution": 50, + "alert": 20 + } + ] + }, + { + "name": "clock", + "unit": { + "base": "Hz", + "prefix": "M" + }, + "scope": "hwthread", + "aggregation": "avg", + "timestep": 60, + "peak": 3000, + "normal": 2000, + "caution": 1500, + "alert": 1200 + }, + { + "name": "vectorization_ratio", + "unit": { + "base": "" + }, + "scope": "hwthread", + "aggregation": "avg", + "timestep": 60, + "peak": 100, + "normal": 60, + "caution": 40, + "alert": 10 + }, + { + "name": "nfs4_read", + "unit": { + "base": "IOP", + "prefix": "" + }, + "scope": "node", + "aggregation": "sum", + "timestep": 60, + "peak": 6, + "normal": 4, + "caution": 2, + "alert": 1 + }, + { + "name": "nfs4_total", + "unit": { + "base": "IOP", + "prefix": "" + }, + "scope": "node", + "aggregation": "sum", + "timestep": 60, + "peak": 6, + "normal": 4, + "caution": 2, + "alert": 1 + } + ], + "subClusters": [ + { + "name": "haswell", + "nodes": "w11[27-45,49-63,69-72]", + "processorType": "Intel Xeon E3-1240 v3", + "socketsPerNode": 1, + "coresPerSocket": 4, + "threadsPerCore": 1, + "flopRateScalar": { + "unit": { + "base": "F/s", + "prefix": "G" + }, + "value": 14 + }, + "flopRateSimd": { + "unit": { + "base": "F/s", + "prefix": "G" + }, + "value": 112 + }, + "memoryBandwidth": { + "unit": { + "base": "B/s", + "prefix": "G" + }, + "value": 24 + }, + "topology": { + "node": [ + 0, + 1, + 2, + 3 + ], + "socket": [ + [ + 0, + 1, + 2, + 3 + ] + ], + "memoryDomain": [ + [ + 0, + 1, + 2, + 3 + ] + ], + "core": [ + [ + 0 + ], + [ + 1 + ], + [ + 2 + ], + [ + 3 + ] + ] + } + }, + { + "name": "skylake", + "nodes": "w12[01-08],w13[01-31,33-56]", + "processorType": "Intel Xeon E3-1240 v5 ", + "socketsPerNode": 1, + "coresPerSocket": 4, + 
"threadsPerCore": 1, + "flopRateScalar": { + "unit": { + "base": "F/s", + "prefix": "G" + }, + "value": 14 + }, + "flopRateSimd": { + "unit": { + "base": "F/s", + "prefix": "G" + }, + "value": 112 + }, + "memoryBandwidth": { + "unit": { + "base": "B/s", + "prefix": "G" + }, + "value": 24 + }, + "topology": { + "node": [ + 0, + 1, + 2, + 3 + ], + "socket": [ + [ + 0, + 1, + 2, + 3 + ] + ], + "memoryDomain": [ + [ + 0, + 1, + 2, + 3 + ] + ], + "core": [ + [ + 0 + ], + [ + 1 + ], + [ + 2 + ], + [ + 3 + ] + ] + } + }, + { + "name": "kabylake", + "nodes": "w14[01-56],w15[01-05,07-56]", + "processorType": "Intel Xeon E3-1240 v6", + "socketsPerNode": 1, + "coresPerSocket": 4, + "threadsPerCore": 1, + "flopRateScalar": { + "unit": { + "base": "F/s", + "prefix": "G" + }, + "value": 14 + }, + "flopRateSimd": { + "unit": { + "base": "F/s", + "prefix": "G" + }, + "value": 112 + }, + "memoryBandwidth": { + "unit": { + "base": "B/s", + "prefix": "G" + }, + "value": 24 + }, + "topology": { + "node": [ + 0, + 1, + 2, + 3 + ], + "socket": [ + [ + 0, + 1, + 2, + 3 + ] + ], + "memoryDomain": [ + [ + 0, + 1, + 2, + 3 + ] + ], + "core": [ + [ + 0 + ], + [ + 1 + ], + [ + 2 + ], + [ + 3 + ] + ] + } + }, + { + "name": "icelake", + "nodes": "w22[01-35],w23[01-35],w24[01-20],w25[01-20]", + "processorType": "Intel Xeon Gold 6326", + "socketsPerNode": 2, + "coresPerSocket": 16, + "threadsPerCore": 1, + "flopRateScalar": { + "unit": { + "base": "F/s", + "prefix": "G" + }, + "value": 432 + }, + "flopRateSimd": { + "unit": { + "base": "F/s", + "prefix": "G" + }, + "value": 9216 + }, + "memoryBandwidth": { + "unit": { + "base": "B/s", + "prefix": "G" + }, + "value": 350 + }, + "topology": { + "node": [ + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 11, + 12, + 13, + 14, + 15, + 16, + 17, + 18, + 19, + 20, + 21, + 22, + 23, + 24, + 25, + 26, + 27, + 28, + 29, + 30, + 31, + 32, + 33, + 34, + 35, + 36, + 37, + 38, + 39, + 40, + 41, + 42, + 43, + 44, + 45, + 46, + 47, + 48, + 49, + 50, + 
51, + 52, + 53, + 54, + 55, + 56, + 57, + 58, + 59, + 60, + 61, + 62, + 63, + 64, + 65, + 66, + 67, + 68, + 69, + 70, + 71 + ], + "socket": [ + [ + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 11, + 12, + 13, + 14, + 15, + 16, + 17, + 18, + 19, + 20, + 21, + 22, + 23, + 24, + 25, + 26, + 27, + 28, + 29, + 30, + 31, + 32, + 33, + 34, + 35 + ], + [ + 36, + 37, + 38, + 39, + 40, + 41, + 42, + 43, + 44, + 45, + 46, + 47, + 48, + 49, + 50, + 51, + 52, + 53, + 54, + 55, + 56, + 57, + 58, + 59, + 60, + 61, + 62, + 63, + 64, + 65, + 66, + 67, + 68, + 69, + 70, + 71 + ] + ], + "memoryDomain": [ + [ + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 11, + 12, + 13, + 14, + 15, + 16, + 17 + ], + [ + 18, + 19, + 20, + 21, + 22, + 23, + 24, + 25, + 26, + 27, + 28, + 29, + 30, + 31, + 32, + 33, + 34, + 35 + ], + [ + 36, + 37, + 38, + 39, + 40, + 41, + 42, + 43, + 44, + 45, + 46, + 47, + 48, + 49, + 50, + 51, + 52, + 53 + ], + [ + 54, + 55, + 56, + 57, + 58, + 59, + 60, + 61, + 62, + 63, + 64, + 65, + 66, + 67, + 68, + 69, + 70, + 71 + ] + ], + "core": [ + [ + 0 + ], + [ + 1 + ], + [ + 2 + ], + [ + 3 + ], + [ + 4 + ], + [ + 5 + ], + [ + 6 + ], + [ + 7 + ], + [ + 8 + ], + [ + 9 + ], + [ + 10 + ], + [ + 11 + ], + [ + 12 + ], + [ + 13 + ], + [ + 14 + ], + [ + 15 + ], + [ + 16 + ], + [ + 17 + ], + [ + 18 + ], + [ + 19 + ], + [ + 20 + ], + [ + 21 + ], + [ + 22 + ], + [ + 23 + ], + [ + 24 + ], + [ + 25 + ], + [ + 26 + ], + [ + 27 + ], + [ + 28 + ], + [ + 29 + ], + [ + 30 + ], + [ + 31 + ], + [ + 32 + ], + [ + 33 + ], + [ + 34 + ], + [ + 35 + ], + [ + 36 + ], + [ + 37 + ], + [ + 38 + ], + [ + 39 + ], + [ + 40 + ], + [ + 41 + ], + [ + 42 + ], + [ + 43 + ], + [ + 44 + ], + [ + 45 + ], + [ + 46 + ], + [ + 47 + ], + [ + 48 + ], + [ + 49 + ], + [ + 50 + ], + [ + 51 + ], + [ + 52 + ], + [ + 53 + ], + [ + 54 + ], + [ + 55 + ], + [ + 56 + ], + [ + 57 + ], + [ + 58 + ], + [ + 59 + ], + [ + 60 + ], + [ + 61 + ], + [ + 62 + ], + [ + 63 + ], + [ + 64 + ], + [ + 65 + ], + [ + 66 + 
], + [ + 67 + ], + [ + 68 + ], + [ + 69 + ], + [ + 70 + ], + [ + 71 + ] + ] + } + } + ] +}