2019-10-21 14:50:16 +02:00
|
|
|
{
|
2022-07-27 13:01:15 +02:00
|
|
|
"$schema": "http://json-schema.org/draft/2020-12/schema",
|
2022-10-06 11:17:48 +02:00
|
|
|
"$id": "cluster.schema.json",
|
2022-07-27 09:50:50 +02:00
|
|
|
"title": "HPC cluster description",
|
2019-10-21 14:50:16 +02:00
|
|
|
"description": "Meta data information of a HPC cluster",
|
|
|
|
"type": "object",
|
2023-06-27 15:08:03 +02:00
|
|
|
"properties": {
|
2021-12-17 06:54:16 +01:00
|
|
|
"name": {
|
2019-10-21 14:50:16 +02:00
|
|
|
"description": "The unique identifier of a cluster",
|
|
|
|
"type": "string"
|
|
|
|
},
|
2021-12-17 06:54:16 +01:00
|
|
|
"metricConfig": {
|
2021-04-23 07:14:53 +02:00
|
|
|
"description": "Metric specifications",
|
|
|
|
"type": "array",
|
|
|
|
"items": {
|
|
|
|
"type": "object",
|
2023-06-27 15:08:03 +02:00
|
|
|
"properties": {
|
2021-12-17 06:54:16 +01:00
|
|
|
"name": {
|
2021-04-23 07:14:53 +02:00
|
|
|
"description": "Metric name",
|
|
|
|
"type": "string"
|
|
|
|
},
|
2021-12-17 06:54:16 +01:00
|
|
|
"unit": {
|
2021-04-23 07:14:53 +02:00
|
|
|
"description": "Metric unit",
|
2022-10-06 11:17:48 +02:00
|
|
|
"$ref": "embedfs://unit.schema.json"
|
2021-04-23 07:14:53 +02:00
|
|
|
},
|
2021-12-17 09:38:41 +01:00
|
|
|
"scope": {
|
|
|
|
"description": "Native measurement resolution",
|
|
|
|
"type": "string"
|
|
|
|
},
|
2021-12-17 06:54:16 +01:00
|
|
|
"timestep": {
|
2021-04-23 07:14:53 +02:00
|
|
|
"description": "Frequency of timeseries points",
|
|
|
|
"type": "integer"
|
|
|
|
},
|
2022-05-03 15:58:08 +02:00
|
|
|
"aggregation": {
|
|
|
|
"description": "How the metric is aggregated",
|
|
|
|
"type": "string",
|
|
|
|
"enum": [
|
|
|
|
"sum",
|
2022-07-27 09:50:50 +02:00
|
|
|
"avg"
|
2022-05-03 15:58:08 +02:00
|
|
|
]
|
|
|
|
},
|
2023-06-27 15:08:03 +02:00
|
|
|
"peak": {
|
|
|
|
"description": "Metric peak threshold (Upper metric limit)",
|
|
|
|
"type": "number"
|
|
|
|
},
|
|
|
|
"normal": {
|
|
|
|
"description": "Metric normal threshold",
|
|
|
|
"type": "number"
|
|
|
|
},
|
|
|
|
"caution": {
|
|
|
|
"description": "Metric caution threshold (Suspicious but does not require immediate action)",
|
|
|
|
"type": "number"
|
|
|
|
},
|
|
|
|
"alert": {
|
|
|
|
"description": "Metric alert threshold (Requires immediate action)",
|
|
|
|
"type": "number"
|
|
|
|
},
|
2022-05-06 08:53:08 +02:00
|
|
|
"subClusters": {
|
2022-07-27 09:50:50 +02:00
|
|
|
"description": "Array of cluster hardware partition metric thresholds",
|
2022-05-06 08:53:08 +02:00
|
|
|
"type": "array",
|
|
|
|
"items": {
|
|
|
|
"type": "object",
|
2023-06-27 15:08:03 +02:00
|
|
|
"properties": {
|
2022-05-06 08:53:08 +02:00
|
|
|
"name": {
|
|
|
|
"description": "Hardware partition name",
|
|
|
|
"type": "string"
|
|
|
|
},
|
|
|
|
"peak": {
|
|
|
|
"type": "number"
|
|
|
|
},
|
|
|
|
"normal": {
|
|
|
|
"type": "number"
|
|
|
|
},
|
|
|
|
"caution": {
|
|
|
|
"type": "number"
|
|
|
|
},
|
|
|
|
"alert": {
|
|
|
|
"type": "number"
|
2023-06-27 15:08:03 +02:00
|
|
|
},
|
|
|
|
"remove": {
|
|
|
|
"type": "boolean"
|
2022-05-06 08:53:08 +02:00
|
|
|
}
|
2022-07-27 09:50:50 +02:00
|
|
|
},
|
|
|
|
"required": [
|
2023-06-27 15:08:03 +02:00
|
|
|
"name"
|
2022-07-27 09:50:50 +02:00
|
|
|
]
|
2022-05-06 08:53:08 +02:00
|
|
|
}
|
2022-08-25 17:33:18 +02:00
|
|
|
}
|
|
|
|
},
|
|
|
|
"required": [
|
|
|
|
"name",
|
|
|
|
"unit",
|
|
|
|
"scope",
|
2023-06-27 15:08:03 +02:00
|
|
|
"timestep",
|
|
|
|
"aggregation",
|
|
|
|
"peak",
|
|
|
|
"normal",
|
|
|
|
"caution",
|
|
|
|
"alert"
|
2022-08-25 17:33:18 +02:00
|
|
|
]
|
2022-09-21 15:24:48 +02:00
|
|
|
},
|
|
|
|
"minItems": 1
|
2022-08-25 12:26:13 +02:00
|
|
|
},
|
|
|
|
"subClusters": {
|
|
|
|
"description": "Array of cluster hardware partitions",
|
|
|
|
"type": "array",
|
|
|
|
"items": {
|
|
|
|
"type": "object",
|
2023-06-27 15:08:03 +02:00
|
|
|
"properties": {
|
2022-08-25 12:26:13 +02:00
|
|
|
"name": {
|
|
|
|
"description": "Hardware partition name",
|
|
|
|
"type": "string"
|
|
|
|
},
|
|
|
|
"processorType": {
|
|
|
|
"description": "Processor type",
|
|
|
|
"type": "string"
|
|
|
|
},
|
|
|
|
"socketsPerNode": {
|
|
|
|
"description": "Number of sockets per node",
|
|
|
|
"type": "integer"
|
|
|
|
},
|
|
|
|
"coresPerSocket": {
|
|
|
|
"description": "Number of cores per socket",
|
|
|
|
"type": "integer"
|
|
|
|
},
|
|
|
|
"threadsPerCore": {
|
|
|
|
"description": "Number of SMT threads per core",
|
|
|
|
"type": "integer"
|
|
|
|
},
|
|
|
|
"flopRateScalar": {
|
|
|
|
"description": "Theoretical node peak flop rate for scalar code in GFlops/s",
|
2023-06-27 15:08:03 +02:00
|
|
|
"type": "object",
|
|
|
|
"properties": {
|
|
|
|
"unit": {
|
|
|
|
"description": "Metric unit",
|
|
|
|
"$ref": "embedfs://unit.schema.json"
|
|
|
|
},
|
|
|
|
"value": {
|
|
|
|
"type": "number"
|
|
|
|
}
|
|
|
|
}
|
2022-08-25 12:26:13 +02:00
|
|
|
},
|
|
|
|
"flopRateSimd": {
|
|
|
|
"description": "Theoretical node peak flop rate for SIMD code in GFlops/s",
|
2023-06-27 15:08:03 +02:00
|
|
|
"type": "object",
|
|
|
|
"properties": {
|
|
|
|
"unit": {
|
|
|
|
"description": "Metric unit",
|
|
|
|
"$ref": "embedfs://unit.schema.json"
|
|
|
|
},
|
|
|
|
"value": {
|
|
|
|
"type": "number"
|
|
|
|
}
|
|
|
|
}
|
2022-08-25 12:26:13 +02:00
|
|
|
},
|
|
|
|
"memoryBandwidth": {
|
|
|
|
"description": "Theoretical node peak memory bandwidth in GB/s",
|
2023-06-27 15:08:03 +02:00
|
|
|
"type": "object",
|
|
|
|
"properties": {
|
|
|
|
"unit": {
|
|
|
|
"description": "Metric unit",
|
|
|
|
"$ref": "embedfs://unit.schema.json"
|
|
|
|
},
|
|
|
|
"value": {
|
|
|
|
"type": "number"
|
|
|
|
}
|
|
|
|
}
|
2022-08-25 12:26:13 +02:00
|
|
|
},
|
|
|
|
"nodes": {
|
|
|
|
"description": "Node list expression",
|
|
|
|
"type": "string"
|
|
|
|
},
|
|
|
|
"topology": {
|
|
|
|
"description": "Node topology",
|
|
|
|
"type": "object",
|
2023-06-27 15:08:03 +02:00
|
|
|
"properties": {
|
2022-08-25 12:26:13 +02:00
|
|
|
"node": {
|
|
|
|
"description": "HwTread lists of node",
|
|
|
|
"type": "array",
|
|
|
|
"items": {
|
|
|
|
"type": "integer"
|
|
|
|
}
|
|
|
|
},
|
|
|
|
"socket": {
|
|
|
|
"description": "HwTread lists of sockets",
|
|
|
|
"type": "array",
|
|
|
|
"items": {
|
2021-12-17 06:54:16 +01:00
|
|
|
"type": "array",
|
|
|
|
"items": {
|
|
|
|
"type": "integer"
|
|
|
|
}
|
2022-08-25 12:26:13 +02:00
|
|
|
}
|
|
|
|
},
|
|
|
|
"memoryDomain": {
|
|
|
|
"description": "HwTread lists of memory domains",
|
|
|
|
"type": "array",
|
|
|
|
"items": {
|
2021-12-17 06:54:16 +01:00
|
|
|
"type": "array",
|
|
|
|
"items": {
|
2022-08-25 12:26:13 +02:00
|
|
|
"type": "integer"
|
2021-12-17 06:54:16 +01:00
|
|
|
}
|
2022-08-25 12:26:13 +02:00
|
|
|
}
|
|
|
|
},
|
|
|
|
"die": {
|
|
|
|
"description": "HwTread lists of dies",
|
|
|
|
"type": "array",
|
|
|
|
"items": {
|
2021-12-17 06:54:16 +01:00
|
|
|
"type": "array",
|
|
|
|
"items": {
|
2022-08-25 12:26:13 +02:00
|
|
|
"type": "integer"
|
2021-12-17 06:54:16 +01:00
|
|
|
}
|
2022-08-25 12:26:13 +02:00
|
|
|
}
|
|
|
|
},
|
|
|
|
"core": {
|
|
|
|
"description": "HwTread lists of cores",
|
|
|
|
"type": "array",
|
|
|
|
"items": {
|
2021-12-17 06:54:16 +01:00
|
|
|
"type": "array",
|
|
|
|
"items": {
|
2022-08-25 12:26:13 +02:00
|
|
|
"type": "integer"
|
2021-12-17 06:54:16 +01:00
|
|
|
}
|
2022-08-25 12:26:13 +02:00
|
|
|
}
|
|
|
|
},
|
|
|
|
"accelerators": {
|
|
|
|
"type": "array",
|
|
|
|
"description": "List of of accelerator devices",
|
|
|
|
"items": {
|
|
|
|
"type": "object",
|
|
|
|
"properties": {
|
|
|
|
"id": {
|
|
|
|
"type": "string",
|
|
|
|
"description": "The unique device id"
|
2021-12-17 06:54:16 +01:00
|
|
|
},
|
2022-08-25 12:26:13 +02:00
|
|
|
"type": {
|
|
|
|
"type": "string",
|
|
|
|
"description": "The accelerator type",
|
|
|
|
"enum": [
|
|
|
|
"Nvidia GPU",
|
|
|
|
"AMD GPU",
|
|
|
|
"Intel GPU"
|
|
|
|
]
|
|
|
|
},
|
|
|
|
"model": {
|
|
|
|
"type": "string",
|
|
|
|
"description": "The accelerator model"
|
|
|
|
}
|
|
|
|
},
|
|
|
|
"required": [
|
|
|
|
"id",
|
|
|
|
"type",
|
|
|
|
"model"
|
|
|
|
]
|
|
|
|
}
|
2022-08-25 17:33:18 +02:00
|
|
|
}
|
2022-08-25 12:26:13 +02:00
|
|
|
},
|
2023-06-27 15:08:03 +02:00
|
|
|
"required": [
|
2022-08-25 17:33:18 +02:00
|
|
|
"node",
|
|
|
|
"socket",
|
|
|
|
"memoryDomain"
|
2022-08-25 12:26:13 +02:00
|
|
|
]
|
|
|
|
}
|
|
|
|
},
|
2023-06-27 15:08:03 +02:00
|
|
|
"required": [
|
2022-08-25 12:26:13 +02:00
|
|
|
"name",
|
2023-06-27 15:08:03 +02:00
|
|
|
"nodes",
|
2022-08-25 17:33:18 +02:00
|
|
|
"topology",
|
|
|
|
"processorType",
|
|
|
|
"socketsPerNode",
|
|
|
|
"coresPerSocket",
|
|
|
|
"threadsPerCore",
|
|
|
|
"flopRateScalar",
|
|
|
|
"flopRateSimd",
|
|
|
|
"memoryBandwidth"
|
2022-08-25 12:26:13 +02:00
|
|
|
]
|
2022-09-21 15:24:48 +02:00
|
|
|
},
|
|
|
|
"minItems": 1
|
2022-07-27 09:50:50 +02:00
|
|
|
}
|
2022-08-25 17:33:18 +02:00
|
|
|
},
|
2023-06-27 15:08:03 +02:00
|
|
|
"required": [
|
2022-08-25 17:33:18 +02:00
|
|
|
"name",
|
|
|
|
"metricConfig",
|
|
|
|
"subClusters"
|
|
|
|
]
|
2022-07-27 09:50:50 +02:00
|
|
|
}
|