From fc69a29b34ff44603b385d62a14756c1d7737311 Mon Sep 17 00:00:00 2001 From: Jan Eitzinger Date: Fri, 17 Dec 2021 06:54:16 +0100 Subject: [PATCH] Rework schema * Add partition and topology to cluster * Move accelerator spec from job resources to partitions * Change to consistent camelCase * Fix ref URLS --- schema/json/cluster.schema.json | 228 +++++++++++++----- schema/json/job-meta.schema.json | 138 +++++------ schema/json/job-metric-data.schema.json | 50 ++-- ...json => job-metric-statistics.schema.json} | 0 4 files changed, 243 insertions(+), 173 deletions(-) rename schema/json/{job-statistic.schema.json => job-metric-statistics.schema.json} (100%) diff --git a/schema/json/cluster.schema.json b/schema/json/cluster.schema.json index e7dec8a..b69fa29 100644 --- a/schema/json/cluster.schema.json +++ b/schema/json/cluster.schema.json @@ -4,43 +4,15 @@ "description": "Meta data information of a HPC cluster", "type": "object", "properties":{ - "ClusterId": { + "name": { "description": "The unique identifier of a cluster", "type": "string" }, - "ProcessorType": { - "description": "Processor type", - "type": "string" - }, - "SocketsPerNode": { - "description": "Number of sockets per node", - "type": "integer" - }, - "CoresPerSocket": { - "description": "Number of cores per socket", - "type": "integer" - }, - "ThreadsPerCore": { - "description": "Number of SMT threads per core", - "type": "integer" - }, - "FlopRateScalar": { - "description": "Theoretical node peak flop rate for scalar code in GFlops/s", - "type": "integer" - }, - "FlopRateSimd": { - "description": "Theoretical node peak flop rate for SIMD code in GFlops/s", - "type": "integer" - }, - "MemoryBandwidth": { - "description": "Theoretical node peak memory bandwidth in GB/s", - "type": "integer" - }, - "MetricDataRepository": { + "metricDataRepository": { "description": "Type of the metric data repository for this cluster", "type": "object", "properties": { - "Kind": { + "kind": { "type": "string", "enum": [ 
"influxdb-v1", @@ -49,71 +21,193 @@ "cc-metric-store" ] }, - "Url": { + "url": { "type": "string" }, - "Token": { + "token": { "type": "string" } }, "required": [ - "Kind", - "Url" + "kind", + "url" ] }, - "MetricConfig": { + "metricConfig": { "description": "Metric specifications", "type": "array", "items": { "type": "object", "properties":{ - "Name": { + "name": { "description": "Metric name", "type": "string" }, - "Unit": { + "unit": { "description": "Metric unit", "type": "string" }, - "Timestep": { + "timestep": { "description": "Frequency of timeseries points", "type": "integer" }, - "Peak": { + "peak": { "type": "number" }, - "Normal": { + "normal": { "type": "number" }, - "Caution": { + "caution": { "type": "number" }, - "Alert": { + "alert": { "type": "number" } } }, - "required":[ - "Name", - "Unit", - "Timestep", - "Peak", - "Normal", - "Caution", - "Alert" - ] - } - }, - "required":[ - "ClusterId", - "ProcessorType", - "SocketsPerNode", - "CoresPerSocket", - "ThreadsPerCore", - "FlopRateScalar", - "FlopRateSimd", - "MemoryBandwidth", - "MetricDataRepository", - "MetricConfig" - ] -} + "partitions": { + "description": "Array of cluster partitions", + "type": "array", + "items": { + "type": "object", + "properties":{ + "name": { + "description": "Partition name", + "type": "string" + }, + "processorType": { + "description": "Processor type", + "type": "string" + }, + "socketsPerNode": { + "description": "Number of sockets per node", + "type": "integer" + }, + "coresPerSocket": { + "description": "Number of cores per socket", + "type": "integer" + }, + "threadsPerCore": { + "description": "Number of SMT threads per core", + "type": "integer" + }, + "flopRateScalar": { + "description": "Theoretical node peak flop rate for scalar code in GFlops/s", + "type": "integer" + }, + "flopRateSimd": { + "description": "Theoretical node peak flop rate for SIMD code in GFlops/s", + "type": "integer" + }, + "memoryBandwidth": { + "description": "Theoretical node peak 
memory bandwidth in GB/s", + "type": "integer" + }, + "topology": { + "description": "Node topology", + "type": "object", + "properties":{ + "node": { + "description": "HwThread lists of node", + "type": "array", + "items": { + "type": "integer" + } + }, + "socket": { + "description": "HwThread lists of sockets", + "type": "array", + "items": { + "type": "array", + "items": { + "type": "integer" + } + } + }, + "memoryDomain": { + "description": "HwThread lists of memory domains", + "type": "array", + "items": { + "type": "array", + "items": { + "type": "integer" + } + } + }, + "die": { + "description": "HwThread lists of dies", + "type": "array", + "items": { + "type": "array", + "items": { + "type": "integer" + } + } + }, + "core": { + "description": "HwThread lists of cores", + "type": "array", + "items": { + "type": "array", + "items": { + "type": "integer" + } + } + }, + "accelerators": { + "type": "array", + "description": "List of accelerator devices", + "items": { + "type": "object", + "properties": { + "id": { + "type": "string", + "description": "The unique device id" + }, + "type": { + "type": "string", + "description": "The accelerator type", + "enum": [ + "Nvidia GPU", + "AMD GPU", + "Intel GPU" + ] + }, + "model": { + "type": "string", + "description": "The accelerator model" + } + }, + "required": [ + "id", + "type", + "model" + ] + } + }, + "required":[ + "node", + "socket", + "memoryDomain" + ] + }, + "required":[ + "name", + "topology", + "processorType", + "socketsPerNode", + "coresPerSocket", + "threadsPerCore", + "flopRateScalar", + "flopRateSimd", + "memoryBandwidth" + ] + } + }, + "required":[ + "name", + "metricDataRepository", + "metricConfig", + "partitions" + ] + } diff --git a/schema/json/job-meta.schema.json b/schema/json/job-meta.schema.json index 6f42273..1ba0c80 100644 --- a/schema/json/job-meta.schema.json +++ b/schema/json/job-meta.schema.json @@ -4,7 +4,7 @@ "description": "Meta data information of a HPC job", "type": "object", 
"properties": { - "job_id": { + "jobId": { "description": "The unique identifier of a job", "type": "integer" }, @@ -21,24 +21,24 @@ "type": "string" }, "partition": { - "description": "The queue to which the job was submitted", + "description": "The cluster partition to which the job was submitted", "type": "string" }, - "array_job_id": { + "arrayJobId": { "description": "The unique identifier of an array job", "type": "integer" }, - "num_nodes": { + "numNodes": { "description": "Number of nodes used", "type": "integer", "exclusiveMinimum": 0 }, - "num_hwthreads": { + "numHwthreads": { "description": "Number of HWThreads used", "type": "integer", "exclusiveMinimum": 0 }, - "num_acc": { + "numAcc": { "description": "Number of accelerators used", "type": "integer", "exclusiveMinimum": 0 @@ -47,7 +47,7 @@ "description": "Job uses only exclusive nodes", "type": "integer" }, - "monitoring_status": { + "monitoringStatus": { "description": "State of monitoring system during job run", "type": "integer" }, @@ -60,7 +60,7 @@ "type": "integer", "exclusiveMinimum": 0 }, - "job_state": { + "jobState": { "description": "Final state of job", "type": "string", "enum": [ @@ -71,12 +71,12 @@ "timeout" ] }, - "start_time": { + "startTime": { "description": "Start epoch time stamp in seconds", "type": "integer", "exclusiveMinimum": 0 }, - "stop_time": { + "stopTime": { "description": "Stop epoch time stamp in seconds", "type": "integer", "exclusiveMinimum": 0 @@ -104,33 +104,9 @@ }, "accelerators": { "type": "array", - "description": "List of of accelerator devices", + "description": "List of accelerator ids", "items": { - "type": "object", - "properties": { - "id": { - "type": "string", - "description": "The unique device id" - }, - "type": { - "type": "string", - "description": "The accelerator type", - "enum": [ - "Nvidia GPU", - "AMD GPU", - "Intel GPU" - ] - }, - "model": { - "type": "string", - "description": "The accelerator model" - } - }, - "required": [ - "id", - "type", 
- "model" - ] + "type": "integer" } }, "configuration": { @@ -144,11 +120,11 @@ "minItems": 1 } }, - "meta_data": { + "metaData": { "description": "Additional information about the job", "type": "object", "properties": { - "job_script": { + "jobScript": { "type": "string", "description": "The batch script of the job" }, @@ -184,147 +160,147 @@ "properties": { "mem_used": { "description": "Memory capacity used (required)", - "#ref": "https://github.com/ClusterCockpit/cc-specifications/blob/master/schema/json/job-metric-data.schema.json" + "#ref": "https://github.com/ClusterCockpit/cc-specifications/blob/master/schema/json/job-metric-statistics.schema.json" }, "cpu_load": { "description": "CPU requested core utilization (load 1m) (required)", - "#ref": "https://github.com/ClusterCockpit/cc-specifications/blob/master/schema/json/job-metric-data.schema.json" + "#ref": "https://github.com/ClusterCockpit/cc-specifications/blob/master/schema/json/job-metric-statistics.schema.json" }, "flops_any": { "description": "Total flop rate with DP flops scaled up (required)", - "#ref": "https://github.com/ClusterCockpit/cc-specifications/blob/master/schema/json/job-metric-data.schema.json" + "#ref": "https://github.com/ClusterCockpit/cc-specifications/blob/master/schema/json/job-metric-statistics.schema.json" }, "mem_bw": { "description": "Main memory bandwidth (required)", - "#ref": "https://github.com/ClusterCockpit/cc-specifications/blob/master/schema/json/job-metric-data.schema.json" + "#ref": "https://github.com/ClusterCockpit/cc-specifications/blob/master/schema/json/job-metric-statistics.schema.json" }, "net_bw": { "description": "Total fast interconnect network bandwidth (required)", - "#ref": "https://github.com/ClusterCockpit/cc-specifications/blob/master/schema/json/job-metric-data.schema.json" + "#ref": "https://github.com/ClusterCockpit/cc-specifications/blob/master/schema/json/job-metric-statistics.schema.json" }, "file_bw": { "description": "Total file IO bandwidth 
(required)", - "#ref": "https://github.com/ClusterCockpit/cc-specifications/blob/master/schema/json/job-metric-data.schema.json" + "#ref": "https://github.com/ClusterCockpit/cc-specifications/blob/master/schema/json/job-metric-statistics.schema.json" }, "ipc": { "description": "Instructions executed per cycle", - "#ref": "https://github.com/ClusterCockpit/cc-specifications/blob/master/schema/json/job-metric-data.schema.json" + "#ref": "https://github.com/ClusterCockpit/cc-specifications/blob/master/schema/json/job-metric-statistics.schema.json" }, "cpu_used": { "description": "CPU active core utilization", - "#ref": "https://github.com/ClusterCockpit/cc-specifications/blob/master/schema/json/job-metric-data.schema.json" + "#ref": "https://github.com/ClusterCockpit/cc-specifications/blob/master/schema/json/job-metric-statistics.schema.json" }, "flops_dp": { "description": "Double precision flop rate", - "#ref": "https://github.com/ClusterCockpit/cc-specifications/blob/master/schema/json/job-metric-data.schema.json" + "#ref": "https://github.com/ClusterCockpit/cc-specifications/blob/master/schema/json/job-metric-statistics.schema.json" }, "flops_sp": { "description": "Single precision flops rate", - "#ref": "https://github.com/ClusterCockpit/cc-specifications/blob/master/schema/json/job-metric-data.schema.json" + "#ref": "https://github.com/ClusterCockpit/cc-specifications/blob/master/schema/json/job-metric-statistics.schema.json" }, "rapl_power": { "description": "CPU power consumption", - "#ref": "https://github.com/ClusterCockpit/cc-specifications/blob/master/schema/json/job-metric-data.schema.json" + "#ref": "https://github.com/ClusterCockpit/cc-specifications/blob/master/schema/json/job-metric-statistics.schema.json" }, "gpu_used": { "description": "GPU utilization", - "#ref": "https://github.com/ClusterCockpit/cc-specifications/blob/master/schema/json/job-metric-data.schema.json" + "#ref": 
"https://github.com/ClusterCockpit/cc-specifications/blob/master/schema/json/job-metric-statistics.schema.json" }, "gpu_mem_used": { "description": "GPU memory capacity used", - "#ref": "https://github.com/ClusterCockpit/cc-specifications/blob/master/schema/json/job-metric-data.schema.json" + "#ref": "https://github.com/ClusterCockpit/cc-specifications/blob/master/schema/json/job-metric-statistics.schema.json" }, "gpu_power": { "description": "GPU power consumption", - "#ref": "https://github.com/ClusterCockpit/cc-specifications/blob/master/schema/json/job-metric-data.schema.json" + "#ref": "https://github.com/ClusterCockpit/cc-specifications/blob/master/schema/json/job-metric-statistics.schema.json" }, "clock": { "description": "Average core frequency", - "#ref": "https://github.com/ClusterCockpit/cc-specifications/blob/master/schema/json/job-metric-data.schema.json" + "#ref": "https://github.com/ClusterCockpit/cc-specifications/blob/master/schema/json/job-metric-statistics.schema.json" }, "eth_read_bw": { "description": "Ethernet read bandwidth", - "#ref": "https://github.com/ClusterCockpit/cc-specifications/blob/master/schema/json/job-metric-data.schema.json" + "#ref": "https://github.com/ClusterCockpit/cc-specifications/blob/master/schema/json/job-metric-statistics.schema.json" }, "eth_write_bw": { "description": "Ethernet write bandwidth", - "#ref": "https://github.com/ClusterCockpit/cc-specifications/blob/master/schema/json/job-metric-data.schema.json" + "#ref": "https://github.com/ClusterCockpit/cc-specifications/blob/master/schema/json/job-metric-statistics.schema.json" }, "pfs_read_bw": { "description": "Parallel file system read bandwidth", - "#ref": "https://github.com/ClusterCockpit/cc-specifications/blob/master/schema/json/job-metric-data.schema.json" + "#ref": "https://github.com/ClusterCockpit/cc-specifications/blob/master/schema/json/job-metric-statistics.schema.json" }, "pfs_write_bw": { "description": "Parallel file system write bandwidth", - 
"#ref": "https://github.com/ClusterCockpit/cc-specifications/blob/master/schema/json/job-metric-data.schema.json" + "#ref": "https://github.com/ClusterCockpit/cc-specifications/blob/master/schema/json/job-metric-statistics.schema.json" }, "pfs_read_req": { "description": "Parallel file system read requests", - "#ref": "https://github.com/ClusterCockpit/cc-specifications/blob/master/schema/json/job-metric-data.schema.json" + "#ref": "https://github.com/ClusterCockpit/cc-specifications/blob/master/schema/json/job-metric-statistics.schema.json" }, "pfs_write_req": { "description": "Parallel file system write requests", - "#ref": "https://github.com/ClusterCockpit/cc-specifications/blob/master/schema/json/job-metric-data.schema.json" + "#ref": "https://github.com/ClusterCockpit/cc-specifications/blob/master/schema/json/job-metric-statistics.schema.json" }, "pfs_inodes": { "description": "Parallel file system inodes used", - "#ref": "https://github.com/ClusterCockpit/cc-specifications/blob/master/schema/json/job-metric-data.schema.json" + "#ref": "https://github.com/ClusterCockpit/cc-specifications/blob/master/schema/json/job-metric-statistics.schema.json" }, "pfs_accesses": { "description": "Parallel file system open and close", - "#ref": "https://github.com/ClusterCockpit/cc-specifications/blob/master/schema/json/job-metric-data.schema.json" + "#ref": "https://github.com/ClusterCockpit/cc-specifications/blob/master/schema/json/job-metric-statistics.schema.json" }, "pfs_fsync": { "description": "Parallel file system fsync", - "#ref": "https://github.com/ClusterCockpit/cc-specifications/blob/master/schema/json/job-metric-data.schema.json" + "#ref": "https://github.com/ClusterCockpit/cc-specifications/blob/master/schema/json/job-metric-statistics.schema.json" }, "pfs_create": { "description": "Parallel file system create", - "#ref": "https://github.com/ClusterCockpit/cc-specifications/blob/master/schema/json/job-metric-data.schema.json" + "#ref": 
"https://github.com/ClusterCockpit/cc-specifications/blob/master/schema/json/job-metric-statistics.schema.json" }, "pfs_open": { "description": "Parallel file system open", - "#ref": "https://github.com/ClusterCockpit/cc-specifications/blob/master/schema/json/job-metric-data.schema.json" + "#ref": "https://github.com/ClusterCockpit/cc-specifications/blob/master/schema/json/job-metric-statistics.schema.json" }, "pfs_close": { "description": "Parallel file system close", - "#ref": "https://github.com/ClusterCockpit/cc-specifications/blob/master/schema/json/job-metric-data.schema.json" + "#ref": "https://github.com/ClusterCockpit/cc-specifications/blob/master/schema/json/job-metric-statistics.schema.json" }, "pfs_seek": { "description": "Parallel file system seek", - "#ref": "https://github.com/ClusterCockpit/cc-specifications/blob/master/schema/json/job-metric-data.schema.json" + "#ref": "https://github.com/ClusterCockpit/cc-specifications/blob/master/schema/json/job-metric-statistics.schema.json" }, "fs_read_bw": { "description": "Local file system read bandwidth", - "#ref": "https://github.com/ClusterCockpit/cc-specifications/blob/master/schema/json/job-metric-data.schema.json" + "#ref": "https://github.com/ClusterCockpit/cc-specifications/blob/master/schema/json/job-metric-statistics.schema.json" }, "fs_write_bw": { "description": "Local file system write bandwidth", - "#ref": "https://github.com/ClusterCockpit/cc-specifications/blob/master/schema/json/job-metric-data.schema.json" + "#ref": "https://github.com/ClusterCockpit/cc-specifications/blob/master/schema/json/job-metric-statistics.schema.json" }, "fs_inodes": { "description": "Local file system inodes used", - "#ref": "https://github.com/ClusterCockpit/cc-specifications/blob/master/schema/json/job-metric-data.schema.json" + "#ref": "https://github.com/ClusterCockpit/cc-specifications/blob/master/schema/json/job-metric-statistics.schema.json" }, "fs_accesses": { "description": "Local file system open and 
close", - "#ref": "https://github.com/ClusterCockpit/cc-specifications/blob/master/schema/json/job-metric-data.schema.json" + "#ref": "https://github.com/ClusterCockpit/cc-specifications/blob/master/schema/json/job-metric-statistics.schema.json" }, "ic_rcv_packets": { "description": "Network interconnect read packets", - "#ref": "https://github.com/ClusterCockpit/cc-specifications/blob/master/schema/json/job-metric-data.schema.json" + "#ref": "https://github.com/ClusterCockpit/cc-specifications/blob/master/schema/json/job-metric-statistics.schema.json" }, "ic_send_packets": { "description": "Network interconnect send packet", - "#ref": "https://github.com/ClusterCockpit/cc-specifications/blob/master/schema/json/job-metric-data.schema.json" + "#ref": "https://github.com/ClusterCockpit/cc-specifications/blob/master/schema/json/job-metric-statistics.schema.json" }, "ic_read_bw": { "description": "Network interconnect read bandwidth", - "#ref": "https://github.com/ClusterCockpit/cc-specifications/blob/master/schema/json/job-metric-data.schema.json" + "#ref": "https://github.com/ClusterCockpit/cc-specifications/blob/master/schema/json/job-metric-statistics.schema.json" }, "ic_write_bw": { "description": "Network interconnect write bandwidth", - "#ref": "https://github.com/ClusterCockpit/cc-specifications/blob/master/schema/json/job-metric-data.schema.json" + "#ref": "https://github.com/ClusterCockpit/cc-specifications/blob/master/schema/json/job-metric-statistics.schema.json" } }, "required": [ @@ -338,13 +314,13 @@ } }, "required": [ - "job_id", - "user_id", - "project_id", - "cluster_id", - "num_nodes", - "start_time", - "stop_time", + "jobId", + "user", + "project", + "cluster", + "numNodes", + "startTime", + "stopTime", "duration", "resources", "tags", diff --git a/schema/json/job-metric-data.schema.json b/schema/json/job-metric-data.schema.json index 6dc88bd..8cfff49 100644 --- a/schema/json/job-metric-data.schema.json +++ b/schema/json/job-metric-data.schema.json 
@@ -4,10 +4,10 @@ "description": "Metric data of a HPC job", "type": "object", "properties": { - "Unit": { - "type": "string" + "unit": { + "#ref": "https://github.com/ClusterCockpit/cc-specifications/blob/master/schema/json/unit.schema.json" }, - "Scope": { + "scope": { "type": "string", "enum": [ "node", @@ -19,62 +19,62 @@ "accelerator" ] }, - "Timestep": { + "timestep": { "description": "Measurement interval in seconds", "type": "integer" }, - "Thresholds": { + "thresholds": { "description": "Metric thresholds for specific system", "type": "object", "properties": { - "Peak": { + "peak": { "type": "number" }, - "Normal": { + "normal": { "type": "number" }, - "Caution": { + "caution": { "type": "number" }, - "Alert": { + "alert": { "type": "number" } } }, - "Series": { + "series": { "type": "array", "items": { "type": "object", "properties": { - "Hostname": { + "hostname": { "type": "string" }, - "Id": { + "id": { "type": "integer" }, - "Statistics": { + "statistics": { "type": "object", "properties": { - "Avg": { + "avg": { "description": "Series average", "type": "number", "minimum": 0 }, - "Min": { + "min": { "description": "Series minimum", "type": "number", "minimum": 0 }, - "Max": { + "max": { "description": "Series maximum", "type": "number", "minimum": 0 } }, "required": [ - "Avg", - "Min", - "Max" + "avg", + "min", + "max" ] }, "data": { @@ -87,16 +87,16 @@ } }, "required": [ - "Hostname", - "Data" + "hostname", + "data" ] } } }, "required": [ - "Unit", - "Scope", - "Timestep", - "Series" + "unit", + "scope", + "timestep", + "series" ] } diff --git a/schema/json/job-statistic.schema.json b/schema/json/job-metric-statistics.schema.json similarity index 100% rename from schema/json/job-statistic.schema.json rename to schema/json/job-metric-statistics.schema.json