Adopt changes from project Meeting

Cosmetic changes.
Add more metrics
Add job metaData entry.
This commit is contained in:
Jan Eitzinger 2021-12-02 10:45:01 +01:00
parent 344fc6fdef
commit 468ca857a2
3 changed files with 179 additions and 75 deletions

View File

@ -1,7 +1,7 @@
{ {
"$schema": "http://json-schema.org/draft-07/schema#", "$schema": "http://json-schema.org/draft-07/schema#",
"title": "Job metric data", "title": "Job metric data list",
"description": "Meta data information of a HPC job", "description": "Collection of metric data of a HPC job",
"type": "object", "type": "object",
"properties": { "properties": {
"mem_used": { "mem_used": {
@ -29,7 +29,11 @@
"#ref": "https://github.com/RRZE-HPC/HPCJobDatabase/blob/master/json-schema/job-metric-data.schema.json" "#ref": "https://github.com/RRZE-HPC/HPCJobDatabase/blob/master/json-schema/job-metric-data.schema.json"
}, },
"cpu_used": { "cpu_used": {
"description": "CPU core utilization", "description": "CPU active core utilization",
"#ref": "https://github.com/RRZE-HPC/HPCJobDatabase/blob/master/json-schema/job-metric-data.schema.json"
},
"cpu_load": {
"description": "CPU requested core utilization (load 1m)",
"#ref": "https://github.com/RRZE-HPC/HPCJobDatabase/blob/master/json-schema/job-metric-data.schema.json" "#ref": "https://github.com/RRZE-HPC/HPCJobDatabase/blob/master/json-schema/job-metric-data.schema.json"
}, },
"flops_dp": { "flops_dp": {
@ -40,19 +44,27 @@
"description": "Single precision flops rate", "description": "Single precision flops rate",
"#ref": "https://github.com/RRZE-HPC/HPCJobDatabase/blob/master/json-schema/job-metric-data.schema.json" "#ref": "https://github.com/RRZE-HPC/HPCJobDatabase/blob/master/json-schema/job-metric-data.schema.json"
}, },
"rapl_power": { "vectorization_ratio": {
"description": "Fraction of arithmetic instructions using SIMD instructions",
"#ref": "https://github.com/RRZE-HPC/HPCJobDatabase/blob/master/json-schema/job-metric-data.schema.json"
},
"cpu_power": {
"description": "CPU power consumption", "description": "CPU power consumption",
"#ref": "https://github.com/RRZE-HPC/HPCJobDatabase/blob/master/json-schema/job-metric-data.schema.json" "#ref": "https://github.com/RRZE-HPC/HPCJobDatabase/blob/master/json-schema/job-metric-data.schema.json"
}, },
"gpu_used": { "mem_power": {
"description": "Memory power consumption",
"#ref": "https://github.com/RRZE-HPC/HPCJobDatabase/blob/master/json-schema/job-metric-data.schema.json"
},
"acc_utilization": {
"description": "GPU utilization", "description": "GPU utilization",
"#ref": "https://github.com/RRZE-HPC/HPCJobDatabase/blob/master/json-schema/job-metric-data.schema.json" "#ref": "https://github.com/RRZE-HPC/HPCJobDatabase/blob/master/json-schema/job-metric-data.schema.json"
}, },
"gpu_mem_used": { "acc_mem_used": {
"description": "GPU memory capacity used", "description": "GPU memory capacity used",
"#ref": "https://github.com/RRZE-HPC/HPCJobDatabase/blob/master/json-schema/job-metric-data.schema.json" "#ref": "https://github.com/RRZE-HPC/HPCJobDatabase/blob/master/json-schema/job-metric-data.schema.json"
}, },
"gpu_power": { "acc_power": {
"description": "GPU power consumption", "description": "GPU power consumption",
"#ref": "https://github.com/RRZE-HPC/HPCJobDatabase/blob/master/json-schema/job-metric-data.schema.json" "#ref": "https://github.com/RRZE-HPC/HPCJobDatabase/blob/master/json-schema/job-metric-data.schema.json"
}, },
@ -68,60 +80,80 @@
"description": "Ethernet write bandwidth", "description": "Ethernet write bandwidth",
"#ref": "https://github.com/RRZE-HPC/HPCJobDatabase/blob/master/json-schema/job-metric-data.schema.json" "#ref": "https://github.com/RRZE-HPC/HPCJobDatabase/blob/master/json-schema/job-metric-data.schema.json"
}, },
"lustre_read_bw": { "pfs_read_bw": {
"description": "Lustre read bandwidth", "description": "Parallel file system read bandwidth",
"#ref": "https://github.com/RRZE-HPC/HPCJobDatabase/blob/master/json-schema/job-metric-data.schema.json" "#ref": "https://github.com/RRZE-HPC/HPCJobDatabase/blob/master/json-schema/job-metric-data.schema.json"
}, },
"lustre_write_bw": { "pfs_write_bw": {
"description": "Lustre write bandwidth", "description": "Parallel file system write bandwidth",
"#ref": "https://github.com/RRZE-HPC/HPCJobDatabase/blob/master/json-schema/job-metric-data.schema.json" "#ref": "https://github.com/RRZE-HPC/HPCJobDatabase/blob/master/json-schema/job-metric-data.schema.json"
}, },
"lustre_read_req": { "pfs_read_req": {
"description": "Lustre read requests", "description": "Parallel file system read requests",
"#ref": "https://github.com/RRZE-HPC/HPCJobDatabase/blob/master/json-schema/job-metric-data.schema.json" "#ref": "https://github.com/RRZE-HPC/HPCJobDatabase/blob/master/json-schema/job-metric-data.schema.json"
}, },
"lustre_write_req": { "pfs_write_req": {
"description": "Lustre write requests", "description": "Parallel file system write requests",
"#ref": "https://github.com/RRZE-HPC/HPCJobDatabase/blob/master/json-schema/job-metric-data.schema.json" "#ref": "https://github.com/RRZE-HPC/HPCJobDatabase/blob/master/json-schema/job-metric-data.schema.json"
}, },
"lustre_inodes": { "pfs_inodes": {
"description": "Lustre inodes used", "description": "Parallel file system inodes used",
"#ref": "https://github.com/RRZE-HPC/HPCJobDatabase/blob/master/json-schema/job-metric-data.schema.json" "#ref": "https://github.com/RRZE-HPC/HPCJobDatabase/blob/master/json-schema/job-metric-data.schema.json"
}, },
"lustre_accesses": { "pfs_accesses": {
"description": "Lustre open and close", "description": "Parallel file system open and close",
"#ref": "https://github.com/RRZE-HPC/HPCJobDatabase/blob/master/json-schema/job-metric-data.schema.json" "#ref": "https://github.com/RRZE-HPC/HPCJobDatabase/blob/master/json-schema/job-metric-data.schema.json"
}, },
"lustre_fsync": { "pfs_fsync": {
"description": "Lustre fsync", "description": "Parallel file system fsync",
"#ref": "https://github.com/RRZE-HPC/HPCJobDatabase/blob/master/json-schema/job-metric-data.schema.json" "#ref": "https://github.com/RRZE-HPC/HPCJobDatabase/blob/master/json-schema/job-metric-data.schema.json"
}, },
"lustre_create": { "pfs_create": {
"description": "Lustre create", "description": "Parallel file system create",
"#ref": "https://github.com/RRZE-HPC/HPCJobDatabase/blob/master/json-schema/job-metric-data.schema.json" "#ref": "https://github.com/RRZE-HPC/HPCJobDatabase/blob/master/json-schema/job-metric-data.schema.json"
}, },
"lustre_open": { "pfs_open": {
"description": "Lustre open", "description": "Parallel file system open",
"#ref": "https://github.com/RRZE-HPC/HPCJobDatabase/blob/master/json-schema/job-metric-data.schema.json" "#ref": "https://github.com/RRZE-HPC/HPCJobDatabase/blob/master/json-schema/job-metric-data.schema.json"
}, },
"lustre_close": { "pfs_close": {
"description": "Lustre close", "description": "Parallel file system close",
"#ref": "https://github.com/RRZE-HPC/HPCJobDatabase/blob/master/json-schema/job-metric-data.schema.json" "#ref": "https://github.com/RRZE-HPC/HPCJobDatabase/blob/master/json-schema/job-metric-data.schema.json"
}, },
"lustre_seek": { "pfs_seek": {
"description": "Lustre seek", "description": "Parallel file system seek",
"#ref": "https://github.com/RRZE-HPC/HPCJobDatabase/blob/master/json-schema/job-metric-data.schema.json" "#ref": "https://github.com/RRZE-HPC/HPCJobDatabase/blob/master/json-schema/job-metric-data.schema.json"
}, },
"ib_read_bw": { "fs_read_bw": {
"description": "Infiniband read bandwidth", "description": "Local file system read bandwidth",
"#ref": "https://github.com/RRZE-HPC/HPCJobDatabase/blob/master/json-schema/job-metric-data.schema.json" "#ref": "https://github.com/RRZE-HPC/HPCJobDatabase/blob/master/json-schema/job-metric-data.schema.json"
}, },
"ib_write_bw": { "fs_write_bw": {
"description": "Infiniband write bandwidth", "description": "Local file system write bandwidth",
"#ref": "https://github.com/RRZE-HPC/HPCJobDatabase/blob/master/json-schema/job-metric-data.schema.json" "#ref": "https://github.com/RRZE-HPC/HPCJobDatabase/blob/master/json-schema/job-metric-data.schema.json"
}, },
"ib_congestion": { "fs_inodes": {
"description": "Infiniband congestion", "description": "Local file system inodes used",
"#ref": "https://github.com/RRZE-HPC/HPCJobDatabase/blob/master/json-schema/job-metric-data.schema.json"
},
"fs_accesses": {
"description": "Local file system open and close",
"#ref": "https://github.com/RRZE-HPC/HPCJobDatabase/blob/master/json-schema/job-metric-data.schema.json"
},
"ic_rcv_packets": {
"description": "Network interconnect received packets",
"#ref": "https://github.com/RRZE-HPC/HPCJobDatabase/blob/master/json-schema/job-metric-data.schema.json"
},
"ic_send_packets": {
"description": "Network interconnect send packets",
"#ref": "https://github.com/RRZE-HPC/HPCJobDatabase/blob/master/json-schema/job-metric-data.schema.json"
},
"ic_read_bw": {
"description": "Network interconnect read bandwidth",
"#ref": "https://github.com/RRZE-HPC/HPCJobDatabase/blob/master/json-schema/job-metric-data.schema.json"
},
"ic_write_bw": {
"description": "Network interconnect write bandwidth",
"#ref": "https://github.com/RRZE-HPC/HPCJobDatabase/blob/master/json-schema/job-metric-data.schema.json" "#ref": "https://github.com/RRZE-HPC/HPCJobDatabase/blob/master/json-schema/job-metric-data.schema.json"
} }
}, },

View File

@ -8,15 +8,15 @@
"description": "The unique identifier of a job", "description": "The unique identifier of a job",
"type": "integer" "type": "integer"
}, },
"user_id": { "user": {
"description": "The unique identifier of a user", "description": "The unique identifier of a user",
"type": "string" "type": "string"
}, },
"project_id": { "project": {
"description": "The unique identifier of a project", "description": "The unique identifier of a project",
"type": "string" "type": "string"
}, },
"cluster_id": { "cluster": {
"description": "The unique identifier of a cluster", "description": "The unique identifier of a cluster",
"type": "string" "type": "string"
}, },
@ -38,7 +38,7 @@
"type": "integer", "type": "integer",
"exclusiveMinimum": 0 "exclusiveMinimum": 0
}, },
"num_accelerators": { "num_acc": {
"description": "Number of accelerators used", "description": "Number of accelerators used",
"type": "integer", "type": "integer",
"exclusiveMinimum": 0 "exclusiveMinimum": 0
@ -47,9 +47,13 @@
"description": "Job uses only exclusive nodes", "description": "Job uses only exclusive nodes",
"type": "integer" "type": "integer"
}, },
"monitoring_status": {
"description": "State of monitoring system during job run",
"type": "string"
},
"smt": { "smt": {
"description": "Job uses smt feature", "description": "SMT threads used by job",
"type": "boolean" "type": "integer"
}, },
"walltime": { "walltime": {
"description": "Requested walltime of job in seconds", "description": "Requested walltime of job in seconds",
@ -100,16 +104,59 @@
}, },
"accelerators": { "accelerators": {
"type": "array", "type": "array",
"description": "List of of accelerator device addresses", "description": "List of accelerator devices",
"items": { "items": {
"type": "integer" "type": "object",
"properties": {
"id": {
"type": "string",
"description": "The unique device id"
},
"type": {
"type": "string",
"description": "The accelerator type",
"enum": [
"Nvidia GPU",
"AMD GPU",
"Intel GPU"
]
},
"model": {
"type": "string",
"description": "The accelerator model"
} }
}, },
"required": [
"id",
"type",
"model"
]
}
},
"configuration": {
"type": "string",
"description": "The configuration options of the node"
},
"required": [ "required": [
"hostname" "hostname"
] ]
}, },
"minItems": 1 "minItems": 1
}
},
"meta_data": {
"description": "Additional information about the job",
"type": "object",
"properties": {
"job_script": {
"type": "string",
"description": "The batch script of the job"
},
"slurmdata": {
"type": "string",
"description": "Additional Slurm information"
}
}
}, },
"tags": { "tags": {
"description": "List of tags", "description": "List of tags",
@ -139,6 +186,10 @@
"description": "Memory capacity used (required)", "description": "Memory capacity used (required)",
"#ref": "https://github.com/RRZE-HPC/HPCJobDatabase/blob/master/json-schema/job-statistic.schema.json" "#ref": "https://github.com/RRZE-HPC/HPCJobDatabase/blob/master/json-schema/job-statistic.schema.json"
}, },
"cpu_load": {
"description": "CPU requested core utilization (load 1m) (required)",
"#ref": "https://github.com/RRZE-HPC/HPCJobDatabase/blob/master/json-schema/job-statistic.schema.json"
},
"flops_any": { "flops_any": {
"description": "Total flop rate with DP flops scaled up (required)", "description": "Total flop rate with DP flops scaled up (required)",
"#ref": "https://github.com/RRZE-HPC/HPCJobDatabase/blob/master/json-schema/job-statistic.schema.json" "#ref": "https://github.com/RRZE-HPC/HPCJobDatabase/blob/master/json-schema/job-statistic.schema.json"
@ -160,7 +211,7 @@
"#ref": "https://github.com/RRZE-HPC/HPCJobDatabase/blob/master/json-schema/job-statistic.schema.json" "#ref": "https://github.com/RRZE-HPC/HPCJobDatabase/blob/master/json-schema/job-statistic.schema.json"
}, },
"cpu_used": { "cpu_used": {
"description": "CPU core utilization", "description": "CPU active core utilization",
"#ref": "https://github.com/RRZE-HPC/HPCJobDatabase/blob/master/json-schema/job-statistic.schema.json" "#ref": "https://github.com/RRZE-HPC/HPCJobDatabase/blob/master/json-schema/job-statistic.schema.json"
}, },
"flops_dp": { "flops_dp": {
@ -199,66 +250,86 @@
"description": "Ethernet write bandwidth", "description": "Ethernet write bandwidth",
"#ref": "https://github.com/RRZE-HPC/HPCJobDatabase/blob/master/json-schema/job-statistic.schema.json" "#ref": "https://github.com/RRZE-HPC/HPCJobDatabase/blob/master/json-schema/job-statistic.schema.json"
}, },
"lustre_read_bw": { "pfs_read_bw": {
"description": "Lustre read bandwidth", "description": "Parallel file system read bandwidth",
"#ref": "https://github.com/RRZE-HPC/HPCJobDatabase/blob/master/json-schema/job-statistic.schema.json" "#ref": "https://github.com/RRZE-HPC/HPCJobDatabase/blob/master/json-schema/job-statistic.schema.json"
}, },
"lustre_write_bw": { "pfs_write_bw": {
"description": "Lustre write bandwidth", "description": "Parallel file system write bandwidth",
"#ref": "https://github.com/RRZE-HPC/HPCJobDatabase/blob/master/json-schema/job-statistic.schema.json" "#ref": "https://github.com/RRZE-HPC/HPCJobDatabase/blob/master/json-schema/job-statistic.schema.json"
}, },
"lustre_read_req": { "pfs_read_req": {
"description": "Lustre read requests", "description": "Parallel file system read requests",
"#ref": "https://github.com/RRZE-HPC/HPCJobDatabase/blob/master/json-schema/job-statistic.schema.json" "#ref": "https://github.com/RRZE-HPC/HPCJobDatabase/blob/master/json-schema/job-statistic.schema.json"
}, },
"lustre_write_req": { "pfs_write_req": {
"description": "Lustre write requests", "description": "Parallel file system write requests",
"#ref": "https://github.com/RRZE-HPC/HPCJobDatabase/blob/master/json-schema/job-statistic.schema.json" "#ref": "https://github.com/RRZE-HPC/HPCJobDatabase/blob/master/json-schema/job-statistic.schema.json"
}, },
"lustre_inodes": { "pfs_inodes": {
"description": "Lustre inodes used", "description": "Parallel file system inodes used",
"#ref": "https://github.com/RRZE-HPC/HPCJobDatabase/blob/master/json-schema/job-statistic.schema.json" "#ref": "https://github.com/RRZE-HPC/HPCJobDatabase/blob/master/json-schema/job-statistic.schema.json"
}, },
"lustre_accesses": { "pfs_accesses": {
"description": "Lustre open and close", "description": "Parallel file system open and close",
"#ref": "https://github.com/RRZE-HPC/HPCJobDatabase/blob/master/json-schema/job-statistic.schema.json" "#ref": "https://github.com/RRZE-HPC/HPCJobDatabase/blob/master/json-schema/job-statistic.schema.json"
}, },
"lustre_fsync": { "pfs_fsync": {
"description": "Lustre fsync", "description": "Parallel file system fsync",
"#ref": "https://github.com/RRZE-HPC/HPCJobDatabase/blob/master/json-schema/job-statistic.schema.json" "#ref": "https://github.com/RRZE-HPC/HPCJobDatabase/blob/master/json-schema/job-statistic.schema.json"
}, },
"lustre_create": { "pfs_create": {
"description": "Lustre create", "description": "Parallel file system create",
"#ref": "https://github.com/RRZE-HPC/HPCJobDatabase/blob/master/json-schema/job-statistic.schema.json" "#ref": "https://github.com/RRZE-HPC/HPCJobDatabase/blob/master/json-schema/job-statistic.schema.json"
}, },
"lustre_open": { "pfs_open": {
"description": "Lustre open", "description": "Parallel file system open",
"#ref": "https://github.com/RRZE-HPC/HPCJobDatabase/blob/master/json-schema/job-statistic.schema.json" "#ref": "https://github.com/RRZE-HPC/HPCJobDatabase/blob/master/json-schema/job-statistic.schema.json"
}, },
"lustre_close": { "pfs_close": {
"description": "Lustre close", "description": "Parallel file system close",
"#ref": "https://github.com/RRZE-HPC/HPCJobDatabase/blob/master/json-schema/job-statistic.schema.json" "#ref": "https://github.com/RRZE-HPC/HPCJobDatabase/blob/master/json-schema/job-statistic.schema.json"
}, },
"lustre_seek": { "pfs_seek": {
"description": "Lustre seek", "description": "Parallel file system seek",
"#ref": "https://github.com/RRZE-HPC/HPCJobDatabase/blob/master/json-schema/job-statistic.schema.json" "#ref": "https://github.com/RRZE-HPC/HPCJobDatabase/blob/master/json-schema/job-statistic.schema.json"
}, },
"ib_read_bw": { "fs_read_bw": {
"description": "Infiniband read bandwidth", "description": "Local file system read bandwidth",
"#ref": "https://github.com/RRZE-HPC/HPCJobDatabase/blob/master/json-schema/job-statistic.schema.json" "#ref": "https://github.com/RRZE-HPC/HPCJobDatabase/blob/master/json-schema/job-statistic.schema.json"
}, },
"ib_write_bw": { "fs_write_bw": {
"description": "Infiniband write bandwidth", "description": "Local file system write bandwidth",
"#ref": "https://github.com/RRZE-HPC/HPCJobDatabase/blob/master/json-schema/job-statistic.schema.json" "#ref": "https://github.com/RRZE-HPC/HPCJobDatabase/blob/master/json-schema/job-statistic.schema.json"
}, },
"ib_congestion": { "fs_inodes": {
"description": "Infiniband congestion", "description": "Local file system inodes used",
"#ref": "https://github.com/RRZE-HPC/HPCJobDatabase/blob/master/json-schema/job-statistic.schema.json"
},
"fs_accesses": {
"description": "Local file system open and close",
"#ref": "https://github.com/RRZE-HPC/HPCJobDatabase/blob/master/json-schema/job-statistic.schema.json"
},
"ic_rcv_packets": {
"description": "Network interconnect received packets",
"#ref": "https://github.com/RRZE-HPC/HPCJobDatabase/blob/master/json-schema/job-statistic.schema.json"
},
"ic_send_packets": {
"description": "Network interconnect send packets",
"#ref": "https://github.com/RRZE-HPC/HPCJobDatabase/blob/master/json-schema/job-statistic.schema.json"
},
"ic_read_bw": {
"description": "Network interconnect read bandwidth",
"#ref": "https://github.com/RRZE-HPC/HPCJobDatabase/blob/master/json-schema/job-statistic.schema.json"
},
"ic_write_bw": {
"description": "Network interconnect write bandwidth",
"#ref": "https://github.com/RRZE-HPC/HPCJobDatabase/blob/master/json-schema/job-statistic.schema.json" "#ref": "https://github.com/RRZE-HPC/HPCJobDatabase/blob/master/json-schema/job-statistic.schema.json"
} }
}, },
"required": [ "required": [
"mem_used", "mem_used",
"cpu_used", "cpu_load",
"flops_any", "flops_any",
"mem_bw", "mem_bw",
"net_bw", "net_bw",

View File

@ -12,7 +12,8 @@
"enum": [ "enum": [
"node", "node",
"hwthread", "hwthread",
"memory", "core",
"memoryDomain",
"die", "die",
"socket", "socket",
"accelerator" "accelerator"