mirror of
https://github.com/ClusterCockpit/cc-specifications.git
synced 2024-12-26 05:19:05 +01:00
Update specs
This commit is contained in:
parent
d762e3e52b
commit
a77d6e0f31
49
job-archive/README.md
Normal file
49
job-archive/README.md
Normal file
@ -0,0 +1,49 @@
|
|||||||
|
# File based archive specification for HPC jobs
|
||||||
|
|
||||||
|
This is a JSON-file-based exchange format for HPC job meta data and performance metric data.
|
||||||
|
|
||||||
|
It consists of two parts:
|
||||||
|
* a sqlite database schema for job meta data and performance statistics
|
||||||
|
* a json file format together with a directory hierarchy specification
|
||||||
|
|
||||||
|
By using an open, portable and simple specification based on files it is
|
||||||
|
possible to exchange job performance data for research and analysis purposes as
|
||||||
|
well as a robust way for archiving job performance data on disk.
|
||||||
|
|
||||||
|
## Directory hierarchy and file specification
|
||||||
|
|
||||||
|
The job archive has top-level directories named after the clusters. In every
|
||||||
|
cluster directory there must be one file named `cluster.json` describing the
|
||||||
|
cluster. The json schema for this file is described here. Within this directory
|
||||||
|
a three-level directory tree is used to organize job files.
|
||||||
|
|
||||||
|
To manage the number of directories within a single directory a tree approach
|
||||||
|
is used, splitting the integer job ID. The job ID is split in chunks of 1000
|
||||||
|
each.
|
||||||
|
|
||||||
|
For a 2 layer schema this can be achieved with (code example in Perl):
|
||||||
|
|
||||||
|
```perl
|
||||||
|
$level1 = $jobID/1000;
|
||||||
|
$level2 = $jobID%1000;
|
||||||
|
$dstPath = sprintf("%s/%s/%d/%03d", $trunk, $destdir, $level1, $level2);
|
||||||
|
|
||||||
|
```
|
||||||
|
|
||||||
|
The last directory level is the unix epoch timestamp in seconds to allow for
|
||||||
|
overflowing job ids.
|
||||||
|
|
||||||
|
Example:
|
||||||
|
|
||||||
|
For the job ID 1034871 the directory path is ./1034/871/<timestamp>/.
|
||||||
|
|
||||||
|
The job data consists of two files:
|
||||||
|
|
||||||
|
* meta.json: Contains job meta information and job statistics.
|
||||||
|
* data.json: Contains complete job data with time series
|
||||||
|
|
||||||
|
The description of the json format specification is available as json schema.
|
||||||
|
|
||||||
|
Metric time series data is stored with fixed time step. The time step can be
|
||||||
|
set per metric. If no value is available for a metric time series data
|
||||||
|
timestamp, null must be entered.
|
@ -0,0 +1,4 @@
|
|||||||
|
## SQL Database Schema for Job Table
|
||||||
|
|
||||||
|
This sqlite schema for an HPC job table is used in cc-backend and also part of
|
||||||
|
the ClusterCockpit Job Archive specification.
|
@ -1,10 +1,49 @@
|
|||||||
-- Rebuild the schema from scratch: drop the dependent link table first
-- (jobtag references both job and tag), then the referenced tables.
DROP TABLE IF EXISTS jobtag;
DROP TABLE IF EXISTS job;
DROP TABLE IF EXISTS tag;

-- One row per HPC job: scheduler metadata plus aggregated performance
-- statistics (averages / maxima over the whole job runtime).
CREATE TABLE job (
    id INTEGER PRIMARY KEY /*!40101 AUTO_INCREMENT */, -- surrogate key; MySQL auto-increment hint kept for portability
    job_id BIGINT NOT NULL,            -- scheduler-assigned job id (may repeat after an id overflow)
    cluster VARCHAR(255) NOT NULL,
    subcluster VARCHAR(255) NOT NULL,
    start_time BIGINT NOT NULL,        -- Unix timestamp in seconds

    user VARCHAR(255) NOT NULL,
    project VARCHAR(255) NOT NULL,
    `partition` VARCHAR(255) NOT NULL, -- quoted: PARTITION is a reserved keyword in MySQL
    array_job_id BIGINT NOT NULL,
    duration INT NOT NULL DEFAULT 0,   -- job runtime in seconds
    walltime INT NOT NULL DEFAULT 0,   -- requested walltime in seconds
    job_state VARCHAR(255) NOT NULL
        CHECK(job_state IN ('running', 'completed', 'failed', 'cancelled',
                            'stopped', 'timeout', 'preempted', 'out_of_memory')),
    meta_data TEXT,                    -- JSON
    resources TEXT NOT NULL,           -- JSON

    num_nodes INT NOT NULL,
    num_hwthreads INT NOT NULL,
    num_acc INT NOT NULL,              -- number of accelerators
    smt TINYINT NOT NULL DEFAULT 1 CHECK(smt IN (0, 1)),
    exclusive TINYINT NOT NULL DEFAULT 1 CHECK(exclusive IN (0, 1, 2)),
    monitoring_status TINYINT NOT NULL DEFAULT 1 CHECK(monitoring_status IN (0, 1, 2, 3)),

    -- Aggregated performance statistics for the job.
    mem_used_max REAL NOT NULL DEFAULT 0.0,
    flops_any_avg REAL NOT NULL DEFAULT 0.0,
    mem_bw_avg REAL NOT NULL DEFAULT 0.0,
    load_avg REAL NOT NULL DEFAULT 0.0,
    net_bw_avg REAL NOT NULL DEFAULT 0.0,
    net_data_vol_total REAL NOT NULL DEFAULT 0.0,
    file_bw_avg REAL NOT NULL DEFAULT 0.0,
    file_data_vol_total REAL NOT NULL DEFAULT 0.0);
|
||||||
|
|
||||||
|
-- Free-form job tags; each (tag_type, tag_name) pair may exist only once.
CREATE TABLE tag (
    id INTEGER PRIMARY KEY,
    tag_type VARCHAR(255) NOT NULL,
    tag_name VARCHAR(255) NOT NULL,
    CONSTRAINT be_unique UNIQUE (tag_type, tag_name));
|
||||||
|
|
||||||
|
-- Many-to-many link between jobs and tags. Link rows are removed
-- automatically when the referenced job or tag is deleted (ON DELETE CASCADE;
-- note: SQLite only enforces this with PRAGMA foreign_keys = ON).
CREATE TABLE jobtag (
    job_id INTEGER,
    tag_id INTEGER,
    PRIMARY KEY (job_id, tag_id),
    FOREIGN KEY (job_id) REFERENCES job (id) ON DELETE CASCADE,
    FOREIGN KEY (tag_id) REFERENCES tag (id) ON DELETE CASCADE);
|
||||||
|
Loading…
Reference in New Issue
Block a user