Update specs

Jan Eitzinger 2022-03-18 15:19:04 +01:00
parent d762e3e52b
commit a77d6e0f31
3 changed files with 102 additions and 10 deletions

job-archive/README.md Normal file

@ -0,0 +1,49 @@
# File-based archive specification for HPC jobs

This is a JSON-file-based exchange format for HPC job meta data and performance metric data.
It consists of two parts:
* a SQLite database schema for job meta data and performance statistics
* a JSON file format together with a directory hierarchy specification

By using an open, portable, and simple file-based specification, job performance
data can be exchanged for research and analysis purposes, and the data can be
archived on disk in a robust way.
## Directory hierarchy and file specification

The job archive has top-level directories named after the clusters. Every
cluster directory must contain one file named `cluster.json` describing the
cluster. The JSON schema for this file is described here. Within each cluster
directory a three-level directory tree is used to organize job files.

To keep the number of entries within a single directory manageable, a tree
approach is used that splits the integer job ID into chunks of 1000.
For the two directory layers this can be computed as follows (code example in Perl):
```perl
# Split the integer job ID into the two directory levels.
my $level1 = int($jobID / 1000);   # e.g. 1034 for job ID 1034871
my $level2 = $jobID % 1000;        # e.g. 871, zero-padded to three digits below
my $dstPath = sprintf("%s/%s/%d/%03d", $trunk, $destdir, $level1, $level2);
```
The last directory level is a Unix epoch timestamp in seconds, which keeps job
paths unique even if job IDs overflow and are reused.

Example: For the job ID 1034871 the directory path is `./1034/871/<timestamp>/`.
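Putting the pieces together, a minimal sketch of the full three-level path under a cluster directory (the archive root, cluster name, and timestamp value below are made up for the example):
```perl
use strict;
use warnings;

# Illustrative values; in practice these come from the scheduler / job metadata.
my $archiveRoot = "./job-archive/emmy";   # hypothetical cluster directory
my $jobID       = 1034871;
my $startTime   = 1647612000;             # hypothetical Unix epoch timestamp

my $level1  = int($jobID / 1000);         # 1034
my $level2  = $jobID % 1000;              # 871
my $jobPath = sprintf("%s/%d/%03d/%d", $archiveRoot, $level1, $level2, $startTime);
# => ./job-archive/emmy/1034/871/1647612000
```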
The job data consists of two files:

* `meta.json`: Contains job meta information and job statistics.
* `data.json`: Contains the complete job data with time series.

The description of the JSON format specification is available as JSON schema.

Metric time series data is stored with a fixed time step, which can be set per
metric. If no value is available for a time series timestamp, `null` must be
entered.
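As a purely illustrative sketch of the fixed time step and `null` convention (the keys `timestep` and `data` are invented for this example and are not the normative field names, which are defined by the JSON schema), a missing measurement is `undef` in Perl and serializes to `null`:
```perl
use strict;
use warnings;
use JSON::PP;   # core module, exports encode_json

# Hypothetical excerpt of a per-metric time series: time step in seconds,
# one value per step, undef where no measurement is available.
my $series = {
    timestep => 60,
    data     => [ 42.1, 40.8, undef, 43.5 ],   # undef becomes null in JSON
};

print encode_json($series), "\n";
# example output (key order may vary):
# {"data":[42.1,40.8,null,43.5],"timestep":60}
```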


@ -0,0 +1,4 @@
## SQL Database Schema for the Job Table

This SQLite schema for an HPC job table is used in cc-backend and is also part
of the ClusterCockpit Job Archive specification.


@ -1,10 +1,49 @@
-CREATE TABLE job ( id INTEGER PRIMARY KEY,
-    job_id TEXT NOT NULL, user_id TEXT NOT NULL, project_id TEXT NOT NULL, cluster_id TEXT NOT NULL,
-    start_time INTEGER NOT NULL, duration INTEGER NOT NULL,
-    walltime INTEGER, job_state TEXT,
-    num_nodes INTEGER NOT NULL, node_list TEXT NOT NULL, has_profile INTEGER NOT NULL,
-    mem_used_max REAL, flops_any_avg REAL, mem_bw_avg REAL, load_avg REAL, net_bw_avg REAL, file_bw_avg REAL);
-CREATE TABLE tag ( id INTEGER PRIMARY KEY, tag_type TEXT, tag_name TEXT);
-CREATE TABLE jobtag ( job_id INTEGER, tag_id INTEGER, PRIMARY KEY (job_id, tag_id),
-    FOREIGN KEY (job_id) REFERENCES job (id) ON DELETE CASCADE ON UPDATE NO ACTION,
-    FOREIGN KEY (tag_id) REFERENCES tag (id) ON DELETE CASCADE ON UPDATE NO ACTION );
+DROP TABLE IF EXISTS jobtag;
+DROP TABLE IF EXISTS job;
+DROP TABLE IF EXISTS tag;
+
+CREATE TABLE job (
+    id INTEGER PRIMARY KEY /*!40101 AUTO_INCREMENT */,
+    job_id BIGINT NOT NULL,
+    cluster VARCHAR(255) NOT NULL,
+    subcluster VARCHAR(255) NOT NULL,
+    start_time BIGINT NOT NULL, -- Unix timestamp
+    user VARCHAR(255) NOT NULL,
+    project VARCHAR(255) NOT NULL,
+    ` + "`partition`" + ` VARCHAR(255) NOT NULL, -- partition is a keyword in mysql -.-
+    array_job_id BIGINT NOT NULL,
+    duration INT NOT NULL DEFAULT 0,
+    walltime INT NOT NULL DEFAULT 0,
+    job_state VARCHAR(255) NOT NULL CHECK(job_state IN ('running', 'completed', 'failed', 'cancelled', 'stopped', 'timeout', 'preempted', 'out_of_memory')),
+    meta_data TEXT,          -- JSON
+    resources TEXT NOT NULL, -- JSON
+    num_nodes INT NOT NULL,
+    num_hwthreads INT NOT NULL,
+    num_acc INT NOT NULL,
+    smt TINYINT NOT NULL DEFAULT 1 CHECK(smt IN (0, 1)),
+    exclusive TINYINT NOT NULL DEFAULT 1 CHECK(exclusive IN (0, 1, 2)),
+    monitoring_status TINYINT NOT NULL DEFAULT 1 CHECK(monitoring_status IN (0, 1, 2, 3)),
+    mem_used_max REAL NOT NULL DEFAULT 0.0,
+    flops_any_avg REAL NOT NULL DEFAULT 0.0,
+    mem_bw_avg REAL NOT NULL DEFAULT 0.0,
+    load_avg REAL NOT NULL DEFAULT 0.0,
+    net_bw_avg REAL NOT NULL DEFAULT 0.0,
+    net_data_vol_total REAL NOT NULL DEFAULT 0.0,
+    file_bw_avg REAL NOT NULL DEFAULT 0.0,
+    file_data_vol_total REAL NOT NULL DEFAULT 0.0);
+
+CREATE TABLE tag (
+    id INTEGER PRIMARY KEY,
+    tag_type VARCHAR(255) NOT NULL,
+    tag_name VARCHAR(255) NOT NULL,
+    CONSTRAINT be_unique UNIQUE (tag_type, tag_name));
+
+CREATE TABLE jobtag (
+    job_id INTEGER,
+    tag_id INTEGER,
+    PRIMARY KEY (job_id, tag_id),
+    FOREIGN KEY (job_id) REFERENCES job (id) ON DELETE CASCADE,
+    FOREIGN KEY (tag_id) REFERENCES tag (id) ON DELETE CASCADE);
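The `jobtag` table is a junction table implementing a many-to-many relation between `job` and `tag`; the `ON DELETE CASCADE` clauses remove the link rows when a job or tag is deleted. A minimal usage sketch with Perl DBI and DBD::SQLite (the database file name and the tag values are made up for the example; SQLite additionally needs `PRAGMA foreign_keys = ON` for the cascades to fire):
```perl
use strict;
use warnings;
use DBI;

# Hypothetical database file created from the schema above.
my $dbh = DBI->connect("dbi:SQLite:dbname=jobs.db", "", "",
    { RaiseError => 1, AutoCommit => 1 });
$dbh->do("PRAGMA foreign_keys = ON");   # enable cascading deletes in SQLite

# Attach an (invented) tag to an existing job row with internal id 1.
$dbh->do("INSERT INTO tag (tag_type, tag_name) VALUES (?, ?)", undef,
    "issue", "memory_bound");
my $tag_id = $dbh->last_insert_id(undef, undef, "tag", "id");
$dbh->do("INSERT INTO jobtag (job_id, tag_id) VALUES (?, ?)", undef, 1, $tag_id);

# List all jobs carrying a given tag via the junction table.
my $rows = $dbh->selectall_arrayref(q{
    SELECT job.job_id, job.cluster, job.start_time
    FROM job
    JOIN jobtag ON jobtag.job_id = job.id
    JOIN tag    ON tag.id = jobtag.tag_id
    WHERE tag.tag_type = ? AND tag.tag_name = ?
}, undef, "issue", "memory_bound");

printf("job %d on %s started at %d\n", @$_) for @$rows;
```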