Update specs

2026-01-08 23:26:16 +01:00 · 2022-03-18 15:19:04 +01:00
parent d762e3e52b
commit a77d6e0f31
3 changed files with 102 additions and 10 deletions
--- a/job-archive/README.md
+++ b/job-archive/README.md
@@ -0,0 +1,49 @@
+# File based archive specification for HPC jobs
+
+This is a JSON-file-based exchange format for HPC job metadata and performance metric data.
+
+It consists of two parts:
+* a sqlite database schema for job meta data and performance statistics
+* a json file format together with a directory hierarchy specification
+
+By using an open, portable and simple file-based specification, it is
+possible to exchange job performance data for research and analysis purposes
+and to archive job performance data robustly on disk.
+
+## Directory hierarchy and file specification
+
+The job archive has top-level directories named after the clusters. In every
+cluster directory there must be one file named `cluster.json` describing the
+cluster. The json schema for this file is described here. Within this directory
+a three-level directory tree is used to organize job files.
+
+To limit the number of entries within a single directory, a tree approach
+is used that splits the integer job ID. The job ID is split into chunks of 1000
+each.
+
+For a two-level split of the job ID this can be achieved with (code example in Perl):
+
+```perl
+# Split the job ID into two directory levels of at most 1000 entries each.
+# int() makes the integer division explicit; a plain "/" yields a float in Perl.
+$level1 = int($jobID / 1000);
+$level2 = $jobID % 1000;
+$dstPath = sprintf("%s/%s/%d/%03d", $trunk, $destdir, $level1, $level2);
+
+```
+
+The last directory level is the unix epoch timestamp in seconds to allow for
+overflowing job ids.
+
+Example:
+
+For the job ID 1034871 the directory path is `./1034/871/<timestamp>/`.
+
+The job data consists of two files:
+
+* meta.json: Contains job meta information and job statistics.
+* data.json: Contains complete job data with time series
+
+The description of the json format specification is available as json schema.
+
+Metric time series data is stored with a fixed time step. The time step can be
+set per metric. If no value is available for a timestamp of a metric time
+series, `null` must be entered.
--- a/schemas/README.md
+++ b/schemas/README.md
@@ -0,0 +1,4 @@
+## SQL Database Schema for Job Table
+
+This SQLite schema for an HPC job table is used in cc-backend and is also part of
+the ClusterCockpit Job Archive specification.
--- a/schemas/jobs-sqlite.sql
+++ b/schemas/jobs-sqlite.sql
@@ -1,10 +1,49 @@
-CREATE TABLE job ( id INTEGER PRIMARY KEY,
- job_id TEXT NOT NULL, user_id TEXT NOT NULL, project_id TEXT NOT NULL, cluster_id TEXT NOT NULL,
- start_time INTEGER NOT NULL, duration INTEGER NOT NULL,
- walltime INTEGER, job_state TEXT,
- num_nodes INTEGER NOT NULL, node_list TEXT NOT NULL, has_profile INTEGER NOT NULL,
- mem_used_max REAL, flops_any_avg REAL, mem_bw_avg REAL, load_avg REAL, net_bw_avg REAL, file_bw_avg REAL);
-CREATE TABLE tag ( id INTEGER PRIMARY KEY, tag_type TEXT, tag_name TEXT);
-CREATE TABLE jobtag ( job_id INTEGER, tag_id INTEGER, PRIMARY KEY (job_id, tag_id),
- FOREIGN KEY (job_id) REFERENCES job (id)  ON DELETE CASCADE ON UPDATE NO ACTION,
- FOREIGN KEY (tag_id) REFERENCES tag (id)  ON DELETE CASCADE ON UPDATE NO ACTION );
+DROP TABLE IF EXISTS jobtag; -- dropped first: its foreign keys reference job and tag
+DROP TABLE IF EXISTS job;
+DROP TABLE IF EXISTS tag;
+
+-- One row per job. Columns commented "JSON" hold serialized JSON text.
+CREATE TABLE job (
+    id                INTEGER PRIMARY KEY /*!40101 AUTO_INCREMENT */, -- MySQL-only conditional comment
+    job_id            BIGINT NOT NULL,
+    cluster           VARCHAR(255) NOT NULL,
+    subcluster        VARCHAR(255) NOT NULL,
+    start_time        BIGINT NOT NULL, -- Unix timestamp
+
+    user              VARCHAR(255) NOT NULL,
+    project           VARCHAR(255) NOT NULL,
+    -- partition is a keyword in MySQL, so the identifier has to be quoted;
+    -- backtick quoting is accepted by both MySQL and SQLite.
+    `partition`       VARCHAR(255) NOT NULL,
+    array_job_id      BIGINT NOT NULL,
+    duration          INT NOT NULL DEFAULT 0,
+    walltime          INT NOT NULL DEFAULT 0,
+    job_state         VARCHAR(255) NOT NULL CHECK(job_state IN ('running', 'completed', 'failed', 'cancelled', 'stopped', 'timeout', 'preempted', 'out_of_memory')),
+    meta_data         TEXT,          -- JSON
+    resources         TEXT NOT NULL, -- JSON
+
+    num_nodes         INT NOT NULL,
+    num_hwthreads     INT NOT NULL,
+    num_acc           INT NOT NULL,
+    -- Small integer codes restricted by CHECK constraints.
+    -- NOTE(review): the meaning of each code is not defined in this file; confirm against the application.
+    smt               TINYINT NOT NULL DEFAULT 1 CHECK(smt               IN (0, 1   )),
+    exclusive         TINYINT NOT NULL DEFAULT 1 CHECK(exclusive         IN (0, 1, 2)),
+    monitoring_status TINYINT NOT NULL DEFAULT 1 CHECK(monitoring_status IN (0, 1, 2, 3)),
+
+    -- Per-job metric statistics; all default to 0.0 when no value is supplied.
+    mem_used_max        REAL NOT NULL DEFAULT 0.0,
+    flops_any_avg       REAL NOT NULL DEFAULT 0.0,
+    mem_bw_avg          REAL NOT NULL DEFAULT 0.0,
+    load_avg            REAL NOT NULL DEFAULT 0.0,
+    net_bw_avg          REAL NOT NULL DEFAULT 0.0,
+    net_data_vol_total  REAL NOT NULL DEFAULT 0.0,
+    file_bw_avg         REAL NOT NULL DEFAULT 0.0,
+    file_data_vol_total REAL NOT NULL DEFAULT 0.0);
+
+-- Tag catalogue: each (tag_type, tag_name) pair may be stored only once.
+CREATE TABLE tag (
+    id       INTEGER      PRIMARY KEY,
+    tag_type VARCHAR(255) NOT NULL,
+    tag_name VARCHAR(255) NOT NULL,
+
+    CONSTRAINT be_unique UNIQUE (tag_type, tag_name)
+);
+
+-- Junction table linking jobs to tags (many-to-many).
+CREATE TABLE jobtag (
+    -- NOT NULL is spelled out because SQLite, unlike MySQL, permits NULL in
+    -- PRIMARY KEY columns of ordinary rowid tables.
+    job_id INTEGER NOT NULL,
+    tag_id INTEGER NOT NULL,
+    PRIMARY KEY (job_id, tag_id),
+    -- Deleting a job or a tag also removes its link rows.
+    -- SQLite enforces foreign keys only when PRAGMA foreign_keys = ON is set on the connection.
+    FOREIGN KEY (job_id) REFERENCES job (id) ON DELETE CASCADE,
+    FOREIGN KEY (tag_id) REFERENCES tag (id) ON DELETE CASCADE);