Update, cleanup, and reformat

Jan Eitzinger 2024-12-20 09:29:46 +01:00
parent 119050b4b3
commit f39e04979b
Signed by: moebiusband
GPG Key ID: 2574BA29B90D6DD5
9 changed files with 1528 additions and 720 deletions


@ -1,6 +1,12 @@
# Specifications for datastructures, interfaces, and file formats

* **[Generic datastructure specifications](https://github.com/ClusterCockpit/cc-specifications/tree/master/datastructures)**: To be used in applications, as payloads in APIs, or as file formats.
* **[APIs offered by ClusterCockpit](https://github.com/ClusterCockpit/cc-specifications/tree/master/interfaces)**: REST and Influx line-protocol descriptions.
* **[HPC Job-Archive specification](https://github.com/ClusterCockpit/cc-specifications/tree/master/job-archive)**: A directory tree and file format description for a file-based HPC job performance archive.


@ -1,3 +0,0 @@
## GraphQL Schema
This schema is intended for communication between web-frontend and web-backend.


@ -1,303 +0,0 @@
scalar Time
scalar Any
scalar NullableFloat
scalar MetricScope
scalar JobState
type Job {
id: ID!
jobId: Int!
user: String!
project: String!
cluster: String!
subCluster: String!
startTime: Time!
duration: Int!
walltime: Int!
numNodes: Int!
numHWThreads: Int!
numAcc: Int!
SMT: Int!
exclusive: Int!
partition: String!
arrayJobId: Int!
monitoringStatus: Int!
state: JobState!
tags: [Tag!]!
resources: [Resource!]!
concurrentJobs: JobLinkResultList
metaData: Any
userData: User
}
type JobLink {
id: ID!
jobId: Int!
}
type Cluster {
name: String!
partitions: [String!]! # Slurm partitions
metricConfig: [MetricConfig!]!
subClusters: [SubCluster!]! # Hardware partitions/subclusters
}
type SubCluster {
name: String!
nodes: String!
numberOfNodes: Int!
processorType: String!
socketsPerNode: Int!
coresPerSocket: Int!
threadsPerCore: Int!
flopRateScalar: MetricValue!
flopRateSimd: MetricValue!
memoryBandwidth: MetricValue!
topology: Topology!
}
type MetricValue {
unit: Unit!
value: Float!
}
type Topology {
node: [Int!]
socket: [[Int!]!]
memoryDomain: [[Int!]!]
die: [[Int!]!]
core: [[Int!]!]
accelerators: [Accelerator!]
}
type Accelerator {
id: String!
type: String!
model: String!
}
type SubClusterConfig {
name: String!
peak: Float
normal: Float
caution: Float
alert: Float
remove: Boolean
}
type MetricConfig {
name: String!
unit: Unit!
scope: MetricScope!
aggregation: String!
timestep: Int!
peak: Float!
normal: Float
caution: Float!
alert: Float!
subClusters: [SubClusterConfig!]!
}
type Tag {
id: ID!
type: String!
name: String!
}
type Resource {
hostname: String!
hwthreads: [Int!]
accelerators: [String!]
configuration: String
}
type JobMetricWithName {
name: String!
scope: MetricScope!
metric: JobMetric!
}
type JobMetric {
unit: Unit
timestep: Int!
series: [Series!]
statisticsSeries: StatsSeries
}
type Series {
hostname: String!
id: String
statistics: MetricStatistics
data: [NullableFloat!]!
}
type Unit {
base: String!
prefix: String
}
type MetricStatistics {
avg: Float!
min: Float!
max: Float!
}
type StatsSeries {
mean: [NullableFloat!]!
min: [NullableFloat!]!
max: [NullableFloat!]!
}
type MetricFootprints {
metric: String!
data: [NullableFloat!]!
}
type Footprints {
nodehours: [NullableFloat!]!
metrics: [MetricFootprints!]!
}
enum Aggregate { USER, PROJECT, CLUSTER }
enum Weights { NODE_COUNT, NODE_HOURS }
type NodeMetrics {
host: String!
subCluster: String!
metrics: [JobMetricWithName!]!
}
type Count {
name: String!
count: Int!
}
type User {
username: String!
name: String!
email: String!
}
type Query {
clusters: [Cluster!]! # List of all clusters
tags: [Tag!]! # List of all tags
user(username: String!): User
allocatedNodes(cluster: String!): [Count!]!
job(id: ID!): Job
jobMetrics(id: ID!, metrics: [String!], scopes: [MetricScope!]): [JobMetricWithName!]!
jobsFootprints(filter: [JobFilter!], metrics: [String!]!): Footprints
jobs(filter: [JobFilter!], page: PageRequest, order: OrderByInput): JobResultList!
jobsStatistics(filter: [JobFilter!], groupBy: Aggregate): [JobsStatistics!]!
jobsCount(filter: [JobFilter]!, groupBy: Aggregate!, weight: Weights, limit: Int): [Count!]!
rooflineHeatmap(filter: [JobFilter!]!, rows: Int!, cols: Int!, minX: Float!, minY: Float!, maxX: Float!, maxY: Float!): [[Float!]!]!
nodeMetrics(cluster: String!, nodes: [String!], scopes: [MetricScope!], metrics: [String!], from: Time!, to: Time!): [NodeMetrics!]!
}
type Mutation {
createTag(type: String!, name: String!): Tag!
deleteTag(id: ID!): ID!
addTagsToJob(job: ID!, tagIds: [ID!]!): [Tag!]!
removeTagsFromJob(job: ID!, tagIds: [ID!]!): [Tag!]!
updateConfiguration(name: String!, value: String!): String
}
type IntRangeOutput { from: Int!, to: Int! }
type TimeRangeOutput { from: Time!, to: Time! }
input JobFilter {
tags: [ID!]
jobId: StringInput
arrayJobId: Int
user: StringInput
project: StringInput
jobName: StringInput
cluster: StringInput
partition: StringInput
duration: IntRange
minRunningFor: Int
numNodes: IntRange
numAccelerators: IntRange
numHWThreads: IntRange
startTime: TimeRange
state: [JobState!]
flopsAnyAvg: FloatRange
memBwAvg: FloatRange
loadAvg: FloatRange
memUsedMax: FloatRange
exclusive: Int
sharedNode: StringInput
selfJobId: StringInput
selfStartTime: Time
selfDuration: Int
}
input OrderByInput {
field: String!
order: SortDirectionEnum! = ASC
}
enum SortDirectionEnum {
DESC
ASC
}
input StringInput {
eq: String
neq: String
contains: String
startsWith: String
endsWith: String
in: [String!]
}
input IntRange { from: Int!, to: Int! }
input FloatRange { from: Float!, to: Float! }
input TimeRange { from: Time, to: Time }
type JobResultList {
items: [Job!]!
offset: Int
limit: Int
count: Int
}
type JobLinkResultList {
items: [JobLink!]!
count: Int
}
type HistoPoint {
count: Int!
value: Int!
}
type JobsStatistics {
id: ID! # If `groupBy` was used, ID of the user/project/cluster
name: String! # if User-Statistics: Given Name of Account (ID) Owner
totalJobs: Int! # Number of jobs
runningJobs: Int! # Number of running jobs
shortJobs: Int! # Number of jobs with a duration of less than duration
totalWalltime: Int! # Sum of the duration of all matched jobs in hours
totalNodeHours: Int! # Sum of the node hours of all matched jobs
totalCoreHours: Int! # Sum of the core hours of all matched jobs
totalAccHours: Int! # Sum of the gpu hours of all matched jobs
histDuration: [HistoPoint!]! # value: hour, count: number of jobs with a rounded duration of value
histNumNodes: [HistoPoint!]! # value: number of nodes, count: number of jobs with that number of nodes
}
input PageRequest {
itemsPerPage: Int!
page: Int!
}
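
For reference, a query of the kind the web frontend might have issued against this (now removed) schema could look as follows; the cluster name and paging values are illustrative:

```
query {
  jobs(
    filter: [{ cluster: { eq: "testcluster" } }]
    page: { itemsPerPage: 10, page: 1 }
    order: { field: "startTime", order: DESC }
  ) {
    items {
      jobId
      user
      startTime
      duration
      numNodes
      state
    }
    count
  }
}
```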


@ -1,63 +1,75 @@
# InfluxData line-protocol flavor

## Overview

ClusterCockpit uses the [InfluxData line-protocol](https://docs.influxdata.com/influxdb/v2.1/reference/syntax/line-protocol/) for transferring messages between its components. The line-protocol is a text-based representation of a metric/event with a value, time, and describing tags. All metrics/events have the following format (if written to `stdout`):

```
<measurement>,<tag set> <field set> <timestamp>
```

where `<tag set>` and `<field set>` are comma-separated lists of `key=value` entries. As a mental model, think of tags as `indices` in the database for faster lookup and of the `<field set>` as values.
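
For illustration, a single message in this format could look like the following (measurement name, host, value, and timestamp are made-up examples):

```
cpu_load,hostname=node042,type=node,type-id=0 value=1.48 1671525600000000000
```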

**Remark**: In the first iteration, we only sent metrics (number values), but we had to extend the specification to messages with different meanings. The text below was changed accordingly. The update is backward-compatible, so for metrics (number values), nothing changed.

## Line-protocol in the ClusterCockpit ecosystem

In ClusterCockpit we limit the flexibility of the InfluxData line-protocol slightly. The idea is to keep the format evaluatable by different components. Each message is identifiable by the `measurement` (= metric name), the `hostname`, the `type` and, if required, a `type-id`.

### Mandatory tags per message

* `hostname`
* `type`
  * `node`
  * `socket`
  * `die`
  * `memoryDomain`
  * `llc`
  * `core`
  * `hwthread`
  * `accelerator`
* `type-id` for further specifying the type, e.g. the CPU socket or HW thread identifier

Although no `type-id` is required if `type=node`, it is recommended to send `type=node,type-id=0`.
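
As an example, a socket-scoped metric carrying these mandatory tags could look like this (measurement name, value, and timestamp are illustrative):

```
mem_bw,hostname=node042,type=socket,type-id=1 value=81250.0 1671525600000000000
```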

#### Optional tags depending on the message

In some cases, optional tags are required, such as `filesystem`, `device` or `version`. While you are free to do that, the ClusterCockpit components in the stack above will recognize `stype` (= "sub type") and `stype-id`. So `filesystem=/homes` is better specified as `stype=filesystem,stype-id=/homes`.
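
For example, a per-filesystem metric expressed with the recognized sub-type tags might look as follows (measurement name and value are made up):

```
fs_used,hostname=node042,type=node,type-id=0,stype=filesystem,stype-id=/homes value=65.3 1671525600000000000
```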

### Mandatory fields per measurement

* Metric: The field key is always `value`
* Event: The field key is always `event`
* Log message: The field key is always `log`
* Control message: The field key is always `control`

No other field keys are evaluated by the ClusterCockpit ecosystem.

### Message types

There exist different message types in the ClusterCockpit ecosystem, all specified using the InfluxData line-protocol.

#### Metrics

**Identification:** `value=X` field with `X` being a number
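
A minimal metric message (host, value, and timestamp are made up) could look like:

```
flops_dp,hostname=node042,type=hwthread,type-id=3 value=1250000000.0 1671525600000000000
```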

While the measurements (metric names) can be chosen freely, there is a basic set of measurements which should be present as long as you navigate in the ClusterCockpit ecosystem:

* `flops_sp`: Single-precision floating point rate in `Flops/s`
* `flops_dp`: Double-precision floating point rate in `Flops/s`
@ -73,19 +85,17 @@ While the measurements (metric names) can be chosen freely, there is a basic set
For the whole list, see the [job-data schema](../../datastructures/job-data.schema.json).

#### Events

**Identification:** `event="X"` field with `"X"` being a string
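
An event message sketch (measurement name and event text are made up):

```
ecc_error,hostname=node042,type=node,type-id=0 event="corrected ECC error on DIMM 3" 1671525600000000000
```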

#### Controls

**Identification:**

* `control="X"` field with `"X"` being a string
* `method` tag is either `GET` or `PUT`
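
A control message sketch (the measurement name and requested value are made up; note the `method` tag):

```
cpu_frequency,hostname=node042,type=hwthread,type-id=0,method=PUT control="2400000" 1671525600000000000
```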

#### Logs

**Identification:** `log="X"` field with `"X"` being a string
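
A log message sketch (measurement name and text are made up):

```
collector,hostname=node042,type=node,type-id=0 log="failed to read /proc/meminfo" 1671525600000000000
```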


@ -1 +1 @@
# REST API interfaces

File diff suppressed because it is too large.

File diff suppressed because it is too large.


@ -1,4 +0,0 @@
## SQL Database Schema for Job Table
This SQLite schema for an HPC job table is used in cc-backend and is also part
of the ClusterCockpit Job Archive specification.


@ -1,71 +0,0 @@
CREATE TABLE tag (
id INTEGER PRIMARY KEY,
tag_type VARCHAR(255) NOT NULL,
tag_name VARCHAR(255) NOT NULL,
insert_ts TEXT DEFAULT CURRENT_TIMESTAMP,
UNIQUE (tag_type, tag_name));
CREATE TABLE jobtag (
job_id INTEGER,
tag_id INTEGER,
insert_ts TEXT DEFAULT CURRENT_TIMESTAMP,
PRIMARY KEY (job_id, tag_id),
FOREIGN KEY (job_id) REFERENCES job (id) ON DELETE CASCADE,
FOREIGN KEY (tag_id) REFERENCES tag (id) ON DELETE CASCADE);
CREATE TABLE user (
username varchar(255) PRIMARY KEY NOT NULL,
password varchar(255) DEFAULT NULL,
ldap tinyint NOT NULL DEFAULT 0, /* col called "ldap" for historic reasons, fills the "AuthSource" */
name varchar(255) DEFAULT NULL,
roles varchar(255) NOT NULL DEFAULT "[]",
email varchar(255) DEFAULT NULL,
projects varchar(255) NOT NULL DEFAULT "[]");
CREATE TABLE configuration (
username varchar(255),
confkey varchar(255),
value varchar(255),
PRIMARY KEY (username, confkey),
FOREIGN KEY (username) REFERENCES user (username) ON DELETE CASCADE ON UPDATE NO ACTION);
CREATE TABLE job (
id INTEGER PRIMARY KEY,
job_id BIGINT NOT NULL,
cluster VARCHAR(255) NOT NULL,
subcluster VARCHAR(255) NOT NULL,
start_time BIGINT NOT NULL, -- Unix timestamp
user VARCHAR(255) NOT NULL,
project VARCHAR(255) NOT NULL,
partition VARCHAR(255),
array_job_id BIGINT,
duration INT NOT NULL,
walltime INT NOT NULL,
job_state VARCHAR(255) NOT NULL
CHECK(job_state IN ('running', 'completed', 'failed', 'cancelled', 'stopped', 'timeout', 'preempted', 'out_of_memory')),
meta_data TEXT, -- JSON
resources TEXT NOT NULL, -- JSON
num_nodes INT NOT NULL,
num_hwthreads INT,
num_acc INT,
smt TINYINT NOT NULL DEFAULT 1 CHECK(smt IN (0, 1 )),
exclusive TINYINT NOT NULL DEFAULT 1 CHECK(exclusive IN (0, 1, 2)),
monitoring_status TINYINT NOT NULL DEFAULT 1 CHECK(monitoring_status IN (0, 1, 2, 3)),
mem_used_max REAL NOT NULL DEFAULT 0.0,
flops_any_avg REAL NOT NULL DEFAULT 0.0,
mem_bw_avg REAL NOT NULL DEFAULT 0.0,
load_avg REAL NOT NULL DEFAULT 0.0,
net_bw_avg REAL NOT NULL DEFAULT 0.0,
net_data_vol_total REAL NOT NULL DEFAULT 0.0,
file_bw_avg REAL NOT NULL DEFAULT 0.0,
file_data_vol_total REAL NOT NULL DEFAULT 0.0,
UNIQUE (job_id, cluster, start_time));
CREATE INDEX job_stats ON job (cluster,subcluster,user);
CREATE INDEX job_by_user ON job (user);
CREATE INDEX job_by_starttime ON job (start_time);
CREATE INDEX job_by_job_id ON job (job_id, cluster, start_time);
CREATE INDEX job_list ON job (cluster, job_state);
CREATE INDEX job_list_user ON job (user, cluster, job_state);
CREATE INDEX job_list_users ON job (user, job_state);
CREATE INDEX job_list_users_start ON job (start_time, user, job_state);
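
As a usage sketch against this (now removed) schema, a query like the following would list the running jobs of one user on a cluster (user and cluster names are placeholders) and can be served by the `job_list_user` index:

```
SELECT job_id, start_time, num_nodes
FROM job
WHERE user = 'someuser'
  AND cluster = 'testcluster'
  AND job_state = 'running'
ORDER BY start_time DESC;
```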