Update, cleanup, and reformat

Jan Eitzinger 2024-12-20 09:29:46 +01:00
parent 119050b4b3
commit f39e04979b
Signed by: moebiusband
GPG Key ID: 2574BA29B90D6DD5
9 changed files with 1528 additions and 720 deletions


@ -1,6 +1,12 @@
# Specifications for datastructures, interfaces, and file formats

* **[Generic datastructure specifications](https://github.com/ClusterCockpit/cc-specifications/tree/master/datastructures)**: To be used in applications, as payloads in APIs, or as file formats.
* **[APIs offered by ClusterCockpit](https://github.com/ClusterCockpit/cc-specifications/tree/master/interfaces)**: REST and Influx line-protocol descriptions.
* **[HPC Job-Archive specification](https://github.com/ClusterCockpit/cc-specifications/tree/master/job-archive)**: A directory tree and file format description for a file-based HPC job performance archive.


@ -1,3 +0,0 @@
## GraphQL Schema
This schema is intended for communication between web-frontend and web-backend.


@ -1,303 +0,0 @@
scalar Time
scalar Any
scalar NullableFloat
scalar MetricScope
scalar JobState
type Job {
id: ID!
jobId: Int!
user: String!
project: String!
cluster: String!
subCluster: String!
startTime: Time!
duration: Int!
walltime: Int!
numNodes: Int!
numHWThreads: Int!
numAcc: Int!
SMT: Int!
exclusive: Int!
partition: String!
arrayJobId: Int!
monitoringStatus: Int!
state: JobState!
tags: [Tag!]!
resources: [Resource!]!
concurrentJobs: JobLinkResultList
metaData: Any
userData: User
}
type JobLink {
id: ID!
jobId: Int!
}
type Cluster {
name: String!
partitions: [String!]! # Slurm partitions
metricConfig: [MetricConfig!]!
subClusters: [SubCluster!]! # Hardware partitions/subclusters
}
type SubCluster {
name: String!
nodes: String!
numberOfNodes: Int!
processorType: String!
socketsPerNode: Int!
coresPerSocket: Int!
threadsPerCore: Int!
flopRateScalar: MetricValue!
flopRateSimd: MetricValue!
memoryBandwidth: MetricValue!
topology: Topology!
}
type MetricValue {
unit: Unit!
value: Float!
}
type Topology {
node: [Int!]
socket: [[Int!]!]
memoryDomain: [[Int!]!]
die: [[Int!]!]
core: [[Int!]!]
accelerators: [Accelerator!]
}
type Accelerator {
id: String!
type: String!
model: String!
}
type SubClusterConfig {
name: String!
peak: Float
normal: Float
caution: Float
alert: Float
remove: Boolean
}
type MetricConfig {
name: String!
unit: Unit!
scope: MetricScope!
aggregation: String!
timestep: Int!
peak: Float!
normal: Float
caution: Float!
alert: Float!
subClusters: [SubClusterConfig!]!
}
type Tag {
id: ID!
type: String!
name: String!
}
type Resource {
hostname: String!
hwthreads: [Int!]
accelerators: [String!]
configuration: String
}
type JobMetricWithName {
name: String!
scope: MetricScope!
metric: JobMetric!
}
type JobMetric {
unit: Unit
timestep: Int!
series: [Series!]
statisticsSeries: StatsSeries
}
type Series {
hostname: String!
id: String
statistics: MetricStatistics
data: [NullableFloat!]!
}
type Unit {
base: String!
prefix: String
}
type MetricStatistics {
avg: Float!
min: Float!
max: Float!
}
type StatsSeries {
mean: [NullableFloat!]!
min: [NullableFloat!]!
max: [NullableFloat!]!
}
type MetricFootprints {
metric: String!
data: [NullableFloat!]!
}
type Footprints {
nodehours: [NullableFloat!]!
metrics: [MetricFootprints!]!
}
enum Aggregate { USER, PROJECT, CLUSTER }
enum Weights { NODE_COUNT, NODE_HOURS }
type NodeMetrics {
host: String!
subCluster: String!
metrics: [JobMetricWithName!]!
}
type Count {
name: String!
count: Int!
}
type User {
username: String!
name: String!
email: String!
}
type Query {
clusters: [Cluster!]! # List of all clusters
tags: [Tag!]! # List of all tags
user(username: String!): User
allocatedNodes(cluster: String!): [Count!]!
job(id: ID!): Job
jobMetrics(id: ID!, metrics: [String!], scopes: [MetricScope!]): [JobMetricWithName!]!
jobsFootprints(filter: [JobFilter!], metrics: [String!]!): Footprints
jobs(filter: [JobFilter!], page: PageRequest, order: OrderByInput): JobResultList!
jobsStatistics(filter: [JobFilter!], groupBy: Aggregate): [JobsStatistics!]!
jobsCount(filter: [JobFilter]!, groupBy: Aggregate!, weight: Weights, limit: Int): [Count!]!
rooflineHeatmap(filter: [JobFilter!]!, rows: Int!, cols: Int!, minX: Float!, minY: Float!, maxX: Float!, maxY: Float!): [[Float!]!]!
nodeMetrics(cluster: String!, nodes: [String!], scopes: [MetricScope!], metrics: [String!], from: Time!, to: Time!): [NodeMetrics!]!
}
type Mutation {
createTag(type: String!, name: String!): Tag!
deleteTag(id: ID!): ID!
addTagsToJob(job: ID!, tagIds: [ID!]!): [Tag!]!
removeTagsFromJob(job: ID!, tagIds: [ID!]!): [Tag!]!
updateConfiguration(name: String!, value: String!): String
}
type IntRangeOutput { from: Int!, to: Int! }
type TimeRangeOutput { from: Time!, to: Time! }
input JobFilter {
tags: [ID!]
jobId: StringInput
arrayJobId: Int
user: StringInput
project: StringInput
jobName: StringInput
cluster: StringInput
partition: StringInput
duration: IntRange
minRunningFor: Int
numNodes: IntRange
numAccelerators: IntRange
numHWThreads: IntRange
startTime: TimeRange
state: [JobState!]
flopsAnyAvg: FloatRange
memBwAvg: FloatRange
loadAvg: FloatRange
memUsedMax: FloatRange
exclusive: Int
sharedNode: StringInput
selfJobId: StringInput
selfStartTime: Time
selfDuration: Int
}
input OrderByInput {
field: String!
order: SortDirectionEnum! = ASC
}
enum SortDirectionEnum {
DESC
ASC
}
input StringInput {
eq: String
neq: String
contains: String
startsWith: String
endsWith: String
in: [String!]
}
input IntRange { from: Int!, to: Int! }
input FloatRange { from: Float!, to: Float! }
input TimeRange { from: Time, to: Time }
type JobResultList {
items: [Job!]!
offset: Int
limit: Int
count: Int
}
type JobLinkResultList {
items: [JobLink!]!
count: Int
}
type HistoPoint {
count: Int!
value: Int!
}
type JobsStatistics {
id: ID! # If `groupBy` was used, ID of the user/project/cluster
name: String! # if User-Statistics: Given Name of Account (ID) Owner
totalJobs: Int! # Number of jobs
runningJobs: Int! # Number of running jobs
shortJobs: Int! # Number of jobs with a duration of less than duration
totalWalltime: Int! # Sum of the duration of all matched jobs in hours
totalNodeHours: Int! # Sum of the node hours of all matched jobs
totalCoreHours: Int! # Sum of the core hours of all matched jobs
totalAccHours: Int! # Sum of the gpu hours of all matched jobs
histDuration: [HistoPoint!]! # value: hour, count: number of jobs with a rounded duration of value
histNumNodes: [HistoPoint!]! # value: number of nodes, count: number of jobs with that number of nodes
}
input PageRequest {
itemsPerPage: Int!
page: Int!
}
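
For reference, a query of the kind the web frontend might have issued against this (now removed) schema could look as follows; the cluster name and paging values are illustrative:

```
query {
  jobs(
    filter: [{ cluster: { eq: "testcluster" } }]
    page: { itemsPerPage: 10, page: 1 }
    order: { field: "startTime", order: DESC }
  ) {
    items {
      jobId
      user
      startTime
      duration
      numNodes
      state
    }
    count
  }
}
```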


@ -1,63 +1,75 @@
# InfluxData line-protocol flavor

## Overview

ClusterCockpit uses the [InfluxData line-protocol](https://docs.influxdata.com/influxdb/v2.1/reference/syntax/line-protocol/) for transferring messages between its components. The line-protocol is a text-based representation of a metric/event with a value, time, and describing tags. All metrics/events have the following format (if written to `stdout`):

```
<measurement>,<tag set> <field set> <timestamp>
```

where `<tag set>` and `<field set>` are comma-separated lists of `key=value` entries. As a mental model, think of tags as `indices` in the database for faster lookup and of the `<field set>` as values.
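
For illustration, a single message in this format could look like the following (measurement name, host, value, and timestamp are made-up examples):

```
cpu_load,hostname=node042,type=node,type-id=0 value=1.48 1671525600000000000
```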

**Remark**: In the first iteration, we only sent metrics (number values), but we had to extend the specification to messages with different meanings. The text below was changed accordingly. The update is backward-compatible, so for metrics (number values), nothing changed.

## Line-protocol in the ClusterCockpit ecosystem

In ClusterCockpit we limit the flexibility of the InfluxData line-protocol slightly. The idea is to keep the format evaluatable by different components. Each message is identifiable by the `measurement` (= metric name), the `hostname`, the `type` and, if required, a `type-id`.

### Mandatory tags per message

* `hostname`
* `type`
  * `node`
  * `socket`
  * `die`
  * `memoryDomain`
  * `llc`
  * `core`
  * `hwthread`
  * `accelerator`
* `type-id` for further specifying the type, e.g. the CPU socket or HW thread identifier

Although no `type-id` is required if `type=node`, it is recommended to send `type=node,type-id=0`.
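
As an example, a socket-scoped metric carrying these mandatory tags could look like this (measurement name, value, and timestamp are illustrative):

```
mem_bw,hostname=node042,type=socket,type-id=1 value=81250.0 1671525600000000000
```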

#### Optional tags depending on the message

In some cases, optional tags are required, such as `filesystem`, `device` or `version`. While you are free to do that, the ClusterCockpit components in the stack above will recognize `stype` (= "sub type") and `stype-id`. So `filesystem=/homes` is better specified as `stype=filesystem,stype-id=/homes`.
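
For example, a per-filesystem metric expressed with the recognized sub-type tags might look as follows (measurement name and value are made up):

```
fs_used,hostname=node042,type=node,type-id=0,stype=filesystem,stype-id=/homes value=65.3 1671525600000000000
```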

### Mandatory fields per measurement

* Metric: The field key is always `value`
* Event: The field key is always `event`
* Log message: The field key is always `log`
* Control message: The field key is always `control`

No other field keys are evaluated by the ClusterCockpit ecosystem.

### Message types

There exist different message types in the ClusterCockpit ecosystem, all specified using the InfluxData line-protocol.

#### Metrics

**Identification:** `value=X` field with `X` being a number
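
A minimal metric message (host, value, and timestamp are made up) could look like:

```
flops_dp,hostname=node042,type=hwthread,type-id=3 value=1250000000.0 1671525600000000000
```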

While the measurements (metric names) can be chosen freely, there is a basic set of measurements which should be present as long as you navigate in the ClusterCockpit ecosystem:

* `flops_sp`: Single-precision floating point rate in `Flops/s`
* `flops_dp`: Double-precision floating point rate in `Flops/s`
@ -73,19 +85,17 @@ While the measurements (metric names) can be chosen freely, there is a basic set
For the whole list, see the [job-data schema](../../datastructures/job-data.schema.json).

#### Events

**Identification:** `event="X"` field with `"X"` being a string
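
An event message sketch (measurement name and event text are made up):

```
ecc_error,hostname=node042,type=node,type-id=0 event="corrected ECC error on DIMM 3" 1671525600000000000
```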

#### Controls

**Identification:**

* `control="X"` field with `"X"` being a string
* `method` tag is either `GET` or `PUT`
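
A control message sketch (the measurement name and requested value are made up; note the `method` tag):

```
cpu_frequency,hostname=node042,type=hwthread,type-id=0,method=PUT control="2400000" 1671525600000000000
```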

#### Logs

**Identification:** `log="X"` field with `"X"` being a string
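
A log message sketch (measurement name and text are made up):

```
collector,hostname=node042,type=node,type-id=0 log="failed to read /proc/meminfo" 1671525600000000000
```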


@ -1 +1 @@
# REST API interfaces

File diff suppressed because it is too large.

File diff suppressed because it is too large.


@ -1,4 +0,0 @@
## SQL Database Schema for Job Table
This SQLite schema for an HPC job table is used in cc-backend and is also part
of the ClusterCockpit Job Archive specification.


@ -1,71 +0,0 @@
CREATE TABLE tag (
id INTEGER PRIMARY KEY,
tag_type VARCHAR(255) NOT NULL,
tag_name VARCHAR(255) NOT NULL,
insert_ts TEXT DEFAULT CURRENT_TIMESTAMP,
UNIQUE (tag_type, tag_name));
CREATE TABLE jobtag (
job_id INTEGER,
tag_id INTEGER,
insert_ts TEXT DEFAULT CURRENT_TIMESTAMP,
PRIMARY KEY (job_id, tag_id),
FOREIGN KEY (job_id) REFERENCES job (id) ON DELETE CASCADE,
FOREIGN KEY (tag_id) REFERENCES tag (id) ON DELETE CASCADE);
CREATE TABLE user (
username varchar(255) PRIMARY KEY NOT NULL,
password varchar(255) DEFAULT NULL,
ldap tinyint NOT NULL DEFAULT 0, /* col called "ldap" for historic reasons, fills the "AuthSource" */
name varchar(255) DEFAULT NULL,
roles varchar(255) NOT NULL DEFAULT "[]",
email varchar(255) DEFAULT NULL,
projects varchar(255) NOT NULL DEFAULT "[]");
CREATE TABLE configuration (
username varchar(255),
confkey varchar(255),
value varchar(255),
PRIMARY KEY (username, confkey),
FOREIGN KEY (username) REFERENCES user (username) ON DELETE CASCADE ON UPDATE NO ACTION);
CREATE TABLE job (
id INTEGER PRIMARY KEY,
job_id BIGINT NOT NULL,
cluster VARCHAR(255) NOT NULL,
subcluster VARCHAR(255) NOT NULL,
start_time BIGINT NOT NULL, -- Unix timestamp
user VARCHAR(255) NOT NULL,
project VARCHAR(255) NOT NULL,
partition VARCHAR(255),
array_job_id BIGINT,
duration INT NOT NULL,
walltime INT NOT NULL,
job_state VARCHAR(255) NOT NULL
CHECK(job_state IN ('running', 'completed', 'failed', 'cancelled', 'stopped', 'timeout', 'preempted', 'out_of_memory')),
meta_data TEXT, -- JSON
resources TEXT NOT NULL, -- JSON
num_nodes INT NOT NULL,
num_hwthreads INT,
num_acc INT,
smt TINYINT NOT NULL DEFAULT 1 CHECK(smt IN (0, 1 )),
exclusive TINYINT NOT NULL DEFAULT 1 CHECK(exclusive IN (0, 1, 2)),
monitoring_status TINYINT NOT NULL DEFAULT 1 CHECK(monitoring_status IN (0, 1, 2, 3)),
mem_used_max REAL NOT NULL DEFAULT 0.0,
flops_any_avg REAL NOT NULL DEFAULT 0.0,
mem_bw_avg REAL NOT NULL DEFAULT 0.0,
load_avg REAL NOT NULL DEFAULT 0.0,
net_bw_avg REAL NOT NULL DEFAULT 0.0,
net_data_vol_total REAL NOT NULL DEFAULT 0.0,
file_bw_avg REAL NOT NULL DEFAULT 0.0,
file_data_vol_total REAL NOT NULL DEFAULT 0.0,
UNIQUE (job_id, cluster, start_time));
CREATE INDEX job_stats ON job (cluster,subcluster,user);
CREATE INDEX job_by_user ON job (user);
CREATE INDEX job_by_starttime ON job (start_time);
CREATE INDEX job_by_job_id ON job (job_id, cluster, start_time);
CREATE INDEX job_list ON job (cluster, job_state);
CREATE INDEX job_list_user ON job (user, cluster, job_state);
CREATE INDEX job_list_users ON job (user, job_state);
CREATE INDEX job_list_users_start ON job (start_time, user, job_state);
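
As a usage sketch against this (now removed) schema, a query like the following would list the running jobs of one user on a cluster (user and cluster names are placeholders) and can be served by the `job_list_user` index:

```
SELECT job_id, start_time, num_nodes
FROM job
WHERE user = 'someuser'
  AND cluster = 'testcluster'
  AND job_state = 'running'
ORDER BY start_time DESC;
```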