mirror of
https://github.com/ClusterCockpit/cc-specifications.git
synced 2025-07-23 21:31:42 +02:00
Update, cleanup, and reformat
This commit is contained in:
@@ -1,3 +0,0 @@
|
||||
## GraphQL Schema
|
||||
|
||||
This schema is intended for communication between web-frontend and web-backend.
|
@@ -1,303 +0,0 @@
|
||||
scalar Time
|
||||
scalar Any
|
||||
|
||||
scalar NullableFloat
|
||||
scalar MetricScope
|
||||
scalar JobState
|
||||
|
||||
type Job {
|
||||
id: ID!
|
||||
jobId: Int!
|
||||
user: String!
|
||||
project: String!
|
||||
cluster: String!
|
||||
subCluster: String!
|
||||
startTime: Time!
|
||||
duration: Int!
|
||||
walltime: Int!
|
||||
numNodes: Int!
|
||||
numHWThreads: Int!
|
||||
numAcc: Int!
|
||||
SMT: Int!
|
||||
exclusive: Int!
|
||||
partition: String!
|
||||
arrayJobId: Int!
|
||||
monitoringStatus: Int!
|
||||
state: JobState!
|
||||
tags: [Tag!]!
|
||||
resources: [Resource!]!
|
||||
concurrentJobs: JobLinkResultList
|
||||
|
||||
metaData: Any
|
||||
userData: User
|
||||
}
|
||||
|
||||
type JobLink {
|
||||
id: ID!
|
||||
jobId: Int!
|
||||
}
|
||||
|
||||
type Cluster {
|
||||
name: String!
|
||||
partitions: [String!]! # Slurm partitions
|
||||
metricConfig: [MetricConfig!]!
|
||||
subClusters: [SubCluster!]! # Hardware partitions/subclusters
|
||||
}
|
||||
|
||||
type SubCluster {
|
||||
name: String!
|
||||
nodes: String!
|
||||
numberOfNodes: Int!
|
||||
processorType: String!
|
||||
socketsPerNode: Int!
|
||||
coresPerSocket: Int!
|
||||
threadsPerCore: Int!
|
||||
flopRateScalar: MetricValue!
|
||||
flopRateSimd: MetricValue!
|
||||
memoryBandwidth: MetricValue!
|
||||
topology: Topology!
|
||||
}
|
||||
|
||||
type MetricValue {
|
||||
unit: Unit!
|
||||
value: Float!
|
||||
}
|
||||
|
||||
type Topology {
|
||||
node: [Int!]
|
||||
socket: [[Int!]!]
|
||||
memoryDomain: [[Int!]!]
|
||||
die: [[Int!]!]
|
||||
core: [[Int!]!]
|
||||
accelerators: [Accelerator!]
|
||||
}
|
||||
|
||||
type Accelerator {
|
||||
id: String!
|
||||
type: String!
|
||||
model: String!
|
||||
}
|
||||
|
||||
type SubClusterConfig {
|
||||
name: String!
|
||||
peak: Float
|
||||
normal: Float
|
||||
caution: Float
|
||||
alert: Float
|
||||
remove: Boolean
|
||||
}
|
||||
|
||||
type MetricConfig {
|
||||
name: String!
|
||||
unit: Unit!
|
||||
scope: MetricScope!
|
||||
aggregation: String!
|
||||
timestep: Int!
|
||||
peak: Float!
|
||||
normal: Float
|
||||
caution: Float!
|
||||
alert: Float!
|
||||
subClusters: [SubClusterConfig!]!
|
||||
}
|
||||
|
||||
type Tag {
|
||||
id: ID!
|
||||
type: String!
|
||||
name: String!
|
||||
}
|
||||
|
||||
type Resource {
|
||||
hostname: String!
|
||||
hwthreads: [Int!]
|
||||
accelerators: [String!]
|
||||
configuration: String
|
||||
}
|
||||
|
||||
type JobMetricWithName {
|
||||
name: String!
|
||||
scope: MetricScope!
|
||||
metric: JobMetric!
|
||||
}
|
||||
|
||||
type JobMetric {
|
||||
unit: Unit
|
||||
timestep: Int!
|
||||
series: [Series!]
|
||||
statisticsSeries: StatsSeries
|
||||
}
|
||||
|
||||
type Series {
|
||||
hostname: String!
|
||||
id: String
|
||||
statistics: MetricStatistics
|
||||
data: [NullableFloat!]!
|
||||
}
|
||||
|
||||
type Unit {
|
||||
base: String!
|
||||
prefix: String
|
||||
}
|
||||
|
||||
type MetricStatistics {
|
||||
avg: Float!
|
||||
min: Float!
|
||||
max: Float!
|
||||
}
|
||||
|
||||
type StatsSeries {
|
||||
mean: [NullableFloat!]!
|
||||
min: [NullableFloat!]!
|
||||
max: [NullableFloat!]!
|
||||
}
|
||||
|
||||
type MetricFootprints {
|
||||
metric: String!
|
||||
data: [NullableFloat!]!
|
||||
}
|
||||
|
||||
type Footprints {
|
||||
nodehours: [NullableFloat!]!
|
||||
metrics: [MetricFootprints!]!
|
||||
}
|
||||
|
||||
enum Aggregate { USER, PROJECT, CLUSTER }
|
||||
enum Weights { NODE_COUNT, NODE_HOURS }
|
||||
|
||||
type NodeMetrics {
|
||||
host: String!
|
||||
subCluster: String!
|
||||
metrics: [JobMetricWithName!]!
|
||||
}
|
||||
|
||||
type Count {
|
||||
name: String!
|
||||
count: Int!
|
||||
}
|
||||
|
||||
type User {
|
||||
username: String!
|
||||
name: String!
|
||||
email: String!
|
||||
}
|
||||
|
||||
type Query {
|
||||
clusters: [Cluster!]! # List of all clusters
|
||||
tags: [Tag!]! # List of all tags
|
||||
|
||||
user(username: String!): User
|
||||
allocatedNodes(cluster: String!): [Count!]!
|
||||
|
||||
job(id: ID!): Job
|
||||
jobMetrics(id: ID!, metrics: [String!], scopes: [MetricScope!]): [JobMetricWithName!]!
|
||||
jobsFootprints(filter: [JobFilter!], metrics: [String!]!): Footprints
|
||||
|
||||
jobs(filter: [JobFilter!], page: PageRequest, order: OrderByInput): JobResultList!
|
||||
jobsStatistics(filter: [JobFilter!], groupBy: Aggregate): [JobsStatistics!]!
|
||||
jobsCount(filter: [JobFilter]!, groupBy: Aggregate!, weight: Weights, limit: Int): [Count!]!
|
||||
|
||||
rooflineHeatmap(filter: [JobFilter!]!, rows: Int!, cols: Int!, minX: Float!, minY: Float!, maxX: Float!, maxY: Float!): [[Float!]!]!
|
||||
|
||||
nodeMetrics(cluster: String!, nodes: [String!], scopes: [MetricScope!], metrics: [String!], from: Time!, to: Time!): [NodeMetrics!]!
|
||||
}
|
||||
|
||||
type Mutation {
|
||||
createTag(type: String!, name: String!): Tag!
|
||||
deleteTag(id: ID!): ID!
|
||||
addTagsToJob(job: ID!, tagIds: [ID!]!): [Tag!]!
|
||||
removeTagsFromJob(job: ID!, tagIds: [ID!]!): [Tag!]!
|
||||
|
||||
updateConfiguration(name: String!, value: String!): String
|
||||
}
|
||||
|
||||
type IntRangeOutput { from: Int!, to: Int! }
|
||||
type TimeRangeOutput { from: Time!, to: Time! }
|
||||
|
||||
input JobFilter {
|
||||
tags: [ID!]
|
||||
jobId: StringInput
|
||||
arrayJobId: Int
|
||||
user: StringInput
|
||||
project: StringInput
|
||||
jobName: StringInput
|
||||
cluster: StringInput
|
||||
partition: StringInput
|
||||
duration: IntRange
|
||||
|
||||
minRunningFor: Int
|
||||
|
||||
numNodes: IntRange
|
||||
numAccelerators: IntRange
|
||||
numHWThreads: IntRange
|
||||
|
||||
startTime: TimeRange
|
||||
state: [JobState!]
|
||||
flopsAnyAvg: FloatRange
|
||||
memBwAvg: FloatRange
|
||||
loadAvg: FloatRange
|
||||
memUsedMax: FloatRange
|
||||
|
||||
exclusive: Int
|
||||
sharedNode: StringInput
|
||||
selfJobId: StringInput
|
||||
selfStartTime: Time
|
||||
selfDuration: Int
|
||||
}
|
||||
|
||||
input OrderByInput {
|
||||
field: String!
|
||||
order: SortDirectionEnum! = ASC
|
||||
}
|
||||
|
||||
enum SortDirectionEnum {
|
||||
DESC
|
||||
ASC
|
||||
}
|
||||
|
||||
input StringInput {
|
||||
eq: String
|
||||
neq: String
|
||||
contains: String
|
||||
startsWith: String
|
||||
endsWith: String
|
||||
in: [String!]
|
||||
}
|
||||
|
||||
input IntRange { from: Int!, to: Int! }
|
||||
input FloatRange { from: Float!, to: Float! }
|
||||
input TimeRange { from: Time, to: Time }
|
||||
|
||||
type JobResultList {
|
||||
items: [Job!]!
|
||||
offset: Int
|
||||
limit: Int
|
||||
count: Int
|
||||
}
|
||||
|
||||
type JobLinkResultList {
|
||||
items: [JobLink!]!
|
||||
count: Int
|
||||
}
|
||||
|
||||
type HistoPoint {
|
||||
count: Int!
|
||||
value: Int!
|
||||
}
|
||||
|
||||
type JobsStatistics {
|
||||
id: ID! # If `groupBy` was used, ID of the user/project/cluster
|
||||
name: String! # if User-Statistics: Given Name of Account (ID) Owner
|
||||
totalJobs: Int! # Number of jobs
|
||||
runningJobs: Int! # Number of running jobs
|
||||
shortJobs: Int! # Number of jobs with a duration of less than duration
|
||||
totalWalltime: Int! # Sum of the duration of all matched jobs in hours
|
||||
totalNodeHours: Int! # Sum of the node hours of all matched jobs
|
||||
totalCoreHours: Int! # Sum of the core hours of all matched jobs
|
||||
totalAccHours: Int! # Sum of the gpu hours of all matched jobs
|
||||
histDuration: [HistoPoint!]! # value: hour, count: number of jobs with a rounded duration of value
|
||||
histNumNodes: [HistoPoint!]! # value: number of nodes, count: number of jobs with that number of nodes
|
||||
}
|
||||
|
||||
input PageRequest {
|
||||
itemsPerPage: Int!
|
||||
page: Int!
|
||||
}
|
@@ -1,63 +1,75 @@
|
||||
# Overview
|
||||
# InfluxData line-protocol flavor
|
||||
|
||||
## Overview
|
||||
|
||||
ClusterCockpit uses the [InfluxData line-protocol](https://docs.influxdata.com/influxdb/v2.1/reference/syntax/line-protocol/) for transferring messages between its components. The line-protocol is a text-based representation of a metric/event with a value, time and describing tags. All metrics/events have the following format (if written to `stdout`):
|
||||
|
||||
|
||||
```
|
||||
<measurement>,<tag set> <field set> <timestamp>
|
||||
```
|
||||
|
||||
where `<tag set>` and `<field set>` are comma-separated lists of `key=value` entries. In a mind-model, think about tags as `indices` in the database for faster lookup and the `<field set>` as values.
|
||||
where `<tag set>` and `<field set>` are comma-separated lists of `key=value`
|
||||
entries. As a mental model, think of tags as `indices` in the database for
|
||||
faster lookup and the `<field set>` as values.
|
||||
|
||||
**Remark**: In the first iteration, we only sent metrics (number values) but we had to extend the specification to messages with different meanings. The below text was changed accordingly. The update is backward-compatible, so for metrics (number values), nothing changed.
|
||||
**Remark**: In the first iteration, we only sent metrics (number values) but we
|
||||
had to extend the specification to messages with different meanings. The below
|
||||
text was changed accordingly. The update is backward-compatible, so for metrics
|
||||
(number values), nothing changed.
|
||||
|
||||
## Line-protocol in the ClusterCockpit ecosystem
|
||||
|
||||
# Line-protocol in the ClusterCockpit ecosystem
|
||||
In ClusterCockpit we limit the flexibility of the InfluxData line-protocol
|
||||
slightly. The idea is to keep the format evaluatable by different components.
|
||||
|
||||
In ClusterCockpit we limit the flexibility of the InfluxData line-protocol slightly. The idea is to keep the format evaluatable by different components.
|
||||
Each message is identifiable by the `measurement` (= metric name), the
|
||||
`hostname`, the `type` and, if required, a `type-id`.
|
||||
|
||||
Each message is identifiable by the `measurement` (= metric name), the `hostname`, the `type` and, if required, a `type-id`.
|
||||
### Mandatory tags per message
|
||||
|
||||
|
||||
|
||||
## Mandatory tags per message:
|
||||
* `hostname`
|
||||
* `type`
|
||||
- `node`
|
||||
- `socket`
|
||||
- `die`
|
||||
- `memoryDomain`
|
||||
- `llc`
|
||||
- `core`
|
||||
- `hwthread`
|
||||
- `accelerator`
|
||||
* `node`
|
||||
* `socket`
|
||||
* `die`
|
||||
* `memoryDomain`
|
||||
* `llc`
|
||||
* `core`
|
||||
* `hwthread`
|
||||
* `accelerator`
|
||||
* `type-id` for further specifying the type like CPU socket or HW Thread identifier
|
||||
|
||||
Although no `type-id` is required if `type=node`, it is recommended to send `type=node,type-id=0`.
|
||||
|
||||
### Optional tags depending on the message:
|
||||
#### Optional tags depending on the message
|
||||
|
||||
In some cases, optional tags are required like `filesystem`, `device` or `version`. While you are free to do that, the ClusterCockpit components in the stack above will recognize `stype` (= "sub type") and `stype-id`. So `filesystem=/homes` should be better specified as `stype=filesystem,stype-id=/homes`.
|
||||
In some cases, additional tags such as `filesystem`, `device` or
|
||||
`version` are needed. While you are free to use them, the ClusterCockpit components in the
|
||||
stack above will recognize `stype` (= "sub type") and `stype-id`. So
|
||||
`filesystem=/homes` should be better specified as
|
||||
`stype=filesystem,stype-id=/homes`.
|
||||
|
||||
## Mandatory fields per measurement:
|
||||
### Mandatory fields per measurement
|
||||
|
||||
- Metric: The field key is always `value`
|
||||
- Event: The field key is always `event`
|
||||
- Log message: The field key is always `log`
|
||||
- Control message: The field key is always `control`
|
||||
* Metric: The field key is always `value`
|
||||
* Event: The field key is always `event`
|
||||
* Log message: The field key is always `log`
|
||||
* Control message: The field key is always `control`
|
||||
|
||||
No other field keys are evaluated by the ClusterCockpit ecosystem.
|
||||
|
||||
### Message types
|
||||
|
||||
## Message types
|
||||
There exist different message types in the ClusterCockpit ecosystem, all
|
||||
specified using the InfluxData line-protocol.
|
||||
|
||||
There exist different message types in the ClusterCockpit ecosystem, all specified using the InfluxData line-protocol.
|
||||
|
||||
### Metrics
|
||||
#### Metrics
|
||||
|
||||
**Identification:** `value=X` field with `X` being a number
|
||||
|
||||
While the measurements (metric names) can be chosen freely, there is a basic set of measurements which should be present as long as you navigate in the ClusterCockpit ecosystem
|
||||
While the measurements (metric names) can be chosen freely, there is a basic set
|
||||
of measurements which should be present as long as you navigate in the
|
||||
ClusterCockpit ecosystem:
|
||||
|
||||
* `flops_sp`: Single-precision floating point rate in `Flops/s`
|
||||
* `flops_dp`: Double-precision floating point rate in `Flops/s`
|
||||
@@ -73,19 +85,17 @@ While the measurements (metric names) can be chosen freely, there is a basic set
|
||||
|
||||
For the whole list, see [job-data schema](../../datastructures/job-data.schema.json)
|
||||
|
||||
|
||||
### Events
|
||||
#### Events
|
||||
|
||||
**Identification:** `event="X"` field with `"X"` being a string
|
||||
|
||||
### Controls
|
||||
#### Controls
|
||||
|
||||
**Identification:**
|
||||
- `control="X"` field with `"X"` being a string
|
||||
- `method` tag is either `GET` or `PUT`
|
||||
**Identification:**
|
||||
|
||||
### Logs
|
||||
* `control="X"` field with `"X"` being a string
|
||||
* `method` tag is either `GET` or `PUT`
|
||||
|
||||
#### Logs
|
||||
|
||||
**Identification:** `log="X"` field with `"X"` being a string
|
||||
|
||||
|
||||
|
@@ -1 +1 @@
|
||||
## REST API interfaces
|
||||
# REST API interfaces
|
||||
|
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
Reference in New Issue
Block a user