Restructure repo

This commit is contained in:
Jan Eitzinger 2022-03-18 14:47:39 +01:00
parent 403f74ddee
commit d762e3e52b
15 changed files with 490 additions and 35 deletions

4
datastructures/README.md Normal file
View File

@ -0,0 +1,4 @@
## Generic database specification
This collection of data structure descriptions is intended to be used
as data structures in applications, payloads in APIs, and file formats.

View File

@ -0,0 +1,3 @@
## GraphQL Schema
This schema is intended for communication between the web frontend and the web backend.

View File

@ -0,0 +1,261 @@
# Custom scalar types referenced by the object and input types below.
# Note that `MetricScope` and `JobState` are declared as scalars, not enums,
# so this schema itself does not enumerate their allowed values.
scalar Time
scalar Any
scalar NullableFloat
scalar MetricScope
scalar JobState
# A single job. Every field is non-null except `metaData` and `userData`.
type Job {
  # Identification
  id: ID!
  jobId: Int!
  arrayJobId: Int!

  # Ownership and placement
  user: String!
  project: String!
  cluster: String!
  subCluster: String!
  partition: String!

  # Timing
  startTime: Time!
  duration: Int!
  walltime: Int!

  # Allocated resources
  numNodes: Int!
  numHWThreads: Int!
  numAcc: Int!
  SMT: Int!
  exclusive: Int!
  resources: [Resource!]!

  # Status and annotations
  monitoringStatus: Int!
  state: JobState!
  tags: [Tag!]!

  # Optional attachments (nullable)
  metaData: Any
  userData: User
}
# A cluster together with its partitions, metric configuration,
# filter ranges and sub-clusters. All fields are non-null.
type Cluster {
  name: String!
  partitions: [String!]!           # Slurm partitions
  metricConfig: [MetricConfig!]!
  filterRanges: FilterRanges!
  subClusters: [SubCluster!]!      # Hardware partitions/subclusters
}
# One entry of `Cluster.subClusters`. All fields are non-null.
type SubCluster {
  name: String!
  nodes: String!             # a single string, not a list; format is not specified in this schema
  processorType: String!

  # Processor layout
  socketsPerNode: Int!
  coresPerSocket: Int!
  threadsPerCore: Int!

  # Performance figures (units are not specified in this schema)
  flopRateScalar: Int!
  flopRateSimd: Int!
  memoryBandwidth: Int!

  topology: Topology!
}
# Topology of a sub-cluster node (see `SubCluster.topology`).
# Every field is nullable. `node` is a flat integer list; `socket`,
# `memoryDomain`, `die` and `core` are lists of integer lists.
# NOTE(review): the integers are presumably hardware-thread IDs — confirm.
type Topology {
  node: [Int!]
  socket: [[Int!]!]
  memoryDomain: [[Int!]!]
  die: [[Int!]!]
  core: [[Int!]!]
  accelerators: [Accelerator!]
}
# An accelerator entry as listed in `Topology.accelerators`.
# Note that `id` is a String here, not an ID or Int.
type Accelerator {
  id: String!
  type: String!
  model: String!
}
# Configuration of one metric (see `Cluster.metricConfig`).
# All fields are non-null.
type MetricConfig {
  name: String!
  unit: String!
  scope: MetricScope!
  timestep: Int!

  # Four Float reference values for the metric.
  # NOTE(review): presumably thresholds used for classification — confirm.
  peak: Float!
  normal: Float!
  caution: Float!
  alert: Float!
}
# A tag, identified by `id` and described by a `type`/`name` pair.
# Tags are attached to jobs via `Job.tags` and managed through `Mutation`.
type Tag {
  id: ID!
  type: String!
  name: String!
}
# One entry of `Job.resources`. Only `hostname` is required;
# the remaining fields are nullable.
type Resource {
  hostname: String!
  hwthreads: [Int!]
  accelerators: [Int!]
  configuration: String
}
# Pairs a metric name with its `JobMetric` payload.
type JobMetricWithName {
  name: String!
  metric: JobMetric!
}
# Data of one metric: unit, scope and timestep are required, while
# `series` and `statisticsSeries` are both nullable.
type JobMetric {
  unit: String!
  scope: MetricScope!
  timestep: Int!
  series: [Series!]
  statisticsSeries: StatsSeries
}
# One data series of a `JobMetric`. `hostname` and `data` are required;
# `id` and `statistics` are nullable.
type Series {
  hostname: String!
  id: Int
  statistics: MetricStatistics
  data: [NullableFloat!]!
}
# Average, minimum and maximum as non-null Floats (see `Series.statistics`).
type MetricStatistics {
  avg: Float!
  min: Float!
  max: Float!
}
# Three parallel non-null lists of `NullableFloat` values: mean, min and max
# (see `JobMetric.statisticsSeries`).
type StatsSeries {
  mean: [NullableFloat!]!
  min: [NullableFloat!]!
  max: [NullableFloat!]!
}
# Footprint data of a single metric (see `Footprints.metrics`).
type MetricFootprints {
  metric: String!
  data: [NullableFloat!]!
}
# Result type of `Query.jobsFootprints`: a list of node hours plus one
# `MetricFootprints` entry per metric.
type Footprints {
  nodehours: [NullableFloat!]!
  metrics: [MetricFootprints!]!
}
# Grouping key accepted by the `groupBy` argument of
# `Query.jobsStatistics` and `Query.jobsCount`.
enum Aggregate {
  USER
  PROJECT
  CLUSTER
}
# Result element of `Query.nodeMetrics`: the named metrics of one host.
type NodeMetrics {
  host: String!
  metrics: [JobMetricWithName!]!
}
# Result element of `Query.jobsCount`: a name with its associated count.
type Count {
  name: String!
  count: Int!
}
# User details, returned by `Query.user` and referenced from `Job.userData`.
type User {
  username: String!
  name: String!
  email: String!
}
# Root query type.
type Query {
  clusters: [Cluster!]!   # List of all clusters
  tags: [Tag!]!           # List of all tags

  # Single-object lookups; both results are nullable.
  user(username: String!): User
  job(id: ID!): Job

  jobMetrics(
    id: ID!
    metrics: [String!]
    scopes: [MetricScope!]
  ): [JobMetricWithName!]!

  jobsFootprints(
    filter: [JobFilter!]
    metrics: [String!]!
  ): Footprints

  jobs(
    filter: [JobFilter!]
    page: PageRequest
    order: OrderByInput
  ): JobResultList!

  jobsStatistics(
    filter: [JobFilter!]
    groupBy: Aggregate
  ): [JobsStatistics!]!

  # NOTE(review): `filter` is typed `[JobFilter]!` here (required list with
  # nullable elements), unlike the `[JobFilter!]` used by the other job
  # queries — confirm this is intended.
  jobsCount(
    filter: [JobFilter]!
    groupBy: Aggregate!
    limit: Int
  ): [Count!]!

  rooflineHeatmap(
    filter: [JobFilter!]!
    rows: Int!
    cols: Int!
    minX: Float!
    minY: Float!
    maxX: Float!
    maxY: Float!
  ): [[Float!]!]!

  nodeMetrics(
    cluster: String!
    partition: String
    nodes: [String!]
    scopes: [MetricScope!]
    metrics: [String!]
    from: Time!
    to: Time!
  ): [NodeMetrics!]!
}
# Root mutation type: tag management and configuration updates.
type Mutation {
  # Tag lifecycle
  createTag(type: String!, name: String!): Tag!
  deleteTag(id: ID!): ID!

  # Tag assignment; both return a non-null list of tags.
  addTagsToJob(job: ID!, tagIds: [ID!]!): [Tag!]!
  removeTagsFromJob(job: ID!, tagIds: [ID!]!): [Tag!]!

  # The result is a nullable String.
  updateConfiguration(name: String!, value: String!): String
}
# Output-side integer range with required bounds (used by `FilterRanges`).
type IntRangeOutput {
  from: Int!
  to: Int!
}

# Output-side time range with required bounds (used by `FilterRanges`).
type TimeRangeOutput {
  from: Time!
  to: Time!
}
# Ranges for duration, node count and start time (see `Cluster.filterRanges`).
type FilterRanges {
  duration: IntRangeOutput!
  numNodes: IntRangeOutput!
  startTime: TimeRangeOutput!
}
# Filter criteria accepted by the job-related queries.
# Every field is optional (nullable).
input JobFilter {
  tags: [ID!]

  # String-matched fields
  jobId: StringInput
  user: StringInput
  project: StringInput
  cluster: StringInput
  partition: StringInput

  arrayJobId: Int
  minRunningFor: Int
  state: [JobState!]

  # Integer and time ranges
  duration: IntRange
  numNodes: IntRange
  numAccelerators: IntRange
  numHWThreads: IntRange
  startTime: TimeRange

  # Float ranges
  flopsAnyAvg: FloatRange
  memBwAvg: FloatRange
  loadAvg: FloatRange
  memUsedMax: FloatRange
}
# Sort specification for `Query.jobs`: the field name to order by and
# the direction, which defaults to ASC.
input OrderByInput {
  field: String!
  order: SortDirectionEnum! = ASC
}
# Sort direction used by `OrderByInput.order`.
enum SortDirectionEnum {
  DESC
  ASC
}
# String matching criteria used by `JobFilter`. All four fields are optional.
input StringInput {
  eq: String
  contains: String
  startsWith: String
  endsWith: String
}
# Integer range input; both bounds are required.
input IntRange {
  from: Int!
  to: Int!
}

# Float range input; both bounds are required.
input FloatRange {
  from: Float!
  to: Float!
}

# Time range input; unlike IntRange and FloatRange, both bounds are nullable.
input TimeRange {
  from: Time
  to: Time
}
# Result type of `Query.jobs`. `items` is always present; `offset`,
# `limit` and `count` are nullable.
type JobResultList {
  items: [Job!]!
  offset: Int
  limit: Int
  count: Int
}
# One histogram entry: a `value` and the `count` of jobs for that value
# (used by `JobsStatistics.histWalltime` and `JobsStatistics.histNumNodes`).
type HistoPoint {
  count: Int!
  value: Int!
}
# Result element of `Query.jobsStatistics`.
type JobsStatistics {
  # If `groupBy` was used, ID of the user/project/cluster
  id: ID!
  # Number of jobs that matched
  totalJobs: Int!
  # Number of jobs with a duration of less than 2 minutes
  shortJobs: Int!
  # Sum of the duration of all matched jobs in hours
  totalWalltime: Int!
  # Sum of the core hours of all matched jobs
  totalCoreHours: Int!
  # value: hour, count: number of jobs with a rounded duration of value
  histWalltime: [HistoPoint!]!
  # value: number of nodes, count: number of jobs with that number of nodes
  histNumNodes: [HistoPoint!]!
}
# Pagination argument of `Query.jobs`; both fields are required.
input PageRequest {
  itemsPerPage: Int!
  page: Int!
}

View File

@ -0,0 +1 @@
## REST API interfaces

View File

@ -0,0 +1,221 @@
#
# ClusterCockpit's API spec can be exported via:
# docker exec -it cc-php php bin/console api:openapi:export --yaml
#
# This spec is written by hand and hopefully up to date with the API.
#
# OpenAPI document for the ClusterCockpit REST API.
# Response status codes are quoted as strings, as the OpenAPI specification
# requires for compatibility between JSON and YAML.
openapi: 3.0.3
info:
  title: 'ClusterCockpit REST API'
  description: 'API for batch job control'
  version: 0.0.2
servers:
  - url: /
    description: ''
paths:
  # List jobs, optionally filtered and paginated via query parameters.
  '/api/jobs/':
    get:
      operationId: 'getJobs'
      summary: 'List all jobs'
      description: 'Get a list of all jobs. Filters can be applied using query parameters.'
      parameters:
        - name: state
          in: query
          schema:
            type: string
            enum: ["running", "completed", "failed", "canceled", "stopped", "timeout"]
        - name: cluster
          in: query
          schema: { type: string }
        - name: start-time
          description: 'Syntax: "<from>-<to>", where <from> and <to> are unix timestamps in seconds'
          in: query
          schema: { type: string }
        - name: page
          in: query
          schema: { type: integer }
        - name: items-per-page
          in: query
          schema: { type: integer }
        - name: with-metadata
          in: query
          schema: { type: boolean }
      responses:
        '200':
          description: 'Array of jobs'
          content:
            'application/json':
              schema:
                type: object
                properties:
                  jobs:
                    type: array
                    items:
                      $ref: '#/components/schemas/Job'
        '400':
          description: 'Bad Request'
  # Attach tags to the job addressed by its database ID.
  '/api/jobs/tag_job/{id}':
    post:
      operationId: 'tagJob'
      summary: 'Add a tag to a job'
      parameters:
        - name: id
          in: path
          required: true
          schema: { type: integer }
          description: 'Job ID'
      requestBody:
        description: 'Array of tags to add'
        required: true
        content:
          'application/json':
            schema:
              type: array
              items:
                $ref: '#/components/schemas/Tag'
      responses:
        '200':
          description: 'Job resource'
          content:
            'application/json':
              schema:
                $ref: '#/components/schemas/Job'
        '404':
          description: 'Job or tag does not exist'
        '400':
          description: 'Bad request'
  # Register a newly started job; the response carries its database ID.
  '/api/jobs/start_job/':
    post:
      operationId: 'startJob'
      summary: 'Add a newly started job'
      requestBody:
        required: true
        content:
          'application/json':
            schema:
              $ref: '#/components/schemas/Job'
      responses:
        '201':
          description: 'Job successfully added'
          content:
            'application/json':
              schema:
                type: object
                properties:
                  id:
                    type: integer
                    description: 'The database ID assigned to this job'
        '400':
          description: 'Bad request'
        '422':
          description: 'The combination of jobId, clusterId and startTime already exists'
  # Stop a job identified by the fields of the request body.
  '/api/jobs/stop_job/':
    post:
      operationId: 'stopJobViaJobID'
      summary: 'Mark a job as stopped. Which job to stop is specified by the request body.'
      requestBody:
        required: true
        content:
          'application/json':
            schema:
              type: object
              required: [jobId, cluster, stopTime, jobState]
              properties:
                jobId: { type: integer }
                cluster: { type: string }
                startTime: { type: integer }
                stopTime: { type: integer }
                jobState:
                  type: string
                  enum: ["running", "completed", "failed", "canceled", "stopped", "timeout"]
      responses:
        '200':
          description: 'Job resource'
          content:
            'application/json':
              schema:
                $ref: '#/components/schemas/Job'
        '400':
          description: 'Bad request'
        '404':
          description: 'Resource not found'
  # Stop a job identified by its database ID in the path.
  '/api/jobs/stop_job/{id}':
    post:
      operationId: 'stopJobViaDBID'
      summary: 'Mark a job as stopped.'
      parameters:
        - name: id
          in: path
          required: true
          schema: { type: integer }
          description: 'Database ID (Resource Identifier)'
      requestBody:
        required: true
        content:
          'application/json':
            schema:
              type: object
              required: [stopTime, jobState]
              properties:
                stopTime: { type: integer }
                jobState:
                  type: string
                  enum: ["running", "completed", "failed", "canceled", "stopped", "timeout"]
      responses:
        '200':
          description: 'Job resource'
          content:
            'application/json':
              schema:
                $ref: '#/components/schemas/Job'
        '400':
          description: 'Bad request'
        '404':
          description: 'Resource not found'
  # Import a job; the request body has a `meta` and a `data` property.
  '/api/jobs/import/':
    post:
      operationId: 'importJob'
      summary: 'Imports a job and its metric data'
      requestBody:
        required: true
        content:
          'application/json':
            schema:
              type: object
              properties:
                meta:
                  $ref: 'https://raw.githubusercontent.com/ClusterCockpit/cc-specifications/master/schema/json/job-meta.schema.json'
                data:
                  $ref: 'https://raw.githubusercontent.com/ClusterCockpit/cc-specifications/master/schema/json/job-data.schema.json'
      responses:
        '200':
          description: 'Import successful'
        '400':
          description: 'Bad request'
        '422':
          description: 'Unprocessable Entity'
components:
  schemas:
    Tag:
      description: 'A job tag'
      type: object
      properties:
        id:
          type: string
          description: 'Database ID'
        type:
          type: string
          description: 'Tag type'
        name:
          type: string
          description: 'Tag name'
    # The Job schema is maintained externally as a JSON schema.
    Job:
      $ref: 'https://raw.githubusercontent.com/ClusterCockpit/cc-specifications/master/schema/json/job-meta.schema.json'
  securitySchemes:
    bearerAuth:
      type: http
      scheme: bearer
      bearerFormat: JWT
security:
  - bearerAuth: [] # Applies `bearerAuth` globally

View File

@ -1,35 +0,0 @@
# Overview
ClusterCockpit uses the InfluxData line-protocol for collecting the node metric
data.
```
<measurement>,<tag set> <field set> <timestamp [s]>
```
Supported measurements:
* node -- Tags: host
* socket -- Tags: host, socket
* cpu -- Tags: host, cpu
## Supported node level fields
* `load`
* `mem_used`
* `net_bw` - split into `ib_bw` and `eth_bw` if required
* `file_bw` - split into multiple file systems if required
## Supported socket fields
All socket metrics can be aggregated to coarser granularity.
* `power`
* `mem_bw`
## Supported cpu level fields
All cpu metrics can be aggregated to coarser granularity.
* `ipc`
* `flops_any`
* `clock`

0
schemas/README.md Normal file
View File