Update to cc-backend version 1.0.0

Jan Eitzinger 2023-06-27 15:08:03 +02:00
parent 1de04dd30d
commit f0bccc8229
10 changed files with 2628 additions and 320 deletions

View File

@@ -4,7 +4,7 @@
"title": "HPC cluster description",
"description": "Meta data information of a HPC cluster",
"type": "object",
"properties":{
"properties": {
"name": {
"description": "The unique identifier of a cluster",
"type": "string"
@@ -14,7 +14,7 @@
"type": "array",
"items": {
"type": "object",
"properties":{
"properties": {
"name": {
"description": "Metric name",
"type": "string"
@@ -39,12 +39,28 @@
"avg"
]
},
"peak": {
"description": "Metric peak threshold (Upper metric limit)",
"type": "number"
},
"normal": {
"description": "Metric normal threshold",
"type": "number"
},
"caution": {
"description": "Metric caution threshold (Suspicious but does not require immediate action)",
"type": "number"
},
"alert": {
"description": "Metric alert threshold (Requires immediate action)",
"type": "number"
},
"subClusters": {
"description": "Array of cluster hardware partition metric thresholds",
"type": "array",
"items": {
"type": "object",
"properties":{
"properties": {
"name": {
"description": "Hardware partition name",
"type": "string"
@@ -60,13 +76,13 @@
},
"alert": {
"type": "number"
},
"remove": {
"type": "boolean"
}
},
"required": [
"name",
"peak",
"caution",
"alert"
"name"
]
}
}
@@ -75,7 +91,12 @@
"name",
"unit",
"scope",
"timestep"
"timestep",
"aggregation",
"peak",
"normal",
"caution",
"alert"
]
},
"minItems": 1
@@ -85,7 +106,7 @@
"type": "array",
"items": {
"type": "object",
"properties":{
"properties": {
"name": {
"description": "Hardware partition name",
"type": "string"
@@ -108,15 +129,42 @@
},
"flopRateScalar": {
"description": "Theoretical node peak flop rate for scalar code in GFlops/s",
"type": "integer"
"type": "object",
"properties": {
"unit": {
"description": "Metric unit",
"$ref": "embedfs://unit.schema.json"
},
"value": {
"type": "number"
}
}
},
"flopRateSimd": {
"description": "Theoretical node peak flop rate for SIMD code in GFlops/s",
"type": "integer"
"type": "object",
"properties": {
"unit": {
"description": "Metric unit",
"$ref": "embedfs://unit.schema.json"
},
"value": {
"type": "number"
}
}
},
"memoryBandwidth": {
"description": "Theoretical node peak memory bandwidth in GB/s",
"type": "integer"
"type": "object",
"properties": {
"unit": {
"description": "Metric unit",
"$ref": "embedfs://unit.schema.json"
},
"value": {
"type": "number"
}
}
},
"nodes": {
"description": "Node list expression",
@@ -125,7 +173,7 @@
"topology": {
"description": "Node topology",
"type": "object",
"properties":{
"properties": {
"node": {
"description": "HwTread lists of node",
"type": "array",
@@ -205,15 +253,16 @@
}
}
},
"required":[
"required": [
"node",
"socket",
"memoryDomain"
]
}
},
"required":[
"required": [
"name",
"nodes",
"topology",
"processorType",
"socketsPerNode",
@@ -227,7 +276,7 @@
"minItems": 1
}
},
"required":[
"required": [
"name",
"metricConfig",
"subClusters"

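To make the tightened requirements concrete, a metricConfig entry that satisfies the updated schema could look roughly like this (a hand-written sketch; the metric name, unit, scope, and threshold values are illustrative placeholders, not taken from the commit):

{
  "name": "clock",
  "unit": { "base": "Hz", "prefix": "M" },
  "scope": "node",
  "aggregation": "avg",
  "timestep": 60,
  "peak": 3000,
  "normal": 2400,
  "caution": 1800,
  "alert": 1200,
  "subClusters": [
    { "name": "main", "remove": false }
  ]
}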
View File

@@ -86,8 +86,8 @@
},
"minProperties": 1
},
"cpu_used": {
"description": "CPU active core utilization",
"cpu_user": {
"description": "CPU user active core utilization",
"properties": {
"node": {
"$ref": "job-metric-data.schema.json"
@@ -479,7 +479,8 @@
]
},
"required": [
"cpu_used",
"cpu_user",
"cpu_load",
"mem_used",
"flops_any",
"mem_bw",

View File

@@ -193,8 +193,8 @@
"description": "Instructions executed per cycle",
"$ref": "job-metric-statistics.schema.json"
},
"cpu_used": {
"description": "CPU active core utilization",
"cpu_user": {
"description": "CPU user active core utilization",
"$ref": "job-metric-statistics.schema.json"
},
"flops_dp": {
@@ -326,7 +326,8 @@
}
},
"required": [
"cpu_used",
"cpu_user",
"cpu_load",
"mem_used",
"flops_any",
"mem_bw"
@@ -338,6 +339,7 @@
"user",
"project",
"cluster",
"subCluster",
"numNodes",
"exclusive",
"startTime",

View File

@@ -193,7 +193,7 @@
},
"data": {
"type": "array",
"items": {
"contains": {
"type": "number",
"minimum": 0
},

View File

@@ -15,7 +15,6 @@
"F/s",
"CPI",
"IPC",
"load",
"Hz",
"W",
"°C",

View File

@@ -26,32 +26,43 @@ type Job {
state: JobState!
tags: [Tag!]!
resources: [Resource!]!
concurrentJobs: JobLinkResultList
metaData: Any
userData: User
}
type JobLink {
id: ID!
jobId: Int!
}
type Cluster {
name: String!
partitions: [String!]! # Slurm partitions
metricConfig: [MetricConfig!]!
filterRanges: FilterRanges!
subClusters: [SubCluster!]! # Hardware partitions/subclusters
}
type SubCluster {
name: String!
nodes: String!
numberOfNodes: Int!
processorType: String!
socketsPerNode: Int!
coresPerSocket: Int!
threadsPerCore: Int!
flopRateScalar: Int!
flopRateSimd: Int!
memoryBandwidth: Int!
flopRateScalar: MetricValue!
flopRateSimd: MetricValue!
memoryBandwidth: MetricValue!
topology: Topology!
}
type MetricValue {
unit: Unit!
value: Float!
}
type Topology {
node: [Int!]
socket: [[Int!]!]
@@ -67,15 +78,26 @@ type Accelerator {
model: String!
}
type SubClusterConfig {
name: String!
peak: Float
normal: Float
caution: Float
alert: Float
remove: Boolean
}
type MetricConfig {
name: String!
unit: String!
scope: MetricScope!
timestep: Int!
peak: Float!
normal: Float!
caution: Float!
alert: Float!
name: String!
unit: Unit!
scope: MetricScope!
aggregation: String!
timestep: Int!
peak: Float!
normal: Float
caution: Float!
alert: Float!
subClusters: [SubClusterConfig!]!
}
type Tag {
@@ -87,18 +109,18 @@ type Tag {
type Resource {
hostname: String!
hwthreads: [Int!]
accelerators: [Int!]
accelerators: [String!]
configuration: String
}
type JobMetricWithName {
name: String!
scope: MetricScope!
metric: JobMetric!
}
type JobMetric {
unit: String!
scope: MetricScope!
unit: Unit
timestep: Int!
series: [Series!]
statisticsSeries: StatsSeries
@@ -106,11 +128,16 @@ type JobMetric {
type Series {
hostname: String!
id: Int
id: String
statistics: MetricStatistics
data: [NullableFloat!]!
}
type Unit {
base: String!
prefix: String
}
type MetricStatistics {
avg: Float!
min: Float!
@@ -134,10 +161,12 @@ type Footprints {
}
enum Aggregate { USER, PROJECT, CLUSTER }
enum Weights { NODE_COUNT, NODE_HOURS }
type NodeMetrics {
host: String!
metrics: [JobMetricWithName!]!
host: String!
subCluster: String!
metrics: [JobMetricWithName!]!
}
type Count {
@@ -156,6 +185,7 @@ type Query {
tags: [Tag!]! # List of all tags
user(username: String!): User
allocatedNodes(cluster: String!): [Count!]!
job(id: ID!): Job
jobMetrics(id: ID!, metrics: [String!], scopes: [MetricScope!]): [JobMetricWithName!]!
@@ -163,11 +193,11 @@ type Query {
jobs(filter: [JobFilter!], page: PageRequest, order: OrderByInput): JobResultList!
jobsStatistics(filter: [JobFilter!], groupBy: Aggregate): [JobsStatistics!]!
jobsCount(filter: [JobFilter]!, groupBy: Aggregate!, limit: Int): [Count!]!
jobsCount(filter: [JobFilter]!, groupBy: Aggregate!, weight: Weights, limit: Int): [Count!]!
rooflineHeatmap(filter: [JobFilter!]!, rows: Int!, cols: Int!, minX: Float!, minY: Float!, maxX: Float!, maxY: Float!): [[Float!]!]!
nodeMetrics(cluster: String!, partition: String, nodes: [String!], scopes: [MetricScope!], metrics: [String!], from: Time!, to: Time!): [NodeMetrics!]!
nodeMetrics(cluster: String!, nodes: [String!], scopes: [MetricScope!], metrics: [String!], from: Time!, to: Time!): [NodeMetrics!]!
}
type Mutation {
@@ -182,18 +212,13 @@ type Mutation {
type IntRangeOutput { from: Int!, to: Int! }
type TimeRangeOutput { from: Time!, to: Time! }
type FilterRanges {
duration: IntRangeOutput!
numNodes: IntRangeOutput!
startTime: TimeRangeOutput!
}
input JobFilter {
tags: [ID!]
jobId: StringInput
arrayJobId: Int
user: StringInput
project: StringInput
jobName: StringInput
cluster: StringInput
partition: StringInput
duration: IntRange
@@ -210,6 +235,12 @@ input JobFilter {
memBwAvg: FloatRange
loadAvg: FloatRange
memUsedMax: FloatRange
exclusive: Int
sharedNode: StringInput
selfJobId: StringInput
selfStartTime: Time
selfDuration: Int
}
input OrderByInput {
@@ -224,9 +255,11 @@ enum SortDirectionEnum {
input StringInput {
eq: String
neq: String
contains: String
startsWith: String
endsWith: String
in: [String!]
}
input IntRange { from: Int!, to: Int! }
@@ -240,6 +273,11 @@ type JobResultList {
count: Int
}
type JobLinkResultList {
items: [JobLink!]!
count: Int
}
type HistoPoint {
count: Int!
value: Int!
@@ -247,11 +285,15 @@ type HistoPoint {
type JobsStatistics {
id: ID! # If `groupBy` was used, ID of the user/project/cluster
totalJobs: Int! # Number of jobs that matched
shortJobs: Int! # Number of jobs with a duration of less than 2 minutes
name: String! # if User-Statistics: Given Name of Account (ID) Owner
totalJobs: Int! # Number of jobs
runningJobs: Int! # Number of running jobs
shortJobs: Int! # Number of jobs with a duration of less than duration
totalWalltime: Int! # Sum of the duration of all matched jobs in hours
totalNodeHours: Int! # Sum of the node hours of all matched jobs
totalCoreHours: Int! # Sum of the core hours of all matched jobs
histWalltime: [HistoPoint!]! # value: hour, count: number of jobs with a rounded duration of value
totalAccHours: Int! # Sum of the gpu hours of all matched jobs
histDuration: [HistoPoint!]! # value: hour, count: number of jobs with a rounded duration of value
histNumNodes: [HistoPoint!]! # value: number of nodes, count: number of jobs with that number of nodes
}
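
As a rough sketch of how the reworked types read from the client side, a query touching the new MetricValue fields, the subCluster addition on NodeMetrics, and the weight argument on jobsCount might look like this (cluster names and values are placeholders; the clusters root field and the Count field names are assumptions, since they are not part of this excerpt):

query {
  clusters {                      # root query field assumed; not shown in this excerpt
    name
    subClusters {
      name
      flopRateSimd { value unit { base prefix } }
      memoryBandwidth { value unit { base prefix } }
    }
  }
  jobsCount(filter: [], groupBy: USER, weight: NODE_HOURS, limit: 10) {
    name                          # Count fields assumed; the type is truncated above
    count
  }
}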

View File

@@ -1,221 +0,0 @@
#
# ClusterCockpit's API spec can be exported via:
# docker exec -it cc-php php bin/console api:openapi:export --yaml
#
# This spec is written by hand and hopefully up to date with the API.
#
openapi: 3.0.3
info:
title: 'ClusterCockpit REST API'
description: 'API for batch job control'
version: 0.0.2
servers:
- url: /
description: ''
paths:
'/api/jobs/':
get:
operationId: 'getJobs'
summary: 'List all jobs'
description: 'Get a list of all jobs. Filters can be applied using query parameters.'
parameters:
- name: state
in: query
schema:
type: string
enum: ["running", "completed", "failed", "canceled", "stopped", "timeout"]
- name: cluster
in: query
schema: { type: string }
- name: start-time
description: 'Syntax: "<from>-<to>", where <from> and <to> are unix timestamps in seconds'
in: query
schema: { type: string }
- name: page
in: query
schema: { type: integer }
- name: items-per-page
in: query
schema: { type: integer }
- name: with-metadata
in: query
schema: { type: boolean }
responses:
200:
description: 'Array of jobs'
content:
'application/json':
schema:
type: object
properties:
jobs:
type: array
items:
$ref: '#/components/schemas/Job'
400:
description: 'Bad Request'
'/api/jobs/tag_job/{id}':
post:
operationId: 'tagJob'
summary: 'Add a tag to a job'
parameters:
- name: id
in: path
required: true
schema: { type: integer }
description: 'Job ID'
requestBody:
description: 'Array of tags to add'
required: true
content:
'application/json':
schema:
type: array
items:
$ref: '#/components/schemas/Tag'
responses:
200:
description: 'Job resource'
content:
'application/json':
schema:
$ref: '#/components/schemas/Job'
404:
description: 'Job or tag does not exist'
400:
description: 'Bad request'
'/api/jobs/start_job/':
post:
operationId: 'startJob'
summary: 'Add a newly started job'
requestBody:
required: true
content:
'application/json':
schema:
$ref: '#/components/schemas/Job'
responses:
201:
description: 'Job successfully'
content:
'application/json':
schema:
type: object
properties:
id:
type: integer
description: 'The database ID assigned to this job'
400:
description: 'Bad request'
422:
description: 'The combination of jobId, clusterId and startTime does already exist'
'/api/jobs/stop_job/':
post:
operationId: stopJobViaJobID
summary: 'Mark a job as stopped. Which job to stop is specified by the request body.'
requestBody:
required: true
content:
'application/json':
schema:
type: object
required: [jobId, cluster, stopTime, jobState]
properties:
jobId: { type: integer }
cluster: { type: string }
startTime: { type: integer }
stopTime: { type: integer }
jobState:
type: string
enum: ["running", "completed", "failed", "canceled", "stopped", "timeout"]
responses:
200:
description: 'Job resource'
content:
'application/json':
schema:
$ref: '#/components/schemas/Job'
400:
description: 'Bad request'
404:
description: 'Resource not found'
'/api/jobs/stop_job/{id}':
post:
operationId: 'stopJobViaDBID'
summary: 'Mark a job as stopped.'
parameters:
- name: id
in: path
required: true
schema: { type: integer }
description: 'Database ID (Resource Identifier)'
requestBody:
required: true
content:
'application/json':
schema:
type: object
required: [stopTime, jobState]
properties:
stopTime: { type: integer }
jobState:
type: string
enum: ["running", "completed", "failed", "canceled", "stopped", "timeout"]
responses:
200:
description: 'Job resource'
content:
'application/json':
schema:
$ref: '#/components/schemas/Job'
400:
description: 'Bad request'
404:
description: 'Resource not found'
'/api/jobs/import/':
post:
operationId: 'importJob'
summary: 'Imports a job and its metric data'
requestBody:
required: true
content:
'application/json':
schema:
type: object
properties:
meta:
$ref: https://raw.githubusercontent.com/ClusterCockpit/cc-specifications/master/schema/json/job-meta.schema.json
data:
$ref: https://raw.githubusercontent.com/ClusterCockpit/cc-specifications/master/schema/json/job-data.schema.json
responses:
200:
description: 'Import successful'
400:
description: 'Bad request'
422:
description: 'Unprocessable Entity'
components:
schemas:
Tag:
description: 'A job tag'
type: object
properties:
id:
type: string
description: 'Database ID'
type:
type: string
description: 'Tag type'
name:
type: string
description: 'Tag name'
Job:
$ref: https://raw.githubusercontent.com/ClusterCockpit/cc-specifications/master/schema/json/job-meta.schema.json
securitySchemes:
bearerAuth:
type: http
scheme: bearer
bearerFormat: JWT
security:
- bearerAuth: [] # Applies `bearerAuth` globally
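
For orientation, a request against the stop_job endpoint described in the removed spec would look roughly like this (host, bearer token, and job identifiers are placeholders; the REST API is presumably documented by the generated swagger files added below):

curl -X POST https://cc-backend.example.com/api/jobs/stop_job/ \
  -H "Authorization: Bearer $JWT" \
  -H "Content-Type: application/json" \
  -d '{"jobId": 1337, "cluster": "testcluster", "stopTime": 1687874883, "jobState": "completed"}'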

interfaces/rest/swagger.json (1408 lines): file diff suppressed because it is too large

interfaces/rest/swagger.yaml (1006 lines): file diff suppressed because it is too large

View File

@@ -1,49 +1,71 @@
DROP TABLE IF EXISTS jobtag;
DROP TABLE IF EXISTS job;
DROP TABLE IF EXISTS tag;
CREATE TABLE job (
id INTEGER PRIMARY KEY /*!40101 AUTO_INCREMENT */,
job_id BIGINT NOT NULL,
cluster VARCHAR(255) NOT NULL,
subcluster VARCHAR(255) NOT NULL,
start_time BIGINT NOT NULL, -- Unix timestamp
user VARCHAR(255) NOT NULL,
project VARCHAR(255) NOT NULL,
` + "`partition`" + ` VARCHAR(255) NOT NULL, -- partition is a keyword in mysql -.-
array_job_id BIGINT NOT NULL,
duration INT NOT NULL DEFAULT 0,
walltime INT NOT NULL DEFAULT 0,
job_state VARCHAR(255) NOT NULL CHECK(job_state IN ('running', 'completed', 'failed', 'cancelled', 'stopped', 'timeout', 'preempted', 'out_of_memory')),
meta_data TEXT, -- JSON
resources TEXT NOT NULL, -- JSON
num_nodes INT NOT NULL,
num_hwthreads INT NOT NULL,
num_acc INT NOT NULL,
smt TINYINT NOT NULL DEFAULT 1 CHECK(smt IN (0, 1 )),
exclusive TINYINT NOT NULL DEFAULT 1 CHECK(exclusive IN (0, 1, 2)),
monitoring_status TINYINT NOT NULL DEFAULT 1 CHECK(monitoring_status IN (0, 1, 2, 3)),
mem_used_max REAL NOT NULL DEFAULT 0.0,
flops_any_avg REAL NOT NULL DEFAULT 0.0,
mem_bw_avg REAL NOT NULL DEFAULT 0.0,
load_avg REAL NOT NULL DEFAULT 0.0,
net_bw_avg REAL NOT NULL DEFAULT 0.0,
net_data_vol_total REAL NOT NULL DEFAULT 0.0,
file_bw_avg REAL NOT NULL DEFAULT 0.0,
file_data_vol_total REAL NOT NULL DEFAULT 0.0);
CREATE TABLE tag (
id INTEGER PRIMARY KEY,
tag_type VARCHAR(255) NOT NULL,
tag_name VARCHAR(255) NOT NULL,
CONSTRAINT be_unique UNIQUE (tag_type, tag_name));
id INTEGER PRIMARY KEY,
tag_type VARCHAR(255) NOT NULL,
tag_name VARCHAR(255) NOT NULL,
insert_ts TEXT DEFAULT CURRENT_TIMESTAMP,
UNIQUE (tag_type, tag_name));
CREATE TABLE jobtag (
job_id INTEGER,
tag_id INTEGER,
PRIMARY KEY (job_id, tag_id),
FOREIGN KEY (job_id) REFERENCES job (id) ON DELETE CASCADE,
FOREIGN KEY (tag_id) REFERENCES tag (id) ON DELETE CASCADE);
job_id INTEGER,
tag_id INTEGER,
insert_ts TEXT DEFAULT CURRENT_TIMESTAMP,
PRIMARY KEY (job_id, tag_id),
FOREIGN KEY (job_id) REFERENCES job (id) ON DELETE CASCADE,
FOREIGN KEY (tag_id) REFERENCES tag (id) ON DELETE CASCADE);
CREATE TABLE user (
username varchar(255) PRIMARY KEY NOT NULL,
password varchar(255) DEFAULT NULL,
ldap tinyint NOT NULL DEFAULT 0, /* col called "ldap" for historic reasons, fills the "AuthSource" */
name varchar(255) DEFAULT NULL,
roles varchar(255) NOT NULL DEFAULT "[]",
email varchar(255) DEFAULT NULL,
projects varchar(255) NOT NULL DEFAULT "[]");
CREATE TABLE configuration (
username varchar(255),
confkey varchar(255),
value varchar(255),
PRIMARY KEY (username, confkey),
FOREIGN KEY (username) REFERENCES user (username) ON DELETE CASCADE ON UPDATE NO ACTION);
CREATE TABLE job (
id INTEGER PRIMARY KEY,
job_id BIGINT NOT NULL,
cluster VARCHAR(255) NOT NULL,
subcluster VARCHAR(255) NOT NULL,
start_time BIGINT NOT NULL, -- Unix timestamp
user VARCHAR(255) NOT NULL,
project VARCHAR(255) NOT NULL,
partition VARCHAR(255),
array_job_id BIGINT,
duration INT NOT NULL,
walltime INT NOT NULL,
job_state VARCHAR(255) NOT NULL
CHECK(job_state IN ('running', 'completed', 'failed', 'cancelled', 'stopped', 'timeout', 'preempted', 'out_of_memory')),
meta_data TEXT, -- JSON
resources TEXT NOT NULL, -- JSON
num_nodes INT NOT NULL,
num_hwthreads INT,
num_acc INT,
smt TINYINT NOT NULL DEFAULT 1 CHECK(smt IN (0, 1 )),
exclusive TINYINT NOT NULL DEFAULT 1 CHECK(exclusive IN (0, 1, 2)),
monitoring_status TINYINT NOT NULL DEFAULT 1 CHECK(monitoring_status IN (0, 1, 2, 3)),
mem_used_max REAL NOT NULL DEFAULT 0.0,
flops_any_avg REAL NOT NULL DEFAULT 0.0,
mem_bw_avg REAL NOT NULL DEFAULT 0.0,
load_avg REAL NOT NULL DEFAULT 0.0,
net_bw_avg REAL NOT NULL DEFAULT 0.0,
net_data_vol_total REAL NOT NULL DEFAULT 0.0,
file_bw_avg REAL NOT NULL DEFAULT 0.0,
file_data_vol_total REAL NOT NULL DEFAULT 0.0,
UNIQUE (job_id, cluster, start_time));
CREATE INDEX job_stats ON job (cluster,subcluster,user);
CREATE INDEX job_by_user ON job (user);
CREATE INDEX job_by_starttime ON job (start_time);
CREATE INDEX job_by_job_id ON job (job_id, cluster, start_time);
CREATE INDEX job_list ON job (cluster, job_state);
CREATE INDEX job_list_user ON job (user, cluster, job_state);
CREATE INDEX job_list_users ON job (user, job_state);
CREATE INDEX job_list_users_start ON job (start_time, user, job_state);
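
As a small usage sketch, the new UNIQUE constraint and the added indexes correspond to lookups such as the following (the literal values are placeholders):

-- Resolve a job by scheduler ID, cluster, and start time (served by the UNIQUE key / job_by_job_id)
SELECT id, job_state, duration
FROM job
WHERE job_id = 1337 AND cluster = 'testcluster' AND start_time = 1687874883;

-- List a user's running jobs on one cluster (served by job_list_user)
SELECT id, job_id, start_time
FROM job
WHERE user = 'hpcuser1' AND cluster = 'testcluster' AND job_state = 'running';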