From f0bccc8229a7d96cd109d3057d1448bd4d0d3f82 Mon Sep 17 00:00:00 2001 From: Jan Eitzinger Date: Tue, 27 Jun 2023 15:08:03 +0200 Subject: [PATCH] Update to cc-backend version 1.0.0 --- datastructures/cluster.schema.json | 81 +- datastructures/job-data.schema.json | 7 +- datastructures/job-meta.schema.json | 8 +- datastructures/job-metric-data.schema.json | 2 +- datastructures/unit.schema.json | 1 - interfaces/graphql/schema.graphqls | 100 +- interfaces/rest/openapi.yaml | 221 --- interfaces/rest/swagger.json | 1408 ++++++++++++++++++++ interfaces/rest/swagger.yaml | 1006 ++++++++++++++ schemas/jobs-sqlite.sql | 114 +- 10 files changed, 2628 insertions(+), 320 deletions(-) delete mode 100644 interfaces/rest/openapi.yaml create mode 100644 interfaces/rest/swagger.json create mode 100644 interfaces/rest/swagger.yaml diff --git a/datastructures/cluster.schema.json b/datastructures/cluster.schema.json index 503d6b8..1d00962 100644 --- a/datastructures/cluster.schema.json +++ b/datastructures/cluster.schema.json @@ -4,7 +4,7 @@ "title": "HPC cluster description", "description": "Meta data information of a HPC cluster", "type": "object", - "properties":{ + "properties": { "name": { "description": "The unique identifier of a cluster", "type": "string" @@ -14,7 +14,7 @@ "type": "array", "items": { "type": "object", - "properties":{ + "properties": { "name": { "description": "Metric name", "type": "string" @@ -39,12 +39,28 @@ "avg" ] }, + "peak": { + "description": "Metric peak threshold (Upper metric limit)", + "type": "number" + }, + "normal": { + "description": "Metric normal threshold", + "type": "number" + }, + "caution": { + "description": "Metric caution threshold (Suspicious but does not require immediate action)", + "type": "number" + }, + "alert": { + "description": "Metric alert threshold (Requires immediate action)", + "type": "number" + }, "subClusters": { "description": "Array of cluster hardware partition metric thresholds", "type": "array", "items": { "type": 
"object", - "properties":{ + "properties": { "name": { "description": "Hardware partition name", "type": "string" @@ -60,13 +76,13 @@ }, "alert": { "type": "number" + }, + "remove": { + "type": "boolean" } }, "required": [ - "name", - "peak", - "caution", - "alert" + "name" ] } } @@ -75,7 +91,12 @@ "name", "unit", "scope", - "timestep" + "timestep", + "aggregation", + "peak", + "normal", + "caution", + "alert" ] }, "minItems": 1 @@ -85,7 +106,7 @@ "type": "array", "items": { "type": "object", - "properties":{ + "properties": { "name": { "description": "Hardware partition name", "type": "string" @@ -108,15 +129,42 @@ }, "flopRateScalar": { "description": "Theoretical node peak flop rate for scalar code in GFlops/s", - "type": "integer" + "type": "object", + "properties": { + "unit": { + "description": "Metric unit", + "$ref": "embedfs://unit.schema.json" + }, + "value": { + "type": "number" + } + } }, "flopRateSimd": { "description": "Theoretical node peak flop rate for SIMD code in GFlops/s", - "type": "integer" + "type": "object", + "properties": { + "unit": { + "description": "Metric unit", + "$ref": "embedfs://unit.schema.json" + }, + "value": { + "type": "number" + } + } }, "memoryBandwidth": { "description": "Theoretical node peak memory bandwidth in GB/s", - "type": "integer" + "type": "object", + "properties": { + "unit": { + "description": "Metric unit", + "$ref": "embedfs://unit.schema.json" + }, + "value": { + "type": "number" + } + } }, "nodes": { "description": "Node list expression", @@ -125,7 +173,7 @@ "topology": { "description": "Node topology", "type": "object", - "properties":{ + "properties": { "node": { "description": "HwTread lists of node", "type": "array", @@ -205,15 +253,16 @@ } } }, - "required":[ + "required": [ "node", "socket", "memoryDomain" ] } }, - "required":[ + "required": [ "name", + "nodes", "topology", "processorType", "socketsPerNode", @@ -227,7 +276,7 @@ "minItems": 1 } }, - "required":[ + "required": [ "name", "metricConfig", 
"subClusters" diff --git a/datastructures/job-data.schema.json b/datastructures/job-data.schema.json index 1ea3110..d2526b1 100644 --- a/datastructures/job-data.schema.json +++ b/datastructures/job-data.schema.json @@ -86,8 +86,8 @@ }, "minProperties": 1 }, - "cpu_used": { - "description": "CPU active core utilization", + "cpu_user": { + "description": "CPU user active core utilization", "properties": { "node": { "$ref": "job-metric-data.schema.json" @@ -479,7 +479,8 @@ ] }, "required": [ - "cpu_used", + "cpu_user", + "cpu_load", "mem_used", "flops_any", "mem_bw", diff --git a/datastructures/job-meta.schema.json b/datastructures/job-meta.schema.json index bc515ff..aa8255f 100644 --- a/datastructures/job-meta.schema.json +++ b/datastructures/job-meta.schema.json @@ -193,8 +193,8 @@ "description": "Instructions executed per cycle", "$ref": "job-metric-statistics.schema.json" }, - "cpu_used": { - "description": "CPU active core utilization", + "cpu_user": { + "description": "CPU user active core utilization", "$ref": "job-metric-statistics.schema.json" }, "flops_dp": { @@ -326,7 +326,8 @@ } }, "required": [ - "cpu_used", + "cpu_user", + "cpu_load", "mem_used", "flops_any", "mem_bw" @@ -338,6 +339,7 @@ "user", "project", "cluster", + "subCluster", "numNodes", "exclusive", "startTime", diff --git a/datastructures/job-metric-data.schema.json b/datastructures/job-metric-data.schema.json index 081360f..f616f8a 100644 --- a/datastructures/job-metric-data.schema.json +++ b/datastructures/job-metric-data.schema.json @@ -193,7 +193,7 @@ }, "data": { "type": "array", - "items": { + "contains": { "type": "number", "minimum": 0 }, diff --git a/datastructures/unit.schema.json b/datastructures/unit.schema.json index a9dd3ca..aa31084 100644 --- a/datastructures/unit.schema.json +++ b/datastructures/unit.schema.json @@ -15,7 +15,6 @@ "F/s", "CPI", "IPC", - "load", "Hz", "W", "°C", diff --git a/interfaces/graphql/schema.graphqls b/interfaces/graphql/schema.graphqls index 
b3fbe29..71a5373 100644 --- a/interfaces/graphql/schema.graphqls +++ b/interfaces/graphql/schema.graphqls @@ -26,32 +26,43 @@ type Job { state: JobState! tags: [Tag!]! resources: [Resource!]! + concurrentJobs: JobLinkResultList metaData: Any userData: User } +type JobLink { + id: ID! + jobId: Int! +} + type Cluster { name: String! partitions: [String!]! # Slurm partitions metricConfig: [MetricConfig!]! - filterRanges: FilterRanges! subClusters: [SubCluster!]! # Hardware partitions/subclusters } type SubCluster { name: String! nodes: String! + numberOfNodes: Int! processorType: String! socketsPerNode: Int! coresPerSocket: Int! threadsPerCore: Int! - flopRateScalar: Int! - flopRateSimd: Int! - memoryBandwidth: Int! + flopRateScalar: MetricValue! + flopRateSimd: MetricValue! + memoryBandwidth: MetricValue! topology: Topology! } +type MetricValue { + unit: Unit! + value: Float! +} + type Topology { node: [Int!] socket: [[Int!]!] @@ -67,15 +78,26 @@ type Accelerator { model: String! } +type SubClusterConfig { + name: String! + peak: Float + normal: Float + caution: Float + alert: Float + remove: Boolean +} + type MetricConfig { - name: String! - unit: String! - scope: MetricScope! - timestep: Int! - peak: Float! - normal: Float! - caution: Float! - alert: Float! + name: String! + unit: Unit! + scope: MetricScope! + aggregation: String! + timestep: Int! + peak: Float! + normal: Float + caution: Float! + alert: Float! + subClusters: [SubClusterConfig!]! } type Tag { @@ -87,18 +109,18 @@ type Tag { type Resource { hostname: String! hwthreads: [Int!] - accelerators: [Int!] + accelerators: [String!] configuration: String } type JobMetricWithName { name: String! + scope: MetricScope! metric: JobMetric! } type JobMetric { - unit: String! - scope: MetricScope! + unit: Unit timestep: Int! series: [Series!] statisticsSeries: StatsSeries @@ -106,11 +128,16 @@ type JobMetric { type Series { hostname: String! 
- id: Int + id: String statistics: MetricStatistics data: [NullableFloat!]! } +type Unit { + base: String! + prefix: String +} + type MetricStatistics { avg: Float! min: Float! @@ -134,10 +161,12 @@ type Footprints { } enum Aggregate { USER, PROJECT, CLUSTER } +enum Weights { NODE_COUNT, NODE_HOURS } type NodeMetrics { - host: String! - metrics: [JobMetricWithName!]! + host: String! + subCluster: String! + metrics: [JobMetricWithName!]! } type Count { @@ -156,6 +185,7 @@ type Query { tags: [Tag!]! # List of all tags user(username: String!): User + allocatedNodes(cluster: String!): [Count!]! job(id: ID!): Job jobMetrics(id: ID!, metrics: [String!], scopes: [MetricScope!]): [JobMetricWithName!]! @@ -163,11 +193,11 @@ type Query { jobs(filter: [JobFilter!], page: PageRequest, order: OrderByInput): JobResultList! jobsStatistics(filter: [JobFilter!], groupBy: Aggregate): [JobsStatistics!]! - jobsCount(filter: [JobFilter]!, groupBy: Aggregate!, limit: Int): [Count!]! + jobsCount(filter: [JobFilter]!, groupBy: Aggregate!, weight: Weights, limit: Int): [Count!]! rooflineHeatmap(filter: [JobFilter!]!, rows: Int!, cols: Int!, minX: Float!, minY: Float!, maxX: Float!, maxY: Float!): [[Float!]!]! - nodeMetrics(cluster: String!, partition: String, nodes: [String!], scopes: [MetricScope!], metrics: [String!], from: Time!, to: Time!): [NodeMetrics!]! + nodeMetrics(cluster: String!, nodes: [String!], scopes: [MetricScope!], metrics: [String!], from: Time!, to: Time!): [NodeMetrics!]! } type Mutation { @@ -182,18 +212,13 @@ type Mutation { type IntRangeOutput { from: Int!, to: Int! } type TimeRangeOutput { from: Time!, to: Time! } -type FilterRanges { - duration: IntRangeOutput! - numNodes: IntRangeOutput! - startTime: TimeRangeOutput! -} - input JobFilter { tags: [ID!] 
jobId: StringInput arrayJobId: Int user: StringInput project: StringInput + jobName: StringInput cluster: StringInput partition: StringInput duration: IntRange @@ -210,6 +235,12 @@ input JobFilter { memBwAvg: FloatRange loadAvg: FloatRange memUsedMax: FloatRange + + exclusive: Int + sharedNode: StringInput + selfJobId: StringInput + selfStartTime: Time + selfDuration: Int } input OrderByInput { @@ -224,9 +255,11 @@ enum SortDirectionEnum { input StringInput { eq: String + neq: String contains: String startsWith: String endsWith: String + in: [String!] } input IntRange { from: Int!, to: Int! } @@ -240,6 +273,11 @@ type JobResultList { count: Int } +type JobLinkResultList { + items: [JobLink!]! + count: Int +} + type HistoPoint { count: Int! value: Int! @@ -247,11 +285,15 @@ type HistoPoint { type JobsStatistics { id: ID! # If `groupBy` was used, ID of the user/project/cluster - totalJobs: Int! # Number of jobs that matched - shortJobs: Int! # Number of jobs with a duration of less than 2 minutes + name: String! # if User-Statistics: Given Name of Account (ID) Owner + totalJobs: Int! # Number of jobs + runningJobs: Int! # Number of running jobs + shortJobs: Int! # Number of jobs with a duration of less than duration totalWalltime: Int! # Sum of the duration of all matched jobs in hours + totalNodeHours: Int! # Sum of the node hours of all matched jobs totalCoreHours: Int! # Sum of the core hours of all matched jobs - histWalltime: [HistoPoint!]! # value: hour, count: number of jobs with a rounded duration of value + totalAccHours: Int! # Sum of the gpu hours of all matched jobs + histDuration: [HistoPoint!]! # value: hour, count: number of jobs with a rounded duration of value histNumNodes: [HistoPoint!]! 
# value: number of nodes, count: number of jobs with that number of nodes } diff --git a/interfaces/rest/openapi.yaml b/interfaces/rest/openapi.yaml deleted file mode 100644 index 2babbf5..0000000 --- a/interfaces/rest/openapi.yaml +++ /dev/null @@ -1,221 +0,0 @@ -# -# ClusterCockpit's API spec can be exported via: -# docker exec -it cc-php php bin/console api:openapi:export --yaml -# -# This spec is written by hand and hopefully up to date with the API. -# - -openapi: 3.0.3 -info: - title: 'ClusterCockpit REST API' - description: 'API for batch job control' - version: 0.0.2 -servers: - - url: / - description: '' -paths: - '/api/jobs/': - get: - operationId: 'getJobs' - summary: 'List all jobs' - description: 'Get a list of all jobs. Filters can be applied using query parameters.' - parameters: - - name: state - in: query - schema: - type: string - enum: ["running", "completed", "failed", "canceled", "stopped", "timeout"] - - name: cluster - in: query - schema: { type: string } - - name: start-time - description: 'Syntax: "-", where and are unix timestamps in seconds' - in: query - schema: { type: string } - - name: page - in: query - schema: { type: integer } - - name: items-per-page - in: query - schema: { type: integer } - - name: with-metadata - in: query - schema: { type: boolean } - responses: - 200: - description: 'Array of jobs' - content: - 'application/json': - schema: - type: object - properties: - jobs: - type: array - items: - $ref: '#/components/schemas/Job' - 400: - description: 'Bad Request' - '/api/jobs/tag_job/{id}': - post: - operationId: 'tagJob' - summary: 'Add a tag to a job' - parameters: - - name: id - in: path - required: true - schema: { type: integer } - description: 'Job ID' - requestBody: - description: 'Array of tags to add' - required: true - content: - 'application/json': - schema: - type: array - items: - $ref: '#/components/schemas/Tag' - responses: - 200: - description: 'Job resource' - content: - 'application/json': - schema: - 
$ref: '#/components/schemas/Job' - 404: - description: 'Job or tag does not exist' - 400: - description: 'Bad request' - '/api/jobs/start_job/': - post: - operationId: 'startJob' - summary: 'Add a newly started job' - requestBody: - required: true - content: - 'application/json': - schema: - $ref: '#/components/schemas/Job' - responses: - 201: - description: 'Job successfully' - content: - 'application/json': - schema: - type: object - properties: - id: - type: integer - description: 'The database ID assigned to this job' - 400: - description: 'Bad request' - 422: - description: 'The combination of jobId, clusterId and startTime does already exist' - '/api/jobs/stop_job/': - post: - operationId: stopJobViaJobID - summary: 'Mark a job as stopped. Which job to stop is specified by the request body.' - requestBody: - required: true - content: - 'application/json': - schema: - type: object - required: [jobId, cluster, stopTime, jobState] - properties: - jobId: { type: integer } - cluster: { type: string } - startTime: { type: integer } - stopTime: { type: integer } - jobState: - type: string - enum: ["running", "completed", "failed", "canceled", "stopped", "timeout"] - responses: - 200: - description: 'Job resource' - content: - 'application/json': - schema: - $ref: '#/components/schemas/Job' - 400: - description: 'Bad request' - 404: - description: 'Resource not found' - '/api/jobs/stop_job/{id}': - post: - operationId: 'stopJobViaDBID' - summary: 'Mark a job as stopped.' 
- parameters: - - name: id - in: path - required: true - schema: { type: integer } - description: 'Database ID (Resource Identifier)' - requestBody: - required: true - content: - 'application/json': - schema: - type: object - required: [stopTime, jobState] - properties: - stopTime: { type: integer } - jobState: - type: string - enum: ["running", "completed", "failed", "canceled", "stopped", "timeout"] - responses: - 200: - description: 'Job resource' - content: - 'application/json': - schema: - $ref: '#/components/schemas/Job' - 400: - description: 'Bad request' - 404: - description: 'Resource not found' - '/api/jobs/import/': - post: - operationId: 'importJob' - summary: 'Imports a job and its metric data' - requestBody: - required: true - content: - 'application/json': - schema: - type: object - properties: - meta: - $ref: https://raw.githubusercontent.com/ClusterCockpit/cc-specifications/master/schema/json/job-meta.schema.json - data: - $ref: https://raw.githubusercontent.com/ClusterCockpit/cc-specifications/master/schema/json/job-data.schema.json - responses: - 200: - description: 'Import successful' - 400: - description: 'Bad request' - 422: - description: 'Unprocessable Entity' -components: - schemas: - Tag: - description: 'A job tag' - type: object - properties: - id: - type: string - description: 'Database ID' - type: - type: string - description: 'Tag type' - name: - type: string - description: 'Tag name' - Job: - $ref: https://raw.githubusercontent.com/ClusterCockpit/cc-specifications/master/schema/json/job-meta.schema.json - securitySchemes: - bearerAuth: - type: http - scheme: bearer - bearerFormat: JWT -security: - - bearerAuth: [] # Applies `bearerAuth` globally \ No newline at end of file diff --git a/interfaces/rest/swagger.json b/interfaces/rest/swagger.json new file mode 100644 index 0000000..87a7de5 --- /dev/null +++ b/interfaces/rest/swagger.json @@ -0,0 +1,1408 @@ +{ + "swagger": "2.0", + "info": { + "description": "API for batch job control.", 
+ "title": "ClusterCockpit REST API", + "contact": { + "name": "ClusterCockpit Project", + "url": "https://github.com/ClusterCockpit", + "email": "support@clustercockpit.org" + }, + "license": { + "name": "MIT License", + "url": "https://opensource.org/licenses/MIT" + }, + "version": "1" + }, + "host": "localhost:8080", + "basePath": "/api", + "paths": { + "/jobs/": { + "get": { + "security": [ + { + "ApiKeyAuth": [] + } + ], + "description": "Get a list of all jobs. Filters can be applied using query parameters.\nNumber of results can be limited by page. Results are sorted by descending startTime.", + "produces": [ + "application/json" + ], + "tags": [ + "query" + ], + "summary": "Lists all jobs", + "parameters": [ + { + "enum": [ + "running", + "completed", + "failed", + "cancelled", + "stopped", + "timeout" + ], + "type": "string", + "description": "Job State", + "name": "state", + "in": "query" + }, + { + "type": "string", + "description": "Job Cluster", + "name": "cluster", + "in": "query" + }, + { + "type": "string", + "description": "Syntax: '$from-$to', as unix epoch timestamps in seconds", + "name": "start-time", + "in": "query" + }, + { + "type": "integer", + "description": "Items per page (Default: 25)", + "name": "items-per-page", + "in": "query" + }, + { + "type": "integer", + "description": "Page Number (Default: 1)", + "name": "page", + "in": "query" + }, + { + "type": "boolean", + "description": "Include metadata (e.g. 
jobScript) in response", + "name": "with-metadata", + "in": "query" + } + ], + "responses": { + "200": { + "description": "Job array and page info", + "schema": { + "$ref": "#/definitions/api.GetJobsApiResponse" + } + }, + "400": { + "description": "Bad Request", + "schema": { + "$ref": "#/definitions/api.ErrorResponse" + } + }, + "401": { + "description": "Unauthorized", + "schema": { + "$ref": "#/definitions/api.ErrorResponse" + } + }, + "403": { + "description": "Forbidden", + "schema": { + "$ref": "#/definitions/api.ErrorResponse" + } + }, + "500": { + "description": "Internal Server Error", + "schema": { + "$ref": "#/definitions/api.ErrorResponse" + } + } + } + } + }, + "/jobs/delete_job/": { + "delete": { + "security": [ + { + "ApiKeyAuth": [] + } + ], + "description": "Job to delete is specified by request body. All fields are required in this case.", + "consumes": [ + "application/json" + ], + "produces": [ + "application/json" + ], + "tags": [ + "remove" + ], + "summary": "Remove a job from the sql database", + "parameters": [ + { + "description": "All fields required", + "name": "request", + "in": "body", + "required": true, + "schema": { + "$ref": "#/definitions/api.DeleteJobApiRequest" + } + } + ], + "responses": { + "200": { + "description": "Success message", + "schema": { + "$ref": "#/definitions/api.DeleteJobApiResponse" + } + }, + "400": { + "description": "Bad Request", + "schema": { + "$ref": "#/definitions/api.ErrorResponse" + } + }, + "401": { + "description": "Unauthorized", + "schema": { + "$ref": "#/definitions/api.ErrorResponse" + } + }, + "403": { + "description": "Forbidden", + "schema": { + "$ref": "#/definitions/api.ErrorResponse" + } + }, + "404": { + "description": "Resource not found", + "schema": { + "$ref": "#/definitions/api.ErrorResponse" + } + }, + "422": { + "description": "Unprocessable Entity: finding job failed: sql: no rows in result set", + "schema": { + "$ref": "#/definitions/api.ErrorResponse" + } + }, + "500": { + 
"description": "Internal Server Error", + "schema": { + "$ref": "#/definitions/api.ErrorResponse" + } + } + } + } + }, + "/jobs/delete_job/{id}": { + "delete": { + "security": [ + { + "ApiKeyAuth": [] + } + ], + "description": "Job to remove is specified by database ID. This will not remove the job from the job archive.", + "produces": [ + "application/json" + ], + "tags": [ + "remove" + ], + "summary": "Remove a job from the sql database", + "parameters": [ + { + "type": "integer", + "description": "Database ID of Job", + "name": "id", + "in": "path", + "required": true + } + ], + "responses": { + "200": { + "description": "Success message", + "schema": { + "$ref": "#/definitions/api.DeleteJobApiResponse" + } + }, + "400": { + "description": "Bad Request", + "schema": { + "$ref": "#/definitions/api.ErrorResponse" + } + }, + "401": { + "description": "Unauthorized", + "schema": { + "$ref": "#/definitions/api.ErrorResponse" + } + }, + "403": { + "description": "Forbidden", + "schema": { + "$ref": "#/definitions/api.ErrorResponse" + } + }, + "404": { + "description": "Resource not found", + "schema": { + "$ref": "#/definitions/api.ErrorResponse" + } + }, + "422": { + "description": "Unprocessable Entity: finding job failed: sql: no rows in result set", + "schema": { + "$ref": "#/definitions/api.ErrorResponse" + } + }, + "500": { + "description": "Internal Server Error", + "schema": { + "$ref": "#/definitions/api.ErrorResponse" + } + } + } + } + }, + "/jobs/delete_job_before/{ts}": { + "delete": { + "security": [ + { + "ApiKeyAuth": [] + } + ], + "description": "Remove all jobs with start time before timestamp. 
The jobs will not be removed from the job archive.", + "produces": [ + "application/json" + ], + "tags": [ + "remove" + ], + "summary": "Remove a job from the sql database", + "parameters": [ + { + "type": "integer", + "description": "Unix epoch timestamp", + "name": "ts", + "in": "path", + "required": true + } + ], + "responses": { + "200": { + "description": "Success message", + "schema": { + "$ref": "#/definitions/api.DeleteJobApiResponse" + } + }, + "400": { + "description": "Bad Request", + "schema": { + "$ref": "#/definitions/api.ErrorResponse" + } + }, + "401": { + "description": "Unauthorized", + "schema": { + "$ref": "#/definitions/api.ErrorResponse" + } + }, + "403": { + "description": "Forbidden", + "schema": { + "$ref": "#/definitions/api.ErrorResponse" + } + }, + "404": { + "description": "Resource not found", + "schema": { + "$ref": "#/definitions/api.ErrorResponse" + } + }, + "422": { + "description": "Unprocessable Entity: finding job failed: sql: no rows in result set", + "schema": { + "$ref": "#/definitions/api.ErrorResponse" + } + }, + "500": { + "description": "Internal Server Error", + "schema": { + "$ref": "#/definitions/api.ErrorResponse" + } + } + } + } + }, + "/jobs/start_job/": { + "post": { + "security": [ + { + "ApiKeyAuth": [] + } + ], + "description": "Job specified in request body will be saved to database as \"running\" with new DB ID.\nJob specifications follow the 'JobMeta' scheme, API will fail to execute if requirements are not met.", + "consumes": [ + "application/json" + ], + "produces": [ + "application/json" + ], + "tags": [ + "add and modify" + ], + "summary": "Adds a new job as \"running\"", + "parameters": [ + { + "description": "Job to add", + "name": "request", + "in": "body", + "required": true, + "schema": { + "$ref": "#/definitions/schema.JobMeta" + } + } + ], + "responses": { + "201": { + "description": "Job added successfully", + "schema": { + "$ref": "#/definitions/api.StartJobApiResponse" + } + }, + "400": { + 
"description": "Bad Request", + "schema": { + "$ref": "#/definitions/api.ErrorResponse" + } + }, + "401": { + "description": "Unauthorized", + "schema": { + "$ref": "#/definitions/api.ErrorResponse" + } + }, + "403": { + "description": "Forbidden", + "schema": { + "$ref": "#/definitions/api.ErrorResponse" + } + }, + "422": { + "description": "Unprocessable Entity: The combination of jobId, clusterId and startTime does already exist", + "schema": { + "$ref": "#/definitions/api.ErrorResponse" + } + }, + "500": { + "description": "Internal Server Error", + "schema": { + "$ref": "#/definitions/api.ErrorResponse" + } + } + } + } + }, + "/jobs/stop_job/": { + "post": { + "security": [ + { + "ApiKeyAuth": [] + } + ], + "description": "Job to stop is specified by request body. All fields are required in this case.\nReturns full job resource information according to 'JobMeta' scheme.", + "produces": [ + "application/json" + ], + "tags": [ + "add and modify" + ], + "summary": "Marks job as completed and triggers archiving", + "parameters": [ + { + "description": "All fields required", + "name": "request", + "in": "body", + "required": true, + "schema": { + "$ref": "#/definitions/api.StopJobApiRequest" + } + } + ], + "responses": { + "200": { + "description": "Success message", + "schema": { + "$ref": "#/definitions/schema.JobMeta" + } + }, + "400": { + "description": "Bad Request", + "schema": { + "$ref": "#/definitions/api.ErrorResponse" + } + }, + "401": { + "description": "Unauthorized", + "schema": { + "$ref": "#/definitions/api.ErrorResponse" + } + }, + "403": { + "description": "Forbidden", + "schema": { + "$ref": "#/definitions/api.ErrorResponse" + } + }, + "404": { + "description": "Resource not found", + "schema": { + "$ref": "#/definitions/api.ErrorResponse" + } + }, + "422": { + "description": "Unprocessable Entity: finding job failed: sql: no rows in result set", + "schema": { + "$ref": "#/definitions/api.ErrorResponse" + } + }, + "500": { + "description": 
"Internal Server Error", + "schema": { + "$ref": "#/definitions/api.ErrorResponse" + } + } + } + } + }, + "/jobs/stop_job/{id}": { + "post": { + "security": [ + { + "ApiKeyAuth": [] + } + ], + "description": "Job to stop is specified by database ID. Only stopTime and final state are required in request body.\nReturns full job resource information according to 'JobMeta' scheme.", + "consumes": [ + "application/json" + ], + "produces": [ + "application/json" + ], + "tags": [ + "add and modify" + ], + "summary": "Marks job as completed and triggers archiving", + "parameters": [ + { + "type": "integer", + "description": "Database ID of Job", + "name": "id", + "in": "path", + "required": true + }, + { + "description": "stopTime and final state in request body", + "name": "request", + "in": "body", + "required": true, + "schema": { + "$ref": "#/definitions/api.StopJobApiRequest" + } + } + ], + "responses": { + "200": { + "description": "Job resource", + "schema": { + "$ref": "#/definitions/schema.JobMeta" + } + }, + "400": { + "description": "Bad Request", + "schema": { + "$ref": "#/definitions/api.ErrorResponse" + } + }, + "401": { + "description": "Unauthorized", + "schema": { + "$ref": "#/definitions/api.ErrorResponse" + } + }, + "403": { + "description": "Forbidden", + "schema": { + "$ref": "#/definitions/api.ErrorResponse" + } + }, + "404": { + "description": "Resource not found", + "schema": { + "$ref": "#/definitions/api.ErrorResponse" + } + }, + "422": { + "description": "Unprocessable Entity: finding job failed: sql: no rows in result set", + "schema": { + "$ref": "#/definitions/api.ErrorResponse" + } + }, + "500": { + "description": "Internal Server Error", + "schema": { + "$ref": "#/definitions/api.ErrorResponse" + } + } + } + } + }, + "/jobs/tag_job/{id}": { + "post": { + "security": [ + { + "ApiKeyAuth": [] + } + ], + "description": "Adds tag(s) to a job specified by DB ID. 
Name and Type of Tag(s) can be chosen freely.\nIf tagged job is already finished: Tag will be written directly to respective archive files.", + "consumes": [ + "application/json" + ], + "produces": [ + "application/json" + ], + "tags": [ + "add and modify" + ], + "summary": "Adds one or more tags to a job", + "parameters": [ + { + "type": "integer", + "description": "Job Database ID", + "name": "id", + "in": "path", + "required": true + }, + { + "description": "Array of tag-objects to add", + "name": "request", + "in": "body", + "required": true, + "schema": { + "type": "array", + "items": { + "$ref": "#/definitions/api.ApiTag" + } + } + } + ], + "responses": { + "200": { + "description": "Updated job resource", + "schema": { + "$ref": "#/definitions/schema.Job" + } + }, + "400": { + "description": "Bad Request", + "schema": { + "$ref": "#/definitions/api.ErrorResponse" + } + }, + "401": { + "description": "Unauthorized", + "schema": { + "$ref": "#/definitions/api.ErrorResponse" + } + }, + "404": { + "description": "Job or tag does not exist", + "schema": { + "$ref": "#/definitions/api.ErrorResponse" + } + }, + "500": { + "description": "Internal Server Error", + "schema": { + "$ref": "#/definitions/api.ErrorResponse" + } + } + } + } + }, + "/jobs/{id}": { + "post": { + "security": [ + { + "ApiKeyAuth": [] + } + ], + "description": "Job to get is specified by database ID\nReturns full job resource information according to 'JobMeta' scheme and all metrics according to 'JobData'.", + "consumes": [ + "application/json" + ], + "produces": [ + "application/json" + ], + "tags": [ + "query" + ], + "summary": "Get complete job meta and metric data", + "parameters": [ + { + "type": "integer", + "description": "Database ID of Job", + "name": "id", + "in": "path", + "required": true + }, + { + "description": "Array of metric names", + "name": "request", + "in": "body", + "required": true, + "schema": { + "type": "array", + "items": { + "type": "string" + } + } + } + ], + 
"responses": { + "200": { + "description": "Job resource", + "schema": { + "$ref": "#/definitions/api.GetJobApiResponse" + } + }, + "400": { + "description": "Bad Request", + "schema": { + "$ref": "#/definitions/api.ErrorResponse" + } + }, + "401": { + "description": "Unauthorized", + "schema": { + "$ref": "#/definitions/api.ErrorResponse" + } + }, + "403": { + "description": "Forbidden", + "schema": { + "$ref": "#/definitions/api.ErrorResponse" + } + }, + "404": { + "description": "Resource not found", + "schema": { + "$ref": "#/definitions/api.ErrorResponse" + } + }, + "422": { + "description": "Unprocessable Entity: finding job failed: sql: no rows in result set", + "schema": { + "$ref": "#/definitions/api.ErrorResponse" + } + }, + "500": { + "description": "Internal Server Error", + "schema": { + "$ref": "#/definitions/api.ErrorResponse" + } + } + } + } + } + }, + "definitions": { + "api.ApiTag": { + "type": "object", + "properties": { + "name": { + "description": "Tag Name", + "type": "string", + "example": "Testjob" + }, + "type": { + "description": "Tag Type", + "type": "string", + "example": "Debug" + } + } + }, + "api.DeleteJobApiRequest": { + "type": "object", + "required": [ + "jobId" + ], + "properties": { + "cluster": { + "description": "Cluster of job", + "type": "string", + "example": "fritz" + }, + "jobId": { + "description": "Cluster Job ID of job", + "type": "integer", + "example": 123000 + }, + "startTime": { + "description": "Start Time of job as epoch", + "type": "integer", + "example": 1649723812 + } + } + }, + "api.DeleteJobApiResponse": { + "type": "object", + "properties": { + "msg": { + "type": "string" + } + } + }, + "api.ErrorResponse": { + "type": "object", + "properties": { + "error": { + "description": "Error Message", + "type": "string" + }, + "status": { + "description": "Statustext of Errorcode", + "type": "string" + } + } + }, + "api.GetJobApiResponse": { + "type": "object", + "properties": { + "data": { + "type": "array", + 
"items": { + "$ref": "#/definitions/api.JobMetricWithName" + } + }, + "meta": { + "$ref": "#/definitions/schema.Job" + } + } + }, + "api.GetJobsApiResponse": { + "type": "object", + "properties": { + "items": { + "description": "Number of jobs returned", + "type": "integer" + }, + "jobs": { + "description": "Array of jobs", + "type": "array", + "items": { + "$ref": "#/definitions/schema.JobMeta" + } + }, + "page": { + "description": "Page id returned", + "type": "integer" + } + } + }, + "api.JobMetricWithName": { + "type": "object", + "properties": { + "metric": { + "$ref": "#/definitions/schema.JobMetric" + }, + "name": { + "type": "string" + }, + "scope": { + "$ref": "#/definitions/schema.MetricScope" + } + } + }, + "api.StartJobApiResponse": { + "type": "object", + "properties": { + "id": { + "description": "Database ID of new job", + "type": "integer" + } + } + }, + "api.StopJobApiRequest": { + "type": "object", + "required": [ + "jobState", + "stopTime" + ], + "properties": { + "cluster": { + "description": "Cluster of job", + "type": "string", + "example": "fritz" + }, + "jobId": { + "description": "Cluster Job ID of job", + "type": "integer", + "example": 123000 + }, + "jobState": { + "description": "Final job state", + "allOf": [ + { + "$ref": "#/definitions/schema.JobState" + } + ], + "example": "completed" + }, + "startTime": { + "description": "Start Time of job as epoch", + "type": "integer", + "example": 1649723812 + }, + "stopTime": { + "description": "Stop Time of job as epoch", + "type": "integer", + "example": 1649763839 + } + } + }, + "schema.Job": { + "description": "Information of a HPC job.", + "type": "object", + "properties": { + "arrayJobId": { + "description": "The unique identifier of an array job", + "type": "integer", + "example": 123000 + }, + "cluster": { + "description": "The unique identifier of a cluster", + "type": "string", + "example": "fritz" + }, + "concurrentJobs": { + "$ref": "#/definitions/schema.JobLinkResultList" + }, + 
"duration": { + "description": "Duration of job in seconds (Min \u003e 0)", + "type": "integer", + "minimum": 1, + "example": 43200 + }, + "exclusive": { + "description": "Specifies how nodes are shared: 0 - Shared among multiple jobs of multiple users, 1 - Job exclusive (Default), 2 - Shared among multiple jobs of same user", + "type": "integer", + "maximum": 2, + "minimum": 0, + "example": 1 + }, + "id": { + "description": "The unique identifier of a job in the database", + "type": "integer" + }, + "jobId": { + "description": "The unique identifier of a job", + "type": "integer", + "example": 123000 + }, + "jobState": { + "description": "Final state of job", + "enum": [ + "completed", + "failed", + "cancelled", + "stopped", + "timeout", + "out_of_memory" + ], + "allOf": [ + { + "$ref": "#/definitions/schema.JobState" + } + ], + "example": "completed" + }, + "metaData": { + "description": "Additional information about the job", + "type": "object", + "additionalProperties": { + "type": "string" + } + }, + "monitoringStatus": { + "description": "State of monitoring system during job run: 0 - Disabled, 1 - Running or Archiving (Default), 2 - Archiving Failed, 3 - Archiving Successfull", + "type": "integer", + "maximum": 3, + "minimum": 0, + "example": 1 + }, + "numAcc": { + "description": "Number of accelerators used (Min \u003e 0)", + "type": "integer", + "minimum": 1, + "example": 2 + }, + "numHwthreads": { + "description": "NumCores int32 `json:\"numCores\" db:\"num_cores\" example:\"20\" minimum:\"1\"` // Number of HWThreads used (Min \u003e 0)", + "type": "integer", + "minimum": 1, + "example": 20 + }, + "numNodes": { + "description": "Number of nodes used (Min \u003e 0)", + "type": "integer", + "minimum": 1, + "example": 2 + }, + "partition": { + "description": "The Slurm partition to which the job was submitted", + "type": "string", + "example": "main" + }, + "project": { + "description": "The unique identifier of a project", + "type": "string", + "example": 
"abcd200" + }, + "resources": { + "description": "Resources used by job", + "type": "array", + "items": { + "$ref": "#/definitions/schema.Resource" + } + }, + "smt": { + "description": "SMT threads used by job", + "type": "integer", + "example": 4 + }, + "startTime": { + "description": "Start time as 'time.Time' data type", + "type": "string" + }, + "subCluster": { + "description": "The unique identifier of a sub cluster", + "type": "string", + "example": "main" + }, + "tags": { + "description": "List of tags", + "type": "array", + "items": { + "$ref": "#/definitions/schema.Tag" + } + }, + "user": { + "description": "The unique identifier of a user", + "type": "string", + "example": "abcd100h" + }, + "walltime": { + "description": "Requested walltime of job in seconds (Min \u003e 0)", + "type": "integer", + "minimum": 1, + "example": 86400 + } + } + }, + "schema.JobLink": { + "type": "object", + "properties": { + "id": { + "type": "integer" + }, + "jobId": { + "type": "integer" + } + } + }, + "schema.JobLinkResultList": { + "type": "object", + "properties": { + "count": { + "type": "integer" + }, + "items": { + "type": "array", + "items": { + "$ref": "#/definitions/schema.JobLink" + } + } + } + }, + "schema.JobMeta": { + "description": "Meta data information of a HPC job.", + "type": "object", + "properties": { + "arrayJobId": { + "description": "The unique identifier of an array job", + "type": "integer", + "example": 123000 + }, + "cluster": { + "description": "The unique identifier of a cluster", + "type": "string", + "example": "fritz" + }, + "concurrentJobs": { + "$ref": "#/definitions/schema.JobLinkResultList" + }, + "duration": { + "description": "Duration of job in seconds (Min \u003e 0)", + "type": "integer", + "minimum": 1, + "example": 43200 + }, + "exclusive": { + "description": "Specifies how nodes are shared: 0 - Shared among multiple jobs of multiple users, 1 - Job exclusive (Default), 2 - Shared among multiple jobs of same user", + "type": 
"integer", + "maximum": 2, + "minimum": 0, + "example": 1 + }, + "id": { + "description": "The unique identifier of a job in the database", + "type": "integer" + }, + "jobId": { + "description": "The unique identifier of a job", + "type": "integer", + "example": 123000 + }, + "jobState": { + "description": "Final state of job", + "enum": [ + "completed", + "failed", + "cancelled", + "stopped", + "timeout", + "out_of_memory" + ], + "allOf": [ + { + "$ref": "#/definitions/schema.JobState" + } + ], + "example": "completed" + }, + "metaData": { + "description": "Additional information about the job", + "type": "object", + "additionalProperties": { + "type": "string" + } + }, + "monitoringStatus": { + "description": "State of monitoring system during job run: 0 - Disabled, 1 - Running or Archiving (Default), 2 - Archiving Failed, 3 - Archiving Successfull", + "type": "integer", + "maximum": 3, + "minimum": 0, + "example": 1 + }, + "numAcc": { + "description": "Number of accelerators used (Min \u003e 0)", + "type": "integer", + "minimum": 1, + "example": 2 + }, + "numHwthreads": { + "description": "NumCores int32 `json:\"numCores\" db:\"num_cores\" example:\"20\" minimum:\"1\"` // Number of HWThreads used (Min \u003e 0)", + "type": "integer", + "minimum": 1, + "example": 20 + }, + "numNodes": { + "description": "Number of nodes used (Min \u003e 0)", + "type": "integer", + "minimum": 1, + "example": 2 + }, + "partition": { + "description": "The Slurm partition to which the job was submitted", + "type": "string", + "example": "main" + }, + "project": { + "description": "The unique identifier of a project", + "type": "string", + "example": "abcd200" + }, + "resources": { + "description": "Resources used by job", + "type": "array", + "items": { + "$ref": "#/definitions/schema.Resource" + } + }, + "smt": { + "description": "SMT threads used by job", + "type": "integer", + "example": 4 + }, + "startTime": { + "description": "Start epoch time stamp in seconds (Min \u003e 0)", + 
"type": "integer", + "minimum": 1, + "example": 1649723812 + }, + "statistics": { + "description": "Metric statistics of job", + "type": "object", + "additionalProperties": { + "$ref": "#/definitions/schema.JobStatistics" + } + }, + "subCluster": { + "description": "The unique identifier of a sub cluster", + "type": "string", + "example": "main" + }, + "tags": { + "description": "List of tags", + "type": "array", + "items": { + "$ref": "#/definitions/schema.Tag" + } + }, + "user": { + "description": "The unique identifier of a user", + "type": "string", + "example": "abcd100h" + }, + "walltime": { + "description": "Requested walltime of job in seconds (Min \u003e 0)", + "type": "integer", + "minimum": 1, + "example": 86400 + } + } + }, + "schema.JobMetric": { + "type": "object", + "properties": { + "series": { + "type": "array", + "items": { + "$ref": "#/definitions/schema.Series" + } + }, + "statisticsSeries": { + "$ref": "#/definitions/schema.StatsSeries" + }, + "timestep": { + "type": "integer" + }, + "unit": { + "$ref": "#/definitions/schema.Unit" + } + } + }, + "schema.JobState": { + "type": "string", + "enum": [ + "running", + "completed", + "failed", + "cancelled", + "stopped", + "timeout", + "preempted", + "out_of_memory" + ], + "x-enum-varnames": [ + "JobStateRunning", + "JobStateCompleted", + "JobStateFailed", + "JobStateCancelled", + "JobStateStopped", + "JobStateTimeout", + "JobStatePreempted", + "JobStateOutOfMemory" + ] + }, + "schema.JobStatistics": { + "description": "Specification for job metric statistics.", + "type": "object", + "properties": { + "avg": { + "description": "Job metric average", + "type": "number", + "minimum": 0, + "example": 2500 + }, + "max": { + "description": "Job metric maximum", + "type": "number", + "minimum": 0, + "example": 3000 + }, + "min": { + "description": "Job metric minimum", + "type": "number", + "minimum": 0, + "example": 2000 + }, + "unit": { + "$ref": "#/definitions/schema.Unit" + } + } + }, + 
"schema.MetricScope": { + "type": "string", + "enum": [ + "invalid_scope", + "node", + "socket", + "memoryDomain", + "core", + "hwthread", + "accelerator" + ], + "x-enum-varnames": [ + "MetricScopeInvalid", + "MetricScopeNode", + "MetricScopeSocket", + "MetricScopeMemoryDomain", + "MetricScopeCore", + "MetricScopeHWThread", + "MetricScopeAccelerator" + ] + }, + "schema.MetricStatistics": { + "type": "object", + "properties": { + "avg": { + "type": "number" + }, + "max": { + "type": "number" + }, + "min": { + "type": "number" + } + } + }, + "schema.Resource": { + "description": "A resource used by a job", + "type": "object", + "properties": { + "accelerators": { + "description": "List of of accelerator device ids", + "type": "array", + "items": { + "type": "string" + } + }, + "configuration": { + "description": "The configuration options of the node", + "type": "string" + }, + "hostname": { + "description": "Name of the host (= node)", + "type": "string" + }, + "hwthreads": { + "description": "List of OS processor ids", + "type": "array", + "items": { + "type": "integer" + } + } + } + }, + "schema.Series": { + "type": "object", + "properties": { + "data": { + "type": "array", + "items": { + "type": "number" + } + }, + "hostname": { + "type": "string" + }, + "id": { + "type": "string" + }, + "statistics": { + "$ref": "#/definitions/schema.MetricStatistics" + } + } + }, + "schema.StatsSeries": { + "type": "object", + "properties": { + "max": { + "type": "array", + "items": { + "type": "number" + } + }, + "mean": { + "type": "array", + "items": { + "type": "number" + } + }, + "min": { + "type": "array", + "items": { + "type": "number" + } + }, + "percentiles": { + "type": "object", + "additionalProperties": { + "type": "array", + "items": { + "type": "number" + } + } + } + } + }, + "schema.Tag": { + "description": "Defines a tag using name and type.", + "type": "object", + "properties": { + "id": { + "description": "The unique DB identifier of a tag\nThe unique DB 
identifier of a tag", + "type": "integer" + }, + "name": { + "description": "Tag Name", + "type": "string", + "example": "Testjob" + }, + "type": { + "description": "Tag Type", + "type": "string", + "example": "Debug" + } + } + }, + "schema.Unit": { + "type": "object", + "properties": { + "base": { + "type": "string" + }, + "prefix": { + "type": "string" + } + } + } + }, + "securityDefinitions": { + "ApiKeyAuth": { + "type": "apiKey", + "name": "X-Auth-Token", + "in": "header" + } + }, + "tags": [ + { + "name": "Job API" + } + ] +} \ No newline at end of file diff --git a/interfaces/rest/swagger.yaml b/interfaces/rest/swagger.yaml new file mode 100644 index 0000000..093266d --- /dev/null +++ b/interfaces/rest/swagger.yaml @@ -0,0 +1,1006 @@ +basePath: /api +definitions: + api.ApiTag: + properties: + name: + description: Tag Name + example: Testjob + type: string + type: + description: Tag Type + example: Debug + type: string + type: object + api.DeleteJobApiRequest: + properties: + cluster: + description: Cluster of job + example: fritz + type: string + jobId: + description: Cluster Job ID of job + example: 123000 + type: integer + startTime: + description: Start Time of job as epoch + example: 1649723812 + type: integer + required: + - jobId + type: object + api.DeleteJobApiResponse: + properties: + msg: + type: string + type: object + api.ErrorResponse: + properties: + error: + description: Error Message + type: string + status: + description: Statustext of Errorcode + type: string + type: object + api.GetJobApiResponse: + properties: + data: + items: + $ref: '#/definitions/api.JobMetricWithName' + type: array + meta: + $ref: '#/definitions/schema.Job' + type: object + api.GetJobsApiResponse: + properties: + items: + description: Number of jobs returned + type: integer + jobs: + description: Array of jobs + items: + $ref: '#/definitions/schema.JobMeta' + type: array + page: + description: Page id returned + type: integer + type: object + api.JobMetricWithName: + 
properties: + metric: + $ref: '#/definitions/schema.JobMetric' + name: + type: string + scope: + $ref: '#/definitions/schema.MetricScope' + type: object + api.StartJobApiResponse: + properties: + id: + description: Database ID of new job + type: integer + type: object + api.StopJobApiRequest: + properties: + cluster: + description: Cluster of job + example: fritz + type: string + jobId: + description: Cluster Job ID of job + example: 123000 + type: integer + jobState: + allOf: + - $ref: '#/definitions/schema.JobState' + description: Final job state + example: completed + startTime: + description: Start Time of job as epoch + example: 1649723812 + type: integer + stopTime: + description: Stop Time of job as epoch + example: 1649763839 + type: integer + required: + - jobState + - stopTime + type: object + schema.Job: + description: Information of a HPC job. + properties: + arrayJobId: + description: The unique identifier of an array job + example: 123000 + type: integer + cluster: + description: The unique identifier of a cluster + example: fritz + type: string + concurrentJobs: + $ref: '#/definitions/schema.JobLinkResultList' + duration: + description: Duration of job in seconds (Min > 0) + example: 43200 + minimum: 1 + type: integer + exclusive: + description: 'Specifies how nodes are shared: 0 - Shared among multiple jobs + of multiple users, 1 - Job exclusive (Default), 2 - Shared among multiple + jobs of same user' + example: 1 + maximum: 2 + minimum: 0 + type: integer + id: + description: The unique identifier of a job in the database + type: integer + jobId: + description: The unique identifier of a job + example: 123000 + type: integer + jobState: + allOf: + - $ref: '#/definitions/schema.JobState' + description: Final state of job + enum: + - completed + - failed + - cancelled + - stopped + - timeout + - out_of_memory + example: completed + metaData: + additionalProperties: + type: string + description: Additional information about the job + type: object + 
monitoringStatus: + description: 'State of monitoring system during job run: 0 - Disabled, 1 - + Running or Archiving (Default), 2 - Archiving Failed, 3 - Archiving Successfull' + example: 1 + maximum: 3 + minimum: 0 + type: integer + numAcc: + description: Number of accelerators used (Min > 0) + example: 2 + minimum: 1 + type: integer + numHwthreads: + description: NumCores int32 `json:"numCores" db:"num_cores" + example:"20" minimum:"1"` // + Number of HWThreads used (Min > 0) + example: 20 + minimum: 1 + type: integer + numNodes: + description: Number of nodes used (Min > 0) + example: 2 + minimum: 1 + type: integer + partition: + description: The Slurm partition to which the job was submitted + example: main + type: string + project: + description: The unique identifier of a project + example: abcd200 + type: string + resources: + description: Resources used by job + items: + $ref: '#/definitions/schema.Resource' + type: array + smt: + description: SMT threads used by job + example: 4 + type: integer + startTime: + description: Start time as 'time.Time' data type + type: string + subCluster: + description: The unique identifier of a sub cluster + example: main + type: string + tags: + description: List of tags + items: + $ref: '#/definitions/schema.Tag' + type: array + user: + description: The unique identifier of a user + example: abcd100h + type: string + walltime: + description: Requested walltime of job in seconds (Min > 0) + example: 86400 + minimum: 1 + type: integer + type: object + schema.JobLink: + properties: + id: + type: integer + jobId: + type: integer + type: object + schema.JobLinkResultList: + properties: + count: + type: integer + items: + items: + $ref: '#/definitions/schema.JobLink' + type: array + type: object + schema.JobMeta: + description: Meta data information of a HPC job. 
+ properties: + arrayJobId: + description: The unique identifier of an array job + example: 123000 + type: integer + cluster: + description: The unique identifier of a cluster + example: fritz + type: string + concurrentJobs: + $ref: '#/definitions/schema.JobLinkResultList' + duration: + description: Duration of job in seconds (Min > 0) + example: 43200 + minimum: 1 + type: integer + exclusive: + description: 'Specifies how nodes are shared: 0 - Shared among multiple jobs + of multiple users, 1 - Job exclusive (Default), 2 - Shared among multiple + jobs of same user' + example: 1 + maximum: 2 + minimum: 0 + type: integer + id: + description: The unique identifier of a job in the database + type: integer + jobId: + description: The unique identifier of a job + example: 123000 + type: integer + jobState: + allOf: + - $ref: '#/definitions/schema.JobState' + description: Final state of job + enum: + - completed + - failed + - cancelled + - stopped + - timeout + - out_of_memory + example: completed + metaData: + additionalProperties: + type: string + description: Additional information about the job + type: object + monitoringStatus: + description: 'State of monitoring system during job run: 0 - Disabled, 1 - + Running or Archiving (Default), 2 - Archiving Failed, 3 - Archiving Successfull' + example: 1 + maximum: 3 + minimum: 0 + type: integer + numAcc: + description: Number of accelerators used (Min > 0) + example: 2 + minimum: 1 + type: integer + numHwthreads: + description: NumCores int32 `json:"numCores" db:"num_cores" + example:"20" minimum:"1"` // + Number of HWThreads used (Min > 0) + example: 20 + minimum: 1 + type: integer + numNodes: + description: Number of nodes used (Min > 0) + example: 2 + minimum: 1 + type: integer + partition: + description: The Slurm partition to which the job was submitted + example: main + type: string + project: + description: The unique identifier of a project + example: abcd200 + type: string + resources: + description: Resources 
used by job + items: + $ref: '#/definitions/schema.Resource' + type: array + smt: + description: SMT threads used by job + example: 4 + type: integer + startTime: + description: Start epoch time stamp in seconds (Min > 0) + example: 1649723812 + minimum: 1 + type: integer + statistics: + additionalProperties: + $ref: '#/definitions/schema.JobStatistics' + description: Metric statistics of job + type: object + subCluster: + description: The unique identifier of a sub cluster + example: main + type: string + tags: + description: List of tags + items: + $ref: '#/definitions/schema.Tag' + type: array + user: + description: The unique identifier of a user + example: abcd100h + type: string + walltime: + description: Requested walltime of job in seconds (Min > 0) + example: 86400 + minimum: 1 + type: integer + type: object + schema.JobMetric: + properties: + series: + items: + $ref: '#/definitions/schema.Series' + type: array + statisticsSeries: + $ref: '#/definitions/schema.StatsSeries' + timestep: + type: integer + unit: + $ref: '#/definitions/schema.Unit' + type: object + schema.JobState: + enum: + - running + - completed + - failed + - cancelled + - stopped + - timeout + - preempted + - out_of_memory + type: string + x-enum-varnames: + - JobStateRunning + - JobStateCompleted + - JobStateFailed + - JobStateCancelled + - JobStateStopped + - JobStateTimeout + - JobStatePreempted + - JobStateOutOfMemory + schema.JobStatistics: + description: Specification for job metric statistics. 
+ properties: + avg: + description: Job metric average + example: 2500 + minimum: 0 + type: number + max: + description: Job metric maximum + example: 3000 + minimum: 0 + type: number + min: + description: Job metric minimum + example: 2000 + minimum: 0 + type: number + unit: + $ref: '#/definitions/schema.Unit' + type: object + schema.MetricScope: + enum: + - invalid_scope + - node + - socket + - memoryDomain + - core + - hwthread + - accelerator + type: string + x-enum-varnames: + - MetricScopeInvalid + - MetricScopeNode + - MetricScopeSocket + - MetricScopeMemoryDomain + - MetricScopeCore + - MetricScopeHWThread + - MetricScopeAccelerator + schema.MetricStatistics: + properties: + avg: + type: number + max: + type: number + min: + type: number + type: object + schema.Resource: + description: A resource used by a job + properties: + accelerators: + description: List of of accelerator device ids + items: + type: string + type: array + configuration: + description: The configuration options of the node + type: string + hostname: + description: Name of the host (= node) + type: string + hwthreads: + description: List of OS processor ids + items: + type: integer + type: array + type: object + schema.Series: + properties: + data: + items: + type: number + type: array + hostname: + type: string + id: + type: string + statistics: + $ref: '#/definitions/schema.MetricStatistics' + type: object + schema.StatsSeries: + properties: + max: + items: + type: number + type: array + mean: + items: + type: number + type: array + min: + items: + type: number + type: array + percentiles: + additionalProperties: + items: + type: number + type: array + type: object + type: object + schema.Tag: + description: Defines a tag using name and type. 
+ properties: + id: + description: |- + The unique DB identifier of a tag + The unique DB identifier of a tag + type: integer + name: + description: Tag Name + example: Testjob + type: string + type: + description: Tag Type + example: Debug + type: string + type: object + schema.Unit: + properties: + base: + type: string + prefix: + type: string + type: object +host: localhost:8080 +info: + contact: + email: support@clustercockpit.org + name: ClusterCockpit Project + url: https://github.com/ClusterCockpit + description: API for batch job control. + license: + name: MIT License + url: https://opensource.org/licenses/MIT + title: ClusterCockpit REST API + version: "1" +paths: + /jobs/: + get: + description: |- + Get a list of all jobs. Filters can be applied using query parameters. + Number of results can be limited by page. Results are sorted by descending startTime. + parameters: + - description: Job State + enum: + - running + - completed + - failed + - cancelled + - stopped + - timeout + in: query + name: state + type: string + - description: Job Cluster + in: query + name: cluster + type: string + - description: 'Syntax: ''$from-$to'', as unix epoch timestamps in seconds' + in: query + name: start-time + type: string + - description: 'Items per page (Default: 25)' + in: query + name: items-per-page + type: integer + - description: 'Page Number (Default: 1)' + in: query + name: page + type: integer + - description: Include metadata (e.g. 
jobScript) in response + in: query + name: with-metadata + type: boolean + produces: + - application/json + responses: + "200": + description: Job array and page info + schema: + $ref: '#/definitions/api.GetJobsApiResponse' + "400": + description: Bad Request + schema: + $ref: '#/definitions/api.ErrorResponse' + "401": + description: Unauthorized + schema: + $ref: '#/definitions/api.ErrorResponse' + "403": + description: Forbidden + schema: + $ref: '#/definitions/api.ErrorResponse' + "500": + description: Internal Server Error + schema: + $ref: '#/definitions/api.ErrorResponse' + security: + - ApiKeyAuth: [] + summary: Lists all jobs + tags: + - query + /jobs/{id}: + post: + consumes: + - application/json + description: |- + Job to get is specified by database ID + Returns full job resource information according to 'JobMeta' scheme and all metrics according to 'JobData'. + parameters: + - description: Database ID of Job + in: path + name: id + required: true + type: integer + - description: Array of metric names + in: body + name: request + required: true + schema: + items: + type: string + type: array + produces: + - application/json + responses: + "200": + description: Job resource + schema: + $ref: '#/definitions/api.GetJobApiResponse' + "400": + description: Bad Request + schema: + $ref: '#/definitions/api.ErrorResponse' + "401": + description: Unauthorized + schema: + $ref: '#/definitions/api.ErrorResponse' + "403": + description: Forbidden + schema: + $ref: '#/definitions/api.ErrorResponse' + "404": + description: Resource not found + schema: + $ref: '#/definitions/api.ErrorResponse' + "422": + description: 'Unprocessable Entity: finding job failed: sql: no rows in + result set' + schema: + $ref: '#/definitions/api.ErrorResponse' + "500": + description: Internal Server Error + schema: + $ref: '#/definitions/api.ErrorResponse' + security: + - ApiKeyAuth: [] + summary: Get complete job meta and metric data + tags: + - query + /jobs/delete_job/: + delete: + 
consumes: + - application/json + description: Job to delete is specified by request body. All fields are required + in this case. + parameters: + - description: All fields required + in: body + name: request + required: true + schema: + $ref: '#/definitions/api.DeleteJobApiRequest' + produces: + - application/json + responses: + "200": + description: Success message + schema: + $ref: '#/definitions/api.DeleteJobApiResponse' + "400": + description: Bad Request + schema: + $ref: '#/definitions/api.ErrorResponse' + "401": + description: Unauthorized + schema: + $ref: '#/definitions/api.ErrorResponse' + "403": + description: Forbidden + schema: + $ref: '#/definitions/api.ErrorResponse' + "404": + description: Resource not found + schema: + $ref: '#/definitions/api.ErrorResponse' + "422": + description: 'Unprocessable Entity: finding job failed: sql: no rows in + result set' + schema: + $ref: '#/definitions/api.ErrorResponse' + "500": + description: Internal Server Error + schema: + $ref: '#/definitions/api.ErrorResponse' + security: + - ApiKeyAuth: [] + summary: Remove a job from the sql database + tags: + - remove + /jobs/delete_job/{id}: + delete: + description: Job to remove is specified by database ID. This will not remove + the job from the job archive. 
+ parameters: + - description: Database ID of Job + in: path + name: id + required: true + type: integer + produces: + - application/json + responses: + "200": + description: Success message + schema: + $ref: '#/definitions/api.DeleteJobApiResponse' + "400": + description: Bad Request + schema: + $ref: '#/definitions/api.ErrorResponse' + "401": + description: Unauthorized + schema: + $ref: '#/definitions/api.ErrorResponse' + "403": + description: Forbidden + schema: + $ref: '#/definitions/api.ErrorResponse' + "404": + description: Resource not found + schema: + $ref: '#/definitions/api.ErrorResponse' + "422": + description: 'Unprocessable Entity: finding job failed: sql: no rows in + result set' + schema: + $ref: '#/definitions/api.ErrorResponse' + "500": + description: Internal Server Error + schema: + $ref: '#/definitions/api.ErrorResponse' + security: + - ApiKeyAuth: [] + summary: Remove a job from the sql database + tags: + - remove + /jobs/delete_job_before/{ts}: + delete: + description: Remove all jobs with start time before timestamp. The jobs will + not be removed from the job archive. 
+ parameters: + - description: Unix epoch timestamp + in: path + name: ts + required: true + type: integer + produces: + - application/json + responses: + "200": + description: Success message + schema: + $ref: '#/definitions/api.DeleteJobApiResponse' + "400": + description: Bad Request + schema: + $ref: '#/definitions/api.ErrorResponse' + "401": + description: Unauthorized + schema: + $ref: '#/definitions/api.ErrorResponse' + "403": + description: Forbidden + schema: + $ref: '#/definitions/api.ErrorResponse' + "404": + description: Resource not found + schema: + $ref: '#/definitions/api.ErrorResponse' + "422": + description: 'Unprocessable Entity: finding job failed: sql: no rows in + result set' + schema: + $ref: '#/definitions/api.ErrorResponse' + "500": + description: Internal Server Error + schema: + $ref: '#/definitions/api.ErrorResponse' + security: + - ApiKeyAuth: [] + summary: Remove a job from the sql database + tags: + - remove + /jobs/start_job/: + post: + consumes: + - application/json + description: |- + Job specified in request body will be saved to database as "running" with new DB ID. + Job specifications follow the 'JobMeta' scheme, API will fail to execute if requirements are not met. 
+ parameters: + - description: Job to add + in: body + name: request + required: true + schema: + $ref: '#/definitions/schema.JobMeta' + produces: + - application/json + responses: + "201": + description: Job added successfully + schema: + $ref: '#/definitions/api.StartJobApiResponse' + "400": + description: Bad Request + schema: + $ref: '#/definitions/api.ErrorResponse' + "401": + description: Unauthorized + schema: + $ref: '#/definitions/api.ErrorResponse' + "403": + description: Forbidden + schema: + $ref: '#/definitions/api.ErrorResponse' + "422": + description: 'Unprocessable Entity: The combination of jobId, clusterId + and startTime does already exist' + schema: + $ref: '#/definitions/api.ErrorResponse' + "500": + description: Internal Server Error + schema: + $ref: '#/definitions/api.ErrorResponse' + security: + - ApiKeyAuth: [] + summary: Adds a new job as "running" + tags: + - add and modify + /jobs/stop_job/: + post: + description: |- + Job to stop is specified by request body. All fields are required in this case. + Returns full job resource information according to 'JobMeta' scheme. 
+ parameters: + - description: All fields required + in: body + name: request + required: true + schema: + $ref: '#/definitions/api.StopJobApiRequest' + produces: + - application/json + responses: + "200": + description: Success message + schema: + $ref: '#/definitions/schema.JobMeta' + "400": + description: Bad Request + schema: + $ref: '#/definitions/api.ErrorResponse' + "401": + description: Unauthorized + schema: + $ref: '#/definitions/api.ErrorResponse' + "403": + description: Forbidden + schema: + $ref: '#/definitions/api.ErrorResponse' + "404": + description: Resource not found + schema: + $ref: '#/definitions/api.ErrorResponse' + "422": + description: 'Unprocessable Entity: finding job failed: sql: no rows in + result set' + schema: + $ref: '#/definitions/api.ErrorResponse' + "500": + description: Internal Server Error + schema: + $ref: '#/definitions/api.ErrorResponse' + security: + - ApiKeyAuth: [] + summary: Marks job as completed and triggers archiving + tags: + - add and modify + /jobs/stop_job/{id}: + post: + consumes: + - application/json + description: |- + Job to stop is specified by database ID. Only stopTime and final state are required in request body. + Returns full job resource information according to 'JobMeta' scheme. 
+ parameters: + - description: Database ID of Job + in: path + name: id + required: true + type: integer + - description: stopTime and final state in request body + in: body + name: request + required: true + schema: + $ref: '#/definitions/api.StopJobApiRequest' + produces: + - application/json + responses: + "200": + description: Job resource + schema: + $ref: '#/definitions/schema.JobMeta' + "400": + description: Bad Request + schema: + $ref: '#/definitions/api.ErrorResponse' + "401": + description: Unauthorized + schema: + $ref: '#/definitions/api.ErrorResponse' + "403": + description: Forbidden + schema: + $ref: '#/definitions/api.ErrorResponse' + "404": + description: Resource not found + schema: + $ref: '#/definitions/api.ErrorResponse' + "422": + description: 'Unprocessable Entity: finding job failed: sql: no rows in + result set' + schema: + $ref: '#/definitions/api.ErrorResponse' + "500": + description: Internal Server Error + schema: + $ref: '#/definitions/api.ErrorResponse' + security: + - ApiKeyAuth: [] + summary: Marks job as completed and triggers archiving + tags: + - add and modify + /jobs/tag_job/{id}: + post: + consumes: + - application/json + description: |- + Adds tag(s) to a job specified by DB ID. Name and Type of Tag(s) can be chosen freely. + If tagged job is already finished: Tag will be written directly to respective archive files. 
+      parameters:
+      - description: Job Database ID
+        in: path
+        name: id
+        required: true
+        type: integer
+      - description: Array of tag-objects to add
+        in: body
+        name: request
+        required: true
+        schema:
+          items:
+            $ref: '#/definitions/api.ApiTag'
+          type: array
+      produces:
+      - application/json
+      responses:
+        "200":
+          description: Updated job resource
+          schema:
+            $ref: '#/definitions/schema.Job'
+        "400":
+          description: Bad Request
+          schema:
+            $ref: '#/definitions/api.ErrorResponse'
+        "401":
+          description: Unauthorized
+          schema:
+            $ref: '#/definitions/api.ErrorResponse'
+        "404":
+          description: Job or tag does not exist
+          schema:
+            $ref: '#/definitions/api.ErrorResponse'
+        "500":
+          description: Internal Server Error
+          schema:
+            $ref: '#/definitions/api.ErrorResponse'
+      security:
+      - ApiKeyAuth: []
+      summary: Adds one or more tags to a job
+      tags:
+      - add and modify
+securityDefinitions:
+  ApiKeyAuth:
+    in: header
+    name: X-Auth-Token
+    type: apiKey
+swagger: "2.0"
+tags:
+- name: Job API
diff --git a/schemas/jobs-sqlite.sql b/schemas/jobs-sqlite.sql
index 67bdd5a..ab37924 100644
--- a/schemas/jobs-sqlite.sql
+++ b/schemas/jobs-sqlite.sql
@@ -1,49 +1,71 @@
-DROP TABLE IF EXISTS jobtag;
-DROP TABLE IF EXISTS job;
-DROP TABLE IF EXISTS tag;
-
-CREATE TABLE job (
-    id INTEGER PRIMARY KEY /*!40101 AUTO_INCREMENT */,
-    job_id BIGINT NOT NULL,
-    cluster VARCHAR(255) NOT NULL,
-    subcluster VARCHAR(255) NOT NULL,
-    start_time BIGINT NOT NULL, -- Unix timestamp
-
-    user VARCHAR(255) NOT NULL,
-    project VARCHAR(255) NOT NULL,
-    ` + "`partition`" + ` VARCHAR(255) NOT NULL, -- partition is a keyword in mysql -.-
-    array_job_id BIGINT NOT NULL,
-    duration INT NOT NULL DEFAULT 0,
-    walltime INT NOT NULL DEFAULT 0,
-    job_state VARCHAR(255) NOT NULL CHECK(job_state IN ('running', 'completed', 'failed', 'cancelled', 'stopped', 'timeout', 'preempted', 'out_of_memory')),
-    meta_data TEXT, -- JSON
-    resources TEXT NOT NULL, -- JSON
-
-    num_nodes INT NOT NULL,
-    num_hwthreads INT NOT NULL,
-    num_acc INT NOT NULL,
-    smt TINYINT NOT NULL DEFAULT 1 CHECK(smt IN (0, 1 )),
-    exclusive TINYINT NOT NULL DEFAULT 1 CHECK(exclusive IN (0, 1, 2)),
-    monitoring_status TINYINT NOT NULL DEFAULT 1 CHECK(monitoring_status IN (0, 1, 2, 3)),
-
-    mem_used_max REAL NOT NULL DEFAULT 0.0,
-    flops_any_avg REAL NOT NULL DEFAULT 0.0,
-    mem_bw_avg REAL NOT NULL DEFAULT 0.0,
-    load_avg REAL NOT NULL DEFAULT 0.0,
-    net_bw_avg REAL NOT NULL DEFAULT 0.0,
-    net_data_vol_total REAL NOT NULL DEFAULT 0.0,
-    file_bw_avg REAL NOT NULL DEFAULT 0.0,
-    file_data_vol_total REAL NOT NULL DEFAULT 0.0);
-
 CREATE TABLE tag (
-    id INTEGER PRIMARY KEY,
-    tag_type VARCHAR(255) NOT NULL,
-    tag_name VARCHAR(255) NOT NULL,
-    CONSTRAINT be_unique UNIQUE (tag_type, tag_name));
+id INTEGER PRIMARY KEY,
+tag_type VARCHAR(255) NOT NULL,
+tag_name VARCHAR(255) NOT NULL,
+insert_ts TEXT DEFAULT CURRENT_TIMESTAMP, -- defaults to the insertion time when not supplied
+UNIQUE (tag_type, tag_name)); -- a (tag_type, tag_name) pair may exist only once
 
 CREATE TABLE jobtag (
-    job_id INTEGER,
-    tag_id INTEGER,
-    PRIMARY KEY (job_id, tag_id),
-    FOREIGN KEY (job_id) REFERENCES job (id) ON DELETE CASCADE,
-    FOREIGN KEY (tag_id) REFERENCES tag (id) ON DELETE CASCADE);
+job_id INTEGER, -- references job.id (the database id, not job.job_id)
+tag_id INTEGER,
+insert_ts TEXT DEFAULT CURRENT_TIMESTAMP, -- defaults to the insertion time when not supplied
+PRIMARY KEY (job_id, tag_id),
+FOREIGN KEY (job_id) REFERENCES job (id) ON DELETE CASCADE,
+FOREIGN KEY (tag_id) REFERENCES tag (id) ON DELETE CASCADE);
+
+CREATE TABLE user (
+username varchar(255) PRIMARY KEY NOT NULL,
+password varchar(255) DEFAULT NULL,
+ldap tinyint NOT NULL DEFAULT 0, /* col called "ldap" for historic reasons, fills the "AuthSource" */
+name varchar(255) DEFAULT NULL,
+roles varchar(255) NOT NULL DEFAULT '[]', -- presumably a JSON array stored as text (TODO confirm)
+email varchar(255) DEFAULT NULL,
+projects varchar(255) NOT NULL DEFAULT '[]'); -- presumably a JSON array stored as text (TODO confirm)
+
+CREATE TABLE configuration (
+username varchar(255),
+confkey varchar(255),
+value varchar(255),
+PRIMARY KEY (username, confkey), -- one value per user and configuration key
+FOREIGN KEY (username) REFERENCES user (username) ON DELETE CASCADE ON UPDATE NO ACTION);
+
+CREATE TABLE job (
+id INTEGER PRIMARY KEY,
+job_id BIGINT NOT NULL,
+cluster VARCHAR(255) NOT NULL,
+subcluster VARCHAR(255) NOT NULL,
+start_time BIGINT NOT NULL, -- Unix timestamp
+user VARCHAR(255) NOT NULL,
+project VARCHAR(255) NOT NULL,
+partition VARCHAR(255),
+array_job_id BIGINT,
+duration INT NOT NULL,
+walltime INT NOT NULL,
+job_state VARCHAR(255) NOT NULL
+CHECK(job_state IN ('running', 'completed', 'failed', 'cancelled', 'stopped', 'timeout', 'preempted', 'out_of_memory')),
+meta_data TEXT, -- JSON
+resources TEXT NOT NULL, -- JSON
+num_nodes INT NOT NULL,
+num_hwthreads INT,
+num_acc INT,
+smt TINYINT NOT NULL DEFAULT 1 CHECK(smt IN (0, 1 )),
+exclusive TINYINT NOT NULL DEFAULT 1 CHECK(exclusive IN (0, 1, 2)),
+monitoring_status TINYINT NOT NULL DEFAULT 1 CHECK(monitoring_status IN (0, 1, 2, 3)),
+mem_used_max REAL NOT NULL DEFAULT 0.0,
+flops_any_avg REAL NOT NULL DEFAULT 0.0,
+mem_bw_avg REAL NOT NULL DEFAULT 0.0,
+load_avg REAL NOT NULL DEFAULT 0.0,
+net_bw_avg REAL NOT NULL DEFAULT 0.0,
+net_data_vol_total REAL NOT NULL DEFAULT 0.0,
+file_bw_avg REAL NOT NULL DEFAULT 0.0,
+file_data_vol_total REAL NOT NULL DEFAULT 0.0,
+UNIQUE (job_id, cluster, start_time)); -- no two rows may share job_id, cluster and start_time
+
+CREATE INDEX job_stats ON job (cluster,subcluster,user);
+CREATE INDEX job_by_user ON job (user);
+CREATE INDEX job_by_starttime ON job (start_time);
+CREATE INDEX job_by_job_id ON job (job_id, cluster, start_time); -- NOTE(review): same columns as the UNIQUE constraint on job, which SQLite already backs with an automatic index
+CREATE INDEX job_list ON job (cluster, job_state);
+CREATE INDEX job_list_user ON job (user, cluster, job_state);
+CREATE INDEX job_list_users ON job (user, job_state);
+CREATE INDEX job_list_users_start ON job (start_time, user, job_state);