From 99d55f05f8ab74ca6bc0f65136158749f9c4f626 Mon Sep 17 00:00:00 2001 From: Jan Eitzinger Date: Fri, 8 Mar 2024 16:35:30 +0100 Subject: [PATCH] feat: Add cluster config endpoint to rest api --- api/swagger.json | 263 +++++++++++++++++++++++++++++++++++++++++++ api/swagger.yaml | 172 ++++++++++++++++++++++++++++ internal/api/docs.go | 263 +++++++++++++++++++++++++++++++++++++++++++ internal/api/rest.go | 62 +++++++++- 4 files changed, 756 insertions(+), 4 deletions(-) diff --git a/api/swagger.json b/api/swagger.json index ba296eb..7f5eaf7 100644 --- a/api/swagger.json +++ b/api/swagger.json @@ -17,6 +17,63 @@ "host": "localhost:8080", "basePath": "/api", "paths": { + "/clusters/": { + "get": { + "security": [ + { + "ApiKeyAuth": [] + } + ], + "description": "Get a list of all cluster configs. Specific cluster can be requested using query parameter.", + "produces": [ + "application/json" + ], + "tags": [ + "Cluster query" + ], + "summary": "Lists all cluster configs", + "parameters": [ + { + "type": "string", + "description": "Job Cluster", + "name": "cluster", + "in": "query" + } + ], + "responses": { + "200": { + "description": "Array of clusters", + "schema": { + "$ref": "#/definitions/api.GetClustersApiResponse" + } + }, + "400": { + "description": "Bad Request", + "schema": { + "$ref": "#/definitions/api.ErrorResponse" + } + }, + "401": { + "description": "Unauthorized", + "schema": { + "$ref": "#/definitions/api.ErrorResponse" + } + }, + "403": { + "description": "Forbidden", + "schema": { + "$ref": "#/definitions/api.ErrorResponse" + } + }, + "500": { + "description": "Internal Server Error", + "schema": { + "$ref": "#/definitions/api.ErrorResponse" + } + } + } + } + }, "/jobs/": { "get": { "security": [ @@ -1284,6 +1341,18 @@ } } }, + "api.GetClustersApiResponse": { + "type": "object", + "properties": { + "clusters": { + "description": "Array of clusters", + "type": "array", + "items": { + "$ref": "#/definitions/schema.Cluster" + } + } + } + }, 
"api.GetJobApiResponse": { "type": "object", "properties": { @@ -1379,6 +1448,40 @@ } } }, + "schema.Accelerator": { + "type": "object", + "properties": { + "id": { + "type": "string" + }, + "model": { + "type": "string" + }, + "type": { + "type": "string" + } + } + }, + "schema.Cluster": { + "type": "object", + "properties": { + "metricConfig": { + "type": "array", + "items": { + "$ref": "#/definitions/schema.MetricConfig" + } + }, + "name": { + "type": "string" + }, + "subClusters": { + "type": "array", + "items": { + "$ref": "#/definitions/schema.SubCluster" + } + } + } + }, "schema.Job": { "description": "Information of a HPC job.", "type": "object", @@ -1777,6 +1880,44 @@ } } }, + "schema.MetricConfig": { + "type": "object", + "properties": { + "aggregation": { + "type": "string" + }, + "alert": { + "type": "number" + }, + "caution": { + "type": "number" + }, + "name": { + "type": "string" + }, + "normal": { + "type": "number" + }, + "peak": { + "type": "number" + }, + "scope": { + "$ref": "#/definitions/schema.MetricScope" + }, + "subClusters": { + "type": "array", + "items": { + "$ref": "#/definitions/schema.SubClusterConfig" + } + }, + "timestep": { + "type": "integer" + }, + "unit": { + "$ref": "#/definitions/schema.Unit" + } + } + }, "schema.MetricScope": { "type": "string", "enum": [ @@ -1812,6 +1953,17 @@ } } }, + "schema.MetricValue": { + "type": "object", + "properties": { + "unit": { + "$ref": "#/definitions/schema.Unit" + }, + "value": { + "type": "number" + } + } + }, "schema.Resource": { "description": "A resource used by a job", "type": "object", @@ -1892,6 +2044,64 @@ } } }, + "schema.SubCluster": { + "type": "object", + "properties": { + "coresPerSocket": { + "type": "integer" + }, + "flopRateScalar": { + "$ref": "#/definitions/schema.MetricValue" + }, + "flopRateSimd": { + "$ref": "#/definitions/schema.MetricValue" + }, + "memoryBandwidth": { + "$ref": "#/definitions/schema.MetricValue" + }, + "name": { + "type": "string" + }, + "nodes": { + 
"type": "string" + }, + "processorType": { + "type": "string" + }, + "socketsPerNode": { + "type": "integer" + }, + "threadsPerCore": { + "type": "integer" + }, + "topology": { + "$ref": "#/definitions/schema.Topology" + } + } + }, + "schema.SubClusterConfig": { + "type": "object", + "properties": { + "alert": { + "type": "number" + }, + "caution": { + "type": "number" + }, + "name": { + "type": "string" + }, + "normal": { + "type": "number" + }, + "peak": { + "type": "number" + }, + "remove": { + "type": "boolean" + } + } + }, "schema.Tag": { "description": "Defines a tag using name and type.", "type": "object", @@ -1912,6 +2122,59 @@ } } }, + "schema.Topology": { + "type": "object", + "properties": { + "accelerators": { + "type": "array", + "items": { + "$ref": "#/definitions/schema.Accelerator" + } + }, + "core": { + "type": "array", + "items": { + "type": "array", + "items": { + "type": "integer" + } + } + }, + "die": { + "type": "array", + "items": { + "type": "array", + "items": { + "type": "integer" + } + } + }, + "memoryDomain": { + "type": "array", + "items": { + "type": "array", + "items": { + "type": "integer" + } + } + }, + "node": { + "type": "array", + "items": { + "type": "integer" + } + }, + "socket": { + "type": "array", + "items": { + "type": "array", + "items": { + "type": "integer" + } + } + } + } + }, "schema.Unit": { "type": "object", "properties": { diff --git a/api/swagger.yaml b/api/swagger.yaml index fbb4bdf..f47ac3f 100644 --- a/api/swagger.yaml +++ b/api/swagger.yaml @@ -68,6 +68,14 @@ definitions: description: Statustext of Errorcode type: string type: object + api.GetClustersApiResponse: + properties: + clusters: + description: Array of clusters + items: + $ref: '#/definitions/schema.Cluster' + type: array + type: object api.GetJobApiResponse: properties: data: @@ -133,6 +141,28 @@ definitions: - jobState - stopTime type: object + schema.Accelerator: + properties: + id: + type: string + model: + type: string + type: + type: string + 
type: object + schema.Cluster: + properties: + metricConfig: + items: + $ref: '#/definitions/schema.MetricConfig' + type: array + name: + type: string + subClusters: + items: + $ref: '#/definitions/schema.SubCluster' + type: array + type: object schema.Job: description: Information of a HPC job. properties: @@ -448,6 +478,31 @@ definitions: unit: $ref: '#/definitions/schema.Unit' type: object + schema.MetricConfig: + properties: + aggregation: + type: string + alert: + type: number + caution: + type: number + name: + type: string + normal: + type: number + peak: + type: number + scope: + $ref: '#/definitions/schema.MetricScope' + subClusters: + items: + $ref: '#/definitions/schema.SubClusterConfig' + type: array + timestep: + type: integer + unit: + $ref: '#/definitions/schema.Unit' + type: object schema.MetricScope: enum: - invalid_scope @@ -475,6 +530,13 @@ definitions: min: type: number type: object + schema.MetricValue: + properties: + unit: + $ref: '#/definitions/schema.Unit' + value: + type: number + type: object schema.Resource: description: A resource used by a job properties: @@ -529,6 +591,44 @@ definitions: type: array type: object type: object + schema.SubCluster: + properties: + coresPerSocket: + type: integer + flopRateScalar: + $ref: '#/definitions/schema.MetricValue' + flopRateSimd: + $ref: '#/definitions/schema.MetricValue' + memoryBandwidth: + $ref: '#/definitions/schema.MetricValue' + name: + type: string + nodes: + type: string + processorType: + type: string + socketsPerNode: + type: integer + threadsPerCore: + type: integer + topology: + $ref: '#/definitions/schema.Topology' + type: object + schema.SubClusterConfig: + properties: + alert: + type: number + caution: + type: number + name: + type: string + normal: + type: number + peak: + type: number + remove: + type: boolean + type: object schema.Tag: description: Defines a tag using name and type. 
properties: @@ -544,6 +644,41 @@ definitions: example: Debug type: string type: object + schema.Topology: + properties: + accelerators: + items: + $ref: '#/definitions/schema.Accelerator' + type: array + core: + items: + items: + type: integer + type: array + type: array + die: + items: + items: + type: integer + type: array + type: array + memoryDomain: + items: + items: + type: integer + type: array + type: array + node: + items: + type: integer + type: array + socket: + items: + items: + type: integer + type: array + type: array + type: object schema.Unit: properties: base: @@ -564,6 +699,43 @@ info: title: ClusterCockpit REST API version: 1.0.0 paths: + /clusters/: + get: + description: Get a list of all cluster configs. Specific cluster can be requested + using query parameter. + parameters: + - description: Job Cluster + in: query + name: cluster + type: string + produces: + - application/json + responses: + "200": + description: Array of clusters + schema: + $ref: '#/definitions/api.GetClustersApiResponse' + "400": + description: Bad Request + schema: + $ref: '#/definitions/api.ErrorResponse' + "401": + description: Unauthorized + schema: + $ref: '#/definitions/api.ErrorResponse' + "403": + description: Forbidden + schema: + $ref: '#/definitions/api.ErrorResponse' + "500": + description: Internal Server Error + schema: + $ref: '#/definitions/api.ErrorResponse' + security: + - ApiKeyAuth: [] + summary: Lists all cluster configs + tags: + - Cluster query /jobs/: get: description: |- diff --git a/internal/api/docs.go b/internal/api/docs.go index 1cd5df1..e5ec50b 100644 --- a/internal/api/docs.go +++ b/internal/api/docs.go @@ -23,6 +23,63 @@ const docTemplate = `{ "host": "{{.Host}}", "basePath": "{{.BasePath}}", "paths": { + "/clusters/": { + "get": { + "security": [ + { + "ApiKeyAuth": [] + } + ], + "description": "Get a list of all cluster configs. 
Specific cluster can be requested using query parameter.", + "produces": [ + "application/json" + ], + "tags": [ + "Cluster query" + ], + "summary": "Lists all cluster configs", + "parameters": [ + { + "type": "string", + "description": "Job Cluster", + "name": "cluster", + "in": "query" + } + ], + "responses": { + "200": { + "description": "Array of clusters", + "schema": { + "$ref": "#/definitions/api.GetClustersApiResponse" + } + }, + "400": { + "description": "Bad Request", + "schema": { + "$ref": "#/definitions/api.ErrorResponse" + } + }, + "401": { + "description": "Unauthorized", + "schema": { + "$ref": "#/definitions/api.ErrorResponse" + } + }, + "403": { + "description": "Forbidden", + "schema": { + "$ref": "#/definitions/api.ErrorResponse" + } + }, + "500": { + "description": "Internal Server Error", + "schema": { + "$ref": "#/definitions/api.ErrorResponse" + } + } + } + } + }, "/jobs/": { "get": { "security": [ @@ -1290,6 +1347,18 @@ const docTemplate = `{ } } }, + "api.GetClustersApiResponse": { + "type": "object", + "properties": { + "clusters": { + "description": "Array of clusters", + "type": "array", + "items": { + "$ref": "#/definitions/schema.Cluster" + } + } + } + }, "api.GetJobApiResponse": { "type": "object", "properties": { @@ -1385,6 +1454,40 @@ const docTemplate = `{ } } }, + "schema.Accelerator": { + "type": "object", + "properties": { + "id": { + "type": "string" + }, + "model": { + "type": "string" + }, + "type": { + "type": "string" + } + } + }, + "schema.Cluster": { + "type": "object", + "properties": { + "metricConfig": { + "type": "array", + "items": { + "$ref": "#/definitions/schema.MetricConfig" + } + }, + "name": { + "type": "string" + }, + "subClusters": { + "type": "array", + "items": { + "$ref": "#/definitions/schema.SubCluster" + } + } + } + }, "schema.Job": { "description": "Information of a HPC job.", "type": "object", @@ -1783,6 +1886,44 @@ const docTemplate = `{ } } }, + "schema.MetricConfig": { + "type": "object", + 
"properties": { + "aggregation": { + "type": "string" + }, + "alert": { + "type": "number" + }, + "caution": { + "type": "number" + }, + "name": { + "type": "string" + }, + "normal": { + "type": "number" + }, + "peak": { + "type": "number" + }, + "scope": { + "$ref": "#/definitions/schema.MetricScope" + }, + "subClusters": { + "type": "array", + "items": { + "$ref": "#/definitions/schema.SubClusterConfig" + } + }, + "timestep": { + "type": "integer" + }, + "unit": { + "$ref": "#/definitions/schema.Unit" + } + } + }, "schema.MetricScope": { "type": "string", "enum": [ @@ -1818,6 +1959,17 @@ const docTemplate = `{ } } }, + "schema.MetricValue": { + "type": "object", + "properties": { + "unit": { + "$ref": "#/definitions/schema.Unit" + }, + "value": { + "type": "number" + } + } + }, "schema.Resource": { "description": "A resource used by a job", "type": "object", @@ -1898,6 +2050,64 @@ const docTemplate = `{ } } }, + "schema.SubCluster": { + "type": "object", + "properties": { + "coresPerSocket": { + "type": "integer" + }, + "flopRateScalar": { + "$ref": "#/definitions/schema.MetricValue" + }, + "flopRateSimd": { + "$ref": "#/definitions/schema.MetricValue" + }, + "memoryBandwidth": { + "$ref": "#/definitions/schema.MetricValue" + }, + "name": { + "type": "string" + }, + "nodes": { + "type": "string" + }, + "processorType": { + "type": "string" + }, + "socketsPerNode": { + "type": "integer" + }, + "threadsPerCore": { + "type": "integer" + }, + "topology": { + "$ref": "#/definitions/schema.Topology" + } + } + }, + "schema.SubClusterConfig": { + "type": "object", + "properties": { + "alert": { + "type": "number" + }, + "caution": { + "type": "number" + }, + "name": { + "type": "string" + }, + "normal": { + "type": "number" + }, + "peak": { + "type": "number" + }, + "remove": { + "type": "boolean" + } + } + }, "schema.Tag": { "description": "Defines a tag using name and type.", "type": "object", @@ -1918,6 +2128,59 @@ const docTemplate = `{ } } }, + "schema.Topology": { 
+ "type": "object", + "properties": { + "accelerators": { + "type": "array", + "items": { + "$ref": "#/definitions/schema.Accelerator" + } + }, + "core": { + "type": "array", + "items": { + "type": "array", + "items": { + "type": "integer" + } + } + }, + "die": { + "type": "array", + "items": { + "type": "array", + "items": { + "type": "integer" + } + } + }, + "memoryDomain": { + "type": "array", + "items": { + "type": "array", + "items": { + "type": "integer" + } + } + }, + "node": { + "type": "array", + "items": { + "type": "integer" + } + }, + "socket": { + "type": "array", + "items": { + "type": "array", + "items": { + "type": "integer" + } + } + } + } + }, "schema.Unit": { "type": "object", "properties": { diff --git a/internal/api/rest.go b/internal/api/rest.go index 0d42437..807e7ae 100644 --- a/internal/api/rest.go +++ b/internal/api/rest.go @@ -78,6 +78,8 @@ func (api *RestApi) MountRoutes(r *mux.Router) { r.HandleFunc("/jobs/delete_job/{id}", api.deleteJobById).Methods(http.MethodDelete) r.HandleFunc("/jobs/delete_job_before/{ts}", api.deleteJobBefore).Methods(http.MethodDelete) + r.HandleFunc("/clusters/", api.getClusters).Methods(http.MethodGet) + if api.MachineStateDir != "" { r.HandleFunc("/machine_state/{cluster}/{host}", api.getMachineState).Methods(http.MethodGet) r.HandleFunc("/machine_state/{cluster}/{host}", api.putMachineState).Methods(http.MethodPut, http.MethodPost) @@ -134,6 +136,11 @@ type GetJobsApiResponse struct { Page int `json:"page"` // Page id returned } +// GetClustersApiResponse model +type GetClustersApiResponse struct { + Clusters []*schema.Cluster `json:"clusters"` // Array of clusters +} + // ErrorResponse model type ErrorResponse struct { // Statustext of Errorcode @@ -236,6 +243,55 @@ func securedCheck(r *http.Request) error { return nil } +// getClusters godoc +// @summary Lists all cluster configs +// @tags Cluster query +// @description Get a list of all cluster configs. 
Specific cluster can be requested using query parameter.
+// @produce json
+// @param cluster query string false "Job Cluster"
+// @success 200 {object} api.GetClustersApiResponse "Array of clusters"
+// @failure 400 {object} api.ErrorResponse "Bad Request"
+// @failure 401 {object} api.ErrorResponse "Unauthorized"
+// @failure 403 {object} api.ErrorResponse "Forbidden"
+// @failure 500 {object} api.ErrorResponse "Internal Server Error"
+// @security ApiKeyAuth
+// @router /clusters/ [get]
+func (api *RestApi) getClusters(rw http.ResponseWriter, r *http.Request) {
+	if user := repository.GetUserFromContext(r.Context()); user != nil &&
+		!user.HasRole(schema.RoleApi) { // a nil user passes this check; only a user lacking RoleApi is rejected
+
+		handleError(fmt.Errorf("missing role: %v", schema.GetRoleString(schema.RoleApi)), http.StatusForbidden, rw)
+		return
+	}
+
+	rw.Header().Add("Content-Type", "application/json") // set before the cluster lookup, so it is also present on the error paths below
+	bw := bufio.NewWriter(rw)
+	defer bw.Flush() // runs on every return below, including the ones that follow handleError
+
+	var clusters []*schema.Cluster
+
+	if r.URL.Query().Has("cluster") { // key present (even with an empty value): respond with that single cluster
+		name := r.URL.Query().Get("cluster")
+		cluster := archive.GetCluster(name)
+		if cluster == nil { // a nil lookup result is reported as 400 "unknown cluster"
+			handleError(fmt.Errorf("unknown cluster: %s", name), http.StatusBadRequest, rw)
+			return
+		}
+		clusters = append(clusters, cluster)
+	} else {
+		clusters = archive.Clusters // no "cluster" parameter: respond with archive.Clusters as-is
+	}
+
+	payload := GetClustersApiResponse{
+		Clusters: clusters,
+	}
+
+	if err := json.NewEncoder(bw).Encode(payload); err != nil { // NOTE(review): anything already buffered in bw is flushed after handleError writes to rw — confirm this ordering is intended
+		handleError(err, http.StatusInternalServerError, rw)
+		return
+	}
+}
+
 // getJobs godoc
 // @summary Lists all jobs
 // @tags Job query
@@ -354,10 +410,8 @@ func (api *RestApi) getJobs(rw http.ResponseWriter, r *http.Request) {
 		if res.MonitoringStatus == schema.MonitoringStatusArchivingSuccessful {
 			res.Statistics, err = archive.GetStatistics(job)
 			if err != nil {
-				if err != nil {
-					handleError(err, http.StatusInternalServerError, rw)
-					return
-				}
+				handleError(err, http.StatusInternalServerError, rw)
+				return
 			}
 		}