diff --git a/api/swagger.json b/api/swagger.json index 0327a91d..42ed7bb6 100644 --- a/api/swagger.json +++ b/api/swagger.json @@ -18,11 +18,6 @@ "paths": { "/api/clusters/": { "get": { - "security": [ - { - "ApiKeyAuth": [] - } - ], "description": "Get a list of all cluster configs. Specific cluster can be requested using query parameter.", "produces": [ "application/json" @@ -43,7 +38,7 @@ "200": { "description": "Array of clusters", "schema": { - "$ref": "#/definitions/api.GetClustersApiResponse" + "$ref": "#/definitions/api.GetClustersAPIResponse" } }, "400": { @@ -70,16 +65,16 @@ "$ref": "#/definitions/api.ErrorResponse" } } - } - } - }, - "/api/jobs/": { - "get": { + }, "security": [ { "ApiKeyAuth": [] } - ], + ] + } + }, + "/api/jobs/": { + "get": { "description": "Get a list of all jobs. Filters can be applied using query parameters.\nNumber of results can be limited by page. Results are sorted by descending startTime.", "produces": [ "application/json" @@ -138,7 +133,7 @@ "200": { "description": "Job array and page info", "schema": { - "$ref": "#/definitions/api.GetJobsApiResponse" + "$ref": "#/definitions/api.GetJobsAPIResponse" } }, "400": { @@ -165,16 +160,16 @@ "$ref": "#/definitions/api.ErrorResponse" } } - } - } - }, - "/api/jobs/delete_job/": { - "delete": { + }, "security": [ { "ApiKeyAuth": [] } - ], + ] + } + }, + "/api/jobs/delete_job/": { + "delete": { "description": "Job to delete is specified by request body. 
All fields are required in this case.", "consumes": [ "application/json" @@ -193,7 +188,7 @@ "in": "body", "required": true, "schema": { - "$ref": "#/definitions/api.DeleteJobApiRequest" + "$ref": "#/definitions/api.DeleteJobAPIRequest" } } ], @@ -201,7 +196,7 @@ "200": { "description": "Success message", "schema": { - "$ref": "#/definitions/api.DefaultApiResponse" + "$ref": "#/definitions/api.DefaultAPIResponse" } }, "400": { @@ -240,16 +235,16 @@ "$ref": "#/definitions/api.ErrorResponse" } } - } - } - }, - "/api/jobs/delete_job/{id}": { - "delete": { + }, "security": [ { "ApiKeyAuth": [] } - ], + ] + } + }, + "/api/jobs/delete_job/{id}": { + "delete": { "description": "Job to remove is specified by database ID. This will not remove the job from the job archive.", "produces": [ "application/json" @@ -271,7 +266,7 @@ "200": { "description": "Success message", "schema": { - "$ref": "#/definitions/api.DefaultApiResponse" + "$ref": "#/definitions/api.DefaultAPIResponse" } }, "400": { @@ -310,16 +305,16 @@ "$ref": "#/definitions/api.ErrorResponse" } } - } - } - }, - "/api/jobs/delete_job_before/{ts}": { - "delete": { + }, "security": [ { "ApiKeyAuth": [] } - ], + ] + } + }, + "/api/jobs/delete_job_before/{ts}": { + "delete": { "description": "Remove all jobs with start time before timestamp. 
The jobs will not be removed from the job archive.", "produces": [ "application/json" @@ -335,13 +330,19 @@ "name": "ts", "in": "path", "required": true + }, + { + "type": "boolean", + "description": "Omit jobs with tags from deletion", + "name": "omit-tagged", + "in": "query" } ], "responses": { "200": { "description": "Success message", "schema": { - "$ref": "#/definitions/api.DefaultApiResponse" + "$ref": "#/definitions/api.DefaultAPIResponse" } }, "400": { @@ -380,16 +381,16 @@ "$ref": "#/definitions/api.ErrorResponse" } } - } - } - }, - "/api/jobs/edit_meta/{id}": { - "post": { + }, "security": [ { "ApiKeyAuth": [] } - ], + ] + } + }, + "/api/jobs/edit_meta/{id}": { + "post": { "description": "Edit key value pairs in job metadata json\nIf a key already exists its content will be overwritten", "consumes": [ "application/json" @@ -450,16 +451,16 @@ "$ref": "#/definitions/api.ErrorResponse" } } - } - } - }, - "/api/jobs/start_job/": { - "post": { + }, "security": [ { "ApiKeyAuth": [] } - ], + ] + } + }, + "/api/jobs/start_job/": { + "post": { "description": "Job specified in request body will be saved to database as \"running\" with new DB ID.\nJob specifications follow the 'Job' scheme, API will fail to execute if requirements are not met.", "consumes": [ "application/json" @@ -486,7 +487,7 @@ "201": { "description": "Job added successfully", "schema": { - "$ref": "#/definitions/api.DefaultApiResponse" + "$ref": "#/definitions/api.DefaultAPIResponse" } }, "400": { @@ -519,16 +520,16 @@ "$ref": "#/definitions/api.ErrorResponse" } } - } - } - }, - "/api/jobs/stop_job/": { - "post": { + }, "security": [ { "ApiKeyAuth": [] } - ], + ] + } + }, + "/api/jobs/stop_job/": { + "post": { "description": "Job to stop is specified by request body. 
All fields are required in this case.\nReturns full job resource information according to 'Job' scheme.", "produces": [ "application/json" @@ -544,7 +545,7 @@ "in": "body", "required": true, "schema": { - "$ref": "#/definitions/api.StopJobApiRequest" + "$ref": "#/definitions/api.StopJobAPIRequest" } } ], @@ -591,16 +592,16 @@ "$ref": "#/definitions/api.ErrorResponse" } } - } - } - }, - "/api/jobs/tag_job/{id}": { - "post": { + }, "security": [ { "ApiKeyAuth": [] } - ], + ] + } + }, + "/api/jobs/tag_job/{id}": { + "post": { "description": "Adds tag(s) to a job specified by DB ID. Name and Type of Tag(s) can be chosen freely.\nTag Scope for frontend visibility will default to \"global\" if none entered, other options: \"admin\" or specific username.\nIf tagged job is already finished: Tag will be written directly to respective archive files.", "consumes": [ "application/json" @@ -628,7 +629,7 @@ "schema": { "type": "array", "items": { - "$ref": "#/definitions/api.ApiTag" + "$ref": "#/definitions/api.APITag" } } } @@ -664,16 +665,16 @@ "$ref": "#/definitions/api.ErrorResponse" } } - } - } - }, - "/api/jobs/{id}": { - "get": { + }, "security": [ { "ApiKeyAuth": [] } - ], + ] + } + }, + "/api/jobs/{id}": { + "get": { "description": "Job to get is specified by database ID\nReturns full job resource information according to 'Job' scheme and all metrics according to 'JobData'.", "produces": [ "application/json" @@ -701,7 +702,7 @@ "200": { "description": "Job resource", "schema": { - "$ref": "#/definitions/api.GetJobApiResponse" + "$ref": "#/definitions/api.GetJobAPIResponse" } }, "400": { @@ -740,14 +741,14 @@ "$ref": "#/definitions/api.ErrorResponse" } } - } - }, - "post": { + }, "security": [ { "ApiKeyAuth": [] } - ], + ] + }, + "post": { "description": "Job to get is specified by database ID\nReturns full job resource information according to 'Job' scheme and all metrics according to 'JobData'.", "consumes": [ "application/json" @@ -784,7 +785,7 @@ "200": { 
"description": "Job resource", "schema": { - "$ref": "#/definitions/api.GetJobApiResponse" + "$ref": "#/definitions/api.GetJobAPIResponse" } }, "400": { @@ -823,16 +824,16 @@ "$ref": "#/definitions/api.ErrorResponse" } } - } - } - }, - "/api/nodestats/": { - "post": { + }, "security": [ { "ApiKeyAuth": [] } - ], + ] + } + }, + "/api/nodestats/": { + "post": { "description": "Returns a JSON-encoded list of users.\nRequired query-parameter defines if all users or only users with additional special roles are returned.", "produces": [ "application/json" @@ -856,7 +857,7 @@ "200": { "description": "Success message", "schema": { - "$ref": "#/definitions/api.DefaultApiResponse" + "$ref": "#/definitions/api.DefaultAPIResponse" } }, "400": { @@ -883,16 +884,86 @@ "$ref": "#/definitions/api.ErrorResponse" } } - } - } - }, - "/api/users/": { - "get": { + }, "security": [ { "ApiKeyAuth": [] } + ] + } + }, + "/api/user/{id}": { + "post": { + "description": "Allows admins to add/remove roles and projects for a user", + "produces": [ + "text/plain" ], + "tags": [ + "User" + ], + "summary": "Update user roles and projects", + "parameters": [ + { + "type": "string", + "description": "Username", + "name": "id", + "in": "path", + "required": true + }, + { + "type": "string", + "description": "Role to add", + "name": "add-role", + "in": "formData" + }, + { + "type": "string", + "description": "Role to remove", + "name": "remove-role", + "in": "formData" + }, + { + "type": "string", + "description": "Project to add", + "name": "add-project", + "in": "formData" + }, + { + "type": "string", + "description": "Project to remove", + "name": "remove-project", + "in": "formData" + } + ], + "responses": { + "200": { + "description": "Success message", + "schema": { + "type": "string" + } + }, + "403": { + "description": "Forbidden", + "schema": { + "$ref": "#/definitions/api.ErrorResponse" + } + }, + "422": { + "description": "Unprocessable Entity", + "schema": { + "$ref": 
"#/definitions/api.ErrorResponse" + } + } + }, + "security": [ + { + "ApiKeyAuth": [] + } + ] + } + }, + "/api/users/": { + "get": { "description": "Returns a JSON-encoded list of users.\nRequired query-parameter defines if all users or only users with additional special roles are returned.", "produces": [ "application/json" @@ -916,7 +987,7 @@ "schema": { "type": "array", "items": { - "$ref": "#/definitions/api.ApiReturnedUser" + "$ref": "#/definitions/api.APIReturnedUser" } } }, @@ -944,16 +1015,361 @@ "type": "string" } } - } - } - }, - "/jobs/tag_job/{id}": { - "delete": { + }, "security": [ { "ApiKeyAuth": [] } + ] + }, + "post": { + "description": "Creates a new user with specified credentials and role", + "produces": [ + "text/plain" ], + "tags": [ + "User" + ], + "summary": "Create a new user", + "parameters": [ + { + "type": "string", + "description": "Username", + "name": "username", + "in": "formData", + "required": true + }, + { + "type": "string", + "description": "Password (not required for API users)", + "name": "password", + "in": "formData" + }, + { + "type": "string", + "description": "User role", + "name": "role", + "in": "formData", + "required": true + }, + { + "type": "string", + "description": "Full name", + "name": "name", + "in": "formData" + }, + { + "type": "string", + "description": "Email address", + "name": "email", + "in": "formData" + }, + { + "type": "string", + "description": "Project (required for managers)", + "name": "project", + "in": "formData" + } + ], + "responses": { + "200": { + "description": "Success message", + "schema": { + "type": "string" + } + }, + "400": { + "description": "Bad Request", + "schema": { + "$ref": "#/definitions/api.ErrorResponse" + } + }, + "403": { + "description": "Forbidden", + "schema": { + "$ref": "#/definitions/api.ErrorResponse" + } + }, + "422": { + "description": "Unprocessable Entity", + "schema": { + "$ref": "#/definitions/api.ErrorResponse" + } + } + }, + "security": [ + { + "ApiKeyAuth": 
[] + } + ] + }, + "delete": { + "description": "Deletes a user from the system", + "produces": [ + "text/plain" + ], + "tags": [ + "User" + ], + "summary": "Delete a user", + "parameters": [ + { + "type": "string", + "description": "Username to delete", + "name": "username", + "in": "formData", + "required": true + } + ], + "responses": { + "200": { + "description": "Success", + "schema": { + "type": "string" + } + }, + "403": { + "description": "Forbidden", + "schema": { + "$ref": "#/definitions/api.ErrorResponse" + } + }, + "422": { + "description": "Unprocessable Entity", + "schema": { + "$ref": "#/definitions/api.ErrorResponse" + } + } + }, + "security": [ + { + "ApiKeyAuth": [] + } + ] + } + }, + "/configuration/": { + "post": { + "description": "Updates a user's configuration key-value pair.", + "consumes": [ + "multipart/form-data" + ], + "produces": [ + "text/plain" + ], + "tags": [ + "Frontend" + ], + "summary": "Update user configuration", + "parameters": [ + { + "type": "string", + "description": "Configuration key", + "name": "key", + "in": "formData", + "required": true + }, + { + "type": "string", + "description": "Configuration value", + "name": "value", + "in": "formData", + "required": true + } + ], + "responses": { + "200": { + "description": "success", + "schema": { + "type": "string" + } + }, + "500": { + "description": "Internal Server Error", + "schema": { + "$ref": "#/definitions/api.ErrorResponse" + } + } + }, + "security": [ + { + "ApiKeyAuth": [] + } + ] + } + }, + "/debug/": { + "post": { + "description": "This endpoint allows the users to print the content of", + "produces": [ + "application/json" + ], + "tags": [ + "debug" + ], + "summary": "Debug endpoint", + "parameters": [ + { + "type": "string", + "description": "Selector", + "name": "selector", + "in": "query" + } + ], + "responses": { + "200": { + "description": "Debug dump", + "schema": { + "type": "string" + } + }, + "400": { + "description": "Bad Request", + "schema": { + 
"$ref": "#/definitions/api.ErrorResponse" + } + }, + "401": { + "description": "Unauthorized", + "schema": { + "$ref": "#/definitions/api.ErrorResponse" + } + }, + "403": { + "description": "Forbidden", + "schema": { + "$ref": "#/definitions/api.ErrorResponse" + } + }, + "500": { + "description": "Internal Server Error", + "schema": { + "$ref": "#/definitions/api.ErrorResponse" + } + } + }, + "security": [ + { + "ApiKeyAuth": [] + } + ] + } + }, + "/free/": { + "post": { + "description": "This endpoint allows the users to free the Buffers from the", + "produces": [ + "application/json" + ], + "tags": [ + "free" + ], + "parameters": [ + { + "type": "string", + "description": "up to timestamp", + "name": "to", + "in": "query" + } + ], + "responses": { + "200": { + "description": "ok", + "schema": { + "type": "string" + } + }, + "400": { + "description": "Bad Request", + "schema": { + "$ref": "#/definitions/api.ErrorResponse" + } + }, + "401": { + "description": "Unauthorized", + "schema": { + "$ref": "#/definitions/api.ErrorResponse" + } + }, + "403": { + "description": "Forbidden", + "schema": { + "$ref": "#/definitions/api.ErrorResponse" + } + }, + "500": { + "description": "Internal Server Error", + "schema": { + "$ref": "#/definitions/api.ErrorResponse" + } + } + }, + "security": [ + { + "ApiKeyAuth": [] + } + ] + } + }, + "/healthcheck/": { + "get": { + "description": "This endpoint allows the users to check if a node is healthy", + "produces": [ + "application/json" + ], + "tags": [ + "healthcheck" + ], + "summary": "HealthCheck endpoint", + "parameters": [ + { + "type": "string", + "description": "Selector", + "name": "selector", + "in": "query" + } + ], + "responses": { + "200": { + "description": "Debug dump", + "schema": { + "type": "string" + } + }, + "400": { + "description": "Bad Request", + "schema": { + "$ref": "#/definitions/api.ErrorResponse" + } + }, + "401": { + "description": "Unauthorized", + "schema": { + "$ref": 
"#/definitions/api.ErrorResponse" + } + }, + "403": { + "description": "Forbidden", + "schema": { + "$ref": "#/definitions/api.ErrorResponse" + } + }, + "500": { + "description": "Internal Server Error", + "schema": { + "$ref": "#/definitions/api.ErrorResponse" + } + } + }, + "security": [ + { + "ApiKeyAuth": [] + } + ] + } + }, + "/jobs/tag_job/{id}": { + "delete": { "description": "Removes tag(s) from a job specified by DB ID. Name and Type of Tag(s) must match.\nTag Scope is required for matching, options: \"global\", \"admin\". Private tags can not be deleted via API.\nIf tagged job is already finished: Tag will be removed from respective archive files.", "consumes": [ "application/json" @@ -981,7 +1397,7 @@ "schema": { "type": "array", "items": { - "$ref": "#/definitions/api.ApiTag" + "$ref": "#/definitions/api.APITag" } } } @@ -1017,16 +1433,276 @@ "$ref": "#/definitions/api.ErrorResponse" } } - } - } - }, - "/tags/": { - "delete": { + }, "security": [ { "ApiKeyAuth": [] } + ] + } + }, + "/jwt/": { + "get": { + "description": "Generates a JWT token for a user. 
Admins can generate tokens for any user, regular users only for themselves.", + "consumes": [ + "multipart/form-data" ], + "produces": [ + "text/plain" + ], + "tags": [ + "Frontend" + ], + "summary": "Generate JWT token", + "parameters": [ + { + "type": "string", + "description": "Username to generate JWT for", + "name": "username", + "in": "formData", + "required": true + } + ], + "responses": { + "200": { + "description": "JWT token", + "schema": { + "type": "string" + } + }, + "403": { + "description": "Forbidden", + "schema": { + "$ref": "#/definitions/api.ErrorResponse" + } + }, + "404": { + "description": "User Not Found", + "schema": { + "$ref": "#/definitions/api.ErrorResponse" + } + }, + "500": { + "description": "Internal Server Error", + "schema": { + "$ref": "#/definitions/api.ErrorResponse" + } + } + }, + "security": [ + { + "ApiKeyAuth": [] + } + ] + } + }, + "/machine_state/{cluster}/{host}": { + "get": { + "description": "Retrieves stored machine state data for a specific cluster node. Validates cluster and host names to prevent path traversal.", + "produces": [ + "application/json" + ], + "tags": [ + "Machine State" + ], + "summary": "Retrieve machine state", + "parameters": [ + { + "type": "string", + "description": "Cluster name", + "name": "cluster", + "in": "path", + "required": true + }, + { + "type": "string", + "description": "Host name", + "name": "host", + "in": "path", + "required": true + } + ], + "responses": { + "200": { + "description": "Machine state JSON data", + "schema": { + "type": "object" + } + }, + "400": { + "description": "Bad Request", + "schema": { + "$ref": "#/definitions/api.ErrorResponse" + } + }, + "404": { + "description": "Machine state not enabled or file not found", + "schema": { + "$ref": "#/definitions/api.ErrorResponse" + } + } + }, + "security": [ + { + "ApiKeyAuth": [] + } + ] + }, + "put": { + "description": "Stores machine state data for a specific cluster node. 
Validates cluster and host names to prevent path traversal.", + "consumes": [ + "application/json" + ], + "produces": [ + "text/plain" + ], + "tags": [ + "Machine State" + ], + "summary": "Store machine state", + "parameters": [ + { + "type": "string", + "description": "Cluster name", + "name": "cluster", + "in": "path", + "required": true + }, + { + "type": "string", + "description": "Host name", + "name": "host", + "in": "path", + "required": true + } + ], + "responses": { + "201": { + "description": "Created" + }, + "400": { + "description": "Bad Request", + "schema": { + "$ref": "#/definitions/api.ErrorResponse" + } + }, + "404": { + "description": "Machine state not enabled", + "schema": { + "$ref": "#/definitions/api.ErrorResponse" + } + }, + "500": { + "description": "Internal Server Error", + "schema": { + "$ref": "#/definitions/api.ErrorResponse" + } + } + }, + "security": [ + { + "ApiKeyAuth": [] + } + ] + } + }, + "/notice/": { + "post": { + "description": "Updates the notice.txt file content. Only admins are allowed. 
Content is limited to 10000 characters.", + "consumes": [ + "multipart/form-data" + ], + "produces": [ + "text/plain" + ], + "tags": [ + "Config" + ], + "summary": "Update system notice", + "parameters": [ + { + "type": "string", + "description": "New notice content (max 10000 characters)", + "name": "new-content", + "in": "formData", + "required": true + } + ], + "responses": { + "200": { + "description": "Update Notice Content Success", + "schema": { + "type": "string" + } + }, + "400": { + "description": "Bad Request", + "schema": { + "$ref": "#/definitions/api.ErrorResponse" + } + }, + "403": { + "description": "Forbidden", + "schema": { + "$ref": "#/definitions/api.ErrorResponse" + } + }, + "500": { + "description": "Internal Server Error", + "schema": { + "$ref": "#/definitions/api.ErrorResponse" + } + } + }, + "security": [ + { + "ApiKeyAuth": [] + } + ] + } + }, + "/roles/": { + "get": { + "description": "Returns a list of valid user roles. Only admins are allowed.", + "produces": [ + "application/json" + ], + "tags": [ + "Config" + ], + "summary": "Get available roles", + "responses": { + "200": { + "description": "List of role names", + "schema": { + "type": "array", + "items": { + "type": "string" + } + } + }, + "403": { + "description": "Forbidden", + "schema": { + "$ref": "#/definitions/api.ErrorResponse" + } + }, + "500": { + "description": "Internal Server Error", + "schema": { + "$ref": "#/definitions/api.ErrorResponse" + } + } + }, + "security": [ + { + "ApiKeyAuth": [] + } + ] + } + }, + "/tags/": { + "delete": { "description": "Removes tags by type and name. Name and Type of Tag(s) must match.\nTag Scope is required for matching, options: \"global\", \"admin\". 
Private tags can not be deleted via API.\nTag wills be removed from respective archive files.", "consumes": [ "application/json" @@ -1047,7 +1723,7 @@ "schema": { "type": "array", "items": { - "$ref": "#/definitions/api.ApiTag" + "$ref": "#/definitions/api.APITag" } } } @@ -1083,12 +1759,72 @@ "$ref": "#/definitions/api.ErrorResponse" } } - } + }, + "security": [ + { + "ApiKeyAuth": [] + } + ] + } + }, + "/write/": { + "post": { + "consumes": [ + "text/plain" + ], + "produces": [ + "application/json" + ], + "parameters": [ + { + "type": "string", + "description": "If the lines in the body do not have a cluster tag, use this value instead.", + "name": "cluster", + "in": "query" + } + ], + "responses": { + "200": { + "description": "ok", + "schema": { + "type": "string" + } + }, + "400": { + "description": "Bad Request", + "schema": { + "$ref": "#/definitions/api.ErrorResponse" + } + }, + "401": { + "description": "Unauthorized", + "schema": { + "$ref": "#/definitions/api.ErrorResponse" + } + }, + "403": { + "description": "Forbidden", + "schema": { + "$ref": "#/definitions/api.ErrorResponse" + } + }, + "500": { + "description": "Internal Server Error", + "schema": { + "$ref": "#/definitions/api.ErrorResponse" + } + } + }, + "security": [ + { + "ApiKeyAuth": [] + } + ] } } }, "definitions": { - "api.ApiReturnedUser": { + "api.APIReturnedUser": { "type": "object", "properties": { "email": { @@ -1114,7 +1850,7 @@ } } }, - "api.ApiTag": { + "api.APITag": { "type": "object", "properties": { "name": { @@ -1134,7 +1870,7 @@ } } }, - "api.DefaultApiResponse": { + "api.DefaultAPIResponse": { "type": "object", "properties": { "msg": { @@ -1142,7 +1878,7 @@ } } }, - "api.DeleteJobApiRequest": { + "api.DeleteJobAPIRequest": { "type": "object", "required": [ "jobId" @@ -1191,7 +1927,7 @@ } } }, - "api.GetClustersApiResponse": { + "api.GetClustersAPIResponse": { "type": "object", "properties": { "clusters": { @@ -1203,7 +1939,7 @@ } } }, - "api.GetJobApiResponse": { + 
"api.GetJobAPIResponse": { "type": "object", "properties": { "data": { @@ -1217,7 +1953,7 @@ } } }, - "api.GetJobsApiResponse": { + "api.GetJobsAPIResponse": { "type": "object", "properties": { "items": { @@ -1251,39 +1987,7 @@ } } }, - "api.Node": { - "type": "object", - "properties": { - "cpusAllocated": { - "type": "integer" - }, - "cpusTotal": { - "type": "integer" - }, - "gpusAllocated": { - "type": "integer" - }, - "gpusTotal": { - "type": "integer" - }, - "hostname": { - "type": "string" - }, - "memoryAllocated": { - "type": "integer" - }, - "memoryTotal": { - "type": "integer" - }, - "states": { - "type": "array", - "items": { - "type": "string" - } - } - } - }, - "api.StopJobApiRequest": { + "api.StopJobAPIRequest": { "type": "object", "required": [ "jobState", @@ -1326,7 +2030,7 @@ "nodes": { "type": "array", "items": { - "$ref": "#/definitions/api.Node" + "$ref": "#/definitions/schema.NodePayload" } } } @@ -1335,12 +2039,15 @@ "type": "object", "properties": { "id": { + "description": "Unique identifier for the accelerator (e.g., \"0\", \"1\", \"GPU-0\")", "type": "string" }, "model": { + "description": "Specific model name (e.g., \"A100\", \"MI100\")", "type": "string" }, "type": { + "description": "Type of accelerator (e.g., \"Nvidia GPU\", \"AMD GPU\")", "type": "string" } } @@ -1349,15 +2056,18 @@ "type": "object", "properties": { "metricConfig": { + "description": "Cluster-wide metric configurations", "type": "array", "items": { "$ref": "#/definitions/schema.MetricConfig" } }, "name": { + "description": "Unique cluster name (e.g., \"fritz\", \"alex\")", "type": "string" }, "subClusters": { + "description": "Homogeneous partitions within the cluster", "type": "array", "items": { "$ref": "#/definitions/schema.SubCluster" @@ -1366,6 +2076,7 @@ } }, "schema.Job": { + "description": "Information of a HPC job.", "type": "object", "properties": { "arrayJobId": { @@ -1394,6 +2105,13 @@ "format": "float64" } }, + "exclusive": { + "description": "for 
backwards compatibility", + "type": "integer", + "maximum": 2, + "minimum": 0, + "example": 1 + }, "footprint": { "type": "object", "additionalProperties": { @@ -1416,7 +2134,7 @@ "deadline", "failed", "node_fail", - "out_of_memory", + "out-of-memory", "pending", "preempted", "running", @@ -1528,9 +2246,11 @@ "type": "object", "properties": { "id": { + "description": "Internal database ID", "type": "integer" }, "jobId": { + "description": "The job's external job ID", "type": "integer" } } @@ -1539,9 +2259,11 @@ "type": "object", "properties": { "count": { + "description": "Total count of available items", "type": "integer" }, "items": { + "description": "List of job links", "type": "array", "items": { "$ref": "#/definitions/schema.JobLink" @@ -1553,19 +2275,31 @@ "type": "object", "properties": { "series": { + "description": "Individual time series data", "type": "array", "items": { "$ref": "#/definitions/schema.Series" } }, "statisticsSeries": { - "$ref": "#/definitions/schema.StatsSeries" + "description": "Aggregated statistics over time", + "allOf": [ + { + "$ref": "#/definitions/schema.StatsSeries" + } + ] }, "timestep": { + "description": "Sampling interval in seconds", "type": "integer" }, "unit": { - "$ref": "#/definitions/schema.Unit" + "description": "Unit of measurement", + "allOf": [ + { + "$ref": "#/definitions/schema.Unit" + } + ] } } }, @@ -1631,46 +2365,71 @@ "type": "object", "properties": { "aggregation": { + "description": "Aggregation function (avg, sum, min, max)", "type": "string" }, "alert": { + "description": "Alert threshold (requires attention)", "type": "number" }, "caution": { + "description": "Caution threshold (concerning but not critical)", "type": "number" }, "energy": { + "description": "Energy measurement method", "type": "string" }, "footprint": { + "description": "Footprint category", "type": "string" }, "lowerIsBetter": { + "description": "Whether lower values are better", "type": "boolean" }, "name": { + "description": "Metric 
name (e.g., \"cpu_load\", \"mem_used\")", "type": "string" }, "normal": { + "description": "Normal/typical value (good performance)", "type": "number" }, "peak": { + "description": "Peak/maximum expected value (best performance)", "type": "number" }, + "restrict": { + "description": "Restrict visibility to non user roles", + "type": "boolean" + }, "scope": { - "$ref": "#/definitions/schema.MetricScope" + "description": "Metric scope (node, socket, core, etc.)", + "allOf": [ + { + "$ref": "#/definitions/schema.MetricScope" + } + ] }, "subClusters": { + "description": "Subcluster-specific overrides", "type": "array", "items": { "$ref": "#/definitions/schema.SubClusterConfig" } }, "timestep": { + "description": "Measurement interval in seconds", "type": "integer" }, "unit": { - "$ref": "#/definitions/schema.Unit" + "description": "Unit of measurement", + "allOf": [ + { + "$ref": "#/definitions/schema.Unit" + } + ] } } }, @@ -1699,12 +2458,15 @@ "type": "object", "properties": { "avg": { + "description": "Average/mean value", "type": "number" }, "max": { + "description": "Maximum value", "type": "number" }, "min": { + "description": "Minimum value", "type": "number" } } @@ -1713,30 +2475,72 @@ "type": "object", "properties": { "unit": { - "$ref": "#/definitions/schema.Unit" + "description": "Unit of measurement (e.g., FLOP/s, GB/s)", + "allOf": [ + { + "$ref": "#/definitions/schema.Unit" + } + ] }, "value": { + "description": "Numeric value of the measurement", "type": "number" } } }, + "schema.NodePayload": { + "type": "object", + "properties": { + "cpusAllocated": { + "description": "Number of allocated CPUs", + "type": "integer" + }, + "gpusAllocated": { + "description": "Number of allocated GPUs", + "type": "integer" + }, + "hostname": { + "description": "Node hostname", + "type": "string" + }, + "jobsRunning": { + "description": "Number of running jobs", + "type": "integer" + }, + "memoryAllocated": { + "description": "Allocated memory in MB", + "type": "integer" 
+ }, + "states": { + "description": "State strings (flexible format)", + "type": "array", + "items": { + "type": "string" + } + } + } + }, "schema.Resource": { "description": "A resource used by a job", "type": "object", "properties": { "accelerators": { + "description": "Allocated accelerator IDs (e.g., GPU IDs)", "type": "array", "items": { "type": "string" } }, "configuration": { + "description": "Optional configuration identifier", "type": "string" }, "hostname": { + "description": "Node hostname", "type": "string" }, "hwthreads": { + "description": "Allocated hardware thread IDs", "type": "array", "items": { "type": "integer" @@ -1748,19 +2552,27 @@ "type": "object", "properties": { "data": { + "description": "Time series measurements", "type": "array", "items": { "type": "number" } }, "hostname": { + "description": "Source hostname", "type": "string" }, "id": { + "description": "Optional ID (e.g., core ID, GPU ID)", "type": "string" }, "statistics": { - "$ref": "#/definitions/schema.MetricStatistics" + "description": "Statistical summary (min/avg/max)", + "allOf": [ + { + "$ref": "#/definitions/schema.MetricStatistics" + } + ] } } }, @@ -1768,30 +2580,35 @@ "type": "object", "properties": { "max": { + "description": "Maximum values over time", "type": "array", "items": { "type": "number" } }, "mean": { + "description": "Mean values over time", "type": "array", "items": { "type": "number" } }, "median": { + "description": "Median values over time", "type": "array", "items": { "type": "number" } }, "min": { + "description": "Minimum values over time", "type": "array", "items": { "type": "number" } }, "percentiles": { + "description": "Percentile values over time (e.g., 10th, 50th, 90th)", "type": "object", "additionalProperties": { "type": "array", @@ -1807,52 +2624,81 @@ "type": "object", "properties": { "coresPerSocket": { + "description": "Number of cores per CPU socket", "type": "integer" }, "energyFootprint": { + "description": "Energy-related footprint 
metrics", "type": "array", "items": { "type": "string" } }, "flopRateScalar": { - "$ref": "#/definitions/schema.MetricValue" + "description": "Theoretical scalar FLOP rate per node", + "allOf": [ + { + "$ref": "#/definitions/schema.MetricValue" + } + ] }, "flopRateSimd": { - "$ref": "#/definitions/schema.MetricValue" + "description": "Theoretical SIMD FLOP rate per node", + "allOf": [ + { + "$ref": "#/definitions/schema.MetricValue" + } + ] }, "footprint": { + "description": "Default footprint metrics for jobs", "type": "array", "items": { "type": "string" } }, "memoryBandwidth": { - "$ref": "#/definitions/schema.MetricValue" + "description": "Theoretical memory bandwidth per node", + "allOf": [ + { + "$ref": "#/definitions/schema.MetricValue" + } + ] }, "metricConfig": { + "description": "Subcluster-specific metric configurations", "type": "array", "items": { "$ref": "#/definitions/schema.MetricConfig" } }, "name": { + "description": "Name of the subcluster (e.g., \"main\", \"gpu\", \"bigmem\")", "type": "string" }, "nodes": { + "description": "Node list in condensed format (e.g., \"node[001-100]\")", "type": "string" }, "processorType": { + "description": "CPU model (e.g., \"Intel Xeon Gold 6148\")", "type": "string" }, "socketsPerNode": { + "description": "Number of CPU sockets per node", "type": "integer" }, "threadsPerCore": { + "description": "Number of hardware threads per core (SMT level)", "type": "integer" }, "topology": { - "$ref": "#/definitions/schema.Topology" + "description": "Hardware topology of nodes in this subcluster", + "allOf": [ + { + "$ref": "#/definitions/schema.Topology" + } + ] } } }, @@ -1860,34 +2706,52 @@ "type": "object", "properties": { "alert": { + "description": "Alert threshold (requires attention)", "type": "number" }, "caution": { + "description": "Caution threshold (concerning but not critical)", "type": "number" }, "energy": { + "description": "Energy measurement configuration", "type": "string" }, "footprint": { + 
"description": "Footprint category for this metric", "type": "string" }, "lowerIsBetter": { + "description": "Whether lower values indicate better performance", "type": "boolean" }, "name": { + "description": "Metric name (e.g., \"cpu_load\", \"mem_used\")", "type": "string" }, "normal": { + "description": "Normal/typical value (good performance)", "type": "number" }, "peak": { + "description": "Peak/maximum expected value (best performance)", "type": "number" }, "remove": { + "description": "Whether to exclude this metric for this subcluster", + "type": "boolean" + }, + "restrict": { + "description": "Restrict visibility to non user roles", "type": "boolean" }, "unit": { - "$ref": "#/definitions/schema.Unit" + "description": "Unit of measurement", + "allOf": [ + { + "$ref": "#/definitions/schema.Unit" + } + ] } } }, @@ -1916,12 +2780,14 @@ "type": "object", "properties": { "accelerators": { + "description": "Attached accelerators (GPUs, etc.)", "type": "array", "items": { "$ref": "#/definitions/schema.Accelerator" } }, "core": { + "description": "Hardware threads grouped by core", "type": "array", "items": { "type": "array", @@ -1931,6 +2797,7 @@ } }, "die": { + "description": "Hardware threads grouped by die (optional)", "type": "array", "items": { "type": "array", @@ -1940,6 +2807,7 @@ } }, "memoryDomain": { + "description": "Hardware threads grouped by NUMA domain", "type": "array", "items": { "type": "array", @@ -1949,12 +2817,14 @@ } }, "node": { + "description": "All hardware thread IDs on this node", "type": "array", "items": { "type": "integer" } }, "socket": { + "description": "Hardware threads grouped by socket", "type": "array", "items": { "type": "array", @@ -1969,9 +2839,11 @@ "type": "object", "properties": { "base": { + "description": "Base unit (e.g., \"B/s\", \"F/s\", \"W\")", "type": "string" }, "prefix": { + "description": "SI prefix (e.g., \"G\", \"M\", \"K\", \"T\")", "type": "string" } } diff --git a/api/swagger.yaml b/api/swagger.yaml index 
119e9529..0bf60082 100644 --- a/api/swagger.yaml +++ b/api/swagger.yaml @@ -1,5 +1,5 @@ definitions: - api.ApiReturnedUser: + api.APIReturnedUser: properties: email: type: string @@ -16,7 +16,7 @@ definitions: username: type: string type: object - api.ApiTag: + api.APITag: properties: name: description: Tag Name @@ -31,12 +31,12 @@ definitions: example: Debug type: string type: object - api.DefaultApiResponse: + api.DefaultAPIResponse: properties: msg: type: string type: object - api.DeleteJobApiRequest: + api.DeleteJobAPIRequest: properties: cluster: description: Cluster of job @@ -71,7 +71,7 @@ definitions: description: Statustext of Errorcode type: string type: object - api.GetClustersApiResponse: + api.GetClustersAPIResponse: properties: clusters: description: Array of clusters @@ -79,7 +79,7 @@ definitions: $ref: '#/definitions/schema.Cluster' type: array type: object - api.GetJobApiResponse: + api.GetJobAPIResponse: properties: data: items: @@ -88,7 +88,7 @@ definitions: meta: $ref: '#/definitions/schema.Job' type: object - api.GetJobsApiResponse: + api.GetJobsAPIResponse: properties: items: description: Number of jobs returned @@ -111,28 +111,7 @@ definitions: scope: $ref: '#/definitions/schema.MetricScope' type: object - api.Node: - properties: - cpusAllocated: - type: integer - cpusTotal: - type: integer - gpusAllocated: - type: integer - gpusTotal: - type: integer - hostname: - type: string - memoryAllocated: - type: integer - memoryTotal: - type: integer - states: - items: - type: string - type: array - type: object - api.StopJobApiRequest: + api.StopJobAPIRequest: properties: cluster: example: fritz @@ -161,32 +140,39 @@ definitions: type: string nodes: items: - $ref: '#/definitions/api.Node' + $ref: '#/definitions/schema.NodePayload' type: array type: object schema.Accelerator: properties: id: + description: Unique identifier for the accelerator (e.g., "0", "1", "GPU-0") type: string model: + description: Specific model name (e.g., "A100", "MI100") 
type: string type: + description: Type of accelerator (e.g., "Nvidia GPU", "AMD GPU") type: string type: object schema.Cluster: properties: metricConfig: + description: Cluster-wide metric configurations items: $ref: '#/definitions/schema.MetricConfig' type: array name: + description: Unique cluster name (e.g., "fritz", "alex") type: string subClusters: + description: Homogeneous partitions within the cluster items: $ref: '#/definitions/schema.SubCluster' type: array type: object schema.Job: + description: Information of a HPC job. properties: arrayJobId: example: 123000 @@ -207,6 +193,12 @@ definitions: format: float64 type: number type: object + exclusive: + description: for backwards compatibility + example: 1 + maximum: 2 + minimum: 0 + type: integer footprint: additionalProperties: format: float64 @@ -227,7 +219,7 @@ definitions: - deadline - failed - node_fail - - out_of_memory + - out-of-memory - pending - preempted - running @@ -307,15 +299,19 @@ definitions: schema.JobLink: properties: id: + description: Internal database ID type: integer jobId: + description: The job's external job ID type: integer type: object schema.JobLinkResultList: properties: count: + description: Total count of available items type: integer items: + description: List of job links items: $ref: '#/definitions/schema.JobLink' type: array @@ -323,15 +319,21 @@ definitions: schema.JobMetric: properties: series: + description: Individual time series data items: $ref: '#/definitions/schema.Series' type: array statisticsSeries: - $ref: '#/definitions/schema.StatsSeries' + allOf: + - $ref: '#/definitions/schema.StatsSeries' + description: Aggregated statistics over time timestep: + description: Sampling interval in seconds type: integer unit: - $ref: '#/definitions/schema.Unit' + allOf: + - $ref: '#/definitions/schema.Unit' + description: Unit of measurement type: object schema.JobState: enum: @@ -385,33 +387,51 @@ definitions: schema.MetricConfig: properties: aggregation: + description: 
Aggregation function (avg, sum, min, max) type: string alert: + description: Alert threshold (requires attention) type: number caution: + description: Caution threshold (concerning but not critical) type: number energy: + description: Energy measurement method type: string footprint: + description: Footprint category type: string lowerIsBetter: + description: Whether lower values are better type: boolean name: + description: Metric name (e.g., "cpu_load", "mem_used") type: string normal: + description: Normal/typical value (good performance) type: number peak: + description: Peak/maximum expected value (best performance) type: number + restrict: + description: Restrict visibility to non user roles + type: boolean scope: - $ref: '#/definitions/schema.MetricScope' + allOf: + - $ref: '#/definitions/schema.MetricScope' + description: Metric scope (node, socket, core, etc.) subClusters: + description: Subcluster-specific overrides items: $ref: '#/definitions/schema.SubClusterConfig' type: array timestep: + description: Measurement interval in seconds type: integer unit: - $ref: '#/definitions/schema.Unit' + allOf: + - $ref: '#/definitions/schema.Unit' + description: Unit of measurement type: object schema.MetricScope: enum: @@ -434,31 +454,64 @@ definitions: schema.MetricStatistics: properties: avg: + description: Average/mean value type: number max: + description: Maximum value type: number min: + description: Minimum value type: number type: object schema.MetricValue: properties: unit: - $ref: '#/definitions/schema.Unit' + allOf: + - $ref: '#/definitions/schema.Unit' + description: Unit of measurement (e.g., FLOP/s, GB/s) value: + description: Numeric value of the measurement type: number type: object + schema.NodePayload: + properties: + cpusAllocated: + description: Number of allocated CPUs + type: integer + gpusAllocated: + description: Number of allocated GPUs + type: integer + hostname: + description: Node hostname + type: string + jobsRunning: + description: 
Number of running jobs + type: integer + memoryAllocated: + description: Allocated memory in MB + type: integer + states: + description: State strings (flexible format) + items: + type: string + type: array + type: object schema.Resource: description: A resource used by a job properties: accelerators: + description: Allocated accelerator IDs (e.g., GPU IDs) items: type: string type: array configuration: + description: Optional configuration identifier type: string hostname: + description: Node hostname type: string hwthreads: + description: Allocated hardware thread IDs items: type: integer type: array @@ -466,31 +519,40 @@ definitions: schema.Series: properties: data: + description: Time series measurements items: type: number type: array hostname: + description: Source hostname type: string id: + description: Optional ID (e.g., core ID, GPU ID) type: string statistics: - $ref: '#/definitions/schema.MetricStatistics' + allOf: + - $ref: '#/definitions/schema.MetricStatistics' + description: Statistical summary (min/avg/max) type: object schema.StatsSeries: properties: max: + description: Maximum values over time items: type: number type: array mean: + description: Mean values over time items: type: number type: array median: + description: Median values over time items: type: number type: array min: + description: Minimum values over time items: type: number type: array @@ -500,65 +562,97 @@ definitions: format: float64 type: number type: array + description: Percentile values over time (e.g., 10th, 50th, 90th) type: object type: object schema.SubCluster: properties: coresPerSocket: + description: Number of cores per CPU socket type: integer energyFootprint: + description: Energy-related footprint metrics items: type: string type: array flopRateScalar: - $ref: '#/definitions/schema.MetricValue' + allOf: + - $ref: '#/definitions/schema.MetricValue' + description: Theoretical scalar FLOP rate per node flopRateSimd: - $ref: '#/definitions/schema.MetricValue' + allOf: 
+ - $ref: '#/definitions/schema.MetricValue' + description: Theoretical SIMD FLOP rate per node footprint: + description: Default footprint metrics for jobs items: type: string type: array memoryBandwidth: - $ref: '#/definitions/schema.MetricValue' + allOf: + - $ref: '#/definitions/schema.MetricValue' + description: Theoretical memory bandwidth per node metricConfig: + description: Subcluster-specific metric configurations items: $ref: '#/definitions/schema.MetricConfig' type: array name: + description: Name of the subcluster (e.g., "main", "gpu", "bigmem") type: string nodes: + description: Node list in condensed format (e.g., "node[001-100]") type: string processorType: + description: CPU model (e.g., "Intel Xeon Gold 6148") type: string socketsPerNode: + description: Number of CPU sockets per node type: integer threadsPerCore: + description: Number of hardware threads per core (SMT level) type: integer topology: - $ref: '#/definitions/schema.Topology' + allOf: + - $ref: '#/definitions/schema.Topology' + description: Hardware topology of nodes in this subcluster type: object schema.SubClusterConfig: properties: alert: + description: Alert threshold (requires attention) type: number caution: + description: Caution threshold (concerning but not critical) type: number energy: + description: Energy measurement configuration type: string footprint: + description: Footprint category for this metric type: string lowerIsBetter: + description: Whether lower values indicate better performance type: boolean name: + description: Metric name (e.g., "cpu_load", "mem_used") type: string normal: + description: Normal/typical value (good performance) type: number peak: + description: Peak/maximum expected value (best performance) type: number remove: + description: Whether to exclude this metric for this subcluster + type: boolean + restrict: + description: Restrict visibility to non user roles type: boolean unit: - $ref: '#/definitions/schema.Unit' + allOf: + - $ref: 
'#/definitions/schema.Unit' + description: Unit of measurement type: object schema.Tag: description: Defines a tag using name and type. @@ -578,32 +672,38 @@ definitions: schema.Topology: properties: accelerators: + description: Attached accelerators (GPUs, etc.) items: $ref: '#/definitions/schema.Accelerator' type: array core: + description: Hardware threads grouped by core items: items: type: integer type: array type: array die: + description: Hardware threads grouped by die (optional) items: items: type: integer type: array type: array memoryDomain: + description: Hardware threads grouped by NUMA domain items: items: type: integer type: array type: array node: + description: All hardware thread IDs on this node items: type: integer type: array socket: + description: Hardware threads grouped by socket items: items: type: integer @@ -613,8 +713,10 @@ definitions: schema.Unit: properties: base: + description: Base unit (e.g., "B/s", "F/s", "W") type: string prefix: + description: SI prefix (e.g., "G", "M", "K", "T") type: string type: object host: localhost:8080 @@ -645,7 +747,7 @@ paths: "200": description: Array of clusters schema: - $ref: '#/definitions/api.GetClustersApiResponse' + $ref: '#/definitions/api.GetClustersAPIResponse' "400": description: Bad Request schema: @@ -710,7 +812,7 @@ paths: "200": description: Job array and page info schema: - $ref: '#/definitions/api.GetJobsApiResponse' + $ref: '#/definitions/api.GetJobsAPIResponse' "400": description: Bad Request schema: @@ -753,7 +855,7 @@ paths: "200": description: Job resource schema: - $ref: '#/definitions/api.GetJobApiResponse' + $ref: '#/definitions/api.GetJobAPIResponse' "400": description: Bad Request schema: @@ -810,7 +912,7 @@ paths: "200": description: Job resource schema: - $ref: '#/definitions/api.GetJobApiResponse' + $ref: '#/definitions/api.GetJobAPIResponse' "400": description: Bad Request schema: @@ -853,14 +955,14 @@ paths: name: request required: true schema: - $ref: 
'#/definitions/api.DeleteJobApiRequest' + $ref: '#/definitions/api.DeleteJobAPIRequest' produces: - application/json responses: "200": description: Success message schema: - $ref: '#/definitions/api.DefaultApiResponse' + $ref: '#/definitions/api.DefaultAPIResponse' "400": description: Bad Request schema: @@ -907,7 +1009,7 @@ paths: "200": description: Success message schema: - $ref: '#/definitions/api.DefaultApiResponse' + $ref: '#/definitions/api.DefaultAPIResponse' "400": description: Bad Request schema: @@ -948,13 +1050,17 @@ paths: name: ts required: true type: integer + - description: Omit jobs with tags from deletion + in: query + name: omit-tagged + type: boolean produces: - application/json responses: "200": description: Success message schema: - $ref: '#/definitions/api.DefaultApiResponse' + $ref: '#/definitions/api.DefaultAPIResponse' "400": description: Bad Request schema: @@ -1052,7 +1158,7 @@ paths: "201": description: Job added successfully schema: - $ref: '#/definitions/api.DefaultApiResponse' + $ref: '#/definitions/api.DefaultAPIResponse' "400": description: Bad Request schema: @@ -1090,7 +1196,7 @@ paths: name: request required: true schema: - $ref: '#/definitions/api.StopJobApiRequest' + $ref: '#/definitions/api.StopJobAPIRequest' produces: - application/json responses: @@ -1147,7 +1253,7 @@ paths: required: true schema: items: - $ref: '#/definitions/api.ApiTag' + $ref: '#/definitions/api.APITag' type: array produces: - application/json @@ -1195,7 +1301,7 @@ paths: "200": description: Success message schema: - $ref: '#/definitions/api.DefaultApiResponse' + $ref: '#/definitions/api.DefaultAPIResponse' "400": description: Bad Request schema: @@ -1217,7 +1323,80 @@ paths: summary: Deliver updated Slurm node states tags: - Nodestates + /api/user/{id}: + post: + description: Allows admins to add/remove roles and projects for a user + parameters: + - description: Username + in: path + name: id + required: true + type: string + - description: Role to add 
+ in: formData + name: add-role + type: string + - description: Role to remove + in: formData + name: remove-role + type: string + - description: Project to add + in: formData + name: add-project + type: string + - description: Project to remove + in: formData + name: remove-project + type: string + produces: + - text/plain + responses: + "200": + description: Success message + schema: + type: string + "403": + description: Forbidden + schema: + $ref: '#/definitions/api.ErrorResponse' + "422": + description: Unprocessable Entity + schema: + $ref: '#/definitions/api.ErrorResponse' + security: + - ApiKeyAuth: [] + summary: Update user roles and projects + tags: + - User /api/users/: + delete: + description: Deletes a user from the system + parameters: + - description: Username to delete + in: formData + name: username + required: true + type: string + produces: + - text/plain + responses: + "200": + description: Success + schema: + type: string + "403": + description: Forbidden + schema: + $ref: '#/definitions/api.ErrorResponse' + "422": + description: Unprocessable Entity + schema: + $ref: '#/definitions/api.ErrorResponse' + security: + - ApiKeyAuth: [] + summary: Delete a user + tags: + - User get: description: |- Returns a JSON-encoded list of users. 
@@ -1236,7 +1415,7 @@ paths: description: List of users returned successfully schema: items: - $ref: '#/definitions/api.ApiReturnedUser' + $ref: '#/definitions/api.APIReturnedUser' type: array "400": description: Bad Request @@ -1259,6 +1438,198 @@ paths: summary: Returns a list of users tags: - User + post: + description: Creates a new user with specified credentials and role + parameters: + - description: Username + in: formData + name: username + required: true + type: string + - description: Password (not required for API users) + in: formData + name: password + type: string + - description: User role + in: formData + name: role + required: true + type: string + - description: Full name + in: formData + name: name + type: string + - description: Email address + in: formData + name: email + type: string + - description: Project (required for managers) + in: formData + name: project + type: string + produces: + - text/plain + responses: + "200": + description: Success message + schema: + type: string + "400": + description: Bad Request + schema: + $ref: '#/definitions/api.ErrorResponse' + "403": + description: Forbidden + schema: + $ref: '#/definitions/api.ErrorResponse' + "422": + description: Unprocessable Entity + schema: + $ref: '#/definitions/api.ErrorResponse' + security: + - ApiKeyAuth: [] + summary: Create a new user + tags: + - User + /configuration/: + post: + consumes: + - multipart/form-data + description: Updates a user's configuration key-value pair. 
+ parameters: + - description: Configuration key + in: formData + name: key + required: true + type: string + - description: Configuration value + in: formData + name: value + required: true + type: string + produces: + - text/plain + responses: + "200": + description: success + schema: + type: string + "500": + description: Internal Server Error + schema: + $ref: '#/definitions/api.ErrorResponse' + security: + - ApiKeyAuth: [] + summary: Update user configuration + tags: + - Frontend + /debug/: + post: + description: This endpoint allows the users to print the content of + parameters: + - description: Selector + in: query + name: selector + type: string + produces: + - application/json + responses: + "200": + description: Debug dump + schema: + type: string + "400": + description: Bad Request + schema: + $ref: '#/definitions/api.ErrorResponse' + "401": + description: Unauthorized + schema: + $ref: '#/definitions/api.ErrorResponse' + "403": + description: Forbidden + schema: + $ref: '#/definitions/api.ErrorResponse' + "500": + description: Internal Server Error + schema: + $ref: '#/definitions/api.ErrorResponse' + security: + - ApiKeyAuth: [] + summary: Debug endpoint + tags: + - debug + /free/: + post: + description: This endpoint allows the users to free the Buffers from the + parameters: + - description: up to timestamp + in: query + name: to + type: string + produces: + - application/json + responses: + "200": + description: ok + schema: + type: string + "400": + description: Bad Request + schema: + $ref: '#/definitions/api.ErrorResponse' + "401": + description: Unauthorized + schema: + $ref: '#/definitions/api.ErrorResponse' + "403": + description: Forbidden + schema: + $ref: '#/definitions/api.ErrorResponse' + "500": + description: Internal Server Error + schema: + $ref: '#/definitions/api.ErrorResponse' + security: + - ApiKeyAuth: [] + tags: + - free + /healthcheck/: + get: + description: This endpoint allows the users to check if a node is healthy + 
parameters: + - description: Selector + in: query + name: selector + type: string + produces: + - application/json + responses: + "200": + description: Debug dump + schema: + type: string + "400": + description: Bad Request + schema: + $ref: '#/definitions/api.ErrorResponse' + "401": + description: Unauthorized + schema: + $ref: '#/definitions/api.ErrorResponse' + "403": + description: Forbidden + schema: + $ref: '#/definitions/api.ErrorResponse' + "500": + description: Internal Server Error + schema: + $ref: '#/definitions/api.ErrorResponse' + security: + - ApiKeyAuth: [] + summary: HealthCheck endpoint + tags: + - healthcheck /jobs/tag_job/{id}: delete: consumes: @@ -1279,7 +1650,7 @@ paths: required: true schema: items: - $ref: '#/definitions/api.ApiTag' + $ref: '#/definitions/api.APITag' type: array produces: - application/json @@ -1309,6 +1680,176 @@ paths: summary: Removes one or more tags from a job tags: - Job add and modify + /jwt/: + get: + consumes: + - multipart/form-data + description: Generates a JWT token for a user. Admins can generate tokens for + any user, regular users only for themselves. + parameters: + - description: Username to generate JWT for + in: formData + name: username + required: true + type: string + produces: + - text/plain + responses: + "200": + description: JWT token + schema: + type: string + "403": + description: Forbidden + schema: + $ref: '#/definitions/api.ErrorResponse' + "404": + description: User Not Found + schema: + $ref: '#/definitions/api.ErrorResponse' + "500": + description: Internal Server Error + schema: + $ref: '#/definitions/api.ErrorResponse' + security: + - ApiKeyAuth: [] + summary: Generate JWT token + tags: + - Frontend + /machine_state/{cluster}/{host}: + get: + description: Retrieves stored machine state data for a specific cluster node. + Validates cluster and host names to prevent path traversal. 
+ parameters: + - description: Cluster name + in: path + name: cluster + required: true + type: string + - description: Host name + in: path + name: host + required: true + type: string + produces: + - application/json + responses: + "200": + description: Machine state JSON data + schema: + type: object + "400": + description: Bad Request + schema: + $ref: '#/definitions/api.ErrorResponse' + "404": + description: Machine state not enabled or file not found + schema: + $ref: '#/definitions/api.ErrorResponse' + security: + - ApiKeyAuth: [] + summary: Retrieve machine state + tags: + - Machine State + put: + consumes: + - application/json + description: Stores machine state data for a specific cluster node. Validates + cluster and host names to prevent path traversal. + parameters: + - description: Cluster name + in: path + name: cluster + required: true + type: string + - description: Host name + in: path + name: host + required: true + type: string + produces: + - text/plain + responses: + "201": + description: Created + "400": + description: Bad Request + schema: + $ref: '#/definitions/api.ErrorResponse' + "404": + description: Machine state not enabled + schema: + $ref: '#/definitions/api.ErrorResponse' + "500": + description: Internal Server Error + schema: + $ref: '#/definitions/api.ErrorResponse' + security: + - ApiKeyAuth: [] + summary: Store machine state + tags: + - Machine State + /notice/: + post: + consumes: + - multipart/form-data + description: Updates the notice.txt file content. Only admins are allowed. Content + is limited to 10000 characters. 
+ parameters: + - description: New notice content (max 10000 characters) + in: formData + name: new-content + required: true + type: string + produces: + - text/plain + responses: + "200": + description: Update Notice Content Success + schema: + type: string + "400": + description: Bad Request + schema: + $ref: '#/definitions/api.ErrorResponse' + "403": + description: Forbidden + schema: + $ref: '#/definitions/api.ErrorResponse' + "500": + description: Internal Server Error + schema: + $ref: '#/definitions/api.ErrorResponse' + security: + - ApiKeyAuth: [] + summary: Update system notice + tags: + - Config + /roles/: + get: + description: Returns a list of valid user roles. Only admins are allowed. + produces: + - application/json + responses: + "200": + description: List of role names + schema: + items: + type: string + type: array + "403": + description: Forbidden + schema: + $ref: '#/definitions/api.ErrorResponse' + "500": + description: Internal Server Error + schema: + $ref: '#/definitions/api.ErrorResponse' + security: + - ApiKeyAuth: [] + summary: Get available roles + tags: + - Config /tags/: delete: consumes: @@ -1324,7 +1865,7 @@ paths: required: true schema: items: - $ref: '#/definitions/api.ApiTag' + $ref: '#/definitions/api.APITag' type: array produces: - text/plain @@ -1354,6 +1895,41 @@ paths: summary: Removes all tags and job-relations for type:name tuple tags: - Tag remove + /write/: + post: + consumes: + - text/plain + parameters: + - description: If the lines in the body do not have a cluster tag, use this + value instead. 
+ in: query + name: cluster + type: string + produces: + - application/json + responses: + "200": + description: ok + schema: + type: string + "400": + description: Bad Request + schema: + $ref: '#/definitions/api.ErrorResponse' + "401": + description: Unauthorized + schema: + $ref: '#/definitions/api.ErrorResponse' + "403": + description: Forbidden + schema: + $ref: '#/definitions/api.ErrorResponse' + "500": + description: Internal Server Error + schema: + $ref: '#/definitions/api.ErrorResponse' + security: + - ApiKeyAuth: [] securityDefinitions: ApiKeyAuth: in: header diff --git a/go.mod b/go.mod index 808b2e7a..479f1644 100644 --- a/go.mod +++ b/go.mod @@ -11,7 +11,7 @@ tool ( require ( github.com/99designs/gqlgen v0.17.85 - github.com/ClusterCockpit/cc-lib/v2 v2.0.0 + github.com/ClusterCockpit/cc-lib/v2 v2.1.0 github.com/Masterminds/squirrel v1.5.4 github.com/aws/aws-sdk-go-v2 v1.41.1 github.com/aws/aws-sdk-go-v2/config v1.32.6 @@ -109,7 +109,6 @@ require ( github.com/urfave/cli/v2 v2.27.7 // indirect github.com/urfave/cli/v3 v3.6.1 // indirect github.com/xrash/smetrics v0.0.0-20250705151800-55b8f293f342 // indirect - github.com/xtgo/set v1.0.0 // indirect go.yaml.in/yaml/v2 v2.4.3 // indirect go.yaml.in/yaml/v3 v3.0.4 // indirect golang.org/x/exp v0.0.0-20250620022241-b7579e27df2b // indirect @@ -119,7 +118,6 @@ require ( golang.org/x/sys v0.39.0 // indirect golang.org/x/text v0.32.0 // indirect golang.org/x/tools v0.40.0 // indirect - google.golang.org/protobuf v1.36.11 // indirect gopkg.in/yaml.v3 v3.0.1 // indirect sigs.k8s.io/yaml v1.6.0 // indirect ) diff --git a/go.sum b/go.sum index 39571309..ef89e2d2 100644 --- a/go.sum +++ b/go.sum @@ -2,12 +2,10 @@ filippo.io/edwards25519 v1.1.0 h1:FNf4tywRC1HmFuKW5xopWpigGjJKiJSV0Cqo0cJWDaA= filippo.io/edwards25519 v1.1.0/go.mod h1:BxyFTGdWcka3PhytdK4V28tE5sGfRvvvRV7EaN4VDT4= github.com/99designs/gqlgen v0.17.85 h1:EkGx3U2FDcxQm8YDLQSpXIAVmpDyZ3IcBMOJi2nH1S0= github.com/99designs/gqlgen v0.17.85/go.mod 
h1:yvs8s0bkQlRfqg03YXr3eR4OQUowVhODT/tHzCXnbOU= -github.com/Azure/go-ansiterm v0.0.0-20230124172434-306776ec8161 h1:L/gRVlceqvL25UVaW/CKtUDjefjrs0SPonmDGUVOYP0= -github.com/Azure/go-ansiterm v0.0.0-20230124172434-306776ec8161/go.mod h1:xomTg63KZ2rFqZQzSB4Vz2SUXa1BpHTVz9L5PTmPC4E= github.com/Azure/go-ntlmssp v0.0.0-20221128193559-754e69321358 h1:mFRzDkZVAjdal+s7s0MwaRv9igoPqLRdzOLzw/8Xvq8= github.com/Azure/go-ntlmssp v0.0.0-20221128193559-754e69321358/go.mod h1:chxPXzSsl7ZWRAuOIE23GDNzjWuZquvFlgA8xmpunjU= -github.com/ClusterCockpit/cc-lib/v2 v2.0.0 h1:OjDADx8mf9SflqeeKUuhy5pamu4YDucae6wUX6vvNNA= -github.com/ClusterCockpit/cc-lib/v2 v2.0.0/go.mod h1:JuxMAuEOaLLNEnnL9U3ejha8kMvsSatLdKPZEgJw6iw= +github.com/ClusterCockpit/cc-lib/v2 v2.1.0 h1:B6l6h0IjfEuY9DU6aVM3fSsj24lQ1eudXK9QTKmJjqg= +github.com/ClusterCockpit/cc-lib/v2 v2.1.0/go.mod h1:JuxMAuEOaLLNEnnL9U3ejha8kMvsSatLdKPZEgJw6iw= github.com/KyleBanks/depth v1.2.1 h1:5h8fQADFrWtarTdtDudMmGsC7GPbOAu6RVB3ffsVFHc= github.com/KyleBanks/depth v1.2.1/go.mod h1:jzSb9d0L43HxTQfT+oSA1EEp2q+ne2uh6XgeJcm8brE= github.com/Masterminds/squirrel v1.5.4 h1:uUcX/aBc8O7Fg9kaISIUsHXdKuqehiXAMQTYX8afzqM= @@ -74,10 +72,6 @@ github.com/beorn7/perks v1.0.1/go.mod h1:G2ZrVWU2WbWT9wwq4/hrbKbnv/1ERSJQ0ibhJ6r github.com/bmatcuk/doublestar v1.1.1/go.mod h1:UD6OnuiIn0yFxxA2le/rnRU1G4RaI4UvFv1sNto9p6w= github.com/cespare/xxhash/v2 v2.3.0 h1:UL815xU9SqsFlibzuggzjXhog7bL6oX9BbNZnL2UFvs= github.com/cespare/xxhash/v2 v2.3.0/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs= -github.com/containerd/errdefs v1.0.0 h1:tg5yIfIlQIrxYtu9ajqY42W3lpS19XqdxRQeEwYG8PI= -github.com/containerd/errdefs v1.0.0/go.mod h1:+YBYIdtsnF4Iw6nWZhJcqGSg/dwvV7tyJ/kCkyJ2k+M= -github.com/containerd/errdefs/pkg v0.3.0 h1:9IKJ06FvyNlexW690DXuQNx2KA2cUJXx151Xdx3ZPPE= -github.com/containerd/errdefs/pkg v0.3.0/go.mod h1:NJw6s9HwNuRhnjJhM7pylWwMyAkmCQvQ4GpJHEqRLVk= github.com/coreos/go-oidc/v3 v3.17.0 h1:hWBGaQfbi0iVviX4ibC7bk8OKT5qNr4klBaCHVNvehc= github.com/coreos/go-oidc/v3 
v3.17.0/go.mod h1:wqPbKFrVnE90vty060SB40FCJ8fTHTxSwyXJqZH+sI8= github.com/cpuguy83/go-md2man/v2 v2.0.7 h1:zbFlGlXEAKlwXpmvle3d8Oe3YnkKIK4xSRTd3sHPnBo= @@ -89,16 +83,6 @@ github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc h1:U9qPSI2PIWSS1 github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/dgryski/trifles v0.0.0-20230903005119-f50d829f2e54 h1:SG7nF6SRlWhcT7cNTs5R6Hk4V2lcmLz2NsG2VnInyNo= github.com/dgryski/trifles v0.0.0-20230903005119-f50d829f2e54/go.mod h1:if7Fbed8SFyPtHLHbg49SI7NAdJiC5WIA09pe59rfAA= -github.com/dhui/dktest v0.4.6 h1:+DPKyScKSEp3VLtbMDHcUq6V5Lm5zfZZVb0Sk7Ahom4= -github.com/dhui/dktest v0.4.6/go.mod h1:JHTSYDtKkvFNFHJKqCzVzqXecyv+tKt8EzceOmQOgbU= -github.com/distribution/reference v0.6.0 h1:0IXCQ5g4/QMHHkarYzh5l+u8T3t73zM5QvfrDyIgxBk= -github.com/distribution/reference v0.6.0/go.mod h1:BbU0aIcezP1/5jX/8MP0YiH4SdvB5Y4f/wlDRiLyi3E= -github.com/docker/docker v28.3.3+incompatible h1:Dypm25kh4rmk49v1eiVbsAtpAsYURjYkaKubwuBdxEI= -github.com/docker/docker v28.3.3+incompatible/go.mod h1:eEKB0N0r5NX/I1kEveEz05bcu8tLC/8azJZsviup8Sk= -github.com/docker/go-connections v0.5.0 h1:USnMq7hx7gwdVZq1L49hLXaFtUdTADjXGp+uj1Br63c= -github.com/docker/go-connections v0.5.0/go.mod h1:ov60Kzw0kKElRwhNs9UlUHAE/F9Fe6GLaXnqyDdmEXc= -github.com/docker/go-units v0.5.0 h1:69rxXcBk27SvSaaxTtLh/8llcHD8vYHT7WSdRZ/jvr4= -github.com/docker/go-units v0.5.0/go.mod h1:fgPhTUdO+D/Jk86RDLlptpiXQzgHJF7gydDDbaIK4Dk= github.com/expr-lang/expr v1.17.7 h1:Q0xY/e/2aCIp8g9s/LGvMDCC5PxYlvHgDZRQ4y16JX8= github.com/expr-lang/expr v1.17.7/go.mod h1:8/vRC7+7HBzESEqt5kKpYXxrxkr31SaO8r40VO/1IT4= github.com/felixge/httpsnoop v1.0.4 h1:NFTV2Zj1bL4mc9sqWACXbQFVBBg2W3GPvqp8/ESS2Wg= @@ -236,17 +220,8 @@ github.com/mattn/go-sqlite3 v1.10.0/go.mod h1:FPy6KqzDD04eiIsT53CuJW3U88zkxoIYsO github.com/mattn/go-sqlite3 v1.14.22/go.mod h1:Uh1q+B4BYcTPb+yiD3kU8Ct7aC0hY9fxUwlHK0RXw+Y= github.com/mattn/go-sqlite3 v1.14.33 
h1:A5blZ5ulQo2AtayQ9/limgHEkFreKj1Dv226a1K73s0= github.com/mattn/go-sqlite3 v1.14.33/go.mod h1:Uh1q+B4BYcTPb+yiD3kU8Ct7aC0hY9fxUwlHK0RXw+Y= -github.com/moby/docker-image-spec v1.3.1 h1:jMKff3w6PgbfSa69GfNg+zN/XLhfXJGnEx3Nl2EsFP0= -github.com/moby/docker-image-spec v1.3.1/go.mod h1:eKmb5VW8vQEh/BAr2yvVNvuiJuY6UIocYsFu/DxxRpo= -github.com/moby/term v0.5.0 h1:xt8Q1nalod/v7BqbG21f8mQPqH+xAaC9C3N3wfWbVP0= -github.com/moby/term v0.5.0/go.mod h1:8FzsFHVUBGZdbDsJw/ot+X+d5HLUbvklYLJ9uGfcI3Y= -github.com/modern-go/concurrent v0.0.0-20180228061459-e0a39a4cb421/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q= -github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd h1:TRLaZ9cD/w8PVh93nsPXa1VrQ6jlwL5oN8l14QlcNfg= -github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q= -github.com/modern-go/reflect2 v1.0.2 h1:xBagoLtFs94CBntxluKeaWgTMpvLxC4ur3nMaC9Gz0M= -github.com/modern-go/reflect2 v1.0.2/go.mod h1:yWuevngMOJpCy52FWWMvUC8ws7m/LJsjYzDa0/r8luk= -github.com/morikuni/aec v1.0.0 h1:nP9CBfwrvYnBRgY6qfDQkygYDmYwOilePFkwzv4dU8A= -github.com/morikuni/aec v1.0.0/go.mod h1:BbKIizmSmc5MMPqRYbxO4ZU0S0+P200+tUnFx7PXmsc= +github.com/minio/highwayhash v1.0.4-0.20251030100505-070ab1a87a76 h1:KGuD/pM2JpL9FAYvBrnBBeENKZNh6eNtjqytV6TYjnk= +github.com/minio/highwayhash v1.0.4-0.20251030100505-070ab1a87a76/go.mod h1:GGYsuwP/fPD6Y9hMiXuapVvlIUEhFhMTh0rxU3ik1LQ= github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 h1:C3w9PqII01/Oq1c1nUAm88MOHcQC9l5mIlSMApZMrHA= github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822/go.mod h1:+n7T8mK8HuQTcFwEeznm/DIxMOiR9yIdICNftLE1DvQ= github.com/nats-io/jwt/v2 v2.8.0 h1:K7uzyz50+yGZDO5o772eRE7atlcSEENpL7P+b74JV1g= @@ -318,8 +293,6 @@ github.com/vektah/gqlparser/v2 v2.5.31 h1:YhWGA1mfTjID7qJhd1+Vxhpk5HTgydrGU9IgkW github.com/vektah/gqlparser/v2 v2.5.31/go.mod h1:c1I28gSOVNzlfc4WuDlqU7voQnsqI6OG2amkBAFmgts= github.com/xrash/smetrics 
v0.0.0-20250705151800-55b8f293f342 h1:FnBeRrxr7OU4VvAzt5X7s6266i6cSVkkFPS0TuXWbIg= github.com/xrash/smetrics v0.0.0-20250705151800-55b8f293f342/go.mod h1:Ohn+xnUBiLI6FVj/9LpzZWtj1/D6lUovWYBkxHVV3aM= -github.com/xtgo/set v1.0.0 h1:6BCNBRv3ORNDQ7fyoJXRv+tstJz3m1JVFQErfeZz2pY= -github.com/xtgo/set v1.0.0/go.mod h1:d3NHzGzSa0NmB2NhFyECA+QdRp29oEn2xbT+TpeFoM8= github.com/yuin/goldmark v1.4.13/go.mod h1:6yULJ656Px+3vBD8DxQVa3kxgyrAnzto9xy5taEt/CY= go.uber.org/goleak v1.3.0 h1:2K3zAYmnTNqV73imy9J1T3WC+gmCePx2hEGkimedGto= go.uber.org/goleak v1.3.0/go.mod h1:CoHD4mav9JJNrW/WLlf7HGZPjdw8EucARQHekz1X6bE= diff --git a/internal/api/cluster.go b/internal/api/cluster.go index b6f41244..d1c3c898 100644 --- a/internal/api/cluster.go +++ b/internal/api/cluster.go @@ -27,7 +27,7 @@ type GetClustersAPIResponse struct { // @description Get a list of all cluster configs. Specific cluster can be requested using query parameter. // @produce json // @param cluster query string false "Job Cluster" -// @success 200 {object} api.GetClustersApiResponse "Array of clusters" +// @success 200 {object} api.GetClustersAPIResponse "Array of clusters" // @failure 400 {object} api.ErrorResponse "Bad Request" // @failure 401 {object} api.ErrorResponse "Unauthorized" // @failure 403 {object} api.ErrorResponse "Forbidden" diff --git a/internal/api/docs.go b/internal/api/docs.go index d0b5c6fb..78eecfa3 100644 --- a/internal/api/docs.go +++ b/internal/api/docs.go @@ -25,11 +25,6 @@ const docTemplate = `{ "paths": { "/api/clusters/": { "get": { - "security": [ - { - "ApiKeyAuth": [] - } - ], "description": "Get a list of all cluster configs. 
Specific cluster can be requested using query parameter.", "produces": [ "application/json" @@ -50,7 +45,7 @@ const docTemplate = `{ "200": { "description": "Array of clusters", "schema": { - "$ref": "#/definitions/api.GetClustersApiResponse" + "$ref": "#/definitions/api.GetClustersAPIResponse" } }, "400": { @@ -77,16 +72,16 @@ const docTemplate = `{ "$ref": "#/definitions/api.ErrorResponse" } } - } - } - }, - "/api/jobs/": { - "get": { + }, "security": [ { "ApiKeyAuth": [] } - ], + ] + } + }, + "/api/jobs/": { + "get": { "description": "Get a list of all jobs. Filters can be applied using query parameters.\nNumber of results can be limited by page. Results are sorted by descending startTime.", "produces": [ "application/json" @@ -145,7 +140,7 @@ const docTemplate = `{ "200": { "description": "Job array and page info", "schema": { - "$ref": "#/definitions/api.GetJobsApiResponse" + "$ref": "#/definitions/api.GetJobsAPIResponse" } }, "400": { @@ -172,16 +167,16 @@ const docTemplate = `{ "$ref": "#/definitions/api.ErrorResponse" } } - } - } - }, - "/api/jobs/delete_job/": { - "delete": { + }, "security": [ { "ApiKeyAuth": [] } - ], + ] + } + }, + "/api/jobs/delete_job/": { + "delete": { "description": "Job to delete is specified by request body. 
All fields are required in this case.", "consumes": [ "application/json" @@ -200,7 +195,7 @@ const docTemplate = `{ "in": "body", "required": true, "schema": { - "$ref": "#/definitions/api.DeleteJobApiRequest" + "$ref": "#/definitions/api.DeleteJobAPIRequest" } } ], @@ -208,7 +203,7 @@ const docTemplate = `{ "200": { "description": "Success message", "schema": { - "$ref": "#/definitions/api.DefaultApiResponse" + "$ref": "#/definitions/api.DefaultAPIResponse" } }, "400": { @@ -247,16 +242,16 @@ const docTemplate = `{ "$ref": "#/definitions/api.ErrorResponse" } } - } - } - }, - "/api/jobs/delete_job/{id}": { - "delete": { + }, "security": [ { "ApiKeyAuth": [] } - ], + ] + } + }, + "/api/jobs/delete_job/{id}": { + "delete": { "description": "Job to remove is specified by database ID. This will not remove the job from the job archive.", "produces": [ "application/json" @@ -278,7 +273,7 @@ const docTemplate = `{ "200": { "description": "Success message", "schema": { - "$ref": "#/definitions/api.DefaultApiResponse" + "$ref": "#/definitions/api.DefaultAPIResponse" } }, "400": { @@ -317,16 +312,16 @@ const docTemplate = `{ "$ref": "#/definitions/api.ErrorResponse" } } - } - } - }, - "/api/jobs/delete_job_before/{ts}": { - "delete": { + }, "security": [ { "ApiKeyAuth": [] } - ], + ] + } + }, + "/api/jobs/delete_job_before/{ts}": { + "delete": { "description": "Remove all jobs with start time before timestamp. 
The jobs will not be removed from the job archive.", "produces": [ "application/json" @@ -342,13 +337,19 @@ const docTemplate = `{ "name": "ts", "in": "path", "required": true + }, + { + "type": "boolean", + "description": "Omit jobs with tags from deletion", + "name": "omit-tagged", + "in": "query" } ], "responses": { "200": { "description": "Success message", "schema": { - "$ref": "#/definitions/api.DefaultApiResponse" + "$ref": "#/definitions/api.DefaultAPIResponse" } }, "400": { @@ -387,16 +388,16 @@ const docTemplate = `{ "$ref": "#/definitions/api.ErrorResponse" } } - } - } - }, - "/api/jobs/edit_meta/{id}": { - "post": { + }, "security": [ { "ApiKeyAuth": [] } - ], + ] + } + }, + "/api/jobs/edit_meta/{id}": { + "post": { "description": "Edit key value pairs in job metadata json\nIf a key already exists its content will be overwritten", "consumes": [ "application/json" @@ -457,16 +458,16 @@ const docTemplate = `{ "$ref": "#/definitions/api.ErrorResponse" } } - } - } - }, - "/api/jobs/start_job/": { - "post": { + }, "security": [ { "ApiKeyAuth": [] } - ], + ] + } + }, + "/api/jobs/start_job/": { + "post": { "description": "Job specified in request body will be saved to database as \"running\" with new DB ID.\nJob specifications follow the 'Job' scheme, API will fail to execute if requirements are not met.", "consumes": [ "application/json" @@ -493,7 +494,7 @@ const docTemplate = `{ "201": { "description": "Job added successfully", "schema": { - "$ref": "#/definitions/api.DefaultApiResponse" + "$ref": "#/definitions/api.DefaultAPIResponse" } }, "400": { @@ -526,16 +527,16 @@ const docTemplate = `{ "$ref": "#/definitions/api.ErrorResponse" } } - } - } - }, - "/api/jobs/stop_job/": { - "post": { + }, "security": [ { "ApiKeyAuth": [] } - ], + ] + } + }, + "/api/jobs/stop_job/": { + "post": { "description": "Job to stop is specified by request body. 
All fields are required in this case.\nReturns full job resource information according to 'Job' scheme.", "produces": [ "application/json" @@ -551,7 +552,7 @@ const docTemplate = `{ "in": "body", "required": true, "schema": { - "$ref": "#/definitions/api.StopJobApiRequest" + "$ref": "#/definitions/api.StopJobAPIRequest" } } ], @@ -598,16 +599,16 @@ const docTemplate = `{ "$ref": "#/definitions/api.ErrorResponse" } } - } - } - }, - "/api/jobs/tag_job/{id}": { - "post": { + }, "security": [ { "ApiKeyAuth": [] } - ], + ] + } + }, + "/api/jobs/tag_job/{id}": { + "post": { "description": "Adds tag(s) to a job specified by DB ID. Name and Type of Tag(s) can be chosen freely.\nTag Scope for frontend visibility will default to \"global\" if none entered, other options: \"admin\" or specific username.\nIf tagged job is already finished: Tag will be written directly to respective archive files.", "consumes": [ "application/json" @@ -635,7 +636,7 @@ const docTemplate = `{ "schema": { "type": "array", "items": { - "$ref": "#/definitions/api.ApiTag" + "$ref": "#/definitions/api.APITag" } } } @@ -671,16 +672,16 @@ const docTemplate = `{ "$ref": "#/definitions/api.ErrorResponse" } } - } - } - }, - "/api/jobs/{id}": { - "get": { + }, "security": [ { "ApiKeyAuth": [] } - ], + ] + } + }, + "/api/jobs/{id}": { + "get": { "description": "Job to get is specified by database ID\nReturns full job resource information according to 'Job' scheme and all metrics according to 'JobData'.", "produces": [ "application/json" @@ -708,7 +709,7 @@ const docTemplate = `{ "200": { "description": "Job resource", "schema": { - "$ref": "#/definitions/api.GetJobApiResponse" + "$ref": "#/definitions/api.GetJobAPIResponse" } }, "400": { @@ -747,14 +748,14 @@ const docTemplate = `{ "$ref": "#/definitions/api.ErrorResponse" } } - } - }, - "post": { + }, "security": [ { "ApiKeyAuth": [] } - ], + ] + }, + "post": { "description": "Job to get is specified by database ID\nReturns full job resource information 
according to 'Job' scheme and all metrics according to 'JobData'.", "consumes": [ "application/json" @@ -791,7 +792,7 @@ const docTemplate = `{ "200": { "description": "Job resource", "schema": { - "$ref": "#/definitions/api.GetJobApiResponse" + "$ref": "#/definitions/api.GetJobAPIResponse" } }, "400": { @@ -830,16 +831,16 @@ const docTemplate = `{ "$ref": "#/definitions/api.ErrorResponse" } } - } - } - }, - "/api/nodestats/": { - "post": { + }, "security": [ { "ApiKeyAuth": [] } - ], + ] + } + }, + "/api/nodestats/": { + "post": { "description": "Returns a JSON-encoded list of users.\nRequired query-parameter defines if all users or only users with additional special roles are returned.", "produces": [ "application/json" @@ -863,7 +864,7 @@ const docTemplate = `{ "200": { "description": "Success message", "schema": { - "$ref": "#/definitions/api.DefaultApiResponse" + "$ref": "#/definitions/api.DefaultAPIResponse" } }, "400": { @@ -890,16 +891,86 @@ const docTemplate = `{ "$ref": "#/definitions/api.ErrorResponse" } } - } - } - }, - "/api/users/": { - "get": { + }, "security": [ { "ApiKeyAuth": [] } + ] + } + }, + "/api/user/{id}": { + "post": { + "description": "Allows admins to add/remove roles and projects for a user", + "produces": [ + "text/plain" ], + "tags": [ + "User" + ], + "summary": "Update user roles and projects", + "parameters": [ + { + "type": "string", + "description": "Username", + "name": "id", + "in": "path", + "required": true + }, + { + "type": "string", + "description": "Role to add", + "name": "add-role", + "in": "formData" + }, + { + "type": "string", + "description": "Role to remove", + "name": "remove-role", + "in": "formData" + }, + { + "type": "string", + "description": "Project to add", + "name": "add-project", + "in": "formData" + }, + { + "type": "string", + "description": "Project to remove", + "name": "remove-project", + "in": "formData" + } + ], + "responses": { + "200": { + "description": "Success message", + "schema": { + "type": 
"string" + } + }, + "403": { + "description": "Forbidden", + "schema": { + "$ref": "#/definitions/api.ErrorResponse" + } + }, + "422": { + "description": "Unprocessable Entity", + "schema": { + "$ref": "#/definitions/api.ErrorResponse" + } + } + }, + "security": [ + { + "ApiKeyAuth": [] + } + ] + } + }, + "/api/users/": { + "get": { "description": "Returns a JSON-encoded list of users.\nRequired query-parameter defines if all users or only users with additional special roles are returned.", "produces": [ "application/json" @@ -923,7 +994,7 @@ const docTemplate = `{ "schema": { "type": "array", "items": { - "$ref": "#/definitions/api.ApiReturnedUser" + "$ref": "#/definitions/api.APIReturnedUser" } } }, @@ -951,16 +1022,361 @@ const docTemplate = `{ "type": "string" } } - } - } - }, - "/jobs/tag_job/{id}": { - "delete": { + }, "security": [ { "ApiKeyAuth": [] } + ] + }, + "post": { + "description": "Creates a new user with specified credentials and role", + "produces": [ + "text/plain" ], + "tags": [ + "User" + ], + "summary": "Create a new user", + "parameters": [ + { + "type": "string", + "description": "Username", + "name": "username", + "in": "formData", + "required": true + }, + { + "type": "string", + "description": "Password (not required for API users)", + "name": "password", + "in": "formData" + }, + { + "type": "string", + "description": "User role", + "name": "role", + "in": "formData", + "required": true + }, + { + "type": "string", + "description": "Full name", + "name": "name", + "in": "formData" + }, + { + "type": "string", + "description": "Email address", + "name": "email", + "in": "formData" + }, + { + "type": "string", + "description": "Project (required for managers)", + "name": "project", + "in": "formData" + } + ], + "responses": { + "200": { + "description": "Success message", + "schema": { + "type": "string" + } + }, + "400": { + "description": "Bad Request", + "schema": { + "$ref": "#/definitions/api.ErrorResponse" + } + }, + "403": { + 
"description": "Forbidden", + "schema": { + "$ref": "#/definitions/api.ErrorResponse" + } + }, + "422": { + "description": "Unprocessable Entity", + "schema": { + "$ref": "#/definitions/api.ErrorResponse" + } + } + }, + "security": [ + { + "ApiKeyAuth": [] + } + ] + }, + "delete": { + "description": "Deletes a user from the system", + "produces": [ + "text/plain" + ], + "tags": [ + "User" + ], + "summary": "Delete a user", + "parameters": [ + { + "type": "string", + "description": "Username to delete", + "name": "username", + "in": "formData", + "required": true + } + ], + "responses": { + "200": { + "description": "Success", + "schema": { + "type": "string" + } + }, + "403": { + "description": "Forbidden", + "schema": { + "$ref": "#/definitions/api.ErrorResponse" + } + }, + "422": { + "description": "Unprocessable Entity", + "schema": { + "$ref": "#/definitions/api.ErrorResponse" + } + } + }, + "security": [ + { + "ApiKeyAuth": [] + } + ] + } + }, + "/configuration/": { + "post": { + "description": "Updates a user's configuration key-value pair.", + "consumes": [ + "multipart/form-data" + ], + "produces": [ + "text/plain" + ], + "tags": [ + "Frontend" + ], + "summary": "Update user configuration", + "parameters": [ + { + "type": "string", + "description": "Configuration key", + "name": "key", + "in": "formData", + "required": true + }, + { + "type": "string", + "description": "Configuration value", + "name": "value", + "in": "formData", + "required": true + } + ], + "responses": { + "200": { + "description": "success", + "schema": { + "type": "string" + } + }, + "500": { + "description": "Internal Server Error", + "schema": { + "$ref": "#/definitions/api.ErrorResponse" + } + } + }, + "security": [ + { + "ApiKeyAuth": [] + } + ] + } + }, + "/debug/": { + "post": { + "description": "This endpoint allows the users to print the content of", + "produces": [ + "application/json" + ], + "tags": [ + "debug" + ], + "summary": "Debug endpoint", + "parameters": [ + { + 
"type": "string", + "description": "Selector", + "name": "selector", + "in": "query" + } + ], + "responses": { + "200": { + "description": "Debug dump", + "schema": { + "type": "string" + } + }, + "400": { + "description": "Bad Request", + "schema": { + "$ref": "#/definitions/api.ErrorResponse" + } + }, + "401": { + "description": "Unauthorized", + "schema": { + "$ref": "#/definitions/api.ErrorResponse" + } + }, + "403": { + "description": "Forbidden", + "schema": { + "$ref": "#/definitions/api.ErrorResponse" + } + }, + "500": { + "description": "Internal Server Error", + "schema": { + "$ref": "#/definitions/api.ErrorResponse" + } + } + }, + "security": [ + { + "ApiKeyAuth": [] + } + ] + } + }, + "/free/": { + "post": { + "description": "This endpoint allows the users to free the Buffers from the", + "produces": [ + "application/json" + ], + "tags": [ + "free" + ], + "parameters": [ + { + "type": "string", + "description": "up to timestamp", + "name": "to", + "in": "query" + } + ], + "responses": { + "200": { + "description": "ok", + "schema": { + "type": "string" + } + }, + "400": { + "description": "Bad Request", + "schema": { + "$ref": "#/definitions/api.ErrorResponse" + } + }, + "401": { + "description": "Unauthorized", + "schema": { + "$ref": "#/definitions/api.ErrorResponse" + } + }, + "403": { + "description": "Forbidden", + "schema": { + "$ref": "#/definitions/api.ErrorResponse" + } + }, + "500": { + "description": "Internal Server Error", + "schema": { + "$ref": "#/definitions/api.ErrorResponse" + } + } + }, + "security": [ + { + "ApiKeyAuth": [] + } + ] + } + }, + "/healthcheck/": { + "get": { + "description": "This endpoint allows the users to check if a node is healthy", + "produces": [ + "application/json" + ], + "tags": [ + "healthcheck" + ], + "summary": "HealthCheck endpoint", + "parameters": [ + { + "type": "string", + "description": "Selector", + "name": "selector", + "in": "query" + } + ], + "responses": { + "200": { + "description": "Debug 
dump", + "schema": { + "type": "string" + } + }, + "400": { + "description": "Bad Request", + "schema": { + "$ref": "#/definitions/api.ErrorResponse" + } + }, + "401": { + "description": "Unauthorized", + "schema": { + "$ref": "#/definitions/api.ErrorResponse" + } + }, + "403": { + "description": "Forbidden", + "schema": { + "$ref": "#/definitions/api.ErrorResponse" + } + }, + "500": { + "description": "Internal Server Error", + "schema": { + "$ref": "#/definitions/api.ErrorResponse" + } + } + }, + "security": [ + { + "ApiKeyAuth": [] + } + ] + } + }, + "/jobs/tag_job/{id}": { + "delete": { "description": "Removes tag(s) from a job specified by DB ID. Name and Type of Tag(s) must match.\nTag Scope is required for matching, options: \"global\", \"admin\". Private tags can not be deleted via API.\nIf tagged job is already finished: Tag will be removed from respective archive files.", "consumes": [ "application/json" @@ -988,7 +1404,7 @@ const docTemplate = `{ "schema": { "type": "array", "items": { - "$ref": "#/definitions/api.ApiTag" + "$ref": "#/definitions/api.APITag" } } } @@ -1024,16 +1440,276 @@ const docTemplate = `{ "$ref": "#/definitions/api.ErrorResponse" } } - } - } - }, - "/tags/": { - "delete": { + }, "security": [ { "ApiKeyAuth": [] } + ] + } + }, + "/jwt/": { + "get": { + "description": "Generates a JWT token for a user. 
Admins can generate tokens for any user, regular users only for themselves.", + "consumes": [ + "multipart/form-data" ], + "produces": [ + "text/plain" + ], + "tags": [ + "Frontend" + ], + "summary": "Generate JWT token", + "parameters": [ + { + "type": "string", + "description": "Username to generate JWT for", + "name": "username", + "in": "formData", + "required": true + } + ], + "responses": { + "200": { + "description": "JWT token", + "schema": { + "type": "string" + } + }, + "403": { + "description": "Forbidden", + "schema": { + "$ref": "#/definitions/api.ErrorResponse" + } + }, + "404": { + "description": "User Not Found", + "schema": { + "$ref": "#/definitions/api.ErrorResponse" + } + }, + "500": { + "description": "Internal Server Error", + "schema": { + "$ref": "#/definitions/api.ErrorResponse" + } + } + }, + "security": [ + { + "ApiKeyAuth": [] + } + ] + } + }, + "/machine_state/{cluster}/{host}": { + "get": { + "description": "Retrieves stored machine state data for a specific cluster node. Validates cluster and host names to prevent path traversal.", + "produces": [ + "application/json" + ], + "tags": [ + "Machine State" + ], + "summary": "Retrieve machine state", + "parameters": [ + { + "type": "string", + "description": "Cluster name", + "name": "cluster", + "in": "path", + "required": true + }, + { + "type": "string", + "description": "Host name", + "name": "host", + "in": "path", + "required": true + } + ], + "responses": { + "200": { + "description": "Machine state JSON data", + "schema": { + "type": "object" + } + }, + "400": { + "description": "Bad Request", + "schema": { + "$ref": "#/definitions/api.ErrorResponse" + } + }, + "404": { + "description": "Machine state not enabled or file not found", + "schema": { + "$ref": "#/definitions/api.ErrorResponse" + } + } + }, + "security": [ + { + "ApiKeyAuth": [] + } + ] + }, + "put": { + "description": "Stores machine state data for a specific cluster node. 
Validates cluster and host names to prevent path traversal.", + "consumes": [ + "application/json" + ], + "produces": [ + "text/plain" + ], + "tags": [ + "Machine State" + ], + "summary": "Store machine state", + "parameters": [ + { + "type": "string", + "description": "Cluster name", + "name": "cluster", + "in": "path", + "required": true + }, + { + "type": "string", + "description": "Host name", + "name": "host", + "in": "path", + "required": true + } + ], + "responses": { + "201": { + "description": "Created" + }, + "400": { + "description": "Bad Request", + "schema": { + "$ref": "#/definitions/api.ErrorResponse" + } + }, + "404": { + "description": "Machine state not enabled", + "schema": { + "$ref": "#/definitions/api.ErrorResponse" + } + }, + "500": { + "description": "Internal Server Error", + "schema": { + "$ref": "#/definitions/api.ErrorResponse" + } + } + }, + "security": [ + { + "ApiKeyAuth": [] + } + ] + } + }, + "/notice/": { + "post": { + "description": "Updates the notice.txt file content. Only admins are allowed. 
Content is limited to 10000 characters.", + "consumes": [ + "multipart/form-data" + ], + "produces": [ + "text/plain" + ], + "tags": [ + "Config" + ], + "summary": "Update system notice", + "parameters": [ + { + "type": "string", + "description": "New notice content (max 10000 characters)", + "name": "new-content", + "in": "formData", + "required": true + } + ], + "responses": { + "200": { + "description": "Update Notice Content Success", + "schema": { + "type": "string" + } + }, + "400": { + "description": "Bad Request", + "schema": { + "$ref": "#/definitions/api.ErrorResponse" + } + }, + "403": { + "description": "Forbidden", + "schema": { + "$ref": "#/definitions/api.ErrorResponse" + } + }, + "500": { + "description": "Internal Server Error", + "schema": { + "$ref": "#/definitions/api.ErrorResponse" + } + } + }, + "security": [ + { + "ApiKeyAuth": [] + } + ] + } + }, + "/roles/": { + "get": { + "description": "Returns a list of valid user roles. Only admins are allowed.", + "produces": [ + "application/json" + ], + "tags": [ + "Config" + ], + "summary": "Get available roles", + "responses": { + "200": { + "description": "List of role names", + "schema": { + "type": "array", + "items": { + "type": "string" + } + } + }, + "403": { + "description": "Forbidden", + "schema": { + "$ref": "#/definitions/api.ErrorResponse" + } + }, + "500": { + "description": "Internal Server Error", + "schema": { + "$ref": "#/definitions/api.ErrorResponse" + } + } + }, + "security": [ + { + "ApiKeyAuth": [] + } + ] + } + }, + "/tags/": { + "delete": { "description": "Removes tags by type and name. Name and Type of Tag(s) must match.\nTag Scope is required for matching, options: \"global\", \"admin\". 
Private tags can not be deleted via API.\nTag wills be removed from respective archive files.", "consumes": [ "application/json" @@ -1054,7 +1730,7 @@ const docTemplate = `{ "schema": { "type": "array", "items": { - "$ref": "#/definitions/api.ApiTag" + "$ref": "#/definitions/api.APITag" } } } @@ -1090,12 +1766,72 @@ const docTemplate = `{ "$ref": "#/definitions/api.ErrorResponse" } } - } + }, + "security": [ + { + "ApiKeyAuth": [] + } + ] + } + }, + "/write/": { + "post": { + "consumes": [ + "text/plain" + ], + "produces": [ + "application/json" + ], + "parameters": [ + { + "type": "string", + "description": "If the lines in the body do not have a cluster tag, use this value instead.", + "name": "cluster", + "in": "query" + } + ], + "responses": { + "200": { + "description": "ok", + "schema": { + "type": "string" + } + }, + "400": { + "description": "Bad Request", + "schema": { + "$ref": "#/definitions/api.ErrorResponse" + } + }, + "401": { + "description": "Unauthorized", + "schema": { + "$ref": "#/definitions/api.ErrorResponse" + } + }, + "403": { + "description": "Forbidden", + "schema": { + "$ref": "#/definitions/api.ErrorResponse" + } + }, + "500": { + "description": "Internal Server Error", + "schema": { + "$ref": "#/definitions/api.ErrorResponse" + } + } + }, + "security": [ + { + "ApiKeyAuth": [] + } + ] } } }, "definitions": { - "api.ApiReturnedUser": { + "api.APIReturnedUser": { "type": "object", "properties": { "email": { @@ -1121,7 +1857,7 @@ const docTemplate = `{ } } }, - "api.ApiTag": { + "api.APITag": { "type": "object", "properties": { "name": { @@ -1141,7 +1877,7 @@ const docTemplate = `{ } } }, - "api.DefaultApiResponse": { + "api.DefaultAPIResponse": { "type": "object", "properties": { "msg": { @@ -1149,7 +1885,7 @@ const docTemplate = `{ } } }, - "api.DeleteJobApiRequest": { + "api.DeleteJobAPIRequest": { "type": "object", "required": [ "jobId" @@ -1198,7 +1934,7 @@ const docTemplate = `{ } } }, - "api.GetClustersApiResponse": { + 
"api.GetClustersAPIResponse": { "type": "object", "properties": { "clusters": { @@ -1210,7 +1946,7 @@ const docTemplate = `{ } } }, - "api.GetJobApiResponse": { + "api.GetJobAPIResponse": { "type": "object", "properties": { "data": { @@ -1224,7 +1960,7 @@ const docTemplate = `{ } } }, - "api.GetJobsApiResponse": { + "api.GetJobsAPIResponse": { "type": "object", "properties": { "items": { @@ -1258,39 +1994,7 @@ const docTemplate = `{ } } }, - "api.Node": { - "type": "object", - "properties": { - "cpusAllocated": { - "type": "integer" - }, - "cpusTotal": { - "type": "integer" - }, - "gpusAllocated": { - "type": "integer" - }, - "gpusTotal": { - "type": "integer" - }, - "hostname": { - "type": "string" - }, - "memoryAllocated": { - "type": "integer" - }, - "memoryTotal": { - "type": "integer" - }, - "states": { - "type": "array", - "items": { - "type": "string" - } - } - } - }, - "api.StopJobApiRequest": { + "api.StopJobAPIRequest": { "type": "object", "required": [ "jobState", @@ -1333,7 +2037,7 @@ const docTemplate = `{ "nodes": { "type": "array", "items": { - "$ref": "#/definitions/api.Node" + "$ref": "#/definitions/schema.NodePayload" } } } @@ -1342,12 +2046,15 @@ const docTemplate = `{ "type": "object", "properties": { "id": { + "description": "Unique identifier for the accelerator (e.g., \"0\", \"1\", \"GPU-0\")", "type": "string" }, "model": { + "description": "Specific model name (e.g., \"A100\", \"MI100\")", "type": "string" }, "type": { + "description": "Type of accelerator (e.g., \"Nvidia GPU\", \"AMD GPU\")", "type": "string" } } @@ -1356,15 +2063,18 @@ const docTemplate = `{ "type": "object", "properties": { "metricConfig": { + "description": "Cluster-wide metric configurations", "type": "array", "items": { "$ref": "#/definitions/schema.MetricConfig" } }, "name": { + "description": "Unique cluster name (e.g., \"fritz\", \"alex\")", "type": "string" }, "subClusters": { + "description": "Homogeneous partitions within the cluster", "type": "array", "items": 
{ "$ref": "#/definitions/schema.SubCluster" @@ -1373,6 +2083,7 @@ const docTemplate = `{ } }, "schema.Job": { + "description": "Information of a HPC job.", "type": "object", "properties": { "arrayJobId": { @@ -1401,6 +2112,13 @@ const docTemplate = `{ "format": "float64" } }, + "exclusive": { + "description": "for backwards compatibility", + "type": "integer", + "maximum": 2, + "minimum": 0, + "example": 1 + }, "footprint": { "type": "object", "additionalProperties": { @@ -1423,7 +2141,7 @@ const docTemplate = `{ "deadline", "failed", "node_fail", - "out_of_memory", + "out-of-memory", "pending", "preempted", "running", @@ -1535,9 +2253,11 @@ const docTemplate = `{ "type": "object", "properties": { "id": { + "description": "Internal database ID", "type": "integer" }, "jobId": { + "description": "The job's external job ID", "type": "integer" } } @@ -1546,9 +2266,11 @@ const docTemplate = `{ "type": "object", "properties": { "count": { + "description": "Total count of available items", "type": "integer" }, "items": { + "description": "List of job links", "type": "array", "items": { "$ref": "#/definitions/schema.JobLink" @@ -1560,19 +2282,31 @@ const docTemplate = `{ "type": "object", "properties": { "series": { + "description": "Individual time series data", "type": "array", "items": { "$ref": "#/definitions/schema.Series" } }, "statisticsSeries": { - "$ref": "#/definitions/schema.StatsSeries" + "description": "Aggregated statistics over time", + "allOf": [ + { + "$ref": "#/definitions/schema.StatsSeries" + } + ] }, "timestep": { + "description": "Sampling interval in seconds", "type": "integer" }, "unit": { - "$ref": "#/definitions/schema.Unit" + "description": "Unit of measurement", + "allOf": [ + { + "$ref": "#/definitions/schema.Unit" + } + ] } } }, @@ -1638,46 +2372,71 @@ const docTemplate = `{ "type": "object", "properties": { "aggregation": { + "description": "Aggregation function (avg, sum, min, max)", "type": "string" }, "alert": { + "description": "Alert 
threshold (requires attention)", "type": "number" }, "caution": { + "description": "Caution threshold (concerning but not critical)", "type": "number" }, "energy": { + "description": "Energy measurement method", "type": "string" }, "footprint": { + "description": "Footprint category", "type": "string" }, "lowerIsBetter": { + "description": "Whether lower values are better", "type": "boolean" }, "name": { + "description": "Metric name (e.g., \"cpu_load\", \"mem_used\")", "type": "string" }, "normal": { + "description": "Normal/typical value (good performance)", "type": "number" }, "peak": { + "description": "Peak/maximum expected value (best performance)", "type": "number" }, + "restrict": { + "description": "Restrict visibility to non user roles", + "type": "boolean" + }, "scope": { - "$ref": "#/definitions/schema.MetricScope" + "description": "Metric scope (node, socket, core, etc.)", + "allOf": [ + { + "$ref": "#/definitions/schema.MetricScope" + } + ] }, "subClusters": { + "description": "Subcluster-specific overrides", "type": "array", "items": { "$ref": "#/definitions/schema.SubClusterConfig" } }, "timestep": { + "description": "Measurement interval in seconds", "type": "integer" }, "unit": { - "$ref": "#/definitions/schema.Unit" + "description": "Unit of measurement", + "allOf": [ + { + "$ref": "#/definitions/schema.Unit" + } + ] } } }, @@ -1706,12 +2465,15 @@ const docTemplate = `{ "type": "object", "properties": { "avg": { + "description": "Average/mean value", "type": "number" }, "max": { + "description": "Maximum value", "type": "number" }, "min": { + "description": "Minimum value", "type": "number" } } @@ -1720,30 +2482,72 @@ const docTemplate = `{ "type": "object", "properties": { "unit": { - "$ref": "#/definitions/schema.Unit" + "description": "Unit of measurement (e.g., FLOP/s, GB/s)", + "allOf": [ + { + "$ref": "#/definitions/schema.Unit" + } + ] }, "value": { + "description": "Numeric value of the measurement", "type": "number" } } }, + 
"schema.NodePayload": { + "type": "object", + "properties": { + "cpusAllocated": { + "description": "Number of allocated CPUs", + "type": "integer" + }, + "gpusAllocated": { + "description": "Number of allocated GPUs", + "type": "integer" + }, + "hostname": { + "description": "Node hostname", + "type": "string" + }, + "jobsRunning": { + "description": "Number of running jobs", + "type": "integer" + }, + "memoryAllocated": { + "description": "Allocated memory in MB", + "type": "integer" + }, + "states": { + "description": "State strings (flexible format)", + "type": "array", + "items": { + "type": "string" + } + } + } + }, "schema.Resource": { "description": "A resource used by a job", "type": "object", "properties": { "accelerators": { + "description": "Allocated accelerator IDs (e.g., GPU IDs)", "type": "array", "items": { "type": "string" } }, "configuration": { + "description": "Optional configuration identifier", "type": "string" }, "hostname": { + "description": "Node hostname", "type": "string" }, "hwthreads": { + "description": "Allocated hardware thread IDs", "type": "array", "items": { "type": "integer" @@ -1755,19 +2559,27 @@ const docTemplate = `{ "type": "object", "properties": { "data": { + "description": "Time series measurements", "type": "array", "items": { "type": "number" } }, "hostname": { + "description": "Source hostname", "type": "string" }, "id": { + "description": "Optional ID (e.g., core ID, GPU ID)", "type": "string" }, "statistics": { - "$ref": "#/definitions/schema.MetricStatistics" + "description": "Statistical summary (min/avg/max)", + "allOf": [ + { + "$ref": "#/definitions/schema.MetricStatistics" + } + ] } } }, @@ -1775,30 +2587,35 @@ const docTemplate = `{ "type": "object", "properties": { "max": { + "description": "Maximum values over time", "type": "array", "items": { "type": "number" } }, "mean": { + "description": "Mean values over time", "type": "array", "items": { "type": "number" } }, "median": { + "description": "Median 
values over time", "type": "array", "items": { "type": "number" } }, "min": { + "description": "Minimum values over time", "type": "array", "items": { "type": "number" } }, "percentiles": { + "description": "Percentile values over time (e.g., 10th, 50th, 90th)", "type": "object", "additionalProperties": { "type": "array", @@ -1814,52 +2631,81 @@ const docTemplate = `{ "type": "object", "properties": { "coresPerSocket": { + "description": "Number of cores per CPU socket", "type": "integer" }, "energyFootprint": { + "description": "Energy-related footprint metrics", "type": "array", "items": { "type": "string" } }, "flopRateScalar": { - "$ref": "#/definitions/schema.MetricValue" + "description": "Theoretical scalar FLOP rate per node", + "allOf": [ + { + "$ref": "#/definitions/schema.MetricValue" + } + ] }, "flopRateSimd": { - "$ref": "#/definitions/schema.MetricValue" + "description": "Theoretical SIMD FLOP rate per node", + "allOf": [ + { + "$ref": "#/definitions/schema.MetricValue" + } + ] }, "footprint": { + "description": "Default footprint metrics for jobs", "type": "array", "items": { "type": "string" } }, "memoryBandwidth": { - "$ref": "#/definitions/schema.MetricValue" + "description": "Theoretical memory bandwidth per node", + "allOf": [ + { + "$ref": "#/definitions/schema.MetricValue" + } + ] }, "metricConfig": { + "description": "Subcluster-specific metric configurations", "type": "array", "items": { "$ref": "#/definitions/schema.MetricConfig" } }, "name": { + "description": "Name of the subcluster (e.g., \"main\", \"gpu\", \"bigmem\")", "type": "string" }, "nodes": { + "description": "Node list in condensed format (e.g., \"node[001-100]\")", "type": "string" }, "processorType": { + "description": "CPU model (e.g., \"Intel Xeon Gold 6148\")", "type": "string" }, "socketsPerNode": { + "description": "Number of CPU sockets per node", "type": "integer" }, "threadsPerCore": { + "description": "Number of hardware threads per core (SMT level)", "type": 
"integer" }, "topology": { - "$ref": "#/definitions/schema.Topology" + "description": "Hardware topology of nodes in this subcluster", + "allOf": [ + { + "$ref": "#/definitions/schema.Topology" + } + ] } } }, @@ -1867,34 +2713,52 @@ const docTemplate = `{ "type": "object", "properties": { "alert": { + "description": "Alert threshold (requires attention)", "type": "number" }, "caution": { + "description": "Caution threshold (concerning but not critical)", "type": "number" }, "energy": { + "description": "Energy measurement configuration", "type": "string" }, "footprint": { + "description": "Footprint category for this metric", "type": "string" }, "lowerIsBetter": { + "description": "Whether lower values indicate better performance", "type": "boolean" }, "name": { + "description": "Metric name (e.g., \"cpu_load\", \"mem_used\")", "type": "string" }, "normal": { + "description": "Normal/typical value (good performance)", "type": "number" }, "peak": { + "description": "Peak/maximum expected value (best performance)", "type": "number" }, "remove": { + "description": "Whether to exclude this metric for this subcluster", + "type": "boolean" + }, + "restrict": { + "description": "Restrict visibility to non user roles", "type": "boolean" }, "unit": { - "$ref": "#/definitions/schema.Unit" + "description": "Unit of measurement", + "allOf": [ + { + "$ref": "#/definitions/schema.Unit" + } + ] } } }, @@ -1923,12 +2787,14 @@ const docTemplate = `{ "type": "object", "properties": { "accelerators": { + "description": "Attached accelerators (GPUs, etc.)", "type": "array", "items": { "$ref": "#/definitions/schema.Accelerator" } }, "core": { + "description": "Hardware threads grouped by core", "type": "array", "items": { "type": "array", @@ -1938,6 +2804,7 @@ const docTemplate = `{ } }, "die": { + "description": "Hardware threads grouped by die (optional)", "type": "array", "items": { "type": "array", @@ -1947,6 +2814,7 @@ const docTemplate = `{ } }, "memoryDomain": { + "description": 
"Hardware threads grouped by NUMA domain", "type": "array", "items": { "type": "array", @@ -1956,12 +2824,14 @@ const docTemplate = `{ } }, "node": { + "description": "All hardware thread IDs on this node", "type": "array", "items": { "type": "integer" } }, "socket": { + "description": "Hardware threads grouped by socket", "type": "array", "items": { "type": "array", @@ -1976,9 +2846,11 @@ const docTemplate = `{ "type": "object", "properties": { "base": { + "description": "Base unit (e.g., \"B/s\", \"F/s\", \"W\")", "type": "string" }, "prefix": { + "description": "SI prefix (e.g., \"G\", \"M\", \"K\", \"T\")", "type": "string" } } diff --git a/internal/api/job.go b/internal/api/job.go index 09f7b22c..1b1e05d6 100644 --- a/internal/api/job.go +++ b/internal/api/job.go @@ -104,7 +104,7 @@ type JobMetricWithName struct { // @param items-per-page query int false "Items per page (Default: 25)" // @param page query int false "Page Number (Default: 1)" // @param with-metadata query bool false "Include metadata (e.g. 
jobScript) in response" -// @success 200 {object} api.GetJobsApiResponse "Job array and page info" +// @success 200 {object} api.GetJobsAPIResponse "Job array and page info" // @failure 400 {object} api.ErrorResponse "Bad Request" // @failure 401 {object} api.ErrorResponse "Unauthorized" // @failure 403 {object} api.ErrorResponse "Forbidden" @@ -232,7 +232,7 @@ func (api *RestAPI) getJobs(rw http.ResponseWriter, r *http.Request) { // @produce json // @param id path int true "Database ID of Job" // @param all-metrics query bool false "Include all available metrics" -// @success 200 {object} api.GetJobApiResponse "Job resource" +// @success 200 {object} api.GetJobAPIResponse "Job resource" // @failure 400 {object} api.ErrorResponse "Bad Request" // @failure 401 {object} api.ErrorResponse "Unauthorized" // @failure 403 {object} api.ErrorResponse "Forbidden" @@ -324,8 +324,8 @@ func (api *RestAPI) getCompleteJobByID(rw http.ResponseWriter, r *http.Request) // @accept json // @produce json // @param id path int true "Database ID of Job" -// @param request body api.GetJobApiRequest true "Array of metric names" -// @success 200 {object} api.GetJobApiResponse "Job resource" +// @param request body api.GetJobAPIRequest true "Array of metric names" +// @success 200 {object} api.GetJobAPIResponse "Job resource" // @failure 400 {object} api.ErrorResponse "Bad Request" // @failure 401 {object} api.ErrorResponse "Unauthorized" // @failure 403 {object} api.ErrorResponse "Forbidden" @@ -478,7 +478,7 @@ func (api *RestAPI) editMeta(rw http.ResponseWriter, r *http.Request) { // @accept json // @produce json // @param id path int true "Job Database ID" -// @param request body api.TagJobApiRequest true "Array of tag-objects to add" +// @param request body api.TagJobAPIRequest true "Array of tag-objects to add" // @success 200 {object} schema.Job "Updated job resource" // @failure 400 {object} api.ErrorResponse "Bad Request" // @failure 401 {object} api.ErrorResponse "Unauthorized" @@ 
-542,7 +542,7 @@ func (api *RestAPI) tagJob(rw http.ResponseWriter, r *http.Request) { // @accept json // @produce json // @param id path int true "Job Database ID" -// @param request body api.TagJobApiRequest true "Array of tag-objects to remove" +// @param request body api.TagJobAPIRequest true "Array of tag-objects to remove" // @success 200 {object} schema.Job "Updated job resource" // @failure 400 {object} api.ErrorResponse "Bad Request" // @failure 401 {object} api.ErrorResponse "Unauthorized" @@ -606,7 +606,7 @@ func (api *RestAPI) removeTagJob(rw http.ResponseWriter, r *http.Request) { // @description Tag wills be removed from respective archive files. // @accept json // @produce plain -// @param request body api.TagJobApiRequest true "Array of tag-objects to remove" +// @param request body api.TagJobAPIRequest true "Array of tag-objects to remove" // @success 200 {string} string "Success Response" // @failure 400 {object} api.ErrorResponse "Bad Request" // @failure 401 {object} api.ErrorResponse "Unauthorized" @@ -650,7 +650,7 @@ func (api *RestAPI) removeTags(rw http.ResponseWriter, r *http.Request) { // @accept json // @produce json // @param request body schema.Job true "Job to add" -// @success 201 {object} api.DefaultApiResponse "Job added successfully" +// @success 201 {object} api.DefaultAPIResponse "Job added successfully" // @failure 400 {object} api.ErrorResponse "Bad Request" // @failure 401 {object} api.ErrorResponse "Unauthorized" // @failure 403 {object} api.ErrorResponse "Forbidden" @@ -728,7 +728,7 @@ func (api *RestAPI) startJob(rw http.ResponseWriter, r *http.Request) { // @description Job to stop is specified by request body. All fields are required in this case. // @description Returns full job resource information according to 'Job' scheme. 
// @produce json -// @param request body api.StopJobApiRequest true "All fields required" +// @param request body api.StopJobAPIRequest true "All fields required" // @success 200 {object} schema.Job "Success message" // @failure 400 {object} api.ErrorResponse "Bad Request" // @failure 401 {object} api.ErrorResponse "Unauthorized" @@ -754,7 +754,6 @@ func (api *RestAPI) stopJobByRequest(rw http.ResponseWriter, r *http.Request) { return } - // cclog.Printf("loading db job for stopJobByRequest... : stopJobApiRequest=%v", req) job, err = api.JobRepository.Find(req.JobID, req.Cluster, req.StartTime) if err != nil { // Try cached jobs if not found in main repository @@ -776,7 +775,7 @@ func (api *RestAPI) stopJobByRequest(rw http.ResponseWriter, r *http.Request) { // @description Job to remove is specified by database ID. This will not remove the job from the job archive. // @produce json // @param id path int true "Database ID of Job" -// @success 200 {object} api.DefaultApiResponse "Success message" +// @success 200 {object} api.DefaultAPIResponse "Success message" // @failure 400 {object} api.ErrorResponse "Bad Request" // @failure 401 {object} api.ErrorResponse "Unauthorized" // @failure 403 {object} api.ErrorResponse "Forbidden" @@ -820,8 +819,8 @@ func (api *RestAPI) deleteJobByID(rw http.ResponseWriter, r *http.Request) { // @description Job to delete is specified by request body. All fields are required in this case. 
// @accept json // @produce json -// @param request body api.DeleteJobApiRequest true "All fields required" -// @success 200 {object} api.DefaultApiResponse "Success message" +// @param request body api.DeleteJobAPIRequest true "All fields required" +// @success 200 {object} api.DefaultAPIResponse "Success message" // @failure 400 {object} api.ErrorResponse "Bad Request" // @failure 401 {object} api.ErrorResponse "Unauthorized" // @failure 403 {object} api.ErrorResponse "Forbidden" @@ -873,7 +872,7 @@ func (api *RestAPI) deleteJobByRequest(rw http.ResponseWriter, r *http.Request) // @description Remove all jobs with start time before timestamp. The jobs will not be removed from the job archive. // @produce json // @param ts path int true "Unix epoch timestamp" -// @success 200 {object} api.DefaultApiResponse "Success message" +// @success 200 {object} api.DefaultAPIResponse "Success message" // @failure 400 {object} api.ErrorResponse "Bad Request" // @failure 401 {object} api.ErrorResponse "Unauthorized" // @failure 403 {object} api.ErrorResponse "Forbidden" diff --git a/internal/api/node.go b/internal/api/node.go index 350f097d..4ad5337a 100644 --- a/internal/api/node.go +++ b/internal/api/node.go @@ -47,7 +47,7 @@ func determineState(states []string) schema.SchedulerState { // @description Required query-parameter defines if all users or only users with additional special roles are returned. 
// @produce json // @param request body UpdateNodeStatesRequest true "Request body containing nodes and their states" -// @success 200 {object} api.DefaultApiResponse "Success message" +// @success 200 {object} api.DefaultAPIResponse "Success message" // @failure 400 {object} api.ErrorResponse "Bad Request" // @failure 401 {object} api.ErrorResponse "Unauthorized" // @failure 403 {object} api.ErrorResponse "Forbidden" diff --git a/internal/api/user.go b/internal/api/user.go index 1821b69b..5564fd61 100644 --- a/internal/api/user.go +++ b/internal/api/user.go @@ -31,7 +31,7 @@ type APIReturnedUser struct { // @description Required query-parameter defines if all users or only users with additional special roles are returned. // @produce json // @param not-just-user query bool true "If returned list should contain all users or only users with additional special roles" -// @success 200 {array} api.ApiReturnedUser "List of users returned successfully" +// @success 200 {array} api.APIReturnedUser "List of users returned successfully" // @failure 400 {string} string "Bad Request" // @failure 401 {string} string "Unauthorized" // @failure 403 {string} string "Forbidden" diff --git a/internal/graph/generated/generated.go b/internal/graph/generated/generated.go index 2d3aca04..c218c0af 100644 --- a/internal/graph/generated/generated.go +++ b/internal/graph/generated/generated.go @@ -10815,7 +10815,7 @@ func (ec *executionContext) _SubCluster_metricConfig(ctx context.Context, field return obj.MetricConfig, nil }, nil, - ec.marshalNMetricConfig2ᚕgithubᚗcomᚋClusterCockpitᚋccᚑlibᚋv2ᚋschemaᚐMetricConfigᚄ, + ec.marshalNMetricConfig2ᚕᚖgithubᚗcomᚋClusterCockpitᚋccᚑlibᚋv2ᚋschemaᚐMetricConfigᚄ, true, true, ) @@ -18466,11 +18466,7 @@ func (ec *executionContext) marshalNJobsStatistics2ᚖgithubᚗcomᚋClusterCock return ec._JobsStatistics(ctx, sel, v) } -func (ec *executionContext) marshalNMetricConfig2githubᚗcomᚋClusterCockpitᚋccᚑlibᚋv2ᚋschemaᚐMetricConfig(ctx context.Context, sel 
ast.SelectionSet, v schema.MetricConfig) graphql.Marshaler { - return ec._MetricConfig(ctx, sel, &v) -} - -func (ec *executionContext) marshalNMetricConfig2ᚕgithubᚗcomᚋClusterCockpitᚋccᚑlibᚋv2ᚋschemaᚐMetricConfigᚄ(ctx context.Context, sel ast.SelectionSet, v []schema.MetricConfig) graphql.Marshaler { +func (ec *executionContext) marshalNMetricConfig2ᚕᚖgithubᚗcomᚋClusterCockpitᚋccᚑlibᚋv2ᚋschemaᚐMetricConfigᚄ(ctx context.Context, sel ast.SelectionSet, v []*schema.MetricConfig) graphql.Marshaler { ret := make(graphql.Array, len(v)) var wg sync.WaitGroup isLen1 := len(v) == 1 @@ -18494,7 +18490,7 @@ func (ec *executionContext) marshalNMetricConfig2ᚕgithubᚗcomᚋClusterCockpi if !isLen1 { defer wg.Done() } - ret[i] = ec.marshalNMetricConfig2githubᚗcomᚋClusterCockpitᚋccᚑlibᚋv2ᚋschemaᚐMetricConfig(ctx, sel, v[i]) + ret[i] = ec.marshalNMetricConfig2ᚖgithubᚗcomᚋClusterCockpitᚋccᚑlibᚋv2ᚋschemaᚐMetricConfig(ctx, sel, v[i]) } if isLen1 { f(i) @@ -18514,6 +18510,16 @@ func (ec *executionContext) marshalNMetricConfig2ᚕgithubᚗcomᚋClusterCockpi return ret } +func (ec *executionContext) marshalNMetricConfig2ᚖgithubᚗcomᚋClusterCockpitᚋccᚑlibᚋv2ᚋschemaᚐMetricConfig(ctx context.Context, sel ast.SelectionSet, v *schema.MetricConfig) graphql.Marshaler { + if v == nil { + if !graphql.HasFieldError(ctx, graphql.GetFieldContext(ctx)) { + graphql.AddErrorf(ctx, "the requested element is null which the schema does not allow") + } + return graphql.Null + } + return ec._MetricConfig(ctx, sel, v) +} + func (ec *executionContext) marshalNMetricFootprints2ᚕᚖgithubᚗcomᚋClusterCockpitᚋccᚑbackendᚋinternalᚋgraphᚋmodelᚐMetricFootprintsᚄ(ctx context.Context, sel ast.SelectionSet, v []*model.MetricFootprints) graphql.Marshaler { ret := make(graphql.Array, len(v)) var wg sync.WaitGroup diff --git a/internal/graph/schema.resolvers.go b/internal/graph/schema.resolvers.go index 2cb4f992..21ccaf92 100644 --- a/internal/graph/schema.resolvers.go +++ b/internal/graph/schema.resolvers.go @@ -3,7 +3,7 @@ package 
graph // This file will be automatically regenerated based on the schema, any resolver // implementations // will be copied through when generating and any unknown code will be moved to the end. -// Code generated by github.com/99designs/gqlgen version v0.17.84 +// Code generated by github.com/99designs/gqlgen version v0.17.85 import ( "context" @@ -283,7 +283,7 @@ func (r *mutationResolver) RemoveTagFromList(ctx context.Context, tagIds []strin // Test Access: Admins && Admin Tag OR Everyone && Private Tag if user.HasRole(schema.RoleAdmin) && (tscope == "global" || tscope == "admin") || user.Username == tscope { // Remove from DB - if err = r.Repo.RemoveTagById(tid); err != nil { + if err = r.Repo.RemoveTagByID(tid); err != nil { cclog.Warn("Error while removing tag") return nil, err } else { diff --git a/internal/importer/handleImport.go b/internal/importer/handleImport.go index 4b217475..2ac35ea9 100644 --- a/internal/importer/handleImport.go +++ b/internal/importer/handleImport.go @@ -2,6 +2,7 @@ // All rights reserved. This file is part of cc-backend. // Use of this source code is governed by a MIT-style // license that can be found in the LICENSE file. + package importer import ( diff --git a/internal/importer/normalize.go b/internal/importer/normalize.go index c6e84d4b..cc6fb545 100644 --- a/internal/importer/normalize.go +++ b/internal/importer/normalize.go @@ -2,6 +2,7 @@ // All rights reserved. This file is part of cc-backend. // Use of this source code is governed by a MIT-style // license that can be found in the LICENSE file. 
+ package importer import ( diff --git a/internal/metricstore/metricstore.go b/internal/metricstore/metricstore.go index e35b4d58..d75c9ef8 100644 --- a/internal/metricstore/metricstore.go +++ b/internal/metricstore/metricstore.go @@ -74,7 +74,7 @@ func Init(rawConfig json.RawMessage, wg *sync.WaitGroup) { cclog.Debugf("[METRICSTORE]> Using %d workers for checkpoint/archive operations\n", Keys.NumWorkers) // Helper function to add metric configuration - addMetricConfig := func(mc schema.MetricConfig) { + addMetricConfig := func(mc *schema.MetricConfig) { agg, err := AssignAggregationStrategy(mc.Aggregation) if err != nil { cclog.Warnf("Could not find aggregation strategy for metric config '%s': %s", mc.Name, err.Error()) @@ -88,7 +88,7 @@ func Init(rawConfig json.RawMessage, wg *sync.WaitGroup) { for _, c := range archive.Clusters { for _, mc := range c.MetricConfig { - addMetricConfig(*mc) + addMetricConfig(mc) } for _, sc := range c.SubClusters { diff --git a/internal/repository/hooks.go b/internal/repository/hooks.go index c916b57e..824beb7c 100644 --- a/internal/repository/hooks.go +++ b/internal/repository/hooks.go @@ -2,6 +2,7 @@ // All rights reserved. This file is part of cc-backend. // Use of this source code is governed by a MIT-style // license that can be found in the LICENSE file. + package repository import ( diff --git a/internal/repository/job.go b/internal/repository/job.go index bd33774c..9ee51735 100644 --- a/internal/repository/job.go +++ b/internal/repository/job.go @@ -686,7 +686,6 @@ func (r *JobRepository) AllocatedNodes(cluster string) (map[string]map[string]in return subclusters, nil } -// FIXME: Set duration to requested walltime? // StopJobsExceedingWalltimeBy marks running jobs as failed if they exceed their walltime limit. // This is typically called periodically to clean up stuck or orphaned jobs. 
// @@ -762,7 +761,6 @@ func (r *JobRepository) FindJobIdsByTag(tagID int64) ([]int64, error) { return jobIds, nil } -// FIXME: Reconsider filtering short jobs with harcoded threshold // FindRunningJobs returns all currently running jobs for a specific cluster. // Filters out short-running jobs based on repoConfig.MinRunningJobDuration threshold. // diff --git a/internal/repository/jobHooks.go b/internal/repository/jobHooks.go index 41684d5c..66d29eea 100644 --- a/internal/repository/jobHooks.go +++ b/internal/repository/jobHooks.go @@ -2,6 +2,7 @@ // All rights reserved. This file is part of cc-backend. // Use of this source code is governed by a MIT-style // license that can be found in the LICENSE file. + package repository import ( diff --git a/internal/repository/job_test.go b/internal/repository/job_test.go index 17766c69..9f4871fd 100644 --- a/internal/repository/job_test.go +++ b/internal/repository/job_test.go @@ -90,13 +90,13 @@ func TestFindJobsBetween(t *testing.T) { // 2. Create a tag tagName := fmt.Sprintf("testtag_%d", time.Now().UnixNano()) - tagId, err := r.CreateTag("testtype", tagName, "global") + tagID, err := r.CreateTag("testtype", tagName, "global") if err != nil { t.Fatal(err) } // 3. 
Link Tag (Manually to avoid archive dependency side-effects in unit test) - _, err = r.DB.Exec("INSERT INTO jobtag (job_id, tag_id) VALUES (?, ?)", *targetJob.ID, tagId) + _, err = r.DB.Exec("INSERT INTO jobtag (job_id, tag_id) VALUES (?, ?)", *targetJob.ID, tagID) if err != nil { t.Fatal(err) } diff --git a/internal/repository/node.go b/internal/repository/node.go index 2890cdbc..a81fc58d 100644 --- a/internal/repository/node.go +++ b/internal/repository/node.go @@ -579,7 +579,7 @@ func (r *NodeRepository) GetNodesForList( queryFilters = append(queryFilters, &model.NodeFilter{Hostname: &model.StringInput{Contains: &nodeFilter}}) } if stateFilter != "all" && stateFilter != "notindb" { - var queryState schema.SchedulerState = schema.SchedulerState(stateFilter) + queryState := schema.SchedulerState(stateFilter) queryFilters = append(queryFilters, &model.NodeFilter{SchedulerState: &queryState}) } // if healthFilter != "all" { diff --git a/internal/repository/repository_test.go b/internal/repository/repository_test.go index 9d07b026..475e7bca 100644 --- a/internal/repository/repository_test.go +++ b/internal/repository/repository_test.go @@ -46,7 +46,7 @@ func BenchmarkSelect1(b *testing.B) { } func BenchmarkDB_FindJobById(b *testing.B) { - var jobId int64 = 1677322 + var jobID int64 = 1677322 b.Run("FindJobById", func(b *testing.B) { db := setup(b) @@ -55,7 +55,7 @@ func BenchmarkDB_FindJobById(b *testing.B) { b.RunParallel(func(pb *testing.PB) { for pb.Next() { - _, err := db.FindByID(getContext(b), jobId) + _, err := db.FindByID(getContext(b), jobID) noErr(b, err) } }) @@ -63,7 +63,7 @@ func BenchmarkDB_FindJobById(b *testing.B) { } func BenchmarkDB_FindJob(b *testing.B) { - var jobId int64 = 107266 + var jobID int64 = 107266 var startTime int64 = 1657557241 cluster := "fritz" @@ -74,7 +74,7 @@ func BenchmarkDB_FindJob(b *testing.B) { b.RunParallel(func(pb *testing.PB) { for pb.Next() { - _, err := db.Find(&jobId, &cluster, &startTime) + _, err := db.Find(&jobID, 
&cluster, &startTime) noErr(b, err) } }) diff --git a/internal/repository/stats.go b/internal/repository/stats.go index cd175c23..851a4ca1 100644 --- a/internal/repository/stats.go +++ b/internal/repository/stats.go @@ -2,6 +2,44 @@ // All rights reserved. This file is part of cc-backend. // Use of this source code is governed by a MIT-style // license that can be found in the LICENSE file. + +// This file contains job statistics and histogram generation functionality for the JobRepository. +// +// # Job Statistics +// +// The statistics methods provide aggregated metrics about jobs including total jobs, users, +// walltime, and resource usage (nodes, cores, accelerators). Statistics can be computed: +// - Overall (JobsStats): Single aggregate across all matching jobs +// - Grouped (JobsStatsGrouped): Aggregated by user, project, cluster, or subcluster +// - Counts (JobCountGrouped, AddJobCount): Simple job counts with optional filtering +// +// All statistics methods support filtering via JobFilter and respect security contexts. +// +// # Histograms +// +// Histogram methods generate distribution data for visualization: +// - Duration, nodes, cores, accelerators (AddHistograms) +// - Job metrics like CPU load, memory usage (AddMetricHistograms) +// +// Histograms use intelligent binning: +// - Duration: Variable bin sizes (1m, 10m, 1h, 6h, 12h, 24h) with zero-padding +// - Resources: Natural value-based bins +// - Metrics: Normalized to peak values with configurable bin counts +// +// # Running vs. 
Completed Jobs +// +// Statistics handle running jobs specially: +// - Duration calculated as (now - start_time) for running jobs +// - Metric histograms for running jobs load data from metric backend instead of footprint +// - Job state filtering distinguishes running/completed jobs +// +// # Performance Considerations +// +// - All queries use prepared statements via stmtCache +// - Complex aggregations use SQL for efficiency +// - Histogram pre-initialization ensures consistent bin ranges +// - Metric histogram queries limited to 500 jobs for running job analysis + package repository import ( @@ -19,7 +57,9 @@ import ( sq "github.com/Masterminds/squirrel" ) -// GraphQL validation should make sure that no unkown values can be specified. +// groupBy2column maps GraphQL Aggregate enum values to their corresponding database column names. +// Used by JobsStatsGrouped and JobCountGrouped to translate user-facing grouping dimensions +// into SQL GROUP BY clauses. GraphQL validation ensures only valid enum values are accepted. var groupBy2column = map[model.Aggregate]string{ model.AggregateUser: "job.hpc_user", model.AggregateProject: "job.project", @@ -27,6 +67,9 @@ var groupBy2column = map[model.Aggregate]string{ model.AggregateSubcluster: "job.subcluster", } +// sortBy2column maps GraphQL SortByAggregate enum values to their corresponding computed column names. +// Used by JobsStatsGrouped to translate sort preferences into SQL ORDER BY clauses. +// Column names match the AS aliases used in buildStatsQuery. var sortBy2column = map[model.SortByAggregate]string{ model.SortByAggregateTotaljobs: "totalJobs", model.SortByAggregateTotalusers: "totalUsers", @@ -39,6 +82,21 @@ var sortBy2column = map[model.SortByAggregate]string{ model.SortByAggregateTotalacchours: "totalAccHours", } +// buildCountQuery constructs a SQL query to count jobs with optional grouping and filtering. +// +// Parameters: +// - filter: Job filters to apply (cluster, user, time range, etc.) 
+// - kind: Special filter - "running" for running jobs only, "short" for jobs under threshold +// - col: Column name to GROUP BY; empty string for total count without grouping +// +// Returns a SelectBuilder that produces either: +// - Single count: COUNT(job.id) when col is empty +// - Grouped counts: col, COUNT(job.id) when col is specified +// +// The kind parameter enables counting specific job categories: +// - "running": Only jobs with job_state = 'running' +// - "short": Only jobs with duration < ShortRunningJobsDuration config value +// - empty: All jobs matching filters func (r *JobRepository) buildCountQuery( filter []*model.JobFilter, kind string, @@ -47,10 +105,8 @@ func (r *JobRepository) buildCountQuery( var query sq.SelectBuilder if col != "" { - // Scan columns: id, cnt query = sq.Select(col, "COUNT(job.id)").From("job").GroupBy(col) } else { - // Scan columns: cnt query = sq.Select("COUNT(job.id)").From("job") } @@ -68,6 +124,27 @@ func (r *JobRepository) buildCountQuery( return query } +// buildStatsQuery constructs a SQL query to compute comprehensive job statistics with optional grouping. +// +// Parameters: +// - filter: Job filters to apply (cluster, user, time range, etc.) 
+// - col: Column name to GROUP BY; empty string for overall statistics without grouping +// +// Returns a SelectBuilder that produces comprehensive statistics: +// - totalJobs: Count of jobs +// - totalUsers: Count of distinct users (always 0 when grouping by user) +// - totalWalltime: Sum of job durations in hours +// - totalNodes: Sum of nodes used across all jobs +// - totalNodeHours: Sum of (duration × num_nodes) in hours +// - totalCores: Sum of hardware threads used across all jobs +// - totalCoreHours: Sum of (duration × num_hwthreads) in hours +// - totalAccs: Sum of accelerators used across all jobs +// - totalAccHours: Sum of (duration × num_acc) in hours +// +// Special handling: +// - Running jobs: Duration calculated as (now - start_time) instead of stored duration +// - Grouped queries: Also select grouping column and user's display name from hpc_user table +// - All time values converted from seconds to hours (÷ 3600) and rounded func (r *JobRepository) buildStatsQuery( filter []*model.JobFilter, col string, @@ -75,31 +152,29 @@ func (r *JobRepository) buildStatsQuery( var query sq.SelectBuilder if col != "" { - // Scan columns: id, name, totalJobs, totalUsers, totalWalltime, totalNodes, totalNodeHours, totalCores, totalCoreHours, totalAccs, totalAccHours query = sq.Select( col, "name", "COUNT(job.id) as totalJobs", "COUNT(DISTINCT job.hpc_user) AS totalUsers", fmt.Sprintf(`CAST(ROUND(SUM((CASE WHEN job.job_state = "running" THEN %d - job.start_time ELSE job.duration END)) / 3600) as int) as totalWalltime`, time.Now().Unix()), - fmt.Sprintf(`CAST(SUM(job.num_nodes) as int) as totalNodes`), + `CAST(SUM(job.num_nodes) as int) as totalNodes`, fmt.Sprintf(`CAST(ROUND(SUM((CASE WHEN job.job_state = "running" THEN %d - job.start_time ELSE job.duration END) * job.num_nodes) / 3600) as int) as totalNodeHours`, time.Now().Unix()), - fmt.Sprintf(`CAST(SUM(job.num_hwthreads) as int) as totalCores`), + `CAST(SUM(job.num_hwthreads) as int) as totalCores`, 
fmt.Sprintf(`CAST(ROUND(SUM((CASE WHEN job.job_state = "running" THEN %d - job.start_time ELSE job.duration END) * job.num_hwthreads) / 3600) as int) as totalCoreHours`, time.Now().Unix()), - fmt.Sprintf(`CAST(SUM(job.num_acc) as int) as totalAccs`), + `CAST(SUM(job.num_acc) as int) as totalAccs`, fmt.Sprintf(`CAST(ROUND(SUM((CASE WHEN job.job_state = "running" THEN %d - job.start_time ELSE job.duration END) * job.num_acc) / 3600) as int) as totalAccHours`, time.Now().Unix()), ).From("job").LeftJoin("hpc_user ON hpc_user.username = job.hpc_user").GroupBy(col) } else { - // Scan columns: totalJobs, totalUsers, totalWalltime, totalNodes, totalNodeHours, totalCores, totalCoreHours, totalAccs, totalAccHours query = sq.Select( "COUNT(job.id) as totalJobs", "COUNT(DISTINCT job.hpc_user) AS totalUsers", fmt.Sprintf(`CAST(ROUND(SUM((CASE WHEN job.job_state = "running" THEN %d - job.start_time ELSE job.duration END)) / 3600) as int)`, time.Now().Unix()), - fmt.Sprintf(`CAST(SUM(job.num_nodes) as int)`), + `CAST(SUM(job.num_nodes) as int)`, fmt.Sprintf(`CAST(ROUND(SUM((CASE WHEN job.job_state = "running" THEN %d - job.start_time ELSE job.duration END) * job.num_nodes) / 3600) as int)`, time.Now().Unix()), - fmt.Sprintf(`CAST(SUM(job.num_hwthreads) as int)`), + `CAST(SUM(job.num_hwthreads) as int)`, fmt.Sprintf(`CAST(ROUND(SUM((CASE WHEN job.job_state = "running" THEN %d - job.start_time ELSE job.duration END) * job.num_hwthreads) / 3600) as int)`, time.Now().Unix()), - fmt.Sprintf(`CAST(SUM(job.num_acc) as int)`), + `CAST(SUM(job.num_acc) as int)`, fmt.Sprintf(`CAST(ROUND(SUM((CASE WHEN job.job_state = "running" THEN %d - job.start_time ELSE job.duration END) * job.num_acc) / 3600) as int)`, time.Now().Unix()), ).From("job") } @@ -111,6 +186,25 @@ func (r *JobRepository) buildStatsQuery( return query } +// JobsStatsGrouped computes comprehensive job statistics grouped by a dimension (user, project, cluster, or subcluster). 
+// +// This is the primary method for generating aggregated statistics views in the UI, providing +// metrics like total jobs, walltime, and resource usage broken down by the specified grouping. +// +// Parameters: +// - ctx: Context for security checks and cancellation +// - filter: Filters to apply (time range, cluster, job state, etc.) +// - page: Optional pagination (ItemsPerPage: -1 disables pagination) +// - sortBy: Optional sort column (totalJobs, totalWalltime, totalCoreHours, etc.) +// - groupBy: Required grouping dimension (User, Project, Cluster, or Subcluster) +// +// Returns a slice of JobsStatistics, one per group, with: +// - ID: The group identifier (username, project name, cluster name, etc.) +// - Name: Display name (for users, from hpc_user.name; empty for other groups) +// - Statistics: totalJobs, totalUsers, totalWalltime, resource usage metrics +// +// Security: Respects user roles via SecurityCheck - users see only their own data unless admin/support. +// Performance: Results are sorted in SQL and pagination applied before scanning rows. func (r *JobRepository) JobsStatsGrouped( ctx context.Context, filter []*model.JobFilter, @@ -230,6 +324,21 @@ func (r *JobRepository) JobsStatsGrouped( return stats, nil } +// JobsStats computes overall job statistics across all matching jobs without grouping. +// +// This method provides a single aggregate view of job metrics, useful for dashboard +// summaries and overall system utilization reports. +// +// Parameters: +// - ctx: Context for security checks and cancellation +// - filter: Filters to apply (time range, cluster, job state, etc.) +// +// Returns a single-element slice containing aggregate statistics: +// - totalJobs, totalUsers, totalWalltime +// - totalNodeHours, totalCoreHours, totalAccHours +// +// Unlike JobsStatsGrouped, this returns overall totals without breaking down by dimension. +// Security checks are applied via SecurityCheck to respect user access levels. 
func (r *JobRepository) JobsStats( ctx context.Context, filter []*model.JobFilter, @@ -303,6 +412,17 @@ func LoadJobStat(job *schema.Job, metric string, statType string) float64 { return 0.0 } +// JobCountGrouped counts jobs grouped by a dimension without computing detailed statistics. +// +// This is a lightweight alternative to JobsStatsGrouped when only job counts are needed, +// avoiding the overhead of calculating walltime and resource usage metrics. +// +// Parameters: +// - ctx: Context for security checks +// - filter: Filters to apply +// - groupBy: Grouping dimension (User, Project, Cluster, or Subcluster) +// +// Returns JobsStatistics with only ID and TotalJobs populated for each group. func (r *JobRepository) JobCountGrouped( ctx context.Context, filter []*model.JobFilter, @@ -343,6 +463,20 @@ func (r *JobRepository) JobCountGrouped( return stats, nil } +// AddJobCountGrouped augments existing statistics with additional job counts by category. +// +// This method enriches JobsStatistics returned by JobsStatsGrouped or JobCountGrouped +// with counts of running or short-running jobs, matched by group ID. +// +// Parameters: +// - ctx: Context for security checks +// - filter: Filters to apply +// - groupBy: Grouping dimension (must match the dimension used for stats parameter) +// - stats: Existing statistics to augment (modified in-place by ID matching) +// - kind: "running" to add RunningJobs count, "short" to add ShortJobs count +// +// Returns the same stats slice with RunningJobs or ShortJobs fields populated per group. +// Groups without matching jobs will have 0 for the added field. func (r *JobRepository) AddJobCountGrouped( ctx context.Context, filter []*model.JobFilter, @@ -392,6 +526,18 @@ func (r *JobRepository) AddJobCountGrouped( return stats, nil } +// AddJobCount augments existing overall statistics with additional job counts by category. +// +// Similar to AddJobCountGrouped but for ungrouped statistics. 
Applies the same count +// to all statistics entries (typically just one). +// +// Parameters: +// - ctx: Context for security checks +// - filter: Filters to apply +// - stats: Existing statistics to augment (modified in-place) +// - kind: "running" to add RunningJobs count, "short" to add ShortJobs count +// +// Returns the same stats slice with RunningJobs or ShortJobs fields set to the total count. func (r *JobRepository) AddJobCount( ctx context.Context, filter []*model.JobFilter, @@ -437,6 +583,26 @@ func (r *JobRepository) AddJobCount( return stats, nil } +// AddHistograms augments statistics with distribution histograms for job properties. +// +// Generates histogram data for visualization of job duration, node count, core count, +// and accelerator count distributions. Duration histogram uses intelligent binning based +// on the requested resolution. +// +// Parameters: +// - ctx: Context for security checks +// - filter: Filters to apply to jobs included in histograms +// - stat: Statistics struct to augment (modified in-place) +// - durationBins: Bin size - "1m", "10m", "1h", "6h", "12h", or "24h" (default) +// +// Populates these fields in stat: +// - HistDuration: Job duration distribution (zero-padded bins) +// - HistNumNodes: Node count distribution +// - HistNumCores: Core (hwthread) count distribution +// - HistNumAccs: Accelerator count distribution +// +// Duration bins are pre-initialized with zeros to ensure consistent ranges for visualization. +// Bin size determines both the width and maximum duration displayed (e.g., "1h" = 48 bins × 1h = 48h max). 
func (r *JobRepository) AddHistograms( ctx context.Context, filter []*model.JobFilter, @@ -447,20 +613,20 @@ func (r *JobRepository) AddHistograms( var targetBinCount int var targetBinSize int - switch { - case *durationBins == "1m": // 1 Minute Bins + Max 60 Bins -> Max 60 Minutes + switch *durationBins { + case "1m": // 1 Minute Bins + Max 60 Bins -> Max 60 Minutes targetBinCount = 60 targetBinSize = 60 - case *durationBins == "10m": // 10 Minute Bins + Max 72 Bins -> Max 12 Hours + case "10m": // 10 Minute Bins + Max 72 Bins -> Max 12 Hours targetBinCount = 72 targetBinSize = 600 - case *durationBins == "1h": // 1 Hour Bins + Max 48 Bins -> Max 48 Hours + case "1h": // 1 Hour Bins + Max 48 Bins -> Max 48 Hours targetBinCount = 48 targetBinSize = 3600 - case *durationBins == "6h": // 6 Hour Bins + Max 12 Bins -> Max 3 Days + case "6h": // 6 Hour Bins + Max 12 Bins -> Max 3 Days targetBinCount = 12 targetBinSize = 21600 - case *durationBins == "12h": // 12 hour Bins + Max 14 Bins -> Max 7 Days + case "12h": // 12 hour Bins + Max 14 Bins -> Max 7 Days targetBinCount = 14 targetBinSize = 43200 default: // 24h @@ -499,7 +665,30 @@ func (r *JobRepository) AddHistograms( return stat, nil } -// Requires thresholds for metric from config for cluster? Of all clusters and use largest? split to 10 + 1 for artifacts? +// AddMetricHistograms augments statistics with distribution histograms for job metrics. +// +// Generates histogram data for metrics like CPU load, memory usage, etc. Handles running +// and completed jobs differently: running jobs load data from metric backend, completed jobs +// use footprint data from database. 
+// +// Parameters: +// - ctx: Context for security checks +// - filter: Filters to apply (MUST contain State filter for running jobs) +// - metrics: List of metric names to histogram (e.g., ["cpu_load", "mem_used"]) +// - stat: Statistics struct to augment (modified in-place) +// - targetBinCount: Number of histogram bins (default: 10) +// +// Populates HistMetrics field in stat with MetricHistoPoints for each metric. +// +// Binning algorithm: +// - Values normalized to metric's peak value from cluster configuration +// - Bins evenly distributed from 0 to peak +// - Pre-initialized with zeros for consistent visualization +// +// Limitations: +// - Running jobs: Limited to 500 jobs for performance +// - Requires valid cluster configuration with metric peak values +// - Uses footprint statistic (avg/max/min) configured per metric func (r *JobRepository) AddMetricHistograms( ctx context.Context, filter []*model.JobFilter, @@ -534,7 +723,16 @@ func (r *JobRepository) AddMetricHistograms( return stat, nil } -// `value` must be the column grouped by, but renamed to "value" +// jobsStatisticsHistogram generates a simple histogram by grouping on a column value. +// +// Used for histograms where the column value directly represents the bin (e.g., node count, core count). +// Unlike duration/metric histograms, this doesn't pre-initialize bins with zeros. +// +// Parameters: +// - value: SQL expression that produces the histogram value, aliased as "value" +// - filters: Job filters to apply +// +// Returns histogram points with Value (from column) and Count (number of jobs). func (r *JobRepository) jobsStatisticsHistogram( ctx context.Context, value string, @@ -573,6 +771,26 @@ func (r *JobRepository) jobsStatisticsHistogram( return points, nil } +// jobsDurationStatisticsHistogram generates a duration histogram with pre-initialized bins. 
+// +// Bins are zero-padded to provide consistent ranges for visualization, unlike simple +// histograms which only return bins with data. The value parameter should compute +// the bin number from job duration. +// +// Parameters: +// - value: SQL expression computing bin number from duration, aliased as "value" +// - filters: Job filters to apply +// - binSizeSeconds: Width of each bin in seconds +// - targetBinCount: Number of bins to pre-initialize +// +// Returns histogram points with Value (bin_number × binSizeSeconds) and Count. +// All bins from 1 to targetBinCount are returned, with Count=0 for empty bins. +// +// Algorithm: +// 1. Pre-initialize targetBinCount bins with zero counts +// 2. Query database for actual counts per bin +// 3. Match query results to pre-initialized bins by value +// 4. Bins without matches remain at zero func (r *JobRepository) jobsDurationStatisticsHistogram( ctx context.Context, value string, @@ -588,7 +806,6 @@ func (r *JobRepository) jobsDurationStatisticsHistogram( return nil, qerr } - // Initialize histogram bins with zero counts // Each bin represents a duration range: bin N = [N*binSizeSeconds, (N+1)*binSizeSeconds) // Example: binSizeSeconds=3600 (1 hour), bin 1 = 0-1h, bin 2 = 1-2h, etc. points := make([]*model.HistoPoint, 0) @@ -607,8 +824,8 @@ func (r *JobRepository) jobsDurationStatisticsHistogram( return nil, err } - // Match query results to pre-initialized bins and fill counts - // Query returns raw duration values that need to be mapped to correct bins + // Match query results to pre-initialized bins. + // point.Value from query is the bin number; multiply by binSizeSeconds to match bin.Value. 
for rows.Next() { point := model.HistoPoint{} if err := rows.Scan(&point.Value, &point.Count); err != nil { @@ -616,13 +833,8 @@ func (r *JobRepository) jobsDurationStatisticsHistogram( return nil, err } - // Find matching bin and update count - // point.Value is multiplied by binSizeSeconds to match pre-calculated bin.Value for _, e := range points { if e.Value == (point.Value * binSizeSeconds) { - // Note: Matching on unmodified integer value (and multiplying point.Value - // by binSizeSeconds after match) causes frontend to loop into highest - // targetBinCount, due to zoom condition instantly being fulfilled (cause unknown) e.Count = point.Count break } @@ -633,13 +845,34 @@ func (r *JobRepository) jobsDurationStatisticsHistogram( return points, nil } +// jobsMetricStatisticsHistogram generates a metric histogram using footprint data from completed jobs. +// +// Values are normalized to the metric's peak value and distributed into bins. The algorithm +// is based on SQL histogram generation techniques, extracting metric values from JSON footprint +// and computing bin assignments in SQL. +// +// Parameters: +// - metric: Metric name (e.g., "cpu_load", "mem_used") +// - filters: Job filters to apply +// - bins: Number of bins to generate +// +// Returns MetricHistoPoints with metric name, unit, footprint stat type, and binned data. +// +// Algorithm: +// 1. Determine peak value from cluster configuration (filtered cluster or max across all) +// 2. Generate SQL that extracts footprint value, normalizes to [0,1], multiplies by bin count +// 3. Pre-initialize bins with min/max ranges based on peak value +// 4. Query database for counts per bin +// 5. Match results to pre-initialized bins +// +// Special handling: Values exactly equal to peak are forced into the last bin by multiplying +// peak by 0.999999999 to avoid creating an extra bin. 
func (r *JobRepository) jobsMetricStatisticsHistogram( ctx context.Context, metric string, filters []*model.JobFilter, bins *int, ) (*model.MetricHistoPoints, error) { - // Determine the metric's peak value for histogram normalization // Peak value defines the upper bound for binning: values are distributed across // bins from 0 to peak. First try to get peak from filtered cluster, otherwise // scan all clusters to find the maximum peak value. @@ -679,18 +912,14 @@ func (r *JobRepository) jobsMetricStatisticsHistogram( } } - // Construct SQL histogram bins using normalized values + // Construct SQL histogram bins using normalized values. // Algorithm based on: https://jereze.com/code/sql-histogram/ (modified) start := time.Now() - // Calculate bin number for each job's metric value: - // 1. Extract metric value from JSON footprint - // 2. Normalize to [0,1] by dividing by peak - // 3. Multiply by number of bins to get bin number - // 4. Cast to integer for bin assignment - // - // Special case: Values exactly equal to peak would fall into bin N+1, - // so we multiply peak by 0.999999999 to force it into the last bin (bin N) + // Bin calculation formula: + // bin_number = CAST( (value / peak) * num_bins AS INTEGER ) + 1 + // Special case: value == peak would create bin N+1, so we test for equality + // and multiply peak by 0.999999999 to force it into bin N. 
binQuery := fmt.Sprintf(`CAST( ((case when json_extract(footprint, "$.%s") = %f then %f*0.999999999 else json_extract(footprint, "$.%s") end) / %f) * %v as INTEGER )`, @@ -699,24 +928,19 @@ func (r *JobRepository) jobsMetricStatisticsHistogram( mainQuery := sq.Select( fmt.Sprintf(`%s + 1 as bin`, binQuery), `count(*) as count`, - // For Debug: // fmt.Sprintf(`CAST((%f / %d) as INTEGER ) * %s as min`, peak, *bins, binQuery), - // For Debug: // fmt.Sprintf(`CAST((%f / %d) as INTEGER ) * (%s + 1) as max`, peak, *bins, binQuery), ).From("job").Where( "JSON_VALID(footprint)", ).Where(fmt.Sprintf(`json_extract(footprint, "$.%s") is not null and json_extract(footprint, "$.%s") <= %f`, (metric + "_" + footprintStat), (metric + "_" + footprintStat), peak)) - // Only accessible Jobs... mainQuery, qerr := SecurityCheck(ctx, mainQuery) if qerr != nil { return nil, qerr } - // Filters... for _, f := range filters { mainQuery = BuildWhereClause(f, mainQuery) } - // Finalize query with Grouping and Ordering mainQuery = mainQuery.GroupBy("bin").OrderBy("bin") rows, err := mainQuery.RunWith(r.DB).Query() @@ -725,8 +949,7 @@ func (r *JobRepository) jobsMetricStatisticsHistogram( return nil, err } - // Initialize histogram bins with calculated min/max ranges - // Each bin represents a range of metric values + // Pre-initialize bins with calculated min/max ranges. // Example: peak=1000, bins=10 -> bin 1=[0,100), bin 2=[100,200), ..., bin 10=[900,1000] points := make([]*model.MetricHistoPoint, 0) binStep := int(peak) / *bins @@ -737,29 +960,18 @@ func (r *JobRepository) jobsMetricStatisticsHistogram( points = append(points, &epoint) } - // Fill counts from query results - // Query only returns bins that have jobs, so we match against pre-initialized bins + // Match query results to pre-initialized bins. 
for rows.Next() { rpoint := model.MetricHistoPoint{} - if err := rows.Scan(&rpoint.Bin, &rpoint.Count); err != nil { // Required for Debug: &rpoint.Min, &rpoint.Max + if err := rows.Scan(&rpoint.Bin, &rpoint.Count); err != nil { cclog.Warnf("Error while scanning rows for %s", metric) - return nil, err // FIXME: Totally bricks cc-backend if returned and if all metrics requested? + return nil, err } - // Match query result to pre-initialized bin and update count for _, e := range points { - if e.Bin != nil && rpoint.Bin != nil { - if *e.Bin == *rpoint.Bin { - e.Count = rpoint.Count - // Only Required For Debug: Check DB returned Min/Max against Backend Init above - // if rpoint.Min != nil { - // cclog.Warnf(">>>> Bin %d Min Set For %s to %d (Init'd with: %d)", *e.Bin, metric, *rpoint.Min, *e.Min) - // } - // if rpoint.Max != nil { - // cclog.Warnf(">>>> Bin %d Max Set For %s to %d (Init'd with: %d)", *e.Bin, metric, *rpoint.Max, *e.Max) - // } - break - } + if e.Bin != nil && rpoint.Bin != nil && *e.Bin == *rpoint.Bin { + e.Count = rpoint.Count + break } } } @@ -770,6 +982,28 @@ func (r *JobRepository) jobsMetricStatisticsHistogram( return &result, nil } +// runningJobsMetricStatisticsHistogram generates metric histograms for running jobs using live data. +// +// Unlike completed jobs which use footprint data from the database, running jobs require +// fetching current metric averages from the metric backend (via metricdispatch). +// +// Parameters: +// - metrics: List of metric names +// - filters: Job filters (should filter to running jobs only) +// - bins: Number of histogram bins +// +// Returns slice of MetricHistoPoints, one per metric. +// +// Limitations: +// - Maximum 500 jobs (returns nil if more jobs match) +// - Requires metric backend availability +// - Bins based on metric peak values from cluster configuration +// +// Algorithm: +// 1. Query first 501 jobs to check count limit +// 2. Load metric averages for all jobs via metricdispatch +// 3. 
For each metric, create bins based on peak value +// 4. Iterate averages and count jobs per bin func (r *JobRepository) runningJobsMetricStatisticsHistogram( ctx context.Context, metrics []string, diff --git a/internal/repository/tags.go b/internal/repository/tags.go index f6cccfe2..861cbb76 100644 --- a/internal/repository/tags.go +++ b/internal/repository/tags.go @@ -3,6 +3,34 @@ // Use of this source code is governed by a MIT-style // license that can be found in the LICENSE file. +// Package repository provides data access and persistence layer for ClusterCockpit. +// +// This file implements tag management functionality for job categorization and classification. +// Tags support both manual assignment (via REST/GraphQL APIs) and automatic detection +// (via tagger plugins). The implementation includes role-based access control through +// tag scopes and maintains bidirectional consistency between the SQL database and +// the file-based job archive. +// +// Database Schema: +// +// CREATE TABLE tag ( +// id INTEGER PRIMARY KEY AUTOINCREMENT, +// tag_type VARCHAR(255) NOT NULL, +// tag_name VARCHAR(255) NOT NULL, +// tag_scope VARCHAR(255) NOT NULL DEFAULT "global", +// CONSTRAINT tag_unique UNIQUE (tag_type, tag_name, tag_scope) +// ); +// +// CREATE TABLE jobtag ( +// job_id INTEGER, +// tag_id INTEGER, +// PRIMARY KEY (job_id, tag_id), +// FOREIGN KEY (job_id) REFERENCES job(id) ON DELETE CASCADE, +// FOREIGN KEY (tag_id) REFERENCES tag(id) ON DELETE CASCADE +// ); +// +// The jobtag junction table enables many-to-many relationships between jobs and tags. +// CASCADE deletion ensures referential integrity when jobs or tags are removed. 
package repository import ( @@ -73,7 +101,7 @@ func (r *JobRepository) AddTagDirect(job int64, tag int64) ([]*schema.Tag, error func (r *JobRepository) RemoveTag(user *schema.User, job, tag int64) ([]*schema.Tag, error) { j, err := r.FindByIDWithUser(user, job) if err != nil { - cclog.Warn("Error while finding job by id") + cclog.Warnf("Error while finding job %d for user %s during tag removal: %v", job, user.Username, err) return nil, err } @@ -93,7 +121,7 @@ func (r *JobRepository) RemoveTag(user *schema.User, job, tag int64) ([]*schema. archiveTags, err := r.getArchiveTags(&job) if err != nil { - cclog.Warn("Error while getting tags for job") + cclog.Warnf("Error while getting archive tags for job %d in RemoveTag: %v", job, err) return nil, err } @@ -104,7 +132,7 @@ func (r *JobRepository) RemoveTag(user *schema.User, job, tag int64) ([]*schema. // Requires user authentication for security checks. Used by REST API. func (r *JobRepository) RemoveJobTagByRequest(user *schema.User, job int64, tagType string, tagName string, tagScope string) ([]*schema.Tag, error) { // Get Tag ID to delete - tagID, exists := r.TagId(tagType, tagName, tagScope) + tagID, exists := r.TagID(tagType, tagName, tagScope) if !exists { cclog.Warnf("Tag does not exist (name, type, scope): %s, %s, %s", tagName, tagType, tagScope) return nil, fmt.Errorf("tag does not exist (name, type, scope): %s, %s, %s", tagName, tagType, tagScope) @@ -113,7 +141,7 @@ func (r *JobRepository) RemoveJobTagByRequest(user *schema.User, job int64, tagT // Get Job j, err := r.FindByIDWithUser(user, job) if err != nil { - cclog.Warn("Error while finding job by id") + cclog.Warnf("Error while finding job %d for user %s during tag removal by request: %v", job, user.Username, err) return nil, err } @@ -128,19 +156,30 @@ func (r *JobRepository) RemoveJobTagByRequest(user *schema.User, job int64, tagT tags, err := r.GetTags(user, &job) if err != nil { - cclog.Warn("Error while getting tags for job") + cclog.Warnf("Error 
while getting tags for job %d in RemoveJobTagByRequest: %v", job, err) return nil, err } archiveTags, err := r.getArchiveTags(&job) if err != nil { - cclog.Warn("Error while getting tags for job") + cclog.Warnf("Error while getting archive tags for job %d in RemoveJobTagByRequest: %v", job, err) return nil, err } return tags, archive.UpdateTags(j, archiveTags) } +// removeTagFromArchiveJobs updates the job archive for all affected jobs after a tag deletion. +// +// This function is called asynchronously (via goroutine) after removing a tag from the database +// to synchronize the file-based job archive with the database state. Errors are logged but not +// returned since this runs in the background. +// +// Parameters: +// - jobIds: Database IDs of all jobs that had the deleted tag +// +// Implementation note: Each job is processed individually to handle partial failures gracefully. +// If one job fails to update, others will still be processed. func (r *JobRepository) removeTagFromArchiveJobs(jobIds []int64) { for _, j := range jobIds { tags, err := r.getArchiveTags(&j) @@ -163,18 +202,18 @@ func (r *JobRepository) removeTagFromArchiveJobs(jobIds []int64) { // Used by REST API. Does not update tagged jobs in Job archive. func (r *JobRepository) RemoveTagByRequest(tagType string, tagName string, tagScope string) error { // Get Tag ID to delete - tagID, exists := r.TagId(tagType, tagName, tagScope) + tagID, exists := r.TagID(tagType, tagName, tagScope) if !exists { cclog.Warnf("Tag does not exist (name, type, scope): %s, %s, %s", tagName, tagType, tagScope) return fmt.Errorf("tag does not exist (name, type, scope): %s, %s, %s", tagName, tagType, tagScope) } - return r.RemoveTagById(tagID) + return r.RemoveTagByID(tagID) } // Removes a tag from db by tag id // Used by GraphQL API. 
-func (r *JobRepository) RemoveTagById(tagID int64) error { +func (r *JobRepository) RemoveTagByID(tagID int64) error { jobIds, err := r.FindJobIdsByTag(tagID) if err != nil { return err @@ -213,7 +252,7 @@ func (r *JobRepository) RemoveTagById(tagID int64) error { // Example: // // tagID, err := repo.CreateTag("performance", "high-memory", "global") -func (r *JobRepository) CreateTag(tagType string, tagName string, tagScope string) (tagId int64, err error) { +func (r *JobRepository) CreateTag(tagType string, tagName string, tagScope string) (tagID int64, err error) { // Default to "Global" scope if none defined if tagScope == "" { tagScope = "global" @@ -300,13 +339,13 @@ func (r *JobRepository) CountTags(user *schema.User) (tags []schema.Tag, counts for rows.Next() { var tagType string var tagName string - var tagId int + var tagID int var count int - if err = rows.Scan(&tagType, &tagName, &tagId, &count); err != nil { + if err = rows.Scan(&tagType, &tagName, &tagID, &count); err != nil { return nil, nil, err } // Use tagId as second Map-Key component to differentiate tags with identical names - counts[fmt.Sprint(tagType, tagName, tagId)] = count + counts[fmt.Sprint(tagType, tagName, tagID)] = count } err = rows.Err() @@ -314,18 +353,44 @@ func (r *JobRepository) CountTags(user *schema.User) (tags []schema.Tag, counts } var ( - ErrTagNotFound = errors.New("the tag does not exist") - ErrJobNotOwned = errors.New("user is not owner of job") - ErrTagNoAccess = errors.New("user not permitted to use that tag") - ErrTagPrivateScope = errors.New("tag is private to another user") - ErrTagAdminScope = errors.New("tag requires admin privileges") + // ErrTagNotFound is returned when a tag ID or tag identifier (type, name, scope) does not exist in the database. + ErrTagNotFound = errors.New("the tag does not exist") + + // ErrJobNotOwned is returned when a user attempts to tag a job they do not have permission to access. 
+ ErrJobNotOwned = errors.New("user is not owner of job") + + // ErrTagNoAccess is returned when a user attempts to use a tag they cannot access due to scope restrictions. + ErrTagNoAccess = errors.New("user not permitted to use that tag") + + // ErrTagPrivateScope is returned when a user attempts to access another user's private tag. + ErrTagPrivateScope = errors.New("tag is private to another user") + + // ErrTagAdminScope is returned when a non-admin user attempts to use an admin-scoped tag. + ErrTagAdminScope = errors.New("tag requires admin privileges") + + // ErrTagsIncompatScopes is returned when attempting to combine admin and non-admin scoped tags in a single operation. ErrTagsIncompatScopes = errors.New("combining admin and non-admin scoped tags not allowed") ) // addJobTag is a helper function that inserts a job-tag association and updates the archive. -// Returns the updated tag list for the job. -func (r *JobRepository) addJobTag(jobId int64, tagId int64, job *schema.Job, getTags func() ([]*schema.Tag, error)) ([]*schema.Tag, error) { - q := sq.Insert("jobtag").Columns("job_id", "tag_id").Values(jobId, tagId) +// +// This function performs three operations atomically: +// 1. Inserts the job-tag association into the jobtag junction table +// 2. Retrieves the updated tag list for the job (using the provided getTags callback) +// 3. Updates the job archive with the new tags to maintain database-archive consistency +// +// Parameters: +// - jobId: Database ID of the job +// - tagId: Database ID of the tag to associate +// - job: Full job object needed for archive update +// - getTags: Callback function to retrieve updated tags (allows different security contexts) +// +// Returns the complete updated tag list for the job or an error. +// +// Note: This function does NOT validate tag scope permissions - callers must perform +// authorization checks before invoking this helper. 
+func (r *JobRepository) addJobTag(jobID int64, tagID int64, job *schema.Job, getTags func() ([]*schema.Tag, error)) ([]*schema.Tag, error) { + q := sq.Insert("jobtag").Columns("job_id", "tag_id").Values(jobID, tagID) if _, err := q.RunWith(r.stmtCache).Exec(); err != nil { s, _, _ := q.ToSql() @@ -335,13 +400,13 @@ func (r *JobRepository) addJobTag(jobId int64, tagId int64, job *schema.Job, get tags, err := getTags() if err != nil { - cclog.Warnf("Error getting tags for job %d: %v", jobId, err) + cclog.Warnf("Error getting tags for job %d: %v", jobID, err) return nil, err } - archiveTags, err := r.getArchiveTags(&jobId) + archiveTags, err := r.getArchiveTags(&jobID) if err != nil { - cclog.Warnf("Error getting archive tags for job %d: %v", jobId, err) + cclog.Warnf("Error getting archive tags for job %d: %v", jobID, err) return nil, err } @@ -350,7 +415,7 @@ func (r *JobRepository) addJobTag(jobId int64, tagId int64, job *schema.Job, get // AddTagOrCreate adds the tag with the specified type and name to the job with the database id `jobId`. // If such a tag does not yet exist, it is created. 
-func (r *JobRepository) AddTagOrCreate(user *schema.User, jobId int64, tagType string, tagName string, tagScope string) (tagId int64, err error) { +func (r *JobRepository) AddTagOrCreate(user *schema.User, jobID int64, tagType string, tagName string, tagScope string) (tagID int64, err error) { // Default to "Global" scope if none defined if tagScope == "" { tagScope = "global" @@ -364,44 +429,43 @@ func (r *JobRepository) AddTagOrCreate(user *schema.User, jobId int64, tagType s return 0, fmt.Errorf("cannot write tag scope with current authorization") } - tagId, exists := r.TagId(tagType, tagName, tagScope) + tagID, exists := r.TagID(tagType, tagName, tagScope) if !exists { - tagId, err = r.CreateTag(tagType, tagName, tagScope) + tagID, err = r.CreateTag(tagType, tagName, tagScope) if err != nil { return 0, err } } - if _, err := r.AddTag(user, jobId, tagId); err != nil { + if _, err := r.AddTag(user, jobID, tagID); err != nil { return 0, err } - return tagId, nil + return tagID, nil } -// used in auto tagger plugins -func (r *JobRepository) AddTagOrCreateDirect(jobId int64, tagType string, tagName string) (tagId int64, err error) { +func (r *JobRepository) AddTagOrCreateDirect(jobID int64, tagType string, tagName string) (tagID int64, err error) { tagScope := "global" - tagId, exists := r.TagId(tagType, tagName, tagScope) + tagID, exists := r.TagID(tagType, tagName, tagScope) if !exists { - tagId, err = r.CreateTag(tagType, tagName, tagScope) + tagID, err = r.CreateTag(tagType, tagName, tagScope) if err != nil { return 0, err } } - if _, err := r.AddTagDirect(jobId, tagId); err != nil { + if _, err := r.AddTagDirect(jobID, tagID); err != nil { return 0, err } - return tagId, nil + return tagID, nil } -func (r *JobRepository) HasTag(jobId int64, tagType string, tagName string) bool { +func (r *JobRepository) HasTag(jobID int64, tagType string, tagName string) bool { var id int64 q := sq.Select("id").From("tag").Join("jobtag ON jobtag.tag_id = tag.id"). 
- Where("jobtag.job_id = ?", jobId).Where("tag.tag_type = ?", tagType). + Where("jobtag.job_id = ?", jobID).Where("tag.tag_type = ?", tagType). Where("tag.tag_name = ?", tagName) err := q.RunWith(r.stmtCache).QueryRow().Scan(&id) if err != nil { @@ -411,21 +475,21 @@ func (r *JobRepository) HasTag(jobId int64, tagType string, tagName string) bool } } -// TagId returns the database id of the tag with the specified type and name. -func (r *JobRepository) TagId(tagType string, tagName string, tagScope string) (tagId int64, exists bool) { +// TagID returns the database id of the tag with the specified type and name. +func (r *JobRepository) TagID(tagType string, tagName string, tagScope string) (tagID int64, exists bool) { exists = true if err := sq.Select("id").From("tag"). Where("tag.tag_type = ?", tagType).Where("tag.tag_name = ?", tagName).Where("tag.tag_scope = ?", tagScope). - RunWith(r.stmtCache).QueryRow().Scan(&tagId); err != nil { + RunWith(r.stmtCache).QueryRow().Scan(&tagID); err != nil { exists = false } return } // TagInfo returns the database infos of the tag with the specified id. -func (r *JobRepository) TagInfo(tagId int64) (tagType string, tagName string, tagScope string, exists bool) { +func (r *JobRepository) TagInfo(tagID int64) (tagType string, tagName string, tagScope string, exists bool) { exists = true - if err := sq.Select("tag.tag_type", "tag.tag_name", "tag.tag_scope").From("tag").Where("tag.id = ?", tagId). + if err := sq.Select("tag.tag_type", "tag.tag_name", "tag.tag_scope").From("tag").Where("tag.id = ?", tagID). 
RunWith(r.stmtCache).QueryRow().Scan(&tagType, &tagName, &tagScope); err != nil { exists = false } @@ -450,7 +514,7 @@ func (r *JobRepository) GetTags(user *schema.User, job *int64) ([]*schema.Tag, e for rows.Next() { tag := &schema.Tag{} if err := rows.Scan(&tag.ID, &tag.Type, &tag.Name, &tag.Scope); err != nil { - cclog.Warn("Error while scanning rows") + cclog.Warnf("Error while scanning tag rows in GetTags: %v", err) return nil, err } // Handle Scope Filtering: Tag Scope is Global, Private (== Username) or User is auth'd to view Admin Tags @@ -483,7 +547,7 @@ func (r *JobRepository) GetTagsDirect(job *int64) ([]*schema.Tag, error) { for rows.Next() { tag := &schema.Tag{} if err := rows.Scan(&tag.ID, &tag.Type, &tag.Name, &tag.Scope); err != nil { - cclog.Warn("Error while scanning rows") + cclog.Warnf("Error while scanning tag rows in GetTagsDirect: %v", err) return nil, err } tags = append(tags, tag) @@ -492,7 +556,18 @@ func (r *JobRepository) GetTagsDirect(job *int64) ([]*schema.Tag, error) { return tags, nil } -// GetArchiveTags returns a list of all tags *regardless of scope* for archiving if job is nil or of the tags that the job with that database ID has. +// getArchiveTags returns all tags for a job WITHOUT applying scope-based filtering. +// +// This internal function is used exclusively for job archive synchronization where we need +// to store all tags regardless of the current user's permissions. Unlike GetTags() which +// filters by scope, this returns the complete unfiltered tag list. +// +// Parameters: +// - job: Pointer to job database ID, or nil to return all tags in the system +// +// Returns all tags without scope filtering, used only for archive operations. +// +// WARNING: Do NOT expose this function to user-facing APIs as it bypasses authorization. 
func (r *JobRepository) getArchiveTags(job *int64) ([]*schema.Tag, error) { q := sq.Select("id", "tag_type", "tag_name", "tag_scope").From("tag") if job != nil { @@ -510,7 +585,7 @@ func (r *JobRepository) getArchiveTags(job *int64) ([]*schema.Tag, error) { for rows.Next() { tag := &schema.Tag{} if err := rows.Scan(&tag.ID, &tag.Type, &tag.Name, &tag.Scope); err != nil { - cclog.Warn("Error while scanning rows") + cclog.Warnf("Error while scanning tag rows in getArchiveTags: %v", err) return nil, err } tags = append(tags, tag) @@ -519,18 +594,18 @@ func (r *JobRepository) getArchiveTags(job *int64) ([]*schema.Tag, error) { return tags, nil } -func (r *JobRepository) ImportTag(jobId int64, tagType string, tagName string, tagScope string) (err error) { +func (r *JobRepository) ImportTag(jobID int64, tagType string, tagName string, tagScope string) (err error) { // Import has no scope ctx, only import from metafile to DB (No recursive archive update required), only returns err - tagId, exists := r.TagId(tagType, tagName, tagScope) + tagID, exists := r.TagID(tagType, tagName, tagScope) if !exists { - tagId, err = r.CreateTag(tagType, tagName, tagScope) + tagID, err = r.CreateTag(tagType, tagName, tagScope) if err != nil { return err } } - q := sq.Insert("jobtag").Columns("job_id", "tag_id").Values(jobId, tagId) + q := sq.Insert("jobtag").Columns("job_id", "tag_id").Values(jobID, tagID) if _, err := q.RunWith(r.stmtCache).Exec(); err != nil { s, _, _ := q.ToSql() @@ -541,6 +616,28 @@ func (r *JobRepository) ImportTag(jobId int64, tagType string, tagName string, t return nil } +// checkScopeAuth validates whether a user is authorized to perform an operation on a tag with the given scope. 
+// +// This function implements the tag scope authorization matrix: +// +// Scope | Read Access | Write Access +// -------------|----------------------------------|---------------------------------- +// "global" | All users | Admin, Support, API-only +// "admin" | Admin, Support | Admin, API-only +// (username) | Owner only | Owner only (private tags) +// +// Parameters: +// - user: User attempting the operation (must not be nil) +// - operation: Either "read" or "write" +// - scope: Tag scope value ("global", "admin", or username for private tags) +// +// Returns: +// - pass: true if authorized, false if denied +// - err: error only if operation is invalid or user is nil +// +// Special cases: +// - API-only users (single role: RoleApi) can write to admin and global scopes for automation +// - Private tags use the username as scope, granting exclusive access to that user func (r *JobRepository) checkScopeAuth(user *schema.User, operation string, scope string) (pass bool, err error) { if user != nil { switch { diff --git a/pkg/archive/clusterConfig.go b/pkg/archive/clusterConfig.go index 6e4866eb..272eeb35 100644 --- a/pkg/archive/clusterConfig.go +++ b/pkg/archive/clusterConfig.go @@ -108,7 +108,7 @@ func initClusterConfig() error { } availability.SubClusters = append(availability.SubClusters, sc.Name) - sc.MetricConfig = append(sc.MetricConfig, *newMetric) + sc.MetricConfig = append(sc.MetricConfig, newMetric) if newMetric.Footprint != "" { sc.Footprint = append(sc.Footprint, newMetric.Name) @@ -282,7 +282,7 @@ func GetSubClusterByNode(cluster, hostname string) (string, error) { return "", fmt.Errorf("ARCHIVE/CLUSTERCONFIG > no subcluster found for cluster %v and host %v", cluster, hostname) } -func MetricIndex(mc []schema.MetricConfig, name string) (int, error) { +func MetricIndex(mc []*schema.MetricConfig, name string) (int, error) { for i, m := range mc { if m.Name == name { return i, nil diff --git a/pkg/archive/nodelist.go b/pkg/archive/nodelist.go index
7a3784c3..42d8492a 100644 --- a/pkg/archive/nodelist.go +++ b/pkg/archive/nodelist.go @@ -3,6 +3,70 @@ // Use of this source code is governed by a MIT-style // license that can be found in the LICENSE file. +// Package archive provides nodelist parsing functionality for HPC cluster node specifications. +// +// # Overview +// +// The nodelist package implements parsing and querying of compact node list representations +// commonly used in HPC job schedulers and cluster management systems. It converts compressed +// node specifications (e.g., "node[01-10]") into queryable structures that can efficiently +// test node membership and expand to full node lists. +// +// # Node List Format +// +// Node lists use a compact syntax with the following rules: +// +// 1. Comma-separated terms represent alternative node patterns (OR logic) +// 2. Each term consists of a string prefix followed by optional numeric ranges +// 3. Numeric ranges are specified in square brackets with zero-padded start-end format +// 4. Multiple ranges within brackets are comma-separated +// 5. 
Range digits must be zero-padded and of equal length (e.g., "01-99" not "1-99") +// +// # Examples +// +// "node01" // Single node +// "node01,node02" // Multiple individual nodes +// "node[01-10]" // Range: node01 through node10 (zero-padded) +// "node[01-10,20-30]" // Multiple ranges: node01-10 and node20-30 +// "cn-00[10-20],cn-00[50-60]" // Different prefixes with ranges +// "login,compute[001-100]" // Mixed individual and range terms +// +// # Usage +// +// Parse a node list specification: +// +// nl, err := ParseNodeList("node[01-10],login") +// if err != nil { +// log.Fatal(err) +// } +// +// Check if a node name matches the list: +// +// if nl.Contains("node05") { +// // node05 is in the list +// } +// +// Expand to full list of node names: +// +// nodes := nl.PrintList() // ["node01", "node02", ..., "node10", "login"] +// +// Count total nodes in the list: +// +// count := nl.NodeCount() // 11 (10 from range + 1 individual) +// +// # Integration +// +// This package is used by: +// - clusterConfig.go: Parses SubCluster.Nodes field from cluster configuration +// - schema.resolvers.go: GraphQL resolver for computing numberOfNodes in subclusters +// - Job archive: Validates node assignments against configured cluster topology +// +// # Constraints +// +// - Only zero-padded numeric ranges are supported +// - Range start and end must have identical digit counts +// - No whitespace allowed in node list specifications +// - Ranges must be specified as start-end (not individual numbers) package archive import ( @@ -13,12 +77,36 @@ import ( cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger" ) +// NodeList represents a parsed node list specification as a collection of node pattern terms. +// Each term is a sequence of expressions that must match consecutively for a node name to match. +// Terms are evaluated with OR logic - a node matches if ANY term matches completely. 
+// +// Internal structure: +// - Outer slice: OR terms (comma-separated in input) +// - Inner slice: AND expressions (must all match sequentially) +// - Each expression implements: consume (pattern matching), limits (range info), prefix (string part) +// +// Example: "node[01-10],login" becomes: +// - Term 1: [NLExprString("node"), NLExprIntRanges(01-10)] +// - Term 2: [NLExprString("login")] type NodeList [][]interface { consume(input string) (next string, ok bool) limits() []map[string]int prefix() string } +// Contains tests whether the given node name matches any pattern in the NodeList. +// Returns true if the name matches at least one term completely, false otherwise. +// +// Matching logic: +// - Evaluates each term sequentially (OR logic across terms) +// - Within a term, all expressions must match in order (AND logic) +// - A match is complete only if the entire input is consumed (str == "") +// +// Examples: +// - NodeList("node[01-10]").Contains("node05") → true +// - NodeList("node[01-10]").Contains("node11") → false +// - NodeList("node[01-10]").Contains("node5") → false (missing zero-padding) func (nl *NodeList) Contains(name string) bool { var ok bool for _, term := range *nl { @@ -38,14 +126,22 @@ func (nl *NodeList) Contains(name string) bool { return false } +// PrintList expands the NodeList into a full slice of individual node names. +// This performs the inverse operation of ParseNodeList, expanding all ranges +// into their constituent node names with proper zero-padding. +// +// Returns a slice of node names in the order they appear in the NodeList. +// For range terms, nodes are expanded in ascending numeric order. 
+// +// Example: +// - ParseNodeList("node[01-03],login").PrintList() → ["node01", "node02", "node03", "login"] func (nl *NodeList) PrintList() []string { var out []string for _, term := range *nl { - // Get String-Part first prefix := term[0].prefix() - if len(term) == 1 { // If only String-Part in Term: Single Node Name -> Use as provided + if len(term) == 1 { out = append(out, prefix) - } else { // Else: Numeric start-end definition with x digits zeroPadded + } else { limitArr := term[1].limits() for _, inner := range limitArr { for i := inner["start"]; i < inner["end"]+1; i++ { @@ -61,12 +157,22 @@ func (nl *NodeList) PrintList() []string { return out } +// NodeCount returns the total number of individual nodes represented by the NodeList. +// This efficiently counts nodes without expanding the full list, making it suitable +// for large node ranges. +// +// Calculation: +// - Individual node terms contribute 1 +// - Range terms contribute (end - start + 1) for each range +// +// Example: +// - ParseNodeList("node[01-10],login").NodeCount() → 11 (10 from range + 1 individual) func (nl *NodeList) NodeCount() int { out := 0 for _, term := range *nl { - if len(term) == 1 { // If only String-Part in Term: Single Node Name -> add one + if len(term) == 1 { out += 1 - } else { // Else: Numeric start-end definition -> add difference + 1 + } else { limitArr := term[1].limits() for _, inner := range limitArr { out += (inner["end"] - inner["start"]) + 1 @@ -76,6 +182,8 @@ func (nl *NodeList) NodeCount() int { return out } +// NLExprString represents a literal string prefix in a node name pattern. +// It matches by checking if the input starts with this exact string. type NLExprString string func (nle NLExprString) consume(input string) (next string, ok bool) { @@ -96,6 +204,8 @@ func (nle NLExprString) prefix() string { return string(nle) } +// NLExprIntRanges represents multiple alternative integer ranges (comma-separated within brackets). 
+// A node name matches if it matches ANY of the contained ranges (OR logic). type NLExprIntRanges []NLExprIntRange func (nles NLExprIntRanges) consume(input string) (next string, ok bool) { @@ -122,6 +232,11 @@ func (nles NLExprIntRanges) prefix() string { return s } +// NLExprIntRange represents a single zero-padded integer range (e.g., "01-99"). +// Fields: +// - start, end: Numeric range boundaries (inclusive) +// - zeroPadded: Must be true (non-padded ranges not supported) +// - digits: Required digit count for zero-padding type NLExprIntRange struct { start, end int64 zeroPadded bool @@ -176,6 +291,28 @@ func (nles NLExprIntRange) prefix() string { return s } +// ParseNodeList parses a compact node list specification into a queryable NodeList structure. +// +// Input format rules: +// - Comma-separated terms (OR logic): "node01,node02" matches either node +// - Range syntax: "node[01-10]" expands to node01 through node10 +// - Multiple ranges: "node[01-05,10-15]" creates two ranges +// - Zero-padding required: digits in ranges must be zero-padded and equal length +// - Mixed formats: "login,compute[001-100]" combines individual and range terms +// +// Validation: +// - Returns error if brackets are unclosed +// - Returns error if ranges lack '-' separator +// - Returns error if range digits have unequal length +// - Returns error if range numbers fail to parse +// - Returns error on invalid characters +// +// Examples: +// - "node[01-10]" → NodeList with one term (10 nodes) +// - "node01,node02" → NodeList with two terms (2 nodes) +// - "cn[01-05,10-15]" → NodeList with ranges 01-05 and 10-15 (11 nodes total) +// - "a[1-9]" → Error (not zero-padded) +// - "a[01-9]" → Error (unequal digit counts) func ParseNodeList(raw string) (NodeList, error) { isLetter := func(r byte) bool { return ('a' <= r && r <= 'z') || ('A' <= r && r <= 'Z') } isDigit := func(r byte) bool { return '0' <= r && r <= '9' } @@ -232,12 +369,12 @@ func ParseNodeList(raw string) (NodeList, 
error) { nles := NLExprIntRanges{} for _, part := range parts { - minus := strings.Index(part, "-") - if minus == -1 { + before, after, ok := strings.Cut(part, "-") + if !ok { return nil, fmt.Errorf("ARCHIVE/NODELIST > no '-' found inside '[...]'") } - s1, s2 := part[0:minus], part[minus+1:] + s1, s2 := before, after if len(s1) != len(s2) || len(s1) == 0 { return nil, fmt.Errorf("ARCHIVE/NODELIST > %v and %v are not of equal length or of length zero", s1, s2) }