From 28539e60b06d12aab988dad818382850b4e9cd30 Mon Sep 17 00:00:00 2001 From: Jan Eitzinger Date: Tue, 26 Nov 2024 07:02:53 +0100 Subject: [PATCH] Regenerate Swagger, fix tests, cleanup --- api/swagger.json | 239 +++++++++----------------- api/swagger.yaml | 194 +++++++-------------- internal/api/api_test.go | 23 ++- internal/api/docs.go | 239 +++++++++----------------- internal/api/rest.go | 20 +-- internal/repository/jobStartWorker.go | 21 ++- 6 files changed, 254 insertions(+), 482 deletions(-) diff --git a/api/swagger.json b/api/swagger.json index 7f5eaf7..3b59b5e 100644 --- a/api/swagger.json +++ b/api/swagger.json @@ -595,88 +595,6 @@ } } }, - "/jobs/stop_job/{id}": { - "post": { - "security": [ - { - "ApiKeyAuth": [] - } - ], - "description": "Job to stop is specified by database ID. Only stopTime and final state are required in request body.\nReturns full job resource information according to 'JobMeta' scheme.", - "consumes": [ - "application/json" - ], - "produces": [ - "application/json" - ], - "tags": [ - "Job add and modify" - ], - "summary": "Marks job as completed and triggers archiving", - "parameters": [ - { - "type": "integer", - "description": "Database ID of Job", - "name": "id", - "in": "path", - "required": true - }, - { - "description": "stopTime and final state in request body", - "name": "request", - "in": "body", - "required": true, - "schema": { - "$ref": "#/definitions/api.StopJobApiRequest" - } - } - ], - "responses": { - "200": { - "description": "Job resource", - "schema": { - "$ref": "#/definitions/schema.JobMeta" - } - }, - "400": { - "description": "Bad Request", - "schema": { - "$ref": "#/definitions/api.ErrorResponse" - } - }, - "401": { - "description": "Unauthorized", - "schema": { - "$ref": "#/definitions/api.ErrorResponse" - } - }, - "403": { - "description": "Forbidden", - "schema": { - "$ref": "#/definitions/api.ErrorResponse" - } - }, - "404": { - "description": "Resource not found", - "schema": { - "$ref": "#/definitions/api.ErrorResponse" - } - }, - "422": { - "description": "Unprocessable Entity: finding job failed: sql: no rows in result set", - "schema": { - "$ref": "#/definitions/api.ErrorResponse" - } - }, - "500": { - "description": "Internal Server Error", - "schema": { - "$ref": "#/definitions/api.ErrorResponse" - } - } - } - } - }, "/jobs/tag_job/{id}": { "post": { "security": [ @@ -684,7 +602,7 @@ "ApiKeyAuth": [] } ], - "description": "Adds tag(s) to a job specified by DB ID. Name and Type of Tag(s) can be chosen freely.\nIf tagged job is already finished: Tag will be written directly to respective archive files.", + "description": "Adds tag(s) to a job specified by DB ID. Name and Type of Tag(s) can be chosen freely.\nTag Scope for frontend visibility will default to \"global\" if none entered, other options: \"admin\" or specific username.\nIf tagged job is already finished: Tag will be written directly to respective archive files.", "consumes": [ "application/json" ], @@ -1277,6 +1195,11 @@ "type": "string", "example": "Testjob" }, + "scope": { + "description": "Tag Scope for Frontend Display", + "type": "string", + "example": "global" + }, "type": { "description": "Tag Type", "type": "string", @@ -1404,9 +1327,8 @@ "api.StartJobApiResponse": { "type": "object", "properties": { - "id": { - "description": "Database ID of new job", - "type": "integer" + "msg": { + "type": "string" } } }, @@ -1418,17 +1340,14 @@ ], "properties": { "cluster": { - "description": "Cluster of job", "type": "string", "example": "fritz" }, "jobId": { - "description": "Cluster Job ID of job", "type": "integer", "example": 123000 }, "jobState": { - "description": "Final job state", "allOf": [ { "$ref": "#/definitions/schema.JobState" @@ -1437,12 +1356,10 @@ "example": "completed" }, "startTime": { - "description": "Start Time of job as epoch", "type": "integer", "example": 1649723812 }, "stopTime": { - "description": "Stop Time of job as epoch", "type": "integer", "example": 1649763839 } @@ -1487,12 +1404,10 @@ "type": "object", "properties": { "arrayJobId": { - "description": "The unique identifier of an array job", "type": "integer", "example": 123000 }, "cluster": { - "description": "The unique identifier of a cluster", "type": "string", "example": "fritz" }, @@ -1500,33 +1415,39 @@ "$ref": "#/definitions/schema.JobLinkResultList" }, "duration": { - "description": "Duration of job in seconds (Min \u003e 0)", "type": "integer", "minimum": 1, "example": 43200 }, + "energy": { + "type": "number" + }, + "energyFootprint": { + "type": "object", + "additionalProperties": { + "type": "number" + } + }, "exclusive": { - "description": "Specifies how nodes are shared: 0 - Shared among multiple jobs of multiple users, 1 - Job exclusive (Default), 2 - Shared among multiple jobs of same user", "type": "integer", "maximum": 2, "minimum": 0, "example": 1 }, - "flopsAnyAvg": { - "description": "FlopsAnyAvg as Float64", - "type": "number" + "footprint": { + "type": "object", + "additionalProperties": { + "type": "number" + } }, "id": { - "description": "The unique identifier of a job in the database", "type": "integer" }, "jobId": { - "description": "The unique identifier of a job", "type": "integer", "example": 123000 }, "jobState": { - "description": "Final state of job", "enum": [ "completed", "failed", @@ -1542,95 +1463,69 @@ ], "example": "completed" }, - "loadAvg": { - "description": "LoadAvg as Float64", - "type": "number" - }, - "memBwAvg": { - "description": "MemBwAvg as Float64", - "type": "number" - }, - "memUsedMax": { - "description": "MemUsedMax as Float64", - "type": "number" - }, "metaData": { - "description": "Additional information about the job", "type": "object", "additionalProperties": { "type": "string" } }, "monitoringStatus": { - "description": "State of monitoring system during job run: 0 - Disabled, 1 - Running or Archiving (Default), 2 - Archiving Failed, 3 - Archiving Successfull", "type": "integer", "maximum": 3, "minimum": 0, "example": 1 }, "numAcc": { - "description": "Number of accelerators used (Min \u003e 0)", "type": "integer", "minimum": 1, "example": 2 }, "numHwthreads": { - "description": "NumCores int32 `json:\"numCores\" db:\"num_cores\" example:\"20\" minimum:\"1\"` // Number of HWThreads used (Min \u003e 0)", "type": "integer", "minimum": 1, "example": 20 }, "numNodes": { - "description": "Number of nodes used (Min \u003e 0)", "type": "integer", "minimum": 1, "example": 2 }, "partition": { - "description": "The Slurm partition to which the job was submitted", "type": "string", "example": "main" }, "project": { - "description": "The unique identifier of a project", "type": "string", "example": "abcd200" }, "resources": { - "description": "Resources used by job", "type": "array", "items": { "$ref": "#/definitions/schema.Resource" } }, "smt": { - "description": "SMT threads used by job", "type": "integer", "example": 4 }, "startTime": { - "description": "Start time as 'time.Time' data type", "type": "string" }, "subCluster": { - "description": "The unique identifier of a sub cluster", "type": "string", "example": "main" }, "tags": { - "description": "List of tags", "type": "array", "items": { "$ref": "#/definitions/schema.Tag" } }, "user": { - "description": "The unique identifier of a user", "type": "string", "example": "abcd100h" }, "walltime": { - "description": "Requested walltime of job in seconds (Min \u003e 0)", "type": "integer", "minimum": 1, "example": 86400 @@ -1667,12 +1562,10 @@ "type": "object", "properties": { "arrayJobId": { - "description": "The unique identifier of an array job", "type": "integer", "example": 123000 }, "cluster": { - "description": "The unique identifier of a cluster", "type": "string", "example": "fritz" }, @@ -1680,29 +1573,39 @@ "$ref": "#/definitions/schema.JobLinkResultList" }, "duration": { - "description": "Duration of job in seconds (Min \u003e 0)", "type": "integer", "minimum": 1, "example": 43200 }, + "energy": { + "type": "number" + }, + "energyFootprint": { + "type": "object", + "additionalProperties": { + "type": "number" + } + }, "exclusive": { - "description": "Specifies how nodes are shared: 0 - Shared among multiple jobs of multiple users, 1 - Job exclusive (Default), 2 - Shared among multiple jobs of same user", "type": "integer", "maximum": 2, "minimum": 0, "example": 1 }, + "footprint": { + "type": "object", + "additionalProperties": { + "type": "number" + } + }, "id": { - "description": "The unique identifier of a job in the database", "type": "integer" }, "jobId": { - "description": "The unique identifier of a job", "type": "integer", "example": 123000 }, "jobState": { - "description": "Final state of job", "enum": [ "completed", "failed", @@ -1719,91 +1622,76 @@ "example": "completed" }, "metaData": { - "description": "Additional information about the job", "type": "object", "additionalProperties": { "type": "string" } }, "monitoringStatus": { - "description": "State of monitoring system during job run: 0 - Disabled, 1 - Running or Archiving (Default), 2 - Archiving Failed, 3 - Archiving Successfull", "type": "integer", "maximum": 3, "minimum": 0, "example": 1 }, "numAcc": { - "description": "Number of accelerators used (Min \u003e 0)", "type": "integer", "minimum": 1, "example": 2 }, "numHwthreads": { - "description": "NumCores int32 `json:\"numCores\" db:\"num_cores\" example:\"20\" minimum:\"1\"` // Number of HWThreads used (Min \u003e 0)", "type": "integer", "minimum": 1, "example": 20 }, "numNodes": { - "description": "Number of nodes used (Min \u003e 0)", "type": "integer", "minimum": 1, "example": 2 }, "partition": { - "description": "The Slurm partition to which the job was submitted", "type": "string", "example": "main" }, "project": { - "description": "The unique identifier of a project", "type": "string", "example": "abcd200" }, "resources": { - "description": "Resources used by job", "type": "array", "items": { "$ref": "#/definitions/schema.Resource" } }, "smt": { - "description": "SMT threads used by job", "type": "integer", "example": 4 }, "startTime": { - "description": "Start epoch time stamp in seconds (Min \u003e 0)", "type": "integer", "minimum": 1, "example": 1649723812 }, "statistics": { - "description": "Metric statistics of job", "type": "object", "additionalProperties": { "$ref": "#/definitions/schema.JobStatistics" } }, "subCluster": { - "description": "The unique identifier of a sub cluster", "type": "string", "example": "main" }, "tags": { - "description": "List of tags", "type": "array", "items": { "$ref": "#/definitions/schema.Tag" } }, "user": { - "description": "The unique identifier of a user", "type": "string", "example": "abcd100h" }, "walltime": { - "description": "Requested walltime of job in seconds (Min \u003e 0)", "type": "integer", "minimum": 1, "example": 86400 @@ -1892,6 +1780,15 @@ "caution": { "type": "number" }, + "energy": { + "type": "string" + }, + "footprint": { + "type": "string" + }, + "lowerIsBetter": { + "type": "boolean" + }, "name": { "type": "string" }, @@ -1969,22 +1866,18 @@ "type": "object", "properties": { "accelerators": { - "description": "List of of accelerator device ids", "type": "array", "items": { "type": "string" } }, "configuration": { - "description": "The configuration options of the node", "type": "string" }, "hostname": { - "description": "Name of the host (= node)", "type": "string" }, "hwthreads": { - "description": "List of OS processor ids", "type": "array", "items": { "type": "integer" @@ -2027,6 +1920,12 @@ "type": "number" } }, + "median": { + "type": "array", + "items": { + "type": "number" + } + }, "min": { "type": "array", "items": { @@ -2050,15 +1949,33 @@ "coresPerSocket": { "type": "integer" }, + "energyFootprint": { + "type": "array", + "items": { + "type": "string" + } + }, "flopRateScalar": { "$ref": "#/definitions/schema.MetricValue" }, "flopRateSimd": { "$ref": "#/definitions/schema.MetricValue" }, + "footprint": { + "type": "array", + "items": { + "type": "string" + } + }, "memoryBandwidth": { "$ref": "#/definitions/schema.MetricValue" }, + "metricConfig": { + "type": "array", + "items": { + "$ref": "#/definitions/schema.MetricConfig" + } + }, "name": { "type": "string" }, @@ -2088,6 +2005,15 @@ "caution": { "type": "number" }, + "energy": { + "type": "string" + }, + "footprint": { + "type": "string" + }, + "lowerIsBetter": { + "type": "boolean" + }, "name": { "type": "string" }, @@ -2107,16 +2033,17 @@ "type": "object", "properties": { "id": { - "description": "The unique DB identifier of a tag", "type": "integer" }, "name": { - "description": "Tag Name", "type": "string", "example": "Testjob" }, + "scope": { + "type": "string", + "example": "global" + }, "type": { - "description": "Tag Type", "type": "string", "example": "Debug" } diff --git a/api/swagger.yaml b/api/swagger.yaml index f47ac3f..4e3c47e 100644 --- a/api/swagger.yaml +++ b/api/swagger.yaml @@ -23,6 +23,10 @@ definitions: description: Tag Name example: Testjob type: string + scope: + description: Tag Scope for Frontend Display + example: global + type: string type: description: Tag Type example: Debug @@ -110,31 +114,25 @@ definitions: type: object api.StartJobApiResponse: properties: - id: - description: Database ID of new job - type: integer + msg: + type: string type: object api.StopJobApiRequest: properties: cluster: - description: Cluster of job example: fritz type: string jobId: - description: Cluster Job ID of job example: 123000 type: integer jobState: allOf: - $ref: '#/definitions/schema.JobState' - description: Final job state example: completed startTime: - description: Start Time of job as epoch example: 1649723812 type: integer stopTime: - description: Stop Time of job as epoch example: 1649763839 type: integer required: @@ -167,42 +165,40 @@ definitions: description: Information of a HPC job. properties: arrayJobId: - description: The unique identifier of an array job example: 123000 type: integer cluster: - description: The unique identifier of a cluster example: fritz type: string concurrentJobs: $ref: '#/definitions/schema.JobLinkResultList' duration: - description: Duration of job in seconds (Min > 0) example: 43200 minimum: 1 type: integer + energy: + type: number + energyFootprint: + additionalProperties: + type: number + type: object exclusive: - description: 'Specifies how nodes are shared: 0 - Shared among multiple jobs - of multiple users, 1 - Job exclusive (Default), 2 - Shared among multiple - jobs of same user' example: 1 maximum: 2 minimum: 0 type: integer - flopsAnyAvg: - description: FlopsAnyAvg as Float64 - type: number + footprint: + additionalProperties: + type: number + type: object id: - description: The unique identifier of a job in the database type: integer jobId: - description: The unique identifier of a job example: 123000 type: integer jobState: allOf: - $ref: '#/definitions/schema.JobState' - description: Final state of job enum: - completed - failed @@ -211,79 +207,53 @@ definitions: - timeout - out_of_memory example: completed - loadAvg: - description: LoadAvg as Float64 - type: number - memBwAvg: - description: MemBwAvg as Float64 - type: number - memUsedMax: - description: MemUsedMax as Float64 - type: number metaData: additionalProperties: type: string - description: Additional information about the job type: object monitoringStatus: - description: 'State of monitoring system during job run: 0 - Disabled, 1 - - Running or Archiving (Default), 2 - Archiving Failed, 3 - Archiving Successfull' example: 1 maximum: 3 minimum: 0 type: integer numAcc: - description: Number of accelerators used (Min > 0) example: 2 minimum: 1 type: integer numHwthreads: - description: NumCores int32 `json:"numCores" db:"num_cores" - example:"20" minimum:"1"` // - Number of HWThreads used (Min > 0) example: 20 minimum: 1 type: integer numNodes: - description: Number of nodes used (Min > 0) example: 2 minimum: 1 type: integer partition: - description: The Slurm partition to which the job was submitted example: main type: string project: - description: The unique identifier of a project example: abcd200 type: string resources: - description: Resources used by job items: $ref: '#/definitions/schema.Resource' type: array smt: - description: SMT threads used by job example: 4 type: integer startTime: - description: Start time as 'time.Time' data type type: string subCluster: - description: The unique identifier of a sub cluster example: main type: string tags: - description: List of tags items: $ref: '#/definitions/schema.Tag' type: array user: - description: The unique identifier of a user example: abcd100h type: string walltime: - description: Requested walltime of job in seconds (Min > 0) example: 86400 minimum: 1 type: integer @@ -308,39 +278,40 @@ definitions: description: Meta data information of a HPC job. properties: arrayJobId: - description: The unique identifier of an array job example: 123000 type: integer cluster: - description: The unique identifier of a cluster example: fritz type: string concurrentJobs: $ref: '#/definitions/schema.JobLinkResultList' duration: - description: Duration of job in seconds (Min > 0) example: 43200 minimum: 1 type: integer + energy: + type: number + energyFootprint: + additionalProperties: + type: number + type: object exclusive: - description: 'Specifies how nodes are shared: 0 - Shared among multiple jobs - of multiple users, 1 - Job exclusive (Default), 2 - Shared among multiple - jobs of same user' example: 1 maximum: 2 minimum: 0 type: integer + footprint: + additionalProperties: + type: number + type: object id: - description: The unique identifier of a job in the database type: integer jobId: - description: The unique identifier of a job example: 123000 type: integer jobState: allOf: - $ref: '#/definitions/schema.JobState' - description: Final state of job enum: - completed - failed @@ -352,74 +323,56 @@ definitions: metaData: additionalProperties: type: string - description: Additional information about the job type: object monitoringStatus: - description: 'State of monitoring system during job run: 0 - Disabled, 1 - - Running or Archiving (Default), 2 - Archiving Failed, 3 - Archiving Successfull' example: 1 maximum: 3 minimum: 0 type: integer numAcc: - description: Number of accelerators used (Min > 0) example: 2 minimum: 1 type: integer numHwthreads: - description: NumCores int32 `json:"numCores" db:"num_cores" - example:"20" minimum:"1"` // - Number of HWThreads used (Min > 0) example: 20 minimum: 1 type: integer numNodes: - description: Number of nodes used (Min > 0) example: 2 minimum: 1 type: integer partition: - description: The Slurm partition to which the job was submitted example: main type: string project: - description: The unique identifier of a project example: abcd200 type: string resources: - description: Resources used by job items: $ref: '#/definitions/schema.Resource' type: array smt: - description: SMT threads used by job example: 4 type: integer startTime: - description: Start epoch time stamp in seconds (Min > 0) example: 1649723812 minimum: 1 type: integer statistics: additionalProperties: $ref: '#/definitions/schema.JobStatistics' - description: Metric statistics of job type: object subCluster: - description: The unique identifier of a sub cluster example: main type: string tags: - description: List of tags items: $ref: '#/definitions/schema.Tag' type: array user: - description: The unique identifier of a user example: abcd100h type: string walltime: - description: Requested walltime of job in seconds (Min > 0) example: 86400 minimum: 1 type: integer @@ -486,6 +439,12 @@ definitions: type: number caution: type: number + energy: + type: string + footprint: + type: string + lowerIsBetter: + type: boolean name: type: string normal: @@ -541,18 +500,14 @@ definitions: description: A resource used by a job properties: accelerators: - description: List of of accelerator device ids items: type: string type: array configuration: - description: The configuration options of the node type: string hostname: - description: Name of the host (= node) type: string hwthreads: - description: List of OS processor ids items: type: integer type: array @@ -580,6 +535,10 @@ definitions: items: type: number type: array + median: + items: + type: number + type: array min: items: type: number @@ -595,12 +554,24 @@ definitions: properties: coresPerSocket: type: integer + energyFootprint: + items: + type: string + type: array flopRateScalar: $ref: '#/definitions/schema.MetricValue' flopRateSimd: $ref: '#/definitions/schema.MetricValue' + footprint: + items: + type: string + type: array memoryBandwidth: $ref: '#/definitions/schema.MetricValue' + metricConfig: + items: + $ref: '#/definitions/schema.MetricConfig' + type: array name: type: string nodes: @@ -620,6 +591,12 @@ definitions: type: number caution: type: number + energy: + type: string + footprint: + type: string + lowerIsBetter: + type: boolean name: type: string normal: @@ -633,14 +610,14 @@ definitions: description: Defines a tag using name and type. properties: id: - description: The unique DB identifier of a tag type: integer name: - description: Tag Name example: Testjob type: string + scope: + example: global + type: string type: - description: Tag Type example: Debug type: string type: object @@ -1197,68 +1174,13 @@ paths: summary: Marks job as completed and triggers archiving tags: - Job add and modify - /jobs/stop_job/{id}: - post: - consumes: - - application/json - description: |- - Job to stop is specified by database ID. Only stopTime and final state are required in request body. - Returns full job resource information according to 'JobMeta' scheme. - parameters: - - description: Database ID of Job - in: path - name: id - required: true - type: integer - - description: stopTime and final state in request body - in: body - name: request - required: true - schema: - $ref: '#/definitions/api.StopJobApiRequest' - produces: - - application/json - responses: - "200": - description: Job resource - schema: - $ref: '#/definitions/schema.JobMeta' - "400": - description: Bad Request - schema: - $ref: '#/definitions/api.ErrorResponse' - "401": - description: Unauthorized - schema: - $ref: '#/definitions/api.ErrorResponse' - "403": - description: Forbidden - schema: - $ref: '#/definitions/api.ErrorResponse' - "404": - description: Resource not found - schema: - $ref: '#/definitions/api.ErrorResponse' - "422": - description: 'Unprocessable Entity: finding job failed: sql: no rows in - result set' - schema: - $ref: '#/definitions/api.ErrorResponse' - "500": - description: Internal Server Error - schema: - $ref: '#/definitions/api.ErrorResponse' - security: - - ApiKeyAuth: [] - summary: Marks job as completed and triggers archiving - tags: - - Job add and modify /jobs/tag_job/{id}: post: consumes: - application/json description: |- Adds tag(s) to a job specified by DB ID. Name and Type of Tag(s) can be chosen freely. + Tag Scope for frontend visibility will default to "global" if none entered, other options: "admin" or specific username. If tagged job is already finished: Tag will be written directly to respective archive files. parameters: - description: Job Database ID diff --git a/internal/api/api_test.go b/internal/api/api_test.go index 3d1d7bb..bcabd5f 100644 --- a/internal/api/api_test.go +++ b/internal/api/api_test.go @@ -14,9 +14,9 @@ import ( "os" "path/filepath" "reflect" - "strconv" "strings" "testing" + "time" "github.com/ClusterCockpit/cc-backend/internal/api" "github.com/ClusterCockpit/cc-backend/internal/archiver" @@ -200,6 +200,10 @@ func TestRestApi(t *testing.T) { r.StrictSlash(true) restapi.MountApiRoutes(r) + var TestJobId int64 = 123 + var TestClusterName string = "testcluster" + var TestStartTime int64 = 123456789 + const startJobBody string = `{ "jobId": 123, "user": "testuser", @@ -225,7 +229,6 @@ func TestRestApi(t *testing.T) { "startTime": 123456789 }` - var dbid int64 const contextUserKey repository.ContextKey = "user" contextUserValue := &schema.User{ Username: "testuser", @@ -247,13 +250,10 @@ func TestRestApi(t *testing.T) { t.Fatal(response.Status, recorder.Body.String()) } - var res api.StartJobApiResponse - if err := json.Unmarshal(recorder.Body.Bytes(), &res); err != nil { - t.Fatal(err) - } + time.Sleep(1 * time.Second) resolver := graph.GetResolverInstance() - job, err := resolver.Query().Job(ctx, strconv.Itoa(int(res.DBID))) + job, err := restapi.JobRepository.Find(&TestJobId, &TestClusterName, &TestStartTime) if err != nil { t.Fatal(err) } @@ -285,8 +285,6 @@ func TestRestApi(t *testing.T) { if len(job.Tags) != 1 || job.Tags[0].Type != "testTagType" || job.Tags[0].Name != "testTagName" || job.Tags[0].Scope != "testuser" { t.Fatalf("unexpected tags: %#v", job.Tags) } - - dbid = res.DBID }); !ok { return } @@ -314,8 +312,7 @@ func TestRestApi(t *testing.T) { } archiver.WaitForArchiving() - resolver := graph.GetResolverInstance() - job, err := resolver.Query().Job(ctx, strconv.Itoa(int(dbid))) + job, err := restapi.JobRepository.Find(&TestJobId, &TestClusterName, &TestStartTime) if err != nil { t.Fatal(err) } @@ -404,8 +401,10 @@ func TestRestApi(t *testing.T) { t.Fatal("subtest failed") } + time.Sleep(1 * time.Second) + const stopJobBodyFailed string = `{ - "jobId": 12345, + "jobId": 12345, "cluster": "testcluster", "jobState": "failed", diff --git a/internal/api/docs.go b/internal/api/docs.go index e5ec50b..7c1daac 100644 --- a/internal/api/docs.go +++ b/internal/api/docs.go @@ -601,88 +601,6 @@ const docTemplate = `{ } } }, - "/jobs/stop_job/{id}": { - "post": { - "security": [ - { - "ApiKeyAuth": [] - } - ], - "description": "Job to stop is specified by database ID. Only stopTime and final state are required in request body.\nReturns full job resource information according to 'JobMeta' scheme.", - "consumes": [ - "application/json" - ], - "produces": [ - "application/json" - ], - "tags": [ - "Job add and modify" - ], - "summary": "Marks job as completed and triggers archiving", - "parameters": [ - { - "type": "integer", - "description": "Database ID of Job", - "name": "id", - "in": "path", - "required": true - }, - { - "description": "stopTime and final state in request body", - "name": "request", - "in": "body", - "required": true, - "schema": { - "$ref": "#/definitions/api.StopJobApiRequest" - } - } - ], - "responses": { - "200": { - "description": "Job resource", - "schema": { - "$ref": "#/definitions/schema.JobMeta" - } - }, - "400": { - "description": "Bad Request", - "schema": { - "$ref": "#/definitions/api.ErrorResponse" - } - }, - "401": { - "description": "Unauthorized", - "schema": { - "$ref": "#/definitions/api.ErrorResponse" - } - }, - "403": { - "description": "Forbidden", - "schema": { - "$ref": "#/definitions/api.ErrorResponse" - } - }, - "404": { - "description": "Resource not found", - "schema": { - "$ref": "#/definitions/api.ErrorResponse" - } - }, - "422": { - "description": "Unprocessable Entity: finding job failed: sql: no rows in result set", - "schema": { - "$ref": "#/definitions/api.ErrorResponse" - } - }, - "500": { - "description": "Internal Server Error", - "schema": { - "$ref": "#/definitions/api.ErrorResponse" - } - } - } - } - }, "/jobs/tag_job/{id}": { "post": { "security": [ @@ -690,7 +608,7 @@ const docTemplate = `{ "ApiKeyAuth": [] } ], - "description": "Adds tag(s) to a job specified by DB ID. Name and Type of Tag(s) can be chosen freely.\nIf tagged job is already finished: Tag will be written directly to respective archive files.", + "description": "Adds tag(s) to a job specified by DB ID. Name and Type of Tag(s) can be chosen freely.\nTag Scope for frontend visibility will default to \"global\" if none entered, other options: \"admin\" or specific username.\nIf tagged job is already finished: Tag will be written directly to respective archive files.", "consumes": [ "application/json" ], @@ -1283,6 +1201,11 @@ const docTemplate = `{ "type": "string", "example": "Testjob" }, + "scope": { + "description": "Tag Scope for Frontend Display", + "type": "string", + "example": "global" + }, "type": { "description": "Tag Type", "type": "string", @@ -1410,9 +1333,8 @@ const docTemplate = `{ "api.StartJobApiResponse": { "type": "object", "properties": { - "id": { - "description": "Database ID of new job", - "type": "integer" + "msg": { + "type": "string" } } }, @@ -1424,17 +1346,14 @@ const docTemplate = `{ ], "properties": { "cluster": { - "description": "Cluster of job", "type": "string", "example": "fritz" }, "jobId": { - "description": "Cluster Job ID of job", "type": "integer", "example": 123000 }, "jobState": { - "description": "Final job state", "allOf": [ { "$ref": "#/definitions/schema.JobState" @@ -1443,12 +1362,10 @@ const docTemplate = `{ "example": "completed" }, "startTime": { - "description": "Start Time of job as epoch", "type": "integer", "example": 1649723812 }, "stopTime": { - "description": "Stop Time of job as epoch", "type": "integer", "example": 1649763839 } @@ -1493,12 +1410,10 @@ const docTemplate = `{ "type": "object", "properties": { "arrayJobId": { - "description": "The unique identifier of an array job", "type": "integer", "example": 123000 }, "cluster": { - "description": "The unique identifier of a cluster", "type": "string", "example": "fritz" }, @@ -1506,33 +1421,39 @@ const docTemplate = `{ "$ref": "#/definitions/schema.JobLinkResultList" }, "duration": { - "description": "Duration of job in seconds (Min \u003e 0)", "type": "integer", "minimum": 1, "example": 43200 }, + "energy": { + "type": "number" + }, + "energyFootprint": { + "type": "object", + "additionalProperties": { + "type": "number" + } + }, "exclusive": { - "description": "Specifies how nodes are shared: 0 - Shared among multiple jobs of multiple users, 1 - Job exclusive (Default), 2 - Shared among multiple jobs of same user", "type": "integer", "maximum": 2, "minimum": 0, "example": 1 }, - "flopsAnyAvg": { - "description": "FlopsAnyAvg as Float64", - "type": "number" + "footprint": { + "type": "object", + "additionalProperties": { + "type": "number" + } }, "id": { - "description": "The unique identifier of a job in the database", "type": "integer" }, "jobId": { - "description": "The unique identifier of a job", "type": "integer", "example": 123000 }, "jobState": { - "description": "Final state of job", "enum": [ "completed", "failed", @@ -1548,95 +1469,69 @@ const docTemplate = `{ ], "example": "completed" }, - "loadAvg": { - "description": "LoadAvg as Float64", - "type": "number" - }, - "memBwAvg": { - "description": "MemBwAvg as Float64", - "type": "number" - }, - "memUsedMax": { - "description": "MemUsedMax as Float64", - "type": "number" - }, "metaData": { - "description": "Additional information about the job", "type": "object", "additionalProperties": { "type": "string" } }, "monitoringStatus": { - "description": "State of monitoring system during job run: 0 - Disabled, 1 - Running or Archiving (Default), 2 - Archiving Failed, 3 - Archiving Successfull", "type": "integer", "maximum": 3, "minimum": 0, "example": 1 }, "numAcc": { - "description": "Number of accelerators used (Min \u003e 0)", "type": "integer", "minimum": 1, "example": 2 }, "numHwthreads": { - "description": "NumCores int32 ` + "`" + `json:\"numCores\" db:\"num_cores\" example:\"20\" minimum:\"1\"` + "`" + ` // Number of HWThreads used (Min \u003e 0)", "type": "integer", "minimum": 1, "example": 20 }, "numNodes": { - "description": "Number of nodes used (Min \u003e 0)", "type": "integer", "minimum": 1, "example": 2 }, "partition": { - "description": "The Slurm partition to which the job was submitted", "type": "string", "example": "main" }, "project": { - "description": "The unique identifier of a project", "type": "string", "example": "abcd200" }, "resources": { - "description": "Resources used by job", "type": "array", "items": { "$ref": "#/definitions/schema.Resource" } }, "smt": { - "description": "SMT threads used by job", "type": "integer", "example": 4 }, "startTime": { - "description": "Start time as 'time.Time' data type", "type": "string" }, "subCluster": { - "description": "The unique identifier of a sub cluster", "type": "string", "example": "main" }, "tags": { - "description": "List of tags", "type": "array", "items": { "$ref": "#/definitions/schema.Tag" } }, "user": { - "description": "The unique identifier of a user", "type": "string", "example": "abcd100h" }, "walltime": { - "description": "Requested walltime of job in seconds (Min \u003e 0)", "type": "integer", "minimum": 1, "example": 86400 @@ -1673,12 +1568,10 @@ const docTemplate = `{ "type": "object", "properties": { "arrayJobId": { - "description": "The unique identifier of an array job", "type": "integer", "example": 123000 }, "cluster": { - "description": "The unique identifier of a cluster", "type": "string", "example": "fritz" }, @@ -1686,29 +1579,39 @@ const docTemplate = `{ "$ref": "#/definitions/schema.JobLinkResultList" }, "duration": { - "description": "Duration of job in seconds (Min \u003e 0)", "type": "integer", "minimum": 1, "example": 43200 }, + "energy": { + "type": "number" + }, + "energyFootprint": { + "type": "object", + "additionalProperties": { + "type": "number" + } + }, "exclusive": { - "description": "Specifies how nodes are shared: 0 - Shared among multiple jobs of multiple users, 1 - Job exclusive (Default), 2 - Shared among multiple jobs of same user", "type": "integer", "maximum": 2, "minimum": 0, "example": 1 }, + "footprint": { + "type": "object", + "additionalProperties": { + "type": "number" + } + }, "id": { - "description": "The unique identifier of a job in the database", "type": "integer" }, "jobId": { - "description": "The unique identifier of a job", "type": "integer", "example": 123000 }, "jobState": { - "description": "Final state of job", "enum": [ "completed", "failed", @@ -1725,91 +1628,76 @@ const docTemplate = `{ "example": "completed" }, "metaData": { - "description": "Additional information about the job", "type": "object", "additionalProperties": { "type": "string" } }, "monitoringStatus": { - "description": "State of monitoring system during job run: 0 - Disabled, 1 - Running or Archiving (Default), 2 - Archiving Failed, 3 - Archiving Successfull", "type": "integer", "maximum": 3, "minimum": 0, "example": 1 }, "numAcc": { - "description": "Number of accelerators used (Min \u003e 0)", "type": "integer", "minimum": 1, "example": 2 }, "numHwthreads": { - "description": "NumCores int32 ` + "`" + `json:\"numCores\" db:\"num_cores\" example:\"20\" minimum:\"1\"` + "`" + ` // Number of HWThreads used (Min \u003e 0)", "type": "integer", "minimum": 1, "example": 20 }, "numNodes": { - "description": "Number of nodes used (Min \u003e 0)", "type": "integer", "minimum": 1, "example": 2 }, "partition": { - "description": "The Slurm partition to which the job was submitted", "type": "string", "example": "main" }, "project": { - "description": "The unique identifier of a project", "type": "string", "example": "abcd200" }, "resources": { - "description": "Resources used by job", "type": "array", "items": { "$ref": "#/definitions/schema.Resource" } }, "smt": { - "description": "SMT threads used by job", "type": "integer", "example": 4 }, "startTime": { - "description": "Start epoch time stamp in seconds (Min \u003e 0)", "type": "integer", "minimum": 1, "example": 1649723812 }, "statistics": { - "description": "Metric statistics of job", "type": "object", "additionalProperties": { "$ref": "#/definitions/schema.JobStatistics" } }, "subCluster": { - "description": "The unique identifier of a sub cluster", "type": "string", "example": "main" }, "tags": { - "description": "List of tags", "type": "array", "items": { "$ref": "#/definitions/schema.Tag" } }, "user": { - "description": "The unique identifier of a user", "type": "string", "example": "abcd100h" }, "walltime": { - "description": "Requested walltime of job in seconds (Min \u003e 0)", "type": "integer", "minimum": 1, "example": 86400 @@ -1898,6 +1786,15 @@ const docTemplate = `{ "caution": { "type": "number" }, + "energy": { + "type": "string" + }, + "footprint": { + "type": "string" + }, + "lowerIsBetter": { + "type": "boolean" + }, "name": { "type": "string" }, @@ -1975,22 +1872,18 @@ const docTemplate = `{ "type": "object", "properties": { "accelerators": { - "description": "List of of accelerator device ids", "type": "array", "items": { "type": "string" } }, "configuration": { - "description": "The configuration options of the node", "type": "string" }, "hostname": { - "description": "Name of the host (= node)", "type": "string" }, "hwthreads": { - "description": "List of OS processor ids", "type": "array", "items": { "type": "integer" @@ -2033,6 +1926,12 @@ const docTemplate = `{ "type": "number" } }, + "median": { + "type": "array", + "items": { + "type": "number" + } + }, "min": { "type": "array", "items": { @@ -2056,15 +1955,33 @@ const docTemplate = `{ "coresPerSocket": { "type": "integer" }, + "energyFootprint": { + "type": "array", + "items": { + "type": "string" + } + }, "flopRateScalar": { "$ref": "#/definitions/schema.MetricValue" }, "flopRateSimd": { "$ref": "#/definitions/schema.MetricValue" }, + "footprint": { + "type": "array", + "items": { + "type": "string" + } + }, "memoryBandwidth": { "$ref": "#/definitions/schema.MetricValue" }, + "metricConfig": { + "type": "array", + "items": { + "$ref": "#/definitions/schema.MetricConfig" + } + }, "name": { "type": "string" }, @@ -2094,6 +2011,15 @@ const docTemplate = `{ "caution": { "type": "number" }, + "energy": { + "type": "string" + }, + "footprint": { + "type": "string" + }, + "lowerIsBetter": { + "type": "boolean" + }, "name": { "type": "string" }, @@ -2113,16 +2039,17 @@ const docTemplate = `{ "type": "object", "properties": { "id": { - "description": "The unique DB identifier of a tag", "type": "integer" }, "name": { - "description": "Tag Name", "type": "string", "example": "Testjob" }, + "scope": { + "type": "string", + "example": "global" + }, "type": { - "description": "Tag Type", "type": "string", "example": "Debug" } diff --git a/internal/api/rest.go b/internal/api/rest.go index b60521b..3842596 100644 --- a/internal/api/rest.go +++ b/internal/api/rest.go @@ -124,8 +124,7 @@ func (api *RestApi) MountFrontendApiRoutes(r *mux.Router) { // StartJobApiResponse model type StartJobApiResponse struct { - // Database ID of new job - DBID int64 `json:"id"` + Message string `json:"msg"` } // DeleteJobApiResponse model @@ -806,25 +805,10 @@ func (api *RestApi) startJob(rw http.ResponseWriter, r *http.Request) { repository.TriggerJobStart(repository.JobWithUser{Job: &req, User: repository.GetUserFromContext(r.Context())}) - id, err := api.JobRepository.Start(&req) - if err != nil { - handleError(fmt.Errorf("insert into database failed: %w", err), http.StatusInternalServerError, rw) - return - } - - for _, tag := range req.Tags { - if _, err := api.JobRepository.AddTagOrCreate(repository.GetUserFromContext(r.Context()), id, tag.Type, tag.Name, tag.Scope); err != nil { - http.Error(rw, err.Error(), http.StatusInternalServerError) - handleError(fmt.Errorf("adding tag to new job %d failed: %w", id, err), http.StatusInternalServerError, rw) - return - } - } - - log.Printf("new job (id: %d): cluster=%s, jobId=%d, user=%s, startTime=%d", id, req.Cluster, req.JobID, req.User, req.StartTime) rw.Header().Add("Content-Type", "application/json") rw.WriteHeader(http.StatusCreated) json.NewEncoder(rw).Encode(StartJobApiResponse{ - DBID: id, + Message: fmt.Sprintf("Successfully triggered job start"), }) } diff --git a/internal/repository/jobStartWorker.go b/internal/repository/jobStartWorker.go index dbd2247..18d2be7 100644 --- a/internal/repository/jobStartWorker.go +++ b/internal/repository/jobStartWorker.go @@ -6,6 +6,7 @@ package repository import ( "sync" + "time" "github.com/ClusterCockpit/cc-backend/pkg/log" "github.com/ClusterCockpit/cc-backend/pkg/schema" @@ -36,18 +37,30 @@ func jobStartWorker() { break } jobRepo := GetJobRepository() + var id int64 - id, err := jobRepo.Start(req.Job) - if err != nil { - log.Errorf("insert into database failed: %v", err) + for i := 0; i < 5; i++ { + var err error + + id, err = jobRepo.Start(req.Job) + if err != nil { + log.Errorf("Attempt %d: insert into database failed: %v", i, err) + } else { + break + } + time.Sleep(1 * time.Second) } for _, tag := range req.Job.Tags { - if _, err := jobRepo.AddTagOrCreate(req.User, id, tag.Type, tag.Name, tag.Scope); err != nil { + if _, err := jobRepo.AddTagOrCreate(req.User, id, + tag.Type, tag.Name, tag.Scope); err != nil { log.Errorf("adding tag to new job %d failed: %v", id, err) } } + log.Printf("new job (id: %d): cluster=%s, jobId=%d, user=%s, startTime=%d", + id, req.Job.Cluster, req.Job.JobID, req.Job.User, req.Job.StartTime) + jobStartPending.Done() } }