new /api/jobs/ REST endpoint

Lou Knauer 2022-01-17 13:27:40 +01:00
parent 98c4de65a7
commit 1a9f67fa28
2 changed files with 263 additions and 176 deletions


@@ -1,171 +1,203 @@
 #
 # ClusterCockpit's API spec can be exported via:
 #   docker exec -it cc-php php bin/console api:openapi:export --yaml
 #
 # This spec is written by hand and hopefully up to date with the API.
 #
 openapi: 3.0.3
 info:
   title: 'ClusterCockpit REST API'
   description: 'API for batch job control'
   version: 0.0.2
 servers:
   - url: /
     description: ''
 paths:
+  '/api/jobs/':
+    get:
+      operationId: 'getJobs'
+      summary: 'List all jobs'
+      description: 'Get a list of all jobs in the JSON schema defined via GraphQL (main difference: start-time is RFC3339 encoded and there are no statistics). Filters can be applied using query parameters.'
+      parameters:
+        - name: state
+          in: query
+          schema:
+            type: string
+            enum: ["running", "completed", "failed", "canceled", "stopped", "timeout"]
+        - name: cluster
+          in: query
+          schema: { type: string }
+      responses:
+        200:
+          description: 'Array of jobs'
+          content:
+            'application/json':
+              schema:
+                type: array
+                items:
+                  # Not totally true: start-time is a RFC3339 string here!
+                  $ref: '#/components/schemas/Job'
+        400:
+          description: 'Bad Request'
   '/api/jobs/{id}':
     get:
       operationId: 'getJob'
       summary: 'Get job resource'
       parameters:
         - name: id
           in: path
           required: true
           schema: { type: integer }
           description: 'Database ID (Resource Identifier)'
       responses:
         200:
           description: 'Job resource'
           content:
             'application/json':
               schema:
                 $ref: '#/components/schemas/Job'
         404:
           description: 'Resource not found'
   '/api/jobs/tag_job/{id}':
     post:
       operationId: 'tagJob'
       summary: 'Add a tag to a job'
       parameters:
         - name: id
           in: path
           required: true
           schema: { type: integer }
           description: 'Job ID'
       requestBody:
         description: 'Array of tags to add'
         required: true
         content:
           'application/json':
             schema:
               type: array
               items:
                 $ref: '#/components/schemas/Tag'
       responses:
         200:
           description: 'Job resource'
           content:
             'application/json':
               schema:
                 $ref: '#/components/schemas/Job'
         404:
           description: 'Job or tag does not exist'
         400:
           description: 'Bad request'
   '/api/jobs/start_job/':
     post:
       operationId: 'startJob'
       summary: 'Add a newly started job'
       requestBody:
         required: true
         content:
           'application/json':
             schema:
               $ref: '#/components/schemas/Job'
       responses:
         201:
           description: 'Job successfully'
           content:
             'application/json':
               schema:
                 type: object
                 properties:
                   id:
                     type: integer
                     description: 'The database ID assigned to this job'
         400:
           description: 'Bad request'
         422:
           description: 'The combination of jobId, clusterId and startTime does already exist'
   '/api/jobs/stop_job/':
     post:
       operationId: stopJobViaJobID
       summary: 'Mark a job as stopped. Which job to stop is specified by the request body.'
       requestBody:
         required: true
         content:
           'application/json':
             schema:
               type: object
-              required: [jobId, cluster, startTime, stopTime]
+              required: [jobId, cluster, stopTime, jobState]
               properties:
                 jobId: { type: integer }
                 cluster: { type: string }
                 startTime: { type: integer }
                 stopTime: { type: integer }
+                jobState:
+                  type: string
+                  enum: ["running", "completed", "failed", "canceled", "stopped", "timeout"]
       responses:
         200:
           description: 'Job resource'
           content:
             'application/json':
               schema:
                 $ref: '#/components/schemas/Job'
         400:
           description: 'Bad request'
         404:
           description: 'Resource not found'
   '/api/jobs/stop_job/{id}':
     post:
       operationId: 'stopJobViaDBID'
       summary: 'Mark a job as stopped.'
       parameters:
         - name: id
           in: path
           required: true
           schema: { type: integer }
           description: 'Database ID (Resource Identifier)'
       requestBody:
         required: true
         content:
           'application/json':
             schema:
               type: object
-              required: [stopTime]
+              required: [stopTime, jobState]
               properties:
                 stopTime: { type: integer }
+                jobState:
+                  type: string
+                  enum: ["running", "completed", "failed", "canceled", "stopped", "timeout"]
       responses:
         200:
           description: 'Job resource'
           content:
             'application/json':
               schema:
                 $ref: '#/components/schemas/Job'
         400:
           description: 'Bad request'
         404:
           description: 'Resource not found'
 components:
   schemas:
     Tag:
       description: 'A job tag'
       type: object
       properties:
         id:
           type: string
           description: 'Database ID'
         type:
           type: string
           description: 'Tag type'
         name:
           type: string
           description: 'Tag name'
     Job:
       $ref: https://raw.githubusercontent.com/ClusterCockpit/cc-specifications/master/schema/json/job-meta.schema.json
   securitySchemes:
     bearerAuth:
       type: http
       scheme: bearer
       bearerFormat: JWT
 security:
   - bearerAuth: [] # Applies `bearerAuth` globally
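
For illustration only (not part of this commit): a minimal Go client sketch for the new listing endpoint; the base URL, cluster name and token below are placeholders, not values taken from the spec.

package main

import (
	"encoding/json"
	"fmt"
	"log"
	"net/http"
)

func main() {
	// Assumed deployment values; replace with your own server address and JWT.
	const base = "http://localhost:8080"
	const token = "<JWT>"

	// List running jobs of one cluster via the new GET /api/jobs/ endpoint.
	req, err := http.NewRequest(http.MethodGet, base+"/api/jobs/?state=running&cluster=emmy", nil)
	if err != nil {
		log.Fatal(err)
	}
	req.Header.Set("Authorization", "Bearer "+token)

	resp, err := http.DefaultClient.Do(req)
	if err != nil {
		log.Fatal(err)
	}
	defer resp.Body.Close()

	// Decode into a generic structure here; the real schema is the GraphQL job
	// type (start-time as an RFC3339 string, no statistics).
	var jobs []map[string]interface{}
	if err := json.NewDecoder(resp.Body).Decode(&jobs); err != nil {
		log.Fatal(err)
	}
	fmt.Println("jobs returned:", len(jobs))
}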


@@ -1,6 +1,7 @@
 package api
 
 import (
+	"bufio"
 	"context"
 	"encoding/json"
 	"fmt"
@@ -13,6 +14,7 @@ import (
 	"github.com/ClusterCockpit/cc-jobarchive/config"
 	"github.com/ClusterCockpit/cc-jobarchive/graph"
+	"github.com/ClusterCockpit/cc-jobarchive/graph/model"
 	"github.com/ClusterCockpit/cc-jobarchive/metricdata"
 	"github.com/ClusterCockpit/cc-jobarchive/schema"
 	sq "github.com/Masterminds/squirrel"
@@ -36,11 +38,14 @@ func (api *RestApi) MountRoutes(r *mux.Router) {
 	r.HandleFunc("/jobs/stop_job/", api.stopJob).Methods(http.MethodPost, http.MethodPut)
 	r.HandleFunc("/jobs/stop_job/{id}", api.stopJob).Methods(http.MethodPost, http.MethodPut)
+	r.HandleFunc("/jobs/", api.getJobs).Methods(http.MethodGet)
 	r.HandleFunc("/jobs/{id}", api.getJob).Methods(http.MethodGet)
 	r.HandleFunc("/jobs/tag_job/{id}", api.tagJob).Methods(http.MethodPost, http.MethodPatch)
-	r.HandleFunc("/machine_state/{cluster}/{host}", api.getMachineState).Methods(http.MethodGet)
-	r.HandleFunc("/machine_state/{cluster}/{host}", api.putMachineState).Methods(http.MethodPut, http.MethodPost)
+	if api.MachineStateDir != "" {
+		r.HandleFunc("/machine_state/{cluster}/{host}", api.getMachineState).Methods(http.MethodGet)
+		r.HandleFunc("/machine_state/{cluster}/{host}", api.putMachineState).Methods(http.MethodPut, http.MethodPost)
+	}
 }
 
 type StartJobApiRespone struct {
@@ -50,7 +55,7 @@ type StartJobApiRespone struct {
 type StopJobApiRequest struct {
 	// JobId, ClusterId and StartTime are optional.
 	// They are only used if no database id was provided.
-	JobId     *string `json:"jobId"`
+	JobId     *int64  `json:"jobId"`
 	Cluster   *string `json:"cluster"`
 	StartTime *int64  `json:"startTime"`
@@ -64,6 +69,44 @@ type TagJobApiRequest []*struct {
 	Type string `json:"type"`
 }
 
+// Return a list of jobs
+func (api *RestApi) getJobs(rw http.ResponseWriter, r *http.Request) {
+	filter := model.JobFilter{}
+	for key, vals := range r.URL.Query() {
+		switch key {
+		case "state":
+			for _, s := range vals {
+				state := schema.JobState(s)
+				if !state.Valid() {
+					http.Error(rw, "invalid query parameter value: state", http.StatusBadRequest)
+					return
+				}
+				filter.State = append(filter.State, state)
+			}
+		case "cluster":
+			filter.Cluster = &model.StringInput{Eq: &vals[0]}
+		default:
+			http.Error(rw, "invalid query parameter: "+key, http.StatusBadRequest)
+			return
+		}
+	}
+
+	results, err := api.Resolver.Query().Jobs(r.Context(), []*model.JobFilter{&filter}, nil, nil)
+	if err != nil {
+		http.Error(rw, err.Error(), http.StatusInternalServerError)
+		return
+	}
+
+	bw := bufio.NewWriter(rw)
+	defer bw.Flush()
+	if err := json.NewEncoder(bw).Encode(results.Items); err != nil {
+		http.Error(rw, err.Error(), http.StatusInternalServerError)
+		return
+	}
+}
+
+// Return a single job
 func (api *RestApi) getJob(rw http.ResponseWriter, r *http.Request) {
 	id := mux.Vars(r)["id"]
@@ -84,6 +127,7 @@ func (api *RestApi) getJob(rw http.ResponseWriter, r *http.Request) {
 	json.NewEncoder(rw).Encode(job)
 }
 
+// Add a tag to a job
 func (api *RestApi) tagJob(rw http.ResponseWriter, r *http.Request) {
 	id := mux.Vars(r)["id"]
 	job, err := api.Resolver.Query().Job(r.Context(), id)
@@ -130,6 +174,8 @@ func (api *RestApi) tagJob(rw http.ResponseWriter, r *http.Request) {
 	json.NewEncoder(rw).Encode(job)
 }
 
+// A new job started. The body should be in the `meta.json` format, but some fields required
+// there are optional here (e.g. `jobState` defaults to "running").
 func (api *RestApi) startJob(rw http.ResponseWriter, r *http.Request) {
 	req := schema.JobMeta{BaseJob: schema.JobDefaults}
 	if err := json.NewDecoder(r.Body).Decode(&req); err != nil {
@@ -142,6 +188,7 @@ func (api *RestApi) startJob(rw http.ResponseWriter, r *http.Request) {
 		return
 	}
 
+	// TODO: Do more such checks, be smarter with them.
 	if len(req.Resources) == 0 || len(req.User) == 0 || req.NumNodes == 0 {
 		http.Error(rw, "required fields are missing", http.StatusBadRequest)
 		return
@@ -193,6 +240,7 @@ func (api *RestApi) startJob(rw http.ResponseWriter, r *http.Request) {
 	})
 }
 
+// A job has stopped and should be archived.
 func (api *RestApi) stopJob(rw http.ResponseWriter, r *http.Request) {
 	req := StopJobApiRequest{}
 	if err := json.NewDecoder(r.Body).Decode(&req); err != nil {
@@ -207,10 +255,13 @@ func (api *RestApi) stopJob(rw http.ResponseWriter, r *http.Request) {
 	if ok {
 		sql, args, err = sq.Select(schema.JobColumns...).From("job").Where("job.id = ?", id).ToSql()
 	} else {
-		sql, args, err = sq.Select(schema.JobColumns...).From("job").
-			Where("job.job_id = ?", req.JobId).
-			Where("job.cluster = ?", req.Cluster).
-			Where("job.start_time = ?", req.StartTime).ToSql()
+		qb := sq.Select(schema.JobColumns...).From("job").
+			Where("job.job_id = ?", req.JobId).
+			Where("job.cluster = ?", req.Cluster)
+		if req.StartTime != nil {
+			qb = qb.Where("job.start_time = ?", *req.StartTime)
+		}
+		sql, args, err = qb.ToSql()
 	}
 	if err != nil {
 		http.Error(rw, err.Error(), http.StatusBadRequest)
@@ -234,6 +285,10 @@ func (api *RestApi) stopJob(rw http.ResponseWriter, r *http.Request) {
 		req.State = schema.JobStateCompleted
 	}
 
+	// This closure does the real work. It needs to be its own
+	// function so that it can be done in the background.
+	// TODO: Throttle/Have a max. number of parallel archivings,
+	// or use a long-running goroutine receiving jobs by a channel.
 	doArchiving := func(job *schema.Job, ctx context.Context) error {
 		api.OngoingArchivings.Add(1)
 		defer api.OngoingArchivings.Done()
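
The TODO in the last hunk mentions bounding the archiving work, for example with a long-running goroutine fed by a channel. For illustration only, a self-contained sketch of that pattern under stated assumptions: the Job type, the archive step, and the limits are stand-ins, not the project's code.

package main

import (
	"fmt"
	"sync"
	"time"
)

// Job stands in for the project's schema.Job; only the ID matters here.
type Job struct{ ID int64 }

// archiveJob stands in for the expensive archiving step.
func archiveJob(job Job) {
	time.Sleep(50 * time.Millisecond)
	fmt.Println("archived job", job.ID)
}

func main() {
	const maxParallel = 4      // upper bound on concurrent archivings
	jobs := make(chan Job, 16) // stopped jobs are queued here instead of spawning one goroutine each

	// A fixed pool of workers drains the channel, so at most maxParallel
	// archivings run at any time.
	var wg sync.WaitGroup
	for i := 0; i < maxParallel; i++ {
		wg.Add(1)
		go func() {
			defer wg.Done()
			for job := range jobs {
				archiveJob(job)
			}
		}()
	}

	// Simulate stop_job requests handing over finished jobs.
	for id := int64(1); id <= 10; id++ {
		jobs <- Job{ID: id}
	}
	close(jobs)
	wg.Wait() // wait for all queued jobs to be archived before shutting down
}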