From d762e3e52b88ad75f30c0cfd89bc8527f0a35c29 Mon Sep 17 00:00:00 2001
From: Jan Eitzinger
Date: Fri, 18 Mar 2022 14:47:39 +0100
Subject: [PATCH] Restructure repo

---
 datastructures/README.md                      |   4 +
 .../cluster.schema.json                       |   0
 .../job-data.schema.json                      |   0
 .../job-meta.schema.json                      |   0
 .../job-metric-data.schema.json               |   0
 .../job-metric-statistics.schema.json         |   0
 .../json => datastructures}/unit.schema.json  |   0
 interfaces/graphql/README.md                  |   3 +
 interfaces/graphql/schema.graphqls            | 261 ++++++++++++++++++
 .../lineprotocol/README.md                    |   0
 interfaces/rest/README.md                     |   1 +
 interfaces/rest/openapi.yaml                  | 221 +++++++++++++++
 metrics/lineprotocol.md                       |  35 ---
 schemas/README.md                             |   0
 {schema/sql => schemas}/jobs-sqlite.sql       |   0
 15 files changed, 490 insertions(+), 35 deletions(-)
 create mode 100644 datastructures/README.md
 rename {schema/json => datastructures}/cluster.schema.json (100%)
 rename {schema/json => datastructures}/job-data.schema.json (100%)
 rename {schema/json => datastructures}/job-meta.schema.json (100%)
 rename {schema/json => datastructures}/job-metric-data.schema.json (100%)
 rename {schema/json => datastructures}/job-metric-statistics.schema.json (100%)
 rename {schema/json => datastructures}/unit.schema.json (100%)
 create mode 100644 interfaces/graphql/README.md
 create mode 100644 interfaces/graphql/schema.graphqls
 rename metrics/lineprotocol_alternative.md => interfaces/lineprotocol/README.md (100%)
 create mode 100644 interfaces/rest/README.md
 create mode 100644 interfaces/rest/openapi.yaml
 delete mode 100644 metrics/lineprotocol.md
 create mode 100644 schemas/README.md
 rename {schema/sql => schemas}/jobs-sqlite.sql (100%)

diff --git a/datastructures/README.md b/datastructures/README.md
new file mode 100644
index 0000000..5c3bab2
--- /dev/null
+++ b/datastructures/README.md
@@ -0,0 +1,4 @@
+## Generic database specification
+
+This collection of data structure descriptions is intended to be used
+for data structures in applications, payloads in APIs, and file formats.
diff --git a/schema/json/cluster.schema.json b/datastructures/cluster.schema.json
similarity index 100%
rename from schema/json/cluster.schema.json
rename to datastructures/cluster.schema.json
diff --git a/schema/json/job-data.schema.json b/datastructures/job-data.schema.json
similarity index 100%
rename from schema/json/job-data.schema.json
rename to datastructures/job-data.schema.json
diff --git a/schema/json/job-meta.schema.json b/datastructures/job-meta.schema.json
similarity index 100%
rename from schema/json/job-meta.schema.json
rename to datastructures/job-meta.schema.json
diff --git a/schema/json/job-metric-data.schema.json b/datastructures/job-metric-data.schema.json
similarity index 100%
rename from schema/json/job-metric-data.schema.json
rename to datastructures/job-metric-data.schema.json
diff --git a/schema/json/job-metric-statistics.schema.json b/datastructures/job-metric-statistics.schema.json
similarity index 100%
rename from schema/json/job-metric-statistics.schema.json
rename to datastructures/job-metric-statistics.schema.json
diff --git a/schema/json/unit.schema.json b/datastructures/unit.schema.json
similarity index 100%
rename from schema/json/unit.schema.json
rename to datastructures/unit.schema.json
diff --git a/interfaces/graphql/README.md b/interfaces/graphql/README.md
new file mode 100644
index 0000000..73cac45
--- /dev/null
+++ b/interfaces/graphql/README.md
@@ -0,0 +1,3 @@
+## GraphQL Schema
+
+This schema is intended for communication between the web frontend and the web backend.
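To make the intended frontend/backend exchange concrete, here is a minimal example query against the schema introduced below. It is illustrative only: the field names are taken from schema.graphqls, but the cluster name "emmy" and the paging values are made up.

```graphql
{
  # List all clusters with their Slurm partitions
  clusters {
    name
    partitions
  }
  # First page of jobs on one (hypothetical) cluster
  jobs(
    filter: [{ cluster: { eq: "emmy" } }]
    page: { itemsPerPage: 10, page: 1 }
  ) {
    count
    items {
      jobId
      user
      numNodes
      startTime
      duration
    }
  }
}
```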
diff --git a/interfaces/graphql/schema.graphqls b/interfaces/graphql/schema.graphqls
new file mode 100644
index 0000000..b3fbe29
--- /dev/null
+++ b/interfaces/graphql/schema.graphqls
@@ -0,0 +1,261 @@
+scalar Time
+scalar Any
+
+scalar NullableFloat
+scalar MetricScope
+scalar JobState
+
+type Job {
+  id: ID!
+  jobId: Int!
+  user: String!
+  project: String!
+  cluster: String!
+  subCluster: String!
+  startTime: Time!
+  duration: Int!
+  walltime: Int!
+  numNodes: Int!
+  numHWThreads: Int!
+  numAcc: Int!
+  SMT: Int!
+  exclusive: Int!
+  partition: String!
+  arrayJobId: Int!
+  monitoringStatus: Int!
+  state: JobState!
+  tags: [Tag!]!
+  resources: [Resource!]!
+
+  metaData: Any
+  userData: User
+}
+
+type Cluster {
+  name: String!
+  partitions: [String!]!      # Slurm partitions
+  metricConfig: [MetricConfig!]!
+  filterRanges: FilterRanges!
+  subClusters: [SubCluster!]! # Hardware partitions/subclusters
+}
+
+type SubCluster {
+  name: String!
+  nodes: String!
+  processorType: String!
+  socketsPerNode: Int!
+  coresPerSocket: Int!
+  threadsPerCore: Int!
+  flopRateScalar: Int!
+  flopRateSimd: Int!
+  memoryBandwidth: Int!
+  topology: Topology!
+}
+
+type Topology {
+  node: [Int!]
+  socket: [[Int!]!]
+  memoryDomain: [[Int!]!]
+  die: [[Int!]!]
+  core: [[Int!]!]
+  accelerators: [Accelerator!]
+}
+
+type Accelerator {
+  id: String!
+  type: String!
+  model: String!
+}
+
+type MetricConfig {
+  name: String!
+  unit: String!
+  scope: MetricScope!
+  timestep: Int!
+  peak: Float!
+  normal: Float!
+  caution: Float!
+  alert: Float!
+}
+
+type Tag {
+  id: ID!
+  type: String!
+  name: String!
+}
+
+type Resource {
+  hostname: String!
+  hwthreads: [Int!]
+  accelerators: [Int!]
+  configuration: String
+}
+
+type JobMetricWithName {
+  name: String!
+  metric: JobMetric!
+}
+
+type JobMetric {
+  unit: String!
+  scope: MetricScope!
+  timestep: Int!
+  series: [Series!]
+  statisticsSeries: StatsSeries
+}
+
+type Series {
+  hostname: String!
+  id: Int
+  statistics: MetricStatistics
+  data: [NullableFloat!]!
+}
+
+type MetricStatistics {
+  avg: Float!
+  min: Float!
+  max: Float!
+}
+
+type StatsSeries {
+  mean: [NullableFloat!]!
+  min: [NullableFloat!]!
+  max: [NullableFloat!]!
+}
+
+type MetricFootprints {
+  metric: String!
+  data: [NullableFloat!]!
+}
+
+type Footprints {
+  nodehours: [NullableFloat!]!
+  metrics: [MetricFootprints!]!
+}
+
+enum Aggregate { USER, PROJECT, CLUSTER }
+
+type NodeMetrics {
+  host: String!
+  metrics: [JobMetricWithName!]!
+}
+
+type Count {
+  name: String!
+  count: Int!
+}
+
+type User {
+  username: String!
+  name: String!
+  email: String!
+}
+
+type Query {
+  clusters: [Cluster!]! # List of all clusters
+  tags: [Tag!]!         # List of all tags
+
+  user(username: String!): User
+
+  job(id: ID!): Job
+  jobMetrics(id: ID!, metrics: [String!], scopes: [MetricScope!]): [JobMetricWithName!]!
+  jobsFootprints(filter: [JobFilter!], metrics: [String!]!): Footprints
+
+  jobs(filter: [JobFilter!], page: PageRequest, order: OrderByInput): JobResultList!
+  jobsStatistics(filter: [JobFilter!], groupBy: Aggregate): [JobsStatistics!]!
+  jobsCount(filter: [JobFilter]!, groupBy: Aggregate!, limit: Int): [Count!]!
+
+  rooflineHeatmap(filter: [JobFilter!]!, rows: Int!, cols: Int!, minX: Float!, minY: Float!, maxX: Float!, maxY: Float!): [[Float!]!]!
+
+  nodeMetrics(cluster: String!, partition: String, nodes: [String!], scopes: [MetricScope!], metrics: [String!], from: Time!, to: Time!): [NodeMetrics!]!
+}
+
+type Mutation {
+  createTag(type: String!, name: String!): Tag!
+  deleteTag(id: ID!): ID!
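+  # A hypothetical client call, for illustration only (the job and tag IDs
+  # are made up; addTagsToJob is defined just below):
+  #   mutation {
+  #     addTagsToJob(job: "1234", tagIds: ["1", "7"]) { id type name }
+  #   }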
+  addTagsToJob(job: ID!, tagIds: [ID!]!): [Tag!]!
+  removeTagsFromJob(job: ID!, tagIds: [ID!]!): [Tag!]!
+
+  updateConfiguration(name: String!, value: String!): String
+}
+
+type IntRangeOutput { from: Int!, to: Int! }
+type TimeRangeOutput { from: Time!, to: Time! }
+
+type FilterRanges {
+  duration: IntRangeOutput!
+  numNodes: IntRangeOutput!
+  startTime: TimeRangeOutput!
+}
+
+input JobFilter {
+  tags: [ID!]
+  jobId: StringInput
+  arrayJobId: Int
+  user: StringInput
+  project: StringInput
+  cluster: StringInput
+  partition: StringInput
+  duration: IntRange
+
+  minRunningFor: Int
+
+  numNodes: IntRange
+  numAccelerators: IntRange
+  numHWThreads: IntRange
+
+  startTime: TimeRange
+  state: [JobState!]
+  flopsAnyAvg: FloatRange
+  memBwAvg: FloatRange
+  loadAvg: FloatRange
+  memUsedMax: FloatRange
+}
+
+input OrderByInput {
+  field: String!
+  order: SortDirectionEnum! = ASC
+}
+
+enum SortDirectionEnum {
+  DESC
+  ASC
+}
+
+input StringInput {
+  eq: String
+  contains: String
+  startsWith: String
+  endsWith: String
+}
+
+input IntRange { from: Int!, to: Int! }
+input FloatRange { from: Float!, to: Float! }
+input TimeRange { from: Time, to: Time }
+
+type JobResultList {
+  items: [Job!]!
+  offset: Int
+  limit: Int
+  count: Int
+}
+
+type HistoPoint {
+  count: Int!
+  value: Int!
+}
+
+type JobsStatistics {
+  id: ID!                      # If `groupBy` was used, ID of the user/project/cluster
+  totalJobs: Int!              # Number of jobs that matched
+  shortJobs: Int!              # Number of jobs with a duration of less than 2 minutes
+  totalWalltime: Int!          # Sum of the duration of all matched jobs in hours
+  totalCoreHours: Int!         # Sum of the core hours of all matched jobs
+  histWalltime: [HistoPoint!]! # value: hour, count: number of jobs with a rounded duration of value
+  histNumNodes: [HistoPoint!]! # value: number of nodes, count: number of jobs with that number of nodes
+}
+
+input PageRequest {
+  itemsPerPage: Int!
+  page: Int!
+}
diff --git a/metrics/lineprotocol_alternative.md b/interfaces/lineprotocol/README.md
similarity index 100%
rename from metrics/lineprotocol_alternative.md
rename to interfaces/lineprotocol/README.md
diff --git a/interfaces/rest/README.md b/interfaces/rest/README.md
new file mode 100644
index 0000000..cb701bc
--- /dev/null
+++ b/interfaces/rest/README.md
@@ -0,0 +1 @@
+## REST API interfaces
diff --git a/interfaces/rest/openapi.yaml b/interfaces/rest/openapi.yaml
new file mode 100644
index 0000000..2babbf5
--- /dev/null
+++ b/interfaces/rest/openapi.yaml
@@ -0,0 +1,221 @@
+#
+# ClusterCockpit's API spec can be exported via:
+#   docker exec -it cc-php php bin/console api:openapi:export --yaml
+#
+# This spec is written by hand and hopefully up to date with the API.
+#
+
+openapi: 3.0.3
+info:
+  title: 'ClusterCockpit REST API'
+  description: 'API for batch job control'
+  version: 0.0.2
+servers:
+  - url: /
+    description: ''
+paths:
+  '/api/jobs/':
+    get:
+      operationId: 'getJobs'
+      summary: 'List all jobs'
+      description: 'Get a list of all jobs. Filters can be applied using query parameters.'
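+      # A sketch of a matching request, assuming a JWT in $JWT and a made-up
+      # hostname (authentication is defined under securitySchemes below):
+      #   curl -H "Authorization: Bearer $JWT" \
+      #     'https://cc.example.org/api/jobs/?state=running&cluster=emmy&items-per-page=25&page=1'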
+      parameters:
+        - name: state
+          in: query
+          schema:
+            type: string
+            enum: ["running", "completed", "failed", "canceled", "stopped", "timeout"]
+        - name: cluster
+          in: query
+          schema: { type: string }
+        - name: start-time
+          description: 'Syntax: "<from>-<to>", where <from> and <to> are unix timestamps in seconds'
+          in: query
+          schema: { type: string }
+        - name: page
+          in: query
+          schema: { type: integer }
+        - name: items-per-page
+          in: query
+          schema: { type: integer }
+        - name: with-metadata
+          in: query
+          schema: { type: boolean }
+      responses:
+        200:
+          description: 'Array of jobs'
+          content:
+            'application/json':
+              schema:
+                type: object
+                properties:
+                  jobs:
+                    type: array
+                    items:
+                      $ref: '#/components/schemas/Job'
+        400:
+          description: 'Bad Request'
+  '/api/jobs/tag_job/{id}':
+    post:
+      operationId: 'tagJob'
+      summary: 'Add a tag to a job'
+      parameters:
+        - name: id
+          in: path
+          required: true
+          schema: { type: integer }
+          description: 'Job ID'
+      requestBody:
+        description: 'Array of tags to add'
+        required: true
+        content:
+          'application/json':
+            schema:
+              type: array
+              items:
+                $ref: '#/components/schemas/Tag'
+      responses:
+        200:
+          description: 'Job resource'
+          content:
+            'application/json':
+              schema:
+                $ref: '#/components/schemas/Job'
+        404:
+          description: 'Job or tag does not exist'
+        400:
+          description: 'Bad request'
+  '/api/jobs/start_job/':
+    post:
+      operationId: 'startJob'
+      summary: 'Add a newly started job'
+      requestBody:
+        required: true
+        content:
+          'application/json':
+            schema:
+              $ref: '#/components/schemas/Job'
+      responses:
+        201:
+          description: 'Job successfully created'
+          content:
+            'application/json':
+              schema:
+                type: object
+                properties:
+                  id:
+                    type: integer
+                    description: 'The database ID assigned to this job'
+        400:
+          description: 'Bad request'
+        422:
+          description: 'The combination of jobId, clusterId and startTime already exists'
+  '/api/jobs/stop_job/':
+    post:
+      operationId: 'stopJobViaJobID'
+      summary: 'Mark a job as stopped. Which job to stop is specified by the request body.'
+      requestBody:
+        required: true
+        content:
+          'application/json':
+            schema:
+              type: object
+              required: [jobId, cluster, stopTime, jobState]
+              properties:
+                jobId: { type: integer }
+                cluster: { type: string }
+                startTime: { type: integer }
+                stopTime: { type: integer }
+                jobState:
+                  type: string
+                  enum: ["running", "completed", "failed", "canceled", "stopped", "timeout"]
+      responses:
+        200:
+          description: 'Job resource'
+          content:
+            'application/json':
+              schema:
+                $ref: '#/components/schemas/Job'
+        400:
+          description: 'Bad request'
+        404:
+          description: 'Resource not found'
+  '/api/jobs/stop_job/{id}':
+    post:
+      operationId: 'stopJobViaDBID'
+      summary: 'Mark a job as stopped.'
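+      # A sketch of a matching request body (the values are made up; stopTime
+      # is a unix timestamp in seconds, like start-time above):
+      #   { "stopTime": 1647610583, "jobState": "completed" }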
+      parameters:
+        - name: id
+          in: path
+          required: true
+          schema: { type: integer }
+          description: 'Database ID (Resource Identifier)'
+      requestBody:
+        required: true
+        content:
+          'application/json':
+            schema:
+              type: object
+              required: [stopTime, jobState]
+              properties:
+                stopTime: { type: integer }
+                jobState:
+                  type: string
+                  enum: ["running", "completed", "failed", "canceled", "stopped", "timeout"]
+      responses:
+        200:
+          description: 'Job resource'
+          content:
+            'application/json':
+              schema:
+                $ref: '#/components/schemas/Job'
+        400:
+          description: 'Bad request'
+        404:
+          description: 'Resource not found'
+  '/api/jobs/import/':
+    post:
+      operationId: 'importJob'
+      summary: 'Imports a job and its metric data'
+      requestBody:
+        required: true
+        content:
+          'application/json':
+            schema:
+              type: object
+              properties:
+                meta:
+                  $ref: https://raw.githubusercontent.com/ClusterCockpit/cc-specifications/master/schema/json/job-meta.schema.json
+                data:
+                  $ref: https://raw.githubusercontent.com/ClusterCockpit/cc-specifications/master/schema/json/job-data.schema.json
+      responses:
+        200:
+          description: 'Import successful'
+        400:
+          description: 'Bad request'
+        422:
+          description: 'Unprocessable Entity'
+components:
+  schemas:
+    Tag:
+      description: 'A job tag'
+      type: object
+      properties:
+        id:
+          type: string
+          description: 'Database ID'
+        type:
+          type: string
+          description: 'Tag type'
+        name:
+          type: string
+          description: 'Tag name'
+    Job:
+      $ref: https://raw.githubusercontent.com/ClusterCockpit/cc-specifications/master/schema/json/job-meta.schema.json
+  securitySchemes:
+    bearerAuth:
+      type: http
+      scheme: bearer
+      bearerFormat: JWT
+security:
+  - bearerAuth: [] # Applies `bearerAuth` globally
\ No newline at end of file
diff --git a/metrics/lineprotocol.md b/metrics/lineprotocol.md
deleted file mode 100644
index 4c93d7b..0000000
--- a/metrics/lineprotocol.md
+++ /dev/null
@@ -1,35 +0,0 @@
-# Overview
-
-ClusterCockpit uses the InfluxData line-protocol for collecting the node metric
-data.
-
-```
-<measurement>,<tag-set> <field-set> <timestamp>
-```
-
-Supported measurements:
-* node - Tags: host
-* socket - Tags: host, socket
-* cpu - Tags: host, cpu
-
-## Supported node level fields
-
-* `load`
-* `mem_used`
-* `net_bw` - split into `ib_bw` and `eth_bw` if required
-* `file_bw` - split into multiple file systems if required
-
-## Supported socket fields
-
-All socket metrics can be aggregated to coarser granularity.
-
-* `power`
-* `mem_bw`
-
-## Supported cpu level fields
-
-All cpu metrics can be aggregated to coarser granularity.
-
-* `ipc`
-* `flops_any`
-* `clock`
diff --git a/schemas/README.md b/schemas/README.md
new file mode 100644
index 0000000..e69de29
diff --git a/schema/sql/jobs-sqlite.sql b/schemas/jobs-sqlite.sql
similarity index 100%
rename from schema/sql/jobs-sqlite.sql
rename to schemas/jobs-sqlite.sql