Restructure repo

2025-10-15 02:54:30 +02:00 · 2022-03-18 14:47:39 +01:00
parent 403f74ddee
commit d762e3e52b
15 changed files with 490 additions and 35 deletions
--- a/datastructures/README.md
+++ b/datastructures/README.md
@@ -0,0 +1,4 @@
+## Generic database specification
+
+This collection of data structure descriptions is intended to be used
+as data structures in applications, payloads in APIs, and file formats.
--- a/datastructures/cluster.schema.json
+++ b/datastructures/cluster.schema.json
--- a/datastructures/job-data.schema.json
+++ b/datastructures/job-data.schema.json
--- a/datastructures/job-meta.schema.json
+++ b/datastructures/job-meta.schema.json
--- a/datastructures/job-metric-data.schema.json
+++ b/datastructures/job-metric-data.schema.json
--- a/datastructures/job-metric-statistics.schema.json
+++ b/datastructures/job-metric-statistics.schema.json
--- a/datastructures/unit.schema.json
+++ b/datastructures/unit.schema.json
--- a/interfaces/graphql/README.md
+++ b/interfaces/graphql/README.md
@@ -0,0 +1,3 @@
+## GraphQL Schema
+
+This schema is intended for communication between web-frontend and web-backend.
--- a/interfaces/graphql/schema.graphqls
+++ b/interfaces/graphql/schema.graphqls
@@ -0,0 +1,261 @@
+scalar Time
+scalar Any
+
+scalar NullableFloat
+scalar MetricScope
+scalar JobState
+
+type Job {
+  id:               ID!
+  jobId:            Int!
+  user:             String!
+  project:          String!
+  cluster:          String!
+  subCluster:       String!
+  startTime:        Time!
+  duration:         Int!
+  walltime:         Int!
+  numNodes:         Int!
+  numHWThreads:     Int!
+  numAcc:           Int!
+  SMT:              Int!
+  exclusive:        Int!
+  partition:        String!
+  arrayJobId:       Int!
+  monitoringStatus: Int!
+  state:            JobState!
+  tags:             [Tag!]!
+  resources:        [Resource!]!
+
+  metaData:         Any
+  userData:         User
+}
+
+type Cluster {
+  name:         String!
+  partitions:   [String!]!        # Slurm partitions
+  metricConfig: [MetricConfig!]!
+  filterRanges: FilterRanges!
+  subClusters:  [SubCluster!]!    # Hardware partitions/subclusters
+}
+
+type SubCluster {
+  name:            String!
+  nodes:           String!
+  processorType:   String!
+  socketsPerNode:  Int!
+  coresPerSocket:  Int!
+  threadsPerCore:  Int!
+  flopRateScalar:  Int!
+  flopRateSimd:    Int!
+  memoryBandwidth: Int!
+  topology:        Topology!
+}
+
+type Topology {
+  node:         [Int!]
+  socket:       [[Int!]!]
+  memoryDomain: [[Int!]!]
+  die:          [[Int!]!]
+  core:         [[Int!]!]
+  accelerators: [Accelerator!]
+}
+
+type Accelerator {
+  id:    String!
+  type:  String!
+  model: String!
+}
+
+type MetricConfig {
+  name:     String!
+  unit:     String!
+  scope:    MetricScope!
+  timestep: Int!
+  peak:     Float!
+  normal:   Float!
+  caution:  Float!
+  alert:    Float!
+}
+
+type Tag {
+  id:   ID!
+  type: String!
+  name: String!
+}
+
+type Resource {
+  hostname:      String!
+  hwthreads:     [Int!]
+  accelerators:  [Int!]
+  configuration: String
+}
+
+type JobMetricWithName {
+  name:   String!
+  metric: JobMetric!
+}
+
+type JobMetric {
+  unit:             String!
+  scope:            MetricScope!
+  timestep:         Int!
+  series:           [Series!]
+  statisticsSeries: StatsSeries
+}
+
+type Series {
+  hostname:   String!
+  id:         Int
+  statistics: MetricStatistics
+  data:       [NullableFloat!]!
+}
+
+type MetricStatistics {
+  avg: Float!
+  min: Float!
+  max: Float!
+}
+
+type StatsSeries {
+  mean: [NullableFloat!]!
+  min:  [NullableFloat!]!
+  max:  [NullableFloat!]!
+}
+
+type MetricFootprints {
+  metric: String!
+  data:   [NullableFloat!]!
+}
+
+type Footprints {
+  nodehours: [NullableFloat!]!
+  metrics:   [MetricFootprints!]!
+}
+
+enum Aggregate { USER, PROJECT, CLUSTER }
+
+type NodeMetrics {
+  host:    String!
+  metrics: [JobMetricWithName!]!
+}
+
+type Count {
+  name:  String!
+  count: Int!
+}
+
+type User {
+  username: String!
+  name:     String!
+  email:    String!
+}
+
+type Query {
+  clusters:     [Cluster!]!   # List of all clusters
+  tags:         [Tag!]!       # List of all tags
+
+  user(username: String!): User  # Result is nullable
+
+  job(id: ID!): Job  # Result is nullable
+  jobMetrics(id: ID!, metrics: [String!], scopes: [MetricScope!]): [JobMetricWithName!]!
+  jobsFootprints(filter: [JobFilter!], metrics: [String!]!): Footprints
+
+  jobs(filter: [JobFilter!], page: PageRequest, order: OrderByInput): JobResultList!
+  jobsStatistics(filter: [JobFilter!], groupBy: Aggregate): [JobsStatistics!]!
+  jobsCount(filter: [JobFilter]!, groupBy: Aggregate!, limit: Int): [Count!]!  # NOTE(review): list elements are nullable here ([JobFilter]!), unlike the [JobFilter!] used by the other queries - confirm this is intended
+
+  rooflineHeatmap(filter: [JobFilter!]!, rows: Int!, cols: Int!, minX: Float!, minY: Float!, maxX: Float!, maxY: Float!): [[Float!]!]!  # Unlike jobs/jobsStatistics/jobsFootprints, the filter argument is required here
+
+  nodeMetrics(cluster: String!, partition: String, nodes: [String!], scopes: [MetricScope!], metrics: [String!], from: Time!, to: Time!): [NodeMetrics!]!
+}
+
+type Mutation {
+  createTag(type: String!, name: String!): Tag!
+  deleteTag(id: ID!): ID!
+  addTagsToJob(job: ID!, tagIds: [ID!]!): [Tag!]!
+  removeTagsFromJob(job: ID!, tagIds: [ID!]!): [Tag!]!
+
+  updateConfiguration(name: String!, value: String!): String
+}
+
+type IntRangeOutput { from: Int!, to: Int! }
+type TimeRangeOutput { from: Time!, to: Time! }
+
+type FilterRanges {
+  duration:  IntRangeOutput!
+  numNodes:  IntRangeOutput!
+  startTime: TimeRangeOutput!
+}
+
+input JobFilter {
+  tags:        [ID!]
+  jobId:       StringInput
+  arrayJobId:  Int
+  user:        StringInput
+  project:     StringInput
+  cluster:     StringInput
+  partition:   StringInput
+  duration:    IntRange
+
+  minRunningFor: Int
+
+  numNodes:        IntRange
+  numAccelerators: IntRange
+  numHWThreads:    IntRange
+
+  startTime:   TimeRange
+  state:       [JobState!]
+  flopsAnyAvg: FloatRange
+  memBwAvg:    FloatRange
+  loadAvg:     FloatRange
+  memUsedMax:  FloatRange
+}
+
+input OrderByInput {
+  field: String!
+  order: SortDirectionEnum! = ASC  # Defaults to ASC when the client omits this field
+}
+
+enum SortDirectionEnum {
+  DESC
+  ASC
+}
+
+input StringInput {
+  eq:         String
+  contains:   String
+  startsWith: String
+  endsWith:   String
+}
+
+input IntRange   { from: Int!,   to: Int! }
+input FloatRange { from: Float!, to: Float! }
+input TimeRange  { from: Time,   to: Time }  # Unlike IntRange/FloatRange, both bounds are nullable; presumably this allows open-ended ranges - confirm against the resolver
+
+type JobResultList {
+  items:  [Job!]!
+  offset: Int
+  limit:  Int
+  count:  Int
+}
+
+type HistoPoint {
+  count: Int!
+  value: Int!
+}
+
+type JobsStatistics  {
+  id:             ID!            # If `groupBy` was used, ID of the user/project/cluster
+  totalJobs:      Int!           # Number of jobs that matched
+  shortJobs:      Int!           # Number of jobs with a duration of less than 2 minutes
+  totalWalltime:  Int!           # Sum of the duration of all matched jobs in hours
+  totalCoreHours: Int!           # Sum of the core hours of all matched jobs
+  histWalltime:   [HistoPoint!]! # value: hour, count: number of jobs with a rounded duration of value
+  histNumNodes:   [HistoPoint!]! # value: number of nodes, count: number of jobs with that number of nodes
+}
+
+input PageRequest {
+  itemsPerPage: Int!
+  page:         Int!
+}
--- a/metrics/lineprotocol_alternative.md
+++ b/metrics/lineprotocol_alternative.md
--- a/interfaces/rest/README.md
+++ b/interfaces/rest/README.md
@@ -0,0 +1 @@
+## REST API interfaces
--- a/interfaces/rest/openapi.yaml
+++ b/interfaces/rest/openapi.yaml
@@ -0,0 +1,221 @@
+#
+# ClusterCockpit's API spec can be exported via:
+# docker exec -it cc-php php bin/console api:openapi:export --yaml
+#
+# This spec is written by hand and hopefully up to date with the API.
+#
+
+openapi: 3.0.3
+info:
+  title: 'ClusterCockpit REST API'
+  description: 'API for batch job control'
+  version: 0.0.2
+servers:
+  - url: /
+    description: ''
+paths:
+  '/api/jobs/':
+    get:
+      operationId: 'getJobs'
+      summary: 'List all jobs'
+      description: 'Get a list of all jobs. Filters can be applied using query parameters.'
+      parameters:
+        - name: state
+          in: query
+          schema:
+            type: string
+            enum: ["running", "completed", "failed", "canceled", "stopped", "timeout"]
+        - name: cluster
+          in: query
+          schema: { type: string }
+        - name: start-time
+          description: 'Syntax: "<from>-<to>", where <from> and <to> are unix timestamps in seconds'
+          in: query
+          schema: { type: string }
+        - name: page
+          in: query
+          schema: { type: integer }
+        - name: items-per-page
+          in: query
+          schema: { type: integer }
+        - name: with-metadata
+          in: query
+          schema: { type: boolean }
+      responses:
+        200:
+          description: 'Array of jobs'
+          content:
+            'application/json':
+              schema:
+                type: object
+                properties:
+                  jobs:
+                    type: array
+                    items:
+                      $ref: '#/components/schemas/Job'
+        400:
+          description: 'Bad Request'
+  '/api/jobs/tag_job/{id}':
+    post:
+      operationId: 'tagJob'
+      summary: 'Add a tag to a job'
+      parameters:
+        - name: id
+          in: path
+          required: true
+          schema: { type: integer }
+          description: 'Job ID'
+      requestBody:
+        description: 'Array of tags to add'
+        required: true
+        content:
+          'application/json':
+            schema:
+              type: array
+              items:
+                $ref: '#/components/schemas/Tag'
+      responses:
+        200:
+          description: 'Job resource'
+          content:
+            'application/json':
+              schema:
+                $ref: '#/components/schemas/Job'
+        404:
+          description: 'Job or tag does not exist'
+        400:
+          description: 'Bad request'
+  '/api/jobs/start_job/':
+    post:
+      operationId: 'startJob'
+      summary: 'Add a newly started job'
+      requestBody:
+        required: true
+        content:
+          'application/json':
+            schema:
+              $ref: '#/components/schemas/Job'
+      responses:
+        201:
+          description: 'Job successfully added'
+          content:
+            'application/json':
+              schema:
+                type: object
+                properties:
+                  id:
+                    type: integer
+                    description: 'The database ID assigned to this job'
+        400:
+          description: 'Bad request'
+        422:
+          description: 'The combination of jobId, clusterId and startTime already exists'
+  '/api/jobs/stop_job/':
+    post:
+      operationId: stopJobViaJobID
+      summary: 'Mark a job as stopped. Which job to stop is specified by the request body.'
+      requestBody:
+        required: true
+        content:
+          'application/json':
+            schema:
+              type: object
+              required: [jobId, cluster, stopTime, jobState] # startTime (declared below) is intentionally or accidentally not listed here, so it is optional
+              properties:
+                jobId: { type: integer }
+                cluster: { type: string }
+                startTime: { type: integer } # optional: not part of the `required` list above
+                stopTime: { type: integer }
+                jobState:
+                  type: string
+                  enum: ["running", "completed", "failed", "canceled", "stopped", "timeout"] # NOTE(review): "running" is accepted as a state for a stop request - confirm this is intended
+      responses:
+        200:
+          description: 'Job resource'
+          content:
+            'application/json':
+              schema:
+                $ref: '#/components/schemas/Job'
+        400:
+          description: 'Bad request'
+        404:
+          description: 'Resource not found'
+  '/api/jobs/stop_job/{id}':
+    post:
+      operationId: 'stopJobViaDBID'
+      summary: 'Mark a job as stopped.'
+      parameters:
+        - name: id
+          in: path
+          required: true
+          schema: { type: integer }
+          description: 'Database ID (Resource Identifier)'
+      requestBody:
+        required: true
+        content:
+          'application/json':
+            schema:
+              type: object
+              required: [stopTime, jobState]
+              properties:
+                stopTime: { type: integer }
+                jobState:
+                  type: string
+                  enum: ["running", "completed", "failed", "canceled", "stopped", "timeout"]
+      responses:
+        200:
+          description: 'Job resource'
+          content:
+            'application/json':
+              schema:
+                $ref: '#/components/schemas/Job'
+        400:
+          description: 'Bad request'
+        404:
+          description: 'Resource not found'
+  '/api/jobs/import/':
+    post:
+      operationId: 'importJob'
+      summary: 'Imports a job and its metric data'
+      requestBody:
+        required: true
+        content:
+          'application/json':
+            schema:
+              type: object
+              properties:
+                meta:
+                  $ref: https://raw.githubusercontent.com/ClusterCockpit/cc-specifications/master/schema/json/job-meta.schema.json
+                data:
+                  $ref: https://raw.githubusercontent.com/ClusterCockpit/cc-specifications/master/schema/json/job-data.schema.json
+      responses:
+        200:
+          description: 'Import successful'
+        400:
+          description: 'Bad request'
+        422:
+          description: 'Unprocessable Entity'
+components:
+  schemas:
+    Tag:
+      description: 'A job tag'
+      type: object
+      properties:
+        id:
+          type: string
+          description: 'Database ID'
+        type:
+          type: string
+          description: 'Tag type'
+        name:
+          type: string
+          description: 'Tag name'
+    Job: # Job schema is not defined inline; it is pulled from an external JSON schema file
+      $ref: https://raw.githubusercontent.com/ClusterCockpit/cc-specifications/master/schema/json/job-meta.schema.json # NOTE(review): this commit also touches datastructures/job-meta.schema.json - confirm the schema/json/ path on master still resolves
+  securitySchemes:
+    bearerAuth:
+      type: http
+      scheme: bearer
+      bearerFormat: JWT
+security:
+  - bearerAuth: [] # Applies `bearerAuth` globally
--- a/metrics/lineprotocol.md
+++ b/metrics/lineprotocol.md
@@ -1,35 +0,0 @@
-# Overview
-
-ClusterCockpit uses the InfluxData line-protocol for collecting the node metric
-data.
-
-```
-<measurement>,<tag set> <field set> <timestamp [s]>
-```
-
-Supported measurements:
-* node – Tags: host
-* socket – Tags: host, socket
-* cpu -- Tags: host, cpu
-
-## Supported node level fields
-
-* `load`
-* `mem_used`
-* `net_bw` - split into `ib_bw` and `eth_bw` if required
-* `file_bw` - split into multiple file systems if required
-
-## Supported socket fields
-
-All socket metrics can be aggregated to coarser granularity.
-
-* `power`
-* `mem_bw`
-
-## Supported cpu level fields
-
-All cpu metrics can be aggregated to coarser granularity.
-
-* `ipc`
-* `flops_any`
-* `clock`
--- a/schemas/README.md
+++ b/schemas/README.md
--- a/schema/sql/jobs-sqlite.sql
+++ b/schema/sql/jobs-sqlite.sql