diff --git a/Makefile b/Makefile
index 48da4e0..52f0d39 100644
--- a/Makefile
+++ b/Makefile
@@ -2,7 +2,7 @@ TARGET = ./cc-backend
VAR = ./var
CFG = config.json .env
FRONTEND = ./web/frontend
-VERSION = 1.4.1
+VERSION = 1.4.2
GIT_HASH := $(shell git rev-parse --short HEAD || echo 'development')
CURRENT_TIME = $(shell date +"%Y-%m-%d:T%H:%M:%S")
LD_FLAGS = '-s -X main.date=${CURRENT_TIME} -X main.version=${VERSION} -X main.commit=${GIT_HASH}'
diff --git a/ReleaseNotes.md b/ReleaseNotes.md
index bb25b5d..2659964 100644
--- a/ReleaseNotes.md
+++ b/ReleaseNotes.md
@@ -1,4 +1,4 @@
-# `cc-backend` version 1.4.1
+# `cc-backend` version 1.4.2
Supports job archive version 2 and database version 8.
@@ -12,7 +12,8 @@ For release specific notes visit the [ClusterCockpit Documentation](https://clus
migration might require several hours!
- You need to adapt the `cluster.json` configuration files in the job-archive,
add new required attributes to the metric list and after that edit
- `./job-archive/version.txt` to version 2.
+ `./job-archive/version.txt` to version 2. Only metrics that have the footprint
+ attribute set can be filtered and show up in the footprint UI and polar plot.
- Continuous scrolling is default now in all job lists. You can change this back
to paging globally, also every user can configure to use paging or continuous
scrolling individually.
diff --git a/cmd/cc-backend/main.go b/cmd/cc-backend/main.go
index 436379d..33bab07 100644
--- a/cmd/cc-backend/main.go
+++ b/cmd/cc-backend/main.go
@@ -112,7 +112,7 @@ func main() {
if flagInit {
initEnv()
- fmt.Print("Succesfully setup environment!\n")
+ fmt.Print("Successfully setup environment!\n")
fmt.Print("Please review config.json and .env and adjust it to your needs.\n")
fmt.Print("Add your job-archive at ./var/job-archive.\n")
os.Exit(0)
diff --git a/cmd/cc-backend/server.go b/cmd/cc-backend/server.go
index 083b9e5..0770e81 100644
--- a/cmd/cc-backend/server.go
+++ b/cmd/cc-backend/server.go
@@ -25,7 +25,6 @@ import (
"github.com/ClusterCockpit/cc-backend/internal/config"
"github.com/ClusterCockpit/cc-backend/internal/graph"
"github.com/ClusterCockpit/cc-backend/internal/graph/generated"
- "github.com/ClusterCockpit/cc-backend/internal/repository"
"github.com/ClusterCockpit/cc-backend/internal/routerConfig"
"github.com/ClusterCockpit/cc-backend/pkg/log"
"github.com/ClusterCockpit/cc-backend/pkg/runtimeEnv"
@@ -314,9 +313,6 @@ func serverShutdown() {
// First shut down the server gracefully (waiting for all ongoing requests)
server.Shutdown(context.Background())
- // Then, wait for any async jobStarts still pending...
- repository.WaitForJobStart()
-
// Then, wait for any async archivings still pending...
archiver.WaitForArchiving()
}
diff --git a/init/clustercockpit.service b/init/clustercockpit.service
index 53fc429..0a9448d 100644
--- a/init/clustercockpit.service
+++ b/init/clustercockpit.service
@@ -1,5 +1,5 @@
[Unit]
-Description=ClusterCockpit Web Server (Go edition)
+Description=ClusterCockpit Web Server
Documentation=https://github.com/ClusterCockpit/cc-backend
Wants=network-online.target
After=network-online.target
diff --git a/internal/api/api_test.go b/internal/api/api_test.go
index bcabd5f..c47bd4d 100644
--- a/internal/api/api_test.go
+++ b/internal/api/api_test.go
@@ -249,9 +249,6 @@ func TestRestApi(t *testing.T) {
if response.StatusCode != http.StatusCreated {
t.Fatal(response.Status, recorder.Body.String())
}
-
- time.Sleep(1 * time.Second)
-
resolver := graph.GetResolverInstance()
job, err := restapi.JobRepository.Find(&TestJobId, &TestClusterName, &TestStartTime)
if err != nil {
diff --git a/internal/api/rest.go b/internal/api/rest.go
index db747ce..4e52701 100644
--- a/internal/api/rest.go
+++ b/internal/api/rest.go
@@ -123,18 +123,8 @@ func (api *RestApi) MountFrontendApiRoutes(r *mux.Router) {
}
}
-// StartJobApiResponse model
-type StartJobApiResponse struct {
- Message string `json:"msg"`
-}
-
-// DeleteJobApiResponse model
-type DeleteJobApiResponse struct {
- Message string `json:"msg"`
-}
-
-// UpdateUserApiResponse model
-type UpdateUserApiResponse struct {
+// DefaultJobApiResponse model
+type DefaultJobApiResponse struct {
Message string `json:"msg"`
}
@@ -341,7 +331,7 @@ func (api *RestApi) getJobs(rw http.ResponseWriter, r *http.Request) {
withMetadata := false
filter := &model.JobFilter{}
page := &model.PageRequest{ItemsPerPage: 25, Page: 1}
- order := &model.OrderByInput{Field: "startTime", Order: model.SortDirectionEnumDesc}
+ order := &model.OrderByInput{Field: "startTime", Type: "col", Order: model.SortDirectionEnumDesc}
for key, vals := range r.URL.Query() {
switch key {
@@ -790,6 +780,11 @@ func (api *RestApi) startJob(rw http.ResponseWriter, r *http.Request) {
return
}
+ // acquire lock to avoid race condition between API calls
+ var unlockOnce sync.Once
+ api.RepositoryMutex.Lock()
+ defer unlockOnce.Do(api.RepositoryMutex.Unlock)
+
// Check if combination of (job_id, cluster_id, start_time) already exists:
jobs, err := api.JobRepository.FindAll(&req.JobID, &req.Cluster, nil)
if err != nil && err != sql.ErrNoRows {
@@ -804,12 +799,27 @@ func (api *RestApi) startJob(rw http.ResponseWriter, r *http.Request) {
}
}
- repository.TriggerJobStart(repository.JobWithUser{Job: &req, User: repository.GetUserFromContext(r.Context())})
+ id, err := api.JobRepository.Start(&req)
+ if err != nil {
+ handleError(fmt.Errorf("insert into database failed: %w", err), http.StatusInternalServerError, rw)
+ return
+ }
+ // unlock here, adding Tags can be async
+ unlockOnce.Do(api.RepositoryMutex.Unlock)
+ for _, tag := range req.Tags {
+ if _, err := api.JobRepository.AddTagOrCreate(repository.GetUserFromContext(r.Context()), id, tag.Type, tag.Name, tag.Scope); err != nil {
+ http.Error(rw, err.Error(), http.StatusInternalServerError)
+ handleError(fmt.Errorf("adding tag to new job %d failed: %w", id, err), http.StatusInternalServerError, rw)
+ return
+ }
+ }
+
+ log.Printf("new job (id: %d): cluster=%s, jobId=%d, user=%s, startTime=%d", id, req.Cluster, req.JobID, req.User, req.StartTime)
rw.Header().Add("Content-Type", "application/json")
rw.WriteHeader(http.StatusCreated)
- json.NewEncoder(rw).Encode(StartJobApiResponse{
- Message: fmt.Sprintf("Successfully triggered job start"),
+ json.NewEncoder(rw).Encode(DefaultJobApiResponse{
+ Message: "success",
})
}
@@ -892,7 +902,7 @@ func (api *RestApi) deleteJobById(rw http.ResponseWriter, r *http.Request) {
}
rw.Header().Add("Content-Type", "application/json")
rw.WriteHeader(http.StatusOK)
- json.NewEncoder(rw).Encode(DeleteJobApiResponse{
+ json.NewEncoder(rw).Encode(DefaultJobApiResponse{
Message: fmt.Sprintf("Successfully deleted job %s", id),
})
}
@@ -943,7 +953,7 @@ func (api *RestApi) deleteJobByRequest(rw http.ResponseWriter, r *http.Request)
rw.Header().Add("Content-Type", "application/json")
rw.WriteHeader(http.StatusOK)
- json.NewEncoder(rw).Encode(DeleteJobApiResponse{
+ json.NewEncoder(rw).Encode(DefaultJobApiResponse{
Message: fmt.Sprintf("Successfully deleted job %d", job.ID),
})
}
@@ -987,7 +997,7 @@ func (api *RestApi) deleteJobBefore(rw http.ResponseWriter, r *http.Request) {
rw.Header().Add("Content-Type", "application/json")
rw.WriteHeader(http.StatusOK)
- json.NewEncoder(rw).Encode(DeleteJobApiResponse{
+ json.NewEncoder(rw).Encode(DefaultJobApiResponse{
Message: fmt.Sprintf("Successfully deleted %d jobs", cnt),
})
}
diff --git a/internal/graph/schema.resolvers.go b/internal/graph/schema.resolvers.go
index 9fd7260..b529f2c 100644
--- a/internal/graph/schema.resolvers.go
+++ b/internal/graph/schema.resolvers.go
@@ -36,10 +36,7 @@ func (r *jobResolver) Tags(ctx context.Context, obj *schema.Job) ([]*schema.Tag,
// ConcurrentJobs is the resolver for the concurrentJobs field.
func (r *jobResolver) ConcurrentJobs(ctx context.Context, obj *schema.Job) (*model.JobLinkResultList, error) {
- if obj.State == schema.JobStateRunning {
- obj.Duration = int32(time.Now().Unix() - obj.StartTimeUnix)
- }
-
+ // FIXME: Make the hardcoded duration configurable
if obj.Exclusive != 1 && obj.Duration > 600 {
return r.Repo.FindConcurrentJobs(ctx, obj)
}
diff --git a/internal/repository/dbConnection.go b/internal/repository/dbConnection.go
index d062052..418eef9 100644
--- a/internal/repository/dbConnection.go
+++ b/internal/repository/dbConnection.go
@@ -82,8 +82,6 @@ func Connect(driver string, db string) {
if err != nil {
log.Fatal(err)
}
-
- startJobStartWorker()
})
}
diff --git a/internal/repository/job.go b/internal/repository/job.go
index cc44ca9..11f3b46 100644
--- a/internal/repository/job.go
+++ b/internal/repository/job.go
@@ -79,12 +79,9 @@ func scanJob(row interface{ Scan(...interface{}) error }) (*schema.Job, error) {
}
job.RawFootprint = nil
- // if err := json.Unmarshal(job.RawMetaData, &job.MetaData); err != nil {
- // return nil, err
- // }
-
job.StartTime = time.Unix(job.StartTimeUnix, 0)
- if job.Duration == 0 && job.State == schema.JobStateRunning {
+ // Always ensure accurate duration for running jobs
+ if job.State == schema.JobStateRunning {
job.Duration = int32(time.Since(job.StartTime).Seconds())
}
@@ -457,6 +454,7 @@ func (r *JobRepository) AllocatedNodes(cluster string) (map[string]map[string]in
return subclusters, nil
}
+// FIXME: Set duration to requested walltime?
func (r *JobRepository) StopJobsExceedingWalltimeBy(seconds int) error {
start := time.Now()
res, err := sq.Update("job").
diff --git a/internal/repository/jobQuery.go b/internal/repository/jobQuery.go
index 0ab2ea2..b43b569 100644
--- a/internal/repository/jobQuery.go
+++ b/internal/repository/jobQuery.go
@@ -170,8 +170,7 @@ func BuildWhereClause(filter *model.JobFilter, query sq.SelectBuilder) sq.Select
query = buildTimeCondition("job.start_time", filter.StartTime, query)
}
if filter.Duration != nil {
- now := time.Now().Unix() // There does not seam to be a portable way to get the current unix timestamp accross different DBs.
- query = query.Where("(CASE WHEN job.job_state = 'running' THEN (? - job.start_time) ELSE job.duration END) BETWEEN ? AND ?", now, filter.Duration.From, filter.Duration.To)
+ query = buildIntCondition("job.duration", filter.Duration, query)
}
if filter.MinRunningFor != nil {
now := time.Now().Unix() // There does not seam to be a portable way to get the current unix timestamp accross different DBs.
diff --git a/internal/repository/jobStartWorker.go b/internal/repository/jobStartWorker.go
deleted file mode 100644
index 18d2be7..0000000
--- a/internal/repository/jobStartWorker.go
+++ /dev/null
@@ -1,83 +0,0 @@
-// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
-// All rights reserved.
-// Use of this source code is governed by a MIT-style
-// license that can be found in the LICENSE file.
-package repository
-
-import (
- "sync"
- "time"
-
- "github.com/ClusterCockpit/cc-backend/pkg/log"
- "github.com/ClusterCockpit/cc-backend/pkg/schema"
-)
-
-type JobWithUser struct {
- Job *schema.JobMeta
- User *schema.User
-}
-
-var (
- jobStartPending sync.WaitGroup
- jobStartChannel chan JobWithUser
-)
-
-func startJobStartWorker() {
- jobStartChannel = make(chan JobWithUser, 128)
-
- go jobStartWorker()
-}
-
-// Archiving worker thread
-func jobStartWorker() {
- for {
- select {
- case req, ok := <-jobStartChannel:
- if !ok {
- break
- }
- jobRepo := GetJobRepository()
- var id int64
-
- for i := 0; i < 5; i++ {
- var err error
-
- id, err = jobRepo.Start(req.Job)
- if err != nil {
- log.Errorf("Attempt %d: insert into database failed: %v", i, err)
- } else {
- break
- }
- time.Sleep(1 * time.Second)
- }
-
- for _, tag := range req.Job.Tags {
- if _, err := jobRepo.AddTagOrCreate(req.User, id,
- tag.Type, tag.Name, tag.Scope); err != nil {
- log.Errorf("adding tag to new job %d failed: %v", id, err)
- }
- }
-
- log.Printf("new job (id: %d): cluster=%s, jobId=%d, user=%s, startTime=%d",
- id, req.Job.Cluster, req.Job.JobID, req.Job.User, req.Job.StartTime)
-
- jobStartPending.Done()
- }
- }
-}
-
-// Trigger async archiving
-func TriggerJobStart(req JobWithUser) {
- if jobStartChannel == nil {
- log.Fatal("Cannot start Job without jobStart channel. Did you Start the worker?")
- }
-
- jobStartPending.Add(1)
- jobStartChannel <- req
-}
-
-// Wait for background thread to finish pending archiving operations
-func WaitForJobStart() {
- // close channel and wait for worker to process remaining jobs
- jobStartPending.Wait()
-}
diff --git a/internal/repository/repository_test.go b/internal/repository/repository_test.go
index 6d1fbfc..1ca9ec5 100644
--- a/internal/repository/repository_test.go
+++ b/internal/repository/repository_test.go
@@ -111,7 +111,7 @@ func BenchmarkDB_QueryJobs(b *testing.B) {
user := "mppi133h"
filter.User = &model.StringInput{Eq: &user}
page := &model.PageRequest{ItemsPerPage: 50, Page: 1}
- order := &model.OrderByInput{Field: "startTime", Order: model.SortDirectionEnumDesc}
+ order := &model.OrderByInput{Field: "startTime", Type: "col", Order: model.SortDirectionEnumDesc}
b.Run("QueryJobs", func(b *testing.B) {
db := setup(b)
diff --git a/internal/routerConfig/routes.go b/internal/routerConfig/routes.go
index 2267efb..1a3317f 100644
--- a/internal/routerConfig/routes.go
+++ b/internal/routerConfig/routes.go
@@ -182,6 +182,7 @@ func setupTaglistRoute(i InfoType, r *http.Request) InfoType {
return i
}
+// FIXME: Lots of redundant code. Needs refactoring
func buildFilterPresets(query url.Values) map[string]interface{} {
filterPresets := map[string]interface{}{}
diff --git a/pkg/schema/schemas/job-data.schema.json b/pkg/schema/schemas/job-data.schema.json
index e8a5739..c0c492b 100644
--- a/pkg/schema/schemas/job-data.schema.json
+++ b/pkg/schema/schemas/job-data.schema.json
@@ -1,490 +1,490 @@
{
- "$schema": "http://json-schema.org/draft/2020-12/schema",
- "$id": "embedfs://job-data.schema.json",
- "title": "Job metric data list",
- "description": "Collection of metric data of a HPC job",
- "type": "object",
- "properties": {
- "mem_used": {
- "description": "Memory capacity used",
- "type": "object",
- "properties": {
- "node": {
- "$ref": "embedfs://job-metric-data.schema.json"
- }
- },
- "required": [
- "node"
- ]
- },
- "flops_any": {
- "description": "Total flop rate with DP flops scaled up",
- "properties": {
- "node": {
- "$ref": "embedfs://job-metric-data.schema.json"
- },
- "socket": {
- "$ref": "embedfs://job-metric-data.schema.json"
- },
- "memoryDomain": {
- "$ref": "embedfs://job-metric-data.schema.json"
- },
- "core": {
- "$ref": "embedfs://job-metric-data.schema.json"
- },
- "hwthread": {
- "$ref": "embedfs://job-metric-data.schema.json"
- }
- },
- "minProperties": 1
- },
- "mem_bw": {
- "description": "Main memory bandwidth",
- "properties": {
- "node": {
- "$ref": "embedfs://job-metric-data.schema.json"
- },
- "socket": {
- "$ref": "embedfs://job-metric-data.schema.json"
- },
- "memoryDomain": {
- "$ref": "embedfs://job-metric-data.schema.json"
- }
- },
- "minProperties": 1
- },
- "net_bw": {
- "description": "Total fast interconnect network bandwidth",
- "type": "object",
- "properties": {
- "node": {
- "$ref": "embedfs://job-metric-data.schema.json"
- }
- },
- "required": [
- "node"
- ]
- },
- "ipc": {
- "description": "Instructions executed per cycle",
- "properties": {
- "node": {
- "$ref": "embedfs://job-metric-data.schema.json"
- },
- "socket": {
- "$ref": "embedfs://job-metric-data.schema.json"
- },
- "memoryDomain": {
- "$ref": "embedfs://job-metric-data.schema.json"
- },
- "core": {
- "$ref": "embedfs://job-metric-data.schema.json"
- },
- "hwthread": {
- "$ref": "embedfs://job-metric-data.schema.json"
- }
- },
- "minProperties": 1
- },
- "cpu_user": {
- "description": "CPU user active core utilization",
- "properties": {
- "node": {
- "$ref": "embedfs://job-metric-data.schema.json"
- },
- "socket": {
- "$ref": "embedfs://job-metric-data.schema.json"
- },
- "memoryDomain": {
- "$ref": "embedfs://job-metric-data.schema.json"
- },
- "core": {
- "$ref": "embedfs://job-metric-data.schema.json"
- },
- "hwthread": {
- "$ref": "embedfs://job-metric-data.schema.json"
- }
- },
- "minProperties": 1
- },
- "cpu_load": {
- "description": "CPU requested core utilization (load 1m)",
- "properties": {
- "node": {
- "$ref": "embedfs://job-metric-data.schema.json"
- }
- },
- "required": [
- "node"
- ]
- },
- "flops_dp": {
- "description": "Double precision flop rate",
- "properties": {
- "node": {
- "$ref": "embedfs://job-metric-data.schema.json"
- },
- "socket": {
- "$ref": "embedfs://job-metric-data.schema.json"
- },
- "memoryDomain": {
- "$ref": "embedfs://job-metric-data.schema.json"
- },
- "core": {
- "$ref": "embedfs://job-metric-data.schema.json"
- },
- "hwthread": {
- "$ref": "embedfs://job-metric-data.schema.json"
- }
- },
- "minProperties": 1
- },
- "flops_sp": {
- "description": "Single precision flops rate",
- "properties": {
- "node": {
- "$ref": "embedfs://job-metric-data.schema.json"
- },
- "socket": {
- "$ref": "embedfs://job-metric-data.schema.json"
- },
- "memoryDomain": {
- "$ref": "embedfs://job-metric-data.schema.json"
- },
- "core": {
- "$ref": "embedfs://job-metric-data.schema.json"
- },
- "hwthread": {
- "$ref": "embedfs://job-metric-data.schema.json"
- }
- },
- "minProperties": 1
- },
- "vectorization_ratio": {
- "description": "Fraction of arithmetic instructions using SIMD instructions",
- "properties": {
- "node": {
- "$ref": "embedfs://job-metric-data.schema.json"
- },
- "socket": {
- "$ref": "embedfs://job-metric-data.schema.json"
- },
- "memoryDomain": {
- "$ref": "embedfs://job-metric-data.schema.json"
- },
- "core": {
- "$ref": "embedfs://job-metric-data.schema.json"
- },
- "hwthread": {
- "$ref": "embedfs://job-metric-data.schema.json"
- }
- },
- "minProperties": 1
- },
- "cpu_power": {
- "description": "CPU power consumption",
- "properties": {
- "node": {
- "$ref": "embedfs://job-metric-data.schema.json"
- },
- "socket": {
- "$ref": "embedfs://job-metric-data.schema.json"
- }
- },
- "minProperties": 1
- },
- "mem_power": {
- "description": "Memory power consumption",
- "properties": {
- "node": {
- "$ref": "embedfs://job-metric-data.schema.json"
- },
- "socket": {
- "$ref": "embedfs://job-metric-data.schema.json"
- }
- },
- "minProperties": 1
- },
- "acc_utilization": {
- "description": "GPU utilization",
- "properties": {
- "accelerator": {
- "$ref": "embedfs://job-metric-data.schema.json"
- }
- },
- "required": [
- "accelerator"
- ]
- },
- "acc_mem_used": {
- "description": "GPU memory capacity used",
- "properties": {
- "accelerator": {
- "$ref": "embedfs://job-metric-data.schema.json"
- }
- },
- "required": [
- "accelerator"
- ]
- },
- "acc_power": {
- "description": "GPU power consumption",
- "properties": {
- "accelerator": {
- "$ref": "embedfs://job-metric-data.schema.json"
- }
- },
- "required": [
- "accelerator"
- ]
- },
- "clock": {
- "description": "Average core frequency",
- "properties": {
- "node": {
- "$ref": "embedfs://job-metric-data.schema.json"
- },
- "socket": {
- "$ref": "embedfs://job-metric-data.schema.json"
- },
- "memoryDomain": {
- "$ref": "embedfs://job-metric-data.schema.json"
- },
- "core": {
- "$ref": "embedfs://job-metric-data.schema.json"
- },
- "hwthread": {
- "$ref": "embedfs://job-metric-data.schema.json"
- }
- },
- "minProperties": 1
- },
- "eth_read_bw": {
- "description": "Ethernet read bandwidth",
- "properties": {
- "node": {
- "$ref": "embedfs://job-metric-data.schema.json"
- }
- },
- "required": [
- "node"
- ]
- },
- "eth_write_bw": {
- "description": "Ethernet write bandwidth",
- "properties": {
- "node": {
- "$ref": "embedfs://job-metric-data.schema.json"
- }
- },
- "required": [
- "node"
- ]
- },
- "filesystems": {
- "description": "Array of filesystems",
- "type": "array",
- "items": {
- "type": "object",
- "properties": {
- "name": {
- "type": "string"
- },
- "type": {
- "type": "string",
- "enum": [
- "nfs",
- "lustre",
- "gpfs",
- "nvme",
- "ssd",
- "hdd",
- "beegfs"
- ]
- },
- "read_bw": {
- "description": "File system read bandwidth",
- "properties": {
- "node": {
- "$ref": "embedfs://job-metric-data.schema.json"
- }
- },
- "required": [
- "node"
- ]
- },
- "write_bw": {
- "description": "File system write bandwidth",
- "properties": {
- "node": {
- "$ref": "embedfs://job-metric-data.schema.json"
- }
- },
- "required": [
- "node"
- ]
- },
- "read_req": {
- "description": "File system read requests",
- "properties": {
- "node": {
- "$ref": "embedfs://job-metric-data.schema.json"
- }
- },
- "required": [
- "node"
- ]
- },
- "write_req": {
- "description": "File system write requests",
- "properties": {
- "node": {
- "$ref": "embedfs://job-metric-data.schema.json"
- }
- },
- "required": [
- "node"
- ]
- },
- "inodes": {
- "description": "File system write requests",
- "properties": {
- "node": {
- "$ref": "embedfs://job-metric-data.schema.json"
- }
- },
- "required": [
- "node"
- ]
- },
- "accesses": {
- "description": "File system open and close",
- "properties": {
- "node": {
- "$ref": "embedfs://job-metric-data.schema.json"
- }
- },
- "required": [
- "node"
- ]
- },
- "fsync": {
- "description": "File system fsync",
- "properties": {
- "node": {
- "$ref": "embedfs://job-metric-data.schema.json"
- }
- },
- "required": [
- "node"
- ]
- },
- "create": {
- "description": "File system create",
- "properties": {
- "node": {
- "$ref": "embedfs://job-metric-data.schema.json"
- }
- },
- "required": [
- "node"
- ]
- },
- "open": {
- "description": "File system open",
- "properties": {
- "node": {
- "$ref": "embedfs://job-metric-data.schema.json"
- }
- },
- "required": [
- "node"
- ]
- },
- "close": {
- "description": "File system close",
- "properties": {
- "node": {
- "$ref": "embedfs://job-metric-data.schema.json"
- }
- },
- "required": [
- "node"
- ]
- },
- "seek": {
- "description": "File system seek",
- "properties": {
- "node": {
- "$ref": "embedfs://job-metric-data.schema.json"
- }
- },
- "required": [
- "node"
- ]
- }
- },
- "required": [
- "name",
- "type",
- "read_bw",
- "write_bw"
- ]
- },
- "minItems": 1
+ "$schema": "http://json-schema.org/draft/2020-12/schema",
+ "$id": "embedfs://job-data.schema.json",
+ "title": "Job metric data list",
+ "description": "Collection of metric data of a HPC job",
+ "type": "object",
+ "properties": {
+ "mem_used": {
+ "description": "Memory capacity used",
+ "type": "object",
+ "properties": {
+ "node": {
+ "$ref": "embedfs://job-metric-data.schema.json"
}
+ },
+ "required": [
+ "node"
+ ]
},
- "ic_rcv_packets": {
- "description": "Network interconnect read packets",
+ "flops_any": {
+ "description": "Total flop rate with DP flops scaled up",
+ "properties": {
+ "node": {
+ "$ref": "embedfs://job-metric-data.schema.json"
+ },
+ "socket": {
+ "$ref": "embedfs://job-metric-data.schema.json"
+ },
+ "memoryDomain": {
+ "$ref": "embedfs://job-metric-data.schema.json"
+ },
+ "core": {
+ "$ref": "embedfs://job-metric-data.schema.json"
+ },
+ "hwthread": {
+ "$ref": "embedfs://job-metric-data.schema.json"
+ }
+ },
+ "minProperties": 1
+ },
+ "mem_bw": {
+ "description": "Main memory bandwidth",
+ "properties": {
+ "node": {
+ "$ref": "embedfs://job-metric-data.schema.json"
+ },
+ "socket": {
+ "$ref": "embedfs://job-metric-data.schema.json"
+ },
+ "memoryDomain": {
+ "$ref": "embedfs://job-metric-data.schema.json"
+ }
+ },
+ "minProperties": 1
+ },
+ "net_bw": {
+ "description": "Total fast interconnect network bandwidth",
+ "type": "object",
+ "properties": {
+ "node": {
+ "$ref": "embedfs://job-metric-data.schema.json"
+ }
+ },
+ "required": [
+ "node"
+ ]
+ },
+ "ipc": {
+ "description": "Instructions executed per cycle",
+ "properties": {
+ "node": {
+ "$ref": "embedfs://job-metric-data.schema.json"
+ },
+ "socket": {
+ "$ref": "embedfs://job-metric-data.schema.json"
+ },
+ "memoryDomain": {
+ "$ref": "embedfs://job-metric-data.schema.json"
+ },
+ "core": {
+ "$ref": "embedfs://job-metric-data.schema.json"
+ },
+ "hwthread": {
+ "$ref": "embedfs://job-metric-data.schema.json"
+ }
+ },
+ "minProperties": 1
+ },
+ "cpu_user": {
+ "description": "CPU user active core utilization",
+ "properties": {
+ "node": {
+ "$ref": "embedfs://job-metric-data.schema.json"
+ },
+ "socket": {
+ "$ref": "embedfs://job-metric-data.schema.json"
+ },
+ "memoryDomain": {
+ "$ref": "embedfs://job-metric-data.schema.json"
+ },
+ "core": {
+ "$ref": "embedfs://job-metric-data.schema.json"
+ },
+ "hwthread": {
+ "$ref": "embedfs://job-metric-data.schema.json"
+ }
+ },
+ "minProperties": 1
+ },
+ "cpu_load": {
+ "description": "CPU requested core utilization (load 1m)",
+ "properties": {
+ "node": {
+ "$ref": "embedfs://job-metric-data.schema.json"
+ }
+ },
+ "required": [
+ "node"
+ ]
+ },
+ "flops_dp": {
+ "description": "Double precision flop rate",
+ "properties": {
+ "node": {
+ "$ref": "embedfs://job-metric-data.schema.json"
+ },
+ "socket": {
+ "$ref": "embedfs://job-metric-data.schema.json"
+ },
+ "memoryDomain": {
+ "$ref": "embedfs://job-metric-data.schema.json"
+ },
+ "core": {
+ "$ref": "embedfs://job-metric-data.schema.json"
+ },
+ "hwthread": {
+ "$ref": "embedfs://job-metric-data.schema.json"
+ }
+ },
+ "minProperties": 1
+ },
+ "flops_sp": {
+ "description": "Single precision flops rate",
+ "properties": {
+ "node": {
+ "$ref": "embedfs://job-metric-data.schema.json"
+ },
+ "socket": {
+ "$ref": "embedfs://job-metric-data.schema.json"
+ },
+ "memoryDomain": {
+ "$ref": "embedfs://job-metric-data.schema.json"
+ },
+ "core": {
+ "$ref": "embedfs://job-metric-data.schema.json"
+ },
+ "hwthread": {
+ "$ref": "embedfs://job-metric-data.schema.json"
+ }
+ },
+ "minProperties": 1
+ },
+ "vectorization_ratio": {
+ "description": "Fraction of arithmetic instructions using SIMD instructions",
+ "properties": {
+ "node": {
+ "$ref": "embedfs://job-metric-data.schema.json"
+ },
+ "socket": {
+ "$ref": "embedfs://job-metric-data.schema.json"
+ },
+ "memoryDomain": {
+ "$ref": "embedfs://job-metric-data.schema.json"
+ },
+ "core": {
+ "$ref": "embedfs://job-metric-data.schema.json"
+ },
+ "hwthread": {
+ "$ref": "embedfs://job-metric-data.schema.json"
+ }
+ },
+ "minProperties": 1
+ },
+ "cpu_power": {
+ "description": "CPU power consumption",
+ "properties": {
+ "node": {
+ "$ref": "embedfs://job-metric-data.schema.json"
+ },
+ "socket": {
+ "$ref": "embedfs://job-metric-data.schema.json"
+ }
+ },
+ "minProperties": 1
+ },
+ "mem_power": {
+ "description": "Memory power consumption",
+ "properties": {
+ "node": {
+ "$ref": "embedfs://job-metric-data.schema.json"
+ },
+ "socket": {
+ "$ref": "embedfs://job-metric-data.schema.json"
+ }
+ },
+ "minProperties": 1
+ },
+ "acc_utilization": {
+ "description": "GPU utilization",
+ "properties": {
+ "accelerator": {
+ "$ref": "embedfs://job-metric-data.schema.json"
+ }
+ },
+ "required": [
+ "accelerator"
+ ]
+ },
+ "acc_mem_used": {
+ "description": "GPU memory capacity used",
+ "properties": {
+ "accelerator": {
+ "$ref": "embedfs://job-metric-data.schema.json"
+ }
+ },
+ "required": [
+ "accelerator"
+ ]
+ },
+ "acc_power": {
+ "description": "GPU power consumption",
+ "properties": {
+ "accelerator": {
+ "$ref": "embedfs://job-metric-data.schema.json"
+ }
+ },
+ "required": [
+ "accelerator"
+ ]
+ },
+ "clock": {
+ "description": "Average core frequency",
+ "properties": {
+ "node": {
+ "$ref": "embedfs://job-metric-data.schema.json"
+ },
+ "socket": {
+ "$ref": "embedfs://job-metric-data.schema.json"
+ },
+ "memoryDomain": {
+ "$ref": "embedfs://job-metric-data.schema.json"
+ },
+ "core": {
+ "$ref": "embedfs://job-metric-data.schema.json"
+ },
+ "hwthread": {
+ "$ref": "embedfs://job-metric-data.schema.json"
+ }
+ },
+ "minProperties": 1
+ },
+ "eth_read_bw": {
+ "description": "Ethernet read bandwidth",
+ "properties": {
+ "node": {
+ "$ref": "embedfs://job-metric-data.schema.json"
+ }
+ },
+ "required": [
+ "node"
+ ]
+ },
+ "eth_write_bw": {
+ "description": "Ethernet write bandwidth",
+ "properties": {
+ "node": {
+ "$ref": "embedfs://job-metric-data.schema.json"
+ }
+ },
+ "required": [
+ "node"
+ ]
+ },
+ "filesystems": {
+ "description": "Array of filesystems",
+ "type": "array",
+ "items": {
+ "type": "object",
"properties": {
- "node": {
+ "name": {
+ "type": "string"
+ },
+ "type": {
+ "type": "string",
+ "enum": [
+ "nfs",
+ "lustre",
+ "gpfs",
+ "nvme",
+ "ssd",
+ "hdd",
+ "beegfs"
+ ]
+ },
+ "read_bw": {
+ "description": "File system read bandwidth",
+ "properties": {
+ "node": {
"$ref": "embedfs://job-metric-data.schema.json"
- }
+ }
+ },
+ "required": [
+ "node"
+ ]
+ },
+ "write_bw": {
+ "description": "File system write bandwidth",
+ "properties": {
+ "node": {
+ "$ref": "embedfs://job-metric-data.schema.json"
+ }
+ },
+ "required": [
+ "node"
+ ]
+ },
+ "read_req": {
+ "description": "File system read requests",
+ "properties": {
+ "node": {
+ "$ref": "embedfs://job-metric-data.schema.json"
+ }
+ },
+ "required": [
+ "node"
+ ]
+ },
+ "write_req": {
+ "description": "File system write requests",
+ "properties": {
+ "node": {
+ "$ref": "embedfs://job-metric-data.schema.json"
+ }
+ },
+ "required": [
+ "node"
+ ]
+ },
+ "inodes": {
+ "description": "File system inodes",
+ "properties": {
+ "node": {
+ "$ref": "embedfs://job-metric-data.schema.json"
+ }
+ },
+ "required": [
+ "node"
+ ]
+ },
+ "accesses": {
+ "description": "File system open and close",
+ "properties": {
+ "node": {
+ "$ref": "embedfs://job-metric-data.schema.json"
+ }
+ },
+ "required": [
+ "node"
+ ]
+ },
+ "fsync": {
+ "description": "File system fsync",
+ "properties": {
+ "node": {
+ "$ref": "embedfs://job-metric-data.schema.json"
+ }
+ },
+ "required": [
+ "node"
+ ]
+ },
+ "create": {
+ "description": "File system create",
+ "properties": {
+ "node": {
+ "$ref": "embedfs://job-metric-data.schema.json"
+ }
+ },
+ "required": [
+ "node"
+ ]
+ },
+ "open": {
+ "description": "File system open",
+ "properties": {
+ "node": {
+ "$ref": "embedfs://job-metric-data.schema.json"
+ }
+ },
+ "required": [
+ "node"
+ ]
+ },
+ "close": {
+ "description": "File system close",
+ "properties": {
+ "node": {
+ "$ref": "embedfs://job-metric-data.schema.json"
+ }
+ },
+ "required": [
+ "node"
+ ]
+ },
+ "seek": {
+ "description": "File system seek",
+ "properties": {
+ "node": {
+ "$ref": "embedfs://job-metric-data.schema.json"
+ }
+ },
+ "required": [
+ "node"
+ ]
+ }
},
"required": [
- "node"
- ]
- },
- "ic_send_packets": {
- "description": "Network interconnect send packet",
- "properties": {
- "node": {
- "$ref": "embedfs://job-metric-data.schema.json"
- }
- },
- "required": [
- "node"
- ]
- },
- "ic_read_bw": {
- "description": "Network interconnect read bandwidth",
- "properties": {
- "node": {
- "$ref": "embedfs://job-metric-data.schema.json"
- }
- },
- "required": [
- "node"
- ]
- },
- "ic_write_bw": {
- "description": "Network interconnect write bandwidth",
- "properties": {
- "node": {
- "$ref": "embedfs://job-metric-data.schema.json"
- }
- },
- "required": [
- "node"
+ "name",
+ "type",
+ "read_bw",
+ "write_bw"
]
+ },
+ "minItems": 1
+ }
+ },
+ "ic_rcv_packets": {
+ "description": "Network interconnect read packets",
+ "properties": {
+ "node": {
+ "$ref": "embedfs://job-metric-data.schema.json"
+ }
},
"required": [
- "cpu_user",
- "cpu_load",
- "mem_used",
- "flops_any",
- "mem_bw",
- "net_bw",
- "filesystems"
+ "node"
]
+ },
+ "ic_send_packets": {
+ "description": "Network interconnect send packet",
+ "properties": {
+ "node": {
+ "$ref": "embedfs://job-metric-data.schema.json"
+ }
+ },
+ "required": [
+ "node"
+ ]
+ },
+ "ic_read_bw": {
+ "description": "Network interconnect read bandwidth",
+ "properties": {
+ "node": {
+ "$ref": "embedfs://job-metric-data.schema.json"
+ }
+ },
+ "required": [
+ "node"
+ ]
+ },
+ "ic_write_bw": {
+ "description": "Network interconnect write bandwidth",
+ "properties": {
+ "node": {
+ "$ref": "embedfs://job-metric-data.schema.json"
+ }
+ },
+ "required": [
+ "node"
+ ]
+ },
+ "required": [
+ "cpu_user",
+ "cpu_load",
+ "mem_used",
+ "flops_any",
+ "mem_bw",
+ "net_bw",
+ "filesystems"
+ ]
}
diff --git a/pkg/schema/schemas/job-meta.schema.json b/pkg/schema/schemas/job-meta.schema.json
index b907d7f..db7475c 100644
--- a/pkg/schema/schemas/job-meta.schema.json
+++ b/pkg/schema/schemas/job-meta.schema.json
@@ -1,351 +1,351 @@
{
- "$schema": "http://json-schema.org/draft/2020-12/schema",
- "$id": "embedfs://job-meta.schema.json",
- "title": "Job meta data",
- "description": "Meta data information of a HPC job",
- "type": "object",
- "properties": {
- "jobId": {
- "description": "The unique identifier of a job",
- "type": "integer"
- },
- "user": {
- "description": "The unique identifier of a user",
+ "$schema": "http://json-schema.org/draft/2020-12/schema",
+ "$id": "embedfs://job-meta.schema.json",
+ "title": "Job meta data",
+ "description": "Meta data information of an HPC job",
+ "type": "object",
+ "properties": {
+ "jobId": {
+ "description": "The unique identifier of a job",
+ "type": "integer"
+ },
+ "user": {
+ "description": "The unique identifier of a user",
+ "type": "string"
+ },
+ "project": {
+ "description": "The unique identifier of a project",
+ "type": "string"
+ },
+ "cluster": {
+ "description": "The unique identifier of a cluster",
+ "type": "string"
+ },
+ "subCluster": {
+ "description": "The unique identifier of a sub cluster",
+ "type": "string"
+ },
+ "partition": {
+ "description": "The Slurm partition to which the job was submitted",
+ "type": "string"
+ },
+ "arrayJobId": {
+ "description": "The unique identifier of an array job",
+ "type": "integer"
+ },
+ "numNodes": {
+ "description": "Number of nodes used",
+ "type": "integer",
+ "exclusiveMinimum": 0
+ },
+ "numHwthreads": {
+ "description": "Number of HWThreads used",
+ "type": "integer",
+ "exclusiveMinimum": 0
+ },
+ "numAcc": {
+ "description": "Number of accelerators used",
+ "type": "integer",
+ "exclusiveMinimum": 0
+ },
+ "exclusive": {
+ "description": "Specifies how nodes are shared. 0 - Shared among multiple jobs of multiple users, 1 - Job exclusive, 2 - Shared among multiple jobs of same user",
+ "type": "integer",
+ "minimum": 0,
+ "maximum": 2
+ },
+ "monitoringStatus": {
+ "description": "State of monitoring system during job run",
+ "type": "integer"
+ },
+ "smt": {
+ "description": "SMT threads used by job",
+ "type": "integer"
+ },
+ "walltime": {
+ "description": "Requested walltime of job in seconds",
+ "type": "integer",
+ "exclusiveMinimum": 0
+ },
+ "jobState": {
+ "description": "Final state of job",
+ "type": "string",
+ "enum": [
+ "completed",
+ "failed",
+ "cancelled",
+ "stopped",
+ "out_of_memory",
+ "timeout"
+ ]
+ },
+ "startTime": {
+ "description": "Start epoch time stamp in seconds",
+ "type": "integer",
+ "exclusiveMinimum": 0
+ },
+ "duration": {
+ "description": "Duration of job in seconds",
+ "type": "integer",
+ "exclusiveMinimum": 0
+ },
+ "resources": {
+ "description": "Resources used by job",
+ "type": "array",
+ "items": {
+ "type": "object",
+ "properties": {
+ "hostname": {
"type": "string"
- },
- "project": {
- "description": "The unique identifier of a project",
- "type": "string"
- },
- "cluster": {
- "description": "The unique identifier of a cluster",
- "type": "string"
- },
- "subCluster": {
- "description": "The unique identifier of a sub cluster",
- "type": "string"
- },
- "partition": {
- "description": "The Slurm partition to which the job was submitted",
- "type": "string"
- },
- "arrayJobId": {
- "description": "The unique identifier of an array job",
- "type": "integer"
- },
- "numNodes": {
- "description": "Number of nodes used",
- "type": "integer",
- "exclusiveMinimum": 0
- },
- "numHwthreads": {
- "description": "Number of HWThreads used",
- "type": "integer",
- "exclusiveMinimum": 0
- },
- "numAcc": {
- "description": "Number of accelerators used",
- "type": "integer",
- "exclusiveMinimum": 0
- },
- "exclusive": {
- "description": "Specifies how nodes are shared. 0 - Shared among multiple jobs of multiple users, 1 - Job exclusive, 2 - Shared among multiple jobs of same user",
- "type": "integer",
- "minimum": 0,
- "maximum": 2
- },
- "monitoringStatus": {
- "description": "State of monitoring system during job run",
- "type": "integer"
- },
- "smt": {
- "description": "SMT threads used by job",
- "type": "integer"
- },
- "walltime": {
- "description": "Requested walltime of job in seconds",
- "type": "integer",
- "exclusiveMinimum": 0
- },
- "jobState": {
- "description": "Final state of job",
+ },
+ "hwthreads": {
+ "type": "array",
+ "description": "List of OS processor ids",
+ "items": {
+ "type": "integer"
+ }
+ },
+ "accelerators": {
+ "type": "array",
+ "description": "List of accelerator device ids",
+ "items": {
+ "type": "string"
+ }
+ },
+ "configuration": {
"type": "string",
- "enum": [
- "completed",
- "failed",
- "cancelled",
- "stopped",
- "out_of_memory",
- "timeout"
- ]
+ "description": "The configuration options of the node"
+ }
},
- "startTime": {
- "description": "Start epoch time stamp in seconds",
- "type": "integer",
- "exclusiveMinimum": 0
+ "required": [
+ "hostname"
+ ],
+ "minItems": 1
+ }
+ },
+ "metaData": {
+ "description": "Additional information about the job",
+ "type": "object",
+ "properties": {
+ "jobScript": {
+ "type": "string",
+ "description": "The batch script of the job"
},
- "duration": {
- "description": "Duration of job in seconds",
- "type": "integer",
- "exclusiveMinimum": 0
+ "jobName": {
+ "type": "string",
+ "description": "Slurm Job name"
},
- "resources": {
- "description": "Resources used by job",
- "type": "array",
- "items": {
- "type": "object",
- "properties": {
- "hostname": {
- "type": "string"
- },
- "hwthreads": {
- "type": "array",
- "description": "List of OS processor ids",
- "items": {
- "type": "integer"
- }
- },
- "accelerators": {
- "type": "array",
- "description": "List of of accelerator device ids",
- "items": {
- "type": "string"
- }
- },
- "configuration": {
- "type": "string",
- "description": "The configuration options of the node"
- }
- },
- "required": [
- "hostname"
- ],
- "minItems": 1
- }
+ "slurmInfo": {
+ "type": "string",
+ "description": "Additional Slurm info as shown by scontrol show job"
+ }
+ }
+ },
+ "tags": {
+ "description": "List of tags",
+ "type": "array",
+ "items": {
+ "type": "object",
+ "properties": {
+ "name": {
+ "type": "string"
+ },
+ "type": {
+ "type": "string"
+ }
},
- "metaData": {
- "description": "Additional information about the job",
+ "required": [
+ "name",
+ "type"
+ ]
+ },
+ "uniqueItems": true
+ },
+ "statistics": {
+ "description": "Job statistic data",
+ "type": "object",
+ "properties": {
+ "mem_used": {
+ "description": "Memory capacity used (required)",
+ "$ref": "embedfs://job-metric-statistics.schema.json"
+ },
+ "cpu_load": {
+ "description": "CPU requested core utilization (load 1m) (required)",
+ "$ref": "embedfs://job-metric-statistics.schema.json"
+ },
+ "flops_any": {
+ "description": "Total flop rate with DP flops scaled up (required)",
+ "$ref": "embedfs://job-metric-statistics.schema.json"
+ },
+ "mem_bw": {
+ "description": "Main memory bandwidth (required)",
+ "$ref": "embedfs://job-metric-statistics.schema.json"
+ },
+ "net_bw": {
+ "description": "Total fast interconnect network bandwidth (required)",
+ "$ref": "embedfs://job-metric-statistics.schema.json"
+ },
+ "file_bw": {
+ "description": "Total file IO bandwidth (required)",
+ "$ref": "embedfs://job-metric-statistics.schema.json"
+ },
+ "ipc": {
+ "description": "Instructions executed per cycle",
+ "$ref": "embedfs://job-metric-statistics.schema.json"
+ },
+ "cpu_user": {
+ "description": "CPU user active core utilization",
+ "$ref": "embedfs://job-metric-statistics.schema.json"
+ },
+ "flops_dp": {
+ "description": "Double precision flop rate",
+ "$ref": "embedfs://job-metric-statistics.schema.json"
+ },
+ "flops_sp": {
+ "description": "Single precision flops rate",
+ "$ref": "embedfs://job-metric-statistics.schema.json"
+ },
+ "rapl_power": {
+ "description": "CPU power consumption",
+ "$ref": "embedfs://job-metric-statistics.schema.json"
+ },
+ "acc_used": {
+ "description": "GPU utilization",
+ "$ref": "embedfs://job-metric-statistics.schema.json"
+ },
+ "acc_mem_used": {
+ "description": "GPU memory capacity used",
+ "$ref": "embedfs://job-metric-statistics.schema.json"
+ },
+ "acc_power": {
+ "description": "GPU power consumption",
+ "$ref": "embedfs://job-metric-statistics.schema.json"
+ },
+ "clock": {
+ "description": "Average core frequency",
+ "$ref": "embedfs://job-metric-statistics.schema.json"
+ },
+ "eth_read_bw": {
+ "description": "Ethernet read bandwidth",
+ "$ref": "embedfs://job-metric-statistics.schema.json"
+ },
+ "eth_write_bw": {
+ "description": "Ethernet write bandwidth",
+ "$ref": "embedfs://job-metric-statistics.schema.json"
+ },
+ "ic_rcv_packets": {
+ "description": "Network interconnect read packets",
+ "$ref": "embedfs://job-metric-statistics.schema.json"
+ },
+ "ic_send_packets": {
+ "description": "Network interconnect send packets",
+ "$ref": "embedfs://job-metric-statistics.schema.json"
+ },
+ "ic_read_bw": {
+ "description": "Network interconnect read bandwidth",
+ "$ref": "embedfs://job-metric-statistics.schema.json"
+ },
+ "ic_write_bw": {
+ "description": "Network interconnect write bandwidth",
+ "$ref": "embedfs://job-metric-statistics.schema.json"
+ },
+ "filesystems": {
+ "description": "Array of filesystems",
+ "type": "array",
+ "items": {
"type": "object",
"properties": {
- "jobScript": {
- "type": "string",
- "description": "The batch script of the job"
- },
- "jobName": {
- "type": "string",
- "description": "Slurm Job name"
- },
- "slurmInfo": {
- "type": "string",
- "description": "Additional slurm infos as show by scontrol show job"
- }
- }
- },
- "tags": {
- "description": "List of tags",
- "type": "array",
- "items": {
- "type": "object",
- "properties": {
- "name": {
- "type": "string"
- },
- "type": {
- "type": "string"
- }
- },
- "required": [
- "name",
- "type"
+ "name": {
+ "type": "string"
+ },
+ "type": {
+ "type": "string",
+ "enum": [
+ "nfs",
+ "lustre",
+ "gpfs",
+ "nvme",
+ "ssd",
+ "hdd",
+ "beegfs"
]
- },
- "uniqueItems": true
- },
- "statistics": {
- "description": "Job statistic data",
- "type": "object",
- "properties": {
- "mem_used": {
- "description": "Memory capacity used (required)",
- "$ref": "embedfs://job-metric-statistics.schema.json"
- },
- "cpu_load": {
- "description": "CPU requested core utilization (load 1m) (required)",
- "$ref": "embedfs://job-metric-statistics.schema.json"
- },
- "flops_any": {
- "description": "Total flop rate with DP flops scaled up (required)",
- "$ref": "embedfs://job-metric-statistics.schema.json"
- },
- "mem_bw": {
- "description": "Main memory bandwidth (required)",
- "$ref": "embedfs://job-metric-statistics.schema.json"
- },
- "net_bw": {
- "description": "Total fast interconnect network bandwidth (required)",
- "$ref": "embedfs://job-metric-statistics.schema.json"
- },
- "file_bw": {
- "description": "Total file IO bandwidth (required)",
- "$ref": "embedfs://job-metric-statistics.schema.json"
- },
- "ipc": {
- "description": "Instructions executed per cycle",
- "$ref": "embedfs://job-metric-statistics.schema.json"
- },
- "cpu_user": {
- "description": "CPU user active core utilization",
- "$ref": "embedfs://job-metric-statistics.schema.json"
- },
- "flops_dp": {
- "description": "Double precision flop rate",
- "$ref": "embedfs://job-metric-statistics.schema.json"
- },
- "flops_sp": {
- "description": "Single precision flops rate",
- "$ref": "embedfs://job-metric-statistics.schema.json"
- },
- "rapl_power": {
- "description": "CPU power consumption",
- "$ref": "embedfs://job-metric-statistics.schema.json"
- },
- "acc_used": {
- "description": "GPU utilization",
- "$ref": "embedfs://job-metric-statistics.schema.json"
- },
- "acc_mem_used": {
- "description": "GPU memory capacity used",
- "$ref": "embedfs://job-metric-statistics.schema.json"
- },
- "acc_power": {
- "description": "GPU power consumption",
- "$ref": "embedfs://job-metric-statistics.schema.json"
- },
- "clock": {
- "description": "Average core frequency",
- "$ref": "embedfs://job-metric-statistics.schema.json"
- },
- "eth_read_bw": {
- "description": "Ethernet read bandwidth",
- "$ref": "embedfs://job-metric-statistics.schema.json"
- },
- "eth_write_bw": {
- "description": "Ethernet write bandwidth",
- "$ref": "embedfs://job-metric-statistics.schema.json"
- },
- "ic_rcv_packets": {
- "description": "Network interconnect read packets",
- "$ref": "embedfs://job-metric-statistics.schema.json"
- },
- "ic_send_packets": {
- "description": "Network interconnect send packet",
- "$ref": "embedfs://job-metric-statistics.schema.json"
- },
- "ic_read_bw": {
- "description": "Network interconnect read bandwidth",
- "$ref": "embedfs://job-metric-statistics.schema.json"
- },
- "ic_write_bw": {
- "description": "Network interconnect write bandwidth",
- "$ref": "embedfs://job-metric-statistics.schema.json"
- },
- "filesystems": {
- "description": "Array of filesystems",
- "type": "array",
- "items": {
- "type": "object",
- "properties": {
- "name": {
- "type": "string"
- },
- "type": {
- "type": "string",
- "enum": [
- "nfs",
- "lustre",
- "gpfs",
- "nvme",
- "ssd",
- "hdd",
- "beegfs"
- ]
- },
- "read_bw": {
- "description": "File system read bandwidth",
- "$ref": "embedfs://job-metric-statistics.schema.json"
- },
- "write_bw": {
- "description": "File system write bandwidth",
- "$ref": "embedfs://job-metric-statistics.schema.json"
- },
- "read_req": {
- "description": "File system read requests",
- "$ref": "embedfs://job-metric-statistics.schema.json"
- },
- "write_req": {
- "description": "File system write requests",
- "$ref": "embedfs://job-metric-statistics.schema.json"
- },
- "inodes": {
- "description": "File system write requests",
- "$ref": "embedfs://job-metric-statistics.schema.json"
- },
- "accesses": {
- "description": "File system open and close",
- "$ref": "embedfs://job-metric-statistics.schema.json"
- },
- "fsync": {
- "description": "File system fsync",
- "$ref": "embedfs://job-metric-statistics.schema.json"
- },
- "create": {
- "description": "File system create",
- "$ref": "embedfs://job-metric-statistics.schema.json"
- },
- "open": {
- "description": "File system open",
- "$ref": "embedfs://job-metric-statistics.schema.json"
- },
- "close": {
- "description": "File system close",
- "$ref": "embedfs://job-metric-statistics.schema.json"
- },
- "seek": {
- "description": "File system seek",
- "$ref": "embedfs://job-metric-statistics.schema.json"
- }
- },
- "required": [
- "name",
- "type",
- "read_bw",
- "write_bw"
- ]
- },
- "minItems": 1
- }
+ },
+ "read_bw": {
+ "description": "File system read bandwidth",
+ "$ref": "embedfs://job-metric-statistics.schema.json"
+ },
+ "write_bw": {
+ "description": "File system write bandwidth",
+ "$ref": "embedfs://job-metric-statistics.schema.json"
+ },
+ "read_req": {
+ "description": "File system read requests",
+ "$ref": "embedfs://job-metric-statistics.schema.json"
+ },
+ "write_req": {
+ "description": "File system write requests",
+ "$ref": "embedfs://job-metric-statistics.schema.json"
+ },
+ "inodes": {
+ "description": "File system inodes",
+ "$ref": "embedfs://job-metric-statistics.schema.json"
+ },
+ "accesses": {
+ "description": "File system open and close",
+ "$ref": "embedfs://job-metric-statistics.schema.json"
+ },
+ "fsync": {
+ "description": "File system fsync",
+ "$ref": "embedfs://job-metric-statistics.schema.json"
+ },
+ "create": {
+ "description": "File system create",
+ "$ref": "embedfs://job-metric-statistics.schema.json"
+ },
+ "open": {
+ "description": "File system open",
+ "$ref": "embedfs://job-metric-statistics.schema.json"
+ },
+ "close": {
+ "description": "File system close",
+ "$ref": "embedfs://job-metric-statistics.schema.json"
+ },
+ "seek": {
+ "description": "File system seek",
+ "$ref": "embedfs://job-metric-statistics.schema.json"
+ }
},
"required": [
- "cpu_user",
- "cpu_load",
- "mem_used",
- "flops_any",
- "mem_bw"
+ "name",
+ "type",
+ "read_bw",
+ "write_bw"
]
+ },
+ "minItems": 1
}
- },
- "required": [
- "jobId",
- "user",
- "project",
- "cluster",
- "subCluster",
- "numNodes",
- "exclusive",
- "startTime",
- "jobState",
- "duration",
- "resources",
- "statistics"
- ]
+ },
+ "required": [
+ "cpu_user",
+ "cpu_load",
+ "mem_used",
+ "flops_any",
+ "mem_bw"
+ ]
+ }
+ },
+ "required": [
+ "jobId",
+ "user",
+ "project",
+ "cluster",
+ "subCluster",
+ "numNodes",
+ "exclusive",
+ "startTime",
+ "jobState",
+ "duration",
+ "resources",
+ "statistics"
+ ]
}
diff --git a/pkg/schema/schemas/job-metric-data.schema.json b/pkg/schema/schemas/job-metric-data.schema.json
index 3f2b934..ad499bf 100644
--- a/pkg/schema/schemas/job-metric-data.schema.json
+++ b/pkg/schema/schemas/job-metric-data.schema.json
@@ -1,216 +1,216 @@
{
- "$schema": "http://json-schema.org/draft/2020-12/schema",
- "$id": "embedfs://job-metric-data.schema.json",
- "title": "Job metric data",
- "description": "Metric data of a HPC job",
- "type": "object",
- "properties": {
- "unit": {
- "description": "Metric unit",
- "$ref": "embedfs://unit.schema.json"
- },
- "timestep": {
- "description": "Measurement interval in seconds",
- "type": "integer"
- },
- "thresholds": {
- "description": "Metric thresholds for specific system",
- "type": "object",
- "properties": {
- "peak": {
- "type": "number"
- },
- "normal": {
- "type": "number"
- },
- "caution": {
- "type": "number"
- },
- "alert": {
- "type": "number"
- }
- }
- },
- "statisticsSeries": {
- "type": "object",
- "description": "Statistics series across topology",
- "properties": {
- "min": {
- "type": "array",
- "items": {
- "type": "number",
- "minimum": 0
- },
- "minItems": 3
- },
- "max": {
- "type": "array",
- "items": {
- "type": "number",
- "minimum": 0
- },
- "minItems": 3
- },
- "mean": {
- "type": "array",
- "items": {
- "type": "number",
- "minimum": 0
- },
- "minItems": 3
- },
- "percentiles": {
- "type": "object",
- "properties": {
- "10": {
- "type": "array",
- "items": {
- "type": "number",
- "minimum": 0
- },
- "minItems": 3
- },
- "20": {
- "type": "array",
- "items": {
- "type": "number",
- "minimum": 0
- },
- "minItems": 3
- },
- "30": {
- "type": "array",
- "items": {
- "type": "number",
- "minimum": 0
- },
- "minItems": 3
- },
- "40": {
- "type": "array",
- "items": {
- "type": "number",
- "minimum": 0
- },
- "minItems": 3
- },
- "50": {
- "type": "array",
- "items": {
- "type": "number",
- "minimum": 0
- },
- "minItems": 3
- },
- "60": {
- "type": "array",
- "items": {
- "type": "number",
- "minimum": 0
- },
- "minItems": 3
- },
- "70": {
- "type": "array",
- "items": {
- "type": "number",
- "minimum": 0
- },
- "minItems": 3
- },
- "80": {
- "type": "array",
- "items": {
- "type": "number",
- "minimum": 0
- },
- "minItems": 3
- },
- "90": {
- "type": "array",
- "items": {
- "type": "number",
- "minimum": 0
- },
- "minItems": 3
- },
- "25": {
- "type": "array",
- "items": {
- "type": "number",
- "minimum": 0
- },
- "minItems": 3
- },
- "75": {
- "type": "array",
- "items": {
- "type": "number",
- "minimum": 0
- },
- "minItems": 3
- }
- }
- }
- }
- },
- "series": {
- "type": "array",
- "items": {
- "type": "object",
- "properties": {
- "hostname": {
- "type": "string"
- },
- "id": {
- "type": "string"
- },
- "statistics": {
- "type": "object",
- "description": "Statistics across time dimension",
- "properties": {
- "avg": {
- "description": "Series average",
- "type": "number",
- "minimum": 0
- },
- "min": {
- "description": "Series minimum",
- "type": "number",
- "minimum": 0
- },
- "max": {
- "description": "Series maximum",
- "type": "number",
- "minimum": 0
- }
- },
- "required": [
- "avg",
- "min",
- "max"
- ]
- },
- "data": {
- "type": "array",
- "contains": {
- "type": "number",
- "minimum": 0
- },
- "minItems": 1
- }
- },
- "required": [
- "hostname",
- "statistics",
- "data"
- ]
- }
- }
+ "$schema": "http://json-schema.org/draft/2020-12/schema",
+ "$id": "embedfs://job-metric-data.schema.json",
+ "title": "Job metric data",
+ "description": "Metric data of an HPC job",
+ "type": "object",
+ "properties": {
+ "unit": {
+ "description": "Metric unit",
+ "$ref": "embedfs://unit.schema.json"
},
- "required": [
- "unit",
- "timestep",
- "series"
- ]
+ "timestep": {
+ "description": "Measurement interval in seconds",
+ "type": "integer"
+ },
+ "thresholds": {
+ "description": "Metric thresholds for specific system",
+ "type": "object",
+ "properties": {
+ "peak": {
+ "type": "number"
+ },
+ "normal": {
+ "type": "number"
+ },
+ "caution": {
+ "type": "number"
+ },
+ "alert": {
+ "type": "number"
+ }
+ }
+ },
+ "statisticsSeries": {
+ "type": "object",
+ "description": "Statistics series across topology",
+ "properties": {
+ "min": {
+ "type": "array",
+ "items": {
+ "type": "number",
+ "minimum": 0
+ },
+ "minItems": 3
+ },
+ "max": {
+ "type": "array",
+ "items": {
+ "type": "number",
+ "minimum": 0
+ },
+ "minItems": 3
+ },
+ "mean": {
+ "type": "array",
+ "items": {
+ "type": "number",
+ "minimum": 0
+ },
+ "minItems": 3
+ },
+ "percentiles": {
+ "type": "object",
+ "properties": {
+ "10": {
+ "type": "array",
+ "items": {
+ "type": "number",
+ "minimum": 0
+ },
+ "minItems": 3
+ },
+ "20": {
+ "type": "array",
+ "items": {
+ "type": "number",
+ "minimum": 0
+ },
+ "minItems": 3
+ },
+ "30": {
+ "type": "array",
+ "items": {
+ "type": "number",
+ "minimum": 0
+ },
+ "minItems": 3
+ },
+ "40": {
+ "type": "array",
+ "items": {
+ "type": "number",
+ "minimum": 0
+ },
+ "minItems": 3
+ },
+ "50": {
+ "type": "array",
+ "items": {
+ "type": "number",
+ "minimum": 0
+ },
+ "minItems": 3
+ },
+ "60": {
+ "type": "array",
+ "items": {
+ "type": "number",
+ "minimum": 0
+ },
+ "minItems": 3
+ },
+ "70": {
+ "type": "array",
+ "items": {
+ "type": "number",
+ "minimum": 0
+ },
+ "minItems": 3
+ },
+ "80": {
+ "type": "array",
+ "items": {
+ "type": "number",
+ "minimum": 0
+ },
+ "minItems": 3
+ },
+ "90": {
+ "type": "array",
+ "items": {
+ "type": "number",
+ "minimum": 0
+ },
+ "minItems": 3
+ },
+ "25": {
+ "type": "array",
+ "items": {
+ "type": "number",
+ "minimum": 0
+ },
+ "minItems": 3
+ },
+ "75": {
+ "type": "array",
+ "items": {
+ "type": "number",
+ "minimum": 0
+ },
+ "minItems": 3
+ }
+ }
+ }
+ }
+ },
+ "series": {
+ "type": "array",
+ "items": {
+ "type": "object",
+ "properties": {
+ "hostname": {
+ "type": "string"
+ },
+ "id": {
+ "type": "string"
+ },
+ "statistics": {
+ "type": "object",
+ "description": "Statistics across time dimension",
+ "properties": {
+ "avg": {
+ "description": "Series average",
+ "type": "number",
+ "minimum": 0
+ },
+ "min": {
+ "description": "Series minimum",
+ "type": "number",
+ "minimum": 0
+ },
+ "max": {
+ "description": "Series maximum",
+ "type": "number",
+ "minimum": 0
+ }
+ },
+ "required": [
+ "avg",
+ "min",
+ "max"
+ ]
+ },
+ "data": {
+ "type": "array",
+ "contains": {
+ "type": "number",
+ "minimum": 0
+ },
+ "minItems": 1
+ }
+ },
+ "required": [
+ "hostname",
+ "statistics",
+ "data"
+ ]
+ }
+ }
+ },
+ "required": [
+ "unit",
+ "timestep",
+ "series"
+ ]
}
diff --git a/pkg/schema/schemas/job-metric-statistics.schema.json b/pkg/schema/schemas/job-metric-statistics.schema.json
index 3412c23..f753ed3 100644
--- a/pkg/schema/schemas/job-metric-statistics.schema.json
+++ b/pkg/schema/schemas/job-metric-statistics.schema.json
@@ -1,34 +1,34 @@
{
- "$schema": "http://json-schema.org/draft/2020-12/schema",
- "$id": "embedfs://job-metric-statistics.schema.json",
- "title": "Job statistics",
- "description": "Format specification for job metric statistics",
- "type": "object",
- "properties": {
- "unit": {
- "description": "Metric unit",
- "$ref": "embedfs://unit.schema.json"
- },
- "avg": {
- "description": "Job metric average",
- "type": "number",
- "minimum": 0
- },
- "min": {
- "description": "Job metric minimum",
- "type": "number",
- "minimum": 0
- },
- "max": {
- "description": "Job metric maximum",
- "type": "number",
- "minimum": 0
- }
+ "$schema": "http://json-schema.org/draft/2020-12/schema",
+ "$id": "embedfs://job-metric-statistics.schema.json",
+ "title": "Job statistics",
+ "description": "Format specification for job metric statistics",
+ "type": "object",
+ "properties": {
+ "unit": {
+ "description": "Metric unit",
+ "$ref": "embedfs://unit.schema.json"
},
- "required": [
- "unit",
- "avg",
- "min",
- "max"
- ]
+ "avg": {
+ "description": "Job metric average",
+ "type": "number",
+ "minimum": 0
+ },
+ "min": {
+ "description": "Job metric minimum",
+ "type": "number",
+ "minimum": 0
+ },
+ "max": {
+ "description": "Job metric maximum",
+ "type": "number",
+ "minimum": 0
+ }
+ },
+ "required": [
+ "unit",
+ "avg",
+ "min",
+ "max"
+ ]
}
diff --git a/pkg/schema/schemas/unit.schema.json b/pkg/schema/schemas/unit.schema.json
index 9ee781c..c0a3df3 100644
--- a/pkg/schema/schemas/unit.schema.json
+++ b/pkg/schema/schemas/unit.schema.json
@@ -1,40 +1,40 @@
{
- "$schema": "http://json-schema.org/draft/2020-12/schema",
- "$id": "embedfs://unit.schema.json",
- "title": "Metric unit",
- "description": "Format specification for job metric units",
- "type": "object",
- "properties": {
- "base": {
- "description": "Metric base unit",
- "type": "string",
- "enum": [
- "B",
- "F",
- "B/s",
- "F/s",
- "CPI",
- "IPC",
- "Hz",
- "W",
- "°C",
- ""
- ]
- },
- "prefix": {
- "description": "Unit prefix",
- "type": "string",
- "enum": [
- "K",
- "M",
- "G",
- "T",
- "P",
- "E"
- ]
- }
+ "$schema": "http://json-schema.org/draft/2020-12/schema",
+ "$id": "embedfs://unit.schema.json",
+ "title": "Metric unit",
+ "description": "Format specification for job metric units",
+ "type": "object",
+ "properties": {
+ "base": {
+ "description": "Metric base unit",
+ "type": "string",
+ "enum": [
+ "B",
+ "F",
+ "B/s",
+ "F/s",
+ "CPI",
+ "IPC",
+ "Hz",
+ "W",
+ "°C",
+ ""
+ ]
},
- "required": [
- "base"
- ]
+ "prefix": {
+ "description": "Unit prefix",
+ "type": "string",
+ "enum": [
+ "K",
+ "M",
+ "G",
+ "T",
+ "P",
+ "E"
+ ]
+ }
+ },
+ "required": [
+ "base"
+ ]
}
diff --git a/web/frontend/src/generic/helper/JobFootprint.svelte b/web/frontend/src/generic/helper/JobFootprint.svelte
index 187eff9..f9bc165 100644
--- a/web/frontend/src/generic/helper/JobFootprint.svelte
+++ b/web/frontend/src/generic/helper/JobFootprint.svelte
@@ -9,12 +9,11 @@
-->
@@ -93,7 +69,7 @@
const unit = (fmc?.unit?.prefix ? fmc.unit.prefix : "") + (fmc?.unit?.base ? fmc.unit.base : "")
// Threshold / -Differences
- const fmt = findJobThresholds(job, fmc);
+ const fmt = findJobThresholds(job, jf.stat, fmc);
if (jf.name === "flops_any") fmt.peak = round(fmt.peak * 0.85, 0);
// Define basic data -> Value: Use as Provided
diff --git a/web/frontend/src/generic/joblist/JobInfo.svelte b/web/frontend/src/generic/joblist/JobInfo.svelte
index adacd4f..8917653 100644
--- a/web/frontend/src/generic/joblist/JobInfo.svelte
+++ b/web/frontend/src/generic/joblist/JobInfo.svelte
@@ -7,7 +7,7 @@
-->
@@ -58,13 +76,18 @@
{job.jobId}
({job.cluster})
-