mirror of
https://github.com/ClusterCockpit/cc-backend
synced 2024-12-25 04:49:05 +01:00
Merge pull request #320 from ClusterCockpit/hotfix
Fixes for Bugfix Release 1.4.2
This commit is contained in:
commit
9489ebc7d6
2
Makefile
2
Makefile
@ -2,7 +2,7 @@ TARGET = ./cc-backend
|
|||||||
VAR = ./var
|
VAR = ./var
|
||||||
CFG = config.json .env
|
CFG = config.json .env
|
||||||
FRONTEND = ./web/frontend
|
FRONTEND = ./web/frontend
|
||||||
VERSION = 1.4.1
|
VERSION = 1.4.2
|
||||||
GIT_HASH := $(shell git rev-parse --short HEAD || echo 'development')
|
GIT_HASH := $(shell git rev-parse --short HEAD || echo 'development')
|
||||||
CURRENT_TIME = $(shell date +"%Y-%m-%d:T%H:%M:%S")
|
CURRENT_TIME = $(shell date +"%Y-%m-%d:T%H:%M:%S")
|
||||||
LD_FLAGS = '-s -X main.date=${CURRENT_TIME} -X main.version=${VERSION} -X main.commit=${GIT_HASH}'
|
LD_FLAGS = '-s -X main.date=${CURRENT_TIME} -X main.version=${VERSION} -X main.commit=${GIT_HASH}'
|
||||||
|
@ -1,4 +1,4 @@
|
|||||||
# `cc-backend` version 1.4.1
|
# `cc-backend` version 1.4.2
|
||||||
|
|
||||||
Supports job archive version 2 and database version 8.
|
Supports job archive version 2 and database version 8.
|
||||||
|
|
||||||
@ -12,7 +12,8 @@ For release specific notes visit the [ClusterCockpit Documentation](https://clus
|
|||||||
migration might require several hours!
|
migration might require several hours!
|
||||||
- You need to adapt the `cluster.json` configuration files in the job-archive,
|
- You need to adapt the `cluster.json` configuration files in the job-archive,
|
||||||
add new required attributes to the metric list and after that edit
|
add new required attributes to the metric list and after that edit
|
||||||
`./job-archive/version.txt` to version 2.
|
`./job-archive/version.txt` to version 2. Only metrics that have the footprint
|
||||||
|
attribute set can be filtered and show up in the footprint UI and polar plot.
|
||||||
- Continuous scrolling is default now in all job lists. You can change this back
|
- Continuous scrolling is default now in all job lists. You can change this back
|
||||||
to paging globally, also every user can configure to use paging or continuous
|
to paging globally, also every user can configure to use paging or continuous
|
||||||
scrolling individually.
|
scrolling individually.
|
||||||
|
@ -112,7 +112,7 @@ func main() {
|
|||||||
|
|
||||||
if flagInit {
|
if flagInit {
|
||||||
initEnv()
|
initEnv()
|
||||||
fmt.Print("Succesfully setup environment!\n")
|
fmt.Print("Successfully setup environment!\n")
|
||||||
fmt.Print("Please review config.json and .env and adjust it to your needs.\n")
|
fmt.Print("Please review config.json and .env and adjust it to your needs.\n")
|
||||||
fmt.Print("Add your job-archive at ./var/job-archive.\n")
|
fmt.Print("Add your job-archive at ./var/job-archive.\n")
|
||||||
os.Exit(0)
|
os.Exit(0)
|
||||||
|
@ -25,7 +25,6 @@ import (
|
|||||||
"github.com/ClusterCockpit/cc-backend/internal/config"
|
"github.com/ClusterCockpit/cc-backend/internal/config"
|
||||||
"github.com/ClusterCockpit/cc-backend/internal/graph"
|
"github.com/ClusterCockpit/cc-backend/internal/graph"
|
||||||
"github.com/ClusterCockpit/cc-backend/internal/graph/generated"
|
"github.com/ClusterCockpit/cc-backend/internal/graph/generated"
|
||||||
"github.com/ClusterCockpit/cc-backend/internal/repository"
|
|
||||||
"github.com/ClusterCockpit/cc-backend/internal/routerConfig"
|
"github.com/ClusterCockpit/cc-backend/internal/routerConfig"
|
||||||
"github.com/ClusterCockpit/cc-backend/pkg/log"
|
"github.com/ClusterCockpit/cc-backend/pkg/log"
|
||||||
"github.com/ClusterCockpit/cc-backend/pkg/runtimeEnv"
|
"github.com/ClusterCockpit/cc-backend/pkg/runtimeEnv"
|
||||||
@ -314,9 +313,6 @@ func serverShutdown() {
|
|||||||
// First shut down the server gracefully (waiting for all ongoing requests)
|
// First shut down the server gracefully (waiting for all ongoing requests)
|
||||||
server.Shutdown(context.Background())
|
server.Shutdown(context.Background())
|
||||||
|
|
||||||
// Then, wait for any async jobStarts still pending...
|
|
||||||
repository.WaitForJobStart()
|
|
||||||
|
|
||||||
// Then, wait for any async archivings still pending...
|
// Then, wait for any async archivings still pending...
|
||||||
archiver.WaitForArchiving()
|
archiver.WaitForArchiving()
|
||||||
}
|
}
|
||||||
|
@ -1,5 +1,5 @@
|
|||||||
[Unit]
|
[Unit]
|
||||||
Description=ClusterCockpit Web Server (Go edition)
|
Description=ClusterCockpit Web Server
|
||||||
Documentation=https://github.com/ClusterCockpit/cc-backend
|
Documentation=https://github.com/ClusterCockpit/cc-backend
|
||||||
Wants=network-online.target
|
Wants=network-online.target
|
||||||
After=network-online.target
|
After=network-online.target
|
||||||
|
@ -249,9 +249,6 @@ func TestRestApi(t *testing.T) {
|
|||||||
if response.StatusCode != http.StatusCreated {
|
if response.StatusCode != http.StatusCreated {
|
||||||
t.Fatal(response.Status, recorder.Body.String())
|
t.Fatal(response.Status, recorder.Body.String())
|
||||||
}
|
}
|
||||||
|
|
||||||
time.Sleep(1 * time.Second)
|
|
||||||
|
|
||||||
resolver := graph.GetResolverInstance()
|
resolver := graph.GetResolverInstance()
|
||||||
job, err := restapi.JobRepository.Find(&TestJobId, &TestClusterName, &TestStartTime)
|
job, err := restapi.JobRepository.Find(&TestJobId, &TestClusterName, &TestStartTime)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
|
@ -123,18 +123,8 @@ func (api *RestApi) MountFrontendApiRoutes(r *mux.Router) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// StartJobApiResponse model
|
// DefaultJobApiResponse model
|
||||||
type StartJobApiResponse struct {
|
type DefaultJobApiResponse struct {
|
||||||
Message string `json:"msg"`
|
|
||||||
}
|
|
||||||
|
|
||||||
// DeleteJobApiResponse model
|
|
||||||
type DeleteJobApiResponse struct {
|
|
||||||
Message string `json:"msg"`
|
|
||||||
}
|
|
||||||
|
|
||||||
// UpdateUserApiResponse model
|
|
||||||
type UpdateUserApiResponse struct {
|
|
||||||
Message string `json:"msg"`
|
Message string `json:"msg"`
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -341,7 +331,7 @@ func (api *RestApi) getJobs(rw http.ResponseWriter, r *http.Request) {
|
|||||||
withMetadata := false
|
withMetadata := false
|
||||||
filter := &model.JobFilter{}
|
filter := &model.JobFilter{}
|
||||||
page := &model.PageRequest{ItemsPerPage: 25, Page: 1}
|
page := &model.PageRequest{ItemsPerPage: 25, Page: 1}
|
||||||
order := &model.OrderByInput{Field: "startTime", Order: model.SortDirectionEnumDesc}
|
order := &model.OrderByInput{Field: "startTime", Type: "col", Order: model.SortDirectionEnumDesc}
|
||||||
|
|
||||||
for key, vals := range r.URL.Query() {
|
for key, vals := range r.URL.Query() {
|
||||||
switch key {
|
switch key {
|
||||||
@ -790,6 +780,11 @@ func (api *RestApi) startJob(rw http.ResponseWriter, r *http.Request) {
|
|||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// acquire lock to avoid race condition between API calls
|
||||||
|
var unlockOnce sync.Once
|
||||||
|
api.RepositoryMutex.Lock()
|
||||||
|
defer unlockOnce.Do(api.RepositoryMutex.Unlock)
|
||||||
|
|
||||||
// Check if combination of (job_id, cluster_id, start_time) already exists:
|
// Check if combination of (job_id, cluster_id, start_time) already exists:
|
||||||
jobs, err := api.JobRepository.FindAll(&req.JobID, &req.Cluster, nil)
|
jobs, err := api.JobRepository.FindAll(&req.JobID, &req.Cluster, nil)
|
||||||
if err != nil && err != sql.ErrNoRows {
|
if err != nil && err != sql.ErrNoRows {
|
||||||
@ -804,12 +799,27 @@ func (api *RestApi) startJob(rw http.ResponseWriter, r *http.Request) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
repository.TriggerJobStart(repository.JobWithUser{Job: &req, User: repository.GetUserFromContext(r.Context())})
|
id, err := api.JobRepository.Start(&req)
|
||||||
|
if err != nil {
|
||||||
|
handleError(fmt.Errorf("insert into database failed: %w", err), http.StatusInternalServerError, rw)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
// unlock here, adding Tags can be async
|
||||||
|
unlockOnce.Do(api.RepositoryMutex.Unlock)
|
||||||
|
|
||||||
|
for _, tag := range req.Tags {
|
||||||
|
if _, err := api.JobRepository.AddTagOrCreate(repository.GetUserFromContext(r.Context()), id, tag.Type, tag.Name, tag.Scope); err != nil {
|
||||||
|
http.Error(rw, err.Error(), http.StatusInternalServerError)
|
||||||
|
handleError(fmt.Errorf("adding tag to new job %d failed: %w", id, err), http.StatusInternalServerError, rw)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
log.Printf("new job (id: %d): cluster=%s, jobId=%d, user=%s, startTime=%d", id, req.Cluster, req.JobID, req.User, req.StartTime)
|
||||||
rw.Header().Add("Content-Type", "application/json")
|
rw.Header().Add("Content-Type", "application/json")
|
||||||
rw.WriteHeader(http.StatusCreated)
|
rw.WriteHeader(http.StatusCreated)
|
||||||
json.NewEncoder(rw).Encode(StartJobApiResponse{
|
json.NewEncoder(rw).Encode(DefaultJobApiResponse{
|
||||||
Message: fmt.Sprintf("Successfully triggered job start"),
|
Message: "success",
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -892,7 +902,7 @@ func (api *RestApi) deleteJobById(rw http.ResponseWriter, r *http.Request) {
|
|||||||
}
|
}
|
||||||
rw.Header().Add("Content-Type", "application/json")
|
rw.Header().Add("Content-Type", "application/json")
|
||||||
rw.WriteHeader(http.StatusOK)
|
rw.WriteHeader(http.StatusOK)
|
||||||
json.NewEncoder(rw).Encode(DeleteJobApiResponse{
|
json.NewEncoder(rw).Encode(DefaultJobApiResponse{
|
||||||
Message: fmt.Sprintf("Successfully deleted job %s", id),
|
Message: fmt.Sprintf("Successfully deleted job %s", id),
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
@ -943,7 +953,7 @@ func (api *RestApi) deleteJobByRequest(rw http.ResponseWriter, r *http.Request)
|
|||||||
|
|
||||||
rw.Header().Add("Content-Type", "application/json")
|
rw.Header().Add("Content-Type", "application/json")
|
||||||
rw.WriteHeader(http.StatusOK)
|
rw.WriteHeader(http.StatusOK)
|
||||||
json.NewEncoder(rw).Encode(DeleteJobApiResponse{
|
json.NewEncoder(rw).Encode(DefaultJobApiResponse{
|
||||||
Message: fmt.Sprintf("Successfully deleted job %d", job.ID),
|
Message: fmt.Sprintf("Successfully deleted job %d", job.ID),
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
@ -987,7 +997,7 @@ func (api *RestApi) deleteJobBefore(rw http.ResponseWriter, r *http.Request) {
|
|||||||
|
|
||||||
rw.Header().Add("Content-Type", "application/json")
|
rw.Header().Add("Content-Type", "application/json")
|
||||||
rw.WriteHeader(http.StatusOK)
|
rw.WriteHeader(http.StatusOK)
|
||||||
json.NewEncoder(rw).Encode(DeleteJobApiResponse{
|
json.NewEncoder(rw).Encode(DefaultJobApiResponse{
|
||||||
Message: fmt.Sprintf("Successfully deleted %d jobs", cnt),
|
Message: fmt.Sprintf("Successfully deleted %d jobs", cnt),
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
@ -36,10 +36,7 @@ func (r *jobResolver) Tags(ctx context.Context, obj *schema.Job) ([]*schema.Tag,
|
|||||||
|
|
||||||
// ConcurrentJobs is the resolver for the concurrentJobs field.
|
// ConcurrentJobs is the resolver for the concurrentJobs field.
|
||||||
func (r *jobResolver) ConcurrentJobs(ctx context.Context, obj *schema.Job) (*model.JobLinkResultList, error) {
|
func (r *jobResolver) ConcurrentJobs(ctx context.Context, obj *schema.Job) (*model.JobLinkResultList, error) {
|
||||||
if obj.State == schema.JobStateRunning {
|
// FIXME: Make the hardcoded duration configurable
|
||||||
obj.Duration = int32(time.Now().Unix() - obj.StartTimeUnix)
|
|
||||||
}
|
|
||||||
|
|
||||||
if obj.Exclusive != 1 && obj.Duration > 600 {
|
if obj.Exclusive != 1 && obj.Duration > 600 {
|
||||||
return r.Repo.FindConcurrentJobs(ctx, obj)
|
return r.Repo.FindConcurrentJobs(ctx, obj)
|
||||||
}
|
}
|
||||||
|
@ -82,8 +82,6 @@ func Connect(driver string, db string) {
|
|||||||
if err != nil {
|
if err != nil {
|
||||||
log.Fatal(err)
|
log.Fatal(err)
|
||||||
}
|
}
|
||||||
|
|
||||||
startJobStartWorker()
|
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -79,12 +79,9 @@ func scanJob(row interface{ Scan(...interface{}) error }) (*schema.Job, error) {
|
|||||||
}
|
}
|
||||||
job.RawFootprint = nil
|
job.RawFootprint = nil
|
||||||
|
|
||||||
// if err := json.Unmarshal(job.RawMetaData, &job.MetaData); err != nil {
|
|
||||||
// return nil, err
|
|
||||||
// }
|
|
||||||
|
|
||||||
job.StartTime = time.Unix(job.StartTimeUnix, 0)
|
job.StartTime = time.Unix(job.StartTimeUnix, 0)
|
||||||
if job.Duration == 0 && job.State == schema.JobStateRunning {
|
// Always ensure accurate duration for running jobs
|
||||||
|
if job.State == schema.JobStateRunning {
|
||||||
job.Duration = int32(time.Since(job.StartTime).Seconds())
|
job.Duration = int32(time.Since(job.StartTime).Seconds())
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -457,6 +454,7 @@ func (r *JobRepository) AllocatedNodes(cluster string) (map[string]map[string]in
|
|||||||
return subclusters, nil
|
return subclusters, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// FIXME: Set duration to requested walltime?
|
||||||
func (r *JobRepository) StopJobsExceedingWalltimeBy(seconds int) error {
|
func (r *JobRepository) StopJobsExceedingWalltimeBy(seconds int) error {
|
||||||
start := time.Now()
|
start := time.Now()
|
||||||
res, err := sq.Update("job").
|
res, err := sq.Update("job").
|
||||||
|
@ -170,8 +170,7 @@ func BuildWhereClause(filter *model.JobFilter, query sq.SelectBuilder) sq.Select
|
|||||||
query = buildTimeCondition("job.start_time", filter.StartTime, query)
|
query = buildTimeCondition("job.start_time", filter.StartTime, query)
|
||||||
}
|
}
|
||||||
if filter.Duration != nil {
|
if filter.Duration != nil {
|
||||||
now := time.Now().Unix() // There does not seam to be a portable way to get the current unix timestamp accross different DBs.
|
query = buildIntCondition("job.duration", filter.Duration, query)
|
||||||
query = query.Where("(CASE WHEN job.job_state = 'running' THEN (? - job.start_time) ELSE job.duration END) BETWEEN ? AND ?", now, filter.Duration.From, filter.Duration.To)
|
|
||||||
}
|
}
|
||||||
if filter.MinRunningFor != nil {
|
if filter.MinRunningFor != nil {
|
||||||
now := time.Now().Unix() // There does not seem to be a portable way to get the current unix timestamp across different DBs.
|
now := time.Now().Unix() // There does not seem to be a portable way to get the current unix timestamp across different DBs.
|
||||||
|
@ -1,83 +0,0 @@
|
|||||||
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
|
||||||
// All rights reserved.
|
|
||||||
// Use of this source code is governed by a MIT-style
|
|
||||||
// license that can be found in the LICENSE file.
|
|
||||||
package repository
|
|
||||||
|
|
||||||
import (
|
|
||||||
"sync"
|
|
||||||
"time"
|
|
||||||
|
|
||||||
"github.com/ClusterCockpit/cc-backend/pkg/log"
|
|
||||||
"github.com/ClusterCockpit/cc-backend/pkg/schema"
|
|
||||||
)
|
|
||||||
|
|
||||||
type JobWithUser struct {
|
|
||||||
Job *schema.JobMeta
|
|
||||||
User *schema.User
|
|
||||||
}
|
|
||||||
|
|
||||||
var (
|
|
||||||
jobStartPending sync.WaitGroup
|
|
||||||
jobStartChannel chan JobWithUser
|
|
||||||
)
|
|
||||||
|
|
||||||
func startJobStartWorker() {
|
|
||||||
jobStartChannel = make(chan JobWithUser, 128)
|
|
||||||
|
|
||||||
go jobStartWorker()
|
|
||||||
}
|
|
||||||
|
|
||||||
// Archiving worker thread
|
|
||||||
func jobStartWorker() {
|
|
||||||
for {
|
|
||||||
select {
|
|
||||||
case req, ok := <-jobStartChannel:
|
|
||||||
if !ok {
|
|
||||||
break
|
|
||||||
}
|
|
||||||
jobRepo := GetJobRepository()
|
|
||||||
var id int64
|
|
||||||
|
|
||||||
for i := 0; i < 5; i++ {
|
|
||||||
var err error
|
|
||||||
|
|
||||||
id, err = jobRepo.Start(req.Job)
|
|
||||||
if err != nil {
|
|
||||||
log.Errorf("Attempt %d: insert into database failed: %v", i, err)
|
|
||||||
} else {
|
|
||||||
break
|
|
||||||
}
|
|
||||||
time.Sleep(1 * time.Second)
|
|
||||||
}
|
|
||||||
|
|
||||||
for _, tag := range req.Job.Tags {
|
|
||||||
if _, err := jobRepo.AddTagOrCreate(req.User, id,
|
|
||||||
tag.Type, tag.Name, tag.Scope); err != nil {
|
|
||||||
log.Errorf("adding tag to new job %d failed: %v", id, err)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
log.Printf("new job (id: %d): cluster=%s, jobId=%d, user=%s, startTime=%d",
|
|
||||||
id, req.Job.Cluster, req.Job.JobID, req.Job.User, req.Job.StartTime)
|
|
||||||
|
|
||||||
jobStartPending.Done()
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Trigger async archiving
|
|
||||||
func TriggerJobStart(req JobWithUser) {
|
|
||||||
if jobStartChannel == nil {
|
|
||||||
log.Fatal("Cannot start Job without jobStart channel. Did you Start the worker?")
|
|
||||||
}
|
|
||||||
|
|
||||||
jobStartPending.Add(1)
|
|
||||||
jobStartChannel <- req
|
|
||||||
}
|
|
||||||
|
|
||||||
// Wait for background thread to finish pending archiving operations
|
|
||||||
func WaitForJobStart() {
|
|
||||||
// close channel and wait for worker to process remaining jobs
|
|
||||||
jobStartPending.Wait()
|
|
||||||
}
|
|
@ -111,7 +111,7 @@ func BenchmarkDB_QueryJobs(b *testing.B) {
|
|||||||
user := "mppi133h"
|
user := "mppi133h"
|
||||||
filter.User = &model.StringInput{Eq: &user}
|
filter.User = &model.StringInput{Eq: &user}
|
||||||
page := &model.PageRequest{ItemsPerPage: 50, Page: 1}
|
page := &model.PageRequest{ItemsPerPage: 50, Page: 1}
|
||||||
order := &model.OrderByInput{Field: "startTime", Order: model.SortDirectionEnumDesc}
|
order := &model.OrderByInput{Field: "startTime", Type: "col", Order: model.SortDirectionEnumDesc}
|
||||||
|
|
||||||
b.Run("QueryJobs", func(b *testing.B) {
|
b.Run("QueryJobs", func(b *testing.B) {
|
||||||
db := setup(b)
|
db := setup(b)
|
||||||
|
@ -182,6 +182,7 @@ func setupTaglistRoute(i InfoType, r *http.Request) InfoType {
|
|||||||
return i
|
return i
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// FIXME: Lots of redundant code. Needs refactoring
|
||||||
func buildFilterPresets(query url.Values) map[string]interface{} {
|
func buildFilterPresets(query url.Values) map[string]interface{} {
|
||||||
filterPresets := map[string]interface{}{}
|
filterPresets := map[string]interface{}{}
|
||||||
|
|
||||||
|
@ -1,490 +1,490 @@
|
|||||||
{
|
{
|
||||||
"$schema": "http://json-schema.org/draft/2020-12/schema",
|
"$schema": "http://json-schema.org/draft/2020-12/schema",
|
||||||
"$id": "embedfs://job-data.schema.json",
|
"$id": "embedfs://job-data.schema.json",
|
||||||
"title": "Job metric data list",
|
"title": "Job metric data list",
|
||||||
"description": "Collection of metric data of a HPC job",
|
"description": "Collection of metric data of a HPC job",
|
||||||
"type": "object",
|
"type": "object",
|
||||||
"properties": {
|
"properties": {
|
||||||
"mem_used": {
|
"mem_used": {
|
||||||
"description": "Memory capacity used",
|
"description": "Memory capacity used",
|
||||||
"type": "object",
|
"type": "object",
|
||||||
"properties": {
|
"properties": {
|
||||||
"node": {
|
"node": {
|
||||||
"$ref": "embedfs://job-metric-data.schema.json"
|
"$ref": "embedfs://job-metric-data.schema.json"
|
||||||
}
|
|
||||||
},
|
|
||||||
"required": [
|
|
||||||
"node"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
"flops_any": {
|
|
||||||
"description": "Total flop rate with DP flops scaled up",
|
|
||||||
"properties": {
|
|
||||||
"node": {
|
|
||||||
"$ref": "embedfs://job-metric-data.schema.json"
|
|
||||||
},
|
|
||||||
"socket": {
|
|
||||||
"$ref": "embedfs://job-metric-data.schema.json"
|
|
||||||
},
|
|
||||||
"memoryDomain": {
|
|
||||||
"$ref": "embedfs://job-metric-data.schema.json"
|
|
||||||
},
|
|
||||||
"core": {
|
|
||||||
"$ref": "embedfs://job-metric-data.schema.json"
|
|
||||||
},
|
|
||||||
"hwthread": {
|
|
||||||
"$ref": "embedfs://job-metric-data.schema.json"
|
|
||||||
}
|
|
||||||
},
|
|
||||||
"minProperties": 1
|
|
||||||
},
|
|
||||||
"mem_bw": {
|
|
||||||
"description": "Main memory bandwidth",
|
|
||||||
"properties": {
|
|
||||||
"node": {
|
|
||||||
"$ref": "embedfs://job-metric-data.schema.json"
|
|
||||||
},
|
|
||||||
"socket": {
|
|
||||||
"$ref": "embedfs://job-metric-data.schema.json"
|
|
||||||
},
|
|
||||||
"memoryDomain": {
|
|
||||||
"$ref": "embedfs://job-metric-data.schema.json"
|
|
||||||
}
|
|
||||||
},
|
|
||||||
"minProperties": 1
|
|
||||||
},
|
|
||||||
"net_bw": {
|
|
||||||
"description": "Total fast interconnect network bandwidth",
|
|
||||||
"type": "object",
|
|
||||||
"properties": {
|
|
||||||
"node": {
|
|
||||||
"$ref": "embedfs://job-metric-data.schema.json"
|
|
||||||
}
|
|
||||||
},
|
|
||||||
"required": [
|
|
||||||
"node"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
"ipc": {
|
|
||||||
"description": "Instructions executed per cycle",
|
|
||||||
"properties": {
|
|
||||||
"node": {
|
|
||||||
"$ref": "embedfs://job-metric-data.schema.json"
|
|
||||||
},
|
|
||||||
"socket": {
|
|
||||||
"$ref": "embedfs://job-metric-data.schema.json"
|
|
||||||
},
|
|
||||||
"memoryDomain": {
|
|
||||||
"$ref": "embedfs://job-metric-data.schema.json"
|
|
||||||
},
|
|
||||||
"core": {
|
|
||||||
"$ref": "embedfs://job-metric-data.schema.json"
|
|
||||||
},
|
|
||||||
"hwthread": {
|
|
||||||
"$ref": "embedfs://job-metric-data.schema.json"
|
|
||||||
}
|
|
||||||
},
|
|
||||||
"minProperties": 1
|
|
||||||
},
|
|
||||||
"cpu_user": {
|
|
||||||
"description": "CPU user active core utilization",
|
|
||||||
"properties": {
|
|
||||||
"node": {
|
|
||||||
"$ref": "embedfs://job-metric-data.schema.json"
|
|
||||||
},
|
|
||||||
"socket": {
|
|
||||||
"$ref": "embedfs://job-metric-data.schema.json"
|
|
||||||
},
|
|
||||||
"memoryDomain": {
|
|
||||||
"$ref": "embedfs://job-metric-data.schema.json"
|
|
||||||
},
|
|
||||||
"core": {
|
|
||||||
"$ref": "embedfs://job-metric-data.schema.json"
|
|
||||||
},
|
|
||||||
"hwthread": {
|
|
||||||
"$ref": "embedfs://job-metric-data.schema.json"
|
|
||||||
}
|
|
||||||
},
|
|
||||||
"minProperties": 1
|
|
||||||
},
|
|
||||||
"cpu_load": {
|
|
||||||
"description": "CPU requested core utilization (load 1m)",
|
|
||||||
"properties": {
|
|
||||||
"node": {
|
|
||||||
"$ref": "embedfs://job-metric-data.schema.json"
|
|
||||||
}
|
|
||||||
},
|
|
||||||
"required": [
|
|
||||||
"node"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
"flops_dp": {
|
|
||||||
"description": "Double precision flop rate",
|
|
||||||
"properties": {
|
|
||||||
"node": {
|
|
||||||
"$ref": "embedfs://job-metric-data.schema.json"
|
|
||||||
},
|
|
||||||
"socket": {
|
|
||||||
"$ref": "embedfs://job-metric-data.schema.json"
|
|
||||||
},
|
|
||||||
"memoryDomain": {
|
|
||||||
"$ref": "embedfs://job-metric-data.schema.json"
|
|
||||||
},
|
|
||||||
"core": {
|
|
||||||
"$ref": "embedfs://job-metric-data.schema.json"
|
|
||||||
},
|
|
||||||
"hwthread": {
|
|
||||||
"$ref": "embedfs://job-metric-data.schema.json"
|
|
||||||
}
|
|
||||||
},
|
|
||||||
"minProperties": 1
|
|
||||||
},
|
|
||||||
"flops_sp": {
|
|
||||||
"description": "Single precision flops rate",
|
|
||||||
"properties": {
|
|
||||||
"node": {
|
|
||||||
"$ref": "embedfs://job-metric-data.schema.json"
|
|
||||||
},
|
|
||||||
"socket": {
|
|
||||||
"$ref": "embedfs://job-metric-data.schema.json"
|
|
||||||
},
|
|
||||||
"memoryDomain": {
|
|
||||||
"$ref": "embedfs://job-metric-data.schema.json"
|
|
||||||
},
|
|
||||||
"core": {
|
|
||||||
"$ref": "embedfs://job-metric-data.schema.json"
|
|
||||||
},
|
|
||||||
"hwthread": {
|
|
||||||
"$ref": "embedfs://job-metric-data.schema.json"
|
|
||||||
}
|
|
||||||
},
|
|
||||||
"minProperties": 1
|
|
||||||
},
|
|
||||||
"vectorization_ratio": {
|
|
||||||
"description": "Fraction of arithmetic instructions using SIMD instructions",
|
|
||||||
"properties": {
|
|
||||||
"node": {
|
|
||||||
"$ref": "embedfs://job-metric-data.schema.json"
|
|
||||||
},
|
|
||||||
"socket": {
|
|
||||||
"$ref": "embedfs://job-metric-data.schema.json"
|
|
||||||
},
|
|
||||||
"memoryDomain": {
|
|
||||||
"$ref": "embedfs://job-metric-data.schema.json"
|
|
||||||
},
|
|
||||||
"core": {
|
|
||||||
"$ref": "embedfs://job-metric-data.schema.json"
|
|
||||||
},
|
|
||||||
"hwthread": {
|
|
||||||
"$ref": "embedfs://job-metric-data.schema.json"
|
|
||||||
}
|
|
||||||
},
|
|
||||||
"minProperties": 1
|
|
||||||
},
|
|
||||||
"cpu_power": {
|
|
||||||
"description": "CPU power consumption",
|
|
||||||
"properties": {
|
|
||||||
"node": {
|
|
||||||
"$ref": "embedfs://job-metric-data.schema.json"
|
|
||||||
},
|
|
||||||
"socket": {
|
|
||||||
"$ref": "embedfs://job-metric-data.schema.json"
|
|
||||||
}
|
|
||||||
},
|
|
||||||
"minProperties": 1
|
|
||||||
},
|
|
||||||
"mem_power": {
|
|
||||||
"description": "Memory power consumption",
|
|
||||||
"properties": {
|
|
||||||
"node": {
|
|
||||||
"$ref": "embedfs://job-metric-data.schema.json"
|
|
||||||
},
|
|
||||||
"socket": {
|
|
||||||
"$ref": "embedfs://job-metric-data.schema.json"
|
|
||||||
}
|
|
||||||
},
|
|
||||||
"minProperties": 1
|
|
||||||
},
|
|
||||||
"acc_utilization": {
|
|
||||||
"description": "GPU utilization",
|
|
||||||
"properties": {
|
|
||||||
"accelerator": {
|
|
||||||
"$ref": "embedfs://job-metric-data.schema.json"
|
|
||||||
}
|
|
||||||
},
|
|
||||||
"required": [
|
|
||||||
"accelerator"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
"acc_mem_used": {
|
|
||||||
"description": "GPU memory capacity used",
|
|
||||||
"properties": {
|
|
||||||
"accelerator": {
|
|
||||||
"$ref": "embedfs://job-metric-data.schema.json"
|
|
||||||
}
|
|
||||||
},
|
|
||||||
"required": [
|
|
||||||
"accelerator"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
"acc_power": {
|
|
||||||
"description": "GPU power consumption",
|
|
||||||
"properties": {
|
|
||||||
"accelerator": {
|
|
||||||
"$ref": "embedfs://job-metric-data.schema.json"
|
|
||||||
}
|
|
||||||
},
|
|
||||||
"required": [
|
|
||||||
"accelerator"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
"clock": {
|
|
||||||
"description": "Average core frequency",
|
|
||||||
"properties": {
|
|
||||||
"node": {
|
|
||||||
"$ref": "embedfs://job-metric-data.schema.json"
|
|
||||||
},
|
|
||||||
"socket": {
|
|
||||||
"$ref": "embedfs://job-metric-data.schema.json"
|
|
||||||
},
|
|
||||||
"memoryDomain": {
|
|
||||||
"$ref": "embedfs://job-metric-data.schema.json"
|
|
||||||
},
|
|
||||||
"core": {
|
|
||||||
"$ref": "embedfs://job-metric-data.schema.json"
|
|
||||||
},
|
|
||||||
"hwthread": {
|
|
||||||
"$ref": "embedfs://job-metric-data.schema.json"
|
|
||||||
}
|
|
||||||
},
|
|
||||||
"minProperties": 1
|
|
||||||
},
|
|
||||||
"eth_read_bw": {
|
|
||||||
"description": "Ethernet read bandwidth",
|
|
||||||
"properties": {
|
|
||||||
"node": {
|
|
||||||
"$ref": "embedfs://job-metric-data.schema.json"
|
|
||||||
}
|
|
||||||
},
|
|
||||||
"required": [
|
|
||||||
"node"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
"eth_write_bw": {
|
|
||||||
"description": "Ethernet write bandwidth",
|
|
||||||
"properties": {
|
|
||||||
"node": {
|
|
||||||
"$ref": "embedfs://job-metric-data.schema.json"
|
|
||||||
}
|
|
||||||
},
|
|
||||||
"required": [
|
|
||||||
"node"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
"filesystems": {
|
|
||||||
"description": "Array of filesystems",
|
|
||||||
"type": "array",
|
|
||||||
"items": {
|
|
||||||
"type": "object",
|
|
||||||
"properties": {
|
|
||||||
"name": {
|
|
||||||
"type": "string"
|
|
||||||
},
|
|
||||||
"type": {
|
|
||||||
"type": "string",
|
|
||||||
"enum": [
|
|
||||||
"nfs",
|
|
||||||
"lustre",
|
|
||||||
"gpfs",
|
|
||||||
"nvme",
|
|
||||||
"ssd",
|
|
||||||
"hdd",
|
|
||||||
"beegfs"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
"read_bw": {
|
|
||||||
"description": "File system read bandwidth",
|
|
||||||
"properties": {
|
|
||||||
"node": {
|
|
||||||
"$ref": "embedfs://job-metric-data.schema.json"
|
|
||||||
}
|
|
||||||
},
|
|
||||||
"required": [
|
|
||||||
"node"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
"write_bw": {
|
|
||||||
"description": "File system write bandwidth",
|
|
||||||
"properties": {
|
|
||||||
"node": {
|
|
||||||
"$ref": "embedfs://job-metric-data.schema.json"
|
|
||||||
}
|
|
||||||
},
|
|
||||||
"required": [
|
|
||||||
"node"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
"read_req": {
|
|
||||||
"description": "File system read requests",
|
|
||||||
"properties": {
|
|
||||||
"node": {
|
|
||||||
"$ref": "embedfs://job-metric-data.schema.json"
|
|
||||||
}
|
|
||||||
},
|
|
||||||
"required": [
|
|
||||||
"node"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
"write_req": {
|
|
||||||
"description": "File system write requests",
|
|
||||||
"properties": {
|
|
||||||
"node": {
|
|
||||||
"$ref": "embedfs://job-metric-data.schema.json"
|
|
||||||
}
|
|
||||||
},
|
|
||||||
"required": [
|
|
||||||
"node"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
"inodes": {
|
|
||||||
"description": "File system write requests",
|
|
||||||
"properties": {
|
|
||||||
"node": {
|
|
||||||
"$ref": "embedfs://job-metric-data.schema.json"
|
|
||||||
}
|
|
||||||
},
|
|
||||||
"required": [
|
|
||||||
"node"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
"accesses": {
|
|
||||||
"description": "File system open and close",
|
|
||||||
"properties": {
|
|
||||||
"node": {
|
|
||||||
"$ref": "embedfs://job-metric-data.schema.json"
|
|
||||||
}
|
|
||||||
},
|
|
||||||
"required": [
|
|
||||||
"node"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
"fsync": {
|
|
||||||
"description": "File system fsync",
|
|
||||||
"properties": {
|
|
||||||
"node": {
|
|
||||||
"$ref": "embedfs://job-metric-data.schema.json"
|
|
||||||
}
|
|
||||||
},
|
|
||||||
"required": [
|
|
||||||
"node"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
"create": {
|
|
||||||
"description": "File system create",
|
|
||||||
"properties": {
|
|
||||||
"node": {
|
|
||||||
"$ref": "embedfs://job-metric-data.schema.json"
|
|
||||||
}
|
|
||||||
},
|
|
||||||
"required": [
|
|
||||||
"node"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
"open": {
|
|
||||||
"description": "File system open",
|
|
||||||
"properties": {
|
|
||||||
"node": {
|
|
||||||
"$ref": "embedfs://job-metric-data.schema.json"
|
|
||||||
}
|
|
||||||
},
|
|
||||||
"required": [
|
|
||||||
"node"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
"close": {
|
|
||||||
"description": "File system close",
|
|
||||||
"properties": {
|
|
||||||
"node": {
|
|
||||||
"$ref": "embedfs://job-metric-data.schema.json"
|
|
||||||
}
|
|
||||||
},
|
|
||||||
"required": [
|
|
||||||
"node"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
"seek": {
|
|
||||||
"description": "File system seek",
|
|
||||||
"properties": {
|
|
||||||
"node": {
|
|
||||||
"$ref": "embedfs://job-metric-data.schema.json"
|
|
||||||
}
|
|
||||||
},
|
|
||||||
"required": [
|
|
||||||
"node"
|
|
||||||
]
|
|
||||||
}
|
|
||||||
},
|
|
||||||
"required": [
|
|
||||||
"name",
|
|
||||||
"type",
|
|
||||||
"read_bw",
|
|
||||||
"write_bw"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
"minItems": 1
|
|
||||||
}
|
}
|
||||||
|
},
|
||||||
|
"required": [
|
||||||
|
"node"
|
||||||
|
]
|
||||||
},
|
},
|
||||||
"ic_rcv_packets": {
|
"flops_any": {
|
||||||
"description": "Network interconnect read packets",
|
"description": "Total flop rate with DP flops scaled up",
|
||||||
|
"properties": {
|
||||||
|
"node": {
|
||||||
|
"$ref": "embedfs://job-metric-data.schema.json"
|
||||||
|
},
|
||||||
|
"socket": {
|
||||||
|
"$ref": "embedfs://job-metric-data.schema.json"
|
||||||
|
},
|
||||||
|
"memoryDomain": {
|
||||||
|
"$ref": "embedfs://job-metric-data.schema.json"
|
||||||
|
},
|
||||||
|
"core": {
|
||||||
|
"$ref": "embedfs://job-metric-data.schema.json"
|
||||||
|
},
|
||||||
|
"hwthread": {
|
||||||
|
"$ref": "embedfs://job-metric-data.schema.json"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"minProperties": 1
|
||||||
|
},
|
||||||
|
"mem_bw": {
|
||||||
|
"description": "Main memory bandwidth",
|
||||||
|
"properties": {
|
||||||
|
"node": {
|
||||||
|
"$ref": "embedfs://job-metric-data.schema.json"
|
||||||
|
},
|
||||||
|
"socket": {
|
||||||
|
"$ref": "embedfs://job-metric-data.schema.json"
|
||||||
|
},
|
||||||
|
"memoryDomain": {
|
||||||
|
"$ref": "embedfs://job-metric-data.schema.json"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"minProperties": 1
|
||||||
|
},
|
||||||
|
"net_bw": {
|
||||||
|
"description": "Total fast interconnect network bandwidth",
|
||||||
|
"type": "object",
|
||||||
|
"properties": {
|
||||||
|
"node": {
|
||||||
|
"$ref": "embedfs://job-metric-data.schema.json"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"required": [
|
||||||
|
"node"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"ipc": {
|
||||||
|
"description": "Instructions executed per cycle",
|
||||||
|
"properties": {
|
||||||
|
"node": {
|
||||||
|
"$ref": "embedfs://job-metric-data.schema.json"
|
||||||
|
},
|
||||||
|
"socket": {
|
||||||
|
"$ref": "embedfs://job-metric-data.schema.json"
|
||||||
|
},
|
||||||
|
"memoryDomain": {
|
||||||
|
"$ref": "embedfs://job-metric-data.schema.json"
|
||||||
|
},
|
||||||
|
"core": {
|
||||||
|
"$ref": "embedfs://job-metric-data.schema.json"
|
||||||
|
},
|
||||||
|
"hwthread": {
|
||||||
|
"$ref": "embedfs://job-metric-data.schema.json"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"minProperties": 1
|
||||||
|
},
|
||||||
|
"cpu_user": {
|
||||||
|
"description": "CPU user active core utilization",
|
||||||
|
"properties": {
|
||||||
|
"node": {
|
||||||
|
"$ref": "embedfs://job-metric-data.schema.json"
|
||||||
|
},
|
||||||
|
"socket": {
|
||||||
|
"$ref": "embedfs://job-metric-data.schema.json"
|
||||||
|
},
|
||||||
|
"memoryDomain": {
|
||||||
|
"$ref": "embedfs://job-metric-data.schema.json"
|
||||||
|
},
|
||||||
|
"core": {
|
||||||
|
"$ref": "embedfs://job-metric-data.schema.json"
|
||||||
|
},
|
||||||
|
"hwthread": {
|
||||||
|
"$ref": "embedfs://job-metric-data.schema.json"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"minProperties": 1
|
||||||
|
},
|
||||||
|
"cpu_load": {
|
||||||
|
"description": "CPU requested core utilization (load 1m)",
|
||||||
|
"properties": {
|
||||||
|
"node": {
|
||||||
|
"$ref": "embedfs://job-metric-data.schema.json"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"required": [
|
||||||
|
"node"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"flops_dp": {
|
||||||
|
"description": "Double precision flop rate",
|
||||||
|
"properties": {
|
||||||
|
"node": {
|
||||||
|
"$ref": "embedfs://job-metric-data.schema.json"
|
||||||
|
},
|
||||||
|
"socket": {
|
||||||
|
"$ref": "embedfs://job-metric-data.schema.json"
|
||||||
|
},
|
||||||
|
"memoryDomain": {
|
||||||
|
"$ref": "embedfs://job-metric-data.schema.json"
|
||||||
|
},
|
||||||
|
"core": {
|
||||||
|
"$ref": "embedfs://job-metric-data.schema.json"
|
||||||
|
},
|
||||||
|
"hwthread": {
|
||||||
|
"$ref": "embedfs://job-metric-data.schema.json"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"minProperties": 1
|
||||||
|
},
|
||||||
|
"flops_sp": {
|
||||||
|
"description": "Single precision flops rate",
|
||||||
|
"properties": {
|
||||||
|
"node": {
|
||||||
|
"$ref": "embedfs://job-metric-data.schema.json"
|
||||||
|
},
|
||||||
|
"socket": {
|
||||||
|
"$ref": "embedfs://job-metric-data.schema.json"
|
||||||
|
},
|
||||||
|
"memoryDomain": {
|
||||||
|
"$ref": "embedfs://job-metric-data.schema.json"
|
||||||
|
},
|
||||||
|
"core": {
|
||||||
|
"$ref": "embedfs://job-metric-data.schema.json"
|
||||||
|
},
|
||||||
|
"hwthread": {
|
||||||
|
"$ref": "embedfs://job-metric-data.schema.json"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"minProperties": 1
|
||||||
|
},
|
||||||
|
"vectorization_ratio": {
|
||||||
|
"description": "Fraction of arithmetic instructions using SIMD instructions",
|
||||||
|
"properties": {
|
||||||
|
"node": {
|
||||||
|
"$ref": "embedfs://job-metric-data.schema.json"
|
||||||
|
},
|
||||||
|
"socket": {
|
||||||
|
"$ref": "embedfs://job-metric-data.schema.json"
|
||||||
|
},
|
||||||
|
"memoryDomain": {
|
||||||
|
"$ref": "embedfs://job-metric-data.schema.json"
|
||||||
|
},
|
||||||
|
"core": {
|
||||||
|
"$ref": "embedfs://job-metric-data.schema.json"
|
||||||
|
},
|
||||||
|
"hwthread": {
|
||||||
|
"$ref": "embedfs://job-metric-data.schema.json"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"minProperties": 1
|
||||||
|
},
|
||||||
|
"cpu_power": {
|
||||||
|
"description": "CPU power consumption",
|
||||||
|
"properties": {
|
||||||
|
"node": {
|
||||||
|
"$ref": "embedfs://job-metric-data.schema.json"
|
||||||
|
},
|
||||||
|
"socket": {
|
||||||
|
"$ref": "embedfs://job-metric-data.schema.json"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"minProperties": 1
|
||||||
|
},
|
||||||
|
"mem_power": {
|
||||||
|
"description": "Memory power consumption",
|
||||||
|
"properties": {
|
||||||
|
"node": {
|
||||||
|
"$ref": "embedfs://job-metric-data.schema.json"
|
||||||
|
},
|
||||||
|
"socket": {
|
||||||
|
"$ref": "embedfs://job-metric-data.schema.json"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"minProperties": 1
|
||||||
|
},
|
||||||
|
"acc_utilization": {
|
||||||
|
"description": "GPU utilization",
|
||||||
|
"properties": {
|
||||||
|
"accelerator": {
|
||||||
|
"$ref": "embedfs://job-metric-data.schema.json"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"required": [
|
||||||
|
"accelerator"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"acc_mem_used": {
|
||||||
|
"description": "GPU memory capacity used",
|
||||||
|
"properties": {
|
||||||
|
"accelerator": {
|
||||||
|
"$ref": "embedfs://job-metric-data.schema.json"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"required": [
|
||||||
|
"accelerator"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"acc_power": {
|
||||||
|
"description": "GPU power consumption",
|
||||||
|
"properties": {
|
||||||
|
"accelerator": {
|
||||||
|
"$ref": "embedfs://job-metric-data.schema.json"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"required": [
|
||||||
|
"accelerator"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"clock": {
|
||||||
|
"description": "Average core frequency",
|
||||||
|
"properties": {
|
||||||
|
"node": {
|
||||||
|
"$ref": "embedfs://job-metric-data.schema.json"
|
||||||
|
},
|
||||||
|
"socket": {
|
||||||
|
"$ref": "embedfs://job-metric-data.schema.json"
|
||||||
|
},
|
||||||
|
"memoryDomain": {
|
||||||
|
"$ref": "embedfs://job-metric-data.schema.json"
|
||||||
|
},
|
||||||
|
"core": {
|
||||||
|
"$ref": "embedfs://job-metric-data.schema.json"
|
||||||
|
},
|
||||||
|
"hwthread": {
|
||||||
|
"$ref": "embedfs://job-metric-data.schema.json"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"minProperties": 1
|
||||||
|
},
|
||||||
|
"eth_read_bw": {
|
||||||
|
"description": "Ethernet read bandwidth",
|
||||||
|
"properties": {
|
||||||
|
"node": {
|
||||||
|
"$ref": "embedfs://job-metric-data.schema.json"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"required": [
|
||||||
|
"node"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"eth_write_bw": {
|
||||||
|
"description": "Ethernet write bandwidth",
|
||||||
|
"properties": {
|
||||||
|
"node": {
|
||||||
|
"$ref": "embedfs://job-metric-data.schema.json"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"required": [
|
||||||
|
"node"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"filesystems": {
|
||||||
|
"description": "Array of filesystems",
|
||||||
|
"type": "array",
|
||||||
|
"items": {
|
||||||
|
"type": "object",
|
||||||
"properties": {
|
"properties": {
|
||||||
"node": {
|
"name": {
|
||||||
|
"type": "string"
|
||||||
|
},
|
||||||
|
"type": {
|
||||||
|
"type": "string",
|
||||||
|
"enum": [
|
||||||
|
"nfs",
|
||||||
|
"lustre",
|
||||||
|
"gpfs",
|
||||||
|
"nvme",
|
||||||
|
"ssd",
|
||||||
|
"hdd",
|
||||||
|
"beegfs"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"read_bw": {
|
||||||
|
"description": "File system read bandwidth",
|
||||||
|
"properties": {
|
||||||
|
"node": {
|
||||||
"$ref": "embedfs://job-metric-data.schema.json"
|
"$ref": "embedfs://job-metric-data.schema.json"
|
||||||
}
|
}
|
||||||
|
},
|
||||||
|
"required": [
|
||||||
|
"node"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"write_bw": {
|
||||||
|
"description": "File system write bandwidth",
|
||||||
|
"properties": {
|
||||||
|
"node": {
|
||||||
|
"$ref": "embedfs://job-metric-data.schema.json"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"required": [
|
||||||
|
"node"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"read_req": {
|
||||||
|
"description": "File system read requests",
|
||||||
|
"properties": {
|
||||||
|
"node": {
|
||||||
|
"$ref": "embedfs://job-metric-data.schema.json"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"required": [
|
||||||
|
"node"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"write_req": {
|
||||||
|
"description": "File system write requests",
|
||||||
|
"properties": {
|
||||||
|
"node": {
|
||||||
|
"$ref": "embedfs://job-metric-data.schema.json"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"required": [
|
||||||
|
"node"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"inodes": {
|
||||||
|
"description": "File system write requests",
|
||||||
|
"properties": {
|
||||||
|
"node": {
|
||||||
|
"$ref": "embedfs://job-metric-data.schema.json"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"required": [
|
||||||
|
"node"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"accesses": {
|
||||||
|
"description": "File system open and close",
|
||||||
|
"properties": {
|
||||||
|
"node": {
|
||||||
|
"$ref": "embedfs://job-metric-data.schema.json"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"required": [
|
||||||
|
"node"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"fsync": {
|
||||||
|
"description": "File system fsync",
|
||||||
|
"properties": {
|
||||||
|
"node": {
|
||||||
|
"$ref": "embedfs://job-metric-data.schema.json"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"required": [
|
||||||
|
"node"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"create": {
|
||||||
|
"description": "File system create",
|
||||||
|
"properties": {
|
||||||
|
"node": {
|
||||||
|
"$ref": "embedfs://job-metric-data.schema.json"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"required": [
|
||||||
|
"node"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"open": {
|
||||||
|
"description": "File system open",
|
||||||
|
"properties": {
|
||||||
|
"node": {
|
||||||
|
"$ref": "embedfs://job-metric-data.schema.json"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"required": [
|
||||||
|
"node"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"close": {
|
||||||
|
"description": "File system close",
|
||||||
|
"properties": {
|
||||||
|
"node": {
|
||||||
|
"$ref": "embedfs://job-metric-data.schema.json"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"required": [
|
||||||
|
"node"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"seek": {
|
||||||
|
"description": "File system seek",
|
||||||
|
"properties": {
|
||||||
|
"node": {
|
||||||
|
"$ref": "embedfs://job-metric-data.schema.json"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"required": [
|
||||||
|
"node"
|
||||||
|
]
|
||||||
|
}
|
||||||
},
|
},
|
||||||
"required": [
|
"required": [
|
||||||
"node"
|
"name",
|
||||||
]
|
"type",
|
||||||
},
|
"read_bw",
|
||||||
"ic_send_packets": {
|
"write_bw"
|
||||||
"description": "Network interconnect send packet",
|
|
||||||
"properties": {
|
|
||||||
"node": {
|
|
||||||
"$ref": "embedfs://job-metric-data.schema.json"
|
|
||||||
}
|
|
||||||
},
|
|
||||||
"required": [
|
|
||||||
"node"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
"ic_read_bw": {
|
|
||||||
"description": "Network interconnect read bandwidth",
|
|
||||||
"properties": {
|
|
||||||
"node": {
|
|
||||||
"$ref": "embedfs://job-metric-data.schema.json"
|
|
||||||
}
|
|
||||||
},
|
|
||||||
"required": [
|
|
||||||
"node"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
"ic_write_bw": {
|
|
||||||
"description": "Network interconnect write bandwidth",
|
|
||||||
"properties": {
|
|
||||||
"node": {
|
|
||||||
"$ref": "embedfs://job-metric-data.schema.json"
|
|
||||||
}
|
|
||||||
},
|
|
||||||
"required": [
|
|
||||||
"node"
|
|
||||||
]
|
]
|
||||||
|
},
|
||||||
|
"minItems": 1
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"ic_rcv_packets": {
|
||||||
|
"description": "Network interconnect read packets",
|
||||||
|
"properties": {
|
||||||
|
"node": {
|
||||||
|
"$ref": "embedfs://job-metric-data.schema.json"
|
||||||
|
}
|
||||||
},
|
},
|
||||||
"required": [
|
"required": [
|
||||||
"cpu_user",
|
"node"
|
||||||
"cpu_load",
|
|
||||||
"mem_used",
|
|
||||||
"flops_any",
|
|
||||||
"mem_bw",
|
|
||||||
"net_bw",
|
|
||||||
"filesystems"
|
|
||||||
]
|
]
|
||||||
|
},
|
||||||
|
"ic_send_packets": {
|
||||||
|
"description": "Network interconnect send packet",
|
||||||
|
"properties": {
|
||||||
|
"node": {
|
||||||
|
"$ref": "embedfs://job-metric-data.schema.json"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"required": [
|
||||||
|
"node"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"ic_read_bw": {
|
||||||
|
"description": "Network interconnect read bandwidth",
|
||||||
|
"properties": {
|
||||||
|
"node": {
|
||||||
|
"$ref": "embedfs://job-metric-data.schema.json"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"required": [
|
||||||
|
"node"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"ic_write_bw": {
|
||||||
|
"description": "Network interconnect write bandwidth",
|
||||||
|
"properties": {
|
||||||
|
"node": {
|
||||||
|
"$ref": "embedfs://job-metric-data.schema.json"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"required": [
|
||||||
|
"node"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"required": [
|
||||||
|
"cpu_user",
|
||||||
|
"cpu_load",
|
||||||
|
"mem_used",
|
||||||
|
"flops_any",
|
||||||
|
"mem_bw",
|
||||||
|
"net_bw",
|
||||||
|
"filesystems"
|
||||||
|
]
|
||||||
}
|
}
|
||||||
|
@ -1,351 +1,351 @@
|
|||||||
{
|
{
|
||||||
"$schema": "http://json-schema.org/draft/2020-12/schema",
|
"$schema": "http://json-schema.org/draft/2020-12/schema",
|
||||||
"$id": "embedfs://job-meta.schema.json",
|
"$id": "embedfs://job-meta.schema.json",
|
||||||
"title": "Job meta data",
|
"title": "Job meta data",
|
||||||
"description": "Meta data information of a HPC job",
|
"description": "Meta data information of a HPC job",
|
||||||
"type": "object",
|
"type": "object",
|
||||||
"properties": {
|
"properties": {
|
||||||
"jobId": {
|
"jobId": {
|
||||||
"description": "The unique identifier of a job",
|
"description": "The unique identifier of a job",
|
||||||
"type": "integer"
|
"type": "integer"
|
||||||
},
|
},
|
||||||
"user": {
|
"user": {
|
||||||
"description": "The unique identifier of a user",
|
"description": "The unique identifier of a user",
|
||||||
|
"type": "string"
|
||||||
|
},
|
||||||
|
"project": {
|
||||||
|
"description": "The unique identifier of a project",
|
||||||
|
"type": "string"
|
||||||
|
},
|
||||||
|
"cluster": {
|
||||||
|
"description": "The unique identifier of a cluster",
|
||||||
|
"type": "string"
|
||||||
|
},
|
||||||
|
"subCluster": {
|
||||||
|
"description": "The unique identifier of a sub cluster",
|
||||||
|
"type": "string"
|
||||||
|
},
|
||||||
|
"partition": {
|
||||||
|
"description": "The Slurm partition to which the job was submitted",
|
||||||
|
"type": "string"
|
||||||
|
},
|
||||||
|
"arrayJobId": {
|
||||||
|
"description": "The unique identifier of an array job",
|
||||||
|
"type": "integer"
|
||||||
|
},
|
||||||
|
"numNodes": {
|
||||||
|
"description": "Number of nodes used",
|
||||||
|
"type": "integer",
|
||||||
|
"exclusiveMinimum": 0
|
||||||
|
},
|
||||||
|
"numHwthreads": {
|
||||||
|
"description": "Number of HWThreads used",
|
||||||
|
"type": "integer",
|
||||||
|
"exclusiveMinimum": 0
|
||||||
|
},
|
||||||
|
"numAcc": {
|
||||||
|
"description": "Number of accelerators used",
|
||||||
|
"type": "integer",
|
||||||
|
"exclusiveMinimum": 0
|
||||||
|
},
|
||||||
|
"exclusive": {
|
||||||
|
"description": "Specifies how nodes are shared. 0 - Shared among multiple jobs of multiple users, 1 - Job exclusive, 2 - Shared among multiple jobs of same user",
|
||||||
|
"type": "integer",
|
||||||
|
"minimum": 0,
|
||||||
|
"maximum": 2
|
||||||
|
},
|
||||||
|
"monitoringStatus": {
|
||||||
|
"description": "State of monitoring system during job run",
|
||||||
|
"type": "integer"
|
||||||
|
},
|
||||||
|
"smt": {
|
||||||
|
"description": "SMT threads used by job",
|
||||||
|
"type": "integer"
|
||||||
|
},
|
||||||
|
"walltime": {
|
||||||
|
"description": "Requested walltime of job in seconds",
|
||||||
|
"type": "integer",
|
||||||
|
"exclusiveMinimum": 0
|
||||||
|
},
|
||||||
|
"jobState": {
|
||||||
|
"description": "Final state of job",
|
||||||
|
"type": "string",
|
||||||
|
"enum": [
|
||||||
|
"completed",
|
||||||
|
"failed",
|
||||||
|
"cancelled",
|
||||||
|
"stopped",
|
||||||
|
"out_of_memory",
|
||||||
|
"timeout"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"startTime": {
|
||||||
|
"description": "Start epoch time stamp in seconds",
|
||||||
|
"type": "integer",
|
||||||
|
"exclusiveMinimum": 0
|
||||||
|
},
|
||||||
|
"duration": {
|
||||||
|
"description": "Duration of job in seconds",
|
||||||
|
"type": "integer",
|
||||||
|
"exclusiveMinimum": 0
|
||||||
|
},
|
||||||
|
"resources": {
|
||||||
|
"description": "Resources used by job",
|
||||||
|
"type": "array",
|
||||||
|
"items": {
|
||||||
|
"type": "object",
|
||||||
|
"properties": {
|
||||||
|
"hostname": {
|
||||||
"type": "string"
|
"type": "string"
|
||||||
},
|
},
|
||||||
"project": {
|
"hwthreads": {
|
||||||
"description": "The unique identifier of a project",
|
"type": "array",
|
||||||
"type": "string"
|
"description": "List of OS processor ids",
|
||||||
},
|
"items": {
|
||||||
"cluster": {
|
"type": "integer"
|
||||||
"description": "The unique identifier of a cluster",
|
}
|
||||||
"type": "string"
|
},
|
||||||
},
|
"accelerators": {
|
||||||
"subCluster": {
|
"type": "array",
|
||||||
"description": "The unique identifier of a sub cluster",
|
"description": "List of of accelerator device ids",
|
||||||
"type": "string"
|
"items": {
|
||||||
},
|
"type": "string"
|
||||||
"partition": {
|
}
|
||||||
"description": "The Slurm partition to which the job was submitted",
|
},
|
||||||
"type": "string"
|
"configuration": {
|
||||||
},
|
|
||||||
"arrayJobId": {
|
|
||||||
"description": "The unique identifier of an array job",
|
|
||||||
"type": "integer"
|
|
||||||
},
|
|
||||||
"numNodes": {
|
|
||||||
"description": "Number of nodes used",
|
|
||||||
"type": "integer",
|
|
||||||
"exclusiveMinimum": 0
|
|
||||||
},
|
|
||||||
"numHwthreads": {
|
|
||||||
"description": "Number of HWThreads used",
|
|
||||||
"type": "integer",
|
|
||||||
"exclusiveMinimum": 0
|
|
||||||
},
|
|
||||||
"numAcc": {
|
|
||||||
"description": "Number of accelerators used",
|
|
||||||
"type": "integer",
|
|
||||||
"exclusiveMinimum": 0
|
|
||||||
},
|
|
||||||
"exclusive": {
|
|
||||||
"description": "Specifies how nodes are shared. 0 - Shared among multiple jobs of multiple users, 1 - Job exclusive, 2 - Shared among multiple jobs of same user",
|
|
||||||
"type": "integer",
|
|
||||||
"minimum": 0,
|
|
||||||
"maximum": 2
|
|
||||||
},
|
|
||||||
"monitoringStatus": {
|
|
||||||
"description": "State of monitoring system during job run",
|
|
||||||
"type": "integer"
|
|
||||||
},
|
|
||||||
"smt": {
|
|
||||||
"description": "SMT threads used by job",
|
|
||||||
"type": "integer"
|
|
||||||
},
|
|
||||||
"walltime": {
|
|
||||||
"description": "Requested walltime of job in seconds",
|
|
||||||
"type": "integer",
|
|
||||||
"exclusiveMinimum": 0
|
|
||||||
},
|
|
||||||
"jobState": {
|
|
||||||
"description": "Final state of job",
|
|
||||||
"type": "string",
|
"type": "string",
|
||||||
"enum": [
|
"description": "The configuration options of the node"
|
||||||
"completed",
|
}
|
||||||
"failed",
|
|
||||||
"cancelled",
|
|
||||||
"stopped",
|
|
||||||
"out_of_memory",
|
|
||||||
"timeout"
|
|
||||||
]
|
|
||||||
},
|
},
|
||||||
"startTime": {
|
"required": [
|
||||||
"description": "Start epoch time stamp in seconds",
|
"hostname"
|
||||||
"type": "integer",
|
],
|
||||||
"exclusiveMinimum": 0
|
"minItems": 1
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"metaData": {
|
||||||
|
"description": "Additional information about the job",
|
||||||
|
"type": "object",
|
||||||
|
"properties": {
|
||||||
|
"jobScript": {
|
||||||
|
"type": "string",
|
||||||
|
"description": "The batch script of the job"
|
||||||
},
|
},
|
||||||
"duration": {
|
"jobName": {
|
||||||
"description": "Duration of job in seconds",
|
"type": "string",
|
||||||
"type": "integer",
|
"description": "Slurm Job name"
|
||||||
"exclusiveMinimum": 0
|
|
||||||
},
|
},
|
||||||
"resources": {
|
"slurmInfo": {
|
||||||
"description": "Resources used by job",
|
"type": "string",
|
||||||
"type": "array",
|
"description": "Additional slurm infos as show by scontrol show job"
|
||||||
"items": {
|
}
|
||||||
"type": "object",
|
}
|
||||||
"properties": {
|
},
|
||||||
"hostname": {
|
"tags": {
|
||||||
"type": "string"
|
"description": "List of tags",
|
||||||
},
|
"type": "array",
|
||||||
"hwthreads": {
|
"items": {
|
||||||
"type": "array",
|
"type": "object",
|
||||||
"description": "List of OS processor ids",
|
"properties": {
|
||||||
"items": {
|
"name": {
|
||||||
"type": "integer"
|
"type": "string"
|
||||||
}
|
},
|
||||||
},
|
"type": {
|
||||||
"accelerators": {
|
"type": "string"
|
||||||
"type": "array",
|
}
|
||||||
"description": "List of of accelerator device ids",
|
|
||||||
"items": {
|
|
||||||
"type": "string"
|
|
||||||
}
|
|
||||||
},
|
|
||||||
"configuration": {
|
|
||||||
"type": "string",
|
|
||||||
"description": "The configuration options of the node"
|
|
||||||
}
|
|
||||||
},
|
|
||||||
"required": [
|
|
||||||
"hostname"
|
|
||||||
],
|
|
||||||
"minItems": 1
|
|
||||||
}
|
|
||||||
},
|
},
|
||||||
"metaData": {
|
"required": [
|
||||||
"description": "Additional information about the job",
|
"name",
|
||||||
|
"type"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"uniqueItems": true
|
||||||
|
},
|
||||||
|
"statistics": {
|
||||||
|
"description": "Job statistic data",
|
||||||
|
"type": "object",
|
||||||
|
"properties": {
|
||||||
|
"mem_used": {
|
||||||
|
"description": "Memory capacity used (required)",
|
||||||
|
"$ref": "embedfs://job-metric-statistics.schema.json"
|
||||||
|
},
|
||||||
|
"cpu_load": {
|
||||||
|
"description": "CPU requested core utilization (load 1m) (required)",
|
||||||
|
"$ref": "embedfs://job-metric-statistics.schema.json"
|
||||||
|
},
|
||||||
|
"flops_any": {
|
||||||
|
"description": "Total flop rate with DP flops scaled up (required)",
|
||||||
|
"$ref": "embedfs://job-metric-statistics.schema.json"
|
||||||
|
},
|
||||||
|
"mem_bw": {
|
||||||
|
"description": "Main memory bandwidth (required)",
|
||||||
|
"$ref": "embedfs://job-metric-statistics.schema.json"
|
||||||
|
},
|
||||||
|
"net_bw": {
|
||||||
|
"description": "Total fast interconnect network bandwidth (required)",
|
||||||
|
"$ref": "embedfs://job-metric-statistics.schema.json"
|
||||||
|
},
|
||||||
|
"file_bw": {
|
||||||
|
"description": "Total file IO bandwidth (required)",
|
||||||
|
"$ref": "embedfs://job-metric-statistics.schema.json"
|
||||||
|
},
|
||||||
|
"ipc": {
|
||||||
|
"description": "Instructions executed per cycle",
|
||||||
|
"$ref": "embedfs://job-metric-statistics.schema.json"
|
||||||
|
},
|
||||||
|
"cpu_user": {
|
||||||
|
"description": "CPU user active core utilization",
|
||||||
|
"$ref": "embedfs://job-metric-statistics.schema.json"
|
||||||
|
},
|
||||||
|
"flops_dp": {
|
||||||
|
"description": "Double precision flop rate",
|
||||||
|
"$ref": "embedfs://job-metric-statistics.schema.json"
|
||||||
|
},
|
||||||
|
"flops_sp": {
|
||||||
|
"description": "Single precision flops rate",
|
||||||
|
"$ref": "embedfs://job-metric-statistics.schema.json"
|
||||||
|
},
|
||||||
|
"rapl_power": {
|
||||||
|
"description": "CPU power consumption",
|
||||||
|
"$ref": "embedfs://job-metric-statistics.schema.json"
|
||||||
|
},
|
||||||
|
"acc_used": {
|
||||||
|
"description": "GPU utilization",
|
||||||
|
"$ref": "embedfs://job-metric-statistics.schema.json"
|
||||||
|
},
|
||||||
|
"acc_mem_used": {
|
||||||
|
"description": "GPU memory capacity used",
|
||||||
|
"$ref": "embedfs://job-metric-statistics.schema.json"
|
||||||
|
},
|
||||||
|
"acc_power": {
|
||||||
|
"description": "GPU power consumption",
|
||||||
|
"$ref": "embedfs://job-metric-statistics.schema.json"
|
||||||
|
},
|
||||||
|
"clock": {
|
||||||
|
"description": "Average core frequency",
|
||||||
|
"$ref": "embedfs://job-metric-statistics.schema.json"
|
||||||
|
},
|
||||||
|
"eth_read_bw": {
|
||||||
|
"description": "Ethernet read bandwidth",
|
||||||
|
"$ref": "embedfs://job-metric-statistics.schema.json"
|
||||||
|
},
|
||||||
|
"eth_write_bw": {
|
||||||
|
"description": "Ethernet write bandwidth",
|
||||||
|
"$ref": "embedfs://job-metric-statistics.schema.json"
|
||||||
|
},
|
||||||
|
"ic_rcv_packets": {
|
||||||
|
"description": "Network interconnect read packets",
|
||||||
|
"$ref": "embedfs://job-metric-statistics.schema.json"
|
||||||
|
},
|
||||||
|
"ic_send_packets": {
|
||||||
|
"description": "Network interconnect send packet",
|
||||||
|
"$ref": "embedfs://job-metric-statistics.schema.json"
|
||||||
|
},
|
||||||
|
"ic_read_bw": {
|
||||||
|
"description": "Network interconnect read bandwidth",
|
||||||
|
"$ref": "embedfs://job-metric-statistics.schema.json"
|
||||||
|
},
|
||||||
|
"ic_write_bw": {
|
||||||
|
"description": "Network interconnect write bandwidth",
|
||||||
|
"$ref": "embedfs://job-metric-statistics.schema.json"
|
||||||
|
},
|
||||||
|
"filesystems": {
|
||||||
|
"description": "Array of filesystems",
|
||||||
|
"type": "array",
|
||||||
|
"items": {
|
||||||
"type": "object",
|
"type": "object",
|
||||||
"properties": {
|
"properties": {
|
||||||
"jobScript": {
|
"name": {
|
||||||
"type": "string",
|
"type": "string"
|
||||||
"description": "The batch script of the job"
|
},
|
||||||
},
|
"type": {
|
||||||
"jobName": {
|
"type": "string",
|
||||||
"type": "string",
|
"enum": [
|
||||||
"description": "Slurm Job name"
|
"nfs",
|
||||||
},
|
"lustre",
|
||||||
"slurmInfo": {
|
"gpfs",
|
||||||
"type": "string",
|
"nvme",
|
||||||
"description": "Additional slurm infos as show by scontrol show job"
|
"ssd",
|
||||||
}
|
"hdd",
|
||||||
}
|
"beegfs"
|
||||||
},
|
|
||||||
"tags": {
|
|
||||||
"description": "List of tags",
|
|
||||||
"type": "array",
|
|
||||||
"items": {
|
|
||||||
"type": "object",
|
|
||||||
"properties": {
|
|
||||||
"name": {
|
|
||||||
"type": "string"
|
|
||||||
},
|
|
||||||
"type": {
|
|
||||||
"type": "string"
|
|
||||||
}
|
|
||||||
},
|
|
||||||
"required": [
|
|
||||||
"name",
|
|
||||||
"type"
|
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
"uniqueItems": true
|
"read_bw": {
|
||||||
},
|
"description": "File system read bandwidth",
|
||||||
"statistics": {
|
"$ref": "embedfs://job-metric-statistics.schema.json"
|
||||||
"description": "Job statistic data",
|
},
|
||||||
"type": "object",
|
"write_bw": {
|
||||||
"properties": {
|
"description": "File system write bandwidth",
|
||||||
"mem_used": {
|
"$ref": "embedfs://job-metric-statistics.schema.json"
|
||||||
"description": "Memory capacity used (required)",
|
},
|
||||||
"$ref": "embedfs://job-metric-statistics.schema.json"
|
"read_req": {
|
||||||
},
|
"description": "File system read requests",
|
||||||
"cpu_load": {
|
"$ref": "embedfs://job-metric-statistics.schema.json"
|
||||||
"description": "CPU requested core utilization (load 1m) (required)",
|
},
|
||||||
"$ref": "embedfs://job-metric-statistics.schema.json"
|
"write_req": {
|
||||||
},
|
"description": "File system write requests",
|
||||||
"flops_any": {
|
"$ref": "embedfs://job-metric-statistics.schema.json"
|
||||||
"description": "Total flop rate with DP flops scaled up (required)",
|
},
|
||||||
"$ref": "embedfs://job-metric-statistics.schema.json"
|
"inodes": {
|
||||||
},
|
"description": "File system write requests",
|
||||||
"mem_bw": {
|
"$ref": "embedfs://job-metric-statistics.schema.json"
|
||||||
"description": "Main memory bandwidth (required)",
|
},
|
||||||
"$ref": "embedfs://job-metric-statistics.schema.json"
|
"accesses": {
|
||||||
},
|
"description": "File system open and close",
|
||||||
"net_bw": {
|
"$ref": "embedfs://job-metric-statistics.schema.json"
|
||||||
"description": "Total fast interconnect network bandwidth (required)",
|
},
|
||||||
"$ref": "embedfs://job-metric-statistics.schema.json"
|
"fsync": {
|
||||||
},
|
"description": "File system fsync",
|
||||||
"file_bw": {
|
"$ref": "embedfs://job-metric-statistics.schema.json"
|
||||||
"description": "Total file IO bandwidth (required)",
|
},
|
||||||
"$ref": "embedfs://job-metric-statistics.schema.json"
|
"create": {
|
||||||
},
|
"description": "File system create",
|
||||||
"ipc": {
|
"$ref": "embedfs://job-metric-statistics.schema.json"
|
||||||
"description": "Instructions executed per cycle",
|
},
|
||||||
"$ref": "embedfs://job-metric-statistics.schema.json"
|
"open": {
|
||||||
},
|
"description": "File system open",
|
||||||
"cpu_user": {
|
"$ref": "embedfs://job-metric-statistics.schema.json"
|
||||||
"description": "CPU user active core utilization",
|
},
|
||||||
"$ref": "embedfs://job-metric-statistics.schema.json"
|
"close": {
|
||||||
},
|
"description": "File system close",
|
||||||
"flops_dp": {
|
"$ref": "embedfs://job-metric-statistics.schema.json"
|
||||||
"description": "Double precision flop rate",
|
},
|
||||||
"$ref": "embedfs://job-metric-statistics.schema.json"
|
"seek": {
|
||||||
},
|
"description": "File system seek",
|
||||||
"flops_sp": {
|
"$ref": "embedfs://job-metric-statistics.schema.json"
|
||||||
"description": "Single precision flops rate",
|
}
|
||||||
"$ref": "embedfs://job-metric-statistics.schema.json"
|
|
||||||
},
|
|
||||||
"rapl_power": {
|
|
||||||
"description": "CPU power consumption",
|
|
||||||
"$ref": "embedfs://job-metric-statistics.schema.json"
|
|
||||||
},
|
|
||||||
"acc_used": {
|
|
||||||
"description": "GPU utilization",
|
|
||||||
"$ref": "embedfs://job-metric-statistics.schema.json"
|
|
||||||
},
|
|
||||||
"acc_mem_used": {
|
|
||||||
"description": "GPU memory capacity used",
|
|
||||||
"$ref": "embedfs://job-metric-statistics.schema.json"
|
|
||||||
},
|
|
||||||
"acc_power": {
|
|
||||||
"description": "GPU power consumption",
|
|
||||||
"$ref": "embedfs://job-metric-statistics.schema.json"
|
|
||||||
},
|
|
||||||
"clock": {
|
|
||||||
"description": "Average core frequency",
|
|
||||||
"$ref": "embedfs://job-metric-statistics.schema.json"
|
|
||||||
},
|
|
||||||
"eth_read_bw": {
|
|
||||||
"description": "Ethernet read bandwidth",
|
|
||||||
"$ref": "embedfs://job-metric-statistics.schema.json"
|
|
||||||
},
|
|
||||||
"eth_write_bw": {
|
|
||||||
"description": "Ethernet write bandwidth",
|
|
||||||
"$ref": "embedfs://job-metric-statistics.schema.json"
|
|
||||||
},
|
|
||||||
"ic_rcv_packets": {
|
|
||||||
"description": "Network interconnect read packets",
|
|
||||||
"$ref": "embedfs://job-metric-statistics.schema.json"
|
|
||||||
},
|
|
||||||
"ic_send_packets": {
|
|
||||||
"description": "Network interconnect send packet",
|
|
||||||
"$ref": "embedfs://job-metric-statistics.schema.json"
|
|
||||||
},
|
|
||||||
"ic_read_bw": {
|
|
||||||
"description": "Network interconnect read bandwidth",
|
|
||||||
"$ref": "embedfs://job-metric-statistics.schema.json"
|
|
||||||
},
|
|
||||||
"ic_write_bw": {
|
|
||||||
"description": "Network interconnect write bandwidth",
|
|
||||||
"$ref": "embedfs://job-metric-statistics.schema.json"
|
|
||||||
},
|
|
||||||
"filesystems": {
|
|
||||||
"description": "Array of filesystems",
|
|
||||||
"type": "array",
|
|
||||||
"items": {
|
|
||||||
"type": "object",
|
|
||||||
"properties": {
|
|
||||||
"name": {
|
|
||||||
"type": "string"
|
|
||||||
},
|
|
||||||
"type": {
|
|
||||||
"type": "string",
|
|
||||||
"enum": [
|
|
||||||
"nfs",
|
|
||||||
"lustre",
|
|
||||||
"gpfs",
|
|
||||||
"nvme",
|
|
||||||
"ssd",
|
|
||||||
"hdd",
|
|
||||||
"beegfs"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
"read_bw": {
|
|
||||||
"description": "File system read bandwidth",
|
|
||||||
"$ref": "embedfs://job-metric-statistics.schema.json"
|
|
||||||
},
|
|
||||||
"write_bw": {
|
|
||||||
"description": "File system write bandwidth",
|
|
||||||
"$ref": "embedfs://job-metric-statistics.schema.json"
|
|
||||||
},
|
|
||||||
"read_req": {
|
|
||||||
"description": "File system read requests",
|
|
||||||
"$ref": "embedfs://job-metric-statistics.schema.json"
|
|
||||||
},
|
|
||||||
"write_req": {
|
|
||||||
"description": "File system write requests",
|
|
||||||
"$ref": "embedfs://job-metric-statistics.schema.json"
|
|
||||||
},
|
|
||||||
"inodes": {
|
|
||||||
"description": "File system write requests",
|
|
||||||
"$ref": "embedfs://job-metric-statistics.schema.json"
|
|
||||||
},
|
|
||||||
"accesses": {
|
|
||||||
"description": "File system open and close",
|
|
||||||
"$ref": "embedfs://job-metric-statistics.schema.json"
|
|
||||||
},
|
|
||||||
"fsync": {
|
|
||||||
"description": "File system fsync",
|
|
||||||
"$ref": "embedfs://job-metric-statistics.schema.json"
|
|
||||||
},
|
|
||||||
"create": {
|
|
||||||
"description": "File system create",
|
|
||||||
"$ref": "embedfs://job-metric-statistics.schema.json"
|
|
||||||
},
|
|
||||||
"open": {
|
|
||||||
"description": "File system open",
|
|
||||||
"$ref": "embedfs://job-metric-statistics.schema.json"
|
|
||||||
},
|
|
||||||
"close": {
|
|
||||||
"description": "File system close",
|
|
||||||
"$ref": "embedfs://job-metric-statistics.schema.json"
|
|
||||||
},
|
|
||||||
"seek": {
|
|
||||||
"description": "File system seek",
|
|
||||||
"$ref": "embedfs://job-metric-statistics.schema.json"
|
|
||||||
}
|
|
||||||
},
|
|
||||||
"required": [
|
|
||||||
"name",
|
|
||||||
"type",
|
|
||||||
"read_bw",
|
|
||||||
"write_bw"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
"minItems": 1
|
|
||||||
}
|
|
||||||
},
|
},
|
||||||
"required": [
|
"required": [
|
||||||
"cpu_user",
|
"name",
|
||||||
"cpu_load",
|
"type",
|
||||||
"mem_used",
|
"read_bw",
|
||||||
"flops_any",
|
"write_bw"
|
||||||
"mem_bw"
|
|
||||||
]
|
]
|
||||||
|
},
|
||||||
|
"minItems": 1
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
"required": [
|
"required": [
|
||||||
"jobId",
|
"cpu_user",
|
||||||
"user",
|
"cpu_load",
|
||||||
"project",
|
"mem_used",
|
||||||
"cluster",
|
"flops_any",
|
||||||
"subCluster",
|
"mem_bw"
|
||||||
"numNodes",
|
]
|
||||||
"exclusive",
|
}
|
||||||
"startTime",
|
},
|
||||||
"jobState",
|
"required": [
|
||||||
"duration",
|
"jobId",
|
||||||
"resources",
|
"user",
|
||||||
"statistics"
|
"project",
|
||||||
]
|
"cluster",
|
||||||
|
"subCluster",
|
||||||
|
"numNodes",
|
||||||
|
"exclusive",
|
||||||
|
"startTime",
|
||||||
|
"jobState",
|
||||||
|
"duration",
|
||||||
|
"resources",
|
||||||
|
"statistics"
|
||||||
|
]
|
||||||
}
|
}
|
||||||
|
@ -1,216 +1,216 @@
|
|||||||
{
|
{
|
||||||
"$schema": "http://json-schema.org/draft/2020-12/schema",
|
"$schema": "http://json-schema.org/draft/2020-12/schema",
|
||||||
"$id": "embedfs://job-metric-data.schema.json",
|
"$id": "embedfs://job-metric-data.schema.json",
|
||||||
"title": "Job metric data",
|
"title": "Job metric data",
|
||||||
"description": "Metric data of a HPC job",
|
"description": "Metric data of a HPC job",
|
||||||
"type": "object",
|
"type": "object",
|
||||||
"properties": {
|
"properties": {
|
||||||
"unit": {
|
"unit": {
|
||||||
"description": "Metric unit",
|
"description": "Metric unit",
|
||||||
"$ref": "embedfs://unit.schema.json"
|
"$ref": "embedfs://unit.schema.json"
|
||||||
},
|
|
||||||
"timestep": {
|
|
||||||
"description": "Measurement interval in seconds",
|
|
||||||
"type": "integer"
|
|
||||||
},
|
|
||||||
"thresholds": {
|
|
||||||
"description": "Metric thresholds for specific system",
|
|
||||||
"type": "object",
|
|
||||||
"properties": {
|
|
||||||
"peak": {
|
|
||||||
"type": "number"
|
|
||||||
},
|
|
||||||
"normal": {
|
|
||||||
"type": "number"
|
|
||||||
},
|
|
||||||
"caution": {
|
|
||||||
"type": "number"
|
|
||||||
},
|
|
||||||
"alert": {
|
|
||||||
"type": "number"
|
|
||||||
}
|
|
||||||
}
|
|
||||||
},
|
|
||||||
"statisticsSeries": {
|
|
||||||
"type": "object",
|
|
||||||
"description": "Statistics series across topology",
|
|
||||||
"properties": {
|
|
||||||
"min": {
|
|
||||||
"type": "array",
|
|
||||||
"items": {
|
|
||||||
"type": "number",
|
|
||||||
"minimum": 0
|
|
||||||
},
|
|
||||||
"minItems": 3
|
|
||||||
},
|
|
||||||
"max": {
|
|
||||||
"type": "array",
|
|
||||||
"items": {
|
|
||||||
"type": "number",
|
|
||||||
"minimum": 0
|
|
||||||
},
|
|
||||||
"minItems": 3
|
|
||||||
},
|
|
||||||
"mean": {
|
|
||||||
"type": "array",
|
|
||||||
"items": {
|
|
||||||
"type": "number",
|
|
||||||
"minimum": 0
|
|
||||||
},
|
|
||||||
"minItems": 3
|
|
||||||
},
|
|
||||||
"percentiles": {
|
|
||||||
"type": "object",
|
|
||||||
"properties": {
|
|
||||||
"10": {
|
|
||||||
"type": "array",
|
|
||||||
"items": {
|
|
||||||
"type": "number",
|
|
||||||
"minimum": 0
|
|
||||||
},
|
|
||||||
"minItems": 3
|
|
||||||
},
|
|
||||||
"20": {
|
|
||||||
"type": "array",
|
|
||||||
"items": {
|
|
||||||
"type": "number",
|
|
||||||
"minimum": 0
|
|
||||||
},
|
|
||||||
"minItems": 3
|
|
||||||
},
|
|
||||||
"30": {
|
|
||||||
"type": "array",
|
|
||||||
"items": {
|
|
||||||
"type": "number",
|
|
||||||
"minimum": 0
|
|
||||||
},
|
|
||||||
"minItems": 3
|
|
||||||
},
|
|
||||||
"40": {
|
|
||||||
"type": "array",
|
|
||||||
"items": {
|
|
||||||
"type": "number",
|
|
||||||
"minimum": 0
|
|
||||||
},
|
|
||||||
"minItems": 3
|
|
||||||
},
|
|
||||||
"50": {
|
|
||||||
"type": "array",
|
|
||||||
"items": {
|
|
||||||
"type": "number",
|
|
||||||
"minimum": 0
|
|
||||||
},
|
|
||||||
"minItems": 3
|
|
||||||
},
|
|
||||||
"60": {
|
|
||||||
"type": "array",
|
|
||||||
"items": {
|
|
||||||
"type": "number",
|
|
||||||
"minimum": 0
|
|
||||||
},
|
|
||||||
"minItems": 3
|
|
||||||
},
|
|
||||||
"70": {
|
|
||||||
"type": "array",
|
|
||||||
"items": {
|
|
||||||
"type": "number",
|
|
||||||
"minimum": 0
|
|
||||||
},
|
|
||||||
"minItems": 3
|
|
||||||
},
|
|
||||||
"80": {
|
|
||||||
"type": "array",
|
|
||||||
"items": {
|
|
||||||
"type": "number",
|
|
||||||
"minimum": 0
|
|
||||||
},
|
|
||||||
"minItems": 3
|
|
||||||
},
|
|
||||||
"90": {
|
|
||||||
"type": "array",
|
|
||||||
"items": {
|
|
||||||
"type": "number",
|
|
||||||
"minimum": 0
|
|
||||||
},
|
|
||||||
"minItems": 3
|
|
||||||
},
|
|
||||||
"25": {
|
|
||||||
"type": "array",
|
|
||||||
"items": {
|
|
||||||
"type": "number",
|
|
||||||
"minimum": 0
|
|
||||||
},
|
|
||||||
"minItems": 3
|
|
||||||
},
|
|
||||||
"75": {
|
|
||||||
"type": "array",
|
|
||||||
"items": {
|
|
||||||
"type": "number",
|
|
||||||
"minimum": 0
|
|
||||||
},
|
|
||||||
"minItems": 3
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
},
|
|
||||||
"series": {
|
|
||||||
"type": "array",
|
|
||||||
"items": {
|
|
||||||
"type": "object",
|
|
||||||
"properties": {
|
|
||||||
"hostname": {
|
|
||||||
"type": "string"
|
|
||||||
},
|
|
||||||
"id": {
|
|
||||||
"type": "string"
|
|
||||||
},
|
|
||||||
"statistics": {
|
|
||||||
"type": "object",
|
|
||||||
"description": "Statistics across time dimension",
|
|
||||||
"properties": {
|
|
||||||
"avg": {
|
|
||||||
"description": "Series average",
|
|
||||||
"type": "number",
|
|
||||||
"minimum": 0
|
|
||||||
},
|
|
||||||
"min": {
|
|
||||||
"description": "Series minimum",
|
|
||||||
"type": "number",
|
|
||||||
"minimum": 0
|
|
||||||
},
|
|
||||||
"max": {
|
|
||||||
"description": "Series maximum",
|
|
||||||
"type": "number",
|
|
||||||
"minimum": 0
|
|
||||||
}
|
|
||||||
},
|
|
||||||
"required": [
|
|
||||||
"avg",
|
|
||||||
"min",
|
|
||||||
"max"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
"data": {
|
|
||||||
"type": "array",
|
|
||||||
"contains": {
|
|
||||||
"type": "number",
|
|
||||||
"minimum": 0
|
|
||||||
},
|
|
||||||
"minItems": 1
|
|
||||||
}
|
|
||||||
},
|
|
||||||
"required": [
|
|
||||||
"hostname",
|
|
||||||
"statistics",
|
|
||||||
"data"
|
|
||||||
]
|
|
||||||
}
|
|
||||||
}
|
|
||||||
},
|
},
|
||||||
"required": [
|
"timestep": {
|
||||||
"unit",
|
"description": "Measurement interval in seconds",
|
||||||
"timestep",
|
"type": "integer"
|
||||||
"series"
|
},
|
||||||
]
|
"thresholds": {
|
||||||
|
"description": "Metric thresholds for specific system",
|
||||||
|
"type": "object",
|
||||||
|
"properties": {
|
||||||
|
"peak": {
|
||||||
|
"type": "number"
|
||||||
|
},
|
||||||
|
"normal": {
|
||||||
|
"type": "number"
|
||||||
|
},
|
||||||
|
"caution": {
|
||||||
|
"type": "number"
|
||||||
|
},
|
||||||
|
"alert": {
|
||||||
|
"type": "number"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"statisticsSeries": {
|
||||||
|
"type": "object",
|
||||||
|
"description": "Statistics series across topology",
|
||||||
|
"properties": {
|
||||||
|
"min": {
|
||||||
|
"type": "array",
|
||||||
|
"items": {
|
||||||
|
"type": "number",
|
||||||
|
"minimum": 0
|
||||||
|
},
|
||||||
|
"minItems": 3
|
||||||
|
},
|
||||||
|
"max": {
|
||||||
|
"type": "array",
|
||||||
|
"items": {
|
||||||
|
"type": "number",
|
||||||
|
"minimum": 0
|
||||||
|
},
|
||||||
|
"minItems": 3
|
||||||
|
},
|
||||||
|
"mean": {
|
||||||
|
"type": "array",
|
||||||
|
"items": {
|
||||||
|
"type": "number",
|
||||||
|
"minimum": 0
|
||||||
|
},
|
||||||
|
"minItems": 3
|
||||||
|
},
|
||||||
|
"percentiles": {
|
||||||
|
"type": "object",
|
||||||
|
"properties": {
|
||||||
|
"10": {
|
||||||
|
"type": "array",
|
||||||
|
"items": {
|
||||||
|
"type": "number",
|
||||||
|
"minimum": 0
|
||||||
|
},
|
||||||
|
"minItems": 3
|
||||||
|
},
|
||||||
|
"20": {
|
||||||
|
"type": "array",
|
||||||
|
"items": {
|
||||||
|
"type": "number",
|
||||||
|
"minimum": 0
|
||||||
|
},
|
||||||
|
"minItems": 3
|
||||||
|
},
|
||||||
|
"30": {
|
||||||
|
"type": "array",
|
||||||
|
"items": {
|
||||||
|
"type": "number",
|
||||||
|
"minimum": 0
|
||||||
|
},
|
||||||
|
"minItems": 3
|
||||||
|
},
|
||||||
|
"40": {
|
||||||
|
"type": "array",
|
||||||
|
"items": {
|
||||||
|
"type": "number",
|
||||||
|
"minimum": 0
|
||||||
|
},
|
||||||
|
"minItems": 3
|
||||||
|
},
|
||||||
|
"50": {
|
||||||
|
"type": "array",
|
||||||
|
"items": {
|
||||||
|
"type": "number",
|
||||||
|
"minimum": 0
|
||||||
|
},
|
||||||
|
"minItems": 3
|
||||||
|
},
|
||||||
|
"60": {
|
||||||
|
"type": "array",
|
||||||
|
"items": {
|
||||||
|
"type": "number",
|
||||||
|
"minimum": 0
|
||||||
|
},
|
||||||
|
"minItems": 3
|
||||||
|
},
|
||||||
|
"70": {
|
||||||
|
"type": "array",
|
||||||
|
"items": {
|
||||||
|
"type": "number",
|
||||||
|
"minimum": 0
|
||||||
|
},
|
||||||
|
"minItems": 3
|
||||||
|
},
|
||||||
|
"80": {
|
||||||
|
"type": "array",
|
||||||
|
"items": {
|
||||||
|
"type": "number",
|
||||||
|
"minimum": 0
|
||||||
|
},
|
||||||
|
"minItems": 3
|
||||||
|
},
|
||||||
|
"90": {
|
||||||
|
"type": "array",
|
||||||
|
"items": {
|
||||||
|
"type": "number",
|
||||||
|
"minimum": 0
|
||||||
|
},
|
||||||
|
"minItems": 3
|
||||||
|
},
|
||||||
|
"25": {
|
||||||
|
"type": "array",
|
||||||
|
"items": {
|
||||||
|
"type": "number",
|
||||||
|
"minimum": 0
|
||||||
|
},
|
||||||
|
"minItems": 3
|
||||||
|
},
|
||||||
|
"75": {
|
||||||
|
"type": "array",
|
||||||
|
"items": {
|
||||||
|
"type": "number",
|
||||||
|
"minimum": 0
|
||||||
|
},
|
||||||
|
"minItems": 3
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"series": {
|
||||||
|
"type": "array",
|
||||||
|
"items": {
|
||||||
|
"type": "object",
|
||||||
|
"properties": {
|
||||||
|
"hostname": {
|
||||||
|
"type": "string"
|
||||||
|
},
|
||||||
|
"id": {
|
||||||
|
"type": "string"
|
||||||
|
},
|
||||||
|
"statistics": {
|
||||||
|
"type": "object",
|
||||||
|
"description": "Statistics across time dimension",
|
||||||
|
"properties": {
|
||||||
|
"avg": {
|
||||||
|
"description": "Series average",
|
||||||
|
"type": "number",
|
||||||
|
"minimum": 0
|
||||||
|
},
|
||||||
|
"min": {
|
||||||
|
"description": "Series minimum",
|
||||||
|
"type": "number",
|
||||||
|
"minimum": 0
|
||||||
|
},
|
||||||
|
"max": {
|
||||||
|
"description": "Series maximum",
|
||||||
|
"type": "number",
|
||||||
|
"minimum": 0
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"required": [
|
||||||
|
"avg",
|
||||||
|
"min",
|
||||||
|
"max"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"data": {
|
||||||
|
"type": "array",
|
||||||
|
"contains": {
|
||||||
|
"type": "number",
|
||||||
|
"minimum": 0
|
||||||
|
},
|
||||||
|
"minItems": 1
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"required": [
|
||||||
|
"hostname",
|
||||||
|
"statistics",
|
||||||
|
"data"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"required": [
|
||||||
|
"unit",
|
||||||
|
"timestep",
|
||||||
|
"series"
|
||||||
|
]
|
||||||
}
|
}
|
||||||
|
@ -1,34 +1,34 @@
|
|||||||
{
|
{
|
||||||
"$schema": "http://json-schema.org/draft/2020-12/schema",
|
"$schema": "http://json-schema.org/draft/2020-12/schema",
|
||||||
"$id": "embedfs://job-metric-statistics.schema.json",
|
"$id": "embedfs://job-metric-statistics.schema.json",
|
||||||
"title": "Job statistics",
|
"title": "Job statistics",
|
||||||
"description": "Format specification for job metric statistics",
|
"description": "Format specification for job metric statistics",
|
||||||
"type": "object",
|
"type": "object",
|
||||||
"properties": {
|
"properties": {
|
||||||
"unit": {
|
"unit": {
|
||||||
"description": "Metric unit",
|
"description": "Metric unit",
|
||||||
"$ref": "embedfs://unit.schema.json"
|
"$ref": "embedfs://unit.schema.json"
|
||||||
},
|
|
||||||
"avg": {
|
|
||||||
"description": "Job metric average",
|
|
||||||
"type": "number",
|
|
||||||
"minimum": 0
|
|
||||||
},
|
|
||||||
"min": {
|
|
||||||
"description": "Job metric minimum",
|
|
||||||
"type": "number",
|
|
||||||
"minimum": 0
|
|
||||||
},
|
|
||||||
"max": {
|
|
||||||
"description": "Job metric maximum",
|
|
||||||
"type": "number",
|
|
||||||
"minimum": 0
|
|
||||||
}
|
|
||||||
},
|
},
|
||||||
"required": [
|
"avg": {
|
||||||
"unit",
|
"description": "Job metric average",
|
||||||
"avg",
|
"type": "number",
|
||||||
"min",
|
"minimum": 0
|
||||||
"max"
|
},
|
||||||
]
|
"min": {
|
||||||
|
"description": "Job metric minimum",
|
||||||
|
"type": "number",
|
||||||
|
"minimum": 0
|
||||||
|
},
|
||||||
|
"max": {
|
||||||
|
"description": "Job metric maximum",
|
||||||
|
"type": "number",
|
||||||
|
"minimum": 0
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"required": [
|
||||||
|
"unit",
|
||||||
|
"avg",
|
||||||
|
"min",
|
||||||
|
"max"
|
||||||
|
]
|
||||||
}
|
}
|
||||||
|
@ -1,40 +1,40 @@
|
|||||||
{
|
{
|
||||||
"$schema": "http://json-schema.org/draft/2020-12/schema",
|
"$schema": "http://json-schema.org/draft/2020-12/schema",
|
||||||
"$id": "embedfs://unit.schema.json",
|
"$id": "embedfs://unit.schema.json",
|
||||||
"title": "Metric unit",
|
"title": "Metric unit",
|
||||||
"description": "Format specification for job metric units",
|
"description": "Format specification for job metric units",
|
||||||
"type": "object",
|
"type": "object",
|
||||||
"properties": {
|
"properties": {
|
||||||
"base": {
|
"base": {
|
||||||
"description": "Metric base unit",
|
"description": "Metric base unit",
|
||||||
"type": "string",
|
"type": "string",
|
||||||
"enum": [
|
"enum": [
|
||||||
"B",
|
"B",
|
||||||
"F",
|
"F",
|
||||||
"B/s",
|
"B/s",
|
||||||
"F/s",
|
"F/s",
|
||||||
"CPI",
|
"CPI",
|
||||||
"IPC",
|
"IPC",
|
||||||
"Hz",
|
"Hz",
|
||||||
"W",
|
"W",
|
||||||
"°C",
|
"°C",
|
||||||
""
|
""
|
||||||
]
|
]
|
||||||
},
|
|
||||||
"prefix": {
|
|
||||||
"description": "Unit prefix",
|
|
||||||
"type": "string",
|
|
||||||
"enum": [
|
|
||||||
"K",
|
|
||||||
"M",
|
|
||||||
"G",
|
|
||||||
"T",
|
|
||||||
"P",
|
|
||||||
"E"
|
|
||||||
]
|
|
||||||
}
|
|
||||||
},
|
},
|
||||||
"required": [
|
"prefix": {
|
||||||
"base"
|
"description": "Unit prefix",
|
||||||
]
|
"type": "string",
|
||||||
|
"enum": [
|
||||||
|
"K",
|
||||||
|
"M",
|
||||||
|
"G",
|
||||||
|
"T",
|
||||||
|
"P",
|
||||||
|
"E"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"required": [
|
||||||
|
"base"
|
||||||
|
]
|
||||||
}
|
}
|
||||||
|
@ -9,12 +9,11 @@
|
|||||||
-->
|
-->
|
||||||
|
|
||||||
<script context="module">
|
<script context="module">
|
||||||
function findJobThresholds(job, metricConfig) {
|
function findJobThresholds(job, stat, metricConfig) {
|
||||||
if (!job || !metricConfig) {
|
if (!job || !metricConfig || !stat) {
|
||||||
console.warn("Argument missing for findJobThresholds!");
|
console.warn("Argument missing for findJobThresholds!");
|
||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
|
|
||||||
// metricConfig is on subCluster-Level
|
// metricConfig is on subCluster-Level
|
||||||
const defaultThresholds = {
|
const defaultThresholds = {
|
||||||
peak: metricConfig.peak,
|
peak: metricConfig.peak,
|
||||||
@ -22,13 +21,13 @@
|
|||||||
caution: metricConfig.caution,
|
caution: metricConfig.caution,
|
||||||
alert: metricConfig.alert
|
alert: metricConfig.alert
|
||||||
};
|
};
|
||||||
|
|
||||||
/*
|
/*
|
||||||
NEW: Footprints should be comparable: Always use Unchanged Single Node Thresholds, except for shared jobs.
|
Footprints should be comparable:
|
||||||
HW Clocks, HW Temperatures and File/Net IO Thresholds will be scaled down too, even if they are independent.
|
Always use unchanged single node thresholds for exclusive jobs and "avg" Footprints.
|
||||||
'jf.stats' is one of: avg, min, max -> Always relative to one nodes' thresholds as configured.
|
For shared jobs, scale thresholds by the fraction of the job's HWThreads to the node's HWThreads.
|
||||||
|
'stat' is one of: avg, min, max
|
||||||
*/
|
*/
|
||||||
if (job.exclusive === 1) {
|
if (job.exclusive === 1 || stat === "avg") {
|
||||||
return defaultThresholds
|
return defaultThresholds
|
||||||
} else {
|
} else {
|
||||||
const topol = getContext("getHardwareTopology")(job.cluster, job.subCluster)
|
const topol = getContext("getHardwareTopology")(job.cluster, job.subCluster)
|
||||||
@ -40,29 +39,6 @@
|
|||||||
alert: round(defaultThresholds.alert * jobFraction, 0),
|
alert: round(defaultThresholds.alert * jobFraction, 0),
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
/* OLD: Based on Metric Aggregation Setting
|
|
||||||
// Job_Exclusivity does not matter, only aggregation
|
|
||||||
if (metricConfig.aggregation === "avg") {
|
|
||||||
return defaultThresholds;
|
|
||||||
} else if (metricConfig.aggregation === "sum") {
|
|
||||||
const topol = getContext("getHardwareTopology")(job.cluster, job.subCluster)
|
|
||||||
const jobFraction = job.numHWThreads / topol.node.length;
|
|
||||||
|
|
||||||
return {
|
|
||||||
peak: round(defaultThresholds.peak * jobFraction, 0),
|
|
||||||
normal: round(defaultThresholds.normal * jobFraction, 0),
|
|
||||||
caution: round(defaultThresholds.caution * jobFraction, 0),
|
|
||||||
alert: round(defaultThresholds.alert * jobFraction, 0),
|
|
||||||
};
|
|
||||||
} else {
|
|
||||||
console.warn(
|
|
||||||
"Missing or unkown aggregation mode (sum/avg) for metric:",
|
|
||||||
metricConfig,
|
|
||||||
);
|
|
||||||
return defaultThresholds;
|
|
||||||
}
|
|
||||||
*/
|
|
||||||
}
|
}
|
||||||
</script>
|
</script>
|
||||||
|
|
||||||
@ -93,7 +69,7 @@
|
|||||||
const unit = (fmc?.unit?.prefix ? fmc.unit.prefix : "") + (fmc?.unit?.base ? fmc.unit.base : "")
|
const unit = (fmc?.unit?.prefix ? fmc.unit.prefix : "") + (fmc?.unit?.base ? fmc.unit.base : "")
|
||||||
|
|
||||||
// Threshold / -Differences
|
// Threshold / -Differences
|
||||||
const fmt = findJobThresholds(job, fmc);
|
const fmt = findJobThresholds(job, jf.stat, fmc);
|
||||||
if (jf.name === "flops_any") fmt.peak = round(fmt.peak * 0.85, 0);
|
if (jf.name === "flops_any") fmt.peak = round(fmt.peak * 0.85, 0);
|
||||||
|
|
||||||
// Define basic data -> Value: Use as Provided
|
// Define basic data -> Value: Use as Provided
|
||||||
|
@ -7,7 +7,7 @@
|
|||||||
-->
|
-->
|
||||||
|
|
||||||
<script>
|
<script>
|
||||||
import { Badge, Button, Icon } from "@sveltestrap/sveltestrap";
|
import { Badge, Button, Icon, Tooltip } from "@sveltestrap/sveltestrap";
|
||||||
import { scrambleNames, scramble } from "../utils.js";
|
import { scrambleNames, scramble } from "../utils.js";
|
||||||
import Tag from "../helper/Tag.svelte";
|
import Tag from "../helper/Tag.svelte";
|
||||||
import TagManagement from "../helper/TagManagement.svelte";
|
import TagManagement from "../helper/TagManagement.svelte";
|
||||||
@ -42,12 +42,30 @@
|
|||||||
let displayCheck = false;
|
let displayCheck = false;
|
||||||
function clipJobId(jid) {
|
function clipJobId(jid) {
|
||||||
displayCheck = true;
|
displayCheck = true;
|
||||||
navigator.clipboard
|
// Navigator clipboard api needs a secure context (https)
|
||||||
.writeText(jid)
|
if (navigator.clipboard && window.isSecureContext) {
|
||||||
.catch((reason) => console.error(reason));
|
navigator.clipboard
|
||||||
setTimeout(function () {
|
.writeText(jid)
|
||||||
displayCheck = false;
|
.catch((reason) => console.error(reason));
|
||||||
}, 1500);
|
} else {
|
||||||
|
// Workaround: Create, Fill, And Copy Content of Textarea
|
||||||
|
const textArea = document.createElement("textarea");
|
||||||
|
textArea.value = jid;
|
||||||
|
textArea.style.position = "absolute";
|
||||||
|
textArea.style.left = "-999999px";
|
||||||
|
document.body.prepend(textArea);
|
||||||
|
textArea.select();
|
||||||
|
try {
|
||||||
|
document.execCommand('copy');
|
||||||
|
} catch (error) {
|
||||||
|
console.error(error);
|
||||||
|
} finally {
|
||||||
|
textArea.remove();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
setTimeout(function () {
|
||||||
|
displayCheck = false;
|
||||||
|
}, 1000);
|
||||||
}
|
}
|
||||||
</script>
|
</script>
|
||||||
|
|
||||||
@ -58,13 +76,18 @@
|
|||||||
<a href="/monitoring/job/{job.id}" target="_blank">{job.jobId}</a>
|
<a href="/monitoring/job/{job.id}" target="_blank">{job.jobId}</a>
|
||||||
({job.cluster})
|
({job.cluster})
|
||||||
</span>
|
</span>
|
||||||
<Button outline color="secondary" size="sm" title="Copy JobID to Clipboard" on:click={clipJobId(job.jobId)} >
|
<Button id={`${job.cluster}-${job.jobId}-clipboard`} outline color="secondary" size="sm" on:click={clipJobId(job.jobId)} >
|
||||||
{#if displayCheck}
|
{#if displayCheck}
|
||||||
<Icon name="clipboard2-check-fill"/> Copied
|
<Icon name="clipboard2-check-fill"/>
|
||||||
{:else}
|
{:else}
|
||||||
<Icon name="clipboard2"/> Job ID
|
<Icon name="clipboard2"/>
|
||||||
{/if}
|
{/if}
|
||||||
</Button>
|
</Button>
|
||||||
|
<Tooltip
|
||||||
|
target={`${job.cluster}-${job.jobId}-clipboard`}
|
||||||
|
placement="right">
|
||||||
|
{ displayCheck ? 'Copied!' : 'Copy Job ID to Clipboard' }
|
||||||
|
</Tooltip>
|
||||||
</span>
|
</span>
|
||||||
{#if job.metaData?.jobName}
|
{#if job.metaData?.jobName}
|
||||||
{#if job.metaData?.jobName.length <= 25}
|
{#if job.metaData?.jobName.length <= 25}
|
||||||
|
Loading…
Reference in New Issue
Block a user