Merge branch 'master' into 37-provide-a-s3-compatible-storage-backend-for-the-job-archive

This commit is contained in:
2025-10-23 15:23:32 +02:00
294 changed files with 35907 additions and 19676 deletions

View File

@@ -1,26 +1,20 @@
// Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg.
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
package archive
import (
"bufio"
"bytes"
"compress/gzip"
"encoding/json"
"fmt"
"io"
"path/filepath"
"strconv"
"sync"
"github.com/ClusterCockpit/cc-backend/internal/config"
"github.com/ClusterCockpit/cc-backend/pkg/log"
"github.com/ClusterCockpit/cc-backend/pkg/lrucache"
"github.com/ClusterCockpit/cc-backend/pkg/schema"
)
const Version uint64 = 1
const Version uint64 = 2
type ArchiveBackend interface {
Init(rawConfig json.RawMessage) (uint64, error)
@@ -33,6 +27,8 @@ type ArchiveBackend interface {
LoadJobData(job *schema.Job) (schema.JobData, error)
LoadJobStats(job *schema.Job) (schema.ScopedJobStats, error)
LoadClusterCfg(name string) (*schema.Cluster, error)
StoreJobMeta(jobMeta *schema.JobMeta) error
@@ -60,105 +56,55 @@ type JobContainer struct {
}
var (
initOnce sync.Once
cache *lrucache.Cache = lrucache.New(128 * 1024 * 1024)
ar ArchiveBackend
useArchive bool
)
func getPath(
job *schema.Job,
rootPath string,
file string,
) string {
return filepath.Join(
getDirectory(job, rootPath), file)
}
func getDirectory(
job *schema.Job,
rootPath string,
) string {
lvl1, lvl2 := fmt.Sprintf("%d", job.JobID/1000), fmt.Sprintf("%03d", job.JobID%1000)
return filepath.Join(
rootPath,
job.Cluster,
lvl1, lvl2,
strconv.FormatInt(job.StartTime.Unix(), 10))
}
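For orientation, a sketch of the directory layout this produces (values illustrative, not from the diff): jobs are sharded into buckets of 1000 by JobID.

// Illustrative only: for JobID 1234567 on cluster "fritz" started at
// epoch 1649723812, getDirectory(job, "/var/lib/archive") yields
//   /var/lib/archive/fritz/1234/567/1649723812
// lvl1 = JobID/1000 and lvl2 = JobID%1000 keep each directory at
// no more than 1000 job entries.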
func loadJobMeta(b []byte) (*schema.JobMeta, error) {
if config.Keys.Validate {
if err := schema.Validate(schema.Meta, bytes.NewReader(b)); err != nil {
return &schema.JobMeta{}, fmt.Errorf("validate job meta: %v", err)
}
}
return DecodeJobMeta(bytes.NewReader(b))
}
func loadJobData(f io.Reader, key string, isCompressed bool) (schema.JobData, error) {
if isCompressed {
r, err := gzip.NewReader(f)
if err != nil {
log.Errorf(" %v", err)
return nil, err
}
defer r.Close()
if config.Keys.Validate {
if err := schema.Validate(schema.Data, r); err != nil {
return schema.JobData{}, fmt.Errorf("validate job data: %v", err)
}
}
return DecodeJobData(r, key)
} else {
if config.Keys.Validate {
if err := schema.Validate(schema.Data, bufio.NewReader(f)); err != nil {
return schema.JobData{}, fmt.Errorf("validate job data: %v", err)
}
}
return DecodeJobData(bufio.NewReader(f), key)
}
}
func Init(rawConfig json.RawMessage, disableArchive bool) error {
useArchive = !disableArchive
var err error
var cfg struct {
Kind string `json:"kind"`
}
initOnce.Do(func() {
useArchive = !disableArchive
if err := json.Unmarshal(rawConfig, &cfg); err != nil {
log.Warn("Error while unmarshaling raw config json")
return err
}
var cfg struct {
Kind string `json:"kind"`
}
switch cfg.Kind {
case "file":
ar = &FsArchive{}
// case "s3":
// ar = &S3Archive{}
default:
return fmt.Errorf("ARCHIVE/ARCHIVE > unkown archive backend '%s''", cfg.Kind)
}
if err = json.Unmarshal(rawConfig, &cfg); err != nil {
log.Warn("Error while unmarshaling raw config json")
return
}
version, err := ar.Init(rawConfig)
if err != nil {
log.Error("Error while initializing archiveBackend")
return err
}
log.Infof("Load archive version %d", version)
switch cfg.Kind {
case "file":
ar = &FsArchive{}
// case "s3":
// ar = &S3Archive{}
default:
err = fmt.Errorf("ARCHIVE/ARCHIVE > unkown archive backend '%s''", cfg.Kind)
}
return initClusterConfig()
var version uint64
version, err = ar.Init(rawConfig)
if err != nil {
log.Errorf("Error while initializing archiveBackend: %s", err.Error())
return
}
log.Infof("Load archive version %d", version)
err = initClusterConfig()
})
return err
}
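The commented-out "s3" case is where the S3-compatible backend from this branch's title would plug in. A minimal sketch of such a backend, assuming a hypothetical S3Archive type and config fields (nothing below is part of this diff):

type S3Archive struct {
	bucket   string
	endpoint string
	// A concrete implementation would hold an S3 client here,
	// e.g. one from github.com/aws/aws-sdk-go-v2/service/s3.
}

func (s3a *S3Archive) Init(rawConfig json.RawMessage) (uint64, error) {
	var cfg struct {
		Endpoint string `json:"endpoint"`
		Bucket   string `json:"bucket"`
	}
	if err := json.Unmarshal(rawConfig, &cfg); err != nil {
		return 0, fmt.Errorf("parse s3 archive config: %w", err)
	}
	s3a.bucket, s3a.endpoint = cfg.Bucket, cfg.Endpoint
	// A real backend would connect here and read the archive's
	// version object, mirroring FsArchive's version file check.
	// The remaining ArchiveBackend methods (LoadJobMeta,
	// LoadJobData, ...) are omitted in this sketch.
	return Version, nil
}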
func GetHandle() ArchiveBackend {
return ar
}
// Helper to metricdata.LoadAverages().
// Helper to metricdataloader.LoadAverages().
func LoadAveragesFromArchive(
job *schema.Job,
metrics []string,
@@ -166,7 +112,7 @@ func LoadAveragesFromArchive(
) error {
metaFile, err := ar.LoadJobMeta(job)
if err != nil {
log.Warn("Error while loading job metadata from archiveBackend")
log.Errorf("Error while loading job metadata from archiveBackend: %s", err.Error())
return err
}
@@ -181,16 +127,80 @@ func LoadAveragesFromArchive(
return nil
}
// Helper to metricdataloader.LoadJobStats().
func LoadStatsFromArchive(
job *schema.Job,
metrics []string,
) (map[string]schema.MetricStatistics, error) {
data := make(map[string]schema.MetricStatistics, len(metrics))
metaFile, err := ar.LoadJobMeta(job)
if err != nil {
log.Errorf("Error while loading job metadata from archiveBackend: %s", err.Error())
return data, err
}
for _, m := range metrics {
stat, ok := metaFile.Statistics[m]
if !ok {
data[m] = schema.MetricStatistics{Min: 0.0, Avg: 0.0, Max: 0.0}
continue
}
data[m] = schema.MetricStatistics{
Avg: stat.Avg,
Min: stat.Min,
Max: stat.Max,
}
}
return data, nil
}
// Helper to metricdataloader.LoadScopedJobStats().
func LoadScopedStatsFromArchive(
job *schema.Job,
metrics []string,
scopes []schema.MetricScope,
) (schema.ScopedJobStats, error) {
data, err := ar.LoadJobStats(job)
if err != nil {
log.Errorf("Error while loading job stats from archiveBackend: %s", err.Error())
return nil, err
}
return data, nil
}
func GetStatistics(job *schema.Job) (map[string]schema.JobStatistics, error) {
metaFile, err := ar.LoadJobMeta(job)
if err != nil {
log.Warn("Error while loading job metadata from archiveBackend")
log.Errorf("Error while loading job metadata from archiveBackend: %s", err.Error())
return nil, err
}
return metaFile.Statistics, nil
}
// If the job is archived, find its `meta.json` file and override the Metadata
// in that JSON file. If the job is not archived, nothing is done.
func UpdateMetadata(job *schema.Job, metadata map[string]string) error {
if job.State == schema.JobStateRunning || !useArchive {
return nil
}
jobMeta, err := ar.LoadJobMeta(job)
if err != nil {
log.Errorf("Error while loading job metadata from archiveBackend: %s", err.Error())
return err
}
for k, v := range metadata {
jobMeta.MetaData[k] = v
}
return ar.StoreJobMeta(jobMeta)
}
// If the job is archived, find its `meta.json` file and override the tags list
// in that JSON file. If the job is not archived, nothing is done.
func UpdateTags(job *schema.Job, tags []*schema.Tag) error {
@@ -200,15 +210,16 @@ func UpdateTags(job *schema.Job, tags []*schema.Tag) error {
jobMeta, err := ar.LoadJobMeta(job)
if err != nil {
log.Warn("Error while loading job metadata from archiveBackend")
log.Errorf("Error while loading job metadata from archiveBackend: %s", err.Error())
return err
}
jobMeta.Tags = make([]*schema.Tag, 0)
for _, tag := range tags {
jobMeta.Tags = append(jobMeta.Tags, &schema.Tag{
Name: tag.Name,
Type: tag.Type,
Name: tag.Name,
Type: tag.Type,
Scope: tag.Scope,
})
}

View File

@@ -1,4 +1,4 @@
// Copyright (C) 2023 NHR@FAU, University Erlangen-Nuremberg.
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.

View File

@@ -1,4 +1,4 @@
// Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg.
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
@@ -12,13 +12,16 @@ import (
"github.com/ClusterCockpit/cc-backend/pkg/schema"
)
var Clusters []*schema.Cluster
var nodeLists map[string]map[string]NodeList
var (
Clusters []*schema.Cluster
GlobalMetricList []*schema.GlobalMetricListItem
NodeLists map[string]map[string]NodeList
)
func initClusterConfig() error {
Clusters = []*schema.Cluster{}
nodeLists = map[string]map[string]NodeList{}
NodeLists = map[string]map[string]NodeList{}
metricLookup := make(map[string]schema.GlobalMetricListItem)
for _, c := range ar.GetClusters() {
@@ -49,11 +52,79 @@ func initClusterConfig() error {
if !mc.Scope.Valid() {
return errors.New("cluster.metricConfig.scope must be a valid scope ('node', 'scocket', ...)")
}
ml, ok := metricLookup[mc.Name]
if !ok {
metricLookup[mc.Name] = schema.GlobalMetricListItem{
Name: mc.Name, Scope: mc.Scope, Unit: mc.Unit, Footprint: mc.Footprint,
}
ml = metricLookup[mc.Name]
}
availability := schema.ClusterSupport{Cluster: cluster.Name}
scLookup := make(map[string]*schema.SubClusterConfig)
for _, scc := range mc.SubClusters {
scLookup[scc.Name] = scc
}
for _, sc := range cluster.SubClusters {
newMetric := &schema.MetricConfig{
Unit: mc.Unit,
Energy: mc.Energy,
Name: mc.Name,
Scope: mc.Scope,
Aggregation: mc.Aggregation,
Peak: mc.Peak,
Caution: mc.Caution,
Alert: mc.Alert,
Timestep: mc.Timestep,
Normal: mc.Normal,
LowerIsBetter: mc.LowerIsBetter,
}
if mc.Footprint != "" {
newMetric.Footprint = mc.Footprint
}
if cfg, ok := scLookup[sc.Name]; ok {
if !cfg.Remove {
availability.SubClusters = append(availability.SubClusters, sc.Name)
newMetric.Peak = cfg.Peak
newMetric.Normal = cfg.Normal
newMetric.Caution = cfg.Caution
newMetric.Alert = cfg.Alert
newMetric.Footprint = cfg.Footprint
newMetric.Energy = cfg.Energy
newMetric.LowerIsBetter = cfg.LowerIsBetter
sc.MetricConfig = append(sc.MetricConfig, *newMetric)
if newMetric.Footprint != "" {
sc.Footprint = append(sc.Footprint, newMetric.Name)
ml.Footprint = newMetric.Footprint
}
if newMetric.Energy != "" {
sc.EnergyFootprint = append(sc.EnergyFootprint, newMetric.Name)
}
}
} else {
availability.SubClusters = append(availability.SubClusters, sc.Name)
sc.MetricConfig = append(sc.MetricConfig, *newMetric)
if newMetric.Footprint != "" {
sc.Footprint = append(sc.Footprint, newMetric.Name)
}
if newMetric.Energy != "" {
sc.EnergyFootprint = append(sc.EnergyFootprint, newMetric.Name)
}
}
}
ml.Availability = append(metricLookup[mc.Name].Availability, availability)
metricLookup[mc.Name] = ml
}
Clusters = append(Clusters, cluster)
nodeLists[cluster.Name] = make(map[string]NodeList)
NodeLists[cluster.Name] = make(map[string]NodeList)
for _, sc := range cluster.SubClusters {
if sc.Nodes == "*" {
continue
@@ -63,15 +134,18 @@ func initClusterConfig() error {
if err != nil {
return fmt.Errorf("ARCHIVE/CLUSTERCONFIG > in %s/cluster.json: %w", cluster.Name, err)
}
nodeLists[cluster.Name][sc.Name] = nl
NodeLists[cluster.Name][sc.Name] = nl
}
}
for _, ml := range metricLookup {
GlobalMetricList = append(GlobalMetricList, &ml)
}
return nil
}
func GetCluster(cluster string) *schema.Cluster {
for _, c := range Clusters {
if c.Name == cluster {
return c
@@ -90,11 +164,10 @@ func GetSubCluster(cluster, subcluster string) (*schema.SubCluster, error) {
}
}
}
return nil, fmt.Errorf("Subcluster '%v' not found for cluster '%v', or cluster '%v' not configured!", subcluster, cluster, cluster)
return nil, fmt.Errorf("subcluster '%v' not found for cluster '%v', or cluster '%v' not configured", subcluster, cluster, cluster)
}
func GetMetricConfig(cluster, metric string) *schema.MetricConfig {
for _, c := range Clusters {
if c.Name == cluster {
for _, m := range c.MetricConfig {
@@ -110,7 +183,6 @@ func GetMetricConfig(cluster, metric string) *schema.MetricConfig {
// AssignSubCluster sets the `job.subcluster` property of the job based
// on its cluster and resources.
func AssignSubCluster(job *schema.BaseJob) error {
cluster := GetCluster(job.Cluster)
if cluster == nil {
return fmt.Errorf("ARCHIVE/CLUSTERCONFIG > unkown cluster: %v", job.Cluster)
@@ -130,7 +202,7 @@ func AssignSubCluster(job *schema.BaseJob) error {
}
host0 := job.Resources[0].Hostname
for sc, nl := range nodeLists[job.Cluster] {
for sc, nl := range NodeLists[job.Cluster] {
if nl != nil && nl.Contains(host0) {
job.SubCluster = sc
return nil
@@ -146,8 +218,7 @@ func AssignSubCluster(job *schema.BaseJob) error {
}
func GetSubClusterByNode(cluster, hostname string) (string, error) {
for sc, nl := range nodeLists[cluster] {
for sc, nl := range NodeLists[cluster] {
if nl != nil && nl.Contains(hostname) {
return sc, nil
}
@@ -164,3 +235,13 @@ func GetSubClusterByNode(cluster, hostname string) (string, error) {
return "", fmt.Errorf("ARCHIVE/CLUSTERCONFIG > no subcluster found for cluster %v and host %v", cluster, hostname)
}
func MetricIndex(mc []schema.MetricConfig, name string) (int, error) {
for i, m := range mc {
if m.Name == name {
return i, nil
}
}
return 0, fmt.Errorf("unknown metric name %s", name)
}

View File

@@ -0,0 +1,39 @@
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
package archive_test
import (
"encoding/json"
"testing"
"github.com/ClusterCockpit/cc-backend/pkg/archive"
)
func TestClusterConfig(t *testing.T) {
if err := archive.Init(json.RawMessage("{\"kind\": \"file\",\"path\": \"testdata/archive\"}"), false); err != nil {
t.Fatal(err)
}
sc, err := archive.GetSubCluster("fritz", "spr1tb")
if err != nil {
t.Fatal(err)
}
// spew.Dump(sc.MetricConfig)
if len(sc.Footprint) != 3 {
t.Fail()
}
if len(sc.MetricConfig) != 15 {
t.Fail()
}
for _, metric := range sc.MetricConfig {
if metric.LowerIsBetter && metric.Name != "mem_used" {
t.Fail()
}
}
// spew.Dump(archive.GlobalMetricList)
// t.Fail()
}

View File

@@ -1,4 +1,4 @@
// Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg.
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
@@ -40,6 +40,109 @@ type clusterInfo struct {
diskSize float64
}
func getDirectory(
job *schema.Job,
rootPath string,
) string {
lvl1, lvl2 := fmt.Sprintf("%d", job.JobID/1000), fmt.Sprintf("%03d", job.JobID%1000)
return filepath.Join(
rootPath,
job.Cluster,
lvl1, lvl2,
strconv.FormatInt(job.StartTime.Unix(), 10))
}
func getPath(
job *schema.Job,
rootPath string,
file string,
) string {
return filepath.Join(
getDirectory(job, rootPath), file)
}
func loadJobMeta(filename string) (*schema.JobMeta, error) {
b, err := os.ReadFile(filename)
if err != nil {
log.Errorf("loadJobMeta() > open file error: %v", err)
return &schema.JobMeta{}, err
}
if config.Keys.Validate {
if err := schema.Validate(schema.Meta, bytes.NewReader(b)); err != nil {
return &schema.JobMeta{}, fmt.Errorf("validate job meta: %v", err)
}
}
return DecodeJobMeta(bytes.NewReader(b))
}
func loadJobData(filename string, isCompressed bool) (schema.JobData, error) {
f, err := os.Open(filename)
if err != nil {
log.Errorf("fsBackend LoadJobData()- %v", err)
return nil, err
}
defer f.Close()
if isCompressed {
r, err := gzip.NewReader(f)
if err != nil {
log.Errorf(" %v", err)
return nil, err
}
defer r.Close()
if config.Keys.Validate {
if err := schema.Validate(schema.Data, r); err != nil {
return schema.JobData{}, fmt.Errorf("validate job data: %v", err)
}
}
return DecodeJobData(r, filename)
} else {
if config.Keys.Validate {
if err := schema.Validate(schema.Data, bufio.NewReader(f)); err != nil {
return schema.JobData{}, fmt.Errorf("validate job data: %v", err)
}
}
return DecodeJobData(bufio.NewReader(f), filename)
}
}
func loadJobStats(filename string, isCompressed bool) (schema.ScopedJobStats, error) {
f, err := os.Open(filename)
if err != nil {
log.Errorf("fsBackend LoadJobStats()- %v", err)
return nil, err
}
defer f.Close()
if isCompressed {
r, err := gzip.NewReader(f)
if err != nil {
log.Errorf(" %v", err)
return nil, err
}
defer r.Close()
if config.Keys.Validate {
if err := schema.Validate(schema.Data, r); err != nil {
return nil, fmt.Errorf("validate job data: %v", err)
}
}
return DecodeJobStats(r, filename)
} else {
if config.Keys.Validate {
if err := schema.Validate(schema.Data, bufio.NewReader(f)); err != nil {
return nil, fmt.Errorf("validate job data: %v", err)
}
}
return DecodeJobStats(bufio.NewReader(f), filename)
}
}
func (fsa *FsArchive) Init(rawConfig json.RawMessage) (uint64, error) {
var config FsArchiveConfig
if err := json.Unmarshal(rawConfig, &config); err != nil {
@@ -317,6 +420,18 @@ func (fsa *FsArchive) LoadJobData(job *schema.Job) (schema.JobData, error) {
return loadJobData(f, filename, isCompressed)
}
func (fsa *FsArchive) LoadJobStats(job *schema.Job) (schema.ScopedJobStats, error) {
var isCompressed bool = true
filename := getPath(job, fsa.path, "data.json.gz")
if !util.CheckFileExists(filename) {
filename = getPath(job, fsa.path, "data.json")
isCompressed = false
}
return loadJobStats(filename, isCompressed)
}
func (fsa *FsArchive) LoadJobMeta(job *schema.Job) (*schema.JobMeta, error) {
filename := getPath(job, fsa.path, "meta.json")
b, err := os.ReadFile(filename)

View File

@@ -1,4 +1,4 @@
// Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg.
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
@@ -49,10 +49,10 @@ func TestInit(t *testing.T) {
if fsa.path != "testdata/archive" {
t.Fail()
}
if version != 1 {
if version != 2 {
t.Fail()
}
if len(fsa.clusters) != 1 || fsa.clusters[0] != "emmy" {
if len(fsa.clusters) != 3 || fsa.clusters[1] != "emmy" {
t.Fail()
}
}

View File

@@ -1,4 +1,4 @@
// Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg.
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
@@ -9,8 +9,8 @@ import (
"io"
"time"
"github.com/ClusterCockpit/cc-backend/pkg/schema"
"github.com/ClusterCockpit/cc-backend/pkg/log"
"github.com/ClusterCockpit/cc-backend/pkg/schema"
)
func DecodeJobData(r io.Reader, k string) (schema.JobData, error) {
@@ -32,6 +32,43 @@ func DecodeJobData(r io.Reader, k string) (schema.JobData, error) {
return data.(schema.JobData), nil
}
func DecodeJobStats(r io.Reader, k string) (schema.ScopedJobStats, error) {
jobData, err := DecodeJobData(r, k)
// Convert schema.JobData to schema.ScopedJobStats
if jobData != nil {
scopedJobStats := make(schema.ScopedJobStats)
for metric, metricData := range jobData {
if _, ok := scopedJobStats[metric]; !ok {
scopedJobStats[metric] = make(map[schema.MetricScope][]*schema.ScopedStats)
}
for scope, jobMetric := range metricData {
if _, ok := scopedJobStats[metric][scope]; !ok {
scopedJobStats[metric][scope] = make([]*schema.ScopedStats, 0)
}
for _, series := range jobMetric.Series {
scopedJobStats[metric][scope] = append(scopedJobStats[metric][scope], &schema.ScopedStats{
Hostname: series.Hostname,
Id: series.Id,
Data: &series.Statistics,
})
}
// Remove empty map entries so that callers can later rely on len(scopedJobStats[metric][scope])
if len(scopedJobStats[metric][scope]) == 0 {
delete(scopedJobStats[metric], scope)
if len(scopedJobStats[metric]) == 0 {
delete(scopedJobStats, metric)
}
}
}
}
return scopedJobStats, nil
}
return nil, err
}
func DecodeJobMeta(r io.Reader) (*schema.JobMeta, error) {
var d schema.JobMeta
if err := json.NewDecoder(r).Decode(&d); err != nil {

View File

@@ -1,4 +1,4 @@
// Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg.
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.

View File

@@ -1,4 +1,4 @@
// Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg.
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.

View File

@@ -1,4 +1,4 @@
// Copyright (C) 2023 NHR@FAU, University Erlangen-Nuremberg.
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.

File diff suppressed because it is too large

File diff suppressed because it is too large

View File

@@ -1 +1 @@
1
2

View File

@@ -1,4 +1,4 @@
// Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg.
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
@@ -46,12 +46,12 @@ var loglevel string = "info"
/* CONFIG */
func Init(lvl string, logdate bool) {
// Discard I/O for all writers below selected loglevel; <CRITICAL> is always written.
switch lvl {
case "crit":
ErrWriter = io.Discard
fallthrough
case "err", "fatal":
case "err":
WarnWriter = io.Discard
fallthrough
case "warn":
@@ -63,8 +63,7 @@ func Init(lvl string, logdate bool) {
// Nothing to do...
break
default:
fmt.Printf("pkg/log: Flag 'loglevel' has invalid value %#v\npkg/log: Will use default loglevel 'debug'\n", lvl)
//SetLogLevel("debug")
fmt.Printf("pkg/log: Flag 'loglevel' has invalid value %#v\npkg/log: Will use default loglevel '%s'\n", lvl, loglevel)
}
if !logdate {
@@ -84,109 +83,138 @@ func Init(lvl string, logdate bool) {
loglevel = lvl
}
/* PRINT */
// Private helper
func printStr(v ...interface{}) string {
return fmt.Sprint(v...)
}
// Uses Info(). If an error path is required at some point, this will
// need its own writer with 'Output(2, out)' to correctly render the path.
func Print(v ...interface{}) {
Info(v...)
}
func Debug(v ...interface{}) {
DebugLog.Output(2, printStr(v...))
}
func Info(v ...interface{}) {
InfoLog.Output(2, printStr(v...))
}
func Warn(v ...interface{}) {
WarnLog.Output(2, printStr(v...))
}
func Error(v ...interface{}) {
ErrLog.Output(2, printStr(v...))
}
// Writes panic stacktrace, but keeps application alive
func Panic(v ...interface{}) {
ErrLog.Output(2, printStr(v...))
panic("Panic triggered ...")
}
func Crit(v ...interface{}) {
CritLog.Output(2, printStr(v...))
}
// Writes critical log, stops application
func Fatal(v ...interface{}) {
CritLog.Output(2, printStr(v...))
os.Exit(1)
}
/* PRINT FORMAT*/
// Private helper
func printfStr(format string, v ...interface{}) string {
return fmt.Sprintf(format, v...)
}
// Uses Infof(). If an error path is required at some point, this will
// need its own writer with 'Output(2, out)' to correctly render the path.
func Printf(format string, v ...interface{}) {
Infof(format, v...)
}
func Debugf(format string, v ...interface{}) {
DebugLog.Output(2, printfStr(format, v...))
}
func Infof(format string, v ...interface{}) {
InfoLog.Output(2, printfStr(format, v...))
}
func Warnf(format string, v ...interface{}) {
WarnLog.Output(2, printfStr(format, v...))
}
func Errorf(format string, v ...interface{}) {
ErrLog.Output(2, printfStr(format, v...))
}
// Writes panic stacktrace, but keeps application alive
func Panicf(format string, v ...interface{}) {
ErrLog.Output(2, printfStr(format, v...))
panic("Panic triggered ...")
}
func Critf(format string, v ...interface{}) {
CritLog.Output(2, printfStr(format, v...))
}
// Writes crit log, stops application
func Fatalf(format string, v ...interface{}) {
CritLog.Output(2, printfStr(format, v...))
os.Exit(1)
}
/* HELPER */
func Loglevel() string {
return loglevel
}
/* SPECIAL */
/* PRIVATE HELPER */
// func Finfof(w io.Writer, format string, v ...interface{}) {
// if w != io.Discard {
// if logDateTime {
// currentTime := time.Now()
// fmt.Fprintf(InfoWriter, currentTime.String()+InfoPrefix+format+"\n", v...)
// } else {
// fmt.Fprintf(InfoWriter, InfoPrefix+format+"\n", v...)
// }
// }
// }
// Return unformatted string
func printStr(v ...interface{}) string {
return fmt.Sprint(v...)
}
// Return formatted string
func printfStr(format string, v ...interface{}) string {
return fmt.Sprintf(format, v...)
}
/* PRINT */
// Prints to STDOUT without string formatting; application continues.
// Used for special cases not requiring log information like date or location.
func Print(v ...interface{}) {
fmt.Fprintln(os.Stdout, v...)
}
// Prints to STDOUT without string formatting; application exits with error code 0.
// Used for exiting successfully with a message after an expected outcome, e.g. successful single-call application runs.
func Exit(v ...interface{}) {
fmt.Fprintln(os.Stdout, v...)
os.Exit(0)
}
// Prints to STDOUT without string formatting; application exits with error code 1.
// Used for terminating with a message after expected errors, e.g. wrong arguments or failures during init().
func Abort(v ...interface{}) {
fmt.Fprintln(os.Stdout, v...)
os.Exit(1)
}
// Prints to DEBUG writer without string formatting; application continues.
// Used for logging additional information, primarily for development.
func Debug(v ...interface{}) {
DebugLog.Output(2, printStr(v...))
}
// Prints to INFO writer without string formatting; application continues.
// Used for logging additional information, e.g. notable returns or common fail-cases.
func Info(v ...interface{}) {
InfoLog.Output(2, printStr(v...))
}
// Prints to WARNING writer without string formatting; application continues.
// Used for logging important information, e.g. uncommon edge-cases or administration related information.
func Warn(v ...interface{}) {
WarnLog.Output(2, printStr(v...))
}
// Prints to ERROR writer without string formatting; application continues.
// Used for logging errors, but code still can return default(s) or nil.
func Error(v ...interface{}) {
ErrLog.Output(2, printStr(v...))
}
// Prints to CRITICAL writer without string formatting; application exits with error code 1.
// Used for terminating on unexpected errors with date and code location.
func Fatal(v ...interface{}) {
CritLog.Output(2, printStr(v...))
os.Exit(1)
}
// Prints to PANIC function without string formatting; application exits with panic.
// Used for terminating on unexpected errors with stacktrace.
func Panic(v ...interface{}) {
panic(printStr(v...))
}
/* PRINT FORMAT*/
// Prints to STDOUT with string formatting; application continues.
// Used for special cases not requiring log information like date or location.
func Printf(format string, v ...interface{}) {
fmt.Fprintf(os.Stdout, format, v...)
}
// Prints to STDOUT with string formatting; application exits with error code 0.
// Used for exiting successfully with a message after an expected outcome, e.g. successful single-call application runs.
func Exitf(format string, v ...interface{}) {
fmt.Fprintf(os.Stdout, format, v...)
os.Exit(0)
}
// Prints to STDOUT with string formatting; application exits with error code 1.
// Used for terminating with a message after expected errors, e.g. wrong arguments or failures during init().
func Abortf(format string, v ...interface{}) {
fmt.Fprintf(os.Stdout, format, v...)
os.Exit(1)
}
// Prints to DEBUG writer with string formatting; application continues.
// Used for logging additional information, primarily for development.
func Debugf(format string, v ...interface{}) {
DebugLog.Output(2, printfStr(format, v...))
}
// Prints to INFO writer with string formatting; application continues.
// Used for logging additional information, e.g. notable returns or common fail-cases.
func Infof(format string, v ...interface{}) {
InfoLog.Output(2, printfStr(format, v...))
}
// Prints to WARNING writer with string formatting; application continues.
// Used for logging important information, e.g. uncommon edge-cases or administration related information.
func Warnf(format string, v ...interface{}) {
WarnLog.Output(2, printfStr(format, v...))
}
// Prints to ERROR writer with string formatting; application continues.
// Used for logging errors, but code still can return default(s) or nil.
func Errorf(format string, v ...interface{}) {
ErrLog.Output(2, printfStr(format, v...))
}
// Prints to CRITICAL writer with string formatting; application exits with error code 1.
// Used for terminating on unexpected errors with date and code location.
func Fatalf(format string, v ...interface{}) {
CritLog.Output(2, printfStr(format, v...))
os.Exit(1)
}
// Prints to PANIC function with string formatting; application exits with panic.
// Used for terminating on unexpected errors with stacktrace.
func Panicf(format string, v ...interface{}) {
panic(printfStr(format, v...))
}
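A short usage sketch of the rewritten helpers (messages illustrative), contrasting the plain-STDOUT functions with the leveled writers:

// Illustrative only; each of these terminates the application:
log.Exitf("archive import done: %d jobs\n", 42)   // STDOUT, exit code 0
log.Abort("missing required flag: --config")      // STDOUT, exit code 1
log.Fatalf("cannot open database: %s", "timeout") // CRITICAL writer, exit code 1
log.Panicf("unreachable state: %#v", nil)         // panics with stacktrace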

View File

@@ -1,4 +1,4 @@
// Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg.
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.

View File

@@ -1,4 +1,4 @@
// Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg.
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.

View File

@@ -1,4 +1,4 @@
// Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg.
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.

View File

@@ -1,4 +1,4 @@
// Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg.
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.

pkg/resampler/resampler.go Normal file
View File

@@ -0,0 +1,123 @@
package resampler
import (
"errors"
"fmt"
"math"
"github.com/ClusterCockpit/cc-backend/pkg/schema"
)
func SimpleResampler(data []schema.Float, old_frequency int64, new_frequency int64) ([]schema.Float, int64, error) {
if old_frequency == 0 || new_frequency == 0 || new_frequency <= old_frequency {
return data, old_frequency, nil
}
if new_frequency%old_frequency != 0 {
return nil, 0, errors.New("new sampling frequency should be multiple of the old frequency")
}
var step int = int(new_frequency / old_frequency)
var new_data_length = len(data) / step
if new_data_length == 0 || len(data) < 100 || new_data_length >= len(data) {
return data, old_frequency, nil
}
new_data := make([]schema.Float, new_data_length)
for i := 0; i < new_data_length; i++ {
new_data[i] = data[i*step]
}
return new_data, new_frequency, nil
}
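A usage sketch (values assumed): "frequency" here is the timestep in seconds, so resampling from 60 s to 240 s keeps every 4th point.

data := make([]schema.Float, 400) // 400 points at 60 s resolution
out, ts, err := SimpleResampler(data, 60, 240)
if err != nil {
	// handle error
}
// On success: len(out) == 100 and ts == 240. Series with fewer
// than 100 points are returned unchanged by the early-out above.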
// Inspired by one of the algorithms from https://skemman.is/bitstream/1946/15343/3/SS_MSthesis.pdf
// Adapted from https://github.com/haoel/downsampling/blob/master/core/lttb.go
func LargestTriangleThreeBucket(data []schema.Float, old_frequency int, new_frequency int) ([]schema.Float, int, error) {
if old_frequency == 0 || new_frequency == 0 || new_frequency <= old_frequency {
return data, old_frequency, nil
}
if new_frequency%old_frequency != 0 {
return nil, 0, fmt.Errorf("new sampling frequency %d should be a multiple of the old frequency %d", new_frequency, old_frequency)
}
var step int = int(new_frequency / old_frequency)
var new_data_length = len(data) / step
if new_data_length == 0 || len(data) < 100 || new_data_length >= len(data) {
return data, old_frequency, nil
}
new_data := make([]schema.Float, 0, new_data_length)
// Bucket size. Leave room for start and end data points
bucketSize := float64(len(data)-2) / float64(new_data_length-2)
new_data = append(new_data, data[0]) // Always add the first point
// Three pointers delimit the buckets:
// > bucketLow - the current bucket's beginning location
// > bucketMiddle - the current bucket's ending location,
// also the beginning location of the next bucket
// > bucketHigh - the next bucket's ending location.
bucketLow := 1
bucketMiddle := int(math.Floor(bucketSize)) + 1
var prevMaxAreaPoint int
for i := 0; i < new_data_length-2; i++ {
bucketHigh := int(math.Floor(float64(i+2)*bucketSize)) + 1
if bucketHigh >= len(data)-1 {
bucketHigh = len(data) - 2
}
// Calculate point average for next bucket (containing c)
avgPointX, avgPointY := calculateAverageDataPoint(data[bucketMiddle:bucketHigh+1], int64(bucketMiddle))
// Get the range for current bucket
currBucketStart := bucketLow
currBucketEnd := bucketMiddle
// Point a
pointX := prevMaxAreaPoint
pointY := data[prevMaxAreaPoint]
maxArea := -1.0
var maxAreaPoint int
flag_ := 0
for ; currBucketStart < currBucketEnd; currBucketStart++ {
area := calculateTriangleArea(schema.Float(pointX), pointY, avgPointX, avgPointY, schema.Float(currBucketStart), data[currBucketStart])
if area > maxArea {
maxArea = area
maxAreaPoint = currBucketStart
}
if math.IsNaN(float64(avgPointY)) {
flag_ = 1
}
}
if flag_ == 1 {
new_data = append(new_data, schema.NaN) // Pick this point from the bucket
} else {
new_data = append(new_data, data[maxAreaPoint]) // Pick this point from the bucket
}
prevMaxAreaPoint = maxAreaPoint // This maxArea point becomes the next iteration's prevMaxAreaPoint
// Move on to the next window
bucketLow = bucketMiddle
bucketMiddle = bucketHigh
}
new_data = append(new_data, data[len(data)-1]) // Always add last
return new_data, new_frequency, nil
}
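A companion sketch to the one above: unlike plain striding, each interior bucket contributes its visually most significant point, so spikes survive the downsampling; the first and last points are always kept.

out, ts, err := LargestTriangleThreeBucket(data, 60, 240)
if err != nil {
	// handle error
}
// out has the same length as the SimpleResampler result, but each
// retained point maximizes the triangle area with its neighbours.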

pkg/resampler/util.go Normal file
View File

@@ -0,0 +1,35 @@
package resampler
import (
"math"
"github.com/ClusterCockpit/cc-backend/pkg/schema"
)
func calculateTriangleArea(paX, paY, pbX, pbY, pcX, pcY schema.Float) float64 {
area := ((paX-pcX)*(pbY-paY) - (paX-pbX)*(pcY-paY)) * 0.5
return math.Abs(float64(area))
}
func calculateAverageDataPoint(points []schema.Float, xStart int64) (avgX schema.Float, avgY schema.Float) {
flag := 0
for _, point := range points {
avgX += schema.Float(xStart)
avgY += point
xStart++
if math.IsNaN(float64(point)) {
flag = 1
}
}
l := schema.Float(len(points))
avgX /= l
avgY /= l
if flag == 1 {
return avgX, schema.NaN
} else {
return avgX, avgY
}
}
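A quick sanity check of the area helper (points chosen by hand, not from the diff):

// Triangle (0,0), (1,2), (2,0): base 2, height 2, area 2.
a := calculateTriangleArea(0, 0, 1, 2, 2, 0)
// a == 2.0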

pkg/runtimeEnv/setup.go Normal file
View File

@@ -0,0 +1,142 @@
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
package runtimeEnv
import (
"bufio"
"errors"
"fmt"
"os"
"os/exec"
"os/user"
"strconv"
"strings"
"syscall"
"github.com/ClusterCockpit/cc-backend/pkg/log"
)
// Very simple and limited .env file reader.
// All variable definitions found are directly
// added to the process's environment.
func LoadEnv(file string) error {
f, err := os.Open(file)
if err != nil {
log.Error("Error while opening .env file")
return err
}
defer f.Close()
s := bufio.NewScanner(bufio.NewReader(f))
for s.Scan() {
line := s.Text()
if strings.HasPrefix(line, "#") || len(line) == 0 {
continue
}
if strings.Contains(line, "#") {
return errors.New("'#' are only supported at the start of a line")
}
line = strings.TrimPrefix(line, "export ")
parts := strings.SplitN(line, "=", 2)
if len(parts) != 2 {
return fmt.Errorf("RUNTIME/SETUP > unsupported line: %#v", line)
}
key := strings.TrimSpace(parts[0])
val := strings.TrimSpace(parts[1])
if strings.HasPrefix(val, "\"") {
if !strings.HasSuffix(val, "\"") {
return fmt.Errorf("RUNTIME/SETUP > unsupported line: %#v", line)
}
runes := []rune(val[1 : len(val)-1])
sb := strings.Builder{}
for i := 0; i < len(runes); i++ {
if runes[i] == '\\' {
i++
switch runes[i] {
case 'n':
sb.WriteRune('\n')
case 'r':
sb.WriteRune('\r')
case 't':
sb.WriteRune('\t')
case '"':
sb.WriteRune('"')
default:
return fmt.Errorf("RUNTIME/SETUP > unsupported escape sequence in quoted string: backslash %#v", runes[i])
}
continue
}
sb.WriteRune(runes[i])
}
val = sb.String()
}
os.Setenv(key, val)
}
return s.Err()
}
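An illustrative .env file this reader accepts (names and values made up). '#' is only allowed at the start of a line, an "export " prefix is stripped, and quoted values support only the \n, \r, \t and \" escapes:

# database credentials (example)
export DB_USER=clustercockpit
DB_PASS="s3cr3t\twith-tab"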
// Changes the process's user and group to those
// specified in the config.json. The Go runtime
// takes care of all threads (and not only the calling one)
// executing the underlying system call.
func DropPrivileges(username string, group string) error {
if group != "" {
g, err := user.LookupGroup(group)
if err != nil {
log.Warn("Error while looking up group")
return err
}
gid, _ := strconv.Atoi(g.Gid)
if err := syscall.Setgid(gid); err != nil {
log.Warn("Error while setting gid")
return err
}
}
if username != "" {
u, err := user.Lookup(username)
if err != nil {
log.Warn("Error while looking up user")
return err
}
uid, _ := strconv.Atoi(u.Uid)
if err := syscall.Setuid(uid); err != nil {
log.Warn("Error while setting uid")
return err
}
}
return nil
}
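Call order matters here: privileged setup such as binding a port below 1024 has to happen before the drop. A sketch with assumed account names:

// Illustrative only:
ln, err := net.Listen("tcp", ":80") // still running as root
if err != nil {
	log.Fatalf("listen: %s", err.Error())
}
if err := runtimeEnv.DropPrivileges("clustercockpit", "clustercockpit"); err != nil {
	log.Fatalf("dropping privileges failed: %s", err.Error())
}
// Serve on ln as the unprivileged user from here on.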
// If started via systemd, inform systemd that we are running:
// https://www.freedesktop.org/software/systemd/man/sd_notify.html
func SystemdNotifiy(ready bool, status string) {
if os.Getenv("NOTIFY_SOCKET") == "" {
// Not started using systemd
return
}
args := []string{fmt.Sprintf("--pid=%d", os.Getpid())}
if ready {
args = append(args, "--ready")
}
if status != "" {
args = append(args, fmt.Sprintf("--status=%s", status))
}
cmd := exec.Command("systemd-notify", args...)
cmd.Run() // errors ignored on purpose, there is not much to do anyway.
}

View File

@@ -1,4 +1,4 @@
// Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg.
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
@@ -30,38 +30,47 @@ type MetricValue struct {
}
type SubCluster struct {
Name string `json:"name"`
Nodes string `json:"nodes"`
ProcessorType string `json:"processorType"`
SocketsPerNode int `json:"socketsPerNode"`
CoresPerSocket int `json:"coresPerSocket"`
ThreadsPerCore int `json:"threadsPerCore"`
FlopRateScalar MetricValue `json:"flopRateScalar"`
FlopRateSimd MetricValue `json:"flopRateSimd"`
MemoryBandwidth MetricValue `json:"memoryBandwidth"`
Topology Topology `json:"topology"`
Name string `json:"name"`
Nodes string `json:"nodes"`
ProcessorType string `json:"processorType"`
Topology Topology `json:"topology"`
FlopRateScalar MetricValue `json:"flopRateScalar"`
FlopRateSimd MetricValue `json:"flopRateSimd"`
MemoryBandwidth MetricValue `json:"memoryBandwidth"`
MetricConfig []MetricConfig `json:"metricConfig,omitempty"`
Footprint []string `json:"footprint,omitempty"`
EnergyFootprint []string `json:"energyFootprint,omitempty"`
SocketsPerNode int `json:"socketsPerNode"`
CoresPerSocket int `json:"coresPerSocket"`
ThreadsPerCore int `json:"threadsPerCore"`
}
type SubClusterConfig struct {
Name string `json:"name"`
Peak float64 `json:"peak"`
Normal float64 `json:"normal"`
Caution float64 `json:"caution"`
Alert float64 `json:"alert"`
Remove bool `json:"remove"`
Name string `json:"name"`
Footprint string `json:"footprint,omitempty"`
Energy string `json:"energy"`
Peak float64 `json:"peak"`
Normal float64 `json:"normal"`
Caution float64 `json:"caution"`
Alert float64 `json:"alert"`
Remove bool `json:"remove"`
LowerIsBetter bool `json:"lowerIsBetter"`
}
type MetricConfig struct {
Name string `json:"name"`
Unit Unit `json:"unit"`
Scope MetricScope `json:"scope"`
Aggregation string `json:"aggregation"`
Timestep int `json:"timestep"`
Peak float64 `json:"peak"`
Normal float64 `json:"normal"`
Caution float64 `json:"caution"`
Alert float64 `json:"alert"`
SubClusters []*SubClusterConfig `json:"subClusters,omitempty"`
Unit Unit `json:"unit"`
Energy string `json:"energy"`
Name string `json:"name"`
Scope MetricScope `json:"scope"`
Aggregation string `json:"aggregation"`
Footprint string `json:"footprint,omitempty"`
SubClusters []*SubClusterConfig `json:"subClusters,omitempty"`
Peak float64 `json:"peak"`
Caution float64 `json:"caution"`
Alert float64 `json:"alert"`
Timestep int `json:"timestep"`
Normal float64 `json:"normal"`
LowerIsBetter bool `json:"lowerIsBetter"`
}
type Cluster struct {
@@ -70,14 +79,27 @@ type Cluster struct {
SubClusters []*SubCluster `json:"subClusters"`
}
type ClusterSupport struct {
Cluster string `json:"cluster"`
SubClusters []string `json:"subclusters"`
}
type GlobalMetricListItem struct {
Name string `json:"name"`
Unit Unit `json:"unit"`
Scope MetricScope `json:"scope"`
Footprint string `json:"footprint,omitempty"`
Availability []ClusterSupport `json:"availability"`
}
// Return a list of socket IDs given a list of hwthread IDs. Even if just one
// hwthread is in that socket, add it to the list. If no hwthreads other than
// those in the argument list are assigned to one of the sockets in the first
// return value, return true as the second value. TODO: Optimize this, there
// must be a more efficient way/algorithm.
func (topo *Topology) GetSocketsFromHWThreads(
hwthreads []int) (sockets []int, exclusive bool) {
hwthreads []int,
) (sockets []int, exclusive bool) {
socketsMap := map[int]int{}
for _, hwthread := range hwthreads {
for socket, hwthreadsInSocket := range topo.Socket {
@@ -100,14 +122,46 @@ func (topo *Topology) GetSocketsFromHWThreads(
return sockets, exclusive
}
// Return a list of socket IDs given a list of core IDs. Even if just one
// core is in that socket, add it to the list. If no cores other than
// those in the argument list are assigned to one of the sockets in the first
// return value, return true as the second value. TODO: Optimize this, there
// must be a more efficient way/algorithm.
func (topo *Topology) GetSocketsFromCores(
cores []int,
) (sockets []int, exclusive bool) {
socketsMap := map[int]int{}
for _, core := range cores {
for _, hwthreadInCore := range topo.Core[core] {
for socket, hwthreadsInSocket := range topo.Socket {
for _, hwthreadInSocket := range hwthreadsInSocket {
if hwthreadInCore == hwthreadInSocket {
socketsMap[socket] += 1
}
}
}
}
}
exclusive = true
hwthreadsPerSocket := len(topo.Node) / len(topo.Socket)
sockets = make([]int, 0, len(socketsMap))
for socket, count := range socketsMap {
sockets = append(sockets, socket)
exclusive = exclusive && count == hwthreadsPerSocket
}
return sockets, exclusive
}
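A toy example of the new helper (topology values assumed):

// Two sockets with four hwthreads each, two hwthreads per core:
topo := &Topology{
	Node:   []int{0, 1, 2, 3, 4, 5, 6, 7},
	Socket: [][]int{{0, 1, 2, 3}, {4, 5, 6, 7}},
	Core:   [][]int{{0, 1}, {2, 3}, {4, 5}, {6, 7}},
}
sockets, exclusive := topo.GetSocketsFromCores([]int{0, 1})
// sockets == []int{0}, exclusive == true: cores 0 and 1 cover all
// four hwthreads of socket 0 and nothing else.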
// Return a list of core IDs given a list of hwthread IDs. Even if just one
// hwthread is in that core, add it to the list. If no hwthreads other than
// those in the argument list are assigned to one of the cores in the first
// return value, return true as the second value. TODO: Optimize this, there
// must be a more efficient way/algorithm.
func (topo *Topology) GetCoresFromHWThreads(
hwthreads []int) (cores []int, exclusive bool) {
hwthreads []int,
) (cores []int, exclusive bool) {
coresMap := map[int]int{}
for _, hwthread := range hwthreads {
for core, hwthreadsInCore := range topo.Core {
@@ -136,8 +190,8 @@ func (topo *Topology) GetCoresFromHWThreads(
// memory domains in the first return value, return true as the second value.
// TODO: Optimize this, there must be a more efficient way/algorithm.
func (topo *Topology) GetMemoryDomainsFromHWThreads(
hwthreads []int) (memDoms []int, exclusive bool) {
hwthreads []int,
) (memDoms []int, exclusive bool) {
memDomsMap := map[int]int{}
for _, hwthread := range hwthreads {
for memDom, hwthreadsInmemDom := range topo.MemoryDomain {
@@ -172,7 +226,17 @@ func (topo *Topology) GetAcceleratorID(id int) (string, error) {
}
}
func (topo *Topology) GetAcceleratorIDs() ([]int, error) {
// Return list of hardware (string) accelerator IDs
func (topo *Topology) GetAcceleratorIDs() []string {
accels := make([]string, 0)
for _, accel := range topo.Accelerators {
accels = append(accels, accel.ID)
}
return accels
}
// Outdated? Or: Return indices of accelerators in parent array?
func (topo *Topology) GetAcceleratorIDsAsInt() ([]int, error) {
accels := make([]int, 0)
for _, accel := range topo.Accelerators {
id, err := strconv.Atoi(accel.ID)

View File

@@ -1,4 +1,4 @@
// Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg.
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
@@ -23,6 +23,12 @@ type LdapConfig struct {
SyncUserOnLogin bool `json:"syncUserOnLogin"`
}
type OpenIDConfig struct {
Provider string `json:"provider"`
SyncUserOnLogin bool `json:"syncUserOnLogin"`
UpdateUserOnLogin bool `json:"updateUserOnLogin"`
}
type JWTAuthConfig struct {
// Specifies for how long a JWT token shall be valid
// as a string parsable by time.ParseDuration().
@@ -40,6 +46,9 @@ type JWTAuthConfig struct {
// Should a non-existent user be added to the DB based on the information in the token
SyncUserOnLogin bool `json:"syncUserOnLogin"`
// Should an existing user be updated in the DB based on the information in the token
UpdateUserOnLogin bool `json:"updateUserOnLogin"`
}
type IntRange struct {
@@ -48,8 +57,9 @@ type IntRange struct {
}
type TimeRange struct {
From *time.Time `json:"from"`
To *time.Time `json:"to"`
From *time.Time `json:"from"`
To *time.Time `json:"to"`
Range string `json:"range,omitempty"`
}
type FilterRanges struct {
@@ -65,10 +75,24 @@ type ClusterConfig struct {
}
type Retention struct {
Age int `json:"age"`
IncludeDB bool `json:"includeDB"`
Policy string `json:"policy"`
Location string `json:"location"`
Age int `json:"age"`
IncludeDB bool `json:"includeDB"`
}
type ResampleConfig struct {
// Array of resampling target resolutions, in seconds; Example: [600,300,60]
Resolutions []int `json:"resolutions"`
// Trigger next zoom level at less than this many visible datapoints
Trigger int `json:"trigger"`
}
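An illustrative config snippet enabling this (resolution values are assumptions, not defaults from the diff):

"enable-resampling": {
  "trigger": 30,
  "resolutions": [600, 300, 120, 60]
}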
type CronFrequency struct {
// Duration Update Worker [Defaults to '5m']
DurationWorker string `json:"duration-worker"`
// Metric-Footprint Update Worker [Defaults to '10m']
FootprintWorker string `json:"footprint-worker"`
}
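And correspondingly, using the defaults named in the comments above:

"cron-frequency": {
  "duration-worker": "5m",
  "footprint-worker": "10m"
}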
// Format of the configuration (file). See below for the defaults.
@@ -76,7 +100,7 @@ type ProgramConfig struct {
// Address where the http (or https) server will listen on (for example: 'localhost:80').
Addr string `json:"addr"`
// Addresses from which secured API endpoints can be reached
// Addresses from which secured admin API endpoints can be reached, can be wildcard "*"
ApiAllowedIPs []string `json:"apiAllowedIPs"`
// Drop root permissions once .env was read and the port was taken.
@@ -109,8 +133,9 @@ type ProgramConfig struct {
Validate bool `json:"validate"`
// For LDAP Authentication and user synchronisation.
LdapConfig *LdapConfig `json:"ldap"`
JwtConfig *JWTAuthConfig `json:"jwts"`
LdapConfig *LdapConfig `json:"ldap"`
JwtConfig *JWTAuthConfig `json:"jwts"`
OpenIDConfig *OpenIDConfig `json:"oidc"`
// If 0 or empty, the session does not expire!
SessionMaxAge string `json:"session-max-age"`
@@ -127,6 +152,9 @@ type ProgramConfig struct {
// be provided! Most options here can be overwritten by the user.
UiDefaults map[string]interface{} `json:"ui-defaults"`
// If exists, will enable dynamic zoom in frontend metric plots using the configured values
EnableResampling *ResampleConfig `json:"enable-resampling"`
// Where to store MachineState files
MachineStateDir string `json:"machine-state-dir"`
@@ -136,6 +164,13 @@ type ProgramConfig struct {
// Defines time X in seconds in which jobs are considered to be "short" and will be filtered in specific views.
ShortRunningJobsDuration int `json:"short-running-jobs-duration"`
// Energy Mix CO2 Emission Constant [g/kWh]
// If entered, displays estimated CO2 emission for job based on jobs totalEnergy
EmissionConstant int `json:"emission-constant"`
// Frequency of cron job workers
CronFrequency *CronFrequency `json:"cron-frequency"`
// Array of Clusters
Clusters []*ClusterConfig `json:"clusters"`
}

View File

@@ -1,4 +1,4 @@
// Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg.
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.

View File

@@ -1,4 +1,4 @@
// Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg.
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
@@ -16,30 +16,33 @@ import (
// Common subset of Job and JobMeta. Use one of those, not this type directly.
type BaseJob struct {
// The unique identifier of a job
JobID int64 `json:"jobId" db:"job_id" example:"123000"`
User string `json:"user" db:"user" example:"abcd100h"` // The unique identifier of a user
Project string `json:"project" db:"project" example:"abcd200"` // The unique identifier of a project
Cluster string `json:"cluster" db:"cluster" example:"fritz"` // The unique identifier of a cluster
SubCluster string `json:"subCluster" db:"subcluster" example:"main"` // The unique identifier of a sub cluster
Partition string `json:"partition,omitempty" db:"partition" example:"main"` // The Slurm partition to which the job was submitted
ArrayJobId int64 `json:"arrayJobId,omitempty" db:"array_job_id" example:"123000"` // The unique identifier of an array job
NumNodes int32 `json:"numNodes" db:"num_nodes" example:"2" minimum:"1"` // Number of nodes used (Min > 0)
// NumCores int32 `json:"numCores" db:"num_cores" example:"20" minimum:"1"` // Number of HWThreads used (Min > 0)
NumHWThreads int32 `json:"numHwthreads,omitempty" db:"num_hwthreads" example:"20" minimum:"1"` // Number of HWThreads used (Min > 0)
NumAcc int32 `json:"numAcc,omitempty" db:"num_acc" example:"2" minimum:"1"` // Number of accelerators used (Min > 0)
Exclusive int32 `json:"exclusive" db:"exclusive" example:"1" minimum:"0" maximum:"2"` // Specifies how nodes are shared: 0 - Shared among multiple jobs of multiple users, 1 - Job exclusive (Default), 2 - Shared among multiple jobs of same user
MonitoringStatus int32 `json:"monitoringStatus,omitempty" db:"monitoring_status" example:"1" minimum:"0" maximum:"3"` // State of monitoring system during job run: 0 - Disabled, 1 - Running or Archiving (Default), 2 - Archiving Failed, 3 - Archiving Successful
SMT int32 `json:"smt,omitempty" db:"smt" example:"4"` // SMT threads used by job
State JobState `json:"jobState" db:"job_state" example:"completed" enums:"completed,failed,cancelled,stopped,timeout,out_of_memory"` // Final state of job
Duration int32 `json:"duration" db:"duration" example:"43200" minimum:"1"` // Duration of job in seconds (Min > 0)
Walltime int64 `json:"walltime,omitempty" db:"walltime" example:"86400" minimum:"1"` // Requested walltime of job in seconds (Min > 0)
Tags []*Tag `json:"tags,omitempty"` // List of tags
RawResources []byte `json:"-" db:"resources"` // Resources used by job [As Bytes]
Resources []*Resource `json:"resources"` // Resources used by job
RawMetaData []byte `json:"-" db:"meta_data"` // Additional information about the job [As Bytes]
MetaData map[string]string `json:"metaData"` // Additional information about the job
ConcurrentJobs JobLinkResultList `json:"concurrentJobs"`
Cluster string `json:"cluster" db:"cluster" example:"fritz"`
SubCluster string `json:"subCluster" db:"subcluster" example:"main"`
Partition string `json:"partition,omitempty" db:"cluster_partition" example:"main"`
Project string `json:"project" db:"project" example:"abcd200"`
User string `json:"user" db:"hpc_user" example:"abcd100h"`
State JobState `json:"jobState" db:"job_state" example:"completed" enums:"completed,failed,cancelled,stopped,timeout,out_of_memory"`
Tags []*Tag `json:"tags,omitempty"`
RawEnergyFootprint []byte `json:"-" db:"energy_footprint"`
RawFootprint []byte `json:"-" db:"footprint"`
RawMetaData []byte `json:"-" db:"meta_data"`
RawResources []byte `json:"-" db:"resources"`
Resources []*Resource `json:"resources"`
EnergyFootprint map[string]float64 `json:"energyFootprint"`
Footprint map[string]float64 `json:"footprint"`
MetaData map[string]string `json:"metaData"`
ConcurrentJobs JobLinkResultList `json:"concurrentJobs"`
Energy float64 `json:"energy" db:"energy"`
ArrayJobId int64 `json:"arrayJobId,omitempty" db:"array_job_id" example:"123000"`
Walltime int64 `json:"walltime,omitempty" db:"walltime" example:"86400" minimum:"1"`
JobID int64 `json:"jobId" db:"job_id" example:"123000"`
Duration int32 `json:"duration" db:"duration" example:"43200" minimum:"1"`
SMT int32 `json:"smt,omitempty" db:"smt" example:"4"`
MonitoringStatus int32 `json:"monitoringStatus,omitempty" db:"monitoring_status" example:"1" minimum:"0" maximum:"3"`
Exclusive int32 `json:"exclusive" db:"exclusive" example:"1" minimum:"0" maximum:"2"`
NumAcc int32 `json:"numAcc,omitempty" db:"num_acc" example:"2" minimum:"1"`
NumHWThreads int32 `json:"numHwthreads,omitempty" db:"num_hwthreads" example:"20" minimum:"1"`
NumNodes int32 `json:"numNodes" db:"num_nodes" example:"2" minimum:"1"`
}
// Job struct type
@@ -49,19 +52,10 @@ type BaseJob struct {
// Job model
// @Description Information of a HPC job.
type Job struct {
// The unique identifier of a job in the database
ID int64 `json:"id" db:"id"`
StartTime time.Time `json:"startTime"`
BaseJob
StartTimeUnix int64 `json:"-" db:"start_time" example:"1649723812"` // Start epoch time stamp in seconds
StartTime time.Time `json:"startTime"` // Start time as 'time.Time' data type
MemUsedMax float64 `json:"memUsedMax" db:"mem_used_max"` // MemUsedMax as Float64
FlopsAnyAvg float64 `json:"flopsAnyAvg" db:"flops_any_avg"` // FlopsAnyAvg as Float64
MemBwAvg float64 `json:"memBwAvg" db:"mem_bw_avg"` // MemBwAvg as Float64
LoadAvg float64 `json:"loadAvg" db:"load_avg"` // LoadAvg as Float64
NetBwAvg float64 `json:"-" db:"net_bw_avg"` // NetBwAvg as Float64
NetDataVolTotal float64 `json:"-" db:"net_data_vol_total"` // NetDataVolTotal as Float64
FileBwAvg float64 `json:"-" db:"file_bw_avg"` // FileBwAvg as Float64
FileDataVolTotal float64 `json:"-" db:"file_data_vol_total"` // FileDataVolTotal as Float64
ID int64 `json:"id" db:"id"`
StartTimeUnix int64 `json:"-" db:"start_time" example:"1649723812"`
}
// JobMeta struct type
@@ -88,11 +82,10 @@ type JobLinkResultList struct {
// JobMeta model
// @Description Meta data information of a HPC job.
type JobMeta struct {
// The unique identifier of a job in the database
ID *int64 `json:"id,omitempty"`
ID *int64 `json:"id,omitempty"`
Statistics map[string]JobStatistics `json:"statistics"`
BaseJob
StartTime int64 `json:"startTime" db:"start_time" example:"1649723812" minimum:"1"` // Start epoch time stamp in seconds (Min > 0)
Statistics map[string]JobStatistics `json:"statistics"` // Metric statistics of job
StartTime int64 `json:"startTime" db:"start_time" example:"1649723812" minimum:"1"`
}
const (
@@ -124,18 +117,19 @@ type JobStatistics struct {
// Tag model
// @Description Defines a tag using name and type.
type Tag struct {
ID int64 `json:"id" db:"id"` // The unique DB identifier of a tag
Type string `json:"type" db:"tag_type" example:"Debug"` // Tag Type
Name string `json:"name" db:"tag_name" example:"Testjob"` // Tag Name
Type string `json:"type" db:"tag_type" example:"Debug"`
Name string `json:"name" db:"tag_name" example:"Testjob"`
Scope string `json:"scope" db:"tag_scope" example:"global"`
ID int64 `json:"id" db:"id"`
}
// Resource model
// @Description A resource used by a job
type Resource struct {
Hostname string `json:"hostname"` // Name of the host (= node)
HWThreads []int `json:"hwthreads,omitempty"` // List of OS processor ids
Accelerators []string `json:"accelerators,omitempty"` // List of of accelerator device ids
Configuration string `json:"configuration,omitempty"` // The configuration options of the node
Hostname string `json:"hostname"`
Configuration string `json:"configuration,omitempty"`
HWThreads []int `json:"hwthreads,omitempty"`
Accelerators []string `json:"accelerators,omitempty"`
}
type JobState string

View File

@@ -1,4 +1,4 @@
// Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg.
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
@@ -10,22 +10,31 @@ import (
"math"
"sort"
"unsafe"
"github.com/ClusterCockpit/cc-backend/internal/util"
)
type JobData map[string]map[MetricScope]*JobMetric
type ScopedJobStats map[string]map[MetricScope][]*ScopedStats
type JobMetric struct {
Unit Unit `json:"unit"`
Timestep int `json:"timestep"`
Series []Series `json:"series"`
StatisticsSeries *StatsSeries `json:"statisticsSeries,omitempty"`
Unit Unit `json:"unit"`
Series []Series `json:"series"`
Timestep int `json:"timestep"`
}
type Series struct {
Hostname string `json:"hostname"`
Id *string `json:"id,omitempty"`
Statistics MetricStatistics `json:"statistics"`
Hostname string `json:"hostname"`
Data []Float `json:"data"`
Statistics MetricStatistics `json:"statistics"`
}
type ScopedStats struct {
Hostname string `json:"hostname"`
Id *string `json:"id,omitempty"`
Data *MetricStatistics `json:"data"`
}
type MetricStatistics struct {
@@ -35,10 +44,11 @@ type MetricStatistics struct {
}
type StatsSeries struct {
Percentiles map[int][]Float `json:"percentiles,omitempty"`
Mean []Float `json:"mean"`
Median []Float `json:"median"`
Min []Float `json:"min"`
Max []Float `json:"max"`
Percentiles map[int][]Float `json:"percentiles,omitempty"`
}
type MetricScope string
@@ -121,6 +131,7 @@ func (jd *JobData) Size() int {
if metric.StatisticsSeries != nil {
n += len(metric.StatisticsSeries.Max)
n += len(metric.StatisticsSeries.Mean)
n += len(metric.StatisticsSeries.Median)
n += len(metric.StatisticsSeries.Min)
}
@@ -149,53 +160,74 @@ func (jm *JobMetric) AddStatisticsSeries() {
}
}
	// mean := make([]Float, n)
	min, median, max := make([]Float, n), make([]Float, n), make([]Float, n)
	i := 0
	for ; i < m; i++ {
		seriesCount := len(jm.Series)
		// ssum := 0.0
		smin, smed, smax := math.MaxFloat32, make([]float64, seriesCount), -math.MaxFloat32
		notnan := 0
		for j := 0; j < seriesCount; j++ {
			x := float64(jm.Series[j].Data[i])
			if math.IsNaN(x) {
				continue
			}
			notnan += 1
			// ssum += x
			smed[j] = x
			smin = math.Min(smin, x)
			smax = math.Max(smax, x)
		}
		if notnan < 3 {
			min[i] = NaN
			// mean[i] = NaN
			median[i] = NaN
			max[i] = NaN
		} else {
			min[i] = Float(smin)
			// mean[i] = Float(ssum / float64(notnan))
			max[i] = Float(smax)
			medianRaw, err := util.Median(smed)
			if err != nil {
				median[i] = NaN
			} else {
				median[i] = Float(medianRaw)
			}
		}
	}
	for ; i < n; i++ {
		min[i] = NaN
		// mean[i] = NaN
		median[i] = NaN
		max[i] = NaN
	}
	if smooth {
		for i := 2; i < len(median)-2; i++ {
			if min[i].IsNaN() {
				continue
			}
			min[i] = (min[i-2] + min[i-1] + min[i] + min[i+1] + min[i+2]) / 5
			max[i] = (max[i-2] + max[i-1] + max[i] + max[i+1] + max[i+2]) / 5
			// mean[i] = (mean[i-2] + mean[i-1] + mean[i] + mean[i+1] + mean[i+2]) / 5
			// Reduce the median series as well
			smoothRaw := []float64{float64(median[i-2]), float64(median[i-1]), float64(median[i]), float64(median[i+1]), float64(median[i+2])}
			smoothMedian, err := util.Median(smoothRaw)
			if err != nil {
				median[i] = NaN
			} else {
				median[i] = Float(smoothMedian)
			}
		}
	}
	jm.StatisticsSeries = &StatsSeries{Median: median, Min: min, Max: max} // Mean: mean
}
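
The switch from mean to median above relies on util.Median, which per the call sites returns (float64, error). A plausible stand-in, assuming the usual sort-and-pick-middle definition (the actual implementation may differ). Note also that smed is allocated with len(jm.Series) slots per timestep and only non-NaN entries are written, so series that are NaN at that index leave a zero in the slice that the median then includes; the notnan < 3 guard only gates very sparse timesteps.

// Hypothetical equivalent of util.Median: sort a copy and take the middle
// element, or the mean of the two middle elements for even-length input.
package util

import (
	"errors"
	"sort"
)

func Median(vals []float64) (float64, error) {
	if len(vals) == 0 {
		return 0, errors.New("median: empty input")
	}
	s := make([]float64, len(vals))
	copy(s, vals) // do not reorder the caller's slice
	sort.Float64s(s)
	mid := len(s) / 2
	if len(s)%2 == 1 {
		return s[mid], nil
	}
	return (s[mid-1] + s[mid]) / 2, nil
}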
func (jd *JobData) AddNodeScope(metric string) bool {
@@ -204,7 +236,7 @@ func (jd *JobData) AddNodeScope(metric string) bool {
return false
}
	maxScope := MetricScopeInvalid
for scope := range scopes {
maxScope = maxScope.Max(scope)
}
@@ -266,6 +298,21 @@ func (jd *JobData) AddNodeScope(metric string) bool {
return true
}
func (jd *JobData) RoundMetricStats() {
// TODO: Make Digit-Precision Configurable? (Currently: Fixed to 2 Digits)
for _, scopes := range *jd {
for _, jm := range scopes {
for index := range jm.Series {
jm.Series[index].Statistics = MetricStatistics{
Avg: (math.Round(jm.Series[index].Statistics.Avg*100) / 100),
Min: (math.Round(jm.Series[index].Statistics.Min*100) / 100),
Max: (math.Round(jm.Series[index].Statistics.Max*100) / 100),
}
}
}
}
}
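
RoundMetricStats fixes the per-series statistics to two decimal places with the scale-round-unscale idiom; a quick standalone check (illustrative values only):

package main

import (
	"fmt"
	"math"
)

// Round to two decimals, as done above for Avg/Min/Max.
func round2(x float64) float64 { return math.Round(x*100) / 100 }

func main() {
	fmt.Println(round2(3.14159)) // 3.14
	fmt.Println(round2(0.125))   // 0.13 (math.Round rounds halves away from zero)
}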
func (jm *JobMetric) AddPercentiles(ps []int) bool {
if jm.StatisticsSeries == nil {
jm.AddStatisticsSeries()

View File

@@ -1,284 +1,339 @@
{
"$schema": "http://json-schema.org/draft/2020-12/schema",
"$id": "embedfs://cluster.schema.json",
"title": "HPC cluster description",
"description": "Meta data information of a HPC cluster",
"type": "object",
"properties": {
"name": {
"description": "The unique identifier of a cluster",
"type": "string"
},
"metricConfig": {
"description": "Metric specifications",
"type": "array",
"items": {
"type": "object",
"properties": {
"name": {
"description": "Metric name",
"type": "string"
},
"unit": {
"description": "Metric unit",
"$ref": "embedfs://unit.schema.json"
},
"scope": {
"description": "Native measurement resolution",
"type": "string"
},
"timestep": {
"description": "Frequency of timeseries points",
"type": "integer"
},
"aggregation": {
"description": "How the metric is aggregated",
"type": "string",
"enum": [
"sum",
"avg"
]
},
"peak": {
"description": "Metric peak threshold (Upper metric limit)",
"type": "number"
},
"normal": {
"description": "Metric normal threshold",
"type": "number"
},
"caution": {
"description": "Metric caution threshold (Suspicious but does not require immediate action)",
"type": "number"
},
"alert": {
"description": "Metric alert threshold (Requires immediate action)",
"type": "number"
},
"subClusters": {
"description": "Array of cluster hardware partition metric thresholds",
"type": "array",
"items": {
"type": "object",
"properties": {
"name": {
"description": "Hardware partition name",
"type": "string"
},
"peak": {
"type": "number"
},
"normal": {
"type": "number"
},
"caution": {
"type": "number"
},
"alert": {
"type": "number"
},
"remove": {
"type": "boolean"
}
},
"required": [
"name"
]
}
}
},
"required": [
"name",
"unit",
"scope",
"timestep",
"aggregation",
"peak",
"normal",
"caution",
"alert"
]
},
"minItems": 1
},
"subClusters": {
"description": "Array of cluster hardware partitions",
"type": "array",
"items": {
"type": "object",
"properties": {
"name": {
"description": "Hardware partition name",
"type": "string"
},
"processorType": {
"description": "Processor type",
"type": "string"
},
"socketsPerNode": {
"description": "Number of sockets per node",
"type": "integer"
},
"coresPerSocket": {
"description": "Number of cores per socket",
"type": "integer"
},
"threadsPerCore": {
"description": "Number of SMT threads per core",
"type": "integer"
},
"flopRateScalar": {
"description": "Theoretical node peak flop rate for scalar code in GFlops/s",
"type": "object",
"properties": {
"unit": {
"description": "Metric unit",
"$ref": "embedfs://unit.schema.json"
},
"value": {
"type": "number"
}
}
},
"flopRateSimd": {
"description": "Theoretical node peak flop rate for SIMD code in GFlops/s",
"type": "object",
"properties": {
"unit": {
"description": "Metric unit",
"$ref": "embedfs://unit.schema.json"
},
"value": {
"type": "number"
}
}
},
"memoryBandwidth": {
"description": "Theoretical node peak memory bandwidth in GB/s",
"type": "object",
"properties": {
"unit": {
"description": "Metric unit",
"$ref": "embedfs://unit.schema.json"
},
"value": {
"type": "number"
}
}
},
"nodes": {
"description": "Node list expression",
"type": "string"
},
"topology": {
"description": "Node topology",
"type": "object",
"properties": {
"node": {
"description": "HwTread lists of node",
"type": "array",
"items": {
"type": "integer"
}
},
"socket": {
"description": "HwTread lists of sockets",
"type": "array",
"items": {
"type": "array",
"items": {
"type": "integer"
}
}
},
"memoryDomain": {
"description": "HwTread lists of memory domains",
"type": "array",
"items": {
"type": "array",
"items": {
"type": "integer"
}
}
},
"die": {
"description": "HwTread lists of dies",
"type": "array",
"items": {
"type": "array",
"items": {
"type": "integer"
}
}
},
"core": {
"description": "HwTread lists of cores",
"type": "array",
"items": {
"type": "array",
"items": {
"type": "integer"
}
}
},
"accelerators": {
"type": "array",
"description": "List of of accelerator devices",
"items": {
"type": "object",
"properties": {
"id": {
"type": "string",
"description": "The unique device id"
},
"type": {
"type": "string",
"description": "The accelerator type",
"enum": [
"Nvidia GPU",
"AMD GPU",
"Intel GPU"
]
},
"model": {
"type": "string",
"description": "The accelerator model"
}
},
"required": [
"id",
"type",
"model"
]
}
}
},
"required": [
"node",
"socket",
"memoryDomain"
]
}
},
"required": [
"name",
"nodes",
"topology",
"processorType",
"socketsPerNode",
"coresPerSocket",
"threadsPerCore",
"flopRateScalar",
"flopRateSimd",
"memoryBandwidth"
]
},
"minItems": 1
}
"$schema": "http://json-schema.org/draft/2020-12/schema",
"$id": "embedfs://cluster.schema.json",
"title": "HPC cluster description",
"description": "Meta data information of a HPC cluster",
"type": "object",
"properties": {
"name": {
"description": "The unique identifier of a cluster",
"type": "string"
},
"required": [
"name",
"metricConfig",
"subClusters"
]
"metricConfig": {
"description": "Metric specifications",
"type": "array",
"items": {
"type": "object",
"properties": {
"name": {
"description": "Metric name",
"type": "string"
},
"unit": {
"description": "Metric unit",
"$ref": "embedfs://unit.schema.json"
},
"scope": {
"description": "Native measurement resolution",
"type": "string",
"enum": [
"node",
"socket",
"memoryDomain",
"core",
"hwthread",
"accelerator"
]
},
"timestep": {
"description": "Frequency of timeseries points in seconds",
"type": "integer"
},
"aggregation": {
"description": "How the metric is aggregated",
"type": "string",
"enum": [
"sum",
"avg"
]
},
"footprint": {
"description": "Is it a footprint metric and what type",
"type": "string",
"enum": [
"avg",
"max",
"min"
]
},
"energy": {
"description": "Is it used to calculate job energy",
"type": "string",
"enum": [
"power",
"energy"
]
},
"lowerIsBetter": {
"description": "Is lower better.",
"type": "boolean"
},
"peak": {
"description": "Metric peak threshold (Upper metric limit)",
"type": "number"
},
"normal": {
"description": "Metric normal threshold",
"type": "number"
},
"caution": {
"description": "Metric caution threshold (Suspicious but does not require immediate action)",
"type": "number"
},
"alert": {
"description": "Metric alert threshold (Requires immediate action)",
"type": "number"
},
"subClusters": {
"description": "Array of cluster hardware partition metric thresholds",
"type": "array",
"items": {
"type": "object",
"properties": {
"name": {
"description": "Hardware partition name",
"type": "string"
},
"footprint": {
"description": "Is it a footprint metric and what type. Overwrite global setting",
"type": "string",
"enum": [
"avg",
"max",
"min"
]
},
"energy": {
"description": "Is it used to calculate job energy. Overwrite global",
"type": "string",
"enum": [
"power",
"energy"
]
},
"lowerIsBetter": {
"description": "Is lower better. Overwrite global",
"type": "boolean"
},
"peak": {
"description": "The maximum possible metric value",
"type": "number"
},
"normal": {
"description": "A common metric value level",
"type": "number"
},
"caution": {
"description": "Metric value requires attention",
"type": "number"
},
"alert": {
"description": "Metric value requiring immediate attention",
"type": "number"
},
"remove": {
"description": "Remove this metric for this subcluster",
"type": "boolean"
}
},
"required": [
"name"
]
}
}
},
"required": [
"name",
"unit",
"scope",
"timestep",
"aggregation",
"peak",
"normal",
"caution",
"alert"
]
},
"minItems": 1
},
"subClusters": {
"description": "Array of cluster hardware partitions",
"type": "array",
"items": {
"type": "object",
"properties": {
"name": {
"description": "Hardware partition name",
"type": "string"
},
"processorType": {
"description": "Processor type",
"type": "string"
},
"socketsPerNode": {
"description": "Number of sockets per node",
"type": "integer"
},
"coresPerSocket": {
"description": "Number of cores per socket",
"type": "integer"
},
"threadsPerCore": {
"description": "Number of SMT threads per core",
"type": "integer"
},
"flopRateScalar": {
"description": "Theoretical node peak flop rate for scalar code in GFlops/s",
"type": "object",
"properties": {
"unit": {
"description": "Metric unit",
"$ref": "embedfs://unit.schema.json"
},
"value": {
"type": "number"
}
}
},
"flopRateSimd": {
"description": "Theoretical node peak flop rate for SIMD code in GFlops/s",
"type": "object",
"properties": {
"unit": {
"description": "Metric unit",
"$ref": "embedfs://unit.schema.json"
},
"value": {
"type": "number"
}
}
},
"memoryBandwidth": {
"description": "Theoretical node peak memory bandwidth in GB/s",
"type": "object",
"properties": {
"unit": {
"description": "Metric unit",
"$ref": "embedfs://unit.schema.json"
},
"value": {
"type": "number"
}
}
},
"nodes": {
"description": "Node list expression",
"type": "string"
},
"topology": {
"description": "Node topology",
"type": "object",
"properties": {
"node": {
"description": "HwTread lists of node",
"type": "array",
"items": {
"type": "integer"
}
},
"socket": {
"description": "HwTread lists of sockets",
"type": "array",
"items": {
"type": "array",
"items": {
"type": "integer"
}
}
},
"memoryDomain": {
"description": "HwTread lists of memory domains",
"type": "array",
"items": {
"type": "array",
"items": {
"type": "integer"
}
}
},
"die": {
"description": "HwTread lists of dies",
"type": "array",
"items": {
"type": "array",
"items": {
"type": "integer"
}
}
},
"core": {
"description": "HwTread lists of cores",
"type": "array",
"items": {
"type": "array",
"items": {
"type": "integer"
}
}
},
"accelerators": {
"type": "array",
"description": "List of of accelerator devices",
"items": {
"type": "object",
"properties": {
"id": {
"type": "string",
"description": "The unique device id"
},
"type": {
"type": "string",
"description": "The accelerator type",
"enum": [
"Nvidia GPU",
"AMD GPU",
"Intel GPU"
]
},
"model": {
"type": "string",
"description": "The accelerator model"
}
},
"required": [
"id",
"type",
"model"
]
}
}
},
"required": [
"node",
"socket",
"memoryDomain"
]
}
},
"required": [
"name",
"nodes",
"topology",
"processorType",
"socketsPerNode",
"coresPerSocket",
"threadsPerCore",
"flopRateScalar",
"flopRateSimd",
"memoryBandwidth"
]
},
"minItems": 1
}
},
"required": [
"name",
"metricConfig",
"subClusters"
]
}
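
For orientation, a minimal cluster description that should satisfy the revised schema. All names and numbers are invented, and the unit objects assume the base/prefix shape of unit.schema.json:

{
    "name": "testcluster",
    "metricConfig": [
        {
            "name": "flops_any",
            "unit": { "base": "F/s", "prefix": "G" },
            "scope": "hwthread",
            "timestep": 60,
            "aggregation": "sum",
            "peak": 5600.0,
            "normal": 1000.0,
            "caution": 200.0,
            "alert": 50.0
        }
    ],
    "subClusters": [
        {
            "name": "main",
            "processorType": "Intel Xeon",
            "socketsPerNode": 2,
            "coresPerSocket": 4,
            "threadsPerCore": 1,
            "flopRateScalar": { "unit": { "base": "F/s", "prefix": "G" }, "value": 14 },
            "flopRateSimd": { "unit": { "base": "F/s", "prefix": "G" }, "value": 112 },
            "memoryBandwidth": { "unit": { "base": "B/s", "prefix": "G" }, "value": 24 },
            "nodes": "node[1-8]",
            "topology": {
                "node": [0, 1, 2, 3, 4, 5, 6, 7],
                "socket": [[0, 1, 2, 3], [4, 5, 6, 7]],
                "memoryDomain": [[0, 1, 2, 3, 4, 5, 6, 7]]
            }
        }
    ]
}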

View File

@@ -1,433 +1,498 @@
{
"$schema": "http://json-schema.org/draft/2020-12/schema",
"$id": "embedfs://config.schema.json",
"title": "cc-backend configuration file schema",
"type": "object",
"properties": {
"addr": {
"description": "Address where the http (or https) server will listen on (for example: 'localhost:80').",
"type": "string"
"$schema": "http://json-schema.org/draft/2020-12/schema",
"$id": "embedfs://config.schema.json",
"title": "cc-backend configuration file schema",
"type": "object",
"properties": {
"addr": {
"description": "Address where the http (or https) server will listen on (for example: 'localhost:80').",
"type": "string"
},
"apiAllowedIPs": {
"description": "Addresses from which secured API endpoints can be reached",
"type": "array",
"items": {
"type": "string"
}
},
"user": {
"description": "Drop root permissions once .env was read and the port was taken. Only applicable if using privileged port.",
"type": "string"
},
"group": {
"description": "Drop root permissions once .env was read and the port was taken. Only applicable if using privileged port.",
"type": "string"
},
"disable-authentication": {
"description": "Disable authentication (for everything: API, Web-UI, ...).",
"type": "boolean"
},
"embed-static-files": {
"description": "If all files in `web/frontend/public` should be served from within the binary itself (they are embedded) or not.",
"type": "boolean"
},
"static-files": {
"description": "Folder where static assets can be found, if embed-static-files is false.",
"type": "string"
},
"db-driver": {
"description": "sqlite3 or mysql (mysql will work for mariadb as well).",
"type": "string",
"enum": [
"sqlite3",
"mysql"
]
},
"db": {
"description": "For sqlite3 a filename, for mysql a DSN in this format: https://github.com/go-sql-driver/mysql#dsn-data-source-name (Without query parameters!).",
"type": "string"
},
"archive": {
"description": "Configuration keys for job-archive",
"type": "object",
"properties": {
"kind": {
"description": "Backend type for job-archive",
"type": "string",
"enum": [
"file",
"s3"
]
},
"user": {
"description": "Drop root permissions once .env was read and the port was taken. Only applicable if using privileged port.",
"type": "string"
"path": {
"description": "Path to job archive for file backend",
"type": "string"
},
"group": {
"description": "Drop root permissions once .env was read and the port was taken. Only applicable if using privileged port.",
"type": "string"
"compression": {
"description": "Setup automatic compression for jobs older than number of days",
"type": "integer"
},
"disable-authentication": {
"description": "Disable authentication (for everything: API, Web-UI, ...).",
"type": "boolean"
},
"embed-static-files": {
"description": "If all files in `web/frontend/public` should be served from within the binary itself (they are embedded) or not.",
"type": "boolean"
},
"static-files": {
"description": "Folder where static assets can be found, if embed-static-files is false.",
"type": "string"
},
"db-driver": {
"description": "sqlite3 or mysql (mysql will work for mariadb as well).",
"type": "string",
"enum": [
"sqlite3",
"mysql"
]
},
"db": {
"description": "For sqlite3 a filename, for mysql a DSN in this format: https://github.com/go-sql-driver/mysql#dsn-data-source-name (Without query parameters!).",
"type": "string"
},
"job-archive": {
"description": "Configuration keys for job-archive",
"type": "object",
"properties": {
"kind": {
"description": "Backend type for job-archive",
"type": "string",
"enum": [
"file",
"s3"
]
},
"path": {
"description": "Path to job archive for file backend",
"type": "string"
},
"compression": {
"description": "Setup automatic compression for jobs older than number of days",
"type": "integer"
},
"retention": {
"description": "Configuration keys for retention",
"type": "object",
"properties": {
"policy": {
"description": "Retention policy",
"type": "string",
"enum": [
"none",
"delete",
"move"
]
},
"includeDB": {
"description": "Also remove jobs from database",
"type": "boolean"
},
"age": {
"description": "Act on jobs with startTime older than age (in days)",
"type": "integer"
},
"location": {
"description": "The target directory for retention. Only applicable for retention move.",
"type": "string"
}
},
"required": [
"policy"
]
}
"retention": {
"description": "Configuration keys for retention",
"type": "object",
"properties": {
"policy": {
"description": "Retention policy",
"type": "string",
"enum": [
"none",
"delete",
"move"
]
},
"required": [
"kind"
]
"includeDB": {
"description": "Also remove jobs from database",
"type": "boolean"
},
"age": {
"description": "Act on jobs with startTime older than age (in days)",
"type": "integer"
},
"location": {
"description": "The target directory for retention. Only applicable for retention move.",
"type": "string"
}
},
"required": [
"policy"
]
}
},
"required": [
"kind"
]
},
"disable-archive": {
"description": "Keep all metric data in the metric data repositories, do not write to the job-archive.",
"type": "boolean"
},
"validate": {
"description": "Validate all input json documents against json schema.",
"type": "boolean"
},
"session-max-age": {
"description": "Specifies for how long a session shall be valid as a string parsable by time.ParseDuration(). If 0 or empty, the session/token does not expire!",
"type": "string"
},
"https-cert-file": {
"description": "Filepath to SSL certificate. If also https-key-file is set use HTTPS using those certificates.",
"type": "string"
},
"https-key-file": {
"description": "Filepath to SSL key file. If also https-cert-file is set use HTTPS using those certificates.",
"type": "string"
},
"redirect-http-to": {
"description": "If not the empty string and addr does not end in :80, redirect every request incoming at port 80 to that url.",
"type": "string"
},
"stop-jobs-exceeding-walltime": {
"description": "If not zero, automatically mark jobs as stopped running X seconds longer than their walltime. Only applies if walltime is set for job.",
"type": "integer"
},
"short-running-jobs-duration": {
"description": "Do not show running jobs shorter than X seconds.",
"type": "integer"
},
"emission-constant": {
"description": ".",
"type": "integer"
},
"cron-frequency": {
"description": "Frequency of cron job workers.",
"type": "object",
"properties": {
"duration-worker": {
"description": "Duration Update Worker [Defaults to '5m']",
"type": "string"
},
"disable-archive": {
"description": "Keep all metric data in the metric data repositories, do not write to the job-archive.",
"type": "boolean"
"footprint-worker": {
"description": "Metric-Footprint Update Worker [Defaults to '10m']",
"type": "string"
}
}
},
"enable-resampling": {
"description": "Enable dynamic zoom in frontend metric plots.",
"type": "object",
"properties": {
"trigger": {
"description": "Trigger next zoom level at less than this many visible datapoints.",
"type": "integer"
},
"validate": {
"description": "Validate all input json documents against json schema.",
"type": "boolean"
},
"session-max-age": {
"description": "Specifies for how long a session shall be valid as a string parsable by time.ParseDuration(). If 0 or empty, the session/token does not expire!",
"type": "string"
},
"https-cert-file": {
"description": "Filepath to SSL certificate. If also https-key-file is set use HTTPS using those certificates.",
"type": "string"
},
"https-key-file": {
"description": "Filepath to SSL key file. If also https-cert-file is set use HTTPS using those certificates.",
"type": "string"
},
"redirect-http-to": {
"description": "If not the empty string and addr does not end in :80, redirect every request incoming at port 80 to that url.",
"type": "string"
},
"stop-jobs-exceeding-walltime": {
"description": "If not zero, automatically mark jobs as stopped running X seconds longer than their walltime. Only applies if walltime is set for job.",
"resolutions": {
"description": "Array of resampling target resolutions, in seconds.",
"type": "array",
"items": {
"type": "integer"
}
}
},
"required": [
"trigger",
"resolutions"
]
},
"jwts": {
"description": "For JWT token authentication.",
"type": "object",
"properties": {
"max-age": {
"description": "Configure how long a token is valid. As string parsable by time.ParseDuration()",
"type": "string"
},
"short-running-jobs-duration": {
"description": "Do not show running jobs shorter than X seconds.",
"type": "integer"
"cookieName": {
"description": "Cookie that should be checked for a JWT token.",
"type": "string"
},
"jwts": {
"description": "For JWT token authentication.",
"validateUser": {
"description": "Deny login for users not in database (but defined in JWT). Overwrite roles in JWT with database roles.",
"type": "boolean"
},
"trustedIssuer": {
"description": "Issuer that should be accepted when validating external JWTs ",
"type": "string"
},
"syncUserOnLogin": {
"description": "Add non-existent user to DB at login attempt with values provided in JWT.",
"type": "boolean"
}
},
"required": [
"max-age"
]
},
"oidc": {
"provider": {
"description": "",
"type": "string"
},
"syncUserOnLogin": {
"description": "",
"type": "boolean"
},
"updateUserOnLogin": {
"description": "",
"type": "boolean"
},
"required": [
"provider"
]
},
"ldap": {
"description": "For LDAP Authentication and user synchronisation.",
"type": "object",
"properties": {
"url": {
"description": "URL of LDAP directory server.",
"type": "string"
},
"user_base": {
"description": "Base DN of user tree root.",
"type": "string"
},
"search_dn": {
"description": "DN for authenticating LDAP admin account with general read rights.",
"type": "string"
},
"user_bind": {
"description": "Expression used to authenticate users via LDAP bind. Must contain uid={username}.",
"type": "string"
},
"user_filter": {
"description": "Filter to extract users for syncing.",
"type": "string"
},
"username_attr": {
"description": "Attribute with full username. Default: gecos",
"type": "string"
},
"sync_interval": {
"description": "Interval used for syncing local user table with LDAP directory. Parsed using time.ParseDuration.",
"type": "string"
},
"sync_del_old_users": {
"description": "Delete obsolete users in database.",
"type": "boolean"
},
"syncUserOnLogin": {
"description": "Add non-existent user to DB at login attempt if user exists in Ldap directory",
"type": "boolean"
}
},
"required": [
"url",
"user_base",
"search_dn",
"user_bind",
"user_filter"
]
},
"clusters": {
"description": "Configuration for the clusters to be displayed.",
"type": "array",
"items": {
"type": "object",
"properties": {
"name": {
"description": "The name of the cluster.",
"type": "string"
},
"metricDataRepository": {
"description": "Type of the metric data repository for this cluster",
"type": "object",
"properties": {
"max-age": {
"description": "Configure how long a token is valid. As string parsable by time.ParseDuration()",
"type": "string"
},
"cookieName": {
"description": "Cookie that should be checked for a JWT token.",
"type": "string"
},
"validateUser": {
"description": "Deny login for users not in database (but defined in JWT). Overwrite roles in JWT with database roles.",
"type": "boolean"
},
"trustedIssuer": {
"description": "Issuer that should be accepted when validating external JWTs ",
"type": "string"
},
"syncUserOnLogin": {
"description": "Add non-existent user to DB at login attempt with values provided in JWT.",
"type": "boolean"
}
"kind": {
"type": "string",
"enum": [
"influxdb",
"prometheus",
"cc-metric-store",
"test"
]
},
"url": {
"type": "string"
},
"token": {
"type": "string"
}
},
"required": [
"max-age"
"kind",
"url"
]
},
"ldap": {
"description": "For LDAP Authentication and user synchronisation.",
},
"filterRanges": {
"description": "This option controls the slider ranges for the UI controls of numNodes, duration, and startTime.",
"type": "object",
"properties": {
"url": {
"description": "URL of LDAP directory server.",
"type": "string"
},
"user_base": {
"description": "Base DN of user tree root.",
"type": "string"
},
"search_dn": {
"description": "DN for authenticating LDAP admin account with general read rights.",
"type": "string"
},
"user_bind": {
"description": "Expression used to authenticate users via LDAP bind. Must contain uid={username}.",
"type": "string"
},
"user_filter": {
"description": "Filter to extract users for syncing.",
"type": "string"
},
"username_attr": {
"description": "Attribute with full username. Default: gecos",
"type": "string"
},
"sync_interval": {
"description": "Interval used for syncing local user table with LDAP directory. Parsed using time.ParseDuration.",
"type": "string"
},
"sync_del_old_users": {
"description": "Delete obsolete users in database.",
"type": "boolean"
},
"syncUserOnLogin": {
"description": "Add non-existent user to DB at login attempt if user exists in Ldap directory",
"type": "boolean"
}
},
"required": [
"url",
"user_base",
"search_dn",
"user_bind",
"user_filter"
]
},
"clusters": {
"description": "Configuration for the clusters to be displayed.",
"type": "array",
"items": {
"numNodes": {
"description": "UI slider range for number of nodes",
"type": "object",
"properties": {
"name": {
"description": "The name of the cluster.",
"type": "string"
},
"metricDataRepository": {
"description": "Type of the metric data repository for this cluster",
"type": "object",
"properties": {
"kind": {
"type": "string",
"enum": [
"influxdb",
"prometheus",
"cc-metric-store",
"test"
]
},
"url": {
"type": "string"
},
"token": {
"type": "string"
}
},
"required": [
"kind",
"url"
]
},
"filterRanges": {
"description": "This option controls the slider ranges for the UI controls of numNodes, duration, and startTime.",
"type": "object",
"properties": {
"numNodes": {
"description": "UI slider range for number of nodes",
"type": "object",
"properties": {
"from": {
"type": "integer"
},
"to": {
"type": "integer"
}
},
"required": [
"from",
"to"
]
},
"duration": {
"description": "UI slider range for duration",
"type": "object",
"properties": {
"from": {
"type": "integer"
},
"to": {
"type": "integer"
}
},
"required": [
"from",
"to"
]
},
"startTime": {
"description": "UI slider range for start time",
"type": "object",
"properties": {
"from": {
"type": "string",
"format": "date-time"
},
"to": {
"type": "null"
}
},
"required": [
"from",
"to"
]
}
},
"required": [
"numNodes",
"duration",
"startTime"
]
}
"from": {
"type": "integer"
},
"to": {
"type": "integer"
}
},
"required": [
"name",
"metricDataRepository",
"filterRanges"
],
"minItems": 1
}
},
"ui-defaults": {
"description": "Default configuration for web UI",
"type": "object",
"properties": {
"plot_general_colorBackground": {
"description": "Color plot background according to job average threshold limits",
"type": "boolean"
},
"plot_general_lineWidth": {
"description": "Initial linewidth",
"from",
"to"
]
},
"duration": {
"description": "UI slider range for duration",
"type": "object",
"properties": {
"from": {
"type": "integer"
},
"plot_list_jobsPerPage": {
"description": "Jobs shown per page in job lists",
},
"to": {
"type": "integer"
}
},
"plot_view_plotsPerRow": {
"description": "Number of plots per row in single job view",
"type": "integer"
"required": [
"from",
"to"
]
},
"startTime": {
"description": "UI slider range for start time",
"type": "object",
"properties": {
"from": {
"type": "string",
"format": "date-time"
},
"to": {
"type": "null"
}
},
"plot_view_showPolarplot": {
"description": "Option to toggle polar plot in single job view",
"type": "boolean"
},
"plot_view_showRoofline": {
"description": "Option to toggle roofline plot in single job view",
"type": "boolean"
},
"plot_view_showStatTable": {
"description": "Option to toggle the node statistic table in single job view",
"type": "boolean"
},
"system_view_selectedMetric": {
"description": "Initial metric shown in system view",
"type": "string"
},
"analysis_view_histogramMetrics": {
"description": "Metrics to show as job count histograms in analysis view",
"type": "array",
"items": {
"type": "string",
"minItems": 1
}
},
"analysis_view_scatterPlotMetrics": {
"description": "Initial scatter plto configuration in analysis view",
"type": "array",
"items": {
"type": "array",
"items": {
"type": "string",
"minItems": 2,
"maxItems": 2
},
"minItems": 1
}
},
"job_view_nodestats_selectedMetrics": {
"description": "Initial metrics shown in node statistics table of single job view",
"type": "array",
"items": {
"type": "string",
"minItems": 1
}
},
"job_view_polarPlotMetrics": {
"description": "Metrics shown in polar plot of single job view",
"type": "array",
"items": {
"type": "string",
"minItems": 1
}
},
"job_view_selectedMetrics": {
"description": "",
"type": "array",
"items": {
"type": "string",
"minItems": 1
}
},
"plot_general_colorscheme": {
"description": "Initial color scheme",
"type": "array",
"items": {
"type": "string",
"minItems": 1
}
},
"plot_list_selectedMetrics": {
"description": "Initial metric plots shown in jobs lists",
"type": "array",
"items": {
"type": "string",
"minItems": 1
}
}
"required": [
"from",
"to"
]
}
},
"required": [
"plot_general_colorBackground",
"plot_general_lineWidth",
"plot_list_jobsPerPage",
"plot_view_plotsPerRow",
"plot_view_showPolarplot",
"plot_view_showRoofline",
"plot_view_showStatTable",
"system_view_selectedMetric",
"analysis_view_histogramMetrics",
"analysis_view_scatterPlotMetrics",
"job_view_nodestats_selectedMetrics",
"job_view_polarPlotMetrics",
"job_view_selectedMetrics",
"plot_general_colorscheme",
"plot_list_selectedMetrics"
"numNodes",
"duration",
"startTime"
]
}
}
},
"required": [
"name",
"metricDataRepository",
"filterRanges"
],
"minItems": 1
}
},
"required": [
"jwts",
"clusters"
]
"ui-defaults": {
"description": "Default configuration for web UI",
"type": "object",
"properties": {
"plot_general_colorBackground": {
"description": "Color plot background according to job average threshold limits",
"type": "boolean"
},
"plot_general_lineWidth": {
"description": "Initial linewidth",
"type": "integer"
},
"plot_list_jobsPerPage": {
"description": "Jobs shown per page in job lists",
"type": "integer"
},
"plot_view_plotsPerRow": {
"description": "Number of plots per row in single job view",
"type": "integer"
},
"plot_view_showPolarplot": {
"description": "Option to toggle polar plot in single job view",
"type": "boolean"
},
"plot_view_showRoofline": {
"description": "Option to toggle roofline plot in single job view",
"type": "boolean"
},
"plot_view_showStatTable": {
"description": "Option to toggle the node statistic table in single job view",
"type": "boolean"
},
"system_view_selectedMetric": {
"description": "Initial metric shown in system view",
"type": "string"
},
"job_view_showFootprint": {
"description": "Option to toggle footprint ui in single job view",
"type": "boolean"
},
"job_list_usePaging": {
"description": "Option to switch from continous scroll to paging",
"type": "boolean"
},
"analysis_view_histogramMetrics": {
"description": "Metrics to show as job count histograms in analysis view",
"type": "array",
"items": {
"type": "string",
"minItems": 1
}
},
"analysis_view_scatterPlotMetrics": {
"description": "Initial scatter plto configuration in analysis view",
"type": "array",
"items": {
"type": "array",
"items": {
"type": "string",
"minItems": 2,
"maxItems": 2
},
"minItems": 1
}
},
"job_view_nodestats_selectedMetrics": {
"description": "Initial metrics shown in node statistics table of single job view",
"type": "array",
"items": {
"type": "string",
"minItems": 1
}
},
"job_view_selectedMetrics": {
"description": "Initial metrics shown as plots in single job view",
"type": "array",
"items": {
"type": "string",
"minItems": 1
}
},
"plot_general_colorscheme": {
"description": "Initial color scheme",
"type": "array",
"items": {
"type": "string",
"minItems": 1
}
},
"plot_list_selectedMetrics": {
"description": "Initial metric plots shown in jobs lists",
"type": "array",
"items": {
"type": "string",
"minItems": 1
}
}
},
"required": [
"plot_general_colorBackground",
"plot_general_lineWidth",
"plot_list_jobsPerPage",
"plot_view_plotsPerRow",
"plot_view_showPolarplot",
"plot_view_showRoofline",
"plot_view_showStatTable",
"system_view_selectedMetric",
"job_view_showFootprint",
"job_list_usePaging",
"analysis_view_histogramMetrics",
"analysis_view_scatterPlotMetrics",
"job_view_nodestats_selectedMetrics",
"job_view_selectedMetrics",
"plot_general_colorscheme",
"plot_list_selectedMetrics"
]
}
},
"required": [
"jwts",
"clusters",
"apiAllowedIPs"
]
}
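
A minimal config.json satisfying the new required keys (jwts, clusters, apiAllowedIPs). URLs, the token, and the cluster name are placeholders:

{
    "addr": "localhost:8080",
    "apiAllowedIPs": ["*"],
    "jwts": { "max-age": "2000h" },
    "archive": { "kind": "file", "path": "./var/job-archive" },
    "clusters": [
        {
            "name": "testcluster",
            "metricDataRepository": {
                "kind": "cc-metric-store",
                "url": "http://localhost:8082",
                "token": "changeme"
            },
            "filterRanges": {
                "numNodes": { "from": 1, "to": 64 },
                "duration": { "from": 0, "to": 86400 },
                "startTime": { "from": "2023-01-01T00:00:00Z", "to": null }
            }
        }
    ]
}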

View File

@@ -1,490 +1,490 @@
{
"$schema": "http://json-schema.org/draft/2020-12/schema",
"$id": "embedfs://job-data.schema.json",
"title": "Job metric data list",
"description": "Collection of metric data of a HPC job",
"type": "object",
"properties": {
"mem_used": {
"description": "Memory capacity used",
"type": "object",
"properties": {
"node": {
"$ref": "embedfs://job-metric-data.schema.json"
}
},
"required": [
"node"
]
},
"flops_any": {
"description": "Total flop rate with DP flops scaled up",
"properties": {
"node": {
"$ref": "embedfs://job-metric-data.schema.json"
},
"socket": {
"$ref": "embedfs://job-metric-data.schema.json"
},
"memoryDomain": {
"$ref": "embedfs://job-metric-data.schema.json"
},
"core": {
"$ref": "embedfs://job-metric-data.schema.json"
},
"hwthread": {
"$ref": "embedfs://job-metric-data.schema.json"
}
},
"minProperties": 1
},
"mem_bw": {
"description": "Main memory bandwidth",
"properties": {
"node": {
"$ref": "embedfs://job-metric-data.schema.json"
},
"socket": {
"$ref": "embedfs://job-metric-data.schema.json"
},
"memoryDomain": {
"$ref": "embedfs://job-metric-data.schema.json"
}
},
"minProperties": 1
},
"net_bw": {
"description": "Total fast interconnect network bandwidth",
"type": "object",
"properties": {
"node": {
"$ref": "embedfs://job-metric-data.schema.json"
}
},
"required": [
"node"
]
},
"ipc": {
"description": "Instructions executed per cycle",
"properties": {
"node": {
"$ref": "embedfs://job-metric-data.schema.json"
},
"socket": {
"$ref": "embedfs://job-metric-data.schema.json"
},
"memoryDomain": {
"$ref": "embedfs://job-metric-data.schema.json"
},
"core": {
"$ref": "embedfs://job-metric-data.schema.json"
},
"hwthread": {
"$ref": "embedfs://job-metric-data.schema.json"
}
},
"minProperties": 1
},
"cpu_user": {
"description": "CPU user active core utilization",
"properties": {
"node": {
"$ref": "embedfs://job-metric-data.schema.json"
},
"socket": {
"$ref": "embedfs://job-metric-data.schema.json"
},
"memoryDomain": {
"$ref": "embedfs://job-metric-data.schema.json"
},
"core": {
"$ref": "embedfs://job-metric-data.schema.json"
},
"hwthread": {
"$ref": "embedfs://job-metric-data.schema.json"
}
},
"minProperties": 1
},
"cpu_load": {
"description": "CPU requested core utilization (load 1m)",
"properties": {
"node": {
"$ref": "embedfs://job-metric-data.schema.json"
}
},
"required": [
"node"
]
},
"flops_dp": {
"description": "Double precision flop rate",
"properties": {
"node": {
"$ref": "embedfs://job-metric-data.schema.json"
},
"socket": {
"$ref": "embedfs://job-metric-data.schema.json"
},
"memoryDomain": {
"$ref": "embedfs://job-metric-data.schema.json"
},
"core": {
"$ref": "embedfs://job-metric-data.schema.json"
},
"hwthread": {
"$ref": "embedfs://job-metric-data.schema.json"
}
},
"minProperties": 1
},
"flops_sp": {
"description": "Single precision flops rate",
"properties": {
"node": {
"$ref": "embedfs://job-metric-data.schema.json"
},
"socket": {
"$ref": "embedfs://job-metric-data.schema.json"
},
"memoryDomain": {
"$ref": "embedfs://job-metric-data.schema.json"
},
"core": {
"$ref": "embedfs://job-metric-data.schema.json"
},
"hwthread": {
"$ref": "embedfs://job-metric-data.schema.json"
}
},
"minProperties": 1
},
"vectorization_ratio": {
"description": "Fraction of arithmetic instructions using SIMD instructions",
"properties": {
"node": {
"$ref": "embedfs://job-metric-data.schema.json"
},
"socket": {
"$ref": "embedfs://job-metric-data.schema.json"
},
"memoryDomain": {
"$ref": "embedfs://job-metric-data.schema.json"
},
"core": {
"$ref": "embedfs://job-metric-data.schema.json"
},
"hwthread": {
"$ref": "embedfs://job-metric-data.schema.json"
}
},
"minProperties": 1
},
"cpu_power": {
"description": "CPU power consumption",
"properties": {
"node": {
"$ref": "embedfs://job-metric-data.schema.json"
},
"socket": {
"$ref": "embedfs://job-metric-data.schema.json"
}
},
"minProperties": 1
},
"mem_power": {
"description": "Memory power consumption",
"properties": {
"node": {
"$ref": "embedfs://job-metric-data.schema.json"
},
"socket": {
"$ref": "embedfs://job-metric-data.schema.json"
}
},
"minProperties": 1
},
"acc_utilization": {
"description": "GPU utilization",
"properties": {
"accelerator": {
"$ref": "embedfs://job-metric-data.schema.json"
}
},
"required": [
"accelerator"
]
},
"acc_mem_used": {
"description": "GPU memory capacity used",
"properties": {
"accelerator": {
"$ref": "embedfs://job-metric-data.schema.json"
}
},
"required": [
"accelerator"
]
},
"acc_power": {
"description": "GPU power consumption",
"properties": {
"accelerator": {
"$ref": "embedfs://job-metric-data.schema.json"
}
},
"required": [
"accelerator"
]
},
"clock": {
"description": "Average core frequency",
"properties": {
"node": {
"$ref": "embedfs://job-metric-data.schema.json"
},
"socket": {
"$ref": "embedfs://job-metric-data.schema.json"
},
"memoryDomain": {
"$ref": "embedfs://job-metric-data.schema.json"
},
"core": {
"$ref": "embedfs://job-metric-data.schema.json"
},
"hwthread": {
"$ref": "embedfs://job-metric-data.schema.json"
}
},
"minProperties": 1
},
"eth_read_bw": {
"description": "Ethernet read bandwidth",
"properties": {
"node": {
"$ref": "embedfs://job-metric-data.schema.json"
}
},
"required": [
"node"
]
},
"eth_write_bw": {
"description": "Ethernet write bandwidth",
"properties": {
"node": {
"$ref": "embedfs://job-metric-data.schema.json"
}
},
"required": [
"node"
]
},
"filesystems": {
"description": "Array of filesystems",
"type": "array",
"items": {
"type": "object",
"properties": {
"name": {
"type": "string"
},
"type": {
"type": "string",
"enum": [
"nfs",
"lustre",
"gpfs",
"nvme",
"ssd",
"hdd",
"beegfs"
]
},
"read_bw": {
"description": "File system read bandwidth",
"properties": {
"node": {
"$ref": "embedfs://job-metric-data.schema.json"
}
},
"required": [
"node"
]
},
"write_bw": {
"description": "File system write bandwidth",
"properties": {
"node": {
"$ref": "embedfs://job-metric-data.schema.json"
}
},
"required": [
"node"
]
},
"read_req": {
"description": "File system read requests",
"properties": {
"node": {
"$ref": "embedfs://job-metric-data.schema.json"
}
},
"required": [
"node"
]
},
"write_req": {
"description": "File system write requests",
"properties": {
"node": {
"$ref": "embedfs://job-metric-data.schema.json"
}
},
"required": [
"node"
]
},
"inodes": {
"description": "File system write requests",
"properties": {
"node": {
"$ref": "embedfs://job-metric-data.schema.json"
}
},
"required": [
"node"
]
},
"accesses": {
"description": "File system open and close",
"properties": {
"node": {
"$ref": "embedfs://job-metric-data.schema.json"
}
},
"required": [
"node"
]
},
"fsync": {
"description": "File system fsync",
"properties": {
"node": {
"$ref": "embedfs://job-metric-data.schema.json"
}
},
"required": [
"node"
]
},
"create": {
"description": "File system create",
"properties": {
"node": {
"$ref": "embedfs://job-metric-data.schema.json"
}
},
"required": [
"node"
]
},
"open": {
"description": "File system open",
"properties": {
"node": {
"$ref": "embedfs://job-metric-data.schema.json"
}
},
"required": [
"node"
]
},
"close": {
"description": "File system close",
"properties": {
"node": {
"$ref": "embedfs://job-metric-data.schema.json"
}
},
"required": [
"node"
]
},
"seek": {
"description": "File system seek",
"properties": {
"node": {
"$ref": "embedfs://job-metric-data.schema.json"
}
},
"required": [
"node"
]
}
},
"required": [
"name",
"type",
"read_bw",
"write_bw"
]
},
"minItems": 1
"$schema": "http://json-schema.org/draft/2020-12/schema",
"$id": "embedfs://job-data.schema.json",
"title": "Job metric data list",
"description": "Collection of metric data of a HPC job",
"type": "object",
"properties": {
"mem_used": {
"description": "Memory capacity used",
"type": "object",
"properties": {
"node": {
"$ref": "embedfs://job-metric-data.schema.json"
}
},
"required": [
"node"
]
},
"ic_rcv_packets": {
"description": "Network interconnect read packets",
"flops_any": {
"description": "Total flop rate with DP flops scaled up",
"properties": {
"node": {
"$ref": "embedfs://job-metric-data.schema.json"
},
"socket": {
"$ref": "embedfs://job-metric-data.schema.json"
},
"memoryDomain": {
"$ref": "embedfs://job-metric-data.schema.json"
},
"core": {
"$ref": "embedfs://job-metric-data.schema.json"
},
"hwthread": {
"$ref": "embedfs://job-metric-data.schema.json"
}
},
"minProperties": 1
},
"mem_bw": {
"description": "Main memory bandwidth",
"properties": {
"node": {
"$ref": "embedfs://job-metric-data.schema.json"
},
"socket": {
"$ref": "embedfs://job-metric-data.schema.json"
},
"memoryDomain": {
"$ref": "embedfs://job-metric-data.schema.json"
}
},
"minProperties": 1
},
"net_bw": {
"description": "Total fast interconnect network bandwidth",
"type": "object",
"properties": {
"node": {
"$ref": "embedfs://job-metric-data.schema.json"
}
},
"required": [
"node"
]
},
"ipc": {
"description": "Instructions executed per cycle",
"properties": {
"node": {
"$ref": "embedfs://job-metric-data.schema.json"
},
"socket": {
"$ref": "embedfs://job-metric-data.schema.json"
},
"memoryDomain": {
"$ref": "embedfs://job-metric-data.schema.json"
},
"core": {
"$ref": "embedfs://job-metric-data.schema.json"
},
"hwthread": {
"$ref": "embedfs://job-metric-data.schema.json"
}
},
"minProperties": 1
},
"cpu_user": {
"description": "CPU user active core utilization",
"properties": {
"node": {
"$ref": "embedfs://job-metric-data.schema.json"
},
"socket": {
"$ref": "embedfs://job-metric-data.schema.json"
},
"memoryDomain": {
"$ref": "embedfs://job-metric-data.schema.json"
},
"core": {
"$ref": "embedfs://job-metric-data.schema.json"
},
"hwthread": {
"$ref": "embedfs://job-metric-data.schema.json"
}
},
"minProperties": 1
},
"cpu_load": {
"description": "CPU requested core utilization (load 1m)",
"properties": {
"node": {
"$ref": "embedfs://job-metric-data.schema.json"
}
},
"required": [
"node"
]
},
"flops_dp": {
"description": "Double precision flop rate",
"properties": {
"node": {
"$ref": "embedfs://job-metric-data.schema.json"
},
"socket": {
"$ref": "embedfs://job-metric-data.schema.json"
},
"memoryDomain": {
"$ref": "embedfs://job-metric-data.schema.json"
},
"core": {
"$ref": "embedfs://job-metric-data.schema.json"
},
"hwthread": {
"$ref": "embedfs://job-metric-data.schema.json"
}
},
"minProperties": 1
},
"flops_sp": {
"description": "Single precision flops rate",
"properties": {
"node": {
"$ref": "embedfs://job-metric-data.schema.json"
},
"socket": {
"$ref": "embedfs://job-metric-data.schema.json"
},
"memoryDomain": {
"$ref": "embedfs://job-metric-data.schema.json"
},
"core": {
"$ref": "embedfs://job-metric-data.schema.json"
},
"hwthread": {
"$ref": "embedfs://job-metric-data.schema.json"
}
},
"minProperties": 1
},
"vectorization_ratio": {
"description": "Fraction of arithmetic instructions using SIMD instructions",
"properties": {
"node": {
"$ref": "embedfs://job-metric-data.schema.json"
},
"socket": {
"$ref": "embedfs://job-metric-data.schema.json"
},
"memoryDomain": {
"$ref": "embedfs://job-metric-data.schema.json"
},
"core": {
"$ref": "embedfs://job-metric-data.schema.json"
},
"hwthread": {
"$ref": "embedfs://job-metric-data.schema.json"
}
},
"minProperties": 1
},
"cpu_power": {
"description": "CPU power consumption",
"properties": {
"node": {
"$ref": "embedfs://job-metric-data.schema.json"
},
"socket": {
"$ref": "embedfs://job-metric-data.schema.json"
}
},
"minProperties": 1
},
"mem_power": {
"description": "Memory power consumption",
"properties": {
"node": {
"$ref": "embedfs://job-metric-data.schema.json"
},
"socket": {
"$ref": "embedfs://job-metric-data.schema.json"
}
},
"minProperties": 1
},
"acc_utilization": {
"description": "GPU utilization",
"properties": {
"accelerator": {
"$ref": "embedfs://job-metric-data.schema.json"
}
},
"required": [
"accelerator"
]
},
"acc_mem_used": {
"description": "GPU memory capacity used",
"properties": {
"accelerator": {
"$ref": "embedfs://job-metric-data.schema.json"
}
},
"required": [
"accelerator"
]
},
"acc_power": {
"description": "GPU power consumption",
"properties": {
"accelerator": {
"$ref": "embedfs://job-metric-data.schema.json"
}
},
"required": [
"accelerator"
]
},
"clock": {
"description": "Average core frequency",
"properties": {
"node": {
"$ref": "embedfs://job-metric-data.schema.json"
},
"socket": {
"$ref": "embedfs://job-metric-data.schema.json"
},
"memoryDomain": {
"$ref": "embedfs://job-metric-data.schema.json"
},
"core": {
"$ref": "embedfs://job-metric-data.schema.json"
},
"hwthread": {
"$ref": "embedfs://job-metric-data.schema.json"
}
},
"minProperties": 1
},
"eth_read_bw": {
"description": "Ethernet read bandwidth",
"properties": {
"node": {
"$ref": "embedfs://job-metric-data.schema.json"
}
},
"required": [
"node"
]
},
"eth_write_bw": {
"description": "Ethernet write bandwidth",
"properties": {
"node": {
"$ref": "embedfs://job-metric-data.schema.json"
}
},
"required": [
"node"
]
},
"filesystems": {
"description": "Array of filesystems",
"type": "array",
"items": {
"type": "object",
"properties": {
"node": {
"name": {
"type": "string"
},
"type": {
"type": "string",
"enum": [
"nfs",
"lustre",
"gpfs",
"nvme",
"ssd",
"hdd",
"beegfs"
]
},
"read_bw": {
"description": "File system read bandwidth",
"properties": {
"node": {
"$ref": "embedfs://job-metric-data.schema.json"
}
                },
"required": [
"node"
]
},
"write_bw": {
"description": "File system write bandwidth",
"properties": {
"node": {
"$ref": "embedfs://job-metric-data.schema.json"
}
},
"required": [
"node"
]
},
"read_req": {
"description": "File system read requests",
"properties": {
"node": {
"$ref": "embedfs://job-metric-data.schema.json"
}
},
"required": [
"node"
]
},
"write_req": {
"description": "File system write requests",
"properties": {
"node": {
"$ref": "embedfs://job-metric-data.schema.json"
}
},
"required": [
"node"
]
},
"inodes": {
"description": "File system write requests",
"properties": {
"node": {
"$ref": "embedfs://job-metric-data.schema.json"
}
},
"required": [
"node"
]
},
"accesses": {
"description": "File system open and close",
"properties": {
"node": {
"$ref": "embedfs://job-metric-data.schema.json"
}
},
"required": [
"node"
]
},
"fsync": {
"description": "File system fsync",
"properties": {
"node": {
"$ref": "embedfs://job-metric-data.schema.json"
}
},
"required": [
"node"
]
},
"create": {
"description": "File system create",
"properties": {
"node": {
"$ref": "embedfs://job-metric-data.schema.json"
}
},
"required": [
"node"
]
},
"open": {
"description": "File system open",
"properties": {
"node": {
"$ref": "embedfs://job-metric-data.schema.json"
}
},
"required": [
"node"
]
},
"close": {
"description": "File system close",
"properties": {
"node": {
"$ref": "embedfs://job-metric-data.schema.json"
}
},
"required": [
"node"
]
},
"seek": {
"description": "File system seek",
"properties": {
"node": {
"$ref": "embedfs://job-metric-data.schema.json"
}
},
"required": [
"node"
]
}
                },
                "required": [
                    "name",
                    "type",
                    "read_bw",
                    "write_bw"
                ]
            },
            "minItems": 1
        },
        "ic_rcv_packets": {
            "description": "Network interconnect read packets",
            "properties": {
                "node": {
                    "$ref": "embedfs://job-metric-data.schema.json"
                }
            },
            "required": [
                "node"
            ]
        },
        "ic_send_packets": {
            "description": "Network interconnect send packets",
            "properties": {
                "node": {
                    "$ref": "embedfs://job-metric-data.schema.json"
                }
            },
            "required": [
                "node"
            ]
        },
        "ic_read_bw": {
            "description": "Network interconnect read bandwidth",
            "properties": {
                "node": {
                    "$ref": "embedfs://job-metric-data.schema.json"
                }
            },
            "required": [
                "node"
            ]
        },
        "ic_write_bw": {
            "description": "Network interconnect write bandwidth",
            "properties": {
                "node": {
                    "$ref": "embedfs://job-metric-data.schema.json"
                }
            },
            "required": [
                "node"
            ]
        }
    },
    "required": [
        "cpu_user",
        "cpu_load",
        "mem_used",
        "flops_any",
        "mem_bw",
        "net_bw",
        "filesystems"
    ]
}
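
A fragment of a matching job data file, showing only the mem_used entry; a complete file must also provide cpu_user, cpu_load, flops_any, mem_bw, net_bw, and filesystems in the same pattern. The series shape follows the JobMetric/Series structs above; hostname and values are invented:

{
    "mem_used": {
        "node": {
            "unit": { "base": "B", "prefix": "G" },
            "timestep": 60,
            "series": [
                {
                    "hostname": "node042",
                    "statistics": { "min": 10.2, "avg": 11.1, "max": 12.0 },
                    "data": [10.2, 11.1, 12.0]
                }
            ]
        }
    }
}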

View File

@@ -1,351 +1,351 @@
{
"$schema": "http://json-schema.org/draft/2020-12/schema",
"$id": "embedfs://job-meta.schema.json",
"title": "Job meta data",
"description": "Meta data information of a HPC job",
"type": "object",
"properties": {
"jobId": {
"description": "The unique identifier of a job",
"type": "integer"
},
"user": {
"description": "The unique identifier of a user",
"$schema": "http://json-schema.org/draft/2020-12/schema",
"$id": "embedfs://job-meta.schema.json",
"title": "Job meta data",
"description": "Meta data information of a HPC job",
"type": "object",
"properties": {
"jobId": {
"description": "The unique identifier of a job",
"type": "integer"
},
"user": {
"description": "The unique identifier of a user",
"type": "string"
},
"project": {
"description": "The unique identifier of a project",
"type": "string"
},
"cluster": {
"description": "The unique identifier of a cluster",
"type": "string"
},
"subCluster": {
"description": "The unique identifier of a sub cluster",
"type": "string"
},
"partition": {
"description": "The Slurm partition to which the job was submitted",
"type": "string"
},
"arrayJobId": {
"description": "The unique identifier of an array job",
"type": "integer"
},
"numNodes": {
"description": "Number of nodes used",
"type": "integer",
"exclusiveMinimum": 0
},
"numHwthreads": {
"description": "Number of HWThreads used",
"type": "integer",
"exclusiveMinimum": 0
},
"numAcc": {
"description": "Number of accelerators used",
"type": "integer",
"exclusiveMinimum": 0
},
"exclusive": {
"description": "Specifies how nodes are shared. 0 - Shared among multiple jobs of multiple users, 1 - Job exclusive, 2 - Shared among multiple jobs of same user",
"type": "integer",
"minimum": 0,
"maximum": 2
},
"monitoringStatus": {
"description": "State of monitoring system during job run",
"type": "integer"
},
"smt": {
"description": "SMT threads used by job",
"type": "integer"
},
"walltime": {
"description": "Requested walltime of job in seconds",
"type": "integer",
"exclusiveMinimum": 0
},
"jobState": {
"description": "Final state of job",
"type": "string",
"enum": [
"completed",
"failed",
"cancelled",
"stopped",
"out_of_memory",
"timeout"
]
},
"startTime": {
"description": "Start epoch time stamp in seconds",
"type": "integer",
"exclusiveMinimum": 0
},
"duration": {
"description": "Duration of job in seconds",
"type": "integer",
"exclusiveMinimum": 0
},
"resources": {
"description": "Resources used by job",
"type": "array",
"items": {
"type": "object",
"properties": {
"hostname": {
"type": "string"
},
"project": {
"description": "The unique identifier of a project",
"type": "string"
},
"cluster": {
"description": "The unique identifier of a cluster",
"type": "string"
},
"subCluster": {
"description": "The unique identifier of a sub cluster",
"type": "string"
},
"partition": {
"description": "The Slurm partition to which the job was submitted",
"type": "string"
},
"arrayJobId": {
"description": "The unique identifier of an array job",
"type": "integer"
},
"numNodes": {
"description": "Number of nodes used",
"type": "integer",
"exclusiveMinimum": 0
},
"numHwthreads": {
"description": "Number of HWThreads used",
"type": "integer",
"exclusiveMinimum": 0
},
"numAcc": {
"description": "Number of accelerators used",
"type": "integer",
"exclusiveMinimum": 0
},
"exclusive": {
"description": "Specifies how nodes are shared. 0 - Shared among multiple jobs of multiple users, 1 - Job exclusive, 2 - Shared among multiple jobs of same user",
"type": "integer",
"minimum": 0,
"maximum": 2
},
"monitoringStatus": {
"description": "State of monitoring system during job run",
"type": "integer"
},
"smt": {
"description": "SMT threads used by job",
"type": "integer"
},
"walltime": {
"description": "Requested walltime of job in seconds",
"type": "integer",
"exclusiveMinimum": 0
},
"jobState": {
"description": "Final state of job",
},
"hwthreads": {
"type": "array",
"description": "List of OS processor ids",
"items": {
"type": "integer"
}
},
"accelerators": {
"type": "array",
"description": "List of of accelerator device ids",
"items": {
"type": "string"
}
},
"configuration": {
"type": "string",
"enum": [
"completed",
"failed",
"cancelled",
"stopped",
"out_of_memory",
"timeout"
]
"description": "The configuration options of the node"
}
},
"startTime": {
"description": "Start epoch time stamp in seconds",
"type": "integer",
"exclusiveMinimum": 0
"required": [
"hostname"
],
"minItems": 1
}
},
"metaData": {
"description": "Additional information about the job",
"type": "object",
"properties": {
"jobScript": {
"type": "string",
"description": "The batch script of the job"
},
"duration": {
"description": "Duration of job in seconds",
"type": "integer",
"exclusiveMinimum": 0
"jobName": {
"type": "string",
"description": "Slurm Job name"
},
"resources": {
"description": "Resources used by job",
"type": "array",
"items": {
"type": "object",
"properties": {
"hostname": {
"type": "string"
},
"hwthreads": {
"type": "array",
"description": "List of OS processor ids",
"items": {
"type": "integer"
}
},
"accelerators": {
"type": "array",
"description": "List of of accelerator device ids",
"items": {
"type": "string"
}
},
"configuration": {
"type": "string",
"description": "The configuration options of the node"
}
},
"required": [
"hostname"
],
"minItems": 1
}
"slurmInfo": {
"type": "string",
"description": "Additional slurm infos as show by scontrol show job"
}
}
},
"tags": {
"description": "List of tags",
"type": "array",
"items": {
"type": "object",
"properties": {
"name": {
"type": "string"
},
"type": {
"type": "string"
}
},
"metaData": {
"description": "Additional information about the job",
"required": [
"name",
"type"
]
},
"uniqueItems": true
},
"statistics": {
"description": "Job statistic data",
"type": "object",
"properties": {
"mem_used": {
"description": "Memory capacity used (required)",
"$ref": "embedfs://job-metric-statistics.schema.json"
},
"cpu_load": {
"description": "CPU requested core utilization (load 1m) (required)",
"$ref": "embedfs://job-metric-statistics.schema.json"
},
"flops_any": {
"description": "Total flop rate with DP flops scaled up (required)",
"$ref": "embedfs://job-metric-statistics.schema.json"
},
"mem_bw": {
"description": "Main memory bandwidth (required)",
"$ref": "embedfs://job-metric-statistics.schema.json"
},
"net_bw": {
"description": "Total fast interconnect network bandwidth (required)",
"$ref": "embedfs://job-metric-statistics.schema.json"
},
"file_bw": {
"description": "Total file IO bandwidth (required)",
"$ref": "embedfs://job-metric-statistics.schema.json"
},
"ipc": {
"description": "Instructions executed per cycle",
"$ref": "embedfs://job-metric-statistics.schema.json"
},
"cpu_user": {
"description": "CPU user active core utilization",
"$ref": "embedfs://job-metric-statistics.schema.json"
},
"flops_dp": {
"description": "Double precision flop rate",
"$ref": "embedfs://job-metric-statistics.schema.json"
},
"flops_sp": {
"description": "Single precision flops rate",
"$ref": "embedfs://job-metric-statistics.schema.json"
},
"rapl_power": {
"description": "CPU power consumption",
"$ref": "embedfs://job-metric-statistics.schema.json"
},
"acc_used": {
"description": "GPU utilization",
"$ref": "embedfs://job-metric-statistics.schema.json"
},
"acc_mem_used": {
"description": "GPU memory capacity used",
"$ref": "embedfs://job-metric-statistics.schema.json"
},
"acc_power": {
"description": "GPU power consumption",
"$ref": "embedfs://job-metric-statistics.schema.json"
},
"clock": {
"description": "Average core frequency",
"$ref": "embedfs://job-metric-statistics.schema.json"
},
"eth_read_bw": {
"description": "Ethernet read bandwidth",
"$ref": "embedfs://job-metric-statistics.schema.json"
},
"eth_write_bw": {
"description": "Ethernet write bandwidth",
"$ref": "embedfs://job-metric-statistics.schema.json"
},
"ic_rcv_packets": {
"description": "Network interconnect read packets",
"$ref": "embedfs://job-metric-statistics.schema.json"
},
"ic_send_packets": {
"description": "Network interconnect send packet",
"$ref": "embedfs://job-metric-statistics.schema.json"
},
"ic_read_bw": {
"description": "Network interconnect read bandwidth",
"$ref": "embedfs://job-metric-statistics.schema.json"
},
"ic_write_bw": {
"description": "Network interconnect write bandwidth",
"$ref": "embedfs://job-metric-statistics.schema.json"
},
"filesystems": {
"description": "Array of filesystems",
"type": "array",
"items": {
"type": "object",
"properties": {
"jobScript": {
"type": "string",
"description": "The batch script of the job"
},
"jobName": {
"type": "string",
"description": "Slurm Job name"
},
"slurmInfo": {
"type": "string",
"description": "Additional slurm infos as show by scontrol show job"
}
}
},
"tags": {
"description": "List of tags",
"type": "array",
"items": {
"type": "object",
"properties": {
"name": {
"type": "string"
},
"type": {
"type": "string"
}
},
"required": [
"name",
"type"
"name": {
"type": "string"
},
"type": {
"type": "string",
"enum": [
"nfs",
"lustre",
"gpfs",
"nvme",
"ssd",
"hdd",
"beegfs"
]
},
"uniqueItems": true
},
"statistics": {
"description": "Job statistic data",
"type": "object",
"properties": {
"mem_used": {
"description": "Memory capacity used (required)",
"$ref": "embedfs://job-metric-statistics.schema.json"
},
"cpu_load": {
"description": "CPU requested core utilization (load 1m) (required)",
"$ref": "embedfs://job-metric-statistics.schema.json"
},
"flops_any": {
"description": "Total flop rate with DP flops scaled up (required)",
"$ref": "embedfs://job-metric-statistics.schema.json"
},
"mem_bw": {
"description": "Main memory bandwidth (required)",
"$ref": "embedfs://job-metric-statistics.schema.json"
},
"net_bw": {
"description": "Total fast interconnect network bandwidth (required)",
"$ref": "embedfs://job-metric-statistics.schema.json"
},
"file_bw": {
"description": "Total file IO bandwidth (required)",
"$ref": "embedfs://job-metric-statistics.schema.json"
},
"ipc": {
"description": "Instructions executed per cycle",
"$ref": "embedfs://job-metric-statistics.schema.json"
},
"cpu_user": {
"description": "CPU user active core utilization",
"$ref": "embedfs://job-metric-statistics.schema.json"
},
"flops_dp": {
"description": "Double precision flop rate",
"$ref": "embedfs://job-metric-statistics.schema.json"
},
"flops_sp": {
"description": "Single precision flops rate",
"$ref": "embedfs://job-metric-statistics.schema.json"
},
"rapl_power": {
"description": "CPU power consumption",
"$ref": "embedfs://job-metric-statistics.schema.json"
},
"acc_used": {
"description": "GPU utilization",
"$ref": "embedfs://job-metric-statistics.schema.json"
},
"acc_mem_used": {
"description": "GPU memory capacity used",
"$ref": "embedfs://job-metric-statistics.schema.json"
},
"acc_power": {
"description": "GPU power consumption",
"$ref": "embedfs://job-metric-statistics.schema.json"
},
"clock": {
"description": "Average core frequency",
"$ref": "embedfs://job-metric-statistics.schema.json"
},
"eth_read_bw": {
"description": "Ethernet read bandwidth",
"$ref": "embedfs://job-metric-statistics.schema.json"
},
"eth_write_bw": {
"description": "Ethernet write bandwidth",
"$ref": "embedfs://job-metric-statistics.schema.json"
},
"ic_rcv_packets": {
"description": "Network interconnect read packets",
"$ref": "embedfs://job-metric-statistics.schema.json"
},
"ic_send_packets": {
"description": "Network interconnect send packet",
"$ref": "embedfs://job-metric-statistics.schema.json"
},
"ic_read_bw": {
"description": "Network interconnect read bandwidth",
"$ref": "embedfs://job-metric-statistics.schema.json"
},
"ic_write_bw": {
"description": "Network interconnect write bandwidth",
"$ref": "embedfs://job-metric-statistics.schema.json"
},
"filesystems": {
"description": "Array of filesystems",
"type": "array",
"items": {
"type": "object",
"properties": {
"name": {
"type": "string"
},
"type": {
"type": "string",
"enum": [
"nfs",
"lustre",
"gpfs",
"nvme",
"ssd",
"hdd",
"beegfs"
]
},
"read_bw": {
"description": "File system read bandwidth",
"$ref": "embedfs://job-metric-statistics.schema.json"
},
"write_bw": {
"description": "File system write bandwidth",
"$ref": "embedfs://job-metric-statistics.schema.json"
},
"read_req": {
"description": "File system read requests",
"$ref": "embedfs://job-metric-statistics.schema.json"
},
"write_req": {
"description": "File system write requests",
"$ref": "embedfs://job-metric-statistics.schema.json"
},
"inodes": {
"description": "File system write requests",
"$ref": "embedfs://job-metric-statistics.schema.json"
},
"accesses": {
"description": "File system open and close",
"$ref": "embedfs://job-metric-statistics.schema.json"
},
"fsync": {
"description": "File system fsync",
"$ref": "embedfs://job-metric-statistics.schema.json"
},
"create": {
"description": "File system create",
"$ref": "embedfs://job-metric-statistics.schema.json"
},
"open": {
"description": "File system open",
"$ref": "embedfs://job-metric-statistics.schema.json"
},
"close": {
"description": "File system close",
"$ref": "embedfs://job-metric-statistics.schema.json"
},
"seek": {
"description": "File system seek",
"$ref": "embedfs://job-metric-statistics.schema.json"
}
},
"required": [
"name",
"type",
"read_bw",
"write_bw"
]
},
"minItems": 1
}
},
"read_bw": {
"description": "File system read bandwidth",
"$ref": "embedfs://job-metric-statistics.schema.json"
},
"write_bw": {
"description": "File system write bandwidth",
"$ref": "embedfs://job-metric-statistics.schema.json"
},
"read_req": {
"description": "File system read requests",
"$ref": "embedfs://job-metric-statistics.schema.json"
},
"write_req": {
"description": "File system write requests",
"$ref": "embedfs://job-metric-statistics.schema.json"
},
"inodes": {
"description": "File system write requests",
"$ref": "embedfs://job-metric-statistics.schema.json"
},
"accesses": {
"description": "File system open and close",
"$ref": "embedfs://job-metric-statistics.schema.json"
},
"fsync": {
"description": "File system fsync",
"$ref": "embedfs://job-metric-statistics.schema.json"
},
"create": {
"description": "File system create",
"$ref": "embedfs://job-metric-statistics.schema.json"
},
"open": {
"description": "File system open",
"$ref": "embedfs://job-metric-statistics.schema.json"
},
"close": {
"description": "File system close",
"$ref": "embedfs://job-metric-statistics.schema.json"
},
"seek": {
"description": "File system seek",
"$ref": "embedfs://job-metric-statistics.schema.json"
}
},
"required": [
"cpu_user",
"cpu_load",
"mem_used",
"flops_any",
"mem_bw"
"name",
"type",
"read_bw",
"write_bw"
]
},
"minItems": 1
}
},
"required": [
"jobId",
"user",
"project",
"cluster",
"subCluster",
"numNodes",
"exclusive",
"startTime",
"jobState",
"duration",
"resources",
"statistics"
]
},
"required": [
"cpu_user",
"cpu_load",
"mem_used",
"flops_any",
"mem_bw"
]
}
},
"required": [
"jobId",
"user",
"project",
"cluster",
"subCluster",
"numNodes",
"exclusive",
"startTime",
"jobState",
"duration",
"resources",
"statistics"
]
}
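For orientation, the sketch below builds a minimal job document that satisfies the required fields above and runs it through the embedded validator used elsewhere in this codebase (schema.Validate with the schema.Meta kind). All field values are invented for illustration; they do not come from a real job.

// Illustrative only: a minimal job-meta document with all required fields.
// The field values are invented assumptions for this sketch.
package main

import (
	"bytes"
	"fmt"

	"github.com/ClusterCockpit/cc-backend/pkg/schema"
)

func main() {
	jobMeta := []byte(`{
		"jobId": 123456,
		"user": "alice",
		"project": "demo",
		"cluster": "testcluster",
		"subCluster": "main",
		"numNodes": 1,
		"exclusive": 1,
		"startTime": 1700000000,
		"jobState": "completed",
		"duration": 3600,
		"resources": [{ "hostname": "node001" }],
		"statistics": {
			"cpu_user": { "unit": { "base": "" }, "avg": 0.9, "min": 0.1, "max": 1.0 },
			"cpu_load": { "unit": { "base": "" }, "avg": 60.0, "min": 10.0, "max": 72.0 },
			"mem_used": { "unit": { "base": "B", "prefix": "G" }, "avg": 42.0, "min": 10.0, "max": 64.0 },
			"flops_any": { "unit": { "base": "F/s", "prefix": "G" }, "avg": 100.0, "min": 0.0, "max": 500.0 },
			"mem_bw": { "unit": { "base": "B/s", "prefix": "G" }, "avg": 80.0, "min": 5.0, "max": 150.0 }
		}
	}`)

	// schema.Meta selects the embedded job-meta schema shown above.
	if err := schema.Validate(schema.Meta, bytes.NewReader(jobMeta)); err != nil {
		fmt.Printf("validation failed: %v\n", err)
		return
	}
	fmt.Println("job-meta document is valid")
}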

View File

@@ -1,216 +1,216 @@
{
  "$schema": "http://json-schema.org/draft/2020-12/schema",
  "$id": "embedfs://job-metric-data.schema.json",
  "title": "Job metric data",
  "description": "Metric data of a HPC job",
  "type": "object",
  "properties": {
    "unit": {
      "description": "Metric unit",
      "$ref": "embedfs://unit.schema.json"
    },
    "timestep": {
      "description": "Measurement interval in seconds",
      "type": "integer"
    },
    "thresholds": {
      "description": "Metric thresholds for specific system",
      "type": "object",
      "properties": {
        "peak": { "type": "number" },
        "normal": { "type": "number" },
        "caution": { "type": "number" },
        "alert": { "type": "number" }
      }
    },
    "statisticsSeries": {
      "type": "object",
      "description": "Statistics series across topology",
      "properties": {
        "min": { "type": "array", "items": { "type": "number", "minimum": 0 }, "minItems": 3 },
        "max": { "type": "array", "items": { "type": "number", "minimum": 0 }, "minItems": 3 },
        "mean": { "type": "array", "items": { "type": "number", "minimum": 0 }, "minItems": 3 },
        "percentiles": {
          "type": "object",
          "properties": {
            "10": { "type": "array", "items": { "type": "number", "minimum": 0 }, "minItems": 3 },
            "20": { "type": "array", "items": { "type": "number", "minimum": 0 }, "minItems": 3 },
            "30": { "type": "array", "items": { "type": "number", "minimum": 0 }, "minItems": 3 },
            "40": { "type": "array", "items": { "type": "number", "minimum": 0 }, "minItems": 3 },
            "50": { "type": "array", "items": { "type": "number", "minimum": 0 }, "minItems": 3 },
            "60": { "type": "array", "items": { "type": "number", "minimum": 0 }, "minItems": 3 },
            "70": { "type": "array", "items": { "type": "number", "minimum": 0 }, "minItems": 3 },
            "80": { "type": "array", "items": { "type": "number", "minimum": 0 }, "minItems": 3 },
            "90": { "type": "array", "items": { "type": "number", "minimum": 0 }, "minItems": 3 },
            "25": { "type": "array", "items": { "type": "number", "minimum": 0 }, "minItems": 3 },
            "75": { "type": "array", "items": { "type": "number", "minimum": 0 }, "minItems": 3 }
          }
        }
      }
    },
    "series": {
      "type": "array",
      "items": {
        "type": "object",
        "properties": {
          "hostname": {
            "type": "string"
          },
          "id": {
            "type": "string"
          },
          "statistics": {
            "type": "object",
            "description": "Statistics across time dimension",
            "properties": {
              "avg": {
                "description": "Series average",
                "type": "number",
                "minimum": 0
              },
              "min": {
                "description": "Series minimum",
                "type": "number",
                "minimum": 0
              },
              "max": {
                "description": "Series maximum",
                "type": "number",
                "minimum": 0
              }
            },
            "required": [
              "avg",
              "min",
              "max"
            ]
          },
          "data": {
            "type": "array",
            "contains": {
              "type": "number",
              "minimum": 0
            },
            "minItems": 1
          }
        },
        "required": [
          "hostname",
          "statistics",
          "data"
        ]
      }
    }
  },
  "required": [
    "unit",
    "timestep",
    "series"
  ]
}
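As a concrete shape, the sketch below decodes a minimal metric-data document (with the three required keys: unit, timestep, series) into local example types. The hostname and all values are invented; the types are local to this sketch, not the project's own structs.

// Illustrative only: the shape of one metric entry as defined by the
// schema above, decoded with encoding/json.
package main

import (
	"encoding/json"
	"fmt"
)

type unit struct {
	Base   string `json:"base"`
	Prefix string `json:"prefix,omitempty"`
}

type seriesEntry struct {
	Hostname   string             `json:"hostname"`
	Statistics map[string]float64 `json:"statistics"`
	Data       []float64          `json:"data"`
}

type jobMetricData struct {
	Unit     unit          `json:"unit"`
	Timestep int           `json:"timestep"`
	Series   []seriesEntry `json:"series"`
}

func main() {
	raw := []byte(`{
		"unit": { "base": "B/s", "prefix": "G" },
		"timestep": 60,
		"series": [{
			"hostname": "node001",
			"statistics": { "avg": 42.0, "min": 10.0, "max": 80.0 },
			"data": [10.0, 42.0, 80.0]
		}]
	}`)

	var md jobMetricData
	if err := json.Unmarshal(raw, &md); err != nil {
		fmt.Println("decode failed:", err)
		return
	}
	fmt.Printf("%s: %d points at %ds resolution\n",
		md.Series[0].Hostname, len(md.Series[0].Data), md.Timestep)
}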

View File

@@ -1,34 +1,34 @@
{
  "$schema": "http://json-schema.org/draft/2020-12/schema",
  "$id": "embedfs://job-metric-statistics.schema.json",
  "title": "Job statistics",
  "description": "Format specification for job metric statistics",
  "type": "object",
  "properties": {
    "unit": {
      "description": "Metric unit",
      "$ref": "embedfs://unit.schema.json"
    },
    "avg": {
      "description": "Job metric average",
      "type": "number",
      "minimum": 0
    },
    "min": {
      "description": "Job metric minimum",
      "type": "number",
      "minimum": 0
    },
    "max": {
      "description": "Job metric maximum",
      "type": "number",
      "minimum": 0
    }
  },
  "required": [
    "unit",
    "avg",
    "min",
    "max"
  ]
}
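A single statistics object then looks like the invented example below. Note that the schema only enforces presence and non-negativity of avg, min, and max; it does not enforce min <= avg <= max.

// Illustrative only: one job-metric-statistics document with invented values.
package main

import (
	"encoding/json"
	"fmt"
)

func main() {
	raw := []byte(`{
		"unit": { "base": "B", "prefix": "G" },
		"avg": 42.0,
		"min": 10.0,
		"max": 64.0
	}`)

	var s struct {
		Avg, Min, Max float64
	}
	if err := json.Unmarshal(raw, &s); err != nil {
		fmt.Println("decode failed:", err)
		return
	}
	fmt.Printf("avg=%.1f (min=%.1f, max=%.1f)\n", s.Avg, s.Min, s.Max)
}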

View File

@@ -1,40 +1,41 @@
{
  "$schema": "http://json-schema.org/draft/2020-12/schema",
  "$id": "embedfs://unit.schema.json",
  "title": "Metric unit",
  "description": "Format specification for job metric units",
  "type": "object",
  "properties": {
    "base": {
      "description": "Metric base unit",
      "type": "string",
      "enum": [
        "B",
        "F",
        "B/s",
        "F/s",
        "CPI",
        "IPC",
        "Hz",
        "W",
        "J",
        "°C",
        ""
      ]
    },
    "prefix": {
      "description": "Unit prefix",
      "type": "string",
      "enum": [
        "K",
        "M",
        "G",
        "T",
        "P",
        "E"
      ]
    }
  },
  "required": [
    "base"
  ]
}
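Units are stored as a required base plus an optional metric prefix, so a display unit like GB/s is kept in two parts. A small invented sketch of that split:

// Illustrative only: composing a display unit from base + prefix.
package main

import (
	"encoding/json"
	"fmt"
)

type Unit struct {
	Base   string `json:"base"`
	Prefix string `json:"prefix,omitempty"`
}

func main() {
	u := Unit{Base: "B/s", Prefix: "G"}
	b, _ := json.Marshal(u)
	// Prints: {"base":"B/s","prefix":"G"} => GB/s
	fmt.Println(string(b), "=>", u.Prefix+u.Base)
}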

View File

@@ -1,4 +1,4 @@
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
@@ -27,6 +27,7 @@ const (
AuthViaLocalPassword AuthSource = iota
AuthViaLDAP
AuthViaToken
AuthViaOIDC
AuthViaAll
)
@@ -41,11 +42,11 @@ type User struct {
Username string `json:"username"`
Password string `json:"-"`
Name string `json:"name"`
Roles []string `json:"roles"`
AuthType AuthType `json:"authType"`
AuthSource AuthSource `json:"authSource"`
Email string `json:"email"`
Projects []string `json:"projects"`
}
func (u *User) HasProject(project string) bool {
@@ -84,6 +85,7 @@ func IsValidRole(role string) bool {
return getRoleEnum(role) != RoleError
}
// Check if User has SPECIFIED role AND role is VALID
func (u *User) HasValidRole(role string) (hasRole bool, isValid bool) {
if IsValidRole(role) {
for _, r := range u.Roles {
@@ -96,6 +98,7 @@ func (u *User) HasValidRole(role string) (hasRole bool, isValid bool) {
return false, false
}
// Check if User has SPECIFIED role
func (u *User) HasRole(role Role) bool {
for _, r := range u.Roles {
if r == GetRoleString(role) {
@@ -105,7 +108,7 @@ func (u *User) HasRole(role Role) bool {
return false
}
// Check if User has ANY of the listed roles
func (u *User) HasAnyRole(queryroles []Role) bool {
for _, ur := range u.Roles {
for _, qr := range queryroles {
@@ -117,7 +120,7 @@ func (u *User) HasAnyRole(queryroles []Role) bool {
return false
}
// Check if User has ALL of the listed roles
func (u *User) HasAllRoles(queryroles []Role) bool {
target := len(queryroles)
matches := 0
@@ -137,7 +140,7 @@ func (u *User) HasAllRoles(queryroles []Role) bool {
}
}
// Check if User has NONE of the listed roles
func (u *User) HasNotRoles(queryroles []Role) bool {
matches := 0
for _, ur := range u.Roles {

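A short usage sketch for the role helpers above. It assumes the package's exported User type and role constants (e.g. schema.RoleAdmin, schema.RoleSupport, schema.RoleApi) keep their current names; treat those identifiers as assumptions of this example.

// Hypothetical usage of the role helpers; role constants assumed as named.
package main

import (
	"fmt"

	"github.com/ClusterCockpit/cc-backend/pkg/schema"
)

func main() {
	u := &schema.User{
		Username: "alice",
		Roles:    []string{"admin", "user"},
	}

	// HasAnyRole: true if at least one of the queried roles is present.
	fmt.Println(u.HasAnyRole([]schema.Role{schema.RoleAdmin, schema.RoleSupport})) // true

	// HasAllRoles: true only if every queried role is present.
	fmt.Println(u.HasAllRoles([]schema.Role{schema.RoleAdmin, schema.RoleApi})) // false
}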
View File

@@ -1,4 +1,4 @@
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.

View File

@@ -1,4 +1,4 @@
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.

View File

@@ -1,4 +1,4 @@
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
@@ -14,17 +14,20 @@ func TestValidateConfig(t *testing.T) {
"jwts": {
"max-age": "2m"
},
"clusters": [
{
"name": "testcluster",
"metricDataRepository": {
"kind": "cc-metric-store",
"url": "localhost:8082"},
"filterRanges": {
"numNodes": { "from": 1, "to": 64 },
"duration": { "from": 0, "to": 86400 },
"startTime": { "from": "2022-01-01T00:00:00Z", "to": null }
}}]
"apiAllowedIPs": [
"*"
],
"clusters": [
{
"name": "testcluster",
"metricDataRepository": {
"kind": "cc-metric-store",
"url": "localhost:8082"},
"filterRanges": {
"numNodes": { "from": 1, "to": 64 },
"duration": { "from": 0, "to": 86400 },
"startTime": { "from": "2022-01-01T00:00:00Z", "to": null }
}}]
}`)
if err := Validate(Config, bytes.NewReader(json)); err != nil {
@@ -33,7 +36,6 @@ func TestValidateConfig(t *testing.T) {
}
func TestValidateJobMeta(t *testing.T) {
}
func TestValidateCluster(t *testing.T) {