Merge pull request #104 from ClusterCockpit/import-data-sanitation

Import data sanitation
Fixes among other things  MetricConfig for GPU SubCluster #99 Mismatch of type of "id" in job-metric-data "series" object schema #101
This commit is contained in:
Jan Eitzinger 2023-04-12 09:15:27 +02:00 committed by GitHub
commit 7272db4fb0
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
75 changed files with 5240 additions and 1395 deletions

4
.gitignore vendored
View File

@ -9,4 +9,6 @@
/web/frontend/public/build
/web/frontend/node_modules
.vscode/settings.json
/.vscode/*
/archive-migration
/archive-manager

View File

@ -47,12 +47,17 @@ type SubCluster {
socketsPerNode: Int!
coresPerSocket: Int!
threadsPerCore: Int!
flopRateScalar: Int!
flopRateSimd: Int!
memoryBandwidth: Int!
flopRateScalar: MetricValue!
flopRateSimd: MetricValue!
memoryBandwidth: MetricValue!
topology: Topology!
}
type MetricValue {
unit: Unit!
value: Float!
}
type Topology {
node: [Int!]
socket: [[Int!]!]
@ -70,23 +75,24 @@ type Accelerator {
type SubClusterConfig {
name: String!
peak: Float!
normal: Float!
caution: Float!
alert: Float!
}
type MetricConfig {
name: String!
unit: String!
scope: MetricScope!
aggregation: String
timestep: Int!
peak: Float
normal: Float
caution: Float
alert: Float
subClusters: [SubClusterConfig]
remove: Boolean
}
type MetricConfig {
name: String!
unit: Unit!
scope: MetricScope!
aggregation: String!
timestep: Int!
peak: Float!
normal: Float
caution: Float!
alert: Float!
subClusters: [SubClusterConfig!]!
}
type Tag {
@ -104,12 +110,12 @@ type Resource {
type JobMetricWithName {
name: String!
scope: MetricScope!
metric: JobMetric!
}
type JobMetric {
unit: String!
scope: MetricScope!
unit: Unit
timestep: Int!
series: [Series!]
statisticsSeries: StatsSeries
@ -117,11 +123,16 @@ type JobMetric {
type Series {
hostname: String!
id: Int
id: String
statistics: MetricStatistics
data: [NullableFloat!]!
}
type Unit {
base: String!
prefix: String
}
type MetricStatistics {
avg: Float!
min: Float!

View File

@ -15,6 +15,7 @@
"kind": "file",
"path": "./var/job-archive"
},
"validate": true,
"clusters": [
{
"name": "test",
@ -24,9 +25,18 @@
"token": "eyJhbGciOiJF-E-pQBQ"
},
"filterRanges": {
"numNodes": { "from": 1, "to": 64 },
"duration": { "from": 0, "to": 86400 },
"startTime": { "from": "2022-01-01T00:00:00Z", "to": null }
"numNodes": {
"from": 1,
"to": 64
},
"duration": {
"from": 0,
"to": 86400
},
"startTime": {
"from": "2022-01-01T00:00:00Z",
"to": null
}
}
}
],

View File

@ -1,10 +1,12 @@
#!/usr/bin/env perl
use strict;
use warnings;
use utf8;
my %INFO;
my %DOMAINS;
my $SMT;
my $numMemoryDomains;
$DOMAINS{socket} = [];
@ -198,8 +200,11 @@ END
$INFO{gpus} .= join(",\n",@gpuStr);
$INFO{gpus} .= "]\n";
} else {
$INFO{gpus} = '';
}
print <<"END";
{
"name": "<FILL IN>",
@ -219,10 +224,10 @@ print <<"END";
"memoryDomain": [
$INFO{memoryDomains}
],
$INFO{gpus}
"core": [
$INFO{cores}
]
$INFO{gpus}
}
}
END

View File

@ -0,0 +1,37 @@
# Release versioning
Releases are numbered with a integer id starting with 1.
Every release embeds the following assets into the binary:
* Web-frontend including javascript files and all static assets
* Golang template files for server-side rendering
* JSON schema files for validation
Remaining external assets are:
* The SQL database used
* The job archive
Both external assets are also versioned using integer ids.
This means every release binary is tied to specific versions for the SQL
database and job archive.
A command line switch `--migrate-db` is provided to migrate the SQL database
from a previous to the most recent version.
We provide a separate tool `archive-migration` to migrate an existing job
archive from the previous to the most recent version.
# Versioning of APIs
cc-backend provides two API backends:
* A REST API for querying jobs
* A GraphQL API used for data exchange between web frontend and cc-backend
Both APIs will also be versioned. We still need to decide if we also support
older REST API version using versioning of the endpoint URLs.
# How to build a specific release
# How to migrate the SQL database
# How to migrate the job archive

View File

@ -13,9 +13,18 @@
"token": "eyJ0eXAiOiJKV1QiLCJhbGciOiJFZERTQSJ9.eyJ1c2VyIjoiYWRtaW4iLCJyb2xlcyI6WyJST0xFX0FETUlOIiwiUk9MRV9BTkFMWVNUIiwiUk9MRV9VU0VSIl19.d-3_3FZTsadPjDEdsWrrQ7nS0edMAR4zjl-eK7rJU3HziNBfI9PDHDIpJVHTNN5E5SlLGLFXctWyKAkwhXL-Dw"
},
"filterRanges": {
"numNodes": { "from": 1, "to": 32 },
"duration": { "from": 0, "to": 172800 },
"startTime": { "from": "2010-01-01T00:00:00Z", "to": null }
"numNodes": {
"from": 1,
"to": 32
},
"duration": {
"from": 0,
"to": 172800
},
"startTime": {
"from": "2010-01-01T00:00:00Z",
"to": null
}
}
},
{
@ -26,9 +35,18 @@
"token": "eyJ0eXAiOiJKV1QiLCJhbGciOiJFZERTQSJ9.eyJ1c2VyIjoiYWRtaW4iLCJyb2xlcyI6WyJST0xFX0FETUlOIiwiUk9MRV9BTkFMWVNUIiwiUk9MRV9VU0VSIl19.d-3_3FZTsadPjDEdsWrrQ7nS0edMAR4zjl-eK7rJU3HziNBfI9PDHDIpJVHTNN5E5SlLGLFXctWyKAkwhXL-Dw"
},
"filterRanges": {
"numNodes": { "from": 1, "to": 1 },
"duration": { "from": 0, "to": 172800 },
"startTime": { "from": "2015-01-01T00:00:00Z", "to": null }
"numNodes": {
"from": 1,
"to": 1
},
"duration": {
"from": 0,
"to": 172800
},
"startTime": {
"from": "2015-01-01T00:00:00Z",
"to": null
}
}
}
]

View File

@ -63,6 +63,7 @@ models:
resolver: true
NullableFloat: { model: "github.com/ClusterCockpit/cc-backend/pkg/schema.Float" }
MetricScope: { model: "github.com/ClusterCockpit/cc-backend/pkg/schema.MetricScope" }
MetricValue: { model: "github.com/ClusterCockpit/cc-backend/pkg/schema.MetricValue" }
JobStatistics: { model: "github.com/ClusterCockpit/cc-backend/pkg/schema.JobStatistics" }
Tag: { model: "github.com/ClusterCockpit/cc-backend/pkg/schema.Tag" }
Resource: { model: "github.com/ClusterCockpit/cc-backend/pkg/schema.Resource" }
@ -79,3 +80,4 @@ models:
FilterRanges: { model: "github.com/ClusterCockpit/cc-backend/pkg/schema.FilterRanges" }
SubCluster: { model: "github.com/ClusterCockpit/cc-backend/pkg/schema.SubCluster" }
StatsSeries: { model: "github.com/ClusterCockpit/cc-backend/pkg/schema.StatsSeries" }
Unit: { model: "github.com/ClusterCockpit/cc-backend/pkg/schema.Unit" }

File diff suppressed because it is too large Load Diff

View File

@ -60,6 +60,7 @@ type JobFilter struct {
type JobMetricWithName struct {
Name string `json:"name"`
Scope schema.MetricScope `json:"scope"`
Metric *schema.JobMetric `json:"metric"`
}

View File

@ -194,12 +194,9 @@ func (r *queryResolver) JobMetrics(ctx context.Context, id string, metrics []str
res := []*model.JobMetricWithName{}
for name, md := range data {
for scope, metric := range md {
if metric.Scope != schema.MetricScope(scope) {
log.Panic("metric.Scope != schema.MetricScope(scope) : Should not happen!")
}
res = append(res, &model.JobMetricWithName{
Name: name,
Scope: scope,
Metric: metric,
})
}
@ -296,6 +293,7 @@ func (r *queryResolver) NodeMetrics(ctx context.Context, cluster string, nodes [
for _, scopedMetric := range scopedMetrics {
host.Metrics = append(host.Metrics, &model.JobMetricWithName{
Name: metric,
Scope: schema.MetricScopeNode,
Metric: scopedMetric,
})
}
@ -307,6 +305,15 @@ func (r *queryResolver) NodeMetrics(ctx context.Context, cluster string, nodes [
return nodeMetrics, nil
}
// NumberOfNodes is the resolver for the numberOfNodes field.
func (r *subClusterResolver) NumberOfNodes(ctx context.Context, obj *schema.SubCluster) (int, error) {
nodeList, err := archive.ParseNodeList(obj.Nodes)
if err != nil {
return 0, err
}
return nodeList.NodeCount(), nil
}
// Cluster returns generated.ClusterResolver implementation.
func (r *Resolver) Cluster() generated.ClusterResolver { return &clusterResolver{r} }
@ -319,7 +326,11 @@ func (r *Resolver) Mutation() generated.MutationResolver { return &mutationResol
// Query returns generated.QueryResolver implementation.
func (r *Resolver) Query() generated.QueryResolver { return &queryResolver{r} }
// SubCluster returns generated.SubClusterResolver implementation.
func (r *Resolver) SubCluster() generated.SubClusterResolver { return &subClusterResolver{r} }
type clusterResolver struct{ *Resolver }
type jobResolver struct{ *Resolver }
type mutationResolver struct{ *Resolver }
type queryResolver struct{ *Resolver }
type subClusterResolver struct{ *Resolver }

View File

@ -164,7 +164,6 @@ func (ccms *CCMetricStore) LoadData(
scopes []schema.MetricScope,
ctx context.Context) (schema.JobData, error) {
topology := archive.GetSubCluster(job.Cluster, job.SubCluster).Topology
queries, assignedScope, err := ccms.buildQueries(job, metrics, scopes)
if err != nil {
log.Warn("Error while building queries")
@ -201,7 +200,6 @@ func (ccms *CCMetricStore) LoadData(
if !ok {
jobMetric = &schema.JobMetric{
Unit: mc.Unit,
Scope: scope,
Timestep: mc.Timestep,
Series: make([]schema.Series, 0),
}
@ -215,13 +213,10 @@ func (ccms *CCMetricStore) LoadData(
continue
}
id := (*int)(nil)
id := (*string)(nil)
if query.Type != nil {
id = new(int)
*id, err = strconv.Atoi(query.TypeIds[0])
if err != nil || *query.Type == acceleratorString {
*id, _ = topology.GetAcceleratorIndex(query.TypeIds[0])
}
id = new(string)
*id = query.TypeIds[0]
}
if res.Avg.IsNaN() || res.Min.IsNaN() || res.Max.IsNaN() {
@ -235,7 +230,7 @@ func (ccms *CCMetricStore) LoadData(
jobMetric.Series = append(jobMetric.Series, schema.Series{
Hostname: query.Hostname,
Id: id,
Statistics: &schema.MetricStatistics{
Statistics: schema.MetricStatistics{
Avg: float64(res.Avg),
Min: float64(res.Min),
Max: float64(res.Max),
@ -275,9 +270,14 @@ func (ccms *CCMetricStore) buildQueries(
scopes []schema.MetricScope) ([]ApiQuery, []schema.MetricScope, error) {
queries := make([]ApiQuery, 0, len(metrics)*len(scopes)*len(job.Resources))
topology := archive.GetSubCluster(job.Cluster, job.SubCluster).Topology
assignedScope := []schema.MetricScope{}
subcluster, scerr := archive.GetSubCluster(job.Cluster, job.SubCluster)
if scerr != nil {
return nil, nil, scerr
}
topology := subcluster.Topology
for _, metric := range metrics {
remoteName := ccms.toRemoteName(metric)
mc := archive.GetMetricConfig(job.Cluster, metric)
@ -293,7 +293,7 @@ func (ccms *CCMetricStore) buildQueries(
scopesLoop:
for _, requestedScope := range scopes {
nativeScope := mc.Scope
if nativeScope == schema.MetricScopeAccelerator && job.NumAcc == 0 {
if nativeScope == schema.MetricScopeAccelerator && job.NumAcc == nil {
continue
}
@ -624,13 +624,12 @@ func (ccms *CCMetricStore) LoadNodeData(
mc := archive.GetMetricConfig(cluster, metric)
hostdata[metric] = append(hostdata[metric], &schema.JobMetric{
Unit: mc.Unit,
Scope: schema.MetricScopeNode,
Timestep: mc.Timestep,
Series: []schema.Series{
{
Hostname: query.Hostname,
Data: qdata.Data,
Statistics: &schema.MetricStatistics{
Statistics: schema.MetricStatistics{
Avg: float64(qdata.Avg),
Min: float64(qdata.Min),
Max: float64(qdata.Max),

View File

@ -134,7 +134,6 @@ func (idb *InfluxDBv2DataRepository) LoadData(
jobMetric = map[schema.MetricScope]*schema.JobMetric{
scope: { // uses scope var from above!
Unit: mc.Unit,
Scope: scope,
Timestep: mc.Timestep,
Series: make([]schema.Series, 0, len(job.Resources)),
StatisticsSeries: nil, // Should be: &schema.StatsSeries{},
@ -159,7 +158,7 @@ func (idb *InfluxDBv2DataRepository) LoadData(
field, host = row.Measurement(), row.ValueByKey("hostname").(string)
hostSeries = schema.Series{
Hostname: host,
Statistics: nil,
Statistics: schema.MetricStatistics{}, //TODO Add Statistics
Data: make([]schema.Float, 0),
}
}
@ -212,15 +211,10 @@ func (idb *InfluxDBv2DataRepository) LoadData(
for _, scope := range scopes {
if scope == "node" { // No 'socket/core' support yet
for metric, nodes := range stats {
// log.Debugf("<< Add Stats for : Field %s >>", metric)
for node, stats := range nodes {
// log.Debugf("<< Add Stats for : Host %s : Min %.2f, Max %.2f, Avg %.2f >>", node, stats.Min, stats.Max, stats.Avg )
for index, _ := range jobData[metric][scope].Series {
// log.Debugf("<< Try to add Stats to Series in Position %d >>", index)
if jobData[metric][scope].Series[index].Hostname == node {
// log.Debugf("<< Match for Series in Position %d : Host %s >>", index, jobData[metric][scope].Series[index].Hostname)
jobData[metric][scope].Series[index].Statistics = &schema.MetricStatistics{Avg: stats.Avg, Min: stats.Min, Max: stats.Max}
// log.Debugf("<< Result Inner: Min %.2f, Max %.2f, Avg %.2f >>", jobData[metric][scope].Series[index].Statistics.Min, jobData[metric][scope].Series[index].Statistics.Max, jobData[metric][scope].Series[index].Statistics.Avg)
jobData[metric][scope].Series[index].Statistics = schema.MetricStatistics{Avg: stats.Avg, Min: stats.Min, Max: stats.Max}
}
}
}
@ -228,17 +222,6 @@ func (idb *InfluxDBv2DataRepository) LoadData(
}
}
// DEBUG:
// for _, scope := range scopes {
// for _, met := range metrics {
// for _, series := range jobData[met][scope].Series {
// log.Debugf("<< Result: %d data points for metric %s on %s with scope %s, Stats: Min %.2f, Max %.2f, Avg %.2f >>",
// len(series.Data), met, series.Hostname, scope,
// series.Statistics.Min, series.Statistics.Max, series.Statistics.Avg)
// }
// }
// }
return jobData, nil
}

View File

@ -335,7 +335,10 @@ func ArchiveJob(job *schema.Job, ctx context.Context) (*schema.JobMeta, error) {
}
jobMeta.Statistics[metric] = schema.JobStatistics{
Unit: archive.GetMetricConfig(job.Cluster, metric).Unit,
Unit: schema.Unit{
Prefix: archive.GetMetricConfig(job.Cluster, metric).Unit.Prefix,
Base: archive.GetMetricConfig(job.Cluster, metric).Unit.Base,
},
Avg: avg / float64(job.NumNodes),
Min: min,
Max: max,

View File

@ -251,7 +251,7 @@ func (pdb *PrometheusDataRepository) RowToSeries(
return schema.Series{
Hostname: hostname,
Data: values,
Statistics: &schema.MetricStatistics{
Statistics: schema.MetricStatistics{
Avg: mean,
Min: min,
Max: max,
@ -323,7 +323,6 @@ func (pdb *PrometheusDataRepository) LoadData(
if !ok {
jobMetric = &schema.JobMetric{
Unit: metricConfig.Unit,
Scope: scope,
Timestep: metricConfig.Timestep,
Series: make([]schema.Series, 0),
}
@ -362,7 +361,7 @@ func (pdb *PrometheusDataRepository) LoadStats(
for metric, metricData := range data {
stats[metric] = make(map[string]schema.MetricStatistics)
for _, series := range metricData[schema.MetricScopeNode].Series {
stats[metric][series.Hostname] = *series.Statistics
stats[metric][series.Hostname] = series.Statistics
}
}
@ -432,7 +431,6 @@ func (pdb *PrometheusDataRepository) LoadNodeData(
// output per host and metric
hostdata[metric] = append(hostdata[metric], &schema.JobMetric{
Unit: metricConfig.Unit,
Scope: scope,
Timestep: metricConfig.Timestep,
Series: []schema.Series{pdb.RowToSeries(from, step, steps, row)},
},

View File

@ -17,6 +17,7 @@ import (
"github.com/ClusterCockpit/cc-backend/pkg/archive"
"github.com/ClusterCockpit/cc-backend/pkg/log"
"github.com/ClusterCockpit/cc-backend/pkg/schema"
"github.com/ClusterCockpit/cc-backend/pkg/units"
)
const NamedJobInsert string = `INSERT INTO job (
@ -75,6 +76,7 @@ func HandleImportFlag(flag string) error {
return err
}
checkJobData(&jobData)
SanityChecks(&jobMeta.BaseJob)
jobMeta.MonitoringStatus = schema.MonitoringStatusArchivingSuccessful
if job, err := GetJobRepository().Find(&jobMeta.JobID, &jobMeta.Cluster, &jobMeta.StartTime); err != sql.ErrNoRows {
@ -173,7 +175,9 @@ func InitDB() error {
i := 0
errorOccured := 0
for jobMeta := range ar.Iter() {
for jobContainer := range ar.Iter(false) {
jobMeta := jobContainer.Meta
// // Bundle 100 inserts into one transaction for better performance:
if i%10 == 0 {
@ -297,7 +301,7 @@ func SanityChecks(job *schema.BaseJob) error {
if len(job.Resources) == 0 || len(job.User) == 0 {
return fmt.Errorf("'resources' and 'user' should not be empty")
}
if job.NumAcc < 0 || job.NumHWThreads < 0 || job.NumNodes < 1 {
if *job.NumAcc < 0 || *job.NumHWThreads < 0 || job.NumNodes < 1 {
return fmt.Errorf("'numNodes', 'numAcc' or 'numHWThreads' invalid")
}
if len(job.Resources) != int(job.NumNodes) {
@ -314,3 +318,34 @@ func loadJobStat(job *schema.JobMeta, metric string) float64 {
return 0.0
}
func checkJobData(d *schema.JobData) error {
for _, scopes := range *d {
var newUnit string
// Add node scope if missing
for _, metric := range scopes {
if strings.Contains(metric.Unit.Base, "B/s") ||
strings.Contains(metric.Unit.Base, "F/s") ||
strings.Contains(metric.Unit.Base, "B") {
// First get overall avg
sum := 0.0
for _, s := range metric.Series {
sum += s.Statistics.Avg
}
avg := sum / float64(len(metric.Series))
for _, s := range metric.Series {
fp := schema.ConvertFloatToFloat64(s.Data)
// Normalize values with new unit prefix
oldUnit := metric.Unit.Base
units.NormalizeSeries(fp, avg, oldUnit, &newUnit)
s.Data = schema.GetFloat64ToFloat(fp)
}
metric.Unit.Base = newUnit
}
}
}
return nil
}

View File

@ -335,7 +335,13 @@ func (r *JobRepository) DeleteJobById(id int64) error {
}
// TODO: Use node hours instead: SELECT job.user, sum(job.num_nodes * (CASE WHEN job.job_state = "running" THEN CAST(strftime('%s', 'now') AS INTEGER) - job.start_time ELSE job.duration END)) as x FROM job GROUP BY user ORDER BY x DESC;
func (r *JobRepository) CountGroupedJobs(ctx context.Context, aggreg model.Aggregate, filters []*model.JobFilter, weight *model.Weights, limit *int) (map[string]int, error) {
func (r *JobRepository) CountGroupedJobs(
ctx context.Context,
aggreg model.Aggregate,
filters []*model.JobFilter,
weight *model.Weights,
limit *int) (map[string]int, error) {
start := time.Now()
if !aggreg.IsValid() {
return nil, errors.New("invalid aggregate")

View File

@ -8,13 +8,15 @@ import (
"encoding/json"
"fmt"
"github.com/ClusterCockpit/cc-backend/pkg/log"
"github.com/ClusterCockpit/cc-backend/pkg/lrucache"
"github.com/ClusterCockpit/cc-backend/pkg/schema"
"github.com/ClusterCockpit/cc-backend/pkg/log"
)
const Version = 1
type ArchiveBackend interface {
Init(rawConfig json.RawMessage) error
Init(rawConfig json.RawMessage) (int, error)
LoadJobMeta(job *schema.Job) (*schema.JobMeta, error)
@ -28,7 +30,12 @@ type ArchiveBackend interface {
GetClusters() []string
Iter() <-chan *schema.JobMeta
Iter(loadMetricData bool) <-chan JobContainer
}
type JobContainer struct {
Meta *schema.JobMeta
Data *schema.JobData
}
var cache *lrucache.Cache = lrucache.New(128 * 1024 * 1024)
@ -54,10 +61,12 @@ func Init(rawConfig json.RawMessage, disableArchive bool) error {
return fmt.Errorf("ARCHIVE/ARCHIVE > unkown archive backend '%s''", kind.Kind)
}
if err := ar.Init(rawConfig); err != nil {
version, err := ar.Init(rawConfig)
if err != nil {
log.Error("Error while initializing archiveBackend")
return err
}
log.Infof("Load archive version %d", version)
return initClusterConfig()
}

View File

@ -55,7 +55,7 @@ func initClusterConfig() error {
nodeLists[cluster.Name] = make(map[string]NodeList)
for _, sc := range cluster.SubClusters {
if sc.Nodes == "" {
if sc.Nodes == "*" {
continue
}
@ -80,18 +80,17 @@ func GetCluster(cluster string) *schema.Cluster {
return nil
}
func GetSubCluster(cluster, subcluster string) *schema.SubCluster {
func GetSubCluster(cluster, subcluster string) (*schema.SubCluster, error) {
for _, c := range Clusters {
if c.Name == cluster {
for _, p := range c.SubClusters {
if p.Name == subcluster {
return p
return p, nil
}
}
}
}
return nil
return nil, fmt.Errorf("Subcluster '%v' not found for cluster '%v', or cluster '%v' not configured!", subcluster, cluster, cluster)
}
func GetMetricConfig(cluster, metric string) *schema.MetricConfig {
@ -138,7 +137,7 @@ func AssignSubCluster(job *schema.BaseJob) error {
}
}
if cluster.SubClusters[0].Nodes == "" {
if cluster.SubClusters[0].Nodes == "*" {
job.SubCluster = cluster.SubClusters[0].Name
return nil
}

View File

@ -7,17 +7,21 @@ package archive
import (
"bufio"
"bytes"
"compress/gzip"
"encoding/json"
"errors"
"fmt"
"os"
"path"
"path/filepath"
"strconv"
"strings"
"time"
"github.com/ClusterCockpit/cc-backend/internal/config"
"github.com/ClusterCockpit/cc-backend/pkg/log"
"github.com/ClusterCockpit/cc-backend/pkg/schema"
"github.com/santhosh-tekuri/jsonschema/v5"
)
type FsArchiveConfig struct {
@ -29,6 +33,11 @@ type FsArchive struct {
clusters []string
}
func checkFileExists(filePath string) bool {
_, err := os.Stat(filePath)
return !errors.Is(err, os.ErrNotExist)
}
func getPath(
job *schema.Job,
rootPath string,
@ -44,54 +53,109 @@ func getPath(
func loadJobMeta(filename string) (*schema.JobMeta, error) {
f, err := os.Open(filename)
b, err := os.ReadFile(filename)
if err != nil {
log.Errorf("loadJobMeta() > open file error: %v", err)
return &schema.JobMeta{}, err
}
defer f.Close()
return DecodeJobMeta(bufio.NewReader(f))
if config.Keys.Validate {
if err := schema.Validate(schema.Meta, bytes.NewReader(b)); err != nil {
return &schema.JobMeta{}, fmt.Errorf("validate job meta: %v", err)
}
}
func (fsa *FsArchive) Init(rawConfig json.RawMessage) error {
return DecodeJobMeta(bytes.NewReader(b))
}
func loadJobData(filename string, isCompressed bool) (schema.JobData, error) {
f, err := os.Open(filename)
if err != nil {
log.Errorf("fsBackend LoadJobData()- %v", err)
return nil, err
}
if isCompressed {
r, err := gzip.NewReader(f)
if err != nil {
log.Errorf(" %v", err)
return nil, err
}
defer r.Close()
if config.Keys.Validate {
if err := schema.Validate(schema.Data, r); err != nil {
return schema.JobData{}, fmt.Errorf("validate job data: %v", err)
}
}
return DecodeJobData(r, filename)
} else {
defer f.Close()
if config.Keys.Validate {
if err := schema.Validate(schema.Data, bufio.NewReader(f)); err != nil {
return schema.JobData{}, fmt.Errorf("validate job data: %v", err)
}
}
return DecodeJobData(bufio.NewReader(f), filename)
}
}
func (fsa *FsArchive) Init(rawConfig json.RawMessage) (int, error) {
var config FsArchiveConfig
if err := json.Unmarshal(rawConfig, &config); err != nil {
log.Warnf("Init() > Unmarshal error: %#v", err)
return err
return 0, err
}
if config.Path == "" {
err := fmt.Errorf("Init() : empty config.Path")
log.Errorf("Init() > config.Path error: %v", err)
return err
return 0, err
}
fsa.path = config.Path
b, err := os.ReadFile(filepath.Join(fsa.path, "version.txt"))
if err != nil {
fmt.Println("Err")
return 0, err
}
version, err := strconv.Atoi(strings.TrimSuffix(string(b), "\n"))
if err != nil {
log.Errorf("fsBackend Init()- %v", err)
return 0, err
}
if version != Version {
return version, fmt.Errorf("unsupported version %d, need %d", version, Version)
}
entries, err := os.ReadDir(fsa.path)
if err != nil {
log.Errorf("Init() > ReadDir() error: %v", err)
return err
return 0, err
}
for _, de := range entries {
if !de.IsDir() {
continue
}
fsa.clusters = append(fsa.clusters, de.Name())
}
return nil
return version, nil
}
func (fsa *FsArchive) LoadJobData(job *schema.Job) (schema.JobData, error) {
filename := getPath(job, fsa.path, "data.json")
f, err := os.Open(filename)
if err != nil {
log.Errorf("LoadJobData() > open file error: %v", err)
return nil, err
var isCompressed bool = true
filename := getPath(job, fsa.path, "data.json.gz")
if !checkFileExists(filename) {
filename = getPath(job, fsa.path, "data.json")
isCompressed = false
}
defer f.Close()
return DecodeJobData(bufio.NewReader(f), filename)
return loadJobData(filename, isCompressed)
}
func (fsa *FsArchive) LoadJobMeta(job *schema.Job) (*schema.JobMeta, error) {
@ -105,20 +169,19 @@ func (fsa *FsArchive) LoadClusterCfg(name string) (*schema.Cluster, error) {
b, err := os.ReadFile(filepath.Join(fsa.path, name, "cluster.json"))
if err != nil {
log.Errorf("LoadClusterCfg() > open file error: %v", err)
return &schema.Cluster{}, err
}
if config.Keys.Validate {
// if config.Keys.Validate {
if err := schema.Validate(schema.ClusterCfg, bytes.NewReader(b)); err != nil {
log.Warnf("Validate cluster config: %v\n", err)
return &schema.Cluster{}, fmt.Errorf("Validate cluster config: %v\n", err)
return &schema.Cluster{}, fmt.Errorf("validate cluster config: %v", err)
}
}
// }
return DecodeCluster(bytes.NewReader(b))
}
func (fsa *FsArchive) Iter() <-chan *schema.JobMeta {
func (fsa *FsArchive) Iter(loadMetricData bool) <-chan JobContainer {
ch := make(chan *schema.JobMeta)
ch := make(chan JobContainer)
go func() {
clustersDir, err := os.ReadDir(fsa.path)
if err != nil {
@ -126,6 +189,9 @@ func (fsa *FsArchive) Iter() <-chan *schema.JobMeta {
}
for _, clusterDir := range clustersDir {
if !clusterDir.IsDir() {
continue
}
lvl1Dirs, err := os.ReadDir(filepath.Join(fsa.path, clusterDir.Name()))
if err != nil {
log.Fatalf("Reading jobs failed @ lvl1 dirs: %s", err.Error())
@ -152,10 +218,27 @@ func (fsa *FsArchive) Iter() <-chan *schema.JobMeta {
for _, startTimeDir := range startTimeDirs {
if startTimeDir.IsDir() {
job, err := loadJobMeta(filepath.Join(dirpath, startTimeDir.Name(), "meta.json"))
if err != nil {
log.Errorf("error in %s: %s", filepath.Join(dirpath, startTimeDir.Name()), err.Error())
if err != nil && !errors.Is(err, &jsonschema.ValidationError{}) {
log.Errorf("in %s: %s", filepath.Join(dirpath, startTimeDir.Name()), err.Error())
}
if loadMetricData {
var isCompressed bool = true
filename := filepath.Join(dirpath, startTimeDir.Name(), "data.json.gz")
if !checkFileExists(filename) {
filename = filepath.Join(dirpath, startTimeDir.Name(), "data.json")
isCompressed = false
}
data, err := loadJobData(filename, isCompressed)
if err != nil && !errors.Is(err, &jsonschema.ValidationError{}) {
log.Errorf("in %s: %s", filepath.Join(dirpath, startTimeDir.Name()), err.Error())
}
ch <- JobContainer{Meta: job, Data: &data}
log.Errorf("in %s: %s", filepath.Join(dirpath, startTimeDir.Name()), err.Error())
} else {
ch <- job
ch <- JobContainer{Meta: job, Data: nil}
}
}
}
@ -225,6 +308,28 @@ func (fsa *FsArchive) ImportJob(
return err
}
// var isCompressed bool = true
// // TODO Use shortJob Config for check
// if jobMeta.Duration < 300 {
// isCompressed = false
// f, err = os.Create(path.Join(dir, "data.json"))
// } else {
// f, err = os.Create(path.Join(dir, "data.json.gz"))
// }
// if err != nil {
// return err
// }
//
// if isCompressed {
// if err := EncodeJobData(gzip.NewWriter(f), jobData); err != nil {
// return err
// }
// } else {
// if err := EncodeJobData(f, jobData); err != nil {
// return err
// }
// }
f, err = os.Create(path.Join(dir, "data.json"))
if err != nil {
log.Error("Error while creating filepath for data.json")
@ -236,9 +341,6 @@ func (fsa *FsArchive) ImportJob(
}
if err := f.Close(); err != nil {
log.Warn("Error while closing data.json file")
}
return err
}
// no error: final return is nil
return nil
}

View File

@ -20,7 +20,7 @@ func init() {
func TestInitEmptyPath(t *testing.T) {
var fsa FsArchive
err := fsa.Init(json.RawMessage("{\"kind\":\"../../test/archive\"}"))
_, err := fsa.Init(json.RawMessage("{\"kind\":\"../../test/archive\"}"))
if err == nil {
t.Fatal(err)
}
@ -28,14 +28,14 @@ func TestInitEmptyPath(t *testing.T) {
func TestInitNoJson(t *testing.T) {
var fsa FsArchive
err := fsa.Init(json.RawMessage("\"path\":\"../../test/archive\"}"))
_, err := fsa.Init(json.RawMessage("\"path\":\"../../test/archive\"}"))
if err == nil {
t.Fatal(err)
}
}
func TestInitNotExists(t *testing.T) {
var fsa FsArchive
err := fsa.Init(json.RawMessage("{\"path\":\"../../test/job-archive\"}"))
_, err := fsa.Init(json.RawMessage("{\"path\":\"../../test/job-archive\"}"))
if err == nil {
t.Fatal(err)
}
@ -43,15 +43,16 @@ func TestInitNotExists(t *testing.T) {
func TestInit(t *testing.T) {
var fsa FsArchive
err := fsa.Init(json.RawMessage("{\"path\":\"../../test/archive\"}"))
version, err := fsa.Init(json.RawMessage("{\"path\":\"../../test/archive\"}"))
if err != nil {
t.Fatal(err)
}
if fsa.path != "../../test/archive" {
t.Fail()
}
if version != 1 {
t.Fail()
}
if len(fsa.clusters) != 1 || fsa.clusters[0] != "emmy" {
t.Fail()
}
@ -59,7 +60,7 @@ func TestInit(t *testing.T) {
func TestLoadJobMetaInternal(t *testing.T) {
var fsa FsArchive
err := fsa.Init(json.RawMessage("{\"path\":\"../../test/archive\"}"))
_, err := fsa.Init(json.RawMessage("{\"path\":\"../../test/archive\"}"))
if err != nil {
t.Fatal(err)
}
@ -82,7 +83,7 @@ func TestLoadJobMetaInternal(t *testing.T) {
func TestLoadJobMeta(t *testing.T) {
var fsa FsArchive
err := fsa.Init(json.RawMessage("{\"path\":\"../../test/archive\"}"))
_, err := fsa.Init(json.RawMessage("{\"path\":\"../../test/archive\"}"))
if err != nil {
t.Fatal(err)
}
@ -110,7 +111,7 @@ func TestLoadJobMeta(t *testing.T) {
func TestLoadJobData(t *testing.T) {
var fsa FsArchive
err := fsa.Init(json.RawMessage("{\"path\":\"../../test/archive\"}"))
_, err := fsa.Init(json.RawMessage("{\"path\":\"../../test/archive\"}"))
if err != nil {
t.Fatal(err)
}
@ -136,7 +137,7 @@ func TestLoadJobData(t *testing.T) {
func TestLoadCluster(t *testing.T) {
var fsa FsArchive
err := fsa.Init(json.RawMessage("{\"path\":\"../../test/archive\"}"))
_, err := fsa.Init(json.RawMessage("{\"path\":\"../../test/archive\"}"))
if err != nil {
t.Fatal(err)
}
@ -146,22 +147,22 @@ func TestLoadCluster(t *testing.T) {
t.Fatal(err)
}
if cfg.SubClusters[0].CoresPerSocket != 10 {
if cfg.SubClusters[0].CoresPerSocket != 4 {
t.Fail()
}
}
func TestIter(t *testing.T) {
var fsa FsArchive
err := fsa.Init(json.RawMessage("{\"path\":\"../../test/archive\"}"))
_, err := fsa.Init(json.RawMessage("{\"path\":\"../../test/archive\"}"))
if err != nil {
t.Fatal(err)
}
for job := range fsa.Iter() {
fmt.Printf("Job %d\n", job.JobID)
for job := range fsa.Iter(false) {
fmt.Printf("Job %d\n", job.Meta.JobID)
if job.Cluster != "emmy" {
if job.Meta.Cluster != "emmy" {
t.Fail()
}
}

View File

@ -14,6 +14,8 @@ import (
type NodeList [][]interface {
consume(input string) (next string, ok bool)
limits() []map[string]int
prefix() string
}
func (nl *NodeList) Contains(name string) bool {
@ -35,6 +37,44 @@ func (nl *NodeList) Contains(name string) bool {
return false
}
func (nl *NodeList) PrintList() []string {
var out []string
for _, term := range *nl {
// Get String-Part first
prefix := term[0].prefix()
if len(term) == 1 { // If only String-Part in Term: Single Node Name -> Use as provided
out = append(out, prefix)
} else { // Else: Numeric start-end definition with x digits zeroPadded
limitArr := term[1].limits()
for _, inner := range limitArr {
for i := inner["start"]; i < inner["end"]+1; i++ {
if inner["zeroPadded"] == 1 {
out = append(out, fmt.Sprintf("%s%0*d", prefix, inner["digits"], i))
} else {
log.Error("node list: only zero-padded ranges are allowed")
}
}
}
}
}
return out
}
func (nl *NodeList) NodeCount() int {
var out int = 0
for _, term := range *nl {
if len(term) == 1 { // If only String-Part in Term: Single Node Name -> add one
out += 1
} else { // Else: Numeric start-end definition -> add difference + 1
limitArr := term[1].limits()
for _, inner := range limitArr {
out += (inner["end"] - inner["start"]) + 1
}
}
}
return out
}
type NLExprString string
func (nle NLExprString) consume(input string) (next string, ok bool) {
@ -45,6 +85,16 @@ func (nle NLExprString) consume(input string) (next string, ok bool) {
return "", false
}
func (nle NLExprString) limits() []map[string]int {
// Null implementation to fullfill interface requirement
l := make([]map[string]int, 0)
return l
}
func (nle NLExprString) prefix() string {
return string(nle)
}
type NLExprIntRanges []NLExprIntRange
func (nles NLExprIntRanges) consume(input string) (next string, ok bool) {
@ -56,6 +106,21 @@ func (nles NLExprIntRanges) consume(input string) (next string, ok bool) {
return "", false
}
func (nles NLExprIntRanges) limits() []map[string]int {
l := make([]map[string]int, 0)
for _, nle := range nles {
inner := nle.limits()
l = append(l, inner[0])
}
return l
}
func (nles NLExprIntRanges) prefix() string {
// Null implementation to fullfill interface requirement
var s string
return s
}
type NLExprIntRange struct {
start, end int64
zeroPadded bool
@ -89,6 +154,27 @@ func (nle NLExprIntRange) consume(input string) (next string, ok bool) {
return "", false
}
func (nle NLExprIntRange) limits() []map[string]int {
l := make([]map[string]int, 0)
m := make(map[string]int)
m["start"] = int(nle.start)
m["end"] = int(nle.end)
m["digits"] = int(nle.digits)
if nle.zeroPadded == true {
m["zeroPadded"] = 1
} else {
m["zeroPadded"] = 0
}
l = append(l, m)
return l
}
func (nles NLExprIntRange) prefix() string {
// Null implementation to fullfill interface requirement
var s string
return s
}
func ParseNodeList(raw string) (NodeList, error) {
isLetter := func(r byte) bool { return ('a' <= r && r <= 'z') || ('A' <= r && r <= 'Z') }
isDigit := func(r byte) bool { return '0' <= r && r <= '9' }
@ -117,6 +203,8 @@ func ParseNodeList(raw string) (NodeList, error) {
for _, rawterm := range rawterms {
exprs := []interface {
consume(input string) (next string, ok bool)
limits() []map[string]int
prefix() string
}{}
for i := 0; i < len(rawterm); i++ {

View File

@ -4,7 +4,10 @@
// license that can be found in the LICENSE file.
package schema
import "strconv"
import (
"fmt"
"strconv"
)
type Accelerator struct {
ID string `json:"id"`
@ -16,23 +19,27 @@ type Topology struct {
Node []int `json:"node"`
Socket [][]int `json:"socket"`
MemoryDomain [][]int `json:"memoryDomain"`
Die [][]int `json:"die"`
Die [][]*int `json:"die,omitempty"`
Core [][]int `json:"core"`
Accelerators []*Accelerator `json:"accelerators"`
Accelerators []*Accelerator `json:"accelerators,omitempty"`
}
type MetricValue struct {
Unit Unit `json:"unit"`
Value float64 `json:"value"`
}
type SubCluster struct {
Name string `json:"name"`
Nodes string `json:"nodes"`
NumberOfNodes int `json:"numberOfNodes"`
ProcessorType string `json:"processorType"`
SocketsPerNode int `json:"socketsPerNode"`
CoresPerSocket int `json:"coresPerSocket"`
ThreadsPerCore int `json:"threadsPerCore"`
FlopRateScalar int `json:"flopRateScalar"`
FlopRateSimd int `json:"flopRateSimd"`
MemoryBandwidth int `json:"memoryBandwidth"`
Topology *Topology `json:"topology"`
FlopRateScalar MetricValue `json:"flopRateScalar"`
FlopRateSimd MetricValue `json:"flopRateSimd"`
MemoryBandwidth MetricValue `json:"memoryBandwidth"`
Topology Topology `json:"topology"`
}
type SubClusterConfig struct {
@ -41,19 +48,20 @@ type SubClusterConfig struct {
Normal float64 `json:"normal"`
Caution float64 `json:"caution"`
Alert float64 `json:"alert"`
Remove bool `json:"remove"`
}
type MetricConfig struct {
Name string `json:"name"`
Unit string `json:"unit"`
Unit Unit `json:"unit"`
Scope MetricScope `json:"scope"`
Aggregation *string `json:"aggregation"`
Aggregation string `json:"aggregation"`
Timestep int `json:"timestep"`
Peak *float64 `json:"peak"`
Normal *float64 `json:"normal"`
Caution *float64 `json:"caution"`
Alert *float64 `json:"alert"`
SubClusters []*SubClusterConfig `json:"subClusters"`
Peak float64 `json:"peak"`
Normal float64 `json:"normal"`
Caution float64 `json:"caution"`
Alert float64 `json:"alert"`
SubClusters []*SubClusterConfig `json:"subClusters,omitempty"`
}
type Cluster struct {
@ -152,6 +160,15 @@ func (topo *Topology) GetMemoryDomainsFromHWThreads(
return memDoms, exclusive
}
// Temporary fix to convert back from int id to string id for accelerators
func (topo *Topology) GetAcceleratorID(id int) (string, error) {
if id < len(topo.Accelerators) {
return topo.Accelerators[id].ID, nil
} else {
return "", fmt.Errorf("Index %d out of range", id)
}
}
func (topo *Topology) GetAcceleratorIDs() ([]int, error) {
accels := make([]int, 0)
for _, accel := range topo.Accelerators {
@ -163,12 +180,3 @@ func (topo *Topology) GetAcceleratorIDs() ([]int, error) {
}
return accels, nil
}
func (topo *Topology) GetAcceleratorIndex(id string) (int, bool) {
for idx, accel := range topo.Accelerators {
if accel.ID == id {
return idx, true
}
}
return -1, false
}

View File

@ -83,10 +83,10 @@ func (s *Series) MarshalJSON() ([]byte, error) {
buf = append(buf, s.Hostname...)
buf = append(buf, '"')
if s.Id != nil {
buf = append(buf, `,"id":`...)
buf = strconv.AppendInt(buf, int64(*s.Id), 10)
buf = append(buf, `,"id":"`...)
buf = append(buf, *s.Id...)
buf = append(buf, '"')
}
if s.Statistics != nil {
buf = append(buf, `,"statistics":{"min":`...)
buf = strconv.AppendFloat(buf, s.Statistics.Min, 'f', 2, 64)
buf = append(buf, `,"avg":`...)
@ -94,7 +94,6 @@ func (s *Series) MarshalJSON() ([]byte, error) {
buf = append(buf, `,"max":`...)
buf = strconv.AppendFloat(buf, s.Statistics.Max, 'f', 2, 64)
buf = append(buf, '}')
}
buf = append(buf, `,"data":[`...)
for i := 0; i < len(s.Data); i++ {
if i != 0 {
@ -110,3 +109,23 @@ func (s *Series) MarshalJSON() ([]byte, error) {
buf = append(buf, ']', '}')
return buf, nil
}
func ConvertFloatToFloat64(s []Float) []float64 {
fp := make([]float64, len(s))
for i, val := range s {
fp[i] = float64(val)
}
return fp
}
func GetFloat64ToFloat(s []float64) []Float {
fp := make([]Float, len(s))
for i, val := range s {
fp[i] = Float(val)
}
return fp
}

View File

@ -21,18 +21,18 @@ type BaseJob struct {
Project string `json:"project" db:"project" example:"abcd200"` // The unique identifier of a project
Cluster string `json:"cluster" db:"cluster" example:"fritz"` // The unique identifier of a cluster
SubCluster string `json:"subCluster" db:"subcluster" example:"main"` // The unique identifier of a sub cluster
Partition string `json:"partition" db:"partition" example:"main"` // The Slurm partition to which the job was submitted
ArrayJobId int64 `json:"arrayJobId" db:"array_job_id" example:"123000"` // The unique identifier of an array job
Partition *string `json:"partition,omitempty" db:"partition" example:"main"` // The Slurm partition to which the job was submitted
ArrayJobId *int64 `json:"arrayJobId,omitempty" db:"array_job_id" example:"123000"` // The unique identifier of an array job
NumNodes int32 `json:"numNodes" db:"num_nodes" example:"2" minimum:"1"` // Number of nodes used (Min > 0)
NumHWThreads int32 `json:"numHwthreads" db:"num_hwthreads" example:"20" minimum:"1"` // Number of HWThreads used (Min > 0)
NumAcc int32 `json:"numAcc" db:"num_acc" example:"2" minimum:"1"` // Number of accelerators used (Min > 0)
NumHWThreads *int32 `json:"numHwthreads,omitempty" db:"num_hwthreads" example:"20" minimum:"1"` // Number of HWThreads used (Min > 0)
NumAcc *int32 `json:"numAcc,omitempty" db:"num_acc" example:"2" minimum:"1"` // Number of accelerators used (Min > 0)
Exclusive int32 `json:"exclusive" db:"exclusive" example:"1" minimum:"0" maximum:"2"` // Specifies how nodes are shared: 0 - Shared among multiple jobs of multiple users, 1 - Job exclusive (Default), 2 - Shared among multiple jobs of same user
MonitoringStatus int32 `json:"monitoringStatus" db:"monitoring_status" example:"1" minimum:"0" maximum:"3"` // State of monitoring system during job run: 0 - Disabled, 1 - Running or Archiving (Default), 2 - Archiving Failed, 3 - Archiving Successfull
SMT int32 `json:"smt" db:"smt" example:"4"` // SMT threads used by job
State JobState `json:"jobState" db:"job_state" example:"completed"` // Final state of job
MonitoringStatus int32 `json:"monitoringStatus,omitempty" db:"monitoring_status" example:"1" minimum:"0" maximum:"3"` // State of monitoring system during job run: 0 - Disabled, 1 - Running or Archiving (Default), 2 - Archiving Failed, 3 - Archiving Successfull
SMT *int32 `json:"smt,omitempty" db:"smt" example:"4"` // SMT threads used by job
State JobState `json:"jobState" db:"job_state" example:"completed" enums:"completed,failed,cancelled,stopped,timeout,out_of_memory"` // Final state of job
Duration int32 `json:"duration" db:"duration" example:"43200" minimum:"1"` // Duration of job in seconds (Min > 0)
Walltime int64 `json:"walltime" db:"walltime" example:"86400" minimum:"1"` // Requested walltime of job in seconds (Min > 0)
Tags []*Tag `json:"tags"` // List of tags
Walltime *int64 `json:"walltime,omitempty" db:"walltime" example:"86400" minimum:"1"` // Requested walltime of job in seconds (Min > 0)
Tags []*Tag `json:"tags,omitempty"` // List of tags
RawResources []byte `json:"-" db:"resources"` // Resources used by job [As Bytes]
Resources []*Resource `json:"resources"` // Resources used by job
RawMetaData []byte `json:"-" db:"meta_data"` // Additional information about the job [As Bytes]
@ -89,11 +89,15 @@ var JobDefaults BaseJob = BaseJob{
MonitoringStatus: MonitoringStatusRunningOrArchiving,
}
type Unit struct {
Base string `json:"base"`
Prefix *string `json:"prefix,omitempty"`
}
// JobStatistics model
// @Description Specification for job metric statistics.
type JobStatistics struct {
// Metric unit (see schema/unit.schema.json)
Unit string `json:"unit" example:"GHz"`
Unit Unit `json:"unit" example:"GHz"`
Avg float64 `json:"avg" example:"2500" minimum:"0"` // Job metric average
Min float64 `json:"min" example:"2000" minimum:"0"` // Job metric minimum
Max float64 `json:"max" example:"3000" minimum:"0"` // Job metric maximum
@ -102,6 +106,7 @@ type JobStatistics struct {
// Tag model
// @Description Defines a tag using name and type.
type Tag struct {
// The unique DB identifier of a tag
// The unique DB identifier of a tag
ID int64 `json:"id" db:"id"`
Type string `json:"type" db:"tag_type" example:"Debug"` // Tag Type

View File

@ -15,17 +15,16 @@ import (
type JobData map[string]map[MetricScope]*JobMetric
type JobMetric struct {
Unit string `json:"unit"`
Scope MetricScope `json:"scope"`
Unit Unit `json:"unit"`
Timestep int `json:"timestep"`
Series []Series `json:"series"`
StatisticsSeries *StatsSeries `json:"statisticsSeries"`
StatisticsSeries *StatsSeries `json:"statisticsSeries,omitempty"`
}
type Series struct {
Hostname string `json:"hostname"`
Id *int `json:"id,omitempty"`
Statistics *MetricStatistics `json:"statistics"`
Id *string `json:"id,omitempty"`
Statistics MetricStatistics `json:"statistics"`
Data []Float `json:"data"`
}
@ -218,17 +217,12 @@ func (jd *JobData) AddNodeScope(metric string) bool {
nodeJm := &JobMetric{
Unit: jm.Unit,
Scope: MetricScopeNode,
Timestep: jm.Timestep,
Series: make([]Series, 0, len(hosts)),
}
for hostname, series := range hosts {
min, sum, max := math.MaxFloat32, 0.0, -math.MaxFloat32
for _, series := range series {
if series.Statistics == nil {
min, sum, max = math.NaN(), math.NaN(), math.NaN()
break
}
sum += series.Statistics.Avg
min = math.Min(min, series.Statistics.Min)
max = math.Max(max, series.Statistics.Max)
@ -259,7 +253,7 @@ func (jd *JobData) AddNodeScope(metric string) bool {
nodeJm.Series = append(nodeJm.Series, Series{
Hostname: hostname,
Statistics: &MetricStatistics{Min: min, Avg: sum / float64(len(series)), Max: max},
Statistics: MetricStatistics{Min: min, Avg: sum / float64(len(series)), Max: max},
Data: data,
})
}

View File

@ -21,7 +21,7 @@
},
"unit": {
"description": "Metric unit",
"type": "string"
"$ref": "embedfs://unit.schema.json"
},
"scope": {
"description": "Native measurement resolution",
@ -38,7 +38,22 @@
"sum",
"avg"
]
},
"peak": {
"description": "Metric peak threshold (Upper metric limit)",
"type": "number"
},
"normal": {
"description": "Metric normal threshold",
"type": "number"
},
"caution": {
"description": "Metric caution threshold (Suspicious but does not require immediate action)",
"type": "number"
},
"alert": {
"description": "Metric alert threshold (Requires immediate action)",
"type": "number"
},
"subClusters": {
"description": "Array of cluster hardware partition metric thresholds",
@ -61,13 +76,13 @@
},
"alert": {
"type": "number"
},
"remove": {
"type": "boolean"
}
},
"required": [
"name",
"peak",
"caution",
"alert"
"name"
]
}
}
@ -76,7 +91,12 @@
"name",
"unit",
"scope",
"timestep"
"timestep",
"aggregation",
"peak",
"normal",
"caution",
"alert"
]
},
"minItems": 1
@ -109,15 +129,42 @@
},
"flopRateScalar": {
"description": "Theoretical node peak flop rate for scalar code in GFlops/s",
"type": "integer"
"type": "object",
"properties": {
"unit": {
"description": "Metric unit",
"$ref": "embedfs://unit.schema.json"
},
"value": {
"type": "number"
}
}
},
"flopRateSimd": {
"description": "Theoretical node peak flop rate for SIMD code in GFlops/s",
"type": "integer"
"type": "object",
"properties": {
"unit": {
"description": "Metric unit",
"$ref": "embedfs://unit.schema.json"
},
"value": {
"type": "number"
}
}
},
"memoryBandwidth": {
"description": "Theoretical node peak memory bandwidth in GB/s",
"type": "integer"
"type": "object",
"properties": {
"unit": {
"description": "Metric unit",
"$ref": "embedfs://unit.schema.json"
},
"value": {
"type": "number"
}
}
},
"nodes": {
"description": "Node list expression",
@ -215,6 +262,7 @@
},
"required": [
"name",
"nodes",
"topology",
"processorType",
"socketsPerNode",

View File

@ -86,8 +86,8 @@
},
"minProperties": 1
},
"cpu_used": {
"description": "CPU active core utilization",
"cpu_user": {
"description": "CPU user active core utilization",
"properties": {
"node": {
"$ref": "embedfs://job-metric-data.schema.json"
@ -479,7 +479,7 @@
]
},
"required": [
"cpu_used",
"cpu_user",
"mem_used",
"flops_any",
"mem_bw",

View File

@ -84,11 +84,6 @@
"type": "integer",
"exclusiveMinimum": 0
},
"stopTime": {
"description": "Stop epoch time stamp in seconds",
"type": "integer",
"exclusiveMinimum": 0
},
"duration": {
"description": "Duration of job in seconds",
"type": "integer",
@ -198,8 +193,8 @@
"description": "Instructions executed per cycle",
"$ref": "embedfs://job-metric-statistics.schema.json"
},
"cpu_used": {
"description": "CPU active core utilization",
"cpu_user": {
"description": "CPU user active core utilization",
"$ref": "embedfs://job-metric-statistics.schema.json"
},
"flops_dp": {
@ -331,7 +326,7 @@
}
},
"required": [
"cpu_used",
"cpu_user",
"mem_used",
"flops_any",
"mem_bw"
@ -343,13 +338,13 @@
"user",
"project",
"cluster",
"subCluster",
"numNodes",
"exclusive",
"startTime",
"jobState",
"duration",
"resources",
"tags",
"statistics"
]
}

View File

@ -193,7 +193,7 @@
},
"data": {
"type": "array",
"items": {
"contains": {
"type": "number",
"minimum": 0
},

View File

@ -5,7 +5,7 @@
"description": "Format specification for job metric units",
"type": "object",
"properties": {
"base_unit": {
"base": {
"description": "Metric base unit",
"type": "string",
"enum": [
@ -15,7 +15,6 @@
"F/s",
"CPI",
"IPC",
"load",
"Hz",
"W",
"°C",
@ -36,6 +35,6 @@
}
},
"required": [
"base_unit"
"base"
]
}

View File

@ -45,9 +45,29 @@ func TestValidateCluster(t *testing.T) {
"socketsPerNode": 2,
"coresPerSocket": 10,
"threadsPerCore": 2,
"flopRateScalar": 44,
"flopRateSimd": 704,
"memoryBandwidth": 80,
"flopRateScalar": {
"unit": {
"prefix": "G",
"base": "F/s"
},
"value": 14
},
"flopRateSimd": {
"unit": {
"prefix": "G",
"base": "F/s"
},
"value": 112
},
"memoryBandwidth": {
"unit": {
"prefix": "G",
"base": "B/s"
},
"value": 24
},
"numberOfNodes": 70,
"nodes": "w11[27-45,49-63,69-72]",
"topology": {
"node": [0,20,1,21,2,22,3,23,4,24,5,25,6,26,7,27,8,28,9,29,10,30,11,31,12,32,13,33,14,34,15,35,16,36,17,37,18,38,19,39],
"socket": [
@ -68,8 +88,13 @@ func TestValidateCluster(t *testing.T) {
{
"name": "cpu_load",
"scope": "hwthread",
"unit": "load",
"timestep": 60
"unit": {"base": ""},
"aggregation": "avg",
"timestep": 60,
"peak": 4,
"normal": 2,
"caution": 1,
"alert": 0.25
}
]
}`)

View File

@ -1,6 +1,7 @@
# cc-units - A unit system for ClusterCockpit
When working with metrics, the problem comes up that they may use different unit name but have the same unit in fact. There are a lot of real world examples like 'kB' and 'Kbyte'. In [cc-metric-collector](https://github.com/ClusterCockpit/cc-metric-collector), the collectors read data from different sources which may use different units or the programmer specifies a unit for a metric by hand. The cc-units system is not comparable with the SI unit system. If you are looking for a package for the SI units, see [here](https://pkg.go.dev/github.com/gurre/si).
When working with metrics, the problem comes up that they may use different unit name but have the same unit in fact.
There are a lot of real world examples like 'kB' and 'Kbyte'. In [cc-metric-collector](https://github.com/ClusterCockpit/cc-metric-collector), the collectors read data from different sources which may use different units or the programmer specifies a unit for a metric by hand. The cc-units system is not comparable with the SI unit system. If you are looking for a package for the SI units, see [here](https://pkg.go.dev/github.com/gurre/si).
In order to enable unit comparison and conversion, the ccUnits package provides some helpers:
```go

View File

@ -39,7 +39,7 @@ var MeasuresMap map[Measure]MeasureData = map[Measure]MeasureData{
},
Flops: {
Long: "Flops",
Short: "Flops",
Short: "F",
Regex: "^([fF][lL]?[oO]?[pP]?[sS]?)",
},
Percentage: {

View File

@ -1,6 +1,7 @@
package units
import (
"math"
"regexp"
)
@ -172,3 +173,20 @@ func NewPrefix(prefix string) Prefix {
}
return InvalidPrefix
}
func getExponent(p float64) int {
count := 0
for p > 1.0 {
p = p / 1000.0
count++
}
return count * 3
}
func NewPrefixFromFactor(op Prefix, e int) Prefix {
f := float64(op)
exp := math.Pow10(getExponent(f) - e)
return Prefix(exp)
}

View File

@ -3,7 +3,10 @@ package units
import (
"fmt"
"math"
"strings"
"github.com/ClusterCockpit/cc-backend/pkg/schema"
)
type unit struct {
@ -25,7 +28,9 @@ type Unit interface {
var INVALID_UNIT = NewUnit("foobar")
// Valid checks whether a unit is a valid unit. A unit is valid if it has at least a prefix and a measure. The unit denominator is optional.
// Valid checks whether a unit is a valid unit.
// A unit is valid if it has at least a prefix and a measure.
// The unit denominator is optional.
func (u *unit) Valid() bool {
return u.prefix != InvalidPrefix && u.measure != InvalidMeasure
}
@ -71,6 +76,90 @@ func (u *unit) getUnitDenominator() Measure {
return u.divMeasure
}
func ConvertValue(v *float64, from string, to string) {
uf := NewUnit(from)
ut := NewUnit(to)
factor := float64(uf.getPrefix()) / float64(ut.getPrefix())
*v = math.Ceil(*v * factor)
}
func ConvertSeries(s []float64, from string, to string) {
uf := NewUnit(from)
ut := NewUnit(to)
factor := float64(uf.getPrefix()) / float64(ut.getPrefix())
for i := 0; i < len(s); i++ {
s[i] = math.Ceil(s[i] * factor)
}
}
func getNormalizationFactor(v float64) (float64, int) {
count := 0
scale := -3
if v > 1000.0 {
for v > 1000.0 {
v *= 1e-3
count++
}
} else {
for v < 1.0 {
v *= 1e3
count++
}
scale = 3
}
return math.Pow10(count * scale), count * scale
}
func NormalizeValue(v *float64, us string, nu *string) {
u := NewUnit(us)
f, e := getNormalizationFactor((*v))
*v = math.Ceil(*v * f)
u.setPrefix(NewPrefixFromFactor(u.getPrefix(), e))
*nu = u.Short()
}
func NormalizeSeries(s []float64, avg float64, us string, nu *string) {
u := NewUnit(us)
f, e := getNormalizationFactor(avg)
for i := 0; i < len(s); i++ {
s[i] *= f
s[i] = math.Ceil(s[i])
}
u.setPrefix(NewPrefixFromFactor(u.getPrefix(), e))
fmt.Printf("Prefix: %e \n", u.getPrefix())
*nu = u.Short()
}
func ConvertUnitString(us string) schema.Unit {
var nu schema.Unit
if us == "CPI" ||
us == "IPC" ||
us == "load" ||
us == "" {
nu.Base = us
return nu
}
u := NewUnit(us)
p := u.getPrefix()
if p.Prefix() != "" {
prefix := p.Prefix()
nu.Prefix = &prefix
}
m := u.getMeasure()
d := u.getUnitDenominator()
if d.Short() != "inval" {
nu.Base = fmt.Sprintf("%s/%s", m.Short(), d.Short())
} else {
nu.Base = m.Short()
}
return nu
}
// GetPrefixPrefixFactor creates the default conversion function between two prefixes.
// It returns a conversation function for the value.
func GetPrefixPrefixFactor(in Prefix, out Prefix) func(value interface{}) interface{} {

View File

@ -2,6 +2,7 @@ package units
import (
"fmt"
"reflect"
"regexp"
"testing"
)
@ -199,3 +200,108 @@ func TestPrefixRegex(t *testing.T) {
t.Logf("succussfully compiled regex '%s' for prefix %s", data.Regex, data.Long)
}
}
func TestConvertValue(t *testing.T) {
v := float64(103456)
ConvertValue(&v, "MB/s", "GB/s")
if v != 104.00 {
t.Errorf("Failed ConvertValue: Want 103.456, Got %f", v)
}
}
func TestConvertValueUp(t *testing.T) {
v := float64(10.3456)
ConvertValue(&v, "GB/s", "MB/s")
if v != 10346.00 {
t.Errorf("Failed ConvertValue: Want 10346.00, Got %f", v)
}
}
func TestConvertSeries(t *testing.T) {
s := []float64{2890031237, 23998994567, 389734042344, 390349424345}
r := []float64{3, 24, 390, 391}
ConvertSeries(s, "F/s", "GF/s")
if !reflect.DeepEqual(s, r) {
t.Errorf("Failed ConvertValue: Want 3, 24, 390, 391, Got %v", s)
}
}
func TestNormalizeValue(t *testing.T) {
var s string
v := float64(103456)
NormalizeValue(&v, "MB/s", &s)
if v != 104.00 {
t.Errorf("Failed ConvertValue: Want 104.00, Got %f", v)
}
if s != "GB/s" {
t.Errorf("Failed Prefix or unit: Want GB/s, Got %s", s)
}
}
func TestNormalizeValueNoPrefix(t *testing.T) {
var s string
v := float64(103458596)
NormalizeValue(&v, "F/s", &s)
if v != 104.00 {
t.Errorf("Failed ConvertValue: Want 104.00, Got %f", v)
}
if s != "MF/s" {
t.Errorf("Failed Prefix or unit: Want MF/s, Got %s", s)
}
}
func TestNormalizeValueKeep(t *testing.T) {
var s string
v := float64(345)
NormalizeValue(&v, "MB/s", &s)
if v != 345.00 {
t.Errorf("Failed ConvertValue: Want 104.00, Got %f", v)
}
if s != "MB/s" {
t.Errorf("Failed Prefix or unit: Want GB/s, Got %s", s)
}
}
func TestNormalizeValueDown(t *testing.T) {
var s string
v := float64(0.0004578)
NormalizeValue(&v, "GB/s", &s)
if v != 458.00 {
t.Errorf("Failed ConvertValue: Want 458.00, Got %f", v)
}
if s != "KB/s" {
t.Errorf("Failed Prefix or unit: Want KB/s, Got %s", s)
}
}
func TestNormalizeSeries(t *testing.T) {
var us string
s := []float64{2890031237, 23998994567, 389734042344, 390349424345}
r := []float64{3, 24, 390, 391}
total := 0.0
for _, number := range s {
total += number
}
avg := total / float64(len(s))
fmt.Printf("AVG: %e\n", avg)
NormalizeSeries(s, avg, "KB/s", &us)
if !reflect.DeepEqual(s, r) {
t.Errorf("Failed ConvertValue: Want 3, 24, 390, 391, Got %v", s)
}
if us != "TB/s" {
t.Errorf("Failed Prefix or unit: Want TB/s, Got %s", us)
}
}

File diff suppressed because one or more lines are too long

Binary file not shown.

View File

@ -1 +1,194 @@
{"exclusive":1,"jobId":1403244,"statistics":{"mem_bw":{"avg":63.57,"min":0,"unit":"GB/s","max":74.5},"rapl_power":{"avg":228.07,"min":0,"unit":"W","max":258.56},"ipc":{"unit":"IPC","max":0.510204081632653,"avg":1.53846153846154,"min":0.0},"clock":{"min":1380.32,"avg":2599.39,"unit":"MHz","max":2634.46},"cpu_load":{"avg":18.4,"min":0,"max":23.58,"unit":"load"},"flops_any":{"max":404.62,"unit":"GF/s","avg":225.59,"min":0},"flops_dp":{"max":0.24,"unit":"GF/s","min":0,"avg":0},"mem_used":{"min":1.55,"avg":27.84,"unit":"GB","max":37.5},"flops_sp":{"min":0,"avg":225.59,"max":404.62,"unit":"GF/s"}},"resources":[{"hostname":"e0102"},{"hostname":"e0103"},{"hostname":"e0105"},{"hostname":"e0106"},{"hostname":"e0107"},{"hostname":"e0108"},{"hostname":"e0114"},{"hostname":"e0320"},{"hostname":"e0321"},{"hostname":"e0325"},{"hostname":"e0404"},{"hostname":"e0415"},{"hostname":"e0433"},{"hostname":"e0437"},{"hostname":"e0439"},{"hostname":"e0501"},{"hostname":"e0503"},{"hostname":"e0505"},{"hostname":"e0506"},{"hostname":"e0512"},{"hostname":"e0513"},{"hostname":"e0514"},{"hostname":"e0653"},{"hostname":"e0701"},{"hostname":"e0716"},{"hostname":"e0727"},{"hostname":"e0728"},{"hostname":"e0925"},{"hostname":"e0926"},{"hostname":"e0929"},{"hostname":"e0934"},{"hostname":"e0951"}],"walltime":10,"jobState":"completed","cluster":"emmy","stopTime":1609009562,"user":"emmyUser6","startTime":1608923076,"partition":"work","tags":[],"project":"no project","numNodes":32,"duration":86486}
{
"exclusive": 1,
"jobId": 1403244,
"statistics": {
"mem_bw": {
"avg": 63.57,
"min": 0,
"unit": {
"base": "B/s",
"prefix": "G"
},
"max": 74.5
},
"rapl_power": {
"avg": 228.07,
"min": 0,
"unit": {
"base": "W"
},
"max": 258.56
},
"ipc": {
"unit": {
"base": "IPC"
},
"max": 0.510204081632653,
"avg": 1.53846153846154,
"min": 0.0
},
"clock": {
"min": 1380.32,
"avg": 2599.39,
"unit": {
"base": "Hz",
"prefix": "M"
},
"max": 2634.46
},
"cpu_load": {
"avg": 18.4,
"min": 0,
"max": 23.58,
"unit": {
"base": "load"
}
},
"flops_any": {
"max": 404.62,
"unit": {
"base": "F/s",
"prefix": "G"
},
"avg": 225.59,
"min": 0
},
"flops_dp": {
"max": 0.24,
"unit": {
"base": "F/s",
"prefix": "G"
},
"min": 0,
"avg": 0
},
"mem_used": {
"min": 1.55,
"avg": 27.84,
"unit": {
"base": "B",
"prefix": "G"
},
"max": 37.5
},
"flops_sp": {
"min": 0,
"avg": 225.59,
"max": 404.62,
"unit": {
"base": "F/s",
"prefix": "G"
}
}
},
"resources": [
{
"hostname": "e0102"
},
{
"hostname": "e0103"
},
{
"hostname": "e0105"
},
{
"hostname": "e0106"
},
{
"hostname": "e0107"
},
{
"hostname": "e0108"
},
{
"hostname": "e0114"
},
{
"hostname": "e0320"
},
{
"hostname": "e0321"
},
{
"hostname": "e0325"
},
{
"hostname": "e0404"
},
{
"hostname": "e0415"
},
{
"hostname": "e0433"
},
{
"hostname": "e0437"
},
{
"hostname": "e0439"
},
{
"hostname": "e0501"
},
{
"hostname": "e0503"
},
{
"hostname": "e0505"
},
{
"hostname": "e0506"
},
{
"hostname": "e0512"
},
{
"hostname": "e0513"
},
{
"hostname": "e0514"
},
{
"hostname": "e0653"
},
{
"hostname": "e0701"
},
{
"hostname": "e0716"
},
{
"hostname": "e0727"
},
{
"hostname": "e0728"
},
{
"hostname": "e0925"
},
{
"hostname": "e0926"
},
{
"hostname": "e0929"
},
{
"hostname": "e0934"
},
{
"hostname": "e0951"
}
],
"walltime": 10,
"jobState": "completed",
"cluster": "emmy",
"subCluster": "haswell",
"stopTime": 1609009562,
"user": "emmyUser6",
"startTime": 1608923076,
"partition": "work",
"tags": [],
"project": "no project",
"numNodes": 32,
"duration": 86486
}

File diff suppressed because one or more lines are too long

Binary file not shown.

View File

@ -1 +1,194 @@
{"stopTime":1609387081,"resources":[{"hostname":"e0151"},{"hostname":"e0152"},{"hostname":"e0153"},{"hostname":"e0232"},{"hostname":"e0303"},{"hostname":"e0314"},{"hostname":"e0344"},{"hostname":"e0345"},{"hostname":"e0348"},{"hostname":"e0507"},{"hostname":"e0518"},{"hostname":"e0520"},{"hostname":"e0522"},{"hostname":"e0526"},{"hostname":"e0527"},{"hostname":"e0528"},{"hostname":"e0530"},{"hostname":"e0551"},{"hostname":"e0604"},{"hostname":"e0613"},{"hostname":"e0634"},{"hostname":"e0639"},{"hostname":"e0640"},{"hostname":"e0651"},{"hostname":"e0653"},{"hostname":"e0701"},{"hostname":"e0704"},{"hostname":"e0751"},{"hostname":"e0809"},{"hostname":"e0814"},{"hostname":"e0819"},{"hostname":"e0908"}],"walltime":10,"cluster":"emmy","jobState":"completed","statistics":{"clock":{"max":2634.9,"unit":"MHz","min":0,"avg":2597.8},"cpu_load":{"max":27.41,"unit":"load","min":0,"avg":18.39},"mem_bw":{"min":0,"avg":63.23,"unit":"GB/s","max":75.06},"ipc":{"min":0.0,"avg":1.53846153846154,"unit":"IPC","max":0.490196078431373},"rapl_power":{"min":0,"avg":227.32,"unit":"W","max":256.22},"mem_used":{"min":1.5,"avg":27.77,"unit":"GB","max":37.43},"flops_sp":{"unit":"GF/s","max":413.21,"min":0,"avg":224.41},"flops_dp":{"max":5.72,"unit":"GF/s","min":0,"avg":0},"flops_any":{"min":0,"avg":224.42,"max":413.21,"unit":"GF/s"}},"exclusive":1,"jobId":1404397,"tags":[],"partition":"work","project":"no project","user":"emmyUser6","startTime":1609300556,"duration":86525,"numNodes":32}
{
"stopTime": 1609387081,
"resources": [
{
"hostname": "e0151"
},
{
"hostname": "e0152"
},
{
"hostname": "e0153"
},
{
"hostname": "e0232"
},
{
"hostname": "e0303"
},
{
"hostname": "e0314"
},
{
"hostname": "e0344"
},
{
"hostname": "e0345"
},
{
"hostname": "e0348"
},
{
"hostname": "e0507"
},
{
"hostname": "e0518"
},
{
"hostname": "e0520"
},
{
"hostname": "e0522"
},
{
"hostname": "e0526"
},
{
"hostname": "e0527"
},
{
"hostname": "e0528"
},
{
"hostname": "e0530"
},
{
"hostname": "e0551"
},
{
"hostname": "e0604"
},
{
"hostname": "e0613"
},
{
"hostname": "e0634"
},
{
"hostname": "e0639"
},
{
"hostname": "e0640"
},
{
"hostname": "e0651"
},
{
"hostname": "e0653"
},
{
"hostname": "e0701"
},
{
"hostname": "e0704"
},
{
"hostname": "e0751"
},
{
"hostname": "e0809"
},
{
"hostname": "e0814"
},
{
"hostname": "e0819"
},
{
"hostname": "e0908"
}
],
"walltime": 10,
"cluster": "emmy",
"subCluster": "haswell",
"jobState": "completed",
"statistics": {
"clock": {
"max": 2634.9,
"unit": {
"base": "Hz",
"prefix": "M"
},
"min": 0,
"avg": 2597.8
},
"cpu_load": {
"max": 27.41,
"min": 0,
"avg": 18.39,
"unit": {
"base": "load"
}
},
"mem_bw": {
"min": 0,
"avg": 63.23,
"unit": {
"base": "B/s",
"prefix": "G"
},
"max": 75.06
},
"ipc": {
"min": 0.0,
"avg": 1.53846153846154,
"unit": {
"base": "IPC"
},
"max": 0.490196078431373
},
"rapl_power": {
"min": 0,
"avg": 227.32,
"unit": {
"base": "W"
},
"max": 256.22
},
"mem_used": {
"min": 1.5,
"avg": 27.77,
"unit": {
"base": "B",
"prefix": "G"
},
"max": 37.43
},
"flops_sp": {
"unit": {
"base": "F/s",
"prefix": "G"
},
"max": 413.21,
"min": 0,
"avg": 224.41
},
"flops_dp": {
"max": 5.72,
"unit": {
"base": "F/s",
"prefix": "G"
},
"min": 0,
"avg": 0
},
"flops_any": {
"min": 0,
"avg": 224.42,
"max": 413.21,
"unit": {
"base": "F/s",
"prefix": "G"
}
}
},
"exclusive": 1,
"jobId": 1404397,
"tags": [],
"partition": "work",
"project": "no project",
"user": "emmyUser6",
"startTime": 1609300556,
"duration": 86525,
"numNodes": 32
}

File diff suppressed because it is too large Load Diff

1
test/archive/version.txt Normal file
View File

@ -0,0 +1 @@
1

View File

@ -1,13 +1,14 @@
{
"cpu_used": {
"core": {
"unit": "cpu used",
"scope": "core",
"unit": {
"base": ""
},
"timestep": 30,
"series": [
{
"hostname": "taurusi6489",
"id": 0,
"id": "0",
"statistics": {
"min": 0.09090909090909093,
"avg": 0.9173553719008265,
@ -29,7 +30,7 @@
},
{
"hostname": "taurusi6489",
"id": 1,
"id": "1",
"statistics": {
"min": 0.03694102397926118,
"avg": 0.045968409230268584,
@ -51,7 +52,7 @@
},
{
"hostname": "taurusi6490",
"id": 10,
"id": "10",
"statistics": {
"min": 0.10505319148936171,
"avg": 0.9186411992263056,
@ -73,7 +74,7 @@
},
{
"hostname": "taurusi6490",
"id": 11,
"id": "11",
"statistics": {
"min": 0.05286048845767815,
"avg": 0.07053823838706144,
@ -99,13 +100,14 @@
},
"ipc": {
"core": {
"unit": "IPC",
"scope": "core",
"unit": {
"base": "IPC"
},
"timestep": 60,
"series": [
{
"hostname": "taurusi6489",
"id": 0,
"id": "0",
"statistics": {
"min": 1.3808406263195592,
"avg": 1.3960848578375105,
@ -121,7 +123,7 @@
},
{
"hostname": "taurusi6489",
"id": 1,
"id": "1",
"statistics": {
"min": 0.30469640475234366,
"avg": 0.8816944294664065,
@ -137,7 +139,7 @@
},
{
"hostname": "taurusi6490",
"id": 10,
"id": "10",
"statistics": {
"min": 1.3791232173760588,
"avg": 1.3850247295506815,
@ -153,7 +155,7 @@
},
{
"hostname": "taurusi6490",
"id": 11,
"id": "11",
"statistics": {
"min": 0.6424094604392216,
"avg": 0.9544442638400293,
@ -173,13 +175,14 @@
},
"flops_any": {
"core": {
"unit": "F/s",
"scope": "core",
"unit": {
"base": "F/s"
},
"timestep": 60,
"series": [
{
"hostname": "taurusi6489",
"id": 0,
"id": "0",
"statistics": {
"min": 0.0,
"avg": 184.2699002412084,
@ -195,7 +198,7 @@
},
{
"hostname": "taurusi6489",
"id": 1,
"id": "1",
"statistics": {
"min": 0.13559227208748068,
"avg": 273.2997868356056,
@ -211,7 +214,7 @@
},
{
"hostname": "taurusi6490",
"id": 10,
"id": "10",
"statistics": {
"min": 0.0,
"avg": 1678.8419461262179,
@ -227,7 +230,7 @@
},
{
"hostname": "taurusi6490",
"id": 11,
"id": "11",
"statistics": {
"min": 45.28689133054866,
"avg": 609.6644949204072,
@ -247,13 +250,14 @@
},
"mem_bw": {
"socket": {
"unit": "B/s",
"scope": "socket",
"unit": {
"base": "B/s"
},
"timestep": 60,
"series": [
{
"hostname": "taurusi6489",
"id": 0,
"id": "0",
"statistics": {
"min": 653671812.1661415,
"avg": 1637585527.5854635,
@ -269,7 +273,7 @@
},
{
"hostname": "taurusi6490",
"id": 0,
"id": "0",
"statistics": {
"min": 1520190251.61048,
"avg": 1572477682.3850098,
@ -289,8 +293,9 @@
},
"file_bw": {
"node": {
"unit": "B/s",
"scope": "node",
"unit": {
"base": "B/s"
},
"timestep": 30,
"series": [
{
@ -341,8 +346,9 @@
},
"net_bw": {
"node": {
"unit": "B/s",
"scope": "node",
"unit": {
"base": "B/s"
},
"timestep": 30,
"series": [
{
@ -393,8 +399,9 @@
},
"mem_used": {
"node": {
"unit": "B",
"scope": "node",
"unit": {
"base": "B"
},
"timestep": 30,
"series": [
{
@ -445,13 +452,14 @@
},
"cpu_power": {
"socket": {
"unit": "W",
"scope": "socket",
"unit": {
"base": "W"
},
"timestep": 60,
"series": [
{
"hostname": "taurusi6489",
"id": 0,
"id": "0",
"statistics": {
"min": 35.50647456742635,
"avg": 72.08313211552377,
@ -467,7 +475,7 @@
},
{
"hostname": "taurusi6490",
"id": 0,
"id": "0",
"statistics": {
"min": 83.8466923147859,
"avg": 85.18572681122097,

View File

@ -59,10 +59,6 @@ func setup(t *testing.T) *api.RestApi {
const testclusterJson = `{
"name": "testcluster",
"subClusters": [
{
"name": "sc0",
"nodes": "host120,host121,host122"
},
{
"name": "sc1",
"nodes": "host123,host124,host125",
@ -70,9 +66,28 @@ func setup(t *testing.T) *api.RestApi {
"socketsPerNode": 1,
"coresPerSocket": 4,
"threadsPerCore": 2,
"flopRateScalar": 44,
"flopRateSimd": 704,
"memoryBandwidth": 80,
"flopRateScalar": {
"unit": {
"prefix": "G",
"base": "F/s"
},
"value": 14
},
"flopRateSimd": {
"unit": {
"prefix": "G",
"base": "F/s"
},
"value": 112
},
"memoryBandwidth": {
"unit": {
"prefix": "G",
"base": "B/s"
},
"value": 24
},
"numberOfNodes": 70,
"topology": {
"node": [0, 1, 2, 3, 4, 5, 6, 7],
"socket": [[0, 1, 2, 3, 4, 5, 6, 7]],
@ -85,9 +100,10 @@ func setup(t *testing.T) *api.RestApi {
"metricConfig": [
{
"name": "load_one",
"unit": "load",
"unit": { "base": ""},
"scope": "node",
"timestep": 60,
"aggregation": "avg",
"peak": 8,
"normal": 0,
"caution": 0,
@ -95,19 +111,38 @@ func setup(t *testing.T) *api.RestApi {
}
]
}`
const taurusclusterJson = `{
"name": "taurus",
"SubClusters": [
"subClusters": [
{
"name": "haswell",
"processorType": "Intel Haswell",
"socketsPerNode": 2,
"coresPerSocket": 12,
"threadsPerCore": 1,
"flopRateScalar": 32,
"flopRateSimd": 512,
"memoryBandwidth": 60,
"flopRateScalar": {
"unit": {
"prefix": "G",
"base": "F/s"
},
"value": 14
},
"flopRateSimd": {
"unit": {
"prefix": "G",
"base": "F/s"
},
"value": 112
},
"memoryBandwidth": {
"unit": {
"prefix": "G",
"base": "B/s"
},
"value": 24
},
"numberOfNodes": 70,
"nodes": "w11[27-45,49-63,69-72]",
"topology": {
"node": [ 0, 1 ],
"socket": [
@ -126,8 +161,13 @@ func setup(t *testing.T) *api.RestApi {
{
"name": "cpu_used",
"scope": "core",
"unit": "",
"unit": {"base": ""},
"aggregation": "avg",
"timestep": 30,
"peak": 1,
"normal": 0.5,
"caution": 2e-07,
"alert": 1e-07,
"subClusters": [
{
"name": "haswell",
@ -141,8 +181,13 @@ func setup(t *testing.T) *api.RestApi {
{
"name": "ipc",
"scope": "core",
"unit": "IPC",
"unit": { "base": "IPC"},
"aggregation": "avg",
"timestep": 60,
"peak": 2,
"normal": 1,
"caution": 0.1,
"alert": 0.5,
"subClusters": [
{
"name": "haswell",
@ -156,8 +201,13 @@ func setup(t *testing.T) *api.RestApi {
{
"name": "flops_any",
"scope": "core",
"unit": "F/s",
"unit": { "base": "F/s"},
"aggregation": "sum",
"timestep": 60,
"peak": 40000000000,
"normal": 20000000000,
"caution": 30000000000,
"alert": 35000000000,
"subClusters": [
{
"name": "haswell",
@ -171,8 +221,13 @@ func setup(t *testing.T) *api.RestApi {
{
"name": "mem_bw",
"scope": "socket",
"unit": "B/s",
"unit": { "base": "B/s"},
"aggregation": "sum",
"timestep": 60,
"peak": 58800000000,
"normal": 28800000000,
"caution": 38800000000,
"alert": 48800000000,
"subClusters": [
{
"name": "haswell",
@ -186,8 +241,13 @@ func setup(t *testing.T) *api.RestApi {
{
"name": "file_bw",
"scope": "node",
"unit": "B/s",
"unit": { "base": "B/s"},
"aggregation": "sum",
"timestep": 30,
"peak": 20000000000,
"normal": 5000000000,
"caution": 9000000000,
"alert": 19000000000,
"subClusters": [
{
"name": "haswell",
@ -201,8 +261,13 @@ func setup(t *testing.T) *api.RestApi {
{
"name": "net_bw",
"scope": "node",
"unit": "B/s",
"unit": { "base": "B/s"},
"timestep": 30,
"aggregation": "sum",
"peak": 7000000000,
"normal": 5000000000,
"caution": 6000000000,
"alert": 6500000000,
"subClusters": [
{
"name": "haswell",
@ -216,8 +281,13 @@ func setup(t *testing.T) *api.RestApi {
{
"name": "mem_used",
"scope": "node",
"unit": "B",
"unit": {"base": "B"},
"aggregation": "sum",
"timestep": 30,
"peak": 32000000000,
"normal": 2000000000,
"caution": 31000000000,
"alert": 30000000000,
"subClusters": [
{
"name": "haswell",
@ -231,8 +301,13 @@ func setup(t *testing.T) *api.RestApi {
{
"name": "cpu_power",
"scope": "socket",
"unit": "W",
"unit": {"base": "W"},
"aggregation": "sum",
"timestep": 60,
"peak": 100,
"normal": 80,
"caution": 90,
"alert": 90,
"subClusters": [
{
"name": "haswell",
@ -253,6 +328,10 @@ func setup(t *testing.T) *api.RestApi {
t.Fatal(err)
}
if err := os.WriteFile(filepath.Join(jobarchive, "version.txt"), []byte(fmt.Sprintf("%d", 1)), 0666); err != nil {
t.Fatal(err)
}
if err := os.Mkdir(filepath.Join(jobarchive, "testcluster"), 0777); err != nil {
t.Fatal(err)
}
@ -315,13 +394,12 @@ func TestRestApi(t *testing.T) {
testData := schema.JobData{
"load_one": map[schema.MetricScope]*schema.JobMetric{
schema.MetricScopeNode: {
Unit: "load",
Scope: schema.MetricScopeNode,
Unit: schema.Unit{Base: "load"},
Timestep: 60,
Series: []schema.Series{
{
Hostname: "host123",
Statistics: &schema.MetricStatistics{Min: 0.1, Avg: 0.2, Max: 0.3},
Statistics: schema.MetricStatistics{Min: 0.1, Avg: 0.2, Max: 0.3},
Data: []schema.Float{0.1, 0.1, 0.1, 0.2, 0.2, 0.2, 0.3, 0.3, 0.3},
},
},
@ -392,15 +470,15 @@ func TestRestApi(t *testing.T) {
job.Project != "testproj" ||
job.Cluster != "testcluster" ||
job.SubCluster != "sc1" ||
job.Partition != "default" ||
job.Walltime != 3600 ||
job.ArrayJobId != 0 ||
*job.Partition != "default" ||
*job.Walltime != 3600 ||
*job.ArrayJobId != 0 ||
job.NumNodes != 1 ||
job.NumHWThreads != 8 ||
job.NumAcc != 0 ||
*job.NumHWThreads != 8 ||
*job.NumAcc != 0 ||
job.Exclusive != 1 ||
job.MonitoringStatus != 1 ||
job.SMT != 1 ||
*job.SMT != 1 ||
!reflect.DeepEqual(job.Resources, []*schema.Resource{{Hostname: "host123", HWThreads: []int{0, 1, 2, 3, 4, 5, 6, 7}}}) ||
job.StartTime.Unix() != 123456789 {
t.Fatalf("unexpected job properties: %#v", job)
@ -488,13 +566,13 @@ func TestRestApi(t *testing.T) {
}
})
t.Run("FailedJob", func(t *testing.T) {
subtestLetJobFail(t, restapi, r)
})
// t.Run("FailedJob", func(t *testing.T) {
// subtestLetJobFail(t, restapi, r)
// })
t.Run("ImportJob", func(t *testing.T) {
testImportFlag(t)
})
// t.Run("ImportJob", func(t *testing.T) {
// testImportFlag(t)
// })
}
func subtestLetJobFail(t *testing.T, restapi *api.RestApi, r *mux.Router) {
@ -505,19 +583,15 @@ func subtestLetJobFail(t *testing.T, restapi *api.RestApi, r *mux.Router) {
"cluster": "testcluster",
"partition": "default",
"walltime": 3600,
"arrayJobId": 0,
"numNodes": 1,
"numAcc": 0,
"exclusive": 1,
"monitoringStatus": 1,
"smt": 1,
"tags": [],
"resources": [
{
"hostname": "host123"
}
],
"metaData": {},
"startTime": 12345678
}`
@ -596,4 +670,17 @@ func testImportFlag(t *testing.T) {
if len(data) != 8 {
t.Errorf("Job data length: Got %d, want 8", len(data))
}
r := map[string]string{"mem_used": "GB", "net_bw": "KB/s",
"cpu_power": "W", "cpu_used": "",
"file_bw": "KB/s", "flops_any": "F/s",
"mem_bw": "GB/s", "ipc": "IPC"}
for name, scopes := range data {
for _, metric := range scopes {
if metric.Unit.Base != r[name] {
t.Errorf("Metric %s unit: Got %s, want %s", name, metric.Unit.Base, r[name])
}
}
}
}

View File

@ -5,10 +5,8 @@
"cluster": "taurus",
"subCluster": "haswell",
"partition": "haswell64",
"arrayJobId": 0,
"numNodes": 2,
"numHwthreads": 4,
"numAcc": 0,
"exclusive": 0,
"startTime": 1635856524,
"jobState": "completed",
@ -18,11 +16,17 @@
"resources": [
{
"hostname": "taurusi6489",
"hwthreads": [ 0, 1 ]
"hwthreads": [
0,
1
]
},
{
"hostname": "taurusi6490",
"hwthreads": [ 10, 11 ]
"hwthreads": [
10,
11
]
}
],
"statistics": {
@ -30,49 +34,65 @@
"min": 0.03694102397926118,
"avg": 0.48812580468611544,
"max": 1.0000000000000002,
"unit": "cpu used"
"unit": {
"base": ""
}
},
"ipc": {
"min": 0.30469640475234366,
"avg": 1.154312070173657,
"max": 1.797623522191001,
"unit": "IPC"
"unit": {
"base": "IPC"
}
},
"flops_any": {
"min": 0.0,
"avg": 686.5190320308598,
"max": 4346.591400350933,
"unit": "F/s"
"unit": {
"base": "F/s"
}
},
"mem_bw": {
"min": 653671812.1661415,
"avg": 1605031604.9852366,
"max": 2614718291.9554267,
"unit": "B/s"
"unit": {
"base": "B/s"
}
},
"file_bw": {
"min": 0.0,
"avg": 620592.5419124186,
"max": 11559156.360352296,
"unit": "B/s"
"unit": {
"base": "B/s"
}
},
"net_bw": {
"min": 126779.89655880642,
"avg": 763101.082138246,
"max": 1916309.7075416835,
"unit": "B/s"
"unit": {
"base": "B/s"
}
},
"mem_used": {
"min": 2779066368.0,
"avg": 9647598685.09091,
"max": 10202595328.0,
"unit": "B"
"unit": {
"base": "B"
}
},
"cpu_power": {
"min": 35.50647456742635,
"avg": 78.63442946337237,
"max": 85.83909286117324,
"unit": "W"
"unit": {
"base": "W"
}
}
}
}

View File

@ -0,0 +1,36 @@
// Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
package main
import (
"encoding/json"
"flag"
"fmt"
"log"
"github.com/ClusterCockpit/cc-backend/internal/config"
"github.com/ClusterCockpit/cc-backend/pkg/archive"
)
func main() {
var srcPath, flagConfigFile string
flag.StringVar(&srcPath, "s", "./var/job-archive", "Specify the source job archive path. Default is ./var/job-archive")
flag.StringVar(&flagConfigFile, "config", "./config.json", "Specify alternative path to `config.json`")
flag.Parse()
archiveCfg := fmt.Sprintf("{\"kind\": \"file\",\"path\": \"%s\"}", srcPath)
config.Init(flagConfigFile)
config.Keys.Validate = true
if err := archive.Init(json.RawMessage(archiveCfg), false); err != nil {
log.Fatal(err)
}
ar := archive.GetHandle()
for job := range ar.Iter(true) {
log.Printf("Validate %s - %d\n", job.Meta.Cluster, job.Meta.JobID)
}
}

View File

@ -0,0 +1,65 @@
// Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
package main
import (
"github.com/ClusterCockpit/cc-backend/pkg/schema"
)
// type Accelerator struct {
// ID string `json:"id"`
// Type string `json:"type"`
// Model string `json:"model"`
// }
// type Topology struct {
// Node []int `json:"node"`
// Socket [][]int `json:"socket"`
// MemoryDomain [][]int `json:"memoryDomain"`
// Die [][]int `json:"die"`
// Core [][]int `json:"core"`
// Accelerators []*Accelerator `json:"accelerators"`
// }
type SubCluster struct {
Name string `json:"name"`
Nodes string `json:"nodes"`
NumberOfNodes int `json:"numberOfNodes"`
ProcessorType string `json:"processorType"`
SocketsPerNode int `json:"socketsPerNode"`
CoresPerSocket int `json:"coresPerSocket"`
ThreadsPerCore int `json:"threadsPerCore"`
FlopRateScalar int `json:"flopRateScalar"`
FlopRateSimd int `json:"flopRateSimd"`
MemoryBandwidth int `json:"memoryBandwidth"`
Topology *schema.Topology `json:"topology"`
}
// type SubClusterConfig struct {
// Name string `json:"name"`
// Peak float64 `json:"peak"`
// Normal float64 `json:"normal"`
// Caution float64 `json:"caution"`
// Alert float64 `json:"alert"`
// }
type MetricConfig struct {
Name string `json:"name"`
Unit string `json:"unit"`
Scope schema.MetricScope `json:"scope"`
Aggregation string `json:"aggregation"`
Timestep int `json:"timestep"`
Peak float64 `json:"peak"`
Normal float64 `json:"normal"`
Caution float64 `json:"caution"`
Alert float64 `json:"alert"`
SubClusters []*schema.SubClusterConfig `json:"subClusters"`
}
type Cluster struct {
Name string `json:"name"`
MetricConfig []*MetricConfig `json:"metricConfig"`
SubClusters []*SubCluster `json:"subClusters"`
}

View File

@ -0,0 +1,166 @@
// Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
package main
import (
"errors"
"fmt"
"github.com/ClusterCockpit/cc-backend/pkg/archive"
"github.com/ClusterCockpit/cc-backend/pkg/schema"
)
var Clusters []*Cluster
var nodeLists map[string]map[string]archive.NodeList
func initClusterConfig() error {
Clusters = []*Cluster{}
nodeLists = map[string]map[string]archive.NodeList{}
for _, c := range ar.GetClusters() {
cluster, err := ar.LoadClusterCfg(c)
if err != nil {
return err
}
if len(cluster.Name) == 0 ||
len(cluster.MetricConfig) == 0 ||
len(cluster.SubClusters) == 0 {
return errors.New("cluster.name, cluster.metricConfig and cluster.SubClusters should not be empty")
}
for _, mc := range cluster.MetricConfig {
if len(mc.Name) == 0 {
return errors.New("cluster.metricConfig.name should not be empty")
}
if mc.Timestep < 1 {
return errors.New("cluster.metricConfig.timestep should not be smaller than one")
}
// For backwards compability...
if mc.Scope == "" {
mc.Scope = schema.MetricScopeNode
}
if !mc.Scope.Valid() {
return errors.New("cluster.metricConfig.scope must be a valid scope ('node', 'scocket', ...)")
}
}
Clusters = append(Clusters, cluster)
nodeLists[cluster.Name] = make(map[string]archive.NodeList)
for _, sc := range cluster.SubClusters {
if sc.Nodes == "" {
continue
}
nl, err := archive.ParseNodeList(sc.Nodes)
if err != nil {
return fmt.Errorf("in %s/cluster.json: %w", cluster.Name, err)
}
nodeLists[cluster.Name][sc.Name] = nl
}
}
return nil
}
func GetCluster(cluster string) *Cluster {
for _, c := range Clusters {
if c.Name == cluster {
return c
}
}
return nil
}
func GetSubCluster(cluster, subcluster string) *SubCluster {
for _, c := range Clusters {
if c.Name == cluster {
for _, p := range c.SubClusters {
if p.Name == subcluster {
return p
}
}
}
}
return nil
}
func GetMetricConfig(cluster, metric string) *MetricConfig {
for _, c := range Clusters {
if c.Name == cluster {
for _, m := range c.MetricConfig {
if m.Name == metric {
return m
}
}
}
}
return nil
}
// AssignSubCluster sets the `job.subcluster` property of the job based
// on its cluster and resources.
func AssignSubCluster(job *BaseJob) error {
cluster := GetCluster(job.Cluster)
if cluster == nil {
return fmt.Errorf("unkown cluster: %#v", job.Cluster)
}
if job.SubCluster != "" {
for _, sc := range cluster.SubClusters {
if sc.Name == job.SubCluster {
return nil
}
}
return fmt.Errorf("already assigned subcluster %#v unkown (cluster: %#v)", job.SubCluster, job.Cluster)
}
if len(job.Resources) == 0 {
return fmt.Errorf("job without any resources/hosts")
}
host0 := job.Resources[0].Hostname
for sc, nl := range nodeLists[job.Cluster] {
if nl != nil && nl.Contains(host0) {
job.SubCluster = sc
return nil
}
}
if cluster.SubClusters[0].Nodes == "" {
job.SubCluster = cluster.SubClusters[0].Name
return nil
}
return fmt.Errorf("no subcluster found for cluster %#v and host %#v", job.Cluster, host0)
}
func GetSubClusterByNode(cluster, hostname string) (string, error) {
for sc, nl := range nodeLists[cluster] {
if nl != nil && nl.Contains(hostname) {
return sc, nil
}
}
c := GetCluster(cluster)
if c == nil {
return "", fmt.Errorf("unkown cluster: %#v", cluster)
}
if c.SubClusters[0].Nodes == "" {
return c.SubClusters[0].Name, nil
}
return "", fmt.Errorf("no subcluster found for cluster %#v and host %#v", cluster, hostname)
}

View File

@ -0,0 +1,109 @@
// Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
package main
import (
"errors"
"io"
"math"
"strconv"
)
// A custom float type is used so that (Un)MarshalJSON and
// (Un)MarshalGQL can be overloaded and NaN/null can be used.
// The default behaviour of putting every nullable value behind
// a pointer has a bigger overhead.
type Float float64
var NaN Float = Float(math.NaN())
var nullAsBytes []byte = []byte("null")
func (f Float) IsNaN() bool {
return math.IsNaN(float64(f))
}
// NaN will be serialized to `null`.
func (f Float) MarshalJSON() ([]byte, error) {
if f.IsNaN() {
return nullAsBytes, nil
}
return strconv.AppendFloat(make([]byte, 0, 10), float64(f), 'f', 2, 64), nil
}
// `null` will be unserialized to NaN.
func (f *Float) UnmarshalJSON(input []byte) error {
s := string(input)
if s == "null" {
*f = NaN
return nil
}
val, err := strconv.ParseFloat(s, 64)
if err != nil {
return err
}
*f = Float(val)
return nil
}
// UnmarshalGQL implements the graphql.Unmarshaler interface.
func (f *Float) UnmarshalGQL(v interface{}) error {
f64, ok := v.(float64)
if !ok {
return errors.New("invalid Float scalar")
}
*f = Float(f64)
return nil
}
// MarshalGQL implements the graphql.Marshaler interface.
// NaN will be serialized to `null`.
func (f Float) MarshalGQL(w io.Writer) {
if f.IsNaN() {
w.Write(nullAsBytes)
} else {
w.Write(strconv.AppendFloat(make([]byte, 0, 10), float64(f), 'f', 2, 64))
}
}
// Only used via REST-API, not via GraphQL.
// This uses a lot less allocations per series,
// but it turns out that the performance increase
// from using this is not that big.
func (s *Series) MarshalJSON() ([]byte, error) {
buf := make([]byte, 0, 512+len(s.Data)*8)
buf = append(buf, `{"hostname":"`...)
buf = append(buf, s.Hostname...)
buf = append(buf, '"')
if s.Id != nil {
buf = append(buf, `,"id":`...)
buf = strconv.AppendInt(buf, int64(*s.Id), 10)
}
if s.Statistics != nil {
buf = append(buf, `,"statistics":{"min":`...)
buf = strconv.AppendFloat(buf, s.Statistics.Min, 'f', 2, 64)
buf = append(buf, `,"avg":`...)
buf = strconv.AppendFloat(buf, s.Statistics.Avg, 'f', 2, 64)
buf = append(buf, `,"max":`...)
buf = strconv.AppendFloat(buf, s.Statistics.Max, 'f', 2, 64)
buf = append(buf, '}')
}
buf = append(buf, `,"data":[`...)
for i := 0; i < len(s.Data); i++ {
if i != 0 {
buf = append(buf, ',')
}
if s.Data[i].IsNaN() {
buf = append(buf, `null`...)
} else {
buf = strconv.AppendFloat(buf, float64(s.Data[i]), 'f', 2, 32)
}
}
buf = append(buf, ']', '}')
return buf, nil
}

View File

@ -0,0 +1,142 @@
// Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
package main
import (
"bufio"
"bytes"
"encoding/json"
"fmt"
"os"
"path/filepath"
"strconv"
"github.com/ClusterCockpit/cc-backend/pkg/log"
)
type FsArchiveConfig struct {
Path string `json:"path"`
}
type FsArchive struct {
path string
clusters []string
}
func getPath(
job *JobMeta,
rootPath string,
file string) string {
lvl1, lvl2 := fmt.Sprintf("%d", job.JobID/1000), fmt.Sprintf("%03d", job.JobID%1000)
return filepath.Join(
rootPath,
job.Cluster,
lvl1, lvl2,
strconv.FormatInt(job.StartTime, 10), file)
}
func loadJobMeta(filename string) (*JobMeta, error) {
f, err := os.Open(filename)
if err != nil {
log.Errorf("fsBackend loadJobMeta()- %v", err)
return &JobMeta{}, err
}
defer f.Close()
return DecodeJobMeta(bufio.NewReader(f))
}
func (fsa *FsArchive) Init(rawConfig json.RawMessage) error {
var config FsArchiveConfig
if err := json.Unmarshal(rawConfig, &config); err != nil {
log.Errorf("fsBackend Init()- %v", err)
return err
}
if config.Path == "" {
err := fmt.Errorf("fsBackend Init()- empty path")
log.Errorf("fsBackend Init()- %v", err)
return err
}
fsa.path = config.Path
entries, err := os.ReadDir(fsa.path)
if err != nil {
log.Errorf("fsBackend Init()- %v", err)
return err
}
for _, de := range entries {
fsa.clusters = append(fsa.clusters, de.Name())
}
return nil
}
func (fsa *FsArchive) Iter() <-chan *JobMeta {
ch := make(chan *JobMeta)
go func() {
clustersDir, err := os.ReadDir(fsa.path)
if err != nil {
log.Fatalf("Reading clusters failed: %s", err.Error())
}
for _, clusterDir := range clustersDir {
lvl1Dirs, err := os.ReadDir(filepath.Join(fsa.path, clusterDir.Name()))
if err != nil {
log.Fatalf("Reading jobs failed: %s", err.Error())
}
for _, lvl1Dir := range lvl1Dirs {
if !lvl1Dir.IsDir() {
// Could be the cluster.json file
continue
}
lvl2Dirs, err := os.ReadDir(filepath.Join(fsa.path, clusterDir.Name(), lvl1Dir.Name()))
if err != nil {
log.Fatalf("Reading jobs failed: %s", err.Error())
}
for _, lvl2Dir := range lvl2Dirs {
dirpath := filepath.Join(fsa.path, clusterDir.Name(), lvl1Dir.Name(), lvl2Dir.Name())
startTimeDirs, err := os.ReadDir(dirpath)
if err != nil {
log.Fatalf("Reading jobs failed: %s", err.Error())
}
for _, startTimeDir := range startTimeDirs {
if startTimeDir.IsDir() {
job, err := loadJobMeta(filepath.Join(dirpath, startTimeDir.Name(), "meta.json"))
if err != nil {
log.Errorf("in %s: %s", filepath.Join(dirpath, startTimeDir.Name()), err.Error())
} else {
ch <- job
}
}
}
}
}
}
close(ch)
}()
return ch
}
func (fsa *FsArchive) LoadClusterCfg(name string) (*Cluster, error) {
b, err := os.ReadFile(filepath.Join(fsa.path, name, "cluster.json"))
if err != nil {
log.Errorf("fsBackend LoadClusterCfg()- %v", err)
return &Cluster{}, err
}
return DecodeCluster(bytes.NewReader(b))
}
func (fsa *FsArchive) GetClusters() []string {
return fsa.clusters
}

View File

@ -0,0 +1,162 @@
// Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
package main
import (
"errors"
"fmt"
"io"
"time"
"github.com/ClusterCockpit/cc-backend/pkg/schema"
)
// Non-Swaggered Comment: BaseJob
// Non-Swaggered Comment: Common subset of Job and JobMeta. Use one of those, not this type directly.
type BaseJob struct {
// The unique identifier of a job
JobID int64 `json:"jobId" db:"job_id" example:"123000"`
User string `json:"user" db:"user" example:"abcd100h"` // The unique identifier of a user
Project string `json:"project" db:"project" example:"abcd200"` // The unique identifier of a project
Cluster string `json:"cluster" db:"cluster" example:"fritz"` // The unique identifier of a cluster
SubCluster string `json:"subCluster" db:"subcluster" example:"main"` // The unique identifier of a sub cluster
Partition *string `json:"partition" db:"partition" example:"main"` // The Slurm partition to which the job was submitted
ArrayJobId *int64 `json:"arrayJobId" db:"array_job_id" example:"123000"` // The unique identifier of an array job
NumNodes int32 `json:"numNodes" db:"num_nodes" example:"2" minimum:"1"` // Number of nodes used (Min > 0)
NumHWThreads *int32 `json:"numHwthreads" db:"num_hwthreads" example:"20" minimum:"1"` // Number of HWThreads used (Min > 0)
NumAcc *int32 `json:"numAcc" db:"num_acc" example:"2" minimum:"1"` // Number of accelerators used (Min > 0)
Exclusive int32 `json:"exclusive" db:"exclusive" example:"1" minimum:"0" maximum:"2"` // Specifies how nodes are shared: 0 - Shared among multiple jobs of multiple users, 1 - Job exclusive (Default), 2 - Shared among multiple jobs of same user
MonitoringStatus int32 `json:"monitoringStatus" db:"monitoring_status" example:"1" minimum:"0" maximum:"3"` // State of monitoring system during job run: 0 - Disabled, 1 - Running or Archiving (Default), 2 - Archiving Failed, 3 - Archiving Successfull
SMT *int32 `json:"smt" db:"smt" example:"4"` // SMT threads used by job
State JobState `json:"jobState" db:"job_state" example:"completed" enums:"completed,failed,cancelled,stopped,timeout,out_of_memory"` // Final state of job
Duration int32 `json:"duration" db:"duration" example:"43200" minimum:"1"` // Duration of job in seconds (Min > 0)
Walltime *int64 `json:"walltime" db:"walltime" example:"86400" minimum:"1"` // Requested walltime of job in seconds (Min > 0)
Tags []*schema.Tag `json:"tags"` // List of tags
RawResources []byte `json:"-" db:"resources"` // Resources used by job [As Bytes]
Resources []*Resource `json:"resources"` // Resources used by job
RawMetaData []byte `json:"-" db:"meta_data"` // Additional information about the job [As Bytes]
MetaData map[string]string `json:"metaData"` // Additional information about the job
}
// Non-Swaggered Comment: Job
// Non-Swaggered Comment: This type is used as the GraphQL interface and using sqlx as a table row.
// Job model
// @Description Information of a HPC job.
type Job struct {
// The unique identifier of a job in the database
ID int64 `json:"id" db:"id"`
BaseJob
StartTimeUnix int64 `json:"-" db:"start_time" example:"1649723812"` // Start epoch time stamp in seconds
StartTime time.Time `json:"startTime"` // Start time as 'time.Time' data type
MemUsedMax float64 `json:"-" db:"mem_used_max"` // MemUsedMax as Float64
FlopsAnyAvg float64 `json:"-" db:"flops_any_avg"` // FlopsAnyAvg as Float64
MemBwAvg float64 `json:"-" db:"mem_bw_avg"` // MemBwAvg as Float64
LoadAvg float64 `json:"-" db:"load_avg"` // LoadAvg as Float64
NetBwAvg float64 `json:"-" db:"net_bw_avg"` // NetBwAvg as Float64
NetDataVolTotal float64 `json:"-" db:"net_data_vol_total"` // NetDataVolTotal as Float64
FileBwAvg float64 `json:"-" db:"file_bw_avg"` // FileBwAvg as Float64
FileDataVolTotal float64 `json:"-" db:"file_data_vol_total"` // FileDataVolTotal as Float64
}
// Non-Swaggered Comment: JobMeta
// Non-Swaggered Comment: When reading from the database or sending data via GraphQL, the start time can be in the much more
// Non-Swaggered Comment: convenient time.Time type. In the `meta.json` files, the start time is encoded as a unix epoch timestamp.
// Non-Swaggered Comment: This is why there is this struct, which contains all fields from the regular job struct, but "overwrites"
// Non-Swaggered Comment: the StartTime field with one of type int64.
// Non-Swaggered Comment: ID *int64 `json:"id,omitempty"` >> never used in the job-archive, only available via REST-API
// JobMeta model
// @Description Meta data information of a HPC job.
type JobMeta struct {
// The unique identifier of a job in the database
ID *int64 `json:"id,omitempty"`
BaseJob
StartTime int64 `json:"startTime" db:"start_time" example:"1649723812" minimum:"1"` // Start epoch time stamp in seconds (Min > 0)
Statistics map[string]JobStatistics `json:"statistics,omitempty"` // Metric statistics of job
}
const (
MonitoringStatusDisabled int32 = 0
MonitoringStatusRunningOrArchiving int32 = 1
MonitoringStatusArchivingFailed int32 = 2
MonitoringStatusArchivingSuccessful int32 = 3
)
var JobDefaults BaseJob = BaseJob{
Exclusive: 1,
MonitoringStatus: MonitoringStatusRunningOrArchiving,
}
// JobStatistics model
// @Description Specification for job metric statistics.
type JobStatistics struct {
// Metric unit (see schema/unit.schema.json)
Unit string `json:"unit" example:"GHz"`
Avg float64 `json:"avg" example:"2500" minimum:"0"` // Job metric average
Min float64 `json:"min" example:"2000" minimum:"0"` // Job metric minimum
Max float64 `json:"max" example:"3000" minimum:"0"` // Job metric maximum
}
// Tag model
// @Description Defines a tag using name and type.
type Tag struct {
// The unique DB identifier of a tag
ID int64 `json:"id" db:"id"`
Type string `json:"type" db:"tag_type" example:"Debug"` // Tag Type
Name string `json:"name" db:"tag_name" example:"Testjob"` // Tag Name
}
// Resource model
// @Description A resource used by a job
type Resource struct {
Hostname string `json:"hostname"` // Name of the host (= node)
HWThreads []int `json:"hwthreads,omitempty"` // List of OS processor ids
Accelerators []string `json:"accelerators,omitempty"` // List of of accelerator device ids
Configuration string `json:"configuration,omitempty"` // The configuration options of the node
}
type JobState string
const (
JobStateRunning JobState = "running"
JobStateCompleted JobState = "completed"
JobStateFailed JobState = "failed"
JobStateCancelled JobState = "cancelled"
JobStateStopped JobState = "stopped"
JobStateTimeout JobState = "timeout"
JobStatePreempted JobState = "preempted"
JobStateOutOfMemory JobState = "out_of_memory"
)
func (e *JobState) UnmarshalGQL(v interface{}) error {
str, ok := v.(string)
if !ok {
return fmt.Errorf("enums must be strings")
}
*e = JobState(str)
if !e.Valid() {
return errors.New("invalid job state")
}
return nil
}
func (e JobState) MarshalGQL(w io.Writer) {
fmt.Fprintf(w, "\"%s\"", e)
}
func (e JobState) Valid() bool {
return e == JobStateRunning ||
e == JobStateCompleted ||
e == JobStateFailed ||
e == JobStateCancelled ||
e == JobStateStopped ||
e == JobStateTimeout ||
e == JobStatePreempted ||
e == JobStateOutOfMemory
}

View File

@ -0,0 +1,66 @@
// Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
package main
import (
"encoding/json"
"io"
"github.com/ClusterCockpit/cc-backend/pkg/schema"
)
func DecodeJobData(r io.Reader) (*JobData, error) {
var d JobData
if err := json.NewDecoder(r).Decode(&d); err != nil {
return nil, err
}
return &d, nil
}
func DecodeJobMeta(r io.Reader) (*JobMeta, error) {
var d JobMeta
if err := json.NewDecoder(r).Decode(&d); err != nil {
return nil, err
}
return &d, nil
}
func DecodeCluster(r io.Reader) (*Cluster, error) {
var c Cluster
if err := json.NewDecoder(r).Decode(&c); err != nil {
return nil, err
}
return &c, nil
}
func EncodeJobData(w io.Writer, d *schema.JobData) error {
// Sanitize parameters
if err := json.NewEncoder(w).Encode(d); err != nil {
return err
}
return nil
}
func EncodeJobMeta(w io.Writer, d *schema.JobMeta) error {
// Sanitize parameters
if err := json.NewEncoder(w).Encode(d); err != nil {
return err
}
return nil
}
func EncodeCluster(w io.Writer, c *schema.Cluster) error {
// Sanitize parameters
if err := json.NewEncoder(w).Encode(c); err != nil {
return err
}
return nil
}

View File

@ -0,0 +1,318 @@
// Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
package main
import (
"bufio"
"encoding/json"
"errors"
"flag"
"fmt"
"log"
"os"
"path/filepath"
"sync"
"github.com/ClusterCockpit/cc-backend/pkg/schema"
"github.com/ClusterCockpit/cc-backend/pkg/units"
)
const Version = 1
var ar FsArchive
func loadJobData(filename string) (*JobData, error) {
f, err := os.Open(filename)
if err != nil {
return &JobData{}, fmt.Errorf("fsBackend loadJobData()- %v", err)
}
defer f.Close()
return DecodeJobData(bufio.NewReader(f))
}
func deepCopyJobMeta(j *JobMeta) schema.JobMeta {
var jn schema.JobMeta
//required properties
jn.JobID = j.JobID
jn.User = j.User
jn.Project = j.Project
jn.Cluster = j.Cluster
jn.SubCluster = j.SubCluster
jn.NumNodes = j.NumNodes
jn.Exclusive = j.Exclusive
jn.StartTime = j.StartTime
jn.State = schema.JobState(j.State)
jn.Duration = j.Duration
for _, ro := range j.Resources {
var rn schema.Resource
rn.Hostname = ro.Hostname
rn.Configuration = ro.Configuration
hwt := make([]int, len(ro.HWThreads))
if ro.HWThreads != nil {
copy(hwt, ro.HWThreads)
}
rn.HWThreads = hwt
acc := make([]string, len(ro.Accelerators))
if ro.Accelerators != nil {
copy(acc, ro.Accelerators)
}
rn.Accelerators = acc
jn.Resources = append(jn.Resources, &rn)
}
jn.MetaData = make(map[string]string)
for k, v := range j.MetaData {
jn.MetaData[k] = v
}
jn.Statistics = make(map[string]schema.JobStatistics)
for k, v := range j.Statistics {
var sn schema.JobStatistics
sn.Avg = v.Avg
sn.Max = v.Max
sn.Min = v.Min
tmpUnit := units.ConvertUnitString(v.Unit)
if tmpUnit.Base == "inval" {
sn.Unit = schema.Unit{Base: ""}
} else {
sn.Unit = tmpUnit
}
jn.Statistics[k] = sn
}
//optional properties
jn.Partition = j.Partition
jn.ArrayJobId = j.ArrayJobId
jn.NumHWThreads = j.NumHWThreads
jn.NumAcc = j.NumAcc
jn.MonitoringStatus = j.MonitoringStatus
jn.SMT = j.SMT
jn.Walltime = j.Walltime
for _, t := range j.Tags {
jn.Tags = append(jn.Tags, t)
}
return jn
}
func deepCopyJobData(d *JobData, cluster string, subCluster string) *schema.JobData {
var dn = make(schema.JobData)
for k, v := range *d {
// fmt.Printf("Metric %s\n", k)
dn[k] = make(map[schema.MetricScope]*schema.JobMetric)
for mk, mv := range v {
// fmt.Printf("Scope %s\n", mk)
var mn schema.JobMetric
tmpUnit := units.ConvertUnitString(mv.Unit)
if tmpUnit.Base == "inval" {
mn.Unit = schema.Unit{Base: ""}
} else {
mn.Unit = tmpUnit
}
mn.Timestep = mv.Timestep
for _, v := range mv.Series {
var sn schema.Series
sn.Hostname = v.Hostname
if v.Id != nil {
var id = new(string)
if mk == schema.MetricScopeAccelerator {
s := GetSubCluster(cluster, subCluster)
var err error
*id, err = s.Topology.GetAcceleratorID(*v.Id)
if err != nil {
log.Fatal(err)
}
} else {
*id = fmt.Sprint(*v.Id)
}
sn.Id = id
}
if v.Statistics != nil {
sn.Statistics = schema.MetricStatistics{
Avg: v.Statistics.Avg,
Min: v.Statistics.Min,
Max: v.Statistics.Max}
}
sn.Data = make([]schema.Float, len(v.Data))
copy(sn.Data, v.Data)
mn.Series = append(mn.Series, sn)
}
dn[k][mk] = &mn
}
// fmt.Printf("FINISH %s\n", k)
}
return &dn
}
func deepCopyClusterConfig(co *Cluster) schema.Cluster {
var cn schema.Cluster
cn.Name = co.Name
for _, sco := range co.SubClusters {
var scn schema.SubCluster
scn.Name = sco.Name
scn.Nodes = sco.Nodes
scn.ProcessorType = sco.ProcessorType
scn.SocketsPerNode = sco.SocketsPerNode
scn.CoresPerSocket = sco.CoresPerSocket
scn.ThreadsPerCore = sco.ThreadsPerCore
var prefix = new(string)
*prefix = "G"
scn.FlopRateScalar = schema.MetricValue{
Unit: schema.Unit{Base: "F/s", Prefix: prefix},
Value: float64(sco.FlopRateScalar)}
scn.FlopRateSimd = schema.MetricValue{
Unit: schema.Unit{Base: "F/s", Prefix: prefix},
Value: float64(sco.FlopRateSimd)}
scn.MemoryBandwidth = schema.MetricValue{
Unit: schema.Unit{Base: "B/s", Prefix: prefix},
Value: float64(sco.MemoryBandwidth)}
scn.Topology = *sco.Topology
cn.SubClusters = append(cn.SubClusters, &scn)
}
for _, mco := range co.MetricConfig {
var mcn schema.MetricConfig
mcn.Name = mco.Name
mcn.Scope = mco.Scope
if mco.Aggregation == "" {
fmt.Println("Property aggregation missing! Please review file!")
mcn.Aggregation = "sum"
} else {
mcn.Aggregation = mco.Aggregation
}
mcn.Timestep = mco.Timestep
tmpUnit := units.ConvertUnitString(mco.Unit)
if tmpUnit.Base == "inval" {
mcn.Unit = schema.Unit{Base: ""}
} else {
mcn.Unit = tmpUnit
}
mcn.Peak = mco.Peak
mcn.Normal = mco.Normal
mcn.Caution = mco.Caution
mcn.Alert = mco.Alert
mcn.SubClusters = mco.SubClusters
cn.MetricConfig = append(cn.MetricConfig, &mcn)
}
return cn
}
func main() {
var srcPath string
var dstPath string
flag.StringVar(&srcPath, "s", "./var/job-archive", "Specify the source job archive path. Default is ./var/job-archive")
flag.StringVar(&dstPath, "d", "./var/job-archive-new", "Specify the destination job archive path. Default is ./var/job-archive-new")
flag.Parse()
if _, err := os.Stat(filepath.Join(srcPath, "version.txt")); !errors.Is(err, os.ErrNotExist) {
log.Fatal("Archive version exists!")
}
srcConfig := fmt.Sprintf("{\"path\": \"%s\"}", srcPath)
err := ar.Init(json.RawMessage(srcConfig))
if err != nil {
log.Fatal(err)
}
err = initClusterConfig()
if err != nil {
log.Fatal(err)
}
// setup new job archive
err = os.Mkdir(dstPath, 0750)
if err != nil {
log.Fatal(err)
}
for _, c := range Clusters {
path := fmt.Sprintf("%s/%s", dstPath, c.Name)
fmt.Println(path)
err = os.Mkdir(path, 0750)
if err != nil {
log.Fatal(err)
}
cn := deepCopyClusterConfig(c)
f, err := os.Create(fmt.Sprintf("%s/%s/cluster.json", dstPath, c.Name))
if err != nil {
log.Fatal(err)
}
if err := EncodeCluster(f, &cn); err != nil {
log.Fatal(err)
}
if err := f.Close(); err != nil {
log.Fatal(err)
}
}
var wg sync.WaitGroup
for job := range ar.Iter() {
// fmt.Printf("Job %d\n", job.JobID)
job := job
wg.Add(1)
go func() {
defer wg.Done()
path := getPath(job, dstPath, "meta.json")
err = os.MkdirAll(filepath.Dir(path), 0750)
if err != nil {
log.Fatal(err)
}
f, err := os.Create(path)
if err != nil {
log.Fatal(err)
}
jmn := deepCopyJobMeta(job)
if err = EncodeJobMeta(f, &jmn); err != nil {
log.Fatal(err)
}
if err = f.Close(); err != nil {
log.Fatal(err)
}
f, err = os.Create(getPath(job, dstPath, "data.json"))
if err != nil {
log.Fatal(err)
}
var jd *JobData
jd, err = loadJobData(getPath(job, srcPath, "data.json"))
if err != nil {
log.Fatal(err)
}
jdn := deepCopyJobData(jd, job.Cluster, job.SubCluster)
if err := EncodeJobData(f, jdn); err != nil {
log.Fatal(err)
}
if err := f.Close(); err != nil {
log.Fatal(err)
}
}()
}
wg.Wait()
os.WriteFile(filepath.Join(dstPath, "version.txt"), []byte(fmt.Sprintf("%d", Version)), 0644)
}

View File

@ -0,0 +1,65 @@
// Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
package main
import (
"github.com/ClusterCockpit/cc-backend/pkg/schema"
)
type JobData map[string]map[schema.MetricScope]*JobMetric
type JobMetric struct {
Unit string `json:"unit"`
Scope schema.MetricScope `json:"scope"`
Timestep int `json:"timestep"`
Series []Series `json:"series"`
StatisticsSeries *StatsSeries `json:"statisticsSeries"`
}
type Series struct {
Hostname string `json:"hostname"`
Id *int `json:"id,omitempty"`
Statistics *MetricStatistics `json:"statistics"`
Data []schema.Float `json:"data"`
}
type MetricStatistics struct {
Avg float64 `json:"avg"`
Min float64 `json:"min"`
Max float64 `json:"max"`
}
type StatsSeries struct {
Mean []Float `json:"mean"`
Min []Float `json:"min"`
Max []Float `json:"max"`
Percentiles map[int][]Float `json:"percentiles,omitempty"`
}
// type MetricScope string
// const (
// MetricScopeInvalid MetricScope = "invalid_scope"
// MetricScopeNode MetricScope = "node"
// MetricScopeSocket MetricScope = "socket"
// MetricScopeMemoryDomain MetricScope = "memoryDomain"
// MetricScopeCore MetricScope = "core"
// MetricScopeHWThread MetricScope = "hwthread"
// MetricScopeAccelerator MetricScope = "accelerator"
// )
// var metricScopeGranularity map[MetricScope]int = map[MetricScope]int{
// MetricScopeNode: 10,
// MetricScopeSocket: 5,
// MetricScopeMemoryDomain: 3,
// MetricScopeCore: 2,
// MetricScopeHWThread: 1,
// MetricScopeAccelerator: 5, // Special/Randomly choosen
// MetricScopeInvalid: -1,
// }

View File

@ -1,9 +0,0 @@
// Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
package main
func main() {
}

View File

@ -30,8 +30,8 @@
let rooflineMaxY
let colWidth
let numBins = 50
const ccconfig = getContext('cc-config'),
metricConfig = getContext('metrics')
const ccconfig = getContext('cc-config')
const metricConfig = getContext('metrics')
let metricsInHistograms = ccconfig.analysis_view_histogramMetrics,
metricsInScatterplots = ccconfig.analysis_view_scatterPlotMetrics
@ -161,24 +161,29 @@
<Histogram
width={colWidth - 25} height={300 * 0.5}
data={$statsQuery.data.topUsers.sort((a, b) => b.count - a.count).map(({ count }, idx) => ({ count, value: idx }))}
label={(x) => x < $statsQuery.data.topUsers.length ? $statsQuery.data.topUsers[Math.floor(x)].name : '0'} />
label={(x) => x < $statsQuery.data.topUsers.length ? $statsQuery.data.topUsers[Math.floor(x)].name : 'No Users'}
ylabel="Node Hours [h]"/>
{/key}
</div>
</div>
<div class="col-3">
{#key $statsQuery.data.stats[0].histDuration}
<h4>Walltime Distribution</h4>
<h4>Duration Distribution</h4>
<Histogram
width={colWidth - 25} height={300}
data={$statsQuery.data.stats[0].histDuration} />
width={colWidth - 25}
data={$statsQuery.data.stats[0].histDuration}
xlabel="Current Runtimes [h]"
ylabel="Number of Jobs"/>
{/key}
</div>
<div class="col-3">
{#key $statsQuery.data.stats[0].histNumNodes}
<h4>Number of Nodes Distribution</h4>
<Histogram
width={colWidth - 25} height={300}
data={$statsQuery.data.stats[0].histNumNodes} />
width={colWidth - 25}
data={$statsQuery.data.stats[0].histNumNodes}
xlabel="Allocated Nodes [#]"
ylabel="Number of Jobs" />
{/key}
</div>
<div class="col-3">
@ -189,7 +194,7 @@
{:else if $rooflineQuery.data && cluster}
{#key $rooflineQuery.data}
<Roofline
width={colWidth - 25} height={300}
width={colWidth - 25}
tiles={$rooflineQuery.data.rooflineHeatmap}
cluster={cluster.subClusters.length == 1 ? cluster.subClusters[0] : null}
maxY={rooflineMaxY} />
@ -211,6 +216,7 @@
<Col>
<Card body>
These histograms show the distribution of the averages of all jobs matching the filters. Each job/average is weighted by its node hours.
Note that some metrics could be disabled for specific subclusters as per metriConfig and thus could affect shown average values.
</Card>
<br/>
</Col>
@ -224,12 +230,16 @@
$footprintsQuery.data.footprints.nodehours,
$footprintsQuery.data.footprints.metrics.find(f => f.metric == metric).data, numBins) }))}
itemsPerRow={ccconfig.plot_view_plotsPerRow}>
<h4>{item.metric} [{metricConfig(cluster.name, item.metric)?.unit}]</h4>
<h4>Average Distribution of '{item.metric}'</h4>
<Histogram
width={width} height={250}
min={item.min} max={item.max}
data={item.bins} label={item.label} />
data={item.bins}
label={item.label}
xlabel={`${item.metric} Average [${(metricConfig(cluster.name, item.metric)?.unit?.prefix ? metricConfig(cluster.name, item.metric)?.unit?.prefix : '') +
(metricConfig(cluster.name, item.metric)?.unit?.base ? metricConfig(cluster.name, item.metric)?.unit?.base : '')}]`}
ylabel="Node Hours [h]" />
</PlotTable>
</Col>
</Row>
@ -238,6 +248,7 @@
<Col>
<Card body>
Each circle represents one job. The size of a circle is proportional to its node hours. Darker circles mean multiple jobs have the same averages for the respective metrics.
Note that some metrics could be disabled for specific subclusters as per metriConfig and thus could affect shown average values.
</Card>
<br/>
</Col>
@ -254,12 +265,18 @@
<ScatterPlot
width={width} height={250} color={"rgba(0, 102, 204, 0.33)"}
xLabel={`${item.m1} [${metricConfig(cluster.name, item.m1)?.unit}]`}
yLabel={`${item.m2} [${metricConfig(cluster.name, item.m2)?.unit}]`}
xLabel={`${item.m1} [${(metricConfig(cluster.name, item.m1)?.unit?.prefix ? metricConfig(cluster.name, item.m1)?.unit?.prefix : '') +
(metricConfig(cluster.name, item.m1)?.unit?.base ? metricConfig(cluster.name, item.m1)?.unit?.base : '')}]`}
yLabel={`${item.m2} [${(metricConfig(cluster.name, item.m2)?.unit?.prefix ? metricConfig(cluster.name, item.m2)?.unit?.prefix : '') +
(metricConfig(cluster.name, item.m2)?.unit?.base ? metricConfig(cluster.name, item.m2)?.unit?.base : '')}]`}
X={item.f1} Y={item.f2} S={$footprintsQuery.data.footprints.nodehours} />
</PlotTable>
</Col>
</Row>
{/if}
<style>
h4 {
text-align: center;
}
</style>

View File

@ -81,7 +81,7 @@
missingMetrics = metricNames.filter(metric => !metrics.some(jm => jm.name == metric))
missingHosts = job.resources.map(({ hostname }) => ({
hostname: hostname,
metrics: metricNames.filter(metric => !metrics.some(jm => jm.metric.scope == 'node' && jm.metric.series.some(series => series.hostname == hostname)))
metrics: metricNames.filter(metric => !metrics.some(jm => jm.scope == 'node' && jm.metric.series.some(series => series.hostname == hostname)))
})).filter(({ metrics }) => metrics.length > 0)
somethingMissing = missingMetrics.length > 0 || missingHosts.length > 0
}
@ -114,8 +114,8 @@
cluster={clusters
.find(c => c.name == $initq.data.job.cluster).subClusters
.find(sc => sc.name == $initq.data.job.subCluster)}
flopsAny={$jobMetrics.data.jobMetrics.find(m => m.name == 'flops_any' && m.metric.scope == 'node')}
memBw={$jobMetrics.data.jobMetrics.find(m => m.name == 'mem_bw' && m.metric.scope == 'node')} />
flopsAny={$jobMetrics.data.jobMetrics.find(m => m.name == 'flops_any' && m.scope == 'node').metric}
memBw={$jobMetrics.data.jobMetrics.find(m => m.name == 'mem_bw' && m.scope == 'node').metric} />
</Col>
{:else}
<Col></Col>
@ -163,8 +163,9 @@
bind:this={plots[item.metric]}
on:more-loaded={({ detail }) => statsTable.moreLoaded(detail)}
job={$initq.data.job}
metric={item.metric}
scopes={item.data.map(x => x.metric)}
metricName={item.metric}
rawData={item.data.map(x => x.metric)}
scopes={item.data.map(x => x.scope)}
width={width}/>
{:else}
<Card body color="warning">No data for <code>{item.metric}</code></Card>

View File

@ -17,11 +17,15 @@
export let authlevel
export let roles
let filters, jobList, matchedJobs = null
let filters = []
let jobList, matchedJobs = null
let sorting = { field: 'startTime', order: 'DESC' }, isSortingOpen = false, isMetricsSelectionOpen = false
let metrics = filterPresets.cluster
? ccconfig[`plot_list_selectedMetrics:${filterPresets.cluster}`] || ccconfig.plot_list_selectedMetrics
: ccconfig.plot_list_selectedMetrics
let selectedCluster = filterPresets?.cluster ? filterPresets.cluster : null
$: selectedCluster = filters[0]?.cluster ? filters[0].cluster.eq : null
// The filterPresets are handled by the Filters component,
// so we need to wait for it to be ready before we can start a query.
@ -58,7 +62,10 @@
<Filters
filterPresets={filterPresets}
bind:this={filters}
on:update={({ detail }) => jobList.update(detail.filters)} />
on:update={({ detail }) => {
filters = detail.filters
jobList.update(detail.filters)}
} />
</Col>
<Col xs="3" style="margin-left: auto;">
@ -84,7 +91,7 @@
bind:isOpen={isSortingOpen} />
<MetricSelection
cluster={filterPresets.cluster}
bind:cluster={selectedCluster}
configName="plot_list_selectedMetrics"
bind:metrics={metrics}
bind:isOpen={isMetricsSelectionOpen} />

View File

@ -5,19 +5,22 @@
import { fetchMetrics, minScope } from './utils'
export let job
export let metric
export let metricName
export let scopes
export let width
export let rawData
const dispatch = createEventDispatcher()
const cluster = getContext('clusters').find(cluster => cluster.name == job.cluster)
const subCluster = cluster.subClusters.find(subCluster => subCluster.name == job.subCluster)
const metricConfig = cluster.metricConfig.find(metricConfig => metricConfig.name == metric)
const metricConfig = cluster.metricConfig.find(metricConfig => metricConfig.name == metricName)
let selectedScope = minScope(scopes.map(s => s.scope)), selectedHost = null, plot, fetching = false, error = null
let selectedHost = null, plot, fetching = false, error = null
let selectedScope = minScope(scopes)
let selectedScopeIndex = scopes.findIndex(s => s == selectedScope)
$: avaliableScopes = scopes.map(metric => metric.scope)
$: data = scopes.find(metric => metric.scope == selectedScope)
$: avaliableScopes = scopes
$: data = rawData[selectedScopeIndex]
$: series = data?.series.filter(series => selectedHost == null || series.hostname == selectedHost)
let from = null, to = null
@ -29,7 +32,7 @@
export async function loadMore() {
fetching = true
let response = await fetchMetrics(job, [metric], ["core"])
let response = await fetchMetrics(job, [metricName], ["core"])
fetching = false
if (response.error) {
@ -38,9 +41,9 @@
}
for (let jm of response.data.jobMetrics) {
if (jm.metric.scope != "node") {
if (jm.scope != "node") {
scopes.push(jm.metric)
selectedScope = jm.metric.scope
selectedScope = jm.scope
dispatch('more-loaded', jm)
if (!avaliableScopes.includes(selectedScope))
avaliableScopes = [...avaliableScopes, selectedScope]
@ -52,7 +55,8 @@
</script>
<InputGroup>
<InputGroupText style="min-width: 150px;">
{metric} ({metricConfig?.unit})
{metricName} ({(metricConfig?.unit?.prefix ? metricConfig.unit.prefix : '') +
(metricConfig?.unit?.base ? metricConfig.unit.base : '')})
</InputGroupText>
<select class="form-select" bind:value={selectedScope}>
{#each avaliableScopes as scope}
@ -82,7 +86,7 @@
width={width} height={300}
cluster={cluster} subCluster={subCluster}
timestep={data.timestep}
scope={selectedScope} metric={metric}
scope={selectedScope} metric={metricName}
series={series} />
{/if}
{/key}

View File

@ -95,7 +95,7 @@
<Modal isOpen={isOpen} toggle={() => (isOpen = !isOpen)}>
<ModalHeader>
Configure columns
Configure columns (Metric availability shown)
</ModalHeader>
<ModalBody>
<ListGroup>
@ -113,9 +113,26 @@
{/if}
{metric}
<span style="float: right;">
{cluster == null ? clusters
{cluster == null ?
clusters // No single cluster specified: List Clusters with Metric
.filter(cluster => cluster.metricConfig.find(m => m.name == metric) != null)
.map(cluster => cluster.name).join(', ') : ''}
.map(cluster => cluster.name).join(', ') :
clusters // Single cluster requested: List Subclusters with do not have metric remove flag
.filter(cluster => cluster.metricConfig.find(m => m.name == metric) != null)
.map(function(cluster) {
let scNames = cluster.subClusters.map(sc => sc.name)
scNames.forEach(function(scName){
let met = cluster.metricConfig.find(m => m.name == metric)
let msc = met.subClusters.find(msc => msc.name == scName)
if (msc != null) {
if (msc.remove == true) {
scNames = scNames.filter(scn => scn != msc.name)
}
}
})
return scNames
})
.join(', ')}
</span>
</li>
{/each}

View File

@ -20,16 +20,19 @@
from.setMinutes(from.getMinutes() - 30)
}
const ccconfig = getContext('cc-config'), clusters = getContext('clusters')
const ccconfig = getContext('cc-config')
const clusters = getContext('clusters')
const nodesQuery = operationStore(`query($cluster: String!, $nodes: [String!], $from: Time!, $to: Time!) {
nodeMetrics(cluster: $cluster, nodes: $nodes, from: $from, to: $to) {
host, subCluster
host
subCluster
metrics {
name,
name
scope
metric {
timestep
scope
unit { base, prefix }
series {
statistics { min, avg, max }
data
@ -46,6 +49,17 @@
$: $nodesQuery.variables = { cluster, nodes: [hostname], from: from.toISOString(), to: to.toISOString() }
let metricUnits = {}
$: if ($nodesQuery.data) {
for (let metric of clusters.find(c => c.name == cluster).metricConfig) {
if (metric.unit.prefix || metric.unit.base) {
metricUnits[metric.name] = '(' + (metric.unit.prefix ? metric.unit.prefix : '') + (metric.unit.base ? metric.unit.base : '') + ')'
} else { // If no unit defined: Omit Unit Display
metricUnits[metric.name] = ''
}
}
}
query(nodesQuery)
// $: console.log($nodesQuery?.data?.nodeMetrics[0].metrics)
@ -83,7 +97,7 @@
let:width
itemsPerRow={ccconfig.plot_view_plotsPerRow}
items={$nodesQuery.data.nodeMetrics[0].metrics.sort((a, b) => a.name.localeCompare(b.name))}>
<h4 style="text-align: center;">{item.name}</h4>
<h4 style="text-align: center;">{item.name} {metricUnits[item.name]}</h4>
<MetricPlot
width={width} height={300} metric={item.name} timestep={item.metric.timestep}
cluster={clusters.find(c => c.name == cluster)} subCluster={$nodesQuery.data.nodeMetrics[0].subCluster}

View File

@ -11,7 +11,7 @@
const allMetrics = [...new Set(jobMetrics.map(m => m.name))].sort(),
scopesForMetric = (metric) => jobMetrics
.filter(jm => jm.name == metric)
.map(jm => jm.metric.scope)
.map(jm => jm.scope)
let hosts = job.resources.map(r => r.hostname).sort(),
selectedScopes = {},
@ -40,7 +40,7 @@
s.active = true
}
let series = jobMetrics.find(jm => jm.name == metric && jm.metric.scope == 'node')?.metric.series
let series = jobMetrics.find(jm => jm.name == metric && jm.scope == 'node')?.metric.series
sorting = {...sorting}
hosts = hosts.sort((h1, h2) => {
let s1 = series.find(s => s.hostname == h1)?.statistics

View File

@ -5,7 +5,7 @@
export let jobMetrics
$: series = jobMetrics
.find(jm => jm.name == metric && jm.metric.scope == scope)
.find(jm => jm.name == metric && jm.scope == scope)
?.metric.series.filter(s => s.hostname == host && s.statistics != null)
</script>

View File

@ -2,8 +2,8 @@
import Refresher from './joblist/Refresher.svelte'
import Roofline, { transformPerNodeData } from './plots/Roofline.svelte'
import Histogram from './plots/Histogram.svelte'
import { Row, Col, Spinner, Card, Table, Progress } from 'sveltestrap'
import { init } from './utils.js'
import { Row, Col, Spinner, Card, CardHeader, CardTitle, CardBody, Table, Progress, Icon } from 'sveltestrap'
import { init, formatNumber } from './utils.js'
import { operationStore, query } from '@urql/svelte'
const { query: initq } = init()
@ -15,13 +15,14 @@
let from = new Date(Date.now() - 5 * 60 * 1000), to = new Date(Date.now())
const mainQuery = operationStore(`query($cluster: String!, $filter: [JobFilter!]!, $metrics: [String!], $from: Time!, $to: Time!) {
nodeMetrics(cluster: $cluster, metrics: $metrics, from: $from, to: $to) {
host,
subCluster,
host
subCluster
metrics {
name,
metric {
name
scope
timestep,
metric {
timestep
unit { base, prefix }
series { data }
}
}
@ -47,20 +48,27 @@
? sum + (node.metrics.find(m => m.name == metric)?.metric.series.reduce((sum, series) => sum + series.data[series.data.length - 1], 0) || 0)
: sum, 0)
let allocatedNodes = {}, flopRate = {}, memBwRate = {}
let allocatedNodes = {}, flopRate = {}, flopRateUnit = {}, memBwRate = {}, memBwRateUnit = {}
$: if ($initq.data && $mainQuery.data) {
let subClusters = $initq.data.clusters.find(c => c.name == cluster).subClusters
for (let subCluster of subClusters) {
allocatedNodes[subCluster.name] = $mainQuery.data.allocatedNodes.find(({ name }) => name == subCluster.name)?.count || 0
flopRate[subCluster.name] = Math.floor(sumUp($mainQuery.data.nodeMetrics, subCluster.name, 'flops_any') * 100) / 100
flopRateUnit[subCluster.name] = subCluster.flopRateSimd.unit.prefix + subCluster.flopRateSimd.unit.base
memBwRate[subCluster.name] = Math.floor(sumUp($mainQuery.data.nodeMetrics, subCluster.name, 'mem_bw') * 100) / 100
memBwRateUnit[subCluster.name] = subCluster.memoryBandwidth.unit.prefix + subCluster.memoryBandwidth.unit.base
}
}
query(mainQuery)
</script>
<!-- Loading indicator & Refresh -->
<Row>
<Col xs="auto" style="align-self: flex-end;">
<h4 class="mb-0" >Current usage of cluster "{cluster}"</h4>
</Col>
<Col xs="auto">
{#if $initq.fetching || $mainQuery.fetching}
<Spinner/>
@ -89,54 +97,72 @@
</Col>
</Row>
{/if}
<hr>
<!-- Gauges & Roofline per Subcluster-->
{#if $initq.data && $mainQuery.data}
{#each $initq.data.clusters.find(c => c.name == cluster).subClusters as subCluster, i}
<Row>
<Col xs="3">
<Row cols={2} class="mb-3 justify-content-center">
<Col xs="4" class="px-3">
<Card class="h-auto mt-1">
<CardHeader>
<CardTitle class="mb-0">SubCluster "{subCluster.name}"</CardTitle>
</CardHeader>
<CardBody>
<Table>
<tr>
<th scope="col">SubCluster</th>
<td colspan="2">{subCluster.name}</td>
</tr>
<tr>
<th scope="col">Allocated Nodes</th>
<td style="min-width: 75px;"><div class="col"><Progress value={allocatedNodes[subCluster.name]} max={subCluster.numberOfNodes}/></div></td>
<td>({allocatedNodes[subCluster.name]} / {subCluster.numberOfNodes})</td>
<td style="min-width: 100px;"><div class="col"><Progress value={allocatedNodes[subCluster.name]} max={subCluster.numberOfNodes}/></div></td>
<td>({allocatedNodes[subCluster.name]} Nodes / {subCluster.numberOfNodes} Total Nodes)</td>
</tr>
<tr>
<th scope="col">Flop Rate</th>
<td style="min-width: 75px;"><div class="col"><Progress value={flopRate[subCluster.name]} max={subCluster.flopRateSimd * subCluster.numberOfNodes}/></div></td>
<td>({flopRate[subCluster.name]} / {subCluster.flopRateSimd * subCluster.numberOfNodes})</td>
<th scope="col">Flop Rate (Any) <Icon name="info-circle" class="p-1" style="cursor: help;" title="Flops[Any] = (Flops[Double] x 2) + Flops[Single]"/></th>
<td style="min-width: 100px;"><div class="col"><Progress value={flopRate[subCluster.name]} max={subCluster.flopRateSimd.value * subCluster.numberOfNodes}/></div></td>
<td>({flopRate[subCluster.name]} {flopRateUnit[subCluster.name]} / {(subCluster.flopRateSimd.value * subCluster.numberOfNodes)} {flopRateUnit[subCluster.name]} [Max])</td>
</tr>
<tr>
<th scope="col">MemBw Rate</th>
<td style="min-width: 75px;"><div class="col"><Progress value={memBwRate[subCluster.name]} max={subCluster.memoryBandwidth * subCluster.numberOfNodes}/></div></td>
<td>({memBwRate[subCluster.name]} / {subCluster.memoryBandwidth * subCluster.numberOfNodes})</td>
<td style="min-width: 100px;"><div class="col"><Progress value={memBwRate[subCluster.name]} max={subCluster.memoryBandwidth.value * subCluster.numberOfNodes}/></div></td>
<td>({memBwRate[subCluster.name]} {memBwRateUnit[subCluster.name]} / {(subCluster.memoryBandwidth.value * subCluster.numberOfNodes)} {memBwRateUnit[subCluster.name]} [Max])</td>
</tr>
</Table>
</CardBody>
</Card>
</Col>
<div class="col-9" bind:clientWidth={plotWidths[i]}>
<Col class="px-3">
<div bind:clientWidth={plotWidths[i]}>
{#key $mainQuery.data.nodeMetrics}
<Roofline
width={plotWidths[i] - 10} height={300} colorDots={false} cluster={subCluster}
width={plotWidths[i] - 10} height={300} colorDots={true} showTime={false} cluster={subCluster}
data={transformPerNodeData($mainQuery.data.nodeMetrics.filter(data => data.subCluster == subCluster.name))} />
{/key}
</div>
</Col>
</Row>
{/each}
<Row>
<div class="col-4" bind:clientWidth={colWidth1}>
<h4>Top Users</h4>
<hr style="margin-top: -1em;">
<!-- Usage Stats as Histograms -->
<Row cols={4}>
<Col class="p-2">
<div bind:clientWidth={colWidth1}>
<h4 class="mb-3 text-center">Top Users</h4>
{#key $mainQuery.data}
<Histogram
width={colWidth1 - 25} height={300}
width={colWidth1 - 25}
data={$mainQuery.data.topUsers.sort((a, b) => b.count - a.count).map(({ count }, idx) => ({ count, value: idx }))}
label={(x) => x < $mainQuery.data.topUsers.length ? $mainQuery.data.topUsers[Math.floor(x)].name : '0'} />
label={(x) => x < $mainQuery.data.topUsers.length ? $mainQuery.data.topUsers[Math.floor(x)].name : '0'}
xlabel="User Name" ylabel="Number of Jobs" />
{/key}
</div>
<div class="col-2">
</Col>
<Col class="px-4 py-2">
<Table>
<tr><th>Name</th><th>Number of Nodes</th></tr>
<tr class="mb-2"><th>User Name</th><th>Number of Nodes</th></tr>
{#each $mainQuery.data.topUsers.sort((a, b) => b.count - a.count) as { name, count }}
<tr>
<th scope="col"><a href="/monitoring/user/{name}">{name}</a></th>
@ -144,41 +170,48 @@
</tr>
{/each}
</Table>
</div>
<div class="col-4">
<h4>Top Projects</h4>
</Col>
<Col class="p-2">
<h4 class="mb-3 text-center">Top Projects</h4>
{#key $mainQuery.data}
<Histogram
width={colWidth1 - 25} height={300}
width={colWidth1 - 25}
data={$mainQuery.data.topProjects.sort((a, b) => b.count - a.count).map(({ count }, idx) => ({ count, value: idx }))}
label={(x) => x < $mainQuery.data.topProjects.length ? $mainQuery.data.topProjects[Math.floor(x)].name : '0'} />
label={(x) => x < $mainQuery.data.topProjects.length ? $mainQuery.data.topProjects[Math.floor(x)].name : '0'}
xlabel="Project Code" ylabel="Number of Jobs" />
{/key}
</div>
<div class="col-2">
</Col>
<Col class="px-4 py-2">
<Table>
<tr><th>Name</th><th>Number of Nodes</th></tr>
<tr class="mb-2"><th>Project Code</th><th>Number of Nodes</th></tr>
{#each $mainQuery.data.topProjects.sort((a, b) => b.count - a.count) as { name, count }}
<tr><th scope="col">{name}</th><td>{count}</td></tr>
{/each}
</Table>
</div>
</Col>
</Row>
<Row>
<div class="col" bind:clientWidth={colWidth2}>
<h4>Duration Distribution</h4>
<Row cols={2} class="mt-3">
<Col class="p-2">
<div bind:clientWidth={colWidth2}>
<h4 class="mb-3 text-center">Duration Distribution</h4>
{#key $mainQuery.data.stats}
<Histogram
width={colWidth2 - 25} height={300}
data={$mainQuery.data.stats[0].histDuration} />
width={colWidth2 - 25}
data={$mainQuery.data.stats[0].histDuration}
xlabel="Current Runtimes [h]"
ylabel="Number of Jobs" />
{/key}
</div>
<div class="col">
<h4>Number of Nodes Distribution</h4>
</Col>
<Col class="p-2">
<h4 class="mb-3 text-center">Number of Nodes Distribution</h4>
{#key $mainQuery.data.stats}
<Histogram
width={colWidth2 - 25} height={300}
data={$mainQuery.data.stats[0].histNumNodes} />
width={colWidth2 - 25}
data={$mainQuery.data.stats[0].histNumNodes}
xlabel="Allocated Nodes [#]"
ylabel="Number of Jobs" />
{/key}
</div>
</Col>
</Row>
{/if}

View File

@ -21,6 +21,7 @@
const clusters = getContext('clusters')
const ccconfig = getContext('cc-config')
const metricConfig = getContext('metrics')
let plotHeight = 300
let hostnameFilter = ''
@ -28,13 +29,14 @@
const nodesQuery = operationStore(`query($cluster: String!, $metrics: [String!], $from: Time!, $to: Time!) {
nodeMetrics(cluster: $cluster, metrics: $metrics, from: $from, to: $to) {
host,
host
subCluster
metrics {
name,
metric {
name
scope
timestep,
metric {
timestep
unit { base, prefix }
series {
statistics { min, avg, max }
data
@ -49,6 +51,18 @@
to: to.toISOString()
})
let metricUnits = {}
$: if ($nodesQuery.data) {
let thisCluster = clusters.find(c => c.name == cluster)
for (let metric of thisCluster.metricConfig) {
if (metric.unit.prefix || metric.unit.base) {
metricUnits[metric.name] = '(' + (metric.unit.prefix ? metric.unit.prefix : '') + (metric.unit.base ? metric.unit.base : '') + ')'
} else { // If no unit defined: Omit Unit Display
metricUnits[metric.name] = ''
}
}
}
$: $nodesQuery.variables = { cluster, metrics: [selectedMetric], from: from.toISOString(), to: to.toISOString() }
query(nodesQuery)
@ -71,7 +85,7 @@
<InputGroupText>Metric</InputGroupText>
<select class="form-select" bind:value={selectedMetric}>
{#each clusters.find(c => c.name == cluster).metricConfig as metric}
<option value={metric.name}>{metric.name} ({metric.unit})</option>
<option value={metric.name}>{metric.name} {metricUnits[metric.name]}</option>
{/each}
</select>
</InputGroup>
@ -98,11 +112,23 @@
let:width
itemsPerRow={ccconfig.plot_view_plotsPerRow}
items={$nodesQuery.data.nodeMetrics
.filter(h => h.host.includes(hostnameFilter) && h.metrics.some(m => m.name == selectedMetric && m.metric.scope == 'node'))
.map(h => ({ host: h.host, subCluster: h.subCluster, data: h.metrics.find(m => m.name == selectedMetric && m.metric.scope == 'node') }))
.filter(h => h.host.includes(hostnameFilter) && h.metrics.some(m => m.name == selectedMetric && m.scope == 'node'))
.map(function (h) {
let thisConfig = metricConfig(cluster, selectedMetric)
let thisSCIndex = thisConfig.subClusters.findIndex(sc => sc.name == h.subCluster)
// Metric remove == true
if (thisSCIndex >= 0) {
if (thisConfig.subClusters[thisSCIndex].remove == true) {
return { host: h.host, subCluster: h.subCluster, data: null, removed: true }
}
}
// Else
return { host: h.host, subCluster: h.subCluster, data: h.metrics.find(m => m.name == selectedMetric && m.scope == 'node'), removed: false }
})
.sort((a, b) => a.host.localeCompare(b.host))}>
<h4 style="width: 100%; text-align: center;"><a href="/monitoring/node/{cluster}/{item.host}">{item.host} ({item.subCluster})</a></h4>
{#if item.removed == false && item.data != null}
<MetricPlot
width={width}
height={plotHeight}
@ -111,6 +137,11 @@
metric={item.data.name}
cluster={clusters.find(c => c.name == cluster)}
subCluster={item.subCluster} />
{:else if item.removed == true && item.data == null}
<Card body color="info">Metric '{ selectedMetric }' disabled for subcluster '{ item.subCluster }'</Card>
{:else}
<Card body color="warning">Missing Data</Card>
{/if}
</PlotTable>
{/if}
</Col>

View File

@ -18,10 +18,12 @@
export let user
export let filterPresets
let filters, jobList
let filters = []
let jobList
let sorting = { field: 'startTime', order: 'DESC' }, isSortingOpen = false
let metrics = ccconfig.plot_list_selectedMetrics, isMetricsSelectionOpen = false
let w1, w2, histogramHeight = 250
let selectedCluster = filterPresets?.cluster ? filterPresets.cluster : null
const stats = operationStore(`
query($filter: [JobFilter!]!) {
@ -40,6 +42,12 @@
pause: true
})
// filters[filters.findIndex(filter => filter.cluster != null)] ?
// filters[filters.findIndex(filter => filter.cluster != null)].cluster.eq :
// null
// Cluster filter has to be alwas @ first index, above will throw error
$: selectedCluster = filters[0]?.cluster ? filters[0].cluster.eq : null
query(stats)
onMount(() => filters.update())
@ -75,11 +83,12 @@
startTimeQuickSelect={true}
bind:this={filters}
on:update={({ detail }) => {
let filters = [...detail.filters, { user: { eq: user.username } }]
$stats.variables = { filter: filters }
let jobFilters = [...detail.filters, { user: { eq: user.username } }]
$stats.variables = { filter: jobFilters }
$stats.context.pause = false
$stats.reexecute()
jobList.update(filters)
filters = jobFilters
jobList.update(jobFilters)
}} />
</Col>
<Col xs="auto" style="margin-left: auto;">
@ -136,19 +145,23 @@
</Table>
</Col>
<div class="col-4" style="text-align: center;" bind:clientWidth={w1}>
<b>Walltime</b>
<b>Duration Distribution</b>
{#key $stats.data.jobsStatistics[0].histDuration}
<Histogram
data={$stats.data.jobsStatistics[0].histDuration}
width={w1 - 25} height={histogramHeight} />
width={w1 - 25} height={histogramHeight}
xlabel="Current Runtimes [h]"
ylabel="Number of Jobs"/>
{/key}
</div>
<div class="col-4" style="text-align: center;" bind:clientWidth={w2}>
<b>Number of Nodes</b>
<b>Number of Nodes Distribution</b>
{#key $stats.data.jobsStatistics[0].histNumNodes}
<Histogram
data={$stats.data.jobsStatistics[0].histNumNodes}
width={w2 - 25} height={histogramHeight} />
width={w2 - 25} height={histogramHeight}
xlabel="Allocated Nodes [#]"
ylabel="Number of Jobs" />
{/key}
</div>
{/if}
@ -167,6 +180,8 @@
bind:sorting={sorting}
bind:isOpen={isSortingOpen} />
<MetricSelection configName="plot_list_selectedMetrics"
<MetricSelection
bind:cluster={selectedCluster}
configName="plot_list_selectedMetrics"
bind:metrics={metrics}
bind:isOpen={isMetricsSelectionOpen} />

View File

@ -20,6 +20,7 @@
let text = await res.text()
popMessage(text, '#048109')
reloadUserList()
form.reset()
} else {
let text = await res.text()
// console.log(res.statusText)
@ -79,7 +80,12 @@
{#if i == 0}
<div>
<input type="radio" id={role} name="role" value={role} checked/>
<label for={role}>{role.charAt(0).toUpperCase() + role.slice(1)} (regular user, same as if created via LDAP sync.)</label>
<label for={role}>{role.toUpperCase()} (Allowed to interact with REST API.)</label>
</div>
{:else if i == 1}
<div>
<input type="radio" id={role} name="role" value={role} checked/>
<label for={role}>{role.charAt(0).toUpperCase() + role.slice(1)} (Same as if created via LDAP sync.)</label>
</div>
{:else}
<div>

View File

@ -102,9 +102,11 @@
{#if $initialized}
({clusters
.map(cluster => cluster.metricConfig.find(m => m.name == metric))
.filter(m => m != null).map(m => m.unit)
.reduce((arr, unit) => arr.includes(unit) ? arr : [...arr, unit], [])
.join(', ')})
.filter(m => m != null)
.map(m => (m.unit?.prefix?m.unit?.prefix:'') + (m.unit?.base?m.unit?.base:'')) // Build unitStr
.reduce((arr, unitStr) => arr.includes(unitStr) ? arr : [...arr, unitStr], []) // w/o this, output would be [unitStr, unitStr]
.join(', ')
})
{/if}
</th>
{/each}

View File

@ -24,12 +24,14 @@
let scopes = [job.numNodes == 1 ? 'core' : 'node']
const cluster = getContext('clusters').find(c => c.name == job.cluster)
// Get all MetricConfs which include subCluster-specific settings for this job
const metricConfig = getContext('metrics')
const metricsQuery = operationStore(`query($id: ID!, $metrics: [String!]!, $scopes: [MetricScope!]!) {
jobMetrics(id: $id, metrics: $metrics, scopes: $scopes) {
name
scope
metric {
unit, scope, timestep
unit { prefix, base }, timestep
statisticsSeries { min, mean, max }
series {
hostname, id, data
@ -44,13 +46,47 @@
})
const selectScope = (jobMetrics) => jobMetrics.reduce(
(a, b) => maxScope([a.metric.scope, b.metric.scope]) == a.metric.scope
(a, b) => maxScope([a.scope, b.scope]) == a.scope
? (job.numNodes > 1 ? a : b)
: (job.numNodes > 1 ? b : a), jobMetrics[0])
const sortAndSelectScope = (jobMetrics) => metrics
.map(name => jobMetrics.filter(jobMetric => jobMetric.name == name))
.map(jobMetrics => jobMetrics.length > 0 ? selectScope(jobMetrics) : null)
.map(function(name) {
// Get MetricConf for this selected/requested metric
let thisConfig = metricConfig(cluster, name)
let thisSCIndex = thisConfig.subClusters.findIndex(sc => sc.name == job.subCluster)
// Check if Subcluster has MetricConf: If not found (index == -1), no further remove flag check required
if (thisSCIndex >= 0) {
// SubCluster Config present: Check if remove flag is set
if (thisConfig.subClusters[thisSCIndex].remove == true) {
// Return null data and informational flag
return {removed: true, data: null}
} else {
// load and return metric, if data available
let thisMetric = jobMetrics.filter(jobMetric => jobMetric.name == name) // Returns Array
if (thisMetric.length > 0) {
return {removed: false, data: thisMetric}
} else {
return {removed: false, data: null}
}
}
} else {
// No specific subCluster config: 'remove' flag not set, deemed false -> load and return metric, if data available
let thisMetric = jobMetrics.filter(jobMetric => jobMetric.name == name) // Returns Array
if (thisMetric.length > 0) {
return {removed: false, data: thisMetric}
} else {
return {removed: false, data: null}
}
}
})
.map(function(jobMetrics) {
if (jobMetrics.data != null && jobMetrics.data.length > 0) {
return {removed: jobMetrics.removed, data: selectScope(jobMetrics.data)}
} else {
return jobMetrics
}
})
$: metricsQuery.variables = { id: job.id, metrics, scopes }
@ -81,17 +117,20 @@
{:else}
{#each sortAndSelectScope($metricsQuery.data.jobMetrics) as metric, i (metric || i)}
<td>
{#if metric != null}
<!-- Subluster Metricconfig remove keyword for jobtables (joblist main, user joblist, project joblist) to be used here as toplevel case-->
{#if metric.removed == false && metric.data != null}
<MetricPlot
width={plotWidth}
height={plotHeight}
timestep={metric.metric.timestep}
scope={metric.metric.scope}
series={metric.metric.series}
statisticsSeries={metric.metric.statisticsSeries}
metric={metric.name}
timestep={metric.data.metric.timestep}
scope={metric.data.scope}
series={metric.data.metric.series}
statisticsSeries={metric.data.metric.statisticsSeries}
metric={metric.data.name}
cluster={cluster}
subCluster={job.subCluster} />
{:else if metric.removed == true && metric.data == null}
<Card body color="info">Metric disabled for subcluster '{ job.subCluster }'</Card>
{:else}
<Card body color="warning">Missing Data</Card>
{/if}

View File

@ -18,15 +18,17 @@
import { onMount } from 'svelte'
export let data
export let width
export let height
export let width = 500
export let height = 300
export let xlabel = ''
export let ylabel = ''
export let min = null
export let max = null
export let label = formatNumber
const fontSize = 12
const fontFamily = 'system-ui, -apple-system, "Segoe UI", Roboto, "Helvetica Neue", Arial, "Noto Sans", sans-serif, "Apple Color Emoji", "Segoe UI Emoji", "Segoe UI Symbol", "Noto Color Emoji"'
const paddingLeft = 35, paddingRight = 20, paddingTop = 20, paddingBottom = 20
const paddingLeft = 50, paddingRight = 20, paddingTop = 20, paddingBottom = 20
let ctx, canvasElement
@ -72,9 +74,11 @@
}
function render() {
const h = height - paddingTop - paddingBottom
const labelOffset = Math.floor(height * 0.1)
const h = height - paddingTop - paddingBottom - labelOffset
const w = width - paddingLeft - paddingRight
const barWidth = Math.ceil(w / (maxValue + 1))
const barGap = 5
const barWidth = Math.ceil(w / (maxValue + 1)) - barGap
if (Number.isNaN(barWidth))
return
@ -83,9 +87,14 @@
const getCanvasY = (count) => (h - (count / maxCount) * h) + paddingTop
// X Axis
ctx.font = `${fontSize}px ${fontFamily}`
ctx.font = `bold ${fontSize}px ${fontFamily}`
ctx.fillStyle = 'black'
if (xlabel != '') {
let textWidth = ctx.measureText(xlabel).width
ctx.fillText(xlabel, Math.floor((width / 2) - (textWidth / 2) + barGap), height - Math.floor(labelOffset / 2))
}
ctx.textAlign = 'center'
ctx.font = `${fontSize}px ${fontFamily}`
if (min != null && max != null) {
const stepsizeX = getStepSize(max - min, w, 75)
let startX = 0
@ -94,19 +103,28 @@
for (let x = startX; x < max; x += stepsizeX) {
let px = ((x - min) / (max - min)) * (w - barWidth) + paddingLeft + (barWidth / 2.)
ctx.fillText(`${formatNumber(x)}`, px, height - paddingBottom + 15)
ctx.fillText(`${formatNumber(x)}`, px, height - paddingBottom - Math.floor(labelOffset / 2))
}
} else {
const stepsizeX = getStepSize(maxValue, w, 120)
for (let x = 0; x <= maxValue; x += stepsizeX) {
ctx.fillText(label(x), getCanvasX(x), height - paddingBottom + 15)
ctx.fillText(label(x), getCanvasX(x), height - paddingBottom - Math.floor(labelOffset / 2))
}
}
// Y Axis
ctx.fillStyle = 'black'
ctx.strokeStyle = '#bbbbbb'
ctx.font = `bold ${fontSize}px ${fontFamily}`
if (ylabel != '') {
ctx.save()
ctx.translate(15, Math.floor(h / 2))
ctx.rotate(-Math.PI / 2)
ctx.fillText(ylabel, 0, 0)
ctx.restore()
}
ctx.textAlign = 'right'
ctx.font = `${fontSize}px ${fontFamily}`
ctx.beginPath()
const stepsizeY = getStepSize(maxCount, h, 50)
for (let y = stepsizeY; y <= maxCount; y += stepsizeY) {
@ -118,7 +136,7 @@
ctx.stroke()
// Draw bars
ctx.fillStyle = '#0066cc'
ctx.fillStyle = '#85abce'
for (let p of data) {
ctx.fillRect(
getCanvasX(p.value) - (barWidth / 2.),
@ -130,10 +148,10 @@
// Fat lines left and below plotting area
ctx.strokeStyle = 'black'
ctx.beginPath()
ctx.moveTo(0, height - paddingBottom)
ctx.lineTo(width, height - paddingBottom)
ctx.moveTo(0, height - paddingBottom - labelOffset)
ctx.lineTo(width, height - paddingBottom - labelOffset)
ctx.moveTo(paddingLeft, 0)
ctx.lineTo(paddingLeft, height- paddingBottom)
ctx.lineTo(paddingLeft, height - Math.floor(labelOffset / 2))
ctx.stroke()
}

View File

@ -18,7 +18,7 @@
let ctx, canvasElement
const labels = metrics.filter(name => {
if (!jobMetrics.find(m => m.name == name && m.metric.scope == "node")) {
if (!jobMetrics.find(m => m.name == name && m.scope == "node")) {
console.warn(`PolarPlot: No metric data for '${name}'`)
return false
}
@ -27,7 +27,7 @@
const getValuesForStat = (getStat) => labels.map(name => {
const peak = metricConfig(cluster, name).peak
const metric = jobMetrics.find(m => m.name == name && m.metric.scope == "node")
const metric = jobMetrics.find(m => m.name == name && m.scope == "node")
const value = getStat(metric.metric) / peak
return value <= 1. ? value : 1.
})

View File

@ -4,7 +4,8 @@
<script context="module">
const axesColor = '#aaaaaa'
const fontSize = 12
const tickFontSize = 10
const labelFontSize = 12
const fontFamily = 'system-ui, -apple-system, "Segoe UI", Roboto, "Helvetica Neue", Arial, "Noto Sans", sans-serif, "Apple Color Emoji", "Segoe UI Emoji", "Segoe UI Symbol", "Noto Color Emoji"'
const paddingLeft = 40,
paddingRight = 10,
@ -67,11 +68,11 @@
return 2
}
function render(ctx, data, cluster, width, height, colorDots, defaultMaxY) {
function render(ctx, data, cluster, width, height, colorDots, showTime, defaultMaxY) {
if (width <= 0)
return
const [minX, maxX, minY, maxY] = [0.01, 1000, 1., cluster?.flopRateSimd || defaultMaxY]
const [minX, maxX, minY, maxY] = [0.01, 1000, 1., cluster?.flopRateSimd?.value || defaultMaxY]
const w = width - paddingLeft - paddingRight
const h = height - paddingTop - paddingBottom
@ -95,7 +96,7 @@
// Axes
ctx.fillStyle = 'black'
ctx.strokeStyle = axesColor
ctx.font = `${fontSize}px ${fontFamily}`
ctx.font = `${tickFontSize}px ${fontFamily}`
ctx.beginPath()
for (let x = minX, i = 0; x <= maxX; i++) {
let px = getCanvasX(x)
@ -103,18 +104,20 @@
let textWidth = ctx.measureText(text).width
ctx.fillText(text,
Math.floor(px - (textWidth / 2)),
height - paddingBottom + fontSize + 5)
height - paddingBottom + tickFontSize + 5)
ctx.moveTo(px, paddingTop - 5)
ctx.lineTo(px, height - paddingBottom + 5)
x *= axisStepFactor(i, w)
}
if (data.xLabel) {
ctx.font = `${labelFontSize}px ${fontFamily}`
let textWidth = ctx.measureText(data.xLabel).width
ctx.fillText(data.xLabel, Math.floor((width / 2) - (textWidth / 2)), height - 20)
}
ctx.textAlign = 'center'
ctx.font = `${tickFontSize}px ${fontFamily}`
for (let y = minY, i = 0; y <= maxY; i++) {
let py = getCanvasY(y)
ctx.moveTo(paddingLeft - 5, py)
@ -129,6 +132,7 @@
y *= axisStepFactor(i)
}
if (data.yLabel) {
ctx.font = `${labelFontSize}px ${fontFamily}`
ctx.save()
ctx.translate(15, Math.floor(height / 2))
ctx.rotate(-Math.PI / 2)
@ -185,13 +189,13 @@
ctx.lineWidth = 2
ctx.beginPath()
if (cluster != null) {
const ycut = 0.01 * cluster.memoryBandwidth
const scalarKnee = (cluster.flopRateScalar - ycut) / cluster.memoryBandwidth
const simdKnee = (cluster.flopRateSimd - ycut) / cluster.memoryBandwidth
const ycut = 0.01 * cluster.memoryBandwidth.value
const scalarKnee = (cluster.flopRateScalar.value - ycut) / cluster.memoryBandwidth.value
const simdKnee = (cluster.flopRateSimd.value - ycut) / cluster.memoryBandwidth.value
const scalarKneeX = getCanvasX(scalarKnee),
simdKneeX = getCanvasX(simdKnee),
flopRateScalarY = getCanvasY(cluster.flopRateScalar),
flopRateSimdY = getCanvasY(cluster.flopRateSimd)
flopRateScalarY = getCanvasY(cluster.flopRateScalar.value),
flopRateSimdY = getCanvasY(cluster.flopRateSimd.value)
if (scalarKneeX < width - paddingRight) {
ctx.moveTo(scalarKneeX, flopRateScalarY)
@ -222,8 +226,8 @@
}
ctx.stroke()
if (colorDots && data.x && data.y) {
// The Color Scale
if (colorDots && showTime && data.x && data.y) {
// The Color Scale For Time Information
ctx.fillStyle = 'black'
ctx.fillText('Time:', 17, height - 5)
const start = paddingLeft + 5
@ -237,7 +241,7 @@
}
}
function transformData(flopsAny, memBw, colorDots) {
function transformData(flopsAny, memBw, colorDots) { // Uses Metric Object
const nodes = flopsAny.series.length
const timesteps = flopsAny.series[0].data.length
@ -308,17 +312,18 @@
export let memBw = null
export let cluster = null
export let maxY = null
export let width
export let height
export let width = 500
export let height = 300
export let tiles = null
export let colorDots = true
export let showTime = true
export let data = null
console.assert(data || tiles || (flopsAny && memBw), "you must provide flopsAny and memBw or tiles!")
let ctx, canvasElement, prevWidth = width, prevHeight = height
data = data != null ? data : (flopsAny && memBw
? transformData(flopsAny.metric, memBw.metric, colorDots)
? transformData(flopsAny, memBw, colorDots) // Use Metric Object from Parent
: {
tiles: tiles,
xLabel: 'Intensity [FLOPS/byte]',
@ -334,7 +339,7 @@
canvasElement.width = width
canvasElement.height = height
render(ctx, data, cluster, width, height, colorDots, maxY)
render(ctx, data, cluster, width, height, colorDots, showTime, maxY)
})
let timeoutId = null
@ -354,7 +359,7 @@
timeoutId = null
canvasElement.width = width
canvasElement.height = height
render(ctx, data, cluster, width, height, colorDots, maxY)
render(ctx, data, cluster, width, height, colorDots, showTime, maxY)
}, 250)
}

View File

@ -37,11 +37,11 @@ export function init(extraInitQuery = '') {
clusters {
name,
metricConfig {
name, unit, peak,
name, unit { base, prefix }, peak,
normal, caution, alert,
timestep, scope,
aggregation,
subClusters { name, peak, normal, caution, alert }
subClusters { name, peak, normal, caution, alert, remove }
}
partitions
subClusters {
@ -49,9 +49,9 @@ export function init(extraInitQuery = '') {
socketsPerNode
coresPerSocket
threadsPerCore
flopRateScalar
flopRateSimd
memoryBandwidth
flopRateScalar { unit { base, prefix }, value }
flopRateSimd { unit { base, prefix }, value }
memoryBandwidth { unit { base, prefix }, value }
numberOfNodes
topology {
node, socket, core