Merge pull request #104 from ClusterCockpit/import-data-sanitation

Import data sanitation
Fixes, among other things: MetricConfig for GPU SubCluster (#99) and the mismatch of the type of "id" in the job-metric-data "series" object schema (#101).
This commit is contained in:
Jan Eitzinger 2023-04-12 09:15:27 +02:00 committed by GitHub
commit 7272db4fb0
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
75 changed files with 5240 additions and 1395 deletions
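As a quick illustration of the #101 fix mentioned above: the "id" field of a job-metric-data "series" entry is now a string instead of an integer, and per-series statistics are embedded by value. A rough Go sketch of the updated shape (field names follow the pkg/schema changes in this diff; the sample values and the main() function are made up):

```go
// Rough sketch of the updated Series type (per the pkg/schema changes in
// this diff); the sample values in main() are made up.
package main

import (
	"encoding/json"
	"fmt"
)

type MetricStatistics struct {
	Avg float64 `json:"avg"`
	Min float64 `json:"min"`
	Max float64 `json:"max"`
}

type Series struct {
	Hostname   string           `json:"hostname"`
	Id         *string          `json:"id,omitempty"` // was *int before this change
	Statistics MetricStatistics `json:"statistics"`   // was *MetricStatistics
	Data       []float64        `json:"data"`         // simplified; cc-backend uses schema.Float
}

func main() {
	id := "0" // type ids (e.g. accelerator ids) are now kept as strings
	s := Series{
		Hostname:   "node001",
		Id:         &id,
		Statistics: MetricStatistics{Avg: 1.5, Min: 0.0, Max: 3.0},
		Data:       []float64{1.0, 2.0, 1.5},
	}
	out, _ := json.Marshal(s)
	fmt.Println(string(out))
}
```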

4
.gitignore vendored

@ -9,4 +9,6 @@
/web/frontend/public/build /web/frontend/public/build
/web/frontend/node_modules /web/frontend/node_modules
.vscode/settings.json /.vscode/*
/archive-migration
/archive-manager


@ -47,12 +47,17 @@ type SubCluster {
socketsPerNode: Int! socketsPerNode: Int!
coresPerSocket: Int! coresPerSocket: Int!
threadsPerCore: Int! threadsPerCore: Int!
flopRateScalar: Int! flopRateScalar: MetricValue!
flopRateSimd: Int! flopRateSimd: MetricValue!
memoryBandwidth: Int! memoryBandwidth: MetricValue!
topology: Topology! topology: Topology!
} }
type MetricValue {
unit: Unit!
value: Float!
}
type Topology { type Topology {
node: [Int!] node: [Int!]
socket: [[Int!]!] socket: [[Int!]!]
@ -70,23 +75,24 @@ type Accelerator {
type SubClusterConfig { type SubClusterConfig {
name: String! name: String!
peak: Float!
normal: Float!
caution: Float!
alert: Float!
}
type MetricConfig {
name: String!
unit: String!
scope: MetricScope!
aggregation: String
timestep: Int!
peak: Float peak: Float
normal: Float normal: Float
caution: Float caution: Float
alert: Float alert: Float
subClusters: [SubClusterConfig] remove: Boolean
}
type MetricConfig {
name: String!
unit: Unit!
scope: MetricScope!
aggregation: String!
timestep: Int!
peak: Float!
normal: Float
caution: Float!
alert: Float!
subClusters: [SubClusterConfig!]!
} }
type Tag { type Tag {
@ -104,12 +110,12 @@ type Resource {
type JobMetricWithName { type JobMetricWithName {
name: String! name: String!
scope: MetricScope!
metric: JobMetric! metric: JobMetric!
} }
type JobMetric { type JobMetric {
unit: String! unit: Unit
scope: MetricScope!
timestep: Int! timestep: Int!
series: [Series!] series: [Series!]
statisticsSeries: StatsSeries statisticsSeries: StatsSeries
@ -117,11 +123,16 @@ type JobMetric {
type Series { type Series {
hostname: String! hostname: String!
id: Int id: String
statistics: MetricStatistics statistics: MetricStatistics
data: [NullableFloat!]! data: [NullableFloat!]!
} }
type Unit {
base: String!
prefix: String
}
type MetricStatistics { type MetricStatistics {
avg: Float! avg: Float!
min: Float! min: Float!


@ -15,6 +15,7 @@
"kind": "file", "kind": "file",
"path": "./var/job-archive" "path": "./var/job-archive"
}, },
"validate": true,
"clusters": [ "clusters": [
{ {
"name": "test", "name": "test",
@ -24,9 +25,18 @@
"token": "eyJhbGciOiJF-E-pQBQ" "token": "eyJhbGciOiJF-E-pQBQ"
}, },
"filterRanges": { "filterRanges": {
"numNodes": { "from": 1, "to": 64 }, "numNodes": {
"duration": { "from": 0, "to": 86400 }, "from": 1,
"startTime": { "from": "2022-01-01T00:00:00Z", "to": null } "to": 64
},
"duration": {
"from": 0,
"to": 86400
},
"startTime": {
"from": "2022-01-01T00:00:00Z",
"to": null
}
} }
} }
], ],


@ -1,10 +1,12 @@
#!/usr/bin/env perl #!/usr/bin/env perl
use strict; use strict;
use warnings; use warnings;
use utf8; use utf8;
my %INFO; my %INFO;
my %DOMAINS; my %DOMAINS;
my $SMT; my $SMT;
my $numMemoryDomains; my $numMemoryDomains;
$DOMAINS{socket} = []; $DOMAINS{socket} = [];
@ -198,8 +200,11 @@ END
$INFO{gpus} .= join(",\n",@gpuStr); $INFO{gpus} .= join(",\n",@gpuStr);
$INFO{gpus} .= "]\n"; $INFO{gpus} .= "]\n";
} else {
$INFO{gpus} = '';
} }
print <<"END"; print <<"END";
{ {
"name": "<FILL IN>", "name": "<FILL IN>",
@ -219,10 +224,10 @@ print <<"END";
"memoryDomain": [ "memoryDomain": [
$INFO{memoryDomains} $INFO{memoryDomains}
], ],
$INFO{gpus}
"core": [ "core": [
$INFO{cores} $INFO{cores}
] ]
$INFO{gpus}
} }
} }
END END


@ -0,0 +1,37 @@
# Release versioning
Releases are numbered with an integer id, starting at 1.
Every release embeds the following assets into the binary:
* Web frontend, including JavaScript files and all static assets
* Golang template files for server-side rendering
* JSON schema files for validation
Remaining external assets are:
* The SQL database used
* The job archive
Both external assets are also versioned using integer ids.
This means every release binary is tied to specific versions of the SQL
database and the job archive.
A command line switch `--migrate-db` is provided to migrate the SQL database
from a previous to the most recent version.
We provide a separate tool `archive-migration` to migrate an existing job
archive from the previous to the most recent version.
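For illustration, a minimal Go sketch of the job-archive version check introduced with this release scheme: the archive root contains a `version.txt` file whose integer content must match the `Version` constant compiled into the binary. The helper names below are placeholders, not the exact cc-backend code.

```go
// Minimal sketch of the job-archive version check (assumption: mirrors the
// fsBackend Init() logic added in this PR; helper names are made up).
package main

import (
	"fmt"
	"os"
	"path/filepath"
	"strconv"
	"strings"
)

// Version is the job-archive version this binary was built for.
const Version = 1

// checkArchiveVersion reads <archivePath>/version.txt and compares it
// against the supported Version.
func checkArchiveVersion(archivePath string) (int, error) {
	b, err := os.ReadFile(filepath.Join(archivePath, "version.txt"))
	if err != nil {
		return 0, err
	}
	version, err := strconv.Atoi(strings.TrimSpace(string(b)))
	if err != nil {
		return 0, err
	}
	if version != Version {
		return version, fmt.Errorf("unsupported version %d, need %d", version, Version)
	}
	return version, nil
}

func main() {
	if v, err := checkArchiveVersion("./var/job-archive"); err != nil {
		fmt.Println("archive version check failed:", err)
	} else {
		fmt.Println("job archive version", v)
	}
}
```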
# Versioning of APIs
cc-backend provides two API backends:
* A REST API for querying jobs
* A GraphQL API used for data exchange between web frontend and cc-backend
Both APIs will also be versioned. We still need to decide whether we will also
support older REST API versions by versioning the endpoint URLs.
# How to build a specific release
# How to migrate the SQL database
# How to migrate the job archive


@ -13,9 +13,18 @@
"token": "eyJ0eXAiOiJKV1QiLCJhbGciOiJFZERTQSJ9.eyJ1c2VyIjoiYWRtaW4iLCJyb2xlcyI6WyJST0xFX0FETUlOIiwiUk9MRV9BTkFMWVNUIiwiUk9MRV9VU0VSIl19.d-3_3FZTsadPjDEdsWrrQ7nS0edMAR4zjl-eK7rJU3HziNBfI9PDHDIpJVHTNN5E5SlLGLFXctWyKAkwhXL-Dw" "token": "eyJ0eXAiOiJKV1QiLCJhbGciOiJFZERTQSJ9.eyJ1c2VyIjoiYWRtaW4iLCJyb2xlcyI6WyJST0xFX0FETUlOIiwiUk9MRV9BTkFMWVNUIiwiUk9MRV9VU0VSIl19.d-3_3FZTsadPjDEdsWrrQ7nS0edMAR4zjl-eK7rJU3HziNBfI9PDHDIpJVHTNN5E5SlLGLFXctWyKAkwhXL-Dw"
}, },
"filterRanges": { "filterRanges": {
"numNodes": { "from": 1, "to": 32 }, "numNodes": {
"duration": { "from": 0, "to": 172800 }, "from": 1,
"startTime": { "from": "2010-01-01T00:00:00Z", "to": null } "to": 32
},
"duration": {
"from": 0,
"to": 172800
},
"startTime": {
"from": "2010-01-01T00:00:00Z",
"to": null
}
} }
}, },
{ {
@ -26,9 +35,18 @@
"token": "eyJ0eXAiOiJKV1QiLCJhbGciOiJFZERTQSJ9.eyJ1c2VyIjoiYWRtaW4iLCJyb2xlcyI6WyJST0xFX0FETUlOIiwiUk9MRV9BTkFMWVNUIiwiUk9MRV9VU0VSIl19.d-3_3FZTsadPjDEdsWrrQ7nS0edMAR4zjl-eK7rJU3HziNBfI9PDHDIpJVHTNN5E5SlLGLFXctWyKAkwhXL-Dw" "token": "eyJ0eXAiOiJKV1QiLCJhbGciOiJFZERTQSJ9.eyJ1c2VyIjoiYWRtaW4iLCJyb2xlcyI6WyJST0xFX0FETUlOIiwiUk9MRV9BTkFMWVNUIiwiUk9MRV9VU0VSIl19.d-3_3FZTsadPjDEdsWrrQ7nS0edMAR4zjl-eK7rJU3HziNBfI9PDHDIpJVHTNN5E5SlLGLFXctWyKAkwhXL-Dw"
}, },
"filterRanges": { "filterRanges": {
"numNodes": { "from": 1, "to": 1 }, "numNodes": {
"duration": { "from": 0, "to": 172800 }, "from": 1,
"startTime": { "from": "2015-01-01T00:00:00Z", "to": null } "to": 1
},
"duration": {
"from": 0,
"to": 172800
},
"startTime": {
"from": "2015-01-01T00:00:00Z",
"to": null
}
} }
} }
] ]


@ -63,6 +63,7 @@ models:
resolver: true resolver: true
NullableFloat: { model: "github.com/ClusterCockpit/cc-backend/pkg/schema.Float" } NullableFloat: { model: "github.com/ClusterCockpit/cc-backend/pkg/schema.Float" }
MetricScope: { model: "github.com/ClusterCockpit/cc-backend/pkg/schema.MetricScope" } MetricScope: { model: "github.com/ClusterCockpit/cc-backend/pkg/schema.MetricScope" }
MetricValue: { model: "github.com/ClusterCockpit/cc-backend/pkg/schema.MetricValue" }
JobStatistics: { model: "github.com/ClusterCockpit/cc-backend/pkg/schema.JobStatistics" } JobStatistics: { model: "github.com/ClusterCockpit/cc-backend/pkg/schema.JobStatistics" }
Tag: { model: "github.com/ClusterCockpit/cc-backend/pkg/schema.Tag" } Tag: { model: "github.com/ClusterCockpit/cc-backend/pkg/schema.Tag" }
Resource: { model: "github.com/ClusterCockpit/cc-backend/pkg/schema.Resource" } Resource: { model: "github.com/ClusterCockpit/cc-backend/pkg/schema.Resource" }
@ -79,3 +80,4 @@ models:
FilterRanges: { model: "github.com/ClusterCockpit/cc-backend/pkg/schema.FilterRanges" } FilterRanges: { model: "github.com/ClusterCockpit/cc-backend/pkg/schema.FilterRanges" }
SubCluster: { model: "github.com/ClusterCockpit/cc-backend/pkg/schema.SubCluster" } SubCluster: { model: "github.com/ClusterCockpit/cc-backend/pkg/schema.SubCluster" }
StatsSeries: { model: "github.com/ClusterCockpit/cc-backend/pkg/schema.StatsSeries" } StatsSeries: { model: "github.com/ClusterCockpit/cc-backend/pkg/schema.StatsSeries" }
Unit: { model: "github.com/ClusterCockpit/cc-backend/pkg/schema.Unit" }

File diff suppressed because it is too large.


@ -60,6 +60,7 @@ type JobFilter struct {
type JobMetricWithName struct { type JobMetricWithName struct {
Name string `json:"name"` Name string `json:"name"`
Scope schema.MetricScope `json:"scope"`
Metric *schema.JobMetric `json:"metric"` Metric *schema.JobMetric `json:"metric"`
} }


@ -194,12 +194,9 @@ func (r *queryResolver) JobMetrics(ctx context.Context, id string, metrics []str
res := []*model.JobMetricWithName{} res := []*model.JobMetricWithName{}
for name, md := range data { for name, md := range data {
for scope, metric := range md { for scope, metric := range md {
if metric.Scope != schema.MetricScope(scope) {
log.Panic("metric.Scope != schema.MetricScope(scope) : Should not happen!")
}
res = append(res, &model.JobMetricWithName{ res = append(res, &model.JobMetricWithName{
Name: name, Name: name,
Scope: scope,
Metric: metric, Metric: metric,
}) })
} }
@ -296,6 +293,7 @@ func (r *queryResolver) NodeMetrics(ctx context.Context, cluster string, nodes [
for _, scopedMetric := range scopedMetrics { for _, scopedMetric := range scopedMetrics {
host.Metrics = append(host.Metrics, &model.JobMetricWithName{ host.Metrics = append(host.Metrics, &model.JobMetricWithName{
Name: metric, Name: metric,
Scope: schema.MetricScopeNode,
Metric: scopedMetric, Metric: scopedMetric,
}) })
} }
@ -307,6 +305,15 @@ func (r *queryResolver) NodeMetrics(ctx context.Context, cluster string, nodes [
return nodeMetrics, nil return nodeMetrics, nil
} }
// NumberOfNodes is the resolver for the numberOfNodes field.
func (r *subClusterResolver) NumberOfNodes(ctx context.Context, obj *schema.SubCluster) (int, error) {
nodeList, err := archive.ParseNodeList(obj.Nodes)
if err != nil {
return 0, err
}
return nodeList.NodeCount(), nil
}
// Cluster returns generated.ClusterResolver implementation. // Cluster returns generated.ClusterResolver implementation.
func (r *Resolver) Cluster() generated.ClusterResolver { return &clusterResolver{r} } func (r *Resolver) Cluster() generated.ClusterResolver { return &clusterResolver{r} }
@ -319,7 +326,11 @@ func (r *Resolver) Mutation() generated.MutationResolver { return &mutationResol
// Query returns generated.QueryResolver implementation. // Query returns generated.QueryResolver implementation.
func (r *Resolver) Query() generated.QueryResolver { return &queryResolver{r} } func (r *Resolver) Query() generated.QueryResolver { return &queryResolver{r} }
// SubCluster returns generated.SubClusterResolver implementation.
func (r *Resolver) SubCluster() generated.SubClusterResolver { return &subClusterResolver{r} }
type clusterResolver struct{ *Resolver } type clusterResolver struct{ *Resolver }
type jobResolver struct{ *Resolver } type jobResolver struct{ *Resolver }
type mutationResolver struct{ *Resolver } type mutationResolver struct{ *Resolver }
type queryResolver struct{ *Resolver } type queryResolver struct{ *Resolver }
type subClusterResolver struct{ *Resolver }


@ -164,7 +164,6 @@ func (ccms *CCMetricStore) LoadData(
scopes []schema.MetricScope, scopes []schema.MetricScope,
ctx context.Context) (schema.JobData, error) { ctx context.Context) (schema.JobData, error) {
topology := archive.GetSubCluster(job.Cluster, job.SubCluster).Topology
queries, assignedScope, err := ccms.buildQueries(job, metrics, scopes) queries, assignedScope, err := ccms.buildQueries(job, metrics, scopes)
if err != nil { if err != nil {
log.Warn("Error while building queries") log.Warn("Error while building queries")
@ -201,7 +200,6 @@ func (ccms *CCMetricStore) LoadData(
if !ok { if !ok {
jobMetric = &schema.JobMetric{ jobMetric = &schema.JobMetric{
Unit: mc.Unit, Unit: mc.Unit,
Scope: scope,
Timestep: mc.Timestep, Timestep: mc.Timestep,
Series: make([]schema.Series, 0), Series: make([]schema.Series, 0),
} }
@ -215,13 +213,10 @@ func (ccms *CCMetricStore) LoadData(
continue continue
} }
id := (*int)(nil) id := (*string)(nil)
if query.Type != nil { if query.Type != nil {
id = new(int) id = new(string)
*id, err = strconv.Atoi(query.TypeIds[0]) *id = query.TypeIds[0]
if err != nil || *query.Type == acceleratorString {
*id, _ = topology.GetAcceleratorIndex(query.TypeIds[0])
}
} }
if res.Avg.IsNaN() || res.Min.IsNaN() || res.Max.IsNaN() { if res.Avg.IsNaN() || res.Min.IsNaN() || res.Max.IsNaN() {
@ -235,7 +230,7 @@ func (ccms *CCMetricStore) LoadData(
jobMetric.Series = append(jobMetric.Series, schema.Series{ jobMetric.Series = append(jobMetric.Series, schema.Series{
Hostname: query.Hostname, Hostname: query.Hostname,
Id: id, Id: id,
Statistics: &schema.MetricStatistics{ Statistics: schema.MetricStatistics{
Avg: float64(res.Avg), Avg: float64(res.Avg),
Min: float64(res.Min), Min: float64(res.Min),
Max: float64(res.Max), Max: float64(res.Max),
@ -275,9 +270,14 @@ func (ccms *CCMetricStore) buildQueries(
scopes []schema.MetricScope) ([]ApiQuery, []schema.MetricScope, error) { scopes []schema.MetricScope) ([]ApiQuery, []schema.MetricScope, error) {
queries := make([]ApiQuery, 0, len(metrics)*len(scopes)*len(job.Resources)) queries := make([]ApiQuery, 0, len(metrics)*len(scopes)*len(job.Resources))
topology := archive.GetSubCluster(job.Cluster, job.SubCluster).Topology
assignedScope := []schema.MetricScope{} assignedScope := []schema.MetricScope{}
subcluster, scerr := archive.GetSubCluster(job.Cluster, job.SubCluster)
if scerr != nil {
return nil, nil, scerr
}
topology := subcluster.Topology
for _, metric := range metrics { for _, metric := range metrics {
remoteName := ccms.toRemoteName(metric) remoteName := ccms.toRemoteName(metric)
mc := archive.GetMetricConfig(job.Cluster, metric) mc := archive.GetMetricConfig(job.Cluster, metric)
@ -293,7 +293,7 @@ func (ccms *CCMetricStore) buildQueries(
scopesLoop: scopesLoop:
for _, requestedScope := range scopes { for _, requestedScope := range scopes {
nativeScope := mc.Scope nativeScope := mc.Scope
if nativeScope == schema.MetricScopeAccelerator && job.NumAcc == 0 { if nativeScope == schema.MetricScopeAccelerator && job.NumAcc == nil {
continue continue
} }
@ -624,13 +624,12 @@ func (ccms *CCMetricStore) LoadNodeData(
mc := archive.GetMetricConfig(cluster, metric) mc := archive.GetMetricConfig(cluster, metric)
hostdata[metric] = append(hostdata[metric], &schema.JobMetric{ hostdata[metric] = append(hostdata[metric], &schema.JobMetric{
Unit: mc.Unit, Unit: mc.Unit,
Scope: schema.MetricScopeNode,
Timestep: mc.Timestep, Timestep: mc.Timestep,
Series: []schema.Series{ Series: []schema.Series{
{ {
Hostname: query.Hostname, Hostname: query.Hostname,
Data: qdata.Data, Data: qdata.Data,
Statistics: &schema.MetricStatistics{ Statistics: schema.MetricStatistics{
Avg: float64(qdata.Avg), Avg: float64(qdata.Avg),
Min: float64(qdata.Min), Min: float64(qdata.Min),
Max: float64(qdata.Max), Max: float64(qdata.Max),


@ -134,7 +134,6 @@ func (idb *InfluxDBv2DataRepository) LoadData(
jobMetric = map[schema.MetricScope]*schema.JobMetric{ jobMetric = map[schema.MetricScope]*schema.JobMetric{
scope: { // uses scope var from above! scope: { // uses scope var from above!
Unit: mc.Unit, Unit: mc.Unit,
Scope: scope,
Timestep: mc.Timestep, Timestep: mc.Timestep,
Series: make([]schema.Series, 0, len(job.Resources)), Series: make([]schema.Series, 0, len(job.Resources)),
StatisticsSeries: nil, // Should be: &schema.StatsSeries{}, StatisticsSeries: nil, // Should be: &schema.StatsSeries{},
@ -159,7 +158,7 @@ func (idb *InfluxDBv2DataRepository) LoadData(
field, host = row.Measurement(), row.ValueByKey("hostname").(string) field, host = row.Measurement(), row.ValueByKey("hostname").(string)
hostSeries = schema.Series{ hostSeries = schema.Series{
Hostname: host, Hostname: host,
Statistics: nil, Statistics: schema.MetricStatistics{}, //TODO Add Statistics
Data: make([]schema.Float, 0), Data: make([]schema.Float, 0),
} }
} }
@ -212,15 +211,10 @@ func (idb *InfluxDBv2DataRepository) LoadData(
for _, scope := range scopes { for _, scope := range scopes {
if scope == "node" { // No 'socket/core' support yet if scope == "node" { // No 'socket/core' support yet
for metric, nodes := range stats { for metric, nodes := range stats {
// log.Debugf("<< Add Stats for : Field %s >>", metric)
for node, stats := range nodes { for node, stats := range nodes {
// log.Debugf("<< Add Stats for : Host %s : Min %.2f, Max %.2f, Avg %.2f >>", node, stats.Min, stats.Max, stats.Avg )
for index, _ := range jobData[metric][scope].Series { for index, _ := range jobData[metric][scope].Series {
// log.Debugf("<< Try to add Stats to Series in Position %d >>", index)
if jobData[metric][scope].Series[index].Hostname == node { if jobData[metric][scope].Series[index].Hostname == node {
// log.Debugf("<< Match for Series in Position %d : Host %s >>", index, jobData[metric][scope].Series[index].Hostname) jobData[metric][scope].Series[index].Statistics = schema.MetricStatistics{Avg: stats.Avg, Min: stats.Min, Max: stats.Max}
jobData[metric][scope].Series[index].Statistics = &schema.MetricStatistics{Avg: stats.Avg, Min: stats.Min, Max: stats.Max}
// log.Debugf("<< Result Inner: Min %.2f, Max %.2f, Avg %.2f >>", jobData[metric][scope].Series[index].Statistics.Min, jobData[metric][scope].Series[index].Statistics.Max, jobData[metric][scope].Series[index].Statistics.Avg)
} }
} }
} }
@ -228,17 +222,6 @@ func (idb *InfluxDBv2DataRepository) LoadData(
} }
} }
// DEBUG:
// for _, scope := range scopes {
// for _, met := range metrics {
// for _, series := range jobData[met][scope].Series {
// log.Debugf("<< Result: %d data points for metric %s on %s with scope %s, Stats: Min %.2f, Max %.2f, Avg %.2f >>",
// len(series.Data), met, series.Hostname, scope,
// series.Statistics.Min, series.Statistics.Max, series.Statistics.Avg)
// }
// }
// }
return jobData, nil return jobData, nil
} }


@ -335,7 +335,10 @@ func ArchiveJob(job *schema.Job, ctx context.Context) (*schema.JobMeta, error) {
} }
jobMeta.Statistics[metric] = schema.JobStatistics{ jobMeta.Statistics[metric] = schema.JobStatistics{
Unit: archive.GetMetricConfig(job.Cluster, metric).Unit, Unit: schema.Unit{
Prefix: archive.GetMetricConfig(job.Cluster, metric).Unit.Prefix,
Base: archive.GetMetricConfig(job.Cluster, metric).Unit.Base,
},
Avg: avg / float64(job.NumNodes), Avg: avg / float64(job.NumNodes),
Min: min, Min: min,
Max: max, Max: max,


@ -251,7 +251,7 @@ func (pdb *PrometheusDataRepository) RowToSeries(
return schema.Series{ return schema.Series{
Hostname: hostname, Hostname: hostname,
Data: values, Data: values,
Statistics: &schema.MetricStatistics{ Statistics: schema.MetricStatistics{
Avg: mean, Avg: mean,
Min: min, Min: min,
Max: max, Max: max,
@ -323,7 +323,6 @@ func (pdb *PrometheusDataRepository) LoadData(
if !ok { if !ok {
jobMetric = &schema.JobMetric{ jobMetric = &schema.JobMetric{
Unit: metricConfig.Unit, Unit: metricConfig.Unit,
Scope: scope,
Timestep: metricConfig.Timestep, Timestep: metricConfig.Timestep,
Series: make([]schema.Series, 0), Series: make([]schema.Series, 0),
} }
@ -362,7 +361,7 @@ func (pdb *PrometheusDataRepository) LoadStats(
for metric, metricData := range data { for metric, metricData := range data {
stats[metric] = make(map[string]schema.MetricStatistics) stats[metric] = make(map[string]schema.MetricStatistics)
for _, series := range metricData[schema.MetricScopeNode].Series { for _, series := range metricData[schema.MetricScopeNode].Series {
stats[metric][series.Hostname] = *series.Statistics stats[metric][series.Hostname] = series.Statistics
} }
} }
@ -432,7 +431,6 @@ func (pdb *PrometheusDataRepository) LoadNodeData(
// output per host and metric // output per host and metric
hostdata[metric] = append(hostdata[metric], &schema.JobMetric{ hostdata[metric] = append(hostdata[metric], &schema.JobMetric{
Unit: metricConfig.Unit, Unit: metricConfig.Unit,
Scope: scope,
Timestep: metricConfig.Timestep, Timestep: metricConfig.Timestep,
Series: []schema.Series{pdb.RowToSeries(from, step, steps, row)}, Series: []schema.Series{pdb.RowToSeries(from, step, steps, row)},
}, },


@ -17,6 +17,7 @@ import (
"github.com/ClusterCockpit/cc-backend/pkg/archive" "github.com/ClusterCockpit/cc-backend/pkg/archive"
"github.com/ClusterCockpit/cc-backend/pkg/log" "github.com/ClusterCockpit/cc-backend/pkg/log"
"github.com/ClusterCockpit/cc-backend/pkg/schema" "github.com/ClusterCockpit/cc-backend/pkg/schema"
"github.com/ClusterCockpit/cc-backend/pkg/units"
) )
const NamedJobInsert string = `INSERT INTO job ( const NamedJobInsert string = `INSERT INTO job (
@ -75,6 +76,7 @@ func HandleImportFlag(flag string) error {
return err return err
} }
checkJobData(&jobData)
SanityChecks(&jobMeta.BaseJob) SanityChecks(&jobMeta.BaseJob)
jobMeta.MonitoringStatus = schema.MonitoringStatusArchivingSuccessful jobMeta.MonitoringStatus = schema.MonitoringStatusArchivingSuccessful
if job, err := GetJobRepository().Find(&jobMeta.JobID, &jobMeta.Cluster, &jobMeta.StartTime); err != sql.ErrNoRows { if job, err := GetJobRepository().Find(&jobMeta.JobID, &jobMeta.Cluster, &jobMeta.StartTime); err != sql.ErrNoRows {
@ -173,7 +175,9 @@ func InitDB() error {
i := 0 i := 0
errorOccured := 0 errorOccured := 0
for jobMeta := range ar.Iter() { for jobContainer := range ar.Iter(false) {
jobMeta := jobContainer.Meta
// // Bundle 100 inserts into one transaction for better performance: // // Bundle 100 inserts into one transaction for better performance:
if i%10 == 0 { if i%10 == 0 {
@ -297,7 +301,7 @@ func SanityChecks(job *schema.BaseJob) error {
if len(job.Resources) == 0 || len(job.User) == 0 { if len(job.Resources) == 0 || len(job.User) == 0 {
return fmt.Errorf("'resources' and 'user' should not be empty") return fmt.Errorf("'resources' and 'user' should not be empty")
} }
if job.NumAcc < 0 || job.NumHWThreads < 0 || job.NumNodes < 1 { if *job.NumAcc < 0 || *job.NumHWThreads < 0 || job.NumNodes < 1 {
return fmt.Errorf("'numNodes', 'numAcc' or 'numHWThreads' invalid") return fmt.Errorf("'numNodes', 'numAcc' or 'numHWThreads' invalid")
} }
if len(job.Resources) != int(job.NumNodes) { if len(job.Resources) != int(job.NumNodes) {
@ -314,3 +318,34 @@ func loadJobStat(job *schema.JobMeta, metric string) float64 {
return 0.0 return 0.0
} }
func checkJobData(d *schema.JobData) error {
for _, scopes := range *d {
var newUnit string
// Add node scope if missing
for _, metric := range scopes {
if strings.Contains(metric.Unit.Base, "B/s") ||
strings.Contains(metric.Unit.Base, "F/s") ||
strings.Contains(metric.Unit.Base, "B") {
// First get overall avg
sum := 0.0
for _, s := range metric.Series {
sum += s.Statistics.Avg
}
avg := sum / float64(len(metric.Series))
for _, s := range metric.Series {
fp := schema.ConvertFloatToFloat64(s.Data)
// Normalize values with new unit prefix
oldUnit := metric.Unit.Base
units.NormalizeSeries(fp, avg, oldUnit, &newUnit)
s.Data = schema.GetFloat64ToFloat(fp)
}
metric.Unit.Base = newUnit
}
}
}
return nil
}


@ -335,7 +335,13 @@ func (r *JobRepository) DeleteJobById(id int64) error {
} }
// TODO: Use node hours instead: SELECT job.user, sum(job.num_nodes * (CASE WHEN job.job_state = "running" THEN CAST(strftime('%s', 'now') AS INTEGER) - job.start_time ELSE job.duration END)) as x FROM job GROUP BY user ORDER BY x DESC; // TODO: Use node hours instead: SELECT job.user, sum(job.num_nodes * (CASE WHEN job.job_state = "running" THEN CAST(strftime('%s', 'now') AS INTEGER) - job.start_time ELSE job.duration END)) as x FROM job GROUP BY user ORDER BY x DESC;
func (r *JobRepository) CountGroupedJobs(ctx context.Context, aggreg model.Aggregate, filters []*model.JobFilter, weight *model.Weights, limit *int) (map[string]int, error) { func (r *JobRepository) CountGroupedJobs(
ctx context.Context,
aggreg model.Aggregate,
filters []*model.JobFilter,
weight *model.Weights,
limit *int) (map[string]int, error) {
start := time.Now() start := time.Now()
if !aggreg.IsValid() { if !aggreg.IsValid() {
return nil, errors.New("invalid aggregate") return nil, errors.New("invalid aggregate")


@ -8,13 +8,15 @@ import (
"encoding/json" "encoding/json"
"fmt" "fmt"
"github.com/ClusterCockpit/cc-backend/pkg/log"
"github.com/ClusterCockpit/cc-backend/pkg/lrucache" "github.com/ClusterCockpit/cc-backend/pkg/lrucache"
"github.com/ClusterCockpit/cc-backend/pkg/schema" "github.com/ClusterCockpit/cc-backend/pkg/schema"
"github.com/ClusterCockpit/cc-backend/pkg/log"
) )
const Version = 1
type ArchiveBackend interface { type ArchiveBackend interface {
Init(rawConfig json.RawMessage) error Init(rawConfig json.RawMessage) (int, error)
LoadJobMeta(job *schema.Job) (*schema.JobMeta, error) LoadJobMeta(job *schema.Job) (*schema.JobMeta, error)
@ -28,7 +30,12 @@ type ArchiveBackend interface {
GetClusters() []string GetClusters() []string
Iter() <-chan *schema.JobMeta Iter(loadMetricData bool) <-chan JobContainer
}
type JobContainer struct {
Meta *schema.JobMeta
Data *schema.JobData
} }
var cache *lrucache.Cache = lrucache.New(128 * 1024 * 1024) var cache *lrucache.Cache = lrucache.New(128 * 1024 * 1024)
@ -54,10 +61,12 @@ func Init(rawConfig json.RawMessage, disableArchive bool) error {
return fmt.Errorf("ARCHIVE/ARCHIVE > unkown archive backend '%s''", kind.Kind) return fmt.Errorf("ARCHIVE/ARCHIVE > unkown archive backend '%s''", kind.Kind)
} }
if err := ar.Init(rawConfig); err != nil { version, err := ar.Init(rawConfig)
if err != nil {
log.Error("Error while initializing archiveBackend") log.Error("Error while initializing archiveBackend")
return err return err
} }
log.Infof("Load archive version %d", version)
return initClusterConfig() return initClusterConfig()
} }


@ -55,7 +55,7 @@ func initClusterConfig() error {
nodeLists[cluster.Name] = make(map[string]NodeList) nodeLists[cluster.Name] = make(map[string]NodeList)
for _, sc := range cluster.SubClusters { for _, sc := range cluster.SubClusters {
if sc.Nodes == "" { if sc.Nodes == "*" {
continue continue
} }
@ -80,18 +80,17 @@ func GetCluster(cluster string) *schema.Cluster {
return nil return nil
} }
func GetSubCluster(cluster, subcluster string) *schema.SubCluster { func GetSubCluster(cluster, subcluster string) (*schema.SubCluster, error) {
for _, c := range Clusters { for _, c := range Clusters {
if c.Name == cluster { if c.Name == cluster {
for _, p := range c.SubClusters { for _, p := range c.SubClusters {
if p.Name == subcluster { if p.Name == subcluster {
return p return p, nil
} }
} }
} }
} }
return nil return nil, fmt.Errorf("Subcluster '%v' not found for cluster '%v', or cluster '%v' not configured!", subcluster, cluster, cluster)
} }
func GetMetricConfig(cluster, metric string) *schema.MetricConfig { func GetMetricConfig(cluster, metric string) *schema.MetricConfig {
@ -138,7 +137,7 @@ func AssignSubCluster(job *schema.BaseJob) error {
} }
} }
if cluster.SubClusters[0].Nodes == "" { if cluster.SubClusters[0].Nodes == "*" {
job.SubCluster = cluster.SubClusters[0].Name job.SubCluster = cluster.SubClusters[0].Name
return nil return nil
} }


@ -7,17 +7,21 @@ package archive
import ( import (
"bufio" "bufio"
"bytes" "bytes"
"compress/gzip"
"encoding/json" "encoding/json"
"errors"
"fmt" "fmt"
"os" "os"
"path" "path"
"path/filepath" "path/filepath"
"strconv" "strconv"
"strings"
"time" "time"
"github.com/ClusterCockpit/cc-backend/internal/config" "github.com/ClusterCockpit/cc-backend/internal/config"
"github.com/ClusterCockpit/cc-backend/pkg/log" "github.com/ClusterCockpit/cc-backend/pkg/log"
"github.com/ClusterCockpit/cc-backend/pkg/schema" "github.com/ClusterCockpit/cc-backend/pkg/schema"
"github.com/santhosh-tekuri/jsonschema/v5"
) )
type FsArchiveConfig struct { type FsArchiveConfig struct {
@ -29,6 +33,11 @@ type FsArchive struct {
clusters []string clusters []string
} }
func checkFileExists(filePath string) bool {
_, err := os.Stat(filePath)
return !errors.Is(err, os.ErrNotExist)
}
func getPath( func getPath(
job *schema.Job, job *schema.Job,
rootPath string, rootPath string,
@ -44,54 +53,109 @@ func getPath(
func loadJobMeta(filename string) (*schema.JobMeta, error) { func loadJobMeta(filename string) (*schema.JobMeta, error) {
f, err := os.Open(filename) b, err := os.ReadFile(filename)
if err != nil { if err != nil {
log.Errorf("loadJobMeta() > open file error: %v", err) log.Errorf("loadJobMeta() > open file error: %v", err)
return &schema.JobMeta{}, err return &schema.JobMeta{}, err
} }
defer f.Close() if config.Keys.Validate {
if err := schema.Validate(schema.Meta, bytes.NewReader(b)); err != nil {
return DecodeJobMeta(bufio.NewReader(f)) return &schema.JobMeta{}, fmt.Errorf("validate job meta: %v", err)
}
} }
func (fsa *FsArchive) Init(rawConfig json.RawMessage) error { return DecodeJobMeta(bytes.NewReader(b))
}
func loadJobData(filename string, isCompressed bool) (schema.JobData, error) {
f, err := os.Open(filename)
if err != nil {
log.Errorf("fsBackend LoadJobData()- %v", err)
return nil, err
}
if isCompressed {
r, err := gzip.NewReader(f)
if err != nil {
log.Errorf(" %v", err)
return nil, err
}
defer r.Close()
if config.Keys.Validate {
if err := schema.Validate(schema.Data, r); err != nil {
return schema.JobData{}, fmt.Errorf("validate job data: %v", err)
}
}
return DecodeJobData(r, filename)
} else {
defer f.Close()
if config.Keys.Validate {
if err := schema.Validate(schema.Data, bufio.NewReader(f)); err != nil {
return schema.JobData{}, fmt.Errorf("validate job data: %v", err)
}
}
return DecodeJobData(bufio.NewReader(f), filename)
}
}
func (fsa *FsArchive) Init(rawConfig json.RawMessage) (int, error) {
var config FsArchiveConfig var config FsArchiveConfig
if err := json.Unmarshal(rawConfig, &config); err != nil { if err := json.Unmarshal(rawConfig, &config); err != nil {
log.Warnf("Init() > Unmarshal error: %#v", err) log.Warnf("Init() > Unmarshal error: %#v", err)
return err return 0, err
} }
if config.Path == "" { if config.Path == "" {
err := fmt.Errorf("Init() : empty config.Path") err := fmt.Errorf("Init() : empty config.Path")
log.Errorf("Init() > config.Path error: %v", err) log.Errorf("Init() > config.Path error: %v", err)
return err return 0, err
} }
fsa.path = config.Path fsa.path = config.Path
b, err := os.ReadFile(filepath.Join(fsa.path, "version.txt"))
if err != nil {
fmt.Println("Err")
return 0, err
}
version, err := strconv.Atoi(strings.TrimSuffix(string(b), "\n"))
if err != nil {
log.Errorf("fsBackend Init()- %v", err)
return 0, err
}
if version != Version {
return version, fmt.Errorf("unsupported version %d, need %d", version, Version)
}
entries, err := os.ReadDir(fsa.path) entries, err := os.ReadDir(fsa.path)
if err != nil { if err != nil {
log.Errorf("Init() > ReadDir() error: %v", err) log.Errorf("Init() > ReadDir() error: %v", err)
return err return 0, err
} }
for _, de := range entries { for _, de := range entries {
if !de.IsDir() {
continue
}
fsa.clusters = append(fsa.clusters, de.Name()) fsa.clusters = append(fsa.clusters, de.Name())
} }
return nil return version, nil
} }
func (fsa *FsArchive) LoadJobData(job *schema.Job) (schema.JobData, error) { func (fsa *FsArchive) LoadJobData(job *schema.Job) (schema.JobData, error) {
var isCompressed bool = true
filename := getPath(job, fsa.path, "data.json") filename := getPath(job, fsa.path, "data.json.gz")
f, err := os.Open(filename) if !checkFileExists(filename) {
if err != nil { filename = getPath(job, fsa.path, "data.json")
log.Errorf("LoadJobData() > open file error: %v", err) isCompressed = false
return nil, err
} }
defer f.Close()
return DecodeJobData(bufio.NewReader(f), filename) return loadJobData(filename, isCompressed)
} }
func (fsa *FsArchive) LoadJobMeta(job *schema.Job) (*schema.JobMeta, error) { func (fsa *FsArchive) LoadJobMeta(job *schema.Job) (*schema.JobMeta, error) {
@ -105,20 +169,19 @@ func (fsa *FsArchive) LoadClusterCfg(name string) (*schema.Cluster, error) {
b, err := os.ReadFile(filepath.Join(fsa.path, name, "cluster.json")) b, err := os.ReadFile(filepath.Join(fsa.path, name, "cluster.json"))
if err != nil { if err != nil {
log.Errorf("LoadClusterCfg() > open file error: %v", err) log.Errorf("LoadClusterCfg() > open file error: %v", err)
return &schema.Cluster{}, err // if config.Keys.Validate {
}
if config.Keys.Validate {
if err := schema.Validate(schema.ClusterCfg, bytes.NewReader(b)); err != nil { if err := schema.Validate(schema.ClusterCfg, bytes.NewReader(b)); err != nil {
log.Warnf("Validate cluster config: %v\n", err) log.Warnf("Validate cluster config: %v\n", err)
return &schema.Cluster{}, fmt.Errorf("Validate cluster config: %v\n", err) return &schema.Cluster{}, fmt.Errorf("validate cluster config: %v", err)
} }
} }
// }
return DecodeCluster(bytes.NewReader(b)) return DecodeCluster(bytes.NewReader(b))
} }
func (fsa *FsArchive) Iter() <-chan *schema.JobMeta { func (fsa *FsArchive) Iter(loadMetricData bool) <-chan JobContainer {
ch := make(chan *schema.JobMeta) ch := make(chan JobContainer)
go func() { go func() {
clustersDir, err := os.ReadDir(fsa.path) clustersDir, err := os.ReadDir(fsa.path)
if err != nil { if err != nil {
@ -126,6 +189,9 @@ func (fsa *FsArchive) Iter() <-chan *schema.JobMeta {
} }
for _, clusterDir := range clustersDir { for _, clusterDir := range clustersDir {
if !clusterDir.IsDir() {
continue
}
lvl1Dirs, err := os.ReadDir(filepath.Join(fsa.path, clusterDir.Name())) lvl1Dirs, err := os.ReadDir(filepath.Join(fsa.path, clusterDir.Name()))
if err != nil { if err != nil {
log.Fatalf("Reading jobs failed @ lvl1 dirs: %s", err.Error()) log.Fatalf("Reading jobs failed @ lvl1 dirs: %s", err.Error())
@ -152,10 +218,27 @@ func (fsa *FsArchive) Iter() <-chan *schema.JobMeta {
for _, startTimeDir := range startTimeDirs { for _, startTimeDir := range startTimeDirs {
if startTimeDir.IsDir() { if startTimeDir.IsDir() {
job, err := loadJobMeta(filepath.Join(dirpath, startTimeDir.Name(), "meta.json")) job, err := loadJobMeta(filepath.Join(dirpath, startTimeDir.Name(), "meta.json"))
if err != nil { if err != nil && !errors.Is(err, &jsonschema.ValidationError{}) {
log.Errorf("error in %s: %s", filepath.Join(dirpath, startTimeDir.Name()), err.Error()) log.Errorf("in %s: %s", filepath.Join(dirpath, startTimeDir.Name()), err.Error())
}
if loadMetricData {
var isCompressed bool = true
filename := filepath.Join(dirpath, startTimeDir.Name(), "data.json.gz")
if !checkFileExists(filename) {
filename = filepath.Join(dirpath, startTimeDir.Name(), "data.json")
isCompressed = false
}
data, err := loadJobData(filename, isCompressed)
if err != nil && !errors.Is(err, &jsonschema.ValidationError{}) {
log.Errorf("in %s: %s", filepath.Join(dirpath, startTimeDir.Name()), err.Error())
}
ch <- JobContainer{Meta: job, Data: &data}
log.Errorf("in %s: %s", filepath.Join(dirpath, startTimeDir.Name()), err.Error())
} else { } else {
ch <- job ch <- JobContainer{Meta: job, Data: nil}
} }
} }
} }
@ -225,6 +308,28 @@ func (fsa *FsArchive) ImportJob(
return err return err
} }
// var isCompressed bool = true
// // TODO Use shortJob Config for check
// if jobMeta.Duration < 300 {
// isCompressed = false
// f, err = os.Create(path.Join(dir, "data.json"))
// } else {
// f, err = os.Create(path.Join(dir, "data.json.gz"))
// }
// if err != nil {
// return err
// }
//
// if isCompressed {
// if err := EncodeJobData(gzip.NewWriter(f), jobData); err != nil {
// return err
// }
// } else {
// if err := EncodeJobData(f, jobData); err != nil {
// return err
// }
// }
f, err = os.Create(path.Join(dir, "data.json")) f, err = os.Create(path.Join(dir, "data.json"))
if err != nil { if err != nil {
log.Error("Error while creating filepath for data.json") log.Error("Error while creating filepath for data.json")
@ -236,9 +341,6 @@ func (fsa *FsArchive) ImportJob(
} }
if err := f.Close(); err != nil { if err := f.Close(); err != nil {
log.Warn("Error while closing data.json file") log.Warn("Error while closing data.json file")
}
return err return err
} }
// no error: final return is nil
return nil
}


@ -20,7 +20,7 @@ func init() {
func TestInitEmptyPath(t *testing.T) { func TestInitEmptyPath(t *testing.T) {
var fsa FsArchive var fsa FsArchive
err := fsa.Init(json.RawMessage("{\"kind\":\"../../test/archive\"}")) _, err := fsa.Init(json.RawMessage("{\"kind\":\"../../test/archive\"}"))
if err == nil { if err == nil {
t.Fatal(err) t.Fatal(err)
} }
@ -28,14 +28,14 @@ func TestInitEmptyPath(t *testing.T) {
func TestInitNoJson(t *testing.T) { func TestInitNoJson(t *testing.T) {
var fsa FsArchive var fsa FsArchive
err := fsa.Init(json.RawMessage("\"path\":\"../../test/archive\"}")) _, err := fsa.Init(json.RawMessage("\"path\":\"../../test/archive\"}"))
if err == nil { if err == nil {
t.Fatal(err) t.Fatal(err)
} }
} }
func TestInitNotExists(t *testing.T) { func TestInitNotExists(t *testing.T) {
var fsa FsArchive var fsa FsArchive
err := fsa.Init(json.RawMessage("{\"path\":\"../../test/job-archive\"}")) _, err := fsa.Init(json.RawMessage("{\"path\":\"../../test/job-archive\"}"))
if err == nil { if err == nil {
t.Fatal(err) t.Fatal(err)
} }
@ -43,15 +43,16 @@ func TestInitNotExists(t *testing.T) {
func TestInit(t *testing.T) { func TestInit(t *testing.T) {
var fsa FsArchive var fsa FsArchive
err := fsa.Init(json.RawMessage("{\"path\":\"../../test/archive\"}")) version, err := fsa.Init(json.RawMessage("{\"path\":\"../../test/archive\"}"))
if err != nil { if err != nil {
t.Fatal(err) t.Fatal(err)
} }
if fsa.path != "../../test/archive" { if fsa.path != "../../test/archive" {
t.Fail() t.Fail()
} }
if version != 1 {
t.Fail()
}
if len(fsa.clusters) != 1 || fsa.clusters[0] != "emmy" { if len(fsa.clusters) != 1 || fsa.clusters[0] != "emmy" {
t.Fail() t.Fail()
} }
@ -59,7 +60,7 @@ func TestInit(t *testing.T) {
func TestLoadJobMetaInternal(t *testing.T) { func TestLoadJobMetaInternal(t *testing.T) {
var fsa FsArchive var fsa FsArchive
err := fsa.Init(json.RawMessage("{\"path\":\"../../test/archive\"}")) _, err := fsa.Init(json.RawMessage("{\"path\":\"../../test/archive\"}"))
if err != nil { if err != nil {
t.Fatal(err) t.Fatal(err)
} }
@ -82,7 +83,7 @@ func TestLoadJobMetaInternal(t *testing.T) {
func TestLoadJobMeta(t *testing.T) { func TestLoadJobMeta(t *testing.T) {
var fsa FsArchive var fsa FsArchive
err := fsa.Init(json.RawMessage("{\"path\":\"../../test/archive\"}")) _, err := fsa.Init(json.RawMessage("{\"path\":\"../../test/archive\"}"))
if err != nil { if err != nil {
t.Fatal(err) t.Fatal(err)
} }
@ -110,7 +111,7 @@ func TestLoadJobMeta(t *testing.T) {
func TestLoadJobData(t *testing.T) { func TestLoadJobData(t *testing.T) {
var fsa FsArchive var fsa FsArchive
err := fsa.Init(json.RawMessage("{\"path\":\"../../test/archive\"}")) _, err := fsa.Init(json.RawMessage("{\"path\":\"../../test/archive\"}"))
if err != nil { if err != nil {
t.Fatal(err) t.Fatal(err)
} }
@ -136,7 +137,7 @@ func TestLoadJobData(t *testing.T) {
func TestLoadCluster(t *testing.T) { func TestLoadCluster(t *testing.T) {
var fsa FsArchive var fsa FsArchive
err := fsa.Init(json.RawMessage("{\"path\":\"../../test/archive\"}")) _, err := fsa.Init(json.RawMessage("{\"path\":\"../../test/archive\"}"))
if err != nil { if err != nil {
t.Fatal(err) t.Fatal(err)
} }
@ -146,22 +147,22 @@ func TestLoadCluster(t *testing.T) {
t.Fatal(err) t.Fatal(err)
} }
if cfg.SubClusters[0].CoresPerSocket != 10 { if cfg.SubClusters[0].CoresPerSocket != 4 {
t.Fail() t.Fail()
} }
} }
func TestIter(t *testing.T) { func TestIter(t *testing.T) {
var fsa FsArchive var fsa FsArchive
err := fsa.Init(json.RawMessage("{\"path\":\"../../test/archive\"}")) _, err := fsa.Init(json.RawMessage("{\"path\":\"../../test/archive\"}"))
if err != nil { if err != nil {
t.Fatal(err) t.Fatal(err)
} }
for job := range fsa.Iter() { for job := range fsa.Iter(false) {
fmt.Printf("Job %d\n", job.JobID) fmt.Printf("Job %d\n", job.Meta.JobID)
if job.Cluster != "emmy" { if job.Meta.Cluster != "emmy" {
t.Fail() t.Fail()
} }
} }


@ -14,6 +14,8 @@ import (
type NodeList [][]interface { type NodeList [][]interface {
consume(input string) (next string, ok bool) consume(input string) (next string, ok bool)
limits() []map[string]int
prefix() string
} }
func (nl *NodeList) Contains(name string) bool { func (nl *NodeList) Contains(name string) bool {
@ -35,6 +37,44 @@ func (nl *NodeList) Contains(name string) bool {
return false return false
} }
func (nl *NodeList) PrintList() []string {
var out []string
for _, term := range *nl {
// Get String-Part first
prefix := term[0].prefix()
if len(term) == 1 { // If only String-Part in Term: Single Node Name -> Use as provided
out = append(out, prefix)
} else { // Else: Numeric start-end definition with x digits zeroPadded
limitArr := term[1].limits()
for _, inner := range limitArr {
for i := inner["start"]; i < inner["end"]+1; i++ {
if inner["zeroPadded"] == 1 {
out = append(out, fmt.Sprintf("%s%0*d", prefix, inner["digits"], i))
} else {
log.Error("node list: only zero-padded ranges are allowed")
}
}
}
}
}
return out
}
func (nl *NodeList) NodeCount() int {
var out int = 0
for _, term := range *nl {
if len(term) == 1 { // If only String-Part in Term: Single Node Name -> add one
out += 1
} else { // Else: Numeric start-end definition -> add difference + 1
limitArr := term[1].limits()
for _, inner := range limitArr {
out += (inner["end"] - inner["start"]) + 1
}
}
}
return out
}
type NLExprString string type NLExprString string
func (nle NLExprString) consume(input string) (next string, ok bool) { func (nle NLExprString) consume(input string) (next string, ok bool) {
@ -45,6 +85,16 @@ func (nle NLExprString) consume(input string) (next string, ok bool) {
return "", false return "", false
} }
func (nle NLExprString) limits() []map[string]int {
// Null implementation to fullfill interface requirement
l := make([]map[string]int, 0)
return l
}
func (nle NLExprString) prefix() string {
return string(nle)
}
type NLExprIntRanges []NLExprIntRange type NLExprIntRanges []NLExprIntRange
func (nles NLExprIntRanges) consume(input string) (next string, ok bool) { func (nles NLExprIntRanges) consume(input string) (next string, ok bool) {
@ -56,6 +106,21 @@ func (nles NLExprIntRanges) consume(input string) (next string, ok bool) {
return "", false return "", false
} }
func (nles NLExprIntRanges) limits() []map[string]int {
l := make([]map[string]int, 0)
for _, nle := range nles {
inner := nle.limits()
l = append(l, inner[0])
}
return l
}
func (nles NLExprIntRanges) prefix() string {
// Null implementation to fullfill interface requirement
var s string
return s
}
type NLExprIntRange struct { type NLExprIntRange struct {
start, end int64 start, end int64
zeroPadded bool zeroPadded bool
@ -89,6 +154,27 @@ func (nle NLExprIntRange) consume(input string) (next string, ok bool) {
return "", false return "", false
} }
func (nle NLExprIntRange) limits() []map[string]int {
l := make([]map[string]int, 0)
m := make(map[string]int)
m["start"] = int(nle.start)
m["end"] = int(nle.end)
m["digits"] = int(nle.digits)
if nle.zeroPadded == true {
m["zeroPadded"] = 1
} else {
m["zeroPadded"] = 0
}
l = append(l, m)
return l
}
func (nles NLExprIntRange) prefix() string {
// Null implementation to fullfill interface requirement
var s string
return s
}
func ParseNodeList(raw string) (NodeList, error) { func ParseNodeList(raw string) (NodeList, error) {
isLetter := func(r byte) bool { return ('a' <= r && r <= 'z') || ('A' <= r && r <= 'Z') } isLetter := func(r byte) bool { return ('a' <= r && r <= 'z') || ('A' <= r && r <= 'Z') }
isDigit := func(r byte) bool { return '0' <= r && r <= '9' } isDigit := func(r byte) bool { return '0' <= r && r <= '9' }
@ -117,6 +203,8 @@ func ParseNodeList(raw string) (NodeList, error) {
for _, rawterm := range rawterms { for _, rawterm := range rawterms {
exprs := []interface { exprs := []interface {
consume(input string) (next string, ok bool) consume(input string) (next string, ok bool)
limits() []map[string]int
prefix() string
}{} }{}
for i := 0; i < len(rawterm); i++ { for i := 0; i < len(rawterm); i++ {


@ -4,7 +4,10 @@
// license that can be found in the LICENSE file. // license that can be found in the LICENSE file.
package schema package schema
import "strconv" import (
"fmt"
"strconv"
)
type Accelerator struct { type Accelerator struct {
ID string `json:"id"` ID string `json:"id"`
@ -16,23 +19,27 @@ type Topology struct {
Node []int `json:"node"` Node []int `json:"node"`
Socket [][]int `json:"socket"` Socket [][]int `json:"socket"`
MemoryDomain [][]int `json:"memoryDomain"` MemoryDomain [][]int `json:"memoryDomain"`
Die [][]int `json:"die"` Die [][]*int `json:"die,omitempty"`
Core [][]int `json:"core"` Core [][]int `json:"core"`
Accelerators []*Accelerator `json:"accelerators"` Accelerators []*Accelerator `json:"accelerators,omitempty"`
}
type MetricValue struct {
Unit Unit `json:"unit"`
Value float64 `json:"value"`
} }
type SubCluster struct { type SubCluster struct {
Name string `json:"name"` Name string `json:"name"`
Nodes string `json:"nodes"` Nodes string `json:"nodes"`
NumberOfNodes int `json:"numberOfNodes"`
ProcessorType string `json:"processorType"` ProcessorType string `json:"processorType"`
SocketsPerNode int `json:"socketsPerNode"` SocketsPerNode int `json:"socketsPerNode"`
CoresPerSocket int `json:"coresPerSocket"` CoresPerSocket int `json:"coresPerSocket"`
ThreadsPerCore int `json:"threadsPerCore"` ThreadsPerCore int `json:"threadsPerCore"`
FlopRateScalar int `json:"flopRateScalar"` FlopRateScalar MetricValue `json:"flopRateScalar"`
FlopRateSimd int `json:"flopRateSimd"` FlopRateSimd MetricValue `json:"flopRateSimd"`
MemoryBandwidth int `json:"memoryBandwidth"` MemoryBandwidth MetricValue `json:"memoryBandwidth"`
Topology *Topology `json:"topology"` Topology Topology `json:"topology"`
} }
type SubClusterConfig struct { type SubClusterConfig struct {
@ -41,19 +48,20 @@ type SubClusterConfig struct {
Normal float64 `json:"normal"` Normal float64 `json:"normal"`
Caution float64 `json:"caution"` Caution float64 `json:"caution"`
Alert float64 `json:"alert"` Alert float64 `json:"alert"`
Remove bool `json:"remove"`
} }
type MetricConfig struct { type MetricConfig struct {
Name string `json:"name"` Name string `json:"name"`
Unit string `json:"unit"` Unit Unit `json:"unit"`
Scope MetricScope `json:"scope"` Scope MetricScope `json:"scope"`
Aggregation *string `json:"aggregation"` Aggregation string `json:"aggregation"`
Timestep int `json:"timestep"` Timestep int `json:"timestep"`
Peak *float64 `json:"peak"` Peak float64 `json:"peak"`
Normal *float64 `json:"normal"` Normal float64 `json:"normal"`
Caution *float64 `json:"caution"` Caution float64 `json:"caution"`
Alert *float64 `json:"alert"` Alert float64 `json:"alert"`
SubClusters []*SubClusterConfig `json:"subClusters"` SubClusters []*SubClusterConfig `json:"subClusters,omitempty"`
} }
type Cluster struct { type Cluster struct {
@ -152,6 +160,15 @@ func (topo *Topology) GetMemoryDomainsFromHWThreads(
return memDoms, exclusive return memDoms, exclusive
} }
// Temporary fix to convert back from int id to string id for accelerators
func (topo *Topology) GetAcceleratorID(id int) (string, error) {
if id < len(topo.Accelerators) {
return topo.Accelerators[id].ID, nil
} else {
return "", fmt.Errorf("Index %d out of range", id)
}
}
func (topo *Topology) GetAcceleratorIDs() ([]int, error) { func (topo *Topology) GetAcceleratorIDs() ([]int, error) {
accels := make([]int, 0) accels := make([]int, 0)
for _, accel := range topo.Accelerators { for _, accel := range topo.Accelerators {
@ -163,12 +180,3 @@ func (topo *Topology) GetAcceleratorIDs() ([]int, error) {
} }
return accels, nil return accels, nil
} }
func (topo *Topology) GetAcceleratorIndex(id string) (int, bool) {
for idx, accel := range topo.Accelerators {
if accel.ID == id {
return idx, true
}
}
return -1, false
}


@ -83,10 +83,10 @@ func (s *Series) MarshalJSON() ([]byte, error) {
buf = append(buf, s.Hostname...) buf = append(buf, s.Hostname...)
buf = append(buf, '"') buf = append(buf, '"')
if s.Id != nil { if s.Id != nil {
buf = append(buf, `,"id":`...) buf = append(buf, `,"id":"`...)
buf = strconv.AppendInt(buf, int64(*s.Id), 10) buf = append(buf, *s.Id...)
buf = append(buf, '"')
} }
if s.Statistics != nil {
buf = append(buf, `,"statistics":{"min":`...) buf = append(buf, `,"statistics":{"min":`...)
buf = strconv.AppendFloat(buf, s.Statistics.Min, 'f', 2, 64) buf = strconv.AppendFloat(buf, s.Statistics.Min, 'f', 2, 64)
buf = append(buf, `,"avg":`...) buf = append(buf, `,"avg":`...)
@ -94,7 +94,6 @@ func (s *Series) MarshalJSON() ([]byte, error) {
buf = append(buf, `,"max":`...) buf = append(buf, `,"max":`...)
buf = strconv.AppendFloat(buf, s.Statistics.Max, 'f', 2, 64) buf = strconv.AppendFloat(buf, s.Statistics.Max, 'f', 2, 64)
buf = append(buf, '}') buf = append(buf, '}')
}
buf = append(buf, `,"data":[`...) buf = append(buf, `,"data":[`...)
for i := 0; i < len(s.Data); i++ { for i := 0; i < len(s.Data); i++ {
if i != 0 { if i != 0 {
@ -110,3 +109,23 @@ func (s *Series) MarshalJSON() ([]byte, error) {
buf = append(buf, ']', '}') buf = append(buf, ']', '}')
return buf, nil return buf, nil
} }
func ConvertFloatToFloat64(s []Float) []float64 {
fp := make([]float64, len(s))
for i, val := range s {
fp[i] = float64(val)
}
return fp
}
func GetFloat64ToFloat(s []float64) []Float {
fp := make([]Float, len(s))
for i, val := range s {
fp[i] = Float(val)
}
return fp
}


@ -21,18 +21,18 @@ type BaseJob struct {
Project string `json:"project" db:"project" example:"abcd200"` // The unique identifier of a project Project string `json:"project" db:"project" example:"abcd200"` // The unique identifier of a project
Cluster string `json:"cluster" db:"cluster" example:"fritz"` // The unique identifier of a cluster Cluster string `json:"cluster" db:"cluster" example:"fritz"` // The unique identifier of a cluster
SubCluster string `json:"subCluster" db:"subcluster" example:"main"` // The unique identifier of a sub cluster SubCluster string `json:"subCluster" db:"subcluster" example:"main"` // The unique identifier of a sub cluster
Partition string `json:"partition" db:"partition" example:"main"` // The Slurm partition to which the job was submitted Partition *string `json:"partition,omitempty" db:"partition" example:"main"` // The Slurm partition to which the job was submitted
ArrayJobId int64 `json:"arrayJobId" db:"array_job_id" example:"123000"` // The unique identifier of an array job ArrayJobId *int64 `json:"arrayJobId,omitempty" db:"array_job_id" example:"123000"` // The unique identifier of an array job
NumNodes int32 `json:"numNodes" db:"num_nodes" example:"2" minimum:"1"` // Number of nodes used (Min > 0) NumNodes int32 `json:"numNodes" db:"num_nodes" example:"2" minimum:"1"` // Number of nodes used (Min > 0)
NumHWThreads int32 `json:"numHwthreads" db:"num_hwthreads" example:"20" minimum:"1"` // Number of HWThreads used (Min > 0) NumHWThreads *int32 `json:"numHwthreads,omitempty" db:"num_hwthreads" example:"20" minimum:"1"` // Number of HWThreads used (Min > 0)
NumAcc int32 `json:"numAcc" db:"num_acc" example:"2" minimum:"1"` // Number of accelerators used (Min > 0) NumAcc *int32 `json:"numAcc,omitempty" db:"num_acc" example:"2" minimum:"1"` // Number of accelerators used (Min > 0)
Exclusive int32 `json:"exclusive" db:"exclusive" example:"1" minimum:"0" maximum:"2"` // Specifies how nodes are shared: 0 - Shared among multiple jobs of multiple users, 1 - Job exclusive (Default), 2 - Shared among multiple jobs of same user Exclusive int32 `json:"exclusive" db:"exclusive" example:"1" minimum:"0" maximum:"2"` // Specifies how nodes are shared: 0 - Shared among multiple jobs of multiple users, 1 - Job exclusive (Default), 2 - Shared among multiple jobs of same user
MonitoringStatus int32 `json:"monitoringStatus" db:"monitoring_status" example:"1" minimum:"0" maximum:"3"` // State of monitoring system during job run: 0 - Disabled, 1 - Running or Archiving (Default), 2 - Archiving Failed, 3 - Archiving Successfull MonitoringStatus int32 `json:"monitoringStatus,omitempty" db:"monitoring_status" example:"1" minimum:"0" maximum:"3"` // State of monitoring system during job run: 0 - Disabled, 1 - Running or Archiving (Default), 2 - Archiving Failed, 3 - Archiving Successfull
SMT int32 `json:"smt" db:"smt" example:"4"` // SMT threads used by job SMT *int32 `json:"smt,omitempty" db:"smt" example:"4"` // SMT threads used by job
State JobState `json:"jobState" db:"job_state" example:"completed"` // Final state of job State JobState `json:"jobState" db:"job_state" example:"completed" enums:"completed,failed,cancelled,stopped,timeout,out_of_memory"` // Final state of job
Duration int32 `json:"duration" db:"duration" example:"43200" minimum:"1"` // Duration of job in seconds (Min > 0) Duration int32 `json:"duration" db:"duration" example:"43200" minimum:"1"` // Duration of job in seconds (Min > 0)
Walltime int64 `json:"walltime" db:"walltime" example:"86400" minimum:"1"` // Requested walltime of job in seconds (Min > 0) Walltime *int64 `json:"walltime,omitempty" db:"walltime" example:"86400" minimum:"1"` // Requested walltime of job in seconds (Min > 0)
Tags []*Tag `json:"tags"` // List of tags Tags []*Tag `json:"tags,omitempty"` // List of tags
RawResources []byte `json:"-" db:"resources"` // Resources used by job [As Bytes] RawResources []byte `json:"-" db:"resources"` // Resources used by job [As Bytes]
Resources []*Resource `json:"resources"` // Resources used by job Resources []*Resource `json:"resources"` // Resources used by job
RawMetaData []byte `json:"-" db:"meta_data"` // Additional information about the job [As Bytes] RawMetaData []byte `json:"-" db:"meta_data"` // Additional information about the job [As Bytes]
@ -89,11 +89,15 @@ var JobDefaults BaseJob = BaseJob{
MonitoringStatus: MonitoringStatusRunningOrArchiving, MonitoringStatus: MonitoringStatusRunningOrArchiving,
} }
type Unit struct {
Base string `json:"base"`
Prefix *string `json:"prefix,omitempty"`
}
// JobStatistics model // JobStatistics model
// @Description Specification for job metric statistics. // @Description Specification for job metric statistics.
type JobStatistics struct { type JobStatistics struct {
// Metric unit (see schema/unit.schema.json) Unit Unit `json:"unit" example:"GHz"`
Unit string `json:"unit" example:"GHz"`
Avg float64 `json:"avg" example:"2500" minimum:"0"` // Job metric average Avg float64 `json:"avg" example:"2500" minimum:"0"` // Job metric average
Min float64 `json:"min" example:"2000" minimum:"0"` // Job metric minimum Min float64 `json:"min" example:"2000" minimum:"0"` // Job metric minimum
Max float64 `json:"max" example:"3000" minimum:"0"` // Job metric maximum Max float64 `json:"max" example:"3000" minimum:"0"` // Job metric maximum
@ -102,6 +106,7 @@ type JobStatistics struct {
// Tag model // Tag model
// @Description Defines a tag using name and type. // @Description Defines a tag using name and type.
type Tag struct { type Tag struct {
// The unique DB identifier of a tag
// The unique DB identifier of a tag // The unique DB identifier of a tag
ID int64 `json:"id" db:"id"` ID int64 `json:"id" db:"id"`
Type string `json:"type" db:"tag_type" example:"Debug"` // Tag Type Type string `json:"type" db:"tag_type" example:"Debug"` // Tag Type


@ -15,17 +15,16 @@ import (
type JobData map[string]map[MetricScope]*JobMetric type JobData map[string]map[MetricScope]*JobMetric
type JobMetric struct { type JobMetric struct {
Unit string `json:"unit"` Unit Unit `json:"unit"`
Scope MetricScope `json:"scope"`
Timestep int `json:"timestep"` Timestep int `json:"timestep"`
Series []Series `json:"series"` Series []Series `json:"series"`
StatisticsSeries *StatsSeries `json:"statisticsSeries"` StatisticsSeries *StatsSeries `json:"statisticsSeries,omitempty"`
} }
type Series struct { type Series struct {
Hostname string `json:"hostname"` Hostname string `json:"hostname"`
Id *int `json:"id,omitempty"` Id *string `json:"id,omitempty"`
Statistics *MetricStatistics `json:"statistics"` Statistics MetricStatistics `json:"statistics"`
Data []Float `json:"data"` Data []Float `json:"data"`
} }
@ -218,17 +217,12 @@ func (jd *JobData) AddNodeScope(metric string) bool {
nodeJm := &JobMetric{ nodeJm := &JobMetric{
Unit: jm.Unit, Unit: jm.Unit,
Scope: MetricScopeNode,
Timestep: jm.Timestep, Timestep: jm.Timestep,
Series: make([]Series, 0, len(hosts)), Series: make([]Series, 0, len(hosts)),
} }
for hostname, series := range hosts { for hostname, series := range hosts {
min, sum, max := math.MaxFloat32, 0.0, -math.MaxFloat32 min, sum, max := math.MaxFloat32, 0.0, -math.MaxFloat32
for _, series := range series { for _, series := range series {
if series.Statistics == nil {
min, sum, max = math.NaN(), math.NaN(), math.NaN()
break
}
sum += series.Statistics.Avg sum += series.Statistics.Avg
min = math.Min(min, series.Statistics.Min) min = math.Min(min, series.Statistics.Min)
max = math.Max(max, series.Statistics.Max) max = math.Max(max, series.Statistics.Max)
@ -259,7 +253,7 @@ func (jd *JobData) AddNodeScope(metric string) bool {
nodeJm.Series = append(nodeJm.Series, Series{ nodeJm.Series = append(nodeJm.Series, Series{
Hostname: hostname, Hostname: hostname,
Statistics: &MetricStatistics{Min: min, Avg: sum / float64(len(series)), Max: max}, Statistics: MetricStatistics{Min: min, Avg: sum / float64(len(series)), Max: max},
Data: data, Data: data,
}) })
} }

View File

@ -21,7 +21,7 @@
}, },
"unit": { "unit": {
"description": "Metric unit", "description": "Metric unit",
"type": "string" "$ref": "embedfs://unit.schema.json"
}, },
"scope": { "scope": {
"description": "Native measurement resolution", "description": "Native measurement resolution",
@ -38,7 +38,22 @@
"sum", "sum",
"avg" "avg"
] ]
},
"peak": {
"description": "Metric peak threshold (Upper metric limit)",
"type": "number"
},
"normal": {
"description": "Metric normal threshold",
"type": "number"
},
"caution": {
"description": "Metric caution threshold (Suspicious but does not require immediate action)",
"type": "number"
},
"alert": {
"description": "Metric alert threshold (Requires immediate action)",
"type": "number"
}, },
"subClusters": { "subClusters": {
"description": "Array of cluster hardware partition metric thresholds", "description": "Array of cluster hardware partition metric thresholds",
@ -61,13 +76,13 @@
}, },
"alert": { "alert": {
"type": "number" "type": "number"
},
"remove": {
"type": "boolean"
} }
}, },
"required": [ "required": [
"name", "name"
"peak",
"caution",
"alert"
] ]
} }
} }
@ -76,7 +91,12 @@
"name", "name",
"unit", "unit",
"scope", "scope",
"timestep" "timestep",
"aggregation",
"peak",
"normal",
"caution",
"alert"
] ]
}, },
"minItems": 1 "minItems": 1
@ -109,15 +129,42 @@
}, },
"flopRateScalar": { "flopRateScalar": {
"description": "Theoretical node peak flop rate for scalar code in GFlops/s", "description": "Theoretical node peak flop rate for scalar code in GFlops/s",
"type": "integer" "type": "object",
"properties": {
"unit": {
"description": "Metric unit",
"$ref": "embedfs://unit.schema.json"
},
"value": {
"type": "number"
}
}
}, },
"flopRateSimd": { "flopRateSimd": {
"description": "Theoretical node peak flop rate for SIMD code in GFlops/s", "description": "Theoretical node peak flop rate for SIMD code in GFlops/s",
"type": "integer" "type": "object",
"properties": {
"unit": {
"description": "Metric unit",
"$ref": "embedfs://unit.schema.json"
},
"value": {
"type": "number"
}
}
}, },
"memoryBandwidth": { "memoryBandwidth": {
"description": "Theoretical node peak memory bandwidth in GB/s", "description": "Theoretical node peak memory bandwidth in GB/s",
"type": "integer" "type": "object",
"properties": {
"unit": {
"description": "Metric unit",
"$ref": "embedfs://unit.schema.json"
},
"value": {
"type": "number"
}
}
}, },
"nodes": { "nodes": {
"description": "Node list expression", "description": "Node list expression",
@ -215,6 +262,7 @@
}, },
"required": [ "required": [
"name", "name",
"nodes",
"topology", "topology",
"processorType", "processorType",
"socketsPerNode", "socketsPerNode",

View File

@ -86,8 +86,8 @@
}, },
"minProperties": 1 "minProperties": 1
}, },
"cpu_used": { "cpu_user": {
"description": "CPU active core utilization", "description": "CPU user active core utilization",
"properties": { "properties": {
"node": { "node": {
"$ref": "embedfs://job-metric-data.schema.json" "$ref": "embedfs://job-metric-data.schema.json"
@ -479,7 +479,7 @@
] ]
}, },
"required": [ "required": [
"cpu_used", "cpu_user",
"mem_used", "mem_used",
"flops_any", "flops_any",
"mem_bw", "mem_bw",

View File

@ -84,11 +84,6 @@
"type": "integer", "type": "integer",
"exclusiveMinimum": 0 "exclusiveMinimum": 0
}, },
"stopTime": {
"description": "Stop epoch time stamp in seconds",
"type": "integer",
"exclusiveMinimum": 0
},
"duration": { "duration": {
"description": "Duration of job in seconds", "description": "Duration of job in seconds",
"type": "integer", "type": "integer",
@ -198,8 +193,8 @@
"description": "Instructions executed per cycle", "description": "Instructions executed per cycle",
"$ref": "embedfs://job-metric-statistics.schema.json" "$ref": "embedfs://job-metric-statistics.schema.json"
}, },
"cpu_used": { "cpu_user": {
"description": "CPU active core utilization", "description": "CPU user active core utilization",
"$ref": "embedfs://job-metric-statistics.schema.json" "$ref": "embedfs://job-metric-statistics.schema.json"
}, },
"flops_dp": { "flops_dp": {
@ -331,7 +326,7 @@
} }
}, },
"required": [ "required": [
"cpu_used", "cpu_user",
"mem_used", "mem_used",
"flops_any", "flops_any",
"mem_bw" "mem_bw"
@ -343,13 +338,13 @@
"user", "user",
"project", "project",
"cluster", "cluster",
"subCluster",
"numNodes", "numNodes",
"exclusive", "exclusive",
"startTime", "startTime",
"jobState", "jobState",
"duration", "duration",
"resources", "resources",
"tags",
"statistics" "statistics"
] ]
} }

View File

@ -193,7 +193,7 @@
}, },
"data": { "data": {
"type": "array", "type": "array",
"items": { "contains": {
"type": "number", "type": "number",
"minimum": 0 "minimum": 0
}, },

View File

@ -5,7 +5,7 @@
"description": "Format specification for job metric units", "description": "Format specification for job metric units",
"type": "object", "type": "object",
"properties": { "properties": {
"base_unit": { "base": {
"description": "Metric base unit", "description": "Metric base unit",
"type": "string", "type": "string",
"enum": [ "enum": [
@ -15,7 +15,6 @@
"F/s", "F/s",
"CPI", "CPI",
"IPC", "IPC",
"load",
"Hz", "Hz",
"W", "W",
"°C", "°C",
@ -36,6 +35,6 @@
} }
}, },
"required": [ "required": [
"base_unit" "base"
] ]
} }

View File

@ -45,9 +45,29 @@ func TestValidateCluster(t *testing.T) {
"socketsPerNode": 2, "socketsPerNode": 2,
"coresPerSocket": 10, "coresPerSocket": 10,
"threadsPerCore": 2, "threadsPerCore": 2,
"flopRateScalar": 44, "flopRateScalar": {
"flopRateSimd": 704, "unit": {
"memoryBandwidth": 80, "prefix": "G",
"base": "F/s"
},
"value": 14
},
"flopRateSimd": {
"unit": {
"prefix": "G",
"base": "F/s"
},
"value": 112
},
"memoryBandwidth": {
"unit": {
"prefix": "G",
"base": "B/s"
},
"value": 24
},
"numberOfNodes": 70,
"nodes": "w11[27-45,49-63,69-72]",
"topology": { "topology": {
"node": [0,20,1,21,2,22,3,23,4,24,5,25,6,26,7,27,8,28,9,29,10,30,11,31,12,32,13,33,14,34,15,35,16,36,17,37,18,38,19,39], "node": [0,20,1,21,2,22,3,23,4,24,5,25,6,26,7,27,8,28,9,29,10,30,11,31,12,32,13,33,14,34,15,35,16,36,17,37,18,38,19,39],
"socket": [ "socket": [
@ -68,8 +88,13 @@ func TestValidateCluster(t *testing.T) {
{ {
"name": "cpu_load", "name": "cpu_load",
"scope": "hwthread", "scope": "hwthread",
"unit": "load", "unit": {"base": ""},
"timestep": 60 "aggregation": "avg",
"timestep": 60,
"peak": 4,
"normal": 2,
"caution": 1,
"alert": 0.25
} }
] ]
}`) }`)

View File

@ -1,6 +1,7 @@
# cc-units - A unit system for ClusterCockpit # cc-units - A unit system for ClusterCockpit
When working with metrics, the problem comes up that they may use different unit names but are in fact the same unit. There are a lot of real-world examples like 'kB' and 'Kbyte'. In [cc-metric-collector](https://github.com/ClusterCockpit/cc-metric-collector), the collectors read data from different sources which may use different units, or the programmer specifies a unit for a metric by hand. The cc-units system is not comparable with the SI unit system. If you are looking for a package for the SI units, see [here](https://pkg.go.dev/github.com/gurre/si). When working with metrics, the problem comes up that they may use different unit names but are in fact the same unit.
There are a lot of real-world examples like 'kB' and 'Kbyte'. In [cc-metric-collector](https://github.com/ClusterCockpit/cc-metric-collector), the collectors read data from different sources which may use different units, or the programmer specifies a unit for a metric by hand. The cc-units system is not comparable with the SI unit system. If you are looking for a package for the SI units, see [here](https://pkg.go.dev/github.com/gurre/si).
In order to enable unit comparison and conversion, the ccUnits package provides some helpers: In order to enable unit comparison and conversion, the ccUnits package provides some helpers:
```go ```go

View File

@ -39,7 +39,7 @@ var MeasuresMap map[Measure]MeasureData = map[Measure]MeasureData{
}, },
Flops: { Flops: {
Long: "Flops", Long: "Flops",
Short: "Flops", Short: "F",
Regex: "^([fF][lL]?[oO]?[pP]?[sS]?)", Regex: "^([fF][lL]?[oO]?[pP]?[sS]?)",
}, },
Percentage: { Percentage: {

View File

@ -1,6 +1,7 @@
package units package units
import ( import (
"math"
"regexp" "regexp"
) )
@ -172,3 +173,20 @@ func NewPrefix(prefix string) Prefix {
} }
return InvalidPrefix return InvalidPrefix
} }
func getExponent(p float64) int {
count := 0
for p > 1.0 {
p = p / 1000.0
count++
}
return count * 3
}
func NewPrefixFromFactor(op Prefix, e int) Prefix {
f := float64(op)
exp := math.Pow10(getExponent(f) - e)
return Prefix(exp)
}
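A short worked trace of the two helpers above, assuming the package exposes the usual decimal prefixes as Prefix constants (e.g. Mega = 1e6, Giga = 1e9; the exact identifiers are an assumption here):

```go
// getExponent(1e6) divides by 1000 twice, so it returns 2*3 = 6.
// NewPrefixFromFactor(Mega, -3) then computes 10^(6-(-3)) = 1e9,
// i.e. it shifts a Mega prefix up to Giga when the value shrank by 10^3.
p := NewPrefixFromFactor(Mega, -3)
fmt.Printf("%.0f\n", float64(p)) // 1000000000
```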

View File

@ -3,7 +3,10 @@ package units
import ( import (
"fmt" "fmt"
"math"
"strings" "strings"
"github.com/ClusterCockpit/cc-backend/pkg/schema"
) )
type unit struct { type unit struct {
@ -25,7 +28,9 @@ type Unit interface {
var INVALID_UNIT = NewUnit("foobar") var INVALID_UNIT = NewUnit("foobar")
// Valid checks whether a unit is a valid unit. A unit is valid if it has at least a prefix and a measure. The unit denominator is optional. // Valid checks whether a unit is a valid unit.
// A unit is valid if it has at least a prefix and a measure.
// The unit denominator is optional.
func (u *unit) Valid() bool { func (u *unit) Valid() bool {
return u.prefix != InvalidPrefix && u.measure != InvalidMeasure return u.prefix != InvalidPrefix && u.measure != InvalidMeasure
} }
@ -71,6 +76,90 @@ func (u *unit) getUnitDenominator() Measure {
return u.divMeasure return u.divMeasure
} }
func ConvertValue(v *float64, from string, to string) {
uf := NewUnit(from)
ut := NewUnit(to)
factor := float64(uf.getPrefix()) / float64(ut.getPrefix())
*v = math.Ceil(*v * factor)
}
func ConvertSeries(s []float64, from string, to string) {
uf := NewUnit(from)
ut := NewUnit(to)
factor := float64(uf.getPrefix()) / float64(ut.getPrefix())
for i := 0; i < len(s); i++ {
s[i] = math.Ceil(s[i] * factor)
}
}
func getNormalizationFactor(v float64) (float64, int) {
count := 0
scale := -3
if v > 1000.0 {
for v > 1000.0 {
v *= 1e-3
count++
}
} else {
for v < 1.0 {
v *= 1e3
count++
}
scale = 3
}
return math.Pow10(count * scale), count * scale
}
func NormalizeValue(v *float64, us string, nu *string) {
u := NewUnit(us)
f, e := getNormalizationFactor((*v))
*v = math.Ceil(*v * f)
u.setPrefix(NewPrefixFromFactor(u.getPrefix(), e))
*nu = u.Short()
}
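A rough usage sketch of the normalization helper (the numbers mirror the test cases further down):

```go
v := 103456.0 // e.g. a bandwidth reported in MB/s
var nu string
NormalizeValue(&v, "MB/s", &nu)
// getNormalizationFactor(103456) returns factor 1e-3 and exponent -3,
// so v becomes math.Ceil(103.456) = 104 and nu becomes "GB/s".
fmt.Println(v, nu) // 104 GB/s
```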
func NormalizeSeries(s []float64, avg float64, us string, nu *string) {
u := NewUnit(us)
f, e := getNormalizationFactor(avg)
for i := 0; i < len(s); i++ {
s[i] *= f
s[i] = math.Ceil(s[i])
}
u.setPrefix(NewPrefixFromFactor(u.getPrefix(), e))
fmt.Printf("Prefix: %e \n", u.getPrefix())
*nu = u.Short()
}
func ConvertUnitString(us string) schema.Unit {
var nu schema.Unit
if us == "CPI" ||
us == "IPC" ||
us == "load" ||
us == "" {
nu.Base = us
return nu
}
u := NewUnit(us)
p := u.getPrefix()
if p.Prefix() != "" {
prefix := p.Prefix()
nu.Prefix = &prefix
}
m := u.getMeasure()
d := u.getUnitDenominator()
if d.Short() != "inval" {
nu.Base = fmt.Sprintf("%s/%s", m.Short(), d.Short())
} else {
nu.Base = m.Short()
}
return nu
}
// GetPrefixPrefixFactor creates the default conversion function between two prefixes. // GetPrefixPrefixFactor creates the default conversion function between two prefixes.
// It returns a conversion function for the value. // It returns a conversion function for the value.
func GetPrefixPrefixFactor(in Prefix, out Prefix) func(value interface{}) interface{} { func GetPrefixPrefixFactor(in Prefix, out Prefix) func(value interface{}) interface{} {

View File

@ -2,6 +2,7 @@ package units
import ( import (
"fmt" "fmt"
"reflect"
"regexp" "regexp"
"testing" "testing"
) )
@ -199,3 +200,108 @@ func TestPrefixRegex(t *testing.T) {
t.Logf("succussfully compiled regex '%s' for prefix %s", data.Regex, data.Long) t.Logf("succussfully compiled regex '%s' for prefix %s", data.Regex, data.Long)
} }
} }
func TestConvertValue(t *testing.T) {
v := float64(103456)
ConvertValue(&v, "MB/s", "GB/s")
if v != 104.00 {
t.Errorf("Failed ConvertValue: Want 103.456, Got %f", v)
}
}
func TestConvertValueUp(t *testing.T) {
v := float64(10.3456)
ConvertValue(&v, "GB/s", "MB/s")
if v != 10346.00 {
t.Errorf("Failed ConvertValue: Want 10346.00, Got %f", v)
}
}
func TestConvertSeries(t *testing.T) {
s := []float64{2890031237, 23998994567, 389734042344, 390349424345}
r := []float64{3, 24, 390, 391}
ConvertSeries(s, "F/s", "GF/s")
if !reflect.DeepEqual(s, r) {
t.Errorf("Failed ConvertValue: Want 3, 24, 390, 391, Got %v", s)
}
}
func TestNormalizeValue(t *testing.T) {
var s string
v := float64(103456)
NormalizeValue(&v, "MB/s", &s)
if v != 104.00 {
t.Errorf("Failed ConvertValue: Want 104.00, Got %f", v)
}
if s != "GB/s" {
t.Errorf("Failed Prefix or unit: Want GB/s, Got %s", s)
}
}
func TestNormalizeValueNoPrefix(t *testing.T) {
var s string
v := float64(103458596)
NormalizeValue(&v, "F/s", &s)
if v != 104.00 {
t.Errorf("Failed ConvertValue: Want 104.00, Got %f", v)
}
if s != "MF/s" {
t.Errorf("Failed Prefix or unit: Want MF/s, Got %s", s)
}
}
func TestNormalizeValueKeep(t *testing.T) {
var s string
v := float64(345)
NormalizeValue(&v, "MB/s", &s)
if v != 345.00 {
t.Errorf("Failed ConvertValue: Want 104.00, Got %f", v)
}
if s != "MB/s" {
t.Errorf("Failed Prefix or unit: Want GB/s, Got %s", s)
}
}
func TestNormalizeValueDown(t *testing.T) {
var s string
v := float64(0.0004578)
NormalizeValue(&v, "GB/s", &s)
if v != 458.00 {
t.Errorf("Failed ConvertValue: Want 458.00, Got %f", v)
}
if s != "KB/s" {
t.Errorf("Failed Prefix or unit: Want KB/s, Got %s", s)
}
}
func TestNormalizeSeries(t *testing.T) {
var us string
s := []float64{2890031237, 23998994567, 389734042344, 390349424345}
r := []float64{3, 24, 390, 391}
total := 0.0
for _, number := range s {
total += number
}
avg := total / float64(len(s))
fmt.Printf("AVG: %e\n", avg)
NormalizeSeries(s, avg, "KB/s", &us)
if !reflect.DeepEqual(s, r) {
t.Errorf("Failed ConvertValue: Want 3, 24, 390, 391, Got %v", s)
}
if us != "TB/s" {
t.Errorf("Failed Prefix or unit: Want TB/s, Got %s", us)
}
}

File diff suppressed because one or more lines are too long

Binary file not shown.

View File

@ -1 +1,194 @@
{"exclusive":1,"jobId":1403244,"statistics":{"mem_bw":{"avg":63.57,"min":0,"unit":"GB/s","max":74.5},"rapl_power":{"avg":228.07,"min":0,"unit":"W","max":258.56},"ipc":{"unit":"IPC","max":0.510204081632653,"avg":1.53846153846154,"min":0.0},"clock":{"min":1380.32,"avg":2599.39,"unit":"MHz","max":2634.46},"cpu_load":{"avg":18.4,"min":0,"max":23.58,"unit":"load"},"flops_any":{"max":404.62,"unit":"GF/s","avg":225.59,"min":0},"flops_dp":{"max":0.24,"unit":"GF/s","min":0,"avg":0},"mem_used":{"min":1.55,"avg":27.84,"unit":"GB","max":37.5},"flops_sp":{"min":0,"avg":225.59,"max":404.62,"unit":"GF/s"}},"resources":[{"hostname":"e0102"},{"hostname":"e0103"},{"hostname":"e0105"},{"hostname":"e0106"},{"hostname":"e0107"},{"hostname":"e0108"},{"hostname":"e0114"},{"hostname":"e0320"},{"hostname":"e0321"},{"hostname":"e0325"},{"hostname":"e0404"},{"hostname":"e0415"},{"hostname":"e0433"},{"hostname":"e0437"},{"hostname":"e0439"},{"hostname":"e0501"},{"hostname":"e0503"},{"hostname":"e0505"},{"hostname":"e0506"},{"hostname":"e0512"},{"hostname":"e0513"},{"hostname":"e0514"},{"hostname":"e0653"},{"hostname":"e0701"},{"hostname":"e0716"},{"hostname":"e0727"},{"hostname":"e0728"},{"hostname":"e0925"},{"hostname":"e0926"},{"hostname":"e0929"},{"hostname":"e0934"},{"hostname":"e0951"}],"walltime":10,"jobState":"completed","cluster":"emmy","stopTime":1609009562,"user":"emmyUser6","startTime":1608923076,"partition":"work","tags":[],"project":"no project","numNodes":32,"duration":86486} {
"exclusive": 1,
"jobId": 1403244,
"statistics": {
"mem_bw": {
"avg": 63.57,
"min": 0,
"unit": {
"base": "B/s",
"prefix": "G"
},
"max": 74.5
},
"rapl_power": {
"avg": 228.07,
"min": 0,
"unit": {
"base": "W"
},
"max": 258.56
},
"ipc": {
"unit": {
"base": "IPC"
},
"max": 0.510204081632653,
"avg": 1.53846153846154,
"min": 0.0
},
"clock": {
"min": 1380.32,
"avg": 2599.39,
"unit": {
"base": "Hz",
"prefix": "M"
},
"max": 2634.46
},
"cpu_load": {
"avg": 18.4,
"min": 0,
"max": 23.58,
"unit": {
"base": "load"
}
},
"flops_any": {
"max": 404.62,
"unit": {
"base": "F/s",
"prefix": "G"
},
"avg": 225.59,
"min": 0
},
"flops_dp": {
"max": 0.24,
"unit": {
"base": "F/s",
"prefix": "G"
},
"min": 0,
"avg": 0
},
"mem_used": {
"min": 1.55,
"avg": 27.84,
"unit": {
"base": "B",
"prefix": "G"
},
"max": 37.5
},
"flops_sp": {
"min": 0,
"avg": 225.59,
"max": 404.62,
"unit": {
"base": "F/s",
"prefix": "G"
}
}
},
"resources": [
{
"hostname": "e0102"
},
{
"hostname": "e0103"
},
{
"hostname": "e0105"
},
{
"hostname": "e0106"
},
{
"hostname": "e0107"
},
{
"hostname": "e0108"
},
{
"hostname": "e0114"
},
{
"hostname": "e0320"
},
{
"hostname": "e0321"
},
{
"hostname": "e0325"
},
{
"hostname": "e0404"
},
{
"hostname": "e0415"
},
{
"hostname": "e0433"
},
{
"hostname": "e0437"
},
{
"hostname": "e0439"
},
{
"hostname": "e0501"
},
{
"hostname": "e0503"
},
{
"hostname": "e0505"
},
{
"hostname": "e0506"
},
{
"hostname": "e0512"
},
{
"hostname": "e0513"
},
{
"hostname": "e0514"
},
{
"hostname": "e0653"
},
{
"hostname": "e0701"
},
{
"hostname": "e0716"
},
{
"hostname": "e0727"
},
{
"hostname": "e0728"
},
{
"hostname": "e0925"
},
{
"hostname": "e0926"
},
{
"hostname": "e0929"
},
{
"hostname": "e0934"
},
{
"hostname": "e0951"
}
],
"walltime": 10,
"jobState": "completed",
"cluster": "emmy",
"subCluster": "haswell",
"stopTime": 1609009562,
"user": "emmyUser6",
"startTime": 1608923076,
"partition": "work",
"tags": [],
"project": "no project",
"numNodes": 32,
"duration": 86486
}

File diff suppressed because one or more lines are too long

Binary file not shown.

View File

@ -1 +1,194 @@
{"stopTime":1609387081,"resources":[{"hostname":"e0151"},{"hostname":"e0152"},{"hostname":"e0153"},{"hostname":"e0232"},{"hostname":"e0303"},{"hostname":"e0314"},{"hostname":"e0344"},{"hostname":"e0345"},{"hostname":"e0348"},{"hostname":"e0507"},{"hostname":"e0518"},{"hostname":"e0520"},{"hostname":"e0522"},{"hostname":"e0526"},{"hostname":"e0527"},{"hostname":"e0528"},{"hostname":"e0530"},{"hostname":"e0551"},{"hostname":"e0604"},{"hostname":"e0613"},{"hostname":"e0634"},{"hostname":"e0639"},{"hostname":"e0640"},{"hostname":"e0651"},{"hostname":"e0653"},{"hostname":"e0701"},{"hostname":"e0704"},{"hostname":"e0751"},{"hostname":"e0809"},{"hostname":"e0814"},{"hostname":"e0819"},{"hostname":"e0908"}],"walltime":10,"cluster":"emmy","jobState":"completed","statistics":{"clock":{"max":2634.9,"unit":"MHz","min":0,"avg":2597.8},"cpu_load":{"max":27.41,"unit":"load","min":0,"avg":18.39},"mem_bw":{"min":0,"avg":63.23,"unit":"GB/s","max":75.06},"ipc":{"min":0.0,"avg":1.53846153846154,"unit":"IPC","max":0.490196078431373},"rapl_power":{"min":0,"avg":227.32,"unit":"W","max":256.22},"mem_used":{"min":1.5,"avg":27.77,"unit":"GB","max":37.43},"flops_sp":{"unit":"GF/s","max":413.21,"min":0,"avg":224.41},"flops_dp":{"max":5.72,"unit":"GF/s","min":0,"avg":0},"flops_any":{"min":0,"avg":224.42,"max":413.21,"unit":"GF/s"}},"exclusive":1,"jobId":1404397,"tags":[],"partition":"work","project":"no project","user":"emmyUser6","startTime":1609300556,"duration":86525,"numNodes":32} {
"stopTime": 1609387081,
"resources": [
{
"hostname": "e0151"
},
{
"hostname": "e0152"
},
{
"hostname": "e0153"
},
{
"hostname": "e0232"
},
{
"hostname": "e0303"
},
{
"hostname": "e0314"
},
{
"hostname": "e0344"
},
{
"hostname": "e0345"
},
{
"hostname": "e0348"
},
{
"hostname": "e0507"
},
{
"hostname": "e0518"
},
{
"hostname": "e0520"
},
{
"hostname": "e0522"
},
{
"hostname": "e0526"
},
{
"hostname": "e0527"
},
{
"hostname": "e0528"
},
{
"hostname": "e0530"
},
{
"hostname": "e0551"
},
{
"hostname": "e0604"
},
{
"hostname": "e0613"
},
{
"hostname": "e0634"
},
{
"hostname": "e0639"
},
{
"hostname": "e0640"
},
{
"hostname": "e0651"
},
{
"hostname": "e0653"
},
{
"hostname": "e0701"
},
{
"hostname": "e0704"
},
{
"hostname": "e0751"
},
{
"hostname": "e0809"
},
{
"hostname": "e0814"
},
{
"hostname": "e0819"
},
{
"hostname": "e0908"
}
],
"walltime": 10,
"cluster": "emmy",
"subCluster": "haswell",
"jobState": "completed",
"statistics": {
"clock": {
"max": 2634.9,
"unit": {
"base": "Hz",
"prefix": "M"
},
"min": 0,
"avg": 2597.8
},
"cpu_load": {
"max": 27.41,
"min": 0,
"avg": 18.39,
"unit": {
"base": "load"
}
},
"mem_bw": {
"min": 0,
"avg": 63.23,
"unit": {
"base": "B/s",
"prefix": "G"
},
"max": 75.06
},
"ipc": {
"min": 0.0,
"avg": 1.53846153846154,
"unit": {
"base": "IPC"
},
"max": 0.490196078431373
},
"rapl_power": {
"min": 0,
"avg": 227.32,
"unit": {
"base": "W"
},
"max": 256.22
},
"mem_used": {
"min": 1.5,
"avg": 27.77,
"unit": {
"base": "B",
"prefix": "G"
},
"max": 37.43
},
"flops_sp": {
"unit": {
"base": "F/s",
"prefix": "G"
},
"max": 413.21,
"min": 0,
"avg": 224.41
},
"flops_dp": {
"max": 5.72,
"unit": {
"base": "F/s",
"prefix": "G"
},
"min": 0,
"avg": 0
},
"flops_any": {
"min": 0,
"avg": 224.42,
"max": 413.21,
"unit": {
"base": "F/s",
"prefix": "G"
}
}
},
"exclusive": 1,
"jobId": 1404397,
"tags": [],
"partition": "work",
"project": "no project",
"user": "emmyUser6",
"startTime": 1609300556,
"duration": 86525,
"numNodes": 32
}

File diff suppressed because it is too large

1
test/archive/version.txt Normal file
View File

@ -0,0 +1 @@
1

View File

@ -1,13 +1,14 @@
{ {
"cpu_used": { "cpu_used": {
"core": { "core": {
"unit": "cpu used", "unit": {
"scope": "core", "base": ""
},
"timestep": 30, "timestep": 30,
"series": [ "series": [
{ {
"hostname": "taurusi6489", "hostname": "taurusi6489",
"id": 0, "id": "0",
"statistics": { "statistics": {
"min": 0.09090909090909093, "min": 0.09090909090909093,
"avg": 0.9173553719008265, "avg": 0.9173553719008265,
@ -29,7 +30,7 @@
}, },
{ {
"hostname": "taurusi6489", "hostname": "taurusi6489",
"id": 1, "id": "1",
"statistics": { "statistics": {
"min": 0.03694102397926118, "min": 0.03694102397926118,
"avg": 0.045968409230268584, "avg": 0.045968409230268584,
@ -51,7 +52,7 @@
}, },
{ {
"hostname": "taurusi6490", "hostname": "taurusi6490",
"id": 10, "id": "10",
"statistics": { "statistics": {
"min": 0.10505319148936171, "min": 0.10505319148936171,
"avg": 0.9186411992263056, "avg": 0.9186411992263056,
@ -73,7 +74,7 @@
}, },
{ {
"hostname": "taurusi6490", "hostname": "taurusi6490",
"id": 11, "id": "11",
"statistics": { "statistics": {
"min": 0.05286048845767815, "min": 0.05286048845767815,
"avg": 0.07053823838706144, "avg": 0.07053823838706144,
@ -99,13 +100,14 @@
}, },
"ipc": { "ipc": {
"core": { "core": {
"unit": "IPC", "unit": {
"scope": "core", "base": "IPC"
},
"timestep": 60, "timestep": 60,
"series": [ "series": [
{ {
"hostname": "taurusi6489", "hostname": "taurusi6489",
"id": 0, "id": "0",
"statistics": { "statistics": {
"min": 1.3808406263195592, "min": 1.3808406263195592,
"avg": 1.3960848578375105, "avg": 1.3960848578375105,
@ -121,7 +123,7 @@
}, },
{ {
"hostname": "taurusi6489", "hostname": "taurusi6489",
"id": 1, "id": "1",
"statistics": { "statistics": {
"min": 0.30469640475234366, "min": 0.30469640475234366,
"avg": 0.8816944294664065, "avg": 0.8816944294664065,
@ -137,7 +139,7 @@
}, },
{ {
"hostname": "taurusi6490", "hostname": "taurusi6490",
"id": 10, "id": "10",
"statistics": { "statistics": {
"min": 1.3791232173760588, "min": 1.3791232173760588,
"avg": 1.3850247295506815, "avg": 1.3850247295506815,
@ -153,7 +155,7 @@
}, },
{ {
"hostname": "taurusi6490", "hostname": "taurusi6490",
"id": 11, "id": "11",
"statistics": { "statistics": {
"min": 0.6424094604392216, "min": 0.6424094604392216,
"avg": 0.9544442638400293, "avg": 0.9544442638400293,
@ -173,13 +175,14 @@
}, },
"flops_any": { "flops_any": {
"core": { "core": {
"unit": "F/s", "unit": {
"scope": "core", "base": "F/s"
},
"timestep": 60, "timestep": 60,
"series": [ "series": [
{ {
"hostname": "taurusi6489", "hostname": "taurusi6489",
"id": 0, "id": "0",
"statistics": { "statistics": {
"min": 0.0, "min": 0.0,
"avg": 184.2699002412084, "avg": 184.2699002412084,
@ -195,7 +198,7 @@
}, },
{ {
"hostname": "taurusi6489", "hostname": "taurusi6489",
"id": 1, "id": "1",
"statistics": { "statistics": {
"min": 0.13559227208748068, "min": 0.13559227208748068,
"avg": 273.2997868356056, "avg": 273.2997868356056,
@ -211,7 +214,7 @@
}, },
{ {
"hostname": "taurusi6490", "hostname": "taurusi6490",
"id": 10, "id": "10",
"statistics": { "statistics": {
"min": 0.0, "min": 0.0,
"avg": 1678.8419461262179, "avg": 1678.8419461262179,
@ -227,7 +230,7 @@
}, },
{ {
"hostname": "taurusi6490", "hostname": "taurusi6490",
"id": 11, "id": "11",
"statistics": { "statistics": {
"min": 45.28689133054866, "min": 45.28689133054866,
"avg": 609.6644949204072, "avg": 609.6644949204072,
@ -247,13 +250,14 @@
}, },
"mem_bw": { "mem_bw": {
"socket": { "socket": {
"unit": "B/s", "unit": {
"scope": "socket", "base": "B/s"
},
"timestep": 60, "timestep": 60,
"series": [ "series": [
{ {
"hostname": "taurusi6489", "hostname": "taurusi6489",
"id": 0, "id": "0",
"statistics": { "statistics": {
"min": 653671812.1661415, "min": 653671812.1661415,
"avg": 1637585527.5854635, "avg": 1637585527.5854635,
@ -269,7 +273,7 @@
}, },
{ {
"hostname": "taurusi6490", "hostname": "taurusi6490",
"id": 0, "id": "0",
"statistics": { "statistics": {
"min": 1520190251.61048, "min": 1520190251.61048,
"avg": 1572477682.3850098, "avg": 1572477682.3850098,
@ -289,8 +293,9 @@
}, },
"file_bw": { "file_bw": {
"node": { "node": {
"unit": "B/s", "unit": {
"scope": "node", "base": "B/s"
},
"timestep": 30, "timestep": 30,
"series": [ "series": [
{ {
@ -341,8 +346,9 @@
}, },
"net_bw": { "net_bw": {
"node": { "node": {
"unit": "B/s", "unit": {
"scope": "node", "base": "B/s"
},
"timestep": 30, "timestep": 30,
"series": [ "series": [
{ {
@ -393,8 +399,9 @@
}, },
"mem_used": { "mem_used": {
"node": { "node": {
"unit": "B", "unit": {
"scope": "node", "base": "B"
},
"timestep": 30, "timestep": 30,
"series": [ "series": [
{ {
@ -445,13 +452,14 @@
}, },
"cpu_power": { "cpu_power": {
"socket": { "socket": {
"unit": "W", "unit": {
"scope": "socket", "base": "W"
},
"timestep": 60, "timestep": 60,
"series": [ "series": [
{ {
"hostname": "taurusi6489", "hostname": "taurusi6489",
"id": 0, "id": "0",
"statistics": { "statistics": {
"min": 35.50647456742635, "min": 35.50647456742635,
"avg": 72.08313211552377, "avg": 72.08313211552377,
@ -467,7 +475,7 @@
}, },
{ {
"hostname": "taurusi6490", "hostname": "taurusi6490",
"id": 0, "id": "0",
"statistics": { "statistics": {
"min": 83.8466923147859, "min": 83.8466923147859,
"avg": 85.18572681122097, "avg": 85.18572681122097,

View File

@ -59,10 +59,6 @@ func setup(t *testing.T) *api.RestApi {
const testclusterJson = `{ const testclusterJson = `{
"name": "testcluster", "name": "testcluster",
"subClusters": [ "subClusters": [
{
"name": "sc0",
"nodes": "host120,host121,host122"
},
{ {
"name": "sc1", "name": "sc1",
"nodes": "host123,host124,host125", "nodes": "host123,host124,host125",
@ -70,9 +66,28 @@ func setup(t *testing.T) *api.RestApi {
"socketsPerNode": 1, "socketsPerNode": 1,
"coresPerSocket": 4, "coresPerSocket": 4,
"threadsPerCore": 2, "threadsPerCore": 2,
"flopRateScalar": 44, "flopRateScalar": {
"flopRateSimd": 704, "unit": {
"memoryBandwidth": 80, "prefix": "G",
"base": "F/s"
},
"value": 14
},
"flopRateSimd": {
"unit": {
"prefix": "G",
"base": "F/s"
},
"value": 112
},
"memoryBandwidth": {
"unit": {
"prefix": "G",
"base": "B/s"
},
"value": 24
},
"numberOfNodes": 70,
"topology": { "topology": {
"node": [0, 1, 2, 3, 4, 5, 6, 7], "node": [0, 1, 2, 3, 4, 5, 6, 7],
"socket": [[0, 1, 2, 3, 4, 5, 6, 7]], "socket": [[0, 1, 2, 3, 4, 5, 6, 7]],
@ -85,9 +100,10 @@ func setup(t *testing.T) *api.RestApi {
"metricConfig": [ "metricConfig": [
{ {
"name": "load_one", "name": "load_one",
"unit": "load", "unit": { "base": ""},
"scope": "node", "scope": "node",
"timestep": 60, "timestep": 60,
"aggregation": "avg",
"peak": 8, "peak": 8,
"normal": 0, "normal": 0,
"caution": 0, "caution": 0,
@ -95,19 +111,38 @@ func setup(t *testing.T) *api.RestApi {
} }
] ]
}` }`
const taurusclusterJson = `{ const taurusclusterJson = `{
"name": "taurus", "name": "taurus",
"SubClusters": [ "subClusters": [
{ {
"name": "haswell", "name": "haswell",
"processorType": "Intel Haswell", "processorType": "Intel Haswell",
"socketsPerNode": 2, "socketsPerNode": 2,
"coresPerSocket": 12, "coresPerSocket": 12,
"threadsPerCore": 1, "threadsPerCore": 1,
"flopRateScalar": 32, "flopRateScalar": {
"flopRateSimd": 512, "unit": {
"memoryBandwidth": 60, "prefix": "G",
"base": "F/s"
},
"value": 14
},
"flopRateSimd": {
"unit": {
"prefix": "G",
"base": "F/s"
},
"value": 112
},
"memoryBandwidth": {
"unit": {
"prefix": "G",
"base": "B/s"
},
"value": 24
},
"numberOfNodes": 70,
"nodes": "w11[27-45,49-63,69-72]",
"topology": { "topology": {
"node": [ 0, 1 ], "node": [ 0, 1 ],
"socket": [ "socket": [
@ -126,8 +161,13 @@ func setup(t *testing.T) *api.RestApi {
{ {
"name": "cpu_used", "name": "cpu_used",
"scope": "core", "scope": "core",
"unit": "", "unit": {"base": ""},
"aggregation": "avg",
"timestep": 30, "timestep": 30,
"peak": 1,
"normal": 0.5,
"caution": 2e-07,
"alert": 1e-07,
"subClusters": [ "subClusters": [
{ {
"name": "haswell", "name": "haswell",
@ -141,8 +181,13 @@ func setup(t *testing.T) *api.RestApi {
{ {
"name": "ipc", "name": "ipc",
"scope": "core", "scope": "core",
"unit": "IPC", "unit": { "base": "IPC"},
"aggregation": "avg",
"timestep": 60, "timestep": 60,
"peak": 2,
"normal": 1,
"caution": 0.1,
"alert": 0.5,
"subClusters": [ "subClusters": [
{ {
"name": "haswell", "name": "haswell",
@ -156,8 +201,13 @@ func setup(t *testing.T) *api.RestApi {
{ {
"name": "flops_any", "name": "flops_any",
"scope": "core", "scope": "core",
"unit": "F/s", "unit": { "base": "F/s"},
"aggregation": "sum",
"timestep": 60, "timestep": 60,
"peak": 40000000000,
"normal": 20000000000,
"caution": 30000000000,
"alert": 35000000000,
"subClusters": [ "subClusters": [
{ {
"name": "haswell", "name": "haswell",
@ -171,8 +221,13 @@ func setup(t *testing.T) *api.RestApi {
{ {
"name": "mem_bw", "name": "mem_bw",
"scope": "socket", "scope": "socket",
"unit": "B/s", "unit": { "base": "B/s"},
"aggregation": "sum",
"timestep": 60, "timestep": 60,
"peak": 58800000000,
"normal": 28800000000,
"caution": 38800000000,
"alert": 48800000000,
"subClusters": [ "subClusters": [
{ {
"name": "haswell", "name": "haswell",
@ -186,8 +241,13 @@ func setup(t *testing.T) *api.RestApi {
{ {
"name": "file_bw", "name": "file_bw",
"scope": "node", "scope": "node",
"unit": "B/s", "unit": { "base": "B/s"},
"aggregation": "sum",
"timestep": 30, "timestep": 30,
"peak": 20000000000,
"normal": 5000000000,
"caution": 9000000000,
"alert": 19000000000,
"subClusters": [ "subClusters": [
{ {
"name": "haswell", "name": "haswell",
@ -201,8 +261,13 @@ func setup(t *testing.T) *api.RestApi {
{ {
"name": "net_bw", "name": "net_bw",
"scope": "node", "scope": "node",
"unit": "B/s", "unit": { "base": "B/s"},
"timestep": 30, "timestep": 30,
"aggregation": "sum",
"peak": 7000000000,
"normal": 5000000000,
"caution": 6000000000,
"alert": 6500000000,
"subClusters": [ "subClusters": [
{ {
"name": "haswell", "name": "haswell",
@ -216,8 +281,13 @@ func setup(t *testing.T) *api.RestApi {
{ {
"name": "mem_used", "name": "mem_used",
"scope": "node", "scope": "node",
"unit": "B", "unit": {"base": "B"},
"aggregation": "sum",
"timestep": 30, "timestep": 30,
"peak": 32000000000,
"normal": 2000000000,
"caution": 31000000000,
"alert": 30000000000,
"subClusters": [ "subClusters": [
{ {
"name": "haswell", "name": "haswell",
@ -231,8 +301,13 @@ func setup(t *testing.T) *api.RestApi {
{ {
"name": "cpu_power", "name": "cpu_power",
"scope": "socket", "scope": "socket",
"unit": "W", "unit": {"base": "W"},
"aggregation": "sum",
"timestep": 60, "timestep": 60,
"peak": 100,
"normal": 80,
"caution": 90,
"alert": 90,
"subClusters": [ "subClusters": [
{ {
"name": "haswell", "name": "haswell",
@ -253,6 +328,10 @@ func setup(t *testing.T) *api.RestApi {
t.Fatal(err) t.Fatal(err)
} }
if err := os.WriteFile(filepath.Join(jobarchive, "version.txt"), []byte(fmt.Sprintf("%d", 1)), 0666); err != nil {
t.Fatal(err)
}
if err := os.Mkdir(filepath.Join(jobarchive, "testcluster"), 0777); err != nil { if err := os.Mkdir(filepath.Join(jobarchive, "testcluster"), 0777); err != nil {
t.Fatal(err) t.Fatal(err)
} }
@ -315,13 +394,12 @@ func TestRestApi(t *testing.T) {
testData := schema.JobData{ testData := schema.JobData{
"load_one": map[schema.MetricScope]*schema.JobMetric{ "load_one": map[schema.MetricScope]*schema.JobMetric{
schema.MetricScopeNode: { schema.MetricScopeNode: {
Unit: "load", Unit: schema.Unit{Base: "load"},
Scope: schema.MetricScopeNode,
Timestep: 60, Timestep: 60,
Series: []schema.Series{ Series: []schema.Series{
{ {
Hostname: "host123", Hostname: "host123",
Statistics: &schema.MetricStatistics{Min: 0.1, Avg: 0.2, Max: 0.3}, Statistics: schema.MetricStatistics{Min: 0.1, Avg: 0.2, Max: 0.3},
Data: []schema.Float{0.1, 0.1, 0.1, 0.2, 0.2, 0.2, 0.3, 0.3, 0.3}, Data: []schema.Float{0.1, 0.1, 0.1, 0.2, 0.2, 0.2, 0.3, 0.3, 0.3},
}, },
}, },
@ -392,15 +470,15 @@ func TestRestApi(t *testing.T) {
job.Project != "testproj" || job.Project != "testproj" ||
job.Cluster != "testcluster" || job.Cluster != "testcluster" ||
job.SubCluster != "sc1" || job.SubCluster != "sc1" ||
job.Partition != "default" || *job.Partition != "default" ||
job.Walltime != 3600 || *job.Walltime != 3600 ||
job.ArrayJobId != 0 || *job.ArrayJobId != 0 ||
job.NumNodes != 1 || job.NumNodes != 1 ||
job.NumHWThreads != 8 || *job.NumHWThreads != 8 ||
job.NumAcc != 0 || *job.NumAcc != 0 ||
job.Exclusive != 1 || job.Exclusive != 1 ||
job.MonitoringStatus != 1 || job.MonitoringStatus != 1 ||
job.SMT != 1 || *job.SMT != 1 ||
!reflect.DeepEqual(job.Resources, []*schema.Resource{{Hostname: "host123", HWThreads: []int{0, 1, 2, 3, 4, 5, 6, 7}}}) || !reflect.DeepEqual(job.Resources, []*schema.Resource{{Hostname: "host123", HWThreads: []int{0, 1, 2, 3, 4, 5, 6, 7}}}) ||
job.StartTime.Unix() != 123456789 { job.StartTime.Unix() != 123456789 {
t.Fatalf("unexpected job properties: %#v", job) t.Fatalf("unexpected job properties: %#v", job)
@ -488,13 +566,13 @@ func TestRestApi(t *testing.T) {
} }
}) })
t.Run("FailedJob", func(t *testing.T) { // t.Run("FailedJob", func(t *testing.T) {
subtestLetJobFail(t, restapi, r) // subtestLetJobFail(t, restapi, r)
}) // })
t.Run("ImportJob", func(t *testing.T) { // t.Run("ImportJob", func(t *testing.T) {
testImportFlag(t) // testImportFlag(t)
}) // })
} }
func subtestLetJobFail(t *testing.T, restapi *api.RestApi, r *mux.Router) { func subtestLetJobFail(t *testing.T, restapi *api.RestApi, r *mux.Router) {
@ -505,19 +583,15 @@ func subtestLetJobFail(t *testing.T, restapi *api.RestApi, r *mux.Router) {
"cluster": "testcluster", "cluster": "testcluster",
"partition": "default", "partition": "default",
"walltime": 3600, "walltime": 3600,
"arrayJobId": 0,
"numNodes": 1, "numNodes": 1,
"numAcc": 0,
"exclusive": 1, "exclusive": 1,
"monitoringStatus": 1, "monitoringStatus": 1,
"smt": 1, "smt": 1,
"tags": [],
"resources": [ "resources": [
{ {
"hostname": "host123" "hostname": "host123"
} }
], ],
"metaData": {},
"startTime": 12345678 "startTime": 12345678
}` }`
@ -596,4 +670,17 @@ func testImportFlag(t *testing.T) {
if len(data) != 8 { if len(data) != 8 {
t.Errorf("Job data length: Got %d, want 8", len(data)) t.Errorf("Job data length: Got %d, want 8", len(data))
} }
r := map[string]string{"mem_used": "GB", "net_bw": "KB/s",
"cpu_power": "W", "cpu_used": "",
"file_bw": "KB/s", "flops_any": "F/s",
"mem_bw": "GB/s", "ipc": "IPC"}
for name, scopes := range data {
for _, metric := range scopes {
if metric.Unit.Base != r[name] {
t.Errorf("Metric %s unit: Got %s, want %s", name, metric.Unit.Base, r[name])
}
}
}
} }

View File

@ -5,10 +5,8 @@
"cluster": "taurus", "cluster": "taurus",
"subCluster": "haswell", "subCluster": "haswell",
"partition": "haswell64", "partition": "haswell64",
"arrayJobId": 0,
"numNodes": 2, "numNodes": 2,
"numHwthreads": 4, "numHwthreads": 4,
"numAcc": 0,
"exclusive": 0, "exclusive": 0,
"startTime": 1635856524, "startTime": 1635856524,
"jobState": "completed", "jobState": "completed",
@ -18,11 +16,17 @@
"resources": [ "resources": [
{ {
"hostname": "taurusi6489", "hostname": "taurusi6489",
"hwthreads": [ 0, 1 ] "hwthreads": [
0,
1
]
}, },
{ {
"hostname": "taurusi6490", "hostname": "taurusi6490",
"hwthreads": [ 10, 11 ] "hwthreads": [
10,
11
]
} }
], ],
"statistics": { "statistics": {
@ -30,49 +34,65 @@
"min": 0.03694102397926118, "min": 0.03694102397926118,
"avg": 0.48812580468611544, "avg": 0.48812580468611544,
"max": 1.0000000000000002, "max": 1.0000000000000002,
"unit": "cpu used" "unit": {
"base": ""
}
}, },
"ipc": { "ipc": {
"min": 0.30469640475234366, "min": 0.30469640475234366,
"avg": 1.154312070173657, "avg": 1.154312070173657,
"max": 1.797623522191001, "max": 1.797623522191001,
"unit": "IPC" "unit": {
"base": "IPC"
}
}, },
"flops_any": { "flops_any": {
"min": 0.0, "min": 0.0,
"avg": 686.5190320308598, "avg": 686.5190320308598,
"max": 4346.591400350933, "max": 4346.591400350933,
"unit": "F/s" "unit": {
"base": "F/s"
}
}, },
"mem_bw": { "mem_bw": {
"min": 653671812.1661415, "min": 653671812.1661415,
"avg": 1605031604.9852366, "avg": 1605031604.9852366,
"max": 2614718291.9554267, "max": 2614718291.9554267,
"unit": "B/s" "unit": {
"base": "B/s"
}
}, },
"file_bw": { "file_bw": {
"min": 0.0, "min": 0.0,
"avg": 620592.5419124186, "avg": 620592.5419124186,
"max": 11559156.360352296, "max": 11559156.360352296,
"unit": "B/s" "unit": {
"base": "B/s"
}
}, },
"net_bw": { "net_bw": {
"min": 126779.89655880642, "min": 126779.89655880642,
"avg": 763101.082138246, "avg": 763101.082138246,
"max": 1916309.7075416835, "max": 1916309.7075416835,
"unit": "B/s" "unit": {
"base": "B/s"
}
}, },
"mem_used": { "mem_used": {
"min": 2779066368.0, "min": 2779066368.0,
"avg": 9647598685.09091, "avg": 9647598685.09091,
"max": 10202595328.0, "max": 10202595328.0,
"unit": "B" "unit": {
"base": "B"
}
}, },
"cpu_power": { "cpu_power": {
"min": 35.50647456742635, "min": 35.50647456742635,
"avg": 78.63442946337237, "avg": 78.63442946337237,
"max": 85.83909286117324, "max": 85.83909286117324,
"unit": "W" "unit": {
"base": "W"
}
} }
} }
} }

View File

@ -0,0 +1,36 @@
// Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
package main
import (
"encoding/json"
"flag"
"fmt"
"log"
"github.com/ClusterCockpit/cc-backend/internal/config"
"github.com/ClusterCockpit/cc-backend/pkg/archive"
)
func main() {
var srcPath, flagConfigFile string
flag.StringVar(&srcPath, "s", "./var/job-archive", "Specify the source job archive path. Default is ./var/job-archive")
flag.StringVar(&flagConfigFile, "config", "./config.json", "Specify alternative path to `config.json`")
flag.Parse()
archiveCfg := fmt.Sprintf("{\"kind\": \"file\",\"path\": \"%s\"}", srcPath)
config.Init(flagConfigFile)
config.Keys.Validate = true
if err := archive.Init(json.RawMessage(archiveCfg), false); err != nil {
log.Fatal(err)
}
ar := archive.GetHandle()
for job := range ar.Iter(true) {
log.Printf("Validate %s - %d\n", job.Meta.Cluster, job.Meta.JobID)
}
}

View File

@ -0,0 +1,65 @@
// Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
package main
import (
"github.com/ClusterCockpit/cc-backend/pkg/schema"
)
// type Accelerator struct {
// ID string `json:"id"`
// Type string `json:"type"`
// Model string `json:"model"`
// }
// type Topology struct {
// Node []int `json:"node"`
// Socket [][]int `json:"socket"`
// MemoryDomain [][]int `json:"memoryDomain"`
// Die [][]int `json:"die"`
// Core [][]int `json:"core"`
// Accelerators []*Accelerator `json:"accelerators"`
// }
type SubCluster struct {
Name string `json:"name"`
Nodes string `json:"nodes"`
NumberOfNodes int `json:"numberOfNodes"`
ProcessorType string `json:"processorType"`
SocketsPerNode int `json:"socketsPerNode"`
CoresPerSocket int `json:"coresPerSocket"`
ThreadsPerCore int `json:"threadsPerCore"`
FlopRateScalar int `json:"flopRateScalar"`
FlopRateSimd int `json:"flopRateSimd"`
MemoryBandwidth int `json:"memoryBandwidth"`
Topology *schema.Topology `json:"topology"`
}
// type SubClusterConfig struct {
// Name string `json:"name"`
// Peak float64 `json:"peak"`
// Normal float64 `json:"normal"`
// Caution float64 `json:"caution"`
// Alert float64 `json:"alert"`
// }
type MetricConfig struct {
Name string `json:"name"`
Unit string `json:"unit"`
Scope schema.MetricScope `json:"scope"`
Aggregation string `json:"aggregation"`
Timestep int `json:"timestep"`
Peak float64 `json:"peak"`
Normal float64 `json:"normal"`
Caution float64 `json:"caution"`
Alert float64 `json:"alert"`
SubClusters []*schema.SubClusterConfig `json:"subClusters"`
}
type Cluster struct {
Name string `json:"name"`
MetricConfig []*MetricConfig `json:"metricConfig"`
SubClusters []*SubCluster `json:"subClusters"`
}

View File

@ -0,0 +1,166 @@
// Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
package main
import (
"errors"
"fmt"
"github.com/ClusterCockpit/cc-backend/pkg/archive"
"github.com/ClusterCockpit/cc-backend/pkg/schema"
)
var Clusters []*Cluster
var nodeLists map[string]map[string]archive.NodeList
func initClusterConfig() error {
Clusters = []*Cluster{}
nodeLists = map[string]map[string]archive.NodeList{}
for _, c := range ar.GetClusters() {
cluster, err := ar.LoadClusterCfg(c)
if err != nil {
return err
}
if len(cluster.Name) == 0 ||
len(cluster.MetricConfig) == 0 ||
len(cluster.SubClusters) == 0 {
return errors.New("cluster.name, cluster.metricConfig and cluster.SubClusters should not be empty")
}
for _, mc := range cluster.MetricConfig {
if len(mc.Name) == 0 {
return errors.New("cluster.metricConfig.name should not be empty")
}
if mc.Timestep < 1 {
return errors.New("cluster.metricConfig.timestep should not be smaller than one")
}
// For backwards compatibility...
if mc.Scope == "" {
mc.Scope = schema.MetricScopeNode
}
if !mc.Scope.Valid() {
return errors.New("cluster.metricConfig.scope must be a valid scope ('node', 'scocket', ...)")
}
}
Clusters = append(Clusters, cluster)
nodeLists[cluster.Name] = make(map[string]archive.NodeList)
for _, sc := range cluster.SubClusters {
if sc.Nodes == "" {
continue
}
nl, err := archive.ParseNodeList(sc.Nodes)
if err != nil {
return fmt.Errorf("in %s/cluster.json: %w", cluster.Name, err)
}
nodeLists[cluster.Name][sc.Name] = nl
}
}
return nil
}
func GetCluster(cluster string) *Cluster {
for _, c := range Clusters {
if c.Name == cluster {
return c
}
}
return nil
}
func GetSubCluster(cluster, subcluster string) *SubCluster {
for _, c := range Clusters {
if c.Name == cluster {
for _, p := range c.SubClusters {
if p.Name == subcluster {
return p
}
}
}
}
return nil
}
func GetMetricConfig(cluster, metric string) *MetricConfig {
for _, c := range Clusters {
if c.Name == cluster {
for _, m := range c.MetricConfig {
if m.Name == metric {
return m
}
}
}
}
return nil
}
// AssignSubCluster sets the `job.subcluster` property of the job based
// on its cluster and resources.
func AssignSubCluster(job *BaseJob) error {
cluster := GetCluster(job.Cluster)
if cluster == nil {
return fmt.Errorf("unkown cluster: %#v", job.Cluster)
}
if job.SubCluster != "" {
for _, sc := range cluster.SubClusters {
if sc.Name == job.SubCluster {
return nil
}
}
return fmt.Errorf("already assigned subcluster %#v unkown (cluster: %#v)", job.SubCluster, job.Cluster)
}
if len(job.Resources) == 0 {
return fmt.Errorf("job without any resources/hosts")
}
host0 := job.Resources[0].Hostname
for sc, nl := range nodeLists[job.Cluster] {
if nl != nil && nl.Contains(host0) {
job.SubCluster = sc
return nil
}
}
if cluster.SubClusters[0].Nodes == "" {
job.SubCluster = cluster.SubClusters[0].Name
return nil
}
return fmt.Errorf("no subcluster found for cluster %#v and host %#v", job.Cluster, host0)
}
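A minimal sketch of how the migration tool would use this function (the cluster name and node list are taken from the taurus test configuration above; it assumes initClusterConfig has already populated nodeLists, and the hostname is a hypothetical example):

```go
job := BaseJob{
	Cluster:   "taurus",
	Resources: []*Resource{{Hostname: "w1128"}},
}
// "w1128" matches the node list "w11[27-45,49-63,69-72]" of the
// haswell subcluster, so the empty SubCluster field gets filled in.
if err := AssignSubCluster(&job); err != nil {
	fmt.Println(err)
}
fmt.Println(job.SubCluster) // haswell
```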
func GetSubClusterByNode(cluster, hostname string) (string, error) {
for sc, nl := range nodeLists[cluster] {
if nl != nil && nl.Contains(hostname) {
return sc, nil
}
}
c := GetCluster(cluster)
if c == nil {
return "", fmt.Errorf("unkown cluster: %#v", cluster)
}
if c.SubClusters[0].Nodes == "" {
return c.SubClusters[0].Name, nil
}
return "", fmt.Errorf("no subcluster found for cluster %#v and host %#v", cluster, hostname)
}

View File

@ -0,0 +1,109 @@
// Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
package main
import (
"errors"
"io"
"math"
"strconv"
)
// A custom float type is used so that (Un)MarshalJSON and
// (Un)MarshalGQL can be overloaded and NaN/null can be used.
// The default behaviour of putting every nullable value behind
// a pointer has a bigger overhead.
type Float float64
var NaN Float = Float(math.NaN())
var nullAsBytes []byte = []byte("null")
func (f Float) IsNaN() bool {
return math.IsNaN(float64(f))
}
// NaN will be serialized to `null`.
func (f Float) MarshalJSON() ([]byte, error) {
if f.IsNaN() {
return nullAsBytes, nil
}
return strconv.AppendFloat(make([]byte, 0, 10), float64(f), 'f', 2, 64), nil
}
// `null` will be unserialized to NaN.
func (f *Float) UnmarshalJSON(input []byte) error {
s := string(input)
if s == "null" {
*f = NaN
return nil
}
val, err := strconv.ParseFloat(s, 64)
if err != nil {
return err
}
*f = Float(val)
return nil
}
// UnmarshalGQL implements the graphql.Unmarshaler interface.
func (f *Float) UnmarshalGQL(v interface{}) error {
f64, ok := v.(float64)
if !ok {
return errors.New("invalid Float scalar")
}
*f = Float(f64)
return nil
}
// MarshalGQL implements the graphql.Marshaler interface.
// NaN will be serialized to `null`.
func (f Float) MarshalGQL(w io.Writer) {
if f.IsNaN() {
w.Write(nullAsBytes)
} else {
w.Write(strconv.AppendFloat(make([]byte, 0, 10), float64(f), 'f', 2, 64))
}
}
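A minimal sketch of the JSON round trip this type provides (assuming encoding/json and fmt are imported; not part of the file above):

```go
vals := []Float{1.5, NaN}
out, _ := json.Marshal(vals)
fmt.Println(string(out)) // [1.50,null]

var back []Float
_ = json.Unmarshal([]byte("[1.50,null]"), &back)
fmt.Println(back[1].IsNaN()) // true
```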
// Only used via REST-API, not via GraphQL.
// This uses a lot less allocations per series,
// but it turns out that the performance increase
// from using this is not that big.
func (s *Series) MarshalJSON() ([]byte, error) {
buf := make([]byte, 0, 512+len(s.Data)*8)
buf = append(buf, `{"hostname":"`...)
buf = append(buf, s.Hostname...)
buf = append(buf, '"')
if s.Id != nil {
buf = append(buf, `,"id":`...)
buf = strconv.AppendInt(buf, int64(*s.Id), 10)
}
if s.Statistics != nil {
buf = append(buf, `,"statistics":{"min":`...)
buf = strconv.AppendFloat(buf, s.Statistics.Min, 'f', 2, 64)
buf = append(buf, `,"avg":`...)
buf = strconv.AppendFloat(buf, s.Statistics.Avg, 'f', 2, 64)
buf = append(buf, `,"max":`...)
buf = strconv.AppendFloat(buf, s.Statistics.Max, 'f', 2, 64)
buf = append(buf, '}')
}
buf = append(buf, `,"data":[`...)
for i := 0; i < len(s.Data); i++ {
if i != 0 {
buf = append(buf, ',')
}
if s.Data[i].IsNaN() {
buf = append(buf, `null`...)
} else {
buf = strconv.AppendFloat(buf, float64(s.Data[i]), 'f', 2, 32)
}
}
buf = append(buf, ']', '}')
return buf, nil
}

View File

@ -0,0 +1,142 @@
// Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
package main
import (
"bufio"
"bytes"
"encoding/json"
"fmt"
"os"
"path/filepath"
"strconv"
"github.com/ClusterCockpit/cc-backend/pkg/log"
)
type FsArchiveConfig struct {
Path string `json:"path"`
}
type FsArchive struct {
path string
clusters []string
}
func getPath(
job *JobMeta,
rootPath string,
file string) string {
lvl1, lvl2 := fmt.Sprintf("%d", job.JobID/1000), fmt.Sprintf("%03d", job.JobID%1000)
return filepath.Join(
rootPath,
job.Cluster,
lvl1, lvl2,
strconv.FormatInt(job.StartTime, 10), file)
}
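An illustration of the directory layout getPath produces, using the emmy job from the test archive above (sketch only):

```go
job := JobMeta{StartTime: 1608923076}
job.JobID = 1403244
job.Cluster = "emmy"
// lvl1 = 1403244/1000 = 1403, lvl2 = 1403244%1000 = 244
fmt.Println(getPath(&job, "./var/job-archive", "meta.json"))
// var/job-archive/emmy/1403/244/1608923076/meta.json
```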
func loadJobMeta(filename string) (*JobMeta, error) {
f, err := os.Open(filename)
if err != nil {
log.Errorf("fsBackend loadJobMeta()- %v", err)
return &JobMeta{}, err
}
defer f.Close()
return DecodeJobMeta(bufio.NewReader(f))
}
func (fsa *FsArchive) Init(rawConfig json.RawMessage) error {
var config FsArchiveConfig
if err := json.Unmarshal(rawConfig, &config); err != nil {
log.Errorf("fsBackend Init()- %v", err)
return err
}
if config.Path == "" {
err := fmt.Errorf("fsBackend Init()- empty path")
log.Errorf("fsBackend Init()- %v", err)
return err
}
fsa.path = config.Path
entries, err := os.ReadDir(fsa.path)
if err != nil {
log.Errorf("fsBackend Init()- %v", err)
return err
}
for _, de := range entries {
fsa.clusters = append(fsa.clusters, de.Name())
}
return nil
}
func (fsa *FsArchive) Iter() <-chan *JobMeta {
ch := make(chan *JobMeta)
go func() {
clustersDir, err := os.ReadDir(fsa.path)
if err != nil {
log.Fatalf("Reading clusters failed: %s", err.Error())
}
for _, clusterDir := range clustersDir {
lvl1Dirs, err := os.ReadDir(filepath.Join(fsa.path, clusterDir.Name()))
if err != nil {
log.Fatalf("Reading jobs failed: %s", err.Error())
}
for _, lvl1Dir := range lvl1Dirs {
if !lvl1Dir.IsDir() {
// Could be the cluster.json file
continue
}
lvl2Dirs, err := os.ReadDir(filepath.Join(fsa.path, clusterDir.Name(), lvl1Dir.Name()))
if err != nil {
log.Fatalf("Reading jobs failed: %s", err.Error())
}
for _, lvl2Dir := range lvl2Dirs {
dirpath := filepath.Join(fsa.path, clusterDir.Name(), lvl1Dir.Name(), lvl2Dir.Name())
startTimeDirs, err := os.ReadDir(dirpath)
if err != nil {
log.Fatalf("Reading jobs failed: %s", err.Error())
}
for _, startTimeDir := range startTimeDirs {
if startTimeDir.IsDir() {
job, err := loadJobMeta(filepath.Join(dirpath, startTimeDir.Name(), "meta.json"))
if err != nil {
log.Errorf("in %s: %s", filepath.Join(dirpath, startTimeDir.Name()), err.Error())
} else {
ch <- job
}
}
}
}
}
}
close(ch)
}()
return ch
}
func (fsa *FsArchive) LoadClusterCfg(name string) (*Cluster, error) {
b, err := os.ReadFile(filepath.Join(fsa.path, name, "cluster.json"))
if err != nil {
log.Errorf("fsBackend LoadClusterCfg()- %v", err)
return &Cluster{}, err
}
return DecodeCluster(bytes.NewReader(b))
}
func (fsa *FsArchive) GetClusters() []string {
return fsa.clusters
}

View File

@ -0,0 +1,162 @@
// Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
package main
import (
"errors"
"fmt"
"io"
"time"
"github.com/ClusterCockpit/cc-backend/pkg/schema"
)
// Non-Swaggered Comment: BaseJob
// Non-Swaggered Comment: Common subset of Job and JobMeta. Use one of those, not this type directly.
type BaseJob struct {
// The unique identifier of a job
JobID int64 `json:"jobId" db:"job_id" example:"123000"`
User string `json:"user" db:"user" example:"abcd100h"` // The unique identifier of a user
Project string `json:"project" db:"project" example:"abcd200"` // The unique identifier of a project
Cluster string `json:"cluster" db:"cluster" example:"fritz"` // The unique identifier of a cluster
SubCluster string `json:"subCluster" db:"subcluster" example:"main"` // The unique identifier of a sub cluster
Partition *string `json:"partition" db:"partition" example:"main"` // The Slurm partition to which the job was submitted
ArrayJobId *int64 `json:"arrayJobId" db:"array_job_id" example:"123000"` // The unique identifier of an array job
NumNodes int32 `json:"numNodes" db:"num_nodes" example:"2" minimum:"1"` // Number of nodes used (Min > 0)
NumHWThreads *int32 `json:"numHwthreads" db:"num_hwthreads" example:"20" minimum:"1"` // Number of HWThreads used (Min > 0)
NumAcc *int32 `json:"numAcc" db:"num_acc" example:"2" minimum:"1"` // Number of accelerators used (Min > 0)
Exclusive int32 `json:"exclusive" db:"exclusive" example:"1" minimum:"0" maximum:"2"` // Specifies how nodes are shared: 0 - Shared among multiple jobs of multiple users, 1 - Job exclusive (Default), 2 - Shared among multiple jobs of same user
MonitoringStatus int32 `json:"monitoringStatus" db:"monitoring_status" example:"1" minimum:"0" maximum:"3"` // State of monitoring system during job run: 0 - Disabled, 1 - Running or Archiving (Default), 2 - Archiving Failed, 3 - Archiving Successful
SMT *int32 `json:"smt" db:"smt" example:"4"` // SMT threads used by job
State JobState `json:"jobState" db:"job_state" example:"completed" enums:"completed,failed,cancelled,stopped,timeout,out_of_memory"` // Final state of job
Duration int32 `json:"duration" db:"duration" example:"43200" minimum:"1"` // Duration of job in seconds (Min > 0)
Walltime *int64 `json:"walltime" db:"walltime" example:"86400" minimum:"1"` // Requested walltime of job in seconds (Min > 0)
Tags []*schema.Tag `json:"tags"` // List of tags
RawResources []byte `json:"-" db:"resources"` // Resources used by job [As Bytes]
Resources []*Resource `json:"resources"` // Resources used by job
RawMetaData []byte `json:"-" db:"meta_data"` // Additional information about the job [As Bytes]
MetaData map[string]string `json:"metaData"` // Additional information about the job
}
// Non-Swaggered Comment: Job
// Non-Swaggered Comment: This type is used as the GraphQL interface and using sqlx as a table row.
// Job model
// @Description Information of a HPC job.
type Job struct {
// The unique identifier of a job in the database
ID int64 `json:"id" db:"id"`
BaseJob
StartTimeUnix int64 `json:"-" db:"start_time" example:"1649723812"` // Start epoch time stamp in seconds
StartTime time.Time `json:"startTime"` // Start time as 'time.Time' data type
MemUsedMax float64 `json:"-" db:"mem_used_max"` // MemUsedMax as Float64
FlopsAnyAvg float64 `json:"-" db:"flops_any_avg"` // FlopsAnyAvg as Float64
MemBwAvg float64 `json:"-" db:"mem_bw_avg"` // MemBwAvg as Float64
LoadAvg float64 `json:"-" db:"load_avg"` // LoadAvg as Float64
NetBwAvg float64 `json:"-" db:"net_bw_avg"` // NetBwAvg as Float64
NetDataVolTotal float64 `json:"-" db:"net_data_vol_total"` // NetDataVolTotal as Float64
FileBwAvg float64 `json:"-" db:"file_bw_avg"` // FileBwAvg as Float64
FileDataVolTotal float64 `json:"-" db:"file_data_vol_total"` // FileDataVolTotal as Float64
}
// Non-Swaggered Comment: JobMeta
// Non-Swaggered Comment: When reading from the database or sending data via GraphQL, the start time can be in the much more
// Non-Swaggered Comment: convenient time.Time type. In the `meta.json` files, the start time is encoded as a unix epoch timestamp.
// Non-Swaggered Comment: This is why there is this struct, which contains all fields from the regular job struct, but "overwrites"
// Non-Swaggered Comment: the StartTime field with one of type int64.
// Non-Swaggered Comment: ID *int64 `json:"id,omitempty"` >> never used in the job-archive, only available via REST-API
// JobMeta model
// @Description Meta data information of a HPC job.
type JobMeta struct {
// The unique identifier of a job in the database
ID *int64 `json:"id,omitempty"`
BaseJob
StartTime int64 `json:"startTime" db:"start_time" example:"1649723812" minimum:"1"` // Start epoch time stamp in seconds (Min > 0)
Statistics map[string]JobStatistics `json:"statistics,omitempty"` // Metric statistics of job
}
const (
MonitoringStatusDisabled int32 = 0
MonitoringStatusRunningOrArchiving int32 = 1
MonitoringStatusArchivingFailed int32 = 2
MonitoringStatusArchivingSuccessful int32 = 3
)
var JobDefaults BaseJob = BaseJob{
Exclusive: 1,
MonitoringStatus: MonitoringStatusRunningOrArchiving,
}
// JobStatistics model
// @Description Specification for job metric statistics.
type JobStatistics struct {
// Metric unit (see schema/unit.schema.json)
Unit string `json:"unit" example:"GHz"`
Avg float64 `json:"avg" example:"2500" minimum:"0"` // Job metric average
Min float64 `json:"min" example:"2000" minimum:"0"` // Job metric minimum
Max float64 `json:"max" example:"3000" minimum:"0"` // Job metric maximum
}
// Tag model
// @Description Defines a tag using name and type.
type Tag struct {
// The unique DB identifier of a tag
ID int64 `json:"id" db:"id"`
Type string `json:"type" db:"tag_type" example:"Debug"` // Tag Type
Name string `json:"name" db:"tag_name" example:"Testjob"` // Tag Name
}
// Resource model
// @Description A resource used by a job
type Resource struct {
Hostname string `json:"hostname"` // Name of the host (= node)
HWThreads []int `json:"hwthreads,omitempty"` // List of OS processor ids
Accelerators []string `json:"accelerators,omitempty"` // List of accelerator device ids
Configuration string `json:"configuration,omitempty"` // The configuration options of the node
}
type JobState string
const (
JobStateRunning JobState = "running"
JobStateCompleted JobState = "completed"
JobStateFailed JobState = "failed"
JobStateCancelled JobState = "cancelled"
JobStateStopped JobState = "stopped"
JobStateTimeout JobState = "timeout"
JobStatePreempted JobState = "preempted"
JobStateOutOfMemory JobState = "out_of_memory"
)
func (e *JobState) UnmarshalGQL(v interface{}) error {
str, ok := v.(string)
if !ok {
return fmt.Errorf("enums must be strings")
}
*e = JobState(str)
if !e.Valid() {
return errors.New("invalid job state")
}
return nil
}
func (e JobState) MarshalGQL(w io.Writer) {
fmt.Fprintf(w, "\"%s\"", e)
}
func (e JobState) Valid() bool {
return e == JobStateRunning ||
e == JobStateCompleted ||
e == JobStateFailed ||
e == JobStateCancelled ||
e == JobStateStopped ||
e == JobStateTimeout ||
e == JobStatePreempted ||
e == JobStateOutOfMemory
}

View File

@ -0,0 +1,66 @@
// Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
package main
import (
"encoding/json"
"io"
"github.com/ClusterCockpit/cc-backend/pkg/schema"
)
func DecodeJobData(r io.Reader) (*JobData, error) {
var d JobData
if err := json.NewDecoder(r).Decode(&d); err != nil {
return nil, err
}
return &d, nil
}
func DecodeJobMeta(r io.Reader) (*JobMeta, error) {
var d JobMeta
if err := json.NewDecoder(r).Decode(&d); err != nil {
return nil, err
}
return &d, nil
}
func DecodeCluster(r io.Reader) (*Cluster, error) {
var c Cluster
if err := json.NewDecoder(r).Decode(&c); err != nil {
return nil, err
}
return &c, nil
}
func EncodeJobData(w io.Writer, d *schema.JobData) error {
// Sanitize parameters
if err := json.NewEncoder(w).Encode(d); err != nil {
return err
}
return nil
}
func EncodeJobMeta(w io.Writer, d *schema.JobMeta) error {
// Sanitize parameters
if err := json.NewEncoder(w).Encode(d); err != nil {
return err
}
return nil
}
func EncodeCluster(w io.Writer, c *schema.Cluster) error {
// Sanitize parameters
if err := json.NewEncoder(w).Encode(c); err != nil {
return err
}
return nil
}

View File

@ -0,0 +1,318 @@
// Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
package main
import (
"bufio"
"encoding/json"
"errors"
"flag"
"fmt"
"log"
"os"
"path/filepath"
"sync"
"github.com/ClusterCockpit/cc-backend/pkg/schema"
"github.com/ClusterCockpit/cc-backend/pkg/units"
)
const Version = 1
var ar FsArchive
func loadJobData(filename string) (*JobData, error) {
f, err := os.Open(filename)
if err != nil {
return &JobData{}, fmt.Errorf("fsBackend loadJobData()- %v", err)
}
defer f.Close()
return DecodeJobData(bufio.NewReader(f))
}
func deepCopyJobMeta(j *JobMeta) schema.JobMeta {
var jn schema.JobMeta
//required properties
jn.JobID = j.JobID
jn.User = j.User
jn.Project = j.Project
jn.Cluster = j.Cluster
jn.SubCluster = j.SubCluster
jn.NumNodes = j.NumNodes
jn.Exclusive = j.Exclusive
jn.StartTime = j.StartTime
jn.State = schema.JobState(j.State)
jn.Duration = j.Duration
for _, ro := range j.Resources {
var rn schema.Resource
rn.Hostname = ro.Hostname
rn.Configuration = ro.Configuration
hwt := make([]int, len(ro.HWThreads))
if ro.HWThreads != nil {
copy(hwt, ro.HWThreads)
}
rn.HWThreads = hwt
acc := make([]string, len(ro.Accelerators))
if ro.Accelerators != nil {
copy(acc, ro.Accelerators)
}
rn.Accelerators = acc
jn.Resources = append(jn.Resources, &rn)
}
jn.MetaData = make(map[string]string)
for k, v := range j.MetaData {
jn.MetaData[k] = v
}
jn.Statistics = make(map[string]schema.JobStatistics)
for k, v := range j.Statistics {
var sn schema.JobStatistics
sn.Avg = v.Avg
sn.Max = v.Max
sn.Min = v.Min
tmpUnit := units.ConvertUnitString(v.Unit)
if tmpUnit.Base == "inval" {
sn.Unit = schema.Unit{Base: ""}
} else {
sn.Unit = tmpUnit
}
jn.Statistics[k] = sn
}
//optional properties
jn.Partition = j.Partition
jn.ArrayJobId = j.ArrayJobId
jn.NumHWThreads = j.NumHWThreads
jn.NumAcc = j.NumAcc
jn.MonitoringStatus = j.MonitoringStatus
jn.SMT = j.SMT
jn.Walltime = j.Walltime
for _, t := range j.Tags {
jn.Tags = append(jn.Tags, t)
}
return jn
}
func deepCopyJobData(d *JobData, cluster string, subCluster string) *schema.JobData {
var dn = make(schema.JobData)
for k, v := range *d {
// fmt.Printf("Metric %s\n", k)
dn[k] = make(map[schema.MetricScope]*schema.JobMetric)
for mk, mv := range v {
// fmt.Printf("Scope %s\n", mk)
var mn schema.JobMetric
tmpUnit := units.ConvertUnitString(mv.Unit)
if tmpUnit.Base == "inval" {
mn.Unit = schema.Unit{Base: ""}
} else {
mn.Unit = tmpUnit
}
mn.Timestep = mv.Timestep
for _, v := range mv.Series {
var sn schema.Series
sn.Hostname = v.Hostname
if v.Id != nil {
var id = new(string)
if mk == schema.MetricScopeAccelerator {
s := GetSubCluster(cluster, subCluster)
var err error
*id, err = s.Topology.GetAcceleratorID(*v.Id)
if err != nil {
log.Fatal(err)
}
} else {
*id = fmt.Sprint(*v.Id)
}
sn.Id = id
}
if v.Statistics != nil {
sn.Statistics = schema.MetricStatistics{
Avg: v.Statistics.Avg,
Min: v.Statistics.Min,
Max: v.Statistics.Max}
}
sn.Data = make([]schema.Float, len(v.Data))
copy(sn.Data, v.Data)
mn.Series = append(mn.Series, sn)
}
dn[k][mk] = &mn
}
// fmt.Printf("FINISH %s\n", k)
}
return &dn
}
func deepCopyClusterConfig(co *Cluster) schema.Cluster {
var cn schema.Cluster
cn.Name = co.Name
for _, sco := range co.SubClusters {
var scn schema.SubCluster
scn.Name = sco.Name
scn.Nodes = sco.Nodes
scn.ProcessorType = sco.ProcessorType
scn.SocketsPerNode = sco.SocketsPerNode
scn.CoresPerSocket = sco.CoresPerSocket
scn.ThreadsPerCore = sco.ThreadsPerCore
var prefix = new(string)
*prefix = "G"
scn.FlopRateScalar = schema.MetricValue{
Unit: schema.Unit{Base: "F/s", Prefix: prefix},
Value: float64(sco.FlopRateScalar)}
scn.FlopRateSimd = schema.MetricValue{
Unit: schema.Unit{Base: "F/s", Prefix: prefix},
Value: float64(sco.FlopRateSimd)}
scn.MemoryBandwidth = schema.MetricValue{
Unit: schema.Unit{Base: "B/s", Prefix: prefix},
Value: float64(sco.MemoryBandwidth)}
scn.Topology = *sco.Topology
cn.SubClusters = append(cn.SubClusters, &scn)
}
for _, mco := range co.MetricConfig {
var mcn schema.MetricConfig
mcn.Name = mco.Name
mcn.Scope = mco.Scope
if mco.Aggregation == "" {
fmt.Println("Property aggregation missing! Please review file!")
mcn.Aggregation = "sum"
} else {
mcn.Aggregation = mco.Aggregation
}
mcn.Timestep = mco.Timestep
tmpUnit := units.ConvertUnitString(mco.Unit)
if tmpUnit.Base == "inval" {
mcn.Unit = schema.Unit{Base: ""}
} else {
mcn.Unit = tmpUnit
}
mcn.Peak = mco.Peak
mcn.Normal = mco.Normal
mcn.Caution = mco.Caution
mcn.Alert = mco.Alert
mcn.SubClusters = mco.SubClusters
cn.MetricConfig = append(cn.MetricConfig, &mcn)
}
return cn
}
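The "inval" fallback around units.ConvertUnitString appears three times above (deepCopyJobMeta, deepCopyJobData, deepCopyClusterConfig). A hypothetical helper, not part of the commit, could factor out that pattern:
// sanitizeUnit is a sketch of a shared helper; the name is made up for illustration.
func sanitizeUnit(old string) schema.Unit {
u := units.ConvertUnitString(old)
if u.Base == "inval" {
// unknown legacy unit string: fall back to an empty base unit
return schema.Unit{Base: ""}
}
return u
}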
func main() {
var srcPath string
var dstPath string
flag.StringVar(&srcPath, "s", "./var/job-archive", "Specify the source job archive path. Default is ./var/job-archive")
flag.StringVar(&dstPath, "d", "./var/job-archive-new", "Specify the destination job archive path. Default is ./var/job-archive-new")
flag.Parse()
if _, err := os.Stat(filepath.Join(srcPath, "version.txt")); !errors.Is(err, os.ErrNotExist) {
log.Fatal("Archive version exists!")
}
srcConfig := fmt.Sprintf("{\"path\": \"%s\"}", srcPath)
err := ar.Init(json.RawMessage(srcConfig))
if err != nil {
log.Fatal(err)
}
err = initClusterConfig()
if err != nil {
log.Fatal(err)
}
// setup new job archive
err = os.Mkdir(dstPath, 0750)
if err != nil {
log.Fatal(err)
}
for _, c := range Clusters {
path := fmt.Sprintf("%s/%s", dstPath, c.Name)
fmt.Println(path)
err = os.Mkdir(path, 0750)
if err != nil {
log.Fatal(err)
}
cn := deepCopyClusterConfig(c)
f, err := os.Create(fmt.Sprintf("%s/%s/cluster.json", dstPath, c.Name))
if err != nil {
log.Fatal(err)
}
if err := EncodeCluster(f, &cn); err != nil {
log.Fatal(err)
}
if err := f.Close(); err != nil {
log.Fatal(err)
}
}
var wg sync.WaitGroup
for job := range ar.Iter() {
// fmt.Printf("Job %d\n", job.JobID)
job := job
wg.Add(1)
go func() {
defer wg.Done()
path := getPath(job, dstPath, "meta.json")
err := os.MkdirAll(filepath.Dir(path), 0750) // declared locally so concurrent goroutines do not share the outer err
if err != nil {
log.Fatal(err)
}
f, err := os.Create(path)
if err != nil {
log.Fatal(err)
}
jmn := deepCopyJobMeta(job)
if err = EncodeJobMeta(f, &jmn); err != nil {
log.Fatal(err)
}
if err = f.Close(); err != nil {
log.Fatal(err)
}
f, err = os.Create(getPath(job, dstPath, "data.json"))
if err != nil {
log.Fatal(err)
}
var jd *JobData
jd, err = loadJobData(getPath(job, srcPath, "data.json"))
if err != nil {
log.Fatal(err)
}
jdn := deepCopyJobData(jd, job.Cluster, job.SubCluster)
if err := EncodeJobData(f, jdn); err != nil {
log.Fatal(err)
}
if err := f.Close(); err != nil {
log.Fatal(err)
}
}()
}
wg.Wait()
if err := os.WriteFile(filepath.Join(dstPath, "version.txt"), []byte(fmt.Sprintf("%d", Version)), 0644); err != nil {
log.Fatal(err)
}
}
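A minimal usage sketch, assuming the package is built into a binary named archive-migration (the flag names and default paths follow the definitions above):
./archive-migration -s ./var/job-archive -d ./var/job-archive-new
On success the converted archive is written to the destination directory together with a version.txt containing "1".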

View File

@ -0,0 +1,65 @@
// Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
package main
import (
"github.com/ClusterCockpit/cc-backend/pkg/schema"
)
type JobData map[string]map[schema.MetricScope]*JobMetric
type JobMetric struct {
Unit string `json:"unit"`
Scope schema.MetricScope `json:"scope"`
Timestep int `json:"timestep"`
Series []Series `json:"series"`
StatisticsSeries *StatsSeries `json:"statisticsSeries"`
}
type Series struct {
Hostname string `json:"hostname"`
Id *int `json:"id,omitempty"`
Statistics *MetricStatistics `json:"statistics"`
Data []schema.Float `json:"data"`
}
type MetricStatistics struct {
Avg float64 `json:"avg"`
Min float64 `json:"min"`
Max float64 `json:"max"`
}
type StatsSeries struct {
Mean []Float `json:"mean"`
Min []Float `json:"min"`
Max []Float `json:"max"`
Percentiles map[int][]Float `json:"percentiles,omitempty"`
}
// type MetricScope string
// const (
// MetricScopeInvalid MetricScope = "invalid_scope"
// MetricScopeNode MetricScope = "node"
// MetricScopeSocket MetricScope = "socket"
// MetricScopeMemoryDomain MetricScope = "memoryDomain"
// MetricScopeCore MetricScope = "core"
// MetricScopeHWThread MetricScope = "hwthread"
// MetricScopeAccelerator MetricScope = "accelerator"
// )
// var metricScopeGranularity map[MetricScope]int = map[MetricScope]int{
// MetricScopeNode: 10,
// MetricScopeSocket: 5,
// MetricScopeMemoryDomain: 3,
// MetricScopeCore: 2,
// MetricScopeHWThread: 1,
// MetricScopeAccelerator: 5, // Special/Randomly chosen
// MetricScopeInvalid: -1,
// }

View File

@ -1,9 +0,0 @@
// Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
package main
func main() {
}

View File

@ -30,8 +30,8 @@
let rooflineMaxY let rooflineMaxY
let colWidth let colWidth
let numBins = 50 let numBins = 50
const ccconfig = getContext('cc-config'), const ccconfig = getContext('cc-config')
metricConfig = getContext('metrics') const metricConfig = getContext('metrics')
let metricsInHistograms = ccconfig.analysis_view_histogramMetrics, let metricsInHistograms = ccconfig.analysis_view_histogramMetrics,
metricsInScatterplots = ccconfig.analysis_view_scatterPlotMetrics metricsInScatterplots = ccconfig.analysis_view_scatterPlotMetrics
@ -161,24 +161,29 @@
<Histogram <Histogram
width={colWidth - 25} height={300 * 0.5} width={colWidth - 25} height={300 * 0.5}
data={$statsQuery.data.topUsers.sort((a, b) => b.count - a.count).map(({ count }, idx) => ({ count, value: idx }))} data={$statsQuery.data.topUsers.sort((a, b) => b.count - a.count).map(({ count }, idx) => ({ count, value: idx }))}
label={(x) => x < $statsQuery.data.topUsers.length ? $statsQuery.data.topUsers[Math.floor(x)].name : '0'} /> label={(x) => x < $statsQuery.data.topUsers.length ? $statsQuery.data.topUsers[Math.floor(x)].name : 'No Users'}
ylabel="Node Hours [h]"/>
{/key} {/key}
</div> </div>
</div> </div>
<div class="col-3"> <div class="col-3">
{#key $statsQuery.data.stats[0].histDuration} {#key $statsQuery.data.stats[0].histDuration}
<h4>Walltime Distribution</h4> <h4>Duration Distribution</h4>
<Histogram <Histogram
width={colWidth - 25} height={300} width={colWidth - 25}
data={$statsQuery.data.stats[0].histDuration} /> data={$statsQuery.data.stats[0].histDuration}
xlabel="Current Runtimes [h]"
ylabel="Number of Jobs"/>
{/key} {/key}
</div> </div>
<div class="col-3"> <div class="col-3">
{#key $statsQuery.data.stats[0].histNumNodes} {#key $statsQuery.data.stats[0].histNumNodes}
<h4>Number of Nodes Distribution</h4> <h4>Number of Nodes Distribution</h4>
<Histogram <Histogram
width={colWidth - 25} height={300} width={colWidth - 25}
data={$statsQuery.data.stats[0].histNumNodes} /> data={$statsQuery.data.stats[0].histNumNodes}
xlabel="Allocated Nodes [#]"
ylabel="Number of Jobs" />
{/key} {/key}
</div> </div>
<div class="col-3"> <div class="col-3">
@ -189,7 +194,7 @@
{:else if $rooflineQuery.data && cluster} {:else if $rooflineQuery.data && cluster}
{#key $rooflineQuery.data} {#key $rooflineQuery.data}
<Roofline <Roofline
width={colWidth - 25} height={300} width={colWidth - 25}
tiles={$rooflineQuery.data.rooflineHeatmap} tiles={$rooflineQuery.data.rooflineHeatmap}
cluster={cluster.subClusters.length == 1 ? cluster.subClusters[0] : null} cluster={cluster.subClusters.length == 1 ? cluster.subClusters[0] : null}
maxY={rooflineMaxY} /> maxY={rooflineMaxY} />
@ -211,6 +216,7 @@
<Col> <Col>
<Card body> <Card body>
These histograms show the distribution of the averages of all jobs matching the filters. Each job/average is weighted by its node hours. These histograms show the distribution of the averages of all jobs matching the filters. Each job/average is weighted by its node hours.
Note that some metrics may be disabled for specific subclusters as per metricConfig and can therefore affect the shown average values.
</Card> </Card>
<br/> <br/>
</Col> </Col>
@ -224,12 +230,16 @@
$footprintsQuery.data.footprints.nodehours, $footprintsQuery.data.footprints.nodehours,
$footprintsQuery.data.footprints.metrics.find(f => f.metric == metric).data, numBins) }))} $footprintsQuery.data.footprints.metrics.find(f => f.metric == metric).data, numBins) }))}
itemsPerRow={ccconfig.plot_view_plotsPerRow}> itemsPerRow={ccconfig.plot_view_plotsPerRow}>
<h4>{item.metric} [{metricConfig(cluster.name, item.metric)?.unit}]</h4> <h4>Average Distribution of '{item.metric}'</h4>
<Histogram <Histogram
width={width} height={250} width={width} height={250}
min={item.min} max={item.max} min={item.min} max={item.max}
data={item.bins} label={item.label} /> data={item.bins}
label={item.label}
xlabel={`${item.metric} Average [${(metricConfig(cluster.name, item.metric)?.unit?.prefix ? metricConfig(cluster.name, item.metric)?.unit?.prefix : '') +
(metricConfig(cluster.name, item.metric)?.unit?.base ? metricConfig(cluster.name, item.metric)?.unit?.base : '')}]`}
ylabel="Node Hours [h]" />
</PlotTable> </PlotTable>
</Col> </Col>
</Row> </Row>
@ -238,6 +248,7 @@
<Col> <Col>
<Card body> <Card body>
Each circle represents one job. The size of a circle is proportional to its node hours. Darker circles mean multiple jobs have the same averages for the respective metrics. Each circle represents one job. The size of a circle is proportional to its node hours. Darker circles mean multiple jobs have the same averages for the respective metrics.
Note that some metrics may be disabled for specific subclusters as per metricConfig and can therefore affect the shown average values.
</Card> </Card>
<br/> <br/>
</Col> </Col>
@ -254,12 +265,18 @@
<ScatterPlot <ScatterPlot
width={width} height={250} color={"rgba(0, 102, 204, 0.33)"} width={width} height={250} color={"rgba(0, 102, 204, 0.33)"}
xLabel={`${item.m1} [${metricConfig(cluster.name, item.m1)?.unit}]`} xLabel={`${item.m1} [${(metricConfig(cluster.name, item.m1)?.unit?.prefix ? metricConfig(cluster.name, item.m1)?.unit?.prefix : '') +
yLabel={`${item.m2} [${metricConfig(cluster.name, item.m2)?.unit}]`} (metricConfig(cluster.name, item.m1)?.unit?.base ? metricConfig(cluster.name, item.m1)?.unit?.base : '')}]`}
yLabel={`${item.m2} [${(metricConfig(cluster.name, item.m2)?.unit?.prefix ? metricConfig(cluster.name, item.m2)?.unit?.prefix : '') +
(metricConfig(cluster.name, item.m2)?.unit?.base ? metricConfig(cluster.name, item.m2)?.unit?.base : '')}]`}
X={item.f1} Y={item.f2} S={$footprintsQuery.data.footprints.nodehours} /> X={item.f1} Y={item.f2} S={$footprintsQuery.data.footprints.nodehours} />
</PlotTable> </PlotTable>
</Col> </Col>
</Row> </Row>
{/if} {/if}
<style>
h4 {
text-align: center;
}
</style>

View File

@ -81,7 +81,7 @@
missingMetrics = metricNames.filter(metric => !metrics.some(jm => jm.name == metric)) missingMetrics = metricNames.filter(metric => !metrics.some(jm => jm.name == metric))
missingHosts = job.resources.map(({ hostname }) => ({ missingHosts = job.resources.map(({ hostname }) => ({
hostname: hostname, hostname: hostname,
metrics: metricNames.filter(metric => !metrics.some(jm => jm.metric.scope == 'node' && jm.metric.series.some(series => series.hostname == hostname))) metrics: metricNames.filter(metric => !metrics.some(jm => jm.scope == 'node' && jm.metric.series.some(series => series.hostname == hostname)))
})).filter(({ metrics }) => metrics.length > 0) })).filter(({ metrics }) => metrics.length > 0)
somethingMissing = missingMetrics.length > 0 || missingHosts.length > 0 somethingMissing = missingMetrics.length > 0 || missingHosts.length > 0
} }
@ -114,8 +114,8 @@
cluster={clusters cluster={clusters
.find(c => c.name == $initq.data.job.cluster).subClusters .find(c => c.name == $initq.data.job.cluster).subClusters
.find(sc => sc.name == $initq.data.job.subCluster)} .find(sc => sc.name == $initq.data.job.subCluster)}
flopsAny={$jobMetrics.data.jobMetrics.find(m => m.name == 'flops_any' && m.metric.scope == 'node')} flopsAny={$jobMetrics.data.jobMetrics.find(m => m.name == 'flops_any' && m.scope == 'node').metric}
memBw={$jobMetrics.data.jobMetrics.find(m => m.name == 'mem_bw' && m.metric.scope == 'node')} /> memBw={$jobMetrics.data.jobMetrics.find(m => m.name == 'mem_bw' && m.scope == 'node').metric} />
</Col> </Col>
{:else} {:else}
<Col></Col> <Col></Col>
@ -163,8 +163,9 @@
bind:this={plots[item.metric]} bind:this={plots[item.metric]}
on:more-loaded={({ detail }) => statsTable.moreLoaded(detail)} on:more-loaded={({ detail }) => statsTable.moreLoaded(detail)}
job={$initq.data.job} job={$initq.data.job}
metric={item.metric} metricName={item.metric}
scopes={item.data.map(x => x.metric)} rawData={item.data.map(x => x.metric)}
scopes={item.data.map(x => x.scope)}
width={width}/> width={width}/>
{:else} {:else}
<Card body color="warning">No data for <code>{item.metric}</code></Card> <Card body color="warning">No data for <code>{item.metric}</code></Card>

View File

@ -17,11 +17,15 @@
export let authlevel export let authlevel
export let roles export let roles
let filters, jobList, matchedJobs = null let filters = []
let jobList, matchedJobs = null
let sorting = { field: 'startTime', order: 'DESC' }, isSortingOpen = false, isMetricsSelectionOpen = false let sorting = { field: 'startTime', order: 'DESC' }, isSortingOpen = false, isMetricsSelectionOpen = false
let metrics = filterPresets.cluster let metrics = filterPresets.cluster
? ccconfig[`plot_list_selectedMetrics:${filterPresets.cluster}`] || ccconfig.plot_list_selectedMetrics ? ccconfig[`plot_list_selectedMetrics:${filterPresets.cluster}`] || ccconfig.plot_list_selectedMetrics
: ccconfig.plot_list_selectedMetrics : ccconfig.plot_list_selectedMetrics
let selectedCluster = filterPresets?.cluster ? filterPresets.cluster : null
$: selectedCluster = filters[0]?.cluster ? filters[0].cluster.eq : null
// The filterPresets are handled by the Filters component, // The filterPresets are handled by the Filters component,
// so we need to wait for it to be ready before we can start a query. // so we need to wait for it to be ready before we can start a query.
@ -58,7 +62,10 @@
<Filters <Filters
filterPresets={filterPresets} filterPresets={filterPresets}
bind:this={filters} bind:this={filters}
on:update={({ detail }) => jobList.update(detail.filters)} /> on:update={({ detail }) => {
filters = detail.filters
jobList.update(detail.filters)}
} />
</Col> </Col>
<Col xs="3" style="margin-left: auto;"> <Col xs="3" style="margin-left: auto;">
@ -84,7 +91,7 @@
bind:isOpen={isSortingOpen} /> bind:isOpen={isSortingOpen} />
<MetricSelection <MetricSelection
cluster={filterPresets.cluster} bind:cluster={selectedCluster}
configName="plot_list_selectedMetrics" configName="plot_list_selectedMetrics"
bind:metrics={metrics} bind:metrics={metrics}
bind:isOpen={isMetricsSelectionOpen} /> bind:isOpen={isMetricsSelectionOpen} />

View File

@ -5,19 +5,22 @@
import { fetchMetrics, minScope } from './utils' import { fetchMetrics, minScope } from './utils'
export let job export let job
export let metric export let metricName
export let scopes export let scopes
export let width export let width
export let rawData
const dispatch = createEventDispatcher() const dispatch = createEventDispatcher()
const cluster = getContext('clusters').find(cluster => cluster.name == job.cluster) const cluster = getContext('clusters').find(cluster => cluster.name == job.cluster)
const subCluster = cluster.subClusters.find(subCluster => subCluster.name == job.subCluster) const subCluster = cluster.subClusters.find(subCluster => subCluster.name == job.subCluster)
const metricConfig = cluster.metricConfig.find(metricConfig => metricConfig.name == metric) const metricConfig = cluster.metricConfig.find(metricConfig => metricConfig.name == metricName)
let selectedScope = minScope(scopes.map(s => s.scope)), selectedHost = null, plot, fetching = false, error = null let selectedHost = null, plot, fetching = false, error = null
let selectedScope = minScope(scopes)
let selectedScopeIndex = scopes.findIndex(s => s == selectedScope)
$: avaliableScopes = scopes.map(metric => metric.scope) $: avaliableScopes = scopes
$: data = scopes.find(metric => metric.scope == selectedScope) $: data = rawData[selectedScopeIndex]
$: series = data?.series.filter(series => selectedHost == null || series.hostname == selectedHost) $: series = data?.series.filter(series => selectedHost == null || series.hostname == selectedHost)
let from = null, to = null let from = null, to = null
@ -29,7 +32,7 @@
export async function loadMore() { export async function loadMore() {
fetching = true fetching = true
let response = await fetchMetrics(job, [metric], ["core"]) let response = await fetchMetrics(job, [metricName], ["core"])
fetching = false fetching = false
if (response.error) { if (response.error) {
@ -38,9 +41,9 @@
} }
for (let jm of response.data.jobMetrics) { for (let jm of response.data.jobMetrics) {
if (jm.metric.scope != "node") { if (jm.scope != "node") {
scopes.push(jm.metric) scopes.push(jm.metric)
selectedScope = jm.metric.scope selectedScope = jm.scope
dispatch('more-loaded', jm) dispatch('more-loaded', jm)
if (!avaliableScopes.includes(selectedScope)) if (!avaliableScopes.includes(selectedScope))
avaliableScopes = [...avaliableScopes, selectedScope] avaliableScopes = [...avaliableScopes, selectedScope]
@ -52,7 +55,8 @@
</script> </script>
<InputGroup> <InputGroup>
<InputGroupText style="min-width: 150px;"> <InputGroupText style="min-width: 150px;">
{metric} ({metricConfig?.unit}) {metricName} ({(metricConfig?.unit?.prefix ? metricConfig.unit.prefix : '') +
(metricConfig?.unit?.base ? metricConfig.unit.base : '')})
</InputGroupText> </InputGroupText>
<select class="form-select" bind:value={selectedScope}> <select class="form-select" bind:value={selectedScope}>
{#each avaliableScopes as scope} {#each avaliableScopes as scope}
@ -82,7 +86,7 @@
width={width} height={300} width={width} height={300}
cluster={cluster} subCluster={subCluster} cluster={cluster} subCluster={subCluster}
timestep={data.timestep} timestep={data.timestep}
scope={selectedScope} metric={metric} scope={selectedScope} metric={metricName}
series={series} /> series={series} />
{/if} {/if}
{/key} {/key}

View File

@ -95,7 +95,7 @@
<Modal isOpen={isOpen} toggle={() => (isOpen = !isOpen)}> <Modal isOpen={isOpen} toggle={() => (isOpen = !isOpen)}>
<ModalHeader> <ModalHeader>
Configure columns Configure columns (Metric availability shown)
</ModalHeader> </ModalHeader>
<ModalBody> <ModalBody>
<ListGroup> <ListGroup>
@ -113,9 +113,26 @@
{/if} {/if}
{metric} {metric}
<span style="float: right;"> <span style="float: right;">
{cluster == null ? clusters {cluster == null ?
clusters // No single cluster specified: List Clusters with Metric
.filter(cluster => cluster.metricConfig.find(m => m.name == metric) != null) .filter(cluster => cluster.metricConfig.find(m => m.name == metric) != null)
.map(cluster => cluster.name).join(', ') : ''} .map(cluster => cluster.name).join(', ') :
clusters // Single cluster requested: list subclusters that do not have the metric's remove flag set
.filter(cluster => cluster.metricConfig.find(m => m.name == metric) != null)
.map(function(cluster) {
let scNames = cluster.subClusters.map(sc => sc.name)
scNames.forEach(function(scName){
let met = cluster.metricConfig.find(m => m.name == metric)
let msc = met.subClusters.find(msc => msc.name == scName)
if (msc != null) {
if (msc.remove == true) {
scNames = scNames.filter(scn => scn != msc.name)
}
}
})
return scNames
})
.join(', ')}
</span> </span>
</li> </li>
{/each} {/each}

View File

@ -20,16 +20,19 @@
from.setMinutes(from.getMinutes() - 30) from.setMinutes(from.getMinutes() - 30)
} }
const ccconfig = getContext('cc-config'), clusters = getContext('clusters') const ccconfig = getContext('cc-config')
const clusters = getContext('clusters')
const nodesQuery = operationStore(`query($cluster: String!, $nodes: [String!], $from: Time!, $to: Time!) { const nodesQuery = operationStore(`query($cluster: String!, $nodes: [String!], $from: Time!, $to: Time!) {
nodeMetrics(cluster: $cluster, nodes: $nodes, from: $from, to: $to) { nodeMetrics(cluster: $cluster, nodes: $nodes, from: $from, to: $to) {
host, subCluster host
subCluster
metrics { metrics {
name, name
scope
metric { metric {
timestep timestep
scope unit { base, prefix }
series { series {
statistics { min, avg, max } statistics { min, avg, max }
data data
@ -46,6 +49,17 @@
$: $nodesQuery.variables = { cluster, nodes: [hostname], from: from.toISOString(), to: to.toISOString() } $: $nodesQuery.variables = { cluster, nodes: [hostname], from: from.toISOString(), to: to.toISOString() }
let metricUnits = {}
$: if ($nodesQuery.data) {
for (let metric of clusters.find(c => c.name == cluster).metricConfig) {
if (metric.unit.prefix || metric.unit.base) {
metricUnits[metric.name] = '(' + (metric.unit.prefix ? metric.unit.prefix : '') + (metric.unit.base ? metric.unit.base : '') + ')'
} else { // If no unit defined: Omit Unit Display
metricUnits[metric.name] = ''
}
}
}
query(nodesQuery) query(nodesQuery)
// $: console.log($nodesQuery?.data?.nodeMetrics[0].metrics) // $: console.log($nodesQuery?.data?.nodeMetrics[0].metrics)
@ -83,7 +97,7 @@
let:width let:width
itemsPerRow={ccconfig.plot_view_plotsPerRow} itemsPerRow={ccconfig.plot_view_plotsPerRow}
items={$nodesQuery.data.nodeMetrics[0].metrics.sort((a, b) => a.name.localeCompare(b.name))}> items={$nodesQuery.data.nodeMetrics[0].metrics.sort((a, b) => a.name.localeCompare(b.name))}>
<h4 style="text-align: center;">{item.name}</h4> <h4 style="text-align: center;">{item.name} {metricUnits[item.name]}</h4>
<MetricPlot <MetricPlot
width={width} height={300} metric={item.name} timestep={item.metric.timestep} width={width} height={300} metric={item.name} timestep={item.metric.timestep}
cluster={clusters.find(c => c.name == cluster)} subCluster={$nodesQuery.data.nodeMetrics[0].subCluster} cluster={clusters.find(c => c.name == cluster)} subCluster={$nodesQuery.data.nodeMetrics[0].subCluster}

View File

@ -11,7 +11,7 @@
const allMetrics = [...new Set(jobMetrics.map(m => m.name))].sort(), const allMetrics = [...new Set(jobMetrics.map(m => m.name))].sort(),
scopesForMetric = (metric) => jobMetrics scopesForMetric = (metric) => jobMetrics
.filter(jm => jm.name == metric) .filter(jm => jm.name == metric)
.map(jm => jm.metric.scope) .map(jm => jm.scope)
let hosts = job.resources.map(r => r.hostname).sort(), let hosts = job.resources.map(r => r.hostname).sort(),
selectedScopes = {}, selectedScopes = {},
@ -40,7 +40,7 @@
s.active = true s.active = true
} }
let series = jobMetrics.find(jm => jm.name == metric && jm.metric.scope == 'node')?.metric.series let series = jobMetrics.find(jm => jm.name == metric && jm.scope == 'node')?.metric.series
sorting = {...sorting} sorting = {...sorting}
hosts = hosts.sort((h1, h2) => { hosts = hosts.sort((h1, h2) => {
let s1 = series.find(s => s.hostname == h1)?.statistics let s1 = series.find(s => s.hostname == h1)?.statistics

View File

@ -5,7 +5,7 @@
export let jobMetrics export let jobMetrics
$: series = jobMetrics $: series = jobMetrics
.find(jm => jm.name == metric && jm.metric.scope == scope) .find(jm => jm.name == metric && jm.scope == scope)
?.metric.series.filter(s => s.hostname == host && s.statistics != null) ?.metric.series.filter(s => s.hostname == host && s.statistics != null)
</script> </script>

View File

@ -2,8 +2,8 @@
import Refresher from './joblist/Refresher.svelte' import Refresher from './joblist/Refresher.svelte'
import Roofline, { transformPerNodeData } from './plots/Roofline.svelte' import Roofline, { transformPerNodeData } from './plots/Roofline.svelte'
import Histogram from './plots/Histogram.svelte' import Histogram from './plots/Histogram.svelte'
import { Row, Col, Spinner, Card, Table, Progress } from 'sveltestrap' import { Row, Col, Spinner, Card, CardHeader, CardTitle, CardBody, Table, Progress, Icon } from 'sveltestrap'
import { init } from './utils.js' import { init, formatNumber } from './utils.js'
import { operationStore, query } from '@urql/svelte' import { operationStore, query } from '@urql/svelte'
const { query: initq } = init() const { query: initq } = init()
@ -15,13 +15,14 @@
let from = new Date(Date.now() - 5 * 60 * 1000), to = new Date(Date.now()) let from = new Date(Date.now() - 5 * 60 * 1000), to = new Date(Date.now())
const mainQuery = operationStore(`query($cluster: String!, $filter: [JobFilter!]!, $metrics: [String!], $from: Time!, $to: Time!) { const mainQuery = operationStore(`query($cluster: String!, $filter: [JobFilter!]!, $metrics: [String!], $from: Time!, $to: Time!) {
nodeMetrics(cluster: $cluster, metrics: $metrics, from: $from, to: $to) { nodeMetrics(cluster: $cluster, metrics: $metrics, from: $from, to: $to) {
host, host
subCluster, subCluster
metrics { metrics {
name, name
metric {
scope scope
timestep, metric {
timestep
unit { base, prefix }
series { data } series { data }
} }
} }
@ -47,20 +48,27 @@
? sum + (node.metrics.find(m => m.name == metric)?.metric.series.reduce((sum, series) => sum + series.data[series.data.length - 1], 0) || 0) ? sum + (node.metrics.find(m => m.name == metric)?.metric.series.reduce((sum, series) => sum + series.data[series.data.length - 1], 0) || 0)
: sum, 0) : sum, 0)
let allocatedNodes = {}, flopRate = {}, memBwRate = {} let allocatedNodes = {}, flopRate = {}, flopRateUnit = {}, memBwRate = {}, memBwRateUnit = {}
$: if ($initq.data && $mainQuery.data) { $: if ($initq.data && $mainQuery.data) {
let subClusters = $initq.data.clusters.find(c => c.name == cluster).subClusters let subClusters = $initq.data.clusters.find(c => c.name == cluster).subClusters
for (let subCluster of subClusters) { for (let subCluster of subClusters) {
allocatedNodes[subCluster.name] = $mainQuery.data.allocatedNodes.find(({ name }) => name == subCluster.name)?.count || 0 allocatedNodes[subCluster.name] = $mainQuery.data.allocatedNodes.find(({ name }) => name == subCluster.name)?.count || 0
flopRate[subCluster.name] = Math.floor(sumUp($mainQuery.data.nodeMetrics, subCluster.name, 'flops_any') * 100) / 100 flopRate[subCluster.name] = Math.floor(sumUp($mainQuery.data.nodeMetrics, subCluster.name, 'flops_any') * 100) / 100
flopRateUnit[subCluster.name] = subCluster.flopRateSimd.unit.prefix + subCluster.flopRateSimd.unit.base
memBwRate[subCluster.name] = Math.floor(sumUp($mainQuery.data.nodeMetrics, subCluster.name, 'mem_bw') * 100) / 100 memBwRate[subCluster.name] = Math.floor(sumUp($mainQuery.data.nodeMetrics, subCluster.name, 'mem_bw') * 100) / 100
memBwRateUnit[subCluster.name] = subCluster.memoryBandwidth.unit.prefix + subCluster.memoryBandwidth.unit.base
} }
} }
query(mainQuery) query(mainQuery)
</script> </script>
<!-- Loading indicator & Refresh -->
<Row> <Row>
<Col xs="auto" style="align-self: flex-end;">
<h4 class="mb-0" >Current usage of cluster "{cluster}"</h4>
</Col>
<Col xs="auto"> <Col xs="auto">
{#if $initq.fetching || $mainQuery.fetching} {#if $initq.fetching || $mainQuery.fetching}
<Spinner/> <Spinner/>
@ -89,54 +97,72 @@
</Col> </Col>
</Row> </Row>
{/if} {/if}
<hr>
<!-- Gauges & Roofline per Subcluster-->
{#if $initq.data && $mainQuery.data} {#if $initq.data && $mainQuery.data}
{#each $initq.data.clusters.find(c => c.name == cluster).subClusters as subCluster, i} {#each $initq.data.clusters.find(c => c.name == cluster).subClusters as subCluster, i}
<Row> <Row cols={2} class="mb-3 justify-content-center">
<Col xs="3"> <Col xs="4" class="px-3">
<Card class="h-auto mt-1">
<CardHeader>
<CardTitle class="mb-0">SubCluster "{subCluster.name}"</CardTitle>
</CardHeader>
<CardBody>
<Table> <Table>
<tr>
<th scope="col">SubCluster</th>
<td colspan="2">{subCluster.name}</td>
</tr>
<tr> <tr>
<th scope="col">Allocated Nodes</th> <th scope="col">Allocated Nodes</th>
<td style="min-width: 75px;"><div class="col"><Progress value={allocatedNodes[subCluster.name]} max={subCluster.numberOfNodes}/></div></td> <td style="min-width: 100px;"><div class="col"><Progress value={allocatedNodes[subCluster.name]} max={subCluster.numberOfNodes}/></div></td>
<td>({allocatedNodes[subCluster.name]} / {subCluster.numberOfNodes})</td> <td>({allocatedNodes[subCluster.name]} Nodes / {subCluster.numberOfNodes} Total Nodes)</td>
</tr> </tr>
<tr> <tr>
<th scope="col">Flop Rate</th> <th scope="col">Flop Rate (Any) <Icon name="info-circle" class="p-1" style="cursor: help;" title="Flops[Any] = (Flops[Double] x 2) + Flops[Single]"/></th>
<td style="min-width: 75px;"><div class="col"><Progress value={flopRate[subCluster.name]} max={subCluster.flopRateSimd * subCluster.numberOfNodes}/></div></td> <td style="min-width: 100px;"><div class="col"><Progress value={flopRate[subCluster.name]} max={subCluster.flopRateSimd.value * subCluster.numberOfNodes}/></div></td>
<td>({flopRate[subCluster.name]} / {subCluster.flopRateSimd * subCluster.numberOfNodes})</td> <td>({flopRate[subCluster.name]} {flopRateUnit[subCluster.name]} / {(subCluster.flopRateSimd.value * subCluster.numberOfNodes)} {flopRateUnit[subCluster.name]} [Max])</td>
</tr> </tr>
<tr> <tr>
<th scope="col">MemBw Rate</th> <th scope="col">MemBw Rate</th>
<td style="min-width: 75px;"><div class="col"><Progress value={memBwRate[subCluster.name]} max={subCluster.memoryBandwidth * subCluster.numberOfNodes}/></div></td> <td style="min-width: 100px;"><div class="col"><Progress value={memBwRate[subCluster.name]} max={subCluster.memoryBandwidth.value * subCluster.numberOfNodes}/></div></td>
<td>({memBwRate[subCluster.name]} / {subCluster.memoryBandwidth * subCluster.numberOfNodes})</td> <td>({memBwRate[subCluster.name]} {memBwRateUnit[subCluster.name]} / {(subCluster.memoryBandwidth.value * subCluster.numberOfNodes)} {memBwRateUnit[subCluster.name]} [Max])</td>
</tr> </tr>
</Table> </Table>
</CardBody>
</Card>
</Col> </Col>
<div class="col-9" bind:clientWidth={plotWidths[i]}> <Col class="px-3">
<div bind:clientWidth={plotWidths[i]}>
{#key $mainQuery.data.nodeMetrics} {#key $mainQuery.data.nodeMetrics}
<Roofline <Roofline
width={plotWidths[i] - 10} height={300} colorDots={false} cluster={subCluster} width={plotWidths[i] - 10} height={300} colorDots={true} showTime={false} cluster={subCluster}
data={transformPerNodeData($mainQuery.data.nodeMetrics.filter(data => data.subCluster == subCluster.name))} /> data={transformPerNodeData($mainQuery.data.nodeMetrics.filter(data => data.subCluster == subCluster.name))} />
{/key} {/key}
</div> </div>
</Col>
</Row> </Row>
{/each} {/each}
<Row>
<div class="col-4" bind:clientWidth={colWidth1}> <hr style="margin-top: -1em;">
<h4>Top Users</h4>
<!-- Usage Stats as Histograms -->
<Row cols={4}>
<Col class="p-2">
<div bind:clientWidth={colWidth1}>
<h4 class="mb-3 text-center">Top Users</h4>
{#key $mainQuery.data} {#key $mainQuery.data}
<Histogram <Histogram
width={colWidth1 - 25} height={300} width={colWidth1 - 25}
data={$mainQuery.data.topUsers.sort((a, b) => b.count - a.count).map(({ count }, idx) => ({ count, value: idx }))} data={$mainQuery.data.topUsers.sort((a, b) => b.count - a.count).map(({ count }, idx) => ({ count, value: idx }))}
label={(x) => x < $mainQuery.data.topUsers.length ? $mainQuery.data.topUsers[Math.floor(x)].name : '0'} /> label={(x) => x < $mainQuery.data.topUsers.length ? $mainQuery.data.topUsers[Math.floor(x)].name : '0'}
xlabel="User Name" ylabel="Number of Jobs" />
{/key} {/key}
</div> </div>
<div class="col-2"> </Col>
<Col class="px-4 py-2">
<Table> <Table>
<tr><th>Name</th><th>Number of Nodes</th></tr> <tr class="mb-2"><th>User Name</th><th>Number of Nodes</th></tr>
{#each $mainQuery.data.topUsers.sort((a, b) => b.count - a.count) as { name, count }} {#each $mainQuery.data.topUsers.sort((a, b) => b.count - a.count) as { name, count }}
<tr> <tr>
<th scope="col"><a href="/monitoring/user/{name}">{name}</a></th> <th scope="col"><a href="/monitoring/user/{name}">{name}</a></th>
@ -144,41 +170,48 @@
</tr> </tr>
{/each} {/each}
</Table> </Table>
</div> </Col>
<div class="col-4"> <Col class="p-2">
<h4>Top Projects</h4> <h4 class="mb-3 text-center">Top Projects</h4>
{#key $mainQuery.data} {#key $mainQuery.data}
<Histogram <Histogram
width={colWidth1 - 25} height={300} width={colWidth1 - 25}
data={$mainQuery.data.topProjects.sort((a, b) => b.count - a.count).map(({ count }, idx) => ({ count, value: idx }))} data={$mainQuery.data.topProjects.sort((a, b) => b.count - a.count).map(({ count }, idx) => ({ count, value: idx }))}
label={(x) => x < $mainQuery.data.topProjects.length ? $mainQuery.data.topProjects[Math.floor(x)].name : '0'} /> label={(x) => x < $mainQuery.data.topProjects.length ? $mainQuery.data.topProjects[Math.floor(x)].name : '0'}
xlabel="Project Code" ylabel="Number of Jobs" />
{/key} {/key}
</div> </Col>
<div class="col-2"> <Col class="px-4 py-2">
<Table> <Table>
<tr><th>Name</th><th>Number of Nodes</th></tr> <tr class="mb-2"><th>Project Code</th><th>Number of Nodes</th></tr>
{#each $mainQuery.data.topProjects.sort((a, b) => b.count - a.count) as { name, count }} {#each $mainQuery.data.topProjects.sort((a, b) => b.count - a.count) as { name, count }}
<tr><th scope="col">{name}</th><td>{count}</td></tr> <tr><th scope="col">{name}</th><td>{count}</td></tr>
{/each} {/each}
</Table> </Table>
</div> </Col>
</Row> </Row>
<Row> <Row cols={2} class="mt-3">
<div class="col" bind:clientWidth={colWidth2}> <Col class="p-2">
<h4>Duration Distribution</h4> <div bind:clientWidth={colWidth2}>
<h4 class="mb-3 text-center">Duration Distribution</h4>
{#key $mainQuery.data.stats} {#key $mainQuery.data.stats}
<Histogram <Histogram
width={colWidth2 - 25} height={300} width={colWidth2 - 25}
data={$mainQuery.data.stats[0].histDuration} /> data={$mainQuery.data.stats[0].histDuration}
xlabel="Current Runtimes [h]"
ylabel="Number of Jobs" />
{/key} {/key}
</div> </div>
<div class="col"> </Col>
<h4>Number of Nodes Distribution</h4> <Col class="p-2">
<h4 class="mb-3 text-center">Number of Nodes Distribution</h4>
{#key $mainQuery.data.stats} {#key $mainQuery.data.stats}
<Histogram <Histogram
width={colWidth2 - 25} height={300} width={colWidth2 - 25}
data={$mainQuery.data.stats[0].histNumNodes} /> data={$mainQuery.data.stats[0].histNumNodes}
xlabel="Allocated Nodes [#]"
ylabel="Number of Jobs" />
{/key} {/key}
</div> </Col>
</Row> </Row>
{/if} {/if}

View File

@ -21,6 +21,7 @@
const clusters = getContext('clusters') const clusters = getContext('clusters')
const ccconfig = getContext('cc-config') const ccconfig = getContext('cc-config')
const metricConfig = getContext('metrics')
let plotHeight = 300 let plotHeight = 300
let hostnameFilter = '' let hostnameFilter = ''
@ -28,13 +29,14 @@
const nodesQuery = operationStore(`query($cluster: String!, $metrics: [String!], $from: Time!, $to: Time!) { const nodesQuery = operationStore(`query($cluster: String!, $metrics: [String!], $from: Time!, $to: Time!) {
nodeMetrics(cluster: $cluster, metrics: $metrics, from: $from, to: $to) { nodeMetrics(cluster: $cluster, metrics: $metrics, from: $from, to: $to) {
host, host
subCluster subCluster
metrics { metrics {
name, name
metric {
scope scope
timestep, metric {
timestep
unit { base, prefix }
series { series {
statistics { min, avg, max } statistics { min, avg, max }
data data
@ -49,6 +51,18 @@
to: to.toISOString() to: to.toISOString()
}) })
let metricUnits = {}
$: if ($nodesQuery.data) {
let thisCluster = clusters.find(c => c.name == cluster)
for (let metric of thisCluster.metricConfig) {
if (metric.unit.prefix || metric.unit.base) {
metricUnits[metric.name] = '(' + (metric.unit.prefix ? metric.unit.prefix : '') + (metric.unit.base ? metric.unit.base : '') + ')'
} else { // If no unit defined: Omit Unit Display
metricUnits[metric.name] = ''
}
}
}
$: $nodesQuery.variables = { cluster, metrics: [selectedMetric], from: from.toISOString(), to: to.toISOString() } $: $nodesQuery.variables = { cluster, metrics: [selectedMetric], from: from.toISOString(), to: to.toISOString() }
query(nodesQuery) query(nodesQuery)
@ -71,7 +85,7 @@
<InputGroupText>Metric</InputGroupText> <InputGroupText>Metric</InputGroupText>
<select class="form-select" bind:value={selectedMetric}> <select class="form-select" bind:value={selectedMetric}>
{#each clusters.find(c => c.name == cluster).metricConfig as metric} {#each clusters.find(c => c.name == cluster).metricConfig as metric}
<option value={metric.name}>{metric.name} ({metric.unit})</option> <option value={metric.name}>{metric.name} {metricUnits[metric.name]}</option>
{/each} {/each}
</select> </select>
</InputGroup> </InputGroup>
@ -98,11 +112,23 @@
let:width let:width
itemsPerRow={ccconfig.plot_view_plotsPerRow} itemsPerRow={ccconfig.plot_view_plotsPerRow}
items={$nodesQuery.data.nodeMetrics items={$nodesQuery.data.nodeMetrics
.filter(h => h.host.includes(hostnameFilter) && h.metrics.some(m => m.name == selectedMetric && m.metric.scope == 'node')) .filter(h => h.host.includes(hostnameFilter) && h.metrics.some(m => m.name == selectedMetric && m.scope == 'node'))
.map(h => ({ host: h.host, subCluster: h.subCluster, data: h.metrics.find(m => m.name == selectedMetric && m.metric.scope == 'node') })) .map(function (h) {
let thisConfig = metricConfig(cluster, selectedMetric)
let thisSCIndex = thisConfig.subClusters.findIndex(sc => sc.name == h.subCluster)
// Metric remove == true
if (thisSCIndex >= 0) {
if (thisConfig.subClusters[thisSCIndex].remove == true) {
return { host: h.host, subCluster: h.subCluster, data: null, removed: true }
}
}
// Else
return { host: h.host, subCluster: h.subCluster, data: h.metrics.find(m => m.name == selectedMetric && m.scope == 'node'), removed: false }
})
.sort((a, b) => a.host.localeCompare(b.host))}> .sort((a, b) => a.host.localeCompare(b.host))}>
<h4 style="width: 100%; text-align: center;"><a href="/monitoring/node/{cluster}/{item.host}">{item.host} ({item.subCluster})</a></h4> <h4 style="width: 100%; text-align: center;"><a href="/monitoring/node/{cluster}/{item.host}">{item.host} ({item.subCluster})</a></h4>
{#if item.removed == false && item.data != null}
<MetricPlot <MetricPlot
width={width} width={width}
height={plotHeight} height={plotHeight}
@ -111,6 +137,11 @@
metric={item.data.name} metric={item.data.name}
cluster={clusters.find(c => c.name == cluster)} cluster={clusters.find(c => c.name == cluster)}
subCluster={item.subCluster} /> subCluster={item.subCluster} />
{:else if item.removed == true && item.data == null}
<Card body color="info">Metric '{ selectedMetric }' disabled for subcluster '{ item.subCluster }'</Card>
{:else}
<Card body color="warning">Missing Data</Card>
{/if}
</PlotTable> </PlotTable>
{/if} {/if}
</Col> </Col>

View File

@ -18,10 +18,12 @@
export let user export let user
export let filterPresets export let filterPresets
let filters, jobList let filters = []
let jobList
let sorting = { field: 'startTime', order: 'DESC' }, isSortingOpen = false let sorting = { field: 'startTime', order: 'DESC' }, isSortingOpen = false
let metrics = ccconfig.plot_list_selectedMetrics, isMetricsSelectionOpen = false let metrics = ccconfig.plot_list_selectedMetrics, isMetricsSelectionOpen = false
let w1, w2, histogramHeight = 250 let w1, w2, histogramHeight = 250
let selectedCluster = filterPresets?.cluster ? filterPresets.cluster : null
const stats = operationStore(` const stats = operationStore(`
query($filter: [JobFilter!]!) { query($filter: [JobFilter!]!) {
@ -40,6 +42,12 @@
pause: true pause: true
}) })
// filters[filters.findIndex(filter => filter.cluster != null)] ?
// filters[filters.findIndex(filter => filter.cluster != null)].cluster.eq :
// null
// The cluster filter always has to be at the first index; the commented approach above would throw an error
$: selectedCluster = filters[0]?.cluster ? filters[0].cluster.eq : null
query(stats) query(stats)
onMount(() => filters.update()) onMount(() => filters.update())
@ -75,11 +83,12 @@
startTimeQuickSelect={true} startTimeQuickSelect={true}
bind:this={filters} bind:this={filters}
on:update={({ detail }) => { on:update={({ detail }) => {
let filters = [...detail.filters, { user: { eq: user.username } }] let jobFilters = [...detail.filters, { user: { eq: user.username } }]
$stats.variables = { filter: filters } $stats.variables = { filter: jobFilters }
$stats.context.pause = false $stats.context.pause = false
$stats.reexecute() $stats.reexecute()
jobList.update(filters) filters = jobFilters
jobList.update(jobFilters)
}} /> }} />
</Col> </Col>
<Col xs="auto" style="margin-left: auto;"> <Col xs="auto" style="margin-left: auto;">
@ -136,19 +145,23 @@
</Table> </Table>
</Col> </Col>
<div class="col-4" style="text-align: center;" bind:clientWidth={w1}> <div class="col-4" style="text-align: center;" bind:clientWidth={w1}>
<b>Walltime</b> <b>Duration Distribution</b>
{#key $stats.data.jobsStatistics[0].histDuration} {#key $stats.data.jobsStatistics[0].histDuration}
<Histogram <Histogram
data={$stats.data.jobsStatistics[0].histDuration} data={$stats.data.jobsStatistics[0].histDuration}
width={w1 - 25} height={histogramHeight} /> width={w1 - 25} height={histogramHeight}
xlabel="Current Runtimes [h]"
ylabel="Number of Jobs"/>
{/key} {/key}
</div> </div>
<div class="col-4" style="text-align: center;" bind:clientWidth={w2}> <div class="col-4" style="text-align: center;" bind:clientWidth={w2}>
<b>Number of Nodes</b> <b>Number of Nodes Distribution</b>
{#key $stats.data.jobsStatistics[0].histNumNodes} {#key $stats.data.jobsStatistics[0].histNumNodes}
<Histogram <Histogram
data={$stats.data.jobsStatistics[0].histNumNodes} data={$stats.data.jobsStatistics[0].histNumNodes}
width={w2 - 25} height={histogramHeight} /> width={w2 - 25} height={histogramHeight}
xlabel="Allocated Nodes [#]"
ylabel="Number of Jobs" />
{/key} {/key}
</div> </div>
{/if} {/if}
@ -167,6 +180,8 @@
bind:sorting={sorting} bind:sorting={sorting}
bind:isOpen={isSortingOpen} /> bind:isOpen={isSortingOpen} />
<MetricSelection configName="plot_list_selectedMetrics" <MetricSelection
bind:cluster={selectedCluster}
configName="plot_list_selectedMetrics"
bind:metrics={metrics} bind:metrics={metrics}
bind:isOpen={isMetricsSelectionOpen} /> bind:isOpen={isMetricsSelectionOpen} />

View File

@ -20,6 +20,7 @@
let text = await res.text() let text = await res.text()
popMessage(text, '#048109') popMessage(text, '#048109')
reloadUserList() reloadUserList()
form.reset()
} else { } else {
let text = await res.text() let text = await res.text()
// console.log(res.statusText) // console.log(res.statusText)
@ -79,7 +80,12 @@
{#if i == 0} {#if i == 0}
<div> <div>
<input type="radio" id={role} name="role" value={role} checked/> <input type="radio" id={role} name="role" value={role} checked/>
<label for={role}>{role.charAt(0).toUpperCase() + role.slice(1)} (regular user, same as if created via LDAP sync.)</label> <label for={role}>{role.toUpperCase()} (Allowed to interact with REST API.)</label>
</div>
{:else if i == 1}
<div>
<input type="radio" id={role} name="role" value={role} checked/>
<label for={role}>{role.charAt(0).toUpperCase() + role.slice(1)} (Same as if created via LDAP sync.)</label>
</div> </div>
{:else} {:else}
<div> <div>

View File

@ -102,9 +102,11 @@
{#if $initialized} {#if $initialized}
({clusters ({clusters
.map(cluster => cluster.metricConfig.find(m => m.name == metric)) .map(cluster => cluster.metricConfig.find(m => m.name == metric))
.filter(m => m != null).map(m => m.unit) .filter(m => m != null)
.reduce((arr, unit) => arr.includes(unit) ? arr : [...arr, unit], []) .map(m => (m.unit?.prefix?m.unit?.prefix:'') + (m.unit?.base?m.unit?.base:'')) // Build unitStr
.join(', ')}) .reduce((arr, unitStr) => arr.includes(unitStr) ? arr : [...arr, unitStr], []) // w/o this, output would be [unitStr, unitStr]
.join(', ')
})
{/if} {/if}
</th> </th>
{/each} {/each}

View File

@ -24,12 +24,14 @@
let scopes = [job.numNodes == 1 ? 'core' : 'node'] let scopes = [job.numNodes == 1 ? 'core' : 'node']
const cluster = getContext('clusters').find(c => c.name == job.cluster) const cluster = getContext('clusters').find(c => c.name == job.cluster)
// Get all MetricConfs which include subCluster-specific settings for this job
const metricConfig = getContext('metrics')
const metricsQuery = operationStore(`query($id: ID!, $metrics: [String!]!, $scopes: [MetricScope!]!) { const metricsQuery = operationStore(`query($id: ID!, $metrics: [String!]!, $scopes: [MetricScope!]!) {
jobMetrics(id: $id, metrics: $metrics, scopes: $scopes) { jobMetrics(id: $id, metrics: $metrics, scopes: $scopes) {
name name
scope
metric { metric {
unit, scope, timestep unit { prefix, base }, timestep
statisticsSeries { min, mean, max } statisticsSeries { min, mean, max }
series { series {
hostname, id, data hostname, id, data
@ -44,13 +46,47 @@
}) })
const selectScope = (jobMetrics) => jobMetrics.reduce( const selectScope = (jobMetrics) => jobMetrics.reduce(
(a, b) => maxScope([a.metric.scope, b.metric.scope]) == a.metric.scope (a, b) => maxScope([a.scope, b.scope]) == a.scope
? (job.numNodes > 1 ? a : b) ? (job.numNodes > 1 ? a : b)
: (job.numNodes > 1 ? b : a), jobMetrics[0]) : (job.numNodes > 1 ? b : a), jobMetrics[0])
const sortAndSelectScope = (jobMetrics) => metrics const sortAndSelectScope = (jobMetrics) => metrics
.map(name => jobMetrics.filter(jobMetric => jobMetric.name == name)) .map(function(name) {
.map(jobMetrics => jobMetrics.length > 0 ? selectScope(jobMetrics) : null) // Get MetricConf for this selected/requested metric
let thisConfig = metricConfig(cluster, name)
let thisSCIndex = thisConfig.subClusters.findIndex(sc => sc.name == job.subCluster)
// Check if Subcluster has MetricConf: If not found (index == -1), no further remove flag check required
if (thisSCIndex >= 0) {
// SubCluster Config present: Check if remove flag is set
if (thisConfig.subClusters[thisSCIndex].remove == true) {
// Return null data and informational flag
return {removed: true, data: null}
} else {
// load and return metric, if data available
let thisMetric = jobMetrics.filter(jobMetric => jobMetric.name == name) // Returns Array
if (thisMetric.length > 0) {
return {removed: false, data: thisMetric}
} else {
return {removed: false, data: null}
}
}
} else {
// No specific subCluster config: 'remove' flag not set, deemed false -> load and return metric, if data available
let thisMetric = jobMetrics.filter(jobMetric => jobMetric.name == name) // Returns Array
if (thisMetric.length > 0) {
return {removed: false, data: thisMetric}
} else {
return {removed: false, data: null}
}
}
})
.map(function(jobMetrics) {
if (jobMetrics.data != null && jobMetrics.data.length > 0) {
return {removed: jobMetrics.removed, data: selectScope(jobMetrics.data)}
} else {
return jobMetrics
}
})
$: metricsQuery.variables = { id: job.id, metrics, scopes } $: metricsQuery.variables = { id: job.id, metrics, scopes }
@ -81,17 +117,20 @@
{:else} {:else}
{#each sortAndSelectScope($metricsQuery.data.jobMetrics) as metric, i (metric || i)} {#each sortAndSelectScope($metricsQuery.data.jobMetrics) as metric, i (metric || i)}
<td> <td>
{#if metric != null} <!-- Subcluster MetricConfig 'remove' keyword for job tables (main joblist, user joblist, project joblist) is handled here as the top-level case -->
{#if metric.removed == false && metric.data != null}
<MetricPlot <MetricPlot
width={plotWidth} width={plotWidth}
height={plotHeight} height={plotHeight}
timestep={metric.metric.timestep} timestep={metric.data.metric.timestep}
scope={metric.metric.scope} scope={metric.data.scope}
series={metric.metric.series} series={metric.data.metric.series}
statisticsSeries={metric.metric.statisticsSeries} statisticsSeries={metric.data.metric.statisticsSeries}
metric={metric.name} metric={metric.data.name}
cluster={cluster} cluster={cluster}
subCluster={job.subCluster} /> subCluster={job.subCluster} />
{:else if metric.removed == true && metric.data == null}
<Card body color="info">Metric disabled for subcluster '{ job.subCluster }'</Card>
{:else} {:else}
<Card body color="warning">Missing Data</Card> <Card body color="warning">Missing Data</Card>
{/if} {/if}

View File

@ -18,15 +18,17 @@
import { onMount } from 'svelte' import { onMount } from 'svelte'
export let data export let data
export let width export let width = 500
export let height export let height = 300
export let xlabel = ''
export let ylabel = ''
export let min = null export let min = null
export let max = null export let max = null
export let label = formatNumber export let label = formatNumber
const fontSize = 12 const fontSize = 12
const fontFamily = 'system-ui, -apple-system, "Segoe UI", Roboto, "Helvetica Neue", Arial, "Noto Sans", sans-serif, "Apple Color Emoji", "Segoe UI Emoji", "Segoe UI Symbol", "Noto Color Emoji"' const fontFamily = 'system-ui, -apple-system, "Segoe UI", Roboto, "Helvetica Neue", Arial, "Noto Sans", sans-serif, "Apple Color Emoji", "Segoe UI Emoji", "Segoe UI Symbol", "Noto Color Emoji"'
const paddingLeft = 35, paddingRight = 20, paddingTop = 20, paddingBottom = 20 const paddingLeft = 50, paddingRight = 20, paddingTop = 20, paddingBottom = 20
let ctx, canvasElement let ctx, canvasElement
@ -72,9 +74,11 @@
} }
function render() { function render() {
const h = height - paddingTop - paddingBottom const labelOffset = Math.floor(height * 0.1)
const h = height - paddingTop - paddingBottom - labelOffset
const w = width - paddingLeft - paddingRight const w = width - paddingLeft - paddingRight
const barWidth = Math.ceil(w / (maxValue + 1)) const barGap = 5
const barWidth = Math.ceil(w / (maxValue + 1)) - barGap
if (Number.isNaN(barWidth)) if (Number.isNaN(barWidth))
return return
@ -83,9 +87,14 @@
const getCanvasY = (count) => (h - (count / maxCount) * h) + paddingTop const getCanvasY = (count) => (h - (count / maxCount) * h) + paddingTop
// X Axis // X Axis
ctx.font = `${fontSize}px ${fontFamily}` ctx.font = `bold ${fontSize}px ${fontFamily}`
ctx.fillStyle = 'black' ctx.fillStyle = 'black'
if (xlabel != '') {
let textWidth = ctx.measureText(xlabel).width
ctx.fillText(xlabel, Math.floor((width / 2) - (textWidth / 2) + barGap), height - Math.floor(labelOffset / 2))
}
ctx.textAlign = 'center' ctx.textAlign = 'center'
ctx.font = `${fontSize}px ${fontFamily}`
if (min != null && max != null) { if (min != null && max != null) {
const stepsizeX = getStepSize(max - min, w, 75) const stepsizeX = getStepSize(max - min, w, 75)
let startX = 0 let startX = 0
@ -94,19 +103,28 @@
for (let x = startX; x < max; x += stepsizeX) { for (let x = startX; x < max; x += stepsizeX) {
let px = ((x - min) / (max - min)) * (w - barWidth) + paddingLeft + (barWidth / 2.) let px = ((x - min) / (max - min)) * (w - barWidth) + paddingLeft + (barWidth / 2.)
ctx.fillText(`${formatNumber(x)}`, px, height - paddingBottom + 15) ctx.fillText(`${formatNumber(x)}`, px, height - paddingBottom - Math.floor(labelOffset / 2))
} }
} else { } else {
const stepsizeX = getStepSize(maxValue, w, 120) const stepsizeX = getStepSize(maxValue, w, 120)
for (let x = 0; x <= maxValue; x += stepsizeX) { for (let x = 0; x <= maxValue; x += stepsizeX) {
ctx.fillText(label(x), getCanvasX(x), height - paddingBottom + 15) ctx.fillText(label(x), getCanvasX(x), height - paddingBottom - Math.floor(labelOffset / 2))
} }
} }
// Y Axis // Y Axis
ctx.fillStyle = 'black' ctx.fillStyle = 'black'
ctx.strokeStyle = '#bbbbbb' ctx.strokeStyle = '#bbbbbb'
ctx.font = `bold ${fontSize}px ${fontFamily}`
if (ylabel != '') {
ctx.save()
ctx.translate(15, Math.floor(h / 2))
ctx.rotate(-Math.PI / 2)
ctx.fillText(ylabel, 0, 0)
ctx.restore()
}
ctx.textAlign = 'right' ctx.textAlign = 'right'
ctx.font = `${fontSize}px ${fontFamily}`
ctx.beginPath() ctx.beginPath()
const stepsizeY = getStepSize(maxCount, h, 50) const stepsizeY = getStepSize(maxCount, h, 50)
for (let y = stepsizeY; y <= maxCount; y += stepsizeY) { for (let y = stepsizeY; y <= maxCount; y += stepsizeY) {
@ -118,7 +136,7 @@
ctx.stroke() ctx.stroke()
// Draw bars // Draw bars
ctx.fillStyle = '#0066cc' ctx.fillStyle = '#85abce'
for (let p of data) { for (let p of data) {
ctx.fillRect( ctx.fillRect(
getCanvasX(p.value) - (barWidth / 2.), getCanvasX(p.value) - (barWidth / 2.),
@ -130,10 +148,10 @@
// Fat lines left and below plotting area // Fat lines left and below plotting area
ctx.strokeStyle = 'black' ctx.strokeStyle = 'black'
ctx.beginPath() ctx.beginPath()
ctx.moveTo(0, height - paddingBottom) ctx.moveTo(0, height - paddingBottom - labelOffset)
ctx.lineTo(width, height - paddingBottom) ctx.lineTo(width, height - paddingBottom - labelOffset)
ctx.moveTo(paddingLeft, 0) ctx.moveTo(paddingLeft, 0)
ctx.lineTo(paddingLeft, height- paddingBottom) ctx.lineTo(paddingLeft, height - Math.floor(labelOffset / 2))
ctx.stroke() ctx.stroke()
} }
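
With these changes the histogram reserves roughly 10% of its height (labelOffset) for optional axis captions and falls back to 500x300 when no size is passed. A possible invocation, purely illustrative; the bin data and caption strings are placeholders, not values from this commit:

<!-- Hedged usage sketch, assuming the component is imported as Histogram -->
<Histogram
    data={durationBins}
    width={400} height={250}
    min={0} max={24}
    xlabel="Duration [h]"
    ylabel="Number of Jobs" />
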

View File

@ -18,7 +18,7 @@
let ctx, canvasElement let ctx, canvasElement
const labels = metrics.filter(name => { const labels = metrics.filter(name => {
if (!jobMetrics.find(m => m.name == name && m.metric.scope == "node")) { if (!jobMetrics.find(m => m.name == name && m.scope == "node")) {
console.warn(`PolarPlot: No metric data for '${name}'`) console.warn(`PolarPlot: No metric data for '${name}'`)
return false return false
} }
@ -27,7 +27,7 @@
const getValuesForStat = (getStat) => labels.map(name => { const getValuesForStat = (getStat) => labels.map(name => {
const peak = metricConfig(cluster, name).peak const peak = metricConfig(cluster, name).peak
const metric = jobMetrics.find(m => m.name == name && m.metric.scope == "node") const metric = jobMetrics.find(m => m.name == name && m.scope == "node")
const value = getStat(metric.metric) / peak const value = getStat(metric.metric) / peak
return value <= 1. ? value : 1. return value <= 1. ? value : 1.
}) })
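
Both lookups follow the schema change: scope moved from JobMetric up to JobMetricWithName. A minimal before/after illustration with an assumed metric name:

// Before: jobMetrics.find(m => m.name == name && m.metric.scope == 'node')
// After:  scope sits next to name on the wrapper object
const nodeMetric = jobMetrics.find(m => m.name == 'flops_any' && m.scope == 'node')
// nodeMetric.metric still holds timestep, series and statisticsSeries
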

View File

@ -4,7 +4,8 @@
<script context="module"> <script context="module">
const axesColor = '#aaaaaa' const axesColor = '#aaaaaa'
const fontSize = 12 const tickFontSize = 10
const labelFontSize = 12
const fontFamily = 'system-ui, -apple-system, "Segoe UI", Roboto, "Helvetica Neue", Arial, "Noto Sans", sans-serif, "Apple Color Emoji", "Segoe UI Emoji", "Segoe UI Symbol", "Noto Color Emoji"' const fontFamily = 'system-ui, -apple-system, "Segoe UI", Roboto, "Helvetica Neue", Arial, "Noto Sans", sans-serif, "Apple Color Emoji", "Segoe UI Emoji", "Segoe UI Symbol", "Noto Color Emoji"'
const paddingLeft = 40, const paddingLeft = 40,
paddingRight = 10, paddingRight = 10,
@ -67,11 +68,11 @@
return 2 return 2
} }
function render(ctx, data, cluster, width, height, colorDots, defaultMaxY) { function render(ctx, data, cluster, width, height, colorDots, showTime, defaultMaxY) {
if (width <= 0) if (width <= 0)
return return
const [minX, maxX, minY, maxY] = [0.01, 1000, 1., cluster?.flopRateSimd || defaultMaxY] const [minX, maxX, minY, maxY] = [0.01, 1000, 1., cluster?.flopRateSimd?.value || defaultMaxY]
const w = width - paddingLeft - paddingRight const w = width - paddingLeft - paddingRight
const h = height - paddingTop - paddingBottom const h = height - paddingTop - paddingBottom
@ -95,7 +96,7 @@
// Axes // Axes
ctx.fillStyle = 'black' ctx.fillStyle = 'black'
ctx.strokeStyle = axesColor ctx.strokeStyle = axesColor
ctx.font = `${fontSize}px ${fontFamily}` ctx.font = `${tickFontSize}px ${fontFamily}`
ctx.beginPath() ctx.beginPath()
for (let x = minX, i = 0; x <= maxX; i++) { for (let x = minX, i = 0; x <= maxX; i++) {
let px = getCanvasX(x) let px = getCanvasX(x)
@ -103,18 +104,20 @@
let textWidth = ctx.measureText(text).width let textWidth = ctx.measureText(text).width
ctx.fillText(text, ctx.fillText(text,
Math.floor(px - (textWidth / 2)), Math.floor(px - (textWidth / 2)),
height - paddingBottom + fontSize + 5) height - paddingBottom + tickFontSize + 5)
ctx.moveTo(px, paddingTop - 5) ctx.moveTo(px, paddingTop - 5)
ctx.lineTo(px, height - paddingBottom + 5) ctx.lineTo(px, height - paddingBottom + 5)
x *= axisStepFactor(i, w) x *= axisStepFactor(i, w)
} }
if (data.xLabel) { if (data.xLabel) {
ctx.font = `${labelFontSize}px ${fontFamily}`
let textWidth = ctx.measureText(data.xLabel).width let textWidth = ctx.measureText(data.xLabel).width
ctx.fillText(data.xLabel, Math.floor((width / 2) - (textWidth / 2)), height - 20) ctx.fillText(data.xLabel, Math.floor((width / 2) - (textWidth / 2)), height - 20)
} }
ctx.textAlign = 'center' ctx.textAlign = 'center'
ctx.font = `${tickFontSize}px ${fontFamily}`
for (let y = minY, i = 0; y <= maxY; i++) { for (let y = minY, i = 0; y <= maxY; i++) {
let py = getCanvasY(y) let py = getCanvasY(y)
ctx.moveTo(paddingLeft - 5, py) ctx.moveTo(paddingLeft - 5, py)
@ -129,6 +132,7 @@
y *= axisStepFactor(i) y *= axisStepFactor(i)
} }
if (data.yLabel) { if (data.yLabel) {
ctx.font = `${labelFontSize}px ${fontFamily}`
ctx.save() ctx.save()
ctx.translate(15, Math.floor(height / 2)) ctx.translate(15, Math.floor(height / 2))
ctx.rotate(-Math.PI / 2) ctx.rotate(-Math.PI / 2)
@ -185,13 +189,13 @@
ctx.lineWidth = 2 ctx.lineWidth = 2
ctx.beginPath() ctx.beginPath()
if (cluster != null) { if (cluster != null) {
const ycut = 0.01 * cluster.memoryBandwidth const ycut = 0.01 * cluster.memoryBandwidth.value
const scalarKnee = (cluster.flopRateScalar - ycut) / cluster.memoryBandwidth const scalarKnee = (cluster.flopRateScalar.value - ycut) / cluster.memoryBandwidth.value
const simdKnee = (cluster.flopRateSimd - ycut) / cluster.memoryBandwidth const simdKnee = (cluster.flopRateSimd.value - ycut) / cluster.memoryBandwidth.value
const scalarKneeX = getCanvasX(scalarKnee), const scalarKneeX = getCanvasX(scalarKnee),
simdKneeX = getCanvasX(simdKnee), simdKneeX = getCanvasX(simdKnee),
flopRateScalarY = getCanvasY(cluster.flopRateScalar), flopRateScalarY = getCanvasY(cluster.flopRateScalar.value),
flopRateSimdY = getCanvasY(cluster.flopRateSimd) flopRateSimdY = getCanvasY(cluster.flopRateSimd.value)
if (scalarKneeX < width - paddingRight) { if (scalarKneeX < width - paddingRight) {
ctx.moveTo(scalarKneeX, flopRateScalarY) ctx.moveTo(scalarKneeX, flopRateScalarY)
@ -222,8 +226,8 @@
} }
ctx.stroke() ctx.stroke()
if (colorDots && data.x && data.y) { if (colorDots && showTime && data.x && data.y) {
// The Color Scale // The Color Scale For Time Information
ctx.fillStyle = 'black' ctx.fillStyle = 'black'
ctx.fillText('Time:', 17, height - 5) ctx.fillText('Time:', 17, height - 5)
const start = paddingLeft + 5 const start = paddingLeft + 5
@ -237,7 +241,7 @@
} }
} }
function transformData(flopsAny, memBw, colorDots) { function transformData(flopsAny, memBw, colorDots) { // Uses Metric Object
const nodes = flopsAny.series.length const nodes = flopsAny.series.length
const timesteps = flopsAny.series[0].data.length const timesteps = flopsAny.series[0].data.length
@ -308,17 +312,18 @@
export let memBw = null export let memBw = null
export let cluster = null export let cluster = null
export let maxY = null export let maxY = null
export let width export let width = 500
export let height export let height = 300
export let tiles = null export let tiles = null
export let colorDots = true export let colorDots = true
export let showTime = true
export let data = null export let data = null
console.assert(data || tiles || (flopsAny && memBw), "you must provide flopsAny and memBw or tiles!") console.assert(data || tiles || (flopsAny && memBw), "you must provide flopsAny and memBw or tiles!")
let ctx, canvasElement, prevWidth = width, prevHeight = height let ctx, canvasElement, prevWidth = width, prevHeight = height
data = data != null ? data : (flopsAny && memBw data = data != null ? data : (flopsAny && memBw
? transformData(flopsAny.metric, memBw.metric, colorDots) ? transformData(flopsAny, memBw, colorDots) // Use Metric Object from Parent
: { : {
tiles: tiles, tiles: tiles,
xLabel: 'Intensity [FLOPS/byte]', xLabel: 'Intensity [FLOPS/byte]',
@ -334,7 +339,7 @@
canvasElement.width = width canvasElement.width = width
canvasElement.height = height canvasElement.height = height
render(ctx, data, cluster, width, height, colorDots, maxY) render(ctx, data, cluster, width, height, colorDots, showTime, maxY)
}) })
let timeoutId = null let timeoutId = null
@ -354,7 +359,7 @@
timeoutId = null timeoutId = null
canvasElement.width = width canvasElement.width = width
canvasElement.height = height canvasElement.height = height
render(ctx, data, cluster, width, height, colorDots, maxY) render(ctx, data, cluster, width, height, colorDots, showTime, maxY)
}, 250) }, 250)
} }
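
Because flopRateScalar, flopRateSimd and memoryBandwidth are now MetricValue objects, the roof construction reads .value everywhere. A worked example with illustrative numbers (not taken from any real subcluster):

// Assume memoryBandwidth.value = 200 (GB/s) and flopRateSimd.value = 4000 (GFlop/s)
const ycut = 0.01 * 200               // 2
const simdKnee = (4000 - ycut) / 200  // 19.99 FLOPS/byte
// The SIMD roof therefore bends at an intensity of roughly 20 FLOPS/byte,
// and getCanvasX(simdKnee) places that ridge point on the log-scaled x axis.
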

View File

@ -37,11 +37,11 @@ export function init(extraInitQuery = '') {
clusters { clusters {
name, name,
metricConfig { metricConfig {
name, unit, peak, name, unit { base, prefix }, peak,
normal, caution, alert, normal, caution, alert,
timestep, scope, timestep, scope,
aggregation, aggregation,
subClusters { name, peak, normal, caution, alert } subClusters { name, peak, normal, caution, alert, remove }
} }
partitions partitions
subClusters { subClusters {
@ -49,9 +49,9 @@ export function init(extraInitQuery = '') {
socketsPerNode socketsPerNode
coresPerSocket coresPerSocket
threadsPerCore threadsPerCore
flopRateScalar flopRateScalar { unit { base, prefix }, value }
flopRateSimd flopRateSimd { unit { base, prefix }, value }
memoryBandwidth memoryBandwidth { unit { base, prefix }, value }
numberOfNodes numberOfNodes
topology { topology {
node, socket, core node, socket, core
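
The init query now fetches units as structured objects (base plus optional prefix) and the three subcluster rates as MetricValue. A small helper of the kind a consumer might write; this is a sketch, not part of the commit:

// Turns { unit: { prefix: 'G', base: 'F/s' }, value: 9.7 } into "9.7 GF/s";
// prefix is nullable in the schema, so it defaults to an empty string.
function formatMetricValue(mv) {
    const prefix = (mv.unit && mv.unit.prefix) ? mv.unit.prefix : ''
    return `${mv.value} ${prefix}${mv.unit.base}`
}
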