mirror of
https://github.com/ClusterCockpit/cc-backend
synced 2024-12-24 12:29:05 +01:00
Merge pull request #104 from ClusterCockpit/import-data-sanitation
Import data sanitation. Fixes, among other things: MetricConfig for GPU SubCluster (#99) and mismatch of the type of "id" in the job-metric-data "series" object schema (#101).
This commit is contained in:
commit 7272db4fb0

.gitignore (vendored): 4 changes
@@ -9,4 +9,6 @@
 /web/frontend/public/build
 /web/frontend/node_modules
-.vscode/settings.json
+/.vscode/*
+/archive-migration
+/archive-manager
@@ -47,12 +47,17 @@ type SubCluster {
   socketsPerNode: Int!
   coresPerSocket: Int!
   threadsPerCore: Int!
-  flopRateScalar: Int!
-  flopRateSimd: Int!
-  memoryBandwidth: Int!
+  flopRateScalar: MetricValue!
+  flopRateSimd: MetricValue!
+  memoryBandwidth: MetricValue!
   topology: Topology!
 }

+type MetricValue {
+  unit: Unit!
+  value: Float!
+}
+
 type Topology {
   node: [Int!]
   socket: [[Int!]!]
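The same change lands in the Go types further down in this diff. As a hedged illustration, using local stand-ins for the PR's `schema.Unit` and `schema.MetricValue` (the sample value is taken from the old test config, the prefix split is an assumption), a peak flop rate now carries its unit explicitly instead of being a bare integer:

```go
package main

import (
	"encoding/json"
	"fmt"
)

// Local stand-ins for the PR's schema.Unit and schema.MetricValue types.
type Unit struct {
	Base   string  `json:"base"`
	Prefix *string `json:"prefix,omitempty"`
}

type MetricValue struct {
	Unit  Unit    `json:"unit"`
	Value float64 `json:"value"`
}

func main() {
	// Before: "flopRateSimd": 704 (bare integer, unit implied as GFlops/s).
	// After: the value carries an explicit unit.
	raw := []byte(`{"unit": {"prefix": "G", "base": "F/s"}, "value": 704}`)

	var fr MetricValue
	if err := json.Unmarshal(raw, &fr); err != nil {
		panic(err)
	}
	fmt.Printf("%g %s%s\n", fr.Value, *fr.Unit.Prefix, fr.Unit.Base) // 704 GF/s
}
```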
@@ -70,23 +75,24 @@ type Accelerator {

 type SubClusterConfig {
   name: String!
   peak: Float!
   normal: Float!
   caution: Float!
   alert: Float!
 }

-type MetricConfig {
-  name: String!
-  unit: String!
-  scope: MetricScope!
-  aggregation: String
-  timestep: Int!
-  peak: Float
-  normal: Float
-  caution: Float
-  alert: Float
-  subClusters: [SubClusterConfig]
-  remove: Boolean
-}
+type MetricConfig {
+  name: String!
+  unit: Unit!
+  scope: MetricScope!
+  aggregation: String!
+  timestep: Int!
+  peak: Float!
+  normal: Float
+  caution: Float!
+  alert: Float!
+  subClusters: [SubClusterConfig!]!
+}

 type Tag {
@@ -104,12 +110,12 @@ type Resource {

 type JobMetricWithName {
   name: String!
   scope: MetricScope!
   metric: JobMetric!
 }

 type JobMetric {
-  unit: String!
-  scope: MetricScope!
+  unit: Unit
   timestep: Int!
   series: [Series!]
   statisticsSeries: StatsSeries
@@ -117,11 +123,16 @@ type JobMetric {

 type Series {
   hostname: String!
-  id: Int
+  id: String
   statistics: MetricStatistics
   data: [NullableFloat!]!
 }

+type Unit {
+  base: String!
+  prefix: String
+}
+
 type MetricStatistics {
   avg: Float!
   min: Float!
@@ -15,6 +15,7 @@
     "kind": "file",
     "path": "./var/job-archive"
   },
+  "validate": true,
   "clusters": [
     {
       "name": "test",
@@ -24,9 +25,18 @@
         "token": "eyJhbGciOiJF-E-pQBQ"
       },
       "filterRanges": {
-        "numNodes": { "from": 1, "to": 64 },
-        "duration": { "from": 0, "to": 86400 },
-        "startTime": { "from": "2022-01-01T00:00:00Z", "to": null }
+        "numNodes": {
+          "from": 1,
+          "to": 64
+        },
+        "duration": {
+          "from": 0,
+          "to": 86400
+        },
+        "startTime": {
+          "from": "2022-01-01T00:00:00Z",
+          "to": null
+        }
       }
     }
  ],
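The new top-level `"validate"` key gates JSON-schema validation when archive files are read (see the `config.Keys.Validate` checks in the fsBackend changes below). A minimal sketch of running the same validation by hand, assuming the `schema.Validate` helper this diff calls; the file path is a hypothetical example:

```go
package main

import (
	"bytes"
	"fmt"
	"os"

	"github.com/ClusterCockpit/cc-backend/pkg/schema"
)

func main() {
	b, err := os.ReadFile("./var/job-archive/test/cluster.json") // hypothetical path
	if err != nil {
		fmt.Println(err)
		return
	}
	// schema.Validate is the helper the fsBackend calls when "validate" is set.
	if err := schema.Validate(schema.ClusterCfg, bytes.NewReader(b)); err != nil {
		fmt.Println("invalid cluster config:", err)
		return
	}
	fmt.Println("cluster config conforms to the embedded JSON schema")
}
```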
@@ -1,10 +1,12 @@
#!/usr/bin/env perl

use strict;
use warnings;
use utf8;

my %INFO;
my %DOMAINS;

my $SMT;
my $numMemoryDomains;
$DOMAINS{socket} = [];
@@ -198,8 +200,11 @@ END

    $INFO{gpus} .= join(",\n", @gpuStr);
    $INFO{gpus} .= "]\n";
+} else {
+    $INFO{gpus} = '';
+}

print <<"END";
{
    "name": "<FILL IN>",
@@ -219,10 +224,10 @@ print <<"END";
    "memoryDomain": [
$INFO{memoryDomains}
    ],
-$INFO{gpus}
    "core": [
$INFO{cores}
    ]
+$INFO{gpus}
}
}
END
docs/ConfigurationManagement.md (new file, 37 lines)
@@ -0,0 +1,37 @@
# Release versioning

Releases are numbered with an integer id, starting at 1.
Every release embeds the following assets into the binary:
* Web frontend including JavaScript files and all static assets
* Golang template files for server-side rendering
* JSON schema files for validation

Remaining external assets are:
* The SQL database used
* The job archive

Both external assets are also versioned using integer ids.
This means every release binary is tied to specific versions of the SQL
database and job archive.
A command line switch `--migrate-db` is provided to migrate the SQL database
from a previous to the most recent version.
We provide a separate tool `archive-migration` to migrate an existing job
archive from the previous to the most recent version.

# Versioning of APIs

cc-backend provides two API backends:
* A REST API for querying jobs
* A GraphQL API used for data exchange between web frontend and cc-backend

Both APIs will also be versioned. We still need to decide whether we also
support older REST API versions by versioning the endpoint URLs.

# How to build a specific release

# How to migrate the SQL database

# How to migrate the job archive
@@ -13,9 +13,18 @@
        "token": "eyJ0eXAiOiJKV1QiLCJhbGciOiJFZERTQSJ9.eyJ1c2VyIjoiYWRtaW4iLCJyb2xlcyI6WyJST0xFX0FETUlOIiwiUk9MRV9BTkFMWVNUIiwiUk9MRV9VU0VSIl19.d-3_3FZTsadPjDEdsWrrQ7nS0edMAR4zjl-eK7rJU3HziNBfI9PDHDIpJVHTNN5E5SlLGLFXctWyKAkwhXL-Dw"
      },
      "filterRanges": {
-        "numNodes": { "from": 1, "to": 32 },
-        "duration": { "from": 0, "to": 172800 },
-        "startTime": { "from": "2010-01-01T00:00:00Z", "to": null }
+        "numNodes": {
+          "from": 1,
+          "to": 32
+        },
+        "duration": {
+          "from": 0,
+          "to": 172800
+        },
+        "startTime": {
+          "from": "2010-01-01T00:00:00Z",
+          "to": null
+        }
      }
    },
    {
@@ -26,9 +35,18 @@
        "token": "eyJ0eXAiOiJKV1QiLCJhbGciOiJFZERTQSJ9.eyJ1c2VyIjoiYWRtaW4iLCJyb2xlcyI6WyJST0xFX0FETUlOIiwiUk9MRV9BTkFMWVNUIiwiUk9MRV9VU0VSIl19.d-3_3FZTsadPjDEdsWrrQ7nS0edMAR4zjl-eK7rJU3HziNBfI9PDHDIpJVHTNN5E5SlLGLFXctWyKAkwhXL-Dw"
      },
      "filterRanges": {
-        "numNodes": { "from": 1, "to": 1 },
-        "duration": { "from": 0, "to": 172800 },
-        "startTime": { "from": "2015-01-01T00:00:00Z", "to": null }
+        "numNodes": {
+          "from": 1,
+          "to": 1
+        },
+        "duration": {
+          "from": 0,
+          "to": 172800
+        },
+        "startTime": {
+          "from": "2015-01-01T00:00:00Z",
+          "to": null
+        }
      }
    }
  ]
@@ -63,6 +63,7 @@ models:
     resolver: true
  NullableFloat: { model: "github.com/ClusterCockpit/cc-backend/pkg/schema.Float" }
  MetricScope: { model: "github.com/ClusterCockpit/cc-backend/pkg/schema.MetricScope" }
+  MetricValue: { model: "github.com/ClusterCockpit/cc-backend/pkg/schema.MetricValue" }
  JobStatistics: { model: "github.com/ClusterCockpit/cc-backend/pkg/schema.JobStatistics" }
  Tag: { model: "github.com/ClusterCockpit/cc-backend/pkg/schema.Tag" }
  Resource: { model: "github.com/ClusterCockpit/cc-backend/pkg/schema.Resource" }
@@ -79,3 +80,4 @@ models:
  FilterRanges: { model: "github.com/ClusterCockpit/cc-backend/pkg/schema.FilterRanges" }
  SubCluster: { model: "github.com/ClusterCockpit/cc-backend/pkg/schema.SubCluster" }
  StatsSeries: { model: "github.com/ClusterCockpit/cc-backend/pkg/schema.StatsSeries" }
+  Unit: { model: "github.com/ClusterCockpit/cc-backend/pkg/schema.Unit" }
File diff suppressed because it is too large.
@@ -60,6 +60,7 @@ type JobFilter struct {

type JobMetricWithName struct {
	Name   string             `json:"name"`
+	Scope  schema.MetricScope `json:"scope"`
	Metric *schema.JobMetric  `json:"metric"`
}
@@ -194,12 +194,9 @@ func (r *queryResolver) JobMetrics(ctx context.Context, id string, metrics []str
	res := []*model.JobMetricWithName{}
	for name, md := range data {
		for scope, metric := range md {
-			if metric.Scope != schema.MetricScope(scope) {
-				log.Panic("metric.Scope != schema.MetricScope(scope) : Should not happen!")
-			}
-
			res = append(res, &model.JobMetricWithName{
				Name:   name,
+				Scope:  scope,
				Metric: metric,
			})
		}
@@ -296,6 +293,7 @@ func (r *queryResolver) NodeMetrics(ctx context.Context, cluster string, nodes [
		for _, scopedMetric := range scopedMetrics {
			host.Metrics = append(host.Metrics, &model.JobMetricWithName{
				Name:   metric,
+				Scope:  schema.MetricScopeNode,
				Metric: scopedMetric,
			})
		}
@@ -307,6 +305,15 @@ func (r *queryResolver) NodeMetrics(ctx context.Context, cluster string, nodes [
	return nodeMetrics, nil
}

+// NumberOfNodes is the resolver for the numberOfNodes field.
+func (r *subClusterResolver) NumberOfNodes(ctx context.Context, obj *schema.SubCluster) (int, error) {
+	nodeList, err := archive.ParseNodeList(obj.Nodes)
+	if err != nil {
+		return 0, err
+	}
+	return nodeList.NodeCount(), nil
+}
+
// Cluster returns generated.ClusterResolver implementation.
func (r *Resolver) Cluster() generated.ClusterResolver { return &clusterResolver{r} }
@@ -319,7 +326,11 @@ func (r *Resolver) Mutation() generated.MutationResolver { return &mutationResol
// Query returns generated.QueryResolver implementation.
func (r *Resolver) Query() generated.QueryResolver { return &queryResolver{r} }

+// SubCluster returns generated.SubClusterResolver implementation.
+func (r *Resolver) SubCluster() generated.SubClusterResolver { return &subClusterResolver{r} }
+
type clusterResolver struct{ *Resolver }
type jobResolver struct{ *Resolver }
type mutationResolver struct{ *Resolver }
type queryResolver struct{ *Resolver }
+type subClusterResolver struct{ *Resolver }
@@ -164,7 +164,6 @@ func (ccms *CCMetricStore) LoadData(
	scopes []schema.MetricScope,
	ctx context.Context) (schema.JobData, error) {

-	topology := archive.GetSubCluster(job.Cluster, job.SubCluster).Topology
	queries, assignedScope, err := ccms.buildQueries(job, metrics, scopes)
	if err != nil {
		log.Warn("Error while building queries")
@@ -201,7 +200,6 @@ func (ccms *CCMetricStore) LoadData(
	if !ok {
		jobMetric = &schema.JobMetric{
			Unit:     mc.Unit,
-			Scope:    scope,
			Timestep: mc.Timestep,
			Series:   make([]schema.Series, 0),
		}
@@ -215,13 +213,10 @@ func (ccms *CCMetricStore) LoadData(
		continue
	}

-	id := (*int)(nil)
+	id := (*string)(nil)
	if query.Type != nil {
-		id = new(int)
-		*id, err = strconv.Atoi(query.TypeIds[0])
-		if err != nil || *query.Type == acceleratorString {
-			*id, _ = topology.GetAcceleratorIndex(query.TypeIds[0])
-		}
+		id = new(string)
+		*id = query.TypeIds[0]
	}

	if res.Avg.IsNaN() || res.Min.IsNaN() || res.Max.IsNaN() {
@@ -235,7 +230,7 @@ func (ccms *CCMetricStore) LoadData(
	jobMetric.Series = append(jobMetric.Series, schema.Series{
		Hostname: query.Hostname,
		Id:       id,
-		Statistics: &schema.MetricStatistics{
+		Statistics: schema.MetricStatistics{
			Avg: float64(res.Avg),
			Min: float64(res.Min),
			Max: float64(res.Max),
@@ -275,9 +270,14 @@ func (ccms *CCMetricStore) buildQueries(
	scopes []schema.MetricScope) ([]ApiQuery, []schema.MetricScope, error) {

	queries := make([]ApiQuery, 0, len(metrics)*len(scopes)*len(job.Resources))
-	topology := archive.GetSubCluster(job.Cluster, job.SubCluster).Topology
	assignedScope := []schema.MetricScope{}

+	subcluster, scerr := archive.GetSubCluster(job.Cluster, job.SubCluster)
+	if scerr != nil {
+		return nil, nil, scerr
+	}
+	topology := subcluster.Topology
+
	for _, metric := range metrics {
		remoteName := ccms.toRemoteName(metric)
		mc := archive.GetMetricConfig(job.Cluster, metric)
@@ -293,7 +293,7 @@ func (ccms *CCMetricStore) buildQueries(
scopesLoop:
	for _, requestedScope := range scopes {
		nativeScope := mc.Scope
-		if nativeScope == schema.MetricScopeAccelerator && job.NumAcc == 0 {
+		if nativeScope == schema.MetricScopeAccelerator && job.NumAcc == nil {
			continue
		}
@@ -624,13 +624,12 @@ func (ccms *CCMetricStore) LoadNodeData(
	mc := archive.GetMetricConfig(cluster, metric)
	hostdata[metric] = append(hostdata[metric], &schema.JobMetric{
		Unit:     mc.Unit,
-		Scope:    schema.MetricScopeNode,
		Timestep: mc.Timestep,
		Series: []schema.Series{
			{
				Hostname: query.Hostname,
				Data:     qdata.Data,
-				Statistics: &schema.MetricStatistics{
+				Statistics: schema.MetricStatistics{
					Avg: float64(qdata.Avg),
					Min: float64(qdata.Min),
					Max: float64(qdata.Max),
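The net effect of the hunks above: per-series ids are now passed through as strings instead of being parsed into integers and mapped through the accelerator index. A minimal sketch of the new behavior, with a hypothetical helper name and an invented GPU id; `typeIds` mimics `query.TypeIds`:

```go
package main

import "fmt"

// seriesID mimics the new id handling in LoadData: when a query has a
// type (core, accelerator, ...), its first type id is used verbatim.
func seriesID(typeIds []string, hasType bool) *string {
	var id *string
	if hasType {
		id = new(string)
		*id = typeIds[0] // previously: strconv.Atoi plus an accelerator index lookup
	}
	return id
}

func main() {
	if id := seriesID([]string{"00000000:0B:00.0"}, true); id != nil {
		fmt.Println(*id)
	}
}
```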
@@ -134,7 +134,6 @@ func (idb *InfluxDBv2DataRepository) LoadData(
	jobMetric = map[schema.MetricScope]*schema.JobMetric{
		scope: { // uses scope var from above!
			Unit:     mc.Unit,
-			Scope:    scope,
			Timestep: mc.Timestep,
			Series:   make([]schema.Series, 0, len(job.Resources)),
			StatisticsSeries: nil, // Should be: &schema.StatsSeries{},
@@ -159,7 +158,7 @@ func (idb *InfluxDBv2DataRepository) LoadData(
	field, host = row.Measurement(), row.ValueByKey("hostname").(string)
	hostSeries = schema.Series{
		Hostname:   host,
-		Statistics: nil,
+		Statistics: schema.MetricStatistics{}, //TODO Add Statistics
		Data:       make([]schema.Float, 0),
	}
}
@@ -212,15 +211,10 @@ func (idb *InfluxDBv2DataRepository) LoadData(
	for _, scope := range scopes {
		if scope == "node" { // No 'socket/core' support yet
			for metric, nodes := range stats {
-				// log.Debugf("<< Add Stats for : Field %s >>", metric)
				for node, stats := range nodes {
-					// log.Debugf("<< Add Stats for : Host %s : Min %.2f, Max %.2f, Avg %.2f >>", node, stats.Min, stats.Max, stats.Avg )
					for index, _ := range jobData[metric][scope].Series {
-						// log.Debugf("<< Try to add Stats to Series in Position %d >>", index)
						if jobData[metric][scope].Series[index].Hostname == node {
-							// log.Debugf("<< Match for Series in Position %d : Host %s >>", index, jobData[metric][scope].Series[index].Hostname)
-							jobData[metric][scope].Series[index].Statistics = &schema.MetricStatistics{Avg: stats.Avg, Min: stats.Min, Max: stats.Max}
-							// log.Debugf("<< Result Inner: Min %.2f, Max %.2f, Avg %.2f >>", jobData[metric][scope].Series[index].Statistics.Min, jobData[metric][scope].Series[index].Statistics.Max, jobData[metric][scope].Series[index].Statistics.Avg)
+							jobData[metric][scope].Series[index].Statistics = schema.MetricStatistics{Avg: stats.Avg, Min: stats.Min, Max: stats.Max}
						}
					}
				}
@@ -228,17 +222,6 @@ func (idb *InfluxDBv2DataRepository) LoadData(
		}
	}

-	// DEBUG:
-	// for _, scope := range scopes {
-	//	for _, met := range metrics {
-	//		for _, series := range jobData[met][scope].Series {
-	//			log.Debugf("<< Result: %d data points for metric %s on %s with scope %s, Stats: Min %.2f, Max %.2f, Avg %.2f >>",
-	//				len(series.Data), met, series.Hostname, scope,
-	//				series.Statistics.Min, series.Statistics.Max, series.Statistics.Avg)
-	//		}
-	//	}
-	// }
-
	return jobData, nil
}
@@ -335,7 +335,10 @@ func ArchiveJob(job *schema.Job, ctx context.Context) (*schema.JobMeta, error) {
	}

	jobMeta.Statistics[metric] = schema.JobStatistics{
-		Unit: archive.GetMetricConfig(job.Cluster, metric).Unit,
+		Unit: schema.Unit{
+			Prefix: archive.GetMetricConfig(job.Cluster, metric).Unit.Prefix,
+			Base:   archive.GetMetricConfig(job.Cluster, metric).Unit.Base,
+		},
		Avg: avg / float64(job.NumNodes),
		Min: min,
		Max: max,
@@ -251,7 +251,7 @@ func (pdb *PrometheusDataRepository) RowToSeries(
	return schema.Series{
		Hostname: hostname,
		Data:     values,
-		Statistics: &schema.MetricStatistics{
+		Statistics: schema.MetricStatistics{
			Avg: mean,
			Min: min,
			Max: max,
@@ -323,7 +323,6 @@ func (pdb *PrometheusDataRepository) LoadData(
	if !ok {
		jobMetric = &schema.JobMetric{
			Unit:     metricConfig.Unit,
-			Scope:    scope,
			Timestep: metricConfig.Timestep,
			Series:   make([]schema.Series, 0),
		}
@@ -362,7 +361,7 @@ func (pdb *PrometheusDataRepository) LoadStats(
	for metric, metricData := range data {
		stats[metric] = make(map[string]schema.MetricStatistics)
		for _, series := range metricData[schema.MetricScopeNode].Series {
-			stats[metric][series.Hostname] = *series.Statistics
+			stats[metric][series.Hostname] = series.Statistics
		}
	}
@@ -432,7 +431,6 @@ func (pdb *PrometheusDataRepository) LoadNodeData(
	// output per host and metric
	hostdata[metric] = append(hostdata[metric], &schema.JobMetric{
		Unit:     metricConfig.Unit,
-		Scope:    scope,
		Timestep: metricConfig.Timestep,
		Series:   []schema.Series{pdb.RowToSeries(from, step, steps, row)},
	},
@@ -17,6 +17,7 @@ import (
	"github.com/ClusterCockpit/cc-backend/pkg/archive"
	"github.com/ClusterCockpit/cc-backend/pkg/log"
	"github.com/ClusterCockpit/cc-backend/pkg/schema"
+	"github.com/ClusterCockpit/cc-backend/pkg/units"
)

const NamedJobInsert string = `INSERT INTO job (
@@ -75,6 +76,7 @@ func HandleImportFlag(flag string) error {
		return err
	}

+	checkJobData(&jobData)
	SanityChecks(&jobMeta.BaseJob)
	jobMeta.MonitoringStatus = schema.MonitoringStatusArchivingSuccessful
	if job, err := GetJobRepository().Find(&jobMeta.JobID, &jobMeta.Cluster, &jobMeta.StartTime); err != sql.ErrNoRows {
@@ -173,7 +175,9 @@ func InitDB() error {
	i := 0
	errorOccured := 0

-	for jobMeta := range ar.Iter() {
+	for jobContainer := range ar.Iter(false) {
+
+		jobMeta := jobContainer.Meta

		// // Bundle 100 inserts into one transaction for better performance:
		if i%10 == 0 {
@@ -297,7 +301,7 @@ func SanityChecks(job *schema.BaseJob) error {
	if len(job.Resources) == 0 || len(job.User) == 0 {
		return fmt.Errorf("'resources' and 'user' should not be empty")
	}
-	if job.NumAcc < 0 || job.NumHWThreads < 0 || job.NumNodes < 1 {
+	if *job.NumAcc < 0 || *job.NumHWThreads < 0 || job.NumNodes < 1 {
		return fmt.Errorf("'numNodes', 'numAcc' or 'numHWThreads' invalid")
	}
	if len(job.Resources) != int(job.NumNodes) {
@@ -314,3 +318,34 @@ func loadJobStat(job *schema.JobMeta, metric string) float64 {

	return 0.0
}
+
+func checkJobData(d *schema.JobData) error {
+	for _, scopes := range *d {
+		var newUnit string
+		// Add node scope if missing
+		for _, metric := range scopes {
+			if strings.Contains(metric.Unit.Base, "B/s") ||
+				strings.Contains(metric.Unit.Base, "F/s") ||
+				strings.Contains(metric.Unit.Base, "B") {
+
+				// First get overall avg
+				sum := 0.0
+				for _, s := range metric.Series {
+					sum += s.Statistics.Avg
+				}
+
+				avg := sum / float64(len(metric.Series))
+
+				for _, s := range metric.Series {
+					fp := schema.ConvertFloatToFloat64(s.Data)
+					// Normalize values with new unit prefix
+					oldUnit := metric.Unit.Base
+					units.NormalizeSeries(fp, avg, oldUnit, &newUnit)
+					s.Data = schema.GetFloat64ToFloat(fp)
+				}
+				metric.Unit.Base = newUnit
+			}
+		}
+	}
+	return nil
+}
@@ -335,7 +335,13 @@ func (r *JobRepository) DeleteJobById(id int64) error {
}

// TODO: Use node hours instead: SELECT job.user, sum(job.num_nodes * (CASE WHEN job.job_state = "running" THEN CAST(strftime('%s', 'now') AS INTEGER) - job.start_time ELSE job.duration END)) as x FROM job GROUP BY user ORDER BY x DESC;
-func (r *JobRepository) CountGroupedJobs(ctx context.Context, aggreg model.Aggregate, filters []*model.JobFilter, weight *model.Weights, limit *int) (map[string]int, error) {
+func (r *JobRepository) CountGroupedJobs(
+	ctx context.Context,
+	aggreg model.Aggregate,
+	filters []*model.JobFilter,
+	weight *model.Weights,
+	limit *int) (map[string]int, error) {
+
	start := time.Now()
	if !aggreg.IsValid() {
		return nil, errors.New("invalid aggregate")
@@ -8,13 +8,15 @@ import (
	"encoding/json"
	"fmt"

+	"github.com/ClusterCockpit/cc-backend/pkg/log"
	"github.com/ClusterCockpit/cc-backend/pkg/lrucache"
	"github.com/ClusterCockpit/cc-backend/pkg/schema"
-	"github.com/ClusterCockpit/cc-backend/pkg/log"
)

+const Version = 1
+
type ArchiveBackend interface {
-	Init(rawConfig json.RawMessage) error
+	Init(rawConfig json.RawMessage) (int, error)

	LoadJobMeta(job *schema.Job) (*schema.JobMeta, error)
@@ -28,7 +30,12 @@ type ArchiveBackend interface {

	GetClusters() []string

-	Iter() <-chan *schema.JobMeta
+	Iter(loadMetricData bool) <-chan JobContainer
+}
+
+type JobContainer struct {
+	Meta *schema.JobMeta
+	Data *schema.JobData
}

var cache *lrucache.Cache = lrucache.New(128 * 1024 * 1024)
@@ -54,10 +61,12 @@ func Init(rawConfig json.RawMessage, disableArchive bool) error {
		return fmt.Errorf("ARCHIVE/ARCHIVE > unkown archive backend '%s''", kind.Kind)
	}

-	if err := ar.Init(rawConfig); err != nil {
+	version, err := ar.Init(rawConfig)
+	if err != nil {
		log.Error("Error while initializing archiveBackend")
		return err
	}
+	log.Infof("Load archive version %d", version)
	return initClusterConfig()
}
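A sketch of consuming the reworked iterator, assuming the `ArchiveBackend` interface above; with `loadMetricData` set, each `JobContainer` carries both meta and metric data (`CountMetrics` is a hypothetical helper):

```go
package demo

import (
	"fmt"

	"github.com/ClusterCockpit/cc-backend/pkg/archive"
)

// CountMetrics walks an archive and reports how many metrics each job has.
// With loadMetricData=true the backend also decodes data.json(.gz).
func CountMetrics(ar archive.ArchiveBackend) {
	for job := range ar.Iter(true) {
		if job.Data != nil {
			fmt.Printf("job %d: %d metrics\n", job.Meta.JobID, len(*job.Data))
		}
	}
}
```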
@@ -55,7 +55,7 @@ func initClusterConfig() error {

	nodeLists[cluster.Name] = make(map[string]NodeList)
	for _, sc := range cluster.SubClusters {
-		if sc.Nodes == "" {
+		if sc.Nodes == "*" {
			continue
		}
@@ -80,18 +80,17 @@ func GetCluster(cluster string) *schema.Cluster {
	return nil
}

-func GetSubCluster(cluster, subcluster string) *schema.SubCluster {
-
+func GetSubCluster(cluster, subcluster string) (*schema.SubCluster, error) {
	for _, c := range Clusters {
		if c.Name == cluster {
			for _, p := range c.SubClusters {
				if p.Name == subcluster {
-					return p
+					return p, nil
				}
			}
		}
	}
-	return nil
+	return nil, fmt.Errorf("Subcluster '%v' not found for cluster '%v', or cluster '%v' not configured!", subcluster, cluster, cluster)
}

func GetMetricConfig(cluster, metric string) *schema.MetricConfig {
@@ -138,7 +137,7 @@ func AssignSubCluster(job *schema.BaseJob) error {
		}
	}

-	if cluster.SubClusters[0].Nodes == "" {
+	if cluster.SubClusters[0].Nodes == "*" {
		job.SubCluster = cluster.SubClusters[0].Name
		return nil
	}
@@ -7,17 +7,21 @@ package archive

import (
	"bufio"
	"bytes"
+	"compress/gzip"
	"encoding/json"
+	"errors"
	"fmt"
	"os"
	"path"
	"path/filepath"
	"strconv"
	"strings"
	"time"

+	"github.com/ClusterCockpit/cc-backend/internal/config"
	"github.com/ClusterCockpit/cc-backend/pkg/log"
	"github.com/ClusterCockpit/cc-backend/pkg/schema"
+	"github.com/santhosh-tekuri/jsonschema/v5"
)

type FsArchiveConfig struct {
@@ -29,6 +33,11 @@ type FsArchive struct {
	clusters []string
}

+func checkFileExists(filePath string) bool {
+	_, err := os.Stat(filePath)
+	return !errors.Is(err, os.ErrNotExist)
+}
+
func getPath(
	job *schema.Job,
	rootPath string,
@@ -44,54 +53,109 @@ func getPath(

func loadJobMeta(filename string) (*schema.JobMeta, error) {

-	f, err := os.Open(filename)
+	b, err := os.ReadFile(filename)
	if err != nil {
		log.Errorf("loadJobMeta() > open file error: %v", err)
		return &schema.JobMeta{}, err
	}
-	defer f.Close()
-
-	return DecodeJobMeta(bufio.NewReader(f))
+	if config.Keys.Validate {
+		if err := schema.Validate(schema.Meta, bytes.NewReader(b)); err != nil {
+			return &schema.JobMeta{}, fmt.Errorf("validate job meta: %v", err)
+		}
+	}
+
+	return DecodeJobMeta(bytes.NewReader(b))
+}
+
+func loadJobData(filename string, isCompressed bool) (schema.JobData, error) {
+	f, err := os.Open(filename)
+
+	if err != nil {
+		log.Errorf("fsBackend LoadJobData()- %v", err)
+		return nil, err
+	}
+
+	if isCompressed {
+		r, err := gzip.NewReader(f)
+		if err != nil {
+			log.Errorf(" %v", err)
+			return nil, err
+		}
+		defer r.Close()
+
+		if config.Keys.Validate {
+			if err := schema.Validate(schema.Data, r); err != nil {
+				return schema.JobData{}, fmt.Errorf("validate job data: %v", err)
+			}
+		}
+
+		return DecodeJobData(r, filename)
+	} else {
+		defer f.Close()
+		if config.Keys.Validate {
+			if err := schema.Validate(schema.Data, bufio.NewReader(f)); err != nil {
+				return schema.JobData{}, fmt.Errorf("validate job data: %v", err)
+			}
+		}
+		return DecodeJobData(bufio.NewReader(f), filename)
+	}
+}

-func (fsa *FsArchive) Init(rawConfig json.RawMessage) error {
+func (fsa *FsArchive) Init(rawConfig json.RawMessage) (int, error) {

	var config FsArchiveConfig
	if err := json.Unmarshal(rawConfig, &config); err != nil {
		log.Warnf("Init() > Unmarshal error: %#v", err)
-		return err
+		return 0, err
	}
	if config.Path == "" {
		err := fmt.Errorf("Init() : empty config.Path")
		log.Errorf("Init() > config.Path error: %v", err)
-		return err
+		return 0, err
	}
	fsa.path = config.Path

+	b, err := os.ReadFile(filepath.Join(fsa.path, "version.txt"))
+	if err != nil {
+		fmt.Println("Err")
+		return 0, err
+	}
+
+	version, err := strconv.Atoi(strings.TrimSuffix(string(b), "\n"))
+	if err != nil {
+		log.Errorf("fsBackend Init()- %v", err)
+		return 0, err
+	}
+
+	if version != Version {
+		return version, fmt.Errorf("unsupported version %d, need %d", version, Version)
+	}
+
	entries, err := os.ReadDir(fsa.path)
	if err != nil {
		log.Errorf("Init() > ReadDir() error: %v", err)
-		return err
+		return 0, err
	}

	for _, de := range entries {
		if !de.IsDir() {
			continue
		}
		fsa.clusters = append(fsa.clusters, de.Name())
	}

-	return nil
+	return version, nil
}

func (fsa *FsArchive) LoadJobData(job *schema.Job) (schema.JobData, error) {

-	filename := getPath(job, fsa.path, "data.json")
-	f, err := os.Open(filename)
-	if err != nil {
-		log.Errorf("LoadJobData() > open file error: %v", err)
-		return nil, err
-	}
-	defer f.Close()
-
-	return DecodeJobData(bufio.NewReader(f), filename)
+	var isCompressed bool = true
+	filename := getPath(job, fsa.path, "data.json.gz")
+	if !checkFileExists(filename) {
+		filename = getPath(job, fsa.path, "data.json")
+		isCompressed = false
+	}
+
+	return loadJobData(filename, isCompressed)
}

func (fsa *FsArchive) LoadJobMeta(job *schema.Job) (*schema.JobMeta, error) {
@@ -105,20 +169,19 @@ func (fsa *FsArchive) LoadClusterCfg(name string) (*schema.Cluster, error) {
	b, err := os.ReadFile(filepath.Join(fsa.path, name, "cluster.json"))
	if err != nil {
		log.Errorf("LoadClusterCfg() > open file error: %v", err)
		return &schema.Cluster{}, err
	}
-	if config.Keys.Validate {
+	// if config.Keys.Validate {
	if err := schema.Validate(schema.ClusterCfg, bytes.NewReader(b)); err != nil {
-		log.Warnf("Validate cluster config: %v\n", err)
-		return &schema.Cluster{}, fmt.Errorf("Validate cluster config: %v\n", err)
+		return &schema.Cluster{}, fmt.Errorf("validate cluster config: %v", err)
	}
-	}
+	// }
	return DecodeCluster(bytes.NewReader(b))
}

-func (fsa *FsArchive) Iter() <-chan *schema.JobMeta {
+func (fsa *FsArchive) Iter(loadMetricData bool) <-chan JobContainer {

-	ch := make(chan *schema.JobMeta)
+	ch := make(chan JobContainer)
	go func() {
		clustersDir, err := os.ReadDir(fsa.path)
		if err != nil {
@@ -126,6 +189,9 @@ func (fsa *FsArchive) Iter(loadMetricData bool) <-chan JobContainer {
	}

	for _, clusterDir := range clustersDir {
+		if !clusterDir.IsDir() {
+			continue
+		}
		lvl1Dirs, err := os.ReadDir(filepath.Join(fsa.path, clusterDir.Name()))
		if err != nil {
			log.Fatalf("Reading jobs failed @ lvl1 dirs: %s", err.Error())
@@ -152,10 +218,27 @@ func (fsa *FsArchive) Iter(loadMetricData bool) <-chan JobContainer {
	for _, startTimeDir := range startTimeDirs {
		if startTimeDir.IsDir() {
			job, err := loadJobMeta(filepath.Join(dirpath, startTimeDir.Name(), "meta.json"))
-			if err != nil {
-				log.Errorf("error in %s: %s", filepath.Join(dirpath, startTimeDir.Name()), err.Error())
+			if err != nil && !errors.Is(err, &jsonschema.ValidationError{}) {
+				log.Errorf("in %s: %s", filepath.Join(dirpath, startTimeDir.Name()), err.Error())
			}

+			if loadMetricData {
+				var isCompressed bool = true
+				filename := filepath.Join(dirpath, startTimeDir.Name(), "data.json.gz")
+
+				if !checkFileExists(filename) {
+					filename = filepath.Join(dirpath, startTimeDir.Name(), "data.json")
+					isCompressed = false
+				}
+
+				data, err := loadJobData(filename, isCompressed)
+				if err != nil && !errors.Is(err, &jsonschema.ValidationError{}) {
+					log.Errorf("in %s: %s", filepath.Join(dirpath, startTimeDir.Name()), err.Error())
+				}
+				ch <- JobContainer{Meta: job, Data: &data}
			} else {
-				ch <- job
+				ch <- JobContainer{Meta: job, Data: nil}
			}
		}
	}
@@ -225,6 +308,28 @@ func (fsa *FsArchive) ImportJob(
		return err
	}

+	// var isCompressed bool = true
+	// // TODO Use shortJob Config for check
+	// if jobMeta.Duration < 300 {
+	// 	isCompressed = false
+	// 	f, err = os.Create(path.Join(dir, "data.json"))
+	// } else {
+	// 	f, err = os.Create(path.Join(dir, "data.json.gz"))
+	// }
+	// if err != nil {
+	// 	return err
+	// }
+	//
+	// if isCompressed {
+	// 	if err := EncodeJobData(gzip.NewWriter(f), jobData); err != nil {
+	// 		return err
+	// 	}
+	// } else {
+	// 	if err := EncodeJobData(f, jobData); err != nil {
+	// 		return err
+	// 	}
+	// }
+
	f, err = os.Create(path.Join(dir, "data.json"))
	if err != nil {
		log.Error("Error while creating filepath for data.json")
@@ -236,9 +341,6 @@ func (fsa *FsArchive) ImportJob(
	}
	if err := f.Close(); err != nil {
		log.Warn("Error while closing data.json file")
	}
-	return err
-}
+	// no error: final return is nil
+	return nil
+}
@@ -20,7 +20,7 @@ func init() {

func TestInitEmptyPath(t *testing.T) {
	var fsa FsArchive
-	err := fsa.Init(json.RawMessage("{\"kind\":\"../../test/archive\"}"))
+	_, err := fsa.Init(json.RawMessage("{\"kind\":\"../../test/archive\"}"))
	if err == nil {
		t.Fatal(err)
	}
@@ -28,14 +28,14 @@ func TestInitEmptyPath(t *testing.T) {

func TestInitNoJson(t *testing.T) {
	var fsa FsArchive
-	err := fsa.Init(json.RawMessage("\"path\":\"../../test/archive\"}"))
+	_, err := fsa.Init(json.RawMessage("\"path\":\"../../test/archive\"}"))
	if err == nil {
		t.Fatal(err)
	}
}
func TestInitNotExists(t *testing.T) {
	var fsa FsArchive
-	err := fsa.Init(json.RawMessage("{\"path\":\"../../test/job-archive\"}"))
+	_, err := fsa.Init(json.RawMessage("{\"path\":\"../../test/job-archive\"}"))
	if err == nil {
		t.Fatal(err)
	}
@@ -43,15 +43,16 @@ func TestInitNotExists(t *testing.T) {

func TestInit(t *testing.T) {
	var fsa FsArchive
-	err := fsa.Init(json.RawMessage("{\"path\":\"../../test/archive\"}"))
+	version, err := fsa.Init(json.RawMessage("{\"path\":\"../../test/archive\"}"))
	if err != nil {
		t.Fatal(err)
	}

	if fsa.path != "../../test/archive" {
		t.Fail()
	}
+	if version != 1 {
+		t.Fail()
+	}
	if len(fsa.clusters) != 1 || fsa.clusters[0] != "emmy" {
		t.Fail()
	}
@@ -59,7 +60,7 @@ func TestInit(t *testing.T) {

func TestLoadJobMetaInternal(t *testing.T) {
	var fsa FsArchive
-	err := fsa.Init(json.RawMessage("{\"path\":\"../../test/archive\"}"))
+	_, err := fsa.Init(json.RawMessage("{\"path\":\"../../test/archive\"}"))
	if err != nil {
		t.Fatal(err)
	}
@@ -82,7 +83,7 @@ func TestLoadJobMetaInternal(t *testing.T) {

func TestLoadJobMeta(t *testing.T) {
	var fsa FsArchive
-	err := fsa.Init(json.RawMessage("{\"path\":\"../../test/archive\"}"))
+	_, err := fsa.Init(json.RawMessage("{\"path\":\"../../test/archive\"}"))
	if err != nil {
		t.Fatal(err)
	}
@@ -110,7 +111,7 @@ func TestLoadJobMeta(t *testing.T) {

func TestLoadJobData(t *testing.T) {
	var fsa FsArchive
-	err := fsa.Init(json.RawMessage("{\"path\":\"../../test/archive\"}"))
+	_, err := fsa.Init(json.RawMessage("{\"path\":\"../../test/archive\"}"))
	if err != nil {
		t.Fatal(err)
	}
@@ -136,7 +137,7 @@ func TestLoadJobData(t *testing.T) {

func TestLoadCluster(t *testing.T) {
	var fsa FsArchive
-	err := fsa.Init(json.RawMessage("{\"path\":\"../../test/archive\"}"))
+	_, err := fsa.Init(json.RawMessage("{\"path\":\"../../test/archive\"}"))
	if err != nil {
		t.Fatal(err)
	}
@@ -146,22 +147,22 @@ func TestLoadCluster(t *testing.T) {
		t.Fatal(err)
	}

-	if cfg.SubClusters[0].CoresPerSocket != 10 {
+	if cfg.SubClusters[0].CoresPerSocket != 4 {
		t.Fail()
	}
}

func TestIter(t *testing.T) {
	var fsa FsArchive
-	err := fsa.Init(json.RawMessage("{\"path\":\"../../test/archive\"}"))
+	_, err := fsa.Init(json.RawMessage("{\"path\":\"../../test/archive\"}"))
	if err != nil {
		t.Fatal(err)
	}

-	for job := range fsa.Iter() {
-		fmt.Printf("Job %d\n", job.JobID)
+	for job := range fsa.Iter(false) {
+		fmt.Printf("Job %d\n", job.Meta.JobID)

-		if job.Cluster != "emmy" {
+		if job.Meta.Cluster != "emmy" {
			t.Fail()
		}
	}
@@ -14,6 +14,8 @@ import (

type NodeList [][]interface {
	consume(input string) (next string, ok bool)
+	limits() []map[string]int
+	prefix() string
}

func (nl *NodeList) Contains(name string) bool {
@@ -35,6 +37,44 @@ func (nl *NodeList) Contains(name string) bool {
	return false
}

+func (nl *NodeList) PrintList() []string {
+	var out []string
+	for _, term := range *nl {
+		// Get String-Part first
+		prefix := term[0].prefix()
+		if len(term) == 1 { // If only String-Part in Term: Single Node Name -> Use as provided
+			out = append(out, prefix)
+		} else { // Else: Numeric start-end definition with x digits zeroPadded
+			limitArr := term[1].limits()
+			for _, inner := range limitArr {
+				for i := inner["start"]; i < inner["end"]+1; i++ {
+					if inner["zeroPadded"] == 1 {
+						out = append(out, fmt.Sprintf("%s%0*d", prefix, inner["digits"], i))
+					} else {
+						log.Error("node list: only zero-padded ranges are allowed")
+					}
+				}
+			}
+		}
+	}
+	return out
+}
+
+func (nl *NodeList) NodeCount() int {
+	var out int = 0
+	for _, term := range *nl {
+		if len(term) == 1 { // If only String-Part in Term: Single Node Name -> add one
+			out += 1
+		} else { // Else: Numeric start-end definition -> add difference + 1
+			limitArr := term[1].limits()
+			for _, inner := range limitArr {
+				out += (inner["end"] - inner["start"]) + 1
+			}
+		}
+	}
+	return out
+}
+
type NLExprString string

func (nle NLExprString) consume(input string) (next string, ok bool) {
@@ -45,6 +85,16 @@ func (nle NLExprString) consume(input string) (next string, ok bool) {
	return "", false
}

+func (nle NLExprString) limits() []map[string]int {
+	// Null implementation to fullfill interface requirement
+	l := make([]map[string]int, 0)
+	return l
+}
+
+func (nle NLExprString) prefix() string {
+	return string(nle)
+}
+
type NLExprIntRanges []NLExprIntRange

func (nles NLExprIntRanges) consume(input string) (next string, ok bool) {
@@ -56,6 +106,21 @@ func (nles NLExprIntRanges) consume(input string) (next string, ok bool) {
	return "", false
}

+func (nles NLExprIntRanges) limits() []map[string]int {
+	l := make([]map[string]int, 0)
+	for _, nle := range nles {
+		inner := nle.limits()
+		l = append(l, inner[0])
+	}
+	return l
+}
+
+func (nles NLExprIntRanges) prefix() string {
+	// Null implementation to fullfill interface requirement
+	var s string
+	return s
+}
+
type NLExprIntRange struct {
	start, end int64
	zeroPadded bool
@@ -89,6 +154,27 @@ func (nle NLExprIntRange) consume(input string) (next string, ok bool) {
	return "", false
}

+func (nle NLExprIntRange) limits() []map[string]int {
+	l := make([]map[string]int, 0)
+	m := make(map[string]int)
+	m["start"] = int(nle.start)
+	m["end"] = int(nle.end)
+	m["digits"] = int(nle.digits)
+	if nle.zeroPadded == true {
+		m["zeroPadded"] = 1
+	} else {
+		m["zeroPadded"] = 0
+	}
+	l = append(l, m)
+	return l
+}
+
+func (nles NLExprIntRange) prefix() string {
+	// Null implementation to fullfill interface requirement
+	var s string
+	return s
+}
+
func ParseNodeList(raw string) (NodeList, error) {
	isLetter := func(r byte) bool { return ('a' <= r && r <= 'z') || ('A' <= r && r <= 'Z') }
	isDigit := func(r byte) bool { return '0' <= r && r <= '9' }
@@ -117,6 +203,8 @@ func ParseNodeList(raw string) (NodeList, error) {
	for _, rawterm := range rawterms {
		exprs := []interface {
			consume(input string) (next string, ok bool)
+			limits() []map[string]int
+			prefix() string
		}{}

		for i := 0; i < len(rawterm); i++ {
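A usage sketch for the new `NodeList` helpers, using the node-list expression from this PR's test cluster config; the expected count follows from summing `(end-start)+1` per range, and the expansion assumes zero-padded two-digit ranges as handled by `PrintList` above:

```go
package main

import (
	"fmt"

	"github.com/ClusterCockpit/cc-backend/pkg/archive"
)

func main() {
	// Node-list expression taken from the test cluster config in this PR.
	nl, err := archive.ParseNodeList("w11[27-45,49-63,69-72]")
	if err != nil {
		panic(err)
	}
	// (45-27+1) + (63-49+1) + (72-69+1) = 19 + 15 + 4 = 38
	fmt.Println(nl.NodeCount())
	// Expansion of the first entries: w1127, w1128, w1129, ...
	fmt.Println(nl.PrintList()[:3])
}
```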
@@ -4,7 +4,10 @@
// license that can be found in the LICENSE file.
package schema

-import "strconv"
+import (
+	"fmt"
+	"strconv"
+)

type Accelerator struct {
	ID string `json:"id"`
@@ -16,23 +19,27 @@ type Topology struct {
	Node         []int          `json:"node"`
	Socket       [][]int        `json:"socket"`
	MemoryDomain [][]int        `json:"memoryDomain"`
-	Die          [][]int        `json:"die"`
+	Die          [][]*int       `json:"die,omitempty"`
	Core         [][]int        `json:"core"`
-	Accelerators []*Accelerator `json:"accelerators"`
+	Accelerators []*Accelerator `json:"accelerators,omitempty"`
}

+type MetricValue struct {
+	Unit  Unit    `json:"unit"`
+	Value float64 `json:"value"`
+}
+
type SubCluster struct {
	Name            string      `json:"name"`
	Nodes           string      `json:"nodes"`
	NumberOfNodes   int         `json:"numberOfNodes"`
	ProcessorType   string      `json:"processorType"`
	SocketsPerNode  int         `json:"socketsPerNode"`
	CoresPerSocket  int         `json:"coresPerSocket"`
	ThreadsPerCore  int         `json:"threadsPerCore"`
-	FlopRateScalar  int         `json:"flopRateScalar"`
-	FlopRateSimd    int         `json:"flopRateSimd"`
-	MemoryBandwidth int         `json:"memoryBandwidth"`
-	Topology        *Topology   `json:"topology"`
+	FlopRateScalar  MetricValue `json:"flopRateScalar"`
+	FlopRateSimd    MetricValue `json:"flopRateSimd"`
+	MemoryBandwidth MetricValue `json:"memoryBandwidth"`
+	Topology        Topology    `json:"topology"`
}

type SubClusterConfig struct {
@@ -41,19 +48,20 @@ type SubClusterConfig struct {
	Normal  float64 `json:"normal"`
	Caution float64 `json:"caution"`
	Alert   float64 `json:"alert"`
+	Remove  bool    `json:"remove"`
}

type MetricConfig struct {
	Name        string              `json:"name"`
-	Unit        string              `json:"unit"`
+	Unit        Unit                `json:"unit"`
	Scope       MetricScope         `json:"scope"`
-	Aggregation *string             `json:"aggregation"`
+	Aggregation string              `json:"aggregation"`
	Timestep    int                 `json:"timestep"`
-	Peak        *float64            `json:"peak"`
-	Normal      *float64            `json:"normal"`
-	Caution     *float64            `json:"caution"`
-	Alert       *float64            `json:"alert"`
-	SubClusters []*SubClusterConfig `json:"subClusters"`
+	Peak        float64             `json:"peak"`
+	Normal      float64             `json:"normal"`
+	Caution     float64             `json:"caution"`
+	Alert       float64             `json:"alert"`
+	SubClusters []*SubClusterConfig `json:"subClusters,omitempty"`
}

type Cluster struct {
@@ -152,6 +160,15 @@ func (topo *Topology) GetMemoryDomainsFromHWThreads(
	return memDoms, exclusive
}

+// Temporary fix to convert back from int id to string id for accelerators
+func (topo *Topology) GetAcceleratorID(id int) (string, error) {
+	if id < len(topo.Accelerators) {
+		return topo.Accelerators[id].ID, nil
+	} else {
+		return "", fmt.Errorf("Index %d out of range", id)
+	}
+}
+
func (topo *Topology) GetAcceleratorIDs() ([]int, error) {
	accels := make([]int, 0)
	for _, accel := range topo.Accelerators {
@@ -163,12 +180,3 @@ func (topo *Topology) GetAcceleratorIDs() ([]int, error) {
	}
	return accels, nil
}
-
-func (topo *Topology) GetAcceleratorIndex(id string) (int, bool) {
-	for idx, accel := range topo.Accelerators {
-		if accel.ID == id {
-			return idx, true
-		}
-	}
-	return -1, false
-}
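A short sketch of the temporary accelerator mapping added above: metric queries still address accelerators by index while the topology stores string ids (the PCI-style id is an invented example):

```go
package main

import (
	"fmt"

	"github.com/ClusterCockpit/cc-backend/pkg/schema"
)

func main() {
	topo := schema.Topology{
		// Invented accelerator id; real ids come from the cluster config.
		Accelerators: []*schema.Accelerator{{ID: "00000000:0B:00.0"}},
	}

	id, err := topo.GetAcceleratorID(0)
	fmt.Println(id, err) // 00000000:0B:00.0 <nil>

	_, err = topo.GetAcceleratorID(5)
	fmt.Println(err) // Index 5 out of range
}
```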
@@ -83,10 +83,10 @@ func (s *Series) MarshalJSON() ([]byte, error) {
	buf = append(buf, s.Hostname...)
	buf = append(buf, '"')
	if s.Id != nil {
-		buf = append(buf, `,"id":`...)
-		buf = strconv.AppendInt(buf, int64(*s.Id), 10)
+		buf = append(buf, `,"id":"`...)
+		buf = append(buf, *s.Id...)
+		buf = append(buf, '"')
	}
-	if s.Statistics != nil {
	buf = append(buf, `,"statistics":{"min":`...)
	buf = strconv.AppendFloat(buf, s.Statistics.Min, 'f', 2, 64)
	buf = append(buf, `,"avg":`...)
@@ -94,7 +94,6 @@ func (s *Series) MarshalJSON() ([]byte, error) {
	buf = strconv.AppendFloat(buf, s.Statistics.Avg, 'f', 2, 64)
	buf = append(buf, `,"max":`...)
	buf = strconv.AppendFloat(buf, s.Statistics.Max, 'f', 2, 64)
	buf = append(buf, '}')
-	}
	buf = append(buf, `,"data":[`...)
	for i := 0; i < len(s.Data); i++ {
		if i != 0 {
@@ -110,3 +109,23 @@ func (s *Series) MarshalJSON() ([]byte, error) {
	buf = append(buf, ']', '}')
	return buf, nil
}
+
+func ConvertFloatToFloat64(s []Float) []float64 {
+	fp := make([]float64, len(s))
+
+	for i, val := range s {
+		fp[i] = float64(val)
+	}
+
+	return fp
+}
+
+func GetFloat64ToFloat(s []float64) []Float {
+	fp := make([]Float, len(s))
+
+	for i, val := range s {
+		fp[i] = Float(val)
+	}
+
+	return fp
+}
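With `Id` now a `*string`, `MarshalJSON` quotes the id on the wire. A hedged sketch of the resulting format; the exact float rendering follows the 2-decimal `AppendFloat` calls above, and the sample values are invented:

```go
package main

import (
	"fmt"

	"github.com/ClusterCockpit/cc-backend/pkg/schema"
)

func main() {
	id := "0" // ids are plain strings now
	s := schema.Series{
		Hostname:   "node001",
		Id:         &id,
		Statistics: schema.MetricStatistics{Min: 1, Avg: 2, Max: 3},
		Data:       []schema.Float{1, 2, 3},
	}

	b, _ := s.MarshalJSON()
	fmt.Println(string(b))
	// Something like:
	// {"hostname":"node001","id":"0","statistics":{"min":1.00,"avg":2.00,"max":3.00},"data":[...]}
}
```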
@@ -21,18 +21,18 @@ type BaseJob struct {
	Project    string `json:"project" db:"project" example:"abcd200"` // The unique identifier of a project
	Cluster    string `json:"cluster" db:"cluster" example:"fritz"` // The unique identifier of a cluster
	SubCluster string `json:"subCluster" db:"subcluster" example:"main"` // The unique identifier of a sub cluster
-	Partition  string `json:"partition" db:"partition" example:"main"` // The Slurm partition to which the job was submitted
-	ArrayJobId int64 `json:"arrayJobId" db:"array_job_id" example:"123000"` // The unique identifier of an array job
+	Partition  *string `json:"partition,omitempty" db:"partition" example:"main"` // The Slurm partition to which the job was submitted
+	ArrayJobId *int64 `json:"arrayJobId,omitempty" db:"array_job_id" example:"123000"` // The unique identifier of an array job
	NumNodes int32 `json:"numNodes" db:"num_nodes" example:"2" minimum:"1"` // Number of nodes used (Min > 0)
-	NumHWThreads int32 `json:"numHwthreads" db:"num_hwthreads" example:"20" minimum:"1"` // Number of HWThreads used (Min > 0)
-	NumAcc int32 `json:"numAcc" db:"num_acc" example:"2" minimum:"1"` // Number of accelerators used (Min > 0)
+	NumHWThreads *int32 `json:"numHwthreads,omitempty" db:"num_hwthreads" example:"20" minimum:"1"` // Number of HWThreads used (Min > 0)
+	NumAcc *int32 `json:"numAcc,omitempty" db:"num_acc" example:"2" minimum:"1"` // Number of accelerators used (Min > 0)
	Exclusive int32 `json:"exclusive" db:"exclusive" example:"1" minimum:"0" maximum:"2"` // Specifies how nodes are shared: 0 - Shared among multiple jobs of multiple users, 1 - Job exclusive (Default), 2 - Shared among multiple jobs of same user
-	MonitoringStatus int32 `json:"monitoringStatus" db:"monitoring_status" example:"1" minimum:"0" maximum:"3"` // State of monitoring system during job run: 0 - Disabled, 1 - Running or Archiving (Default), 2 - Archiving Failed, 3 - Archiving Successfull
-	SMT int32 `json:"smt" db:"smt" example:"4"` // SMT threads used by job
-	State JobState `json:"jobState" db:"job_state" example:"completed"` // Final state of job
+	MonitoringStatus int32 `json:"monitoringStatus,omitempty" db:"monitoring_status" example:"1" minimum:"0" maximum:"3"` // State of monitoring system during job run: 0 - Disabled, 1 - Running or Archiving (Default), 2 - Archiving Failed, 3 - Archiving Successfull
+	SMT *int32 `json:"smt,omitempty" db:"smt" example:"4"` // SMT threads used by job
+	State JobState `json:"jobState" db:"job_state" example:"completed" enums:"completed,failed,cancelled,stopped,timeout,out_of_memory"` // Final state of job
	Duration int32 `json:"duration" db:"duration" example:"43200" minimum:"1"` // Duration of job in seconds (Min > 0)
-	Walltime int64 `json:"walltime" db:"walltime" example:"86400" minimum:"1"` // Requested walltime of job in seconds (Min > 0)
-	Tags []*Tag `json:"tags"` // List of tags
+	Walltime *int64 `json:"walltime,omitempty" db:"walltime" example:"86400" minimum:"1"` // Requested walltime of job in seconds (Min > 0)
+	Tags []*Tag `json:"tags,omitempty"` // List of tags
	RawResources []byte `json:"-" db:"resources"` // Resources used by job [As Bytes]
	Resources []*Resource `json:"resources"` // Resources used by job
	RawMetaData []byte `json:"-" db:"meta_data"` // Additional information about the job [As Bytes]
@@ -89,11 +89,15 @@ var JobDefaults BaseJob = BaseJob{
	MonitoringStatus: MonitoringStatusRunningOrArchiving,
}

+type Unit struct {
+	Base   string  `json:"base"`
+	Prefix *string `json:"prefix,omitempty"`
+}
+
// JobStatistics model
// @Description Specification for job metric statistics.
type JobStatistics struct {
	// Metric unit (see schema/unit.schema.json)
-	Unit string `json:"unit" example:"GHz"`
+	Unit Unit `json:"unit" example:"GHz"`
	Avg float64 `json:"avg" example:"2500" minimum:"0"` // Job metric average
	Min float64 `json:"min" example:"2000" minimum:"0"` // Job metric minimum
	Max float64 `json:"max" example:"3000" minimum:"0"` // Job metric maximum
@@ -102,6 +106,7 @@ type JobStatistics struct {
// Tag model
// @Description Defines a tag using name and type.
type Tag struct {
	// The unique DB identifier of a tag
+	// The unique DB identifier of a tag
	ID   int64  `json:"id" db:"id"`
	Type string `json:"type" db:"tag_type" example:"Debug"` // Tag Type
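Since optional job attributes are now pointers, an absent value is distinguishable from zero, but every consumer must guard dereferences (note that `SanityChecks` earlier in this diff dereferences `NumAcc` unconditionally). A hypothetical helper sketch:

```go
package demo

import "github.com/ClusterCockpit/cc-backend/pkg/schema"

// NumAccOrZero is a hypothetical guard: pointer fields distinguish
// "absent" from 0, so callers must nil-check before dereferencing.
func NumAccOrZero(job *schema.BaseJob) int32 {
	if job.NumAcc == nil {
		return 0
	}
	return *job.NumAcc
}
```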
@@ -15,17 +15,16 @@ import (
type JobData map[string]map[MetricScope]*JobMetric

type JobMetric struct {
-	Unit             string       `json:"unit"`
-	Scope            MetricScope  `json:"scope"`
+	Unit             Unit         `json:"unit"`
	Timestep         int          `json:"timestep"`
	Series           []Series     `json:"series"`
-	StatisticsSeries *StatsSeries `json:"statisticsSeries"`
+	StatisticsSeries *StatsSeries `json:"statisticsSeries,omitempty"`
}

type Series struct {
	Hostname   string            `json:"hostname"`
-	Id         *int              `json:"id,omitempty"`
-	Statistics *MetricStatistics `json:"statistics"`
+	Id         *string           `json:"id,omitempty"`
+	Statistics MetricStatistics  `json:"statistics"`
	Data       []Float           `json:"data"`
}
@@ -218,17 +217,12 @@ func (jd *JobData) AddNodeScope(metric string) bool {

	nodeJm := &JobMetric{
		Unit:     jm.Unit,
-		Scope:    MetricScopeNode,
		Timestep: jm.Timestep,
		Series:   make([]Series, 0, len(hosts)),
	}
	for hostname, series := range hosts {
		min, sum, max := math.MaxFloat32, 0.0, -math.MaxFloat32
		for _, series := range series {
-			if series.Statistics == nil {
-				min, sum, max = math.NaN(), math.NaN(), math.NaN()
-				break
-			}
			sum += series.Statistics.Avg
			min = math.Min(min, series.Statistics.Min)
			max = math.Max(max, series.Statistics.Max)
@@ -259,7 +253,7 @@ func (jd *JobData) AddNodeScope(metric string) bool {

		nodeJm.Series = append(nodeJm.Series, Series{
			Hostname:   hostname,
-			Statistics: &MetricStatistics{Min: min, Avg: sum / float64(len(series)), Max: max},
+			Statistics: MetricStatistics{Min: min, Avg: sum / float64(len(series)), Max: max},
			Data:       data,
		})
	}
@@ -21,7 +21,7 @@
    },
    "unit": {
      "description": "Metric unit",
-      "type": "string"
+      "$ref": "embedfs://unit.schema.json"
    },
    "scope": {
      "description": "Native measurement resolution",
@@ -38,7 +38,22 @@
        "sum",
        "avg"
      ]
    },
+    "peak": {
+      "description": "Metric peak threshold (Upper metric limit)",
+      "type": "number"
+    },
+    "normal": {
+      "description": "Metric normal threshold",
+      "type": "number"
+    },
+    "caution": {
+      "description": "Metric caution threshold (Suspicious but does not require immediate action)",
+      "type": "number"
+    },
+    "alert": {
+      "description": "Metric alert threshold (Requires immediate action)",
+      "type": "number"
+    },
    "subClusters": {
      "description": "Array of cluster hardware partition metric thresholds",
@@ -61,13 +76,13 @@
        },
        "alert": {
          "type": "number"
        },
+        "remove": {
+          "type": "boolean"
+        }
      },
      "required": [
-        "name",
-        "peak",
-        "caution",
-        "alert"
+        "name"
      ]
    }
  }
@@ -76,7 +91,12 @@
    "name",
    "unit",
    "scope",
-    "timestep"
+    "timestep",
+    "aggregation",
+    "peak",
+    "normal",
+    "caution",
+    "alert"
  ]
},
"minItems": 1
@@ -109,15 +129,42 @@
    },
    "flopRateScalar": {
      "description": "Theoretical node peak flop rate for scalar code in GFlops/s",
-      "type": "integer"
+      "type": "object",
+      "properties": {
+        "unit": {
+          "description": "Metric unit",
+          "$ref": "embedfs://unit.schema.json"
+        },
+        "value": {
+          "type": "number"
+        }
+      }
    },
    "flopRateSimd": {
      "description": "Theoretical node peak flop rate for SIMD code in GFlops/s",
-      "type": "integer"
+      "type": "object",
+      "properties": {
+        "unit": {
+          "description": "Metric unit",
+          "$ref": "embedfs://unit.schema.json"
+        },
+        "value": {
+          "type": "number"
+        }
+      }
    },
    "memoryBandwidth": {
      "description": "Theoretical node peak memory bandwidth in GB/s",
-      "type": "integer"
+      "type": "object",
+      "properties": {
+        "unit": {
+          "description": "Metric unit",
+          "$ref": "embedfs://unit.schema.json"
+        },
+        "value": {
+          "type": "number"
+        }
+      }
    },
    "nodes": {
      "description": "Node list expression",
@@ -215,6 +262,7 @@
  },
  "required": [
    "name",
+    "nodes",
    "topology",
    "processorType",
    "socketsPerNode",
@@ -86,8 +86,8 @@
    },
    "minProperties": 1
  },
-  "cpu_used": {
-    "description": "CPU active core utilization",
+  "cpu_user": {
+    "description": "CPU user active core utilization",
    "properties": {
      "node": {
        "$ref": "embedfs://job-metric-data.schema.json"
@@ -479,7 +479,7 @@
    ]
  },
  "required": [
-    "cpu_used",
+    "cpu_user",
    "mem_used",
    "flops_any",
    "mem_bw",
@@ -84,11 +84,6 @@
    "type": "integer",
    "exclusiveMinimum": 0
  },
-  "stopTime": {
-    "description": "Stop epoch time stamp in seconds",
-    "type": "integer",
-    "exclusiveMinimum": 0
-  },
  "duration": {
    "description": "Duration of job in seconds",
    "type": "integer",
@@ -198,8 +193,8 @@
    "description": "Instructions executed per cycle",
    "$ref": "embedfs://job-metric-statistics.schema.json"
  },
-  "cpu_used": {
-    "description": "CPU active core utilization",
+  "cpu_user": {
+    "description": "CPU user active core utilization",
    "$ref": "embedfs://job-metric-statistics.schema.json"
  },
  "flops_dp": {
@@ -331,7 +326,7 @@
    }
  },
  "required": [
-    "cpu_used",
+    "cpu_user",
    "mem_used",
    "flops_any",
    "mem_bw"
@@ -343,13 +338,13 @@
    "user",
    "project",
    "cluster",
+    "subCluster",
    "numNodes",
    "exclusive",
    "startTime",
    "jobState",
    "duration",
    "resources",
-    "tags",
    "statistics"
  ]
}
@@ -193,7 +193,7 @@
  },
  "data": {
    "type": "array",
-    "items": {
+    "contains": {
      "type": "number",
      "minimum": 0
    },
@@ -5,7 +5,7 @@
  "description": "Format specification for job metric units",
  "type": "object",
  "properties": {
-    "base_unit": {
+    "base": {
      "description": "Metric base unit",
      "type": "string",
      "enum": [
@@ -15,7 +15,6 @@
        "F/s",
        "CPI",
        "IPC",
-        "load",
        "Hz",
        "W",
        "°C",
@@ -36,6 +35,6 @@
    }
  },
  "required": [
-    "base_unit"
+    "base"
  ]
}
@ -45,9 +45,29 @@ func TestValidateCluster(t *testing.T) {
"socketsPerNode": 2,
"coresPerSocket": 10,
"threadsPerCore": 2,
"flopRateScalar": 44,
"flopRateSimd": 704,
"memoryBandwidth": 80,
"flopRateScalar": {
"unit": {
"prefix": "G",
"base": "F/s"
},
"value": 14
},
"flopRateSimd": {
"unit": {
"prefix": "G",
"base": "F/s"
},
"value": 112
},
"memoryBandwidth": {
"unit": {
"prefix": "G",
"base": "B/s"
},
"value": 24
},
"numberOfNodes": 70,
"nodes": "w11[27-45,49-63,69-72]",
"topology": {
"node": [0,20,1,21,2,22,3,23,4,24,5,25,6,26,7,27,8,28,9,29,10,30,11,31,12,32,13,33,14,34,15,35,16,36,17,37,18,38,19,39],
"socket": [
@ -68,8 +88,13 @@ func TestValidateCluster(t *testing.T) {
{
"name": "cpu_load",
"scope": "hwthread",
"unit": "load",
"timestep": 60
"unit": {"base": ""},
"aggregation": "avg",
"timestep": 60,
"peak": 4,
"normal": 2,
"caution": 1,
"alert": 0.25
}
]
}`)
@ -1,6 +1,7 @@
# cc-units - A unit system for ClusterCockpit

When working with metrics, the problem comes up that they may use different unit name but have the same unit in fact. There are a lot of real world examples like 'kB' and 'Kbyte'. In [cc-metric-collector](https://github.com/ClusterCockpit/cc-metric-collector), the collectors read data from different sources which may use different units or the programmer specifies a unit for a metric by hand. The cc-units system is not comparable with the SI unit system. If you are looking for a package for the SI units, see [here](https://pkg.go.dev/github.com/gurre/si).
When working with metrics, the problem comes up that they may use different unit names but are in fact the same unit.
There are a lot of real world examples like 'kB' and 'Kbyte'. In [cc-metric-collector](https://github.com/ClusterCockpit/cc-metric-collector), the collectors read data from different sources which may use different units or the programmer specifies a unit for a metric by hand. The cc-units system is not comparable with the SI unit system. If you are looking for a package for the SI units, see [here](https://pkg.go.dev/github.com/gurre/si).

In order to enable unit comparison and conversion, the ccUnits package provides some helpers:
```go
@ -39,7 +39,7 @@ var MeasuresMap map[Measure]MeasureData = map[Measure]MeasureData{
	},
	Flops: {
		Long:  "Flops",
		Short: "Flops",
		Short: "F",
		Regex: "^([fF][lL]?[oO]?[pP]?[sS]?)",
	},
	Percentage: {
@ -1,6 +1,7 @@
package units

import (
	"math"
	"regexp"
)

@ -172,3 +173,20 @@ func NewPrefix(prefix string) Prefix {
	}
	return InvalidPrefix
}

func getExponent(p float64) int {
	count := 0

	for p > 1.0 {
		p = p / 1000.0
		count++
	}

	return count * 3
}

func NewPrefixFromFactor(op Prefix, e int) Prefix {
	f := float64(op)
	exp := math.Pow10(getExponent(f) - e)
	return Prefix(exp)
}
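A quick way to read these two helpers together: `getExponent` recovers the decimal exponent of a prefix factor in steps of three, and `NewPrefixFromFactor` rescales a prefix by a normalization exponent. A test-style sketch in the same package, not part of the PR (it assumes, as the code above implies, that `Prefix` values are the plain numeric factors, e.g. Giga = 1e9):

```go
package units

import "testing"

// Sketch only: exercises the two helpers above.
func TestPrefixFactorSketch(t *testing.T) {
	// 1e9 -> 1e6 -> 1e3 -> 1 takes three thousand-steps, so the exponent is 3*3 = 9.
	if got := getExponent(1e9); got != 9 {
		t.Errorf("getExponent(1e9): got %d, want 9", got)
	}
	// Rescaling a Giga factor by a normalization exponent of 6 leaves 10^(9-6) = 1e3 (Kilo).
	if p := NewPrefixFromFactor(Prefix(1e9), 6); float64(p) != 1e3 {
		t.Errorf("NewPrefixFromFactor: got %e, want 1e3", float64(p))
	}
}
```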
@ -3,7 +3,10 @@ package units

import (
	"fmt"
	"math"
	"strings"

	"github.com/ClusterCockpit/cc-backend/pkg/schema"
)

type unit struct {
@ -25,7 +28,9 @@ type Unit interface {

var INVALID_UNIT = NewUnit("foobar")

// Valid checks whether a unit is a valid unit. A unit is valid if it has at least a prefix and a measure. The unit denominator is optional.
// Valid checks whether a unit is a valid unit.
// A unit is valid if it has at least a prefix and a measure.
// The unit denominator is optional.
func (u *unit) Valid() bool {
	return u.prefix != InvalidPrefix && u.measure != InvalidMeasure
}
@ -71,6 +76,90 @@ func (u *unit) getUnitDenominator() Measure {
	return u.divMeasure
}

func ConvertValue(v *float64, from string, to string) {
	uf := NewUnit(from)
	ut := NewUnit(to)
	factor := float64(uf.getPrefix()) / float64(ut.getPrefix())
	*v = math.Ceil(*v * factor)
}

func ConvertSeries(s []float64, from string, to string) {
	uf := NewUnit(from)
	ut := NewUnit(to)
	factor := float64(uf.getPrefix()) / float64(ut.getPrefix())

	for i := 0; i < len(s); i++ {
		s[i] = math.Ceil(s[i] * factor)
	}
}

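Both converters reduce to a pure prefix ratio followed by `math.Ceil`, so conversions deliberately round up to whole target units. A minimal usage sketch (the import path is an assumption; only the two functions above are used):

```go
package main

import (
	"fmt"

	"github.com/ClusterCockpit/cc-backend/pkg/units" // assumed location of the units package
)

func main() {
	v := 103456.0
	units.ConvertValue(&v, "MB/s", "GB/s") // factor 1e6/1e9 = 1e-3; Ceil(103.456) = 104
	fmt.Println(v)                         // 104

	s := []float64{2890031237, 23998994567}
	units.ConvertSeries(s, "F/s", "GF/s") // each point rounds up: [3 24]
	fmt.Println(s)
}
```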
func getNormalizationFactor(v float64) (float64, int) {
	count := 0
	scale := -3

	if v > 1000.0 {
		for v > 1000.0 {
			v *= 1e-3
			count++
		}
	} else {
		for v < 1.0 {
			v *= 1e3
			count++
		}
		scale = 3
	}
	return math.Pow10(count * scale), count * scale
}

func NormalizeValue(v *float64, us string, nu *string) {
	u := NewUnit(us)
	f, e := getNormalizationFactor((*v))
	*v = math.Ceil(*v * f)
	u.setPrefix(NewPrefixFromFactor(u.getPrefix(), e))
	*nu = u.Short()
}

func NormalizeSeries(s []float64, avg float64, us string, nu *string) {
	u := NewUnit(us)
	f, e := getNormalizationFactor(avg)

	for i := 0; i < len(s); i++ {
		s[i] *= f
		s[i] = math.Ceil(s[i])
	}
	u.setPrefix(NewPrefixFromFactor(u.getPrefix(), e))
	fmt.Printf("Prefix: %e \n", u.getPrefix())
	*nu = u.Short()
}

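Worked through once: `getNormalizationFactor` brings a value into [1, 1000) in thousand-steps and reports the matching power of ten, which `NormalizeValue` then folds back into the prefix. A same-package test sketch, not part of the PR:

```go
package units

import "testing"

// Sketch only: 103456 needs one step down, so the factor is 1e-3 with exponent -3;
// NormalizeValue then turns 103456 MB/s into Ceil(103.456) = 104 GB/s.
func TestNormalizationSketch(t *testing.T) {
	f, e := getNormalizationFactor(103456)
	if f != 1e-3 || e != -3 {
		t.Errorf("got (%e, %d), want (1e-03, -3)", f, e)
	}
}
```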
func ConvertUnitString(us string) schema.Unit {
	var nu schema.Unit

	if us == "CPI" ||
		us == "IPC" ||
		us == "load" ||
		us == "" {
		nu.Base = us
		return nu
	}
	u := NewUnit(us)
	p := u.getPrefix()
	if p.Prefix() != "" {
		prefix := p.Prefix()
		nu.Prefix = &prefix
	}
	m := u.getMeasure()
	d := u.getUnitDenominator()
	if d.Short() != "inval" {
		nu.Base = fmt.Sprintf("%s/%s", m.Short(), d.Short())
	} else {
		nu.Base = m.Short()
	}

	return nu
}
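The mapping matches the migrated job data later in this PR: dimensionless pseudo-units (CPI, IPC, load, empty) stay bare bases, everything else is split into prefix and base. An Example-style sketch in the same package, not part of the PR:

```go
package units

import "fmt"

// Sketch only: expected behaviour of ConvertUnitString.
func ExampleConvertUnitString() {
	u := ConvertUnitString("GB/s")
	fmt.Println(u.Base, *u.Prefix)

	v := ConvertUnitString("IPC") // special-cased, kept as a bare base without prefix
	fmt.Println(v.Base)
	// Output:
	// B/s G
	// IPC
}
```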

// GetPrefixPrefixFactor creates the default conversion function between two prefixes.
// It returns a conversion function for the value.
func GetPrefixPrefixFactor(in Prefix, out Prefix) func(value interface{}) interface{} {
@ -2,6 +2,7 @@ package units

import (
	"fmt"
	"reflect"
	"regexp"
	"testing"
)
@ -199,3 +200,108 @@ func TestPrefixRegex(t *testing.T) {
		t.Logf("successfully compiled regex '%s' for prefix %s", data.Regex, data.Long)
	}
}

func TestConvertValue(t *testing.T) {
	v := float64(103456)
	ConvertValue(&v, "MB/s", "GB/s")

	if v != 104.00 {
		t.Errorf("Failed ConvertValue: Want 104.00, Got %f", v)
	}
}

func TestConvertValueUp(t *testing.T) {
	v := float64(10.3456)
	ConvertValue(&v, "GB/s", "MB/s")

	if v != 10346.00 {
		t.Errorf("Failed ConvertValue: Want 10346.00, Got %f", v)
	}
}
func TestConvertSeries(t *testing.T) {
	s := []float64{2890031237, 23998994567, 389734042344, 390349424345}
	r := []float64{3, 24, 390, 391}
	ConvertSeries(s, "F/s", "GF/s")

	if !reflect.DeepEqual(s, r) {
		t.Errorf("Failed ConvertValue: Want 3, 24, 390, 391, Got %v", s)
	}
}

func TestNormalizeValue(t *testing.T) {
	var s string
	v := float64(103456)

	NormalizeValue(&v, "MB/s", &s)

	if v != 104.00 {
		t.Errorf("Failed ConvertValue: Want 104.00, Got %f", v)
	}
	if s != "GB/s" {
		t.Errorf("Failed Prefix or unit: Want GB/s, Got %s", s)
	}
}

func TestNormalizeValueNoPrefix(t *testing.T) {
	var s string
	v := float64(103458596)

	NormalizeValue(&v, "F/s", &s)

	if v != 104.00 {
		t.Errorf("Failed ConvertValue: Want 104.00, Got %f", v)
	}
	if s != "MF/s" {
		t.Errorf("Failed Prefix or unit: Want MF/s, Got %s", s)
	}
}

func TestNormalizeValueKeep(t *testing.T) {
	var s string
	v := float64(345)

	NormalizeValue(&v, "MB/s", &s)

	if v != 345.00 {
		t.Errorf("Failed ConvertValue: Want 345.00, Got %f", v)
	}
	if s != "MB/s" {
		t.Errorf("Failed Prefix or unit: Want MB/s, Got %s", s)
	}
}

func TestNormalizeValueDown(t *testing.T) {
	var s string
	v := float64(0.0004578)

	NormalizeValue(&v, "GB/s", &s)

	if v != 458.00 {
		t.Errorf("Failed ConvertValue: Want 458.00, Got %f", v)
	}
	if s != "KB/s" {
		t.Errorf("Failed Prefix or unit: Want KB/s, Got %s", s)
	}
}

func TestNormalizeSeries(t *testing.T) {
	var us string
	s := []float64{2890031237, 23998994567, 389734042344, 390349424345}
	r := []float64{3, 24, 390, 391}

	total := 0.0
	for _, number := range s {
		total += number
	}
	avg := total / float64(len(s))

	fmt.Printf("AVG: %e\n", avg)
	NormalizeSeries(s, avg, "KB/s", &us)

	if !reflect.DeepEqual(s, r) {
		t.Errorf("Failed ConvertValue: Want 3, 24, 390, 391, Got %v", s)
	}
	if us != "TB/s" {
		t.Errorf("Failed Prefix or unit: Want TB/s, Got %s", us)
	}
}
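Checking the series test by hand: the four points sum to 806,972,492,493, so the average is about 2.02e11 KB/s; getNormalizationFactor takes three thousand-steps down (factor 1e-9, exponent -9), the Kilo prefix is rescaled to 10^(3+9) = 10^12 (Tera), and each point times 1e-9, rounded up, gives exactly the expected {3, 24, 390, 391} TB/s.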
File diff suppressed because one or more lines are too long
BIN test/archive/emmy/1403/244/1608923076/data.json.gz Normal file
Binary file not shown.
@ -1 +1,194 @@
|
||||
{"exclusive":1,"jobId":1403244,"statistics":{"mem_bw":{"avg":63.57,"min":0,"unit":"GB/s","max":74.5},"rapl_power":{"avg":228.07,"min":0,"unit":"W","max":258.56},"ipc":{"unit":"IPC","max":0.510204081632653,"avg":1.53846153846154,"min":0.0},"clock":{"min":1380.32,"avg":2599.39,"unit":"MHz","max":2634.46},"cpu_load":{"avg":18.4,"min":0,"max":23.58,"unit":"load"},"flops_any":{"max":404.62,"unit":"GF/s","avg":225.59,"min":0},"flops_dp":{"max":0.24,"unit":"GF/s","min":0,"avg":0},"mem_used":{"min":1.55,"avg":27.84,"unit":"GB","max":37.5},"flops_sp":{"min":0,"avg":225.59,"max":404.62,"unit":"GF/s"}},"resources":[{"hostname":"e0102"},{"hostname":"e0103"},{"hostname":"e0105"},{"hostname":"e0106"},{"hostname":"e0107"},{"hostname":"e0108"},{"hostname":"e0114"},{"hostname":"e0320"},{"hostname":"e0321"},{"hostname":"e0325"},{"hostname":"e0404"},{"hostname":"e0415"},{"hostname":"e0433"},{"hostname":"e0437"},{"hostname":"e0439"},{"hostname":"e0501"},{"hostname":"e0503"},{"hostname":"e0505"},{"hostname":"e0506"},{"hostname":"e0512"},{"hostname":"e0513"},{"hostname":"e0514"},{"hostname":"e0653"},{"hostname":"e0701"},{"hostname":"e0716"},{"hostname":"e0727"},{"hostname":"e0728"},{"hostname":"e0925"},{"hostname":"e0926"},{"hostname":"e0929"},{"hostname":"e0934"},{"hostname":"e0951"}],"walltime":10,"jobState":"completed","cluster":"emmy","stopTime":1609009562,"user":"emmyUser6","startTime":1608923076,"partition":"work","tags":[],"project":"no project","numNodes":32,"duration":86486}
|
||||
{
|
||||
"exclusive": 1,
|
||||
"jobId": 1403244,
|
||||
"statistics": {
|
||||
"mem_bw": {
|
||||
"avg": 63.57,
|
||||
"min": 0,
|
||||
"unit": {
|
||||
"base": "B/s",
|
||||
"prefix": "G"
|
||||
},
|
||||
"max": 74.5
|
||||
},
|
||||
"rapl_power": {
|
||||
"avg": 228.07,
|
||||
"min": 0,
|
||||
"unit": {
|
||||
"base": "W"
|
||||
},
|
||||
"max": 258.56
|
||||
},
|
||||
"ipc": {
|
||||
"unit": {
|
||||
"base": "IPC"
|
||||
},
|
||||
"max": 0.510204081632653,
|
||||
"avg": 1.53846153846154,
|
||||
"min": 0.0
|
||||
},
|
||||
"clock": {
|
||||
"min": 1380.32,
|
||||
"avg": 2599.39,
|
||||
"unit": {
|
||||
"base": "Hz",
|
||||
"prefix": "M"
|
||||
},
|
||||
"max": 2634.46
|
||||
},
|
||||
"cpu_load": {
|
||||
"avg": 18.4,
|
||||
"min": 0,
|
||||
"max": 23.58,
|
||||
"unit": {
|
||||
"base": "load"
|
||||
}
|
||||
},
|
||||
"flops_any": {
|
||||
"max": 404.62,
|
||||
"unit": {
|
||||
"base": "F/s",
|
||||
"prefix": "G"
|
||||
},
|
||||
"avg": 225.59,
|
||||
"min": 0
|
||||
},
|
||||
"flops_dp": {
|
||||
"max": 0.24,
|
||||
"unit": {
|
||||
"base": "F/s",
|
||||
"prefix": "G"
|
||||
},
|
||||
"min": 0,
|
||||
"avg": 0
|
||||
},
|
||||
"mem_used": {
|
||||
"min": 1.55,
|
||||
"avg": 27.84,
|
||||
"unit": {
|
||||
"base": "B",
|
||||
"prefix": "G"
|
||||
},
|
||||
"max": 37.5
|
||||
},
|
||||
"flops_sp": {
|
||||
"min": 0,
|
||||
"avg": 225.59,
|
||||
"max": 404.62,
|
||||
"unit": {
|
||||
"base": "F/s",
|
||||
"prefix": "G"
|
||||
}
|
||||
}
|
||||
},
|
||||
"resources": [
|
||||
{
|
||||
"hostname": "e0102"
|
||||
},
|
||||
{
|
||||
"hostname": "e0103"
|
||||
},
|
||||
{
|
||||
"hostname": "e0105"
|
||||
},
|
||||
{
|
||||
"hostname": "e0106"
|
||||
},
|
||||
{
|
||||
"hostname": "e0107"
|
||||
},
|
||||
{
|
||||
"hostname": "e0108"
|
||||
},
|
||||
{
|
||||
"hostname": "e0114"
|
||||
},
|
||||
{
|
||||
"hostname": "e0320"
|
||||
},
|
||||
{
|
||||
"hostname": "e0321"
|
||||
},
|
||||
{
|
||||
"hostname": "e0325"
|
||||
},
|
||||
{
|
||||
"hostname": "e0404"
|
||||
},
|
||||
{
|
||||
"hostname": "e0415"
|
||||
},
|
||||
{
|
||||
"hostname": "e0433"
|
||||
},
|
||||
{
|
||||
"hostname": "e0437"
|
||||
},
|
||||
{
|
||||
"hostname": "e0439"
|
||||
},
|
||||
{
|
||||
"hostname": "e0501"
|
||||
},
|
||||
{
|
||||
"hostname": "e0503"
|
||||
},
|
||||
{
|
||||
"hostname": "e0505"
|
||||
},
|
||||
{
|
||||
"hostname": "e0506"
|
||||
},
|
||||
{
|
||||
"hostname": "e0512"
|
||||
},
|
||||
{
|
||||
"hostname": "e0513"
|
||||
},
|
||||
{
|
||||
"hostname": "e0514"
|
||||
},
|
||||
{
|
||||
"hostname": "e0653"
|
||||
},
|
||||
{
|
||||
"hostname": "e0701"
|
||||
},
|
||||
{
|
||||
"hostname": "e0716"
|
||||
},
|
||||
{
|
||||
"hostname": "e0727"
|
||||
},
|
||||
{
|
||||
"hostname": "e0728"
|
||||
},
|
||||
{
|
||||
"hostname": "e0925"
|
||||
},
|
||||
{
|
||||
"hostname": "e0926"
|
||||
},
|
||||
{
|
||||
"hostname": "e0929"
|
||||
},
|
||||
{
|
||||
"hostname": "e0934"
|
||||
},
|
||||
{
|
||||
"hostname": "e0951"
|
||||
}
|
||||
],
|
||||
"walltime": 10,
|
||||
"jobState": "completed",
|
||||
"cluster": "emmy",
|
||||
"subCluster": "haswell",
|
||||
"stopTime": 1609009562,
|
||||
"user": "emmyUser6",
|
||||
"startTime": 1608923076,
|
||||
"partition": "work",
|
||||
"tags": [],
|
||||
"project": "no project",
|
||||
"numNodes": 32,
|
||||
"duration": 86486
|
||||
}
|
||||
|
File diff suppressed because one or more lines are too long
BIN test/archive/emmy/1404/397/1609300556/data.json.gz Normal file
Binary file not shown.
@ -1 +1,194 @@
|
||||
{"stopTime":1609387081,"resources":[{"hostname":"e0151"},{"hostname":"e0152"},{"hostname":"e0153"},{"hostname":"e0232"},{"hostname":"e0303"},{"hostname":"e0314"},{"hostname":"e0344"},{"hostname":"e0345"},{"hostname":"e0348"},{"hostname":"e0507"},{"hostname":"e0518"},{"hostname":"e0520"},{"hostname":"e0522"},{"hostname":"e0526"},{"hostname":"e0527"},{"hostname":"e0528"},{"hostname":"e0530"},{"hostname":"e0551"},{"hostname":"e0604"},{"hostname":"e0613"},{"hostname":"e0634"},{"hostname":"e0639"},{"hostname":"e0640"},{"hostname":"e0651"},{"hostname":"e0653"},{"hostname":"e0701"},{"hostname":"e0704"},{"hostname":"e0751"},{"hostname":"e0809"},{"hostname":"e0814"},{"hostname":"e0819"},{"hostname":"e0908"}],"walltime":10,"cluster":"emmy","jobState":"completed","statistics":{"clock":{"max":2634.9,"unit":"MHz","min":0,"avg":2597.8},"cpu_load":{"max":27.41,"unit":"load","min":0,"avg":18.39},"mem_bw":{"min":0,"avg":63.23,"unit":"GB/s","max":75.06},"ipc":{"min":0.0,"avg":1.53846153846154,"unit":"IPC","max":0.490196078431373},"rapl_power":{"min":0,"avg":227.32,"unit":"W","max":256.22},"mem_used":{"min":1.5,"avg":27.77,"unit":"GB","max":37.43},"flops_sp":{"unit":"GF/s","max":413.21,"min":0,"avg":224.41},"flops_dp":{"max":5.72,"unit":"GF/s","min":0,"avg":0},"flops_any":{"min":0,"avg":224.42,"max":413.21,"unit":"GF/s"}},"exclusive":1,"jobId":1404397,"tags":[],"partition":"work","project":"no project","user":"emmyUser6","startTime":1609300556,"duration":86525,"numNodes":32}
|
||||
{
|
||||
"stopTime": 1609387081,
|
||||
"resources": [
|
||||
{
|
||||
"hostname": "e0151"
|
||||
},
|
||||
{
|
||||
"hostname": "e0152"
|
||||
},
|
||||
{
|
||||
"hostname": "e0153"
|
||||
},
|
||||
{
|
||||
"hostname": "e0232"
|
||||
},
|
||||
{
|
||||
"hostname": "e0303"
|
||||
},
|
||||
{
|
||||
"hostname": "e0314"
|
||||
},
|
||||
{
|
||||
"hostname": "e0344"
|
||||
},
|
||||
{
|
||||
"hostname": "e0345"
|
||||
},
|
||||
{
|
||||
"hostname": "e0348"
|
||||
},
|
||||
{
|
||||
"hostname": "e0507"
|
||||
},
|
||||
{
|
||||
"hostname": "e0518"
|
||||
},
|
||||
{
|
||||
"hostname": "e0520"
|
||||
},
|
||||
{
|
||||
"hostname": "e0522"
|
||||
},
|
||||
{
|
||||
"hostname": "e0526"
|
||||
},
|
||||
{
|
||||
"hostname": "e0527"
|
||||
},
|
||||
{
|
||||
"hostname": "e0528"
|
||||
},
|
||||
{
|
||||
"hostname": "e0530"
|
||||
},
|
||||
{
|
||||
"hostname": "e0551"
|
||||
},
|
||||
{
|
||||
"hostname": "e0604"
|
||||
},
|
||||
{
|
||||
"hostname": "e0613"
|
||||
},
|
||||
{
|
||||
"hostname": "e0634"
|
||||
},
|
||||
{
|
||||
"hostname": "e0639"
|
||||
},
|
||||
{
|
||||
"hostname": "e0640"
|
||||
},
|
||||
{
|
||||
"hostname": "e0651"
|
||||
},
|
||||
{
|
||||
"hostname": "e0653"
|
||||
},
|
||||
{
|
||||
"hostname": "e0701"
|
||||
},
|
||||
{
|
||||
"hostname": "e0704"
|
||||
},
|
||||
{
|
||||
"hostname": "e0751"
|
||||
},
|
||||
{
|
||||
"hostname": "e0809"
|
||||
},
|
||||
{
|
||||
"hostname": "e0814"
|
||||
},
|
||||
{
|
||||
"hostname": "e0819"
|
||||
},
|
||||
{
|
||||
"hostname": "e0908"
|
||||
}
|
||||
],
|
||||
"walltime": 10,
|
||||
"cluster": "emmy",
|
||||
"subCluster": "haswell",
|
||||
"jobState": "completed",
|
||||
"statistics": {
|
||||
"clock": {
|
||||
"max": 2634.9,
|
||||
"unit": {
|
||||
"base": "Hz",
|
||||
"prefix": "M"
|
||||
},
|
||||
"min": 0,
|
||||
"avg": 2597.8
|
||||
},
|
||||
"cpu_load": {
|
||||
"max": 27.41,
|
||||
"min": 0,
|
||||
"avg": 18.39,
|
||||
"unit": {
|
||||
"base": "load"
|
||||
}
|
||||
},
|
||||
"mem_bw": {
|
||||
"min": 0,
|
||||
"avg": 63.23,
|
||||
"unit": {
|
||||
"base": "B/s",
|
||||
"prefix": "G"
|
||||
},
|
||||
"max": 75.06
|
||||
},
|
||||
"ipc": {
|
||||
"min": 0.0,
|
||||
"avg": 1.53846153846154,
|
||||
"unit": {
|
||||
"base": "IPC"
|
||||
},
|
||||
"max": 0.490196078431373
|
||||
},
|
||||
"rapl_power": {
|
||||
"min": 0,
|
||||
"avg": 227.32,
|
||||
"unit": {
|
||||
"base": "W"
|
||||
},
|
||||
"max": 256.22
|
||||
},
|
||||
"mem_used": {
|
||||
"min": 1.5,
|
||||
"avg": 27.77,
|
||||
"unit": {
|
||||
"base": "B",
|
||||
"prefix": "G"
|
||||
},
|
||||
"max": 37.43
|
||||
},
|
||||
"flops_sp": {
|
||||
"unit": {
|
||||
"base": "F/s",
|
||||
"prefix": "G"
|
||||
},
|
||||
"max": 413.21,
|
||||
"min": 0,
|
||||
"avg": 224.41
|
||||
},
|
||||
"flops_dp": {
|
||||
"max": 5.72,
|
||||
"unit": {
|
||||
"base": "F/s",
|
||||
"prefix": "G"
|
||||
},
|
||||
"min": 0,
|
||||
"avg": 0
|
||||
},
|
||||
"flops_any": {
|
||||
"min": 0,
|
||||
"avg": 224.42,
|
||||
"max": 413.21,
|
||||
"unit": {
|
||||
"base": "F/s",
|
||||
"prefix": "G"
|
||||
}
|
||||
}
|
||||
},
|
||||
"exclusive": 1,
|
||||
"jobId": 1404397,
|
||||
"tags": [],
|
||||
"partition": "work",
|
||||
"project": "no project",
|
||||
"user": "emmyUser6",
|
||||
"startTime": 1609300556,
|
||||
"duration": 86525,
|
||||
"numNodes": 32
|
||||
}
|
||||
|
File diff suppressed because it is too large
1 test/archive/version.txt Normal file
@ -0,0 +1 @@
|
||||
1
|
@ -1,13 +1,14 @@
|
||||
{
|
||||
"cpu_used": {
|
||||
"core": {
|
||||
"unit": "cpu used",
|
||||
"scope": "core",
|
||||
"unit": {
|
||||
"base": ""
|
||||
},
|
||||
"timestep": 30,
|
||||
"series": [
|
||||
{
|
||||
"hostname": "taurusi6489",
|
||||
"id": 0,
|
||||
"id": "0",
|
||||
"statistics": {
|
||||
"min": 0.09090909090909093,
|
||||
"avg": 0.9173553719008265,
|
||||
@ -29,7 +30,7 @@
|
||||
},
|
||||
{
|
||||
"hostname": "taurusi6489",
|
||||
"id": 1,
|
||||
"id": "1",
|
||||
"statistics": {
|
||||
"min": 0.03694102397926118,
|
||||
"avg": 0.045968409230268584,
|
||||
@ -51,7 +52,7 @@
|
||||
},
|
||||
{
|
||||
"hostname": "taurusi6490",
|
||||
"id": 10,
|
||||
"id": "10",
|
||||
"statistics": {
|
||||
"min": 0.10505319148936171,
|
||||
"avg": 0.9186411992263056,
|
||||
@ -73,7 +74,7 @@
|
||||
},
|
||||
{
|
||||
"hostname": "taurusi6490",
|
||||
"id": 11,
|
||||
"id": "11",
|
||||
"statistics": {
|
||||
"min": 0.05286048845767815,
|
||||
"avg": 0.07053823838706144,
|
||||
@ -99,13 +100,14 @@
|
||||
},
|
||||
"ipc": {
|
||||
"core": {
|
||||
"unit": "IPC",
|
||||
"scope": "core",
|
||||
"unit": {
|
||||
"base": "IPC"
|
||||
},
|
||||
"timestep": 60,
|
||||
"series": [
|
||||
{
|
||||
"hostname": "taurusi6489",
|
||||
"id": 0,
|
||||
"id": "0",
|
||||
"statistics": {
|
||||
"min": 1.3808406263195592,
|
||||
"avg": 1.3960848578375105,
|
||||
@ -121,7 +123,7 @@
|
||||
},
|
||||
{
|
||||
"hostname": "taurusi6489",
|
||||
"id": 1,
|
||||
"id": "1",
|
||||
"statistics": {
|
||||
"min": 0.30469640475234366,
|
||||
"avg": 0.8816944294664065,
|
||||
@ -137,7 +139,7 @@
|
||||
},
|
||||
{
|
||||
"hostname": "taurusi6490",
|
||||
"id": 10,
|
||||
"id": "10",
|
||||
"statistics": {
|
||||
"min": 1.3791232173760588,
|
||||
"avg": 1.3850247295506815,
|
||||
@ -153,7 +155,7 @@
|
||||
},
|
||||
{
|
||||
"hostname": "taurusi6490",
|
||||
"id": 11,
|
||||
"id": "11",
|
||||
"statistics": {
|
||||
"min": 0.6424094604392216,
|
||||
"avg": 0.9544442638400293,
|
||||
@ -173,13 +175,14 @@
|
||||
},
|
||||
"flops_any": {
|
||||
"core": {
|
||||
"unit": "F/s",
|
||||
"scope": "core",
|
||||
"unit": {
|
||||
"base": "F/s"
|
||||
},
|
||||
"timestep": 60,
|
||||
"series": [
|
||||
{
|
||||
"hostname": "taurusi6489",
|
||||
"id": 0,
|
||||
"id": "0",
|
||||
"statistics": {
|
||||
"min": 0.0,
|
||||
"avg": 184.2699002412084,
|
||||
@ -195,7 +198,7 @@
|
||||
},
|
||||
{
|
||||
"hostname": "taurusi6489",
|
||||
"id": 1,
|
||||
"id": "1",
|
||||
"statistics": {
|
||||
"min": 0.13559227208748068,
|
||||
"avg": 273.2997868356056,
|
||||
@ -211,7 +214,7 @@
|
||||
},
|
||||
{
|
||||
"hostname": "taurusi6490",
|
||||
"id": 10,
|
||||
"id": "10",
|
||||
"statistics": {
|
||||
"min": 0.0,
|
||||
"avg": 1678.8419461262179,
|
||||
@ -227,7 +230,7 @@
|
||||
},
|
||||
{
|
||||
"hostname": "taurusi6490",
|
||||
"id": 11,
|
||||
"id": "11",
|
||||
"statistics": {
|
||||
"min": 45.28689133054866,
|
||||
"avg": 609.6644949204072,
|
||||
@ -247,13 +250,14 @@
|
||||
},
|
||||
"mem_bw": {
|
||||
"socket": {
|
||||
"unit": "B/s",
|
||||
"scope": "socket",
|
||||
"unit": {
|
||||
"base": "B/s"
|
||||
},
|
||||
"timestep": 60,
|
||||
"series": [
|
||||
{
|
||||
"hostname": "taurusi6489",
|
||||
"id": 0,
|
||||
"id": "0",
|
||||
"statistics": {
|
||||
"min": 653671812.1661415,
|
||||
"avg": 1637585527.5854635,
|
||||
@ -269,7 +273,7 @@
|
||||
},
|
||||
{
|
||||
"hostname": "taurusi6490",
|
||||
"id": 0,
|
||||
"id": "0",
|
||||
"statistics": {
|
||||
"min": 1520190251.61048,
|
||||
"avg": 1572477682.3850098,
|
||||
@ -289,8 +293,9 @@
|
||||
},
|
||||
"file_bw": {
|
||||
"node": {
|
||||
"unit": "B/s",
|
||||
"scope": "node",
|
||||
"unit": {
|
||||
"base": "B/s"
|
||||
},
|
||||
"timestep": 30,
|
||||
"series": [
|
||||
{
|
||||
@ -341,8 +346,9 @@
|
||||
},
|
||||
"net_bw": {
|
||||
"node": {
|
||||
"unit": "B/s",
|
||||
"scope": "node",
|
||||
"unit": {
|
||||
"base": "B/s"
|
||||
},
|
||||
"timestep": 30,
|
||||
"series": [
|
||||
{
|
||||
@ -393,8 +399,9 @@
|
||||
},
|
||||
"mem_used": {
|
||||
"node": {
|
||||
"unit": "B",
|
||||
"scope": "node",
|
||||
"unit": {
|
||||
"base": "B"
|
||||
},
|
||||
"timestep": 30,
|
||||
"series": [
|
||||
{
|
||||
@ -445,13 +452,14 @@
|
||||
},
|
||||
"cpu_power": {
|
||||
"socket": {
|
||||
"unit": "W",
|
||||
"scope": "socket",
|
||||
"unit": {
|
||||
"base": "W"
|
||||
},
|
||||
"timestep": 60,
|
||||
"series": [
|
||||
{
|
||||
"hostname": "taurusi6489",
|
||||
"id": 0,
|
||||
"id": "0",
|
||||
"statistics": {
|
||||
"min": 35.50647456742635,
|
||||
"avg": 72.08313211552377,
|
||||
@ -467,7 +475,7 @@
|
||||
},
|
||||
{
|
||||
"hostname": "taurusi6490",
|
||||
"id": 0,
|
||||
"id": "0",
|
||||
"statistics": {
|
||||
"min": 83.8466923147859,
|
||||
"avg": 85.18572681122097,
|
||||
|
@ -59,10 +59,6 @@ func setup(t *testing.T) *api.RestApi {
|
||||
const testclusterJson = `{
|
||||
"name": "testcluster",
|
||||
"subClusters": [
|
||||
{
|
||||
"name": "sc0",
|
||||
"nodes": "host120,host121,host122"
|
||||
},
|
||||
{
|
||||
"name": "sc1",
|
||||
"nodes": "host123,host124,host125",
|
||||
@ -70,9 +66,28 @@ func setup(t *testing.T) *api.RestApi {
|
||||
"socketsPerNode": 1,
|
||||
"coresPerSocket": 4,
|
||||
"threadsPerCore": 2,
|
||||
"flopRateScalar": 44,
|
||||
"flopRateSimd": 704,
|
||||
"memoryBandwidth": 80,
|
||||
"flopRateScalar": {
|
||||
"unit": {
|
||||
"prefix": "G",
|
||||
"base": "F/s"
|
||||
},
|
||||
"value": 14
|
||||
},
|
||||
"flopRateSimd": {
|
||||
"unit": {
|
||||
"prefix": "G",
|
||||
"base": "F/s"
|
||||
},
|
||||
"value": 112
|
||||
},
|
||||
"memoryBandwidth": {
|
||||
"unit": {
|
||||
"prefix": "G",
|
||||
"base": "B/s"
|
||||
},
|
||||
"value": 24
|
||||
},
|
||||
"numberOfNodes": 70,
|
||||
"topology": {
|
||||
"node": [0, 1, 2, 3, 4, 5, 6, 7],
|
||||
"socket": [[0, 1, 2, 3, 4, 5, 6, 7]],
|
||||
@ -85,9 +100,10 @@ func setup(t *testing.T) *api.RestApi {
|
||||
"metricConfig": [
|
||||
{
|
||||
"name": "load_one",
|
||||
"unit": "load",
|
||||
"unit": { "base": ""},
|
||||
"scope": "node",
|
||||
"timestep": 60,
|
||||
"aggregation": "avg",
|
||||
"peak": 8,
|
||||
"normal": 0,
|
||||
"caution": 0,
|
||||
@ -95,19 +111,38 @@ func setup(t *testing.T) *api.RestApi {
|
||||
}
|
||||
]
|
||||
}`
|
||||
|
||||
const taurusclusterJson = `{
|
||||
"name": "taurus",
|
||||
"SubClusters": [
|
||||
"subClusters": [
|
||||
{
|
||||
"name": "haswell",
|
||||
"processorType": "Intel Haswell",
|
||||
"socketsPerNode": 2,
|
||||
"coresPerSocket": 12,
|
||||
"threadsPerCore": 1,
|
||||
"flopRateScalar": 32,
|
||||
"flopRateSimd": 512,
|
||||
"memoryBandwidth": 60,
|
||||
"flopRateScalar": {
|
||||
"unit": {
|
||||
"prefix": "G",
|
||||
"base": "F/s"
|
||||
},
|
||||
"value": 14
|
||||
},
|
||||
"flopRateSimd": {
|
||||
"unit": {
|
||||
"prefix": "G",
|
||||
"base": "F/s"
|
||||
},
|
||||
"value": 112
|
||||
},
|
||||
"memoryBandwidth": {
|
||||
"unit": {
|
||||
"prefix": "G",
|
||||
"base": "B/s"
|
||||
},
|
||||
"value": 24
|
||||
},
|
||||
"numberOfNodes": 70,
|
||||
"nodes": "w11[27-45,49-63,69-72]",
|
||||
"topology": {
|
||||
"node": [ 0, 1 ],
|
||||
"socket": [
|
||||
@ -126,8 +161,13 @@ func setup(t *testing.T) *api.RestApi {
|
||||
{
|
||||
"name": "cpu_used",
|
||||
"scope": "core",
|
||||
"unit": "",
|
||||
"unit": {"base": ""},
|
||||
"aggregation": "avg",
|
||||
"timestep": 30,
|
||||
"peak": 1,
|
||||
"normal": 0.5,
|
||||
"caution": 2e-07,
|
||||
"alert": 1e-07,
|
||||
"subClusters": [
|
||||
{
|
||||
"name": "haswell",
|
||||
@ -141,8 +181,13 @@ func setup(t *testing.T) *api.RestApi {
|
||||
{
|
||||
"name": "ipc",
|
||||
"scope": "core",
|
||||
"unit": "IPC",
|
||||
"unit": { "base": "IPC"},
|
||||
"aggregation": "avg",
|
||||
"timestep": 60,
|
||||
"peak": 2,
|
||||
"normal": 1,
|
||||
"caution": 0.1,
|
||||
"alert": 0.5,
|
||||
"subClusters": [
|
||||
{
|
||||
"name": "haswell",
|
||||
@ -156,8 +201,13 @@ func setup(t *testing.T) *api.RestApi {
|
||||
{
|
||||
"name": "flops_any",
|
||||
"scope": "core",
|
||||
"unit": "F/s",
|
||||
"unit": { "base": "F/s"},
|
||||
"aggregation": "sum",
|
||||
"timestep": 60,
|
||||
"peak": 40000000000,
|
||||
"normal": 20000000000,
|
||||
"caution": 30000000000,
|
||||
"alert": 35000000000,
|
||||
"subClusters": [
|
||||
{
|
||||
"name": "haswell",
|
||||
@ -171,8 +221,13 @@ func setup(t *testing.T) *api.RestApi {
|
||||
{
|
||||
"name": "mem_bw",
|
||||
"scope": "socket",
|
||||
"unit": "B/s",
|
||||
"unit": { "base": "B/s"},
|
||||
"aggregation": "sum",
|
||||
"timestep": 60,
|
||||
"peak": 58800000000,
|
||||
"normal": 28800000000,
|
||||
"caution": 38800000000,
|
||||
"alert": 48800000000,
|
||||
"subClusters": [
|
||||
{
|
||||
"name": "haswell",
|
||||
@ -186,8 +241,13 @@ func setup(t *testing.T) *api.RestApi {
|
||||
{
|
||||
"name": "file_bw",
|
||||
"scope": "node",
|
||||
"unit": "B/s",
|
||||
"unit": { "base": "B/s"},
|
||||
"aggregation": "sum",
|
||||
"timestep": 30,
|
||||
"peak": 20000000000,
|
||||
"normal": 5000000000,
|
||||
"caution": 9000000000,
|
||||
"alert": 19000000000,
|
||||
"subClusters": [
|
||||
{
|
||||
"name": "haswell",
|
||||
@ -201,8 +261,13 @@ func setup(t *testing.T) *api.RestApi {
|
||||
{
|
||||
"name": "net_bw",
|
||||
"scope": "node",
|
||||
"unit": "B/s",
|
||||
"unit": { "base": "B/s"},
|
||||
"timestep": 30,
|
||||
"aggregation": "sum",
|
||||
"peak": 7000000000,
|
||||
"normal": 5000000000,
|
||||
"caution": 6000000000,
|
||||
"alert": 6500000000,
|
||||
"subClusters": [
|
||||
{
|
||||
"name": "haswell",
|
||||
@ -216,8 +281,13 @@ func setup(t *testing.T) *api.RestApi {
|
||||
{
|
||||
"name": "mem_used",
|
||||
"scope": "node",
|
||||
"unit": "B",
|
||||
"unit": {"base": "B"},
|
||||
"aggregation": "sum",
|
||||
"timestep": 30,
|
||||
"peak": 32000000000,
|
||||
"normal": 2000000000,
|
||||
"caution": 31000000000,
|
||||
"alert": 30000000000,
|
||||
"subClusters": [
|
||||
{
|
||||
"name": "haswell",
|
||||
@ -231,8 +301,13 @@ func setup(t *testing.T) *api.RestApi {
|
||||
{
|
||||
"name": "cpu_power",
|
||||
"scope": "socket",
|
||||
"unit": "W",
|
||||
"unit": {"base": "W"},
|
||||
"aggregation": "sum",
|
||||
"timestep": 60,
|
||||
"peak": 100,
|
||||
"normal": 80,
|
||||
"caution": 90,
|
||||
"alert": 90,
|
||||
"subClusters": [
|
||||
{
|
||||
"name": "haswell",
|
||||
@ -253,6 +328,10 @@ func setup(t *testing.T) *api.RestApi {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
if err := os.WriteFile(filepath.Join(jobarchive, "version.txt"), []byte(fmt.Sprintf("%d", 1)), 0666); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
if err := os.Mkdir(filepath.Join(jobarchive, "testcluster"), 0777); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
@ -315,13 +394,12 @@ func TestRestApi(t *testing.T) {
|
||||
testData := schema.JobData{
|
||||
"load_one": map[schema.MetricScope]*schema.JobMetric{
|
||||
schema.MetricScopeNode: {
|
||||
Unit: "load",
|
||||
Scope: schema.MetricScopeNode,
|
||||
Unit: schema.Unit{Base: "load"},
|
||||
Timestep: 60,
|
||||
Series: []schema.Series{
|
||||
{
|
||||
Hostname: "host123",
|
||||
Statistics: &schema.MetricStatistics{Min: 0.1, Avg: 0.2, Max: 0.3},
|
||||
Statistics: schema.MetricStatistics{Min: 0.1, Avg: 0.2, Max: 0.3},
|
||||
Data: []schema.Float{0.1, 0.1, 0.1, 0.2, 0.2, 0.2, 0.3, 0.3, 0.3},
|
||||
},
|
||||
},
|
||||
@ -392,15 +470,15 @@ func TestRestApi(t *testing.T) {
|
||||
job.Project != "testproj" ||
|
||||
job.Cluster != "testcluster" ||
|
||||
job.SubCluster != "sc1" ||
|
||||
job.Partition != "default" ||
|
||||
job.Walltime != 3600 ||
|
||||
job.ArrayJobId != 0 ||
|
||||
*job.Partition != "default" ||
|
||||
*job.Walltime != 3600 ||
|
||||
*job.ArrayJobId != 0 ||
|
||||
job.NumNodes != 1 ||
|
||||
job.NumHWThreads != 8 ||
|
||||
job.NumAcc != 0 ||
|
||||
*job.NumHWThreads != 8 ||
|
||||
*job.NumAcc != 0 ||
|
||||
job.Exclusive != 1 ||
|
||||
job.MonitoringStatus != 1 ||
|
||||
job.SMT != 1 ||
|
||||
*job.SMT != 1 ||
|
||||
!reflect.DeepEqual(job.Resources, []*schema.Resource{{Hostname: "host123", HWThreads: []int{0, 1, 2, 3, 4, 5, 6, 7}}}) ||
|
||||
job.StartTime.Unix() != 123456789 {
|
||||
t.Fatalf("unexpected job properties: %#v", job)
|
||||
@ -488,13 +566,13 @@ func TestRestApi(t *testing.T) {
|
||||
}
|
||||
})
|
||||
|
||||
t.Run("FailedJob", func(t *testing.T) {
|
||||
subtestLetJobFail(t, restapi, r)
|
||||
})
|
||||
// t.Run("FailedJob", func(t *testing.T) {
|
||||
// subtestLetJobFail(t, restapi, r)
|
||||
// })
|
||||
|
||||
t.Run("ImportJob", func(t *testing.T) {
|
||||
testImportFlag(t)
|
||||
})
|
||||
// t.Run("ImportJob", func(t *testing.T) {
|
||||
// testImportFlag(t)
|
||||
// })
|
||||
}
|
||||
|
||||
func subtestLetJobFail(t *testing.T, restapi *api.RestApi, r *mux.Router) {
|
||||
@ -505,19 +583,15 @@ func subtestLetJobFail(t *testing.T, restapi *api.RestApi, r *mux.Router) {
|
||||
"cluster": "testcluster",
|
||||
"partition": "default",
|
||||
"walltime": 3600,
|
||||
"arrayJobId": 0,
|
||||
"numNodes": 1,
|
||||
"numAcc": 0,
|
||||
"exclusive": 1,
|
||||
"monitoringStatus": 1,
|
||||
"smt": 1,
|
||||
"tags": [],
|
||||
"resources": [
|
||||
{
|
||||
"hostname": "host123"
|
||||
}
|
||||
],
|
||||
"metaData": {},
|
||||
"startTime": 12345678
|
||||
}`
|
||||
|
||||
@ -596,4 +670,17 @@ func testImportFlag(t *testing.T) {
|
||||
if len(data) != 8 {
|
||||
t.Errorf("Job data length: Got %d, want 8", len(data))
|
||||
}
|
||||
|
||||
r := map[string]string{"mem_used": "GB", "net_bw": "KB/s",
|
||||
"cpu_power": "W", "cpu_used": "",
|
||||
"file_bw": "KB/s", "flops_any": "F/s",
|
||||
"mem_bw": "GB/s", "ipc": "IPC"}
|
||||
|
||||
for name, scopes := range data {
|
||||
for _, metric := range scopes {
|
||||
if metric.Unit.Base != r[name] {
|
||||
t.Errorf("Metric %s unit: Got %s, want %s", name, metric.Unit.Base, r[name])
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -5,10 +5,8 @@
|
||||
"cluster": "taurus",
|
||||
"subCluster": "haswell",
|
||||
"partition": "haswell64",
|
||||
"arrayJobId": 0,
|
||||
"numNodes": 2,
|
||||
"numHwthreads": 4,
|
||||
"numAcc": 0,
|
||||
"exclusive": 0,
|
||||
"startTime": 1635856524,
|
||||
"jobState": "completed",
|
||||
@ -18,11 +16,17 @@
|
||||
"resources": [
|
||||
{
|
||||
"hostname": "taurusi6489",
|
||||
"hwthreads": [ 0, 1 ]
|
||||
"hwthreads": [
|
||||
0,
|
||||
1
|
||||
]
|
||||
},
|
||||
{
|
||||
"hostname": "taurusi6490",
|
||||
"hwthreads": [ 10, 11 ]
|
||||
"hwthreads": [
|
||||
10,
|
||||
11
|
||||
]
|
||||
}
|
||||
],
|
||||
"statistics": {
|
||||
@ -30,49 +34,65 @@
|
||||
"min": 0.03694102397926118,
|
||||
"avg": 0.48812580468611544,
|
||||
"max": 1.0000000000000002,
|
||||
"unit": "cpu used"
|
||||
"unit": {
|
||||
"base": ""
|
||||
}
|
||||
},
|
||||
"ipc": {
|
||||
"min": 0.30469640475234366,
|
||||
"avg": 1.154312070173657,
|
||||
"max": 1.797623522191001,
|
||||
"unit": "IPC"
|
||||
"unit": {
|
||||
"base": "IPC"
|
||||
}
|
||||
},
|
||||
"flops_any": {
|
||||
"min": 0.0,
|
||||
"avg": 686.5190320308598,
|
||||
"max": 4346.591400350933,
|
||||
"unit": "F/s"
|
||||
"unit": {
|
||||
"base": "F/s"
|
||||
}
|
||||
},
|
||||
"mem_bw": {
|
||||
"min": 653671812.1661415,
|
||||
"avg": 1605031604.9852366,
|
||||
"max": 2614718291.9554267,
|
||||
"unit": "B/s"
|
||||
"unit": {
|
||||
"base": "B/s"
|
||||
}
|
||||
},
|
||||
"file_bw": {
|
||||
"min": 0.0,
|
||||
"avg": 620592.5419124186,
|
||||
"max": 11559156.360352296,
|
||||
"unit": "B/s"
|
||||
"unit": {
|
||||
"base": "B/s"
|
||||
}
|
||||
},
|
||||
"net_bw": {
|
||||
"min": 126779.89655880642,
|
||||
"avg": 763101.082138246,
|
||||
"max": 1916309.7075416835,
|
||||
"unit": "B/s"
|
||||
"unit": {
|
||||
"base": "B/s"
|
||||
}
|
||||
},
|
||||
"mem_used": {
|
||||
"min": 2779066368.0,
|
||||
"avg": 9647598685.09091,
|
||||
"max": 10202595328.0,
|
||||
"unit": "B"
|
||||
"unit": {
|
||||
"base": "B"
|
||||
}
|
||||
},
|
||||
"cpu_power": {
|
||||
"min": 35.50647456742635,
|
||||
"avg": 78.63442946337237,
|
||||
"max": 85.83909286117324,
|
||||
"unit": "W"
|
||||
"unit": {
|
||||
"base": "W"
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
36 tools/archive-manager/main.go Normal file
@ -0,0 +1,36 @@
// Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
package main

import (
	"encoding/json"
	"flag"
	"fmt"
	"log"

	"github.com/ClusterCockpit/cc-backend/internal/config"
	"github.com/ClusterCockpit/cc-backend/pkg/archive"
)

func main() {
	var srcPath, flagConfigFile string

	flag.StringVar(&srcPath, "s", "./var/job-archive", "Specify the source job archive path. Default is ./var/job-archive")
	flag.StringVar(&flagConfigFile, "config", "./config.json", "Specify alternative path to `config.json`")
	flag.Parse()
	archiveCfg := fmt.Sprintf("{\"kind\": \"file\",\"path\": \"%s\"}", srcPath)

	config.Init(flagConfigFile)
	config.Keys.Validate = true

	if err := archive.Init(json.RawMessage(archiveCfg), false); err != nil {
		log.Fatal(err)
	}
	ar := archive.GetHandle()

	for job := range ar.Iter(true) {
		log.Printf("Validate %s - %d\n", job.Meta.Cluster, job.Meta.JobID)
	}
}
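Presumably the new tool is invoked as `go run ./tools/archive-manager -s ./var/job-archive -config ./config.json`: with `config.Keys.Validate` forced to true, `ar.Iter(true)` decodes every job in the archive, each visited job is printed, and schema violations surface in the log.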
65 tools/archive-migration/cluster.go Normal file
@ -0,0 +1,65 @@
|
||||
// Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg.
|
||||
// All rights reserved.
|
||||
// Use of this source code is governed by a MIT-style
|
||||
// license that can be found in the LICENSE file.
|
||||
package main
|
||||
|
||||
import (
|
||||
"github.com/ClusterCockpit/cc-backend/pkg/schema"
|
||||
)
|
||||
|
||||
// type Accelerator struct {
|
||||
// ID string `json:"id"`
|
||||
// Type string `json:"type"`
|
||||
// Model string `json:"model"`
|
||||
// }
|
||||
|
||||
// type Topology struct {
|
||||
// Node []int `json:"node"`
|
||||
// Socket [][]int `json:"socket"`
|
||||
// MemoryDomain [][]int `json:"memoryDomain"`
|
||||
// Die [][]int `json:"die"`
|
||||
// Core [][]int `json:"core"`
|
||||
// Accelerators []*Accelerator `json:"accelerators"`
|
||||
// }
|
||||
|
||||
type SubCluster struct {
|
||||
Name string `json:"name"`
|
||||
Nodes string `json:"nodes"`
|
||||
NumberOfNodes int `json:"numberOfNodes"`
|
||||
ProcessorType string `json:"processorType"`
|
||||
SocketsPerNode int `json:"socketsPerNode"`
|
||||
CoresPerSocket int `json:"coresPerSocket"`
|
||||
ThreadsPerCore int `json:"threadsPerCore"`
|
||||
FlopRateScalar int `json:"flopRateScalar"`
|
||||
FlopRateSimd int `json:"flopRateSimd"`
|
||||
MemoryBandwidth int `json:"memoryBandwidth"`
|
||||
Topology *schema.Topology `json:"topology"`
|
||||
}
|
||||
|
||||
// type SubClusterConfig struct {
|
||||
// Name string `json:"name"`
|
||||
// Peak float64 `json:"peak"`
|
||||
// Normal float64 `json:"normal"`
|
||||
// Caution float64 `json:"caution"`
|
||||
// Alert float64 `json:"alert"`
|
||||
// }
|
||||
|
||||
type MetricConfig struct {
|
||||
Name string `json:"name"`
|
||||
Unit string `json:"unit"`
|
||||
Scope schema.MetricScope `json:"scope"`
|
||||
Aggregation string `json:"aggregation"`
|
||||
Timestep int `json:"timestep"`
|
||||
Peak float64 `json:"peak"`
|
||||
Normal float64 `json:"normal"`
|
||||
Caution float64 `json:"caution"`
|
||||
Alert float64 `json:"alert"`
|
||||
SubClusters []*schema.SubClusterConfig `json:"subClusters"`
|
||||
}
|
||||
|
||||
type Cluster struct {
|
||||
Name string `json:"name"`
|
||||
MetricConfig []*MetricConfig `json:"metricConfig"`
|
||||
SubClusters []*SubCluster `json:"subClusters"`
|
||||
}
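These types mirror the legacy (pre-migration) archive format, where flop rates were plain integers and units plain strings. A hedged sketch of how a legacy unit string could map onto the new structured unit, reusing ConvertUnitString from the units package changed earlier in this PR (import path assumed):

```go
// Sketch only: units refers to github.com/ClusterCockpit/cc-backend/pkg/units (assumed path).
func convertLegacyUnit(mc *MetricConfig) schema.Unit {
	// e.g. "GB/s" -> schema.Unit{Base: "B/s", Prefix: "G"}
	return units.ConvertUnitString(mc.Unit)
}
```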
|
166 tools/archive-migration/clusterConfig.go Normal file
@ -0,0 +1,166 @@
|
||||
// Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg.
|
||||
// All rights reserved.
|
||||
// Use of this source code is governed by a MIT-style
|
||||
// license that can be found in the LICENSE file.
|
||||
package main
|
||||
|
||||
import (
|
||||
"errors"
|
||||
"fmt"
|
||||
|
||||
"github.com/ClusterCockpit/cc-backend/pkg/archive"
|
||||
"github.com/ClusterCockpit/cc-backend/pkg/schema"
|
||||
)
|
||||
|
||||
var Clusters []*Cluster
|
||||
var nodeLists map[string]map[string]archive.NodeList
|
||||
|
||||
func initClusterConfig() error {
|
||||
|
||||
Clusters = []*Cluster{}
|
||||
nodeLists = map[string]map[string]archive.NodeList{}
|
||||
|
||||
for _, c := range ar.GetClusters() {
|
||||
|
||||
cluster, err := ar.LoadClusterCfg(c)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
if len(cluster.Name) == 0 ||
|
||||
len(cluster.MetricConfig) == 0 ||
|
||||
len(cluster.SubClusters) == 0 {
|
||||
return errors.New("cluster.name, cluster.metricConfig and cluster.SubClusters should not be empty")
|
||||
}
|
||||
|
||||
for _, mc := range cluster.MetricConfig {
|
||||
if len(mc.Name) == 0 {
|
||||
return errors.New("cluster.metricConfig.name should not be empty")
|
||||
}
|
||||
if mc.Timestep < 1 {
|
||||
return errors.New("cluster.metricConfig.timestep should not be smaller than one")
|
||||
}
|
||||
|
||||
// For backwards compatibility...
|
||||
if mc.Scope == "" {
|
||||
mc.Scope = schema.MetricScopeNode
|
||||
}
|
||||
if !mc.Scope.Valid() {
|
||||
return errors.New("cluster.metricConfig.scope must be a valid scope ('node', 'scocket', ...)")
|
||||
}
|
||||
}
|
||||
|
||||
Clusters = append(Clusters, cluster)
|
||||
|
||||
nodeLists[cluster.Name] = make(map[string]archive.NodeList)
|
||||
for _, sc := range cluster.SubClusters {
|
||||
if sc.Nodes == "" {
|
||||
continue
|
||||
}
|
||||
|
||||
nl, err := archive.ParseNodeList(sc.Nodes)
|
||||
if err != nil {
|
||||
return fmt.Errorf("in %s/cluster.json: %w", cluster.Name, err)
|
||||
}
|
||||
nodeLists[cluster.Name][sc.Name] = nl
|
||||
}
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func GetCluster(cluster string) *Cluster {
|
||||
|
||||
for _, c := range Clusters {
|
||||
if c.Name == cluster {
|
||||
return c
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func GetSubCluster(cluster, subcluster string) *SubCluster {
|
||||
|
||||
for _, c := range Clusters {
|
||||
if c.Name == cluster {
|
||||
for _, p := range c.SubClusters {
|
||||
if p.Name == subcluster {
|
||||
return p
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func GetMetricConfig(cluster, metric string) *MetricConfig {
|
||||
|
||||
for _, c := range Clusters {
|
||||
if c.Name == cluster {
|
||||
for _, m := range c.MetricConfig {
|
||||
if m.Name == metric {
|
||||
return m
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// AssignSubCluster sets the `job.subcluster` property of the job based
|
||||
// on its cluster and resources.
|
||||
func AssignSubCluster(job *BaseJob) error {
|
||||
|
||||
cluster := GetCluster(job.Cluster)
|
||||
if cluster == nil {
|
||||
return fmt.Errorf("unkown cluster: %#v", job.Cluster)
|
||||
}
|
||||
|
||||
if job.SubCluster != "" {
|
||||
for _, sc := range cluster.SubClusters {
|
||||
if sc.Name == job.SubCluster {
|
||||
return nil
|
||||
}
|
||||
}
|
||||
return fmt.Errorf("already assigned subcluster %#v unkown (cluster: %#v)", job.SubCluster, job.Cluster)
|
||||
}
|
||||
|
||||
if len(job.Resources) == 0 {
|
||||
return fmt.Errorf("job without any resources/hosts")
|
||||
}
|
||||
|
||||
host0 := job.Resources[0].Hostname
|
||||
for sc, nl := range nodeLists[job.Cluster] {
|
||||
if nl != nil && nl.Contains(host0) {
|
||||
job.SubCluster = sc
|
||||
return nil
|
||||
}
|
||||
}
|
||||
|
||||
if cluster.SubClusters[0].Nodes == "" {
|
||||
job.SubCluster = cluster.SubClusters[0].Name
|
||||
return nil
|
||||
}
|
||||
|
||||
return fmt.Errorf("no subcluster found for cluster %#v and host %#v", job.Cluster, host0)
|
||||
}
|
||||
|
||||
func GetSubClusterByNode(cluster, hostname string) (string, error) {
|
||||
|
||||
for sc, nl := range nodeLists[cluster] {
|
||||
if nl != nil && nl.Contains(hostname) {
|
||||
return sc, nil
|
||||
}
|
||||
}
|
||||
|
||||
c := GetCluster(cluster)
|
||||
if c == nil {
|
||||
return "", fmt.Errorf("unkown cluster: %#v", cluster)
|
||||
}
|
||||
|
||||
if c.SubClusters[0].Nodes == "" {
|
||||
return c.SubClusters[0].Name, nil
|
||||
}
|
||||
|
||||
return "", fmt.Errorf("no subcluster found for cluster %#v and host %#v", cluster, hostname)
|
||||
}
|
109 tools/archive-migration/float.go Normal file
@ -0,0 +1,109 @@
|
||||
// Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg.
|
||||
// All rights reserved.
|
||||
// Use of this source code is governed by a MIT-style
|
||||
// license that can be found in the LICENSE file.
|
||||
package main
|
||||
|
||||
import (
|
||||
"errors"
|
||||
"io"
|
||||
"math"
|
||||
"strconv"
|
||||
)
|
||||
|
||||
// A custom float type is used so that (Un)MarshalJSON and
|
||||
// (Un)MarshalGQL can be overloaded and NaN/null can be used.
|
||||
// The default behaviour of putting every nullable value behind
|
||||
// a pointer has a bigger overhead.
|
||||
type Float float64
|
||||
|
||||
var NaN Float = Float(math.NaN())
|
||||
var nullAsBytes []byte = []byte("null")
|
||||
|
||||
func (f Float) IsNaN() bool {
|
||||
return math.IsNaN(float64(f))
|
||||
}
|
||||
|
||||
// NaN will be serialized to `null`.
|
||||
func (f Float) MarshalJSON() ([]byte, error) {
|
||||
if f.IsNaN() {
|
||||
return nullAsBytes, nil
|
||||
}
|
||||
|
||||
return strconv.AppendFloat(make([]byte, 0, 10), float64(f), 'f', 2, 64), nil
|
||||
}
|
||||
|
||||
// `null` will be deserialized to NaN.
|
||||
func (f *Float) UnmarshalJSON(input []byte) error {
|
||||
s := string(input)
|
||||
if s == "null" {
|
||||
*f = NaN
|
||||
return nil
|
||||
}
|
||||
|
||||
val, err := strconv.ParseFloat(s, 64)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
*f = Float(val)
|
||||
return nil
|
||||
}
|
||||
|
||||
// UnmarshalGQL implements the graphql.Unmarshaler interface.
|
||||
func (f *Float) UnmarshalGQL(v interface{}) error {
|
||||
f64, ok := v.(float64)
|
||||
if !ok {
|
||||
return errors.New("invalid Float scalar")
|
||||
}
|
||||
|
||||
*f = Float(f64)
|
||||
return nil
|
||||
}
|
||||
|
||||
// MarshalGQL implements the graphql.Marshaler interface.
|
||||
// NaN will be serialized to `null`.
|
||||
func (f Float) MarshalGQL(w io.Writer) {
|
||||
if f.IsNaN() {
|
||||
w.Write(nullAsBytes)
|
||||
} else {
|
||||
w.Write(strconv.AppendFloat(make([]byte, 0, 10), float64(f), 'f', 2, 64))
|
||||
}
|
||||
}
|
||||
|
||||
// Only used via REST-API, not via GraphQL.
|
||||
// This uses far fewer allocations per series,
|
||||
// but it turns out that the performance increase
|
||||
// from using this is not that big.
|
||||
func (s *Series) MarshalJSON() ([]byte, error) {
|
||||
buf := make([]byte, 0, 512+len(s.Data)*8)
|
||||
buf = append(buf, `{"hostname":"`...)
|
||||
buf = append(buf, s.Hostname...)
|
||||
buf = append(buf, '"')
|
||||
if s.Id != nil {
|
||||
buf = append(buf, `,"id":`...)
|
||||
buf = strconv.AppendInt(buf, int64(*s.Id), 10)
|
||||
}
|
||||
if s.Statistics != nil {
|
||||
buf = append(buf, `,"statistics":{"min":`...)
|
||||
buf = strconv.AppendFloat(buf, s.Statistics.Min, 'f', 2, 64)
|
||||
buf = append(buf, `,"avg":`...)
|
||||
buf = strconv.AppendFloat(buf, s.Statistics.Avg, 'f', 2, 64)
|
||||
buf = append(buf, `,"max":`...)
|
||||
buf = strconv.AppendFloat(buf, s.Statistics.Max, 'f', 2, 64)
|
||||
buf = append(buf, '}')
|
||||
}
|
||||
buf = append(buf, `,"data":[`...)
|
||||
for i := 0; i < len(s.Data); i++ {
|
||||
if i != 0 {
|
||||
buf = append(buf, ',')
|
||||
}
|
||||
|
||||
if s.Data[i].IsNaN() {
|
||||
buf = append(buf, `null`...)
|
||||
} else {
|
||||
buf = strconv.AppendFloat(buf, float64(s.Data[i]), 'f', 2, 32)
|
||||
}
|
||||
}
|
||||
buf = append(buf, ']', '}')
|
||||
return buf, nil
|
||||
}
|
142 tools/archive-migration/fsBackend.go Normal file
@ -0,0 +1,142 @@
|
||||
// Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg.
|
||||
// All rights reserved.
|
||||
// Use of this source code is governed by a MIT-style
|
||||
// license that can be found in the LICENSE file.
|
||||
package main
|
||||
|
||||
import (
|
||||
"bufio"
|
||||
"bytes"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"strconv"
|
||||
|
||||
"github.com/ClusterCockpit/cc-backend/pkg/log"
|
||||
)
|
||||
|
||||
type FsArchiveConfig struct {
|
||||
Path string `json:"path"`
|
||||
}
|
||||
|
||||
type FsArchive struct {
|
||||
path string
|
||||
clusters []string
|
||||
}
|
||||
|
||||
func getPath(
|
||||
job *JobMeta,
|
||||
rootPath string,
|
||||
file string) string {
|
||||
|
||||
lvl1, lvl2 := fmt.Sprintf("%d", job.JobID/1000), fmt.Sprintf("%03d", job.JobID%1000)
|
||||
return filepath.Join(
|
||||
rootPath,
|
||||
job.Cluster,
|
||||
lvl1, lvl2,
|
||||
strconv.FormatInt(job.StartTime, 10), file)
|
||||
}
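For example, job 1403244 on cluster emmy with start time 1608923076 resolves to `<rootPath>/emmy/1403/244/1608923076/meta.json` (lvl1 = 1403244/1000 = 1403, lvl2 = 244 zero-padded), which is exactly the directory layout of the test archive files added in this PR.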
|
||||
|
||||
func loadJobMeta(filename string) (*JobMeta, error) {
|
||||
|
||||
f, err := os.Open(filename)
|
||||
if err != nil {
|
||||
log.Errorf("fsBackend loadJobMeta()- %v", err)
|
||||
return &JobMeta{}, err
|
||||
}
|
||||
defer f.Close()
|
||||
|
||||
return DecodeJobMeta(bufio.NewReader(f))
|
||||
}
|
||||
|
||||
func (fsa *FsArchive) Init(rawConfig json.RawMessage) error {
|
||||
|
||||
var config FsArchiveConfig
|
||||
if err := json.Unmarshal(rawConfig, &config); err != nil {
|
||||
log.Errorf("fsBackend Init()- %v", err)
|
||||
return err
|
||||
}
|
||||
if config.Path == "" {
|
||||
err := fmt.Errorf("fsBackend Init()- empty path")
|
||||
log.Errorf("fsBackend Init()- %v", err)
|
||||
return err
|
||||
}
|
||||
fsa.path = config.Path
|
||||
|
||||
entries, err := os.ReadDir(fsa.path)
|
||||
if err != nil {
|
||||
log.Errorf("fsBackend Init()- %v", err)
|
||||
return err
|
||||
}
|
||||
|
||||
for _, de := range entries {
|
||||
fsa.clusters = append(fsa.clusters, de.Name())
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func (fsa *FsArchive) Iter() <-chan *JobMeta {
|
||||
|
||||
ch := make(chan *JobMeta)
|
||||
go func() {
|
||||
clustersDir, err := os.ReadDir(fsa.path)
|
||||
if err != nil {
|
||||
log.Fatalf("Reading clusters failed: %s", err.Error())
|
||||
}
|
||||
|
||||
for _, clusterDir := range clustersDir {
|
||||
lvl1Dirs, err := os.ReadDir(filepath.Join(fsa.path, clusterDir.Name()))
|
||||
if err != nil {
|
||||
log.Fatalf("Reading jobs failed: %s", err.Error())
|
||||
}
|
||||
|
||||
for _, lvl1Dir := range lvl1Dirs {
|
||||
if !lvl1Dir.IsDir() {
|
||||
// Could be the cluster.json file
|
||||
continue
|
||||
}
|
||||
|
||||
lvl2Dirs, err := os.ReadDir(filepath.Join(fsa.path, clusterDir.Name(), lvl1Dir.Name()))
|
||||
if err != nil {
|
||||
log.Fatalf("Reading jobs failed: %s", err.Error())
|
||||
}
|
||||
|
||||
for _, lvl2Dir := range lvl2Dirs {
|
||||
dirpath := filepath.Join(fsa.path, clusterDir.Name(), lvl1Dir.Name(), lvl2Dir.Name())
|
||||
startTimeDirs, err := os.ReadDir(dirpath)
|
||||
if err != nil {
|
||||
log.Fatalf("Reading jobs failed: %s", err.Error())
|
||||
}
|
||||
|
||||
for _, startTimeDir := range startTimeDirs {
|
||||
if startTimeDir.IsDir() {
|
||||
job, err := loadJobMeta(filepath.Join(dirpath, startTimeDir.Name(), "meta.json"))
|
||||
if err != nil {
|
||||
log.Errorf("in %s: %s", filepath.Join(dirpath, startTimeDir.Name()), err.Error())
|
||||
} else {
|
||||
ch <- job
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
close(ch)
|
||||
}()
|
||||
return ch
|
||||
}
|
||||
|
||||
func (fsa *FsArchive) LoadClusterCfg(name string) (*Cluster, error) {
|
||||
b, err := os.ReadFile(filepath.Join(fsa.path, name, "cluster.json"))
|
||||
if err != nil {
|
||||
log.Errorf("fsBackend LoadClusterCfg()- %v", err)
|
||||
return &Cluster{}, err
|
||||
}
|
||||
return DecodeCluster(bytes.NewReader(b))
|
||||
}
|
||||
|
||||
func (fsa *FsArchive) GetClusters() []string {
|
||||
return fsa.clusters
|
||||
}
|
162 tools/archive-migration/job.go Normal file
@ -0,0 +1,162 @@
|
||||
// Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg.
|
||||
// All rights reserved.
|
||||
// Use of this source code is governed by a MIT-style
|
||||
// license that can be found in the LICENSE file.
|
||||
package main
|
||||
|
||||
import (
|
||||
"errors"
|
||||
"fmt"
|
||||
"io"
|
||||
"time"
|
||||
|
||||
"github.com/ClusterCockpit/cc-backend/pkg/schema"
|
||||
)
|
||||
|
||||
// Non-Swaggered Comment: BaseJob
|
||||
// Non-Swaggered Comment: Common subset of Job and JobMeta. Use one of those, not this type directly.
|
||||
|
||||
type BaseJob struct {
|
||||
// The unique identifier of a job
|
||||
JobID int64 `json:"jobId" db:"job_id" example:"123000"`
|
||||
User string `json:"user" db:"user" example:"abcd100h"` // The unique identifier of a user
|
||||
Project string `json:"project" db:"project" example:"abcd200"` // The unique identifier of a project
|
||||
Cluster string `json:"cluster" db:"cluster" example:"fritz"` // The unique identifier of a cluster
|
||||
SubCluster string `json:"subCluster" db:"subcluster" example:"main"` // The unique identifier of a sub cluster
|
||||
Partition *string `json:"partition" db:"partition" example:"main"` // The Slurm partition to which the job was submitted
|
||||
ArrayJobId *int64 `json:"arrayJobId" db:"array_job_id" example:"123000"` // The unique identifier of an array job
|
||||
NumNodes int32 `json:"numNodes" db:"num_nodes" example:"2" minimum:"1"` // Number of nodes used (Min > 0)
|
||||
NumHWThreads *int32 `json:"numHwthreads" db:"num_hwthreads" example:"20" minimum:"1"` // Number of HWThreads used (Min > 0)
|
||||
NumAcc *int32 `json:"numAcc" db:"num_acc" example:"2" minimum:"1"` // Number of accelerators used (Min > 0)
|
||||
Exclusive int32 `json:"exclusive" db:"exclusive" example:"1" minimum:"0" maximum:"2"` // Specifies how nodes are shared: 0 - Shared among multiple jobs of multiple users, 1 - Job exclusive (Default), 2 - Shared among multiple jobs of same user
|
||||
MonitoringStatus int32 `json:"monitoringStatus" db:"monitoring_status" example:"1" minimum:"0" maximum:"3"` // State of monitoring system during job run: 0 - Disabled, 1 - Running or Archiving (Default), 2 - Archiving Failed, 3 - Archiving Successful
|
||||
SMT *int32 `json:"smt" db:"smt" example:"4"` // SMT threads used by job
|
||||
State JobState `json:"jobState" db:"job_state" example:"completed" enums:"completed,failed,cancelled,stopped,timeout,out_of_memory"` // Final state of job
|
||||
Duration int32 `json:"duration" db:"duration" example:"43200" minimum:"1"` // Duration of job in seconds (Min > 0)
|
||||
Walltime *int64 `json:"walltime" db:"walltime" example:"86400" minimum:"1"` // Requested walltime of job in seconds (Min > 0)
|
||||
Tags []*schema.Tag `json:"tags"` // List of tags
|
||||
RawResources []byte `json:"-" db:"resources"` // Resources used by job [As Bytes]
|
||||
Resources []*Resource `json:"resources"` // Resources used by job
|
||||
RawMetaData []byte `json:"-" db:"meta_data"` // Additional information about the job [As Bytes]
|
||||
MetaData map[string]string `json:"metaData"` // Additional information about the job
|
||||
}
|
||||
|
||||
// Non-Swaggered Comment: Job
|
||||
// Non-Swaggered Comment: This type is used as the GraphQL interface and using sqlx as a table row.
|
||||
|
||||
// Job model
|
||||
// @Description Information of a HPC job.
|
||||
type Job struct {
|
||||
// The unique identifier of a job in the database
|
||||
ID int64 `json:"id" db:"id"`
|
||||
BaseJob
|
||||
StartTimeUnix int64 `json:"-" db:"start_time" example:"1649723812"` // Start epoch time stamp in seconds
|
||||
StartTime time.Time `json:"startTime"` // Start time as 'time.Time' data type
|
||||
MemUsedMax float64 `json:"-" db:"mem_used_max"` // MemUsedMax as Float64
|
||||
FlopsAnyAvg float64 `json:"-" db:"flops_any_avg"` // FlopsAnyAvg as Float64
|
||||
MemBwAvg float64 `json:"-" db:"mem_bw_avg"` // MemBwAvg as Float64
|
||||
LoadAvg float64 `json:"-" db:"load_avg"` // LoadAvg as Float64
|
||||
NetBwAvg float64 `json:"-" db:"net_bw_avg"` // NetBwAvg as Float64
|
||||
NetDataVolTotal float64 `json:"-" db:"net_data_vol_total"` // NetDataVolTotal as Float64
|
||||
FileBwAvg float64 `json:"-" db:"file_bw_avg"` // FileBwAvg as Float64
|
||||
FileDataVolTotal float64 `json:"-" db:"file_data_vol_total"` // FileDataVolTotal as Float64
|
||||
}
|
||||
|
||||
// Non-Swaggered Comment: JobMeta
|
||||
// Non-Swaggered Comment: When reading from the database or sending data via GraphQL, the start time can be in the much more
|
||||
// Non-Swaggered Comment: convenient time.Time type. In the `meta.json` files, the start time is encoded as a unix epoch timestamp.
|
||||
// Non-Swaggered Comment: This is why there is this struct, which contains all fields from the regular job struct, but "overwrites"
|
||||
// Non-Swaggered Comment: the StartTime field with one of type int64.
|
||||
// Non-Swaggered Comment: ID *int64 `json:"id,omitempty"` >> never used in the job-archive, only available via REST-API
|
||||
|
||||
// JobMeta model
|
||||
// @Description Meta data information of a HPC job.
|
||||
type JobMeta struct {
|
||||
// The unique identifier of a job in the database
|
||||
ID *int64 `json:"id,omitempty"`
|
||||
BaseJob
|
||||
StartTime int64 `json:"startTime" db:"start_time" example:"1649723812" minimum:"1"` // Start epoch time stamp in seconds (Min > 0)
|
||||
Statistics map[string]JobStatistics `json:"statistics,omitempty"` // Metric statistics of job
|
||||
}
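
For illustration, a minimal sketch of the StartTime handling described in the comments above (the two-field JSON snippet is made up; assumes this package's imports "encoding/json", "log" and "time"):

    // Decoding meta.json keeps the start time as a unix epoch int64;
    // converting it into a time.Time (as held by Job.StartTime) is a separate step.
    func exampleStartTime() {
        raw := []byte(`{"jobId": 123000, "startTime": 1649723812}`)
        var m JobMeta
        if err := json.Unmarshal(raw, &m); err != nil {
            log.Fatal(err)
        }
        t := time.Unix(m.StartTime, 0) // epoch seconds -> time.Time
        log.Println(t.UTC())
    }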

const (
    MonitoringStatusDisabled            int32 = 0
    MonitoringStatusRunningOrArchiving  int32 = 1
    MonitoringStatusArchivingFailed     int32 = 2
    MonitoringStatusArchivingSuccessful int32 = 3
)

var JobDefaults BaseJob = BaseJob{
    Exclusive:        1,
    MonitoringStatus: MonitoringStatusRunningOrArchiving,
}

// JobStatistics model
// @Description Specification for job metric statistics.
type JobStatistics struct {
    // Metric unit (see schema/unit.schema.json)
    Unit string  `json:"unit" example:"GHz"`
    Avg  float64 `json:"avg" example:"2500" minimum:"0"` // Job metric average
    Min  float64 `json:"min" example:"2000" minimum:"0"` // Job metric minimum
    Max  float64 `json:"max" example:"3000" minimum:"0"` // Job metric maximum
}

// Tag model
// @Description Defines a tag using name and type.
type Tag struct {
    // The unique DB identifier of a tag
    ID   int64  `json:"id" db:"id"`
    Type string `json:"type" db:"tag_type" example:"Debug"` // Tag Type
    Name string `json:"name" db:"tag_name" example:"Testjob"` // Tag Name
}

// Resource model
// @Description A resource used by a job
type Resource struct {
    Hostname      string   `json:"hostname"` // Name of the host (= node)
    HWThreads     []int    `json:"hwthreads,omitempty"` // List of OS processor ids
    Accelerators  []string `json:"accelerators,omitempty"` // List of accelerator device ids
    Configuration string   `json:"configuration,omitempty"` // The configuration options of the node
}

type JobState string

const (
    JobStateRunning     JobState = "running"
    JobStateCompleted   JobState = "completed"
    JobStateFailed      JobState = "failed"
    JobStateCancelled   JobState = "cancelled"
    JobStateStopped     JobState = "stopped"
    JobStateTimeout     JobState = "timeout"
    JobStatePreempted   JobState = "preempted"
    JobStateOutOfMemory JobState = "out_of_memory"
)

func (e *JobState) UnmarshalGQL(v interface{}) error {
    str, ok := v.(string)
    if !ok {
        return fmt.Errorf("enums must be strings")
    }

    *e = JobState(str)
    if !e.Valid() {
        return errors.New("invalid job state")
    }

    return nil
}

func (e JobState) MarshalGQL(w io.Writer) {
    fmt.Fprintf(w, "\"%s\"", e)
}

func (e JobState) Valid() bool {
    return e == JobStateRunning ||
        e == JobStateCompleted ||
        e == JobStateFailed ||
        e == JobStateCancelled ||
        e == JobStateStopped ||
        e == JobStateTimeout ||
        e == JobStatePreempted ||
        e == JobStateOutOfMemory
}
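
As a usage note, the GraphQL marshalling pair above round-trips plain strings; a short sketch (hypothetical helper, assumes "fmt" and "os"):

    func exampleJobState() {
        var s JobState
        if err := s.UnmarshalGQL("completed"); err != nil {
            // strings outside the enum fail Valid() and error out here
            fmt.Println(err)
            return
        }
        s.MarshalGQL(os.Stdout) // writes "completed", quoted for GraphQL
    }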
66 tools/archive-migration/json.go Normal file
@ -0,0 +1,66 @@
// Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
package main

import (
    "encoding/json"
    "io"

    "github.com/ClusterCockpit/cc-backend/pkg/schema"
)

func DecodeJobData(r io.Reader) (*JobData, error) {
    var d JobData
    if err := json.NewDecoder(r).Decode(&d); err != nil {
        return nil, err
    }

    return &d, nil
}

func DecodeJobMeta(r io.Reader) (*JobMeta, error) {
    var d JobMeta
    if err := json.NewDecoder(r).Decode(&d); err != nil {
        return nil, err
    }

    return &d, nil
}

func DecodeCluster(r io.Reader) (*Cluster, error) {
    var c Cluster
    if err := json.NewDecoder(r).Decode(&c); err != nil {
        return nil, err
    }

    return &c, nil
}

func EncodeJobData(w io.Writer, d *schema.JobData) error {
    // Sanitize parameters
    if err := json.NewEncoder(w).Encode(d); err != nil {
        return err
    }

    return nil
}

func EncodeJobMeta(w io.Writer, d *schema.JobMeta) error {
    // Sanitize parameters
    if err := json.NewEncoder(w).Encode(d); err != nil {
        return err
    }

    return nil
}

func EncodeCluster(w io.Writer, c *schema.Cluster) error {
    // Sanitize parameters
    if err := json.NewEncoder(w).Encode(c); err != nil {
        return err
    }

    return nil
}
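
A hedged usage sketch of these helpers, converting a single job's meta.json (the file names are assumptions; deepCopyJobMeta is defined in main.go below; assumes "os" and "log"):

    func exampleMigrateMeta() {
        in, err := os.Open("meta.json") // hypothetical old-format file
        if err != nil {
            log.Fatal(err)
        }
        defer in.Close()

        old, err := DecodeJobMeta(in)
        if err != nil {
            log.Fatal(err)
        }

        out, err := os.Create("meta.new.json") // hypothetical destination
        if err != nil {
            log.Fatal(err)
        }
        defer out.Close()

        jmn := deepCopyJobMeta(old) // old archive types -> new schema types
        if err := EncodeJobMeta(out, &jmn); err != nil {
            log.Fatal(err)
        }
    }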
318 tools/archive-migration/main.go Normal file
@ -0,0 +1,318 @@
// Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
package main

import (
    "bufio"
    "encoding/json"
    "errors"
    "flag"
    "fmt"
    "log"
    "os"
    "path/filepath"
    "sync"

    "github.com/ClusterCockpit/cc-backend/pkg/schema"
    "github.com/ClusterCockpit/cc-backend/pkg/units"
)

const Version = 1

var ar FsArchive

func loadJobData(filename string) (*JobData, error) {

    f, err := os.Open(filename)
    if err != nil {
        return &JobData{}, fmt.Errorf("fsBackend loadJobData()- %v", err)
    }
    defer f.Close()

    return DecodeJobData(bufio.NewReader(f))
}

func deepCopyJobMeta(j *JobMeta) schema.JobMeta {
    var jn schema.JobMeta

    // required properties
    jn.JobID = j.JobID
    jn.User = j.User
    jn.Project = j.Project
    jn.Cluster = j.Cluster
    jn.SubCluster = j.SubCluster
    jn.NumNodes = j.NumNodes
    jn.Exclusive = j.Exclusive
    jn.StartTime = j.StartTime
    jn.State = schema.JobState(j.State)
    jn.Duration = j.Duration

    for _, ro := range j.Resources {
        var rn schema.Resource
        rn.Hostname = ro.Hostname
        rn.Configuration = ro.Configuration
        hwt := make([]int, len(ro.HWThreads))
        if ro.HWThreads != nil {
            copy(hwt, ro.HWThreads)
        }
        rn.HWThreads = hwt
        acc := make([]string, len(ro.Accelerators))
        if ro.Accelerators != nil {
            copy(acc, ro.Accelerators)
        }
        rn.Accelerators = acc
        jn.Resources = append(jn.Resources, &rn)
    }
    jn.MetaData = make(map[string]string)

    for k, v := range j.MetaData {
        jn.MetaData[k] = v
    }

    jn.Statistics = make(map[string]schema.JobStatistics)
    for k, v := range j.Statistics {
        var sn schema.JobStatistics
        sn.Avg = v.Avg
        sn.Max = v.Max
        sn.Min = v.Min
        tmpUnit := units.ConvertUnitString(v.Unit)
        if tmpUnit.Base == "inval" {
            sn.Unit = schema.Unit{Base: ""}
        } else {
            sn.Unit = tmpUnit
        }
        jn.Statistics[k] = sn
    }

    // optional properties
    jn.Partition = j.Partition
    jn.ArrayJobId = j.ArrayJobId
    jn.NumHWThreads = j.NumHWThreads
    jn.NumAcc = j.NumAcc
    jn.MonitoringStatus = j.MonitoringStatus
    jn.SMT = j.SMT
    jn.Walltime = j.Walltime

    for _, t := range j.Tags {
        jn.Tags = append(jn.Tags, t)
    }

    return jn
}
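
The unit sanitation above shows up directly in the archived JSON: a plain unit string becomes a structured unit object, matching the `unit { base, prefix }` shape the frontend queries below. An illustrative before/after for one statistics entry (the exact split of "GB/s" into prefix and base is an assumption about units.ConvertUnitString, and the numbers are made up):

    "mem_bw": { "unit": "GB/s", "avg": 4.0, "min": 0.0, "max": 9.2 }                           // old
    "mem_bw": { "unit": { "prefix": "G", "base": "B/s" }, "avg": 4.0, "min": 0.0, "max": 9.2 } // new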
func deepCopyJobData(d *JobData, cluster string, subCluster string) *schema.JobData {
    var dn = make(schema.JobData)

    for k, v := range *d {
        // fmt.Printf("Metric %s\n", k)
        dn[k] = make(map[schema.MetricScope]*schema.JobMetric)

        for mk, mv := range v {
            // fmt.Printf("Scope %s\n", mk)
            var mn schema.JobMetric
            tmpUnit := units.ConvertUnitString(mv.Unit)
            if tmpUnit.Base == "inval" {
                mn.Unit = schema.Unit{Base: ""}
            } else {
                mn.Unit = tmpUnit
            }

            mn.Timestep = mv.Timestep

            for _, v := range mv.Series {
                var sn schema.Series
                sn.Hostname = v.Hostname
                if v.Id != nil {
                    var id = new(string)

                    if mk == schema.MetricScopeAccelerator {
                        s := GetSubCluster(cluster, subCluster)
                        var err error

                        *id, err = s.Topology.GetAcceleratorID(*v.Id)
                        if err != nil {
                            log.Fatal(err)
                        }

                    } else {
                        *id = fmt.Sprint(*v.Id)
                    }
                    sn.Id = id
                }
                if v.Statistics != nil {
                    sn.Statistics = schema.MetricStatistics{
                        Avg: v.Statistics.Avg,
                        Min: v.Statistics.Min,
                        Max: v.Statistics.Max}
                }

                sn.Data = make([]schema.Float, len(v.Data))
                copy(sn.Data, v.Data)
                mn.Series = append(mn.Series, sn)
            }

            dn[k][mk] = &mn
        }
        // fmt.Printf("FINISH %s\n", k)
    }

    return &dn
}
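
This loop carries the fix for the series "id" type mismatch (#101): ids are written as strings, and accelerator-scoped series get the real device id from the topology. Illustrative series objects (the accelerator address is a made-up example):

    { "hostname": "node01", "id": 0 }                  // old: integer id
    { "hostname": "node01", "id": "0" }                // new: stringified id
    { "hostname": "node01", "id": "00000000:0B:00.0" } // new, accelerator scope: topology device id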

func deepCopyClusterConfig(co *Cluster) schema.Cluster {
    var cn schema.Cluster

    cn.Name = co.Name
    for _, sco := range co.SubClusters {
        var scn schema.SubCluster
        scn.Name = sco.Name
        scn.Nodes = sco.Nodes
        scn.ProcessorType = sco.ProcessorType
        scn.SocketsPerNode = sco.SocketsPerNode
        scn.CoresPerSocket = sco.CoresPerSocket
        scn.ThreadsPerCore = sco.ThreadsPerCore
        var prefix = new(string)
        *prefix = "G"
        scn.FlopRateScalar = schema.MetricValue{
            Unit:  schema.Unit{Base: "F/s", Prefix: prefix},
            Value: float64(sco.FlopRateScalar)}
        scn.FlopRateSimd = schema.MetricValue{
            Unit:  schema.Unit{Base: "F/s", Prefix: prefix},
            Value: float64(sco.FlopRateSimd)}
        scn.MemoryBandwidth = schema.MetricValue{
            Unit:  schema.Unit{Base: "B/s", Prefix: prefix},
            Value: float64(sco.MemoryBandwidth)}
        scn.Topology = *sco.Topology
        cn.SubClusters = append(cn.SubClusters, &scn)
    }

    for _, mco := range co.MetricConfig {
        var mcn schema.MetricConfig
        mcn.Name = mco.Name
        mcn.Scope = mco.Scope
        if mco.Aggregation == "" {
            fmt.Println("Property aggregation missing! Please review file!")
            mcn.Aggregation = "sum"
        } else {
            mcn.Aggregation = mco.Aggregation
        }
        mcn.Timestep = mco.Timestep
        tmpUnit := units.ConvertUnitString(mco.Unit)
        if tmpUnit.Base == "inval" {
            mcn.Unit = schema.Unit{Base: ""}
        } else {
            mcn.Unit = tmpUnit
        }
        mcn.Peak = mco.Peak
        mcn.Normal = mco.Normal
        mcn.Caution = mco.Caution
        mcn.Alert = mco.Alert
        mcn.SubClusters = mco.SubClusters

        cn.MetricConfig = append(cn.MetricConfig, &mcn)
    }

    return cn
}
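
This is where the sub cluster part of the MetricConfig fix (#99) lands on disk: flop rate and memory bandwidth change from bare numbers (implicitly giga-scaled, hence the hard-coded "G" prefix) to MetricValue objects. Illustrative cluster.json fragment (value made up):

    "flopRateSimd": 2048                                                        // old
    "flopRateSimd": { "unit": { "prefix": "G", "base": "F/s" }, "value": 2048 } // new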
func main() {
    var srcPath string
    var dstPath string

    flag.StringVar(&srcPath, "s", "./var/job-archive", "Specify the source job archive path. Default is ./var/job-archive")
    flag.StringVar(&dstPath, "d", "./var/job-archive-new", "Specify the destination job archive path. Default is ./var/job-archive-new")
    flag.Parse()

    if _, err := os.Stat(filepath.Join(srcPath, "version.txt")); !errors.Is(err, os.ErrNotExist) {
        log.Fatal("Archive version exists!")
    }

    srcConfig := fmt.Sprintf("{\"path\": \"%s\"}", srcPath)
    err := ar.Init(json.RawMessage(srcConfig))
    if err != nil {
        log.Fatal(err)
    }

    err = initClusterConfig()
    if err != nil {
        log.Fatal(err)
    }
    // setup new job archive
    err = os.Mkdir(dstPath, 0750)
    if err != nil {
        log.Fatal(err)
    }

    for _, c := range Clusters {
        path := fmt.Sprintf("%s/%s", dstPath, c.Name)
        fmt.Println(path)
        err = os.Mkdir(path, 0750)
        if err != nil {
            log.Fatal(err)
        }
        cn := deepCopyClusterConfig(c)

        f, err := os.Create(fmt.Sprintf("%s/%s/cluster.json", dstPath, c.Name))
        if err != nil {
            log.Fatal(err)
        }
        if err := EncodeCluster(f, &cn); err != nil {
            log.Fatal(err)
        }
        if err := f.Close(); err != nil {
            log.Fatal(err)
        }
    }

    var wg sync.WaitGroup

    for job := range ar.Iter() {
        // fmt.Printf("Job %d\n", job.JobID)
        job := job
        wg.Add(1)

        go func() {
            defer wg.Done()
            path := getPath(job, dstPath, "meta.json")
            // use a goroutine-local error here to avoid racing on the outer err
            if err := os.MkdirAll(filepath.Dir(path), 0750); err != nil {
                log.Fatal(err)
            }
            f, err := os.Create(path)
            if err != nil {
                log.Fatal(err)
            }

            jmn := deepCopyJobMeta(job)
            if err = EncodeJobMeta(f, &jmn); err != nil {
                log.Fatal(err)
            }
            if err = f.Close(); err != nil {
                log.Fatal(err)
            }

            f, err = os.Create(getPath(job, dstPath, "data.json"))
            if err != nil {
                log.Fatal(err)
            }

            var jd *JobData
            jd, err = loadJobData(getPath(job, srcPath, "data.json"))
            if err != nil {
                log.Fatal(err)
            }
            jdn := deepCopyJobData(jd, job.Cluster, job.SubCluster)
            if err := EncodeJobData(f, jdn); err != nil {
                log.Fatal(err)
            }
            if err := f.Close(); err != nil {
                log.Fatal(err)
            }
        }()
    }

    wg.Wait()
    if err := os.WriteFile(filepath.Join(dstPath, "version.txt"), []byte(fmt.Sprintf("%d", Version)), 0644); err != nil {
        log.Fatal(err)
    }
}
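
Taken together, the migration is a one-shot run against the two archive directories; a typical invocation (the binary name is assumed from the .gitignore entry added in this commit, flags as defined above):

    go build -o archive-migration ./tools/archive-migration
    ./archive-migration -s ./var/job-archive -d ./var/job-archive-new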
65 tools/archive-migration/metrics.go Normal file
@ -0,0 +1,65 @@
// Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
package main

import (
    "github.com/ClusterCockpit/cc-backend/pkg/schema"
)

type JobData map[string]map[schema.MetricScope]*JobMetric

type JobMetric struct {
    Unit             string             `json:"unit"`
    Scope            schema.MetricScope `json:"scope"`
    Timestep         int                `json:"timestep"`
    Series           []Series           `json:"series"`
    StatisticsSeries *StatsSeries       `json:"statisticsSeries"`
}

type Series struct {
    Hostname   string            `json:"hostname"`
    Id         *int              `json:"id,omitempty"`
    Statistics *MetricStatistics `json:"statistics"`
    Data       []schema.Float    `json:"data"`
}

type MetricStatistics struct {
    Avg float64 `json:"avg"`
    Min float64 `json:"min"`
    Max float64 `json:"max"`
}

type StatsSeries struct {
    Mean        []Float         `json:"mean"`
    Min         []Float         `json:"min"`
    Max         []Float         `json:"max"`
    Percentiles map[int][]Float `json:"percentiles,omitempty"`
}

// type MetricScope string

// const (
//     MetricScopeInvalid MetricScope = "invalid_scope"

//     MetricScopeNode         MetricScope = "node"
//     MetricScopeSocket       MetricScope = "socket"
//     MetricScopeMemoryDomain MetricScope = "memoryDomain"
//     MetricScopeCore         MetricScope = "core"
//     MetricScopeHWThread     MetricScope = "hwthread"

//     MetricScopeAccelerator MetricScope = "accelerator"
// )

// var metricScopeGranularity map[MetricScope]int = map[MetricScope]int{
//     MetricScopeNode:         10,
//     MetricScopeSocket:       5,
//     MetricScopeMemoryDomain: 3,
//     MetricScopeCore:         2,
//     MetricScopeHWThread:     1,

//     MetricScopeAccelerator: 5, // Special/Randomly chosen

//     MetricScopeInvalid: -1,
// }
@ -1,9 +0,0 @@
// Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
package main

func main() {

}
@ -30,8 +30,8 @@
    let rooflineMaxY
    let colWidth
    let numBins = 50
    const ccconfig = getContext('cc-config'),
          metricConfig = getContext('metrics')
    const ccconfig = getContext('cc-config')
    const metricConfig = getContext('metrics')

    let metricsInHistograms = ccconfig.analysis_view_histogramMetrics,
        metricsInScatterplots = ccconfig.analysis_view_scatterPlotMetrics
@ -161,24 +161,29 @@
    <Histogram
        width={colWidth - 25} height={300 * 0.5}
        data={$statsQuery.data.topUsers.sort((a, b) => b.count - a.count).map(({ count }, idx) => ({ count, value: idx }))}
        label={(x) => x < $statsQuery.data.topUsers.length ? $statsQuery.data.topUsers[Math.floor(x)].name : '0'} />
        label={(x) => x < $statsQuery.data.topUsers.length ? $statsQuery.data.topUsers[Math.floor(x)].name : 'No Users'}
        ylabel="Node Hours [h]"/>
    {/key}
    </div>
</div>
<div class="col-3">
    {#key $statsQuery.data.stats[0].histDuration}
    <h4>Walltime Distribution</h4>
    <h4>Duration Distribution</h4>
    <Histogram
        width={colWidth - 25} height={300}
        data={$statsQuery.data.stats[0].histDuration} />
        width={colWidth - 25}
        data={$statsQuery.data.stats[0].histDuration}
        xlabel="Current Runtimes [h]"
        ylabel="Number of Jobs"/>
    {/key}
</div>
<div class="col-3">
    {#key $statsQuery.data.stats[0].histNumNodes}
    <h4>Number of Nodes Distribution</h4>
    <Histogram
        width={colWidth - 25} height={300}
        data={$statsQuery.data.stats[0].histNumNodes} />
        width={colWidth - 25}
        data={$statsQuery.data.stats[0].histNumNodes}
        xlabel="Allocated Nodes [#]"
        ylabel="Number of Jobs" />
    {/key}
</div>
<div class="col-3">
@ -189,7 +194,7 @@
{:else if $rooflineQuery.data && cluster}
    {#key $rooflineQuery.data}
    <Roofline
        width={colWidth - 25} height={300}
        width={colWidth - 25}
        tiles={$rooflineQuery.data.rooflineHeatmap}
        cluster={cluster.subClusters.length == 1 ? cluster.subClusters[0] : null}
        maxY={rooflineMaxY} />
@ -211,6 +216,7 @@
<Col>
    <Card body>
        These histograms show the distribution of the averages of all jobs matching the filters. Each job/average is weighted by its node hours.
        Note that some metrics could be disabled for specific subclusters as per metricConfig and thus could affect shown average values.
    </Card>
    <br/>
</Col>
@ -224,12 +230,16 @@
    $footprintsQuery.data.footprints.nodehours,
    $footprintsQuery.data.footprints.metrics.find(f => f.metric == metric).data, numBins) }))}
    itemsPerRow={ccconfig.plot_view_plotsPerRow}>
    <h4>{item.metric} [{metricConfig(cluster.name, item.metric)?.unit}]</h4>
    <h4>Average Distribution of '{item.metric}'</h4>

    <Histogram
        width={width} height={250}
        min={item.min} max={item.max}
        data={item.bins} label={item.label} />
        data={item.bins}
        label={item.label}
        xlabel={`${item.metric} Average [${(metricConfig(cluster.name, item.metric)?.unit?.prefix ? metricConfig(cluster.name, item.metric)?.unit?.prefix : '') +
            (metricConfig(cluster.name, item.metric)?.unit?.base ? metricConfig(cluster.name, item.metric)?.unit?.base : '')}]`}
        ylabel="Node Hours [h]" />
</PlotTable>
</Col>
</Row>
@ -238,6 +248,7 @@
<Col>
    <Card body>
        Each circle represents one job. The size of a circle is proportional to its node hours. Darker circles mean multiple jobs have the same averages for the respective metrics.
        Note that some metrics could be disabled for specific subclusters as per metricConfig and thus could affect shown average values.
    </Card>
    <br/>
</Col>
@ -254,12 +265,18 @@

    <ScatterPlot
        width={width} height={250} color={"rgba(0, 102, 204, 0.33)"}
        xLabel={`${item.m1} [${metricConfig(cluster.name, item.m1)?.unit}]`}
        yLabel={`${item.m2} [${metricConfig(cluster.name, item.m2)?.unit}]`}
        xLabel={`${item.m1} [${(metricConfig(cluster.name, item.m1)?.unit?.prefix ? metricConfig(cluster.name, item.m1)?.unit?.prefix : '') +
            (metricConfig(cluster.name, item.m1)?.unit?.base ? metricConfig(cluster.name, item.m1)?.unit?.base : '')}]`}
        yLabel={`${item.m2} [${(metricConfig(cluster.name, item.m2)?.unit?.prefix ? metricConfig(cluster.name, item.m2)?.unit?.prefix : '') +
            (metricConfig(cluster.name, item.m2)?.unit?.base ? metricConfig(cluster.name, item.m2)?.unit?.base : '')}]`}
        X={item.f1} Y={item.f2} S={$footprintsQuery.data.footprints.nodehours} />
</PlotTable>
</Col>
</Row>
{/if}


<style>
    h4 {
        text-align: center;
    }
</style>

@ -81,7 +81,7 @@
    missingMetrics = metricNames.filter(metric => !metrics.some(jm => jm.name == metric))
    missingHosts = job.resources.map(({ hostname }) => ({
        hostname: hostname,
        metrics: metricNames.filter(metric => !metrics.some(jm => jm.metric.scope == 'node' && jm.metric.series.some(series => series.hostname == hostname)))
        metrics: metricNames.filter(metric => !metrics.some(jm => jm.scope == 'node' && jm.metric.series.some(series => series.hostname == hostname)))
    })).filter(({ metrics }) => metrics.length > 0)
    somethingMissing = missingMetrics.length > 0 || missingHosts.length > 0
}
@ -114,8 +114,8 @@
    cluster={clusters
        .find(c => c.name == $initq.data.job.cluster).subClusters
        .find(sc => sc.name == $initq.data.job.subCluster)}
    flopsAny={$jobMetrics.data.jobMetrics.find(m => m.name == 'flops_any' && m.metric.scope == 'node')}
    memBw={$jobMetrics.data.jobMetrics.find(m => m.name == 'mem_bw' && m.metric.scope == 'node')} />
    flopsAny={$jobMetrics.data.jobMetrics.find(m => m.name == 'flops_any' && m.scope == 'node').metric}
    memBw={$jobMetrics.data.jobMetrics.find(m => m.name == 'mem_bw' && m.scope == 'node').metric} />
</Col>
{:else}
<Col></Col>
@ -163,8 +163,9 @@
    bind:this={plots[item.metric]}
    on:more-loaded={({ detail }) => statsTable.moreLoaded(detail)}
    job={$initq.data.job}
    metric={item.metric}
    scopes={item.data.map(x => x.metric)}
    metricName={item.metric}
    rawData={item.data.map(x => x.metric)}
    scopes={item.data.map(x => x.scope)}
    width={width}/>
{:else}
    <Card body color="warning">No data for <code>{item.metric}</code></Card>

@ -17,11 +17,15 @@
    export let authlevel
    export let roles

    let filters, jobList, matchedJobs = null
    let filters = []
    let jobList, matchedJobs = null
    let sorting = { field: 'startTime', order: 'DESC' }, isSortingOpen = false, isMetricsSelectionOpen = false
    let metrics = filterPresets.cluster
        ? ccconfig[`plot_list_selectedMetrics:${filterPresets.cluster}`] || ccconfig.plot_list_selectedMetrics
        : ccconfig.plot_list_selectedMetrics
    let selectedCluster = filterPresets?.cluster ? filterPresets.cluster : null

    $: selectedCluster = filters[0]?.cluster ? filters[0].cluster.eq : null

    // The filterPresets are handled by the Filters component,
    // so we need to wait for it to be ready before we can start a query.
@ -58,7 +62,10 @@
<Filters
    filterPresets={filterPresets}
    bind:this={filters}
    on:update={({ detail }) => jobList.update(detail.filters)} />
    on:update={({ detail }) => {
        filters = detail.filters
        jobList.update(detail.filters)}
    } />
</Col>

<Col xs="3" style="margin-left: auto;">
@ -84,7 +91,7 @@
    bind:isOpen={isSortingOpen} />

<MetricSelection
    cluster={filterPresets.cluster}
    bind:cluster={selectedCluster}
    configName="plot_list_selectedMetrics"
    bind:metrics={metrics}
    bind:isOpen={isMetricsSelectionOpen} />

@ -5,19 +5,22 @@
    import { fetchMetrics, minScope } from './utils'

    export let job
    export let metric
    export let metricName
    export let scopes
    export let width
    export let rawData

    const dispatch = createEventDispatcher()
    const cluster = getContext('clusters').find(cluster => cluster.name == job.cluster)
    const subCluster = cluster.subClusters.find(subCluster => subCluster.name == job.subCluster)
    const metricConfig = cluster.metricConfig.find(metricConfig => metricConfig.name == metric)
    const metricConfig = cluster.metricConfig.find(metricConfig => metricConfig.name == metricName)

    let selectedScope = minScope(scopes.map(s => s.scope)), selectedHost = null, plot, fetching = false, error = null
    let selectedHost = null, plot, fetching = false, error = null
    let selectedScope = minScope(scopes)
    let selectedScopeIndex = scopes.findIndex(s => s == selectedScope)

    $: avaliableScopes = scopes.map(metric => metric.scope)
    $: data = scopes.find(metric => metric.scope == selectedScope)
    $: avaliableScopes = scopes
    $: data = rawData[selectedScopeIndex]
    $: series = data?.series.filter(series => selectedHost == null || series.hostname == selectedHost)

    let from = null, to = null
@ -29,7 +32,7 @@

    export async function loadMore() {
        fetching = true
        let response = await fetchMetrics(job, [metric], ["core"])
        let response = await fetchMetrics(job, [metricName], ["core"])
        fetching = false

        if (response.error) {
@ -38,9 +41,9 @@
        }

        for (let jm of response.data.jobMetrics) {
            if (jm.metric.scope != "node") {
            if (jm.scope != "node") {
                scopes.push(jm.metric)
                selectedScope = jm.metric.scope
                selectedScope = jm.scope
                dispatch('more-loaded', jm)
                if (!avaliableScopes.includes(selectedScope))
                    avaliableScopes = [...avaliableScopes, selectedScope]
@ -52,7 +55,8 @@
</script>
<InputGroup>
    <InputGroupText style="min-width: 150px;">
        {metric} ({metricConfig?.unit})
        {metricName} ({(metricConfig?.unit?.prefix ? metricConfig.unit.prefix : '') +
            (metricConfig?.unit?.base ? metricConfig.unit.base : '')})
    </InputGroupText>
    <select class="form-select" bind:value={selectedScope}>
        {#each avaliableScopes as scope}
@ -82,7 +86,7 @@
    width={width} height={300}
    cluster={cluster} subCluster={subCluster}
    timestep={data.timestep}
    scope={selectedScope} metric={metric}
    scope={selectedScope} metric={metricName}
    series={series} />
{/if}
{/key}

@ -95,7 +95,7 @@

<Modal isOpen={isOpen} toggle={() => (isOpen = !isOpen)}>
    <ModalHeader>
        Configure columns
        Configure columns (Metric availability shown)
    </ModalHeader>
    <ModalBody>
        <ListGroup>
@ -113,9 +113,26 @@
            {/if}
            {metric}
            <span style="float: right;">
                {cluster == null ? clusters
                {cluster == null ?
                    clusters // No single cluster specified: list clusters which have the metric
                        .filter(cluster => cluster.metricConfig.find(m => m.name == metric) != null)
                        .map(cluster => cluster.name).join(', ') : ''}
                        .map(cluster => cluster.name).join(', ') :
                    clusters // Single cluster requested: list subclusters which do not have the metric remove flag set
                        .filter(cluster => cluster.metricConfig.find(m => m.name == metric) != null)
                        .map(function(cluster) {
                            let scNames = cluster.subClusters.map(sc => sc.name)
                            scNames.forEach(function(scName){
                                let met = cluster.metricConfig.find(m => m.name == metric)
                                let msc = met.subClusters.find(msc => msc.name == scName)
                                if (msc != null) {
                                    if (msc.remove == true) {
                                        scNames = scNames.filter(scn => scn != msc.name)
                                    }
                                }
                            })
                            return scNames
                        })
                        .join(', ')}
            </span>
        </li>
        {/each}
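
The subcluster list above is driven by the per-subcluster remove flag in the cluster's metric config; an illustrative fragment (metric and subcluster names are made up, only the shape follows the code):

    "metricConfig": [
        {
            "name": "acc_mem_used",
            "subClusters": [ { "name": "main", "remove": true } ]
        }
    ]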
@ -20,16 +20,19 @@
        from.setMinutes(from.getMinutes() - 30)
    }

    const ccconfig = getContext('cc-config'), clusters = getContext('clusters')
    const ccconfig = getContext('cc-config')
    const clusters = getContext('clusters')

    const nodesQuery = operationStore(`query($cluster: String!, $nodes: [String!], $from: Time!, $to: Time!) {
        nodeMetrics(cluster: $cluster, nodes: $nodes, from: $from, to: $to) {
            host, subCluster
            host
            subCluster
            metrics {
                name,
                name
                scope
                metric {
                    timestep
                    scope
                    unit { base, prefix }
                    series {
                        statistics { min, avg, max }
                        data
@ -46,6 +49,17 @@

    $: $nodesQuery.variables = { cluster, nodes: [hostname], from: from.toISOString(), to: to.toISOString() }

    let metricUnits = {}
    $: if ($nodesQuery.data) {
        for (let metric of clusters.find(c => c.name == cluster).metricConfig) {
            if (metric.unit.prefix || metric.unit.base) {
                metricUnits[metric.name] = '(' + (metric.unit.prefix ? metric.unit.prefix : '') + (metric.unit.base ? metric.unit.base : '') + ')'
            } else { // If no unit defined: Omit Unit Display
                metricUnits[metric.name] = ''
            }
        }
    }

    query(nodesQuery)

    // $: console.log($nodesQuery?.data?.nodeMetrics[0].metrics)
@ -83,7 +97,7 @@
    let:width
    itemsPerRow={ccconfig.plot_view_plotsPerRow}
    items={$nodesQuery.data.nodeMetrics[0].metrics.sort((a, b) => a.name.localeCompare(b.name))}>
    <h4 style="text-align: center;">{item.name}</h4>
    <h4 style="text-align: center;">{item.name} {metricUnits[item.name]}</h4>
    <MetricPlot
        width={width} height={300} metric={item.name} timestep={item.metric.timestep}
        cluster={clusters.find(c => c.name == cluster)} subCluster={$nodesQuery.data.nodeMetrics[0].subCluster}

@ -11,7 +11,7 @@
    const allMetrics = [...new Set(jobMetrics.map(m => m.name))].sort(),
        scopesForMetric = (metric) => jobMetrics
            .filter(jm => jm.name == metric)
            .map(jm => jm.metric.scope)
            .map(jm => jm.scope)

    let hosts = job.resources.map(r => r.hostname).sort(),
        selectedScopes = {},
@ -40,7 +40,7 @@
        s.active = true
    }

    let series = jobMetrics.find(jm => jm.name == metric && jm.metric.scope == 'node')?.metric.series
    let series = jobMetrics.find(jm => jm.name == metric && jm.scope == 'node')?.metric.series
    sorting = {...sorting}
    hosts = hosts.sort((h1, h2) => {
        let s1 = series.find(s => s.hostname == h1)?.statistics

@ -5,7 +5,7 @@
    export let jobMetrics

    $: series = jobMetrics
        .find(jm => jm.name == metric && jm.metric.scope == scope)
        .find(jm => jm.name == metric && jm.scope == scope)
        ?.metric.series.filter(s => s.hostname == host && s.statistics != null)
</script>

@ -2,8 +2,8 @@
    import Refresher from './joblist/Refresher.svelte'
    import Roofline, { transformPerNodeData } from './plots/Roofline.svelte'
    import Histogram from './plots/Histogram.svelte'
    import { Row, Col, Spinner, Card, Table, Progress } from 'sveltestrap'
    import { init } from './utils.js'
    import { Row, Col, Spinner, Card, CardHeader, CardTitle, CardBody, Table, Progress, Icon } from 'sveltestrap'
    import { init, formatNumber } from './utils.js'
    import { operationStore, query } from '@urql/svelte'

    const { query: initq } = init()
@ -15,13 +15,14 @@
    let from = new Date(Date.now() - 5 * 60 * 1000), to = new Date(Date.now())
    const mainQuery = operationStore(`query($cluster: String!, $filter: [JobFilter!]!, $metrics: [String!], $from: Time!, $to: Time!) {
        nodeMetrics(cluster: $cluster, metrics: $metrics, from: $from, to: $to) {
            host,
            subCluster,
            host
            subCluster
            metrics {
                name,
                metric {
                name
                scope
                timestep,
                metric {
                    timestep
                    unit { base, prefix }
                    series { data }
                }
            }
@ -47,20 +48,27 @@
            ? sum + (node.metrics.find(m => m.name == metric)?.metric.series.reduce((sum, series) => sum + series.data[series.data.length - 1], 0) || 0)
            : sum, 0)

    let allocatedNodes = {}, flopRate = {}, memBwRate = {}
    let allocatedNodes = {}, flopRate = {}, flopRateUnit = {}, memBwRate = {}, memBwRateUnit = {}
    $: if ($initq.data && $mainQuery.data) {
        let subClusters = $initq.data.clusters.find(c => c.name == cluster).subClusters
        for (let subCluster of subClusters) {
            allocatedNodes[subCluster.name] = $mainQuery.data.allocatedNodes.find(({ name }) => name == subCluster.name)?.count || 0
            flopRate[subCluster.name] = Math.floor(sumUp($mainQuery.data.nodeMetrics, subCluster.name, 'flops_any') * 100) / 100
            flopRateUnit[subCluster.name] = subCluster.flopRateSimd.unit.prefix + subCluster.flopRateSimd.unit.base
            memBwRate[subCluster.name] = Math.floor(sumUp($mainQuery.data.nodeMetrics, subCluster.name, 'mem_bw') * 100) / 100
            memBwRateUnit[subCluster.name] = subCluster.memoryBandwidth.unit.prefix + subCluster.memoryBandwidth.unit.base
        }
    }

    query(mainQuery)
</script>

<!-- Loading indicator & Refresh -->

<Row>
    <Col xs="auto" style="align-self: flex-end;">
        <h4 class="mb-0" >Current usage of cluster "{cluster}"</h4>
    </Col>
    <Col xs="auto">
        {#if $initq.fetching || $mainQuery.fetching}
            <Spinner/>
@ -89,54 +97,72 @@
    </Col>
</Row>
{/if}

<hr>

<!-- Gauges & Roofline per Subcluster -->

{#if $initq.data && $mainQuery.data}
{#each $initq.data.clusters.find(c => c.name == cluster).subClusters as subCluster, i}
<Row>
    <Col xs="3">
<Row cols={2} class="mb-3 justify-content-center">
    <Col xs="4" class="px-3">
        <Card class="h-auto mt-1">
            <CardHeader>
                <CardTitle class="mb-0">SubCluster "{subCluster.name}"</CardTitle>
            </CardHeader>
            <CardBody>
                <Table>
                    <tr>
                        <th scope="col">SubCluster</th>
                        <td colspan="2">{subCluster.name}</td>
                    </tr>
                    <tr>
                        <th scope="col">Allocated Nodes</th>
                        <td style="min-width: 75px;"><div class="col"><Progress value={allocatedNodes[subCluster.name]} max={subCluster.numberOfNodes}/></div></td>
                        <td>({allocatedNodes[subCluster.name]} / {subCluster.numberOfNodes})</td>
                        <td style="min-width: 100px;"><div class="col"><Progress value={allocatedNodes[subCluster.name]} max={subCluster.numberOfNodes}/></div></td>
                        <td>({allocatedNodes[subCluster.name]} Nodes / {subCluster.numberOfNodes} Total Nodes)</td>
                    </tr>
                    <tr>
                        <th scope="col">Flop Rate</th>
                        <td style="min-width: 75px;"><div class="col"><Progress value={flopRate[subCluster.name]} max={subCluster.flopRateSimd * subCluster.numberOfNodes}/></div></td>
                        <td>({flopRate[subCluster.name]} / {subCluster.flopRateSimd * subCluster.numberOfNodes})</td>
                        <th scope="col">Flop Rate (Any) <Icon name="info-circle" class="p-1" style="cursor: help;" title="Flops[Any] = (Flops[Double] x 2) + Flops[Single]"/></th>
                        <td style="min-width: 100px;"><div class="col"><Progress value={flopRate[subCluster.name]} max={subCluster.flopRateSimd.value * subCluster.numberOfNodes}/></div></td>
                        <td>({flopRate[subCluster.name]} {flopRateUnit[subCluster.name]} / {(subCluster.flopRateSimd.value * subCluster.numberOfNodes)} {flopRateUnit[subCluster.name]} [Max])</td>
                    </tr>
                    <tr>
                        <th scope="col">MemBw Rate</th>
                        <td style="min-width: 75px;"><div class="col"><Progress value={memBwRate[subCluster.name]} max={subCluster.memoryBandwidth * subCluster.numberOfNodes}/></div></td>
                        <td>({memBwRate[subCluster.name]} / {subCluster.memoryBandwidth * subCluster.numberOfNodes})</td>
                        <td style="min-width: 100px;"><div class="col"><Progress value={memBwRate[subCluster.name]} max={subCluster.memoryBandwidth.value * subCluster.numberOfNodes}/></div></td>
                        <td>({memBwRate[subCluster.name]} {memBwRateUnit[subCluster.name]} / {(subCluster.memoryBandwidth.value * subCluster.numberOfNodes)} {memBwRateUnit[subCluster.name]} [Max])</td>
                    </tr>
                </Table>
            </CardBody>
        </Card>
    </Col>
    <div class="col-9" bind:clientWidth={plotWidths[i]}>
    <Col class="px-3">
        <div bind:clientWidth={plotWidths[i]}>
            {#key $mainQuery.data.nodeMetrics}
                <Roofline
                    width={plotWidths[i] - 10} height={300} colorDots={false} cluster={subCluster}
                    width={plotWidths[i] - 10} height={300} colorDots={true} showTime={false} cluster={subCluster}
                    data={transformPerNodeData($mainQuery.data.nodeMetrics.filter(data => data.subCluster == subCluster.name))} />
            {/key}
        </div>
    </Col>
</Row>
{/each}
<Row>
    <div class="col-4" bind:clientWidth={colWidth1}>
        <h4>Top Users</h4>

<hr style="margin-top: -1em;">

<!-- Usage Stats as Histograms -->

<Row cols={4}>
    <Col class="p-2">
        <div bind:clientWidth={colWidth1}>
        <h4 class="mb-3 text-center">Top Users</h4>
        {#key $mainQuery.data}
            <Histogram
                width={colWidth1 - 25} height={300}
                width={colWidth1 - 25}
                data={$mainQuery.data.topUsers.sort((a, b) => b.count - a.count).map(({ count }, idx) => ({ count, value: idx }))}
                label={(x) => x < $mainQuery.data.topUsers.length ? $mainQuery.data.topUsers[Math.floor(x)].name : '0'} />
                label={(x) => x < $mainQuery.data.topUsers.length ? $mainQuery.data.topUsers[Math.floor(x)].name : '0'}
                xlabel="User Name" ylabel="Number of Jobs" />
        {/key}
        </div>
    <div class="col-2">
    </Col>
    <Col class="px-4 py-2">
        <Table>
            <tr><th>Name</th><th>Number of Nodes</th></tr>
            <tr class="mb-2"><th>User Name</th><th>Number of Nodes</th></tr>
            {#each $mainQuery.data.topUsers.sort((a, b) => b.count - a.count) as { name, count }}
            <tr>
                <th scope="col"><a href="/monitoring/user/{name}">{name}</a></th>
@ -144,41 +170,48 @@
            </tr>
            {/each}
        </Table>
    </div>
    <div class="col-4">
        <h4>Top Projects</h4>
    </Col>
    <Col class="p-2">
        <h4 class="mb-3 text-center">Top Projects</h4>
        {#key $mainQuery.data}
            <Histogram
                width={colWidth1 - 25} height={300}
                width={colWidth1 - 25}
                data={$mainQuery.data.topProjects.sort((a, b) => b.count - a.count).map(({ count }, idx) => ({ count, value: idx }))}
                label={(x) => x < $mainQuery.data.topProjects.length ? $mainQuery.data.topProjects[Math.floor(x)].name : '0'} />
                label={(x) => x < $mainQuery.data.topProjects.length ? $mainQuery.data.topProjects[Math.floor(x)].name : '0'}
                xlabel="Project Code" ylabel="Number of Jobs" />
        {/key}
    </div>
    <div class="col-2">
    </Col>
    <Col class="px-4 py-2">
        <Table>
            <tr><th>Name</th><th>Number of Nodes</th></tr>
            <tr class="mb-2"><th>Project Code</th><th>Number of Nodes</th></tr>
            {#each $mainQuery.data.topProjects.sort((a, b) => b.count - a.count) as { name, count }}
            <tr><th scope="col">{name}</th><td>{count}</td></tr>
            {/each}
        </Table>
    </div>
    </Col>
</Row>
<Row>
    <div class="col" bind:clientWidth={colWidth2}>
        <h4>Duration Distribution</h4>
<Row cols={2} class="mt-3">
    <Col class="p-2">
        <div bind:clientWidth={colWidth2}>
        <h4 class="mb-3 text-center">Duration Distribution</h4>
        {#key $mainQuery.data.stats}
            <Histogram
                width={colWidth2 - 25} height={300}
                data={$mainQuery.data.stats[0].histDuration} />
                width={colWidth2 - 25}
                data={$mainQuery.data.stats[0].histDuration}
                xlabel="Current Runtimes [h]"
                ylabel="Number of Jobs" />
        {/key}
        </div>
    <div class="col">
        <h4>Number of Nodes Distribution</h4>
    </Col>
    <Col class="p-2">
        <h4 class="mb-3 text-center">Number of Nodes Distribution</h4>
        {#key $mainQuery.data.stats}
            <Histogram
                width={colWidth2 - 25} height={300}
                data={$mainQuery.data.stats[0].histNumNodes} />
                width={colWidth2 - 25}
                data={$mainQuery.data.stats[0].histNumNodes}
                xlabel="Allocated Nodes [#]"
                ylabel="Number of Jobs" />
        {/key}
    </div>
    </Col>
</Row>
{/if}

@ -21,6 +21,7 @@

    const clusters = getContext('clusters')
    const ccconfig = getContext('cc-config')
    const metricConfig = getContext('metrics')

    let plotHeight = 300
    let hostnameFilter = ''
@ -28,13 +29,14 @@

    const nodesQuery = operationStore(`query($cluster: String!, $metrics: [String!], $from: Time!, $to: Time!) {
        nodeMetrics(cluster: $cluster, metrics: $metrics, from: $from, to: $to) {
            host,
            host
            subCluster
            metrics {
                name,
                metric {
                name
                scope
                timestep,
                metric {
                    timestep
                    unit { base, prefix }
                    series {
                        statistics { min, avg, max }
                        data
@ -49,6 +51,18 @@
        to: to.toISOString()
    })

    let metricUnits = {}
    $: if ($nodesQuery.data) {
        let thisCluster = clusters.find(c => c.name == cluster)
        for (let metric of thisCluster.metricConfig) {
            if (metric.unit.prefix || metric.unit.base) {
                metricUnits[metric.name] = '(' + (metric.unit.prefix ? metric.unit.prefix : '') + (metric.unit.base ? metric.unit.base : '') + ')'
            } else { // If no unit defined: Omit Unit Display
                metricUnits[metric.name] = ''
            }
        }
    }

    $: $nodesQuery.variables = { cluster, metrics: [selectedMetric], from: from.toISOString(), to: to.toISOString() }

    query(nodesQuery)
@ -71,7 +85,7 @@
    <InputGroupText>Metric</InputGroupText>
    <select class="form-select" bind:value={selectedMetric}>
        {#each clusters.find(c => c.name == cluster).metricConfig as metric}
            <option value={metric.name}>{metric.name} ({metric.unit})</option>
            <option value={metric.name}>{metric.name} {metricUnits[metric.name]}</option>
        {/each}
    </select>
</InputGroup>
@ -98,11 +112,23 @@
    let:width
    itemsPerRow={ccconfig.plot_view_plotsPerRow}
    items={$nodesQuery.data.nodeMetrics
        .filter(h => h.host.includes(hostnameFilter) && h.metrics.some(m => m.name == selectedMetric && m.metric.scope == 'node'))
        .map(h => ({ host: h.host, subCluster: h.subCluster, data: h.metrics.find(m => m.name == selectedMetric && m.metric.scope == 'node') }))
        .filter(h => h.host.includes(hostnameFilter) && h.metrics.some(m => m.name == selectedMetric && m.scope == 'node'))
        .map(function (h) {
            let thisConfig = metricConfig(cluster, selectedMetric)
            let thisSCIndex = thisConfig.subClusters.findIndex(sc => sc.name == h.subCluster)
            // Metric remove == true
            if (thisSCIndex >= 0) {
                if (thisConfig.subClusters[thisSCIndex].remove == true) {
                    return { host: h.host, subCluster: h.subCluster, data: null, removed: true }
                }
            }
            // Else
            return { host: h.host, subCluster: h.subCluster, data: h.metrics.find(m => m.name == selectedMetric && m.scope == 'node'), removed: false }
        })
        .sort((a, b) => a.host.localeCompare(b.host))}>

    <h4 style="width: 100%; text-align: center;"><a href="/monitoring/node/{cluster}/{item.host}">{item.host} ({item.subCluster})</a></h4>
    {#if item.removed == false && item.data != null}
        <MetricPlot
            width={width}
            height={plotHeight}
@ -111,6 +137,11 @@
            metric={item.data.name}
            cluster={clusters.find(c => c.name == cluster)}
            subCluster={item.subCluster} />
    {:else if item.removed == true && item.data == null}
        <Card body color="info">Metric '{ selectedMetric }' disabled for subcluster '{ item.subCluster }'</Card>
    {:else}
        <Card body color="warning">Missing Data</Card>
    {/if}
</PlotTable>
{/if}
</Col>

@ -18,10 +18,12 @@
    export let user
    export let filterPresets

    let filters, jobList
    let filters = []
    let jobList
    let sorting = { field: 'startTime', order: 'DESC' }, isSortingOpen = false
    let metrics = ccconfig.plot_list_selectedMetrics, isMetricsSelectionOpen = false
    let w1, w2, histogramHeight = 250
    let selectedCluster = filterPresets?.cluster ? filterPresets.cluster : null

    const stats = operationStore(`
        query($filter: [JobFilter!]!) {
@ -40,6 +42,12 @@
        pause: true
    })

    // filters[filters.findIndex(filter => filter.cluster != null)] ?
    //     filters[filters.findIndex(filter => filter.cluster != null)].cluster.eq :
    //     null
    // Cluster filter always has to be at the first index; the above would throw an error
    $: selectedCluster = filters[0]?.cluster ? filters[0].cluster.eq : null

    query(stats)

    onMount(() => filters.update())
@ -75,11 +83,12 @@
    startTimeQuickSelect={true}
    bind:this={filters}
    on:update={({ detail }) => {
        let filters = [...detail.filters, { user: { eq: user.username } }]
        $stats.variables = { filter: filters }
        let jobFilters = [...detail.filters, { user: { eq: user.username } }]
        $stats.variables = { filter: jobFilters }
        $stats.context.pause = false
        $stats.reexecute()
        jobList.update(filters)
        filters = jobFilters
        jobList.update(jobFilters)
    }} />
</Col>
<Col xs="auto" style="margin-left: auto;">
@ -136,19 +145,23 @@
    </Table>
</Col>
<div class="col-4" style="text-align: center;" bind:clientWidth={w1}>
    <b>Walltime</b>
    <b>Duration Distribution</b>
    {#key $stats.data.jobsStatistics[0].histDuration}
        <Histogram
            data={$stats.data.jobsStatistics[0].histDuration}
            width={w1 - 25} height={histogramHeight} />
            width={w1 - 25} height={histogramHeight}
            xlabel="Current Runtimes [h]"
            ylabel="Number of Jobs"/>
    {/key}
</div>
<div class="col-4" style="text-align: center;" bind:clientWidth={w2}>
    <b>Number of Nodes</b>
    <b>Number of Nodes Distribution</b>
    {#key $stats.data.jobsStatistics[0].histNumNodes}
        <Histogram
            data={$stats.data.jobsStatistics[0].histNumNodes}
            width={w2 - 25} height={histogramHeight} />
            width={w2 - 25} height={histogramHeight}
            xlabel="Allocated Nodes [#]"
            ylabel="Number of Jobs" />
    {/key}
</div>
{/if}
@ -167,6 +180,8 @@
    bind:sorting={sorting}
    bind:isOpen={isSortingOpen} />

<MetricSelection configName="plot_list_selectedMetrics"
<MetricSelection
    bind:cluster={selectedCluster}
    configName="plot_list_selectedMetrics"
    bind:metrics={metrics}
    bind:isOpen={isMetricsSelectionOpen} />
@ -20,6 +20,7 @@
    let text = await res.text()
    popMessage(text, '#048109')
    reloadUserList()
    form.reset()
} else {
    let text = await res.text()
    // console.log(res.statusText)
@ -79,7 +80,12 @@
{#if i == 0}
    <div>
        <input type="radio" id={role} name="role" value={role} checked/>
        <label for={role}>{role.charAt(0).toUpperCase() + role.slice(1)} (regular user, same as if created via LDAP sync.)</label>
        <label for={role}>{role.toUpperCase()} (Allowed to interact with REST API.)</label>
    </div>
{:else if i == 1}
    <div>
        <input type="radio" id={role} name="role" value={role} checked/>
        <label for={role}>{role.charAt(0).toUpperCase() + role.slice(1)} (Same as if created via LDAP sync.)</label>
    </div>
{:else}
    <div>

@ -102,9 +102,11 @@
{#if $initialized}
    ({clusters
        .map(cluster => cluster.metricConfig.find(m => m.name == metric))
        .filter(m => m != null).map(m => m.unit)
        .reduce((arr, unit) => arr.includes(unit) ? arr : [...arr, unit], [])
        .join(', ')})
        .filter(m => m != null)
        .map(m => (m.unit?.prefix ? m.unit?.prefix : '') + (m.unit?.base ? m.unit?.base : '')) // Build unitStr
        .reduce((arr, unitStr) => arr.includes(unitStr) ? arr : [...arr, unitStr], []) // w/o this, output would be [unitStr, unitStr]
        .join(', ')
    })
{/if}
</th>
{/each}

@ -24,12 +24,14 @@
    let scopes = [job.numNodes == 1 ? 'core' : 'node']

    const cluster = getContext('clusters').find(c => c.name == job.cluster)

    // Get all MetricConfs which include subCluster-specific settings for this job
    const metricConfig = getContext('metrics')
    const metricsQuery = operationStore(`query($id: ID!, $metrics: [String!]!, $scopes: [MetricScope!]!) {
        jobMetrics(id: $id, metrics: $metrics, scopes: $scopes) {
            name
            scope
            metric {
                unit, scope, timestep
                unit { prefix, base }, timestep
                statisticsSeries { min, mean, max }
                series {
                    hostname, id, data
@ -44,13 +46,47 @@
    })

    const selectScope = (jobMetrics) => jobMetrics.reduce(
        (a, b) => maxScope([a.metric.scope, b.metric.scope]) == a.metric.scope
        (a, b) => maxScope([a.scope, b.scope]) == a.scope
            ? (job.numNodes > 1 ? a : b)
            : (job.numNodes > 1 ? b : a), jobMetrics[0])

    const sortAndSelectScope = (jobMetrics) => metrics
        .map(name => jobMetrics.filter(jobMetric => jobMetric.name == name))
        .map(jobMetrics => jobMetrics.length > 0 ? selectScope(jobMetrics) : null)
        .map(function(name) {
            // Get MetricConf for this selected/requested metric
            let thisConfig = metricConfig(cluster, name)
            let thisSCIndex = thisConfig.subClusters.findIndex(sc => sc.name == job.subCluster)
            // Check if subcluster has MetricConf: if not found (index == -1), no further remove flag check required
            if (thisSCIndex >= 0) {
                // SubCluster config present: check if remove flag is set
                if (thisConfig.subClusters[thisSCIndex].remove == true) {
                    // Return null data and informational flag
                    return {removed: true, data: null}
                } else {
                    // load and return metric, if data available
                    let thisMetric = jobMetrics.filter(jobMetric => jobMetric.name == name) // Returns Array
                    if (thisMetric.length > 0) {
                        return {removed: false, data: thisMetric}
                    } else {
                        return {removed: false, data: null}
                    }
                }
            } else {
                // No specific subCluster config: 'remove' flag not set, deemed false -> load and return metric, if data available
                let thisMetric = jobMetrics.filter(jobMetric => jobMetric.name == name) // Returns Array
                if (thisMetric.length > 0) {
                    return {removed: false, data: thisMetric}
                } else {
                    return {removed: false, data: null}
                }
            }
        })
        .map(function(jobMetrics) {
            if (jobMetrics.data != null && jobMetrics.data.length > 0) {
                return {removed: jobMetrics.removed, data: selectScope(jobMetrics.data)}
            } else {
                return jobMetrics
            }
        })

    $: metricsQuery.variables = { id: job.id, metrics, scopes }

@ -81,17 +117,20 @@
{:else}
    {#each sortAndSelectScope($metricsQuery.data.jobMetrics) as metric, i (metric || i)}
    <td>
        {#if metric != null}
        <!-- Subcluster Metricconfig remove keyword for jobtables (joblist main, user joblist, project joblist) to be used here as toplevel case -->
        {#if metric.removed == false && metric.data != null}
            <MetricPlot
                width={plotWidth}
                height={plotHeight}
                timestep={metric.metric.timestep}
                scope={metric.metric.scope}
                series={metric.metric.series}
                statisticsSeries={metric.metric.statisticsSeries}
                metric={metric.name}
                timestep={metric.data.metric.timestep}
                scope={metric.data.scope}
                series={metric.data.metric.series}
                statisticsSeries={metric.data.metric.statisticsSeries}
                metric={metric.data.name}
                cluster={cluster}
                subCluster={job.subCluster} />
        {:else if metric.removed == true && metric.data == null}
            <Card body color="info">Metric disabled for subcluster '{ job.subCluster }'</Card>
        {:else}
            <Card body color="warning">Missing Data</Card>
        {/if}

@ -18,15 +18,17 @@
|
||||
import { onMount } from 'svelte'
|
||||
|
||||
export let data
|
||||
export let width
|
||||
export let height
|
||||
export let width = 500
|
||||
export let height = 300
|
||||
export let xlabel = ''
|
||||
export let ylabel = ''
|
||||
export let min = null
|
||||
export let max = null
|
||||
export let label = formatNumber
|
||||
|
||||
const fontSize = 12
|
||||
const fontFamily = 'system-ui, -apple-system, "Segoe UI", Roboto, "Helvetica Neue", Arial, "Noto Sans", sans-serif, "Apple Color Emoji", "Segoe UI Emoji", "Segoe UI Symbol", "Noto Color Emoji"'
|
||||
const paddingLeft = 35, paddingRight = 20, paddingTop = 20, paddingBottom = 20
|
||||
const paddingLeft = 50, paddingRight = 20, paddingTop = 20, paddingBottom = 20
|
||||
|
||||
let ctx, canvasElement
|
||||
|
||||
@ -72,9 +74,11 @@
|
||||
}
|
||||
|
||||
function render() {
|
||||
const h = height - paddingTop - paddingBottom
|
||||
const labelOffset = Math.floor(height * 0.1)
|
||||
const h = height - paddingTop - paddingBottom - labelOffset
|
||||
const w = width - paddingLeft - paddingRight
|
||||
const barWidth = Math.ceil(w / (maxValue + 1))
|
||||
const barGap = 5
|
||||
const barWidth = Math.ceil(w / (maxValue + 1)) - barGap
|
||||
|
||||
if (Number.isNaN(barWidth))
|
||||
return
|
||||
@ -83,9 +87,14 @@
        const getCanvasY = (count) => (h - (count / maxCount) * h) + paddingTop

        // X Axis
        ctx.font = `${fontSize}px ${fontFamily}`
        ctx.font = `bold ${fontSize}px ${fontFamily}`
        ctx.fillStyle = 'black'
        if (xlabel != '') {
            let textWidth = ctx.measureText(xlabel).width
            ctx.fillText(xlabel, Math.floor((width / 2) - (textWidth / 2) + barGap), height - Math.floor(labelOffset / 2))
        }
        ctx.textAlign = 'center'
        ctx.font = `${fontSize}px ${fontFamily}`
        if (min != null && max != null) {
            const stepsizeX = getStepSize(max - min, w, 75)
            let startX = 0
@ -94,19 +103,28 @@

            for (let x = startX; x < max; x += stepsizeX) {
                let px = ((x - min) / (max - min)) * (w - barWidth) + paddingLeft + (barWidth / 2.)
                ctx.fillText(`${formatNumber(x)}`, px, height - paddingBottom + 15)
                ctx.fillText(`${formatNumber(x)}`, px, height - paddingBottom - Math.floor(labelOffset / 2))
            }
        } else {
            const stepsizeX = getStepSize(maxValue, w, 120)
            for (let x = 0; x <= maxValue; x += stepsizeX) {
                ctx.fillText(label(x), getCanvasX(x), height - paddingBottom + 15)
                ctx.fillText(label(x), getCanvasX(x), height - paddingBottom - Math.floor(labelOffset / 2))
            }
        }

        // Y Axis
        ctx.fillStyle = 'black'
        ctx.strokeStyle = '#bbbbbb'
        ctx.font = `bold ${fontSize}px ${fontFamily}`
        if (ylabel != '') {
            ctx.save()
            ctx.translate(15, Math.floor(h / 2))
            ctx.rotate(-Math.PI / 2)
            ctx.fillText(ylabel, 0, 0)
            ctx.restore()
        }
        ctx.textAlign = 'right'
        ctx.font = `${fontSize}px ${fontFamily}`
        ctx.beginPath()
        const stepsizeY = getStepSize(maxCount, h, 50)
        for (let y = stepsizeY; y <= maxCount; y += stepsizeY) {
@ -118,7 +136,7 @@
        ctx.stroke()

        // Draw bars
        ctx.fillStyle = '#0066cc'
        ctx.fillStyle = '#85abce'
        for (let p of data) {
            ctx.fillRect(
                getCanvasX(p.value) - (barWidth / 2.),
@ -130,10 +148,10 @@
        // Fat lines left and below plotting area
        ctx.strokeStyle = 'black'
        ctx.beginPath()
        ctx.moveTo(0, height - paddingBottom)
        ctx.lineTo(width, height - paddingBottom)
        ctx.moveTo(0, height - paddingBottom - labelOffset)
        ctx.lineTo(width, height - paddingBottom - labelOffset)
        ctx.moveTo(paddingLeft, 0)
        ctx.lineTo(paddingLeft, height - paddingBottom)
        ctx.lineTo(paddingLeft, height - Math.floor(labelOffset / 2))
        ctx.stroke()
    }
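getStepSize is called above with (value range, pixel range, minimum pixels per tick) arguments. Its body is not part of this diff; the following is only a plausible sketch under that assumed contract, rounding the raw step up to a "nice" 1/2/5 × 10^k value:

    // Sketch only, assuming the (range, pixels, minSpace) contract used above.
    function getStepSize(valueRange, pixelRange, minSpacePerTick) {
        const maxTicks = Math.max(1, Math.floor(pixelRange / minSpacePerTick))
        const rawStep = valueRange / maxTicks
        const magnitude = Math.pow(10, Math.floor(Math.log10(rawStep)))
        for (const f of [1, 2, 5, 10])
            if (f * magnitude >= rawStep)
                return f * magnitude
    }

    console.log(getStepSize(1000, 430, 120)) // 500 (at most 3 ticks fit)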
@ -18,7 +18,7 @@
    let ctx, canvasElement

    const labels = metrics.filter(name => {
        if (!jobMetrics.find(m => m.name == name && m.metric.scope == "node")) {
        if (!jobMetrics.find(m => m.name == name && m.scope == "node")) {
            console.warn(`PolarPlot: No metric data for '${name}'`)
            return false
        }
@ -27,7 +27,7 @@

    const getValuesForStat = (getStat) => labels.map(name => {
        const peak = metricConfig(cluster, name).peak
        const metric = jobMetrics.find(m => m.name == name && m.metric.scope == "node")
        const metric = jobMetrics.find(m => m.name == name && m.scope == "node")
        const value = getStat(metric.metric) / peak
        return value <= 1. ? value : 1.
    })
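getValuesForStat normalizes one statistic per metric against its configured peak, clamped to 1. A hedged usage sketch; these particular getters are assumptions, not the component's actual choices, but they match the Series/MetricStatistics schema:

    // Assumed example getters: read one statistic out of a JobMetric object.
    const getAvg = (jobMetric) => jobMetric.series.reduce((sum, s) => sum + s.statistics.avg, 0) / jobMetric.series.length
    const getMax = (jobMetric) => Math.max(...jobMetric.series.map(s => s.statistics.max))

    const avgValues = getValuesForStat(getAvg) // one value in [0, 1] per label
    const maxValues = getValuesForStat(getMax)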
@ -4,7 +4,8 @@

<script context="module">
    const axesColor = '#aaaaaa'
    const fontSize = 12
    const tickFontSize = 10
    const labelFontSize = 12
    const fontFamily = 'system-ui, -apple-system, "Segoe UI", Roboto, "Helvetica Neue", Arial, "Noto Sans", sans-serif, "Apple Color Emoji", "Segoe UI Emoji", "Segoe UI Symbol", "Noto Color Emoji"'
    const paddingLeft = 40,
        paddingRight = 10,
@ -67,11 +68,11 @@
        return 2
    }

    function render(ctx, data, cluster, width, height, colorDots, defaultMaxY) {
    function render(ctx, data, cluster, width, height, colorDots, showTime, defaultMaxY) {
        if (width <= 0)
            return

        const [minX, maxX, minY, maxY] = [0.01, 1000, 1., cluster?.flopRateSimd || defaultMaxY]
        const [minX, maxX, minY, maxY] = [0.01, 1000, 1., cluster?.flopRateSimd?.value || defaultMaxY]
        const w = width - paddingLeft - paddingRight
        const h = height - paddingTop - paddingBottom

@ -95,7 +96,7 @@
        // Axes
        ctx.fillStyle = 'black'
        ctx.strokeStyle = axesColor
        ctx.font = `${fontSize}px ${fontFamily}`
        ctx.font = `${tickFontSize}px ${fontFamily}`
        ctx.beginPath()
        for (let x = minX, i = 0; x <= maxX; i++) {
            let px = getCanvasX(x)
@ -103,18 +104,20 @@
            let textWidth = ctx.measureText(text).width
            ctx.fillText(text,
                Math.floor(px - (textWidth / 2)),
                height - paddingBottom + fontSize + 5)
                height - paddingBottom + tickFontSize + 5)
            ctx.moveTo(px, paddingTop - 5)
            ctx.lineTo(px, height - paddingBottom + 5)

            x *= axisStepFactor(i, w)
        }
        if (data.xLabel) {
            ctx.font = `${labelFontSize}px ${fontFamily}`
            let textWidth = ctx.measureText(data.xLabel).width
            ctx.fillText(data.xLabel, Math.floor((width / 2) - (textWidth / 2)), height - 20)
        }

        ctx.textAlign = 'center'
        ctx.font = `${tickFontSize}px ${fontFamily}`
        for (let y = minY, i = 0; y <= maxY; i++) {
            let py = getCanvasY(y)
            ctx.moveTo(paddingLeft - 5, py)
@ -129,6 +132,7 @@
            y *= axisStepFactor(i)
        }
        if (data.yLabel) {
            ctx.font = `${labelFontSize}px ${fontFamily}`
            ctx.save()
            ctx.translate(15, Math.floor(height / 2))
            ctx.rotate(-Math.PI / 2)
@ -185,13 +189,13 @@
        ctx.lineWidth = 2
        ctx.beginPath()
        if (cluster != null) {
            const ycut = 0.01 * cluster.memoryBandwidth
            const scalarKnee = (cluster.flopRateScalar - ycut) / cluster.memoryBandwidth
            const simdKnee = (cluster.flopRateSimd - ycut) / cluster.memoryBandwidth
            const ycut = 0.01 * cluster.memoryBandwidth.value
            const scalarKnee = (cluster.flopRateScalar.value - ycut) / cluster.memoryBandwidth.value
            const simdKnee = (cluster.flopRateSimd.value - ycut) / cluster.memoryBandwidth.value
            const scalarKneeX = getCanvasX(scalarKnee),
                simdKneeX = getCanvasX(simdKnee),
                flopRateScalarY = getCanvasY(cluster.flopRateScalar),
                flopRateSimdY = getCanvasY(cluster.flopRateSimd)
                flopRateScalarY = getCanvasY(cluster.flopRateScalar.value),
                flopRateSimdY = getCanvasY(cluster.flopRateSimd.value)

            if (scalarKneeX < width - paddingRight) {
                ctx.moveTo(scalarKneeX, flopRateScalarY)
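The knee is the arithmetic intensity at which the memory-bandwidth ceiling meets a flop-rate ceiling, and ycut is where the bandwidth line enters the visible plot. A worked example with invented MetricValue numbers (GF/s and GB/s, so intensities come out in FLOPS/byte):

    // Invented sample values, for illustration only.
    const cluster = {
        flopRateScalar:  { unit: { base: 'F/s', prefix: 'G' }, value: 48 },
        flopRateSimd:    { unit: { base: 'F/s', prefix: 'G' }, value: 672 },
        memoryBandwidth: { unit: { base: 'B/s', prefix: 'G' }, value: 128 },
    }

    const ycut = 0.01 * cluster.memoryBandwidth.value                                        // 1.28
    const scalarKnee = (cluster.flopRateScalar.value - ycut) / cluster.memoryBandwidth.value // ≈ 0.37 FLOPS/byte
    const simdKnee = (cluster.flopRateSimd.value - ycut) / cluster.memoryBandwidth.value     // ≈ 5.24 FLOPS/byte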
@ -222,8 +226,8 @@
        }
        ctx.stroke()

        if (colorDots && data.x && data.y) {
            // The Color Scale
        if (colorDots && showTime && data.x && data.y) {
            // The Color Scale For Time Information
            ctx.fillStyle = 'black'
            ctx.fillText('Time:', 17, height - 5)
            const start = paddingLeft + 5
@ -237,7 +241,7 @@
        }
    }

    function transformData(flopsAny, memBw, colorDots) {
    function transformData(flopsAny, memBw, colorDots) { // Uses Metric Object
        const nodes = flopsAny.series.length
        const timesteps = flopsAny.series[0].data.length

@ -308,17 +312,18 @@
    export let memBw = null
    export let cluster = null
    export let maxY = null
    export let width
    export let height
    export let width = 500
    export let height = 300
    export let tiles = null
    export let colorDots = true
    export let showTime = true
    export let data = null

    console.assert(data || tiles || (flopsAny && memBw), "you must provide flopsAny and memBw or tiles!")

    let ctx, canvasElement, prevWidth = width, prevHeight = height
    data = data != null ? data : (flopsAny && memBw
        ? transformData(flopsAny.metric, memBw.metric, colorDots)
        ? transformData(flopsAny, memBw, colorDots) // Use Metric Object from Parent
        : {
            tiles: tiles,
            xLabel: 'Intensity [FLOPS/byte]',
@ -334,7 +339,7 @@

        canvasElement.width = width
        canvasElement.height = height
        render(ctx, data, cluster, width, height, colorDots, maxY)
        render(ctx, data, cluster, width, height, colorDots, showTime, maxY)
    })

    let timeoutId = null
@ -354,7 +359,7 @@
        timeoutId = null
        canvasElement.width = width
        canvasElement.height = height
        render(ctx, data, cluster, width, height, colorDots, maxY)
        render(ctx, data, cluster, width, height, colorDots, showTime, maxY)
    }, 250)
}
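The resize path above re-renders at most once per 250 ms by holding a timeout handle. A minimal standalone sketch of the same debounce pattern; the names here are generic placeholders, not taken from the component:

    // Generic debounce sketch: collapse bursts of resize events into one render.
    let timeoutId = null
    function onSizeChange(render) {
        if (timeoutId != null)
            clearTimeout(timeoutId)
        timeoutId = setTimeout(() => {
            timeoutId = null
            render()
        }, 250)
    }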
@ -37,11 +37,11 @@ export function init(extraInitQuery = '') {
    clusters {
        name,
        metricConfig {
            name, unit, peak,
            name, unit { base, prefix }, peak,
            normal, caution, alert,
            timestep, scope,
            aggregation,
            subClusters { name, peak, normal, caution, alert }
            subClusters { name, peak, normal, caution, alert, remove }
        }
        partitions
        subClusters {
@ -49,9 +49,9 @@ export function init(extraInitQuery = '') {
            socketsPerNode
            coresPerSocket
            threadsPerCore
            flopRateScalar
            flopRateSimd
            memoryBandwidth
            flopRateScalar { unit { base, prefix }, value }
            flopRateSimd { unit { base, prefix }, value }
            memoryBandwidth { unit { base, prefix }, value }
            numberOfNodes
            topology {
                node, socket, core
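With flopRateScalar, flopRateSimd, and memoryBandwidth now queried as MetricValue objects instead of bare integers, display code has to assemble the unit string from base and optional prefix. A small sketch; the helper name formatMetricValue is hypothetical:

    // Hypothetical helper: render a MetricValue {unit: {base, prefix}, value} for display.
    function formatMetricValue(mv) {
        const prefix = mv.unit.prefix ?? ''
        return `${mv.value} ${prefix}${mv.unit.base}`
    }

    console.log(formatMetricValue({ unit: { base: 'F/s', prefix: 'G' }, value: 672 })) // "672 GF/s"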