mirror of
https://github.com/ClusterCockpit/cc-backend
synced 2024-11-14 02:37:25 +01:00
Merge branch 'Refactor-job-footprint' into sample_resolution_select
This commit is contained in:
commit
6ab2e02fe6
@ -38,6 +38,15 @@ var (
|
|||||||
apiHandle *api.RestApi
|
apiHandle *api.RestApi
|
||||||
)
|
)
|
||||||
|
|
||||||
|
func onFailureResponse(rw http.ResponseWriter, r *http.Request, err error) {
|
||||||
|
rw.Header().Add("Content-Type", "application/json")
|
||||||
|
rw.WriteHeader(http.StatusUnauthorized)
|
||||||
|
json.NewEncoder(rw).Encode(map[string]string{
|
||||||
|
"status": http.StatusText(http.StatusUnauthorized),
|
||||||
|
"error": err.Error(),
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
func serverInit() {
|
func serverInit() {
|
||||||
// Setup the http.Handler/Router used by the server
|
// Setup the http.Handler/Router used by the server
|
||||||
graph.Init()
|
graph.Init()
|
||||||
@ -166,64 +175,32 @@ func serverInit() {
|
|||||||
return authHandle.AuthApi(
|
return authHandle.AuthApi(
|
||||||
// On success;
|
// On success;
|
||||||
next,
|
next,
|
||||||
|
|
||||||
// On failure: JSON Response
|
// On failure: JSON Response
|
||||||
func(rw http.ResponseWriter, r *http.Request, err error) {
|
onFailureResponse)
|
||||||
rw.Header().Add("Content-Type", "application/json")
|
|
||||||
rw.WriteHeader(http.StatusUnauthorized)
|
|
||||||
json.NewEncoder(rw).Encode(map[string]string{
|
|
||||||
"status": http.StatusText(http.StatusUnauthorized),
|
|
||||||
"error": err.Error(),
|
|
||||||
})
|
|
||||||
})
|
|
||||||
})
|
})
|
||||||
|
|
||||||
userapi.Use(func(next http.Handler) http.Handler {
|
userapi.Use(func(next http.Handler) http.Handler {
|
||||||
return authHandle.AuthUserApi(
|
return authHandle.AuthUserApi(
|
||||||
// On success;
|
// On success;
|
||||||
next,
|
next,
|
||||||
|
|
||||||
// On failure: JSON Response
|
// On failure: JSON Response
|
||||||
func(rw http.ResponseWriter, r *http.Request, err error) {
|
onFailureResponse)
|
||||||
rw.Header().Add("Content-Type", "application/json")
|
|
||||||
rw.WriteHeader(http.StatusUnauthorized)
|
|
||||||
json.NewEncoder(rw).Encode(map[string]string{
|
|
||||||
"status": http.StatusText(http.StatusUnauthorized),
|
|
||||||
"error": err.Error(),
|
|
||||||
})
|
|
||||||
})
|
|
||||||
})
|
})
|
||||||
|
|
||||||
configapi.Use(func(next http.Handler) http.Handler {
|
configapi.Use(func(next http.Handler) http.Handler {
|
||||||
return authHandle.AuthConfigApi(
|
return authHandle.AuthConfigApi(
|
||||||
// On success;
|
// On success;
|
||||||
next,
|
next,
|
||||||
|
|
||||||
// On failure: JSON Response
|
// On failure: JSON Response
|
||||||
func(rw http.ResponseWriter, r *http.Request, err error) {
|
onFailureResponse)
|
||||||
rw.Header().Add("Content-Type", "application/json")
|
|
||||||
rw.WriteHeader(http.StatusUnauthorized)
|
|
||||||
json.NewEncoder(rw).Encode(map[string]string{
|
|
||||||
"status": http.StatusText(http.StatusUnauthorized),
|
|
||||||
"error": err.Error(),
|
|
||||||
})
|
|
||||||
})
|
|
||||||
})
|
})
|
||||||
|
|
||||||
frontendapi.Use(func(next http.Handler) http.Handler {
|
frontendapi.Use(func(next http.Handler) http.Handler {
|
||||||
return authHandle.AuthFrontendApi(
|
return authHandle.AuthFrontendApi(
|
||||||
// On success;
|
// On success;
|
||||||
next,
|
next,
|
||||||
|
|
||||||
// On failure: JSON Response
|
// On failure: JSON Response
|
||||||
func(rw http.ResponseWriter, r *http.Request, err error) {
|
onFailureResponse)
|
||||||
rw.Header().Add("Content-Type", "application/json")
|
|
||||||
rw.WriteHeader(http.StatusUnauthorized)
|
|
||||||
json.NewEncoder(rw).Encode(map[string]string{
|
|
||||||
"status": http.StatusText(http.StatusUnauthorized),
|
|
||||||
"error": err.Error(),
|
|
||||||
})
|
|
||||||
})
|
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -307,6 +307,10 @@ func ArchiveJob(job *schema.Job, ctx context.Context) (*schema.JobMeta, error) {
|
|||||||
scopes = append(scopes, schema.MetricScopeCore)
|
scopes = append(scopes, schema.MetricScopeCore)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if job.NumAcc > 0 {
|
||||||
|
scopes = append(scopes, schema.MetricScopeAccelerator)
|
||||||
|
}
|
||||||
|
|
||||||
jobData, err := LoadData(job, allMetrics, scopes, ctx)
|
jobData, err := LoadData(job, allMetrics, scopes, ctx)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
log.Error("Error wile loading job data for archiving")
|
log.Error("Error wile loading job data for archiving")
|
||||||
|
@ -0,0 +1,21 @@
|
|||||||
|
ALTER TABLE job DROP energy;
|
||||||
|
ALTER TABLE job DROP energy_footprint;
|
||||||
|
ALTER TABLE job ADD COLUMN flops_any_avg;
|
||||||
|
ALTER TABLE job ADD COLUMN mem_bw_avg;
|
||||||
|
ALTER TABLE job ADD COLUMN mem_used_max;
|
||||||
|
ALTER TABLE job ADD COLUMN load_avg;
|
||||||
|
ALTER TABLE job ADD COLUMN net_bw_avg;
|
||||||
|
ALTER TABLE job ADD COLUMN net_data_vol_total;
|
||||||
|
ALTER TABLE job ADD COLUMN file_bw_avg;
|
||||||
|
ALTER TABLE job ADD COLUMN file_data_vol_total;
|
||||||
|
|
||||||
|
UPDATE job SET flops_any_avg = json_extract(footprint, '$.flops_any_avg');
|
||||||
|
UPDATE job SET mem_bw_avg = json_extract(footprint, '$.mem_bw_avg');
|
||||||
|
UPDATE job SET mem_used_max = json_extract(footprint, '$.mem_used_max');
|
||||||
|
UPDATE job SET load_avg = json_extract(footprint, '$.cpu_load_avg');
|
||||||
|
UPDATE job SET net_bw_avg = json_extract(footprint, '$.net_bw_avg');
|
||||||
|
UPDATE job SET net_data_vol_total = json_extract(footprint, '$.net_data_vol_total');
|
||||||
|
UPDATE job SET file_bw_avg = json_extract(footprint, '$.file_bw_avg');
|
||||||
|
UPDATE job SET file_data_vol_total = json_extract(footprint, '$.file_data_vol_total');
|
||||||
|
|
||||||
|
ALTER TABLE job DROP footprint;
|
@ -1,4 +1,5 @@
|
|||||||
ALTER TABLE job ADD COLUMN energy REAL NOT NULL DEFAULT 0.0;
|
ALTER TABLE job ADD COLUMN energy REAL NOT NULL DEFAULT 0.0;
|
||||||
|
ALTER TABLE job ADD COLUMN energy_footprint TEXT DEFAULT NULL;
|
||||||
|
|
||||||
ALTER TABLE job ADD COLUMN footprint TEXT DEFAULT NULL;
|
ALTER TABLE job ADD COLUMN footprint TEXT DEFAULT NULL;
|
||||||
UPDATE job SET footprint = '{"flops_any_avg": 0.0}';
|
UPDATE job SET footprint = '{"flops_any_avg": 0.0}';
|
||||||
@ -6,7 +7,16 @@ UPDATE job SET footprint = json_replace(footprint, '$.flops_any_avg', job.flops_
|
|||||||
UPDATE job SET footprint = json_insert(footprint, '$.mem_bw_avg', job.mem_bw_avg);
|
UPDATE job SET footprint = json_insert(footprint, '$.mem_bw_avg', job.mem_bw_avg);
|
||||||
UPDATE job SET footprint = json_insert(footprint, '$.mem_used_max', job.mem_used_max);
|
UPDATE job SET footprint = json_insert(footprint, '$.mem_used_max', job.mem_used_max);
|
||||||
UPDATE job SET footprint = json_insert(footprint, '$.cpu_load_avg', job.load_avg);
|
UPDATE job SET footprint = json_insert(footprint, '$.cpu_load_avg', job.load_avg);
|
||||||
|
UPDATE job SET footprint = json_insert(footprint, '$.net_bw_avg', job.net_bw_avg) WHERE job.net_bw_avg != 0;
|
||||||
|
UPDATE job SET footprint = json_insert(footprint, '$.net_data_vol_total', job.net_data_vol_total) WHERE job.net_data_vol_total != 0;
|
||||||
|
UPDATE job SET footprint = json_insert(footprint, '$.file_bw_avg', job.file_bw_avg) WHERE job.file_bw_avg != 0;
|
||||||
|
UPDATE job SET footprint = json_insert(footprint, '$.file_data_vol_total', job.file_data_vol_total) WHERE job.file_data_vol_total != 0;
|
||||||
|
|
||||||
ALTER TABLE job DROP flops_any_avg;
|
ALTER TABLE job DROP flops_any_avg;
|
||||||
ALTER TABLE job DROP mem_bw_avg;
|
ALTER TABLE job DROP mem_bw_avg;
|
||||||
ALTER TABLE job DROP mem_used_max;
|
ALTER TABLE job DROP mem_used_max;
|
||||||
ALTER TABLE job DROP load_avg;
|
ALTER TABLE job DROP load_avg;
|
||||||
|
ALTER TABLE job DROP net_bw_avg;
|
||||||
|
ALTER TABLE job DROP net_data_vol_total;
|
||||||
|
ALTER TABLE job DROP file_bw_avg;
|
||||||
|
ALTER TABLE job DROP file_data_vol_total;
|
||||||
|
@ -47,11 +47,11 @@ type SubCluster struct {
|
|||||||
|
|
||||||
type SubClusterConfig struct {
|
type SubClusterConfig struct {
|
||||||
Name string `json:"name"`
|
Name string `json:"name"`
|
||||||
|
Footprint string `json:"footprint,omitempty"`
|
||||||
Peak float64 `json:"peak"`
|
Peak float64 `json:"peak"`
|
||||||
Normal float64 `json:"normal"`
|
Normal float64 `json:"normal"`
|
||||||
Caution float64 `json:"caution"`
|
Caution float64 `json:"caution"`
|
||||||
Alert float64 `json:"alert"`
|
Alert float64 `json:"alert"`
|
||||||
Footprint string `json:"footprint,omitempty"`
|
|
||||||
Remove bool `json:"remove"`
|
Remove bool `json:"remove"`
|
||||||
LowerIsBetter bool `json:"lowerIsBetter"`
|
LowerIsBetter bool `json:"lowerIsBetter"`
|
||||||
Energy bool `json:"energy"`
|
Energy bool `json:"energy"`
|
||||||
@ -62,14 +62,14 @@ type MetricConfig struct {
|
|||||||
Name string `json:"name"`
|
Name string `json:"name"`
|
||||||
Scope MetricScope `json:"scope"`
|
Scope MetricScope `json:"scope"`
|
||||||
Aggregation string `json:"aggregation"`
|
Aggregation string `json:"aggregation"`
|
||||||
|
Footprint string `json:"footprint,omitempty"`
|
||||||
SubClusters []*SubClusterConfig `json:"subClusters,omitempty"`
|
SubClusters []*SubClusterConfig `json:"subClusters,omitempty"`
|
||||||
Timestep int `json:"timestep"`
|
|
||||||
Peak float64 `json:"peak"`
|
Peak float64 `json:"peak"`
|
||||||
Normal float64 `json:"normal"`
|
Normal float64 `json:"normal"`
|
||||||
Caution float64 `json:"caution"`
|
Caution float64 `json:"caution"`
|
||||||
Alert float64 `json:"alert"`
|
Alert float64 `json:"alert"`
|
||||||
|
Timestep int `json:"timestep"`
|
||||||
LowerIsBetter bool `json:"lowerIsBetter"`
|
LowerIsBetter bool `json:"lowerIsBetter"`
|
||||||
Footprint string `json:"footprint,omitempty"`
|
|
||||||
Energy bool `json:"energy"`
|
Energy bool `json:"energy"`
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -32,7 +32,7 @@ type BaseJob struct {
|
|||||||
Footprint map[string]float64 `json:"footprint"`
|
Footprint map[string]float64 `json:"footprint"`
|
||||||
MetaData map[string]string `json:"metaData"`
|
MetaData map[string]string `json:"metaData"`
|
||||||
ConcurrentJobs JobLinkResultList `json:"concurrentJobs"`
|
ConcurrentJobs JobLinkResultList `json:"concurrentJobs"`
|
||||||
Energy float64 `json:"energy"`
|
Energy float64 `json:"energy" db:"energy"`
|
||||||
ArrayJobId int64 `json:"arrayJobId,omitempty" db:"array_job_id" example:"123000"`
|
ArrayJobId int64 `json:"arrayJobId,omitempty" db:"array_job_id" example:"123000"`
|
||||||
Walltime int64 `json:"walltime,omitempty" db:"walltime" example:"86400" minimum:"1"`
|
Walltime int64 `json:"walltime,omitempty" db:"walltime" example:"86400" minimum:"1"`
|
||||||
JobID int64 `json:"jobId" db:"job_id" example:"123000"`
|
JobID int64 `json:"jobId" db:"job_id" example:"123000"`
|
||||||
|
@ -1,284 +1,319 @@
|
|||||||
{
|
{
|
||||||
"$schema": "http://json-schema.org/draft/2020-12/schema",
|
"$schema": "http://json-schema.org/draft/2020-12/schema",
|
||||||
"$id": "embedfs://cluster.schema.json",
|
"$id": "embedfs://cluster.schema.json",
|
||||||
"title": "HPC cluster description",
|
"title": "HPC cluster description",
|
||||||
"description": "Meta data information of a HPC cluster",
|
"description": "Meta data information of a HPC cluster",
|
||||||
"type": "object",
|
"type": "object",
|
||||||
"properties": {
|
"properties": {
|
||||||
"name": {
|
"name": {
|
||||||
"description": "The unique identifier of a cluster",
|
"description": "The unique identifier of a cluster",
|
||||||
"type": "string"
|
"type": "string"
|
||||||
},
|
|
||||||
"metricConfig": {
|
|
||||||
"description": "Metric specifications",
|
|
||||||
"type": "array",
|
|
||||||
"items": {
|
|
||||||
"type": "object",
|
|
||||||
"properties": {
|
|
||||||
"name": {
|
|
||||||
"description": "Metric name",
|
|
||||||
"type": "string"
|
|
||||||
},
|
|
||||||
"unit": {
|
|
||||||
"description": "Metric unit",
|
|
||||||
"$ref": "embedfs://unit.schema.json"
|
|
||||||
},
|
|
||||||
"scope": {
|
|
||||||
"description": "Native measurement resolution",
|
|
||||||
"type": "string"
|
|
||||||
},
|
|
||||||
"timestep": {
|
|
||||||
"description": "Frequency of timeseries points",
|
|
||||||
"type": "integer"
|
|
||||||
},
|
|
||||||
"aggregation": {
|
|
||||||
"description": "How the metric is aggregated",
|
|
||||||
"type": "string",
|
|
||||||
"enum": [
|
|
||||||
"sum",
|
|
||||||
"avg"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
"peak": {
|
|
||||||
"description": "Metric peak threshold (Upper metric limit)",
|
|
||||||
"type": "number"
|
|
||||||
},
|
|
||||||
"normal": {
|
|
||||||
"description": "Metric normal threshold",
|
|
||||||
"type": "number"
|
|
||||||
},
|
|
||||||
"caution": {
|
|
||||||
"description": "Metric caution threshold (Suspicious but does not require immediate action)",
|
|
||||||
"type": "number"
|
|
||||||
},
|
|
||||||
"alert": {
|
|
||||||
"description": "Metric alert threshold (Requires immediate action)",
|
|
||||||
"type": "number"
|
|
||||||
},
|
|
||||||
"subClusters": {
|
|
||||||
"description": "Array of cluster hardware partition metric thresholds",
|
|
||||||
"type": "array",
|
|
||||||
"items": {
|
|
||||||
"type": "object",
|
|
||||||
"properties": {
|
|
||||||
"name": {
|
|
||||||
"description": "Hardware partition name",
|
|
||||||
"type": "string"
|
|
||||||
},
|
|
||||||
"peak": {
|
|
||||||
"type": "number"
|
|
||||||
},
|
|
||||||
"normal": {
|
|
||||||
"type": "number"
|
|
||||||
},
|
|
||||||
"caution": {
|
|
||||||
"type": "number"
|
|
||||||
},
|
|
||||||
"alert": {
|
|
||||||
"type": "number"
|
|
||||||
},
|
|
||||||
"remove": {
|
|
||||||
"type": "boolean"
|
|
||||||
}
|
|
||||||
},
|
|
||||||
"required": [
|
|
||||||
"name"
|
|
||||||
]
|
|
||||||
}
|
|
||||||
}
|
|
||||||
},
|
|
||||||
"required": [
|
|
||||||
"name",
|
|
||||||
"unit",
|
|
||||||
"scope",
|
|
||||||
"timestep",
|
|
||||||
"aggregation",
|
|
||||||
"peak",
|
|
||||||
"normal",
|
|
||||||
"caution",
|
|
||||||
"alert"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
"minItems": 1
|
|
||||||
},
|
|
||||||
"subClusters": {
|
|
||||||
"description": "Array of cluster hardware partitions",
|
|
||||||
"type": "array",
|
|
||||||
"items": {
|
|
||||||
"type": "object",
|
|
||||||
"properties": {
|
|
||||||
"name": {
|
|
||||||
"description": "Hardware partition name",
|
|
||||||
"type": "string"
|
|
||||||
},
|
|
||||||
"processorType": {
|
|
||||||
"description": "Processor type",
|
|
||||||
"type": "string"
|
|
||||||
},
|
|
||||||
"socketsPerNode": {
|
|
||||||
"description": "Number of sockets per node",
|
|
||||||
"type": "integer"
|
|
||||||
},
|
|
||||||
"coresPerSocket": {
|
|
||||||
"description": "Number of cores per socket",
|
|
||||||
"type": "integer"
|
|
||||||
},
|
|
||||||
"threadsPerCore": {
|
|
||||||
"description": "Number of SMT threads per core",
|
|
||||||
"type": "integer"
|
|
||||||
},
|
|
||||||
"flopRateScalar": {
|
|
||||||
"description": "Theoretical node peak flop rate for scalar code in GFlops/s",
|
|
||||||
"type": "object",
|
|
||||||
"properties": {
|
|
||||||
"unit": {
|
|
||||||
"description": "Metric unit",
|
|
||||||
"$ref": "embedfs://unit.schema.json"
|
|
||||||
},
|
|
||||||
"value": {
|
|
||||||
"type": "number"
|
|
||||||
}
|
|
||||||
}
|
|
||||||
},
|
|
||||||
"flopRateSimd": {
|
|
||||||
"description": "Theoretical node peak flop rate for SIMD code in GFlops/s",
|
|
||||||
"type": "object",
|
|
||||||
"properties": {
|
|
||||||
"unit": {
|
|
||||||
"description": "Metric unit",
|
|
||||||
"$ref": "embedfs://unit.schema.json"
|
|
||||||
},
|
|
||||||
"value": {
|
|
||||||
"type": "number"
|
|
||||||
}
|
|
||||||
}
|
|
||||||
},
|
|
||||||
"memoryBandwidth": {
|
|
||||||
"description": "Theoretical node peak memory bandwidth in GB/s",
|
|
||||||
"type": "object",
|
|
||||||
"properties": {
|
|
||||||
"unit": {
|
|
||||||
"description": "Metric unit",
|
|
||||||
"$ref": "embedfs://unit.schema.json"
|
|
||||||
},
|
|
||||||
"value": {
|
|
||||||
"type": "number"
|
|
||||||
}
|
|
||||||
}
|
|
||||||
},
|
|
||||||
"nodes": {
|
|
||||||
"description": "Node list expression",
|
|
||||||
"type": "string"
|
|
||||||
},
|
|
||||||
"topology": {
|
|
||||||
"description": "Node topology",
|
|
||||||
"type": "object",
|
|
||||||
"properties": {
|
|
||||||
"node": {
|
|
||||||
"description": "HwTread lists of node",
|
|
||||||
"type": "array",
|
|
||||||
"items": {
|
|
||||||
"type": "integer"
|
|
||||||
}
|
|
||||||
},
|
|
||||||
"socket": {
|
|
||||||
"description": "HwTread lists of sockets",
|
|
||||||
"type": "array",
|
|
||||||
"items": {
|
|
||||||
"type": "array",
|
|
||||||
"items": {
|
|
||||||
"type": "integer"
|
|
||||||
}
|
|
||||||
}
|
|
||||||
},
|
|
||||||
"memoryDomain": {
|
|
||||||
"description": "HwTread lists of memory domains",
|
|
||||||
"type": "array",
|
|
||||||
"items": {
|
|
||||||
"type": "array",
|
|
||||||
"items": {
|
|
||||||
"type": "integer"
|
|
||||||
}
|
|
||||||
}
|
|
||||||
},
|
|
||||||
"die": {
|
|
||||||
"description": "HwTread lists of dies",
|
|
||||||
"type": "array",
|
|
||||||
"items": {
|
|
||||||
"type": "array",
|
|
||||||
"items": {
|
|
||||||
"type": "integer"
|
|
||||||
}
|
|
||||||
}
|
|
||||||
},
|
|
||||||
"core": {
|
|
||||||
"description": "HwTread lists of cores",
|
|
||||||
"type": "array",
|
|
||||||
"items": {
|
|
||||||
"type": "array",
|
|
||||||
"items": {
|
|
||||||
"type": "integer"
|
|
||||||
}
|
|
||||||
}
|
|
||||||
},
|
|
||||||
"accelerators": {
|
|
||||||
"type": "array",
|
|
||||||
"description": "List of of accelerator devices",
|
|
||||||
"items": {
|
|
||||||
"type": "object",
|
|
||||||
"properties": {
|
|
||||||
"id": {
|
|
||||||
"type": "string",
|
|
||||||
"description": "The unique device id"
|
|
||||||
},
|
|
||||||
"type": {
|
|
||||||
"type": "string",
|
|
||||||
"description": "The accelerator type",
|
|
||||||
"enum": [
|
|
||||||
"Nvidia GPU",
|
|
||||||
"AMD GPU",
|
|
||||||
"Intel GPU"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
"model": {
|
|
||||||
"type": "string",
|
|
||||||
"description": "The accelerator model"
|
|
||||||
}
|
|
||||||
},
|
|
||||||
"required": [
|
|
||||||
"id",
|
|
||||||
"type",
|
|
||||||
"model"
|
|
||||||
]
|
|
||||||
}
|
|
||||||
}
|
|
||||||
},
|
|
||||||
"required": [
|
|
||||||
"node",
|
|
||||||
"socket",
|
|
||||||
"memoryDomain"
|
|
||||||
]
|
|
||||||
}
|
|
||||||
},
|
|
||||||
"required": [
|
|
||||||
"name",
|
|
||||||
"nodes",
|
|
||||||
"topology",
|
|
||||||
"processorType",
|
|
||||||
"socketsPerNode",
|
|
||||||
"coresPerSocket",
|
|
||||||
"threadsPerCore",
|
|
||||||
"flopRateScalar",
|
|
||||||
"flopRateSimd",
|
|
||||||
"memoryBandwidth"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
"minItems": 1
|
|
||||||
}
|
|
||||||
},
|
},
|
||||||
"required": [
|
"metricConfig": {
|
||||||
"name",
|
"description": "Metric specifications",
|
||||||
"metricConfig",
|
"type": "array",
|
||||||
"subClusters"
|
"items": {
|
||||||
]
|
"type": "object",
|
||||||
|
"properties": {
|
||||||
|
"name": {
|
||||||
|
"description": "Metric name",
|
||||||
|
"type": "string"
|
||||||
|
},
|
||||||
|
"unit": {
|
||||||
|
"description": "Metric unit",
|
||||||
|
"$ref": "embedfs://unit.schema.json"
|
||||||
|
},
|
||||||
|
"scope": {
|
||||||
|
"description": "Native measurement resolution",
|
||||||
|
"type": "string"
|
||||||
|
},
|
||||||
|
"timestep": {
|
||||||
|
"description": "Frequency of timeseries points",
|
||||||
|
"type": "integer"
|
||||||
|
},
|
||||||
|
"aggregation": {
|
||||||
|
"description": "How the metric is aggregated",
|
||||||
|
"type": "string",
|
||||||
|
"enum": [
|
||||||
|
"sum",
|
||||||
|
"avg"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"footprint": {
|
||||||
|
"description": "Is it a footprint metric and what type",
|
||||||
|
"type": "string",
|
||||||
|
"enum": [
|
||||||
|
"avg",
|
||||||
|
"max",
|
||||||
|
"min"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"energy": {
|
||||||
|
"description": "Is it used to calculate job energy",
|
||||||
|
"type": "boolean"
|
||||||
|
},
|
||||||
|
"lowerIsBetter": {
|
||||||
|
"description": "Is lower better.",
|
||||||
|
"type": "boolean"
|
||||||
|
},
|
||||||
|
"peak": {
|
||||||
|
"description": "Metric peak threshold (Upper metric limit)",
|
||||||
|
"type": "number"
|
||||||
|
},
|
||||||
|
"normal": {
|
||||||
|
"description": "Metric normal threshold",
|
||||||
|
"type": "number"
|
||||||
|
},
|
||||||
|
"caution": {
|
||||||
|
"description": "Metric caution threshold (Suspicious but does not require immediate action)",
|
||||||
|
"type": "number"
|
||||||
|
},
|
||||||
|
"alert": {
|
||||||
|
"description": "Metric alert threshold (Requires immediate action)",
|
||||||
|
"type": "number"
|
||||||
|
},
|
||||||
|
"subClusters": {
|
||||||
|
"description": "Array of cluster hardware partition metric thresholds",
|
||||||
|
"type": "array",
|
||||||
|
"items": {
|
||||||
|
"type": "object",
|
||||||
|
"properties": {
|
||||||
|
"name": {
|
||||||
|
"description": "Hardware partition name",
|
||||||
|
"type": "string"
|
||||||
|
},
|
||||||
|
"footprint": {
|
||||||
|
"description": "Is it a footprint metric and what type. Overwrite global setting",
|
||||||
|
"type": "string",
|
||||||
|
"enum": [
|
||||||
|
"avg",
|
||||||
|
"max",
|
||||||
|
"min"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"energy": {
|
||||||
|
"description": "Is it used to calculate job energy. Overwrite global",
|
||||||
|
"type": "boolean"
|
||||||
|
},
|
||||||
|
"lowerIsBetter": {
|
||||||
|
"description": "Is lower better. Overwrite global",
|
||||||
|
"type": "boolean"
|
||||||
|
},
|
||||||
|
"peak": {
|
||||||
|
"type": "number"
|
||||||
|
},
|
||||||
|
"normal": {
|
||||||
|
"type": "number"
|
||||||
|
},
|
||||||
|
"caution": {
|
||||||
|
"type": "number"
|
||||||
|
},
|
||||||
|
"alert": {
|
||||||
|
"type": "number"
|
||||||
|
},
|
||||||
|
"remove": {
|
||||||
|
"description": "Remove this metric for this subcluster",
|
||||||
|
"type": "boolean"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"required": [
|
||||||
|
"name"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"required": [
|
||||||
|
"name",
|
||||||
|
"unit",
|
||||||
|
"scope",
|
||||||
|
"timestep",
|
||||||
|
"aggregation",
|
||||||
|
"peak",
|
||||||
|
"normal",
|
||||||
|
"caution",
|
||||||
|
"alert"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"minItems": 1
|
||||||
|
},
|
||||||
|
"subClusters": {
|
||||||
|
"description": "Array of cluster hardware partitions",
|
||||||
|
"type": "array",
|
||||||
|
"items": {
|
||||||
|
"type": "object",
|
||||||
|
"properties": {
|
||||||
|
"name": {
|
||||||
|
"description": "Hardware partition name",
|
||||||
|
"type": "string"
|
||||||
|
},
|
||||||
|
"processorType": {
|
||||||
|
"description": "Processor type",
|
||||||
|
"type": "string"
|
||||||
|
},
|
||||||
|
"socketsPerNode": {
|
||||||
|
"description": "Number of sockets per node",
|
||||||
|
"type": "integer"
|
||||||
|
},
|
||||||
|
"coresPerSocket": {
|
||||||
|
"description": "Number of cores per socket",
|
||||||
|
"type": "integer"
|
||||||
|
},
|
||||||
|
"threadsPerCore": {
|
||||||
|
"description": "Number of SMT threads per core",
|
||||||
|
"type": "integer"
|
||||||
|
},
|
||||||
|
"flopRateScalar": {
|
||||||
|
"description": "Theoretical node peak flop rate for scalar code in GFlops/s",
|
||||||
|
"type": "object",
|
||||||
|
"properties": {
|
||||||
|
"unit": {
|
||||||
|
"description": "Metric unit",
|
||||||
|
"$ref": "embedfs://unit.schema.json"
|
||||||
|
},
|
||||||
|
"value": {
|
||||||
|
"type": "number"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"flopRateSimd": {
|
||||||
|
"description": "Theoretical node peak flop rate for SIMD code in GFlops/s",
|
||||||
|
"type": "object",
|
||||||
|
"properties": {
|
||||||
|
"unit": {
|
||||||
|
"description": "Metric unit",
|
||||||
|
"$ref": "embedfs://unit.schema.json"
|
||||||
|
},
|
||||||
|
"value": {
|
||||||
|
"type": "number"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"memoryBandwidth": {
|
||||||
|
"description": "Theoretical node peak memory bandwidth in GB/s",
|
||||||
|
"type": "object",
|
||||||
|
"properties": {
|
||||||
|
"unit": {
|
||||||
|
"description": "Metric unit",
|
||||||
|
"$ref": "embedfs://unit.schema.json"
|
||||||
|
},
|
||||||
|
"value": {
|
||||||
|
"type": "number"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"nodes": {
|
||||||
|
"description": "Node list expression",
|
||||||
|
"type": "string"
|
||||||
|
},
|
||||||
|
"topology": {
|
||||||
|
"description": "Node topology",
|
||||||
|
"type": "object",
|
||||||
|
"properties": {
|
||||||
|
"node": {
|
||||||
|
"description": "HwTread lists of node",
|
||||||
|
"type": "array",
|
||||||
|
"items": {
|
||||||
|
"type": "integer"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"socket": {
|
||||||
|
"description": "HwTread lists of sockets",
|
||||||
|
"type": "array",
|
||||||
|
"items": {
|
||||||
|
"type": "array",
|
||||||
|
"items": {
|
||||||
|
"type": "integer"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"memoryDomain": {
|
||||||
|
"description": "HwTread lists of memory domains",
|
||||||
|
"type": "array",
|
||||||
|
"items": {
|
||||||
|
"type": "array",
|
||||||
|
"items": {
|
||||||
|
"type": "integer"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"die": {
|
||||||
|
"description": "HwTread lists of dies",
|
||||||
|
"type": "array",
|
||||||
|
"items": {
|
||||||
|
"type": "array",
|
||||||
|
"items": {
|
||||||
|
"type": "integer"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"core": {
|
||||||
|
"description": "HwTread lists of cores",
|
||||||
|
"type": "array",
|
||||||
|
"items": {
|
||||||
|
"type": "array",
|
||||||
|
"items": {
|
||||||
|
"type": "integer"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"accelerators": {
|
||||||
|
"type": "array",
|
||||||
|
"description": "List of of accelerator devices",
|
||||||
|
"items": {
|
||||||
|
"type": "object",
|
||||||
|
"properties": {
|
||||||
|
"id": {
|
||||||
|
"type": "string",
|
||||||
|
"description": "The unique device id"
|
||||||
|
},
|
||||||
|
"type": {
|
||||||
|
"type": "string",
|
||||||
|
"description": "The accelerator type",
|
||||||
|
"enum": [
|
||||||
|
"Nvidia GPU",
|
||||||
|
"AMD GPU",
|
||||||
|
"Intel GPU"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"model": {
|
||||||
|
"type": "string",
|
||||||
|
"description": "The accelerator model"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"required": [
|
||||||
|
"id",
|
||||||
|
"type",
|
||||||
|
"model"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"required": [
|
||||||
|
"node",
|
||||||
|
"socket",
|
||||||
|
"memoryDomain"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"required": [
|
||||||
|
"name",
|
||||||
|
"nodes",
|
||||||
|
"topology",
|
||||||
|
"processorType",
|
||||||
|
"socketsPerNode",
|
||||||
|
"coresPerSocket",
|
||||||
|
"threadsPerCore",
|
||||||
|
"flopRateScalar",
|
||||||
|
"flopRateSimd",
|
||||||
|
"memoryBandwidth"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"minItems": 1
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"required": [
|
||||||
|
"name",
|
||||||
|
"metricConfig",
|
||||||
|
"subClusters"
|
||||||
|
]
|
||||||
}
|
}
|
||||||
|
@ -67,62 +67,74 @@
|
|||||||
export let height = "310px";
|
export let height = "310px";
|
||||||
|
|
||||||
const footprintData = job?.footprint?.map((jf) => {
|
const footprintData = job?.footprint?.map((jf) => {
|
||||||
// Unit
|
|
||||||
const fmc = getContext("getMetricConfig")(job.cluster, job.subCluster, jf.name);
|
const fmc = getContext("getMetricConfig")(job.cluster, job.subCluster, jf.name);
|
||||||
const unit = (fmc?.unit?.prefix ? fmc.unit.prefix : "") + (fmc?.unit?.base ? fmc.unit.base : "")
|
if (fmc) {
|
||||||
|
// Unit
|
||||||
|
const unit = (fmc?.unit?.prefix ? fmc.unit.prefix : "") + (fmc?.unit?.base ? fmc.unit.base : "")
|
||||||
|
|
||||||
// Threshold / -Differences
|
// Threshold / -Differences
|
||||||
const fmt = findJobThresholds(job, fmc);
|
const fmt = findJobThresholds(job, fmc);
|
||||||
if (jf.name === "flops_any") fmt.peak = round(fmt.peak * 0.85, 0);
|
if (jf.name === "flops_any") fmt.peak = round(fmt.peak * 0.85, 0);
|
||||||
|
|
||||||
// Define basic data -> Value: Use as Provided
|
// Define basic data -> Value: Use as Provided
|
||||||
const fmBase = {
|
const fmBase = {
|
||||||
name: jf.name + ' (' + jf.stat + ')',
|
name: jf.name + ' (' + jf.stat + ')',
|
||||||
avg: jf.value,
|
avg: jf.value,
|
||||||
unit: unit,
|
unit: unit,
|
||||||
max: fmt.peak,
|
max: fmt.peak,
|
||||||
dir: fmc.lowerIsBetter
|
dir: fmc.lowerIsBetter
|
||||||
};
|
};
|
||||||
|
|
||||||
if (evalFootprint(jf.value, fmt, fmc.lowerIsBetter, "alert")) {
|
if (evalFootprint(jf.value, fmt, fmc.lowerIsBetter, "alert")) {
|
||||||
|
return {
|
||||||
|
...fmBase,
|
||||||
|
color: "danger",
|
||||||
|
message: `Metric average way ${fmc.lowerIsBetter ? "above" : "below"} expected normal thresholds.`,
|
||||||
|
impact: 3
|
||||||
|
};
|
||||||
|
} else if (evalFootprint(jf.value, fmt, fmc.lowerIsBetter, "caution")) {
|
||||||
|
return {
|
||||||
|
...fmBase,
|
||||||
|
color: "warning",
|
||||||
|
message: `Metric average ${fmc.lowerIsBetter ? "above" : "below"} expected normal thresholds.`,
|
||||||
|
impact: 2,
|
||||||
|
};
|
||||||
|
} else if (evalFootprint(jf.value, fmt, fmc.lowerIsBetter, "normal")) {
|
||||||
|
return {
|
||||||
|
...fmBase,
|
||||||
|
color: "success",
|
||||||
|
message: "Metric average within expected thresholds.",
|
||||||
|
impact: 1,
|
||||||
|
};
|
||||||
|
} else if (evalFootprint(jf.value, fmt, fmc.lowerIsBetter, "peak")) {
|
||||||
|
return {
|
||||||
|
...fmBase,
|
||||||
|
color: "info",
|
||||||
|
message:
|
||||||
|
"Metric average above expected normal thresholds: Check for artifacts recommended.",
|
||||||
|
impact: 0,
|
||||||
|
};
|
||||||
|
} else {
|
||||||
|
return {
|
||||||
|
...fmBase,
|
||||||
|
color: "secondary",
|
||||||
|
message:
|
||||||
|
"Metric average above expected peak threshold: Check for artifacts!",
|
||||||
|
impact: -1,
|
||||||
|
};
|
||||||
|
}
|
||||||
|
} else { // No matching metric config: display as single value
|
||||||
return {
|
return {
|
||||||
...fmBase,
|
name: jf.name + ' (' + jf.stat + ')',
|
||||||
color: "danger",
|
avg: jf.value,
|
||||||
message: `Metric average way ${fmc.lowerIsBetter ? "above" : "below"} expected normal thresholds.`,
|
|
||||||
impact: 3
|
|
||||||
};
|
|
||||||
} else if (evalFootprint(jf.value, fmt, fmc.lowerIsBetter, "caution")) {
|
|
||||||
return {
|
|
||||||
...fmBase,
|
|
||||||
color: "warning",
|
|
||||||
message: `Metric average ${fmc.lowerIsBetter ? "above" : "below"} expected normal thresholds.`,
|
|
||||||
impact: 2,
|
|
||||||
};
|
|
||||||
} else if (evalFootprint(jf.value, fmt, fmc.lowerIsBetter, "normal")) {
|
|
||||||
return {
|
|
||||||
...fmBase,
|
|
||||||
color: "success",
|
|
||||||
message: "Metric average within expected thresholds.",
|
|
||||||
impact: 1,
|
|
||||||
};
|
|
||||||
} else if (evalFootprint(jf.value, fmt, fmc.lowerIsBetter, "peak")) {
|
|
||||||
return {
|
|
||||||
...fmBase,
|
|
||||||
color: "info",
|
|
||||||
message:
|
message:
|
||||||
"Metric average above expected normal thresholds: Check for artifacts recommended.",
|
`No config for metric ${jf.name} found.`,
|
||||||
impact: 0,
|
impact: 4,
|
||||||
};
|
|
||||||
} else {
|
|
||||||
return {
|
|
||||||
...fmBase,
|
|
||||||
color: "secondary",
|
|
||||||
message:
|
|
||||||
"Metric average above expected peak threshold: Check for artifacts!",
|
|
||||||
impact: -1,
|
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
});
|
}).sort(function (a, b) { // Sort by impact value primarily, within impact sort name alphabetically
|
||||||
|
return a.impact - b.impact || ((a.name > b.name) ? 1 : ((b.name > a.name) ? -1 : 0));
|
||||||
|
});;
|
||||||
|
|
||||||
function evalFootprint(mean, thresholds, lowerIsBetter, level) {
|
function evalFootprint(mean, thresholds, lowerIsBetter, level) {
|
||||||
// Handle Metrics in which less value is better
|
// Handle Metrics in which less value is better
|
||||||
@ -159,37 +171,76 @@
|
|||||||
{/if}
|
{/if}
|
||||||
<CardBody>
|
<CardBody>
|
||||||
{#each footprintData as fpd, index}
|
{#each footprintData as fpd, index}
|
||||||
<div class="mb-1 d-flex justify-content-between">
|
{#if fpd.impact !== 4}
|
||||||
<div> <b>{fpd.name}</b></div>
|
<div class="mb-1 d-flex justify-content-between">
|
||||||
<!-- For symmetry, see below ...-->
|
<div> <b>{fpd.name}</b></div>
|
||||||
<div
|
<!-- For symmetry, see below ...-->
|
||||||
class="cursor-help d-inline-flex"
|
<div
|
||||||
id={`footprint-${job.jobId}-${index}`}
|
class="cursor-help d-inline-flex"
|
||||||
>
|
id={`footprint-${job.jobId}-${index}`}
|
||||||
<div class="mx-1">
|
>
|
||||||
<!-- Alerts Only -->
|
<div class="mx-1">
|
||||||
{#if fpd.impact === 3 || fpd.impact === -1}
|
<!-- Alerts Only -->
|
||||||
<Icon name="exclamation-triangle-fill" class="text-danger" />
|
{#if fpd.impact === 3 || fpd.impact === -1}
|
||||||
{:else if fpd.impact === 2}
|
<Icon name="exclamation-triangle-fill" class="text-danger" />
|
||||||
<Icon name="exclamation-triangle" class="text-warning" />
|
{:else if fpd.impact === 2}
|
||||||
{/if}
|
<Icon name="exclamation-triangle" class="text-warning" />
|
||||||
<!-- Emoji for all states-->
|
{/if}
|
||||||
{#if fpd.impact === 3}
|
<!-- Emoji for all states-->
|
||||||
<Icon name="emoji-frown" class="text-danger" />
|
{#if fpd.impact === 3}
|
||||||
{:else if fpd.impact === 2}
|
<Icon name="emoji-frown" class="text-danger" />
|
||||||
<Icon name="emoji-neutral" class="text-warning" />
|
{:else if fpd.impact === 2}
|
||||||
{:else if fpd.impact === 1}
|
<Icon name="emoji-neutral" class="text-warning" />
|
||||||
<Icon name="emoji-smile" class="text-success" />
|
{:else if fpd.impact === 1}
|
||||||
{:else if fpd.impact === 0}
|
<Icon name="emoji-smile" class="text-success" />
|
||||||
<Icon name="emoji-laughing" class="text-info" />
|
{:else if fpd.impact === 0}
|
||||||
{:else if fpd.impact === -1}
|
<Icon name="emoji-laughing" class="text-info" />
|
||||||
<Icon name="emoji-dizzy" class="text-danger" />
|
{:else if fpd.impact === -1}
|
||||||
{/if}
|
<Icon name="emoji-dizzy" class="text-danger" />
|
||||||
|
{/if}
|
||||||
|
</div>
|
||||||
|
<div>
|
||||||
|
<!-- Print Values -->
|
||||||
|
{fpd.avg} / {fpd.max}
|
||||||
|
{fpd.unit} <!-- To increase margin to tooltip: No other way manageable ... -->
|
||||||
|
</div>
|
||||||
</div>
|
</div>
|
||||||
|
<Tooltip
|
||||||
|
target={`footprint-${job.jobId}-${index}`}
|
||||||
|
placement="right"
|
||||||
|
offset={[0, 20]}>{fpd.message}</Tooltip
|
||||||
|
>
|
||||||
|
</div>
|
||||||
|
<Row cols={12} class="{(footprintData.length == (index + 1)) ? 'mb-0' : 'mb-2'}">
|
||||||
|
{#if fpd.dir}
|
||||||
|
<Col xs="1">
|
||||||
|
<Icon name="caret-left-fill" />
|
||||||
|
</Col>
|
||||||
|
{/if}
|
||||||
|
<Col xs="11" class="align-content-center">
|
||||||
|
<Progress value={fpd.avg} max={fpd.max} color={fpd.color} />
|
||||||
|
</Col>
|
||||||
|
{#if !fpd.dir}
|
||||||
|
<Col xs="1">
|
||||||
|
<Icon name="caret-right-fill" />
|
||||||
|
</Col>
|
||||||
|
{/if}
|
||||||
|
</Row>
|
||||||
|
{:else}
|
||||||
|
<div class="mb-1 d-flex justify-content-between">
|
||||||
<div>
|
<div>
|
||||||
<!-- Print Values -->
|
<b>{fpd.name}</b>
|
||||||
{fpd.avg} / {fpd.max}
|
</div>
|
||||||
{fpd.unit} <!-- To increase margin to tooltip: No other way manageable ... -->
|
<div
|
||||||
|
class="cursor-help d-inline-flex"
|
||||||
|
id={`footprint-${job.jobId}-${index}`}
|
||||||
|
>
|
||||||
|
<div class="mx-1">
|
||||||
|
<Icon name="info-circle"/>
|
||||||
|
</div>
|
||||||
|
<div>
|
||||||
|
{fpd.avg}
|
||||||
|
</div>
|
||||||
</div>
|
</div>
|
||||||
</div>
|
</div>
|
||||||
<Tooltip
|
<Tooltip
|
||||||
@ -197,22 +248,7 @@
|
|||||||
placement="right"
|
placement="right"
|
||||||
offset={[0, 20]}>{fpd.message}</Tooltip
|
offset={[0, 20]}>{fpd.message}</Tooltip
|
||||||
>
|
>
|
||||||
</div>
|
{/if}
|
||||||
<Row cols={12} class="{(footprintData.length == (index + 1)) ? 'mb-0' : 'mb-2'}">
|
|
||||||
{#if fpd.dir}
|
|
||||||
<Col xs="1">
|
|
||||||
<Icon name="caret-left-fill" />
|
|
||||||
</Col>
|
|
||||||
{/if}
|
|
||||||
<Col xs="11" class="align-content-center">
|
|
||||||
<Progress value={fpd.avg} max={fpd.max} color={fpd.color} />
|
|
||||||
</Col>
|
|
||||||
{#if !fpd.dir}
|
|
||||||
<Col xs="1">
|
|
||||||
<Icon name="caret-right-fill" />
|
|
||||||
</Col>
|
|
||||||
{/if}
|
|
||||||
</Row>
|
|
||||||
{/each}
|
{/each}
|
||||||
{#if job?.metaData?.message}
|
{#if job?.metaData?.message}
|
||||||
<hr class="mt-1 mb-2" />
|
<hr class="mt-1 mb-2" />
|
||||||
|
Loading…
Reference in New Issue
Block a user