From e8794b8c79d72c7794e7676889a1fff1c9f2abe5 Mon Sep 17 00:00:00 2001 From: Jan Eitzinger Date: Fri, 28 Jun 2024 15:41:11 +0200 Subject: [PATCH 01/10] Add graphql generation target to Makefile --- Makefile | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 68a8059..f54c6ea 100644 --- a/Makefile +++ b/Makefile @@ -28,7 +28,7 @@ SVELTE_SRC = $(wildcard $(FRONTEND)/src/*.svelte) \ $(wildcard $(FRONTEND)/src/plots/*.svelte) \ $(wildcard $(FRONTEND)/src/joblist/*.svelte) -.PHONY: clean distclean test tags frontend swagger $(TARGET) +.PHONY: clean distclean test tags frontend swagger graphql $(TARGET) .NOTPARALLEL: @@ -45,6 +45,10 @@ swagger: @go run github.com/swaggo/swag/cmd/swag init -d ./internal/api,./pkg/schema -g rest.go -o ./api @mv ./api/docs.go ./internal/api/docs.go +graphql: + $(info ===> GENERATE graphql) + @go run github.com/99designs/gqlgen + clean: $(info ===> CLEAN) @go clean From 786770f56abac5863ccbe62795c7c029f24ab5e8 Mon Sep 17 00:00:00 2001 From: Jan Eitzinger Date: Fri, 28 Jun 2024 16:48:10 +0200 Subject: [PATCH 02/10] Start to convert to new footprint layout --- internal/repository/job.go | 14 ++++-- internal/repository/migration.go | 2 +- internal/repository/query.go | 18 +++---- pkg/schema/cluster.go | 38 +++++++------- pkg/schema/job.go | 85 ++++++++++++++------------------ 5 files changed, 78 insertions(+), 79 deletions(-) diff --git a/internal/repository/job.go b/internal/repository/job.go index b42598d..da71566 100644 --- a/internal/repository/job.go +++ b/internal/repository/job.go @@ -59,7 +59,7 @@ func GetJobRepository() *JobRepository { var jobColumns []string = []string{ "job.id", "job.job_id", "job.user", "job.project", "job.cluster", "job.subcluster", "job.start_time", "job.partition", "job.array_job_id", "job.num_nodes", "job.num_hwthreads", "job.num_acc", "job.exclusive", "job.monitoring_status", "job.smt", "job.job_state", - "job.duration", "job.walltime", "job.resources", "job.mem_used_max", "job.flops_any_avg", "job.mem_bw_avg", "job.load_avg", // "job.meta_data", + "job.duration", "job.walltime", "job.resources", "job.footprint", // "job.meta_data", } func scanJob(row interface{ Scan(...interface{}) error }) (*schema.Job, error) { @@ -67,15 +67,22 @@ func scanJob(row interface{ Scan(...interface{}) error }) (*schema.Job, error) { if err := row.Scan( &job.ID, &job.JobID, &job.User, &job.Project, &job.Cluster, &job.SubCluster, &job.StartTimeUnix, &job.Partition, &job.ArrayJobId, &job.NumNodes, &job.NumHWThreads, &job.NumAcc, &job.Exclusive, &job.MonitoringStatus, &job.SMT, &job.State, - &job.Duration, &job.Walltime, &job.RawResources, &job.MemUsedMax, &job.FlopsAnyAvg, &job.MemBwAvg, &job.LoadAvg /*&job.RawMetaData*/); err != nil { + &job.Duration, &job.Walltime, &job.RawResources, &job.RawFootprint /*&job.RawMetaData*/); err != nil { log.Warnf("Error while scanning rows (Job): %v", err) return nil, err } if err := json.Unmarshal(job.RawResources, &job.Resources); err != nil { - log.Warn("Error while unmarhsaling raw resources json") + log.Warn("Error while unmarshaling raw resources json") return nil, err } + job.RawResources = nil + + if err := json.Unmarshal(job.RawFootprint, &job.Footprint); err != nil { + log.Warn("Error while unmarshaling raw footprint json") + return nil, err + } + job.RawFootprint = nil // if err := json.Unmarshal(job.RawMetaData, &job.MetaData); err != nil { // return nil, err @@ -86,7 +93,6 @@ func scanJob(row interface{ Scan(...interface{}) error }) (*schema.Job, error) { job.Duration = int32(time.Since(job.StartTime).Seconds()) } - job.RawResources = nil return job, nil } diff --git a/internal/repository/migration.go b/internal/repository/migration.go index 0259c61..970fbc2 100644 --- a/internal/repository/migration.go +++ b/internal/repository/migration.go @@ -16,7 +16,7 @@ import ( "github.com/golang-migrate/migrate/v4/source/iofs" ) -const Version uint = 7 +const Version uint = 8 //go:embed migrations/* var migrationFiles embed.FS diff --git a/internal/repository/query.go b/internal/repository/query.go index 5ca98fb..394856d 100644 --- a/internal/repository/query.go +++ b/internal/repository/query.go @@ -22,8 +22,8 @@ func (r *JobRepository) QueryJobs( ctx context.Context, filters []*model.JobFilter, page *model.PageRequest, - order *model.OrderByInput) ([]*schema.Job, error) { - + order *model.OrderByInput, +) ([]*schema.Job, error) { query, qerr := SecurityCheck(ctx, sq.Select(jobColumns...).From("job")) if qerr != nil { return nil, qerr @@ -73,8 +73,8 @@ func (r *JobRepository) QueryJobs( func (r *JobRepository) CountJobs( ctx context.Context, - filters []*model.JobFilter) (int, error) { - + filters []*model.JobFilter, +) (int, error) { query, qerr := SecurityCheck(ctx, sq.Select("count(*)").From("job")) if qerr != nil { return 0, qerr @@ -227,9 +227,7 @@ func buildStringCondition(field string, cond *model.StringInput, query sq.Select } if cond.In != nil { queryElements := make([]string, len(cond.In)) - for i, val := range cond.In { - queryElements[i] = val - } + copy(queryElements, cond.In) return query.Where(sq.Or{sq.Eq{field: queryElements}}) } return query @@ -257,8 +255,10 @@ func buildMetaJsonCondition(jsonField string, cond *model.StringInput, query sq. return query } -var matchFirstCap = regexp.MustCompile("(.)([A-Z][a-z]+)") -var matchAllCap = regexp.MustCompile("([a-z0-9])([A-Z])") +var ( + matchFirstCap = regexp.MustCompile("(.)([A-Z][a-z]+)") + matchAllCap = regexp.MustCompile("([a-z0-9])([A-Z])") +) func toSnakeCase(str string) string { for _, c := range str { diff --git a/pkg/schema/cluster.go b/pkg/schema/cluster.go index e4ca658..6edd830 100644 --- a/pkg/schema/cluster.go +++ b/pkg/schema/cluster.go @@ -33,35 +33,37 @@ type SubCluster struct { Name string `json:"name"` Nodes string `json:"nodes"` ProcessorType string `json:"processorType"` - SocketsPerNode int `json:"socketsPerNode"` - CoresPerSocket int `json:"coresPerSocket"` - ThreadsPerCore int `json:"threadsPerCore"` + Topology Topology `json:"topology"` FlopRateScalar MetricValue `json:"flopRateScalar"` FlopRateSimd MetricValue `json:"flopRateSimd"` MemoryBandwidth MetricValue `json:"memoryBandwidth"` - Topology Topology `json:"topology"` + SocketsPerNode int `json:"socketsPerNode"` + CoresPerSocket int `json:"coresPerSocket"` + ThreadsPerCore int `json:"threadsPerCore"` } type SubClusterConfig struct { - Name string `json:"name"` - Peak float64 `json:"peak"` - Normal float64 `json:"normal"` - Caution float64 `json:"caution"` - Alert float64 `json:"alert"` - Remove bool `json:"remove"` + Name string `json:"name"` + Peak float64 `json:"peak"` + Normal float64 `json:"normal"` + Caution float64 `json:"caution"` + Alert float64 `json:"alert"` + Footprint bool `json:"footprint"` + Remove bool `json:"remove"` } type MetricConfig struct { - Name string `json:"name"` Unit Unit `json:"unit"` + Name string `json:"name"` Scope MetricScope `json:"scope"` Aggregation string `json:"aggregation"` + SubClusters []*SubClusterConfig `json:"subClusters,omitempty"` Timestep int `json:"timestep"` Peak float64 `json:"peak"` Normal float64 `json:"normal"` Caution float64 `json:"caution"` Alert float64 `json:"alert"` - SubClusters []*SubClusterConfig `json:"subClusters,omitempty"` + Footprint bool `json:"footprint"` } type Cluster struct { @@ -76,8 +78,8 @@ type Cluster struct { // return value, return true as the second value. TODO: Optimize this, there // must be a more efficient way/algorithm. func (topo *Topology) GetSocketsFromHWThreads( - hwthreads []int) (sockets []int, exclusive bool) { - + hwthreads []int, +) (sockets []int, exclusive bool) { socketsMap := map[int]int{} for _, hwthread := range hwthreads { for socket, hwthreadsInSocket := range topo.Socket { @@ -106,8 +108,8 @@ func (topo *Topology) GetSocketsFromHWThreads( // return value, return true as the second value. TODO: Optimize this, there // must be a more efficient way/algorithm. func (topo *Topology) GetCoresFromHWThreads( - hwthreads []int) (cores []int, exclusive bool) { - + hwthreads []int, +) (cores []int, exclusive bool) { coresMap := map[int]int{} for _, hwthread := range hwthreads { for core, hwthreadsInCore := range topo.Core { @@ -136,8 +138,8 @@ func (topo *Topology) GetCoresFromHWThreads( // memory domains in the first return value, return true as the second value. // TODO: Optimize this, there must be a more efficient way/algorithm. func (topo *Topology) GetMemoryDomainsFromHWThreads( - hwthreads []int) (memDoms []int, exclusive bool) { - + hwthreads []int, +) (memDoms []int, exclusive bool) { memDomsMap := map[int]int{} for _, hwthread := range hwthreads { for memDom, hwthreadsInmemDom := range topo.MemoryDomain { diff --git a/pkg/schema/job.go b/pkg/schema/job.go index ad3e6dc..f5363b3 100644 --- a/pkg/schema/job.go +++ b/pkg/schema/job.go @@ -16,30 +16,31 @@ import ( // Common subset of Job and JobMeta. Use one of those, not this type directly. type BaseJob struct { - // The unique identifier of a job - JobID int64 `json:"jobId" db:"job_id" example:"123000"` - User string `json:"user" db:"user" example:"abcd100h"` // The unique identifier of a user - Project string `json:"project" db:"project" example:"abcd200"` // The unique identifier of a project - Cluster string `json:"cluster" db:"cluster" example:"fritz"` // The unique identifier of a cluster - SubCluster string `json:"subCluster" db:"subcluster" example:"main"` // The unique identifier of a sub cluster - Partition string `json:"partition,omitempty" db:"partition" example:"main"` // The Slurm partition to which the job was submitted - ArrayJobId int64 `json:"arrayJobId,omitempty" db:"array_job_id" example:"123000"` // The unique identifier of an array job - NumNodes int32 `json:"numNodes" db:"num_nodes" example:"2" minimum:"1"` // Number of nodes used (Min > 0) - // NumCores int32 `json:"numCores" db:"num_cores" example:"20" minimum:"1"` // Number of HWThreads used (Min > 0) - NumHWThreads int32 `json:"numHwthreads,omitempty" db:"num_hwthreads" example:"20" minimum:"1"` // Number of HWThreads used (Min > 0) - NumAcc int32 `json:"numAcc,omitempty" db:"num_acc" example:"2" minimum:"1"` // Number of accelerators used (Min > 0) - Exclusive int32 `json:"exclusive" db:"exclusive" example:"1" minimum:"0" maximum:"2"` // Specifies how nodes are shared: 0 - Shared among multiple jobs of multiple users, 1 - Job exclusive (Default), 2 - Shared among multiple jobs of same user - MonitoringStatus int32 `json:"monitoringStatus,omitempty" db:"monitoring_status" example:"1" minimum:"0" maximum:"3"` // State of monitoring system during job run: 0 - Disabled, 1 - Running or Archiving (Default), 2 - Archiving Failed, 3 - Archiving Successfull - SMT int32 `json:"smt,omitempty" db:"smt" example:"4"` // SMT threads used by job - State JobState `json:"jobState" db:"job_state" example:"completed" enums:"completed,failed,cancelled,stopped,timeout,out_of_memory"` // Final state of job - Duration int32 `json:"duration" db:"duration" example:"43200" minimum:"1"` // Duration of job in seconds (Min > 0) - Walltime int64 `json:"walltime,omitempty" db:"walltime" example:"86400" minimum:"1"` // Requested walltime of job in seconds (Min > 0) - Tags []*Tag `json:"tags,omitempty"` // List of tags - RawResources []byte `json:"-" db:"resources"` // Resources used by job [As Bytes] - Resources []*Resource `json:"resources"` // Resources used by job - RawMetaData []byte `json:"-" db:"meta_data"` // Additional information about the job [As Bytes] - MetaData map[string]string `json:"metaData"` // Additional information about the job - ConcurrentJobs JobLinkResultList `json:"concurrentJobs"` + Footprint map[string]float64 `json:"footPrint"` + MetaData map[string]string `json:"metaData"` + Cluster string `json:"cluster" db:"cluster" example:"fritz"` + SubCluster string `json:"subCluster" db:"subcluster" example:"main"` + Partition string `json:"partition,omitempty" db:"partition" example:"main"` + Project string `json:"project" db:"project" example:"abcd200"` + User string `json:"user" db:"user" example:"abcd100h"` + State JobState `json:"jobState" db:"job_state" example:"completed" enums:"completed,failed,cancelled,stopped,timeout,out_of_memory"` + Tags []*Tag `json:"tags,omitempty"` + RawFootprint []byte `json:"-" db:"footprint"` + RawMetaData []byte `json:"-" db:"meta_data"` + Resources []*Resource `json:"resources"` + RawResources []byte `json:"-" db:"resources"` + ConcurrentJobs JobLinkResultList `json:"concurrentJobs"` + Energy float64 `json:"energy"` + ArrayJobId int64 `json:"arrayJobId,omitempty" db:"array_job_id" example:"123000"` + Walltime int64 `json:"walltime,omitempty" db:"walltime" example:"86400" minimum:"1"` + JobID int64 `json:"jobId" db:"job_id" example:"123000"` + Duration int32 `json:"duration" db:"duration" example:"43200" minimum:"1"` + SMT int32 `json:"smt,omitempty" db:"smt" example:"4"` + MonitoringStatus int32 `json:"monitoringStatus,omitempty" db:"monitoring_status" example:"1" minimum:"0" maximum:"3"` + Exclusive int32 `json:"exclusive" db:"exclusive" example:"1" minimum:"0" maximum:"2"` + NumAcc int32 `json:"numAcc,omitempty" db:"num_acc" example:"2" minimum:"1"` + NumHWThreads int32 `json:"numHwthreads,omitempty" db:"num_hwthreads" example:"20" minimum:"1"` + NumNodes int32 `json:"numNodes" db:"num_nodes" example:"2" minimum:"1"` } // Job struct type @@ -49,19 +50,10 @@ type BaseJob struct { // Job model // @Description Information of a HPC job. type Job struct { - // The unique identifier of a job in the database - ID int64 `json:"id" db:"id"` + StartTime time.Time `json:"startTime"` BaseJob - StartTimeUnix int64 `json:"-" db:"start_time" example:"1649723812"` // Start epoch time stamp in seconds - StartTime time.Time `json:"startTime"` // Start time as 'time.Time' data type - MemUsedMax float64 `json:"memUsedMax" db:"mem_used_max"` // MemUsedMax as Float64 - FlopsAnyAvg float64 `json:"flopsAnyAvg" db:"flops_any_avg"` // FlopsAnyAvg as Float64 - MemBwAvg float64 `json:"memBwAvg" db:"mem_bw_avg"` // MemBwAvg as Float64 - LoadAvg float64 `json:"loadAvg" db:"load_avg"` // LoadAvg as Float64 - NetBwAvg float64 `json:"-" db:"net_bw_avg"` // NetBwAvg as Float64 - NetDataVolTotal float64 `json:"-" db:"net_data_vol_total"` // NetDataVolTotal as Float64 - FileBwAvg float64 `json:"-" db:"file_bw_avg"` // FileBwAvg as Float64 - FileDataVolTotal float64 `json:"-" db:"file_data_vol_total"` // FileDataVolTotal as Float64 + ID int64 `json:"id" db:"id"` + StartTimeUnix int64 `json:"-" db:"start_time" example:"1649723812"` } // JobMeta struct type @@ -88,11 +80,10 @@ type JobLinkResultList struct { // JobMeta model // @Description Meta data information of a HPC job. type JobMeta struct { - // The unique identifier of a job in the database - ID *int64 `json:"id,omitempty"` + ID *int64 `json:"id,omitempty"` + Statistics map[string]JobStatistics `json:"statistics"` BaseJob - StartTime int64 `json:"startTime" db:"start_time" example:"1649723812" minimum:"1"` // Start epoch time stamp in seconds (Min > 0) - Statistics map[string]JobStatistics `json:"statistics"` // Metric statistics of job + StartTime int64 `json:"startTime" db:"start_time" example:"1649723812" minimum:"1"` } const ( @@ -124,18 +115,18 @@ type JobStatistics struct { // Tag model // @Description Defines a tag using name and type. type Tag struct { - ID int64 `json:"id" db:"id"` // The unique DB identifier of a tag - Type string `json:"type" db:"tag_type" example:"Debug"` // Tag Type - Name string `json:"name" db:"tag_name" example:"Testjob"` // Tag Name + Type string `json:"type" db:"tag_type" example:"Debug"` + Name string `json:"name" db:"tag_name" example:"Testjob"` + ID int64 `json:"id" db:"id"` } // Resource model // @Description A resource used by a job type Resource struct { - Hostname string `json:"hostname"` // Name of the host (= node) - HWThreads []int `json:"hwthreads,omitempty"` // List of OS processor ids - Accelerators []string `json:"accelerators,omitempty"` // List of of accelerator device ids - Configuration string `json:"configuration,omitempty"` // The configuration options of the node + Hostname string `json:"hostname"` + Configuration string `json:"configuration,omitempty"` + HWThreads []int `json:"hwthreads,omitempty"` + Accelerators []string `json:"accelerators,omitempty"` } type JobState string From aede5f71ec88d01fc03ca9f7a557379925aef3bb Mon Sep 17 00:00:00 2001 From: Jan Eitzinger Date: Fri, 28 Jun 2024 16:49:02 +0200 Subject: [PATCH 03/10] Introduce adapted graphql schema --- api/schema.graphqls | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/api/schema.graphqls b/api/schema.graphqls index 5c5bc2c..154bc35 100644 --- a/api/schema.graphqls +++ b/api/schema.graphqls @@ -27,12 +27,7 @@ type Job { tags: [Tag!]! resources: [Resource!]! concurrentJobs: JobLinkResultList - - memUsedMax: Float - flopsAnyAvg: Float - memBwAvg: Float - loadAvg: Float - + footprint: [MetricValue] metaData: Any userData: User } @@ -64,6 +59,7 @@ type SubCluster { } type MetricValue { + name: String unit: Unit! value: Float! } From 97c807cd33e28bb24ac621071ec7f3169eab82db Mon Sep 17 00:00:00 2001 From: Jan Eitzinger Date: Fri, 28 Jun 2024 16:49:24 +0200 Subject: [PATCH 04/10] Add migration for footprint --- .../migrations/sqlite3/08_add-footprint.down.sql | 0 .../migrations/sqlite3/08_add-footprint.up.sql | 12 ++++++++++++ 2 files changed, 12 insertions(+) create mode 100644 internal/repository/migrations/sqlite3/08_add-footprint.down.sql create mode 100644 internal/repository/migrations/sqlite3/08_add-footprint.up.sql diff --git a/internal/repository/migrations/sqlite3/08_add-footprint.down.sql b/internal/repository/migrations/sqlite3/08_add-footprint.down.sql new file mode 100644 index 0000000..e69de29 diff --git a/internal/repository/migrations/sqlite3/08_add-footprint.up.sql b/internal/repository/migrations/sqlite3/08_add-footprint.up.sql new file mode 100644 index 0000000..5db1d1b --- /dev/null +++ b/internal/repository/migrations/sqlite3/08_add-footprint.up.sql @@ -0,0 +1,12 @@ +ALTER TABLE job ADD COLUMN energy REAL NOT NULL DEFAULT 0.0; + +ALTER TABLE job ADD COLUMN footprint TEXT DEFAULT NULL; +UPDATE job SET footprint = '{"flops_any_avg": 0.0}'; +UPDATE job SET footprint = json_replace(footprint, '$.flops_any_avg', job.flops_any_avg); +UPDATE job SET footprint = json_insert(footprint, '$.mem_bw_avg', job.mem_bw_avg); +UPDATE job SET footprint = json_insert(footprint, '$.mem_used_max', job.mem_used_max); +UPDATE job SET footprint = json_insert(footprint, '$.load_avg', job.load_avg); +ALTER TABLE job DROP flops_any_avg; +ALTER TABLE job DROP mem_bw_avg; +ALTER TABLE job DROP mem_used_max; +ALTER TABLE job DROP load_avg; From 130613b717a501a3d709bdc21026c9b48a706cb7 Mon Sep 17 00:00:00 2001 From: Jan Eitzinger Date: Fri, 28 Jun 2024 17:08:28 +0200 Subject: [PATCH 05/10] Fix build errors Code not yet functional --- internal/graph/generated/generated.go | 396 +++++++++++++------------- internal/graph/schema.resolvers.go | 14 + internal/importer/handleImport.go | 14 +- internal/importer/initDB.go | 14 +- 4 files changed, 231 insertions(+), 207 deletions(-) diff --git a/internal/graph/generated/generated.go b/internal/graph/generated/generated.go index 29a2a24..e40a1a2 100644 --- a/internal/graph/generated/generated.go +++ b/internal/graph/generated/generated.go @@ -42,6 +42,7 @@ type Config struct { type ResolverRoot interface { Cluster() ClusterResolver Job() JobResolver + MetricValue() MetricValueResolver Mutation() MutationResolver Query() QueryResolver SubCluster() SubClusterResolver @@ -90,12 +91,9 @@ type ComplexityRoot struct { ConcurrentJobs func(childComplexity int) int Duration func(childComplexity int) int Exclusive func(childComplexity int) int - FlopsAnyAvg func(childComplexity int) int + Footprint func(childComplexity int) int ID func(childComplexity int) int JobID func(childComplexity int) int - LoadAvg func(childComplexity int) int - MemBwAvg func(childComplexity int) int - MemUsedMax func(childComplexity int) int MetaData func(childComplexity int) int MonitoringStatus func(childComplexity int) int NumAcc func(childComplexity int) int @@ -204,6 +202,7 @@ type ComplexityRoot struct { } MetricValue struct { + Name func(childComplexity int) int Unit func(childComplexity int) int Value func(childComplexity int) int } @@ -324,10 +323,13 @@ type JobResolver interface { Tags(ctx context.Context, obj *schema.Job) ([]*schema.Tag, error) ConcurrentJobs(ctx context.Context, obj *schema.Job) (*model.JobLinkResultList, error) - + Footprint(ctx context.Context, obj *schema.Job) ([]*schema.MetricValue, error) MetaData(ctx context.Context, obj *schema.Job) (interface{}, error) UserData(ctx context.Context, obj *schema.Job) (*model.User, error) } +type MetricValueResolver interface { + Name(ctx context.Context, obj *schema.MetricValue) (*string, error) +} type MutationResolver interface { CreateTag(ctx context.Context, typeArg string, name string) (*schema.Tag, error) DeleteTag(ctx context.Context, id string) (string, error) @@ -511,12 +513,12 @@ func (e *executableSchema) Complexity(typeName, field string, childComplexity in return e.complexity.Job.Exclusive(childComplexity), true - case "Job.flopsAnyAvg": - if e.complexity.Job.FlopsAnyAvg == nil { + case "Job.footprint": + if e.complexity.Job.Footprint == nil { break } - return e.complexity.Job.FlopsAnyAvg(childComplexity), true + return e.complexity.Job.Footprint(childComplexity), true case "Job.id": if e.complexity.Job.ID == nil { @@ -532,27 +534,6 @@ func (e *executableSchema) Complexity(typeName, field string, childComplexity in return e.complexity.Job.JobID(childComplexity), true - case "Job.loadAvg": - if e.complexity.Job.LoadAvg == nil { - break - } - - return e.complexity.Job.LoadAvg(childComplexity), true - - case "Job.memBwAvg": - if e.complexity.Job.MemBwAvg == nil { - break - } - - return e.complexity.Job.MemBwAvg(childComplexity), true - - case "Job.memUsedMax": - if e.complexity.Job.MemUsedMax == nil { - break - } - - return e.complexity.Job.MemUsedMax(childComplexity), true - case "Job.metaData": if e.complexity.Job.MetaData == nil { break @@ -1057,6 +1038,13 @@ func (e *executableSchema) Complexity(typeName, field string, childComplexity in return e.complexity.MetricStatistics.Min(childComplexity), true + case "MetricValue.name": + if e.complexity.MetricValue.Name == nil { + break + } + + return e.complexity.MetricValue.Name(childComplexity), true + case "MetricValue.unit": if e.complexity.MetricValue.Unit == nil { break @@ -1744,12 +1732,7 @@ type Job { tags: [Tag!]! resources: [Resource!]! concurrentJobs: JobLinkResultList - - memUsedMax: Float - flopsAnyAvg: Float - memBwAvg: Float - loadAvg: Float - + footprint: [MetricValue] metaData: Any userData: User } @@ -1781,6 +1764,7 @@ type SubCluster { } type MetricValue { + name: String unit: Unit! value: Float! } @@ -4200,8 +4184,8 @@ func (ec *executionContext) fieldContext_Job_concurrentJobs(ctx context.Context, return fc, nil } -func (ec *executionContext) _Job_memUsedMax(ctx context.Context, field graphql.CollectedField, obj *schema.Job) (ret graphql.Marshaler) { - fc, err := ec.fieldContext_Job_memUsedMax(ctx, field) +func (ec *executionContext) _Job_footprint(ctx context.Context, field graphql.CollectedField, obj *schema.Job) (ret graphql.Marshaler) { + fc, err := ec.fieldContext_Job_footprint(ctx, field) if err != nil { return graphql.Null } @@ -4214,7 +4198,7 @@ func (ec *executionContext) _Job_memUsedMax(ctx context.Context, field graphql.C }() resTmp, err := ec.ResolverMiddleware(ctx, func(rctx context.Context) (interface{}, error) { ctx = rctx // use context from middleware stack in children - return obj.MemUsedMax, nil + return ec.resolvers.Job().Footprint(rctx, obj) }) if err != nil { ec.Error(ctx, err) @@ -4223,142 +4207,27 @@ func (ec *executionContext) _Job_memUsedMax(ctx context.Context, field graphql.C if resTmp == nil { return graphql.Null } - res := resTmp.(float64) + res := resTmp.([]*schema.MetricValue) fc.Result = res - return ec.marshalOFloat2float64(ctx, field.Selections, res) + return ec.marshalOMetricValue2ᚕᚖgithubᚗcomᚋClusterCockpitᚋccᚑbackendᚋpkgᚋschemaᚐMetricValue(ctx, field.Selections, res) } -func (ec *executionContext) fieldContext_Job_memUsedMax(ctx context.Context, field graphql.CollectedField) (fc *graphql.FieldContext, err error) { +func (ec *executionContext) fieldContext_Job_footprint(ctx context.Context, field graphql.CollectedField) (fc *graphql.FieldContext, err error) { fc = &graphql.FieldContext{ Object: "Job", Field: field, - IsMethod: false, - IsResolver: false, + IsMethod: true, + IsResolver: true, Child: func(ctx context.Context, field graphql.CollectedField) (*graphql.FieldContext, error) { - return nil, errors.New("field of type Float does not have child fields") - }, - } - return fc, nil -} - -func (ec *executionContext) _Job_flopsAnyAvg(ctx context.Context, field graphql.CollectedField, obj *schema.Job) (ret graphql.Marshaler) { - fc, err := ec.fieldContext_Job_flopsAnyAvg(ctx, field) - if err != nil { - return graphql.Null - } - ctx = graphql.WithFieldContext(ctx, fc) - defer func() { - if r := recover(); r != nil { - ec.Error(ctx, ec.Recover(ctx, r)) - ret = graphql.Null - } - }() - resTmp, err := ec.ResolverMiddleware(ctx, func(rctx context.Context) (interface{}, error) { - ctx = rctx // use context from middleware stack in children - return obj.FlopsAnyAvg, nil - }) - if err != nil { - ec.Error(ctx, err) - return graphql.Null - } - if resTmp == nil { - return graphql.Null - } - res := resTmp.(float64) - fc.Result = res - return ec.marshalOFloat2float64(ctx, field.Selections, res) -} - -func (ec *executionContext) fieldContext_Job_flopsAnyAvg(ctx context.Context, field graphql.CollectedField) (fc *graphql.FieldContext, err error) { - fc = &graphql.FieldContext{ - Object: "Job", - Field: field, - IsMethod: false, - IsResolver: false, - Child: func(ctx context.Context, field graphql.CollectedField) (*graphql.FieldContext, error) { - return nil, errors.New("field of type Float does not have child fields") - }, - } - return fc, nil -} - -func (ec *executionContext) _Job_memBwAvg(ctx context.Context, field graphql.CollectedField, obj *schema.Job) (ret graphql.Marshaler) { - fc, err := ec.fieldContext_Job_memBwAvg(ctx, field) - if err != nil { - return graphql.Null - } - ctx = graphql.WithFieldContext(ctx, fc) - defer func() { - if r := recover(); r != nil { - ec.Error(ctx, ec.Recover(ctx, r)) - ret = graphql.Null - } - }() - resTmp, err := ec.ResolverMiddleware(ctx, func(rctx context.Context) (interface{}, error) { - ctx = rctx // use context from middleware stack in children - return obj.MemBwAvg, nil - }) - if err != nil { - ec.Error(ctx, err) - return graphql.Null - } - if resTmp == nil { - return graphql.Null - } - res := resTmp.(float64) - fc.Result = res - return ec.marshalOFloat2float64(ctx, field.Selections, res) -} - -func (ec *executionContext) fieldContext_Job_memBwAvg(ctx context.Context, field graphql.CollectedField) (fc *graphql.FieldContext, err error) { - fc = &graphql.FieldContext{ - Object: "Job", - Field: field, - IsMethod: false, - IsResolver: false, - Child: func(ctx context.Context, field graphql.CollectedField) (*graphql.FieldContext, error) { - return nil, errors.New("field of type Float does not have child fields") - }, - } - return fc, nil -} - -func (ec *executionContext) _Job_loadAvg(ctx context.Context, field graphql.CollectedField, obj *schema.Job) (ret graphql.Marshaler) { - fc, err := ec.fieldContext_Job_loadAvg(ctx, field) - if err != nil { - return graphql.Null - } - ctx = graphql.WithFieldContext(ctx, fc) - defer func() { - if r := recover(); r != nil { - ec.Error(ctx, ec.Recover(ctx, r)) - ret = graphql.Null - } - }() - resTmp, err := ec.ResolverMiddleware(ctx, func(rctx context.Context) (interface{}, error) { - ctx = rctx // use context from middleware stack in children - return obj.LoadAvg, nil - }) - if err != nil { - ec.Error(ctx, err) - return graphql.Null - } - if resTmp == nil { - return graphql.Null - } - res := resTmp.(float64) - fc.Result = res - return ec.marshalOFloat2float64(ctx, field.Selections, res) -} - -func (ec *executionContext) fieldContext_Job_loadAvg(ctx context.Context, field graphql.CollectedField) (fc *graphql.FieldContext, err error) { - fc = &graphql.FieldContext{ - Object: "Job", - Field: field, - IsMethod: false, - IsResolver: false, - Child: func(ctx context.Context, field graphql.CollectedField) (*graphql.FieldContext, error) { - return nil, errors.New("field of type Float does not have child fields") + switch field.Name { + case "name": + return ec.fieldContext_MetricValue_name(ctx, field) + case "unit": + return ec.fieldContext_MetricValue_unit(ctx, field) + case "value": + return ec.fieldContext_MetricValue_value(ctx, field) + } + return nil, fmt.Errorf("no field named %q was found under type MetricValue", field.Name) }, } return fc, nil @@ -5088,14 +4957,8 @@ func (ec *executionContext) fieldContext_JobResultList_items(ctx context.Context return ec.fieldContext_Job_resources(ctx, field) case "concurrentJobs": return ec.fieldContext_Job_concurrentJobs(ctx, field) - case "memUsedMax": - return ec.fieldContext_Job_memUsedMax(ctx, field) - case "flopsAnyAvg": - return ec.fieldContext_Job_flopsAnyAvg(ctx, field) - case "memBwAvg": - return ec.fieldContext_Job_memBwAvg(ctx, field) - case "loadAvg": - return ec.fieldContext_Job_loadAvg(ctx, field) + case "footprint": + return ec.fieldContext_Job_footprint(ctx, field) case "metaData": return ec.fieldContext_Job_metaData(ctx, field) case "userData": @@ -7034,6 +6897,47 @@ func (ec *executionContext) fieldContext_MetricStatistics_max(ctx context.Contex return fc, nil } +func (ec *executionContext) _MetricValue_name(ctx context.Context, field graphql.CollectedField, obj *schema.MetricValue) (ret graphql.Marshaler) { + fc, err := ec.fieldContext_MetricValue_name(ctx, field) + if err != nil { + return graphql.Null + } + ctx = graphql.WithFieldContext(ctx, fc) + defer func() { + if r := recover(); r != nil { + ec.Error(ctx, ec.Recover(ctx, r)) + ret = graphql.Null + } + }() + resTmp, err := ec.ResolverMiddleware(ctx, func(rctx context.Context) (interface{}, error) { + ctx = rctx // use context from middleware stack in children + return ec.resolvers.MetricValue().Name(rctx, obj) + }) + if err != nil { + ec.Error(ctx, err) + return graphql.Null + } + if resTmp == nil { + return graphql.Null + } + res := resTmp.(*string) + fc.Result = res + return ec.marshalOString2ᚖstring(ctx, field.Selections, res) +} + +func (ec *executionContext) fieldContext_MetricValue_name(ctx context.Context, field graphql.CollectedField) (fc *graphql.FieldContext, err error) { + fc = &graphql.FieldContext{ + Object: "MetricValue", + Field: field, + IsMethod: true, + IsResolver: true, + Child: func(ctx context.Context, field graphql.CollectedField) (*graphql.FieldContext, error) { + return nil, errors.New("field of type String does not have child fields") + }, + } + return fc, nil +} + func (ec *executionContext) _MetricValue_unit(ctx context.Context, field graphql.CollectedField, obj *schema.MetricValue) (ret graphql.Marshaler) { fc, err := ec.fieldContext_MetricValue_unit(ctx, field) if err != nil { @@ -7869,14 +7773,8 @@ func (ec *executionContext) fieldContext_Query_job(ctx context.Context, field gr return ec.fieldContext_Job_resources(ctx, field) case "concurrentJobs": return ec.fieldContext_Job_concurrentJobs(ctx, field) - case "memUsedMax": - return ec.fieldContext_Job_memUsedMax(ctx, field) - case "flopsAnyAvg": - return ec.fieldContext_Job_flopsAnyAvg(ctx, field) - case "memBwAvg": - return ec.fieldContext_Job_memBwAvg(ctx, field) - case "loadAvg": - return ec.fieldContext_Job_loadAvg(ctx, field) + case "footprint": + return ec.fieldContext_Job_footprint(ctx, field) case "metaData": return ec.fieldContext_Job_metaData(ctx, field) case "userData": @@ -9249,6 +9147,8 @@ func (ec *executionContext) fieldContext_SubCluster_flopRateScalar(ctx context.C IsResolver: false, Child: func(ctx context.Context, field graphql.CollectedField) (*graphql.FieldContext, error) { switch field.Name { + case "name": + return ec.fieldContext_MetricValue_name(ctx, field) case "unit": return ec.fieldContext_MetricValue_unit(ctx, field) case "value": @@ -9299,6 +9199,8 @@ func (ec *executionContext) fieldContext_SubCluster_flopRateSimd(ctx context.Con IsResolver: false, Child: func(ctx context.Context, field graphql.CollectedField) (*graphql.FieldContext, error) { switch field.Name { + case "name": + return ec.fieldContext_MetricValue_name(ctx, field) case "unit": return ec.fieldContext_MetricValue_unit(ctx, field) case "value": @@ -9349,6 +9251,8 @@ func (ec *executionContext) fieldContext_SubCluster_memoryBandwidth(ctx context. IsResolver: false, Child: func(ctx context.Context, field graphql.CollectedField) (*graphql.FieldContext, error) { switch field.Name { + case "name": + return ec.fieldContext_MetricValue_name(ctx, field) case "unit": return ec.fieldContext_MetricValue_unit(ctx, field) case "value": @@ -13159,14 +13063,39 @@ func (ec *executionContext) _Job(ctx context.Context, sel ast.SelectionSet, obj } out.Concurrently(i, func(ctx context.Context) graphql.Marshaler { return innerFunc(ctx, out) }) - case "memUsedMax": - out.Values[i] = ec._Job_memUsedMax(ctx, field, obj) - case "flopsAnyAvg": - out.Values[i] = ec._Job_flopsAnyAvg(ctx, field, obj) - case "memBwAvg": - out.Values[i] = ec._Job_memBwAvg(ctx, field, obj) - case "loadAvg": - out.Values[i] = ec._Job_loadAvg(ctx, field, obj) + case "footprint": + field := field + + innerFunc := func(ctx context.Context, fs *graphql.FieldSet) (res graphql.Marshaler) { + defer func() { + if r := recover(); r != nil { + ec.Error(ctx, ec.Recover(ctx, r)) + } + }() + res = ec._Job_footprint(ctx, field, obj) + return res + } + + if field.Deferrable != nil { + dfs, ok := deferred[field.Deferrable.Label] + di := 0 + if ok { + dfs.AddField(field) + di = len(dfs.Values) - 1 + } else { + dfs = graphql.NewFieldSet([]graphql.CollectedField{field}) + deferred[field.Deferrable.Label] = dfs + } + dfs.Concurrently(di, func(ctx context.Context) graphql.Marshaler { + return innerFunc(ctx, dfs) + }) + + // don't run the out.Concurrently() call below + out.Values[i] = graphql.Null + continue + } + + out.Concurrently(i, func(ctx context.Context) graphql.Marshaler { return innerFunc(ctx, out) }) case "metaData": field := field @@ -13879,15 +13808,48 @@ func (ec *executionContext) _MetricValue(ctx context.Context, sel ast.SelectionS switch field.Name { case "__typename": out.Values[i] = graphql.MarshalString("MetricValue") + case "name": + field := field + + innerFunc := func(ctx context.Context, fs *graphql.FieldSet) (res graphql.Marshaler) { + defer func() { + if r := recover(); r != nil { + ec.Error(ctx, ec.Recover(ctx, r)) + } + }() + res = ec._MetricValue_name(ctx, field, obj) + return res + } + + if field.Deferrable != nil { + dfs, ok := deferred[field.Deferrable.Label] + di := 0 + if ok { + dfs.AddField(field) + di = len(dfs.Values) - 1 + } else { + dfs = graphql.NewFieldSet([]graphql.CollectedField{field}) + deferred[field.Deferrable.Label] = dfs + } + dfs.Concurrently(di, func(ctx context.Context) graphql.Marshaler { + return innerFunc(ctx, dfs) + }) + + // don't run the out.Concurrently() call below + out.Values[i] = graphql.Null + continue + } + + out.Concurrently(i, func(ctx context.Context) graphql.Marshaler { return innerFunc(ctx, out) }) case "unit": out.Values[i] = ec._MetricValue_unit(ctx, field, obj) if out.Values[i] == graphql.Null { - out.Invalids++ + atomic.AddUint32(&out.Invalids, 1) } case "value": out.Values[i] = ec._MetricValue_value(ctx, field, obj) if out.Values[i] == graphql.Null { - out.Invalids++ + atomic.AddUint32(&out.Invalids, 1) } default: panic("unknown field " + strconv.Quote(field.Name)) @@ -17279,6 +17241,54 @@ func (ec *executionContext) marshalOMetricStatistics2githubᚗcomᚋClusterCockp return ec._MetricStatistics(ctx, sel, &v) } +func (ec *executionContext) marshalOMetricValue2ᚕᚖgithubᚗcomᚋClusterCockpitᚋccᚑbackendᚋpkgᚋschemaᚐMetricValue(ctx context.Context, sel ast.SelectionSet, v []*schema.MetricValue) graphql.Marshaler { + if v == nil { + return graphql.Null + } + ret := make(graphql.Array, len(v)) + var wg sync.WaitGroup + isLen1 := len(v) == 1 + if !isLen1 { + wg.Add(len(v)) + } + for i := range v { + i := i + fc := &graphql.FieldContext{ + Index: &i, + Result: &v[i], + } + ctx := graphql.WithFieldContext(ctx, fc) + f := func(i int) { + defer func() { + if r := recover(); r != nil { + ec.Error(ctx, ec.Recover(ctx, r)) + ret = nil + } + }() + if !isLen1 { + defer wg.Done() + } + ret[i] = ec.marshalOMetricValue2ᚖgithubᚗcomᚋClusterCockpitᚋccᚑbackendᚋpkgᚋschemaᚐMetricValue(ctx, sel, v[i]) + } + if isLen1 { + f(i) + } else { + go f(i) + } + + } + wg.Wait() + + return ret +} + +func (ec *executionContext) marshalOMetricValue2ᚖgithubᚗcomᚋClusterCockpitᚋccᚑbackendᚋpkgᚋschemaᚐMetricValue(ctx context.Context, sel ast.SelectionSet, v *schema.MetricValue) graphql.Marshaler { + if v == nil { + return graphql.Null + } + return ec._MetricValue(ctx, sel, v) +} + func (ec *executionContext) unmarshalOOrderByInput2ᚖgithubᚗcomᚋClusterCockpitᚋccᚑbackendᚋinternalᚋgraphᚋmodelᚐOrderByInput(ctx context.Context, v interface{}) (*model.OrderByInput, error) { if v == nil { return nil, nil diff --git a/internal/graph/schema.resolvers.go b/internal/graph/schema.resolvers.go index 5f55139..21d5194 100644 --- a/internal/graph/schema.resolvers.go +++ b/internal/graph/schema.resolvers.go @@ -44,6 +44,11 @@ func (r *jobResolver) ConcurrentJobs(ctx context.Context, obj *schema.Job) (*mod return nil, nil } +// Footprint is the resolver for the footprint field. +func (r *jobResolver) Footprint(ctx context.Context, obj *schema.Job) ([]*schema.MetricValue, error) { + panic(fmt.Errorf("not implemented: Footprint - footprint")) +} + // MetaData is the resolver for the metaData field. func (r *jobResolver) MetaData(ctx context.Context, obj *schema.Job) (interface{}, error) { return r.Repo.FetchMetadata(obj) @@ -54,6 +59,11 @@ func (r *jobResolver) UserData(ctx context.Context, obj *schema.Job) (*model.Use return repository.GetUserRepository().FetchUserInCtx(ctx, obj.User) } +// Name is the resolver for the name field. +func (r *metricValueResolver) Name(ctx context.Context, obj *schema.MetricValue) (*string, error) { + panic(fmt.Errorf("not implemented: Name - name")) +} + // CreateTag is the resolver for the createTag field. func (r *mutationResolver) CreateTag(ctx context.Context, typeArg string, name string) (*schema.Tag, error) { id, err := r.Repo.CreateTag(typeArg, name) @@ -392,6 +402,9 @@ func (r *Resolver) Cluster() generated.ClusterResolver { return &clusterResolver // Job returns generated.JobResolver implementation. func (r *Resolver) Job() generated.JobResolver { return &jobResolver{r} } +// MetricValue returns generated.MetricValueResolver implementation. +func (r *Resolver) MetricValue() generated.MetricValueResolver { return &metricValueResolver{r} } + // Mutation returns generated.MutationResolver implementation. func (r *Resolver) Mutation() generated.MutationResolver { return &mutationResolver{r} } @@ -403,6 +416,7 @@ func (r *Resolver) SubCluster() generated.SubClusterResolver { return &subCluste type clusterResolver struct{ *Resolver } type jobResolver struct{ *Resolver } +type metricValueResolver struct{ *Resolver } type mutationResolver struct{ *Resolver } type queryResolver struct{ *Resolver } type subClusterResolver struct{ *Resolver } diff --git a/internal/importer/handleImport.go b/internal/importer/handleImport.go index 2d507a2..fd1b2f4 100644 --- a/internal/importer/handleImport.go +++ b/internal/importer/handleImport.go @@ -86,13 +86,13 @@ func HandleImportFlag(flag string) error { StartTimeUnix: jobMeta.StartTime, } - // TODO: Other metrics... - job.LoadAvg = loadJobStat(&jobMeta, "cpu_load") - job.FlopsAnyAvg = loadJobStat(&jobMeta, "flops_any") - job.MemUsedMax = loadJobStat(&jobMeta, "mem_used") - job.MemBwAvg = loadJobStat(&jobMeta, "mem_bw") - job.NetBwAvg = loadJobStat(&jobMeta, "net_bw") - job.FileBwAvg = loadJobStat(&jobMeta, "file_bw") + // TODO: Do loop for new sub structure for stats + // job.LoadAvg = loadJobStat(&jobMeta, "cpu_load") + // job.FlopsAnyAvg = loadJobStat(&jobMeta, "flops_any") + // job.MemUsedMax = loadJobStat(&jobMeta, "mem_used") + // job.MemBwAvg = loadJobStat(&jobMeta, "mem_bw") + // job.NetBwAvg = loadJobStat(&jobMeta, "net_bw") + // job.FileBwAvg = loadJobStat(&jobMeta, "file_bw") job.RawResources, err = json.Marshal(job.Resources) if err != nil { diff --git a/internal/importer/initDB.go b/internal/importer/initDB.go index 0e7a6bb..0055768 100644 --- a/internal/importer/initDB.go +++ b/internal/importer/initDB.go @@ -60,13 +60,13 @@ func InitDB() error { StartTimeUnix: jobMeta.StartTime, } - // TODO: Other metrics... - job.LoadAvg = loadJobStat(jobMeta, "cpu_load") - job.FlopsAnyAvg = loadJobStat(jobMeta, "flops_any") - job.MemUsedMax = loadJobStat(jobMeta, "mem_used") - job.MemBwAvg = loadJobStat(jobMeta, "mem_bw") - job.NetBwAvg = loadJobStat(jobMeta, "net_bw") - job.FileBwAvg = loadJobStat(jobMeta, "file_bw") + // TODO: Convert to loop for new footprint layout + // job.LoadAvg = loadJobStat(jobMeta, "cpu_load") + // job.FlopsAnyAvg = loadJobStat(jobMeta, "flops_any") + // job.MemUsedMax = loadJobStat(jobMeta, "mem_used") + // job.MemBwAvg = loadJobStat(jobMeta, "mem_bw") + // job.NetBwAvg = loadJobStat(jobMeta, "net_bw") + // job.FileBwAvg = loadJobStat(jobMeta, "file_bw") job.RawResources, err = json.Marshal(job.Resources) if err != nil { From bd89ce7cc92ebb520efc3e13e33e7461e6529822 Mon Sep 17 00:00:00 2001 From: Jan Eitzinger Date: Tue, 2 Jul 2024 10:13:11 +0200 Subject: [PATCH 06/10] Extend schema and start Unit test implementation Does not compile and work yet --- pkg/archive/clusterConfig.go | 39 +- pkg/archive/clusterConfig_test.go | 15 + pkg/archive/fsBackend_test.go | 5 +- .../testdata/archive/alex/cluster.json | 2774 +++++++++++++++++ .../testdata/archive/fritz/cluster.json | 746 +++++ pkg/schema/cluster.go | 22 +- 6 files changed, 3580 insertions(+), 21 deletions(-) create mode 100644 pkg/archive/clusterConfig_test.go create mode 100644 pkg/archive/testdata/archive/alex/cluster.json create mode 100644 pkg/archive/testdata/archive/fritz/cluster.json diff --git a/pkg/archive/clusterConfig.go b/pkg/archive/clusterConfig.go index d0bf397..14c9fd9 100644 --- a/pkg/archive/clusterConfig.go +++ b/pkg/archive/clusterConfig.go @@ -12,11 +12,12 @@ import ( "github.com/ClusterCockpit/cc-backend/pkg/schema" ) -var Clusters []*schema.Cluster -var nodeLists map[string]map[string]NodeList +var ( + Clusters []*schema.Cluster + nodeLists map[string]map[string]NodeList +) func initClusterConfig() error { - Clusters = []*schema.Cluster{} nodeLists = map[string]map[string]NodeList{} @@ -49,6 +50,32 @@ func initClusterConfig() error { if !mc.Scope.Valid() { return errors.New("cluster.metricConfig.scope must be a valid scope ('node', 'scocket', ...)") } + + scLookup := make(map[string]*schema.SubClusterConfig) + + for _, scc := range mc.SubClusters { + scLookup[scc.Name] = scc + } + + for _, sc := range cluster.SubClusters { + newMetric := mc + + if cfg, ok := scLookup[sc.Name]; ok { + if !cfg.Remove { + newMetric.Peak = cfg.Peak + newMetric.Peak = cfg.Peak + newMetric.Normal = cfg.Normal + newMetric.Caution = cfg.Caution + newMetric.Alert = cfg.Alert + newMetric.Footprint = cfg.Footprint + sc.MetricConfig = append(sc.MetricConfig, *newMetric) + } + } + + if newMetric.Footprint { + sc.Footprint = append(sc.Footprint, newMetric.Name) + } + } } Clusters = append(Clusters, cluster) @@ -71,7 +98,6 @@ func initClusterConfig() error { } func GetCluster(cluster string) *schema.Cluster { - for _, c := range Clusters { if c.Name == cluster { return c @@ -90,11 +116,10 @@ func GetSubCluster(cluster, subcluster string) (*schema.SubCluster, error) { } } } - return nil, fmt.Errorf("Subcluster '%v' not found for cluster '%v', or cluster '%v' not configured!", subcluster, cluster, cluster) + return nil, fmt.Errorf("subcluster '%v' not found for cluster '%v', or cluster '%v' not configured", subcluster, cluster, cluster) } func GetMetricConfig(cluster, metric string) *schema.MetricConfig { - for _, c := range Clusters { if c.Name == cluster { for _, m := range c.MetricConfig { @@ -110,7 +135,6 @@ func GetMetricConfig(cluster, metric string) *schema.MetricConfig { // AssignSubCluster sets the `job.subcluster` property of the job based // on its cluster and resources. func AssignSubCluster(job *schema.BaseJob) error { - cluster := GetCluster(job.Cluster) if cluster == nil { return fmt.Errorf("ARCHIVE/CLUSTERCONFIG > unkown cluster: %v", job.Cluster) @@ -146,7 +170,6 @@ func AssignSubCluster(job *schema.BaseJob) error { } func GetSubClusterByNode(cluster, hostname string) (string, error) { - for sc, nl := range nodeLists[cluster] { if nl != nil && nl.Contains(hostname) { return sc, nil diff --git a/pkg/archive/clusterConfig_test.go b/pkg/archive/clusterConfig_test.go new file mode 100644 index 0000000..b4dc6ef --- /dev/null +++ b/pkg/archive/clusterConfig_test.go @@ -0,0 +1,15 @@ +// Copyright (C) NHR@FAU, University Erlangen-Nuremberg. +// All rights reserved. +// Use of this source code is governed by a MIT-style +// license that can be found in the LICENSE file. +package archive + +import ( + "encoding/json" + "testing" +) + +func TestClusterConfig(t *testing.T) { + var fsa FsArchive + version, err := fsa.Init(json.RawMessage("{\"path\":\"testdata/archive\"}")) +} diff --git a/pkg/archive/fsBackend_test.go b/pkg/archive/fsBackend_test.go index 5e0a06c..78d634a 100644 --- a/pkg/archive/fsBackend_test.go +++ b/pkg/archive/fsBackend_test.go @@ -30,6 +30,7 @@ func TestInitNoJson(t *testing.T) { t.Fatal(err) } } + func TestInitNotExists(t *testing.T) { var fsa FsArchive _, err := fsa.Init(json.RawMessage("{\"path\":\"testdata/job-archive\"}")) @@ -50,7 +51,7 @@ func TestInit(t *testing.T) { if version != 1 { t.Fail() } - if len(fsa.clusters) != 1 || fsa.clusters[0] != "emmy" { + if len(fsa.clusters) != 3 || fsa.clusters[0] != "emmy" { t.Fail() } } @@ -133,7 +134,6 @@ func TestLoadJobData(t *testing.T) { } func BenchmarkLoadJobData(b *testing.B) { - tmpdir := b.TempDir() jobarchive := filepath.Join(tmpdir, "job-archive") util.CopyDir("./testdata/archive/", jobarchive) @@ -157,7 +157,6 @@ func BenchmarkLoadJobData(b *testing.B) { } func BenchmarkLoadJobDataCompressed(b *testing.B) { - tmpdir := b.TempDir() jobarchive := filepath.Join(tmpdir, "job-archive") util.CopyDir("./testdata/archive/", jobarchive) diff --git a/pkg/archive/testdata/archive/alex/cluster.json b/pkg/archive/testdata/archive/alex/cluster.json new file mode 100644 index 0000000..5a04a78 --- /dev/null +++ b/pkg/archive/testdata/archive/alex/cluster.json @@ -0,0 +1,2774 @@ +{ + "name": "alex", + "metricConfig": [ + { + "name": "cpu_load", + "unit": { + "base": "" + }, + "scope": "node", + "aggregation": "avg", + "timestep": 60, + "peak": 128, + "normal": 128, + "caution": 10, + "alert": 5 + }, + { + "name": "cpu_user", + "unit": { + "base": "" + }, + "scope": "hwthread", + "aggregation": "avg", + "timestep": 60, + "peak": 100, + "normal": 50, + "caution": 20, + "alert": 10 + }, + { + "name": "mem_used", + "unit": { + "base": "B", + "prefix": "G" + }, + "scope": "node", + "aggregation": "sum", + "timestep": 60, + "peak": 512, + "normal": 128, + "caution": 200, + "alert": 240 + }, + { + "name": "flops_any", + "unit": { + "base": "F/s", + "prefix": "G" + }, + "scope": "hwthread", + "aggregation": "sum", + "timestep": 60, + "peak": 9216, + "normal": 1000, + "caution": 200, + "alert": 50 + }, + { + "name": "mem_bw", + "unit": { + "base": "B/s", + "prefix": "G" + }, + "scope": "socket", + "aggregation": "sum", + "timestep": 60, + "peak": 350, + "normal": 100, + "caution": 50, + "alert": 10 + }, + { + "name": "clock", + "unit": { + "base": "Hz", + "prefix": "M" + }, + "scope": "hwthread", + "aggregation": "avg", + "timestep": 60, + "peak": 3000, + "normal": 2400, + "caution": 1800, + "alert": 1200 + }, + { + "name": "core_power", + "unit": { + "base": "W" + }, + "scope": "hwthread", + "aggregation": "sum", + "timestep": 60, + "peak": 500, + "normal": 250, + "caution": 100, + "alert": 50 + }, + { + "name": "acc_utilization", + "unit": { + "base": "" + }, + "scope": "accelerator", + "aggregation": "avg", + "timestep": 60, + "peak": 100, + "normal": 80, + "caution": 50, + "alert": 20 + }, + { + "name": "acc_mem_used", + "unit": { + "base": "B", + "prefix": "G" + }, + "scope": "accelerator", + "aggregation": "sum", + "timestep": 60, + "peak": 40, + "normal": 20, + "caution": 10, + "alert": 5 + }, + { + "name": "acc_power", + "unit": { + "base": "W" + }, + "scope": "accelerator", + "aggregation": "sum", + "timestep": 60, + "peak": 400, + "normal": 200, + "caution": 50, + "alert": 20 + }, + { + "name": "nv_mem_util", + "unit": { + "base": "" + }, + "scope": "accelerator", + "aggregation": "avg", + "timestep": 60, + "peak": 100, + "normal": 80, + "caution": 20, + "alert": 10 + }, + { + "name": "nv_temp", + "unit": { + "base": "B", + "prefix": "G" + }, + "scope": "accelerator", + "aggregation": "sum", + "timestep": 60, + "peak": 40, + "normal": 20, + "caution": 5, + "alert": 2 + }, + { + "name": "nv_sm_clock", + "unit": { + "base": "Hz", + "prefix": "M" + }, + "scope": "accelerator", + "aggregation": "sum", + "timestep": 60, + "peak": 1400, + "normal": 1200, + "caution": 100, + "alert": 50 + }, + { + "name": "cpu_power", + "unit": { + "base": "W" + }, + "scope": "socket", + "aggregation": "sum", + "timestep": 60, + "peak": 500, + "normal": 250, + "caution": 100, + "alert": 50 + }, + { + "name": "ipc", + "unit": { + "base": "IPC" + }, + "scope": "hwthread", + "aggregation": "avg", + "timestep": 60, + "peak": 4, + "normal": 2, + "caution": 1, + "alert": 0.5 + } + ], + "subClusters": [ + { + "name": "a40", + "nodes": "a[0121-0129],a[0221-0229],a[0321-0329],a[0421-0429],a[0521-0522]", + "processorType": "AMD Milan", + "socketsPerNode": 2, + "coresPerSocket": 64, + "threadsPerCore": 1, + "flopRateScalar": { + "unit": { + "base": "F/s", + "prefix": "G" + }, + "value": 432 + }, + "flopRateSimd": { + "unit": { + "base": "F/s", + "prefix": "G" + }, + "value": 9216 + }, + "memoryBandwidth": { + "unit": { + "base": "B/s", + "prefix": "G" + }, + "value": 400 + }, + "topology": { + "node": [ + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 11, + 12, + 13, + 14, + 15, + 16, + 17, + 18, + 19, + 20, + 21, + 22, + 23, + 24, + 25, + 26, + 27, + 28, + 29, + 30, + 31, + 32, + 33, + 34, + 35, + 36, + 37, + 38, + 39, + 40, + 41, + 42, + 43, + 44, + 45, + 46, + 47, + 48, + 49, + 50, + 51, + 52, + 53, + 54, + 55, + 56, + 57, + 58, + 59, + 60, + 61, + 62, + 63, + 64, + 65, + 66, + 67, + 68, + 69, + 70, + 71, + 72, + 73, + 74, + 75, + 76, + 77, + 78, + 79, + 80, + 81, + 82, + 83, + 84, + 85, + 86, + 87, + 88, + 89, + 90, + 91, + 92, + 93, + 94, + 95, + 96, + 97, + 98, + 99, + 100, + 101, + 102, + 103, + 104, + 105, + 106, + 107, + 108, + 109, + 110, + 111, + 112, + 113, + 114, + 115, + 116, + 117, + 118, + 119, + 120, + 121, + 122, + 123, + 124, + 125, + 126, + 127 + ], + "socket": [ + [ + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 11, + 12, + 13, + 14, + 15, + 16, + 17, + 18, + 19, + 20, + 21, + 22, + 23, + 24, + 25, + 26, + 27, + 28, + 29, + 30, + 31, + 32, + 33, + 34, + 35, + 36, + 37, + 38, + 39, + 40, + 41, + 42, + 43, + 44, + 45, + 46, + 47, + 48, + 49, + 50, + 51, + 52, + 53, + 54, + 55, + 56, + 57, + 58, + 59, + 60, + 61, + 62, + 63 + ], + [ + 64, + 65, + 66, + 67, + 68, + 69, + 70, + 71, + 72, + 73, + 74, + 75, + 76, + 77, + 78, + 79, + 80, + 81, + 82, + 83, + 84, + 85, + 86, + 87, + 88, + 89, + 90, + 91, + 92, + 93, + 94, + 95, + 96, + 97, + 98, + 99, + 100, + 101, + 102, + 103, + 104, + 105, + 106, + 107, + 108, + 109, + 110, + 111, + 112, + 113, + 114, + 115, + 116, + 117, + 118, + 119, + 120, + 121, + 122, + 123, + 124, + 125, + 126, + 127 + ] + ], + "memoryDomain": [ + [ + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 11, + 12, + 13, + 14, + 15, + 16, + 17, + 18, + 19, + 20, + 21, + 22, + 23, + 24, + 25, + 26, + 27, + 28, + 29, + 30, + 31, + 32, + 33, + 34, + 35, + 36, + 37, + 38, + 39, + 40, + 41, + 42, + 43, + 44, + 45, + 46, + 47, + 48, + 49, + 50, + 51, + 52, + 53, + 54, + 55, + 56, + 57, + 58, + 59, + 60, + 61, + 62, + 63, + 64, + 65, + 66, + 67, + 68, + 69, + 70, + 71, + 72, + 73, + 74, + 75, + 76, + 77, + 78, + 79, + 80, + 81, + 82, + 83, + 84, + 85, + 86, + 87, + 88, + 89, + 90, + 91, + 92, + 93, + 94, + 95, + 96, + 97, + 98, + 99, + 100, + 101, + 102, + 103, + 104, + 105, + 106, + 107, + 108, + 109, + 110, + 111, + 112, + 113, + 114, + 115, + 116, + 117, + 118, + 119, + 120, + 121, + 122, + 123, + 124, + 125, + 126, + 127 + ] + ], + "core": [ + [ + 0 + ], + [ + 1 + ], + [ + 2 + ], + [ + 3 + ], + [ + 4 + ], + [ + 5 + ], + [ + 6 + ], + [ + 7 + ], + [ + 8 + ], + [ + 9 + ], + [ + 10 + ], + [ + 11 + ], + [ + 12 + ], + [ + 13 + ], + [ + 14 + ], + [ + 15 + ], + [ + 16 + ], + [ + 17 + ], + [ + 18 + ], + [ + 19 + ], + [ + 20 + ], + [ + 21 + ], + [ + 22 + ], + [ + 23 + ], + [ + 24 + ], + [ + 25 + ], + [ + 26 + ], + [ + 27 + ], + [ + 28 + ], + [ + 29 + ], + [ + 30 + ], + [ + 31 + ], + [ + 32 + ], + [ + 33 + ], + [ + 34 + ], + [ + 35 + ], + [ + 36 + ], + [ + 37 + ], + [ + 38 + ], + [ + 39 + ], + [ + 40 + ], + [ + 41 + ], + [ + 42 + ], + [ + 43 + ], + [ + 44 + ], + [ + 45 + ], + [ + 46 + ], + [ + 47 + ], + [ + 48 + ], + [ + 49 + ], + [ + 50 + ], + [ + 51 + ], + [ + 52 + ], + [ + 53 + ], + [ + 54 + ], + [ + 55 + ], + [ + 56 + ], + [ + 57 + ], + [ + 58 + ], + [ + 59 + ], + [ + 60 + ], + [ + 61 + ], + [ + 62 + ], + [ + 63 + ], + [ + 64 + ], + [ + 65 + ], + [ + 66 + ], + [ + 67 + ], + [ + 68 + ], + [ + 69 + ], + [ + 70 + ], + [ + 71 + ], + [ + 72 + ], + [ + 73 + ], + [ + 74 + ], + [ + 75 + ], + [ + 76 + ], + [ + 77 + ], + [ + 78 + ], + [ + 79 + ], + [ + 80 + ], + [ + 81 + ], + [ + 82 + ], + [ + 83 + ], + [ + 84 + ], + [ + 85 + ], + [ + 86 + ], + [ + 87 + ], + [ + 88 + ], + [ + 89 + ], + [ + 90 + ], + [ + 91 + ], + [ + 92 + ], + [ + 93 + ], + [ + 94 + ], + [ + 95 + ], + [ + 96 + ], + [ + 97 + ], + [ + 98 + ], + [ + 99 + ], + [ + 100 + ], + [ + 101 + ], + [ + 102 + ], + [ + 103 + ], + [ + 104 + ], + [ + 105 + ], + [ + 106 + ], + [ + 107 + ], + [ + 108 + ], + [ + 109 + ], + [ + 110 + ], + [ + 111 + ], + [ + 112 + ], + [ + 113 + ], + [ + 114 + ], + [ + 115 + ], + [ + 116 + ], + [ + 117 + ], + [ + 118 + ], + [ + 119 + ], + [ + 120 + ], + [ + 121 + ], + [ + 122 + ], + [ + 123 + ], + [ + 124 + ], + [ + 125 + ], + [ + 126 + ], + [ + 127 + ] + ], + "accelerators": [ + { + "id": "00000000:01:00.0", + "type": "Nvidia GPU", + "model": "A40" + }, + { + "id": "00000000:25:00.0", + "type": "Nvidia GPU", + "model": "A40" + }, + { + "id": "00000000:41:00.0", + "type": "Nvidia GPU", + "model": "A40" + }, + { + "id": "00000000:61:00.0", + "type": "Nvidia GPU", + "model": "A40" + }, + { + "id": "00000000:81:00.0", + "type": "Nvidia GPU", + "model": "A40" + }, + { + "id": "00000000:A1:00.0", + "type": "Nvidia GPU", + "model": "A40" + }, + { + "id": "00000000:C1:00.0", + "type": "Nvidia GPU", + "model": "A40" + }, + { + "id": "00000000:E1:00.0", + "type": "Nvidia GPU", + "model": "A40" + } + ] + } + }, + { + "name": "a100", + "nodes": "a[0601-0605],a[0701-0705],a[0801-0805],a[0901-0905]", + "processorType": "AMD Milan", + "socketsPerNode": 2, + "coresPerSocket": 64, + "threadsPerCore": 1, + "flopRateScalar": { + "unit": { + "base": "F/s", + "prefix": "G" + }, + "value": 432 + }, + "flopRateSimd": { + "unit": { + "base": "F/s", + "prefix": "G" + }, + "value": 9216 + }, + "memoryBandwidth": { + "unit": { + "base": "B/s", + "prefix": "G" + }, + "value": 400 + }, + "topology": { + "node": [ + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 11, + 12, + 13, + 14, + 15, + 16, + 17, + 18, + 19, + 20, + 21, + 22, + 23, + 24, + 25, + 26, + 27, + 28, + 29, + 30, + 31, + 32, + 33, + 34, + 35, + 36, + 37, + 38, + 39, + 40, + 41, + 42, + 43, + 44, + 45, + 46, + 47, + 48, + 49, + 50, + 51, + 52, + 53, + 54, + 55, + 56, + 57, + 58, + 59, + 60, + 61, + 62, + 63, + 64, + 65, + 66, + 67, + 68, + 69, + 70, + 71, + 72, + 73, + 74, + 75, + 76, + 77, + 78, + 79, + 80, + 81, + 82, + 83, + 84, + 85, + 86, + 87, + 88, + 89, + 90, + 91, + 92, + 93, + 94, + 95, + 96, + 97, + 98, + 99, + 100, + 101, + 102, + 103, + 104, + 105, + 106, + 107, + 108, + 109, + 110, + 111, + 112, + 113, + 114, + 115, + 116, + 117, + 118, + 119, + 120, + 121, + 122, + 123, + 124, + 125, + 126, + 127 + ], + "socket": [ + [ + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 11, + 12, + 13, + 14, + 15, + 16, + 17, + 18, + 19, + 20, + 21, + 22, + 23, + 24, + 25, + 26, + 27, + 28, + 29, + 30, + 31, + 32, + 33, + 34, + 35, + 36, + 37, + 38, + 39, + 40, + 41, + 42, + 43, + 44, + 45, + 46, + 47, + 48, + 49, + 50, + 51, + 52, + 53, + 54, + 55, + 56, + 57, + 58, + 59, + 60, + 61, + 62, + 63 + ], + [ + 64, + 65, + 66, + 67, + 68, + 69, + 70, + 71, + 72, + 73, + 74, + 75, + 76, + 77, + 78, + 79, + 80, + 81, + 82, + 83, + 84, + 85, + 86, + 87, + 88, + 89, + 90, + 91, + 92, + 93, + 94, + 95, + 96, + 97, + 98, + 99, + 100, + 101, + 102, + 103, + 104, + 105, + 106, + 107, + 108, + 109, + 110, + 111, + 112, + 113, + 114, + 115, + 116, + 117, + 118, + 119, + 120, + 121, + 122, + 123, + 124, + 125, + 126, + 127 + ] + ], + "memoryDomain": [ + [ + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 11, + 12, + 13, + 14, + 15, + 16, + 17, + 18, + 19, + 20, + 21, + 22, + 23, + 24, + 25, + 26, + 27, + 28, + 29, + 30, + 31, + 32, + 33, + 34, + 35, + 36, + 37, + 38, + 39, + 40, + 41, + 42, + 43, + 44, + 45, + 46, + 47, + 48, + 49, + 50, + 51, + 52, + 53, + 54, + 55, + 56, + 57, + 58, + 59, + 60, + 61, + 62, + 63, + 64, + 65, + 66, + 67, + 68, + 69, + 70, + 71, + 72, + 73, + 74, + 75, + 76, + 77, + 78, + 79, + 80, + 81, + 82, + 83, + 84, + 85, + 86, + 87, + 88, + 89, + 90, + 91, + 92, + 93, + 94, + 95, + 96, + 97, + 98, + 99, + 100, + 101, + 102, + 103, + 104, + 105, + 106, + 107, + 108, + 109, + 110, + 111, + 112, + 113, + 114, + 115, + 116, + 117, + 118, + 119, + 120, + 121, + 122, + 123, + 124, + 125, + 126, + 127 + ] + ], + "core": [ + [ + 0 + ], + [ + 1 + ], + [ + 2 + ], + [ + 3 + ], + [ + 4 + ], + [ + 5 + ], + [ + 6 + ], + [ + 7 + ], + [ + 8 + ], + [ + 9 + ], + [ + 10 + ], + [ + 11 + ], + [ + 12 + ], + [ + 13 + ], + [ + 14 + ], + [ + 15 + ], + [ + 16 + ], + [ + 17 + ], + [ + 18 + ], + [ + 19 + ], + [ + 20 + ], + [ + 21 + ], + [ + 22 + ], + [ + 23 + ], + [ + 24 + ], + [ + 25 + ], + [ + 26 + ], + [ + 27 + ], + [ + 28 + ], + [ + 29 + ], + [ + 30 + ], + [ + 31 + ], + [ + 32 + ], + [ + 33 + ], + [ + 34 + ], + [ + 35 + ], + [ + 36 + ], + [ + 37 + ], + [ + 38 + ], + [ + 39 + ], + [ + 40 + ], + [ + 41 + ], + [ + 42 + ], + [ + 43 + ], + [ + 44 + ], + [ + 45 + ], + [ + 46 + ], + [ + 47 + ], + [ + 48 + ], + [ + 49 + ], + [ + 50 + ], + [ + 51 + ], + [ + 52 + ], + [ + 53 + ], + [ + 54 + ], + [ + 55 + ], + [ + 56 + ], + [ + 57 + ], + [ + 58 + ], + [ + 59 + ], + [ + 60 + ], + [ + 61 + ], + [ + 62 + ], + [ + 63 + ], + [ + 64 + ], + [ + 65 + ], + [ + 66 + ], + [ + 67 + ], + [ + 68 + ], + [ + 69 + ], + [ + 70 + ], + [ + 71 + ], + [ + 72 + ], + [ + 73 + ], + [ + 74 + ], + [ + 75 + ], + [ + 76 + ], + [ + 77 + ], + [ + 78 + ], + [ + 79 + ], + [ + 80 + ], + [ + 81 + ], + [ + 82 + ], + [ + 83 + ], + [ + 84 + ], + [ + 85 + ], + [ + 86 + ], + [ + 87 + ], + [ + 88 + ], + [ + 89 + ], + [ + 90 + ], + [ + 91 + ], + [ + 92 + ], + [ + 93 + ], + [ + 94 + ], + [ + 95 + ], + [ + 96 + ], + [ + 97 + ], + [ + 98 + ], + [ + 99 + ], + [ + 100 + ], + [ + 101 + ], + [ + 102 + ], + [ + 103 + ], + [ + 104 + ], + [ + 105 + ], + [ + 106 + ], + [ + 107 + ], + [ + 108 + ], + [ + 109 + ], + [ + 110 + ], + [ + 111 + ], + [ + 112 + ], + [ + 113 + ], + [ + 114 + ], + [ + 115 + ], + [ + 116 + ], + [ + 117 + ], + [ + 118 + ], + [ + 119 + ], + [ + 120 + ], + [ + 121 + ], + [ + 122 + ], + [ + 123 + ], + [ + 124 + ], + [ + 125 + ], + [ + 126 + ], + [ + 127 + ] + ], + "accelerators": [ + { + "id": "00000000:0E:00.0", + "type": "Nvidia GPU", + "model": "A100" + }, + { + "id": "00000000:13:00.0", + "type": "Nvidia GPU", + "model": "A100" + }, + { + "id": "00000000:49:00.0", + "type": "Nvidia GPU", + "model": "A100" + }, + { + "id": "00000000:4F:00.0", + "type": "Nvidia GPU", + "model": "A100" + }, + { + "id": "00000000:90:00.0", + "type": "Nvidia GPU", + "model": "A100" + }, + { + "id": "00000000:96:00.0", + "type": "Nvidia GPU", + "model": "A100" + }, + { + "id": "00000000:CC:00.0", + "type": "Nvidia GPU", + "model": "A100" + }, + { + "id": "00000000:D1:00.0", + "type": "Nvidia GPU", + "model": "A100" + } + ] + } + }, + { + "name": "a100m80", + "nodes": "a[0531-0537],a0831,a[0931-0934]", + "processorType": "AMD Milan", + "socketsPerNode": 2, + "coresPerSocket": 64, + "threadsPerCore": 1, + "flopRateScalar": { + "unit": { + "base": "F/s", + "prefix": "G" + }, + "value": 432 + }, + "flopRateSimd": { + "unit": { + "base": "F/s", + "prefix": "G" + }, + "value": 9216 + }, + "memoryBandwidth": { + "unit": { + "base": "B/s", + "prefix": "G" + }, + "value": 400 + }, + "topology": { + "node": [ + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 11, + 12, + 13, + 14, + 15, + 16, + 17, + 18, + 19, + 20, + 21, + 22, + 23, + 24, + 25, + 26, + 27, + 28, + 29, + 30, + 31, + 32, + 33, + 34, + 35, + 36, + 37, + 38, + 39, + 40, + 41, + 42, + 43, + 44, + 45, + 46, + 47, + 48, + 49, + 50, + 51, + 52, + 53, + 54, + 55, + 56, + 57, + 58, + 59, + 60, + 61, + 62, + 63, + 64, + 65, + 66, + 67, + 68, + 69, + 70, + 71, + 72, + 73, + 74, + 75, + 76, + 77, + 78, + 79, + 80, + 81, + 82, + 83, + 84, + 85, + 86, + 87, + 88, + 89, + 90, + 91, + 92, + 93, + 94, + 95, + 96, + 97, + 98, + 99, + 100, + 101, + 102, + 103, + 104, + 105, + 106, + 107, + 108, + 109, + 110, + 111, + 112, + 113, + 114, + 115, + 116, + 117, + 118, + 119, + 120, + 121, + 122, + 123, + 124, + 125, + 126, + 127 + ], + "socket": [ + [ + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 11, + 12, + 13, + 14, + 15, + 16, + 17, + 18, + 19, + 20, + 21, + 22, + 23, + 24, + 25, + 26, + 27, + 28, + 29, + 30, + 31, + 32, + 33, + 34, + 35, + 36, + 37, + 38, + 39, + 40, + 41, + 42, + 43, + 44, + 45, + 46, + 47, + 48, + 49, + 50, + 51, + 52, + 53, + 54, + 55, + 56, + 57, + 58, + 59, + 60, + 61, + 62, + 63 + ], + [ + 64, + 65, + 66, + 67, + 68, + 69, + 70, + 71, + 72, + 73, + 74, + 75, + 76, + 77, + 78, + 79, + 80, + 81, + 82, + 83, + 84, + 85, + 86, + 87, + 88, + 89, + 90, + 91, + 92, + 93, + 94, + 95, + 96, + 97, + 98, + 99, + 100, + 101, + 102, + 103, + 104, + 105, + 106, + 107, + 108, + 109, + 110, + 111, + 112, + 113, + 114, + 115, + 116, + 117, + 118, + 119, + 120, + 121, + 122, + 123, + 124, + 125, + 126, + 127 + ] + ], + "memoryDomain": [ + [ + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 11, + 12, + 13, + 14, + 15, + 16, + 17, + 18, + 19, + 20, + 21, + 22, + 23, + 24, + 25, + 26, + 27, + 28, + 29, + 30, + 31, + 32, + 33, + 34, + 35, + 36, + 37, + 38, + 39, + 40, + 41, + 42, + 43, + 44, + 45, + 46, + 47, + 48, + 49, + 50, + 51, + 52, + 53, + 54, + 55, + 56, + 57, + 58, + 59, + 60, + 61, + 62, + 63, + 64, + 65, + 66, + 67, + 68, + 69, + 70, + 71, + 72, + 73, + 74, + 75, + 76, + 77, + 78, + 79, + 80, + 81, + 82, + 83, + 84, + 85, + 86, + 87, + 88, + 89, + 90, + 91, + 92, + 93, + 94, + 95, + 96, + 97, + 98, + 99, + 100, + 101, + 102, + 103, + 104, + 105, + 106, + 107, + 108, + 109, + 110, + 111, + 112, + 113, + 114, + 115, + 116, + 117, + 118, + 119, + 120, + 121, + 122, + 123, + 124, + 125, + 126, + 127 + ] + ], + "core": [ + [ + 0 + ], + [ + 1 + ], + [ + 2 + ], + [ + 3 + ], + [ + 4 + ], + [ + 5 + ], + [ + 6 + ], + [ + 7 + ], + [ + 8 + ], + [ + 9 + ], + [ + 10 + ], + [ + 11 + ], + [ + 12 + ], + [ + 13 + ], + [ + 14 + ], + [ + 15 + ], + [ + 16 + ], + [ + 17 + ], + [ + 18 + ], + [ + 19 + ], + [ + 20 + ], + [ + 21 + ], + [ + 22 + ], + [ + 23 + ], + [ + 24 + ], + [ + 25 + ], + [ + 26 + ], + [ + 27 + ], + [ + 28 + ], + [ + 29 + ], + [ + 30 + ], + [ + 31 + ], + [ + 32 + ], + [ + 33 + ], + [ + 34 + ], + [ + 35 + ], + [ + 36 + ], + [ + 37 + ], + [ + 38 + ], + [ + 39 + ], + [ + 40 + ], + [ + 41 + ], + [ + 42 + ], + [ + 43 + ], + [ + 44 + ], + [ + 45 + ], + [ + 46 + ], + [ + 47 + ], + [ + 48 + ], + [ + 49 + ], + [ + 50 + ], + [ + 51 + ], + [ + 52 + ], + [ + 53 + ], + [ + 54 + ], + [ + 55 + ], + [ + 56 + ], + [ + 57 + ], + [ + 58 + ], + [ + 59 + ], + [ + 60 + ], + [ + 61 + ], + [ + 62 + ], + [ + 63 + ], + [ + 64 + ], + [ + 65 + ], + [ + 66 + ], + [ + 67 + ], + [ + 68 + ], + [ + 69 + ], + [ + 70 + ], + [ + 71 + ], + [ + 72 + ], + [ + 73 + ], + [ + 74 + ], + [ + 75 + ], + [ + 76 + ], + [ + 77 + ], + [ + 78 + ], + [ + 79 + ], + [ + 80 + ], + [ + 81 + ], + [ + 82 + ], + [ + 83 + ], + [ + 84 + ], + [ + 85 + ], + [ + 86 + ], + [ + 87 + ], + [ + 88 + ], + [ + 89 + ], + [ + 90 + ], + [ + 91 + ], + [ + 92 + ], + [ + 93 + ], + [ + 94 + ], + [ + 95 + ], + [ + 96 + ], + [ + 97 + ], + [ + 98 + ], + [ + 99 + ], + [ + 100 + ], + [ + 101 + ], + [ + 102 + ], + [ + 103 + ], + [ + 104 + ], + [ + 105 + ], + [ + 106 + ], + [ + 107 + ], + [ + 108 + ], + [ + 109 + ], + [ + 110 + ], + [ + 111 + ], + [ + 112 + ], + [ + 113 + ], + [ + 114 + ], + [ + 115 + ], + [ + 116 + ], + [ + 117 + ], + [ + 118 + ], + [ + 119 + ], + [ + 120 + ], + [ + 121 + ], + [ + 122 + ], + [ + 123 + ], + [ + 124 + ], + [ + 125 + ], + [ + 126 + ], + [ + 127 + ] + ], + "accelerators": [ + { + "id": "00000000:0E:00.0", + "type": "Nvidia GPU", + "model": "A100" + }, + { + "id": "00000000:13:00.0", + "type": "Nvidia GPU", + "model": "A100" + }, + { + "id": "00000000:49:00.0", + "type": "Nvidia GPU", + "model": "A100" + }, + { + "id": "00000000:4F:00.0", + "type": "Nvidia GPU", + "model": "A100" + }, + { + "id": "00000000:90:00.0", + "type": "Nvidia GPU", + "model": "A100" + }, + { + "id": "00000000:96:00.0", + "type": "Nvidia GPU", + "model": "A100" + }, + { + "id": "00000000:CC:00.0", + "type": "Nvidia GPU", + "model": "A100" + }, + { + "id": "00000000:D1:00.0", + "type": "Nvidia GPU", + "model": "A100" + } + ] + } + } + ] +} diff --git a/pkg/archive/testdata/archive/fritz/cluster.json b/pkg/archive/testdata/archive/fritz/cluster.json new file mode 100644 index 0000000..23a8343 --- /dev/null +++ b/pkg/archive/testdata/archive/fritz/cluster.json @@ -0,0 +1,746 @@ +{ + "name": "fritz", + "metricConfig": [ + { + "name": "cpu_load", + "unit": { + "base": "" + }, + "scope": "node", + "aggregation": "avg", + "timestep": 60, + "peak": 72, + "normal": 72, + "caution": 36, + "alert": 20 + }, + { + "name": "cpu_user", + "unit": { + "base": "" + }, + "scope": "hwthread", + "aggregation": "avg", + "timestep": 60, + "peak": 100, + "normal": 50, + "caution": 20, + "alert": 10 + }, + { + "name": "mem_used", + "unit": { + "base": "B", + "prefix": "G" + }, + "scope": "node", + "aggregation": "sum", + "timestep": 60, + "peak": 256, + "normal": 128, + "caution": 200, + "alert": 240 + }, + { + "name": "flops_any", + "unit": { + "base": "F/s", + "prefix": "G" + }, + "scope": "hwthread", + "aggregation": "sum", + "timestep": 60, + "peak": 5600, + "normal": 1000, + "caution": 200, + "alert": 50 + }, + { + "name": "flops_sp", + "unit": { + "base": "F/s", + "prefix": "G" + }, + "scope": "hwthread", + "aggregation": "sum", + "timestep": 60, + "peak": 5600, + "normal": 1000, + "caution": 200, + "alert": 50 + }, + { + "name": "flops_dp", + "unit": { + "base": "F/s", + "prefix": "G" + }, + "scope": "hwthread", + "aggregation": "sum", + "timestep": 60, + "peak": 2300, + "normal": 500, + "caution": 100, + "alert": 50 + }, + { + "name": "mem_bw", + "unit": { + "base": "B/s", + "prefix": "G" + }, + "scope": "socket", + "aggregation": "sum", + "timestep": 60, + "peak": 350, + "normal": 100, + "caution": 50, + "alert": 10 + }, + { + "name": "clock", + "unit": { + "base": "Hz", + "prefix": "M" + }, + "scope": "hwthread", + "aggregation": "avg", + "timestep": 60, + "peak": 3000, + "normal": 2400, + "caution": 1800, + "alert": 1200 + }, + { + "name": "cpu_power", + "unit": { + "base": "W" + }, + "scope": "socket", + "aggregation": "sum", + "timestep": 60, + "peak": 500, + "normal": 250, + "caution": 100, + "alert": 50 + }, + { + "name": "mem_power", + "unit": { + "base": "W" + }, + "scope": "socket", + "aggregation": "sum", + "timestep": 60, + "peak": 100, + "normal": 50, + "caution": 20, + "alert": 10 + }, + { + "name": "ipc", + "unit": { + "base": "IPC" + }, + "scope": "hwthread", + "aggregation": "avg", + "timestep": 60, + "peak": 4, + "normal": 2, + "caution": 1, + "alert": 0.5 + }, + { + "name": "vectorization_ratio", + "unit": { + "base": "" + }, + "scope": "hwthread", + "aggregation": "avg", + "timestep": 60, + "peak": 100, + "normal": 60, + "caution": 40, + "alert": 10 + }, + { + "name": "ib_recv", + "unit": { + "base": "B/s" + }, + "scope": "node", + "aggregation": "sum", + "timestep": 60, + "peak": 1250000, + "normal": 6000000, + "caution": 200, + "alert": 1 + }, + { + "name": "ib_xmit", + "unit": { + "base": "B/s" + }, + "scope": "node", + "aggregation": "sum", + "timestep": 60, + "peak": 1250000, + "normal": 6000000, + "caution": 200, + "alert": 1 + }, + { + "name": "ib_recv_pkts", + "unit": { + "base": "" + }, + "scope": "node", + "aggregation": "sum", + "timestep": 60, + "peak": 6, + "normal": 4, + "caution": 2, + "alert": 1 + }, + { + "name": "ib_xmit_pkts", + "unit": { + "base": "" + }, + "scope": "node", + "aggregation": "sum", + "timestep": 60, + "peak": 6, + "normal": 4, + "caution": 2, + "alert": 1 + }, + { + "name": "nfs4_read", + "unit": { + "base": "B/s", + "prefix": "M" + }, + "scope": "node", + "aggregation": "sum", + "timestep": 60, + "peak": 6, + "normal": 4, + "caution": 2, + "alert": 1 + }, + { + "name": "nfs4_write", + "unit": { + "base": "B/s", + "prefix": "M" + }, + "scope": "node", + "aggregation": "sum", + "timestep": 60, + "peak": 6, + "normal": 4, + "caution": 2, + "alert": 1 + }, + { + "name": "nfs4_total", + "unit": { + "base": "B/s", + "prefix": "M" + }, + "scope": "node", + "aggregation": "sum", + "timestep": 60, + "peak": 6, + "normal": 4, + "caution": 2, + "alert": 1 + } + ], + "subClusters": [ + { + "name": "main", + "nodes": "f01[01-88],f02[01-88],f03[01-88],f03[01-88],f04[01-88],f05[01-88],f06[01-88],f07[01-88],f08[01-88],f09[01-88],f10[01-88],f11[01-56],f12[01-56]", + "processorType": "Intel Icelake", + "socketsPerNode": 2, + "coresPerSocket": 36, + "threadsPerCore": 1, + "flopRateScalar": { + "unit": { + "base": "F/s", + "prefix": "G" + }, + "value": 432 + }, + "flopRateSimd": { + "unit": { + "base": "F/s", + "prefix": "G" + }, + "value": 9216 + }, + "memoryBandwidth": { + "unit": { + "base": "B/s", + "prefix": "G" + }, + "value": 350 + }, + "topology": { + "node": [ + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 11, + 12, + 13, + 14, + 15, + 16, + 17, + 18, + 19, + 20, + 21, + 22, + 23, + 24, + 25, + 26, + 27, + 28, + 29, + 30, + 31, + 32, + 33, + 34, + 35, + 36, + 37, + 38, + 39, + 40, + 41, + 42, + 43, + 44, + 45, + 46, + 47, + 48, + 49, + 50, + 51, + 52, + 53, + 54, + 55, + 56, + 57, + 58, + 59, + 60, + 61, + 62, + 63, + 64, + 65, + 66, + 67, + 68, + 69, + 70, + 71 + ], + "socket": [ + [ + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 11, + 12, + 13, + 14, + 15, + 16, + 17, + 18, + 19, + 20, + 21, + 22, + 23, + 24, + 25, + 26, + 27, + 28, + 29, + 30, + 31, + 32, + 33, + 34, + 35 + ], + [ + 36, + 37, + 38, + 39, + 40, + 41, + 42, + 43, + 44, + 45, + 46, + 47, + 48, + 49, + 50, + 51, + 52, + 53, + 54, + 55, + 56, + 57, + 58, + 59, + 60, + 61, + 62, + 63, + 64, + 65, + 66, + 67, + 68, + 69, + 70, + 71 + ] + ], + "memoryDomain": [ + [ + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 11, + 12, + 13, + 14, + 15, + 16, + 17 + ], + [ + 18, + 19, + 20, + 21, + 22, + 23, + 24, + 25, + 26, + 27, + 28, + 29, + 30, + 31, + 32, + 33, + 34, + 35 + ], + [ + 36, + 37, + 38, + 39, + 40, + 41, + 42, + 43, + 44, + 45, + 46, + 47, + 48, + 49, + 50, + 51, + 52, + 53 + ], + [ + 54, + 55, + 56, + 57, + 58, + 59, + 60, + 61, + 62, + 63, + 64, + 65, + 66, + 67, + 68, + 69, + 70, + 71 + ] + ], + "core": [ + [ + 0 + ], + [ + 1 + ], + [ + 2 + ], + [ + 3 + ], + [ + 4 + ], + [ + 5 + ], + [ + 6 + ], + [ + 7 + ], + [ + 8 + ], + [ + 9 + ], + [ + 10 + ], + [ + 11 + ], + [ + 12 + ], + [ + 13 + ], + [ + 14 + ], + [ + 15 + ], + [ + 16 + ], + [ + 17 + ], + [ + 18 + ], + [ + 19 + ], + [ + 20 + ], + [ + 21 + ], + [ + 22 + ], + [ + 23 + ], + [ + 24 + ], + [ + 25 + ], + [ + 26 + ], + [ + 27 + ], + [ + 28 + ], + [ + 29 + ], + [ + 30 + ], + [ + 31 + ], + [ + 32 + ], + [ + 33 + ], + [ + 34 + ], + [ + 35 + ], + [ + 36 + ], + [ + 37 + ], + [ + 38 + ], + [ + 39 + ], + [ + 40 + ], + [ + 41 + ], + [ + 42 + ], + [ + 43 + ], + [ + 44 + ], + [ + 45 + ], + [ + 46 + ], + [ + 47 + ], + [ + 48 + ], + [ + 49 + ], + [ + 50 + ], + [ + 51 + ], + [ + 52 + ], + [ + 53 + ], + [ + 54 + ], + [ + 55 + ], + [ + 56 + ], + [ + 57 + ], + [ + 58 + ], + [ + 59 + ], + [ + 60 + ], + [ + 61 + ], + [ + 62 + ], + [ + 63 + ], + [ + 64 + ], + [ + 65 + ], + [ + 66 + ], + [ + 67 + ], + [ + 68 + ], + [ + 69 + ], + [ + 70 + ], + [ + 71 + ] + ] + } + } + ] +} diff --git a/pkg/schema/cluster.go b/pkg/schema/cluster.go index 6edd830..3bd05d9 100644 --- a/pkg/schema/cluster.go +++ b/pkg/schema/cluster.go @@ -30,16 +30,18 @@ type MetricValue struct { } type SubCluster struct { - Name string `json:"name"` - Nodes string `json:"nodes"` - ProcessorType string `json:"processorType"` - Topology Topology `json:"topology"` - FlopRateScalar MetricValue `json:"flopRateScalar"` - FlopRateSimd MetricValue `json:"flopRateSimd"` - MemoryBandwidth MetricValue `json:"memoryBandwidth"` - SocketsPerNode int `json:"socketsPerNode"` - CoresPerSocket int `json:"coresPerSocket"` - ThreadsPerCore int `json:"threadsPerCore"` + Name string `json:"name"` + Nodes string `json:"nodes"` + ProcessorType string `json:"processorType"` + Topology Topology `json:"topology"` + FlopRateScalar MetricValue `json:"flopRateScalar"` + FlopRateSimd MetricValue `json:"flopRateSimd"` + MemoryBandwidth MetricValue `json:"memoryBandwidth"` + MetricConfig []MetricConfig `json:"metricConfig,omitempty"` + Footprint []string `json:"footprint,omitempty"` + SocketsPerNode int `json:"socketsPerNode"` + CoresPerSocket int `json:"coresPerSocket"` + ThreadsPerCore int `json:"threadsPerCore"` } type SubClusterConfig struct { From b05909969f3a30ce423e90123810f6aa2db585b8 Mon Sep 17 00:00:00 2001 From: Jan Eitzinger Date: Wed, 3 Jul 2024 12:11:43 +0200 Subject: [PATCH 07/10] Add test for clusterConfig --- go.mod | 1 + pkg/archive/clusterConfig_test.go | 14 +++++++++++--- 2 files changed, 12 insertions(+), 3 deletions(-) diff --git a/go.mod b/go.mod index fddcfbc..07743ad 100644 --- a/go.mod +++ b/go.mod @@ -7,6 +7,7 @@ require ( github.com/ClusterCockpit/cc-units v0.4.0 github.com/Masterminds/squirrel v1.5.3 github.com/coreos/go-oidc/v3 v3.9.0 + github.com/davecgh/go-spew v1.1.1 github.com/go-co-op/gocron v1.25.0 github.com/go-ldap/ldap/v3 v3.4.4 github.com/go-sql-driver/mysql v1.7.0 diff --git a/pkg/archive/clusterConfig_test.go b/pkg/archive/clusterConfig_test.go index b4dc6ef..7734a72 100644 --- a/pkg/archive/clusterConfig_test.go +++ b/pkg/archive/clusterConfig_test.go @@ -2,14 +2,22 @@ // All rights reserved. // Use of this source code is governed by a MIT-style // license that can be found in the LICENSE file. -package archive +package archive_test import ( "encoding/json" "testing" + + "github.com/ClusterCockpit/cc-backend/pkg/archive" + "github.com/davecgh/go-spew/spew" ) func TestClusterConfig(t *testing.T) { - var fsa FsArchive - version, err := fsa.Init(json.RawMessage("{\"path\":\"testdata/archive\"}")) + if err := archive.Init(json.RawMessage("{\"kind\":\"testdata/archive\"}"), false); err != nil { + t.Fatal(err) + } + + c := archive.GetCluster("fritz") + spew.Dump(c) + t.Fail() } From 1b70596735fea5a13131f900cfe573aca2a15eb4 Mon Sep 17 00:00:00 2001 From: Jan Eitzinger Date: Thu, 4 Jul 2024 06:49:59 +0200 Subject: [PATCH 08/10] Fix and test subcluster Config --- pkg/archive/clusterConfig.go | 14 +- pkg/archive/clusterConfig_test.go | 17 +- .../testdata/archive/alex/cluster.json | 2332 +------------ .../testdata/archive/fritz/cluster.json | 2971 +++++++++++++---- 4 files changed, 2277 insertions(+), 3057 deletions(-) diff --git a/pkg/archive/clusterConfig.go b/pkg/archive/clusterConfig.go index 14c9fd9..43d131b 100644 --- a/pkg/archive/clusterConfig.go +++ b/pkg/archive/clusterConfig.go @@ -59,6 +59,7 @@ func initClusterConfig() error { for _, sc := range cluster.SubClusters { newMetric := mc + newMetric.SubClusters = nil if cfg, ok := scLookup[sc.Name]; ok { if !cfg.Remove { @@ -69,12 +70,19 @@ func initClusterConfig() error { newMetric.Alert = cfg.Alert newMetric.Footprint = cfg.Footprint sc.MetricConfig = append(sc.MetricConfig, *newMetric) + + if newMetric.Footprint { + sc.Footprint = append(sc.Footprint, newMetric.Name) + } + } + } else { + sc.MetricConfig = append(sc.MetricConfig, *newMetric) + + if newMetric.Footprint { + sc.Footprint = append(sc.Footprint, newMetric.Name) } } - if newMetric.Footprint { - sc.Footprint = append(sc.Footprint, newMetric.Name) - } } } diff --git a/pkg/archive/clusterConfig_test.go b/pkg/archive/clusterConfig_test.go index 7734a72..942d29c 100644 --- a/pkg/archive/clusterConfig_test.go +++ b/pkg/archive/clusterConfig_test.go @@ -9,15 +9,22 @@ import ( "testing" "github.com/ClusterCockpit/cc-backend/pkg/archive" - "github.com/davecgh/go-spew/spew" ) func TestClusterConfig(t *testing.T) { - if err := archive.Init(json.RawMessage("{\"kind\":\"testdata/archive\"}"), false); err != nil { + if err := archive.Init(json.RawMessage("{\"kind\": \"file\",\"path\": \"testdata/archive\"}"), false); err != nil { t.Fatal(err) } - c := archive.GetCluster("fritz") - spew.Dump(c) - t.Fail() + sc, err := archive.GetSubCluster("fritz", "spr1tb") + if err != nil { + t.Fatal(err) + } + // spew.Dump(sc.MetricConfig) + if len(sc.Footprint) != 3 { + t.Fail() + } + if len(sc.MetricConfig) != 15 { + t.Fail() + } } diff --git a/pkg/archive/testdata/archive/alex/cluster.json b/pkg/archive/testdata/archive/alex/cluster.json index 5a04a78..89af72d 100644 --- a/pkg/archive/testdata/archive/alex/cluster.json +++ b/pkg/archive/testdata/archive/alex/cluster.json @@ -44,7 +44,7 @@ { "name": "flops_any", "unit": { - "base": "F/s", + "base": "Flops/s", "prefix": "G" }, "scope": "hwthread", @@ -152,11 +152,10 @@ { "name": "nv_temp", "unit": { - "base": "B", - "prefix": "G" + "base": "°C" }, "scope": "accelerator", - "aggregation": "sum", + "aggregation": "avg", "timestep": 60, "peak": 40, "normal": 20, @@ -170,7 +169,7 @@ "prefix": "M" }, "scope": "accelerator", - "aggregation": "sum", + "aggregation": "avg", "timestep": 60, "peak": 1400, "normal": 1200, @@ -207,7 +206,7 @@ "subClusters": [ { "name": "a40", - "nodes": "a[0121-0129],a[0221-0229],a[0321-0329],a[0421-0429],a[0521-0522]", + "nodes": "a[0121-0129],a[0221-0229],a[0321-0329],a[0421-0429],a[0521-0522],a[1621-1624],a[1721-1722]", "processorType": "AMD Milan", "socketsPerNode": 2, "coresPerSocket": 64, @@ -235,786 +234,23 @@ }, "topology": { "node": [ - 0, - 1, - 2, - 3, - 4, - 5, - 6, - 7, - 8, - 9, - 10, - 11, - 12, - 13, - 14, - 15, - 16, - 17, - 18, - 19, - 20, - 21, - 22, - 23, - 24, - 25, - 26, - 27, - 28, - 29, - 30, - 31, - 32, - 33, - 34, - 35, - 36, - 37, - 38, - 39, - 40, - 41, - 42, - 43, - 44, - 45, - 46, - 47, - 48, - 49, - 50, - 51, - 52, - 53, - 54, - 55, - 56, - 57, - 58, - 59, - 60, - 61, - 62, - 63, - 64, - 65, - 66, - 67, - 68, - 69, - 70, - 71, - 72, - 73, - 74, - 75, - 76, - 77, - 78, - 79, - 80, - 81, - 82, - 83, - 84, - 85, - 86, - 87, - 88, - 89, - 90, - 91, - 92, - 93, - 94, - 95, - 96, - 97, - 98, - 99, - 100, - 101, - 102, - 103, - 104, - 105, - 106, - 107, - 108, - 109, - 110, - 111, - 112, - 113, - 114, - 115, - 116, - 117, - 118, - 119, - 120, - 121, - 122, - 123, - 124, - 125, - 126, - 127 + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127 ], "socket": [ [ - 0, - 1, - 2, - 3, - 4, - 5, - 6, - 7, - 8, - 9, - 10, - 11, - 12, - 13, - 14, - 15, - 16, - 17, - 18, - 19, - 20, - 21, - 22, - 23, - 24, - 25, - 26, - 27, - 28, - 29, - 30, - 31, - 32, - 33, - 34, - 35, - 36, - 37, - 38, - 39, - 40, - 41, - 42, - 43, - 44, - 45, - 46, - 47, - 48, - 49, - 50, - 51, - 52, - 53, - 54, - 55, - 56, - 57, - 58, - 59, - 60, - 61, - 62, - 63 + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63 ], [ - 64, - 65, - 66, - 67, - 68, - 69, - 70, - 71, - 72, - 73, - 74, - 75, - 76, - 77, - 78, - 79, - 80, - 81, - 82, - 83, - 84, - 85, - 86, - 87, - 88, - 89, - 90, - 91, - 92, - 93, - 94, - 95, - 96, - 97, - 98, - 99, - 100, - 101, - 102, - 103, - 104, - 105, - 106, - 107, - 108, - 109, - 110, - 111, - 112, - 113, - 114, - 115, - 116, - 117, - 118, - 119, - 120, - 121, - 122, - 123, - 124, - 125, - 126, - 127 + 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127 ] ], "memoryDomain": [ [ - 0, - 1, - 2, - 3, - 4, - 5, - 6, - 7, - 8, - 9, - 10, - 11, - 12, - 13, - 14, - 15, - 16, - 17, - 18, - 19, - 20, - 21, - 22, - 23, - 24, - 25, - 26, - 27, - 28, - 29, - 30, - 31, - 32, - 33, - 34, - 35, - 36, - 37, - 38, - 39, - 40, - 41, - 42, - 43, - 44, - 45, - 46, - 47, - 48, - 49, - 50, - 51, - 52, - 53, - 54, - 55, - 56, - 57, - 58, - 59, - 60, - 61, - 62, - 63, - 64, - 65, - 66, - 67, - 68, - 69, - 70, - 71, - 72, - 73, - 74, - 75, - 76, - 77, - 78, - 79, - 80, - 81, - 82, - 83, - 84, - 85, - 86, - 87, - 88, - 89, - 90, - 91, - 92, - 93, - 94, - 95, - 96, - 97, - 98, - 99, - 100, - 101, - 102, - 103, - 104, - 105, - 106, - 107, - 108, - 109, - 110, - 111, - 112, - 113, - 114, - 115, - 116, - 117, - 118, - 119, - 120, - 121, - 122, - 123, - 124, - 125, - 126, - 127 + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127 ] ], "core": [ - [ - 0 - ], - [ - 1 - ], - [ - 2 - ], - [ - 3 - ], - [ - 4 - ], - [ - 5 - ], - [ - 6 - ], - [ - 7 - ], - [ - 8 - ], - [ - 9 - ], - [ - 10 - ], - [ - 11 - ], - [ - 12 - ], - [ - 13 - ], - [ - 14 - ], - [ - 15 - ], - [ - 16 - ], - [ - 17 - ], - [ - 18 - ], - [ - 19 - ], - [ - 20 - ], - [ - 21 - ], - [ - 22 - ], - [ - 23 - ], - [ - 24 - ], - [ - 25 - ], - [ - 26 - ], - [ - 27 - ], - [ - 28 - ], - [ - 29 - ], - [ - 30 - ], - [ - 31 - ], - [ - 32 - ], - [ - 33 - ], - [ - 34 - ], - [ - 35 - ], - [ - 36 - ], - [ - 37 - ], - [ - 38 - ], - [ - 39 - ], - [ - 40 - ], - [ - 41 - ], - [ - 42 - ], - [ - 43 - ], - [ - 44 - ], - [ - 45 - ], - [ - 46 - ], - [ - 47 - ], - [ - 48 - ], - [ - 49 - ], - [ - 50 - ], - [ - 51 - ], - [ - 52 - ], - [ - 53 - ], - [ - 54 - ], - [ - 55 - ], - [ - 56 - ], - [ - 57 - ], - [ - 58 - ], - [ - 59 - ], - [ - 60 - ], - [ - 61 - ], - [ - 62 - ], - [ - 63 - ], - [ - 64 - ], - [ - 65 - ], - [ - 66 - ], - [ - 67 - ], - [ - 68 - ], - [ - 69 - ], - [ - 70 - ], - [ - 71 - ], - [ - 72 - ], - [ - 73 - ], - [ - 74 - ], - [ - 75 - ], - [ - 76 - ], - [ - 77 - ], - [ - 78 - ], - [ - 79 - ], - [ - 80 - ], - [ - 81 - ], - [ - 82 - ], - [ - 83 - ], - [ - 84 - ], - [ - 85 - ], - [ - 86 - ], - [ - 87 - ], - [ - 88 - ], - [ - 89 - ], - [ - 90 - ], - [ - 91 - ], - [ - 92 - ], - [ - 93 - ], - [ - 94 - ], - [ - 95 - ], - [ - 96 - ], - [ - 97 - ], - [ - 98 - ], - [ - 99 - ], - [ - 100 - ], - [ - 101 - ], - [ - 102 - ], - [ - 103 - ], - [ - 104 - ], - [ - 105 - ], - [ - 106 - ], - [ - 107 - ], - [ - 108 - ], - [ - 109 - ], - [ - 110 - ], - [ - 111 - ], - [ - 112 - ], - [ - 113 - ], - [ - 114 - ], - [ - 115 - ], - [ - 116 - ], - [ - 117 - ], - [ - 118 - ], - [ - 119 - ], - [ - 120 - ], - [ - 121 - ], - [ - 122 - ], - [ - 123 - ], - [ - 124 - ], - [ - 125 - ], - [ - 126 - ], - [ - 127 - ] + [ 0 ], [ 1 ], [ 2 ], [ 3 ], [ 4 ], [ 5 ], [ 6 ], [ 7 ], [ 8 ], [ 9 ], [ 10 ], [ 11 ], [ 12 ], [ 13 ], [ 14 ], [ 15 ], [ 16 ], [ 17 ], [ 18 ], [ 19 ], [ 20 ], [ 21 ], [ 22 ], [ 23 ], [ 24 ], [ 25 ], [ 26 ], [ 27 ], [ 28 ], [ 29 ], [ 30 ], [ 31 ], [ 32 ], [ 33 ], [ 34 ], [ 35 ], [ 36 ], [ 37 ], [ 38 ], [ 39 ], [ 40 ], [ 41 ], [ 42 ], [ 43 ], [ 44 ], [ 45 ], [ 46 ], [ 47 ], [ 48 ], [ 49 ], [ 50 ], [ 51 ], [ 52 ], [ 53 ], [ 54 ], [ 55 ], [ 56 ], [ 57 ], [ 58 ], [ 59 ], [ 60 ], [ 61 ], [ 62 ], [ 63 ], [ 64 ], [ 65 ], [ 66 ], [ 67 ], [ 68 ], [ 69 ], [ 70 ], [ 71 ], [ 73 ], [ 74 ], [ 75 ], [ 76 ], [ 77 ], [ 78 ], [ 79 ], [ 80 ], [ 81 ], [ 82 ], [ 83 ], [ 84 ], [ 85 ], [ 86 ], [ 87 ], [ 88 ], [ 89 ], [ 90 ], [ 91 ], [ 92 ], [ 93 ], [ 94 ], [ 95 ], [ 96 ], [ 97 ], [ 98 ], [ 99 ], [ 100 ], [ 101 ], [ 102 ], [ 103 ], [ 104 ], [ 105 ], [ 106 ], [ 107 ], [ 108 ], [ 109 ], [ 110 ], [ 111 ], [ 112 ], [ 113 ], [ 114 ], [ 115 ], [ 116 ], [ 117 ], [ 118 ], [ 119 ], [ 120 ], [ 121 ], [ 122 ], [ 123 ], [ 124 ], [ 125 ], [ 126 ], [ 127 ] ], "accelerators": [ { @@ -1090,786 +326,23 @@ }, "topology": { "node": [ - 0, - 1, - 2, - 3, - 4, - 5, - 6, - 7, - 8, - 9, - 10, - 11, - 12, - 13, - 14, - 15, - 16, - 17, - 18, - 19, - 20, - 21, - 22, - 23, - 24, - 25, - 26, - 27, - 28, - 29, - 30, - 31, - 32, - 33, - 34, - 35, - 36, - 37, - 38, - 39, - 40, - 41, - 42, - 43, - 44, - 45, - 46, - 47, - 48, - 49, - 50, - 51, - 52, - 53, - 54, - 55, - 56, - 57, - 58, - 59, - 60, - 61, - 62, - 63, - 64, - 65, - 66, - 67, - 68, - 69, - 70, - 71, - 72, - 73, - 74, - 75, - 76, - 77, - 78, - 79, - 80, - 81, - 82, - 83, - 84, - 85, - 86, - 87, - 88, - 89, - 90, - 91, - 92, - 93, - 94, - 95, - 96, - 97, - 98, - 99, - 100, - 101, - 102, - 103, - 104, - 105, - 106, - 107, - 108, - 109, - 110, - 111, - 112, - 113, - 114, - 115, - 116, - 117, - 118, - 119, - 120, - 121, - 122, - 123, - 124, - 125, - 126, - 127 + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127 ], "socket": [ [ - 0, - 1, - 2, - 3, - 4, - 5, - 6, - 7, - 8, - 9, - 10, - 11, - 12, - 13, - 14, - 15, - 16, - 17, - 18, - 19, - 20, - 21, - 22, - 23, - 24, - 25, - 26, - 27, - 28, - 29, - 30, - 31, - 32, - 33, - 34, - 35, - 36, - 37, - 38, - 39, - 40, - 41, - 42, - 43, - 44, - 45, - 46, - 47, - 48, - 49, - 50, - 51, - 52, - 53, - 54, - 55, - 56, - 57, - 58, - 59, - 60, - 61, - 62, - 63 + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63 ], [ - 64, - 65, - 66, - 67, - 68, - 69, - 70, - 71, - 72, - 73, - 74, - 75, - 76, - 77, - 78, - 79, - 80, - 81, - 82, - 83, - 84, - 85, - 86, - 87, - 88, - 89, - 90, - 91, - 92, - 93, - 94, - 95, - 96, - 97, - 98, - 99, - 100, - 101, - 102, - 103, - 104, - 105, - 106, - 107, - 108, - 109, - 110, - 111, - 112, - 113, - 114, - 115, - 116, - 117, - 118, - 119, - 120, - 121, - 122, - 123, - 124, - 125, - 126, - 127 + 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127 ] ], "memoryDomain": [ [ - 0, - 1, - 2, - 3, - 4, - 5, - 6, - 7, - 8, - 9, - 10, - 11, - 12, - 13, - 14, - 15, - 16, - 17, - 18, - 19, - 20, - 21, - 22, - 23, - 24, - 25, - 26, - 27, - 28, - 29, - 30, - 31, - 32, - 33, - 34, - 35, - 36, - 37, - 38, - 39, - 40, - 41, - 42, - 43, - 44, - 45, - 46, - 47, - 48, - 49, - 50, - 51, - 52, - 53, - 54, - 55, - 56, - 57, - 58, - 59, - 60, - 61, - 62, - 63, - 64, - 65, - 66, - 67, - 68, - 69, - 70, - 71, - 72, - 73, - 74, - 75, - 76, - 77, - 78, - 79, - 80, - 81, - 82, - 83, - 84, - 85, - 86, - 87, - 88, - 89, - 90, - 91, - 92, - 93, - 94, - 95, - 96, - 97, - 98, - 99, - 100, - 101, - 102, - 103, - 104, - 105, - 106, - 107, - 108, - 109, - 110, - 111, - 112, - 113, - 114, - 115, - 116, - 117, - 118, - 119, - 120, - 121, - 122, - 123, - 124, - 125, - 126, - 127 + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127 ] ], "core": [ - [ - 0 - ], - [ - 1 - ], - [ - 2 - ], - [ - 3 - ], - [ - 4 - ], - [ - 5 - ], - [ - 6 - ], - [ - 7 - ], - [ - 8 - ], - [ - 9 - ], - [ - 10 - ], - [ - 11 - ], - [ - 12 - ], - [ - 13 - ], - [ - 14 - ], - [ - 15 - ], - [ - 16 - ], - [ - 17 - ], - [ - 18 - ], - [ - 19 - ], - [ - 20 - ], - [ - 21 - ], - [ - 22 - ], - [ - 23 - ], - [ - 24 - ], - [ - 25 - ], - [ - 26 - ], - [ - 27 - ], - [ - 28 - ], - [ - 29 - ], - [ - 30 - ], - [ - 31 - ], - [ - 32 - ], - [ - 33 - ], - [ - 34 - ], - [ - 35 - ], - [ - 36 - ], - [ - 37 - ], - [ - 38 - ], - [ - 39 - ], - [ - 40 - ], - [ - 41 - ], - [ - 42 - ], - [ - 43 - ], - [ - 44 - ], - [ - 45 - ], - [ - 46 - ], - [ - 47 - ], - [ - 48 - ], - [ - 49 - ], - [ - 50 - ], - [ - 51 - ], - [ - 52 - ], - [ - 53 - ], - [ - 54 - ], - [ - 55 - ], - [ - 56 - ], - [ - 57 - ], - [ - 58 - ], - [ - 59 - ], - [ - 60 - ], - [ - 61 - ], - [ - 62 - ], - [ - 63 - ], - [ - 64 - ], - [ - 65 - ], - [ - 66 - ], - [ - 67 - ], - [ - 68 - ], - [ - 69 - ], - [ - 70 - ], - [ - 71 - ], - [ - 72 - ], - [ - 73 - ], - [ - 74 - ], - [ - 75 - ], - [ - 76 - ], - [ - 77 - ], - [ - 78 - ], - [ - 79 - ], - [ - 80 - ], - [ - 81 - ], - [ - 82 - ], - [ - 83 - ], - [ - 84 - ], - [ - 85 - ], - [ - 86 - ], - [ - 87 - ], - [ - 88 - ], - [ - 89 - ], - [ - 90 - ], - [ - 91 - ], - [ - 92 - ], - [ - 93 - ], - [ - 94 - ], - [ - 95 - ], - [ - 96 - ], - [ - 97 - ], - [ - 98 - ], - [ - 99 - ], - [ - 100 - ], - [ - 101 - ], - [ - 102 - ], - [ - 103 - ], - [ - 104 - ], - [ - 105 - ], - [ - 106 - ], - [ - 107 - ], - [ - 108 - ], - [ - 109 - ], - [ - 110 - ], - [ - 111 - ], - [ - 112 - ], - [ - 113 - ], - [ - 114 - ], - [ - 115 - ], - [ - 116 - ], - [ - 117 - ], - [ - 118 - ], - [ - 119 - ], - [ - 120 - ], - [ - 121 - ], - [ - 122 - ], - [ - 123 - ], - [ - 124 - ], - [ - 125 - ], - [ - 126 - ], - [ - 127 - ] + [ 0 ], [ 1 ], [ 2 ], [ 3 ], [ 4 ], [ 5 ], [ 6 ], [ 7 ], [ 8 ], [ 9 ], [ 10 ], [ 11 ], [ 12 ], [ 13 ], [ 14 ], [ 15 ], [ 16 ], [ 17 ], [ 18 ], [ 19 ], [ 20 ], [ 21 ], [ 22 ], [ 23 ], [ 24 ], [ 25 ], [ 26 ], [ 27 ], [ 28 ], [ 29 ], [ 30 ], [ 31 ], [ 32 ], [ 33 ], [ 34 ], [ 35 ], [ 36 ], [ 37 ], [ 38 ], [ 39 ], [ 40 ], [ 41 ], [ 42 ], [ 43 ], [ 44 ], [ 45 ], [ 46 ], [ 47 ], [ 48 ], [ 49 ], [ 50 ], [ 51 ], [ 52 ], [ 53 ], [ 54 ], [ 55 ], [ 56 ], [ 57 ], [ 58 ], [ 59 ], [ 60 ], [ 61 ], [ 62 ], [ 63 ], [ 64 ], [ 65 ], [ 66 ], [ 67 ], [ 68 ], [ 69 ], [ 70 ], [ 71 ], [ 73 ], [ 74 ], [ 75 ], [ 76 ], [ 77 ], [ 78 ], [ 79 ], [ 80 ], [ 81 ], [ 82 ], [ 83 ], [ 84 ], [ 85 ], [ 86 ], [ 87 ], [ 88 ], [ 89 ], [ 90 ], [ 91 ], [ 92 ], [ 93 ], [ 94 ], [ 95 ], [ 96 ], [ 97 ], [ 98 ], [ 99 ], [ 100 ], [ 101 ], [ 102 ], [ 103 ], [ 104 ], [ 105 ], [ 106 ], [ 107 ], [ 108 ], [ 109 ], [ 110 ], [ 111 ], [ 112 ], [ 113 ], [ 114 ], [ 115 ], [ 116 ], [ 117 ], [ 118 ], [ 119 ], [ 120 ], [ 121 ], [ 122 ], [ 123 ], [ 124 ], [ 125 ], [ 126 ], [ 127 ] ], "accelerators": [ { @@ -1917,7 +390,7 @@ }, { "name": "a100m80", - "nodes": "a[0531-0537],a0831,a[0931-0934]", + "nodes": "a[0531-0537],a[0631-0633],a0831,a[0931-0934]", "processorType": "AMD Milan", "socketsPerNode": 2, "coresPerSocket": 64, @@ -1945,786 +418,23 @@ }, "topology": { "node": [ - 0, - 1, - 2, - 3, - 4, - 5, - 6, - 7, - 8, - 9, - 10, - 11, - 12, - 13, - 14, - 15, - 16, - 17, - 18, - 19, - 20, - 21, - 22, - 23, - 24, - 25, - 26, - 27, - 28, - 29, - 30, - 31, - 32, - 33, - 34, - 35, - 36, - 37, - 38, - 39, - 40, - 41, - 42, - 43, - 44, - 45, - 46, - 47, - 48, - 49, - 50, - 51, - 52, - 53, - 54, - 55, - 56, - 57, - 58, - 59, - 60, - 61, - 62, - 63, - 64, - 65, - 66, - 67, - 68, - 69, - 70, - 71, - 72, - 73, - 74, - 75, - 76, - 77, - 78, - 79, - 80, - 81, - 82, - 83, - 84, - 85, - 86, - 87, - 88, - 89, - 90, - 91, - 92, - 93, - 94, - 95, - 96, - 97, - 98, - 99, - 100, - 101, - 102, - 103, - 104, - 105, - 106, - 107, - 108, - 109, - 110, - 111, - 112, - 113, - 114, - 115, - 116, - 117, - 118, - 119, - 120, - 121, - 122, - 123, - 124, - 125, - 126, - 127 + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127 ], "socket": [ [ - 0, - 1, - 2, - 3, - 4, - 5, - 6, - 7, - 8, - 9, - 10, - 11, - 12, - 13, - 14, - 15, - 16, - 17, - 18, - 19, - 20, - 21, - 22, - 23, - 24, - 25, - 26, - 27, - 28, - 29, - 30, - 31, - 32, - 33, - 34, - 35, - 36, - 37, - 38, - 39, - 40, - 41, - 42, - 43, - 44, - 45, - 46, - 47, - 48, - 49, - 50, - 51, - 52, - 53, - 54, - 55, - 56, - 57, - 58, - 59, - 60, - 61, - 62, - 63 + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63 ], [ - 64, - 65, - 66, - 67, - 68, - 69, - 70, - 71, - 72, - 73, - 74, - 75, - 76, - 77, - 78, - 79, - 80, - 81, - 82, - 83, - 84, - 85, - 86, - 87, - 88, - 89, - 90, - 91, - 92, - 93, - 94, - 95, - 96, - 97, - 98, - 99, - 100, - 101, - 102, - 103, - 104, - 105, - 106, - 107, - 108, - 109, - 110, - 111, - 112, - 113, - 114, - 115, - 116, - 117, - 118, - 119, - 120, - 121, - 122, - 123, - 124, - 125, - 126, - 127 + 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127 ] ], "memoryDomain": [ [ - 0, - 1, - 2, - 3, - 4, - 5, - 6, - 7, - 8, - 9, - 10, - 11, - 12, - 13, - 14, - 15, - 16, - 17, - 18, - 19, - 20, - 21, - 22, - 23, - 24, - 25, - 26, - 27, - 28, - 29, - 30, - 31, - 32, - 33, - 34, - 35, - 36, - 37, - 38, - 39, - 40, - 41, - 42, - 43, - 44, - 45, - 46, - 47, - 48, - 49, - 50, - 51, - 52, - 53, - 54, - 55, - 56, - 57, - 58, - 59, - 60, - 61, - 62, - 63, - 64, - 65, - 66, - 67, - 68, - 69, - 70, - 71, - 72, - 73, - 74, - 75, - 76, - 77, - 78, - 79, - 80, - 81, - 82, - 83, - 84, - 85, - 86, - 87, - 88, - 89, - 90, - 91, - 92, - 93, - 94, - 95, - 96, - 97, - 98, - 99, - 100, - 101, - 102, - 103, - 104, - 105, - 106, - 107, - 108, - 109, - 110, - 111, - 112, - 113, - 114, - 115, - 116, - 117, - 118, - 119, - 120, - 121, - 122, - 123, - 124, - 125, - 126, - 127 + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127 ] ], "core": [ - [ - 0 - ], - [ - 1 - ], - [ - 2 - ], - [ - 3 - ], - [ - 4 - ], - [ - 5 - ], - [ - 6 - ], - [ - 7 - ], - [ - 8 - ], - [ - 9 - ], - [ - 10 - ], - [ - 11 - ], - [ - 12 - ], - [ - 13 - ], - [ - 14 - ], - [ - 15 - ], - [ - 16 - ], - [ - 17 - ], - [ - 18 - ], - [ - 19 - ], - [ - 20 - ], - [ - 21 - ], - [ - 22 - ], - [ - 23 - ], - [ - 24 - ], - [ - 25 - ], - [ - 26 - ], - [ - 27 - ], - [ - 28 - ], - [ - 29 - ], - [ - 30 - ], - [ - 31 - ], - [ - 32 - ], - [ - 33 - ], - [ - 34 - ], - [ - 35 - ], - [ - 36 - ], - [ - 37 - ], - [ - 38 - ], - [ - 39 - ], - [ - 40 - ], - [ - 41 - ], - [ - 42 - ], - [ - 43 - ], - [ - 44 - ], - [ - 45 - ], - [ - 46 - ], - [ - 47 - ], - [ - 48 - ], - [ - 49 - ], - [ - 50 - ], - [ - 51 - ], - [ - 52 - ], - [ - 53 - ], - [ - 54 - ], - [ - 55 - ], - [ - 56 - ], - [ - 57 - ], - [ - 58 - ], - [ - 59 - ], - [ - 60 - ], - [ - 61 - ], - [ - 62 - ], - [ - 63 - ], - [ - 64 - ], - [ - 65 - ], - [ - 66 - ], - [ - 67 - ], - [ - 68 - ], - [ - 69 - ], - [ - 70 - ], - [ - 71 - ], - [ - 72 - ], - [ - 73 - ], - [ - 74 - ], - [ - 75 - ], - [ - 76 - ], - [ - 77 - ], - [ - 78 - ], - [ - 79 - ], - [ - 80 - ], - [ - 81 - ], - [ - 82 - ], - [ - 83 - ], - [ - 84 - ], - [ - 85 - ], - [ - 86 - ], - [ - 87 - ], - [ - 88 - ], - [ - 89 - ], - [ - 90 - ], - [ - 91 - ], - [ - 92 - ], - [ - 93 - ], - [ - 94 - ], - [ - 95 - ], - [ - 96 - ], - [ - 97 - ], - [ - 98 - ], - [ - 99 - ], - [ - 100 - ], - [ - 101 - ], - [ - 102 - ], - [ - 103 - ], - [ - 104 - ], - [ - 105 - ], - [ - 106 - ], - [ - 107 - ], - [ - 108 - ], - [ - 109 - ], - [ - 110 - ], - [ - 111 - ], - [ - 112 - ], - [ - 113 - ], - [ - 114 - ], - [ - 115 - ], - [ - 116 - ], - [ - 117 - ], - [ - 118 - ], - [ - 119 - ], - [ - 120 - ], - [ - 121 - ], - [ - 122 - ], - [ - 123 - ], - [ - 124 - ], - [ - 125 - ], - [ - 126 - ], - [ - 127 - ] + [ 0 ], [ 1 ], [ 2 ], [ 3 ], [ 4 ], [ 5 ], [ 6 ], [ 7 ], [ 8 ], [ 9 ], [ 10 ], [ 11 ], [ 12 ], [ 13 ], [ 14 ], [ 15 ], [ 16 ], [ 17 ], [ 18 ], [ 19 ], [ 20 ], [ 21 ], [ 22 ], [ 23 ], [ 24 ], [ 25 ], [ 26 ], [ 27 ], [ 28 ], [ 29 ], [ 30 ], [ 31 ], [ 32 ], [ 33 ], [ 34 ], [ 35 ], [ 36 ], [ 37 ], [ 38 ], [ 39 ], [ 40 ], [ 41 ], [ 42 ], [ 43 ], [ 44 ], [ 45 ], [ 46 ], [ 47 ], [ 48 ], [ 49 ], [ 50 ], [ 51 ], [ 52 ], [ 53 ], [ 54 ], [ 55 ], [ 56 ], [ 57 ], [ 58 ], [ 59 ], [ 60 ], [ 61 ], [ 62 ], [ 63 ], [ 64 ], [ 65 ], [ 66 ], [ 67 ], [ 68 ], [ 69 ], [ 70 ], [ 71 ], [ 73 ], [ 74 ], [ 75 ], [ 76 ], [ 77 ], [ 78 ], [ 79 ], [ 80 ], [ 81 ], [ 82 ], [ 83 ], [ 84 ], [ 85 ], [ 86 ], [ 87 ], [ 88 ], [ 89 ], [ 90 ], [ 91 ], [ 92 ], [ 93 ], [ 94 ], [ 95 ], [ 96 ], [ 97 ], [ 98 ], [ 99 ], [ 100 ], [ 101 ], [ 102 ], [ 103 ], [ 104 ], [ 105 ], [ 106 ], [ 107 ], [ 108 ], [ 109 ], [ 110 ], [ 111 ], [ 112 ], [ 113 ], [ 114 ], [ 115 ], [ 116 ], [ 117 ], [ 118 ], [ 119 ], [ 120 ], [ 121 ], [ 122 ], [ 123 ], [ 124 ], [ 125 ], [ 126 ], [ 127 ] ], "accelerators": [ { diff --git a/pkg/archive/testdata/archive/fritz/cluster.json b/pkg/archive/testdata/archive/fritz/cluster.json index 23a8343..00a8f70 100644 --- a/pkg/archive/testdata/archive/fritz/cluster.json +++ b/pkg/archive/testdata/archive/fritz/cluster.json @@ -1,746 +1,2241 @@ { - "name": "fritz", - "metricConfig": [ + "name": "fritz", + "metricConfig": [ + { + "name": "cpu_load", + "unit": { + "base": "" + }, + "scope": "node", + "aggregation": "avg", + "footprint": true, + "timestep": 60, + "peak": 72, + "normal": 72, + "caution": 36, + "alert": 20, + "subClusters": [ { - "name": "cpu_load", - "unit": { - "base": "" - }, - "scope": "node", - "aggregation": "avg", - "timestep": 60, - "peak": 72, - "normal": 72, - "caution": 36, - "alert": 20 + "name": "spr1tb", + "peak": 104, + "normal": 104, + "caution": 52, + "footprint": true, + "alert": 20 }, { - "name": "cpu_user", - "unit": { - "base": "" - }, - "scope": "hwthread", - "aggregation": "avg", - "timestep": 60, - "peak": 100, - "normal": 50, - "caution": 20, - "alert": 10 - }, - { - "name": "mem_used", - "unit": { - "base": "B", - "prefix": "G" - }, - "scope": "node", - "aggregation": "sum", - "timestep": 60, - "peak": 256, - "normal": 128, - "caution": 200, - "alert": 240 - }, - { - "name": "flops_any", - "unit": { - "base": "F/s", - "prefix": "G" - }, - "scope": "hwthread", - "aggregation": "sum", - "timestep": 60, - "peak": 5600, - "normal": 1000, - "caution": 200, - "alert": 50 - }, - { - "name": "flops_sp", - "unit": { - "base": "F/s", - "prefix": "G" - }, - "scope": "hwthread", - "aggregation": "sum", - "timestep": 60, - "peak": 5600, - "normal": 1000, - "caution": 200, - "alert": 50 - }, - { - "name": "flops_dp", - "unit": { - "base": "F/s", - "prefix": "G" - }, - "scope": "hwthread", - "aggregation": "sum", - "timestep": 60, - "peak": 2300, - "normal": 500, - "caution": 100, - "alert": 50 - }, - { - "name": "mem_bw", - "unit": { - "base": "B/s", - "prefix": "G" - }, - "scope": "socket", - "aggregation": "sum", - "timestep": 60, - "peak": 350, - "normal": 100, - "caution": 50, - "alert": 10 - }, - { - "name": "clock", - "unit": { - "base": "Hz", - "prefix": "M" - }, - "scope": "hwthread", - "aggregation": "avg", - "timestep": 60, - "peak": 3000, - "normal": 2400, - "caution": 1800, - "alert": 1200 - }, - { - "name": "cpu_power", - "unit": { - "base": "W" - }, - "scope": "socket", - "aggregation": "sum", - "timestep": 60, - "peak": 500, - "normal": 250, - "caution": 100, - "alert": 50 - }, - { - "name": "mem_power", - "unit": { - "base": "W" - }, - "scope": "socket", - "aggregation": "sum", - "timestep": 60, - "peak": 100, - "normal": 50, - "caution": 20, - "alert": 10 - }, - { - "name": "ipc", - "unit": { - "base": "IPC" - }, - "scope": "hwthread", - "aggregation": "avg", - "timestep": 60, - "peak": 4, - "normal": 2, - "caution": 1, - "alert": 0.5 - }, - { - "name": "vectorization_ratio", - "unit": { - "base": "" - }, - "scope": "hwthread", - "aggregation": "avg", - "timestep": 60, - "peak": 100, - "normal": 60, - "caution": 40, - "alert": 10 - }, - { - "name": "ib_recv", - "unit": { - "base": "B/s" - }, - "scope": "node", - "aggregation": "sum", - "timestep": 60, - "peak": 1250000, - "normal": 6000000, - "caution": 200, - "alert": 1 - }, - { - "name": "ib_xmit", - "unit": { - "base": "B/s" - }, - "scope": "node", - "aggregation": "sum", - "timestep": 60, - "peak": 1250000, - "normal": 6000000, - "caution": 200, - "alert": 1 - }, - { - "name": "ib_recv_pkts", - "unit": { - "base": "" - }, - "scope": "node", - "aggregation": "sum", - "timestep": 60, - "peak": 6, - "normal": 4, - "caution": 2, - "alert": 1 - }, - { - "name": "ib_xmit_pkts", - "unit": { - "base": "" - }, - "scope": "node", - "aggregation": "sum", - "timestep": 60, - "peak": 6, - "normal": 4, - "caution": 2, - "alert": 1 - }, - { - "name": "nfs4_read", - "unit": { - "base": "B/s", - "prefix": "M" - }, - "scope": "node", - "aggregation": "sum", - "timestep": 60, - "peak": 6, - "normal": 4, - "caution": 2, - "alert": 1 - }, - { - "name": "nfs4_write", - "unit": { - "base": "B/s", - "prefix": "M" - }, - "scope": "node", - "aggregation": "sum", - "timestep": 60, - "peak": 6, - "normal": 4, - "caution": 2, - "alert": 1 - }, - { - "name": "nfs4_total", - "unit": { - "base": "B/s", - "prefix": "M" - }, - "scope": "node", - "aggregation": "sum", - "timestep": 60, - "peak": 6, - "normal": 4, - "caution": 2, - "alert": 1 + "name": "spr2tb", + "peak": 104, + "normal": 104, + "caution": 52, + "footprint": true, + "alert": 20 } - ], - "subClusters": [ + ] + }, + { + "name": "cpu_user", + "unit": { + "base": "" + }, + "scope": "hwthread", + "aggregation": "avg", + "timestep": 60, + "peak": 100, + "normal": 50, + "caution": 20, + "alert": 10 + }, + { + "name": "mem_used", + "unit": { + "base": "B", + "prefix": "G" + }, + "scope": "node", + "aggregation": "sum", + "footprint": true, + "timestep": 60, + "peak": 256, + "normal": 128, + "caution": 200, + "alert": 240, + "subClusters": [ { - "name": "main", - "nodes": "f01[01-88],f02[01-88],f03[01-88],f03[01-88],f04[01-88],f05[01-88],f06[01-88],f07[01-88],f08[01-88],f09[01-88],f10[01-88],f11[01-56],f12[01-56]", - "processorType": "Intel Icelake", - "socketsPerNode": 2, - "coresPerSocket": 36, - "threadsPerCore": 1, - "flopRateScalar": { - "unit": { - "base": "F/s", - "prefix": "G" - }, - "value": 432 - }, - "flopRateSimd": { - "unit": { - "base": "F/s", - "prefix": "G" - }, - "value": 9216 - }, - "memoryBandwidth": { - "unit": { - "base": "B/s", - "prefix": "G" - }, - "value": 350 - }, - "topology": { - "node": [ - 0, - 1, - 2, - 3, - 4, - 5, - 6, - 7, - 8, - 9, - 10, - 11, - 12, - 13, - 14, - 15, - 16, - 17, - 18, - 19, - 20, - 21, - 22, - 23, - 24, - 25, - 26, - 27, - 28, - 29, - 30, - 31, - 32, - 33, - 34, - 35, - 36, - 37, - 38, - 39, - 40, - 41, - 42, - 43, - 44, - 45, - 46, - 47, - 48, - 49, - 50, - 51, - 52, - 53, - 54, - 55, - 56, - 57, - 58, - 59, - 60, - 61, - 62, - 63, - 64, - 65, - 66, - 67, - 68, - 69, - 70, - 71 - ], - "socket": [ - [ - 0, - 1, - 2, - 3, - 4, - 5, - 6, - 7, - 8, - 9, - 10, - 11, - 12, - 13, - 14, - 15, - 16, - 17, - 18, - 19, - 20, - 21, - 22, - 23, - 24, - 25, - 26, - 27, - 28, - 29, - 30, - 31, - 32, - 33, - 34, - 35 - ], - [ - 36, - 37, - 38, - 39, - 40, - 41, - 42, - 43, - 44, - 45, - 46, - 47, - 48, - 49, - 50, - 51, - 52, - 53, - 54, - 55, - 56, - 57, - 58, - 59, - 60, - 61, - 62, - 63, - 64, - 65, - 66, - 67, - 68, - 69, - 70, - 71 - ] - ], - "memoryDomain": [ - [ - 0, - 1, - 2, - 3, - 4, - 5, - 6, - 7, - 8, - 9, - 10, - 11, - 12, - 13, - 14, - 15, - 16, - 17 - ], - [ - 18, - 19, - 20, - 21, - 22, - 23, - 24, - 25, - 26, - 27, - 28, - 29, - 30, - 31, - 32, - 33, - 34, - 35 - ], - [ - 36, - 37, - 38, - 39, - 40, - 41, - 42, - 43, - 44, - 45, - 46, - 47, - 48, - 49, - 50, - 51, - 52, - 53 - ], - [ - 54, - 55, - 56, - 57, - 58, - 59, - 60, - 61, - 62, - 63, - 64, - 65, - 66, - 67, - 68, - 69, - 70, - 71 - ] - ], - "core": [ - [ - 0 - ], - [ - 1 - ], - [ - 2 - ], - [ - 3 - ], - [ - 4 - ], - [ - 5 - ], - [ - 6 - ], - [ - 7 - ], - [ - 8 - ], - [ - 9 - ], - [ - 10 - ], - [ - 11 - ], - [ - 12 - ], - [ - 13 - ], - [ - 14 - ], - [ - 15 - ], - [ - 16 - ], - [ - 17 - ], - [ - 18 - ], - [ - 19 - ], - [ - 20 - ], - [ - 21 - ], - [ - 22 - ], - [ - 23 - ], - [ - 24 - ], - [ - 25 - ], - [ - 26 - ], - [ - 27 - ], - [ - 28 - ], - [ - 29 - ], - [ - 30 - ], - [ - 31 - ], - [ - 32 - ], - [ - 33 - ], - [ - 34 - ], - [ - 35 - ], - [ - 36 - ], - [ - 37 - ], - [ - 38 - ], - [ - 39 - ], - [ - 40 - ], - [ - 41 - ], - [ - 42 - ], - [ - 43 - ], - [ - 44 - ], - [ - 45 - ], - [ - 46 - ], - [ - 47 - ], - [ - 48 - ], - [ - 49 - ], - [ - 50 - ], - [ - 51 - ], - [ - 52 - ], - [ - 53 - ], - [ - 54 - ], - [ - 55 - ], - [ - 56 - ], - [ - 57 - ], - [ - 58 - ], - [ - 59 - ], - [ - 60 - ], - [ - 61 - ], - [ - 62 - ], - [ - 63 - ], - [ - 64 - ], - [ - 65 - ], - [ - 66 - ], - [ - 67 - ], - [ - 68 - ], - [ - 69 - ], - [ - 70 - ], - [ - 71 - ] - ] - } + "name": "spr1tb", + "peak": 1024, + "normal": 512, + "caution": 900, + "footprint": true, + "alert": 1000 + }, + { + "name": "spr2tb", + "peak": 2048, + "normal": 1024, + "caution": 1800, + "footprint": true, + "alert": 2000 } - ] + ] + }, + { + "name": "flops_any", + "unit": { + "base": "Flops/s", + "prefix": "G" + }, + "scope": "hwthread", + "aggregation": "sum", + "footprint": true, + "timestep": 60, + "peak": 5600, + "normal": 1000, + "caution": 200, + "alert": 50, + "subClusters": [ + { + "name": "spr1tb", + "peak": 6656, + "normal": 1500, + "caution": 400, + "alert": 50, + "footprint": true + }, + { + "name": "spr2tb", + "peak": 6656, + "normal": 1500, + "caution": 400, + "alert": 50, + "remove": true + } + ] + }, + { + "name": "flops_sp", + "unit": { + "base": "Flops/s", + "prefix": "G" + }, + "scope": "hwthread", + "aggregation": "sum", + "timestep": 60, + "peak": 5600, + "normal": 1000, + "caution": 200, + "alert": 50, + "subClusters": [ + { + "name": "spr1tb", + "peak": 6656, + "normal": 1500, + "caution": 400, + "alert": 50, + "remove": true + }, + { + "name": "spr2tb", + "peak": 6656, + "normal": 1500, + "caution": 400, + "alert": 50, + "remove": true + } + ] + }, + { + "name": "flops_dp", + "unit": { + "base": "Flops/s", + "prefix": "G" + }, + "scope": "hwthread", + "aggregation": "sum", + "timestep": 60, + "peak": 2300, + "normal": 500, + "caution": 100, + "alert": 50, + "subClusters": [ + { + "name": "spr1tb", + "peak": 3300, + "normal": 750, + "caution": 200, + "alert": 50, + "remove": true + }, + { + "name": "spr2tb", + "peak": 3300, + "normal": 750, + "caution": 200, + "alert": 50, + "remove": true + } + ] + }, + { + "name": "mem_bw", + "unit": { + "base": "B/s", + "prefix": "G" + }, + "scope": "socket", + "aggregation": "sum", + "footprint": true, + "timestep": 60, + "peak": 350, + "normal": 100, + "caution": 50, + "alert": 10, + "subClusters": [ + { + "name": "spr1tb", + "peak": 549, + "normal": 200, + "caution": 100, + "alert": 20, + "remove": true + }, + { + "name": "spr2tb", + "peak": 520, + "normal": 200, + "caution": 100, + "alert": 20, + "remove": true + } + ] + }, + { + "name": "clock", + "unit": { + "base": "Hz", + "prefix": "M" + }, + "scope": "hwthread", + "aggregation": "avg", + "timestep": 60, + "peak": 3000, + "normal": 2400, + "caution": 1800, + "alert": 1200, + "subClusters": [ + { + "name": "spr1tb", + "peak": 549, + "normal": 2000, + "caution": 1600, + "alert": 1200, + "remove": true + }, + { + "name": "spr2tb", + "peak": 520, + "normal": 2000, + "caution": 1600, + "alert": 1200, + "remove": true + } + ] + }, + { + "name": "cpu_power", + "unit": { + "base": "W" + }, + "scope": "socket", + "aggregation": "sum", + "timestep": 60, + "peak": 500, + "normal": 250, + "caution": 100, + "alert": 50 + }, + { + "name": "mem_power", + "unit": { + "base": "W" + }, + "scope": "socket", + "aggregation": "sum", + "timestep": 60, + "peak": 100, + "normal": 50, + "caution": 20, + "alert": 10 + }, + { + "name": "ipc", + "unit": { + "base": "IPC" + }, + "scope": "hwthread", + "aggregation": "avg", + "timestep": 60, + "peak": 4, + "normal": 2, + "caution": 1, + "alert": 0.5 + }, + { + "name": "vectorization_ratio", + "unit": { + "base": "" + }, + "scope": "hwthread", + "aggregation": "avg", + "timestep": 60, + "peak": 100, + "normal": 60, + "caution": 40, + "alert": 10 + }, + { + "name": "ib_recv", + "unit": { + "base": "B/s" + }, + "scope": "node", + "aggregation": "sum", + "timestep": 60, + "peak": 1250000, + "normal": 6000000, + "caution": 200, + "alert": 1 + }, + { + "name": "ib_xmit", + "unit": { + "base": "B/s" + }, + "scope": "node", + "aggregation": "sum", + "timestep": 60, + "peak": 1250000, + "normal": 6000000, + "caution": 200, + "alert": 1 + }, + { + "name": "ib_recv_pkts", + "unit": { + "base": "packets/s" + }, + "scope": "node", + "aggregation": "sum", + "timestep": 60, + "peak": 6, + "normal": 4, + "caution": 2, + "alert": 1 + }, + { + "name": "ib_xmit_pkts", + "unit": { + "base": "packets/s" + }, + "scope": "node", + "aggregation": "sum", + "timestep": 60, + "peak": 6, + "normal": 4, + "caution": 2, + "alert": 1 + }, + { + "name": "nfs4_read", + "unit": { + "base": "B/s", + "prefix": "M" + }, + "scope": "node", + "aggregation": "sum", + "timestep": 60, + "peak": 6, + "normal": 4, + "caution": 2, + "alert": 1 + }, + { + "name": "nfs4_write", + "unit": { + "base": "B/s", + "prefix": "M" + }, + "scope": "node", + "aggregation": "sum", + "timestep": 60, + "peak": 6, + "normal": 4, + "caution": 2, + "alert": 1 + }, + { + "name": "nfs4_total", + "unit": { + "base": "B/s", + "prefix": "M" + }, + "scope": "node", + "aggregation": "sum", + "timestep": 60, + "peak": 6, + "normal": 4, + "caution": 2, + "alert": 1 + } + ], + "subClusters": [ + { + "name": "main", + "nodes": "f[0101-0188,0201-0288,0301-0388,0401-0488,0501-0588,0601-0688,0701-0788,0801-0888,0901-0988,1001-1088,1101-1156,1201-1256]", + "processorType": "Intel Icelake", + "socketsPerNode": 2, + "coresPerSocket": 36, + "threadsPerCore": 1, + "flopRateScalar": { + "unit": { + "base": "F/s", + "prefix": "G" + }, + "value": 432 + }, + "flopRateSimd": { + "unit": { + "base": "F/s", + "prefix": "G" + }, + "value": 9216 + }, + "memoryBandwidth": { + "unit": { + "base": "B/s", + "prefix": "G" + }, + "value": 350 + }, + "topology": { + "node": [ + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 11, + 12, + 13, + 14, + 15, + 16, + 17, + 18, + 19, + 20, + 21, + 22, + 23, + 24, + 25, + 26, + 27, + 28, + 29, + 30, + 31, + 32, + 33, + 34, + 35, + 36, + 37, + 38, + 39, + 40, + 41, + 42, + 43, + 44, + 45, + 46, + 47, + 48, + 49, + 50, + 51, + 52, + 53, + 54, + 55, + 56, + 57, + 58, + 59, + 60, + 61, + 62, + 63, + 64, + 65, + 66, + 67, + 68, + 69, + 70, + 71 + ], + "socket": [ + [ + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 11, + 12, + 13, + 14, + 15, + 16, + 17, + 18, + 19, + 20, + 21, + 22, + 23, + 24, + 25, + 26, + 27, + 28, + 29, + 30, + 31, + 32, + 33, + 34, + 35 + ], + [ + 36, + 37, + 38, + 39, + 40, + 41, + 42, + 43, + 44, + 45, + 46, + 47, + 48, + 49, + 50, + 51, + 52, + 53, + 54, + 55, + 56, + 57, + 58, + 59, + 60, + 61, + 62, + 63, + 64, + 65, + 66, + 67, + 68, + 69, + 70, + 71 + ] + ], + "memoryDomain": [ + [ + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 11, + 12, + 13, + 14, + 15, + 16, + 17 + ], + [ + 18, + 19, + 20, + 21, + 22, + 23, + 24, + 25, + 26, + 27, + 28, + 29, + 30, + 31, + 32, + 33, + 34, + 35 + ], + [ + 36, + 37, + 38, + 39, + 40, + 41, + 42, + 43, + 44, + 45, + 46, + 47, + 48, + 49, + 50, + 51, + 52, + 53 + ], + [ + 54, + 55, + 56, + 57, + 58, + 59, + 60, + 61, + 62, + 63, + 64, + 65, + 66, + 67, + 68, + 69, + 70, + 71 + ] + ], + "core": [ + [ + 0 + ], + [ + 1 + ], + [ + 2 + ], + [ + 3 + ], + [ + 4 + ], + [ + 5 + ], + [ + 6 + ], + [ + 7 + ], + [ + 8 + ], + [ + 9 + ], + [ + 10 + ], + [ + 11 + ], + [ + 12 + ], + [ + 13 + ], + [ + 14 + ], + [ + 15 + ], + [ + 16 + ], + [ + 17 + ], + [ + 18 + ], + [ + 19 + ], + [ + 20 + ], + [ + 21 + ], + [ + 22 + ], + [ + 23 + ], + [ + 24 + ], + [ + 25 + ], + [ + 26 + ], + [ + 27 + ], + [ + 28 + ], + [ + 29 + ], + [ + 30 + ], + [ + 31 + ], + [ + 32 + ], + [ + 33 + ], + [ + 34 + ], + [ + 35 + ], + [ + 36 + ], + [ + 37 + ], + [ + 38 + ], + [ + 39 + ], + [ + 40 + ], + [ + 41 + ], + [ + 42 + ], + [ + 43 + ], + [ + 44 + ], + [ + 45 + ], + [ + 46 + ], + [ + 47 + ], + [ + 48 + ], + [ + 49 + ], + [ + 50 + ], + [ + 51 + ], + [ + 52 + ], + [ + 53 + ], + [ + 54 + ], + [ + 55 + ], + [ + 56 + ], + [ + 57 + ], + [ + 58 + ], + [ + 59 + ], + [ + 60 + ], + [ + 61 + ], + [ + 62 + ], + [ + 63 + ], + [ + 64 + ], + [ + 65 + ], + [ + 66 + ], + [ + 67 + ], + [ + 68 + ], + [ + 69 + ], + [ + 70 + ], + [ + 71 + ] + ] + } + }, + { + "name": "spr1tb", + "processorType": "Intel(R) Xeon(R) Platinum 8470", + "socketsPerNode": 2, + "coresPerSocket": 52, + "threadsPerCore": 1, + "flopRateScalar": { + "unit": { + "base": "F/s", + "prefix": "G" + }, + "value": 695 + }, + "flopRateSimd": { + "unit": { + "base": "F/s", + "prefix": "G" + }, + "value": 9216 + }, + "memoryBandwidth": { + "unit": { + "base": "B/s", + "prefix": "G" + }, + "value": 549 + }, + "nodes": "f[2157-2188,2257-2288]", + "topology": { + "node": [ + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 11, + 12, + 13, + 14, + 15, + 16, + 17, + 18, + 19, + 20, + 21, + 22, + 23, + 24, + 25, + 26, + 27, + 28, + 29, + 30, + 31, + 32, + 33, + 34, + 35, + 36, + 37, + 38, + 39, + 40, + 41, + 42, + 43, + 44, + 45, + 46, + 47, + 48, + 49, + 50, + 5152, + 53, + 54, + 55, + 56, + 57, + 58, + 59, + 60, + 61, + 62, + 63, + 64, + 65, + 66, + 67, + 68, + 69, + 70, + 71, + 72, + 73, + 74, + 75, + 76, + 77, + 78, + 79, + 80, + 81, + 82, + 83, + 84, + 85, + 86, + 87, + 88, + 89, + 90, + 91, + 92, + 93, + 94, + 95, + 96, + 97, + 98, + 99, + 100, + 101, + 102, + 103 + ], + "socket": [ + [ + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 11, + 12, + 13, + 14, + 15, + 16, + 17, + 18, + 19, + 20, + 21, + 22, + 23, + 24, + 25, + 26, + 27, + 28, + 29, + 30, + 31, + 32, + 33, + 34, + 35, + 36, + 37, + 38, + 39, + 40, + 41, + 42, + 43, + 44, + 45, + 46, + 47, + 48, + 49, + 50, + 51 + ], + [ + 52, + 53, + 54, + 55, + 56, + 57, + 58, + 59, + 60, + 61, + 62, + 63, + 64, + 65, + 66, + 67, + 68, + 69, + 70, + 71, + 72, + 73, + 74, + 75, + 76, + 77, + 78, + 79, + 80, + 81, + 82, + 83, + 84, + 85, + 86, + 87, + 88, + 89, + 90, + 91, + 92, + 93, + 94, + 95, + 96, + 97, + 98, + 99, + 100, + 101, + 102, + 103 + ] + ], + "memoryDomain": [ + [ + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 11, + 12 + ], + [ + 13, + 14, + 15, + 16, + 17, + 18, + 19, + 20, + 21, + 22, + 23, + 24, + 25 + ], + [ + 26, + 27, + 28, + 29, + 30, + 31, + 32, + 33, + 34, + 35, + 36, + 37, + 38 + ], + [ + 39, + 40, + 41, + 42, + 43, + 44, + 45, + 46, + 47, + 48, + 49, + 50, + 51 + ], + [ + 52, + 53, + 54, + 55, + 56, + 57, + 58, + 59, + 60, + 61, + 62, + 63, + 64 + ], + [ + 65, + 66, + 67, + 68, + 69, + 70, + 71, + 72, + 73, + 74, + 75, + 76, + 77 + ], + [ + 78, + 79, + 80, + 81, + 82, + 83, + 84, + 85, + 86, + 87, + 88, + 89, + 90 + ], + [ + 91, + 92, + 93, + 94, + 95, + 96, + 97, + 98, + 99, + 100, + 101, + 102, + 103 + ] + ], + "core": [ + [ + 0 + ], + [ + 1 + ], + [ + 2 + ], + [ + 3 + ], + [ + 4 + ], + [ + 5 + ], + [ + 6 + ], + [ + 7 + ], + [ + 8 + ], + [ + 9 + ], + [ + 10 + ], + [ + 11 + ], + [ + 12 + ], + [ + 13 + ], + [ + 14 + ], + [ + 15 + ], + [ + 16 + ], + [ + 17 + ], + [ + 18 + ], + [ + 19 + ], + [ + 20 + ], + [ + 21 + ], + [ + 22 + ], + [ + 23 + ], + [ + 24 + ], + [ + 25 + ], + [ + 26 + ], + [ + 27 + ], + [ + 28 + ], + [ + 29 + ], + [ + 30 + ], + [ + 31 + ], + [ + 32 + ], + [ + 33 + ], + [ + 34 + ], + [ + 35 + ], + [ + 36 + ], + [ + 37 + ], + [ + 38 + ], + [ + 39 + ], + [ + 40 + ], + [ + 41 + ], + [ + 42 + ], + [ + 43 + ], + [ + 44 + ], + [ + 45 + ], + [ + 46 + ], + [ + 47 + ], + [ + 48 + ], + [ + 49 + ], + [ + 50 + ], + [ + 51 + ], + [ + 52 + ], + [ + 53 + ], + [ + 54 + ], + [ + 55 + ], + [ + 56 + ], + [ + 57 + ], + [ + 58 + ], + [ + 59 + ], + [ + 60 + ], + [ + 61 + ], + [ + 62 + ], + [ + 63 + ], + [ + 64 + ], + [ + 65 + ], + [ + 66 + ], + [ + 67 + ], + [ + 68 + ], + [ + 69 + ], + [ + 70 + ], + [ + 71 + ], + [ + 72 + ], + [ + 73 + ], + [ + 74 + ], + [ + 75 + ], + [ + 76 + ], + [ + 77 + ], + [ + 78 + ], + [ + 79 + ], + [ + 80 + ], + [ + 81 + ], + [ + 82 + ], + [ + 83 + ], + [ + 84 + ], + [ + 85 + ], + [ + 86 + ], + [ + 87 + ], + [ + 88 + ], + [ + 89 + ], + [ + 90 + ], + [ + 91 + ], + [ + 92 + ], + [ + 93 + ], + [ + 94 + ], + [ + 95 + ], + [ + 96 + ], + [ + 97 + ], + [ + 98 + ], + [ + 99 + ], + [ + 100 + ], + [ + 101 + ], + [ + 102 + ], + [ + 103 + ] + ] + } + }, + { + "name": "spr2tb", + "processorType": "Intel(R) Xeon(R) Platinum 8470", + "socketsPerNode": 2, + "coresPerSocket": 52, + "threadsPerCore": 1, + "flopRateScalar": { + "unit": { + "base": "F/s", + "prefix": "G" + }, + "value": 695 + }, + "flopRateSimd": { + "unit": { + "base": "F/s", + "prefix": "G" + }, + "value": 9216 + }, + "memoryBandwidth": { + "unit": { + "base": "B/s", + "prefix": "G" + }, + "value": 515 + }, + "nodes": "f[2181-2188,2281-2288]", + "topology": { + "node": [ + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 11, + 12, + 13, + 14, + 15, + 16, + 17, + 18, + 19, + 20, + 21, + 22, + 23, + 24, + 25, + 26, + 27, + 28, + 29, + 30, + 31, + 32, + 33, + 34, + 35, + 36, + 37, + 38, + 39, + 40, + 41, + 42, + 43, + 44, + 45, + 46, + 47, + 48, + 49, + 50, + 51, + 52, + 53, + 54, + 55, + 56, + 57, + 58, + 59, + 60, + 61, + 62, + 63, + 64, + 65, + 66, + 67, + 68, + 69, + 70, + 71, + 72, + 73, + 74, + 75, + 76, + 77, + 78, + 79, + 80, + 81, + 82, + 83, + 84, + 85, + 86, + 87, + 88, + 89, + 90, + 91, + 92, + 93, + 94, + 95, + 96, + 97, + 98, + 99, + 100, + 101, + 102, + 103 + ], + "socket": [ + [ + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 11, + 12, + 13, + 14, + 15, + 16, + 17, + 18, + 19, + 20, + 21, + 22, + 23, + 24, + 25, + 26, + 27, + 28, + 29, + 30, + 31, + 32, + 33, + 34, + 35, + 36, + 37, + 38, + 39, + 40, + 41, + 42, + 43, + 44, + 45, + 46, + 47, + 48, + 49, + 50, + 51 + ], + [ + 52, + 53, + 54, + 55, + 56, + 57, + 58, + 59, + 60, + 61, + 62, + 63, + 64, + 65, + 66, + 67, + 68, + 69, + 70, + 71, + 72, + 73, + 74, + 75, + 76, + 77, + 78, + 79, + 80, + 81, + 82, + 83, + 84, + 85, + 86, + 87, + 88, + 89, + 90, + 91, + 92, + 93, + 94, + 95, + 96, + 97, + 98, + 99, + 100, + 101, + 102, + 103 + ] + ], + "memoryDomain": [ + [ + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 11, + 12 + ], + [ + 13, + 14, + 15, + 16, + 17, + 18, + 19, + 20, + 21, + 22, + 23, + 24, + 25 + ], + [ + 26, + 27, + 28, + 29, + 30, + 31, + 32, + 33, + 34, + 35, + 36, + 37, + 38 + ], + [ + 39, + 40, + 41, + 42, + 43, + 44, + 45, + 46, + 47, + 48, + 49, + 50, + 51 + ], + [ + 52, + 53, + 54, + 55, + 56, + 57, + 58, + 59, + 60, + 61, + 62, + 63, + 64 + ], + [ + 65, + 66, + 67, + 68, + 69, + 70, + 71, + 72, + 73, + 74, + 75, + 76, + 77 + ], + [ + 78, + 79, + 80, + 81, + 82, + 83, + 84, + 85, + 86, + 87, + 88, + 89, + 90 + ], + [ + 91, + 92, + 93, + 94, + 95, + 96, + 97, + 98, + 99, + 100, + 101, + 102, + 103 + ] + ], + "core": [ + [ + 0 + ], + [ + 1 + ], + [ + 2 + ], + [ + 3 + ], + [ + 4 + ], + [ + 5 + ], + [ + 6 + ], + [ + 7 + ], + [ + 8 + ], + [ + 9 + ], + [ + 10 + ], + [ + 11 + ], + [ + 12 + ], + [ + 13 + ], + [ + 14 + ], + [ + 15 + ], + [ + 16 + ], + [ + 17 + ], + [ + 18 + ], + [ + 19 + ], + [ + 20 + ], + [ + 21 + ], + [ + 22 + ], + [ + 23 + ], + [ + 24 + ], + [ + 25 + ], + [ + 26 + ], + [ + 27 + ], + [ + 28 + ], + [ + 29 + ], + [ + 30 + ], + [ + 31 + ], + [ + 32 + ], + [ + 33 + ], + [ + 34 + ], + [ + 35 + ], + [ + 36 + ], + [ + 37 + ], + [ + 38 + ], + [ + 39 + ], + [ + 40 + ], + [ + 41 + ], + [ + 42 + ], + [ + 43 + ], + [ + 44 + ], + [ + 45 + ], + [ + 46 + ], + [ + 47 + ], + [ + 48 + ], + [ + 49 + ], + [ + 50 + ], + [ + 51 + ], + [ + 52 + ], + [ + 53 + ], + [ + 54 + ], + [ + 55 + ], + [ + 56 + ], + [ + 57 + ], + [ + 58 + ], + [ + 59 + ], + [ + 60 + ], + [ + 61 + ], + [ + 62 + ], + [ + 63 + ], + [ + 64 + ], + [ + 65 + ], + [ + 66 + ], + [ + 67 + ], + [ + 68 + ], + [ + 69 + ], + [ + 70 + ], + [ + 71 + ], + [ + 72 + ], + [ + 73 + ], + [ + 74 + ], + [ + 75 + ], + [ + 76 + ], + [ + 77 + ], + [ + 78 + ], + [ + 79 + ], + [ + 80 + ], + [ + 81 + ], + [ + 82 + ], + [ + 83 + ], + [ + 84 + ], + [ + 85 + ], + [ + 86 + ], + [ + 87 + ], + [ + 88 + ], + [ + 89 + ], + [ + 90 + ], + [ + 91 + ], + [ + 92 + ], + [ + 93 + ], + [ + 94 + ], + [ + 95 + ], + [ + 96 + ], + [ + 97 + ], + [ + 98 + ], + [ + 99 + ], + [ + 100 + ], + [ + 101 + ], + [ + 102 + ], + [ + 103 + ] + ] + } + } + ] } From 80c46bea7fefa3c0acd96d9306765a7dfd945883 Mon Sep 17 00:00:00 2001 From: Jan Eitzinger Date: Thu, 4 Jul 2024 14:14:27 +0200 Subject: [PATCH 09/10] Fix bugs and failed testcases --- internal/importer/handleImport.go | 24 +- internal/importer/initDB.go | 36 +- internal/importer/testdata/cluster-fritz.json | 1486 +++++++++-------- internal/repository/job.go | 64 +- internal/util/statistics.go | 17 +- pkg/archive/fsBackend_test.go | 2 +- pkg/schema/job.go | 6 +- 7 files changed, 833 insertions(+), 802 deletions(-) diff --git a/internal/importer/handleImport.go b/internal/importer/handleImport.go index fd1b2f4..4ef9f84 100644 --- a/internal/importer/handleImport.go +++ b/internal/importer/handleImport.go @@ -14,6 +14,7 @@ import ( "github.com/ClusterCockpit/cc-backend/internal/config" "github.com/ClusterCockpit/cc-backend/internal/repository" + "github.com/ClusterCockpit/cc-backend/internal/util" "github.com/ClusterCockpit/cc-backend/pkg/archive" "github.com/ClusterCockpit/cc-backend/pkg/log" "github.com/ClusterCockpit/cc-backend/pkg/schema" @@ -86,13 +87,22 @@ func HandleImportFlag(flag string) error { StartTimeUnix: jobMeta.StartTime, } - // TODO: Do loop for new sub structure for stats - // job.LoadAvg = loadJobStat(&jobMeta, "cpu_load") - // job.FlopsAnyAvg = loadJobStat(&jobMeta, "flops_any") - // job.MemUsedMax = loadJobStat(&jobMeta, "mem_used") - // job.MemBwAvg = loadJobStat(&jobMeta, "mem_bw") - // job.NetBwAvg = loadJobStat(&jobMeta, "net_bw") - // job.FileBwAvg = loadJobStat(&jobMeta, "file_bw") + sc, err := archive.GetSubCluster(jobMeta.Cluster, jobMeta.SubCluster) + if err != nil { + log.Errorf("cannot get subcluster: %s", err.Error()) + return err + } + + job.Footprint = make(map[string]float64) + + for _, fp := range sc.Footprint { + job.Footprint[fp] = util.LoadJobStat(&jobMeta, fp) + } + job.RawFootprint, err = json.Marshal(job.Footprint) + if err != nil { + log.Warn("Error while marshaling job footprint") + return err + } job.RawResources, err = json.Marshal(job.Resources) if err != nil { diff --git a/internal/importer/initDB.go b/internal/importer/initDB.go index 0055768..468ebb1 100644 --- a/internal/importer/initDB.go +++ b/internal/importer/initDB.go @@ -11,6 +11,7 @@ import ( "time" "github.com/ClusterCockpit/cc-backend/internal/repository" + "github.com/ClusterCockpit/cc-backend/internal/util" "github.com/ClusterCockpit/cc-backend/pkg/archive" "github.com/ClusterCockpit/cc-backend/pkg/log" "github.com/ClusterCockpit/cc-backend/pkg/schema" @@ -60,13 +61,22 @@ func InitDB() error { StartTimeUnix: jobMeta.StartTime, } - // TODO: Convert to loop for new footprint layout - // job.LoadAvg = loadJobStat(jobMeta, "cpu_load") - // job.FlopsAnyAvg = loadJobStat(jobMeta, "flops_any") - // job.MemUsedMax = loadJobStat(jobMeta, "mem_used") - // job.MemBwAvg = loadJobStat(jobMeta, "mem_bw") - // job.NetBwAvg = loadJobStat(jobMeta, "net_bw") - // job.FileBwAvg = loadJobStat(jobMeta, "file_bw") + sc, err := archive.GetSubCluster(jobMeta.Cluster, jobMeta.SubCluster) + if err != nil { + log.Errorf("cannot get subcluster: %s", err.Error()) + return err + } + job.Footprint = make(map[string]float64) + + for _, fp := range sc.Footprint { + job.Footprint[fp] = util.LoadJobStat(jobMeta, fp) + } + + job.RawFootprint, err = json.Marshal(job.Footprint) + if err != nil { + log.Warn("Error while marshaling job footprint") + return err + } job.RawResources, err = json.Marshal(job.Resources) if err != nil { @@ -150,18 +160,6 @@ func SanityChecks(job *schema.BaseJob) error { return nil } -func loadJobStat(job *schema.JobMeta, metric string) float64 { - if stats, ok := job.Statistics[metric]; ok { - if metric == "mem_used" { - return stats.Max - } else { - return stats.Avg - } - } - - return 0.0 -} - func checkJobData(d *schema.JobData) error { for _, scopes := range *d { // var newUnit schema.Unit diff --git a/internal/importer/testdata/cluster-fritz.json b/internal/importer/testdata/cluster-fritz.json index 23a8343..8519fd4 100644 --- a/internal/importer/testdata/cluster-fritz.json +++ b/internal/importer/testdata/cluster-fritz.json @@ -1,746 +1,750 @@ { - "name": "fritz", - "metricConfig": [ - { - "name": "cpu_load", - "unit": { - "base": "" - }, - "scope": "node", - "aggregation": "avg", - "timestep": 60, - "peak": 72, - "normal": 72, - "caution": 36, - "alert": 20 + "name": "fritz", + "metricConfig": [ + { + "name": "cpu_load", + "unit": { + "base": "" + }, + "scope": "node", + "aggregation": "avg", + "footprint": true, + "timestep": 60, + "peak": 72, + "normal": 72, + "caution": 36, + "alert": 20 + }, + { + "name": "cpu_user", + "unit": { + "base": "" + }, + "scope": "hwthread", + "aggregation": "avg", + "timestep": 60, + "peak": 100, + "normal": 50, + "caution": 20, + "alert": 10 + }, + { + "name": "mem_used", + "unit": { + "base": "B", + "prefix": "G" + }, + "scope": "node", + "aggregation": "sum", + "footprint": true, + "timestep": 60, + "peak": 256, + "normal": 128, + "caution": 200, + "alert": 240 + }, + { + "name": "flops_any", + "unit": { + "base": "F/s", + "prefix": "G" + }, + "scope": "hwthread", + "aggregation": "sum", + "footprint": true, + "timestep": 60, + "peak": 5600, + "normal": 1000, + "caution": 200, + "alert": 50 + }, + { + "name": "flops_sp", + "unit": { + "base": "F/s", + "prefix": "G" + }, + "scope": "hwthread", + "aggregation": "sum", + "timestep": 60, + "peak": 5600, + "normal": 1000, + "caution": 200, + "alert": 50 + }, + { + "name": "flops_dp", + "unit": { + "base": "F/s", + "prefix": "G" + }, + "scope": "hwthread", + "aggregation": "sum", + "timestep": 60, + "peak": 2300, + "normal": 500, + "caution": 100, + "alert": 50 + }, + { + "name": "mem_bw", + "unit": { + "base": "B/s", + "prefix": "G" + }, + "scope": "socket", + "aggregation": "sum", + "footprint": true, + "timestep": 60, + "peak": 350, + "normal": 100, + "caution": 50, + "alert": 10 + }, + { + "name": "clock", + "unit": { + "base": "Hz", + "prefix": "M" + }, + "scope": "hwthread", + "aggregation": "avg", + "timestep": 60, + "peak": 3000, + "normal": 2400, + "caution": 1800, + "alert": 1200 + }, + { + "name": "cpu_power", + "unit": { + "base": "W" + }, + "scope": "socket", + "aggregation": "sum", + "timestep": 60, + "peak": 500, + "normal": 250, + "caution": 100, + "alert": 50 + }, + { + "name": "mem_power", + "unit": { + "base": "W" + }, + "scope": "socket", + "aggregation": "sum", + "timestep": 60, + "peak": 100, + "normal": 50, + "caution": 20, + "alert": 10 + }, + { + "name": "ipc", + "unit": { + "base": "IPC" + }, + "scope": "hwthread", + "aggregation": "avg", + "timestep": 60, + "peak": 4, + "normal": 2, + "caution": 1, + "alert": 0.5 + }, + { + "name": "vectorization_ratio", + "unit": { + "base": "" + }, + "scope": "hwthread", + "aggregation": "avg", + "timestep": 60, + "peak": 100, + "normal": 60, + "caution": 40, + "alert": 10 + }, + { + "name": "ib_recv", + "unit": { + "base": "B/s" + }, + "scope": "node", + "aggregation": "sum", + "timestep": 60, + "peak": 1250000, + "normal": 6000000, + "caution": 200, + "alert": 1 + }, + { + "name": "ib_xmit", + "unit": { + "base": "B/s" + }, + "scope": "node", + "aggregation": "sum", + "timestep": 60, + "peak": 1250000, + "normal": 6000000, + "caution": 200, + "alert": 1 + }, + { + "name": "ib_recv_pkts", + "unit": { + "base": "" + }, + "scope": "node", + "aggregation": "sum", + "timestep": 60, + "peak": 6, + "normal": 4, + "caution": 2, + "alert": 1 + }, + { + "name": "ib_xmit_pkts", + "unit": { + "base": "" + }, + "scope": "node", + "aggregation": "sum", + "timestep": 60, + "peak": 6, + "normal": 4, + "caution": 2, + "alert": 1 + }, + { + "name": "nfs4_read", + "unit": { + "base": "B/s", + "prefix": "M" + }, + "scope": "node", + "aggregation": "sum", + "timestep": 60, + "peak": 6, + "normal": 4, + "caution": 2, + "alert": 1 + }, + { + "name": "nfs4_write", + "unit": { + "base": "B/s", + "prefix": "M" + }, + "scope": "node", + "aggregation": "sum", + "timestep": 60, + "peak": 6, + "normal": 4, + "caution": 2, + "alert": 1 + }, + { + "name": "nfs4_total", + "unit": { + "base": "B/s", + "prefix": "M" + }, + "scope": "node", + "aggregation": "sum", + "timestep": 60, + "peak": 6, + "normal": 4, + "caution": 2, + "alert": 1 + } + ], + "subClusters": [ + { + "name": "main", + "nodes": "f01[01-88],f02[01-88],f03[01-88],f03[01-88],f04[01-88],f05[01-88],f06[01-88],f07[01-88],f08[01-88],f09[01-88],f10[01-88],f11[01-56],f12[01-56]", + "processorType": "Intel Icelake", + "socketsPerNode": 2, + "coresPerSocket": 36, + "threadsPerCore": 1, + "flopRateScalar": { + "unit": { + "base": "F/s", + "prefix": "G" }, - { - "name": "cpu_user", - "unit": { - "base": "" - }, - "scope": "hwthread", - "aggregation": "avg", - "timestep": 60, - "peak": 100, - "normal": 50, - "caution": 20, - "alert": 10 + "value": 432 + }, + "flopRateSimd": { + "unit": { + "base": "F/s", + "prefix": "G" }, - { - "name": "mem_used", - "unit": { - "base": "B", - "prefix": "G" - }, - "scope": "node", - "aggregation": "sum", - "timestep": 60, - "peak": 256, - "normal": 128, - "caution": 200, - "alert": 240 + "value": 9216 + }, + "memoryBandwidth": { + "unit": { + "base": "B/s", + "prefix": "G" }, - { - "name": "flops_any", - "unit": { - "base": "F/s", - "prefix": "G" - }, - "scope": "hwthread", - "aggregation": "sum", - "timestep": 60, - "peak": 5600, - "normal": 1000, - "caution": 200, - "alert": 50 - }, - { - "name": "flops_sp", - "unit": { - "base": "F/s", - "prefix": "G" - }, - "scope": "hwthread", - "aggregation": "sum", - "timestep": 60, - "peak": 5600, - "normal": 1000, - "caution": 200, - "alert": 50 - }, - { - "name": "flops_dp", - "unit": { - "base": "F/s", - "prefix": "G" - }, - "scope": "hwthread", - "aggregation": "sum", - "timestep": 60, - "peak": 2300, - "normal": 500, - "caution": 100, - "alert": 50 - }, - { - "name": "mem_bw", - "unit": { - "base": "B/s", - "prefix": "G" - }, - "scope": "socket", - "aggregation": "sum", - "timestep": 60, - "peak": 350, - "normal": 100, - "caution": 50, - "alert": 10 - }, - { - "name": "clock", - "unit": { - "base": "Hz", - "prefix": "M" - }, - "scope": "hwthread", - "aggregation": "avg", - "timestep": 60, - "peak": 3000, - "normal": 2400, - "caution": 1800, - "alert": 1200 - }, - { - "name": "cpu_power", - "unit": { - "base": "W" - }, - "scope": "socket", - "aggregation": "sum", - "timestep": 60, - "peak": 500, - "normal": 250, - "caution": 100, - "alert": 50 - }, - { - "name": "mem_power", - "unit": { - "base": "W" - }, - "scope": "socket", - "aggregation": "sum", - "timestep": 60, - "peak": 100, - "normal": 50, - "caution": 20, - "alert": 10 - }, - { - "name": "ipc", - "unit": { - "base": "IPC" - }, - "scope": "hwthread", - "aggregation": "avg", - "timestep": 60, - "peak": 4, - "normal": 2, - "caution": 1, - "alert": 0.5 - }, - { - "name": "vectorization_ratio", - "unit": { - "base": "" - }, - "scope": "hwthread", - "aggregation": "avg", - "timestep": 60, - "peak": 100, - "normal": 60, - "caution": 40, - "alert": 10 - }, - { - "name": "ib_recv", - "unit": { - "base": "B/s" - }, - "scope": "node", - "aggregation": "sum", - "timestep": 60, - "peak": 1250000, - "normal": 6000000, - "caution": 200, - "alert": 1 - }, - { - "name": "ib_xmit", - "unit": { - "base": "B/s" - }, - "scope": "node", - "aggregation": "sum", - "timestep": 60, - "peak": 1250000, - "normal": 6000000, - "caution": 200, - "alert": 1 - }, - { - "name": "ib_recv_pkts", - "unit": { - "base": "" - }, - "scope": "node", - "aggregation": "sum", - "timestep": 60, - "peak": 6, - "normal": 4, - "caution": 2, - "alert": 1 - }, - { - "name": "ib_xmit_pkts", - "unit": { - "base": "" - }, - "scope": "node", - "aggregation": "sum", - "timestep": 60, - "peak": 6, - "normal": 4, - "caution": 2, - "alert": 1 - }, - { - "name": "nfs4_read", - "unit": { - "base": "B/s", - "prefix": "M" - }, - "scope": "node", - "aggregation": "sum", - "timestep": 60, - "peak": 6, - "normal": 4, - "caution": 2, - "alert": 1 - }, - { - "name": "nfs4_write", - "unit": { - "base": "B/s", - "prefix": "M" - }, - "scope": "node", - "aggregation": "sum", - "timestep": 60, - "peak": 6, - "normal": 4, - "caution": 2, - "alert": 1 - }, - { - "name": "nfs4_total", - "unit": { - "base": "B/s", - "prefix": "M" - }, - "scope": "node", - "aggregation": "sum", - "timestep": 60, - "peak": 6, - "normal": 4, - "caution": 2, - "alert": 1 - } - ], - "subClusters": [ - { - "name": "main", - "nodes": "f01[01-88],f02[01-88],f03[01-88],f03[01-88],f04[01-88],f05[01-88],f06[01-88],f07[01-88],f08[01-88],f09[01-88],f10[01-88],f11[01-56],f12[01-56]", - "processorType": "Intel Icelake", - "socketsPerNode": 2, - "coresPerSocket": 36, - "threadsPerCore": 1, - "flopRateScalar": { - "unit": { - "base": "F/s", - "prefix": "G" - }, - "value": 432 - }, - "flopRateSimd": { - "unit": { - "base": "F/s", - "prefix": "G" - }, - "value": 9216 - }, - "memoryBandwidth": { - "unit": { - "base": "B/s", - "prefix": "G" - }, - "value": 350 - }, - "topology": { - "node": [ - 0, - 1, - 2, - 3, - 4, - 5, - 6, - 7, - 8, - 9, - 10, - 11, - 12, - 13, - 14, - 15, - 16, - 17, - 18, - 19, - 20, - 21, - 22, - 23, - 24, - 25, - 26, - 27, - 28, - 29, - 30, - 31, - 32, - 33, - 34, - 35, - 36, - 37, - 38, - 39, - 40, - 41, - 42, - 43, - 44, - 45, - 46, - 47, - 48, - 49, - 50, - 51, - 52, - 53, - 54, - 55, - 56, - 57, - 58, - 59, - 60, - 61, - 62, - 63, - 64, - 65, - 66, - 67, - 68, - 69, - 70, - 71 - ], - "socket": [ - [ - 0, - 1, - 2, - 3, - 4, - 5, - 6, - 7, - 8, - 9, - 10, - 11, - 12, - 13, - 14, - 15, - 16, - 17, - 18, - 19, - 20, - 21, - 22, - 23, - 24, - 25, - 26, - 27, - 28, - 29, - 30, - 31, - 32, - 33, - 34, - 35 - ], - [ - 36, - 37, - 38, - 39, - 40, - 41, - 42, - 43, - 44, - 45, - 46, - 47, - 48, - 49, - 50, - 51, - 52, - 53, - 54, - 55, - 56, - 57, - 58, - 59, - 60, - 61, - 62, - 63, - 64, - 65, - 66, - 67, - 68, - 69, - 70, - 71 - ] - ], - "memoryDomain": [ - [ - 0, - 1, - 2, - 3, - 4, - 5, - 6, - 7, - 8, - 9, - 10, - 11, - 12, - 13, - 14, - 15, - 16, - 17 - ], - [ - 18, - 19, - 20, - 21, - 22, - 23, - 24, - 25, - 26, - 27, - 28, - 29, - 30, - 31, - 32, - 33, - 34, - 35 - ], - [ - 36, - 37, - 38, - 39, - 40, - 41, - 42, - 43, - 44, - 45, - 46, - 47, - 48, - 49, - 50, - 51, - 52, - 53 - ], - [ - 54, - 55, - 56, - 57, - 58, - 59, - 60, - 61, - 62, - 63, - 64, - 65, - 66, - 67, - 68, - 69, - 70, - 71 - ] - ], - "core": [ - [ - 0 - ], - [ - 1 - ], - [ - 2 - ], - [ - 3 - ], - [ - 4 - ], - [ - 5 - ], - [ - 6 - ], - [ - 7 - ], - [ - 8 - ], - [ - 9 - ], - [ - 10 - ], - [ - 11 - ], - [ - 12 - ], - [ - 13 - ], - [ - 14 - ], - [ - 15 - ], - [ - 16 - ], - [ - 17 - ], - [ - 18 - ], - [ - 19 - ], - [ - 20 - ], - [ - 21 - ], - [ - 22 - ], - [ - 23 - ], - [ - 24 - ], - [ - 25 - ], - [ - 26 - ], - [ - 27 - ], - [ - 28 - ], - [ - 29 - ], - [ - 30 - ], - [ - 31 - ], - [ - 32 - ], - [ - 33 - ], - [ - 34 - ], - [ - 35 - ], - [ - 36 - ], - [ - 37 - ], - [ - 38 - ], - [ - 39 - ], - [ - 40 - ], - [ - 41 - ], - [ - 42 - ], - [ - 43 - ], - [ - 44 - ], - [ - 45 - ], - [ - 46 - ], - [ - 47 - ], - [ - 48 - ], - [ - 49 - ], - [ - 50 - ], - [ - 51 - ], - [ - 52 - ], - [ - 53 - ], - [ - 54 - ], - [ - 55 - ], - [ - 56 - ], - [ - 57 - ], - [ - 58 - ], - [ - 59 - ], - [ - 60 - ], - [ - 61 - ], - [ - 62 - ], - [ - 63 - ], - [ - 64 - ], - [ - 65 - ], - [ - 66 - ], - [ - 67 - ], - [ - 68 - ], - [ - 69 - ], - [ - 70 - ], - [ - 71 - ] - ] - } - } - ] + "value": 350 + }, + "topology": { + "node": [ + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 11, + 12, + 13, + 14, + 15, + 16, + 17, + 18, + 19, + 20, + 21, + 22, + 23, + 24, + 25, + 26, + 27, + 28, + 29, + 30, + 31, + 32, + 33, + 34, + 35, + 36, + 37, + 38, + 39, + 40, + 41, + 42, + 43, + 44, + 45, + 46, + 47, + 48, + 49, + 50, + 51, + 52, + 53, + 54, + 55, + 56, + 57, + 58, + 59, + 60, + 61, + 62, + 63, + 64, + 65, + 66, + 67, + 68, + 69, + 70, + 71 + ], + "socket": [ + [ + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 11, + 12, + 13, + 14, + 15, + 16, + 17, + 18, + 19, + 20, + 21, + 22, + 23, + 24, + 25, + 26, + 27, + 28, + 29, + 30, + 31, + 32, + 33, + 34, + 35 + ], + [ + 36, + 37, + 38, + 39, + 40, + 41, + 42, + 43, + 44, + 45, + 46, + 47, + 48, + 49, + 50, + 51, + 52, + 53, + 54, + 55, + 56, + 57, + 58, + 59, + 60, + 61, + 62, + 63, + 64, + 65, + 66, + 67, + 68, + 69, + 70, + 71 + ] + ], + "memoryDomain": [ + [ + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 11, + 12, + 13, + 14, + 15, + 16, + 17 + ], + [ + 18, + 19, + 20, + 21, + 22, + 23, + 24, + 25, + 26, + 27, + 28, + 29, + 30, + 31, + 32, + 33, + 34, + 35 + ], + [ + 36, + 37, + 38, + 39, + 40, + 41, + 42, + 43, + 44, + 45, + 46, + 47, + 48, + 49, + 50, + 51, + 52, + 53 + ], + [ + 54, + 55, + 56, + 57, + 58, + 59, + 60, + 61, + 62, + 63, + 64, + 65, + 66, + 67, + 68, + 69, + 70, + 71 + ] + ], + "core": [ + [ + 0 + ], + [ + 1 + ], + [ + 2 + ], + [ + 3 + ], + [ + 4 + ], + [ + 5 + ], + [ + 6 + ], + [ + 7 + ], + [ + 8 + ], + [ + 9 + ], + [ + 10 + ], + [ + 11 + ], + [ + 12 + ], + [ + 13 + ], + [ + 14 + ], + [ + 15 + ], + [ + 16 + ], + [ + 17 + ], + [ + 18 + ], + [ + 19 + ], + [ + 20 + ], + [ + 21 + ], + [ + 22 + ], + [ + 23 + ], + [ + 24 + ], + [ + 25 + ], + [ + 26 + ], + [ + 27 + ], + [ + 28 + ], + [ + 29 + ], + [ + 30 + ], + [ + 31 + ], + [ + 32 + ], + [ + 33 + ], + [ + 34 + ], + [ + 35 + ], + [ + 36 + ], + [ + 37 + ], + [ + 38 + ], + [ + 39 + ], + [ + 40 + ], + [ + 41 + ], + [ + 42 + ], + [ + 43 + ], + [ + 44 + ], + [ + 45 + ], + [ + 46 + ], + [ + 47 + ], + [ + 48 + ], + [ + 49 + ], + [ + 50 + ], + [ + 51 + ], + [ + 52 + ], + [ + 53 + ], + [ + 54 + ], + [ + 55 + ], + [ + 56 + ], + [ + 57 + ], + [ + 58 + ], + [ + 59 + ], + [ + 60 + ], + [ + 61 + ], + [ + 62 + ], + [ + 63 + ], + [ + 64 + ], + [ + 65 + ], + [ + 66 + ], + [ + 67 + ], + [ + 68 + ], + [ + 69 + ], + [ + 70 + ], + [ + 71 + ] + ] + } + } + ] } diff --git a/internal/repository/job.go b/internal/repository/job.go index da71566..9438edd 100644 --- a/internal/repository/job.go +++ b/internal/repository/job.go @@ -16,6 +16,7 @@ import ( "github.com/ClusterCockpit/cc-backend/internal/graph/model" "github.com/ClusterCockpit/cc-backend/internal/metricdata" + "github.com/ClusterCockpit/cc-backend/internal/util" "github.com/ClusterCockpit/cc-backend/pkg/archive" "github.com/ClusterCockpit/cc-backend/pkg/log" "github.com/ClusterCockpit/cc-backend/pkg/lrucache" @@ -64,6 +65,7 @@ var jobColumns []string = []string{ func scanJob(row interface{ Scan(...interface{}) error }) (*schema.Job, error) { job := &schema.Job{} + if err := row.Scan( &job.ID, &job.JobID, &job.User, &job.Project, &job.Cluster, &job.SubCluster, &job.StartTimeUnix, &job.Partition, &job.ArrayJobId, &job.NumNodes, &job.NumHWThreads, &job.NumAcc, &job.Exclusive, &job.MonitoringStatus, &job.SMT, &job.State, @@ -79,7 +81,7 @@ func scanJob(row interface{ Scan(...interface{}) error }) (*schema.Job, error) { job.RawResources = nil if err := json.Unmarshal(job.RawFootprint, &job.Footprint); err != nil { - log.Warn("Error while unmarshaling raw footprint json") + log.Warnf("Error while unmarshaling raw footprint json: %v", err) return nil, err } job.RawFootprint = nil @@ -242,6 +244,7 @@ func (r *JobRepository) Find( } log.Debugf("Timer Find %s", time.Since(start)) + return scanJob(q.RunWith(r.stmtCache).QueryRow()) } @@ -397,6 +400,11 @@ func (r *JobRepository) FindConcurrentJobs( // Start inserts a new job in the table, returning the unique job ID. // Statistics are not transfered! func (r *JobRepository) Start(job *schema.JobMeta) (id int64, err error) { + job.RawFootprint, err = json.Marshal(job.Footprint) + if err != nil { + return -1, fmt.Errorf("REPOSITORY/JOB > encoding footprint field failed: %w", err) + } + job.RawResources, err = json.Marshal(job.Resources) if err != nil { return -1, fmt.Errorf("REPOSITORY/JOB > encoding resources field failed: %w", err) @@ -409,10 +417,10 @@ func (r *JobRepository) Start(job *schema.JobMeta) (id int64, err error) { res, err := r.DB.NamedExec(`INSERT INTO job ( job_id, user, project, cluster, subcluster, `+"`partition`"+`, array_job_id, num_nodes, num_hwthreads, num_acc, - exclusive, monitoring_status, smt, job_state, start_time, duration, walltime, resources, meta_data + exclusive, monitoring_status, smt, job_state, start_time, duration, walltime, footprint, resources, meta_data ) VALUES ( :job_id, :user, :project, :cluster, :subcluster, :partition, :array_job_id, :num_nodes, :num_hwthreads, :num_acc, - :exclusive, :monitoring_status, :smt, :job_state, :start_time, :duration, :walltime, :resources, :meta_data + :exclusive, :monitoring_status, :smt, :job_state, :start_time, :duration, :walltime, :footprint, :resources, :meta_data );`, job) if err != nil { return -1, err @@ -478,34 +486,32 @@ func (r *JobRepository) UpdateMonitoringStatus(job int64, monitoringStatus int32 // Stop updates the job with the database id jobId using the provided arguments. func (r *JobRepository) MarkArchived( - jobId int64, + jobMeta *schema.JobMeta, monitoringStatus int32, - metricStats map[string]schema.JobStatistics, ) error { stmt := sq.Update("job"). Set("monitoring_status", monitoringStatus). - Where("job.id = ?", jobId) + Where("job.id = ?", jobMeta.JobID) - for metric, stats := range metricStats { - switch metric { - case "flops_any": - stmt = stmt.Set("flops_any_avg", stats.Avg) - case "mem_used": - stmt = stmt.Set("mem_used_max", stats.Max) - case "mem_bw": - stmt = stmt.Set("mem_bw_avg", stats.Avg) - case "load": - stmt = stmt.Set("load_avg", stats.Avg) - case "cpu_load": - stmt = stmt.Set("load_avg", stats.Avg) - case "net_bw": - stmt = stmt.Set("net_bw_avg", stats.Avg) - case "file_bw": - stmt = stmt.Set("file_bw_avg", stats.Avg) - default: - log.Debugf("MarkArchived() Metric '%v' unknown", metric) - } + sc, err := archive.GetSubCluster(jobMeta.Cluster, jobMeta.SubCluster) + if err != nil { + log.Errorf("cannot get subcluster: %s", err.Error()) + return err } + footprint := make(map[string]float64) + + for _, fp := range sc.Footprint { + footprint[fp] = util.LoadJobStat(jobMeta, fp) + } + + var rawFootprint []byte + + if rawFootprint, err = json.Marshal(footprint); err != nil { + log.Warnf("Error while marshaling footprint for job, DB ID '%v'", jobMeta.ID) + return err + } + + stmt = stmt.Set("footprint", rawFootprint) if _, err := stmt.RunWith(r.stmtCache).Exec(); err != nil { log.Warn("Error while marking job as archived") @@ -541,7 +547,7 @@ func (r *JobRepository) archivingWorker() { } // Update the jobs database entry one last time: - if err := r.MarkArchived(job.ID, schema.MonitoringStatusArchivingSuccessful, jobMeta.Statistics); err != nil { + if err := r.MarkArchived(jobMeta, schema.MonitoringStatusArchivingSuccessful); err != nil { log.Errorf("archiving job (dbid: %d) failed at marking archived step: %s", job.ID, err.Error()) continue } @@ -786,12 +792,10 @@ func (r *JobRepository) FindJobsBetween(startTimeBegin int64, startTimeEnd int64 const NamedJobInsert string = `INSERT INTO job ( job_id, user, project, cluster, subcluster, ` + "`partition`" + `, array_job_id, num_nodes, num_hwthreads, num_acc, - exclusive, monitoring_status, smt, job_state, start_time, duration, walltime, resources, meta_data, - mem_used_max, flops_any_avg, mem_bw_avg, load_avg, net_bw_avg, net_data_vol_total, file_bw_avg, file_data_vol_total + exclusive, monitoring_status, smt, job_state, start_time, duration, walltime, footprint, resources, meta_data ) VALUES ( :job_id, :user, :project, :cluster, :subcluster, :partition, :array_job_id, :num_nodes, :num_hwthreads, :num_acc, - :exclusive, :monitoring_status, :smt, :job_state, :start_time, :duration, :walltime, :resources, :meta_data, - :mem_used_max, :flops_any_avg, :mem_bw_avg, :load_avg, :net_bw_avg, :net_data_vol_total, :file_bw_avg, :file_data_vol_total + :exclusive, :monitoring_status, :smt, :job_state, :start_time, :duration, :walltime, :footprint, :resources, :meta_data );` func (r *JobRepository) InsertJob(job *schema.Job) (int64, error) { diff --git a/internal/util/statistics.go b/internal/util/statistics.go index 384de58..9d971dc 100644 --- a/internal/util/statistics.go +++ b/internal/util/statistics.go @@ -4,7 +4,10 @@ // license that can be found in the LICENSE file. package util -import "golang.org/x/exp/constraints" +import ( + "github.com/ClusterCockpit/cc-backend/pkg/schema" + "golang.org/x/exp/constraints" +) func Min[T constraints.Ordered](a, b T) T { if a < b { @@ -19,3 +22,15 @@ func Max[T constraints.Ordered](a, b T) T { } return b } + +func LoadJobStat(job *schema.JobMeta, metric string) float64 { + if stats, ok := job.Statistics[metric]; ok { + if metric == "mem_used" { + return stats.Max + } else { + return stats.Avg + } + } + + return 0.0 +} diff --git a/pkg/archive/fsBackend_test.go b/pkg/archive/fsBackend_test.go index 78d634a..d60e478 100644 --- a/pkg/archive/fsBackend_test.go +++ b/pkg/archive/fsBackend_test.go @@ -51,7 +51,7 @@ func TestInit(t *testing.T) { if version != 1 { t.Fail() } - if len(fsa.clusters) != 3 || fsa.clusters[0] != "emmy" { + if len(fsa.clusters) != 3 || fsa.clusters[1] != "emmy" { t.Fail() } } diff --git a/pkg/schema/job.go b/pkg/schema/job.go index f5363b3..3cfdf55 100644 --- a/pkg/schema/job.go +++ b/pkg/schema/job.go @@ -16,8 +16,6 @@ import ( // Common subset of Job and JobMeta. Use one of those, not this type directly. type BaseJob struct { - Footprint map[string]float64 `json:"footPrint"` - MetaData map[string]string `json:"metaData"` Cluster string `json:"cluster" db:"cluster" example:"fritz"` SubCluster string `json:"subCluster" db:"subcluster" example:"main"` Partition string `json:"partition,omitempty" db:"partition" example:"main"` @@ -27,8 +25,10 @@ type BaseJob struct { Tags []*Tag `json:"tags,omitempty"` RawFootprint []byte `json:"-" db:"footprint"` RawMetaData []byte `json:"-" db:"meta_data"` - Resources []*Resource `json:"resources"` RawResources []byte `json:"-" db:"resources"` + Resources []*Resource `json:"resources"` + Footprint map[string]float64 `json:"footPrint"` + MetaData map[string]string `json:"metaData"` ConcurrentJobs JobLinkResultList `json:"concurrentJobs"` Energy float64 `json:"energy"` ArrayJobId int64 `json:"arrayJobId,omitempty" db:"array_job_id" example:"123000"` From ac9bba8b5bed2a7f8f82da6e82d853ad3a9dcce1 Mon Sep 17 00:00:00 2001 From: Jan Eitzinger Date: Thu, 4 Jul 2024 15:05:24 +0200 Subject: [PATCH 10/10] Restructure and simplify job repo --- internal/importer/handleImport.go | 31 +-- internal/repository/job.go | 247 ------------------ internal/repository/jobCreate.go | 75 ++++++ internal/repository/jobFind.go | 192 ++++++++++++++ internal/repository/{query.go => jobQuery.go} | 0 internal/repository/user.go | 33 ++- 6 files changed, 288 insertions(+), 290 deletions(-) create mode 100644 internal/repository/jobCreate.go create mode 100644 internal/repository/jobFind.go rename internal/repository/{query.go => jobQuery.go} (100%) diff --git a/internal/importer/handleImport.go b/internal/importer/handleImport.go index 4ef9f84..81a312f 100644 --- a/internal/importer/handleImport.go +++ b/internal/importer/handleImport.go @@ -10,7 +10,6 @@ import ( "fmt" "os" "strings" - "time" "github.com/ClusterCockpit/cc-backend/internal/config" "github.com/ClusterCockpit/cc-backend/internal/repository" @@ -43,8 +42,8 @@ func HandleImportFlag(flag string) error { } dec := json.NewDecoder(bytes.NewReader(raw)) dec.DisallowUnknownFields() - jobMeta := schema.JobMeta{BaseJob: schema.JobDefaults} - if err = dec.Decode(&jobMeta); err != nil { + job := schema.JobMeta{BaseJob: schema.JobDefaults} + if err = dec.Decode(&job); err != nil { log.Warn("Error while decoding raw json metadata for import") return err } @@ -68,26 +67,9 @@ func HandleImportFlag(flag string) error { return err } - // checkJobData(&jobData) + job.MonitoringStatus = schema.MonitoringStatusArchivingSuccessful - jobMeta.MonitoringStatus = schema.MonitoringStatusArchivingSuccessful - - // if _, err = r.Find(&jobMeta.JobID, &jobMeta.Cluster, &jobMeta.StartTime); err != sql.ErrNoRows { - // if err != nil { - // log.Warn("Error while finding job in jobRepository") - // return err - // } - // - // return fmt.Errorf("REPOSITORY/INIT > a job with that jobId, cluster and startTime does already exist") - // } - // - job := schema.Job{ - BaseJob: jobMeta.BaseJob, - StartTime: time.Unix(jobMeta.StartTime, 0), - StartTimeUnix: jobMeta.StartTime, - } - - sc, err := archive.GetSubCluster(jobMeta.Cluster, jobMeta.SubCluster) + sc, err := archive.GetSubCluster(job.Cluster, job.SubCluster) if err != nil { log.Errorf("cannot get subcluster: %s", err.Error()) return err @@ -96,14 +78,13 @@ func HandleImportFlag(flag string) error { job.Footprint = make(map[string]float64) for _, fp := range sc.Footprint { - job.Footprint[fp] = util.LoadJobStat(&jobMeta, fp) + job.Footprint[fp] = util.LoadJobStat(&job, fp) } job.RawFootprint, err = json.Marshal(job.Footprint) if err != nil { log.Warn("Error while marshaling job footprint") return err } - job.RawResources, err = json.Marshal(job.Resources) if err != nil { log.Warn("Error while marshaling job resources") @@ -120,7 +101,7 @@ func HandleImportFlag(flag string) error { return err } - if err = archive.GetHandle().ImportJob(&jobMeta, &jobData); err != nil { + if err = archive.GetHandle().ImportJob(&job, &jobData); err != nil { log.Error("Error while importing job") return err } diff --git a/internal/repository/job.go b/internal/repository/job.go index 9438edd..8dda6e0 100644 --- a/internal/repository/job.go +++ b/internal/repository/job.go @@ -222,230 +222,6 @@ func (r *JobRepository) UpdateMetadata(job *schema.Job, key, val string) (err er return archive.UpdateMetadata(job, job.MetaData) } -// Find executes a SQL query to find a specific batch job. -// The job is queried using the batch job id, the cluster name, -// and the start time of the job in UNIX epoch time seconds. -// It returns a pointer to a schema.Job data structure and an error variable. -// To check if no job was found test err == sql.ErrNoRows -func (r *JobRepository) Find( - jobId *int64, - cluster *string, - startTime *int64, -) (*schema.Job, error) { - start := time.Now() - q := sq.Select(jobColumns...).From("job"). - Where("job.job_id = ?", *jobId) - - if cluster != nil { - q = q.Where("job.cluster = ?", *cluster) - } - if startTime != nil { - q = q.Where("job.start_time = ?", *startTime) - } - - log.Debugf("Timer Find %s", time.Since(start)) - - return scanJob(q.RunWith(r.stmtCache).QueryRow()) -} - -// Find executes a SQL query to find a specific batch job. -// The job is queried using the batch job id, the cluster name, -// and the start time of the job in UNIX epoch time seconds. -// It returns a pointer to a schema.Job data structure and an error variable. -// To check if no job was found test err == sql.ErrNoRows -func (r *JobRepository) FindAll( - jobId *int64, - cluster *string, - startTime *int64, -) ([]*schema.Job, error) { - start := time.Now() - q := sq.Select(jobColumns...).From("job"). - Where("job.job_id = ?", *jobId) - - if cluster != nil { - q = q.Where("job.cluster = ?", *cluster) - } - if startTime != nil { - q = q.Where("job.start_time = ?", *startTime) - } - - rows, err := q.RunWith(r.stmtCache).Query() - if err != nil { - log.Error("Error while running query") - return nil, err - } - - jobs := make([]*schema.Job, 0, 10) - for rows.Next() { - job, err := scanJob(rows) - if err != nil { - log.Warn("Error while scanning rows") - return nil, err - } - jobs = append(jobs, job) - } - log.Debugf("Timer FindAll %s", time.Since(start)) - return jobs, nil -} - -// FindById executes a SQL query to find a specific batch job. -// The job is queried using the database id. -// It returns a pointer to a schema.Job data structure and an error variable. -// To check if no job was found test err == sql.ErrNoRows -func (r *JobRepository) FindById(jobId int64) (*schema.Job, error) { - q := sq.Select(jobColumns...). - From("job").Where("job.id = ?", jobId) - return scanJob(q.RunWith(r.stmtCache).QueryRow()) -} - -func (r *JobRepository) FindConcurrentJobs( - ctx context.Context, - job *schema.Job, -) (*model.JobLinkResultList, error) { - if job == nil { - return nil, nil - } - - query, qerr := SecurityCheck(ctx, sq.Select("job.id", "job.job_id", "job.start_time").From("job")) - if qerr != nil { - return nil, qerr - } - - query = query.Where("cluster = ?", job.Cluster) - var startTime int64 - var stopTime int64 - - startTime = job.StartTimeUnix - hostname := job.Resources[0].Hostname - - if job.State == schema.JobStateRunning { - stopTime = time.Now().Unix() - } else { - stopTime = startTime + int64(job.Duration) - } - - // Add 200s overlap for jobs start time at the end - startTimeTail := startTime + 10 - stopTimeTail := stopTime - 200 - startTimeFront := startTime + 200 - - queryRunning := query.Where("job.job_state = ?").Where("(job.start_time BETWEEN ? AND ? OR job.start_time < ?)", - "running", startTimeTail, stopTimeTail, startTime) - queryRunning = queryRunning.Where("job.resources LIKE ?", fmt.Sprint("%", hostname, "%")) - - query = query.Where("job.job_state != ?").Where("((job.start_time BETWEEN ? AND ?) OR (job.start_time + job.duration) BETWEEN ? AND ? OR (job.start_time < ?) AND (job.start_time + job.duration) > ?)", - "running", startTimeTail, stopTimeTail, startTimeFront, stopTimeTail, startTime, stopTime) - query = query.Where("job.resources LIKE ?", fmt.Sprint("%", hostname, "%")) - - rows, err := query.RunWith(r.stmtCache).Query() - if err != nil { - log.Errorf("Error while running query: %v", err) - return nil, err - } - - items := make([]*model.JobLink, 0, 10) - queryString := fmt.Sprintf("cluster=%s", job.Cluster) - - for rows.Next() { - var id, jobId, startTime sql.NullInt64 - - if err = rows.Scan(&id, &jobId, &startTime); err != nil { - log.Warn("Error while scanning rows") - return nil, err - } - - if id.Valid { - queryString += fmt.Sprintf("&jobId=%d", int(jobId.Int64)) - items = append(items, - &model.JobLink{ - ID: fmt.Sprint(id.Int64), - JobID: int(jobId.Int64), - }) - } - } - - rows, err = queryRunning.RunWith(r.stmtCache).Query() - if err != nil { - log.Errorf("Error while running query: %v", err) - return nil, err - } - - for rows.Next() { - var id, jobId, startTime sql.NullInt64 - - if err := rows.Scan(&id, &jobId, &startTime); err != nil { - log.Warn("Error while scanning rows") - return nil, err - } - - if id.Valid { - queryString += fmt.Sprintf("&jobId=%d", int(jobId.Int64)) - items = append(items, - &model.JobLink{ - ID: fmt.Sprint(id.Int64), - JobID: int(jobId.Int64), - }) - } - } - - cnt := len(items) - - return &model.JobLinkResultList{ - ListQuery: &queryString, - Items: items, - Count: &cnt, - }, nil -} - -// Start inserts a new job in the table, returning the unique job ID. -// Statistics are not transfered! -func (r *JobRepository) Start(job *schema.JobMeta) (id int64, err error) { - job.RawFootprint, err = json.Marshal(job.Footprint) - if err != nil { - return -1, fmt.Errorf("REPOSITORY/JOB > encoding footprint field failed: %w", err) - } - - job.RawResources, err = json.Marshal(job.Resources) - if err != nil { - return -1, fmt.Errorf("REPOSITORY/JOB > encoding resources field failed: %w", err) - } - - job.RawMetaData, err = json.Marshal(job.MetaData) - if err != nil { - return -1, fmt.Errorf("REPOSITORY/JOB > encoding metaData field failed: %w", err) - } - - res, err := r.DB.NamedExec(`INSERT INTO job ( - job_id, user, project, cluster, subcluster, `+"`partition`"+`, array_job_id, num_nodes, num_hwthreads, num_acc, - exclusive, monitoring_status, smt, job_state, start_time, duration, walltime, footprint, resources, meta_data - ) VALUES ( - :job_id, :user, :project, :cluster, :subcluster, :partition, :array_job_id, :num_nodes, :num_hwthreads, :num_acc, - :exclusive, :monitoring_status, :smt, :job_state, :start_time, :duration, :walltime, :footprint, :resources, :meta_data - );`, job) - if err != nil { - return -1, err - } - - return res.LastInsertId() -} - -// Stop updates the job with the database id jobId using the provided arguments. -func (r *JobRepository) Stop( - jobId int64, - duration int32, - state schema.JobState, - monitoringStatus int32, -) (err error) { - stmt := sq.Update("job"). - Set("job_state", state). - Set("duration", duration). - Set("monitoring_status", monitoringStatus). - Where("job.id = ?", jobId) - - _, err = stmt.RunWith(r.stmtCache).Exec() - return -} - func (r *JobRepository) DeleteJobsBefore(startTime int64) (int, error) { var cnt int q := sq.Select("count(*)").From("job").Where("job.start_time < ?", startTime) @@ -789,26 +565,3 @@ func (r *JobRepository) FindJobsBetween(startTimeBegin int64, startTimeEnd int64 log.Infof("Return job count %d", len(jobs)) return jobs, nil } - -const NamedJobInsert string = `INSERT INTO job ( - job_id, user, project, cluster, subcluster, ` + "`partition`" + `, array_job_id, num_nodes, num_hwthreads, num_acc, - exclusive, monitoring_status, smt, job_state, start_time, duration, walltime, footprint, resources, meta_data -) VALUES ( - :job_id, :user, :project, :cluster, :subcluster, :partition, :array_job_id, :num_nodes, :num_hwthreads, :num_acc, - :exclusive, :monitoring_status, :smt, :job_state, :start_time, :duration, :walltime, :footprint, :resources, :meta_data -);` - -func (r *JobRepository) InsertJob(job *schema.Job) (int64, error) { - res, err := r.DB.NamedExec(NamedJobInsert, job) - if err != nil { - log.Warn("Error while NamedJobInsert") - return 0, err - } - id, err := res.LastInsertId() - if err != nil { - log.Warn("Error while getting last insert ID") - return 0, err - } - - return id, nil -} diff --git a/internal/repository/jobCreate.go b/internal/repository/jobCreate.go new file mode 100644 index 0000000..43c26c1 --- /dev/null +++ b/internal/repository/jobCreate.go @@ -0,0 +1,75 @@ +// Copyright (C) NHR@FAU, University Erlangen-Nuremberg. +// All rights reserved. +// Use of this source code is governed by a MIT-style +// license that can be found in the LICENSE file. +package repository + +import ( + "encoding/json" + "fmt" + + "github.com/ClusterCockpit/cc-backend/pkg/log" + "github.com/ClusterCockpit/cc-backend/pkg/schema" + sq "github.com/Masterminds/squirrel" +) + +const NamedJobInsert string = `INSERT INTO job ( + job_id, user, project, cluster, subcluster, ` + "`partition`" + `, array_job_id, num_nodes, num_hwthreads, num_acc, + exclusive, monitoring_status, smt, job_state, start_time, duration, walltime, footprint, resources, meta_data +) VALUES ( + :job_id, :user, :project, :cluster, :subcluster, :partition, :array_job_id, :num_nodes, :num_hwthreads, :num_acc, + :exclusive, :monitoring_status, :smt, :job_state, :start_time, :duration, :walltime, :footprint, :resources, :meta_data +);` + +func (r *JobRepository) InsertJob(job *schema.JobMeta) (int64, error) { + res, err := r.DB.NamedExec(NamedJobInsert, job) + if err != nil { + log.Warn("Error while NamedJobInsert") + return 0, err + } + id, err := res.LastInsertId() + if err != nil { + log.Warn("Error while getting last insert ID") + return 0, err + } + + return id, nil +} + +// Start inserts a new job in the table, returning the unique job ID. +// Statistics are not transfered! +func (r *JobRepository) Start(job *schema.JobMeta) (id int64, err error) { + job.RawFootprint, err = json.Marshal(job.Footprint) + if err != nil { + return -1, fmt.Errorf("REPOSITORY/JOB > encoding footprint field failed: %w", err) + } + + job.RawResources, err = json.Marshal(job.Resources) + if err != nil { + return -1, fmt.Errorf("REPOSITORY/JOB > encoding resources field failed: %w", err) + } + + job.RawMetaData, err = json.Marshal(job.MetaData) + if err != nil { + return -1, fmt.Errorf("REPOSITORY/JOB > encoding metaData field failed: %w", err) + } + + return r.InsertJob(job) +} + +// Stop updates the job with the database id jobId using the provided arguments. +func (r *JobRepository) Stop( + jobId int64, + duration int32, + state schema.JobState, + monitoringStatus int32, +) (err error) { + stmt := sq.Update("job"). + Set("job_state", state). + Set("duration", duration). + Set("monitoring_status", monitoringStatus). + Where("job.id = ?", jobId) + + _, err = stmt.RunWith(r.stmtCache).Exec() + return +} diff --git a/internal/repository/jobFind.go b/internal/repository/jobFind.go new file mode 100644 index 0000000..f1264dd --- /dev/null +++ b/internal/repository/jobFind.go @@ -0,0 +1,192 @@ +// Copyright (C) NHR@FAU, University Erlangen-Nuremberg. +// All rights reserved. +// Use of this source code is governed by a MIT-style +// license that can be found in the LICENSE file. +package repository + +import ( + "context" + "database/sql" + "fmt" + "time" + + "github.com/ClusterCockpit/cc-backend/internal/graph/model" + "github.com/ClusterCockpit/cc-backend/pkg/log" + "github.com/ClusterCockpit/cc-backend/pkg/schema" + sq "github.com/Masterminds/squirrel" +) + +// Find executes a SQL query to find a specific batch job. +// The job is queried using the batch job id, the cluster name, +// and the start time of the job in UNIX epoch time seconds. +// It returns a pointer to a schema.Job data structure and an error variable. +// To check if no job was found test err == sql.ErrNoRows +func (r *JobRepository) Find( + jobId *int64, + cluster *string, + startTime *int64, +) (*schema.Job, error) { + start := time.Now() + q := sq.Select(jobColumns...).From("job"). + Where("job.job_id = ?", *jobId) + + if cluster != nil { + q = q.Where("job.cluster = ?", *cluster) + } + if startTime != nil { + q = q.Where("job.start_time = ?", *startTime) + } + + log.Debugf("Timer Find %s", time.Since(start)) + + return scanJob(q.RunWith(r.stmtCache).QueryRow()) +} + +// Find executes a SQL query to find a specific batch job. +// The job is queried using the batch job id, the cluster name, +// and the start time of the job in UNIX epoch time seconds. +// It returns a pointer to a schema.Job data structure and an error variable. +// To check if no job was found test err == sql.ErrNoRows +func (r *JobRepository) FindAll( + jobId *int64, + cluster *string, + startTime *int64, +) ([]*schema.Job, error) { + start := time.Now() + q := sq.Select(jobColumns...).From("job"). + Where("job.job_id = ?", *jobId) + + if cluster != nil { + q = q.Where("job.cluster = ?", *cluster) + } + if startTime != nil { + q = q.Where("job.start_time = ?", *startTime) + } + + rows, err := q.RunWith(r.stmtCache).Query() + if err != nil { + log.Error("Error while running query") + return nil, err + } + + jobs := make([]*schema.Job, 0, 10) + for rows.Next() { + job, err := scanJob(rows) + if err != nil { + log.Warn("Error while scanning rows") + return nil, err + } + jobs = append(jobs, job) + } + log.Debugf("Timer FindAll %s", time.Since(start)) + return jobs, nil +} + +// FindById executes a SQL query to find a specific batch job. +// The job is queried using the database id. +// It returns a pointer to a schema.Job data structure and an error variable. +// To check if no job was found test err == sql.ErrNoRows +func (r *JobRepository) FindById(jobId int64) (*schema.Job, error) { + q := sq.Select(jobColumns...). + From("job").Where("job.id = ?", jobId) + return scanJob(q.RunWith(r.stmtCache).QueryRow()) +} + +func (r *JobRepository) FindConcurrentJobs( + ctx context.Context, + job *schema.Job, +) (*model.JobLinkResultList, error) { + if job == nil { + return nil, nil + } + + query, qerr := SecurityCheck(ctx, sq.Select("job.id", "job.job_id", "job.start_time").From("job")) + if qerr != nil { + return nil, qerr + } + + query = query.Where("cluster = ?", job.Cluster) + var startTime int64 + var stopTime int64 + + startTime = job.StartTimeUnix + hostname := job.Resources[0].Hostname + + if job.State == schema.JobStateRunning { + stopTime = time.Now().Unix() + } else { + stopTime = startTime + int64(job.Duration) + } + + // Add 200s overlap for jobs start time at the end + startTimeTail := startTime + 10 + stopTimeTail := stopTime - 200 + startTimeFront := startTime + 200 + + queryRunning := query.Where("job.job_state = ?").Where("(job.start_time BETWEEN ? AND ? OR job.start_time < ?)", + "running", startTimeTail, stopTimeTail, startTime) + queryRunning = queryRunning.Where("job.resources LIKE ?", fmt.Sprint("%", hostname, "%")) + + query = query.Where("job.job_state != ?").Where("((job.start_time BETWEEN ? AND ?) OR (job.start_time + job.duration) BETWEEN ? AND ? OR (job.start_time < ?) AND (job.start_time + job.duration) > ?)", + "running", startTimeTail, stopTimeTail, startTimeFront, stopTimeTail, startTime, stopTime) + query = query.Where("job.resources LIKE ?", fmt.Sprint("%", hostname, "%")) + + rows, err := query.RunWith(r.stmtCache).Query() + if err != nil { + log.Errorf("Error while running query: %v", err) + return nil, err + } + + items := make([]*model.JobLink, 0, 10) + queryString := fmt.Sprintf("cluster=%s", job.Cluster) + + for rows.Next() { + var id, jobId, startTime sql.NullInt64 + + if err = rows.Scan(&id, &jobId, &startTime); err != nil { + log.Warn("Error while scanning rows") + return nil, err + } + + if id.Valid { + queryString += fmt.Sprintf("&jobId=%d", int(jobId.Int64)) + items = append(items, + &model.JobLink{ + ID: fmt.Sprint(id.Int64), + JobID: int(jobId.Int64), + }) + } + } + + rows, err = queryRunning.RunWith(r.stmtCache).Query() + if err != nil { + log.Errorf("Error while running query: %v", err) + return nil, err + } + + for rows.Next() { + var id, jobId, startTime sql.NullInt64 + + if err := rows.Scan(&id, &jobId, &startTime); err != nil { + log.Warn("Error while scanning rows") + return nil, err + } + + if id.Valid { + queryString += fmt.Sprintf("&jobId=%d", int(jobId.Int64)) + items = append(items, + &model.JobLink{ + ID: fmt.Sprint(id.Int64), + JobID: int(jobId.Int64), + }) + } + } + + cnt := len(items) + + return &model.JobLinkResultList{ + ListQuery: &queryString, + Items: items, + Count: &cnt, + }, nil +} diff --git a/internal/repository/query.go b/internal/repository/jobQuery.go similarity index 100% rename from internal/repository/query.go rename to internal/repository/jobQuery.go diff --git a/internal/repository/user.go b/internal/repository/user.go index 3b7d945..a8776a9 100644 --- a/internal/repository/user.go +++ b/internal/repository/user.go @@ -72,7 +72,6 @@ func (r *UserRepository) GetUser(username string) (*schema.User, error) { } func (r *UserRepository) GetLdapUsernames() ([]string, error) { - var users []string rows, err := r.DB.Query(`SELECT username FROM user WHERE user.ldap = 1`) if err != nil { @@ -132,7 +131,6 @@ func (r *UserRepository) AddUser(user *schema.User) error { } func (r *UserRepository) DelUser(username string) error { - _, err := r.DB.Exec(`DELETE FROM user WHERE user.username = ?`, username) if err != nil { log.Errorf("Error while deleting user '%s' from DB", username) @@ -143,7 +141,6 @@ func (r *UserRepository) DelUser(username string) error { } func (r *UserRepository) ListUsers(specialsOnly bool) ([]*schema.User, error) { - q := sq.Select("username", "name", "email", "roles", "projects").From("user") if specialsOnly { q = q.Where("(roles != '[\"user\"]' AND roles != '[]')") @@ -186,8 +183,8 @@ func (r *UserRepository) ListUsers(specialsOnly bool) ([]*schema.User, error) { func (r *UserRepository) AddRole( ctx context.Context, username string, - queryrole string) error { - + queryrole string, +) error { newRole := strings.ToLower(queryrole) user, err := r.GetUser(username) if err != nil { @@ -198,15 +195,15 @@ func (r *UserRepository) AddRole( exists, valid := user.HasValidRole(newRole) if !valid { - return fmt.Errorf("Supplied role is no valid option : %v", newRole) + return fmt.Errorf("supplied role is no valid option : %v", newRole) } if exists { - return fmt.Errorf("User %v already has role %v", username, newRole) + return fmt.Errorf("user %v already has role %v", username, newRole) } roles, _ := json.Marshal(append(user.Roles, newRole)) if _, err := sq.Update("user").Set("roles", roles).Where("user.username = ?", username).RunWith(r.DB).Exec(); err != nil { - log.Errorf("Error while adding new role for user '%s'", user.Username) + log.Errorf("error while adding new role for user '%s'", user.Username) return err } return nil @@ -223,14 +220,14 @@ func (r *UserRepository) RemoveRole(ctx context.Context, username string, queryr exists, valid := user.HasValidRole(oldRole) if !valid { - return fmt.Errorf("Supplied role is no valid option : %v", oldRole) + return fmt.Errorf("supplied role is no valid option : %v", oldRole) } if !exists { - return fmt.Errorf("Role already deleted for user '%v': %v", username, oldRole) + return fmt.Errorf("role already deleted for user '%v': %v", username, oldRole) } if oldRole == schema.GetRoleString(schema.RoleManager) && len(user.Projects) != 0 { - return fmt.Errorf("Cannot remove role 'manager' while user %s still has assigned project(s) : %v", username, user.Projects) + return fmt.Errorf("cannot remove role 'manager' while user %s still has assigned project(s) : %v", username, user.Projects) } var newroles []string @@ -240,7 +237,7 @@ func (r *UserRepository) RemoveRole(ctx context.Context, username string, queryr } } - var mroles, _ = json.Marshal(newroles) + mroles, _ := json.Marshal(newroles) if _, err := sq.Update("user").Set("roles", mroles).Where("user.username = ?", username).RunWith(r.DB).Exec(); err != nil { log.Errorf("Error while removing role for user '%s'", user.Username) return err @@ -251,15 +248,15 @@ func (r *UserRepository) RemoveRole(ctx context.Context, username string, queryr func (r *UserRepository) AddProject( ctx context.Context, username string, - project string) error { - + project string, +) error { user, err := r.GetUser(username) if err != nil { return err } if !user.HasRole(schema.RoleManager) { - return fmt.Errorf("user '%s' is not a manager!", username) + return fmt.Errorf("user '%s' is not a manager", username) } if user.HasProject(project) { @@ -281,11 +278,11 @@ func (r *UserRepository) RemoveProject(ctx context.Context, username string, pro } if !user.HasRole(schema.RoleManager) { - return fmt.Errorf("user '%#v' is not a manager!", username) + return fmt.Errorf("user '%#v' is not a manager", username) } if !user.HasProject(project) { - return fmt.Errorf("user '%#v': Cannot remove project '%#v' - Does not match!", username, project) + return fmt.Errorf("user '%#v': Cannot remove project '%#v' - Does not match", username, project) } var exists bool @@ -298,7 +295,7 @@ func (r *UserRepository) RemoveProject(ctx context.Context, username string, pro } } - if exists == true { + if exists { var result interface{} if len(newprojects) == 0 { result = "[]"