diff --git a/api/schema.graphqls b/api/schema.graphqls index b3dadb5..d1c78f3 100644 --- a/api/schema.graphqls +++ b/api/schema.graphqls @@ -237,10 +237,12 @@ enum Aggregate { USER PROJECT CLUSTER + SUBCLUSTER } enum SortByAggregate { TOTALWALLTIME TOTALJOBS + TOTALUSERS TOTALNODES TOTALNODEHOURS TOTALCORES @@ -501,11 +503,12 @@ type MetricHistoPoint { } type JobsStatistics { - id: ID! # If `groupBy` was used, ID of the user/project/cluster + id: ID! # If `groupBy` was used, ID of the user/project/cluster/subcluster name: String! # if User-Statistics: Given Name of Account (ID) Owner + totalUsers: Int! # if *not* User-Statistics: Number of distinct users among the matched jobs (active users when filtered to running jobs) totalJobs: Int! # Number of jobs runningJobs: Int! # Number of running jobs - shortJobs: Int! # Number of jobs with a duration of less than duration + shortJobs: Int! # Number of jobs with a duration of less than the configured ShortRunningJobsDuration totalWalltime: Int! # Sum of the duration of all matched jobs in hours totalNodes: Int! # Sum of the nodes of all matched jobs totalNodeHours: Int! 
# Sum of the node hours of all matched jobs diff --git a/internal/graph/generated/generated.go b/internal/graph/generated/generated.go index a725802..ff4469a 100644 --- a/internal/graph/generated/generated.go +++ b/internal/graph/generated/generated.go @@ -201,6 +201,7 @@ type ComplexityRoot struct { TotalJobs func(childComplexity int) int TotalNodeHours func(childComplexity int) int TotalNodes func(childComplexity int) int + TotalUsers func(childComplexity int) int TotalWalltime func(childComplexity int) int } @@ -1166,6 +1167,13 @@ func (e *executableSchema) Complexity(typeName, field string, childComplexity in return e.complexity.JobsStatistics.TotalNodes(childComplexity), true + case "JobsStatistics.totalUsers": + if e.complexity.JobsStatistics.TotalUsers == nil { + break + } + + return e.complexity.JobsStatistics.TotalUsers(childComplexity), true + case "JobsStatistics.totalWalltime": if e.complexity.JobsStatistics.TotalWalltime == nil { break @@ -2567,10 +2575,12 @@ enum Aggregate { USER PROJECT CLUSTER + SUBCLUSTER } enum SortByAggregate { TOTALWALLTIME TOTALJOBS + TOTALUSERS TOTALNODES TOTALNODEHOURS TOTALCORES @@ -2831,8 +2841,9 @@ type MetricHistoPoint { } type JobsStatistics { - id: ID! # If ` + "`" + `groupBy` + "`" + ` was used, ID of the user/project/cluster + id: ID! # If ` + "`" + `groupBy` + "`" + ` was used, ID of the user/project/cluster/subcluster name: String! # if User-Statistics: Given Name of Account (ID) Owner + totalUsers: Int! # if *not* User-Statistics: Number of active users (based on running jobs) totalJobs: Int! # Number of jobs runningJobs: Int! # Number of running jobs shortJobs: Int! 
# Number of jobs with a duration of less than duration @@ -8334,6 +8345,50 @@ func (ec *executionContext) fieldContext_JobsStatistics_name(_ context.Context, return fc, nil } +func (ec *executionContext) _JobsStatistics_totalUsers(ctx context.Context, field graphql.CollectedField, obj *model.JobsStatistics) (ret graphql.Marshaler) { + fc, err := ec.fieldContext_JobsStatistics_totalUsers(ctx, field) + if err != nil { + return graphql.Null + } + ctx = graphql.WithFieldContext(ctx, fc) + defer func() { + if r := recover(); r != nil { + ec.Error(ctx, ec.Recover(ctx, r)) + ret = graphql.Null + } + }() + resTmp, err := ec.ResolverMiddleware(ctx, func(rctx context.Context) (any, error) { + ctx = rctx // use context from middleware stack in children + return obj.TotalUsers, nil + }) + if err != nil { + ec.Error(ctx, err) + return graphql.Null + } + if resTmp == nil { + if !graphql.HasFieldError(ctx, fc) { + ec.Errorf(ctx, "must not be null") + } + return graphql.Null + } + res := resTmp.(int) + fc.Result = res + return ec.marshalNInt2int(ctx, field.Selections, res) +} + +func (ec *executionContext) fieldContext_JobsStatistics_totalUsers(_ context.Context, field graphql.CollectedField) (fc *graphql.FieldContext, err error) { + fc = &graphql.FieldContext{ + Object: "JobsStatistics", + Field: field, + IsMethod: false, + IsResolver: false, + Child: func(ctx context.Context, field graphql.CollectedField) (*graphql.FieldContext, error) { + return nil, errors.New("field of type Int does not have child fields") + }, + } + return fc, nil +} + func (ec *executionContext) _JobsStatistics_totalJobs(ctx context.Context, field graphql.CollectedField, obj *model.JobsStatistics) (ret graphql.Marshaler) { fc, err := ec.fieldContext_JobsStatistics_totalJobs(ctx, field) if err != nil { @@ -12636,6 +12691,8 @@ func (ec *executionContext) fieldContext_Query_jobsStatistics(ctx context.Contex return ec.fieldContext_JobsStatistics_id(ctx, field) case "name": return 
ec.fieldContext_JobsStatistics_name(ctx, field) + case "totalUsers": + return ec.fieldContext_JobsStatistics_totalUsers(ctx, field) case "totalJobs": return ec.fieldContext_JobsStatistics_totalJobs(ctx, field) case "runningJobs": @@ -19240,6 +19297,11 @@ func (ec *executionContext) _JobsStatistics(ctx context.Context, sel ast.Selecti if out.Values[i] == graphql.Null { out.Invalids++ } + case "totalUsers": + out.Values[i] = ec._JobsStatistics_totalUsers(ctx, field, obj) + if out.Values[i] == graphql.Null { + out.Invalids++ + } case "totalJobs": out.Values[i] = ec._JobsStatistics_totalJobs(ctx, field, obj) if out.Values[i] == graphql.Null { diff --git a/internal/graph/model/models_gen.go b/internal/graph/model/models_gen.go index e6619b7..a5fe2a2 100644 --- a/internal/graph/model/models_gen.go +++ b/internal/graph/model/models_gen.go @@ -112,6 +112,7 @@ type JobStats struct { type JobsStatistics struct { ID string `json:"id"` Name string `json:"name"` + TotalUsers int `json:"totalUsers"` TotalJobs int `json:"totalJobs"` RunningJobs int `json:"runningJobs"` ShortJobs int `json:"shortJobs"` @@ -247,20 +248,22 @@ type User struct { type Aggregate string const ( - AggregateUser Aggregate = "USER" - AggregateProject Aggregate = "PROJECT" - AggregateCluster Aggregate = "CLUSTER" + AggregateUser Aggregate = "USER" + AggregateProject Aggregate = "PROJECT" + AggregateCluster Aggregate = "CLUSTER" + AggregateSubcluster Aggregate = "SUBCLUSTER" ) var AllAggregate = []Aggregate{ AggregateUser, AggregateProject, AggregateCluster, + AggregateSubcluster, } func (e Aggregate) IsValid() bool { switch e { - case AggregateUser, AggregateProject, AggregateCluster: + case AggregateUser, AggregateProject, AggregateCluster, AggregateSubcluster: return true } return false @@ -292,6 +295,7 @@ type SortByAggregate string const ( SortByAggregateTotalwalltime SortByAggregate = "TOTALWALLTIME" SortByAggregateTotaljobs SortByAggregate = "TOTALJOBS" + SortByAggregateTotalusers SortByAggregate = 
"TOTALUSERS" SortByAggregateTotalnodes SortByAggregate = "TOTALNODES" SortByAggregateTotalnodehours SortByAggregate = "TOTALNODEHOURS" SortByAggregateTotalcores SortByAggregate = "TOTALCORES" @@ -303,6 +307,7 @@ const ( var AllSortByAggregate = []SortByAggregate{ SortByAggregateTotalwalltime, SortByAggregateTotaljobs, + SortByAggregateTotalusers, SortByAggregateTotalnodes, SortByAggregateTotalnodehours, SortByAggregateTotalcores, @@ -313,7 +318,7 @@ var AllSortByAggregate = []SortByAggregate{ func (e SortByAggregate) IsValid() bool { switch e { - case SortByAggregateTotalwalltime, SortByAggregateTotaljobs, SortByAggregateTotalnodes, SortByAggregateTotalnodehours, SortByAggregateTotalcores, SortByAggregateTotalcorehours, SortByAggregateTotalaccs, SortByAggregateTotalacchours: + case SortByAggregateTotalwalltime, SortByAggregateTotaljobs, SortByAggregateTotalusers, SortByAggregateTotalnodes, SortByAggregateTotalnodehours, SortByAggregateTotalcores, SortByAggregateTotalcorehours, SortByAggregateTotalaccs, SortByAggregateTotalacchours: return true } return false diff --git a/internal/graph/schema.resolvers.go b/internal/graph/schema.resolvers.go index e0a7948..b993ebb 100644 --- a/internal/graph/schema.resolvers.go +++ b/internal/graph/schema.resolvers.go @@ -581,7 +581,7 @@ func (r *queryResolver) JobsStatistics(ctx context.Context, filter []*model.JobF defaultDurationBins := "1h" defaultMetricBins := 10 - if requireField(ctx, "totalJobs") || requireField(ctx, "totalWalltime") || requireField(ctx, "totalNodes") || requireField(ctx, "totalCores") || + if requireField(ctx, "totalJobs") || requireField(ctx, "totalUsers") || requireField(ctx, "totalWalltime") || requireField(ctx, "totalNodes") || requireField(ctx, "totalCores") || requireField(ctx, "totalAccs") || requireField(ctx, "totalNodeHours") || requireField(ctx, "totalCoreHours") || requireField(ctx, "totalAccHours") { if groupBy == nil { stats, err = r.Repo.JobsStats(ctx, filter) diff --git 
a/internal/repository/stats.go b/internal/repository/stats.go index 7beb674..1aa3c55 100644 --- a/internal/repository/stats.go +++ b/internal/repository/stats.go @@ -21,13 +21,15 @@ import ( // GraphQL validation should make sure that no unkown values can be specified. var groupBy2column = map[model.Aggregate]string{ - model.AggregateUser: "job.hpc_user", - model.AggregateProject: "job.project", - model.AggregateCluster: "job.cluster", + model.AggregateUser: "job.hpc_user", + model.AggregateProject: "job.project", + model.AggregateCluster: "job.cluster", + model.AggregateSubcluster: "job.subcluster", } var sortBy2column = map[model.SortByAggregate]string{ model.SortByAggregateTotaljobs: "totalJobs", + model.SortByAggregateTotalusers: "totalUsers", model.SortByAggregateTotalwalltime: "totalWalltime", model.SortByAggregateTotalnodes: "totalNodes", model.SortByAggregateTotalnodehours: "totalNodeHours", @@ -76,8 +78,12 @@ func (r *JobRepository) buildStatsQuery( // fmt.Sprintf(`CAST(ROUND((CASE WHEN job.job_state = "running" THEN %d - job.start_time ELSE job.duration END) / 3600) as %s) as value`, time.Now().Unix(), castType) if col != "" { - // Scan columns: id, totalJobs, name, totalWalltime, totalNodes, totalNodeHours, totalCores, totalCoreHours, totalAccs, totalAccHours - query = sq.Select(col, "COUNT(job.id) as totalJobs", "name", + // Scan columns: id, name, totalJobs, totalUsers, totalWalltime, totalNodes, totalNodeHours, totalCores, totalCoreHours, totalAccs, totalAccHours + query = sq.Select( + col, + "name", + "COUNT(job.id) as totalJobs", + "COUNT(DISTINCT job.hpc_user) AS totalUsers", fmt.Sprintf(`CAST(ROUND(SUM((CASE WHEN job.job_state = "running" THEN %d - job.start_time ELSE job.duration END)) / 3600) as %s) as totalWalltime`, time.Now().Unix(), castType), fmt.Sprintf(`CAST(SUM(job.num_nodes) as %s) as totalNodes`, castType), fmt.Sprintf(`CAST(ROUND(SUM((CASE WHEN job.job_state = "running" THEN %d - job.start_time ELSE job.duration END) * job.num_nodes) 
/ 3600) as %s) as totalNodeHours`, time.Now().Unix(), castType), @@ -87,8 +93,10 @@ func (r *JobRepository) buildStatsQuery( fmt.Sprintf(`CAST(ROUND(SUM((CASE WHEN job.job_state = "running" THEN %d - job.start_time ELSE job.duration END) * job.num_acc) / 3600) as %s) as totalAccHours`, time.Now().Unix(), castType), ).From("job").LeftJoin("hpc_user ON hpc_user.username = job.hpc_user").GroupBy(col) } else { - // Scan columns: totalJobs, name, totalWalltime, totalNodes, totalNodeHours, totalCores, totalCoreHours, totalAccs, totalAccHours - query = sq.Select("COUNT(job.id)", + // Scan columns: totalJobs, totalUsers, totalWalltime, totalNodes, totalNodeHours, totalCores, totalCoreHours, totalAccs, totalAccHours + query = sq.Select( + "COUNT(job.id) as totalJobs", + "COUNT(DISTINCT job.hpc_user) AS totalUsers", fmt.Sprintf(`CAST(ROUND(SUM((CASE WHEN job.job_state = "running" THEN %d - job.start_time ELSE job.duration END)) / 3600) as %s)`, time.Now().Unix(), castType), fmt.Sprintf(`CAST(SUM(job.num_nodes) as %s)`, castType), fmt.Sprintf(`CAST(ROUND(SUM((CASE WHEN job.job_state = "running" THEN %d - job.start_time ELSE job.duration END) * job.num_nodes) / 3600) as %s)`, time.Now().Unix(), castType), @@ -167,14 +175,14 @@ func (r *JobRepository) JobsStatsGrouped( for rows.Next() { var id sql.NullString var name sql.NullString - var jobs, walltime, nodes, nodeHours, cores, coreHours, accs, accHours sql.NullInt64 - if err := rows.Scan(&id, &jobs, &name, &walltime, &nodes, &nodeHours, &cores, &coreHours, &accs, &accHours); err != nil { + var jobs, users, walltime, nodes, nodeHours, cores, coreHours, accs, accHours sql.NullInt64 + if err := rows.Scan(&id, &name, &jobs, &users, &walltime, &nodes, &nodeHours, &cores, &coreHours, &accs, &accHours); err != nil { cclog.Warn("Error while scanning rows") return nil, err } if id.Valid { - var totalJobs, totalWalltime, totalNodes, totalNodeHours, totalCores, totalCoreHours, totalAccs, totalAccHours int + var totalJobs, totalUsers, 
totalWalltime, totalNodes, totalNodeHours, totalCores, totalCoreHours, totalAccs, totalAccHours int var personName string if name.Valid { @@ -185,6 +193,10 @@ func (r *JobRepository) JobsStatsGrouped( totalJobs = int(jobs.Int64) } + if users.Valid { + totalUsers = int(users.Int64) + } + if walltime.Valid { totalWalltime = int(walltime.Int64) } @@ -228,8 +240,9 @@ func (r *JobRepository) JobsStatsGrouped( stats = append(stats, &model.JobsStatistics{ ID: id.String, - TotalJobs: int(jobs.Int64), - TotalWalltime: int(walltime.Int64), + TotalJobs: totalJobs, + TotalUsers: totalUsers, + TotalWalltime: totalWalltime, TotalNodes: totalNodes, TotalNodeHours: totalNodeHours, TotalCores: totalCores, @@ -259,8 +272,8 @@ func (r *JobRepository) JobsStats( row := query.RunWith(r.DB).QueryRow() stats := make([]*model.JobsStatistics, 0, 1) - var jobs, walltime, nodes, nodeHours, cores, coreHours, accs, accHours sql.NullInt64 - if err := row.Scan(&jobs, &walltime, &nodes, &nodeHours, &cores, &coreHours, &accs, &accHours); err != nil { + var jobs, users, walltime, nodes, nodeHours, cores, coreHours, accs, accHours sql.NullInt64 + if err := row.Scan(&jobs, &users, &walltime, &nodes, &nodeHours, &cores, &coreHours, &accs, &accHours); err != nil { cclog.Warn("Error while scanning rows") return nil, err } @@ -280,6 +293,7 @@ func (r *JobRepository) JobsStats( stats = append(stats, &model.JobsStatistics{ TotalJobs: int(jobs.Int64), + TotalUsers: int(users.Int64), TotalWalltime: int(walltime.Int64), TotalNodeHours: totalNodeHours, TotalCoreHours: totalCoreHours, diff --git a/web/frontend/src/status/StatusDash.svelte b/web/frontend/src/status/StatusDash.svelte index f98c1c3..a1196e5 100644 --- a/web/frontend/src/status/StatusDash.svelte +++ b/web/frontend/src/status/StatusDash.svelte @@ -45,12 +45,17 @@ let plotWidths = $state([]); // Bar Gauges let allocatedNodes = $state({}); + let allocatedAccs = $state({}); let flopRate = $state({}); let flopRateUnitPrefix = $state({}); let 
flopRateUnitBase = $state({}); let memBwRate = $state({}); let memBwRateUnitPrefix = $state({}); let memBwRateUnitBase = $state({}); + // Plain Infos + let runningJobs = $state({}); + let activeUsers = $state({}); + let totalAccs = $state({}); /* Derived */ // Note: nodeMetrics are requested on configured $timestep resolution @@ -63,6 +68,8 @@ $metrics: [String!] $from: Time! $to: Time! + $filter: [JobFilter!]! + $paging: PageRequest! ) { nodeMetrics( cluster: $cluster @@ -87,11 +94,23 @@ } } } - + # Only counts shared nodes once allocatedNodes(cluster: $cluster) { name count } + # totalNodes includes multiples if shared jobs + jobsStatistics( + filter: $filter + page: $paging + sortBy: TOTALJOBS + groupBy: SUBCLUSTER + ) { + id + totalJobs + totalUsers + totalAccs + } } `, variables: { @@ -99,7 +118,8 @@ metrics: ["flops_any", "mem_bw"], // Fixed names for roofline and status bars from: from.toISOString(), to: to.toISOString(), - // filter: [{ state: ["running"] }, { cluster: { eq: cluster } }], + filter: [{ state: ["running"] }, { cluster: { eq: cluster } }], + paging: { itemsPerPage: -1, page: 1 }, // Get all: -1 }, })); @@ -110,10 +130,27 @@ (c) => c.name == cluster, ).subClusters; for (let subCluster of subClusters) { + // Allocations allocatedNodes[subCluster.name] = $statusQuery.data.allocatedNodes.find( ({ name }) => name == subCluster.name, )?.count || 0; + allocatedAccs[subCluster.name] = + $statusQuery.data.jobsStatistics.find( + ({ id }) => id == subCluster.name, + )?.totalAccs || 0; + // Infos + activeUsers[subCluster.name] = + $statusQuery.data.jobsStatistics.find( + ({ id }) => id == subCluster.name, + )?.totalUsers || 0; + runningJobs[subCluster.name] = + $statusQuery.data.jobsStatistics.find( + ({ id }) => id == subCluster.name, + )?.totalJobs || 0; + totalAccs[subCluster.name] = + (subCluster?.numberOfNodes * subCluster?.topology?.accelerators?.length) || null; + // Keymetrics flopRate[subCluster.name] = Math.floor( 
sumUp($statusQuery.data.nodeMetrics, subCluster.name, "flops_any") * @@ -158,9 +195,15 @@ SubCluster "{subCluster.name}" + {subCluster.processorType} + + + + +
+ {#if totalAccs[subCluster.name] !== null} + + + + + + {/if}
{runningJobs[subCluster.name]} Running Jobs{activeUsers[subCluster.name]} Active Users
Allocated Nodes
Allocated Accelerators
+ +
{allocatedAccs[subCluster.name]} / {totalAccs[subCluster.name]} + Accelerators
Flop Rate (Any)