cc-backend/internal/repository/stats.go

610 lines
17 KiB
Go
Raw Normal View History

2023-06-06 10:27:55 +02:00
// Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
package repository
import (
"context"
"database/sql"
"fmt"
"time"
2023-06-08 06:18:19 +02:00
"github.com/ClusterCockpit/cc-backend/internal/config"
2023-06-06 10:27:55 +02:00
"github.com/ClusterCockpit/cc-backend/internal/graph/model"
2023-12-01 13:22:01 +01:00
"github.com/ClusterCockpit/cc-backend/pkg/archive"
2023-06-06 10:27:55 +02:00
"github.com/ClusterCockpit/cc-backend/pkg/log"
2023-12-01 13:22:01 +01:00
"github.com/ClusterCockpit/cc-backend/pkg/schema"
2023-06-06 10:27:55 +02:00
sq "github.com/Masterminds/squirrel"
)
// GraphQL validation should make sure that no unkown values can be specified.
var groupBy2column = map[model.Aggregate]string{
model.AggregateUser: "job.user",
model.AggregateProject: "job.project",
model.AggregateCluster: "job.cluster",
}
var sortBy2column = map[model.SortByAggregate]string{
model.SortByAggregateTotaljobs: "totalJobs",
model.SortByAggregateTotalwalltime: "totalWalltime",
model.SortByAggregateTotalnodes: "totalNodes",
model.SortByAggregateTotalnodehours: "totalNodeHours",
model.SortByAggregateTotalcores: "totalCores",
model.SortByAggregateTotalcorehours: "totalCoreHours",
model.SortByAggregateTotalaccs: "totalAccs",
model.SortByAggregateTotalacchours: "totalAccHours",
}
2023-06-08 06:18:19 +02:00
func (r *JobRepository) buildCountQuery(
filter []*model.JobFilter,
kind string,
col string) sq.SelectBuilder {
var query sq.SelectBuilder
if col != "" {
// Scan columns: id, cnt
query = sq.Select(col, "COUNT(job.id)").From("job").GroupBy(col)
} else {
// Scan columns: cnt
query = sq.Select("COUNT(job.id)").From("job")
}
switch kind {
case "running":
query = query.Where("job.job_state = ?", "running")
case "short":
query = query.Where("job.duration < ?", config.Keys.ShortRunningJobsDuration)
}
for _, f := range filter {
query = BuildWhereClause(f, query)
}
return query
}
func (r *JobRepository) buildStatsQuery(
2023-06-06 10:27:55 +02:00
filter []*model.JobFilter,
2023-06-07 11:58:58 +02:00
col string) sq.SelectBuilder {
2023-06-06 10:27:55 +02:00
2023-06-07 11:58:58 +02:00
var query sq.SelectBuilder
castType := r.getCastType()
// fmt.Sprintf(`CAST(ROUND((CASE WHEN job.job_state = "running" THEN %d - job.start_time ELSE job.duration END) / 3600) as %s) as value`, time.Now().Unix(), castType)
2023-06-07 11:58:58 +02:00
if col != "" {
// Scan columns: id, totalJobs, totalWalltime, totalNodes, totalNodeHours, totalCores, totalCoreHours, totalAccs, totalAccHours
query = sq.Select(col, "COUNT(job.id) as totalJobs",
fmt.Sprintf(`CAST(ROUND(SUM((CASE WHEN job.job_state = "running" THEN %d - job.start_time ELSE job.duration END)) / 3600) as %s) as totalWalltime`, time.Now().Unix(), castType),
fmt.Sprintf(`CAST(SUM(job.num_nodes) as %s) as totalNodes`, castType),
fmt.Sprintf(`CAST(ROUND(SUM((CASE WHEN job.job_state = "running" THEN %d - job.start_time ELSE job.duration END) * job.num_nodes) / 3600) as %s) as totalNodeHours`, time.Now().Unix(), castType),
fmt.Sprintf(`CAST(SUM(job.num_hwthreads) as %s) as totalCores`, castType),
fmt.Sprintf(`CAST(ROUND(SUM((CASE WHEN job.job_state = "running" THEN %d - job.start_time ELSE job.duration END) * job.num_hwthreads) / 3600) as %s) as totalCoreHours`, time.Now().Unix(), castType),
fmt.Sprintf(`CAST(SUM(job.num_acc) as %s) as totalAccs`, castType),
fmt.Sprintf(`CAST(ROUND(SUM((CASE WHEN job.job_state = "running" THEN %d - job.start_time ELSE job.duration END) * job.num_acc) / 3600) as %s) as totalAccHours`, time.Now().Unix(), castType),
2023-06-07 11:58:58 +02:00
).From("job").GroupBy(col)
2023-06-07 11:58:58 +02:00
} else {
// Scan columns: totalJobs, totalWalltime, totalNodes, totalNodeHours, totalCores, totalCoreHours, totalAccs, totalAccHours
2023-06-07 11:58:58 +02:00
query = sq.Select("COUNT(job.id)",
fmt.Sprintf(`CAST(ROUND(SUM((CASE WHEN job.job_state = "running" THEN %d - job.start_time ELSE job.duration END)) / 3600) as %s)`, time.Now().Unix(), castType),
fmt.Sprintf(`CAST(SUM(job.num_nodes) as %s)`, castType),
fmt.Sprintf(`CAST(ROUND(SUM((CASE WHEN job.job_state = "running" THEN %d - job.start_time ELSE job.duration END) * job.num_nodes) / 3600) as %s)`, time.Now().Unix(), castType),
fmt.Sprintf(`CAST(SUM(job.num_hwthreads) as %s)`, castType),
fmt.Sprintf(`CAST(ROUND(SUM((CASE WHEN job.job_state = "running" THEN %d - job.start_time ELSE job.duration END) * job.num_hwthreads) / 3600) as %s)`, time.Now().Unix(), castType),
fmt.Sprintf(`CAST(SUM(job.num_acc) as %s)`, castType),
fmt.Sprintf(`CAST(ROUND(SUM((CASE WHEN job.job_state = "running" THEN %d - job.start_time ELSE job.duration END) * job.num_acc) / 3600) as %s)`, time.Now().Unix(), castType),
2023-06-07 11:58:58 +02:00
).From("job")
}
for _, f := range filter {
query = BuildWhereClause(f, query)
}
return query
}
func (r *JobRepository) getUserName(ctx context.Context, id string) string {
user := GetUserFromContext(ctx)
2023-06-07 11:58:58 +02:00
name, _ := r.FindColumnValue(user, id, "user", "name", "username", false)
if name != "" {
return name
} else {
return "-"
}
}
func (r *JobRepository) getCastType() string {
2023-06-06 10:27:55 +02:00
var castType string
switch r.driver {
case "sqlite3":
castType = "int"
case "mysql":
castType = "unsigned"
2023-06-07 11:58:58 +02:00
default:
castType = ""
2023-06-06 10:27:55 +02:00
}
2023-06-07 11:58:58 +02:00
return castType
}
2023-06-09 09:09:41 +02:00
func (r *JobRepository) JobsStatsGrouped(
2023-06-07 11:58:58 +02:00
ctx context.Context,
filter []*model.JobFilter,
page *model.PageRequest,
sortBy *model.SortByAggregate,
2023-06-07 11:58:58 +02:00
groupBy *model.Aggregate) ([]*model.JobsStatistics, error) {
start := time.Now()
col := groupBy2column[*groupBy]
2023-06-08 06:18:19 +02:00
query := r.buildStatsQuery(filter, col)
2023-06-07 11:58:58 +02:00
query, err := SecurityCheck(ctx, query)
if err != nil {
return nil, err
}
if sortBy != nil {
sortBy := sortBy2column[*sortBy]
query = query.OrderBy(fmt.Sprintf("%s DESC", sortBy))
}
if page != nil && page.ItemsPerPage != -1 {
limit := uint64(page.ItemsPerPage)
query = query.Offset((uint64(page.Page) - 1) * limit).Limit(limit)
}
2023-06-07 11:58:58 +02:00
rows, err := query.RunWith(r.DB).Query()
if err != nil {
log.Warn("Error while querying DB for job statistics")
return nil, err
}
stats := make([]*model.JobsStatistics, 0, 100)
for rows.Next() {
var id sql.NullString
var jobs, walltime, nodes, nodeHours, cores, coreHours, accs, accHours sql.NullInt64
if err := rows.Scan(&id, &jobs, &walltime, &nodes, &nodeHours, &cores, &coreHours, &accs, &accHours); err != nil {
2023-06-07 11:58:58 +02:00
log.Warn("Error while scanning rows")
return nil, err
}
if id.Valid {
var totalJobs, totalWalltime, totalNodes, totalNodeHours, totalCores, totalCoreHours, totalAccs, totalAccHours int
if jobs.Valid {
totalJobs = int(jobs.Int64)
}
if walltime.Valid {
totalWalltime = int(walltime.Int64)
}
if nodes.Valid {
totalNodes = int(nodes.Int64)
}
if cores.Valid {
totalCores = int(cores.Int64)
}
if accs.Valid {
totalAccs = int(accs.Int64)
}
2023-06-09 11:29:07 +02:00
if nodeHours.Valid {
totalNodeHours = int(nodeHours.Int64)
}
2023-06-09 11:29:07 +02:00
if coreHours.Valid {
totalCoreHours = int(coreHours.Int64)
}
if accHours.Valid {
totalAccHours = int(accHours.Int64)
}
2023-06-07 11:58:58 +02:00
if col == "job.user" {
name := r.getUserName(ctx, id.String)
stats = append(stats,
&model.JobsStatistics{
2023-06-09 11:29:07 +02:00
ID: id.String,
Name: name,
TotalJobs: totalJobs,
TotalWalltime: totalWalltime,
TotalNodes: totalNodes,
TotalNodeHours: totalNodeHours,
TotalCores: totalCores,
2023-06-09 11:29:07 +02:00
TotalCoreHours: totalCoreHours,
TotalAccs: totalAccs,
2023-06-09 11:29:07 +02:00
TotalAccHours: totalAccHours})
2023-06-06 10:27:55 +02:00
} else {
2023-06-07 11:58:58 +02:00
stats = append(stats,
&model.JobsStatistics{
2023-06-09 11:29:07 +02:00
ID: id.String,
TotalJobs: int(jobs.Int64),
TotalWalltime: int(walltime.Int64),
TotalNodes: totalNodes,
TotalNodeHours: totalNodeHours,
TotalCores: totalCores,
2023-06-09 11:29:07 +02:00
TotalCoreHours: totalCoreHours,
TotalAccs: totalAccs,
2023-06-09 11:29:07 +02:00
TotalAccHours: totalAccHours})
2023-06-06 10:27:55 +02:00
}
2023-06-07 11:58:58 +02:00
}
}
2023-06-06 10:27:55 +02:00
2023-06-20 15:47:38 +02:00
log.Debugf("Timer JobsStatsGrouped %s", time.Since(start))
2023-06-07 11:58:58 +02:00
return stats, nil
}
func (r *JobRepository) JobsStats(
ctx context.Context,
2023-06-07 11:58:58 +02:00
filter []*model.JobFilter) ([]*model.JobsStatistics, error) {
start := time.Now()
query := r.buildStatsQuery(filter, "")
query, err := SecurityCheck(ctx, query)
if err != nil {
return nil, err
}
2023-06-07 11:58:58 +02:00
row := query.RunWith(r.DB).QueryRow()
stats := make([]*model.JobsStatistics, 0, 1)
2023-06-09 09:09:41 +02:00
var jobs, walltime, nodes, nodeHours, cores, coreHours, accs, accHours sql.NullInt64
if err := row.Scan(&jobs, &walltime, &nodes, &nodeHours, &cores, &coreHours, &accs, &accHours); err != nil {
2023-06-07 11:58:58 +02:00
log.Warn("Error while scanning rows")
return nil, err
}
if jobs.Valid {
var totalNodeHours, totalCoreHours, totalAccHours int
2023-06-09 11:29:07 +02:00
if nodeHours.Valid {
totalNodeHours = int(nodeHours.Int64)
}
2023-06-09 11:29:07 +02:00
if coreHours.Valid {
totalCoreHours = int(coreHours.Int64)
}
if accHours.Valid {
totalAccHours = int(accHours.Int64)
}
2023-06-07 11:58:58 +02:00
stats = append(stats,
&model.JobsStatistics{
2023-06-09 11:29:07 +02:00
TotalJobs: int(jobs.Int64),
TotalWalltime: int(walltime.Int64),
TotalNodeHours: totalNodeHours,
2023-06-09 11:29:07 +02:00
TotalCoreHours: totalCoreHours,
TotalAccHours: totalAccHours})
2023-06-07 11:58:58 +02:00
}
log.Debugf("Timer JobStats %s", time.Since(start))
return stats, nil
}
2023-06-09 09:09:41 +02:00
func (r *JobRepository) JobCountGrouped(
2023-06-07 11:58:58 +02:00
ctx context.Context,
filter []*model.JobFilter,
groupBy *model.Aggregate) ([]*model.JobsStatistics, error) {
start := time.Now()
col := groupBy2column[*groupBy]
2023-06-09 09:09:41 +02:00
query := r.buildCountQuery(filter, "", col)
2023-06-08 06:18:19 +02:00
query, err := SecurityCheck(ctx, query)
if err != nil {
return nil, err
}
rows, err := query.RunWith(r.DB).Query()
if err != nil {
log.Warn("Error while querying DB for job statistics")
return nil, err
}
2023-06-09 09:09:41 +02:00
stats := make([]*model.JobsStatistics, 0, 100)
2023-06-08 06:18:19 +02:00
for rows.Next() {
var id sql.NullString
var cnt sql.NullInt64
if err := rows.Scan(&id, &cnt); err != nil {
log.Warn("Error while scanning rows")
return nil, err
}
if id.Valid {
2023-06-09 09:09:41 +02:00
stats = append(stats,
&model.JobsStatistics{
ID: id.String,
TotalJobs: int(cnt.Int64)})
2023-06-08 06:18:19 +02:00
}
}
2023-06-20 15:47:38 +02:00
log.Debugf("Timer JobCountGrouped %s", time.Since(start))
2023-06-09 09:09:41 +02:00
return stats, nil
}
func (r *JobRepository) AddJobCountGrouped(
ctx context.Context,
filter []*model.JobFilter,
groupBy *model.Aggregate,
stats []*model.JobsStatistics,
kind string) ([]*model.JobsStatistics, error) {
start := time.Now()
col := groupBy2column[*groupBy]
query := r.buildCountQuery(filter, kind, col)
query, err := SecurityCheck(ctx, query)
2023-06-08 06:18:19 +02:00
if err != nil {
return nil, err
}
2023-06-09 09:09:41 +02:00
rows, err := query.RunWith(r.DB).Query()
2023-06-08 06:18:19 +02:00
if err != nil {
log.Warn("Error while querying DB for job statistics")
return nil, err
}
2023-06-09 09:09:41 +02:00
counts := make(map[string]int)
2023-06-08 06:18:19 +02:00
for rows.Next() {
var id sql.NullString
var cnt sql.NullInt64
if err := rows.Scan(&id, &cnt); err != nil {
log.Warn("Error while scanning rows")
return nil, err
}
if id.Valid {
2023-06-09 09:09:41 +02:00
counts[id.String] = int(cnt.Int64)
2023-06-08 06:18:19 +02:00
}
}
2023-06-09 09:09:41 +02:00
switch kind {
case "running":
for _, s := range stats {
s.RunningJobs = counts[s.ID]
}
case "short":
for _, s := range stats {
s.ShortJobs = counts[s.ID]
}
2023-06-08 06:18:19 +02:00
}
2023-06-09 09:09:41 +02:00
2023-06-20 15:47:38 +02:00
log.Debugf("Timer AddJobCountGrouped %s", time.Since(start))
2023-06-08 06:18:19 +02:00
return stats, nil
}
2023-06-09 13:15:25 +02:00
func (r *JobRepository) AddJobCount(
ctx context.Context,
filter []*model.JobFilter,
stats []*model.JobsStatistics,
kind string) ([]*model.JobsStatistics, error) {
start := time.Now()
query := r.buildCountQuery(filter, kind, "")
query, err := SecurityCheck(ctx, query)
if err != nil {
return nil, err
}
rows, err := query.RunWith(r.DB).Query()
if err != nil {
log.Warn("Error while querying DB for job statistics")
return nil, err
}
2023-08-24 14:26:23 +02:00
var count int
2023-06-09 13:15:25 +02:00
for rows.Next() {
var cnt sql.NullInt64
if err := rows.Scan(&cnt); err != nil {
log.Warn("Error while scanning rows")
return nil, err
}
2023-08-24 14:26:23 +02:00
count = int(cnt.Int64)
2023-06-09 13:15:25 +02:00
}
switch kind {
case "running":
for _, s := range stats {
2023-08-24 14:26:23 +02:00
s.RunningJobs = count
2023-06-09 13:15:25 +02:00
}
case "short":
for _, s := range stats {
2023-08-24 14:26:23 +02:00
s.ShortJobs = count
2023-06-09 13:15:25 +02:00
}
}
2023-08-24 14:26:23 +02:00
log.Debugf("Timer AddJobCount %s", time.Since(start))
2023-06-09 13:15:25 +02:00
return stats, nil
}
2023-06-07 11:58:58 +02:00
func (r *JobRepository) AddHistograms(
ctx context.Context,
filter []*model.JobFilter,
stat *model.JobsStatistics) (*model.JobsStatistics, error) {
2023-06-09 15:02:22 +02:00
start := time.Now()
2023-06-07 11:58:58 +02:00
castType := r.getCastType()
var err error
value := fmt.Sprintf(`CAST(ROUND((CASE WHEN job.job_state = "running" THEN %d - job.start_time ELSE job.duration END) / 3600) as %s) as value`, time.Now().Unix(), castType)
stat.HistDuration, err = r.jobsStatisticsHistogram(ctx, value, filter)
if err != nil {
log.Warn("Error while loading job statistics histogram: running jobs")
return nil, err
}
stat.HistNumNodes, err = r.jobsStatisticsHistogram(ctx, "job.num_nodes as value", filter)
if err != nil {
log.Warn("Error while loading job statistics histogram: num nodes")
return nil, err
}
stat.HistNumCores, err = r.jobsStatisticsHistogram(ctx, "job.num_hwthreads as value", filter)
if err != nil {
log.Warn("Error while loading job statistics histogram: num hwthreads")
return nil, err
}
stat.HistNumAccs, err = r.jobsStatisticsHistogram(ctx, "job.num_acc as value", filter)
if err != nil {
log.Warn("Error while loading job statistics histogram: num acc")
2023-06-07 11:58:58 +02:00
return nil, err
}
2023-06-20 15:47:38 +02:00
log.Debugf("Timer AddHistograms %s", time.Since(start))
2023-06-07 11:58:58 +02:00
return stat, nil
}
2023-12-01 13:22:01 +01:00
// Requires thresholds for metric from config for cluster? Of all clusters and use largest? split to 10 + 1 for artifacts?
func (r *JobRepository) AddMetricHistograms(
ctx context.Context,
filter []*model.JobFilter,
metrics []string,
stat *model.JobsStatistics) (*model.JobsStatistics, error) {
start := time.Now()
for i, m := range metrics {
// DEBUG
fmt.Println(i, m)
var err error
var metricHisto *model.MetricHistoPoints
metricHisto, err = r.jobsMetricStatisticsHistogram(ctx, m, filter)
if err != nil {
log.Warnf("Error while loading job metric statistics histogram: %s", m)
continue
}
stat.HistMetrics = append(stat.HistMetrics, metricHisto)
}
log.Debugf("Timer AddMetricHistograms %s", time.Since(start))
return stat, nil
}
2023-06-07 11:58:58 +02:00
// `value` must be the column grouped by, but renamed to "value"
func (r *JobRepository) jobsStatisticsHistogram(
ctx context.Context,
value string,
filters []*model.JobFilter) ([]*model.HistoPoint, error) {
2023-06-06 10:27:55 +02:00
start := time.Now()
2023-06-07 11:58:58 +02:00
query, qerr := SecurityCheck(ctx,
sq.Select(value, "COUNT(job.id) AS count").From("job"))
2023-06-06 10:27:55 +02:00
if qerr != nil {
return nil, qerr
}
for _, f := range filters {
query = BuildWhereClause(f, query)
}
rows, err := query.GroupBy("value").RunWith(r.DB).Query()
if err != nil {
log.Error("Error while running query")
return nil, err
}
points := make([]*model.HistoPoint, 0)
for rows.Next() {
point := model.HistoPoint{}
if err := rows.Scan(&point.Value, &point.Count); err != nil {
log.Warn("Error while scanning rows")
return nil, err
}
points = append(points, &point)
}
2023-06-20 15:47:38 +02:00
log.Debugf("Timer jobsStatisticsHistogram %s", time.Since(start))
2023-06-06 10:27:55 +02:00
return points, nil
}
2023-12-01 13:22:01 +01:00
func (r *JobRepository) jobsMetricStatisticsHistogram(
ctx context.Context,
metric string,
filters []*model.JobFilter) (*model.MetricHistoPoints, error) {
var dbMetric string
switch metric {
case "cpu_load":
dbMetric = "load_avg"
case "flops_any":
dbMetric = "flops_any_avg"
case "mem_bw":
dbMetric = "mem_bw_avg"
default:
return nil, fmt.Errorf("%s not implemented", metric)
}
2023-12-01 13:22:01 +01:00
// Get specific Peak or largest Peak
var metricConfig *schema.MetricConfig
var peak float64 = 0.0
2023-12-01 13:22:01 +01:00
for _, f := range filters {
if f.Cluster != nil {
metricConfig = archive.GetMetricConfig(*f.Cluster.Eq, metric)
peak = metricConfig.Peak
log.Debugf("Cluster %s filter found with peak %f for %s", *f.Cluster.Eq, peak, metric)
}
}
if peak == 0.0 {
for _, c := range archive.Clusters {
for _, m := range c.MetricConfig {
if m.Name == metric {
if m.Peak > peak {
peak = m.Peak
2023-12-01 13:22:01 +01:00
}
}
}
}
}
// Make bins, see https://jereze.com/code/sql-histogram/
// Diffs:
// CAST(X AS INTEGER) instead of floor(X), used also for for Min , Max selection
// renamed to bin for simplicity and model struct
// Ditched rename from job to data, as it conflicts with security check afterwards
2023-12-01 13:22:01 +01:00
start := time.Now()
prepQuery := sq.Select(
fmt.Sprintf(`CAST(min(job.%s) as INTEGER) as min`, dbMetric),
fmt.Sprintf(`CAST(max(job.%s) as INTEGER) as max`, dbMetric),
fmt.Sprintf(`count(job.%s) as count`, dbMetric),
fmt.Sprintf(`CAST((case when job.%s = value.max then value.max*0.999999999 else job.%s end - value.min) / (value.max - value.min) * 10 as INTEGER) +1 as bin`, dbMetric, dbMetric))
prepQuery = prepQuery.From("job")
prepQuery = prepQuery.CrossJoin(fmt.Sprintf(`(select max(%s) as max, min(%s) as min from job where %s is not null and %s < %f) as value`, dbMetric, dbMetric, dbMetric, dbMetric, peak))
prepQuery = prepQuery.Where(fmt.Sprintf(`job.%s is not null and job.%s < %f`, dbMetric, dbMetric, peak))
query, qerr := SecurityCheck(ctx, prepQuery)
2023-12-01 13:22:01 +01:00
if qerr != nil {
return nil, qerr
}
for _, f := range filters {
query = BuildWhereClause(f, query)
}
// Finalize query with Grouping and Ordering
query = query.GroupBy("bin").OrderBy("bin")
rows, err := query.RunWith(r.DB).Query()
2023-12-01 13:22:01 +01:00
if err != nil {
log.Errorf("Error while running query: %s", err)
2023-12-01 13:22:01 +01:00
return nil, err
}
points := make([]*model.MetricHistoPoint, 0)
2023-12-01 13:22:01 +01:00
for rows.Next() {
point := model.MetricHistoPoint{}
if err := rows.Scan(&point.Min, &point.Max, &point.Count, &point.Bin); err != nil {
2023-12-01 13:22:01 +01:00
log.Warn("Error while scanning rows")
return nil, err
}
points = append(points, &point)
}
result := model.MetricHistoPoints{Metric: metric, Data: points}
2023-12-01 13:22:01 +01:00
log.Debugf("Timer jobsStatisticsHistogram %s", time.Since(start))
return &result, nil
2023-12-01 13:22:01 +01:00
}