2024-04-11 23:04:30 +02:00
|
|
|
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
2022-07-29 06:29:21 +02:00
|
|
|
// All rights reserved.
|
|
|
|
// Use of this source code is governed by a MIT-style
|
|
|
|
// license that can be found in the LICENSE file.
|
2021-10-26 10:24:43 +02:00
|
|
|
package graph
|
|
|
|
|
|
|
|
import (
|
|
|
|
"context"
|
|
|
|
"fmt"
|
|
|
|
"math"
|
|
|
|
|
2023-06-09 09:09:41 +02:00
|
|
|
"github.com/99designs/gqlgen/graphql"
|
2022-06-21 17:52:36 +02:00
|
|
|
"github.com/ClusterCockpit/cc-backend/internal/graph/model"
|
|
|
|
"github.com/ClusterCockpit/cc-backend/internal/metricdata"
|
2024-08-22 14:29:51 +02:00
|
|
|
"github.com/ClusterCockpit/cc-backend/pkg/archive"
|
2023-01-31 18:28:44 +01:00
|
|
|
"github.com/ClusterCockpit/cc-backend/pkg/log"
|
2023-02-13 13:53:24 +01:00
|
|
|
"github.com/ClusterCockpit/cc-backend/pkg/schema"
|
2023-08-24 12:56:35 +02:00
|
|
|
// "github.com/ClusterCockpit/cc-backend/pkg/archive"
|
2021-10-26 10:24:43 +02:00
|
|
|
)
|
|
|
|
|
2022-02-17 09:04:57 +01:00
|
|
|
// MAX_JOBS_FOR_ANALYSIS is the upper bound on the number of jobs the analysis
// helpers in this file (rooflineHeatmap, jobsFootprints) will process. Both
// helpers query one job more than this limit so they can detect that the
// filter matched too many jobs and return an error instead of a partial result.
const MAX_JOBS_FOR_ANALYSIS = 500
|
|
|
|
|
2021-10-26 10:24:43 +02:00
|
|
|
// Helper function for the rooflineHeatmap GraphQL query placed here so that schema.resolvers.go is not too full.
|
2022-09-07 12:24:45 +02:00
|
|
|
func (r *queryResolver) rooflineHeatmap(
|
|
|
|
ctx context.Context,
|
|
|
|
filter []*model.JobFilter,
|
|
|
|
rows int, cols int,
|
2022-09-12 13:33:01 +02:00
|
|
|
minX float64, minY float64, maxX float64, maxY float64) ([][]float64, error) {
|
2022-09-07 12:24:45 +02:00
|
|
|
|
2022-02-17 09:04:57 +01:00
|
|
|
jobs, err := r.Repo.QueryJobs(ctx, filter, &model.PageRequest{Page: 1, ItemsPerPage: MAX_JOBS_FOR_ANALYSIS + 1}, nil)
|
2021-10-26 10:24:43 +02:00
|
|
|
if err != nil {
|
2023-01-31 18:28:44 +01:00
|
|
|
log.Error("Error while querying jobs for roofline")
|
2021-10-26 10:24:43 +02:00
|
|
|
return nil, err
|
|
|
|
}
|
2022-02-17 09:04:57 +01:00
|
|
|
if len(jobs) > MAX_JOBS_FOR_ANALYSIS {
|
2023-08-31 15:10:57 +02:00
|
|
|
return nil, fmt.Errorf("GRAPH/UTIL > too many jobs matched (max: %d)", MAX_JOBS_FOR_ANALYSIS)
|
2021-10-26 10:24:43 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
fcols, frows := float64(cols), float64(rows)
|
|
|
|
minX, minY, maxX, maxY = math.Log10(minX), math.Log10(minY), math.Log10(maxX), math.Log10(maxY)
|
2022-09-12 13:33:01 +02:00
|
|
|
tiles := make([][]float64, rows)
|
2021-10-26 10:24:43 +02:00
|
|
|
for i := range tiles {
|
2022-09-12 13:33:01 +02:00
|
|
|
tiles[i] = make([]float64, cols)
|
2021-10-26 10:24:43 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
for _, job := range jobs {
|
2022-03-30 09:39:13 +02:00
|
|
|
if job.MonitoringStatus == schema.MonitoringStatusDisabled || job.MonitoringStatus == schema.MonitoringStatusArchivingFailed {
|
|
|
|
continue
|
|
|
|
}
|
|
|
|
|
2024-08-22 14:29:51 +02:00
|
|
|
metricConfigs := archive.GetCluster(job.Cluster).MetricConfig
|
|
|
|
resolution := 0
|
|
|
|
|
|
|
|
for _, mc := range metricConfigs {
|
|
|
|
resolution = max(resolution, mc.Timestep)
|
|
|
|
}
|
|
|
|
|
|
|
|
jobdata, err := metricdata.LoadData(job, []string{"flops_any", "mem_bw"}, []schema.MetricScope{schema.MetricScopeNode}, ctx, resolution)
|
2021-10-26 10:24:43 +02:00
|
|
|
if err != nil {
|
2023-08-31 15:10:57 +02:00
|
|
|
log.Errorf("Error while loading roofline metrics for job %d", job.ID)
|
2021-10-26 10:24:43 +02:00
|
|
|
return nil, err
|
|
|
|
}
|
|
|
|
|
2021-12-17 15:49:22 +01:00
|
|
|
flops_, membw_ := jobdata["flops_any"], jobdata["mem_bw"]
|
|
|
|
if flops_ == nil && membw_ == nil {
|
2023-08-31 15:10:57 +02:00
|
|
|
log.Infof("rooflineHeatmap(): 'flops_any' or 'mem_bw' missing for job %d", job.ID)
|
|
|
|
continue
|
|
|
|
// return nil, fmt.Errorf("GRAPH/UTIL > 'flops_any' or 'mem_bw' missing for job %d", job.ID)
|
2021-12-17 15:49:22 +01:00
|
|
|
}
|
|
|
|
|
|
|
|
flops, ok1 := flops_["node"]
|
|
|
|
membw, ok2 := membw_["node"]
|
|
|
|
if !ok1 || !ok2 {
|
2023-08-31 15:10:57 +02:00
|
|
|
log.Info("rooflineHeatmap() query not implemented for where flops_any or mem_bw not available at 'node' level")
|
|
|
|
continue
|
2021-12-17 15:49:22 +01:00
|
|
|
// TODO/FIXME:
|
2023-08-31 15:10:57 +02:00
|
|
|
// return nil, errors.New("GRAPH/UTIL > todo: rooflineHeatmap() query not implemented for where flops_any or mem_bw not available at 'node' level")
|
2021-10-26 10:24:43 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
for n := 0; n < len(flops.Series); n++ {
|
|
|
|
flopsSeries, membwSeries := flops.Series[n], membw.Series[n]
|
|
|
|
for i := 0; i < len(flopsSeries.Data); i++ {
|
|
|
|
if i >= len(membwSeries.Data) {
|
|
|
|
break
|
|
|
|
}
|
|
|
|
|
|
|
|
x, y := math.Log10(float64(flopsSeries.Data[i]/membwSeries.Data[i])), math.Log10(float64(flopsSeries.Data[i]))
|
|
|
|
if math.IsNaN(x) || math.IsNaN(y) || x < minX || x >= maxX || y < minY || y > maxY {
|
|
|
|
continue
|
|
|
|
}
|
|
|
|
|
|
|
|
x, y = math.Floor(((x-minX)/(maxX-minX))*fcols), math.Floor(((y-minY)/(maxY-minY))*frows)
|
|
|
|
if x < 0 || x >= fcols || y < 0 || y >= frows {
|
|
|
|
continue
|
|
|
|
}
|
|
|
|
|
|
|
|
tiles[int(y)][int(x)] += 1
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
return tiles, nil
|
|
|
|
}
|
|
|
|
|
|
|
|
// Helper function for the jobsFootprints GraphQL query placed here so that schema.resolvers.go is not too full.
|
2022-03-16 16:11:28 +01:00
|
|
|
func (r *queryResolver) jobsFootprints(ctx context.Context, filter []*model.JobFilter, metrics []string) (*model.Footprints, error) {
|
2022-02-17 09:04:57 +01:00
|
|
|
jobs, err := r.Repo.QueryJobs(ctx, filter, &model.PageRequest{Page: 1, ItemsPerPage: MAX_JOBS_FOR_ANALYSIS + 1}, nil)
|
2021-10-26 10:24:43 +02:00
|
|
|
if err != nil {
|
2023-01-31 18:28:44 +01:00
|
|
|
log.Error("Error while querying jobs for footprint")
|
2021-10-26 10:24:43 +02:00
|
|
|
return nil, err
|
|
|
|
}
|
2022-02-17 09:04:57 +01:00
|
|
|
if len(jobs) > MAX_JOBS_FOR_ANALYSIS {
|
2023-08-31 15:10:57 +02:00
|
|
|
return nil, fmt.Errorf("GRAPH/UTIL > too many jobs matched (max: %d)", MAX_JOBS_FOR_ANALYSIS)
|
2021-10-26 10:24:43 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
avgs := make([][]schema.Float, len(metrics))
|
|
|
|
for i := range avgs {
|
|
|
|
avgs[i] = make([]schema.Float, 0, len(jobs))
|
|
|
|
}
|
|
|
|
|
2023-08-24 11:52:36 +02:00
|
|
|
timeweights := new(model.TimeWeights)
|
|
|
|
timeweights.NodeHours = make([]schema.Float, 0, len(jobs))
|
|
|
|
timeweights.AccHours = make([]schema.Float, 0, len(jobs))
|
|
|
|
timeweights.CoreHours = make([]schema.Float, 0, len(jobs))
|
|
|
|
|
2021-10-26 10:24:43 +02:00
|
|
|
for _, job := range jobs {
|
2022-03-30 09:39:13 +02:00
|
|
|
if job.MonitoringStatus == schema.MonitoringStatusDisabled || job.MonitoringStatus == schema.MonitoringStatusArchivingFailed {
|
|
|
|
continue
|
|
|
|
}
|
|
|
|
|
2021-10-26 10:24:43 +02:00
|
|
|
if err := metricdata.LoadAverages(job, metrics, avgs, ctx); err != nil {
|
2023-01-31 18:28:44 +01:00
|
|
|
log.Error("Error while loading averages for footprint")
|
2021-10-26 10:24:43 +02:00
|
|
|
return nil, err
|
|
|
|
}
|
2022-03-16 16:11:28 +01:00
|
|
|
|
2023-07-21 16:33:53 +02:00
|
|
|
// #166 collect arrays: Null values or no null values?
|
2023-08-24 11:52:36 +02:00
|
|
|
timeweights.NodeHours = append(timeweights.NodeHours, schema.Float(float64(job.Duration)/60.0*float64(job.NumNodes)))
|
2023-07-21 16:33:53 +02:00
|
|
|
if job.NumAcc > 0 {
|
2023-08-24 11:52:36 +02:00
|
|
|
timeweights.AccHours = append(timeweights.AccHours, schema.Float(float64(job.Duration)/60.0*float64(job.NumAcc)))
|
2023-07-21 16:33:53 +02:00
|
|
|
} else {
|
2023-08-24 11:52:36 +02:00
|
|
|
timeweights.AccHours = append(timeweights.AccHours, schema.Float(1.0))
|
2023-07-21 16:33:53 +02:00
|
|
|
}
|
|
|
|
if job.NumHWThreads > 0 {
|
2023-08-24 12:56:35 +02:00
|
|
|
timeweights.CoreHours = append(timeweights.CoreHours, schema.Float(float64(job.Duration)/60.0*float64(job.NumHWThreads))) // SQLite HWThreads == Cores; numCoresForJob(job)
|
2023-07-21 16:33:53 +02:00
|
|
|
} else {
|
2023-08-24 11:52:36 +02:00
|
|
|
timeweights.CoreHours = append(timeweights.CoreHours, schema.Float(1.0))
|
2023-07-21 16:33:53 +02:00
|
|
|
}
|
2021-10-26 10:24:43 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
res := make([]*model.MetricFootprints, len(avgs))
|
|
|
|
for i, arr := range avgs {
|
|
|
|
res[i] = &model.MetricFootprints{
|
2022-03-16 16:11:28 +01:00
|
|
|
Metric: metrics[i],
|
|
|
|
Data: arr,
|
2021-10-26 10:24:43 +02:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2022-03-16 16:11:28 +01:00
|
|
|
return &model.Footprints{
|
2023-08-24 11:52:36 +02:00
|
|
|
TimeWeights: timeweights,
|
2023-07-21 16:33:53 +02:00
|
|
|
Metrics: res,
|
2022-03-16 16:11:28 +01:00
|
|
|
}, nil
|
2021-10-26 10:24:43 +02:00
|
|
|
}
|
2023-06-09 09:09:41 +02:00
|
|
|
|
2023-08-24 12:56:35 +02:00
|
|
|
// func numCoresForJob(job *schema.Job) (numCores int) {
|
2023-08-24 11:52:36 +02:00
|
|
|
|
2023-08-24 12:56:35 +02:00
|
|
|
// subcluster, scerr := archive.GetSubCluster(job.Cluster, job.SubCluster)
|
|
|
|
// if scerr != nil {
|
|
|
|
// return 1
|
|
|
|
// }
|
2023-08-24 11:52:36 +02:00
|
|
|
|
2023-08-24 12:56:35 +02:00
|
|
|
// totalJobCores := 0
|
|
|
|
// topology := subcluster.Topology
|
2023-08-24 11:52:36 +02:00
|
|
|
|
2023-08-24 12:56:35 +02:00
|
|
|
// for _, host := range job.Resources {
|
|
|
|
// hwthreads := host.HWThreads
|
|
|
|
// if hwthreads == nil {
|
|
|
|
// hwthreads = topology.Node
|
|
|
|
// }
|
2023-08-24 11:52:36 +02:00
|
|
|
|
2023-08-24 12:56:35 +02:00
|
|
|
// hostCores, _ := topology.GetCoresFromHWThreads(hwthreads)
|
|
|
|
// totalJobCores += len(hostCores)
|
|
|
|
// }
|
2023-08-24 11:52:36 +02:00
|
|
|
|
2023-08-24 12:56:35 +02:00
|
|
|
// return totalJobCores
|
|
|
|
// }
|
2023-08-24 11:52:36 +02:00
|
|
|
|
2023-06-09 09:09:41 +02:00
|
|
|
func requireField(ctx context.Context, name string) bool {
|
|
|
|
fields := graphql.CollectAllFields(ctx)
|
|
|
|
|
|
|
|
for _, f := range fields {
|
|
|
|
if f == name {
|
|
|
|
return true
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
return false
|
|
|
|
}
|