diff --git a/internal/archiver/archiver.go b/internal/archiver/archiver.go
index e10a994..abaecd6 100644
--- a/internal/archiver/archiver.go
+++ b/internal/archiver/archiver.go
@@ -23,9 +23,9 @@ func ArchiveJob(job *schema.Job, ctx context.Context) (*schema.JobMeta, error) {
         allMetrics = append(allMetrics, mc.Name)
     }
 
-    // TODO: Talk about this! What resolutions to store data at...
     scopes := []schema.MetricScope{schema.MetricScopeNode}
-    if job.NumNodes <= 8 {
+    if job.NumNodes <= 8 { // FIXME: Add a config option for this
+        // This also requests the native scope if the core scope is not available
         scopes = append(scopes, schema.MetricScopeCore)
     }
 
@@ -49,7 +49,7 @@ func ArchiveJob(job *schema.Job, ctx context.Context) (*schema.JobMeta, error) {
         avg, min, max := 0.0, math.MaxFloat32, -math.MaxFloat32
         nodeData, ok := data["node"]
         if !ok {
-            // TODO/FIXME: Calc average for non-node metrics as well!
+            // This should never happen, since the node scope is always requested
            continue
         }
 
diff --git a/internal/repository/job.go b/internal/repository/job.go
index 1e552e1..7cfe4fd 100644
--- a/internal/repository/job.go
+++ b/internal/repository/job.go
@@ -453,6 +453,33 @@ func (r *JobRepository) StopJobsExceedingWalltimeBy(seconds int) error {
     return nil
 }
 
+func (r *JobRepository) FindRunningJobs(cluster string) ([]*schema.Job, error) {
+    query := sq.Select(jobColumns...).From("job").
+        Where("job.cluster = ?", cluster).
+        Where("job.job_state = 'running'").
+        Where("job.duration > 600")
+
+    rows, err := query.RunWith(r.stmtCache).Query()
+    if err != nil {
+        log.Error("Error while running query")
+        return nil, err
+    }
+
+    jobs := make([]*schema.Job, 0, 50)
+    for rows.Next() {
+        job, err := scanJob(rows)
+        if err != nil {
+            rows.Close()
+            log.Warn("Error while scanning rows")
+            return nil, err
+        }
+        jobs = append(jobs, job)
+    }
+
+    log.Infof("Return job count %d", len(jobs))
+    return jobs, nil
+}
+
 func (r *JobRepository) FindJobsBetween(startTimeBegin int64, startTimeEnd int64) ([]*schema.Job, error) {
     var query sq.SelectBuilder
 
@@ -532,7 +559,7 @@ func (r *JobRepository) UpdateEnergy(jobMeta *schema.JobMeta) error {
             if sc.MetricConfig[i].Energy == "power" {
                 energy = LoadJobStat(jobMeta, fp, "avg") * float64(jobMeta.Duration)
             } else if sc.MetricConfig[i].Energy == "energy" {
-                // FIXME: Compute sum of energy metric
+                // This assumes the metric is of aggregation type sum
             }
         }
 
@@ -574,7 +601,8 @@ func (r *JobRepository) UpdateFootprint(jobMeta *schema.JobMeta) error {
             statType = sc.MetricConfig[i].Footprint
         }
 
-        footprint[fp] = LoadJobStat(jobMeta, fp, statType)
+        name := fmt.Sprintf("%s_%s", fp, statType)
+        footprint[fp] = LoadJobStat(jobMeta, name, statType)
     }
 
     var rawFootprint []byte
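Note on the UpdateEnergy hunk above: the "energy" branch is still a no-op; the rewritten comment only documents the assumption that the metric is of aggregation type sum. A minimal sketch of what the branch could compute under that assumption; LoadJobStat and schema.JobMeta exist in the surrounding code, while the helper name and the NumNodes scaling are illustrative assumptions, not part of this diff:

    // totalEnergy sketches one way to fill the empty "energy" branch:
    // under sum aggregation, the job-level "avg" statistic is the mean of
    // the per-node values, so scaling it by the node count approximates
    // the summed total. Hypothetical helper in package repository.
    func totalEnergy(jobMeta *schema.JobMeta, fp string) float64 {
        return LoadJobStat(jobMeta, fp, "avg") * float64(jobMeta.NumNodes)
    }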
diff --git a/internal/taskManager/footprintService.go b/internal/taskManager/footprintService.go
index ba1fb09..d14026d 100644
--- a/internal/taskManager/footprintService.go
+++ b/internal/taskManager/footprintService.go
@@ -5,9 +5,14 @@
 package taskManager
 
 import (
+    "context"
+    "math"
     "time"
 
+    "github.com/ClusterCockpit/cc-backend/internal/metricDataDispatcher"
+    "github.com/ClusterCockpit/cc-backend/pkg/archive"
     "github.com/ClusterCockpit/cc-backend/pkg/log"
+    "github.com/ClusterCockpit/cc-backend/pkg/schema"
 
     "github.com/go-co-op/gocron/v2"
 )
@@ -19,7 +24,69 @@ func registerFootprintWorker() {
         func() {
             t := time.Now()
             log.Printf("Update Footprints started at %s", t.Format(time.RFC3339))
 
+            for _, cluster := range archive.Clusters {
+                jobs, err := jobRepo.FindRunningJobs(cluster.Name)
+                if err != nil {
+                    continue
+                }
+                allMetrics := make([]string, 0)
+                metricConfigs := archive.GetCluster(cluster.Name).MetricConfig
+                for _, mc := range metricConfigs {
+                    allMetrics = append(allMetrics, mc.Name)
+                }
+                scopes := []schema.MetricScope{schema.MetricScopeNode}
+                scopes = append(scopes, schema.MetricScopeCore)
+                scopes = append(scopes, schema.MetricScopeAccelerator)
+
+                for _, job := range jobs {
+                    jobData, err := metricDataDispatcher.LoadData(job, allMetrics, scopes, context.Background())
+                    if err != nil {
+                        log.Error("Error while loading job data for footprint update")
+                        continue
+                    }
+
+                    jobMeta := &schema.JobMeta{
+                        BaseJob:    job.BaseJob,
+                        StartTime:  job.StartTime.Unix(),
+                        Statistics: make(map[string]schema.JobStatistics),
+                    }
+
+                    for metric, data := range jobData {
+                        avg, min, max := 0.0, math.MaxFloat32, -math.MaxFloat32
+                        nodeData, ok := data["node"]
+                        if !ok {
+                            // This should never happen, since the node scope is always requested
+                            continue
+                        }
+
+                        for _, series := range nodeData.Series {
+                            avg += series.Statistics.Avg
+                            min = math.Min(min, series.Statistics.Min)
+                            max = math.Max(max, series.Statistics.Max)
+                        }
+
+                        jobMeta.Statistics[metric] = schema.JobStatistics{
+                            Unit: schema.Unit{
+                                Prefix: archive.GetMetricConfig(job.Cluster, metric).Unit.Prefix,
+                                Base:   archive.GetMetricConfig(job.Cluster, metric).Unit.Base,
+                            },
+                            Avg: avg / float64(job.NumNodes),
+                            Min: min,
+                            Max: max,
+                        }
+                    }
+
+                    if err := jobRepo.UpdateFootprint(jobMeta); err != nil {
+                        log.Errorf("Update job (dbid: %d) failed at update Footprint step: %s", job.ID, err.Error())
+                        continue
+                    }
+                    if err := jobRepo.UpdateEnergy(jobMeta); err != nil {
+                        log.Errorf("Update job (dbid: %d) failed at update Energy step: %s", job.ID, err.Error())
+                        continue
+                    }
+                }
+            }
             log.Print("Update Footprints done")
         }))
 }
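The per-metric statistics block in the worker above repeats the aggregation already performed in ArchiveJob: sum the per-series averages and divide by the node count, while tracking global min/max. A sketch of a shared helper, assuming "math" and the cc-backend schema package are imported and that schema.Series and schema.JobStatistics carry the fields used above; name and placement are suggestions only:

    // nodeStatistics aggregates node-scope series into job-level statistics:
    // Avg is the mean of the per-node averages, Min/Max are the global
    // extremes across all series. The Unit field is left for the caller to
    // fill in from the metric config.
    func nodeStatistics(series []schema.Series, numNodes int32) schema.JobStatistics {
        avg, min, max := 0.0, math.MaxFloat32, -math.MaxFloat32
        for _, s := range series {
            avg += s.Statistics.Avg
            min = math.Min(min, s.Statistics.Min)
            max = math.Max(max, s.Statistics.Max)
        }

        return schema.JobStatistics{
            Avg: avg / float64(numNodes),
            Min: min,
            Max: max,
        }
    }

Factoring this out would keep the archiver and the new live-update path from drifting apart; both currently maintain their own copy of the same loop.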