Add initial version of footprint update service

Not tested yet
Jan Eitzinger 2024-08-30 07:22:40 +02:00
parent 7c51d88501
commit b0c9d1164d
3 changed files with 100 additions and 5 deletions

View File

@@ -23,9 +23,9 @@ func ArchiveJob(job *schema.Job, ctx context.Context) (*schema.JobMeta, error) {
allMetrics = append(allMetrics, mc.Name)
}
// TODO: Talk about this! What resolutions to store data at...
scopes := []schema.MetricScope{schema.MetricScopeNode}
-if job.NumNodes <= 8 {
+if job.NumNodes <= 8 { // FIXME: Add a config option for this
// This will add the native scope if core scope is not available
scopes = append(scopes, schema.MetricScopeCore)
}
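
The FIXME above flags the hard-coded node limit. A minimal sketch of the threshold made configurable, assuming a hypothetical coreScopeNodeLimit option (not an existing config key):

    // coreScopeNodeLimit is a hypothetical config value; 8 mirrors the
    // current hard-coded default.
    coreScopeNodeLimit := 8
    scopes := []schema.MetricScope{schema.MetricScopeNode}
    if job.NumNodes <= coreScopeNodeLimit {
        // Also covers the native scope when core scope is unavailable.
        scopes = append(scopes, schema.MetricScopeCore)
    }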
@@ -49,7 +49,7 @@ func ArchiveJob(job *schema.Job, ctx context.Context) (*schema.JobMeta, error) {
avg, min, max := 0.0, math.MaxFloat32, -math.MaxFloat32
nodeData, ok := data["node"]
if !ok {
-// TODO/FIXME: Calc average for non-node metrics as well!
+// This should never happen?
continue
}
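
This hunk folds per-node series statistics into a single job statistic. A self-contained sketch of that aggregation (types simplified, sample values made up):

    package main

    import (
        "fmt"
        "math"
    )

    type seriesStats struct{ Avg, Min, Max float64 }

    // aggregate mirrors the loop above: per-series averages are summed and
    // divided by the series count, min/max are tracked as running extremes.
    func aggregate(series []seriesStats) (avg, min, max float64) {
        avg, min, max = 0.0, math.MaxFloat32, -math.MaxFloat32
        for _, s := range series {
            avg += s.Avg
            min = math.Min(min, s.Min)
            max = math.Max(max, s.Max)
        }
        return avg / float64(len(series)), min, max
    }

    func main() {
        avg, min, max := aggregate([]seriesStats{{10, 2, 20}, {30, 1, 50}})
        fmt.Println(avg, min, max) // 20 1 50
    }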

View File

@@ -453,6 +453,33 @@ func (r *JobRepository) StopJobsExceedingWalltimeBy(seconds int) error {
return nil
}
func (r *JobRepository) FindRunningJobs(cluster string) ([]*schema.Job, error) {
query := sq.Select(jobColumns...).From("job").
Where("job.cluster = ?", cluster). // parameterized instead of string interpolation
Where("job.job_state = 'running'").
Where("job.duration > 600")
rows, err := query.RunWith(r.stmtCache).Query()
if err != nil {
log.Errorf("Error while running query: %v", err)
return nil, err
}
jobs := make([]*schema.Job, 0, 50)
for rows.Next() {
job, err := scanJob(rows)
if err != nil {
rows.Close()
log.Warn("Error while scanning rows")
return nil, err
}
jobs = append(jobs, job)
}
log.Infof("Returning %d running jobs", len(jobs))
return jobs, nil
}
func (r *JobRepository) FindJobsBetween(startTimeBegin int64, startTimeEnd int64) ([]*schema.Job, error) {
var query sq.SelectBuilder
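
FindRunningJobs builds its SQL with squirrel. A small sketch of what such a builder emits once the cluster filter is parameterized (column list shortened and cluster name "fritz" made up for illustration):

    package main

    import (
        "fmt"

        sq "github.com/Masterminds/squirrel"
    )

    func main() {
        query := sq.Select("job.id", "job.cluster").From("job").
            Where("job.cluster = ?", "fritz").
            Where("job.job_state = 'running'").
            Where("job.duration > 600")
        sql, args, _ := query.ToSql()
        fmt.Println(sql)  // SELECT job.id, job.cluster FROM job WHERE job.cluster = ? AND ...
        fmt.Println(args) // [fritz]
    }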
@@ -532,7 +559,7 @@ func (r *JobRepository) UpdateEnergy(jobMeta *schema.JobMeta) error {
if sc.MetricConfig[i].Energy == "power" {
energy = LoadJobStat(jobMeta, fp, "avg") * float64(jobMeta.Duration)
} else if sc.MetricConfig[i].Energy == "energy" {
-// FIXME: Compute sum of energy metric
+// This assumes the metric is of aggregation type sum
}
}
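
For metrics marked Energy == "power", the hunk above derives energy as average power times job duration. A quick worked example of the unit math (values made up):

    // E = P_avg * t
    avgPower := 250.0              // W, as returned by LoadJobStat(..., "avg")
    duration := 3600.0             // s, jobMeta.Duration
    energyJ := avgPower * duration // 900000 J
    energyKWh := energyJ / 3.6e6   // 0.25 kWh
    _ = energyKWh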
@@ -574,7 +601,8 @@ func (r *JobRepository) UpdateFootprint(jobMeta *schema.JobMeta) error {
statType = sc.MetricConfig[i].Footprint
}
-footprint[fp] = LoadJobStat(jobMeta, fp, statType)
+name := fmt.Sprintf("%s_%s", fp, statType)
+footprint[fp] = LoadJobStat(jobMeta, name, statType)
}
var rawFootprint []byte
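
The change above loads the statistic under a suffixed key. For example, with a (hypothetical) metric "flops_any" and footprint type "avg":

    fp, statType := "flops_any", "avg"         // illustrative metric name
    name := fmt.Sprintf("%s_%s", fp, statType) // -> "flops_any_avg"
    // The statistic is loaded under the suffixed name, while the
    // footprint map continues to use the plain metric name as key.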

View File

@@ -5,9 +5,14 @@
package taskManager
import (
"context"
"math"
"time"
"github.com/ClusterCockpit/cc-backend/internal/metricDataDispatcher"
"github.com/ClusterCockpit/cc-backend/pkg/archive"
"github.com/ClusterCockpit/cc-backend/pkg/log"
"github.com/ClusterCockpit/cc-backend/pkg/schema"
"github.com/go-co-op/gocron/v2"
)
@@ -19,7 +24,69 @@ func registerFootprintWorker() {
func() {
t := time.Now()
log.Printf("Update Footprints started at %s", t.Format(time.RFC3339))
for _, cluster := range archive.Clusters {
jobs, err := jobRepo.FindRunningJobs(cluster.Name)
if err != nil {
continue
}
allMetrics := make([]string, 0)
metricConfigs := archive.GetCluster(cluster.Name).MetricConfig
for _, mc := range metricConfigs {
allMetrics = append(allMetrics, mc.Name)
}
scopes := []schema.MetricScope{schema.MetricScopeNode}
scopes = append(scopes, schema.MetricScopeCore)
scopes = append(scopes, schema.MetricScopeAccelerator)
for _, job := range jobs {
jobData, err := metricDataDispatcher.LoadData(job, allMetrics, scopes, context.Background())
if err != nil {
log.Error("Error while loading job data for footprint update")
continue
}
jobMeta := &schema.JobMeta{
BaseJob: job.BaseJob,
StartTime: job.StartTime.Unix(),
Statistics: make(map[string]schema.JobStatistics),
}
for metric, data := range jobData {
avg, min, max := 0.0, math.MaxFloat32, -math.MaxFloat32
nodeData, ok := data["node"]
if !ok {
// This should never happen?
continue
}
for _, series := range nodeData.Series {
avg += series.Statistics.Avg
min = math.Min(min, series.Statistics.Min)
max = math.Max(max, series.Statistics.Max)
}
jobMeta.Statistics[metric] = schema.JobStatistics{
Unit: schema.Unit{
Prefix: archive.GetMetricConfig(job.Cluster, metric).Unit.Prefix,
Base: archive.GetMetricConfig(job.Cluster, metric).Unit.Base,
},
Avg: avg / float64(job.NumNodes),
Min: min,
Max: max,
}
}
if err := jobRepo.UpdateFootprint(jobMeta); err != nil {
log.Errorf("Update job (dbid: %d) failed at update Footprint step: %s", job.ID, err.Error())
continue
}
if err := jobRepo.UpdateEnergy(jobMeta); err != nil {
log.Errorf("Update job (dbid: %d) failed at update Energy step: %s", job.ID, err.Error())
continue
}
}
}
log.Print("Update Footprints done")
}))
}
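
The hunk starts inside registerFootprintWorker, so the scheduler wiring is elided. A minimal sketch of registering a periodic task with gocron v2 (the interval here is an assumption; the one used by the service is not visible in this diff):

    package main

    import (
        "log"
        "time"

        "github.com/go-co-op/gocron/v2"
    )

    func main() {
        s, err := gocron.NewScheduler()
        if err != nil {
            log.Fatal(err)
        }
        // Assumed interval; the real value is elided in the hunk above.
        _, err = s.NewJob(
            gocron.DurationJob(10*time.Minute),
            gocron.NewTask(func() {
                log.Print("Update Footprints tick")
            }),
        )
        if err != nil {
            log.Fatal(err)
        }
        s.Start()
        select {} // keep the process alive so the job can fire
    }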