Add initial version of footprint update service

Not tested yet
This commit is contained in:
Jan Eitzinger 2024-08-30 07:22:40 +02:00
parent 7c51d88501
commit b0c9d1164d
3 changed files with 100 additions and 5 deletions

View File

@ -23,9 +23,9 @@ func ArchiveJob(job *schema.Job, ctx context.Context) (*schema.JobMeta, error) {
allMetrics = append(allMetrics, mc.Name) allMetrics = append(allMetrics, mc.Name)
} }
// TODO: Talk about this! What resolutions to store data at...
scopes := []schema.MetricScope{schema.MetricScopeNode} scopes := []schema.MetricScope{schema.MetricScopeNode}
if job.NumNodes <= 8 { if job.NumNodes <= 8 { // FIXME: Add a config option for this
// This will add the native scope if core scope is not available
scopes = append(scopes, schema.MetricScopeCore) scopes = append(scopes, schema.MetricScopeCore)
} }
@ -49,7 +49,7 @@ func ArchiveJob(job *schema.Job, ctx context.Context) (*schema.JobMeta, error) {
avg, min, max := 0.0, math.MaxFloat32, -math.MaxFloat32 avg, min, max := 0.0, math.MaxFloat32, -math.MaxFloat32
nodeData, ok := data["node"] nodeData, ok := data["node"]
if !ok { if !ok {
// TODO/FIXME: Calc average for non-node metrics as well! // This should never happen ?
continue continue
} }

View File

@ -453,6 +453,33 @@ func (r *JobRepository) StopJobsExceedingWalltimeBy(seconds int) error {
return nil return nil
} }
// FindRunningJobs returns every job of the given cluster whose job_state is
// 'running' and whose duration column is greater than 600.
//
// The cluster name is passed to the database as a bound parameter instead of
// being interpolated into the SQL text, so names containing quotes or other
// SQL metacharacters can neither break nor alter the query.
//
// It returns the scanned jobs, or a nil slice and the error if running the
// query, scanning a row, or iterating the result set fails.
func (r *JobRepository) FindRunningJobs(cluster string) ([]*schema.Job, error) {
	query := sq.Select(jobColumns...).From("job").
		Where("job.cluster = ?", cluster).
		Where("job.job_state = 'running'").
		Where("job.duration>600")

	rows, err := query.RunWith(r.stmtCache).Query()
	if err != nil {
		log.Error("Error while running query")
		return nil, err
	}
	// Release the result set on every return path, not only on scan errors.
	defer rows.Close()

	jobs := make([]*schema.Job, 0, 50)
	for rows.Next() {
		job, err := scanJob(rows)
		if err != nil {
			log.Warn("Error while scanning rows")
			return nil, err
		}
		jobs = append(jobs, job)
	}
	// rows.Next returns false both at the end of the result set and on
	// failure; only rows.Err distinguishes the two, so check it before
	// reporting a possibly truncated list as complete.
	if err := rows.Err(); err != nil {
		log.Warn("Error while iterating rows")
		return nil, err
	}

	log.Infof("Return job count %d", len(jobs))
	return jobs, nil
}
func (r *JobRepository) FindJobsBetween(startTimeBegin int64, startTimeEnd int64) ([]*schema.Job, error) { func (r *JobRepository) FindJobsBetween(startTimeBegin int64, startTimeEnd int64) ([]*schema.Job, error) {
var query sq.SelectBuilder var query sq.SelectBuilder
@ -532,7 +559,7 @@ func (r *JobRepository) UpdateEnergy(jobMeta *schema.JobMeta) error {
if sc.MetricConfig[i].Energy == "power" { if sc.MetricConfig[i].Energy == "power" {
energy = LoadJobStat(jobMeta, fp, "avg") * float64(jobMeta.Duration) energy = LoadJobStat(jobMeta, fp, "avg") * float64(jobMeta.Duration)
} else if sc.MetricConfig[i].Energy == "energy" { } else if sc.MetricConfig[i].Energy == "energy" {
// FIXME: Compute sum of energy metric // This assumes the metric is of aggregation type sum
} }
} }
@ -574,7 +601,8 @@ func (r *JobRepository) UpdateFootprint(jobMeta *schema.JobMeta) error {
statType = sc.MetricConfig[i].Footprint statType = sc.MetricConfig[i].Footprint
} }
footprint[fp] = LoadJobStat(jobMeta, fp, statType) name := fmt.Sprintf("%s_%s", fp, statType)
footprint[fp] = LoadJobStat(jobMeta, name, statType)
} }
var rawFootprint []byte var rawFootprint []byte

View File

@ -5,9 +5,14 @@
package taskManager package taskManager
import ( import (
"context"
"math"
"time" "time"
"github.com/ClusterCockpit/cc-backend/internal/metricDataDispatcher"
"github.com/ClusterCockpit/cc-backend/pkg/archive"
"github.com/ClusterCockpit/cc-backend/pkg/log" "github.com/ClusterCockpit/cc-backend/pkg/log"
"github.com/ClusterCockpit/cc-backend/pkg/schema"
"github.com/go-co-op/gocron/v2" "github.com/go-co-op/gocron/v2"
) )
@ -19,7 +24,69 @@ func registerFootprintWorker() {
func() { func() {
t := time.Now() t := time.Now()
log.Printf("Update Footprints started at %s", t.Format(time.RFC3339)) log.Printf("Update Footprints started at %s", t.Format(time.RFC3339))
for _, cluster := range archive.Clusters {
jobs, err := jobRepo.FindRunningJobs(cluster.Name)
if err != nil {
continue
}
allMetrics := make([]string, 0)
metricConfigs := archive.GetCluster(cluster.Name).MetricConfig
for _, mc := range metricConfigs {
allMetrics = append(allMetrics, mc.Name)
}
scopes := []schema.MetricScope{schema.MetricScopeNode}
scopes = append(scopes, schema.MetricScopeCore)
scopes = append(scopes, schema.MetricScopeAccelerator)
for _, job := range jobs {
jobData, err := metricDataDispatcher.LoadData(job, allMetrics, scopes, context.Background())
if err != nil {
log.Error("Error wile loading job data for footprint update")
continue
}
jobMeta := &schema.JobMeta{
BaseJob: job.BaseJob,
StartTime: job.StartTime.Unix(),
Statistics: make(map[string]schema.JobStatistics),
}
for metric, data := range jobData {
avg, min, max := 0.0, math.MaxFloat32, -math.MaxFloat32
nodeData, ok := data["node"]
if !ok {
// This should never happen ?
continue
}
for _, series := range nodeData.Series {
avg += series.Statistics.Avg
min = math.Min(min, series.Statistics.Min)
max = math.Max(max, series.Statistics.Max)
}
jobMeta.Statistics[metric] = schema.JobStatistics{
Unit: schema.Unit{
Prefix: archive.GetMetricConfig(job.Cluster, metric).Unit.Prefix,
Base: archive.GetMetricConfig(job.Cluster, metric).Unit.Base,
},
Avg: avg / float64(job.NumNodes),
Min: min,
Max: max,
}
}
if err := jobRepo.UpdateFootprint(jobMeta); err != nil {
log.Errorf("Update job (dbid: %d) failed at update Footprint step: %s", job.ID, err.Error())
continue
}
if err := jobRepo.UpdateEnergy(jobMeta); err != nil {
log.Errorf("Update job (dbid: %d) failed at update Energy step: %s", job.ID, err.Error())
continue
}
}
}
log.Print("Update Footprints done") log.Print("Update Footprints done")
})) }))
} }