Files
.github
api
cmd
configs
init
internal
api
archiver
auth
config
graph
importer
metricDataDispatcher
metricdata
repository
routerConfig
tagger
taskManager
commitJobService.go
compressionService.go
ldapSyncService.go
retentionService.go
stopJobsExceedTime.go
taskManager.go
updateDurationService.go
updateFootprintService.go
util
pkg
tools
web
.gitignore
.goreleaser.yaml
LICENSE
Makefile
README.md
ReleaseNotes.md
go.mod
go.sum
gqlgen.yml
startDemo.sh
tools.go
cc-backend/internal/taskManager/updateFootprintService.go

143 lines
4.6 KiB
Go

// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
package taskManager
import (
"context"
"math"
"time"
"github.com/ClusterCockpit/cc-backend/internal/config"
"github.com/ClusterCockpit/cc-backend/internal/metricdata"
"github.com/ClusterCockpit/cc-backend/pkg/archive"
"github.com/ClusterCockpit/cc-backend/pkg/log"
"github.com/ClusterCockpit/cc-backend/pkg/schema"
sq "github.com/Masterminds/squirrel"
"github.com/go-co-op/gocron/v2"
)
func RegisterFootprintWorker() {
var frequency string
if config.Keys.CronFrequency != nil && config.Keys.CronFrequency.FootprintWorker != "" {
frequency = config.Keys.CronFrequency.FootprintWorker
} else {
frequency = "10m"
}
d, _ := time.ParseDuration(frequency)
log.Infof("Register Footprint Update service with %s interval", frequency)
s.NewJob(gocron.DurationJob(d),
gocron.NewTask(
func() {
s := time.Now()
c := 0
ce := 0
cl := 0
log.Printf("Update Footprints started at %s", s.Format(time.RFC3339))
for _, cluster := range archive.Clusters {
s_cluster := time.Now()
jobs, err := jobRepo.FindRunningJobs(cluster.Name)
if err != nil {
continue
}
// NOTE: Additional Subcluster Loop Could Allow For Limited List Of Footprint-Metrics Only.
// - Chunk-Size Would Then Be 'SubCluster' (Running Jobs, Transactions) as Lists Can Change Within SCs
// - Would Require Review of 'updateFootprint' Usage (Logic Could Possibly Be Included Here Completely)
allMetrics := make([]string, 0)
metricConfigs := archive.GetCluster(cluster.Name).MetricConfig
for _, mc := range metricConfigs {
allMetrics = append(allMetrics, mc.Name)
}
repo, err := metricdata.GetMetricDataRepo(cluster.Name)
if err != nil {
log.Errorf("no metric data repository configured for '%s'", cluster.Name)
continue
}
pendingStatements := []sq.UpdateBuilder{}
for _, job := range jobs {
log.Debugf("Prepare job %d", job.JobID)
cl++
s_job := time.Now()
jobStats, err := repo.LoadStats(job, allMetrics, context.Background())
if err != nil {
log.Errorf("error wile loading job data stats for footprint update: %v", err)
ce++
continue
}
job.Statistics = make(map[string]schema.JobStatistics)
for _, metric := range allMetrics {
avg, min, max := 0.0, 0.0, 0.0
data, ok := jobStats[metric] // JobStats[Metric1:[Hostname1:[Stats], Hostname2:[Stats], ...], Metric2[...] ...]
if ok {
for _, res := range job.Resources {
hostStats, ok := data[res.Hostname]
if ok {
avg += hostStats.Avg
min = math.Min(min, hostStats.Min)
max = math.Max(max, hostStats.Max)
}
}
}
// Add values rounded to 2 digits: repo.LoadStats may return unrounded
job.Statistics[metric] = schema.JobStatistics{
Unit: schema.Unit{
Prefix: archive.GetMetricConfig(job.Cluster, metric).Unit.Prefix,
Base: archive.GetMetricConfig(job.Cluster, metric).Unit.Base,
},
Avg: (math.Round((avg/float64(job.NumNodes))*100) / 100),
Min: (math.Round(min*100) / 100),
Max: (math.Round(max*100) / 100),
}
}
// Build Statement per Job, Add to Pending Array
stmt := sq.Update("job")
stmt, err = jobRepo.UpdateFootprint(stmt, job)
if err != nil {
log.Errorf("update job (dbid: %d) statement build failed at footprint step: %s", job.ID, err.Error())
ce++
continue
}
stmt = stmt.Where("job.id = ?", job.ID)
pendingStatements = append(pendingStatements, stmt)
log.Debugf("Job %d took %s", job.JobID, time.Since(s_job))
}
t, err := jobRepo.TransactionInit()
if err != nil {
log.Errorf("failed TransactionInit %v", err)
log.Errorf("skipped %d transactions for cluster %s", len(pendingStatements), cluster.Name)
ce += len(pendingStatements)
} else {
for _, ps := range pendingStatements {
query, args, err := ps.ToSql()
if err != nil {
log.Errorf("failed in ToSQL conversion: %v", err)
ce++
} else {
// args...: Footprint-JSON, Energyfootprint-JSON, TotalEnergy, JobID
jobRepo.TransactionAdd(t, query, args...)
c++
}
}
jobRepo.TransactionEnd(t)
}
log.Debugf("Finish Cluster %s, took %s", cluster.Name, time.Since(s_cluster))
}
log.Printf("Updating %d (of %d; Skipped %d) Footprints is done and took %s", c, cl, ce, time.Since(s))
}))
}