mirror of
https://github.com/ClusterCockpit/cc-backend
synced 2024-12-26 05:19:05 +01:00
Add initial version of footprint update service
Not tested yet
This commit is contained in:
parent
7c51d88501
commit
b0c9d1164d
@ -23,9 +23,9 @@ func ArchiveJob(job *schema.Job, ctx context.Context) (*schema.JobMeta, error) {
|
||||
allMetrics = append(allMetrics, mc.Name)
|
||||
}
|
||||
|
||||
// TODO: Talk about this! What resolutions to store data at...
|
||||
scopes := []schema.MetricScope{schema.MetricScopeNode}
|
||||
if job.NumNodes <= 8 {
|
||||
if job.NumNodes <= 8 { // FIXME: Add a config option for this
|
||||
// This will add the native scope if core scope is not available
|
||||
scopes = append(scopes, schema.MetricScopeCore)
|
||||
}
|
||||
|
||||
@ -49,7 +49,7 @@ func ArchiveJob(job *schema.Job, ctx context.Context) (*schema.JobMeta, error) {
|
||||
avg, min, max := 0.0, math.MaxFloat32, -math.MaxFloat32
|
||||
nodeData, ok := data["node"]
|
||||
if !ok {
|
||||
// TODO/FIXME: Calc average for non-node metrics as well!
|
||||
// This should never happen ?
|
||||
continue
|
||||
}
|
||||
|
||||
|
@ -453,6 +453,33 @@ func (r *JobRepository) StopJobsExceedingWalltimeBy(seconds int) error {
|
||||
return nil
|
||||
}
|
||||
|
||||
func (r *JobRepository) FindRunningJobs(cluster string) ([]*schema.Job, error) {
|
||||
query := sq.Select(jobColumns...).From("job").
|
||||
Where(fmt.Sprintf("job.cluster = '%s'", cluster)).
|
||||
Where("job.job_state = 'running'").
|
||||
Where("job.duration>600")
|
||||
|
||||
rows, err := query.RunWith(r.stmtCache).Query()
|
||||
if err != nil {
|
||||
log.Error("Error while running query")
|
||||
return nil, err
|
||||
}
|
||||
|
||||
jobs := make([]*schema.Job, 0, 50)
|
||||
for rows.Next() {
|
||||
job, err := scanJob(rows)
|
||||
if err != nil {
|
||||
rows.Close()
|
||||
log.Warn("Error while scanning rows")
|
||||
return nil, err
|
||||
}
|
||||
jobs = append(jobs, job)
|
||||
}
|
||||
|
||||
log.Infof("Return job count %d", len(jobs))
|
||||
return jobs, nil
|
||||
}
|
||||
|
||||
func (r *JobRepository) FindJobsBetween(startTimeBegin int64, startTimeEnd int64) ([]*schema.Job, error) {
|
||||
var query sq.SelectBuilder
|
||||
|
||||
@ -532,7 +559,7 @@ func (r *JobRepository) UpdateEnergy(jobMeta *schema.JobMeta) error {
|
||||
if sc.MetricConfig[i].Energy == "power" {
|
||||
energy = LoadJobStat(jobMeta, fp, "avg") * float64(jobMeta.Duration)
|
||||
} else if sc.MetricConfig[i].Energy == "energy" {
|
||||
// FIXME: Compute sum of energy metric
|
||||
// This assumes the metric is of aggregation type sum
|
||||
}
|
||||
}
|
||||
|
||||
@ -574,7 +601,8 @@ func (r *JobRepository) UpdateFootprint(jobMeta *schema.JobMeta) error {
|
||||
statType = sc.MetricConfig[i].Footprint
|
||||
}
|
||||
|
||||
footprint[fp] = LoadJobStat(jobMeta, fp, statType)
|
||||
name := fmt.Sprintf("%s_%s", fp, statType)
|
||||
footprint[fp] = LoadJobStat(jobMeta, name, statType)
|
||||
}
|
||||
|
||||
var rawFootprint []byte
|
||||
|
@ -5,9 +5,14 @@
|
||||
package taskManager
|
||||
|
||||
import (
|
||||
"context"
|
||||
"math"
|
||||
"time"
|
||||
|
||||
"github.com/ClusterCockpit/cc-backend/internal/metricDataDispatcher"
|
||||
"github.com/ClusterCockpit/cc-backend/pkg/archive"
|
||||
"github.com/ClusterCockpit/cc-backend/pkg/log"
|
||||
"github.com/ClusterCockpit/cc-backend/pkg/schema"
|
||||
"github.com/go-co-op/gocron/v2"
|
||||
)
|
||||
|
||||
@ -19,7 +24,69 @@ func registerFootprintWorker() {
|
||||
func() {
|
||||
t := time.Now()
|
||||
log.Printf("Update Footprints started at %s", t.Format(time.RFC3339))
|
||||
for _, cluster := range archive.Clusters {
|
||||
jobs, err := jobRepo.FindRunningJobs(cluster.Name)
|
||||
if err != nil {
|
||||
continue
|
||||
}
|
||||
allMetrics := make([]string, 0)
|
||||
metricConfigs := archive.GetCluster(cluster.Name).MetricConfig
|
||||
for _, mc := range metricConfigs {
|
||||
allMetrics = append(allMetrics, mc.Name)
|
||||
}
|
||||
|
||||
scopes := []schema.MetricScope{schema.MetricScopeNode}
|
||||
scopes = append(scopes, schema.MetricScopeCore)
|
||||
scopes = append(scopes, schema.MetricScopeAccelerator)
|
||||
|
||||
for _, job := range jobs {
|
||||
jobData, err := metricDataDispatcher.LoadData(job, allMetrics, scopes, context.Background())
|
||||
if err != nil {
|
||||
log.Error("Error wile loading job data for footprint update")
|
||||
continue
|
||||
}
|
||||
|
||||
jobMeta := &schema.JobMeta{
|
||||
BaseJob: job.BaseJob,
|
||||
StartTime: job.StartTime.Unix(),
|
||||
Statistics: make(map[string]schema.JobStatistics),
|
||||
}
|
||||
|
||||
for metric, data := range jobData {
|
||||
avg, min, max := 0.0, math.MaxFloat32, -math.MaxFloat32
|
||||
nodeData, ok := data["node"]
|
||||
if !ok {
|
||||
// This should never happen ?
|
||||
continue
|
||||
}
|
||||
|
||||
for _, series := range nodeData.Series {
|
||||
avg += series.Statistics.Avg
|
||||
min = math.Min(min, series.Statistics.Min)
|
||||
max = math.Max(max, series.Statistics.Max)
|
||||
}
|
||||
|
||||
jobMeta.Statistics[metric] = schema.JobStatistics{
|
||||
Unit: schema.Unit{
|
||||
Prefix: archive.GetMetricConfig(job.Cluster, metric).Unit.Prefix,
|
||||
Base: archive.GetMetricConfig(job.Cluster, metric).Unit.Base,
|
||||
},
|
||||
Avg: avg / float64(job.NumNodes),
|
||||
Min: min,
|
||||
Max: max,
|
||||
}
|
||||
}
|
||||
|
||||
if err := jobRepo.UpdateFootprint(jobMeta); err != nil {
|
||||
log.Errorf("Update job (dbid: %d) failed at update Footprint step: %s", job.ID, err.Error())
|
||||
continue
|
||||
}
|
||||
if err := jobRepo.UpdateEnergy(jobMeta); err != nil {
|
||||
log.Errorf("Update job (dbid: %d) failed at update Energy step: %s", job.ID, err.Error())
|
||||
continue
|
||||
}
|
||||
}
|
||||
}
|
||||
log.Print("Update Footprints done")
|
||||
}))
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user