mirror of
https://github.com/ClusterCockpit/cc-backend
synced 2024-12-26 05:19:05 +01:00
Add initial version of footprint update service
Not tested yet
This commit is contained in:
parent
7c51d88501
commit
b0c9d1164d
@ -23,9 +23,9 @@ func ArchiveJob(job *schema.Job, ctx context.Context) (*schema.JobMeta, error) {
|
|||||||
allMetrics = append(allMetrics, mc.Name)
|
allMetrics = append(allMetrics, mc.Name)
|
||||||
}
|
}
|
||||||
|
|
||||||
// TODO: Talk about this! What resolutions to store data at...
|
|
||||||
scopes := []schema.MetricScope{schema.MetricScopeNode}
|
scopes := []schema.MetricScope{schema.MetricScopeNode}
|
||||||
if job.NumNodes <= 8 {
|
if job.NumNodes <= 8 { // FIXME: Add a config option for this
|
||||||
|
// This will add the native scope if core scope is not available
|
||||||
scopes = append(scopes, schema.MetricScopeCore)
|
scopes = append(scopes, schema.MetricScopeCore)
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -49,7 +49,7 @@ func ArchiveJob(job *schema.Job, ctx context.Context) (*schema.JobMeta, error) {
|
|||||||
avg, min, max := 0.0, math.MaxFloat32, -math.MaxFloat32
|
avg, min, max := 0.0, math.MaxFloat32, -math.MaxFloat32
|
||||||
nodeData, ok := data["node"]
|
nodeData, ok := data["node"]
|
||||||
if !ok {
|
if !ok {
|
||||||
// TODO/FIXME: Calc average for non-node metrics as well!
|
// This should never happen ?
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -453,6 +453,33 @@ func (r *JobRepository) StopJobsExceedingWalltimeBy(seconds int) error {
|
|||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func (r *JobRepository) FindRunningJobs(cluster string) ([]*schema.Job, error) {
|
||||||
|
query := sq.Select(jobColumns...).From("job").
|
||||||
|
Where(fmt.Sprintf("job.cluster = '%s'", cluster)).
|
||||||
|
Where("job.job_state = 'running'").
|
||||||
|
Where("job.duration>600")
|
||||||
|
|
||||||
|
rows, err := query.RunWith(r.stmtCache).Query()
|
||||||
|
if err != nil {
|
||||||
|
log.Error("Error while running query")
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
|
||||||
|
jobs := make([]*schema.Job, 0, 50)
|
||||||
|
for rows.Next() {
|
||||||
|
job, err := scanJob(rows)
|
||||||
|
if err != nil {
|
||||||
|
rows.Close()
|
||||||
|
log.Warn("Error while scanning rows")
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
jobs = append(jobs, job)
|
||||||
|
}
|
||||||
|
|
||||||
|
log.Infof("Return job count %d", len(jobs))
|
||||||
|
return jobs, nil
|
||||||
|
}
|
||||||
|
|
||||||
func (r *JobRepository) FindJobsBetween(startTimeBegin int64, startTimeEnd int64) ([]*schema.Job, error) {
|
func (r *JobRepository) FindJobsBetween(startTimeBegin int64, startTimeEnd int64) ([]*schema.Job, error) {
|
||||||
var query sq.SelectBuilder
|
var query sq.SelectBuilder
|
||||||
|
|
||||||
@ -532,7 +559,7 @@ func (r *JobRepository) UpdateEnergy(jobMeta *schema.JobMeta) error {
|
|||||||
if sc.MetricConfig[i].Energy == "power" {
|
if sc.MetricConfig[i].Energy == "power" {
|
||||||
energy = LoadJobStat(jobMeta, fp, "avg") * float64(jobMeta.Duration)
|
energy = LoadJobStat(jobMeta, fp, "avg") * float64(jobMeta.Duration)
|
||||||
} else if sc.MetricConfig[i].Energy == "energy" {
|
} else if sc.MetricConfig[i].Energy == "energy" {
|
||||||
// FIXME: Compute sum of energy metric
|
// This assumes the metric is of aggregation type sum
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -574,7 +601,8 @@ func (r *JobRepository) UpdateFootprint(jobMeta *schema.JobMeta) error {
|
|||||||
statType = sc.MetricConfig[i].Footprint
|
statType = sc.MetricConfig[i].Footprint
|
||||||
}
|
}
|
||||||
|
|
||||||
footprint[fp] = LoadJobStat(jobMeta, fp, statType)
|
name := fmt.Sprintf("%s_%s", fp, statType)
|
||||||
|
footprint[fp] = LoadJobStat(jobMeta, name, statType)
|
||||||
}
|
}
|
||||||
|
|
||||||
var rawFootprint []byte
|
var rawFootprint []byte
|
||||||
|
@ -5,9 +5,14 @@
|
|||||||
package taskManager
|
package taskManager
|
||||||
|
|
||||||
import (
|
import (
|
||||||
|
"context"
|
||||||
|
"math"
|
||||||
"time"
|
"time"
|
||||||
|
|
||||||
|
"github.com/ClusterCockpit/cc-backend/internal/metricDataDispatcher"
|
||||||
|
"github.com/ClusterCockpit/cc-backend/pkg/archive"
|
||||||
"github.com/ClusterCockpit/cc-backend/pkg/log"
|
"github.com/ClusterCockpit/cc-backend/pkg/log"
|
||||||
|
"github.com/ClusterCockpit/cc-backend/pkg/schema"
|
||||||
"github.com/go-co-op/gocron/v2"
|
"github.com/go-co-op/gocron/v2"
|
||||||
)
|
)
|
||||||
|
|
||||||
@ -19,7 +24,69 @@ func registerFootprintWorker() {
|
|||||||
func() {
|
func() {
|
||||||
t := time.Now()
|
t := time.Now()
|
||||||
log.Printf("Update Footprints started at %s", t.Format(time.RFC3339))
|
log.Printf("Update Footprints started at %s", t.Format(time.RFC3339))
|
||||||
|
for _, cluster := range archive.Clusters {
|
||||||
|
jobs, err := jobRepo.FindRunningJobs(cluster.Name)
|
||||||
|
if err != nil {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
allMetrics := make([]string, 0)
|
||||||
|
metricConfigs := archive.GetCluster(cluster.Name).MetricConfig
|
||||||
|
for _, mc := range metricConfigs {
|
||||||
|
allMetrics = append(allMetrics, mc.Name)
|
||||||
|
}
|
||||||
|
|
||||||
|
scopes := []schema.MetricScope{schema.MetricScopeNode}
|
||||||
|
scopes = append(scopes, schema.MetricScopeCore)
|
||||||
|
scopes = append(scopes, schema.MetricScopeAccelerator)
|
||||||
|
|
||||||
|
for _, job := range jobs {
|
||||||
|
jobData, err := metricDataDispatcher.LoadData(job, allMetrics, scopes, context.Background())
|
||||||
|
if err != nil {
|
||||||
|
log.Error("Error wile loading job data for footprint update")
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
|
||||||
|
jobMeta := &schema.JobMeta{
|
||||||
|
BaseJob: job.BaseJob,
|
||||||
|
StartTime: job.StartTime.Unix(),
|
||||||
|
Statistics: make(map[string]schema.JobStatistics),
|
||||||
|
}
|
||||||
|
|
||||||
|
for metric, data := range jobData {
|
||||||
|
avg, min, max := 0.0, math.MaxFloat32, -math.MaxFloat32
|
||||||
|
nodeData, ok := data["node"]
|
||||||
|
if !ok {
|
||||||
|
// This should never happen ?
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
|
||||||
|
for _, series := range nodeData.Series {
|
||||||
|
avg += series.Statistics.Avg
|
||||||
|
min = math.Min(min, series.Statistics.Min)
|
||||||
|
max = math.Max(max, series.Statistics.Max)
|
||||||
|
}
|
||||||
|
|
||||||
|
jobMeta.Statistics[metric] = schema.JobStatistics{
|
||||||
|
Unit: schema.Unit{
|
||||||
|
Prefix: archive.GetMetricConfig(job.Cluster, metric).Unit.Prefix,
|
||||||
|
Base: archive.GetMetricConfig(job.Cluster, metric).Unit.Base,
|
||||||
|
},
|
||||||
|
Avg: avg / float64(job.NumNodes),
|
||||||
|
Min: min,
|
||||||
|
Max: max,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if err := jobRepo.UpdateFootprint(jobMeta); err != nil {
|
||||||
|
log.Errorf("Update job (dbid: %d) failed at update Footprint step: %s", job.ID, err.Error())
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
if err := jobRepo.UpdateEnergy(jobMeta); err != nil {
|
||||||
|
log.Errorf("Update job (dbid: %d) failed at update Energy step: %s", job.ID, err.Error())
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
log.Print("Update Footprints done")
|
log.Print("Update Footprints done")
|
||||||
}))
|
}))
|
||||||
}
|
}
|
||||||
|
Loading…
Reference in New Issue
Block a user