diff --git a/cmd/cc-backend/main.go b/cmd/cc-backend/main.go index 3a49923..9f7e673 100644 --- a/cmd/cc-backend/main.go +++ b/cmd/cc-backend/main.go @@ -181,7 +181,7 @@ func main() { log.Fatalf("failed to initialize archive: %s", err.Error()) } - if err := metricdata.Init(config.Keys.DisableArchive); err != nil { + if err := metricdata.Init(); err != nil { log.Fatalf("failed to initialize metricdata repository: %s", err.Error()) } diff --git a/cmd/cc-backend/server.go b/cmd/cc-backend/server.go index d2b62e2..bc20fcf 100644 --- a/cmd/cc-backend/server.go +++ b/cmd/cc-backend/server.go @@ -20,6 +20,7 @@ import ( "github.com/99designs/gqlgen/graphql/handler" "github.com/99designs/gqlgen/graphql/playground" "github.com/ClusterCockpit/cc-backend/internal/api" + "github.com/ClusterCockpit/cc-backend/internal/archiver" "github.com/ClusterCockpit/cc-backend/internal/auth" "github.com/ClusterCockpit/cc-backend/internal/config" "github.com/ClusterCockpit/cc-backend/internal/graph" @@ -260,8 +261,8 @@ func serverStart() { }) server = &http.Server{ - ReadTimeout: 10 * time.Second, - WriteTimeout: 10 * time.Second, + ReadTimeout: 20 * time.Second, + WriteTimeout: 20 * time.Second, Handler: handler, Addr: config.Keys.Addr, } @@ -308,5 +309,5 @@ func serverShutdown() { server.Shutdown(context.Background()) // Then, wait for any async archivings still pending... - apiHandle.JobRepository.WaitForArchiving() + archiver.WaitForArchiving() } diff --git a/internal/api/api_test.go b/internal/api/api_test.go index 423bf5c..d0a7916 100644 --- a/internal/api/api_test.go +++ b/internal/api/api_test.go @@ -19,9 +19,11 @@ import ( "testing" "github.com/ClusterCockpit/cc-backend/internal/api" + "github.com/ClusterCockpit/cc-backend/internal/archiver" "github.com/ClusterCockpit/cc-backend/internal/auth" "github.com/ClusterCockpit/cc-backend/internal/config" "github.com/ClusterCockpit/cc-backend/internal/graph" + "github.com/ClusterCockpit/cc-backend/internal/metricDataDispatcher" "github.com/ClusterCockpit/cc-backend/internal/metricdata" "github.com/ClusterCockpit/cc-backend/internal/repository" "github.com/ClusterCockpit/cc-backend/pkg/archive" @@ -150,10 +152,11 @@ func setup(t *testing.T) *api.RestApi { t.Fatal(err) } - if err := metricdata.Init(config.Keys.DisableArchive); err != nil { + if err := metricdata.Init(); err != nil { t.Fatal(err) } + archiver.Start(repository.GetJobRepository()) auth.Init() graph.Init() @@ -311,7 +314,7 @@ func TestRestApi(t *testing.T) { t.Fatal(response.Status, recorder.Body.String()) } - restapi.JobRepository.WaitForArchiving() + archiver.WaitForArchiving() resolver := graph.GetResolverInstance() job, err := resolver.Query().Job(ctx, strconv.Itoa(int(dbid))) if err != nil { @@ -341,7 +344,7 @@ func TestRestApi(t *testing.T) { } t.Run("CheckArchive", func(t *testing.T) { - data, err := metricdata.LoadData(stoppedJob, []string{"load_one"}, []schema.MetricScope{schema.MetricScopeNode}, context.Background()) + data, err := metricDataDispatcher.LoadData(stoppedJob, []string{"load_one"}, []schema.MetricScope{schema.MetricScopeNode}, context.Background()) if err != nil { t.Fatal(err) } @@ -422,7 +425,7 @@ func TestRestApi(t *testing.T) { t.Fatal(response.Status, recorder.Body.String()) } - restapi.JobRepository.WaitForArchiving() + archiver.WaitForArchiving() jobid, cluster := int64(12345), "testcluster" job, err := restapi.JobRepository.Find(&jobid, &cluster, nil) if err != nil { diff --git a/internal/api/rest.go b/internal/api/rest.go index 4efdd4a..eee65c7 100644 --- 
a/internal/api/rest.go +++ b/internal/api/rest.go @@ -19,12 +19,13 @@ import ( "sync" "time" + "github.com/ClusterCockpit/cc-backend/internal/archiver" "github.com/ClusterCockpit/cc-backend/internal/auth" "github.com/ClusterCockpit/cc-backend/internal/config" "github.com/ClusterCockpit/cc-backend/internal/graph" "github.com/ClusterCockpit/cc-backend/internal/graph/model" "github.com/ClusterCockpit/cc-backend/internal/importer" - "github.com/ClusterCockpit/cc-backend/internal/metricdata" + "github.com/ClusterCockpit/cc-backend/internal/metricDataDispatcher" "github.com/ClusterCockpit/cc-backend/internal/repository" "github.com/ClusterCockpit/cc-backend/internal/util" "github.com/ClusterCockpit/cc-backend/pkg/archive" @@ -516,7 +517,7 @@ func (api *RestApi) getCompleteJobById(rw http.ResponseWriter, r *http.Request) var data schema.JobData if r.URL.Query().Get("all-metrics") == "true" { - data, err = metricdata.LoadData(job, nil, scopes, r.Context()) + data, err = metricDataDispatcher.LoadData(job, nil, scopes, r.Context()) if err != nil { log.Warn("Error while loading job data") return @@ -605,7 +606,7 @@ func (api *RestApi) getJobById(rw http.ResponseWriter, r *http.Request) { scopes = []schema.MetricScope{"node"} } - data, err := metricdata.LoadData(job, metrics, scopes, r.Context()) + data, err := metricDataDispatcher.LoadData(job, metrics, scopes, r.Context()) if err != nil { log.Warn("Error while loading job data") return @@ -1083,7 +1084,7 @@ func (api *RestApi) checkAndHandleStopJob(rw http.ResponseWriter, job *schema.Jo } // Trigger async archiving - api.JobRepository.TriggerArchiving(job) + archiver.TriggerArchiving(job) } func (api *RestApi) getJobMetrics(rw http.ResponseWriter, r *http.Request) { diff --git a/internal/archiver/archiveWorker.go b/internal/archiver/archiveWorker.go new file mode 100644 index 0000000..628e36e --- /dev/null +++ b/internal/archiver/archiveWorker.go @@ -0,0 +1,94 @@ +// Copyright (C) NHR@FAU, University Erlangen-Nuremberg. +// All rights reserved. +// Use of this source code is governed by a MIT-style +// license that can be found in the LICENSE file. +package archiver + +import ( + "context" + "sync" + "time" + + "github.com/ClusterCockpit/cc-backend/internal/repository" + "github.com/ClusterCockpit/cc-backend/pkg/log" + "github.com/ClusterCockpit/cc-backend/pkg/schema" + sq "github.com/Masterminds/squirrel" +) + +var ( + archivePending sync.WaitGroup + archiveChannel chan *schema.Job + jobRepo *repository.JobRepository +) + +func Start(r *repository.JobRepository) { + archiveChannel = make(chan *schema.Job, 128) + jobRepo = r + + go archivingWorker() +} + +// Archiving worker thread +func archivingWorker() { + for { + select { + case job, ok := <-archiveChannel: + if !ok { + break + } + start := time.Now() + // not using meta data, called to load JobMeta into Cache? 
+ // will fail if the job metadata is not in the repository + if _, err := jobRepo.FetchMetadata(job); err != nil { + log.Errorf("archiving job (dbid: %d) failed at check metadata step: %s", job.ID, err.Error()) + jobRepo.UpdateMonitoringStatus(job.ID, schema.MonitoringStatusArchivingFailed) + archivePending.Done() // decrement also on failure so WaitForArchiving cannot block forever + continue + } + + // ArchiveJob will fetch all the data from a MetricDataRepository and push into configured archive backend + // TODO: Maybe use context with cancel/timeout here + jobMeta, err := ArchiveJob(job, context.Background()) + if err != nil { + log.Errorf("archiving job (dbid: %d) failed at archiving job step: %s", job.ID, err.Error()) + jobRepo.UpdateMonitoringStatus(job.ID, schema.MonitoringStatusArchivingFailed) + archivePending.Done() + continue + } + + stmt := sq.Update("job").Where("job.id = ?", job.ID) + + if stmt, err = jobRepo.UpdateFootprint(stmt, jobMeta); err != nil { + log.Errorf("archiving job (dbid: %d) failed at update Footprint step: %s", job.ID, err.Error()) + archivePending.Done() + continue + } + if stmt, err = jobRepo.UpdateEnergy(stmt, jobMeta); err != nil { + log.Errorf("archiving job (dbid: %d) failed at update Energy step: %s", job.ID, err.Error()) + archivePending.Done() + continue + } + // Update the jobs database entry one last time: + stmt = jobRepo.MarkArchived(stmt, schema.MonitoringStatusArchivingSuccessful) + if err := jobRepo.Execute(stmt); err != nil { + log.Errorf("archiving job (dbid: %d) failed at db execute: %s", job.ID, err.Error()) + archivePending.Done() + continue + } + log.Debugf("archiving job %d took %s", job.JobID, time.Since(start)) + log.Printf("archiving job (dbid: %d) successful", job.ID) + archivePending.Done() + } + } +} + +// Trigger async archiving +func TriggerArchiving(job *schema.Job) { + if archiveChannel == nil { + log.Fatal("Cannot archive without archiving channel. Did you Start the archiver?") + } + + archivePending.Add(1) + archiveChannel <- job +} + +// Wait for background thread to finish pending archiving operations +func WaitForArchiving() { + // wait for the worker to process all pending jobs (the channel itself stays open) + archivePending.Wait() +} diff --git a/internal/archiver/archiver.go b/internal/archiver/archiver.go new file mode 100644 index 0000000..de84cf0 --- /dev/null +++ b/internal/archiver/archiver.go @@ -0,0 +1,82 @@ +// Copyright (C) NHR@FAU, University Erlangen-Nuremberg. +// All rights reserved. +// Use of this source code is governed by a MIT-style +// license that can be found in the LICENSE file.
+package archiver + +import ( + "context" + "math" + + "github.com/ClusterCockpit/cc-backend/internal/config" + "github.com/ClusterCockpit/cc-backend/internal/metricDataDispatcher" + "github.com/ClusterCockpit/cc-backend/pkg/archive" + "github.com/ClusterCockpit/cc-backend/pkg/log" + "github.com/ClusterCockpit/cc-backend/pkg/schema" +) + +// Writes a running job to the job-archive +func ArchiveJob(job *schema.Job, ctx context.Context) (*schema.JobMeta, error) { + allMetrics := make([]string, 0) + metricConfigs := archive.GetCluster(job.Cluster).MetricConfig + for _, mc := range metricConfigs { + allMetrics = append(allMetrics, mc.Name) + } + + scopes := []schema.MetricScope{schema.MetricScopeNode} + // FIXME: Add a config option for this + if job.NumNodes <= 8 { + // This will add the native scope if core scope is not available + scopes = append(scopes, schema.MetricScopeCore) + } + + if job.NumAcc > 0 { + scopes = append(scopes, schema.MetricScopeAccelerator) + } + + jobData, err := metricDataDispatcher.LoadData(job, allMetrics, scopes, ctx) + if err != nil { + log.Error("Error while loading job data for archiving") + return nil, err + } + + jobMeta := &schema.JobMeta{ + BaseJob: job.BaseJob, + StartTime: job.StartTime.Unix(), + Statistics: make(map[string]schema.JobStatistics), + } + + for metric, data := range jobData { + avg, min, max := 0.0, math.MaxFloat32, -math.MaxFloat32 + nodeData, ok := data["node"] + if !ok { + // This should never happen? + continue + } + + for _, series := range nodeData.Series { + avg += series.Statistics.Avg + min = math.Min(min, series.Statistics.Min) + max = math.Max(max, series.Statistics.Max) + } + + jobMeta.Statistics[metric] = schema.JobStatistics{ + Unit: schema.Unit{ + Prefix: archive.GetMetricConfig(job.Cluster, metric).Unit.Prefix, + Base: archive.GetMetricConfig(job.Cluster, metric).Unit.Base, + }, + Avg: avg / float64(job.NumNodes), + Min: min, + Max: max, + } + } + + // If the file-based archive is disabled, + // only return the JobMeta structure as the + // statistics in there are needed.
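+ // (The caller, e.g. the archiving worker, still persists these statistics via UpdateFootprint/UpdateEnergy.)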
+ if config.Keys.DisableArchive { + return jobMeta, nil + } + + return jobMeta, archive.GetHandle().ImportJob(jobMeta, &jobData) +} diff --git a/internal/graph/schema.resolvers.go b/internal/graph/schema.resolvers.go index 4673354..6fcb4a0 100644 --- a/internal/graph/schema.resolvers.go +++ b/internal/graph/schema.resolvers.go @@ -15,7 +15,7 @@ import ( "github.com/ClusterCockpit/cc-backend/internal/config" "github.com/ClusterCockpit/cc-backend/internal/graph/generated" "github.com/ClusterCockpit/cc-backend/internal/graph/model" - "github.com/ClusterCockpit/cc-backend/internal/metricdata" + "github.com/ClusterCockpit/cc-backend/internal/metricDataDispatcher" "github.com/ClusterCockpit/cc-backend/internal/repository" "github.com/ClusterCockpit/cc-backend/pkg/archive" "github.com/ClusterCockpit/cc-backend/pkg/log" @@ -233,7 +233,7 @@ func (r *queryResolver) JobMetrics(ctx context.Context, id string, metrics []str return nil, err } - data, err := metricdata.LoadData(job, metrics, scopes, ctx) + data, err := metricDataDispatcher.LoadData(job, metrics, scopes, ctx) if err != nil { log.Warn("Error while loading job data") return nil, err @@ -385,7 +385,7 @@ func (r *queryResolver) NodeMetrics(ctx context.Context, cluster string, nodes [ } } - data, err := metricdata.LoadNodeData(cluster, metrics, nodes, scopes, from, to, ctx) + data, err := metricDataDispatcher.LoadNodeData(cluster, metrics, nodes, scopes, from, to, ctx) if err != nil { log.Warn("Error while loading node data") return nil, err @@ -442,9 +442,11 @@ func (r *Resolver) Query() generated.QueryResolver { return &queryResolver{r} } // SubCluster returns generated.SubClusterResolver implementation. func (r *Resolver) SubCluster() generated.SubClusterResolver { return &subClusterResolver{r} } -type clusterResolver struct{ *Resolver } -type jobResolver struct{ *Resolver } -type metricValueResolver struct{ *Resolver } -type mutationResolver struct{ *Resolver } -type queryResolver struct{ *Resolver } -type subClusterResolver struct{ *Resolver } +type ( + clusterResolver struct{ *Resolver } + jobResolver struct{ *Resolver } + metricValueResolver struct{ *Resolver } + mutationResolver struct{ *Resolver } + queryResolver struct{ *Resolver } + subClusterResolver struct{ *Resolver } +) diff --git a/internal/graph/util.go b/internal/graph/util.go index 3e65b6c..8296a02 100644 --- a/internal/graph/util.go +++ b/internal/graph/util.go @@ -11,7 +11,7 @@ import ( "github.com/99designs/gqlgen/graphql" "github.com/ClusterCockpit/cc-backend/internal/graph/model" - "github.com/ClusterCockpit/cc-backend/internal/metricdata" + "github.com/ClusterCockpit/cc-backend/internal/metricDataDispatcher" "github.com/ClusterCockpit/cc-backend/pkg/log" "github.com/ClusterCockpit/cc-backend/pkg/schema" // "github.com/ClusterCockpit/cc-backend/pkg/archive" @@ -24,8 +24,8 @@ func (r *queryResolver) rooflineHeatmap( ctx context.Context, filter []*model.JobFilter, rows int, cols int, - minX float64, minY float64, maxX float64, maxY float64) ([][]float64, error) { - + minX float64, minY float64, maxX float64, maxY float64, +) ([][]float64, error) { jobs, err := r.Repo.QueryJobs(ctx, filter, &model.PageRequest{Page: 1, ItemsPerPage: MAX_JOBS_FOR_ANALYSIS + 1}, nil) if err != nil { log.Error("Error while querying jobs for roofline") @@ -47,7 +47,7 @@ func (r *queryResolver) rooflineHeatmap( continue } - jobdata, err := metricdata.LoadData(job, []string{"flops_any", "mem_bw"}, []schema.MetricScope{schema.MetricScopeNode}, ctx) + jobdata, err := 
metricDataDispatcher.LoadData(job, []string{"flops_any", "mem_bw"}, []schema.MetricScope{schema.MetricScopeNode}, ctx) if err != nil { log.Errorf("Error while loading roofline metrics for job %d", job.ID) return nil, err } @@ -120,7 +120,7 @@ func (r *queryResolver) jobsFootprints(ctx context.Context, filter []*model.JobF continue } - if err := metricdata.LoadAverages(job, metrics, avgs, ctx); err != nil { + if err := metricDataDispatcher.LoadAverages(job, metrics, avgs, ctx); err != nil { log.Error("Error while loading averages for footprint") return nil, err } diff --git a/internal/importer/handleImport.go b/internal/importer/handleImport.go index 54158c4..153402a 100644 --- a/internal/importer/handleImport.go +++ b/internal/importer/handleImport.go @@ -77,8 +77,16 @@ func HandleImportFlag(flag string) error { job.Footprint = make(map[string]float64) for _, fp := range sc.Footprint { - job.Footprint[fp] = repository.LoadJobStat(&job, fp) + statType := "avg" + + if i, err := archive.MetricIndex(sc.MetricConfig, fp); err == nil { + statType = sc.MetricConfig[i].Footprint + } + + name := fmt.Sprintf("%s_%s", fp, statType) + job.Footprint[fp] = repository.LoadJobStat(&job, name, statType) } + job.RawFootprint, err = json.Marshal(job.Footprint) if err != nil { log.Warn("Error while marshaling job footprint") diff --git a/internal/importer/initDB.go b/internal/importer/initDB.go index 4b9abab..5f06f36 100644 --- a/internal/importer/initDB.go +++ b/internal/importer/initDB.go @@ -16,6 +16,11 @@ import ( "github.com/ClusterCockpit/cc-backend/pkg/schema" ) +const ( + addTagQuery = "INSERT INTO tag (tag_name, tag_type) VALUES (?, ?)" + setTagQuery = "INSERT INTO jobtag (job_id, tag_id) VALUES (?, ?)" +) + // Delete the tables "job", "tag" and "jobtag" from the database and // repopulate them using the jobs found in `archive`. func InitDB() error { @@ -68,7 +73,15 @@ func InitDB() error { job.Footprint = make(map[string]float64) for _, fp := range sc.Footprint { - job.Footprint[fp] = repository.LoadJobStat(jobMeta, fp) + statType := "avg" + + if i, err := archive.MetricIndex(sc.MetricConfig, fp); err == nil { + statType = sc.MetricConfig[i].Footprint + } + + name := fmt.Sprintf("%s_%s", fp, statType) + + job.Footprint[fp] = repository.LoadJobStat(jobMeta, name, statType) } + job.RawFootprint, err = json.Marshal(job.Footprint) @@ -97,7 +110,8 @@ func InitDB() error { continue } - id, err := r.TransactionAdd(t, job) + id, err := r.TransactionAddNamed(t, + repository.NamedJobInsert, job) if err != nil { log.Errorf("repository initDB(): %v", err) errorOccured++ @@ -108,7 +122,9 @@ func InitDB() error { tagstr := tag.Name + ":" + tag.Type tagId, ok := tags[tagstr] if !ok { - tagId, err = r.TransactionAddTag(t, tag) + tagId, err = r.TransactionAdd(t, + addTagQuery, + tag.Name, tag.Type) if err != nil { log.Errorf("Error adding tag: %v", err) errorOccured++ @@ -117,7 +133,9 @@ func InitDB() error { tags[tagstr] = tagId } - r.TransactionSetTag(t, id, tagId) + r.TransactionAdd(t, + setTagQuery, + id, tagId) } if err == nil { diff --git a/internal/metricDataDispatcher/dataLoader.go b/internal/metricDataDispatcher/dataLoader.go new file mode 100644 index 0000000..2c7cfa6 --- /dev/null +++ b/internal/metricDataDispatcher/dataLoader.go @@ -0,0 +1,231 @@ +// Copyright (C) NHR@FAU, University Erlangen-Nuremberg. +// All rights reserved. +// Use of this source code is governed by a MIT-style +// license that can be found in the LICENSE file.
+package metricDataDispatcher + +import ( + "context" + "fmt" + "time" + + "github.com/ClusterCockpit/cc-backend/internal/config" + "github.com/ClusterCockpit/cc-backend/internal/metricdata" + "github.com/ClusterCockpit/cc-backend/pkg/archive" + "github.com/ClusterCockpit/cc-backend/pkg/log" + "github.com/ClusterCockpit/cc-backend/pkg/lrucache" + "github.com/ClusterCockpit/cc-backend/pkg/schema" +) + +var cache *lrucache.Cache = lrucache.New(128 * 1024 * 1024) + +func cacheKey( + job *schema.Job, + metrics []string, + scopes []schema.MetricScope, +) string { + // Duration and StartTime do not need to be in the cache key as StartTime is less unique than + // job.ID and the TTL of the cache entry makes sure it does not stay there forever. + return fmt.Sprintf("%d(%s):[%v],[%v]", + job.ID, job.State, metrics, scopes) +} + +// Fetches the metric data for a job. +func LoadData(job *schema.Job, + metrics []string, + scopes []schema.MetricScope, + ctx context.Context, +) (schema.JobData, error) { + data := cache.Get(cacheKey(job, metrics, scopes), func() (_ interface{}, ttl time.Duration, size int) { + var jd schema.JobData + var err error + + if job.State == schema.JobStateRunning || + job.MonitoringStatus == schema.MonitoringStatusRunningOrArchiving || + config.Keys.DisableArchive { + + repo, err := metricdata.GetMetricDataRepo(job.Cluster) + if err != nil { + return fmt.Errorf("METRICDATA/METRICDATA > no metric data repository configured for '%s'", job.Cluster), 0, 0 + } + + if scopes == nil { + scopes = append(scopes, schema.MetricScopeNode) + } + + if metrics == nil { + cluster := archive.GetCluster(job.Cluster) + for _, mc := range cluster.MetricConfig { + metrics = append(metrics, mc.Name) + } + } + + jd, err = repo.LoadData(job, metrics, scopes, ctx) + if err != nil { + if len(jd) != 0 { + log.Warnf("partial error: %s", err.Error()) + // return err, 0, 0 // Reactivating will block archiving on one partial error + } else { + log.Error("Error while loading job data from metric repository") + return err, 0, 0 + } + } + size = jd.Size() + } else { + jd, err = archive.GetHandle().LoadJobData(job) + if err != nil { + log.Error("Error while loading job data from archive") + return err, 0, 0 + } + + // Avoid sending unrequested data to the client: + if metrics != nil || scopes != nil { + if metrics == nil { + metrics = make([]string, 0, len(jd)) + for k := range jd { + metrics = append(metrics, k) + } + } + + res := schema.JobData{} + for _, metric := range metrics { + if perscope, ok := jd[metric]; ok { + if len(perscope) > 1 { + subset := make(map[schema.MetricScope]*schema.JobMetric) + for _, scope := range scopes { + if jm, ok := perscope[scope]; ok { + subset[scope] = jm + } + } + + if len(subset) > 0 { + perscope = subset + } + } + + res[metric] = perscope + } + } + jd = res + } + size = jd.Size() + } + + ttl = 5 * time.Hour + if job.State == schema.JobStateRunning { + ttl = 2 * time.Minute + } + + // FIXME: Review: Is this really necessary or correct. + // For /monitoring/job/ and some other places, flops_any and mem_bw need + // to be available at the scope 'node'. If a job has a lot of nodes, + // statisticsSeries should be available so that a min/median/max Graph can be + // used instead of a lot of single lines. 
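+ // A statisticsSeries (min/median/max) is precomputed for any metric/scope with more than maxSeriesSize individual series: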
+ const maxSeriesSize int = 15 + for _, scopes := range jd { + for _, jm := range scopes { + if jm.StatisticsSeries != nil || len(jm.Series) <= maxSeriesSize { + continue + } + + jm.AddStatisticsSeries() + } + } + + nodeScopeRequested := false + for _, scope := range scopes { + if scope == schema.MetricScopeNode { + nodeScopeRequested = true + } + } + + if nodeScopeRequested { + jd.AddNodeScope("flops_any") + jd.AddNodeScope("mem_bw") + } + + return jd, ttl, size + }) + + if err, ok := data.(error); ok { + log.Error("Error in returned dataset") + return nil, err + } + + return data.(schema.JobData), nil +} + +// Used for the jobsFootprint GraphQL-Query. TODO: Rename/Generalize. +func LoadAverages( + job *schema.Job, + metrics []string, + data [][]schema.Float, + ctx context.Context, +) error { + if job.State != schema.JobStateRunning && !config.Keys.DisableArchive { + return archive.LoadAveragesFromArchive(job, metrics, data) // #166 change also here? + } + + repo, err := metricdata.GetMetricDataRepo(job.Cluster) + if err != nil { + return fmt.Errorf("METRICDATA/METRICDATA > no metric data repository configured for '%s'", job.Cluster) + } + + stats, err := repo.LoadStats(job, metrics, ctx) // #166 how to handle stats for acc normalization? + if err != nil { + log.Errorf("Error while loading statistics for job %v (User %v, Project %v)", job.JobID, job.User, job.Project) + return err + } + + for i, m := range metrics { + nodes, ok := stats[m] + if !ok { + data[i] = append(data[i], schema.NaN) + continue + } + + sum := 0.0 + for _, node := range nodes { + sum += node.Avg + } + data[i] = append(data[i], schema.Float(sum)) + } + + return nil +} + +// Used for the node/system view. Returns a map of nodes to a map of metrics. +func LoadNodeData( + cluster string, + metrics, nodes []string, + scopes []schema.MetricScope, + from, to time.Time, + ctx context.Context, +) (map[string]map[string][]*schema.JobMetric, error) { + repo, err := metricdata.GetMetricDataRepo(cluster) + if err != nil { + return nil, fmt.Errorf("METRICDATA/METRICDATA > no metric data repository configured for '%s'", cluster) + } + + if metrics == nil { + for _, m := range archive.GetCluster(cluster).MetricConfig { + metrics = append(metrics, m.Name) + } + } + + data, err := repo.LoadNodeData(cluster, metrics, nodes, scopes, from, to, ctx) + if err != nil { + if len(data) != 0 { + log.Warnf("partial error: %s", err.Error()) + } else { + log.Error("Error while loading node data from metric repository") + return nil, err + } + } + + if data == nil { + return nil, fmt.Errorf("METRICDATA/METRICDATA > the metric data repository for '%s' does not support this query", cluster) + } + + return data, nil +} diff --git a/internal/metricdata/metricdata.go b/internal/metricdata/metricdata.go index eba9dee..68d8d32 100644 --- a/internal/metricdata/metricdata.go +++ b/internal/metricdata/metricdata.go @@ -8,13 +8,10 @@ import ( "context" "encoding/json" "fmt" - "math" "time" "github.com/ClusterCockpit/cc-backend/internal/config" - "github.com/ClusterCockpit/cc-backend/pkg/archive" "github.com/ClusterCockpit/cc-backend/pkg/log" - "github.com/ClusterCockpit/cc-backend/pkg/lrucache" "github.com/ClusterCockpit/cc-backend/pkg/schema" ) @@ -35,10 +32,7 @@ type MetricDataRepository interface { var metricDataRepos map[string]MetricDataRepository = map[string]MetricDataRepository{} -var useArchive bool - -func Init(disableArchive bool) error { - useArchive = !disableArchive +func Init() error { for _, cluster := range config.Keys.Clusters { if
cluster.MetricDataRepository != nil { var kind struct { @@ -73,287 +67,13 @@ func Init(disableArchive bool) error { return nil } -var cache *lrucache.Cache = lrucache.New(128 * 1024 * 1024) - -// Fetches the metric data for a job. -func LoadData(job *schema.Job, - metrics []string, - scopes []schema.MetricScope, - ctx context.Context, -) (schema.JobData, error) { - data := cache.Get(cacheKey(job, metrics, scopes), func() (_ interface{}, ttl time.Duration, size int) { - var jd schema.JobData - var err error - - if job.State == schema.JobStateRunning || - job.MonitoringStatus == schema.MonitoringStatusRunningOrArchiving || - !useArchive { - - repo, ok := metricDataRepos[job.Cluster] - - if !ok { - return fmt.Errorf("METRICDATA/METRICDATA > no metric data repository configured for '%s'", job.Cluster), 0, 0 - } - - if scopes == nil { - scopes = append(scopes, schema.MetricScopeNode) - } - - if metrics == nil { - cluster := archive.GetCluster(job.Cluster) - for _, mc := range cluster.MetricConfig { - metrics = append(metrics, mc.Name) - } - } - - jd, err = repo.LoadData(job, metrics, scopes, ctx) - if err != nil { - if len(jd) != 0 { - log.Warnf("partial error: %s", err.Error()) - // return err, 0, 0 // Reactivating will block archiving on one partial error - } else { - log.Error("Error while loading job data from metric repository") - return err, 0, 0 - } - } - size = jd.Size() - } else { - jd, err = archive.GetHandle().LoadJobData(job) - if err != nil { - log.Error("Error while loading job data from archive") - return err, 0, 0 - } - - // Avoid sending unrequested data to the client: - if metrics != nil || scopes != nil { - if metrics == nil { - metrics = make([]string, 0, len(jd)) - for k := range jd { - metrics = append(metrics, k) - } - } - - res := schema.JobData{} - for _, metric := range metrics { - if perscope, ok := jd[metric]; ok { - if len(perscope) > 1 { - subset := make(map[schema.MetricScope]*schema.JobMetric) - for _, scope := range scopes { - if jm, ok := perscope[scope]; ok { - subset[scope] = jm - } - } - - if len(subset) > 0 { - perscope = subset - } - } - - res[metric] = perscope - } - } - jd = res - } - size = jd.Size() - } - - ttl = 5 * time.Hour - if job.State == schema.JobStateRunning { - ttl = 2 * time.Minute - } - - prepareJobData(jd, scopes) - - return jd, ttl, size - }) - - if err, ok := data.(error); ok { - log.Error("Error in returned dataset") - return nil, err - } - - return data.(schema.JobData), nil -} - -// Used for the jobsFootprint GraphQL-Query. TODO: Rename/Generalize. -func LoadAverages( - job *schema.Job, - metrics []string, - data [][]schema.Float, - ctx context.Context, -) error { - if job.State != schema.JobStateRunning && useArchive { - return archive.LoadAveragesFromArchive(job, metrics, data) // #166 change also here? - } - - repo, ok := metricDataRepos[job.Cluster] - if !ok { - return fmt.Errorf("METRICDATA/METRICDATA > no metric data repository configured for '%s'", job.Cluster) - } - - stats, err := repo.LoadStats(job, metrics, ctx) // #166 how to handle stats for acc normalizazion? - if err != nil { - log.Errorf("Error while loading statistics for job %v (User %v, Project %v)", job.JobID, job.User, job.Project) - return err - } - - for i, m := range metrics { - nodes, ok := stats[m] - if !ok { - data[i] = append(data[i], schema.NaN) - continue - } - - sum := 0.0 - for _, node := range nodes { - sum += node.Avg - } - data[i] = append(data[i], schema.Float(sum)) - } - - return nil -} - -// Used for the node/system view. 
Returns a map of nodes to a map of metrics. -func LoadNodeData( - cluster string, - metrics, nodes []string, - scopes []schema.MetricScope, - from, to time.Time, - ctx context.Context, -) (map[string]map[string][]*schema.JobMetric, error) { +func GetMetricDataRepo(cluster string) (MetricDataRepository, error) { + var err error repo, ok := metricDataRepos[cluster] + if !ok { - return nil, fmt.Errorf("METRICDATA/METRICDATA > no metric data repository configured for '%s'", cluster) + err = fmt.Errorf("METRICDATA/METRICDATA > no metric data repository configured for '%s'", cluster) } - if metrics == nil { - for _, m := range archive.GetCluster(cluster).MetricConfig { - metrics = append(metrics, m.Name) - } - } - - data, err := repo.LoadNodeData(cluster, metrics, nodes, scopes, from, to, ctx) - if err != nil { - if len(data) != 0 { - log.Warnf("partial error: %s", err.Error()) - } else { - log.Error("Error while loading node data from metric repository") - return nil, err - } - } - - if data == nil { - return nil, fmt.Errorf("METRICDATA/METRICDATA > the metric data repository for '%s' does not support this query", cluster) - } - - return data, nil -} - -func cacheKey( - job *schema.Job, - metrics []string, - scopes []schema.MetricScope, -) string { - // Duration and StartTime do not need to be in the cache key as StartTime is less unique than - // job.ID and the TTL of the cache entry makes sure it does not stay there forever. - return fmt.Sprintf("%d(%s):[%v],[%v]", - job.ID, job.State, metrics, scopes) -} - -// For /monitoring/job/ and some other places, flops_any and mem_bw need -// to be available at the scope 'node'. If a job has a lot of nodes, -// statisticsSeries should be available so that a min/median/max Graph can be -// used instead of a lot of single lines. -func prepareJobData( - jobData schema.JobData, - scopes []schema.MetricScope, -) { - const maxSeriesSize int = 15 - for _, scopes := range jobData { - for _, jm := range scopes { - if jm.StatisticsSeries != nil || len(jm.Series) <= maxSeriesSize { - continue - } - - jm.AddStatisticsSeries() - } - } - - nodeScopeRequested := false - for _, scope := range scopes { - if scope == schema.MetricScopeNode { - nodeScopeRequested = true - } - } - - if nodeScopeRequested { - jobData.AddNodeScope("flops_any") - jobData.AddNodeScope("mem_bw") - } -} - -// Writes a running job to the job-archive -func ArchiveJob(job *schema.Job, ctx context.Context) (*schema.JobMeta, error) { - allMetrics := make([]string, 0) - metricConfigs := archive.GetCluster(job.Cluster).MetricConfig - for _, mc := range metricConfigs { - allMetrics = append(allMetrics, mc.Name) - } - - // TODO: Talk about this! What resolutions to store data at... - scopes := []schema.MetricScope{schema.MetricScopeNode} - if job.NumNodes <= 8 { - scopes = append(scopes, schema.MetricScopeCore) - } - - if job.NumAcc > 0 { - scopes = append(scopes, schema.MetricScopeAccelerator) - } - - jobData, err := LoadData(job, allMetrics, scopes, ctx) - if err != nil { - log.Error("Error wile loading job data for archiving") - return nil, err - } - - jobMeta := &schema.JobMeta{ - BaseJob: job.BaseJob, - StartTime: job.StartTime.Unix(), - Statistics: make(map[string]schema.JobStatistics), - } - - for metric, data := range jobData { - avg, min, max := 0.0, math.MaxFloat32, -math.MaxFloat32 - nodeData, ok := data["node"] - if !ok { - // TODO/FIXME: Calc average for non-node metrics as well! 
- continue - } - - for _, series := range nodeData.Series { - avg += series.Statistics.Avg - min = math.Min(min, series.Statistics.Min) - max = math.Max(max, series.Statistics.Max) - } - - jobMeta.Statistics[metric] = schema.JobStatistics{ - Unit: schema.Unit{ - Prefix: archive.GetMetricConfig(job.Cluster, metric).Unit.Prefix, - Base: archive.GetMetricConfig(job.Cluster, metric).Unit.Base, - }, - Avg: avg / float64(job.NumNodes), - Min: min, - Max: max, - } - } - - // If the file based archive is disabled, - // only return the JobMeta structure as the - // statistics in there are needed. - if !useArchive { - return jobMeta, nil - } - - return jobMeta, archive.GetHandle().ImportJob(jobMeta, &jobData) + return repo, err } diff --git a/internal/repository/archiveWorker.go b/internal/repository/archiveWorker.go deleted file mode 100644 index 42febb5..0000000 --- a/internal/repository/archiveWorker.go +++ /dev/null @@ -1,112 +0,0 @@ -// Copyright (C) NHR@FAU, University Erlangen-Nuremberg. -// All rights reserved. -// Use of this source code is governed by a MIT-style -// license that can be found in the LICENSE file. -package repository - -import ( - "context" - "encoding/json" - "time" - - "github.com/ClusterCockpit/cc-backend/internal/metricdata" - "github.com/ClusterCockpit/cc-backend/pkg/archive" - "github.com/ClusterCockpit/cc-backend/pkg/log" - "github.com/ClusterCockpit/cc-backend/pkg/schema" - sq "github.com/Masterminds/squirrel" -) - -// Archiving worker thread -func (r *JobRepository) archivingWorker() { - for { - select { - case job, ok := <-r.archiveChannel: - if !ok { - break - } - start := time.Now() - // not using meta data, called to load JobMeta into Cache? - // will fail if job meta not in repository - if _, err := r.FetchMetadata(job); err != nil { - log.Errorf("archiving job (dbid: %d) failed at check metadata step: %s", job.ID, err.Error()) - r.UpdateMonitoringStatus(job.ID, schema.MonitoringStatusArchivingFailed) - continue - } - - // metricdata.ArchiveJob will fetch all the data from a MetricDataRepository and push into configured archive backend - // TODO: Maybe use context with cancel/timeout here - jobMeta, err := metricdata.ArchiveJob(job, context.Background()) - if err != nil { - log.Errorf("archiving job (dbid: %d) failed at archiving job step: %s", job.ID, err.Error()) - r.UpdateMonitoringStatus(job.ID, schema.MonitoringStatusArchivingFailed) - continue - } - - // Update the jobs database entry one last time: - if err := r.MarkArchived(jobMeta, schema.MonitoringStatusArchivingSuccessful); err != nil { - log.Errorf("archiving job (dbid: %d) failed at marking archived step: %s", job.ID, err.Error()) - continue - } - log.Debugf("archiving job %d took %s", job.JobID, time.Since(start)) - log.Printf("archiving job (dbid: %d) successful", job.ID) - r.archivePending.Done() - } - } -} - -// Stop updates the job with the database id jobId using the provided arguments. -func (r *JobRepository) MarkArchived( - jobMeta *schema.JobMeta, - monitoringStatus int32, -) error { - stmt := sq.Update("job"). - Set("monitoring_status", monitoringStatus). 
- Where("job.id = ?", jobMeta.JobID) - - sc, err := archive.GetSubCluster(jobMeta.Cluster, jobMeta.SubCluster) - if err != nil { - log.Errorf("cannot get subcluster: %s", err.Error()) - return err - } - footprint := make(map[string]float64) - - for _, fp := range sc.Footprint { - footprint[fp] = LoadJobStat(jobMeta, fp) - } - - var rawFootprint []byte - - if rawFootprint, err = json.Marshal(footprint); err != nil { - log.Warnf("Error while marshaling footprint for job, DB ID '%v'", jobMeta.ID) - return err - } - - stmt = stmt.Set("footprint", rawFootprint) - - if _, err := stmt.RunWith(r.stmtCache).Exec(); err != nil { - log.Warn("Error while marking job as archived") - return err - } - return nil -} - -func (r *JobRepository) UpdateMonitoringStatus(job int64, monitoringStatus int32) (err error) { - stmt := sq.Update("job"). - Set("monitoring_status", monitoringStatus). - Where("job.id = ?", job) - - _, err = stmt.RunWith(r.stmtCache).Exec() - return -} - -// Trigger async archiving -func (r *JobRepository) TriggerArchiving(job *schema.Job) { - r.archivePending.Add(1) - r.archiveChannel <- job -} - -// Wait for background thread to finish pending archiving operations -func (r *JobRepository) WaitForArchiving() { - // close channel and wait for worker to process remaining jobs - r.archivePending.Wait() -} diff --git a/internal/repository/job.go b/internal/repository/job.go index ca8350f..e5e2569 100644 --- a/internal/repository/job.go +++ b/internal/repository/job.go @@ -28,12 +28,10 @@ var ( ) type JobRepository struct { - DB *sqlx.DB - stmtCache *sq.StmtCache - cache *lrucache.Cache - archiveChannel chan *schema.Job - driver string - archivePending sync.WaitGroup + DB *sqlx.DB + stmtCache *sq.StmtCache + cache *lrucache.Cache + driver string } func GetJobRepository() *JobRepository { @@ -44,12 +42,9 @@ func GetJobRepository() *JobRepository { DB: db.DB, driver: db.Driver, - stmtCache: sq.NewStmtCache(db.DB), - cache: lrucache.New(1024 * 1024), - archiveChannel: make(chan *schema.Job, 128), + stmtCache: sq.NewStmtCache(db.DB), + cache: lrucache.New(1024 * 1024), } - // start archiving worker - go jobRepoInstance.archivingWorker() }) return jobRepoInstance } @@ -210,7 +205,10 @@ func (r *JobRepository) UpdateMetadata(job *schema.Job, key, val string) (err er return err } - if _, err = sq.Update("job").Set("meta_data", job.RawMetaData).Where("job.id = ?", job.ID).RunWith(r.stmtCache).Exec(); err != nil { + if _, err = sq.Update("job"). + Set("meta_data", job.RawMetaData). + Where("job.id = ?", job.ID). + RunWith(r.stmtCache).Exec(); err != nil { log.Warnf("Error while updating metadata for job, DB ID '%v'", job.ID) return err } @@ -458,6 +456,44 @@ func (r *JobRepository) StopJobsExceedingWalltimeBy(seconds int) error { return nil } +func (r *JobRepository) FindRunningJobs(cluster string) ([]*schema.Job, error) { + query := sq.Select(jobColumns...).From("job"). + Where(fmt.Sprintf("job.cluster = '%s'", cluster)). + Where("job.job_state = 'running'"). + Where("job.duration>600") + + rows, err := query.RunWith(r.stmtCache).Query() + if err != nil { + log.Error("Error while running query") + return nil, err + } + + jobs := make([]*schema.Job, 0, 50) + for rows.Next() { + job, err := scanJob(rows) + if err != nil { + rows.Close() + log.Warn("Error while scanning rows") + return nil, err + } + jobs = append(jobs, job) + } + + log.Infof("Return job count %d", len(jobs)) + return jobs, nil +} + +func (r *JobRepository) UpdateDuration() error { + if _, err := sq.Update("job"). 
+ Set("duration", sq.Expr("? - job.start_time", time.Now().Unix())). + Where("job_state = running"). + RunWith(r.stmtCache).Exec(); err != nil { + return err + } + + return nil +} + func (r *JobRepository) FindJobsBetween(startTimeBegin int64, startTimeEnd int64) ([]*schema.Job, error) { var query sq.SelectBuilder @@ -495,3 +531,100 @@ func (r *JobRepository) FindJobsBetween(startTimeBegin int64, startTimeEnd int64 log.Infof("Return job count %d", len(jobs)) return jobs, nil } + +func (r *JobRepository) UpdateMonitoringStatus(job int64, monitoringStatus int32) (err error) { + stmt := sq.Update("job"). + Set("monitoring_status", monitoringStatus). + Where("job.id = ?", job) + + _, err = stmt.RunWith(r.stmtCache).Exec() + return +} + +func (r *JobRepository) Execute(stmt sq.UpdateBuilder) error { + if _, err := stmt.RunWith(r.stmtCache).Exec(); err != nil { + return err + } + + return nil +} + +func (r *JobRepository) MarkArchived( + stmt sq.UpdateBuilder, + monitoringStatus int32, +) sq.UpdateBuilder { + return stmt.Set("monitoring_status", monitoringStatus) +} + +func (r *JobRepository) UpdateEnergy( + stmt sq.UpdateBuilder, + jobMeta *schema.JobMeta, +) (sq.UpdateBuilder, error) { + sc, err := archive.GetSubCluster(jobMeta.Cluster, jobMeta.SubCluster) + if err != nil { + log.Errorf("cannot get subcluster: %s", err.Error()) + return stmt, err + } + energyFootprint := make(map[string]float64) + var totalEnergy float64 + var energy float64 + + for _, fp := range sc.EnergyFootprint { + if i, err := archive.MetricIndex(sc.MetricConfig, fp); err != nil { + // FIXME: Check for unit conversions + if sc.MetricConfig[i].Energy == "power" { + energy = LoadJobStat(jobMeta, fp, "avg") * float64(jobMeta.Duration) + } else if sc.MetricConfig[i].Energy == "energy" { + // This assumes the metric is of aggregation type sum + } + } + + energyFootprint[fp] = energy + totalEnergy += energy + } + + var rawFootprint []byte + + if rawFootprint, err = json.Marshal(energyFootprint); err != nil { + log.Warnf("Error while marshaling energy footprint for job, DB ID '%v'", jobMeta.ID) + return stmt, err + } + + stmt.Set("energy_footprint", rawFootprint). 
+ Set("energy", totalEnergy) + + return stmt, nil +} + +func (r *JobRepository) UpdateFootprint( + stmt sq.UpdateBuilder, + jobMeta *schema.JobMeta, +) (sq.UpdateBuilder, error) { + sc, err := archive.GetSubCluster(jobMeta.Cluster, jobMeta.SubCluster) + if err != nil { + log.Errorf("cannot get subcluster: %s", err.Error()) + return stmt, err + } + footprint := make(map[string]float64) + + for _, fp := range sc.Footprint { + statType := "avg" + + if i, err := archive.MetricIndex(sc.MetricConfig, fp); err != nil { + statType = sc.MetricConfig[i].Footprint + } + + name := fmt.Sprintf("%s_%s", fp, statType) + footprint[fp] = LoadJobStat(jobMeta, name, statType) + } + + var rawFootprint []byte + + if rawFootprint, err = json.Marshal(footprint); err != nil { + log.Warnf("Error while marshaling footprint for job, DB ID '%v'", jobMeta.ID) + return stmt, err + } + + stmt.Set("footprint", rawFootprint) + return stmt, nil +} diff --git a/internal/repository/migrations/sqlite3/08_add-footprint.up.sql b/internal/repository/migrations/sqlite3/08_add-footprint.up.sql index c7464b5..4c17137 100644 --- a/internal/repository/migrations/sqlite3/08_add-footprint.up.sql +++ b/internal/repository/migrations/sqlite3/08_add-footprint.up.sql @@ -1,5 +1,7 @@ -CREATE INDEX IF NOT EXISTS job_by_project ON job (project); -CREATE INDEX IF NOT EXISTS job_list_projects ON job (project, job_state); +CREATE INDEX IF NOT EXISTS jobs_cluster_orderby_starttime ON job (cluster, start_time DESC); +CREATE INDEX IF NOT EXISTS jobs_cluster_count ON job (cluster, job_state, start_time); +CREATE INDEX IF NOT EXISTS jobs_project_orderby_starttime ON job (project, start_time DESC); +CREATE INDEX IF NOT EXISTS jobs_project_count ON job (project, job_state, start_time); ALTER TABLE job ADD COLUMN energy REAL NOT NULL DEFAULT 0.0; ALTER TABLE job ADD COLUMN energy_footprint TEXT DEFAULT NULL; @@ -25,3 +27,5 @@ ALTER TABLE job DROP net_bw_avg; ALTER TABLE job DROP net_data_vol_total; ALTER TABLE job DROP file_bw_avg; ALTER TABLE job DROP file_data_vol_total; + +PRAGMA optimize; \ No newline at end of file diff --git a/internal/repository/stats.go b/internal/repository/stats.go index 81ca8d1..ca05ca3 100644 --- a/internal/repository/stats.go +++ b/internal/repository/stats.go @@ -13,7 +13,7 @@ import ( "github.com/ClusterCockpit/cc-backend/internal/config" "github.com/ClusterCockpit/cc-backend/internal/graph/model" - "github.com/ClusterCockpit/cc-backend/internal/metricdata" + "github.com/ClusterCockpit/cc-backend/internal/metricDataDispatcher" "github.com/ClusterCockpit/cc-backend/pkg/archive" "github.com/ClusterCockpit/cc-backend/pkg/log" "github.com/ClusterCockpit/cc-backend/pkg/schema" @@ -286,13 +286,17 @@ func (r *JobRepository) JobsStats( return stats, nil } -// FIXME: Make generic -func LoadJobStat(job *schema.JobMeta, metric string) float64 { +func LoadJobStat(job *schema.JobMeta, metric string, statType string) float64 { if stats, ok := job.Statistics[metric]; ok { - if metric == "mem_used" { - return stats.Max - } else { + switch statType { + case "avg": return stats.Avg + case "max": + return stats.Max + case "min": + return stats.Min + default: + log.Errorf("Unknown stat type %s", statType) } } @@ -691,7 +695,7 @@ func (r *JobRepository) runningJobsMetricStatisticsHistogram( continue } - if err := metricdata.LoadAverages(job, metrics, avgs, ctx); err != nil { + if err := metricDataDispatcher.LoadAverages(job, metrics, avgs, ctx); err != nil { log.Errorf("Error while loading averages for histogram: %s", err) return nil } diff 
--git a/internal/repository/transaction.go b/internal/repository/transaction.go index 9398354..8c5d357 100644 --- a/internal/repository/transaction.go +++ b/internal/repository/transaction.go @@ -6,7 +6,6 @@ package repository import ( "github.com/ClusterCockpit/cc-backend/pkg/log" - "github.com/ClusterCockpit/cc-backend/pkg/schema" "github.com/jmoiron/sqlx" ) @@ -18,20 +17,12 @@ type Transaction struct { func (r *JobRepository) TransactionInit() (*Transaction, error) { var err error t := new(Transaction) - // Inserts are bundled into transactions because in sqlite, - // that speeds up inserts A LOT. + t.tx, err = r.DB.Beginx() if err != nil { log.Warn("Error while bundling transactions") return nil, err } - - t.stmt, err = t.tx.PrepareNamed(NamedJobInsert) - if err != nil { - log.Warn("Error while preparing namedJobInsert") - return nil, err - } - return t, nil } @@ -50,7 +41,6 @@ func (r *JobRepository) TransactionCommit(t *Transaction) error { return err } - t.stmt = t.tx.NamedStmt(t.stmt) return nil } @@ -63,10 +53,14 @@ func (r *JobRepository) TransactionEnd(t *Transaction) error { return nil } -func (r *JobRepository) TransactionAdd(t *Transaction, job schema.Job) (int64, error) { - res, err := t.stmt.Exec(job) +func (r *JobRepository) TransactionAddNamed( + t *Transaction, + query string, + args ...interface{}, +) (int64, error) { + res, err := t.tx.NamedExec(query, args) if err != nil { - log.Errorf("repository initDB(): %v", err) + log.Errorf("Named Exec failed: %v", err) return 0, err } @@ -79,26 +73,14 @@ func (r *JobRepository) TransactionAdd(t *Transaction, job schema.Job) (int64, e id, err := res.LastInsertId() if err != nil { log.Warn("Error while getting last insert ID") return 0, err } return id, nil } -func (r *JobRepository) TransactionAddTag(t *Transaction, tag *schema.Tag) (int64, error) { - res, err := t.tx.Exec(`INSERT INTO tag (tag_name, tag_type) VALUES (?, ?)`, tag.Name, tag.Type) +func (r *JobRepository) TransactionAdd(t *Transaction, query string, args ...interface{}) (int64, error) { + res := t.tx.MustExec(query, args...) + + id, err := res.LastInsertId() if err != nil { - log.Errorf("Error while inserting tag into tag table: %v (Type %v)", tag.Name, tag.Type) - return 0, err - } - tagId, err := res.LastInsertId() - if err != nil { - log.Warn("Error while getting last insert ID") + log.Errorf("repository initDB(): %v", err) return 0, err } - return tagId, nil -} - -func (r *JobRepository) TransactionSetTag(t *Transaction, jobId int64, tagId int64) error { - if _, err := t.tx.Exec(`INSERT INTO jobtag (job_id, tag_id) VALUES (?, ?)`, jobId, tagId); err != nil { - log.Errorf("Error while inserting jobtag into jobtag table: %v (TagID %v)", jobId, tagId) - return err - } - - return nil + return id, nil } diff --git a/internal/taskManager/footprintService.go b/internal/taskManager/footprintService.go deleted file mode 100644 index 28a5a72..0000000 --- a/internal/taskManager/footprintService.go +++ /dev/null @@ -1,8 +0,0 @@ -// Copyright (C) NHR@FAU, University Erlangen-Nuremberg. -// All rights reserved. -// Use of this source code is governed by a MIT-style -// license that can be found in the LICENSE file.
-package taskManager - -func registerFootprintWorker() { -} diff --git a/internal/taskManager/taskManager.go b/internal/taskManager/taskManager.go index 006469c..101fc4a 100644 --- a/internal/taskManager/taskManager.go +++ b/internal/taskManager/taskManager.go @@ -79,6 +79,9 @@ func Start() { RegisterLdapSyncService(lc.SyncInterval) } + RegisterFootprintWorker() + RegisterUpdateDurationWorker() + s.Start() } diff --git a/internal/taskManager/updateDurationService.go b/internal/taskManager/updateDurationService.go new file mode 100644 index 0000000..6023547 --- /dev/null +++ b/internal/taskManager/updateDurationService.go @@ -0,0 +1,26 @@ +// Copyright (C) NHR@FAU, University Erlangen-Nuremberg. +// All rights reserved. +// Use of this source code is governed by a MIT-style +// license that can be found in the LICENSE file. +package taskManager + +import ( + "time" + + "github.com/ClusterCockpit/cc-backend/pkg/log" + "github.com/go-co-op/gocron/v2" +) + +func RegisterUpdateDurationWorker() { + log.Info("Register duration update service") + + d, _ := time.ParseDuration("5m") + s.NewJob(gocron.DurationJob(d), + gocron.NewTask( + func() { + start := time.Now() + log.Printf("Update duration started at %s", start.Format(time.RFC3339)) + jobRepo.UpdateDuration() + log.Printf("Update duration is done and took %s", time.Since(start)) + })) +} diff --git a/internal/taskManager/updateFootprintService.go b/internal/taskManager/updateFootprintService.go new file mode 100644 index 0000000..2fdd6b9 --- /dev/null +++ b/internal/taskManager/updateFootprintService.go @@ -0,0 +1,114 @@ +// Copyright (C) NHR@FAU, University Erlangen-Nuremberg. +// All rights reserved. +// Use of this source code is governed by a MIT-style +// license that can be found in the LICENSE file. 
+package taskManager + +import ( + "context" + "math" + "time" + + "github.com/ClusterCockpit/cc-backend/internal/metricDataDispatcher" + "github.com/ClusterCockpit/cc-backend/pkg/archive" + "github.com/ClusterCockpit/cc-backend/pkg/log" + "github.com/ClusterCockpit/cc-backend/pkg/schema" + sq "github.com/Masterminds/squirrel" + "github.com/go-co-op/gocron/v2" +) + +func RegisterFootprintWorker() { + log.Info("Register Footprint Update service") + d, _ := time.ParseDuration("10m") + s.NewJob(gocron.DurationJob(d), + gocron.NewTask( + func() { + s := time.Now() + log.Printf("Update Footprints started at %s", s.Format(time.RFC3339)) + + t, err := jobRepo.TransactionInit() + if err != nil { + log.Errorf("Failed TransactionInit %v", err) + } + + for _, cluster := range archive.Clusters { + jobs, err := jobRepo.FindRunningJobs(cluster.Name) + if err != nil { + continue + } + allMetrics := make([]string, 0) + metricConfigs := archive.GetCluster(cluster.Name).MetricConfig + for _, mc := range metricConfigs { + allMetrics = append(allMetrics, mc.Name) + } + + scopes := []schema.MetricScope{schema.MetricScopeNode} + scopes = append(scopes, schema.MetricScopeCore) + scopes = append(scopes, schema.MetricScopeAccelerator) + + for _, job := range jobs { + jobData, err := metricDataDispatcher.LoadData(job, allMetrics, scopes, context.Background()) + if err != nil { + log.Error("Error while loading job data for footprint update") + continue + } + + jobMeta := &schema.JobMeta{ + BaseJob: job.BaseJob, + StartTime: job.StartTime.Unix(), + Statistics: make(map[string]schema.JobStatistics), + } + + for metric, data := range jobData { + avg, min, max := 0.0, math.MaxFloat32, -math.MaxFloat32 + nodeData, ok := data["node"] + if !ok { + // This should never happen? + continue + } + + for _, series := range nodeData.Series { + avg += series.Statistics.Avg + min = math.Min(min, series.Statistics.Min) + max = math.Max(max, series.Statistics.Max) + } + + jobMeta.Statistics[metric] = schema.JobStatistics{ + Unit: schema.Unit{ + Prefix: archive.GetMetricConfig(job.Cluster, metric).Unit.Prefix, + Base: archive.GetMetricConfig(job.Cluster, metric).Unit.Base, + }, + Avg: avg / float64(job.NumNodes), + Min: min, + Max: max, + } + } + + stmt := sq.Update("job").Where("job.id = ?", job.ID) + if stmt, err = jobRepo.UpdateFootprint(stmt, jobMeta); err != nil { + log.Errorf("Update job (dbid: %d) failed at update Footprint step: %s", job.ID, err.Error()) + continue + } + if stmt, err = jobRepo.UpdateEnergy(stmt, jobMeta); err != nil { + log.Errorf("Update job (dbid: %d) failed at update Energy step: %s", job.ID, err.Error()) + continue + } + + query, args, err := stmt.ToSql() + if err != nil { + log.Errorf("Failed in ToSQL conversion %v", err) + continue + } + jobRepo.TransactionAdd(t, query, args...) + // if err := jobRepo.Execute(stmt); err != nil { + // log.Errorf("Update job (dbid: %d) failed at db execute: %s", job.ID, err.Error()) + // continue + // } + } + + jobRepo.TransactionCommit(t) + } + jobRepo.TransactionEnd(t) + log.Printf("Update Footprints is done and took %s", time.Since(s)) +})) +} diff --git a/pkg/archive/clusterConfig.go b/pkg/archive/clusterConfig.go index 6f0178c..5710d06 100644 --- a/pkg/archive/clusterConfig.go +++ b/pkg/archive/clusterConfig.go @@ -88,7 +88,7 @@ func initClusterConfig() error { sc.Footprint = append(sc.Footprint, newMetric.Name) ml.Footprint = newMetric.Footprint } - if newMetric.Energy { + if newMetric.Energy != "" { sc.EnergyFootprint = append(sc.EnergyFootprint, newMetric.Name) } }
@@ -99,7 +99,7 @@ func initClusterConfig() error { if newMetric.Footprint != "" { sc.Footprint = append(sc.Footprint, newMetric.Name) } - if newMetric.Energy { + if newMetric.Energy != "" { sc.EnergyFootprint = append(sc.EnergyFootprint, newMetric.Name) } } @@ -221,3 +221,13 @@ func GetSubClusterByNode(cluster, hostname string) (string, error) { return "", fmt.Errorf("ARCHIVE/CLUSTERCONFIG > no subcluster found for cluster %v and host %v", cluster, hostname) } + +func MetricIndex(mc []schema.MetricConfig, name string) (int, error) { + for i, m := range mc { + if m.Name == name { + return i, nil + } + } + + return 0, fmt.Errorf("Unknown metric name %s", name) +} diff --git a/pkg/archive/testdata/archive/alex/cluster.json b/pkg/archive/testdata/archive/alex/cluster.json index cc2888d..f1cf085 100644 --- a/pkg/archive/testdata/archive/alex/cluster.json +++ b/pkg/archive/testdata/archive/alex/cluster.json @@ -94,7 +94,7 @@ }, "scope": "hwthread", "aggregation": "sum", - "energy": true, + "energy": "power", "timestep": 60, "peak": 500, "normal": 250, @@ -136,7 +136,7 @@ }, "scope": "accelerator", "aggregation": "sum", - "energy": true, + "energy": "power", "timestep": 60, "peak": 400, "normal": 200, @@ -190,7 +190,7 @@ }, "scope": "socket", "aggregation": "sum", - "energy": true, + "energy": "power", "timestep": 60, "peak": 500, "normal": 250, diff --git a/pkg/archive/testdata/archive/fritz/cluster.json b/pkg/archive/testdata/archive/fritz/cluster.json index 58ec3af..3df3a95 100644 --- a/pkg/archive/testdata/archive/fritz/cluster.json +++ b/pkg/archive/testdata/archive/fritz/cluster.json @@ -256,7 +256,7 @@ "normal": 250, "caution": 100, "alert": 50, - "energy": true + "energy": "power" }, { "name": "mem_power", @@ -270,7 +270,7 @@ "normal": 50, "caution": 20, "alert": 10, - "energy": true + "energy": "power" }, { "name": "ipc", diff --git a/pkg/schema/cluster.go b/pkg/schema/cluster.go index e9aa178..b9bf306 100644 --- a/pkg/schema/cluster.go +++ b/pkg/schema/cluster.go @@ -54,7 +54,7 @@ type SubClusterConfig struct { Alert float64 `json:"alert"` Remove bool `json:"remove"` LowerIsBetter bool `json:"lowerIsBetter"` - Energy bool `json:"energy"` + Energy string `json:"energy"` } type MetricConfig struct { @@ -70,7 +70,7 @@ type MetricConfig struct { Alert float64 `json:"alert"` Timestep int `json:"timestep"` LowerIsBetter bool `json:"lowerIsBetter"` - Energy bool `json:"energy"` + Energy string `json:"energy"` } type Cluster struct { diff --git a/pkg/schema/schemas/cluster.schema.json b/pkg/schema/schemas/cluster.schema.json index 81b138a..66b7ba1 100644 --- a/pkg/schema/schemas/cluster.schema.json +++ b/pkg/schema/schemas/cluster.schema.json @@ -50,7 +50,11 @@ }, "energy": { "description": "Is it used to calculate job energy", - "type": "boolean" + "type": "string", + "enum": [ + "power", + "energy" + ] }, "lowerIsBetter": { "description": "Is lower better.", @@ -93,7 +97,11 @@ }, "energy": { "description": "Is it used to calculate job energy. Overwrite global", - "type": "boolean" + "type": "string", + "enum": [ + "power", + "energy" + ] }, "lowerIsBetter": { "description": "Is lower better. Overwrite global",