diff --git a/.gitignore b/.gitignore
index e23a17b..75cc004 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,21 +1,23 @@
 /cc-backend
-
-/var/job-archive
-/var/*.db
-/var/machine-state
-
 /.env
 /config.json
+/var/job-archive
+/var/machine-state
+/var/job.db-shm
+/var/job.db-wal
+/var/*.db
+/var/*.txt
+
 /web/frontend/public/build
 /web/frontend/node_modules
-/.vscode/*
+
 /archive-migration
 /archive-manager
-var/job.db-shm
-var/job.db-wal
+/internal/repository/testdata/job.db-shm
+/internal/repository/testdata/job.db-wal
+
+/.vscode/*
 dist/
 *.db
-internal/repository/testdata/job.db-shm
-internal/repository/testdata/job.db-wal
diff --git a/Makefile b/Makefile
index 52f0d39..dd33827 100644
--- a/Makefile
+++ b/Makefile
@@ -82,7 +82,7 @@ tags:
 	@ctags -R
 
 $(VAR):
-	@mkdir $(VAR)
+	@mkdir -p $(VAR)
 
 config.json:
 	$(info ===> Initialize config.json file)
diff --git a/internal/archiver/archiver.go b/internal/archiver/archiver.go
index 1c4a3ec..1050ca1 100644
--- a/internal/archiver/archiver.go
+++ b/internal/archiver/archiver.go
@@ -60,12 +60,13 @@ func ArchiveJob(job *schema.Job, ctx context.Context) (*schema.JobMeta, error) {
 			max = math.Max(max, series.Statistics.Max)
 		}
 
+		// Round AVG Result to 2 Digits
 		jobMeta.Statistics[metric] = schema.JobStatistics{
 			Unit: schema.Unit{
 				Prefix: archive.GetMetricConfig(job.Cluster, metric).Unit.Prefix,
 				Base:   archive.GetMetricConfig(job.Cluster, metric).Unit.Base,
 			},
-			Avg: avg / float64(job.NumNodes),
+			Avg: (math.Round((avg/float64(job.NumNodes))*100) / 100),
 			Min: min,
 			Max: max,
 		}
diff --git a/internal/graph/schema.resolvers.go b/internal/graph/schema.resolvers.go
index d13a29b..867f740 100644
--- a/internal/graph/schema.resolvers.go
+++ b/internal/graph/schema.resolvers.go
@@ -303,6 +303,7 @@ func (r *queryResolver) JobMetrics(ctx context.Context, id string, metrics []str
 
 // JobsFootprints is the resolver for the jobsFootprints field.
 func (r *queryResolver) JobsFootprints(ctx context.Context, filter []*model.JobFilter, metrics []string) (*model.Footprints, error) {
+	// NOTE: Legacy Naming! This resolver is for normalized histograms in analysis view only - *Not* related to DB "footprint" column!
 	return r.jobsFootprints(ctx, filter, metrics)
 }
diff --git a/internal/metricDataDispatcher/dataLoader.go b/internal/metricDataDispatcher/dataLoader.go
index 939a0fb..c3291a1 100644
--- a/internal/metricDataDispatcher/dataLoader.go
+++ b/internal/metricDataDispatcher/dataLoader.go
@@ -170,6 +170,9 @@ func LoadData(job *schema.Job,
 			jd.AddNodeScope("mem_bw")
 		}
 
+		// Round Resulting Stat Values
+		jd.RoundMetricStats()
+
 		return jd, ttl, size
 	})
diff --git a/internal/metricdata/cc-metric-store.go b/internal/metricdata/cc-metric-store.go
index 5db7583..2b92fbb 100644
--- a/internal/metricdata/cc-metric-store.go
+++ b/internal/metricdata/cc-metric-store.go
@@ -440,6 +440,23 @@ func (ccms *CCMetricStore) buildQueries(
 			continue
 		}
 
+		// Core -> Socket
+		if nativeScope == schema.MetricScopeCore && scope == schema.MetricScopeSocket {
+			sockets, _ := topology.GetSocketsFromCores(hwthreads)
+			for _, socket := range sockets {
+				queries = append(queries, ApiQuery{
+					Metric:     remoteName,
+					Hostname:   host.Hostname,
+					Aggregate:  true,
+					Type:       &coreString,
+					TypeIds:    intToStringSlice(topology.Socket[socket]),
+					Resolution: resolution,
+				})
+				assignedScope = append(assignedScope, scope)
+			}
+			continue
+		}
+
 		// Core -> Node
 		if nativeScope == schema.MetricScopeCore && scope == schema.MetricScopeNode {
 			cores, _ := topology.GetCoresFromHWThreads(hwthreads)
@@ -627,7 +644,7 @@ func (ccms *CCMetricStore) LoadNodeData(
 			req.Queries = append(req.Queries, ApiQuery{
 				Hostname:   node,
 				Metric:     ccms.toRemoteName(metric),
-				Resolution: 60, // Default for Node Queries
+				Resolution: 0, // Default for Node Queries: Will return metric $Timestep Resolution
 			})
 		}
 	}
@@ -1038,6 +1055,23 @@ func (ccms *CCMetricStore) buildNodeQueries(
 			continue
 		}
 
+		// Core -> Socket
+		if nativeScope == schema.MetricScopeCore && scope == schema.MetricScopeSocket {
+			sockets, _ := topology.GetSocketsFromCores(topology.Node)
+			for _, socket := range sockets {
+				queries = append(queries, ApiQuery{
+					Metric:     remoteName,
+					Hostname:   hostname,
+					Aggregate:  true,
+					Type:       &coreString,
+					TypeIds:    intToStringSlice(topology.Socket[socket]),
+					Resolution: resolution,
+				})
+				assignedScope = append(assignedScope, scope)
+			}
+			continue
+		}
+
 		// Core -> Node
 		if nativeScope == schema.MetricScopeCore && scope == schema.MetricScopeNode {
 			cores, _ := topology.GetCoresFromHWThreads(topology.Node)
diff --git a/internal/repository/job.go b/internal/repository/job.go
index 11f3b46..020c3c2 100644
--- a/internal/repository/job.go
+++ b/internal/repository/job.go
@@ -217,11 +217,6 @@ func (r *JobRepository) UpdateMetadata(job *schema.Job, key, val string) (err er
 
 func (r *JobRepository) FetchFootprint(job *schema.Job) (map[string]float64, error) {
 	start := time.Now()
-	cachekey := fmt.Sprintf("footprint:%d", job.ID)
-	if cached := r.cache.Get(cachekey, nil); cached != nil {
-		job.Footprint = cached.(map[string]float64)
-		return job.Footprint, nil
-	}
 
 	if err := sq.Select("job.footprint").From("job").Where("job.id = ?", job.ID).
 		RunWith(r.stmtCache).QueryRow().Scan(&job.RawFootprint); err != nil {
@@ -238,7 +233,6 @@ func (r *JobRepository) FetchFootprint(job *schema.Job) (map[string]float64, err
 		return nil, err
 	}
 
-	r.cache.Put(cachekey, job.Footprint, len(job.Footprint), 24*time.Hour)
 	log.Debugf("Timer FetchFootprint %s", time.Since(start))
 	return job.Footprint, nil
 }
@@ -606,8 +600,11 @@ func (r *JobRepository) UpdateEnergy(
 				// FIXME: Needs sum as stats type
 			} else if sc.MetricConfig[i].Energy == "power" { // this metric has power as unit (Watt)
 				// Energy: Power (in Watts) * Time (in Seconds)
-				// Unit: ( W * s ) / 3600 / 1000 = kWh ; Rounded to 2 nearest digits
-				energy = math.Round(((LoadJobStat(jobMeta, fp, "avg")*float64(jobMeta.Duration))/3600/1000)*100) / 100
+				// Unit: (( W * s ) / 3600) / 1000 = kWh ; Rounded to 2 nearest digits: (Energy * 100) / 100
+				// Here: All-Node Metric Average * Number of Nodes * Job Runtime
+				// Note: Shared Jobs handled correctly since "Node Average" is based on partial resources, while "numNodes" factor is 1
+				metricNodeSum := LoadJobStat(jobMeta, fp, "avg") * float64(jobMeta.NumNodes) * float64(jobMeta.Duration)
+				energy = math.Round(((metricNodeSum/3600)/1000)*100) / 100
 			}
 		} else {
 			log.Warnf("Error while collecting energy metric %s for job, DB ID '%v', return '0.0'", fp, jobMeta.ID)
diff --git a/internal/taskManager/updateFootprintService.go b/internal/taskManager/updateFootprintService.go
index d30d766..a220855 100644
--- a/internal/taskManager/updateFootprintService.go
+++ b/internal/taskManager/updateFootprintService.go
@@ -94,7 +94,7 @@ func RegisterFootprintWorker() {
 					}
 				}
 
-				// Add values rounded to 2 digits
+				// Add values rounded to 2 digits: repo.LoadStats may return unrounded
 				jobMeta.Statistics[metric] = schema.JobStatistics{
 					Unit: schema.Unit{
 						Prefix: archive.GetMetricConfig(job.Cluster, metric).Unit.Prefix,
diff --git a/pkg/schema/cluster.go b/pkg/schema/cluster.go
index 07e4647..322f308 100644
--- a/pkg/schema/cluster.go
+++ b/pkg/schema/cluster.go
@@ -122,6 +122,38 @@ func (topo *Topology) GetSocketsFromHWThreads(
 	return sockets, exclusive
 }
 
+// Return a list of socket IDs given a list of core IDs. Even if just one
+// core is in that socket, add it to the list. If no cores other than
+// those in the argument list are assigned to one of the sockets in the first
+// return value, return true as the second value. TODO: Optimize this, there
+// must be a more efficient way/algorithm.
+func (topo *Topology) GetSocketsFromCores(
+	cores []int,
+) (sockets []int, exclusive bool) {
+	socketsMap := map[int]int{}
+	for _, core := range cores {
+		for _, hwthreadInCore := range topo.Core[core] {
+			for socket, hwthreadsInSocket := range topo.Socket {
+				for _, hwthreadInSocket := range hwthreadsInSocket {
+					if hwthreadInCore == hwthreadInSocket {
+						socketsMap[socket] += 1
+					}
+				}
+			}
+		}
+	}
+
+	exclusive = true
+	hwthreadsPerSocket := len(topo.Node) / len(topo.Socket)
+	sockets = make([]int, 0, len(socketsMap))
+	for socket, count := range socketsMap {
+		sockets = append(sockets, socket)
+		exclusive = exclusive && count == hwthreadsPerSocket
+	}
+
+	return sockets, exclusive
+}
+
 // Return a list of core IDs given a list of hwthread IDs. Even if just one
 // hwthread is in that core, add it to the list. If no hwthreads other than
 // those in the argument list are assigned to one of the cores in the first
diff --git a/pkg/schema/metrics.go b/pkg/schema/metrics.go
index bbc3c74..ffac21b 100644
--- a/pkg/schema/metrics.go
+++ b/pkg/schema/metrics.go
@@ -291,6 +291,21 @@ func (jd *JobData) AddNodeScope(metric string) bool {
 	return true
 }
 
+func (jd *JobData) RoundMetricStats() {
+	// TODO: Make Digit-Precision Configurable? (Currently: Fixed to 2 Digits)
+	for _, scopes := range *jd {
+		for _, jm := range scopes {
+			for index := range jm.Series {
+				jm.Series[index].Statistics = MetricStatistics{
+					Avg: (math.Round(jm.Series[index].Statistics.Avg*100) / 100),
+					Min: (math.Round(jm.Series[index].Statistics.Min*100) / 100),
+					Max: (math.Round(jm.Series[index].Statistics.Max*100) / 100),
+				}
+			}
+		}
+	}
+}
+
 func (jm *JobMetric) AddPercentiles(ps []int) bool {
 	if jm.StatisticsSeries == nil {
 		jm.AddStatisticsSeries()
diff --git a/pkg/schema/schemas/config.schema.json b/pkg/schema/schemas/config.schema.json
index 0a3905a..f372fc1 100644
--- a/pkg/schema/schemas/config.schema.json
+++ b/pkg/schema/schemas/config.schema.json
@@ -446,7 +446,7 @@
             }
         },
         "job_view_selectedMetrics": {
-            "description": "",
+            "description": "Initial metrics shown as plots in single job view",
             "type": "array",
             "items": {
                 "type": "string",
diff --git a/web/frontend/src/Job.root.svelte b/web/frontend/src/Job.root.svelte
index 086f25c..9d80bef 100644
--- a/web/frontend/src/Job.root.svelte
+++ b/web/frontend/src/Job.root.svelte
@@ -117,27 +117,41 @@
     }
   `;
 
+  const roofQuery = gql`
+    query ($dbid: ID!, $selectedMetrics: [String!]!, $selectedScopes: [MetricScope!]!, $selectedResolution: Int) {
+      jobMetrics(id: $dbid, metrics: $selectedMetrics, scopes: $selectedScopes, resolution: $selectedResolution) {
+        name
+        scope
+        metric {
+          series {
+            data
+          }
+        }
+      }
+    }
+  `;
+
   $: jobMetrics = queryStore({
     client: client,
     query: query,
     variables: { dbid, selectedMetrics, selectedScopes },
   });
 
+  // Roofline: Always load roofMetrics with configured timestep (Resolution: 0)
+  $: roofMetrics = queryStore({
+    client: client,
+    query: roofQuery,
+    variables: { dbid, selectedMetrics: ["flops_any", "mem_bw"], selectedScopes: ["node"], selectedResolution: 0 },
+  });
+
   // Handle Job Query on Init -> is not executed anymore
   getContext("on-init")(() => {
     let job = $initq.data.job;
     if (!job) return;
 
     const pendingMetrics = [
-      "flops_any",
-      "mem_bw",
       ...(ccconfig[`job_view_selectedMetrics:${job.cluster}`] ||
-        $initq.data.globalMetrics.reduce((names, gm) => {
-          if (gm.availability.find((av) => av.cluster === job.cluster)) {
-            names.push(gm.name);
-          }
-          return names;
-        }, [])
+        ccconfig[`job_view_selectedMetrics`]
       ),
       ...(ccconfig[`job_view_nodestats_selectedMetrics:${job.cluster}`] ||
         ccconfig[`job_view_nodestats_selectedMetrics`]
@@ -276,12 +290,12 @@
Initq Error: {$initq.error?.message}
-jobMetrics Error: {$jobMetrics.error?.message}
+roofMetrics (jobMetrics) Error: {$roofMetrics.error?.message}
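
Reviewer note (not part of the patch): a minimal worked example of the revised kWh calculation in JobRepository.UpdateEnergy, using invented numbers. Assume a hypothetical job with an all-node power average of 250 W, 4 nodes, and a duration of 7200 s.

// energy_example.go - illustrative sketch only; mirrors the arithmetic of the
// UpdateEnergy hunk above with hypothetical inputs.
package main

import (
	"fmt"
	"math"
)

func main() {
	avgPowerPerNode := 250.0 // W, stand-in for LoadJobStat(jobMeta, fp, "avg")
	numNodes := 4.0          // stand-in for float64(jobMeta.NumNodes)
	duration := 7200.0       // s, stand-in for float64(jobMeta.Duration)

	// (( W * s ) / 3600) / 1000 = kWh, rounded to 2 digits
	metricNodeSum := avgPowerPerNode * numNodes * duration
	energy := math.Round(((metricNodeSum/3600)/1000)*100) / 100

	fmt.Printf("%.2f kWh\n", energy) // prints: 2.00 kWh
}

For the same inputs, the removed line (which did not multiply by NumNodes) would have reported 0.50 kWh instead of 2.00 kWh.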
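
Reviewer note (not part of the patch): a small usage sketch for the new Topology.GetSocketsFromCores helper. The topology layout and the import path are assumptions made for illustration; the field names (Node, Socket, Core) are the ones referenced in the cluster.go hunk above.

// topology_example.go - illustrative sketch only; hypothetical 2-socket,
// 4-core, 8-hwthread machine.
package main

import (
	"fmt"

	"github.com/ClusterCockpit/cc-backend/pkg/schema"
)

func main() {
	topo := schema.Topology{
		Node:   []int{0, 1, 2, 3, 4, 5, 6, 7},              // all hwthread IDs
		Socket: [][]int{{0, 1, 2, 3}, {4, 5, 6, 7}},        // hwthreads per socket
		Core:   [][]int{{0, 1}, {2, 3}, {4, 5}, {6, 7}},    // hwthreads per core
	}

	// Cores 0 and 1 cover every hwthread of socket 0 -> exclusive
	sockets, exclusive := topo.GetSocketsFromCores([]int{0, 1})
	fmt.Println(sockets, exclusive) // [0] true

	// Core 2 covers only half of socket 1 -> not exclusive
	sockets, exclusive = topo.GetSocketsFromCores([]int{2})
	fmt.Println(sockets, exclusive) // [1] false
}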