diff --git a/api/schema.graphqls b/api/schema.graphqls index d703990..568c15d 100644 --- a/api/schema.graphqls +++ b/api/schema.graphqls @@ -272,6 +272,7 @@ input JobFilter { input OrderByInput { field: String! + type: String!, order: SortDirectionEnum! = ASC } @@ -319,6 +320,7 @@ type HistoPoint { type MetricHistoPoints { metric: String! unit: String! + stat: String data: [MetricHistoPoint!] } diff --git a/internal/api/rest.go b/internal/api/rest.go index 01eb429..c8f4e7a 100644 --- a/internal/api/rest.go +++ b/internal/api/rest.go @@ -119,7 +119,6 @@ func (api *RestApi) MountFrontendApiRoutes(r *mux.Router) { if api.Authentication != nil { r.HandleFunc("/jwt/", api.getJWT).Methods(http.MethodGet) r.HandleFunc("/configuration/", api.updateConfiguration).Methods(http.MethodPost) - r.HandleFunc("/jobs/metrics/{id}", api.getJobMetrics).Methods(http.MethodGet) // Fetched in Job.svelte: Needs All-User-Access-Session-Auth } } diff --git a/internal/graph/generated/generated.go b/internal/graph/generated/generated.go index d54ddb1..9ca0a60 100644 --- a/internal/graph/generated/generated.go +++ b/internal/graph/generated/generated.go @@ -211,6 +211,7 @@ type ComplexityRoot struct { MetricHistoPoints struct { Data func(childComplexity int) int Metric func(childComplexity int) int + Stat func(childComplexity int) int Unit func(childComplexity int) int } @@ -1104,6 +1105,13 @@ func (e *executableSchema) Complexity(typeName, field string, childComplexity in return e.complexity.MetricHistoPoints.Metric(childComplexity), true + case "MetricHistoPoints.stat": + if e.complexity.MetricHistoPoints.Stat == nil { + break + } + + return e.complexity.MetricHistoPoints.Stat(childComplexity), true + case "MetricHistoPoints.unit": if e.complexity.MetricHistoPoints.Unit == nil { break @@ -2100,6 +2108,7 @@ input JobFilter { input OrderByInput { field: String! + type: String!, order: SortDirectionEnum! = ASC } @@ -2147,6 +2156,7 @@ type HistoPoint { type MetricHistoPoints { metric: String! unit: String! + stat: String data: [MetricHistoPoint!] } @@ -6445,6 +6455,8 @@ func (ec *executionContext) fieldContext_JobsStatistics_histMetrics(_ context.Co return ec.fieldContext_MetricHistoPoints_metric(ctx, field) case "unit": return ec.fieldContext_MetricHistoPoints_unit(ctx, field) + case "stat": + return ec.fieldContext_MetricHistoPoints_stat(ctx, field) case "data": return ec.fieldContext_MetricHistoPoints_data(ctx, field) } @@ -7295,6 +7307,47 @@ func (ec *executionContext) fieldContext_MetricHistoPoints_unit(_ context.Contex return fc, nil } +func (ec *executionContext) _MetricHistoPoints_stat(ctx context.Context, field graphql.CollectedField, obj *model.MetricHistoPoints) (ret graphql.Marshaler) { + fc, err := ec.fieldContext_MetricHistoPoints_stat(ctx, field) + if err != nil { + return graphql.Null + } + ctx = graphql.WithFieldContext(ctx, fc) + defer func() { + if r := recover(); r != nil { + ec.Error(ctx, ec.Recover(ctx, r)) + ret = graphql.Null + } + }() + resTmp, err := ec.ResolverMiddleware(ctx, func(rctx context.Context) (interface{}, error) { + ctx = rctx // use context from middleware stack in children + return obj.Stat, nil + }) + if err != nil { + ec.Error(ctx, err) + return graphql.Null + } + if resTmp == nil { + return graphql.Null + } + res := resTmp.(*string) + fc.Result = res + return ec.marshalOString2ᚖstring(ctx, field.Selections, res) +} + +func (ec *executionContext) fieldContext_MetricHistoPoints_stat(_ context.Context, field graphql.CollectedField) (fc *graphql.FieldContext, err error) { + fc = &graphql.FieldContext{ + Object: "MetricHistoPoints", + Field: field, + IsMethod: false, + IsResolver: false, + Child: func(ctx context.Context, field graphql.CollectedField) (*graphql.FieldContext, error) { + return nil, errors.New("field of type String does not have child fields") + }, + } + return fc, nil +} + func (ec *executionContext) _MetricHistoPoints_data(ctx context.Context, field graphql.CollectedField, obj *model.MetricHistoPoints) (ret graphql.Marshaler) { fc, err := ec.fieldContext_MetricHistoPoints_data(ctx, field) if err != nil { @@ -13217,7 +13270,7 @@ func (ec *executionContext) unmarshalInputOrderByInput(ctx context.Context, obj asMap["order"] = "ASC" } - fieldsInOrder := [...]string{"field", "order"} + fieldsInOrder := [...]string{"field", "type", "order"} for _, k := range fieldsInOrder { v, ok := asMap[k] if !ok { @@ -13231,6 +13284,13 @@ func (ec *executionContext) unmarshalInputOrderByInput(ctx context.Context, obj return it, err } it.Field = data + case "type": + ctx := graphql.WithPathContext(ctx, graphql.NewPathWithField("type")) + data, err := ec.unmarshalNString2string(ctx, v) + if err != nil { + return it, err + } + it.Type = data case "order": ctx := graphql.WithPathContext(ctx, graphql.NewPathWithField("order")) data, err := ec.unmarshalNSortDirectionEnum2githubᚗcomᚋClusterCockpitᚋccᚑbackendᚋinternalᚋgraphᚋmodelᚐSortDirectionEnum(ctx, v) @@ -14673,6 +14733,8 @@ func (ec *executionContext) _MetricHistoPoints(ctx context.Context, sel ast.Sele if out.Values[i] == graphql.Null { out.Invalids++ } + case "stat": + out.Values[i] = ec._MetricHistoPoints_stat(ctx, field, obj) case "data": out.Values[i] = ec._MetricHistoPoints_data(ctx, field, obj) default: diff --git a/internal/graph/model/models_gen.go b/internal/graph/model/models_gen.go index e3b4a11..6c731a2 100644 --- a/internal/graph/model/models_gen.go +++ b/internal/graph/model/models_gen.go @@ -123,6 +123,7 @@ type MetricHistoPoint struct { type MetricHistoPoints struct { Metric string `json:"metric"` Unit string `json:"unit"` + Stat *string `json:"stat,omitempty"` Data []*MetricHistoPoint `json:"data,omitempty"` } @@ -142,6 +143,7 @@ type NodeMetrics struct { type OrderByInput struct { Field string `json:"field"` + Type string `json:"type"` Order SortDirectionEnum `json:"order"` } diff --git a/internal/repository/jobQuery.go b/internal/repository/jobQuery.go index c52577d..7b575ef 100644 --- a/internal/repository/jobQuery.go +++ b/internal/repository/jobQuery.go @@ -31,14 +31,28 @@ func (r *JobRepository) QueryJobs( if order != nil { field := toSnakeCase(order.Field) - - switch order.Order { - case model.SortDirectionEnumAsc: - query = query.OrderBy(fmt.Sprintf("job.%s ASC", field)) - case model.SortDirectionEnumDesc: - query = query.OrderBy(fmt.Sprintf("job.%s DESC", field)) - default: - return nil, errors.New("REPOSITORY/QUERY > invalid sorting order") + if order.Type == "col" { + // "col": Fixed column name query + switch order.Order { + case model.SortDirectionEnumAsc: + query = query.OrderBy(fmt.Sprintf("job.%s ASC", field)) + case model.SortDirectionEnumDesc: + query = query.OrderBy(fmt.Sprintf("job.%s DESC", field)) + default: + return nil, errors.New("REPOSITORY/QUERY > invalid sorting order for column") + } + } else { + // "foot": Order by footprint JSON field values + // Verify and Search Only in Valid Jsons + query = query.Where("JSON_VALID(meta_data)") + switch order.Order { + case model.SortDirectionEnumAsc: + query = query.OrderBy(fmt.Sprintf("JSON_EXTRACT(footprint, \"$.%s\") ASC", field)) + case model.SortDirectionEnumDesc: + query = query.OrderBy(fmt.Sprintf("JSON_EXTRACT(footprint, \"$.%s\") DESC", field)) + default: + return nil, errors.New("REPOSITORY/QUERY > invalid sorting order for footprint") + } } } @@ -177,8 +191,8 @@ func BuildWhereClause(filter *model.JobFilter, query sq.SelectBuilder) sq.Select query = buildStringCondition("job.resources", filter.Node, query) } if filter.MetricStats != nil { - for _, m := range filter.MetricStats { - query = buildFloatJsonCondition("job.metric_stats", m.Range, query) + for _, ms := range filter.MetricStats { + query = buildFloatJsonCondition(ms.MetricName, ms.Range, query) } } return query @@ -200,8 +214,10 @@ func buildTimeCondition(field string, cond *schema.TimeRange, query sq.SelectBui } } -func buildFloatJsonCondition(field string, cond *model.FloatRange, query sq.SelectBuilder) sq.SelectBuilder { - return query.Where("JSON_EXTRACT(footprint, '$."+field+"') BETWEEN ? AND ?", cond.From, cond.To) +func buildFloatJsonCondition(condName string, condRange *model.FloatRange, query sq.SelectBuilder) sq.SelectBuilder { + // Verify and Search Only in Valid Jsons + query = query.Where("JSON_VALID(footprint)") + return query.Where("JSON_EXTRACT(footprint, \"$."+condName+"\") BETWEEN ? AND ?", condRange.From, condRange.To) } func buildStringCondition(field string, cond *model.StringInput, query sq.SelectBuilder) sq.SelectBuilder { diff --git a/internal/repository/stats.go b/internal/repository/stats.go index 33cafa0..81ca8d1 100644 --- a/internal/repository/stats.go +++ b/internal/repository/stats.go @@ -552,12 +552,14 @@ func (r *JobRepository) jobsMetricStatisticsHistogram( var metricConfig *schema.MetricConfig var peak float64 = 0.0 var unit string = "" + var footprintStat string = "" for _, f := range filters { if f.Cluster != nil { metricConfig = archive.GetMetricConfig(*f.Cluster.Eq, metric) peak = metricConfig.Peak unit = metricConfig.Unit.Prefix + metricConfig.Unit.Base + footprintStat = metricConfig.Footprint log.Debugf("Cluster %s filter found with peak %f for %s", *f.Cluster.Eq, peak, metric) } } @@ -572,21 +574,26 @@ func (r *JobRepository) jobsMetricStatisticsHistogram( if unit == "" { unit = m.Unit.Prefix + m.Unit.Base } + if footprintStat == "" { + footprintStat = m.Footprint + } } } } } - // log.Debugf("Metric %s: DB %s, Peak %f, Unit %s", metric, dbMetric, peak, unit) + // log.Debugf("Metric %s, Peak %f, Unit %s, Aggregation %s", metric, peak, unit, aggreg) // Make bins, see https://jereze.com/code/sql-histogram/ start := time.Now() - jm := fmt.Sprintf(`json_extract(footprint, "$.%s")`, metric) + jm := fmt.Sprintf(`json_extract(footprint, "$.%s")`, (metric + "_" + footprintStat)) crossJoinQuery := sq.Select( fmt.Sprintf(`max(%s) as max`, jm), fmt.Sprintf(`min(%s) as min`, jm), ).From("job").Where( + "JSON_VALID(footprint)", + ).Where( fmt.Sprintf(`%s is not null`, jm), ).Where( fmt.Sprintf(`%s <= %f`, jm, peak), @@ -651,7 +658,7 @@ func (r *JobRepository) jobsMetricStatisticsHistogram( points = append(points, &point) } - result := model.MetricHistoPoints{Metric: metric, Unit: unit, Data: points} + result := model.MetricHistoPoints{Metric: metric, Unit: unit, Stat: &footprintStat, Data: points} log.Debugf("Timer jobsStatisticsHistogram %s", time.Since(start)) return &result, nil diff --git a/web/frontend/src/Analysis.root.svelte b/web/frontend/src/Analysis.root.svelte index 0592f28..7ae073d 100644 --- a/web/frontend/src/Analysis.root.svelte +++ b/web/frontend/src/Analysis.root.svelte @@ -48,8 +48,10 @@ let colWidth1, colWidth2, colWidth3, colWidth4; let numBins = 50; let maxY = -1; + + const initialized = getContext("initialized"); + const globalMetrics = getContext("globalMetrics"); const ccconfig = getContext("cc-config"); - const metricConfig = getContext("metrics"); let metricsInHistograms = ccconfig.analysis_view_histogramMetrics, metricsInScatterplots = ccconfig.analysis_view_scatterPlotMetrics; @@ -268,6 +270,19 @@ } } + let availableMetrics = []; + let metricUnits = {}; + let metricScopes = {}; + function loadMetrics(isInitialized) { + if (!isInitialized) return + availableMetrics = [...globalMetrics.filter((gm) => gm?.availability.find((av) => av.cluster == cluster.name))] + for (let sm of availableMetrics) { + metricUnits[sm.name] = (sm?.unit?.prefix ? sm.unit.prefix : "") + (sm?.unit?.base ? sm.unit.base : "") + metricScopes[sm.name] = sm?.scope + } + } + + $: loadMetrics($initialized) $: updateEntityConfiguration(groupSelection.key); $: updateCategoryConfiguration(sortSelection.key); @@ -285,7 +300,7 @@ {$initq.error.message} {:else if cluster} mc.name)} + availableMetrics={availableMetrics.map((av) => av.name)} bind:metricsInHistograms bind:metricsInScatterplots /> @@ -506,7 +521,7 @@ metric, ...binsFromFootprint( $footprintsQuery.data.footprints.timeWeights, - metricConfig(cluster.name, metric)?.scope, + metricScopes[metric], $footprintsQuery.data.footprints.metrics.find( (f) => f.metric == metric, ).data, @@ -521,22 +536,8 @@ height={250} usesBins={true} title="Average Distribution of '{item.metric}'" - xlabel={`${item.metric} bin maximum ${ - (metricConfig(cluster.name, item.metric)?.unit?.prefix - ? "[" + metricConfig(cluster.name, item.metric)?.unit?.prefix - : "") + - (metricConfig(cluster.name, item.metric)?.unit?.base - ? metricConfig(cluster.name, item.metric)?.unit?.base + "]" - : "") - }`} - xunit={`${ - (metricConfig(cluster.name, item.metric)?.unit?.prefix - ? metricConfig(cluster.name, item.metric)?.unit?.prefix - : "") + - (metricConfig(cluster.name, item.metric)?.unit?.base - ? metricConfig(cluster.name, item.metric)?.unit?.base - : "") - }`} + xlabel={`${item.metric} bin maximum [${metricUnits[item.metric]}]`} + xunit={`${metricUnits[item.metric]}`} ylabel="Normalized Hours" yunit="Hours" /> @@ -578,22 +579,8 @@ {width} height={250} color={"rgba(0, 102, 204, 0.33)"} - xLabel={`${item.m1} [${ - (metricConfig(cluster.name, item.m1)?.unit?.prefix - ? metricConfig(cluster.name, item.m1)?.unit?.prefix - : "") + - (metricConfig(cluster.name, item.m1)?.unit?.base - ? metricConfig(cluster.name, item.m1)?.unit?.base - : "") - }]`} - yLabel={`${item.m2} [${ - (metricConfig(cluster.name, item.m2)?.unit?.prefix - ? metricConfig(cluster.name, item.m2)?.unit?.prefix - : "") + - (metricConfig(cluster.name, item.m2)?.unit?.base - ? metricConfig(cluster.name, item.m2)?.unit?.base - : "") - }]`} + xLabel={`${item.m1} [${metricUnits[item.m1]}]`} + yLabel={`${item.m2} [${metricUnits[item.m2]}]`} X={item.f1} Y={item.f2} S={$footprintsQuery.data.footprints.timeWeights.nodeHours} diff --git a/web/frontend/src/Config.root.svelte b/web/frontend/src/Config.root.svelte index 61e99a8..fde9342 100644 --- a/web/frontend/src/Config.root.svelte +++ b/web/frontend/src/Config.root.svelte @@ -1,12 +1,9 @@ (isOpen = !isOpen)}> diff --git a/web/frontend/src/Job.root.svelte b/web/frontend/src/Job.root.svelte index 8cf8f87..fa7ddd1 100644 --- a/web/frontend/src/Job.root.svelte +++ b/web/frontend/src/Job.root.svelte @@ -2,10 +2,14 @@ import { init, groupByScope, - fetchMetricsStore, checkMetricDisabled, transformDataForRoofline, } from "./utils.js"; + import { + queryStore, + gql, + getContextClient + } from "@urql/svelte"; import { Row, Col, @@ -34,15 +38,27 @@ export let authlevel; export let roles; - const accMetrics = [ - "acc_utilization", - "acc_mem_used", - "acc_power", - "nv_mem_util", - "nv_sm_clock", - "nv_temp", - ]; - let accNodeOnly; + // Setup General + + const ccconfig = getContext("cc-config") + + let isMetricsSelectionOpen = false, + showFootprint = !!ccconfig[`job_view_showFootprint`], + selectedMetrics = [], + selectedScopes = []; + + let plots = {}, + jobTags, + statsTable, + jobFootprint; + + let missingMetrics = [], + missingHosts = [], + somethingMissing = false; + + // Setup GQL + // First: Add Job Query to init function -> Only requires DBID as argument, received via URL-ID + // Second: Trigger jobMetrics query with now received jobInfos (scopes: from job metadata, selectedMetrics: from config or all, job: from url-id) const { query: initq } = init(` job(id: "${dbid}") { @@ -55,99 +71,100 @@ metaData, userData { name, email }, concurrentJobs { items { id, jobId }, count, listQuery }, - flopsAnyAvg, memBwAvg, loadAvg + footprint { name, stat, value } } `); - const ccconfig = getContext("cc-config"), - clusters = getContext("clusters"), - metrics = getContext("metrics"); + const client = getContextClient(); + const query = gql` + query ($dbid: ID!, $selectedMetrics: [String!]!, $selectedScopes: [MetricScope!]!) { + jobMetrics(id: $dbid, metrics: $selectedMetrics, scopes: $selectedScopes) { + name + scope + metric { + unit { + prefix + base + } + timestep + statisticsSeries { + min + median + max + } + series { + hostname + id + data + statistics { + min + avg + max + } + } + } + } + } + `; - let isMetricsSelectionOpen = false, - selectedMetrics = [], - showFootprint = true, - isFetched = new Set(); - const [jobMetrics, startFetching] = fetchMetricsStore(); + $: jobMetrics = queryStore({ + client: client, + query: query, + variables: { dbid, selectedMetrics, selectedScopes }, + }); + + function loadAllScopes() { + selectedScopes = [...selectedScopes, "socket", "core"] + jobMetrics = queryStore({ + client: client, + query: query, + variables: { dbid, selectedMetrics, selectedScopes}, + }); + } + + // Handle Job Query on Init -> is not executed anymore getContext("on-init")(() => { let job = $initq.data.job; if (!job) return; - selectedMetrics = - ccconfig[`job_view_selectedMetrics:${job.cluster}`] || - clusters - .find((c) => c.name == job.cluster) - .metricConfig.map((mc) => mc.name); - - showFootprint = - ccconfig[`job_view_showFootprint`] - - let toFetch = new Set([ + const pendingMetrics = [ "flops_any", "mem_bw", - ...selectedMetrics, + ...(ccconfig[`job_view_selectedMetrics:${job.cluster}`] || + $initq.data.globalMetrics.reduce((names, gm) => { + if (gm.availability.find((av) => av.cluster === job.cluster)) { + names.push(gm.name); + } + return names; + }, []) + ), ...(ccconfig[`job_view_polarPlotMetrics:${job.cluster}`] || - ccconfig[`job_view_polarPlotMetrics`]), + ccconfig[`job_view_polarPlotMetrics`] + ), ...(ccconfig[`job_view_nodestats_selectedMetrics:${job.cluster}`] || - ccconfig[`job_view_nodestats_selectedMetrics`]), - ]); + ccconfig[`job_view_nodestats_selectedMetrics`] + ), + ]; - // Select default Scopes to load: Check before if accelerator metrics are not on accelerator scope by default - accNodeOnly = [...toFetch].some(function (m) { - if (accMetrics.includes(m)) { - const mc = metrics(job.cluster, m); - return mc.scope !== "accelerator"; - } else { - return false; - } + // Select default Scopes to load: Check before if any metric has accelerator scope by default + const accScopeDefault = [...pendingMetrics].some(function (m) { + const cluster = $initq.data.clusters.find((c) => c.name == job.cluster); + const subCluster = cluster.subClusters.find((sc) => sc.name == job.subCluster); + return subCluster.metricConfig.find((smc) => smc.name == m)?.scope === "accelerator"; }); - if (job.numAcc === 0 || accNodeOnly === true) { - // No Accels or Accels on Node Scope - startFetching( - job, - [...toFetch], - job.numNodes > 2 ? ["node"] : ["node", "socket", "core"], - ); - } else { - // Accels and not on node scope - startFetching( - job, - [...toFetch], - job.numNodes > 2 - ? ["node", "accelerator"] - : ["node", "accelerator", "socket", "core"], - ); + const pendingScopes = ["node"] + if (accScopeDefault) pendingScopes.push("accelerator") + if (job.numNodes === 1) { + pendingScopes.push("socket") + pendingScopes.push("core") } - isFetched = toFetch; + selectedMetrics = [...new Set(pendingMetrics)]; + selectedScopes = [...new Set(pendingScopes)]; }); - const lazyFetchMoreMetrics = () => { - let notYetFetched = new Set(); - for (let m of selectedMetrics) { - if (!isFetched.has(m)) { - notYetFetched.add(m); - isFetched.add(m); - } - } - - if (notYetFetched.size > 0) - startFetching( - $initq.data.job, - [...notYetFetched], - $initq.data.job.numNodes > 2 ? ["node"] : ["node", "core"], - ); - }; - - // Fetch more data once required: - $: if ($initq.data && $jobMetrics.data && selectedMetrics) - lazyFetchMoreMetrics(); - - let plots = {}, - jobTags, - statsTable, - jobFootprint; - + // Interactive Document Title $: document.title = $initq.fetching ? "Loading..." : $initq.error @@ -155,15 +172,15 @@ : `Job ${$initq.data.job.jobId} - ClusterCockpit`; // Find out what metrics or hosts are missing: - let missingMetrics = [], - missingHosts = [], - somethingMissing = false; - $: if ($initq.data && $jobMetrics.data) { + $: if ($initq?.data && $jobMetrics?.data?.jobMetrics) { let job = $initq.data.job, metrics = $jobMetrics.data.jobMetrics, - metricNames = clusters - .find((c) => c.name == job.cluster) - .metricConfig.map((mc) => mc.name); + metricNames = $initq.data.globalMetrics.reduce((names, gm) => { + if (gm.availability.find((av) => av.cluster === job.cluster)) { + names.push(gm.name); + } + return names; + }, []); // Metric not found in JobMetrics && Metric not explicitly disabled in config or deselected: Was expected, but is Missing missingMetrics = metricNames.filter( @@ -192,6 +209,7 @@ somethingMissing = missingMetrics.length > 0 || missingHosts.length > 0; } + // Helper const orderAndMap = (grouped, selectedMetrics) => selectedMetrics.map((metric) => ({ metric: metric, @@ -214,18 +232,15 @@ {/if} - {#if $jobMetrics.data && showFootprint} - {#key $jobMetrics.data} - - - - {/key} + {#if $initq.data && showFootprint} + + + {/if} - {#if $jobMetrics.data && $initq.data} + {#if $initq?.data && $jobMetrics?.data?.jobMetrics} {#if $initq.data.job.concurrentJobs != null && $initq.data.job.concurrentJobs.items.length != 0} {#if authlevel > roles.manager} @@ -270,27 +285,29 @@ `job_view_polarPlotMetrics:${$initq.data.job.cluster}` ] || ccconfig[`job_view_polarPlotMetrics`]} cluster={$initq.data.job.cluster} + subCluster={$initq.data.job.subCluster} jobMetrics={$jobMetrics.data.jobMetrics} /> c.name == $initq.data.job.cluster) .subClusters.find((sc) => sc.name == $initq.data.job.subCluster)} data={transformDataForRoofline( $jobMetrics.data.jobMetrics.find( (m) => m.name == "flops_any" && m.scope == "node", - ).metric, + )?.metric, $jobMetrics.data.jobMetrics.find( (m) => m.name == "mem_bw" && m.scope == "node", - ).metric, + )?.metric, )} /> {:else} + {/if} @@ -318,7 +335,7 @@ {$jobMetrics.error.message} {:else if $jobMetrics.fetching} - {:else if $jobMetrics.data && $initq.data} + {:else if $initq?.data && $jobMetrics?.data?.jobMetrics} statsTable.moreLoaded(detail)} + on:load-all={loadAllScopes} job={$initq.data.job} metricName={item.metric} + metricUnit={$initq.data.globalMetrics.find((gm) => gm.name == item.metric)?.unit} + nativeScope={$initq.data.globalMetrics.find((gm) => gm.name == item.metric)?.scope} rawData={item.data.map((x) => x.metric)} scopes={item.data.map((x) => x.scope)} {width} @@ -388,8 +407,8 @@ tab="Statistics Table" active={!somethingMissing} > - {#if $jobMetrics.data} - {#key $jobMetrics.data} + {#if $jobMetrics?.data?.jobMetrics} + {#key $jobMetrics.data.jobMetrics} - export function findJobThresholds(job, metricConfig, subClusterConfig) { - if (!job || !metricConfig || !subClusterConfig) { + export function findJobThresholds(job, metricConfig) { + if (!job || !metricConfig) { console.warn("Argument missing for findJobThresholds!"); return null; } - const subclusterThresholds = metricConfig.subClusters.find( - (sc) => sc.name == subClusterConfig.name, - ); + // metricConfig is on subCluster-Level const defaultThresholds = { - peak: subclusterThresholds - ? subclusterThresholds.peak - : metricConfig.peak, - normal: subclusterThresholds - ? subclusterThresholds.normal - : metricConfig.normal, - caution: subclusterThresholds - ? subclusterThresholds.caution - : metricConfig.caution, - alert: subclusterThresholds - ? subclusterThresholds.alert - : metricConfig.alert, + peak: metricConfig.peak, + normal: metricConfig.normal, + caution: metricConfig.caution, + alert: metricConfig.alert }; // Job_Exclusivity does not matter, only aggregation if (metricConfig.aggregation === "avg") { return defaultThresholds; } else if (metricConfig.aggregation === "sum") { - const jobFraction = - job.numHWThreads / subClusterConfig.topology.node.length; + const topol = getContext("getHardwareTopology")(job.cluster, job.subCluster) + const jobFraction = job.numHWThreads / topol.node.length; + return { peak: round(defaultThresholds.peak * jobFraction, 0), normal: round(defaultThresholds.normal * jobFraction, 0), @@ -55,109 +46,56 @@ Progress, Icon, Tooltip, + Row, + Col } from "@sveltestrap/sveltestrap"; - import { mean, round } from "mathjs"; + import { round } from "mathjs"; export let job; - export let jobMetrics; export let view = "job"; export let width = "auto"; + export let height = "310px"; - const clusters = getContext("clusters"); - const subclusterConfig = clusters - .find((c) => c.name == job.cluster) - .subClusters.find((sc) => sc.name == job.subCluster); - - const footprintMetrics = - job.numAcc !== 0 - ? job.exclusive !== 1 // GPU - ? ["acc_utilization", "acc_mem_used", "nv_sm_clock", "nv_mem_util"] // Shared - : ["acc_utilization", "acc_mem_used", "nv_sm_clock", "nv_mem_util"] // Exclusive - : (job.exclusive !== 1) // CPU Only - ? ["flops_any", "mem_used"] // Shared - : ["cpu_load", "flops_any", "mem_used", "mem_bw"]; // Exclusive - - const footprintData = footprintMetrics.map((fm) => { + const footprintData = job?.footprint?.map((jf) => { // Unit - const fmc = getContext("metrics")(job.cluster, fm); - let unit = ""; - if (fmc?.unit?.base) unit = fmc.unit.prefix + fmc.unit.base; + const fmc = getContext("getMetricConfig")(job.cluster, job.subCluster, jf.name); + const unit = (fmc?.unit?.prefix ? fmc.unit.prefix : "") + (fmc?.unit?.base ? fmc.unit.base : "") // Threshold / -Differences - const fmt = findJobThresholds(job, fmc, subclusterConfig); - if (fm === "flops_any") fmt.peak = round(fmt.peak * 0.85, 0); + const fmt = findJobThresholds(job, fmc); + if (jf.name === "flops_any") fmt.peak = round(fmt.peak * 0.85, 0); - // Value: Primarily use backend sourced avgs from job.*, secondarily calculate/read from metricdata - // Exclusivity does not matter - let mv = 0.0; - if (fmc.aggregation === "avg") { - if (fm === "cpu_load" && job.loadAvg !== 0) { - mv = round(job.loadAvg, 2); - } else if (fm === "flops_any" && job.flopsAnyAvg !== 0) { - mv = round(job.flopsAnyAvg, 2); - } else if (fm === "mem_bw" && job.memBwAvg !== 0) { - mv = round(job.memBwAvg, 2); - } else { - // Calculate Avg from jobMetrics - const jm = jobMetrics.find((jm) => jm.name === fm && jm.scope === "node"); - if (jm?.metric?.statisticsSeries) { - const noNan = jm.metric.statisticsSeries.median.filter(function (val) { - return val != null; - }); - mv = round(mean(noNan), 2); - } else if (jm?.metric?.series?.length > 1) { - const avgs = jm.metric.series.map((jms) => jms.statistics.avg); - mv = round(mean(avgs), 2); - } else if (jm?.metric?.series) { - mv = round(jm.metric.series[0].statistics.avg, 2); - } - } - } else if (fmc.aggregation === "sum") { - // Calculate Sum from jobMetrics: Sum all node averages - const jm = jobMetrics.find((jm) => jm.name === fm && jm.scope === "node"); - if (jm?.metric?.series?.length > 1) { // More than 1 node - const avgs = jm.metric.series.map((jms) => jms.statistics.avg); - mv = round(avgs.reduce((a, b) => a + b, 0)); - } else if (jm?.metric?.series) { - mv = round(jm.metric.series[0].statistics.avg, 2); - } - } else { - console.warn( - "Missing or unkown aggregation mode (sum/avg) for metric:", - metricConfig, - ); - } - - // Define basic data + // Define basic data -> Value: Use as Provided const fmBase = { - name: fm, + name: jf.name + ' (' + jf.stat + ')', + avg: jf.value, unit: unit, - avg: mv, max: fmt.peak, + dir: fmc.lowerIsBetter }; - if (evalFootprint(fm, mv, fmt, "alert")) { + if (evalFootprint(jf.value, fmt, fmc.lowerIsBetter, "alert")) { return { ...fmBase, color: "danger", - message: `Metric average way ${fm === "mem_used" ? "above" : "below"} expected normal thresholds.`, - impact: 3, + message: `Metric average way ${fmc.lowerIsBetter ? "above" : "below"} expected normal thresholds.`, + impact: 3 }; - } else if (evalFootprint(fm, mv, fmt, "caution")) { + } else if (evalFootprint(jf.value, fmt, fmc.lowerIsBetter, "caution")) { return { ...fmBase, color: "warning", - message: `Metric average ${fm === "mem_used" ? "above" : "below"} expected normal thresholds.`, + message: `Metric average ${fmc.lowerIsBetter ? "above" : "below"} expected normal thresholds.`, impact: 2, }; - } else if (evalFootprint(fm, mv, fmt, "normal")) { + } else if (evalFootprint(jf.value, fmt, fmc.lowerIsBetter, "normal")) { return { ...fmBase, color: "success", message: "Metric average within expected thresholds.", impact: 1, }; - } else if (evalFootprint(fm, mv, fmt, "peak")) { + } else if (evalFootprint(jf.value, fmt, fmc.lowerIsBetter, "peak")) { return { ...fmBase, color: "info", @@ -176,23 +114,23 @@ } }); - function evalFootprint(metric, mean, thresholds, level) { - // mem_used has inverse logic regarding threshold levels, notify levels triggered if mean > threshold + function evalFootprint(mean, thresholds, lowerIsBetter, level) { + // Handle Metrics in which less value is better switch (level) { case "peak": - if (metric === "mem_used") - return false; // mem_used over peak -> return false to trigger impact -1 + if (lowerIsBetter) + return false; // metric over peak -> return false to trigger impact -1 else return mean <= thresholds.peak && mean > thresholds.normal; case "alert": - if (metric === "mem_used") + if (lowerIsBetter) return mean <= thresholds.peak && mean >= thresholds.alert; else return mean <= thresholds.alert && mean >= 0; case "caution": - if (metric === "mem_used") + if (lowerIsBetter) return mean < thresholds.alert && mean >= thresholds.caution; else return mean <= thresholds.caution && mean > thresholds.alert; case "normal": - if (metric === "mem_used") + if (lowerIsBetter) return mean < thresholds.caution && mean >= 0; else return mean <= thresholds.normal && mean > thresholds.caution; default: @@ -201,7 +139,7 @@ } - + {#if view === "job"} @@ -250,9 +188,21 @@ offset={[0, 20]}>{fpd.message} -
- -
+ + {#if fpd.dir} + + + + {/if} + + + + {#if !fpd.dir} + + + + {/if} + {/each} {#if job?.metaData?.message}
diff --git a/web/frontend/src/Jobs.root.svelte b/web/frontend/src/Jobs.root.svelte index f7c99ff..d0e4fca 100644 --- a/web/frontend/src/Jobs.root.svelte +++ b/web/frontend/src/Jobs.root.svelte @@ -27,7 +27,7 @@ let filterComponent; // see why here: https://stackoverflow.com/questions/58287729/how-can-i-export-a-function-from-a-svelte-component-that-changes-a-value-in-the let jobList, matchedJobs = null; - let sorting = { field: "startTime", order: "DESC" }, + let sorting = { field: "startTime", type: "col", order: "DESC" }, isSortingOpen = false, isMetricsSelectionOpen = false; let metrics = filterPresets.cluster diff --git a/web/frontend/src/Metric.svelte b/web/frontend/src/Metric.svelte index 279df13..0e1359f 100644 --- a/web/frontend/src/Metric.svelte +++ b/web/frontend/src/Metric.svelte @@ -1,5 +1,5 @@ - {metricName} ({(metricConfig?.unit?.prefix - ? metricConfig.unit.prefix - : "") + (metricConfig?.unit?.base ? metricConfig.unit.base : "")}) + {metricName} ({unit}) {#if job.resources.length > 1} @@ -118,8 +87,8 @@ bind:this={plot} {width} height={300} - {cluster} - {subCluster} + cluster={job.cluster} + subCluster={job.subCluster} timestep={data.timestep} scope={selectedScope} metric={metricName} @@ -132,8 +101,8 @@ bind:this={plot} {width} height={300} - {cluster} - {subCluster} + cluster={job.cluster} + subCluster={job.subCluster} timestep={data.timestep} scope={selectedScope} metric={metricName} diff --git a/web/frontend/src/MetricSelection.svelte b/web/frontend/src/MetricSelection.svelte index 91fd8e6..76398af 100644 --- a/web/frontend/src/MetricSelection.svelte +++ b/web/frontend/src/MetricSelection.svelte @@ -27,8 +27,8 @@ export let showFootprint = false; export let view = "job"; - const clusters = getContext("clusters"), - onInit = getContext("on-init"); + const onInit = getContext("on-init") + const globalMetrics = getContext("globalMetrics") let newMetricsOrder = []; let unorderedMetrics = [...metrics]; @@ -36,30 +36,34 @@ onInit(() => { if (allMetrics == null) allMetrics = new Set(); - for (let c of clusters) - for (let metric of c.metricConfig) allMetrics.add(metric.name); + for (let metric of globalMetrics) allMetrics.add(metric.name); }); $: { if (allMetrics != null) { if (cluster == null) { - // console.log('Reset to full metric list') - for (let c of clusters) - for (let metric of c.metricConfig) allMetrics.add(metric.name); + for (let metric of globalMetrics) allMetrics.add(metric.name); } else { - // console.log('Recalculate available metrics for ' + cluster) allMetrics.clear(); - for (let c of clusters) - if (c.name == cluster) - for (let metric of c.metricConfig) allMetrics.add(metric.name); + for (let gm of globalMetrics) { + if (gm.availability.find((av) => av.cluster === cluster)) allMetrics.add(gm.name); + } } - newMetricsOrder = [...allMetrics].filter((m) => !metrics.includes(m)); newMetricsOrder.unshift(...metrics.filter((m) => allMetrics.has(m))); unorderedMetrics = unorderedMetrics.filter((m) => allMetrics.has(m)); } } + function printAvailability(metric, cluster) { + const avail = globalMetrics.find((gm) => gm.name === metric)?.availability + if (cluster == null) { + return avail.map((av) => av.cluster).join(',') + } else { + return avail.find((av) => av.cluster === cluster).subClusters.join(',') + } + } + const client = getContextClient(); const updateConfigurationMutation = ({ name, value }) => { return mutationStore({ @@ -106,7 +110,6 @@ }).subscribe((res) => { if (res.fetching === false && res.error) { throw res.error; - // console.log('Error on subscription: ' + res.error) } }); @@ -118,7 +121,6 @@ value: JSON.stringify(showFootprint), }).subscribe((res) => { if (res.fetching === false && res.error) { - console.log("Error on footprint subscription: " + res.error); throw res.error; } }); @@ -161,34 +163,7 @@ {/if} {metric} - {cluster == null - ? clusters // No single cluster specified: List Clusters with Metric - .filter( - (c) => c.metricConfig.find((m) => m.name == metric) != null, - ) - .map((c) => c.name) - .join(", ") - : clusters // Single cluster requested: List Subclusters with do not have metric remove flag - .filter((c) => c.name == cluster) - .filter( - (c) => c.metricConfig.find((m) => m.name == metric) != null, - ) - .map(function (c) { - let scNames = c.subClusters.map((sc) => sc.name); - scNames.forEach(function (scName) { - let met = c.metricConfig.find((m) => m.name == metric); - let msc = met.subClusters.find( - (msc) => msc.name == scName, - ); - if (msc != null) { - if (msc.remove == true) { - scNames = scNames.filter((scn) => scn != msc.name); - } - } - }); - return scNames; - }) - .join(", ")} + {printAvailability(metric, cluster)} {/each} diff --git a/web/frontend/src/Node.root.svelte b/web/frontend/src/Node.root.svelte index 0a5a75e..98ee3e9 100644 --- a/web/frontend/src/Node.root.svelte +++ b/web/frontend/src/Node.root.svelte @@ -29,6 +29,8 @@ from.setMinutes(from.getMinutes() - 30); } + const initialized = getContext("initialized") + const globalMetrics = getContext("globalMetrics") const ccconfig = getContext("cc-config"); const clusters = getContext("clusters"); const client = getContextClient(); @@ -74,15 +76,11 @@ let itemsPerPage = ccconfig.plot_list_jobsPerPage; let page = 1; let paging = { itemsPerPage, page }; - let sorting = { field: "startTime", order: "DESC" }; + let sorting = { field: "startTime", type: "col", order: "DESC" }; $: filter = [ { cluster: { eq: cluster } }, { node: { contains: hostname } }, { state: ["running"] }, - // {startTime: { - // from: from.toISOString(), - // to: to.toISOString() - // }} ]; const nodeJobsQuery = gql` @@ -92,10 +90,6 @@ $paging: PageRequest! ) { jobs(filter: $filter, order: $sorting, page: $paging) { - # items { - # id - # jobId - # } count } } @@ -107,26 +101,16 @@ variables: { paging, sorting, filter }, }); - let metricUnits = {}; - $: if ($nodeMetricsData.data) { - let thisCluster = clusters.find((c) => c.name == cluster); - if (thisCluster) { - for (let metric of thisCluster.metricConfig) { - if (metric.unit.prefix || metric.unit.base) { - metricUnits[metric.name] = - "(" + - (metric.unit.prefix ? metric.unit.prefix : "") + - (metric.unit.base ? metric.unit.base : "") + - ")"; - } else { - // If no unit defined: Omit Unit Display - metricUnits[metric.name] = ""; - } - } + let systemUnits = {}; + function loadUnits(isInitialized) { + if (!isInitialized) return + const systemMetrics = [...globalMetrics.filter((gm) => gm?.availability.find((av) => av.cluster == cluster))] + for (let sm of systemMetrics) { + systemUnits[sm.name] = (sm?.unit?.prefix ? sm.unit.prefix : "") + (sm?.unit?.base ? sm.unit.base : "") } } - const dateToUnixEpoch = (rfc3339) => Math.floor(Date.parse(rfc3339) / 1000); + $: loadUnits($initialized) @@ -195,7 +179,7 @@ >

{item.name} - {metricUnits[item.name]} + {systemUnits[item.name] ? "(" + systemUnits[item.name] + ")" : ""}

{#if item.disabled === false && item.metric} { if (res.fetching === false && res.error) { throw res.error; - // console.log('Error on subscription: ' + res.error) } }); } diff --git a/web/frontend/src/StatsTable.svelte b/web/frontend/src/StatsTable.svelte index 3a9d84d..0db6a71 100644 --- a/web/frontend/src/StatsTable.svelte +++ b/web/frontend/src/StatsTable.svelte @@ -74,10 +74,6 @@ return s.dir != "up" ? s1[stat] - s2[stat] : s2[stat] - s1[stat]; }); } - - export function moreLoaded(jobMetric) { - jobMetrics = [...jobMetrics, jobMetric]; - } @@ -85,7 +81,6 @@ diff --git a/web/frontend/src/Status.root.svelte b/web/frontend/src/Status.root.svelte index 48c3711..aadb3a9 100644 --- a/web/frontend/src/Status.root.svelte +++ b/web/frontend/src/Status.root.svelte @@ -146,7 +146,7 @@ `, variables: { cluster: cluster, - metrics: ["flops_any", "mem_bw"], + metrics: ["flops_any", "mem_bw"], // Fixed names for roofline and status bars from: from.toISOString(), to: to.toISOString(), filter: [{ state: ["running"] }, { cluster: { eq: cluster } }], diff --git a/web/frontend/src/Systems.root.svelte b/web/frontend/src/Systems.root.svelte index 4a7f633..95ceecd 100644 --- a/web/frontend/src/Systems.root.svelte +++ b/web/frontend/src/Systems.root.svelte @@ -29,9 +29,10 @@ from.setMinutes(from.getMinutes() - 30); } - const clusters = getContext("clusters"); + const initialized = getContext("initialized"); const ccconfig = getContext("cc-config"); - const metricConfig = getContext("metrics"); + const clusters = getContext("clusters"); + const globalMetrics = getContext("globalMetrics"); let plotHeight = 300; let hostnameFilter = ""; @@ -80,24 +81,18 @@ }, }); - let metricUnits = {}; - $: if ($nodesQuery.data) { - let thisCluster = clusters.find((c) => c.name == cluster); - if (thisCluster) { - for (let metric of thisCluster.metricConfig) { - if (metric.unit.prefix || metric.unit.base) { - metricUnits[metric.name] = - "(" + - (metric.unit.prefix ? metric.unit.prefix : "") + - (metric.unit.base ? metric.unit.base : "") + - ")"; - } else { - // If no unit defined: Omit Unit Display - metricUnits[metric.name] = ""; - } - } + let systemMetrics = []; + let systemUnits = {}; + function loadMetrics(isInitialized) { + if (!isInitialized) return + systemMetrics = [...globalMetrics.filter((gm) => gm?.availability.find((av) => av.cluster == cluster))] + for (let sm of systemMetrics) { + systemUnits[sm.name] = (sm?.unit?.prefix ? sm.unit.prefix : "") + (sm?.unit?.base ? sm.unit.base : "") } } + + $: loadMetrics($initialized) + @@ -123,9 +118,9 @@ Metric diff --git a/web/frontend/src/TagManagement.svelte b/web/frontend/src/TagManagement.svelte index e9fb9e9..afec176 100644 --- a/web/frontend/src/TagManagement.svelte +++ b/web/frontend/src/TagManagement.svelte @@ -107,7 +107,6 @@ addTagToJob(res.data.createTag); } else if (res.fetching === false && res.error) { throw res.error; - // console.log('Error on subscription: ' + res.error) } }); } @@ -120,7 +119,6 @@ pendingChange = false; } else if (res.fetching === false && res.error) { throw res.error; - // console.log('Error on subscription: ' + res.error) } }); } @@ -134,7 +132,6 @@ pendingChange = false; } else if (res.fetching === false && res.error) { throw res.error; - // console.log('Error on subscription: ' + res.error) } }, ); diff --git a/web/frontend/src/User.root.svelte b/web/frontend/src/User.root.svelte index 41969d9..6526e6f 100644 --- a/web/frontend/src/User.root.svelte +++ b/web/frontend/src/User.root.svelte @@ -32,7 +32,7 @@ let filterComponent; // see why here: https://stackoverflow.com/questions/58287729/how-can-i-export-a-function-from-a-svelte-component-that-changes-a-value-in-the let jobList; let jobFilters = []; - let sorting = { field: "startTime", order: "DESC" }, + let sorting = { field: "startTime", type: "col", order: "DESC" }, isSortingOpen = false; let metrics = ccconfig.plot_list_selectedMetrics, isMetricsSelectionOpen = false; @@ -70,6 +70,7 @@ histMetrics { metric unit + stat data { min max @@ -245,7 +246,7 @@ usesBins={true} {width} height={250} - title="Distribution of '{item.metric}' averages" + title="Distribution of '{item.metric} ({item.stat})' footprints" xlabel={`${item.metric} bin maximum ${item?.unit ? `[${item.unit}]` : ``}`} xunit={item.unit} ylabel="Number of Jobs" diff --git a/web/frontend/src/Zoom.svelte b/web/frontend/src/Zoom.svelte deleted file mode 100644 index c5f73c1..0000000 --- a/web/frontend/src/Zoom.svelte +++ /dev/null @@ -1,65 +0,0 @@ - - -
- - - - - - Window Size: - - - ({windowSize}%) - - - - Window Position: - - - -
diff --git a/web/frontend/src/config/UserSettings.svelte b/web/frontend/src/config/UserSettings.svelte index cd1d9a3..7eaa04e 100644 --- a/web/frontend/src/config/UserSettings.svelte +++ b/web/frontend/src/config/UserSettings.svelte @@ -23,7 +23,6 @@ popMessage(text, target, "#048109"); } else { let text = await res.text(); - // console.log(res.statusText) throw new Error("Response Code " + res.status + "-> " + text); } } catch (err) { diff --git a/web/frontend/src/config/admin/AddUser.svelte b/web/frontend/src/config/admin/AddUser.svelte index 84aacc3..154bb3e 100644 --- a/web/frontend/src/config/admin/AddUser.svelte +++ b/web/frontend/src/config/admin/AddUser.svelte @@ -23,7 +23,6 @@ form.reset(); } else { let text = await res.text(); - // console.log(res.statusText) throw new Error("Response Code " + res.status + "-> " + text); } } catch (err) { diff --git a/web/frontend/src/config/admin/EditProject.svelte b/web/frontend/src/config/admin/EditProject.svelte index e1c518f..e7d6379 100644 --- a/web/frontend/src/config/admin/EditProject.svelte +++ b/web/frontend/src/config/admin/EditProject.svelte @@ -32,7 +32,6 @@ reloadUserList(); } else { let text = await res.text(); - // console.log(res.statusText) throw new Error("Response Code " + res.status + "-> " + text); } } catch (err) { @@ -64,7 +63,6 @@ reloadUserList(); } else { let text = await res.text(); - // console.log(res.statusText) throw new Error("Response Code " + res.status + "-> " + text); } } catch (err) { diff --git a/web/frontend/src/config/admin/EditRole.svelte b/web/frontend/src/config/admin/EditRole.svelte index 6b24e3e..a26c48b 100644 --- a/web/frontend/src/config/admin/EditRole.svelte +++ b/web/frontend/src/config/admin/EditRole.svelte @@ -34,7 +34,6 @@ reloadUserList(); } else { let text = await res.text(); - // console.log(res.statusText) throw new Error("Response Code " + res.status + "-> " + text); } } catch (err) { @@ -66,7 +65,6 @@ reloadUserList(); } else { let text = await res.text(); - // console.log(res.statusText) throw new Error("Response Code " + res.status + "-> " + text); } } catch (err) { diff --git a/web/frontend/src/filters/Filters.svelte b/web/frontend/src/filters/Filters.svelte index 7253ff7..ef92c31 100644 --- a/web/frontend/src/filters/Filters.svelte +++ b/web/frontend/src/filters/Filters.svelte @@ -136,8 +136,8 @@ if (filters.project) items.push({ project: { [filters.projectMatch]: filters.project } }); if (filters.jobName) items.push({ jobName: { contains: filters.jobName } }); - for (let stat of filters.stats) - items.push({ [stat.field]: { from: stat.from, to: stat.to } }); + if (filters.stats.length != 0) + items.push({ metricStats: filters.stats.map((st) => { return { metricName: st.field, range: { from: st.from, to: st.to }} }) }); dispatch("update", { filters: items }); changeURL(); @@ -412,7 +412,6 @@ /> update()} diff --git a/web/frontend/src/filters/Resources.svelte b/web/frontend/src/filters/Resources.svelte index 01f1c57..19af205 100644 --- a/web/frontend/src/filters/Resources.svelte +++ b/web/frontend/src/filters/Resources.svelte @@ -59,7 +59,6 @@ 0, ); - // console.log(header) let minNumNodes = 1, maxNumNodes = 0, minNumHWThreads = 1, diff --git a/web/frontend/src/filters/Stats.svelte b/web/frontend/src/filters/Stats.svelte index ee80a4b..b19793f 100644 --- a/web/frontend/src/filters/Stats.svelte +++ b/web/frontend/src/filters/Stats.svelte @@ -1,5 +1,6 @@ (isOpen = !isOpen)}> @@ -126,8 +73,7 @@ color="danger" on:click={() => { isOpen = false; - resetRange($initialized, cluster); - statistics.forEach((stat) => (stat.enabled = false)); + resetRanges(); stats = []; dispatch("update", { stats }); }}>Reset gm.name === m)?.unit + return (rawUnit?.prefix ? rawUnit.prefix : "") + (rawUnit?.base ? rawUnit.base : "") + } + const client = getContextClient(); const query = gql` query ( @@ -75,7 +80,11 @@ name } metaData - footprint + footprint { + name + stat + value + } } count hasNextPage @@ -141,7 +150,6 @@ paging = { itemsPerPage: value, page: page }; // Trigger reload of jobList } else if (res.fetching === false && res.error) { throw res.error; - // console.log('Error on subscription: ' + res.error) } }); } @@ -215,22 +223,7 @@ > {metric} {#if $initialized} - ({clusters - .map((cluster) => - cluster.metricConfig.find((m) => m.name == metric), - ) - .filter((m) => m != null) - .map( - (m) => - (m.unit?.prefix ? m.unit?.prefix : "") + - (m.unit?.base ? m.unit?.base : ""), - ) // Build unitStr - .reduce( - (arr, unitStr) => - arr.includes(unitStr) ? arr : [...arr, unitStr], - [], - ) // w/o this, output would be [unitStr, unitStr] - .join(", ")}) + ({getUnit(metric)}) {/if} {/each} diff --git a/web/frontend/src/joblist/Row.svelte b/web/frontend/src/joblist/Row.svelte index dd92ec4..5d4b2c7 100644 --- a/web/frontend/src/joblist/Row.svelte +++ b/web/frontend/src/joblist/Row.svelte @@ -30,16 +30,11 @@ : ["core"] : ["node"]; - function distinct(value, index, array) { - return array.indexOf(value) === index; - } - const cluster = getContext("clusters").find((c) => c.name == job.cluster); - const metricConfig = getContext("metrics"); // Get all MetricConfs which include subCluster-specific settings for this job const client = getContextClient(); const query = gql` - query ($id: ID!, $queryMetrics: [String!]!, $scopes: [MetricScope!]!) { - jobMetrics(id: $id, metrics: $queryMetrics, scopes: $scopes) { + query ($id: ID!, $metrics: [String!]!, $scopes: [MetricScope!]!) { + jobMetrics(id: $id, metrics: $metrics, scopes: $scopes) { name scope metric { @@ -71,34 +66,14 @@ $: metricsQuery = queryStore({ client: client, query: query, - variables: { id, queryMetrics, scopes }, + variables: { id, metrics, scopes }, }); - - let queryMetrics = null; - $: if (showFootprint) { - queryMetrics = [ - "cpu_load", - "flops_any", - "mem_used", - "mem_bw", - "acc_utilization", - ...metrics, - ].filter(distinct); - scopes = ["node"]; - } else { - queryMetrics = [...metrics]; - scopes = job.numNodes == 1 - ? job.numAcc >= 1 - ? ["core", "accelerator"] - : ["core"] - : ["node"]; - } - + export function refresh() { metricsQuery = queryStore({ client: client, query: query, - variables: { id, queryMetrics, scopes }, + variables: { id, metrics, scopes }, // requestPolicy: 'network-only' // use default cache-first for refresh }); } @@ -166,8 +141,8 @@
diff --git a/web/frontend/src/joblist/SortSelection.svelte b/web/frontend/src/joblist/SortSelection.svelte index 2cc8615..ba6f9b8 100644 --- a/web/frontend/src/joblist/SortSelection.svelte +++ b/web/frontend/src/joblist/SortSelection.svelte @@ -17,24 +17,39 @@ ModalHeader, ModalFooter, } from "@sveltestrap/sveltestrap"; + import { getContext } from "svelte"; + import { getSortItems } from "../utils.js"; export let isOpen = false; - export let sorting = { field: "startTime", order: "DESC" }; + export let sorting = { field: "startTime", type: "col", order: "DESC" }; - let sortableColumns = [ - { field: "startTime", text: "Start Time", order: "DESC" }, - { field: "duration", text: "Duration", order: "DESC" }, - { field: "numNodes", text: "Number of Nodes", order: "DESC" }, - { field: "memUsedMax", text: "Max. Memory Used", order: "DESC" }, - { field: "flopsAnyAvg", text: "Avg. FLOPs", order: "DESC" }, - { field: "memBwAvg", text: "Avg. Memory Bandwidth", order: "DESC" }, - { field: "netBwAvg", text: "Avg. Network Bandwidth", order: "DESC" }, - ]; + let sortableColumns = []; + let activeColumnIdx; - let activeColumnIdx = sortableColumns.findIndex( - (col) => col.field == sorting.field, - ); - sortableColumns[activeColumnIdx].order = sorting.order; + const initialized = getContext("initialized"); + + function loadSortables(isInitialized) { + if (!isInitialized) return; + sortableColumns = [ + { field: "startTime", type: "col", text: "Start Time", order: "DESC" }, + { field: "duration", type: "col", text: "Duration", order: "DESC" }, + { field: "numNodes", type: "col", text: "Number of Nodes", order: "DESC" }, + { field: "numHwthreads", type: "col", text: "Number of HWThreads", order: "DESC" }, + { field: "numAcc", type: "col", text: "Number of Accelerators", order: "DESC" }, + ...getSortItems() + ] + } + + function loadActiveIndex(isInitialized) { + if (!isInitialized) return; + activeColumnIdx = sortableColumns.findIndex( + (col) => col.field == sorting.field, + ); + sortableColumns[activeColumnIdx].order = sorting.order; + } + + $: loadSortables($initialized); + $: loadActiveIndex($initialized) - export function formatTime(t, forNode = false) { + function formatTime(t, forNode = false) { if (t !== null) { if (isNaN(t)) { return t; @@ -15,7 +15,7 @@ } } - export function timeIncrs(timestep, maxX, forNode) { + function timeIncrs(timestep, maxX, forNode) { if (forNode === true) { return [60, 300, 900, 1800, 3600, 7200, 14400, 21600]; // forNode fixed increments } else { @@ -27,93 +27,63 @@ } } - export function findThresholds( + // removed arg "subcluster": input metricconfig and topology now directly derived from subcluster + function findThresholds( + subClusterTopology, metricConfig, scope, - subCluster, isShared, numhwthreads, numaccs ) { - // console.log('NAME ' + metricConfig.name + ' / SCOPE ' + scope + ' / SUBCLUSTER ' + subCluster.name) - if (!metricConfig || !scope || !subCluster) { + + if (!subClusterTopology || !metricConfig || !scope) { console.warn("Argument missing for findThresholds!"); return null; } if ( (scope == "node" && isShared == false) || - metricConfig.aggregation == "avg" + metricConfig?.aggregation == "avg" ) { - if (metricConfig.subClusters && metricConfig.subClusters.length === 0) { - // console.log('subClusterConfigs array empty, use metricConfig defaults') return { normal: metricConfig.normal, caution: metricConfig.caution, alert: metricConfig.alert, peak: metricConfig.peak, }; - } else if ( - metricConfig.subClusters && - metricConfig.subClusters.length > 0 - ) { - // console.log('subClusterConfigs found, use subCluster Settings if matching jobs subcluster:') - let forSubCluster = metricConfig.subClusters.find( - (sc) => sc.name == subCluster.name, - ); - if ( - forSubCluster && - forSubCluster.normal && - forSubCluster.caution && - forSubCluster.alert && - forSubCluster.peak - ) - return forSubCluster; - else - return { - normal: metricConfig.normal, - caution: metricConfig.caution, - alert: metricConfig.alert, - peak: metricConfig.peak, - }; - } else { - console.warn("metricConfig.subClusters not found!"); + } + + + if (metricConfig?.aggregation == "sum") { + let divisor = 1 + if (isShared == true) { // Shared + if (numaccs > 0) divisor = subClusterTopology.accelerators.length / numaccs; + else if (numhwthreads > 0) divisor = subClusterTopology.node.length / numhwthreads; + } + else if (scope == 'socket') divisor = subClusterTopology.socket.length; + else if (scope == "core") divisor = subClusterTopology.core.length; + else if (scope == "accelerator") + divisor = subClusterTopology.accelerators.length; + else if (scope == "hwthread") divisor = subClusterTopology.node.length; + else { + // console.log('TODO: how to calc thresholds for ', scope) return null; } + + return { + peak: metricConfig.peak / divisor, + normal: metricConfig.normal / divisor, + caution: metricConfig.caution / divisor, + alert: metricConfig.alert / divisor, + }; } - if (metricConfig.aggregation != "sum") { - console.warn( - "Missing or unkown aggregation mode (sum/avg) for metric:", - metricConfig, - ); - return null; - } - - let divisor = 1 - if (isShared == true) { // Shared - if (numaccs > 0) divisor = subCluster.topology.accelerators.length / numaccs; - else if (numhwthreads > 0) divisor = subCluster.topology.node.length / numhwthreads; - } - else if (scope == 'socket') divisor = subCluster.topology.socket.length; - else if (scope == "core") divisor = subCluster.topology.core.length; - else if (scope == "accelerator") - divisor = subCluster.topology.accelerators.length; - else if (scope == "hwthread") divisor = subCluster.topology.node.length; - else { - // console.log('TODO: how to calc thresholds for ', scope) - return null; - } - - let mc = - metricConfig?.subClusters?.find((sc) => sc.name == subCluster.name) || - metricConfig; - return { - peak: mc.peak / divisor, - normal: mc.normal / divisor, - caution: mc.caution / divisor, - alert: mc.alert / divisor, - }; + console.warn( + "Missing or unkown aggregation mode (sum/avg) for metric:", + metricConfig, + ); + return null; } @@ -165,7 +135,8 @@ if (useStatsSeries == false && series == null) useStatsSeries = true; - const metricConfig = getContext("metrics")(cluster, metric); + const subClusterTopology = getContext("getHardwareTopology")(cluster, subCluster); + const metricConfig = getContext("getMetricConfig")(cluster, subCluster, metric); const clusterCockpitConfig = getContext("cc-config"); const resizeSleepTime = 250; const normalLineColor = "#000000"; @@ -178,11 +149,9 @@ alert: "rgba(255, 0, 0, 0.3)", }; const thresholds = findThresholds( + subClusterTopology, metricConfig, scope, - typeof subCluster == "string" - ? cluster.subClusters.find((sc) => sc.name == subCluster) - : subCluster, isShared, numhwthreads, numaccs @@ -479,8 +448,6 @@ cursor: { drag: { x: true, y: true } }, }; - // console.log(opts) - let plotWrapper = null; let uplot = null; let timeoutId = null; diff --git a/web/frontend/src/plots/Polar.svelte b/web/frontend/src/plots/Polar.svelte index 59f89f3..ae0e249 100644 --- a/web/frontend/src/plots/Polar.svelte +++ b/web/frontend/src/plots/Polar.svelte @@ -24,10 +24,11 @@ export let metrics export let cluster + export let subCluster export let jobMetrics export let height = 365 - const metricConfig = getContext('metrics') + const getMetricConfig = getContext("getMetricConfig") const labels = metrics.filter(name => { if (!jobMetrics.find(m => m.name == name && m.scope == "node")) { @@ -38,7 +39,7 @@ }) const getValuesForStat = (getStat) => labels.map(name => { - const peak = metricConfig(cluster, name).peak + const peak = getMetricConfig(cluster, subCluster, name).peak const metric = jobMetrics.find(m => m.name == name && m.scope == "node") const value = getStat(metric.metric) / peak return value <= 1. ? value : 1. diff --git a/web/frontend/src/plots/Roofline.svelte b/web/frontend/src/plots/Roofline.svelte index 11d1d25..a05eec3 100644 --- a/web/frontend/src/plots/Roofline.svelte +++ b/web/frontend/src/plots/Roofline.svelte @@ -209,7 +209,6 @@ draw: [ (u) => { // draw roofs when cluster set - // console.log(u) if (cluster != null) { const padding = u._padding; // [top, right, bottom, left] @@ -237,9 +236,6 @@ true, ); - // Debug get zoomLevel from browser - // console.log("Zoom", Math.round(window.devicePixelRatio * 100)) - if ( scalarKneeX < width * window.devicePixelRatio - @@ -323,7 +319,7 @@ }; uplot = new uPlot(opts, plotData, plotWrapper); } else { - console.log("No data for roofline!"); + // console.log("No data for roofline!"); } } diff --git a/web/frontend/src/units.js b/web/frontend/src/units.js index 9a4defd..4c1fea4 100644 --- a/web/frontend/src/units.js +++ b/web/frontend/src/units.js @@ -31,3 +31,4 @@ export function scaleNumbers(x, y , p = '') { return Math.abs(rawYValue) >= 1000 ? `${rawXValue.toExponential()} / ${rawYValue.toExponential()}` : `${rawYValue.toString()} / ${rawYValue.toString()}` } +// export const dateToUnixEpoch = (rfc3339) => Math.floor(Date.parse(rfc3339) / 1000); diff --git a/web/frontend/src/utils.js b/web/frontend/src/utils.js index 3ab86da..7510ace 100644 --- a/web/frontend/src/utils.js +++ b/web/frontend/src/utils.js @@ -6,7 +6,6 @@ import { } from "@urql/svelte"; import { setContext, getContext, hasContext, onDestroy, tick } from "svelte"; import { readable } from "svelte/store"; -// import { formatNumber } from './units.js' /* * Call this function only at component initialization time! @@ -16,7 +15,9 @@ import { readable } from "svelte/store"; * - Creates a readable store 'initialization' which indicates when the values below can be used. * - Adds 'tags' to the context (list of all tags) * - Adds 'clusters' to the context (object with cluster names as keys) - * - Adds 'metrics' to the context, a function that takes a cluster and metric name and returns the MetricConfig (or undefined) + * - Adds 'globalMetrics' to the context (list of globally available metric infos) + * - Adds 'getMetricConfig' to the context, a function that takes a cluster, subCluster and metric name and returns the MetricConfig (or undefined) + * - Adds 'getHardwareTopology' to the context, a function that takes a cluster nad subCluster and returns the subCluster topology (or undefined) */ export function init(extraInitQuery = "") { const jwt = hasContext("jwt") @@ -71,11 +72,19 @@ export function init(extraInitQuery = "") { normal caution alert + lowerIsBetter } footprint } } tags { id, name, type } + globalMetrics { + name + scope + footprint + unit { base, prefix } + availability { cluster, subClusters } + } ${extraInitQuery} }` ) @@ -91,12 +100,13 @@ export function init(extraInitQuery = "") { }; }; - const tags = [], - clusters = []; - const allMetrics = []; + const tags = [] + const clusters = [] + const globalMetrics = [] + setContext("tags", tags); setContext("clusters", clusters); - setContext("allmetrics", allMetrics); + setContext("globalMetrics", globalMetrics); setContext("getMetricConfig", (cluster, subCluster, metric) => { if (typeof cluster !== "object") cluster = clusters.find((c) => c.name == cluster); @@ -106,6 +116,15 @@ export function init(extraInitQuery = "") { return subCluster.metricConfig.find((m) => m.name == metric); }); + setContext("getHardwareTopology", (cluster, subCluster) => { + if (typeof cluster !== "object") + cluster = clusters.find((c) => c.name == cluster); + + if (typeof subCluster !== "object") + subCluster = cluster.subClusters.find((sc) => sc.name == subCluster); + + return subCluster?.topology; + }); setContext("on-init", (callback) => state.fetching ? subscribers.push(callback) : callback(state) ); @@ -124,32 +143,11 @@ export function init(extraInitQuery = "") { } for (let tag of data.tags) tags.push(tag); + for (let cluster of data.clusters) clusters.push(cluster); + for (let gm of data.globalMetrics) globalMetrics.push(gm); - let globalmetrics = []; - for (let cluster of data.clusters) { - // Add full info to context object - clusters.push(cluster); - // Build global metric list with availability for joblist metricselect - for (let subcluster of cluster.subClusters) { - for (let scm of subcluster.metricConfig) { - let match = globalmetrics.find((gm) => gm.name == scm.name); - if (match) { - let submatch = match.availability.find((av) => av.cluster == cluster.name); - if (submatch) { - submatch.subclusters.push(subcluster.name) - } else { - match.availability.push({cluster: cluster.name, subclusters: [subcluster.name]}) - } - } else { - globalmetrics.push({name: scm.name, availability: [{cluster: cluster.name, subclusters: [subcluster.name]}]}); - } - } - } - } - // Add to ctx object - for (let gm of globalmetrics) allMetrics.push(gm); - - console.log('All Metrics List', allMetrics); + // Unified Sort + globalMetrics.sort((a, b) => a.name.localeCompare(b.name)) state.data = data; tick().then(() => subscribers.forEach((cb) => cb(state))); @@ -159,6 +157,7 @@ export function init(extraInitQuery = "") { query: { subscribe }, tags, clusters, + globalMetrics }; } @@ -171,6 +170,11 @@ function fuzzyMatch(term, string) { return string.toLowerCase().includes(term); } +// Use in filter() function to return only unique values +export function distinct(value, index, array) { + return array.indexOf(value) === index; +} + export function fuzzySearchTags(term, tags) { if (!tags) return []; @@ -260,56 +264,6 @@ export function minScope(scopes) { return sm; } -export async function fetchMetrics(job, metrics, scopes) { - if (job.monitoringStatus == 0) return null; - - let query = []; - if (metrics != null) { - for (let metric of metrics) { - query.push(`metric=${metric}`); - } - } - if (scopes != null) { - for (let scope of scopes) { - query.push(`scope=${scope}`); - } - } - - try { - let res = await fetch( - `/frontend/jobs/metrics/${job.id}${query.length > 0 ? "?" : ""}${query.join( - "&" - )}` - ); - if (res.status != 200) { - return { error: { status: res.status, message: await res.text() } }; - } - - return await res.json(); - } catch (e) { - return { error: e }; - } -} - -export function fetchMetricsStore() { - let set = null; - let prev = { fetching: true, error: null, data: null }; - return [ - readable(prev, (_set) => { - set = _set; - }), - (job, metrics, scopes) => - fetchMetrics(job, metrics, scopes).then((res) => { - let next = { fetching: false, error: res.error, data: res.data }; - if (prev.data && next.data) - next.data.jobMetrics.push(...prev.data.jobMetrics); - - prev = next; - set(next); - }), - ]; -} - export function stickyHeader(datatableHeaderSelector, updatePading) { const header = document.querySelector("header > nav.navbar"); if (!header) return; @@ -336,22 +290,98 @@ export function stickyHeader(datatableHeaderSelector, updatePading) { onDestroy(() => document.removeEventListener("scroll", onscroll)); } -// Outdated: Frontend Will Now Receive final MetricList from backend export function checkMetricDisabled(m, c, s) { //[m]etric, [c]luster, [s]ubcluster - const mc = getContext("metrics"); - const thisConfig = mc(c, m); - let thisSCIndex = -1; - if (thisConfig) { - thisSCIndex = thisConfig.subClusters.findIndex( - (subcluster) => subcluster.name == s - ); - }; - if (thisSCIndex >= 0) { - if (thisConfig.subClusters[thisSCIndex].remove == true) { - return true; + const metrics = getContext("globalMetrics"); + const result = metrics?.find((gm) => gm.name === m)?.availability?.find((av) => av.cluster === c)?.subClusters?.includes(s) + return !result +} + +export function getStatsItems() { + // console.time('stats') + // console.log('getStatsItems ...') + const globalMetrics = getContext("globalMetrics") + const result = globalMetrics.map((gm) => { + if (gm?.footprint) { + // Footprint contains suffix naming the used stat-type + // console.time('deep') + // console.log('Deep Config for', gm.name) + const mc = getMetricConfigDeep(gm.name, null, null) + // console.timeEnd('deep') + return { + field: gm.name + '_' + gm.footprint, + text: gm.name + ' (' + gm.footprint + ')', + metric: gm.name, + from: 0, + to: mc.peak, + peak: mc.peak, + enabled: false + } } + return null + }).filter((r) => r != null) + // console.timeEnd('stats') + return [...result]; +}; + +export function getSortItems() { + //console.time('sort') + //console.log('getSortItems ...') + const globalMetrics = getContext("globalMetrics") + const result = globalMetrics.map((gm) => { + if (gm?.footprint) { + // Footprint contains suffix naming the used stat-type + return { + field: gm.name + '_' + gm.footprint, + type: 'foot', + text: gm.name + ' (' + gm.footprint + ')', + order: 'DESC' + } + } + return null + }).filter((r) => r != null) + //console.timeEnd('sort') + return [...result]; +}; + +function getMetricConfigDeep(metric, cluster, subCluster) { + const clusters = getContext("clusters"); + if (cluster != null) { + let c = clusters.find((c) => c.name == cluster); + if (subCluster != null) { + let sc = c.subClusters.find((sc) => sc.name == subCluster); + return sc.metricConfig.find((mc) => mc.name == metric) + } else { + let result; + for (let sc of c.subClusters) { + const mc = sc.metricConfig.find((mc) => mc.name == metric) + if (result) { // If lowerIsBetter: Peak is still maximum value, no special case required + result.alert = (mc.alert > result.alert) ? mc.alert : result.alert + result.caution = (mc.caution > result.caution) ? mc.caution : result.caution + result.normal = (mc.normal > result.normal) ? mc.normal : result.normal + result.peak = (mc.peak > result.peak) ? mc.peak : result.peak + } else { + if (mc) result = {...mc}; + } + } + return result + } + } else { + let result; + for (let c of clusters) { + for (let sc of c.subClusters) { + const mc = sc.metricConfig.find((mc) => mc.name == metric) + if (result) { // If lowerIsBetter: Peak is still maximum value, no special case required + result.alert = (mc.alert > result.alert) ? mc.alert : result.alert + result.caution = (mc.caution > result.caution) ? mc.caution : result.caution + result.normal = (mc.normal > result.normal) ? mc.normal : result.normal + result.peak = (mc.peak > result.peak) ? mc.peak : result.peak + } else { + if (mc) result = {...mc}; + } + } + } + return result } - return false; } export function convert2uplot(canvasData) { @@ -413,14 +443,14 @@ export function binsFromFootprint(weights, scope, values, numBins) { } export function transformDataForRoofline(flopsAny, memBw) { // Uses Metric Objects: {series:[{},{},...], timestep:60, name:$NAME} - const nodes = flopsAny.series.length - const timesteps = flopsAny.series[0].data.length - /* c will contain values from 0 to 1 representing the time */ let data = null const x = [], y = [], c = [] if (flopsAny && memBw) { + const nodes = flopsAny.series.length + const timesteps = flopsAny.series[0].data.length + for (let i = 0; i < nodes; i++) { const flopsData = flopsAny.series[i].data const memBwData = memBw.series[i].data @@ -446,7 +476,7 @@ export function transformDataForRoofline(flopsAny, memBw) { // Uses Metric Objec // Return something to be plotted. The argument shall be the result of the // `nodeMetrics` GraphQL query. -// Remove "hardcoded" here or deemed necessary? +// Hardcoded metric names required for correct render export function transformPerNodeDataForRoofline(nodes) { let data = null const x = [], y = [] diff --git a/web/templates/monitoring/job.tmpl b/web/templates/monitoring/job.tmpl index 1e3b09c..9b344f9 100644 --- a/web/templates/monitoring/job.tmpl +++ b/web/templates/monitoring/job.tmpl @@ -9,8 +9,6 @@