Merge pull request #343 from ClusterCockpit/dev

Changes for Release v1.4.3
Jan Eitzinger 2025-02-25 13:09:54 +01:00 committed by GitHub
commit c562746e5f
17 changed files with 196 additions and 121 deletions

.gitignore vendored — 22 changes
View File

@@ -1,21 +1,23 @@
/cc-backend
/var/job-archive
/var/*.db
/var/machine-state
/.env
/config.json
/var/job-archive
/var/machine-state
/var/job.db-shm
/var/job.db-wal
/var/*.db
/var/*.txt
/web/frontend/public/build
/web/frontend/node_modules
/.vscode/*
/archive-migration
/archive-manager
var/job.db-shm
var/job.db-wal
/internal/repository/testdata/job.db-shm
/internal/repository/testdata/job.db-wal
/.vscode/*
dist/
*.db
internal/repository/testdata/job.db-shm
internal/repository/testdata/job.db-wal

View File

@@ -82,7 +82,7 @@ tags:
	@ctags -R

$(VAR):
-	@mkdir $(VAR)
+	@mkdir -p $(VAR)

config.json:
	$(info ===> Initialize config.json file)

View File

@@ -60,12 +60,13 @@ func ArchiveJob(job *schema.Job, ctx context.Context) (*schema.JobMeta, error) {
			max = math.Max(max, series.Statistics.Max)
		}

+		// Round AVG Result to 2 Digits
		jobMeta.Statistics[metric] = schema.JobStatistics{
			Unit: schema.Unit{
				Prefix: archive.GetMetricConfig(job.Cluster, metric).Unit.Prefix,
				Base:   archive.GetMetricConfig(job.Cluster, metric).Unit.Base,
			},
-			Avg: avg / float64(job.NumNodes),
+			Avg: (math.Round((avg/float64(job.NumNodes))*100) / 100),
			Min: min,
			Max: max,
		}

View File

@@ -303,6 +303,7 @@ func (r *queryResolver) JobMetrics(ctx context.Context, id string, metrics []str
// JobsFootprints is the resolver for the jobsFootprints field.
func (r *queryResolver) JobsFootprints(ctx context.Context, filter []*model.JobFilter, metrics []string) (*model.Footprints, error) {
+	// NOTE: Legacy Naming! This resolver is for normalized histograms in analysis view only - *Not* related to DB "footprint" column!
	return r.jobsFootprints(ctx, filter, metrics)
}

View File

@@ -170,6 +170,9 @@ func LoadData(job *schema.Job,
			jd.AddNodeScope("mem_bw")
		}

+		// Round Resulting Stat Values
+		jd.RoundMetricStats()
+
		return jd, ttl, size
	})

View File

@@ -440,6 +440,23 @@ func (ccms *CCMetricStore) buildQueries(
				continue
			}

+			// Core -> Socket
+			if nativeScope == schema.MetricScopeCore && scope == schema.MetricScopeSocket {
+				sockets, _ := topology.GetSocketsFromCores(hwthreads)
+				for _, socket := range sockets {
+					queries = append(queries, ApiQuery{
+						Metric:     remoteName,
+						Hostname:   host.Hostname,
+						Aggregate:  true,
+						Type:       &coreString,
+						TypeIds:    intToStringSlice(topology.Socket[socket]),
+						Resolution: resolution,
+					})
+					assignedScope = append(assignedScope, scope)
+				}
+				continue
+			}
+
			// Core -> Node
			if nativeScope == schema.MetricScopeCore && scope == schema.MetricScopeNode {
				cores, _ := topology.GetCoresFromHWThreads(hwthreads)
@@ -627,7 +644,7 @@ func (ccms *CCMetricStore) LoadNodeData(
		req.Queries = append(req.Queries, ApiQuery{
			Hostname: node,
			Metric:   ccms.toRemoteName(metric),
-			Resolution: 60, // Default for Node Queries
+			Resolution: 0, // Default for node queries: returns data at the metric's configured $Timestep resolution
		})
	}
}
}
@@ -1038,6 +1055,23 @@ func (ccms *CCMetricStore) buildNodeQueries(
				continue
			}

+			// Core -> Socket
+			if nativeScope == schema.MetricScopeCore && scope == schema.MetricScopeSocket {
+				sockets, _ := topology.GetSocketsFromCores(topology.Node)
+				for _, socket := range sockets {
+					queries = append(queries, ApiQuery{
+						Metric:     remoteName,
+						Hostname:   hostname,
+						Aggregate:  true,
+						Type:       &coreString,
+						TypeIds:    intToStringSlice(topology.Socket[socket]),
+						Resolution: resolution,
+					})
+					assignedScope = append(assignedScope, scope)
+				}
+				continue
+			}
+
			// Core -> Node
			if nativeScope == schema.MetricScopeCore && scope == schema.MetricScopeNode {
				cores, _ := topology.GetCoresFromHWThreads(topology.Node)
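
Review note (not part of the diff): the two new "Core -> Socket" branches above are identical in shape. A minimal, runnable sketch of what they produce, assuming a simplified stand-in ApiQuery struct and a hypothetical two-socket host (the real types live in cc-backend's ccMetricStore):

package main

import (
	"fmt"
	"strconv"
)

// Simplified stand-in for the real ApiQuery (assumption for illustration).
type ApiQuery struct {
	Metric     string
	Hostname   string
	Aggregate  bool
	Type       *string
	TypeIds    []string
	Resolution int
}

func intToStringSlice(ids []int) []string {
	s := make([]string, len(ids))
	for i, id := range ids {
		s[i] = strconv.Itoa(id)
	}
	return s
}

func main() {
	coreString := "core"
	// Hypothetical topology: hwthread IDs per socket.
	socketHWThreads := [][]int{{0, 1, 2, 3}, {4, 5, 6, 7}}

	// One aggregated query per socket, mirroring the loop in the diff:
	// the socket's hwthread IDs become TypeIds, and Aggregate folds the
	// core-native series into a single per-socket series.
	var queries []ApiQuery
	for socket := range socketHWThreads {
		queries = append(queries, ApiQuery{
			Metric:     "mem_bw",
			Hostname:   "node001",
			Aggregate:  true,
			Type:       &coreString,
			TypeIds:    intToStringSlice(socketHWThreads[socket]),
			Resolution: 600, // placeholder; the diff passes the requested resolution through
		})
	}
	for _, q := range queries {
		fmt.Printf("%s@%s type=%s ids=%v\n", q.Metric, q.Hostname, *q.Type, q.TypeIds)
	}
}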

View File

@@ -217,11 +217,6 @@ func (r *JobRepository) UpdateMetadata(job *schema.Job, key, val string) (err er
func (r *JobRepository) FetchFootprint(job *schema.Job) (map[string]float64, error) {
	start := time.Now()
-	cachekey := fmt.Sprintf("footprint:%d", job.ID)
-	if cached := r.cache.Get(cachekey, nil); cached != nil {
-		job.Footprint = cached.(map[string]float64)
-		return job.Footprint, nil
-	}

	if err := sq.Select("job.footprint").From("job").Where("job.id = ?", job.ID).
		RunWith(r.stmtCache).QueryRow().Scan(&job.RawFootprint); err != nil {
@@ -238,7 +233,6 @@ func (r *JobRepository) FetchFootprint(job *schema.Job) (map[string]float64, err
		return nil, err
	}
-	r.cache.Put(cachekey, job.Footprint, len(job.Footprint), 24*time.Hour)

	log.Debugf("Timer FetchFootprint %s", time.Since(start))
	return job.Footprint, nil
}
@@ -606,8 +600,11 @@ func (r *JobRepository) UpdateEnergy(
					// FIXME: Needs sum as stats type
				} else if sc.MetricConfig[i].Energy == "power" { // this metric has power as unit (Watt)
					// Energy: Power (in Watts) * Time (in Seconds)
-					// Unit: ( W * s ) / 3600 / 1000 = kWh ; Rounded to 2 nearest digits
-					energy = math.Round(((LoadJobStat(jobMeta, fp, "avg")*float64(jobMeta.Duration))/3600/1000)*100) / 100
+					// Unit: ((W * s) / 3600) / 1000 = kWh, rounded to 2 digits via math.Round(energy*100)/100
+					// Here: All-Node Metric Average * Number of Nodes * Job Runtime
+					// Note: Shared jobs are handled correctly, since the node average is based on partial resources while the numNodes factor stays 1
+					metricNodeSum := LoadJobStat(jobMeta, fp, "avg") * float64(jobMeta.NumNodes) * float64(jobMeta.Duration)
+					energy = math.Round(((metricNodeSum/3600)/1000)*100) / 100
				}
			} else {
				log.Warnf("Error while collecting energy metric %s for job, DB ID '%v', return '0.0'", fp, jobMeta.ID)
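
Review note (not part of the diff): a worked example of the new formula with hypothetical numbers. A node-average power of 250 W on 4 nodes over 7200 s gives 250 * 4 * 7200 = 7,200,000 W*s; divided by 3600 that is 2000 Wh, divided by 1000 that is 2 kWh:

package main

import (
	"fmt"
	"math"
)

func main() {
	// Hypothetical job: 250 W node-average power, 4 nodes, 7200 s runtime.
	avgPowerWatts := 250.0
	numNodes := 4.0
	durationSec := 7200.0

	// Mirrors UpdateEnergy: (W * s) / 3600 -> Wh, / 1000 -> kWh, rounded to 2 digits.
	metricNodeSum := avgPowerWatts * numNodes * durationSec
	energyKWh := math.Round(((metricNodeSum/3600)/1000)*100) / 100

	fmt.Println(energyKWh) // 2
}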

View File

@@ -94,7 +94,7 @@ func RegisterFootprintWorker() {
					}
				}

-				// Add values rounded to 2 digits
+				// Add values rounded to 2 digits: repo.LoadStats may return unrounded values
				jobMeta.Statistics[metric] = schema.JobStatistics{
					Unit: schema.Unit{
						Prefix: archive.GetMetricConfig(job.Cluster, metric).Unit.Prefix,

View File

@@ -122,6 +122,38 @@ func (topo *Topology) GetSocketsFromHWThreads(
	return sockets, exclusive
}

+// Return a list of socket IDs given a list of core IDs. A socket is added to
+// the list even if only one of the given cores belongs to it. If no cores
+// other than those in the argument list are assigned to any of the returned
+// sockets, the second return value is true. TODO: Optimize this, there
+// must be a more efficient way/algorithm.
+func (topo *Topology) GetSocketsFromCores(
+	cores []int,
+) (sockets []int, exclusive bool) {
+	socketsMap := map[int]int{}
+	for _, core := range cores {
+		for _, hwthreadInCore := range topo.Core[core] {
+			for socket, hwthreadsInSocket := range topo.Socket {
+				for _, hwthreadInSocket := range hwthreadsInSocket {
+					if hwthreadInCore == hwthreadInSocket {
+						socketsMap[socket] += 1
+					}
+				}
+			}
+		}
+	}
+
+	exclusive = true
+	hwthreadsPerSocket := len(topo.Node) / len(topo.Socket)
+	sockets = make([]int, 0, len(socketsMap))
+	for socket, count := range socketsMap {
+		sockets = append(sockets, socket)
+		exclusive = exclusive && count == hwthreadsPerSocket
+	}
+
+	return sockets, exclusive
+}
+
// Return a list of core IDs given a list of hwthread IDs. Even if just one
// hwthread is in that core, add it to the list. If no hwthreads other than
// those in the argument list are assigned to one of the cores in the first
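
Review note (not part of the diff): a self-contained sketch of GetSocketsFromCores on a toy topology. The Topology struct below is a minimal stand-in for the real schema.Topology, assuming the field layout used above (Node = all hwthread IDs, Core/Socket = hwthread IDs per core/socket):

package main

import "fmt"

type Topology struct {
	Node   []int   // all hwthread IDs of the node
	Core   [][]int // hwthread IDs per core
	Socket [][]int // hwthread IDs per socket
}

// Same algorithm as in the diff, trimmed to the essentials.
func (topo *Topology) GetSocketsFromCores(cores []int) (sockets []int, exclusive bool) {
	socketsMap := map[int]int{}
	for _, core := range cores {
		for _, hwthreadInCore := range topo.Core[core] {
			for socket, hwthreadsInSocket := range topo.Socket {
				for _, hwthreadInSocket := range hwthreadsInSocket {
					if hwthreadInCore == hwthreadInSocket {
						socketsMap[socket] += 1
					}
				}
			}
		}
	}
	exclusive = true
	hwthreadsPerSocket := len(topo.Node) / len(topo.Socket)
	for socket, count := range socketsMap {
		sockets = append(sockets, socket)
		exclusive = exclusive && count == hwthreadsPerSocket
	}
	return sockets, exclusive
}

func main() {
	// Toy node: 2 sockets x 2 cores x 2 hwthreads each.
	topo := &Topology{
		Node:   []int{0, 1, 2, 3, 4, 5, 6, 7},
		Core:   [][]int{{0, 1}, {2, 3}, {4, 5}, {6, 7}},
		Socket: [][]int{{0, 1, 2, 3}, {4, 5, 6, 7}},
	}
	fmt.Println(topo.GetSocketsFromCores([]int{0, 1})) // [0] true: cores 0+1 cover all of socket 0
	fmt.Println(topo.GetSocketsFromCores([]int{0, 2})) // e.g. [0 1] false: both sockets only partially used (map order varies)
}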

View File

@@ -291,6 +291,21 @@ func (jd *JobData) AddNodeScope(metric string) bool {
	return true
}

+func (jd *JobData) RoundMetricStats() {
+	// TODO: Make Digit-Precision Configurable? (Currently: Fixed to 2 Digits)
+	for _, scopes := range *jd {
+		for _, jm := range scopes {
+			for index := range jm.Series {
+				jm.Series[index].Statistics = MetricStatistics{
+					Avg: (math.Round(jm.Series[index].Statistics.Avg*100) / 100),
+					Min: (math.Round(jm.Series[index].Statistics.Min*100) / 100),
+					Max: (math.Round(jm.Series[index].Statistics.Max*100) / 100),
+				}
+			}
+		}
+	}
+}
+
func (jm *JobMetric) AddPercentiles(ps []int) bool {
	if jm.StatisticsSeries == nil {
		jm.AddStatisticsSeries()
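
Review note (not part of the diff): RoundMetricStats and the ArchiveJob change above use the same fixed 2-digit rounding idiom, math.Round(x*100)/100. A tiny sketch of its behavior; note that halfway cases round away from zero:

package main

import (
	"fmt"
	"math"
)

// round2 mirrors the fixed 2-digit rounding used in RoundMetricStats.
func round2(x float64) float64 {
	return math.Round(x*100) / 100
}

func main() {
	fmt.Println(round2(3.14159)) // 3.14
	fmt.Println(round2(0.125))   // 0.13 (half rounds away from zero)
	fmt.Println(round2(-0.125))  // -0.13
}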

View File

@@ -446,7 +446,7 @@
    }
  },
  "job_view_selectedMetrics": {
-    "description": "",
+    "description": "Initial metrics shown as plots in single job view",
    "type": "array",
    "items": {
      "type": "string",

View File

@@ -117,27 +117,41 @@
    }
  `;

+  const roofQuery = gql`
+    query ($dbid: ID!, $selectedMetrics: [String!]!, $selectedScopes: [MetricScope!]!, $selectedResolution: Int) {
+      jobMetrics(id: $dbid, metrics: $selectedMetrics, scopes: $selectedScopes, resolution: $selectedResolution) {
+        name
+        scope
+        metric {
+          series {
+            data
+          }
+        }
+      }
+    }
+  `;

  $: jobMetrics = queryStore({
    client: client,
    query: query,
    variables: { dbid, selectedMetrics, selectedScopes },
  });

+  // Roofline: Always load roofMetrics with configured timestep (Resolution: 0)
+  $: roofMetrics = queryStore({
+    client: client,
+    query: roofQuery,
+    variables: { dbid, selectedMetrics: ["flops_any", "mem_bw"], selectedScopes: ["node"], selectedResolution: 0 },
+  });
// Handle job query on init -> not executed again afterwards
getContext("on-init")(() => {
let job = $initq.data.job;
if (!job) return;
const pendingMetrics = [
"flops_any",
"mem_bw",
...(ccconfig[`job_view_selectedMetrics:${job.cluster}`] ||
$initq.data.globalMetrics.reduce((names, gm) => {
if (gm.availability.find((av) => av.cluster === job.cluster)) {
names.push(gm.name);
}
return names;
}, [])
ccconfig[`job_view_selectedMetrics`]
),
...(ccconfig[`job_view_nodestats_selectedMetrics:${job.cluster}`] ||
ccconfig[`job_view_nodestats_selectedMetrics`]
@@ -276,12 +290,12 @@
  <!-- Column 3: Job Roofline; If footprint Enabled: full width, else half width -->
  <Col xs={12} md={12} xl={5} xxl={6}>
-    {#if $initq.error || $jobMetrics.error}
+    {#if $initq.error || $roofMetrics.error}
      <Card body color="danger">
        <p>Initq Error: {$initq.error?.message}</p>
-        <p>jobMetrics Error: {$jobMetrics.error?.message}</p>
+        <p>roofMetrics (jobMetrics) Error: {$roofMetrics.error?.message}</p>
      </Card>
-    {:else if $initq?.data && $jobMetrics?.data}
+    {:else if $initq?.data && $roofMetrics?.data}
      <Card style="height: 400px;">
        <div bind:clientWidth={roofWidth}>
          <Roofline
@@ -292,10 +306,10 @@
            .find((c) => c.name == $initq.data.job.cluster)
            .subClusters.find((sc) => sc.name == $initq.data.job.subCluster)}
          data={transformDataForRoofline(
-            $jobMetrics.data?.jobMetrics?.find(
+            $roofMetrics.data?.jobMetrics?.find(
              (m) => m.name == "flops_any" && m.scope == "node",
            )?.metric,
-            $jobMetrics.data?.jobMetrics?.find(
+            $roofMetrics.data?.jobMetrics?.find(
              (m) => m.name == "mem_bw" && m.scope == "node",
            )?.metric,
          )}

View File

@@ -80,6 +80,7 @@
      : ccconfig.user_view_histogramMetrics || [];

  const client = getContextClient();
+  // Note: nodeMetrics are requested at the configured $timestep resolution
  $: mainQuery = queryStore({
    client: client,
    query: gql`

View File

@@ -77,6 +77,7 @@
    for (let sm of systemMetrics) {
      systemUnits[sm.name] = (sm?.unit?.prefix ? sm.unit.prefix : "") + (sm?.unit?.base ? sm.unit.base : "")
    }
+    if (!selectedMetric) selectedMetric = systemMetrics[0].name
  }

  $: loadMetrics($initialized)

View File

@@ -2,16 +2,12 @@
  @component Polar Plot based on chartJS Radar

  Properties:
-  - `footprintData [Object]?`: job.footprint content, evaluated in regards to peak config in jobSummary.svelte [Default: null]
-  - `metrics [String]?`: Metric names to display as polar plot [Default: null]
-  - `cluster GraphQL.Cluster?`: Cluster Object of the parent job [Default: null]
-  - `subCluster GraphQL.SubCluster?`: SubCluster Object of the parent job [Default: null]
+  - `polarMetrics [Object]?`: Metric names and scaled peak values for rendering polar plot [Default: []]
  - `jobMetrics [GraphQL.JobMetricWithName]?`: Metric data [Default: null]
  - `height Number?`: Plot height [Default: 365]
-->

<script>
-  import { getContext } from 'svelte'
  import { Radar } from 'svelte-chartjs';
  import {
    Chart as ChartJS,
@@ -34,54 +30,37 @@
    LineElement
  );

-  export let footprintData = null;
-  export let metrics = null;
-  export let cluster = null;
-  export let subCluster = null;
+  export let polarMetrics = [];
  export let jobMetrics = null;
  export let height = 350;

-  function getLabels() {
-    if (footprintData) {
-      return footprintData.filter(fpd => {
-        if (!jobMetrics.find(m => m.name == fpd.name && m.scope == "node" || fpd.impact == 4)) {
-          console.warn(`PolarPlot: No metric data for '${fpd.name}'`)
-          return false
-        }
-        return true
-      })
-      .map(filtered => filtered.name)
-      .sort(function (a, b) {
-        return ((a > b) ? 1 : ((b > a) ? -1 : 0));
-      });
+  const labels = polarMetrics
+    .filter((m) => (m.peak != null))
+    .map(pm => pm.name)
+    .sort(function (a, b) {return ((a > b) ? 1 : ((b > a) ? -1 : 0))});

+  function loadData(type) {
+    if (!labels) {
+      console.warn("Empty 'polarMetrics' array prop! Cannot render Polar representation.")
+      return []
+    } else {
-      return metrics.filter(name => {
-        if (!jobMetrics.find(m => m.name == name && m.scope == "node")) {
-          console.warn(`PolarPlot: No metric data for '${name}'`)
-          return false
-        }
-        return true
-      })
-      .sort(function (a, b) {
-        return ((a > b) ? 1 : ((b > a) ? -1 : 0));
-      });
+      if (type === 'avg') {
+        return getValues(getAvg)
+      } else if (type === 'max') {
+        return getValues(getMax)
+      } else if (type === 'min') {
+        return getValues(getMin)
+      }
+      console.log('Unknown Type For Polar Data (must be one of [min, max, avg])')
+      return []
+    }
+  }

-  const labels = getLabels();
-  const getMetricConfig = getContext("getMetricConfig");

  // Helpers
-  const getValuesForStatGeneric = (getStat) => labels.map(name => {
-    // TODO: Requires Scaling if Shared Job
-    const peak = getMetricConfig(cluster, subCluster, name).peak
-    const metric = jobMetrics.find(m => m.name == name && m.scope == "node")
-    const value = getStat(metric.metric) / peak
-    return value <= 1. ? value : 1.
-  })
-  const getValuesForStatFootprint = (getStat) => labels.map(name => {
-    // FootprintData 'Peak' is pre-scaled for Shared Jobs in JobSummary Component
-    const peak = footprintData.find(fpd => fpd.name === name).peak
+  const getValues = (getStat) => labels.map(name => {
+    // Peak is adapted and scaled for job shared state
+    const peak = polarMetrics.find(m => m.name == name).peak
    const metric = jobMetrics.find(m => m.name == name && m.scope == "node")
    const value = getStat(metric.metric) / peak
    return value <= 1. ? value : 1.
@@ -108,36 +87,14 @@
    return avg / metric.series.length
  }

-  function loadDataGeneric(type) {
-    if (type === 'avg') {
-      return getValuesForStatGeneric(getAvg)
-    } else if (type === 'max') {
-      return getValuesForStatGeneric(getMax)
-    } else if (type === 'min') {
-      return getValuesForStatGeneric(getMin)
-    }
-    console.log('Unknown Type For Polar Data')
-    return []
-  }
-
-  function loadDataForFootprint(type) {
-    if (type === 'avg') {
-      return getValuesForStatFootprint(getAvg)
-    } else if (type === 'max') {
-      return getValuesForStatFootprint(getMax)
-    } else if (type === 'min') {
-      return getValuesForStatFootprint(getMin)
-    }
-    console.log('Unknown Type For Polar Data')
-    return []
-  }
-
  // Chart JS Objects
  const data = {
    labels: labels,
    datasets: [
      {
        label: 'Max',
-        data: footprintData ? loadDataForFootprint('max') : loadDataGeneric('max'), // Node Scope Only
+        data: loadData('max'), // Node Scope Only
        fill: 1,
        backgroundColor: 'rgba(0, 0, 255, 0.25)',
        borderColor: 'rgb(0, 0, 255)',
@@ -148,7 +105,7 @@
      },
      {
        label: 'Avg',
-        data: footprintData ? loadDataForFootprint('avg') : loadDataGeneric('avg'), // Node Scope Only
+        data: loadData('avg'), // Node Scope Only
        fill: 2,
        backgroundColor: 'rgba(255, 210, 0, 0.25)',
        borderColor: 'rgb(255, 210, 0)',
@@ -159,7 +116,7 @@
      },
      {
        label: 'Min',
-        data: footprintData ? loadDataForFootprint('min') : loadDataGeneric('min'), // Node Scope Only
+        data: loadData('min'), // Node Scope Only
        fill: true,
        backgroundColor: 'rgba(255, 0, 0, 0.25)',
        borderColor: 'rgb(255, 0, 0)',

View File

@@ -30,9 +30,25 @@
  export let height = "400px";

  const ccconfig = getContext("cc-config")
-  const showFootprint = !!ccconfig[`job_view_showFootprint`];
+  const globalMetrics = getContext("globalMetrics")
+  const showFootprintTab = !!ccconfig[`job_view_showFootprint`];

-  const footprintData = job?.footprint?.map((jf) => {
+  // Metrics Configured To Be Footprints For (sub)Cluster
+  const clusterFootprintMetrics = getContext("clusters")
+    .find((c) => c.name == job.cluster)?.subClusters
+    .find((sc) => sc.name == job.subCluster)?.footprint || []
+
+  // Data For Polarplot Will Be Calculated Based On JobMetrics And Thresholds
+  const polarMetrics = globalMetrics.reduce((pms, gm) => {
+    if (clusterFootprintMetrics.includes(gm.name)) {
+      const fmt = findJobFootprintThresholds(job, gm.footprint, getContext("getMetricConfig")(job.cluster, job.subCluster, gm.name));
+      pms.push({ name: gm.name, peak: fmt ? fmt.peak : null });
+    }
+    return pms;
+  }, [])
+
+  // Prepare Job Footprint Data Based On Values Saved In Database
+  const jobFootprintData = !showFootprintTab ? null : job?.footprint?.map((jf) => {
    const fmc = getContext("getMetricConfig")(job.cluster, job.subCluster, jf.name);
    if (fmc) {
      // Unit
@@ -187,16 +203,16 @@
    return res;
  };

-  $: summaryMessages = writeSummary(footprintData)
+  $: summaryMessages = writeSummary(jobFootprintData)
  */
</script>
<Card class="overflow-auto" style="width: {width}; height: {height}">
  <TabContent> <!-- on:tab={(e) => (status = e.detail)} -->
-    {#if showFootprint}
+    {#if showFootprintTab}
      <TabPane tabId="foot" tab="Footprint" active>
        <CardBody>
-          {#each footprintData as fpd, index}
+          {#each jobFootprintData as fpd, index}
            {#if fpd.impact !== 4}
              <div class="mb-1 d-flex justify-content-between">
                <div>&nbsp;<b>{fpd.name} ({fpd.stat})</b></div>
@@ -237,7 +253,7 @@
                >{fpd.message}</Tooltip
              >
            </div>
-            <Row cols={12} class="{(footprintData.length == (index + 1)) ? 'mb-0' : 'mb-2'}">
+            <Row cols={12} class="{(jobFootprintData.length == (index + 1)) ? 'mb-0' : 'mb-2'}">
              {#if fpd.dir}
                <Col xs="1">
                  <Icon name="caret-left-fill" />
@@ -279,10 +295,10 @@
        </CardBody>
      </TabPane>
    {/if}
-    <TabPane tabId="polar" tab="Polar" active={!showFootprint}>
+    <TabPane tabId="polar" tab="Polar" active={!showFootprintTab}>
      <CardBody>
        <Polar
-          {footprintData}
+          {polarMetrics}
          {jobMetrics}
        />
      </CardBody>

View File

@@ -148,17 +148,18 @@
    zoomState = {...pendingZoomState}
  }

-  // Set selected scope to min of returned scopes
+  // On additional scope request
  if (selectedScope == "load-all") {
+    // Push scope data to statsTable (needs to happen in this branch, else newly selected 'Metric.svelte' renders cause a statsTable race condition)
+    const statsTableData = $metricData.data.singleUpdate.filter((x) => x.scope !== "node")
+    if (statsTableData.length > 0) {
+      dispatch("more-loaded", statsTableData);
+    }
+    // Set selected scope to min of returned scopes
    selectedScope = minScope(scopes)
+    nodeOnly = (selectedScope == "node") // "node" is still the only scope after load-all
  }

-  const statsTableData = $metricData.data.singleUpdate.filter((x) => x.scope !== "node")
-  if (statsTableData.length > 0) {
-    dispatch("more-loaded", statsTableData);
-  }
-
  patternMatches = statsPattern.exec(selectedScope)

  if (!patternMatches) {