mirror of
https://github.com/ClusterCockpit/cc-backend
synced 2025-12-31 10:56:15 +01:00
Merge branch 'dev' of github.com:ClusterCockpit/cc-backend into dev
This commit is contained in:
@@ -260,7 +260,10 @@ func (api *NatsAPI) handleNodeState(subject string, data []byte) {
|
|||||||
JobsRunning: node.JobsRunning,
|
JobsRunning: node.JobsRunning,
|
||||||
}
|
}
|
||||||
|
|
||||||
repo.UpdateNodeState(node.Hostname, req.Cluster, &nodeState)
|
if err := repo.UpdateNodeState(node.Hostname, req.Cluster, &nodeState); err != nil {
|
||||||
|
cclog.Errorf("NATS %s: updating node state for %s on %s failed: %v",
|
||||||
|
subject, node.Hostname, req.Cluster, err)
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
cclog.Debugf("NATS %s: updated %d node states for cluster %s", subject, len(req.Nodes), req.Cluster)
|
cclog.Debugf("NATS %s: updated %d node states for cluster %s", subject, len(req.Nodes), req.Cluster)
|
||||||
|
|||||||
@@ -770,21 +770,25 @@ func (ccms *CCMetricStore) LoadNodeData(
|
|||||||
}
|
}
|
||||||
|
|
||||||
mc := archive.GetMetricConfig(cluster, metric)
|
mc := archive.GetMetricConfig(cluster, metric)
|
||||||
hostdata[metric] = append(hostdata[metric], &schema.JobMetric{
|
if mc != nil {
|
||||||
Unit: mc.Unit,
|
hostdata[metric] = append(hostdata[metric], &schema.JobMetric{
|
||||||
Timestep: mc.Timestep,
|
Unit: mc.Unit,
|
||||||
Series: []schema.Series{
|
Timestep: mc.Timestep,
|
||||||
{
|
Series: []schema.Series{
|
||||||
Hostname: query.Hostname,
|
{
|
||||||
Data: qdata.Data,
|
Hostname: query.Hostname,
|
||||||
Statistics: schema.MetricStatistics{
|
Data: qdata.Data,
|
||||||
Avg: float64(qdata.Avg),
|
Statistics: schema.MetricStatistics{
|
||||||
Min: float64(qdata.Min),
|
Avg: float64(qdata.Avg),
|
||||||
Max: float64(qdata.Max),
|
Min: float64(qdata.Min),
|
||||||
|
Max: float64(qdata.Max),
|
||||||
|
},
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
},
|
})
|
||||||
})
|
} else {
|
||||||
|
cclog.Warnf("Metric '%s' not configured for cluster '%s': Skipped in LoadNodeData() Return!", metric, cluster)
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if len(errors) != 0 {
|
if len(errors) != 0 {
|
||||||
|
|||||||
@@ -30,7 +30,8 @@
|
|||||||
Table,
|
Table,
|
||||||
Progress,
|
Progress,
|
||||||
Icon,
|
Icon,
|
||||||
Button
|
Button,
|
||||||
|
Badge
|
||||||
} from "@sveltestrap/sveltestrap";
|
} from "@sveltestrap/sveltestrap";
|
||||||
import Roofline from "./generic/plots/Roofline.svelte";
|
import Roofline from "./generic/plots/Roofline.svelte";
|
||||||
import Pie, { colors } from "./generic/plots/Pie.svelte";
|
import Pie, { colors } from "./generic/plots/Pie.svelte";
|
||||||
@@ -85,7 +86,8 @@
|
|||||||
query: gql`
|
query: gql`
|
||||||
query (
|
query (
|
||||||
$cluster: String!
|
$cluster: String!
|
||||||
$metrics: [String!]
|
$nmetrics: [String!]
|
||||||
|
$cmetrics: [String!]
|
||||||
$from: Time!
|
$from: Time!
|
||||||
$to: Time!
|
$to: Time!
|
||||||
$clusterFrom: Time!
|
$clusterFrom: Time!
|
||||||
@@ -97,7 +99,7 @@
|
|||||||
# Node 5 Minute Averages for Roofline
|
# Node 5 Minute Averages for Roofline
|
||||||
nodeMetrics(
|
nodeMetrics(
|
||||||
cluster: $cluster
|
cluster: $cluster
|
||||||
metrics: $metrics
|
metrics: $nmetrics
|
||||||
from: $from
|
from: $from
|
||||||
to: $to
|
to: $to
|
||||||
) {
|
) {
|
||||||
@@ -106,6 +108,10 @@
|
|||||||
metrics {
|
metrics {
|
||||||
name
|
name
|
||||||
metric {
|
metric {
|
||||||
|
unit {
|
||||||
|
base
|
||||||
|
prefix
|
||||||
|
}
|
||||||
series {
|
series {
|
||||||
statistics {
|
statistics {
|
||||||
avg
|
avg
|
||||||
@@ -114,21 +120,6 @@
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
# Running Job Metric Average for Rooflines
|
|
||||||
jobsMetricStats(filter: $jobFilter, metrics: $metrics) {
|
|
||||||
id
|
|
||||||
jobId
|
|
||||||
duration
|
|
||||||
numNodes
|
|
||||||
numAccelerators
|
|
||||||
subCluster
|
|
||||||
stats {
|
|
||||||
name
|
|
||||||
data {
|
|
||||||
avg
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
# Get Jobs for Per-Node Counts
|
# Get Jobs for Per-Node Counts
|
||||||
jobs(filter: $jobFilter, order: $sorting, page: $paging) {
|
jobs(filter: $jobFilter, order: $sorting, page: $paging) {
|
||||||
items {
|
items {
|
||||||
@@ -175,7 +166,7 @@
|
|||||||
# ClusterMetrics for doubleMetricPlot
|
# ClusterMetrics for doubleMetricPlot
|
||||||
clusterMetrics(
|
clusterMetrics(
|
||||||
cluster: $cluster
|
cluster: $cluster
|
||||||
metrics: $metrics
|
metrics: $cmetrics
|
||||||
from: $clusterFrom
|
from: $clusterFrom
|
||||||
to: $to
|
to: $to
|
||||||
) {
|
) {
|
||||||
@@ -194,7 +185,8 @@
|
|||||||
`,
|
`,
|
||||||
variables: {
|
variables: {
|
||||||
cluster: presetCluster,
|
cluster: presetCluster,
|
||||||
metrics: ["flops_any", "mem_bw"], // Metrics For Cluster Plot and Roofline
|
nmetrics: ["flops_any", "mem_bw", "cpu_power", "acc_power"], // Metrics For Roofline and Stats
|
||||||
|
cmetrics: ["flops_any", "mem_bw"], // Metrics For Cluster Plot
|
||||||
from: from.toISOString(),
|
from: from.toISOString(),
|
||||||
clusterFrom: clusterFrom.toISOString(),
|
clusterFrom: clusterFrom.toISOString(),
|
||||||
to: to.toISOString(),
|
to: to.toISOString(),
|
||||||
@@ -258,6 +250,11 @@
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Get Idle Infos after Sums
|
||||||
|
if (!rawInfos['idleNodes']) rawInfos['idleNodes'] = rawInfos['totalNodes'] - rawInfos['allocatedNodes'];
|
||||||
|
if (!rawInfos['idleCores']) rawInfos['idleCores'] = rawInfos['totalCores'] - rawInfos['allocatedCores'];
|
||||||
|
if (!rawInfos['idleAccs']) rawInfos['idleAccs'] = rawInfos['totalAccs'] - rawInfos['allocatedAccs'];
|
||||||
|
|
||||||
// Keymetrics (Data on Cluster-Scope)
|
// Keymetrics (Data on Cluster-Scope)
|
||||||
let rawFlops = $statusQuery?.data?.nodeMetrics?.reduce((sum, node) =>
|
let rawFlops = $statusQuery?.data?.nodeMetrics?.reduce((sum, node) =>
|
||||||
sum + (node.metrics.find((m) => m.name == 'flops_any')?.metric?.series[0]?.statistics?.avg || 0),
|
sum + (node.metrics.find((m) => m.name == 'flops_any')?.metric?.series[0]?.statistics?.avg || 0),
|
||||||
@@ -271,6 +268,26 @@
|
|||||||
) || 0;
|
) || 0;
|
||||||
rawInfos['memBwRate'] = Math.floor((rawMemBw * 100) / 100)
|
rawInfos['memBwRate'] = Math.floor((rawMemBw * 100) / 100)
|
||||||
|
|
||||||
|
let rawCpuPwr = $statusQuery?.data?.nodeMetrics?.reduce((sum, node) =>
|
||||||
|
sum + (node.metrics.find((m) => m.name == 'cpu_power')?.metric?.series[0]?.statistics?.avg || 0),
|
||||||
|
0, // Initial Value
|
||||||
|
) || 0;
|
||||||
|
rawInfos['cpuPwr'] = Math.floor((rawCpuPwr * 100) / 100)
|
||||||
|
if (!rawInfos['cpuPwrUnit']) {
|
||||||
|
let rawCpuUnit = $statusQuery?.data?.nodeMetrics[0]?.metrics.find((m) => m.name == 'cpu_power')?.metric?.unit || null
|
||||||
|
rawInfos['cpuPwrUnit'] = rawCpuUnit ? rawCpuUnit.prefix + rawCpuUnit.base : ''
|
||||||
|
}
|
||||||
|
|
||||||
|
let rawGpuPwr = $statusQuery?.data?.nodeMetrics?.reduce((sum, node) =>
|
||||||
|
sum + (node.metrics.find((m) => m.name == 'acc_power')?.metric?.series[0]?.statistics?.avg || 0),
|
||||||
|
0, // Initial Value
|
||||||
|
) || 0;
|
||||||
|
rawInfos['gpuPwr'] = Math.floor((rawGpuPwr * 100) / 100)
|
||||||
|
if (!rawInfos['gpuPwrUnit']) {
|
||||||
|
let rawGpuUnit = $statusQuery?.data?.nodeMetrics[0]?.metrics.find((m) => m.name == 'acc_power')?.metric?.unit || null
|
||||||
|
rawInfos['gpuPwrUnit'] = rawGpuUnit ? rawGpuUnit.prefix + rawGpuUnit.base : ''
|
||||||
|
}
|
||||||
|
|
||||||
return rawInfos
|
return rawInfos
|
||||||
} else {
|
} else {
|
||||||
return {};
|
return {};
|
||||||
@@ -338,7 +355,7 @@
|
|||||||
</script>
|
</script>
|
||||||
|
|
||||||
<Card style="height: 98vh;">
|
<Card style="height: 98vh;">
|
||||||
<CardBody class="align-content-center p-1">
|
<CardBody class="align-content-center p-2">
|
||||||
<Row>
|
<Row>
|
||||||
<Col>
|
<Col>
|
||||||
<Refresher
|
<Refresher
|
||||||
@@ -408,79 +425,99 @@
|
|||||||
<Col> <!-- Utilization Info Card -->
|
<Col> <!-- Utilization Info Card -->
|
||||||
<Card class="h-100">
|
<Card class="h-100">
|
||||||
<CardBody>
|
<CardBody>
|
||||||
<Table borderless>
|
<Row class="mb-1">
|
||||||
<tr class="py-2">
|
<Col xs={4} class="d-inline-flex align-items-center justify-content-center">
|
||||||
<td style="font-size:x-large;">{clusterInfo?.runningJobs} Running Jobs</td>
|
<Badge color="primary" style="font-size:x-large;margin-right:0.25rem;">
|
||||||
<td colspan="2" style="font-size:x-large;">{clusterInfo?.activeUsers} Active Users</td>
|
{clusterInfo?.runningJobs}
|
||||||
</tr>
|
</Badge>
|
||||||
<hr class="my-1"/>
|
<div style="font-size:large;">
|
||||||
<tr class="pt-2">
|
Running Jobs
|
||||||
<td style="font-size: large;">
|
</div>
|
||||||
Flop Rate (<span style="cursor: help;" title="Flops[Any] = (Flops[Double] x 2) + Flops[Single]">Any</span>)
|
</Col>
|
||||||
</td>
|
<Col xs={4} class="d-inline-flex align-items-center justify-content-center">
|
||||||
<td colspan="2" style="font-size: large;">
|
<Badge color="primary" style="font-size:x-large;margin-right:0.25rem;">
|
||||||
Memory BW Rate
|
{clusterInfo?.activeUsers}
|
||||||
</td>
|
</Badge>
|
||||||
</tr>
|
<div style="font-size:large;">
|
||||||
<tr class="pb-2">
|
Active Users
|
||||||
<td style="font-size:x-large;">
|
</div>
|
||||||
{clusterInfo?.flopRate}
|
</Col>
|
||||||
{clusterInfo?.flopRateUnit}
|
<Col xs={4} class="d-inline-flex align-items-center justify-content-center">
|
||||||
</td>
|
<Badge color="primary" style="font-size:x-large;margin-right:0.25rem;">
|
||||||
<td colspan="2" style="font-size:x-large;">
|
{clusterInfo?.allocatedNodes}
|
||||||
{clusterInfo?.memBwRate}
|
</Badge>
|
||||||
{clusterInfo?.memBwRateUnit}
|
<div style="font-size:large;">
|
||||||
</td>
|
Active Nodes
|
||||||
</tr>
|
</div>
|
||||||
<hr class="my-1"/>
|
</Col>
|
||||||
<tr class="py-2">
|
</Row>
|
||||||
<th scope="col">Allocated Nodes</th>
|
<Row class="mt-1 mb-2">
|
||||||
<td style="min-width: 100px;"
|
<Col xs={4} class="d-inline-flex align-items-center justify-content-center">
|
||||||
><div class="col">
|
<Badge color="secondary" style="font-size:x-large;margin-right:0.25rem;">
|
||||||
<Progress
|
{clusterInfo?.flopRate} {clusterInfo?.flopRateUnit}
|
||||||
value={clusterInfo?.allocatedNodes}
|
</Badge>
|
||||||
max={clusterInfo?.totalNodes}
|
<div style="font-size:large;">
|
||||||
/>
|
Total Flop Rate
|
||||||
</div></td
|
</div>
|
||||||
>
|
</Col>
|
||||||
<td
|
<Col xs={4} class="d-inline-flex align-items-center justify-content-center">
|
||||||
>{clusterInfo?.allocatedNodes} / {clusterInfo?.totalNodes}
|
<Badge color="secondary" style="font-size:x-large;margin-right:0.25rem;">
|
||||||
Nodes</td
|
{clusterInfo?.memBwRate} {clusterInfo?.memBwRateUnit}
|
||||||
>
|
</Badge>
|
||||||
</tr>
|
<div style="font-size:large;">
|
||||||
<tr class="py-2">
|
Total Memory Bandwidth
|
||||||
<th scope="col">Allocated Cores</th>
|
</div>
|
||||||
<td style="min-width: 100px;"
|
</Col>
|
||||||
><div class="col">
|
|
||||||
<Progress
|
|
||||||
value={clusterInfo?.allocatedCores}
|
|
||||||
max={clusterInfo?.totalCores}
|
|
||||||
/>
|
|
||||||
</div></td
|
|
||||||
>
|
|
||||||
<td
|
|
||||||
>{formatNumber(clusterInfo?.allocatedCores)} / {formatNumber(clusterInfo?.totalCores)}
|
|
||||||
Cores</td
|
|
||||||
>
|
|
||||||
</tr>
|
|
||||||
{#if clusterInfo?.totalAccs !== 0}
|
{#if clusterInfo?.totalAccs !== 0}
|
||||||
<tr class="py-2">
|
<Col xs={4} class="d-inline-flex align-items-center justify-content-center">
|
||||||
<th scope="col">Allocated Accelerators</th>
|
<Badge color="secondary" style="font-size:x-large;margin-right:0.25rem;">
|
||||||
<td style="min-width: 100px;"
|
{clusterInfo?.gpuPwr} {clusterInfo?.gpuPwrUnit}
|
||||||
><div class="col">
|
</Badge>
|
||||||
<Progress
|
<div style="font-size:large;">
|
||||||
value={clusterInfo?.allocatedAccs}
|
Total GPU Power
|
||||||
max={clusterInfo?.totalAccs}
|
</div>
|
||||||
/>
|
</Col>
|
||||||
</div></td
|
{:else}
|
||||||
>
|
<Col xs={4} class="d-inline-flex align-items-center justify-content-center">
|
||||||
<td
|
<Badge color="secondary" style="font-size:x-large;margin-right:0.25rem;">
|
||||||
>{clusterInfo?.allocatedAccs} / {clusterInfo?.totalAccs}
|
{clusterInfo?.cpuPwr} {clusterInfo?.cpuPwrUnit}
|
||||||
Accelerators</td
|
</Badge>
|
||||||
>
|
<div style="font-size:large;">
|
||||||
</tr>
|
Total CPU Power
|
||||||
|
</div>
|
||||||
|
</Col>
|
||||||
{/if}
|
{/if}
|
||||||
</Table>
|
</Row>
|
||||||
|
<Row class="my-1 align-items-baseline">
|
||||||
|
<Col xs={2} style="font-size:large;">
|
||||||
|
Active Cores
|
||||||
|
</Col>
|
||||||
|
<Col xs={8}>
|
||||||
|
<Progress multi style="height:2.5rem;font-size:x-large;">
|
||||||
|
<Progress bar color="success" value={clusterInfo?.allocatedCores}>{formatNumber(clusterInfo?.allocatedCores)}</Progress>
|
||||||
|
<Progress bar color="light" value={clusterInfo?.idleCores}>{formatNumber(clusterInfo?.idleCores)}</Progress>
|
||||||
|
</Progress>
|
||||||
|
</Col>
|
||||||
|
<Col xs={2} style="font-size:large;">
|
||||||
|
Idle Cores
|
||||||
|
</Col>
|
||||||
|
</Row>
|
||||||
|
{#if clusterInfo?.totalAccs !== 0}
|
||||||
|
<Row class="my-1 align-items-baseline">
|
||||||
|
<Col xs={2} style="font-size:large;">
|
||||||
|
Active GPU
|
||||||
|
</Col>
|
||||||
|
<Col xs={8}>
|
||||||
|
<Progress multi style="height:2.5rem;font-size:x-large;">
|
||||||
|
<Progress bar color="success" value={clusterInfo?.allocatedAccs}>{formatNumber(clusterInfo?.allocatedAccs)}</Progress>
|
||||||
|
<Progress bar color="light" value={clusterInfo?.idleAccs}>{formatNumber(clusterInfo?.idleAccs)}</Progress>
|
||||||
|
</Progress>
|
||||||
|
</Col>
|
||||||
|
<Col xs={2} style="font-size:large;">
|
||||||
|
Idle GPU
|
||||||
|
</Col>
|
||||||
|
</Row>
|
||||||
|
{/if}
|
||||||
</CardBody>
|
</CardBody>
|
||||||
</Card>
|
</Card>
|
||||||
</Col>
|
</Col>
|
||||||
@@ -506,7 +543,7 @@
|
|||||||
useColors={false}
|
useColors={false}
|
||||||
useLegend={false}
|
useLegend={false}
|
||||||
allowSizeChange
|
allowSizeChange
|
||||||
width={colWidthRoof - 10}
|
width={colWidthRoof}
|
||||||
height={300}
|
height={300}
|
||||||
cluster={presetCluster}
|
cluster={presetCluster}
|
||||||
subCluster={clusterInfo?.roofData ? clusterInfo.roofData : null}
|
subCluster={clusterInfo?.roofData ? clusterInfo.roofData : null}
|
||||||
@@ -574,8 +611,8 @@
|
|||||||
{#key $statesTimed?.data?.nodeStatesTimed}
|
{#key $statesTimed?.data?.nodeStatesTimed}
|
||||||
<Stacked
|
<Stacked
|
||||||
data={$statesTimed?.data?.nodeStatesTimed}
|
data={$statesTimed?.data?.nodeStatesTimed}
|
||||||
width={colWidthStacked * 0.95}
|
width={colWidthStacked}
|
||||||
xlabel="Time"
|
height={260}
|
||||||
ylabel="Nodes"
|
ylabel="Nodes"
|
||||||
yunit = "#Count"
|
yunit = "#Count"
|
||||||
title = "Cluster Status"
|
title = "Cluster Status"
|
||||||
|
|||||||
@@ -23,7 +23,7 @@
|
|||||||
width = 0,
|
width = 0,
|
||||||
height = 300,
|
height = 300,
|
||||||
data = null,
|
data = null,
|
||||||
xlabel = "",
|
xlabel = null,
|
||||||
ylabel = "",
|
ylabel = "",
|
||||||
yunit = "",
|
yunit = "",
|
||||||
title = "",
|
title = "",
|
||||||
|
|||||||
@@ -3,7 +3,7 @@
|
|||||||
*/
|
*/
|
||||||
|
|
||||||
const power = [1, 1e3, 1e6, 1e9, 1e12, 1e15, 1e18, 1e21]
|
const power = [1, 1e3, 1e6, 1e9, 1e12, 1e15, 1e18, 1e21]
|
||||||
const prefix = ['', 'K', 'M', 'G', 'T', 'P', 'E']
|
const prefix = ['', 'k', 'M', 'G', 'T', 'P', 'E']
|
||||||
|
|
||||||
export function formatNumber(x) {
|
export function formatNumber(x) {
|
||||||
if ( isNaN(x) || x == null) {
|
if ( isNaN(x) || x == null) {
|
||||||
|
|||||||
@@ -355,7 +355,7 @@
|
|||||||
|
|
||||||
</script>
|
</script>
|
||||||
|
|
||||||
<Card style="height: 88vh;">
|
<Card>
|
||||||
<CardBody class="align-content-center">
|
<CardBody class="align-content-center">
|
||||||
<Row>
|
<Row>
|
||||||
<Col>
|
<Col>
|
||||||
@@ -540,7 +540,7 @@
|
|||||||
<Roofline
|
<Roofline
|
||||||
useColors={true}
|
useColors={true}
|
||||||
allowSizeChange
|
allowSizeChange
|
||||||
width={colWidthRoof - 10}
|
width={colWidthRoof}
|
||||||
height={300}
|
height={300}
|
||||||
subCluster={clusterInfo?.roofData ? clusterInfo.roofData : null}
|
subCluster={clusterInfo?.roofData ? clusterInfo.roofData : null}
|
||||||
roofData={transformJobsStatsToData($statusQuery?.data?.jobsMetricStats)}
|
roofData={transformJobsStatsToData($statusQuery?.data?.jobsMetricStats)}
|
||||||
@@ -568,7 +568,8 @@
|
|||||||
{#key $statesTimed?.data?.nodeStates}
|
{#key $statesTimed?.data?.nodeStates}
|
||||||
<Stacked
|
<Stacked
|
||||||
data={$statesTimed?.data?.nodeStates}
|
data={$statesTimed?.data?.nodeStates}
|
||||||
width={colWidthStacked1 * 0.95}
|
width={colWidthStacked1}
|
||||||
|
height={330}
|
||||||
xlabel="Time"
|
xlabel="Time"
|
||||||
ylabel="Nodes"
|
ylabel="Nodes"
|
||||||
yunit = "#Count"
|
yunit = "#Count"
|
||||||
@@ -584,7 +585,8 @@
|
|||||||
{#key $statesTimed?.data?.healthStates}
|
{#key $statesTimed?.data?.healthStates}
|
||||||
<Stacked
|
<Stacked
|
||||||
data={$statesTimed?.data?.healthStates}
|
data={$statesTimed?.data?.healthStates}
|
||||||
width={colWidthStacked2 * 0.95}
|
width={colWidthStacked2}
|
||||||
|
height={330}
|
||||||
xlabel="Time"
|
xlabel="Time"
|
||||||
ylabel="Nodes"
|
ylabel="Nodes"
|
||||||
yunit = "#Count"
|
yunit = "#Count"
|
||||||
|
|||||||
Reference in New Issue
Block a user