mirror of
https://github.com/ClusterCockpit/cc-backend
synced 2026-02-11 21:41:46 +01:00
@@ -1,2 +1,3 @@
|
|||||||
vasp
|
vasp_gam
|
||||||
VASP
|
vasp_ncl
|
||||||
|
vasp_std
|
||||||
|
|||||||
@@ -15,6 +15,7 @@ import (
|
|||||||
"github.com/ClusterCockpit/cc-backend/internal/repository"
|
"github.com/ClusterCockpit/cc-backend/internal/repository"
|
||||||
"github.com/ClusterCockpit/cc-backend/pkg/archive"
|
"github.com/ClusterCockpit/cc-backend/pkg/archive"
|
||||||
"github.com/ClusterCockpit/cc-backend/pkg/metricstore"
|
"github.com/ClusterCockpit/cc-backend/pkg/metricstore"
|
||||||
|
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
|
||||||
"github.com/ClusterCockpit/cc-lib/v2/schema"
|
"github.com/ClusterCockpit/cc-lib/v2/schema"
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -81,6 +82,8 @@ func (api *RestAPI) updateNodeStates(rw http.ResponseWriter, r *http.Request) {
|
|||||||
m := make(map[string][]string)
|
m := make(map[string][]string)
|
||||||
healthStates := make(map[string]schema.MonitoringState)
|
healthStates := make(map[string]schema.MonitoringState)
|
||||||
|
|
||||||
|
startMs := time.Now()
|
||||||
|
|
||||||
for _, node := range req.Nodes {
|
for _, node := range req.Nodes {
|
||||||
if sc, err := archive.GetSubClusterByNode(req.Cluster, node.Hostname); err == nil {
|
if sc, err := archive.GetSubClusterByNode(req.Cluster, node.Hostname); err == nil {
|
||||||
m[sc] = append(m[sc], node.Hostname)
|
m[sc] = append(m[sc], node.Hostname)
|
||||||
@@ -97,6 +100,9 @@ func (api *RestAPI) updateNodeStates(rw http.ResponseWriter, r *http.Request) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
cclog.Infof("Timer updateNodeStates, MemStore HealthCheck: %s", time.Since(startMs))
|
||||||
|
startDb := time.Now()
|
||||||
|
|
||||||
for _, node := range req.Nodes {
|
for _, node := range req.Nodes {
|
||||||
state := determineState(node.States)
|
state := determineState(node.States)
|
||||||
healthState := schema.MonitoringStateFailed
|
healthState := schema.MonitoringStateFailed
|
||||||
@@ -115,4 +121,6 @@ func (api *RestAPI) updateNodeStates(rw http.ResponseWriter, r *http.Request) {
|
|||||||
|
|
||||||
repo.UpdateNodeState(node.Hostname, req.Cluster, &nodeState)
|
repo.UpdateNodeState(node.Hostname, req.Cluster, &nodeState)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
cclog.Infof("Timer updateNodeStates, SQLite Inserts: %s", time.Since(startDb))
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -81,7 +81,7 @@ func (api *RestAPI) MountAPIRoutes(r *mux.Router) {
|
|||||||
// Cluster List
|
// Cluster List
|
||||||
r.HandleFunc("/clusters/", api.getClusters).Methods(http.MethodGet)
|
r.HandleFunc("/clusters/", api.getClusters).Methods(http.MethodGet)
|
||||||
// Slurm node state
|
// Slurm node state
|
||||||
r.HandleFunc("/nodestates/", api.updateNodeStates).Methods(http.MethodPost, http.MethodPut)
|
r.HandleFunc("/nodestate/", api.updateNodeStates).Methods(http.MethodPost, http.MethodPut)
|
||||||
// Job Handler
|
// Job Handler
|
||||||
if config.Keys.APISubjects == nil {
|
if config.Keys.APISubjects == nil {
|
||||||
cclog.Info("Enabling REST start/stop job API")
|
cclog.Info("Enabling REST start/stop job API")
|
||||||
|
|||||||
@@ -198,7 +198,6 @@ func (r *NodeRepository) UpdateNodeState(hostname string, cluster string, nodeSt
|
|||||||
}
|
}
|
||||||
|
|
||||||
cclog.Debugf("Added node '%s' to database", hostname)
|
cclog.Debugf("Added node '%s' to database", hostname)
|
||||||
return nil
|
|
||||||
} else {
|
} else {
|
||||||
cclog.Warnf("Error while querying node '%v' from database", id)
|
cclog.Warnf("Error while querying node '%v' from database", id)
|
||||||
return err
|
return err
|
||||||
|
|||||||
@@ -11,6 +11,7 @@ import (
|
|||||||
"slices"
|
"slices"
|
||||||
"time"
|
"time"
|
||||||
|
|
||||||
|
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
|
||||||
"github.com/ClusterCockpit/cc-lib/v2/schema"
|
"github.com/ClusterCockpit/cc-lib/v2/schema"
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -244,7 +245,16 @@ func (m *MemoryStore) HealthCheck(cluster string,
|
|||||||
|
|
||||||
missingCount = len(missingList)
|
missingCount = len(missingList)
|
||||||
degradedCount = len(degradedList)
|
degradedCount = len(degradedList)
|
||||||
healthyCount = len(expectedMetrics) - (missingCount + degradedCount)
|
uniqueList := mergeList(missingList, degradedList)
|
||||||
|
healthyCount = len(expectedMetrics) - len(uniqueList)
|
||||||
|
|
||||||
|
// Debug log missing and degraded metrics
|
||||||
|
if missingCount > 0 {
|
||||||
|
cclog.ComponentDebug("metricstore", "HealthCheck: node", hostname, "missing metrics:", missingList)
|
||||||
|
}
|
||||||
|
if degradedCount > 0 {
|
||||||
|
cclog.ComponentDebug("metricstore", "HealthCheck: node", hostname, "degraded metrics:", degradedList)
|
||||||
|
}
|
||||||
|
|
||||||
// Determine overall health status
|
// Determine overall health status
|
||||||
if missingCount > 0 || degradedCount > 0 {
|
if missingCount > 0 || degradedCount > 0 {
|
||||||
|
|||||||
@@ -319,11 +319,11 @@
|
|||||||
<tr>
|
<tr>
|
||||||
<td>
|
<td>
|
||||||
{#if type == "USER"}
|
{#if type == "USER"}
|
||||||
<a href="/monitoring/user/{row.id}"
|
<a href="/monitoring/user/{row.id}{fetchRunning ? '?state=running' : ''}"
|
||||||
>{scrambleNames ? scramble(row.id) : row.id}</a
|
>{scrambleNames ? scramble(row.id) : row.id}</a
|
||||||
>
|
>
|
||||||
{:else if type == "PROJECT"}
|
{:else if type == "PROJECT"}
|
||||||
<a href="/monitoring/jobs/?project={row.id}"
|
<a href="/monitoring/jobs/?project={row.id}{fetchRunning ? '&state=running' : ''}"
|
||||||
>{scrambleNames ? scramble(row.id) : row.id}</a
|
>{scrambleNames ? scramble(row.id) : row.id}</a
|
||||||
>
|
>
|
||||||
{:else}
|
{:else}
|
||||||
|
|||||||
@@ -19,7 +19,8 @@
|
|||||||
Spinner,
|
Spinner,
|
||||||
Input,
|
Input,
|
||||||
InputGroup,
|
InputGroup,
|
||||||
InputGroupText
|
InputGroupText,
|
||||||
|
Tooltip
|
||||||
} from "@sveltestrap/sveltestrap";
|
} from "@sveltestrap/sveltestrap";
|
||||||
import {
|
import {
|
||||||
queryStore,
|
queryStore,
|
||||||
@@ -32,6 +33,10 @@
|
|||||||
scramble,
|
scramble,
|
||||||
scrambleNames,
|
scrambleNames,
|
||||||
} from "./generic/utils.js";
|
} from "./generic/utils.js";
|
||||||
|
import {
|
||||||
|
formatNumber,
|
||||||
|
formatDurationTime
|
||||||
|
} from "./generic/units.js";
|
||||||
import JobList from "./generic/JobList.svelte";
|
import JobList from "./generic/JobList.svelte";
|
||||||
import JobCompare from "./generic/JobCompare.svelte";
|
import JobCompare from "./generic/JobCompare.svelte";
|
||||||
import Filters from "./generic/Filters.svelte";
|
import Filters from "./generic/Filters.svelte";
|
||||||
@@ -56,6 +61,7 @@
|
|||||||
const durationBinOptions = ["1m","10m","1h","6h","12h"];
|
const durationBinOptions = ["1m","10m","1h","6h","12h"];
|
||||||
const metricBinOptions = [10, 20, 50, 100];
|
const metricBinOptions = [10, 20, 50, 100];
|
||||||
const matchedJobCompareLimit = 500;
|
const matchedJobCompareLimit = 500;
|
||||||
|
const shortDuration = ccconfig.jobList_hideShortRunningJobs; // Always configured
|
||||||
|
|
||||||
/* State Init */
|
/* State Init */
|
||||||
// List & Control Vars
|
// List & Control Vars
|
||||||
@@ -108,6 +114,7 @@
|
|||||||
shortJobs
|
shortJobs
|
||||||
totalWalltime
|
totalWalltime
|
||||||
totalCoreHours
|
totalCoreHours
|
||||||
|
totalAccHours
|
||||||
histDuration {
|
histDuration {
|
||||||
count
|
count
|
||||||
value
|
value
|
||||||
@@ -133,6 +140,7 @@
|
|||||||
variables: { jobFilters, selectedHistograms, numDurationBins, numMetricBins },
|
variables: { jobFilters, selectedHistograms, numDurationBins, numMetricBins },
|
||||||
})
|
})
|
||||||
);
|
);
|
||||||
|
const hasAccHours = $derived($stats?.data?.jobsStatistics[0]?.totalAccHours != 0);
|
||||||
|
|
||||||
/* Functions */
|
/* Functions */
|
||||||
function resetJobSelection() {
|
function resetJobSelection() {
|
||||||
@@ -290,20 +298,54 @@
|
|||||||
{/if}
|
{/if}
|
||||||
<tr>
|
<tr>
|
||||||
<th scope="row">Total Jobs</th>
|
<th scope="row">Total Jobs</th>
|
||||||
<td>{$stats.data.jobsStatistics[0].totalJobs}</td>
|
<td>
|
||||||
|
<span style="cursor: help;" title="{$stats.data.jobsStatistics[0].totalJobs} Jobs">
|
||||||
|
{formatNumber($stats.data.jobsStatistics[0].totalJobs)} Jobs
|
||||||
|
</span>
|
||||||
|
</td>
|
||||||
</tr>
|
</tr>
|
||||||
<tr>
|
<tr>
|
||||||
<th scope="row">Short Jobs</th>
|
<th scope="row">
|
||||||
<td>{$stats.data.jobsStatistics[0].shortJobs}</td>
|
<span class="mr-1">
|
||||||
|
Short Jobs
|
||||||
|
<Icon name="info-circle" id="shortjobs-info" style="margin-left:5px; cursor:help;"/>
|
||||||
|
</span>
|
||||||
|
<Tooltip target={`shortjobs-info`} placement="right">
|
||||||
|
Job duration less than {formatDurationTime(shortDuration)}
|
||||||
|
</Tooltip>
|
||||||
|
</th>
|
||||||
|
<td>
|
||||||
|
<span style="cursor: help;" title="{$stats.data.jobsStatistics[0].shortJobs} Jobs">
|
||||||
|
{formatNumber($stats.data.jobsStatistics[0].shortJobs)} Jobs
|
||||||
|
</span>
|
||||||
|
</td>
|
||||||
</tr>
|
</tr>
|
||||||
<tr>
|
<tr>
|
||||||
<th scope="row">Total Walltime</th>
|
<th scope="row">Total Walltime</th>
|
||||||
<td>{$stats.data.jobsStatistics[0].totalWalltime}</td>
|
<td>
|
||||||
|
<span style="cursor: help;" title="{$stats.data.jobsStatistics[0].totalWalltime} Hours">
|
||||||
|
{formatNumber($stats.data.jobsStatistics[0].totalWalltime)} Hours
|
||||||
|
</span>
|
||||||
|
</td>
|
||||||
</tr>
|
</tr>
|
||||||
<tr>
|
<tr>
|
||||||
<th scope="row">Total Core Hours</th>
|
<th scope="row">Total Core Hours</th>
|
||||||
<td>{$stats.data.jobsStatistics[0].totalCoreHours}</td>
|
<td>
|
||||||
|
<span style="cursor: help;" title="{$stats.data.jobsStatistics[0].totalCoreHours} Hours">
|
||||||
|
{formatNumber($stats.data.jobsStatistics[0].totalCoreHours)} Hours
|
||||||
|
</span>
|
||||||
|
</td>
|
||||||
</tr>
|
</tr>
|
||||||
|
{#if hasAccHours}
|
||||||
|
<tr>
|
||||||
|
<th scope="row">Total Accelerator Hours</th>
|
||||||
|
<td>
|
||||||
|
<span style="cursor: help;" title="{$stats.data.jobsStatistics[0].totalAccHours} Hours">
|
||||||
|
{formatNumber($stats.data.jobsStatistics[0].totalAccHours)} Hours
|
||||||
|
</span>
|
||||||
|
</td>
|
||||||
|
</tr>
|
||||||
|
{/if}
|
||||||
</tbody>
|
</tbody>
|
||||||
</Table>
|
</Table>
|
||||||
</Col>
|
</Col>
|
||||||
@@ -316,6 +358,7 @@
|
|||||||
xunit="Runtime"
|
xunit="Runtime"
|
||||||
ylabel="Number of Jobs"
|
ylabel="Number of Jobs"
|
||||||
yunit="Jobs"
|
yunit="Jobs"
|
||||||
|
height={hasAccHours ? 290 : 250}
|
||||||
usesBins
|
usesBins
|
||||||
xtime
|
xtime
|
||||||
/>
|
/>
|
||||||
@@ -330,6 +373,7 @@
|
|||||||
xunit="Nodes"
|
xunit="Nodes"
|
||||||
ylabel="Number of Jobs"
|
ylabel="Number of Jobs"
|
||||||
yunit="Jobs"
|
yunit="Jobs"
|
||||||
|
height={hasAccHours ? 290 : 250}
|
||||||
/>
|
/>
|
||||||
{/key}
|
{/key}
|
||||||
</Col>
|
</Col>
|
||||||
|
|||||||
@@ -79,7 +79,6 @@
|
|||||||
|
|
||||||
/* Derived */
|
/* Derived */
|
||||||
const jobId = $derived(job?.id);
|
const jobId = $derived(job?.id);
|
||||||
const refinedData = $derived($metricsQuery?.data?.jobMetrics ? sortAndSelectScope($metricsQuery.data.jobMetrics) : []);
|
|
||||||
const scopes = $derived.by(() => {
|
const scopes = $derived.by(() => {
|
||||||
if (job.numNodes == 1) {
|
if (job.numNodes == 1) {
|
||||||
if (job.numAcc >= 1) return ["core", "accelerator"];
|
if (job.numAcc >= 1) return ["core", "accelerator"];
|
||||||
@@ -95,6 +94,7 @@
|
|||||||
variables: { id: jobId, metrics, scopes, selectedResolution },
|
variables: { id: jobId, metrics, scopes, selectedResolution },
|
||||||
})
|
})
|
||||||
);
|
);
|
||||||
|
const refinedData = $derived($metricsQuery?.data?.jobMetrics ? sortAndSelectScope($metricsQuery.data.jobMetrics) : []);
|
||||||
|
|
||||||
/* Effects */
|
/* Effects */
|
||||||
$effect(() => {
|
$effect(() => {
|
||||||
|
|||||||
@@ -32,10 +32,6 @@ export function scaleNumber(x, p = '') {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
export function roundTwoDigits(x) {
|
|
||||||
return Math.round(x * 100) / 100
|
|
||||||
}
|
|
||||||
|
|
||||||
export function scaleNumbers(x, y, p = '') {
|
export function scaleNumbers(x, y, p = '') {
|
||||||
const oldPower = power[prefix.indexOf(p)]
|
const oldPower = power[prefix.indexOf(p)]
|
||||||
const rawXValue = x * oldPower
|
const rawXValue = x * oldPower
|
||||||
@@ -77,6 +73,10 @@ export function formatUnixTime(t, withDate = false) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
export function roundTwoDigits(x) {
|
||||||
|
return Math.round(x * 100) / 100
|
||||||
|
}
|
||||||
|
|
||||||
// const equalsCheck = (a, b) => {
|
// const equalsCheck = (a, b) => {
|
||||||
// return JSON.stringify(a) === JSON.stringify(b);
|
// return JSON.stringify(a) === JSON.stringify(b);
|
||||||
// }
|
// }
|
||||||
|
|||||||
Reference in New Issue
Block a user