Merge branch 'hotfix' of https://github.com/ClusterCockpit/cc-backend into hotfix

2025-09-05 00:23:00 +02:00 · 2024-03-14 11:08:37 +01:00
parent 58415ab5c3 32a57661fd
commit 7940317857
51 changed files with 8733 additions and 6982 deletions
--- a/web/frontend/src/JobFootprint.svelte
+++ b/web/frontend/src/JobFootprint.svelte
@@ -1,129 +1,199 @@
-<script>
-    import { getContext } from 'svelte'
-    import {
-        Card,
-        CardHeader,
-        CardTitle,
-        CardBody,
-        Progress,
-        Icon,
-        Tooltip
-    } from "sveltestrap";
-    import { mean, round } from 'mathjs'
-
-    export let job
-    export let jobMetrics
-    export let view = 'job'
-    export let width = 'auto'
-
-    const clusters         = getContext('clusters')
-    const subclusterConfig = clusters.find((c) => c.name == job.cluster).subClusters.find((sc) => sc.name == job.subCluster)
-
-    const footprintMetrics = (job.numAcc !== 0)
-        ? (job.exclusive !== 1) // GPU
-            ? ['acc_utilization', 'acc_mem_used', 'nv_sm_clock', 'nv_mem_util'] // Shared
-            : ['acc_utilization', 'acc_mem_used', 'nv_sm_clock', 'nv_mem_util'] // Exclusive
-        : (job.exclusive !== 1) // CPU only
-            ? ['flops_any', 'mem_used'] // Shared
-            : ['cpu_load', 'flops_any', 'mem_used', 'mem_bw'] // Exclusive
-
-    const footprintData = footprintMetrics.map((fm) => {
-        // Mean: Primarily use backend sourced avgs from job.*, secondarily calculate/read from metricdata
-        let mv = null
-        if (fm === 'cpu_load' && job.loadAvg !== 0) {
-            mv = round(job.loadAvg, 2)
-        } else if (fm === 'flops_any' && job.flopsAnyAvg !== 0) {
-            mv = round(job.flopsAnyAvg, 2)
-        } else if (fm === 'mem_bw' && job.memBwAvg !== 0) {
-            mv = round(job.memBwAvg, 2)
-        } else { // Calculate from jobMetrics
-            const jm  = jobMetrics.find((jm) => jm.name === fm && jm.scope === 'node')
-            if (jm?.metric?.statisticsSeries) {
-                mv = round(mean(jm.metric.statisticsSeries.mean), 2)
-            } else if (jm?.metric?.series?.length > 1) {
-                const avgs = jm.metric.series.map(jms => jms.statistics.avg)
-                mv = round(mean(avgs), 2)
-            } else if (jm?.metric?.series) {
-                mv = round(jm.metric.series[0].statistics.avg, 2)
-            } else {
-                mv = 0.0
-            }
-        }
-        
-        // Unit
-        const fmc = getContext('metrics')(job.cluster, fm)
-        let unit = ''
-        if (fmc?.unit?.base) unit = fmc.unit.prefix + fmc.unit.base
-
-        // Threshold / -Differences
-        const fmt = findJobThresholds(job, fmc, subclusterConfig)
-        if (fm === 'flops_any') fmt.peak = round((fmt.peak * 0.85), 0)
-
-        // Define basic data
-        const fmBase = {
-            name: fm,
-            unit: unit,
-            avg: mv,
-            max: fmt.peak
-        }
-
-        if (evalFootprint(fm, mv, fmt, 'alert')) {
-            return {
-                ...fmBase,
-                color: 'danger',
-                message:`Metric average way ${fm === 'mem_used' ? 'above' : 'below' } expected normal thresholds.`,
-                impact: 3
-            }
-        } else if (evalFootprint(fm, mv, fmt, 'caution')) {
-            return {
-                ...fmBase,
-                color: 'warning',
-                message: `Metric average ${fm === 'mem_used' ? 'above' : 'below' } expected normal thresholds.`,
-                impact: 2
-            }
-        } else if (evalFootprint(fm, mv, fmt, 'normal')) {
-            return {
-                ...fmBase,
-                color: 'success',
-                message: 'Metric average within expected thresholds.',
-                impact: 1
-            }
-        } else if (evalFootprint(fm, mv, fmt, 'peak')) {
-            return {
-                ...fmBase,
-                color: 'info',
-                message: 'Metric average above expected normal thresholds: Check for artifacts recommended.',
-                impact: 0
-            }
-        } else {
-            return {
-                ...fmBase,
-                color: 'secondary',
-                message: 'Metric average above expected peak threshold: Check for artifacts!',
-                impact: -1
-            }
-        }
-    })
-
-    function evalFootprint(metric, mean, thresholds, level) {
-        // mem_used has inverse logic regarding threshold levels, notify levels triggered if mean > threshold
-        switch (level) {
-            case 'peak':
-                if (metric === 'mem_used') return false // mem_used over peak -> return false to trigger impact -1
-                else                       return (mean <= thresholds.peak && mean > thresholds.normal)
-            case 'alert':
-                if (metric === 'mem_used') return (mean <= thresholds.peak && mean >= thresholds.alert)
-                else                       return (mean <= thresholds.alert && mean >= 0)
-            case 'caution':
-                if (metric === 'mem_used') return (mean < thresholds.alert && mean >= thresholds.caution)
-                else                       return (mean <= thresholds.caution && mean > thresholds.alert)
-            case 'normal':
-                if (metric === 'mem_used') return (mean < thresholds.caution && mean >= 0)
-                else                       return (mean <= thresholds.normal && mean > thresholds.caution)
-            default:
-                return false
-        }
+<script context="module">
+  export function findJobThresholds(job, metricConfig, subClusterConfig) { // Resolves the peak/normal/caution/alert thresholds that apply to one job for one metric; returns null (after a console warning) when they cannot be determined
+    if (!job || !metricConfig || !subClusterConfig) { // Guard: all three arguments are required
+      console.warn("Argument missing for findJobThresholds!");
+      return null; // Callers have to cope with a null result
    }
+
+    const subclusterThresholds = metricConfig.subClusters.find( // Entry of metricConfig.subClusters whose name matches the given subcluster; undefined when there is none
+      (sc) => sc.name == subClusterConfig.name,
+    );
+    const defaultThresholds = { // Per level: value from the matching subcluster entry when one was found, otherwise the metric-wide value
+      peak: subclusterThresholds
+        ? subclusterThresholds.peak
+        : metricConfig.peak,
+      normal: subclusterThresholds
+        ? subclusterThresholds.normal
+        : metricConfig.normal,
+      caution: subclusterThresholds
+        ? subclusterThresholds.caution
+        : metricConfig.caution,
+      alert: subclusterThresholds
+        ? subclusterThresholds.alert
+        : metricConfig.alert,
+    };
+
+    if (job.exclusive === 1) {
+      // Exclusive (job.exclusive === 1): use the configured thresholds unchanged
+      return defaultThresholds;
+    } else {
+      // Every other job.exclusive value is treated as shared: thresholds are adapted per metric below
+      if (metricConfig.name === "cpu_load") {
+        // Special case: peak and normal are replaced by job.numHWThreads; caution and alert stay as configured
+        return {
+          peak: job.numHWThreads,
+          normal: job.numHWThreads,
+          caution: defaultThresholds.caution,
+          alert: defaultThresholds.alert,
+        };
+      } else if (metricConfig.aggregation === "avg") { // "avg" aggregation: thresholds are returned unscaled
+        return defaultThresholds;
+      } else if (metricConfig.aggregation === "sum") { // "sum" aggregation: thresholds are scaled by jobFraction
+        const jobFraction = // job.numHWThreads divided by the length of topology.node; NOTE(review): topology.node presumably lists the hardware threads of one node - confirm
+          job.numHWThreads / subClusterConfig.topology.node.length;
+        return { // NOTE(review): round is not imported in the visible lines of this module script - verify it is in scope here
+          peak: round(defaultThresholds.peak * jobFraction, 0),
+          normal: round(defaultThresholds.normal * jobFraction, 0),
+          caution: round(defaultThresholds.caution * jobFraction, 0),
+          alert: round(defaultThresholds.alert * jobFraction, 0),
+        };
+      } else { // Neither "avg" nor "sum": warn and return null
+        console.warn(
+          "Missing or unkown aggregation mode (sum/avg) for metric:", // NOTE(review): "unkown" is a typo in the emitted message
+          metricConfig,
+        );
+        return null;
+      }
+    } // Other job.exclusive cases?
+  }
+</script>
+
+<script>
+  import { getContext } from "svelte";
+  import {
+    Card,
+    CardHeader,
+    CardTitle,
+    CardBody,
+    Progress,
+    Icon,
+    Tooltip,
+  } from "@sveltestrap/sveltestrap";
+  import { mean, round } from "mathjs";
+
+  export let job;
+  export let jobMetrics;
+  export let view = "job";
+  export let width = "auto";
+
+  const clusters = getContext("clusters"); // Cluster configurations read from the Svelte context key "clusters"
+  const subclusterConfig = clusters // Config entry of the job's subcluster; NOTE(review): throws when no cluster matches job.cluster (find() yields undefined), and is undefined when no subcluster matches job.subCluster
+    .find((c) => c.name == job.cluster)
+    .subClusters.find((sc) => sc.name == job.subCluster);
+
+  const footprintMetrics = // Names of the metrics shown in the footprint, selected by job.numAcc and job.exclusive
+    job.numAcc !== 0
+      ? job.exclusive !== 1 // GPU (numAcc is non-zero); the shared and exclusive lists below are currently identical
+        ? ["acc_utilization", "acc_mem_used", "nv_sm_clock", "nv_mem_util"] // Shared
+        : ["acc_utilization", "acc_mem_used", "nv_sm_clock", "nv_mem_util"] // Exclusive
+      : (job.exclusive !== 1) // CPU Only (numAcc is 0)
+        ? ["flops_any", "mem_used"] // Shared
+        : ["cpu_load", "flops_any", "mem_used", "mem_bw"]; // Exclusive
+
+  const footprintData = footprintMetrics.map((fm) => { // Builds one display record (name, unit, avg, max, color, message, impact) per footprint metric
+    // Mean: prefer the backend-supplied averages on job.*; a value of exactly 0 falls through to the jobMetrics-based calculation below
+    let mv = null;
+    if (fm === "cpu_load" && job.loadAvg !== 0) {
+      mv = round(job.loadAvg, 2);
+    } else if (fm === "flops_any" && job.flopsAnyAvg !== 0) {
+      mv = round(job.flopsAnyAvg, 2);
+    } else if (fm === "mem_bw" && job.memBwAvg !== 0) {
+      mv = round(job.memBwAvg, 2);
+    } else {
+      // Calculate from the node-scope jobMetrics entry: statisticsSeries mean first, then the mean of per-series averages, then the single series' average, else 0.0
+      const jm = jobMetrics.find((jm) => jm.name === fm && jm.scope === "node");
+      if (jm?.metric?.statisticsSeries) {
+        mv = round(mean(jm.metric.statisticsSeries.mean), 2);
+      } else if (jm?.metric?.series?.length > 1) {
+        const avgs = jm.metric.series.map((jms) => jms.statistics.avg);
+        mv = round(mean(avgs), 2);
+      } else if (jm?.metric?.series) { // NOTE(review): an empty series array also lands here and series[0].statistics would then throw - confirm it cannot be empty
+        mv = round(jm.metric.series[0].statistics.avg, 2);
+      } else {
+        mv = 0.0;
+      }
+    }
+
+    // Unit: prefix + base from the metric config when a base unit is defined, otherwise an empty string
+    const fmc = getContext("metrics")(job.cluster, fm);
+    let unit = "";
+    if (fmc?.unit?.base) unit = fmc.unit.prefix + fmc.unit.base;
+
+    // Thresholds for this job and metric. NOTE(review): findJobThresholds can return null, in which case the fmt.peak accesses below would throw - confirm this cannot happen here
+    const fmt = findJobThresholds(job, fmc, subclusterConfig);
+    if (fm === "flops_any") fmt.peak = round(fmt.peak * 0.85, 0); // flops_any peak is lowered to 85% of the resolved value (rationale not stated in this file)
+
+    // Fields shared by every returned record; max is the (possibly lowered) peak threshold
+    const fmBase = {
+      name: fm,
+      unit: unit,
+      avg: mv,
+      max: fmt.peak,
+    };
+
+    if (evalFootprint(fm, mv, fmt, "alert")) { // Levels are tested in the order alert, caution, normal, peak; the first match decides the record
+      return {
+        ...fmBase,
+        color: "danger",
+        message: `Metric average way ${fm === "mem_used" ? "above" : "below"} expected normal thresholds.`,
+        impact: 3,
+      };
+    } else if (evalFootprint(fm, mv, fmt, "caution")) {
+      return {
+        ...fmBase,
+        color: "warning",
+        message: `Metric average ${fm === "mem_used" ? "above" : "below"} expected normal thresholds.`,
+        impact: 2,
+      };
+    } else if (evalFootprint(fm, mv, fmt, "normal")) {
+      return {
+        ...fmBase,
+        color: "success",
+        message: "Metric average within expected thresholds.",
+        impact: 1,
+      };
+    } else if (evalFootprint(fm, mv, fmt, "peak")) {
+      return {
+        ...fmBase,
+        color: "info",
+        message:
+          "Metric average above expected normal thresholds: Check for artifacts recommended.",
+        impact: 0,
+      };
+    } else { // No level matched
+      return {
+        ...fmBase,
+        color: "secondary",
+        message:
+          "Metric average above expected peak threshold: Check for artifacts!",
+        impact: -1,
+      };
+    }
+  });
+
+  function evalFootprint(metric, mean, thresholds, level) { // Returns true when `mean` lies in the band named by `level` ("peak", "alert", "caution" or "normal"); any other level returns false. The `mean` parameter shadows the mathjs `mean` import inside this function
+    // mem_used uses inverted bands: alert/caution match when mean is at or above the threshold, while every other metric matches when mean is at or below it
+    switch (level) {
+      case "peak":
+        if (metric === "mem_used")
+          return false; // mem_used over peak -> return false to trigger impact -1
+        else return mean <= thresholds.peak && mean > thresholds.normal; // (normal, peak]
+      case "alert":
+        if (metric === "mem_used")
+          return mean <= thresholds.peak && mean >= thresholds.alert; // [alert, peak]
+        else return mean <= thresholds.alert && mean >= 0; // [0, alert]
+      case "caution":
+        if (metric === "mem_used")
+          return mean < thresholds.alert && mean >= thresholds.caution; // [caution, alert)
+        else return mean <= thresholds.caution && mean > thresholds.alert; // (alert, caution]
+      case "normal":
+        if (metric === "mem_used")
+          return mean < thresholds.caution && mean >= 0; // [0, caution)
+        else return mean <= thresholds.normal && mean > thresholds.caution; // (caution, normal]
+      default:
+        return false;
+    }
+  }
 </script>

 <script context="module">
@@ -155,13 +225,7 @@
        } else if (metricConfig.aggregation === 'avg' ){
            return defaultThresholds
        } else if (metricConfig.aggregation === 'sum' ){
-            let jobFraction = 0.0
-            if (job.numAcc > 0) {
-                jobFraction = job.numAcc / subClusterConfig.topology.accelerators.length
-            } else if (job.numHWThreads > 0) {
-                jobFraction = job.numHWThreads / subClusterConfig.topology.node.length
-            }
-
+            const jobFraction = job.numHWThreads / subClusterConfig.topology.node.length
            return {
                peak: round((defaultThresholds.peak * jobFraction), 0),
                normal: round((defaultThresholds.normal * jobFraction), 0),
@@ -177,62 +241,67 @@
 </script>

 <Card class="h-auto mt-1" style="width: {width}px;">
-    {#if view === 'job'}
+  {#if view === "job"}
    <CardHeader>
-        <CardTitle class="mb-0 d-flex justify-content-center">
-            Core Metrics Footprint
-        </CardTitle>
+      <CardTitle class="mb-0 d-flex justify-content-center">
+        Core Metrics Footprint
+      </CardTitle>
    </CardHeader>
+  {/if}
+  <CardBody>
+    {#each footprintData as fpd, index}
+      <div class="mb-1 d-flex justify-content-between">
+        <div>&nbsp;<b>{fpd.name}</b></div>
+        <!-- For symmetry, see below ...-->
+        <div
+          class="cursor-help d-inline-flex"
+          id={`footprint-${job.jobId}-${index}`}
+        >
+          <div class="mx-1">
+            <!-- Alerts Only -->
+            {#if fpd.impact === 3 || fpd.impact === -1}
+              <Icon name="exclamation-triangle-fill" class="text-danger" />
+            {:else if fpd.impact === 2}
+              <Icon name="exclamation-triangle" class="text-warning" />
+            {/if}
+            <!-- Emoji for all states-->
+            {#if fpd.impact === 3}
+              <Icon name="emoji-frown" class="text-danger" />
+            {:else if fpd.impact === 2}
+              <Icon name="emoji-neutral" class="text-warning" />
+            {:else if fpd.impact === 1}
+              <Icon name="emoji-smile" class="text-success" />
+            {:else if fpd.impact === 0}
+              <Icon name="emoji-laughing" class="text-info" />
+            {:else if fpd.impact === -1}
+              <Icon name="emoji-dizzy" class="text-danger" />
+            {/if}
+          </div>
+          <div>
+            <!-- Print Values -->
+            {fpd.avg} / {fpd.max}
+            {fpd.unit} &nbsp; <!-- To increase margin to tooltip: No other way manageable ... -->
+          </div>
+        </div>
+        <Tooltip
+          target={`footprint-${job.jobId}-${index}`}
+          placement="right"
+          offset={[0, 20]}>{fpd.message}</Tooltip
+        >
+      </div>
+      <div class="mb-2">
+        <Progress value={fpd.avg} max={fpd.max} color={fpd.color} />
+      </div>
+    {/each}
+    {#if job?.metaData?.message}
+      <hr class="mt-1 mb-2" />
+      {@html job.metaData.message}
    {/if}
-    <CardBody>
-        {#each footprintData as fpd, index}
-            <div class="mb-1 d-flex justify-content-between">
-                <div>&nbsp;<b>{fpd.name}</b></div> <!-- For symmetry, see below ...-->
-                <div class="cursor-help d-inline-flex" id={`footprint-${job.jobId}-${index}`}>
-                    <div class="mx-1">
-                        <!-- Alerts Only -->
-                        {#if fpd.impact === 3 || fpd.impact === -1}
-                            <Icon name="exclamation-triangle-fill" class="text-danger"/>
-                        {:else if fpd.impact === 2}
-                            <Icon name="exclamation-triangle" class="text-warning"/>
-                        {/if}
-                        <!-- Emoji for all states-->
-                        {#if fpd.impact === 3}
-                            <Icon name="emoji-frown" class="text-danger"/>
-                        {:else if fpd.impact === 2}
-                            <Icon name="emoji-neutral" class="text-warning"/>
-                        {:else if fpd.impact === 1}
-                            <Icon name="emoji-smile" class="text-success"/>
-                        {:else if fpd.impact === 0}
-                            <Icon name="emoji-laughing" class="text-info"/>
-                        {:else if fpd.impact === -1}
-                            <Icon name="emoji-dizzy" class="text-danger"/>
-                        {/if}
-                    </div>
-                    <div>
-                        <!-- Print Values -->
-                        {fpd.avg} / {fpd.max} {fpd.unit} &nbsp; <!-- To increase margin to tooltip: No other way manageable ... -->
-                    </div>
-                </div>
-                <Tooltip target={`footprint-${job.jobId}-${index}`} placement="right" offset={[0, 20]}>{fpd.message}</Tooltip>
-            </div>
-            <div class="mb-2">
-                <Progress
-                    value={fpd.avg}
-                    max={fpd.max}
-                    color={fpd.color}
-                />
-            </div>
-        {/each}
-        {#if job?.metaData?.message}
-            <hr class="mt-1 mb-2"/>
-            {@html job.metaData.message}
-        {/if}
-    </CardBody>
+  </CardBody>
 </Card>

 <style>
-    .cursor-help {
-        cursor: help;
-    }
+  .cursor-help { /* Help cursor for the element that serves as the tooltip target */
+    cursor: help;
+  }
 </style>