Update dependencies. Rebuild graphql and swagger

2026-03-04 07:07:30 +01:00 · 2026-01-15 08:32:06 +01:00
parent 8f0bb907ff
commit e1efc68476
24 changed files with 3321 additions and 555 deletions
--- a/internal/repository/stats.go
+++ b/internal/repository/stats.go
@@ -2,6 +2,44 @@
 // All rights reserved. This file is part of cc-backend.
 // Use of this source code is governed by a MIT-style
 // license that can be found in the LICENSE file.
+
+// This file contains job statistics and histogram generation functionality for the JobRepository.
+//
+// # Job Statistics
+//
+// The statistics methods provide aggregated metrics about jobs including total jobs, users,
+// walltime, and resource usage (nodes, cores, accelerators). Statistics can be computed:
+//   - Overall (JobsStats): Single aggregate across all matching jobs
+//   - Grouped (JobsStatsGrouped): Aggregated by user, project, cluster, or subcluster
+//   - Counts (JobCountGrouped, AddJobCount): Simple job counts with optional filtering
+//
+// All statistics methods support filtering via JobFilter and respect security contexts.
+//
+// # Histograms
+//
+// Histogram methods generate distribution data for visualization:
+//   - Duration, nodes, cores, accelerators (AddHistograms)
+//   - Job metrics like CPU load, memory usage (AddMetricHistograms)
+//
+// Histograms use intelligent binning:
+//   - Duration: Variable bin sizes (1m, 10m, 1h, 6h, 12h, 24h) with zero-padding
+//   - Resources: Natural value-based bins
+//   - Metrics: Normalized to peak values with configurable bin counts
+//
+// # Running vs. Completed Jobs
+//
+// Statistics handle running jobs specially:
+//   - Duration calculated as (now - start_time) for running jobs
+//   - Metric histograms for running jobs load data from metric backend instead of footprint
+//   - Job state filtering distinguishes running/completed jobs
+//
+// # Performance Considerations
+//
+// - Most queries use prepared statements via stmtCache (jobsMetricStatisticsHistogram runs its query directly on r.DB)
+// - Complex aggregations use SQL for efficiency
+// - Histogram pre-initialization ensures consistent bin ranges
+// - Metric histogram queries limited to 500 jobs for running job analysis
+
 package repository

 import (
@@ -19,7 +57,9 @@ import (
 	sq "github.com/Masterminds/squirrel"
 )

-// GraphQL validation should make sure that no unkown values can be specified.
+// groupBy2column maps GraphQL Aggregate enum values to their corresponding database column names.
+// Used by JobsStatsGrouped and JobCountGrouped to translate user-facing grouping dimensions
+// into SQL GROUP BY clauses. GraphQL validation ensures only valid enum values are accepted.
 var groupBy2column = map[model.Aggregate]string{
 	model.AggregateUser:       "job.hpc_user",
 	model.AggregateProject:    "job.project",
@@ -27,6 +67,9 @@ var groupBy2column = map[model.Aggregate]string{
 	model.AggregateSubcluster: "job.subcluster",
 }

+// sortBy2column maps GraphQL SortByAggregate enum values to their corresponding computed column names.
+// Used by JobsStatsGrouped to translate sort preferences into SQL ORDER BY clauses.
+// Column names match the AS aliases used in buildStatsQuery.
 var sortBy2column = map[model.SortByAggregate]string{
 	model.SortByAggregateTotaljobs:      "totalJobs",
 	model.SortByAggregateTotalusers:     "totalUsers",
@@ -39,6 +82,21 @@ var sortBy2column = map[model.SortByAggregate]string{
 	model.SortByAggregateTotalacchours:  "totalAccHours",
 }

+// buildCountQuery constructs a SQL query to count jobs with optional grouping and filtering.
+//
+// Parameters:
+//   - filter: Job filters to apply (cluster, user, time range, etc.)
+//   - kind: Special filter - "running" for running jobs only, "short" for jobs under threshold
+//   - col: Column name to GROUP BY; empty string for total count without grouping
+//
+// Returns a SelectBuilder that produces either:
+//   - Single count: COUNT(job.id) when col is empty
+//   - Grouped counts: col, COUNT(job.id) when col is specified
+//
+// The kind parameter enables counting specific job categories:
+//   - "running": Only jobs with job_state = 'running'
+//   - "short": Only jobs with duration < ShortRunningJobsDuration config value
+//   - empty: All jobs matching filters
 func (r *JobRepository) buildCountQuery(
 	filter []*model.JobFilter,
 	kind string,
@@ -47,10 +105,8 @@ func (r *JobRepository) buildCountQuery(
 	var query sq.SelectBuilder

 	if col != "" {
-		// Scan columns: id, cnt
 		query = sq.Select(col, "COUNT(job.id)").From("job").GroupBy(col)
 	} else {
-		// Scan columns:  cnt
 		query = sq.Select("COUNT(job.id)").From("job")
 	}

@@ -68,6 +124,27 @@ func (r *JobRepository) buildCountQuery(
 	return query
 }

+// buildStatsQuery constructs a SQL query to compute comprehensive job statistics with optional grouping.
+//
+// Parameters:
+//   - filter: Job filters to apply (cluster, user, time range, etc.)
+//   - col: Column name to GROUP BY; empty string for overall statistics without grouping
+//
+// Returns a SelectBuilder that produces comprehensive statistics:
+//   - totalJobs: Count of jobs
+//   - totalUsers: Count of distinct users (always 0 when grouping by user)
+//   - totalWalltime: Sum of job durations in hours
+//   - totalNodes: Sum of nodes used across all jobs
+//   - totalNodeHours: Sum of (duration × num_nodes) in hours
+//   - totalCores: Sum of hardware threads used across all jobs
+//   - totalCoreHours: Sum of (duration × num_hwthreads) in hours
+//   - totalAccs: Sum of accelerators used across all jobs
+//   - totalAccHours: Sum of (duration × num_acc) in hours
+//
+// Special handling:
+//   - Running jobs: Duration calculated as (now - start_time) instead of stored duration
+//   - Grouped queries: Also select grouping column and user's display name from hpc_user table
+//   - All time values converted from seconds to hours (÷ 3600) and rounded
 func (r *JobRepository) buildStatsQuery(
 	filter []*model.JobFilter,
 	col string,
@@ -75,31 +152,29 @@ func (r *JobRepository) buildStatsQuery(
 	var query sq.SelectBuilder

 	if col != "" {
-		// Scan columns: id, name, totalJobs, totalUsers, totalWalltime, totalNodes, totalNodeHours, totalCores, totalCoreHours, totalAccs, totalAccHours
 		query = sq.Select(
 			col,
 			"name",
 			"COUNT(job.id) as totalJobs",
 			"COUNT(DISTINCT job.hpc_user) AS totalUsers",
 			fmt.Sprintf(`CAST(ROUND(SUM((CASE WHEN job.job_state = "running" THEN %d - job.start_time ELSE job.duration END)) / 3600) as int) as totalWalltime`, time.Now().Unix()),
-			fmt.Sprintf(`CAST(SUM(job.num_nodes) as int) as totalNodes`),
+			`CAST(SUM(job.num_nodes) as int) as totalNodes`,
 			fmt.Sprintf(`CAST(ROUND(SUM((CASE WHEN job.job_state = "running" THEN %d - job.start_time ELSE job.duration END) * job.num_nodes) / 3600) as int) as totalNodeHours`, time.Now().Unix()),
-			fmt.Sprintf(`CAST(SUM(job.num_hwthreads) as int) as totalCores`),
+			`CAST(SUM(job.num_hwthreads) as int) as totalCores`,
 			fmt.Sprintf(`CAST(ROUND(SUM((CASE WHEN job.job_state = "running" THEN %d - job.start_time ELSE job.duration END) * job.num_hwthreads) / 3600) as int) as totalCoreHours`, time.Now().Unix()),
-			fmt.Sprintf(`CAST(SUM(job.num_acc) as int) as totalAccs`),
+			`CAST(SUM(job.num_acc) as int) as totalAccs`,
 			fmt.Sprintf(`CAST(ROUND(SUM((CASE WHEN job.job_state = "running" THEN %d - job.start_time ELSE job.duration END) * job.num_acc) / 3600) as int) as totalAccHours`, time.Now().Unix()),
 		).From("job").LeftJoin("hpc_user ON hpc_user.username = job.hpc_user").GroupBy(col)
 	} else {
-		// Scan columns: totalJobs, totalUsers, totalWalltime, totalNodes, totalNodeHours, totalCores, totalCoreHours, totalAccs, totalAccHours
 		query = sq.Select(
 			"COUNT(job.id) as totalJobs",
 			"COUNT(DISTINCT job.hpc_user) AS totalUsers",
 			fmt.Sprintf(`CAST(ROUND(SUM((CASE WHEN job.job_state = "running" THEN %d - job.start_time ELSE job.duration END)) / 3600) as int)`, time.Now().Unix()),
-			fmt.Sprintf(`CAST(SUM(job.num_nodes) as int)`),
+			`CAST(SUM(job.num_nodes) as int)`,
 			fmt.Sprintf(`CAST(ROUND(SUM((CASE WHEN job.job_state = "running" THEN %d - job.start_time ELSE job.duration END) * job.num_nodes) / 3600) as int)`, time.Now().Unix()),
-			fmt.Sprintf(`CAST(SUM(job.num_hwthreads) as int)`),
+			`CAST(SUM(job.num_hwthreads) as int)`,
 			fmt.Sprintf(`CAST(ROUND(SUM((CASE WHEN job.job_state = "running" THEN %d - job.start_time ELSE job.duration END) * job.num_hwthreads) / 3600) as int)`, time.Now().Unix()),
-			fmt.Sprintf(`CAST(SUM(job.num_acc) as int)`),
+			`CAST(SUM(job.num_acc) as int)`,
 			fmt.Sprintf(`CAST(ROUND(SUM((CASE WHEN job.job_state = "running" THEN %d - job.start_time ELSE job.duration END) * job.num_acc) / 3600) as int)`, time.Now().Unix()),
 		).From("job")
 	}
@@ -111,6 +186,25 @@ func (r *JobRepository) buildStatsQuery(
 	return query
 }

+// JobsStatsGrouped computes comprehensive job statistics grouped by a dimension (user, project, cluster, or subcluster).
+//
+// This is the primary method for generating aggregated statistics views in the UI, providing
+// metrics like total jobs, walltime, and resource usage broken down by the specified grouping.
+//
+// Parameters:
+//   - ctx: Context for security checks and cancellation
+//   - filter: Filters to apply (time range, cluster, job state, etc.)
+//   - page: Optional pagination (ItemsPerPage: -1 disables pagination)
+//   - sortBy: Optional sort column (totalJobs, totalWalltime, totalCoreHours, etc.)
+//   - groupBy: Required grouping dimension (User, Project, Cluster, or Subcluster)
+//
+// Returns a slice of JobsStatistics, one per group, with:
+//   - ID: The group identifier (username, project name, cluster name, etc.)
+//   - Name: Display name (for users, from hpc_user.name; empty for other groups)
+//   - Statistics: totalJobs, totalUsers, totalWalltime, resource usage metrics
+//
+// Security: Respects user roles via SecurityCheck - users see only their own data unless admin/support.
+// Performance: Results are sorted in SQL and pagination applied before scanning rows.
 func (r *JobRepository) JobsStatsGrouped(
 	ctx context.Context,
 	filter []*model.JobFilter,
@@ -230,6 +324,21 @@ func (r *JobRepository) JobsStatsGrouped(
 	return stats, nil
 }

+// JobsStats computes overall job statistics across all matching jobs without grouping.
+//
+// This method provides a single aggregate view of job metrics, useful for dashboard
+// summaries and overall system utilization reports.
+//
+// Parameters:
+//   - ctx: Context for security checks and cancellation
+//   - filter: Filters to apply (time range, cluster, job state, etc.)
+//
+// Returns a single-element slice containing aggregate statistics:
+//   - totalJobs, totalUsers, totalWalltime
+//   - totalNodeHours, totalCoreHours, totalAccHours
+//
+// Unlike JobsStatsGrouped, this returns overall totals without breaking down by dimension.
+// Security checks are applied via SecurityCheck to respect user access levels.
 func (r *JobRepository) JobsStats(
 	ctx context.Context,
 	filter []*model.JobFilter,
@@ -303,6 +412,17 @@ func LoadJobStat(job *schema.Job, metric string, statType string) float64 {
 	return 0.0
 }

+// JobCountGrouped counts jobs grouped by a dimension without computing detailed statistics.
+//
+// This is a lightweight alternative to JobsStatsGrouped when only job counts are needed,
+// avoiding the overhead of calculating walltime and resource usage metrics.
+//
+// Parameters:
+//   - ctx: Context for security checks
+//   - filter: Filters to apply
+//   - groupBy: Grouping dimension (User, Project, Cluster, or Subcluster)
+//
+// Returns JobsStatistics with only ID and TotalJobs populated for each group.
 func (r *JobRepository) JobCountGrouped(
 	ctx context.Context,
 	filter []*model.JobFilter,
@@ -343,6 +463,20 @@ func (r *JobRepository) JobCountGrouped(
 	return stats, nil
 }

+// AddJobCountGrouped augments existing statistics with additional job counts by category.
+//
+// This method enriches JobsStatistics returned by JobsStatsGrouped or JobCountGrouped
+// with counts of running or short-running jobs, matched by group ID.
+//
+// Parameters:
+//   - ctx: Context for security checks
+//   - filter: Filters to apply
+//   - groupBy: Grouping dimension (must match the dimension used for stats parameter)
+//   - stats: Existing statistics to augment (modified in-place by ID matching)
+//   - kind: "running" to add RunningJobs count, "short" to add ShortJobs count
+//
+// Returns the same stats slice with RunningJobs or ShortJobs fields populated per group.
+// Groups without matching jobs will have 0 for the added field.
 func (r *JobRepository) AddJobCountGrouped(
 	ctx context.Context,
 	filter []*model.JobFilter,
@@ -392,6 +526,18 @@ func (r *JobRepository) AddJobCountGrouped(
 	return stats, nil
 }

+// AddJobCount augments existing overall statistics with additional job counts by category.
+//
+// Similar to AddJobCountGrouped but for ungrouped statistics. Applies the same count
+// to all statistics entries (typically just one).
+//
+// Parameters:
+//   - ctx: Context for security checks
+//   - filter: Filters to apply
+//   - stats: Existing statistics to augment (modified in-place)
+//   - kind: "running" to add RunningJobs count, "short" to add ShortJobs count
+//
+// Returns the same stats slice with RunningJobs or ShortJobs fields set to the total count.
 func (r *JobRepository) AddJobCount(
 	ctx context.Context,
 	filter []*model.JobFilter,
@@ -437,6 +583,26 @@ func (r *JobRepository) AddJobCount(
 	return stats, nil
 }

+// AddHistograms augments statistics with distribution histograms for job properties.
+//
+// Generates histogram data for visualization of job duration, node count, core count,
+// and accelerator count distributions. The duration histogram uses a fixed bin width and
+// bin count selected by the durationBins argument.
+//
+// Parameters:
+//   - ctx: Context for security checks
+//   - filter: Filters to apply to jobs included in histograms
+//   - stat: Statistics struct to augment (modified in-place)
+//   - durationBins: Bin size - "1m", "10m", "1h", "6h", "12h", or "24h" (default)
+//
+// Populates these fields in stat:
+//   - HistDuration: Job duration distribution (zero-padded bins)
+//   - HistNumNodes: Node count distribution
+//   - HistNumCores: Core (hwthread) count distribution
+//   - HistNumAccs: Accelerator count distribution
+//
+// Duration bins are pre-initialized with zeros to ensure consistent ranges for visualization.
+// Bin size determines both the width and maximum duration displayed (e.g., "1h" = 48 bins × 1h = 48h max).
 func (r *JobRepository) AddHistograms(
 	ctx context.Context,
 	filter []*model.JobFilter,
@@ -447,20 +613,20 @@ func (r *JobRepository) AddHistograms(

 	var targetBinCount int
 	var targetBinSize int
-	switch {
-	case *durationBins == "1m": // 1 Minute Bins + Max 60 Bins -> Max 60 Minutes
+	switch *durationBins {
+	case "1m": // 1 Minute Bins + Max 60 Bins -> Max 60 Minutes
 		targetBinCount = 60
 		targetBinSize = 60
-	case *durationBins == "10m": // 10 Minute Bins + Max 72 Bins -> Max 12 Hours
+	case "10m": // 10 Minute Bins + Max 72 Bins -> Max 12 Hours
 		targetBinCount = 72
 		targetBinSize = 600
-	case *durationBins == "1h": // 1 Hour Bins + Max 48 Bins -> Max 48 Hours
+	case "1h": // 1 Hour Bins + Max 48 Bins -> Max 48 Hours
 		targetBinCount = 48
 		targetBinSize = 3600
-	case *durationBins == "6h": // 6 Hour Bins + Max 12 Bins -> Max 3 Days
+	case "6h": // 6 Hour Bins + Max 12 Bins -> Max 3 Days
 		targetBinCount = 12
 		targetBinSize = 21600
-	case *durationBins == "12h": // 12 hour Bins + Max 14 Bins -> Max 7 Days
+	case "12h": // 12 hour Bins + Max 14 Bins -> Max 7 Days
 		targetBinCount = 14
 		targetBinSize = 43200
 	default: // 24h
@@ -499,7 +665,30 @@ func (r *JobRepository) AddHistograms(
 	return stat, nil
 }

-// Requires thresholds for metric from config for cluster? Of all clusters and use largest? split to 10 + 1 for artifacts?
+// AddMetricHistograms augments statistics with distribution histograms for job metrics.
+//
+// Generates histogram data for metrics like CPU load, memory usage, etc. Handles running
+// and completed jobs differently: running jobs load data from metric backend, completed jobs
+// use footprint data from database.
+//
+// Parameters:
+//   - ctx: Context for security checks
+//   - filter: Filters to apply (MUST contain State filter for running jobs)
+//   - metrics: List of metric names to histogram (e.g., ["cpu_load", "mem_used"])
+//   - stat: Statistics struct to augment (modified in-place)
+//   - targetBinCount: Number of histogram bins (default: 10)
+//
+// Populates HistMetrics field in stat with MetricHistoPoints for each metric.
+//
+// Binning algorithm:
+//   - Values normalized to metric's peak value from cluster configuration
+//   - Bins evenly distributed from 0 to peak
+//   - Pre-initialized with zeros for consistent visualization
+//
+// Limitations:
+//   - Running jobs: Limited to 500 jobs for performance
+//   - Requires valid cluster configuration with metric peak values
+//   - Uses footprint statistic (avg/max/min) configured per metric
 func (r *JobRepository) AddMetricHistograms(
 	ctx context.Context,
 	filter []*model.JobFilter,
@@ -534,7 +723,16 @@ func (r *JobRepository) AddMetricHistograms(
 	return stat, nil
 }

-// `value` must be the column grouped by, but renamed to "value"
+// jobsStatisticsHistogram generates a simple histogram by grouping on a column value.
+//
+// Used for histograms where the column value directly represents the bin (e.g., node count, core count).
+// Unlike duration/metric histograms, this doesn't pre-initialize bins with zeros.
+//
+// Parameters:
+//   - value: SQL expression that produces the histogram value, aliased as "value"
+//   - filters: Job filters to apply
+//
+// Returns histogram points with Value (from column) and Count (number of jobs).
 func (r *JobRepository) jobsStatisticsHistogram(
 	ctx context.Context,
 	value string,
@@ -573,6 +771,26 @@ func (r *JobRepository) jobsStatisticsHistogram(
 	return points, nil
 }

+// jobsDurationStatisticsHistogram generates a duration histogram with pre-initialized bins.
+//
+// Bins are zero-padded to provide consistent ranges for visualization, unlike simple
+// histograms which only return bins with data. The value parameter should compute
+// the bin number from job duration.
+//
+// Parameters:
+//   - value: SQL expression computing bin number from duration, aliased as "value"
+//   - filters: Job filters to apply
+//   - binSizeSeconds: Width of each bin in seconds
+//   - targetBinCount: Number of bins to pre-initialize
+//
+// Returns histogram points with Value (bin_number × binSizeSeconds) and Count.
+// All bins from 1 to targetBinCount are returned, with Count=0 for empty bins.
+//
+// Algorithm:
+//  1. Pre-initialize targetBinCount bins with zero counts
+//  2. Query database for actual counts per bin
+//  3. Match query results to pre-initialized bins by value
+//  4. Bins without matches remain at zero
 func (r *JobRepository) jobsDurationStatisticsHistogram(
 	ctx context.Context,
 	value string,
@@ -588,7 +806,6 @@ func (r *JobRepository) jobsDurationStatisticsHistogram(
 		return nil, qerr
 	}

-	// Initialize histogram bins with zero counts
 	// Each bin represents a duration range: bin N = [N*binSizeSeconds, (N+1)*binSizeSeconds)
 	// Example: binSizeSeconds=3600 (1 hour), bin 1 = 0-1h, bin 2 = 1-2h, etc.
 	points := make([]*model.HistoPoint, 0)
@@ -607,8 +824,8 @@ func (r *JobRepository) jobsDurationStatisticsHistogram(
 		return nil, err
 	}

-	// Match query results to pre-initialized bins and fill counts
-	// Query returns raw duration values that need to be mapped to correct bins
+	// Match query results to pre-initialized bins.
+	// point.Value from query is the bin number; multiply by binSizeSeconds to match bin.Value.
 	for rows.Next() {
 		point := model.HistoPoint{}
 		if err := rows.Scan(&point.Value, &point.Count); err != nil {
@@ -616,13 +833,8 @@ func (r *JobRepository) jobsDurationStatisticsHistogram(
 			return nil, err
 		}

-		// Find matching bin and update count
-		// point.Value is multiplied by binSizeSeconds to match pre-calculated bin.Value
 		for _, e := range points {
 			if e.Value == (point.Value * binSizeSeconds) {
-				// Note: Matching on unmodified integer value (and multiplying point.Value
-				// by binSizeSeconds after match) causes frontend to loop into highest
-				// targetBinCount, due to zoom condition instantly being fulfilled (cause unknown)
 				e.Count = point.Count
 				break
 			}
@@ -633,13 +845,34 @@ func (r *JobRepository) jobsDurationStatisticsHistogram(
 	return points, nil
 }

+// jobsMetricStatisticsHistogram generates a metric histogram using footprint data from completed jobs.
+//
+// Values are normalized to the metric's peak value and distributed into bins. The algorithm
+// is based on SQL histogram generation techniques, extracting metric values from JSON footprint
+// and computing bin assignments in SQL.
+//
+// Parameters:
+//   - metric: Metric name (e.g., "cpu_load", "mem_used")
+//   - filters: Job filters to apply
+//   - bins: Number of bins to generate
+//
+// Returns MetricHistoPoints with metric name, unit, footprint stat type, and binned data.
+//
+// Algorithm:
+//  1. Determine peak value from cluster configuration (filtered cluster or max across all)
+//  2. Generate SQL that extracts footprint value, normalizes to [0,1], multiplies by bin count
+//  3. Pre-initialize bins with min/max ranges based on peak value
+//  4. Query database for counts per bin
+//  5. Match results to pre-initialized bins
+//
+// Special handling: Values exactly equal to peak are forced into the last bin by multiplying
+// peak by 0.999999999 to avoid creating an extra bin.
 func (r *JobRepository) jobsMetricStatisticsHistogram(
 	ctx context.Context,
 	metric string,
 	filters []*model.JobFilter,
 	bins *int,
 ) (*model.MetricHistoPoints, error) {
-	// Determine the metric's peak value for histogram normalization
 	// Peak value defines the upper bound for binning: values are distributed across
 	// bins from 0 to peak. First try to get peak from filtered cluster, otherwise
 	// scan all clusters to find the maximum peak value.
@@ -679,18 +912,14 @@ func (r *JobRepository) jobsMetricStatisticsHistogram(
 		}
 	}

-	// Construct SQL histogram bins using normalized values
+	// Construct SQL histogram bins using normalized values.
 	// Algorithm based on: https://jereze.com/code/sql-histogram/ (modified)
 	start := time.Now()

-	// Calculate bin number for each job's metric value:
-	// 1. Extract metric value from JSON footprint
-	// 2. Normalize to [0,1] by dividing by peak
-	// 3. Multiply by number of bins to get bin number
-	// 4. Cast to integer for bin assignment
-	//
-	// Special case: Values exactly equal to peak would fall into bin N+1,
-	// so we multiply peak by 0.999999999 to force it into the last bin (bin N)
+	// Bin calculation formula:
+	//   bin_number = CAST( (value / peak) * num_bins AS INTEGER ) + 1
+	// Special case: value == peak would create bin N+1, so we test for equality
+	// and multiply peak by 0.999999999 to force it into bin N.
 	binQuery := fmt.Sprintf(`CAST(
 		((case when json_extract(footprint, "$.%s") = %f then %f*0.999999999 else json_extract(footprint, "$.%s") end) / %f)
 		* %v as INTEGER )`,
@@ -699,24 +928,19 @@ func (r *JobRepository) jobsMetricStatisticsHistogram(
 	mainQuery := sq.Select(
 		fmt.Sprintf(`%s + 1 as bin`, binQuery),
 		`count(*) as count`,
-		// For Debug: // fmt.Sprintf(`CAST((%f / %d) as INTEGER ) * %s as min`, peak, *bins, binQuery),
-		// For Debug: // fmt.Sprintf(`CAST((%f / %d) as INTEGER ) * (%s + 1) as max`, peak, *bins, binQuery),
 	).From("job").Where(
 		"JSON_VALID(footprint)",
 	).Where(fmt.Sprintf(`json_extract(footprint, "$.%s") is not null and json_extract(footprint, "$.%s") <= %f`, (metric + "_" + footprintStat), (metric + "_" + footprintStat), peak))

-	// Only accessible Jobs...
 	mainQuery, qerr := SecurityCheck(ctx, mainQuery)
 	if qerr != nil {
 		return nil, qerr
 	}

-	// Filters...
 	for _, f := range filters {
 		mainQuery = BuildWhereClause(f, mainQuery)
 	}

-	// Finalize query with Grouping and Ordering
 	mainQuery = mainQuery.GroupBy("bin").OrderBy("bin")

 	rows, err := mainQuery.RunWith(r.DB).Query()
@@ -725,8 +949,7 @@ func (r *JobRepository) jobsMetricStatisticsHistogram(
 		return nil, err
 	}

-	// Initialize histogram bins with calculated min/max ranges
-	// Each bin represents a range of metric values
+	// Pre-initialize bins with calculated min/max ranges.
 	// Example: peak=1000, bins=10 -> bin 1=[0,100), bin 2=[100,200), ..., bin 10=[900,1000]
 	points := make([]*model.MetricHistoPoint, 0)
 	binStep := int(peak) / *bins
@@ -737,29 +960,18 @@ func (r *JobRepository) jobsMetricStatisticsHistogram(
 		points = append(points, &epoint)
 	}

-	// Fill counts from query results
-	// Query only returns bins that have jobs, so we match against pre-initialized bins
+	// Match query results to pre-initialized bins.
 	for rows.Next() {
 		rpoint := model.MetricHistoPoint{}
-		if err := rows.Scan(&rpoint.Bin, &rpoint.Count); err != nil { // Required for Debug: &rpoint.Min, &rpoint.Max
+		if err := rows.Scan(&rpoint.Bin, &rpoint.Count); err != nil {
 			cclog.Warnf("Error while scanning rows for %s", metric)
-			return nil, err // FIXME: Totally bricks cc-backend if returned and if all metrics requested?
+			return nil, err
 		}

-		// Match query result to pre-initialized bin and update count
 		for _, e := range points {
-			if e.Bin != nil && rpoint.Bin != nil {
-				if *e.Bin == *rpoint.Bin {
-					e.Count = rpoint.Count
-					// Only Required For Debug: Check DB returned Min/Max against Backend Init above
-					// if rpoint.Min != nil {
-					// 	cclog.Warnf(">>>> Bin %d Min Set For %s to %d (Init'd with: %d)", *e.Bin, metric, *rpoint.Min, *e.Min)
-					// }
-					// if rpoint.Max != nil {
-					// 	cclog.Warnf(">>>> Bin %d Max Set For %s to %d (Init'd with: %d)", *e.Bin, metric, *rpoint.Max, *e.Max)
-					// }
-					break
-				}
+			if e.Bin != nil && rpoint.Bin != nil && *e.Bin == *rpoint.Bin {
+				e.Count = rpoint.Count
+				break
 			}
 		}
 	}
@@ -770,6 +982,28 @@ func (r *JobRepository) jobsMetricStatisticsHistogram(
 	return &result, nil
 }

+// runningJobsMetricStatisticsHistogram generates metric histograms for running jobs using live data.
+//
+// Unlike completed jobs which use footprint data from the database, running jobs require
+// fetching current metric averages from the metric backend (via metricdispatch).
+//
+// Parameters:
+//   - metrics: List of metric names
+//   - filters: Job filters (should filter to running jobs only)
+//   - bins: Number of histogram bins
+//
+// Returns slice of MetricHistoPoints, one per metric.
+//
+// Limitations:
+//   - Maximum 500 jobs (returns nil if more jobs match)
+//   - Requires metric backend availability
+//   - Bins based on metric peak values from cluster configuration
+//
+// Algorithm:
+//  1. Query first 501 jobs to check count limit
+//  2. Load metric averages for all jobs via metricdispatch
+//  3. For each metric, create bins based on peak value
+//  4. Iterate averages and count jobs per bin
 func (r *JobRepository) runningJobsMetricStatisticsHistogram(
 	ctx context.Context,
 	metrics []string,