// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved. This file is part of cc-backend.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.

// This file contains job statistics and histogram generation functionality for the JobRepository.
//
// # Job Statistics
//
// The statistics methods provide aggregated metrics about jobs including total jobs, users,
// walltime, and resource usage (nodes, cores, accelerators). Statistics can be computed:
// - Overall (JobsStats): Single aggregate across all matching jobs
// - Grouped (JobsStatsGrouped): Aggregated by user, project, cluster, or subcluster
// - Counts (JobCountGrouped, AddJobCount): Simple job counts with optional filtering
//
// All statistics methods support filtering via JobFilter and respect security contexts.
//
// # Histograms
//
// Histogram methods generate distribution data for visualization:
// - Duration, nodes, cores, accelerators (AddHistograms)
// - Job metrics like CPU load, memory usage (AddMetricHistograms)
//
// Histograms use intelligent binning:
// - Duration: Variable bin sizes (1m, 10m, 1h, 6h, 12h, 24h) with zero-padding
// - Resources: Natural value-based bins
// - Metrics: Normalized to peak values with configurable bin counts
//
// # Running vs. Completed Jobs
//
// Statistics handle running jobs specially:
// - Duration calculated as (now - start_time) for running jobs
// - Metric histograms for running jobs load data from metric backend instead of footprint
// - Job state filtering distinguishes running/completed jobs
//
// # Performance Considerations
//
// - All queries use prepared statements via stmtCache
// - Complex aggregations use SQL for efficiency
// - Histogram pre-initialization ensures consistent bin ranges
// - Metric histogram queries limited to 500 jobs for running job analysis
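//
// # Example
//
// A minimal, illustrative sketch of how these methods are typically composed by a
// caller such as a GraphQL resolver (ctx and filters are assumed to exist and are
// not defined in this file):
//
//	groupBy := model.AggregateUser
//	stats, err := r.JobsStatsGrouped(ctx, filters, nil, nil, &groupBy)
//	if err == nil {
//		stats, err = r.AddJobCountGrouped(ctx, filters, &groupBy, stats, "running")
//	}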

package repository

import (
	"context"
	"database/sql"
	"fmt"
	"time"

	"github.com/ClusterCockpit/cc-backend/internal/config"
	"github.com/ClusterCockpit/cc-backend/internal/graph/model"
	"github.com/ClusterCockpit/cc-backend/internal/metricdispatch"
	"github.com/ClusterCockpit/cc-backend/pkg/archive"
	cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
	"github.com/ClusterCockpit/cc-lib/v2/schema"
	sq "github.com/Masterminds/squirrel"
)

// groupBy2column maps GraphQL Aggregate enum values to their corresponding database column names.
// Used by JobsStatsGrouped and JobCountGrouped to translate user-facing grouping dimensions
// into SQL GROUP BY clauses. GraphQL validation ensures only valid enum values are accepted.
var groupBy2column = map[model.Aggregate]string{
	model.AggregateUser:       "job.hpc_user",
	model.AggregateProject:    "job.project",
	model.AggregateCluster:    "job.cluster",
	model.AggregateSubcluster: "job.subcluster",
}

// sortBy2column maps GraphQL SortByAggregate enum values to their corresponding computed column names.
// Used by JobsStatsGrouped to translate sort preferences into SQL ORDER BY clauses.
// Column names match the AS aliases used in buildStatsQuery.
var sortBy2column = map[model.SortByAggregate]string{
	model.SortByAggregateTotaljobs:      "totalJobs",
	model.SortByAggregateTotalusers:     "totalUsers",
	model.SortByAggregateTotalwalltime:  "totalWalltime",
	model.SortByAggregateTotalnodes:     "totalNodes",
	model.SortByAggregateTotalnodehours: "totalNodeHours",
	model.SortByAggregateTotalcores:     "totalCores",
	model.SortByAggregateTotalcorehours: "totalCoreHours",
	model.SortByAggregateTotalaccs:      "totalAccs",
	model.SortByAggregateTotalacchours:  "totalAccHours",
}

// buildCountQuery constructs a SQL query to count jobs with optional grouping and filtering.
//
// Parameters:
// - filter: Job filters to apply (cluster, user, time range, etc.)
// - kind: Special filter - "running" for running jobs only, "short" for jobs under threshold
// - col: Column name to GROUP BY; empty string for total count without grouping
//
// Returns a SelectBuilder that produces either:
// - Single count: COUNT(job.id) when col is empty
// - Grouped counts: col, COUNT(job.id) when col is specified
//
// The kind parameter enables counting specific job categories:
// - "running": Only jobs with job_state = 'running'
// - "short": Only jobs with duration < ShortRunningJobsDuration config value
// - empty: All jobs matching filters
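//
// Illustrative sketch (not verbatim builder output) of the SQL shape produced for
// kind="running" and col="job.cluster", before any JobFilter is applied:
//
//	SELECT job.cluster, COUNT(job.id) FROM job
//	WHERE job.job_state = ? GROUP BY job.cluster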
func (r *JobRepository) buildCountQuery(
	filter []*model.JobFilter,
	kind string,
	col string,
) sq.SelectBuilder {
	var query sq.SelectBuilder

	if col != "" {
		query = sq.Select(col, "COUNT(job.id)").From("job").GroupBy(col)
	} else {
		query = sq.Select("COUNT(job.id)").From("job")
	}

	switch kind {
	case "running":
		query = query.Where("job.job_state = ?", "running")
	case "short":
		query = query.Where("job.duration < ?", config.Keys.ShortRunningJobsDuration)
	}

	for _, f := range filter {
		query = BuildWhereClause(f, query)
	}

	return query
}

// buildStatsQuery constructs a SQL query to compute comprehensive job statistics with optional grouping.
//
// Parameters:
// - filter: Job filters to apply (cluster, user, time range, etc.)
// - col: Column name to GROUP BY; empty string for overall statistics without grouping
//
// Returns a SelectBuilder that produces comprehensive statistics:
// - totalJobs: Count of jobs
// - totalUsers: Count of distinct users (always 0 when grouping by user)
// - totalWalltime: Sum of job durations in hours
// - totalNodes: Sum of nodes used across all jobs
// - totalNodeHours: Sum of (duration × num_nodes) in hours
// - totalCores: Sum of hardware threads used across all jobs
// - totalCoreHours: Sum of (duration × num_hwthreads) in hours
// - totalAccs: Sum of accelerators used across all jobs
// - totalAccHours: Sum of (duration × num_acc) in hours
//
// Special handling:
// - Running jobs: Duration calculated as (now - start_time) instead of stored duration
// - Grouped queries: Also select grouping column and user's display name from hpc_user table
// - All time values converted from seconds to hours (÷ 3600) and rounded
func (r *JobRepository) buildStatsQuery(
	filter []*model.JobFilter,
	col string,
) sq.SelectBuilder {
	var query sq.SelectBuilder

	if col != "" {
		query = sq.Select(
			col,
			"name",
			"COUNT(job.id) as totalJobs",
			"COUNT(DISTINCT job.hpc_user) AS totalUsers",
			fmt.Sprintf(`CAST(ROUND(SUM((CASE WHEN job.job_state = "running" THEN %d - job.start_time ELSE job.duration END)) / 3600) as int) as totalWalltime`, time.Now().Unix()),
			`CAST(SUM(job.num_nodes) as int) as totalNodes`,
			fmt.Sprintf(`CAST(ROUND(SUM((CASE WHEN job.job_state = "running" THEN %d - job.start_time ELSE job.duration END) * job.num_nodes) / 3600) as int) as totalNodeHours`, time.Now().Unix()),
			`CAST(SUM(job.num_hwthreads) as int) as totalCores`,
			fmt.Sprintf(`CAST(ROUND(SUM((CASE WHEN job.job_state = "running" THEN %d - job.start_time ELSE job.duration END) * job.num_hwthreads) / 3600) as int) as totalCoreHours`, time.Now().Unix()),
			`CAST(SUM(job.num_acc) as int) as totalAccs`,
			fmt.Sprintf(`CAST(ROUND(SUM((CASE WHEN job.job_state = "running" THEN %d - job.start_time ELSE job.duration END) * job.num_acc) / 3600) as int) as totalAccHours`, time.Now().Unix()),
		).From("job").LeftJoin("hpc_user ON hpc_user.username = job.hpc_user").GroupBy(col)
	} else {
		query = sq.Select(
			"COUNT(job.id) as totalJobs",
			"COUNT(DISTINCT job.hpc_user) AS totalUsers",
			fmt.Sprintf(`CAST(ROUND(SUM((CASE WHEN job.job_state = "running" THEN %d - job.start_time ELSE job.duration END)) / 3600) as int)`, time.Now().Unix()),
			`CAST(SUM(job.num_nodes) as int)`,
			fmt.Sprintf(`CAST(ROUND(SUM((CASE WHEN job.job_state = "running" THEN %d - job.start_time ELSE job.duration END) * job.num_nodes) / 3600) as int)`, time.Now().Unix()),
			`CAST(SUM(job.num_hwthreads) as int)`,
			fmt.Sprintf(`CAST(ROUND(SUM((CASE WHEN job.job_state = "running" THEN %d - job.start_time ELSE job.duration END) * job.num_hwthreads) / 3600) as int)`, time.Now().Unix()),
			`CAST(SUM(job.num_acc) as int)`,
			fmt.Sprintf(`CAST(ROUND(SUM((CASE WHEN job.job_state = "running" THEN %d - job.start_time ELSE job.duration END) * job.num_acc) / 3600) as int)`, time.Now().Unix()),
		).From("job")
	}

	for _, f := range filter {
		query = BuildWhereClause(f, query)
	}

	return query
}

// JobsStatsGrouped computes comprehensive job statistics grouped by a dimension (user, project, cluster, or subcluster).
//
// This is the primary method for generating aggregated statistics views in the UI, providing
// metrics like total jobs, walltime, and resource usage broken down by the specified grouping.
//
// Parameters:
// - ctx: Context for security checks and cancellation
// - filter: Filters to apply (time range, cluster, job state, etc.)
// - page: Optional pagination (ItemsPerPage: -1 disables pagination)
// - sortBy: Optional sort column (totalJobs, totalWalltime, totalCoreHours, etc.)
// - groupBy: Required grouping dimension (User, Project, Cluster, or Subcluster)
//
// Returns a slice of JobsStatistics, one per group, with:
// - ID: The group identifier (username, project name, cluster name, etc.)
// - Name: Display name (for users, from hpc_user.name; empty for other groups)
// - Statistics: totalJobs, totalUsers, totalWalltime, resource usage metrics
//
// Security: Respects user roles via SecurityCheck - users see only their own data unless admin/support.
// Performance: Results are sorted in SQL and pagination applied before scanning rows.
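//
// Illustrative usage sketch (ctx and filters are assumed; pagination and sorting are optional):
//
//	groupBy := model.AggregateProject
//	sortBy := model.SortByAggregateTotalcorehours
//	page := &model.PageRequest{Page: 1, ItemsPerPage: 10}
//	stats, err := r.JobsStatsGrouped(ctx, filters, page, &sortBy, &groupBy)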
func (r *JobRepository) JobsStatsGrouped(
	ctx context.Context,
	filter []*model.JobFilter,
	page *model.PageRequest,
	sortBy *model.SortByAggregate,
	groupBy *model.Aggregate,
) ([]*model.JobsStatistics, error) {
	start := time.Now()
	col := groupBy2column[*groupBy]
	query := r.buildStatsQuery(filter, col)

	query, err := SecurityCheck(ctx, query)
	if err != nil {
		return nil, err
	}

	if sortBy != nil {
		sortBy := sortBy2column[*sortBy]
		query = query.OrderBy(fmt.Sprintf("%s DESC", sortBy))
	}
	if page != nil && page.ItemsPerPage != -1 {
		limit := uint64(page.ItemsPerPage)
		query = query.Offset((uint64(page.Page) - 1) * limit).Limit(limit)
	}

	rows, err := query.RunWith(r.DB).Query()
	if err != nil {
		cclog.Warn("Error while querying DB for job statistics")
		return nil, err
	}

	stats := make([]*model.JobsStatistics, 0, 100)

	for rows.Next() {
		var id sql.NullString
		var name sql.NullString
		var jobs, users, walltime, nodes, nodeHours, cores, coreHours, accs, accHours sql.NullInt64
		if err := rows.Scan(&id, &name, &jobs, &users, &walltime, &nodes, &nodeHours, &cores, &coreHours, &accs, &accHours); err != nil {
			cclog.Warnf("Error while scanning rows: %s", err.Error())
			return nil, err
		}

		if id.Valid {
			var totalJobs, totalUsers, totalWalltime, totalNodes, totalNodeHours, totalCores, totalCoreHours, totalAccs, totalAccHours int
			var personName string

			if name.Valid {
				personName = name.String
			}

			if jobs.Valid {
				totalJobs = int(jobs.Int64)
			}

			if users.Valid {
				totalUsers = int(users.Int64)
			}

			if walltime.Valid {
				totalWalltime = int(walltime.Int64)
			}

			if nodes.Valid {
				totalNodes = int(nodes.Int64)
			}
			if cores.Valid {
				totalCores = int(cores.Int64)
			}
			if accs.Valid {
				totalAccs = int(accs.Int64)
			}

			if nodeHours.Valid {
				totalNodeHours = int(nodeHours.Int64)
			}
			if coreHours.Valid {
				totalCoreHours = int(coreHours.Int64)
			}
			if accHours.Valid {
				totalAccHours = int(accHours.Int64)
			}

			if col == "job.hpc_user" {
				// name := r.getUserName(ctx, id.String)
				stats = append(stats,
					&model.JobsStatistics{
						ID:             id.String,
						Name:           personName,
						TotalJobs:      totalJobs,
						TotalWalltime:  totalWalltime,
						TotalNodes:     totalNodes,
						TotalNodeHours: totalNodeHours,
						TotalCores:     totalCores,
						TotalCoreHours: totalCoreHours,
						TotalAccs:      totalAccs,
						TotalAccHours:  totalAccHours,
					})
			} else {
				stats = append(stats,
					&model.JobsStatistics{
						ID:             id.String,
						TotalJobs:      totalJobs,
						TotalUsers:     totalUsers,
						TotalWalltime:  totalWalltime,
						TotalNodes:     totalNodes,
						TotalNodeHours: totalNodeHours,
						TotalCores:     totalCores,
						TotalCoreHours: totalCoreHours,
						TotalAccs:      totalAccs,
						TotalAccHours:  totalAccHours,
					})
			}
		}
	}

	cclog.Debugf("Timer JobsStatsGrouped %s", time.Since(start))
	return stats, nil
}

// JobsStats computes overall job statistics across all matching jobs without grouping.
//
// This method provides a single aggregate view of job metrics, useful for dashboard
// summaries and overall system utilization reports.
//
// Parameters:
// - ctx: Context for security checks and cancellation
// - filter: Filters to apply (time range, cluster, job state, etc.)
//
// Returns a single-element slice containing aggregate statistics:
// - totalJobs, totalUsers, totalWalltime
// - totalNodeHours, totalCoreHours, totalAccHours
//
// Unlike JobsStatsGrouped, this returns overall totals without breaking down by dimension.
// Security checks are applied via SecurityCheck to respect user access levels.
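//
// Illustrative usage sketch (ctx and filters are assumed):
//
//	stats, err := r.JobsStats(ctx, filters)
//	if err == nil && len(stats) == 1 {
//		fmt.Printf("jobs=%d walltime=%dh\n", stats[0].TotalJobs, stats[0].TotalWalltime)
//	}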
func (r *JobRepository) JobsStats(
	ctx context.Context,
	filter []*model.JobFilter,
) ([]*model.JobsStatistics, error) {
	start := time.Now()
	query := r.buildStatsQuery(filter, "")
	query, err := SecurityCheck(ctx, query)
	if err != nil {
		return nil, err
	}

	row := query.RunWith(r.DB).QueryRow()
	stats := make([]*model.JobsStatistics, 0, 1)

	var jobs, users, walltime, nodes, nodeHours, cores, coreHours, accs, accHours sql.NullInt64
	if err := row.Scan(&jobs, &users, &walltime, &nodes, &nodeHours, &cores, &coreHours, &accs, &accHours); err != nil {
		cclog.Warn("Error while scanning rows")
		return nil, err
	}

	if jobs.Valid {
		var totalNodeHours, totalCoreHours, totalAccHours int

		if nodeHours.Valid {
			totalNodeHours = int(nodeHours.Int64)
		}
		if coreHours.Valid {
			totalCoreHours = int(coreHours.Int64)
		}
		if accHours.Valid {
			totalAccHours = int(accHours.Int64)
		}
		stats = append(stats,
			&model.JobsStatistics{
				TotalJobs:      int(jobs.Int64),
				TotalUsers:     int(users.Int64),
				TotalWalltime:  int(walltime.Int64),
				TotalNodeHours: totalNodeHours,
				TotalCoreHours: totalCoreHours,
				TotalAccHours:  totalAccHours,
			})
	}

	cclog.Debugf("Timer JobStats %s", time.Since(start))
	return stats, nil
}

// LoadJobStat retrieves a specific statistic for a metric from a job's statistics.
// Returns 0.0 if the metric is not found or statType is invalid.
//
// Parameters:
// - job: Job struct with populated Statistics field
// - metric: Name of the metric to query (e.g., "cpu_load", "mem_used")
// - statType: Type of statistic: "avg", "min", or "max"
//
// Returns the requested statistic value or 0.0 if not found.
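//
// Illustrative usage sketch (job is assumed to be a *schema.Job with Statistics populated):
//
//	avgLoad := LoadJobStat(job, "cpu_load", "avg")
//	maxMem := LoadJobStat(job, "mem_used", "max")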
func LoadJobStat(job *schema.Job, metric string, statType string) float64 {
	if stats, ok := job.Statistics[metric]; ok {
		switch statType {
		case "avg":
			return stats.Avg
		case "max":
			return stats.Max
		case "min":
			return stats.Min
		default:
			cclog.Errorf("Unknown stat type %s", statType)
		}
	}

	return 0.0
}

// JobCountGrouped counts jobs grouped by a dimension without computing detailed statistics.
//
// This is a lightweight alternative to JobsStatsGrouped when only job counts are needed,
// avoiding the overhead of calculating walltime and resource usage metrics.
//
// Parameters:
// - ctx: Context for security checks
// - filter: Filters to apply
// - groupBy: Grouping dimension (User, Project, Cluster, or Subcluster)
//
// Returns JobsStatistics with only ID and TotalJobs populated for each group.
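//
// Illustrative usage sketch (ctx and filters are assumed):
//
//	groupBy := model.AggregateCluster
//	counts, err := r.JobCountGrouped(ctx, filters, &groupBy)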
func (r *JobRepository) JobCountGrouped(
	ctx context.Context,
	filter []*model.JobFilter,
	groupBy *model.Aggregate,
) ([]*model.JobsStatistics, error) {
	start := time.Now()
	col := groupBy2column[*groupBy]
	query := r.buildCountQuery(filter, "", col)
	query, err := SecurityCheck(ctx, query)
	if err != nil {
		return nil, err
	}
	rows, err := query.RunWith(r.DB).Query()
	if err != nil {
		cclog.Warn("Error while querying DB for job statistics")
		return nil, err
	}

	stats := make([]*model.JobsStatistics, 0, 100)

	for rows.Next() {
		var id sql.NullString
		var cnt sql.NullInt64
		if err := rows.Scan(&id, &cnt); err != nil {
			cclog.Warn("Error while scanning rows")
			return nil, err
		}
		if id.Valid {
			stats = append(stats,
				&model.JobsStatistics{
					ID:        id.String,
					TotalJobs: int(cnt.Int64),
				})
		}
	}

	cclog.Debugf("Timer JobCountGrouped %s", time.Since(start))
	return stats, nil
}

// AddJobCountGrouped augments existing statistics with additional job counts by category.
//
// This method enriches JobsStatistics returned by JobsStatsGrouped or JobCountGrouped
// with counts of running or short-running jobs, matched by group ID.
//
// Parameters:
// - ctx: Context for security checks
// - filter: Filters to apply
// - groupBy: Grouping dimension (must match the dimension used for stats parameter)
// - stats: Existing statistics to augment (modified in-place by ID matching)
// - kind: "running" to add RunningJobs count, "short" to add ShortJobs count
//
// Returns the same stats slice with RunningJobs or ShortJobs fields populated per group.
// Groups without matching jobs will have 0 for the added field.
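//
// Illustrative usage sketch chaining JobCountGrouped and AddJobCountGrouped
// (the same groupBy must be used in both calls; ctx and filters are assumed):
//
//	groupBy := model.AggregateUser
//	stats, err := r.JobCountGrouped(ctx, filters, &groupBy)
//	if err == nil {
//		stats, err = r.AddJobCountGrouped(ctx, filters, &groupBy, stats, "running")
//	}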
func (r *JobRepository) AddJobCountGrouped(
	ctx context.Context,
	filter []*model.JobFilter,
	groupBy *model.Aggregate,
	stats []*model.JobsStatistics,
	kind string,
) ([]*model.JobsStatistics, error) {
	start := time.Now()
	col := groupBy2column[*groupBy]
	query := r.buildCountQuery(filter, kind, col)
	query, err := SecurityCheck(ctx, query)
	if err != nil {
		return nil, err
	}
	rows, err := query.RunWith(r.DB).Query()
	if err != nil {
		cclog.Warn("Error while querying DB for job statistics")
		return nil, err
	}

	counts := make(map[string]int)

	for rows.Next() {
		var id sql.NullString
		var cnt sql.NullInt64
		if err := rows.Scan(&id, &cnt); err != nil {
			cclog.Warn("Error while scanning rows")
			return nil, err
		}
		if id.Valid {
			counts[id.String] = int(cnt.Int64)
		}
	}

	switch kind {
	case "running":
		for _, s := range stats {
			s.RunningJobs = counts[s.ID]
		}
	case "short":
		for _, s := range stats {
			s.ShortJobs = counts[s.ID]
		}
	}

	cclog.Debugf("Timer AddJobCountGrouped %s", time.Since(start))
	return stats, nil
}

// AddJobCount augments existing overall statistics with additional job counts by category.
//
// Similar to AddJobCountGrouped but for ungrouped statistics. Applies the same count
// to all statistics entries (typically just one).
//
// Parameters:
// - ctx: Context for security checks
// - filter: Filters to apply
// - stats: Existing statistics to augment (modified in-place)
// - kind: "running" to add RunningJobs count, "short" to add ShortJobs count
//
// Returns the same stats slice with RunningJobs or ShortJobs fields set to the total count.
func (r *JobRepository) AddJobCount(
	ctx context.Context,
	filter []*model.JobFilter,
	stats []*model.JobsStatistics,
	kind string,
) ([]*model.JobsStatistics, error) {
	start := time.Now()
	query := r.buildCountQuery(filter, kind, "")
	query, err := SecurityCheck(ctx, query)
	if err != nil {
		return nil, err
	}
	rows, err := query.RunWith(r.DB).Query()
	if err != nil {
		cclog.Warn("Error while querying DB for job statistics")
		return nil, err
	}

	var count int

	for rows.Next() {
		var cnt sql.NullInt64
		if err := rows.Scan(&cnt); err != nil {
			cclog.Warn("Error while scanning rows")
			return nil, err
		}

		count = int(cnt.Int64)
	}

	switch kind {
	case "running":
		for _, s := range stats {
			s.RunningJobs = count
		}
	case "short":
		for _, s := range stats {
			s.ShortJobs = count
		}
	}

	cclog.Debugf("Timer AddJobCount %s", time.Since(start))
	return stats, nil
}

// AddHistograms augments statistics with distribution histograms for job properties.
//
// Generates histogram data for visualization of job duration, node count, core count,
// and accelerator count distributions. Duration histogram uses intelligent binning based
// on the requested resolution.
//
// Parameters:
// - ctx: Context for security checks
// - filter: Filters to apply to jobs included in histograms
// - stat: Statistics struct to augment (modified in-place)
// - durationBins: Bin size - "1m", "10m", "1h", "6h", "12h", or "24h" (default)
//
// Populates these fields in stat:
// - HistDuration: Job duration distribution (zero-padded bins)
// - HistNumNodes: Node count distribution
// - HistNumCores: Core (hwthread) count distribution
// - HistNumAccs: Accelerator count distribution
//
// Duration bins are pre-initialized with zeros to ensure consistent ranges for visualization.
// Bin size determines both the width and maximum duration displayed (e.g., "1h" = 48 bins × 1h = 48h max).
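//
// Illustrative usage sketch (stat is assumed to be a single *model.JobsStatistics,
// e.g. the first element returned by JobsStats; ctx and filters are assumed):
//
//	bins := "1h"
//	stat, err := r.AddHistograms(ctx, filters, stat, &bins)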
func (r *JobRepository) AddHistograms(
	ctx context.Context,
	filter []*model.JobFilter,
	stat *model.JobsStatistics,
	durationBins *string,
) (*model.JobsStatistics, error) {
	start := time.Now()

	var targetBinCount int
	var targetBinSize int
	switch *durationBins {
	case "1m": // 1 Minute Bins + Max 60 Bins -> Max 60 Minutes
		targetBinCount = 60
		targetBinSize = 60
	case "10m": // 10 Minute Bins + Max 72 Bins -> Max 12 Hours
		targetBinCount = 72
		targetBinSize = 600
	case "1h": // 1 Hour Bins + Max 48 Bins -> Max 48 Hours
		targetBinCount = 48
		targetBinSize = 3600
	case "6h": // 6 Hour Bins + Max 12 Bins -> Max 3 Days
		targetBinCount = 12
		targetBinSize = 21600
	case "12h": // 12 Hour Bins + Max 14 Bins -> Max 7 Days
		targetBinCount = 14
		targetBinSize = 43200
	default: // 24h
		targetBinCount = 24
		targetBinSize = 3600
	}

	var err error
	// Return X-values always in seconds; they are formatted into minutes and hours in the frontend.
	value := fmt.Sprintf(`CAST(ROUND(((CASE WHEN job.job_state = "running" THEN %d - job.start_time ELSE job.duration END) / %d) + 1) as int) as value`, time.Now().Unix(), targetBinSize)
	stat.HistDuration, err = r.jobsDurationStatisticsHistogram(ctx, value, filter, targetBinSize, &targetBinCount)
	if err != nil {
		cclog.Warn("Error while loading job statistics histogram: job duration")
		return nil, err
	}

	stat.HistNumNodes, err = r.jobsStatisticsHistogram(ctx, "job.num_nodes as value", filter)
	if err != nil {
		cclog.Warn("Error while loading job statistics histogram: num nodes")
		return nil, err
	}

	stat.HistNumCores, err = r.jobsStatisticsHistogram(ctx, "job.num_hwthreads as value", filter)
	if err != nil {
		cclog.Warn("Error while loading job statistics histogram: num hwthreads")
		return nil, err
	}

	stat.HistNumAccs, err = r.jobsStatisticsHistogram(ctx, "job.num_acc as value", filter)
	if err != nil {
		cclog.Warn("Error while loading job statistics histogram: num acc")
		return nil, err
	}

	cclog.Debugf("Timer AddHistograms %s", time.Since(start))
	return stat, nil
}

// AddMetricHistograms augments statistics with distribution histograms for job metrics.
//
// Generates histogram data for metrics like CPU load, memory usage, etc. Handles running
// and completed jobs differently: running jobs load data from the metric backend, completed
// jobs use footprint data from the database.
//
// Parameters:
// - ctx: Context for security checks
// - filter: Filters to apply (MUST contain State filter for running jobs)
// - metrics: List of metric names to histogram (e.g., ["cpu_load", "mem_used"])
// - stat: Statistics struct to augment (modified in-place)
// - targetBinCount: Number of histogram bins (default: 10)
//
// Populates HistMetrics field in stat with MetricHistoPoints for each metric.
//
// Binning algorithm:
// - Values normalized to metric's peak value from cluster configuration
// - Bins evenly distributed from 0 to peak
// - Pre-initialized with zeros for consistent visualization
//
// Limitations:
// - Running jobs: Limited to 500 jobs for performance
// - Requires valid cluster configuration with metric peak values
// - Uses footprint statistic (avg/max/min) configured per metric
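//
// Illustrative usage sketch (ctx, filters, and stat are assumed; the metric names are
// examples only):
//
//	bins := 10
//	stat, err := r.AddMetricHistograms(ctx, filters, []string{"cpu_load", "mem_used"}, stat, &bins)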
func (r *JobRepository) AddMetricHistograms(
	ctx context.Context,
	filter []*model.JobFilter,
	metrics []string,
	stat *model.JobsStatistics,
	targetBinCount *int,
) (*model.JobsStatistics, error) {
	start := time.Now()

	// Running jobs only: first query the job data from SQLite, then load metric data and make bins
	for _, f := range filter {
		if f.State != nil {
			if len(f.State) == 1 && f.State[0] == "running" {
				stat.HistMetrics = r.runningJobsMetricStatisticsHistogram(ctx, metrics, filter, targetBinCount)
				cclog.Debugf("Timer AddMetricHistograms %s", time.Since(start))
				return stat, nil
			}
		}
	}

	// All other cases: query and make bins in SQLite directly
	for _, m := range metrics {
		metricHisto, err := r.jobsMetricStatisticsHistogram(ctx, m, filter, targetBinCount)
		if err != nil {
			cclog.Warnf("Error while loading job metric statistics histogram: %s", m)
			continue
		}
		stat.HistMetrics = append(stat.HistMetrics, metricHisto)
	}

	cclog.Debugf("Timer AddMetricHistograms %s", time.Since(start))
	return stat, nil
}

// jobsStatisticsHistogram generates a simple histogram by grouping on a column value.
//
// Used for histograms where the column value directly represents the bin (e.g., node count, core count).
// Unlike duration/metric histograms, this doesn't pre-initialize bins with zeros.
//
// Parameters:
// - value: SQL expression that produces the histogram value, aliased as "value"
// - filters: Job filters to apply
//
// Returns histogram points with Value (from column) and Count (number of jobs).
func (r *JobRepository) jobsStatisticsHistogram(
	ctx context.Context,
	value string,
	filters []*model.JobFilter,
) ([]*model.HistoPoint, error) {
	start := time.Now()
	query, qerr := SecurityCheck(ctx,
		sq.Select(value, "COUNT(job.id) AS count").From("job"))

	if qerr != nil {
		return nil, qerr
	}

	for _, f := range filters {
		query = BuildWhereClause(f, query)
	}

	rows, err := query.GroupBy("value").RunWith(r.DB).Query()
	if err != nil {
		cclog.Error("Error while running query")
		return nil, err
	}

	points := make([]*model.HistoPoint, 0)
	// is it possible to introduce zero values here? requires info about bincount
	for rows.Next() {
		point := model.HistoPoint{}
		if err := rows.Scan(&point.Value, &point.Count); err != nil {
			cclog.Warn("Error while scanning rows")
			return nil, err
		}

		points = append(points, &point)
	}
	cclog.Debugf("Timer jobsStatisticsHistogram %s", time.Since(start))
	return points, nil
}

// jobsDurationStatisticsHistogram generates a duration histogram with pre-initialized bins.
//
// Bins are zero-padded to provide consistent ranges for visualization, unlike simple
// histograms which only return bins with data. The value parameter should compute
// the bin number from job duration.
//
// Parameters:
// - value: SQL expression computing bin number from duration, aliased as "value"
// - filters: Job filters to apply
// - binSizeSeconds: Width of each bin in seconds
// - targetBinCount: Number of bins to pre-initialize
//
// Returns histogram points with Value (bin_number × binSizeSeconds) and Count.
// All bins from 1 to targetBinCount are returned, with Count=0 for empty bins.
//
// Algorithm:
// 1. Pre-initialize targetBinCount bins with zero counts
// 2. Query database for actual counts per bin
// 3. Match query results to pre-initialized bins by value
// 4. Bins without matches remain at zero
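//
// Worked example (illustrative): with binSizeSeconds=3600 and the bin expression built in
// AddHistograms, a 5400s job yields bin number (5400/3600)+1 = 2 (integer division), which
// is matched to the pre-initialized point with Value = 2*3600 = 7200.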
func (r *JobRepository) jobsDurationStatisticsHistogram(
	ctx context.Context,
	value string,
	filters []*model.JobFilter,
	binSizeSeconds int,
	targetBinCount *int,
) ([]*model.HistoPoint, error) {
	start := time.Now()
	query, qerr := SecurityCheck(ctx,
		sq.Select(value, "COUNT(job.id) AS count").From("job"))

	if qerr != nil {
		return nil, qerr
	}

	// Each bin represents a duration range: bin N = [(N-1)*binSizeSeconds, N*binSizeSeconds)
	// Example: binSizeSeconds=3600 (1 hour), bin 1 = 0-1h, bin 2 = 1-2h, etc.
	points := make([]*model.HistoPoint, 0)
	for i := 1; i <= *targetBinCount; i++ {
		point := model.HistoPoint{Value: i * binSizeSeconds, Count: 0}
		points = append(points, &point)
	}

	for _, f := range filters {
		query = BuildWhereClause(f, query)
	}

	rows, err := query.GroupBy("value").RunWith(r.DB).Query()
	if err != nil {
		cclog.Error("Error while running query")
		return nil, err
	}

	// Match query results to pre-initialized bins.
	// point.Value from query is the bin number; multiply by binSizeSeconds to match bin.Value.
	for rows.Next() {
		point := model.HistoPoint{}
		if err := rows.Scan(&point.Value, &point.Count); err != nil {
			cclog.Warn("Error while scanning rows")
			return nil, err
		}

		for _, e := range points {
			if e.Value == (point.Value * binSizeSeconds) {
				e.Count = point.Count
				break
			}
		}
	}

	cclog.Debugf("Timer jobsStatisticsHistogram %s", time.Since(start))
	return points, nil
}

// jobsMetricStatisticsHistogram generates a metric histogram using footprint data from completed jobs.
//
// Values are normalized to the metric's peak value and distributed into bins. The algorithm
// is based on SQL histogram generation techniques, extracting metric values from JSON footprint
// and computing bin assignments in SQL.
//
// Parameters:
// - metric: Metric name (e.g., "cpu_load", "mem_used")
// - filters: Job filters to apply
// - bins: Number of bins to generate
//
// Returns MetricHistoPoints with metric name, unit, footprint stat type, and binned data.
//
// Algorithm:
// 1. Determine peak value from cluster configuration (filtered cluster or max across all)
// 2. Generate SQL that extracts footprint value, normalizes to [0,1], multiplies by bin count
// 3. Pre-initialize bins with min/max ranges based on peak value
// 4. Query database for counts per bin
// 5. Match results to pre-initialized bins
//
// Special handling: Values exactly equal to peak are forced into the last bin by multiplying
// peak by 0.999999999 to avoid creating an extra bin.
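//
// Worked example (illustrative): with peak=100 and bins=10, a footprint value of 37.5 is
// binned as CAST((37.5/100)*10 AS INTEGER) + 1 = 4, i.e. counted in bin 4, whose
// pre-initialized range is Min=30, Max=40.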
func (r *JobRepository) jobsMetricStatisticsHistogram(
	ctx context.Context,
	metric string,
	filters []*model.JobFilter,
	bins *int,
) (*model.MetricHistoPoints, error) {
	// Peak value defines the upper bound for binning: values are distributed across
	// bins from 0 to peak. First try to get peak from filtered cluster, otherwise
	// scan all clusters to find the maximum peak value.
	var metricConfig *schema.MetricConfig
	var peak float64
	var unit string
	var footprintStat string

	// Try to get metric config from filtered cluster
	for _, f := range filters {
		if f.Cluster != nil {
			metricConfig = archive.GetMetricConfig(*f.Cluster.Eq, metric)
			peak = metricConfig.Peak
			unit = metricConfig.Unit.Prefix + metricConfig.Unit.Base
			footprintStat = metricConfig.Footprint
			cclog.Debugf("Cluster %s filter found with peak %f for %s", *f.Cluster.Eq, peak, metric)
		}
	}

	// If no cluster filter or peak not found, find largest peak across all clusters
	// This ensures histogram can accommodate all possible values
	if peak == 0.0 {
		for _, c := range archive.Clusters {
			for _, m := range c.MetricConfig {
				if m.Name == metric {
					if m.Peak > peak {
						peak = m.Peak
					}
					if unit == "" {
						unit = m.Unit.Prefix + m.Unit.Base
					}
					if footprintStat == "" {
						footprintStat = m.Footprint
					}
				}
			}
		}
	}

	// Construct SQL histogram bins using normalized values.
	// Algorithm based on: https://jereze.com/code/sql-histogram/ (modified)
	start := time.Now()

	// Bin calculation formula:
	// bin_number = CAST( (value / peak) * num_bins AS INTEGER ) + 1
	// Special case: value == peak would create bin N+1, so we test for equality
	// and multiply peak by 0.999999999 to force it into bin N.
	binQuery := fmt.Sprintf(`CAST(
		((case when json_extract(footprint, "$.%s") = %f then %f*0.999999999 else json_extract(footprint, "$.%s") end) / %f)
		* %v as INTEGER )`,
		(metric + "_" + footprintStat), peak, peak, (metric + "_" + footprintStat), peak, *bins)

	mainQuery := sq.Select(
		fmt.Sprintf(`%s + 1 as bin`, binQuery),
		`count(*) as count`,
	).From("job").Where(
		"JSON_VALID(footprint)",
	).Where(fmt.Sprintf(`json_extract(footprint, "$.%s") is not null and json_extract(footprint, "$.%s") <= %f`, (metric + "_" + footprintStat), (metric + "_" + footprintStat), peak))

	mainQuery, qerr := SecurityCheck(ctx, mainQuery)
	if qerr != nil {
		return nil, qerr
	}

	for _, f := range filters {
		mainQuery = BuildWhereClause(f, mainQuery)
	}

	mainQuery = mainQuery.GroupBy("bin").OrderBy("bin")

	rows, err := mainQuery.RunWith(r.DB).Query()
	if err != nil {
		cclog.Errorf("Error while running mainQuery: %s", err)
		return nil, err
	}

	// Pre-initialize bins with calculated min/max ranges.
	// Example: peak=1000, bins=10 -> bin 1=[0,100), bin 2=[100,200), ..., bin 10=[900,1000]
	points := make([]*model.MetricHistoPoint, 0)
	binStep := int(peak) / *bins
	for i := 1; i <= *bins; i++ {
		binMin := (binStep * (i - 1))
		binMax := (binStep * i)
		epoint := model.MetricHistoPoint{Bin: &i, Count: 0, Min: &binMin, Max: &binMax}
		points = append(points, &epoint)
	}

	// Match query results to pre-initialized bins.
	for rows.Next() {
		rpoint := model.MetricHistoPoint{}
		if err := rows.Scan(&rpoint.Bin, &rpoint.Count); err != nil {
			cclog.Warnf("Error while scanning rows for %s", metric)
			return nil, err
		}

		for _, e := range points {
			if e.Bin != nil && rpoint.Bin != nil && *e.Bin == *rpoint.Bin {
				e.Count = rpoint.Count
				break
			}
		}
	}

	result := model.MetricHistoPoints{Metric: metric, Unit: unit, Stat: &footprintStat, Data: points}

	cclog.Debugf("Timer jobsStatisticsHistogram %s", time.Since(start))
	return &result, nil
}

// runningJobsMetricStatisticsHistogram generates metric histograms for running jobs using live data.
//
// Unlike completed jobs which use footprint data from the database, running jobs require
// fetching current metric averages from the metric backend (via metricdispatch).
//
// Parameters:
// - metrics: List of metric names
// - filters: Job filters (should filter to running jobs only)
// - bins: Number of histogram bins
//
// Returns slice of MetricHistoPoints, one per metric.
//
// Limitations:
// - Maximum 500 jobs (returns nil if more jobs match)
// - Requires metric backend availability
// - Bins based on metric peak values from cluster configuration
//
// Algorithm:
// 1. Query first 501 jobs to check count limit
// 2. Load metric averages for all jobs via metricdispatch
// 3. For each metric, create bins based on peak value
// 4. Iterate averages and count jobs per bin
func (r *JobRepository) runningJobsMetricStatisticsHistogram(
	ctx context.Context,
	metrics []string,
	filters []*model.JobFilter,
	bins *int,
) []*model.MetricHistoPoints {
	// Get Jobs
	jobs, err := r.QueryJobs(ctx, filters, &model.PageRequest{Page: 1, ItemsPerPage: 500 + 1}, nil)
	if err != nil {
		cclog.Errorf("Error while querying jobs for footprint: %s", err)
		return nil
	}
	if len(jobs) > 500 {
		cclog.Errorf("too many jobs matched (max: %d)", 500)
		return nil
	}

	// Get AVGs from metric repo
	avgs := make([][]schema.Float, len(metrics))
	for i := range avgs {
		avgs[i] = make([]schema.Float, 0, len(jobs))
	}

	for _, job := range jobs {
		if job.MonitoringStatus == schema.MonitoringStatusDisabled || job.MonitoringStatus == schema.MonitoringStatusArchivingFailed {
			continue
		}

		if err := metricdispatch.LoadAverages(job, metrics, avgs, ctx); err != nil {
			cclog.Errorf("Error while loading averages for histogram: %s", err)
			return nil
		}
	}

	// Iterate metrics to fill end result
	data := make([]*model.MetricHistoPoints, 0)
	for idx, metric := range metrics {
		// Get specific Peak or largest Peak
		var metricConfig *schema.MetricConfig
		var peak float64
		var unit string

		for _, f := range filters {
			if f.Cluster != nil {
				metricConfig = archive.GetMetricConfig(*f.Cluster.Eq, metric)
				peak = metricConfig.Peak
				unit = metricConfig.Unit.Prefix + metricConfig.Unit.Base
			}
		}

		if peak == 0.0 {
			for _, c := range archive.Clusters {
				for _, m := range c.MetricConfig {
					if m.Name == metric {
						if m.Peak > peak {
							peak = m.Peak
						}
						if unit == "" {
							unit = m.Unit.Prefix + m.Unit.Base
						}
					}
				}
			}
		}

		// Make and fill bins
		peakBin := int(peak) / *bins

		points := make([]*model.MetricHistoPoint, 0)
		for b := 0; b < *bins; b++ {
			count := 0
			bindex := b + 1
			bmin := peakBin * b
			bmax := peakBin * (b + 1)

			// Iterate AVG values for indexed metric and count for bins
			for _, val := range avgs[idx] {
				if int(val) >= bmin && int(val) < bmax {
					count += 1
				}
			}

			// Append Bin to Metric Result Array
			point := model.MetricHistoPoint{Bin: &bindex, Count: count, Min: &bmin, Max: &bmax}
			points = append(points, &point)
		}

		// Append Metric Result Array to final results array
		result := model.MetricHistoPoints{Metric: metric, Unit: unit, Data: points}
		data = append(data, &result)
	}

	return data
}