From 8f0bb907ff2bf75bcd576acad1f363ec280fb70d Mon Sep 17 00:00:00 2001 From: Jan Eitzinger Date: Thu, 15 Jan 2026 06:41:23 +0100 Subject: [PATCH 1/2] Improve documentation and add more tests --- internal/repository/hooks_test.go | 274 +++++++++++ internal/repository/job.go | 277 ++++++++++- internal/repository/jobCreate_test.go | 500 ++++++++++++++++++++ internal/repository/jobHooks.go | 48 +- internal/repository/migration.go | 63 +++ internal/repository/stats.go | 54 ++- internal/repository/stats_test.go | 13 +- internal/repository/tags.go | 42 +- internal/repository/transaction_test.go | 311 +++++++++++++ internal/repository/user.go | 47 ++ internal/repository/user_test.go | 596 ++++++++++++++++++++++++ internal/tagger/tagger.go | 2 +- 12 files changed, 2185 insertions(+), 42 deletions(-) create mode 100644 internal/repository/hooks_test.go create mode 100644 internal/repository/jobCreate_test.go create mode 100644 internal/repository/transaction_test.go create mode 100644 internal/repository/user_test.go diff --git a/internal/repository/hooks_test.go b/internal/repository/hooks_test.go new file mode 100644 index 00000000..52f954b5 --- /dev/null +++ b/internal/repository/hooks_test.go @@ -0,0 +1,274 @@ +// Copyright (C) NHR@FAU, University Erlangen-Nuremberg. +// All rights reserved. This file is part of cc-backend. +// Use of this source code is governed by a MIT-style +// license that can be found in the LICENSE file. 
+package repository + +import ( + "context" + "testing" + "time" + + "github.com/ClusterCockpit/cc-lib/v2/schema" + _ "github.com/mattn/go-sqlite3" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +type MockJobHook struct { + startCalled bool + stopCalled bool + startJobs []*schema.Job + stopJobs []*schema.Job +} + +func (m *MockJobHook) JobStartCallback(job *schema.Job) { + m.startCalled = true + m.startJobs = append(m.startJobs, job) +} + +func (m *MockJobHook) JobStopCallback(job *schema.Job) { + m.stopCalled = true + m.stopJobs = append(m.stopJobs, job) +} + +func TestRegisterJobHook(t *testing.T) { + t.Run("register single hook", func(t *testing.T) { + hooks = nil + mock := &MockJobHook{} + + RegisterJobHook(mock) + + assert.NotNil(t, hooks) + assert.Len(t, hooks, 1) + assert.Equal(t, mock, hooks[0]) + + hooks = nil + }) + + t.Run("register multiple hooks", func(t *testing.T) { + hooks = nil + mock1 := &MockJobHook{} + mock2 := &MockJobHook{} + + RegisterJobHook(mock1) + RegisterJobHook(mock2) + + assert.Len(t, hooks, 2) + assert.Equal(t, mock1, hooks[0]) + assert.Equal(t, mock2, hooks[1]) + + hooks = nil + }) + + t.Run("register nil hook does not add to hooks", func(t *testing.T) { + hooks = nil + RegisterJobHook(nil) + + if hooks != nil { + assert.Len(t, hooks, 0, "Nil hook should not be added") + } + + hooks = nil + }) +} + +func TestCallJobStartHooks(t *testing.T) { + t.Run("call start hooks with single job", func(t *testing.T) { + hooks = nil + mock := &MockJobHook{} + RegisterJobHook(mock) + + job := &schema.Job{ + JobID: 123, + User: "testuser", + Cluster: "testcluster", + } + + CallJobStartHooks([]*schema.Job{job}) + + assert.True(t, mock.startCalled) + assert.False(t, mock.stopCalled) + assert.Len(t, mock.startJobs, 1) + assert.Equal(t, int64(123), mock.startJobs[0].JobID) + + hooks = nil + }) + + t.Run("call start hooks with multiple jobs", func(t *testing.T) { + hooks = nil + mock := &MockJobHook{} + 
RegisterJobHook(mock) + + jobs := []*schema.Job{ + {JobID: 1, User: "user1", Cluster: "cluster1"}, + {JobID: 2, User: "user2", Cluster: "cluster2"}, + {JobID: 3, User: "user3", Cluster: "cluster3"}, + } + + CallJobStartHooks(jobs) + + assert.True(t, mock.startCalled) + assert.Len(t, mock.startJobs, 3) + assert.Equal(t, int64(1), mock.startJobs[0].JobID) + assert.Equal(t, int64(2), mock.startJobs[1].JobID) + assert.Equal(t, int64(3), mock.startJobs[2].JobID) + + hooks = nil + }) + + t.Run("call start hooks with multiple registered hooks", func(t *testing.T) { + hooks = nil + mock1 := &MockJobHook{} + mock2 := &MockJobHook{} + RegisterJobHook(mock1) + RegisterJobHook(mock2) + + job := &schema.Job{ + JobID: 456, User: "testuser", Cluster: "testcluster", + } + + CallJobStartHooks([]*schema.Job{job}) + + assert.True(t, mock1.startCalled) + assert.True(t, mock2.startCalled) + assert.Len(t, mock1.startJobs, 1) + assert.Len(t, mock2.startJobs, 1) + + hooks = nil + }) + + t.Run("call start hooks with nil hooks", func(t *testing.T) { + hooks = nil + + job := &schema.Job{ + JobID: 789, User: "testuser", Cluster: "testcluster", + } + + CallJobStartHooks([]*schema.Job{job}) + + hooks = nil + }) + + t.Run("call start hooks with empty job list", func(t *testing.T) { + hooks = nil + mock := &MockJobHook{} + RegisterJobHook(mock) + + CallJobStartHooks([]*schema.Job{}) + + assert.False(t, mock.startCalled) + assert.Len(t, mock.startJobs, 0) + + hooks = nil + }) +} + +func TestCallJobStopHooks(t *testing.T) { + t.Run("call stop hooks with single job", func(t *testing.T) { + hooks = nil + mock := &MockJobHook{} + RegisterJobHook(mock) + + job := &schema.Job{ + JobID: 123, + User: "testuser", + Cluster: "testcluster", + } + + CallJobStopHooks(job) + + assert.True(t, mock.stopCalled) + assert.False(t, mock.startCalled) + assert.Len(t, mock.stopJobs, 1) + assert.Equal(t, int64(123), mock.stopJobs[0].JobID) + + hooks = nil + }) + + t.Run("call stop hooks with multiple registered hooks", 
func(t *testing.T) { + hooks = nil + mock1 := &MockJobHook{} + mock2 := &MockJobHook{} + RegisterJobHook(mock1) + RegisterJobHook(mock2) + + job := &schema.Job{ + JobID: 456, User: "testuser", Cluster: "testcluster", + } + + CallJobStopHooks(job) + + assert.True(t, mock1.stopCalled) + assert.True(t, mock2.stopCalled) + assert.Len(t, mock1.stopJobs, 1) + assert.Len(t, mock2.stopJobs, 1) + + hooks = nil + }) + + t.Run("call stop hooks with nil hooks", func(t *testing.T) { + hooks = nil + + job := &schema.Job{ + JobID: 789, User: "testuser", Cluster: "testcluster", + } + + CallJobStopHooks(job) + + hooks = nil + }) +} + +func TestSQLHooks(t *testing.T) { + _ = setup(t) + + t.Run("hooks log queries in debug mode", func(t *testing.T) { + h := &Hooks{} + + ctx := context.Background() + query := "SELECT * FROM job WHERE job_id = ?" + args := []any{123} + + ctxWithTime, err := h.Before(ctx, query, args...) + require.NoError(t, err) + assert.NotNil(t, ctxWithTime) + + beginTime := ctxWithTime.Value("begin") + require.NotNil(t, beginTime) + _, ok := beginTime.(time.Time) + assert.True(t, ok, "Begin time should be time.Time") + + time.Sleep(10 * time.Millisecond) + + ctxAfter, err := h.After(ctxWithTime, query, args...) 
+ require.NoError(t, err) + assert.NotNil(t, ctxAfter) + }) +} + +func TestHookIntegration(t *testing.T) { + t.Run("hooks are called during job lifecycle", func(t *testing.T) { + hooks = nil + mock := &MockJobHook{} + RegisterJobHook(mock) + + job := &schema.Job{ + JobID: 999, + User: "integrationuser", + Cluster: "integrationcluster", + } + + CallJobStartHooks([]*schema.Job{job}) + assert.True(t, mock.startCalled) + assert.Equal(t, 1, len(mock.startJobs)) + + CallJobStopHooks(job) + assert.True(t, mock.stopCalled) + assert.Equal(t, 1, len(mock.stopJobs)) + + assert.Equal(t, mock.startJobs[0].JobID, mock.stopJobs[0].JobID) + + hooks = nil + }) +} diff --git a/internal/repository/job.go b/internal/repository/job.go index b1e92424..bd33774c 100644 --- a/internal/repository/job.go +++ b/internal/repository/job.go @@ -80,18 +80,33 @@ import ( ) var ( - jobRepoOnce sync.Once + // jobRepoOnce ensures singleton initialization of the JobRepository + jobRepoOnce sync.Once + // jobRepoInstance holds the single instance of JobRepository jobRepoInstance *JobRepository ) +// JobRepository provides database access for job-related operations. +// It implements the repository pattern to abstract database interactions +// and provides caching for improved performance. +// +// The repository is a singleton initialized via GetJobRepository(). +// All database queries use prepared statements via stmtCache for efficiency. +// Frequently accessed data (metadata, energy footprints) is cached in an LRU cache. 
type JobRepository struct { - DB *sqlx.DB - stmtCache *sq.StmtCache - cache *lrucache.Cache - driver string - Mutex sync.Mutex + DB *sqlx.DB // Database connection pool + stmtCache *sq.StmtCache // Prepared statement cache for query optimization + cache *lrucache.Cache // LRU cache for metadata and footprint data + driver string // Database driver name (e.g., "sqlite3") + Mutex sync.Mutex // Mutex for thread-safe operations } +// GetJobRepository returns the singleton instance of JobRepository. +// The repository is initialized lazily on first access with database connection, +// prepared statement cache, and LRU cache configured from repoConfig. +// +// This function is thread-safe and ensures only one instance is created. +// It must be called after Connect() has established a database connection. func GetJobRepository() *JobRepository { jobRepoOnce.Do(func() { db := GetConnection() @@ -107,6 +122,8 @@ func GetJobRepository() *JobRepository { return jobRepoInstance } +// jobColumns defines the standard set of columns selected from the job table. +// Used consistently across all job queries to ensure uniform data retrieval. var jobColumns []string = []string{ "job.id", "job.job_id", "job.hpc_user", "job.project", "job.cluster", "job.subcluster", "job.start_time", "job.cluster_partition", "job.array_job_id", "job.num_nodes", @@ -115,6 +132,8 @@ var jobColumns []string = []string{ "job.footprint", "job.energy", } +// jobCacheColumns defines columns from the job_cache table, mirroring jobColumns. +// Used for queries against cached job data for performance optimization. var jobCacheColumns []string = []string{ "job_cache.id", "job_cache.job_id", "job_cache.hpc_user", "job_cache.project", "job_cache.cluster", "job_cache.subcluster", "job_cache.start_time", "job_cache.cluster_partition", @@ -124,6 +143,14 @@ var jobCacheColumns []string = []string{ "job_cache.footprint", "job_cache.energy", } +// scanJob converts a database row into a schema.Job struct. 
+// It handles JSON unmarshaling of resources and footprint fields, +// and calculates accurate duration for running jobs. +// +// Parameters: +// - row: Database row implementing Scan() interface (sql.Row or sql.Rows) +// +// Returns the populated Job struct or an error if scanning or unmarshaling fails. func scanJob(row interface{ Scan(...any) error }) (*schema.Job, error) { job := &schema.Job{} @@ -186,6 +213,16 @@ func (r *JobRepository) Flush() error { return nil } +// FetchMetadata retrieves and unmarshals the metadata JSON for a job. +// Metadata is cached with a 24-hour TTL to improve performance. +// +// The metadata field stores arbitrary key-value pairs associated with a job, +// such as tags, labels, or custom attributes added by external systems. +// +// Parameters: +// - job: Job struct with valid ID field, metadata will be populated in job.MetaData +// +// Returns the metadata map or an error if the job is nil or database query fails. func (r *JobRepository) FetchMetadata(job *schema.Job) (map[string]string, error) { if job == nil { return nil, fmt.Errorf("job cannot be nil") @@ -218,6 +255,16 @@ func (r *JobRepository) FetchMetadata(job *schema.Job) (map[string]string, error return job.MetaData, nil } +// UpdateMetadata adds or updates a single metadata key-value pair for a job. +// The entire metadata map is re-marshaled and stored, and the cache is invalidated. +// Also triggers archive metadata update via archive.UpdateMetadata. +// +// Parameters: +// - job: Job struct with valid ID, existing metadata will be fetched if not present +// - key: Metadata key to set +// - val: Metadata value to set +// +// Returns an error if the job is nil, metadata fetch fails, or database update fails. 
func (r *JobRepository) UpdateMetadata(job *schema.Job, key, val string) (err error) { if job == nil { return fmt.Errorf("job cannot be nil") @@ -228,7 +275,7 @@ func (r *JobRepository) UpdateMetadata(job *schema.Job, key, val string) (err er if job.MetaData == nil { if _, err = r.FetchMetadata(job); err != nil { cclog.Warnf("Error while fetching metadata for job, DB ID '%v'", job.ID) - return err + return fmt.Errorf("failed to fetch metadata for job %d: %w", job.ID, err) } } @@ -243,7 +290,7 @@ func (r *JobRepository) UpdateMetadata(job *schema.Job, key, val string) (err er if job.RawMetaData, err = json.Marshal(job.MetaData); err != nil { cclog.Warnf("Error while marshaling metadata for job, DB ID '%v'", job.ID) - return err + return fmt.Errorf("failed to marshal metadata for job %d: %w", job.ID, err) } if _, err = sq.Update("job"). @@ -251,13 +298,23 @@ func (r *JobRepository) UpdateMetadata(job *schema.Job, key, val string) (err er Where("job.id = ?", job.ID). RunWith(r.stmtCache).Exec(); err != nil { cclog.Warnf("Error while updating metadata for job, DB ID '%v'", job.ID) - return err + return fmt.Errorf("failed to update metadata in database for job %d: %w", job.ID, err) } r.cache.Put(cachekey, job.MetaData, len(job.RawMetaData), 24*time.Hour) return archive.UpdateMetadata(job, job.MetaData) } +// FetchFootprint retrieves and unmarshals the performance footprint JSON for a job. +// Unlike FetchMetadata, footprints are NOT cached as they can be large and change frequently. +// +// The footprint contains summary statistics (avg/min/max) for monitored metrics, +// stored as JSON with keys like "cpu_load_avg", "mem_used_max", etc. +// +// Parameters: +// - job: Job struct with valid ID, footprint will be populated in job.Footprint +// +// Returns the footprint map or an error if the job is nil or database query fails. 
func (r *JobRepository) FetchFootprint(job *schema.Job) (map[string]float64, error) { if job == nil { return nil, fmt.Errorf("job cannot be nil") @@ -284,6 +341,16 @@ func (r *JobRepository) FetchFootprint(job *schema.Job) (map[string]float64, err return job.Footprint, nil } +// FetchEnergyFootprint retrieves and unmarshals the energy footprint JSON for a job. +// Energy footprints are cached with a 24-hour TTL as they are frequently accessed but rarely change. +// +// The energy footprint contains calculated energy consumption (in kWh) per metric, +// stored as JSON with keys like "power_avg", "acc_power_avg", etc. +// +// Parameters: +// - job: Job struct with valid ID, energy footprint will be populated in job.EnergyFootprint +// +// Returns the energy footprint map or an error if the job is nil or database query fails. func (r *JobRepository) FetchEnergyFootprint(job *schema.Job) (map[string]float64, error) { if job == nil { return nil, fmt.Errorf("job cannot be nil") @@ -316,6 +383,18 @@ func (r *JobRepository) FetchEnergyFootprint(job *schema.Job) (map[string]float6 return job.EnergyFootprint, nil } +// DeleteJobsBefore removes jobs older than the specified start time. +// Optionally preserves tagged jobs to protect important data from deletion. +// Cache entries for deleted jobs are automatically invalidated. +// +// This is typically used for data retention policies and cleanup operations. +// WARNING: This is a destructive operation that permanently deletes job records. +// +// Parameters: +// - startTime: Unix timestamp, jobs with start_time < this value will be deleted +// - omitTagged: If true, skip jobs that have associated tags (jobtag entries) +// +// Returns the count of deleted jobs or an error if the operation fails. 
func (r *JobRepository) DeleteJobsBefore(startTime int64, omitTagged bool) (int, error) { var cnt int q := sq.Select("count(*)").From("job").Where("job.start_time < ?", startTime) @@ -371,6 +450,13 @@ func (r *JobRepository) DeleteJobsBefore(startTime int64, omitTagged bool) (int, return cnt, err } +// DeleteJobByID permanently removes a single job by its database ID. +// Cache entries for the deleted job are automatically invalidated. +// +// Parameters: +// - id: Database ID (primary key) of the job to delete +// +// Returns an error if the deletion fails. func (r *JobRepository) DeleteJobByID(id int64) error { // Invalidate cache entries before deletion r.cache.Del(fmt.Sprintf("metadata:%d", id)) @@ -388,6 +474,24 @@ func (r *JobRepository) DeleteJobByID(id int64) error { return err } +// FindUserOrProjectOrJobname attempts to interpret a search term as a job ID, +// username, project ID, or job name by querying the database. +// +// Search logic (in priority order): +// 1. If searchterm is numeric, treat as job ID (returned immediately) +// 2. Try exact match in job.hpc_user column (username) +// 3. Try LIKE match in hpc_user.name column (real name) +// 4. Try exact match in job.project column (project ID) +// 5. If no matches, return searchterm as jobname for GraphQL query +// +// This powers the searchbar functionality for flexible job searching. +// Requires authenticated user for database lookups (returns empty if user is nil). +// +// Parameters: +// - user: Authenticated user context, required for database access +// - searchterm: Search string to interpret +// +// Returns up to one non-empty value among (jobid, username, project, jobname). 
func (r *JobRepository) FindUserOrProjectOrJobname(user *schema.User, searchterm string) (jobid string, username string, project string, jobname string) { if searchterm == "" { return "", "", "", "" @@ -423,6 +527,19 @@ var ( ErrForbidden = errors.New("not authorized") ) +// FindColumnValue performs a generic column lookup in a database table with role-based access control. +// Only users with admin, support, or manager roles can execute this query. +// +// Parameters: +// - user: User context for authorization check +// - searchterm: Value to search for (exact match or LIKE pattern) +// - table: Database table name to query +// - selectColumn: Column name to return in results +// - whereColumn: Column name to filter on +// - isLike: If true, use LIKE with wildcards; if false, use exact equality +// +// Returns the first matching value, ErrForbidden if user lacks permission, +// or ErrNotFound if no matches are found. func (r *JobRepository) FindColumnValue(user *schema.User, searchterm string, table string, selectColumn string, whereColumn string, isLike bool) (result string, err error) { if user == nil { return "", fmt.Errorf("user cannot be nil") @@ -453,6 +570,19 @@ func (r *JobRepository) FindColumnValue(user *schema.User, searchterm string, ta } } +// FindColumnValues performs a generic column lookup returning multiple matches with role-based access control. +// Similar to FindColumnValue but returns all matching values instead of just the first. +// Only users with admin, support, or manager roles can execute this query. +// +// Parameters: +// - user: User context for authorization check +// - query: Search pattern (always uses LIKE with wildcards) +// - table: Database table name to query +// - selectColumn: Column name to return in results +// - whereColumn: Column name to filter on +// +// Returns a slice of matching values, ErrForbidden if user lacks permission, +// or ErrNotFound if no matches are found. 
func (r *JobRepository) FindColumnValues(user *schema.User, query string, table string, selectColumn string, whereColumn string) (results []string, err error) { if user == nil { return nil, fmt.Errorf("user cannot be nil") @@ -487,6 +617,13 @@ func (r *JobRepository) FindColumnValues(user *schema.User, query string, table } } +// Partitions returns a list of distinct cluster partitions for a given cluster. +// Results are cached with a 1-hour TTL to improve performance. +// +// Parameters: +// - cluster: Cluster name to query partitions for +// +// Returns a slice of partition names or an error if the database query fails. func (r *JobRepository) Partitions(cluster string) ([]string, error) { var err error start := time.Now() @@ -550,6 +687,19 @@ func (r *JobRepository) AllocatedNodes(cluster string) (map[string]map[string]in } // FIXME: Set duration to requested walltime? +// StopJobsExceedingWalltimeBy marks running jobs as failed if they exceed their walltime limit. +// This is typically called periodically to clean up stuck or orphaned jobs. +// +// Jobs are marked with: +// - monitoring_status: MonitoringStatusArchivingFailed +// - duration: 0 +// - job_state: JobStateFailed +// +// Parameters: +// - seconds: Grace period beyond walltime before marking as failed +// +// Returns an error if the database update fails. +// Logs the number of jobs marked as failed if any were affected. func (r *JobRepository) StopJobsExceedingWalltimeBy(seconds int) error { start := time.Now() currentTime := time.Now().Unix() @@ -579,6 +729,12 @@ func (r *JobRepository) StopJobsExceedingWalltimeBy(seconds int) error { return nil } +// FindJobIdsByTag returns all job database IDs associated with a specific tag. +// +// Parameters: +// - tagID: Database ID of the tag to search for +// +// Returns a slice of job IDs or an error if the query fails. func (r *JobRepository) FindJobIdsByTag(tagID int64) ([]int64, error) { query := sq.Select("job.id").From("job"). 
Join("jobtag ON jobtag.job_id = job.id"). @@ -607,6 +763,13 @@ func (r *JobRepository) FindJobIdsByTag(tagID int64) ([]int64, error) { } // FIXME: Reconsider filtering short jobs with harcoded threshold +// FindRunningJobs returns all currently running jobs for a specific cluster. +// Filters out short-running jobs based on repoConfig.MinRunningJobDuration threshold. +// +// Parameters: +// - cluster: Cluster name to filter jobs +// +// Returns a slice of running job objects or an error if the query fails. func (r *JobRepository) FindRunningJobs(cluster string) ([]*schema.Job, error) { query := sq.Select(jobColumns...).From("job"). Where("job.cluster = ?", cluster). @@ -634,6 +797,12 @@ func (r *JobRepository) FindRunningJobs(cluster string) ([]*schema.Job, error) { return jobs, nil } +// UpdateDuration recalculates and updates the duration field for all running jobs. +// Called periodically to keep job durations current without querying individual jobs. +// +// Duration is calculated as: current_time - job.start_time +// +// Returns an error if the database update fails. func (r *JobRepository) UpdateDuration() error { stmnt := sq.Update("job"). Set("duration", sq.Expr("? - job.start_time", time.Now().Unix())). @@ -648,6 +817,16 @@ func (r *JobRepository) UpdateDuration() error { return nil } +// FindJobsBetween returns jobs within a specified time range. +// If startTimeBegin is 0, returns all jobs before startTimeEnd. +// Optionally excludes tagged jobs from results. +// +// Parameters: +// - startTimeBegin: Unix timestamp for range start (use 0 for unbounded start) +// - startTimeEnd: Unix timestamp for range end +// - omitTagged: If true, exclude jobs with associated tags +// +// Returns a slice of jobs or an error if the time range is invalid or query fails. 
func (r *JobRepository) FindJobsBetween(startTimeBegin int64, startTimeEnd int64, omitTagged bool) ([]*schema.Job, error) { var query sq.SelectBuilder @@ -688,6 +867,14 @@ func (r *JobRepository) FindJobsBetween(startTimeBegin int64, startTimeEnd int64 return jobs, nil } +// UpdateMonitoringStatus updates the monitoring status for a job and invalidates its cache entries. +// Cache invalidation affects both metadata and energy footprint to ensure consistency. +// +// Parameters: +// - job: Database ID of the job to update +// - monitoringStatus: New monitoring status value (see schema.MonitoringStatus constants) +// +// Returns an error if the database update fails. func (r *JobRepository) UpdateMonitoringStatus(job int64, monitoringStatus int32) (err error) { // Invalidate cache entries as monitoring status affects job state r.cache.Del(fmt.Sprintf("metadata:%d", job)) @@ -704,6 +891,13 @@ func (r *JobRepository) UpdateMonitoringStatus(job int64, monitoringStatus int32 return nil } +// Execute runs a Squirrel UpdateBuilder statement against the database. +// This is a generic helper for executing pre-built update queries. +// +// Parameters: +// - stmt: Squirrel UpdateBuilder with prepared update query +// +// Returns an error if the execution fails. func (r *JobRepository) Execute(stmt sq.UpdateBuilder) error { if _, err := stmt.RunWith(r.stmtCache).Exec(); err != nil { cclog.Errorf("Error while executing statement: %v", err) @@ -713,6 +907,14 @@ func (r *JobRepository) Execute(stmt sq.UpdateBuilder) error { return nil } +// MarkArchived adds monitoring status update to an existing UpdateBuilder statement. +// This is a builder helper used when constructing multi-field update queries. +// +// Parameters: +// - stmt: Existing UpdateBuilder to modify +// - monitoringStatus: Monitoring status value to set +// +// Returns the modified UpdateBuilder for method chaining. 
func (r *JobRepository) MarkArchived( stmt sq.UpdateBuilder, monitoringStatus int32, @@ -720,11 +922,22 @@ func (r *JobRepository) MarkArchived( return stmt.Set("monitoring_status", monitoringStatus) } +// UpdateEnergy calculates and updates the energy consumption for a job. +// This is called for running jobs during intermediate updates or when archiving. +// +// Energy calculation formula: +// - For "power" metrics: Energy (kWh) = (Power_avg * NumNodes * Duration_hours) / 1000 +// - For "energy" metrics: Currently not implemented (would need sum statistics) +// +// The calculation accounts for: +// - Multi-node jobs: Multiplies by NumNodes to get total cluster energy +// - Shared jobs: Node average is already based on partial resources, so NumNodes=1 +// - Unit conversion: Watts * hours / 1000 = kilowatt-hours (kWh) +// - Rounding: Results rounded to 2 decimal places func (r *JobRepository) UpdateEnergy( stmt sq.UpdateBuilder, jobMeta *schema.Job, ) (sq.UpdateBuilder, error) { - /* Note: Only Called for Running Jobs during Intermediate Update or on Archiving */ sc, err := archive.GetSubCluster(jobMeta.Cluster, jobMeta.SubCluster) if err != nil { cclog.Errorf("cannot get subcluster: %s", err.Error()) @@ -732,25 +945,27 @@ func (r *JobRepository) UpdateEnergy( } energyFootprint := make(map[string]float64) - // Total Job Energy Outside Loop + // Accumulate total energy across all energy-related metrics totalEnergy := 0.0 for _, fp := range sc.EnergyFootprint { - // Always Init Metric Energy Inside Loop + // Calculate energy for this specific metric metricEnergy := 0.0 if i, err := archive.MetricIndex(sc.MetricConfig, fp); err == nil { - // Note: For DB data, calculate and save as kWh switch sc.MetricConfig[i].Energy { - case "energy": // this metric has energy as unit (Joules or Wh) + case "energy": // Metric already in energy units (Joules or Wh) cclog.Warnf("Update EnergyFootprint for Job %d and Metric %s on cluster %s: Set to 'energy' in cluster.json: Not 
implemented, will return 0.0", jobMeta.JobID, jobMeta.Cluster, fp) - // FIXME: Needs sum as stats type - case "power": // this metric has power as unit (Watt) - // Energy: Power (in Watts) * Time (in Seconds) - // Unit: (W * (s / 3600)) / 1000 = kWh - // Round 2 Digits: round(Energy * 100) / 100 - // Here: (All-Node Metric Average * Number of Nodes) * (Job Duration in Seconds / 3600) / 1000 - // Note: Shared Jobs handled correctly since "Node Average" is based on partial resources, while "numNodes" factor is 1 + // FIXME: Needs sum as stats type to accumulate energy values over time + case "power": // Metric in power units (Watts) + // Energy (kWh) = Power (W) × Time (h) / 1000 + // Formula: (avg_power_per_node * num_nodes) * (duration_sec / 3600) / 1000 + // + // Breakdown: + // LoadJobStat(jobMeta, fp, "avg") = average power per node (W) + // jobMeta.NumNodes = number of nodes (1 for shared jobs) + // jobMeta.Duration / 3600.0 = duration in hours + // / 1000.0 = convert Wh to kWh rawEnergy := ((LoadJobStat(jobMeta, fp, "avg") * float64(jobMeta.NumNodes)) * (float64(jobMeta.Duration) / 3600.0)) / 1000.0 - metricEnergy = math.Round(rawEnergy*100.0) / 100.0 + metricEnergy = math.Round(rawEnergy*100.0) / 100.0 // Round to 2 decimal places } } else { cclog.Warnf("Error while collecting energy metric %s for job, DB ID '%v', return '0.0'", fp, jobMeta.ID) @@ -758,8 +973,6 @@ func (r *JobRepository) UpdateEnergy( energyFootprint[fp] = metricEnergy totalEnergy += metricEnergy - - // cclog.Infof("Metric %s Average %f -> %f kWh | Job %d Total -> %f kWh", fp, LoadJobStat(jobMeta, fp, "avg"), energy, jobMeta.JobID, totalEnergy) } var rawFootprint []byte @@ -771,11 +984,19 @@ func (r *JobRepository) UpdateEnergy( return stmt.Set("energy_footprint", string(rawFootprint)).Set("energy", (math.Round(totalEnergy*100.0) / 100.0)), nil } +// UpdateFootprint calculates and updates the performance footprint for a job. 
+// This is called for running jobs during intermediate updates or when archiving. +// +// A footprint is a summary statistic (avg/min/max) for each monitored metric. +// The specific statistic type is defined in the cluster config's Footprint field. +// Results are stored as JSON with keys like "metric_avg", "metric_max", etc. +// +// Example: For a "cpu_load" metric with Footprint="avg", this stores +// the average CPU load across all nodes as "cpu_load_avg": 85.3 func (r *JobRepository) UpdateFootprint( stmt sq.UpdateBuilder, jobMeta *schema.Job, ) (sq.UpdateBuilder, error) { - /* Note: Only Called for Running Jobs during Intermediate Update or on Archiving */ sc, err := archive.GetSubCluster(jobMeta.Cluster, jobMeta.SubCluster) if err != nil { cclog.Errorf("cannot get subcluster: %s", err.Error()) @@ -783,7 +1004,10 @@ func (r *JobRepository) UpdateFootprint( } footprint := make(map[string]float64) + // Build footprint map with metric_stattype as keys for _, fp := range sc.Footprint { + // Determine which statistic to use: avg, min, or max + // First check global metric config, then cluster-specific config var statType string for _, gm := range archive.GlobalMetricList { if gm.Name == fp { @@ -791,15 +1015,18 @@ func (r *JobRepository) UpdateFootprint( } } + // Validate statistic type if statType != "avg" && statType != "min" && statType != "max" { cclog.Warnf("unknown statType for footprint update: %s", statType) return stmt, fmt.Errorf("unknown statType for footprint update: %s", statType) } + // Override with cluster-specific config if available if i, err := archive.MetricIndex(sc.MetricConfig, fp); err != nil { statType = sc.MetricConfig[i].Footprint } + // Store as "metric_stattype": value (e.g., "cpu_load_avg": 85.3) name := fmt.Sprintf("%s_%s", fp, statType) footprint[name] = LoadJobStat(jobMeta, fp, statType) } diff --git a/internal/repository/jobCreate_test.go b/internal/repository/jobCreate_test.go new file mode 100644 index 00000000..3a586482 --- 
/dev/null +++ b/internal/repository/jobCreate_test.go @@ -0,0 +1,500 @@ +// Copyright (C) NHR@FAU, University Erlangen-Nuremberg. +// All rights reserved. This file is part of cc-backend. +// Use of this source code is governed by a MIT-style +// license that can be found in the LICENSE file. +package repository + +import ( + "encoding/json" + "testing" + + "github.com/ClusterCockpit/cc-lib/v2/schema" + _ "github.com/mattn/go-sqlite3" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +// createTestJob creates a minimal valid job for testing +func createTestJob(jobID int64, cluster string) *schema.Job { + return &schema.Job{ + JobID: jobID, + User: "testuser", + Project: "testproject", + Cluster: cluster, + SubCluster: "main", + Partition: "batch", + NumNodes: 1, + NumHWThreads: 4, + NumAcc: 0, + Shared: "none", + MonitoringStatus: schema.MonitoringStatusRunningOrArchiving, + SMT: 1, + State: schema.JobStateRunning, + StartTime: 1234567890, + Duration: 0, + Walltime: 3600, + Resources: []*schema.Resource{ + { + Hostname: "node01", + HWThreads: []int{0, 1, 2, 3}, + }, + }, + Footprint: map[string]float64{ + "cpu_load": 50.0, + "mem_used": 8000.0, + "flops_any": 0.5, + "mem_bw": 10.0, + "net_bw": 2.0, + "file_bw": 1.0, + "cpu_used": 2.0, + "cpu_load_core": 12.5, + }, + MetaData: map[string]string{ + "jobName": "test_job", + "queue": "normal", + "qosName": "default", + "accountName": "testaccount", + }, + } +} + +func TestInsertJob(t *testing.T) { + r := setup(t) + + t.Run("successful insertion", func(t *testing.T) { + job := createTestJob(999001, "testcluster") + job.RawResources, _ = json.Marshal(job.Resources) + job.RawFootprint, _ = json.Marshal(job.Footprint) + job.RawMetaData, _ = json.Marshal(job.MetaData) + + id, err := r.InsertJob(job) + require.NoError(t, err, "InsertJob should succeed") + assert.Greater(t, id, int64(0), "Should return valid insert ID") + + // Verify job was inserted into job_cache + var count int + err = 
r.DB.QueryRow("SELECT COUNT(*) FROM job_cache WHERE job_id = ? AND cluster = ?", + job.JobID, job.Cluster).Scan(&count) + require.NoError(t, err) + assert.Equal(t, 1, count, "Job should be in job_cache table") + + // Clean up + _, err = r.DB.Exec("DELETE FROM job_cache WHERE job_id = ? AND cluster = ?", job.JobID, job.Cluster) + require.NoError(t, err) + }) + + t.Run("insertion with all fields", func(t *testing.T) { + job := createTestJob(999002, "testcluster") + job.ArrayJobID = 5000 + job.Energy = 1500.5 + job.RawResources, _ = json.Marshal(job.Resources) + job.RawFootprint, _ = json.Marshal(job.Footprint) + job.RawMetaData, _ = json.Marshal(job.MetaData) + + id, err := r.InsertJob(job) + require.NoError(t, err) + assert.Greater(t, id, int64(0)) + + // Verify all fields were stored correctly + var retrievedJob schema.Job + err = r.DB.QueryRow(`SELECT job_id, hpc_user, project, cluster, array_job_id, energy + FROM job_cache WHERE id = ?`, id).Scan( + &retrievedJob.JobID, &retrievedJob.User, &retrievedJob.Project, + &retrievedJob.Cluster, &retrievedJob.ArrayJobID, &retrievedJob.Energy) + require.NoError(t, err) + assert.Equal(t, job.JobID, retrievedJob.JobID) + assert.Equal(t, job.User, retrievedJob.User) + assert.Equal(t, job.Project, retrievedJob.Project) + assert.Equal(t, job.Cluster, retrievedJob.Cluster) + assert.Equal(t, job.ArrayJobID, retrievedJob.ArrayJobID) + assert.Equal(t, job.Energy, retrievedJob.Energy) + + // Clean up + _, err = r.DB.Exec("DELETE FROM job_cache WHERE id = ?", id) + require.NoError(t, err) + }) +} + +func TestStart(t *testing.T) { + r := setup(t) + + t.Run("successful job start with JSON encoding", func(t *testing.T) { + job := createTestJob(999003, "testcluster") + + id, err := r.Start(job) + require.NoError(t, err, "Start should succeed") + assert.Greater(t, id, int64(0), "Should return valid insert ID") + + // Verify job was inserted and JSON fields were encoded + var rawResources, rawFootprint, rawMetaData []byte + err = 
r.DB.QueryRow(`SELECT resources, footprint, meta_data FROM job_cache WHERE id = ?`, id).Scan( + &rawResources, &rawFootprint, &rawMetaData) + require.NoError(t, err) + + // Verify resources JSON + var resources []*schema.Resource + err = json.Unmarshal(rawResources, &resources) + require.NoError(t, err, "Resources should be valid JSON") + assert.Equal(t, 1, len(resources)) + assert.Equal(t, "node01", resources[0].Hostname) + + // Verify footprint JSON + var footprint map[string]float64 + err = json.Unmarshal(rawFootprint, &footprint) + require.NoError(t, err, "Footprint should be valid JSON") + assert.Equal(t, 50.0, footprint["cpu_load"]) + assert.Equal(t, 8000.0, footprint["mem_used"]) + + // Verify metadata JSON + var metaData map[string]string + err = json.Unmarshal(rawMetaData, &metaData) + require.NoError(t, err, "MetaData should be valid JSON") + assert.Equal(t, "test_job", metaData["jobName"]) + + // Clean up + _, err = r.DB.Exec("DELETE FROM job_cache WHERE id = ?", id) + require.NoError(t, err) + }) + + t.Run("job start with empty footprint", func(t *testing.T) { + job := createTestJob(999004, "testcluster") + job.Footprint = map[string]float64{} + + id, err := r.Start(job) + require.NoError(t, err) + assert.Greater(t, id, int64(0)) + + // Verify empty footprint was encoded as empty JSON object + var rawFootprint []byte + err = r.DB.QueryRow(`SELECT footprint FROM job_cache WHERE id = ?`, id).Scan(&rawFootprint) + require.NoError(t, err) + assert.Equal(t, []byte("{}"), rawFootprint) + + // Clean up + _, err = r.DB.Exec("DELETE FROM job_cache WHERE id = ?", id) + require.NoError(t, err) + }) + + t.Run("job start with nil metadata", func(t *testing.T) { + job := createTestJob(999005, "testcluster") + job.MetaData = nil + + id, err := r.Start(job) + require.NoError(t, err) + assert.Greater(t, id, int64(0)) + + // Clean up + _, err = r.DB.Exec("DELETE FROM job_cache WHERE id = ?", id) + require.NoError(t, err) + }) +} + +func TestStop(t *testing.T) { + r := 
setup(t) + + t.Run("successful job stop", func(t *testing.T) { + // First insert a job using Start + job := createTestJob(999106, "testcluster") + id, err := r.Start(job) + require.NoError(t, err) + + // Move from job_cache to job table (simulate SyncJobs) - exclude id to let it auto-increment + _, err = r.DB.Exec(`INSERT INTO job (job_id, cluster, subcluster, submit_time, start_time, hpc_user, project, + cluster_partition, array_job_id, duration, walltime, job_state, meta_data, resources, num_nodes, + num_hwthreads, num_acc, smt, shared, monitoring_status, energy, energy_footprint, footprint) + SELECT job_id, cluster, subcluster, submit_time, start_time, hpc_user, project, + cluster_partition, array_job_id, duration, walltime, job_state, meta_data, resources, num_nodes, + num_hwthreads, num_acc, smt, shared, monitoring_status, energy, energy_footprint, footprint + FROM job_cache WHERE id = ?`, id) + require.NoError(t, err) + _, err = r.DB.Exec("DELETE FROM job_cache WHERE id = ?", id) + require.NoError(t, err) + + // Get the new job id in the job table + err = r.DB.QueryRow("SELECT id FROM job WHERE job_id = ? AND cluster = ? 
AND start_time = ?", + job.JobID, job.Cluster, job.StartTime).Scan(&id) + require.NoError(t, err) + + // Stop the job + duration := int32(3600) + state := schema.JobStateCompleted + monitoringStatus := int32(schema.MonitoringStatusArchivingSuccessful) + + err = r.Stop(id, duration, state, monitoringStatus) + require.NoError(t, err, "Stop should succeed") + + // Verify job was updated + var retrievedDuration int32 + var retrievedState string + var retrievedMonStatus int32 + err = r.DB.QueryRow(`SELECT duration, job_state, monitoring_status FROM job WHERE id = ?`, id).Scan( + &retrievedDuration, &retrievedState, &retrievedMonStatus) + require.NoError(t, err) + assert.Equal(t, duration, retrievedDuration) + assert.Equal(t, string(state), retrievedState) + assert.Equal(t, monitoringStatus, retrievedMonStatus) + + // Clean up + _, err = r.DB.Exec("DELETE FROM job WHERE id = ?", id) + require.NoError(t, err) + }) + + t.Run("stop updates job state transitions", func(t *testing.T) { + // Insert a job + job := createTestJob(999107, "testcluster") + id, err := r.Start(job) + require.NoError(t, err) + + // Move to job table + _, err = r.DB.Exec(`INSERT INTO job (job_id, cluster, subcluster, submit_time, start_time, hpc_user, project, + cluster_partition, array_job_id, duration, walltime, job_state, meta_data, resources, num_nodes, + num_hwthreads, num_acc, smt, shared, monitoring_status, energy, energy_footprint, footprint) + SELECT job_id, cluster, subcluster, submit_time, start_time, hpc_user, project, + cluster_partition, array_job_id, duration, walltime, job_state, meta_data, resources, num_nodes, + num_hwthreads, num_acc, smt, shared, monitoring_status, energy, energy_footprint, footprint + FROM job_cache WHERE id = ?`, id) + require.NoError(t, err) + _, err = r.DB.Exec("DELETE FROM job_cache WHERE id = ?", id) + require.NoError(t, err) + + // Get the new job id in the job table + err = r.DB.QueryRow("SELECT id FROM job WHERE job_id = ? AND cluster = ? 
AND start_time = ?", + job.JobID, job.Cluster, job.StartTime).Scan(&id) + require.NoError(t, err) + + // Stop the job with different duration + err = r.Stop(id, 7200, schema.JobStateCompleted, int32(schema.MonitoringStatusArchivingSuccessful)) + require.NoError(t, err) + + // Verify the duration was updated correctly + var duration int32 + err = r.DB.QueryRow(`SELECT duration FROM job WHERE id = ?`, id).Scan(&duration) + require.NoError(t, err) + assert.Equal(t, int32(7200), duration, "Duration should be updated to 7200") + + // Clean up + _, err = r.DB.Exec("DELETE FROM job WHERE id = ?", id) + require.NoError(t, err) + }) + + t.Run("stop with different states", func(t *testing.T) { + testCases := []struct { + name string + jobID int64 + state schema.JobState + monitoringStatus int32 + }{ + {"completed", 999108, schema.JobStateCompleted, int32(schema.MonitoringStatusArchivingSuccessful)}, + {"failed", 999118, schema.JobStateFailed, int32(schema.MonitoringStatusArchivingSuccessful)}, + {"cancelled", 999119, schema.JobStateCancelled, int32(schema.MonitoringStatusArchivingSuccessful)}, + {"timeout", 999120, schema.JobStateTimeout, int32(schema.MonitoringStatusArchivingSuccessful)}, + } + + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { + job := createTestJob(tc.jobID, "testcluster") + id, err := r.Start(job) + require.NoError(t, err) + + // Move to job table + _, err = r.DB.Exec(`INSERT INTO job (job_id, cluster, subcluster, submit_time, start_time, hpc_user, project, + cluster_partition, array_job_id, duration, walltime, job_state, meta_data, resources, num_nodes, + num_hwthreads, num_acc, smt, shared, monitoring_status, energy, energy_footprint, footprint) + SELECT job_id, cluster, subcluster, submit_time, start_time, hpc_user, project, + cluster_partition, array_job_id, duration, walltime, job_state, meta_data, resources, num_nodes, + num_hwthreads, num_acc, smt, shared, monitoring_status, energy, energy_footprint, footprint + FROM job_cache 
WHERE id = ?`, id) + require.NoError(t, err) + _, err = r.DB.Exec("DELETE FROM job_cache WHERE id = ?", id) + require.NoError(t, err) + + // Get the new job id in the job table + err = r.DB.QueryRow("SELECT id FROM job WHERE job_id = ? AND cluster = ? AND start_time = ?", + job.JobID, job.Cluster, job.StartTime).Scan(&id) + require.NoError(t, err) + + // Stop with specific state + err = r.Stop(id, 1800, tc.state, tc.monitoringStatus) + require.NoError(t, err) + + // Verify state was set correctly + var retrievedState string + err = r.DB.QueryRow(`SELECT job_state FROM job WHERE id = ?`, id).Scan(&retrievedState) + require.NoError(t, err) + assert.Equal(t, string(tc.state), retrievedState) + + // Clean up + _, err = r.DB.Exec("DELETE FROM job WHERE id = ?", id) + require.NoError(t, err) + }) + } + }) +} + +func TestStopCached(t *testing.T) { + r := setup(t) + + t.Run("successful stop cached job", func(t *testing.T) { + // Insert a job in job_cache + job := createTestJob(999009, "testcluster") + id, err := r.Start(job) + require.NoError(t, err) + + // Stop the cached job + duration := int32(3600) + state := schema.JobStateCompleted + monitoringStatus := int32(schema.MonitoringStatusArchivingSuccessful) + + err = r.StopCached(id, duration, state, monitoringStatus) + require.NoError(t, err, "StopCached should succeed") + + // Verify job was updated in job_cache table + var retrievedDuration int32 + var retrievedState string + var retrievedMonStatus int32 + err = r.DB.QueryRow(`SELECT duration, job_state, monitoring_status FROM job_cache WHERE id = ?`, id).Scan( + &retrievedDuration, &retrievedState, &retrievedMonStatus) + require.NoError(t, err) + assert.Equal(t, duration, retrievedDuration) + assert.Equal(t, string(state), retrievedState) + assert.Equal(t, monitoringStatus, retrievedMonStatus) + + // Clean up + _, err = r.DB.Exec("DELETE FROM job_cache WHERE id = ?", id) + require.NoError(t, err) + }) + + t.Run("stop cached job does not affect job table", func(t 
*testing.T) { + // Insert a job in job_cache + job := createTestJob(999010, "testcluster") + id, err := r.Start(job) + require.NoError(t, err) + + // Stop the cached job + err = r.StopCached(id, 3600, schema.JobStateCompleted, int32(schema.MonitoringStatusArchivingSuccessful)) + require.NoError(t, err) + + // Verify job table was not affected + var count int + err = r.DB.QueryRow(`SELECT COUNT(*) FROM job WHERE job_id = ? AND cluster = ?`, + job.JobID, job.Cluster).Scan(&count) + require.NoError(t, err) + assert.Equal(t, 0, count, "Job table should not be affected by StopCached") + + // Clean up + _, err = r.DB.Exec("DELETE FROM job_cache WHERE id = ?", id) + require.NoError(t, err) + }) +} + +func TestSyncJobs(t *testing.T) { + r := setup(t) + + t.Run("sync jobs from cache to main table", func(t *testing.T) { + // Ensure cache is empty first + _, err := r.DB.Exec("DELETE FROM job_cache") + require.NoError(t, err) + + // Insert multiple jobs in job_cache + job1 := createTestJob(999011, "testcluster") + job2 := createTestJob(999012, "testcluster") + job3 := createTestJob(999013, "testcluster") + + _, err = r.Start(job1) + require.NoError(t, err) + _, err = r.Start(job2) + require.NoError(t, err) + _, err = r.Start(job3) + require.NoError(t, err) + + // Verify jobs are in job_cache + var cacheCount int + err = r.DB.QueryRow("SELECT COUNT(*) FROM job_cache WHERE job_id IN (?, ?, ?)", + job1.JobID, job2.JobID, job3.JobID).Scan(&cacheCount) + require.NoError(t, err) + assert.Equal(t, 3, cacheCount, "All jobs should be in job_cache") + + // Sync jobs + jobs, err := r.SyncJobs() + require.NoError(t, err, "SyncJobs should succeed") + assert.Equal(t, 3, len(jobs), "Should return 3 synced jobs") + + // Verify jobs were moved to job table + var jobCount int + err = r.DB.QueryRow("SELECT COUNT(*) FROM job WHERE job_id IN (?, ?, ?)", + job1.JobID, job2.JobID, job3.JobID).Scan(&jobCount) + require.NoError(t, err) + assert.Equal(t, 3, jobCount, "All jobs should be in job table") 
+ + // Verify job_cache was cleared + err = r.DB.QueryRow("SELECT COUNT(*) FROM job_cache WHERE job_id IN (?, ?, ?)", + job1.JobID, job2.JobID, job3.JobID).Scan(&cacheCount) + require.NoError(t, err) + assert.Equal(t, 0, cacheCount, "job_cache should be empty after sync") + + // Clean up + _, err = r.DB.Exec("DELETE FROM job WHERE job_id IN (?, ?, ?)", job1.JobID, job2.JobID, job3.JobID) + require.NoError(t, err) + }) + + t.Run("sync preserves job data", func(t *testing.T) { + // Ensure cache is empty first + _, err := r.DB.Exec("DELETE FROM job_cache") + require.NoError(t, err) + + // Insert a job with specific data + job := createTestJob(999014, "testcluster") + job.ArrayJobID = 7777 + job.Energy = 2500.75 + job.Duration = 1800 + + id, err := r.Start(job) + require.NoError(t, err) + + // Update some fields to simulate job progress + result, err := r.DB.Exec(`UPDATE job_cache SET duration = ?, energy = ? WHERE id = ?`, + 3600, 3000.5, id) + require.NoError(t, err) + rowsAffected, _ := result.RowsAffected() + require.Equal(t, int64(1), rowsAffected, "UPDATE should affect exactly 1 row") + + // Verify the update worked + var checkDuration int32 + var checkEnergy float64 + err = r.DB.QueryRow(`SELECT duration, energy FROM job_cache WHERE id = ?`, id).Scan(&checkDuration, &checkEnergy) + require.NoError(t, err) + require.Equal(t, int32(3600), checkDuration, "Duration should be updated to 3600 before sync") + require.Equal(t, 3000.5, checkEnergy, "Energy should be updated to 3000.5 before sync") + + // Sync jobs + jobs, err := r.SyncJobs() + require.NoError(t, err) + require.Equal(t, 1, len(jobs), "Should return exactly 1 synced job") + + // Verify in database + var dbJob schema.Job + err = r.DB.QueryRow(`SELECT job_id, hpc_user, project, cluster, array_job_id, duration, energy + FROM job WHERE job_id = ? 
AND cluster = ?`, job.JobID, job.Cluster).Scan( + &dbJob.JobID, &dbJob.User, &dbJob.Project, &dbJob.Cluster, + &dbJob.ArrayJobID, &dbJob.Duration, &dbJob.Energy) + require.NoError(t, err) + assert.Equal(t, job.JobID, dbJob.JobID) + assert.Equal(t, int32(3600), dbJob.Duration) + assert.Equal(t, 3000.5, dbJob.Energy) + + // Clean up + _, err = r.DB.Exec("DELETE FROM job WHERE job_id = ? AND cluster = ?", job.JobID, job.Cluster) + require.NoError(t, err) + }) + + t.Run("sync with empty cache returns empty list", func(t *testing.T) { + // Ensure cache is empty + _, err := r.DB.Exec("DELETE FROM job_cache") + require.NoError(t, err) + + // Sync should return empty list + jobs, err := r.SyncJobs() + require.NoError(t, err) + assert.Equal(t, 0, len(jobs), "Should return empty list when cache is empty") + }) +} diff --git a/internal/repository/jobHooks.go b/internal/repository/jobHooks.go index c449d308..41684d5c 100644 --- a/internal/repository/jobHooks.go +++ b/internal/repository/jobHooks.go @@ -10,8 +10,36 @@ import ( "github.com/ClusterCockpit/cc-lib/v2/schema" ) +// JobHook interface allows external components to hook into job lifecycle events. +// Implementations can perform actions when jobs start or stop, such as tagging, +// logging, notifications, or triggering external workflows. +// +// Example implementation: +// +// type MyJobTagger struct{} +// +// func (t *MyJobTagger) JobStartCallback(job *schema.Job) { +// if job.NumNodes > 100 { +// // Tag large jobs automatically +// } +// } +// +// func (t *MyJobTagger) JobStopCallback(job *schema.Job) { +// if job.State == schema.JobStateFailed { +// // Log or alert on failed jobs +// } +// } +// +// Register hooks during application initialization: +// +// repository.RegisterJobHook(&MyJobTagger{}) type JobHook interface { + // JobStartCallback is invoked when one or more jobs start. + // This is called synchronously, so implementations should be fast. 
JobStartCallback(job *schema.Job) + + // JobStopCallback is invoked when a job completes. + // This is called synchronously, so implementations should be fast. JobStopCallback(job *schema.Job) } @@ -20,7 +48,13 @@ var ( hooks []JobHook ) -func RegisterJobJook(hook JobHook) { +// RegisterJobHook registers a JobHook to receive job lifecycle callbacks. +// Multiple hooks can be registered and will be called in registration order. +// This function is safe to call multiple times and is typically called during +// application initialization. +// +// Nil hooks are silently ignored to simplify conditional registration. +func RegisterJobHook(hook JobHook) { initOnce.Do(func() { hooks = make([]JobHook, 0) }) @@ -30,6 +64,12 @@ func RegisterJobJook(hook JobHook) { } } +// CallJobStartHooks invokes all registered JobHook.JobStartCallback methods +// for each job in the provided slice. This is called internally by the repository +// when jobs are started (e.g., via StartJob or batch job imports). +// +// Hooks are called synchronously in registration order. If a hook panics, +// the panic will propagate to the caller. func CallJobStartHooks(jobs []*schema.Job) { if hooks == nil { return @@ -44,6 +84,12 @@ func CallJobStartHooks(jobs []*schema.Job) { } } +// CallJobStopHooks invokes all registered JobHook.JobStopCallback methods +// for the provided job. This is called internally by the repository when a +// job completes (e.g., via StopJob or job state updates). +// +// Hooks are called synchronously in registration order. If a hook panics, +// the panic will propagate to the caller. 
func CallJobStopHooks(job *schema.Job) { if hooks == nil { return diff --git a/internal/repository/migration.go b/internal/repository/migration.go index a47f9fcd..0f99889e 100644 --- a/internal/repository/migration.go +++ b/internal/repository/migration.go @@ -16,11 +16,29 @@ import ( "github.com/golang-migrate/migrate/v4/source/iofs" ) +// Version is the current database schema version required by this version of cc-backend. +// When the database schema changes, this version is incremented and a new migration file +// is added to internal/repository/migrations/sqlite3/. +// +// Version history: +// - Version 10: Current version +// +// Migration files are embedded at build time from the migrations directory. const Version uint = 10 //go:embed migrations/* var migrationFiles embed.FS +// checkDBVersion verifies that the database schema version matches the expected version. +// This is called automatically during Connect() to ensure schema compatibility. +// +// Returns an error if: +// - Database version is older than expected (needs migration) +// - Database version is newer than expected (needs app upgrade) +// - Database is in a dirty state (failed migration) +// +// A "dirty" database indicates a migration was started but not completed successfully. +// This requires manual intervention to fix the database and force the version. func checkDBVersion(db *sql.DB) error { driver, err := sqlite3.WithInstance(db, &sqlite3.Config{}) if err != nil { @@ -58,6 +76,8 @@ func checkDBVersion(db *sql.DB) error { return nil } +// getMigrateInstance creates a new migration instance for the given database file. +// This is used internally by MigrateDB, RevertDB, and ForceDB. 
func getMigrateInstance(db string) (m *migrate.Migrate, err error) { d, err := iofs.New(migrationFiles, "migrations/sqlite3") if err != nil { @@ -72,6 +92,23 @@ func getMigrateInstance(db string) (m *migrate.Migrate, err error) { return m, nil } +// MigrateDB applies all pending database migrations to bring the schema up to date. +// This should be run with the -migrate-db flag before starting the application +// after upgrading to a new version that requires schema changes. +// +// Process: +// 1. Checks current database version +// 2. Applies all migrations from current version to target Version +// 3. Updates schema_migrations table to track applied migrations +// +// Important: +// - Always backup your database before running migrations +// - Migrations are irreversible without manual intervention +// - If a migration fails, the database is marked "dirty" and requires manual fix +// +// Usage: +// +// cc-backend -migrate-db func MigrateDB(db string) error { m, err := getMigrateInstance(db) if err != nil { @@ -107,6 +144,17 @@ func MigrateDB(db string) error { return nil } +// RevertDB rolls back the database schema to the previous version (Version - 1). +// This is primarily used for testing or emergency rollback scenarios. +// +// Warning: +// - This may cause data loss if newer schema added columns/tables +// - Always backup before reverting +// - Not all migrations are safely reversible +// +// Usage: +// +// cc-backend -revert-db func RevertDB(db string) error { m, err := getMigrateInstance(db) if err != nil { @@ -125,6 +173,21 @@ func RevertDB(db string) error { return nil } +// ForceDB forces the database schema version to the current Version without running migrations. +// This is only used to recover from failed migrations that left the database in a "dirty" state. 
+// +// When to use: +// - After manually fixing a failed migration +// - When you've manually applied schema changes and need to update the version marker +// +// Warning: +// - This does NOT apply any schema changes +// - Only use after manually verifying the schema is correct +// - Improper use can cause schema/version mismatch +// +// Usage: +// +// cc-backend -force-db func ForceDB(db string) error { m, err := getMigrateInstance(db) if err != nil { diff --git a/internal/repository/stats.go b/internal/repository/stats.go index 989026d1..cd175c23 100644 --- a/internal/repository/stats.go +++ b/internal/repository/stats.go @@ -277,6 +277,15 @@ func (r *JobRepository) JobsStats( return stats, nil } +// LoadJobStat retrieves a specific statistic for a metric from a job's statistics. +// Returns 0.0 if the metric is not found or statType is invalid. +// +// Parameters: +// - job: Job struct with populated Statistics field +// - metric: Name of the metric to query (e.g., "cpu_load", "mem_used") +// - statType: Type of statistic: "avg", "min", or "max" +// +// Returns the requested statistic value or 0.0 if not found. func LoadJobStat(job *schema.Job, metric string, statType string) float64 { if stats, ok := job.Statistics[metric]; ok { switch statType { @@ -579,7 +588,9 @@ func (r *JobRepository) jobsDurationStatisticsHistogram( return nil, qerr } - // Setup Array + // Initialize histogram bins with zero counts + // Each bin represents a duration range: bin N = [N*binSizeSeconds, (N+1)*binSizeSeconds) + // Example: binSizeSeconds=3600 (1 hour), bin 1 = 0-1h, bin 2 = 1-2h, etc. 
points := make([]*model.HistoPoint, 0) for i := 1; i <= *targetBinCount; i++ { point := model.HistoPoint{Value: i * binSizeSeconds, Count: 0} @@ -596,7 +607,8 @@ func (r *JobRepository) jobsDurationStatisticsHistogram( return nil, err } - // Fill Array at matching $Value + // Match query results to pre-initialized bins and fill counts + // Query returns raw duration values that need to be mapped to correct bins for rows.Next() { point := model.HistoPoint{} if err := rows.Scan(&point.Value, &point.Count); err != nil { @@ -604,11 +616,13 @@ func (r *JobRepository) jobsDurationStatisticsHistogram( return nil, err } + // Find matching bin and update count + // point.Value is multiplied by binSizeSeconds to match pre-calculated bin.Value for _, e := range points { if e.Value == (point.Value * binSizeSeconds) { - // Note: - // Matching on unmodified integer value (and multiplying point.Value by binSizeSeconds after match) - // causes frontend to loop into highest targetBinCount, due to zoom condition instantly being fullfilled (cause unknown) + // Note: Matching on unmodified integer value (and multiplying point.Value + // by binSizeSeconds after match) causes frontend to loop into highest + // targetBinCount, due to zoom condition instantly being fulfilled (cause unknown) e.Count = point.Count break } @@ -625,12 +639,16 @@ func (r *JobRepository) jobsMetricStatisticsHistogram( filters []*model.JobFilter, bins *int, ) (*model.MetricHistoPoints, error) { - // Get specific Peak or largest Peak + // Determine the metric's peak value for histogram normalization + // Peak value defines the upper bound for binning: values are distributed across + // bins from 0 to peak. First try to get peak from filtered cluster, otherwise + // scan all clusters to find the maximum peak value. 
var metricConfig *schema.MetricConfig var peak float64 var unit string var footprintStat string + // Try to get metric config from filtered cluster for _, f := range filters { if f.Cluster != nil { metricConfig = archive.GetMetricConfig(*f.Cluster.Eq, metric) @@ -641,6 +659,8 @@ func (r *JobRepository) jobsMetricStatisticsHistogram( } } + // If no cluster filter or peak not found, find largest peak across all clusters + // This ensures histogram can accommodate all possible values if peak == 0.0 { for _, c := range archive.Clusters { for _, m := range c.MetricConfig { @@ -659,11 +679,18 @@ func (r *JobRepository) jobsMetricStatisticsHistogram( } } - // cclog.Debugf("Metric %s, Peak %f, Unit %s", metric, peak, unit) - // Make bins, see https://jereze.com/code/sql-histogram/ (Modified here) + // Construct SQL histogram bins using normalized values + // Algorithm based on: https://jereze.com/code/sql-histogram/ (modified) start := time.Now() - // Find Jobs' Value Bin Number: Divide Value by Peak, Multiply by RequestedBins, then CAST to INT: Gets Bin-Number of Job + // Calculate bin number for each job's metric value: + // 1. Extract metric value from JSON footprint + // 2. Normalize to [0,1] by dividing by peak + // 3. Multiply by number of bins to get bin number + // 4. 
Cast to integer for bin assignment + // + // Special case: Values exactly equal to peak would fall into bin N+1, + // so we multiply peak by 0.999999999 to force it into the last bin (bin N) binQuery := fmt.Sprintf(`CAST( ((case when json_extract(footprint, "$.%s") = %f then %f*0.999999999 else json_extract(footprint, "$.%s") end) / %f) * %v as INTEGER )`, @@ -698,7 +725,9 @@ func (r *JobRepository) jobsMetricStatisticsHistogram( return nil, err } - // Setup Return Array With Bin-Numbers for Match and Min/Max based on Peak + // Initialize histogram bins with calculated min/max ranges + // Each bin represents a range of metric values + // Example: peak=1000, bins=10 -> bin 1=[0,100), bin 2=[100,200), ..., bin 10=[900,1000] points := make([]*model.MetricHistoPoint, 0) binStep := int(peak) / *bins for i := 1; i <= *bins; i++ { @@ -708,13 +737,16 @@ func (r *JobRepository) jobsMetricStatisticsHistogram( points = append(points, &epoint) } - for rows.Next() { // Fill Count if Bin-No. Matches (Not every Bin exists in DB!) + // Fill counts from query results + // Query only returns bins that have jobs, so we match against pre-initialized bins + for rows.Next() { rpoint := model.MetricHistoPoint{} if err := rows.Scan(&rpoint.Bin, &rpoint.Count); err != nil { // Required for Debug: &rpoint.Min, &rpoint.Max cclog.Warnf("Error while scanning rows for %s", metric) return nil, err // FIXME: Totally bricks cc-backend if returned and if all metrics requested? 
} + // Match query result to pre-initialized bin and update count for _, e := range points { if e.Bin != nil && rpoint.Bin != nil { if *e.Bin == *rpoint.Bin { diff --git a/internal/repository/stats_test.go b/internal/repository/stats_test.go index e10c9685..a8dfc818 100644 --- a/internal/repository/stats_test.go +++ b/internal/repository/stats_test.go @@ -25,11 +25,20 @@ func TestBuildJobStatsQuery(t *testing.T) { func TestJobStats(t *testing.T) { r := setup(t) + // First, count the actual jobs in the database (excluding test jobs) + var expectedCount int + err := r.DB.QueryRow(`SELECT COUNT(*) FROM job WHERE cluster != 'testcluster'`).Scan(&expectedCount) + noErr(t, err) + filter := &model.JobFilter{} + // Exclude test jobs created by other tests + testCluster := "testcluster" + filter.Cluster = &model.StringInput{Neq: &testCluster} + stats, err := r.JobsStats(getContext(t), []*model.JobFilter{filter}) noErr(t, err) - if stats[0].TotalJobs != 544 { - t.Fatalf("Want 544, Got %d", stats[0].TotalJobs) + if stats[0].TotalJobs != expectedCount { + t.Fatalf("Want %d, Got %d", expectedCount, stats[0].TotalJobs) } } diff --git a/internal/repository/tags.go b/internal/repository/tags.go index 9bc9abae..f6cccfe2 100644 --- a/internal/repository/tags.go +++ b/internal/repository/tags.go @@ -16,8 +16,32 @@ import ( sq "github.com/Masterminds/squirrel" ) +// Tag Scope Rules: +// +// Tags in ClusterCockpit have three visibility scopes that control who can see and use them: +// +// 1. "global" - Visible to all users, can be used by anyone +// Example: System-generated tags like "energy-efficient", "failed", "short" +// +// 2. "private" - Only visible to the creating user +// Example: Personal notes like "needs-review", "interesting-case" +// +// 3. 
"admin" - Only visible to users with admin or support roles +// Example: Internal notes like "hardware-issue", "billing-problem" +// +// Authorization Rules: +// - Regular users can only create/see "global" and their own "private" tags +// - Admin/Support can create/see all scopes including "admin" tags +// - Users can only add tags to jobs they have permission to view +// - Tag scope is enforced at query time in GetTags() and CountTags() + // AddTag adds the tag with id `tagId` to the job with the database id `jobId`. // Requires user authentication for security checks. +// +// The user must have permission to view the job. Tag visibility is determined by scope: +// - "global" tags: visible to all users +// - "private" tags: only visible to the tag creator +// - "admin" tags: only visible to admin/support users func (r *JobRepository) AddTag(user *schema.User, job int64, tag int64) ([]*schema.Tag, error) { j, err := r.FindByIDWithUser(user, job) if err != nil { @@ -180,7 +204,15 @@ func (r *JobRepository) RemoveTagById(tagID int64) error { return nil } -// CreateTag creates a new tag with the specified type and name and returns its database id. +// CreateTag creates a new tag with the specified type, name, and scope. +// Returns the database ID of the newly created tag. +// +// Scope defaults to "global" if empty string is provided. +// Valid scopes: "global", "private", "admin" +// +// Example: +// +// tagID, err := repo.CreateTag("performance", "high-memory", "global") func (r *JobRepository) CreateTag(tagType string, tagName string, tagScope string) (tagId int64, err error) { // Default to "Global" scope if none defined if tagScope == "" { @@ -199,8 +231,14 @@ func (r *JobRepository) CreateTag(tagType string, tagName string, tagScope strin return res.LastInsertId() } +// CountTags returns all tags visible to the user and the count of jobs for each tag. +// Applies scope-based filtering to respect tag visibility rules. 
+// +// Returns: +// - tags: slice of tags the user can see +// - counts: map of tag name to job count +// - err: any error encountered func (r *JobRepository) CountTags(user *schema.User) (tags []schema.Tag, counts map[string]int, err error) { - // Fetch all Tags in DB for Display in Frontend Tag-View tags = make([]schema.Tag, 0, 100) xrows, err := r.DB.Queryx("SELECT id, tag_type, tag_name, tag_scope FROM tag") if err != nil { diff --git a/internal/repository/transaction_test.go b/internal/repository/transaction_test.go new file mode 100644 index 00000000..1832bea0 --- /dev/null +++ b/internal/repository/transaction_test.go @@ -0,0 +1,311 @@ +// Copyright (C) NHR@FAU, University Erlangen-Nuremberg. +// All rights reserved. This file is part of cc-backend. +// Use of this source code is governed by a MIT-style +// license that can be found in the LICENSE file. +package repository + +import ( + "testing" + + _ "github.com/mattn/go-sqlite3" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +func TestTransactionInit(t *testing.T) { + r := setup(t) + + t.Run("successful transaction init", func(t *testing.T) { + tx, err := r.TransactionInit() + require.NoError(t, err, "TransactionInit should succeed") + require.NotNil(t, tx, "Transaction should not be nil") + require.NotNil(t, tx.tx, "Transaction.tx should not be nil") + + // Clean up + err = tx.Rollback() + require.NoError(t, err, "Rollback should succeed") + }) +} + +func TestTransactionCommit(t *testing.T) { + r := setup(t) + + t.Run("commit after successful operations", func(t *testing.T) { + tx, err := r.TransactionInit() + require.NoError(t, err) + + // Insert a test tag + _, err = r.TransactionAdd(tx, "INSERT INTO tag (tag_type, tag_name, tag_scope) VALUES (?, ?, ?)", + "test_type", "test_tag_commit", "global") + require.NoError(t, err, "TransactionAdd should succeed") + + // Commit the transaction + err = tx.Commit() + require.NoError(t, err, "Commit should succeed") + + // 
Verify the tag was inserted + var count int + err = r.DB.QueryRow("SELECT COUNT(*) FROM tag WHERE tag_name = ?", "test_tag_commit").Scan(&count) + require.NoError(t, err) + assert.Equal(t, 1, count, "Tag should be committed to database") + + // Clean up + _, err = r.DB.Exec("DELETE FROM tag WHERE tag_name = ?", "test_tag_commit") + require.NoError(t, err) + }) + + t.Run("commit on already committed transaction", func(t *testing.T) { + tx, err := r.TransactionInit() + require.NoError(t, err) + + err = tx.Commit() + require.NoError(t, err, "First commit should succeed") + + err = tx.Commit() + assert.Error(t, err, "Second commit should fail") + assert.Contains(t, err.Error(), "transaction already committed or rolled back") + }) +} + +func TestTransactionRollback(t *testing.T) { + r := setup(t) + + t.Run("rollback after operations", func(t *testing.T) { + tx, err := r.TransactionInit() + require.NoError(t, err) + + // Insert a test tag + _, err = r.TransactionAdd(tx, "INSERT INTO tag (tag_type, tag_name, tag_scope) VALUES (?, ?, ?)", + "test_type", "test_tag_rollback", "global") + require.NoError(t, err, "TransactionAdd should succeed") + + // Rollback the transaction + err = tx.Rollback() + require.NoError(t, err, "Rollback should succeed") + + // Verify the tag was NOT inserted + var count int + err = r.DB.QueryRow("SELECT COUNT(*) FROM tag WHERE tag_name = ?", "test_tag_rollback").Scan(&count) + require.NoError(t, err) + assert.Equal(t, 0, count, "Tag should not be in database after rollback") + }) + + t.Run("rollback on already rolled back transaction", func(t *testing.T) { + tx, err := r.TransactionInit() + require.NoError(t, err) + + err = tx.Rollback() + require.NoError(t, err, "First rollback should succeed") + + err = tx.Rollback() + assert.NoError(t, err, "Second rollback should be safe (no-op)") + }) + + t.Run("rollback on committed transaction", func(t *testing.T) { + tx, err := r.TransactionInit() + require.NoError(t, err) + + err = tx.Commit() + 
require.NoError(t, err) + + err = tx.Rollback() + assert.NoError(t, err, "Rollback after commit should be safe (no-op)") + }) +} + +func TestTransactionAdd(t *testing.T) { + r := setup(t) + + t.Run("insert with TransactionAdd", func(t *testing.T) { + tx, err := r.TransactionInit() + require.NoError(t, err) + defer tx.Rollback() + + id, err := r.TransactionAdd(tx, "INSERT INTO tag (tag_type, tag_name, tag_scope) VALUES (?, ?, ?)", + "test_type", "test_add", "global") + require.NoError(t, err, "TransactionAdd should succeed") + assert.Greater(t, id, int64(0), "Should return valid insert ID") + }) + + t.Run("error on nil transaction", func(t *testing.T) { + tx := &Transaction{tx: nil} + + _, err := r.TransactionAdd(tx, "INSERT INTO tag (tag_type, tag_name, tag_scope) VALUES (?, ?, ?)", + "test_type", "test_nil", "global") + assert.Error(t, err, "Should error on nil transaction") + assert.Contains(t, err.Error(), "transaction is nil or already completed") + }) + + t.Run("error on invalid SQL", func(t *testing.T) { + tx, err := r.TransactionInit() + require.NoError(t, err) + defer tx.Rollback() + + _, err = r.TransactionAdd(tx, "INVALID SQL STATEMENT") + assert.Error(t, err, "Should error on invalid SQL") + }) + + t.Run("error after transaction committed", func(t *testing.T) { + tx, err := r.TransactionInit() + require.NoError(t, err) + + err = tx.Commit() + require.NoError(t, err) + + _, err = r.TransactionAdd(tx, "INSERT INTO tag (tag_type, tag_name, tag_scope) VALUES (?, ?, ?)", + "test_type", "test_after_commit", "global") + assert.Error(t, err, "Should error when transaction is already committed") + }) +} + +func TestTransactionAddNamed(t *testing.T) { + r := setup(t) + + t.Run("insert with TransactionAddNamed", func(t *testing.T) { + tx, err := r.TransactionInit() + require.NoError(t, err) + defer tx.Rollback() + + type TagArgs struct { + Type string `db:"type"` + Name string `db:"name"` + Scope string `db:"scope"` + } + + args := TagArgs{ + Type: "test_type", + 
Name: "test_named", + Scope: "global", + } + + id, err := r.TransactionAddNamed(tx, + "INSERT INTO tag (tag_type, tag_name, tag_scope) VALUES (:type, :name, :scope)", + args) + require.NoError(t, err, "TransactionAddNamed should succeed") + assert.Greater(t, id, int64(0), "Should return valid insert ID") + }) + + t.Run("error on nil transaction", func(t *testing.T) { + tx := &Transaction{tx: nil} + + _, err := r.TransactionAddNamed(tx, "INSERT INTO tag (tag_type, tag_name, tag_scope) VALUES (:type, :name, :scope)", + map[string]interface{}{"type": "test", "name": "test", "scope": "global"}) + assert.Error(t, err, "Should error on nil transaction") + assert.Contains(t, err.Error(), "transaction is nil or already completed") + }) +} + +func TestTransactionMultipleOperations(t *testing.T) { + r := setup(t) + + t.Run("multiple inserts in single transaction", func(t *testing.T) { + tx, err := r.TransactionInit() + require.NoError(t, err) + defer tx.Rollback() + + // Insert multiple tags + for i := 0; i < 5; i++ { + _, err = r.TransactionAdd(tx, + "INSERT INTO tag (tag_type, tag_name, tag_scope) VALUES (?, ?, ?)", + "test_type", "test_multi_"+string(rune('a'+i)), "global") + require.NoError(t, err, "Insert %d should succeed", i) + } + + err = tx.Commit() + require.NoError(t, err, "Commit should succeed") + + // Verify all tags were inserted + var count int + err = r.DB.QueryRow("SELECT COUNT(*) FROM tag WHERE tag_name LIKE 'test_multi_%'").Scan(&count) + require.NoError(t, err) + assert.Equal(t, 5, count, "All 5 tags should be committed") + + // Clean up + _, err = r.DB.Exec("DELETE FROM tag WHERE tag_name LIKE 'test_multi_%'") + require.NoError(t, err) + }) + + t.Run("rollback undoes all operations", func(t *testing.T) { + tx, err := r.TransactionInit() + require.NoError(t, err) + + // Insert multiple tags + for i := 0; i < 3; i++ { + _, err = r.TransactionAdd(tx, + "INSERT INTO tag (tag_type, tag_name, tag_scope) VALUES (?, ?, ?)", + "test_type", 
"test_rollback_"+string(rune('a'+i)), "global") + require.NoError(t, err) + } + + err = tx.Rollback() + require.NoError(t, err, "Rollback should succeed") + + // Verify no tags were inserted + var count int + err = r.DB.QueryRow("SELECT COUNT(*) FROM tag WHERE tag_name LIKE 'test_rollback_%'").Scan(&count) + require.NoError(t, err) + assert.Equal(t, 0, count, "No tags should be in database after rollback") + }) +} + +func TestTransactionEnd(t *testing.T) { + r := setup(t) + + t.Run("deprecated TransactionEnd calls Commit", func(t *testing.T) { + tx, err := r.TransactionInit() + require.NoError(t, err) + + _, err = r.TransactionAdd(tx, "INSERT INTO tag (tag_type, tag_name, tag_scope) VALUES (?, ?, ?)", + "test_type", "test_end", "global") + require.NoError(t, err) + + // Use deprecated method + err = r.TransactionEnd(tx) + require.NoError(t, err, "TransactionEnd should succeed") + + // Verify the tag was committed + var count int + err = r.DB.QueryRow("SELECT COUNT(*) FROM tag WHERE tag_name = ?", "test_end").Scan(&count) + require.NoError(t, err) + assert.Equal(t, 1, count, "Tag should be committed") + + // Clean up + _, err = r.DB.Exec("DELETE FROM tag WHERE tag_name = ?", "test_end") + require.NoError(t, err) + }) +} + +func TestTransactionDeferPattern(t *testing.T) { + r := setup(t) + + t.Run("defer rollback pattern", func(t *testing.T) { + insertTag := func() error { + tx, err := r.TransactionInit() + if err != nil { + return err + } + defer tx.Rollback() // Safe to call even after commit + + _, err = r.TransactionAdd(tx, "INSERT INTO tag (tag_type, tag_name, tag_scope) VALUES (?, ?, ?)", + "test_type", "test_defer", "global") + if err != nil { + return err + } + + return tx.Commit() + } + + err := insertTag() + require.NoError(t, err, "Function should succeed") + + // Verify the tag was committed + var count int + err = r.DB.QueryRow("SELECT COUNT(*) FROM tag WHERE tag_name = ?", "test_defer").Scan(&count) + require.NoError(t, err) + assert.Equal(t, 1, count, 
"Tag should be committed despite defer rollback") + + // Clean up + _, err = r.DB.Exec("DELETE FROM tag WHERE tag_name = ?", "test_defer") + require.NoError(t, err) + }) +} diff --git a/internal/repository/user.go b/internal/repository/user.go index 770915b6..42a22384 100644 --- a/internal/repository/user.go +++ b/internal/repository/user.go @@ -22,6 +22,25 @@ import ( "golang.org/x/crypto/bcrypt" ) +// Authentication and Role System: +// +// ClusterCockpit supports multiple authentication sources: +// - Local: Username/password stored in database (password hashed with bcrypt) +// - LDAP: External LDAP/Active Directory authentication +// - JWT: Token-based authentication for API access +// +// Role Hierarchy (from highest to lowest privilege): +// 1. "admin" - Full system access, can manage all users and jobs +// 2. "support" - Can view all jobs but limited management capabilities +// 3. "manager" - Can manage specific projects and their users +// 4. "api" - Programmatic access for job submission/management +// 5. "user" - Default role, can only view own jobs +// +// Project Association: +// - Managers have a list of projects they oversee +// - Regular users' project membership is determined by job data +// - Managers can view/manage all jobs within their projects + var ( userRepoOnce sync.Once userRepoInstance *UserRepository @@ -44,6 +63,9 @@ func GetUserRepository() *UserRepository { return userRepoInstance } +// GetUser retrieves a user by username from the database. +// Returns the complete user record including hashed password, roles, and projects. +// Password field contains bcrypt hash for local auth users, empty for LDAP users. 
func (r *UserRepository) GetUser(username string) (*schema.User, error) { user := &schema.User{Username: username} var hashedPassword, name, rawRoles, email, rawProjects sql.NullString @@ -93,6 +115,12 @@ func (r *UserRepository) GetLdapUsernames() ([]string, error) { return users, nil } +// AddUser creates a new user in the database. +// Passwords are automatically hashed with bcrypt before storage. +// Auth source determines authentication method (local, LDAP, etc.). +// +// Required fields: Username, Roles +// Optional fields: Name, Email, Password, Projects, AuthSource func (r *UserRepository) AddUser(user *schema.User) error { rolesJson, _ := json.Marshal(user.Roles) projectsJson, _ := json.Marshal(user.Projects) @@ -229,6 +257,14 @@ func (r *UserRepository) ListUsers(specialsOnly bool) ([]*schema.User, error) { return users, nil } +// AddRole adds a role to a user's role list. +// Role string is automatically lowercased. +// Valid roles: admin, support, manager, api, user +// +// Returns error if: +// - User doesn't exist +// - Role is invalid +// - User already has the role func (r *UserRepository) AddRole( ctx context.Context, username string, @@ -258,6 +294,11 @@ func (r *UserRepository) AddRole( return nil } +// RemoveRole removes a role from a user's role list. +// +// Special rules: +// - Cannot remove "manager" role while user has assigned projects +// - Must remove all projects first before removing manager role func (r *UserRepository) RemoveRole(ctx context.Context, username string, queryrole string) error { oldRole := strings.ToLower(queryrole) user, err := r.GetUser(username) @@ -294,6 +335,12 @@ func (r *UserRepository) RemoveRole(ctx context.Context, username string, queryr return nil } +// AddProject assigns a project to a manager user. +// Only users with the "manager" role can have assigned projects. 
+// +// Returns error if: +// - User doesn't have manager role +// - User already manages the project func (r *UserRepository) AddProject( ctx context.Context, username string, diff --git a/internal/repository/user_test.go b/internal/repository/user_test.go new file mode 100644 index 00000000..370d261d --- /dev/null +++ b/internal/repository/user_test.go @@ -0,0 +1,596 @@ +// Copyright (C) NHR@FAU, University Erlangen-Nuremberg. +// All rights reserved. This file is part of cc-backend. +// Use of this source code is governed by a MIT-style +// license that can be found in the LICENSE file. +package repository + +import ( + "context" + "testing" + + "github.com/ClusterCockpit/cc-lib/v2/schema" + _ "github.com/mattn/go-sqlite3" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + "golang.org/x/crypto/bcrypt" +) + +func TestAddUser(t *testing.T) { + _ = setup(t) + r := GetUserRepository() + + t.Run("add user with all fields", func(t *testing.T) { + user := &schema.User{ + Username: "testuser1", + Name: "Test User One", + Email: "test1@example.com", + Password: "testpassword123", + Roles: []string{"user"}, + Projects: []string{"project1", "project2"}, + AuthSource: schema.AuthViaLocalPassword, + } + + err := r.AddUser(user) + require.NoError(t, err) + + retrievedUser, err := r.GetUser("testuser1") + require.NoError(t, err) + assert.Equal(t, user.Username, retrievedUser.Username) + assert.Equal(t, user.Name, retrievedUser.Name) + assert.Equal(t, user.Email, retrievedUser.Email) + assert.Equal(t, user.Roles, retrievedUser.Roles) + assert.Equal(t, user.Projects, retrievedUser.Projects) + assert.NotEmpty(t, retrievedUser.Password) + err = bcrypt.CompareHashAndPassword([]byte(retrievedUser.Password), []byte("testpassword123")) + assert.NoError(t, err, "Password should be hashed correctly") + + err = r.DelUser("testuser1") + require.NoError(t, err) + }) + + t.Run("add user with minimal fields", func(t *testing.T) { + user := &schema.User{ + 
Username: "testuser2", + Roles: []string{"user"}, + Projects: []string{}, + AuthSource: schema.AuthViaLDAP, + } + + err := r.AddUser(user) + require.NoError(t, err) + + retrievedUser, err := r.GetUser("testuser2") + require.NoError(t, err) + assert.Equal(t, user.Username, retrievedUser.Username) + assert.Equal(t, "", retrievedUser.Name) + assert.Equal(t, "", retrievedUser.Email) + assert.Equal(t, "", retrievedUser.Password) + + err = r.DelUser("testuser2") + require.NoError(t, err) + }) + + t.Run("add duplicate user fails", func(t *testing.T) { + user := &schema.User{ + Username: "testuser3", + Roles: []string{"user"}, + Projects: []string{}, + AuthSource: schema.AuthViaLocalPassword, + } + + err := r.AddUser(user) + require.NoError(t, err) + + err = r.AddUser(user) + assert.Error(t, err, "Adding duplicate user should fail") + + err = r.DelUser("testuser3") + require.NoError(t, err) + }) +} + +func TestGetUser(t *testing.T) { + _ = setup(t) + r := GetUserRepository() + + t.Run("get existing user", func(t *testing.T) { + user := &schema.User{ + Username: "getuser1", + Name: "Get User", + Email: "getuser@example.com", + Roles: []string{"user", "admin"}, + Projects: []string{"proj1"}, + AuthSource: schema.AuthViaLocalPassword, + } + + err := r.AddUser(user) + require.NoError(t, err) + + retrieved, err := r.GetUser("getuser1") + require.NoError(t, err) + assert.Equal(t, user.Username, retrieved.Username) + assert.Equal(t, user.Name, retrieved.Name) + assert.Equal(t, user.Email, retrieved.Email) + assert.ElementsMatch(t, user.Roles, retrieved.Roles) + assert.ElementsMatch(t, user.Projects, retrieved.Projects) + + err = r.DelUser("getuser1") + require.NoError(t, err) + }) + + t.Run("get non-existent user", func(t *testing.T) { + _, err := r.GetUser("nonexistent") + assert.Error(t, err) + }) +} + +func TestUpdateUser(t *testing.T) { + _ = setup(t) + r := GetUserRepository() + + t.Run("update user name", func(t *testing.T) { + user := &schema.User{ + Username: 
"updateuser1", + Name: "Original Name", + Roles: []string{"user"}, + Projects: []string{}, + AuthSource: schema.AuthViaLocalPassword, + } + + err := r.AddUser(user) + require.NoError(t, err) + + dbUser, err := r.GetUser("updateuser1") + require.NoError(t, err) + + updatedUser := &schema.User{ + Username: "updateuser1", + Name: "Updated Name", + } + + err = r.UpdateUser(dbUser, updatedUser) + require.NoError(t, err) + + retrieved, err := r.GetUser("updateuser1") + require.NoError(t, err) + assert.Equal(t, "Updated Name", retrieved.Name) + + err = r.DelUser("updateuser1") + require.NoError(t, err) + }) + + t.Run("update with no changes", func(t *testing.T) { + user := &schema.User{ + Username: "updateuser2", + Name: "Same Name", + Roles: []string{"user"}, + Projects: []string{}, + AuthSource: schema.AuthViaLocalPassword, + } + + err := r.AddUser(user) + require.NoError(t, err) + + dbUser, err := r.GetUser("updateuser2") + require.NoError(t, err) + + err = r.UpdateUser(dbUser, dbUser) + assert.NoError(t, err) + + err = r.DelUser("updateuser2") + require.NoError(t, err) + }) +} + +func TestDelUser(t *testing.T) { + _ = setup(t) + r := GetUserRepository() + + t.Run("delete existing user", func(t *testing.T) { + user := &schema.User{ + Username: "deluser1", + Roles: []string{"user"}, + Projects: []string{}, + AuthSource: schema.AuthViaLocalPassword, + } + + err := r.AddUser(user) + require.NoError(t, err) + + err = r.DelUser("deluser1") + require.NoError(t, err) + + _, err = r.GetUser("deluser1") + assert.Error(t, err, "User should not exist after deletion") + }) + + t.Run("delete non-existent user", func(t *testing.T) { + err := r.DelUser("nonexistent") + assert.NoError(t, err, "Deleting non-existent user should not error") + }) +} + +func TestListUsers(t *testing.T) { + _ = setup(t) + r := GetUserRepository() + + user1 := &schema.User{ + Username: "listuser1", + Roles: []string{"user"}, + Projects: []string{}, + AuthSource: schema.AuthViaLocalPassword, + } + user2 := 
&schema.User{ + Username: "listuser2", + Roles: []string{"admin"}, + Projects: []string{}, + AuthSource: schema.AuthViaLocalPassword, + } + user3 := &schema.User{ + Username: "listuser3", + Roles: []string{"manager"}, + Projects: []string{"proj1"}, + AuthSource: schema.AuthViaLocalPassword, + } + + err := r.AddUser(user1) + require.NoError(t, err) + err = r.AddUser(user2) + require.NoError(t, err) + err = r.AddUser(user3) + require.NoError(t, err) + + t.Run("list all users", func(t *testing.T) { + users, err := r.ListUsers(false) + require.NoError(t, err) + assert.GreaterOrEqual(t, len(users), 3) + + usernames := make([]string, len(users)) + for i, u := range users { + usernames[i] = u.Username + } + assert.Contains(t, usernames, "listuser1") + assert.Contains(t, usernames, "listuser2") + assert.Contains(t, usernames, "listuser3") + }) + + t.Run("list special users only", func(t *testing.T) { + users, err := r.ListUsers(true) + require.NoError(t, err) + + usernames := make([]string, len(users)) + for i, u := range users { + usernames[i] = u.Username + } + assert.Contains(t, usernames, "listuser2") + assert.Contains(t, usernames, "listuser3") + }) + + err = r.DelUser("listuser1") + require.NoError(t, err) + err = r.DelUser("listuser2") + require.NoError(t, err) + err = r.DelUser("listuser3") + require.NoError(t, err) +} + +func TestGetLdapUsernames(t *testing.T) { + _ = setup(t) + r := GetUserRepository() + + ldapUser := &schema.User{ + Username: "ldapuser1", + Roles: []string{"user"}, + Projects: []string{}, + AuthSource: schema.AuthViaLDAP, + } + localUser := &schema.User{ + Username: "localuser1", + Roles: []string{"user"}, + Projects: []string{}, + AuthSource: schema.AuthViaLocalPassword, + } + + err := r.AddUser(ldapUser) + require.NoError(t, err) + err = r.AddUser(localUser) + require.NoError(t, err) + + usernames, err := r.GetLdapUsernames() + require.NoError(t, err) + assert.Contains(t, usernames, "ldapuser1") + assert.NotContains(t, usernames, "localuser1") 
+ + err = r.DelUser("ldapuser1") + require.NoError(t, err) + err = r.DelUser("localuser1") + require.NoError(t, err) +} + +func TestAddRole(t *testing.T) { + _ = setup(t) + r := GetUserRepository() + ctx := context.Background() + + t.Run("add valid role", func(t *testing.T) { + user := &schema.User{ + Username: "roleuser1", + Roles: []string{"user"}, + Projects: []string{}, + AuthSource: schema.AuthViaLocalPassword, + } + + err := r.AddUser(user) + require.NoError(t, err) + + err = r.AddRole(ctx, "roleuser1", "admin") + require.NoError(t, err) + + retrieved, err := r.GetUser("roleuser1") + require.NoError(t, err) + assert.Contains(t, retrieved.Roles, "admin") + assert.Contains(t, retrieved.Roles, "user") + + err = r.DelUser("roleuser1") + require.NoError(t, err) + }) + + t.Run("add duplicate role", func(t *testing.T) { + user := &schema.User{ + Username: "roleuser2", + Roles: []string{"user"}, + Projects: []string{}, + AuthSource: schema.AuthViaLocalPassword, + } + + err := r.AddUser(user) + require.NoError(t, err) + + err = r.AddRole(ctx, "roleuser2", "user") + assert.Error(t, err, "Adding duplicate role should fail") + assert.Contains(t, err.Error(), "already has role") + + err = r.DelUser("roleuser2") + require.NoError(t, err) + }) + + t.Run("add invalid role", func(t *testing.T) { + user := &schema.User{ + Username: "roleuser3", + Roles: []string{"user"}, + Projects: []string{}, + AuthSource: schema.AuthViaLocalPassword, + } + + err := r.AddUser(user) + require.NoError(t, err) + + err = r.AddRole(ctx, "roleuser3", "invalidrole") + assert.Error(t, err, "Adding invalid role should fail") + assert.Contains(t, err.Error(), "no valid option") + + err = r.DelUser("roleuser3") + require.NoError(t, err) + }) +} + +func TestRemoveRole(t *testing.T) { + _ = setup(t) + r := GetUserRepository() + ctx := context.Background() + + t.Run("remove existing role", func(t *testing.T) { + user := &schema.User{ + Username: "rmroleuser1", + Roles: []string{"user", "admin"}, + 
Projects: []string{}, + AuthSource: schema.AuthViaLocalPassword, + } + + err := r.AddUser(user) + require.NoError(t, err) + + err = r.RemoveRole(ctx, "rmroleuser1", "admin") + require.NoError(t, err) + + retrieved, err := r.GetUser("rmroleuser1") + require.NoError(t, err) + assert.NotContains(t, retrieved.Roles, "admin") + assert.Contains(t, retrieved.Roles, "user") + + err = r.DelUser("rmroleuser1") + require.NoError(t, err) + }) + + t.Run("remove non-existent role", func(t *testing.T) { + user := &schema.User{ + Username: "rmroleuser2", + Roles: []string{"user"}, + Projects: []string{}, + AuthSource: schema.AuthViaLocalPassword, + } + + err := r.AddUser(user) + require.NoError(t, err) + + err = r.RemoveRole(ctx, "rmroleuser2", "admin") + assert.Error(t, err, "Removing non-existent role should fail") + assert.Contains(t, err.Error(), "already deleted") + + err = r.DelUser("rmroleuser2") + require.NoError(t, err) + }) + + t.Run("remove manager role with projects", func(t *testing.T) { + user := &schema.User{ + Username: "rmroleuser3", + Roles: []string{"manager"}, + Projects: []string{"proj1", "proj2"}, + AuthSource: schema.AuthViaLocalPassword, + } + + err := r.AddUser(user) + require.NoError(t, err) + + err = r.RemoveRole(ctx, "rmroleuser3", "manager") + assert.Error(t, err, "Removing manager role with projects should fail") + assert.Contains(t, err.Error(), "still has assigned project") + + err = r.DelUser("rmroleuser3") + require.NoError(t, err) + }) +} + +func TestAddProject(t *testing.T) { + _ = setup(t) + r := GetUserRepository() + ctx := context.Background() + + t.Run("add project to manager", func(t *testing.T) { + user := &schema.User{ + Username: "projuser1", + Roles: []string{"manager"}, + Projects: []string{}, + AuthSource: schema.AuthViaLocalPassword, + } + + err := r.AddUser(user) + require.NoError(t, err) + + err = r.AddProject(ctx, "projuser1", "newproject") + require.NoError(t, err) + + retrieved, err := r.GetUser("projuser1") + require.NoError(t, 
err) + assert.Contains(t, retrieved.Projects, "newproject") + + err = r.DelUser("projuser1") + require.NoError(t, err) + }) + + t.Run("add project to non-manager", func(t *testing.T) { + user := &schema.User{ + Username: "projuser2", + Roles: []string{"user"}, + Projects: []string{}, + AuthSource: schema.AuthViaLocalPassword, + } + + err := r.AddUser(user) + require.NoError(t, err) + + err = r.AddProject(ctx, "projuser2", "newproject") + assert.Error(t, err, "Adding project to non-manager should fail") + assert.Contains(t, err.Error(), "not a manager") + + err = r.DelUser("projuser2") + require.NoError(t, err) + }) + + t.Run("add duplicate project", func(t *testing.T) { + user := &schema.User{ + Username: "projuser3", + Roles: []string{"manager"}, + Projects: []string{"existingproject"}, + AuthSource: schema.AuthViaLocalPassword, + } + + err := r.AddUser(user) + require.NoError(t, err) + + err = r.AddProject(ctx, "projuser3", "existingproject") + assert.Error(t, err, "Adding duplicate project should fail") + assert.Contains(t, err.Error(), "already manages") + + err = r.DelUser("projuser3") + require.NoError(t, err) + }) +} + +func TestRemoveProject(t *testing.T) { + _ = setup(t) + r := GetUserRepository() + ctx := context.Background() + + t.Run("remove existing project", func(t *testing.T) { + user := &schema.User{ + Username: "rmprojuser1", + Roles: []string{"manager"}, + Projects: []string{"proj1", "proj2"}, + AuthSource: schema.AuthViaLocalPassword, + } + + err := r.AddUser(user) + require.NoError(t, err) + + err = r.RemoveProject(ctx, "rmprojuser1", "proj1") + require.NoError(t, err) + + retrieved, err := r.GetUser("rmprojuser1") + require.NoError(t, err) + assert.NotContains(t, retrieved.Projects, "proj1") + assert.Contains(t, retrieved.Projects, "proj2") + + err = r.DelUser("rmprojuser1") + require.NoError(t, err) + }) + + t.Run("remove non-existent project", func(t *testing.T) { + user := &schema.User{ + Username: "rmprojuser2", + Roles: 
[]string{"manager"}, + Projects: []string{"proj1"}, + AuthSource: schema.AuthViaLocalPassword, + } + + err := r.AddUser(user) + require.NoError(t, err) + + err = r.RemoveProject(ctx, "rmprojuser2", "nonexistent") + assert.Error(t, err, "Removing non-existent project should fail") + + err = r.DelUser("rmprojuser2") + require.NoError(t, err) + }) + + t.Run("remove project from non-manager", func(t *testing.T) { + user := &schema.User{ + Username: "rmprojuser3", + Roles: []string{"user"}, + Projects: []string{}, + AuthSource: schema.AuthViaLocalPassword, + } + + err := r.AddUser(user) + require.NoError(t, err) + + err = r.RemoveProject(ctx, "rmprojuser3", "proj1") + assert.Error(t, err, "Removing project from non-manager should fail") + assert.Contains(t, err.Error(), "not a manager") + + err = r.DelUser("rmprojuser3") + require.NoError(t, err) + }) +} + +func TestGetUserFromContext(t *testing.T) { + t.Run("get user from context", func(t *testing.T) { + user := &schema.User{ + Username: "contextuser", + Roles: []string{"user"}, + } + + ctx := context.WithValue(context.Background(), ContextUserKey, user) + retrieved := GetUserFromContext(ctx) + + require.NotNil(t, retrieved) + assert.Equal(t, user.Username, retrieved.Username) + }) + + t.Run("get user from empty context", func(t *testing.T) { + ctx := context.Background() + retrieved := GetUserFromContext(ctx) + + assert.Nil(t, retrieved) + }) +} diff --git a/internal/tagger/tagger.go b/internal/tagger/tagger.go index 0839603d..2a5a0a7d 100644 --- a/internal/tagger/tagger.go +++ b/internal/tagger/tagger.go @@ -64,7 +64,7 @@ func newTagger() { func Init() { initOnce.Do(func() { newTagger() - repository.RegisterJobJook(jobTagger) + repository.RegisterJobHook(jobTagger) }) } From e1efc684763cff3cf379a9ce8704f37706833e1d Mon Sep 17 00:00:00 2001 From: Jan Eitzinger Date: Thu, 15 Jan 2026 08:32:06 +0100 Subject: [PATCH 2/2] Update dependencies. 
Rebuild graphql and swagger --- api/swagger.json | 1178 +++++++++++++++++++++--- api/swagger.yaml | 690 ++++++++++++-- go.mod | 4 +- go.sum | 35 +- internal/api/cluster.go | 2 +- internal/api/docs.go | 1178 +++++++++++++++++++++--- internal/api/job.go | 27 +- internal/api/node.go | 2 +- internal/api/user.go | 2 +- internal/graph/generated/generated.go | 20 +- internal/graph/schema.resolvers.go | 4 +- internal/importer/handleImport.go | 1 + internal/importer/normalize.go | 1 + internal/metricstore/metricstore.go | 4 +- internal/repository/hooks.go | 1 + internal/repository/job.go | 2 - internal/repository/jobHooks.go | 1 + internal/repository/job_test.go | 4 +- internal/repository/node.go | 2 +- internal/repository/repository_test.go | 8 +- internal/repository/stats.go | 356 +++++-- internal/repository/tags.go | 197 +++- pkg/archive/clusterConfig.go | 4 +- pkg/archive/nodelist.go | 153 ++- 24 files changed, 3321 insertions(+), 555 deletions(-) diff --git a/api/swagger.json b/api/swagger.json index 0327a91d..42ed7bb6 100644 --- a/api/swagger.json +++ b/api/swagger.json @@ -18,11 +18,6 @@ "paths": { "/api/clusters/": { "get": { - "security": [ - { - "ApiKeyAuth": [] - } - ], "description": "Get a list of all cluster configs. Specific cluster can be requested using query parameter.", "produces": [ "application/json" @@ -43,7 +38,7 @@ "200": { "description": "Array of clusters", "schema": { - "$ref": "#/definitions/api.GetClustersApiResponse" + "$ref": "#/definitions/api.GetClustersAPIResponse" } }, "400": { @@ -70,16 +65,16 @@ "$ref": "#/definitions/api.ErrorResponse" } } - } - } - }, - "/api/jobs/": { - "get": { + }, "security": [ { "ApiKeyAuth": [] } - ], + ] + } + }, + "/api/jobs/": { + "get": { "description": "Get a list of all jobs. Filters can be applied using query parameters.\nNumber of results can be limited by page. 
Results are sorted by descending startTime.", "produces": [ "application/json" @@ -138,7 +133,7 @@ "200": { "description": "Job array and page info", "schema": { - "$ref": "#/definitions/api.GetJobsApiResponse" + "$ref": "#/definitions/api.GetJobsAPIResponse" } }, "400": { @@ -165,16 +160,16 @@ "$ref": "#/definitions/api.ErrorResponse" } } - } - } - }, - "/api/jobs/delete_job/": { - "delete": { + }, "security": [ { "ApiKeyAuth": [] } - ], + ] + } + }, + "/api/jobs/delete_job/": { + "delete": { "description": "Job to delete is specified by request body. All fields are required in this case.", "consumes": [ "application/json" @@ -193,7 +188,7 @@ "in": "body", "required": true, "schema": { - "$ref": "#/definitions/api.DeleteJobApiRequest" + "$ref": "#/definitions/api.DeleteJobAPIRequest" } } ], @@ -201,7 +196,7 @@ "200": { "description": "Success message", "schema": { - "$ref": "#/definitions/api.DefaultApiResponse" + "$ref": "#/definitions/api.DefaultAPIResponse" } }, "400": { @@ -240,16 +235,16 @@ "$ref": "#/definitions/api.ErrorResponse" } } - } - } - }, - "/api/jobs/delete_job/{id}": { - "delete": { + }, "security": [ { "ApiKeyAuth": [] } - ], + ] + } + }, + "/api/jobs/delete_job/{id}": { + "delete": { "description": "Job to remove is specified by database ID. This will not remove the job from the job archive.", "produces": [ "application/json" @@ -271,7 +266,7 @@ "200": { "description": "Success message", "schema": { - "$ref": "#/definitions/api.DefaultApiResponse" + "$ref": "#/definitions/api.DefaultAPIResponse" } }, "400": { @@ -310,16 +305,16 @@ "$ref": "#/definitions/api.ErrorResponse" } } - } - } - }, - "/api/jobs/delete_job_before/{ts}": { - "delete": { + }, "security": [ { "ApiKeyAuth": [] } - ], + ] + } + }, + "/api/jobs/delete_job_before/{ts}": { + "delete": { "description": "Remove all jobs with start time before timestamp. 
The jobs will not be removed from the job archive.", "produces": [ "application/json" @@ -335,13 +330,19 @@ "name": "ts", "in": "path", "required": true + }, + { + "type": "boolean", + "description": "Omit jobs with tags from deletion", + "name": "omit-tagged", + "in": "query" } ], "responses": { "200": { "description": "Success message", "schema": { - "$ref": "#/definitions/api.DefaultApiResponse" + "$ref": "#/definitions/api.DefaultAPIResponse" } }, "400": { @@ -380,16 +381,16 @@ "$ref": "#/definitions/api.ErrorResponse" } } - } - } - }, - "/api/jobs/edit_meta/{id}": { - "post": { + }, "security": [ { "ApiKeyAuth": [] } - ], + ] + } + }, + "/api/jobs/edit_meta/{id}": { + "post": { "description": "Edit key value pairs in job metadata json\nIf a key already exists its content will be overwritten", "consumes": [ "application/json" @@ -450,16 +451,16 @@ "$ref": "#/definitions/api.ErrorResponse" } } - } - } - }, - "/api/jobs/start_job/": { - "post": { + }, "security": [ { "ApiKeyAuth": [] } - ], + ] + } + }, + "/api/jobs/start_job/": { + "post": { "description": "Job specified in request body will be saved to database as \"running\" with new DB ID.\nJob specifications follow the 'Job' scheme, API will fail to execute if requirements are not met.", "consumes": [ "application/json" @@ -486,7 +487,7 @@ "201": { "description": "Job added successfully", "schema": { - "$ref": "#/definitions/api.DefaultApiResponse" + "$ref": "#/definitions/api.DefaultAPIResponse" } }, "400": { @@ -519,16 +520,16 @@ "$ref": "#/definitions/api.ErrorResponse" } } - } - } - }, - "/api/jobs/stop_job/": { - "post": { + }, "security": [ { "ApiKeyAuth": [] } - ], + ] + } + }, + "/api/jobs/stop_job/": { + "post": { "description": "Job to stop is specified by request body. 
All fields are required in this case.\nReturns full job resource information according to 'Job' scheme.", "produces": [ "application/json" @@ -544,7 +545,7 @@ "in": "body", "required": true, "schema": { - "$ref": "#/definitions/api.StopJobApiRequest" + "$ref": "#/definitions/api.StopJobAPIRequest" } } ], @@ -591,16 +592,16 @@ "$ref": "#/definitions/api.ErrorResponse" } } - } - } - }, - "/api/jobs/tag_job/{id}": { - "post": { + }, "security": [ { "ApiKeyAuth": [] } - ], + ] + } + }, + "/api/jobs/tag_job/{id}": { + "post": { "description": "Adds tag(s) to a job specified by DB ID. Name and Type of Tag(s) can be chosen freely.\nTag Scope for frontend visibility will default to \"global\" if none entered, other options: \"admin\" or specific username.\nIf tagged job is already finished: Tag will be written directly to respective archive files.", "consumes": [ "application/json" @@ -628,7 +629,7 @@ "schema": { "type": "array", "items": { - "$ref": "#/definitions/api.ApiTag" + "$ref": "#/definitions/api.APITag" } } } @@ -664,16 +665,16 @@ "$ref": "#/definitions/api.ErrorResponse" } } - } - } - }, - "/api/jobs/{id}": { - "get": { + }, "security": [ { "ApiKeyAuth": [] } - ], + ] + } + }, + "/api/jobs/{id}": { + "get": { "description": "Job to get is specified by database ID\nReturns full job resource information according to 'Job' scheme and all metrics according to 'JobData'.", "produces": [ "application/json" @@ -701,7 +702,7 @@ "200": { "description": "Job resource", "schema": { - "$ref": "#/definitions/api.GetJobApiResponse" + "$ref": "#/definitions/api.GetJobAPIResponse" } }, "400": { @@ -740,14 +741,14 @@ "$ref": "#/definitions/api.ErrorResponse" } } - } - }, - "post": { + }, "security": [ { "ApiKeyAuth": [] } - ], + ] + }, + "post": { "description": "Job to get is specified by database ID\nReturns full job resource information according to 'Job' scheme and all metrics according to 'JobData'.", "consumes": [ "application/json" @@ -784,7 +785,7 @@ "200": { 
"description": "Job resource", "schema": { - "$ref": "#/definitions/api.GetJobApiResponse" + "$ref": "#/definitions/api.GetJobAPIResponse" } }, "400": { @@ -823,16 +824,16 @@ "$ref": "#/definitions/api.ErrorResponse" } } - } - } - }, - "/api/nodestats/": { - "post": { + }, "security": [ { "ApiKeyAuth": [] } - ], + ] + } + }, + "/api/nodestats/": { + "post": { "description": "Returns a JSON-encoded list of users.\nRequired query-parameter defines if all users or only users with additional special roles are returned.", "produces": [ "application/json" @@ -856,7 +857,7 @@ "200": { "description": "Success message", "schema": { - "$ref": "#/definitions/api.DefaultApiResponse" + "$ref": "#/definitions/api.DefaultAPIResponse" } }, "400": { @@ -883,16 +884,86 @@ "$ref": "#/definitions/api.ErrorResponse" } } - } - } - }, - "/api/users/": { - "get": { + }, "security": [ { "ApiKeyAuth": [] } + ] + } + }, + "/api/user/{id}": { + "post": { + "description": "Allows admins to add/remove roles and projects for a user", + "produces": [ + "text/plain" ], + "tags": [ + "User" + ], + "summary": "Update user roles and projects", + "parameters": [ + { + "type": "string", + "description": "Username", + "name": "id", + "in": "path", + "required": true + }, + { + "type": "string", + "description": "Role to add", + "name": "add-role", + "in": "formData" + }, + { + "type": "string", + "description": "Role to remove", + "name": "remove-role", + "in": "formData" + }, + { + "type": "string", + "description": "Project to add", + "name": "add-project", + "in": "formData" + }, + { + "type": "string", + "description": "Project to remove", + "name": "remove-project", + "in": "formData" + } + ], + "responses": { + "200": { + "description": "Success message", + "schema": { + "type": "string" + } + }, + "403": { + "description": "Forbidden", + "schema": { + "$ref": "#/definitions/api.ErrorResponse" + } + }, + "422": { + "description": "Unprocessable Entity", + "schema": { + "$ref": 
"#/definitions/api.ErrorResponse" + } + } + }, + "security": [ + { + "ApiKeyAuth": [] + } + ] + } + }, + "/api/users/": { + "get": { "description": "Returns a JSON-encoded list of users.\nRequired query-parameter defines if all users or only users with additional special roles are returned.", "produces": [ "application/json" @@ -916,7 +987,7 @@ "schema": { "type": "array", "items": { - "$ref": "#/definitions/api.ApiReturnedUser" + "$ref": "#/definitions/api.APIReturnedUser" } } }, @@ -944,16 +1015,361 @@ "type": "string" } } - } - } - }, - "/jobs/tag_job/{id}": { - "delete": { + }, "security": [ { "ApiKeyAuth": [] } + ] + }, + "post": { + "description": "Creates a new user with specified credentials and role", + "produces": [ + "text/plain" ], + "tags": [ + "User" + ], + "summary": "Create a new user", + "parameters": [ + { + "type": "string", + "description": "Username", + "name": "username", + "in": "formData", + "required": true + }, + { + "type": "string", + "description": "Password (not required for API users)", + "name": "password", + "in": "formData" + }, + { + "type": "string", + "description": "User role", + "name": "role", + "in": "formData", + "required": true + }, + { + "type": "string", + "description": "Full name", + "name": "name", + "in": "formData" + }, + { + "type": "string", + "description": "Email address", + "name": "email", + "in": "formData" + }, + { + "type": "string", + "description": "Project (required for managers)", + "name": "project", + "in": "formData" + } + ], + "responses": { + "200": { + "description": "Success message", + "schema": { + "type": "string" + } + }, + "400": { + "description": "Bad Request", + "schema": { + "$ref": "#/definitions/api.ErrorResponse" + } + }, + "403": { + "description": "Forbidden", + "schema": { + "$ref": "#/definitions/api.ErrorResponse" + } + }, + "422": { + "description": "Unprocessable Entity", + "schema": { + "$ref": "#/definitions/api.ErrorResponse" + } + } + }, + "security": [ + { + "ApiKeyAuth": 
[] + } + ] + }, + "delete": { + "description": "Deletes a user from the system", + "produces": [ + "text/plain" + ], + "tags": [ + "User" + ], + "summary": "Delete a user", + "parameters": [ + { + "type": "string", + "description": "Username to delete", + "name": "username", + "in": "formData", + "required": true + } + ], + "responses": { + "200": { + "description": "Success", + "schema": { + "type": "string" + } + }, + "403": { + "description": "Forbidden", + "schema": { + "$ref": "#/definitions/api.ErrorResponse" + } + }, + "422": { + "description": "Unprocessable Entity", + "schema": { + "$ref": "#/definitions/api.ErrorResponse" + } + } + }, + "security": [ + { + "ApiKeyAuth": [] + } + ] + } + }, + "/configuration/": { + "post": { + "description": "Updates a user's configuration key-value pair.", + "consumes": [ + "multipart/form-data" + ], + "produces": [ + "text/plain" + ], + "tags": [ + "Frontend" + ], + "summary": "Update user configuration", + "parameters": [ + { + "type": "string", + "description": "Configuration key", + "name": "key", + "in": "formData", + "required": true + }, + { + "type": "string", + "description": "Configuration value", + "name": "value", + "in": "formData", + "required": true + } + ], + "responses": { + "200": { + "description": "success", + "schema": { + "type": "string" + } + }, + "500": { + "description": "Internal Server Error", + "schema": { + "$ref": "#/definitions/api.ErrorResponse" + } + } + }, + "security": [ + { + "ApiKeyAuth": [] + } + ] + } + }, + "/debug/": { + "post": { + "description": "This endpoint allows the users to print the content of", + "produces": [ + "application/json" + ], + "tags": [ + "debug" + ], + "summary": "Debug endpoint", + "parameters": [ + { + "type": "string", + "description": "Selector", + "name": "selector", + "in": "query" + } + ], + "responses": { + "200": { + "description": "Debug dump", + "schema": { + "type": "string" + } + }, + "400": { + "description": "Bad Request", + "schema": { + 
"$ref": "#/definitions/api.ErrorResponse" + } + }, + "401": { + "description": "Unauthorized", + "schema": { + "$ref": "#/definitions/api.ErrorResponse" + } + }, + "403": { + "description": "Forbidden", + "schema": { + "$ref": "#/definitions/api.ErrorResponse" + } + }, + "500": { + "description": "Internal Server Error", + "schema": { + "$ref": "#/definitions/api.ErrorResponse" + } + } + }, + "security": [ + { + "ApiKeyAuth": [] + } + ] + } + }, + "/free/": { + "post": { + "description": "This endpoint allows the users to free the Buffers from the", + "produces": [ + "application/json" + ], + "tags": [ + "free" + ], + "parameters": [ + { + "type": "string", + "description": "up to timestamp", + "name": "to", + "in": "query" + } + ], + "responses": { + "200": { + "description": "ok", + "schema": { + "type": "string" + } + }, + "400": { + "description": "Bad Request", + "schema": { + "$ref": "#/definitions/api.ErrorResponse" + } + }, + "401": { + "description": "Unauthorized", + "schema": { + "$ref": "#/definitions/api.ErrorResponse" + } + }, + "403": { + "description": "Forbidden", + "schema": { + "$ref": "#/definitions/api.ErrorResponse" + } + }, + "500": { + "description": "Internal Server Error", + "schema": { + "$ref": "#/definitions/api.ErrorResponse" + } + } + }, + "security": [ + { + "ApiKeyAuth": [] + } + ] + } + }, + "/healthcheck/": { + "get": { + "description": "This endpoint allows the users to check if a node is healthy", + "produces": [ + "application/json" + ], + "tags": [ + "healthcheck" + ], + "summary": "HealthCheck endpoint", + "parameters": [ + { + "type": "string", + "description": "Selector", + "name": "selector", + "in": "query" + } + ], + "responses": { + "200": { + "description": "Debug dump", + "schema": { + "type": "string" + } + }, + "400": { + "description": "Bad Request", + "schema": { + "$ref": "#/definitions/api.ErrorResponse" + } + }, + "401": { + "description": "Unauthorized", + "schema": { + "$ref": 
"#/definitions/api.ErrorResponse" + } + }, + "403": { + "description": "Forbidden", + "schema": { + "$ref": "#/definitions/api.ErrorResponse" + } + }, + "500": { + "description": "Internal Server Error", + "schema": { + "$ref": "#/definitions/api.ErrorResponse" + } + } + }, + "security": [ + { + "ApiKeyAuth": [] + } + ] + } + }, + "/jobs/tag_job/{id}": { + "delete": { "description": "Removes tag(s) from a job specified by DB ID. Name and Type of Tag(s) must match.\nTag Scope is required for matching, options: \"global\", \"admin\". Private tags can not be deleted via API.\nIf tagged job is already finished: Tag will be removed from respective archive files.", "consumes": [ "application/json" @@ -981,7 +1397,7 @@ "schema": { "type": "array", "items": { - "$ref": "#/definitions/api.ApiTag" + "$ref": "#/definitions/api.APITag" } } } @@ -1017,16 +1433,276 @@ "$ref": "#/definitions/api.ErrorResponse" } } - } - } - }, - "/tags/": { - "delete": { + }, "security": [ { "ApiKeyAuth": [] } + ] + } + }, + "/jwt/": { + "get": { + "description": "Generates a JWT token for a user. 
Admins can generate tokens for any user, regular users only for themselves.", + "consumes": [ + "multipart/form-data" ], + "produces": [ + "text/plain" + ], + "tags": [ + "Frontend" + ], + "summary": "Generate JWT token", + "parameters": [ + { + "type": "string", + "description": "Username to generate JWT for", + "name": "username", + "in": "formData", + "required": true + } + ], + "responses": { + "200": { + "description": "JWT token", + "schema": { + "type": "string" + } + }, + "403": { + "description": "Forbidden", + "schema": { + "$ref": "#/definitions/api.ErrorResponse" + } + }, + "404": { + "description": "User Not Found", + "schema": { + "$ref": "#/definitions/api.ErrorResponse" + } + }, + "500": { + "description": "Internal Server Error", + "schema": { + "$ref": "#/definitions/api.ErrorResponse" + } + } + }, + "security": [ + { + "ApiKeyAuth": [] + } + ] + } + }, + "/machine_state/{cluster}/{host}": { + "get": { + "description": "Retrieves stored machine state data for a specific cluster node. Validates cluster and host names to prevent path traversal.", + "produces": [ + "application/json" + ], + "tags": [ + "Machine State" + ], + "summary": "Retrieve machine state", + "parameters": [ + { + "type": "string", + "description": "Cluster name", + "name": "cluster", + "in": "path", + "required": true + }, + { + "type": "string", + "description": "Host name", + "name": "host", + "in": "path", + "required": true + } + ], + "responses": { + "200": { + "description": "Machine state JSON data", + "schema": { + "type": "object" + } + }, + "400": { + "description": "Bad Request", + "schema": { + "$ref": "#/definitions/api.ErrorResponse" + } + }, + "404": { + "description": "Machine state not enabled or file not found", + "schema": { + "$ref": "#/definitions/api.ErrorResponse" + } + } + }, + "security": [ + { + "ApiKeyAuth": [] + } + ] + }, + "put": { + "description": "Stores machine state data for a specific cluster node. 
Validates cluster and host names to prevent path traversal.", + "consumes": [ + "application/json" + ], + "produces": [ + "text/plain" + ], + "tags": [ + "Machine State" + ], + "summary": "Store machine state", + "parameters": [ + { + "type": "string", + "description": "Cluster name", + "name": "cluster", + "in": "path", + "required": true + }, + { + "type": "string", + "description": "Host name", + "name": "host", + "in": "path", + "required": true + } + ], + "responses": { + "201": { + "description": "Created" + }, + "400": { + "description": "Bad Request", + "schema": { + "$ref": "#/definitions/api.ErrorResponse" + } + }, + "404": { + "description": "Machine state not enabled", + "schema": { + "$ref": "#/definitions/api.ErrorResponse" + } + }, + "500": { + "description": "Internal Server Error", + "schema": { + "$ref": "#/definitions/api.ErrorResponse" + } + } + }, + "security": [ + { + "ApiKeyAuth": [] + } + ] + } + }, + "/notice/": { + "post": { + "description": "Updates the notice.txt file content. Only admins are allowed. 
Content is limited to 10000 characters.", + "consumes": [ + "multipart/form-data" + ], + "produces": [ + "text/plain" + ], + "tags": [ + "Config" + ], + "summary": "Update system notice", + "parameters": [ + { + "type": "string", + "description": "New notice content (max 10000 characters)", + "name": "new-content", + "in": "formData", + "required": true + } + ], + "responses": { + "200": { + "description": "Update Notice Content Success", + "schema": { + "type": "string" + } + }, + "400": { + "description": "Bad Request", + "schema": { + "$ref": "#/definitions/api.ErrorResponse" + } + }, + "403": { + "description": "Forbidden", + "schema": { + "$ref": "#/definitions/api.ErrorResponse" + } + }, + "500": { + "description": "Internal Server Error", + "schema": { + "$ref": "#/definitions/api.ErrorResponse" + } + } + }, + "security": [ + { + "ApiKeyAuth": [] + } + ] + } + }, + "/roles/": { + "get": { + "description": "Returns a list of valid user roles. Only admins are allowed.", + "produces": [ + "application/json" + ], + "tags": [ + "Config" + ], + "summary": "Get available roles", + "responses": { + "200": { + "description": "List of role names", + "schema": { + "type": "array", + "items": { + "type": "string" + } + } + }, + "403": { + "description": "Forbidden", + "schema": { + "$ref": "#/definitions/api.ErrorResponse" + } + }, + "500": { + "description": "Internal Server Error", + "schema": { + "$ref": "#/definitions/api.ErrorResponse" + } + } + }, + "security": [ + { + "ApiKeyAuth": [] + } + ] + } + }, + "/tags/": { + "delete": { "description": "Removes tags by type and name. Name and Type of Tag(s) must match.\nTag Scope is required for matching, options: \"global\", \"admin\". 
Private tags can not be deleted via API.\nTag wills be removed from respective archive files.", "consumes": [ "application/json" @@ -1047,7 +1723,7 @@ "schema": { "type": "array", "items": { - "$ref": "#/definitions/api.ApiTag" + "$ref": "#/definitions/api.APITag" } } } @@ -1083,12 +1759,72 @@ "$ref": "#/definitions/api.ErrorResponse" } } - } + }, + "security": [ + { + "ApiKeyAuth": [] + } + ] + } + }, + "/write/": { + "post": { + "consumes": [ + "text/plain" + ], + "produces": [ + "application/json" + ], + "parameters": [ + { + "type": "string", + "description": "If the lines in the body do not have a cluster tag, use this value instead.", + "name": "cluster", + "in": "query" + } + ], + "responses": { + "200": { + "description": "ok", + "schema": { + "type": "string" + } + }, + "400": { + "description": "Bad Request", + "schema": { + "$ref": "#/definitions/api.ErrorResponse" + } + }, + "401": { + "description": "Unauthorized", + "schema": { + "$ref": "#/definitions/api.ErrorResponse" + } + }, + "403": { + "description": "Forbidden", + "schema": { + "$ref": "#/definitions/api.ErrorResponse" + } + }, + "500": { + "description": "Internal Server Error", + "schema": { + "$ref": "#/definitions/api.ErrorResponse" + } + } + }, + "security": [ + { + "ApiKeyAuth": [] + } + ] } } }, "definitions": { - "api.ApiReturnedUser": { + "api.APIReturnedUser": { "type": "object", "properties": { "email": { @@ -1114,7 +1850,7 @@ } } }, - "api.ApiTag": { + "api.APITag": { "type": "object", "properties": { "name": { @@ -1134,7 +1870,7 @@ } } }, - "api.DefaultApiResponse": { + "api.DefaultAPIResponse": { "type": "object", "properties": { "msg": { @@ -1142,7 +1878,7 @@ } } }, - "api.DeleteJobApiRequest": { + "api.DeleteJobAPIRequest": { "type": "object", "required": [ "jobId" @@ -1191,7 +1927,7 @@ } } }, - "api.GetClustersApiResponse": { + "api.GetClustersAPIResponse": { "type": "object", "properties": { "clusters": { @@ -1203,7 +1939,7 @@ } } }, - "api.GetJobApiResponse": { + 
"api.GetJobAPIResponse": { "type": "object", "properties": { "data": { @@ -1217,7 +1953,7 @@ } } }, - "api.GetJobsApiResponse": { + "api.GetJobsAPIResponse": { "type": "object", "properties": { "items": { @@ -1251,39 +1987,7 @@ } } }, - "api.Node": { - "type": "object", - "properties": { - "cpusAllocated": { - "type": "integer" - }, - "cpusTotal": { - "type": "integer" - }, - "gpusAllocated": { - "type": "integer" - }, - "gpusTotal": { - "type": "integer" - }, - "hostname": { - "type": "string" - }, - "memoryAllocated": { - "type": "integer" - }, - "memoryTotal": { - "type": "integer" - }, - "states": { - "type": "array", - "items": { - "type": "string" - } - } - } - }, - "api.StopJobApiRequest": { + "api.StopJobAPIRequest": { "type": "object", "required": [ "jobState", @@ -1326,7 +2030,7 @@ "nodes": { "type": "array", "items": { - "$ref": "#/definitions/api.Node" + "$ref": "#/definitions/schema.NodePayload" } } } @@ -1335,12 +2039,15 @@ "type": "object", "properties": { "id": { + "description": "Unique identifier for the accelerator (e.g., \"0\", \"1\", \"GPU-0\")", "type": "string" }, "model": { + "description": "Specific model name (e.g., \"A100\", \"MI100\")", "type": "string" }, "type": { + "description": "Type of accelerator (e.g., \"Nvidia GPU\", \"AMD GPU\")", "type": "string" } } @@ -1349,15 +2056,18 @@ "type": "object", "properties": { "metricConfig": { + "description": "Cluster-wide metric configurations", "type": "array", "items": { "$ref": "#/definitions/schema.MetricConfig" } }, "name": { + "description": "Unique cluster name (e.g., \"fritz\", \"alex\")", "type": "string" }, "subClusters": { + "description": "Homogeneous partitions within the cluster", "type": "array", "items": { "$ref": "#/definitions/schema.SubCluster" @@ -1366,6 +2076,7 @@ } }, "schema.Job": { + "description": "Information of a HPC job.", "type": "object", "properties": { "arrayJobId": { @@ -1394,6 +2105,13 @@ "format": "float64" } }, + "exclusive": { + "description": "for 
backwards compatibility", + "type": "integer", + "maximum": 2, + "minimum": 0, + "example": 1 + }, "footprint": { "type": "object", "additionalProperties": { @@ -1416,7 +2134,7 @@ "deadline", "failed", "node_fail", - "out_of_memory", + "out-of-memory", "pending", "preempted", "running", @@ -1528,9 +2246,11 @@ "type": "object", "properties": { "id": { + "description": "Internal database ID", "type": "integer" }, "jobId": { + "description": "The job's external job ID", "type": "integer" } } @@ -1539,9 +2259,11 @@ "type": "object", "properties": { "count": { + "description": "Total count of available items", "type": "integer" }, "items": { + "description": "List of job links", "type": "array", "items": { "$ref": "#/definitions/schema.JobLink" @@ -1553,19 +2275,31 @@ "type": "object", "properties": { "series": { + "description": "Individual time series data", "type": "array", "items": { "$ref": "#/definitions/schema.Series" } }, "statisticsSeries": { - "$ref": "#/definitions/schema.StatsSeries" + "description": "Aggregated statistics over time", + "allOf": [ + { + "$ref": "#/definitions/schema.StatsSeries" + } + ] }, "timestep": { + "description": "Sampling interval in seconds", "type": "integer" }, "unit": { - "$ref": "#/definitions/schema.Unit" + "description": "Unit of measurement", + "allOf": [ + { + "$ref": "#/definitions/schema.Unit" + } + ] } } }, @@ -1631,46 +2365,71 @@ "type": "object", "properties": { "aggregation": { + "description": "Aggregation function (avg, sum, min, max)", "type": "string" }, "alert": { + "description": "Alert threshold (requires attention)", "type": "number" }, "caution": { + "description": "Caution threshold (concerning but not critical)", "type": "number" }, "energy": { + "description": "Energy measurement method", "type": "string" }, "footprint": { + "description": "Footprint category", "type": "string" }, "lowerIsBetter": { + "description": "Whether lower values are better", "type": "boolean" }, "name": { + "description": "Metric 
name (e.g., \"cpu_load\", \"mem_used\")", "type": "string" }, "normal": { + "description": "Normal/typical value (good performance)", "type": "number" }, "peak": { + "description": "Peak/maximum expected value (best performance)", "type": "number" }, + "restrict": { + "description": "Restrict visibility to non user roles", + "type": "boolean" + }, "scope": { - "$ref": "#/definitions/schema.MetricScope" + "description": "Metric scope (node, socket, core, etc.)", + "allOf": [ + { + "$ref": "#/definitions/schema.MetricScope" + } + ] }, "subClusters": { + "description": "Subcluster-specific overrides", "type": "array", "items": { "$ref": "#/definitions/schema.SubClusterConfig" } }, "timestep": { + "description": "Measurement interval in seconds", "type": "integer" }, "unit": { - "$ref": "#/definitions/schema.Unit" + "description": "Unit of measurement", + "allOf": [ + { + "$ref": "#/definitions/schema.Unit" + } + ] } } }, @@ -1699,12 +2458,15 @@ "type": "object", "properties": { "avg": { + "description": "Average/mean value", "type": "number" }, "max": { + "description": "Maximum value", "type": "number" }, "min": { + "description": "Minimum value", "type": "number" } } @@ -1713,30 +2475,72 @@ "type": "object", "properties": { "unit": { - "$ref": "#/definitions/schema.Unit" + "description": "Unit of measurement (e.g., FLOP/s, GB/s)", + "allOf": [ + { + "$ref": "#/definitions/schema.Unit" + } + ] }, "value": { + "description": "Numeric value of the measurement", "type": "number" } } }, + "schema.NodePayload": { + "type": "object", + "properties": { + "cpusAllocated": { + "description": "Number of allocated CPUs", + "type": "integer" + }, + "gpusAllocated": { + "description": "Number of allocated GPUs", + "type": "integer" + }, + "hostname": { + "description": "Node hostname", + "type": "string" + }, + "jobsRunning": { + "description": "Number of running jobs", + "type": "integer" + }, + "memoryAllocated": { + "description": "Allocated memory in MB", + "type": "integer" 
+ }, + "states": { + "description": "State strings (flexible format)", + "type": "array", + "items": { + "type": "string" + } + } + } + }, "schema.Resource": { "description": "A resource used by a job", "type": "object", "properties": { "accelerators": { + "description": "Allocated accelerator IDs (e.g., GPU IDs)", "type": "array", "items": { "type": "string" } }, "configuration": { + "description": "Optional configuration identifier", "type": "string" }, "hostname": { + "description": "Node hostname", "type": "string" }, "hwthreads": { + "description": "Allocated hardware thread IDs", "type": "array", "items": { "type": "integer" @@ -1748,19 +2552,27 @@ "type": "object", "properties": { "data": { + "description": "Time series measurements", "type": "array", "items": { "type": "number" } }, "hostname": { + "description": "Source hostname", "type": "string" }, "id": { + "description": "Optional ID (e.g., core ID, GPU ID)", "type": "string" }, "statistics": { - "$ref": "#/definitions/schema.MetricStatistics" + "description": "Statistical summary (min/avg/max)", + "allOf": [ + { + "$ref": "#/definitions/schema.MetricStatistics" + } + ] } } }, @@ -1768,30 +2580,35 @@ "type": "object", "properties": { "max": { + "description": "Maximum values over time", "type": "array", "items": { "type": "number" } }, "mean": { + "description": "Mean values over time", "type": "array", "items": { "type": "number" } }, "median": { + "description": "Median values over time", "type": "array", "items": { "type": "number" } }, "min": { + "description": "Minimum values over time", "type": "array", "items": { "type": "number" } }, "percentiles": { + "description": "Percentile values over time (e.g., 10th, 50th, 90th)", "type": "object", "additionalProperties": { "type": "array", @@ -1807,52 +2624,81 @@ "type": "object", "properties": { "coresPerSocket": { + "description": "Number of cores per CPU socket", "type": "integer" }, "energyFootprint": { + "description": "Energy-related footprint 
metrics", "type": "array", "items": { "type": "string" } }, "flopRateScalar": { - "$ref": "#/definitions/schema.MetricValue" + "description": "Theoretical scalar FLOP rate per node", + "allOf": [ + { + "$ref": "#/definitions/schema.MetricValue" + } + ] }, "flopRateSimd": { - "$ref": "#/definitions/schema.MetricValue" + "description": "Theoretical SIMD FLOP rate per node", + "allOf": [ + { + "$ref": "#/definitions/schema.MetricValue" + } + ] }, "footprint": { + "description": "Default footprint metrics for jobs", "type": "array", "items": { "type": "string" } }, "memoryBandwidth": { - "$ref": "#/definitions/schema.MetricValue" + "description": "Theoretical memory bandwidth per node", + "allOf": [ + { + "$ref": "#/definitions/schema.MetricValue" + } + ] }, "metricConfig": { + "description": "Subcluster-specific metric configurations", "type": "array", "items": { "$ref": "#/definitions/schema.MetricConfig" } }, "name": { + "description": "Name of the subcluster (e.g., \"main\", \"gpu\", \"bigmem\")", "type": "string" }, "nodes": { + "description": "Node list in condensed format (e.g., \"node[001-100]\")", "type": "string" }, "processorType": { + "description": "CPU model (e.g., \"Intel Xeon Gold 6148\")", "type": "string" }, "socketsPerNode": { + "description": "Number of CPU sockets per node", "type": "integer" }, "threadsPerCore": { + "description": "Number of hardware threads per core (SMT level)", "type": "integer" }, "topology": { - "$ref": "#/definitions/schema.Topology" + "description": "Hardware topology of nodes in this subcluster", + "allOf": [ + { + "$ref": "#/definitions/schema.Topology" + } + ] } } }, @@ -1860,34 +2706,52 @@ "type": "object", "properties": { "alert": { + "description": "Alert threshold (requires attention)", "type": "number" }, "caution": { + "description": "Caution threshold (concerning but not critical)", "type": "number" }, "energy": { + "description": "Energy measurement configuration", "type": "string" }, "footprint": { + 
"description": "Footprint category for this metric", "type": "string" }, "lowerIsBetter": { + "description": "Whether lower values indicate better performance", "type": "boolean" }, "name": { + "description": "Metric name (e.g., \"cpu_load\", \"mem_used\")", "type": "string" }, "normal": { + "description": "Normal/typical value (good performance)", "type": "number" }, "peak": { + "description": "Peak/maximum expected value (best performance)", "type": "number" }, "remove": { + "description": "Whether to exclude this metric for this subcluster", + "type": "boolean" + }, + "restrict": { + "description": "Restrict visibility to non user roles", "type": "boolean" }, "unit": { - "$ref": "#/definitions/schema.Unit" + "description": "Unit of measurement", + "allOf": [ + { + "$ref": "#/definitions/schema.Unit" + } + ] } } }, @@ -1916,12 +2780,14 @@ "type": "object", "properties": { "accelerators": { + "description": "Attached accelerators (GPUs, etc.)", "type": "array", "items": { "$ref": "#/definitions/schema.Accelerator" } }, "core": { + "description": "Hardware threads grouped by core", "type": "array", "items": { "type": "array", @@ -1931,6 +2797,7 @@ } }, "die": { + "description": "Hardware threads grouped by die (optional)", "type": "array", "items": { "type": "array", @@ -1940,6 +2807,7 @@ } }, "memoryDomain": { + "description": "Hardware threads grouped by NUMA domain", "type": "array", "items": { "type": "array", @@ -1949,12 +2817,14 @@ } }, "node": { + "description": "All hardware thread IDs on this node", "type": "array", "items": { "type": "integer" } }, "socket": { + "description": "Hardware threads grouped by socket", "type": "array", "items": { "type": "array", @@ -1969,9 +2839,11 @@ "type": "object", "properties": { "base": { + "description": "Base unit (e.g., \"B/s\", \"F/s\", \"W\")", "type": "string" }, "prefix": { + "description": "SI prefix (e.g., \"G\", \"M\", \"K\", \"T\")", "type": "string" } } diff --git a/api/swagger.yaml b/api/swagger.yaml index 
119e9529..0bf60082 100644 --- a/api/swagger.yaml +++ b/api/swagger.yaml @@ -1,5 +1,5 @@ definitions: - api.ApiReturnedUser: + api.APIReturnedUser: properties: email: type: string @@ -16,7 +16,7 @@ definitions: username: type: string type: object - api.ApiTag: + api.APITag: properties: name: description: Tag Name @@ -31,12 +31,12 @@ definitions: example: Debug type: string type: object - api.DefaultApiResponse: + api.DefaultAPIResponse: properties: msg: type: string type: object - api.DeleteJobApiRequest: + api.DeleteJobAPIRequest: properties: cluster: description: Cluster of job @@ -71,7 +71,7 @@ definitions: description: Statustext of Errorcode type: string type: object - api.GetClustersApiResponse: + api.GetClustersAPIResponse: properties: clusters: description: Array of clusters @@ -79,7 +79,7 @@ definitions: $ref: '#/definitions/schema.Cluster' type: array type: object - api.GetJobApiResponse: + api.GetJobAPIResponse: properties: data: items: @@ -88,7 +88,7 @@ definitions: meta: $ref: '#/definitions/schema.Job' type: object - api.GetJobsApiResponse: + api.GetJobsAPIResponse: properties: items: description: Number of jobs returned @@ -111,28 +111,7 @@ definitions: scope: $ref: '#/definitions/schema.MetricScope' type: object - api.Node: - properties: - cpusAllocated: - type: integer - cpusTotal: - type: integer - gpusAllocated: - type: integer - gpusTotal: - type: integer - hostname: - type: string - memoryAllocated: - type: integer - memoryTotal: - type: integer - states: - items: - type: string - type: array - type: object - api.StopJobApiRequest: + api.StopJobAPIRequest: properties: cluster: example: fritz @@ -161,32 +140,39 @@ definitions: type: string nodes: items: - $ref: '#/definitions/api.Node' + $ref: '#/definitions/schema.NodePayload' type: array type: object schema.Accelerator: properties: id: + description: Unique identifier for the accelerator (e.g., "0", "1", "GPU-0") type: string model: + description: Specific model name (e.g., "A100", "MI100") 
type: string type: + description: Type of accelerator (e.g., "Nvidia GPU", "AMD GPU") type: string type: object schema.Cluster: properties: metricConfig: + description: Cluster-wide metric configurations items: $ref: '#/definitions/schema.MetricConfig' type: array name: + description: Unique cluster name (e.g., "fritz", "alex") type: string subClusters: + description: Homogeneous partitions within the cluster items: $ref: '#/definitions/schema.SubCluster' type: array type: object schema.Job: + description: Information of a HPC job. properties: arrayJobId: example: 123000 @@ -207,6 +193,12 @@ definitions: format: float64 type: number type: object + exclusive: + description: for backwards compatibility + example: 1 + maximum: 2 + minimum: 0 + type: integer footprint: additionalProperties: format: float64 @@ -227,7 +219,7 @@ definitions: - deadline - failed - node_fail - - out_of_memory + - out-of-memory - pending - preempted - running @@ -307,15 +299,19 @@ definitions: schema.JobLink: properties: id: + description: Internal database ID type: integer jobId: + description: The job's external job ID type: integer type: object schema.JobLinkResultList: properties: count: + description: Total count of available items type: integer items: + description: List of job links items: $ref: '#/definitions/schema.JobLink' type: array @@ -323,15 +319,21 @@ definitions: schema.JobMetric: properties: series: + description: Individual time series data items: $ref: '#/definitions/schema.Series' type: array statisticsSeries: - $ref: '#/definitions/schema.StatsSeries' + allOf: + - $ref: '#/definitions/schema.StatsSeries' + description: Aggregated statistics over time timestep: + description: Sampling interval in seconds type: integer unit: - $ref: '#/definitions/schema.Unit' + allOf: + - $ref: '#/definitions/schema.Unit' + description: Unit of measurement type: object schema.JobState: enum: @@ -385,33 +387,51 @@ definitions: schema.MetricConfig: properties: aggregation: + description: 
Aggregation function (avg, sum, min, max) type: string alert: + description: Alert threshold (requires attention) type: number caution: + description: Caution threshold (concerning but not critical) type: number energy: + description: Energy measurement method type: string footprint: + description: Footprint category type: string lowerIsBetter: + description: Whether lower values are better type: boolean name: + description: Metric name (e.g., "cpu_load", "mem_used") type: string normal: + description: Normal/typical value (good performance) type: number peak: + description: Peak/maximum expected value (best performance) type: number + restrict: + description: Restrict visibility to non user roles + type: boolean scope: - $ref: '#/definitions/schema.MetricScope' + allOf: + - $ref: '#/definitions/schema.MetricScope' + description: Metric scope (node, socket, core, etc.) subClusters: + description: Subcluster-specific overrides items: $ref: '#/definitions/schema.SubClusterConfig' type: array timestep: + description: Measurement interval in seconds type: integer unit: - $ref: '#/definitions/schema.Unit' + allOf: + - $ref: '#/definitions/schema.Unit' + description: Unit of measurement type: object schema.MetricScope: enum: @@ -434,31 +454,64 @@ definitions: schema.MetricStatistics: properties: avg: + description: Average/mean value type: number max: + description: Maximum value type: number min: + description: Minimum value type: number type: object schema.MetricValue: properties: unit: - $ref: '#/definitions/schema.Unit' + allOf: + - $ref: '#/definitions/schema.Unit' + description: Unit of measurement (e.g., FLOP/s, GB/s) value: + description: Numeric value of the measurement type: number type: object + schema.NodePayload: + properties: + cpusAllocated: + description: Number of allocated CPUs + type: integer + gpusAllocated: + description: Number of allocated GPUs + type: integer + hostname: + description: Node hostname + type: string + jobsRunning: + description: 
Number of running jobs + type: integer + memoryAllocated: + description: Allocated memory in MB + type: integer + states: + description: State strings (flexible format) + items: + type: string + type: array + type: object schema.Resource: description: A resource used by a job properties: accelerators: + description: Allocated accelerator IDs (e.g., GPU IDs) items: type: string type: array configuration: + description: Optional configuration identifier type: string hostname: + description: Node hostname type: string hwthreads: + description: Allocated hardware thread IDs items: type: integer type: array @@ -466,31 +519,40 @@ definitions: schema.Series: properties: data: + description: Time series measurements items: type: number type: array hostname: + description: Source hostname type: string id: + description: Optional ID (e.g., core ID, GPU ID) type: string statistics: - $ref: '#/definitions/schema.MetricStatistics' + allOf: + - $ref: '#/definitions/schema.MetricStatistics' + description: Statistical summary (min/avg/max) type: object schema.StatsSeries: properties: max: + description: Maximum values over time items: type: number type: array mean: + description: Mean values over time items: type: number type: array median: + description: Median values over time items: type: number type: array min: + description: Minimum values over time items: type: number type: array @@ -500,65 +562,97 @@ definitions: format: float64 type: number type: array + description: Percentile values over time (e.g., 10th, 50th, 90th) type: object type: object schema.SubCluster: properties: coresPerSocket: + description: Number of cores per CPU socket type: integer energyFootprint: + description: Energy-related footprint metrics items: type: string type: array flopRateScalar: - $ref: '#/definitions/schema.MetricValue' + allOf: + - $ref: '#/definitions/schema.MetricValue' + description: Theoretical scalar FLOP rate per node flopRateSimd: - $ref: '#/definitions/schema.MetricValue' + allOf: 
+ - $ref: '#/definitions/schema.MetricValue' + description: Theoretical SIMD FLOP rate per node footprint: + description: Default footprint metrics for jobs items: type: string type: array memoryBandwidth: - $ref: '#/definitions/schema.MetricValue' + allOf: + - $ref: '#/definitions/schema.MetricValue' + description: Theoretical memory bandwidth per node metricConfig: + description: Subcluster-specific metric configurations items: $ref: '#/definitions/schema.MetricConfig' type: array name: + description: Name of the subcluster (e.g., "main", "gpu", "bigmem") type: string nodes: + description: Node list in condensed format (e.g., "node[001-100]") type: string processorType: + description: CPU model (e.g., "Intel Xeon Gold 6148") type: string socketsPerNode: + description: Number of CPU sockets per node type: integer threadsPerCore: + description: Number of hardware threads per core (SMT level) type: integer topology: - $ref: '#/definitions/schema.Topology' + allOf: + - $ref: '#/definitions/schema.Topology' + description: Hardware topology of nodes in this subcluster type: object schema.SubClusterConfig: properties: alert: + description: Alert threshold (requires attention) type: number caution: + description: Caution threshold (concerning but not critical) type: number energy: + description: Energy measurement configuration type: string footprint: + description: Footprint category for this metric type: string lowerIsBetter: + description: Whether lower values indicate better performance type: boolean name: + description: Metric name (e.g., "cpu_load", "mem_used") type: string normal: + description: Normal/typical value (good performance) type: number peak: + description: Peak/maximum expected value (best performance) type: number remove: + description: Whether to exclude this metric for this subcluster + type: boolean + restrict: + description: Restrict visibility to non user roles type: boolean unit: - $ref: '#/definitions/schema.Unit' + allOf: + - $ref: 
'#/definitions/schema.Unit' + description: Unit of measurement type: object schema.Tag: description: Defines a tag using name and type. @@ -578,32 +672,38 @@ definitions: schema.Topology: properties: accelerators: + description: Attached accelerators (GPUs, etc.) items: $ref: '#/definitions/schema.Accelerator' type: array core: + description: Hardware threads grouped by core items: items: type: integer type: array type: array die: + description: Hardware threads grouped by die (optional) items: items: type: integer type: array type: array memoryDomain: + description: Hardware threads grouped by NUMA domain items: items: type: integer type: array type: array node: + description: All hardware thread IDs on this node items: type: integer type: array socket: + description: Hardware threads grouped by socket items: items: type: integer @@ -613,8 +713,10 @@ definitions: schema.Unit: properties: base: + description: Base unit (e.g., "B/s", "F/s", "W") type: string prefix: + description: SI prefix (e.g., "G", "M", "K", "T") type: string type: object host: localhost:8080 @@ -645,7 +747,7 @@ paths: "200": description: Array of clusters schema: - $ref: '#/definitions/api.GetClustersApiResponse' + $ref: '#/definitions/api.GetClustersAPIResponse' "400": description: Bad Request schema: @@ -710,7 +812,7 @@ paths: "200": description: Job array and page info schema: - $ref: '#/definitions/api.GetJobsApiResponse' + $ref: '#/definitions/api.GetJobsAPIResponse' "400": description: Bad Request schema: @@ -753,7 +855,7 @@ paths: "200": description: Job resource schema: - $ref: '#/definitions/api.GetJobApiResponse' + $ref: '#/definitions/api.GetJobAPIResponse' "400": description: Bad Request schema: @@ -810,7 +912,7 @@ paths: "200": description: Job resource schema: - $ref: '#/definitions/api.GetJobApiResponse' + $ref: '#/definitions/api.GetJobAPIResponse' "400": description: Bad Request schema: @@ -853,14 +955,14 @@ paths: name: request required: true schema: - $ref: 
'#/definitions/api.DeleteJobApiRequest' + $ref: '#/definitions/api.DeleteJobAPIRequest' produces: - application/json responses: "200": description: Success message schema: - $ref: '#/definitions/api.DefaultApiResponse' + $ref: '#/definitions/api.DefaultAPIResponse' "400": description: Bad Request schema: @@ -907,7 +1009,7 @@ paths: "200": description: Success message schema: - $ref: '#/definitions/api.DefaultApiResponse' + $ref: '#/definitions/api.DefaultAPIResponse' "400": description: Bad Request schema: @@ -948,13 +1050,17 @@ paths: name: ts required: true type: integer + - description: Omit jobs with tags from deletion + in: query + name: omit-tagged + type: boolean produces: - application/json responses: "200": description: Success message schema: - $ref: '#/definitions/api.DefaultApiResponse' + $ref: '#/definitions/api.DefaultAPIResponse' "400": description: Bad Request schema: @@ -1052,7 +1158,7 @@ paths: "201": description: Job added successfully schema: - $ref: '#/definitions/api.DefaultApiResponse' + $ref: '#/definitions/api.DefaultAPIResponse' "400": description: Bad Request schema: @@ -1090,7 +1196,7 @@ paths: name: request required: true schema: - $ref: '#/definitions/api.StopJobApiRequest' + $ref: '#/definitions/api.StopJobAPIRequest' produces: - application/json responses: @@ -1147,7 +1253,7 @@ paths: required: true schema: items: - $ref: '#/definitions/api.ApiTag' + $ref: '#/definitions/api.APITag' type: array produces: - application/json @@ -1195,7 +1301,7 @@ paths: "200": description: Success message schema: - $ref: '#/definitions/api.DefaultApiResponse' + $ref: '#/definitions/api.DefaultAPIResponse' "400": description: Bad Request schema: @@ -1217,7 +1323,80 @@ paths: summary: Deliver updated Slurm node states tags: - Nodestates + /api/user/{id}: + post: + description: Allows admins to add/remove roles and projects for a user + parameters: + - description: Username + in: path + name: id + required: true + type: string + - description: Role to add 
+ in: formData + name: add-role + type: string + - description: Role to remove + in: formData + name: remove-role + type: string + - description: Project to add + in: formData + name: add-project + type: string + - description: Project to remove + in: formData + name: remove-project + type: string + produces: + - text/plain + responses: + "200": + description: Success message + schema: + type: string + "403": + description: Forbidden + schema: + $ref: '#/definitions/api.ErrorResponse' + "422": + description: Unprocessable Entity + schema: + $ref: '#/definitions/api.ErrorResponse' + security: + - ApiKeyAuth: [] + summary: Update user roles and projects + tags: + - User /api/users/: + delete: + description: Deletes a user from the system + parameters: + - description: Username to delete + in: formData + name: username + required: true + type: string + produces: + - text/plain + responses: + "200": + description: Success + schema: + type: string + "403": + description: Forbidden + schema: + $ref: '#/definitions/api.ErrorResponse' + "422": + description: Unprocessable Entity + schema: + $ref: '#/definitions/api.ErrorResponse' + security: + - ApiKeyAuth: [] + summary: Delete a user + tags: + - User get: description: |- Returns a JSON-encoded list of users. 
@@ -1236,7 +1415,7 @@ paths: description: List of users returned successfully schema: items: - $ref: '#/definitions/api.ApiReturnedUser' + $ref: '#/definitions/api.APIReturnedUser' type: array "400": description: Bad Request @@ -1259,6 +1438,198 @@ paths: summary: Returns a list of users tags: - User + post: + description: Creates a new user with specified credentials and role + parameters: + - description: Username + in: formData + name: username + required: true + type: string + - description: Password (not required for API users) + in: formData + name: password + type: string + - description: User role + in: formData + name: role + required: true + type: string + - description: Full name + in: formData + name: name + type: string + - description: Email address + in: formData + name: email + type: string + - description: Project (required for managers) + in: formData + name: project + type: string + produces: + - text/plain + responses: + "200": + description: Success message + schema: + type: string + "400": + description: Bad Request + schema: + $ref: '#/definitions/api.ErrorResponse' + "403": + description: Forbidden + schema: + $ref: '#/definitions/api.ErrorResponse' + "422": + description: Unprocessable Entity + schema: + $ref: '#/definitions/api.ErrorResponse' + security: + - ApiKeyAuth: [] + summary: Create a new user + tags: + - User + /configuration/: + post: + consumes: + - multipart/form-data + description: Updates a user's configuration key-value pair. 
+ parameters: + - description: Configuration key + in: formData + name: key + required: true + type: string + - description: Configuration value + in: formData + name: value + required: true + type: string + produces: + - text/plain + responses: + "200": + description: success + schema: + type: string + "500": + description: Internal Server Error + schema: + $ref: '#/definitions/api.ErrorResponse' + security: + - ApiKeyAuth: [] + summary: Update user configuration + tags: + - Frontend + /debug/: + post: + description: This endpoint allows the users to print the content of + parameters: + - description: Selector + in: query + name: selector + type: string + produces: + - application/json + responses: + "200": + description: Debug dump + schema: + type: string + "400": + description: Bad Request + schema: + $ref: '#/definitions/api.ErrorResponse' + "401": + description: Unauthorized + schema: + $ref: '#/definitions/api.ErrorResponse' + "403": + description: Forbidden + schema: + $ref: '#/definitions/api.ErrorResponse' + "500": + description: Internal Server Error + schema: + $ref: '#/definitions/api.ErrorResponse' + security: + - ApiKeyAuth: [] + summary: Debug endpoint + tags: + - debug + /free/: + post: + description: This endpoint allows the users to free the Buffers from the + parameters: + - description: up to timestamp + in: query + name: to + type: string + produces: + - application/json + responses: + "200": + description: ok + schema: + type: string + "400": + description: Bad Request + schema: + $ref: '#/definitions/api.ErrorResponse' + "401": + description: Unauthorized + schema: + $ref: '#/definitions/api.ErrorResponse' + "403": + description: Forbidden + schema: + $ref: '#/definitions/api.ErrorResponse' + "500": + description: Internal Server Error + schema: + $ref: '#/definitions/api.ErrorResponse' + security: + - ApiKeyAuth: [] + tags: + - free + /healthcheck/: + get: + description: This endpoint allows the users to check if a node is healthy + 
parameters: + - description: Selector + in: query + name: selector + type: string + produces: + - application/json + responses: + "200": + description: Debug dump + schema: + type: string + "400": + description: Bad Request + schema: + $ref: '#/definitions/api.ErrorResponse' + "401": + description: Unauthorized + schema: + $ref: '#/definitions/api.ErrorResponse' + "403": + description: Forbidden + schema: + $ref: '#/definitions/api.ErrorResponse' + "500": + description: Internal Server Error + schema: + $ref: '#/definitions/api.ErrorResponse' + security: + - ApiKeyAuth: [] + summary: HealthCheck endpoint + tags: + - healthcheck /jobs/tag_job/{id}: delete: consumes: @@ -1279,7 +1650,7 @@ paths: required: true schema: items: - $ref: '#/definitions/api.ApiTag' + $ref: '#/definitions/api.APITag' type: array produces: - application/json @@ -1309,6 +1680,176 @@ paths: summary: Removes one or more tags from a job tags: - Job add and modify + /jwt/: + get: + consumes: + - multipart/form-data + description: Generates a JWT token for a user. Admins can generate tokens for + any user, regular users only for themselves. + parameters: + - description: Username to generate JWT for + in: formData + name: username + required: true + type: string + produces: + - text/plain + responses: + "200": + description: JWT token + schema: + type: string + "403": + description: Forbidden + schema: + $ref: '#/definitions/api.ErrorResponse' + "404": + description: User Not Found + schema: + $ref: '#/definitions/api.ErrorResponse' + "500": + description: Internal Server Error + schema: + $ref: '#/definitions/api.ErrorResponse' + security: + - ApiKeyAuth: [] + summary: Generate JWT token + tags: + - Frontend + /machine_state/{cluster}/{host}: + get: + description: Retrieves stored machine state data for a specific cluster node. + Validates cluster and host names to prevent path traversal. 
+ parameters: + - description: Cluster name + in: path + name: cluster + required: true + type: string + - description: Host name + in: path + name: host + required: true + type: string + produces: + - application/json + responses: + "200": + description: Machine state JSON data + schema: + type: object + "400": + description: Bad Request + schema: + $ref: '#/definitions/api.ErrorResponse' + "404": + description: Machine state not enabled or file not found + schema: + $ref: '#/definitions/api.ErrorResponse' + security: + - ApiKeyAuth: [] + summary: Retrieve machine state + tags: + - Machine State + put: + consumes: + - application/json + description: Stores machine state data for a specific cluster node. Validates + cluster and host names to prevent path traversal. + parameters: + - description: Cluster name + in: path + name: cluster + required: true + type: string + - description: Host name + in: path + name: host + required: true + type: string + produces: + - text/plain + responses: + "201": + description: Created + "400": + description: Bad Request + schema: + $ref: '#/definitions/api.ErrorResponse' + "404": + description: Machine state not enabled + schema: + $ref: '#/definitions/api.ErrorResponse' + "500": + description: Internal Server Error + schema: + $ref: '#/definitions/api.ErrorResponse' + security: + - ApiKeyAuth: [] + summary: Store machine state + tags: + - Machine State + /notice/: + post: + consumes: + - multipart/form-data + description: Updates the notice.txt file content. Only admins are allowed. Content + is limited to 10000 characters. 
+ parameters: + - description: New notice content (max 10000 characters) + in: formData + name: new-content + required: true + type: string + produces: + - text/plain + responses: + "200": + description: Update Notice Content Success + schema: + type: string + "400": + description: Bad Request + schema: + $ref: '#/definitions/api.ErrorResponse' + "403": + description: Forbidden + schema: + $ref: '#/definitions/api.ErrorResponse' + "500": + description: Internal Server Error + schema: + $ref: '#/definitions/api.ErrorResponse' + security: + - ApiKeyAuth: [] + summary: Update system notice + tags: + - Config + /roles/: + get: + description: Returns a list of valid user roles. Only admins are allowed. + produces: + - application/json + responses: + "200": + description: List of role names + schema: + items: + type: string + type: array + "403": + description: Forbidden + schema: + $ref: '#/definitions/api.ErrorResponse' + "500": + description: Internal Server Error + schema: + $ref: '#/definitions/api.ErrorResponse' + security: + - ApiKeyAuth: [] + summary: Get available roles + tags: + - Config /tags/: delete: consumes: @@ -1324,7 +1865,7 @@ paths: required: true schema: items: - $ref: '#/definitions/api.ApiTag' + $ref: '#/definitions/api.APITag' type: array produces: - text/plain @@ -1354,6 +1895,41 @@ paths: summary: Removes all tags and job-relations for type:name tuple tags: - Tag remove + /write/: + post: + consumes: + - text/plain + parameters: + - description: If the lines in the body do not have a cluster tag, use this + value instead. 
+ in: query + name: cluster + type: string + produces: + - application/json + responses: + "200": + description: ok + schema: + type: string + "400": + description: Bad Request + schema: + $ref: '#/definitions/api.ErrorResponse' + "401": + description: Unauthorized + schema: + $ref: '#/definitions/api.ErrorResponse' + "403": + description: Forbidden + schema: + $ref: '#/definitions/api.ErrorResponse' + "500": + description: Internal Server Error + schema: + $ref: '#/definitions/api.ErrorResponse' + security: + - ApiKeyAuth: [] securityDefinitions: ApiKeyAuth: in: header diff --git a/go.mod b/go.mod index 808b2e7a..479f1644 100644 --- a/go.mod +++ b/go.mod @@ -11,7 +11,7 @@ tool ( require ( github.com/99designs/gqlgen v0.17.85 - github.com/ClusterCockpit/cc-lib/v2 v2.0.0 + github.com/ClusterCockpit/cc-lib/v2 v2.1.0 github.com/Masterminds/squirrel v1.5.4 github.com/aws/aws-sdk-go-v2 v1.41.1 github.com/aws/aws-sdk-go-v2/config v1.32.6 @@ -109,7 +109,6 @@ require ( github.com/urfave/cli/v2 v2.27.7 // indirect github.com/urfave/cli/v3 v3.6.1 // indirect github.com/xrash/smetrics v0.0.0-20250705151800-55b8f293f342 // indirect - github.com/xtgo/set v1.0.0 // indirect go.yaml.in/yaml/v2 v2.4.3 // indirect go.yaml.in/yaml/v3 v3.0.4 // indirect golang.org/x/exp v0.0.0-20250620022241-b7579e27df2b // indirect @@ -119,7 +118,6 @@ require ( golang.org/x/sys v0.39.0 // indirect golang.org/x/text v0.32.0 // indirect golang.org/x/tools v0.40.0 // indirect - google.golang.org/protobuf v1.36.11 // indirect gopkg.in/yaml.v3 v3.0.1 // indirect sigs.k8s.io/yaml v1.6.0 // indirect ) diff --git a/go.sum b/go.sum index 39571309..ef89e2d2 100644 --- a/go.sum +++ b/go.sum @@ -2,12 +2,10 @@ filippo.io/edwards25519 v1.1.0 h1:FNf4tywRC1HmFuKW5xopWpigGjJKiJSV0Cqo0cJWDaA= filippo.io/edwards25519 v1.1.0/go.mod h1:BxyFTGdWcka3PhytdK4V28tE5sGfRvvvRV7EaN4VDT4= github.com/99designs/gqlgen v0.17.85 h1:EkGx3U2FDcxQm8YDLQSpXIAVmpDyZ3IcBMOJi2nH1S0= github.com/99designs/gqlgen v0.17.85/go.mod 
h1:yvs8s0bkQlRfqg03YXr3eR4OQUowVhODT/tHzCXnbOU= -github.com/Azure/go-ansiterm v0.0.0-20230124172434-306776ec8161 h1:L/gRVlceqvL25UVaW/CKtUDjefjrs0SPonmDGUVOYP0= -github.com/Azure/go-ansiterm v0.0.0-20230124172434-306776ec8161/go.mod h1:xomTg63KZ2rFqZQzSB4Vz2SUXa1BpHTVz9L5PTmPC4E= github.com/Azure/go-ntlmssp v0.0.0-20221128193559-754e69321358 h1:mFRzDkZVAjdal+s7s0MwaRv9igoPqLRdzOLzw/8Xvq8= github.com/Azure/go-ntlmssp v0.0.0-20221128193559-754e69321358/go.mod h1:chxPXzSsl7ZWRAuOIE23GDNzjWuZquvFlgA8xmpunjU= -github.com/ClusterCockpit/cc-lib/v2 v2.0.0 h1:OjDADx8mf9SflqeeKUuhy5pamu4YDucae6wUX6vvNNA= -github.com/ClusterCockpit/cc-lib/v2 v2.0.0/go.mod h1:JuxMAuEOaLLNEnnL9U3ejha8kMvsSatLdKPZEgJw6iw= +github.com/ClusterCockpit/cc-lib/v2 v2.1.0 h1:B6l6h0IjfEuY9DU6aVM3fSsj24lQ1eudXK9QTKmJjqg= +github.com/ClusterCockpit/cc-lib/v2 v2.1.0/go.mod h1:JuxMAuEOaLLNEnnL9U3ejha8kMvsSatLdKPZEgJw6iw= github.com/KyleBanks/depth v1.2.1 h1:5h8fQADFrWtarTdtDudMmGsC7GPbOAu6RVB3ffsVFHc= github.com/KyleBanks/depth v1.2.1/go.mod h1:jzSb9d0L43HxTQfT+oSA1EEp2q+ne2uh6XgeJcm8brE= github.com/Masterminds/squirrel v1.5.4 h1:uUcX/aBc8O7Fg9kaISIUsHXdKuqehiXAMQTYX8afzqM= @@ -74,10 +72,6 @@ github.com/beorn7/perks v1.0.1/go.mod h1:G2ZrVWU2WbWT9wwq4/hrbKbnv/1ERSJQ0ibhJ6r github.com/bmatcuk/doublestar v1.1.1/go.mod h1:UD6OnuiIn0yFxxA2le/rnRU1G4RaI4UvFv1sNto9p6w= github.com/cespare/xxhash/v2 v2.3.0 h1:UL815xU9SqsFlibzuggzjXhog7bL6oX9BbNZnL2UFvs= github.com/cespare/xxhash/v2 v2.3.0/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs= -github.com/containerd/errdefs v1.0.0 h1:tg5yIfIlQIrxYtu9ajqY42W3lpS19XqdxRQeEwYG8PI= -github.com/containerd/errdefs v1.0.0/go.mod h1:+YBYIdtsnF4Iw6nWZhJcqGSg/dwvV7tyJ/kCkyJ2k+M= -github.com/containerd/errdefs/pkg v0.3.0 h1:9IKJ06FvyNlexW690DXuQNx2KA2cUJXx151Xdx3ZPPE= -github.com/containerd/errdefs/pkg v0.3.0/go.mod h1:NJw6s9HwNuRhnjJhM7pylWwMyAkmCQvQ4GpJHEqRLVk= github.com/coreos/go-oidc/v3 v3.17.0 h1:hWBGaQfbi0iVviX4ibC7bk8OKT5qNr4klBaCHVNvehc= github.com/coreos/go-oidc/v3 
v3.17.0/go.mod h1:wqPbKFrVnE90vty060SB40FCJ8fTHTxSwyXJqZH+sI8= github.com/cpuguy83/go-md2man/v2 v2.0.7 h1:zbFlGlXEAKlwXpmvle3d8Oe3YnkKIK4xSRTd3sHPnBo= @@ -89,16 +83,6 @@ github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc h1:U9qPSI2PIWSS1 github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/dgryski/trifles v0.0.0-20230903005119-f50d829f2e54 h1:SG7nF6SRlWhcT7cNTs5R6Hk4V2lcmLz2NsG2VnInyNo= github.com/dgryski/trifles v0.0.0-20230903005119-f50d829f2e54/go.mod h1:if7Fbed8SFyPtHLHbg49SI7NAdJiC5WIA09pe59rfAA= -github.com/dhui/dktest v0.4.6 h1:+DPKyScKSEp3VLtbMDHcUq6V5Lm5zfZZVb0Sk7Ahom4= -github.com/dhui/dktest v0.4.6/go.mod h1:JHTSYDtKkvFNFHJKqCzVzqXecyv+tKt8EzceOmQOgbU= -github.com/distribution/reference v0.6.0 h1:0IXCQ5g4/QMHHkarYzh5l+u8T3t73zM5QvfrDyIgxBk= -github.com/distribution/reference v0.6.0/go.mod h1:BbU0aIcezP1/5jX/8MP0YiH4SdvB5Y4f/wlDRiLyi3E= -github.com/docker/docker v28.3.3+incompatible h1:Dypm25kh4rmk49v1eiVbsAtpAsYURjYkaKubwuBdxEI= -github.com/docker/docker v28.3.3+incompatible/go.mod h1:eEKB0N0r5NX/I1kEveEz05bcu8tLC/8azJZsviup8Sk= -github.com/docker/go-connections v0.5.0 h1:USnMq7hx7gwdVZq1L49hLXaFtUdTADjXGp+uj1Br63c= -github.com/docker/go-connections v0.5.0/go.mod h1:ov60Kzw0kKElRwhNs9UlUHAE/F9Fe6GLaXnqyDdmEXc= -github.com/docker/go-units v0.5.0 h1:69rxXcBk27SvSaaxTtLh/8llcHD8vYHT7WSdRZ/jvr4= -github.com/docker/go-units v0.5.0/go.mod h1:fgPhTUdO+D/Jk86RDLlptpiXQzgHJF7gydDDbaIK4Dk= github.com/expr-lang/expr v1.17.7 h1:Q0xY/e/2aCIp8g9s/LGvMDCC5PxYlvHgDZRQ4y16JX8= github.com/expr-lang/expr v1.17.7/go.mod h1:8/vRC7+7HBzESEqt5kKpYXxrxkr31SaO8r40VO/1IT4= github.com/felixge/httpsnoop v1.0.4 h1:NFTV2Zj1bL4mc9sqWACXbQFVBBg2W3GPvqp8/ESS2Wg= @@ -236,17 +220,8 @@ github.com/mattn/go-sqlite3 v1.10.0/go.mod h1:FPy6KqzDD04eiIsT53CuJW3U88zkxoIYsO github.com/mattn/go-sqlite3 v1.14.22/go.mod h1:Uh1q+B4BYcTPb+yiD3kU8Ct7aC0hY9fxUwlHK0RXw+Y= github.com/mattn/go-sqlite3 v1.14.33 
h1:A5blZ5ulQo2AtayQ9/limgHEkFreKj1Dv226a1K73s0= github.com/mattn/go-sqlite3 v1.14.33/go.mod h1:Uh1q+B4BYcTPb+yiD3kU8Ct7aC0hY9fxUwlHK0RXw+Y= -github.com/moby/docker-image-spec v1.3.1 h1:jMKff3w6PgbfSa69GfNg+zN/XLhfXJGnEx3Nl2EsFP0= -github.com/moby/docker-image-spec v1.3.1/go.mod h1:eKmb5VW8vQEh/BAr2yvVNvuiJuY6UIocYsFu/DxxRpo= -github.com/moby/term v0.5.0 h1:xt8Q1nalod/v7BqbG21f8mQPqH+xAaC9C3N3wfWbVP0= -github.com/moby/term v0.5.0/go.mod h1:8FzsFHVUBGZdbDsJw/ot+X+d5HLUbvklYLJ9uGfcI3Y= -github.com/modern-go/concurrent v0.0.0-20180228061459-e0a39a4cb421/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q= -github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd h1:TRLaZ9cD/w8PVh93nsPXa1VrQ6jlwL5oN8l14QlcNfg= -github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q= -github.com/modern-go/reflect2 v1.0.2 h1:xBagoLtFs94CBntxluKeaWgTMpvLxC4ur3nMaC9Gz0M= -github.com/modern-go/reflect2 v1.0.2/go.mod h1:yWuevngMOJpCy52FWWMvUC8ws7m/LJsjYzDa0/r8luk= -github.com/morikuni/aec v1.0.0 h1:nP9CBfwrvYnBRgY6qfDQkygYDmYwOilePFkwzv4dU8A= -github.com/morikuni/aec v1.0.0/go.mod h1:BbKIizmSmc5MMPqRYbxO4ZU0S0+P200+tUnFx7PXmsc= +github.com/minio/highwayhash v1.0.4-0.20251030100505-070ab1a87a76 h1:KGuD/pM2JpL9FAYvBrnBBeENKZNh6eNtjqytV6TYjnk= +github.com/minio/highwayhash v1.0.4-0.20251030100505-070ab1a87a76/go.mod h1:GGYsuwP/fPD6Y9hMiXuapVvlIUEhFhMTh0rxU3ik1LQ= github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 h1:C3w9PqII01/Oq1c1nUAm88MOHcQC9l5mIlSMApZMrHA= github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822/go.mod h1:+n7T8mK8HuQTcFwEeznm/DIxMOiR9yIdICNftLE1DvQ= github.com/nats-io/jwt/v2 v2.8.0 h1:K7uzyz50+yGZDO5o772eRE7atlcSEENpL7P+b74JV1g= @@ -318,8 +293,6 @@ github.com/vektah/gqlparser/v2 v2.5.31 h1:YhWGA1mfTjID7qJhd1+Vxhpk5HTgydrGU9IgkW github.com/vektah/gqlparser/v2 v2.5.31/go.mod h1:c1I28gSOVNzlfc4WuDlqU7voQnsqI6OG2amkBAFmgts= github.com/xrash/smetrics 
v0.0.0-20250705151800-55b8f293f342 h1:FnBeRrxr7OU4VvAzt5X7s6266i6cSVkkFPS0TuXWbIg= github.com/xrash/smetrics v0.0.0-20250705151800-55b8f293f342/go.mod h1:Ohn+xnUBiLI6FVj/9LpzZWtj1/D6lUovWYBkxHVV3aM= -github.com/xtgo/set v1.0.0 h1:6BCNBRv3ORNDQ7fyoJXRv+tstJz3m1JVFQErfeZz2pY= -github.com/xtgo/set v1.0.0/go.mod h1:d3NHzGzSa0NmB2NhFyECA+QdRp29oEn2xbT+TpeFoM8= github.com/yuin/goldmark v1.4.13/go.mod h1:6yULJ656Px+3vBD8DxQVa3kxgyrAnzto9xy5taEt/CY= go.uber.org/goleak v1.3.0 h1:2K3zAYmnTNqV73imy9J1T3WC+gmCePx2hEGkimedGto= go.uber.org/goleak v1.3.0/go.mod h1:CoHD4mav9JJNrW/WLlf7HGZPjdw8EucARQHekz1X6bE= diff --git a/internal/api/cluster.go b/internal/api/cluster.go index b6f41244..d1c3c898 100644 --- a/internal/api/cluster.go +++ b/internal/api/cluster.go @@ -27,7 +27,7 @@ type GetClustersAPIResponse struct { // @description Get a list of all cluster configs. Specific cluster can be requested using query parameter. // @produce json // @param cluster query string false "Job Cluster" -// @success 200 {object} api.GetClustersApiResponse "Array of clusters" +// @success 200 {object} api.GetClustersAPIResponse "Array of clusters" // @failure 400 {object} api.ErrorResponse "Bad Request" // @failure 401 {object} api.ErrorResponse "Unauthorized" // @failure 403 {object} api.ErrorResponse "Forbidden" diff --git a/internal/api/docs.go b/internal/api/docs.go index d0b5c6fb..78eecfa3 100644 --- a/internal/api/docs.go +++ b/internal/api/docs.go @@ -25,11 +25,6 @@ const docTemplate = `{ "paths": { "/api/clusters/": { "get": { - "security": [ - { - "ApiKeyAuth": [] - } - ], "description": "Get a list of all cluster configs. 
Specific cluster can be requested using query parameter.", "produces": [ "application/json" @@ -50,7 +45,7 @@ const docTemplate = `{ "200": { "description": "Array of clusters", "schema": { - "$ref": "#/definitions/api.GetClustersApiResponse" + "$ref": "#/definitions/api.GetClustersAPIResponse" } }, "400": { @@ -77,16 +72,16 @@ const docTemplate = `{ "$ref": "#/definitions/api.ErrorResponse" } } - } - } - }, - "/api/jobs/": { - "get": { + }, "security": [ { "ApiKeyAuth": [] } - ], + ] + } + }, + "/api/jobs/": { + "get": { "description": "Get a list of all jobs. Filters can be applied using query parameters.\nNumber of results can be limited by page. Results are sorted by descending startTime.", "produces": [ "application/json" @@ -145,7 +140,7 @@ const docTemplate = `{ "200": { "description": "Job array and page info", "schema": { - "$ref": "#/definitions/api.GetJobsApiResponse" + "$ref": "#/definitions/api.GetJobsAPIResponse" } }, "400": { @@ -172,16 +167,16 @@ const docTemplate = `{ "$ref": "#/definitions/api.ErrorResponse" } } - } - } - }, - "/api/jobs/delete_job/": { - "delete": { + }, "security": [ { "ApiKeyAuth": [] } - ], + ] + } + }, + "/api/jobs/delete_job/": { + "delete": { "description": "Job to delete is specified by request body. 
All fields are required in this case.", "consumes": [ "application/json" @@ -200,7 +195,7 @@ const docTemplate = `{ "in": "body", "required": true, "schema": { - "$ref": "#/definitions/api.DeleteJobApiRequest" + "$ref": "#/definitions/api.DeleteJobAPIRequest" } } ], @@ -208,7 +203,7 @@ const docTemplate = `{ "200": { "description": "Success message", "schema": { - "$ref": "#/definitions/api.DefaultApiResponse" + "$ref": "#/definitions/api.DefaultAPIResponse" } }, "400": { @@ -247,16 +242,16 @@ const docTemplate = `{ "$ref": "#/definitions/api.ErrorResponse" } } - } - } - }, - "/api/jobs/delete_job/{id}": { - "delete": { + }, "security": [ { "ApiKeyAuth": [] } - ], + ] + } + }, + "/api/jobs/delete_job/{id}": { + "delete": { "description": "Job to remove is specified by database ID. This will not remove the job from the job archive.", "produces": [ "application/json" @@ -278,7 +273,7 @@ const docTemplate = `{ "200": { "description": "Success message", "schema": { - "$ref": "#/definitions/api.DefaultApiResponse" + "$ref": "#/definitions/api.DefaultAPIResponse" } }, "400": { @@ -317,16 +312,16 @@ const docTemplate = `{ "$ref": "#/definitions/api.ErrorResponse" } } - } - } - }, - "/api/jobs/delete_job_before/{ts}": { - "delete": { + }, "security": [ { "ApiKeyAuth": [] } - ], + ] + } + }, + "/api/jobs/delete_job_before/{ts}": { + "delete": { "description": "Remove all jobs with start time before timestamp. 
The jobs will not be removed from the job archive.", "produces": [ "application/json" @@ -342,13 +337,19 @@ const docTemplate = `{ "name": "ts", "in": "path", "required": true + }, + { + "type": "boolean", + "description": "Omit jobs with tags from deletion", + "name": "omit-tagged", + "in": "query" } ], "responses": { "200": { "description": "Success message", "schema": { - "$ref": "#/definitions/api.DefaultApiResponse" + "$ref": "#/definitions/api.DefaultAPIResponse" } }, "400": { @@ -387,16 +388,16 @@ const docTemplate = `{ "$ref": "#/definitions/api.ErrorResponse" } } - } - } - }, - "/api/jobs/edit_meta/{id}": { - "post": { + }, "security": [ { "ApiKeyAuth": [] } - ], + ] + } + }, + "/api/jobs/edit_meta/{id}": { + "post": { "description": "Edit key value pairs in job metadata json\nIf a key already exists its content will be overwritten", "consumes": [ "application/json" @@ -457,16 +458,16 @@ const docTemplate = `{ "$ref": "#/definitions/api.ErrorResponse" } } - } - } - }, - "/api/jobs/start_job/": { - "post": { + }, "security": [ { "ApiKeyAuth": [] } - ], + ] + } + }, + "/api/jobs/start_job/": { + "post": { "description": "Job specified in request body will be saved to database as \"running\" with new DB ID.\nJob specifications follow the 'Job' scheme, API will fail to execute if requirements are not met.", "consumes": [ "application/json" @@ -493,7 +494,7 @@ const docTemplate = `{ "201": { "description": "Job added successfully", "schema": { - "$ref": "#/definitions/api.DefaultApiResponse" + "$ref": "#/definitions/api.DefaultAPIResponse" } }, "400": { @@ -526,16 +527,16 @@ const docTemplate = `{ "$ref": "#/definitions/api.ErrorResponse" } } - } - } - }, - "/api/jobs/stop_job/": { - "post": { + }, "security": [ { "ApiKeyAuth": [] } - ], + ] + } + }, + "/api/jobs/stop_job/": { + "post": { "description": "Job to stop is specified by request body. 
All fields are required in this case.\nReturns full job resource information according to 'Job' scheme.", "produces": [ "application/json" @@ -551,7 +552,7 @@ const docTemplate = `{ "in": "body", "required": true, "schema": { - "$ref": "#/definitions/api.StopJobApiRequest" + "$ref": "#/definitions/api.StopJobAPIRequest" } } ], @@ -598,16 +599,16 @@ const docTemplate = `{ "$ref": "#/definitions/api.ErrorResponse" } } - } - } - }, - "/api/jobs/tag_job/{id}": { - "post": { + }, "security": [ { "ApiKeyAuth": [] } - ], + ] + } + }, + "/api/jobs/tag_job/{id}": { + "post": { "description": "Adds tag(s) to a job specified by DB ID. Name and Type of Tag(s) can be chosen freely.\nTag Scope for frontend visibility will default to \"global\" if none entered, other options: \"admin\" or specific username.\nIf tagged job is already finished: Tag will be written directly to respective archive files.", "consumes": [ "application/json" @@ -635,7 +636,7 @@ const docTemplate = `{ "schema": { "type": "array", "items": { - "$ref": "#/definitions/api.ApiTag" + "$ref": "#/definitions/api.APITag" } } } @@ -671,16 +672,16 @@ const docTemplate = `{ "$ref": "#/definitions/api.ErrorResponse" } } - } - } - }, - "/api/jobs/{id}": { - "get": { + }, "security": [ { "ApiKeyAuth": [] } - ], + ] + } + }, + "/api/jobs/{id}": { + "get": { "description": "Job to get is specified by database ID\nReturns full job resource information according to 'Job' scheme and all metrics according to 'JobData'.", "produces": [ "application/json" @@ -708,7 +709,7 @@ const docTemplate = `{ "200": { "description": "Job resource", "schema": { - "$ref": "#/definitions/api.GetJobApiResponse" + "$ref": "#/definitions/api.GetJobAPIResponse" } }, "400": { @@ -747,14 +748,14 @@ const docTemplate = `{ "$ref": "#/definitions/api.ErrorResponse" } } - } - }, - "post": { + }, "security": [ { "ApiKeyAuth": [] } - ], + ] + }, + "post": { "description": "Job to get is specified by database ID\nReturns full job resource information 
according to 'Job' scheme and all metrics according to 'JobData'.", "consumes": [ "application/json" @@ -791,7 +792,7 @@ const docTemplate = `{ "200": { "description": "Job resource", "schema": { - "$ref": "#/definitions/api.GetJobApiResponse" + "$ref": "#/definitions/api.GetJobAPIResponse" } }, "400": { @@ -830,16 +831,16 @@ const docTemplate = `{ "$ref": "#/definitions/api.ErrorResponse" } } - } - } - }, - "/api/nodestats/": { - "post": { + }, "security": [ { "ApiKeyAuth": [] } - ], + ] + } + }, + "/api/nodestats/": { + "post": { "description": "Returns a JSON-encoded list of users.\nRequired query-parameter defines if all users or only users with additional special roles are returned.", "produces": [ "application/json" @@ -863,7 +864,7 @@ const docTemplate = `{ "200": { "description": "Success message", "schema": { - "$ref": "#/definitions/api.DefaultApiResponse" + "$ref": "#/definitions/api.DefaultAPIResponse" } }, "400": { @@ -890,16 +891,86 @@ const docTemplate = `{ "$ref": "#/definitions/api.ErrorResponse" } } - } - } - }, - "/api/users/": { - "get": { + }, "security": [ { "ApiKeyAuth": [] } + ] + } + }, + "/api/user/{id}": { + "post": { + "description": "Allows admins to add/remove roles and projects for a user", + "produces": [ + "text/plain" ], + "tags": [ + "User" + ], + "summary": "Update user roles and projects", + "parameters": [ + { + "type": "string", + "description": "Username", + "name": "id", + "in": "path", + "required": true + }, + { + "type": "string", + "description": "Role to add", + "name": "add-role", + "in": "formData" + }, + { + "type": "string", + "description": "Role to remove", + "name": "remove-role", + "in": "formData" + }, + { + "type": "string", + "description": "Project to add", + "name": "add-project", + "in": "formData" + }, + { + "type": "string", + "description": "Project to remove", + "name": "remove-project", + "in": "formData" + } + ], + "responses": { + "200": { + "description": "Success message", + "schema": { + "type": 
"string" + } + }, + "403": { + "description": "Forbidden", + "schema": { + "$ref": "#/definitions/api.ErrorResponse" + } + }, + "422": { + "description": "Unprocessable Entity", + "schema": { + "$ref": "#/definitions/api.ErrorResponse" + } + } + }, + "security": [ + { + "ApiKeyAuth": [] + } + ] + } + }, + "/api/users/": { + "get": { "description": "Returns a JSON-encoded list of users.\nRequired query-parameter defines if all users or only users with additional special roles are returned.", "produces": [ "application/json" @@ -923,7 +994,7 @@ const docTemplate = `{ "schema": { "type": "array", "items": { - "$ref": "#/definitions/api.ApiReturnedUser" + "$ref": "#/definitions/api.APIReturnedUser" } } }, @@ -951,16 +1022,361 @@ const docTemplate = `{ "type": "string" } } - } - } - }, - "/jobs/tag_job/{id}": { - "delete": { + }, "security": [ { "ApiKeyAuth": [] } + ] + }, + "post": { + "description": "Creates a new user with specified credentials and role", + "produces": [ + "text/plain" ], + "tags": [ + "User" + ], + "summary": "Create a new user", + "parameters": [ + { + "type": "string", + "description": "Username", + "name": "username", + "in": "formData", + "required": true + }, + { + "type": "string", + "description": "Password (not required for API users)", + "name": "password", + "in": "formData" + }, + { + "type": "string", + "description": "User role", + "name": "role", + "in": "formData", + "required": true + }, + { + "type": "string", + "description": "Full name", + "name": "name", + "in": "formData" + }, + { + "type": "string", + "description": "Email address", + "name": "email", + "in": "formData" + }, + { + "type": "string", + "description": "Project (required for managers)", + "name": "project", + "in": "formData" + } + ], + "responses": { + "200": { + "description": "Success message", + "schema": { + "type": "string" + } + }, + "400": { + "description": "Bad Request", + "schema": { + "$ref": "#/definitions/api.ErrorResponse" + } + }, + "403": { + 
"description": "Forbidden", + "schema": { + "$ref": "#/definitions/api.ErrorResponse" + } + }, + "422": { + "description": "Unprocessable Entity", + "schema": { + "$ref": "#/definitions/api.ErrorResponse" + } + } + }, + "security": [ + { + "ApiKeyAuth": [] + } + ] + }, + "delete": { + "description": "Deletes a user from the system", + "produces": [ + "text/plain" + ], + "tags": [ + "User" + ], + "summary": "Delete a user", + "parameters": [ + { + "type": "string", + "description": "Username to delete", + "name": "username", + "in": "formData", + "required": true + } + ], + "responses": { + "200": { + "description": "Success", + "schema": { + "type": "string" + } + }, + "403": { + "description": "Forbidden", + "schema": { + "$ref": "#/definitions/api.ErrorResponse" + } + }, + "422": { + "description": "Unprocessable Entity", + "schema": { + "$ref": "#/definitions/api.ErrorResponse" + } + } + }, + "security": [ + { + "ApiKeyAuth": [] + } + ] + } + }, + "/configuration/": { + "post": { + "description": "Updates a user's configuration key-value pair.", + "consumes": [ + "multipart/form-data" + ], + "produces": [ + "text/plain" + ], + "tags": [ + "Frontend" + ], + "summary": "Update user configuration", + "parameters": [ + { + "type": "string", + "description": "Configuration key", + "name": "key", + "in": "formData", + "required": true + }, + { + "type": "string", + "description": "Configuration value", + "name": "value", + "in": "formData", + "required": true + } + ], + "responses": { + "200": { + "description": "success", + "schema": { + "type": "string" + } + }, + "500": { + "description": "Internal Server Error", + "schema": { + "$ref": "#/definitions/api.ErrorResponse" + } + } + }, + "security": [ + { + "ApiKeyAuth": [] + } + ] + } + }, + "/debug/": { + "post": { + "description": "This endpoint allows the users to print the content of", + "produces": [ + "application/json" + ], + "tags": [ + "debug" + ], + "summary": "Debug endpoint", + "parameters": [ + { + 
"type": "string", + "description": "Selector", + "name": "selector", + "in": "query" + } + ], + "responses": { + "200": { + "description": "Debug dump", + "schema": { + "type": "string" + } + }, + "400": { + "description": "Bad Request", + "schema": { + "$ref": "#/definitions/api.ErrorResponse" + } + }, + "401": { + "description": "Unauthorized", + "schema": { + "$ref": "#/definitions/api.ErrorResponse" + } + }, + "403": { + "description": "Forbidden", + "schema": { + "$ref": "#/definitions/api.ErrorResponse" + } + }, + "500": { + "description": "Internal Server Error", + "schema": { + "$ref": "#/definitions/api.ErrorResponse" + } + } + }, + "security": [ + { + "ApiKeyAuth": [] + } + ] + } + }, + "/free/": { + "post": { + "description": "This endpoint allows the users to free the Buffers from the", + "produces": [ + "application/json" + ], + "tags": [ + "free" + ], + "parameters": [ + { + "type": "string", + "description": "up to timestamp", + "name": "to", + "in": "query" + } + ], + "responses": { + "200": { + "description": "ok", + "schema": { + "type": "string" + } + }, + "400": { + "description": "Bad Request", + "schema": { + "$ref": "#/definitions/api.ErrorResponse" + } + }, + "401": { + "description": "Unauthorized", + "schema": { + "$ref": "#/definitions/api.ErrorResponse" + } + }, + "403": { + "description": "Forbidden", + "schema": { + "$ref": "#/definitions/api.ErrorResponse" + } + }, + "500": { + "description": "Internal Server Error", + "schema": { + "$ref": "#/definitions/api.ErrorResponse" + } + } + }, + "security": [ + { + "ApiKeyAuth": [] + } + ] + } + }, + "/healthcheck/": { + "get": { + "description": "This endpoint allows the users to check if a node is healthy", + "produces": [ + "application/json" + ], + "tags": [ + "healthcheck" + ], + "summary": "HealthCheck endpoint", + "parameters": [ + { + "type": "string", + "description": "Selector", + "name": "selector", + "in": "query" + } + ], + "responses": { + "200": { + "description": "Debug 
dump", + "schema": { + "type": "string" + } + }, + "400": { + "description": "Bad Request", + "schema": { + "$ref": "#/definitions/api.ErrorResponse" + } + }, + "401": { + "description": "Unauthorized", + "schema": { + "$ref": "#/definitions/api.ErrorResponse" + } + }, + "403": { + "description": "Forbidden", + "schema": { + "$ref": "#/definitions/api.ErrorResponse" + } + }, + "500": { + "description": "Internal Server Error", + "schema": { + "$ref": "#/definitions/api.ErrorResponse" + } + } + }, + "security": [ + { + "ApiKeyAuth": [] + } + ] + } + }, + "/jobs/tag_job/{id}": { + "delete": { "description": "Removes tag(s) from a job specified by DB ID. Name and Type of Tag(s) must match.\nTag Scope is required for matching, options: \"global\", \"admin\". Private tags can not be deleted via API.\nIf tagged job is already finished: Tag will be removed from respective archive files.", "consumes": [ "application/json" @@ -988,7 +1404,7 @@ const docTemplate = `{ "schema": { "type": "array", "items": { - "$ref": "#/definitions/api.ApiTag" + "$ref": "#/definitions/api.APITag" } } } @@ -1024,16 +1440,276 @@ const docTemplate = `{ "$ref": "#/definitions/api.ErrorResponse" } } - } - } - }, - "/tags/": { - "delete": { + }, "security": [ { "ApiKeyAuth": [] } + ] + } + }, + "/jwt/": { + "get": { + "description": "Generates a JWT token for a user. 
Admins can generate tokens for any user, regular users only for themselves.", + "consumes": [ + "multipart/form-data" ], + "produces": [ + "text/plain" + ], + "tags": [ + "Frontend" + ], + "summary": "Generate JWT token", + "parameters": [ + { + "type": "string", + "description": "Username to generate JWT for", + "name": "username", + "in": "formData", + "required": true + } + ], + "responses": { + "200": { + "description": "JWT token", + "schema": { + "type": "string" + } + }, + "403": { + "description": "Forbidden", + "schema": { + "$ref": "#/definitions/api.ErrorResponse" + } + }, + "404": { + "description": "User Not Found", + "schema": { + "$ref": "#/definitions/api.ErrorResponse" + } + }, + "500": { + "description": "Internal Server Error", + "schema": { + "$ref": "#/definitions/api.ErrorResponse" + } + } + }, + "security": [ + { + "ApiKeyAuth": [] + } + ] + } + }, + "/machine_state/{cluster}/{host}": { + "get": { + "description": "Retrieves stored machine state data for a specific cluster node. Validates cluster and host names to prevent path traversal.", + "produces": [ + "application/json" + ], + "tags": [ + "Machine State" + ], + "summary": "Retrieve machine state", + "parameters": [ + { + "type": "string", + "description": "Cluster name", + "name": "cluster", + "in": "path", + "required": true + }, + { + "type": "string", + "description": "Host name", + "name": "host", + "in": "path", + "required": true + } + ], + "responses": { + "200": { + "description": "Machine state JSON data", + "schema": { + "type": "object" + } + }, + "400": { + "description": "Bad Request", + "schema": { + "$ref": "#/definitions/api.ErrorResponse" + } + }, + "404": { + "description": "Machine state not enabled or file not found", + "schema": { + "$ref": "#/definitions/api.ErrorResponse" + } + } + }, + "security": [ + { + "ApiKeyAuth": [] + } + ] + }, + "put": { + "description": "Stores machine state data for a specific cluster node. 
Validates cluster and host names to prevent path traversal.", + "consumes": [ + "application/json" + ], + "produces": [ + "text/plain" + ], + "tags": [ + "Machine State" + ], + "summary": "Store machine state", + "parameters": [ + { + "type": "string", + "description": "Cluster name", + "name": "cluster", + "in": "path", + "required": true + }, + { + "type": "string", + "description": "Host name", + "name": "host", + "in": "path", + "required": true + } + ], + "responses": { + "201": { + "description": "Created" + }, + "400": { + "description": "Bad Request", + "schema": { + "$ref": "#/definitions/api.ErrorResponse" + } + }, + "404": { + "description": "Machine state not enabled", + "schema": { + "$ref": "#/definitions/api.ErrorResponse" + } + }, + "500": { + "description": "Internal Server Error", + "schema": { + "$ref": "#/definitions/api.ErrorResponse" + } + } + }, + "security": [ + { + "ApiKeyAuth": [] + } + ] + } + }, + "/notice/": { + "post": { + "description": "Updates the notice.txt file content. Only admins are allowed. 
Content is limited to 10000 characters.", + "consumes": [ + "multipart/form-data" + ], + "produces": [ + "text/plain" + ], + "tags": [ + "Config" + ], + "summary": "Update system notice", + "parameters": [ + { + "type": "string", + "description": "New notice content (max 10000 characters)", + "name": "new-content", + "in": "formData", + "required": true + } + ], + "responses": { + "200": { + "description": "Update Notice Content Success", + "schema": { + "type": "string" + } + }, + "400": { + "description": "Bad Request", + "schema": { + "$ref": "#/definitions/api.ErrorResponse" + } + }, + "403": { + "description": "Forbidden", + "schema": { + "$ref": "#/definitions/api.ErrorResponse" + } + }, + "500": { + "description": "Internal Server Error", + "schema": { + "$ref": "#/definitions/api.ErrorResponse" + } + } + }, + "security": [ + { + "ApiKeyAuth": [] + } + ] + } + }, + "/roles/": { + "get": { + "description": "Returns a list of valid user roles. Only admins are allowed.", + "produces": [ + "application/json" + ], + "tags": [ + "Config" + ], + "summary": "Get available roles", + "responses": { + "200": { + "description": "List of role names", + "schema": { + "type": "array", + "items": { + "type": "string" + } + } + }, + "403": { + "description": "Forbidden", + "schema": { + "$ref": "#/definitions/api.ErrorResponse" + } + }, + "500": { + "description": "Internal Server Error", + "schema": { + "$ref": "#/definitions/api.ErrorResponse" + } + } + }, + "security": [ + { + "ApiKeyAuth": [] + } + ] + } + }, + "/tags/": { + "delete": { "description": "Removes tags by type and name. Name and Type of Tag(s) must match.\nTag Scope is required for matching, options: \"global\", \"admin\". 
Private tags can not be deleted via API.\nTag wills be removed from respective archive files.", "consumes": [ "application/json" @@ -1054,7 +1730,7 @@ const docTemplate = `{ "schema": { "type": "array", "items": { - "$ref": "#/definitions/api.ApiTag" + "$ref": "#/definitions/api.APITag" } } } @@ -1090,12 +1766,72 @@ const docTemplate = `{ "$ref": "#/definitions/api.ErrorResponse" } } - } + }, + "security": [ + { + "ApiKeyAuth": [] + } + ] + } + }, + "/write/": { + "post": { + "consumes": [ + "text/plain" + ], + "produces": [ + "application/json" + ], + "parameters": [ + { + "type": "string", + "description": "If the lines in the body do not have a cluster tag, use this value instead.", + "name": "cluster", + "in": "query" + } + ], + "responses": { + "200": { + "description": "ok", + "schema": { + "type": "string" + } + }, + "400": { + "description": "Bad Request", + "schema": { + "$ref": "#/definitions/api.ErrorResponse" + } + }, + "401": { + "description": "Unauthorized", + "schema": { + "$ref": "#/definitions/api.ErrorResponse" + } + }, + "403": { + "description": "Forbidden", + "schema": { + "$ref": "#/definitions/api.ErrorResponse" + } + }, + "500": { + "description": "Internal Server Error", + "schema": { + "$ref": "#/definitions/api.ErrorResponse" + } + } + }, + "security": [ + { + "ApiKeyAuth": [] + } + ] } } }, "definitions": { - "api.ApiReturnedUser": { + "api.APIReturnedUser": { "type": "object", "properties": { "email": { @@ -1121,7 +1857,7 @@ const docTemplate = `{ } } }, - "api.ApiTag": { + "api.APITag": { "type": "object", "properties": { "name": { @@ -1141,7 +1877,7 @@ const docTemplate = `{ } } }, - "api.DefaultApiResponse": { + "api.DefaultAPIResponse": { "type": "object", "properties": { "msg": { @@ -1149,7 +1885,7 @@ const docTemplate = `{ } } }, - "api.DeleteJobApiRequest": { + "api.DeleteJobAPIRequest": { "type": "object", "required": [ "jobId" @@ -1198,7 +1934,7 @@ const docTemplate = `{ } } }, - "api.GetClustersApiResponse": { + 
"api.GetClustersAPIResponse": { "type": "object", "properties": { "clusters": { @@ -1210,7 +1946,7 @@ const docTemplate = `{ } } }, - "api.GetJobApiResponse": { + "api.GetJobAPIResponse": { "type": "object", "properties": { "data": { @@ -1224,7 +1960,7 @@ const docTemplate = `{ } } }, - "api.GetJobsApiResponse": { + "api.GetJobsAPIResponse": { "type": "object", "properties": { "items": { @@ -1258,39 +1994,7 @@ const docTemplate = `{ } } }, - "api.Node": { - "type": "object", - "properties": { - "cpusAllocated": { - "type": "integer" - }, - "cpusTotal": { - "type": "integer" - }, - "gpusAllocated": { - "type": "integer" - }, - "gpusTotal": { - "type": "integer" - }, - "hostname": { - "type": "string" - }, - "memoryAllocated": { - "type": "integer" - }, - "memoryTotal": { - "type": "integer" - }, - "states": { - "type": "array", - "items": { - "type": "string" - } - } - } - }, - "api.StopJobApiRequest": { + "api.StopJobAPIRequest": { "type": "object", "required": [ "jobState", @@ -1333,7 +2037,7 @@ const docTemplate = `{ "nodes": { "type": "array", "items": { - "$ref": "#/definitions/api.Node" + "$ref": "#/definitions/schema.NodePayload" } } } @@ -1342,12 +2046,15 @@ const docTemplate = `{ "type": "object", "properties": { "id": { + "description": "Unique identifier for the accelerator (e.g., \"0\", \"1\", \"GPU-0\")", "type": "string" }, "model": { + "description": "Specific model name (e.g., \"A100\", \"MI100\")", "type": "string" }, "type": { + "description": "Type of accelerator (e.g., \"Nvidia GPU\", \"AMD GPU\")", "type": "string" } } @@ -1356,15 +2063,18 @@ const docTemplate = `{ "type": "object", "properties": { "metricConfig": { + "description": "Cluster-wide metric configurations", "type": "array", "items": { "$ref": "#/definitions/schema.MetricConfig" } }, "name": { + "description": "Unique cluster name (e.g., \"fritz\", \"alex\")", "type": "string" }, "subClusters": { + "description": "Homogeneous partitions within the cluster", "type": "array", "items": 
{ "$ref": "#/definitions/schema.SubCluster" @@ -1373,6 +2083,7 @@ const docTemplate = `{ } }, "schema.Job": { + "description": "Information of a HPC job.", "type": "object", "properties": { "arrayJobId": { @@ -1401,6 +2112,13 @@ const docTemplate = `{ "format": "float64" } }, + "exclusive": { + "description": "for backwards compatibility", + "type": "integer", + "maximum": 2, + "minimum": 0, + "example": 1 + }, "footprint": { "type": "object", "additionalProperties": { @@ -1423,7 +2141,7 @@ const docTemplate = `{ "deadline", "failed", "node_fail", - "out_of_memory", + "out-of-memory", "pending", "preempted", "running", @@ -1535,9 +2253,11 @@ const docTemplate = `{ "type": "object", "properties": { "id": { + "description": "Internal database ID", "type": "integer" }, "jobId": { + "description": "The job's external job ID", "type": "integer" } } @@ -1546,9 +2266,11 @@ const docTemplate = `{ "type": "object", "properties": { "count": { + "description": "Total count of available items", "type": "integer" }, "items": { + "description": "List of job links", "type": "array", "items": { "$ref": "#/definitions/schema.JobLink" @@ -1560,19 +2282,31 @@ const docTemplate = `{ "type": "object", "properties": { "series": { + "description": "Individual time series data", "type": "array", "items": { "$ref": "#/definitions/schema.Series" } }, "statisticsSeries": { - "$ref": "#/definitions/schema.StatsSeries" + "description": "Aggregated statistics over time", + "allOf": [ + { + "$ref": "#/definitions/schema.StatsSeries" + } + ] }, "timestep": { + "description": "Sampling interval in seconds", "type": "integer" }, "unit": { - "$ref": "#/definitions/schema.Unit" + "description": "Unit of measurement", + "allOf": [ + { + "$ref": "#/definitions/schema.Unit" + } + ] } } }, @@ -1638,46 +2372,71 @@ const docTemplate = `{ "type": "object", "properties": { "aggregation": { + "description": "Aggregation function (avg, sum, min, max)", "type": "string" }, "alert": { + "description": "Alert 
threshold (requires attention)", "type": "number" }, "caution": { + "description": "Caution threshold (concerning but not critical)", "type": "number" }, "energy": { + "description": "Energy measurement method", "type": "string" }, "footprint": { + "description": "Footprint category", "type": "string" }, "lowerIsBetter": { + "description": "Whether lower values are better", "type": "boolean" }, "name": { + "description": "Metric name (e.g., \"cpu_load\", \"mem_used\")", "type": "string" }, "normal": { + "description": "Normal/typical value (good performance)", "type": "number" }, "peak": { + "description": "Peak/maximum expected value (best performance)", "type": "number" }, + "restrict": { + "description": "Restrict visibility to non user roles", + "type": "boolean" + }, "scope": { - "$ref": "#/definitions/schema.MetricScope" + "description": "Metric scope (node, socket, core, etc.)", + "allOf": [ + { + "$ref": "#/definitions/schema.MetricScope" + } + ] }, "subClusters": { + "description": "Subcluster-specific overrides", "type": "array", "items": { "$ref": "#/definitions/schema.SubClusterConfig" } }, "timestep": { + "description": "Measurement interval in seconds", "type": "integer" }, "unit": { - "$ref": "#/definitions/schema.Unit" + "description": "Unit of measurement", + "allOf": [ + { + "$ref": "#/definitions/schema.Unit" + } + ] } } }, @@ -1706,12 +2465,15 @@ const docTemplate = `{ "type": "object", "properties": { "avg": { + "description": "Average/mean value", "type": "number" }, "max": { + "description": "Maximum value", "type": "number" }, "min": { + "description": "Minimum value", "type": "number" } } @@ -1720,30 +2482,72 @@ const docTemplate = `{ "type": "object", "properties": { "unit": { - "$ref": "#/definitions/schema.Unit" + "description": "Unit of measurement (e.g., FLOP/s, GB/s)", + "allOf": [ + { + "$ref": "#/definitions/schema.Unit" + } + ] }, "value": { + "description": "Numeric value of the measurement", "type": "number" } } }, + 
"schema.NodePayload": { + "type": "object", + "properties": { + "cpusAllocated": { + "description": "Number of allocated CPUs", + "type": "integer" + }, + "gpusAllocated": { + "description": "Number of allocated GPUs", + "type": "integer" + }, + "hostname": { + "description": "Node hostname", + "type": "string" + }, + "jobsRunning": { + "description": "Number of running jobs", + "type": "integer" + }, + "memoryAllocated": { + "description": "Allocated memory in MB", + "type": "integer" + }, + "states": { + "description": "State strings (flexible format)", + "type": "array", + "items": { + "type": "string" + } + } + } + }, "schema.Resource": { "description": "A resource used by a job", "type": "object", "properties": { "accelerators": { + "description": "Allocated accelerator IDs (e.g., GPU IDs)", "type": "array", "items": { "type": "string" } }, "configuration": { + "description": "Optional configuration identifier", "type": "string" }, "hostname": { + "description": "Node hostname", "type": "string" }, "hwthreads": { + "description": "Allocated hardware thread IDs", "type": "array", "items": { "type": "integer" @@ -1755,19 +2559,27 @@ const docTemplate = `{ "type": "object", "properties": { "data": { + "description": "Time series measurements", "type": "array", "items": { "type": "number" } }, "hostname": { + "description": "Source hostname", "type": "string" }, "id": { + "description": "Optional ID (e.g., core ID, GPU ID)", "type": "string" }, "statistics": { - "$ref": "#/definitions/schema.MetricStatistics" + "description": "Statistical summary (min/avg/max)", + "allOf": [ + { + "$ref": "#/definitions/schema.MetricStatistics" + } + ] } } }, @@ -1775,30 +2587,35 @@ const docTemplate = `{ "type": "object", "properties": { "max": { + "description": "Maximum values over time", "type": "array", "items": { "type": "number" } }, "mean": { + "description": "Mean values over time", "type": "array", "items": { "type": "number" } }, "median": { + "description": "Median 
values over time", "type": "array", "items": { "type": "number" } }, "min": { + "description": "Minimum values over time", "type": "array", "items": { "type": "number" } }, "percentiles": { + "description": "Percentile values over time (e.g., 10th, 50th, 90th)", "type": "object", "additionalProperties": { "type": "array", @@ -1814,52 +2631,81 @@ const docTemplate = `{ "type": "object", "properties": { "coresPerSocket": { + "description": "Number of cores per CPU socket", "type": "integer" }, "energyFootprint": { + "description": "Energy-related footprint metrics", "type": "array", "items": { "type": "string" } }, "flopRateScalar": { - "$ref": "#/definitions/schema.MetricValue" + "description": "Theoretical scalar FLOP rate per node", + "allOf": [ + { + "$ref": "#/definitions/schema.MetricValue" + } + ] }, "flopRateSimd": { - "$ref": "#/definitions/schema.MetricValue" + "description": "Theoretical SIMD FLOP rate per node", + "allOf": [ + { + "$ref": "#/definitions/schema.MetricValue" + } + ] }, "footprint": { + "description": "Default footprint metrics for jobs", "type": "array", "items": { "type": "string" } }, "memoryBandwidth": { - "$ref": "#/definitions/schema.MetricValue" + "description": "Theoretical memory bandwidth per node", + "allOf": [ + { + "$ref": "#/definitions/schema.MetricValue" + } + ] }, "metricConfig": { + "description": "Subcluster-specific metric configurations", "type": "array", "items": { "$ref": "#/definitions/schema.MetricConfig" } }, "name": { + "description": "Name of the subcluster (e.g., \"main\", \"gpu\", \"bigmem\")", "type": "string" }, "nodes": { + "description": "Node list in condensed format (e.g., \"node[001-100]\")", "type": "string" }, "processorType": { + "description": "CPU model (e.g., \"Intel Xeon Gold 6148\")", "type": "string" }, "socketsPerNode": { + "description": "Number of CPU sockets per node", "type": "integer" }, "threadsPerCore": { + "description": "Number of hardware threads per core (SMT level)", "type": 
"integer" }, "topology": { - "$ref": "#/definitions/schema.Topology" + "description": "Hardware topology of nodes in this subcluster", + "allOf": [ + { + "$ref": "#/definitions/schema.Topology" + } + ] } } }, @@ -1867,34 +2713,52 @@ const docTemplate = `{ "type": "object", "properties": { "alert": { + "description": "Alert threshold (requires attention)", "type": "number" }, "caution": { + "description": "Caution threshold (concerning but not critical)", "type": "number" }, "energy": { + "description": "Energy measurement configuration", "type": "string" }, "footprint": { + "description": "Footprint category for this metric", "type": "string" }, "lowerIsBetter": { + "description": "Whether lower values indicate better performance", "type": "boolean" }, "name": { + "description": "Metric name (e.g., \"cpu_load\", \"mem_used\")", "type": "string" }, "normal": { + "description": "Normal/typical value (good performance)", "type": "number" }, "peak": { + "description": "Peak/maximum expected value (best performance)", "type": "number" }, "remove": { + "description": "Whether to exclude this metric for this subcluster", + "type": "boolean" + }, + "restrict": { + "description": "Restrict visibility to non user roles", "type": "boolean" }, "unit": { - "$ref": "#/definitions/schema.Unit" + "description": "Unit of measurement", + "allOf": [ + { + "$ref": "#/definitions/schema.Unit" + } + ] } } }, @@ -1923,12 +2787,14 @@ const docTemplate = `{ "type": "object", "properties": { "accelerators": { + "description": "Attached accelerators (GPUs, etc.)", "type": "array", "items": { "$ref": "#/definitions/schema.Accelerator" } }, "core": { + "description": "Hardware threads grouped by core", "type": "array", "items": { "type": "array", @@ -1938,6 +2804,7 @@ const docTemplate = `{ } }, "die": { + "description": "Hardware threads grouped by die (optional)", "type": "array", "items": { "type": "array", @@ -1947,6 +2814,7 @@ const docTemplate = `{ } }, "memoryDomain": { + "description": 
"Hardware threads grouped by NUMA domain", "type": "array", "items": { "type": "array", @@ -1956,12 +2824,14 @@ const docTemplate = `{ } }, "node": { + "description": "All hardware thread IDs on this node", "type": "array", "items": { "type": "integer" } }, "socket": { + "description": "Hardware threads grouped by socket", "type": "array", "items": { "type": "array", @@ -1976,9 +2846,11 @@ const docTemplate = `{ "type": "object", "properties": { "base": { + "description": "Base unit (e.g., \"B/s\", \"F/s\", \"W\")", "type": "string" }, "prefix": { + "description": "SI prefix (e.g., \"G\", \"M\", \"K\", \"T\")", "type": "string" } } diff --git a/internal/api/job.go b/internal/api/job.go index 09f7b22c..1b1e05d6 100644 --- a/internal/api/job.go +++ b/internal/api/job.go @@ -104,7 +104,7 @@ type JobMetricWithName struct { // @param items-per-page query int false "Items per page (Default: 25)" // @param page query int false "Page Number (Default: 1)" // @param with-metadata query bool false "Include metadata (e.g. 
jobScript) in response" -// @success 200 {object} api.GetJobsApiResponse "Job array and page info" +// @success 200 {object} api.GetJobsAPIResponse "Job array and page info" // @failure 400 {object} api.ErrorResponse "Bad Request" // @failure 401 {object} api.ErrorResponse "Unauthorized" // @failure 403 {object} api.ErrorResponse "Forbidden" @@ -232,7 +232,7 @@ func (api *RestAPI) getJobs(rw http.ResponseWriter, r *http.Request) { // @produce json // @param id path int true "Database ID of Job" // @param all-metrics query bool false "Include all available metrics" -// @success 200 {object} api.GetJobApiResponse "Job resource" +// @success 200 {object} api.GetJobAPIResponse "Job resource" // @failure 400 {object} api.ErrorResponse "Bad Request" // @failure 401 {object} api.ErrorResponse "Unauthorized" // @failure 403 {object} api.ErrorResponse "Forbidden" @@ -324,8 +324,8 @@ func (api *RestAPI) getCompleteJobByID(rw http.ResponseWriter, r *http.Request) // @accept json // @produce json // @param id path int true "Database ID of Job" -// @param request body api.GetJobApiRequest true "Array of metric names" -// @success 200 {object} api.GetJobApiResponse "Job resource" +// @param request body api.GetJobAPIRequest true "Array of metric names" +// @success 200 {object} api.GetJobAPIResponse "Job resource" // @failure 400 {object} api.ErrorResponse "Bad Request" // @failure 401 {object} api.ErrorResponse "Unauthorized" // @failure 403 {object} api.ErrorResponse "Forbidden" @@ -478,7 +478,7 @@ func (api *RestAPI) editMeta(rw http.ResponseWriter, r *http.Request) { // @accept json // @produce json // @param id path int true "Job Database ID" -// @param request body api.TagJobApiRequest true "Array of tag-objects to add" +// @param request body api.TagJobAPIRequest true "Array of tag-objects to add" // @success 200 {object} schema.Job "Updated job resource" // @failure 400 {object} api.ErrorResponse "Bad Request" // @failure 401 {object} api.ErrorResponse "Unauthorized" @@ 
-542,7 +542,7 @@ func (api *RestAPI) tagJob(rw http.ResponseWriter, r *http.Request) { // @accept json // @produce json // @param id path int true "Job Database ID" -// @param request body api.TagJobApiRequest true "Array of tag-objects to remove" +// @param request body api.TagJobAPIRequest true "Array of tag-objects to remove" // @success 200 {object} schema.Job "Updated job resource" // @failure 400 {object} api.ErrorResponse "Bad Request" // @failure 401 {object} api.ErrorResponse "Unauthorized" @@ -606,7 +606,7 @@ func (api *RestAPI) removeTagJob(rw http.ResponseWriter, r *http.Request) { // @description Tag wills be removed from respective archive files. // @accept json // @produce plain -// @param request body api.TagJobApiRequest true "Array of tag-objects to remove" +// @param request body api.TagJobAPIRequest true "Array of tag-objects to remove" // @success 200 {string} string "Success Response" // @failure 400 {object} api.ErrorResponse "Bad Request" // @failure 401 {object} api.ErrorResponse "Unauthorized" @@ -650,7 +650,7 @@ func (api *RestAPI) removeTags(rw http.ResponseWriter, r *http.Request) { // @accept json // @produce json // @param request body schema.Job true "Job to add" -// @success 201 {object} api.DefaultApiResponse "Job added successfully" +// @success 201 {object} api.DefaultAPIResponse "Job added successfully" // @failure 400 {object} api.ErrorResponse "Bad Request" // @failure 401 {object} api.ErrorResponse "Unauthorized" // @failure 403 {object} api.ErrorResponse "Forbidden" @@ -728,7 +728,7 @@ func (api *RestAPI) startJob(rw http.ResponseWriter, r *http.Request) { // @description Job to stop is specified by request body. All fields are required in this case. // @description Returns full job resource information according to 'Job' scheme. 
// @produce json -// @param request body api.StopJobApiRequest true "All fields required" +// @param request body api.StopJobAPIRequest true "All fields required" // @success 200 {object} schema.Job "Success message" // @failure 400 {object} api.ErrorResponse "Bad Request" // @failure 401 {object} api.ErrorResponse "Unauthorized" @@ -754,7 +754,6 @@ func (api *RestAPI) stopJobByRequest(rw http.ResponseWriter, r *http.Request) { return } - // cclog.Printf("loading db job for stopJobByRequest... : stopJobApiRequest=%v", req) job, err = api.JobRepository.Find(req.JobID, req.Cluster, req.StartTime) if err != nil { // Try cached jobs if not found in main repository @@ -776,7 +775,7 @@ func (api *RestAPI) stopJobByRequest(rw http.ResponseWriter, r *http.Request) { // @description Job to remove is specified by database ID. This will not remove the job from the job archive. // @produce json // @param id path int true "Database ID of Job" -// @success 200 {object} api.DefaultApiResponse "Success message" +// @success 200 {object} api.DefaultAPIResponse "Success message" // @failure 400 {object} api.ErrorResponse "Bad Request" // @failure 401 {object} api.ErrorResponse "Unauthorized" // @failure 403 {object} api.ErrorResponse "Forbidden" @@ -820,8 +819,8 @@ func (api *RestAPI) deleteJobByID(rw http.ResponseWriter, r *http.Request) { // @description Job to delete is specified by request body. All fields are required in this case. 
// @accept json // @produce json -// @param request body api.DeleteJobApiRequest true "All fields required" -// @success 200 {object} api.DefaultApiResponse "Success message" +// @param request body api.DeleteJobAPIRequest true "All fields required" +// @success 200 {object} api.DefaultAPIResponse "Success message" // @failure 400 {object} api.ErrorResponse "Bad Request" // @failure 401 {object} api.ErrorResponse "Unauthorized" // @failure 403 {object} api.ErrorResponse "Forbidden" @@ -873,7 +872,7 @@ func (api *RestAPI) deleteJobByRequest(rw http.ResponseWriter, r *http.Request) // @description Remove all jobs with start time before timestamp. The jobs will not be removed from the job archive. // @produce json // @param ts path int true "Unix epoch timestamp" -// @success 200 {object} api.DefaultApiResponse "Success message" +// @success 200 {object} api.DefaultAPIResponse "Success message" // @failure 400 {object} api.ErrorResponse "Bad Request" // @failure 401 {object} api.ErrorResponse "Unauthorized" // @failure 403 {object} api.ErrorResponse "Forbidden" diff --git a/internal/api/node.go b/internal/api/node.go index 350f097d..4ad5337a 100644 --- a/internal/api/node.go +++ b/internal/api/node.go @@ -47,7 +47,7 @@ func determineState(states []string) schema.SchedulerState { // @description Required query-parameter defines if all users or only users with additional special roles are returned. 
// @produce json // @param request body UpdateNodeStatesRequest true "Request body containing nodes and their states" -// @success 200 {object} api.DefaultApiResponse "Success message" +// @success 200 {object} api.DefaultAPIResponse "Success message" // @failure 400 {object} api.ErrorResponse "Bad Request" // @failure 401 {object} api.ErrorResponse "Unauthorized" // @failure 403 {object} api.ErrorResponse "Forbidden" diff --git a/internal/api/user.go b/internal/api/user.go index 1821b69b..5564fd61 100644 --- a/internal/api/user.go +++ b/internal/api/user.go @@ -31,7 +31,7 @@ type APIReturnedUser struct { // @description Required query-parameter defines if all users or only users with additional special roles are returned. // @produce json // @param not-just-user query bool true "If returned list should contain all users or only users with additional special roles" -// @success 200 {array} api.ApiReturnedUser "List of users returned successfully" +// @success 200 {array} api.APIReturnedUser "List of users returned successfully" // @failure 400 {string} string "Bad Request" // @failure 401 {string} string "Unauthorized" // @failure 403 {string} string "Forbidden" diff --git a/internal/graph/generated/generated.go b/internal/graph/generated/generated.go index 2d3aca04..c218c0af 100644 --- a/internal/graph/generated/generated.go +++ b/internal/graph/generated/generated.go @@ -10815,7 +10815,7 @@ func (ec *executionContext) _SubCluster_metricConfig(ctx context.Context, field return obj.MetricConfig, nil }, nil, - ec.marshalNMetricConfig2ᚕgithubᚗcomᚋClusterCockpitᚋccᚑlibᚋv2ᚋschemaᚐMetricConfigᚄ, + ec.marshalNMetricConfig2ᚕᚖgithubᚗcomᚋClusterCockpitᚋccᚑlibᚋv2ᚋschemaᚐMetricConfigᚄ, true, true, ) @@ -18466,11 +18466,7 @@ func (ec *executionContext) marshalNJobsStatistics2ᚖgithubᚗcomᚋClusterCock return ec._JobsStatistics(ctx, sel, v) } -func (ec *executionContext) marshalNMetricConfig2githubᚗcomᚋClusterCockpitᚋccᚑlibᚋv2ᚋschemaᚐMetricConfig(ctx context.Context, sel 
ast.SelectionSet, v schema.MetricConfig) graphql.Marshaler { - return ec._MetricConfig(ctx, sel, &v) -} - -func (ec *executionContext) marshalNMetricConfig2ᚕgithubᚗcomᚋClusterCockpitᚋccᚑlibᚋv2ᚋschemaᚐMetricConfigᚄ(ctx context.Context, sel ast.SelectionSet, v []schema.MetricConfig) graphql.Marshaler { +func (ec *executionContext) marshalNMetricConfig2ᚕᚖgithubᚗcomᚋClusterCockpitᚋccᚑlibᚋv2ᚋschemaᚐMetricConfigᚄ(ctx context.Context, sel ast.SelectionSet, v []*schema.MetricConfig) graphql.Marshaler { ret := make(graphql.Array, len(v)) var wg sync.WaitGroup isLen1 := len(v) == 1 @@ -18494,7 +18490,7 @@ func (ec *executionContext) marshalNMetricConfig2ᚕgithubᚗcomᚋClusterCockpi if !isLen1 { defer wg.Done() } - ret[i] = ec.marshalNMetricConfig2githubᚗcomᚋClusterCockpitᚋccᚑlibᚋv2ᚋschemaᚐMetricConfig(ctx, sel, v[i]) + ret[i] = ec.marshalNMetricConfig2ᚖgithubᚗcomᚋClusterCockpitᚋccᚑlibᚋv2ᚋschemaᚐMetricConfig(ctx, sel, v[i]) } if isLen1 { f(i) @@ -18514,6 +18510,16 @@ func (ec *executionContext) marshalNMetricConfig2ᚕgithubᚗcomᚋClusterCockpi return ret } +func (ec *executionContext) marshalNMetricConfig2ᚖgithubᚗcomᚋClusterCockpitᚋccᚑlibᚋv2ᚋschemaᚐMetricConfig(ctx context.Context, sel ast.SelectionSet, v *schema.MetricConfig) graphql.Marshaler { + if v == nil { + if !graphql.HasFieldError(ctx, graphql.GetFieldContext(ctx)) { + graphql.AddErrorf(ctx, "the requested element is null which the schema does not allow") + } + return graphql.Null + } + return ec._MetricConfig(ctx, sel, v) +} + func (ec *executionContext) marshalNMetricFootprints2ᚕᚖgithubᚗcomᚋClusterCockpitᚋccᚑbackendᚋinternalᚋgraphᚋmodelᚐMetricFootprintsᚄ(ctx context.Context, sel ast.SelectionSet, v []*model.MetricFootprints) graphql.Marshaler { ret := make(graphql.Array, len(v)) var wg sync.WaitGroup diff --git a/internal/graph/schema.resolvers.go b/internal/graph/schema.resolvers.go index 2cb4f992..21ccaf92 100644 --- a/internal/graph/schema.resolvers.go +++ b/internal/graph/schema.resolvers.go @@ -3,7 +3,7 @@ package 
graph // This file will be automatically regenerated based on the schema, any resolver // implementations // will be copied through when generating and any unknown code will be moved to the end. -// Code generated by github.com/99designs/gqlgen version v0.17.84 +// Code generated by github.com/99designs/gqlgen version v0.17.85 import ( "context" @@ -283,7 +283,7 @@ func (r *mutationResolver) RemoveTagFromList(ctx context.Context, tagIds []strin // Test Access: Admins && Admin Tag OR Everyone && Private Tag if user.HasRole(schema.RoleAdmin) && (tscope == "global" || tscope == "admin") || user.Username == tscope { // Remove from DB - if err = r.Repo.RemoveTagById(tid); err != nil { + if err = r.Repo.RemoveTagByID(tid); err != nil { cclog.Warn("Error while removing tag") return nil, err } else { diff --git a/internal/importer/handleImport.go b/internal/importer/handleImport.go index 4b217475..2ac35ea9 100644 --- a/internal/importer/handleImport.go +++ b/internal/importer/handleImport.go @@ -2,6 +2,7 @@ // All rights reserved. This file is part of cc-backend. // Use of this source code is governed by a MIT-style // license that can be found in the LICENSE file. + package importer import ( diff --git a/internal/importer/normalize.go b/internal/importer/normalize.go index c6e84d4b..cc6fb545 100644 --- a/internal/importer/normalize.go +++ b/internal/importer/normalize.go @@ -2,6 +2,7 @@ // All rights reserved. This file is part of cc-backend. // Use of this source code is governed by a MIT-style // license that can be found in the LICENSE file. 
+ package importer import ( diff --git a/internal/metricstore/metricstore.go b/internal/metricstore/metricstore.go index e35b4d58..d75c9ef8 100644 --- a/internal/metricstore/metricstore.go +++ b/internal/metricstore/metricstore.go @@ -74,7 +74,7 @@ func Init(rawConfig json.RawMessage, wg *sync.WaitGroup) { cclog.Debugf("[METRICSTORE]> Using %d workers for checkpoint/archive operations\n", Keys.NumWorkers) // Helper function to add metric configuration - addMetricConfig := func(mc schema.MetricConfig) { + addMetricConfig := func(mc *schema.MetricConfig) { agg, err := AssignAggregationStrategy(mc.Aggregation) if err != nil { cclog.Warnf("Could not find aggregation strategy for metric config '%s': %s", mc.Name, err.Error()) @@ -88,7 +88,7 @@ func Init(rawConfig json.RawMessage, wg *sync.WaitGroup) { for _, c := range archive.Clusters { for _, mc := range c.MetricConfig { - addMetricConfig(*mc) + addMetricConfig(mc) } for _, sc := range c.SubClusters { diff --git a/internal/repository/hooks.go b/internal/repository/hooks.go index c916b57e..824beb7c 100644 --- a/internal/repository/hooks.go +++ b/internal/repository/hooks.go @@ -2,6 +2,7 @@ // All rights reserved. This file is part of cc-backend. // Use of this source code is governed by a MIT-style // license that can be found in the LICENSE file. + package repository import ( diff --git a/internal/repository/job.go b/internal/repository/job.go index bd33774c..9ee51735 100644 --- a/internal/repository/job.go +++ b/internal/repository/job.go @@ -686,7 +686,6 @@ func (r *JobRepository) AllocatedNodes(cluster string) (map[string]map[string]in return subclusters, nil } -// FIXME: Set duration to requested walltime? // StopJobsExceedingWalltimeBy marks running jobs as failed if they exceed their walltime limit. // This is typically called periodically to clean up stuck or orphaned jobs. 
// @@ -762,7 +761,6 @@ func (r *JobRepository) FindJobIdsByTag(tagID int64) ([]int64, error) { return jobIds, nil } -// FIXME: Reconsider filtering short jobs with harcoded threshold // FindRunningJobs returns all currently running jobs for a specific cluster. // Filters out short-running jobs based on repoConfig.MinRunningJobDuration threshold. // diff --git a/internal/repository/jobHooks.go b/internal/repository/jobHooks.go index 41684d5c..66d29eea 100644 --- a/internal/repository/jobHooks.go +++ b/internal/repository/jobHooks.go @@ -2,6 +2,7 @@ // All rights reserved. This file is part of cc-backend. // Use of this source code is governed by a MIT-style // license that can be found in the LICENSE file. + package repository import ( diff --git a/internal/repository/job_test.go b/internal/repository/job_test.go index 17766c69..9f4871fd 100644 --- a/internal/repository/job_test.go +++ b/internal/repository/job_test.go @@ -90,13 +90,13 @@ func TestFindJobsBetween(t *testing.T) { // 2. Create a tag tagName := fmt.Sprintf("testtag_%d", time.Now().UnixNano()) - tagId, err := r.CreateTag("testtype", tagName, "global") + tagID, err := r.CreateTag("testtype", tagName, "global") if err != nil { t.Fatal(err) } // 3. 
Link Tag (Manually to avoid archive dependency side-effects in unit test) - _, err = r.DB.Exec("INSERT INTO jobtag (job_id, tag_id) VALUES (?, ?)", *targetJob.ID, tagId) + _, err = r.DB.Exec("INSERT INTO jobtag (job_id, tag_id) VALUES (?, ?)", *targetJob.ID, tagID) if err != nil { t.Fatal(err) } diff --git a/internal/repository/node.go b/internal/repository/node.go index 2890cdbc..a81fc58d 100644 --- a/internal/repository/node.go +++ b/internal/repository/node.go @@ -579,7 +579,7 @@ func (r *NodeRepository) GetNodesForList( queryFilters = append(queryFilters, &model.NodeFilter{Hostname: &model.StringInput{Contains: &nodeFilter}}) } if stateFilter != "all" && stateFilter != "notindb" { - var queryState schema.SchedulerState = schema.SchedulerState(stateFilter) + queryState := schema.SchedulerState(stateFilter) queryFilters = append(queryFilters, &model.NodeFilter{SchedulerState: &queryState}) } // if healthFilter != "all" { diff --git a/internal/repository/repository_test.go b/internal/repository/repository_test.go index 9d07b026..475e7bca 100644 --- a/internal/repository/repository_test.go +++ b/internal/repository/repository_test.go @@ -46,7 +46,7 @@ func BenchmarkSelect1(b *testing.B) { } func BenchmarkDB_FindJobById(b *testing.B) { - var jobId int64 = 1677322 + var jobID int64 = 1677322 b.Run("FindJobById", func(b *testing.B) { db := setup(b) @@ -55,7 +55,7 @@ func BenchmarkDB_FindJobById(b *testing.B) { b.RunParallel(func(pb *testing.PB) { for pb.Next() { - _, err := db.FindByID(getContext(b), jobId) + _, err := db.FindByID(getContext(b), jobID) noErr(b, err) } }) @@ -63,7 +63,7 @@ func BenchmarkDB_FindJobById(b *testing.B) { } func BenchmarkDB_FindJob(b *testing.B) { - var jobId int64 = 107266 + var jobID int64 = 107266 var startTime int64 = 1657557241 cluster := "fritz" @@ -74,7 +74,7 @@ func BenchmarkDB_FindJob(b *testing.B) { b.RunParallel(func(pb *testing.PB) { for pb.Next() { - _, err := db.Find(&jobId, &cluster, &startTime) + _, err := db.Find(&jobID, 
&cluster, &startTime) noErr(b, err) } }) diff --git a/internal/repository/stats.go b/internal/repository/stats.go index cd175c23..851a4ca1 100644 --- a/internal/repository/stats.go +++ b/internal/repository/stats.go @@ -2,6 +2,44 @@ // All rights reserved. This file is part of cc-backend. // Use of this source code is governed by a MIT-style // license that can be found in the LICENSE file. + +// This file contains job statistics and histogram generation functionality for the JobRepository. +// +// # Job Statistics +// +// The statistics methods provide aggregated metrics about jobs including total jobs, users, +// walltime, and resource usage (nodes, cores, accelerators). Statistics can be computed: +// - Overall (JobsStats): Single aggregate across all matching jobs +// - Grouped (JobsStatsGrouped): Aggregated by user, project, cluster, or subcluster +// - Counts (JobCountGrouped, AddJobCount): Simple job counts with optional filtering +// +// All statistics methods support filtering via JobFilter and respect security contexts. +// +// # Histograms +// +// Histogram methods generate distribution data for visualization: +// - Duration, nodes, cores, accelerators (AddHistograms) +// - Job metrics like CPU load, memory usage (AddMetricHistograms) +// +// Histograms use intelligent binning: +// - Duration: Variable bin sizes (1m, 10m, 1h, 6h, 12h, 24h) with zero-padding +// - Resources: Natural value-based bins +// - Metrics: Normalized to peak values with configurable bin counts +// +// # Running vs. 
Completed Jobs +// +// Statistics handle running jobs specially: +// - Duration calculated as (now - start_time) for running jobs +// - Metric histograms for running jobs load data from metric backend instead of footprint +// - Job state filtering distinguishes running/completed jobs +// +// # Performance Considerations +// +// - All queries use prepared statements via stmtCache +// - Complex aggregations use SQL for efficiency +// - Histogram pre-initialization ensures consistent bin ranges +// - Metric histogram queries limited to 500 jobs for running job analysis + package repository import ( @@ -19,7 +57,9 @@ import ( sq "github.com/Masterminds/squirrel" ) -// GraphQL validation should make sure that no unkown values can be specified. +// groupBy2column maps GraphQL Aggregate enum values to their corresponding database column names. +// Used by JobsStatsGrouped and JobCountGrouped to translate user-facing grouping dimensions +// into SQL GROUP BY clauses. GraphQL validation ensures only valid enum values are accepted. var groupBy2column = map[model.Aggregate]string{ model.AggregateUser: "job.hpc_user", model.AggregateProject: "job.project", @@ -27,6 +67,9 @@ var groupBy2column = map[model.Aggregate]string{ model.AggregateSubcluster: "job.subcluster", } +// sortBy2column maps GraphQL SortByAggregate enum values to their corresponding computed column names. +// Used by JobsStatsGrouped to translate sort preferences into SQL ORDER BY clauses. +// Column names match the AS aliases used in buildStatsQuery. var sortBy2column = map[model.SortByAggregate]string{ model.SortByAggregateTotaljobs: "totalJobs", model.SortByAggregateTotalusers: "totalUsers", @@ -39,6 +82,21 @@ var sortBy2column = map[model.SortByAggregate]string{ model.SortByAggregateTotalacchours: "totalAccHours", } +// buildCountQuery constructs a SQL query to count jobs with optional grouping and filtering. +// +// Parameters: +// - filter: Job filters to apply (cluster, user, time range, etc.) 
+// - kind: Special filter - "running" for running jobs only, "short" for jobs under threshold +// - col: Column name to GROUP BY; empty string for total count without grouping +// +// Returns a SelectBuilder that produces either: +// - Single count: COUNT(job.id) when col is empty +// - Grouped counts: col, COUNT(job.id) when col is specified +// +// The kind parameter enables counting specific job categories: +// - "running": Only jobs with job_state = 'running' +// - "short": Only jobs with duration < ShortRunningJobsDuration config value +// - empty: All jobs matching filters func (r *JobRepository) buildCountQuery( filter []*model.JobFilter, kind string, @@ -47,10 +105,8 @@ func (r *JobRepository) buildCountQuery( var query sq.SelectBuilder if col != "" { - // Scan columns: id, cnt query = sq.Select(col, "COUNT(job.id)").From("job").GroupBy(col) } else { - // Scan columns: cnt query = sq.Select("COUNT(job.id)").From("job") } @@ -68,6 +124,27 @@ func (r *JobRepository) buildCountQuery( return query } +// buildStatsQuery constructs a SQL query to compute comprehensive job statistics with optional grouping. +// +// Parameters: +// - filter: Job filters to apply (cluster, user, time range, etc.) 
+// - col: Column name to GROUP BY; empty string for overall statistics without grouping +// +// Returns a SelectBuilder that produces comprehensive statistics: +// - totalJobs: Count of jobs +// - totalUsers: Count of distinct users (always 0 when grouping by user) +// - totalWalltime: Sum of job durations in hours +// - totalNodes: Sum of nodes used across all jobs +// - totalNodeHours: Sum of (duration × num_nodes) in hours +// - totalCores: Sum of hardware threads used across all jobs +// - totalCoreHours: Sum of (duration × num_hwthreads) in hours +// - totalAccs: Sum of accelerators used across all jobs +// - totalAccHours: Sum of (duration × num_acc) in hours +// +// Special handling: +// - Running jobs: Duration calculated as (now - start_time) instead of stored duration +// - Grouped queries: Also select grouping column and user's display name from hpc_user table +// - All time values converted from seconds to hours (÷ 3600) and rounded func (r *JobRepository) buildStatsQuery( filter []*model.JobFilter, col string, @@ -75,31 +152,29 @@ func (r *JobRepository) buildStatsQuery( var query sq.SelectBuilder if col != "" { - // Scan columns: id, name, totalJobs, totalUsers, totalWalltime, totalNodes, totalNodeHours, totalCores, totalCoreHours, totalAccs, totalAccHours query = sq.Select( col, "name", "COUNT(job.id) as totalJobs", "COUNT(DISTINCT job.hpc_user) AS totalUsers", fmt.Sprintf(`CAST(ROUND(SUM((CASE WHEN job.job_state = "running" THEN %d - job.start_time ELSE job.duration END)) / 3600) as int) as totalWalltime`, time.Now().Unix()), - fmt.Sprintf(`CAST(SUM(job.num_nodes) as int) as totalNodes`), + `CAST(SUM(job.num_nodes) as int) as totalNodes`, fmt.Sprintf(`CAST(ROUND(SUM((CASE WHEN job.job_state = "running" THEN %d - job.start_time ELSE job.duration END) * job.num_nodes) / 3600) as int) as totalNodeHours`, time.Now().Unix()), - fmt.Sprintf(`CAST(SUM(job.num_hwthreads) as int) as totalCores`), + `CAST(SUM(job.num_hwthreads) as int) as totalCores`, 
fmt.Sprintf(`CAST(ROUND(SUM((CASE WHEN job.job_state = "running" THEN %d - job.start_time ELSE job.duration END) * job.num_hwthreads) / 3600) as int) as totalCoreHours`, time.Now().Unix()), - fmt.Sprintf(`CAST(SUM(job.num_acc) as int) as totalAccs`), + `CAST(SUM(job.num_acc) as int) as totalAccs`, fmt.Sprintf(`CAST(ROUND(SUM((CASE WHEN job.job_state = "running" THEN %d - job.start_time ELSE job.duration END) * job.num_acc) / 3600) as int) as totalAccHours`, time.Now().Unix()), ).From("job").LeftJoin("hpc_user ON hpc_user.username = job.hpc_user").GroupBy(col) } else { - // Scan columns: totalJobs, totalUsers, totalWalltime, totalNodes, totalNodeHours, totalCores, totalCoreHours, totalAccs, totalAccHours query = sq.Select( "COUNT(job.id) as totalJobs", "COUNT(DISTINCT job.hpc_user) AS totalUsers", fmt.Sprintf(`CAST(ROUND(SUM((CASE WHEN job.job_state = "running" THEN %d - job.start_time ELSE job.duration END)) / 3600) as int)`, time.Now().Unix()), - fmt.Sprintf(`CAST(SUM(job.num_nodes) as int)`), + `CAST(SUM(job.num_nodes) as int)`, fmt.Sprintf(`CAST(ROUND(SUM((CASE WHEN job.job_state = "running" THEN %d - job.start_time ELSE job.duration END) * job.num_nodes) / 3600) as int)`, time.Now().Unix()), - fmt.Sprintf(`CAST(SUM(job.num_hwthreads) as int)`), + `CAST(SUM(job.num_hwthreads) as int)`, fmt.Sprintf(`CAST(ROUND(SUM((CASE WHEN job.job_state = "running" THEN %d - job.start_time ELSE job.duration END) * job.num_hwthreads) / 3600) as int)`, time.Now().Unix()), - fmt.Sprintf(`CAST(SUM(job.num_acc) as int)`), + `CAST(SUM(job.num_acc) as int)`, fmt.Sprintf(`CAST(ROUND(SUM((CASE WHEN job.job_state = "running" THEN %d - job.start_time ELSE job.duration END) * job.num_acc) / 3600) as int)`, time.Now().Unix()), ).From("job") } @@ -111,6 +186,25 @@ func (r *JobRepository) buildStatsQuery( return query } +// JobsStatsGrouped computes comprehensive job statistics grouped by a dimension (user, project, cluster, or subcluster). 
+// +// This is the primary method for generating aggregated statistics views in the UI, providing +// metrics like total jobs, walltime, and resource usage broken down by the specified grouping. +// +// Parameters: +// - ctx: Context for security checks and cancellation +// - filter: Filters to apply (time range, cluster, job state, etc.) +// - page: Optional pagination (ItemsPerPage: -1 disables pagination) +// - sortBy: Optional sort column (totalJobs, totalWalltime, totalCoreHours, etc.) +// - groupBy: Required grouping dimension (User, Project, Cluster, or Subcluster) +// +// Returns a slice of JobsStatistics, one per group, with: +// - ID: The group identifier (username, project name, cluster name, etc.) +// - Name: Display name (for users, from hpc_user.name; empty for other groups) +// - Statistics: totalJobs, totalUsers, totalWalltime, resource usage metrics +// +// Security: Respects user roles via SecurityCheck - users see only their own data unless admin/support. +// Performance: Results are sorted in SQL and pagination applied before scanning rows. func (r *JobRepository) JobsStatsGrouped( ctx context.Context, filter []*model.JobFilter, @@ -230,6 +324,21 @@ func (r *JobRepository) JobsStatsGrouped( return stats, nil } +// JobsStats computes overall job statistics across all matching jobs without grouping. +// +// This method provides a single aggregate view of job metrics, useful for dashboard +// summaries and overall system utilization reports. +// +// Parameters: +// - ctx: Context for security checks and cancellation +// - filter: Filters to apply (time range, cluster, job state, etc.) +// +// Returns a single-element slice containing aggregate statistics: +// - totalJobs, totalUsers, totalWalltime +// - totalNodeHours, totalCoreHours, totalAccHours +// +// Unlike JobsStatsGrouped, this returns overall totals without breaking down by dimension. +// Security checks are applied via SecurityCheck to respect user access levels. 
func (r *JobRepository) JobsStats( ctx context.Context, filter []*model.JobFilter, @@ -303,6 +412,17 @@ func LoadJobStat(job *schema.Job, metric string, statType string) float64 { return 0.0 } +// JobCountGrouped counts jobs grouped by a dimension without computing detailed statistics. +// +// This is a lightweight alternative to JobsStatsGrouped when only job counts are needed, +// avoiding the overhead of calculating walltime and resource usage metrics. +// +// Parameters: +// - ctx: Context for security checks +// - filter: Filters to apply +// - groupBy: Grouping dimension (User, Project, Cluster, or Subcluster) +// +// Returns JobsStatistics with only ID and TotalJobs populated for each group. func (r *JobRepository) JobCountGrouped( ctx context.Context, filter []*model.JobFilter, @@ -343,6 +463,20 @@ func (r *JobRepository) JobCountGrouped( return stats, nil } +// AddJobCountGrouped augments existing statistics with additional job counts by category. +// +// This method enriches JobsStatistics returned by JobsStatsGrouped or JobCountGrouped +// with counts of running or short-running jobs, matched by group ID. +// +// Parameters: +// - ctx: Context for security checks +// - filter: Filters to apply +// - groupBy: Grouping dimension (must match the dimension used for stats parameter) +// - stats: Existing statistics to augment (modified in-place by ID matching) +// - kind: "running" to add RunningJobs count, "short" to add ShortJobs count +// +// Returns the same stats slice with RunningJobs or ShortJobs fields populated per group. +// Groups without matching jobs will have 0 for the added field. func (r *JobRepository) AddJobCountGrouped( ctx context.Context, filter []*model.JobFilter, @@ -392,6 +526,18 @@ func (r *JobRepository) AddJobCountGrouped( return stats, nil } +// AddJobCount augments existing overall statistics with additional job counts by category. +// +// Similar to AddJobCountGrouped but for ungrouped statistics. 
Applies the same count +// to all statistics entries (typically just one). +// +// Parameters: +// - ctx: Context for security checks +// - filter: Filters to apply +// - stats: Existing statistics to augment (modified in-place) +// - kind: "running" to add RunningJobs count, "short" to add ShortJobs count +// +// Returns the same stats slice with RunningJobs or ShortJobs fields set to the total count. func (r *JobRepository) AddJobCount( ctx context.Context, filter []*model.JobFilter, @@ -437,6 +583,26 @@ func (r *JobRepository) AddJobCount( return stats, nil } +// AddHistograms augments statistics with distribution histograms for job properties. +// +// Generates histogram data for visualization of job duration, node count, core count, +// and accelerator count distributions. Duration histogram uses intelligent binning based +// on the requested resolution. +// +// Parameters: +// - ctx: Context for security checks +// - filter: Filters to apply to jobs included in histograms +// - stat: Statistics struct to augment (modified in-place) +// - durationBins: Bin size - "1m", "10m", "1h", "6h", "12h", or "24h" (default) +// +// Populates these fields in stat: +// - HistDuration: Job duration distribution (zero-padded bins) +// - HistNumNodes: Node count distribution +// - HistNumCores: Core (hwthread) count distribution +// - HistNumAccs: Accelerator count distribution +// +// Duration bins are pre-initialized with zeros to ensure consistent ranges for visualization. +// Bin size determines both the width and maximum duration displayed (e.g., "1h" = 48 bins × 1h = 48h max). 
func (r *JobRepository) AddHistograms( ctx context.Context, filter []*model.JobFilter, @@ -447,20 +613,20 @@ func (r *JobRepository) AddHistograms( var targetBinCount int var targetBinSize int - switch { - case *durationBins == "1m": // 1 Minute Bins + Max 60 Bins -> Max 60 Minutes + switch *durationBins { + case "1m": // 1 Minute Bins + Max 60 Bins -> Max 60 Minutes targetBinCount = 60 targetBinSize = 60 - case *durationBins == "10m": // 10 Minute Bins + Max 72 Bins -> Max 12 Hours + case "10m": // 10 Minute Bins + Max 72 Bins -> Max 12 Hours targetBinCount = 72 targetBinSize = 600 - case *durationBins == "1h": // 1 Hour Bins + Max 48 Bins -> Max 48 Hours + case "1h": // 1 Hour Bins + Max 48 Bins -> Max 48 Hours targetBinCount = 48 targetBinSize = 3600 - case *durationBins == "6h": // 6 Hour Bins + Max 12 Bins -> Max 3 Days + case "6h": // 6 Hour Bins + Max 12 Bins -> Max 3 Days targetBinCount = 12 targetBinSize = 21600 - case *durationBins == "12h": // 12 hour Bins + Max 14 Bins -> Max 7 Days + case "12h": // 12 hour Bins + Max 14 Bins -> Max 7 Days targetBinCount = 14 targetBinSize = 43200 default: // 24h @@ -499,7 +665,30 @@ func (r *JobRepository) AddHistograms( return stat, nil } -// Requires thresholds for metric from config for cluster? Of all clusters and use largest? split to 10 + 1 for artifacts? +// AddMetricHistograms augments statistics with distribution histograms for job metrics. +// +// Generates histogram data for metrics like CPU load, memory usage, etc. Handles running +// and completed jobs differently: running jobs load data from metric backend, completed jobs +// use footprint data from database. 
+// +// Parameters: +// - ctx: Context for security checks +// - filter: Filters to apply (MUST contain State filter for running jobs) +// - metrics: List of metric names to histogram (e.g., ["cpu_load", "mem_used"]) +// - stat: Statistics struct to augment (modified in-place) +// - targetBinCount: Number of histogram bins (default: 10) +// +// Populates HistMetrics field in stat with MetricHistoPoints for each metric. +// +// Binning algorithm: +// - Values normalized to metric's peak value from cluster configuration +// - Bins evenly distributed from 0 to peak +// - Pre-initialized with zeros for consistent visualization +// +// Limitations: +// - Running jobs: Limited to 500 jobs for performance +// - Requires valid cluster configuration with metric peak values +// - Uses footprint statistic (avg/max/min) configured per metric func (r *JobRepository) AddMetricHistograms( ctx context.Context, filter []*model.JobFilter, @@ -534,7 +723,16 @@ func (r *JobRepository) AddMetricHistograms( return stat, nil } -// `value` must be the column grouped by, but renamed to "value" +// jobsStatisticsHistogram generates a simple histogram by grouping on a column value. +// +// Used for histograms where the column value directly represents the bin (e.g., node count, core count). +// Unlike duration/metric histograms, this doesn't pre-initialize bins with zeros. +// +// Parameters: +// - value: SQL expression that produces the histogram value, aliased as "value" +// - filters: Job filters to apply +// +// Returns histogram points with Value (from column) and Count (number of jobs). func (r *JobRepository) jobsStatisticsHistogram( ctx context.Context, value string, @@ -573,6 +771,26 @@ func (r *JobRepository) jobsStatisticsHistogram( return points, nil } +// jobsDurationStatisticsHistogram generates a duration histogram with pre-initialized bins. 
+// +// Bins are zero-padded to provide consistent ranges for visualization, unlike simple +// histograms which only return bins with data. The value parameter should compute +// the bin number from job duration. +// +// Parameters: +// - value: SQL expression computing bin number from duration, aliased as "value" +// - filters: Job filters to apply +// - binSizeSeconds: Width of each bin in seconds +// - targetBinCount: Number of bins to pre-initialize +// +// Returns histogram points with Value (bin_number × binSizeSeconds) and Count. +// All bins from 1 to targetBinCount are returned, with Count=0 for empty bins. +// +// Algorithm: +// 1. Pre-initialize targetBinCount bins with zero counts +// 2. Query database for actual counts per bin +// 3. Match query results to pre-initialized bins by value +// 4. Bins without matches remain at zero func (r *JobRepository) jobsDurationStatisticsHistogram( ctx context.Context, value string, @@ -588,7 +806,6 @@ func (r *JobRepository) jobsDurationStatisticsHistogram( return nil, qerr } - // Initialize histogram bins with zero counts // Each bin represents a duration range: bin N = [N*binSizeSeconds, (N+1)*binSizeSeconds) // Example: binSizeSeconds=3600 (1 hour), bin 1 = 0-1h, bin 2 = 1-2h, etc. points := make([]*model.HistoPoint, 0) @@ -607,8 +824,8 @@ func (r *JobRepository) jobsDurationStatisticsHistogram( return nil, err } - // Match query results to pre-initialized bins and fill counts - // Query returns raw duration values that need to be mapped to correct bins + // Match query results to pre-initialized bins. + // point.Value from query is the bin number; multiply by binSizeSeconds to match bin.Value. 
for rows.Next() { point := model.HistoPoint{} if err := rows.Scan(&point.Value, &point.Count); err != nil { @@ -616,13 +833,8 @@ func (r *JobRepository) jobsDurationStatisticsHistogram( return nil, err } - // Find matching bin and update count - // point.Value is multiplied by binSizeSeconds to match pre-calculated bin.Value for _, e := range points { if e.Value == (point.Value * binSizeSeconds) { - // Note: Matching on unmodified integer value (and multiplying point.Value - // by binSizeSeconds after match) causes frontend to loop into highest - // targetBinCount, due to zoom condition instantly being fulfilled (cause unknown) e.Count = point.Count break } @@ -633,13 +845,34 @@ func (r *JobRepository) jobsDurationStatisticsHistogram( return points, nil } +// jobsMetricStatisticsHistogram generates a metric histogram using footprint data from completed jobs. +// +// Values are normalized to the metric's peak value and distributed into bins. The algorithm +// is based on SQL histogram generation techniques, extracting metric values from JSON footprint +// and computing bin assignments in SQL. +// +// Parameters: +// - metric: Metric name (e.g., "cpu_load", "mem_used") +// - filters: Job filters to apply +// - bins: Number of bins to generate +// +// Returns MetricHistoPoints with metric name, unit, footprint stat type, and binned data. +// +// Algorithm: +// 1. Determine peak value from cluster configuration (filtered cluster or max across all) +// 2. Generate SQL that extracts footprint value, normalizes to [0,1], multiplies by bin count +// 3. Pre-initialize bins with min/max ranges based on peak value +// 4. Query database for counts per bin +// 5. Match results to pre-initialized bins +// +// Special handling: Values exactly equal to peak are forced into the last bin by multiplying +// peak by 0.999999999 to avoid creating an extra bin. 
func (r *JobRepository) jobsMetricStatisticsHistogram( ctx context.Context, metric string, filters []*model.JobFilter, bins *int, ) (*model.MetricHistoPoints, error) { - // Determine the metric's peak value for histogram normalization // Peak value defines the upper bound for binning: values are distributed across // bins from 0 to peak. First try to get peak from filtered cluster, otherwise // scan all clusters to find the maximum peak value. @@ -679,18 +912,14 @@ func (r *JobRepository) jobsMetricStatisticsHistogram( } } - // Construct SQL histogram bins using normalized values + // Construct SQL histogram bins using normalized values. // Algorithm based on: https://jereze.com/code/sql-histogram/ (modified) start := time.Now() - // Calculate bin number for each job's metric value: - // 1. Extract metric value from JSON footprint - // 2. Normalize to [0,1] by dividing by peak - // 3. Multiply by number of bins to get bin number - // 4. Cast to integer for bin assignment - // - // Special case: Values exactly equal to peak would fall into bin N+1, - // so we multiply peak by 0.999999999 to force it into the last bin (bin N) + // Bin calculation formula: + // bin_number = CAST( (value / peak) * num_bins AS INTEGER ) + 1 + // Special case: value == peak would create bin N+1, so we test for equality + // and multiply peak by 0.999999999 to force it into bin N. 
binQuery := fmt.Sprintf(`CAST( ((case when json_extract(footprint, "$.%s") = %f then %f*0.999999999 else json_extract(footprint, "$.%s") end) / %f) * %v as INTEGER )`, @@ -699,24 +928,19 @@ func (r *JobRepository) jobsMetricStatisticsHistogram( mainQuery := sq.Select( fmt.Sprintf(`%s + 1 as bin`, binQuery), `count(*) as count`, - // For Debug: // fmt.Sprintf(`CAST((%f / %d) as INTEGER ) * %s as min`, peak, *bins, binQuery), - // For Debug: // fmt.Sprintf(`CAST((%f / %d) as INTEGER ) * (%s + 1) as max`, peak, *bins, binQuery), ).From("job").Where( "JSON_VALID(footprint)", ).Where(fmt.Sprintf(`json_extract(footprint, "$.%s") is not null and json_extract(footprint, "$.%s") <= %f`, (metric + "_" + footprintStat), (metric + "_" + footprintStat), peak)) - // Only accessible Jobs... mainQuery, qerr := SecurityCheck(ctx, mainQuery) if qerr != nil { return nil, qerr } - // Filters... for _, f := range filters { mainQuery = BuildWhereClause(f, mainQuery) } - // Finalize query with Grouping and Ordering mainQuery = mainQuery.GroupBy("bin").OrderBy("bin") rows, err := mainQuery.RunWith(r.DB).Query() @@ -725,8 +949,7 @@ func (r *JobRepository) jobsMetricStatisticsHistogram( return nil, err } - // Initialize histogram bins with calculated min/max ranges - // Each bin represents a range of metric values + // Pre-initialize bins with calculated min/max ranges. // Example: peak=1000, bins=10 -> bin 1=[0,100), bin 2=[100,200), ..., bin 10=[900,1000] points := make([]*model.MetricHistoPoint, 0) binStep := int(peak) / *bins @@ -737,29 +960,18 @@ func (r *JobRepository) jobsMetricStatisticsHistogram( points = append(points, &epoint) } - // Fill counts from query results - // Query only returns bins that have jobs, so we match against pre-initialized bins + // Match query results to pre-initialized bins. 
for rows.Next() { rpoint := model.MetricHistoPoint{} - if err := rows.Scan(&rpoint.Bin, &rpoint.Count); err != nil { // Required for Debug: &rpoint.Min, &rpoint.Max + if err := rows.Scan(&rpoint.Bin, &rpoint.Count); err != nil { cclog.Warnf("Error while scanning rows for %s", metric) - return nil, err // FIXME: Totally bricks cc-backend if returned and if all metrics requested? + return nil, err } - // Match query result to pre-initialized bin and update count for _, e := range points { - if e.Bin != nil && rpoint.Bin != nil { - if *e.Bin == *rpoint.Bin { - e.Count = rpoint.Count - // Only Required For Debug: Check DB returned Min/Max against Backend Init above - // if rpoint.Min != nil { - // cclog.Warnf(">>>> Bin %d Min Set For %s to %d (Init'd with: %d)", *e.Bin, metric, *rpoint.Min, *e.Min) - // } - // if rpoint.Max != nil { - // cclog.Warnf(">>>> Bin %d Max Set For %s to %d (Init'd with: %d)", *e.Bin, metric, *rpoint.Max, *e.Max) - // } - break - } + if e.Bin != nil && rpoint.Bin != nil && *e.Bin == *rpoint.Bin { + e.Count = rpoint.Count + break } } } @@ -770,6 +982,28 @@ func (r *JobRepository) jobsMetricStatisticsHistogram( return &result, nil } +// runningJobsMetricStatisticsHistogram generates metric histograms for running jobs using live data. +// +// Unlike completed jobs which use footprint data from the database, running jobs require +// fetching current metric averages from the metric backend (via metricdispatch). +// +// Parameters: +// - metrics: List of metric names +// - filters: Job filters (should filter to running jobs only) +// - bins: Number of histogram bins +// +// Returns slice of MetricHistoPoints, one per metric. +// +// Limitations: +// - Maximum 500 jobs (returns nil if more jobs match) +// - Requires metric backend availability +// - Bins based on metric peak values from cluster configuration +// +// Algorithm: +// 1. Query first 501 jobs to check count limit +// 2. Load metric averages for all jobs via metricdispatch +// 3. 
For each metric, create bins based on peak value +// 4. Iterate averages and count jobs per bin func (r *JobRepository) runningJobsMetricStatisticsHistogram( ctx context.Context, metrics []string, diff --git a/internal/repository/tags.go b/internal/repository/tags.go index f6cccfe2..861cbb76 100644 --- a/internal/repository/tags.go +++ b/internal/repository/tags.go @@ -3,6 +3,34 @@ // Use of this source code is governed by a MIT-style // license that can be found in the LICENSE file. +// Package repository provides data access and persistence layer for ClusterCockpit. +// +// This file implements tag management functionality for job categorization and classification. +// Tags support both manual assignment (via REST/GraphQL APIs) and automatic detection +// (via tagger plugins). The implementation includes role-based access control through +// tag scopes and maintains bidirectional consistency between the SQL database and +// the file-based job archive. +// +// Database Schema: +// +// CREATE TABLE tag ( +// id INTEGER PRIMARY KEY AUTOINCREMENT, +// tag_type VARCHAR(255) NOT NULL, +// tag_name VARCHAR(255) NOT NULL, +// tag_scope VARCHAR(255) NOT NULL DEFAULT "global", +// CONSTRAINT tag_unique UNIQUE (tag_type, tag_name, tag_scope) +// ); +// +// CREATE TABLE jobtag ( +// job_id INTEGER, +// tag_id INTEGER, +// PRIMARY KEY (job_id, tag_id), +// FOREIGN KEY (job_id) REFERENCES job(id) ON DELETE CASCADE, +// FOREIGN KEY (tag_id) REFERENCES tag(id) ON DELETE CASCADE +// ); +// +// The jobtag junction table enables many-to-many relationships between jobs and tags. +// CASCADE deletion ensures referential integrity when jobs or tags are removed. 
package repository import ( @@ -73,7 +101,7 @@ func (r *JobRepository) AddTagDirect(job int64, tag int64) ([]*schema.Tag, error func (r *JobRepository) RemoveTag(user *schema.User, job, tag int64) ([]*schema.Tag, error) { j, err := r.FindByIDWithUser(user, job) if err != nil { - cclog.Warn("Error while finding job by id") + cclog.Warnf("Error while finding job %d for user %s during tag removal: %v", job, user.Username, err) return nil, err } @@ -93,7 +121,7 @@ func (r *JobRepository) RemoveTag(user *schema.User, job, tag int64) ([]*schema. archiveTags, err := r.getArchiveTags(&job) if err != nil { - cclog.Warn("Error while getting tags for job") + cclog.Warnf("Error while getting archive tags for job %d in RemoveTag: %v", job, err) return nil, err } @@ -104,7 +132,7 @@ func (r *JobRepository) RemoveTag(user *schema.User, job, tag int64) ([]*schema. // Requires user authentication for security checks. Used by REST API. func (r *JobRepository) RemoveJobTagByRequest(user *schema.User, job int64, tagType string, tagName string, tagScope string) ([]*schema.Tag, error) { // Get Tag ID to delete - tagID, exists := r.TagId(tagType, tagName, tagScope) + tagID, exists := r.TagID(tagType, tagName, tagScope) if !exists { cclog.Warnf("Tag does not exist (name, type, scope): %s, %s, %s", tagName, tagType, tagScope) return nil, fmt.Errorf("tag does not exist (name, type, scope): %s, %s, %s", tagName, tagType, tagScope) @@ -113,7 +141,7 @@ func (r *JobRepository) RemoveJobTagByRequest(user *schema.User, job int64, tagT // Get Job j, err := r.FindByIDWithUser(user, job) if err != nil { - cclog.Warn("Error while finding job by id") + cclog.Warnf("Error while finding job %d for user %s during tag removal by request: %v", job, user.Username, err) return nil, err } @@ -128,19 +156,30 @@ func (r *JobRepository) RemoveJobTagByRequest(user *schema.User, job int64, tagT tags, err := r.GetTags(user, &job) if err != nil { - cclog.Warn("Error while getting tags for job") + cclog.Warnf("Error 
while getting tags for job %d in RemoveJobTagByRequest: %v", job, err) return nil, err } archiveTags, err := r.getArchiveTags(&job) if err != nil { - cclog.Warn("Error while getting tags for job") + cclog.Warnf("Error while getting archive tags for job %d in RemoveJobTagByRequest: %v", job, err) return nil, err } return tags, archive.UpdateTags(j, archiveTags) } +// removeTagFromArchiveJobs updates the job archive for all affected jobs after a tag deletion. +// +// This function is called asynchronously (via goroutine) after removing a tag from the database +// to synchronize the file-based job archive with the database state. Errors are logged but not +// returned since this runs in the background. +// +// Parameters: +// - jobIds: Database IDs of all jobs that had the deleted tag +// +// Implementation note: Each job is processed individually to handle partial failures gracefully. +// If one job fails to update, others will still be processed. func (r *JobRepository) removeTagFromArchiveJobs(jobIds []int64) { for _, j := range jobIds { tags, err := r.getArchiveTags(&j) @@ -163,18 +202,18 @@ func (r *JobRepository) removeTagFromArchiveJobs(jobIds []int64) { // Used by REST API. Does not update tagged jobs in Job archive. func (r *JobRepository) RemoveTagByRequest(tagType string, tagName string, tagScope string) error { // Get Tag ID to delete - tagID, exists := r.TagId(tagType, tagName, tagScope) + tagID, exists := r.TagID(tagType, tagName, tagScope) if !exists { cclog.Warnf("Tag does not exist (name, type, scope): %s, %s, %s", tagName, tagType, tagScope) return fmt.Errorf("tag does not exist (name, type, scope): %s, %s, %s", tagName, tagType, tagScope) } - return r.RemoveTagById(tagID) + return r.RemoveTagByID(tagID) } // Removes a tag from db by tag id // Used by GraphQL API. 
-func (r *JobRepository) RemoveTagById(tagID int64) error { +func (r *JobRepository) RemoveTagByID(tagID int64) error { jobIds, err := r.FindJobIdsByTag(tagID) if err != nil { return err @@ -213,7 +252,7 @@ func (r *JobRepository) RemoveTagById(tagID int64) error { // Example: // // tagID, err := repo.CreateTag("performance", "high-memory", "global") -func (r *JobRepository) CreateTag(tagType string, tagName string, tagScope string) (tagId int64, err error) { +func (r *JobRepository) CreateTag(tagType string, tagName string, tagScope string) (tagID int64, err error) { // Default to "Global" scope if none defined if tagScope == "" { tagScope = "global" @@ -300,13 +339,13 @@ func (r *JobRepository) CountTags(user *schema.User) (tags []schema.Tag, counts for rows.Next() { var tagType string var tagName string - var tagId int + var tagID int var count int - if err = rows.Scan(&tagType, &tagName, &tagId, &count); err != nil { + if err = rows.Scan(&tagType, &tagName, &tagID, &count); err != nil { return nil, nil, err } // Use tagId as second Map-Key component to differentiate tags with identical names - counts[fmt.Sprint(tagType, tagName, tagId)] = count + counts[fmt.Sprint(tagType, tagName, tagID)] = count } err = rows.Err() @@ -314,18 +353,44 @@ func (r *JobRepository) CountTags(user *schema.User) (tags []schema.Tag, counts } var ( - ErrTagNotFound = errors.New("the tag does not exist") - ErrJobNotOwned = errors.New("user is not owner of job") - ErrTagNoAccess = errors.New("user not permitted to use that tag") - ErrTagPrivateScope = errors.New("tag is private to another user") - ErrTagAdminScope = errors.New("tag requires admin privileges") + // ErrTagNotFound is returned when a tag ID or tag identifier (type, name, scope) does not exist in the database. + ErrTagNotFound = errors.New("the tag does not exist") + + // ErrJobNotOwned is returned when a user attempts to tag a job they do not have permission to access. 
+ ErrJobNotOwned = errors.New("user is not owner of job") + + // ErrTagNoAccess is returned when a user attempts to use a tag they cannot access due to scope restrictions. + ErrTagNoAccess = errors.New("user not permitted to use that tag") + + // ErrTagPrivateScope is returned when a user attempts to access another user's private tag. + ErrTagPrivateScope = errors.New("tag is private to another user") + + // ErrTagAdminScope is returned when a non-admin user attempts to use an admin-scoped tag. + ErrTagAdminScope = errors.New("tag requires admin privileges") + + // ErrTagsIncompatScopes is returned when attempting to combine admin and non-admin scoped tags in a single operation. ErrTagsIncompatScopes = errors.New("combining admin and non-admin scoped tags not allowed") ) // addJobTag is a helper function that inserts a job-tag association and updates the archive. -// Returns the updated tag list for the job. -func (r *JobRepository) addJobTag(jobId int64, tagId int64, job *schema.Job, getTags func() ([]*schema.Tag, error)) ([]*schema.Tag, error) { - q := sq.Insert("jobtag").Columns("job_id", "tag_id").Values(jobId, tagId) +// +// This function performs three operations atomically: +// 1. Inserts the job-tag association into the jobtag junction table +// 2. Retrieves the updated tag list for the job (using the provided getTags callback) +// 3. Updates the job archive with the new tags to maintain database-archive consistency +// +// Parameters: +// - jobId: Database ID of the job +// - tagId: Database ID of the tag to associate +// - job: Full job object needed for archive update +// - getTags: Callback function to retrieve updated tags (allows different security contexts) +// +// Returns the complete updated tag list for the job or an error. +// +// Note: This function does NOT validate tag scope permissions - callers must perform +// authorization checks before invoking this helper. 
+func (r *JobRepository) addJobTag(jobID int64, tagID int64, job *schema.Job, getTags func() ([]*schema.Tag, error)) ([]*schema.Tag, error) { + q := sq.Insert("jobtag").Columns("job_id", "tag_id").Values(jobID, tagID) if _, err := q.RunWith(r.stmtCache).Exec(); err != nil { s, _, _ := q.ToSql() @@ -335,13 +400,13 @@ func (r *JobRepository) addJobTag(jobId int64, tagId int64, job *schema.Job, get tags, err := getTags() if err != nil { - cclog.Warnf("Error getting tags for job %d: %v", jobId, err) + cclog.Warnf("Error getting tags for job %d: %v", jobID, err) return nil, err } - archiveTags, err := r.getArchiveTags(&jobId) + archiveTags, err := r.getArchiveTags(&jobID) if err != nil { - cclog.Warnf("Error getting archive tags for job %d: %v", jobId, err) + cclog.Warnf("Error getting archive tags for job %d: %v", jobID, err) return nil, err } @@ -350,7 +415,7 @@ func (r *JobRepository) addJobTag(jobId int64, tagId int64, job *schema.Job, get // AddTagOrCreate adds the tag with the specified type and name to the job with the database id `jobId`. // If such a tag does not yet exist, it is created. 
-func (r *JobRepository) AddTagOrCreate(user *schema.User, jobId int64, tagType string, tagName string, tagScope string) (tagId int64, err error) { +func (r *JobRepository) AddTagOrCreate(user *schema.User, jobID int64, tagType string, tagName string, tagScope string) (tagID int64, err error) { // Default to "Global" scope if none defined if tagScope == "" { tagScope = "global" @@ -364,44 +429,43 @@ func (r *JobRepository) AddTagOrCreate(user *schema.User, jobId int64, tagType s return 0, fmt.Errorf("cannot write tag scope with current authorization") } - tagId, exists := r.TagId(tagType, tagName, tagScope) + tagID, exists := r.TagID(tagType, tagName, tagScope) if !exists { - tagId, err = r.CreateTag(tagType, tagName, tagScope) + tagID, err = r.CreateTag(tagType, tagName, tagScope) if err != nil { return 0, err } } - if _, err := r.AddTag(user, jobId, tagId); err != nil { + if _, err := r.AddTag(user, jobID, tagID); err != nil { return 0, err } - return tagId, nil + return tagID, nil } -// used in auto tagger plugins -func (r *JobRepository) AddTagOrCreateDirect(jobId int64, tagType string, tagName string) (tagId int64, err error) { +func (r *JobRepository) AddTagOrCreateDirect(jobID int64, tagType string, tagName string) (tagID int64, err error) { tagScope := "global" - tagId, exists := r.TagId(tagType, tagName, tagScope) + tagID, exists := r.TagID(tagType, tagName, tagScope) if !exists { - tagId, err = r.CreateTag(tagType, tagName, tagScope) + tagID, err = r.CreateTag(tagType, tagName, tagScope) if err != nil { return 0, err } } - if _, err := r.AddTagDirect(jobId, tagId); err != nil { + if _, err := r.AddTagDirect(jobID, tagID); err != nil { return 0, err } - return tagId, nil + return tagID, nil } -func (r *JobRepository) HasTag(jobId int64, tagType string, tagName string) bool { +func (r *JobRepository) HasTag(jobID int64, tagType string, tagName string) bool { var id int64 q := sq.Select("id").From("tag").Join("jobtag ON jobtag.tag_id = tag.id"). 
- Where("jobtag.job_id = ?", jobId).Where("tag.tag_type = ?", tagType). + Where("jobtag.job_id = ?", jobID).Where("tag.tag_type = ?", tagType). Where("tag.tag_name = ?", tagName) err := q.RunWith(r.stmtCache).QueryRow().Scan(&id) if err != nil { @@ -411,21 +475,21 @@ func (r *JobRepository) HasTag(jobId int64, tagType string, tagName string) bool } } -// TagId returns the database id of the tag with the specified type and name. -func (r *JobRepository) TagId(tagType string, tagName string, tagScope string) (tagId int64, exists bool) { +// TagID returns the database id of the tag with the specified type and name. +func (r *JobRepository) TagID(tagType string, tagName string, tagScope string) (tagID int64, exists bool) { exists = true if err := sq.Select("id").From("tag"). Where("tag.tag_type = ?", tagType).Where("tag.tag_name = ?", tagName).Where("tag.tag_scope = ?", tagScope). - RunWith(r.stmtCache).QueryRow().Scan(&tagId); err != nil { + RunWith(r.stmtCache).QueryRow().Scan(&tagID); err != nil { exists = false } return } // TagInfo returns the database infos of the tag with the specified id. -func (r *JobRepository) TagInfo(tagId int64) (tagType string, tagName string, tagScope string, exists bool) { +func (r *JobRepository) TagInfo(tagID int64) (tagType string, tagName string, tagScope string, exists bool) { exists = true - if err := sq.Select("tag.tag_type", "tag.tag_name", "tag.tag_scope").From("tag").Where("tag.id = ?", tagId). + if err := sq.Select("tag.tag_type", "tag.tag_name", "tag.tag_scope").From("tag").Where("tag.id = ?", tagID). 
RunWith(r.stmtCache).QueryRow().Scan(&tagType, &tagName, &tagScope); err != nil { exists = false } @@ -450,7 +514,7 @@ func (r *JobRepository) GetTags(user *schema.User, job *int64) ([]*schema.Tag, e for rows.Next() { tag := &schema.Tag{} if err := rows.Scan(&tag.ID, &tag.Type, &tag.Name, &tag.Scope); err != nil { - cclog.Warn("Error while scanning rows") + cclog.Warnf("Error while scanning tag rows in GetTags: %v", err) return nil, err } // Handle Scope Filtering: Tag Scope is Global, Private (== Username) or User is auth'd to view Admin Tags @@ -483,7 +547,7 @@ func (r *JobRepository) GetTagsDirect(job *int64) ([]*schema.Tag, error) { for rows.Next() { tag := &schema.Tag{} if err := rows.Scan(&tag.ID, &tag.Type, &tag.Name, &tag.Scope); err != nil { - cclog.Warn("Error while scanning rows") + cclog.Warnf("Error while scanning tag rows in GetTagsDirect: %v", err) return nil, err } tags = append(tags, tag) @@ -492,7 +556,18 @@ func (r *JobRepository) GetTagsDirect(job *int64) ([]*schema.Tag, error) { return tags, nil } -// GetArchiveTags returns a list of all tags *regardless of scope* for archiving if job is nil or of the tags that the job with that database ID has. +// getArchiveTags returns all tags for a job WITHOUT applying scope-based filtering. +// +// This internal function is used exclusively for job archive synchronization where we need +// to store all tags regardless of the current user's permissions. Unlike GetTags() which +// filters by scope, this returns the complete unfiltered tag list. +// +// Parameters: +// - job: Pointer to job database ID, or nil to return all tags in the system +// +// Returns all tags without scope filtering, used only for archive operations. +// +// WARNING: Do NOT expose this function to user-facing APIs as it bypasses authorization. 
func (r *JobRepository) getArchiveTags(job *int64) ([]*schema.Tag, error) { q := sq.Select("id", "tag_type", "tag_name", "tag_scope").From("tag") if job != nil { @@ -510,7 +585,7 @@ func (r *JobRepository) getArchiveTags(job *int64) ([]*schema.Tag, error) { for rows.Next() { tag := &schema.Tag{} if err := rows.Scan(&tag.ID, &tag.Type, &tag.Name, &tag.Scope); err != nil { - cclog.Warn("Error while scanning rows") + cclog.Warnf("Error while scanning tag rows in getArchiveTags: %v", err) return nil, err } tags = append(tags, tag) @@ -519,18 +594,18 @@ func (r *JobRepository) getArchiveTags(job *int64) ([]*schema.Tag, error) { return tags, nil } -func (r *JobRepository) ImportTag(jobId int64, tagType string, tagName string, tagScope string) (err error) { +func (r *JobRepository) ImportTag(jobID int64, tagType string, tagName string, tagScope string) (err error) { // Import has no scope ctx, only import from metafile to DB (No recursive archive update required), only returns err - tagId, exists := r.TagId(tagType, tagName, tagScope) + tagID, exists := r.TagID(tagType, tagName, tagScope) if !exists { - tagId, err = r.CreateTag(tagType, tagName, tagScope) + tagID, err = r.CreateTag(tagType, tagName, tagScope) if err != nil { return err } } - q := sq.Insert("jobtag").Columns("job_id", "tag_id").Values(jobId, tagId) + q := sq.Insert("jobtag").Columns("job_id", "tag_id").Values(jobID, tagID) if _, err := q.RunWith(r.stmtCache).Exec(); err != nil { s, _, _ := q.ToSql() @@ -541,6 +616,28 @@ func (r *JobRepository) ImportTag(jobId int64, tagType string, tagName string, t return nil } +// checkScopeAuth validates whether a user is authorized to perform an operation on a tag with the given scope. 
+// +// This function implements the tag scope authorization matrix: +// +// Scope | Read Access | Write Access +// -------------|----------------------------------|---------------------------------- +// "global" | All users | Admin, Support, API-only +// "admin" | Admin, Support | Admin, API-only +// <username> | Owner only | Owner only (private tags) +// +// Parameters: +// - user: User attempting the operation (must not be nil) +// - operation: Either "read" or "write" +// - scope: Tag scope value ("global", "admin", or username for private tags) +// +// Returns: +// - pass: true if authorized, false if denied +// - err: error only if operation is invalid or user is nil +// +// Special cases: +// - API-only users (single role: RoleApi) can write to admin and global scopes for automation +// - Private tags use the username as scope, granting exclusive access to that user func (r *JobRepository) checkScopeAuth(user *schema.User, operation string, scope string) (pass bool, err error) { if user != nil { switch { diff --git a/pkg/archive/clusterConfig.go b/pkg/archive/clusterConfig.go index 6e4866eb..272eeb35 100644 --- a/pkg/archive/clusterConfig.go +++ b/pkg/archive/clusterConfig.go @@ -108,7 +108,7 @@ func initClusterConfig() error { } availability.SubClusters = append(availability.SubClusters, sc.Name) - sc.MetricConfig = append(sc.MetricConfig, *newMetric) + sc.MetricConfig = append(sc.MetricConfig, newMetric) if newMetric.Footprint != "" { sc.Footprint = append(sc.Footprint, newMetric.Name) @@ -282,7 +282,7 @@ func GetSubClusterByNode(cluster, hostname string) (string, error) { return "", fmt.Errorf("ARCHIVE/CLUSTERCONFIG > no subcluster found for cluster %v and host %v", cluster, hostname) } -func MetricIndex(mc []schema.MetricConfig, name string) (int, error) { +func MetricIndex(mc []*schema.MetricConfig, name string) (int, error) { for i, m := range mc { if m.Name == name { return i, nil diff --git a/pkg/archive/nodelist.go b/pkg/archive/nodelist.go index 
7a3784c3..42d8492a 100644 --- a/pkg/archive/nodelist.go +++ b/pkg/archive/nodelist.go @@ -3,6 +3,70 @@ // Use of this source code is governed by a MIT-style // license that can be found in the LICENSE file. +// Package archive provides nodelist parsing functionality for HPC cluster node specifications. +// +// # Overview +// +// The nodelist package implements parsing and querying of compact node list representations +// commonly used in HPC job schedulers and cluster management systems. It converts compressed +// node specifications (e.g., "node[01-10]") into queryable structures that can efficiently +// test node membership and expand to full node lists. +// +// # Node List Format +// +// Node lists use a compact syntax with the following rules: +// +// 1. Comma-separated terms represent alternative node patterns (OR logic) +// 2. Each term consists of a string prefix followed by optional numeric ranges +// 3. Numeric ranges are specified in square brackets with zero-padded start-end format +// 4. Multiple ranges within brackets are comma-separated +// 5. 
Range digits must be zero-padded and of equal length (e.g., "01-99" not "1-99") +// +// # Examples +// +// "node01" // Single node +// "node01,node02" // Multiple individual nodes +// "node[01-10]" // Range: node01 through node10 (zero-padded) +// "node[01-10,20-30]" // Multiple ranges: node01-10 and node20-30 +// "cn-00[10-20],cn-00[50-60]" // Different prefixes with ranges +// "login,compute[001-100]" // Mixed individual and range terms +// +// # Usage +// +// Parse a node list specification: +// +// nl, err := ParseNodeList("node[01-10],login") +// if err != nil { +// log.Fatal(err) +// } +// +// Check if a node name matches the list: +// +// if nl.Contains("node05") { +// // node05 is in the list +// } +// +// Expand to full list of node names: +// +// nodes := nl.PrintList() // ["node01", "node02", ..., "node10", "login"] +// +// Count total nodes in the list: +// +// count := nl.NodeCount() // 11 (10 from range + 1 individual) +// +// # Integration +// +// This package is used by: +// - clusterConfig.go: Parses SubCluster.Nodes field from cluster configuration +// - schema.resolvers.go: GraphQL resolver for computing numberOfNodes in subclusters +// - Job archive: Validates node assignments against configured cluster topology +// +// # Constraints +// +// - Only zero-padded numeric ranges are supported +// - Range start and end must have identical digit counts +// - No whitespace allowed in node list specifications +// - Ranges must be specified as start-end (not individual numbers) package archive import ( @@ -13,12 +77,36 @@ import ( cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger" ) +// NodeList represents a parsed node list specification as a collection of node pattern terms. +// Each term is a sequence of expressions that must match consecutively for a node name to match. +// Terms are evaluated with OR logic - a node matches if ANY term matches completely. 
+// +// Internal structure: +// - Outer slice: OR terms (comma-separated in input) +// - Inner slice: AND expressions (must all match sequentially) +// - Each expression implements: consume (pattern matching), limits (range info), prefix (string part) +// +// Example: "node[01-10],login" becomes: +// - Term 1: [NLExprString("node"), NLExprIntRanges(01-10)] +// - Term 2: [NLExprString("login")] type NodeList [][]interface { consume(input string) (next string, ok bool) limits() []map[string]int prefix() string } +// Contains tests whether the given node name matches any pattern in the NodeList. +// Returns true if the name matches at least one term completely, false otherwise. +// +// Matching logic: +// - Evaluates each term sequentially (OR logic across terms) +// - Within a term, all expressions must match in order (AND logic) +// - A match is complete only if the entire input is consumed (str == "") +// +// Examples: +// - NodeList("node[01-10]").Contains("node05") → true +// - NodeList("node[01-10]").Contains("node11") → false +// - NodeList("node[01-10]").Contains("node5") → false (missing zero-padding) func (nl *NodeList) Contains(name string) bool { var ok bool for _, term := range *nl { @@ -38,14 +126,22 @@ func (nl *NodeList) Contains(name string) bool { return false } +// PrintList expands the NodeList into a full slice of individual node names. +// This performs the inverse operation of ParseNodeList, expanding all ranges +// into their constituent node names with proper zero-padding. +// +// Returns a slice of node names in the order they appear in the NodeList. +// For range terms, nodes are expanded in ascending numeric order. 
+// +// Example: +// - ParseNodeList("node[01-03],login").PrintList() → ["node01", "node02", "node03", "login"] func (nl *NodeList) PrintList() []string { var out []string for _, term := range *nl { - // Get String-Part first prefix := term[0].prefix() - if len(term) == 1 { // If only String-Part in Term: Single Node Name -> Use as provided + if len(term) == 1 { out = append(out, prefix) - } else { // Else: Numeric start-end definition with x digits zeroPadded + } else { limitArr := term[1].limits() for _, inner := range limitArr { for i := inner["start"]; i < inner["end"]+1; i++ { @@ -61,12 +157,22 @@ func (nl *NodeList) PrintList() []string { return out } +// NodeCount returns the total number of individual nodes represented by the NodeList. +// This efficiently counts nodes without expanding the full list, making it suitable +// for large node ranges. +// +// Calculation: +// - Individual node terms contribute 1 +// - Range terms contribute (end - start + 1) for each range +// +// Example: +// - ParseNodeList("node[01-10],login").NodeCount() → 11 (10 from range + 1 individual) func (nl *NodeList) NodeCount() int { out := 0 for _, term := range *nl { - if len(term) == 1 { // If only String-Part in Term: Single Node Name -> add one + if len(term) == 1 { out += 1 - } else { // Else: Numeric start-end definition -> add difference + 1 + } else { limitArr := term[1].limits() for _, inner := range limitArr { out += (inner["end"] - inner["start"]) + 1 @@ -76,6 +182,8 @@ func (nl *NodeList) NodeCount() int { return out } +// NLExprString represents a literal string prefix in a node name pattern. +// It matches by checking if the input starts with this exact string. type NLExprString string func (nle NLExprString) consume(input string) (next string, ok bool) { @@ -96,6 +204,8 @@ func (nle NLExprString) prefix() string { return string(nle) } +// NLExprIntRanges represents multiple alternative integer ranges (comma-separated within brackets). 
+// A node name matches if it matches ANY of the contained ranges (OR logic). type NLExprIntRanges []NLExprIntRange func (nles NLExprIntRanges) consume(input string) (next string, ok bool) { @@ -122,6 +232,11 @@ func (nles NLExprIntRanges) prefix() string { return s } +// NLExprIntRange represents a single zero-padded integer range (e.g., "01-99"). +// Fields: +// - start, end: Numeric range boundaries (inclusive) +// - zeroPadded: Must be true (non-padded ranges not supported) +// - digits: Required digit count for zero-padding type NLExprIntRange struct { start, end int64 zeroPadded bool @@ -176,6 +291,28 @@ func (nles NLExprIntRange) prefix() string { return s } +// ParseNodeList parses a compact node list specification into a queryable NodeList structure. +// +// Input format rules: +// - Comma-separated terms (OR logic): "node01,node02" matches either node +// - Range syntax: "node[01-10]" expands to node01 through node10 +// - Multiple ranges: "node[01-05,10-15]" creates two ranges +// - Zero-padding required: digits in ranges must be zero-padded and equal length +// - Mixed formats: "login,compute[001-100]" combines individual and range terms +// +// Validation: +// - Returns error if brackets are unclosed +// - Returns error if ranges lack '-' separator +// - Returns error if range digits have unequal length +// - Returns error if range numbers fail to parse +// - Returns error on invalid characters +// +// Examples: +// - "node[01-10]" → NodeList with one term (10 nodes) +// - "node01,node02" → NodeList with two terms (2 nodes) +// - "cn[01-05,10-15]" → NodeList with ranges 01-05 and 10-15 (11 nodes total) +// - "a[1-9]" → Error (not zero-padded) +// - "a[01-9]" → Error (unequal digit counts) func ParseNodeList(raw string) (NodeList, error) { isLetter := func(r byte) bool { return ('a' <= r && r <= 'z') || ('A' <= r && r <= 'Z') } isDigit := func(r byte) bool { return '0' <= r && r <= '9' } @@ -232,12 +369,12 @@ func ParseNodeList(raw string) (NodeList, 
error) { nles := NLExprIntRanges{} for _, part := range parts { - minus := strings.Index(part, "-") - if minus == -1 { + before, after, ok := strings.Cut(part, "-") + if !ok { return nil, fmt.Errorf("ARCHIVE/NODELIST > no '-' found inside '[...]'") } - s1, s2 := part[0:minus], part[minus+1:] + s1, s2 := before, after if len(s1) != len(s2) || len(s1) == 0 { return nil, fmt.Errorf("ARCHIVE/NODELIST > %v and %v are not of equal length or of length zero", s1, s2) }