cc-backend/internal/repository/job.go

package repository

import (
	"context"
	"database/sql"
	"encoding/json"
	"errors"
	"fmt"
	"strconv"
	"sync"
	"time"

	"github.com/ClusterCockpit/cc-backend/internal/auth"
	"github.com/ClusterCockpit/cc-backend/internal/graph/model"
	"github.com/ClusterCockpit/cc-backend/pkg/log"
	"github.com/ClusterCockpit/cc-backend/pkg/lrucache"
	"github.com/ClusterCockpit/cc-backend/pkg/schema"
	sq "github.com/Masterminds/squirrel"
	"github.com/jmoiron/sqlx"
)

// Singleton state backing GetRepository.
var (
	// jobRepoOnce ensures the repository is constructed only once.
	jobRepoOnce     sync.Once
	// jobRepoInstance is the shared repository handed out by GetRepository.
	jobRepoInstance *JobRepository
)

// JobRepository groups the database handle used for job queries with a
// prepared-statement cache and an in-memory LRU cache.
type JobRepository struct {
	// DB is the sqlx handle; a few queries in this file run on it directly
	// instead of going through stmtCache.
	DB *sqlx.DB

	// stmtCache is the squirrel statement cache most query builders run with.
	stmtCache *sq.StmtCache
	// cache stores job metadata under "metadata:<id>" and partition lists
	// under "partitions:<cluster>".
	cache     *lrucache.Cache
}

// GetRepository returns the shared JobRepository. The instance is built on
// the first call from the connection returned by GetConnection; every later
// call returns that same instance.
func GetRepository() *JobRepository {
	setup := func() {
		conn := GetConnection()

		repo := new(JobRepository)
		repo.DB = conn.DB
		repo.stmtCache = sq.NewStmtCache(conn.DB)
		repo.cache = lrucache.New(1024 * 1024)

		jobRepoInstance = repo
	}
	jobRepoOnce.Do(setup)

	return jobRepoInstance
}

// jobColumns lists the columns selected for a job row. The order must match
// the destination order used by scanJob. job.meta_data is deliberately not
// part of this list; FetchMetadata loads it separately.
var jobColumns = []string{
	// identification and placement
	"job.id", "job.job_id", "job.user", "job.project", "job.cluster",
	"job.subcluster", "job.start_time", "job.partition", "job.array_job_id",
	// allocated resources and state
	"job.num_nodes", "job.num_hwthreads", "job.num_acc", "job.exclusive",
	"job.monitoring_status", "job.smt", "job.job_state",
	// timing and raw resource description
	"job.duration", "job.walltime", "job.resources",
}

// scanJob reads one result row, selected with jobColumns, into a new
// schema.Job. The JSON in the resources column is decoded into job.Resources
// and the raw bytes are dropped afterwards. For a running job whose stored
// duration is 0, the duration is replaced by the seconds elapsed since the
// job's start time.
func scanJob(row interface{ Scan(...interface{}) error }) (*schema.Job, error) {
	var j schema.Job

	// Destination order mirrors jobColumns.
	dest := []interface{}{
		&j.ID, &j.JobID, &j.User, &j.Project, &j.Cluster, &j.SubCluster,
		&j.StartTimeUnix, &j.Partition, &j.ArrayJobId,
		&j.NumNodes, &j.NumHWThreads, &j.NumAcc, &j.Exclusive,
		&j.MonitoringStatus, &j.SMT, &j.State,
		&j.Duration, &j.Walltime, &j.RawResources,
	}
	if err := row.Scan(dest...); err != nil {
		return nil, err
	}

	err := json.Unmarshal(j.RawResources, &j.Resources)
	if err != nil {
		return nil, err
	}
	j.RawResources = nil

	j.StartTime = time.Unix(j.StartTimeUnix, 0)
	if j.State == schema.JobStateRunning && j.Duration == 0 {
		elapsed := time.Since(j.StartTime)
		j.Duration = int32(elapsed.Seconds())
	}

	return &j, nil
}

// FetchMetadata loads the metadata map of job, stores it in job.MetaData and
// returns it. A cached copy (key "metadata:<id>") is used when present;
// otherwise the meta_data column is read from the database, decoded from JSON
// and cached for 24 hours. If the column is empty, nil is returned without an
// error and nothing is cached.
func (r *JobRepository) FetchMetadata(job *schema.Job) (map[string]string, error) {
	key := fmt.Sprintf("metadata:%d", job.ID)

	hit := r.cache.Get(key, nil)
	if hit != nil {
		job.MetaData = hit.(map[string]string)
		return job.MetaData, nil
	}

	query := sq.Select("job.meta_data").
		From("job").
		Where("job.id = ?", job.ID).
		RunWith(r.stmtCache)
	if err := query.QueryRow().Scan(&job.RawMetaData); err != nil {
		return nil, err
	}

	rawLen := len(job.RawMetaData)
	if rawLen == 0 {
		return nil, nil
	}

	err := json.Unmarshal(job.RawMetaData, &job.MetaData)
	if err != nil {
		return nil, err
	}

	r.cache.Put(key, job.MetaData, rawLen, 24*time.Hour)
	return job.MetaData, nil
}

// UpdateMetadata sets key to val in the metadata of job and persists the
// result in the meta_data column.
//
// The cache entry for the job is dropped first. If job.MetaData is nil it is
// loaded via FetchMetadata. The update is applied to a fresh copy of the map,
// so a map previously assigned to job.MetaData is never modified in place.
// After a successful write the new map is cached for 24 hours.
func (r *JobRepository) UpdateMetadata(job *schema.Job, key, val string) (err error) {
	cacheKey := fmt.Sprintf("metadata:%d", job.ID)
	r.cache.Del(cacheKey)

	if job.MetaData == nil {
		_, err = r.FetchMetadata(job)
		if err != nil {
			return err
		}
	}

	// Ranging over a nil map is a no-op, so this also covers the case where
	// the job has no metadata yet.
	updated := make(map[string]string, len(job.MetaData)+1)
	for name, value := range job.MetaData {
		updated[name] = value
	}
	updated[key] = val
	job.MetaData = updated

	job.RawMetaData, err = json.Marshal(job.MetaData)
	if err != nil {
		return err
	}

	update := sq.Update("job").
		Set("meta_data", job.RawMetaData).
		Where("job.id = ?", job.ID)
	if _, err = update.RunWith(r.stmtCache).Exec(); err != nil {
		return err
	}

	r.cache.Put(cacheKey, job.MetaData, len(job.RawMetaData), 24*time.Hour)
	return nil
}

// Find executes a SQL query to find a specific batch job.
// The job is queried using the batch job id and, when they are non-nil, the
// cluster name and the start time of the job in UNIX epoch time seconds.
// jobId is mandatory: a nil jobId yields an error instead of a query.
// It returns a pointer to a schema.Job data structure and an error variable.
// To check if no job was found test err == sql.ErrNoRows
func (r *JobRepository) Find(
	jobId *int64,
	cluster *string,
	startTime *int64) (*schema.Job, error) {

	// cluster and startTime are optional, jobId is dereferenced below and
	// would otherwise cause a nil pointer panic.
	if jobId == nil {
		return nil, errors.New("job id must not be nil")
	}

	q := sq.Select(jobColumns...).From("job").
		Where("job.job_id = ?", *jobId)

	if cluster != nil {
		q = q.Where("job.cluster = ?", *cluster)
	}
	if startTime != nil {
		q = q.Where("job.start_time = ?", *startTime)
	}

	return scanJob(q.RunWith(r.stmtCache).QueryRow())
}

// FindById looks up a single job by its database id (job.id) and returns it
// as a *schema.Job. When no row matches, the returned error is
// sql.ErrNoRows.
func (r *JobRepository) FindById(jobId int64) (*schema.Job, error) {
	row := sq.Select(jobColumns...).
		From("job").
		Where("job.id = ?", jobId).
		RunWith(r.stmtCache).
		QueryRow()

	return scanJob(row)
}

// Start inserts job as a new row into the job table and returns the id
// reported by the database for the inserted row. The Resources and MetaData
// fields are JSON-encoded into RawResources and RawMetaData before the
// insert. Statistics are not transferred.
// On failure the returned id is -1.
func (r *JobRepository) Start(job *schema.JobMeta) (id int64, err error) {
	if job.RawResources, err = json.Marshal(job.Resources); err != nil {
		return -1, fmt.Errorf("encoding resources field failed: %w", err)
	}

	if job.RawMetaData, err = json.Marshal(job.MetaData); err != nil {
		return -1, fmt.Errorf("encoding metaData field failed: %w", err)
	}

	// `partition` is quoted with backticks in the column list.
	insert := `INSERT INTO job (
		job_id, user, project, cluster, subcluster, ` + "`partition`" + `, array_job_id, num_nodes, num_hwthreads, num_acc,
		exclusive, monitoring_status, smt, job_state, start_time, duration, walltime, resources, meta_data
	) VALUES (
		:job_id, :user, :project, :cluster, :subcluster, :partition, :array_job_id, :num_nodes, :num_hwthreads, :num_acc,
		:exclusive, :monitoring_status, :smt, :job_state, :start_time, :duration, :walltime, :resources, :meta_data
	);`

	result, execErr := r.DB.NamedExec(insert, job)
	if execErr != nil {
		return -1, execErr
	}

	return result.LastInsertId()
}

// Stop writes the final state, duration and monitoring status of the job
// with the database id jobId.
func (r *JobRepository) Stop(
	jobId int64,
	duration int32,
	state schema.JobState,
	monitoringStatus int32) (err error) {

	update := sq.Update("job")
	update = update.Set("job_state", state)
	update = update.Set("duration", duration)
	update = update.Set("monitoring_status", monitoringStatus)
	update = update.Where("job.id = ?", jobId)

	_, err = update.RunWith(r.stmtCache).Exec()
	return err
}

// CountGroupedJobs groups the jobs by the column named by aggreg and returns
// a map from each group value to its (optionally weighted) count, ordered by
// the database in descending count before an optional limit is applied.
//
// The query is restricted by SecurityCheck and by one BuildWhereClause call
// per entry in filters (both defined elsewhere in this package).
//
// weight selects what is summed per group:
//   - nil: the number of jobs,
//   - model.WeightsNodeCount: the sum of job.num_nodes,
//   - model.WeightsNodeHours: the sum of job.num_nodes multiplied by the
//     job's runtime in seconds (time since start for running jobs, the
//     stored duration otherwise).
//
// An invalid aggreg yields an error.
func (r *JobRepository) CountGroupedJobs(ctx context.Context, aggreg model.Aggregate, filters []*model.JobFilter, weight *model.Weights, limit *int) (map[string]int, error) {
	if !aggreg.IsValid() {
		return nil, errors.New("invalid aggregate")
	}

	runner := (sq.BaseRunner)(r.stmtCache)
	count := "count(*) as count"
	if weight != nil {
		switch *weight {
		case model.WeightsNodeCount:
			count = "sum(job.num_nodes) as count"
		case model.WeightsNodeHours:
			now := time.Now().Unix()
			// String literals are single-quoted, as in the other queries of
			// this file; double quotes denote identifiers in standard SQL.
			count = fmt.Sprintf(`sum(job.num_nodes * (CASE WHEN job.job_state = 'running' THEN %d - job.start_time ELSE job.duration END)) as count`, now)
			// The current timestamp is embedded in the statement text, so the
			// text changes over time; run it directly on the DB rather than
			// through the statement cache.
			runner = r.DB
		}
	}

	q := sq.Select("job."+string(aggreg), count).From("job").GroupBy("job." + string(aggreg)).OrderBy("count DESC")
	q = SecurityCheck(ctx, q)
	for _, f := range filters {
		q = BuildWhereClause(f, q)
	}
	if limit != nil {
		q = q.Limit(uint64(*limit))
	}

	rows, err := q.RunWith(runner).Query()
	if err != nil {
		return nil, err
	}
	// Release the result set on every return path.
	defer rows.Close()

	counts := map[string]int{}
	for rows.Next() {
		var group string
		var n int
		if err := rows.Scan(&group, &n); err != nil {
			return nil, err
		}

		counts[group] = n
	}
	// rows.Next returning false can also mean the iteration failed.
	if err := rows.Err(); err != nil {
		return nil, err
	}

	return counts, nil
}

// UpdateMonitoringStatus sets the monitoring_status column of the job with
// the database id job to monitoringStatus.
func (r *JobRepository) UpdateMonitoringStatus(job int64, monitoringStatus int32) (err error) {
	update := sq.Update("job")
	update = update.Set("monitoring_status", monitoringStatus)
	update = update.Where("job.id = ?", job)

	_, err = update.RunWith(r.stmtCache).Exec()
	return err
}

// Archive updates the job with the database id jobId: it sets the monitoring
// status and, for each known metric present in metricStats, the matching
// statistics column:
//
//	flops_any -> flops_any_avg (Avg)
//	mem_used  -> mem_used_max  (Max)
//	mem_bw    -> mem_bw_avg    (Avg)
//	load      -> load_avg      (Avg)
//	net_bw    -> net_bw_avg    (Avg)
//	file_bw   -> file_bw_avg   (Avg)
//
// Metrics not listed above are ignored.
func (r *JobRepository) Archive(
	jobId int64,
	monitoringStatus int32,
	metricStats map[string]schema.JobStatistics) error {

	// Fixed column order: map iteration order is random, and a different SET
	// order produces a different SQL string, each of which would be prepared
	// and kept separately by the statement cache.
	footprint := []struct {
		metric string
		column string
		useMax bool
	}{
		{"flops_any", "flops_any_avg", false},
		{"mem_used", "mem_used_max", true},
		{"mem_bw", "mem_bw_avg", false},
		{"load", "load_avg", false},
		{"net_bw", "net_bw_avg", false},
		{"file_bw", "file_bw_avg", false},
	}

	stmt := sq.Update("job").
		Set("monitoring_status", monitoringStatus).
		Where("job.id = ?", jobId)

	for _, f := range footprint {
		stats, ok := metricStats[f.metric]
		if !ok {
			continue
		}
		if f.useMax {
			stmt = stmt.Set(f.column, stats.Max)
		} else {
			stmt = stmt.Set(f.column, stats.Avg)
		}
	}

	_, err := stmt.RunWith(r.stmtCache).Exec()
	return err
}

// ErrNotFound is returned by FindJobOrUser when the search term matches
// neither a job nor a user.
var ErrNotFound = errors.New("no such job or user")

// FindJobOrUser returns a job database ID or a username, depending on which
// of the two matches the search term.
//
// A numeric search term is first tried as a batch job id; for a logged-in
// user without the admin role only that user's own jobs are considered.
// If no job matched, the term is tried as a username, but only when there is
// no user in the context or the user has the admin role.
//
// As 0 is a valid job id, check whether username is "" to find out what
// matched. If nothing matches the search, `ErrNotFound` is returned.
func (r *JobRepository) FindJobOrUser(ctx context.Context, searchterm string) (job int64, username string, err error) {
	user := auth.GetUser(ctx)

	if id, convErr := strconv.Atoi(searchterm); convErr == nil {
		jobQuery := sq.Select("job.id").From("job").Where("job.job_id = ?", id)
		if user != nil && !user.HasRole(auth.RoleAdmin) {
			jobQuery = jobQuery.Where("job.user = ?", user.Username)
		}

		switch scanErr := jobQuery.RunWith(r.stmtCache).QueryRow().Scan(&job); scanErr {
		case nil:
			return job, "", nil
		case sql.ErrNoRows:
			// No such job: continue with the username lookup below.
		default:
			return 0, "", scanErr
		}
	}

	if user == nil || user.HasRole(auth.RoleAdmin) {
		userQuery := sq.Select("job.user").
			Distinct().
			From("job").
			Where("job.user = ?", searchterm)

		switch scanErr := userQuery.RunWith(r.stmtCache).QueryRow().Scan(&username); scanErr {
		case nil:
			return 0, username, nil
		case sql.ErrNoRows:
			// No such user either.
		default:
			return 0, "", scanErr
		}
	}

	return 0, "", ErrNotFound
}

// Partitions returns the distinct partition names found in the job table for
// the given cluster. The list is cached under "partitions:<cluster>"; a
// successfully computed list is stored with a lifetime of one hour.
// A failed database query is reported as an error.
func (r *JobRepository) Partitions(cluster string) ([]string, error) {
	var err error
	cached := r.cache.Get("partitions:"+cluster, func() (interface{}, time.Duration, int) {
		parts := []string{}
		if err = r.DB.Select(&parts, `SELECT DISTINCT job.partition FROM job WHERE job.cluster = ?;`, cluster); err != nil {
			return nil, 0, 1000
		}

		return parts, 1 * time.Hour, 1
	})
	if err != nil {
		return nil, err
	}

	// err only reflects a computation run by this call. If the cache hands
	// back a value that is not a []string (for example nil), an unchecked
	// assertion would panic; report an error instead.
	// NOTE(review): whether the cache can return nil here without invoking
	// the closure depends on lrucache internals; confirm.
	partitions, ok := cached.([]string)
	if !ok {
		return nil, fmt.Errorf("no partition list available for cluster %q", cluster)
	}
	return partitions, nil
}

// AllocatedNodes returns a map of all subclusters to a map of hostnames to the amount of jobs running on that host.
// Only jobs of the given cluster whose state is 'running' are considered.
// Hosts with zero jobs running on them will not show up!
// An error is returned if the query, a row scan, the JSON decoding of a
// resources column or the row iteration itself fails.
func (r *JobRepository) AllocatedNodes(cluster string) (map[string]map[string]int, error) {
	rows, err := sq.Select("resources", "subcluster").From("job").
		Where("job.job_state = 'running'").
		Where("job.cluster = ?", cluster).
		RunWith(r.stmtCache).Query()
	if err != nil {
		return nil, err
	}
	defer rows.Close()

	subclusters := make(map[string]map[string]int)
	for rows.Next() {
		var raw []byte
		var subcluster string
		if err := rows.Scan(&raw, &subcluster); err != nil {
			return nil, err
		}

		var resources []*schema.Resource
		if err := json.Unmarshal(raw, &resources); err != nil {
			return nil, err
		}

		hosts, ok := subclusters[subcluster]
		if !ok {
			hosts = make(map[string]int)
			subclusters[subcluster] = hosts
		}

		for _, resource := range resources {
			hosts[resource.Hostname]++
		}
	}
	// Without this check an error that ends the iteration early would go
	// unnoticed and a partial result would be returned as complete.
	if err := rows.Err(); err != nil {
		return nil, err
	}

	return subclusters, nil
}

// StopJobsExceedingWalltimeBy marks every running job with a positive
// walltime as failed once it has been running for more than its walltime
// plus the given number of seconds. Affected jobs get the job state
// schema.JobStateFailed, the monitoring status
// schema.MonitoringStatusArchivingFailed and a duration of 0. A warning is
// logged when at least one job was changed.
func (r *JobRepository) StopJobsExceedingWalltimeBy(seconds int) error {
	// The current time is embedded into the statement text.
	overdue := fmt.Sprintf("(%d - job.start_time) > (job.walltime + %d)", time.Now().Unix(), seconds)

	update := sq.Update("job").
		Set("monitoring_status", schema.MonitoringStatusArchivingFailed).
		Set("duration", 0).
		Set("job_state", schema.JobStateFailed).
		Where("job.job_state = 'running'").
		Where("job.walltime > 0").
		Where(overdue)

	result, err := update.RunWith(r.DB).Exec()
	if err != nil {
		return err
	}

	stopped, err := result.RowsAffected()
	if err != nil {
		return err
	}
	if stopped <= 0 {
		return nil
	}

	log.Warnf("%d jobs have been marked as failed due to running too long", stopped)
	return nil
}
Start to extract DB repositories 2022-02-06 09:48:31 +01:00			`package repository`

			`import (`
Add /search endpoint which redirects to user/job 2022-02-09 15:03:12 +01:00			`"context"`
			`"database/sql"`
use prepared statements 2022-02-22 09:25:41 +01:00			`"encoding/json"`
Add /search endpoint which redirects to user/job 2022-02-09 15:03:12 +01:00			`"errors"`
Make metaData a map[string]string; Resolve explicitly 2022-03-08 11:53:24 +01:00			`"fmt"`
Add /search endpoint which redirects to user/job 2022-02-09 15:03:12 +01:00			`"strconv"`
Refactor directory structure 2022-06-21 17:52:36 +02:00			`"sync"`
use prepared statements 2022-02-22 09:25:41 +01:00			`"time"`
Add /search endpoint which redirects to user/job 2022-02-09 15:03:12 +01:00
Refactor directory structure 2022-06-21 17:52:36 +02:00			`"github.com/ClusterCockpit/cc-backend/internal/auth"`
			`"github.com/ClusterCockpit/cc-backend/internal/graph/model"`
			`"github.com/ClusterCockpit/cc-backend/pkg/log"`
Use internal lrucache 2022-06-22 06:11:00 +02:00			`"github.com/ClusterCockpit/cc-backend/pkg/lrucache"`
Refactor directory structure 2022-06-21 17:52:36 +02:00			`"github.com/ClusterCockpit/cc-backend/pkg/schema"`
Start to extract DB repositories 2022-02-06 09:48:31 +01:00			`sq "github.com/Masterminds/squirrel"`
			`"github.com/jmoiron/sqlx"`
			`)`

Refactor directory structure 2022-06-21 17:52:36 +02:00			`var (`
			`jobRepoOnce sync.Once`
			`jobRepoInstance *JobRepository`
			`)`

Start to extract DB repositories 2022-02-06 09:48:31 +01:00			`type JobRepository struct {`
			`DB *sqlx.DB`
use prepared statements 2022-02-22 09:25:41 +01:00
			`stmtCache *sq.StmtCache`
Add subcluster and walltime to Job types 2022-03-14 09:08:02 +01:00			`cache *lrucache.Cache`
Start to extract DB repositories 2022-02-06 09:48:31 +01:00			`}`

Refactor directory structure 2022-06-21 17:52:36 +02:00			`func GetRepository() *JobRepository {`
			`jobRepoOnce.Do(func() {`
			`db := GetConnection()`

			`jobRepoInstance = &JobRepository{`
			`DB: db.DB,`
			`stmtCache: sq.NewStmtCache(db.DB),`
			`cache: lrucache.New(1024 * 1024),`
			`}`
			`})`

			`return jobRepoInstance`
Add GraphQL endpoint for counting jobs 2022-02-19 10:28:29 +01:00			`}`

use prepared statements 2022-02-22 09:25:41 +01:00			`var jobColumns []string = []string{`
subclusters instead of slurm partitions 2022-03-14 10:18:56 +01:00			`"job.id", "job.job_id", "job.user", "job.project", "job.cluster", "job.subcluster", "job.start_time", "job.partition", "job.array_job_id",`
use prepared statements 2022-02-22 09:25:41 +01:00			`"job.num_nodes", "job.num_hwthreads", "job.num_acc", "job.exclusive", "job.monitoring_status", "job.smt", "job.job_state",`
subclusters instead of slurm partitions 2022-03-14 10:18:56 +01:00			`"job.duration", "job.walltime", "job.resources", // "job.meta_data",`
use prepared statements 2022-02-22 09:25:41 +01:00			`}`

			`func scanJob(row interface{ Scan(...interface{}) error }) (*schema.Job, error) {`
			`job := &schema.Job{}`
			`if err := row.Scan(`
subclusters instead of slurm partitions 2022-03-14 10:18:56 +01:00			`&job.ID, &job.JobID, &job.User, &job.Project, &job.Cluster, &job.SubCluster, &job.StartTimeUnix, &job.Partition, &job.ArrayJobId,`
use prepared statements 2022-02-22 09:25:41 +01:00			`&job.NumNodes, &job.NumHWThreads, &job.NumAcc, &job.Exclusive, &job.MonitoringStatus, &job.SMT, &job.State,`
subclusters instead of slurm partitions 2022-03-14 10:18:56 +01:00			`&job.Duration, &job.Walltime, &job.RawResources /&job.MetaData/); err != nil {`
use prepared statements 2022-02-22 09:25:41 +01:00			`return nil, err`
			`}`

			`if err := json.Unmarshal(job.RawResources, &job.Resources); err != nil {`
			`return nil, err`
			`}`

			`job.StartTime = time.Unix(job.StartTimeUnix, 0)`
			`if job.Duration == 0 && job.State == schema.JobStateRunning {`
			`job.Duration = int32(time.Since(job.StartTime).Seconds())`
			`}`

			`job.RawResources = nil`
			`return job, nil`
			`}`

Make metaData a map[string]string; Resolve explicitly 2022-03-08 11:53:24 +01:00			`func (r JobRepository) FetchMetadata(job schema.Job) (map[string]string, error) {`
Update frontend; Cache MetaData 2022-03-17 11:18:22 +01:00			`cachekey := fmt.Sprintf("metadata:%d", job.ID)`
			`if cached := r.cache.Get(cachekey, nil); cached != nil {`
			`job.MetaData = cached.(map[string]string)`
			`return job.MetaData, nil`
			`}`

Make metaData a map[string]string; Resolve explicitly 2022-03-08 11:53:24 +01:00			`if err := sq.Select("job.meta_data").From("job").Where("job.id = ?", job.ID).`
			`RunWith(r.stmtCache).QueryRow().Scan(&job.RawMetaData); err != nil {`
			`return nil, err`
			`}`

			`if len(job.RawMetaData) == 0 {`
			`return nil, nil`
			`}`

			`if err := json.Unmarshal(job.RawMetaData, &job.MetaData); err != nil {`
			`return nil, err`
			`}`

Update frontend; Cache MetaData 2022-03-17 11:18:22 +01:00			`r.cache.Put(cachekey, job.MetaData, len(job.RawMetaData), 24*time.Hour)`
Make metaData a map[string]string; Resolve explicitly 2022-03-08 11:53:24 +01:00			`return job.MetaData, nil`
			`}`

Update frontend; Cache MetaData 2022-03-17 11:18:22 +01:00			`func (r JobRepository) UpdateMetadata(job schema.Job, key, val string) (err error) {`
			`cachekey := fmt.Sprintf("metadata:%d", job.ID)`
			`r.cache.Del(cachekey)`
			`if job.MetaData == nil {`
			`if _, err = r.FetchMetadata(job); err != nil {`
			`return err`
			`}`
			`}`

			`if job.MetaData != nil {`
			`cpy := make(map[string]string, len(job.MetaData)+1)`
			`for k, v := range job.MetaData {`
			`cpy[k] = v`
			`}`
			`cpy[key] = val`
			`job.MetaData = cpy`
			`} else {`
			`job.MetaData = map[string]string{key: val}`
			`}`

			`if job.RawMetaData, err = json.Marshal(job.MetaData); err != nil {`
			`return err`
			`}`

			`if _, err = sq.Update("job").Set("meta_data", job.RawMetaData).Where("job.id = ?", job.ID).RunWith(r.stmtCache).Exec(); err != nil {`
			`return err`
			`}`

			`r.cache.Put(cachekey, job.MetaData, len(job.RawMetaData), 24*time.Hour)`
			`return nil`
			`}`

Refactor job repository API. Cleanup. 2022-02-07 09:57:06 +01:00			`// Find executes a SQL query to find a specific batch job.`
			`// The job is queried using the batch job id, the cluster name,`
			`// and the start time of the job in UNIX epoch time seconds.`
			`// It returns a pointer to a schema.Job data structure and an error variable.`
Change job repo interface 2022-02-07 14:56:46 +01:00			`// To check if no job was found test err == sql.ErrNoRows`
Refactor job repository API. Cleanup. 2022-02-07 09:57:06 +01:00			`func (r *JobRepository) Find(`
Allow variable number of parameters for stop job 2022-02-15 17:13:16 +01:00			`jobId *int64,`
			`cluster *string,`
			`startTime int64) (schema.Job, error) {`

use prepared statements 2022-02-22 09:25:41 +01:00			`q := sq.Select(jobColumns...).From("job").`
Minor fixes; Update frontend 2022-03-08 10:33:56 +01:00			`Where("job.job_id = ?", *jobId)`
Allow variable number of parameters for stop job 2022-02-15 17:13:16 +01:00
			`if cluster != nil {`
use prepared statements 2022-02-22 09:25:41 +01:00			`q = q.Where("job.cluster = ?", *cluster)`
Allow variable number of parameters for stop job 2022-02-15 17:13:16 +01:00			`}`
			`if startTime != nil {`
use prepared statements 2022-02-22 09:25:41 +01:00			`q = q.Where("job.start_time = ?", *startTime)`
Extract DB queries from REST API 2022-02-07 07:09:47 +01:00			`}`

use prepared statements 2022-02-22 09:25:41 +01:00			`return scanJob(q.RunWith(r.stmtCache).QueryRow())`
Extract DB queries from REST API 2022-02-07 07:09:47 +01:00			`}`

Refactor job repository API. Cleanup. 2022-02-07 09:57:06 +01:00			`// FindById executes a SQL query to find a specific batch job.`
			`// The job is queried using the database id.`
			`// It returns a pointer to a schema.Job data structure and an error variable.`
Change job repo interface 2022-02-07 14:56:46 +01:00			`// To check if no job was found test err == sql.ErrNoRows`
Refactor job repository API. Cleanup. 2022-02-07 09:57:06 +01:00			`func (r *JobRepository) FindById(`
			`jobId int64) (*schema.Job, error) {`
use prepared statements 2022-02-22 09:25:41 +01:00			`q := sq.Select(jobColumns...).`
			`From("job").Where("job.id = ?", jobId)`
			`return scanJob(q.RunWith(r.stmtCache).QueryRow())`
Extract DB queries from REST API 2022-02-07 07:09:47 +01:00			`}`

Create tags if needed 2022-02-08 12:49:28 +01:00			`// Start inserts a new job in the table, returning the unique job ID.`
			`// Statistics are not transfered!`
			`func (r JobRepository) Start(job schema.JobMeta) (id int64, err error) {`
Make metaData a map[string]string; Resolve explicitly 2022-03-08 11:53:24 +01:00			`job.RawResources, err = json.Marshal(job.Resources)`
			`if err != nil {`
			`return -1, fmt.Errorf("encoding resources field failed: %w", err)`
			`}`

			`job.RawMetaData, err = json.Marshal(job.MetaData)`
			`if err != nil {`
			`return -1, fmt.Errorf("encoding metaData field failed: %w", err)`
			`}`

Create tags if needed 2022-02-08 12:49:28 +01:00			res, err := r.DB.NamedExec(`INSERT INTO job (
Add subcluster and walltime to Job types 2022-03-14 09:08:02 +01:00			job_id, user, project, cluster, subcluster, `+"`partition`"+`, array_job_id, num_nodes, num_hwthreads, num_acc,
			`exclusive, monitoring_status, smt, job_state, start_time, duration, walltime, resources, meta_data`
Extract DB queries from REST API 2022-02-07 07:09:47 +01:00			`) VALUES (`
Add subcluster and walltime to Job types 2022-03-14 09:08:02 +01:00			`:job_id, :user, :project, :cluster, :subcluster, :partition, :array_job_id, :num_nodes, :num_hwthreads, :num_acc,`
			`:exclusive, :monitoring_status, :smt, :job_state, :start_time, :duration, :walltime, :resources, :meta_data`
Extract DB queries from REST API 2022-02-07 07:09:47 +01:00			);`, job)
Create tags if needed 2022-02-08 12:49:28 +01:00			`if err != nil {`
			`return -1, err`
			`}`

			`return res.LastInsertId()`
Extract DB queries from REST API 2022-02-07 07:09:47 +01:00			`}`

Create tags if needed 2022-02-08 12:49:28 +01:00			`// Stop updates the job with the database id jobId using the provided arguments.`
Extract DB queries from REST API 2022-02-07 07:09:47 +01:00			`func (r *JobRepository) Stop(`
			`jobId int64,`
			`duration int32,`
fix stop_job returned state; handle monitoring status 2022-02-15 14:25:39 +01:00			`state schema.JobState,`
			`monitoringStatus int32) (err error) {`
Extract DB queries from REST API 2022-02-07 07:09:47 +01:00
			`stmt := sq.Update("job").`
			`Set("job_state", state).`
			`Set("duration", duration).`
fix stop_job returned state; handle monitoring status 2022-02-15 14:25:39 +01:00			`Set("monitoring_status", monitoringStatus).`
Extract DB queries from REST API 2022-02-07 07:09:47 +01:00			`Where("job.id = ?", jobId)`

use prepared statements 2022-02-22 09:25:41 +01:00			`_, err = stmt.RunWith(r.stmtCache).Exec()`
Add error handling to stop job 2022-02-15 11:33:59 +01:00			`return`
Try to fix stop job issue. Add Archive to repo. 2022-02-15 11:10:49 +01:00			`}`

Add import command line flag 2022-02-24 11:54:36 +01:00			`// TODO: Use node hours instead: SELECT job.user, sum(job.num_nodes * (CASE WHEN job.job_state = "running" THEN CAST(strftime('%s', 'now') AS INTEGER) - job.start_time ELSE job.duration END)) as x FROM job GROUP BY user ORDER BY x DESC;`
Allow weighting job counts 2022-03-25 10:20:33 +01:00			`func (r JobRepository) CountGroupedJobs(ctx context.Context, aggreg model.Aggregate, filters []model.JobFilter, weight model.Weights, limit int) (map[string]int, error) {`
Add GraphQL endpoint for counting jobs 2022-02-19 10:28:29 +01:00			`if !aggreg.IsValid() {`
			`return nil, errors.New("invalid aggregate")`
			`}`

Allow weighting job counts 2022-03-25 10:20:33 +01:00			`runner := (sq.BaseRunner)(r.stmtCache)`
			`count := "count(*) as count"`
			`if weight != nil {`
			`switch *weight {`
			`case model.WeightsNodeCount:`
			`count = "sum(job.num_nodes) as count"`
			`case model.WeightsNodeHours:`
			`now := time.Now().Unix()`
			count = fmt.Sprintf(`sum(job.num_nodes * (CASE WHEN job.job_state = "running" THEN %d - job.start_time ELSE job.duration END)) as count`, now)
			`runner = r.DB`
			`}`
			`}`

			`q := sq.Select("job."+string(aggreg), count).From("job").GroupBy("job." + string(aggreg)).OrderBy("count DESC")`
Add GraphQL endpoint for counting jobs 2022-02-19 10:28:29 +01:00			`q = SecurityCheck(ctx, q)`
			`for _, f := range filters {`
			`q = BuildWhereClause(f, q)`
Add running/total job count to home 2022-02-16 12:29:54 +01:00			`}`
Add GraphQL endpoint for counting jobs 2022-02-19 10:28:29 +01:00			`if limit != nil {`
			`q = q.Limit(uint64(*limit))`
Add running/total job count to home 2022-02-16 12:29:54 +01:00			`}`

			`counts := map[string]int{}`
Allow weighting job counts 2022-03-25 10:20:33 +01:00			`rows, err := q.RunWith(runner).Query()`
Add running/total job count to home 2022-02-16 12:29:54 +01:00			`if err != nil {`
			`return nil, err`
			`}`

			`for rows.Next() {`
Add GraphQL endpoint for counting jobs 2022-02-19 10:28:29 +01:00			`var group string`
Add running/total job count to home 2022-02-16 12:29:54 +01:00			`var count int`
Add GraphQL endpoint for counting jobs 2022-02-19 10:28:29 +01:00			`if err := rows.Scan(&group, &count); err != nil {`
Add running/total job count to home 2022-02-16 12:29:54 +01:00			`return nil, err`
			`}`

Add GraphQL endpoint for counting jobs 2022-02-19 10:28:29 +01:00			`counts[group] = count`
Add running/total job count to home 2022-02-16 12:29:54 +01:00			`}`

			`return counts, nil`
			`}`

refactor stopJob, remove non-async archiving 2022-02-15 13:18:27 +01:00			`func (r *JobRepository) UpdateMonitoringStatus(job int64, monitoringStatus int32) (err error) {`
			`stmt := sq.Update("job").`
			`Set("monitoring_status", monitoringStatus).`
			`Where("job.id = ?", job)`

use prepared statements 2022-02-22 09:25:41 +01:00			`_, err = stmt.RunWith(r.stmtCache).Exec()`
refactor stopJob, remove non-async archiving 2022-02-15 13:18:27 +01:00			`return`
			`}`

Try to fix stop job issue. Add Archive to repo. 2022-02-15 11:10:49 +01:00			`// Stop updates the job with the database id jobId using the provided arguments.`
			`func (r *JobRepository) Archive(`
			`jobId int64,`
			`monitoringStatus int32,`
refactor stopJob, remove non-async archiving 2022-02-15 13:18:27 +01:00			`metricStats map[string]schema.JobStatistics) error {`
Try to fix stop job issue. Add Archive to repo. 2022-02-15 11:10:49 +01:00
			`stmt := sq.Update("job").`
			`Set("monitoring_status", monitoringStatus).`
			`Where("job.id = ?", jobId)`

Extract DB queries from REST API 2022-02-07 07:09:47 +01:00			`for metric, stats := range metricStats {`
			`switch metric {`
			`case "flops_any":`
			`stmt = stmt.Set("flops_any_avg", stats.Avg)`
			`case "mem_used":`
			`stmt = stmt.Set("mem_used_max", stats.Max)`
			`case "mem_bw":`
			`stmt = stmt.Set("mem_bw_avg", stats.Avg)`
			`case "load":`
			`stmt = stmt.Set("load_avg", stats.Avg)`
			`case "net_bw":`
			`stmt = stmt.Set("net_bw_avg", stats.Avg)`
			`case "file_bw":`
			`stmt = stmt.Set("file_bw_avg", stats.Avg)`
			`}`
			`}`

use prepared statements 2022-02-22 09:25:41 +01:00			`if _, err := stmt.RunWith(r.stmtCache).Exec(); err != nil {`
refactor stopJob, remove non-async archiving 2022-02-15 13:18:27 +01:00			`return err`
Extract DB queries from REST API 2022-02-07 07:09:47 +01:00			`}`
refactor stopJob, remove non-async archiving 2022-02-15 13:18:27 +01:00			`return nil`
Extract DB queries from REST API 2022-02-07 07:09:47 +01:00			`}`

Add /search endpoint which redirects to user/job 2022-02-09 15:03:12 +01:00			`var ErrNotFound = errors.New("no such job or user")`

			`// FindJobOrUser returns a job database ID or a username if a job or user machtes the search term.`
			`// As 0 is a valid job id, check if username is "" instead in order to check what machted.`
			// If nothing matches the search, `ErrNotFound` is returned.
			`func (r *JobRepository) FindJobOrUser(ctx context.Context, searchterm string) (job int64, username string, err error) {`
			`user := auth.GetUser(ctx)`
			`if id, err := strconv.Atoi(searchterm); err == nil {`
			`qb := sq.Select("job.id").From("job").Where("job.job_id = ?", id)`
			`if user != nil && !user.HasRole(auth.RoleAdmin) {`
			`qb = qb.Where("job.user = ?", user.Username)`
			`}`

use prepared statements 2022-02-22 09:25:41 +01:00			`err := qb.RunWith(r.stmtCache).QueryRow().Scan(&job)`
Add /search endpoint which redirects to user/job 2022-02-09 15:03:12 +01:00			`if err != nil && err != sql.ErrNoRows {`
			`return 0, "", err`
			`} else if err == nil {`
			`return job, "", nil`
			`}`
			`}`

			`if user == nil \|\| user.HasRole(auth.RoleAdmin) {`
			`err := sq.Select("job.user").Distinct().From("job").`
			`Where("job.user = ?", searchterm).`
use prepared statements 2022-02-22 09:25:41 +01:00			`RunWith(r.stmtCache).QueryRow().Scan(&username)`
Add /search endpoint which redirects to user/job 2022-02-09 15:03:12 +01:00			`if err != nil && err != sql.ErrNoRows {`
			`return 0, "", err`
			`} else if err == nil {`
			`return 0, username, nil`
			`}`
			`}`

			`return 0, "", ErrNotFound`
			`}`
Add subcluster and walltime to Job types 2022-03-14 09:08:02 +01:00
			`func (r *JobRepository) Partitions(cluster string) ([]string, error) {`
			`var err error`
			`partitions := r.cache.Get("partitions:"+cluster, func() (interface{}, time.Duration, int) {`
			`parts := []string{}`
			if err = r.DB.Select(&parts, `SELECT DISTINCT job.partition FROM job WHERE job.cluster = ?;`, cluster); err != nil {
			`return nil, 0, 1000`
			`}`

			`return parts, 1 * time.Hour, 1`
			`})`
			`if err != nil {`
			`return nil, err`
			`}`
			`return partitions.([]string), nil`
			`}`
Add allocatedNodes to the GraphQL API 2022-03-24 10:32:08 +01:00
Change allocatedNodes; Update frontend 2022-03-24 16:08:47 +01:00			`// AllocatedNodes returns a map of all subclusters to a map of hostnames to the amount of jobs running on that host.`
			`// Hosts with zero jobs running on them will not show up!`
			`func (r *JobRepository) AllocatedNodes(cluster string) (map[string]map[string]int, error) {`
			`subclusters := make(map[string]map[string]int)`
			`rows, err := sq.Select("resources", "subcluster").From("job").`
Add allocatedNodes to the GraphQL API 2022-03-24 10:32:08 +01:00			`Where("job.job_state = 'running'").`
			`Where("job.cluster = ?", cluster).`
			`RunWith(r.stmtCache).Query()`
			`if err != nil {`
			`return nil, err`
			`}`

			`var raw []byte`
			`defer rows.Close()`
			`for rows.Next() {`
			`raw = raw[0:0]`
			`var resources []*schema.Resource`
Change allocatedNodes; Update frontend 2022-03-24 16:08:47 +01:00			`var subcluster string`
			`if err := rows.Scan(&raw, &subcluster); err != nil {`
Add allocatedNodes to the GraphQL API 2022-03-24 10:32:08 +01:00			`return nil, err`
			`}`
			`if err := json.Unmarshal(raw, &resources); err != nil {`
			`return nil, err`
			`}`

Change allocatedNodes; Update frontend 2022-03-24 16:08:47 +01:00			`hosts, ok := subclusters[subcluster]`
			`if !ok {`
			`hosts = make(map[string]int)`
			`subclusters[subcluster] = hosts`
			`}`

Add allocatedNodes to the GraphQL API 2022-03-24 10:32:08 +01:00			`for _, resource := range resources {`
Change allocatedNodes; Update frontend 2022-03-24 16:08:47 +01:00			`hosts[resource.Hostname] += 1`
Add allocatedNodes to the GraphQL API 2022-03-24 10:32:08 +01:00			`}`
			`}`

Change allocatedNodes; Update frontend 2022-03-24 16:08:47 +01:00			`return subclusters, nil`
Add allocatedNodes to the GraphQL API 2022-03-24 10:32:08 +01:00			`}`
Automatically mark jobs as failed if running too long 2022-04-07 09:50:32 +02:00
			`func (r *JobRepository) StopJobsExceedingWalltimeBy(seconds int) error {`
			`res, err := sq.Update("job").`
			`Set("monitoring_status", schema.MonitoringStatusArchivingFailed).`
			`Set("duration", 0).`
			`Set("job_state", schema.JobStateFailed).`
			`Where("job.job_state = 'running'").`
Only autostop jobs with a positive walltime 2022-05-09 11:53:41 +02:00			`Where("job.walltime > 0").`
Automatically mark jobs as failed if running too long 2022-04-07 09:50:32 +02:00			`Where(fmt.Sprintf("(%d - job.start_time) > (job.walltime + %d)", time.Now().Unix(), seconds)).`
			`RunWith(r.DB).Exec()`
			`if err != nil {`
			`return err`
			`}`

			`rowsAffected, err := res.RowsAffected()`
			`if err != nil {`
			`return err`
			`}`

			`if rowsAffected > 0 {`
			`log.Warnf("%d jobs have been marked as failed due to running too long", rowsAffected)`
			`}`
			`return nil`
			`}`