2024-04-11 23:04:30 +02:00
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
2022-07-29 06:29:21 +02:00
// All rights reserved.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
2022-02-06 09:48:31 +01:00
package repository
import (
2022-02-09 15:03:12 +01:00
"database/sql"
2022-02-22 09:25:41 +01:00
"encoding/json"
2022-02-09 15:03:12 +01:00
"errors"
2022-03-08 11:53:24 +01:00
"fmt"
2024-09-27 13:45:44 +02:00
"math"
2022-02-09 15:03:12 +01:00
"strconv"
2022-06-21 17:52:36 +02:00
"sync"
2022-02-22 09:25:41 +01:00
"time"
2022-02-09 15:03:12 +01:00
2022-06-21 17:52:36 +02:00
"github.com/ClusterCockpit/cc-backend/internal/graph/model"
2024-03-08 08:51:05 +01:00
"github.com/ClusterCockpit/cc-backend/pkg/archive"
2022-06-21 17:52:36 +02:00
"github.com/ClusterCockpit/cc-backend/pkg/log"
2022-06-22 06:11:00 +02:00
"github.com/ClusterCockpit/cc-backend/pkg/lrucache"
2022-06-21 17:52:36 +02:00
"github.com/ClusterCockpit/cc-backend/pkg/schema"
2022-02-06 09:48:31 +01:00
sq "github.com/Masterminds/squirrel"
"github.com/jmoiron/sqlx"
)
2022-06-21 17:52:36 +02:00
// Singleton state backing GetJobRepository.
var (
	jobRepoOnce     sync.Once
	jobRepoInstance *JobRepository
)
2022-02-06 09:48:31 +01:00
// JobRepository provides database access to job records, their metadata
// and footprint columns. Obtain the shared instance via GetJobRepository.
type JobRepository struct {
	DB        *sqlx.DB
	stmtCache *sq.StmtCache   // prepared-statement cache for squirrel queries
	cache     *lrucache.Cache // in-memory LRU for metadata/footprint lookups
	driver    string          // database driver name ("sqlite3" or "mysql")
}
2022-09-05 17:46:38 +02:00
func GetJobRepository ( ) * JobRepository {
2022-06-21 17:52:36 +02:00
jobRepoOnce . Do ( func ( ) {
db := GetConnection ( )
jobRepoInstance = & JobRepository {
2023-02-21 17:17:41 +01:00
DB : db . DB ,
driver : db . Driver ,
2023-02-21 16:21:47 +01:00
2024-08-28 11:13:54 +02:00
stmtCache : sq . NewStmtCache ( db . DB ) ,
cache : lrucache . New ( 1024 * 1024 ) ,
2022-06-21 17:52:36 +02:00
}
} )
return jobRepoInstance
2022-02-19 10:28:29 +01:00
}
2022-02-22 09:25:41 +01:00
// jobColumns lists the job-table columns selected by job queries.
// The order must match the Scan targets in scanJob exactly.
var jobColumns []string = []string{
	"job.id", "job.job_id", "job.hpc_user", "job.project", "job.cluster", "job.subcluster", "job.start_time", "job.cluster_partition", "job.array_job_id",
	"job.num_nodes", "job.num_hwthreads", "job.num_acc", "job.exclusive", "job.monitoring_status", "job.smt", "job.job_state",
	"job.duration", "job.walltime", "job.resources", "job.footprint", "job.energy",
}
func scanJob ( row interface { Scan ( ... interface { } ) error } ) ( * schema . Job , error ) {
job := & schema . Job { }
2024-07-04 14:14:27 +02:00
2022-02-22 09:25:41 +01:00
if err := row . Scan (
2022-03-14 10:18:56 +01:00
& job . ID , & job . JobID , & job . User , & job . Project , & job . Cluster , & job . SubCluster , & job . StartTimeUnix , & job . Partition , & job . ArrayJobId ,
2022-02-22 09:25:41 +01:00
& job . NumNodes , & job . NumHWThreads , & job . NumAcc , & job . Exclusive , & job . MonitoringStatus , & job . SMT , & job . State ,
2024-09-27 13:45:44 +02:00
& job . Duration , & job . Walltime , & job . RawResources , & job . RawFootprint , & job . Energy ) ; err != nil {
2023-06-01 17:48:43 +02:00
log . Warnf ( "Error while scanning rows (Job): %v" , err )
2022-02-22 09:25:41 +01:00
return nil , err
}
if err := json . Unmarshal ( job . RawResources , & job . Resources ) ; err != nil {
2024-06-28 16:48:10 +02:00
log . Warn ( "Error while unmarshaling raw resources json" )
2022-02-22 09:25:41 +01:00
return nil , err
}
2024-06-28 16:48:10 +02:00
job . RawResources = nil
if err := json . Unmarshal ( job . RawFootprint , & job . Footprint ) ; err != nil {
2024-07-04 14:14:27 +02:00
log . Warnf ( "Error while unmarshaling raw footprint json: %v" , err )
2024-06-28 16:48:10 +02:00
return nil , err
}
job . RawFootprint = nil
2022-02-22 09:25:41 +01:00
job . StartTime = time . Unix ( job . StartTimeUnix , 0 )
2024-12-19 06:24:08 +01:00
// Always ensure accurate duration for running jobs
if job . State == schema . JobStateRunning {
job . Duration = int32 ( time . Since ( job . StartTime ) . Seconds ( ) )
}
2022-02-22 09:25:41 +01:00
return job , nil
}
2023-05-11 09:40:13 +02:00
// Optimize runs driver-specific database maintenance (VACUUM for SQLite).
// It is currently a no-op for the MySQL driver.
func (r *JobRepository) Optimize() error {
	switch r.driver {
	case "sqlite3":
		if _, err := r.DB.Exec(`VACUUM`); err != nil {
			return err
		}
	case "mysql":
		log.Info("Optimize currently not supported for mysql driver")
	}
	return nil
}
2023-05-04 15:34:36 +02:00
func ( r * JobRepository ) Flush ( ) error {
var err error
switch r . driver {
case "sqlite3" :
2023-05-04 16:03:04 +02:00
if _ , err = r . DB . Exec ( ` DELETE FROM jobtag ` ) ; err != nil {
return err
}
if _ , err = r . DB . Exec ( ` DELETE FROM tag ` ) ; err != nil {
return err
}
if _ , err = r . DB . Exec ( ` DELETE FROM job ` ) ; err != nil {
return err
}
2023-05-04 15:34:36 +02:00
case "mysql" :
2023-06-07 16:49:08 +02:00
if _ , err = r . DB . Exec ( ` SET FOREIGN_KEY_CHECKS = 0 ` ) ; err != nil {
return err
}
2023-05-04 16:03:04 +02:00
if _ , err = r . DB . Exec ( ` TRUNCATE TABLE jobtag ` ) ; err != nil {
return err
}
if _ , err = r . DB . Exec ( ` TRUNCATE TABLE tag ` ) ; err != nil {
return err
}
if _ , err = r . DB . Exec ( ` TRUNCATE TABLE job ` ) ; err != nil {
return err
}
2023-06-07 16:49:08 +02:00
if _ , err = r . DB . Exec ( ` SET FOREIGN_KEY_CHECKS = 1 ` ) ; err != nil {
return err
}
2023-05-04 15:34:36 +02:00
}
2023-05-04 16:03:04 +02:00
return nil
2023-05-04 15:34:36 +02:00
}
2023-04-28 12:34:40 +02:00
// scanJobLink maps a single (id, job_id) result row onto a model.JobLink.
func scanJobLink(row interface{ Scan(...interface{}) error }) (*model.JobLink, error) {
	link := &model.JobLink{}
	err := row.Scan(&link.ID, &link.JobID)
	if err != nil {
		log.Warn("Error while scanning rows (jobLink)")
		return nil, err
	}
	return link, nil
}
2022-03-08 11:53:24 +01:00
func ( r * JobRepository ) FetchMetadata ( job * schema . Job ) ( map [ string ] string , error ) {
2023-02-20 15:08:23 +01:00
start := time . Now ( )
2022-03-17 11:18:22 +01:00
cachekey := fmt . Sprintf ( "metadata:%d" , job . ID )
if cached := r . cache . Get ( cachekey , nil ) ; cached != nil {
job . MetaData = cached . ( map [ string ] string )
return job . MetaData , nil
}
2022-03-08 11:53:24 +01:00
if err := sq . Select ( "job.meta_data" ) . From ( "job" ) . Where ( "job.id = ?" , job . ID ) .
RunWith ( r . stmtCache ) . QueryRow ( ) . Scan ( & job . RawMetaData ) ; err != nil {
2023-02-01 11:58:27 +01:00
log . Warn ( "Error while scanning for job metadata" )
2022-03-08 11:53:24 +01:00
return nil , err
}
if len ( job . RawMetaData ) == 0 {
return nil , nil
}
if err := json . Unmarshal ( job . RawMetaData , & job . MetaData ) ; err != nil {
2023-02-01 11:58:27 +01:00
log . Warn ( "Error while unmarshaling raw metadata json" )
2022-03-08 11:53:24 +01:00
return nil , err
}
2022-03-17 11:18:22 +01:00
r . cache . Put ( cachekey , job . MetaData , len ( job . RawMetaData ) , 24 * time . Hour )
2023-06-20 15:47:38 +02:00
log . Debugf ( "Timer FetchMetadata %s" , time . Since ( start ) )
2022-03-08 11:53:24 +01:00
return job . MetaData , nil
}
2022-03-17 11:18:22 +01:00
func ( r * JobRepository ) UpdateMetadata ( job * schema . Job , key , val string ) ( err error ) {
cachekey := fmt . Sprintf ( "metadata:%d" , job . ID )
r . cache . Del ( cachekey )
if job . MetaData == nil {
if _ , err = r . FetchMetadata ( job ) ; err != nil {
2023-02-01 11:58:27 +01:00
log . Warnf ( "Error while fetching metadata for job, DB ID '%v'" , job . ID )
2022-03-17 11:18:22 +01:00
return err
}
}
if job . MetaData != nil {
cpy := make ( map [ string ] string , len ( job . MetaData ) + 1 )
for k , v := range job . MetaData {
cpy [ k ] = v
}
cpy [ key ] = val
job . MetaData = cpy
} else {
job . MetaData = map [ string ] string { key : val }
}
if job . RawMetaData , err = json . Marshal ( job . MetaData ) ; err != nil {
2023-02-01 11:58:27 +01:00
log . Warnf ( "Error while marshaling metadata for job, DB ID '%v'" , job . ID )
2022-03-17 11:18:22 +01:00
return err
}
2024-09-03 10:03:38 +02:00
if _ , err = sq . Update ( "job" ) .
Set ( "meta_data" , job . RawMetaData ) .
Where ( "job.id = ?" , job . ID ) .
RunWith ( r . stmtCache ) . Exec ( ) ; err != nil {
2023-02-01 11:58:27 +01:00
log . Warnf ( "Error while updating metadata for job, DB ID '%v'" , job . ID )
2022-03-17 11:18:22 +01:00
return err
}
r . cache . Put ( cachekey , job . MetaData , len ( job . RawMetaData ) , 24 * time . Hour )
2024-03-08 08:51:05 +01:00
return archive . UpdateMetadata ( job , job . MetaData )
2022-03-17 11:18:22 +01:00
}
2024-07-09 09:17:50 +02:00
// FetchFootprint loads the footprint JSON column for job into job.Footprint
// and returns the decoded map. Returns (nil, nil) when no footprint is stored.
func (r *JobRepository) FetchFootprint(job *schema.Job) (map[string]float64, error) {
	start := time.Now()

	err := sq.Select("job.footprint").From("job").Where("job.id = ?", job.ID).
		RunWith(r.stmtCache).QueryRow().Scan(&job.RawFootprint)
	if err != nil {
		log.Warn("Error while scanning for job footprint")
		return nil, err
	}

	if len(job.RawFootprint) == 0 {
		return nil, nil
	}

	if err = json.Unmarshal(job.RawFootprint, &job.Footprint); err != nil {
		log.Warn("Error while unmarshaling raw footprint json")
		return nil, err
	}

	log.Debugf("Timer FetchFootprint %s", time.Since(start))
	return job.Footprint, nil
}
2024-09-27 13:45:44 +02:00
// FetchEnergyFootprint loads the energy_footprint JSON column for job into
// job.EnergyFootprint, serving repeated lookups from the in-memory LRU cache
// (24h TTL). Returns (nil, nil) when no energy footprint is stored.
func (r *JobRepository) FetchEnergyFootprint(job *schema.Job) (map[string]float64, error) {
	start := time.Now()
	cachekey := fmt.Sprintf("energyFootprint:%d", job.ID)
	if cached := r.cache.Get(cachekey, nil); cached != nil {
		job.EnergyFootprint = cached.(map[string]float64)
		return job.EnergyFootprint, nil
	}

	if err := sq.Select("job.energy_footprint").From("job").Where("job.id = ?", job.ID).
		RunWith(r.stmtCache).QueryRow().Scan(&job.RawEnergyFootprint); err != nil {
		log.Warn("Error while scanning for job energy_footprint")
		return nil, err
	}

	if len(job.RawEnergyFootprint) == 0 {
		return nil, nil
	}

	if err := json.Unmarshal(job.RawEnergyFootprint, &job.EnergyFootprint); err != nil {
		log.Warn("Error while unmarshaling raw energy footprint json")
		return nil, err
	}

	// BUGFIX: size the cache entry by the raw JSON byte length (as FetchMetadata
	// does), not by the number of map entries, so LRU accounting stays in bytes.
	r.cache.Put(cachekey, job.EnergyFootprint, len(job.RawEnergyFootprint), 24*time.Hour)
	log.Debugf("Timer FetchEnergyFootprint %s", time.Since(start))
	return job.EnergyFootprint, nil
}
2022-11-11 15:26:27 +01:00
func ( r * JobRepository ) DeleteJobsBefore ( startTime int64 ) ( int , error ) {
var cnt int
2024-03-06 14:50:08 +01:00
q := sq . Select ( "count(*)" ) . From ( "job" ) . Where ( "job.start_time < ?" , startTime )
q . RunWith ( r . DB ) . QueryRow ( ) . Scan ( cnt )
qd := sq . Delete ( "job" ) . Where ( "job.start_time < ?" , startTime )
_ , err := qd . RunWith ( r . DB ) . Exec ( )
2022-11-25 15:15:05 +01:00
if err != nil {
2024-03-06 14:50:08 +01:00
s , _ , _ := qd . ToSql ( )
log . Errorf ( " DeleteJobsBefore(%d) with %s: error %#v" , startTime , s , err )
2022-11-25 15:15:05 +01:00
} else {
2023-06-20 15:47:38 +02:00
log . Debugf ( "DeleteJobsBefore(%d): Deleted %d jobs" , startTime , cnt )
2022-11-25 15:15:05 +01:00
}
2022-11-11 15:26:27 +01:00
return cnt , err
}
func ( r * JobRepository ) DeleteJobById ( id int64 ) error {
2024-03-06 14:50:08 +01:00
qd := sq . Delete ( "job" ) . Where ( "job.id = ?" , id )
_ , err := qd . RunWith ( r . DB ) . Exec ( )
2022-11-25 15:15:05 +01:00
if err != nil {
2024-03-06 14:50:08 +01:00
s , _ , _ := qd . ToSql ( )
log . Errorf ( "DeleteJobById(%d) with %s : error %#v" , id , s , err )
2022-11-25 15:15:05 +01:00
} else {
2023-06-20 15:47:38 +02:00
log . Debugf ( "DeleteJobById(%d): Success" , id )
2022-11-25 15:15:05 +01:00
}
2022-11-11 15:26:27 +01:00
return err
}
2023-08-17 10:29:00 +02:00
func ( r * JobRepository ) FindUserOrProjectOrJobname ( user * schema . User , searchterm string ) ( jobid string , username string , project string , jobname string ) {
2023-01-12 11:26:01 +01:00
if _ , err := strconv . Atoi ( searchterm ) ; err == nil { // Return empty on successful conversion: parent method will redirect for integer jobId
2023-06-23 16:09:33 +02:00
return searchterm , "" , "" , ""
2023-02-22 16:30:01 +01:00
} else { // Has to have letters and logged-in user for other guesses
if user != nil {
2024-12-09 11:06:12 +01:00
// Find username by username in job table (match)
uresult , _ := r . FindColumnValue ( user , searchterm , "job" , "hpc_user" , "hpc_user" , false )
2023-02-22 16:30:01 +01:00
if uresult != "" {
2023-06-23 16:09:33 +02:00
return "" , uresult , "" , ""
2023-01-12 11:26:01 +01:00
}
2024-12-09 11:06:12 +01:00
// Find username by real name in hpc_user table (like)
2024-11-21 15:02:30 +01:00
nresult , _ := r . FindColumnValue ( user , searchterm , "hpc_user" , "username" , "name" , true )
2023-02-22 16:30:01 +01:00
if nresult != "" {
2023-06-23 16:09:33 +02:00
return "" , nresult , "" , ""
2023-02-17 10:45:27 +01:00
}
2024-12-09 11:06:12 +01:00
// Find projectId by projectId in job table (match)
2023-02-22 16:30:01 +01:00
presult , _ := r . FindColumnValue ( user , searchterm , "job" , "project" , "project" , false )
if presult != "" {
2023-06-23 16:09:33 +02:00
return "" , "" , presult , ""
2023-01-12 11:26:01 +01:00
}
2022-02-09 15:03:12 +01:00
}
2023-06-23 16:09:33 +02:00
// Return searchterm if no match before: Forward as jobname query to GQL in handleSearchbar function
return "" , "" , "" , searchterm
2022-02-09 15:03:12 +01:00
}
2023-01-12 11:26:01 +01:00
}
2022-02-09 15:03:12 +01:00
2024-03-06 14:50:08 +01:00
// Sentinel errors returned by the column-lookup helpers below.
var (
	ErrNotFound  = errors.New("no such jobname, project or user")
	ErrForbidden = errors.New("not authorized")
)
2023-06-23 16:09:33 +02:00
2023-08-17 10:29:00 +02:00
func ( r * JobRepository ) FindColumnValue ( user * schema . User , searchterm string , table string , selectColumn string , whereColumn string , isLike bool ) ( result string , err error ) {
2023-02-22 16:30:01 +01:00
compareStr := " = ?"
query := searchterm
2023-05-04 16:27:30 +02:00
if isLike {
2023-02-22 16:30:01 +01:00
compareStr = " LIKE ?"
query = "%" + searchterm + "%"
}
2023-08-17 10:29:00 +02:00
if user . HasAnyRole ( [ ] schema . Role { schema . RoleAdmin , schema . RoleSupport , schema . RoleManager } ) {
2023-06-07 14:13:59 +02:00
theQuery := sq . Select ( table + "." + selectColumn ) . Distinct ( ) . From ( table ) .
Where ( table + "." + whereColumn + compareStr , query )
// theSql, args, theErr := theQuery.ToSql()
// if theErr != nil {
// log.Warn("Error while converting query to sql")
// return "", err
// }
// log.Debugf("SQL query (FindColumnValue): `%s`, args: %#v", theSql, args)
err := theQuery . RunWith ( r . stmtCache ) . QueryRow ( ) . Scan ( & result )
2022-02-09 15:03:12 +01:00
if err != nil && err != sql . ErrNoRows {
2023-02-17 10:45:27 +01:00
return "" , err
2022-02-09 15:03:12 +01:00
} else if err == nil {
2023-02-22 16:30:01 +01:00
return result , nil
2022-02-09 15:03:12 +01:00
}
2023-02-17 10:45:27 +01:00
return "" , ErrNotFound
} else {
2023-02-22 16:30:01 +01:00
log . Infof ( "Non-Admin User %s : Requested Query '%s' on table '%s' : Forbidden" , user . Name , query , table )
2023-02-17 10:45:27 +01:00
return "" , ErrForbidden
2022-02-09 15:03:12 +01:00
}
2023-02-17 10:45:27 +01:00
}
2022-02-09 15:03:12 +01:00
2023-08-17 10:29:00 +02:00
func ( r * JobRepository ) FindColumnValues ( user * schema . User , query string , table string , selectColumn string , whereColumn string ) ( results [ ] string , err error ) {
2023-02-17 10:45:27 +01:00
emptyResult := make ( [ ] string , 0 )
2023-08-17 10:29:00 +02:00
if user . HasAnyRole ( [ ] schema . Role { schema . RoleAdmin , schema . RoleSupport , schema . RoleManager } ) {
2023-02-22 16:30:01 +01:00
rows , err := sq . Select ( table + "." + selectColumn ) . Distinct ( ) . From ( table ) .
Where ( table + "." + whereColumn + " LIKE ?" , fmt . Sprint ( "%" , query , "%" ) ) .
2023-02-17 10:45:27 +01:00
RunWith ( r . stmtCache ) . Query ( )
2022-02-09 15:03:12 +01:00
if err != nil && err != sql . ErrNoRows {
2023-02-17 10:45:27 +01:00
return emptyResult , err
2022-02-09 15:03:12 +01:00
} else if err == nil {
2023-02-17 10:45:27 +01:00
for rows . Next ( ) {
2023-02-22 16:30:01 +01:00
var result string
err := rows . Scan ( & result )
2023-02-17 10:45:27 +01:00
if err != nil {
rows . Close ( )
log . Warnf ( "Error while scanning rows: %v" , err )
return emptyResult , err
}
2023-02-22 16:30:01 +01:00
results = append ( results , result )
2023-02-17 10:45:27 +01:00
}
2023-02-22 16:30:01 +01:00
return results , nil
2022-02-09 15:03:12 +01:00
}
2023-02-17 10:45:27 +01:00
return emptyResult , ErrNotFound
2022-02-09 15:03:12 +01:00
2023-02-17 10:45:27 +01:00
} else {
2023-02-22 16:30:01 +01:00
log . Infof ( "Non-Admin User %s : Requested Query '%s' on table '%s' : Forbidden" , user . Name , query , table )
2023-02-17 10:45:27 +01:00
return emptyResult , ErrForbidden
2022-02-09 15:03:12 +01:00
}
}
2022-03-14 09:08:02 +01:00
// Partitions returns the distinct partition names of all jobs on the given
// cluster. Results are cached for one hour.
func (r *JobRepository) Partitions(cluster string) ([]string, error) {
	var err error
	start := time.Now()
	// The closure only runs on a cache miss; on a hit, err stays nil and the
	// cached slice is returned directly.
	partitions := r.cache.Get("partitions:"+cluster, func() (interface{}, time.Duration, int) {
		parts := []string{}
		if err = r.DB.Select(&parts, `SELECT DISTINCT job.cluster_partition FROM job WHERE job.cluster = ?;`, cluster); err != nil {
			return nil, 0, 1000
		}
		return parts, 1 * time.Hour, 1
	})
	if err != nil {
		return nil, err
	}
	log.Debugf("Timer Partitions %s", time.Since(start))
	return partitions.([]string), nil
}
2022-03-24 10:32:08 +01:00
2022-03-24 16:08:47 +01:00
// AllocatedNodes returns a map of all subclusters to a map of hostnames to the amount of jobs running on that host.
// Hosts with zero jobs running on them will not show up!
func ( r * JobRepository ) AllocatedNodes ( cluster string ) ( map [ string ] map [ string ] int , error ) {
2023-02-20 15:08:23 +01:00
start := time . Now ( )
2022-03-24 16:08:47 +01:00
subclusters := make ( map [ string ] map [ string ] int )
rows , err := sq . Select ( "resources" , "subcluster" ) . From ( "job" ) .
2022-03-24 10:32:08 +01:00
Where ( "job.job_state = 'running'" ) .
Where ( "job.cluster = ?" , cluster ) .
RunWith ( r . stmtCache ) . Query ( )
if err != nil {
2023-01-31 18:28:44 +01:00
log . Error ( "Error while running query" )
2022-03-24 10:32:08 +01:00
return nil , err
}
var raw [ ] byte
defer rows . Close ( )
for rows . Next ( ) {
raw = raw [ 0 : 0 ]
var resources [ ] * schema . Resource
2022-03-24 16:08:47 +01:00
var subcluster string
if err := rows . Scan ( & raw , & subcluster ) ; err != nil {
2023-02-01 11:58:27 +01:00
log . Warn ( "Error while scanning rows" )
2022-03-24 10:32:08 +01:00
return nil , err
}
if err := json . Unmarshal ( raw , & resources ) ; err != nil {
2023-02-01 11:58:27 +01:00
log . Warn ( "Error while unmarshaling raw resources json" )
2022-03-24 10:32:08 +01:00
return nil , err
}
2022-03-24 16:08:47 +01:00
hosts , ok := subclusters [ subcluster ]
if ! ok {
hosts = make ( map [ string ] int )
subclusters [ subcluster ] = hosts
}
2022-03-24 10:32:08 +01:00
for _ , resource := range resources {
2022-03-24 16:08:47 +01:00
hosts [ resource . Hostname ] += 1
2022-03-24 10:32:08 +01:00
}
}
2023-06-20 15:47:38 +02:00
log . Debugf ( "Timer AllocatedNodes %s" , time . Since ( start ) )
2022-03-24 16:08:47 +01:00
return subclusters , nil
2022-03-24 10:32:08 +01:00
}
2022-04-07 09:50:32 +02:00
2024-12-19 05:55:31 +01:00
// FIXME: Set duration to requested walltime?
// StopJobsExceedingWalltimeBy marks every still-"running" job whose elapsed
// time exceeds its requested walltime by more than the given number of
// seconds as failed, and flags its archiving as failed.
func (r *JobRepository) StopJobsExceedingWalltimeBy(seconds int) error {
	start := time.Now()
	res, err := sq.Update("job").
		Set("monitoring_status", schema.MonitoringStatusArchivingFailed).
		Set("duration", 0).
		Set("job_state", schema.JobStateFailed).
		Where("job.job_state = 'running'").
		Where("job.walltime > 0").
		// The cutoff is baked into the SQL once, using the current Unix time.
		Where(fmt.Sprintf("(%d - job.start_time) > (job.walltime + %d)", time.Now().Unix(), seconds)).
		RunWith(r.DB).Exec()
	if err != nil {
		log.Warn("Error while stopping jobs exceeding walltime")
		return err
	}
	rowsAffected, err := res.RowsAffected()
	if err != nil {
		log.Warn("Error while fetching affected rows after stopping due to exceeded walltime")
		return err
	}
	if rowsAffected > 0 {
		log.Infof("%d jobs have been marked as failed due to running too long", rowsAffected)
	}
	log.Debugf("Timer StopJobsExceedingWalltimeBy %s", time.Since(start))
	return nil
}
2023-02-13 13:53:24 +01:00
2024-08-30 07:22:40 +02:00
func ( r * JobRepository ) FindRunningJobs ( cluster string ) ( [ ] * schema . Job , error ) {
query := sq . Select ( jobColumns ... ) . From ( "job" ) .
Where ( fmt . Sprintf ( "job.cluster = '%s'" , cluster ) ) .
Where ( "job.job_state = 'running'" ) .
2024-09-25 18:05:04 +02:00
Where ( "job.duration > 600" )
2024-08-30 07:22:40 +02:00
rows , err := query . RunWith ( r . stmtCache ) . Query ( )
if err != nil {
log . Error ( "Error while running query" )
return nil , err
}
jobs := make ( [ ] * schema . Job , 0 , 50 )
for rows . Next ( ) {
job , err := scanJob ( rows )
if err != nil {
rows . Close ( )
log . Warn ( "Error while scanning rows" )
return nil , err
}
jobs = append ( jobs , job )
}
log . Infof ( "Return job count %d" , len ( jobs ) )
return jobs , nil
}
2024-09-03 10:03:38 +02:00
func ( r * JobRepository ) UpdateDuration ( ) error {
2024-09-05 11:18:00 +02:00
stmnt := sq . Update ( "job" ) .
2024-09-03 10:03:38 +02:00
Set ( "duration" , sq . Expr ( "? - job.start_time" , time . Now ( ) . Unix ( ) ) ) .
2024-09-05 12:38:39 +02:00
Where ( "job_state = 'running'" )
2024-09-05 11:18:00 +02:00
2024-09-05 14:58:08 +02:00
_ , err := stmnt . RunWith ( r . stmtCache ) . Exec ( )
2024-09-05 11:18:00 +02:00
if err != nil {
2024-09-03 10:03:38 +02:00
return err
}
return nil
}
2023-06-10 07:49:02 +02:00
func ( r * JobRepository ) FindJobsBetween ( startTimeBegin int64 , startTimeEnd int64 ) ( [ ] * schema . Job , error ) {
var query sq . SelectBuilder
2023-05-09 16:33:26 +02:00
2023-06-10 07:49:02 +02:00
if startTimeBegin == startTimeEnd || startTimeBegin > startTimeEnd {
return nil , errors . New ( "startTimeBegin is equal or larger startTimeEnd" )
}
if startTimeBegin == 0 {
2023-06-27 14:29:56 +02:00
log . Infof ( "Find jobs before %d" , startTimeEnd )
2023-06-10 07:49:02 +02:00
query = sq . Select ( jobColumns ... ) . From ( "job" ) . Where ( fmt . Sprintf (
"job.start_time < %d" , startTimeEnd ) )
} else {
2023-06-27 14:29:56 +02:00
log . Infof ( "Find jobs between %d and %d" , startTimeBegin , startTimeEnd )
2023-06-10 07:49:02 +02:00
query = sq . Select ( jobColumns ... ) . From ( "job" ) . Where ( fmt . Sprintf (
"job.start_time BETWEEN %d AND %d" , startTimeBegin , startTimeEnd ) )
2023-05-09 16:33:26 +02:00
}
rows , err := query . RunWith ( r . stmtCache ) . Query ( )
if err != nil {
log . Error ( "Error while running query" )
return nil , err
}
jobs := make ( [ ] * schema . Job , 0 , 50 )
for rows . Next ( ) {
job , err := scanJob ( rows )
if err != nil {
rows . Close ( )
log . Warn ( "Error while scanning rows" )
return nil , err
}
jobs = append ( jobs , job )
}
2023-06-27 14:29:56 +02:00
log . Infof ( "Return job count %d" , len ( jobs ) )
2023-05-09 16:33:26 +02:00
return jobs , nil
}
2024-08-28 11:13:54 +02:00
// UpdateMonitoringStatus sets the monitoring_status column of the job with
// the given database id.
func (r *JobRepository) UpdateMonitoringStatus(job int64, monitoringStatus int32) (err error) {
	_, err = sq.Update("job").
		Set("monitoring_status", monitoringStatus).
		Where("job.id = ?", job).
		RunWith(r.stmtCache).Exec()
	return
}
2024-09-03 13:41:00 +02:00
func ( r * JobRepository ) Execute ( stmt sq . UpdateBuilder ) error {
2024-08-28 11:13:54 +02:00
if _ , err := stmt . RunWith ( r . stmtCache ) . Exec ( ) ; err != nil {
return err
}
2024-09-03 13:41:00 +02:00
2024-08-28 11:13:54 +02:00
return nil
}
2024-09-03 13:41:00 +02:00
// MarkArchived appends a monitoring_status assignment to stmt; used when
// finalizing a job to flag it as archived (or archiving-failed).
func (r *JobRepository) MarkArchived(
	stmt sq.UpdateBuilder,
	monitoringStatus int32,
) sq.UpdateBuilder {
	return stmt.Set("monitoring_status", monitoringStatus)
}
func ( r * JobRepository ) UpdateEnergy (
stmt sq . UpdateBuilder ,
jobMeta * schema . JobMeta ,
) ( sq . UpdateBuilder , error ) {
2024-09-27 13:45:44 +02:00
/* Note: Only Called for Running Jobs during Intermediate Update or on Archiving */
2024-08-29 07:26:49 +02:00
sc , err := archive . GetSubCluster ( jobMeta . Cluster , jobMeta . SubCluster )
if err != nil {
log . Errorf ( "cannot get subcluster: %s" , err . Error ( ) )
2024-09-03 13:41:00 +02:00
return stmt , err
2024-08-29 07:26:49 +02:00
}
energyFootprint := make ( map [ string ] float64 )
2025-03-06 12:46:25 +01:00
// Total Job Energy Outside Loop
totalEnergy := 0.0
2024-08-29 07:26:49 +02:00
for _ , fp := range sc . EnergyFootprint {
2025-03-06 12:46:25 +01:00
// Always Init Metric Energy Inside Loop
metricEnergy := 0.0
2024-09-27 13:45:44 +02:00
if i , err := archive . MetricIndex ( sc . MetricConfig , fp ) ; err == nil {
2024-10-01 14:58:19 +02:00
// Note: For DB data, calculate and save as kWh
2024-12-05 07:49:52 +01:00
if sc . MetricConfig [ i ] . Energy == "energy" { // this metric has energy as unit (Joules or Wh)
2025-03-06 12:46:25 +01:00
log . Warnf ( "Update EnergyFootprint for Job %d and Metric %s on cluster %s: Set to 'energy' in cluster.json: Not implemented, will return 0.0" , jobMeta . JobID , jobMeta . Cluster , fp )
2024-12-05 07:49:52 +01:00
// FIXME: Needs sum as stats type
2024-11-08 06:27:27 +01:00
} else if sc . MetricConfig [ i ] . Energy == "power" { // this metric has power as unit (Watt)
2024-12-05 07:49:52 +01:00
// Energy: Power (in Watts) * Time (in Seconds)
2025-03-06 12:46:25 +01:00
// Unit: (W * (s / 3600)) / 1000 = kWh
// Round 2 Digits: round(Energy * 100) / 100
// Here: (All-Node Metric Average * Number of Nodes) * (Job Duration in Seconds / 3600) / 1000
2025-02-18 18:10:39 +01:00
// Note: Shared Jobs handled correctly since "Node Average" is based on partial resources, while "numNodes" factor is 1
2025-03-06 12:46:25 +01:00
rawEnergy := ( ( LoadJobStat ( jobMeta , fp , "avg" ) * float64 ( jobMeta . NumNodes ) ) * ( float64 ( jobMeta . Duration ) / 3600.0 ) ) / 1000.0
metricEnergy = math . Round ( rawEnergy * 100.0 ) / 100.0
2024-08-29 07:26:49 +02:00
}
2024-09-27 13:45:44 +02:00
} else {
log . Warnf ( "Error while collecting energy metric %s for job, DB ID '%v', return '0.0'" , fp , jobMeta . ID )
2024-08-29 07:26:49 +02:00
}
2025-03-06 12:46:25 +01:00
energyFootprint [ fp ] = metricEnergy
totalEnergy += metricEnergy
// log.Infof("Metric %s Average %f -> %f kWh | Job %d Total -> %f kWh", fp, LoadJobStat(jobMeta, fp, "avg"), energy, jobMeta.JobID, totalEnergy)
2024-08-29 07:26:49 +02:00
}
var rawFootprint [ ] byte
if rawFootprint , err = json . Marshal ( energyFootprint ) ; err != nil {
2024-10-22 14:37:22 +02:00
log . Warnf ( "Error while marshaling energy footprint for job INTO BYTES, DB ID '%v'" , jobMeta . ID )
2024-09-03 13:41:00 +02:00
return stmt , err
2024-08-29 07:26:49 +02:00
}
2025-03-06 12:46:25 +01:00
return stmt . Set ( "energy_footprint" , string ( rawFootprint ) ) . Set ( "energy" , ( math . Round ( totalEnergy * 100.0 ) / 100.0 ) ) , nil
2024-08-29 07:26:49 +02:00
}
2024-09-03 13:41:00 +02:00
func ( r * JobRepository ) UpdateFootprint (
stmt sq . UpdateBuilder ,
jobMeta * schema . JobMeta ,
) ( sq . UpdateBuilder , error ) {
2024-09-27 13:45:44 +02:00
/* Note: Only Called for Running Jobs during Intermediate Update or on Archiving */
2024-08-28 11:13:54 +02:00
sc , err := archive . GetSubCluster ( jobMeta . Cluster , jobMeta . SubCluster )
if err != nil {
log . Errorf ( "cannot get subcluster: %s" , err . Error ( ) )
2024-09-03 13:41:00 +02:00
return stmt , err
2024-08-28 11:13:54 +02:00
}
footprint := make ( map [ string ] float64 )
for _ , fp := range sc . Footprint {
2024-10-23 16:16:28 +02:00
var statType string
for _ , gm := range archive . GlobalMetricList {
if gm . Name == fp {
statType = gm . Footprint
}
}
if statType != "avg" && statType != "min" && statType != "max" {
log . Warnf ( "unknown statType for footprint update: %s" , statType )
return stmt , fmt . Errorf ( "unknown statType for footprint update: %s" , statType )
}
2024-08-29 07:26:49 +02:00
if i , err := archive . MetricIndex ( sc . MetricConfig , fp ) ; err != nil {
statType = sc . MetricConfig [ i ] . Footprint
}
2024-08-30 07:22:40 +02:00
name := fmt . Sprintf ( "%s_%s" , fp , statType )
2024-09-25 18:05:04 +02:00
footprint [ name ] = LoadJobStat ( jobMeta , fp , statType )
2024-08-28 11:13:54 +02:00
}
var rawFootprint [ ] byte
if rawFootprint , err = json . Marshal ( footprint ) ; err != nil {
2024-10-22 14:37:22 +02:00
log . Warnf ( "Error while marshaling footprint for job INTO BYTES, DB ID '%v'" , jobMeta . ID )
2024-09-03 13:41:00 +02:00
return stmt , err
2024-08-28 11:13:54 +02:00
}
2024-10-22 14:37:22 +02:00
return stmt . Set ( "footprint" , string ( rawFootprint ) ) , nil
2024-08-28 11:13:54 +02:00
}