Merge branch 'dev' into migrate_svelte5

This commit is contained in:
Christoph Kluge 2025-06-05 17:56:48 +02:00
commit 5048f7be14
70 changed files with 5231 additions and 2372 deletions

View File

@ -4,61 +4,78 @@ scalar Any
scalar NullableFloat scalar NullableFloat
scalar MetricScope scalar MetricScope
scalar JobState scalar JobState
scalar NodeState
scalar MonitoringState
type Node {
id: ID!
hostname: String!
cluster: String!
subCluster: String!
nodeState: NodeState!
HealthState: MonitoringState!
metaData: Any
}
type NodeStats {
state: String!
count: Int!
}
type Job { type Job {
id: ID! id: ID!
jobId: Int! jobId: Int!
user: String! user: String!
project: String! project: String!
cluster: String! cluster: String!
subCluster: String! subCluster: String!
startTime: Time! startTime: Time!
duration: Int! duration: Int!
walltime: Int! walltime: Int!
numNodes: Int! numNodes: Int!
numHWThreads: Int! numHWThreads: Int!
numAcc: Int! numAcc: Int!
energy: Float! energy: Float!
SMT: Int! SMT: Int!
exclusive: Int! exclusive: Int!
partition: String! partition: String!
arrayJobId: Int! arrayJobId: Int!
monitoringStatus: Int! monitoringStatus: Int!
state: JobState! state: JobState!
tags: [Tag!]! tags: [Tag!]!
resources: [Resource!]! resources: [Resource!]!
concurrentJobs: JobLinkResultList concurrentJobs: JobLinkResultList
footprint: [FootprintValue] footprint: [FootprintValue]
energyFootprint: [EnergyFootprintValue] energyFootprint: [EnergyFootprintValue]
metaData: Any metaData: Any
userData: User userData: User
} }
type JobLink { type JobLink {
id: ID! id: ID!
jobId: Int! jobId: Int!
} }
type Cluster { type Cluster {
name: String! name: String!
partitions: [String!]! # Slurm partitions partitions: [String!]! # Slurm partitions
subClusters: [SubCluster!]! # Hardware partitions/subclusters subClusters: [SubCluster!]! # Hardware partitions/subclusters
} }
type SubCluster { type SubCluster {
name: String! name: String!
nodes: String! nodes: String!
numberOfNodes: Int! numberOfNodes: Int!
processorType: String! processorType: String!
socketsPerNode: Int! socketsPerNode: Int!
coresPerSocket: Int! coresPerSocket: Int!
threadsPerCore: Int! threadsPerCore: Int!
flopRateScalar: MetricValue! flopRateScalar: MetricValue!
flopRateSimd: MetricValue! flopRateSimd: MetricValue!
memoryBandwidth: MetricValue! memoryBandwidth: MetricValue!
topology: Topology! topology: Topology!
metricConfig: [MetricConfig!]! metricConfig: [MetricConfig!]!
footprint: [String!]! footprint: [String!]!
} }
type FootprintValue { type FootprintValue {
@ -80,94 +97,94 @@ type MetricValue {
} }
type Topology { type Topology {
node: [Int!] node: [Int!]
socket: [[Int!]!] socket: [[Int!]!]
memoryDomain: [[Int!]!] memoryDomain: [[Int!]!]
die: [[Int!]!] die: [[Int!]!]
core: [[Int!]!] core: [[Int!]!]
accelerators: [Accelerator!] accelerators: [Accelerator!]
} }
type Accelerator { type Accelerator {
id: String! id: String!
type: String! type: String!
model: String! model: String!
} }
type SubClusterConfig { type SubClusterConfig {
name: String! name: String!
peak: Float peak: Float
normal: Float normal: Float
caution: Float caution: Float
alert: Float alert: Float
remove: Boolean remove: Boolean
} }
type MetricConfig { type MetricConfig {
name: String! name: String!
unit: Unit! unit: Unit!
scope: MetricScope! scope: MetricScope!
aggregation: String! aggregation: String!
timestep: Int! timestep: Int!
peak: Float! peak: Float!
normal: Float normal: Float
caution: Float! caution: Float!
alert: Float! alert: Float!
lowerIsBetter: Boolean lowerIsBetter: Boolean
subClusters: [SubClusterConfig!]! subClusters: [SubClusterConfig!]!
} }
type Tag { type Tag {
id: ID! id: ID!
type: String! type: String!
name: String! name: String!
scope: String! scope: String!
} }
type Resource { type Resource {
hostname: String! hostname: String!
hwthreads: [Int!] hwthreads: [Int!]
accelerators: [String!] accelerators: [String!]
configuration: String configuration: String
} }
type JobMetricWithName { type JobMetricWithName {
name: String! name: String!
scope: MetricScope! scope: MetricScope!
metric: JobMetric! metric: JobMetric!
} }
type JobMetric { type JobMetric {
unit: Unit unit: Unit
timestep: Int! timestep: Int!
series: [Series!] series: [Series!]
statisticsSeries: StatsSeries statisticsSeries: StatsSeries
} }
type Series { type Series {
hostname: String! hostname: String!
id: String id: String
statistics: MetricStatistics statistics: MetricStatistics
data: [NullableFloat!]! data: [NullableFloat!]!
} }
type StatsSeries { type StatsSeries {
mean: [NullableFloat!]! mean: [NullableFloat!]!
median: [NullableFloat!]! median: [NullableFloat!]!
min: [NullableFloat!]! min: [NullableFloat!]!
max: [NullableFloat!]! max: [NullableFloat!]!
} }
type NamedStatsWithScope { type NamedStatsWithScope {
name: String! name: String!
scope: MetricScope! scope: MetricScope!
stats: [ScopedStats!]! stats: [ScopedStats!]!
} }
type ScopedStats { type ScopedStats {
hostname: String! hostname: String!
id: String id: String
data: MetricStatistics! data: MetricStatistics!
} }
type JobStats { type JobStats {
@ -184,8 +201,8 @@ type JobStats {
} }
type NamedStats { type NamedStats {
name: String! name: String!
data: MetricStatistics! data: MetricStatistics!
} }
type Unit { type Unit {
@ -201,12 +218,12 @@ type MetricStatistics {
type MetricFootprints { type MetricFootprints {
metric: String! metric: String!
data: [NullableFloat!]! data: [NullableFloat!]!
} }
type Footprints { type Footprints {
timeWeights: TimeWeights! timeWeights: TimeWeights!
metrics: [MetricFootprints!]! metrics: [MetricFootprints!]!
} }
type TimeWeights { type TimeWeights {
@ -215,20 +232,33 @@ type TimeWeights {
coreHours: [NullableFloat!]! coreHours: [NullableFloat!]!
} }
enum Aggregate { USER, PROJECT, CLUSTER } enum Aggregate {
enum SortByAggregate { TOTALWALLTIME, TOTALJOBS, TOTALNODES, TOTALNODEHOURS, TOTALCORES, TOTALCOREHOURS, TOTALACCS, TOTALACCHOURS } USER
PROJECT
CLUSTER
}
enum SortByAggregate {
TOTALWALLTIME
TOTALJOBS
TOTALNODES
TOTALNODEHOURS
TOTALCORES
TOTALCOREHOURS
TOTALACCS
TOTALACCHOURS
}
type NodeMetrics { type NodeMetrics {
host: String! host: String!
subCluster: String! subCluster: String!
metrics: [JobMetricWithName!]! metrics: [JobMetricWithName!]!
} }
type NodesResultList { type NodesResultList {
items: [NodeMetrics!]! items: [NodeMetrics!]!
offset: Int offset: Int
limit: Int limit: Int
count: Int count: Int
totalNodes: Int totalNodes: Int
hasNextPage: Boolean hasNextPage: Boolean
} }
@ -247,14 +277,14 @@ type GlobalMetricListItem {
} }
type Count { type Count {
name: String! name: String!
count: Int! count: Int!
} }
type User { type User {
username: String! username: String!
name: String! name: String!
email: String! email: String!
} }
input MetricStatItem { input MetricStatItem {
@ -263,27 +293,81 @@ input MetricStatItem {
} }
type Query { type Query {
clusters: [Cluster!]! # List of all clusters clusters: [Cluster!]! # List of all clusters
tags: [Tag!]! # List of all tags tags: [Tag!]! # List of all tags
globalMetrics: [GlobalMetricListItem!]! globalMetrics: [GlobalMetricListItem!]!
user(username: String!): User user(username: String!): User
allocatedNodes(cluster: String!): [Count!]! allocatedNodes(cluster: String!): [Count!]!
job(id: ID!): Job node(id: ID!): Node
jobMetrics(id: ID!, metrics: [String!], scopes: [MetricScope!], resolution: Int): [JobMetricWithName!]! nodes(filter: [NodeFilter!], order: OrderByInput): NodeStateResultList!
jobStats(id: ID!, metrics: [String!]): [NamedStats!]! nodeStats(filter: [NodeFilter!]): [NodeStats!]!
scopedJobStats(id: ID!, metrics: [String!], scopes: [MetricScope!]): [NamedStatsWithScope!]!
job(id: ID!): Job
jobMetrics(
id: ID!
metrics: [String!]
scopes: [MetricScope!]
resolution: Int
): [JobMetricWithName!]!
jobStats(id: ID!, metrics: [String!]): [NamedStats!]!
scopedJobStats(
id: ID!
metrics: [String!]
scopes: [MetricScope!]
): [NamedStatsWithScope!]!
jobs(
filter: [JobFilter!]
page: PageRequest
order: OrderByInput
): JobResultList!
jobsStatistics(
filter: [JobFilter!]
metrics: [String!]
page: PageRequest
sortBy: SortByAggregate
groupBy: Aggregate
numDurationBins: String
numMetricBins: Int
): [JobsStatistics!]!
jobs(filter: [JobFilter!], page: PageRequest, order: OrderByInput): JobResultList!
jobsStatistics(filter: [JobFilter!], metrics: [String!], page: PageRequest, sortBy: SortByAggregate, groupBy: Aggregate, numDurationBins: String, numMetricBins: Int): [JobsStatistics!]!
jobsMetricStats(filter: [JobFilter!], metrics: [String!]): [JobStats!]! jobsMetricStats(filter: [JobFilter!], metrics: [String!]): [JobStats!]!
jobsFootprints(filter: [JobFilter!], metrics: [String!]!): Footprints jobsFootprints(filter: [JobFilter!], metrics: [String!]!): Footprints
rooflineHeatmap(filter: [JobFilter!]!, rows: Int!, cols: Int!, minX: Float!, minY: Float!, maxX: Float!, maxY: Float!): [[Float!]!]! rooflineHeatmap(
filter: [JobFilter!]!
rows: Int!
cols: Int!
minX: Float!
minY: Float!
maxX: Float!
maxY: Float!
): [[Float!]!]!
nodeMetrics(cluster: String!, nodes: [String!], scopes: [MetricScope!], metrics: [String!], from: Time!, to: Time!): [NodeMetrics!]! nodeMetrics(
nodeMetricsList(cluster: String!, subCluster: String!, nodeFilter: String!, scopes: [MetricScope!], metrics: [String!], from: Time!, to: Time!, page: PageRequest, resolution: Int): NodesResultList! cluster: String!
nodes: [String!]
scopes: [MetricScope!]
metrics: [String!]
from: Time!
to: Time!
): [NodeMetrics!]!
nodeMetricsList(
cluster: String!
subCluster: String!
nodeFilter: String!
scopes: [MetricScope!]
metrics: [String!]
from: Time!
to: Time!
page: PageRequest
resolution: Int
): NodesResultList!
} }
type Mutation { type Mutation {
@ -296,38 +380,53 @@ type Mutation {
updateConfiguration(name: String!, value: String!): String updateConfiguration(name: String!, value: String!): String
} }
type IntRangeOutput { from: Int!, to: Int! } type IntRangeOutput {
type TimeRangeOutput { range: String, from: Time!, to: Time! } from: Int!
to: Int!
}
type TimeRangeOutput {
range: String
from: Time!
to: Time!
}
input NodeFilter {
hostname: StringInput
cluster: StringInput
subCluster: StringInput
nodeState: NodeState
healthState: MonitoringState
}
input JobFilter { input JobFilter {
tags: [ID!] tags: [ID!]
dbId: [ID!] dbId: [ID!]
jobId: StringInput jobId: StringInput
arrayJobId: Int arrayJobId: Int
user: StringInput user: StringInput
project: StringInput project: StringInput
jobName: StringInput jobName: StringInput
cluster: StringInput cluster: StringInput
partition: StringInput partition: StringInput
duration: IntRange duration: IntRange
energy: FloatRange energy: FloatRange
minRunningFor: Int minRunningFor: Int
numNodes: IntRange numNodes: IntRange
numAccelerators: IntRange numAccelerators: IntRange
numHWThreads: IntRange numHWThreads: IntRange
startTime: TimeRange startTime: TimeRange
state: [JobState!] state: [JobState!]
metricStats: [MetricStatItem!] metricStats: [MetricStatItem!]
exclusive: Int exclusive: Int
node: StringInput node: StringInput
} }
input OrderByInput { input OrderByInput {
field: String! field: String!
type: String!, type: String!
order: SortDirectionEnum! = ASC order: SortDirectionEnum! = ASC
} }
@ -337,34 +436,46 @@ enum SortDirectionEnum {
} }
input StringInput { input StringInput {
eq: String eq: String
neq: String neq: String
contains: String contains: String
startsWith: String startsWith: String
endsWith: String endsWith: String
in: [String!] in: [String!]
} }
input IntRange { from: Int!, to: Int! } input IntRange {
input TimeRange { range: String, from: Time, to: Time } from: Int!
to: Int!
}
input TimeRange {
range: String
from: Time
to: Time
}
input FloatRange { input FloatRange {
from: Float! from: Float!
to: Float! to: Float!
} }
type NodeStateResultList {
items: [Node!]!
count: Int
}
type JobResultList { type JobResultList {
items: [Job!]! items: [Job!]!
offset: Int offset: Int
limit: Int limit: Int
count: Int count: Int
hasNextPage: Boolean hasNextPage: Boolean
} }
type JobLinkResultList { type JobLinkResultList {
listQuery: String listQuery: String
items: [JobLink!]! items: [JobLink!]!
count: Int count: Int
} }
type HistoPoint { type HistoPoint {
@ -386,27 +497,27 @@ type MetricHistoPoint {
max: Int max: Int
} }
type JobsStatistics { type JobsStatistics {
id: ID! # If `groupBy` was used, ID of the user/project/cluster id: ID! # If `groupBy` was used, ID of the user/project/cluster
name: String! # if User-Statistics: Given Name of Account (ID) Owner name: String! # if User-Statistics: Given Name of Account (ID) Owner
totalJobs: Int! # Number of jobs totalJobs: Int! # Number of jobs
runningJobs: Int! # Number of running jobs runningJobs: Int! # Number of running jobs
shortJobs: Int! # Number of jobs with a duration of less than duration shortJobs: Int! # Number of jobs with a duration of less than duration
totalWalltime: Int! # Sum of the duration of all matched jobs in hours totalWalltime: Int! # Sum of the duration of all matched jobs in hours
totalNodes: Int! # Sum of the nodes of all matched jobs totalNodes: Int! # Sum of the nodes of all matched jobs
totalNodeHours: Int! # Sum of the node hours of all matched jobs totalNodeHours: Int! # Sum of the node hours of all matched jobs
totalCores: Int! # Sum of the cores of all matched jobs totalCores: Int! # Sum of the cores of all matched jobs
totalCoreHours: Int! # Sum of the core hours of all matched jobs totalCoreHours: Int! # Sum of the core hours of all matched jobs
totalAccs: Int! # Sum of the accs of all matched jobs totalAccs: Int! # Sum of the accs of all matched jobs
totalAccHours: Int! # Sum of the gpu hours of all matched jobs totalAccHours: Int! # Sum of the gpu hours of all matched jobs
histDuration: [HistoPoint!]! # value: hour, count: number of jobs with a rounded duration of value histDuration: [HistoPoint!]! # value: hour, count: number of jobs with a rounded duration of value
histNumNodes: [HistoPoint!]! # value: number of nodes, count: number of jobs with that number of nodes histNumNodes: [HistoPoint!]! # value: number of nodes, count: number of jobs with that number of nodes
histNumCores: [HistoPoint!]! # value: number of cores, count: number of jobs with that number of cores histNumCores: [HistoPoint!]! # value: number of cores, count: number of jobs with that number of cores
histNumAccs: [HistoPoint!]! # value: number of accs, count: number of jobs with that number of accs histNumAccs: [HistoPoint!]! # value: number of accs, count: number of jobs with that number of accs
histMetrics: [MetricHistoPoints!]! # metric: metricname, data array of histopoints: value: metric average bin, count: number of jobs with that metric average histMetrics: [MetricHistoPoints!]! # metric: metricname, data array of histopoints: value: metric average bin, count: number of jobs with that metric average
} }
input PageRequest { input PageRequest {
itemsPerPage: Int! itemsPerPage: Int!
page: Int! page: Int!
} }

View File

@ -7,8 +7,9 @@ package main
import "flag" import "flag"
var ( var (
flagReinitDB, flagInit, flagServer, flagSyncLDAP, flagGops, flagMigrateDB, flagRevertDB, flagForceDB, flagDev, flagVersion, flagLogDateTime bool flagReinitDB, flagInit, flagServer, flagSyncLDAP, flagGops, flagMigrateDB, flagRevertDB,
flagNewUser, flagDelUser, flagGenJWT, flagConfigFile, flagImportJob, flagLogLevel string flagForceDB, flagDev, flagVersion, flagLogDateTime, flagApplyTags bool
flagNewUser, flagDelUser, flagGenJWT, flagConfigFile, flagImportJob, flagLogLevel string
) )
func cliInit() { func cliInit() {
@ -21,6 +22,7 @@ func cliInit() {
flag.BoolVar(&flagVersion, "version", false, "Show version information and exit") flag.BoolVar(&flagVersion, "version", false, "Show version information and exit")
flag.BoolVar(&flagMigrateDB, "migrate-db", false, "Migrate database to supported version and exit") flag.BoolVar(&flagMigrateDB, "migrate-db", false, "Migrate database to supported version and exit")
flag.BoolVar(&flagRevertDB, "revert-db", false, "Migrate database to previous version and exit") flag.BoolVar(&flagRevertDB, "revert-db", false, "Migrate database to previous version and exit")
flag.BoolVar(&flagApplyTags, "apply-tags", false, "Run taggers on all completed jobs and exit")
flag.BoolVar(&flagForceDB, "force-db", false, "Force database version, clear dirty flag and exit") flag.BoolVar(&flagForceDB, "force-db", false, "Force database version, clear dirty flag and exit")
flag.BoolVar(&flagLogDateTime, "logdate", false, "Set this flag to add date and time to log messages") flag.BoolVar(&flagLogDateTime, "logdate", false, "Set this flag to add date and time to log messages")
flag.StringVar(&flagConfigFile, "config", "./config.json", "Specify alternative path to `config.json`") flag.StringVar(&flagConfigFile, "config", "./config.json", "Specify alternative path to `config.json`")

View File

@ -19,7 +19,9 @@ import (
"github.com/ClusterCockpit/cc-backend/internal/importer" "github.com/ClusterCockpit/cc-backend/internal/importer"
"github.com/ClusterCockpit/cc-backend/internal/metricdata" "github.com/ClusterCockpit/cc-backend/internal/metricdata"
"github.com/ClusterCockpit/cc-backend/internal/repository" "github.com/ClusterCockpit/cc-backend/internal/repository"
"github.com/ClusterCockpit/cc-backend/internal/tagger"
"github.com/ClusterCockpit/cc-backend/internal/taskManager" "github.com/ClusterCockpit/cc-backend/internal/taskManager"
"github.com/ClusterCockpit/cc-backend/internal/util"
"github.com/ClusterCockpit/cc-backend/pkg/archive" "github.com/ClusterCockpit/cc-backend/pkg/archive"
"github.com/ClusterCockpit/cc-backend/pkg/log" "github.com/ClusterCockpit/cc-backend/pkg/log"
"github.com/ClusterCockpit/cc-backend/pkg/runtimeEnv" "github.com/ClusterCockpit/cc-backend/pkg/runtimeEnv"
@ -211,11 +213,22 @@ func main() {
} }
} }
if config.Keys.EnableJobTaggers {
tagger.Init()
}
if flagApplyTags {
if err := tagger.RunTaggers(); err != nil {
log.Abortf("Running job taggers.\nError: %s\n", err.Error())
}
}
if !flagServer { if !flagServer {
log.Exit("No errors, server flag not set. Exiting cc-backend.") log.Exit("No errors, server flag not set. Exiting cc-backend.")
} }
archiver.Start(repository.GetJobRepository()) archiver.Start(repository.GetJobRepository())
taskManager.Start() taskManager.Start()
serverInit() serverInit()
@ -237,6 +250,8 @@ func main() {
serverShutdown() serverShutdown()
util.FsWatcherShutdown()
taskManager.Shutdown() taskManager.Shutdown()
}() }()

8
go.mod
View File

@ -9,6 +9,8 @@ require (
github.com/ClusterCockpit/cc-units v0.4.0 github.com/ClusterCockpit/cc-units v0.4.0
github.com/Masterminds/squirrel v1.5.4 github.com/Masterminds/squirrel v1.5.4
github.com/coreos/go-oidc/v3 v3.12.0 github.com/coreos/go-oidc/v3 v3.12.0
github.com/expr-lang/expr v1.17.3
github.com/fsnotify/fsnotify v1.9.0
github.com/go-co-op/gocron/v2 v2.16.0 github.com/go-co-op/gocron/v2 v2.16.0
github.com/go-ldap/ldap/v3 v3.4.10 github.com/go-ldap/ldap/v3 v3.4.10
github.com/go-sql-driver/mysql v1.9.0 github.com/go-sql-driver/mysql v1.9.0
@ -18,8 +20,8 @@ require (
github.com/gorilla/handlers v1.5.2 github.com/gorilla/handlers v1.5.2
github.com/gorilla/mux v1.8.1 github.com/gorilla/mux v1.8.1
github.com/gorilla/sessions v1.4.0 github.com/gorilla/sessions v1.4.0
github.com/influxdata/influxdb-client-go/v2 v2.14.0
github.com/jmoiron/sqlx v1.4.0 github.com/jmoiron/sqlx v1.4.0
github.com/joho/godotenv v1.5.1
github.com/mattn/go-sqlite3 v1.14.24 github.com/mattn/go-sqlite3 v1.14.24
github.com/prometheus/client_golang v1.21.0 github.com/prometheus/client_golang v1.21.0
github.com/prometheus/common v0.62.0 github.com/prometheus/common v0.62.0
@ -39,7 +41,6 @@ require (
github.com/Azure/go-ntlmssp v0.0.0-20221128193559-754e69321358 // indirect github.com/Azure/go-ntlmssp v0.0.0-20221128193559-754e69321358 // indirect
github.com/KyleBanks/depth v1.2.1 // indirect github.com/KyleBanks/depth v1.2.1 // indirect
github.com/agnivade/levenshtein v1.2.1 // indirect github.com/agnivade/levenshtein v1.2.1 // indirect
github.com/apapsch/go-jsonmerge/v2 v2.0.0 // indirect
github.com/beorn7/perks v1.0.1 // indirect github.com/beorn7/perks v1.0.1 // indirect
github.com/cespare/xxhash/v2 v2.3.0 // indirect github.com/cespare/xxhash/v2 v2.3.0 // indirect
github.com/cpuguy83/go-md2man/v2 v2.0.6 // indirect github.com/cpuguy83/go-md2man/v2 v2.0.6 // indirect
@ -57,8 +58,6 @@ require (
github.com/hashicorp/errwrap v1.1.0 // indirect github.com/hashicorp/errwrap v1.1.0 // indirect
github.com/hashicorp/go-multierror v1.1.1 // indirect github.com/hashicorp/go-multierror v1.1.1 // indirect
github.com/hashicorp/golang-lru/v2 v2.0.7 // indirect github.com/hashicorp/golang-lru/v2 v2.0.7 // indirect
github.com/influxdata/line-protocol v0.0.0-20210922203350-b1ad95c89adf // indirect
github.com/joho/godotenv v1.5.1 // indirect
github.com/jonboulle/clockwork v0.5.0 // indirect github.com/jonboulle/clockwork v0.5.0 // indirect
github.com/josharian/intern v1.0.0 // indirect github.com/josharian/intern v1.0.0 // indirect
github.com/jpillora/backoff v1.0.0 // indirect github.com/jpillora/backoff v1.0.0 // indirect
@ -70,7 +69,6 @@ require (
github.com/modern-go/reflect2 v1.0.2 // indirect github.com/modern-go/reflect2 v1.0.2 // indirect
github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect
github.com/mwitkow/go-conntrack v0.0.0-20190716064945-2f068394615f // indirect github.com/mwitkow/go-conntrack v0.0.0-20190716064945-2f068394615f // indirect
github.com/oapi-codegen/runtime v1.1.1 // indirect
github.com/prometheus/client_model v0.6.1 // indirect github.com/prometheus/client_model v0.6.1 // indirect
github.com/prometheus/procfs v0.15.1 // indirect github.com/prometheus/procfs v0.15.1 // indirect
github.com/robfig/cron/v3 v3.0.1 // indirect github.com/robfig/cron/v3 v3.0.1 // indirect

16
go.sum
View File

@ -16,7 +16,6 @@ github.com/Microsoft/go-winio v0.6.2 h1:F2VQgta7ecxGYO8k3ZZz3RS8fVIXVxONVUPlNERo
github.com/Microsoft/go-winio v0.6.2/go.mod h1:yd8OoFMLzJbo9gZq8j5qaps8bJ9aShtEA8Ipt1oGCvU= github.com/Microsoft/go-winio v0.6.2/go.mod h1:yd8OoFMLzJbo9gZq8j5qaps8bJ9aShtEA8Ipt1oGCvU=
github.com/PuerkitoBio/goquery v1.9.3 h1:mpJr/ikUA9/GNJB/DBZcGeFDXUtosHRyRrwh7KGdTG0= github.com/PuerkitoBio/goquery v1.9.3 h1:mpJr/ikUA9/GNJB/DBZcGeFDXUtosHRyRrwh7KGdTG0=
github.com/PuerkitoBio/goquery v1.9.3/go.mod h1:1ndLHPdTz+DyQPICCWYlYQMPl0oXZj0G6D4LCYA6u4U= github.com/PuerkitoBio/goquery v1.9.3/go.mod h1:1ndLHPdTz+DyQPICCWYlYQMPl0oXZj0G6D4LCYA6u4U=
github.com/RaveNoX/go-jsoncommentstrip v1.0.0/go.mod h1:78ihd09MekBnJnxpICcwzCMzGrKSKYe4AqU6PDYYpjk=
github.com/agnivade/levenshtein v1.2.1 h1:EHBY3UOn1gwdy/VbFwgo4cxecRznFk7fKWN1KOX7eoM= github.com/agnivade/levenshtein v1.2.1 h1:EHBY3UOn1gwdy/VbFwgo4cxecRznFk7fKWN1KOX7eoM=
github.com/agnivade/levenshtein v1.2.1/go.mod h1:QVVI16kDrtSuwcpd0p1+xMC6Z/VfhtCyDIjcwga4/DU= github.com/agnivade/levenshtein v1.2.1/go.mod h1:QVVI16kDrtSuwcpd0p1+xMC6Z/VfhtCyDIjcwga4/DU=
github.com/alexbrainman/sspi v0.0.0-20231016080023-1a75b4708caa h1:LHTHcTQiSGT7VVbI0o4wBRNQIgn917usHWOd6VAffYI= github.com/alexbrainman/sspi v0.0.0-20231016080023-1a75b4708caa h1:LHTHcTQiSGT7VVbI0o4wBRNQIgn917usHWOd6VAffYI=
@ -25,13 +24,10 @@ github.com/andreyvit/diff v0.0.0-20170406064948-c7f18ee00883 h1:bvNMNQO63//z+xNg
github.com/andreyvit/diff v0.0.0-20170406064948-c7f18ee00883/go.mod h1:rCTlJbsFo29Kk6CurOXKm700vrz8f0KW0JNfpkRJY/8= github.com/andreyvit/diff v0.0.0-20170406064948-c7f18ee00883/go.mod h1:rCTlJbsFo29Kk6CurOXKm700vrz8f0KW0JNfpkRJY/8=
github.com/andybalholm/cascadia v1.3.2 h1:3Xi6Dw5lHF15JtdcmAHD3i1+T8plmv7BQ/nsViSLyss= github.com/andybalholm/cascadia v1.3.2 h1:3Xi6Dw5lHF15JtdcmAHD3i1+T8plmv7BQ/nsViSLyss=
github.com/andybalholm/cascadia v1.3.2/go.mod h1:7gtRlve5FxPPgIgX36uWBX58OdBsSS6lUvCFb+h7KvU= github.com/andybalholm/cascadia v1.3.2/go.mod h1:7gtRlve5FxPPgIgX36uWBX58OdBsSS6lUvCFb+h7KvU=
github.com/apapsch/go-jsonmerge/v2 v2.0.0 h1:axGnT1gRIfimI7gJifB699GoE/oq+F2MU7Dml6nw9rQ=
github.com/apapsch/go-jsonmerge/v2 v2.0.0/go.mod h1:lvDnEdqiQrp0O42VQGgmlKpxL1AP2+08jFMw88y4klk=
github.com/arbovm/levenshtein v0.0.0-20160628152529-48b4e1c0c4d0 h1:jfIu9sQUG6Ig+0+Ap1h4unLjW6YQJpKZVmUzxsD4E/Q= github.com/arbovm/levenshtein v0.0.0-20160628152529-48b4e1c0c4d0 h1:jfIu9sQUG6Ig+0+Ap1h4unLjW6YQJpKZVmUzxsD4E/Q=
github.com/arbovm/levenshtein v0.0.0-20160628152529-48b4e1c0c4d0/go.mod h1:t2tdKJDJF9BV14lnkjHmOQgcvEKgtqs5a1N3LNdJhGE= github.com/arbovm/levenshtein v0.0.0-20160628152529-48b4e1c0c4d0/go.mod h1:t2tdKJDJF9BV14lnkjHmOQgcvEKgtqs5a1N3LNdJhGE=
github.com/beorn7/perks v1.0.1 h1:VlbKKnNfV8bJzeqoa4cOKqO6bYr3WgKZxO8Z16+hsOM= github.com/beorn7/perks v1.0.1 h1:VlbKKnNfV8bJzeqoa4cOKqO6bYr3WgKZxO8Z16+hsOM=
github.com/beorn7/perks v1.0.1/go.mod h1:G2ZrVWU2WbWT9wwq4/hrbKbnv/1ERSJQ0ibhJ6rlkpw= github.com/beorn7/perks v1.0.1/go.mod h1:G2ZrVWU2WbWT9wwq4/hrbKbnv/1ERSJQ0ibhJ6rlkpw=
github.com/bmatcuk/doublestar v1.1.1/go.mod h1:UD6OnuiIn0yFxxA2le/rnRU1G4RaI4UvFv1sNto9p6w=
github.com/cespare/xxhash/v2 v2.3.0 h1:UL815xU9SqsFlibzuggzjXhog7bL6oX9BbNZnL2UFvs= github.com/cespare/xxhash/v2 v2.3.0 h1:UL815xU9SqsFlibzuggzjXhog7bL6oX9BbNZnL2UFvs=
github.com/cespare/xxhash/v2 v2.3.0/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs= github.com/cespare/xxhash/v2 v2.3.0/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs=
github.com/coreos/go-oidc/v3 v3.12.0 h1:sJk+8G2qq94rDI6ehZ71Bol3oUHy63qNYmkiSjrc/Jo= github.com/coreos/go-oidc/v3 v3.12.0 h1:sJk+8G2qq94rDI6ehZ71Bol3oUHy63qNYmkiSjrc/Jo=
@ -53,8 +49,12 @@ github.com/docker/go-connections v0.5.0 h1:USnMq7hx7gwdVZq1L49hLXaFtUdTADjXGp+uj
github.com/docker/go-connections v0.5.0/go.mod h1:ov60Kzw0kKElRwhNs9UlUHAE/F9Fe6GLaXnqyDdmEXc= github.com/docker/go-connections v0.5.0/go.mod h1:ov60Kzw0kKElRwhNs9UlUHAE/F9Fe6GLaXnqyDdmEXc=
github.com/docker/go-units v0.5.0 h1:69rxXcBk27SvSaaxTtLh/8llcHD8vYHT7WSdRZ/jvr4= github.com/docker/go-units v0.5.0 h1:69rxXcBk27SvSaaxTtLh/8llcHD8vYHT7WSdRZ/jvr4=
github.com/docker/go-units v0.5.0/go.mod h1:fgPhTUdO+D/Jk86RDLlptpiXQzgHJF7gydDDbaIK4Dk= github.com/docker/go-units v0.5.0/go.mod h1:fgPhTUdO+D/Jk86RDLlptpiXQzgHJF7gydDDbaIK4Dk=
github.com/expr-lang/expr v1.17.3 h1:myeTTuDFz7k6eFe/JPlep/UsiIjVhG61FMHFu63U7j0=
github.com/expr-lang/expr v1.17.3/go.mod h1:8/vRC7+7HBzESEqt5kKpYXxrxkr31SaO8r40VO/1IT4=
github.com/felixge/httpsnoop v1.0.4 h1:NFTV2Zj1bL4mc9sqWACXbQFVBBg2W3GPvqp8/ESS2Wg= github.com/felixge/httpsnoop v1.0.4 h1:NFTV2Zj1bL4mc9sqWACXbQFVBBg2W3GPvqp8/ESS2Wg=
github.com/felixge/httpsnoop v1.0.4/go.mod h1:m8KPJKqk1gH5J9DgRY2ASl2lWCfGKXixSwevea8zH2U= github.com/felixge/httpsnoop v1.0.4/go.mod h1:m8KPJKqk1gH5J9DgRY2ASl2lWCfGKXixSwevea8zH2U=
github.com/fsnotify/fsnotify v1.9.0 h1:2Ml+OJNzbYCTzsxtv8vKSFD9PbJjmhYF14k/jKC7S9k=
github.com/fsnotify/fsnotify v1.9.0/go.mod h1:8jBTzvmWwFyi3Pb8djgCCO5IBqzKJ/Jwo8TRcHyHii0=
github.com/go-asn1-ber/asn1-ber v1.5.7 h1:DTX+lbVTWaTw1hQ+PbZPlnDZPEIs0SS/GCZAl535dDk= github.com/go-asn1-ber/asn1-ber v1.5.7 h1:DTX+lbVTWaTw1hQ+PbZPlnDZPEIs0SS/GCZAl535dDk=
github.com/go-asn1-ber/asn1-ber v1.5.7/go.mod h1:hEBeB/ic+5LoWskz+yKT7vGhhPYkProFKoKdwZRWMe0= github.com/go-asn1-ber/asn1-ber v1.5.7/go.mod h1:hEBeB/ic+5LoWskz+yKT7vGhhPYkProFKoKdwZRWMe0=
github.com/go-co-op/gocron/v2 v2.16.0 h1:uqUF6WFZ4enRU45pWFNcn1xpDLc+jBOTKhPQI16Z1xs= github.com/go-co-op/gocron/v2 v2.16.0 h1:uqUF6WFZ4enRU45pWFNcn1xpDLc+jBOTKhPQI16Z1xs=
@ -119,10 +119,6 @@ github.com/hashicorp/go-uuid v1.0.3 h1:2gKiV6YVmrJ1i2CKKa9obLvRieoRGviZFL26PcT/C
github.com/hashicorp/go-uuid v1.0.3/go.mod h1:6SBZvOh/SIDV7/2o3Jml5SYk/TvGqwFJ/bN7x4byOro= github.com/hashicorp/go-uuid v1.0.3/go.mod h1:6SBZvOh/SIDV7/2o3Jml5SYk/TvGqwFJ/bN7x4byOro=
github.com/hashicorp/golang-lru/v2 v2.0.7 h1:a+bsQ5rvGLjzHuww6tVxozPZFVghXaHOwFs4luLUK2k= github.com/hashicorp/golang-lru/v2 v2.0.7 h1:a+bsQ5rvGLjzHuww6tVxozPZFVghXaHOwFs4luLUK2k=
github.com/hashicorp/golang-lru/v2 v2.0.7/go.mod h1:QeFd9opnmA6QUJc5vARoKUSoFhyfM2/ZepoAG6RGpeM= github.com/hashicorp/golang-lru/v2 v2.0.7/go.mod h1:QeFd9opnmA6QUJc5vARoKUSoFhyfM2/ZepoAG6RGpeM=
github.com/influxdata/influxdb-client-go/v2 v2.14.0 h1:AjbBfJuq+QoaXNcrova8smSjwJdUHnwvfjMF71M1iI4=
github.com/influxdata/influxdb-client-go/v2 v2.14.0/go.mod h1:Ahpm3QXKMJslpXl3IftVLVezreAUtBOTZssDrjZEFHI=
github.com/influxdata/line-protocol v0.0.0-20210922203350-b1ad95c89adf h1:7JTmneyiNEwVBOHSjoMxiWAqB992atOeepeFYegn5RU=
github.com/influxdata/line-protocol v0.0.0-20210922203350-b1ad95c89adf/go.mod h1:xaLFMmpvUxqXtVkUJfg9QmT88cDaCJ3ZKgdZ78oO8Qo=
github.com/jcmturner/aescts/v2 v2.0.0 h1:9YKLH6ey7H4eDBXW8khjYslgyqG2xZikXP0EQFKrle8= github.com/jcmturner/aescts/v2 v2.0.0 h1:9YKLH6ey7H4eDBXW8khjYslgyqG2xZikXP0EQFKrle8=
github.com/jcmturner/aescts/v2 v2.0.0/go.mod h1:AiaICIRyfYg35RUkr8yESTqvSy7csK90qZ5xfvvsoNs= github.com/jcmturner/aescts/v2 v2.0.0/go.mod h1:AiaICIRyfYg35RUkr8yESTqvSy7csK90qZ5xfvvsoNs=
github.com/jcmturner/dnsutils/v2 v2.0.0 h1:lltnkeZGL0wILNvrNiVCR6Ro5PGU/SeBvVO/8c/iPbo= github.com/jcmturner/dnsutils/v2 v2.0.0 h1:lltnkeZGL0wILNvrNiVCR6Ro5PGU/SeBvVO/8c/iPbo=
@ -147,7 +143,6 @@ github.com/jpillora/backoff v1.0.0 h1:uvFg412JmmHBHw7iwprIxkPMI+sGQ4kzOWsMeHnm2E
github.com/jpillora/backoff v1.0.0/go.mod h1:J/6gKK9jxlEcS3zixgDgUAsiuZ7yrSoa/FX5e0EB2j4= github.com/jpillora/backoff v1.0.0/go.mod h1:J/6gKK9jxlEcS3zixgDgUAsiuZ7yrSoa/FX5e0EB2j4=
github.com/json-iterator/go v1.1.12 h1:PV8peI4a0ysnczrg+LtxykD8LfKY9ML6u2jnxaEnrnM= github.com/json-iterator/go v1.1.12 h1:PV8peI4a0ysnczrg+LtxykD8LfKY9ML6u2jnxaEnrnM=
github.com/json-iterator/go v1.1.12/go.mod h1:e30LSqwooZae/UwlEbR2852Gd8hjQvJoHmT4TnhNGBo= github.com/json-iterator/go v1.1.12/go.mod h1:e30LSqwooZae/UwlEbR2852Gd8hjQvJoHmT4TnhNGBo=
github.com/juju/gnuflag v0.0.0-20171113085948-2ce1bb71843d/go.mod h1:2PavIy+JPciBPrBUjwbNvtwB6RQlve+hkpll6QSNmOE=
github.com/klauspost/compress v1.17.11 h1:In6xLpyWOi1+C7tXUUWv2ot1QvBjxevKAaI6IXrJmUc= github.com/klauspost/compress v1.17.11 h1:In6xLpyWOi1+C7tXUUWv2ot1QvBjxevKAaI6IXrJmUc=
github.com/klauspost/compress v1.17.11/go.mod h1:pMDklpSncoRMuLFrf1W9Ss9KT+0rH90U12bZKk7uwG0= github.com/klauspost/compress v1.17.11/go.mod h1:pMDklpSncoRMuLFrf1W9Ss9KT+0rH90U12bZKk7uwG0=
github.com/kr/pretty v0.3.1 h1:flRD4NNwYAUpkphVc1HcthR4KEIFJ65n8Mw5qdRn3LE= github.com/kr/pretty v0.3.1 h1:flRD4NNwYAUpkphVc1HcthR4KEIFJ65n8Mw5qdRn3LE=
@ -182,8 +177,6 @@ github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 h1:C3w9PqII01/Oq
github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822/go.mod h1:+n7T8mK8HuQTcFwEeznm/DIxMOiR9yIdICNftLE1DvQ= github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822/go.mod h1:+n7T8mK8HuQTcFwEeznm/DIxMOiR9yIdICNftLE1DvQ=
github.com/mwitkow/go-conntrack v0.0.0-20190716064945-2f068394615f h1:KUppIJq7/+SVif2QVs3tOP0zanoHgBEVAwHxUSIzRqU= github.com/mwitkow/go-conntrack v0.0.0-20190716064945-2f068394615f h1:KUppIJq7/+SVif2QVs3tOP0zanoHgBEVAwHxUSIzRqU=
github.com/mwitkow/go-conntrack v0.0.0-20190716064945-2f068394615f/go.mod h1:qRWi+5nqEBWmkhHvq77mSJWrCKwh8bxhgT7d/eI7P4U= github.com/mwitkow/go-conntrack v0.0.0-20190716064945-2f068394615f/go.mod h1:qRWi+5nqEBWmkhHvq77mSJWrCKwh8bxhgT7d/eI7P4U=
github.com/oapi-codegen/runtime v1.1.1 h1:EXLHh0DXIJnWhdRPN2w4MXAzFyE4CskzhNLUmtpMYro=
github.com/oapi-codegen/runtime v1.1.1/go.mod h1:SK9X900oXmPWilYR5/WKPzt3Kqxn/uS/+lbpREv+eCg=
github.com/opencontainers/go-digest v1.0.0 h1:apOUWs51W5PlhuyGyz9FCeeBIOUDA/6nW8Oi/yOhh5U= github.com/opencontainers/go-digest v1.0.0 h1:apOUWs51W5PlhuyGyz9FCeeBIOUDA/6nW8Oi/yOhh5U=
github.com/opencontainers/go-digest v1.0.0/go.mod h1:0JzlMkj0TRzQZfJkVvzbP0HBR3IKzErnv2BNG4W4MAM= github.com/opencontainers/go-digest v1.0.0/go.mod h1:0JzlMkj0TRzQZfJkVvzbP0HBR3IKzErnv2BNG4W4MAM=
github.com/opencontainers/image-spec v1.1.0 h1:8SG7/vwALn54lVB/0yZ/MMwhFrPYtpEHQb2IpWsCzug= github.com/opencontainers/image-spec v1.1.0 h1:8SG7/vwALn54lVB/0yZ/MMwhFrPYtpEHQb2IpWsCzug=
@ -215,7 +208,6 @@ github.com/sergi/go-diff v1.3.1 h1:xkr+Oxo4BOQKmkn/B9eMK0g5Kg/983T9DqqPHwYqD+8=
github.com/sergi/go-diff v1.3.1/go.mod h1:aMJSSKb2lpPvRNec0+w3fl7LP9IOFzdc9Pa4NFbPK1I= github.com/sergi/go-diff v1.3.1/go.mod h1:aMJSSKb2lpPvRNec0+w3fl7LP9IOFzdc9Pa4NFbPK1I=
github.com/sosodev/duration v1.3.1 h1:qtHBDMQ6lvMQsL15g4aopM4HEfOaYuhWBw3NPTtlqq4= github.com/sosodev/duration v1.3.1 h1:qtHBDMQ6lvMQsL15g4aopM4HEfOaYuhWBw3NPTtlqq4=
github.com/sosodev/duration v1.3.1/go.mod h1:RQIBBX0+fMLc/D9+Jb/fwvVmo0eZvDDEERAikUR6SDg= github.com/sosodev/duration v1.3.1/go.mod h1:RQIBBX0+fMLc/D9+Jb/fwvVmo0eZvDDEERAikUR6SDg=
github.com/spkg/bom v0.0.0-20160624110644-59b7046e48ad/go.mod h1:qLr4V1qq6nMqFKkMo8ZTx3f+BZEkzsRUY10Xsm2mwU0=
github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
github.com/stretchr/objx v0.4.0/go.mod h1:YvHI0jy2hoMjB+UWwv71VJQ9isScKT/TqJzVSSt89Yw= github.com/stretchr/objx v0.4.0/go.mod h1:YvHI0jy2hoMjB+UWwv71VJQ9isScKT/TqJzVSSt89Yw=
github.com/stretchr/objx v0.5.0/go.mod h1:Yh+to48EsGEfYuaHDzXPcE3xhTkx73EhmCGUpEOglKo= github.com/stretchr/objx v0.5.0/go.mod h1:Yh+to48EsGEfYuaHDzXPcE3xhTkx73EhmCGUpEOglKo=

View File

@ -62,6 +62,11 @@ models:
fields: fields:
partitions: partitions:
resolver: true resolver: true
Node:
model: "github.com/ClusterCockpit/cc-backend/pkg/schema.Node"
fields:
metaData:
resolver: true
NullableFloat: NullableFloat:
{ model: "github.com/ClusterCockpit/cc-backend/pkg/schema.Float" } { model: "github.com/ClusterCockpit/cc-backend/pkg/schema.Float" }
MetricScope: MetricScope:
@ -81,6 +86,10 @@ models:
{ model: "github.com/ClusterCockpit/cc-backend/pkg/schema.Resource" } { model: "github.com/ClusterCockpit/cc-backend/pkg/schema.Resource" }
JobState: JobState:
{ model: "github.com/ClusterCockpit/cc-backend/pkg/schema.JobState" } { model: "github.com/ClusterCockpit/cc-backend/pkg/schema.JobState" }
MonitoringState:
{ model: "github.com/ClusterCockpit/cc-backend/pkg/schema.NodeState" }
HealthState:
{ model: "github.com/ClusterCockpit/cc-backend/pkg/schema.MonitoringState" }
TimeRange: TimeRange:
{ model: "github.com/ClusterCockpit/cc-backend/pkg/schema.TimeRange" } { model: "github.com/ClusterCockpit/cc-backend/pkg/schema.TimeRange" }
IntRange: IntRange:

View File

@ -123,7 +123,7 @@ func setup(t *testing.T) *api.RestApi {
t.Fatal(err) t.Fatal(err)
} }
if err := os.WriteFile(filepath.Join(jobarchive, "version.txt"), []byte(fmt.Sprintf("%d", 2)), 0666); err != nil { if err := os.WriteFile(filepath.Join(jobarchive, "version.txt"), fmt.Appendf(nil, "%d", 2), 0666); err != nil {
t.Fatal(err) t.Fatal(err)
} }
@ -204,11 +204,11 @@ func TestRestApi(t *testing.T) {
restapi.MountApiRoutes(r) restapi.MountApiRoutes(r)
var TestJobId int64 = 123 var TestJobId int64 = 123
var TestClusterName string = "testcluster" TestClusterName := "testcluster"
var TestStartTime int64 = 123456789 var TestStartTime int64 = 123456789
const startJobBody string = `{ const startJobBody string = `{
"jobId": 123, "jobId": 123,
"user": "testuser", "user": "testuser",
"project": "testproj", "project": "testproj",
"cluster": "testcluster", "cluster": "testcluster",
@ -221,7 +221,6 @@ func TestRestApi(t *testing.T) {
"exclusive": 1, "exclusive": 1,
"monitoringStatus": 1, "monitoringStatus": 1,
"smt": 1, "smt": 1,
"tags": [{ "type": "testTagType", "name": "testTagName", "scope": "testuser" }],
"resources": [ "resources": [
{ {
"hostname": "host123", "hostname": "host123",
@ -252,16 +251,17 @@ func TestRestApi(t *testing.T) {
if response.StatusCode != http.StatusCreated { if response.StatusCode != http.StatusCreated {
t.Fatal(response.Status, recorder.Body.String()) t.Fatal(response.Status, recorder.Body.String())
} }
resolver := graph.GetResolverInstance() // resolver := graph.GetResolverInstance()
restapi.JobRepository.SyncJobs()
job, err := restapi.JobRepository.Find(&TestJobId, &TestClusterName, &TestStartTime) job, err := restapi.JobRepository.Find(&TestJobId, &TestClusterName, &TestStartTime)
if err != nil { if err != nil {
t.Fatal(err) t.Fatal(err)
} }
job.Tags, err = resolver.Job().Tags(ctx, job) // job.Tags, err = resolver.Job().Tags(ctx, job)
if err != nil { // if err != nil {
t.Fatal(err) // t.Fatal(err)
} // }
if job.JobID != 123 || if job.JobID != 123 ||
job.User != "testuser" || job.User != "testuser" ||
@ -278,13 +278,13 @@ func TestRestApi(t *testing.T) {
job.MonitoringStatus != 1 || job.MonitoringStatus != 1 ||
job.SMT != 1 || job.SMT != 1 ||
!reflect.DeepEqual(job.Resources, []*schema.Resource{{Hostname: "host123", HWThreads: []int{0, 1, 2, 3, 4, 5, 6, 7}}}) || !reflect.DeepEqual(job.Resources, []*schema.Resource{{Hostname: "host123", HWThreads: []int{0, 1, 2, 3, 4, 5, 6, 7}}}) ||
job.StartTime.Unix() != 123456789 { job.StartTime != 123456789 {
t.Fatalf("unexpected job properties: %#v", job) t.Fatalf("unexpected job properties: %#v", job)
} }
if len(job.Tags) != 1 || job.Tags[0].Type != "testTagType" || job.Tags[0].Name != "testTagName" || job.Tags[0].Scope != "testuser" { // if len(job.Tags) != 1 || job.Tags[0].Type != "testTagType" || job.Tags[0].Name != "testTagName" || job.Tags[0].Scope != "testuser" {
t.Fatalf("unexpected tags: %#v", job.Tags) // t.Fatalf("unexpected tags: %#v", job.Tags)
} // }
}); !ok { }); !ok {
return return
} }
@ -352,7 +352,7 @@ func TestRestApi(t *testing.T) {
t.Run("CheckDoubleStart", func(t *testing.T) { t.Run("CheckDoubleStart", func(t *testing.T) {
// Starting a job with the same jobId and cluster should only be allowed if the startTime is far appart! // Starting a job with the same jobId and cluster should only be allowed if the startTime is far appart!
body := strings.Replace(startJobBody, `"startTime": 123456789`, `"startTime": 123456790`, -1) body := strings.ReplaceAll(startJobBody, `"startTime": 123456789`, `"startTime": 123456790`)
req := httptest.NewRequest(http.MethodPost, "/jobs/start_job/", bytes.NewBuffer([]byte(body))) req := httptest.NewRequest(http.MethodPost, "/jobs/start_job/", bytes.NewBuffer([]byte(body)))
recorder := httptest.NewRecorder() recorder := httptest.NewRecorder()
@ -402,6 +402,7 @@ func TestRestApi(t *testing.T) {
} }
time.Sleep(1 * time.Second) time.Sleep(1 * time.Second)
restapi.JobRepository.SyncJobs()
const stopJobBodyFailed string = `{ const stopJobBodyFailed string = `{
"jobId": 12345, "jobId": 12345,

70
internal/api/cluster.go Normal file
View File

@ -0,0 +1,70 @@
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
package api
import (
"bufio"
"encoding/json"
"fmt"
"net/http"
"github.com/ClusterCockpit/cc-backend/internal/repository"
"github.com/ClusterCockpit/cc-backend/pkg/archive"
"github.com/ClusterCockpit/cc-backend/pkg/schema"
)
// GetClustersApiResponse model
type GetClustersApiResponse struct {
Clusters []*schema.Cluster `json:"clusters"` // Array of clusters
}
// getClusters godoc
// @summary Lists all cluster configs
// @tags Cluster query
// @description Get a list of all cluster configs. Specific cluster can be requested using query parameter.
// @produce json
// @param cluster query string false "Job Cluster"
// @success 200 {object} api.GetClustersApiResponse "Array of clusters"
// @failure 400 {object} api.ErrorResponse "Bad Request"
// @failure 401 {object} api.ErrorResponse "Unauthorized"
// @failure 403 {object} api.ErrorResponse "Forbidden"
// @failure 500 {object} api.ErrorResponse "Internal Server Error"
// @security ApiKeyAuth
// @router /api/clusters/ [get]
func (api *RestApi) getClusters(rw http.ResponseWriter, r *http.Request) {
if user := repository.GetUserFromContext(r.Context()); user != nil &&
!user.HasRole(schema.RoleApi) {
handleError(fmt.Errorf("missing role: %v", schema.GetRoleString(schema.RoleApi)), http.StatusForbidden, rw)
return
}
rw.Header().Add("Content-Type", "application/json")
bw := bufio.NewWriter(rw)
defer bw.Flush()
var clusters []*schema.Cluster
if r.URL.Query().Has("cluster") {
name := r.URL.Query().Get("cluster")
cluster := archive.GetCluster(name)
if cluster == nil {
handleError(fmt.Errorf("unknown cluster: %s", name), http.StatusBadRequest, rw)
return
}
clusters = append(clusters, cluster)
} else {
clusters = archive.Clusters
}
payload := GetClustersApiResponse{
Clusters: clusters,
}
if err := json.NewEncoder(bw).Encode(payload); err != nil {
handleError(err, http.StatusInternalServerError, rw)
return
}
}

987
internal/api/job.go Normal file
View File

@ -0,0 +1,987 @@
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
package api
import (
"bufio"
"database/sql"
"encoding/json"
"errors"
"fmt"
"net/http"
"strconv"
"strings"
"sync"
"time"
"github.com/ClusterCockpit/cc-backend/internal/archiver"
"github.com/ClusterCockpit/cc-backend/internal/graph"
"github.com/ClusterCockpit/cc-backend/internal/graph/model"
"github.com/ClusterCockpit/cc-backend/internal/importer"
"github.com/ClusterCockpit/cc-backend/internal/metricDataDispatcher"
"github.com/ClusterCockpit/cc-backend/internal/repository"
"github.com/ClusterCockpit/cc-backend/pkg/archive"
"github.com/ClusterCockpit/cc-backend/pkg/log"
"github.com/ClusterCockpit/cc-backend/pkg/schema"
"github.com/gorilla/mux"
)
// DefaultApiResponse model
type DefaultJobApiResponse struct {
Message string `json:"msg"`
}
// StopJobApiRequest model
type StopJobApiRequest struct {
JobId *int64 `json:"jobId" example:"123000"`
Cluster *string `json:"cluster" example:"fritz"`
StartTime *int64 `json:"startTime" example:"1649723812"`
State schema.JobState `json:"jobState" validate:"required" example:"completed"`
StopTime int64 `json:"stopTime" validate:"required" example:"1649763839"`
}
// DeleteJobApiRequest model
type DeleteJobApiRequest struct {
JobId *int64 `json:"jobId" validate:"required" example:"123000"` // Cluster Job ID of job
Cluster *string `json:"cluster" example:"fritz"` // Cluster of job
StartTime *int64 `json:"startTime" example:"1649723812"` // Start Time of job as epoch
}
// GetJobsApiResponse model
type GetJobsApiResponse struct {
Jobs []*schema.Job `json:"jobs"` // Array of jobs
Items int `json:"items"` // Number of jobs returned
Page int `json:"page"` // Page id returned
}
// ApiTag model
type ApiTag struct {
// Tag Type
Type string `json:"type" example:"Debug"`
Name string `json:"name" example:"Testjob"` // Tag Name
Scope string `json:"scope" example:"global"` // Tag Scope for Frontend Display
}
// ApiMeta model
type EditMetaRequest struct {
Key string `json:"key" example:"jobScript"`
Value string `json:"value" example:"bash script"`
}
type TagJobApiRequest []*ApiTag
type GetJobApiRequest []string
type GetJobApiResponse struct {
Meta *schema.Job
Data []*JobMetricWithName
}
type GetCompleteJobApiResponse struct {
Meta *schema.Job
Data schema.JobData
}
type JobMetricWithName struct {
Metric *schema.JobMetric `json:"metric"`
Name string `json:"name"`
Scope schema.MetricScope `json:"scope"`
}
// getJobs godoc
// @summary Lists all jobs
// @tags Job query
// @description Get a list of all jobs. Filters can be applied using query parameters.
// @description Number of results can be limited by page. Results are sorted by descending startTime.
// @produce json
// @param state query string false "Job State" Enums(running, completed, failed, cancelled, stopped, timeout)
// @param cluster query string false "Job Cluster"
// @param start-time query string false "Syntax: '$from-$to', as unix epoch timestamps in seconds"
// @param items-per-page query int false "Items per page (Default: 25)"
// @param page query int false "Page Number (Default: 1)"
// @param with-metadata query bool false "Include metadata (e.g. jobScript) in response"
// @success 200 {object} api.GetJobsApiResponse "Job array and page info"
// @failure 400 {object} api.ErrorResponse "Bad Request"
// @failure 401 {object} api.ErrorResponse "Unauthorized"
// @failure 403 {object} api.ErrorResponse "Forbidden"
// @failure 500 {object} api.ErrorResponse "Internal Server Error"
// @security ApiKeyAuth
// @router /api/jobs/ [get]
func (api *RestApi) getJobs(rw http.ResponseWriter, r *http.Request) {
withMetadata := false
filter := &model.JobFilter{}
page := &model.PageRequest{ItemsPerPage: 25, Page: 1}
order := &model.OrderByInput{Field: "startTime", Type: "col", Order: model.SortDirectionEnumDesc}
for key, vals := range r.URL.Query() {
switch key {
case "state":
for _, s := range vals {
state := schema.JobState(s)
if !state.Valid() {
handleError(fmt.Errorf("invalid query parameter value: state"),
http.StatusBadRequest, rw)
return
}
filter.State = append(filter.State, state)
}
case "cluster":
filter.Cluster = &model.StringInput{Eq: &vals[0]}
case "start-time":
st := strings.Split(vals[0], "-")
if len(st) != 2 {
handleError(fmt.Errorf("invalid query parameter value: startTime"),
http.StatusBadRequest, rw)
return
}
from, err := strconv.ParseInt(st[0], 10, 64)
if err != nil {
handleError(err, http.StatusBadRequest, rw)
return
}
to, err := strconv.ParseInt(st[1], 10, 64)
if err != nil {
handleError(err, http.StatusBadRequest, rw)
return
}
ufrom, uto := time.Unix(from, 0), time.Unix(to, 0)
filter.StartTime = &schema.TimeRange{From: &ufrom, To: &uto}
case "page":
x, err := strconv.Atoi(vals[0])
if err != nil {
handleError(err, http.StatusBadRequest, rw)
return
}
page.Page = x
case "items-per-page":
x, err := strconv.Atoi(vals[0])
if err != nil {
handleError(err, http.StatusBadRequest, rw)
return
}
page.ItemsPerPage = x
case "with-metadata":
withMetadata = true
default:
handleError(fmt.Errorf("invalid query parameter: %s", key),
http.StatusBadRequest, rw)
return
}
}
jobs, err := api.JobRepository.QueryJobs(r.Context(), []*model.JobFilter{filter}, page, order)
if err != nil {
handleError(err, http.StatusInternalServerError, rw)
return
}
results := make([]*schema.Job, 0, len(jobs))
for _, job := range jobs {
if withMetadata {
if _, err = api.JobRepository.FetchMetadata(job); err != nil {
handleError(err, http.StatusInternalServerError, rw)
return
}
}
job.Tags, err = api.JobRepository.GetTags(repository.GetUserFromContext(r.Context()), job.ID)
if err != nil {
handleError(err, http.StatusInternalServerError, rw)
return
}
if job.MonitoringStatus == schema.MonitoringStatusArchivingSuccessful {
job.Statistics, err = archive.GetStatistics(job)
if err != nil {
handleError(err, http.StatusInternalServerError, rw)
return
}
}
results = append(results, job)
}
log.Debugf("/api/jobs: %d jobs returned", len(results))
rw.Header().Add("Content-Type", "application/json")
bw := bufio.NewWriter(rw)
defer bw.Flush()
payload := GetJobsApiResponse{
Jobs: results,
Items: page.ItemsPerPage,
Page: page.Page,
}
if err := json.NewEncoder(bw).Encode(payload); err != nil {
handleError(err, http.StatusInternalServerError, rw)
return
}
}
// getCompleteJobById godoc
// @summary Get job meta and optional all metric data
// @tags Job query
// @description Job to get is specified by database ID
// @description Returns full job resource information according to 'JobMeta' scheme and all metrics according to 'JobData'.
// @produce json
// @param id path int true "Database ID of Job"
// @param all-metrics query bool false "Include all available metrics"
// @success 200 {object} api.GetJobApiResponse "Job resource"
// @failure 400 {object} api.ErrorResponse "Bad Request"
// @failure 401 {object} api.ErrorResponse "Unauthorized"
// @failure 403 {object} api.ErrorResponse "Forbidden"
// @failure 404 {object} api.ErrorResponse "Resource not found"
// @failure 422 {object} api.ErrorResponse "Unprocessable Entity: finding job failed: sql: no rows in result set"
// @failure 500 {object} api.ErrorResponse "Internal Server Error"
// @security ApiKeyAuth
// @router /api/jobs/{id} [get]
func (api *RestApi) getCompleteJobById(rw http.ResponseWriter, r *http.Request) {
// Fetch job from db
id, ok := mux.Vars(r)["id"]
var job *schema.Job
var err error
if ok {
id, e := strconv.ParseInt(id, 10, 64)
if e != nil {
handleError(fmt.Errorf("integer expected in path for id: %w", e), http.StatusBadRequest, rw)
return
}
job, err = api.JobRepository.FindById(r.Context(), id) // Get Job from Repo by ID
} else {
handleError(fmt.Errorf("the parameter 'id' is required"), http.StatusBadRequest, rw)
return
}
if err != nil {
handleError(fmt.Errorf("finding job with db id %s failed: %w", id, err), http.StatusUnprocessableEntity, rw)
return
}
job.Tags, err = api.JobRepository.GetTags(repository.GetUserFromContext(r.Context()), job.ID)
if err != nil {
handleError(err, http.StatusInternalServerError, rw)
return
}
if _, err = api.JobRepository.FetchMetadata(job); err != nil {
handleError(err, http.StatusInternalServerError, rw)
return
}
var scopes []schema.MetricScope
if job.NumNodes == 1 {
scopes = []schema.MetricScope{"core"}
} else {
scopes = []schema.MetricScope{"node"}
}
var data schema.JobData
metricConfigs := archive.GetCluster(job.Cluster).MetricConfig
resolution := 0
for _, mc := range metricConfigs {
resolution = max(resolution, mc.Timestep)
}
if r.URL.Query().Get("all-metrics") == "true" {
data, err = metricDataDispatcher.LoadData(job, nil, scopes, r.Context(), resolution)
if err != nil {
log.Warnf("REST: error while loading all-metrics job data for JobID %d on %s", job.JobID, job.Cluster)
return
}
}
log.Debugf("/api/job/%s: get job %d", id, job.JobID)
rw.Header().Add("Content-Type", "application/json")
bw := bufio.NewWriter(rw)
defer bw.Flush()
payload := GetCompleteJobApiResponse{
Meta: job,
Data: data,
}
if err := json.NewEncoder(bw).Encode(payload); err != nil {
handleError(err, http.StatusInternalServerError, rw)
return
}
}
// getJobById godoc
// @summary Get job meta and configurable metric data
// @tags Job query
// @description Job to get is specified by database ID
// @description Returns full job resource information according to 'JobMeta' scheme and all metrics according to 'JobData'.
// @accept json
// @produce json
// @param id path int true "Database ID of Job"
// @param request body api.GetJobApiRequest true "Array of metric names"
// @success 200 {object} api.GetJobApiResponse "Job resource"
// @failure 400 {object} api.ErrorResponse "Bad Request"
// @failure 401 {object} api.ErrorResponse "Unauthorized"
// @failure 403 {object} api.ErrorResponse "Forbidden"
// @failure 404 {object} api.ErrorResponse "Resource not found"
// @failure 422 {object} api.ErrorResponse "Unprocessable Entity: finding job failed: sql: no rows in result set"
// @failure 500 {object} api.ErrorResponse "Internal Server Error"
// @security ApiKeyAuth
// @router /api/jobs/{id} [post]
func (api *RestApi) getJobById(rw http.ResponseWriter, r *http.Request) {
// Fetch job from db
id, ok := mux.Vars(r)["id"]
var job *schema.Job
var err error
if ok {
id, e := strconv.ParseInt(id, 10, 64)
if e != nil {
handleError(fmt.Errorf("integer expected in path for id: %w", e), http.StatusBadRequest, rw)
return
}
job, err = api.JobRepository.FindById(r.Context(), id)
} else {
handleError(errors.New("the parameter 'id' is required"), http.StatusBadRequest, rw)
return
}
if err != nil {
handleError(fmt.Errorf("finding job with db id %s failed: %w", id, err), http.StatusUnprocessableEntity, rw)
return
}
job.Tags, err = api.JobRepository.GetTags(repository.GetUserFromContext(r.Context()), job.ID)
if err != nil {
handleError(err, http.StatusInternalServerError, rw)
return
}
if _, err = api.JobRepository.FetchMetadata(job); err != nil {
handleError(err, http.StatusInternalServerError, rw)
return
}
var metrics GetJobApiRequest
if err = decode(r.Body, &metrics); err != nil {
http.Error(rw, err.Error(), http.StatusBadRequest)
return
}
var scopes []schema.MetricScope
if job.NumNodes == 1 {
scopes = []schema.MetricScope{"core"}
} else {
scopes = []schema.MetricScope{"node"}
}
metricConfigs := archive.GetCluster(job.Cluster).MetricConfig
resolution := 0
for _, mc := range metricConfigs {
resolution = max(resolution, mc.Timestep)
}
data, err := metricDataDispatcher.LoadData(job, metrics, scopes, r.Context(), resolution)
if err != nil {
log.Warnf("REST: error while loading job data for JobID %d on %s", job.JobID, job.Cluster)
return
}
res := []*JobMetricWithName{}
for name, md := range data {
for scope, metric := range md {
res = append(res, &JobMetricWithName{
Name: name,
Scope: scope,
Metric: metric,
})
}
}
log.Debugf("/api/job/%s: get job %d", id, job.JobID)
rw.Header().Add("Content-Type", "application/json")
bw := bufio.NewWriter(rw)
defer bw.Flush()
payload := GetJobApiResponse{
Meta: job,
Data: res,
}
if err := json.NewEncoder(bw).Encode(payload); err != nil {
handleError(err, http.StatusInternalServerError, rw)
return
}
}
// editMeta godoc
// @summary Edit meta-data json
// @tags Job add and modify
// @description Edit key value pairs in job metadata json
// @description If a key already exists its content will be overwritten
// @accept json
// @produce json
// @param id path int true "Job Database ID"
// @param request body api.EditMetaRequest true "Kay value pair to add"
// @success 200 {object} schema.Job "Updated job resource"
// @failure 400 {object} api.ErrorResponse "Bad Request"
// @failure 401 {object} api.ErrorResponse "Unauthorized"
// @failure 404 {object} api.ErrorResponse "Job does not exist"
// @failure 500 {object} api.ErrorResponse "Internal Server Error"
// @security ApiKeyAuth
// @router /api/jobs/edit_meta/{id} [post]
func (api *RestApi) editMeta(rw http.ResponseWriter, r *http.Request) {
id, err := strconv.ParseInt(mux.Vars(r)["id"], 10, 64)
if err != nil {
http.Error(rw, err.Error(), http.StatusBadRequest)
return
}
job, err := api.JobRepository.FindById(r.Context(), id)
if err != nil {
http.Error(rw, err.Error(), http.StatusNotFound)
return
}
var req EditMetaRequest
if err := decode(r.Body, &req); err != nil {
http.Error(rw, err.Error(), http.StatusBadRequest)
return
}
if err := api.JobRepository.UpdateMetadata(job, req.Key, req.Value); err != nil {
http.Error(rw, err.Error(), http.StatusInternalServerError)
return
}
rw.Header().Add("Content-Type", "application/json")
rw.WriteHeader(http.StatusOK)
json.NewEncoder(rw).Encode(job)
}
// tagJob godoc
// @summary Adds one or more tags to a job
// @tags Job add and modify
// @description Adds tag(s) to a job specified by DB ID. Name and Type of Tag(s) can be chosen freely.
// @description Tag Scope for frontend visibility will default to "global" if none entered, other options: "admin" or specific username.
// @description If tagged job is already finished: Tag will be written directly to respective archive files.
// @accept json
// @produce json
// @param id path int true "Job Database ID"
// @param request body api.TagJobApiRequest true "Array of tag-objects to add"
// @success 200 {object} schema.Job "Updated job resource"
// @failure 400 {object} api.ErrorResponse "Bad Request"
// @failure 401 {object} api.ErrorResponse "Unauthorized"
// @failure 404 {object} api.ErrorResponse "Job or tag does not exist"
// @failure 500 {object} api.ErrorResponse "Internal Server Error"
// @security ApiKeyAuth
// @router /api/jobs/tag_job/{id} [post]
func (api *RestApi) tagJob(rw http.ResponseWriter, r *http.Request) {
id, err := strconv.ParseInt(mux.Vars(r)["id"], 10, 64)
if err != nil {
http.Error(rw, err.Error(), http.StatusBadRequest)
return
}
job, err := api.JobRepository.FindById(r.Context(), id)
if err != nil {
http.Error(rw, err.Error(), http.StatusNotFound)
return
}
job.Tags, err = api.JobRepository.GetTags(repository.GetUserFromContext(r.Context()), job.ID)
if err != nil {
http.Error(rw, err.Error(), http.StatusInternalServerError)
return
}
var req TagJobApiRequest
if err := decode(r.Body, &req); err != nil {
http.Error(rw, err.Error(), http.StatusBadRequest)
return
}
for _, tag := range req {
tagId, err := api.JobRepository.AddTagOrCreate(repository.GetUserFromContext(r.Context()), *job.ID, tag.Type, tag.Name, tag.Scope)
if err != nil {
http.Error(rw, err.Error(), http.StatusInternalServerError)
return
}
job.Tags = append(job.Tags, &schema.Tag{
ID: tagId,
Type: tag.Type,
Name: tag.Name,
Scope: tag.Scope,
})
}
rw.Header().Add("Content-Type", "application/json")
rw.WriteHeader(http.StatusOK)
json.NewEncoder(rw).Encode(job)
}
// removeTagJob godoc
// @summary Removes one or more tags from a job
// @tags Job add and modify
// @description Removes tag(s) from a job specified by DB ID. Name and Type of Tag(s) must match.
// @description Tag Scope is required for matching, options: "global", "admin". Private tags can not be deleted via API.
// @description If tagged job is already finished: Tag will be removed from respective archive files.
// @accept json
// @produce json
// @param id path int true "Job Database ID"
// @param request body api.TagJobApiRequest true "Array of tag-objects to remove"
// @success 200 {object} schema.Job "Updated job resource"
// @failure 400 {object} api.ErrorResponse "Bad Request"
// @failure 401 {object} api.ErrorResponse "Unauthorized"
// @failure 404 {object} api.ErrorResponse "Job or tag does not exist"
// @failure 500 {object} api.ErrorResponse "Internal Server Error"
// @security ApiKeyAuth
// @router /jobs/tag_job/{id} [delete]
func (api *RestApi) removeTagJob(rw http.ResponseWriter, r *http.Request) {
id, err := strconv.ParseInt(mux.Vars(r)["id"], 10, 64)
if err != nil {
http.Error(rw, err.Error(), http.StatusBadRequest)
return
}
job, err := api.JobRepository.FindById(r.Context(), id)
if err != nil {
http.Error(rw, err.Error(), http.StatusNotFound)
return
}
job.Tags, err = api.JobRepository.GetTags(repository.GetUserFromContext(r.Context()), job.ID)
if err != nil {
http.Error(rw, err.Error(), http.StatusInternalServerError)
return
}
var req TagJobApiRequest
if err := decode(r.Body, &req); err != nil {
http.Error(rw, err.Error(), http.StatusBadRequest)
return
}
for _, rtag := range req {
// Only Global and Admin Tags
if rtag.Scope != "global" && rtag.Scope != "admin" {
log.Warnf("Cannot delete private tag for job %d: Skip", job.JobID)
continue
}
remainingTags, err := api.JobRepository.RemoveJobTagByRequest(repository.GetUserFromContext(r.Context()), *job.ID, rtag.Type, rtag.Name, rtag.Scope)
if err != nil {
http.Error(rw, err.Error(), http.StatusInternalServerError)
return
}
job.Tags = remainingTags
}
rw.Header().Add("Content-Type", "application/json")
rw.WriteHeader(http.StatusOK)
json.NewEncoder(rw).Encode(job)
}
// removeTags godoc
// @summary Removes all tags and job-relations for type:name tuple
// @tags Tag remove
// @description Removes tags by type and name. Name and Type of Tag(s) must match.
// @description Tag Scope is required for matching, options: "global", "admin". Private tags can not be deleted via API.
// @description Tag wills be removed from respective archive files.
// @accept json
// @produce plain
// @param request body api.TagJobApiRequest true "Array of tag-objects to remove"
// @success 200 {string} string "Success Response"
// @failure 400 {object} api.ErrorResponse "Bad Request"
// @failure 401 {object} api.ErrorResponse "Unauthorized"
// @failure 404 {object} api.ErrorResponse "Job or tag does not exist"
// @failure 500 {object} api.ErrorResponse "Internal Server Error"
// @security ApiKeyAuth
// @router /tags/ [delete]
func (api *RestApi) removeTags(rw http.ResponseWriter, r *http.Request) {
var req TagJobApiRequest
if err := decode(r.Body, &req); err != nil {
http.Error(rw, err.Error(), http.StatusBadRequest)
return
}
targetCount := len(req)
currentCount := 0
for _, rtag := range req {
// Only Global and Admin Tags
if rtag.Scope != "global" && rtag.Scope != "admin" {
log.Warn("Cannot delete private tag: Skip")
continue
}
err := api.JobRepository.RemoveTagByRequest(rtag.Type, rtag.Name, rtag.Scope)
if err != nil {
http.Error(rw, err.Error(), http.StatusInternalServerError)
return
} else {
currentCount++
}
}
rw.WriteHeader(http.StatusOK)
fmt.Fprintf(rw, "Deleted Tags from DB: %d successfull of %d requested\n", currentCount, targetCount)
}
// startJob godoc
// @summary Adds a new job as "running"
// @tags Job add and modify
// @description Job specified in request body will be saved to database as "running" with new DB ID.
// @description Job specifications follow the 'JobMeta' scheme, API will fail to execute if requirements are not met.
// @accept json
// @produce json
// @param request body schema.JobMeta true "Job to add"
// @success 201 {object} api.DefaultJobApiResponse "Job added successfully"
// @failure 400 {object} api.ErrorResponse "Bad Request"
// @failure 401 {object} api.ErrorResponse "Unauthorized"
// @failure 403 {object} api.ErrorResponse "Forbidden"
// @failure 422 {object} api.ErrorResponse "Unprocessable Entity: The combination of jobId, clusterId and startTime does already exist"
// @failure 500 {object} api.ErrorResponse "Internal Server Error"
// @security ApiKeyAuth
// @router /api/jobs/start_job/ [post]
func (api *RestApi) startJob(rw http.ResponseWriter, r *http.Request) {
req := schema.Job{
Exclusive: 1,
MonitoringStatus: schema.MonitoringStatusRunningOrArchiving,
}
if err := decode(r.Body, &req); err != nil {
handleError(fmt.Errorf("parsing request body failed: %w", err), http.StatusBadRequest, rw)
return
}
log.Printf("REST: %s\n", req.GoString())
req.State = schema.JobStateRunning
if err := importer.SanityChecks(&req); err != nil {
handleError(err, http.StatusBadRequest, rw)
return
}
// aquire lock to avoid race condition between API calls
var unlockOnce sync.Once
api.RepositoryMutex.Lock()
defer unlockOnce.Do(api.RepositoryMutex.Unlock)
// Check if combination of (job_id, cluster_id, start_time) already exists:
jobs, err := api.JobRepository.FindAll(&req.JobID, &req.Cluster, nil)
if err != nil && err != sql.ErrNoRows {
handleError(fmt.Errorf("checking for duplicate failed: %w", err), http.StatusInternalServerError, rw)
return
} else if err == nil {
for _, job := range jobs {
if (req.StartTime - job.StartTime) < 86400 {
handleError(fmt.Errorf("a job with that jobId, cluster and startTime already exists: dbid: %d, jobid: %d", job.ID, job.JobID), http.StatusUnprocessableEntity, rw)
return
}
}
}
id, err := api.JobRepository.Start(&req)
if err != nil {
handleError(fmt.Errorf("insert into database failed: %w", err), http.StatusInternalServerError, rw)
return
}
// unlock here, adding Tags can be async
unlockOnce.Do(api.RepositoryMutex.Unlock)
for _, tag := range req.Tags {
if _, err := api.JobRepository.AddTagOrCreate(repository.GetUserFromContext(r.Context()), id, tag.Type, tag.Name, tag.Scope); err != nil {
http.Error(rw, err.Error(), http.StatusInternalServerError)
handleError(fmt.Errorf("adding tag to new job %d failed: %w", id, err), http.StatusInternalServerError, rw)
return
}
}
log.Printf("new job (id: %d): cluster=%s, jobId=%d, user=%s, startTime=%d", id, req.Cluster, req.JobID, req.User, req.StartTime)
rw.Header().Add("Content-Type", "application/json")
rw.WriteHeader(http.StatusCreated)
json.NewEncoder(rw).Encode(DefaultJobApiResponse{
Message: "success",
})
}
// stopJobByRequest godoc
// @summary Marks job as completed and triggers archiving
// @tags Job add and modify
// @description Job to stop is specified by request body. All fields are required in this case.
// @description Returns full job resource information according to 'JobMeta' scheme.
// @produce json
// @param request body api.StopJobApiRequest true "All fields required"
// @success 200 {object} schema.JobMeta "Success message"
// @failure 400 {object} api.ErrorResponse "Bad Request"
// @failure 401 {object} api.ErrorResponse "Unauthorized"
// @failure 403 {object} api.ErrorResponse "Forbidden"
// @failure 404 {object} api.ErrorResponse "Resource not found"
// @failure 422 {object} api.ErrorResponse "Unprocessable Entity: job has already been stopped"
// @failure 500 {object} api.ErrorResponse "Internal Server Error"
// @security ApiKeyAuth
// @router /api/jobs/stop_job/ [post]
func (api *RestApi) stopJobByRequest(rw http.ResponseWriter, r *http.Request) {
// Parse request body
req := StopJobApiRequest{}
if err := decode(r.Body, &req); err != nil {
handleError(fmt.Errorf("parsing request body failed: %w", err), http.StatusBadRequest, rw)
return
}
// Fetch job (that will be stopped) from db
var job *schema.Job
var err error
if req.JobId == nil {
handleError(errors.New("the field 'jobId' is required"), http.StatusBadRequest, rw)
return
}
// log.Printf("loading db job for stopJobByRequest... : stopJobApiRequest=%v", req)
job, err = api.JobRepository.Find(req.JobId, req.Cluster, req.StartTime)
if err != nil {
job, err = api.JobRepository.FindCached(req.JobId, req.Cluster, req.StartTime)
// FIXME: Previous error is hidden
if err != nil {
handleError(fmt.Errorf("finding job failed: %w", err), http.StatusUnprocessableEntity, rw)
return
}
}
api.checkAndHandleStopJob(rw, job, req)
}
// deleteJobById godoc
// @summary Remove a job from the sql database
// @tags Job remove
// @description Job to remove is specified by database ID. This will not remove the job from the job archive.
// @produce json
// @param id path int true "Database ID of Job"
// @success 200 {object} api.DefaultJobApiResponse "Success message"
// @failure 400 {object} api.ErrorResponse "Bad Request"
// @failure 401 {object} api.ErrorResponse "Unauthorized"
// @failure 403 {object} api.ErrorResponse "Forbidden"
// @failure 404 {object} api.ErrorResponse "Resource not found"
// @failure 422 {object} api.ErrorResponse "Unprocessable Entity: finding job failed: sql: no rows in result set"
// @failure 500 {object} api.ErrorResponse "Internal Server Error"
// @security ApiKeyAuth
// @router /api/jobs/delete_job/{id} [delete]
func (api *RestApi) deleteJobById(rw http.ResponseWriter, r *http.Request) {
// Fetch job (that will be stopped) from db
id, ok := mux.Vars(r)["id"]
var err error
if ok {
id, e := strconv.ParseInt(id, 10, 64)
if e != nil {
handleError(fmt.Errorf("integer expected in path for id: %w", e), http.StatusBadRequest, rw)
return
}
err = api.JobRepository.DeleteJobById(id)
} else {
handleError(errors.New("the parameter 'id' is required"), http.StatusBadRequest, rw)
return
}
if err != nil {
handleError(fmt.Errorf("deleting job failed: %w", err), http.StatusUnprocessableEntity, rw)
return
}
rw.Header().Add("Content-Type", "application/json")
rw.WriteHeader(http.StatusOK)
json.NewEncoder(rw).Encode(DefaultJobApiResponse{
Message: fmt.Sprintf("Successfully deleted job %s", id),
})
}
// deleteJobByRequest godoc
// @summary Remove a job from the sql database
// @tags Job remove
// @description Job to delete is specified by request body. All fields are required in this case.
// @accept json
// @produce json
// @param request body api.DeleteJobApiRequest true "All fields required"
// @success 200 {object} api.DefaultJobApiResponse "Success message"
// @failure 400 {object} api.ErrorResponse "Bad Request"
// @failure 401 {object} api.ErrorResponse "Unauthorized"
// @failure 403 {object} api.ErrorResponse "Forbidden"
// @failure 404 {object} api.ErrorResponse "Resource not found"
// @failure 422 {object} api.ErrorResponse "Unprocessable Entity: finding job failed: sql: no rows in result set"
// @failure 500 {object} api.ErrorResponse "Internal Server Error"
// @security ApiKeyAuth
// @router /api/jobs/delete_job/ [delete]
func (api *RestApi) deleteJobByRequest(rw http.ResponseWriter, r *http.Request) {
// Parse request body
req := DeleteJobApiRequest{}
if err := decode(r.Body, &req); err != nil {
handleError(fmt.Errorf("parsing request body failed: %w", err), http.StatusBadRequest, rw)
return
}
// Fetch job (that will be deleted) from db
var job *schema.Job
var err error
if req.JobId == nil {
handleError(errors.New("the field 'jobId' is required"), http.StatusBadRequest, rw)
return
}
job, err = api.JobRepository.Find(req.JobId, req.Cluster, req.StartTime)
if err != nil {
handleError(fmt.Errorf("finding job failed: %w", err), http.StatusUnprocessableEntity, rw)
return
}
err = api.JobRepository.DeleteJobById(*job.ID)
if err != nil {
handleError(fmt.Errorf("deleting job failed: %w", err), http.StatusUnprocessableEntity, rw)
return
}
rw.Header().Add("Content-Type", "application/json")
rw.WriteHeader(http.StatusOK)
json.NewEncoder(rw).Encode(DefaultJobApiResponse{
Message: fmt.Sprintf("Successfully deleted job %d", job.ID),
})
}
// deleteJobBefore godoc
// @summary Remove a job from the sql database
// @tags Job remove
// @description Remove all jobs with start time before timestamp. The jobs will not be removed from the job archive.
// @produce json
// @param ts path int true "Unix epoch timestamp"
// @success 200 {object} api.DefaultJobApiResponse "Success message"
// @failure 400 {object} api.ErrorResponse "Bad Request"
// @failure 401 {object} api.ErrorResponse "Unauthorized"
// @failure 403 {object} api.ErrorResponse "Forbidden"
// @failure 404 {object} api.ErrorResponse "Resource not found"
// @failure 422 {object} api.ErrorResponse "Unprocessable Entity: finding job failed: sql: no rows in result set"
// @failure 500 {object} api.ErrorResponse "Internal Server Error"
// @security ApiKeyAuth
// @router /api/jobs/delete_job_before/{ts} [delete]
func (api *RestApi) deleteJobBefore(rw http.ResponseWriter, r *http.Request) {
var cnt int
// Fetch job (that will be stopped) from db
id, ok := mux.Vars(r)["ts"]
var err error
if ok {
ts, e := strconv.ParseInt(id, 10, 64)
if e != nil {
handleError(fmt.Errorf("integer expected in path for ts: %w", e), http.StatusBadRequest, rw)
return
}
cnt, err = api.JobRepository.DeleteJobsBefore(ts)
} else {
handleError(errors.New("the parameter 'ts' is required"), http.StatusBadRequest, rw)
return
}
if err != nil {
handleError(fmt.Errorf("deleting jobs failed: %w", err), http.StatusUnprocessableEntity, rw)
return
}
rw.Header().Add("Content-Type", "application/json")
rw.WriteHeader(http.StatusOK)
json.NewEncoder(rw).Encode(DefaultJobApiResponse{
Message: fmt.Sprintf("Successfully deleted %d jobs", cnt),
})
}
func (api *RestApi) checkAndHandleStopJob(rw http.ResponseWriter, job *schema.Job, req StopJobApiRequest) {
// Sanity checks
if job.State != schema.JobStateRunning {
handleError(fmt.Errorf("jobId %d (id %d) on %s : job has already been stopped (state is: %s)", job.JobID, job.ID, job.Cluster, job.State), http.StatusUnprocessableEntity, rw)
return
}
if job == nil || job.StartTime > req.StopTime {
handleError(fmt.Errorf("jobId %d (id %d) on %s : stopTime %d must be larger/equal than startTime %d", job.JobID, job.ID, job.Cluster, req.StopTime, job.StartTime), http.StatusBadRequest, rw)
return
}
if req.State != "" && !req.State.Valid() {
handleError(fmt.Errorf("jobId %d (id %d) on %s : invalid requested job state: %#v", job.JobID, job.ID, job.Cluster, req.State), http.StatusBadRequest, rw)
return
} else if req.State == "" {
req.State = schema.JobStateCompleted
}
// Mark job as stopped in the database (update state and duration)
job.Duration = int32(req.StopTime - job.StartTime)
job.State = req.State
api.JobRepository.Mutex.Lock()
if err := api.JobRepository.Stop(*job.ID, job.Duration, job.State, job.MonitoringStatus); err != nil {
if err := api.JobRepository.StopCached(*job.ID, job.Duration, job.State, job.MonitoringStatus); err != nil {
api.JobRepository.Mutex.Unlock()
handleError(fmt.Errorf("jobId %d (id %d) on %s : marking job as '%s' (duration: %d) in DB failed: %w", job.JobID, job.ID, job.Cluster, job.State, job.Duration, err), http.StatusInternalServerError, rw)
return
}
}
api.JobRepository.Mutex.Unlock()
log.Printf("archiving job... (dbid: %d): cluster=%s, jobId=%d, user=%s, startTime=%d, duration=%d, state=%s", job.ID, job.Cluster, job.JobID, job.User, job.StartTime, job.Duration, job.State)
// Send a response (with status OK). This means that erros that happen from here on forward
// can *NOT* be communicated to the client. If reading from a MetricDataRepository or
// writing to the filesystem fails, the client will not know.
rw.Header().Add("Content-Type", "application/json")
rw.WriteHeader(http.StatusOK)
json.NewEncoder(rw).Encode(job)
// Monitoring is disabled...
if job.MonitoringStatus == schema.MonitoringStatusDisabled {
return
}
// Trigger async archiving
archiver.TriggerArchiving(job)
}
func (api *RestApi) getJobMetrics(rw http.ResponseWriter, r *http.Request) {
id := mux.Vars(r)["id"]
metrics := r.URL.Query()["metric"]
var scopes []schema.MetricScope
for _, scope := range r.URL.Query()["scope"] {
var s schema.MetricScope
if err := s.UnmarshalGQL(scope); err != nil {
http.Error(rw, err.Error(), http.StatusBadRequest)
return
}
scopes = append(scopes, s)
}
rw.Header().Add("Content-Type", "application/json")
rw.WriteHeader(http.StatusOK)
type Respone struct {
Data *struct {
JobMetrics []*model.JobMetricWithName `json:"jobMetrics"`
} `json:"data"`
Error *struct {
Message string `json:"message"`
} `json:"error"`
}
resolver := graph.GetResolverInstance()
data, err := resolver.Query().JobMetrics(r.Context(), id, metrics, scopes, nil)
if err != nil {
json.NewEncoder(rw).Encode(Respone{
Error: &struct {
Message string "json:\"message\""
}{Message: err.Error()},
})
return
}
json.NewEncoder(rw).Encode(Respone{
Data: &struct {
JobMetrics []*model.JobMetricWithName "json:\"jobMetrics\""
}{JobMetrics: data},
})
}

74
internal/api/node.go Normal file
View File

@ -0,0 +1,74 @@
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
package api
import (
"fmt"
"net/http"
"strings"
"github.com/ClusterCockpit/cc-backend/internal/repository"
"github.com/ClusterCockpit/cc-backend/pkg/schema"
)
type Node struct {
Name string `json:"hostname"`
States []string `json:"states"`
}
// updateNodeStatesRequest model
type UpdateNodeStatesRequest struct {
Nodes []Node `json:"nodes"`
Cluster string `json:"cluster" example:"fritz"`
}
// this routine assumes that only one of them applies per node
func determineState(states []string) schema.NodeState {
for _, state := range states {
switch strings.ToLower(state) {
case "allocated":
return schema.NodeStateAllocated
case "reserved":
return schema.NodeStateReserved
case "idle":
return schema.NodeStateIdle
case "down":
return schema.NodeStateDown
case "mixed":
return schema.NodeStateMixed
}
}
return schema.NodeStateUnknown
}
// updateNodeStates godoc
// @summary Deliver updated Slurm node states
// @tags node
// @description Returns a JSON-encoded list of users.
// @description Required query-parameter defines if all users or only users with additional special roles are returned.
// @produce json
// @param request body UpdateNodeStatesRequest true "Request body containing nodes and their states"
// @success 200 {array} api.SuccessResponse "Success"
// @failure 400 {string} string "Bad Request"
// @failure 401 {string} string "Unauthorized"
// @failure 403 {string} string "Forbidden"
// @failure 500 {string} string "Internal Server Error"
// @security ApiKeyAuth
// @router /api/nodestats/ [post]
func (api *RestApi) updateNodeStates(rw http.ResponseWriter, r *http.Request) {
// Parse request body
req := UpdateNodeStatesRequest{}
if err := decode(r.Body, &req); err != nil {
handleError(fmt.Errorf("parsing request body failed: %w", err), http.StatusBadRequest, rw)
return
}
repo := repository.GetNodeRepository()
for _, node := range req.Nodes {
state := determineState(node.States)
repo.UpdateNodeState(node.Name, req.Cluster, &state)
}
}

File diff suppressed because it is too large Load Diff

159
internal/api/user.go Normal file
View File

@ -0,0 +1,159 @@
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
package api
import (
"encoding/json"
"fmt"
"net/http"
"github.com/ClusterCockpit/cc-backend/internal/repository"
"github.com/ClusterCockpit/cc-backend/pkg/schema"
"github.com/gorilla/mux"
)
type ApiReturnedUser struct {
Username string `json:"username"`
Name string `json:"name"`
Roles []string `json:"roles"`
Email string `json:"email"`
Projects []string `json:"projects"`
}
// getUsers godoc
// @summary Returns a list of users
// @tags User
// @description Returns a JSON-encoded list of users.
// @description Required query-parameter defines if all users or only users with additional special roles are returned.
// @produce json
// @param not-just-user query bool true "If returned list should contain all users or only users with additional special roles"
// @success 200 {array} api.ApiReturnedUser "List of users returned successfully"
// @failure 400 {string} string "Bad Request"
// @failure 401 {string} string "Unauthorized"
// @failure 403 {string} string "Forbidden"
// @failure 500 {string} string "Internal Server Error"
// @security ApiKeyAuth
// @router /api/users/ [get]
func (api *RestApi) getUsers(rw http.ResponseWriter, r *http.Request) {
// SecuredCheck() only worked with TokenAuth: Removed
if user := repository.GetUserFromContext(r.Context()); !user.HasRole(schema.RoleAdmin) {
http.Error(rw, "Only admins are allowed to fetch a list of users", http.StatusForbidden)
return
}
users, err := repository.GetUserRepository().ListUsers(r.URL.Query().Get("not-just-user") == "true")
if err != nil {
http.Error(rw, err.Error(), http.StatusInternalServerError)
return
}
json.NewEncoder(rw).Encode(users)
}
func (api *RestApi) updateUser(rw http.ResponseWriter, r *http.Request) {
// SecuredCheck() only worked with TokenAuth: Removed
if user := repository.GetUserFromContext(r.Context()); !user.HasRole(schema.RoleAdmin) {
http.Error(rw, "Only admins are allowed to update a user", http.StatusForbidden)
return
}
// Get Values
newrole := r.FormValue("add-role")
delrole := r.FormValue("remove-role")
newproj := r.FormValue("add-project")
delproj := r.FormValue("remove-project")
// TODO: Handle anything but roles...
if newrole != "" {
if err := repository.GetUserRepository().AddRole(r.Context(), mux.Vars(r)["id"], newrole); err != nil {
http.Error(rw, err.Error(), http.StatusUnprocessableEntity)
return
}
rw.Write([]byte("Add Role Success"))
} else if delrole != "" {
if err := repository.GetUserRepository().RemoveRole(r.Context(), mux.Vars(r)["id"], delrole); err != nil {
http.Error(rw, err.Error(), http.StatusUnprocessableEntity)
return
}
rw.Write([]byte("Remove Role Success"))
} else if newproj != "" {
if err := repository.GetUserRepository().AddProject(r.Context(), mux.Vars(r)["id"], newproj); err != nil {
http.Error(rw, err.Error(), http.StatusUnprocessableEntity)
return
}
rw.Write([]byte("Add Project Success"))
} else if delproj != "" {
if err := repository.GetUserRepository().RemoveProject(r.Context(), mux.Vars(r)["id"], delproj); err != nil {
http.Error(rw, err.Error(), http.StatusUnprocessableEntity)
return
}
rw.Write([]byte("Remove Project Success"))
} else {
http.Error(rw, "Not Add or Del [role|project]?", http.StatusInternalServerError)
}
}
func (api *RestApi) createUser(rw http.ResponseWriter, r *http.Request) {
// SecuredCheck() only worked with TokenAuth: Removed
rw.Header().Set("Content-Type", "text/plain")
me := repository.GetUserFromContext(r.Context())
if !me.HasRole(schema.RoleAdmin) {
http.Error(rw, "Only admins are allowed to create new users", http.StatusForbidden)
return
}
username, password, role, name, email, project := r.FormValue("username"),
r.FormValue("password"), r.FormValue("role"), r.FormValue("name"),
r.FormValue("email"), r.FormValue("project")
if len(password) == 0 && role != schema.GetRoleString(schema.RoleApi) {
http.Error(rw, "Only API users are allowed to have a blank password (login will be impossible)", http.StatusBadRequest)
return
}
if len(project) != 0 && role != schema.GetRoleString(schema.RoleManager) {
http.Error(rw, "only managers require a project (can be changed later)",
http.StatusBadRequest)
return
} else if len(project) == 0 && role == schema.GetRoleString(schema.RoleManager) {
http.Error(rw, "managers require a project to manage (can be changed later)",
http.StatusBadRequest)
return
}
if err := repository.GetUserRepository().AddUser(&schema.User{
Username: username,
Name: name,
Password: password,
Email: email,
Projects: []string{project},
Roles: []string{role},
}); err != nil {
http.Error(rw, err.Error(), http.StatusUnprocessableEntity)
return
}
fmt.Fprintf(rw, "User %v successfully created!\n", username)
}
func (api *RestApi) deleteUser(rw http.ResponseWriter, r *http.Request) {
// SecuredCheck() only worked with TokenAuth: Removed
if user := repository.GetUserFromContext(r.Context()); !user.HasRole(schema.RoleAdmin) {
http.Error(rw, "Only admins are allowed to delete a user", http.StatusForbidden)
return
}
username := r.FormValue("username")
if err := repository.GetUserRepository().DelUser(username); err != nil {
http.Error(rw, err.Error(), http.StatusUnprocessableEntity)
return
}
rw.WriteHeader(http.StatusOK)
}

View File

@ -41,7 +41,7 @@ func archivingWorker() {
// will fail if job meta not in repository // will fail if job meta not in repository
if _, err := jobRepo.FetchMetadata(job); err != nil { if _, err := jobRepo.FetchMetadata(job); err != nil {
log.Errorf("archiving job (dbid: %d) failed at check metadata step: %s", job.ID, err.Error()) log.Errorf("archiving job (dbid: %d) failed at check metadata step: %s", job.ID, err.Error())
jobRepo.UpdateMonitoringStatus(job.ID, schema.MonitoringStatusArchivingFailed) jobRepo.UpdateMonitoringStatus(*job.ID, schema.MonitoringStatusArchivingFailed)
continue continue
} }
@ -50,7 +50,7 @@ func archivingWorker() {
jobMeta, err := ArchiveJob(job, context.Background()) jobMeta, err := ArchiveJob(job, context.Background())
if err != nil { if err != nil {
log.Errorf("archiving job (dbid: %d) failed at archiving job step: %s", job.ID, err.Error()) log.Errorf("archiving job (dbid: %d) failed at archiving job step: %s", job.ID, err.Error())
jobRepo.UpdateMonitoringStatus(job.ID, schema.MonitoringStatusArchivingFailed) jobRepo.UpdateMonitoringStatus(*job.ID, schema.MonitoringStatusArchivingFailed)
continue continue
} }
@ -72,7 +72,11 @@ func archivingWorker() {
} }
log.Debugf("archiving job %d took %s", job.JobID, time.Since(start)) log.Debugf("archiving job %d took %s", job.JobID, time.Since(start))
log.Printf("archiving job (dbid: %d) successful", job.ID) log.Printf("archiving job (dbid: %d) successful", job.ID)
repository.CallJobStopHooks(job)
archivePending.Done() archivePending.Done()
default:
continue
} }
} }
} }

View File

@ -16,7 +16,7 @@ import (
) )
// Writes a running job to the job-archive // Writes a running job to the job-archive
func ArchiveJob(job *schema.Job, ctx context.Context) (*schema.JobMeta, error) { func ArchiveJob(job *schema.Job, ctx context.Context) (*schema.Job, error) {
allMetrics := make([]string, 0) allMetrics := make([]string, 0)
metricConfigs := archive.GetCluster(job.Cluster).MetricConfig metricConfigs := archive.GetCluster(job.Cluster).MetricConfig
for _, mc := range metricConfigs { for _, mc := range metricConfigs {
@ -40,11 +40,7 @@ func ArchiveJob(job *schema.Job, ctx context.Context) (*schema.JobMeta, error) {
return nil, err return nil, err
} }
jobMeta := &schema.JobMeta{ job.Statistics = make(map[string]schema.JobStatistics)
BaseJob: job.BaseJob,
StartTime: job.StartTime.Unix(),
Statistics: make(map[string]schema.JobStatistics),
}
for metric, data := range jobData { for metric, data := range jobData {
avg, min, max := 0.0, math.MaxFloat32, -math.MaxFloat32 avg, min, max := 0.0, math.MaxFloat32, -math.MaxFloat32
@ -61,7 +57,7 @@ func ArchiveJob(job *schema.Job, ctx context.Context) (*schema.JobMeta, error) {
} }
// Round AVG Result to 2 Digits // Round AVG Result to 2 Digits
jobMeta.Statistics[metric] = schema.JobStatistics{ job.Statistics[metric] = schema.JobStatistics{
Unit: schema.Unit{ Unit: schema.Unit{
Prefix: archive.GetMetricConfig(job.Cluster, metric).Unit.Prefix, Prefix: archive.GetMetricConfig(job.Cluster, metric).Unit.Prefix,
Base: archive.GetMetricConfig(job.Cluster, metric).Unit.Base, Base: archive.GetMetricConfig(job.Cluster, metric).Unit.Base,
@ -76,8 +72,8 @@ func ArchiveJob(job *schema.Job, ctx context.Context) (*schema.JobMeta, error) {
// only return the JobMeta structure as the // only return the JobMeta structure as the
// statistics in there are needed. // statistics in there are needed.
if config.Keys.DisableArchive { if config.Keys.DisableArchive {
return jobMeta, nil return job, nil
} }
return jobMeta, archive.GetHandle().ImportJob(jobMeta, &jobData) return job, archive.GetHandle().ImportJob(job, &jobData)
} }

View File

@ -237,7 +237,7 @@ func (auth *Authentication) Login(
limiter := getIPUserLimiter(ip, username) limiter := getIPUserLimiter(ip, username)
if !limiter.Allow() { if !limiter.Allow() {
log.Warnf("AUTH/RATE > Too many login attempts for combination IP: %s, Username: %s", ip, username) log.Warnf("AUTH/RATE > Too many login attempts for combination IP: %s, Username: %s", ip, username)
onfailure(rw, r, errors.New("Too many login attempts, try again in a few minutes.")) onfailure(rw, r, errors.New("too many login attempts, try again in a few minutes"))
return return
} }

File diff suppressed because it is too large Load Diff

View File

@ -167,12 +167,30 @@ type NamedStatsWithScope struct {
Stats []*ScopedStats `json:"stats"` Stats []*ScopedStats `json:"stats"`
} }
type NodeFilter struct {
Hostname *StringInput `json:"hostname,omitempty"`
Cluster *StringInput `json:"cluster,omitempty"`
SubCluster *StringInput `json:"subCluster,omitempty"`
NodeState *string `json:"nodeState,omitempty"`
HealthState *schema.NodeState `json:"healthState,omitempty"`
}
type NodeMetrics struct { type NodeMetrics struct {
Host string `json:"host"` Host string `json:"host"`
SubCluster string `json:"subCluster"` SubCluster string `json:"subCluster"`
Metrics []*JobMetricWithName `json:"metrics"` Metrics []*JobMetricWithName `json:"metrics"`
} }
type NodeStateResultList struct {
Items []*schema.Node `json:"items"`
Count *int `json:"count,omitempty"`
}
type NodeStats struct {
State string `json:"state"`
Count int `json:"count"`
}
type NodesResultList struct { type NodesResultList struct {
Items []*NodeMetrics `json:"items"` Items []*NodeMetrics `json:"items"`
Offset *int `json:"offset,omitempty"` Offset *int `json:"offset,omitempty"`

View File

@ -29,9 +29,14 @@ func (r *clusterResolver) Partitions(ctx context.Context, obj *schema.Cluster) (
return r.Repo.Partitions(obj.Name) return r.Repo.Partitions(obj.Name)
} }
// StartTime is the resolver for the startTime field.
func (r *jobResolver) StartTime(ctx context.Context, obj *schema.Job) (*time.Time, error) {
panic(fmt.Errorf("not implemented: StartTime - startTime"))
}
// Tags is the resolver for the tags field. // Tags is the resolver for the tags field.
func (r *jobResolver) Tags(ctx context.Context, obj *schema.Job) ([]*schema.Tag, error) { func (r *jobResolver) Tags(ctx context.Context, obj *schema.Job) ([]*schema.Tag, error) {
return r.Repo.GetTags(repository.GetUserFromContext(ctx), &obj.ID) return r.Repo.GetTags(repository.GetUserFromContext(ctx), obj.ID)
} }
// ConcurrentJobs is the resolver for the concurrentJobs field. // ConcurrentJobs is the resolver for the concurrentJobs field.
@ -143,7 +148,7 @@ func (r *mutationResolver) CreateTag(ctx context.Context, typeArg string, name s
return &schema.Tag{ID: id, Type: typeArg, Name: name, Scope: scope}, nil return &schema.Tag{ID: id, Type: typeArg, Name: name, Scope: scope}, nil
} else { } else {
log.Warnf("Not authorized to create tag with scope: %s", scope) log.Warnf("Not authorized to create tag with scope: %s", scope)
return nil, fmt.Errorf("Not authorized to create tag with scope: %s", scope) return nil, fmt.Errorf("not authorized to create tag with scope: %s", scope)
} }
} }
@ -179,7 +184,7 @@ func (r *mutationResolver) AddTagsToJob(ctx context.Context, job string, tagIds
_, _, tscope, exists := r.Repo.TagInfo(tid) _, _, tscope, exists := r.Repo.TagInfo(tid)
if !exists { if !exists {
log.Warnf("Tag does not exist (ID): %d", tid) log.Warnf("Tag does not exist (ID): %d", tid)
return nil, fmt.Errorf("Tag does not exist (ID): %d", tid) return nil, fmt.Errorf("tag does not exist (ID): %d", tid)
} }
// Test Access: Admins && Admin Tag OR Support/Admin and Global Tag OR Everyone && Private Tag // Test Access: Admins && Admin Tag OR Support/Admin and Global Tag OR Everyone && Private Tag
@ -193,7 +198,7 @@ func (r *mutationResolver) AddTagsToJob(ctx context.Context, job string, tagIds
} }
} else { } else {
log.Warnf("Not authorized to add tag: %d", tid) log.Warnf("Not authorized to add tag: %d", tid)
return nil, fmt.Errorf("Not authorized to add tag: %d", tid) return nil, fmt.Errorf("not authorized to add tag: %d", tid)
} }
} }
@ -226,7 +231,7 @@ func (r *mutationResolver) RemoveTagsFromJob(ctx context.Context, job string, ta
_, _, tscope, exists := r.Repo.TagInfo(tid) _, _, tscope, exists := r.Repo.TagInfo(tid)
if !exists { if !exists {
log.Warnf("Tag does not exist (ID): %d", tid) log.Warnf("Tag does not exist (ID): %d", tid)
return nil, fmt.Errorf("Tag does not exist (ID): %d", tid) return nil, fmt.Errorf("tag does not exist (ID): %d", tid)
} }
// Test Access: Admins && Admin Tag OR Support/Admin and Global Tag OR Everyone && Private Tag // Test Access: Admins && Admin Tag OR Support/Admin and Global Tag OR Everyone && Private Tag
@ -240,7 +245,7 @@ func (r *mutationResolver) RemoveTagsFromJob(ctx context.Context, job string, ta
} }
} else { } else {
log.Warnf("Not authorized to remove tag: %d", tid) log.Warnf("Not authorized to remove tag: %d", tid)
return nil, fmt.Errorf("Not authorized to remove tag: %d", tid) return nil, fmt.Errorf("not authorized to remove tag: %d", tid)
} }
} }
@ -269,7 +274,7 @@ func (r *mutationResolver) RemoveTagFromList(ctx context.Context, tagIds []strin
_, _, tscope, exists := r.Repo.TagInfo(tid) _, _, tscope, exists := r.Repo.TagInfo(tid)
if !exists { if !exists {
log.Warnf("Tag does not exist (ID): %d", tid) log.Warnf("Tag does not exist (ID): %d", tid)
return nil, fmt.Errorf("Tag does not exist (ID): %d", tid) return nil, fmt.Errorf("tag does not exist (ID): %d", tid)
} }
// Test Access: Admins && Admin Tag OR Everyone && Private Tag // Test Access: Admins && Admin Tag OR Everyone && Private Tag
@ -283,7 +288,7 @@ func (r *mutationResolver) RemoveTagFromList(ctx context.Context, tagIds []strin
} }
} else { } else {
log.Warnf("Not authorized to remove tag: %d", tid) log.Warnf("Not authorized to remove tag: %d", tid)
return nil, fmt.Errorf("Not authorized to remove tag: %d", tid) return nil, fmt.Errorf("not authorized to remove tag: %d", tid)
} }
} }
return tags, nil return tags, nil
@ -299,6 +304,21 @@ func (r *mutationResolver) UpdateConfiguration(ctx context.Context, name string,
return nil, nil return nil, nil
} }
// NodeState is the resolver for the nodeState field.
func (r *nodeResolver) NodeState(ctx context.Context, obj *schema.Node) (string, error) {
panic(fmt.Errorf("not implemented: NodeState - nodeState"))
}
// HealthState is the resolver for the HealthState field.
func (r *nodeResolver) HealthState(ctx context.Context, obj *schema.Node) (schema.NodeState, error) {
panic(fmt.Errorf("not implemented: HealthState - HealthState"))
}
// MetaData is the resolver for the metaData field.
func (r *nodeResolver) MetaData(ctx context.Context, obj *schema.Node) (any, error) {
panic(fmt.Errorf("not implemented: MetaData - metaData"))
}
// Clusters is the resolver for the clusters field. // Clusters is the resolver for the clusters field.
func (r *queryResolver) Clusters(ctx context.Context) ([]*schema.Cluster, error) { func (r *queryResolver) Clusters(ctx context.Context) ([]*schema.Cluster, error) {
return archive.Clusters, nil return archive.Clusters, nil
@ -338,6 +358,21 @@ func (r *queryResolver) AllocatedNodes(ctx context.Context, cluster string) ([]*
return counts, nil return counts, nil
} }
// Node is the resolver for the node field.
func (r *queryResolver) Node(ctx context.Context, id string) (*schema.Node, error) {
panic(fmt.Errorf("not implemented: Node - node"))
}
// Nodes is the resolver for the nodes field.
func (r *queryResolver) Nodes(ctx context.Context, filter []*model.NodeFilter, order *model.OrderByInput) (*model.NodeStateResultList, error) {
panic(fmt.Errorf("not implemented: Nodes - nodes"))
}
// NodeStats is the resolver for the nodeStats field.
func (r *queryResolver) NodeStats(ctx context.Context, filter []*model.NodeFilter) ([]*model.NodeStats, error) {
panic(fmt.Errorf("not implemented: NodeStats - nodeStats"))
}
// Job is the resolver for the job field. // Job is the resolver for the job field.
func (r *queryResolver) Job(ctx context.Context, id string) (*schema.Job, error) { func (r *queryResolver) Job(ctx context.Context, id string) (*schema.Job, error) {
numericId, err := strconv.ParseInt(id, 10, 64) numericId, err := strconv.ParseInt(id, 10, 64)
@ -499,10 +534,7 @@ func (r *queryResolver) Jobs(ctx context.Context, filter []*model.JobFilter, pag
return nil, err return nil, err
} }
hasNextPage := false hasNextPage := len(nextJobs) == 1
if len(nextJobs) == 1 {
hasNextPage = true
}
return &model.JobResultList{Items: jobs, Count: &count, HasNextPage: &hasNextPage}, nil return &model.JobResultList{Items: jobs, Count: &count, HasNextPage: &hasNextPage}, nil
} }
@ -513,8 +545,8 @@ func (r *queryResolver) JobsStatistics(ctx context.Context, filter []*model.JobF
var stats []*model.JobsStatistics var stats []*model.JobsStatistics
// Top Level Defaults // Top Level Defaults
var defaultDurationBins string = "1h" defaultDurationBins := "1h"
var defaultMetricBins int = 10 defaultMetricBins := 10
if requireField(ctx, "totalJobs") || requireField(ctx, "totalWalltime") || requireField(ctx, "totalNodes") || requireField(ctx, "totalCores") || if requireField(ctx, "totalJobs") || requireField(ctx, "totalWalltime") || requireField(ctx, "totalNodes") || requireField(ctx, "totalCores") ||
requireField(ctx, "totalAccs") || requireField(ctx, "totalNodeHours") || requireField(ctx, "totalCoreHours") || requireField(ctx, "totalAccHours") { requireField(ctx, "totalAccs") || requireField(ctx, "totalNodeHours") || requireField(ctx, "totalCoreHours") || requireField(ctx, "totalAccHours") {
@ -618,9 +650,9 @@ func (r *queryResolver) JobsMetricStats(ctx context.Context, filter []*model.Job
numThreadsInt := int(job.NumHWThreads) numThreadsInt := int(job.NumHWThreads)
numAccsInt := int(job.NumAcc) numAccsInt := int(job.NumAcc)
res = append(res, &model.JobStats{ res = append(res, &model.JobStats{
ID: int(job.ID), ID: int(*job.ID),
JobID: strconv.Itoa(int(job.JobID)), JobID: strconv.Itoa(int(job.JobID)),
StartTime: int(job.StartTime.Unix()), StartTime: int(job.StartTime),
Duration: int(job.Duration), Duration: int(job.Duration),
Cluster: job.Cluster, Cluster: job.Cluster,
SubCluster: job.SubCluster, SubCluster: job.SubCluster,
@ -773,6 +805,9 @@ func (r *Resolver) MetricValue() generated.MetricValueResolver { return &metricV
// Mutation returns generated.MutationResolver implementation. // Mutation returns generated.MutationResolver implementation.
func (r *Resolver) Mutation() generated.MutationResolver { return &mutationResolver{r} } func (r *Resolver) Mutation() generated.MutationResolver { return &mutationResolver{r} }
// Node returns generated.NodeResolver implementation.
func (r *Resolver) Node() generated.NodeResolver { return &nodeResolver{r} }
// Query returns generated.QueryResolver implementation. // Query returns generated.QueryResolver implementation.
func (r *Resolver) Query() generated.QueryResolver { return &queryResolver{r} } func (r *Resolver) Query() generated.QueryResolver { return &queryResolver{r} }
@ -783,5 +818,6 @@ type clusterResolver struct{ *Resolver }
type jobResolver struct{ *Resolver } type jobResolver struct{ *Resolver }
type metricValueResolver struct{ *Resolver } type metricValueResolver struct{ *Resolver }
type mutationResolver struct{ *Resolver } type mutationResolver struct{ *Resolver }
type nodeResolver struct{ *Resolver }
type queryResolver struct{ *Resolver } type queryResolver struct{ *Resolver }
type subClusterResolver struct{ *Resolver } type subClusterResolver struct{ *Resolver }

View File

@ -42,7 +42,10 @@ func HandleImportFlag(flag string) error {
} }
dec := json.NewDecoder(bytes.NewReader(raw)) dec := json.NewDecoder(bytes.NewReader(raw))
dec.DisallowUnknownFields() dec.DisallowUnknownFields()
job := schema.JobMeta{BaseJob: schema.JobDefaults} job := schema.Job{
Exclusive: 1,
MonitoringStatus: schema.MonitoringStatusRunningOrArchiving,
}
if err = dec.Decode(&job); err != nil { if err = dec.Decode(&job); err != nil {
log.Warn("Error while decoding raw json metadata for import") log.Warn("Error while decoding raw json metadata for import")
return err return err
@ -141,7 +144,7 @@ func HandleImportFlag(flag string) error {
return err return err
} }
if err = SanityChecks(&job.BaseJob); err != nil { if err = SanityChecks(&job); err != nil {
log.Warn("BaseJob SanityChecks failed") log.Warn("BaseJob SanityChecks failed")
return err return err
} }

View File

@ -166,7 +166,7 @@ func TestHandleImportFlag(t *testing.T) {
} }
result := readResult(t, testname) result := readResult(t, testname)
job, err := r.Find(&result.JobId, &result.Cluster, &result.StartTime) job, err := r.FindCached(&result.JobId, &result.Cluster, &result.StartTime)
if err != nil { if err != nil {
t.Fatal(err) t.Fatal(err)
} }

View File

@ -60,11 +60,6 @@ func InitDB() error {
} }
jobMeta.MonitoringStatus = schema.MonitoringStatusArchivingSuccessful jobMeta.MonitoringStatus = schema.MonitoringStatusArchivingSuccessful
job := schema.Job{
BaseJob: jobMeta.BaseJob,
StartTime: time.Unix(jobMeta.StartTime, 0),
StartTimeUnix: jobMeta.StartTime,
}
sc, err := archive.GetSubCluster(jobMeta.Cluster, jobMeta.SubCluster) sc, err := archive.GetSubCluster(jobMeta.Cluster, jobMeta.SubCluster)
if err != nil { if err != nil {
@ -72,7 +67,7 @@ func InitDB() error {
return err return err
} }
job.Footprint = make(map[string]float64) jobMeta.Footprint = make(map[string]float64)
for _, fp := range sc.Footprint { for _, fp := range sc.Footprint {
statType := "avg" statType := "avg"
@ -83,16 +78,16 @@ func InitDB() error {
name := fmt.Sprintf("%s_%s", fp, statType) name := fmt.Sprintf("%s_%s", fp, statType)
job.Footprint[name] = repository.LoadJobStat(jobMeta, fp, statType) jobMeta.Footprint[name] = repository.LoadJobStat(jobMeta, fp, statType)
} }
job.RawFootprint, err = json.Marshal(job.Footprint) jobMeta.RawFootprint, err = json.Marshal(jobMeta.Footprint)
if err != nil { if err != nil {
log.Warn("Error while marshaling job footprint") log.Warn("Error while marshaling job footprint")
return err return err
} }
job.EnergyFootprint = make(map[string]float64) jobMeta.EnergyFootprint = make(map[string]float64)
// Total Job Energy Outside Loop // Total Job Energy Outside Loop
totalEnergy := 0.0 totalEnergy := 0.0
@ -117,45 +112,45 @@ func InitDB() error {
log.Warnf("Error while collecting energy metric %s for job, DB ID '%v', return '0.0'", fp, jobMeta.ID) log.Warnf("Error while collecting energy metric %s for job, DB ID '%v', return '0.0'", fp, jobMeta.ID)
} }
job.EnergyFootprint[fp] = metricEnergy jobMeta.EnergyFootprint[fp] = metricEnergy
totalEnergy += metricEnergy totalEnergy += metricEnergy
} }
job.Energy = (math.Round(totalEnergy*100.0) / 100.0) jobMeta.Energy = (math.Round(totalEnergy*100.0) / 100.0)
if job.RawEnergyFootprint, err = json.Marshal(job.EnergyFootprint); err != nil { if jobMeta.RawEnergyFootprint, err = json.Marshal(jobMeta.EnergyFootprint); err != nil {
log.Warnf("Error while marshaling energy footprint for job INTO BYTES, DB ID '%v'", jobMeta.ID) log.Warnf("Error while marshaling energy footprint for job INTO BYTES, DB ID '%v'", jobMeta.ID)
return err return err
} }
job.RawResources, err = json.Marshal(job.Resources) jobMeta.RawResources, err = json.Marshal(jobMeta.Resources)
if err != nil { if err != nil {
log.Errorf("repository initDB(): %v", err) log.Errorf("repository initDB(): %v", err)
errorOccured++ errorOccured++
continue continue
} }
job.RawMetaData, err = json.Marshal(job.MetaData) jobMeta.RawMetaData, err = json.Marshal(jobMeta.MetaData)
if err != nil { if err != nil {
log.Errorf("repository initDB(): %v", err) log.Errorf("repository initDB(): %v", err)
errorOccured++ errorOccured++
continue continue
} }
if err := SanityChecks(&job.BaseJob); err != nil { if err := SanityChecks(jobMeta); err != nil {
log.Errorf("repository initDB(): %v", err) log.Errorf("repository initDB(): %v", err)
errorOccured++ errorOccured++
continue continue
} }
id, err := r.TransactionAddNamed(t, id, err := r.TransactionAddNamed(t,
repository.NamedJobInsert, job) repository.NamedJobInsert, jobMeta)
if err != nil { if err != nil {
log.Errorf("repository initDB(): %v", err) log.Errorf("repository initDB(): %v", err)
errorOccured++ errorOccured++
continue continue
} }
for _, tag := range job.Tags { for _, tag := range jobMeta.Tags {
tagstr := tag.Name + ":" + tag.Type tagstr := tag.Name + ":" + tag.Type
tagId, ok := tags[tagstr] tagId, ok := tags[tagstr]
if !ok { if !ok {
@ -190,7 +185,7 @@ func InitDB() error {
} }
// This function also sets the subcluster if necessary! // This function also sets the subcluster if necessary!
func SanityChecks(job *schema.BaseJob) error { func SanityChecks(job *schema.Job) error {
if c := archive.GetCluster(job.Cluster); c == nil { if c := archive.GetCluster(job.Cluster); c == nil {
return fmt.Errorf("no such cluster: %v", job.Cluster) return fmt.Errorf("no such cluster: %v", job.Cluster)
} }

View File

@ -183,8 +183,8 @@ func (ccms *CCMetricStore) LoadData(
req := ApiQueryRequest{ req := ApiQueryRequest{
Cluster: job.Cluster, Cluster: job.Cluster,
From: job.StartTime.Unix(), From: job.StartTime,
To: job.StartTime.Add(time.Duration(job.Duration) * time.Second).Unix(), To: job.StartTime + int64(job.Duration),
Queries: queries, Queries: queries,
WithStats: true, WithStats: true,
WithData: true, WithData: true,
@ -570,7 +570,6 @@ func (ccms *CCMetricStore) LoadStats(
metrics []string, metrics []string,
ctx context.Context, ctx context.Context,
) (map[string]map[string]schema.MetricStatistics, error) { ) (map[string]map[string]schema.MetricStatistics, error) {
queries, _, err := ccms.buildQueries(job, metrics, []schema.MetricScope{schema.MetricScopeNode}, 0) // #166 Add scope shere for analysis view accelerator normalization? queries, _, err := ccms.buildQueries(job, metrics, []schema.MetricScope{schema.MetricScopeNode}, 0) // #166 Add scope shere for analysis view accelerator normalization?
if err != nil { if err != nil {
log.Errorf("Error while building queries for jobId %d, Metrics %v: %s", job.JobID, metrics, err.Error()) log.Errorf("Error while building queries for jobId %d, Metrics %v: %s", job.JobID, metrics, err.Error())
@ -579,8 +578,8 @@ func (ccms *CCMetricStore) LoadStats(
req := ApiQueryRequest{ req := ApiQueryRequest{
Cluster: job.Cluster, Cluster: job.Cluster,
From: job.StartTime.Unix(), From: job.StartTime,
To: job.StartTime.Add(time.Duration(job.Duration) * time.Second).Unix(), To: job.StartTime + int64(job.Duration),
Queries: queries, Queries: queries,
WithStats: true, WithStats: true,
WithData: false, WithData: false,
@ -638,8 +637,8 @@ func (ccms *CCMetricStore) LoadScopedStats(
req := ApiQueryRequest{ req := ApiQueryRequest{
Cluster: job.Cluster, Cluster: job.Cluster,
From: job.StartTime.Unix(), From: job.StartTime,
To: job.StartTime.Add(time.Duration(job.Duration) * time.Second).Unix(), To: job.StartTime + int64(job.Duration),
Queries: queries, Queries: queries,
WithStats: true, WithStats: true,
WithData: false, WithData: false,
@ -816,7 +815,6 @@ func (ccms *CCMetricStore) LoadNodeListData(
page *model.PageRequest, page *model.PageRequest,
ctx context.Context, ctx context.Context,
) (map[string]schema.JobData, int, bool, error) { ) (map[string]schema.JobData, int, bool, error) {
// 0) Init additional vars // 0) Init additional vars
var totalNodes int = 0 var totalNodes int = 0
var hasNextPage bool = false var hasNextPage bool = false
@ -975,7 +973,6 @@ func (ccms *CCMetricStore) buildNodeQueries(
scopes []schema.MetricScope, scopes []schema.MetricScope,
resolution int, resolution int,
) ([]ApiQuery, []schema.MetricScope, error) { ) ([]ApiQuery, []schema.MetricScope, error) {
queries := make([]ApiQuery, 0, len(metrics)*len(scopes)*len(nodes)) queries := make([]ApiQuery, 0, len(metrics)*len(scopes)*len(nodes))
assignedScope := []schema.MetricScope{} assignedScope := []schema.MetricScope{}

View File

@ -1,575 +0,0 @@
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
package metricdata
import (
"context"
"crypto/tls"
"encoding/json"
"errors"
"fmt"
"math"
"sort"
"strings"
"time"
"github.com/ClusterCockpit/cc-backend/internal/graph/model"
"github.com/ClusterCockpit/cc-backend/pkg/archive"
"github.com/ClusterCockpit/cc-backend/pkg/log"
"github.com/ClusterCockpit/cc-backend/pkg/schema"
influxdb2 "github.com/influxdata/influxdb-client-go/v2"
influxdb2Api "github.com/influxdata/influxdb-client-go/v2/api"
)
type InfluxDBv2DataRepositoryConfig struct {
Url string `json:"url"`
Token string `json:"token"`
Bucket string `json:"bucket"`
Org string `json:"org"`
SkipTls bool `json:"skiptls"`
}
type InfluxDBv2DataRepository struct {
client influxdb2.Client
queryClient influxdb2Api.QueryAPI
bucket, measurement string
}
func (idb *InfluxDBv2DataRepository) Init(rawConfig json.RawMessage) error {
var config InfluxDBv2DataRepositoryConfig
if err := json.Unmarshal(rawConfig, &config); err != nil {
log.Warn("Error while unmarshaling raw json config")
return err
}
idb.client = influxdb2.NewClientWithOptions(config.Url, config.Token, influxdb2.DefaultOptions().SetTLSConfig(&tls.Config{InsecureSkipVerify: config.SkipTls}))
idb.queryClient = idb.client.QueryAPI(config.Org)
idb.bucket = config.Bucket
return nil
}
func (idb *InfluxDBv2DataRepository) formatTime(t time.Time) string {
return t.Format(time.RFC3339) // Like “2006-01-02T15:04:05Z07:00”
}
func (idb *InfluxDBv2DataRepository) epochToTime(epoch int64) time.Time {
return time.Unix(epoch, 0)
}
func (idb *InfluxDBv2DataRepository) LoadData(
job *schema.Job,
metrics []string,
scopes []schema.MetricScope,
ctx context.Context,
resolution int) (schema.JobData, error) {
log.Infof("InfluxDB 2 Backend: Resolution Scaling not Implemented, will return default timestep. Requested Resolution %d", resolution)
measurementsConds := make([]string, 0, len(metrics))
for _, m := range metrics {
measurementsConds = append(measurementsConds, fmt.Sprintf(`r["_measurement"] == "%s"`, m))
}
measurementsCond := strings.Join(measurementsConds, " or ")
hostsConds := make([]string, 0, len(job.Resources))
for _, h := range job.Resources {
if h.HWThreads != nil || h.Accelerators != nil {
// TODO
return nil, errors.New("METRICDATA/INFLUXV2 > the InfluxDB metric data repository does not yet support HWThreads or Accelerators")
}
hostsConds = append(hostsConds, fmt.Sprintf(`r["hostname"] == "%s"`, h.Hostname))
}
hostsCond := strings.Join(hostsConds, " or ")
jobData := make(schema.JobData) // Empty Schema: map[<string>FIELD]map[<MetricScope>SCOPE]<*JobMetric>METRIC
// Requested Scopes
for _, scope := range scopes {
query := ""
switch scope {
case "node":
// Get Finest Granularity, Groupy By Measurement and Hostname (== Metric / Node), Calculate Mean for 60s windows <-- Resolution could be added here?
// log.Info("Scope 'node' requested. ")
query = fmt.Sprintf(`
from(bucket: "%s")
|> range(start: %s, stop: %s)
|> filter(fn: (r) => (%s) and (%s) )
|> drop(columns: ["_start", "_stop"])
|> group(columns: ["hostname", "_measurement"])
|> aggregateWindow(every: 60s, fn: mean)
|> drop(columns: ["_time"])`,
idb.bucket,
idb.formatTime(job.StartTime), idb.formatTime(idb.epochToTime(job.StartTimeUnix+int64(job.Duration)+int64(1))),
measurementsCond, hostsCond)
case "socket":
log.Info("Scope 'socket' requested, but not yet supported: Will return 'node' scope only. ")
continue
case "core":
log.Info(" Scope 'core' requested, but not yet supported: Will return 'node' scope only. ")
continue
// Get Finest Granularity only, Set NULL to 0.0
// query = fmt.Sprintf(`
// from(bucket: "%s")
// |> range(start: %s, stop: %s)
// |> filter(fn: (r) => %s )
// |> filter(fn: (r) => %s )
// |> drop(columns: ["_start", "_stop", "cluster"])
// |> map(fn: (r) => (if exists r._value then {r with _value: r._value} else {r with _value: 0.0}))`,
// idb.bucket,
// idb.formatTime(job.StartTime), idb.formatTime(idb.epochToTime(job.StartTimeUnix + int64(job.Duration) + int64(1) )),
// measurementsCond, hostsCond)
case "hwthread":
log.Info(" Scope 'hwthread' requested, but not yet supported: Will return 'node' scope only. ")
continue
case "accelerator":
log.Info(" Scope 'accelerator' requested, but not yet supported: Will return 'node' scope only. ")
continue
default:
log.Infof("Unknown scope '%s' requested: Will return 'node' scope.", scope)
continue
// return nil, errors.New("METRICDATA/INFLUXV2 > the InfluxDB metric data repository does not yet support other scopes than 'node'")
}
rows, err := idb.queryClient.Query(ctx, query)
if err != nil {
log.Error("Error while performing query")
return nil, err
}
// Init Metrics: Only Node level now -> TODO: Matching /check on scope level ...
for _, metric := range metrics {
jobMetric, ok := jobData[metric]
if !ok {
mc := archive.GetMetricConfig(job.Cluster, metric)
jobMetric = map[schema.MetricScope]*schema.JobMetric{
scope: { // uses scope var from above!
Unit: mc.Unit,
Timestep: mc.Timestep,
Series: make([]schema.Series, 0, len(job.Resources)),
StatisticsSeries: nil, // Should be: &schema.StatsSeries{},
},
}
}
jobData[metric] = jobMetric
}
// Process Result: Time-Data
field, host, hostSeries := "", "", schema.Series{}
// typeId := 0
switch scope {
case "node":
for rows.Next() {
row := rows.Record()
if host == "" || host != row.ValueByKey("hostname").(string) || rows.TableChanged() {
if host != "" {
// Append Series before reset
jobData[field][scope].Series = append(jobData[field][scope].Series, hostSeries)
}
field, host = row.Measurement(), row.ValueByKey("hostname").(string)
hostSeries = schema.Series{
Hostname: host,
Statistics: schema.MetricStatistics{}, //TODO Add Statistics
Data: make([]schema.Float, 0),
}
}
val, ok := row.Value().(float64)
if ok {
hostSeries.Data = append(hostSeries.Data, schema.Float(val))
} else {
hostSeries.Data = append(hostSeries.Data, schema.Float(0))
}
}
case "socket":
continue
case "accelerator":
continue
case "hwthread":
// See below @ core
continue
case "core":
continue
// Include Series.Id in hostSeries
// for rows.Next() {
// row := rows.Record()
// if ( host == "" || host != row.ValueByKey("hostname").(string) || typeId != row.ValueByKey("type-id").(int) || rows.TableChanged() ) {
// if ( host != "" ) {
// // Append Series before reset
// jobData[field][scope].Series = append(jobData[field][scope].Series, hostSeries)
// }
// field, host, typeId = row.Measurement(), row.ValueByKey("hostname").(string), row.ValueByKey("type-id").(int)
// hostSeries = schema.Series{
// Hostname: host,
// Id: &typeId,
// Statistics: nil,
// Data: make([]schema.Float, 0),
// }
// }
// val := row.Value().(float64)
// hostSeries.Data = append(hostSeries.Data, schema.Float(val))
// }
default:
log.Infof("Unknown scope '%s' requested: Will return 'node' scope.", scope)
continue
// return nil, errors.New("the InfluxDB metric data repository does not yet support other scopes than 'node, core'")
}
// Append last Series
jobData[field][scope].Series = append(jobData[field][scope].Series, hostSeries)
}
// Get Stats
stats, err := idb.LoadStats(job, metrics, ctx)
if err != nil {
log.Warn("Error while loading statistics")
return nil, err
}
for _, scope := range scopes {
if scope == "node" { // No 'socket/core' support yet
for metric, nodes := range stats {
for node, stats := range nodes {
for index, _ := range jobData[metric][scope].Series {
if jobData[metric][scope].Series[index].Hostname == node {
jobData[metric][scope].Series[index].Statistics = schema.MetricStatistics{Avg: stats.Avg, Min: stats.Min, Max: stats.Max}
}
}
}
}
}
}
return jobData, nil
}
func (idb *InfluxDBv2DataRepository) LoadStats(
job *schema.Job,
metrics []string,
ctx context.Context) (map[string]map[string]schema.MetricStatistics, error) {
stats := map[string]map[string]schema.MetricStatistics{}
hostsConds := make([]string, 0, len(job.Resources))
for _, h := range job.Resources {
if h.HWThreads != nil || h.Accelerators != nil {
// TODO
return nil, errors.New("METRICDATA/INFLUXV2 > the InfluxDB metric data repository does not yet support HWThreads or Accelerators")
}
hostsConds = append(hostsConds, fmt.Sprintf(`r["hostname"] == "%s"`, h.Hostname))
}
hostsCond := strings.Join(hostsConds, " or ")
// lenMet := len(metrics)
for _, metric := range metrics {
// log.Debugf("<< You are here: %s (Index %d of %d metrics)", metric, index, lenMet)
query := fmt.Sprintf(`
data = from(bucket: "%s")
|> range(start: %s, stop: %s)
|> filter(fn: (r) => r._measurement == "%s" and r._field == "value" and (%s))
union(tables: [data |> mean(column: "_value") |> set(key: "_field", value: "avg"),
data |> min(column: "_value") |> set(key: "_field", value: "min"),
data |> max(column: "_value") |> set(key: "_field", value: "max")])
|> pivot(rowKey: ["hostname"], columnKey: ["_field"], valueColumn: "_value")
|> group()`,
idb.bucket,
idb.formatTime(job.StartTime), idb.formatTime(idb.epochToTime(job.StartTimeUnix+int64(job.Duration)+int64(1))),
metric, hostsCond)
rows, err := idb.queryClient.Query(ctx, query)
if err != nil {
log.Error("Error while performing query")
return nil, err
}
nodes := map[string]schema.MetricStatistics{}
for rows.Next() {
row := rows.Record()
host := row.ValueByKey("hostname").(string)
avg, avgok := row.ValueByKey("avg").(float64)
if !avgok {
// log.Debugf(">> Assertion error for metric %s, statistic AVG. Expected 'float64', got %v", metric, avg)
avg = 0.0
}
min, minok := row.ValueByKey("min").(float64)
if !minok {
// log.Debugf(">> Assertion error for metric %s, statistic MIN. Expected 'float64', got %v", metric, min)
min = 0.0
}
max, maxok := row.ValueByKey("max").(float64)
if !maxok {
// log.Debugf(">> Assertion error for metric %s, statistic MAX. Expected 'float64', got %v", metric, max)
max = 0.0
}
nodes[host] = schema.MetricStatistics{
Avg: avg,
Min: min,
Max: max,
}
}
stats[metric] = nodes
}
return stats, nil
}
// Used in Job-View StatsTable
// UNTESTED
func (idb *InfluxDBv2DataRepository) LoadScopedStats(
job *schema.Job,
metrics []string,
scopes []schema.MetricScope,
ctx context.Context) (schema.ScopedJobStats, error) {
// Assumption: idb.loadData() only returns series node-scope - use node scope for statsTable
scopedJobStats := make(schema.ScopedJobStats)
data, err := idb.LoadData(job, metrics, []schema.MetricScope{schema.MetricScopeNode}, ctx, 0 /*resolution here*/)
if err != nil {
log.Warn("Error while loading job for scopedJobStats")
return nil, err
}
for metric, metricData := range data {
for _, scope := range scopes {
if scope != schema.MetricScopeNode {
logOnce.Do(func() {
log.Infof("Note: Scope '%s' requested, but not yet supported: Will return 'node' scope only.", scope)
})
continue
}
if _, ok := scopedJobStats[metric]; !ok {
scopedJobStats[metric] = make(map[schema.MetricScope][]*schema.ScopedStats)
}
if _, ok := scopedJobStats[metric][scope]; !ok {
scopedJobStats[metric][scope] = make([]*schema.ScopedStats, 0)
}
for _, series := range metricData[scope].Series {
scopedJobStats[metric][scope] = append(scopedJobStats[metric][scope], &schema.ScopedStats{
Hostname: series.Hostname,
Data: &series.Statistics,
})
}
}
}
return scopedJobStats, nil
}
// Used in Systems-View @ Node-Overview
// UNTESTED
func (idb *InfluxDBv2DataRepository) LoadNodeData(
cluster string,
metrics, nodes []string,
scopes []schema.MetricScope,
from, to time.Time,
ctx context.Context) (map[string]map[string][]*schema.JobMetric, error) {
// Note: scopes[] Array will be ignored, only return node scope
// CONVERT ARGS TO INFLUX
measurementsConds := make([]string, 0)
for _, m := range metrics {
measurementsConds = append(measurementsConds, fmt.Sprintf(`r["_measurement"] == "%s"`, m))
}
measurementsCond := strings.Join(measurementsConds, " or ")
hostsConds := make([]string, 0)
if nodes == nil {
var allNodes []string
subClusterNodeLists := archive.NodeLists[cluster]
for _, nodeList := range subClusterNodeLists {
allNodes = append(nodes, nodeList.PrintList()...)
}
for _, node := range allNodes {
nodes = append(nodes, node)
hostsConds = append(hostsConds, fmt.Sprintf(`r["hostname"] == "%s"`, node))
}
} else {
for _, node := range nodes {
hostsConds = append(hostsConds, fmt.Sprintf(`r["hostname"] == "%s"`, node))
}
}
hostsCond := strings.Join(hostsConds, " or ")
// BUILD AND PERFORM QUERY
query := fmt.Sprintf(`
from(bucket: "%s")
|> range(start: %s, stop: %s)
|> filter(fn: (r) => (%s) and (%s) )
|> drop(columns: ["_start", "_stop"])
|> group(columns: ["hostname", "_measurement"])
|> aggregateWindow(every: 60s, fn: mean)
|> drop(columns: ["_time"])`,
idb.bucket,
idb.formatTime(from), idb.formatTime(to),
measurementsCond, hostsCond)
rows, err := idb.queryClient.Query(ctx, query)
if err != nil {
log.Error("Error while performing query")
return nil, err
}
// HANDLE QUERY RETURN
// Collect Float Arrays for Node@Metric -> No Scope Handling!
influxData := make(map[string]map[string][]schema.Float)
for rows.Next() {
row := rows.Record()
host, field := row.ValueByKey("hostname").(string), row.Measurement()
influxHostData, ok := influxData[host]
if !ok {
influxHostData = make(map[string][]schema.Float)
influxData[host] = influxHostData
}
influxFieldData, ok := influxData[host][field]
if !ok {
influxFieldData = make([]schema.Float, 0)
influxData[host][field] = influxFieldData
}
val, ok := row.Value().(float64)
if ok {
influxData[host][field] = append(influxData[host][field], schema.Float(val))
} else {
influxData[host][field] = append(influxData[host][field], schema.Float(0))
}
}
// BUILD FUNCTION RETURN
data := make(map[string]map[string][]*schema.JobMetric)
for node, metricData := range influxData {
nodeData, ok := data[node]
if !ok {
nodeData = make(map[string][]*schema.JobMetric)
data[node] = nodeData
}
for metric, floatArray := range metricData {
avg, min, max := 0.0, 0.0, 0.0
for _, val := range floatArray {
avg += float64(val)
min = math.Min(min, float64(val))
max = math.Max(max, float64(val))
}
stats := schema.MetricStatistics{
Avg: (math.Round((avg/float64(len(floatArray)))*100) / 100),
Min: (math.Round(min*100) / 100),
Max: (math.Round(max*100) / 100),
}
mc := archive.GetMetricConfig(cluster, metric)
nodeData[metric] = append(nodeData[metric], &schema.JobMetric{
Unit: mc.Unit,
Timestep: mc.Timestep,
Series: []schema.Series{
{
Hostname: node,
Statistics: stats,
Data: floatArray,
},
},
})
}
}
return data, nil
}
// Used in Systems-View @ Node-List
// UNTESTED
func (idb *InfluxDBv2DataRepository) LoadNodeListData(
cluster, subCluster, nodeFilter string,
metrics []string,
scopes []schema.MetricScope,
resolution int,
from, to time.Time,
page *model.PageRequest,
ctx context.Context,
) (map[string]schema.JobData, int, bool, error) {
// Assumption: idb.loadData() only returns series node-scope - use node scope for NodeList
// 0) Init additional vars
var totalNodes int = 0
var hasNextPage bool = false
// 1) Get list of all nodes
var nodes []string
if subCluster != "" {
scNodes := archive.NodeLists[cluster][subCluster]
nodes = scNodes.PrintList()
} else {
subClusterNodeLists := archive.NodeLists[cluster]
for _, nodeList := range subClusterNodeLists {
nodes = append(nodes, nodeList.PrintList()...)
}
}
// 2) Filter nodes
if nodeFilter != "" {
filteredNodes := []string{}
for _, node := range nodes {
if strings.Contains(node, nodeFilter) {
filteredNodes = append(filteredNodes, node)
}
}
nodes = filteredNodes
}
// 2.1) Count total nodes && Sort nodes -> Sorting invalidated after return ...
totalNodes = len(nodes)
sort.Strings(nodes)
// 3) Apply paging
if len(nodes) > page.ItemsPerPage {
start := (page.Page - 1) * page.ItemsPerPage
end := start + page.ItemsPerPage
if end > len(nodes) {
end = len(nodes)
hasNextPage = false
} else {
hasNextPage = true
}
nodes = nodes[start:end]
}
// 4) Fetch And Convert Data, use idb.LoadNodeData() for query
rawNodeData, err := idb.LoadNodeData(cluster, metrics, nodes, scopes, from, to, ctx)
if err != nil {
log.Error(fmt.Sprintf("Error while loading influx nodeData for nodeListData %#v\n", err))
return nil, totalNodes, hasNextPage, err
}
data := make(map[string]schema.JobData)
for node, nodeData := range rawNodeData {
// Init Nested Map Data Structures If Not Found
hostData, ok := data[node]
if !ok {
hostData = make(schema.JobData)
data[node] = hostData
}
for metric, nodeMetricData := range nodeData {
metricData, ok := hostData[metric]
if !ok {
metricData = make(map[schema.MetricScope]*schema.JobMetric)
data[node][metric] = metricData
}
data[node][metric][schema.MetricScopeNode] = nodeMetricData[0] // Only Node Scope Returned from loadNodeData
}
}
return data, totalNodes, hasNextPage, nil
}

View File

@ -54,8 +54,6 @@ func Init() error {
switch kind.Kind { switch kind.Kind {
case "cc-metric-store": case "cc-metric-store":
mdr = &CCMetricStore{} mdr = &CCMetricStore{}
case "influxdb":
mdr = &InfluxDBv2DataRepository{}
case "prometheus": case "prometheus":
mdr = &PrometheusDataRepository{} mdr = &PrometheusDataRepository{}
case "test": case "test":

View File

@ -279,8 +279,8 @@ func (pdb *PrometheusDataRepository) LoadData(
for i, resource := range job.Resources { for i, resource := range job.Resources {
nodes[i] = resource.Hostname nodes[i] = resource.Hostname
} }
from := job.StartTime from := time.Unix(job.StartTime, 0)
to := job.StartTime.Add(time.Duration(job.Duration) * time.Second) to := time.Unix(job.StartTime+int64(job.Duration), 0)
for _, scope := range scopes { for _, scope := range scopes {
if scope != schema.MetricScopeNode { if scope != schema.MetricScopeNode {
@ -453,8 +453,8 @@ func (pdb *PrometheusDataRepository) LoadScopedStats(
job *schema.Job, job *schema.Job,
metrics []string, metrics []string,
scopes []schema.MetricScope, scopes []schema.MetricScope,
ctx context.Context) (schema.ScopedJobStats, error) { ctx context.Context,
) (schema.ScopedJobStats, error) {
// Assumption: pdb.loadData() only returns series node-scope - use node scope for statsTable // Assumption: pdb.loadData() only returns series node-scope - use node scope for statsTable
scopedJobStats := make(schema.ScopedJobStats) scopedJobStats := make(schema.ScopedJobStats)
data, err := pdb.LoadData(job, metrics, []schema.MetricScope{schema.MetricScopeNode}, ctx, 0 /*resolution here*/) data, err := pdb.LoadData(job, metrics, []schema.MetricScope{schema.MetricScopeNode}, ctx, 0 /*resolution here*/)
@ -502,7 +502,6 @@ func (pdb *PrometheusDataRepository) LoadNodeListData(
page *model.PageRequest, page *model.PageRequest,
ctx context.Context, ctx context.Context,
) (map[string]schema.JobData, int, bool, error) { ) (map[string]schema.JobData, int, bool, error) {
// Assumption: pdb.loadData() only returns series node-scope - use node scope for NodeList // Assumption: pdb.loadData() only returns series node-scope - use node scope for NodeList
// 0) Init additional vars // 0) Init additional vars

View File

@ -9,12 +9,12 @@ import (
"encoding/json" "encoding/json"
"errors" "errors"
"fmt" "fmt"
"maps"
"math" "math"
"strconv" "strconv"
"sync" "sync"
"time" "time"
"github.com/ClusterCockpit/cc-backend/internal/graph/model"
"github.com/ClusterCockpit/cc-backend/pkg/archive" "github.com/ClusterCockpit/cc-backend/pkg/archive"
"github.com/ClusterCockpit/cc-backend/pkg/log" "github.com/ClusterCockpit/cc-backend/pkg/log"
"github.com/ClusterCockpit/cc-backend/pkg/lrucache" "github.com/ClusterCockpit/cc-backend/pkg/lrucache"
@ -33,6 +33,7 @@ type JobRepository struct {
stmtCache *sq.StmtCache stmtCache *sq.StmtCache
cache *lrucache.Cache cache *lrucache.Cache
driver string driver string
Mutex sync.Mutex
} }
func GetJobRepository() *JobRepository { func GetJobRepository() *JobRepository {
@ -51,17 +52,29 @@ func GetJobRepository() *JobRepository {
} }
var jobColumns []string = []string{ var jobColumns []string = []string{
"job.id", "job.job_id", "job.hpc_user", "job.project", "job.cluster", "job.subcluster", "job.start_time", "job.cluster_partition", "job.array_job_id", "job.id", "job.job_id", "job.hpc_user", "job.project", "job.cluster", "job.subcluster",
"job.num_nodes", "job.num_hwthreads", "job.num_acc", "job.exclusive", "job.monitoring_status", "job.smt", "job.job_state", "job.start_time", "job.cluster_partition", "job.array_job_id", "job.num_nodes",
"job.duration", "job.walltime", "job.resources", "job.footprint", "job.energy", "job.num_hwthreads", "job.num_acc", "job.exclusive", "job.monitoring_status",
"job.smt", "job.job_state", "job.duration", "job.walltime", "job.resources",
"job.footprint", "job.energy",
} }
func scanJob(row interface{ Scan(...interface{}) error }) (*schema.Job, error) { var jobCacheColumns []string = []string{
"job_cache.id", "job_cache.job_id", "job_cache.hpc_user", "job_cache.project", "job_cache.cluster",
"job_cache.subcluster", "job_cache.start_time", "job_cache.cluster_partition",
"job_cache.array_job_id", "job_cache.num_nodes", "job_cache.num_hwthreads",
"job_cache.num_acc", "job_cache.exclusive", "job_cache.monitoring_status", "job_cache.smt",
"job_cache.job_state", "job_cache.duration", "job_cache.walltime", "job_cache.resources",
"job_cache.footprint", "job_cache.energy",
}
func scanJob(row interface{ Scan(...any) error }) (*schema.Job, error) {
job := &schema.Job{} job := &schema.Job{}
if err := row.Scan( if err := row.Scan(
&job.ID, &job.JobID, &job.User, &job.Project, &job.Cluster, &job.SubCluster, &job.StartTimeUnix, &job.Partition, &job.ArrayJobId, &job.ID, &job.JobID, &job.User, &job.Project, &job.Cluster, &job.SubCluster,
&job.NumNodes, &job.NumHWThreads, &job.NumAcc, &job.Exclusive, &job.MonitoringStatus, &job.SMT, &job.State, &job.StartTime, &job.Partition, &job.ArrayJobId, &job.NumNodes, &job.NumHWThreads,
&job.NumAcc, &job.Exclusive, &job.MonitoringStatus, &job.SMT, &job.State,
&job.Duration, &job.Walltime, &job.RawResources, &job.RawFootprint, &job.Energy); err != nil { &job.Duration, &job.Walltime, &job.RawResources, &job.RawFootprint, &job.Energy); err != nil {
log.Warnf("Error while scanning rows (Job): %v", err) log.Warnf("Error while scanning rows (Job): %v", err)
return nil, err return nil, err
@ -79,10 +92,9 @@ func scanJob(row interface{ Scan(...interface{}) error }) (*schema.Job, error) {
} }
job.RawFootprint = nil job.RawFootprint = nil
job.StartTime = time.Unix(job.StartTimeUnix, 0)
// Always ensure accurate duration for running jobs // Always ensure accurate duration for running jobs
if job.State == schema.JobStateRunning { if job.State == schema.JobStateRunning {
job.Duration = int32(time.Since(job.StartTime).Seconds()) job.Duration = int32(time.Now().Unix() - job.StartTime)
} }
return job, nil return job, nil
@ -138,17 +150,6 @@ func (r *JobRepository) Flush() error {
return nil return nil
} }
func scanJobLink(row interface{ Scan(...interface{}) error }) (*model.JobLink, error) {
jobLink := &model.JobLink{}
if err := row.Scan(
&jobLink.ID, &jobLink.JobID); err != nil {
log.Warn("Error while scanning rows (jobLink)")
return nil, err
}
return jobLink, nil
}
func (r *JobRepository) FetchMetadata(job *schema.Job) (map[string]string, error) { func (r *JobRepository) FetchMetadata(job *schema.Job) (map[string]string, error) {
start := time.Now() start := time.Now()
cachekey := fmt.Sprintf("metadata:%d", job.ID) cachekey := fmt.Sprintf("metadata:%d", job.ID)
@ -189,9 +190,7 @@ func (r *JobRepository) UpdateMetadata(job *schema.Job, key, val string) (err er
if job.MetaData != nil { if job.MetaData != nil {
cpy := make(map[string]string, len(job.MetaData)+1) cpy := make(map[string]string, len(job.MetaData)+1)
for k, v := range job.MetaData { maps.Copy(cpy, job.MetaData)
cpy[k] = v
}
cpy[key] = val cpy[key] = val
job.MetaData = cpy job.MetaData = cpy
} else { } else {
@ -389,7 +388,7 @@ func (r *JobRepository) FindColumnValues(user *schema.User, query string, table
func (r *JobRepository) Partitions(cluster string) ([]string, error) { func (r *JobRepository) Partitions(cluster string) ([]string, error) {
var err error var err error
start := time.Now() start := time.Now()
partitions := r.cache.Get("partitions:"+cluster, func() (interface{}, time.Duration, int) { partitions := r.cache.Get("partitions:"+cluster, func() (any, time.Duration, int) {
parts := []string{} parts := []string{}
if err = r.DB.Select(&parts, `SELECT DISTINCT job.cluster_partition FROM job WHERE job.cluster = ?;`, cluster); err != nil { if err = r.DB.Select(&parts, `SELECT DISTINCT job.cluster_partition FROM job WHERE job.cluster = ?;`, cluster); err != nil {
return nil, 0, 1000 return nil, 0, 1000
@ -477,6 +476,7 @@ func (r *JobRepository) StopJobsExceedingWalltimeBy(seconds int) error {
return nil return nil
} }
// FIXME: Reconsider filtering short jobs with harcoded threshold
func (r *JobRepository) FindRunningJobs(cluster string) ([]*schema.Job, error) { func (r *JobRepository) FindRunningJobs(cluster string) ([]*schema.Job, error) {
query := sq.Select(jobColumns...).From("job"). query := sq.Select(jobColumns...).From("job").
Where(fmt.Sprintf("job.cluster = '%s'", cluster)). Where(fmt.Sprintf("job.cluster = '%s'", cluster)).
@ -581,7 +581,7 @@ func (r *JobRepository) MarkArchived(
func (r *JobRepository) UpdateEnergy( func (r *JobRepository) UpdateEnergy(
stmt sq.UpdateBuilder, stmt sq.UpdateBuilder,
jobMeta *schema.JobMeta, jobMeta *schema.Job,
) (sq.UpdateBuilder, error) { ) (sq.UpdateBuilder, error) {
/* Note: Only Called for Running Jobs during Intermediate Update or on Archiving */ /* Note: Only Called for Running Jobs during Intermediate Update or on Archiving */
sc, err := archive.GetSubCluster(jobMeta.Cluster, jobMeta.SubCluster) sc, err := archive.GetSubCluster(jobMeta.Cluster, jobMeta.SubCluster)
@ -631,7 +631,7 @@ func (r *JobRepository) UpdateEnergy(
func (r *JobRepository) UpdateFootprint( func (r *JobRepository) UpdateFootprint(
stmt sq.UpdateBuilder, stmt sq.UpdateBuilder,
jobMeta *schema.JobMeta, jobMeta *schema.Job,
) (sq.UpdateBuilder, error) { ) (sq.UpdateBuilder, error) {
/* Note: Only Called for Running Jobs during Intermediate Update or on Archiving */ /* Note: Only Called for Running Jobs during Intermediate Update or on Archiving */
sc, err := archive.GetSubCluster(jobMeta.Cluster, jobMeta.SubCluster) sc, err := archive.GetSubCluster(jobMeta.Cluster, jobMeta.SubCluster)

View File

@ -13,6 +13,14 @@ import (
sq "github.com/Masterminds/squirrel" sq "github.com/Masterminds/squirrel"
) )
const NamedJobCacheInsert string = `INSERT INTO job_cache (
job_id, hpc_user, project, cluster, subcluster, cluster_partition, array_job_id, num_nodes, num_hwthreads, num_acc,
exclusive, monitoring_status, smt, job_state, start_time, duration, walltime, footprint, energy, energy_footprint, resources, meta_data
) VALUES (
:job_id, :hpc_user, :project, :cluster, :subcluster, :cluster_partition, :array_job_id, :num_nodes, :num_hwthreads, :num_acc,
:exclusive, :monitoring_status, :smt, :job_state, :start_time, :duration, :walltime, :footprint, :energy, :energy_footprint, :resources, :meta_data
);`
const NamedJobInsert string = `INSERT INTO job ( const NamedJobInsert string = `INSERT INTO job (
job_id, hpc_user, project, cluster, subcluster, cluster_partition, array_job_id, num_nodes, num_hwthreads, num_acc, job_id, hpc_user, project, cluster, subcluster, cluster_partition, array_job_id, num_nodes, num_hwthreads, num_acc,
exclusive, monitoring_status, smt, job_state, start_time, duration, walltime, footprint, energy, energy_footprint, resources, meta_data exclusive, monitoring_status, smt, job_state, start_time, duration, walltime, footprint, energy, energy_footprint, resources, meta_data
@ -21,8 +29,10 @@ const NamedJobInsert string = `INSERT INTO job (
:exclusive, :monitoring_status, :smt, :job_state, :start_time, :duration, :walltime, :footprint, :energy, :energy_footprint, :resources, :meta_data :exclusive, :monitoring_status, :smt, :job_state, :start_time, :duration, :walltime, :footprint, :energy, :energy_footprint, :resources, :meta_data
);` );`
func (r *JobRepository) InsertJob(job *schema.JobMeta) (int64, error) { func (r *JobRepository) InsertJob(job *schema.Job) (int64, error) {
res, err := r.DB.NamedExec(NamedJobInsert, job) r.Mutex.Lock()
res, err := r.DB.NamedExec(NamedJobCacheInsert, job)
r.Mutex.Unlock()
if err != nil { if err != nil {
log.Warn("Error while NamedJobInsert") log.Warn("Error while NamedJobInsert")
return 0, err return 0, err
@ -36,9 +46,48 @@ func (r *JobRepository) InsertJob(job *schema.JobMeta) (int64, error) {
return id, nil return id, nil
} }
func (r *JobRepository) SyncJobs() ([]*schema.Job, error) {
r.Mutex.Lock()
defer r.Mutex.Unlock()
query := sq.Select(jobCacheColumns...).From("job_cache")
rows, err := query.RunWith(r.stmtCache).Query()
if err != nil {
log.Errorf("Error while running query %v", err)
return nil, err
}
jobs := make([]*schema.Job, 0, 50)
for rows.Next() {
job, err := scanJob(rows)
if err != nil {
rows.Close()
log.Warn("Error while scanning rows")
return nil, err
}
jobs = append(jobs, job)
}
_, err = r.DB.Exec(
"INSERT INTO job (job_id, cluster, subcluster, start_time, hpc_user, project, cluster_partition, array_job_id, num_nodes, num_hwthreads, num_acc, exclusive, monitoring_status, smt, job_state, duration, walltime, footprint, energy, energy_footprint, resources, meta_data) SELECT job_id, cluster, subcluster, start_time, hpc_user, project, cluster_partition, array_job_id, num_nodes, num_hwthreads, num_acc, exclusive, monitoring_status, smt, job_state, duration, walltime, footprint, energy, energy_footprint, resources, meta_data FROM job_cache")
if err != nil {
log.Warnf("Error while Job sync: %v", err)
return nil, err
}
_, err = r.DB.Exec("DELETE FROM job_cache")
if err != nil {
log.Warnf("Error while Job cache clean: %v", err)
return nil, err
}
return jobs, nil
}
// Start inserts a new job in the table, returning the unique job ID. // Start inserts a new job in the table, returning the unique job ID.
// Statistics are not transfered! // Statistics are not transfered!
func (r *JobRepository) Start(job *schema.JobMeta) (id int64, err error) { func (r *JobRepository) Start(job *schema.Job) (id int64, err error) {
job.RawFootprint, err = json.Marshal(job.Footprint) job.RawFootprint, err = json.Marshal(job.Footprint)
if err != nil { if err != nil {
return -1, fmt.Errorf("REPOSITORY/JOB > encoding footprint field failed: %w", err) return -1, fmt.Errorf("REPOSITORY/JOB > encoding footprint field failed: %w", err)
@ -73,3 +122,19 @@ func (r *JobRepository) Stop(
_, err = stmt.RunWith(r.stmtCache).Exec() _, err = stmt.RunWith(r.stmtCache).Exec()
return return
} }
func (r *JobRepository) StopCached(
jobId int64,
duration int32,
state schema.JobState,
monitoringStatus int32,
) (err error) {
stmt := sq.Update("job_cache").
Set("job_state", state).
Set("duration", duration).
Set("monitoring_status", monitoringStatus).
Where("job.id = ?", jobId)
_, err = stmt.RunWith(r.stmtCache).Exec()
return
}

View File

@ -43,6 +43,26 @@ func (r *JobRepository) Find(
return scanJob(q.RunWith(r.stmtCache).QueryRow()) return scanJob(q.RunWith(r.stmtCache).QueryRow())
} }
func (r *JobRepository) FindCached(
jobId *int64,
cluster *string,
startTime *int64,
) (*schema.Job, error) {
q := sq.Select(jobCacheColumns...).From("job_cache").
Where("job_cache.job_id = ?", *jobId)
if cluster != nil {
q = q.Where("job_cache.cluster = ?", *cluster)
}
if startTime != nil {
q = q.Where("job_cache.start_time = ?", *startTime)
}
q = q.OrderBy("job_cache.id DESC") // always use newest matching job by db id if more than one match
return scanJob(q.RunWith(r.stmtCache).QueryRow())
}
// Find executes a SQL query to find a specific batch job. // Find executes a SQL query to find a specific batch job.
// The job is queried using the batch job id, the cluster name, // The job is queried using the batch job id, the cluster name,
// and the start time of the job in UNIX epoch time seconds. // and the start time of the job in UNIX epoch time seconds.
@ -83,6 +103,35 @@ func (r *JobRepository) FindAll(
return jobs, nil return jobs, nil
} }
// Get complete joblist only consisting of db ids.
// This is useful to process large job counts and intended to be used
// together with FindById to process jobs one by one
func (r *JobRepository) GetJobList() ([]int64, error) {
query := sq.Select("id").From("job").
Where("job.job_state != 'running'")
rows, err := query.RunWith(r.stmtCache).Query()
if err != nil {
log.Error("Error while running query")
return nil, err
}
jl := make([]int64, 0, 1000)
for rows.Next() {
var id int64
err := rows.Scan(&id)
if err != nil {
rows.Close()
log.Warn("Error while scanning rows")
return nil, err
}
jl = append(jl, id)
}
log.Infof("Return job count %d", len(jl))
return jl, nil
}
// FindById executes a SQL query to find a specific batch job. // FindById executes a SQL query to find a specific batch job.
// The job is queried using the database id. // The job is queried using the database id.
// It returns a pointer to a schema.Job data structure and an error variable. // It returns a pointer to a schema.Job data structure and an error variable.
@ -178,7 +227,7 @@ func (r *JobRepository) FindConcurrentJobs(
var startTime int64 var startTime int64
var stopTime int64 var stopTime int64
startTime = job.StartTimeUnix startTime = job.StartTime
hostname := job.Resources[0].Hostname hostname := job.Resources[0].Hostname
if job.State == schema.JobStateRunning { if job.State == schema.JobStateRunning {

View File

@ -0,0 +1,57 @@
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
package repository
import (
"sync"
"github.com/ClusterCockpit/cc-backend/pkg/schema"
)
type JobHook interface {
JobStartCallback(job *schema.Job)
JobStopCallback(job *schema.Job)
}
var (
initOnce sync.Once
hooks []JobHook
)
func RegisterJobJook(hook JobHook) {
initOnce.Do(func() {
hooks = make([]JobHook, 0)
})
if hook != nil {
hooks = append(hooks, hook)
}
}
func CallJobStartHooks(jobs []*schema.Job) {
if hooks == nil {
return
}
for _, hook := range hooks {
if hook != nil {
for _, job := range jobs {
hook.JobStartCallback(job)
}
}
}
}
func CallJobStopHooks(job *schema.Job) {
if hooks == nil {
return
}
for _, hook := range hooks {
if hook != nil {
hook.JobStopCallback(job)
}
}
}

View File

@ -148,9 +148,7 @@ func BuildWhereClause(filter *model.JobFilter, query sq.SelectBuilder) sq.Select
} }
if filter.DbID != nil { if filter.DbID != nil {
dbIDs := make([]string, len(filter.DbID)) dbIDs := make([]string, len(filter.DbID))
for i, val := range filter.DbID { copy(dbIDs, filter.DbID)
dbIDs[i] = val
}
query = query.Where(sq.Eq{"job.id": dbIDs}) query = query.Where(sq.Eq{"job.id": dbIDs})
} }
if filter.JobID != nil { if filter.JobID != nil {

View File

@ -24,7 +24,7 @@ func TestFind(t *testing.T) {
// fmt.Printf("%+v", job) // fmt.Printf("%+v", job)
if job.ID != 5 { if *job.ID != 5 {
t.Errorf("wrong summary for diagnostic 3\ngot: %d \nwant: 1366", job.JobID) t.Errorf("wrong summary for diagnostic 3\ngot: %d \nwant: 1366", job.JobID)
} }
} }

View File

@ -16,7 +16,7 @@ import (
"github.com/golang-migrate/migrate/v4/source/iofs" "github.com/golang-migrate/migrate/v4/source/iofs"
) )
const Version uint = 8 const Version uint = 10
//go:embed migrations/* //go:embed migrations/*
var migrationFiles embed.FS var migrationFiles embed.FS
@ -115,8 +115,17 @@ func MigrateDB(backend string, db string) error {
} }
v, dirty, err := m.Version() v, dirty, err := m.Version()
if err != nil {
if err == migrate.ErrNilVersion {
log.Warn("Legacy database without version or missing database file!")
} else {
return err
}
}
log.Infof("unsupported database version %d, need %d.\nPlease backup your database file and run cc-backend -migrate-db", v, Version) if v < Version {
log.Infof("unsupported database version %d, need %d.\nPlease backup your database file and run cc-backend -migrate-db", v, Version)
}
if dirty { if dirty {
return fmt.Errorf("last migration to version %d has failed, please fix the db manually and force version with -force-db flag", Version) return fmt.Errorf("last migration to version %d has failed, please fix the db manually and force version with -force-db flag", Version)

View File

@ -0,0 +1 @@
DROP TABLE IF EXISTS job_cache;

View File

@ -0,0 +1,31 @@
CREATE TABLE "job_cache" (
id INTEGER PRIMARY KEY,
job_id BIGINT NOT NULL,
cluster VARCHAR(255) NOT NULL,
subcluster VARCHAR(255) NOT NULL,
start_time BIGINT NOT NULL, -- Unix timestamp
hpc_user VARCHAR(255) NOT NULL,
project VARCHAR(255) NOT NULL,
cluster_partition VARCHAR(255),
array_job_id BIGINT,
duration INT NOT NULL,
walltime INT NOT NULL,
job_state VARCHAR(255) NOT NULL
CHECK (job_state IN (
'running', 'completed', 'failed', 'cancelled',
'stopped', 'timeout', 'preempted', 'out_of_memory'
)),
meta_data TEXT, -- JSON
resources TEXT NOT NULL, -- JSON
num_nodes INT NOT NULL,
num_hwthreads INT,
num_acc INT,
smt TINYINT NOT NULL DEFAULT 1 CHECK (smt IN (0, 1)),
exclusive TINYINT NOT NULL DEFAULT 1 CHECK (exclusive IN (0, 1, 2)),
monitoring_status TINYINT NOT NULL DEFAULT 1
CHECK (monitoring_status IN (0, 1, 2, 3)),
energy REAL NOT NULL DEFAULT 0.0,
energy_footprint TEXT DEFAULT NULL,
footprint TEXT DEFAULT NULL,
UNIQUE (job_id, cluster, start_time)
);

View File

@ -0,0 +1 @@
DROP TABLE IF EXISTS node;

View File

@ -0,0 +1,17 @@
CREATE TABLE "node" (
id INTEGER PRIMARY KEY,
hostname VARCHAR(255) NOT NULL,
cluster VARCHAR(255) NOT NULL,
subcluster VARCHAR(255) NOT NULL,
node_state VARCHAR(255) NOT NULL
CHECK (node_state IN (
'allocated', 'reserved', 'idle', 'mixed',
'down', 'unknown'
)),
health_state VARCHAR(255) NOT NULL
CHECK (health_state IN (
'full', 'partial', 'failed'
)),
meta_data TEXT, -- JSON
UNIQUE (hostname, cluster)
);

241
internal/repository/node.go Normal file
View File

@ -0,0 +1,241 @@
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
package repository
import (
"database/sql"
"encoding/json"
"fmt"
"maps"
"sync"
"time"
"github.com/ClusterCockpit/cc-backend/pkg/archive"
"github.com/ClusterCockpit/cc-backend/pkg/log"
"github.com/ClusterCockpit/cc-backend/pkg/lrucache"
"github.com/ClusterCockpit/cc-backend/pkg/schema"
sq "github.com/Masterminds/squirrel"
"github.com/jmoiron/sqlx"
)
var (
nodeRepoOnce sync.Once
nodeRepoInstance *NodeRepository
)
type NodeRepository struct {
DB *sqlx.DB
stmtCache *sq.StmtCache
cache *lrucache.Cache
driver string
}
func GetNodeRepository() *NodeRepository {
nodeRepoOnce.Do(func() {
db := GetConnection()
nodeRepoInstance = &NodeRepository{
DB: db.DB,
driver: db.Driver,
stmtCache: sq.NewStmtCache(db.DB),
cache: lrucache.New(1024 * 1024),
}
})
return nodeRepoInstance
}
func (r *NodeRepository) FetchMetadata(node *schema.Node) (map[string]string, error) {
start := time.Now()
cachekey := fmt.Sprintf("metadata:%d", node.ID)
if cached := r.cache.Get(cachekey, nil); cached != nil {
node.MetaData = cached.(map[string]string)
return node.MetaData, nil
}
if err := sq.Select("node.meta_data").From("node").Where("node.id = ?", node.ID).
RunWith(r.stmtCache).QueryRow().Scan(&node.RawMetaData); err != nil {
log.Warn("Error while scanning for node metadata")
return nil, err
}
if len(node.RawMetaData) == 0 {
return nil, nil
}
if err := json.Unmarshal(node.RawMetaData, &node.MetaData); err != nil {
log.Warn("Error while unmarshaling raw metadata json")
return nil, err
}
r.cache.Put(cachekey, node.MetaData, len(node.RawMetaData), 24*time.Hour)
log.Debugf("Timer FetchMetadata %s", time.Since(start))
return node.MetaData, nil
}
func (r *NodeRepository) UpdateMetadata(node *schema.Node, key, val string) (err error) {
cachekey := fmt.Sprintf("metadata:%d", node.ID)
r.cache.Del(cachekey)
if node.MetaData == nil {
if _, err = r.FetchMetadata(node); err != nil {
log.Warnf("Error while fetching metadata for node, DB ID '%v'", node.ID)
return err
}
}
if node.MetaData != nil {
cpy := make(map[string]string, len(node.MetaData)+1)
maps.Copy(cpy, node.MetaData)
cpy[key] = val
node.MetaData = cpy
} else {
node.MetaData = map[string]string{key: val}
}
if node.RawMetaData, err = json.Marshal(node.MetaData); err != nil {
log.Warnf("Error while marshaling metadata for node, DB ID '%v'", node.ID)
return err
}
if _, err = sq.Update("node").
Set("meta_data", node.RawMetaData).
Where("node.id = ?", node.ID).
RunWith(r.stmtCache).Exec(); err != nil {
log.Warnf("Error while updating metadata for node, DB ID '%v'", node.ID)
return err
}
r.cache.Put(cachekey, node.MetaData, len(node.RawMetaData), 24*time.Hour)
return nil
}
func (r *NodeRepository) GetNode(id int64, withMeta bool) (*schema.Node, error) {
node := &schema.Node{}
if err := sq.Select("id", "hostname", "cluster", "subcluster", "node_state",
"health_state").From("node").
Where("node.id = ?", id).RunWith(r.DB).
QueryRow().Scan(&node.ID, &node.Hostname, &node.Cluster, &node.SubCluster, &node.NodeState,
&node.HealthState); err != nil {
log.Warnf("Error while querying node '%v' from database", id)
return nil, err
}
if withMeta {
var err error
var meta map[string]string
if meta, err = r.FetchMetadata(node); err != nil {
log.Warnf("Error while fetching metadata for node '%v'", id)
return nil, err
}
node.MetaData = meta
}
return node, nil
}
const NamedNodeInsert string = `
INSERT INTO node (hostname, cluster, subcluster, node_state, health_state)
VALUES (:hostname, :cluster, :subcluster, :node_state, :health_state);`
func (r *NodeRepository) AddNode(node *schema.Node) (int64, error) {
var err error
res, err := r.DB.NamedExec(NamedNodeInsert, node)
if err != nil {
log.Errorf("Error while adding node '%v' to database", node.Hostname)
return 0, err
}
node.ID, err = res.LastInsertId()
if err != nil {
log.Errorf("Error while getting last insert id for node '%v' from database", node.Hostname)
return 0, err
}
return node.ID, nil
}
func (r *NodeRepository) UpdateNodeState(hostname string, cluster string, nodeState *schema.NodeState) error {
var id int64
if err := sq.Select("id").From("node").
Where("node.hostname = ?", hostname).Where("node.cluster = ?", cluster).RunWith(r.DB).
QueryRow().Scan(&id); err != nil {
if err == sql.ErrNoRows {
subcluster, err := archive.GetSubClusterByNode(cluster, hostname)
if err != nil {
log.Errorf("Error while getting subcluster for node '%s' in cluster '%s': %v", hostname, cluster, err)
return err
}
node := schema.Node{
Hostname: hostname, Cluster: cluster, SubCluster: subcluster, NodeState: *nodeState,
HealthState: schema.MonitoringStateFull,
}
_, err = r.AddNode(&node)
if err != nil {
log.Errorf("Error while adding node '%s' to database: %v", hostname, err)
return err
}
return nil
} else {
log.Warnf("Error while querying node '%v' from database", id)
return err
}
}
if _, err := sq.Update("node").Set("node_state", nodeState).Where("node.id = ?", id).RunWith(r.DB).Exec(); err != nil {
log.Errorf("error while updating node '%s'", hostname)
return err
}
return nil
}
// func (r *NodeRepository) UpdateHealthState(hostname string, healthState *schema.MonitoringState) error {
// if _, err := sq.Update("node").Set("health_state", healthState).Where("node.id = ?", id).RunWith(r.DB).Exec(); err != nil {
// log.Errorf("error while updating node '%d'", id)
// return err
// }
//
// return nil
// }
func (r *NodeRepository) DeleteNode(id int64) error {
_, err := r.DB.Exec(`DELETE FROM node WHERE node.id = ?`, id)
if err != nil {
log.Errorf("Error while deleting node '%d' from DB", id)
return err
}
log.Infof("deleted node '%d' from DB", id)
return nil
}
func (r *NodeRepository) QueryNodes() ([]*schema.Node, error) {
return nil, nil
}
func (r *NodeRepository) ListNodes(cluster string) ([]*schema.Node, error) {
q := sq.Select("hostname", "cluster", "subcluster", "node_state",
"health_state").From("node").Where("node.cluster = ?", cluster).OrderBy("node.hostname ASC")
rows, err := q.RunWith(r.DB).Query()
if err != nil {
log.Warn("Error while querying user list")
return nil, err
}
nodeList := make([]*schema.Node, 0, 100)
defer rows.Close()
for rows.Next() {
node := &schema.Node{}
if err := rows.Scan(&node.Hostname, &node.Cluster,
&node.SubCluster, &node.NodeState, &node.HealthState); err != nil {
log.Warn("Error while scanning node list")
return nil, err
}
nodeList = append(nodeList, node)
}
return nodeList, nil
}

View File

@ -291,7 +291,7 @@ func (r *JobRepository) JobsStats(
return stats, nil return stats, nil
} }
func LoadJobStat(job *schema.JobMeta, metric string, statType string) float64 { func LoadJobStat(job *schema.Job, metric string, statType string) float64 {
if stats, ok := job.Statistics[metric]; ok { if stats, ok := job.Statistics[metric]; ok {
switch statType { switch statType {
case "avg": case "avg":
@ -759,7 +759,6 @@ func (r *JobRepository) runningJobsMetricStatisticsHistogram(
filters []*model.JobFilter, filters []*model.JobFilter,
bins *int, bins *int,
) []*model.MetricHistoPoints { ) []*model.MetricHistoPoints {
// Get Jobs // Get Jobs
jobs, err := r.QueryJobs(ctx, filters, &model.PageRequest{Page: 1, ItemsPerPage: 500 + 1}, nil) jobs, err := r.QueryJobs(ctx, filters, &model.PageRequest{Page: 1, ItemsPerPage: 500 + 1}, nil)
if err != nil { if err != nil {

View File

@ -45,6 +45,36 @@ func (r *JobRepository) AddTag(user *schema.User, job int64, tag int64) ([]*sche
return tags, archive.UpdateTags(j, archiveTags) return tags, archive.UpdateTags(j, archiveTags)
} }
func (r *JobRepository) AddTagDirect(job int64, tag int64) ([]*schema.Tag, error) {
j, err := r.FindByIdDirect(job)
if err != nil {
log.Warn("Error while finding job by id")
return nil, err
}
q := sq.Insert("jobtag").Columns("job_id", "tag_id").Values(job, tag)
if _, err := q.RunWith(r.stmtCache).Exec(); err != nil {
s, _, _ := q.ToSql()
log.Errorf("Error adding tag with %s: %v", s, err)
return nil, err
}
tags, err := r.GetTagsDirect(&job)
if err != nil {
log.Warn("Error while getting tags for job")
return nil, err
}
archiveTags, err := r.getArchiveTags(&job)
if err != nil {
log.Warn("Error while getting tags for job")
return nil, err
}
return tags, archive.UpdateTags(j, archiveTags)
}
// Removes a tag from a job by tag id // Removes a tag from a job by tag id
func (r *JobRepository) RemoveTag(user *schema.User, job, tag int64) ([]*schema.Tag, error) { func (r *JobRepository) RemoveTag(user *schema.User, job, tag int64) ([]*schema.Tag, error) {
j, err := r.FindByIdWithUser(user, job) j, err := r.FindByIdWithUser(user, job)
@ -82,7 +112,7 @@ func (r *JobRepository) RemoveJobTagByRequest(user *schema.User, job int64, tagT
tagID, exists := r.TagId(tagType, tagName, tagScope) tagID, exists := r.TagId(tagType, tagName, tagScope)
if !exists { if !exists {
log.Warnf("Tag does not exist (name, type, scope): %s, %s, %s", tagName, tagType, tagScope) log.Warnf("Tag does not exist (name, type, scope): %s, %s, %s", tagName, tagType, tagScope)
return nil, fmt.Errorf("Tag does not exist (name, type, scope): %s, %s, %s", tagName, tagType, tagScope) return nil, fmt.Errorf("tag does not exist (name, type, scope): %s, %s, %s", tagName, tagType, tagScope)
} }
// Get Job // Get Job
@ -122,7 +152,7 @@ func (r *JobRepository) RemoveTagByRequest(tagType string, tagName string, tagSc
tagID, exists := r.TagId(tagType, tagName, tagScope) tagID, exists := r.TagId(tagType, tagName, tagScope)
if !exists { if !exists {
log.Warnf("Tag does not exist (name, type, scope): %s, %s, %s", tagName, tagType, tagScope) log.Warnf("Tag does not exist (name, type, scope): %s, %s, %s", tagName, tagType, tagScope)
return fmt.Errorf("Tag does not exist (name, type, scope): %s, %s, %s", tagName, tagType, tagScope) return fmt.Errorf("tag does not exist (name, type, scope): %s, %s, %s", tagName, tagType, tagScope)
} }
// Handle Delete JobTagTable // Handle Delete JobTagTable
@ -291,6 +321,37 @@ func (r *JobRepository) AddTagOrCreate(user *schema.User, jobId int64, tagType s
return tagId, nil return tagId, nil
} }
func (r *JobRepository) AddTagOrCreateDirect(jobId int64, tagType string, tagName string) (tagId int64, err error) {
tagScope := "global"
tagId, exists := r.TagId(tagType, tagName, tagScope)
if !exists {
tagId, err = r.CreateTag(tagType, tagName, tagScope)
if err != nil {
return 0, err
}
}
if _, err := r.AddTagDirect(jobId, tagId); err != nil {
return 0, err
}
return tagId, nil
}
func (r *JobRepository) HasTag(jobId int64, tagType string, tagName string) bool {
var id int64
q := sq.Select("id").From("tag").Join("jobtag ON jobtag.tag_id = tag.id").
Where("jobtag.job_id = ?", jobId).Where("tag.tag_type = ?", tagType).
Where("tag.tag_name = ?", tagName)
err := q.RunWith(r.stmtCache).QueryRow().Scan(&id)
if err != nil {
return false
} else {
return true
}
}
// TagId returns the database id of the tag with the specified type and name. // TagId returns the database id of the tag with the specified type and name.
func (r *JobRepository) TagId(tagType string, tagName string, tagScope string) (tagId int64, exists bool) { func (r *JobRepository) TagId(tagType string, tagName string, tagScope string) (tagId int64, exists bool) {
exists = true exists = true
@ -346,6 +407,32 @@ func (r *JobRepository) GetTags(user *schema.User, job *int64) ([]*schema.Tag, e
return tags, nil return tags, nil
} }
func (r *JobRepository) GetTagsDirect(job *int64) ([]*schema.Tag, error) {
q := sq.Select("id", "tag_type", "tag_name", "tag_scope").From("tag")
if job != nil {
q = q.Join("jobtag ON jobtag.tag_id = tag.id").Where("jobtag.job_id = ?", *job)
}
rows, err := q.RunWith(r.stmtCache).Query()
if err != nil {
s, _, _ := q.ToSql()
log.Errorf("Error get tags with %s: %v", s, err)
return nil, err
}
tags := make([]*schema.Tag, 0)
for rows.Next() {
tag := &schema.Tag{}
if err := rows.Scan(&tag.ID, &tag.Type, &tag.Name, &tag.Scope); err != nil {
log.Warn("Error while scanning rows")
return nil, err
}
tags = append(tags, tag)
}
return tags, nil
}
// GetArchiveTags returns a list of all tags *regardless of scope* for archiving if job is nil or of the tags that the job with that database ID has. // GetArchiveTags returns a list of all tags *regardless of scope* for archiving if job is nil or of the tags that the job with that database ID has.
func (r *JobRepository) getArchiveTags(job *int64) ([]*schema.Tag, error) { func (r *JobRepository) getArchiveTags(job *int64) ([]*schema.Tag, error) {
q := sq.Select("id", "tag_type", "tag_name", "tag_scope").From("tag") q := sq.Select("id", "tag_type", "tag_name", "tag_scope").From("tag")

Binary file not shown.

View File

@ -0,0 +1,4 @@
GROMACS
gromacs
GMX
mdrun

View File

@ -0,0 +1 @@
openfoam

View File

@ -0,0 +1,3 @@
python
anaconda
conda

View File

@ -0,0 +1,2 @@
VASP
vasp

View File

@ -0,0 +1,322 @@
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved. This file is part of cc-backend.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
package tagger
import (
"bytes"
"embed"
"encoding/json"
"fmt"
"maps"
"os"
"strings"
"text/template"
"github.com/ClusterCockpit/cc-backend/internal/repository"
"github.com/ClusterCockpit/cc-backend/internal/util"
"github.com/ClusterCockpit/cc-backend/pkg/archive"
"github.com/ClusterCockpit/cc-backend/pkg/log"
"github.com/ClusterCockpit/cc-backend/pkg/schema"
"github.com/expr-lang/expr"
"github.com/expr-lang/expr/vm"
)
//go:embed jobclasses/*
var jobclassFiles embed.FS
type Variable struct {
Name string `json:"name"`
Expr string `json:"expr"`
}
type ruleVariable struct {
name string
expr *vm.Program
}
type RuleFormat struct {
Name string `json:"name"`
Tag string `json:"tag"`
Parameters []string `json:"parameters"`
Metrics []string `json:"metrics"`
Requirements []string `json:"requirements"`
Variables []Variable `json:"variables"`
Rule string `json:"rule"`
Hint string `json:"hint"`
}
type ruleInfo struct {
env map[string]any
metrics []string
requirements []*vm.Program
variables []ruleVariable
rule *vm.Program
hint *template.Template
}
type JobClassTagger struct {
rules map[string]ruleInfo
parameters map[string]any
tagType string
cfgPath string
}
func (t *JobClassTagger) prepareRule(b []byte, fns string) {
var rule RuleFormat
if err := json.NewDecoder(bytes.NewReader(b)).Decode(&rule); err != nil {
log.Warn("Error while decoding raw job meta json")
return
}
ri := ruleInfo{}
ri.env = make(map[string]any)
ri.metrics = make([]string, 0)
ri.requirements = make([]*vm.Program, 0)
ri.variables = make([]ruleVariable, 0)
// check if all required parameters are available
for _, p := range rule.Parameters {
param, ok := t.parameters[p]
if !ok {
log.Warnf("prepareRule() > missing parameter %s in rule %s", p, fns)
return
}
ri.env[p] = param
}
// set all required metrics
ri.metrics = append(ri.metrics, rule.Metrics...)
// compile requirements
for _, r := range rule.Requirements {
req, err := expr.Compile(r, expr.AsBool())
if err != nil {
log.Errorf("error compiling requirement %s: %#v", r, err)
return
}
ri.requirements = append(ri.requirements, req)
}
// compile variables
for _, v := range rule.Variables {
req, err := expr.Compile(v.Expr, expr.AsFloat64())
if err != nil {
log.Errorf("error compiling requirement %s: %#v", v.Name, err)
return
}
ri.variables = append(ri.variables, ruleVariable{name: v.Name, expr: req})
}
// compile rule
exp, err := expr.Compile(rule.Rule, expr.AsBool())
if err != nil {
log.Errorf("error compiling rule %s: %#v", fns, err)
return
}
ri.rule = exp
// prepare hint template
ri.hint, err = template.New(fns).Parse(rule.Hint)
if err != nil {
log.Errorf("error processing template %s: %#v", fns, err)
}
log.Infof("prepareRule() > processing %s with %d requirements and %d variables", fns, len(ri.requirements), len(ri.variables))
t.rules[rule.Tag] = ri
}
func (t *JobClassTagger) EventMatch(s string) bool {
return strings.Contains(s, "jobclasses")
}
// FIXME: Only process the file that caused the event
func (t *JobClassTagger) EventCallback() {
files, err := os.ReadDir(t.cfgPath)
if err != nil {
log.Fatal(err)
}
if util.CheckFileExists(t.cfgPath + "/parameters.json") {
log.Info("Merge parameters")
b, err := os.ReadFile(t.cfgPath + "/parameters.json")
if err != nil {
log.Warnf("prepareRule() > open file error: %v", err)
}
var paramTmp map[string]any
if err := json.NewDecoder(bytes.NewReader(b)).Decode(&paramTmp); err != nil {
log.Warn("Error while decoding parameters.json")
}
maps.Copy(t.parameters, paramTmp)
}
for _, fn := range files {
fns := fn.Name()
if fns != "parameters.json" {
log.Debugf("Process: %s", fns)
filename := fmt.Sprintf("%s/%s", t.cfgPath, fns)
b, err := os.ReadFile(filename)
if err != nil {
log.Warnf("prepareRule() > open file error: %v", err)
return
}
t.prepareRule(b, fns)
}
}
}
func (t *JobClassTagger) initParameters() error {
log.Info("Initialize parameters")
b, err := jobclassFiles.ReadFile("jobclasses/parameters.json")
if err != nil {
log.Warnf("prepareRule() > open file error: %v", err)
return err
}
if err := json.NewDecoder(bytes.NewReader(b)).Decode(&t.parameters); err != nil {
log.Warn("Error while decoding parameters.json")
return err
}
return nil
}
func (t *JobClassTagger) Register() error {
t.cfgPath = "./var/tagger/jobclasses"
t.tagType = "jobClass"
err := t.initParameters()
if err != nil {
log.Warnf("error reading parameters.json: %v", err)
return err
}
files, err := jobclassFiles.ReadDir("jobclasses")
if err != nil {
return fmt.Errorf("error reading app folder: %#v", err)
}
t.rules = make(map[string]ruleInfo, 0)
for _, fn := range files {
fns := fn.Name()
if fns != "parameters.json" {
filename := fmt.Sprintf("jobclasses/%s", fns)
log.Infof("Process: %s", fns)
b, err := jobclassFiles.ReadFile(filename)
if err != nil {
log.Warnf("prepareRule() > open file error: %v", err)
return err
}
t.prepareRule(b, fns)
}
}
if util.CheckFileExists(t.cfgPath) {
t.EventCallback()
log.Infof("Setup file watch for %s", t.cfgPath)
util.AddListener(t.cfgPath, t)
}
return nil
}
func (t *JobClassTagger) Match(job *schema.Job) {
r := repository.GetJobRepository()
jobstats, err := archive.GetStatistics(job)
metricsList := archive.GetMetricConfigSubCluster(job.Cluster, job.SubCluster)
log.Infof("Enter match rule with %d rules for job %d", len(t.rules), job.JobID)
if err != nil {
log.Errorf("job classification failed for job %d: %#v", job.JobID, err)
return
}
for tag, ri := range t.rules {
env := make(map[string]any)
maps.Copy(env, ri.env)
log.Infof("Try to match rule %s for job %d", tag, job.JobID)
// Initialize environment
env["job"] = map[string]any{
"exclusive": job.Exclusive,
"duration": job.Duration,
"numCores": job.NumHWThreads,
"numNodes": job.NumNodes,
"jobState": job.State,
"numAcc": job.NumAcc,
"smt": job.SMT,
}
// add metrics to env
for _, m := range ri.metrics {
stats, ok := jobstats[m]
if !ok {
log.Errorf("job classification failed for job %d: missing metric '%s'", job.JobID, m)
return
}
env[m] = map[string]any{
"min": stats.Min,
"max": stats.Max,
"avg": stats.Avg,
"limits": map[string]float64{
"peak": metricsList[m].Peak,
"normal": metricsList[m].Normal,
"caution": metricsList[m].Caution,
"alert": metricsList[m].Alert,
},
}
}
// check rule requirements apply
for _, r := range ri.requirements {
ok, err := expr.Run(r, env)
if err != nil {
log.Errorf("error running requirement for rule %s: %#v", tag, err)
return
}
if !ok.(bool) {
log.Infof("requirement for rule %s not met", tag)
return
}
}
// validate rule expression
for _, v := range ri.variables {
value, err := expr.Run(v.expr, env)
if err != nil {
log.Errorf("error running rule %s: %#v", tag, err)
return
}
env[v.name] = value
}
// dump.P(env)
match, err := expr.Run(ri.rule, env)
if err != nil {
log.Errorf("error running rule %s: %#v", tag, err)
return
}
if match.(bool) {
log.Info("Rule matches!")
id := *job.ID
if !r.HasTag(id, t.tagType, tag) {
r.AddTagOrCreateDirect(id, t.tagType, tag)
}
// process hint template
var msg bytes.Buffer
if err := ri.hint.Execute(&msg, env); err != nil {
log.Errorf("Template error: %s", err.Error())
return
}
// FIXME: Handle case where multiple tags apply
r.UpdateMetadata(job, "message", msg.String())
} else {
log.Info("Rule does not match!")
}
}
}

View File

@ -0,0 +1,125 @@
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved. This file is part of cc-backend.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
package tagger
import (
"bufio"
"embed"
"fmt"
"io/fs"
"os"
"path/filepath"
"strings"
"github.com/ClusterCockpit/cc-backend/internal/repository"
"github.com/ClusterCockpit/cc-backend/internal/util"
"github.com/ClusterCockpit/cc-backend/pkg/log"
"github.com/ClusterCockpit/cc-backend/pkg/schema"
)
//go:embed apps/*
var appFiles embed.FS
type appInfo struct {
tag string
strings []string
}
type AppTagger struct {
apps map[string]appInfo
tagType string
cfgPath string
}
func (t *AppTagger) scanApp(f fs.File, fns string) {
scanner := bufio.NewScanner(f)
ai := appInfo{tag: strings.TrimSuffix(fns, filepath.Ext(fns)), strings: make([]string, 0)}
for scanner.Scan() {
ai.strings = append(ai.strings, scanner.Text())
}
delete(t.apps, ai.tag)
t.apps[ai.tag] = ai
}
func (t *AppTagger) EventMatch(s string) bool {
return strings.Contains(s, "apps")
}
// FIXME: Only process the file that caused the event
func (t *AppTagger) EventCallback() {
files, err := os.ReadDir(t.cfgPath)
if err != nil {
log.Fatal(err)
}
for _, fn := range files {
fns := fn.Name()
log.Debugf("Process: %s", fns)
f, err := os.Open(fmt.Sprintf("%s/%s", t.cfgPath, fns))
if err != nil {
log.Errorf("error opening app file %s: %#v", fns, err)
}
t.scanApp(f, fns)
}
}
func (t *AppTagger) Register() error {
t.cfgPath = "./var/tagger/apps"
t.tagType = "app"
files, err := appFiles.ReadDir("apps")
if err != nil {
return fmt.Errorf("error reading app folder: %#v", err)
}
t.apps = make(map[string]appInfo, 0)
for _, fn := range files {
fns := fn.Name()
log.Debugf("Process: %s", fns)
f, err := appFiles.Open(fmt.Sprintf("apps/%s", fns))
if err != nil {
return fmt.Errorf("error opening app file %s: %#v", fns, err)
}
defer f.Close()
t.scanApp(f, fns)
}
if util.CheckFileExists(t.cfgPath) {
t.EventCallback()
log.Infof("Setup file watch for %s", t.cfgPath)
util.AddListener(t.cfgPath, t)
}
return nil
}
func (t *AppTagger) Match(job *schema.Job) {
r := repository.GetJobRepository()
metadata, err := r.FetchMetadata(job)
if err != nil {
log.Infof("Cannot fetch metadata for job: %d on %s", job.JobID, job.Cluster)
return
}
jobscript, ok := metadata["jobScript"]
if ok {
id := *job.ID
out:
for _, a := range t.apps {
tag := a.tag
for _, s := range a.strings {
if strings.Contains(jobscript, s) {
if !r.HasTag(id, t.tagType, tag) {
r.AddTagOrCreateDirect(id, t.tagType, tag)
break out
}
}
}
}
} else {
log.Infof("Cannot extract job script for job: %d on %s", job.JobID, job.Cluster)
}
}

View File

@ -0,0 +1,59 @@
// Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
package tagger
import (
"testing"
"github.com/ClusterCockpit/cc-backend/internal/repository"
"github.com/ClusterCockpit/cc-backend/pkg/log"
)
func setup(tb testing.TB) *repository.JobRepository {
tb.Helper()
log.Init("warn", true)
dbfile := "../repository/testdata/job.db"
err := repository.MigrateDB("sqlite3", dbfile)
noErr(tb, err)
repository.Connect("sqlite3", dbfile)
return repository.GetJobRepository()
}
func noErr(tb testing.TB, err error) {
tb.Helper()
if err != nil {
tb.Fatal("Error is not nil:", err)
}
}
func TestRegister(t *testing.T) {
var tagger AppTagger
err := tagger.Register()
noErr(t, err)
if len(tagger.apps) != 4 {
t.Errorf("wrong summary for diagnostic \ngot: %d \nwant: 3", len(tagger.apps))
}
}
func TestMatch(t *testing.T) {
r := setup(t)
job, err := r.FindByIdDirect(5)
noErr(t, err)
var tagger AppTagger
err = tagger.Register()
noErr(t, err)
tagger.Match(job)
if !r.HasTag(5, "app", "vasp") {
t.Errorf("missing tag vasp")
}
}

View File

@ -0,0 +1,26 @@
{
"name": "Excessive CPU load",
"tag": "excessiveload",
"parameters": [
"excessivecpuload_threshold_factor",
"job_min_duration_seconds",
"sampling_interval_seconds"
],
"metrics": ["cpu_load"],
"requirements": [
"job.exclusive == 1",
"job.duration > job_min_duration_seconds"
],
"variables": [
{
"name": "load_threshold",
"expr": "(job.numCores / job.numNodes) * excessivecpuload_threshold_factor"
},
{
"name": "load_perc",
"expr": "1.0 - (cpu_load.avg / cpu_load.limits.peak)"
}
],
"rule": "cpu_load.avg > cpu_load.limits.peak",
"hint": "This job was detected as excessiveload because the average cpu load {{.cpu_load.avg}} falls above the threshold {{.cpu_load.limits.peak}}."
}

View File

@ -0,0 +1,26 @@
{
"name": "Low CPU load",
"tag": "lowload",
"parameters": [
"lowcpuload_threshold_factor",
"job_min_duration_seconds",
"sampling_interval_seconds"
],
"metrics": ["cpu_load"],
"requirements": [
"job.exclusive == 1",
"job.duration > job_min_duration_seconds"
],
"variables": [
{
"name": "load_threshold",
"expr": "job.numCores * lowcpuload_threshold_factor"
},
{
"name": "load_perc",
"expr": "1.0 - (cpu_load.avg / cpu_load.limits.peak)"
}
],
"rule": "cpu_load.avg < cpu_load.limits.caution",
"hint": "This job was detected as lowload because the average cpu load {{.cpu_load}} falls below the threshold {{.cpu_load.limits.caution}}."
}

View File

@ -0,0 +1,14 @@
{
"lowcpuload_threshold_factor": 0.9,
"excessivecpuload_threshold_factor": 1.1,
"highmemoryusage_threshold_factor": 0.9,
"node_load_imbalance_threshold_factor": 0.1,
"core_load_imbalance_threshold_factor": 0.1,
"high_memory_load_threshold_factor": 0.9,
"lowgpuload_threshold_factor": 0.7,
"memory_leak_slope_threshold": 0.1,
"job_min_duration_seconds": 600.0,
"sampling_interval_seconds": 30.0,
"cpu_load_pre_cutoff_samples": 11.0,
"cpu_load_core_pre_cutoff_samples": 6.0
}

View File

@ -0,0 +1,21 @@
{
"and": [
{
"in": [
"a40",
{
"var": "metaData.jobScript"
}
]
},
{
">": [
{
"var": "statistics.clock.min"
},
2000
]
}
]
}

88
internal/tagger/tagger.go Normal file
View File

@ -0,0 +1,88 @@
// Copyright (C) 2023 NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
package tagger
import (
"sync"
"github.com/ClusterCockpit/cc-backend/internal/repository"
"github.com/ClusterCockpit/cc-backend/pkg/log"
"github.com/ClusterCockpit/cc-backend/pkg/schema"
)
type Tagger interface {
Register() error
Match(job *schema.Job)
}
var (
initOnce sync.Once
jobTagger *JobTagger
)
type JobTagger struct {
startTaggers []Tagger
stopTaggers []Tagger
}
func newTagger() {
jobTagger = &JobTagger{}
jobTagger.startTaggers = make([]Tagger, 0)
jobTagger.startTaggers = append(jobTagger.startTaggers, &AppTagger{})
jobTagger.stopTaggers = make([]Tagger, 0)
jobTagger.stopTaggers = append(jobTagger.stopTaggers, &JobClassTagger{})
for _, tagger := range jobTagger.startTaggers {
tagger.Register()
}
for _, tagger := range jobTagger.stopTaggers {
tagger.Register()
}
}
func Init() {
initOnce.Do(func() {
newTagger()
repository.RegisterJobJook(jobTagger)
})
}
func (jt *JobTagger) JobStartCallback(job *schema.Job) {
for _, tagger := range jt.startTaggers {
tagger.Match(job)
}
}
func (jt *JobTagger) JobStopCallback(job *schema.Job) {
for _, tagger := range jt.stopTaggers {
tagger.Match(job)
}
}
func RunTaggers() error {
newTagger()
r := repository.GetJobRepository()
jl, err := r.GetJobList()
if err != nil {
log.Errorf("Error while getting job list %s", err)
return err
}
for _, id := range jl {
job, err := r.FindByIdDirect(id)
if err != nil {
log.Errorf("Error while getting job %s", err)
return err
}
for _, tagger := range jobTagger.startTaggers {
tagger.Match(job)
}
for _, tagger := range jobTagger.stopTaggers {
log.Infof("Run stop tagger for job %d", job.ID)
tagger.Match(job)
}
}
return nil
}

View File

@ -0,0 +1,31 @@
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
package tagger
import (
"testing"
"github.com/ClusterCockpit/cc-backend/internal/repository"
"github.com/ClusterCockpit/cc-backend/pkg/schema"
)
func TestInit(t *testing.T) {
Init()
}
func TestJobStartCallback(t *testing.T) {
Init()
r := setup(t)
job, err := r.FindByIdDirect(2)
noErr(t, err)
jobs := make([]*schema.Job, 0, 1)
jobs = append(jobs, job)
repository.CallJobStartHooks(jobs)
if !r.HasTag(2, "app", "python") {
t.Errorf("missing tag python")
}
}

View File

@ -0,0 +1,35 @@
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
package taskManager
import (
"time"
"github.com/ClusterCockpit/cc-backend/internal/config"
"github.com/ClusterCockpit/cc-backend/internal/repository"
"github.com/ClusterCockpit/cc-backend/pkg/log"
"github.com/go-co-op/gocron/v2"
)
func RegisterCommitJobService() {
var frequency string
if config.Keys.CronFrequency != nil && config.Keys.CronFrequency.CommitJobWorker != "" {
frequency = config.Keys.CronFrequency.CommitJobWorker
} else {
frequency = "2m"
}
d, _ := time.ParseDuration(frequency)
log.Infof("Register commitJob service with %s interval", frequency)
s.NewJob(gocron.DurationJob(d),
gocron.NewTask(
func() {
start := time.Now()
log.Printf("Jobcache sync started at %s", start.Format(time.RFC3339))
jobs, _ := jobRepo.SyncJobs()
repository.CallJobStartHooks(jobs)
log.Printf("Jobcache sync and job callbacks are done and took %s", time.Since(start))
}))
}

View File

@ -81,6 +81,7 @@ func Start() {
RegisterFootprintWorker() RegisterFootprintWorker()
RegisterUpdateDurationWorker() RegisterUpdateDurationWorker()
RegisterCommitJobService()
s.Start() s.Start()
} }

View File

@ -73,11 +73,7 @@ func RegisterFootprintWorker() {
continue continue
} }
jobMeta := &schema.JobMeta{ job.Statistics = make(map[string]schema.JobStatistics)
BaseJob: job.BaseJob,
StartTime: job.StartTime.Unix(),
Statistics: make(map[string]schema.JobStatistics),
}
for _, metric := range allMetrics { for _, metric := range allMetrics {
avg, min, max := 0.0, 0.0, 0.0 avg, min, max := 0.0, 0.0, 0.0
@ -95,7 +91,7 @@ func RegisterFootprintWorker() {
} }
// Add values rounded to 2 digits: repo.LoadStats may return unrounded // Add values rounded to 2 digits: repo.LoadStats may return unrounded
jobMeta.Statistics[metric] = schema.JobStatistics{ job.Statistics[metric] = schema.JobStatistics{
Unit: schema.Unit{ Unit: schema.Unit{
Prefix: archive.GetMetricConfig(job.Cluster, metric).Unit.Prefix, Prefix: archive.GetMetricConfig(job.Cluster, metric).Unit.Prefix,
Base: archive.GetMetricConfig(job.Cluster, metric).Unit.Base, Base: archive.GetMetricConfig(job.Cluster, metric).Unit.Base,
@ -108,7 +104,7 @@ func RegisterFootprintWorker() {
// Build Statement per Job, Add to Pending Array // Build Statement per Job, Add to Pending Array
stmt := sq.Update("job") stmt := sq.Update("job")
stmt, err = jobRepo.UpdateFootprint(stmt, jobMeta) stmt, err = jobRepo.UpdateFootprint(stmt, job)
if err != nil { if err != nil {
log.Errorf("update job (dbid: %d) statement build failed at footprint step: %s", job.ID, err.Error()) log.Errorf("update job (dbid: %d) statement build failed at footprint step: %s", job.ID, err.Error())
ce++ ce++

View File

@ -0,0 +1,75 @@
// Copyright (C) 2023 NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
package util
import (
"sync"
"github.com/ClusterCockpit/cc-backend/pkg/log"
"github.com/fsnotify/fsnotify"
)
type Listener interface {
EventCallback()
EventMatch(event string) bool
}
var (
initOnce sync.Once
w *fsnotify.Watcher
listeners []Listener
)
func AddListener(path string, l Listener) {
var err error
initOnce.Do(func() {
var err error
w, err = fsnotify.NewWatcher()
if err != nil {
log.Error("creating a new watcher: %w", err)
}
listeners = make([]Listener, 0)
go watchLoop(w)
})
listeners = append(listeners, l)
err = w.Add(path)
if err != nil {
log.Warnf("%q: %s", path, err)
}
}
func FsWatcherShutdown() {
if w != nil {
w.Close()
}
}
func watchLoop(w *fsnotify.Watcher) {
for {
select {
// Read from Errors.
case err, ok := <-w.Errors:
if !ok { // Channel was closed (i.e. Watcher.Close() was called).
return
}
log.Errorf("watch event loop: %s", err)
// Read from Events.
case e, ok := <-w.Events:
if !ok { // Channel was closed (i.e. Watcher.Close() was called).
return
}
log.Infof("Event %s", e)
for _, l := range listeners {
if l.EventMatch(e.String()) {
l.EventCallback()
}
}
}
}
}

View File

@ -23,7 +23,7 @@ type ArchiveBackend interface {
Exists(job *schema.Job) bool Exists(job *schema.Job) bool
LoadJobMeta(job *schema.Job) (*schema.JobMeta, error) LoadJobMeta(job *schema.Job) (*schema.Job, error)
LoadJobData(job *schema.Job) (schema.JobData, error) LoadJobData(job *schema.Job) (schema.JobData, error)
@ -31,9 +31,9 @@ type ArchiveBackend interface {
LoadClusterCfg(name string) (*schema.Cluster, error) LoadClusterCfg(name string) (*schema.Cluster, error)
StoreJobMeta(jobMeta *schema.JobMeta) error StoreJobMeta(jobMeta *schema.Job) error
ImportJob(jobMeta *schema.JobMeta, jobData *schema.JobData) error ImportJob(jobMeta *schema.Job, jobData *schema.JobData) error
GetClusters() []string GetClusters() []string
@ -51,7 +51,7 @@ type ArchiveBackend interface {
} }
type JobContainer struct { type JobContainer struct {
Meta *schema.JobMeta Meta *schema.Job
Data *schema.JobData Data *schema.JobData
} }
@ -162,7 +162,6 @@ func LoadScopedStatsFromArchive(
metrics []string, metrics []string,
scopes []schema.MetricScope, scopes []schema.MetricScope,
) (schema.ScopedJobStats, error) { ) (schema.ScopedJobStats, error) {
data, err := ar.LoadJobStats(job) data, err := ar.LoadJobStats(job)
if err != nil { if err != nil {
log.Errorf("Error while loading job stats from archiveBackend: %s", err.Error()) log.Errorf("Error while loading job stats from archiveBackend: %s", err.Error())

View File

@ -9,7 +9,6 @@ import (
"fmt" "fmt"
"path/filepath" "path/filepath"
"testing" "testing"
"time"
"github.com/ClusterCockpit/cc-backend/internal/util" "github.com/ClusterCockpit/cc-backend/internal/util"
"github.com/ClusterCockpit/cc-backend/pkg/archive" "github.com/ClusterCockpit/cc-backend/pkg/archive"
@ -32,12 +31,12 @@ func setup(t *testing.T) archive.ArchiveBackend {
jobs[0] = &schema.Job{} jobs[0] = &schema.Job{}
jobs[0].JobID = 1403244 jobs[0].JobID = 1403244
jobs[0].Cluster = "emmy" jobs[0].Cluster = "emmy"
jobs[0].StartTime = time.Unix(1608923076, 0) jobs[0].StartTime = 1608923076
jobs[1] = &schema.Job{} jobs[1] = &schema.Job{}
jobs[0].JobID = 1404397 jobs[0].JobID = 1404397
jobs[0].Cluster = "emmy" jobs[0].Cluster = "emmy"
jobs[0].StartTime = time.Unix(1609300556, 0) jobs[0].StartTime = 1609300556
return archive.GetHandle() return archive.GetHandle()
} }

View File

@ -69,16 +69,18 @@ func initClusterConfig() error {
for _, sc := range cluster.SubClusters { for _, sc := range cluster.SubClusters {
newMetric := &schema.MetricConfig{ newMetric := &schema.MetricConfig{
Unit: mc.Unit, Metric: schema.Metric{
Name: mc.Name,
Unit: mc.Unit,
Peak: mc.Peak,
Normal: mc.Normal,
Caution: mc.Caution,
Alert: mc.Alert,
},
Energy: mc.Energy, Energy: mc.Energy,
Name: mc.Name,
Scope: mc.Scope, Scope: mc.Scope,
Aggregation: mc.Aggregation, Aggregation: mc.Aggregation,
Peak: mc.Peak,
Caution: mc.Caution,
Alert: mc.Alert,
Timestep: mc.Timestep, Timestep: mc.Timestep,
Normal: mc.Normal,
LowerIsBetter: mc.LowerIsBetter, LowerIsBetter: mc.LowerIsBetter,
} }
@ -167,6 +169,45 @@ func GetSubCluster(cluster, subcluster string) (*schema.SubCluster, error) {
return nil, fmt.Errorf("subcluster '%v' not found for cluster '%v', or cluster '%v' not configured", subcluster, cluster, cluster) return nil, fmt.Errorf("subcluster '%v' not found for cluster '%v', or cluster '%v' not configured", subcluster, cluster, cluster)
} }
func GetMetricConfigSubCluster(cluster, subcluster string) map[string]*schema.Metric {
metrics := make(map[string]*schema.Metric)
for _, c := range Clusters {
if c.Name == cluster {
for _, m := range c.MetricConfig {
for _, s := range m.SubClusters {
if s.Name == subcluster {
metrics[m.Name] = &schema.Metric{
Name: m.Name,
Unit: s.Unit,
Peak: s.Peak,
Normal: s.Normal,
Caution: s.Caution,
Alert: s.Alert,
}
break
}
}
_, ok := metrics[m.Name]
if !ok {
metrics[m.Name] = &schema.Metric{
Name: m.Name,
Unit: m.Unit,
Peak: m.Peak,
Normal: m.Normal,
Caution: m.Caution,
Alert: m.Alert,
}
}
}
break
}
}
return metrics
}
func GetMetricConfig(cluster, metric string) *schema.MetricConfig { func GetMetricConfig(cluster, metric string) *schema.MetricConfig {
for _, c := range Clusters { for _, c := range Clusters {
if c.Name == cluster { if c.Name == cluster {
@ -182,7 +223,7 @@ func GetMetricConfig(cluster, metric string) *schema.MetricConfig {
// AssignSubCluster sets the `job.subcluster` property of the job based // AssignSubCluster sets the `job.subcluster` property of the job based
// on its cluster and resources. // on its cluster and resources.
func AssignSubCluster(job *schema.BaseJob) error { func AssignSubCluster(job *schema.Job) error {
cluster := GetCluster(job.Cluster) cluster := GetCluster(job.Cluster)
if cluster == nil { if cluster == nil {
return fmt.Errorf("ARCHIVE/CLUSTERCONFIG > unkown cluster: %v", job.Cluster) return fmt.Errorf("ARCHIVE/CLUSTERCONFIG > unkown cluster: %v", job.Cluster)

View File

@ -53,28 +53,27 @@ func getDirectory(
rootPath, rootPath,
job.Cluster, job.Cluster,
lvl1, lvl2, lvl1, lvl2,
strconv.FormatInt(job.StartTime.Unix(), 10)) strconv.FormatInt(job.StartTime, 10))
} }
func getPath( func getPath(
job *schema.Job, job *schema.Job,
rootPath string, rootPath string,
file string) string { file string,
) string {
return filepath.Join( return filepath.Join(
getDirectory(job, rootPath), file) getDirectory(job, rootPath), file)
} }
func loadJobMeta(filename string) (*schema.JobMeta, error) { func loadJobMeta(filename string) (*schema.Job, error) {
b, err := os.ReadFile(filename) b, err := os.ReadFile(filename)
if err != nil { if err != nil {
log.Errorf("loadJobMeta() > open file error: %v", err) log.Errorf("loadJobMeta() > open file error: %v", err)
return &schema.JobMeta{}, err return nil, err
} }
if config.Keys.Validate { if config.Keys.Validate {
if err := schema.Validate(schema.Meta, bytes.NewReader(b)); err != nil { if err := schema.Validate(schema.Meta, bytes.NewReader(b)); err != nil {
return &schema.JobMeta{}, fmt.Errorf("validate job meta: %v", err) return nil, fmt.Errorf("validate job meta: %v", err)
} }
} }
@ -83,7 +82,6 @@ func loadJobMeta(filename string) (*schema.JobMeta, error) {
func loadJobData(filename string, isCompressed bool) (schema.JobData, error) { func loadJobData(filename string, isCompressed bool) (schema.JobData, error) {
f, err := os.Open(filename) f, err := os.Open(filename)
if err != nil { if err != nil {
log.Errorf("fsBackend LoadJobData()- %v", err) log.Errorf("fsBackend LoadJobData()- %v", err)
return nil, err return nil, err
@ -117,7 +115,6 @@ func loadJobData(filename string, isCompressed bool) (schema.JobData, error) {
func loadJobStats(filename string, isCompressed bool) (schema.ScopedJobStats, error) { func loadJobStats(filename string, isCompressed bool) (schema.ScopedJobStats, error) {
f, err := os.Open(filename) f, err := os.Open(filename)
if err != nil { if err != nil {
log.Errorf("fsBackend LoadJobStats()- %v", err) log.Errorf("fsBackend LoadJobStats()- %v", err)
return nil, err return nil, err
@ -150,7 +147,6 @@ func loadJobStats(filename string, isCompressed bool) (schema.ScopedJobStats, er
} }
func (fsa *FsArchive) Init(rawConfig json.RawMessage) (uint64, error) { func (fsa *FsArchive) Init(rawConfig json.RawMessage) (uint64, error) {
var config FsArchiveConfig var config FsArchiveConfig
if err := json.Unmarshal(rawConfig, &config); err != nil { if err := json.Unmarshal(rawConfig, &config); err != nil {
log.Warnf("Init() > Unmarshal error: %#v", err) log.Warnf("Init() > Unmarshal error: %#v", err)
@ -276,7 +272,6 @@ func (fsa *FsArchive) Exists(job *schema.Job) bool {
} }
func (fsa *FsArchive) Clean(before int64, after int64) { func (fsa *FsArchive) Clean(before int64, after int64) {
if after == 0 { if after == 0 {
after = math.MaxInt64 after = math.MaxInt64
} }
@ -392,7 +387,6 @@ func (fsa *FsArchive) Compress(jobs []*schema.Job) {
} }
func (fsa *FsArchive) CompressLast(starttime int64) int64 { func (fsa *FsArchive) CompressLast(starttime int64) int64 {
filename := filepath.Join(fsa.path, "compress.txt") filename := filepath.Join(fsa.path, "compress.txt")
b, err := os.ReadFile(filename) b, err := os.ReadFile(filename)
if err != nil { if err != nil {
@ -435,13 +429,12 @@ func (fsa *FsArchive) LoadJobStats(job *schema.Job) (schema.ScopedJobStats, erro
return loadJobStats(filename, isCompressed) return loadJobStats(filename, isCompressed)
} }
func (fsa *FsArchive) LoadJobMeta(job *schema.Job) (*schema.JobMeta, error) { func (fsa *FsArchive) LoadJobMeta(job *schema.Job) (*schema.Job, error) {
filename := getPath(job, fsa.path, "meta.json") filename := getPath(job, fsa.path, "meta.json")
return loadJobMeta(filename) return loadJobMeta(filename)
} }
func (fsa *FsArchive) LoadClusterCfg(name string) (*schema.Cluster, error) { func (fsa *FsArchive) LoadClusterCfg(name string) (*schema.Cluster, error) {
b, err := os.ReadFile(filepath.Join(fsa.path, name, "cluster.json")) b, err := os.ReadFile(filepath.Join(fsa.path, name, "cluster.json"))
if err != nil { if err != nil {
log.Errorf("LoadClusterCfg() > open file error: %v", err) log.Errorf("LoadClusterCfg() > open file error: %v", err)
@ -456,7 +449,6 @@ func (fsa *FsArchive) LoadClusterCfg(name string) (*schema.Cluster, error) {
} }
func (fsa *FsArchive) Iter(loadMetricData bool) <-chan JobContainer { func (fsa *FsArchive) Iter(loadMetricData bool) <-chan JobContainer {
ch := make(chan JobContainer) ch := make(chan JobContainer)
go func() { go func() {
clustersDir, err := os.ReadDir(fsa.path) clustersDir, err := os.ReadDir(fsa.path)
@ -526,19 +518,13 @@ func (fsa *FsArchive) Iter(loadMetricData bool) <-chan JobContainer {
return ch return ch
} }
func (fsa *FsArchive) StoreJobMeta(jobMeta *schema.JobMeta) error { func (fsa *FsArchive) StoreJobMeta(job *schema.Job) error {
f, err := os.Create(getPath(job, fsa.path, "meta.json"))
job := schema.Job{
BaseJob: jobMeta.BaseJob,
StartTime: time.Unix(jobMeta.StartTime, 0),
StartTimeUnix: jobMeta.StartTime,
}
f, err := os.Create(getPath(&job, fsa.path, "meta.json"))
if err != nil { if err != nil {
log.Error("Error while creating filepath for meta.json") log.Error("Error while creating filepath for meta.json")
return err return err
} }
if err := EncodeJobMeta(f, jobMeta); err != nil { if err := EncodeJobMeta(f, job); err != nil {
log.Error("Error while encoding job metadata to meta.json file") log.Error("Error while encoding job metadata to meta.json file")
return err return err
} }
@ -555,15 +541,10 @@ func (fsa *FsArchive) GetClusters() []string {
} }
func (fsa *FsArchive) ImportJob( func (fsa *FsArchive) ImportJob(
jobMeta *schema.JobMeta, jobMeta *schema.Job,
jobData *schema.JobData) error { jobData *schema.JobData,
) error {
job := schema.Job{ dir := getPath(jobMeta, fsa.path, "")
BaseJob: jobMeta.BaseJob,
StartTime: time.Unix(jobMeta.StartTime, 0),
StartTimeUnix: jobMeta.StartTime,
}
dir := getPath(&job, fsa.path, "")
if err := os.MkdirAll(dir, 0777); err != nil { if err := os.MkdirAll(dir, 0777); err != nil {
log.Error("Error while creating job archive path") log.Error("Error while creating job archive path")
return err return err
@ -583,28 +564,6 @@ func (fsa *FsArchive) ImportJob(
return err return err
} }
// var isCompressed bool = true
// // TODO Use shortJob Config for check
// if jobMeta.Duration < 300 {
// isCompressed = false
// f, err = os.Create(path.Join(dir, "data.json"))
// } else {
// f, err = os.Create(path.Join(dir, "data.json.gz"))
// }
// if err != nil {
// return err
// }
//
// if isCompressed {
// if err := EncodeJobData(gzip.NewWriter(f), jobData); err != nil {
// return err
// }
// } else {
// if err := EncodeJobData(f, jobData); err != nil {
// return err
// }
// }
f, err = os.Create(path.Join(dir, "data.json")) f, err = os.Create(path.Join(dir, "data.json"))
if err != nil { if err != nil {
log.Error("Error while creating filepath for data.json") log.Error("Error while creating filepath for data.json")

View File

@ -9,7 +9,6 @@ import (
"fmt" "fmt"
"path/filepath" "path/filepath"
"testing" "testing"
"time"
"github.com/ClusterCockpit/cc-backend/internal/util" "github.com/ClusterCockpit/cc-backend/internal/util"
"github.com/ClusterCockpit/cc-backend/pkg/schema" "github.com/ClusterCockpit/cc-backend/pkg/schema"
@ -86,8 +85,11 @@ func TestLoadJobMeta(t *testing.T) {
t.Fatal(err) t.Fatal(err)
} }
jobIn := schema.Job{BaseJob: schema.JobDefaults} jobIn := schema.Job{
jobIn.StartTime = time.Unix(1608923076, 0) Exclusive: 1,
MonitoringStatus: schema.MonitoringStatusRunningOrArchiving,
}
jobIn.StartTime = 1608923076
jobIn.JobID = 1403244 jobIn.JobID = 1403244
jobIn.Cluster = "emmy" jobIn.Cluster = "emmy"
@ -114,8 +116,11 @@ func TestLoadJobData(t *testing.T) {
t.Fatal(err) t.Fatal(err)
} }
jobIn := schema.Job{BaseJob: schema.JobDefaults} jobIn := schema.Job{
jobIn.StartTime = time.Unix(1608923076, 0) Exclusive: 1,
MonitoringStatus: schema.MonitoringStatusRunningOrArchiving,
}
jobIn.StartTime = 1608923076
jobIn.JobID = 1403244 jobIn.JobID = 1403244
jobIn.Cluster = "emmy" jobIn.Cluster = "emmy"
@ -142,8 +147,11 @@ func BenchmarkLoadJobData(b *testing.B) {
var fsa FsArchive var fsa FsArchive
fsa.Init(json.RawMessage(archiveCfg)) fsa.Init(json.RawMessage(archiveCfg))
jobIn := schema.Job{BaseJob: schema.JobDefaults} jobIn := schema.Job{
jobIn.StartTime = time.Unix(1608923076, 0) Exclusive: 1,
MonitoringStatus: schema.MonitoringStatusRunningOrArchiving,
}
jobIn.StartTime = 1608923076
jobIn.JobID = 1403244 jobIn.JobID = 1403244
jobIn.Cluster = "emmy" jobIn.Cluster = "emmy"
@ -165,8 +173,11 @@ func BenchmarkLoadJobDataCompressed(b *testing.B) {
var fsa FsArchive var fsa FsArchive
fsa.Init(json.RawMessage(archiveCfg)) fsa.Init(json.RawMessage(archiveCfg))
jobIn := schema.Job{BaseJob: schema.JobDefaults} jobIn := schema.Job{
jobIn.StartTime = time.Unix(1608923076, 0) Exclusive: 1,
MonitoringStatus: schema.MonitoringStatusRunningOrArchiving,
}
jobIn.StartTime = 1608923076
jobIn.JobID = 1403244 jobIn.JobID = 1403244
jobIn.Cluster = "emmy" jobIn.Cluster = "emmy"

View File

@ -69,8 +69,8 @@ func DecodeJobStats(r io.Reader, k string) (schema.ScopedJobStats, error) {
return nil, err return nil, err
} }
func DecodeJobMeta(r io.Reader) (*schema.JobMeta, error) { func DecodeJobMeta(r io.Reader) (*schema.Job, error) {
var d schema.JobMeta var d schema.Job
if err := json.NewDecoder(r).Decode(&d); err != nil { if err := json.NewDecoder(r).Decode(&d); err != nil {
log.Warn("Error while decoding raw job meta json") log.Warn("Error while decoding raw job meta json")
return &d, err return &d, err
@ -103,7 +103,7 @@ func EncodeJobData(w io.Writer, d *schema.JobData) error {
return nil return nil
} }
func EncodeJobMeta(w io.Writer, d *schema.JobMeta) error { func EncodeJobMeta(w io.Writer, d *schema.Job) error {
// Sanitize parameters // Sanitize parameters
if err := json.NewEncoder(w).Encode(d); err != nil { if err := json.NewEncoder(w).Encode(d); err != nil {
log.Warn("Error while encoding new job meta json") log.Warn("Error while encoding new job meta json")

View File

@ -61,7 +61,7 @@ func (nl *NodeList) PrintList() []string {
} }
func (nl *NodeList) NodeCount() int { func (nl *NodeList) NodeCount() int {
var out int = 0 out := 0
for _, term := range *nl { for _, term := range *nl {
if len(term) == 1 { // If only String-Part in Term: Single Node Name -> add one if len(term) == 1 { // If only String-Part in Term: Single Node Name -> add one
out += 1 out += 1
@ -160,7 +160,7 @@ func (nle NLExprIntRange) limits() []map[string]int {
m["start"] = int(nle.start) m["start"] = int(nle.start)
m["end"] = int(nle.end) m["end"] = int(nle.end)
m["digits"] = int(nle.digits) m["digits"] = int(nle.digits)
if nle.zeroPadded == true { if nle.zeroPadded {
m["zeroPadded"] = 1 m["zeroPadded"] = 1
} else { } else {
m["zeroPadded"] = 0 m["zeroPadded"] = 0
@ -183,14 +183,15 @@ func ParseNodeList(raw string) (NodeList, error) {
rawterms := []string{} rawterms := []string{}
prevterm := 0 prevterm := 0
for i := 0; i < len(raw); i++ { for i := 0; i < len(raw); i++ {
if raw[i] == '[' { switch raw[i] {
case '[':
for i < len(raw) && raw[i] != ']' { for i < len(raw) && raw[i] != ']' {
i++ i++
} }
if i == len(raw) { if i == len(raw) {
return nil, fmt.Errorf("ARCHIVE/NODELIST > unclosed '['") return nil, fmt.Errorf("ARCHIVE/NODELIST > unclosed '['")
} }
} else if raw[i] == ',' { case ',':
rawterms = append(rawterms, raw[prevterm:i]) rawterms = append(rawterms, raw[prevterm:i])
prevterm = i + 1 prevterm = i + 1
} }

View File

@ -45,31 +45,31 @@ type SubCluster struct {
ThreadsPerCore int `json:"threadsPerCore"` ThreadsPerCore int `json:"threadsPerCore"`
} }
type Metric struct {
Name string `json:"name"`
Unit Unit `json:"unit"`
Peak float64 `json:"peak"`
Normal float64 `json:"normal"`
Caution float64 `json:"caution"`
Alert float64 `json:"alert"`
}
type SubClusterConfig struct { type SubClusterConfig struct {
Name string `json:"name"` Metric
Footprint string `json:"footprint,omitempty"` Footprint string `json:"footprint,omitempty"`
Energy string `json:"energy"` Energy string `json:"energy"`
Peak float64 `json:"peak"` Remove bool `json:"remove"`
Normal float64 `json:"normal"` LowerIsBetter bool `json:"lowerIsBetter"`
Caution float64 `json:"caution"`
Alert float64 `json:"alert"`
Remove bool `json:"remove"`
LowerIsBetter bool `json:"lowerIsBetter"`
} }
type MetricConfig struct { type MetricConfig struct {
Unit Unit `json:"unit"` Metric
Energy string `json:"energy"` Energy string `json:"energy"`
Name string `json:"name"`
Scope MetricScope `json:"scope"` Scope MetricScope `json:"scope"`
Aggregation string `json:"aggregation"` Aggregation string `json:"aggregation"`
Footprint string `json:"footprint,omitempty"` Footprint string `json:"footprint,omitempty"`
SubClusters []*SubClusterConfig `json:"subClusters,omitempty"` SubClusters []*SubClusterConfig `json:"subClusters,omitempty"`
Peak float64 `json:"peak"`
Caution float64 `json:"caution"`
Alert float64 `json:"alert"`
Timestep int `json:"timestep"` Timestep int `json:"timestep"`
Normal float64 `json:"normal"`
LowerIsBetter bool `json:"lowerIsBetter"` LowerIsBetter bool `json:"lowerIsBetter"`
} }
@ -127,7 +127,7 @@ func (topo *Topology) GetSocketsFromHWThreads(
// those in the argument list are assigned to one of the sockets in the first // those in the argument list are assigned to one of the sockets in the first
// return value, return true as the second value. TODO: Optimize this, there // return value, return true as the second value. TODO: Optimize this, there
// must be a more efficient way/algorithm. // must be a more efficient way/algorithm.
func (topo *Topology) GetSocketsFromCores ( func (topo *Topology) GetSocketsFromCores(
cores []int, cores []int,
) (sockets []int, exclusive bool) { ) (sockets []int, exclusive bool) {
socketsMap := map[int]int{} socketsMap := map[int]int{}

View File

@ -89,6 +89,8 @@ type ResampleConfig struct {
} }
type CronFrequency struct { type CronFrequency struct {
// Duration Update Worker [Defaults to '2m']
CommitJobWorker string `json:"commit-job-worker"`
// Duration Update Worker [Defaults to '5m'] // Duration Update Worker [Defaults to '5m']
DurationWorker string `json:"duration-worker"` DurationWorker string `json:"duration-worker"`
// Metric-Footprint Update Worker [Defaults to '10m'] // Metric-Footprint Update Worker [Defaults to '10m']
@ -129,6 +131,8 @@ type ProgramConfig struct {
// do not write to the job-archive. // do not write to the job-archive.
DisableArchive bool `json:"disable-archive"` DisableArchive bool `json:"disable-archive"`
EnableJobTaggers bool `json:"enable-job-taggers"`
// Validate json input against schema // Validate json input against schema
Validate bool `json:"validate"` Validate bool `json:"validate"`
@ -150,7 +154,7 @@ type ProgramConfig struct {
// If overwritten, at least all the options in the defaults below must // If overwritten, at least all the options in the defaults below must
// be provided! Most options here can be overwritten by the user. // be provided! Most options here can be overwritten by the user.
UiDefaults map[string]interface{} `json:"ui-defaults"` UiDefaults map[string]any `json:"ui-defaults"`
// If exists, will enable dynamic zoom in frontend metric plots using the configured values // If exists, will enable dynamic zoom in frontend metric plots using the configured values
EnableResampling *ResampleConfig `json:"enable-resampling"` EnableResampling *ResampleConfig `json:"enable-resampling"`

View File

@ -8,43 +8,8 @@ import (
"errors" "errors"
"fmt" "fmt"
"io" "io"
"time"
) )
// BaseJob is the common part of the job metadata structs
//
// Common subset of Job and JobMeta. Use one of those, not this type directly.
type BaseJob struct {
Cluster string `json:"cluster" db:"cluster" example:"fritz"`
SubCluster string `json:"subCluster" db:"subcluster" example:"main"`
Partition string `json:"partition,omitempty" db:"cluster_partition" example:"main"`
Project string `json:"project" db:"project" example:"abcd200"`
User string `json:"user" db:"hpc_user" example:"abcd100h"`
State JobState `json:"jobState" db:"job_state" example:"completed" enums:"completed,failed,cancelled,stopped,timeout,out_of_memory"`
Tags []*Tag `json:"tags,omitempty"`
RawEnergyFootprint []byte `json:"-" db:"energy_footprint"`
RawFootprint []byte `json:"-" db:"footprint"`
RawMetaData []byte `json:"-" db:"meta_data"`
RawResources []byte `json:"-" db:"resources"`
Resources []*Resource `json:"resources"`
EnergyFootprint map[string]float64 `json:"energyFootprint"`
Footprint map[string]float64 `json:"footprint"`
MetaData map[string]string `json:"metaData"`
ConcurrentJobs JobLinkResultList `json:"concurrentJobs"`
Energy float64 `json:"energy" db:"energy"`
ArrayJobId int64 `json:"arrayJobId,omitempty" db:"array_job_id" example:"123000"`
Walltime int64 `json:"walltime,omitempty" db:"walltime" example:"86400" minimum:"1"`
JobID int64 `json:"jobId" db:"job_id" example:"123000"`
Duration int32 `json:"duration" db:"duration" example:"43200" minimum:"1"`
SMT int32 `json:"smt,omitempty" db:"smt" example:"4"`
MonitoringStatus int32 `json:"monitoringStatus,omitempty" db:"monitoring_status" example:"1" minimum:"0" maximum:"3"`
Exclusive int32 `json:"exclusive" db:"exclusive" example:"1" minimum:"0" maximum:"2"`
NumAcc int32 `json:"numAcc,omitempty" db:"num_acc" example:"2" minimum:"1"`
NumHWThreads int32 `json:"numHwthreads,omitempty" db:"num_hwthreads" example:"20" minimum:"1"`
NumNodes int32 `json:"numNodes" db:"num_nodes" example:"2" minimum:"1"`
}
// Job struct type // Job struct type
// //
// This type is used as the GraphQL interface and using sqlx as a table row. // This type is used as the GraphQL interface and using sqlx as a table row.
@ -52,10 +17,37 @@ type BaseJob struct {
// Job model // Job model
// @Description Information of a HPC job. // @Description Information of a HPC job.
type Job struct { type Job struct {
StartTime time.Time `json:"startTime"` Cluster string `json:"cluster" db:"cluster" example:"fritz"`
BaseJob SubCluster string `json:"subCluster" db:"subcluster" example:"main"`
ID int64 `json:"id" db:"id"` Partition string `json:"partition,omitempty" db:"cluster_partition" example:"main"`
StartTimeUnix int64 `json:"-" db:"start_time" example:"1649723812"` Project string `json:"project" db:"project" example:"abcd200"`
User string `json:"user" db:"hpc_user" example:"abcd100h"`
State JobState `json:"jobState" db:"job_state" example:"completed" enums:"completed,failed,cancelled,stopped,timeout,out_of_memory"`
Tags []*Tag `json:"tags,omitempty"`
RawEnergyFootprint []byte `json:"-" db:"energy_footprint"`
RawFootprint []byte `json:"-" db:"footprint"`
RawMetaData []byte `json:"-" db:"meta_data"`
RawResources []byte `json:"-" db:"resources"`
Resources []*Resource `json:"resources"`
EnergyFootprint map[string]float64 `json:"energyFootprint"`
Footprint map[string]float64 `json:"footprint"`
MetaData map[string]string `json:"metaData"`
ConcurrentJobs JobLinkResultList `json:"concurrentJobs"`
Energy float64 `json:"energy" db:"energy"`
ArrayJobId int64 `json:"arrayJobId,omitempty" db:"array_job_id" example:"123000"`
Walltime int64 `json:"walltime,omitempty" db:"walltime" example:"86400" minimum:"1"`
RequestedMemory int64 `json:"requestedMemory,omitempty" db:"requested_memory" example:"128000" minimum:"1"` // in MB
JobID int64 `json:"jobId" db:"job_id" example:"123000"`
Duration int32 `json:"duration" db:"duration" example:"43200" minimum:"1"`
SMT int32 `json:"smt,omitempty" db:"smt" example:"4"`
MonitoringStatus int32 `json:"monitoringStatus,omitempty" db:"monitoring_status" example:"1" minimum:"0" maximum:"3"`
Exclusive int32 `json:"exclusive" db:"exclusive" example:"1" minimum:"0" maximum:"2"`
NumAcc int32 `json:"numAcc,omitempty" db:"num_acc" example:"2" minimum:"1"`
NumHWThreads int32 `json:"numHwthreads,omitempty" db:"num_hwthreads" example:"20" minimum:"1"`
NumNodes int32 `json:"numNodes" db:"num_nodes" example:"2" minimum:"1"`
Statistics map[string]JobStatistics `json:"statistics"`
ID *int64 `json:"id,omitempty" db:"id"`
StartTime int64 `json:"startTime" db:"start_time" example:"1649723812"`
} }
// JobMeta struct type // JobMeta struct type
@ -68,6 +60,14 @@ type Job struct {
// *int64 `json:"id,omitempty"` >> never used in the job-archive, only // *int64 `json:"id,omitempty"` >> never used in the job-archive, only
// available via REST-API // available via REST-API
// //
// JobMeta model
// @Description Meta data information of a HPC job.
// type JobMeta struct {
// ID *int64 `json:"id,omitempty"`
// BaseJob
// Statistics map[string]JobStatistics `json:"statistics"`
// StartTime int64 `json:"startTime" db:"start_time" example:"1649723812" minimum:"1"`
// }
type JobLink struct { type JobLink struct {
ID int64 `json:"id"` ID int64 `json:"id"`
@ -79,15 +79,6 @@ type JobLinkResultList struct {
Count int `json:"count"` Count int `json:"count"`
} }
// JobMeta model
// @Description Meta data information of a HPC job.
type JobMeta struct {
ID *int64 `json:"id,omitempty"`
Statistics map[string]JobStatistics `json:"statistics"`
BaseJob
StartTime int64 `json:"startTime" db:"start_time" example:"1649723812" minimum:"1"`
}
const ( const (
MonitoringStatusDisabled int32 = 0 MonitoringStatusDisabled int32 = 0
MonitoringStatusRunningOrArchiving int32 = 1 MonitoringStatusRunningOrArchiving int32 = 1
@ -95,10 +86,10 @@ const (
MonitoringStatusArchivingSuccessful int32 = 3 MonitoringStatusArchivingSuccessful int32 = 3
) )
var JobDefaults BaseJob = BaseJob{ // var JobDefaults Job = Job{
Exclusive: 1, // Exclusive: 1,
MonitoringStatus: MonitoringStatusRunningOrArchiving, // MonitoringStatus: MonitoringStatusRunningOrArchiving,
} // }
type Unit struct { type Unit struct {
Base string `json:"base"` Base string `json:"base"`
@ -145,7 +136,12 @@ const (
JobStateOutOfMemory JobState = "out_of_memory" JobStateOutOfMemory JobState = "out_of_memory"
) )
func (e *JobState) UnmarshalGQL(v interface{}) error { func (j Job) GoString() string {
return fmt.Sprintf("Job{ID:%d, StartTime:%d, JobID:%v, BaseJob:%v}",
j.ID, j.StartTime, j.JobID, j)
}
func (e *JobState) UnmarshalGQL(v any) error {
str, ok := v.(string) str, ok := v.(string)
if !ok { if !ok {
return fmt.Errorf("SCHEMA/JOB > enums must be strings") return fmt.Errorf("SCHEMA/JOB > enums must be strings")

35
pkg/schema/node.go Normal file
View File

@ -0,0 +1,35 @@
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
package schema
type NodeState string
const (
NodeStateAllocated NodeState = "allocated"
NodeStateReserved NodeState = "reserved"
NodeStateIdle NodeState = "idle"
NodeStateMixed NodeState = "mixed"
NodeStateDown NodeState = "down"
NodeStateUnknown NodeState = "unknown"
)
type MonitoringState string
const (
MonitoringStateFull MonitoringState = "full"
MonitoringStatePartial MonitoringState = "partial"
MonitoringStateFailed MonitoringState = "failed"
)
type Node struct {
ID int64 `json:"id" db:"id"`
Hostname string `json:"hostname" db:"hostname" example:"fritz"`
Cluster string `json:"cluster" db:"cluster" example:"fritz"`
SubCluster string `json:"subCluster" db:"subcluster" example:"main"`
NodeState NodeState `json:"nodeState" db:"node_state" example:"completed" enums:"completed,failed,cancelled,stopped,timeout,out_of_memory"`
HealthState MonitoringState `json:"healthState" db:"health_state" example:"completed" enums:"completed,failed,cancelled,stopped,timeout,out_of_memory"`
RawMetaData []byte `json:"-" db:"meta_data"`
MetaData map[string]string `json:"metaData"`
}

View File

@ -6,6 +6,7 @@ package schema
import ( import (
"fmt" "fmt"
"slices"
"strings" "strings"
) )
@ -50,12 +51,7 @@ type User struct {
} }
func (u *User) HasProject(project string) bool { func (u *User) HasProject(project string) bool {
for _, p := range u.Projects { return slices.Contains(u.Projects, project)
if p == project {
return true
}
}
return false
} }
func GetRoleString(roleInt Role) string { func GetRoleString(roleInt Role) string {

View File

@ -28,12 +28,13 @@ const (
//go:embed schemas/* //go:embed schemas/*
var schemaFiles embed.FS var schemaFiles embed.FS
func Validate(k Kind, r io.Reader) (err error) { func Validate(k Kind, r io.Reader) error {
jsonschema.Loaders["embedfs"] = func(s string) (io.ReadCloser, error) { jsonschema.Loaders["embedfs"] = func(s string) (io.ReadCloser, error) {
f := filepath.Join("schemas", strings.Split(s, "//")[1]) f := filepath.Join("schemas", strings.Split(s, "//")[1])
return schemaFiles.Open(f) return schemaFiles.Open(f)
} }
var s *jsonschema.Schema var s *jsonschema.Schema
var err error
switch k { switch k {
case Meta: case Meta:
@ -54,7 +55,7 @@ func Validate(k Kind, r io.Reader) (err error) {
} }
var v interface{} var v interface{}
if err := json.NewDecoder(r).Decode(&v); err != nil { if err = json.NewDecoder(r).Decode(&v); err != nil {
log.Warnf("Error while decoding raw json schema: %#v", err) log.Warnf("Error while decoding raw json schema: %#v", err)
return err return err
} }