diff --git a/api/schema.graphqls b/api/schema.graphqls index 268a579..a7bafde 100644 --- a/api/schema.graphqls +++ b/api/schema.graphqls @@ -4,61 +4,78 @@ scalar Any scalar NullableFloat scalar MetricScope scalar JobState +scalar NodeState +scalar MonitoringState + +type Node { + id: ID! + hostname: String! + cluster: String! + subCluster: String! + nodeState: NodeState! + HealthState: MonitoringState! + metaData: Any +} + +type NodeStats { + state: String! + count: Int! +} type Job { - id: ID! - jobId: Int! - user: String! - project: String! - cluster: String! - subCluster: String! - startTime: Time! - duration: Int! - walltime: Int! - numNodes: Int! - numHWThreads: Int! - numAcc: Int! - energy: Float! - SMT: Int! - exclusive: Int! - partition: String! - arrayJobId: Int! + id: ID! + jobId: Int! + user: String! + project: String! + cluster: String! + subCluster: String! + startTime: Time! + duration: Int! + walltime: Int! + numNodes: Int! + numHWThreads: Int! + numAcc: Int! + energy: Float! + SMT: Int! + exclusive: Int! + partition: String! + arrayJobId: Int! monitoringStatus: Int! - state: JobState! - tags: [Tag!]! - resources: [Resource!]! - concurrentJobs: JobLinkResultList - footprint: [FootprintValue] - energyFootprint: [EnergyFootprintValue] - metaData: Any - userData: User + state: JobState! + tags: [Tag!]! + resources: [Resource!]! + concurrentJobs: JobLinkResultList + footprint: [FootprintValue] + energyFootprint: [EnergyFootprintValue] + metaData: Any + userData: User } type JobLink { - id: ID! - jobId: Int! + id: ID! + jobId: Int! } type Cluster { - name: String! - partitions: [String!]! # Slurm partitions - subClusters: [SubCluster!]! # Hardware partitions/subclusters + name: String! + partitions: [String!]! # Slurm partitions + subClusters: [SubCluster!]! # Hardware partitions/subclusters } type SubCluster { - name: String! - nodes: String! - numberOfNodes: Int! - processorType: String! - socketsPerNode: Int! - coresPerSocket: Int! - threadsPerCore: Int! - flopRateScalar: MetricValue! - flopRateSimd: MetricValue! + name: String! + nodes: String! + numberOfNodes: Int! + processorType: String! + socketsPerNode: Int! + coresPerSocket: Int! + threadsPerCore: Int! + flopRateScalar: MetricValue! + flopRateSimd: MetricValue! memoryBandwidth: MetricValue! - topology: Topology! - metricConfig: [MetricConfig!]! - footprint: [String!]! + topology: Topology! + metricConfig: [MetricConfig!]! + footprint: [String!]! } type FootprintValue { @@ -80,94 +97,94 @@ type MetricValue { } type Topology { - node: [Int!] - socket: [[Int!]!] + node: [Int!] + socket: [[Int!]!] memoryDomain: [[Int!]!] - die: [[Int!]!] - core: [[Int!]!] + die: [[Int!]!] + core: [[Int!]!] accelerators: [Accelerator!] } type Accelerator { - id: String! - type: String! + id: String! + type: String! model: String! } type SubClusterConfig { - name: String! - peak: Float - normal: Float + name: String! + peak: Float + normal: Float caution: Float - alert: Float - remove: Boolean + alert: Float + remove: Boolean } type MetricConfig { - name: String! - unit: Unit! - scope: MetricScope! + name: String! + unit: Unit! + scope: MetricScope! aggregation: String! - timestep: Int! - peak: Float! - normal: Float + timestep: Int! + peak: Float! + normal: Float caution: Float! - alert: Float! + alert: Float! lowerIsBetter: Boolean subClusters: [SubClusterConfig!]! } type Tag { - id: ID! + id: ID! type: String! name: String! scope: String! } type Resource { - hostname: String! - hwthreads: [Int!] - accelerators: [String!] 
+ hostname: String! + hwthreads: [Int!] + accelerators: [String!] configuration: String } type JobMetricWithName { - name: String! - scope: MetricScope! + name: String! + scope: MetricScope! metric: JobMetric! } type JobMetric { - unit: Unit - timestep: Int! - series: [Series!] + unit: Unit + timestep: Int! + series: [Series!] statisticsSeries: StatsSeries } type Series { - hostname: String! - id: String + hostname: String! + id: String statistics: MetricStatistics - data: [NullableFloat!]! + data: [NullableFloat!]! } type StatsSeries { - mean: [NullableFloat!]! + mean: [NullableFloat!]! median: [NullableFloat!]! - min: [NullableFloat!]! - max: [NullableFloat!]! + min: [NullableFloat!]! + max: [NullableFloat!]! } type NamedStatsWithScope { - name: String! - scope: MetricScope! - stats: [ScopedStats!]! + name: String! + scope: MetricScope! + stats: [ScopedStats!]! } type ScopedStats { - hostname: String! - id: String - data: MetricStatistics! + hostname: String! + id: String + data: MetricStatistics! } type JobStats { @@ -184,8 +201,8 @@ type JobStats { } type NamedStats { - name: String! - data: MetricStatistics! + name: String! + data: MetricStatistics! } type Unit { @@ -201,12 +218,12 @@ type MetricStatistics { type MetricFootprints { metric: String! - data: [NullableFloat!]! + data: [NullableFloat!]! } type Footprints { timeWeights: TimeWeights! - metrics: [MetricFootprints!]! + metrics: [MetricFootprints!]! } type TimeWeights { @@ -215,20 +232,33 @@ type TimeWeights { coreHours: [NullableFloat!]! } -enum Aggregate { USER, PROJECT, CLUSTER } -enum SortByAggregate { TOTALWALLTIME, TOTALJOBS, TOTALNODES, TOTALNODEHOURS, TOTALCORES, TOTALCOREHOURS, TOTALACCS, TOTALACCHOURS } +enum Aggregate { + USER + PROJECT + CLUSTER +} +enum SortByAggregate { + TOTALWALLTIME + TOTALJOBS + TOTALNODES + TOTALNODEHOURS + TOTALCORES + TOTALCOREHOURS + TOTALACCS + TOTALACCHOURS +} type NodeMetrics { - host: String! + host: String! subCluster: String! - metrics: [JobMetricWithName!]! + metrics: [JobMetricWithName!]! } type NodesResultList { - items: [NodeMetrics!]! + items: [NodeMetrics!]! offset: Int - limit: Int - count: Int + limit: Int + count: Int totalNodes: Int hasNextPage: Boolean } @@ -247,14 +277,14 @@ type GlobalMetricListItem { } type Count { - name: String! + name: String! count: Int! } type User { username: String! - name: String! - email: String! + name: String! + email: String! } input MetricStatItem { @@ -263,27 +293,81 @@ input MetricStatItem { } type Query { - clusters: [Cluster!]! # List of all clusters - tags: [Tag!]! # List of all tags - globalMetrics: [GlobalMetricListItem!]! + clusters: [Cluster!]! # List of all clusters + tags: [Tag!]! # List of all tags + globalMetrics: [GlobalMetricListItem!]! user(username: String!): User allocatedNodes(cluster: String!): [Count!]! - job(id: ID!): Job - jobMetrics(id: ID!, metrics: [String!], scopes: [MetricScope!], resolution: Int): [JobMetricWithName!]! - jobStats(id: ID!, metrics: [String!]): [NamedStats!]! - scopedJobStats(id: ID!, metrics: [String!], scopes: [MetricScope!]): [NamedStatsWithScope!]! + node(id: ID!): Node + nodes(filter: [NodeFilter!], order: OrderByInput): NodeStateResultList! + nodeStats(filter: [NodeFilter!]): [NodeStats!]! + + job(id: ID!): Job + jobMetrics( + id: ID! + metrics: [String!] + scopes: [MetricScope!] + resolution: Int + ): [JobMetricWithName!]! + + jobStats(id: ID!, metrics: [String!]): [NamedStats!]! + + scopedJobStats( + id: ID! + metrics: [String!] + scopes: [MetricScope!] + ): [NamedStatsWithScope!]! 
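+
+  # The new node queries above can be exercised with a minimal client-side
+  # document like the following sketch (filter values are placeholders, not
+  # part of this schema):
+  #
+  #   query {
+  #     nodeStats(filter: [{ cluster: { eq: "fritz" } }]) {
+  #       state
+  #       count
+  #     }
+  #   }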
+ + jobs( + filter: [JobFilter!] + page: PageRequest + order: OrderByInput + ): JobResultList! + + jobsStatistics( + filter: [JobFilter!] + metrics: [String!] + page: PageRequest + sortBy: SortByAggregate + groupBy: Aggregate + numDurationBins: String + numMetricBins: Int + ): [JobsStatistics!]! - jobs(filter: [JobFilter!], page: PageRequest, order: OrderByInput): JobResultList! - jobsStatistics(filter: [JobFilter!], metrics: [String!], page: PageRequest, sortBy: SortByAggregate, groupBy: Aggregate, numDurationBins: String, numMetricBins: Int): [JobsStatistics!]! jobsMetricStats(filter: [JobFilter!], metrics: [String!]): [JobStats!]! jobsFootprints(filter: [JobFilter!], metrics: [String!]!): Footprints - rooflineHeatmap(filter: [JobFilter!]!, rows: Int!, cols: Int!, minX: Float!, minY: Float!, maxX: Float!, maxY: Float!): [[Float!]!]! + rooflineHeatmap( + filter: [JobFilter!]! + rows: Int! + cols: Int! + minX: Float! + minY: Float! + maxX: Float! + maxY: Float! + ): [[Float!]!]! - nodeMetrics(cluster: String!, nodes: [String!], scopes: [MetricScope!], metrics: [String!], from: Time!, to: Time!): [NodeMetrics!]! - nodeMetricsList(cluster: String!, subCluster: String!, nodeFilter: String!, scopes: [MetricScope!], metrics: [String!], from: Time!, to: Time!, page: PageRequest, resolution: Int): NodesResultList! + nodeMetrics( + cluster: String! + nodes: [String!] + scopes: [MetricScope!] + metrics: [String!] + from: Time! + to: Time! + ): [NodeMetrics!]! + nodeMetricsList( + cluster: String! + subCluster: String! + nodeFilter: String! + scopes: [MetricScope!] + metrics: [String!] + from: Time! + to: Time! + page: PageRequest + resolution: Int + ): NodesResultList! } type Mutation { @@ -296,38 +380,53 @@ type Mutation { updateConfiguration(name: String!, value: String!): String } -type IntRangeOutput { from: Int!, to: Int! } -type TimeRangeOutput { range: String, from: Time!, to: Time! } +type IntRangeOutput { + from: Int! + to: Int! +} +type TimeRangeOutput { + range: String + from: Time! + to: Time! +} + +input NodeFilter { + hostname: StringInput + cluster: StringInput + subCluster: StringInput + nodeState: NodeState + healthState: MonitoringState +} input JobFilter { - tags: [ID!] - dbId: [ID!] - jobId: StringInput - arrayJobId: Int - user: StringInput - project: StringInput - jobName: StringInput - cluster: StringInput - partition: StringInput - duration: IntRange - energy: FloatRange + tags: [ID!] + dbId: [ID!] + jobId: StringInput + arrayJobId: Int + user: StringInput + project: StringInput + jobName: StringInput + cluster: StringInput + partition: StringInput + duration: IntRange + energy: FloatRange minRunningFor: Int - numNodes: IntRange + numNodes: IntRange numAccelerators: IntRange - numHWThreads: IntRange + numHWThreads: IntRange - startTime: TimeRange - state: [JobState!] + startTime: TimeRange + state: [JobState!] metricStats: [MetricStatItem!] - exclusive: Int - node: StringInput + exclusive: Int + node: StringInput } input OrderByInput { field: String! - type: String!, + type: String! order: SortDirectionEnum! = ASC } @@ -337,34 +436,46 @@ enum SortDirectionEnum { } input StringInput { - eq: String - neq: String - contains: String + eq: String + neq: String + contains: String startsWith: String - endsWith: String - in: [String!] + endsWith: String + in: [String!] } -input IntRange { from: Int!, to: Int! } -input TimeRange { range: String, from: Time, to: Time } +input IntRange { + from: Int! + to: Int! 
+}
+input TimeRange {
+  range: String
+  from: Time
+  to: Time
+}
 
 input FloatRange {
   from: Float!
   to: Float!
 }
 
+type NodeStateResultList {
+  items: [Node!]!
+  count: Int
+}
+
 type JobResultList {
-  items: [Job!]!
+  items: [Job!]!
   offset: Int
-  limit: Int
-  count: Int
+  limit: Int
+  count: Int
   hasNextPage: Boolean
 }
 
 type JobLinkResultList {
   listQuery: String
-  items: [JobLink!]!
-  count: Int
+  items: [JobLink!]!
+  count: Int
 }
 
 type HistoPoint {
@@ -386,27 +497,27 @@ type MetricHistoPoint {
   max: Int
 }
 
-type JobsStatistics {
-  id: ID! # If `groupBy` was used, ID of the user/project/cluster
-  name: String! # if User-Statistics: Given Name of Account (ID) Owner
-  totalJobs: Int! # Number of jobs
-  runningJobs: Int! # Number of running jobs
-  shortJobs: Int! # Number of jobs with a duration of less than duration
-  totalWalltime: Int! # Sum of the duration of all matched jobs in hours
-  totalNodes: Int! # Sum of the nodes of all matched jobs
-  totalNodeHours: Int! # Sum of the node hours of all matched jobs
-  totalCores: Int! # Sum of the cores of all matched jobs
-  totalCoreHours: Int! # Sum of the core hours of all matched jobs
-  totalAccs: Int! # Sum of the accs of all matched jobs
-  totalAccHours: Int! # Sum of the gpu hours of all matched jobs
-  histDuration: [HistoPoint!]! # value: hour, count: number of jobs with a rounded duration of value
-  histNumNodes: [HistoPoint!]! # value: number of nodes, count: number of jobs with that number of nodes
-  histNumCores: [HistoPoint!]! # value: number of cores, count: number of jobs with that number of cores
-  histNumAccs: [HistoPoint!]! # value: number of accs, count: number of jobs with that number of accs
-  histMetrics: [MetricHistoPoints!]! # metric: metricname, data array of histopoints: value: metric average bin, count: number of jobs with that metric average
+type JobsStatistics {
+  id: ID! # If `groupBy` was used, ID of the user/project/cluster
+  name: String! # If grouped by user: given name of the account (ID) owner
+  totalJobs: Int! # Number of jobs
+  runningJobs: Int! # Number of running jobs
+  shortJobs: Int! # Number of jobs shorter than the configured short-jobs duration threshold
+  totalWalltime: Int! # Sum of the duration of all matched jobs in hours
+  totalNodes: Int! # Sum of the nodes of all matched jobs
+  totalNodeHours: Int! # Sum of the node hours of all matched jobs
+  totalCores: Int! # Sum of the cores of all matched jobs
+  totalCoreHours: Int! # Sum of the core hours of all matched jobs
+  totalAccs: Int! # Sum of the accelerators of all matched jobs
+  totalAccHours: Int! # Sum of the accelerator hours of all matched jobs
+  histDuration: [HistoPoint!]! # value: hour, count: number of jobs with a rounded duration of value
+  histNumNodes: [HistoPoint!]! # value: number of nodes, count: number of jobs with that number of nodes
+  histNumCores: [HistoPoint!]! # value: number of cores, count: number of jobs with that number of cores
+  histNumAccs: [HistoPoint!]! # value: number of accelerators, count: number of jobs with that number of accelerators
+  histMetrics: [MetricHistoPoints!]! # metric: metric name; data: array of HistoPoints (value: metric average bin, count: number of jobs with that metric average)
 }
 
 input PageRequest {
   itemsPerPage: Int!
-  page: Int!
+  page: Int!
} diff --git a/cmd/cc-backend/cli.go b/cmd/cc-backend/cli.go index 8d9e7e6..8b826bb 100644 --- a/cmd/cc-backend/cli.go +++ b/cmd/cc-backend/cli.go @@ -7,8 +7,9 @@ package main import "flag" var ( - flagReinitDB, flagInit, flagServer, flagSyncLDAP, flagGops, flagMigrateDB, flagRevertDB, flagForceDB, flagDev, flagVersion, flagLogDateTime bool - flagNewUser, flagDelUser, flagGenJWT, flagConfigFile, flagImportJob, flagLogLevel string + flagReinitDB, flagInit, flagServer, flagSyncLDAP, flagGops, flagMigrateDB, flagRevertDB, + flagForceDB, flagDev, flagVersion, flagLogDateTime, flagApplyTags bool + flagNewUser, flagDelUser, flagGenJWT, flagConfigFile, flagImportJob, flagLogLevel string ) func cliInit() { @@ -21,6 +22,7 @@ func cliInit() { flag.BoolVar(&flagVersion, "version", false, "Show version information and exit") flag.BoolVar(&flagMigrateDB, "migrate-db", false, "Migrate database to supported version and exit") flag.BoolVar(&flagRevertDB, "revert-db", false, "Migrate database to previous version and exit") + flag.BoolVar(&flagApplyTags, "apply-tags", false, "Run taggers on all completed jobs and exit") flag.BoolVar(&flagForceDB, "force-db", false, "Force database version, clear dirty flag and exit") flag.BoolVar(&flagLogDateTime, "logdate", false, "Set this flag to add date and time to log messages") flag.StringVar(&flagConfigFile, "config", "./config.json", "Specify alternative path to `config.json`") diff --git a/cmd/cc-backend/main.go b/cmd/cc-backend/main.go index 4b6d7f9..ab07d28 100644 --- a/cmd/cc-backend/main.go +++ b/cmd/cc-backend/main.go @@ -19,7 +19,9 @@ import ( "github.com/ClusterCockpit/cc-backend/internal/importer" "github.com/ClusterCockpit/cc-backend/internal/metricdata" "github.com/ClusterCockpit/cc-backend/internal/repository" + "github.com/ClusterCockpit/cc-backend/internal/tagger" "github.com/ClusterCockpit/cc-backend/internal/taskManager" + "github.com/ClusterCockpit/cc-backend/internal/util" "github.com/ClusterCockpit/cc-backend/pkg/archive" "github.com/ClusterCockpit/cc-backend/pkg/log" "github.com/ClusterCockpit/cc-backend/pkg/runtimeEnv" @@ -211,11 +213,22 @@ func main() { } } + if config.Keys.EnableJobTaggers { + tagger.Init() + } + + if flagApplyTags { + if err := tagger.RunTaggers(); err != nil { + log.Abortf("Running job taggers.\nError: %s\n", err.Error()) + } + } + if !flagServer { log.Exit("No errors, server flag not set. 
Exiting cc-backend.") } archiver.Start(repository.GetJobRepository()) + taskManager.Start() serverInit() @@ -237,6 +250,8 @@ func main() { serverShutdown() + util.FsWatcherShutdown() + taskManager.Shutdown() }() diff --git a/go.mod b/go.mod index 98d1cab..f55412d 100644 --- a/go.mod +++ b/go.mod @@ -9,6 +9,8 @@ require ( github.com/ClusterCockpit/cc-units v0.4.0 github.com/Masterminds/squirrel v1.5.4 github.com/coreos/go-oidc/v3 v3.12.0 + github.com/expr-lang/expr v1.17.3 + github.com/fsnotify/fsnotify v1.9.0 github.com/go-co-op/gocron/v2 v2.16.0 github.com/go-ldap/ldap/v3 v3.4.10 github.com/go-sql-driver/mysql v1.9.0 @@ -18,8 +20,8 @@ require ( github.com/gorilla/handlers v1.5.2 github.com/gorilla/mux v1.8.1 github.com/gorilla/sessions v1.4.0 - github.com/influxdata/influxdb-client-go/v2 v2.14.0 github.com/jmoiron/sqlx v1.4.0 + github.com/joho/godotenv v1.5.1 github.com/mattn/go-sqlite3 v1.14.24 github.com/prometheus/client_golang v1.21.0 github.com/prometheus/common v0.62.0 @@ -39,7 +41,6 @@ require ( github.com/Azure/go-ntlmssp v0.0.0-20221128193559-754e69321358 // indirect github.com/KyleBanks/depth v1.2.1 // indirect github.com/agnivade/levenshtein v1.2.1 // indirect - github.com/apapsch/go-jsonmerge/v2 v2.0.0 // indirect github.com/beorn7/perks v1.0.1 // indirect github.com/cespare/xxhash/v2 v2.3.0 // indirect github.com/cpuguy83/go-md2man/v2 v2.0.6 // indirect @@ -57,8 +58,6 @@ require ( github.com/hashicorp/errwrap v1.1.0 // indirect github.com/hashicorp/go-multierror v1.1.1 // indirect github.com/hashicorp/golang-lru/v2 v2.0.7 // indirect - github.com/influxdata/line-protocol v0.0.0-20210922203350-b1ad95c89adf // indirect - github.com/joho/godotenv v1.5.1 // indirect github.com/jonboulle/clockwork v0.5.0 // indirect github.com/josharian/intern v1.0.0 // indirect github.com/jpillora/backoff v1.0.0 // indirect @@ -70,7 +69,6 @@ require ( github.com/modern-go/reflect2 v1.0.2 // indirect github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect github.com/mwitkow/go-conntrack v0.0.0-20190716064945-2f068394615f // indirect - github.com/oapi-codegen/runtime v1.1.1 // indirect github.com/prometheus/client_model v0.6.1 // indirect github.com/prometheus/procfs v0.15.1 // indirect github.com/robfig/cron/v3 v3.0.1 // indirect diff --git a/go.sum b/go.sum index a76e112..a935407 100644 --- a/go.sum +++ b/go.sum @@ -16,7 +16,6 @@ github.com/Microsoft/go-winio v0.6.2 h1:F2VQgta7ecxGYO8k3ZZz3RS8fVIXVxONVUPlNERo github.com/Microsoft/go-winio v0.6.2/go.mod h1:yd8OoFMLzJbo9gZq8j5qaps8bJ9aShtEA8Ipt1oGCvU= github.com/PuerkitoBio/goquery v1.9.3 h1:mpJr/ikUA9/GNJB/DBZcGeFDXUtosHRyRrwh7KGdTG0= github.com/PuerkitoBio/goquery v1.9.3/go.mod h1:1ndLHPdTz+DyQPICCWYlYQMPl0oXZj0G6D4LCYA6u4U= -github.com/RaveNoX/go-jsoncommentstrip v1.0.0/go.mod h1:78ihd09MekBnJnxpICcwzCMzGrKSKYe4AqU6PDYYpjk= github.com/agnivade/levenshtein v1.2.1 h1:EHBY3UOn1gwdy/VbFwgo4cxecRznFk7fKWN1KOX7eoM= github.com/agnivade/levenshtein v1.2.1/go.mod h1:QVVI16kDrtSuwcpd0p1+xMC6Z/VfhtCyDIjcwga4/DU= github.com/alexbrainman/sspi v0.0.0-20231016080023-1a75b4708caa h1:LHTHcTQiSGT7VVbI0o4wBRNQIgn917usHWOd6VAffYI= @@ -25,13 +24,10 @@ github.com/andreyvit/diff v0.0.0-20170406064948-c7f18ee00883 h1:bvNMNQO63//z+xNg github.com/andreyvit/diff v0.0.0-20170406064948-c7f18ee00883/go.mod h1:rCTlJbsFo29Kk6CurOXKm700vrz8f0KW0JNfpkRJY/8= github.com/andybalholm/cascadia v1.3.2 h1:3Xi6Dw5lHF15JtdcmAHD3i1+T8plmv7BQ/nsViSLyss= github.com/andybalholm/cascadia v1.3.2/go.mod h1:7gtRlve5FxPPgIgX36uWBX58OdBsSS6lUvCFb+h7KvU= 
-github.com/apapsch/go-jsonmerge/v2 v2.0.0 h1:axGnT1gRIfimI7gJifB699GoE/oq+F2MU7Dml6nw9rQ= -github.com/apapsch/go-jsonmerge/v2 v2.0.0/go.mod h1:lvDnEdqiQrp0O42VQGgmlKpxL1AP2+08jFMw88y4klk= github.com/arbovm/levenshtein v0.0.0-20160628152529-48b4e1c0c4d0 h1:jfIu9sQUG6Ig+0+Ap1h4unLjW6YQJpKZVmUzxsD4E/Q= github.com/arbovm/levenshtein v0.0.0-20160628152529-48b4e1c0c4d0/go.mod h1:t2tdKJDJF9BV14lnkjHmOQgcvEKgtqs5a1N3LNdJhGE= github.com/beorn7/perks v1.0.1 h1:VlbKKnNfV8bJzeqoa4cOKqO6bYr3WgKZxO8Z16+hsOM= github.com/beorn7/perks v1.0.1/go.mod h1:G2ZrVWU2WbWT9wwq4/hrbKbnv/1ERSJQ0ibhJ6rlkpw= -github.com/bmatcuk/doublestar v1.1.1/go.mod h1:UD6OnuiIn0yFxxA2le/rnRU1G4RaI4UvFv1sNto9p6w= github.com/cespare/xxhash/v2 v2.3.0 h1:UL815xU9SqsFlibzuggzjXhog7bL6oX9BbNZnL2UFvs= github.com/cespare/xxhash/v2 v2.3.0/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs= github.com/coreos/go-oidc/v3 v3.12.0 h1:sJk+8G2qq94rDI6ehZ71Bol3oUHy63qNYmkiSjrc/Jo= @@ -53,8 +49,12 @@ github.com/docker/go-connections v0.5.0 h1:USnMq7hx7gwdVZq1L49hLXaFtUdTADjXGp+uj github.com/docker/go-connections v0.5.0/go.mod h1:ov60Kzw0kKElRwhNs9UlUHAE/F9Fe6GLaXnqyDdmEXc= github.com/docker/go-units v0.5.0 h1:69rxXcBk27SvSaaxTtLh/8llcHD8vYHT7WSdRZ/jvr4= github.com/docker/go-units v0.5.0/go.mod h1:fgPhTUdO+D/Jk86RDLlptpiXQzgHJF7gydDDbaIK4Dk= +github.com/expr-lang/expr v1.17.3 h1:myeTTuDFz7k6eFe/JPlep/UsiIjVhG61FMHFu63U7j0= +github.com/expr-lang/expr v1.17.3/go.mod h1:8/vRC7+7HBzESEqt5kKpYXxrxkr31SaO8r40VO/1IT4= github.com/felixge/httpsnoop v1.0.4 h1:NFTV2Zj1bL4mc9sqWACXbQFVBBg2W3GPvqp8/ESS2Wg= github.com/felixge/httpsnoop v1.0.4/go.mod h1:m8KPJKqk1gH5J9DgRY2ASl2lWCfGKXixSwevea8zH2U= +github.com/fsnotify/fsnotify v1.9.0 h1:2Ml+OJNzbYCTzsxtv8vKSFD9PbJjmhYF14k/jKC7S9k= +github.com/fsnotify/fsnotify v1.9.0/go.mod h1:8jBTzvmWwFyi3Pb8djgCCO5IBqzKJ/Jwo8TRcHyHii0= github.com/go-asn1-ber/asn1-ber v1.5.7 h1:DTX+lbVTWaTw1hQ+PbZPlnDZPEIs0SS/GCZAl535dDk= github.com/go-asn1-ber/asn1-ber v1.5.7/go.mod h1:hEBeB/ic+5LoWskz+yKT7vGhhPYkProFKoKdwZRWMe0= github.com/go-co-op/gocron/v2 v2.16.0 h1:uqUF6WFZ4enRU45pWFNcn1xpDLc+jBOTKhPQI16Z1xs= @@ -119,10 +119,6 @@ github.com/hashicorp/go-uuid v1.0.3 h1:2gKiV6YVmrJ1i2CKKa9obLvRieoRGviZFL26PcT/C github.com/hashicorp/go-uuid v1.0.3/go.mod h1:6SBZvOh/SIDV7/2o3Jml5SYk/TvGqwFJ/bN7x4byOro= github.com/hashicorp/golang-lru/v2 v2.0.7 h1:a+bsQ5rvGLjzHuww6tVxozPZFVghXaHOwFs4luLUK2k= github.com/hashicorp/golang-lru/v2 v2.0.7/go.mod h1:QeFd9opnmA6QUJc5vARoKUSoFhyfM2/ZepoAG6RGpeM= -github.com/influxdata/influxdb-client-go/v2 v2.14.0 h1:AjbBfJuq+QoaXNcrova8smSjwJdUHnwvfjMF71M1iI4= -github.com/influxdata/influxdb-client-go/v2 v2.14.0/go.mod h1:Ahpm3QXKMJslpXl3IftVLVezreAUtBOTZssDrjZEFHI= -github.com/influxdata/line-protocol v0.0.0-20210922203350-b1ad95c89adf h1:7JTmneyiNEwVBOHSjoMxiWAqB992atOeepeFYegn5RU= -github.com/influxdata/line-protocol v0.0.0-20210922203350-b1ad95c89adf/go.mod h1:xaLFMmpvUxqXtVkUJfg9QmT88cDaCJ3ZKgdZ78oO8Qo= github.com/jcmturner/aescts/v2 v2.0.0 h1:9YKLH6ey7H4eDBXW8khjYslgyqG2xZikXP0EQFKrle8= github.com/jcmturner/aescts/v2 v2.0.0/go.mod h1:AiaICIRyfYg35RUkr8yESTqvSy7csK90qZ5xfvvsoNs= github.com/jcmturner/dnsutils/v2 v2.0.0 h1:lltnkeZGL0wILNvrNiVCR6Ro5PGU/SeBvVO/8c/iPbo= @@ -147,7 +143,6 @@ github.com/jpillora/backoff v1.0.0 h1:uvFg412JmmHBHw7iwprIxkPMI+sGQ4kzOWsMeHnm2E github.com/jpillora/backoff v1.0.0/go.mod h1:J/6gKK9jxlEcS3zixgDgUAsiuZ7yrSoa/FX5e0EB2j4= github.com/json-iterator/go v1.1.12 h1:PV8peI4a0ysnczrg+LtxykD8LfKY9ML6u2jnxaEnrnM= github.com/json-iterator/go v1.1.12/go.mod 
h1:e30LSqwooZae/UwlEbR2852Gd8hjQvJoHmT4TnhNGBo=
-github.com/juju/gnuflag v0.0.0-20171113085948-2ce1bb71843d/go.mod h1:2PavIy+JPciBPrBUjwbNvtwB6RQlve+hkpll6QSNmOE=
 github.com/klauspost/compress v1.17.11 h1:In6xLpyWOi1+C7tXUUWv2ot1QvBjxevKAaI6IXrJmUc=
 github.com/klauspost/compress v1.17.11/go.mod h1:pMDklpSncoRMuLFrf1W9Ss9KT+0rH90U12bZKk7uwG0=
 github.com/kr/pretty v0.3.1 h1:flRD4NNwYAUpkphVc1HcthR4KEIFJ65n8Mw5qdRn3LE=
@@ -182,8 +177,6 @@ github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 h1:C3w9PqII01/Oq
 github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822/go.mod h1:+n7T8mK8HuQTcFwEeznm/DIxMOiR9yIdICNftLE1DvQ=
 github.com/mwitkow/go-conntrack v0.0.0-20190716064945-2f068394615f h1:KUppIJq7/+SVif2QVs3tOP0zanoHgBEVAwHxUSIzRqU=
 github.com/mwitkow/go-conntrack v0.0.0-20190716064945-2f068394615f/go.mod h1:qRWi+5nqEBWmkhHvq77mSJWrCKwh8bxhgT7d/eI7P4U=
-github.com/oapi-codegen/runtime v1.1.1 h1:EXLHh0DXIJnWhdRPN2w4MXAzFyE4CskzhNLUmtpMYro=
-github.com/oapi-codegen/runtime v1.1.1/go.mod h1:SK9X900oXmPWilYR5/WKPzt3Kqxn/uS/+lbpREv+eCg=
 github.com/opencontainers/go-digest v1.0.0 h1:apOUWs51W5PlhuyGyz9FCeeBIOUDA/6nW8Oi/yOhh5U=
 github.com/opencontainers/go-digest v1.0.0/go.mod h1:0JzlMkj0TRzQZfJkVvzbP0HBR3IKzErnv2BNG4W4MAM=
 github.com/opencontainers/image-spec v1.1.0 h1:8SG7/vwALn54lVB/0yZ/MMwhFrPYtpEHQb2IpWsCzug=
@@ -215,7 +208,6 @@ github.com/sergi/go-diff v1.3.1 h1:xkr+Oxo4BOQKmkn/B9eMK0g5Kg/983T9DqqPHwYqD+8=
 github.com/sergi/go-diff v1.3.1/go.mod h1:aMJSSKb2lpPvRNec0+w3fl7LP9IOFzdc9Pa4NFbPK1I=
 github.com/sosodev/duration v1.3.1 h1:qtHBDMQ6lvMQsL15g4aopM4HEfOaYuhWBw3NPTtlqq4=
 github.com/sosodev/duration v1.3.1/go.mod h1:RQIBBX0+fMLc/D9+Jb/fwvVmo0eZvDDEERAikUR6SDg=
-github.com/spkg/bom v0.0.0-20160624110644-59b7046e48ad/go.mod h1:qLr4V1qq6nMqFKkMo8ZTx3f+BZEkzsRUY10Xsm2mwU0=
 github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
 github.com/stretchr/objx v0.4.0/go.mod h1:YvHI0jy2hoMjB+UWwv71VJQ9isScKT/TqJzVSSt89Yw=
 github.com/stretchr/objx v0.5.0/go.mod h1:Yh+to48EsGEfYuaHDzXPcE3xhTkx73EhmCGUpEOglKo=
diff --git a/gqlgen.yml b/gqlgen.yml
index ccd95ff..307a074 100644
--- a/gqlgen.yml
+++ b/gqlgen.yml
@@ -62,6 +62,11 @@ models:
     fields:
       partitions:
         resolver: true
+  Node:
+    model: "github.com/ClusterCockpit/cc-backend/pkg/schema.Node"
+    fields:
+      metaData:
+        resolver: true
   NullableFloat:
     { model: "github.com/ClusterCockpit/cc-backend/pkg/schema.Float" }
   MetricScope:
@@ -81,6 +86,10 @@ models:
     { model: "github.com/ClusterCockpit/cc-backend/pkg/schema.Resource" }
   JobState:
     { model: "github.com/ClusterCockpit/cc-backend/pkg/schema.JobState" }
+  NodeState:
+    { model: "github.com/ClusterCockpit/cc-backend/pkg/schema.NodeState" }
+  MonitoringState:
+    { model: "github.com/ClusterCockpit/cc-backend/pkg/schema.MonitoringState" }
   TimeRange:
     { model: "github.com/ClusterCockpit/cc-backend/pkg/schema.TimeRange" }
   IntRange:
diff --git a/internal/api/api_test.go b/internal/api/api_test.go
index e67813c..a938cb6 100644
--- a/internal/api/api_test.go
+++ b/internal/api/api_test.go
@@ -123,7 +123,7 @@ func setup(t *testing.T) *api.RestApi {
 		t.Fatal(err)
 	}
 
-	if err := os.WriteFile(filepath.Join(jobarchive, "version.txt"), []byte(fmt.Sprintf("%d", 2)), 0666); err != nil {
+	if err := os.WriteFile(filepath.Join(jobarchive, "version.txt"), fmt.Appendf(nil, "%d", 2), 0666); err != nil {
 		t.Fatal(err)
 	}
 
@@ -204,11 +204,11 @@ func TestRestApi(t *testing.T) {
 	restapi.MountApiRoutes(r)
 
 	var TestJobId int64 = 123
-	var TestClusterName string = "testcluster"
+	TestClusterName := "testcluster"
"testcluster" var TestStartTime int64 = 123456789 const startJobBody string = `{ - "jobId": 123, + "jobId": 123, "user": "testuser", "project": "testproj", "cluster": "testcluster", @@ -221,7 +221,6 @@ func TestRestApi(t *testing.T) { "exclusive": 1, "monitoringStatus": 1, "smt": 1, - "tags": [{ "type": "testTagType", "name": "testTagName", "scope": "testuser" }], "resources": [ { "hostname": "host123", @@ -252,16 +251,17 @@ func TestRestApi(t *testing.T) { if response.StatusCode != http.StatusCreated { t.Fatal(response.Status, recorder.Body.String()) } - resolver := graph.GetResolverInstance() + // resolver := graph.GetResolverInstance() + restapi.JobRepository.SyncJobs() job, err := restapi.JobRepository.Find(&TestJobId, &TestClusterName, &TestStartTime) if err != nil { t.Fatal(err) } - job.Tags, err = resolver.Job().Tags(ctx, job) - if err != nil { - t.Fatal(err) - } + // job.Tags, err = resolver.Job().Tags(ctx, job) + // if err != nil { + // t.Fatal(err) + // } if job.JobID != 123 || job.User != "testuser" || @@ -278,13 +278,13 @@ func TestRestApi(t *testing.T) { job.MonitoringStatus != 1 || job.SMT != 1 || !reflect.DeepEqual(job.Resources, []*schema.Resource{{Hostname: "host123", HWThreads: []int{0, 1, 2, 3, 4, 5, 6, 7}}}) || - job.StartTime.Unix() != 123456789 { + job.StartTime != 123456789 { t.Fatalf("unexpected job properties: %#v", job) } - if len(job.Tags) != 1 || job.Tags[0].Type != "testTagType" || job.Tags[0].Name != "testTagName" || job.Tags[0].Scope != "testuser" { - t.Fatalf("unexpected tags: %#v", job.Tags) - } + // if len(job.Tags) != 1 || job.Tags[0].Type != "testTagType" || job.Tags[0].Name != "testTagName" || job.Tags[0].Scope != "testuser" { + // t.Fatalf("unexpected tags: %#v", job.Tags) + // } }); !ok { return } @@ -352,7 +352,7 @@ func TestRestApi(t *testing.T) { t.Run("CheckDoubleStart", func(t *testing.T) { // Starting a job with the same jobId and cluster should only be allowed if the startTime is far appart! - body := strings.Replace(startJobBody, `"startTime": 123456789`, `"startTime": 123456790`, -1) + body := strings.ReplaceAll(startJobBody, `"startTime": 123456789`, `"startTime": 123456790`) req := httptest.NewRequest(http.MethodPost, "/jobs/start_job/", bytes.NewBuffer([]byte(body))) recorder := httptest.NewRecorder() @@ -402,6 +402,7 @@ func TestRestApi(t *testing.T) { } time.Sleep(1 * time.Second) + restapi.JobRepository.SyncJobs() const stopJobBodyFailed string = `{ "jobId": 12345, diff --git a/internal/api/cluster.go b/internal/api/cluster.go new file mode 100644 index 0000000..0529480 --- /dev/null +++ b/internal/api/cluster.go @@ -0,0 +1,70 @@ +// Copyright (C) NHR@FAU, University Erlangen-Nuremberg. +// All rights reserved. +// Use of this source code is governed by a MIT-style +// license that can be found in the LICENSE file. +package api + +import ( + "bufio" + "encoding/json" + "fmt" + "net/http" + + "github.com/ClusterCockpit/cc-backend/internal/repository" + "github.com/ClusterCockpit/cc-backend/pkg/archive" + "github.com/ClusterCockpit/cc-backend/pkg/schema" +) + +// GetClustersApiResponse model +type GetClustersApiResponse struct { + Clusters []*schema.Cluster `json:"clusters"` // Array of clusters +} + +// getClusters godoc +// @summary Lists all cluster configs +// @tags Cluster query +// @description Get a list of all cluster configs. Specific cluster can be requested using query parameter. 
+// @produce json +// @param cluster query string false "Job Cluster" +// @success 200 {object} api.GetClustersApiResponse "Array of clusters" +// @failure 400 {object} api.ErrorResponse "Bad Request" +// @failure 401 {object} api.ErrorResponse "Unauthorized" +// @failure 403 {object} api.ErrorResponse "Forbidden" +// @failure 500 {object} api.ErrorResponse "Internal Server Error" +// @security ApiKeyAuth +// @router /api/clusters/ [get] +func (api *RestApi) getClusters(rw http.ResponseWriter, r *http.Request) { + if user := repository.GetUserFromContext(r.Context()); user != nil && + !user.HasRole(schema.RoleApi) { + + handleError(fmt.Errorf("missing role: %v", schema.GetRoleString(schema.RoleApi)), http.StatusForbidden, rw) + return + } + + rw.Header().Add("Content-Type", "application/json") + bw := bufio.NewWriter(rw) + defer bw.Flush() + + var clusters []*schema.Cluster + + if r.URL.Query().Has("cluster") { + name := r.URL.Query().Get("cluster") + cluster := archive.GetCluster(name) + if cluster == nil { + handleError(fmt.Errorf("unknown cluster: %s", name), http.StatusBadRequest, rw) + return + } + clusters = append(clusters, cluster) + } else { + clusters = archive.Clusters + } + + payload := GetClustersApiResponse{ + Clusters: clusters, + } + + if err := json.NewEncoder(bw).Encode(payload); err != nil { + handleError(err, http.StatusInternalServerError, rw) + return + } +} diff --git a/internal/api/job.go b/internal/api/job.go new file mode 100644 index 0000000..1af6c38 --- /dev/null +++ b/internal/api/job.go @@ -0,0 +1,987 @@ +// Copyright (C) NHR@FAU, University Erlangen-Nuremberg. +// All rights reserved. +// Use of this source code is governed by a MIT-style +// license that can be found in the LICENSE file. +package api + +import ( + "bufio" + "database/sql" + "encoding/json" + "errors" + "fmt" + "net/http" + "strconv" + "strings" + "sync" + "time" + + "github.com/ClusterCockpit/cc-backend/internal/archiver" + "github.com/ClusterCockpit/cc-backend/internal/graph" + "github.com/ClusterCockpit/cc-backend/internal/graph/model" + "github.com/ClusterCockpit/cc-backend/internal/importer" + "github.com/ClusterCockpit/cc-backend/internal/metricDataDispatcher" + "github.com/ClusterCockpit/cc-backend/internal/repository" + "github.com/ClusterCockpit/cc-backend/pkg/archive" + "github.com/ClusterCockpit/cc-backend/pkg/log" + "github.com/ClusterCockpit/cc-backend/pkg/schema" + "github.com/gorilla/mux" +) + +// DefaultApiResponse model +type DefaultJobApiResponse struct { + Message string `json:"msg"` +} + +// StopJobApiRequest model +type StopJobApiRequest struct { + JobId *int64 `json:"jobId" example:"123000"` + Cluster *string `json:"cluster" example:"fritz"` + StartTime *int64 `json:"startTime" example:"1649723812"` + State schema.JobState `json:"jobState" validate:"required" example:"completed"` + StopTime int64 `json:"stopTime" validate:"required" example:"1649763839"` +} + +// DeleteJobApiRequest model +type DeleteJobApiRequest struct { + JobId *int64 `json:"jobId" validate:"required" example:"123000"` // Cluster Job ID of job + Cluster *string `json:"cluster" example:"fritz"` // Cluster of job + StartTime *int64 `json:"startTime" example:"1649723812"` // Start Time of job as epoch +} + +// GetJobsApiResponse model +type GetJobsApiResponse struct { + Jobs []*schema.Job `json:"jobs"` // Array of jobs + Items int `json:"items"` // Number of jobs returned + Page int `json:"page"` // Page id returned +} + +// ApiTag model +type ApiTag struct { + // Tag Type + Type string `json:"type" 
example:"Debug"` + Name string `json:"name" example:"Testjob"` // Tag Name + Scope string `json:"scope" example:"global"` // Tag Scope for Frontend Display +} + +// ApiMeta model +type EditMetaRequest struct { + Key string `json:"key" example:"jobScript"` + Value string `json:"value" example:"bash script"` +} + +type TagJobApiRequest []*ApiTag + +type GetJobApiRequest []string + +type GetJobApiResponse struct { + Meta *schema.Job + Data []*JobMetricWithName +} + +type GetCompleteJobApiResponse struct { + Meta *schema.Job + Data schema.JobData +} + +type JobMetricWithName struct { + Metric *schema.JobMetric `json:"metric"` + Name string `json:"name"` + Scope schema.MetricScope `json:"scope"` +} + +// getJobs godoc +// @summary Lists all jobs +// @tags Job query +// @description Get a list of all jobs. Filters can be applied using query parameters. +// @description Number of results can be limited by page. Results are sorted by descending startTime. +// @produce json +// @param state query string false "Job State" Enums(running, completed, failed, cancelled, stopped, timeout) +// @param cluster query string false "Job Cluster" +// @param start-time query string false "Syntax: '$from-$to', as unix epoch timestamps in seconds" +// @param items-per-page query int false "Items per page (Default: 25)" +// @param page query int false "Page Number (Default: 1)" +// @param with-metadata query bool false "Include metadata (e.g. jobScript) in response" +// @success 200 {object} api.GetJobsApiResponse "Job array and page info" +// @failure 400 {object} api.ErrorResponse "Bad Request" +// @failure 401 {object} api.ErrorResponse "Unauthorized" +// @failure 403 {object} api.ErrorResponse "Forbidden" +// @failure 500 {object} api.ErrorResponse "Internal Server Error" +// @security ApiKeyAuth +// @router /api/jobs/ [get] +func (api *RestApi) getJobs(rw http.ResponseWriter, r *http.Request) { + withMetadata := false + filter := &model.JobFilter{} + page := &model.PageRequest{ItemsPerPage: 25, Page: 1} + order := &model.OrderByInput{Field: "startTime", Type: "col", Order: model.SortDirectionEnumDesc} + + for key, vals := range r.URL.Query() { + switch key { + case "state": + for _, s := range vals { + state := schema.JobState(s) + if !state.Valid() { + handleError(fmt.Errorf("invalid query parameter value: state"), + http.StatusBadRequest, rw) + return + } + filter.State = append(filter.State, state) + } + case "cluster": + filter.Cluster = &model.StringInput{Eq: &vals[0]} + case "start-time": + st := strings.Split(vals[0], "-") + if len(st) != 2 { + handleError(fmt.Errorf("invalid query parameter value: startTime"), + http.StatusBadRequest, rw) + return + } + from, err := strconv.ParseInt(st[0], 10, 64) + if err != nil { + handleError(err, http.StatusBadRequest, rw) + return + } + to, err := strconv.ParseInt(st[1], 10, 64) + if err != nil { + handleError(err, http.StatusBadRequest, rw) + return + } + ufrom, uto := time.Unix(from, 0), time.Unix(to, 0) + filter.StartTime = &schema.TimeRange{From: &ufrom, To: &uto} + case "page": + x, err := strconv.Atoi(vals[0]) + if err != nil { + handleError(err, http.StatusBadRequest, rw) + return + } + page.Page = x + case "items-per-page": + x, err := strconv.Atoi(vals[0]) + if err != nil { + handleError(err, http.StatusBadRequest, rw) + return + } + page.ItemsPerPage = x + case "with-metadata": + withMetadata = true + default: + handleError(fmt.Errorf("invalid query parameter: %s", key), + http.StatusBadRequest, rw) + return + } + } + + jobs, err := 
api.JobRepository.QueryJobs(r.Context(), []*model.JobFilter{filter}, page, order) + if err != nil { + handleError(err, http.StatusInternalServerError, rw) + return + } + + results := make([]*schema.Job, 0, len(jobs)) + for _, job := range jobs { + if withMetadata { + if _, err = api.JobRepository.FetchMetadata(job); err != nil { + handleError(err, http.StatusInternalServerError, rw) + return + } + } + + job.Tags, err = api.JobRepository.GetTags(repository.GetUserFromContext(r.Context()), job.ID) + if err != nil { + handleError(err, http.StatusInternalServerError, rw) + return + } + + if job.MonitoringStatus == schema.MonitoringStatusArchivingSuccessful { + job.Statistics, err = archive.GetStatistics(job) + if err != nil { + handleError(err, http.StatusInternalServerError, rw) + return + } + } + + results = append(results, job) + } + + log.Debugf("/api/jobs: %d jobs returned", len(results)) + rw.Header().Add("Content-Type", "application/json") + bw := bufio.NewWriter(rw) + defer bw.Flush() + + payload := GetJobsApiResponse{ + Jobs: results, + Items: page.ItemsPerPage, + Page: page.Page, + } + + if err := json.NewEncoder(bw).Encode(payload); err != nil { + handleError(err, http.StatusInternalServerError, rw) + return + } +} + +// getCompleteJobById godoc +// @summary Get job meta and optional all metric data +// @tags Job query +// @description Job to get is specified by database ID +// @description Returns full job resource information according to 'JobMeta' scheme and all metrics according to 'JobData'. +// @produce json +// @param id path int true "Database ID of Job" +// @param all-metrics query bool false "Include all available metrics" +// @success 200 {object} api.GetJobApiResponse "Job resource" +// @failure 400 {object} api.ErrorResponse "Bad Request" +// @failure 401 {object} api.ErrorResponse "Unauthorized" +// @failure 403 {object} api.ErrorResponse "Forbidden" +// @failure 404 {object} api.ErrorResponse "Resource not found" +// @failure 422 {object} api.ErrorResponse "Unprocessable Entity: finding job failed: sql: no rows in result set" +// @failure 500 {object} api.ErrorResponse "Internal Server Error" +// @security ApiKeyAuth +// @router /api/jobs/{id} [get] +func (api *RestApi) getCompleteJobById(rw http.ResponseWriter, r *http.Request) { + // Fetch job from db + id, ok := mux.Vars(r)["id"] + var job *schema.Job + var err error + if ok { + id, e := strconv.ParseInt(id, 10, 64) + if e != nil { + handleError(fmt.Errorf("integer expected in path for id: %w", e), http.StatusBadRequest, rw) + return + } + + job, err = api.JobRepository.FindById(r.Context(), id) // Get Job from Repo by ID + } else { + handleError(fmt.Errorf("the parameter 'id' is required"), http.StatusBadRequest, rw) + return + } + if err != nil { + handleError(fmt.Errorf("finding job with db id %s failed: %w", id, err), http.StatusUnprocessableEntity, rw) + return + } + + job.Tags, err = api.JobRepository.GetTags(repository.GetUserFromContext(r.Context()), job.ID) + if err != nil { + handleError(err, http.StatusInternalServerError, rw) + return + + } + if _, err = api.JobRepository.FetchMetadata(job); err != nil { + + handleError(err, http.StatusInternalServerError, rw) + return + } + + var scopes []schema.MetricScope + + if job.NumNodes == 1 { + scopes = []schema.MetricScope{"core"} + } else { + scopes = []schema.MetricScope{"node"} + } + + var data schema.JobData + + metricConfigs := archive.GetCluster(job.Cluster).MetricConfig + resolution := 0 + + for _, mc := range metricConfigs { + resolution = 
max(resolution, mc.Timestep)
+	}
+
+	if r.URL.Query().Get("all-metrics") == "true" {
+		data, err = metricDataDispatcher.LoadData(job, nil, scopes, r.Context(), resolution)
+		if err != nil {
+			log.Warnf("REST: error while loading all-metrics job data for JobID %d on %s", job.JobID, job.Cluster)
+			handleError(err, http.StatusInternalServerError, rw)
+			return
+		}
+	}
+
+	log.Debugf("/api/job/%s: get job %d", id, job.JobID)
+	rw.Header().Add("Content-Type", "application/json")
+	bw := bufio.NewWriter(rw)
+	defer bw.Flush()
+
+	payload := GetCompleteJobApiResponse{
+		Meta: job,
+		Data: data,
+	}
+
+	if err := json.NewEncoder(bw).Encode(payload); err != nil {
+		handleError(err, http.StatusInternalServerError, rw)
+		return
+	}
+}
+
+// getJobById godoc
+// @summary Get job meta and configurable metric data
+// @tags Job query
+// @description Job to get is specified by database ID
+// @description Returns full job resource information according to 'JobMeta' scheme and all metrics according to 'JobData'.
+// @accept json
+// @produce json
+// @param id path int true "Database ID of Job"
+// @param request body api.GetJobApiRequest true "Array of metric names"
+// @success 200 {object} api.GetJobApiResponse "Job resource"
+// @failure 400 {object} api.ErrorResponse "Bad Request"
+// @failure 401 {object} api.ErrorResponse "Unauthorized"
+// @failure 403 {object} api.ErrorResponse "Forbidden"
+// @failure 404 {object} api.ErrorResponse "Resource not found"
+// @failure 422 {object} api.ErrorResponse "Unprocessable Entity: finding job failed: sql: no rows in result set"
+// @failure 500 {object} api.ErrorResponse "Internal Server Error"
+// @security ApiKeyAuth
+// @router /api/jobs/{id} [post]
+func (api *RestApi) getJobById(rw http.ResponseWriter, r *http.Request) {
+	// Fetch job from db
+	id, ok := mux.Vars(r)["id"]
+	var job *schema.Job
+	var err error
+	if ok {
+		id, e := strconv.ParseInt(id, 10, 64)
+		if e != nil {
+			handleError(fmt.Errorf("integer expected in path for id: %w", e), http.StatusBadRequest, rw)
+			return
+		}
+
+		job, err = api.JobRepository.FindById(r.Context(), id)
+	} else {
+		handleError(errors.New("the parameter 'id' is required"), http.StatusBadRequest, rw)
+		return
+	}
+	if err != nil {
+		handleError(fmt.Errorf("finding job with db id %s failed: %w", id, err), http.StatusUnprocessableEntity, rw)
+		return
+	}
+
+	job.Tags, err = api.JobRepository.GetTags(repository.GetUserFromContext(r.Context()), job.ID)
+	if err != nil {
+		handleError(err, http.StatusInternalServerError, rw)
+		return
+	}
+
+	if _, err = api.JobRepository.FetchMetadata(job); err != nil {
+		handleError(err, http.StatusInternalServerError, rw)
+		return
+	}
+
+	var metrics GetJobApiRequest
+	if err = decode(r.Body, &metrics); err != nil {
+		http.Error(rw, err.Error(), http.StatusBadRequest)
+		return
+	}
+
+	var scopes []schema.MetricScope
+
+	if job.NumNodes == 1 {
+		scopes = []schema.MetricScope{"core"}
+	} else {
+		scopes = []schema.MetricScope{"node"}
+	}
+
+	metricConfigs := archive.GetCluster(job.Cluster).MetricConfig
+	resolution := 0
+
+	for _, mc := range metricConfigs {
+		resolution = max(resolution, mc.Timestep)
+	}
+
+	data, err := metricDataDispatcher.LoadData(job, metrics, scopes, r.Context(), resolution)
+	if err != nil {
+		log.Warnf("REST: error while loading job data for JobID %d on %s", job.JobID, job.Cluster)
+		handleError(err, http.StatusInternalServerError, rw)
+		return
+	}
+
+	res := []*JobMetricWithName{}
+	for name, md := range data {
+		for scope, metric := range md {
+			res = append(res, &JobMetricWithName{
+				Name:   name,
+				Scope:  scope,
+				Metric: metric,
+			})
+		}
+	}
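+
+	// res now holds one entry per (metric, scope) pair; from here on the
+	// response is written as buffered JSON, mirroring getCompleteJobById.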
+	log.Debugf("/api/job/%s: get job %d", id, job.JobID)
+	rw.Header().Add("Content-Type", "application/json")
+	bw := bufio.NewWriter(rw)
+	defer bw.Flush()
+
+	payload := GetJobApiResponse{
+		Meta: job,
+		Data: res,
+	}
+
+	if err := json.NewEncoder(bw).Encode(payload); err != nil {
+		handleError(err, http.StatusInternalServerError, rw)
+		return
+	}
+}
+
+// editMeta godoc
+// @summary Edit meta-data json
+// @tags Job add and modify
+// @description Edit key value pairs in job metadata json
+// @description If a key already exists, its content will be overwritten
+// @accept json
+// @produce json
+// @param id path int true "Job Database ID"
+// @param request body api.EditMetaRequest true "Key-value pair to add"
+// @success 200 {object} schema.Job "Updated job resource"
+// @failure 400 {object} api.ErrorResponse "Bad Request"
+// @failure 401 {object} api.ErrorResponse "Unauthorized"
+// @failure 404 {object} api.ErrorResponse "Job does not exist"
+// @failure 500 {object} api.ErrorResponse "Internal Server Error"
+// @security ApiKeyAuth
+// @router /api/jobs/edit_meta/{id} [post]
+func (api *RestApi) editMeta(rw http.ResponseWriter, r *http.Request) {
+	id, err := strconv.ParseInt(mux.Vars(r)["id"], 10, 64)
+	if err != nil {
+		http.Error(rw, err.Error(), http.StatusBadRequest)
+		return
+	}
+
+	job, err := api.JobRepository.FindById(r.Context(), id)
+	if err != nil {
+		http.Error(rw, err.Error(), http.StatusNotFound)
+		return
+	}
+
+	var req EditMetaRequest
+	if err := decode(r.Body, &req); err != nil {
+		http.Error(rw, err.Error(), http.StatusBadRequest)
+		return
+	}
+
+	if err := api.JobRepository.UpdateMetadata(job, req.Key, req.Value); err != nil {
+		http.Error(rw, err.Error(), http.StatusInternalServerError)
+		return
+	}
+
+	rw.Header().Add("Content-Type", "application/json")
+	rw.WriteHeader(http.StatusOK)
+	json.NewEncoder(rw).Encode(job)
+}
+
+// tagJob godoc
+// @summary Adds one or more tags to a job
+// @tags Job add and modify
+// @description Adds tag(s) to a job specified by DB ID. Name and Type of Tag(s) can be chosen freely.
+// @description Tag Scope for frontend visibility will default to "global" if none entered, other options: "admin" or specific username.
+// @description If tagged job is already finished: Tag will be written directly to respective archive files.
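+// @description Example payload (tag values are illustrative only):
+// @description     [{ "type": "testTagType", "name": "testTagName", "scope": "global" }]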
+// @accept json +// @produce json +// @param id path int true "Job Database ID" +// @param request body api.TagJobApiRequest true "Array of tag-objects to add" +// @success 200 {object} schema.Job "Updated job resource" +// @failure 400 {object} api.ErrorResponse "Bad Request" +// @failure 401 {object} api.ErrorResponse "Unauthorized" +// @failure 404 {object} api.ErrorResponse "Job or tag does not exist" +// @failure 500 {object} api.ErrorResponse "Internal Server Error" +// @security ApiKeyAuth +// @router /api/jobs/tag_job/{id} [post] +func (api *RestApi) tagJob(rw http.ResponseWriter, r *http.Request) { + id, err := strconv.ParseInt(mux.Vars(r)["id"], 10, 64) + if err != nil { + http.Error(rw, err.Error(), http.StatusBadRequest) + return + } + + job, err := api.JobRepository.FindById(r.Context(), id) + if err != nil { + http.Error(rw, err.Error(), http.StatusNotFound) + return + } + + job.Tags, err = api.JobRepository.GetTags(repository.GetUserFromContext(r.Context()), job.ID) + if err != nil { + http.Error(rw, err.Error(), http.StatusInternalServerError) + return + } + + var req TagJobApiRequest + if err := decode(r.Body, &req); err != nil { + http.Error(rw, err.Error(), http.StatusBadRequest) + return + } + + for _, tag := range req { + tagId, err := api.JobRepository.AddTagOrCreate(repository.GetUserFromContext(r.Context()), *job.ID, tag.Type, tag.Name, tag.Scope) + if err != nil { + http.Error(rw, err.Error(), http.StatusInternalServerError) + return + } + + job.Tags = append(job.Tags, &schema.Tag{ + ID: tagId, + Type: tag.Type, + Name: tag.Name, + Scope: tag.Scope, + }) + } + + rw.Header().Add("Content-Type", "application/json") + rw.WriteHeader(http.StatusOK) + json.NewEncoder(rw).Encode(job) +} + +// removeTagJob godoc +// @summary Removes one or more tags from a job +// @tags Job add and modify +// @description Removes tag(s) from a job specified by DB ID. Name and Type of Tag(s) must match. +// @description Tag Scope is required for matching, options: "global", "admin". Private tags can not be deleted via API. +// @description If tagged job is already finished: Tag will be removed from respective archive files. 
+// @accept json
+// @produce json
+// @param id path int true "Job Database ID"
+// @param request body api.TagJobApiRequest true "Array of tag-objects to remove"
+// @success 200 {object} schema.Job "Updated job resource"
+// @failure 400 {object} api.ErrorResponse "Bad Request"
+// @failure 401 {object} api.ErrorResponse "Unauthorized"
+// @failure 404 {object} api.ErrorResponse "Job or tag does not exist"
+// @failure 500 {object} api.ErrorResponse "Internal Server Error"
+// @security ApiKeyAuth
+// @router /jobs/tag_job/{id} [delete]
+func (api *RestApi) removeTagJob(rw http.ResponseWriter, r *http.Request) {
+	id, err := strconv.ParseInt(mux.Vars(r)["id"], 10, 64)
+	if err != nil {
+		http.Error(rw, err.Error(), http.StatusBadRequest)
+		return
+	}
+
+	job, err := api.JobRepository.FindById(r.Context(), id)
+	if err != nil {
+		http.Error(rw, err.Error(), http.StatusNotFound)
+		return
+	}
+
+	job.Tags, err = api.JobRepository.GetTags(repository.GetUserFromContext(r.Context()), job.ID)
+	if err != nil {
+		http.Error(rw, err.Error(), http.StatusInternalServerError)
+		return
+	}
+
+	var req TagJobApiRequest
+	if err := decode(r.Body, &req); err != nil {
+		http.Error(rw, err.Error(), http.StatusBadRequest)
+		return
+	}
+
+	for _, rtag := range req {
+		// Only Global and Admin Tags
+		if rtag.Scope != "global" && rtag.Scope != "admin" {
+			log.Warnf("Cannot delete private tag for job %d: Skip", job.JobID)
+			continue
+		}
+
+		remainingTags, err := api.JobRepository.RemoveJobTagByRequest(repository.GetUserFromContext(r.Context()), *job.ID, rtag.Type, rtag.Name, rtag.Scope)
+		if err != nil {
+			http.Error(rw, err.Error(), http.StatusInternalServerError)
+			return
+		}
+
+		job.Tags = remainingTags
+	}
+
+	rw.Header().Add("Content-Type", "application/json")
+	rw.WriteHeader(http.StatusOK)
+	json.NewEncoder(rw).Encode(job)
+}
+
+// removeTags godoc
+// @summary Removes all tags and job-relations for type:name tuple
+// @tags Tag remove
+// @description Removes tags by type and name. Name and Type of Tag(s) must match.
+// @description Tag Scope is required for matching, options: "global", "admin". Private tags can not be deleted via API.
+// @description Tags will be removed from respective archive files.
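+// @description The request body is the same api.TagJobApiRequest array as for tag_job, e.g. (illustrative values):
+// @description     [{ "type": "testTagType", "name": "testTagName", "scope": "global" }]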
+// @accept json
+// @produce plain
+// @param request body api.TagJobApiRequest true "Array of tag-objects to remove"
+// @success 200 {string} string "Success Response"
+// @failure 400 {object} api.ErrorResponse "Bad Request"
+// @failure 401 {object} api.ErrorResponse "Unauthorized"
+// @failure 404 {object} api.ErrorResponse "Job or tag does not exist"
+// @failure 500 {object} api.ErrorResponse "Internal Server Error"
+// @security ApiKeyAuth
+// @router /tags/ [delete]
+func (api *RestApi) removeTags(rw http.ResponseWriter, r *http.Request) {
+	var req TagJobApiRequest
+	if err := decode(r.Body, &req); err != nil {
+		http.Error(rw, err.Error(), http.StatusBadRequest)
+		return
+	}
+
+	targetCount := len(req)
+	currentCount := 0
+	for _, rtag := range req {
+		// Only Global and Admin Tags
+		if rtag.Scope != "global" && rtag.Scope != "admin" {
+			log.Warn("Cannot delete private tag: Skip")
+			continue
+		}
+
+		err := api.JobRepository.RemoveTagByRequest(rtag.Type, rtag.Name, rtag.Scope)
+		if err != nil {
+			http.Error(rw, err.Error(), http.StatusInternalServerError)
+			return
+		} else {
+			currentCount++
+		}
+	}
+
+	rw.WriteHeader(http.StatusOK)
+	fmt.Fprintf(rw, "Deleted Tags from DB: %d successful of %d requested\n", currentCount, targetCount)
+}
+
+// startJob godoc
+// @summary Adds a new job as "running"
+// @tags Job add and modify
+// @description Job specified in request body will be saved to database as "running" with new DB ID.
+// @description Job specifications follow the 'JobMeta' scheme, API will fail to execute if requirements are not met.
+// @accept json
+// @produce json
+// @param request body schema.JobMeta true "Job to add"
+// @success 201 {object} api.DefaultJobApiResponse "Job added successfully"
+// @failure 400 {object} api.ErrorResponse "Bad Request"
+// @failure 401 {object} api.ErrorResponse "Unauthorized"
+// @failure 403 {object} api.ErrorResponse "Forbidden"
+// @failure 422 {object} api.ErrorResponse "Unprocessable Entity: The combination of jobId, clusterId and startTime already exists"
+// @failure 500 {object} api.ErrorResponse "Internal Server Error"
+// @security ApiKeyAuth
+// @router /api/jobs/start_job/ [post]
+func (api *RestApi) startJob(rw http.ResponseWriter, r *http.Request) {
+	req := schema.Job{
+		Exclusive:        1,
+		MonitoringStatus: schema.MonitoringStatusRunningOrArchiving,
+	}
+	if err := decode(r.Body, &req); err != nil {
+		handleError(fmt.Errorf("parsing request body failed: %w", err), http.StatusBadRequest, rw)
+		return
+	}
+
+	log.Printf("REST: %s\n", req.GoString())
+	req.State = schema.JobStateRunning
+
+	if err := importer.SanityChecks(&req); err != nil {
+		handleError(err, http.StatusBadRequest, rw)
+		return
+	}
+
+	// acquire lock to avoid race condition between API calls
+	var unlockOnce sync.Once
+	api.RepositoryMutex.Lock()
+	defer unlockOnce.Do(api.RepositoryMutex.Unlock)
+
+	// Check if combination of (job_id, cluster_id, start_time) already exists:
+	jobs, err := api.JobRepository.FindAll(&req.JobID, &req.Cluster, nil)
+	if err != nil && err != sql.ErrNoRows {
+		handleError(fmt.Errorf("checking for duplicate failed: %w", err), http.StatusInternalServerError, rw)
+		return
+	} else if err == nil {
+		for _, job := range jobs {
+			if (req.StartTime - job.StartTime) < 86400 {
+				handleError(fmt.Errorf("a job with that jobId, cluster and startTime already exists: dbid: %d, jobid: %d", *job.ID, job.JobID), http.StatusUnprocessableEntity, rw)
+				return
+			}
+		}
+	}
+
+	id, err := api.JobRepository.Start(&req)
+	if err != nil {
handleError(fmt.Errorf("insert into database failed: %w", err), http.StatusInternalServerError, rw) + return + } + // unlock here, adding Tags can be async + unlockOnce.Do(api.RepositoryMutex.Unlock) + + for _, tag := range req.Tags { + if _, err := api.JobRepository.AddTagOrCreate(repository.GetUserFromContext(r.Context()), id, tag.Type, tag.Name, tag.Scope); err != nil { + http.Error(rw, err.Error(), http.StatusInternalServerError) + handleError(fmt.Errorf("adding tag to new job %d failed: %w", id, err), http.StatusInternalServerError, rw) + return + } + } + + log.Printf("new job (id: %d): cluster=%s, jobId=%d, user=%s, startTime=%d", id, req.Cluster, req.JobID, req.User, req.StartTime) + rw.Header().Add("Content-Type", "application/json") + rw.WriteHeader(http.StatusCreated) + json.NewEncoder(rw).Encode(DefaultJobApiResponse{ + Message: "success", + }) +} + +// stopJobByRequest godoc +// @summary Marks job as completed and triggers archiving +// @tags Job add and modify +// @description Job to stop is specified by request body. All fields are required in this case. +// @description Returns full job resource information according to 'JobMeta' scheme. +// @produce json +// @param request body api.StopJobApiRequest true "All fields required" +// @success 200 {object} schema.JobMeta "Success message" +// @failure 400 {object} api.ErrorResponse "Bad Request" +// @failure 401 {object} api.ErrorResponse "Unauthorized" +// @failure 403 {object} api.ErrorResponse "Forbidden" +// @failure 404 {object} api.ErrorResponse "Resource not found" +// @failure 422 {object} api.ErrorResponse "Unprocessable Entity: job has already been stopped" +// @failure 500 {object} api.ErrorResponse "Internal Server Error" +// @security ApiKeyAuth +// @router /api/jobs/stop_job/ [post] +func (api *RestApi) stopJobByRequest(rw http.ResponseWriter, r *http.Request) { + // Parse request body + req := StopJobApiRequest{} + if err := decode(r.Body, &req); err != nil { + handleError(fmt.Errorf("parsing request body failed: %w", err), http.StatusBadRequest, rw) + return + } + + // Fetch job (that will be stopped) from db + var job *schema.Job + var err error + if req.JobId == nil { + handleError(errors.New("the field 'jobId' is required"), http.StatusBadRequest, rw) + return + } + + // log.Printf("loading db job for stopJobByRequest... : stopJobApiRequest=%v", req) + job, err = api.JobRepository.Find(req.JobId, req.Cluster, req.StartTime) + if err != nil { + job, err = api.JobRepository.FindCached(req.JobId, req.Cluster, req.StartTime) + // FIXME: Previous error is hidden + if err != nil { + handleError(fmt.Errorf("finding job failed: %w", err), http.StatusUnprocessableEntity, rw) + return + } + } + + api.checkAndHandleStopJob(rw, job, req) +} + +// deleteJobById godoc +// @summary Remove a job from the sql database +// @tags Job remove +// @description Job to remove is specified by database ID. This will not remove the job from the job archive. 
+
+// deleteJobById godoc
+// @summary Remove a job from the sql database
+// @tags Job remove
+// @description Job to remove is specified by database ID. This will not remove the job from the job archive.
+// @produce json
+// @param id path int true "Database ID of Job"
+// @success 200 {object} api.DefaultJobApiResponse "Success message"
+// @failure 400 {object} api.ErrorResponse "Bad Request"
+// @failure 401 {object} api.ErrorResponse "Unauthorized"
+// @failure 403 {object} api.ErrorResponse "Forbidden"
+// @failure 404 {object} api.ErrorResponse "Resource not found"
+// @failure 422 {object} api.ErrorResponse "Unprocessable Entity: finding job failed: sql: no rows in result set"
+// @failure 500 {object} api.ErrorResponse "Internal Server Error"
+// @security ApiKeyAuth
+// @router /api/jobs/delete_job/{id} [delete]
+func (api *RestApi) deleteJobById(rw http.ResponseWriter, r *http.Request) {
+	// Parse database ID from request path
+	id, ok := mux.Vars(r)["id"]
+	var err error
+	if ok {
+		id, e := strconv.ParseInt(id, 10, 64)
+		if e != nil {
+			handleError(fmt.Errorf("integer expected in path for id: %w", e), http.StatusBadRequest, rw)
+			return
+		}
+
+		err = api.JobRepository.DeleteJobById(id)
+	} else {
+		handleError(errors.New("the parameter 'id' is required"), http.StatusBadRequest, rw)
+		return
+	}
+	if err != nil {
+		handleError(fmt.Errorf("deleting job failed: %w", err), http.StatusUnprocessableEntity, rw)
+		return
+	}
+	rw.Header().Add("Content-Type", "application/json")
+	rw.WriteHeader(http.StatusOK)
+	json.NewEncoder(rw).Encode(DefaultJobApiResponse{
+		Message: fmt.Sprintf("Successfully deleted job %s", id),
+	})
+}
+
+// deleteJobByRequest godoc
+// @summary Remove a job from the sql database
+// @tags Job remove
+// @description Job to delete is specified by request body. All fields are required in this case.
+// @accept json
+// @produce json
+// @param request body api.DeleteJobApiRequest true "All fields required"
+// @success 200 {object} api.DefaultJobApiResponse "Success message"
+// @failure 400 {object} api.ErrorResponse "Bad Request"
+// @failure 401 {object} api.ErrorResponse "Unauthorized"
+// @failure 403 {object} api.ErrorResponse "Forbidden"
+// @failure 404 {object} api.ErrorResponse "Resource not found"
+// @failure 422 {object} api.ErrorResponse "Unprocessable Entity: finding job failed: sql: no rows in result set"
+// @failure 500 {object} api.ErrorResponse "Internal Server Error"
+// @security ApiKeyAuth
+// @router /api/jobs/delete_job/ [delete]
+func (api *RestApi) deleteJobByRequest(rw http.ResponseWriter, r *http.Request) {
+	// Parse request body
+	req := DeleteJobApiRequest{}
+	if err := decode(r.Body, &req); err != nil {
+		handleError(fmt.Errorf("parsing request body failed: %w", err), http.StatusBadRequest, rw)
+		return
+	}
+
+	// Fetch job (that will be deleted) from db
+	var job *schema.Job
+	var err error
+	if req.JobId == nil {
+		handleError(errors.New("the field 'jobId' is required"), http.StatusBadRequest, rw)
+		return
+	}
+
+	job, err = api.JobRepository.Find(req.JobId, req.Cluster, req.StartTime)
+	if err != nil {
+		handleError(fmt.Errorf("finding job failed: %w", err), http.StatusUnprocessableEntity, rw)
+		return
+	}
+
+	err = api.JobRepository.DeleteJobById(*job.ID)
+	if err != nil {
+		handleError(fmt.Errorf("deleting job failed: %w", err), http.StatusUnprocessableEntity, rw)
+		return
+	}
+
+	rw.Header().Add("Content-Type", "application/json")
+	rw.WriteHeader(http.StatusOK)
+	json.NewEncoder(rw).Encode(DefaultJobApiResponse{
+		Message: fmt.Sprintf("Successfully deleted job %d", *job.ID),
+	})
+}
+
+// deleteJobBefore godoc
+// @summary Remove a job from the sql database
+// @tags Job remove
+// @description Remove all jobs with a start time before the given timestamp. The jobs will not be removed from the job archive.
+// @produce json
+// @param ts path int true "Unix epoch timestamp"
+// @success 200 {object} api.DefaultJobApiResponse "Success message"
+// @failure 400 {object} api.ErrorResponse "Bad Request"
+// @failure 401 {object} api.ErrorResponse "Unauthorized"
+// @failure 403 {object} api.ErrorResponse "Forbidden"
+// @failure 404 {object} api.ErrorResponse "Resource not found"
+// @failure 422 {object} api.ErrorResponse "Unprocessable Entity: finding job failed: sql: no rows in result set"
+// @failure 500 {object} api.ErrorResponse "Internal Server Error"
+// @security ApiKeyAuth
+// @router /api/jobs/delete_job_before/{ts} [delete]
+func (api *RestApi) deleteJobBefore(rw http.ResponseWriter, r *http.Request) {
+	var cnt int
+	// Parse timestamp from request path
+	id, ok := mux.Vars(r)["ts"]
+	var err error
+	if ok {
+		ts, e := strconv.ParseInt(id, 10, 64)
+		if e != nil {
+			handleError(fmt.Errorf("integer expected in path for ts: %w", e), http.StatusBadRequest, rw)
+			return
+		}
+
+		cnt, err = api.JobRepository.DeleteJobsBefore(ts)
+	} else {
+		handleError(errors.New("the parameter 'ts' is required"), http.StatusBadRequest, rw)
+		return
+	}
+	if err != nil {
+		handleError(fmt.Errorf("deleting jobs failed: %w", err), http.StatusUnprocessableEntity, rw)
+		return
+	}
+
+	rw.Header().Add("Content-Type", "application/json")
+	rw.WriteHeader(http.StatusOK)
+	json.NewEncoder(rw).Encode(DefaultJobApiResponse{
+		Message: fmt.Sprintf("Successfully deleted %d jobs", cnt),
+	})
+}
+
+func (api *RestApi) checkAndHandleStopJob(rw http.ResponseWriter, job *schema.Job, req StopJobApiRequest) {
+	// Sanity checks; the nil check must run before job is dereferenced
+	if job == nil {
+		handleError(errors.New("stopping job failed: job is nil"), http.StatusInternalServerError, rw)
+		return
+	}
+
+	if job.State != schema.JobStateRunning {
+		handleError(fmt.Errorf("jobId %d (id %d) on %s : job has already been stopped (state is: %s)", job.JobID, *job.ID, job.Cluster, job.State), http.StatusUnprocessableEntity, rw)
+		return
+	}
+
+	if job.StartTime > req.StopTime {
+		handleError(fmt.Errorf("jobId %d (id %d) on %s : stopTime %d must be greater than or equal to startTime %d", job.JobID, *job.ID, job.Cluster, req.StopTime, job.StartTime), http.StatusBadRequest, rw)
+		return
+	}
+
+	if req.State != "" && !req.State.Valid() {
+		handleError(fmt.Errorf("jobId %d (id %d) on %s : invalid requested job state: %#v", job.JobID, *job.ID, job.Cluster, req.State), http.StatusBadRequest, rw)
+		return
+	} else if req.State == "" {
+		req.State = schema.JobStateCompleted
+	}
+
+	// Mark job as stopped in the database (update state and duration)
+	job.Duration = int32(req.StopTime - job.StartTime)
+	job.State = req.State
+	api.JobRepository.Mutex.Lock()
+	if err := api.JobRepository.Stop(*job.ID, job.Duration, job.State, job.MonitoringStatus); err != nil {
+		if err := api.JobRepository.StopCached(*job.ID, job.Duration, job.State, job.MonitoringStatus); err != nil {
+			api.JobRepository.Mutex.Unlock()
+			handleError(fmt.Errorf("jobId %d (id %d) on %s : marking job as '%s' (duration: %d) in DB failed: %w", job.JobID, *job.ID, job.Cluster, job.State, job.Duration, err), http.StatusInternalServerError, rw)
+			return
+		}
+	}
+	api.JobRepository.Mutex.Unlock()
+
+	log.Printf("archiving job... (dbid: %d): cluster=%s, jobId=%d, user=%s, startTime=%d, duration=%d, state=%s", *job.ID, job.Cluster, job.JobID, job.User, job.StartTime, job.Duration, job.State)
+
+	// Send a response (with status OK). This means that errors that happen from here on forward
+	// can *NOT* be communicated to the client. If reading from a MetricDataRepository or
+	// writing to the filesystem fails, the client will not know.
+	rw.Header().Add("Content-Type", "application/json")
+	rw.WriteHeader(http.StatusOK)
+	json.NewEncoder(rw).Encode(job)
+
+	// Monitoring is disabled...
+	if job.MonitoringStatus == schema.MonitoringStatusDisabled {
+		return
+	}
+
+	// Trigger async archiving
+	archiver.TriggerArchiving(job)
+}
+
+func (api *RestApi) getJobMetrics(rw http.ResponseWriter, r *http.Request) {
+	id := mux.Vars(r)["id"]
+	metrics := r.URL.Query()["metric"]
+	var scopes []schema.MetricScope
+	for _, scope := range r.URL.Query()["scope"] {
+		var s schema.MetricScope
+		if err := s.UnmarshalGQL(scope); err != nil {
+			http.Error(rw, err.Error(), http.StatusBadRequest)
+			return
+		}
+		scopes = append(scopes, s)
+	}
+
+	rw.Header().Add("Content-Type", "application/json")
+	rw.WriteHeader(http.StatusOK)
+
+	type Response struct {
+		Data *struct {
+			JobMetrics []*model.JobMetricWithName `json:"jobMetrics"`
+		} `json:"data"`
+		Error *struct {
+			Message string `json:"message"`
+		} `json:"error"`
+	}
+
+	resolver := graph.GetResolverInstance()
+	data, err := resolver.Query().JobMetrics(r.Context(), id, metrics, scopes, nil)
+	if err != nil {
+		json.NewEncoder(rw).Encode(Response{
+			Error: &struct {
+				Message string "json:\"message\""
+			}{Message: err.Error()},
+		})
+		return
+	}
+
+	json.NewEncoder(rw).Encode(Response{
+		Data: &struct {
+			JobMetrics []*model.JobMetricWithName "json:\"jobMetrics\""
+		}{JobMetrics: data},
+	})
+}
diff --git a/internal/api/node.go b/internal/api/node.go
new file mode 100644
index 0000000..7a582ed
--- /dev/null
+++ b/internal/api/node.go
@@ -0,0 +1,74 @@
+// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
+// All rights reserved.
+// Use of this source code is governed by a MIT-style
+// license that can be found in the LICENSE file.
+package api
+
+import (
+	"fmt"
+	"net/http"
+	"strings"
+
+	"github.com/ClusterCockpit/cc-backend/internal/repository"
+	"github.com/ClusterCockpit/cc-backend/pkg/schema"
+)
+
+type Node struct {
+	Name   string   `json:"hostname"`
+	States []string `json:"states"`
+}
+
+// UpdateNodeStatesRequest model
+type UpdateNodeStatesRequest struct {
+	Nodes   []Node `json:"nodes"`
+	Cluster string `json:"cluster" example:"fritz"`
+}
+
+// This routine assumes that only one of these states applies per node
+func determineState(states []string) schema.NodeState {
+	for _, state := range states {
+		switch strings.ToLower(state) {
+		case "allocated":
+			return schema.NodeStateAllocated
+		case "reserved":
+			return schema.NodeStateReserved
+		case "idle":
+			return schema.NodeStateIdle
+		case "down":
+			return schema.NodeStateDown
+		case "mixed":
+			return schema.NodeStateMixed
+		}
+	}
+
+	return schema.NodeStateUnknown
+}
+
+// updateNodeStates godoc
+// @summary Deliver updated Slurm node states
+// @tags node
+// @description Updates the node states for the given cluster.
+// @description The request body contains a list of nodes, each with one or more raw states that are mapped to a single node state.
+// @produce json
+// @param request body UpdateNodeStatesRequest true "Request body containing nodes and their states"
+// @success 200 {array} api.SuccessResponse "Success"
+// @failure 400 {string} string "Bad Request"
+// @failure 401 {string} string "Unauthorized"
+// @failure 403 {string} string "Forbidden"
+// @failure 500 {string} string "Internal Server Error"
+// @security ApiKeyAuth
+// @router /api/nodestate/ [post]
+func (api *RestApi) updateNodeStates(rw http.ResponseWriter, r *http.Request) {
+	// Parse request body
+	req := UpdateNodeStatesRequest{}
+	if err := decode(r.Body, &req); err != nil {
+		handleError(fmt.Errorf("parsing request body failed: %w", err), http.StatusBadRequest, rw)
+		return
+	}
+	repo := repository.GetNodeRepository()
+
+	for _, node := range req.Nodes {
+		state := determineState(node.States)
+		repo.UpdateNodeState(node.Name, req.Cluster, &state)
+	}
+}
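
// Review aside: a sketch of the JSON body the new node-state route expects. It is
// registered as "/nodestate/" in MountApiRoutes below; with the usual /api prefix
// that is POST /api/nodestate/. Field names mirror UpdateNodeStatesRequest and Node
// above, the hostnames are made up. Per determineState, the first recognized entry
// in "states" wins (matched case-insensitively) and anything else maps to the
// unknown state.
package main

import (
	"encoding/json"
	"fmt"
)

func main() {
	payload := map[string]any{
		"cluster": "fritz", // example value taken from the struct tag above
		"nodes": []map[string]any{
			{"hostname": "f0101", "states": []string{"ALLOCATED"}},        // -> allocated
			{"hostname": "f0102", "states": []string{"DRAINING", "idle"}}, // -> idle (DRAINING is not mapped)
			{"hostname": "f0103", "states": []string{"broken"}},           // -> unknown
		},
	}
	body, _ := json.MarshalIndent(payload, "", "  ")
	fmt.Println(string(body)) // body for POST /api/nodestate/
}
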
diff --git a/internal/api/rest.go b/internal/api/rest.go
index 669768e..54472d8 100644
--- a/internal/api/rest.go
+++ b/internal/api/rest.go
@@ -5,30 +5,18 @@ package api
 import (
-	"bufio"
-	"database/sql"
 	"encoding/json"
-	"errors"
 	"fmt"
 	"io"
 	"net/http"
 	"os"
 	"path/filepath"
-	"strconv"
-	"strings"
 	"sync"
-	"time"
 
-	"github.com/ClusterCockpit/cc-backend/internal/archiver"
 	"github.com/ClusterCockpit/cc-backend/internal/auth"
 	"github.com/ClusterCockpit/cc-backend/internal/config"
-	"github.com/ClusterCockpit/cc-backend/internal/graph"
-	"github.com/ClusterCockpit/cc-backend/internal/graph/model"
-	"github.com/ClusterCockpit/cc-backend/internal/importer"
-	"github.com/ClusterCockpit/cc-backend/internal/metricDataDispatcher"
 	"github.com/ClusterCockpit/cc-backend/internal/repository"
 	"github.com/ClusterCockpit/cc-backend/internal/util"
-	"github.com/ClusterCockpit/cc-backend/pkg/archive"
 	"github.com/ClusterCockpit/cc-backend/pkg/log"
 	"github.com/ClusterCockpit/cc-backend/pkg/schema"
 	"github.com/gorilla/mux"
@@ -73,6 +61,8 @@ func (api *RestApi) MountApiRoutes(r *mux.Router) {
 	r.HandleFunc("/users/", api.getUsers).Methods(http.MethodGet)
 	// Cluster List
 	r.HandleFunc("/clusters/", api.getClusters).Methods(http.MethodGet)
+	// Slurm node state
+	r.HandleFunc("/nodestate/", api.updateNodeStates).Methods(http.MethodPost, http.MethodPut)
 	// Job Handler
 	r.HandleFunc("/jobs/start_job/", api.startJob).Methods(http.MethodPost, http.MethodPut)
 	r.HandleFunc("/jobs/stop_job/", api.stopJobByRequest).Methods(http.MethodPost, http.MethodPut)
@@ -120,46 +110,13 @@ func (api *RestApi) MountConfigApiRoutes(r *mux.Router) {
 func (api *RestApi) MountFrontendApiRoutes(r *mux.Router) {
 	r.StrictSlash(true)
-	// Settings Frontrend Uses SessionAuth
+	// Settings Frontend Uses SessionAuth
 	if api.Authentication != nil {
 		r.HandleFunc("/jwt/", api.getJWT).Methods(http.MethodGet)
 		r.HandleFunc("/configuration/", api.updateConfiguration).Methods(http.MethodPost)
 	}
 }
 
-// DefaultApiResponse model
-type DefaultJobApiResponse struct {
-	Message string `json:"msg"`
-}
-
-// StopJobApiRequest model
-type StopJobApiRequest struct {
-	JobId     *int64          `json:"jobId" example:"123000"`
-	Cluster   *string         `json:"cluster" example:"fritz"`
-	StartTime *int64          `json:"startTime" example:"1649723812"`
-	State     schema.JobState `json:"jobState" validate:"required" example:"completed"`
-	StopTime  int64           `json:"stopTime" validate:"required" example:"1649763839"`
-}
-
-// DeleteJobApiRequest model
-type DeleteJobApiRequest struct {
-	JobId     *int64  `json:"jobId" validate:"required" example:"123000"` // Cluster Job ID of job
-	Cluster   *string `json:"cluster"
example:"fritz"` // Cluster of job - StartTime *int64 `json:"startTime" example:"1649723812"` // Start Time of job as epoch -} - -// GetJobsApiResponse model -type GetJobsApiResponse struct { - Jobs []*schema.JobMeta `json:"jobs"` // Array of jobs - Items int `json:"items"` // Number of jobs returned - Page int `json:"page"` // Page id returned -} - -// GetClustersApiResponse model -type GetClustersApiResponse struct { - Clusters []*schema.Cluster `json:"clusters"` // Array of clusters -} - // ErrorResponse model type ErrorResponse struct { // Statustext of Errorcode @@ -167,48 +124,6 @@ type ErrorResponse struct { Error string `json:"error"` // Error Message } -// ApiTag model -type ApiTag struct { - // Tag Type - Type string `json:"type" example:"Debug"` - Name string `json:"name" example:"Testjob"` // Tag Name - Scope string `json:"scope" example:"global"` // Tag Scope for Frontend Display -} - -// ApiMeta model -type EditMetaRequest struct { - Key string `json:"key" example:"jobScript"` - Value string `json:"value" example:"bash script"` -} - -type TagJobApiRequest []*ApiTag - -type GetJobApiRequest []string - -type GetJobApiResponse struct { - Meta *schema.Job - Data []*JobMetricWithName -} - -type GetCompleteJobApiResponse struct { - Meta *schema.Job - Data schema.JobData -} - -type JobMetricWithName struct { - Metric *schema.JobMetric `json:"metric"` - Name string `json:"name"` - Scope schema.MetricScope `json:"scope"` -} - -type ApiReturnedUser struct { - Username string `json:"username"` - Name string `json:"name"` - Roles []string `json:"roles"` - Email string `json:"email"` - Projects []string `json:"projects"` -} - func handleError(err error, statusCode int, rw http.ResponseWriter) { log.Warnf("REST ERROR : %s", err.Error()) rw.Header().Add("Content-Type", "application/json") @@ -225,1080 +140,6 @@ func decode(r io.Reader, val any) error { return dec.Decode(val) } -// getClusters godoc -// @summary Lists all cluster configs -// @tags Cluster query -// @description Get a list of all cluster configs. Specific cluster can be requested using query parameter. 
-// @produce json -// @param cluster query string false "Job Cluster" -// @success 200 {object} api.GetClustersApiResponse "Array of clusters" -// @failure 400 {object} api.ErrorResponse "Bad Request" -// @failure 401 {object} api.ErrorResponse "Unauthorized" -// @failure 403 {object} api.ErrorResponse "Forbidden" -// @failure 500 {object} api.ErrorResponse "Internal Server Error" -// @security ApiKeyAuth -// @router /api/clusters/ [get] -func (api *RestApi) getClusters(rw http.ResponseWriter, r *http.Request) { - if user := repository.GetUserFromContext(r.Context()); user != nil && - !user.HasRole(schema.RoleApi) { - - handleError(fmt.Errorf("missing role: %v", schema.GetRoleString(schema.RoleApi)), http.StatusForbidden, rw) - return - } - - rw.Header().Add("Content-Type", "application/json") - bw := bufio.NewWriter(rw) - defer bw.Flush() - - var clusters []*schema.Cluster - - if r.URL.Query().Has("cluster") { - name := r.URL.Query().Get("cluster") - cluster := archive.GetCluster(name) - if cluster == nil { - handleError(fmt.Errorf("unknown cluster: %s", name), http.StatusBadRequest, rw) - return - } - clusters = append(clusters, cluster) - } else { - clusters = archive.Clusters - } - - payload := GetClustersApiResponse{ - Clusters: clusters, - } - - if err := json.NewEncoder(bw).Encode(payload); err != nil { - handleError(err, http.StatusInternalServerError, rw) - return - } -} - -// getJobs godoc -// @summary Lists all jobs -// @tags Job query -// @description Get a list of all jobs. Filters can be applied using query parameters. -// @description Number of results can be limited by page. Results are sorted by descending startTime. -// @produce json -// @param state query string false "Job State" Enums(running, completed, failed, cancelled, stopped, timeout) -// @param cluster query string false "Job Cluster" -// @param start-time query string false "Syntax: '$from-$to', as unix epoch timestamps in seconds" -// @param items-per-page query int false "Items per page (Default: 25)" -// @param page query int false "Page Number (Default: 1)" -// @param with-metadata query bool false "Include metadata (e.g. 
jobScript) in response" -// @success 200 {object} api.GetJobsApiResponse "Job array and page info" -// @failure 400 {object} api.ErrorResponse "Bad Request" -// @failure 401 {object} api.ErrorResponse "Unauthorized" -// @failure 403 {object} api.ErrorResponse "Forbidden" -// @failure 500 {object} api.ErrorResponse "Internal Server Error" -// @security ApiKeyAuth -// @router /api/jobs/ [get] -func (api *RestApi) getJobs(rw http.ResponseWriter, r *http.Request) { - withMetadata := false - filter := &model.JobFilter{} - page := &model.PageRequest{ItemsPerPage: 25, Page: 1} - order := &model.OrderByInput{Field: "startTime", Type: "col", Order: model.SortDirectionEnumDesc} - - for key, vals := range r.URL.Query() { - switch key { - case "state": - for _, s := range vals { - state := schema.JobState(s) - if !state.Valid() { - handleError(fmt.Errorf("invalid query parameter value: state"), - http.StatusBadRequest, rw) - return - } - filter.State = append(filter.State, state) - } - case "cluster": - filter.Cluster = &model.StringInput{Eq: &vals[0]} - case "start-time": - st := strings.Split(vals[0], "-") - if len(st) != 2 { - handleError(fmt.Errorf("invalid query parameter value: startTime"), - http.StatusBadRequest, rw) - return - } - from, err := strconv.ParseInt(st[0], 10, 64) - if err != nil { - handleError(err, http.StatusBadRequest, rw) - return - } - to, err := strconv.ParseInt(st[1], 10, 64) - if err != nil { - handleError(err, http.StatusBadRequest, rw) - return - } - ufrom, uto := time.Unix(from, 0), time.Unix(to, 0) - filter.StartTime = &schema.TimeRange{From: &ufrom, To: &uto} - case "page": - x, err := strconv.Atoi(vals[0]) - if err != nil { - handleError(err, http.StatusBadRequest, rw) - return - } - page.Page = x - case "items-per-page": - x, err := strconv.Atoi(vals[0]) - if err != nil { - handleError(err, http.StatusBadRequest, rw) - return - } - page.ItemsPerPage = x - case "with-metadata": - withMetadata = true - default: - handleError(fmt.Errorf("invalid query parameter: %s", key), - http.StatusBadRequest, rw) - return - } - } - - jobs, err := api.JobRepository.QueryJobs(r.Context(), []*model.JobFilter{filter}, page, order) - if err != nil { - handleError(err, http.StatusInternalServerError, rw) - return - } - - results := make([]*schema.JobMeta, 0, len(jobs)) - for _, job := range jobs { - if withMetadata { - if _, err = api.JobRepository.FetchMetadata(job); err != nil { - handleError(err, http.StatusInternalServerError, rw) - return - } - } - - res := &schema.JobMeta{ - ID: &job.ID, - BaseJob: job.BaseJob, - StartTime: job.StartTime.Unix(), - } - - res.Tags, err = api.JobRepository.GetTags(repository.GetUserFromContext(r.Context()), &job.ID) - if err != nil { - handleError(err, http.StatusInternalServerError, rw) - return - } - - if res.MonitoringStatus == schema.MonitoringStatusArchivingSuccessful { - res.Statistics, err = archive.GetStatistics(job) - if err != nil { - handleError(err, http.StatusInternalServerError, rw) - return - } - } - - results = append(results, res) - } - - log.Debugf("/api/jobs: %d jobs returned", len(results)) - rw.Header().Add("Content-Type", "application/json") - bw := bufio.NewWriter(rw) - defer bw.Flush() - - payload := GetJobsApiResponse{ - Jobs: results, - Items: page.ItemsPerPage, - Page: page.Page, - } - - if err := json.NewEncoder(bw).Encode(payload); err != nil { - handleError(err, http.StatusInternalServerError, rw) - return - } -} - -// getCompleteJobById godoc -// @summary Get job meta and optional all metric data -// @tags Job query -// 
@description Job to get is specified by database ID -// @description Returns full job resource information according to 'JobMeta' scheme and all metrics according to 'JobData'. -// @produce json -// @param id path int true "Database ID of Job" -// @param all-metrics query bool false "Include all available metrics" -// @success 200 {object} api.GetJobApiResponse "Job resource" -// @failure 400 {object} api.ErrorResponse "Bad Request" -// @failure 401 {object} api.ErrorResponse "Unauthorized" -// @failure 403 {object} api.ErrorResponse "Forbidden" -// @failure 404 {object} api.ErrorResponse "Resource not found" -// @failure 422 {object} api.ErrorResponse "Unprocessable Entity: finding job failed: sql: no rows in result set" -// @failure 500 {object} api.ErrorResponse "Internal Server Error" -// @security ApiKeyAuth -// @router /api/jobs/{id} [get] -func (api *RestApi) getCompleteJobById(rw http.ResponseWriter, r *http.Request) { - // Fetch job from db - id, ok := mux.Vars(r)["id"] - var job *schema.Job - var err error - if ok { - id, e := strconv.ParseInt(id, 10, 64) - if e != nil { - handleError(fmt.Errorf("integer expected in path for id: %w", e), http.StatusBadRequest, rw) - return - } - - job, err = api.JobRepository.FindById(r.Context(), id) // Get Job from Repo by ID - } else { - handleError(fmt.Errorf("the parameter 'id' is required"), http.StatusBadRequest, rw) - return - } - if err != nil { - handleError(fmt.Errorf("finding job with db id %s failed: %w", id, err), http.StatusUnprocessableEntity, rw) - return - } - - job.Tags, err = api.JobRepository.GetTags(repository.GetUserFromContext(r.Context()), &job.ID) - if err != nil { - handleError(err, http.StatusInternalServerError, rw) - return - - } - if _, err = api.JobRepository.FetchMetadata(job); err != nil { - - handleError(err, http.StatusInternalServerError, rw) - return - } - - var scopes []schema.MetricScope - - if job.NumNodes == 1 { - scopes = []schema.MetricScope{"core"} - } else { - scopes = []schema.MetricScope{"node"} - } - - var data schema.JobData - - metricConfigs := archive.GetCluster(job.Cluster).MetricConfig - resolution := 0 - - for _, mc := range metricConfigs { - resolution = max(resolution, mc.Timestep) - } - - if r.URL.Query().Get("all-metrics") == "true" { - data, err = metricDataDispatcher.LoadData(job, nil, scopes, r.Context(), resolution) - if err != nil { - log.Warnf("REST: error while loading all-metrics job data for JobID %d on %s", job.JobID, job.Cluster) - return - } - } - - log.Debugf("/api/job/%s: get job %d", id, job.JobID) - rw.Header().Add("Content-Type", "application/json") - bw := bufio.NewWriter(rw) - defer bw.Flush() - - payload := GetCompleteJobApiResponse{ - Meta: job, - Data: data, - } - - if err := json.NewEncoder(bw).Encode(payload); err != nil { - handleError(err, http.StatusInternalServerError, rw) - return - } -} - -// getJobById godoc -// @summary Get job meta and configurable metric data -// @tags Job query -// @description Job to get is specified by database ID -// @description Returns full job resource information according to 'JobMeta' scheme and all metrics according to 'JobData'. 
-// @accept json -// @produce json -// @param id path int true "Database ID of Job" -// @param request body api.GetJobApiRequest true "Array of metric names" -// @success 200 {object} api.GetJobApiResponse "Job resource" -// @failure 400 {object} api.ErrorResponse "Bad Request" -// @failure 401 {object} api.ErrorResponse "Unauthorized" -// @failure 403 {object} api.ErrorResponse "Forbidden" -// @failure 404 {object} api.ErrorResponse "Resource not found" -// @failure 422 {object} api.ErrorResponse "Unprocessable Entity: finding job failed: sql: no rows in result set" -// @failure 500 {object} api.ErrorResponse "Internal Server Error" -// @security ApiKeyAuth -// @router /api/jobs/{id} [post] -func (api *RestApi) getJobById(rw http.ResponseWriter, r *http.Request) { - // Fetch job from db - id, ok := mux.Vars(r)["id"] - var job *schema.Job - var err error - if ok { - id, e := strconv.ParseInt(id, 10, 64) - if e != nil { - handleError(fmt.Errorf("integer expected in path for id: %w", e), http.StatusBadRequest, rw) - return - } - - job, err = api.JobRepository.FindById(r.Context(), id) - } else { - handleError(errors.New("the parameter 'id' is required"), http.StatusBadRequest, rw) - return - } - if err != nil { - handleError(fmt.Errorf("finding job with db id %s failed: %w", id, err), http.StatusUnprocessableEntity, rw) - return - } - - job.Tags, err = api.JobRepository.GetTags(repository.GetUserFromContext(r.Context()), &job.ID) - if err != nil { - handleError(err, http.StatusInternalServerError, rw) - return - - } - if _, err = api.JobRepository.FetchMetadata(job); err != nil { - - handleError(err, http.StatusInternalServerError, rw) - return - } - - var metrics GetJobApiRequest - if err = decode(r.Body, &metrics); err != nil { - http.Error(rw, err.Error(), http.StatusBadRequest) - return - } - - var scopes []schema.MetricScope - - if job.NumNodes == 1 { - scopes = []schema.MetricScope{"core"} - } else { - scopes = []schema.MetricScope{"node"} - } - - metricConfigs := archive.GetCluster(job.Cluster).MetricConfig - resolution := 0 - - for _, mc := range metricConfigs { - resolution = max(resolution, mc.Timestep) - } - - data, err := metricDataDispatcher.LoadData(job, metrics, scopes, r.Context(), resolution) - if err != nil { - log.Warnf("REST: error while loading job data for JobID %d on %s", job.JobID, job.Cluster) - return - } - - res := []*JobMetricWithName{} - for name, md := range data { - for scope, metric := range md { - res = append(res, &JobMetricWithName{ - Name: name, - Scope: scope, - Metric: metric, - }) - } - } - - log.Debugf("/api/job/%s: get job %d", id, job.JobID) - rw.Header().Add("Content-Type", "application/json") - bw := bufio.NewWriter(rw) - defer bw.Flush() - - payload := GetJobApiResponse{ - Meta: job, - Data: res, - } - - if err := json.NewEncoder(bw).Encode(payload); err != nil { - handleError(err, http.StatusInternalServerError, rw) - return - } -} - -// editMeta godoc -// @summary Edit meta-data json -// @tags Job add and modify -// @description Edit key value pairs in job metadata json -// @description If a key already exists its content will be overwritten -// @accept json -// @produce json -// @param id path int true "Job Database ID" -// @param request body api.EditMetaRequest true "Kay value pair to add" -// @success 200 {object} schema.Job "Updated job resource" -// @failure 400 {object} api.ErrorResponse "Bad Request" -// @failure 401 {object} api.ErrorResponse "Unauthorized" -// @failure 404 {object} api.ErrorResponse "Job does not exist" -// @failure 
500 {object} api.ErrorResponse "Internal Server Error" -// @security ApiKeyAuth -// @router /api/jobs/edit_meta/{id} [post] -func (api *RestApi) editMeta(rw http.ResponseWriter, r *http.Request) { - id, err := strconv.ParseInt(mux.Vars(r)["id"], 10, 64) - if err != nil { - http.Error(rw, err.Error(), http.StatusBadRequest) - return - } - - job, err := api.JobRepository.FindById(r.Context(), id) - if err != nil { - http.Error(rw, err.Error(), http.StatusNotFound) - return - } - - var req EditMetaRequest - if err := decode(r.Body, &req); err != nil { - http.Error(rw, err.Error(), http.StatusBadRequest) - return - } - - if err := api.JobRepository.UpdateMetadata(job, req.Key, req.Value); err != nil { - http.Error(rw, err.Error(), http.StatusInternalServerError) - return - } - - rw.Header().Add("Content-Type", "application/json") - rw.WriteHeader(http.StatusOK) - json.NewEncoder(rw).Encode(job) -} - -// tagJob godoc -// @summary Adds one or more tags to a job -// @tags Job add and modify -// @description Adds tag(s) to a job specified by DB ID. Name and Type of Tag(s) can be chosen freely. -// @description Tag Scope for frontend visibility will default to "global" if none entered, other options: "admin" or specific username. -// @description If tagged job is already finished: Tag will be written directly to respective archive files. -// @accept json -// @produce json -// @param id path int true "Job Database ID" -// @param request body api.TagJobApiRequest true "Array of tag-objects to add" -// @success 200 {object} schema.Job "Updated job resource" -// @failure 400 {object} api.ErrorResponse "Bad Request" -// @failure 401 {object} api.ErrorResponse "Unauthorized" -// @failure 404 {object} api.ErrorResponse "Job or tag does not exist" -// @failure 500 {object} api.ErrorResponse "Internal Server Error" -// @security ApiKeyAuth -// @router /api/jobs/tag_job/{id} [post] -func (api *RestApi) tagJob(rw http.ResponseWriter, r *http.Request) { - id, err := strconv.ParseInt(mux.Vars(r)["id"], 10, 64) - if err != nil { - http.Error(rw, err.Error(), http.StatusBadRequest) - return - } - - job, err := api.JobRepository.FindById(r.Context(), id) - if err != nil { - http.Error(rw, err.Error(), http.StatusNotFound) - return - } - - job.Tags, err = api.JobRepository.GetTags(repository.GetUserFromContext(r.Context()), &job.ID) - if err != nil { - http.Error(rw, err.Error(), http.StatusInternalServerError) - return - } - - var req TagJobApiRequest - if err := decode(r.Body, &req); err != nil { - http.Error(rw, err.Error(), http.StatusBadRequest) - return - } - - for _, tag := range req { - tagId, err := api.JobRepository.AddTagOrCreate(repository.GetUserFromContext(r.Context()), job.ID, tag.Type, tag.Name, tag.Scope) - if err != nil { - http.Error(rw, err.Error(), http.StatusInternalServerError) - return - } - - job.Tags = append(job.Tags, &schema.Tag{ - ID: tagId, - Type: tag.Type, - Name: tag.Name, - Scope: tag.Scope, - }) - } - - rw.Header().Add("Content-Type", "application/json") - rw.WriteHeader(http.StatusOK) - json.NewEncoder(rw).Encode(job) -} - -// removeTagJob godoc -// @summary Removes one or more tags from a job -// @tags Job add and modify -// @description Removes tag(s) from a job specified by DB ID. Name and Type of Tag(s) must match. -// @description Tag Scope is required for matching, options: "global", "admin". Private tags can not be deleted via API. -// @description If tagged job is already finished: Tag will be removed from respective archive files. 
-// @accept json -// @produce json -// @param id path int true "Job Database ID" -// @param request body api.TagJobApiRequest true "Array of tag-objects to remove" -// @success 200 {object} schema.Job "Updated job resource" -// @failure 400 {object} api.ErrorResponse "Bad Request" -// @failure 401 {object} api.ErrorResponse "Unauthorized" -// @failure 404 {object} api.ErrorResponse "Job or tag does not exist" -// @failure 500 {object} api.ErrorResponse "Internal Server Error" -// @security ApiKeyAuth -// @router /jobs/tag_job/{id} [delete] -func (api *RestApi) removeTagJob(rw http.ResponseWriter, r *http.Request) { - id, err := strconv.ParseInt(mux.Vars(r)["id"], 10, 64) - if err != nil { - http.Error(rw, err.Error(), http.StatusBadRequest) - return - } - - job, err := api.JobRepository.FindById(r.Context(), id) - if err != nil { - http.Error(rw, err.Error(), http.StatusNotFound) - return - } - - job.Tags, err = api.JobRepository.GetTags(repository.GetUserFromContext(r.Context()), &job.ID) - if err != nil { - http.Error(rw, err.Error(), http.StatusInternalServerError) - return - } - - var req TagJobApiRequest - if err := decode(r.Body, &req); err != nil { - http.Error(rw, err.Error(), http.StatusBadRequest) - return - } - - for _, rtag := range req { - // Only Global and Admin Tags - if rtag.Scope != "global" && rtag.Scope != "admin" { - log.Warnf("Cannot delete private tag for job %d: Skip", job.JobID) - continue - } - - remainingTags, err := api.JobRepository.RemoveJobTagByRequest(repository.GetUserFromContext(r.Context()), job.ID, rtag.Type, rtag.Name, rtag.Scope) - if err != nil { - http.Error(rw, err.Error(), http.StatusInternalServerError) - return - } - - job.Tags = remainingTags - } - - rw.Header().Add("Content-Type", "application/json") - rw.WriteHeader(http.StatusOK) - json.NewEncoder(rw).Encode(job) -} - -// removeTags godoc -// @summary Removes all tags and job-relations for type:name tuple -// @tags Tag remove -// @description Removes tags by type and name. Name and Type of Tag(s) must match. -// @description Tag Scope is required for matching, options: "global", "admin". Private tags can not be deleted via API. -// @description Tag wills be removed from respective archive files. 
-// @accept json -// @produce plain -// @param request body api.TagJobApiRequest true "Array of tag-objects to remove" -// @success 200 {string} string "Success Response" -// @failure 400 {object} api.ErrorResponse "Bad Request" -// @failure 401 {object} api.ErrorResponse "Unauthorized" -// @failure 404 {object} api.ErrorResponse "Job or tag does not exist" -// @failure 500 {object} api.ErrorResponse "Internal Server Error" -// @security ApiKeyAuth -// @router /tags/ [delete] -func (api *RestApi) removeTags(rw http.ResponseWriter, r *http.Request) { - var req TagJobApiRequest - if err := decode(r.Body, &req); err != nil { - http.Error(rw, err.Error(), http.StatusBadRequest) - return - } - - targetCount := len(req) - currentCount := 0 - for _, rtag := range req { - // Only Global and Admin Tags - if rtag.Scope != "global" && rtag.Scope != "admin" { - log.Warn("Cannot delete private tag: Skip") - continue - } - - err := api.JobRepository.RemoveTagByRequest(rtag.Type, rtag.Name, rtag.Scope) - if err != nil { - http.Error(rw, err.Error(), http.StatusInternalServerError) - return - } else { - currentCount++ - } - } - - rw.WriteHeader(http.StatusOK) - rw.Write([]byte(fmt.Sprintf("Deleted Tags from DB: %d successfull of %d requested\n", currentCount, targetCount))) -} - -// startJob godoc -// @summary Adds a new job as "running" -// @tags Job add and modify -// @description Job specified in request body will be saved to database as "running" with new DB ID. -// @description Job specifications follow the 'JobMeta' scheme, API will fail to execute if requirements are not met. -// @accept json -// @produce json -// @param request body schema.JobMeta true "Job to add" -// @success 201 {object} api.DefaultJobApiResponse "Job added successfully" -// @failure 400 {object} api.ErrorResponse "Bad Request" -// @failure 401 {object} api.ErrorResponse "Unauthorized" -// @failure 403 {object} api.ErrorResponse "Forbidden" -// @failure 422 {object} api.ErrorResponse "Unprocessable Entity: The combination of jobId, clusterId and startTime does already exist" -// @failure 500 {object} api.ErrorResponse "Internal Server Error" -// @security ApiKeyAuth -// @router /api/jobs/start_job/ [post] -func (api *RestApi) startJob(rw http.ResponseWriter, r *http.Request) { - req := schema.JobMeta{BaseJob: schema.JobDefaults} - if err := decode(r.Body, &req); err != nil { - handleError(fmt.Errorf("parsing request body failed: %w", err), http.StatusBadRequest, rw) - return - } - - req.State = schema.JobStateRunning - - if err := importer.SanityChecks(&req.BaseJob); err != nil { - handleError(err, http.StatusBadRequest, rw) - return - } - - // aquire lock to avoid race condition between API calls - var unlockOnce sync.Once - api.RepositoryMutex.Lock() - defer unlockOnce.Do(api.RepositoryMutex.Unlock) - - // Check if combination of (job_id, cluster_id, start_time) already exists: - jobs, err := api.JobRepository.FindAll(&req.JobID, &req.Cluster, nil) - if err != nil && err != sql.ErrNoRows { - handleError(fmt.Errorf("checking for duplicate failed: %w", err), http.StatusInternalServerError, rw) - return - } else if err == nil { - for _, job := range jobs { - if (req.StartTime - job.StartTimeUnix) < 86400 { - handleError(fmt.Errorf("a job with that jobId, cluster and startTime already exists: dbid: %d, jobid: %d", job.ID, job.JobID), http.StatusUnprocessableEntity, rw) - return - } - } - } - - id, err := api.JobRepository.Start(&req) - if err != nil { - handleError(fmt.Errorf("insert into database failed: %w", err), 
http.StatusInternalServerError, rw) - return - } - // unlock here, adding Tags can be async - unlockOnce.Do(api.RepositoryMutex.Unlock) - - for _, tag := range req.Tags { - if _, err := api.JobRepository.AddTagOrCreate(repository.GetUserFromContext(r.Context()), id, tag.Type, tag.Name, tag.Scope); err != nil { - http.Error(rw, err.Error(), http.StatusInternalServerError) - handleError(fmt.Errorf("adding tag to new job %d failed: %w", id, err), http.StatusInternalServerError, rw) - return - } - } - - log.Printf("new job (id: %d): cluster=%s, jobId=%d, user=%s, startTime=%d", id, req.Cluster, req.JobID, req.User, req.StartTime) - rw.Header().Add("Content-Type", "application/json") - rw.WriteHeader(http.StatusCreated) - json.NewEncoder(rw).Encode(DefaultJobApiResponse{ - Message: "success", - }) -} - -// stopJobByRequest godoc -// @summary Marks job as completed and triggers archiving -// @tags Job add and modify -// @description Job to stop is specified by request body. All fields are required in this case. -// @description Returns full job resource information according to 'JobMeta' scheme. -// @produce json -// @param request body api.StopJobApiRequest true "All fields required" -// @success 200 {object} schema.JobMeta "Success message" -// @failure 400 {object} api.ErrorResponse "Bad Request" -// @failure 401 {object} api.ErrorResponse "Unauthorized" -// @failure 403 {object} api.ErrorResponse "Forbidden" -// @failure 404 {object} api.ErrorResponse "Resource not found" -// @failure 422 {object} api.ErrorResponse "Unprocessable Entity: job has already been stopped" -// @failure 500 {object} api.ErrorResponse "Internal Server Error" -// @security ApiKeyAuth -// @router /api/jobs/stop_job/ [post] -func (api *RestApi) stopJobByRequest(rw http.ResponseWriter, r *http.Request) { - // Parse request body - req := StopJobApiRequest{} - if err := decode(r.Body, &req); err != nil { - handleError(fmt.Errorf("parsing request body failed: %w", err), http.StatusBadRequest, rw) - return - } - - // Fetch job (that will be stopped) from db - var job *schema.Job - var err error - if req.JobId == nil { - handleError(errors.New("the field 'jobId' is required"), http.StatusBadRequest, rw) - return - } - - // log.Printf("loading db job for stopJobByRequest... : stopJobApiRequest=%v", req) - job, err = api.JobRepository.Find(req.JobId, req.Cluster, req.StartTime) - if err != nil { - handleError(fmt.Errorf("finding job failed: %w", err), http.StatusUnprocessableEntity, rw) - return - } - - api.checkAndHandleStopJob(rw, job, req) -} - -// deleteJobById godoc -// @summary Remove a job from the sql database -// @tags Job remove -// @description Job to remove is specified by database ID. This will not remove the job from the job archive. 
-// @produce json -// @param id path int true "Database ID of Job" -// @success 200 {object} api.DefaultJobApiResponse "Success message" -// @failure 400 {object} api.ErrorResponse "Bad Request" -// @failure 401 {object} api.ErrorResponse "Unauthorized" -// @failure 403 {object} api.ErrorResponse "Forbidden" -// @failure 404 {object} api.ErrorResponse "Resource not found" -// @failure 422 {object} api.ErrorResponse "Unprocessable Entity: finding job failed: sql: no rows in result set" -// @failure 500 {object} api.ErrorResponse "Internal Server Error" -// @security ApiKeyAuth -// @router /api/jobs/delete_job/{id} [delete] -func (api *RestApi) deleteJobById(rw http.ResponseWriter, r *http.Request) { - // Fetch job (that will be stopped) from db - id, ok := mux.Vars(r)["id"] - var err error - if ok { - id, e := strconv.ParseInt(id, 10, 64) - if e != nil { - handleError(fmt.Errorf("integer expected in path for id: %w", e), http.StatusBadRequest, rw) - return - } - - err = api.JobRepository.DeleteJobById(id) - } else { - handleError(errors.New("the parameter 'id' is required"), http.StatusBadRequest, rw) - return - } - if err != nil { - handleError(fmt.Errorf("deleting job failed: %w", err), http.StatusUnprocessableEntity, rw) - return - } - rw.Header().Add("Content-Type", "application/json") - rw.WriteHeader(http.StatusOK) - json.NewEncoder(rw).Encode(DefaultJobApiResponse{ - Message: fmt.Sprintf("Successfully deleted job %s", id), - }) -} - -// deleteJobByRequest godoc -// @summary Remove a job from the sql database -// @tags Job remove -// @description Job to delete is specified by request body. All fields are required in this case. -// @accept json -// @produce json -// @param request body api.DeleteJobApiRequest true "All fields required" -// @success 200 {object} api.DefaultJobApiResponse "Success message" -// @failure 400 {object} api.ErrorResponse "Bad Request" -// @failure 401 {object} api.ErrorResponse "Unauthorized" -// @failure 403 {object} api.ErrorResponse "Forbidden" -// @failure 404 {object} api.ErrorResponse "Resource not found" -// @failure 422 {object} api.ErrorResponse "Unprocessable Entity: finding job failed: sql: no rows in result set" -// @failure 500 {object} api.ErrorResponse "Internal Server Error" -// @security ApiKeyAuth -// @router /api/jobs/delete_job/ [delete] -func (api *RestApi) deleteJobByRequest(rw http.ResponseWriter, r *http.Request) { - // Parse request body - req := DeleteJobApiRequest{} - if err := decode(r.Body, &req); err != nil { - handleError(fmt.Errorf("parsing request body failed: %w", err), http.StatusBadRequest, rw) - return - } - - // Fetch job (that will be deleted) from db - var job *schema.Job - var err error - if req.JobId == nil { - handleError(errors.New("the field 'jobId' is required"), http.StatusBadRequest, rw) - return - } - - job, err = api.JobRepository.Find(req.JobId, req.Cluster, req.StartTime) - if err != nil { - handleError(fmt.Errorf("finding job failed: %w", err), http.StatusUnprocessableEntity, rw) - return - } - - err = api.JobRepository.DeleteJobById(job.ID) - if err != nil { - handleError(fmt.Errorf("deleting job failed: %w", err), http.StatusUnprocessableEntity, rw) - return - } - - rw.Header().Add("Content-Type", "application/json") - rw.WriteHeader(http.StatusOK) - json.NewEncoder(rw).Encode(DefaultJobApiResponse{ - Message: fmt.Sprintf("Successfully deleted job %d", job.ID), - }) -} - -// deleteJobBefore godoc -// @summary Remove a job from the sql database -// @tags Job remove -// @description Remove all jobs with 
start time before timestamp. The jobs will not be removed from the job archive. -// @produce json -// @param ts path int true "Unix epoch timestamp" -// @success 200 {object} api.DefaultJobApiResponse "Success message" -// @failure 400 {object} api.ErrorResponse "Bad Request" -// @failure 401 {object} api.ErrorResponse "Unauthorized" -// @failure 403 {object} api.ErrorResponse "Forbidden" -// @failure 404 {object} api.ErrorResponse "Resource not found" -// @failure 422 {object} api.ErrorResponse "Unprocessable Entity: finding job failed: sql: no rows in result set" -// @failure 500 {object} api.ErrorResponse "Internal Server Error" -// @security ApiKeyAuth -// @router /api/jobs/delete_job_before/{ts} [delete] -func (api *RestApi) deleteJobBefore(rw http.ResponseWriter, r *http.Request) { - var cnt int - // Fetch job (that will be stopped) from db - id, ok := mux.Vars(r)["ts"] - var err error - if ok { - ts, e := strconv.ParseInt(id, 10, 64) - if e != nil { - handleError(fmt.Errorf("integer expected in path for ts: %w", e), http.StatusBadRequest, rw) - return - } - - cnt, err = api.JobRepository.DeleteJobsBefore(ts) - } else { - handleError(errors.New("the parameter 'ts' is required"), http.StatusBadRequest, rw) - return - } - if err != nil { - handleError(fmt.Errorf("deleting jobs failed: %w", err), http.StatusUnprocessableEntity, rw) - return - } - - rw.Header().Add("Content-Type", "application/json") - rw.WriteHeader(http.StatusOK) - json.NewEncoder(rw).Encode(DefaultJobApiResponse{ - Message: fmt.Sprintf("Successfully deleted %d jobs", cnt), - }) -} - -func (api *RestApi) checkAndHandleStopJob(rw http.ResponseWriter, job *schema.Job, req StopJobApiRequest) { - // Sanity checks - if job.State != schema.JobStateRunning { - handleError(fmt.Errorf("jobId %d (id %d) on %s : job has already been stopped (state is: %s)", job.JobID, job.ID, job.Cluster, job.State), http.StatusUnprocessableEntity, rw) - return - } - - if job == nil || job.StartTime.Unix() > req.StopTime { - handleError(fmt.Errorf("jobId %d (id %d) on %s : stopTime %d must be larger/equal than startTime %d", job.JobID, job.ID, job.Cluster, req.StopTime, job.StartTime.Unix()), http.StatusBadRequest, rw) - return - } - - if req.State != "" && !req.State.Valid() { - handleError(fmt.Errorf("jobId %d (id %d) on %s : invalid requested job state: %#v", job.JobID, job.ID, job.Cluster, req.State), http.StatusBadRequest, rw) - return - } else if req.State == "" { - req.State = schema.JobStateCompleted - } - - // Mark job as stopped in the database (update state and duration) - job.Duration = int32(req.StopTime - job.StartTime.Unix()) - job.State = req.State - if err := api.JobRepository.Stop(job.ID, job.Duration, job.State, job.MonitoringStatus); err != nil { - handleError(fmt.Errorf("jobId %d (id %d) on %s : marking job as '%s' (duration: %d) in DB failed: %w", job.JobID, job.ID, job.Cluster, job.State, job.Duration, err), http.StatusInternalServerError, rw) - return - } - - log.Printf("archiving job... (dbid: %d): cluster=%s, jobId=%d, user=%s, startTime=%s, duration=%d, state=%s", job.ID, job.Cluster, job.JobID, job.User, job.StartTime, job.Duration, job.State) - - // Send a response (with status OK). This means that erros that happen from here on forward - // can *NOT* be communicated to the client. If reading from a MetricDataRepository or - // writing to the filesystem fails, the client will not know. 
- rw.Header().Add("Content-Type", "application/json") - rw.WriteHeader(http.StatusOK) - json.NewEncoder(rw).Encode(job) - - // Monitoring is disabled... - if job.MonitoringStatus == schema.MonitoringStatusDisabled { - return - } - - // Trigger async archiving - archiver.TriggerArchiving(job) -} - -func (api *RestApi) getJobMetrics(rw http.ResponseWriter, r *http.Request) { - id := mux.Vars(r)["id"] - metrics := r.URL.Query()["metric"] - var scopes []schema.MetricScope - for _, scope := range r.URL.Query()["scope"] { - var s schema.MetricScope - if err := s.UnmarshalGQL(scope); err != nil { - http.Error(rw, err.Error(), http.StatusBadRequest) - return - } - scopes = append(scopes, s) - } - - rw.Header().Add("Content-Type", "application/json") - rw.WriteHeader(http.StatusOK) - - type Respone struct { - Data *struct { - JobMetrics []*model.JobMetricWithName `json:"jobMetrics"` - } `json:"data"` - Error *struct { - Message string `json:"message"` - } `json:"error"` - } - - resolver := graph.GetResolverInstance() - data, err := resolver.Query().JobMetrics(r.Context(), id, metrics, scopes, nil) - if err != nil { - json.NewEncoder(rw).Encode(Respone{ - Error: &struct { - Message string "json:\"message\"" - }{Message: err.Error()}, - }) - return - } - - json.NewEncoder(rw).Encode(Respone{ - Data: &struct { - JobMetrics []*model.JobMetricWithName "json:\"jobMetrics\"" - }{JobMetrics: data}, - }) -} - -func (api *RestApi) createUser(rw http.ResponseWriter, r *http.Request) { - // SecuredCheck() only worked with TokenAuth: Removed - - rw.Header().Set("Content-Type", "text/plain") - me := repository.GetUserFromContext(r.Context()) - if !me.HasRole(schema.RoleAdmin) { - http.Error(rw, "Only admins are allowed to create new users", http.StatusForbidden) - return - } - - username, password, role, name, email, project := r.FormValue("username"), - r.FormValue("password"), r.FormValue("role"), r.FormValue("name"), - r.FormValue("email"), r.FormValue("project") - - if len(password) == 0 && role != schema.GetRoleString(schema.RoleApi) { - http.Error(rw, "Only API users are allowed to have a blank password (login will be impossible)", http.StatusBadRequest) - return - } - - if len(project) != 0 && role != schema.GetRoleString(schema.RoleManager) { - http.Error(rw, "only managers require a project (can be changed later)", - http.StatusBadRequest) - return - } else if len(project) == 0 && role == schema.GetRoleString(schema.RoleManager) { - http.Error(rw, "managers require a project to manage (can be changed later)", - http.StatusBadRequest) - return - } - - if err := repository.GetUserRepository().AddUser(&schema.User{ - Username: username, - Name: name, - Password: password, - Email: email, - Projects: []string{project}, - Roles: []string{role}, - }); err != nil { - http.Error(rw, err.Error(), http.StatusUnprocessableEntity) - return - } - - fmt.Fprintf(rw, "User %v successfully created!\n", username) -} - -func (api *RestApi) deleteUser(rw http.ResponseWriter, r *http.Request) { - // SecuredCheck() only worked with TokenAuth: Removed - - if user := repository.GetUserFromContext(r.Context()); !user.HasRole(schema.RoleAdmin) { - http.Error(rw, "Only admins are allowed to delete a user", http.StatusForbidden) - return - } - - username := r.FormValue("username") - if err := repository.GetUserRepository().DelUser(username); err != nil { - http.Error(rw, err.Error(), http.StatusUnprocessableEntity) - return - } - - rw.WriteHeader(http.StatusOK) -} - -// getUsers godoc -// @summary Returns a list of users -// @tags 
User -// @description Returns a JSON-encoded list of users. -// @description Required query-parameter defines if all users or only users with additional special roles are returned. -// @produce json -// @param not-just-user query bool true "If returned list should contain all users or only users with additional special roles" -// @success 200 {array} api.ApiReturnedUser "List of users returned successfully" -// @failure 400 {string} string "Bad Request" -// @failure 401 {string} string "Unauthorized" -// @failure 403 {string} string "Forbidden" -// @failure 500 {string} string "Internal Server Error" -// @security ApiKeyAuth -// @router /api/users/ [get] -func (api *RestApi) getUsers(rw http.ResponseWriter, r *http.Request) { - // SecuredCheck() only worked with TokenAuth: Removed - - if user := repository.GetUserFromContext(r.Context()); !user.HasRole(schema.RoleAdmin) { - http.Error(rw, "Only admins are allowed to fetch a list of users", http.StatusForbidden) - return - } - - users, err := repository.GetUserRepository().ListUsers(r.URL.Query().Get("not-just-user") == "true") - if err != nil { - http.Error(rw, err.Error(), http.StatusInternalServerError) - return - } - - json.NewEncoder(rw).Encode(users) -} - -func (api *RestApi) updateUser(rw http.ResponseWriter, r *http.Request) { - // SecuredCheck() only worked with TokenAuth: Removed - - if user := repository.GetUserFromContext(r.Context()); !user.HasRole(schema.RoleAdmin) { - http.Error(rw, "Only admins are allowed to update a user", http.StatusForbidden) - return - } - - // Get Values - newrole := r.FormValue("add-role") - delrole := r.FormValue("remove-role") - newproj := r.FormValue("add-project") - delproj := r.FormValue("remove-project") - - // TODO: Handle anything but roles... - if newrole != "" { - if err := repository.GetUserRepository().AddRole(r.Context(), mux.Vars(r)["id"], newrole); err != nil { - http.Error(rw, err.Error(), http.StatusUnprocessableEntity) - return - } - rw.Write([]byte("Add Role Success")) - } else if delrole != "" { - if err := repository.GetUserRepository().RemoveRole(r.Context(), mux.Vars(r)["id"], delrole); err != nil { - http.Error(rw, err.Error(), http.StatusUnprocessableEntity) - return - } - rw.Write([]byte("Remove Role Success")) - } else if newproj != "" { - if err := repository.GetUserRepository().AddProject(r.Context(), mux.Vars(r)["id"], newproj); err != nil { - http.Error(rw, err.Error(), http.StatusUnprocessableEntity) - return - } - rw.Write([]byte("Add Project Success")) - } else if delproj != "" { - if err := repository.GetUserRepository().RemoveProject(r.Context(), mux.Vars(r)["id"], delproj); err != nil { - http.Error(rw, err.Error(), http.StatusUnprocessableEntity) - return - } - rw.Write([]byte("Remove Project Success")) - } else { - http.Error(rw, "Not Add or Del [role|project]?", http.StatusInternalServerError) - } -} - func (api *RestApi) editNotice(rw http.ResponseWriter, r *http.Request) { // SecuredCheck() only worked with TokenAuth: Removed diff --git a/internal/api/user.go b/internal/api/user.go new file mode 100644 index 0000000..3ba9c87 --- /dev/null +++ b/internal/api/user.go @@ -0,0 +1,159 @@ +// Copyright (C) NHR@FAU, University Erlangen-Nuremberg. +// All rights reserved. +// Use of this source code is governed by a MIT-style +// license that can be found in the LICENSE file. 
+package api + +import ( + "encoding/json" + "fmt" + "net/http" + + "github.com/ClusterCockpit/cc-backend/internal/repository" + "github.com/ClusterCockpit/cc-backend/pkg/schema" + "github.com/gorilla/mux" +) + +type ApiReturnedUser struct { + Username string `json:"username"` + Name string `json:"name"` + Roles []string `json:"roles"` + Email string `json:"email"` + Projects []string `json:"projects"` +} + +// getUsers godoc +// @summary Returns a list of users +// @tags User +// @description Returns a JSON-encoded list of users. +// @description Required query-parameter defines if all users or only users with additional special roles are returned. +// @produce json +// @param not-just-user query bool true "If returned list should contain all users or only users with additional special roles" +// @success 200 {array} api.ApiReturnedUser "List of users returned successfully" +// @failure 400 {string} string "Bad Request" +// @failure 401 {string} string "Unauthorized" +// @failure 403 {string} string "Forbidden" +// @failure 500 {string} string "Internal Server Error" +// @security ApiKeyAuth +// @router /api/users/ [get] +func (api *RestApi) getUsers(rw http.ResponseWriter, r *http.Request) { + // SecuredCheck() only worked with TokenAuth: Removed + + if user := repository.GetUserFromContext(r.Context()); !user.HasRole(schema.RoleAdmin) { + http.Error(rw, "Only admins are allowed to fetch a list of users", http.StatusForbidden) + return + } + + users, err := repository.GetUserRepository().ListUsers(r.URL.Query().Get("not-just-user") == "true") + if err != nil { + http.Error(rw, err.Error(), http.StatusInternalServerError) + return + } + + json.NewEncoder(rw).Encode(users) +} + +func (api *RestApi) updateUser(rw http.ResponseWriter, r *http.Request) { + // SecuredCheck() only worked with TokenAuth: Removed + + if user := repository.GetUserFromContext(r.Context()); !user.HasRole(schema.RoleAdmin) { + http.Error(rw, "Only admins are allowed to update a user", http.StatusForbidden) + return + } + + // Get Values + newrole := r.FormValue("add-role") + delrole := r.FormValue("remove-role") + newproj := r.FormValue("add-project") + delproj := r.FormValue("remove-project") + + // TODO: Handle anything but roles... 
+ if newrole != "" { + if err := repository.GetUserRepository().AddRole(r.Context(), mux.Vars(r)["id"], newrole); err != nil { + http.Error(rw, err.Error(), http.StatusUnprocessableEntity) + return + } + rw.Write([]byte("Add Role Success")) + } else if delrole != "" { + if err := repository.GetUserRepository().RemoveRole(r.Context(), mux.Vars(r)["id"], delrole); err != nil { + http.Error(rw, err.Error(), http.StatusUnprocessableEntity) + return + } + rw.Write([]byte("Remove Role Success")) + } else if newproj != "" { + if err := repository.GetUserRepository().AddProject(r.Context(), mux.Vars(r)["id"], newproj); err != nil { + http.Error(rw, err.Error(), http.StatusUnprocessableEntity) + return + } + rw.Write([]byte("Add Project Success")) + } else if delproj != "" { + if err := repository.GetUserRepository().RemoveProject(r.Context(), mux.Vars(r)["id"], delproj); err != nil { + http.Error(rw, err.Error(), http.StatusUnprocessableEntity) + return + } + rw.Write([]byte("Remove Project Success")) + } else { + http.Error(rw, "Not Add or Del [role|project]?", http.StatusInternalServerError) + } +} + +func (api *RestApi) createUser(rw http.ResponseWriter, r *http.Request) { + // SecuredCheck() only worked with TokenAuth: Removed + + rw.Header().Set("Content-Type", "text/plain") + me := repository.GetUserFromContext(r.Context()) + if !me.HasRole(schema.RoleAdmin) { + http.Error(rw, "Only admins are allowed to create new users", http.StatusForbidden) + return + } + + username, password, role, name, email, project := r.FormValue("username"), + r.FormValue("password"), r.FormValue("role"), r.FormValue("name"), + r.FormValue("email"), r.FormValue("project") + + if len(password) == 0 && role != schema.GetRoleString(schema.RoleApi) { + http.Error(rw, "Only API users are allowed to have a blank password (login will be impossible)", http.StatusBadRequest) + return + } + + if len(project) != 0 && role != schema.GetRoleString(schema.RoleManager) { + http.Error(rw, "only managers require a project (can be changed later)", + http.StatusBadRequest) + return + } else if len(project) == 0 && role == schema.GetRoleString(schema.RoleManager) { + http.Error(rw, "managers require a project to manage (can be changed later)", + http.StatusBadRequest) + return + } + + if err := repository.GetUserRepository().AddUser(&schema.User{ + Username: username, + Name: name, + Password: password, + Email: email, + Projects: []string{project}, + Roles: []string{role}, + }); err != nil { + http.Error(rw, err.Error(), http.StatusUnprocessableEntity) + return + } + + fmt.Fprintf(rw, "User %v successfully created!\n", username) +} + +func (api *RestApi) deleteUser(rw http.ResponseWriter, r *http.Request) { + // SecuredCheck() only worked with TokenAuth: Removed + + if user := repository.GetUserFromContext(r.Context()); !user.HasRole(schema.RoleAdmin) { + http.Error(rw, "Only admins are allowed to delete a user", http.StatusForbidden) + return + } + + username := r.FormValue("username") + if err := repository.GetUserRepository().DelUser(username); err != nil { + http.Error(rw, err.Error(), http.StatusUnprocessableEntity) + return + } + + rw.WriteHeader(http.StatusOK) +} diff --git a/internal/archiver/archiveWorker.go b/internal/archiver/archiveWorker.go index 628e36e..e9f3dc9 100644 --- a/internal/archiver/archiveWorker.go +++ b/internal/archiver/archiveWorker.go @@ -41,7 +41,7 @@ func archivingWorker() { // will fail if job meta not in repository if _, err := jobRepo.FetchMetadata(job); err != nil { log.Errorf("archiving job (dbid: 
%d) failed at check metadata step: %s", job.ID, err.Error())
-				jobRepo.UpdateMonitoringStatus(job.ID, schema.MonitoringStatusArchivingFailed)
+				jobRepo.UpdateMonitoringStatus(*job.ID, schema.MonitoringStatusArchivingFailed)
 				continue
 			}

@@ -50,7 +50,7 @@ func archivingWorker() {
 			jobMeta, err := ArchiveJob(job, context.Background())
 			if err != nil {
 				log.Errorf("archiving job (dbid: %d) failed at archiving job step: %s", job.ID, err.Error())
-				jobRepo.UpdateMonitoringStatus(job.ID, schema.MonitoringStatusArchivingFailed)
+				jobRepo.UpdateMonitoringStatus(*job.ID, schema.MonitoringStatusArchivingFailed)
 				continue
 			}

@@ -72,7 +72,11 @@ func archivingWorker() {
 			}
 			log.Debugf("archiving job %d took %s", job.JobID, time.Since(start))
 			log.Printf("archiving job (dbid: %d) successful", job.ID)
+
+			repository.CallJobStopHooks(job)
 			archivePending.Done()
+		default:
+			continue
 		}
 	}
}
diff --git a/internal/archiver/archiver.go b/internal/archiver/archiver.go
index 1050ca1..b220d3b 100644
--- a/internal/archiver/archiver.go
+++ b/internal/archiver/archiver.go
@@ -16,7 +16,7 @@ import (
 )

 // Writes a running job to the job-archive
-func ArchiveJob(job *schema.Job, ctx context.Context) (*schema.JobMeta, error) {
+func ArchiveJob(job *schema.Job, ctx context.Context) (*schema.Job, error) {
 	allMetrics := make([]string, 0)
 	metricConfigs := archive.GetCluster(job.Cluster).MetricConfig
 	for _, mc := range metricConfigs {
@@ -40,11 +40,7 @@ func ArchiveJob(job *schema.Job, ctx context.Context) (*schema.JobMeta, error) {
 		return nil, err
 	}

-	jobMeta := &schema.JobMeta{
-		BaseJob:    job.BaseJob,
-		StartTime:  job.StartTime.Unix(),
-		Statistics: make(map[string]schema.JobStatistics),
-	}
+	job.Statistics = make(map[string]schema.JobStatistics)

 	for metric, data := range jobData {
 		avg, min, max := 0.0, math.MaxFloat32, -math.MaxFloat32
@@ -61,7 +57,7 @@ func ArchiveJob(job *schema.Job, ctx context.Context) (*schema.JobMeta, error) {
 		}

 		// Round AVG Result to 2 Digits
-		jobMeta.Statistics[metric] = schema.JobStatistics{
+		job.Statistics[metric] = schema.JobStatistics{
 			Unit: schema.Unit{
 				Prefix: archive.GetMetricConfig(job.Cluster, metric).Unit.Prefix,
 				Base:   archive.GetMetricConfig(job.Cluster, metric).Unit.Base,
 			},
@@ -76,8 +72,8 @@ func ArchiveJob(job *schema.Job, ctx context.Context) (*schema.JobMeta, error) {
 	// only return the JobMeta structure as the
 	// statistics in there are needed.
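 	//
 	// Caller-side sketch of the new contract (illustrative only, not part
 	// of this patch): ArchiveJob now fills job.Statistics in place and
 	// returns the same *schema.Job pointer instead of a *schema.JobMeta,
 	// which is why the worker above can pass job straight to the stop hooks.
 	//
 	//	archived, err := ArchiveJob(job, context.Background())
 	//	if err != nil {
 	//		// the worker reacts with MonitoringStatusArchivingFailed here
 	//	}
 	//	_ = archived.Statistics // populated in place; archived == job
 	//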
if config.Keys.DisableArchive { - return jobMeta, nil + return job, nil } - return jobMeta, archive.GetHandle().ImportJob(jobMeta, &jobData) + return job, archive.GetHandle().ImportJob(job, &jobData) } diff --git a/internal/auth/auth.go b/internal/auth/auth.go index 5f88bbb..3e57768 100644 --- a/internal/auth/auth.go +++ b/internal/auth/auth.go @@ -237,7 +237,7 @@ func (auth *Authentication) Login( limiter := getIPUserLimiter(ip, username) if !limiter.Allow() { log.Warnf("AUTH/RATE > Too many login attempts for combination IP: %s, Username: %s", ip, username) - onfailure(rw, r, errors.New("Too many login attempts, try again in a few minutes.")) + onfailure(rw, r, errors.New("too many login attempts, try again in a few minutes")) return } diff --git a/internal/graph/generated/generated.go b/internal/graph/generated/generated.go index e73bcf1..4f3b9fd 100644 --- a/internal/graph/generated/generated.go +++ b/internal/graph/generated/generated.go @@ -44,6 +44,7 @@ type ResolverRoot interface { Job() JobResolver MetricValue() MetricValueResolver Mutation() MutationResolver + Node() NodeResolver Query() QueryResolver SubCluster() SubClusterResolver } @@ -268,12 +269,32 @@ type ComplexityRoot struct { Stats func(childComplexity int) int } + Node struct { + Cluster func(childComplexity int) int + HealthState func(childComplexity int) int + Hostname func(childComplexity int) int + ID func(childComplexity int) int + MetaData func(childComplexity int) int + NodeState func(childComplexity int) int + SubCluster func(childComplexity int) int + } + NodeMetrics struct { Host func(childComplexity int) int Metrics func(childComplexity int) int SubCluster func(childComplexity int) int } + NodeStateResultList struct { + Count func(childComplexity int) int + Items func(childComplexity int) int + } + + NodeStats struct { + Count func(childComplexity int) int + State func(childComplexity int) int + } + NodesResultList struct { Count func(childComplexity int) int HasNextPage func(childComplexity int) int @@ -294,8 +315,11 @@ type ComplexityRoot struct { JobsFootprints func(childComplexity int, filter []*model.JobFilter, metrics []string) int JobsMetricStats func(childComplexity int, filter []*model.JobFilter, metrics []string) int JobsStatistics func(childComplexity int, filter []*model.JobFilter, metrics []string, page *model.PageRequest, sortBy *model.SortByAggregate, groupBy *model.Aggregate, numDurationBins *string, numMetricBins *int) int + Node func(childComplexity int, id string) int NodeMetrics func(childComplexity int, cluster string, nodes []string, scopes []schema.MetricScope, metrics []string, from time.Time, to time.Time) int NodeMetricsList func(childComplexity int, cluster string, subCluster string, nodeFilter string, scopes []schema.MetricScope, metrics []string, from time.Time, to time.Time, page *model.PageRequest, resolution *int) int + NodeStats func(childComplexity int, filter []*model.NodeFilter) int + Nodes func(childComplexity int, filter []*model.NodeFilter, order *model.OrderByInput) int RooflineHeatmap func(childComplexity int, filter []*model.JobFilter, rows int, cols int, minX float64, minY float64, maxX float64, maxY float64) int ScopedJobStats func(childComplexity int, id string, metrics []string, scopes []schema.MetricScope) int Tags func(childComplexity int) int @@ -398,6 +422,8 @@ type ClusterResolver interface { Partitions(ctx context.Context, obj *schema.Cluster) ([]string, error) } type JobResolver interface { + StartTime(ctx context.Context, obj *schema.Job) (*time.Time, 
error) + Tags(ctx context.Context, obj *schema.Job) ([]*schema.Tag, error) ConcurrentJobs(ctx context.Context, obj *schema.Job) (*model.JobLinkResultList, error) @@ -417,12 +443,20 @@ type MutationResolver interface { RemoveTagFromList(ctx context.Context, tagIds []string) ([]int, error) UpdateConfiguration(ctx context.Context, name string, value string) (*string, error) } +type NodeResolver interface { + NodeState(ctx context.Context, obj *schema.Node) (string, error) + HealthState(ctx context.Context, obj *schema.Node) (schema.NodeState, error) + MetaData(ctx context.Context, obj *schema.Node) (any, error) +} type QueryResolver interface { Clusters(ctx context.Context) ([]*schema.Cluster, error) Tags(ctx context.Context) ([]*schema.Tag, error) GlobalMetrics(ctx context.Context) ([]*schema.GlobalMetricListItem, error) User(ctx context.Context, username string) (*model.User, error) AllocatedNodes(ctx context.Context, cluster string) ([]*model.Count, error) + Node(ctx context.Context, id string) (*schema.Node, error) + Nodes(ctx context.Context, filter []*model.NodeFilter, order *model.OrderByInput) (*model.NodeStateResultList, error) + NodeStats(ctx context.Context, filter []*model.NodeFilter) ([]*model.NodeStats, error) Job(ctx context.Context, id string) (*schema.Job, error) JobMetrics(ctx context.Context, id string, metrics []string, scopes []schema.MetricScope, resolution *int) ([]*model.JobMetricWithName, error) JobStats(ctx context.Context, id string, metrics []string) ([]*model.NamedStats, error) @@ -1433,6 +1467,55 @@ func (e *executableSchema) Complexity(typeName, field string, childComplexity in return e.complexity.NamedStatsWithScope.Stats(childComplexity), true + case "Node.cluster": + if e.complexity.Node.Cluster == nil { + break + } + + return e.complexity.Node.Cluster(childComplexity), true + + case "Node.HealthState": + if e.complexity.Node.HealthState == nil { + break + } + + return e.complexity.Node.HealthState(childComplexity), true + + case "Node.hostname": + if e.complexity.Node.Hostname == nil { + break + } + + return e.complexity.Node.Hostname(childComplexity), true + + case "Node.id": + if e.complexity.Node.ID == nil { + break + } + + return e.complexity.Node.ID(childComplexity), true + + case "Node.metaData": + if e.complexity.Node.MetaData == nil { + break + } + + return e.complexity.Node.MetaData(childComplexity), true + + case "Node.nodeState": + if e.complexity.Node.NodeState == nil { + break + } + + return e.complexity.Node.NodeState(childComplexity), true + + case "Node.subCluster": + if e.complexity.Node.SubCluster == nil { + break + } + + return e.complexity.Node.SubCluster(childComplexity), true + case "NodeMetrics.host": if e.complexity.NodeMetrics.Host == nil { break @@ -1454,6 +1537,34 @@ func (e *executableSchema) Complexity(typeName, field string, childComplexity in return e.complexity.NodeMetrics.SubCluster(childComplexity), true + case "NodeStateResultList.count": + if e.complexity.NodeStateResultList.Count == nil { + break + } + + return e.complexity.NodeStateResultList.Count(childComplexity), true + + case "NodeStateResultList.items": + if e.complexity.NodeStateResultList.Items == nil { + break + } + + return e.complexity.NodeStateResultList.Items(childComplexity), true + + case "NodeStats.count": + if e.complexity.NodeStats.Count == nil { + break + } + + return e.complexity.NodeStats.Count(childComplexity), true + + case "NodeStats.state": + if e.complexity.NodeStats.State == nil { + break + } + + return 
e.complexity.NodeStats.State(childComplexity), true + case "NodesResultList.count": if e.complexity.NodesResultList.Count == nil { break @@ -1606,6 +1717,18 @@ func (e *executableSchema) Complexity(typeName, field string, childComplexity in return e.complexity.Query.JobsStatistics(childComplexity, args["filter"].([]*model.JobFilter), args["metrics"].([]string), args["page"].(*model.PageRequest), args["sortBy"].(*model.SortByAggregate), args["groupBy"].(*model.Aggregate), args["numDurationBins"].(*string), args["numMetricBins"].(*int)), true + case "Query.node": + if e.complexity.Query.Node == nil { + break + } + + args, err := ec.field_Query_node_args(context.TODO(), rawArgs) + if err != nil { + return 0, false + } + + return e.complexity.Query.Node(childComplexity, args["id"].(string)), true + case "Query.nodeMetrics": if e.complexity.Query.NodeMetrics == nil { break @@ -1630,6 +1753,30 @@ func (e *executableSchema) Complexity(typeName, field string, childComplexity in return e.complexity.Query.NodeMetricsList(childComplexity, args["cluster"].(string), args["subCluster"].(string), args["nodeFilter"].(string), args["scopes"].([]schema.MetricScope), args["metrics"].([]string), args["from"].(time.Time), args["to"].(time.Time), args["page"].(*model.PageRequest), args["resolution"].(*int)), true + case "Query.nodeStats": + if e.complexity.Query.NodeStats == nil { + break + } + + args, err := ec.field_Query_nodeStats_args(context.TODO(), rawArgs) + if err != nil { + return 0, false + } + + return e.complexity.Query.NodeStats(childComplexity, args["filter"].([]*model.NodeFilter)), true + + case "Query.nodes": + if e.complexity.Query.Nodes == nil { + break + } + + args, err := ec.field_Query_nodes_args(context.TODO(), rawArgs) + if err != nil { + return 0, false + } + + return e.complexity.Query.Nodes(childComplexity, args["filter"].([]*model.NodeFilter), args["order"].(*model.OrderByInput)), true + case "Query.rooflineHeatmap": if e.complexity.Query.RooflineHeatmap == nil { break @@ -2070,6 +2217,7 @@ func (e *executableSchema) Exec(ctx context.Context) graphql.ResponseHandler { ec.unmarshalInputIntRange, ec.unmarshalInputJobFilter, ec.unmarshalInputMetricStatItem, + ec.unmarshalInputNodeFilter, ec.unmarshalInputOrderByInput, ec.unmarshalInputPageRequest, ec.unmarshalInputStringInput, @@ -2177,61 +2325,78 @@ scalar Any scalar NullableFloat scalar MetricScope scalar JobState +scalar NodeState +scalar MonitoringState + +type Node { + id: ID! + hostname: String! + cluster: String! + subCluster: String! + nodeState: NodeState! + HealthState: MonitoringState! + metaData: Any +} + +type NodeStats { + state: String! + count: Int! +} type Job { - id: ID! - jobId: Int! - user: String! - project: String! - cluster: String! - subCluster: String! - startTime: Time! - duration: Int! - walltime: Int! - numNodes: Int! - numHWThreads: Int! - numAcc: Int! - energy: Float! - SMT: Int! - exclusive: Int! - partition: String! - arrayJobId: Int! + id: ID! + jobId: Int! + user: String! + project: String! + cluster: String! + subCluster: String! + startTime: Time! + duration: Int! + walltime: Int! + numNodes: Int! + numHWThreads: Int! + numAcc: Int! + energy: Float! + SMT: Int! + exclusive: Int! + partition: String! + arrayJobId: Int! monitoringStatus: Int! - state: JobState! - tags: [Tag!]! - resources: [Resource!]! - concurrentJobs: JobLinkResultList - footprint: [FootprintValue] - energyFootprint: [EnergyFootprintValue] - metaData: Any - userData: User + state: JobState! + tags: [Tag!]! + resources: [Resource!]! 
+ concurrentJobs: JobLinkResultList + footprint: [FootprintValue] + energyFootprint: [EnergyFootprintValue] + metaData: Any + userData: User } type JobLink { - id: ID! - jobId: Int! + id: ID! + jobId: Int! } type Cluster { - name: String! - partitions: [String!]! # Slurm partitions - subClusters: [SubCluster!]! # Hardware partitions/subclusters + name: String! + partitions: [String!]! # Slurm partitions + subClusters: [SubCluster!]! # Hardware partitions/subclusters } type SubCluster { - name: String! - nodes: String! - numberOfNodes: Int! - processorType: String! - socketsPerNode: Int! - coresPerSocket: Int! - threadsPerCore: Int! - flopRateScalar: MetricValue! - flopRateSimd: MetricValue! + name: String! + nodes: String! + numberOfNodes: Int! + processorType: String! + socketsPerNode: Int! + coresPerSocket: Int! + threadsPerCore: Int! + flopRateScalar: MetricValue! + flopRateSimd: MetricValue! memoryBandwidth: MetricValue! - topology: Topology! - metricConfig: [MetricConfig!]! - footprint: [String!]! + topology: Topology! + metricConfig: [MetricConfig!]! + footprint: [String!]! } type FootprintValue { @@ -2253,94 +2418,94 @@ type MetricValue { } type Topology { - node: [Int!] - socket: [[Int!]!] + node: [Int!] + socket: [[Int!]!] memoryDomain: [[Int!]!] - die: [[Int!]!] - core: [[Int!]!] + die: [[Int!]!] + core: [[Int!]!] accelerators: [Accelerator!] } type Accelerator { - id: String! - type: String! + id: String! + type: String! model: String! } type SubClusterConfig { - name: String! - peak: Float - normal: Float + name: String! + peak: Float + normal: Float caution: Float - alert: Float - remove: Boolean + alert: Float + remove: Boolean } type MetricConfig { - name: String! - unit: Unit! - scope: MetricScope! + name: String! + unit: Unit! + scope: MetricScope! aggregation: String! - timestep: Int! - peak: Float! - normal: Float + timestep: Int! + peak: Float! + normal: Float caution: Float! - alert: Float! + alert: Float! lowerIsBetter: Boolean subClusters: [SubClusterConfig!]! } type Tag { - id: ID! + id: ID! type: String! name: String! scope: String! } type Resource { - hostname: String! - hwthreads: [Int!] - accelerators: [String!] + hostname: String! + hwthreads: [Int!] + accelerators: [String!] configuration: String } type JobMetricWithName { - name: String! - scope: MetricScope! + name: String! + scope: MetricScope! metric: JobMetric! } type JobMetric { - unit: Unit - timestep: Int! - series: [Series!] + unit: Unit + timestep: Int! + series: [Series!] statisticsSeries: StatsSeries } type Series { - hostname: String! - id: String + hostname: String! + id: String statistics: MetricStatistics - data: [NullableFloat!]! + data: [NullableFloat!]! } type StatsSeries { - mean: [NullableFloat!]! + mean: [NullableFloat!]! median: [NullableFloat!]! - min: [NullableFloat!]! - max: [NullableFloat!]! + min: [NullableFloat!]! + max: [NullableFloat!]! } type NamedStatsWithScope { - name: String! - scope: MetricScope! - stats: [ScopedStats!]! + name: String! + scope: MetricScope! + stats: [ScopedStats!]! } type ScopedStats { - hostname: String! - id: String - data: MetricStatistics! + hostname: String! + id: String + data: MetricStatistics! } type JobStats { @@ -2357,8 +2522,8 @@ type JobStats { } type NamedStats { - name: String! - data: MetricStatistics! + name: String! + data: MetricStatistics! } type Unit { @@ -2374,12 +2539,12 @@ type MetricStatistics { type MetricFootprints { metric: String! - data: [NullableFloat!]! + data: [NullableFloat!]! } type Footprints { timeWeights: TimeWeights! 
- metrics: [MetricFootprints!]! + metrics: [MetricFootprints!]! } type TimeWeights { @@ -2388,20 +2553,33 @@ type TimeWeights { coreHours: [NullableFloat!]! } -enum Aggregate { USER, PROJECT, CLUSTER } -enum SortByAggregate { TOTALWALLTIME, TOTALJOBS, TOTALNODES, TOTALNODEHOURS, TOTALCORES, TOTALCOREHOURS, TOTALACCS, TOTALACCHOURS } +enum Aggregate { + USER + PROJECT + CLUSTER +} +enum SortByAggregate { + TOTALWALLTIME + TOTALJOBS + TOTALNODES + TOTALNODEHOURS + TOTALCORES + TOTALCOREHOURS + TOTALACCS + TOTALACCHOURS +} type NodeMetrics { - host: String! + host: String! subCluster: String! - metrics: [JobMetricWithName!]! + metrics: [JobMetricWithName!]! } type NodesResultList { - items: [NodeMetrics!]! + items: [NodeMetrics!]! offset: Int - limit: Int - count: Int + limit: Int + count: Int totalNodes: Int hasNextPage: Boolean } @@ -2420,14 +2598,14 @@ type GlobalMetricListItem { } type Count { - name: String! + name: String! count: Int! } type User { username: String! - name: String! - email: String! + name: String! + email: String! } input MetricStatItem { @@ -2436,27 +2614,81 @@ input MetricStatItem { } type Query { - clusters: [Cluster!]! # List of all clusters - tags: [Tag!]! # List of all tags - globalMetrics: [GlobalMetricListItem!]! + clusters: [Cluster!]! # List of all clusters + tags: [Tag!]! # List of all tags + globalMetrics: [GlobalMetricListItem!]! user(username: String!): User allocatedNodes(cluster: String!): [Count!]! - job(id: ID!): Job - jobMetrics(id: ID!, metrics: [String!], scopes: [MetricScope!], resolution: Int): [JobMetricWithName!]! - jobStats(id: ID!, metrics: [String!]): [NamedStats!]! - scopedJobStats(id: ID!, metrics: [String!], scopes: [MetricScope!]): [NamedStatsWithScope!]! + node(id: ID!): Node + nodes(filter: [NodeFilter!], order: OrderByInput): NodeStateResultList! + nodeStats(filter: [NodeFilter!]): [NodeStats!]! + + job(id: ID!): Job + jobMetrics( + id: ID! + metrics: [String!] + scopes: [MetricScope!] + resolution: Int + ): [JobMetricWithName!]! + + jobStats(id: ID!, metrics: [String!]): [NamedStats!]! + + scopedJobStats( + id: ID! + metrics: [String!] + scopes: [MetricScope!] + ): [NamedStatsWithScope!]! + + jobs( + filter: [JobFilter!] + page: PageRequest + order: OrderByInput + ): JobResultList! + + jobsStatistics( + filter: [JobFilter!] + metrics: [String!] + page: PageRequest + sortBy: SortByAggregate + groupBy: Aggregate + numDurationBins: String + numMetricBins: Int + ): [JobsStatistics!]! - jobs(filter: [JobFilter!], page: PageRequest, order: OrderByInput): JobResultList! - jobsStatistics(filter: [JobFilter!], metrics: [String!], page: PageRequest, sortBy: SortByAggregate, groupBy: Aggregate, numDurationBins: String, numMetricBins: Int): [JobsStatistics!]! jobsMetricStats(filter: [JobFilter!], metrics: [String!]): [JobStats!]! jobsFootprints(filter: [JobFilter!], metrics: [String!]!): Footprints - rooflineHeatmap(filter: [JobFilter!]!, rows: Int!, cols: Int!, minX: Float!, minY: Float!, maxX: Float!, maxY: Float!): [[Float!]!]! + rooflineHeatmap( + filter: [JobFilter!]! + rows: Int! + cols: Int! + minX: Float! + minY: Float! + maxX: Float! + maxY: Float! + ): [[Float!]!]! - nodeMetrics(cluster: String!, nodes: [String!], scopes: [MetricScope!], metrics: [String!], from: Time!, to: Time!): [NodeMetrics!]! - nodeMetricsList(cluster: String!, subCluster: String!, nodeFilter: String!, scopes: [MetricScope!], metrics: [String!], from: Time!, to: Time!, page: PageRequest, resolution: Int): NodesResultList! + nodeMetrics( + cluster: String! 
+ nodes: [String!] + scopes: [MetricScope!] + metrics: [String!] + from: Time! + to: Time! + ): [NodeMetrics!]! + nodeMetricsList( + cluster: String! + subCluster: String! + nodeFilter: String! + scopes: [MetricScope!] + metrics: [String!] + from: Time! + to: Time! + page: PageRequest + resolution: Int + ): NodesResultList! } type Mutation { @@ -2469,38 +2701,53 @@ type Mutation { updateConfiguration(name: String!, value: String!): String } -type IntRangeOutput { from: Int!, to: Int! } -type TimeRangeOutput { range: String, from: Time!, to: Time! } +type IntRangeOutput { + from: Int! + to: Int! +} +type TimeRangeOutput { + range: String + from: Time! + to: Time! +} + +input NodeFilter { + hostname: StringInput + cluster: StringInput + subCluster: StringInput + nodeState: NodeState + healthState: MonitoringState +} input JobFilter { - tags: [ID!] - dbId: [ID!] - jobId: StringInput - arrayJobId: Int - user: StringInput - project: StringInput - jobName: StringInput - cluster: StringInput - partition: StringInput - duration: IntRange - energy: FloatRange + tags: [ID!] + dbId: [ID!] + jobId: StringInput + arrayJobId: Int + user: StringInput + project: StringInput + jobName: StringInput + cluster: StringInput + partition: StringInput + duration: IntRange + energy: FloatRange minRunningFor: Int - numNodes: IntRange + numNodes: IntRange numAccelerators: IntRange - numHWThreads: IntRange + numHWThreads: IntRange - startTime: TimeRange - state: [JobState!] + startTime: TimeRange + state: [JobState!] metricStats: [MetricStatItem!] - exclusive: Int - node: StringInput + exclusive: Int + node: StringInput } input OrderByInput { field: String! - type: String!, + type: String! order: SortDirectionEnum! = ASC } @@ -2510,34 +2757,46 @@ enum SortDirectionEnum { } input StringInput { - eq: String - neq: String - contains: String + eq: String + neq: String + contains: String startsWith: String - endsWith: String - in: [String!] + endsWith: String + in: [String!] } -input IntRange { from: Int!, to: Int! } -input TimeRange { range: String, from: Time, to: Time } +input IntRange { + from: Int! + to: Int! +} +input TimeRange { + range: String + from: Time + to: Time +} input FloatRange { from: Float! to: Float! } +type NodeStateResultList { + items: [Node!]! + count: Int +} + type JobResultList { - items: [Job!]! + items: [Job!]! offset: Int - limit: Int - count: Int + limit: Int + count: Int hasNextPage: Boolean } type JobLinkResultList { listQuery: String - items: [JobLink!]! - count: Int + items: [JobLink!]! + count: Int } type HistoPoint { @@ -2559,29 +2818,29 @@ type MetricHistoPoint { max: Int } -type JobsStatistics { - id: ID! # If ` + "`" + `groupBy` + "`" + ` was used, ID of the user/project/cluster - name: String! # if User-Statistics: Given Name of Account (ID) Owner - totalJobs: Int! # Number of jobs - runningJobs: Int! # Number of running jobs - shortJobs: Int! # Number of jobs with a duration of less than duration - totalWalltime: Int! # Sum of the duration of all matched jobs in hours - totalNodes: Int! # Sum of the nodes of all matched jobs - totalNodeHours: Int! # Sum of the node hours of all matched jobs - totalCores: Int! # Sum of the cores of all matched jobs - totalCoreHours: Int! # Sum of the core hours of all matched jobs - totalAccs: Int! # Sum of the accs of all matched jobs - totalAccHours: Int! # Sum of the gpu hours of all matched jobs - histDuration: [HistoPoint!]! # value: hour, count: number of jobs with a rounded duration of value - histNumNodes: [HistoPoint!]! 
# value: number of nodes, count: number of jobs with that number of nodes - histNumCores: [HistoPoint!]! # value: number of cores, count: number of jobs with that number of cores - histNumAccs: [HistoPoint!]! # value: number of accs, count: number of jobs with that number of accs - histMetrics: [MetricHistoPoints!]! # metric: metricname, data array of histopoints: value: metric average bin, count: number of jobs with that metric average +type JobsStatistics { + id: ID! # If ` + "`" + `groupBy` + "`" + ` was used, ID of the user/project/cluster + name: String! # if User-Statistics: Given Name of Account (ID) Owner + totalJobs: Int! # Number of jobs + runningJobs: Int! # Number of running jobs + shortJobs: Int! # Number of jobs with a duration of less than duration + totalWalltime: Int! # Sum of the duration of all matched jobs in hours + totalNodes: Int! # Sum of the nodes of all matched jobs + totalNodeHours: Int! # Sum of the node hours of all matched jobs + totalCores: Int! # Sum of the cores of all matched jobs + totalCoreHours: Int! # Sum of the core hours of all matched jobs + totalAccs: Int! # Sum of the accs of all matched jobs + totalAccHours: Int! # Sum of the gpu hours of all matched jobs + histDuration: [HistoPoint!]! # value: hour, count: number of jobs with a rounded duration of value + histNumNodes: [HistoPoint!]! # value: number of nodes, count: number of jobs with that number of nodes + histNumCores: [HistoPoint!]! # value: number of cores, count: number of jobs with that number of cores + histNumAccs: [HistoPoint!]! # value: number of accs, count: number of jobs with that number of accs + histMetrics: [MetricHistoPoints!]! # metric: metricname, data array of histopoints: value: metric average bin, count: number of jobs with that metric average } input PageRequest { itemsPerPage: Int! - page: Int! + page: Int! 
} `, BuiltIn: false}, } @@ -3803,6 +4062,113 @@ func (ec *executionContext) field_Query_nodeMetrics_argsTo( return zeroVal, nil } +func (ec *executionContext) field_Query_nodeStats_args(ctx context.Context, rawArgs map[string]any) (map[string]any, error) { + var err error + args := map[string]any{} + arg0, err := ec.field_Query_nodeStats_argsFilter(ctx, rawArgs) + if err != nil { + return nil, err + } + args["filter"] = arg0 + return args, nil +} +func (ec *executionContext) field_Query_nodeStats_argsFilter( + ctx context.Context, + rawArgs map[string]any, +) ([]*model.NodeFilter, error) { + if _, ok := rawArgs["filter"]; !ok { + var zeroVal []*model.NodeFilter + return zeroVal, nil + } + + ctx = graphql.WithPathContext(ctx, graphql.NewPathWithField("filter")) + if tmp, ok := rawArgs["filter"]; ok { + return ec.unmarshalONodeFilter2ᚕᚖgithubᚗcomᚋClusterCockpitᚋccᚑbackendᚋinternalᚋgraphᚋmodelᚐNodeFilterᚄ(ctx, tmp) + } + + var zeroVal []*model.NodeFilter + return zeroVal, nil +} + +func (ec *executionContext) field_Query_node_args(ctx context.Context, rawArgs map[string]any) (map[string]any, error) { + var err error + args := map[string]any{} + arg0, err := ec.field_Query_node_argsID(ctx, rawArgs) + if err != nil { + return nil, err + } + args["id"] = arg0 + return args, nil +} +func (ec *executionContext) field_Query_node_argsID( + ctx context.Context, + rawArgs map[string]any, +) (string, error) { + if _, ok := rawArgs["id"]; !ok { + var zeroVal string + return zeroVal, nil + } + + ctx = graphql.WithPathContext(ctx, graphql.NewPathWithField("id")) + if tmp, ok := rawArgs["id"]; ok { + return ec.unmarshalNID2string(ctx, tmp) + } + + var zeroVal string + return zeroVal, nil +} + +func (ec *executionContext) field_Query_nodes_args(ctx context.Context, rawArgs map[string]any) (map[string]any, error) { + var err error + args := map[string]any{} + arg0, err := ec.field_Query_nodes_argsFilter(ctx, rawArgs) + if err != nil { + return nil, err + } + args["filter"] = arg0 + arg1, err := ec.field_Query_nodes_argsOrder(ctx, rawArgs) + if err != nil { + return nil, err + } + args["order"] = arg1 + return args, nil +} +func (ec *executionContext) field_Query_nodes_argsFilter( + ctx context.Context, + rawArgs map[string]any, +) ([]*model.NodeFilter, error) { + if _, ok := rawArgs["filter"]; !ok { + var zeroVal []*model.NodeFilter + return zeroVal, nil + } + + ctx = graphql.WithPathContext(ctx, graphql.NewPathWithField("filter")) + if tmp, ok := rawArgs["filter"]; ok { + return ec.unmarshalONodeFilter2ᚕᚖgithubᚗcomᚋClusterCockpitᚋccᚑbackendᚋinternalᚋgraphᚋmodelᚐNodeFilterᚄ(ctx, tmp) + } + + var zeroVal []*model.NodeFilter + return zeroVal, nil +} + +func (ec *executionContext) field_Query_nodes_argsOrder( + ctx context.Context, + rawArgs map[string]any, +) (*model.OrderByInput, error) { + if _, ok := rawArgs["order"]; !ok { + var zeroVal *model.OrderByInput + return zeroVal, nil + } + + ctx = graphql.WithPathContext(ctx, graphql.NewPathWithField("order")) + if tmp, ok := rawArgs["order"]; ok { + return ec.unmarshalOOrderByInput2ᚖgithubᚗcomᚋClusterCockpitᚋccᚑbackendᚋinternalᚋgraphᚋmodelᚐOrderByInput(ctx, tmp) + } + + var zeroVal *model.OrderByInput + return zeroVal, nil +} + func (ec *executionContext) field_Query_rooflineHeatmap_args(ctx context.Context, rawArgs map[string]any) (map[string]any, error) { var err error args := map[string]any{} @@ -5456,9 +5822,9 @@ func (ec *executionContext) _Job_id(ctx context.Context, field graphql.Collected } return graphql.Null } - res := resTmp.(int64) + res := 
resTmp.(*int64) fc.Result = res - return ec.marshalNID2int64(ctx, field.Selections, res) + return ec.marshalNID2ᚖint64(ctx, field.Selections, res) } func (ec *executionContext) fieldContext_Job_id(_ context.Context, field graphql.CollectedField) (fc *graphql.FieldContext, err error) { @@ -5708,7 +6074,7 @@ func (ec *executionContext) _Job_startTime(ctx context.Context, field graphql.Co }() resTmp, err := ec.ResolverMiddleware(ctx, func(rctx context.Context) (any, error) { ctx = rctx // use context from middleware stack in children - return obj.StartTime, nil + return ec.resolvers.Job().StartTime(rctx, obj) }) if err != nil { ec.Error(ctx, err) @@ -5720,17 +6086,17 @@ func (ec *executionContext) _Job_startTime(ctx context.Context, field graphql.Co } return graphql.Null } - res := resTmp.(time.Time) + res := resTmp.(*time.Time) fc.Result = res - return ec.marshalNTime2timeᚐTime(ctx, field.Selections, res) + return ec.marshalNTime2ᚖtimeᚐTime(ctx, field.Selections, res) } func (ec *executionContext) fieldContext_Job_startTime(_ context.Context, field graphql.CollectedField) (fc *graphql.FieldContext, err error) { fc = &graphql.FieldContext{ Object: "Job", Field: field, - IsMethod: false, - IsResolver: false, + IsMethod: true, + IsResolver: true, Child: func(ctx context.Context, field graphql.CollectedField) (*graphql.FieldContext, error) { return nil, errors.New("field of type Time does not have child fields") }, @@ -10443,6 +10809,311 @@ func (ec *executionContext) fieldContext_NamedStatsWithScope_stats(_ context.Con return fc, nil } +func (ec *executionContext) _Node_id(ctx context.Context, field graphql.CollectedField, obj *schema.Node) (ret graphql.Marshaler) { + fc, err := ec.fieldContext_Node_id(ctx, field) + if err != nil { + return graphql.Null + } + ctx = graphql.WithFieldContext(ctx, fc) + defer func() { + if r := recover(); r != nil { + ec.Error(ctx, ec.Recover(ctx, r)) + ret = graphql.Null + } + }() + resTmp, err := ec.ResolverMiddleware(ctx, func(rctx context.Context) (any, error) { + ctx = rctx // use context from middleware stack in children + return obj.ID, nil + }) + if err != nil { + ec.Error(ctx, err) + return graphql.Null + } + if resTmp == nil { + if !graphql.HasFieldError(ctx, fc) { + ec.Errorf(ctx, "must not be null") + } + return graphql.Null + } + res := resTmp.(int64) + fc.Result = res + return ec.marshalNID2int64(ctx, field.Selections, res) +} + +func (ec *executionContext) fieldContext_Node_id(_ context.Context, field graphql.CollectedField) (fc *graphql.FieldContext, err error) { + fc = &graphql.FieldContext{ + Object: "Node", + Field: field, + IsMethod: false, + IsResolver: false, + Child: func(ctx context.Context, field graphql.CollectedField) (*graphql.FieldContext, error) { + return nil, errors.New("field of type ID does not have child fields") + }, + } + return fc, nil +} + +func (ec *executionContext) _Node_hostname(ctx context.Context, field graphql.CollectedField, obj *schema.Node) (ret graphql.Marshaler) { + fc, err := ec.fieldContext_Node_hostname(ctx, field) + if err != nil { + return graphql.Null + } + ctx = graphql.WithFieldContext(ctx, fc) + defer func() { + if r := recover(); r != nil { + ec.Error(ctx, ec.Recover(ctx, r)) + ret = graphql.Null + } + }() + resTmp, err := ec.ResolverMiddleware(ctx, func(rctx context.Context) (any, error) { + ctx = rctx // use context from middleware stack in children + return obj.Hostname, nil + }) + if err != nil { + ec.Error(ctx, err) + return graphql.Null + } + if resTmp == nil { + if !graphql.HasFieldError(ctx, fc) 
{ + ec.Errorf(ctx, "must not be null") + } + return graphql.Null + } + res := resTmp.(string) + fc.Result = res + return ec.marshalNString2string(ctx, field.Selections, res) +} + +func (ec *executionContext) fieldContext_Node_hostname(_ context.Context, field graphql.CollectedField) (fc *graphql.FieldContext, err error) { + fc = &graphql.FieldContext{ + Object: "Node", + Field: field, + IsMethod: false, + IsResolver: false, + Child: func(ctx context.Context, field graphql.CollectedField) (*graphql.FieldContext, error) { + return nil, errors.New("field of type String does not have child fields") + }, + } + return fc, nil +} + +func (ec *executionContext) _Node_cluster(ctx context.Context, field graphql.CollectedField, obj *schema.Node) (ret graphql.Marshaler) { + fc, err := ec.fieldContext_Node_cluster(ctx, field) + if err != nil { + return graphql.Null + } + ctx = graphql.WithFieldContext(ctx, fc) + defer func() { + if r := recover(); r != nil { + ec.Error(ctx, ec.Recover(ctx, r)) + ret = graphql.Null + } + }() + resTmp, err := ec.ResolverMiddleware(ctx, func(rctx context.Context) (any, error) { + ctx = rctx // use context from middleware stack in children + return obj.Cluster, nil + }) + if err != nil { + ec.Error(ctx, err) + return graphql.Null + } + if resTmp == nil { + if !graphql.HasFieldError(ctx, fc) { + ec.Errorf(ctx, "must not be null") + } + return graphql.Null + } + res := resTmp.(string) + fc.Result = res + return ec.marshalNString2string(ctx, field.Selections, res) +} + +func (ec *executionContext) fieldContext_Node_cluster(_ context.Context, field graphql.CollectedField) (fc *graphql.FieldContext, err error) { + fc = &graphql.FieldContext{ + Object: "Node", + Field: field, + IsMethod: false, + IsResolver: false, + Child: func(ctx context.Context, field graphql.CollectedField) (*graphql.FieldContext, error) { + return nil, errors.New("field of type String does not have child fields") + }, + } + return fc, nil +} + +func (ec *executionContext) _Node_subCluster(ctx context.Context, field graphql.CollectedField, obj *schema.Node) (ret graphql.Marshaler) { + fc, err := ec.fieldContext_Node_subCluster(ctx, field) + if err != nil { + return graphql.Null + } + ctx = graphql.WithFieldContext(ctx, fc) + defer func() { + if r := recover(); r != nil { + ec.Error(ctx, ec.Recover(ctx, r)) + ret = graphql.Null + } + }() + resTmp, err := ec.ResolverMiddleware(ctx, func(rctx context.Context) (any, error) { + ctx = rctx // use context from middleware stack in children + return obj.SubCluster, nil + }) + if err != nil { + ec.Error(ctx, err) + return graphql.Null + } + if resTmp == nil { + if !graphql.HasFieldError(ctx, fc) { + ec.Errorf(ctx, "must not be null") + } + return graphql.Null + } + res := resTmp.(string) + fc.Result = res + return ec.marshalNString2string(ctx, field.Selections, res) +} + +func (ec *executionContext) fieldContext_Node_subCluster(_ context.Context, field graphql.CollectedField) (fc *graphql.FieldContext, err error) { + fc = &graphql.FieldContext{ + Object: "Node", + Field: field, + IsMethod: false, + IsResolver: false, + Child: func(ctx context.Context, field graphql.CollectedField) (*graphql.FieldContext, error) { + return nil, errors.New("field of type String does not have child fields") + }, + } + return fc, nil +} + +func (ec *executionContext) _Node_nodeState(ctx context.Context, field graphql.CollectedField, obj *schema.Node) (ret graphql.Marshaler) { + fc, err := ec.fieldContext_Node_nodeState(ctx, field) + if err != nil { + return graphql.Null + } + ctx = 
graphql.WithFieldContext(ctx, fc) + defer func() { + if r := recover(); r != nil { + ec.Error(ctx, ec.Recover(ctx, r)) + ret = graphql.Null + } + }() + resTmp, err := ec.ResolverMiddleware(ctx, func(rctx context.Context) (any, error) { + ctx = rctx // use context from middleware stack in children + return ec.resolvers.Node().NodeState(rctx, obj) + }) + if err != nil { + ec.Error(ctx, err) + return graphql.Null + } + if resTmp == nil { + if !graphql.HasFieldError(ctx, fc) { + ec.Errorf(ctx, "must not be null") + } + return graphql.Null + } + res := resTmp.(string) + fc.Result = res + return ec.marshalNNodeState2string(ctx, field.Selections, res) +} + +func (ec *executionContext) fieldContext_Node_nodeState(_ context.Context, field graphql.CollectedField) (fc *graphql.FieldContext, err error) { + fc = &graphql.FieldContext{ + Object: "Node", + Field: field, + IsMethod: true, + IsResolver: true, + Child: func(ctx context.Context, field graphql.CollectedField) (*graphql.FieldContext, error) { + return nil, errors.New("field of type NodeState does not have child fields") + }, + } + return fc, nil +} + +func (ec *executionContext) _Node_HealthState(ctx context.Context, field graphql.CollectedField, obj *schema.Node) (ret graphql.Marshaler) { + fc, err := ec.fieldContext_Node_HealthState(ctx, field) + if err != nil { + return graphql.Null + } + ctx = graphql.WithFieldContext(ctx, fc) + defer func() { + if r := recover(); r != nil { + ec.Error(ctx, ec.Recover(ctx, r)) + ret = graphql.Null + } + }() + resTmp, err := ec.ResolverMiddleware(ctx, func(rctx context.Context) (any, error) { + ctx = rctx // use context from middleware stack in children + return ec.resolvers.Node().HealthState(rctx, obj) + }) + if err != nil { + ec.Error(ctx, err) + return graphql.Null + } + if resTmp == nil { + if !graphql.HasFieldError(ctx, fc) { + ec.Errorf(ctx, "must not be null") + } + return graphql.Null + } + res := resTmp.(schema.NodeState) + fc.Result = res + return ec.marshalNMonitoringState2githubᚗcomᚋClusterCockpitᚋccᚑbackendᚋpkgᚋschemaᚐNodeState(ctx, field.Selections, res) +} + +func (ec *executionContext) fieldContext_Node_HealthState(_ context.Context, field graphql.CollectedField) (fc *graphql.FieldContext, err error) { + fc = &graphql.FieldContext{ + Object: "Node", + Field: field, + IsMethod: true, + IsResolver: true, + Child: func(ctx context.Context, field graphql.CollectedField) (*graphql.FieldContext, error) { + return nil, errors.New("field of type MonitoringState does not have child fields") + }, + } + return fc, nil +} + +func (ec *executionContext) _Node_metaData(ctx context.Context, field graphql.CollectedField, obj *schema.Node) (ret graphql.Marshaler) { + fc, err := ec.fieldContext_Node_metaData(ctx, field) + if err != nil { + return graphql.Null + } + ctx = graphql.WithFieldContext(ctx, fc) + defer func() { + if r := recover(); r != nil { + ec.Error(ctx, ec.Recover(ctx, r)) + ret = graphql.Null + } + }() + resTmp, err := ec.ResolverMiddleware(ctx, func(rctx context.Context) (any, error) { + ctx = rctx // use context from middleware stack in children + return ec.resolvers.Node().MetaData(rctx, obj) + }) + if err != nil { + ec.Error(ctx, err) + return graphql.Null + } + if resTmp == nil { + return graphql.Null + } + res := resTmp.(any) + fc.Result = res + return ec.marshalOAny2interface(ctx, field.Selections, res) +} + +func (ec *executionContext) fieldContext_Node_metaData(_ context.Context, field graphql.CollectedField) (fc *graphql.FieldContext, err error) { + fc = &graphql.FieldContext{ + 
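+		// Node.nodeState, Node.HealthState and Node.metaData all dispatch
+		// through the NodeResolver interface declared earlier in this file.
+		// A skeletal implementation the PR would pair with this (the body is
+		// assumed; only the signature is fixed by the interface):
+		//
+		//	func (r *nodeResolver) MetaData(ctx context.Context, obj *schema.Node) (any, error) {
+		//		return obj.MetaData, nil // assumes schema.Node carries a MetaData field
+		//	}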
Object: "Node", + Field: field, + IsMethod: true, + IsResolver: true, + Child: func(ctx context.Context, field graphql.CollectedField) (*graphql.FieldContext, error) { + return nil, errors.New("field of type Any does not have child fields") + }, + } + return fc, nil +} + func (ec *executionContext) _NodeMetrics_host(ctx context.Context, field graphql.CollectedField, obj *model.NodeMetrics) (ret graphql.Marshaler) { fc, err := ec.fieldContext_NodeMetrics_host(ctx, field) if err != nil { @@ -10583,6 +11254,195 @@ func (ec *executionContext) fieldContext_NodeMetrics_metrics(_ context.Context, return fc, nil } +func (ec *executionContext) _NodeStateResultList_items(ctx context.Context, field graphql.CollectedField, obj *model.NodeStateResultList) (ret graphql.Marshaler) { + fc, err := ec.fieldContext_NodeStateResultList_items(ctx, field) + if err != nil { + return graphql.Null + } + ctx = graphql.WithFieldContext(ctx, fc) + defer func() { + if r := recover(); r != nil { + ec.Error(ctx, ec.Recover(ctx, r)) + ret = graphql.Null + } + }() + resTmp, err := ec.ResolverMiddleware(ctx, func(rctx context.Context) (any, error) { + ctx = rctx // use context from middleware stack in children + return obj.Items, nil + }) + if err != nil { + ec.Error(ctx, err) + return graphql.Null + } + if resTmp == nil { + if !graphql.HasFieldError(ctx, fc) { + ec.Errorf(ctx, "must not be null") + } + return graphql.Null + } + res := resTmp.([]*schema.Node) + fc.Result = res + return ec.marshalNNode2ᚕᚖgithubᚗcomᚋClusterCockpitᚋccᚑbackendᚋpkgᚋschemaᚐNodeᚄ(ctx, field.Selections, res) +} + +func (ec *executionContext) fieldContext_NodeStateResultList_items(_ context.Context, field graphql.CollectedField) (fc *graphql.FieldContext, err error) { + fc = &graphql.FieldContext{ + Object: "NodeStateResultList", + Field: field, + IsMethod: false, + IsResolver: false, + Child: func(ctx context.Context, field graphql.CollectedField) (*graphql.FieldContext, error) { + switch field.Name { + case "id": + return ec.fieldContext_Node_id(ctx, field) + case "hostname": + return ec.fieldContext_Node_hostname(ctx, field) + case "cluster": + return ec.fieldContext_Node_cluster(ctx, field) + case "subCluster": + return ec.fieldContext_Node_subCluster(ctx, field) + case "nodeState": + return ec.fieldContext_Node_nodeState(ctx, field) + case "HealthState": + return ec.fieldContext_Node_HealthState(ctx, field) + case "metaData": + return ec.fieldContext_Node_metaData(ctx, field) + } + return nil, fmt.Errorf("no field named %q was found under type Node", field.Name) + }, + } + return fc, nil +} + +func (ec *executionContext) _NodeStateResultList_count(ctx context.Context, field graphql.CollectedField, obj *model.NodeStateResultList) (ret graphql.Marshaler) { + fc, err := ec.fieldContext_NodeStateResultList_count(ctx, field) + if err != nil { + return graphql.Null + } + ctx = graphql.WithFieldContext(ctx, fc) + defer func() { + if r := recover(); r != nil { + ec.Error(ctx, ec.Recover(ctx, r)) + ret = graphql.Null + } + }() + resTmp, err := ec.ResolverMiddleware(ctx, func(rctx context.Context) (any, error) { + ctx = rctx // use context from middleware stack in children + return obj.Count, nil + }) + if err != nil { + ec.Error(ctx, err) + return graphql.Null + } + if resTmp == nil { + return graphql.Null + } + res := resTmp.(*int) + fc.Result = res + return ec.marshalOInt2ᚖint(ctx, field.Selections, res) +} + +func (ec *executionContext) fieldContext_NodeStateResultList_count(_ context.Context, field graphql.CollectedField) (fc *graphql.FieldContext, 
err error) { + fc = &graphql.FieldContext{ + Object: "NodeStateResultList", + Field: field, + IsMethod: false, + IsResolver: false, + Child: func(ctx context.Context, field graphql.CollectedField) (*graphql.FieldContext, error) { + return nil, errors.New("field of type Int does not have child fields") + }, + } + return fc, nil +} + +func (ec *executionContext) _NodeStats_state(ctx context.Context, field graphql.CollectedField, obj *model.NodeStats) (ret graphql.Marshaler) { + fc, err := ec.fieldContext_NodeStats_state(ctx, field) + if err != nil { + return graphql.Null + } + ctx = graphql.WithFieldContext(ctx, fc) + defer func() { + if r := recover(); r != nil { + ec.Error(ctx, ec.Recover(ctx, r)) + ret = graphql.Null + } + }() + resTmp, err := ec.ResolverMiddleware(ctx, func(rctx context.Context) (any, error) { + ctx = rctx // use context from middleware stack in children + return obj.State, nil + }) + if err != nil { + ec.Error(ctx, err) + return graphql.Null + } + if resTmp == nil { + if !graphql.HasFieldError(ctx, fc) { + ec.Errorf(ctx, "must not be null") + } + return graphql.Null + } + res := resTmp.(string) + fc.Result = res + return ec.marshalNString2string(ctx, field.Selections, res) +} + +func (ec *executionContext) fieldContext_NodeStats_state(_ context.Context, field graphql.CollectedField) (fc *graphql.FieldContext, err error) { + fc = &graphql.FieldContext{ + Object: "NodeStats", + Field: field, + IsMethod: false, + IsResolver: false, + Child: func(ctx context.Context, field graphql.CollectedField) (*graphql.FieldContext, error) { + return nil, errors.New("field of type String does not have child fields") + }, + } + return fc, nil +} + +func (ec *executionContext) _NodeStats_count(ctx context.Context, field graphql.CollectedField, obj *model.NodeStats) (ret graphql.Marshaler) { + fc, err := ec.fieldContext_NodeStats_count(ctx, field) + if err != nil { + return graphql.Null + } + ctx = graphql.WithFieldContext(ctx, fc) + defer func() { + if r := recover(); r != nil { + ec.Error(ctx, ec.Recover(ctx, r)) + ret = graphql.Null + } + }() + resTmp, err := ec.ResolverMiddleware(ctx, func(rctx context.Context) (any, error) { + ctx = rctx // use context from middleware stack in children + return obj.Count, nil + }) + if err != nil { + ec.Error(ctx, err) + return graphql.Null + } + if resTmp == nil { + if !graphql.HasFieldError(ctx, fc) { + ec.Errorf(ctx, "must not be null") + } + return graphql.Null + } + res := resTmp.(int) + fc.Result = res + return ec.marshalNInt2int(ctx, field.Selections, res) +} + +func (ec *executionContext) fieldContext_NodeStats_count(_ context.Context, field graphql.CollectedField) (fc *graphql.FieldContext, err error) { + fc = &graphql.FieldContext{ + Object: "NodeStats", + Field: field, + IsMethod: false, + IsResolver: false, + Child: func(ctx context.Context, field graphql.CollectedField) (*graphql.FieldContext, error) { + return nil, errors.New("field of type Int does not have child fields") + }, + } + return fc, nil +} + func (ec *executionContext) _NodesResultList_items(ctx context.Context, field graphql.CollectedField, obj *model.NodesResultList) (ret graphql.Marshaler) { fc, err := ec.fieldContext_NodesResultList_items(ctx, field) if err != nil { @@ -11123,6 +11983,196 @@ func (ec *executionContext) fieldContext_Query_allocatedNodes(ctx context.Contex return fc, nil } +func (ec *executionContext) _Query_node(ctx context.Context, field graphql.CollectedField) (ret graphql.Marshaler) { + fc, err := ec.fieldContext_Query_node(ctx, field) + if err != nil { 
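+	// This generated entry point only dispatches to ec.resolvers; the
+	// hand-written side must satisfy QueryResolver.Node as declared earlier.
+	// A minimal sketch (the repository accessor is an assumed name, not
+	// confirmed by this diff):
+	//
+	//	func (r *queryResolver) Node(ctx context.Context, id string) (*schema.Node, error) {
+	//		nid, err := strconv.ParseInt(id, 10, 64)
+	//		if err != nil {
+	//			return nil, err
+	//		}
+	//		return repository.GetNodeRepository().GetNode(ctx, nid)
+	//	}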
+ return graphql.Null + } + ctx = graphql.WithFieldContext(ctx, fc) + defer func() { + if r := recover(); r != nil { + ec.Error(ctx, ec.Recover(ctx, r)) + ret = graphql.Null + } + }() + resTmp, err := ec.ResolverMiddleware(ctx, func(rctx context.Context) (any, error) { + ctx = rctx // use context from middleware stack in children + return ec.resolvers.Query().Node(rctx, fc.Args["id"].(string)) + }) + if err != nil { + ec.Error(ctx, err) + return graphql.Null + } + if resTmp == nil { + return graphql.Null + } + res := resTmp.(*schema.Node) + fc.Result = res + return ec.marshalONode2ᚖgithubᚗcomᚋClusterCockpitᚋccᚑbackendᚋpkgᚋschemaᚐNode(ctx, field.Selections, res) +} + +func (ec *executionContext) fieldContext_Query_node(ctx context.Context, field graphql.CollectedField) (fc *graphql.FieldContext, err error) { + fc = &graphql.FieldContext{ + Object: "Query", + Field: field, + IsMethod: true, + IsResolver: true, + Child: func(ctx context.Context, field graphql.CollectedField) (*graphql.FieldContext, error) { + switch field.Name { + case "id": + return ec.fieldContext_Node_id(ctx, field) + case "hostname": + return ec.fieldContext_Node_hostname(ctx, field) + case "cluster": + return ec.fieldContext_Node_cluster(ctx, field) + case "subCluster": + return ec.fieldContext_Node_subCluster(ctx, field) + case "nodeState": + return ec.fieldContext_Node_nodeState(ctx, field) + case "HealthState": + return ec.fieldContext_Node_HealthState(ctx, field) + case "metaData": + return ec.fieldContext_Node_metaData(ctx, field) + } + return nil, fmt.Errorf("no field named %q was found under type Node", field.Name) + }, + } + defer func() { + if r := recover(); r != nil { + err = ec.Recover(ctx, r) + ec.Error(ctx, err) + } + }() + ctx = graphql.WithFieldContext(ctx, fc) + if fc.Args, err = ec.field_Query_node_args(ctx, field.ArgumentMap(ec.Variables)); err != nil { + ec.Error(ctx, err) + return fc, err + } + return fc, nil +} + +func (ec *executionContext) _Query_nodes(ctx context.Context, field graphql.CollectedField) (ret graphql.Marshaler) { + fc, err := ec.fieldContext_Query_nodes(ctx, field) + if err != nil { + return graphql.Null + } + ctx = graphql.WithFieldContext(ctx, fc) + defer func() { + if r := recover(); r != nil { + ec.Error(ctx, ec.Recover(ctx, r)) + ret = graphql.Null + } + }() + resTmp, err := ec.ResolverMiddleware(ctx, func(rctx context.Context) (any, error) { + ctx = rctx // use context from middleware stack in children + return ec.resolvers.Query().Nodes(rctx, fc.Args["filter"].([]*model.NodeFilter), fc.Args["order"].(*model.OrderByInput)) + }) + if err != nil { + ec.Error(ctx, err) + return graphql.Null + } + if resTmp == nil { + if !graphql.HasFieldError(ctx, fc) { + ec.Errorf(ctx, "must not be null") + } + return graphql.Null + } + res := resTmp.(*model.NodeStateResultList) + fc.Result = res + return ec.marshalNNodeStateResultList2ᚖgithubᚗcomᚋClusterCockpitᚋccᚑbackendᚋinternalᚋgraphᚋmodelᚐNodeStateResultList(ctx, field.Selections, res) +} + +func (ec *executionContext) fieldContext_Query_nodes(ctx context.Context, field graphql.CollectedField) (fc *graphql.FieldContext, err error) { + fc = &graphql.FieldContext{ + Object: "Query", + Field: field, + IsMethod: true, + IsResolver: true, + Child: func(ctx context.Context, field graphql.CollectedField) (*graphql.FieldContext, error) { + switch field.Name { + case "items": + return ec.fieldContext_NodeStateResultList_items(ctx, field) + case "count": + return ec.fieldContext_NodeStateResultList_count(ctx, field) + } + return nil, fmt.Errorf("no 
field named %q was found under type NodeStateResultList", field.Name) + }, + } + defer func() { + if r := recover(); r != nil { + err = ec.Recover(ctx, r) + ec.Error(ctx, err) + } + }() + ctx = graphql.WithFieldContext(ctx, fc) + if fc.Args, err = ec.field_Query_nodes_args(ctx, field.ArgumentMap(ec.Variables)); err != nil { + ec.Error(ctx, err) + return fc, err + } + return fc, nil +} + +func (ec *executionContext) _Query_nodeStats(ctx context.Context, field graphql.CollectedField) (ret graphql.Marshaler) { + fc, err := ec.fieldContext_Query_nodeStats(ctx, field) + if err != nil { + return graphql.Null + } + ctx = graphql.WithFieldContext(ctx, fc) + defer func() { + if r := recover(); r != nil { + ec.Error(ctx, ec.Recover(ctx, r)) + ret = graphql.Null + } + }() + resTmp, err := ec.ResolverMiddleware(ctx, func(rctx context.Context) (any, error) { + ctx = rctx // use context from middleware stack in children + return ec.resolvers.Query().NodeStats(rctx, fc.Args["filter"].([]*model.NodeFilter)) + }) + if err != nil { + ec.Error(ctx, err) + return graphql.Null + } + if resTmp == nil { + if !graphql.HasFieldError(ctx, fc) { + ec.Errorf(ctx, "must not be null") + } + return graphql.Null + } + res := resTmp.([]*model.NodeStats) + fc.Result = res + return ec.marshalNNodeStats2ᚕᚖgithubᚗcomᚋClusterCockpitᚋccᚑbackendᚋinternalᚋgraphᚋmodelᚐNodeStatsᚄ(ctx, field.Selections, res) +} + +func (ec *executionContext) fieldContext_Query_nodeStats(ctx context.Context, field graphql.CollectedField) (fc *graphql.FieldContext, err error) { + fc = &graphql.FieldContext{ + Object: "Query", + Field: field, + IsMethod: true, + IsResolver: true, + Child: func(ctx context.Context, field graphql.CollectedField) (*graphql.FieldContext, error) { + switch field.Name { + case "state": + return ec.fieldContext_NodeStats_state(ctx, field) + case "count": + return ec.fieldContext_NodeStats_count(ctx, field) + } + return nil, fmt.Errorf("no field named %q was found under type NodeStats", field.Name) + }, + } + defer func() { + if r := recover(); r != nil { + err = ec.Recover(ctx, r) + ec.Error(ctx, err) + } + }() + ctx = graphql.WithFieldContext(ctx, fc) + if fc.Args, err = ec.field_Query_nodeStats_args(ctx, field.ArgumentMap(ec.Variables)); err != nil { + ec.Error(ctx, err) + return fc, err + } + return fc, nil +} + func (ec *executionContext) _Query_job(ctx context.Context, field graphql.CollectedField) (ret graphql.Marshaler) { fc, err := ec.fieldContext_Query_job(ctx, field) if err != nil { @@ -16689,6 +17739,61 @@ func (ec *executionContext) unmarshalInputMetricStatItem(ctx context.Context, ob return it, nil } +func (ec *executionContext) unmarshalInputNodeFilter(ctx context.Context, obj any) (model.NodeFilter, error) { + var it model.NodeFilter + asMap := map[string]any{} + for k, v := range obj.(map[string]any) { + asMap[k] = v + } + + fieldsInOrder := [...]string{"hostname", "cluster", "subCluster", "nodeState", "healthState"} + for _, k := range fieldsInOrder { + v, ok := asMap[k] + if !ok { + continue + } + switch k { + case "hostname": + ctx := graphql.WithPathContext(ctx, graphql.NewPathWithField("hostname")) + data, err := ec.unmarshalOStringInput2ᚖgithubᚗcomᚋClusterCockpitᚋccᚑbackendᚋinternalᚋgraphᚋmodelᚐStringInput(ctx, v) + if err != nil { + return it, err + } + it.Hostname = data + case "cluster": + ctx := graphql.WithPathContext(ctx, graphql.NewPathWithField("cluster")) + data, err := ec.unmarshalOStringInput2ᚖgithubᚗcomᚋClusterCockpitᚋccᚑbackendᚋinternalᚋgraphᚋmodelᚐStringInput(ctx, v) + if err != nil { + 
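+			// NodeFilter reuses the StringInput pattern from JobFilter, so a
+			// filter can also be built programmatically. A small sketch
+			// (the cluster name is an example value):
+			//
+			//	eq := "fritz"
+			//	filter := []*model.NodeFilter{
+			//		{Cluster: &model.StringInput{Eq: &eq}},
+			//	}
+			//	res, err := ec.resolvers.Query().Nodes(ctx, filter, nil)
+			//	// res.Items -> []*schema.Node, res.Count -> *int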
return it, err + } + it.Cluster = data + case "subCluster": + ctx := graphql.WithPathContext(ctx, graphql.NewPathWithField("subCluster")) + data, err := ec.unmarshalOStringInput2ᚖgithubᚗcomᚋClusterCockpitᚋccᚑbackendᚋinternalᚋgraphᚋmodelᚐStringInput(ctx, v) + if err != nil { + return it, err + } + it.SubCluster = data + case "nodeState": + ctx := graphql.WithPathContext(ctx, graphql.NewPathWithField("nodeState")) + data, err := ec.unmarshalONodeState2ᚖstring(ctx, v) + if err != nil { + return it, err + } + it.NodeState = data + case "healthState": + ctx := graphql.WithPathContext(ctx, graphql.NewPathWithField("healthState")) + data, err := ec.unmarshalOMonitoringState2ᚖgithubᚗcomᚋClusterCockpitᚋccᚑbackendᚋpkgᚋschemaᚐNodeState(ctx, v) + if err != nil { + return it, err + } + it.HealthState = data + } + } + + return it, nil +} + func (ec *executionContext) unmarshalInputOrderByInput(ctx context.Context, obj any) (model.OrderByInput, error) { var it model.OrderByInput asMap := map[string]any{} @@ -17424,10 +18529,41 @@ func (ec *executionContext) _Job(ctx context.Context, sel ast.SelectionSet, obj atomic.AddUint32(&out.Invalids, 1) } case "startTime": - out.Values[i] = ec._Job_startTime(ctx, field, obj) - if out.Values[i] == graphql.Null { - atomic.AddUint32(&out.Invalids, 1) + field := field + + innerFunc := func(ctx context.Context, fs *graphql.FieldSet) (res graphql.Marshaler) { + defer func() { + if r := recover(); r != nil { + ec.Error(ctx, ec.Recover(ctx, r)) + } + }() + res = ec._Job_startTime(ctx, field, obj) + if res == graphql.Null { + atomic.AddUint32(&fs.Invalids, 1) + } + return res } + + if field.Deferrable != nil { + dfs, ok := deferred[field.Deferrable.Label] + di := 0 + if ok { + dfs.AddField(field) + di = len(dfs.Values) - 1 + } else { + dfs = graphql.NewFieldSet([]graphql.CollectedField{field}) + deferred[field.Deferrable.Label] = dfs + } + dfs.Concurrently(di, func(ctx context.Context) graphql.Marshaler { + return innerFunc(ctx, dfs) + }) + + // don't run the out.Concurrently() call below + out.Values[i] = graphql.Null + continue + } + + out.Concurrently(i, func(ctx context.Context) graphql.Marshaler { return innerFunc(ctx, out) }) case "duration": out.Values[i] = ec._Job_duration(ctx, field, obj) if out.Values[i] == graphql.Null { @@ -18662,6 +19798,165 @@ func (ec *executionContext) _NamedStatsWithScope(ctx context.Context, sel ast.Se return out } +var nodeImplementors = []string{"Node"} + +func (ec *executionContext) _Node(ctx context.Context, sel ast.SelectionSet, obj *schema.Node) graphql.Marshaler { + fields := graphql.CollectFields(ec.OperationContext, sel, nodeImplementors) + + out := graphql.NewFieldSet(fields) + deferred := make(map[string]*graphql.FieldSet) + for i, field := range fields { + switch field.Name { + case "__typename": + out.Values[i] = graphql.MarshalString("Node") + case "id": + out.Values[i] = ec._Node_id(ctx, field, obj) + if out.Values[i] == graphql.Null { + atomic.AddUint32(&out.Invalids, 1) + } + case "hostname": + out.Values[i] = ec._Node_hostname(ctx, field, obj) + if out.Values[i] == graphql.Null { + atomic.AddUint32(&out.Invalids, 1) + } + case "cluster": + out.Values[i] = ec._Node_cluster(ctx, field, obj) + if out.Values[i] == graphql.Null { + atomic.AddUint32(&out.Invalids, 1) + } + case "subCluster": + out.Values[i] = ec._Node_subCluster(ctx, field, obj) + if out.Values[i] == graphql.Null { + atomic.AddUint32(&out.Invalids, 1) + } + case "nodeState": + field := field + + innerFunc := func(ctx context.Context, fs *graphql.FieldSet) (res 
graphql.Marshaler) { + defer func() { + if r := recover(); r != nil { + ec.Error(ctx, ec.Recover(ctx, r)) + } + }() + res = ec._Node_nodeState(ctx, field, obj) + if res == graphql.Null { + atomic.AddUint32(&fs.Invalids, 1) + } + return res + } + + if field.Deferrable != nil { + dfs, ok := deferred[field.Deferrable.Label] + di := 0 + if ok { + dfs.AddField(field) + di = len(dfs.Values) - 1 + } else { + dfs = graphql.NewFieldSet([]graphql.CollectedField{field}) + deferred[field.Deferrable.Label] = dfs + } + dfs.Concurrently(di, func(ctx context.Context) graphql.Marshaler { + return innerFunc(ctx, dfs) + }) + + // don't run the out.Concurrently() call below + out.Values[i] = graphql.Null + continue + } + + out.Concurrently(i, func(ctx context.Context) graphql.Marshaler { return innerFunc(ctx, out) }) + case "HealthState": + field := field + + innerFunc := func(ctx context.Context, fs *graphql.FieldSet) (res graphql.Marshaler) { + defer func() { + if r := recover(); r != nil { + ec.Error(ctx, ec.Recover(ctx, r)) + } + }() + res = ec._Node_HealthState(ctx, field, obj) + if res == graphql.Null { + atomic.AddUint32(&fs.Invalids, 1) + } + return res + } + + if field.Deferrable != nil { + dfs, ok := deferred[field.Deferrable.Label] + di := 0 + if ok { + dfs.AddField(field) + di = len(dfs.Values) - 1 + } else { + dfs = graphql.NewFieldSet([]graphql.CollectedField{field}) + deferred[field.Deferrable.Label] = dfs + } + dfs.Concurrently(di, func(ctx context.Context) graphql.Marshaler { + return innerFunc(ctx, dfs) + }) + + // don't run the out.Concurrently() call below + out.Values[i] = graphql.Null + continue + } + + out.Concurrently(i, func(ctx context.Context) graphql.Marshaler { return innerFunc(ctx, out) }) + case "metaData": + field := field + + innerFunc := func(ctx context.Context, _ *graphql.FieldSet) (res graphql.Marshaler) { + defer func() { + if r := recover(); r != nil { + ec.Error(ctx, ec.Recover(ctx, r)) + } + }() + res = ec._Node_metaData(ctx, field, obj) + return res + } + + if field.Deferrable != nil { + dfs, ok := deferred[field.Deferrable.Label] + di := 0 + if ok { + dfs.AddField(field) + di = len(dfs.Values) - 1 + } else { + dfs = graphql.NewFieldSet([]graphql.CollectedField{field}) + deferred[field.Deferrable.Label] = dfs + } + dfs.Concurrently(di, func(ctx context.Context) graphql.Marshaler { + return innerFunc(ctx, dfs) + }) + + // don't run the out.Concurrently() call below + out.Values[i] = graphql.Null + continue + } + + out.Concurrently(i, func(ctx context.Context) graphql.Marshaler { return innerFunc(ctx, out) }) + default: + panic("unknown field " + strconv.Quote(field.Name)) + } + } + out.Dispatch(ctx) + if out.Invalids > 0 { + return graphql.Null + } + + atomic.AddInt32(&ec.deferred, int32(len(deferred))) + + for label, dfs := range deferred { + ec.processDeferredGroup(graphql.DeferredGroup{ + Label: label, + Path: graphql.GetPath(ctx), + FieldSet: dfs, + Context: ctx, + }) + } + + return out +} + var nodeMetricsImplementors = []string{"NodeMetrics"} func (ec *executionContext) _NodeMetrics(ctx context.Context, sel ast.SelectionSet, obj *model.NodeMetrics) graphql.Marshaler { @@ -18711,6 +20006,91 @@ func (ec *executionContext) _NodeMetrics(ctx context.Context, sel ast.SelectionS return out } +var nodeStateResultListImplementors = []string{"NodeStateResultList"} + +func (ec *executionContext) _NodeStateResultList(ctx context.Context, sel ast.SelectionSet, obj *model.NodeStateResultList) graphql.Marshaler { + fields := graphql.CollectFields(ec.OperationContext, sel, 
nodeStateResultListImplementors) + + out := graphql.NewFieldSet(fields) + deferred := make(map[string]*graphql.FieldSet) + for i, field := range fields { + switch field.Name { + case "__typename": + out.Values[i] = graphql.MarshalString("NodeStateResultList") + case "items": + out.Values[i] = ec._NodeStateResultList_items(ctx, field, obj) + if out.Values[i] == graphql.Null { + out.Invalids++ + } + case "count": + out.Values[i] = ec._NodeStateResultList_count(ctx, field, obj) + default: + panic("unknown field " + strconv.Quote(field.Name)) + } + } + out.Dispatch(ctx) + if out.Invalids > 0 { + return graphql.Null + } + + atomic.AddInt32(&ec.deferred, int32(len(deferred))) + + for label, dfs := range deferred { + ec.processDeferredGroup(graphql.DeferredGroup{ + Label: label, + Path: graphql.GetPath(ctx), + FieldSet: dfs, + Context: ctx, + }) + } + + return out +} + +var nodeStatsImplementors = []string{"NodeStats"} + +func (ec *executionContext) _NodeStats(ctx context.Context, sel ast.SelectionSet, obj *model.NodeStats) graphql.Marshaler { + fields := graphql.CollectFields(ec.OperationContext, sel, nodeStatsImplementors) + + out := graphql.NewFieldSet(fields) + deferred := make(map[string]*graphql.FieldSet) + for i, field := range fields { + switch field.Name { + case "__typename": + out.Values[i] = graphql.MarshalString("NodeStats") + case "state": + out.Values[i] = ec._NodeStats_state(ctx, field, obj) + if out.Values[i] == graphql.Null { + out.Invalids++ + } + case "count": + out.Values[i] = ec._NodeStats_count(ctx, field, obj) + if out.Values[i] == graphql.Null { + out.Invalids++ + } + default: + panic("unknown field " + strconv.Quote(field.Name)) + } + } + out.Dispatch(ctx) + if out.Invalids > 0 { + return graphql.Null + } + + atomic.AddInt32(&ec.deferred, int32(len(deferred))) + + for label, dfs := range deferred { + ec.processDeferredGroup(graphql.DeferredGroup{ + Label: label, + Path: graphql.GetPath(ctx), + FieldSet: dfs, + Context: ctx, + }) + } + + return out +} + var nodesResultListImplementors = []string{"NodesResultList"} func (ec *executionContext) _NodesResultList(ctx context.Context, sel ast.SelectionSet, obj *model.NodesResultList) graphql.Marshaler { @@ -18885,6 +20265,69 @@ func (ec *executionContext) _Query(ctx context.Context, sel ast.SelectionSet) gr func(ctx context.Context) graphql.Marshaler { return innerFunc(ctx, out) }) } + out.Concurrently(i, func(ctx context.Context) graphql.Marshaler { return rrm(innerCtx) }) + case "node": + field := field + + innerFunc := func(ctx context.Context, _ *graphql.FieldSet) (res graphql.Marshaler) { + defer func() { + if r := recover(); r != nil { + ec.Error(ctx, ec.Recover(ctx, r)) + } + }() + res = ec._Query_node(ctx, field) + return res + } + + rrm := func(ctx context.Context) graphql.Marshaler { + return ec.OperationContext.RootResolverMiddleware(ctx, + func(ctx context.Context) graphql.Marshaler { return innerFunc(ctx, out) }) + } + + out.Concurrently(i, func(ctx context.Context) graphql.Marshaler { return rrm(innerCtx) }) + case "nodes": + field := field + + innerFunc := func(ctx context.Context, fs *graphql.FieldSet) (res graphql.Marshaler) { + defer func() { + if r := recover(); r != nil { + ec.Error(ctx, ec.Recover(ctx, r)) + } + }() + res = ec._Query_nodes(ctx, field) + if res == graphql.Null { + atomic.AddUint32(&fs.Invalids, 1) + } + return res + } + + rrm := func(ctx context.Context) graphql.Marshaler { + return ec.OperationContext.RootResolverMiddleware(ctx, + func(ctx context.Context) graphql.Marshaler { return 
innerFunc(ctx, out) }) + } + + out.Concurrently(i, func(ctx context.Context) graphql.Marshaler { return rrm(innerCtx) }) + case "nodeStats": + field := field + + innerFunc := func(ctx context.Context, fs *graphql.FieldSet) (res graphql.Marshaler) { + defer func() { + if r := recover(); r != nil { + ec.Error(ctx, ec.Recover(ctx, r)) + } + }() + res = ec._Query_nodeStats(ctx, field) + if res == graphql.Null { + atomic.AddUint32(&fs.Invalids, 1) + } + return res + } + + rrm := func(ctx context.Context) graphql.Marshaler { + return ec.OperationContext.RootResolverMiddleware(ctx, + func(ctx context.Context) graphql.Marshaler { return innerFunc(ctx, out) }) + } + out.Concurrently(i, func(ctx context.Context) graphql.Marshaler { return rrm(innerCtx) }) case "job": field := field @@ -20580,6 +22023,27 @@ func (ec *executionContext) marshalNID2ᚕstringᚄ(ctx context.Context, sel ast return ret } +func (ec *executionContext) unmarshalNID2ᚖint64(ctx context.Context, v any) (*int64, error) { + res, err := graphql.UnmarshalInt64(v) + return &res, graphql.ErrorOnPath(ctx, err) +} + +func (ec *executionContext) marshalNID2ᚖint64(ctx context.Context, sel ast.SelectionSet, v *int64) graphql.Marshaler { + if v == nil { + if !graphql.HasFieldError(ctx, graphql.GetFieldContext(ctx)) { + ec.Errorf(ctx, "the requested element is null which the schema does not allow") + } + return graphql.Null + } + res := graphql.MarshalInt64(*v) + if res == graphql.Null { + if !graphql.HasFieldError(ctx, graphql.GetFieldContext(ctx)) { + ec.Errorf(ctx, "the requested element is null which the schema does not allow") + } + } + return res +} + func (ec *executionContext) unmarshalNInt2int(ctx context.Context, v any) (int, error) { res, err := graphql.UnmarshalInt(v) return res, graphql.ErrorOnPath(ctx, err) @@ -21231,6 +22695,22 @@ func (ec *executionContext) marshalNMetricValue2githubᚗcomᚋClusterCockpitᚋ return ec._MetricValue(ctx, sel, &v) } +func (ec *executionContext) unmarshalNMonitoringState2githubᚗcomᚋClusterCockpitᚋccᚑbackendᚋpkgᚋschemaᚐNodeState(ctx context.Context, v any) (schema.NodeState, error) { + tmp, err := graphql.UnmarshalString(v) + res := schema.NodeState(tmp) + return res, graphql.ErrorOnPath(ctx, err) +} + +func (ec *executionContext) marshalNMonitoringState2githubᚗcomᚋClusterCockpitᚋccᚑbackendᚋpkgᚋschemaᚐNodeState(ctx context.Context, sel ast.SelectionSet, v schema.NodeState) graphql.Marshaler { + res := graphql.MarshalString(string(v)) + if res == graphql.Null { + if !graphql.HasFieldError(ctx, graphql.GetFieldContext(ctx)) { + ec.Errorf(ctx, "the requested element is null which the schema does not allow") + } + } + return res +} + func (ec *executionContext) marshalNNamedStats2ᚕᚖgithubᚗcomᚋClusterCockpitᚋccᚑbackendᚋinternalᚋgraphᚋmodelᚐNamedStatsᚄ(ctx context.Context, sel ast.SelectionSet, v []*model.NamedStats) graphql.Marshaler { ret := make(graphql.Array, len(v)) var wg sync.WaitGroup @@ -21339,6 +22819,65 @@ func (ec *executionContext) marshalNNamedStatsWithScope2ᚖgithubᚗcomᚋCluste return ec._NamedStatsWithScope(ctx, sel, v) } +func (ec *executionContext) marshalNNode2ᚕᚖgithubᚗcomᚋClusterCockpitᚋccᚑbackendᚋpkgᚋschemaᚐNodeᚄ(ctx context.Context, sel ast.SelectionSet, v []*schema.Node) graphql.Marshaler { + ret := make(graphql.Array, len(v)) + var wg sync.WaitGroup + isLen1 := len(v) == 1 + if !isLen1 { + wg.Add(len(v)) + } + for i := range v { + i := i + fc := &graphql.FieldContext{ + Index: &i, + Result: &v[i], + } + ctx := graphql.WithFieldContext(ctx, fc) + f := func(i int) { + defer func() { + if r 
:= recover(); r != nil { + ec.Error(ctx, ec.Recover(ctx, r)) + ret = nil + } + }() + if !isLen1 { + defer wg.Done() + } + ret[i] = ec.marshalNNode2ᚖgithubᚗcomᚋClusterCockpitᚋccᚑbackendᚋpkgᚋschemaᚐNode(ctx, sel, v[i]) + } + if isLen1 { + f(i) + } else { + go f(i) + } + + } + wg.Wait() + + for _, e := range ret { + if e == graphql.Null { + return graphql.Null + } + } + + return ret +} + +func (ec *executionContext) marshalNNode2ᚖgithubᚗcomᚋClusterCockpitᚋccᚑbackendᚋpkgᚋschemaᚐNode(ctx context.Context, sel ast.SelectionSet, v *schema.Node) graphql.Marshaler { + if v == nil { + if !graphql.HasFieldError(ctx, graphql.GetFieldContext(ctx)) { + ec.Errorf(ctx, "the requested element is null which the schema does not allow") + } + return graphql.Null + } + return ec._Node(ctx, sel, v) +} + +func (ec *executionContext) unmarshalNNodeFilter2ᚖgithubᚗcomᚋClusterCockpitᚋccᚑbackendᚋinternalᚋgraphᚋmodelᚐNodeFilter(ctx context.Context, v any) (*model.NodeFilter, error) { + res, err := ec.unmarshalInputNodeFilter(ctx, v) + return &res, graphql.ErrorOnPath(ctx, err) +} + func (ec *executionContext) marshalNNodeMetrics2ᚕᚖgithubᚗcomᚋClusterCockpitᚋccᚑbackendᚋinternalᚋgraphᚋmodelᚐNodeMetricsᚄ(ctx context.Context, sel ast.SelectionSet, v []*model.NodeMetrics) graphql.Marshaler { ret := make(graphql.Array, len(v)) var wg sync.WaitGroup @@ -21393,6 +22932,89 @@ func (ec *executionContext) marshalNNodeMetrics2ᚖgithubᚗcomᚋClusterCockpit return ec._NodeMetrics(ctx, sel, v) } +func (ec *executionContext) unmarshalNNodeState2string(ctx context.Context, v any) (string, error) { + res, err := graphql.UnmarshalString(v) + return res, graphql.ErrorOnPath(ctx, err) +} + +func (ec *executionContext) marshalNNodeState2string(ctx context.Context, sel ast.SelectionSet, v string) graphql.Marshaler { + res := graphql.MarshalString(v) + if res == graphql.Null { + if !graphql.HasFieldError(ctx, graphql.GetFieldContext(ctx)) { + ec.Errorf(ctx, "the requested element is null which the schema does not allow") + } + } + return res +} + +func (ec *executionContext) marshalNNodeStateResultList2githubᚗcomᚋClusterCockpitᚋccᚑbackendᚋinternalᚋgraphᚋmodelᚐNodeStateResultList(ctx context.Context, sel ast.SelectionSet, v model.NodeStateResultList) graphql.Marshaler { + return ec._NodeStateResultList(ctx, sel, &v) +} + +func (ec *executionContext) marshalNNodeStateResultList2ᚖgithubᚗcomᚋClusterCockpitᚋccᚑbackendᚋinternalᚋgraphᚋmodelᚐNodeStateResultList(ctx context.Context, sel ast.SelectionSet, v *model.NodeStateResultList) graphql.Marshaler { + if v == nil { + if !graphql.HasFieldError(ctx, graphql.GetFieldContext(ctx)) { + ec.Errorf(ctx, "the requested element is null which the schema does not allow") + } + return graphql.Null + } + return ec._NodeStateResultList(ctx, sel, v) +} + +func (ec *executionContext) marshalNNodeStats2ᚕᚖgithubᚗcomᚋClusterCockpitᚋccᚑbackendᚋinternalᚋgraphᚋmodelᚐNodeStatsᚄ(ctx context.Context, sel ast.SelectionSet, v []*model.NodeStats) graphql.Marshaler { + ret := make(graphql.Array, len(v)) + var wg sync.WaitGroup + isLen1 := len(v) == 1 + if !isLen1 { + wg.Add(len(v)) + } + for i := range v { + i := i + fc := &graphql.FieldContext{ + Index: &i, + Result: &v[i], + } + ctx := graphql.WithFieldContext(ctx, fc) + f := func(i int) { + defer func() { + if r := recover(); r != nil { + ec.Error(ctx, ec.Recover(ctx, r)) + ret = nil + } + }() + if !isLen1 { + defer wg.Done() + } + ret[i] = ec.marshalNNodeStats2ᚖgithubᚗcomᚋClusterCockpitᚋccᚑbackendᚋinternalᚋgraphᚋmodelᚐNodeStats(ctx, sel, v[i]) + } + if isLen1 { + f(i) + } else 
{ + go f(i) + } + + } + wg.Wait() + + for _, e := range ret { + if e == graphql.Null { + return graphql.Null + } + } + + return ret +} + +func (ec *executionContext) marshalNNodeStats2ᚖgithubᚗcomᚋClusterCockpitᚋccᚑbackendᚋinternalᚋgraphᚋmodelᚐNodeStats(ctx context.Context, sel ast.SelectionSet, v *model.NodeStats) graphql.Marshaler { + if v == nil { + if !graphql.HasFieldError(ctx, graphql.GetFieldContext(ctx)) { + ec.Errorf(ctx, "the requested element is null which the schema does not allow") + } + return graphql.Null + } + return ec._NodeStats(ctx, sel, v) +} + func (ec *executionContext) marshalNNodesResultList2githubᚗcomᚋClusterCockpitᚋccᚑbackendᚋinternalᚋgraphᚋmodelᚐNodesResultList(ctx context.Context, sel ast.SelectionSet, v model.NodesResultList) graphql.Marshaler { return ec._NodesResultList(ctx, sel, &v) } @@ -21799,6 +23421,27 @@ func (ec *executionContext) marshalNTime2timeᚐTime(ctx context.Context, sel as return res } +func (ec *executionContext) unmarshalNTime2ᚖtimeᚐTime(ctx context.Context, v any) (*time.Time, error) { + res, err := graphql.UnmarshalTime(v) + return &res, graphql.ErrorOnPath(ctx, err) +} + +func (ec *executionContext) marshalNTime2ᚖtimeᚐTime(ctx context.Context, sel ast.SelectionSet, v *time.Time) graphql.Marshaler { + if v == nil { + if !graphql.HasFieldError(ctx, graphql.GetFieldContext(ctx)) { + ec.Errorf(ctx, "the requested element is null which the schema does not allow") + } + return graphql.Null + } + res := graphql.MarshalTime(*v) + if res == graphql.Null { + if !graphql.HasFieldError(ctx, graphql.GetFieldContext(ctx)) { + ec.Errorf(ctx, "the requested element is null which the schema does not allow") + } + } + return res +} + func (ec *executionContext) marshalNTimeWeights2ᚖgithubᚗcomᚋClusterCockpitᚋccᚑbackendᚋinternalᚋgraphᚋmodelᚐTimeWeights(ctx context.Context, sel ast.SelectionSet, v *model.TimeWeights) graphql.Marshaler { if v == nil { if !graphql.HasFieldError(ctx, graphql.GetFieldContext(ctx)) { @@ -22653,6 +24296,66 @@ func (ec *executionContext) marshalOMetricStatistics2githubᚗcomᚋClusterCockp return ec._MetricStatistics(ctx, sel, &v) } +func (ec *executionContext) unmarshalOMonitoringState2ᚖgithubᚗcomᚋClusterCockpitᚋccᚑbackendᚋpkgᚋschemaᚐNodeState(ctx context.Context, v any) (*schema.NodeState, error) { + if v == nil { + return nil, nil + } + tmp, err := graphql.UnmarshalString(v) + res := schema.NodeState(tmp) + return &res, graphql.ErrorOnPath(ctx, err) +} + +func (ec *executionContext) marshalOMonitoringState2ᚖgithubᚗcomᚋClusterCockpitᚋccᚑbackendᚋpkgᚋschemaᚐNodeState(ctx context.Context, sel ast.SelectionSet, v *schema.NodeState) graphql.Marshaler { + if v == nil { + return graphql.Null + } + res := graphql.MarshalString(string(*v)) + return res +} + +func (ec *executionContext) marshalONode2ᚖgithubᚗcomᚋClusterCockpitᚋccᚑbackendᚋpkgᚋschemaᚐNode(ctx context.Context, sel ast.SelectionSet, v *schema.Node) graphql.Marshaler { + if v == nil { + return graphql.Null + } + return ec._Node(ctx, sel, v) +} + +func (ec *executionContext) unmarshalONodeFilter2ᚕᚖgithubᚗcomᚋClusterCockpitᚋccᚑbackendᚋinternalᚋgraphᚋmodelᚐNodeFilterᚄ(ctx context.Context, v any) ([]*model.NodeFilter, error) { + if v == nil { + return nil, nil + } + var vSlice []any + if v != nil { + vSlice = graphql.CoerceList(v) + } + var err error + res := make([]*model.NodeFilter, len(vSlice)) + for i := range vSlice { + ctx := graphql.WithPathContext(ctx, graphql.NewPathWithIndex(i)) + res[i], err = 
ec.unmarshalNNodeFilter2ᚖgithubᚗcomᚋClusterCockpitᚋccᚑbackendᚋinternalᚋgraphᚋmodelᚐNodeFilter(ctx, vSlice[i]) + if err != nil { + return nil, err + } + } + return res, nil +} + +func (ec *executionContext) unmarshalONodeState2ᚖstring(ctx context.Context, v any) (*string, error) { + if v == nil { + return nil, nil + } + res, err := graphql.UnmarshalString(v) + return &res, graphql.ErrorOnPath(ctx, err) +} + +func (ec *executionContext) marshalONodeState2ᚖstring(ctx context.Context, sel ast.SelectionSet, v *string) graphql.Marshaler { + if v == nil { + return graphql.Null + } + res := graphql.MarshalString(*v) + return res +} + func (ec *executionContext) unmarshalOOrderByInput2ᚖgithubᚗcomᚋClusterCockpitᚋccᚑbackendᚋinternalᚋgraphᚋmodelᚐOrderByInput(ctx context.Context, v any) (*model.OrderByInput, error) { if v == nil { return nil, nil diff --git a/internal/graph/model/models_gen.go b/internal/graph/model/models_gen.go index 5c50ff9..fc05280 100644 --- a/internal/graph/model/models_gen.go +++ b/internal/graph/model/models_gen.go @@ -167,12 +167,30 @@ type NamedStatsWithScope struct { Stats []*ScopedStats `json:"stats"` } +type NodeFilter struct { + Hostname *StringInput `json:"hostname,omitempty"` + Cluster *StringInput `json:"cluster,omitempty"` + SubCluster *StringInput `json:"subCluster,omitempty"` + NodeState *string `json:"nodeState,omitempty"` + HealthState *schema.NodeState `json:"healthState,omitempty"` +} + type NodeMetrics struct { Host string `json:"host"` SubCluster string `json:"subCluster"` Metrics []*JobMetricWithName `json:"metrics"` } +type NodeStateResultList struct { + Items []*schema.Node `json:"items"` + Count *int `json:"count,omitempty"` +} + +type NodeStats struct { + State string `json:"state"` + Count int `json:"count"` +} + type NodesResultList struct { Items []*NodeMetrics `json:"items"` Offset *int `json:"offset,omitempty"` diff --git a/internal/graph/schema.resolvers.go b/internal/graph/schema.resolvers.go index f3fc389..af167b4 100644 --- a/internal/graph/schema.resolvers.go +++ b/internal/graph/schema.resolvers.go @@ -29,9 +29,14 @@ func (r *clusterResolver) Partitions(ctx context.Context, obj *schema.Cluster) ( return r.Repo.Partitions(obj.Name) } +// StartTime is the resolver for the startTime field. +func (r *jobResolver) StartTime(ctx context.Context, obj *schema.Job) (*time.Time, error) { + panic(fmt.Errorf("not implemented: StartTime - startTime")) +} + // Tags is the resolver for the tags field. func (r *jobResolver) Tags(ctx context.Context, obj *schema.Job) ([]*schema.Tag, error) { - return r.Repo.GetTags(repository.GetUserFromContext(ctx), &obj.ID) + return r.Repo.GetTags(repository.GetUserFromContext(ctx), obj.ID) } // ConcurrentJobs is the resolver for the concurrentJobs field. 
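Aside (editorial, not part of the patch): the generated StartTime stub above panics until a real resolver is supplied. A minimal sketch of one possible implementation, assuming schema.Job.StartTime now holds the start time as unix seconds in an int64 (as the cc-metric-store and Prometheus hunks later in this diff suggest), placed in schema.resolvers.go where context, time, and schema are already imported:

// Hypothetical implementation of the StartTime resolver stub above:
// convert the stored unix timestamp (seconds) into the *time.Time
// value the GraphQL layer expects.
func (r *jobResolver) StartTime(ctx context.Context, obj *schema.Job) (*time.Time, error) {
	t := time.Unix(obj.StartTime, 0) // obj.StartTime assumed to be unix seconds (int64)
	return &t, nil
}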
@@ -143,7 +148,7 @@ func (r *mutationResolver) CreateTag(ctx context.Context, typeArg string, name s return &schema.Tag{ID: id, Type: typeArg, Name: name, Scope: scope}, nil } else { log.Warnf("Not authorized to create tag with scope: %s", scope) - return nil, fmt.Errorf("Not authorized to create tag with scope: %s", scope) + return nil, fmt.Errorf("not authorized to create tag with scope: %s", scope) } } @@ -179,7 +184,7 @@ func (r *mutationResolver) AddTagsToJob(ctx context.Context, job string, tagIds _, _, tscope, exists := r.Repo.TagInfo(tid) if !exists { log.Warnf("Tag does not exist (ID): %d", tid) - return nil, fmt.Errorf("Tag does not exist (ID): %d", tid) + return nil, fmt.Errorf("tag does not exist (ID): %d", tid) } // Test Access: Admins && Admin Tag OR Support/Admin and Global Tag OR Everyone && Private Tag @@ -193,7 +198,7 @@ func (r *mutationResolver) AddTagsToJob(ctx context.Context, job string, tagIds } } else { log.Warnf("Not authorized to add tag: %d", tid) - return nil, fmt.Errorf("Not authorized to add tag: %d", tid) + return nil, fmt.Errorf("not authorized to add tag: %d", tid) } } @@ -226,7 +231,7 @@ func (r *mutationResolver) RemoveTagsFromJob(ctx context.Context, job string, ta _, _, tscope, exists := r.Repo.TagInfo(tid) if !exists { log.Warnf("Tag does not exist (ID): %d", tid) - return nil, fmt.Errorf("Tag does not exist (ID): %d", tid) + return nil, fmt.Errorf("tag does not exist (ID): %d", tid) } // Test Access: Admins && Admin Tag OR Support/Admin and Global Tag OR Everyone && Private Tag @@ -240,7 +245,7 @@ func (r *mutationResolver) RemoveTagsFromJob(ctx context.Context, job string, ta } } else { log.Warnf("Not authorized to remove tag: %d", tid) - return nil, fmt.Errorf("Not authorized to remove tag: %d", tid) + return nil, fmt.Errorf("not authorized to remove tag: %d", tid) } } @@ -269,7 +274,7 @@ func (r *mutationResolver) RemoveTagFromList(ctx context.Context, tagIds []strin _, _, tscope, exists := r.Repo.TagInfo(tid) if !exists { log.Warnf("Tag does not exist (ID): %d", tid) - return nil, fmt.Errorf("Tag does not exist (ID): %d", tid) + return nil, fmt.Errorf("tag does not exist (ID): %d", tid) } // Test Access: Admins && Admin Tag OR Everyone && Private Tag @@ -283,7 +288,7 @@ func (r *mutationResolver) RemoveTagFromList(ctx context.Context, tagIds []strin } } else { log.Warnf("Not authorized to remove tag: %d", tid) - return nil, fmt.Errorf("Not authorized to remove tag: %d", tid) + return nil, fmt.Errorf("not authorized to remove tag: %d", tid) } } return tags, nil @@ -299,6 +304,21 @@ func (r *mutationResolver) UpdateConfiguration(ctx context.Context, name string, return nil, nil } +// NodeState is the resolver for the nodeState field. +func (r *nodeResolver) NodeState(ctx context.Context, obj *schema.Node) (string, error) { + panic(fmt.Errorf("not implemented: NodeState - nodeState")) +} + +// HealthState is the resolver for the HealthState field. +func (r *nodeResolver) HealthState(ctx context.Context, obj *schema.Node) (schema.NodeState, error) { + panic(fmt.Errorf("not implemented: HealthState - HealthState")) +} + +// MetaData is the resolver for the metaData field. +func (r *nodeResolver) MetaData(ctx context.Context, obj *schema.Node) (any, error) { + panic(fmt.Errorf("not implemented: MetaData - metaData")) +} + // Clusters is the resolver for the clusters field. 
func (r *queryResolver) Clusters(ctx context.Context) ([]*schema.Cluster, error) { return archive.Clusters, nil @@ -338,6 +358,21 @@ func (r *queryResolver) AllocatedNodes(ctx context.Context, cluster string) ([]* return counts, nil } +// Node is the resolver for the node field. +func (r *queryResolver) Node(ctx context.Context, id string) (*schema.Node, error) { + panic(fmt.Errorf("not implemented: Node - node")) +} + +// Nodes is the resolver for the nodes field. +func (r *queryResolver) Nodes(ctx context.Context, filter []*model.NodeFilter, order *model.OrderByInput) (*model.NodeStateResultList, error) { + panic(fmt.Errorf("not implemented: Nodes - nodes")) +} + +// NodeStats is the resolver for the nodeStats field. +func (r *queryResolver) NodeStats(ctx context.Context, filter []*model.NodeFilter) ([]*model.NodeStats, error) { + panic(fmt.Errorf("not implemented: NodeStats - nodeStats")) +} + // Job is the resolver for the job field. func (r *queryResolver) Job(ctx context.Context, id string) (*schema.Job, error) { numericId, err := strconv.ParseInt(id, 10, 64) @@ -499,10 +534,7 @@ func (r *queryResolver) Jobs(ctx context.Context, filter []*model.JobFilter, pag return nil, err } - hasNextPage := false - if len(nextJobs) == 1 { - hasNextPage = true - } + hasNextPage := len(nextJobs) == 1 return &model.JobResultList{Items: jobs, Count: &count, HasNextPage: &hasNextPage}, nil } @@ -513,8 +545,8 @@ func (r *queryResolver) JobsStatistics(ctx context.Context, filter []*model.JobF var stats []*model.JobsStatistics // Top Level Defaults - var defaultDurationBins string = "1h" - var defaultMetricBins int = 10 + defaultDurationBins := "1h" + defaultMetricBins := 10 if requireField(ctx, "totalJobs") || requireField(ctx, "totalWalltime") || requireField(ctx, "totalNodes") || requireField(ctx, "totalCores") || requireField(ctx, "totalAccs") || requireField(ctx, "totalNodeHours") || requireField(ctx, "totalCoreHours") || requireField(ctx, "totalAccHours") { @@ -618,9 +650,9 @@ func (r *queryResolver) JobsMetricStats(ctx context.Context, filter []*model.Job numThreadsInt := int(job.NumHWThreads) numAccsInt := int(job.NumAcc) res = append(res, &model.JobStats{ - ID: int(job.ID), + ID: int(*job.ID), JobID: strconv.Itoa(int(job.JobID)), - StartTime: int(job.StartTime.Unix()), + StartTime: int(job.StartTime), Duration: int(job.Duration), Cluster: job.Cluster, SubCluster: job.SubCluster, @@ -773,6 +805,9 @@ func (r *Resolver) MetricValue() generated.MetricValueResolver { return &metricV // Mutation returns generated.MutationResolver implementation. func (r *Resolver) Mutation() generated.MutationResolver { return &mutationResolver{r} } +// Node returns generated.NodeResolver implementation. +func (r *Resolver) Node() generated.NodeResolver { return &nodeResolver{r} } + // Query returns generated.QueryResolver implementation. 
func (r *Resolver) Query() generated.QueryResolver { return &queryResolver{r} } @@ -783,5 +818,6 @@ type clusterResolver struct{ *Resolver } type jobResolver struct{ *Resolver } type metricValueResolver struct{ *Resolver } type mutationResolver struct{ *Resolver } +type nodeResolver struct{ *Resolver } type queryResolver struct{ *Resolver } type subClusterResolver struct{ *Resolver } diff --git a/internal/importer/handleImport.go b/internal/importer/handleImport.go index 623291c..83230f5 100644 --- a/internal/importer/handleImport.go +++ b/internal/importer/handleImport.go @@ -42,7 +42,10 @@ func HandleImportFlag(flag string) error { } dec := json.NewDecoder(bytes.NewReader(raw)) dec.DisallowUnknownFields() - job := schema.JobMeta{BaseJob: schema.JobDefaults} + job := schema.Job{ + Exclusive: 1, + MonitoringStatus: schema.MonitoringStatusRunningOrArchiving, + } if err = dec.Decode(&job); err != nil { log.Warn("Error while decoding raw json metadata for import") return err @@ -141,7 +144,7 @@ func HandleImportFlag(flag string) error { return err } - if err = SanityChecks(&job.BaseJob); err != nil { + if err = SanityChecks(&job); err != nil { log.Warn("BaseJob SanityChecks failed") return err } diff --git a/internal/importer/importer_test.go b/internal/importer/importer_test.go index 209b6be..d2bb0b4 100644 --- a/internal/importer/importer_test.go +++ b/internal/importer/importer_test.go @@ -166,7 +166,7 @@ func TestHandleImportFlag(t *testing.T) { } result := readResult(t, testname) - job, err := r.Find(&result.JobId, &result.Cluster, &result.StartTime) + job, err := r.FindCached(&result.JobId, &result.Cluster, &result.StartTime) if err != nil { t.Fatal(err) } diff --git a/internal/importer/initDB.go b/internal/importer/initDB.go index 9a2ccdf..1239951 100644 --- a/internal/importer/initDB.go +++ b/internal/importer/initDB.go @@ -60,11 +60,6 @@ func InitDB() error { } jobMeta.MonitoringStatus = schema.MonitoringStatusArchivingSuccessful - job := schema.Job{ - BaseJob: jobMeta.BaseJob, - StartTime: time.Unix(jobMeta.StartTime, 0), - StartTimeUnix: jobMeta.StartTime, - } sc, err := archive.GetSubCluster(jobMeta.Cluster, jobMeta.SubCluster) if err != nil { @@ -72,7 +67,7 @@ func InitDB() error { return err } - job.Footprint = make(map[string]float64) + jobMeta.Footprint = make(map[string]float64) for _, fp := range sc.Footprint { statType := "avg" @@ -83,16 +78,16 @@ func InitDB() error { name := fmt.Sprintf("%s_%s", fp, statType) - job.Footprint[name] = repository.LoadJobStat(jobMeta, fp, statType) + jobMeta.Footprint[name] = repository.LoadJobStat(jobMeta, fp, statType) } - job.RawFootprint, err = json.Marshal(job.Footprint) + jobMeta.RawFootprint, err = json.Marshal(jobMeta.Footprint) if err != nil { log.Warn("Error while marshaling job footprint") return err } - job.EnergyFootprint = make(map[string]float64) + jobMeta.EnergyFootprint = make(map[string]float64) // Total Job Energy Outside Loop totalEnergy := 0.0 @@ -117,45 +112,45 @@ func InitDB() error { log.Warnf("Error while collecting energy metric %s for job, DB ID '%v', return '0.0'", fp, jobMeta.ID) } - job.EnergyFootprint[fp] = metricEnergy + jobMeta.EnergyFootprint[fp] = metricEnergy totalEnergy += metricEnergy } - job.Energy = (math.Round(totalEnergy*100.0) / 100.0) - if job.RawEnergyFootprint, err = json.Marshal(job.EnergyFootprint); err != nil { + jobMeta.Energy = (math.Round(totalEnergy*100.0) / 100.0) + if jobMeta.RawEnergyFootprint, err = json.Marshal(jobMeta.EnergyFootprint); err != nil { log.Warnf("Error while marshaling 
energy footprint for job INTO BYTES, DB ID '%v'", jobMeta.ID) return err } - job.RawResources, err = json.Marshal(job.Resources) + jobMeta.RawResources, err = json.Marshal(jobMeta.Resources) if err != nil { log.Errorf("repository initDB(): %v", err) errorOccured++ continue } - job.RawMetaData, err = json.Marshal(job.MetaData) + jobMeta.RawMetaData, err = json.Marshal(jobMeta.MetaData) if err != nil { log.Errorf("repository initDB(): %v", err) errorOccured++ continue } - if err := SanityChecks(&job.BaseJob); err != nil { + if err := SanityChecks(jobMeta); err != nil { log.Errorf("repository initDB(): %v", err) errorOccured++ continue } id, err := r.TransactionAddNamed(t, - repository.NamedJobInsert, job) + repository.NamedJobInsert, jobMeta) if err != nil { log.Errorf("repository initDB(): %v", err) errorOccured++ continue } - for _, tag := range job.Tags { + for _, tag := range jobMeta.Tags { tagstr := tag.Name + ":" + tag.Type tagId, ok := tags[tagstr] if !ok { @@ -190,7 +185,7 @@ func InitDB() error { } // This function also sets the subcluster if necessary! -func SanityChecks(job *schema.BaseJob) error { +func SanityChecks(job *schema.Job) error { if c := archive.GetCluster(job.Cluster); c == nil { return fmt.Errorf("no such cluster: %v", job.Cluster) } diff --git a/internal/metricdata/cc-metric-store.go b/internal/metricdata/cc-metric-store.go index 7c84d93..557e1d2 100644 --- a/internal/metricdata/cc-metric-store.go +++ b/internal/metricdata/cc-metric-store.go @@ -183,8 +183,8 @@ func (ccms *CCMetricStore) LoadData( req := ApiQueryRequest{ Cluster: job.Cluster, - From: job.StartTime.Unix(), - To: job.StartTime.Add(time.Duration(job.Duration) * time.Second).Unix(), + From: job.StartTime, + To: job.StartTime + int64(job.Duration), Queries: queries, WithStats: true, WithData: true, @@ -570,7 +570,6 @@ func (ccms *CCMetricStore) LoadStats( metrics []string, ctx context.Context, ) (map[string]map[string]schema.MetricStatistics, error) { - queries, _, err := ccms.buildQueries(job, metrics, []schema.MetricScope{schema.MetricScopeNode}, 0) // #166 Add scope shere for analysis view accelerator normalization? 
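// Aside (editorial, not part of the patch): since job.StartTime is now a plain
// unix timestamp in seconds (int64), the request windows in this file are
// computed with integer arithmetic, e.g.
//   From: job.StartTime
//   To:   job.StartTime + int64(job.Duration)
// replacing the former job.StartTime.Add(time.Duration(job.Duration) * time.Second).Unix() round trip.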
if err != nil { log.Errorf("Error while building queries for jobId %d, Metrics %v: %s", job.JobID, metrics, err.Error()) @@ -579,8 +578,8 @@ func (ccms *CCMetricStore) LoadStats( req := ApiQueryRequest{ Cluster: job.Cluster, - From: job.StartTime.Unix(), - To: job.StartTime.Add(time.Duration(job.Duration) * time.Second).Unix(), + From: job.StartTime, + To: job.StartTime + int64(job.Duration), Queries: queries, WithStats: true, WithData: false, @@ -638,8 +637,8 @@ func (ccms *CCMetricStore) LoadScopedStats( req := ApiQueryRequest{ Cluster: job.Cluster, - From: job.StartTime.Unix(), - To: job.StartTime.Add(time.Duration(job.Duration) * time.Second).Unix(), + From: job.StartTime, + To: job.StartTime + int64(job.Duration), Queries: queries, WithStats: true, WithData: false, @@ -816,7 +815,6 @@ func (ccms *CCMetricStore) LoadNodeListData( page *model.PageRequest, ctx context.Context, ) (map[string]schema.JobData, int, bool, error) { - // 0) Init additional vars var totalNodes int = 0 var hasNextPage bool = false @@ -975,7 +973,6 @@ func (ccms *CCMetricStore) buildNodeQueries( scopes []schema.MetricScope, resolution int, ) ([]ApiQuery, []schema.MetricScope, error) { - queries := make([]ApiQuery, 0, len(metrics)*len(scopes)*len(nodes)) assignedScope := []schema.MetricScope{} diff --git a/internal/metricdata/influxdb-v2.go b/internal/metricdata/influxdb-v2.go deleted file mode 100644 index c53dad3..0000000 --- a/internal/metricdata/influxdb-v2.go +++ /dev/null @@ -1,575 +0,0 @@ -// Copyright (C) NHR@FAU, University Erlangen-Nuremberg. -// All rights reserved. -// Use of this source code is governed by a MIT-style -// license that can be found in the LICENSE file. -package metricdata - -import ( - "context" - "crypto/tls" - "encoding/json" - "errors" - "fmt" - "math" - "sort" - "strings" - "time" - - "github.com/ClusterCockpit/cc-backend/internal/graph/model" - "github.com/ClusterCockpit/cc-backend/pkg/archive" - "github.com/ClusterCockpit/cc-backend/pkg/log" - "github.com/ClusterCockpit/cc-backend/pkg/schema" - influxdb2 "github.com/influxdata/influxdb-client-go/v2" - influxdb2Api "github.com/influxdata/influxdb-client-go/v2/api" -) - -type InfluxDBv2DataRepositoryConfig struct { - Url string `json:"url"` - Token string `json:"token"` - Bucket string `json:"bucket"` - Org string `json:"org"` - SkipTls bool `json:"skiptls"` -} - -type InfluxDBv2DataRepository struct { - client influxdb2.Client - queryClient influxdb2Api.QueryAPI - bucket, measurement string -} - -func (idb *InfluxDBv2DataRepository) Init(rawConfig json.RawMessage) error { - var config InfluxDBv2DataRepositoryConfig - if err := json.Unmarshal(rawConfig, &config); err != nil { - log.Warn("Error while unmarshaling raw json config") - return err - } - - idb.client = influxdb2.NewClientWithOptions(config.Url, config.Token, influxdb2.DefaultOptions().SetTLSConfig(&tls.Config{InsecureSkipVerify: config.SkipTls})) - idb.queryClient = idb.client.QueryAPI(config.Org) - idb.bucket = config.Bucket - - return nil -} - -func (idb *InfluxDBv2DataRepository) formatTime(t time.Time) string { - return t.Format(time.RFC3339) // Like “2006-01-02T15:04:05Z07:00” -} - -func (idb *InfluxDBv2DataRepository) epochToTime(epoch int64) time.Time { - return time.Unix(epoch, 0) -} - -func (idb *InfluxDBv2DataRepository) LoadData( - job *schema.Job, - metrics []string, - scopes []schema.MetricScope, - ctx context.Context, - resolution int) (schema.JobData, error) { - - log.Infof("InfluxDB 2 Backend: Resolution Scaling not Implemented, will return default 
timestep. Requested Resolution %d", resolution) - - measurementsConds := make([]string, 0, len(metrics)) - for _, m := range metrics { - measurementsConds = append(measurementsConds, fmt.Sprintf(`r["_measurement"] == "%s"`, m)) - } - measurementsCond := strings.Join(measurementsConds, " or ") - - hostsConds := make([]string, 0, len(job.Resources)) - for _, h := range job.Resources { - if h.HWThreads != nil || h.Accelerators != nil { - // TODO - return nil, errors.New("METRICDATA/INFLUXV2 > the InfluxDB metric data repository does not yet support HWThreads or Accelerators") - } - hostsConds = append(hostsConds, fmt.Sprintf(`r["hostname"] == "%s"`, h.Hostname)) - } - hostsCond := strings.Join(hostsConds, " or ") - - jobData := make(schema.JobData) // Empty Schema: map[FIELD]map[SCOPE]<*JobMetric>METRIC - // Requested Scopes - for _, scope := range scopes { - query := "" - switch scope { - case "node": - // Get Finest Granularity, Groupy By Measurement and Hostname (== Metric / Node), Calculate Mean for 60s windows <-- Resolution could be added here? - // log.Info("Scope 'node' requested. ") - query = fmt.Sprintf(` - from(bucket: "%s") - |> range(start: %s, stop: %s) - |> filter(fn: (r) => (%s) and (%s) ) - |> drop(columns: ["_start", "_stop"]) - |> group(columns: ["hostname", "_measurement"]) - |> aggregateWindow(every: 60s, fn: mean) - |> drop(columns: ["_time"])`, - idb.bucket, - idb.formatTime(job.StartTime), idb.formatTime(idb.epochToTime(job.StartTimeUnix+int64(job.Duration)+int64(1))), - measurementsCond, hostsCond) - case "socket": - log.Info("Scope 'socket' requested, but not yet supported: Will return 'node' scope only. ") - continue - case "core": - log.Info(" Scope 'core' requested, but not yet supported: Will return 'node' scope only. ") - continue - // Get Finest Granularity only, Set NULL to 0.0 - // query = fmt.Sprintf(` - // from(bucket: "%s") - // |> range(start: %s, stop: %s) - // |> filter(fn: (r) => %s ) - // |> filter(fn: (r) => %s ) - // |> drop(columns: ["_start", "_stop", "cluster"]) - // |> map(fn: (r) => (if exists r._value then {r with _value: r._value} else {r with _value: 0.0}))`, - // idb.bucket, - // idb.formatTime(job.StartTime), idb.formatTime(idb.epochToTime(job.StartTimeUnix + int64(job.Duration) + int64(1) )), - // measurementsCond, hostsCond) - case "hwthread": - log.Info(" Scope 'hwthread' requested, but not yet supported: Will return 'node' scope only. ") - continue - case "accelerator": - log.Info(" Scope 'accelerator' requested, but not yet supported: Will return 'node' scope only. ") - continue - default: - log.Infof("Unknown scope '%s' requested: Will return 'node' scope.", scope) - continue - // return nil, errors.New("METRICDATA/INFLUXV2 > the InfluxDB metric data repository does not yet support other scopes than 'node'") - } - - rows, err := idb.queryClient.Query(ctx, query) - if err != nil { - log.Error("Error while performing query") - return nil, err - } - - // Init Metrics: Only Node level now -> TODO: Matching /check on scope level ... - for _, metric := range metrics { - jobMetric, ok := jobData[metric] - if !ok { - mc := archive.GetMetricConfig(job.Cluster, metric) - jobMetric = map[schema.MetricScope]*schema.JobMetric{ - scope: { // uses scope var from above! 
- Unit: mc.Unit, - Timestep: mc.Timestep, - Series: make([]schema.Series, 0, len(job.Resources)), - StatisticsSeries: nil, // Should be: &schema.StatsSeries{}, - }, - } - } - jobData[metric] = jobMetric - } - - // Process Result: Time-Data - field, host, hostSeries := "", "", schema.Series{} - // typeId := 0 - switch scope { - case "node": - for rows.Next() { - row := rows.Record() - if host == "" || host != row.ValueByKey("hostname").(string) || rows.TableChanged() { - if host != "" { - // Append Series before reset - jobData[field][scope].Series = append(jobData[field][scope].Series, hostSeries) - } - field, host = row.Measurement(), row.ValueByKey("hostname").(string) - hostSeries = schema.Series{ - Hostname: host, - Statistics: schema.MetricStatistics{}, //TODO Add Statistics - Data: make([]schema.Float, 0), - } - } - val, ok := row.Value().(float64) - if ok { - hostSeries.Data = append(hostSeries.Data, schema.Float(val)) - } else { - hostSeries.Data = append(hostSeries.Data, schema.Float(0)) - } - } - case "socket": - continue - case "accelerator": - continue - case "hwthread": - // See below @ core - continue - case "core": - continue - // Include Series.Id in hostSeries - // for rows.Next() { - // row := rows.Record() - // if ( host == "" || host != row.ValueByKey("hostname").(string) || typeId != row.ValueByKey("type-id").(int) || rows.TableChanged() ) { - // if ( host != "" ) { - // // Append Series before reset - // jobData[field][scope].Series = append(jobData[field][scope].Series, hostSeries) - // } - // field, host, typeId = row.Measurement(), row.ValueByKey("hostname").(string), row.ValueByKey("type-id").(int) - // hostSeries = schema.Series{ - // Hostname: host, - // Id: &typeId, - // Statistics: nil, - // Data: make([]schema.Float, 0), - // } - // } - // val := row.Value().(float64) - // hostSeries.Data = append(hostSeries.Data, schema.Float(val)) - // } - default: - log.Infof("Unknown scope '%s' requested: Will return 'node' scope.", scope) - continue - // return nil, errors.New("the InfluxDB metric data repository does not yet support other scopes than 'node, core'") - } - // Append last Series - jobData[field][scope].Series = append(jobData[field][scope].Series, hostSeries) - } - - // Get Stats - stats, err := idb.LoadStats(job, metrics, ctx) - if err != nil { - log.Warn("Error while loading statistics") - return nil, err - } - - for _, scope := range scopes { - if scope == "node" { // No 'socket/core' support yet - for metric, nodes := range stats { - for node, stats := range nodes { - for index, _ := range jobData[metric][scope].Series { - if jobData[metric][scope].Series[index].Hostname == node { - jobData[metric][scope].Series[index].Statistics = schema.MetricStatistics{Avg: stats.Avg, Min: stats.Min, Max: stats.Max} - } - } - } - } - } - } - - return jobData, nil -} - -func (idb *InfluxDBv2DataRepository) LoadStats( - job *schema.Job, - metrics []string, - ctx context.Context) (map[string]map[string]schema.MetricStatistics, error) { - - stats := map[string]map[string]schema.MetricStatistics{} - - hostsConds := make([]string, 0, len(job.Resources)) - for _, h := range job.Resources { - if h.HWThreads != nil || h.Accelerators != nil { - // TODO - return nil, errors.New("METRICDATA/INFLUXV2 > the InfluxDB metric data repository does not yet support HWThreads or Accelerators") - } - hostsConds = append(hostsConds, fmt.Sprintf(`r["hostname"] == "%s"`, h.Hostname)) - } - hostsCond := strings.Join(hostsConds, " or ") - - // lenMet := len(metrics) - - for _, metric := range 
metrics { - // log.Debugf("<< You are here: %s (Index %d of %d metrics)", metric, index, lenMet) - - query := fmt.Sprintf(` - data = from(bucket: "%s") - |> range(start: %s, stop: %s) - |> filter(fn: (r) => r._measurement == "%s" and r._field == "value" and (%s)) - union(tables: [data |> mean(column: "_value") |> set(key: "_field", value: "avg"), - data |> min(column: "_value") |> set(key: "_field", value: "min"), - data |> max(column: "_value") |> set(key: "_field", value: "max")]) - |> pivot(rowKey: ["hostname"], columnKey: ["_field"], valueColumn: "_value") - |> group()`, - idb.bucket, - idb.formatTime(job.StartTime), idb.formatTime(idb.epochToTime(job.StartTimeUnix+int64(job.Duration)+int64(1))), - metric, hostsCond) - - rows, err := idb.queryClient.Query(ctx, query) - if err != nil { - log.Error("Error while performing query") - return nil, err - } - - nodes := map[string]schema.MetricStatistics{} - for rows.Next() { - row := rows.Record() - host := row.ValueByKey("hostname").(string) - - avg, avgok := row.ValueByKey("avg").(float64) - if !avgok { - // log.Debugf(">> Assertion error for metric %s, statistic AVG. Expected 'float64', got %v", metric, avg) - avg = 0.0 - } - min, minok := row.ValueByKey("min").(float64) - if !minok { - // log.Debugf(">> Assertion error for metric %s, statistic MIN. Expected 'float64', got %v", metric, min) - min = 0.0 - } - max, maxok := row.ValueByKey("max").(float64) - if !maxok { - // log.Debugf(">> Assertion error for metric %s, statistic MAX. Expected 'float64', got %v", metric, max) - max = 0.0 - } - - nodes[host] = schema.MetricStatistics{ - Avg: avg, - Min: min, - Max: max, - } - } - stats[metric] = nodes - } - - return stats, nil -} - -// Used in Job-View StatsTable -// UNTESTED -func (idb *InfluxDBv2DataRepository) LoadScopedStats( - job *schema.Job, - metrics []string, - scopes []schema.MetricScope, - ctx context.Context) (schema.ScopedJobStats, error) { - - // Assumption: idb.loadData() only returns series node-scope - use node scope for statsTable - scopedJobStats := make(schema.ScopedJobStats) - data, err := idb.LoadData(job, metrics, []schema.MetricScope{schema.MetricScopeNode}, ctx, 0 /*resolution here*/) - if err != nil { - log.Warn("Error while loading job for scopedJobStats") - return nil, err - } - - for metric, metricData := range data { - for _, scope := range scopes { - if scope != schema.MetricScopeNode { - logOnce.Do(func() { - log.Infof("Note: Scope '%s' requested, but not yet supported: Will return 'node' scope only.", scope) - }) - continue - } - - if _, ok := scopedJobStats[metric]; !ok { - scopedJobStats[metric] = make(map[schema.MetricScope][]*schema.ScopedStats) - } - - if _, ok := scopedJobStats[metric][scope]; !ok { - scopedJobStats[metric][scope] = make([]*schema.ScopedStats, 0) - } - - for _, series := range metricData[scope].Series { - scopedJobStats[metric][scope] = append(scopedJobStats[metric][scope], &schema.ScopedStats{ - Hostname: series.Hostname, - Data: &series.Statistics, - }) - } - } - } - - return scopedJobStats, nil -} - -// Used in Systems-View @ Node-Overview -// UNTESTED -func (idb *InfluxDBv2DataRepository) LoadNodeData( - cluster string, - metrics, nodes []string, - scopes []schema.MetricScope, - from, to time.Time, - ctx context.Context) (map[string]map[string][]*schema.JobMetric, error) { - - // Note: scopes[] Array will be ignored, only return node scope - - // CONVERT ARGS TO INFLUX - measurementsConds := make([]string, 0) - for _, m := range metrics { - measurementsConds = 
append(measurementsConds, fmt.Sprintf(`r["_measurement"] == "%s"`, m)) - } - measurementsCond := strings.Join(measurementsConds, " or ") - - hostsConds := make([]string, 0) - if nodes == nil { - var allNodes []string - subClusterNodeLists := archive.NodeLists[cluster] - for _, nodeList := range subClusterNodeLists { - allNodes = append(nodes, nodeList.PrintList()...) - } - for _, node := range allNodes { - nodes = append(nodes, node) - hostsConds = append(hostsConds, fmt.Sprintf(`r["hostname"] == "%s"`, node)) - } - } else { - for _, node := range nodes { - hostsConds = append(hostsConds, fmt.Sprintf(`r["hostname"] == "%s"`, node)) - } - } - hostsCond := strings.Join(hostsConds, " or ") - - // BUILD AND PERFORM QUERY - query := fmt.Sprintf(` - from(bucket: "%s") - |> range(start: %s, stop: %s) - |> filter(fn: (r) => (%s) and (%s) ) - |> drop(columns: ["_start", "_stop"]) - |> group(columns: ["hostname", "_measurement"]) - |> aggregateWindow(every: 60s, fn: mean) - |> drop(columns: ["_time"])`, - idb.bucket, - idb.formatTime(from), idb.formatTime(to), - measurementsCond, hostsCond) - - rows, err := idb.queryClient.Query(ctx, query) - if err != nil { - log.Error("Error while performing query") - return nil, err - } - - // HANDLE QUERY RETURN - // Collect Float Arrays for Node@Metric -> No Scope Handling! - influxData := make(map[string]map[string][]schema.Float) - for rows.Next() { - row := rows.Record() - host, field := row.ValueByKey("hostname").(string), row.Measurement() - - influxHostData, ok := influxData[host] - if !ok { - influxHostData = make(map[string][]schema.Float) - influxData[host] = influxHostData - } - - influxFieldData, ok := influxData[host][field] - if !ok { - influxFieldData = make([]schema.Float, 0) - influxData[host][field] = influxFieldData - } - - val, ok := row.Value().(float64) - if ok { - influxData[host][field] = append(influxData[host][field], schema.Float(val)) - } else { - influxData[host][field] = append(influxData[host][field], schema.Float(0)) - } - } - - // BUILD FUNCTION RETURN - data := make(map[string]map[string][]*schema.JobMetric) - for node, metricData := range influxData { - - nodeData, ok := data[node] - if !ok { - nodeData = make(map[string][]*schema.JobMetric) - data[node] = nodeData - } - - for metric, floatArray := range metricData { - avg, min, max := 0.0, 0.0, 0.0 - for _, val := range floatArray { - avg += float64(val) - min = math.Min(min, float64(val)) - max = math.Max(max, float64(val)) - } - - stats := schema.MetricStatistics{ - Avg: (math.Round((avg/float64(len(floatArray)))*100) / 100), - Min: (math.Round(min*100) / 100), - Max: (math.Round(max*100) / 100), - } - - mc := archive.GetMetricConfig(cluster, metric) - nodeData[metric] = append(nodeData[metric], &schema.JobMetric{ - Unit: mc.Unit, - Timestep: mc.Timestep, - Series: []schema.Series{ - { - Hostname: node, - Statistics: stats, - Data: floatArray, - }, - }, - }) - } - } - - return data, nil -} - -// Used in Systems-View @ Node-List -// UNTESTED -func (idb *InfluxDBv2DataRepository) LoadNodeListData( - cluster, subCluster, nodeFilter string, - metrics []string, - scopes []schema.MetricScope, - resolution int, - from, to time.Time, - page *model.PageRequest, - ctx context.Context, -) (map[string]schema.JobData, int, bool, error) { - - // Assumption: idb.loadData() only returns series node-scope - use node scope for NodeList - - // 0) Init additional vars - var totalNodes int = 0 - var hasNextPage bool = false - - // 1) Get list of all nodes - var nodes []string - if subCluster != 
"" { - scNodes := archive.NodeLists[cluster][subCluster] - nodes = scNodes.PrintList() - } else { - subClusterNodeLists := archive.NodeLists[cluster] - for _, nodeList := range subClusterNodeLists { - nodes = append(nodes, nodeList.PrintList()...) - } - } - - // 2) Filter nodes - if nodeFilter != "" { - filteredNodes := []string{} - for _, node := range nodes { - if strings.Contains(node, nodeFilter) { - filteredNodes = append(filteredNodes, node) - } - } - nodes = filteredNodes - } - - // 2.1) Count total nodes && Sort nodes -> Sorting invalidated after return ... - totalNodes = len(nodes) - sort.Strings(nodes) - - // 3) Apply paging - if len(nodes) > page.ItemsPerPage { - start := (page.Page - 1) * page.ItemsPerPage - end := start + page.ItemsPerPage - if end > len(nodes) { - end = len(nodes) - hasNextPage = false - } else { - hasNextPage = true - } - nodes = nodes[start:end] - } - - // 4) Fetch And Convert Data, use idb.LoadNodeData() for query - - rawNodeData, err := idb.LoadNodeData(cluster, metrics, nodes, scopes, from, to, ctx) - if err != nil { - log.Error(fmt.Sprintf("Error while loading influx nodeData for nodeListData %#v\n", err)) - return nil, totalNodes, hasNextPage, err - } - - data := make(map[string]schema.JobData) - for node, nodeData := range rawNodeData { - // Init Nested Map Data Structures If Not Found - hostData, ok := data[node] - if !ok { - hostData = make(schema.JobData) - data[node] = hostData - } - - for metric, nodeMetricData := range nodeData { - metricData, ok := hostData[metric] - if !ok { - metricData = make(map[schema.MetricScope]*schema.JobMetric) - data[node][metric] = metricData - } - - data[node][metric][schema.MetricScopeNode] = nodeMetricData[0] // Only Node Scope Returned from loadNodeData - } - } - - return data, totalNodes, hasNextPage, nil -} diff --git a/internal/metricdata/metricdata.go b/internal/metricdata/metricdata.go index f30d837..e6b739a 100644 --- a/internal/metricdata/metricdata.go +++ b/internal/metricdata/metricdata.go @@ -54,8 +54,6 @@ func Init() error { switch kind.Kind { case "cc-metric-store": mdr = &CCMetricStore{} - case "influxdb": - mdr = &InfluxDBv2DataRepository{} case "prometheus": mdr = &PrometheusDataRepository{} case "test": diff --git a/internal/metricdata/prometheus.go b/internal/metricdata/prometheus.go index d16501e..fa49764 100644 --- a/internal/metricdata/prometheus.go +++ b/internal/metricdata/prometheus.go @@ -279,8 +279,8 @@ func (pdb *PrometheusDataRepository) LoadData( for i, resource := range job.Resources { nodes[i] = resource.Hostname } - from := job.StartTime - to := job.StartTime.Add(time.Duration(job.Duration) * time.Second) + from := time.Unix(job.StartTime, 0) + to := time.Unix(job.StartTime+int64(job.Duration), 0) for _, scope := range scopes { if scope != schema.MetricScopeNode { @@ -453,8 +453,8 @@ func (pdb *PrometheusDataRepository) LoadScopedStats( job *schema.Job, metrics []string, scopes []schema.MetricScope, - ctx context.Context) (schema.ScopedJobStats, error) { - + ctx context.Context, +) (schema.ScopedJobStats, error) { // Assumption: pdb.loadData() only returns series node-scope - use node scope for statsTable scopedJobStats := make(schema.ScopedJobStats) data, err := pdb.LoadData(job, metrics, []schema.MetricScope{schema.MetricScopeNode}, ctx, 0 /*resolution here*/) @@ -502,7 +502,6 @@ func (pdb *PrometheusDataRepository) LoadNodeListData( page *model.PageRequest, ctx context.Context, ) (map[string]schema.JobData, int, bool, error) { - // Assumption: pdb.loadData() only returns series 
node-scope - use node scope for NodeList // 0) Init additional vars diff --git a/internal/repository/job.go b/internal/repository/job.go index 84de6f7..c800141 100644 --- a/internal/repository/job.go +++ b/internal/repository/job.go @@ -9,12 +9,12 @@ import ( "encoding/json" "errors" "fmt" + "maps" "math" "strconv" "sync" "time" - "github.com/ClusterCockpit/cc-backend/internal/graph/model" "github.com/ClusterCockpit/cc-backend/pkg/archive" "github.com/ClusterCockpit/cc-backend/pkg/log" "github.com/ClusterCockpit/cc-backend/pkg/lrucache" @@ -33,6 +33,7 @@ type JobRepository struct { stmtCache *sq.StmtCache cache *lrucache.Cache driver string + Mutex sync.Mutex } func GetJobRepository() *JobRepository { @@ -51,17 +52,29 @@ func GetJobRepository() *JobRepository { } var jobColumns []string = []string{ - "job.id", "job.job_id", "job.hpc_user", "job.project", "job.cluster", "job.subcluster", "job.start_time", "job.cluster_partition", "job.array_job_id", - "job.num_nodes", "job.num_hwthreads", "job.num_acc", "job.exclusive", "job.monitoring_status", "job.smt", "job.job_state", - "job.duration", "job.walltime", "job.resources", "job.footprint", "job.energy", + "job.id", "job.job_id", "job.hpc_user", "job.project", "job.cluster", "job.subcluster", + "job.start_time", "job.cluster_partition", "job.array_job_id", "job.num_nodes", + "job.num_hwthreads", "job.num_acc", "job.exclusive", "job.monitoring_status", + "job.smt", "job.job_state", "job.duration", "job.walltime", "job.resources", + "job.footprint", "job.energy", } -func scanJob(row interface{ Scan(...interface{}) error }) (*schema.Job, error) { +var jobCacheColumns []string = []string{ + "job_cache.id", "job_cache.job_id", "job_cache.hpc_user", "job_cache.project", "job_cache.cluster", + "job_cache.subcluster", "job_cache.start_time", "job_cache.cluster_partition", + "job_cache.array_job_id", "job_cache.num_nodes", "job_cache.num_hwthreads", + "job_cache.num_acc", "job_cache.exclusive", "job_cache.monitoring_status", "job_cache.smt", + "job_cache.job_state", "job_cache.duration", "job_cache.walltime", "job_cache.resources", + "job_cache.footprint", "job_cache.energy", +} + +func scanJob(row interface{ Scan(...any) error }) (*schema.Job, error) { job := &schema.Job{} if err := row.Scan( - &job.ID, &job.JobID, &job.User, &job.Project, &job.Cluster, &job.SubCluster, &job.StartTimeUnix, &job.Partition, &job.ArrayJobId, - &job.NumNodes, &job.NumHWThreads, &job.NumAcc, &job.Exclusive, &job.MonitoringStatus, &job.SMT, &job.State, + &job.ID, &job.JobID, &job.User, &job.Project, &job.Cluster, &job.SubCluster, + &job.StartTime, &job.Partition, &job.ArrayJobId, &job.NumNodes, &job.NumHWThreads, + &job.NumAcc, &job.Exclusive, &job.MonitoringStatus, &job.SMT, &job.State, &job.Duration, &job.Walltime, &job.RawResources, &job.RawFootprint, &job.Energy); err != nil { log.Warnf("Error while scanning rows (Job): %v", err) return nil, err @@ -79,10 +92,9 @@ func scanJob(row interface{ Scan(...interface{}) error }) (*schema.Job, error) { } job.RawFootprint = nil - job.StartTime = time.Unix(job.StartTimeUnix, 0) // Always ensure accurate duration for running jobs if job.State == schema.JobStateRunning { - job.Duration = int32(time.Since(job.StartTime).Seconds()) + job.Duration = int32(time.Now().Unix() - job.StartTime) } return job, nil @@ -138,17 +150,6 @@ func (r *JobRepository) Flush() error { return nil } -func scanJobLink(row interface{ Scan(...interface{}) error }) (*model.JobLink, error) { - jobLink := &model.JobLink{} - if err := row.Scan( - &jobLink.ID, 
&jobLink.JobID); err != nil { - log.Warn("Error while scanning rows (jobLink)") - return nil, err - } - - return jobLink, nil -} - func (r *JobRepository) FetchMetadata(job *schema.Job) (map[string]string, error) { start := time.Now() cachekey := fmt.Sprintf("metadata:%d", job.ID) @@ -189,9 +190,7 @@ func (r *JobRepository) UpdateMetadata(job *schema.Job, key, val string) (err er if job.MetaData != nil { cpy := make(map[string]string, len(job.MetaData)+1) - for k, v := range job.MetaData { - cpy[k] = v - } + maps.Copy(cpy, job.MetaData) cpy[key] = val job.MetaData = cpy } else { @@ -389,7 +388,7 @@ func (r *JobRepository) FindColumnValues(user *schema.User, query string, table func (r *JobRepository) Partitions(cluster string) ([]string, error) { var err error start := time.Now() - partitions := r.cache.Get("partitions:"+cluster, func() (interface{}, time.Duration, int) { + partitions := r.cache.Get("partitions:"+cluster, func() (any, time.Duration, int) { parts := []string{} if err = r.DB.Select(&parts, `SELECT DISTINCT job.cluster_partition FROM job WHERE job.cluster = ?;`, cluster); err != nil { return nil, 0, 1000 @@ -477,6 +476,7 @@ func (r *JobRepository) StopJobsExceedingWalltimeBy(seconds int) error { return nil } +// FIXME: Reconsider filtering short jobs with hardcoded threshold func (r *JobRepository) FindRunningJobs(cluster string) ([]*schema.Job, error) { query := sq.Select(jobColumns...).From("job"). Where(fmt.Sprintf("job.cluster = '%s'", cluster)). @@ -581,7 +581,7 @@ func (r *JobRepository) MarkArchived( func (r *JobRepository) UpdateEnergy( stmt sq.UpdateBuilder, - jobMeta *schema.JobMeta, + jobMeta *schema.Job, ) (sq.UpdateBuilder, error) { /* Note: Only Called for Running Jobs during Intermediate Update or on Archiving */ sc, err := archive.GetSubCluster(jobMeta.Cluster, jobMeta.SubCluster) @@ -631,7 +631,7 @@ func (r *JobRepository) UpdateEnergy( func (r *JobRepository) UpdateFootprint( stmt sq.UpdateBuilder, - jobMeta *schema.JobMeta, + jobMeta *schema.Job, ) (sq.UpdateBuilder, error) { /* Note: Only Called for Running Jobs during Intermediate Update or on Archiving */ sc, err := archive.GetSubCluster(jobMeta.Cluster, jobMeta.SubCluster) diff --git a/internal/repository/jobCreate.go b/internal/repository/jobCreate.go index 9e47974..1508c8d 100644 --- a/internal/repository/jobCreate.go +++ b/internal/repository/jobCreate.go @@ -13,6 +13,14 @@ import ( sq "github.com/Masterminds/squirrel" ) +const NamedJobCacheInsert string = `INSERT INTO job_cache ( + job_id, hpc_user, project, cluster, subcluster, cluster_partition, array_job_id, num_nodes, num_hwthreads, num_acc, + exclusive, monitoring_status, smt, job_state, start_time, duration, walltime, footprint, energy, energy_footprint, resources, meta_data +) VALUES ( + :job_id, :hpc_user, :project, :cluster, :subcluster, :cluster_partition, :array_job_id, :num_nodes, :num_hwthreads, :num_acc, + :exclusive, :monitoring_status, :smt, :job_state, :start_time, :duration, :walltime, :footprint, :energy, :energy_footprint, :resources, :meta_data +);` + const NamedJobInsert string = `INSERT INTO job ( job_id, hpc_user, project, cluster, subcluster, cluster_partition, array_job_id, num_nodes, num_hwthreads, num_acc, exclusive, monitoring_status, smt, job_state, start_time, duration, walltime, footprint, energy,
:energy_footprint, :resources, :meta_data );` -func (r *JobRepository) InsertJob(job *schema.JobMeta) (int64, error) { - res, err := r.DB.NamedExec(NamedJobInsert, job) +func (r *JobRepository) InsertJob(job *schema.Job) (int64, error) { + r.Mutex.Lock() + res, err := r.DB.NamedExec(NamedJobCacheInsert, job) + r.Mutex.Unlock() if err != nil { log.Warn("Error while NamedJobInsert") return 0, err } @@ -36,9 +46,48 @@ func (r *JobRepository) InsertJob(job *schema.JobMeta) (int64, error) { return id, nil } +func (r *JobRepository) SyncJobs() ([]*schema.Job, error) { + r.Mutex.Lock() + defer r.Mutex.Unlock() + + query := sq.Select(jobCacheColumns...).From("job_cache") + + rows, err := query.RunWith(r.stmtCache).Query() + if err != nil { + log.Errorf("Error while running query %v", err) + return nil, err + } + + jobs := make([]*schema.Job, 0, 50) + for rows.Next() { + job, err := scanJob(rows) + if err != nil { + rows.Close() + log.Warn("Error while scanning rows") + return nil, err + } + jobs = append(jobs, job) + } + + _, err = r.DB.Exec( + "INSERT INTO job (job_id, cluster, subcluster, start_time, hpc_user, project, cluster_partition, array_job_id, num_nodes, num_hwthreads, num_acc, exclusive, monitoring_status, smt, job_state, duration, walltime, footprint, energy, energy_footprint, resources, meta_data) SELECT job_id, cluster, subcluster, start_time, hpc_user, project, cluster_partition, array_job_id, num_nodes, num_hwthreads, num_acc, exclusive, monitoring_status, smt, job_state, duration, walltime, footprint, energy, energy_footprint, resources, meta_data FROM job_cache") + if err != nil { + log.Warnf("Error while Job sync: %v", err) + return nil, err + } + + _, err = r.DB.Exec("DELETE FROM job_cache") + if err != nil { + log.Warnf("Error while Job cache clean: %v", err) + return nil, err + } + + return jobs, nil +} + // Start inserts a new job in the table, returning the unique job ID. // Statistics are not transferred! -func (r *JobRepository) Start(job *schema.JobMeta) (id int64, err error) { +func (r *JobRepository) Start(job *schema.Job) (id int64, err error) { job.RawFootprint, err = json.Marshal(job.Footprint) if err != nil { return -1, fmt.Errorf("REPOSITORY/JOB > encoding footprint field failed: %w", err) @@ -73,3 +122,19 @@ func (r *JobRepository) Stop( _, err = stmt.RunWith(r.stmtCache).Exec() return } + +func (r *JobRepository) StopCached( + jobId int64, + duration int32, + state schema.JobState, + monitoringStatus int32, +) (err error) { + stmt := sq.Update("job_cache"). + Set("job_state", state). + Set("duration", duration). + Set("monitoring_status", monitoringStatus). + Where("job_cache.id = ?", jobId) + + _, err = stmt.RunWith(r.stmtCache).Exec() + return +} diff --git a/internal/repository/jobFind.go b/internal/repository/jobFind.go index 1e2ccb8..2acdb87 100644 --- a/internal/repository/jobFind.go +++ b/internal/repository/jobFind.go @@ -43,6 +43,26 @@ func (r *JobRepository) Find( return scanJob(q.RunWith(r.stmtCache).QueryRow()) } +func (r *JobRepository) FindCached( + jobId *int64, + cluster *string, + startTime *int64, +) (*schema.Job, error) { + q := sq.Select(jobCacheColumns...).From("job_cache").
+ Where("job_cache.job_id = ?", *jobId) + + if cluster != nil { + q = q.Where("job_cache.cluster = ?", *cluster) + } + if startTime != nil { + q = q.Where("job_cache.start_time = ?", *startTime) + } + + q = q.OrderBy("job_cache.id DESC") // always use newest matching job by db id if more than one match + + return scanJob(q.RunWith(r.stmtCache).QueryRow()) +} + // Find executes a SQL query to find a specific batch job. // The job is queried using the batch job id, the cluster name, // and the start time of the job in UNIX epoch time seconds. @@ -83,6 +103,35 @@ func (r *JobRepository) FindAll( return jobs, nil } +// Get complete joblist only consisting of db ids. +// This is useful to process large job counts and intended to be used +// together with FindById to process jobs one by one +func (r *JobRepository) GetJobList() ([]int64, error) { + query := sq.Select("id").From("job"). + Where("job.job_state != 'running'") + + rows, err := query.RunWith(r.stmtCache).Query() + if err != nil { + log.Error("Error while running query") + return nil, err + } + + jl := make([]int64, 0, 1000) + for rows.Next() { + var id int64 + err := rows.Scan(&id) + if err != nil { + rows.Close() + log.Warn("Error while scanning rows") + return nil, err + } + jl = append(jl, id) + } + + log.Infof("Return job count %d", len(jl)) + return jl, nil +} + // FindById executes a SQL query to find a specific batch job. // The job is queried using the database id. // It returns a pointer to a schema.Job data structure and an error variable. @@ -178,7 +227,7 @@ func (r *JobRepository) FindConcurrentJobs( var startTime int64 var stopTime int64 - startTime = job.StartTimeUnix + startTime = job.StartTime hostname := job.Resources[0].Hostname if job.State == schema.JobStateRunning { diff --git a/internal/repository/jobHooks.go b/internal/repository/jobHooks.go new file mode 100644 index 0000000..1016335 --- /dev/null +++ b/internal/repository/jobHooks.go @@ -0,0 +1,57 @@ +// Copyright (C) NHR@FAU, University Erlangen-Nuremberg. +// All rights reserved. +// Use of this source code is governed by a MIT-style +// license that can be found in the LICENSE file. 
+package repository + +import ( + "sync" + + "github.com/ClusterCockpit/cc-backend/pkg/schema" +) + +type JobHook interface { + JobStartCallback(job *schema.Job) + JobStopCallback(job *schema.Job) +} + +var ( + initOnce sync.Once + hooks []JobHook +) + +func RegisterJobHook(hook JobHook) { + initOnce.Do(func() { + hooks = make([]JobHook, 0) + }) + + if hook != nil { + hooks = append(hooks, hook) + } +} + +func CallJobStartHooks(jobs []*schema.Job) { + if hooks == nil { + return + } + + for _, hook := range hooks { + if hook != nil { + for _, job := range jobs { + hook.JobStartCallback(job) + } + } + } +} + +func CallJobStopHooks(job *schema.Job) { + if hooks == nil { + return + } + + for _, hook := range hooks { + if hook != nil { + hook.JobStopCallback(job) + } + } +} diff --git a/internal/repository/jobQuery.go b/internal/repository/jobQuery.go index 6a2ddec..2f72e77 100644 --- a/internal/repository/jobQuery.go +++ b/internal/repository/jobQuery.go @@ -148,9 +148,7 @@ func BuildWhereClause(filter *model.JobFilter, query sq.SelectBuilder) sq.Select } if filter.DbID != nil { dbIDs := make([]string, len(filter.DbID)) - for i, val := range filter.DbID { - dbIDs[i] = val - } + copy(dbIDs, filter.DbID) query = query.Where(sq.Eq{"job.id": dbIDs}) } if filter.JobID != nil { diff --git a/internal/repository/job_test.go b/internal/repository/job_test.go index 363bb6c..bf7abd9 100644 --- a/internal/repository/job_test.go +++ b/internal/repository/job_test.go @@ -24,7 +24,7 @@ func TestFind(t *testing.T) { // fmt.Printf("%+v", job) - if job.ID != 5 { + if *job.ID != 5 { t.Errorf("wrong summary for diagnostic 3\ngot: %d \nwant: 1366", job.JobID) } } diff --git a/internal/repository/migration.go b/internal/repository/migration.go index 0b2591e..fb78170 100644 --- a/internal/repository/migration.go +++ b/internal/repository/migration.go @@ -16,7 +16,7 @@ import ( "github.com/golang-migrate/migrate/v4/source/iofs" ) -const Version uint = 8 +const Version uint = 10 //go:embed migrations/* var migrationFiles embed.FS @@ -115,8 +115,17 @@ func MigrateDB(backend string, db string) error { } v, dirty, err := m.Version() + if err != nil { + if err == migrate.ErrNilVersion { + log.Warn("Legacy database without version or missing database file!") + } else { + return err + } + } - log.Infof("unsupported database version %d, need %d.\nPlease backup your database file and run cc-backend -migrate-db", v, Version) + if v < Version { + log.Infof("unsupported database version %d, need %d.\nPlease backup your database file and run cc-backend -migrate-db", v, Version) + } if dirty { return fmt.Errorf("last migration to version %d has failed, please fix the db manually and force version with -force-db flag", Version) diff --git a/internal/repository/migrations/sqlite3/09_add-job-cache.down.sql b/internal/repository/migrations/sqlite3/09_add-job-cache.down.sql new file mode 100644 index 0000000..ef257cf --- /dev/null +++ b/internal/repository/migrations/sqlite3/09_add-job-cache.down.sql @@ -0,0 +1 @@ +DROP TABLE IF EXISTS job_cache; diff --git a/internal/repository/migrations/sqlite3/09_add-job-cache.up.sql b/internal/repository/migrations/sqlite3/09_add-job-cache.up.sql new file mode 100644 index 0000000..7840369 --- /dev/null +++ b/internal/repository/migrations/sqlite3/09_add-job-cache.up.sql @@ -0,0 +1,31 @@ +CREATE TABLE "job_cache" ( + id INTEGER PRIMARY KEY, + job_id BIGINT NOT NULL, + cluster VARCHAR(255) NOT NULL, + subcluster VARCHAR(255) NOT NULL, + start_time BIGINT NOT NULL, -- Unix timestamp + hpc_user
VARCHAR(255) NOT NULL, + project VARCHAR(255) NOT NULL, + cluster_partition VARCHAR(255), + array_job_id BIGINT, + duration INT NOT NULL, + walltime INT NOT NULL, + job_state VARCHAR(255) NOT NULL + CHECK (job_state IN ( + 'running', 'completed', 'failed', 'cancelled', + 'stopped', 'timeout', 'preempted', 'out_of_memory' + )), + meta_data TEXT, -- JSON + resources TEXT NOT NULL, -- JSON + num_nodes INT NOT NULL, + num_hwthreads INT, + num_acc INT, + smt TINYINT NOT NULL DEFAULT 1 CHECK (smt IN (0, 1)), + exclusive TINYINT NOT NULL DEFAULT 1 CHECK (exclusive IN (0, 1, 2)), + monitoring_status TINYINT NOT NULL DEFAULT 1 + CHECK (monitoring_status IN (0, 1, 2, 3)), + energy REAL NOT NULL DEFAULT 0.0, + energy_footprint TEXT DEFAULT NULL, + footprint TEXT DEFAULT NULL, + UNIQUE (job_id, cluster, start_time) +); diff --git a/internal/repository/migrations/sqlite3/10_node-table.down.sql b/internal/repository/migrations/sqlite3/10_node-table.down.sql new file mode 100644 index 0000000..9119a5a --- /dev/null +++ b/internal/repository/migrations/sqlite3/10_node-table.down.sql @@ -0,0 +1 @@ +DROP TABLE IF EXISTS node; diff --git a/internal/repository/migrations/sqlite3/10_node-table.up.sql b/internal/repository/migrations/sqlite3/10_node-table.up.sql new file mode 100644 index 0000000..a11f20d --- /dev/null +++ b/internal/repository/migrations/sqlite3/10_node-table.up.sql @@ -0,0 +1,17 @@ +CREATE TABLE "node" ( + id INTEGER PRIMARY KEY, + hostname VARCHAR(255) NOT NULL, + cluster VARCHAR(255) NOT NULL, + subcluster VARCHAR(255) NOT NULL, + node_state VARCHAR(255) NOT NULL + CHECK (node_state IN ( + 'allocated', 'reserved', 'idle', 'mixed', + 'down', 'unknown' + )), + health_state VARCHAR(255) NOT NULL + CHECK (health_state IN ( + 'full', 'partial', 'failed' + )), + meta_data TEXT, -- JSON + UNIQUE (hostname, cluster) +); diff --git a/internal/repository/node.go b/internal/repository/node.go new file mode 100644 index 0000000..0e742c2 --- /dev/null +++ b/internal/repository/node.go @@ -0,0 +1,241 @@ +// Copyright (C) NHR@FAU, University Erlangen-Nuremberg. +// All rights reserved. +// Use of this source code is governed by a MIT-style +// license that can be found in the LICENSE file. +package repository + +import ( + "database/sql" + "encoding/json" + "fmt" + "maps" + "sync" + "time" + + "github.com/ClusterCockpit/cc-backend/pkg/archive" + "github.com/ClusterCockpit/cc-backend/pkg/log" + "github.com/ClusterCockpit/cc-backend/pkg/lrucache" + "github.com/ClusterCockpit/cc-backend/pkg/schema" + sq "github.com/Masterminds/squirrel" + "github.com/jmoiron/sqlx" +) + +var ( + nodeRepoOnce sync.Once + nodeRepoInstance *NodeRepository +) + +type NodeRepository struct { + DB *sqlx.DB + stmtCache *sq.StmtCache + cache *lrucache.Cache + driver string +} + +func GetNodeRepository() *NodeRepository { + nodeRepoOnce.Do(func() { + db := GetConnection() + + nodeRepoInstance = &NodeRepository{ + DB: db.DB, + driver: db.Driver, + + stmtCache: sq.NewStmtCache(db.DB), + cache: lrucache.New(1024 * 1024), + } + }) + return nodeRepoInstance +} + +func (r *NodeRepository) FetchMetadata(node *schema.Node) (map[string]string, error) { + start := time.Now() + cachekey := fmt.Sprintf("metadata:%d", node.ID) + if cached := r.cache.Get(cachekey, nil); cached != nil { + node.MetaData = cached.(map[string]string) + return node.MetaData, nil + } + + if err := sq.Select("node.meta_data").From("node").Where("node.id = ?", node.ID). 
+ RunWith(r.stmtCache).QueryRow().Scan(&node.RawMetaData); err != nil { + log.Warn("Error while scanning for node metadata") + return nil, err + } + + if len(node.RawMetaData) == 0 { + return nil, nil + } + + if err := json.Unmarshal(node.RawMetaData, &node.MetaData); err != nil { + log.Warn("Error while unmarshaling raw metadata json") + return nil, err + } + + r.cache.Put(cachekey, node.MetaData, len(node.RawMetaData), 24*time.Hour) + log.Debugf("Timer FetchMetadata %s", time.Since(start)) + return node.MetaData, nil +} + +func (r *NodeRepository) UpdateMetadata(node *schema.Node, key, val string) (err error) { + cachekey := fmt.Sprintf("metadata:%d", node.ID) + r.cache.Del(cachekey) + if node.MetaData == nil { + if _, err = r.FetchMetadata(node); err != nil { + log.Warnf("Error while fetching metadata for node, DB ID '%v'", node.ID) + return err + } + } + + if node.MetaData != nil { + cpy := make(map[string]string, len(node.MetaData)+1) + maps.Copy(cpy, node.MetaData) + cpy[key] = val + node.MetaData = cpy + } else { + node.MetaData = map[string]string{key: val} + } + + if node.RawMetaData, err = json.Marshal(node.MetaData); err != nil { + log.Warnf("Error while marshaling metadata for node, DB ID '%v'", node.ID) + return err + } + + if _, err = sq.Update("node"). + Set("meta_data", node.RawMetaData). + Where("node.id = ?", node.ID). + RunWith(r.stmtCache).Exec(); err != nil { + log.Warnf("Error while updating metadata for node, DB ID '%v'", node.ID) + return err + } + + r.cache.Put(cachekey, node.MetaData, len(node.RawMetaData), 24*time.Hour) + return nil +} + +func (r *NodeRepository) GetNode(id int64, withMeta bool) (*schema.Node, error) { + node := &schema.Node{} + if err := sq.Select("id", "hostname", "cluster", "subcluster", "node_state", + "health_state").From("node"). + Where("node.id = ?", id).RunWith(r.DB). + QueryRow().Scan(&node.ID, &node.Hostname, &node.Cluster, &node.SubCluster, &node.NodeState, + &node.HealthState); err != nil { + log.Warnf("Error while querying node '%v' from database", id) + return nil, err + } + + if withMeta { + var err error + var meta map[string]string + if meta, err = r.FetchMetadata(node); err != nil { + log.Warnf("Error while fetching metadata for node '%v'", id) + return nil, err + } + node.MetaData = meta + } + + return node, nil +} + +const NamedNodeInsert string = ` +INSERT INTO node (hostname, cluster, subcluster, node_state, health_state) + VALUES (:hostname, :cluster, :subcluster, :node_state, :health_state);` + +func (r *NodeRepository) AddNode(node *schema.Node) (int64, error) { + var err error + + res, err := r.DB.NamedExec(NamedNodeInsert, node) + if err != nil { + log.Errorf("Error while adding node '%v' to database", node.Hostname) + return 0, err + } + node.ID, err = res.LastInsertId() + if err != nil { + log.Errorf("Error while getting last insert id for node '%v' from database", node.Hostname) + return 0, err + } + + return node.ID, nil +} + +func (r *NodeRepository) UpdateNodeState(hostname string, cluster string, nodeState *schema.NodeState) error { + var id int64 + if err := sq.Select("id").From("node"). + Where("node.hostname = ?", hostname).Where("node.cluster = ?", cluster).RunWith(r.DB). 
+ QueryRow().Scan(&id); err != nil { + if err == sql.ErrNoRows { + subcluster, err := archive.GetSubClusterByNode(cluster, hostname) + if err != nil { + log.Errorf("Error while getting subcluster for node '%s' in cluster '%s': %v", hostname, cluster, err) + return err + } + node := schema.Node{ + Hostname: hostname, Cluster: cluster, SubCluster: subcluster, NodeState: *nodeState, + HealthState: schema.MonitoringStateFull, + } + _, err = r.AddNode(&node) + if err != nil { + log.Errorf("Error while adding node '%s' to database: %v", hostname, err) + return err + } + + return nil + } else { + log.Warnf("Error while querying node '%s' from database", hostname) + return err + } + } + + if _, err := sq.Update("node").Set("node_state", nodeState).Where("node.id = ?", id).RunWith(r.DB).Exec(); err != nil { + log.Errorf("error while updating node '%s'", hostname) + return err + } + + return nil +} + +// func (r *NodeRepository) UpdateHealthState(hostname string, healthState *schema.MonitoringState) error { +// if _, err := sq.Update("node").Set("health_state", healthState).Where("node.id = ?", id).RunWith(r.DB).Exec(); err != nil { +// log.Errorf("error while updating node '%d'", id) +// return err +// } +// +// return nil +// } + +func (r *NodeRepository) DeleteNode(id int64) error { + _, err := r.DB.Exec(`DELETE FROM node WHERE node.id = ?`, id) + if err != nil { + log.Errorf("Error while deleting node '%d' from DB", id) + return err + } + log.Infof("deleted node '%d' from DB", id) + return nil +} + +func (r *NodeRepository) QueryNodes() ([]*schema.Node, error) { + return nil, nil +} + +func (r *NodeRepository) ListNodes(cluster string) ([]*schema.Node, error) { + q := sq.Select("hostname", "cluster", "subcluster", "node_state", + "health_state").From("node").Where("node.cluster = ?", cluster).OrderBy("node.hostname ASC") + + rows, err := q.RunWith(r.DB).Query() + if err != nil { + log.Warn("Error while querying node list") + return nil, err + } + nodeList := make([]*schema.Node, 0, 100) + defer rows.Close() + for rows.Next() { + node := &schema.Node{} + if err := rows.Scan(&node.Hostname, &node.Cluster, + &node.SubCluster, &node.NodeState, &node.HealthState); err != nil { + log.Warn("Error while scanning node list") + return nil, err + } + + nodeList = append(nodeList, node) + } + + return nodeList, nil +} diff --git a/internal/repository/stats.go b/internal/repository/stats.go index 410ba6c..7a5078f 100644 --- a/internal/repository/stats.go +++ b/internal/repository/stats.go @@ -291,7 +291,7 @@ func (r *JobRepository) JobsStats( return stats, nil } -func LoadJobStat(job *schema.JobMeta, metric string, statType string) float64 { +func LoadJobStat(job *schema.Job, metric string, statType string) float64 { if stats, ok := job.Statistics[metric]; ok { switch statType { case "avg": @@ -759,7 +759,6 @@ func (r *JobRepository) runningJobsMetricStatisticsHistogram( filters []*model.JobFilter, bins *int, ) []*model.MetricHistoPoints { - // Get Jobs jobs, err := r.QueryJobs(ctx, filters, &model.PageRequest{Page: 1, ItemsPerPage: 500 + 1}, nil) if err != nil { diff --git a/internal/repository/tags.go b/internal/repository/tags.go index 544163e..a9416c4 100644 --- a/internal/repository/tags.go +++ b/internal/repository/tags.go @@ -45,6 +45,36 @@ func (r *JobRepository) AddTag(user *schema.User, job int64, tag int64) ([]*sche return tags, archive.UpdateTags(j, archiveTags) } +func (r *JobRepository) AddTagDirect(job int64, tag int64) ([]*schema.Tag, error) { + j, err := r.FindByIdDirect(job) + if err != nil { +
log.Warn("Error while finding job by id") + return nil, err + } + + q := sq.Insert("jobtag").Columns("job_id", "tag_id").Values(job, tag) + + if _, err := q.RunWith(r.stmtCache).Exec(); err != nil { + s, _, _ := q.ToSql() + log.Errorf("Error adding tag with %s: %v", s, err) + return nil, err + } + + tags, err := r.GetTagsDirect(&job) + if err != nil { + log.Warn("Error while getting tags for job") + return nil, err + } + + archiveTags, err := r.getArchiveTags(&job) + if err != nil { + log.Warn("Error while getting tags for job") + return nil, err + } + + return tags, archive.UpdateTags(j, archiveTags) +} + // Removes a tag from a job by tag id func (r *JobRepository) RemoveTag(user *schema.User, job, tag int64) ([]*schema.Tag, error) { j, err := r.FindByIdWithUser(user, job) @@ -82,7 +112,7 @@ func (r *JobRepository) RemoveJobTagByRequest(user *schema.User, job int64, tagT tagID, exists := r.TagId(tagType, tagName, tagScope) if !exists { log.Warnf("Tag does not exist (name, type, scope): %s, %s, %s", tagName, tagType, tagScope) - return nil, fmt.Errorf("Tag does not exist (name, type, scope): %s, %s, %s", tagName, tagType, tagScope) + return nil, fmt.Errorf("tag does not exist (name, type, scope): %s, %s, %s", tagName, tagType, tagScope) } // Get Job @@ -122,7 +152,7 @@ func (r *JobRepository) RemoveTagByRequest(tagType string, tagName string, tagSc tagID, exists := r.TagId(tagType, tagName, tagScope) if !exists { log.Warnf("Tag does not exist (name, type, scope): %s, %s, %s", tagName, tagType, tagScope) - return fmt.Errorf("Tag does not exist (name, type, scope): %s, %s, %s", tagName, tagType, tagScope) + return fmt.Errorf("tag does not exist (name, type, scope): %s, %s, %s", tagName, tagType, tagScope) } // Handle Delete JobTagTable @@ -291,6 +321,37 @@ func (r *JobRepository) AddTagOrCreate(user *schema.User, jobId int64, tagType s return tagId, nil } +func (r *JobRepository) AddTagOrCreateDirect(jobId int64, tagType string, tagName string) (tagId int64, err error) { + tagScope := "global" + + tagId, exists := r.TagId(tagType, tagName, tagScope) + if !exists { + tagId, err = r.CreateTag(tagType, tagName, tagScope) + if err != nil { + return 0, err + } + } + + if _, err := r.AddTagDirect(jobId, tagId); err != nil { + return 0, err + } + + return tagId, nil +} + +func (r *JobRepository) HasTag(jobId int64, tagType string, tagName string) bool { + var id int64 + q := sq.Select("id").From("tag").Join("jobtag ON jobtag.tag_id = tag.id"). + Where("jobtag.job_id = ?", jobId).Where("tag.tag_type = ?", tagType). + Where("tag.tag_name = ?", tagName) + err := q.RunWith(r.stmtCache).QueryRow().Scan(&id) + if err != nil { + return false + } else { + return true + } +} + // TagId returns the database id of the tag with the specified type and name. 
func (r *JobRepository) TagId(tagType string, tagName string, tagScope string) (tagId int64, exists bool) { exists = true @@ -346,6 +407,32 @@ func (r *JobRepository) GetTags(user *schema.User, job *int64) ([]*schema.Tag, e return tags, nil } +func (r *JobRepository) GetTagsDirect(job *int64) ([]*schema.Tag, error) { + q := sq.Select("id", "tag_type", "tag_name", "tag_scope").From("tag") + if job != nil { + q = q.Join("jobtag ON jobtag.tag_id = tag.id").Where("jobtag.job_id = ?", *job) + } + + rows, err := q.RunWith(r.stmtCache).Query() + if err != nil { + s, _, _ := q.ToSql() + log.Errorf("Error get tags with %s: %v", s, err) + return nil, err + } + + tags := make([]*schema.Tag, 0) + for rows.Next() { + tag := &schema.Tag{} + if err := rows.Scan(&tag.ID, &tag.Type, &tag.Name, &tag.Scope); err != nil { + log.Warn("Error while scanning rows") + return nil, err + } + tags = append(tags, tag) + } + + return tags, nil +} + // GetArchiveTags returns a list of all tags *regardless of scope* for archiving if job is nil or of the tags that the job with that database ID has. func (r *JobRepository) getArchiveTags(job *int64) ([]*schema.Tag, error) { q := sq.Select("id", "tag_type", "tag_name", "tag_scope").From("tag") diff --git a/internal/repository/testdata/job.db b/internal/repository/testdata/job.db index 43ec9d3..e9e20ce 100644 Binary files a/internal/repository/testdata/job.db and b/internal/repository/testdata/job.db differ diff --git a/internal/tagger/apps/gromacs.txt b/internal/tagger/apps/gromacs.txt new file mode 100644 index 0000000..c5d939b --- /dev/null +++ b/internal/tagger/apps/gromacs.txt @@ -0,0 +1,4 @@ +GROMACS +gromacs +GMX +mdrun diff --git a/internal/tagger/apps/openfoam.txt b/internal/tagger/apps/openfoam.txt new file mode 100644 index 0000000..542d645 --- /dev/null +++ b/internal/tagger/apps/openfoam.txt @@ -0,0 +1 @@ +openfoam diff --git a/internal/tagger/apps/python.txt b/internal/tagger/apps/python.txt new file mode 100644 index 0000000..7a5c661 --- /dev/null +++ b/internal/tagger/apps/python.txt @@ -0,0 +1,3 @@ +python +anaconda +conda diff --git a/internal/tagger/apps/vasp.txt b/internal/tagger/apps/vasp.txt new file mode 100644 index 0000000..eec9092 --- /dev/null +++ b/internal/tagger/apps/vasp.txt @@ -0,0 +1,2 @@ +VASP +vasp diff --git a/internal/tagger/classifyJob.go b/internal/tagger/classifyJob.go new file mode 100644 index 0000000..0af7096 --- /dev/null +++ b/internal/tagger/classifyJob.go @@ -0,0 +1,322 @@ +// Copyright (C) NHR@FAU, University Erlangen-Nuremberg. +// All rights reserved. This file is part of cc-backend. +// Use of this source code is governed by a MIT-style +// license that can be found in the LICENSE file. 
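classifyJob.go below drives the new job classification: rule files are JSON, compiled once with expr-lang, and then evaluated against a per-job environment of parameters, job properties and metric statistics. A minimal, self-contained sketch of that compile-and-run pattern; the rule string and environment values here are invented for illustration, only the expr API calls mirror the code below:

package main

import (
	"fmt"

	"github.com/expr-lang/expr"
)

func main() {
	// Compile once, as prepareRule() does for requirements, variables and the rule.
	program, err := expr.Compile("cpu_load.avg < job.numCores * lowcpuload_threshold_factor", expr.AsBool())
	if err != nil {
		panic(err)
	}

	// Build a per-job environment, analogous to env in JobClassTagger.Match().
	env := map[string]any{
		"job":                         map[string]any{"numCores": 72},
		"cpu_load":                    map[string]any{"avg": 12.5},
		"lowcpuload_threshold_factor": 0.9,
	}

	match, err := expr.Run(program, env)
	if err != nil {
		panic(err)
	}
	fmt.Println(match.(bool)) // true: 12.5 is well below 72 * 0.9
}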
+package tagger + +import ( + "bytes" + "embed" + "encoding/json" + "fmt" + "maps" + "os" + "strings" + "text/template" + + "github.com/ClusterCockpit/cc-backend/internal/repository" + "github.com/ClusterCockpit/cc-backend/internal/util" + "github.com/ClusterCockpit/cc-backend/pkg/archive" + "github.com/ClusterCockpit/cc-backend/pkg/log" + "github.com/ClusterCockpit/cc-backend/pkg/schema" + "github.com/expr-lang/expr" + "github.com/expr-lang/expr/vm" +) + +//go:embed jobclasses/* +var jobclassFiles embed.FS + +type Variable struct { + Name string `json:"name"` + Expr string `json:"expr"` +} + +type ruleVariable struct { + name string + expr *vm.Program +} + +type RuleFormat struct { + Name string `json:"name"` + Tag string `json:"tag"` + Parameters []string `json:"parameters"` + Metrics []string `json:"metrics"` + Requirements []string `json:"requirements"` + Variables []Variable `json:"variables"` + Rule string `json:"rule"` + Hint string `json:"hint"` +} + +type ruleInfo struct { + env map[string]any + metrics []string + requirements []*vm.Program + variables []ruleVariable + rule *vm.Program + hint *template.Template +} + +type JobClassTagger struct { + rules map[string]ruleInfo + parameters map[string]any + tagType string + cfgPath string +} + +func (t *JobClassTagger) prepareRule(b []byte, fns string) { + var rule RuleFormat + if err := json.NewDecoder(bytes.NewReader(b)).Decode(&rule); err != nil { + log.Warn("Error while decoding job class rule json") + return + } + + ri := ruleInfo{} + ri.env = make(map[string]any) + ri.metrics = make([]string, 0) + ri.requirements = make([]*vm.Program, 0) + ri.variables = make([]ruleVariable, 0) + + // check if all required parameters are available + for _, p := range rule.Parameters { + param, ok := t.parameters[p] + if !ok { + log.Warnf("prepareRule() > missing parameter %s in rule %s", p, fns) + return + } + ri.env[p] = param + } + + // set all required metrics + ri.metrics = append(ri.metrics, rule.Metrics...)
+ + // compile requirements + for _, r := range rule.Requirements { + req, err := expr.Compile(r, expr.AsBool()) + if err != nil { + log.Errorf("error compiling requirement %s: %#v", r, err) + return + } + ri.requirements = append(ri.requirements, req) + } + + // compile variables + for _, v := range rule.Variables { + req, err := expr.Compile(v.Expr, expr.AsFloat64()) + if err != nil { + log.Errorf("error compiling variable %s: %#v", v.Name, err) + return + } + ri.variables = append(ri.variables, ruleVariable{name: v.Name, expr: req}) + } + + // compile rule + exp, err := expr.Compile(rule.Rule, expr.AsBool()) + if err != nil { + log.Errorf("error compiling rule %s: %#v", fns, err) + return + } + ri.rule = exp + + // prepare hint template + ri.hint, err = template.New(fns).Parse(rule.Hint) + if err != nil { + log.Errorf("error processing template %s: %#v", fns, err) + } + log.Infof("prepareRule() > processing %s with %d requirements and %d variables", fns, len(ri.requirements), len(ri.variables)) + + t.rules[rule.Tag] = ri +} + +func (t *JobClassTagger) EventMatch(s string) bool { + return strings.Contains(s, "jobclasses") +} + +// FIXME: Only process the file that caused the event +func (t *JobClassTagger) EventCallback() { + files, err := os.ReadDir(t.cfgPath) + if err != nil { + log.Fatal(err) + } + + if util.CheckFileExists(t.cfgPath + "/parameters.json") { + log.Info("Merge parameters") + b, err := os.ReadFile(t.cfgPath + "/parameters.json") + if err != nil { + log.Warnf("prepareRule() > open file error: %v", err) + } + + var paramTmp map[string]any + if err := json.NewDecoder(bytes.NewReader(b)).Decode(&paramTmp); err != nil { + log.Warn("Error while decoding parameters.json") + } + + maps.Copy(t.parameters, paramTmp) + } + + for _, fn := range files { + fns := fn.Name() + if fns != "parameters.json" { + log.Debugf("Process: %s", fns) + filename := fmt.Sprintf("%s/%s", t.cfgPath, fns) + b, err := os.ReadFile(filename) + if err != nil { + log.Warnf("prepareRule() > open file error: %v", err) + return + } + t.prepareRule(b, fns) + } + } +} + +func (t *JobClassTagger) initParameters() error { + log.Info("Initialize parameters") + b, err := jobclassFiles.ReadFile("jobclasses/parameters.json") + if err != nil { + log.Warnf("prepareRule() > open file error: %v", err) + return err + } + + if err := json.NewDecoder(bytes.NewReader(b)).Decode(&t.parameters); err != nil { + log.Warn("Error while decoding parameters.json") + return err + } + + return nil +} + +func (t *JobClassTagger) Register() error { + t.cfgPath = "./var/tagger/jobclasses" + t.tagType = "jobClass" + + err := t.initParameters() + if err != nil { + log.Warnf("error reading parameters.json: %v", err) + return err + } + + files, err := jobclassFiles.ReadDir("jobclasses") + if err != nil { + return fmt.Errorf("error reading jobclasses folder: %#v", err) + } + t.rules = make(map[string]ruleInfo, 0) + for _, fn := range files { + fns := fn.Name() + if fns != "parameters.json" { + filename := fmt.Sprintf("jobclasses/%s", fns) + log.Infof("Process: %s", fns) + + b, err := jobclassFiles.ReadFile(filename) + if err != nil { + log.Warnf("prepareRule() > open file error: %v", err) + return err + } + t.prepareRule(b, fns) + } + } + + if util.CheckFileExists(t.cfgPath) { + t.EventCallback() + log.Infof("Setup file watch for %s", t.cfgPath) + util.AddListener(t.cfgPath, t) + } + + return nil +} + +func (t *JobClassTagger) Match(job *schema.Job) { + r := repository.GetJobRepository() + jobstats, err := archive.GetStatistics(job) + metricsList =
archive.GetMetricConfigSubCluster(job.Cluster, job.SubCluster) + log.Infof("Enter match rule with %d rules for job %d", len(t.rules), job.JobID) + if err != nil { + log.Errorf("job classification failed for job %d: %#v", job.JobID, err) + return + } + +ruleLoop: + for tag, ri := range t.rules { + env := make(map[string]any) + maps.Copy(env, ri.env) + log.Infof("Try to match rule %s for job %d", tag, job.JobID) + + // Initialize environment + env["job"] = map[string]any{ + "exclusive": job.Exclusive, + "duration": job.Duration, + "numCores": job.NumHWThreads, + "numNodes": job.NumNodes, + "jobState": job.State, + "numAcc": job.NumAcc, + "smt": job.SMT, + } + + // add metrics to env + for _, m := range ri.metrics { + stats, ok := jobstats[m] + if !ok { + log.Errorf("job classification failed for job %d: missing metric '%s'", job.JobID, m) + continue ruleLoop + } + env[m] = map[string]any{ + "min": stats.Min, + "max": stats.Max, + "avg": stats.Avg, + "limits": map[string]float64{ + "peak": metricsList[m].Peak, + "normal": metricsList[m].Normal, + "caution": metricsList[m].Caution, + "alert": metricsList[m].Alert, + }, + } + } + + // check rule requirements apply + for _, r := range ri.requirements { + ok, err := expr.Run(r, env) + if err != nil { + log.Errorf("error running requirement for rule %s: %#v", tag, err) + return + } + if !ok.(bool) { + log.Infof("requirement for rule %s not met", tag) + continue ruleLoop + } + } + + // evaluate variables + for _, v := range ri.variables { + value, err := expr.Run(v.expr, env) + if err != nil { + log.Errorf("error running rule %s: %#v", tag, err) + return + } + env[v.name] = value + } + + // dump.P(env) + + match, err := expr.Run(ri.rule, env) + if err != nil { + log.Errorf("error running rule %s: %#v", tag, err) + return + } + if match.(bool) { + log.Info("Rule matches!") + id := *job.ID + if !r.HasTag(id, t.tagType, tag) { + r.AddTagOrCreateDirect(id, t.tagType, tag) + } + + // process hint template + var msg bytes.Buffer + if err := ri.hint.Execute(&msg, env); err != nil { + log.Errorf("Template error: %s", err.Error()) + return + } + + // FIXME: Handle case where multiple tags apply + r.UpdateMetadata(job, "message", msg.String()) + } else { + log.Info("Rule does not match!") + } + } +} diff --git a/internal/tagger/detectApp.go b/internal/tagger/detectApp.go new file mode 100644 index 0000000..9e4bf29 --- /dev/null +++ b/internal/tagger/detectApp.go @@ -0,0 +1,125 @@ +// Copyright (C) NHR@FAU, University Erlangen-Nuremberg. +// All rights reserved. This file is part of cc-backend. +// Use of this source code is governed by a MIT-style +// license that can be found in the LICENSE file.
+package tagger + +import ( + "bufio" + "embed" + "fmt" + "io/fs" + "os" + "path/filepath" + "strings" + + "github.com/ClusterCockpit/cc-backend/internal/repository" + "github.com/ClusterCockpit/cc-backend/internal/util" + "github.com/ClusterCockpit/cc-backend/pkg/log" + "github.com/ClusterCockpit/cc-backend/pkg/schema" +) + +//go:embed apps/* +var appFiles embed.FS + +type appInfo struct { + tag string + strings []string +} + +type AppTagger struct { + apps map[string]appInfo + tagType string + cfgPath string +} + +func (t *AppTagger) scanApp(f fs.File, fns string) { + scanner := bufio.NewScanner(f) + ai := appInfo{tag: strings.TrimSuffix(fns, filepath.Ext(fns)), strings: make([]string, 0)} + + for scanner.Scan() { + ai.strings = append(ai.strings, scanner.Text()) + } + delete(t.apps, ai.tag) + t.apps[ai.tag] = ai +} + +func (t *AppTagger) EventMatch(s string) bool { + return strings.Contains(s, "apps") +} + +// FIXME: Only process the file that caused the event +func (t *AppTagger) EventCallback() { + files, err := os.ReadDir(t.cfgPath) + if err != nil { + log.Fatal(err) + } + + for _, fn := range files { + fns := fn.Name() + log.Debugf("Process: %s", fns) + f, err := os.Open(fmt.Sprintf("%s/%s", t.cfgPath, fns)) + if err != nil { + log.Errorf("error opening app file %s: %#v", fns, err) + continue + } + t.scanApp(f, fns) + f.Close() + } +} + +func (t *AppTagger) Register() error { + t.cfgPath = "./var/tagger/apps" + t.tagType = "app" + + files, err := appFiles.ReadDir("apps") + if err != nil { + return fmt.Errorf("error reading app folder: %#v", err) + } + t.apps = make(map[string]appInfo, 0) + for _, fn := range files { + fns := fn.Name() + log.Debugf("Process: %s", fns) + f, err := appFiles.Open(fmt.Sprintf("apps/%s", fns)) + if err != nil { + return fmt.Errorf("error opening app file %s: %#v", fns, err) + } + defer f.Close() + t.scanApp(f, fns) + } + + if util.CheckFileExists(t.cfgPath) { + t.EventCallback() + log.Infof("Setup file watch for %s", t.cfgPath) + util.AddListener(t.cfgPath, t) + } + + return nil +} + +func (t *AppTagger) Match(job *schema.Job) { + r := repository.GetJobRepository() + metadata, err := r.FetchMetadata(job) + if err != nil { + log.Infof("Cannot fetch metadata for job: %d on %s", job.JobID, job.Cluster) + return + } + + jobscript, ok := metadata["jobScript"] + if ok { + id := *job.ID + + out: + for _, a := range t.apps { + tag := a.tag + for _, s := range a.strings { + if strings.Contains(jobscript, s) { + if !r.HasTag(id, t.tagType, tag) { + r.AddTagOrCreateDirect(id, t.tagType, tag) + break out + } + } + } + } + } else { + log.Infof("Cannot extract job script for job: %d on %s", job.JobID, job.Cluster) + } +} diff --git a/internal/tagger/detectApp_test.go b/internal/tagger/detectApp_test.go new file mode 100644 index 0000000..3b43cce --- /dev/null +++ b/internal/tagger/detectApp_test.go @@ -0,0 +1,59 @@ +// Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg. +// All rights reserved. +// Use of this source code is governed by a MIT-style +// license that can be found in the LICENSE file.
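Both taggers reload their configuration at runtime via the small fsnotify wrapper added later in this patch (internal/util/fswatcher.go): anything that implements the Listener interface can subscribe to a path with util.AddListener, exactly as AppTagger.Register() does above. A compact sketch of a hypothetical listener; the reloader type and its log line are illustrative, while Listener, AddListener and FsWatcherShutdown are the actual additions:

package main

import (
	"strings"

	"github.com/ClusterCockpit/cc-backend/internal/util"
	"github.com/ClusterCockpit/cc-backend/pkg/log"
)

type reloader struct{}

// EventMatch filters the raw fsnotify event string before dispatch.
func (r *reloader) EventMatch(event string) bool {
	return strings.Contains(event, "apps")
}

// EventCallback re-reads the whole directory, mirroring AppTagger.EventCallback().
func (r *reloader) EventCallback() {
	log.Info("apps directory changed, reloading")
}

func main() {
	util.AddListener("./var/tagger/apps", &reloader{})
	defer util.FsWatcherShutdown()
	select {} // block; callbacks arrive from the watcher goroutine
}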
+package tagger + +import ( + "testing" + + "github.com/ClusterCockpit/cc-backend/internal/repository" + "github.com/ClusterCockpit/cc-backend/pkg/log" +) + +func setup(tb testing.TB) *repository.JobRepository { + tb.Helper() + log.Init("warn", true) + dbfile := "../repository/testdata/job.db" + err := repository.MigrateDB("sqlite3", dbfile) + noErr(tb, err) + repository.Connect("sqlite3", dbfile) + return repository.GetJobRepository() +} + +func noErr(tb testing.TB, err error) { + tb.Helper() + + if err != nil { + tb.Fatal("Error is not nil:", err) + } +} + +func TestRegister(t *testing.T) { + var tagger AppTagger + + err := tagger.Register() + noErr(t, err) + + if len(tagger.apps) != 4 { + t.Errorf("wrong summary for diagnostic \ngot: %d \nwant: 4", len(tagger.apps)) + } +} + +func TestMatch(t *testing.T) { + r := setup(t) + + job, err := r.FindByIdDirect(5) + noErr(t, err) + + var tagger AppTagger + + err = tagger.Register() + noErr(t, err) + + tagger.Match(job) + + if !r.HasTag(5, "app", "vasp") { + t.Errorf("missing tag vasp") + } +} diff --git a/internal/tagger/jobclasses/highload.json b/internal/tagger/jobclasses/highload.json new file mode 100644 index 0000000..01476c1 --- /dev/null +++ b/internal/tagger/jobclasses/highload.json @@ -0,0 +1,26 @@ +{ + "name": "Excessive CPU load", + "tag": "excessiveload", + "parameters": [ + "excessivecpuload_threshold_factor", + "job_min_duration_seconds", + "sampling_interval_seconds" + ], + "metrics": ["cpu_load"], + "requirements": [ + "job.exclusive == 1", + "job.duration > job_min_duration_seconds" + ], + "variables": [ + { + "name": "load_threshold", + "expr": "(job.numCores / job.numNodes) * excessivecpuload_threshold_factor" + }, + { + "name": "load_perc", + "expr": "1.0 - (cpu_load.avg / cpu_load.limits.peak)" + } + ], + "rule": "cpu_load.avg > cpu_load.limits.peak", + "hint": "This job was detected as excessiveload because the average cpu load {{.cpu_load.avg}} falls above the threshold {{.cpu_load.limits.peak}}." +} diff --git a/internal/tagger/jobclasses/lowload.json b/internal/tagger/jobclasses/lowload.json new file mode 100644 index 0000000..2212bd1 --- /dev/null +++ b/internal/tagger/jobclasses/lowload.json @@ -0,0 +1,26 @@ +{ + "name": "Low CPU load", + "tag": "lowload", + "parameters": [ + "lowcpuload_threshold_factor", + "job_min_duration_seconds", + "sampling_interval_seconds" + ], + "metrics": ["cpu_load"], + "requirements": [ + "job.exclusive == 1", + "job.duration > job_min_duration_seconds" + ], + "variables": [ + { + "name": "load_threshold", + "expr": "job.numCores * lowcpuload_threshold_factor" + }, + { + "name": "load_perc", + "expr": "1.0 - (cpu_load.avg / cpu_load.limits.peak)" + } + ], + "rule": "cpu_load.avg < cpu_load.limits.caution", + "hint": "This job was detected as lowload because the average cpu load {{.cpu_load.avg}} falls below the threshold {{.cpu_load.limits.caution}}."
+} diff --git a/internal/tagger/jobclasses/parameters.json b/internal/tagger/jobclasses/parameters.json new file mode 100644 index 0000000..39e94c1 --- /dev/null +++ b/internal/tagger/jobclasses/parameters.json @@ -0,0 +1,14 @@ +{ + "lowcpuload_threshold_factor": 0.9, + "excessivecpuload_threshold_factor": 1.1, + "highmemoryusage_threshold_factor": 0.9, + "node_load_imbalance_threshold_factor": 0.1, + "core_load_imbalance_threshold_factor": 0.1, + "high_memory_load_threshold_factor": 0.9, + "lowgpuload_threshold_factor": 0.7, + "memory_leak_slope_threshold": 0.1, + "job_min_duration_seconds": 600.0, + "sampling_interval_seconds": 30.0, + "cpu_load_pre_cutoff_samples": 11.0, + "cpu_load_core_pre_cutoff_samples": 6.0 +} diff --git a/internal/tagger/rules.json b/internal/tagger/rules.json new file mode 100644 index 0000000..c88afb4 --- /dev/null +++ b/internal/tagger/rules.json @@ -0,0 +1,21 @@ +{ + "and": [ + { + "in": [ + "a40", + { + "var": "metaData.jobScript" + } + ] + }, + { + ">": [ + { + "var": "statistics.clock.min" + }, + 2000 + ] + } + ] + } + \ No newline at end of file diff --git a/internal/tagger/tagger.go b/internal/tagger/tagger.go new file mode 100644 index 0000000..04edd49 --- /dev/null +++ b/internal/tagger/tagger.go @@ -0,0 +1,88 @@ +// Copyright (C) 2023 NHR@FAU, University Erlangen-Nuremberg. +// All rights reserved. +// Use of this source code is governed by a MIT-style +// license that can be found in the LICENSE file. +package tagger + +import ( + "sync" + + "github.com/ClusterCockpit/cc-backend/internal/repository" + "github.com/ClusterCockpit/cc-backend/pkg/log" + "github.com/ClusterCockpit/cc-backend/pkg/schema" +) + +type Tagger interface { + Register() error + Match(job *schema.Job) +} + +var ( + initOnce sync.Once + jobTagger *JobTagger +) + +type JobTagger struct { + startTaggers []Tagger + stopTaggers []Tagger +} + +func newTagger() { + jobTagger = &JobTagger{} + jobTagger.startTaggers = make([]Tagger, 0) + jobTagger.startTaggers = append(jobTagger.startTaggers, &AppTagger{}) + jobTagger.stopTaggers = make([]Tagger, 0) + jobTagger.stopTaggers = append(jobTagger.stopTaggers, &JobClassTagger{}) + + for _, tagger := range jobTagger.startTaggers { + tagger.Register() + } + for _, tagger := range jobTagger.stopTaggers { + tagger.Register() + } +} + +func Init() { + initOnce.Do(func() { + newTagger() + repository.RegisterJobHook(jobTagger) + }) +} + +func (jt *JobTagger) JobStartCallback(job *schema.Job) { + for _, tagger := range jt.startTaggers { + tagger.Match(job) + } +} + +func (jt *JobTagger) JobStopCallback(job *schema.Job) { + for _, tagger := range jt.stopTaggers { + tagger.Match(job) + } +} + +func RunTaggers() error { + newTagger() + r := repository.GetJobRepository() + jl, err := r.GetJobList() + if err != nil { + log.Errorf("Error while getting job list %s", err) + return err + } + + for _, id := range jl { + job, err := r.FindByIdDirect(id) + if err != nil { + log.Errorf("Error while getting job %s", err) + return err + } + for _, tagger := range jobTagger.startTaggers { + tagger.Match(job) + } + for _, tagger := range jobTagger.stopTaggers { + log.Infof("Run stop tagger for job %d", *job.ID) + tagger.Match(job) + } + } + return nil +} diff --git a/internal/tagger/tagger_test.go b/internal/tagger/tagger_test.go new file mode 100644 index 0000000..057ca17 --- /dev/null +++ b/internal/tagger/tagger_test.go @@ -0,0 +1,31 @@ +// Copyright (C) NHR@FAU, University Erlangen-Nuremberg. +// All rights reserved.
+// Use of this source code is governed by a MIT-style +// license that can be found in the LICENSE file. +package tagger + +import ( + "testing" + + "github.com/ClusterCockpit/cc-backend/internal/repository" + "github.com/ClusterCockpit/cc-backend/pkg/schema" +) + +func TestInit(t *testing.T) { + Init() +} + +func TestJobStartCallback(t *testing.T) { + Init() + r := setup(t) + job, err := r.FindByIdDirect(2) + noErr(t, err) + + jobs := make([]*schema.Job, 0, 1) + jobs = append(jobs, job) + + repository.CallJobStartHooks(jobs) + if !r.HasTag(2, "app", "python") { + t.Errorf("missing tag python") + } +} diff --git a/internal/taskManager/commitJobService.go b/internal/taskManager/commitJobService.go new file mode 100644 index 0000000..c60acb3 --- /dev/null +++ b/internal/taskManager/commitJobService.go @@ -0,0 +1,35 @@ +// Copyright (C) NHR@FAU, University Erlangen-Nuremberg. +// All rights reserved. +// Use of this source code is governed by a MIT-style +// license that can be found in the LICENSE file. +package taskManager + +import ( + "time" + + "github.com/ClusterCockpit/cc-backend/internal/config" + "github.com/ClusterCockpit/cc-backend/internal/repository" + "github.com/ClusterCockpit/cc-backend/pkg/log" + "github.com/go-co-op/gocron/v2" +) + +func RegisterCommitJobService() { + var frequency string + if config.Keys.CronFrequency != nil && config.Keys.CronFrequency.CommitJobWorker != "" { + frequency = config.Keys.CronFrequency.CommitJobWorker + } else { + frequency = "2m" + } + d, err := time.ParseDuration(frequency) + if err != nil { + log.Warnf("Invalid commitJob frequency %q, falling back to 2m: %v", frequency, err) + d = 2 * time.Minute + } + log.Infof("Register commitJob service with %s interval", frequency) + + s.NewJob(gocron.DurationJob(d), + gocron.NewTask( + func() { + start := time.Now() + log.Printf("Jobcache sync started at %s", start.Format(time.RFC3339)) + jobs, err := jobRepo.SyncJobs() + if err != nil { + log.Errorf("Jobcache sync failed: %v", err) + return + } + repository.CallJobStartHooks(jobs) + log.Printf("Jobcache sync and job callbacks are done and took %s", time.Since(start)) + })) +} diff --git a/internal/taskManager/taskManager.go b/internal/taskManager/taskManager.go index 2004e0d..7d9a3a2 100644 --- a/internal/taskManager/taskManager.go +++ b/internal/taskManager/taskManager.go @@ -81,6 +81,7 @@ func Start() { RegisterFootprintWorker() RegisterUpdateDurationWorker() + RegisterCommitJobService() s.Start() } diff --git a/internal/taskManager/updateFootprintService.go b/internal/taskManager/updateFootprintService.go index a220855..f417ad4 100644 --- a/internal/taskManager/updateFootprintService.go +++ b/internal/taskManager/updateFootprintService.go @@ -73,11 +73,7 @@ func RegisterFootprintWorker() { continue } - jobMeta := &schema.JobMeta{ - BaseJob: job.BaseJob, - StartTime: job.StartTime.Unix(), - Statistics: make(map[string]schema.JobStatistics), - } + job.Statistics = make(map[string]schema.JobStatistics) for _, metric := range allMetrics { avg, min, max := 0.0, 0.0, 0.0 @@ -95,7 +91,7 @@ } // Add values rounded to 2 digits: repo.LoadStats may return unrounded - jobMeta.Statistics[metric] = schema.JobStatistics{ + job.Statistics[metric] = schema.JobStatistics{ Unit: schema.Unit{ Prefix: archive.GetMetricConfig(job.Cluster, metric).Unit.Prefix, Base: archive.GetMetricConfig(job.Cluster, metric).Unit.Base, @@ -108,7 +104,7 @@ // Build Statement per Job, Add to Pending Array stmt := sq.Update("job") - stmt, err = jobRepo.UpdateFootprint(stmt, jobMeta) + stmt, err = jobRepo.UpdateFootprint(stmt, job) if err != nil { log.Errorf("update job (dbid: %d) statement build failed at footprint step: %s",
job.ID, err.Error()) ce++ diff --git a/internal/util/fswatcher.go b/internal/util/fswatcher.go new file mode 100644 index 0000000..5d13462 --- /dev/null +++ b/internal/util/fswatcher.go @@ -0,0 +1,75 @@ +// Copyright (C) 2023 NHR@FAU, University Erlangen-Nuremberg. +// All rights reserved. +// Use of this source code is governed by a MIT-style +// license that can be found in the LICENSE file. +package util + +import ( + "sync" + + "github.com/ClusterCockpit/cc-backend/pkg/log" + "github.com/fsnotify/fsnotify" +) + +type Listener interface { + EventCallback() + EventMatch(event string) bool +} + +var ( + initOnce sync.Once + w *fsnotify.Watcher + listeners []Listener +) + +func AddListener(path string, l Listener) { + var err error + + initOnce.Do(func() { + var err error + w, err = fsnotify.NewWatcher() + if err != nil { + log.Errorf("creating a new watcher: %v", err) + } + listeners = make([]Listener, 0) + + go watchLoop(w) + }) + + listeners = append(listeners, l) + err = w.Add(path) + if err != nil { + log.Warnf("%q: %s", path, err) + } +} + +func FsWatcherShutdown() { + if w != nil { + w.Close() + } +} + +func watchLoop(w *fsnotify.Watcher) { + for { + select { + // Read from Errors. + case err, ok := <-w.Errors: + if !ok { // Channel was closed (i.e. Watcher.Close() was called). + return + } + log.Errorf("watch event loop: %s", err) + // Read from Events. + case e, ok := <-w.Events: + if !ok { // Channel was closed (i.e. Watcher.Close() was called). + return + } + + log.Infof("Event %s", e) + for _, l := range listeners { + if l.EventMatch(e.String()) { + l.EventCallback() + } + } + } + } +} diff --git a/pkg/archive/archive.go b/pkg/archive/archive.go index cd457eb..c221e91 100644 --- a/pkg/archive/archive.go +++ b/pkg/archive/archive.go @@ -23,7 +23,7 @@ type ArchiveBackend interface { Exists(job *schema.Job) bool - LoadJobMeta(job *schema.Job) (*schema.JobMeta, error) + LoadJobMeta(job *schema.Job) (*schema.Job, error) LoadJobData(job *schema.Job) (schema.JobData, error) @@ -31,9 +31,9 @@ type ArchiveBackend interface { LoadClusterCfg(name string) (*schema.Cluster, error) - StoreJobMeta(jobMeta *schema.JobMeta) error + StoreJobMeta(jobMeta *schema.Job) error - ImportJob(jobMeta *schema.JobMeta, jobData *schema.JobData) error + ImportJob(jobMeta *schema.Job, jobData *schema.JobData) error GetClusters() []string @@ -51,7 +51,7 @@ type ArchiveBackend interface { } type JobContainer struct { - Meta *schema.JobMeta + Meta *schema.Job Data *schema.JobData } @@ -162,7 +162,6 @@ func LoadScopedStatsFromArchive( metrics []string, scopes []schema.MetricScope, ) (schema.ScopedJobStats, error) { - data, err := ar.LoadJobStats(job) if err != nil { log.Errorf("Error while loading job stats from archiveBackend: %s", err.Error()) diff --git a/pkg/archive/archive_test.go b/pkg/archive/archive_test.go index ac00ea1..ba53e38 100644 --- a/pkg/archive/archive_test.go +++ b/pkg/archive/archive_test.go @@ -9,7 +9,6 @@ import ( "fmt" "path/filepath" "testing" - "time" "github.com/ClusterCockpit/cc-backend/internal/util" "github.com/ClusterCockpit/cc-backend/pkg/archive" @@ -32,12 +31,12 @@ func setup(t *testing.T) archive.ArchiveBackend { jobs[0] = &schema.Job{} jobs[0].JobID = 1403244 jobs[0].Cluster = "emmy" - jobs[0].StartTime = time.Unix(1608923076, 0) + jobs[0].StartTime = 1608923076 jobs[1] = &schema.Job{} jobs[0].JobID = 1404397 jobs[0].Cluster = "emmy" - jobs[0].StartTime = time.Unix(1609300556, 0) + jobs[0].StartTime = 1609300556 return archive.GetHandle() } diff --git
a/pkg/archive/clusterConfig.go b/pkg/archive/clusterConfig.go index d53941b..04d1349 100644 --- a/pkg/archive/clusterConfig.go +++ b/pkg/archive/clusterConfig.go @@ -69,16 +69,18 @@ func initClusterConfig() error { for _, sc := range cluster.SubClusters { newMetric := &schema.MetricConfig{ - Unit: mc.Unit, + Metric: schema.Metric{ + Name: mc.Name, + Unit: mc.Unit, + Peak: mc.Peak, + Normal: mc.Normal, + Caution: mc.Caution, + Alert: mc.Alert, + }, Energy: mc.Energy, - Name: mc.Name, Scope: mc.Scope, Aggregation: mc.Aggregation, - Peak: mc.Peak, - Caution: mc.Caution, - Alert: mc.Alert, Timestep: mc.Timestep, - Normal: mc.Normal, LowerIsBetter: mc.LowerIsBetter, } @@ -167,6 +169,45 @@ func GetSubCluster(cluster, subcluster string) (*schema.SubCluster, error) { return nil, fmt.Errorf("subcluster '%v' not found for cluster '%v', or cluster '%v' not configured", subcluster, cluster, cluster) } +func GetMetricConfigSubCluster(cluster, subcluster string) map[string]*schema.Metric { + metrics := make(map[string]*schema.Metric) + + for _, c := range Clusters { + if c.Name == cluster { + for _, m := range c.MetricConfig { + for _, s := range m.SubClusters { + if s.Name == subcluster { + metrics[m.Name] = &schema.Metric{ + Name: m.Name, + Unit: s.Unit, + Peak: s.Peak, + Normal: s.Normal, + Caution: s.Caution, + Alert: s.Alert, + } + break + } + } + + _, ok := metrics[m.Name] + if !ok { + metrics[m.Name] = &schema.Metric{ + Name: m.Name, + Unit: m.Unit, + Peak: m.Peak, + Normal: m.Normal, + Caution: m.Caution, + Alert: m.Alert, + } + } + } + break + } + } + + return metrics +} + func GetMetricConfig(cluster, metric string) *schema.MetricConfig { for _, c := range Clusters { if c.Name == cluster { @@ -182,7 +223,7 @@ func GetMetricConfig(cluster, metric string) *schema.MetricConfig { // AssignSubCluster sets the `job.subcluster` property of the job based // on its cluster and resources. 
-func AssignSubCluster(job *schema.BaseJob) error { +func AssignSubCluster(job *schema.Job) error { cluster := GetCluster(job.Cluster) if cluster == nil { return fmt.Errorf("ARCHIVE/CLUSTERCONFIG > unkown cluster: %v", job.Cluster) diff --git a/pkg/archive/fsBackend.go b/pkg/archive/fsBackend.go index 711b1f5..a90c092 100644 --- a/pkg/archive/fsBackend.go +++ b/pkg/archive/fsBackend.go @@ -53,28 +53,27 @@ func getDirectory( rootPath, job.Cluster, lvl1, lvl2, - strconv.FormatInt(job.StartTime.Unix(), 10)) + strconv.FormatInt(job.StartTime, 10)) } func getPath( job *schema.Job, rootPath string, - file string) string { - + file string, +) string { return filepath.Join( getDirectory(job, rootPath), file) } -func loadJobMeta(filename string) (*schema.JobMeta, error) { - +func loadJobMeta(filename string) (*schema.Job, error) { b, err := os.ReadFile(filename) if err != nil { log.Errorf("loadJobMeta() > open file error: %v", err) - return &schema.JobMeta{}, err + return nil, err } if config.Keys.Validate { if err := schema.Validate(schema.Meta, bytes.NewReader(b)); err != nil { - return &schema.JobMeta{}, fmt.Errorf("validate job meta: %v", err) + return nil, fmt.Errorf("validate job meta: %v", err) } } @@ -83,7 +82,6 @@ func loadJobMeta(filename string) (*schema.JobMeta, error) { func loadJobData(filename string, isCompressed bool) (schema.JobData, error) { f, err := os.Open(filename) - if err != nil { log.Errorf("fsBackend LoadJobData()- %v", err) return nil, err @@ -117,7 +115,6 @@ func loadJobData(filename string, isCompressed bool) (schema.JobData, error) { func loadJobStats(filename string, isCompressed bool) (schema.ScopedJobStats, error) { f, err := os.Open(filename) - if err != nil { log.Errorf("fsBackend LoadJobStats()- %v", err) return nil, err @@ -150,7 +147,6 @@ func loadJobStats(filename string, isCompressed bool) (schema.ScopedJobStats, er } func (fsa *FsArchive) Init(rawConfig json.RawMessage) (uint64, error) { - var config FsArchiveConfig if err := json.Unmarshal(rawConfig, &config); err != nil { log.Warnf("Init() > Unmarshal error: %#v", err) @@ -276,7 +272,6 @@ func (fsa *FsArchive) Exists(job *schema.Job) bool { } func (fsa *FsArchive) Clean(before int64, after int64) { - if after == 0 { after = math.MaxInt64 } @@ -392,7 +387,6 @@ func (fsa *FsArchive) Compress(jobs []*schema.Job) { } func (fsa *FsArchive) CompressLast(starttime int64) int64 { - filename := filepath.Join(fsa.path, "compress.txt") b, err := os.ReadFile(filename) if err != nil { @@ -435,13 +429,12 @@ func (fsa *FsArchive) LoadJobStats(job *schema.Job) (schema.ScopedJobStats, erro return loadJobStats(filename, isCompressed) } -func (fsa *FsArchive) LoadJobMeta(job *schema.Job) (*schema.JobMeta, error) { +func (fsa *FsArchive) LoadJobMeta(job *schema.Job) (*schema.Job, error) { filename := getPath(job, fsa.path, "meta.json") return loadJobMeta(filename) } func (fsa *FsArchive) LoadClusterCfg(name string) (*schema.Cluster, error) { - b, err := os.ReadFile(filepath.Join(fsa.path, name, "cluster.json")) if err != nil { log.Errorf("LoadClusterCfg() > open file error: %v", err) @@ -456,7 +449,6 @@ func (fsa *FsArchive) LoadClusterCfg(name string) (*schema.Cluster, error) { } func (fsa *FsArchive) Iter(loadMetricData bool) <-chan JobContainer { - ch := make(chan JobContainer) go func() { clustersDir, err := os.ReadDir(fsa.path) @@ -526,19 +518,13 @@ func (fsa *FsArchive) Iter(loadMetricData bool) <-chan JobContainer { return ch } -func (fsa *FsArchive) StoreJobMeta(jobMeta *schema.JobMeta) error { - - job := schema.Job{ 
@@ -526,19 +518,13 @@ func (fsa *FsArchive) Iter(loadMetricData bool) <-chan JobContainer {
     return ch
 }

-func (fsa *FsArchive) StoreJobMeta(jobMeta *schema.JobMeta) error {
-
-    job := schema.Job{
-        BaseJob:       jobMeta.BaseJob,
-        StartTime:     time.Unix(jobMeta.StartTime, 0),
-        StartTimeUnix: jobMeta.StartTime,
-    }
-    f, err := os.Create(getPath(&job, fsa.path, "meta.json"))
+func (fsa *FsArchive) StoreJobMeta(job *schema.Job) error {
+    f, err := os.Create(getPath(job, fsa.path, "meta.json"))
     if err != nil {
         log.Error("Error while creating filepath for meta.json")
         return err
     }
-    if err := EncodeJobMeta(f, jobMeta); err != nil {
+    if err := EncodeJobMeta(f, job); err != nil {
         log.Error("Error while encoding job metadata to meta.json file")
         return err
     }
@@ -555,15 +541,10 @@ func (fsa *FsArchive) GetClusters() []string {
 }

 func (fsa *FsArchive) ImportJob(
-    jobMeta *schema.JobMeta,
-    jobData *schema.JobData) error {
-
-    job := schema.Job{
-        BaseJob:       jobMeta.BaseJob,
-        StartTime:     time.Unix(jobMeta.StartTime, 0),
-        StartTimeUnix: jobMeta.StartTime,
-    }
-    dir := getPath(&job, fsa.path, "")
+    jobMeta *schema.Job,
+    jobData *schema.JobData,
+) error {
+    dir := getPath(jobMeta, fsa.path, "")
     if err := os.MkdirAll(dir, 0777); err != nil {
         log.Error("Error while creating job archive path")
         return err
@@ -583,28 +564,6 @@ func (fsa *FsArchive) ImportJob(
     return err
 }

-    // var isCompressed bool = true
-    // // TODO Use shortJob Config for check
-    // if jobMeta.Duration < 300 {
-    //     isCompressed = false
-    //     f, err = os.Create(path.Join(dir, "data.json"))
-    // } else {
-    //     f, err = os.Create(path.Join(dir, "data.json.gz"))
-    // }
-    // if err != nil {
-    //     return err
-    // }
-    //
-    // if isCompressed {
-    //     if err := EncodeJobData(gzip.NewWriter(f), jobData); err != nil {
-    //         return err
-    //     }
-    // } else {
-    //     if err := EncodeJobData(f, jobData); err != nil {
-    //         return err
-    //     }
-    // }
-
     f, err = os.Create(path.Join(dir, "data.json"))
     if err != nil {
         log.Error("Error while creating filepath for data.json")

diff --git a/pkg/archive/fsBackend_test.go b/pkg/archive/fsBackend_test.go
index 9db68ed..ddb430a 100644
--- a/pkg/archive/fsBackend_test.go
+++ b/pkg/archive/fsBackend_test.go
@@ -9,7 +9,6 @@ import (
     "fmt"
     "path/filepath"
     "testing"
-    "time"

     "github.com/ClusterCockpit/cc-backend/internal/util"
     "github.com/ClusterCockpit/cc-backend/pkg/schema"
@@ -86,8 +85,11 @@ func TestLoadJobMeta(t *testing.T) {
         t.Fatal(err)
     }

-    jobIn := schema.Job{BaseJob: schema.JobDefaults}
-    jobIn.StartTime = time.Unix(1608923076, 0)
+    jobIn := schema.Job{
+        Exclusive:        1,
+        MonitoringStatus: schema.MonitoringStatusRunningOrArchiving,
+    }
+    jobIn.StartTime = 1608923076
     jobIn.JobID = 1403244
     jobIn.Cluster = "emmy"

@@ -114,8 +116,11 @@ func TestLoadJobData(t *testing.T) {
         t.Fatal(err)
     }

-    jobIn := schema.Job{BaseJob: schema.JobDefaults}
-    jobIn.StartTime = time.Unix(1608923076, 0)
+    jobIn := schema.Job{
+        Exclusive:        1,
+        MonitoringStatus: schema.MonitoringStatusRunningOrArchiving,
+    }
+    jobIn.StartTime = 1608923076
     jobIn.JobID = 1403244
     jobIn.Cluster = "emmy"

@@ -142,8 +147,11 @@ func BenchmarkLoadJobData(b *testing.B) {
     var fsa FsArchive
     fsa.Init(json.RawMessage(archiveCfg))

-    jobIn := schema.Job{BaseJob: schema.JobDefaults}
-    jobIn.StartTime = time.Unix(1608923076, 0)
+    jobIn := schema.Job{
+        Exclusive:        1,
+        MonitoringStatus: schema.MonitoringStatusRunningOrArchiving,
+    }
+    jobIn.StartTime = 1608923076
     jobIn.JobID = 1403244
     jobIn.Cluster = "emmy"
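With JobMeta folded into schema.Job, StoreJobMeta and loadJobMeta reduce to a JSON round-trip of a single struct; the tests above now build that struct literal directly instead of going through BaseJob/JobDefaults. A trimmed-down sketch of the round-trip (the three-field Job here is illustrative, not the real schema.Job):

```go
package main

import (
	"bytes"
	"encoding/json"
	"fmt"
)

// Job is cut down to three fields for illustration only.
type Job struct {
	JobID     int64  `json:"jobId"`
	Cluster   string `json:"cluster"`
	StartTime int64  `json:"startTime"`
}

func main() {
	in := Job{JobID: 1403244, Cluster: "emmy", StartTime: 1608923076}

	// Encode, as an EncodeJobMeta-style writer would.
	var buf bytes.Buffer
	if err := json.NewEncoder(&buf).Encode(&in); err != nil {
		panic(err)
	}

	// Decode, as a loadJobMeta/DecodeJobMeta-style reader would.
	var out Job
	if err := json.NewDecoder(&buf).Decode(&out); err != nil {
		panic(err)
	}
	fmt.Printf("%+v\n", out) // {JobID:1403244 Cluster:emmy StartTime:1608923076}
}
```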
@@ -165,8 +173,11 @@ func BenchmarkLoadJobDataCompressed(b *testing.B) {
     var fsa FsArchive
     fsa.Init(json.RawMessage(archiveCfg))

-    jobIn := schema.Job{BaseJob: schema.JobDefaults}
-    jobIn.StartTime = time.Unix(1608923076, 0)
+    jobIn := schema.Job{
+        Exclusive:        1,
+        MonitoringStatus: schema.MonitoringStatusRunningOrArchiving,
+    }
+    jobIn.StartTime = 1608923076
     jobIn.JobID = 1403244
     jobIn.Cluster = "emmy"

diff --git a/pkg/archive/json.go b/pkg/archive/json.go
index 5201b74..d3639f5 100644
--- a/pkg/archive/json.go
+++ b/pkg/archive/json.go
@@ -69,8 +69,8 @@ func DecodeJobStats(r io.Reader, k string) (schema.ScopedJobStats, error) {
     return nil, err
 }

-func DecodeJobMeta(r io.Reader) (*schema.JobMeta, error) {
-    var d schema.JobMeta
+func DecodeJobMeta(r io.Reader) (*schema.Job, error) {
+    var d schema.Job
     if err := json.NewDecoder(r).Decode(&d); err != nil {
         log.Warn("Error while decoding raw job meta json")
         return &d, err
@@ -103,7 +103,7 @@ func EncodeJobData(w io.Writer, d *schema.JobData) error {
     return nil
 }

-func EncodeJobMeta(w io.Writer, d *schema.JobMeta) error {
+func EncodeJobMeta(w io.Writer, d *schema.Job) error {
     // Sanitize parameters
     if err := json.NewEncoder(w).Encode(d); err != nil {
         log.Warn("Error while encoding new job meta json")

diff --git a/pkg/archive/nodelist.go b/pkg/archive/nodelist.go
index 7700185..26a15d2 100644
--- a/pkg/archive/nodelist.go
+++ b/pkg/archive/nodelist.go
@@ -61,7 +61,7 @@ func (nl *NodeList) PrintList() []string {
 }

 func (nl *NodeList) NodeCount() int {
-    var out int = 0
+    out := 0
     for _, term := range *nl {
         if len(term) == 1 { // If only String-Part in Term: Single Node Name -> add one
             out += 1
@@ -160,7 +160,7 @@ func (nle NLExprIntRange) limits() []map[string]int {
     m["start"] = int(nle.start)
     m["end"] = int(nle.end)
     m["digits"] = int(nle.digits)
-    if nle.zeroPadded == true {
+    if nle.zeroPadded {
         m["zeroPadded"] = 1
     } else {
         m["zeroPadded"] = 0
@@ -183,14 +183,15 @@ func ParseNodeList(raw string) (NodeList, error) {
     rawterms := []string{}
     prevterm := 0
     for i := 0; i < len(raw); i++ {
-        if raw[i] == '[' {
+        switch raw[i] {
+        case '[':
             for i < len(raw) && raw[i] != ']' {
                 i++
             }
             if i == len(raw) {
                 return nil, fmt.Errorf("ARCHIVE/NODELIST > unclosed '['")
             }
-        } else if raw[i] == ',' {
+        case ',':
             rawterms = append(rawterms, raw[prevterm:i])
             prevterm = i + 1
         }

diff --git a/pkg/schema/cluster.go b/pkg/schema/cluster.go
index 322f308..1b9f2cc 100644
--- a/pkg/schema/cluster.go
+++ b/pkg/schema/cluster.go
@@ -45,31 +45,31 @@ type SubCluster struct {
     ThreadsPerCore int `json:"threadsPerCore"`
 }

+type Metric struct {
+    Name    string  `json:"name"`
+    Unit    Unit    `json:"unit"`
+    Peak    float64 `json:"peak"`
+    Normal  float64 `json:"normal"`
+    Caution float64 `json:"caution"`
+    Alert   float64 `json:"alert"`
+}
+
 type SubClusterConfig struct {
-    Name          string  `json:"name"`
-    Footprint     string  `json:"footprint,omitempty"`
-    Energy        string  `json:"energy"`
-    Peak          float64 `json:"peak"`
-    Normal        float64 `json:"normal"`
-    Caution       float64 `json:"caution"`
-    Alert         float64 `json:"alert"`
-    Remove        bool    `json:"remove"`
-    LowerIsBetter bool    `json:"lowerIsBetter"`
+    Metric
+    Footprint     string `json:"footprint,omitempty"`
+    Energy        string `json:"energy"`
+    Remove        bool   `json:"remove"`
+    LowerIsBetter bool   `json:"lowerIsBetter"`
 }

 type MetricConfig struct {
-    Unit          Unit        `json:"unit"`
+    Metric
     Energy        string      `json:"energy"`
-    Name          string      `json:"name"`
     Scope         MetricScope `json:"scope"`
     Aggregation   string      `json:"aggregation"`
     Footprint     string      `json:"footprint,omitempty"`
     SubClusters   []*SubClusterConfig `json:"subClusters,omitempty"`
-    Peak          float64     `json:"peak"`
-    Caution       float64     `json:"caution"`
-    Alert         float64     `json:"alert"`
     Timestep      int         `json:"timestep"`
-    Normal        float64     `json:"normal"`
     LowerIsBetter bool        `json:"lowerIsBetter"`
 }
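The SubClusterConfig/MetricConfig refactor above relies on Go struct embedding: fields of the embedded Metric are promoted onto the outer type, and since the embedded struct carries no json tag, encoding/json keeps its fields flat, so existing cluster.json documents unmarshal unchanged. A minimal demonstration with types trimmed to two fields (not the real schema types):

```go
package main

import (
	"encoding/json"
	"fmt"
)

type Metric struct {
	Name string  `json:"name"`
	Peak float64 `json:"peak"`
}

type MetricConfig struct {
	Metric       // embedded: Name/Peak are promoted and stay flat in JSON
	Timestep int `json:"timestep"`
}

func main() {
	raw := []byte(`{"name":"flops_any","peak":624.0,"timestep":60}`)
	var mc MetricConfig
	if err := json.Unmarshal(raw, &mc); err != nil {
		panic(err)
	}
	// Promoted fields are addressable directly on the outer struct.
	fmt.Println(mc.Name, mc.Peak, mc.Timestep) // flops_any 624 60
}
```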
@@ -127,7 +127,7 @@ func (topo *Topology) GetSocketsFromHWThreads(
 // those in the argument list are assigned to one of the sockets in the first
 // return value, return true as the second value. TODO: Optimize this, there
 // must be a more efficient way/algorithm.
-func (topo *Topology) GetSocketsFromCores (
+func (topo *Topology) GetSocketsFromCores(
     cores []int,
 ) (sockets []int, exclusive bool) {
     socketsMap := map[int]int{}

diff --git a/pkg/schema/config.go b/pkg/schema/config.go
index 27d11be..eda3d91 100644
--- a/pkg/schema/config.go
+++ b/pkg/schema/config.go
@@ -89,6 +89,8 @@ type ResampleConfig struct {
 }

 type CronFrequency struct {
+    // Commit Job Worker [Defaults to '2m']
+    CommitJobWorker string `json:"commit-job-worker"`
     // Duration Update Worker [Defaults to '5m']
     DurationWorker string `json:"duration-worker"`
     // Metric-Footprint Update Worker [Defaults to '10m']
@@ -129,6 +131,8 @@ type ProgramConfig struct {
     // do not write to the job-archive.
     DisableArchive bool `json:"disable-archive"`

+    EnableJobTaggers bool `json:"enable-job-taggers"`
+
     // Validate json input against schema
     Validate bool `json:"validate"`

@@ -150,7 +154,7 @@ type ProgramConfig struct {
     // If overwritten, at least all the options in the defaults below must
     // be provided! Most options here can be overwritten by the user.
-    UiDefaults map[string]interface{} `json:"ui-defaults"`
+    UiDefaults map[string]any `json:"ui-defaults"`

     // If exists, will enable dynamic zoom in frontend metric plots using the configured values
     EnableResampling *ResampleConfig `json:"enable-resampling"`
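How the new commit-job-worker frequency might be consumed is sketched below. The fallback-to-default handling is an assumption made for illustration (the diff only documents the defaults in comments, it does not show the code path that applies them); field names follow the CronFrequency struct above.

```go
package main

import (
	"encoding/json"
	"fmt"
	"time"
)

// CronFrequency mirrors the struct from the diff, reduced to three workers.
type CronFrequency struct {
	CommitJobWorker string `json:"commit-job-worker"` // documented default: '2m'
	DurationWorker  string `json:"duration-worker"`   // documented default: '5m'
	FootprintWorker string `json:"footprint-worker"`  // documented default: '10m'
}

// freq is a hypothetical helper: parse the configured value, or fall back.
func freq(configured, def string) time.Duration {
	if configured == "" {
		configured = def
	}
	d, err := time.ParseDuration(configured)
	if err != nil {
		panic(err)
	}
	return d
}

func main() {
	var cf CronFrequency
	_ = json.Unmarshal([]byte(`{"commit-job-worker":"90s"}`), &cf)
	fmt.Println(freq(cf.CommitJobWorker, "2m")) // 1m30s (configured)
	fmt.Println(freq(cf.DurationWorker, "5m"))  // 5m0s  (default)
}
```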
diff --git a/pkg/schema/job.go b/pkg/schema/job.go
index 5e3110b..00051f4 100644
--- a/pkg/schema/job.go
+++ b/pkg/schema/job.go
@@ -8,43 +8,8 @@ import (
     "errors"
     "fmt"
     "io"
-    "time"
 )

-// BaseJob is the common part of the job metadata structs
-//
-// Common subset of Job and JobMeta. Use one of those, not this type directly.
-
-type BaseJob struct {
-    Cluster            string             `json:"cluster" db:"cluster" example:"fritz"`
-    SubCluster         string             `json:"subCluster" db:"subcluster" example:"main"`
-    Partition          string             `json:"partition,omitempty" db:"cluster_partition" example:"main"`
-    Project            string             `json:"project" db:"project" example:"abcd200"`
-    User               string             `json:"user" db:"hpc_user" example:"abcd100h"`
-    State              JobState           `json:"jobState" db:"job_state" example:"completed" enums:"completed,failed,cancelled,stopped,timeout,out_of_memory"`
-    Tags               []*Tag             `json:"tags,omitempty"`
-    RawEnergyFootprint []byte             `json:"-" db:"energy_footprint"`
-    RawFootprint       []byte             `json:"-" db:"footprint"`
-    RawMetaData        []byte             `json:"-" db:"meta_data"`
-    RawResources       []byte             `json:"-" db:"resources"`
-    Resources          []*Resource        `json:"resources"`
-    EnergyFootprint    map[string]float64 `json:"energyFootprint"`
-    Footprint          map[string]float64 `json:"footprint"`
-    MetaData           map[string]string  `json:"metaData"`
-    ConcurrentJobs     JobLinkResultList  `json:"concurrentJobs"`
-    Energy             float64            `json:"energy" db:"energy"`
-    ArrayJobId         int64              `json:"arrayJobId,omitempty" db:"array_job_id" example:"123000"`
-    Walltime           int64              `json:"walltime,omitempty" db:"walltime" example:"86400" minimum:"1"`
-    JobID              int64              `json:"jobId" db:"job_id" example:"123000"`
-    Duration           int32              `json:"duration" db:"duration" example:"43200" minimum:"1"`
-    SMT                int32              `json:"smt,omitempty" db:"smt" example:"4"`
-    MonitoringStatus   int32              `json:"monitoringStatus,omitempty" db:"monitoring_status" example:"1" minimum:"0" maximum:"3"`
-    Exclusive          int32              `json:"exclusive" db:"exclusive" example:"1" minimum:"0" maximum:"2"`
-    NumAcc             int32              `json:"numAcc,omitempty" db:"num_acc" example:"2" minimum:"1"`
-    NumHWThreads       int32              `json:"numHwthreads,omitempty" db:"num_hwthreads" example:"20" minimum:"1"`
-    NumNodes           int32              `json:"numNodes" db:"num_nodes" example:"2" minimum:"1"`
-}
-
 // Job struct type
 //
 // This type is used as the GraphQL interface and using sqlx as a table row.
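Why dropping the BaseJob embedding is transparent to sqlx-style column mapping: the db tags sit on the fields themselves, and embedded-struct fields were already promoted, so the column set of the flattened Job is unchanged. A reflection-only sketch over a cut-down struct (not the real schema.Job):

```go
package main

import (
	"fmt"
	"reflect"
)

// Job is reduced to three fields for illustration.
type Job struct {
	Cluster   string `json:"cluster" db:"cluster"`
	JobID     int64  `json:"jobId" db:"job_id"`
	StartTime int64  `json:"startTime" db:"start_time"`
}

func main() {
	// Enumerate the db tags the way a sqlx-like mapper would.
	t := reflect.TypeOf(Job{})
	for i := 0; i < t.NumField(); i++ {
		fmt.Println(t.Field(i).Tag.Get("db"))
	}
	// cluster
	// job_id
	// start_time
}
```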
@@ -52,10 +17,37 @@
 // Job model
 // @Description Information of a HPC job.
 type Job struct {
-    StartTime time.Time `json:"startTime"`
-    BaseJob
-    ID            int64 `json:"id" db:"id"`
-    StartTimeUnix int64 `json:"-" db:"start_time" example:"1649723812"`
+    Cluster            string             `json:"cluster" db:"cluster" example:"fritz"`
+    SubCluster         string             `json:"subCluster" db:"subcluster" example:"main"`
+    Partition          string             `json:"partition,omitempty" db:"cluster_partition" example:"main"`
+    Project            string             `json:"project" db:"project" example:"abcd200"`
+    User               string             `json:"user" db:"hpc_user" example:"abcd100h"`
+    State              JobState           `json:"jobState" db:"job_state" example:"completed" enums:"completed,failed,cancelled,stopped,timeout,out_of_memory"`
+    Tags               []*Tag             `json:"tags,omitempty"`
+    RawEnergyFootprint []byte             `json:"-" db:"energy_footprint"`
+    RawFootprint       []byte             `json:"-" db:"footprint"`
+    RawMetaData        []byte             `json:"-" db:"meta_data"`
+    RawResources       []byte             `json:"-" db:"resources"`
+    Resources          []*Resource        `json:"resources"`
+    EnergyFootprint    map[string]float64 `json:"energyFootprint"`
+    Footprint          map[string]float64 `json:"footprint"`
+    MetaData           map[string]string  `json:"metaData"`
+    ConcurrentJobs     JobLinkResultList  `json:"concurrentJobs"`
+    Energy             float64            `json:"energy" db:"energy"`
+    ArrayJobId         int64              `json:"arrayJobId,omitempty" db:"array_job_id" example:"123000"`
+    Walltime           int64              `json:"walltime,omitempty" db:"walltime" example:"86400" minimum:"1"`
+    RequestedMemory    int64              `json:"requestedMemory,omitempty" db:"requested_memory" example:"128000" minimum:"1"` // in MB
+    JobID              int64              `json:"jobId" db:"job_id" example:"123000"`
+    Duration           int32              `json:"duration" db:"duration" example:"43200" minimum:"1"`
+    SMT                int32              `json:"smt,omitempty" db:"smt" example:"4"`
+    MonitoringStatus   int32              `json:"monitoringStatus,omitempty" db:"monitoring_status" example:"1" minimum:"0" maximum:"3"`
+    Exclusive          int32              `json:"exclusive" db:"exclusive" example:"1" minimum:"0" maximum:"2"`
+    NumAcc             int32              `json:"numAcc,omitempty" db:"num_acc" example:"2" minimum:"1"`
+    NumHWThreads       int32              `json:"numHwthreads,omitempty" db:"num_hwthreads" example:"20" minimum:"1"`
+    NumNodes           int32              `json:"numNodes" db:"num_nodes" example:"2" minimum:"1"`
+    Statistics         map[string]JobStatistics `json:"statistics"`
+    ID                 *int64             `json:"id,omitempty" db:"id"`
+    StartTime          int64              `json:"startTime" db:"start_time" example:"1649723812"`
 }

 // JobMeta struct type
@@ -68,6 +60,14 @@ type Job struct {
 // *int64 `json:"id,omitempty"` >> never used in the job-archive, only
 // available via REST-API
 //
+// JobMeta model
+// @Description Meta data information of a HPC job.
+// type JobMeta struct {
+//     ID *int64 `json:"id,omitempty"`
+//     BaseJob
+//     Statistics map[string]JobStatistics `json:"statistics"`
+//     StartTime int64 `json:"startTime" db:"start_time" example:"1649723812" minimum:"1"`
+// }

 type JobLink struct {
     ID int64 `json:"id"`
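One consequence of the flattened Job above: StartTime is now a raw Unix timestamp (int64) instead of a time.Time, so conversion happens only at call sites that actually need wall-clock values. A minimal stdlib sketch, assuming UTC display:

```go
package main

import (
	"fmt"
	"time"
)

func main() {
	var startTime int64 = 1649723812 // as stored in Job.StartTime

	// Convert only where a time.Time is actually needed, e.g. for display.
	t := time.Unix(startTime, 0).UTC()
	fmt.Println(t.Format(time.RFC3339)) // 2022-04-12T00:36:52Z
}
```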
@@ -79,15 +79,6 @@ type JobLinkResultList struct {
     Count int `json:"count"`
 }

-// JobMeta model
-// @Description Meta data information of a HPC job.
-type JobMeta struct {
-    ID         *int64                   `json:"id,omitempty"`
-    Statistics map[string]JobStatistics `json:"statistics"`
-    BaseJob
-    StartTime int64 `json:"startTime" db:"start_time" example:"1649723812" minimum:"1"`
-}
-
 const (
     MonitoringStatusDisabled            int32 = 0
     MonitoringStatusRunningOrArchiving  int32 = 1
@@ -95,10 +86,10 @@ const (
     MonitoringStatusArchivingSuccessful int32 = 3
 )

-var JobDefaults BaseJob = BaseJob{
-    Exclusive:        1,
-    MonitoringStatus: MonitoringStatusRunningOrArchiving,
-}
+// var JobDefaults Job = Job{
+//     Exclusive:        1,
+//     MonitoringStatus: MonitoringStatusRunningOrArchiving,
+// }

 type Unit struct {
     Base string `json:"base"`
@@ -145,7 +136,12 @@ const (
     JobStateOutOfMemory JobState = "out_of_memory"
 )

-func (e *JobState) UnmarshalGQL(v interface{}) error {
+func (j Job) GoString() string {
+    return fmt.Sprintf("Job{ID:%v, StartTime:%d, JobID:%d, Cluster:%q}",
+        j.ID, j.StartTime, j.JobID, j.Cluster)
+}
+
+func (e *JobState) UnmarshalGQL(v any) error {
     str, ok := v.(string)
     if !ok {
         return fmt.Errorf("SCHEMA/JOB > enums must be strings")

diff --git a/pkg/schema/node.go b/pkg/schema/node.go
new file mode 100644
index 0000000..3e2bbfb
--- /dev/null
+++ b/pkg/schema/node.go
@@ -0,0 +1,35 @@
+// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
+// All rights reserved.
+// Use of this source code is governed by a MIT-style
+// license that can be found in the LICENSE file.
+package schema
+
+type NodeState string
+
+const (
+    NodeStateAllocated NodeState = "allocated"
+    NodeStateReserved  NodeState = "reserved"
+    NodeStateIdle      NodeState = "idle"
+    NodeStateMixed     NodeState = "mixed"
+    NodeStateDown      NodeState = "down"
+    NodeStateUnknown   NodeState = "unknown"
+)
+
+type MonitoringState string
+
+const (
+    MonitoringStateFull    MonitoringState = "full"
+    MonitoringStatePartial MonitoringState = "partial"
+    MonitoringStateFailed  MonitoringState = "failed"
+)
+
+type Node struct {
+    ID          int64             `json:"id" db:"id"`
+    Hostname    string            `json:"hostname" db:"hostname" example:"fritz"`
+    Cluster     string            `json:"cluster" db:"cluster" example:"fritz"`
+    SubCluster  string            `json:"subCluster" db:"subcluster" example:"main"`
+    NodeState   NodeState         `json:"nodeState" db:"node_state" example:"idle" enums:"allocated,reserved,idle,mixed,down,unknown"`
+    HealthState MonitoringState   `json:"healthState" db:"health_state" example:"full" enums:"full,partial,failed"`
+    RawMetaData []byte            `json:"-" db:"meta_data"`
+    MetaData    map[string]string `json:"metaData"`
+}

diff --git a/pkg/schema/user.go b/pkg/schema/user.go
index 9b62cfa..2fff453 100644
--- a/pkg/schema/user.go
+++ b/pkg/schema/user.go
@@ -6,6 +6,7 @@ package schema

 import (
     "fmt"
+    "slices"
     "strings"
 )

@@ -50,12 +51,7 @@ type User struct {
 }

 func (u *User) HasProject(project string) bool {
-    for _, p := range u.Projects {
-        if p == project {
-            return true
-        }
-    }
-    return false
+    return slices.Contains(u.Projects, project)
 }

 func GetRoleString(roleInt Role) string {
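Returning to the new pkg/schema/node.go above: the NodeState values mirror typical scheduler node states, with "unknown" as the catch-all. A hypothetical normalizer (the parse helper below is illustrative, not part of this PR) shows how a raw scheduler-reported string could be mapped onto the type:

```go
package main

import (
	"fmt"
	"strings"
)

type NodeState string

const (
	NodeStateAllocated NodeState = "allocated"
	NodeStateReserved  NodeState = "reserved"
	NodeStateIdle      NodeState = "idle"
	NodeStateMixed     NodeState = "mixed"
	NodeStateDown      NodeState = "down"
	NodeStateUnknown   NodeState = "unknown"
)

// parseNodeState is a hypothetical helper, not part of pkg/schema:
// normalize case, accept known states, fall back to "unknown".
func parseNodeState(raw string) NodeState {
	s := NodeState(strings.ToLower(raw))
	switch s {
	case NodeStateAllocated, NodeStateReserved, NodeStateIdle,
		NodeStateMixed, NodeStateDown:
		return s
	default:
		return NodeStateUnknown
	}
}

func main() {
	fmt.Println(parseNodeState("IDLE"))     // idle
	fmt.Println(parseNodeState("draining")) // unknown
}
```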
diff --git a/pkg/schema/validate.go b/pkg/schema/validate.go
index 3511936..d14adf5 100644
--- a/pkg/schema/validate.go
+++ b/pkg/schema/validate.go
@@ -28,12 +28,13 @@ const (
 //go:embed schemas/*
 var schemaFiles embed.FS

-func Validate(k Kind, r io.Reader) (err error) {
+func Validate(k Kind, r io.Reader) error {
     jsonschema.Loaders["embedfs"] = func(s string) (io.ReadCloser, error) {
         f := filepath.Join("schemas", strings.Split(s, "//")[1])
         return schemaFiles.Open(f)
     }
     var s *jsonschema.Schema
+    var err error

     switch k {
     case Meta:
@@ -54,7 +55,7 @@ func Validate(k Kind, r io.Reader) error {
     }

     var v interface{}
-    if err := json.NewDecoder(r).Decode(&v); err != nil {
+    if err = json.NewDecoder(r).Decode(&v); err != nil {
         log.Warnf("Error while decoding raw json schema: %#v", err)
         return err
     }
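The Validate change above swaps the named return plus `:=` for an explicit `var err error` plus `=`. Behavior is unchanged here, since the old code returned inside the if, but the new style rules out the classic shadowing pitfall sketched below (both functions are illustrative, not project code):

```go
package main

import (
	"encoding/json"
	"fmt"
	"strings"
)

// buggy shows the pitfall: the := inside the if declares a new err that
// shadows the outer one, so the decode failure is silently dropped.
func buggy(in string) error {
	var err error
	var v interface{}
	if err := json.NewDecoder(strings.NewReader(in)).Decode(&v); err != nil {
		_ = err // "handled" locally; the outer err is never set
	}
	return err // always nil
}

// fixed mirrors the new Validate: assign to the already-declared err.
func fixed(in string) error {
	var err error
	var v interface{}
	if err = json.NewDecoder(strings.NewReader(in)).Decode(&v); err != nil {
		return err
	}
	return nil
}

func main() {
	fmt.Println(buggy("not json")) // <nil>
	fmt.Println(fixed("not json")) // a JSON syntax error
}
```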