Mirror of https://github.com/ClusterCockpit/cc-backend, synced 2024-11-13 02:17:25 +01:00

Commit 89333666b3 (parent 7fcc39a144): BC: new schemas for basically everything
@@ -33,6 +33,7 @@ func (api *RestApi) MountRoutes(r *mux.Router) {
     r.HandleFunc("/api/jobs/tag_job/{id}", api.tagJob).Methods(http.MethodPost, http.MethodPatch)
 }
 
+// TODO/FIXME: UPDATE API!
 type StartJobApiRequest struct {
     JobId  int64  `json:"jobId"`
     UserId string `json:"userId"`
@@ -255,7 +256,7 @@ func (api *RestApi) stopJob(rw http.ResponseWriter, r *http.Request) {
         return nil
     }
 
-    log.Printf("archiving job... (id: %s): clusterId=%s, jobId=%s, userId=%s, startTime=%s, nodes=%v\n", job.ID, job.ClusterID, job.JobID, job.UserID, job.StartTime, job.Nodes)
+    log.Printf("archiving job... (id: %s): clusterId=%s, jobId=%d, userId=%s, startTime=%s\n", job.ID, job.Cluster, job.JobID, job.User, job.StartTime)
     if api.AsyncArchiving {
         rw.Header().Add("Content-Type", "application/json")
         rw.WriteHeader(http.StatusOK)
@@ -56,7 +56,7 @@ models:
       - github.com/99designs/gqlgen/graphql.Int32
   Job:
     fields:
-      tags:
+      Tags:
         resolver: true
   JobMetric:
     model: "github.com/ClusterCockpit/cc-jobarchive/schema.JobMetric"
@@ -68,4 +68,8 @@ models:
     model: "github.com/ClusterCockpit/cc-jobarchive/schema.Float"
   JobMetricScope:
     model: "github.com/ClusterCockpit/cc-jobarchive/schema.MetricScope"
+  JobResource:
+    model: "github.com/ClusterCockpit/cc-jobarchive/schema.JobResource"
+  Accelerator:
+    model: "github.com/ClusterCockpit/cc-jobarchive/schema.Accelerator"
 
(File diff suppressed because it is too large.)
@@ -38,36 +38,42 @@ type IntRangeOutput struct {
 }
 
 type Job struct {
-    ID          string    `json:"id"`
-    JobID       string    `json:"jobId"`
-    UserID      string    `json:"userId"`
-    ProjectID   string    `json:"projectId"`
-    ClusterID   string    `json:"clusterId"`
-    StartTime   time.Time `json:"startTime"`
-    Duration    int       `json:"duration"`
-    NumNodes    int       `json:"numNodes"`
-    Nodes       []string  `json:"nodes"`
-    HasProfile  bool      `json:"hasProfile"`
-    State       JobState  `json:"state"`
-    Tags        []*JobTag `json:"tags"`
-    LoadAvg     *float64  `json:"loadAvg"`
-    MemUsedMax  *float64  `json:"memUsedMax"`
-    FlopsAnyAvg *float64  `json:"flopsAnyAvg"`
-    MemBwAvg    *float64  `json:"memBwAvg"`
-    NetBwAvg    *float64  `json:"netBwAvg"`
-    FileBwAvg   *float64  `json:"fileBwAvg"`
+    ID               string                `json:"Id"`
+    JobID            int                   `json:"JobId"`
+    User             string                `json:"User"`
+    Project          string                `json:"Project"`
+    Cluster          string                `json:"Cluster"`
+    StartTime        time.Time             `json:"StartTime"`
+    Duration         int                   `json:"Duration"`
+    NumNodes         int                   `json:"NumNodes"`
+    NumHWThreads     int                   `json:"NumHWThreads"`
+    NumAcc           int                   `json:"NumAcc"`
+    Smt              int                   `json:"SMT"`
+    Exclusive        int                   `json:"Exclusive"`
+    Partition        string                `json:"Partition"`
+    ArrayJobID       int                   `json:"ArrayJobId"`
+    MonitoringStatus int                   `json:"MonitoringStatus"`
+    State            JobState              `json:"State"`
+    Tags             []*JobTag             `json:"Tags"`
+    Resources        []*schema.JobResource `json:"Resources"`
+    LoadAvg          *float64              `json:"LoadAvg"`
+    MemUsedMax       *float64              `json:"MemUsedMax"`
+    FlopsAnyAvg      *float64              `json:"FlopsAnyAvg"`
+    MemBwAvg         *float64              `json:"MemBwAvg"`
+    NetBwAvg         *float64              `json:"NetBwAvg"`
+    FileBwAvg        *float64              `json:"FileBwAvg"`
 }
 
 type JobFilter struct {
     Tags        []string     `json:"tags"`
     JobID       *StringInput `json:"jobId"`
-    UserID      *StringInput `json:"userId"`
-    ProjectID   *StringInput `json:"projectId"`
-    ClusterID   *StringInput `json:"clusterId"`
+    User        *StringInput `json:"user"`
+    Project     *StringInput `json:"project"`
+    Cluster     *StringInput `json:"cluster"`
     Duration    *IntRange    `json:"duration"`
     NumNodes    *IntRange    `json:"numNodes"`
     StartTime   *TimeRange   `json:"startTime"`
-    IsRunning   *bool        `json:"isRunning"`
+    JobState    []JobState   `json:"jobState"`
    FlopsAnyAvg *FloatRange  `json:"flopsAnyAvg"`
     MemBwAvg    *FloatRange  `json:"memBwAvg"`
     LoadAvg     *FloatRange  `json:"loadAvg"`
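
Annotation (not part of this commit): the flat `Nodes []string` field is gone; hostnames now live on the new `Resources` list. A minimal sketch of how calling code can recover the old hostname slice, using stand-in types for the generated `model.Job` and `schema.JobResource`:

    package main

    import "fmt"

    // Minimal stand-ins for model.Job / schema.JobResource as defined in this diff.
    type JobResource struct{ Hostname string }
    type Job struct{ Resources []*JobResource }

    // hostnames recovers the flat hostname list that the removed Nodes field used to hold.
    func hostnames(job *Job) []string {
        hosts := make([]string, 0, len(job.Resources))
        for _, r := range job.Resources {
            hosts = append(hosts, r.Hostname)
        }
        return hosts
    }

    func main() {
        job := &Job{Resources: []*JobResource{{Hostname: "node001"}, {Hostname: "node002"}}}
        fmt.Println(hostnames(job)) // [node001 node002]
    }
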
@@ -97,13 +103,14 @@ type JobsStatistics struct {
 }
 
 type MetricConfig struct {
-    Name       string `json:"name"`
-    Unit       string `json:"unit"`
-    Sampletime int    `json:"sampletime"`
-    Peak       int    `json:"peak"`
-    Normal     int    `json:"normal"`
-    Caution    int    `json:"caution"`
-    Alert      int    `json:"alert"`
+    Name     string `json:"Name"`
+    Unit     string `json:"Unit"`
+    Timestep int    `json:"Timestep"`
+    Peak     int    `json:"Peak"`
+    Normal   int    `json:"Normal"`
+    Caution  int    `json:"Caution"`
+    Alert    int    `json:"Alert"`
+    Scope    string `json:"Scope"`
 }
 
 type MetricFootprints struct {
@@ -196,16 +203,24 @@ type JobState string
 const (
     JobStateRunning   JobState = "running"
     JobStateCompleted JobState = "completed"
+    JobStateFailed    JobState = "failed"
+    JobStateCanceled  JobState = "canceled"
+    JobStateStopped   JobState = "stopped"
+    JobStateTimeout   JobState = "timeout"
 )
 
 var AllJobState = []JobState{
     JobStateRunning,
     JobStateCompleted,
+    JobStateFailed,
+    JobStateCanceled,
+    JobStateStopped,
+    JobStateTimeout,
 }
 
 func (e JobState) IsValid() bool {
     switch e {
-    case JobStateRunning, JobStateCompleted:
+    case JobStateRunning, JobStateCompleted, JobStateFailed, JobStateCanceled, JobStateStopped, JobStateTimeout:
         return true
     }
     return false
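
Annotation (not part of this commit): the enum now matches the `job_state` CHECK constraint introduced in init-db.go further down. A small self-contained sketch of validating raw state strings against it, with the enum mirrored locally instead of importing the generated `model` package:

    package main

    import "fmt"

    // Mirror of the generated model.JobState enum after this commit.
    type JobState string

    const (
        JobStateRunning   JobState = "running"
        JobStateCompleted JobState = "completed"
        JobStateFailed    JobState = "failed"
        JobStateCanceled  JobState = "canceled"
        JobStateStopped   JobState = "stopped"
        JobStateTimeout   JobState = "timeout"
    )

    func (e JobState) IsValid() bool {
        switch e {
        case JobStateRunning, JobStateCompleted, JobStateFailed, JobStateCanceled, JobStateStopped, JobStateTimeout:
            return true
        }
        return false
    }

    func main() {
        for _, s := range []string{"completed", "timeout", "pending"} {
            fmt.Printf("%-10s valid=%v\n", s, JobState(s).IsValid())
        }
    }
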
@@ -2,6 +2,7 @@ package graph
 
 import (
     "context"
+    "encoding/json"
     "errors"
     "fmt"
     "regexp"
@@ -22,7 +23,12 @@ type Resolver struct {
     DB *sqlx.DB
 }
 
-var JobTableCols []string = []string{"id", "job_id", "user_id", "project_id", "cluster_id", "start_time", "duration", "job_state", "num_nodes", "node_list", "flops_any_avg", "mem_bw_avg", "net_bw_avg", "file_bw_avg", "load_avg"}
+var JobTableCols []string = []string{
+    "id", "job_id", "cluster", "start_time",
+    "user", "project", "partition", "array_job_id", "duration", "job_state", "resources",
+    "num_nodes", "num_hwthreads", "num_acc", "smt", "exclusive", "monitoring_status",
+    "load_avg", "mem_used_max", "flops_any_avg", "mem_bw_avg", "net_bw_avg", "file_bw_avg",
+}
 
 type Scannable interface {
     Scan(dest ...interface{}) error
@@ -30,13 +36,18 @@ type Scannable interface {
 
 // Helper function for scanning jobs with the `jobTableCols` columns selected.
 func ScanJob(row Scannable) (*model.Job, error) {
-    job := &model.Job{HasProfile: true}
+    job := &model.Job{}
 
-    var nodeList string
+    var rawResources []byte
     if err := row.Scan(
-        &job.ID, &job.JobID, &job.UserID, &job.ProjectID, &job.ClusterID,
-        &job.StartTime, &job.Duration, &job.State, &job.NumNodes, &nodeList,
-        &job.FlopsAnyAvg, &job.MemBwAvg, &job.NetBwAvg, &job.FileBwAvg, &job.LoadAvg); err != nil {
+        &job.ID, &job.JobID, &job.Cluster, &job.StartTime,
+        &job.User, &job.Project, &job.Partition, &job.ArrayJobID, &job.Duration, &job.State, &rawResources,
+        &job.NumNodes, &job.NumHWThreads, &job.NumAcc, &job.Smt, &job.Exclusive, &job.MonitoringStatus,
+        &job.LoadAvg, &job.MemUsedMax, &job.FlopsAnyAvg, &job.MemBwAvg, &job.NetBwAvg, &job.FileBwAvg); err != nil {
+        return nil, err
+    }
+
+    if err := json.Unmarshal(rawResources, &job.Resources); err != nil {
         return nil, err
     }
 
@@ -44,7 +55,6 @@ func ScanJob(row Scannable) (*model.Job, error) {
         job.Duration = int(time.Since(job.StartTime).Seconds())
     }
 
-    job.Nodes = strings.Split(nodeList, ",")
     return job, nil
 }
 
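
Annotation (not part of this commit): `resources` is stored as a JSON TEXT column, so ScanJob round-trips it through `json.Unmarshal`. A standalone sketch of that decoding step, with the `schema.JobResource`/`schema.Accelerator` shapes from this diff mirrored locally and a hypothetical payload:

    package main

    import (
        "encoding/json"
        "fmt"
    )

    // Mirrors schema.Accelerator / schema.JobResource from this diff.
    type Accelerator struct {
        ID    int    `json:"Id"`
        Type  string `json:"Type"`
        Model string `json:"Model"`
    }

    type JobResource struct {
        Hostname     string        `json:"Hostname"`
        HWThreads    []int         `json:"HWThreads,omitempty"`
        Accelerators []Accelerator `json:"Accelerators,omitempty"`
    }

    func main() {
        // Example value as it could appear in the `resources` column (hypothetical data).
        raw := []byte(`[{"Hostname":"node001","HWThreads":[0,1,2,3]},{"Hostname":"node002"}]`)

        var resources []*JobResource
        if err := json.Unmarshal(raw, &resources); err != nil {
            panic(err)
        }
        fmt.Println(resources[0].Hostname, resources[0].HWThreads) // node001 [0 1 2 3]
    }
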
@@ -130,14 +140,14 @@ func buildWhereClause(filter *model.JobFilter, query sq.SelectBuilder) sq.Select
     if filter.JobID != nil {
         query = buildStringCondition("job.job_id", filter.JobID, query)
     }
-    if filter.UserID != nil {
-        query = buildStringCondition("job.user_id", filter.UserID, query)
+    if filter.User != nil {
+        query = buildStringCondition("job.user", filter.User, query)
     }
-    if filter.ProjectID != nil {
-        query = buildStringCondition("job.project_id", filter.ProjectID, query)
+    if filter.Project != nil {
+        query = buildStringCondition("job.project", filter.Project, query)
     }
-    if filter.ClusterID != nil {
-        query = buildStringCondition("job.cluster_id", filter.ClusterID, query)
+    if filter.Cluster != nil {
+        query = buildStringCondition("job.cluster", filter.Cluster, query)
     }
     if filter.StartTime != nil {
         query = buildTimeCondition("job.start_time", filter.StartTime, query)
@@ -145,12 +155,8 @@ func buildWhereClause(filter *model.JobFilter, query sq.SelectBuilder) sq.Select
     if filter.Duration != nil {
         query = buildIntCondition("job.duration", filter.Duration, query)
     }
-    if filter.IsRunning != nil {
-        if *filter.IsRunning {
-            query = query.Where("job.job_state = 'running'")
-        } else {
-            query = query.Where("job.job_state = 'completed'")
-        }
+    if filter.JobState != nil {
+        query = query.Where("job.job_state IN ?", filter.JobState)
     }
     if filter.NumNodes != nil {
         query = buildIntCondition("job.num_nodes", filter.NumNodes, query)
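
Annotation (not part of this commit): an alternative way to build the state filter is squirrel's map predicate, which expands a slice value into one placeholder per element; this is a sketch of that option, not what the commit does:

    package main

    import (
        "fmt"

        sq "github.com/Masterminds/squirrel"
    )

    func main() {
        // sq.Eq with a slice value renders an IN list with one bound argument per element.
        states := []string{"running", "failed"}
        sql, args, _ := sq.Select("id").From("job").Where(sq.Eq{"job.job_state": states}).ToSql()
        fmt.Println(sql)  // SELECT id FROM job WHERE job.job_state IN (?,?)
        fmt.Println(args) // [running failed]
    }
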
@@ -1,78 +1,102 @@
 type Job {
-    id:          ID!        # Database ID, unique
-    jobId:       String!    # ID given to the job by the cluster scheduler
-    userId:      String!    # Username
-    projectId:   String!    # Project
-    clusterId:   String!    # Name of the cluster this job was running on
-    startTime:   Time!      # RFC3339 formated string
-    duration:    Int!       # For running jobs, the time it has already run
-    numNodes:    Int!       # Number of nodes this job was running on
-    nodes:       [String!]! # List of hostnames
-    hasProfile:  Boolean!   # TODO: Could be removed?
-    state:       JobState!  # State of the job
-    tags:        [JobTag!]! # List of tags this job has
+    Id:               ID!             # Database ID, unique
+    JobId:            Int!            # ID given to the job by the cluster scheduler
+    User:             String!         # Username
+    Project:          String!         # Project
+    Cluster:          String!         # Name of the cluster this job was running on
+    StartTime:        Time!           # RFC3339 formated string
+    Duration:         Int!            # For running jobs, the time it has already run
+    NumNodes:         Int!            # Number of nodes this job was running on
+    NumHWThreads:     Int!
+    NumAcc:           Int!
+    SMT:              Int!
+    Exclusive:        Int!
+    Partition:        String!
+    ArrayJobId:       Int!
+    MonitoringStatus: Int!
+    State:            JobState!       # State of the job
+    Tags:             [JobTag!]!      # List of tags this job has
+    Resources:        [JobResource!]! # List of hosts/hwthreads/gpus/...
 
     # Will be null for running jobs.
-    loadAvg:     Float
-    memUsedMax:  Float
-    flopsAnyAvg: Float
-    memBwAvg:    Float
-    netBwAvg:    Float
-    fileBwAvg:   Float
+    LoadAvg:     Float
+    MemUsedMax:  Float
+    FlopsAnyAvg: Float
+    MemBwAvg:    Float
+    NetBwAvg:    Float
+    FileBwAvg:   Float
+}
+
+type JobResource {
+    Hostname:     String!
+    HWThreads:    [Int!]
+    Accelerators: [Accelerator!]
+}
+
+type Accelerator {
+    Id:    String!
+    Type:  String!
+    Model: String!
 }
 
 # TODO: Extend by more possible states?
 enum JobState {
     running
     completed
+    failed
+    canceled
+    stopped
+    timeout
 }
 
 type JobTag {
-    id:      ID!     # Database ID, unique
-    tagType: String! # Type
-    tagName: String! # Name
+    Id:      ID!     # Database ID, unique
+    TagType: String! # Type
+    TagName: String! # Name
 }
 
 type Cluster {
-    clusterID:       String!
-    processorType:   String!
-    socketsPerNode:  Int!
-    coresPerSocket:  Int!
-    threadsPerCore:  Int!
-    flopRateScalar:  Int!
-    flopRateSimd:    Int!
-    memoryBandwidth: Int!
-    metricConfig:    [MetricConfig!]!
-    filterRanges:    FilterRanges!
+    ClusterID:       String!
+    ProcessorType:   String!
+    SocketsPerNode:  Int!
+    CoresPerSocket:  Int!
+    ThreadsPerCore:  Int!
+    FlopRateScalar:  Int!
+    FlopRateSimd:    Int!
+    MemoryBandwidth: Int!
+    MetricConfig:    [MetricConfig!]!
+    FilterRanges:    FilterRanges!
 }
 
 type MetricConfig {
-    name:       String!
-    unit:       String!
-    sampletime: Int!
-    peak:       Int!
-    normal:     Int!
-    caution:    Int!
-    alert:      Int!
+    Name:     String!
+    Unit:     String!
+    Timestep: Int!
+    Peak:     Int!
+    Normal:   Int!
+    Caution:  Int!
+    Alert:    Int!
+    Scope:    String!
 }
 
 type JobMetric {
-    unit:     String!
-    scope:    JobMetricScope!
-    timestep: Int!
-    series:   [JobMetricSeries!]!
+    Unit:     String!
+    Scope:    JobMetricScope!
+    Timestep: Int!
+    Series:   [JobMetricSeries!]!
 }
 
 type JobMetricSeries {
-    node_id:    String!
-    statistics: JobMetricStatistics
-    data:       [NullableFloat!]!
+    Hostname:   String!
+    Id:         Int
+    Statistics: JobMetricStatistics
+    Data:       [NullableFloat!]!
 }
 
 type JobMetricStatistics {
-    avg: Float!
-    min: Float!
-    max: Float!
+    Avg: Float!
+    Min: Float!
+    Max: Float!
 }
 
 type JobMetricWithName {
@@ -141,13 +165,13 @@ type FilterRanges {
 input JobFilter {
     tags:        [ID!]
     jobId:       StringInput
-    userId:      StringInput
-    projectId:   StringInput
-    clusterId:   StringInput
+    user:        StringInput
+    project:     StringInput
+    cluster:     StringInput
     duration:    IntRange
     numNodes:    IntRange
     startTime:   TimeRange
-    isRunning:   Boolean
+    jobState:    [JobState!]
     flopsAnyAvg: FloatRange
     memBwAvg:    FloatRange
     loadAvg:     FloatRange
@@ -15,9 +15,14 @@ import (
     "github.com/ClusterCockpit/cc-jobarchive/graph/generated"
     "github.com/ClusterCockpit/cc-jobarchive/graph/model"
     "github.com/ClusterCockpit/cc-jobarchive/metricdata"
+    "github.com/ClusterCockpit/cc-jobarchive/schema"
     sq "github.com/Masterminds/squirrel"
 )
 
+func (r *acceleratorResolver) ID(ctx context.Context, obj *schema.Accelerator) (string, error) {
+    panic(fmt.Errorf("not implemented"))
+}
+
 func (r *jobResolver) Tags(ctx context.Context, obj *model.Job) ([]*model.JobTag, error) {
     query := sq.
         Select("tag.id", "tag.tag_type", "tag.tag_name").
@@ -232,6 +237,9 @@ func (r *queryResolver) NodeMetrics(ctx context.Context, cluster string, nodes [
     return res, nil
 }
 
+// Accelerator returns generated.AcceleratorResolver implementation.
+func (r *Resolver) Accelerator() generated.AcceleratorResolver { return &acceleratorResolver{r} }
+
 // Job returns generated.JobResolver implementation.
 func (r *Resolver) Job() generated.JobResolver { return &jobResolver{r} }
 
@@ -241,6 +249,7 @@ func (r *Resolver) Mutation() generated.MutationResolver { return &mutationResol
 // Query returns generated.QueryResolver implementation.
 func (r *Resolver) Query() generated.QueryResolver { return &queryResolver{r} }
 
+type acceleratorResolver struct{ *Resolver }
 type jobResolver struct{ *Resolver }
 type mutationResolver struct{ *Resolver }
 type queryResolver struct{ *Resolver }
init-db.go (120 changes)
@@ -8,13 +8,61 @@ import (
     "log"
     "os"
     "path/filepath"
-    "strings"
     "time"
 
     "github.com/ClusterCockpit/cc-jobarchive/schema"
     "github.com/jmoiron/sqlx"
 )
 
+const JOBS_DB_SCHEMA string = `
+    DROP TABLE IF EXISTS job;
+    DROP TABLE IF EXISTS tag;
+    DROP TABLE IF EXISTS jobtag;
+
+    CREATE TABLE job (
+        id                INTEGER PRIMARY KEY AUTOINCREMENT, -- Not needed in sqlite
+        job_id            BIGINT NOT NULL,
+        cluster           VARCHAR(255) NOT NULL,
+        start_time        BITINT NOT NULL,
+
+        user              VARCHAR(255) NOT NULL,
+        project           VARCHAR(255) NOT NULL,
+        partition         VARCHAR(255) NOT NULL,
+        array_job_id      BIGINT NOT NULL,
+        duration          INT,
+        job_state         VARCHAR(255) CHECK(job_state IN ('running', 'completed', 'failed', 'canceled', 'stopped', 'timeout')) NOT NULL,
+        meta_data         TEXT,          -- json, but sqlite has no json type
+        resources         TEXT NOT NULL, -- json, but sqlite has no json type
+
+        num_nodes         INT NOT NULL,
+        num_hwthreads     INT NOT NULL,
+        num_acc           INT NOT NULL,
+        smt               TINYINT CHECK(smt IN (0, 1 )) NOT NULL DEFAULT 1,
+        exclusive         TINYINT CHECK(exclusive IN (0, 1, 2)) NOT NULL DEFAULT 1,
+        monitoring_status TINYINT CHECK(monitoring_status IN (0, 1 )) NOT NULL DEFAULT 1,
+
+        mem_used_max        REAL NOT NULL DEFAULT 0.0,
+        flops_any_avg       REAL NOT NULL DEFAULT 0.0,
+        mem_bw_avg          REAL NOT NULL DEFAULT 0.0,
+        load_avg            REAL NOT NULL DEFAULT 0.0,
+        net_bw_avg          REAL NOT NULL DEFAULT 0.0,
+        net_data_vol_total  REAL NOT NULL DEFAULT 0.0,
+        file_bw_avg         REAL NOT NULL DEFAULT 0.0,
+        file_data_vol_total REAL NOT NULL DEFAULT 0.0);
+
+    CREATE TABLE tag (
+        id       INTEGER PRIMARY KEY,
+        tag_type VARCHAR(255) NOT NULL,
+        tag_name VARCHAR(255) NOT NULL);
+
+    CREATE TABLE jobtag (
+        job_id INTEGER,
+        tag_id INTEGER,
+        PRIMARY KEY (job_id, tag_id),
+        FOREIGN KEY (job_id) REFERENCES job (id) ON DELETE CASCADE,
+        FOREIGN KEY (tag_id) REFERENCES tag (id) ON DELETE CASCADE);
+`
+
 // Delete the tables "job", "tag" and "jobtag" from the database and
 // repopulate them using the jobs found in `archive`.
 func initDB(db *sqlx.DB, archive string) error {
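
Annotation (not part of this commit): the schema is applied as one multi-statement `Exec`. A minimal sketch of standing up an empty database with it, assuming the github.com/mattn/go-sqlite3 driver and with the constant trimmed to a single table:

    package main

    import (
        "log"

        "github.com/jmoiron/sqlx"
        _ "github.com/mattn/go-sqlite3" // assumed SQLite driver
    )

    // jobsDBSchema stands in for the JOBS_DB_SCHEMA constant from this diff
    // (reduced to the tag table to keep the sketch short).
    const jobsDBSchema = `
        DROP TABLE IF EXISTS tag;
        CREATE TABLE tag (
            id       INTEGER PRIMARY KEY,
            tag_type VARCHAR(255) NOT NULL,
            tag_name VARCHAR(255) NOT NULL);
    `

    func main() {
        db, err := sqlx.Open("sqlite3", "./job.db")
        if err != nil {
            log.Fatal(err)
        }
        defer db.Close()

        // go-sqlite3 executes all semicolon-separated statements in a single Exec call.
        if _, err := db.Exec(jobsDBSchema); err != nil {
            log.Fatal(err)
        }
    }
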
@@ -22,39 +70,7 @@ func initDB(db *sqlx.DB, archive string) error {
     fmt.Println("Building database...")
 
     // Basic database structure:
-    _, err := db.Exec(`
-        DROP TABLE IF EXISTS job;
-        DROP TABLE IF EXISTS tag;
-        DROP TABLE IF EXISTS jobtag;
-
-        CREATE TABLE job (
-            id INTEGER PRIMARY KEY,
-            job_id TEXT,
-            user_id TEXT,
-            project_id TEXT,
-            cluster_id TEXT,
-            start_time TIMESTAMP,
-            duration INTEGER,
-            job_state TEXT,
-            num_nodes INTEGER,
-            node_list TEXT,
-            metadata TEXT,
-
-            flops_any_avg REAL,
-            mem_bw_avg REAL,
-            net_bw_avg REAL,
-            file_bw_avg REAL,
-            load_avg REAL);
-        CREATE TABLE tag (
-            id INTEGER PRIMARY KEY,
-            tag_type TEXT,
-            tag_name TEXT);
-        CREATE TABLE jobtag (
-            job_id INTEGER,
-            tag_id INTEGER,
-            PRIMARY KEY (job_id, tag_id),
-            FOREIGN KEY (job_id) REFERENCES job (id) ON DELETE CASCADE ON UPDATE NO ACTION,
-            FOREIGN KEY (tag_id) REFERENCES tag (id) ON DELETE CASCADE ON UPDATE NO ACTION);`)
+    _, err := db.Exec(JOBS_DB_SCHEMA)
     if err != nil {
         return err
     }
@@ -64,9 +80,17 @@ func initDB(db *sqlx.DB, archive string) error {
         return err
     }
 
-    insertstmt, err := db.Prepare(`INSERT INTO job
-        (job_id, user_id, project_id, cluster_id, start_time, duration, job_state, num_nodes, node_list, metadata, flops_any_avg, mem_bw_avg, net_bw_avg, file_bw_avg, load_avg)
-        VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?);`)
+    insertstmt, err := db.Prepare(`INSERT INTO job (
+        job_id, cluster, start_time,
+        user, project, partition, array_job_id, duration, job_state, meta_data, resources,
+        num_nodes, num_hwthreads, num_acc, smt, exclusive, monitoring_status,
+        flops_any_avg, mem_bw_avg
+    ) VALUES (
+        ?, ?, ?,
+        ?, ?, ?, ?, ?, ?, ?, ?,
+        ?, ?, ?, ?, ?, ?,
+        ?, ?
+    );`)
     if err != nil {
         return err
     }
@@ -149,7 +173,7 @@ func initDB(db *sqlx.DB, archive string) error {
     // Create indexes after inserts so that they do not
     // need to be continually updated.
     if _, err := db.Exec(`
-        CREATE INDEX job_by_user ON job (user_id);
+        CREATE INDEX job_by_user ON job (user);
         CREATE INDEX job_by_starttime ON job (start_time);`); err != nil {
         return err
     }
@@ -167,19 +191,27 @@ func loadJob(tx *sql.Tx, stmt *sql.Stmt, tags map[string]int64, path string) err
     }
     defer f.Close()
 
-    var job schema.JobMeta
+    var job schema.JobMeta = schema.JobMeta{
+        Exclusive: 1,
+    }
     if err := json.NewDecoder(bufio.NewReader(f)).Decode(&job); err != nil {
         return err
     }
 
+    // TODO: Other metrics...
     flopsAnyAvg := loadJobStat(&job, "flops_any")
     memBwAvg := loadJobStat(&job, "mem_bw")
-    netBwAvg := loadJobStat(&job, "net_bw")
-    fileBwAvg := loadJobStat(&job, "file_bw")
-    loadAvg := loadJobStat(&job, "load_one")
 
-    res, err := stmt.Exec(job.JobId, job.UserId, job.ProjectId, job.ClusterId, job.StartTime, job.Duration, job.JobState,
-        job.NumNodes, strings.Join(job.Nodes, ","), nil, flopsAnyAvg, memBwAvg, netBwAvg, fileBwAvg, loadAvg)
+    resources, err := json.Marshal(job.Resources)
+    if err != nil {
+        return err
+    }
+
+    res, err := stmt.Exec(
+        job.JobId, job.Cluster, job.StartTime,
+        job.User, job.Project, job.Partition, job.ArrayJobId, job.Duration, job.JobState, job.MetaData, string(resources),
+        job.NumNodes, job.NumHWThreads, job.NumAcc, job.SMT, job.Exclusive, job.MonitoringStatus,
+        flopsAnyAvg, memBwAvg)
     if err != nil {
         return err
     }
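
Annotation (not part of this commit): pre-initializing `Exclusive: 1` before decoding makes 1 the default only when the key is absent from `meta.json`, because JSON decoding leaves fields it never sees untouched. A tiny self-contained demonstration with a hypothetical struct and input:

    package main

    import (
        "encoding/json"
        "fmt"
        "strings"
    )

    type jobMeta struct {
        Exclusive int8 `json:"Exclusive"`
    }

    func main() {
        // "Exclusive" is missing from the input, so the pre-set default survives.
        job := jobMeta{Exclusive: 1}
        if err := json.NewDecoder(strings.NewReader(`{"JobId": 42}`)).Decode(&job); err != nil {
            panic(err)
        }
        fmt.Println(job.Exclusive) // 1
    }
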
@@ -11,7 +11,6 @@ import (
     "path"
     "path/filepath"
     "strconv"
-    "strings"
 
     "github.com/ClusterCockpit/cc-jobarchive/config"
     "github.com/ClusterCockpit/cc-jobarchive/graph/model"
@@ -21,19 +20,14 @@ import (
 // For a given job, return the path of the `data.json`/`meta.json` file.
 // TODO: Implement Issue ClusterCockpit/ClusterCockpit#97
 func getPath(job *model.Job, file string, checkLegacy bool) (string, error) {
-    id, err := strconv.Atoi(strings.Split(job.JobID, ".")[0])
-    if err != nil {
-        return "", err
-    }
-
-    lvl1, lvl2 := fmt.Sprintf("%d", id/1000), fmt.Sprintf("%03d", id%1000)
+    lvl1, lvl2 := fmt.Sprintf("%d", job.JobID/1000), fmt.Sprintf("%03d", job.JobID%1000)
     if !checkLegacy {
-        return filepath.Join(JobArchivePath, job.ClusterID, lvl1, lvl2, strconv.FormatInt(job.StartTime.Unix(), 10), file), nil
+        return filepath.Join(JobArchivePath, job.Cluster, lvl1, lvl2, strconv.FormatInt(job.StartTime.Unix(), 10), file), nil
     }
 
-    legacyPath := filepath.Join(JobArchivePath, job.ClusterID, lvl1, lvl2, file)
+    legacyPath := filepath.Join(JobArchivePath, job.Cluster, lvl1, lvl2, file)
     if _, err := os.Stat(legacyPath); errors.Is(err, os.ErrNotExist) {
-        return filepath.Join(JobArchivePath, job.ClusterID, lvl1, lvl2, strconv.FormatInt(job.StartTime.Unix(), 10), file), nil
+        return filepath.Join(JobArchivePath, job.Cluster, lvl1, lvl2, strconv.FormatInt(job.StartTime.Unix(), 10), file), nil
     }
 
     return legacyPath, nil
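
Annotation (not part of this commit): with `JobID` now an integer, the two archive directory levels come straight from integer division and `%03d` formatting. A quick worked example with hypothetical values:

    package main

    import "fmt"

    func main() {
        // Hypothetical job 1234567 on cluster "emmy", started at unix time 1609459200.
        jobID := 1234567
        lvl1 := fmt.Sprintf("%d", jobID/1000)   // "1234"
        lvl2 := fmt.Sprintf("%03d", jobID%1000) // "567", zero-padded by %03d
        fmt.Println(lvl1, lvl2)
        // Resulting path shape: <JobArchivePath>/emmy/1234/567/1609459200/meta.json
    }
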
@@ -87,13 +81,13 @@ func UpdateTags(job *model.Job, tags []*model.JobTag) error {
     f.Close()
 
     metaFile.Tags = make([]struct {
-        Name string "json:\"name\""
-        Type string "json:\"type\""
+        Name string "json:\"Name\""
+        Type string "json:\"Type\""
     }, 0)
     for _, tag := range tags {
         metaFile.Tags = append(metaFile.Tags, struct {
-            Name string "json:\"name\""
-            Type string "json:\"type\""
+            Name string "json:\"Name\""
+            Type string "json:\"Type\""
         }{
             Name: tag.TagName,
             Type: tag.TagType,
@@ -143,7 +137,7 @@ func ArchiveJob(job *model.Job, ctx context.Context) (*schema.JobMeta, error) {
     }
 
     allMetrics := make([]string, 0)
-    metricConfigs := config.GetClusterConfig(job.ClusterID).MetricConfig
+    metricConfigs := config.GetClusterConfig(job.Cluster).MetricConfig
     for _, mc := range metricConfigs {
         allMetrics = append(allMetrics, mc.Name)
     }
@@ -153,13 +147,13 @@ func ArchiveJob(job *model.Job, ctx context.Context) (*schema.JobMeta, error) {
     }
 
     tags := []struct {
-        Name string `json:"name"`
-        Type string `json:"type"`
+        Name string `json:"Name"`
+        Type string `json:"Type"`
     }{}
     for _, tag := range job.Tags {
         tags = append(tags, struct {
-            Name string `json:"name"`
-            Type string `json:"type"`
+            Name string `json:"Name"`
+            Type string `json:"Type"`
         }{
             Name: tag.TagName,
             Type: tag.TagType,
@@ -167,14 +161,23 @@ func ArchiveJob(job *model.Job, ctx context.Context) (*schema.JobMeta, error) {
     }
 
     metaData := &schema.JobMeta{
-        JobId:     job.JobID,
-        UserId:    job.UserID,
-        ClusterId: job.ClusterID,
+        JobId:            int64(job.JobID),
+        User:             job.User,
+        Project:          job.Project,
+        Cluster:          job.Cluster,
         NumNodes:  job.NumNodes,
-        JobState:  job.State.String(),
+        NumHWThreads:     job.NumHWThreads,
+        NumAcc:           job.NumAcc,
+        Exclusive:        int8(job.Exclusive),
+        MonitoringStatus: int8(job.MonitoringStatus),
+        SMT:              int8(job.Smt),
+        Partition:        job.Partition,
+        ArrayJobId:       job.ArrayJobID,
+        JobState:         string(job.State),
         StartTime: job.StartTime.Unix(),
         Duration:  int64(job.Duration),
-        Nodes:     job.Nodes,
+        Resources: job.Resources,
+        MetaData:  "", // TODO/FIXME: Handle `meta_data`!
         Tags:      tags,
         Statistics: make(map[string]*schema.JobMetaStatistics),
     }
@@ -188,7 +191,7 @@ func ArchiveJob(job *model.Job, ctx context.Context) (*schema.JobMeta, error) {
     }
 
     metaData.Statistics[metric] = &schema.JobMetaStatistics{
-        Unit: config.GetMetricConfig(job.ClusterID, metric).Unit,
+        Unit: config.GetMetricConfig(job.Cluster, metric).Unit,
         Avg:  avg / float64(job.NumNodes),
         Min:  min,
         Max:  max,
@@ -61,8 +61,13 @@ func (ccms *CCMetricStore) doRequest(job *model.Job, suffix string, metrics []st
     from, to := job.StartTime.Unix(), job.StartTime.Add(time.Duration(job.Duration)*time.Second).Unix()
     reqBody := ApiRequestBody{}
     reqBody.Metrics = metrics
-    for _, node := range job.Nodes {
-        reqBody.Selectors = append(reqBody.Selectors, []string{job.ClusterID, node})
+    for _, node := range job.Resources {
+        if node.Accelerators != nil || node.HWThreads != nil {
+            // TODO/FIXME:
+            return nil, errors.New("todo: cc-metric-store resources: Accelerator/HWThreads")
+        }
+
+        reqBody.Selectors = append(reqBody.Selectors, []string{job.Cluster, node.Hostname})
     }
 
     reqBodyBytes, err := json.Marshal(reqBody)
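
Annotation (not part of this commit): each selector stays a [cluster, hostname] pair; per-HWThread/accelerator selection is explicitly rejected for now. A minimal sketch of the selector construction with hypothetical hosts:

    package main

    import "fmt"

    func main() {
        // Hypothetical job on cluster "emmy" spanning two plain node-level
        // resources; each selector stays a [cluster, hostname] pair.
        hosts := []string{"node001", "node002"}
        selectors := make([][]string, 0, len(hosts))
        for _, h := range hosts {
            selectors = append(selectors, []string{"emmy", h})
        }
        fmt.Println(selectors) // [[emmy node001] [emmy node002]]
    }
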
@@ -86,32 +91,37 @@ func (ccms *CCMetricStore) LoadData(job *model.Job, metrics []string, ctx contex
         return nil, err
     }
 
-    resdata := make([]map[string]ApiMetricData, 0, len(job.Nodes))
+    resdata := make([]map[string]ApiMetricData, 0, len(job.Resources))
     if err := json.NewDecoder(res.Body).Decode(&resdata); err != nil {
         return nil, err
     }
 
     var jobData schema.JobData = make(schema.JobData)
     for _, metric := range metrics {
-        mc := config.GetMetricConfig(job.ClusterID, metric)
+        mc := config.GetMetricConfig(job.Cluster, metric)
         metricData := &schema.JobMetric{
             Scope:    "node", // TODO: FIXME: Whatever...
             Unit:     mc.Unit,
-            Timestep: mc.Sampletime,
-            Series:   make([]*schema.MetricSeries, 0, len(job.Nodes)),
+            Timestep: mc.Timestep,
+            Series:   make([]*schema.MetricSeries, 0, len(job.Resources)),
         }
-        for i, node := range job.Nodes {
+        for i, node := range job.Resources {
+            if node.Accelerators != nil || node.HWThreads != nil {
+                // TODO/FIXME:
+                return nil, errors.New("todo: cc-metric-store resources: Accelerator/HWThreads")
+            }
+
             data := resdata[i][metric]
             if data.Error != nil {
                 return nil, errors.New(*data.Error)
             }
 
             if data.Avg == nil || data.Min == nil || data.Max == nil {
-                return nil, fmt.Errorf("no data for node '%s' and metric '%s'", node, metric)
+                return nil, fmt.Errorf("no data for node '%s' and metric '%s'", node.Hostname, metric)
             }
 
             metricData.Series = append(metricData.Series, &schema.MetricSeries{
-                NodeID:     node,
+                Hostname:   node.Hostname,
                 Data:       data.Data,
                 Statistics: &schema.MetricStatistics{
                     Avg: *data.Avg,
@@ -132,7 +142,7 @@ func (ccms *CCMetricStore) LoadStats(job *model.Job, metrics []string, ctx conte
         return nil, err
     }
 
-    resdata := make([]map[string]ApiStatsData, 0, len(job.Nodes))
+    resdata := make([]map[string]ApiStatsData, 0, len(job.Resources))
     if err := json.NewDecoder(res.Body).Decode(&resdata); err != nil {
         return nil, err
     }
@@ -140,17 +150,22 @@ func (ccms *CCMetricStore) LoadStats(job *model.Job, metrics []string, ctx conte
     stats := map[string]map[string]schema.MetricStatistics{}
     for _, metric := range metrics {
         nodestats := map[string]schema.MetricStatistics{}
-        for i, node := range job.Nodes {
+        for i, node := range job.Resources {
+            if node.Accelerators != nil || node.HWThreads != nil {
+                // TODO/FIXME:
+                return nil, errors.New("todo: cc-metric-store resources: Accelerator/HWThreads")
+            }
+
             data := resdata[i][metric]
             if data.Error != nil {
                 return nil, errors.New(*data.Error)
             }
 
             if data.Samples == 0 {
-                return nil, fmt.Errorf("no data for node '%s' and metric '%s'", node, metric)
+                return nil, fmt.Errorf("no data for node '%s' and metric '%s'", node.Hostname, metric)
            }
 
-            nodestats[node] = schema.MetricStatistics{
+            nodestats[node.Hostname] = schema.MetricStatistics{
                 Avg: float64(data.Avg),
                 Min: float64(data.Min),
                 Max: float64(data.Max),
@@ -2,6 +2,7 @@ package metricdata
 
 import (
     "context"
+    "errors"
     "fmt"
     "log"
     "os"
@@ -46,9 +47,14 @@ func (idb *InfluxDBv2DataRepository) LoadData(job *model.Job, metrics []string,
     }
     fieldsCond := strings.Join(fieldsConds, " or ")
 
-    hostsConds := make([]string, 0, len(job.Nodes))
-    for _, h := range job.Nodes {
-        hostsConds = append(hostsConds, fmt.Sprintf(`r.host == "%s"`, h))
+    hostsConds := make([]string, 0, len(job.Resources))
+    for _, h := range job.Resources {
+        if h.HWThreads != nil || h.Accelerators != nil {
+            // TODO/FIXME...
+            return nil, errors.New("the InfluxDB metric data repository does not support HWThreads or Accelerators")
+        }
+
+        hostsConds = append(hostsConds, fmt.Sprintf(`r.host == "%s"`, h.Hostname))
     }
     hostsCond := strings.Join(hostsConds, " or ")
 
@@ -72,18 +78,18 @@ func (idb *InfluxDBv2DataRepository) LoadData(job *model.Job, metrics []string,
         field, host := row.Field(), row.ValueByKey("host").(string)
         jobMetric, ok := jobData[field]
         if !ok {
-            mc := config.GetMetricConfig(job.ClusterID, field)
+            mc := config.GetMetricConfig(job.Cluster, field)
             jobMetric = &schema.JobMetric{
                 Scope:    "node", // TODO: FIXME: Whatever...
                 Unit:     mc.Unit,
-                Timestep: mc.Sampletime,
-                Series:   make([]*schema.MetricSeries, 0, len(job.Nodes)),
+                Timestep: mc.Timestep,
+                Series:   make([]*schema.MetricSeries, 0, len(job.Resources)),
             }
             jobData[field] = jobMetric
         }
 
         currentSeries = &schema.MetricSeries{
-            NodeID:     host,
+            Hostname:   host,
             Statistics: nil,
             Data:       make([]schema.Float, 0),
         }
@@ -102,7 +108,7 @@ func (idb *InfluxDBv2DataRepository) LoadData(job *model.Job, metrics []string,
         jobMetric := jobData[metric]
         for node, stats := range nodes {
             for _, series := range jobMetric.Series {
-                if series.NodeID == node {
+                if series.Hostname == node {
                     series.Statistics = &stats
                 }
             }
@@ -115,9 +121,14 @@ func (idb *InfluxDBv2DataRepository) LoadData(job *model.Job, metrics []string,
 func (idb *InfluxDBv2DataRepository) LoadStats(job *model.Job, metrics []string, ctx context.Context) (map[string]map[string]schema.MetricStatistics, error) {
     stats := map[string]map[string]schema.MetricStatistics{}
 
-    hostsConds := make([]string, 0, len(job.Nodes))
-    for _, h := range job.Nodes {
-        hostsConds = append(hostsConds, fmt.Sprintf(`r.host == "%s"`, h))
+    hostsConds := make([]string, 0, len(job.Resources))
+    for _, h := range job.Resources {
+        if h.HWThreads != nil || h.Accelerators != nil {
+            // TODO/FIXME...
+            return nil, errors.New("the InfluxDB metric data repository does not support HWThreads or Accelerators")
+        }
+
+        hostsConds = append(hostsConds, fmt.Sprintf(`r.host == "%s"`, h.Hostname))
     }
     hostsCond := strings.Join(hostsConds, " or ")
 
@@ -59,9 +59,9 @@ func Init(jobArchivePath string, disableArchive bool) error {
 // Fetches the metric data for a job.
 func LoadData(job *model.Job, metrics []string, ctx context.Context) (schema.JobData, error) {
     if job.State == model.JobStateRunning || !useArchive {
-        repo, ok := metricDataRepos[job.ClusterID]
+        repo, ok := metricDataRepos[job.Cluster]
         if !ok {
-            return nil, fmt.Errorf("no metric data repository configured for '%s'", job.ClusterID)
+            return nil, fmt.Errorf("no metric data repository configured for '%s'", job.Cluster)
         }
 
         return repo.LoadData(job, metrics, ctx)
@@ -90,9 +90,9 @@ func LoadAverages(job *model.Job, metrics []string, data [][]schema.Float, ctx c
         return loadAveragesFromArchive(job, metrics, data)
     }
 
-    repo, ok := metricDataRepos[job.ClusterID]
+    repo, ok := metricDataRepos[job.Cluster]
     if !ok {
-        return fmt.Errorf("no metric data repository configured for '%s'", job.ClusterID)
+        return fmt.Errorf("no metric data repository configured for '%s'", job.Cluster)
     }
 
     stats, err := repo.LoadStats(job, metrics, ctx)
@@ -9,10 +9,10 @@ import (
 type JobData map[string]*JobMetric
 
 type JobMetric struct {
-    Unit     string          `json:"unit"`
-    Scope    MetricScope     `json:"scope"`
-    Timestep int             `json:"timestep"`
-    Series   []*MetricSeries `json:"series"`
+    Unit     string          `json:"Unit"`
+    Scope    MetricScope     `json:"Scope"`
+    Timestep int             `json:"Timestep"`
+    Series   []*MetricSeries `json:"Series"`
 }
 
 type MetricScope string
@@ -41,38 +41,59 @@ func (e MetricScope) MarshalGQL(w io.Writer) {
 }
 
 type MetricStatistics struct {
-    Avg float64 `json:"avg"`
-    Min float64 `json:"min"`
-    Max float64 `json:"max"`
+    Avg float64 `json:"Avg"`
+    Min float64 `json:"Min"`
+    Max float64 `json:"Max"`
 }
 
 type MetricSeries struct {
-    NodeID     string            `json:"node_id"`
-    Statistics *MetricStatistics `json:"statistics"`
-    Data       []Float           `json:"data"`
+    Hostname   string            `json:"Hostname"`
+    Id         int               `json:"Id"`
+    Statistics *MetricStatistics `json:"Statistics"`
+    Data       []Float           `json:"Data"`
 }
 
 type JobMetaStatistics struct {
-    Unit string  `json:"unit"`
-    Avg  float64 `json:"avg"`
-    Min  float64 `json:"min"`
-    Max  float64 `json:"max"`
+    Unit string  `json:"Unit"`
+    Avg  float64 `json:"Avg"`
+    Min  float64 `json:"Min"`
+    Max  float64 `json:"Max"`
+}
+
+type Accelerator struct {
+    ID    int    `json:"Id"`
+    Type  string `json:"Type"`
+    Model string `json:"Model"`
+}
+
+type JobResource struct {
+    Hostname     string        `json:"Hostname"`
+    HWThreads    []int         `json:"HWThreads,omitempty"`
+    Accelerators []Accelerator `json:"Accelerators,omitempty"`
 }
 
 // Format of `meta.json` files.
 type JobMeta struct {
-    JobId     string   `json:"job_id"`
-    UserId    string   `json:"user_id"`
-    ProjectId string   `json:"project_id"`
-    ClusterId string   `json:"cluster_id"`
-    NumNodes  int      `json:"num_nodes"`
-    JobState  string   `json:"job_state"`
-    StartTime int64    `json:"start_time"`
-    Duration  int64    `json:"duration"`
-    Nodes     []string `json:"nodes"`
+    JobId            int64          `json:"JobId"`
+    User             string         `json:"User"`
+    Project          string         `json:"Project"`
+    Cluster          string         `json:"Cluster"`
+    NumNodes         int            `json:"NumNodes"`
+    NumHWThreads     int            `json:"NumHWThreads"`
+    NumAcc           int            `json:"NumAcc"`
+    Exclusive        int8           `json:"Exclusive"`
+    MonitoringStatus int8           `json:"MonitoringStatus"`
+    SMT              int8           `json:"SMT"`
+    Partition        string         `json:"Partition"`
+    ArrayJobId       int            `json:"ArrayJobId"`
+    JobState         string         `json:"JobState"`
+    StartTime        int64          `json:"StartTime"`
+    Duration         int64          `json:"Duration"`
+    Resources        []*JobResource `json:"Resources"`
+    MetaData         string         `json:"MetaData"`
     Tags []struct {
-        Name string `json:"name"`
-        Type string `json:"type"`
-    } `json:"tags"`
-    Statistics map[string]*JobMetaStatistics `json:"statistics"`
+        Name string `json:"Name"`
+        Type string `json:"Type"`
+    } `json:"Tags"`
+    Statistics map[string]*JobMetaStatistics `json:"Statistics"`
 }
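
Annotation (not part of this commit): a `meta.json` fragment using the new PascalCase keys, decoded through a trimmed-down mirror of `schema.JobMeta` (hypothetical values, non-exhaustive field list):

    package main

    import (
        "encoding/json"
        "fmt"
    )

    // Trimmed-down mirror of schema.JobMeta; see the struct above for the full field list.
    type jobMeta struct {
        JobId     int64  `json:"JobId"`
        Cluster   string `json:"Cluster"`
        JobState  string `json:"JobState"`
        StartTime int64  `json:"StartTime"`
        Resources []struct {
            Hostname  string `json:"Hostname"`
            HWThreads []int  `json:"HWThreads,omitempty"`
        } `json:"Resources"`
    }

    func main() {
        // Hypothetical meta.json fragment using the new PascalCase keys.
        raw := []byte(`{
            "JobId": 1234567,
            "Cluster": "emmy",
            "JobState": "completed",
            "StartTime": 1609459200,
            "Resources": [{"Hostname": "node001", "HWThreads": [0, 1, 2, 3]}]
        }`)

        var meta jobMeta
        if err := json.Unmarshal(raw, &meta); err != nil {
            panic(err)
        }
        fmt.Println(meta.JobId, meta.Cluster, meta.Resources[0].Hostname)
    }
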
@@ -308,12 +308,12 @@ func monitoringRoutes(router *mux.Router, resolver *graph.Resolver) {
     }
 
     templates.Render(rw, r, "monitoring/job/", &templates.Page{
-        Title:  fmt.Sprintf("Job %s - ClusterCockpit", job.JobID),
+        Title:  fmt.Sprintf("Job %d - ClusterCockpit", job.JobID),
         Config: conf,
         Infos: map[string]interface{}{
             "id":        id,
             "jobId":     job.JobID,
-            "clusterId": job.ClusterID,
+            "clusterId": job.Cluster,
         },
     })
 })