BC: new schemas for basically everything

This commit is contained in:
Lou Knauer 2021-12-16 13:17:48 +01:00
parent 7fcc39a144
commit 89333666b3
14 changed files with 1631 additions and 549 deletions

View File

@ -33,6 +33,7 @@ func (api *RestApi) MountRoutes(r *mux.Router) {
r.HandleFunc("/api/jobs/tag_job/{id}", api.tagJob).Methods(http.MethodPost, http.MethodPatch) r.HandleFunc("/api/jobs/tag_job/{id}", api.tagJob).Methods(http.MethodPost, http.MethodPatch)
} }
// TODO/FIXME: UPDATE API!
type StartJobApiRequest struct { type StartJobApiRequest struct {
JobId int64 `json:"jobId"` JobId int64 `json:"jobId"`
UserId string `json:"userId"` UserId string `json:"userId"`
@ -255,7 +256,7 @@ func (api *RestApi) stopJob(rw http.ResponseWriter, r *http.Request) {
return nil return nil
} }
log.Printf("archiving job... (id: %s): clusterId=%s, jobId=%s, userId=%s, startTime=%s, nodes=%v\n", job.ID, job.ClusterID, job.JobID, job.UserID, job.StartTime, job.Nodes) log.Printf("archiving job... (id: %s): clusterId=%s, jobId=%d, userId=%s, startTime=%s\n", job.ID, job.Cluster, job.JobID, job.User, job.StartTime)
if api.AsyncArchiving { if api.AsyncArchiving {
rw.Header().Add("Content-Type", "application/json") rw.Header().Add("Content-Type", "application/json")
rw.WriteHeader(http.StatusOK) rw.WriteHeader(http.StatusOK)

View File

@ -56,7 +56,7 @@ models:
- github.com/99designs/gqlgen/graphql.Int32 - github.com/99designs/gqlgen/graphql.Int32
Job: Job:
fields: fields:
tags: Tags:
resolver: true resolver: true
JobMetric: JobMetric:
model: "github.com/ClusterCockpit/cc-jobarchive/schema.JobMetric" model: "github.com/ClusterCockpit/cc-jobarchive/schema.JobMetric"
@ -68,4 +68,8 @@ models:
model: "github.com/ClusterCockpit/cc-jobarchive/schema.Float" model: "github.com/ClusterCockpit/cc-jobarchive/schema.Float"
JobMetricScope: JobMetricScope:
model: "github.com/ClusterCockpit/cc-jobarchive/schema.MetricScope" model: "github.com/ClusterCockpit/cc-jobarchive/schema.MetricScope"
JobResource:
model: "github.com/ClusterCockpit/cc-jobarchive/schema.JobResource"
Accelerator:
model: "github.com/ClusterCockpit/cc-jobarchive/schema.Accelerator"

File diff suppressed because it is too large Load Diff

View File

@ -38,36 +38,42 @@ type IntRangeOutput struct {
} }
type Job struct { type Job struct {
ID string `json:"id"` ID string `json:"Id"`
JobID string `json:"jobId"` JobID int `json:"JobId"`
UserID string `json:"userId"` User string `json:"User"`
ProjectID string `json:"projectId"` Project string `json:"Project"`
ClusterID string `json:"clusterId"` Cluster string `json:"Cluster"`
StartTime time.Time `json:"startTime"` StartTime time.Time `json:"StartTime"`
Duration int `json:"duration"` Duration int `json:"Duration"`
NumNodes int `json:"numNodes"` NumNodes int `json:"NumNodes"`
Nodes []string `json:"nodes"` NumHWThreads int `json:"NumHWThreads"`
HasProfile bool `json:"hasProfile"` NumAcc int `json:"NumAcc"`
State JobState `json:"state"` Smt int `json:"SMT"`
Tags []*JobTag `json:"tags"` Exclusive int `json:"Exclusive"`
LoadAvg *float64 `json:"loadAvg"` Partition string `json:"Partition"`
MemUsedMax *float64 `json:"memUsedMax"` ArrayJobID int `json:"ArrayJobId"`
FlopsAnyAvg *float64 `json:"flopsAnyAvg"` MonitoringStatus int `json:"MonitoringStatus"`
MemBwAvg *float64 `json:"memBwAvg"` State JobState `json:"State"`
NetBwAvg *float64 `json:"netBwAvg"` Tags []*JobTag `json:"Tags"`
FileBwAvg *float64 `json:"fileBwAvg"` Resources []*schema.JobResource `json:"Resources"`
LoadAvg *float64 `json:"LoadAvg"`
MemUsedMax *float64 `json:"MemUsedMax"`
FlopsAnyAvg *float64 `json:"FlopsAnyAvg"`
MemBwAvg *float64 `json:"MemBwAvg"`
NetBwAvg *float64 `json:"NetBwAvg"`
FileBwAvg *float64 `json:"FileBwAvg"`
} }
type JobFilter struct { type JobFilter struct {
Tags []string `json:"tags"` Tags []string `json:"tags"`
JobID *StringInput `json:"jobId"` JobID *StringInput `json:"jobId"`
UserID *StringInput `json:"userId"` User *StringInput `json:"user"`
ProjectID *StringInput `json:"projectId"` Project *StringInput `json:"project"`
ClusterID *StringInput `json:"clusterId"` Cluster *StringInput `json:"cluster"`
Duration *IntRange `json:"duration"` Duration *IntRange `json:"duration"`
NumNodes *IntRange `json:"numNodes"` NumNodes *IntRange `json:"numNodes"`
StartTime *TimeRange `json:"startTime"` StartTime *TimeRange `json:"startTime"`
IsRunning *bool `json:"isRunning"` JobState []JobState `json:"jobState"`
FlopsAnyAvg *FloatRange `json:"flopsAnyAvg"` FlopsAnyAvg *FloatRange `json:"flopsAnyAvg"`
MemBwAvg *FloatRange `json:"memBwAvg"` MemBwAvg *FloatRange `json:"memBwAvg"`
LoadAvg *FloatRange `json:"loadAvg"` LoadAvg *FloatRange `json:"loadAvg"`
@ -97,13 +103,14 @@ type JobsStatistics struct {
} }
type MetricConfig struct { type MetricConfig struct {
Name string `json:"name"` Name string `json:"Name"`
Unit string `json:"unit"` Unit string `json:"Unit"`
Sampletime int `json:"sampletime"` Timestep int `json:"Timestep"`
Peak int `json:"peak"` Peak int `json:"Peak"`
Normal int `json:"normal"` Normal int `json:"Normal"`
Caution int `json:"caution"` Caution int `json:"Caution"`
Alert int `json:"alert"` Alert int `json:"Alert"`
Scope string `json:"Scope"`
} }
type MetricFootprints struct { type MetricFootprints struct {
@ -196,16 +203,24 @@ type JobState string
const ( const (
JobStateRunning JobState = "running" JobStateRunning JobState = "running"
JobStateCompleted JobState = "completed" JobStateCompleted JobState = "completed"
JobStateFailed JobState = "failed"
JobStateCanceled JobState = "canceled"
JobStateStopped JobState = "stopped"
JobStateTimeout JobState = "timeout"
) )
var AllJobState = []JobState{ var AllJobState = []JobState{
JobStateRunning, JobStateRunning,
JobStateCompleted, JobStateCompleted,
JobStateFailed,
JobStateCanceled,
JobStateStopped,
JobStateTimeout,
} }
func (e JobState) IsValid() bool { func (e JobState) IsValid() bool {
switch e { switch e {
case JobStateRunning, JobStateCompleted: case JobStateRunning, JobStateCompleted, JobStateFailed, JobStateCanceled, JobStateStopped, JobStateTimeout:
return true return true
} }
return false return false

View File

@ -2,6 +2,7 @@ package graph
import ( import (
"context" "context"
"encoding/json"
"errors" "errors"
"fmt" "fmt"
"regexp" "regexp"
@ -22,7 +23,12 @@ type Resolver struct {
DB *sqlx.DB DB *sqlx.DB
} }
var JobTableCols []string = []string{"id", "job_id", "user_id", "project_id", "cluster_id", "start_time", "duration", "job_state", "num_nodes", "node_list", "flops_any_avg", "mem_bw_avg", "net_bw_avg", "file_bw_avg", "load_avg"} var JobTableCols []string = []string{
"id", "job_id", "cluster", "start_time",
"user", "project", "partition", "array_job_id", "duration", "job_state", "resources",
"num_nodes", "num_hwthreads", "num_acc", "smt", "exclusive", "monitoring_status",
"load_avg", "mem_used_max", "flops_any_avg", "mem_bw_avg", "net_bw_avg", "file_bw_avg",
}
type Scannable interface { type Scannable interface {
Scan(dest ...interface{}) error Scan(dest ...interface{}) error
@ -30,13 +36,18 @@ type Scannable interface {
// Helper function for scanning jobs with the `jobTableCols` columns selected. // Helper function for scanning jobs with the `jobTableCols` columns selected.
func ScanJob(row Scannable) (*model.Job, error) { func ScanJob(row Scannable) (*model.Job, error) {
job := &model.Job{HasProfile: true} job := &model.Job{}
var nodeList string var rawResources []byte
if err := row.Scan( if err := row.Scan(
&job.ID, &job.JobID, &job.UserID, &job.ProjectID, &job.ClusterID, &job.ID, &job.JobID, &job.Cluster, &job.StartTime,
&job.StartTime, &job.Duration, &job.State, &job.NumNodes, &nodeList, &job.User, &job.Project, &job.Partition, &job.ArrayJobID, &job.Duration, &job.State, &rawResources,
&job.FlopsAnyAvg, &job.MemBwAvg, &job.NetBwAvg, &job.FileBwAvg, &job.LoadAvg); err != nil { &job.NumNodes, &job.NumHWThreads, &job.NumAcc, &job.Smt, &job.Exclusive, &job.MonitoringStatus,
&job.LoadAvg, &job.MemUsedMax, &job.FlopsAnyAvg, &job.MemBwAvg, &job.NetBwAvg, &job.FileBwAvg); err != nil {
return nil, err
}
if err := json.Unmarshal(rawResources, &job.Resources); err != nil {
return nil, err return nil, err
} }
@ -44,7 +55,6 @@ func ScanJob(row Scannable) (*model.Job, error) {
job.Duration = int(time.Since(job.StartTime).Seconds()) job.Duration = int(time.Since(job.StartTime).Seconds())
} }
job.Nodes = strings.Split(nodeList, ",")
return job, nil return job, nil
} }
@ -130,14 +140,14 @@ func buildWhereClause(filter *model.JobFilter, query sq.SelectBuilder) sq.Select
if filter.JobID != nil { if filter.JobID != nil {
query = buildStringCondition("job.job_id", filter.JobID, query) query = buildStringCondition("job.job_id", filter.JobID, query)
} }
if filter.UserID != nil { if filter.User != nil {
query = buildStringCondition("job.user_id", filter.UserID, query) query = buildStringCondition("job.user", filter.User, query)
} }
if filter.ProjectID != nil { if filter.Project != nil {
query = buildStringCondition("job.project_id", filter.ProjectID, query) query = buildStringCondition("job.project", filter.Project, query)
} }
if filter.ClusterID != nil { if filter.Cluster != nil {
query = buildStringCondition("job.cluster_id", filter.ClusterID, query) query = buildStringCondition("job.cluster", filter.Cluster, query)
} }
if filter.StartTime != nil { if filter.StartTime != nil {
query = buildTimeCondition("job.start_time", filter.StartTime, query) query = buildTimeCondition("job.start_time", filter.StartTime, query)
@ -145,12 +155,8 @@ func buildWhereClause(filter *model.JobFilter, query sq.SelectBuilder) sq.Select
if filter.Duration != nil { if filter.Duration != nil {
query = buildIntCondition("job.duration", filter.Duration, query) query = buildIntCondition("job.duration", filter.Duration, query)
} }
if filter.IsRunning != nil { if filter.JobState != nil {
if *filter.IsRunning { query = query.Where("job.job_state IN ?", filter.JobState)
query = query.Where("job.job_state = 'running'")
} else {
query = query.Where("job.job_state = 'completed'")
}
} }
if filter.NumNodes != nil { if filter.NumNodes != nil {
query = buildIntCondition("job.num_nodes", filter.NumNodes, query) query = buildIntCondition("job.num_nodes", filter.NumNodes, query)

View File

@ -1,78 +1,102 @@
type Job { type Job {
id: ID! # Database ID, unique Id: ID! # Database ID, unique
jobId: String! # ID given to the job by the cluster scheduler JobId: Int! # ID given to the job by the cluster scheduler
userId: String! # Username User: String! # Username
projectId: String! # Project Project: String! # Project
clusterId: String! # Name of the cluster this job was running on Cluster: String! # Name of the cluster this job was running on
startTime: Time! # RFC3339 formatted string StartTime: Time! # RFC3339 formatted string
duration: Int! # For running jobs, the time it has already run Duration: Int! # For running jobs, the time it has already run
numNodes: Int! # Number of nodes this job was running on NumNodes: Int! # Number of nodes this job was running on
nodes: [String!]! # List of hostnames NumHWThreads: Int!
hasProfile: Boolean! # TODO: Could be removed? NumAcc: Int!
state: JobState! # State of the job SMT: Int!
tags: [JobTag!]! # List of tags this job has Exclusive: Int!
Partition: String!
ArrayJobId: Int!
MonitoringStatus: Int!
State: JobState! # State of the job
Tags: [JobTag!]! # List of tags this job has
Resources: [JobResource!]! # List of hosts/hwthreads/gpus/...
# Will be null for running jobs. # Will be null for running jobs.
loadAvg: Float LoadAvg: Float
memUsedMax: Float MemUsedMax: Float
flopsAnyAvg: Float FlopsAnyAvg: Float
memBwAvg: Float MemBwAvg: Float
netBwAvg: Float NetBwAvg: Float
fileBwAvg: Float FileBwAvg: Float
}
# One entry of a job's Resources list (hosts/hwthreads/gpus/...).
type JobResource {
# Required for every entry.
Hostname: String!
# Nullable list: no trailing `!`, so it may be omitted for an entry.
HWThreads: [Int!]
# Nullable list: no trailing `!`, so it may be omitted for an entry.
Accelerators: [Accelerator!]
}
type Accelerator {
Id: String!
Type: String!
Model: String!
} }
# TODO: Extend by more possible states? # TODO: Extend by more possible states?
enum JobState { enum JobState {
running running
completed completed
failed
canceled
stopped
timeout
} }
type JobTag { type JobTag {
id: ID! # Database ID, unique Id: ID! # Database ID, unique
tagType: String! # Type TagType: String! # Type
tagName: String! # Name TagName: String! # Name
} }
type Cluster { type Cluster {
clusterID: String! ClusterID: String!
processorType: String! ProcessorType: String!
socketsPerNode: Int! SocketsPerNode: Int!
coresPerSocket: Int! CoresPerSocket: Int!
threadsPerCore: Int! ThreadsPerCore: Int!
flopRateScalar: Int! FlopRateScalar: Int!
flopRateSimd: Int! FlopRateSimd: Int!
memoryBandwidth: Int! MemoryBandwidth: Int!
metricConfig: [MetricConfig!]! MetricConfig: [MetricConfig!]!
filterRanges: FilterRanges! FilterRanges: FilterRanges!
} }
type MetricConfig { type MetricConfig {
name: String! Name: String!
unit: String! Unit: String!
sampletime: Int! Timestep: Int!
peak: Int! Peak: Int!
normal: Int! Normal: Int!
caution: Int! Caution: Int!
alert: Int! Alert: Int!
Scope: String!
} }
type JobMetric { type JobMetric {
unit: String! Unit: String!
scope: JobMetricScope! Scope: JobMetricScope!
timestep: Int! Timestep: Int!
series: [JobMetricSeries!]! Series: [JobMetricSeries!]!
} }
type JobMetricSeries { type JobMetricSeries {
node_id: String! Hostname: String!
statistics: JobMetricStatistics Id: Int
data: [NullableFloat!]! Statistics: JobMetricStatistics
Data: [NullableFloat!]!
} }
type JobMetricStatistics { type JobMetricStatistics {
avg: Float! Avg: Float!
min: Float! Min: Float!
max: Float! Max: Float!
} }
type JobMetricWithName { type JobMetricWithName {
@ -141,13 +165,13 @@ type FilterRanges {
input JobFilter { input JobFilter {
tags: [ID!] tags: [ID!]
jobId: StringInput jobId: StringInput
userId: StringInput user: StringInput
projectId: StringInput project: StringInput
clusterId: StringInput cluster: StringInput
duration: IntRange duration: IntRange
numNodes: IntRange numNodes: IntRange
startTime: TimeRange startTime: TimeRange
isRunning: Boolean jobState: [JobState!]
flopsAnyAvg: FloatRange flopsAnyAvg: FloatRange
memBwAvg: FloatRange memBwAvg: FloatRange
loadAvg: FloatRange loadAvg: FloatRange

View File

@ -15,9 +15,14 @@ import (
"github.com/ClusterCockpit/cc-jobarchive/graph/generated" "github.com/ClusterCockpit/cc-jobarchive/graph/generated"
"github.com/ClusterCockpit/cc-jobarchive/graph/model" "github.com/ClusterCockpit/cc-jobarchive/graph/model"
"github.com/ClusterCockpit/cc-jobarchive/metricdata" "github.com/ClusterCockpit/cc-jobarchive/metricdata"
"github.com/ClusterCockpit/cc-jobarchive/schema"
sq "github.com/Masterminds/squirrel" sq "github.com/Masterminds/squirrel"
) )
// ID resolves the Id field of an Accelerator.
//
// The resolver has not been implemented yet. It reports that as an ordinary
// error through the error result instead of panicking, so the caller can
// handle it like any other resolver failure. ctx and obj are currently unused.
func (r *acceleratorResolver) ID(ctx context.Context, obj *schema.Accelerator) (string, error) {
	return "", fmt.Errorf("not implemented")
}
func (r *jobResolver) Tags(ctx context.Context, obj *model.Job) ([]*model.JobTag, error) { func (r *jobResolver) Tags(ctx context.Context, obj *model.Job) ([]*model.JobTag, error) {
query := sq. query := sq.
Select("tag.id", "tag.tag_type", "tag.tag_name"). Select("tag.id", "tag.tag_type", "tag.tag_name").
@ -232,6 +237,9 @@ func (r *queryResolver) NodeMetrics(ctx context.Context, cluster string, nodes [
return res, nil return res, nil
} }
// Accelerator returns the generated.AcceleratorResolver implementation
// backed by this Resolver.
func (r *Resolver) Accelerator() generated.AcceleratorResolver {
	return &acceleratorResolver{Resolver: r}
}
// Job returns generated.JobResolver implementation. // Job returns generated.JobResolver implementation.
func (r *Resolver) Job() generated.JobResolver { return &jobResolver{r} } func (r *Resolver) Job() generated.JobResolver { return &jobResolver{r} }
@ -241,6 +249,7 @@ func (r *Resolver) Mutation() generated.MutationResolver { return &mutationResol
// Query returns generated.QueryResolver implementation. // Query returns generated.QueryResolver implementation.
func (r *Resolver) Query() generated.QueryResolver { return &queryResolver{r} } func (r *Resolver) Query() generated.QueryResolver { return &queryResolver{r} }
// acceleratorResolver embeds *Resolver; Resolver.Accelerator returns it as
// the generated.AcceleratorResolver implementation.
type acceleratorResolver struct{ *Resolver }
type jobResolver struct{ *Resolver } type jobResolver struct{ *Resolver }
type mutationResolver struct{ *Resolver } type mutationResolver struct{ *Resolver }
type queryResolver struct{ *Resolver } type queryResolver struct{ *Resolver }

View File

@ -8,13 +8,61 @@ import (
"log" "log"
"os" "os"
"path/filepath" "path/filepath"
"strings"
"time" "time"
"github.com/ClusterCockpit/cc-jobarchive/schema" "github.com/ClusterCockpit/cc-jobarchive/schema"
"github.com/jmoiron/sqlx" "github.com/jmoiron/sqlx"
) )
// JOBS_DB_SCHEMA is the SQL script that drops and recreates the "job", "tag"
// and "jobtag" tables. initDB executes it before repopulating the tables.
//
// The meta_data and resources columns hold JSON encoded as TEXT. start_time
// is declared BIGINT, like the other 64-bit integer columns.
const JOBS_DB_SCHEMA string = `
DROP TABLE IF EXISTS job;
DROP TABLE IF EXISTS tag;
DROP TABLE IF EXISTS jobtag;
CREATE TABLE job (
id INTEGER PRIMARY KEY AUTOINCREMENT, -- Not needed in sqlite
job_id BIGINT NOT NULL,
cluster VARCHAR(255) NOT NULL,
start_time BIGINT NOT NULL,
user VARCHAR(255) NOT NULL,
project VARCHAR(255) NOT NULL,
partition VARCHAR(255) NOT NULL,
array_job_id BIGINT NOT NULL,
duration INT,
job_state VARCHAR(255) CHECK(job_state IN ('running', 'completed', 'failed', 'canceled', 'stopped', 'timeout')) NOT NULL,
meta_data TEXT, -- json, but sqlite has no json type
resources TEXT NOT NULL, -- json, but sqlite has no json type
num_nodes INT NOT NULL,
num_hwthreads INT NOT NULL,
num_acc INT NOT NULL,
smt TINYINT CHECK(smt IN (0, 1 )) NOT NULL DEFAULT 1,
exclusive TINYINT CHECK(exclusive IN (0, 1, 2)) NOT NULL DEFAULT 1,
monitoring_status TINYINT CHECK(monitoring_status IN (0, 1 )) NOT NULL DEFAULT 1,
mem_used_max REAL NOT NULL DEFAULT 0.0,
flops_any_avg REAL NOT NULL DEFAULT 0.0,
mem_bw_avg REAL NOT NULL DEFAULT 0.0,
load_avg REAL NOT NULL DEFAULT 0.0,
net_bw_avg REAL NOT NULL DEFAULT 0.0,
net_data_vol_total REAL NOT NULL DEFAULT 0.0,
file_bw_avg REAL NOT NULL DEFAULT 0.0,
file_data_vol_total REAL NOT NULL DEFAULT 0.0);
CREATE TABLE tag (
id INTEGER PRIMARY KEY,
tag_type VARCHAR(255) NOT NULL,
tag_name VARCHAR(255) NOT NULL);
CREATE TABLE jobtag (
job_id INTEGER,
tag_id INTEGER,
PRIMARY KEY (job_id, tag_id),
FOREIGN KEY (job_id) REFERENCES job (id) ON DELETE CASCADE,
FOREIGN KEY (tag_id) REFERENCES tag (id) ON DELETE CASCADE);
`
// Delete the tables "job", "tag" and "jobtag" from the database and // Delete the tables "job", "tag" and "jobtag" from the database and
// repopulate them using the jobs found in `archive`. // repopulate them using the jobs found in `archive`.
func initDB(db *sqlx.DB, archive string) error { func initDB(db *sqlx.DB, archive string) error {
@ -22,39 +70,7 @@ func initDB(db *sqlx.DB, archive string) error {
fmt.Println("Building database...") fmt.Println("Building database...")
// Basic database structure: // Basic database structure:
_, err := db.Exec(` _, err := db.Exec(JOBS_DB_SCHEMA)
DROP TABLE IF EXISTS job;
DROP TABLE IF EXISTS tag;
DROP TABLE IF EXISTS jobtag;
CREATE TABLE job (
id INTEGER PRIMARY KEY,
job_id TEXT,
user_id TEXT,
project_id TEXT,
cluster_id TEXT,
start_time TIMESTAMP,
duration INTEGER,
job_state TEXT,
num_nodes INTEGER,
node_list TEXT,
metadata TEXT,
flops_any_avg REAL,
mem_bw_avg REAL,
net_bw_avg REAL,
file_bw_avg REAL,
load_avg REAL);
CREATE TABLE tag (
id INTEGER PRIMARY KEY,
tag_type TEXT,
tag_name TEXT);
CREATE TABLE jobtag (
job_id INTEGER,
tag_id INTEGER,
PRIMARY KEY (job_id, tag_id),
FOREIGN KEY (job_id) REFERENCES job (id) ON DELETE CASCADE ON UPDATE NO ACTION,
FOREIGN KEY (tag_id) REFERENCES tag (id) ON DELETE CASCADE ON UPDATE NO ACTION);`)
if err != nil { if err != nil {
return err return err
} }
@ -64,9 +80,17 @@ func initDB(db *sqlx.DB, archive string) error {
return err return err
} }
insertstmt, err := db.Prepare(`INSERT INTO job insertstmt, err := db.Prepare(`INSERT INTO job (
(job_id, user_id, project_id, cluster_id, start_time, duration, job_state, num_nodes, node_list, metadata, flops_any_avg, mem_bw_avg, net_bw_avg, file_bw_avg, load_avg) job_id, cluster, start_time,
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?);`) user, project, partition, array_job_id, duration, job_state, meta_data, resources,
num_nodes, num_hwthreads, num_acc, smt, exclusive, monitoring_status,
flops_any_avg, mem_bw_avg
) VALUES (
?, ?, ?,
?, ?, ?, ?, ?, ?, ?, ?,
?, ?, ?, ?, ?, ?,
?, ?
);`)
if err != nil { if err != nil {
return err return err
} }
@ -149,7 +173,7 @@ func initDB(db *sqlx.DB, archive string) error {
// Create indexes after inserts so that they do not // Create indexes after inserts so that they do not
// need to be continually updated. // need to be continually updated.
if _, err := db.Exec(` if _, err := db.Exec(`
CREATE INDEX job_by_user ON job (user_id); CREATE INDEX job_by_user ON job (user);
CREATE INDEX job_by_starttime ON job (start_time);`); err != nil { CREATE INDEX job_by_starttime ON job (start_time);`); err != nil {
return err return err
} }
@ -167,19 +191,27 @@ func loadJob(tx *sql.Tx, stmt *sql.Stmt, tags map[string]int64, path string) err
} }
defer f.Close() defer f.Close()
var job schema.JobMeta var job schema.JobMeta = schema.JobMeta{
Exclusive: 1,
}
if err := json.NewDecoder(bufio.NewReader(f)).Decode(&job); err != nil { if err := json.NewDecoder(bufio.NewReader(f)).Decode(&job); err != nil {
return err return err
} }
// TODO: Other metrics...
flopsAnyAvg := loadJobStat(&job, "flops_any") flopsAnyAvg := loadJobStat(&job, "flops_any")
memBwAvg := loadJobStat(&job, "mem_bw") memBwAvg := loadJobStat(&job, "mem_bw")
netBwAvg := loadJobStat(&job, "net_bw")
fileBwAvg := loadJobStat(&job, "file_bw")
loadAvg := loadJobStat(&job, "load_one")
res, err := stmt.Exec(job.JobId, job.UserId, job.ProjectId, job.ClusterId, job.StartTime, job.Duration, job.JobState, resources, err := json.Marshal(job.Resources)
job.NumNodes, strings.Join(job.Nodes, ","), nil, flopsAnyAvg, memBwAvg, netBwAvg, fileBwAvg, loadAvg) if err != nil {
return err
}
res, err := stmt.Exec(
job.JobId, job.Cluster, job.StartTime,
job.User, job.Project, job.Partition, job.ArrayJobId, job.Duration, job.JobState, job.MetaData, string(resources),
job.NumNodes, job.NumHWThreads, job.NumAcc, job.SMT, job.Exclusive, job.MonitoringStatus,
flopsAnyAvg, memBwAvg)
if err != nil { if err != nil {
return err return err
} }

View File

@ -11,7 +11,6 @@ import (
"path" "path"
"path/filepath" "path/filepath"
"strconv" "strconv"
"strings"
"github.com/ClusterCockpit/cc-jobarchive/config" "github.com/ClusterCockpit/cc-jobarchive/config"
"github.com/ClusterCockpit/cc-jobarchive/graph/model" "github.com/ClusterCockpit/cc-jobarchive/graph/model"
@ -21,19 +20,14 @@ import (
// For a given job, return the path of the `data.json`/`meta.json` file. // For a given job, return the path of the `data.json`/`meta.json` file.
// TODO: Implement Issue ClusterCockpit/ClusterCockpit#97 // TODO: Implement Issue ClusterCockpit/ClusterCockpit#97
func getPath(job *model.Job, file string, checkLegacy bool) (string, error) { func getPath(job *model.Job, file string, checkLegacy bool) (string, error) {
id, err := strconv.Atoi(strings.Split(job.JobID, ".")[0]) lvl1, lvl2 := fmt.Sprintf("%d", job.JobID/1000), fmt.Sprintf("%03d", job.JobID%1000)
if err != nil {
return "", err
}
lvl1, lvl2 := fmt.Sprintf("%d", id/1000), fmt.Sprintf("%03d", id%1000)
if !checkLegacy { if !checkLegacy {
return filepath.Join(JobArchivePath, job.ClusterID, lvl1, lvl2, strconv.FormatInt(job.StartTime.Unix(), 10), file), nil return filepath.Join(JobArchivePath, job.Cluster, lvl1, lvl2, strconv.FormatInt(job.StartTime.Unix(), 10), file), nil
} }
legacyPath := filepath.Join(JobArchivePath, job.ClusterID, lvl1, lvl2, file) legacyPath := filepath.Join(JobArchivePath, job.Cluster, lvl1, lvl2, file)
if _, err := os.Stat(legacyPath); errors.Is(err, os.ErrNotExist) { if _, err := os.Stat(legacyPath); errors.Is(err, os.ErrNotExist) {
return filepath.Join(JobArchivePath, job.ClusterID, lvl1, lvl2, strconv.FormatInt(job.StartTime.Unix(), 10), file), nil return filepath.Join(JobArchivePath, job.Cluster, lvl1, lvl2, strconv.FormatInt(job.StartTime.Unix(), 10), file), nil
} }
return legacyPath, nil return legacyPath, nil
@ -87,13 +81,13 @@ func UpdateTags(job *model.Job, tags []*model.JobTag) error {
f.Close() f.Close()
metaFile.Tags = make([]struct { metaFile.Tags = make([]struct {
Name string "json:\"name\"" Name string "json:\"Name\""
Type string "json:\"type\"" Type string "json:\"Type\""
}, 0) }, 0)
for _, tag := range tags { for _, tag := range tags {
metaFile.Tags = append(metaFile.Tags, struct { metaFile.Tags = append(metaFile.Tags, struct {
Name string "json:\"name\"" Name string "json:\"Name\""
Type string "json:\"type\"" Type string "json:\"Type\""
}{ }{
Name: tag.TagName, Name: tag.TagName,
Type: tag.TagType, Type: tag.TagType,
@ -143,7 +137,7 @@ func ArchiveJob(job *model.Job, ctx context.Context) (*schema.JobMeta, error) {
} }
allMetrics := make([]string, 0) allMetrics := make([]string, 0)
metricConfigs := config.GetClusterConfig(job.ClusterID).MetricConfig metricConfigs := config.GetClusterConfig(job.Cluster).MetricConfig
for _, mc := range metricConfigs { for _, mc := range metricConfigs {
allMetrics = append(allMetrics, mc.Name) allMetrics = append(allMetrics, mc.Name)
} }
@ -153,13 +147,13 @@ func ArchiveJob(job *model.Job, ctx context.Context) (*schema.JobMeta, error) {
} }
tags := []struct { tags := []struct {
Name string `json:"name"` Name string `json:"Name"`
Type string `json:"type"` Type string `json:"Type"`
}{} }{}
for _, tag := range job.Tags { for _, tag := range job.Tags {
tags = append(tags, struct { tags = append(tags, struct {
Name string `json:"name"` Name string `json:"Name"`
Type string `json:"type"` Type string `json:"Type"`
}{ }{
Name: tag.TagName, Name: tag.TagName,
Type: tag.TagType, Type: tag.TagType,
@ -167,14 +161,23 @@ func ArchiveJob(job *model.Job, ctx context.Context) (*schema.JobMeta, error) {
} }
metaData := &schema.JobMeta{ metaData := &schema.JobMeta{
JobId: job.JobID, JobId: int64(job.JobID),
UserId: job.UserID, User: job.User,
ClusterId: job.ClusterID, Project: job.Project,
Cluster: job.Cluster,
NumNodes: job.NumNodes, NumNodes: job.NumNodes,
JobState: job.State.String(), NumHWThreads: job.NumHWThreads,
NumAcc: job.NumAcc,
Exclusive: int8(job.Exclusive),
MonitoringStatus: int8(job.MonitoringStatus),
SMT: int8(job.Smt),
Partition: job.Partition,
ArrayJobId: job.ArrayJobID,
JobState: string(job.State),
StartTime: job.StartTime.Unix(), StartTime: job.StartTime.Unix(),
Duration: int64(job.Duration), Duration: int64(job.Duration),
Nodes: job.Nodes, Resources: job.Resources,
MetaData: "", // TODO/FIXME: Handle `meta_data`!
Tags: tags, Tags: tags,
Statistics: make(map[string]*schema.JobMetaStatistics), Statistics: make(map[string]*schema.JobMetaStatistics),
} }
@ -188,7 +191,7 @@ func ArchiveJob(job *model.Job, ctx context.Context) (*schema.JobMeta, error) {
} }
metaData.Statistics[metric] = &schema.JobMetaStatistics{ metaData.Statistics[metric] = &schema.JobMetaStatistics{
Unit: config.GetMetricConfig(job.ClusterID, metric).Unit, Unit: config.GetMetricConfig(job.Cluster, metric).Unit,
Avg: avg / float64(job.NumNodes), Avg: avg / float64(job.NumNodes),
Min: min, Min: min,
Max: max, Max: max,

View File

@ -61,8 +61,13 @@ func (ccms *CCMetricStore) doRequest(job *model.Job, suffix string, metrics []st
from, to := job.StartTime.Unix(), job.StartTime.Add(time.Duration(job.Duration)*time.Second).Unix() from, to := job.StartTime.Unix(), job.StartTime.Add(time.Duration(job.Duration)*time.Second).Unix()
reqBody := ApiRequestBody{} reqBody := ApiRequestBody{}
reqBody.Metrics = metrics reqBody.Metrics = metrics
for _, node := range job.Nodes { for _, node := range job.Resources {
reqBody.Selectors = append(reqBody.Selectors, []string{job.ClusterID, node}) if node.Accelerators != nil || node.HWThreads != nil {
// TODO/FIXME:
return nil, errors.New("todo: cc-metric-store resources: Accelerator/HWThreads")
}
reqBody.Selectors = append(reqBody.Selectors, []string{job.Cluster, node.Hostname})
} }
reqBodyBytes, err := json.Marshal(reqBody) reqBodyBytes, err := json.Marshal(reqBody)
@ -86,32 +91,37 @@ func (ccms *CCMetricStore) LoadData(job *model.Job, metrics []string, ctx contex
return nil, err return nil, err
} }
resdata := make([]map[string]ApiMetricData, 0, len(job.Nodes)) resdata := make([]map[string]ApiMetricData, 0, len(job.Resources))
if err := json.NewDecoder(res.Body).Decode(&resdata); err != nil { if err := json.NewDecoder(res.Body).Decode(&resdata); err != nil {
return nil, err return nil, err
} }
var jobData schema.JobData = make(schema.JobData) var jobData schema.JobData = make(schema.JobData)
for _, metric := range metrics { for _, metric := range metrics {
mc := config.GetMetricConfig(job.ClusterID, metric) mc := config.GetMetricConfig(job.Cluster, metric)
metricData := &schema.JobMetric{ metricData := &schema.JobMetric{
Scope: "node", // TODO: FIXME: Whatever... Scope: "node", // TODO: FIXME: Whatever...
Unit: mc.Unit, Unit: mc.Unit,
Timestep: mc.Sampletime, Timestep: mc.Timestep,
Series: make([]*schema.MetricSeries, 0, len(job.Nodes)), Series: make([]*schema.MetricSeries, 0, len(job.Resources)),
} }
for i, node := range job.Nodes { for i, node := range job.Resources {
if node.Accelerators != nil || node.HWThreads != nil {
// TODO/FIXME:
return nil, errors.New("todo: cc-metric-store resources: Accelerator/HWThreads")
}
data := resdata[i][metric] data := resdata[i][metric]
if data.Error != nil { if data.Error != nil {
return nil, errors.New(*data.Error) return nil, errors.New(*data.Error)
} }
if data.Avg == nil || data.Min == nil || data.Max == nil { if data.Avg == nil || data.Min == nil || data.Max == nil {
return nil, fmt.Errorf("no data for node '%s' and metric '%s'", node, metric) return nil, fmt.Errorf("no data for node '%s' and metric '%s'", node.Hostname, metric)
} }
metricData.Series = append(metricData.Series, &schema.MetricSeries{ metricData.Series = append(metricData.Series, &schema.MetricSeries{
NodeID: node, Hostname: node.Hostname,
Data: data.Data, Data: data.Data,
Statistics: &schema.MetricStatistics{ Statistics: &schema.MetricStatistics{
Avg: *data.Avg, Avg: *data.Avg,
@ -132,7 +142,7 @@ func (ccms *CCMetricStore) LoadStats(job *model.Job, metrics []string, ctx conte
return nil, err return nil, err
} }
resdata := make([]map[string]ApiStatsData, 0, len(job.Nodes)) resdata := make([]map[string]ApiStatsData, 0, len(job.Resources))
if err := json.NewDecoder(res.Body).Decode(&resdata); err != nil { if err := json.NewDecoder(res.Body).Decode(&resdata); err != nil {
return nil, err return nil, err
} }
@ -140,17 +150,22 @@ func (ccms *CCMetricStore) LoadStats(job *model.Job, metrics []string, ctx conte
stats := map[string]map[string]schema.MetricStatistics{} stats := map[string]map[string]schema.MetricStatistics{}
for _, metric := range metrics { for _, metric := range metrics {
nodestats := map[string]schema.MetricStatistics{} nodestats := map[string]schema.MetricStatistics{}
for i, node := range job.Nodes { for i, node := range job.Resources {
if node.Accelerators != nil || node.HWThreads != nil {
// TODO/FIXME:
return nil, errors.New("todo: cc-metric-store resources: Accelerator/HWThreads")
}
data := resdata[i][metric] data := resdata[i][metric]
if data.Error != nil { if data.Error != nil {
return nil, errors.New(*data.Error) return nil, errors.New(*data.Error)
} }
if data.Samples == 0 { if data.Samples == 0 {
return nil, fmt.Errorf("no data for node '%s' and metric '%s'", node, metric) return nil, fmt.Errorf("no data for node '%s' and metric '%s'", node.Hostname, metric)
} }
nodestats[node] = schema.MetricStatistics{ nodestats[node.Hostname] = schema.MetricStatistics{
Avg: float64(data.Avg), Avg: float64(data.Avg),
Min: float64(data.Min), Min: float64(data.Min),
Max: float64(data.Max), Max: float64(data.Max),

View File

@ -2,6 +2,7 @@ package metricdata
import ( import (
"context" "context"
"errors"
"fmt" "fmt"
"log" "log"
"os" "os"
@ -46,9 +47,14 @@ func (idb *InfluxDBv2DataRepository) LoadData(job *model.Job, metrics []string,
} }
fieldsCond := strings.Join(fieldsConds, " or ") fieldsCond := strings.Join(fieldsConds, " or ")
hostsConds := make([]string, 0, len(job.Nodes)) hostsConds := make([]string, 0, len(job.Resources))
for _, h := range job.Nodes { for _, h := range job.Resources {
hostsConds = append(hostsConds, fmt.Sprintf(`r.host == "%s"`, h)) if h.HWThreads != nil || h.Accelerators != nil {
// TODO/FIXME...
return nil, errors.New("the InfluxDB metric data repository does not support HWThreads or Accelerators")
}
hostsConds = append(hostsConds, fmt.Sprintf(`r.host == "%s"`, h.Hostname))
} }
hostsCond := strings.Join(hostsConds, " or ") hostsCond := strings.Join(hostsConds, " or ")
@ -72,18 +78,18 @@ func (idb *InfluxDBv2DataRepository) LoadData(job *model.Job, metrics []string,
field, host := row.Field(), row.ValueByKey("host").(string) field, host := row.Field(), row.ValueByKey("host").(string)
jobMetric, ok := jobData[field] jobMetric, ok := jobData[field]
if !ok { if !ok {
mc := config.GetMetricConfig(job.ClusterID, field) mc := config.GetMetricConfig(job.Cluster, field)
jobMetric = &schema.JobMetric{ jobMetric = &schema.JobMetric{
Scope: "node", // TODO: FIXME: Whatever... Scope: "node", // TODO: FIXME: Whatever...
Unit: mc.Unit, Unit: mc.Unit,
Timestep: mc.Sampletime, Timestep: mc.Timestep,
Series: make([]*schema.MetricSeries, 0, len(job.Nodes)), Series: make([]*schema.MetricSeries, 0, len(job.Resources)),
} }
jobData[field] = jobMetric jobData[field] = jobMetric
} }
currentSeries = &schema.MetricSeries{ currentSeries = &schema.MetricSeries{
NodeID: host, Hostname: host,
Statistics: nil, Statistics: nil,
Data: make([]schema.Float, 0), Data: make([]schema.Float, 0),
} }
@ -102,7 +108,7 @@ func (idb *InfluxDBv2DataRepository) LoadData(job *model.Job, metrics []string,
jobMetric := jobData[metric] jobMetric := jobData[metric]
for node, stats := range nodes { for node, stats := range nodes {
for _, series := range jobMetric.Series { for _, series := range jobMetric.Series {
if series.NodeID == node { if series.Hostname == node {
series.Statistics = &stats series.Statistics = &stats
} }
} }
@ -115,9 +121,14 @@ func (idb *InfluxDBv2DataRepository) LoadData(job *model.Job, metrics []string,
func (idb *InfluxDBv2DataRepository) LoadStats(job *model.Job, metrics []string, ctx context.Context) (map[string]map[string]schema.MetricStatistics, error) { func (idb *InfluxDBv2DataRepository) LoadStats(job *model.Job, metrics []string, ctx context.Context) (map[string]map[string]schema.MetricStatistics, error) {
stats := map[string]map[string]schema.MetricStatistics{} stats := map[string]map[string]schema.MetricStatistics{}
hostsConds := make([]string, 0, len(job.Nodes)) hostsConds := make([]string, 0, len(job.Resources))
for _, h := range job.Nodes { for _, h := range job.Resources {
hostsConds = append(hostsConds, fmt.Sprintf(`r.host == "%s"`, h)) if h.HWThreads != nil || h.Accelerators != nil {
// TODO/FIXME...
return nil, errors.New("the InfluxDB metric data repository does not support HWThreads or Accelerators")
}
hostsConds = append(hostsConds, fmt.Sprintf(`r.host == "%s"`, h.Hostname))
} }
hostsCond := strings.Join(hostsConds, " or ") hostsCond := strings.Join(hostsConds, " or ")

View File

@ -59,9 +59,9 @@ func Init(jobArchivePath string, disableArchive bool) error {
// Fetches the metric data for a job. // Fetches the metric data for a job.
func LoadData(job *model.Job, metrics []string, ctx context.Context) (schema.JobData, error) { func LoadData(job *model.Job, metrics []string, ctx context.Context) (schema.JobData, error) {
if job.State == model.JobStateRunning || !useArchive { if job.State == model.JobStateRunning || !useArchive {
repo, ok := metricDataRepos[job.ClusterID] repo, ok := metricDataRepos[job.Cluster]
if !ok { if !ok {
return nil, fmt.Errorf("no metric data repository configured for '%s'", job.ClusterID) return nil, fmt.Errorf("no metric data repository configured for '%s'", job.Cluster)
} }
return repo.LoadData(job, metrics, ctx) return repo.LoadData(job, metrics, ctx)
@ -90,9 +90,9 @@ func LoadAverages(job *model.Job, metrics []string, data [][]schema.Float, ctx c
return loadAveragesFromArchive(job, metrics, data) return loadAveragesFromArchive(job, metrics, data)
} }
repo, ok := metricDataRepos[job.ClusterID] repo, ok := metricDataRepos[job.Cluster]
if !ok { if !ok {
return fmt.Errorf("no metric data repository configured for '%s'", job.ClusterID) return fmt.Errorf("no metric data repository configured for '%s'", job.Cluster)
} }
stats, err := repo.LoadStats(job, metrics, ctx) stats, err := repo.LoadStats(job, metrics, ctx)

View File

@ -9,10 +9,10 @@ import (
// JobData maps a string key to the JobMetric stored under it.
// NOTE(review): the key looks like the metric name — confirm against callers.
type JobData map[string]*JobMetric type JobData map[string]*JobMetric
// JobMetric groups the unit, scope, timestep and list of series of one metric.
// The JSON keys of all four fields change from lower-case to capitalized names.
type JobMetric struct { type JobMetric struct {
Unit string `json:"unit"` Unit string `json:"Unit"`
Scope MetricScope `json:"scope"` Scope MetricScope `json:"Scope"`
Timestep int `json:"timestep"` Timestep int `json:"Timestep"`
Series []*MetricSeries `json:"series"` Series []*MetricSeries `json:"Series"`
} }
type MetricScope string type MetricScope string
@ -41,38 +41,59 @@ func (e MetricScope) MarshalGQL(w io.Writer) {
} }
// MetricStatistics holds an average, a minimum and a maximum as float64 values.
type MetricStatistics struct { type MetricStatistics struct {
Avg float64 `json:"avg"` Avg float64 `json:"Avg"`
Min float64 `json:"min"` Min float64 `json:"Min"`
Max float64 `json:"max"` Max float64 `json:"Max"`
} }
// MetricSeries is one data series of a metric, identified by Hostname
// (previously NodeID). Statistics is a pointer and may be nil; Data holds
// the series values.
// NOTE(review): the meaning of the new Id field is not visible here — confirm.
type MetricSeries struct { type MetricSeries struct {
NodeID string `json:"node_id"` Hostname string `json:"Hostname"`
Statistics *MetricStatistics `json:"statistics"` Id int `json:"Id"`
Data []Float `json:"data"` Statistics *MetricStatistics `json:"Statistics"`
Data []Float `json:"Data"`
} }
// JobMetaStatistics holds a unit together with an average, minimum and
// maximum value; it is the value type of the JobMeta Statistics map.
type JobMetaStatistics struct { type JobMetaStatistics struct {
Unit string `json:"unit"` Unit string `json:"Unit"`
Avg float64 `json:"avg"` Avg float64 `json:"Avg"`
Min float64 `json:"min"` Min float64 `json:"Min"`
Max float64 `json:"max"` Max float64 `json:"Max"`
}
// Accelerator describes one accelerator by numeric ID, type and model.
// Note that the ID field is serialized under the JSON key "Id".
type Accelerator struct {
ID int `json:"Id"`
Type string `json:"Type"`
Model string `json:"Model"`
}
// JobResource names a host and, optionally, the hardware threads and
// accelerators on it. HWThreads and Accelerators are left out of the JSON
// output when empty (omitempty).
type JobResource struct {
Hostname string `json:"Hostname"`
HWThreads []int `json:"HWThreads,omitempty"`
Accelerators []Accelerator `json:"Accelerators,omitempty"`
} }
// JobMeta is the Go representation of a job's `meta.json` file. Compared to
// the previous layout, JobId becomes an int64, the list of node names is
// replaced by Resources, and all JSON keys switch to capitalized names.
// NOTE(review): Statistics is presumably keyed by metric name — confirm.
// Format of `meta.json` files. // Format of `meta.json` files.
type JobMeta struct { type JobMeta struct {
JobId string `json:"job_id"` JobId int64 `json:"JobId"`
UserId string `json:"user_id"` User string `json:"User"`
ProjectId string `json:"project_id"` Project string `json:"Project"`
ClusterId string `json:"cluster_id"` Cluster string `json:"Cluster"`
NumNodes int `json:"num_nodes"` NumNodes int `json:"NumNodes"`
JobState string `json:"job_state"` NumHWThreads int `json:"NumHWThreads"`
StartTime int64 `json:"start_time"` NumAcc int `json:"NumAcc"`
Duration int64 `json:"duration"` Exclusive int8 `json:"Exclusive"`
Nodes []string `json:"nodes"` MonitoringStatus int8 `json:"MonitoringStatus"`
SMT int8 `json:"SMT"`
Partition string `json:"Partition"`
ArrayJobId int `json:"ArrayJobId"`
JobState string `json:"JobState"`
StartTime int64 `json:"StartTime"`
Duration int64 `json:"Duration"`
Resources []*JobResource `json:"Resources"`
MetaData string `json:"MetaData"`
Tags []struct { Tags []struct {
Name string `json:"name"` Name string `json:"Name"`
Type string `json:"type"` Type string `json:"Type"`
} `json:"tags"` } `json:"Tags"`
Statistics map[string]*JobMetaStatistics `json:"statistics"` Statistics map[string]*JobMetaStatistics `json:"Statistics"`
} }

View File

@ -308,12 +308,12 @@ func monitoringRoutes(router *mux.Router, resolver *graph.Resolver) {
} }
templates.Render(rw, r, "monitoring/job/", &templates.Page{ templates.Render(rw, r, "monitoring/job/", &templates.Page{
Title: fmt.Sprintf("Job %s - ClusterCockpit", job.JobID), Title: fmt.Sprintf("Job %d - ClusterCockpit", job.JobID),
Config: conf, Config: conf,
Infos: map[string]interface{}{ Infos: map[string]interface{}{
"id": id, "id": id,
"jobId": job.JobID, "jobId": job.JobID,
"clusterId": job.ClusterID, "clusterId": job.Cluster,
}, },
}) })
}) })