all schemas new

This commit is contained in:
Lou Knauer 2021-12-17 15:49:22 +01:00
parent 89333666b3
commit 5403177edc
20 changed files with 3175 additions and 2383 deletions

View File

@ -1,5 +1,7 @@
# ClusterCockpit with a Golang backend
__*DOES NOT WORK WITH CURRENT FRONTEND*__
[![Build](https://github.com/ClusterCockpit/cc-jobarchive/actions/workflows/test.yml/badge.svg)](https://github.com/ClusterCockpit/cc-jobarchive/actions/workflows/test.yml)
### Run server
@ -11,11 +13,6 @@ git clone --recursive git@github.com:ClusterCockpit/cc-jobarchive.git
# Prepare frontend
cd ./cc-jobarchive/frontend
yarn install
export CCFRONTEND_ROLLUP_INTRO='
const JOBVIEW_URL = job => `/monitoring/job/${job.id}`;
const USERVIEW_URL = userId => `/monitoring/user/${userId}`;
const TAG_URL = tag => `/monitoring/jobs/?tag=${tag.id}`;
'
yarn build
cd ..

View File

@ -2,17 +2,15 @@ package api
import (
"context"
"database/sql"
"encoding/json"
"fmt"
"log"
"net/http"
"strings"
"github.com/ClusterCockpit/cc-jobarchive/config"
"github.com/ClusterCockpit/cc-jobarchive/graph"
"github.com/ClusterCockpit/cc-jobarchive/graph/model"
"github.com/ClusterCockpit/cc-jobarchive/metricdata"
"github.com/ClusterCockpit/cc-jobarchive/schema"
sq "github.com/Masterminds/squirrel"
"github.com/gorilla/mux"
"github.com/jmoiron/sqlx"
@ -33,18 +31,6 @@ func (api *RestApi) MountRoutes(r *mux.Router) {
r.HandleFunc("/api/jobs/tag_job/{id}", api.tagJob).Methods(http.MethodPost, http.MethodPatch)
}
// TODO/FIXME: UPDATE API!
type StartJobApiRequest struct {
JobId int64 `json:"jobId"`
UserId string `json:"userId"`
ClusterId string `json:"clusterId"`
StartTime int64 `json:"startTime"`
MetaData string `json:"metaData"`
ProjectId string `json:"projectId"`
Nodes []string `json:"nodes"`
NodeList string `json:"nodeList"`
}
type StartJobApiRespone struct {
DBID int64 `json:"id"`
}
@ -53,15 +39,12 @@ type StopJobApiRequest struct {
// JobId, ClusterId and StartTime are optional.
// They are only used if no database id was provided.
JobId *string `json:"jobId"`
ClusterId *string `json:"clusterId"`
Cluster *string `json:"clusterId"`
StartTime *int64 `json:"startTime"`
// Payload
StopTime int64 `json:"stopTime"`
}
type StopJobApiRespone struct {
DBID string `json:"id"`
State schema.JobState `json:"jobState"`
}
type TagJobApiRequest []*struct {
@ -110,7 +93,7 @@ func (api *RestApi) tagJob(rw http.ResponseWriter, r *http.Request) {
}
for _, tag := range req {
var tagId string
var tagId int64
if err := sq.Select("id").From("tag").
Where("tag.tag_type = ?", tag.Type).Where("tag.tag_name = ?", tag.Name).
RunWith(api.DB).QueryRow().Scan(&tagId); err != nil {
@ -123,10 +106,10 @@ func (api *RestApi) tagJob(rw http.ResponseWriter, r *http.Request) {
return
}
job.Tags = append(job.Tags, &model.JobTag{
job.Tags = append(job.Tags, &schema.Tag{
ID: tagId,
TagType: tag.Type,
TagName: tag.Name,
Type: tag.Type,
Name: tag.Name,
})
}
@ -136,31 +119,25 @@ func (api *RestApi) tagJob(rw http.ResponseWriter, r *http.Request) {
}
func (api *RestApi) startJob(rw http.ResponseWriter, r *http.Request) {
req := StartJobApiRequest{}
req := schema.JobMeta{BaseJob: schema.JobDefaults}
if err := json.NewDecoder(r.Body).Decode(&req); err != nil {
http.Error(rw, err.Error(), http.StatusBadRequest)
return
}
if config.GetClusterConfig(req.ClusterId) == nil {
http.Error(rw, fmt.Sprintf("cluster '%s' does not exist", req.ClusterId), http.StatusBadRequest)
if config.GetClusterConfig(req.Cluster) == nil {
http.Error(rw, fmt.Sprintf("cluster '%s' does not exist", req.Cluster), http.StatusBadRequest)
return
}
if req.Nodes == nil {
req.Nodes = strings.Split(req.NodeList, "|")
if len(req.Nodes) == 1 {
req.Nodes = strings.Split(req.NodeList, ",")
}
}
if len(req.Nodes) == 0 || len(req.Nodes[0]) == 0 || len(req.UserId) == 0 {
if len(req.Resources) == 0 || len(req.User) == 0 || req.NumNodes == 0 {
http.Error(rw, "required fields are missing", http.StatusBadRequest)
return
}
// Check if combination of (job_id, cluster_id, start_time) already exists:
rows, err := api.DB.Query(`SELECT job.id FROM job WHERE job.job_id = ? AND job.cluster_id = ? AND job.start_time = ?`,
req.JobId, req.ClusterId, req.StartTime)
rows, err := api.DB.Query(`SELECT job.id FROM job WHERE job.job_id = ? AND job.cluster = ? AND job.start_time = ?`,
req.JobID, req.Cluster, req.StartTime)
if err != nil {
http.Error(rw, err.Error(), http.StatusInternalServerError)
return
@ -173,9 +150,12 @@ func (api *RestApi) startJob(rw http.ResponseWriter, r *http.Request) {
return
}
res, err := api.DB.Exec(
`INSERT INTO job (job_id, user_id, project_id, cluster_id, start_time, duration, job_state, num_nodes, node_list, metadata) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?);`,
req.JobId, req.UserId, req.ProjectId, req.ClusterId, req.StartTime, 0, model.JobStateRunning, len(req.Nodes), strings.Join(req.Nodes, ","), req.MetaData)
req.RawResources, err = json.Marshal(req.Resources)
if err != nil {
log.Fatal(err)
}
res, err := api.DB.NamedExec(schema.JobInsertStmt, req)
if err != nil {
http.Error(rw, err.Error(), http.StatusInternalServerError)
return
@ -187,7 +167,7 @@ func (api *RestApi) startJob(rw http.ResponseWriter, r *http.Request) {
return
}
log.Printf("new job (id: %d): clusterId=%s, jobId=%d, userId=%s, startTime=%d, nodes=%v\n", id, req.ClusterId, req.JobId, req.UserId, req.StartTime, req.Nodes)
log.Printf("new job (id: %d): cluster=%s, jobId=%d, user=%s, startTime=%d\n", id, req.Cluster, req.JobID, req.User, req.StartTime)
rw.Header().Add("Content-Type", "application/json")
rw.WriteHeader(http.StatusCreated)
json.NewEncoder(rw).Encode(StartJobApiRespone{
@ -203,66 +183,89 @@ func (api *RestApi) stopJob(rw http.ResponseWriter, r *http.Request) {
}
var err error
var job *model.Job
var sql string
var args []interface{}
id, ok := mux.Vars(r)["id"]
if ok {
job, err = graph.ScanJob(sq.Select(graph.JobTableCols...).From("job").Where("job.id = ?", id).RunWith(api.DB).QueryRow())
sql, args, err = sq.Select(schema.JobColumns...).From("job").Where("job.id = ?", id).ToSql()
} else {
job, err = graph.ScanJob(sq.Select(graph.JobTableCols...).From("job").
sql, args, err = sq.Select(schema.JobColumns...).From("job").
Where("job.job_id = ?", req.JobId).
Where("job.cluster_id = ?", req.ClusterId).
Where("job.start_time = ?", req.StartTime).
RunWith(api.DB).QueryRow())
Where("job.cluster = ?", req.Cluster).
Where("job.start_time = ?", req.StartTime).ToSql()
}
if err != nil {
http.Error(rw, err.Error(), http.StatusNotFound)
http.Error(rw, err.Error(), http.StatusBadRequest)
return
}
job, err := schema.ScanJob(api.DB.QueryRowx(sql, args...))
if err != nil {
http.Error(rw, err.Error(), http.StatusBadRequest)
return
}
if job == nil || job.StartTime.Unix() >= req.StopTime || job.State != model.JobStateRunning {
if job == nil || job.StartTime.Unix() >= req.StopTime || job.State != schema.JobStateRunning {
http.Error(rw, "stop_time must be larger than start_time and only running jobs can be stopped", http.StatusBadRequest)
return
}
doArchiving := func(job *model.Job, ctx context.Context) error {
job.Duration = int(req.StopTime - job.StartTime.Unix())
if req.State != "" && !req.State.Valid() {
http.Error(rw, fmt.Sprintf("invalid job state: '%s'", req.State), http.StatusBadRequest)
return
} else {
req.State = schema.JobStateCompleted
}
doArchiving := func(job *schema.Job, ctx context.Context) error {
job.Duration = int32(req.StopTime - job.StartTime.Unix())
jobMeta, err := metricdata.ArchiveJob(job, ctx)
if err != nil {
log.Printf("archiving job (id: %s) failed: %s\n", job.ID, err.Error())
log.Printf("archiving job (dbid: %d) failed: %s\n", job.ID, err.Error())
return err
}
getAvg := func(metric string) sql.NullFloat64 {
stats, ok := jobMeta.Statistics[metric]
if !ok {
return sql.NullFloat64{Valid: false}
stmt := sq.Update("job").
Set("job_state", req.State).
Set("duration", job.Duration).
Where("job.id = ?", job.ID)
for metric, stats := range jobMeta.Statistics {
switch metric {
case "flops_any":
stmt = stmt.Set("flops_any_avg", stats.Avg)
case "mem_used":
stmt = stmt.Set("mem_used_max", stats.Max)
case "mem_bw":
stmt = stmt.Set("mem_bw_avg", stats.Avg)
case "load":
stmt = stmt.Set("load_avg", stats.Avg)
case "net_bw":
stmt = stmt.Set("net_bw_avg", stats.Avg)
case "file_bw":
stmt = stmt.Set("file_bw_avg", stats.Avg)
}
return sql.NullFloat64{Valid: true, Float64: stats.Avg}
}
if _, err := api.DB.Exec(
`UPDATE job SET
job_state = ?, duration = ?,
flops_any_avg = ?, mem_bw_avg = ?, net_bw_avg = ?, file_bw_avg = ?, load_avg = ?
WHERE job.id = ?`,
model.JobStateCompleted, job.Duration,
getAvg("flops_any"), getAvg("mem_bw"), getAvg("net_bw"), getAvg("file_bw"), getAvg("load"),
job.ID); err != nil {
log.Printf("archiving job (id: %s) failed: %s\n", job.ID, err.Error())
sql, args, err := stmt.ToSql()
if err != nil {
log.Printf("archiving job (dbid: %d) failed: %s\n", job.ID, err.Error())
return err
}
log.Printf("job stopped and archived (id: %s)\n", job.ID)
if _, err := api.DB.Exec(sql, args...); err != nil {
log.Printf("archiving job (dbid: %d) failed: %s\n", job.ID, err.Error())
return err
}
log.Printf("job stopped and archived (dbid: %d)\n", job.ID)
return nil
}
log.Printf("archiving job... (id: %s): clusterId=%s, jobId=%d, userId=%s, startTime=%s\n", job.ID, job.Cluster, job.JobID, job.User, job.StartTime)
log.Printf("archiving job... (dbid: %d): cluster=%s, jobId=%d, user=%s, startTime=%s\n", job.ID, job.Cluster, job.JobID, job.User, job.StartTime)
if api.AsyncArchiving {
rw.Header().Add("Content-Type", "application/json")
rw.WriteHeader(http.StatusOK)
json.NewEncoder(rw).Encode(StopJobApiRespone{
DBID: job.ID,
})
json.NewEncoder(rw).Encode(job)
go doArchiving(job, context.Background())
} else {
err := doArchiving(job, r.Context())

View File

@ -46,8 +46,8 @@ func Init(usersdb *sqlx.DB, authEnabled bool, uiConfig map[string]interface{}, j
cluster.FilterRanges.StartTime.To = time.Unix(0, 0)
}
if cluster.ClusterID != de.Name() {
return fmt.Errorf("the file '%s/cluster.json' contains the clusterId '%s'", de.Name(), cluster.ClusterID)
if cluster.Name != de.Name() {
return fmt.Errorf("the file '%s/cluster.json' contains the clusterId '%s'", de.Name(), cluster.Name)
}
Clusters = append(Clusters, &cluster)
@ -149,7 +149,7 @@ func ServeConfig(rw http.ResponseWriter, r *http.Request) {
func GetClusterConfig(cluster string) *model.Cluster {
for _, c := range Clusters {
if c.ClusterID == cluster {
if c.Name == cluster {
return c
}
}
@ -158,7 +158,7 @@ func GetClusterConfig(cluster string) *model.Cluster {
func GetMetricConfig(cluster, metric string) *model.MetricConfig {
for _, c := range Clusters {
if c.ClusterID == cluster {
if c.Name == cluster {
for _, m := range c.MetricConfig {
if m.Name == metric {
return m

View File

@ -55,21 +55,19 @@ models:
- github.com/99designs/gqlgen/graphql.Int64
- github.com/99designs/gqlgen/graphql.Int32
Job:
model: "github.com/ClusterCockpit/cc-jobarchive/schema.Job"
fields:
Tags:
tags:
resolver: true
JobMetric:
model: "github.com/ClusterCockpit/cc-jobarchive/schema.JobMetric"
JobMetricSeries:
model: "github.com/ClusterCockpit/cc-jobarchive/schema.MetricSeries"
JobMetricStatistics:
model: "github.com/ClusterCockpit/cc-jobarchive/schema.MetricStatistics"
NullableFloat:
model: "github.com/ClusterCockpit/cc-jobarchive/schema.Float"
JobMetricScope:
model: "github.com/ClusterCockpit/cc-jobarchive/schema.MetricScope"
JobResource:
model: "github.com/ClusterCockpit/cc-jobarchive/schema.JobResource"
Accelerator:
model: "github.com/ClusterCockpit/cc-jobarchive/schema.Accelerator"
NullableFloat: { model: "github.com/ClusterCockpit/cc-jobarchive/schema.Float" }
MetricScope: { model: "github.com/ClusterCockpit/cc-jobarchive/schema.MetricScope" }
JobStatistics: { model: "github.com/ClusterCockpit/cc-jobarchive/schema.JobStatistics" }
Tag: { model: "github.com/ClusterCockpit/cc-jobarchive/schema.Tag" }
Resource: { model: "github.com/ClusterCockpit/cc-jobarchive/schema.Resource" }
JobState: { model: "github.com/ClusterCockpit/cc-jobarchive/schema.JobState" }
JobMetric: { model: "github.com/ClusterCockpit/cc-jobarchive/schema.JobMetric" }
Series: { model: "github.com/ClusterCockpit/cc-jobarchive/schema.Series" }
MetricStatistics: { model: "github.com/ClusterCockpit/cc-jobarchive/schema.MetricStatistics" }
StatsSeries: { model: "github.com/ClusterCockpit/cc-jobarchive/schema.StatsSeries" }

File diff suppressed because it is too large Load Diff

View File

@ -1,26 +1,17 @@
package model
// Go look at `gqlgen.yml` and the schema package for other non-generated models.
type JobTag struct {
ID string `json:"id" db:"id"`
TagType string `json:"tagType" db:"tag_type"`
TagName string `json:"tagName" db:"tag_name"`
}
type Cluster struct {
ClusterID string `json:"clusterID"`
ProcessorType string `json:"processorType"`
SocketsPerNode int `json:"socketsPerNode"`
CoresPerSocket int `json:"coresPerSocket"`
ThreadsPerCore int `json:"threadsPerCore"`
FlopRateScalar int `json:"flopRateScalar"`
FlopRateSimd int `json:"flopRateSimd"`
MemoryBandwidth int `json:"memoryBandwidth"`
Name string `json:"name"`
MetricConfig []*MetricConfig `json:"metricConfig"`
FilterRanges *FilterRanges `json:"filterRanges"`
MetricDataRepository *struct {
Partitions []*Partition `json:"partitions"`
// NOT part of the API:
MetricDataRepository *MetricDataRepository `json:"metricDataRepository"`
}
type MetricDataRepository struct {
Kind string `json:"kind"`
Url string `json:"url"`
} `json:"metricDataRepository"`
Token string `json:"token"`
}

View File

@ -11,6 +11,12 @@ import (
"github.com/ClusterCockpit/cc-jobarchive/schema"
)
type Accelerator struct {
ID string `json:"id"`
Type string `json:"type"`
Model string `json:"model"`
}
type FilterRanges struct {
Duration *IntRangeOutput `json:"duration"`
NumNodes *IntRangeOutput `json:"numNodes"`
@ -37,33 +43,6 @@ type IntRangeOutput struct {
To int `json:"to"`
}
type Job struct {
ID string `json:"Id"`
JobID int `json:"JobId"`
User string `json:"User"`
Project string `json:"Project"`
Cluster string `json:"Cluster"`
StartTime time.Time `json:"StartTime"`
Duration int `json:"Duration"`
NumNodes int `json:"NumNodes"`
NumHWThreads int `json:"NumHWThreads"`
NumAcc int `json:"NumAcc"`
Smt int `json:"SMT"`
Exclusive int `json:"Exclusive"`
Partition string `json:"Partition"`
ArrayJobID int `json:"ArrayJobId"`
MonitoringStatus int `json:"MonitoringStatus"`
State JobState `json:"State"`
Tags []*JobTag `json:"Tags"`
Resources []*schema.JobResource `json:"Resources"`
LoadAvg *float64 `json:"LoadAvg"`
MemUsedMax *float64 `json:"MemUsedMax"`
FlopsAnyAvg *float64 `json:"FlopsAnyAvg"`
MemBwAvg *float64 `json:"MemBwAvg"`
NetBwAvg *float64 `json:"NetBwAvg"`
FileBwAvg *float64 `json:"FileBwAvg"`
}
type JobFilter struct {
Tags []string `json:"tags"`
JobID *StringInput `json:"jobId"`
@ -73,7 +52,7 @@ type JobFilter struct {
Duration *IntRange `json:"duration"`
NumNodes *IntRange `json:"numNodes"`
StartTime *TimeRange `json:"startTime"`
JobState []JobState `json:"jobState"`
State []schema.JobState `json:"state"`
FlopsAnyAvg *FloatRange `json:"flopsAnyAvg"`
MemBwAvg *FloatRange `json:"memBwAvg"`
LoadAvg *FloatRange `json:"loadAvg"`
@ -82,11 +61,22 @@ type JobFilter struct {
type JobMetricWithName struct {
Name string `json:"name"`
Metric *schema.JobMetric `json:"metric"`
Node *schema.JobMetric `json:"node"`
Socket *schema.JobMetric `json:"socket"`
MemoryDomain *schema.JobMetric `json:"memoryDomain"`
Core *schema.JobMetric `json:"core"`
Hwthread *schema.JobMetric `json:"hwthread"`
}
type JobResource struct {
Hostname string `json:"hostname"`
Hwthreads []int `json:"hwthreads"`
Accelerators []int `json:"accelerators"`
Configuration *string `json:"configuration"`
}
type JobResultList struct {
Items []*Job `json:"items"`
Items []*schema.Job `json:"items"`
Offset *int `json:"offset"`
Limit *int `json:"limit"`
Count *int `json:"count"`
@ -103,14 +93,14 @@ type JobsStatistics struct {
}
type MetricConfig struct {
Name string `json:"Name"`
Unit string `json:"Unit"`
Timestep int `json:"Timestep"`
Peak int `json:"Peak"`
Normal int `json:"Normal"`
Caution int `json:"Caution"`
Alert int `json:"Alert"`
Scope string `json:"Scope"`
Name string `json:"name"`
Unit string `json:"unit"`
Scope string `json:"scope"`
Timestep int `json:"timestep"`
Peak float64 `json:"Peak"`
Normal float64 `json:"Normal"`
Caution float64 `json:"Caution"`
Alert float64 `json:"Alert"`
}
type MetricFootprints struct {
@ -138,6 +128,18 @@ type PageRequest struct {
Page int `json:"page"`
}
type Partition struct {
Name string `json:"name"`
ProcessorType string `json:"processorType"`
SocketsPerNode int `json:"socketsPerNode"`
CoresPerSocket int `json:"coresPerSocket"`
ThreadsPerCore int `json:"threadsPerCore"`
FlopRateScalar int `json:"flopRateScalar"`
FlopRateSimd int `json:"flopRateSimd"`
MemoryBandwidth int `json:"memoryBandwidth"`
Topology *Topology `json:"topology"`
}
type StringInput struct {
Eq *string `json:"eq"`
Contains *string `json:"contains"`
@ -155,6 +157,15 @@ type TimeRangeOutput struct {
To time.Time `json:"to"`
}
type Topology struct {
Node []int `json:"node"`
Socket [][]int `json:"socket"`
MemoryDomain [][]int `json:"memoryDomain"`
Die [][]int `json:"die"`
Core [][]int `json:"core"`
Accelerators []*Accelerator `json:"accelerators"`
}
type Aggregate string
const (
@ -198,55 +209,6 @@ func (e Aggregate) MarshalGQL(w io.Writer) {
fmt.Fprint(w, strconv.Quote(e.String()))
}
type JobState string
const (
JobStateRunning JobState = "running"
JobStateCompleted JobState = "completed"
JobStateFailed JobState = "failed"
JobStateCanceled JobState = "canceled"
JobStateStopped JobState = "stopped"
JobStateTimeout JobState = "timeout"
)
var AllJobState = []JobState{
JobStateRunning,
JobStateCompleted,
JobStateFailed,
JobStateCanceled,
JobStateStopped,
JobStateTimeout,
}
func (e JobState) IsValid() bool {
switch e {
case JobStateRunning, JobStateCompleted, JobStateFailed, JobStateCanceled, JobStateStopped, JobStateTimeout:
return true
}
return false
}
func (e JobState) String() string {
return string(e)
}
func (e *JobState) UnmarshalGQL(v interface{}) error {
str, ok := v.(string)
if !ok {
return fmt.Errorf("enums must be strings")
}
*e = JobState(str)
if !e.IsValid() {
return fmt.Errorf("%s is not a valid JobState", str)
}
return nil
}
func (e JobState) MarshalGQL(w io.Writer) {
fmt.Fprint(w, strconv.Quote(e.String()))
}
type SortDirectionEnum string
const (

View File

@ -2,15 +2,14 @@ package graph
import (
"context"
"encoding/json"
"errors"
"fmt"
"regexp"
"strings"
"time"
"github.com/ClusterCockpit/cc-jobarchive/auth"
"github.com/ClusterCockpit/cc-jobarchive/graph/model"
"github.com/ClusterCockpit/cc-jobarchive/schema"
sq "github.com/Masterminds/squirrel"
"github.com/jmoiron/sqlx"
)
@ -23,44 +22,9 @@ type Resolver struct {
DB *sqlx.DB
}
var JobTableCols []string = []string{
"id", "job_id", "cluster", "start_time",
"user", "project", "partition", "array_job_id", "duration", "job_state", "resources",
"num_nodes", "num_hwthreads", "num_acc", "smt", "exclusive", "monitoring_status",
"load_avg", "mem_used_max", "flops_any_avg", "mem_bw_avg", "net_bw_avg", "file_bw_avg",
}
type Scannable interface {
Scan(dest ...interface{}) error
}
// Helper function for scanning jobs with the `jobTableCols` columns selected.
func ScanJob(row Scannable) (*model.Job, error) {
job := &model.Job{}
var rawResources []byte
if err := row.Scan(
&job.ID, &job.JobID, &job.Cluster, &job.StartTime,
&job.User, &job.Project, &job.Partition, &job.ArrayJobID, &job.Duration, &job.State, &rawResources,
&job.NumNodes, &job.NumHWThreads, &job.NumAcc, &job.Smt, &job.Exclusive, &job.MonitoringStatus,
&job.LoadAvg, &job.MemUsedMax, &job.FlopsAnyAvg, &job.MemBwAvg, &job.NetBwAvg, &job.FileBwAvg); err != nil {
return nil, err
}
if err := json.Unmarshal(rawResources, &job.Resources); err != nil {
return nil, err
}
if job.Duration == 0 && job.State == model.JobStateRunning {
job.Duration = int(time.Since(job.StartTime).Seconds())
}
return job, nil
}
// Helper function for the `jobs` GraphQL-Query. Is also used elsewhere when a list of jobs is needed.
func (r *Resolver) queryJobs(ctx context.Context, filters []*model.JobFilter, page *model.PageRequest, order *model.OrderByInput) ([]*model.Job, int, error) {
query := sq.Select(JobTableCols...).From("job")
func (r *Resolver) queryJobs(ctx context.Context, filters []*model.JobFilter, page *model.PageRequest, order *model.OrderByInput) ([]*schema.Job, int, error) {
query := sq.Select(schema.JobColumns...).From("job")
query = securityCheck(ctx, query)
if order != nil {
@ -85,33 +49,32 @@ func (r *Resolver) queryJobs(ctx context.Context, filters []*model.JobFilter, pa
query = buildWhereClause(f, query)
}
rows, err := query.RunWith(r.DB).Query()
sql, args, err := query.ToSql()
if err != nil {
return nil, 0, err
}
defer rows.Close()
jobs := make([]*model.Job, 0, 50)
rows, err := r.DB.Queryx(sql, args...)
if err != nil {
return nil, 0, err
}
jobs := make([]*schema.Job, 0, 50)
for rows.Next() {
job, err := ScanJob(rows)
job, err := schema.ScanJob(rows)
if err != nil {
return nil, 0, err
}
jobs = append(jobs, job)
}
// count all jobs:
query = sq.Select("count(*)").From("job")
for _, f := range filters {
query = buildWhereClause(f, query)
}
rows, err = query.RunWith(r.DB).Query()
if err != nil {
return nil, 0, err
}
defer rows.Close()
var count int
rows.Next()
if err := rows.Scan(&count); err != nil {
if err := query.RunWith(r.DB).Scan(&count); err != nil {
return nil, 0, err
}
@ -132,7 +95,7 @@ func securityCheck(ctx context.Context, query sq.SelectBuilder) sq.SelectBuilder
return query.Where("job.user_id = ?", user.Username)
}
// Build a sq.SelectBuilder out of a model.JobFilter.
// Build a sq.SelectBuilder out of a schema.JobFilter.
func buildWhereClause(filter *model.JobFilter, query sq.SelectBuilder) sq.SelectBuilder {
if filter.Tags != nil {
query = query.Join("jobtag ON jobtag.job_id = job.id").Where("jobtag.tag_id IN ?", filter.Tags)
@ -155,8 +118,8 @@ func buildWhereClause(filter *model.JobFilter, query sq.SelectBuilder) sq.Select
if filter.Duration != nil {
query = buildIntCondition("job.duration", filter.Duration, query)
}
if filter.JobState != nil {
query = query.Where("job.job_state IN ?", filter.JobState)
if filter.State != nil {
query = query.Where("job.job_state IN ?", filter.State)
}
if filter.NumNodes != nil {
query = buildIntCondition("job.num_nodes", filter.NumNodes, query)

View File

@ -1,107 +1,122 @@
scalar Time
scalar NullableFloat
scalar MetricScope
scalar JobState
type Job {
Id: ID! # Database ID, unique
JobId: Int! # ID given to the job by the cluster scheduler
User: String! # Username
Project: String! # Project
Cluster: String! # Name of the cluster this job was running on
StartTime: Time! # RFC3339 formated string
Duration: Int! # For running jobs, the time it has already run
NumNodes: Int! # Number of nodes this job was running on
NumHWThreads: Int!
NumAcc: Int!
id: ID!
jobId: Int!
user: String!
project: String!
cluster: String!
startTime: Time!
duration: Int!
numNodes: Int!
numHWThreads: Int!
numAcc: Int!
SMT: Int!
Exclusive: Int!
Partition: String!
ArrayJobId: Int!
MonitoringStatus: Int!
State: JobState! # State of the job
Tags: [JobTag!]! # List of tags this job has
Resources: [JobResource!]! # List of hosts/hwthreads/gpus/...
# Will be null for running jobs.
LoadAvg: Float
MemUsedMax: Float
FlopsAnyAvg: Float
MemBwAvg: Float
NetBwAvg: Float
FileBwAvg: Float
}
type JobResource {
Hostname: String!
HWThreads: [Int!]
Accelerators: [Accelerator!]
}
type Accelerator {
Id: String!
Type: String!
Model: String!
}
# TODO: Extend by more possible states?
enum JobState {
running
completed
failed
canceled
stopped
timeout
}
type JobTag {
Id: ID! # Database ID, unique
TagType: String! # Type
TagName: String! # Name
exclusive: Int!
partition: String!
arrayJobId: Int!
monitoringStatus: Int!
state: JobState!
tags: [Tag!]!
resources: [JobResource!]!
}
type Cluster {
ClusterID: String!
ProcessorType: String!
SocketsPerNode: Int!
CoresPerSocket: Int!
ThreadsPerCore: Int!
FlopRateScalar: Int!
FlopRateSimd: Int!
MemoryBandwidth: Int!
MetricConfig: [MetricConfig!]!
FilterRanges: FilterRanges!
name: String!
metricConfig: [MetricConfig!]!
filterRanges: FilterRanges!
partitions: [Partition!]!
}
type Partition {
name: String!
processorType: String!
socketsPerNode: Int!
coresPerSocket: Int!
threadsPerCore: Int!
flopRateScalar: Int!
flopRateSimd: Int!
memoryBandwidth: Int!
topology: Topology!
}
type Topology {
node: [Int!]
socket: [[Int!]!]
memoryDomain: [[Int!]!]
die: [[Int!]!]
core: [[Int!]!]
accelerators: [Accelerator!]
}
type Accelerator {
id: String!
type: String!
model: String!
}
type MetricConfig {
Name: String!
Unit: String!
Timestep: Int!
Peak: Int!
Normal: Int!
Caution: Int!
Alert: Int!
Scope: String!
name: String!
unit: String!
scope: String!
timestep: Int!
Peak: Float!
Normal: Float!
Caution: Float!
Alert: Float!
}
type JobMetric {
Unit: String!
Scope: JobMetricScope!
Timestep: Int!
Series: [JobMetricSeries!]!
type Tag {
id: ID!
type: String!
name: String!
}
type JobMetricSeries {
Hostname: String!
Id: Int
Statistics: JobMetricStatistics
Data: [NullableFloat!]!
}
type JobMetricStatistics {
Avg: Float!
Min: Float!
Max: Float!
type JobResource {
hostname: String!
hwthreads: [Int!]
accelerators: [Int!]
configuration: String
}
type JobMetricWithName {
name: String!
metric: JobMetric!
node: JobMetric
socket: JobMetric
memoryDomain: JobMetric
core: JobMetric
hwthread: JobMetric
}
type JobMetric {
unit: String!
scope: MetricScope!
timestep: Int!
series: [Series!]!
statisticsSeries: [StatsSeries!]
}
type Series {
hostname: String!
id: Int
statistics: MetricStatistics
data: [NullableFloat!]!
}
type MetricStatistics {
avg: Float!
min: Float!
max: Float!
}
type StatsSeries {
mean: [NullableFloat!]
min: [NullableFloat!]
max: [NullableFloat!]
}
type MetricFootprints {
@ -123,7 +138,7 @@ type NodeMetrics {
type Query {
clusters: [Cluster!]! # List of all clusters
tags: [JobTag!]! # List of all tags
tags: [Tag!]! # List of all tags
job(id: ID!): Job
jobMetrics(id: ID!, metrics: [String!]): [JobMetricWithName!]!
@ -138,23 +153,16 @@ type Query {
}
type Mutation {
createTag(type: String!, name: String!): JobTag!
createTag(type: String!, name: String!): Tag!
deleteTag(id: ID!): ID!
addTagsToJob(job: ID!, tagIds: [ID!]!): [JobTag!]!
removeTagsFromJob(job: ID!, tagIds: [ID!]!): [JobTag!]!
addTagsToJob(job: ID!, tagIds: [ID!]!): [Tag!]!
removeTagsFromJob(job: ID!, tagIds: [ID!]!): [Tag!]!
updateConfiguration(name: String!, value: String!): String
}
type IntRangeOutput {
from: Int!
to: Int!
}
type TimeRangeOutput {
from: Time!
to: Time!
}
type IntRangeOutput { from: Int!, to: Int! }
type TimeRangeOutput { from: Time!, to: Time! }
type FilterRanges {
duration: IntRangeOutput!
@ -171,7 +179,7 @@ input JobFilter {
duration: IntRange
numNodes: IntRange
startTime: TimeRange
jobState: [JobState!]
state: [JobState!]
flopsAnyAvg: FloatRange
memBwAvg: FloatRange
loadAvg: FloatRange
@ -195,20 +203,9 @@ input StringInput {
endsWith: String
}
input IntRange {
from: Int!
to: Int!
}
input FloatRange {
from: Float!
to: Float!
}
input TimeRange {
from: Time
to: Time
}
input IntRange { from: Int!, to: Int! }
input FloatRange { from: Float!, to: Float! }
input TimeRange { from: Time, to: Time }
type JobResultList {
items: [Job!]!
@ -236,7 +233,3 @@ input PageRequest {
itemsPerPage: Int!
page: Int!
}
scalar Time
scalar NullableFloat
scalar JobMetricScope

View File

@ -19,36 +19,35 @@ import (
sq "github.com/Masterminds/squirrel"
)
func (r *acceleratorResolver) ID(ctx context.Context, obj *schema.Accelerator) (string, error) {
panic(fmt.Errorf("not implemented"))
}
func (r *jobResolver) Tags(ctx context.Context, obj *model.Job) ([]*model.JobTag, error) {
func (r *jobResolver) Tags(ctx context.Context, obj *schema.Job) ([]*schema.Tag, error) {
query := sq.
Select("tag.id", "tag.tag_type", "tag.tag_name").
From("tag").
Join("jobtag ON jobtag.tag_id = tag.id").
Where("jobtag.job_id = ?", obj.ID)
rows, err := query.RunWith(r.DB).Query()
sql, args, err := query.ToSql()
if err != nil {
return nil, err
}
defer rows.Close()
tags := make([]*model.JobTag, 0)
for rows.Next() {
var tag model.JobTag
if err := rows.Scan(&tag.ID, &tag.TagType, &tag.TagName); err != nil {
tags := make([]*schema.Tag, 0)
if err := r.DB.Select(&tags, sql, args...); err != nil {
return nil, err
}
tags = append(tags, &tag)
}
return tags, nil
}
func (r *mutationResolver) CreateTag(ctx context.Context, typeArg string, name string) (*model.JobTag, error) {
func (r *jobResolver) Resources(ctx context.Context, obj *schema.Job) ([]*model.JobResource, error) {
panic(fmt.Errorf("not implemented"))
}
func (r *jobMetricResolver) StatisticsSeries(ctx context.Context, obj *schema.JobMetric) ([]*schema.StatsSeries, error) {
panic(fmt.Errorf("not implemented"))
}
func (r *mutationResolver) CreateTag(ctx context.Context, typeArg string, name string) (*schema.Tag, error) {
res, err := r.DB.Exec("INSERT INTO tag (tag_type, tag_name) VALUES ($1, $2)", typeArg, name)
if err != nil {
return nil, err
@ -59,7 +58,7 @@ func (r *mutationResolver) CreateTag(ctx context.Context, typeArg string, name s
return nil, err
}
return &model.JobTag{ID: strconv.FormatInt(id, 10), TagType: typeArg, TagName: name}, nil
return &schema.Tag{ID: id, Type: typeArg, Name: name}, nil
}
func (r *mutationResolver) DeleteTag(ctx context.Context, id string) (string, error) {
@ -67,7 +66,7 @@ func (r *mutationResolver) DeleteTag(ctx context.Context, id string) (string, er
panic(fmt.Errorf("not implemented"))
}
func (r *mutationResolver) AddTagsToJob(ctx context.Context, job string, tagIds []string) ([]*model.JobTag, error) {
func (r *mutationResolver) AddTagsToJob(ctx context.Context, job string, tagIds []string) ([]*schema.Tag, error) {
jid, err := strconv.Atoi(job)
if err != nil {
return nil, err
@ -84,7 +83,9 @@ func (r *mutationResolver) AddTagsToJob(ctx context.Context, job string, tagIds
}
}
tags, err := r.Job().Tags(ctx, &model.Job{ID: job})
dummyJob := schema.Job{}
dummyJob.ID = int64(jid)
tags, err := r.Job().Tags(ctx, &dummyJob)
if err != nil {
return nil, err
}
@ -97,7 +98,7 @@ func (r *mutationResolver) AddTagsToJob(ctx context.Context, job string, tagIds
return tags, metricdata.UpdateTags(jobObj, tags)
}
func (r *mutationResolver) RemoveTagsFromJob(ctx context.Context, job string, tagIds []string) ([]*model.JobTag, error) {
func (r *mutationResolver) RemoveTagsFromJob(ctx context.Context, job string, tagIds []string) ([]*schema.Tag, error) {
jid, err := strconv.Atoi(job)
if err != nil {
return nil, err
@ -114,7 +115,9 @@ func (r *mutationResolver) RemoveTagsFromJob(ctx context.Context, job string, ta
}
}
tags, err := r.Job().Tags(ctx, &model.Job{ID: job})
dummyJob := schema.Job{}
dummyJob.ID = int64(jid)
tags, err := r.Job().Tags(ctx, &dummyJob)
if err != nil {
return nil, err
}
@ -139,29 +142,28 @@ func (r *queryResolver) Clusters(ctx context.Context) ([]*model.Cluster, error)
return config.Clusters, nil
}
func (r *queryResolver) Tags(ctx context.Context) ([]*model.JobTag, error) {
rows, err := sq.Select("id", "tag_type", "tag_name").From("tag").RunWith(r.DB).Query()
func (r *queryResolver) Tags(ctx context.Context) ([]*schema.Tag, error) {
sql, args, err := sq.Select("id", "tag_type", "tag_name").From("tag").ToSql()
if err != nil {
return nil, err
}
defer rows.Close()
tags := make([]*model.JobTag, 0)
for rows.Next() {
var tag model.JobTag
if err := rows.Scan(&tag.ID, &tag.TagType, &tag.TagName); err != nil {
tags := make([]*schema.Tag, 0)
if err := r.DB.Select(&tags, sql, args...); err != nil {
return nil, err
}
tags = append(tags, &tag)
}
return tags, nil
}
func (r *queryResolver) Job(ctx context.Context, id string) (*model.Job, error) {
query := sq.Select(JobTableCols...).From("job").Where("job.id = ?", id)
func (r *queryResolver) Job(ctx context.Context, id string) (*schema.Job, error) {
query := sq.Select(schema.JobColumns...).From("job").Where("job.id = ?", id)
query = securityCheck(ctx, query)
return ScanJob(query.RunWith(r.DB).QueryRow())
sql, args, err := query.ToSql()
if err != nil {
return nil, err
}
return schema.ScanJob(r.DB.QueryRowx(sql, args...))
}
func (r *queryResolver) JobMetrics(ctx context.Context, id string, metrics []string) ([]*model.JobMetricWithName, error) {
@ -179,7 +181,11 @@ func (r *queryResolver) JobMetrics(ctx context.Context, id string, metrics []str
for name, md := range data {
res = append(res, &model.JobMetricWithName{
Name: name,
Metric: md,
Node: md["node"],
Socket: md["socket"],
MemoryDomain: md["memoryDomain"],
Core: md["core"],
Hwthread: md["hwthread"],
})
}
@ -237,19 +243,19 @@ func (r *queryResolver) NodeMetrics(ctx context.Context, cluster string, nodes [
return res, nil
}
// Accelerator returns generated.AcceleratorResolver implementation.
func (r *Resolver) Accelerator() generated.AcceleratorResolver { return &acceleratorResolver{r} }
// Job returns generated.JobResolver implementation.
func (r *Resolver) Job() generated.JobResolver { return &jobResolver{r} }
// JobMetric returns generated.JobMetricResolver implementation.
func (r *Resolver) JobMetric() generated.JobMetricResolver { return &jobMetricResolver{r} }
// Mutation returns generated.MutationResolver implementation.
func (r *Resolver) Mutation() generated.MutationResolver { return &mutationResolver{r} }
// Query returns generated.QueryResolver implementation.
func (r *Resolver) Query() generated.QueryResolver { return &queryResolver{r} }
type acceleratorResolver struct{ *Resolver }
type jobResolver struct{ *Resolver }
type jobMetricResolver struct{ *Resolver }
type mutationResolver struct{ *Resolver }
type queryResolver struct{ *Resolver }

View File

@ -3,6 +3,7 @@ package graph
import (
"context"
"database/sql"
"errors"
"fmt"
"math"
@ -16,9 +17,9 @@ import (
// GraphQL validation should make sure that no unkown values can be specified.
var groupBy2column = map[model.Aggregate]string{
model.AggregateUser: "job.user_id",
model.AggregateProject: "job.project_id",
model.AggregateCluster: "job.cluster_id",
model.AggregateUser: "job.user",
model.AggregateProject: "job.project",
model.AggregateCluster: "job.cluster",
}
// Helper function for the jobsStatistics GraphQL query placed here so that schema.resolvers.go is not too full.
@ -28,7 +29,8 @@ func (r *queryResolver) jobsStatistics(ctx context.Context, filter []*model.JobF
// `socketsPerNode` and `coresPerSocket` can differ from cluster to cluster, so we need to explicitly loop over those.
for _, cluster := range config.Clusters {
corehoursCol := fmt.Sprintf("SUM(job.duration * job.num_nodes * %d * %d) / 3600", cluster.SocketsPerNode, cluster.CoresPerSocket)
for _, partition := range cluster.Partitions {
corehoursCol := fmt.Sprintf("SUM(job.duration * job.num_nodes * %d * %d) / 3600", partition.SocketsPerNode, partition.CoresPerSocket)
var query sq.SelectBuilder
if groupBy == nil {
query = sq.Select(
@ -36,7 +38,7 @@ func (r *queryResolver) jobsStatistics(ctx context.Context, filter []*model.JobF
"COUNT(job.id)",
"SUM(job.duration) / 3600",
corehoursCol,
).From("job").Where("job.cluster_id = ?", cluster.ClusterID)
).From("job")
} else {
col := groupBy2column[*groupBy]
query = sq.Select(
@ -44,9 +46,13 @@ func (r *queryResolver) jobsStatistics(ctx context.Context, filter []*model.JobF
"COUNT(job.id)",
"SUM(job.duration) / 3600",
corehoursCol,
).From("job").Where("job.cluster_id = ?", cluster.ClusterID).GroupBy(col)
).From("job").GroupBy(col)
}
query = query.
Where("job.cluster = ?", cluster.Name).
Where("job.partition = ?", partition.Name)
query = securityCheck(ctx, query)
for _, f := range filter {
query = buildWhereClause(f, query)
@ -80,6 +86,7 @@ func (r *queryResolver) jobsStatistics(ctx context.Context, filter []*model.JobF
}
}
}
}
if groupBy == nil {
query := sq.Select("COUNT(job.id)").From("job").Where("job.duration < 120")
@ -204,9 +211,16 @@ func (r *Resolver) rooflineHeatmap(ctx context.Context, filter []*model.JobFilte
return nil, err
}
flops, membw := jobdata["flops_any"], jobdata["mem_bw"]
if flops == nil && membw == nil {
return nil, fmt.Errorf("'flops_any' or 'mem_bw' missing for job %s", job.ID)
flops_, membw_ := jobdata["flops_any"], jobdata["mem_bw"]
if flops_ == nil && membw_ == nil {
return nil, fmt.Errorf("'flops_any' or 'mem_bw' missing for job %d", job.ID)
}
flops, ok1 := flops_["node"]
membw, ok2 := membw_["node"]
if !ok1 || !ok2 {
// TODO/FIXME:
return nil, errors.New("todo: rooflineHeatmap() query not implemented for where flops_any or mem_bw not available at 'node' level")
}
for n := 0; n < len(flops.Series); n++ {

View File

@ -2,7 +2,6 @@ package main
import (
"bufio"
"database/sql"
"encoding/json"
"fmt"
"log"
@ -23,7 +22,7 @@ const JOBS_DB_SCHEMA string = `
id INTEGER PRIMARY KEY AUTOINCREMENT, -- Not needed in sqlite
job_id BIGINT NOT NULL,
cluster VARCHAR(255) NOT NULL,
start_time BITINT NOT NULL,
start_time TIMESTAMP NOT NULL,
user VARCHAR(255) NOT NULL,
project VARCHAR(255) NOT NULL,
@ -80,25 +79,20 @@ func initDB(db *sqlx.DB, archive string) error {
return err
}
insertstmt, err := db.Prepare(`INSERT INTO job (
job_id, cluster, start_time,
user, project, partition, array_job_id, duration, job_state, meta_data, resources,
num_nodes, num_hwthreads, num_acc, smt, exclusive, monitoring_status,
flops_any_avg, mem_bw_avg
) VALUES (
?, ?, ?,
?, ?, ?, ?, ?, ?, ?, ?,
?, ?, ?, ?, ?, ?,
?, ?
);`)
if err != nil {
return err
}
tx, err := db.Begin()
tx, err := db.Beginx()
if err != nil {
return err
}
stmt, err := tx.PrepareNamed(schema.JobInsertStmt)
if err != nil {
return err
}
i := 0
tags := make(map[string]int64)
handleDirectory := func(filename string) error {
@ -110,16 +104,16 @@ func initDB(db *sqlx.DB, archive string) error {
}
}
tx, err = db.Begin()
tx, err = db.Beginx()
if err != nil {
return err
}
insertstmt = tx.Stmt(insertstmt)
stmt = tx.NamedStmt(stmt)
fmt.Printf("%d jobs inserted...\r", i)
}
err := loadJob(tx, insertstmt, tags, filename)
err := loadJob(tx, stmt, tags, filename)
if err == nil {
i += 1
}
@ -151,14 +145,14 @@ func initDB(db *sqlx.DB, archive string) error {
return err
}
for _, startTiemDir := range startTimeDirs {
if startTiemDir.Type().IsRegular() && startTiemDir.Name() == "meta.json" {
for _, startTimeDir := range startTimeDirs {
if startTimeDir.Type().IsRegular() && startTimeDir.Name() == "meta.json" {
if err := handleDirectory(dirpath); err != nil {
log.Printf("in %s: %s\n", dirpath, err.Error())
}
} else if startTiemDir.IsDir() {
if err := handleDirectory(filepath.Join(dirpath, startTiemDir.Name())); err != nil {
log.Printf("in %s: %s\n", filepath.Join(dirpath, startTiemDir.Name()), err.Error())
} else if startTimeDir.IsDir() {
if err := handleDirectory(filepath.Join(dirpath, startTimeDir.Name())); err != nil {
log.Printf("in %s: %s\n", filepath.Join(dirpath, startTimeDir.Name()), err.Error())
}
}
}
@ -184,34 +178,28 @@ func initDB(db *sqlx.DB, archive string) error {
// Read the `meta.json` file at `path` and insert it to the database using the prepared
// insert statement `stmt`. `tags` maps all existing tags to their database ID.
func loadJob(tx *sql.Tx, stmt *sql.Stmt, tags map[string]int64, path string) error {
func loadJob(tx *sqlx.Tx, stmt *sqlx.NamedStmt, tags map[string]int64, path string) error {
f, err := os.Open(filepath.Join(path, "meta.json"))
if err != nil {
return err
}
defer f.Close()
var job schema.JobMeta = schema.JobMeta{
Exclusive: 1,
}
var job schema.JobMeta = schema.JobMeta{BaseJob: schema.JobDefaults}
if err := json.NewDecoder(bufio.NewReader(f)).Decode(&job); err != nil {
return err
}
// TODO: Other metrics...
flopsAnyAvg := loadJobStat(&job, "flops_any")
memBwAvg := loadJobStat(&job, "mem_bw")
job.FlopsAnyAvg = loadJobStat(&job, "flops_any")
job.MemBwAvg = loadJobStat(&job, "mem_bw")
resources, err := json.Marshal(job.Resources)
job.RawResources, err = json.Marshal(job.Resources)
if err != nil {
return err
}
res, err := stmt.Exec(
job.JobId, job.Cluster, job.StartTime,
job.User, job.Project, job.Partition, job.ArrayJobId, job.Duration, job.JobState, job.MetaData, string(resources),
job.NumNodes, job.NumHWThreads, job.NumAcc, job.SMT, job.Exclusive, job.MonitoringStatus,
flopsAnyAvg, memBwAvg)
res, err := stmt.Exec(job)
if err != nil {
return err
}
@ -244,12 +232,10 @@ func loadJob(tx *sql.Tx, stmt *sql.Stmt, tags map[string]int64, path string) err
return nil
}
func loadJobStat(job *schema.JobMeta, metric string) sql.NullFloat64 {
val := sql.NullFloat64{Valid: false}
func loadJobStat(job *schema.JobMeta, metric string) float64 {
if stats, ok := job.Statistics[metric]; ok {
val.Valid = true
val.Float64 = stats.Avg
return stats.Avg
}
return val
return 0.0
}

View File

@ -13,13 +13,12 @@ import (
"strconv"
"github.com/ClusterCockpit/cc-jobarchive/config"
"github.com/ClusterCockpit/cc-jobarchive/graph/model"
"github.com/ClusterCockpit/cc-jobarchive/schema"
)
// For a given job, return the path of the `data.json`/`meta.json` file.
// TODO: Implement Issue ClusterCockpit/ClusterCockpit#97
func getPath(job *model.Job, file string, checkLegacy bool) (string, error) {
func getPath(job *schema.Job, file string, checkLegacy bool) (string, error) {
lvl1, lvl2 := fmt.Sprintf("%d", job.JobID/1000), fmt.Sprintf("%03d", job.JobID%1000)
if !checkLegacy {
return filepath.Join(JobArchivePath, job.Cluster, lvl1, lvl2, strconv.FormatInt(job.StartTime.Unix(), 10), file), nil
@ -34,7 +33,7 @@ func getPath(job *model.Job, file string, checkLegacy bool) (string, error) {
}
// Assuming job is completed/archived, return the jobs metric data.
func loadFromArchive(job *model.Job) (schema.JobData, error) {
func loadFromArchive(job *schema.Job) (schema.JobData, error) {
filename, err := getPath(job, "data.json", true)
if err != nil {
return nil, err
@ -56,8 +55,8 @@ func loadFromArchive(job *model.Job) (schema.JobData, error) {
// If the job is archived, find its `meta.json` file and override the tags list
// in that JSON file. If the job is not archived, nothing is done.
func UpdateTags(job *model.Job, tags []*model.JobTag) error {
if job.State == model.JobStateRunning {
func UpdateTags(job *schema.Job, tags []*schema.Tag) error {
if job.State == schema.JobStateRunning {
return nil
}
@ -74,23 +73,19 @@ func UpdateTags(job *model.Job, tags []*model.JobTag) error {
return err
}
var metaFile schema.JobMeta
var metaFile schema.JobMeta = schema.JobMeta{
BaseJob: schema.JobDefaults,
}
if err := json.NewDecoder(f).Decode(&metaFile); err != nil {
return err
}
f.Close()
metaFile.Tags = make([]struct {
Name string "json:\"Name\""
Type string "json:\"Type\""
}, 0)
metaFile.Tags = make([]*schema.Tag, 0)
for _, tag := range tags {
metaFile.Tags = append(metaFile.Tags, struct {
Name string "json:\"Name\""
Type string "json:\"Type\""
}{
Name: tag.TagName,
Type: tag.TagType,
metaFile.Tags = append(metaFile.Tags, &schema.Tag{
Name: tag.Name,
Type: tag.Type,
})
}
@ -103,7 +98,7 @@ func UpdateTags(job *model.Job, tags []*model.JobTag) error {
}
// Helper to metricdata.LoadAverages().
func loadAveragesFromArchive(job *model.Job, metrics []string, data [][]schema.Float) error {
func loadAveragesFromArchive(job *schema.Job, metrics []string, data [][]schema.Float) error {
filename, err := getPath(job, "meta.json", true)
if err != nil {
return err
@ -131,8 +126,8 @@ func loadAveragesFromArchive(job *model.Job, metrics []string, data [][]schema.F
}
// Writes a running job to the job-archive
func ArchiveJob(job *model.Job, ctx context.Context) (*schema.JobMeta, error) {
if job.State != model.JobStateRunning {
func ArchiveJob(job *schema.Job, ctx context.Context) (*schema.JobMeta, error) {
if job.State != schema.JobStateRunning {
return nil, errors.New("cannot archive job that is not running")
}
@ -146,51 +141,27 @@ func ArchiveJob(job *model.Job, ctx context.Context) (*schema.JobMeta, error) {
return nil, err
}
tags := []struct {
Name string `json:"Name"`
Type string `json:"Type"`
}{}
for _, tag := range job.Tags {
tags = append(tags, struct {
Name string `json:"Name"`
Type string `json:"Type"`
}{
Name: tag.TagName,
Type: tag.TagType,
})
}
metaData := &schema.JobMeta{
JobId: int64(job.JobID),
User: job.User,
Project: job.Project,
Cluster: job.Cluster,
NumNodes: job.NumNodes,
NumHWThreads: job.NumHWThreads,
NumAcc: job.NumAcc,
Exclusive: int8(job.Exclusive),
MonitoringStatus: int8(job.MonitoringStatus),
SMT: int8(job.Smt),
Partition: job.Partition,
ArrayJobId: job.ArrayJobID,
JobState: string(job.State),
jobMeta := &schema.JobMeta{
BaseJob: job.BaseJob,
StartTime: job.StartTime.Unix(),
Duration: int64(job.Duration),
Resources: job.Resources,
MetaData: "", // TODO/FIXME: Handle `meta_data`!
Tags: tags,
Statistics: make(map[string]*schema.JobMetaStatistics),
Statistics: make(map[string]schema.JobStatistics),
}
for metric, data := range jobData {
avg, min, max := 0.0, math.MaxFloat32, -math.MaxFloat32
for _, nodedata := range data.Series {
avg += nodedata.Statistics.Avg
min = math.Min(min, nodedata.Statistics.Min)
max = math.Max(max, nodedata.Statistics.Max)
nodeData, ok := data["node"]
if !ok {
// TODO/FIXME: Calc average for non-node metrics as well!
continue
}
metaData.Statistics[metric] = &schema.JobMetaStatistics{
for _, series := range nodeData.Series {
avg += series.Statistics.Avg
min = math.Min(min, series.Statistics.Min)
max = math.Max(max, series.Statistics.Max)
}
jobMeta.Statistics[metric] = schema.JobStatistics{
Unit: config.GetMetricConfig(job.Cluster, metric).Unit,
Avg: avg / float64(job.NumNodes),
Min: min,
@ -202,7 +173,7 @@ func ArchiveJob(job *model.Job, ctx context.Context) (*schema.JobMeta, error) {
// only return the JobMeta structure as the
// statistics in there are needed.
if !useArchive {
return metaData, nil
return jobMeta, nil
}
dirPath, err := getPath(job, "", false)
@ -220,7 +191,7 @@ func ArchiveJob(job *model.Job, ctx context.Context) (*schema.JobMeta, error) {
}
defer f.Close()
writer := bufio.NewWriter(f)
if err := json.NewEncoder(writer).Encode(metaData); err != nil {
if err := json.NewEncoder(writer).Encode(jobMeta); err != nil {
return nil, err
}
if err := writer.Flush(); err != nil {
@ -239,5 +210,5 @@ func ArchiveJob(job *model.Job, ctx context.Context) (*schema.JobMeta, error) {
return nil, err
}
return metaData, f.Close()
return jobMeta, f.Close()
}

View File

@ -12,7 +12,6 @@ import (
"time"
"github.com/ClusterCockpit/cc-jobarchive/config"
"github.com/ClusterCockpit/cc-jobarchive/graph/model"
"github.com/ClusterCockpit/cc-jobarchive/schema"
)
@ -57,7 +56,7 @@ func (ccms *CCMetricStore) Init(url string) error {
return nil
}
func (ccms *CCMetricStore) doRequest(job *model.Job, suffix string, metrics []string, ctx context.Context) (*http.Response, error) {
func (ccms *CCMetricStore) doRequest(job *schema.Job, suffix string, metrics []string, ctx context.Context) (*http.Response, error) {
from, to := job.StartTime.Unix(), job.StartTime.Add(time.Duration(job.Duration)*time.Second).Unix()
reqBody := ApiRequestBody{}
reqBody.Metrics = metrics
@ -85,7 +84,7 @@ func (ccms *CCMetricStore) doRequest(job *model.Job, suffix string, metrics []st
return ccms.client.Do(req)
}
func (ccms *CCMetricStore) LoadData(job *model.Job, metrics []string, ctx context.Context) (schema.JobData, error) {
func (ccms *CCMetricStore) LoadData(job *schema.Job, metrics []string, ctx context.Context) (schema.JobData, error) {
res, err := ccms.doRequest(job, "timeseries?with-stats=true", metrics, ctx)
if err != nil {
return nil, err
@ -103,8 +102,9 @@ func (ccms *CCMetricStore) LoadData(job *model.Job, metrics []string, ctx contex
Scope: "node", // TODO: FIXME: Whatever...
Unit: mc.Unit,
Timestep: mc.Timestep,
Series: make([]*schema.MetricSeries, 0, len(job.Resources)),
Series: make([]schema.Series, 0, len(job.Resources)),
}
for i, node := range job.Resources {
if node.Accelerators != nil || node.HWThreads != nil {
// TODO/FIXME:
@ -120,7 +120,7 @@ func (ccms *CCMetricStore) LoadData(job *model.Job, metrics []string, ctx contex
return nil, fmt.Errorf("no data for node '%s' and metric '%s'", node.Hostname, metric)
}
metricData.Series = append(metricData.Series, &schema.MetricSeries{
metricData.Series = append(metricData.Series, schema.Series{
Hostname: node.Hostname,
Data: data.Data,
Statistics: &schema.MetricStatistics{
@ -130,13 +130,13 @@ func (ccms *CCMetricStore) LoadData(job *model.Job, metrics []string, ctx contex
},
})
}
jobData[metric] = metricData
jobData[metric] = map[string]*schema.JobMetric{"node": metricData}
}
return jobData, nil
}
func (ccms *CCMetricStore) LoadStats(job *model.Job, metrics []string, ctx context.Context) (map[string]map[string]schema.MetricStatistics, error) {
func (ccms *CCMetricStore) LoadStats(job *schema.Job, metrics []string, ctx context.Context) (map[string]map[string]schema.MetricStatistics, error) {
res, err := ccms.doRequest(job, "stats", metrics, ctx)
if err != nil {
return nil, err

View File

@ -1,5 +1,6 @@
package metricdata
/*
import (
"context"
"errors"
@ -175,3 +176,4 @@ func (idb *InfluxDBv2DataRepository) LoadStats(job *model.Job, metrics []string,
func (idb *InfluxDBv2DataRepository) LoadNodeData(clusterId string, metrics, nodes []string, from, to int64, ctx context.Context) (map[string]map[string][]schema.Float, error) {
return nil, nil
}
*/

View File

@ -5,7 +5,6 @@ import (
"fmt"
"github.com/ClusterCockpit/cc-jobarchive/config"
"github.com/ClusterCockpit/cc-jobarchive/graph/model"
"github.com/ClusterCockpit/cc-jobarchive/schema"
)
@ -15,10 +14,10 @@ type MetricDataRepository interface {
Init(url string) error
// Return the JobData for the given job, only with the requested metrics.
LoadData(job *model.Job, metrics []string, ctx context.Context) (schema.JobData, error)
LoadData(job *schema.Job, metrics []string, ctx context.Context) (schema.JobData, error)
// Return a map of metrics to a map of nodes to the metric statistics of the job.
LoadStats(job *model.Job, metrics []string, ctx context.Context) (map[string]map[string]schema.MetricStatistics, error)
LoadStats(job *schema.Job, metrics []string, ctx context.Context) (map[string]map[string]schema.MetricStatistics, error)
// Return a map of nodes to a map of metrics to the data for the requested time.
LoadNodeData(clusterId string, metrics, nodes []string, from, to int64, ctx context.Context) (map[string]map[string][]schema.Float, error)
@ -41,15 +40,15 @@ func Init(jobArchivePath string, disableArchive bool) error {
if err := ccms.Init(cluster.MetricDataRepository.Url); err != nil {
return err
}
metricDataRepos[cluster.ClusterID] = ccms
case "influxdb-v2":
idb := &InfluxDBv2DataRepository{}
if err := idb.Init(cluster.MetricDataRepository.Url); err != nil {
return err
}
metricDataRepos[cluster.ClusterID] = idb
metricDataRepos[cluster.Name] = ccms
// case "influxdb-v2":
// idb := &InfluxDBv2DataRepository{}
// if err := idb.Init(cluster.MetricDataRepository.Url); err != nil {
// return err
// }
// metricDataRepos[cluster.Name] = idb
default:
return fmt.Errorf("unkown metric data repository '%s' for cluster '%s'", cluster.MetricDataRepository.Kind, cluster.ClusterID)
return fmt.Errorf("unkown metric data repository '%s' for cluster '%s'", cluster.MetricDataRepository.Kind, cluster.Name)
}
}
}
@ -57,8 +56,8 @@ func Init(jobArchivePath string, disableArchive bool) error {
}
// Fetches the metric data for a job.
func LoadData(job *model.Job, metrics []string, ctx context.Context) (schema.JobData, error) {
if job.State == model.JobStateRunning || !useArchive {
func LoadData(job *schema.Job, metrics []string, ctx context.Context) (schema.JobData, error) {
if job.State == schema.JobStateRunning || !useArchive {
repo, ok := metricDataRepos[job.Cluster]
if !ok {
return nil, fmt.Errorf("no metric data repository configured for '%s'", job.Cluster)
@ -85,8 +84,8 @@ func LoadData(job *model.Job, metrics []string, ctx context.Context) (schema.Job
}
// Used for the jobsFootprint GraphQL-Query. TODO: Rename/Generalize.
func LoadAverages(job *model.Job, metrics []string, data [][]schema.Float, ctx context.Context) error {
if job.State != model.JobStateRunning && useArchive {
func LoadAverages(job *schema.Job, metrics []string, data [][]schema.Float, ctx context.Context) error {
if job.State != schema.JobStateRunning && useArchive {
return loadAveragesFromArchive(job, metrics, data)
}

153
schema/job.go Normal file
View File

@ -0,0 +1,153 @@
package schema
import (
"encoding/json"
"errors"
"fmt"
"io"
"time"
)
type BaseJob struct {
ID int64 `json:"id" db:"id"`
JobID int64 `json:"jobId" db:"job_id"`
User string `json:"user" db:"user"`
Project string `json:"project" db:"project"`
Cluster string `json:"cluster" db:"cluster"`
Partition string `json:"partition" db:"partition"`
ArrayJobId int32 `json:"arrayJobId" db:"array_job_id"`
NumNodes int32 `json:"numNodes" db:"num_nodes"`
NumHWThreads int32 `json:"numHwthreads" db:"num_hwthreads"`
NumAcc int32 `json:"numAcc" db:"num_acc"`
Exclusive int32 `json:"exclusive" db:"exclusive"`
MonitoringStatus int32 `json:"monitoringStatus" db:"monitoring_status"`
SMT int32 `json:"smt" db:"smt"`
State JobState `json:"jobState" db:"job_state"`
Duration int32 `json:"duration" db:"duration"`
Tags []*Tag `json:"tags"`
RawResources []byte `json:"-" db:"resources"`
Resources []Resource `json:"resources"`
MetaData interface{} `json:"metaData" db:"meta_data"`
MemUsedMax float64 `json:"-" db:"mem_used_max"`
FlopsAnyAvg float64 `json:"-" db:"flops_any_avg"`
MemBwAvg float64 `json:"-" db:"mem_bw_avg"`
LoadAvg float64 `json:"-" db:"load_avg"`
NetBwAvg float64 `json:"-" db:"net_bw_avg"`
NetDataVolTotal float64 `json:"-" db:"net_data_vol_total"`
FileBwAvg float64 `json:"-" db:"file_bw_avg"`
FileDataVolTotal float64 `json:"-" db:"file_data_vol_total"`
}
type JobMeta struct {
BaseJob
StartTime int64 `json:"startTime" db:"start_time"`
Statistics map[string]JobStatistics `json:"statistics,omitempty"`
}
var JobDefaults BaseJob = BaseJob{
Exclusive: 1,
MonitoringStatus: 1,
MetaData: "",
}
var JobColumns []string = []string{
"id", "job_id", "user", "project", "cluster", "partition", "array_job_id", "num_nodes",
"num_hwthreads", "num_acc", "exclusive", "monitoring_status", "smt", "job_state",
"duration", "resources", "meta_data",
}
const JobInsertStmt string = `INSERT INTO job (
job_id, user, project, cluster, partition, array_job_id, num_nodes, num_hwthreads, num_acc,
exclusive, monitoring_status, smt, job_state, start_time, duration, resources, meta_data,
mem_used_max, flops_any_avg, mem_bw_avg, load_avg, net_bw_avg, net_data_vol_total, file_bw_avg, file_data_vol_total
) VALUES (
:job_id, :user, :project, :cluster, :partition, :array_job_id, :num_nodes, :num_hwthreads, :num_acc,
:exclusive, :monitoring_status, :smt, :job_state, :start_time, :duration, :resources, :meta_data,
:mem_used_max, :flops_any_avg, :mem_bw_avg, :load_avg, :net_bw_avg, :net_data_vol_total, :file_bw_avg, :file_data_vol_total
);`
type Job struct {
BaseJob
StartTime time.Time `json:"startTime" db:"start_time"`
}
type Scannable interface {
StructScan(dest interface{}) error
}
// Helper function for scanning jobs with the `jobTableCols` columns selected.
func ScanJob(row Scannable) (*Job, error) {
job := &Job{BaseJob: JobDefaults}
if err := row.StructScan(&job); err != nil {
return nil, err
}
if err := json.Unmarshal(job.RawResources, &job.Resources); err != nil {
return nil, err
}
if job.Duration == 0 && job.State == JobStateRunning {
job.Duration = int32(time.Since(job.StartTime).Seconds())
}
return job, nil
}
type JobStatistics struct {
Unit string `json:"unit"`
Avg float64 `json:"avg"`
Min float64 `json:"min"`
Max float64 `json:"max"`
}
type Tag struct {
ID int64 `json:"id" db:"id"`
Type string `json:"type" db:"tag_type"`
Name string `json:"name" db:"tag_name"`
}
type Resource struct {
Hostname string `json:"hostname"`
HWThreads []int `json:"hwthreads,omitempty"`
Accelerators []int `json:"accelerators,omitempty"`
Configuration string `json:"configuration,omitempty"`
}
type JobState string
const (
JobStateRunning JobState = "running"
JobStateCompleted JobState = "completed"
JobStateFailed JobState = "failed"
JobStateCanceled JobState = "canceled"
JobStateStopped JobState = "stopped"
JobStateTimeout JobState = "timeout"
)
func (e *JobState) UnmarshalGQL(v interface{}) error {
str, ok := v.(string)
if !ok {
return fmt.Errorf("enums must be strings")
}
*e = JobState(str)
if !e.Valid() {
return errors.New("invalid job state")
}
return nil
}
func (e JobState) MarshalGQL(w io.Writer) {
fmt.Fprintf(w, "\"%s\"", e)
}
func (e JobState) Valid() bool {
return e == JobStateRunning ||
e == JobStateCompleted ||
e == JobStateFailed ||
e == JobStateCanceled ||
e == JobStateStopped ||
e == JobStateTimeout
}

View File

@ -5,14 +5,34 @@ import (
"io"
)
// Format of `data.json` files.
type JobData map[string]*JobMetric
type JobData map[string]map[string]*JobMetric
type JobMetric struct {
Unit string `json:"Unit"`
Scope MetricScope `json:"Scope"`
Timestep int `json:"Timestep"`
Series []*MetricSeries `json:"Series"`
Unit string `json:"unit"`
Scope MetricScope `json:"scope"`
Timestep int `json:"timestep"`
Series []Series `json:"series"`
StatsSeries *StatsSeries `json:"statisticsSeries,omitempty"`
}
type Series struct {
Hostname string `json:"hostname"`
Id *int `json:"id,omitempty"`
Statistics *MetricStatistics `json:"statistics"`
Data []Float `json:"data"`
}
type MetricStatistics struct {
Avg float64 `json:"avg"`
Min float64 `json:"min"`
Max float64 `json:"max"`
}
type StatsSeries struct {
Mean []Float `json:"mean,omitempty"`
Min []Float `json:"min,omitempty"`
Max []Float `json:"max,omitempty"`
Percentiles map[int][]Float `json:"percentiles,omitempty"`
}
type MetricScope string
@ -39,61 +59,3 @@ func (e *MetricScope) UnmarshalGQL(v interface{}) error {
func (e MetricScope) MarshalGQL(w io.Writer) {
fmt.Fprintf(w, "\"%s\"", e)
}
type MetricStatistics struct {
Avg float64 `json:"Avg"`
Min float64 `json:"Min"`
Max float64 `json:"Max"`
}
type MetricSeries struct {
Hostname string `json:"Hostname"`
Id int `json:"Id"`
Statistics *MetricStatistics `json:"Statistics"`
Data []Float `json:"Data"`
}
type JobMetaStatistics struct {
Unit string `json:"Unit"`
Avg float64 `json:"Avg"`
Min float64 `json:"Min"`
Max float64 `json:"Max"`
}
type Accelerator struct {
ID int `json:"Id"`
Type string `json:"Type"`
Model string `json:"Model"`
}
type JobResource struct {
Hostname string `json:"Hostname"`
HWThreads []int `json:"HWThreads,omitempty"`
Accelerators []Accelerator `json:"Accelerators,omitempty"`
}
// Format of `meta.json` files.
type JobMeta struct {
JobId int64 `json:"JobId"`
User string `json:"User"`
Project string `json:"Project"`
Cluster string `json:"Cluster"`
NumNodes int `json:"NumNodes"`
NumHWThreads int `json:"NumHWThreads"`
NumAcc int `json:"NumAcc"`
Exclusive int8 `json:"Exclusive"`
MonitoringStatus int8 `json:"MonitoringStatus"`
SMT int8 `json:"SMT"`
Partition string `json:"Partition"`
ArrayJobId int `json:"ArrayJobId"`
JobState string `json:"JobState"`
StartTime int64 `json:"StartTime"`
Duration int64 `json:"Duration"`
Resources []*JobResource `json:"Resources"`
MetaData string `json:"MetaData"`
Tags []struct {
Name string `json:"Name"`
Type string `json:"Type"`
} `json:"Tags"`
Statistics map[string]*JobMetaStatistics `json:"Statistics"`
}

View File

@ -176,9 +176,8 @@ func main() {
resolver := &graph.Resolver{DB: db}
graphQLEndpoint := handler.NewDefaultServer(generated.NewExecutableSchema(generated.Config{Resolvers: resolver}))
graphQLPlayground := playground.Handler("GraphQL playground", "/query")
restApi := &api.RestApi{
api := &api.RestApi{
DB: db,
Resolver: resolver,
AsyncArchiving: programConfig.AsyncArchiving,
}
@ -235,7 +234,7 @@ func main() {
})
monitoringRoutes(secured, resolver)
restApi.MountRoutes(secured)
api.MountRoutes(secured)
r.PathPrefix("/").Handler(http.FileServer(http.Dir(programConfig.StaticFiles)))
handler := handlers.CORS(

View File

@ -35,7 +35,7 @@
<table class="table">
<thead>
<tr>
<th>Name/ID</th>
<th>Name</th>
<th>Jobs</th>
<th>System View</th>
<th>Analysis View</th>
@ -44,10 +44,10 @@
<tbody>
{{range .Infos.clusters}}
<tr>
<td>{{.ClusterID}}</td>
<td><a href="/monitoring/jobs/?cluster={{.ClusterID}}">Jobs</a></td>
<td><a href="/monitoring/systems/?cluster={{.ClusterID}}">System View</a></td>
<td><a href="/monitoring/analysis/?cluster={{.ClusterID}}">Analysis View</a></td>
<td>{{.Name}}</td>
<td><a href="/monitoring/jobs/?cluster={{.Name}}">Jobs</a></td>
<td><a href="/monitoring/systems/?cluster={{.Name}}">System View</a></td>
<td><a href="/monitoring/analysis/?cluster={{.Name}}">Analysis View</a></td>
</tr>
{{end}}
</tbody>