mirror of
https://github.com/ClusterCockpit/cc-backend
synced 2024-11-10 08:57:25 +01:00
all schemas new
This commit is contained in:
parent
89333666b3
commit
5403177edc
@ -1,5 +1,7 @@
|
||||
# ClusterCockpit with a Golang backend
|
||||
|
||||
__*DOES NOT WORK WITH CURRENT FRONTEND*__
|
||||
|
||||
[![Build](https://github.com/ClusterCockpit/cc-jobarchive/actions/workflows/test.yml/badge.svg)](https://github.com/ClusterCockpit/cc-jobarchive/actions/workflows/test.yml)
|
||||
|
||||
### Run server
|
||||
@ -11,11 +13,6 @@ git clone --recursive git@github.com:ClusterCockpit/cc-jobarchive.git
|
||||
# Prepare frontend
|
||||
cd ./cc-jobarchive/frontend
|
||||
yarn install
|
||||
export CCFRONTEND_ROLLUP_INTRO='
|
||||
const JOBVIEW_URL = job => `/monitoring/job/${job.id}`;
|
||||
const USERVIEW_URL = userId => `/monitoring/user/${userId}`;
|
||||
const TAG_URL = tag => `/monitoring/jobs/?tag=${tag.id}`;
|
||||
'
|
||||
yarn build
|
||||
|
||||
cd ..
|
||||
|
143
api/rest.go
143
api/rest.go
@ -2,17 +2,15 @@ package api
|
||||
|
||||
import (
|
||||
"context"
|
||||
"database/sql"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"log"
|
||||
"net/http"
|
||||
"strings"
|
||||
|
||||
"github.com/ClusterCockpit/cc-jobarchive/config"
|
||||
"github.com/ClusterCockpit/cc-jobarchive/graph"
|
||||
"github.com/ClusterCockpit/cc-jobarchive/graph/model"
|
||||
"github.com/ClusterCockpit/cc-jobarchive/metricdata"
|
||||
"github.com/ClusterCockpit/cc-jobarchive/schema"
|
||||
sq "github.com/Masterminds/squirrel"
|
||||
"github.com/gorilla/mux"
|
||||
"github.com/jmoiron/sqlx"
|
||||
@ -33,18 +31,6 @@ func (api *RestApi) MountRoutes(r *mux.Router) {
|
||||
r.HandleFunc("/api/jobs/tag_job/{id}", api.tagJob).Methods(http.MethodPost, http.MethodPatch)
|
||||
}
|
||||
|
||||
// TODO/FIXME: UPDATE API!
|
||||
type StartJobApiRequest struct {
|
||||
JobId int64 `json:"jobId"`
|
||||
UserId string `json:"userId"`
|
||||
ClusterId string `json:"clusterId"`
|
||||
StartTime int64 `json:"startTime"`
|
||||
MetaData string `json:"metaData"`
|
||||
ProjectId string `json:"projectId"`
|
||||
Nodes []string `json:"nodes"`
|
||||
NodeList string `json:"nodeList"`
|
||||
}
|
||||
|
||||
type StartJobApiRespone struct {
|
||||
DBID int64 `json:"id"`
|
||||
}
|
||||
@ -53,15 +39,12 @@ type StopJobApiRequest struct {
|
||||
// JobId, ClusterId and StartTime are optional.
|
||||
// They are only used if no database id was provided.
|
||||
JobId *string `json:"jobId"`
|
||||
ClusterId *string `json:"clusterId"`
|
||||
Cluster *string `json:"clusterId"`
|
||||
StartTime *int64 `json:"startTime"`
|
||||
|
||||
// Payload
|
||||
StopTime int64 `json:"stopTime"`
|
||||
}
|
||||
|
||||
type StopJobApiRespone struct {
|
||||
DBID string `json:"id"`
|
||||
State schema.JobState `json:"jobState"`
|
||||
}
|
||||
|
||||
type TagJobApiRequest []*struct {
|
||||
@ -110,7 +93,7 @@ func (api *RestApi) tagJob(rw http.ResponseWriter, r *http.Request) {
|
||||
}
|
||||
|
||||
for _, tag := range req {
|
||||
var tagId string
|
||||
var tagId int64
|
||||
if err := sq.Select("id").From("tag").
|
||||
Where("tag.tag_type = ?", tag.Type).Where("tag.tag_name = ?", tag.Name).
|
||||
RunWith(api.DB).QueryRow().Scan(&tagId); err != nil {
|
||||
@ -123,10 +106,10 @@ func (api *RestApi) tagJob(rw http.ResponseWriter, r *http.Request) {
|
||||
return
|
||||
}
|
||||
|
||||
job.Tags = append(job.Tags, &model.JobTag{
|
||||
job.Tags = append(job.Tags, &schema.Tag{
|
||||
ID: tagId,
|
||||
TagType: tag.Type,
|
||||
TagName: tag.Name,
|
||||
Type: tag.Type,
|
||||
Name: tag.Name,
|
||||
})
|
||||
}
|
||||
|
||||
@ -136,31 +119,25 @@ func (api *RestApi) tagJob(rw http.ResponseWriter, r *http.Request) {
|
||||
}
|
||||
|
||||
func (api *RestApi) startJob(rw http.ResponseWriter, r *http.Request) {
|
||||
req := StartJobApiRequest{}
|
||||
req := schema.JobMeta{BaseJob: schema.JobDefaults}
|
||||
if err := json.NewDecoder(r.Body).Decode(&req); err != nil {
|
||||
http.Error(rw, err.Error(), http.StatusBadRequest)
|
||||
return
|
||||
}
|
||||
|
||||
if config.GetClusterConfig(req.ClusterId) == nil {
|
||||
http.Error(rw, fmt.Sprintf("cluster '%s' does not exist", req.ClusterId), http.StatusBadRequest)
|
||||
if config.GetClusterConfig(req.Cluster) == nil {
|
||||
http.Error(rw, fmt.Sprintf("cluster '%s' does not exist", req.Cluster), http.StatusBadRequest)
|
||||
return
|
||||
}
|
||||
|
||||
if req.Nodes == nil {
|
||||
req.Nodes = strings.Split(req.NodeList, "|")
|
||||
if len(req.Nodes) == 1 {
|
||||
req.Nodes = strings.Split(req.NodeList, ",")
|
||||
}
|
||||
}
|
||||
if len(req.Nodes) == 0 || len(req.Nodes[0]) == 0 || len(req.UserId) == 0 {
|
||||
if len(req.Resources) == 0 || len(req.User) == 0 || req.NumNodes == 0 {
|
||||
http.Error(rw, "required fields are missing", http.StatusBadRequest)
|
||||
return
|
||||
}
|
||||
|
||||
// Check if combination of (job_id, cluster_id, start_time) already exists:
|
||||
rows, err := api.DB.Query(`SELECT job.id FROM job WHERE job.job_id = ? AND job.cluster_id = ? AND job.start_time = ?`,
|
||||
req.JobId, req.ClusterId, req.StartTime)
|
||||
rows, err := api.DB.Query(`SELECT job.id FROM job WHERE job.job_id = ? AND job.cluster = ? AND job.start_time = ?`,
|
||||
req.JobID, req.Cluster, req.StartTime)
|
||||
if err != nil {
|
||||
http.Error(rw, err.Error(), http.StatusInternalServerError)
|
||||
return
|
||||
@ -173,9 +150,12 @@ func (api *RestApi) startJob(rw http.ResponseWriter, r *http.Request) {
|
||||
return
|
||||
}
|
||||
|
||||
res, err := api.DB.Exec(
|
||||
`INSERT INTO job (job_id, user_id, project_id, cluster_id, start_time, duration, job_state, num_nodes, node_list, metadata) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?);`,
|
||||
req.JobId, req.UserId, req.ProjectId, req.ClusterId, req.StartTime, 0, model.JobStateRunning, len(req.Nodes), strings.Join(req.Nodes, ","), req.MetaData)
|
||||
req.RawResources, err = json.Marshal(req.Resources)
|
||||
if err != nil {
|
||||
log.Fatal(err)
|
||||
}
|
||||
|
||||
res, err := api.DB.NamedExec(schema.JobInsertStmt, req)
|
||||
if err != nil {
|
||||
http.Error(rw, err.Error(), http.StatusInternalServerError)
|
||||
return
|
||||
@ -187,7 +167,7 @@ func (api *RestApi) startJob(rw http.ResponseWriter, r *http.Request) {
|
||||
return
|
||||
}
|
||||
|
||||
log.Printf("new job (id: %d): clusterId=%s, jobId=%d, userId=%s, startTime=%d, nodes=%v\n", id, req.ClusterId, req.JobId, req.UserId, req.StartTime, req.Nodes)
|
||||
log.Printf("new job (id: %d): cluster=%s, jobId=%d, user=%s, startTime=%d\n", id, req.Cluster, req.JobID, req.User, req.StartTime)
|
||||
rw.Header().Add("Content-Type", "application/json")
|
||||
rw.WriteHeader(http.StatusCreated)
|
||||
json.NewEncoder(rw).Encode(StartJobApiRespone{
|
||||
@ -203,66 +183,89 @@ func (api *RestApi) stopJob(rw http.ResponseWriter, r *http.Request) {
|
||||
}
|
||||
|
||||
var err error
|
||||
var job *model.Job
|
||||
var sql string
|
||||
var args []interface{}
|
||||
id, ok := mux.Vars(r)["id"]
|
||||
if ok {
|
||||
job, err = graph.ScanJob(sq.Select(graph.JobTableCols...).From("job").Where("job.id = ?", id).RunWith(api.DB).QueryRow())
|
||||
sql, args, err = sq.Select(schema.JobColumns...).From("job").Where("job.id = ?", id).ToSql()
|
||||
} else {
|
||||
job, err = graph.ScanJob(sq.Select(graph.JobTableCols...).From("job").
|
||||
sql, args, err = sq.Select(schema.JobColumns...).From("job").
|
||||
Where("job.job_id = ?", req.JobId).
|
||||
Where("job.cluster_id = ?", req.ClusterId).
|
||||
Where("job.start_time = ?", req.StartTime).
|
||||
RunWith(api.DB).QueryRow())
|
||||
Where("job.cluster = ?", req.Cluster).
|
||||
Where("job.start_time = ?", req.StartTime).ToSql()
|
||||
}
|
||||
if err != nil {
|
||||
http.Error(rw, err.Error(), http.StatusNotFound)
|
||||
http.Error(rw, err.Error(), http.StatusBadRequest)
|
||||
return
|
||||
}
|
||||
job, err := schema.ScanJob(api.DB.QueryRowx(sql, args...))
|
||||
if err != nil {
|
||||
http.Error(rw, err.Error(), http.StatusBadRequest)
|
||||
return
|
||||
}
|
||||
|
||||
if job == nil || job.StartTime.Unix() >= req.StopTime || job.State != model.JobStateRunning {
|
||||
if job == nil || job.StartTime.Unix() >= req.StopTime || job.State != schema.JobStateRunning {
|
||||
http.Error(rw, "stop_time must be larger than start_time and only running jobs can be stopped", http.StatusBadRequest)
|
||||
return
|
||||
}
|
||||
|
||||
doArchiving := func(job *model.Job, ctx context.Context) error {
|
||||
job.Duration = int(req.StopTime - job.StartTime.Unix())
|
||||
if req.State != "" && !req.State.Valid() {
|
||||
http.Error(rw, fmt.Sprintf("invalid job state: '%s'", req.State), http.StatusBadRequest)
|
||||
return
|
||||
} else {
|
||||
req.State = schema.JobStateCompleted
|
||||
}
|
||||
|
||||
doArchiving := func(job *schema.Job, ctx context.Context) error {
|
||||
job.Duration = int32(req.StopTime - job.StartTime.Unix())
|
||||
jobMeta, err := metricdata.ArchiveJob(job, ctx)
|
||||
if err != nil {
|
||||
log.Printf("archiving job (id: %s) failed: %s\n", job.ID, err.Error())
|
||||
log.Printf("archiving job (dbid: %d) failed: %s\n", job.ID, err.Error())
|
||||
return err
|
||||
}
|
||||
|
||||
getAvg := func(metric string) sql.NullFloat64 {
|
||||
stats, ok := jobMeta.Statistics[metric]
|
||||
if !ok {
|
||||
return sql.NullFloat64{Valid: false}
|
||||
stmt := sq.Update("job").
|
||||
Set("job_state", req.State).
|
||||
Set("duration", job.Duration).
|
||||
Where("job.id = ?", job.ID)
|
||||
|
||||
for metric, stats := range jobMeta.Statistics {
|
||||
switch metric {
|
||||
case "flops_any":
|
||||
stmt = stmt.Set("flops_any_avg", stats.Avg)
|
||||
case "mem_used":
|
||||
stmt = stmt.Set("mem_used_max", stats.Max)
|
||||
case "mem_bw":
|
||||
stmt = stmt.Set("mem_bw_avg", stats.Avg)
|
||||
case "load":
|
||||
stmt = stmt.Set("load_avg", stats.Avg)
|
||||
case "net_bw":
|
||||
stmt = stmt.Set("net_bw_avg", stats.Avg)
|
||||
case "file_bw":
|
||||
stmt = stmt.Set("file_bw_avg", stats.Avg)
|
||||
}
|
||||
return sql.NullFloat64{Valid: true, Float64: stats.Avg}
|
||||
}
|
||||
|
||||
if _, err := api.DB.Exec(
|
||||
`UPDATE job SET
|
||||
job_state = ?, duration = ?,
|
||||
flops_any_avg = ?, mem_bw_avg = ?, net_bw_avg = ?, file_bw_avg = ?, load_avg = ?
|
||||
WHERE job.id = ?`,
|
||||
model.JobStateCompleted, job.Duration,
|
||||
getAvg("flops_any"), getAvg("mem_bw"), getAvg("net_bw"), getAvg("file_bw"), getAvg("load"),
|
||||
job.ID); err != nil {
|
||||
log.Printf("archiving job (id: %s) failed: %s\n", job.ID, err.Error())
|
||||
sql, args, err := stmt.ToSql()
|
||||
if err != nil {
|
||||
log.Printf("archiving job (dbid: %d) failed: %s\n", job.ID, err.Error())
|
||||
return err
|
||||
}
|
||||
|
||||
log.Printf("job stopped and archived (id: %s)\n", job.ID)
|
||||
if _, err := api.DB.Exec(sql, args...); err != nil {
|
||||
log.Printf("archiving job (dbid: %d) failed: %s\n", job.ID, err.Error())
|
||||
return err
|
||||
}
|
||||
|
||||
log.Printf("job stopped and archived (dbid: %d)\n", job.ID)
|
||||
return nil
|
||||
}
|
||||
|
||||
log.Printf("archiving job... (id: %s): clusterId=%s, jobId=%d, userId=%s, startTime=%s\n", job.ID, job.Cluster, job.JobID, job.User, job.StartTime)
|
||||
log.Printf("archiving job... (dbid: %d): cluster=%s, jobId=%d, user=%s, startTime=%s\n", job.ID, job.Cluster, job.JobID, job.User, job.StartTime)
|
||||
if api.AsyncArchiving {
|
||||
rw.Header().Add("Content-Type", "application/json")
|
||||
rw.WriteHeader(http.StatusOK)
|
||||
json.NewEncoder(rw).Encode(StopJobApiRespone{
|
||||
DBID: job.ID,
|
||||
})
|
||||
json.NewEncoder(rw).Encode(job)
|
||||
go doArchiving(job, context.Background())
|
||||
} else {
|
||||
err := doArchiving(job, r.Context())
|
||||
|
@ -46,8 +46,8 @@ func Init(usersdb *sqlx.DB, authEnabled bool, uiConfig map[string]interface{}, j
|
||||
cluster.FilterRanges.StartTime.To = time.Unix(0, 0)
|
||||
}
|
||||
|
||||
if cluster.ClusterID != de.Name() {
|
||||
return fmt.Errorf("the file '%s/cluster.json' contains the clusterId '%s'", de.Name(), cluster.ClusterID)
|
||||
if cluster.Name != de.Name() {
|
||||
return fmt.Errorf("the file '%s/cluster.json' contains the clusterId '%s'", de.Name(), cluster.Name)
|
||||
}
|
||||
|
||||
Clusters = append(Clusters, &cluster)
|
||||
@ -149,7 +149,7 @@ func ServeConfig(rw http.ResponseWriter, r *http.Request) {
|
||||
|
||||
func GetClusterConfig(cluster string) *model.Cluster {
|
||||
for _, c := range Clusters {
|
||||
if c.ClusterID == cluster {
|
||||
if c.Name == cluster {
|
||||
return c
|
||||
}
|
||||
}
|
||||
@ -158,7 +158,7 @@ func GetClusterConfig(cluster string) *model.Cluster {
|
||||
|
||||
func GetMetricConfig(cluster, metric string) *model.MetricConfig {
|
||||
for _, c := range Clusters {
|
||||
if c.ClusterID == cluster {
|
||||
if c.Name == cluster {
|
||||
for _, m := range c.MetricConfig {
|
||||
if m.Name == metric {
|
||||
return m
|
||||
|
28
gqlgen.yml
28
gqlgen.yml
@ -55,21 +55,19 @@ models:
|
||||
- github.com/99designs/gqlgen/graphql.Int64
|
||||
- github.com/99designs/gqlgen/graphql.Int32
|
||||
Job:
|
||||
model: "github.com/ClusterCockpit/cc-jobarchive/schema.Job"
|
||||
fields:
|
||||
Tags:
|
||||
tags:
|
||||
resolver: true
|
||||
JobMetric:
|
||||
model: "github.com/ClusterCockpit/cc-jobarchive/schema.JobMetric"
|
||||
JobMetricSeries:
|
||||
model: "github.com/ClusterCockpit/cc-jobarchive/schema.MetricSeries"
|
||||
JobMetricStatistics:
|
||||
model: "github.com/ClusterCockpit/cc-jobarchive/schema.MetricStatistics"
|
||||
NullableFloat:
|
||||
model: "github.com/ClusterCockpit/cc-jobarchive/schema.Float"
|
||||
JobMetricScope:
|
||||
model: "github.com/ClusterCockpit/cc-jobarchive/schema.MetricScope"
|
||||
JobResource:
|
||||
model: "github.com/ClusterCockpit/cc-jobarchive/schema.JobResource"
|
||||
Accelerator:
|
||||
model: "github.com/ClusterCockpit/cc-jobarchive/schema.Accelerator"
|
||||
NullableFloat: { model: "github.com/ClusterCockpit/cc-jobarchive/schema.Float" }
|
||||
MetricScope: { model: "github.com/ClusterCockpit/cc-jobarchive/schema.MetricScope" }
|
||||
JobStatistics: { model: "github.com/ClusterCockpit/cc-jobarchive/schema.JobStatistics" }
|
||||
Tag: { model: "github.com/ClusterCockpit/cc-jobarchive/schema.Tag" }
|
||||
Resource: { model: "github.com/ClusterCockpit/cc-jobarchive/schema.Resource" }
|
||||
JobState: { model: "github.com/ClusterCockpit/cc-jobarchive/schema.JobState" }
|
||||
JobMetric: { model: "github.com/ClusterCockpit/cc-jobarchive/schema.JobMetric" }
|
||||
Series: { model: "github.com/ClusterCockpit/cc-jobarchive/schema.Series" }
|
||||
MetricStatistics: { model: "github.com/ClusterCockpit/cc-jobarchive/schema.MetricStatistics" }
|
||||
StatsSeries: { model: "github.com/ClusterCockpit/cc-jobarchive/schema.StatsSeries" }
|
||||
|
||||
|
||||
|
File diff suppressed because it is too large
Load Diff
@ -1,26 +1,17 @@
|
||||
package model
|
||||
|
||||
// Go look at `gqlgen.yml` and the schema package for other non-generated models.
|
||||
|
||||
type JobTag struct {
|
||||
ID string `json:"id" db:"id"`
|
||||
TagType string `json:"tagType" db:"tag_type"`
|
||||
TagName string `json:"tagName" db:"tag_name"`
|
||||
}
|
||||
|
||||
type Cluster struct {
|
||||
ClusterID string `json:"clusterID"`
|
||||
ProcessorType string `json:"processorType"`
|
||||
SocketsPerNode int `json:"socketsPerNode"`
|
||||
CoresPerSocket int `json:"coresPerSocket"`
|
||||
ThreadsPerCore int `json:"threadsPerCore"`
|
||||
FlopRateScalar int `json:"flopRateScalar"`
|
||||
FlopRateSimd int `json:"flopRateSimd"`
|
||||
MemoryBandwidth int `json:"memoryBandwidth"`
|
||||
Name string `json:"name"`
|
||||
MetricConfig []*MetricConfig `json:"metricConfig"`
|
||||
FilterRanges *FilterRanges `json:"filterRanges"`
|
||||
MetricDataRepository *struct {
|
||||
Partitions []*Partition `json:"partitions"`
|
||||
|
||||
// NOT part of the API:
|
||||
MetricDataRepository *MetricDataRepository `json:"metricDataRepository"`
|
||||
}
|
||||
|
||||
type MetricDataRepository struct {
|
||||
Kind string `json:"kind"`
|
||||
Url string `json:"url"`
|
||||
} `json:"metricDataRepository"`
|
||||
Token string `json:"token"`
|
||||
}
|
||||
|
@ -11,6 +11,12 @@ import (
|
||||
"github.com/ClusterCockpit/cc-jobarchive/schema"
|
||||
)
|
||||
|
||||
type Accelerator struct {
|
||||
ID string `json:"id"`
|
||||
Type string `json:"type"`
|
||||
Model string `json:"model"`
|
||||
}
|
||||
|
||||
type FilterRanges struct {
|
||||
Duration *IntRangeOutput `json:"duration"`
|
||||
NumNodes *IntRangeOutput `json:"numNodes"`
|
||||
@ -37,33 +43,6 @@ type IntRangeOutput struct {
|
||||
To int `json:"to"`
|
||||
}
|
||||
|
||||
type Job struct {
|
||||
ID string `json:"Id"`
|
||||
JobID int `json:"JobId"`
|
||||
User string `json:"User"`
|
||||
Project string `json:"Project"`
|
||||
Cluster string `json:"Cluster"`
|
||||
StartTime time.Time `json:"StartTime"`
|
||||
Duration int `json:"Duration"`
|
||||
NumNodes int `json:"NumNodes"`
|
||||
NumHWThreads int `json:"NumHWThreads"`
|
||||
NumAcc int `json:"NumAcc"`
|
||||
Smt int `json:"SMT"`
|
||||
Exclusive int `json:"Exclusive"`
|
||||
Partition string `json:"Partition"`
|
||||
ArrayJobID int `json:"ArrayJobId"`
|
||||
MonitoringStatus int `json:"MonitoringStatus"`
|
||||
State JobState `json:"State"`
|
||||
Tags []*JobTag `json:"Tags"`
|
||||
Resources []*schema.JobResource `json:"Resources"`
|
||||
LoadAvg *float64 `json:"LoadAvg"`
|
||||
MemUsedMax *float64 `json:"MemUsedMax"`
|
||||
FlopsAnyAvg *float64 `json:"FlopsAnyAvg"`
|
||||
MemBwAvg *float64 `json:"MemBwAvg"`
|
||||
NetBwAvg *float64 `json:"NetBwAvg"`
|
||||
FileBwAvg *float64 `json:"FileBwAvg"`
|
||||
}
|
||||
|
||||
type JobFilter struct {
|
||||
Tags []string `json:"tags"`
|
||||
JobID *StringInput `json:"jobId"`
|
||||
@ -73,7 +52,7 @@ type JobFilter struct {
|
||||
Duration *IntRange `json:"duration"`
|
||||
NumNodes *IntRange `json:"numNodes"`
|
||||
StartTime *TimeRange `json:"startTime"`
|
||||
JobState []JobState `json:"jobState"`
|
||||
State []schema.JobState `json:"state"`
|
||||
FlopsAnyAvg *FloatRange `json:"flopsAnyAvg"`
|
||||
MemBwAvg *FloatRange `json:"memBwAvg"`
|
||||
LoadAvg *FloatRange `json:"loadAvg"`
|
||||
@ -82,11 +61,22 @@ type JobFilter struct {
|
||||
|
||||
type JobMetricWithName struct {
|
||||
Name string `json:"name"`
|
||||
Metric *schema.JobMetric `json:"metric"`
|
||||
Node *schema.JobMetric `json:"node"`
|
||||
Socket *schema.JobMetric `json:"socket"`
|
||||
MemoryDomain *schema.JobMetric `json:"memoryDomain"`
|
||||
Core *schema.JobMetric `json:"core"`
|
||||
Hwthread *schema.JobMetric `json:"hwthread"`
|
||||
}
|
||||
|
||||
type JobResource struct {
|
||||
Hostname string `json:"hostname"`
|
||||
Hwthreads []int `json:"hwthreads"`
|
||||
Accelerators []int `json:"accelerators"`
|
||||
Configuration *string `json:"configuration"`
|
||||
}
|
||||
|
||||
type JobResultList struct {
|
||||
Items []*Job `json:"items"`
|
||||
Items []*schema.Job `json:"items"`
|
||||
Offset *int `json:"offset"`
|
||||
Limit *int `json:"limit"`
|
||||
Count *int `json:"count"`
|
||||
@ -103,14 +93,14 @@ type JobsStatistics struct {
|
||||
}
|
||||
|
||||
type MetricConfig struct {
|
||||
Name string `json:"Name"`
|
||||
Unit string `json:"Unit"`
|
||||
Timestep int `json:"Timestep"`
|
||||
Peak int `json:"Peak"`
|
||||
Normal int `json:"Normal"`
|
||||
Caution int `json:"Caution"`
|
||||
Alert int `json:"Alert"`
|
||||
Scope string `json:"Scope"`
|
||||
Name string `json:"name"`
|
||||
Unit string `json:"unit"`
|
||||
Scope string `json:"scope"`
|
||||
Timestep int `json:"timestep"`
|
||||
Peak float64 `json:"Peak"`
|
||||
Normal float64 `json:"Normal"`
|
||||
Caution float64 `json:"Caution"`
|
||||
Alert float64 `json:"Alert"`
|
||||
}
|
||||
|
||||
type MetricFootprints struct {
|
||||
@ -138,6 +128,18 @@ type PageRequest struct {
|
||||
Page int `json:"page"`
|
||||
}
|
||||
|
||||
type Partition struct {
|
||||
Name string `json:"name"`
|
||||
ProcessorType string `json:"processorType"`
|
||||
SocketsPerNode int `json:"socketsPerNode"`
|
||||
CoresPerSocket int `json:"coresPerSocket"`
|
||||
ThreadsPerCore int `json:"threadsPerCore"`
|
||||
FlopRateScalar int `json:"flopRateScalar"`
|
||||
FlopRateSimd int `json:"flopRateSimd"`
|
||||
MemoryBandwidth int `json:"memoryBandwidth"`
|
||||
Topology *Topology `json:"topology"`
|
||||
}
|
||||
|
||||
type StringInput struct {
|
||||
Eq *string `json:"eq"`
|
||||
Contains *string `json:"contains"`
|
||||
@ -155,6 +157,15 @@ type TimeRangeOutput struct {
|
||||
To time.Time `json:"to"`
|
||||
}
|
||||
|
||||
type Topology struct {
|
||||
Node []int `json:"node"`
|
||||
Socket [][]int `json:"socket"`
|
||||
MemoryDomain [][]int `json:"memoryDomain"`
|
||||
Die [][]int `json:"die"`
|
||||
Core [][]int `json:"core"`
|
||||
Accelerators []*Accelerator `json:"accelerators"`
|
||||
}
|
||||
|
||||
type Aggregate string
|
||||
|
||||
const (
|
||||
@ -198,55 +209,6 @@ func (e Aggregate) MarshalGQL(w io.Writer) {
|
||||
fmt.Fprint(w, strconv.Quote(e.String()))
|
||||
}
|
||||
|
||||
type JobState string
|
||||
|
||||
const (
|
||||
JobStateRunning JobState = "running"
|
||||
JobStateCompleted JobState = "completed"
|
||||
JobStateFailed JobState = "failed"
|
||||
JobStateCanceled JobState = "canceled"
|
||||
JobStateStopped JobState = "stopped"
|
||||
JobStateTimeout JobState = "timeout"
|
||||
)
|
||||
|
||||
var AllJobState = []JobState{
|
||||
JobStateRunning,
|
||||
JobStateCompleted,
|
||||
JobStateFailed,
|
||||
JobStateCanceled,
|
||||
JobStateStopped,
|
||||
JobStateTimeout,
|
||||
}
|
||||
|
||||
func (e JobState) IsValid() bool {
|
||||
switch e {
|
||||
case JobStateRunning, JobStateCompleted, JobStateFailed, JobStateCanceled, JobStateStopped, JobStateTimeout:
|
||||
return true
|
||||
}
|
||||
return false
|
||||
}
|
||||
|
||||
func (e JobState) String() string {
|
||||
return string(e)
|
||||
}
|
||||
|
||||
func (e *JobState) UnmarshalGQL(v interface{}) error {
|
||||
str, ok := v.(string)
|
||||
if !ok {
|
||||
return fmt.Errorf("enums must be strings")
|
||||
}
|
||||
|
||||
*e = JobState(str)
|
||||
if !e.IsValid() {
|
||||
return fmt.Errorf("%s is not a valid JobState", str)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func (e JobState) MarshalGQL(w io.Writer) {
|
||||
fmt.Fprint(w, strconv.Quote(e.String()))
|
||||
}
|
||||
|
||||
type SortDirectionEnum string
|
||||
|
||||
const (
|
||||
|
@ -2,15 +2,14 @@ package graph
|
||||
|
||||
import (
|
||||
"context"
|
||||
"encoding/json"
|
||||
"errors"
|
||||
"fmt"
|
||||
"regexp"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"github.com/ClusterCockpit/cc-jobarchive/auth"
|
||||
"github.com/ClusterCockpit/cc-jobarchive/graph/model"
|
||||
"github.com/ClusterCockpit/cc-jobarchive/schema"
|
||||
sq "github.com/Masterminds/squirrel"
|
||||
"github.com/jmoiron/sqlx"
|
||||
)
|
||||
@ -23,44 +22,9 @@ type Resolver struct {
|
||||
DB *sqlx.DB
|
||||
}
|
||||
|
||||
var JobTableCols []string = []string{
|
||||
"id", "job_id", "cluster", "start_time",
|
||||
"user", "project", "partition", "array_job_id", "duration", "job_state", "resources",
|
||||
"num_nodes", "num_hwthreads", "num_acc", "smt", "exclusive", "monitoring_status",
|
||||
"load_avg", "mem_used_max", "flops_any_avg", "mem_bw_avg", "net_bw_avg", "file_bw_avg",
|
||||
}
|
||||
|
||||
type Scannable interface {
|
||||
Scan(dest ...interface{}) error
|
||||
}
|
||||
|
||||
// Helper function for scanning jobs with the `jobTableCols` columns selected.
|
||||
func ScanJob(row Scannable) (*model.Job, error) {
|
||||
job := &model.Job{}
|
||||
|
||||
var rawResources []byte
|
||||
if err := row.Scan(
|
||||
&job.ID, &job.JobID, &job.Cluster, &job.StartTime,
|
||||
&job.User, &job.Project, &job.Partition, &job.ArrayJobID, &job.Duration, &job.State, &rawResources,
|
||||
&job.NumNodes, &job.NumHWThreads, &job.NumAcc, &job.Smt, &job.Exclusive, &job.MonitoringStatus,
|
||||
&job.LoadAvg, &job.MemUsedMax, &job.FlopsAnyAvg, &job.MemBwAvg, &job.NetBwAvg, &job.FileBwAvg); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
if err := json.Unmarshal(rawResources, &job.Resources); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
if job.Duration == 0 && job.State == model.JobStateRunning {
|
||||
job.Duration = int(time.Since(job.StartTime).Seconds())
|
||||
}
|
||||
|
||||
return job, nil
|
||||
}
|
||||
|
||||
// Helper function for the `jobs` GraphQL-Query. Is also used elsewhere when a list of jobs is needed.
|
||||
func (r *Resolver) queryJobs(ctx context.Context, filters []*model.JobFilter, page *model.PageRequest, order *model.OrderByInput) ([]*model.Job, int, error) {
|
||||
query := sq.Select(JobTableCols...).From("job")
|
||||
func (r *Resolver) queryJobs(ctx context.Context, filters []*model.JobFilter, page *model.PageRequest, order *model.OrderByInput) ([]*schema.Job, int, error) {
|
||||
query := sq.Select(schema.JobColumns...).From("job")
|
||||
query = securityCheck(ctx, query)
|
||||
|
||||
if order != nil {
|
||||
@ -85,33 +49,32 @@ func (r *Resolver) queryJobs(ctx context.Context, filters []*model.JobFilter, pa
|
||||
query = buildWhereClause(f, query)
|
||||
}
|
||||
|
||||
rows, err := query.RunWith(r.DB).Query()
|
||||
sql, args, err := query.ToSql()
|
||||
if err != nil {
|
||||
return nil, 0, err
|
||||
}
|
||||
defer rows.Close()
|
||||
|
||||
jobs := make([]*model.Job, 0, 50)
|
||||
rows, err := r.DB.Queryx(sql, args...)
|
||||
if err != nil {
|
||||
return nil, 0, err
|
||||
}
|
||||
|
||||
jobs := make([]*schema.Job, 0, 50)
|
||||
for rows.Next() {
|
||||
job, err := ScanJob(rows)
|
||||
job, err := schema.ScanJob(rows)
|
||||
if err != nil {
|
||||
return nil, 0, err
|
||||
}
|
||||
jobs = append(jobs, job)
|
||||
}
|
||||
|
||||
// count all jobs:
|
||||
query = sq.Select("count(*)").From("job")
|
||||
for _, f := range filters {
|
||||
query = buildWhereClause(f, query)
|
||||
}
|
||||
rows, err = query.RunWith(r.DB).Query()
|
||||
if err != nil {
|
||||
return nil, 0, err
|
||||
}
|
||||
defer rows.Close()
|
||||
var count int
|
||||
rows.Next()
|
||||
if err := rows.Scan(&count); err != nil {
|
||||
if err := query.RunWith(r.DB).Scan(&count); err != nil {
|
||||
return nil, 0, err
|
||||
}
|
||||
|
||||
@ -132,7 +95,7 @@ func securityCheck(ctx context.Context, query sq.SelectBuilder) sq.SelectBuilder
|
||||
return query.Where("job.user_id = ?", user.Username)
|
||||
}
|
||||
|
||||
// Build a sq.SelectBuilder out of a model.JobFilter.
|
||||
// Build a sq.SelectBuilder out of a schema.JobFilter.
|
||||
func buildWhereClause(filter *model.JobFilter, query sq.SelectBuilder) sq.SelectBuilder {
|
||||
if filter.Tags != nil {
|
||||
query = query.Join("jobtag ON jobtag.job_id = job.id").Where("jobtag.tag_id IN ?", filter.Tags)
|
||||
@ -155,8 +118,8 @@ func buildWhereClause(filter *model.JobFilter, query sq.SelectBuilder) sq.Select
|
||||
if filter.Duration != nil {
|
||||
query = buildIntCondition("job.duration", filter.Duration, query)
|
||||
}
|
||||
if filter.JobState != nil {
|
||||
query = query.Where("job.job_state IN ?", filter.JobState)
|
||||
if filter.State != nil {
|
||||
query = query.Where("job.job_state IN ?", filter.State)
|
||||
}
|
||||
if filter.NumNodes != nil {
|
||||
query = buildIntCondition("job.num_nodes", filter.NumNodes, query)
|
||||
|
@ -1,107 +1,122 @@
|
||||
scalar Time
|
||||
scalar NullableFloat
|
||||
scalar MetricScope
|
||||
scalar JobState
|
||||
|
||||
type Job {
|
||||
Id: ID! # Database ID, unique
|
||||
JobId: Int! # ID given to the job by the cluster scheduler
|
||||
User: String! # Username
|
||||
Project: String! # Project
|
||||
Cluster: String! # Name of the cluster this job was running on
|
||||
StartTime: Time! # RFC3339 formated string
|
||||
Duration: Int! # For running jobs, the time it has already run
|
||||
NumNodes: Int! # Number of nodes this job was running on
|
||||
NumHWThreads: Int!
|
||||
NumAcc: Int!
|
||||
id: ID!
|
||||
jobId: Int!
|
||||
user: String!
|
||||
project: String!
|
||||
cluster: String!
|
||||
startTime: Time!
|
||||
duration: Int!
|
||||
numNodes: Int!
|
||||
numHWThreads: Int!
|
||||
numAcc: Int!
|
||||
SMT: Int!
|
||||
Exclusive: Int!
|
||||
Partition: String!
|
||||
ArrayJobId: Int!
|
||||
MonitoringStatus: Int!
|
||||
State: JobState! # State of the job
|
||||
Tags: [JobTag!]! # List of tags this job has
|
||||
Resources: [JobResource!]! # List of hosts/hwthreads/gpus/...
|
||||
|
||||
# Will be null for running jobs.
|
||||
LoadAvg: Float
|
||||
MemUsedMax: Float
|
||||
FlopsAnyAvg: Float
|
||||
MemBwAvg: Float
|
||||
NetBwAvg: Float
|
||||
FileBwAvg: Float
|
||||
}
|
||||
|
||||
type JobResource {
|
||||
Hostname: String!
|
||||
HWThreads: [Int!]
|
||||
Accelerators: [Accelerator!]
|
||||
}
|
||||
|
||||
type Accelerator {
|
||||
Id: String!
|
||||
Type: String!
|
||||
Model: String!
|
||||
}
|
||||
|
||||
# TODO: Extend by more possible states?
|
||||
enum JobState {
|
||||
running
|
||||
completed
|
||||
failed
|
||||
canceled
|
||||
stopped
|
||||
timeout
|
||||
}
|
||||
|
||||
type JobTag {
|
||||
Id: ID! # Database ID, unique
|
||||
TagType: String! # Type
|
||||
TagName: String! # Name
|
||||
exclusive: Int!
|
||||
partition: String!
|
||||
arrayJobId: Int!
|
||||
monitoringStatus: Int!
|
||||
state: JobState!
|
||||
tags: [Tag!]!
|
||||
resources: [JobResource!]!
|
||||
}
|
||||
|
||||
type Cluster {
|
||||
ClusterID: String!
|
||||
ProcessorType: String!
|
||||
SocketsPerNode: Int!
|
||||
CoresPerSocket: Int!
|
||||
ThreadsPerCore: Int!
|
||||
FlopRateScalar: Int!
|
||||
FlopRateSimd: Int!
|
||||
MemoryBandwidth: Int!
|
||||
MetricConfig: [MetricConfig!]!
|
||||
FilterRanges: FilterRanges!
|
||||
name: String!
|
||||
metricConfig: [MetricConfig!]!
|
||||
filterRanges: FilterRanges!
|
||||
partitions: [Partition!]!
|
||||
}
|
||||
|
||||
type Partition {
|
||||
name: String!
|
||||
processorType: String!
|
||||
socketsPerNode: Int!
|
||||
coresPerSocket: Int!
|
||||
threadsPerCore: Int!
|
||||
flopRateScalar: Int!
|
||||
flopRateSimd: Int!
|
||||
memoryBandwidth: Int!
|
||||
topology: Topology!
|
||||
}
|
||||
|
||||
type Topology {
|
||||
node: [Int!]
|
||||
socket: [[Int!]!]
|
||||
memoryDomain: [[Int!]!]
|
||||
die: [[Int!]!]
|
||||
core: [[Int!]!]
|
||||
accelerators: [Accelerator!]
|
||||
}
|
||||
|
||||
type Accelerator {
|
||||
id: String!
|
||||
type: String!
|
||||
model: String!
|
||||
}
|
||||
|
||||
type MetricConfig {
|
||||
Name: String!
|
||||
Unit: String!
|
||||
Timestep: Int!
|
||||
Peak: Int!
|
||||
Normal: Int!
|
||||
Caution: Int!
|
||||
Alert: Int!
|
||||
Scope: String!
|
||||
name: String!
|
||||
unit: String!
|
||||
scope: String!
|
||||
timestep: Int!
|
||||
Peak: Float!
|
||||
Normal: Float!
|
||||
Caution: Float!
|
||||
Alert: Float!
|
||||
}
|
||||
|
||||
type JobMetric {
|
||||
Unit: String!
|
||||
Scope: JobMetricScope!
|
||||
Timestep: Int!
|
||||
Series: [JobMetricSeries!]!
|
||||
type Tag {
|
||||
id: ID!
|
||||
type: String!
|
||||
name: String!
|
||||
}
|
||||
|
||||
type JobMetricSeries {
|
||||
Hostname: String!
|
||||
Id: Int
|
||||
Statistics: JobMetricStatistics
|
||||
Data: [NullableFloat!]!
|
||||
}
|
||||
|
||||
type JobMetricStatistics {
|
||||
Avg: Float!
|
||||
Min: Float!
|
||||
Max: Float!
|
||||
type JobResource {
|
||||
hostname: String!
|
||||
hwthreads: [Int!]
|
||||
accelerators: [Int!]
|
||||
configuration: String
|
||||
}
|
||||
|
||||
type JobMetricWithName {
|
||||
name: String!
|
||||
metric: JobMetric!
|
||||
|
||||
node: JobMetric
|
||||
socket: JobMetric
|
||||
memoryDomain: JobMetric
|
||||
core: JobMetric
|
||||
hwthread: JobMetric
|
||||
}
|
||||
|
||||
type JobMetric {
|
||||
unit: String!
|
||||
scope: MetricScope!
|
||||
timestep: Int!
|
||||
series: [Series!]!
|
||||
statisticsSeries: [StatsSeries!]
|
||||
}
|
||||
|
||||
type Series {
|
||||
hostname: String!
|
||||
id: Int
|
||||
statistics: MetricStatistics
|
||||
data: [NullableFloat!]!
|
||||
}
|
||||
|
||||
type MetricStatistics {
|
||||
avg: Float!
|
||||
min: Float!
|
||||
max: Float!
|
||||
}
|
||||
|
||||
type StatsSeries {
|
||||
mean: [NullableFloat!]
|
||||
min: [NullableFloat!]
|
||||
max: [NullableFloat!]
|
||||
}
|
||||
|
||||
type MetricFootprints {
|
||||
@ -123,7 +138,7 @@ type NodeMetrics {
|
||||
|
||||
type Query {
|
||||
clusters: [Cluster!]! # List of all clusters
|
||||
tags: [JobTag!]! # List of all tags
|
||||
tags: [Tag!]! # List of all tags
|
||||
|
||||
job(id: ID!): Job
|
||||
jobMetrics(id: ID!, metrics: [String!]): [JobMetricWithName!]!
|
||||
@ -138,23 +153,16 @@ type Query {
|
||||
}
|
||||
|
||||
type Mutation {
|
||||
createTag(type: String!, name: String!): JobTag!
|
||||
createTag(type: String!, name: String!): Tag!
|
||||
deleteTag(id: ID!): ID!
|
||||
addTagsToJob(job: ID!, tagIds: [ID!]!): [JobTag!]!
|
||||
removeTagsFromJob(job: ID!, tagIds: [ID!]!): [JobTag!]!
|
||||
addTagsToJob(job: ID!, tagIds: [ID!]!): [Tag!]!
|
||||
removeTagsFromJob(job: ID!, tagIds: [ID!]!): [Tag!]!
|
||||
|
||||
updateConfiguration(name: String!, value: String!): String
|
||||
}
|
||||
|
||||
type IntRangeOutput {
|
||||
from: Int!
|
||||
to: Int!
|
||||
}
|
||||
|
||||
type TimeRangeOutput {
|
||||
from: Time!
|
||||
to: Time!
|
||||
}
|
||||
type IntRangeOutput { from: Int!, to: Int! }
|
||||
type TimeRangeOutput { from: Time!, to: Time! }
|
||||
|
||||
type FilterRanges {
|
||||
duration: IntRangeOutput!
|
||||
@ -171,7 +179,7 @@ input JobFilter {
|
||||
duration: IntRange
|
||||
numNodes: IntRange
|
||||
startTime: TimeRange
|
||||
jobState: [JobState!]
|
||||
state: [JobState!]
|
||||
flopsAnyAvg: FloatRange
|
||||
memBwAvg: FloatRange
|
||||
loadAvg: FloatRange
|
||||
@ -195,20 +203,9 @@ input StringInput {
|
||||
endsWith: String
|
||||
}
|
||||
|
||||
input IntRange {
|
||||
from: Int!
|
||||
to: Int!
|
||||
}
|
||||
|
||||
input FloatRange {
|
||||
from: Float!
|
||||
to: Float!
|
||||
}
|
||||
|
||||
input TimeRange {
|
||||
from: Time
|
||||
to: Time
|
||||
}
|
||||
input IntRange { from: Int!, to: Int! }
|
||||
input FloatRange { from: Float!, to: Float! }
|
||||
input TimeRange { from: Time, to: Time }
|
||||
|
||||
type JobResultList {
|
||||
items: [Job!]!
|
||||
@ -236,7 +233,3 @@ input PageRequest {
|
||||
itemsPerPage: Int!
|
||||
page: Int!
|
||||
}
|
||||
|
||||
scalar Time
|
||||
scalar NullableFloat
|
||||
scalar JobMetricScope
|
||||
|
@ -19,36 +19,35 @@ import (
|
||||
sq "github.com/Masterminds/squirrel"
|
||||
)
|
||||
|
||||
func (r *acceleratorResolver) ID(ctx context.Context, obj *schema.Accelerator) (string, error) {
|
||||
panic(fmt.Errorf("not implemented"))
|
||||
}
|
||||
|
||||
func (r *jobResolver) Tags(ctx context.Context, obj *model.Job) ([]*model.JobTag, error) {
|
||||
func (r *jobResolver) Tags(ctx context.Context, obj *schema.Job) ([]*schema.Tag, error) {
|
||||
query := sq.
|
||||
Select("tag.id", "tag.tag_type", "tag.tag_name").
|
||||
From("tag").
|
||||
Join("jobtag ON jobtag.tag_id = tag.id").
|
||||
Where("jobtag.job_id = ?", obj.ID)
|
||||
|
||||
rows, err := query.RunWith(r.DB).Query()
|
||||
sql, args, err := query.ToSql()
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
defer rows.Close()
|
||||
|
||||
tags := make([]*model.JobTag, 0)
|
||||
for rows.Next() {
|
||||
var tag model.JobTag
|
||||
if err := rows.Scan(&tag.ID, &tag.TagType, &tag.TagName); err != nil {
|
||||
tags := make([]*schema.Tag, 0)
|
||||
if err := r.DB.Select(&tags, sql, args...); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
tags = append(tags, &tag)
|
||||
}
|
||||
|
||||
return tags, nil
|
||||
}
|
||||
|
||||
func (r *mutationResolver) CreateTag(ctx context.Context, typeArg string, name string) (*model.JobTag, error) {
|
||||
func (r *jobResolver) Resources(ctx context.Context, obj *schema.Job) ([]*model.JobResource, error) {
|
||||
panic(fmt.Errorf("not implemented"))
|
||||
}
|
||||
|
||||
func (r *jobMetricResolver) StatisticsSeries(ctx context.Context, obj *schema.JobMetric) ([]*schema.StatsSeries, error) {
|
||||
panic(fmt.Errorf("not implemented"))
|
||||
}
|
||||
|
||||
func (r *mutationResolver) CreateTag(ctx context.Context, typeArg string, name string) (*schema.Tag, error) {
|
||||
res, err := r.DB.Exec("INSERT INTO tag (tag_type, tag_name) VALUES ($1, $2)", typeArg, name)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
@ -59,7 +58,7 @@ func (r *mutationResolver) CreateTag(ctx context.Context, typeArg string, name s
|
||||
return nil, err
|
||||
}
|
||||
|
||||
return &model.JobTag{ID: strconv.FormatInt(id, 10), TagType: typeArg, TagName: name}, nil
|
||||
return &schema.Tag{ID: id, Type: typeArg, Name: name}, nil
|
||||
}
|
||||
|
||||
func (r *mutationResolver) DeleteTag(ctx context.Context, id string) (string, error) {
|
||||
@ -67,7 +66,7 @@ func (r *mutationResolver) DeleteTag(ctx context.Context, id string) (string, er
|
||||
panic(fmt.Errorf("not implemented"))
|
||||
}
|
||||
|
||||
func (r *mutationResolver) AddTagsToJob(ctx context.Context, job string, tagIds []string) ([]*model.JobTag, error) {
|
||||
func (r *mutationResolver) AddTagsToJob(ctx context.Context, job string, tagIds []string) ([]*schema.Tag, error) {
|
||||
jid, err := strconv.Atoi(job)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
@ -84,7 +83,9 @@ func (r *mutationResolver) AddTagsToJob(ctx context.Context, job string, tagIds
|
||||
}
|
||||
}
|
||||
|
||||
tags, err := r.Job().Tags(ctx, &model.Job{ID: job})
|
||||
dummyJob := schema.Job{}
|
||||
dummyJob.ID = int64(jid)
|
||||
tags, err := r.Job().Tags(ctx, &dummyJob)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
@ -97,7 +98,7 @@ func (r *mutationResolver) AddTagsToJob(ctx context.Context, job string, tagIds
|
||||
return tags, metricdata.UpdateTags(jobObj, tags)
|
||||
}
|
||||
|
||||
func (r *mutationResolver) RemoveTagsFromJob(ctx context.Context, job string, tagIds []string) ([]*model.JobTag, error) {
|
||||
func (r *mutationResolver) RemoveTagsFromJob(ctx context.Context, job string, tagIds []string) ([]*schema.Tag, error) {
|
||||
jid, err := strconv.Atoi(job)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
@ -114,7 +115,9 @@ func (r *mutationResolver) RemoveTagsFromJob(ctx context.Context, job string, ta
|
||||
}
|
||||
}
|
||||
|
||||
tags, err := r.Job().Tags(ctx, &model.Job{ID: job})
|
||||
dummyJob := schema.Job{}
|
||||
dummyJob.ID = int64(jid)
|
||||
tags, err := r.Job().Tags(ctx, &dummyJob)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
@ -139,29 +142,28 @@ func (r *queryResolver) Clusters(ctx context.Context) ([]*model.Cluster, error)
|
||||
return config.Clusters, nil
|
||||
}
|
||||
|
||||
func (r *queryResolver) Tags(ctx context.Context) ([]*model.JobTag, error) {
|
||||
rows, err := sq.Select("id", "tag_type", "tag_name").From("tag").RunWith(r.DB).Query()
|
||||
func (r *queryResolver) Tags(ctx context.Context) ([]*schema.Tag, error) {
|
||||
sql, args, err := sq.Select("id", "tag_type", "tag_name").From("tag").ToSql()
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
defer rows.Close()
|
||||
|
||||
tags := make([]*model.JobTag, 0)
|
||||
for rows.Next() {
|
||||
var tag model.JobTag
|
||||
if err := rows.Scan(&tag.ID, &tag.TagType, &tag.TagName); err != nil {
|
||||
tags := make([]*schema.Tag, 0)
|
||||
if err := r.DB.Select(&tags, sql, args...); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
tags = append(tags, &tag)
|
||||
}
|
||||
|
||||
return tags, nil
|
||||
}
|
||||
|
||||
func (r *queryResolver) Job(ctx context.Context, id string) (*model.Job, error) {
|
||||
query := sq.Select(JobTableCols...).From("job").Where("job.id = ?", id)
|
||||
func (r *queryResolver) Job(ctx context.Context, id string) (*schema.Job, error) {
|
||||
query := sq.Select(schema.JobColumns...).From("job").Where("job.id = ?", id)
|
||||
query = securityCheck(ctx, query)
|
||||
return ScanJob(query.RunWith(r.DB).QueryRow())
|
||||
sql, args, err := query.ToSql()
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
return schema.ScanJob(r.DB.QueryRowx(sql, args...))
|
||||
}
|
||||
|
||||
func (r *queryResolver) JobMetrics(ctx context.Context, id string, metrics []string) ([]*model.JobMetricWithName, error) {
|
||||
@ -179,7 +181,11 @@ func (r *queryResolver) JobMetrics(ctx context.Context, id string, metrics []str
|
||||
for name, md := range data {
|
||||
res = append(res, &model.JobMetricWithName{
|
||||
Name: name,
|
||||
Metric: md,
|
||||
Node: md["node"],
|
||||
Socket: md["socket"],
|
||||
MemoryDomain: md["memoryDomain"],
|
||||
Core: md["core"],
|
||||
Hwthread: md["hwthread"],
|
||||
})
|
||||
}
|
||||
|
||||
@ -237,19 +243,19 @@ func (r *queryResolver) NodeMetrics(ctx context.Context, cluster string, nodes [
|
||||
return res, nil
|
||||
}
|
||||
|
||||
// Accelerator returns generated.AcceleratorResolver implementation.
|
||||
func (r *Resolver) Accelerator() generated.AcceleratorResolver { return &acceleratorResolver{r} }
|
||||
|
||||
// Job returns generated.JobResolver implementation.
|
||||
func (r *Resolver) Job() generated.JobResolver { return &jobResolver{r} }
|
||||
|
||||
// JobMetric returns generated.JobMetricResolver implementation.
|
||||
func (r *Resolver) JobMetric() generated.JobMetricResolver { return &jobMetricResolver{r} }
|
||||
|
||||
// Mutation returns generated.MutationResolver implementation.
|
||||
func (r *Resolver) Mutation() generated.MutationResolver { return &mutationResolver{r} }
|
||||
|
||||
// Query returns generated.QueryResolver implementation.
|
||||
func (r *Resolver) Query() generated.QueryResolver { return &queryResolver{r} }
|
||||
|
||||
type acceleratorResolver struct{ *Resolver }
|
||||
type jobResolver struct{ *Resolver }
|
||||
type jobMetricResolver struct{ *Resolver }
|
||||
type mutationResolver struct{ *Resolver }
|
||||
type queryResolver struct{ *Resolver }
|
||||
|
@ -3,6 +3,7 @@ package graph
|
||||
import (
|
||||
"context"
|
||||
"database/sql"
|
||||
"errors"
|
||||
"fmt"
|
||||
"math"
|
||||
|
||||
@ -16,9 +17,9 @@ import (
|
||||
|
||||
// GraphQL validation should make sure that no unkown values can be specified.
|
||||
var groupBy2column = map[model.Aggregate]string{
|
||||
model.AggregateUser: "job.user_id",
|
||||
model.AggregateProject: "job.project_id",
|
||||
model.AggregateCluster: "job.cluster_id",
|
||||
model.AggregateUser: "job.user",
|
||||
model.AggregateProject: "job.project",
|
||||
model.AggregateCluster: "job.cluster",
|
||||
}
|
||||
|
||||
// Helper function for the jobsStatistics GraphQL query placed here so that schema.resolvers.go is not too full.
|
||||
@ -28,7 +29,8 @@ func (r *queryResolver) jobsStatistics(ctx context.Context, filter []*model.JobF
|
||||
|
||||
// `socketsPerNode` and `coresPerSocket` can differ from cluster to cluster, so we need to explicitly loop over those.
|
||||
for _, cluster := range config.Clusters {
|
||||
corehoursCol := fmt.Sprintf("SUM(job.duration * job.num_nodes * %d * %d) / 3600", cluster.SocketsPerNode, cluster.CoresPerSocket)
|
||||
for _, partition := range cluster.Partitions {
|
||||
corehoursCol := fmt.Sprintf("SUM(job.duration * job.num_nodes * %d * %d) / 3600", partition.SocketsPerNode, partition.CoresPerSocket)
|
||||
var query sq.SelectBuilder
|
||||
if groupBy == nil {
|
||||
query = sq.Select(
|
||||
@ -36,7 +38,7 @@ func (r *queryResolver) jobsStatistics(ctx context.Context, filter []*model.JobF
|
||||
"COUNT(job.id)",
|
||||
"SUM(job.duration) / 3600",
|
||||
corehoursCol,
|
||||
).From("job").Where("job.cluster_id = ?", cluster.ClusterID)
|
||||
).From("job")
|
||||
} else {
|
||||
col := groupBy2column[*groupBy]
|
||||
query = sq.Select(
|
||||
@ -44,9 +46,13 @@ func (r *queryResolver) jobsStatistics(ctx context.Context, filter []*model.JobF
|
||||
"COUNT(job.id)",
|
||||
"SUM(job.duration) / 3600",
|
||||
corehoursCol,
|
||||
).From("job").Where("job.cluster_id = ?", cluster.ClusterID).GroupBy(col)
|
||||
).From("job").GroupBy(col)
|
||||
}
|
||||
|
||||
query = query.
|
||||
Where("job.cluster = ?", cluster.Name).
|
||||
Where("job.partition = ?", partition.Name)
|
||||
|
||||
query = securityCheck(ctx, query)
|
||||
for _, f := range filter {
|
||||
query = buildWhereClause(f, query)
|
||||
@ -80,6 +86,7 @@ func (r *queryResolver) jobsStatistics(ctx context.Context, filter []*model.JobF
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if groupBy == nil {
|
||||
query := sq.Select("COUNT(job.id)").From("job").Where("job.duration < 120")
|
||||
@ -204,9 +211,16 @@ func (r *Resolver) rooflineHeatmap(ctx context.Context, filter []*model.JobFilte
|
||||
return nil, err
|
||||
}
|
||||
|
||||
flops, membw := jobdata["flops_any"], jobdata["mem_bw"]
|
||||
if flops == nil && membw == nil {
|
||||
return nil, fmt.Errorf("'flops_any' or 'mem_bw' missing for job %s", job.ID)
|
||||
flops_, membw_ := jobdata["flops_any"], jobdata["mem_bw"]
|
||||
if flops_ == nil && membw_ == nil {
|
||||
return nil, fmt.Errorf("'flops_any' or 'mem_bw' missing for job %d", job.ID)
|
||||
}
|
||||
|
||||
flops, ok1 := flops_["node"]
|
||||
membw, ok2 := membw_["node"]
|
||||
if !ok1 || !ok2 {
|
||||
// TODO/FIXME:
|
||||
return nil, errors.New("todo: rooflineHeatmap() query not implemented for where flops_any or mem_bw not available at 'node' level")
|
||||
}
|
||||
|
||||
for n := 0; n < len(flops.Series); n++ {
|
||||
|
64
init-db.go
64
init-db.go
@ -2,7 +2,6 @@ package main
|
||||
|
||||
import (
|
||||
"bufio"
|
||||
"database/sql"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"log"
|
||||
@ -23,7 +22,7 @@ const JOBS_DB_SCHEMA string = `
|
||||
id INTEGER PRIMARY KEY AUTOINCREMENT, -- Not needed in sqlite
|
||||
job_id BIGINT NOT NULL,
|
||||
cluster VARCHAR(255) NOT NULL,
|
||||
start_time BITINT NOT NULL,
|
||||
start_time TIMESTAMP NOT NULL,
|
||||
|
||||
user VARCHAR(255) NOT NULL,
|
||||
project VARCHAR(255) NOT NULL,
|
||||
@ -80,25 +79,20 @@ func initDB(db *sqlx.DB, archive string) error {
|
||||
return err
|
||||
}
|
||||
|
||||
insertstmt, err := db.Prepare(`INSERT INTO job (
|
||||
job_id, cluster, start_time,
|
||||
user, project, partition, array_job_id, duration, job_state, meta_data, resources,
|
||||
num_nodes, num_hwthreads, num_acc, smt, exclusive, monitoring_status,
|
||||
flops_any_avg, mem_bw_avg
|
||||
) VALUES (
|
||||
?, ?, ?,
|
||||
?, ?, ?, ?, ?, ?, ?, ?,
|
||||
?, ?, ?, ?, ?, ?,
|
||||
?, ?
|
||||
);`)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
tx, err := db.Begin()
|
||||
tx, err := db.Beginx()
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
stmt, err := tx.PrepareNamed(schema.JobInsertStmt)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
i := 0
|
||||
tags := make(map[string]int64)
|
||||
handleDirectory := func(filename string) error {
|
||||
@ -110,16 +104,16 @@ func initDB(db *sqlx.DB, archive string) error {
|
||||
}
|
||||
}
|
||||
|
||||
tx, err = db.Begin()
|
||||
tx, err = db.Beginx()
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
insertstmt = tx.Stmt(insertstmt)
|
||||
stmt = tx.NamedStmt(stmt)
|
||||
fmt.Printf("%d jobs inserted...\r", i)
|
||||
}
|
||||
|
||||
err := loadJob(tx, insertstmt, tags, filename)
|
||||
err := loadJob(tx, stmt, tags, filename)
|
||||
if err == nil {
|
||||
i += 1
|
||||
}
|
||||
@ -151,14 +145,14 @@ func initDB(db *sqlx.DB, archive string) error {
|
||||
return err
|
||||
}
|
||||
|
||||
for _, startTiemDir := range startTimeDirs {
|
||||
if startTiemDir.Type().IsRegular() && startTiemDir.Name() == "meta.json" {
|
||||
for _, startTimeDir := range startTimeDirs {
|
||||
if startTimeDir.Type().IsRegular() && startTimeDir.Name() == "meta.json" {
|
||||
if err := handleDirectory(dirpath); err != nil {
|
||||
log.Printf("in %s: %s\n", dirpath, err.Error())
|
||||
}
|
||||
} else if startTiemDir.IsDir() {
|
||||
if err := handleDirectory(filepath.Join(dirpath, startTiemDir.Name())); err != nil {
|
||||
log.Printf("in %s: %s\n", filepath.Join(dirpath, startTiemDir.Name()), err.Error())
|
||||
} else if startTimeDir.IsDir() {
|
||||
if err := handleDirectory(filepath.Join(dirpath, startTimeDir.Name())); err != nil {
|
||||
log.Printf("in %s: %s\n", filepath.Join(dirpath, startTimeDir.Name()), err.Error())
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -184,34 +178,28 @@ func initDB(db *sqlx.DB, archive string) error {
|
||||
|
||||
// Read the `meta.json` file at `path` and insert it to the database using the prepared
|
||||
// insert statement `stmt`. `tags` maps all existing tags to their database ID.
|
||||
func loadJob(tx *sql.Tx, stmt *sql.Stmt, tags map[string]int64, path string) error {
|
||||
func loadJob(tx *sqlx.Tx, stmt *sqlx.NamedStmt, tags map[string]int64, path string) error {
|
||||
f, err := os.Open(filepath.Join(path, "meta.json"))
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
defer f.Close()
|
||||
|
||||
var job schema.JobMeta = schema.JobMeta{
|
||||
Exclusive: 1,
|
||||
}
|
||||
var job schema.JobMeta = schema.JobMeta{BaseJob: schema.JobDefaults}
|
||||
if err := json.NewDecoder(bufio.NewReader(f)).Decode(&job); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
// TODO: Other metrics...
|
||||
flopsAnyAvg := loadJobStat(&job, "flops_any")
|
||||
memBwAvg := loadJobStat(&job, "mem_bw")
|
||||
job.FlopsAnyAvg = loadJobStat(&job, "flops_any")
|
||||
job.MemBwAvg = loadJobStat(&job, "mem_bw")
|
||||
|
||||
resources, err := json.Marshal(job.Resources)
|
||||
job.RawResources, err = json.Marshal(job.Resources)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
res, err := stmt.Exec(
|
||||
job.JobId, job.Cluster, job.StartTime,
|
||||
job.User, job.Project, job.Partition, job.ArrayJobId, job.Duration, job.JobState, job.MetaData, string(resources),
|
||||
job.NumNodes, job.NumHWThreads, job.NumAcc, job.SMT, job.Exclusive, job.MonitoringStatus,
|
||||
flopsAnyAvg, memBwAvg)
|
||||
res, err := stmt.Exec(job)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
@ -244,12 +232,10 @@ func loadJob(tx *sql.Tx, stmt *sql.Stmt, tags map[string]int64, path string) err
|
||||
return nil
|
||||
}
|
||||
|
||||
func loadJobStat(job *schema.JobMeta, metric string) sql.NullFloat64 {
|
||||
val := sql.NullFloat64{Valid: false}
|
||||
func loadJobStat(job *schema.JobMeta, metric string) float64 {
|
||||
if stats, ok := job.Statistics[metric]; ok {
|
||||
val.Valid = true
|
||||
val.Float64 = stats.Avg
|
||||
return stats.Avg
|
||||
}
|
||||
|
||||
return val
|
||||
return 0.0
|
||||
}
|
||||
|
@ -13,13 +13,12 @@ import (
|
||||
"strconv"
|
||||
|
||||
"github.com/ClusterCockpit/cc-jobarchive/config"
|
||||
"github.com/ClusterCockpit/cc-jobarchive/graph/model"
|
||||
"github.com/ClusterCockpit/cc-jobarchive/schema"
|
||||
)
|
||||
|
||||
// For a given job, return the path of the `data.json`/`meta.json` file.
|
||||
// TODO: Implement Issue ClusterCockpit/ClusterCockpit#97
|
||||
func getPath(job *model.Job, file string, checkLegacy bool) (string, error) {
|
||||
func getPath(job *schema.Job, file string, checkLegacy bool) (string, error) {
|
||||
lvl1, lvl2 := fmt.Sprintf("%d", job.JobID/1000), fmt.Sprintf("%03d", job.JobID%1000)
|
||||
if !checkLegacy {
|
||||
return filepath.Join(JobArchivePath, job.Cluster, lvl1, lvl2, strconv.FormatInt(job.StartTime.Unix(), 10), file), nil
|
||||
@ -34,7 +33,7 @@ func getPath(job *model.Job, file string, checkLegacy bool) (string, error) {
|
||||
}
|
||||
|
||||
// Assuming job is completed/archived, return the jobs metric data.
|
||||
func loadFromArchive(job *model.Job) (schema.JobData, error) {
|
||||
func loadFromArchive(job *schema.Job) (schema.JobData, error) {
|
||||
filename, err := getPath(job, "data.json", true)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
@ -56,8 +55,8 @@ func loadFromArchive(job *model.Job) (schema.JobData, error) {
|
||||
|
||||
// If the job is archived, find its `meta.json` file and override the tags list
|
||||
// in that JSON file. If the job is not archived, nothing is done.
|
||||
func UpdateTags(job *model.Job, tags []*model.JobTag) error {
|
||||
if job.State == model.JobStateRunning {
|
||||
func UpdateTags(job *schema.Job, tags []*schema.Tag) error {
|
||||
if job.State == schema.JobStateRunning {
|
||||
return nil
|
||||
}
|
||||
|
||||
@ -74,23 +73,19 @@ func UpdateTags(job *model.Job, tags []*model.JobTag) error {
|
||||
return err
|
||||
}
|
||||
|
||||
var metaFile schema.JobMeta
|
||||
var metaFile schema.JobMeta = schema.JobMeta{
|
||||
BaseJob: schema.JobDefaults,
|
||||
}
|
||||
if err := json.NewDecoder(f).Decode(&metaFile); err != nil {
|
||||
return err
|
||||
}
|
||||
f.Close()
|
||||
|
||||
metaFile.Tags = make([]struct {
|
||||
Name string "json:\"Name\""
|
||||
Type string "json:\"Type\""
|
||||
}, 0)
|
||||
metaFile.Tags = make([]*schema.Tag, 0)
|
||||
for _, tag := range tags {
|
||||
metaFile.Tags = append(metaFile.Tags, struct {
|
||||
Name string "json:\"Name\""
|
||||
Type string "json:\"Type\""
|
||||
}{
|
||||
Name: tag.TagName,
|
||||
Type: tag.TagType,
|
||||
metaFile.Tags = append(metaFile.Tags, &schema.Tag{
|
||||
Name: tag.Name,
|
||||
Type: tag.Type,
|
||||
})
|
||||
}
|
||||
|
||||
@ -103,7 +98,7 @@ func UpdateTags(job *model.Job, tags []*model.JobTag) error {
|
||||
}
|
||||
|
||||
// Helper to metricdata.LoadAverages().
|
||||
func loadAveragesFromArchive(job *model.Job, metrics []string, data [][]schema.Float) error {
|
||||
func loadAveragesFromArchive(job *schema.Job, metrics []string, data [][]schema.Float) error {
|
||||
filename, err := getPath(job, "meta.json", true)
|
||||
if err != nil {
|
||||
return err
|
||||
@ -131,8 +126,8 @@ func loadAveragesFromArchive(job *model.Job, metrics []string, data [][]schema.F
|
||||
}
|
||||
|
||||
// Writes a running job to the job-archive
|
||||
func ArchiveJob(job *model.Job, ctx context.Context) (*schema.JobMeta, error) {
|
||||
if job.State != model.JobStateRunning {
|
||||
func ArchiveJob(job *schema.Job, ctx context.Context) (*schema.JobMeta, error) {
|
||||
if job.State != schema.JobStateRunning {
|
||||
return nil, errors.New("cannot archive job that is not running")
|
||||
}
|
||||
|
||||
@ -146,51 +141,27 @@ func ArchiveJob(job *model.Job, ctx context.Context) (*schema.JobMeta, error) {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
tags := []struct {
|
||||
Name string `json:"Name"`
|
||||
Type string `json:"Type"`
|
||||
}{}
|
||||
for _, tag := range job.Tags {
|
||||
tags = append(tags, struct {
|
||||
Name string `json:"Name"`
|
||||
Type string `json:"Type"`
|
||||
}{
|
||||
Name: tag.TagName,
|
||||
Type: tag.TagType,
|
||||
})
|
||||
}
|
||||
|
||||
metaData := &schema.JobMeta{
|
||||
JobId: int64(job.JobID),
|
||||
User: job.User,
|
||||
Project: job.Project,
|
||||
Cluster: job.Cluster,
|
||||
NumNodes: job.NumNodes,
|
||||
NumHWThreads: job.NumHWThreads,
|
||||
NumAcc: job.NumAcc,
|
||||
Exclusive: int8(job.Exclusive),
|
||||
MonitoringStatus: int8(job.MonitoringStatus),
|
||||
SMT: int8(job.Smt),
|
||||
Partition: job.Partition,
|
||||
ArrayJobId: job.ArrayJobID,
|
||||
JobState: string(job.State),
|
||||
jobMeta := &schema.JobMeta{
|
||||
BaseJob: job.BaseJob,
|
||||
StartTime: job.StartTime.Unix(),
|
||||
Duration: int64(job.Duration),
|
||||
Resources: job.Resources,
|
||||
MetaData: "", // TODO/FIXME: Handle `meta_data`!
|
||||
Tags: tags,
|
||||
Statistics: make(map[string]*schema.JobMetaStatistics),
|
||||
Statistics: make(map[string]schema.JobStatistics),
|
||||
}
|
||||
|
||||
for metric, data := range jobData {
|
||||
avg, min, max := 0.0, math.MaxFloat32, -math.MaxFloat32
|
||||
for _, nodedata := range data.Series {
|
||||
avg += nodedata.Statistics.Avg
|
||||
min = math.Min(min, nodedata.Statistics.Min)
|
||||
max = math.Max(max, nodedata.Statistics.Max)
|
||||
nodeData, ok := data["node"]
|
||||
if !ok {
|
||||
// TODO/FIXME: Calc average for non-node metrics as well!
|
||||
continue
|
||||
}
|
||||
|
||||
metaData.Statistics[metric] = &schema.JobMetaStatistics{
|
||||
for _, series := range nodeData.Series {
|
||||
avg += series.Statistics.Avg
|
||||
min = math.Min(min, series.Statistics.Min)
|
||||
max = math.Max(max, series.Statistics.Max)
|
||||
}
|
||||
|
||||
jobMeta.Statistics[metric] = schema.JobStatistics{
|
||||
Unit: config.GetMetricConfig(job.Cluster, metric).Unit,
|
||||
Avg: avg / float64(job.NumNodes),
|
||||
Min: min,
|
||||
@ -202,7 +173,7 @@ func ArchiveJob(job *model.Job, ctx context.Context) (*schema.JobMeta, error) {
|
||||
// only return the JobMeta structure as the
|
||||
// statistics in there are needed.
|
||||
if !useArchive {
|
||||
return metaData, nil
|
||||
return jobMeta, nil
|
||||
}
|
||||
|
||||
dirPath, err := getPath(job, "", false)
|
||||
@ -220,7 +191,7 @@ func ArchiveJob(job *model.Job, ctx context.Context) (*schema.JobMeta, error) {
|
||||
}
|
||||
defer f.Close()
|
||||
writer := bufio.NewWriter(f)
|
||||
if err := json.NewEncoder(writer).Encode(metaData); err != nil {
|
||||
if err := json.NewEncoder(writer).Encode(jobMeta); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
if err := writer.Flush(); err != nil {
|
||||
@ -239,5 +210,5 @@ func ArchiveJob(job *model.Job, ctx context.Context) (*schema.JobMeta, error) {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
return metaData, f.Close()
|
||||
return jobMeta, f.Close()
|
||||
}
|
||||
|
@ -12,7 +12,6 @@ import (
|
||||
"time"
|
||||
|
||||
"github.com/ClusterCockpit/cc-jobarchive/config"
|
||||
"github.com/ClusterCockpit/cc-jobarchive/graph/model"
|
||||
"github.com/ClusterCockpit/cc-jobarchive/schema"
|
||||
)
|
||||
|
||||
@ -57,7 +56,7 @@ func (ccms *CCMetricStore) Init(url string) error {
|
||||
return nil
|
||||
}
|
||||
|
||||
func (ccms *CCMetricStore) doRequest(job *model.Job, suffix string, metrics []string, ctx context.Context) (*http.Response, error) {
|
||||
func (ccms *CCMetricStore) doRequest(job *schema.Job, suffix string, metrics []string, ctx context.Context) (*http.Response, error) {
|
||||
from, to := job.StartTime.Unix(), job.StartTime.Add(time.Duration(job.Duration)*time.Second).Unix()
|
||||
reqBody := ApiRequestBody{}
|
||||
reqBody.Metrics = metrics
|
||||
@ -85,7 +84,7 @@ func (ccms *CCMetricStore) doRequest(job *model.Job, suffix string, metrics []st
|
||||
return ccms.client.Do(req)
|
||||
}
|
||||
|
||||
func (ccms *CCMetricStore) LoadData(job *model.Job, metrics []string, ctx context.Context) (schema.JobData, error) {
|
||||
func (ccms *CCMetricStore) LoadData(job *schema.Job, metrics []string, ctx context.Context) (schema.JobData, error) {
|
||||
res, err := ccms.doRequest(job, "timeseries?with-stats=true", metrics, ctx)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
@ -103,8 +102,9 @@ func (ccms *CCMetricStore) LoadData(job *model.Job, metrics []string, ctx contex
|
||||
Scope: "node", // TODO: FIXME: Whatever...
|
||||
Unit: mc.Unit,
|
||||
Timestep: mc.Timestep,
|
||||
Series: make([]*schema.MetricSeries, 0, len(job.Resources)),
|
||||
Series: make([]schema.Series, 0, len(job.Resources)),
|
||||
}
|
||||
|
||||
for i, node := range job.Resources {
|
||||
if node.Accelerators != nil || node.HWThreads != nil {
|
||||
// TODO/FIXME:
|
||||
@ -120,7 +120,7 @@ func (ccms *CCMetricStore) LoadData(job *model.Job, metrics []string, ctx contex
|
||||
return nil, fmt.Errorf("no data for node '%s' and metric '%s'", node.Hostname, metric)
|
||||
}
|
||||
|
||||
metricData.Series = append(metricData.Series, &schema.MetricSeries{
|
||||
metricData.Series = append(metricData.Series, schema.Series{
|
||||
Hostname: node.Hostname,
|
||||
Data: data.Data,
|
||||
Statistics: &schema.MetricStatistics{
|
||||
@ -130,13 +130,13 @@ func (ccms *CCMetricStore) LoadData(job *model.Job, metrics []string, ctx contex
|
||||
},
|
||||
})
|
||||
}
|
||||
jobData[metric] = metricData
|
||||
jobData[metric] = map[string]*schema.JobMetric{"node": metricData}
|
||||
}
|
||||
|
||||
return jobData, nil
|
||||
}
|
||||
|
||||
func (ccms *CCMetricStore) LoadStats(job *model.Job, metrics []string, ctx context.Context) (map[string]map[string]schema.MetricStatistics, error) {
|
||||
func (ccms *CCMetricStore) LoadStats(job *schema.Job, metrics []string, ctx context.Context) (map[string]map[string]schema.MetricStatistics, error) {
|
||||
res, err := ccms.doRequest(job, "stats", metrics, ctx)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
|
@ -1,5 +1,6 @@
|
||||
package metricdata
|
||||
|
||||
/*
|
||||
import (
|
||||
"context"
|
||||
"errors"
|
||||
@ -175,3 +176,4 @@ func (idb *InfluxDBv2DataRepository) LoadStats(job *model.Job, metrics []string,
|
||||
func (idb *InfluxDBv2DataRepository) LoadNodeData(clusterId string, metrics, nodes []string, from, to int64, ctx context.Context) (map[string]map[string][]schema.Float, error) {
|
||||
return nil, nil
|
||||
}
|
||||
*/
|
||||
|
@ -5,7 +5,6 @@ import (
|
||||
"fmt"
|
||||
|
||||
"github.com/ClusterCockpit/cc-jobarchive/config"
|
||||
"github.com/ClusterCockpit/cc-jobarchive/graph/model"
|
||||
"github.com/ClusterCockpit/cc-jobarchive/schema"
|
||||
)
|
||||
|
||||
@ -15,10 +14,10 @@ type MetricDataRepository interface {
|
||||
Init(url string) error
|
||||
|
||||
// Return the JobData for the given job, only with the requested metrics.
|
||||
LoadData(job *model.Job, metrics []string, ctx context.Context) (schema.JobData, error)
|
||||
LoadData(job *schema.Job, metrics []string, ctx context.Context) (schema.JobData, error)
|
||||
|
||||
// Return a map of metrics to a map of nodes to the metric statistics of the job.
|
||||
LoadStats(job *model.Job, metrics []string, ctx context.Context) (map[string]map[string]schema.MetricStatistics, error)
|
||||
LoadStats(job *schema.Job, metrics []string, ctx context.Context) (map[string]map[string]schema.MetricStatistics, error)
|
||||
|
||||
// Return a map of nodes to a map of metrics to the data for the requested time.
|
||||
LoadNodeData(clusterId string, metrics, nodes []string, from, to int64, ctx context.Context) (map[string]map[string][]schema.Float, error)
|
||||
@ -41,15 +40,15 @@ func Init(jobArchivePath string, disableArchive bool) error {
|
||||
if err := ccms.Init(cluster.MetricDataRepository.Url); err != nil {
|
||||
return err
|
||||
}
|
||||
metricDataRepos[cluster.ClusterID] = ccms
|
||||
case "influxdb-v2":
|
||||
idb := &InfluxDBv2DataRepository{}
|
||||
if err := idb.Init(cluster.MetricDataRepository.Url); err != nil {
|
||||
return err
|
||||
}
|
||||
metricDataRepos[cluster.ClusterID] = idb
|
||||
metricDataRepos[cluster.Name] = ccms
|
||||
// case "influxdb-v2":
|
||||
// idb := &InfluxDBv2DataRepository{}
|
||||
// if err := idb.Init(cluster.MetricDataRepository.Url); err != nil {
|
||||
// return err
|
||||
// }
|
||||
// metricDataRepos[cluster.Name] = idb
|
||||
default:
|
||||
return fmt.Errorf("unkown metric data repository '%s' for cluster '%s'", cluster.MetricDataRepository.Kind, cluster.ClusterID)
|
||||
return fmt.Errorf("unkown metric data repository '%s' for cluster '%s'", cluster.MetricDataRepository.Kind, cluster.Name)
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -57,8 +56,8 @@ func Init(jobArchivePath string, disableArchive bool) error {
|
||||
}
|
||||
|
||||
// Fetches the metric data for a job.
|
||||
func LoadData(job *model.Job, metrics []string, ctx context.Context) (schema.JobData, error) {
|
||||
if job.State == model.JobStateRunning || !useArchive {
|
||||
func LoadData(job *schema.Job, metrics []string, ctx context.Context) (schema.JobData, error) {
|
||||
if job.State == schema.JobStateRunning || !useArchive {
|
||||
repo, ok := metricDataRepos[job.Cluster]
|
||||
if !ok {
|
||||
return nil, fmt.Errorf("no metric data repository configured for '%s'", job.Cluster)
|
||||
@ -85,8 +84,8 @@ func LoadData(job *model.Job, metrics []string, ctx context.Context) (schema.Job
|
||||
}
|
||||
|
||||
// Used for the jobsFootprint GraphQL-Query. TODO: Rename/Generalize.
|
||||
func LoadAverages(job *model.Job, metrics []string, data [][]schema.Float, ctx context.Context) error {
|
||||
if job.State != model.JobStateRunning && useArchive {
|
||||
func LoadAverages(job *schema.Job, metrics []string, data [][]schema.Float, ctx context.Context) error {
|
||||
if job.State != schema.JobStateRunning && useArchive {
|
||||
return loadAveragesFromArchive(job, metrics, data)
|
||||
}
|
||||
|
||||
|
153
schema/job.go
Normal file
153
schema/job.go
Normal file
@ -0,0 +1,153 @@
|
||||
package schema
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"errors"
|
||||
"fmt"
|
||||
"io"
|
||||
"time"
|
||||
)
|
||||
|
||||
type BaseJob struct {
|
||||
ID int64 `json:"id" db:"id"`
|
||||
JobID int64 `json:"jobId" db:"job_id"`
|
||||
User string `json:"user" db:"user"`
|
||||
Project string `json:"project" db:"project"`
|
||||
Cluster string `json:"cluster" db:"cluster"`
|
||||
Partition string `json:"partition" db:"partition"`
|
||||
ArrayJobId int32 `json:"arrayJobId" db:"array_job_id"`
|
||||
NumNodes int32 `json:"numNodes" db:"num_nodes"`
|
||||
NumHWThreads int32 `json:"numHwthreads" db:"num_hwthreads"`
|
||||
NumAcc int32 `json:"numAcc" db:"num_acc"`
|
||||
Exclusive int32 `json:"exclusive" db:"exclusive"`
|
||||
MonitoringStatus int32 `json:"monitoringStatus" db:"monitoring_status"`
|
||||
SMT int32 `json:"smt" db:"smt"`
|
||||
State JobState `json:"jobState" db:"job_state"`
|
||||
Duration int32 `json:"duration" db:"duration"`
|
||||
Tags []*Tag `json:"tags"`
|
||||
RawResources []byte `json:"-" db:"resources"`
|
||||
Resources []Resource `json:"resources"`
|
||||
MetaData interface{} `json:"metaData" db:"meta_data"`
|
||||
|
||||
MemUsedMax float64 `json:"-" db:"mem_used_max"`
|
||||
FlopsAnyAvg float64 `json:"-" db:"flops_any_avg"`
|
||||
MemBwAvg float64 `json:"-" db:"mem_bw_avg"`
|
||||
LoadAvg float64 `json:"-" db:"load_avg"`
|
||||
NetBwAvg float64 `json:"-" db:"net_bw_avg"`
|
||||
NetDataVolTotal float64 `json:"-" db:"net_data_vol_total"`
|
||||
FileBwAvg float64 `json:"-" db:"file_bw_avg"`
|
||||
FileDataVolTotal float64 `json:"-" db:"file_data_vol_total"`
|
||||
}
|
||||
|
||||
type JobMeta struct {
|
||||
BaseJob
|
||||
StartTime int64 `json:"startTime" db:"start_time"`
|
||||
Statistics map[string]JobStatistics `json:"statistics,omitempty"`
|
||||
}
|
||||
|
||||
var JobDefaults BaseJob = BaseJob{
|
||||
Exclusive: 1,
|
||||
MonitoringStatus: 1,
|
||||
MetaData: "",
|
||||
}
|
||||
|
||||
var JobColumns []string = []string{
|
||||
"id", "job_id", "user", "project", "cluster", "partition", "array_job_id", "num_nodes",
|
||||
"num_hwthreads", "num_acc", "exclusive", "monitoring_status", "smt", "job_state",
|
||||
"duration", "resources", "meta_data",
|
||||
}
|
||||
|
||||
const JobInsertStmt string = `INSERT INTO job (
|
||||
job_id, user, project, cluster, partition, array_job_id, num_nodes, num_hwthreads, num_acc,
|
||||
exclusive, monitoring_status, smt, job_state, start_time, duration, resources, meta_data,
|
||||
mem_used_max, flops_any_avg, mem_bw_avg, load_avg, net_bw_avg, net_data_vol_total, file_bw_avg, file_data_vol_total
|
||||
) VALUES (
|
||||
:job_id, :user, :project, :cluster, :partition, :array_job_id, :num_nodes, :num_hwthreads, :num_acc,
|
||||
:exclusive, :monitoring_status, :smt, :job_state, :start_time, :duration, :resources, :meta_data,
|
||||
:mem_used_max, :flops_any_avg, :mem_bw_avg, :load_avg, :net_bw_avg, :net_data_vol_total, :file_bw_avg, :file_data_vol_total
|
||||
);`
|
||||
|
||||
type Job struct {
|
||||
BaseJob
|
||||
StartTime time.Time `json:"startTime" db:"start_time"`
|
||||
}
|
||||
|
||||
type Scannable interface {
|
||||
StructScan(dest interface{}) error
|
||||
}
|
||||
|
||||
// Helper function for scanning jobs with the `jobTableCols` columns selected.
|
||||
func ScanJob(row Scannable) (*Job, error) {
|
||||
job := &Job{BaseJob: JobDefaults}
|
||||
if err := row.StructScan(&job); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
if err := json.Unmarshal(job.RawResources, &job.Resources); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
if job.Duration == 0 && job.State == JobStateRunning {
|
||||
job.Duration = int32(time.Since(job.StartTime).Seconds())
|
||||
}
|
||||
|
||||
return job, nil
|
||||
}
|
||||
|
||||
type JobStatistics struct {
|
||||
Unit string `json:"unit"`
|
||||
Avg float64 `json:"avg"`
|
||||
Min float64 `json:"min"`
|
||||
Max float64 `json:"max"`
|
||||
}
|
||||
|
||||
type Tag struct {
|
||||
ID int64 `json:"id" db:"id"`
|
||||
Type string `json:"type" db:"tag_type"`
|
||||
Name string `json:"name" db:"tag_name"`
|
||||
}
|
||||
|
||||
type Resource struct {
|
||||
Hostname string `json:"hostname"`
|
||||
HWThreads []int `json:"hwthreads,omitempty"`
|
||||
Accelerators []int `json:"accelerators,omitempty"`
|
||||
Configuration string `json:"configuration,omitempty"`
|
||||
}
|
||||
|
||||
type JobState string
|
||||
|
||||
const (
|
||||
JobStateRunning JobState = "running"
|
||||
JobStateCompleted JobState = "completed"
|
||||
JobStateFailed JobState = "failed"
|
||||
JobStateCanceled JobState = "canceled"
|
||||
JobStateStopped JobState = "stopped"
|
||||
JobStateTimeout JobState = "timeout"
|
||||
)
|
||||
|
||||
func (e *JobState) UnmarshalGQL(v interface{}) error {
|
||||
str, ok := v.(string)
|
||||
if !ok {
|
||||
return fmt.Errorf("enums must be strings")
|
||||
}
|
||||
|
||||
*e = JobState(str)
|
||||
if !e.Valid() {
|
||||
return errors.New("invalid job state")
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func (e JobState) MarshalGQL(w io.Writer) {
|
||||
fmt.Fprintf(w, "\"%s\"", e)
|
||||
}
|
||||
|
||||
func (e JobState) Valid() bool {
|
||||
return e == JobStateRunning ||
|
||||
e == JobStateCompleted ||
|
||||
e == JobStateFailed ||
|
||||
e == JobStateCanceled ||
|
||||
e == JobStateStopped ||
|
||||
e == JobStateTimeout
|
||||
}
|
@ -5,14 +5,34 @@ import (
|
||||
"io"
|
||||
)
|
||||
|
||||
// Format of `data.json` files.
|
||||
type JobData map[string]*JobMetric
|
||||
type JobData map[string]map[string]*JobMetric
|
||||
|
||||
type JobMetric struct {
|
||||
Unit string `json:"Unit"`
|
||||
Scope MetricScope `json:"Scope"`
|
||||
Timestep int `json:"Timestep"`
|
||||
Series []*MetricSeries `json:"Series"`
|
||||
Unit string `json:"unit"`
|
||||
Scope MetricScope `json:"scope"`
|
||||
Timestep int `json:"timestep"`
|
||||
Series []Series `json:"series"`
|
||||
StatsSeries *StatsSeries `json:"statisticsSeries,omitempty"`
|
||||
}
|
||||
|
||||
type Series struct {
|
||||
Hostname string `json:"hostname"`
|
||||
Id *int `json:"id,omitempty"`
|
||||
Statistics *MetricStatistics `json:"statistics"`
|
||||
Data []Float `json:"data"`
|
||||
}
|
||||
|
||||
type MetricStatistics struct {
|
||||
Avg float64 `json:"avg"`
|
||||
Min float64 `json:"min"`
|
||||
Max float64 `json:"max"`
|
||||
}
|
||||
|
||||
type StatsSeries struct {
|
||||
Mean []Float `json:"mean,omitempty"`
|
||||
Min []Float `json:"min,omitempty"`
|
||||
Max []Float `json:"max,omitempty"`
|
||||
Percentiles map[int][]Float `json:"percentiles,omitempty"`
|
||||
}
|
||||
|
||||
type MetricScope string
|
||||
@ -39,61 +59,3 @@ func (e *MetricScope) UnmarshalGQL(v interface{}) error {
|
||||
func (e MetricScope) MarshalGQL(w io.Writer) {
|
||||
fmt.Fprintf(w, "\"%s\"", e)
|
||||
}
|
||||
|
||||
type MetricStatistics struct {
|
||||
Avg float64 `json:"Avg"`
|
||||
Min float64 `json:"Min"`
|
||||
Max float64 `json:"Max"`
|
||||
}
|
||||
|
||||
type MetricSeries struct {
|
||||
Hostname string `json:"Hostname"`
|
||||
Id int `json:"Id"`
|
||||
Statistics *MetricStatistics `json:"Statistics"`
|
||||
Data []Float `json:"Data"`
|
||||
}
|
||||
|
||||
type JobMetaStatistics struct {
|
||||
Unit string `json:"Unit"`
|
||||
Avg float64 `json:"Avg"`
|
||||
Min float64 `json:"Min"`
|
||||
Max float64 `json:"Max"`
|
||||
}
|
||||
|
||||
type Accelerator struct {
|
||||
ID int `json:"Id"`
|
||||
Type string `json:"Type"`
|
||||
Model string `json:"Model"`
|
||||
}
|
||||
|
||||
type JobResource struct {
|
||||
Hostname string `json:"Hostname"`
|
||||
HWThreads []int `json:"HWThreads,omitempty"`
|
||||
Accelerators []Accelerator `json:"Accelerators,omitempty"`
|
||||
}
|
||||
|
||||
// Format of `meta.json` files.
|
||||
type JobMeta struct {
|
||||
JobId int64 `json:"JobId"`
|
||||
User string `json:"User"`
|
||||
Project string `json:"Project"`
|
||||
Cluster string `json:"Cluster"`
|
||||
NumNodes int `json:"NumNodes"`
|
||||
NumHWThreads int `json:"NumHWThreads"`
|
||||
NumAcc int `json:"NumAcc"`
|
||||
Exclusive int8 `json:"Exclusive"`
|
||||
MonitoringStatus int8 `json:"MonitoringStatus"`
|
||||
SMT int8 `json:"SMT"`
|
||||
Partition string `json:"Partition"`
|
||||
ArrayJobId int `json:"ArrayJobId"`
|
||||
JobState string `json:"JobState"`
|
||||
StartTime int64 `json:"StartTime"`
|
||||
Duration int64 `json:"Duration"`
|
||||
Resources []*JobResource `json:"Resources"`
|
||||
MetaData string `json:"MetaData"`
|
||||
Tags []struct {
|
||||
Name string `json:"Name"`
|
||||
Type string `json:"Type"`
|
||||
} `json:"Tags"`
|
||||
Statistics map[string]*JobMetaStatistics `json:"Statistics"`
|
||||
}
|
||||
|
@ -176,9 +176,8 @@ func main() {
|
||||
resolver := &graph.Resolver{DB: db}
|
||||
graphQLEndpoint := handler.NewDefaultServer(generated.NewExecutableSchema(generated.Config{Resolvers: resolver}))
|
||||
graphQLPlayground := playground.Handler("GraphQL playground", "/query")
|
||||
restApi := &api.RestApi{
|
||||
api := &api.RestApi{
|
||||
DB: db,
|
||||
Resolver: resolver,
|
||||
AsyncArchiving: programConfig.AsyncArchiving,
|
||||
}
|
||||
|
||||
@ -235,7 +234,7 @@ func main() {
|
||||
})
|
||||
|
||||
monitoringRoutes(secured, resolver)
|
||||
restApi.MountRoutes(secured)
|
||||
api.MountRoutes(secured)
|
||||
|
||||
r.PathPrefix("/").Handler(http.FileServer(http.Dir(programConfig.StaticFiles)))
|
||||
handler := handlers.CORS(
|
||||
|
@ -35,7 +35,7 @@
|
||||
<table class="table">
|
||||
<thead>
|
||||
<tr>
|
||||
<th>Name/ID</th>
|
||||
<th>Name</th>
|
||||
<th>Jobs</th>
|
||||
<th>System View</th>
|
||||
<th>Analysis View</th>
|
||||
@ -44,10 +44,10 @@
|
||||
<tbody>
|
||||
{{range .Infos.clusters}}
|
||||
<tr>
|
||||
<td>{{.ClusterID}}</td>
|
||||
<td><a href="/monitoring/jobs/?cluster={{.ClusterID}}">Jobs</a></td>
|
||||
<td><a href="/monitoring/systems/?cluster={{.ClusterID}}">System View</a></td>
|
||||
<td><a href="/monitoring/analysis/?cluster={{.ClusterID}}">Analysis View</a></td>
|
||||
<td>{{.Name}}</td>
|
||||
<td><a href="/monitoring/jobs/?cluster={{.Name}}">Jobs</a></td>
|
||||
<td><a href="/monitoring/systems/?cluster={{.Name}}">System View</a></td>
|
||||
<td><a href="/monitoring/analysis/?cluster={{.Name}}">Analysis View</a></td>
|
||||
</tr>
|
||||
{{end}}
|
||||
</tbody>
|
||||
|
Loading…
Reference in New Issue
Block a user