Merge pull request #1 from ClusterCockpit/full-backend

Full backend
Lou 2022-01-10 16:19:43 +01:00 committed by GitHub
commit ff24d946fd
42 changed files with 6727 additions and 2138 deletions

.env (new file, 4 lines)

@ -0,0 +1,4 @@
export JWT_PUBLIC_KEY="kzfYrYy+TzpanWZHJ5qSdMj5uKUWgq74BWhQG6copP0="
export JWT_PRIVATE_KEY="dtPC/6dWJFKZK7KZ78CvWuynylOmjBFyMsUWArwmodOTN9itjL5POlqdZkcnmpJ0yPm4pRaCrvgFaFAbpyik/Q=="
export SESSION_KEY="67d829bf61dc5f87a73fd814e2c9f629"
export LDAP_ADMIN_PASSWORD="mashup"

.gitmodules (vendored, 2 lines changed)

@ -1,3 +1,3 @@
[submodule "frontend"]
path = frontend
url = git@github.com:ClusterCockpit/cc-svelte-datatable.git
url = git@github.com:ClusterCockpit/cc-frontend.git


@ -1,7 +1,11 @@
# ClusterCockpit with a Golang backend (Only supports archived jobs)
# ClusterCockpit with a Golang backend
__*DOES NOT WORK WITH CURRENT FRONTEND*__
[![Build](https://github.com/ClusterCockpit/cc-jobarchive/actions/workflows/test.yml/badge.svg)](https://github.com/ClusterCockpit/cc-jobarchive/actions/workflows/test.yml)
Create your job-archive according to [this specification](https://github.com/ClusterCockpit/cc-specifications). At least one cluster with a valid `cluster.json` file is required. Having no jobs in the job-archive at all is fine. You may use the sample job-archive available for download [in cc-docker/develop](https://github.com/ClusterCockpit/cc-docker/tree/develop).
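A minimal job-archive layout looks roughly like this (the cluster name `emmy` is only an example; see the specification linked above for the exact structure):
```sh
job-archive/
└── emmy/              # one directory per cluster, named after the cluster
    ├── cluster.json   # cluster description as defined in cc-specifications
    └── ...            # archived jobs (may be empty)
```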
### Run server
```sh
@ -27,13 +31,23 @@ touch ./var/job.db
# This will first initialize the job.db database by traversing all
# `meta.json` files in the job-archive. After that, an HTTP server will be
# listening on port 8080. The `--init-db` flag is only needed the first time.
./cc-jobarchive --init-db
./cc-jobarchive --init-db --add-user <your-username>:admin:<your-password>
# Show other options:
./cc-jobarchive --help
```
### Configuration
A config file in JSON format can be provided using `--config` to override the defaults. The defaults at the beginning of `server.go` also document the format of the configuration file.
### Update GraphQL schema
This project uses [gqlgen](https://github.com/99designs/gqlgen) for the GraphQL API. The schema can be found in `./graph/schema.graphqls`. After changing it, you need to run `go run github.com/99designs/gqlgen`, which will update `graph/model`. If new resolvers are needed, they will be inserted into `graph/schema.resolvers.go`, where you will need to implement them.
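For example, after editing the schema, the regeneration step is a single command run from the repository root:
```sh
# Re-generate GraphQL models and resolver stubs:
go run github.com/99designs/gqlgen
```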
### TODO
- [ ] Documentation
- [ ] Write more TODOs
- [ ] Caching
- [ ] Generate JWTs based on the provided keys

api/openapi.yaml (new file, 171 lines)

@ -0,0 +1,171 @@
#
# ClusterCockpit's API spec can be exported via:
# docker exec -it cc-php php bin/console api:openapi:export --yaml
#
# This spec is written by hand and hopefully up to date with the API.
#
openapi: 3.0.3
info:
title: 'ClusterCockpit REST API'
description: 'API for batch job control'
version: 0.0.2
servers:
- url: /
description: ''
paths:
'/api/jobs/{id}':
get:
operationId: 'getJob'
summary: 'Get job resource'
parameters:
- name: id
in: path
required: true
schema: { type: integer }
description: 'Database ID (Resource Identifier)'
responses:
200:
description: 'Job resource'
content:
'application/json':
schema:
$ref: '#/components/schemas/Job'
404:
description: 'Resource not found'
'/api/jobs/tag_job/{id}':
post:
operationId: 'tagJob'
summary: 'Add a tag to a job'
parameters:
- name: id
in: path
required: true
schema: { type: integer }
description: 'Job ID'
requestBody:
description: 'Array of tags to add'
required: true
content:
'application/json':
schema:
type: array
items:
$ref: '#/components/schemas/Tag'
responses:
200:
description: 'Job resource'
content:
'application/json':
schema:
$ref: '#/components/schemas/Job'
404:
description: 'Job or tag does not exist'
400:
description: 'Bad request'
'/api/jobs/start_job/':
post:
operationId: 'startJob'
summary: 'Add a newly started job'
requestBody:
required: true
content:
'application/json':
schema:
$ref: '#/components/schemas/Job'
responses:
201:
description: 'Job successfully created'
content:
'application/json':
schema:
type: object
properties:
id:
type: integer
description: 'The database ID assigned to this job'
400:
description: 'Bad request'
422:
description: 'The combination of jobId, clusterId and startTime already exists'
'/api/jobs/stop_job/':
post:
operationId: stopJobViaJobID
summary: 'Mark a job as stopped. Which job to stop is specified by the request body.'
requestBody:
required: true
content:
'application/json':
schema:
type: object
required: [jobId, cluster, startTime, stopTime]
properties:
jobId: { type: integer }
cluster: { type: string }
startTime: { type: integer }
stopTime: { type: integer }
responses:
200:
description: 'Job resource'
content:
'application/json':
schema:
$ref: '#/components/schemas/Job'
400:
description: 'Bad request'
404:
description: 'Resource not found'
'/api/jobs/stop_job/{id}':
post:
operationId: 'stopJobViaDBID'
summary: 'Mark a job as stopped.'
parameters:
- name: id
in: path
required: true
schema: { type: integer }
description: 'Database ID (Resource Identifier)'
requestBody:
required: true
content:
'application/json':
schema:
type: object
required: [stopTime]
properties:
stopTime: { type: integer }
responses:
200:
description: 'Job resource'
content:
'application/json':
schema:
$ref: '#/components/schemas/Job'
400:
description: 'Bad request'
404:
description: 'Resource not found'
components:
schemas:
Tag:
description: 'A job tag'
type: object
properties:
id:
type: string
description: 'Database ID'
type:
type: string
description: 'Tag type'
name:
type: string
description: 'Tag name'
Job:
$ref: https://raw.githubusercontent.com/ClusterCockpit/cc-specifications/master/schema/json/job-meta.schema.json
securitySchemes:
bearerAuth:
type: http
scheme: bearer
bearerFormat: JWT
security:
- bearerAuth: [] # Applies `bearerAuth` globally
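As an illustration only, stopping a job by its database ID could look like this with `curl` (host, job ID, timestamp, and token are placeholders):
```sh
# Hypothetical values; the JWT must be signed with the configured Ed25519 key.
curl -X POST http://localhost:8080/api/jobs/stop_job/1337 \
  -H "Authorization: Bearer $JWT" \
  -H "Content-Type: application/json" \
  -d '{"stopTime": 1641038400}'
```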

api/rest.go (new file, 340 lines)

@ -0,0 +1,340 @@
package api
import (
"context"
"encoding/json"
"fmt"
"io"
"log"
"net/http"
"os"
"path/filepath"
"github.com/ClusterCockpit/cc-jobarchive/config"
"github.com/ClusterCockpit/cc-jobarchive/graph"
"github.com/ClusterCockpit/cc-jobarchive/metricdata"
"github.com/ClusterCockpit/cc-jobarchive/schema"
sq "github.com/Masterminds/squirrel"
"github.com/gorilla/mux"
"github.com/jmoiron/sqlx"
)
type RestApi struct {
DB *sqlx.DB
Resolver *graph.Resolver
AsyncArchiving bool
MachineStateDir string
}
func (api *RestApi) MountRoutes(r *mux.Router) {
r = r.PathPrefix("/api").Subrouter()
r.StrictSlash(true)
r.HandleFunc("/jobs/start_job/", api.startJob).Methods(http.MethodPost, http.MethodPut)
r.HandleFunc("/jobs/stop_job/", api.stopJob).Methods(http.MethodPost, http.MethodPut)
r.HandleFunc("/jobs/stop_job/{id}", api.stopJob).Methods(http.MethodPost, http.MethodPut)
r.HandleFunc("/jobs/{id}", api.getJob).Methods(http.MethodGet)
r.HandleFunc("/jobs/tag_job/{id}", api.tagJob).Methods(http.MethodPost, http.MethodPatch)
r.HandleFunc("/machine_state/{cluster}/{host}", api.getMachineState).Methods(http.MethodGet)
r.HandleFunc("/machine_state/{cluster}/{host}", api.putMachineState).Methods(http.MethodPut, http.MethodPost)
}
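// Example wiring (illustrative sketch only; `db` and `resolver` are assumed to
// be created elsewhere, e.g. in server.go):
//
//   r := mux.NewRouter()
//   api := &RestApi{DB: db, Resolver: resolver}
//   api.MountRoutes(r)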
type StartJobApiResponse struct {
DBID int64 `json:"id"`
}
type StopJobApiRequest struct {
// JobId, ClusterId and StartTime are optional.
// They are only used if no database id was provided.
JobId *string `json:"jobId"`
Cluster *string `json:"cluster"`
StartTime *int64 `json:"startTime"`
// Payload
StopTime int64 `json:"stopTime"`
State schema.JobState `json:"jobState"`
}
type TagJobApiRequest []*struct {
Name string `json:"name"`
Type string `json:"type"`
}
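// An example request body for tagJob (tag type/name values are illustrative):
//
//   [{"type": "workflow", "name": "debug"}]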
func (api *RestApi) getJob(rw http.ResponseWriter, r *http.Request) {
id := mux.Vars(r)["id"]
job, err := api.Resolver.Query().Job(r.Context(), id)
if err != nil {
http.Error(rw, err.Error(), http.StatusNotFound)
return
}
job.Tags, err = api.Resolver.Job().Tags(r.Context(), job)
if err != nil {
http.Error(rw, err.Error(), http.StatusInternalServerError)
return
}
rw.Header().Add("Content-Type", "application/json")
rw.WriteHeader(http.StatusOK)
json.NewEncoder(rw).Encode(job)
}
func (api *RestApi) tagJob(rw http.ResponseWriter, r *http.Request) {
id := mux.Vars(r)["id"]
job, err := api.Resolver.Query().Job(r.Context(), id)
if err != nil {
http.Error(rw, err.Error(), http.StatusNotFound)
return
}
job.Tags, err = api.Resolver.Job().Tags(r.Context(), job)
if err != nil {
http.Error(rw, err.Error(), http.StatusInternalServerError)
return
}
var req TagJobApiRequest
if err := json.NewDecoder(r.Body).Decode(&req); err != nil {
http.Error(rw, err.Error(), http.StatusBadRequest)
return
}
for _, tag := range req {
var tagId int64
if err := sq.Select("id").From("tag").
Where("tag.tag_type = ?", tag.Type).Where("tag.tag_name = ?", tag.Name).
RunWith(api.DB).QueryRow().Scan(&tagId); err != nil {
http.Error(rw, fmt.Sprintf("the tag '%s:%s' does not exist", tag.Type, tag.Name), http.StatusNotFound)
return
}
if _, err := api.DB.Exec(`INSERT INTO jobtag (job_id, tag_id) VALUES (?, ?)`, job.ID, tagId); err != nil {
http.Error(rw, err.Error(), http.StatusInternalServerError)
return
}
job.Tags = append(job.Tags, &schema.Tag{
ID: tagId,
Type: tag.Type,
Name: tag.Name,
})
}
rw.Header().Add("Content-Type", "application/json")
rw.WriteHeader(http.StatusOK)
json.NewEncoder(rw).Encode(job)
}
func (api *RestApi) startJob(rw http.ResponseWriter, r *http.Request) {
req := schema.JobMeta{BaseJob: schema.JobDefaults}
if err := json.NewDecoder(r.Body).Decode(&req); err != nil {
http.Error(rw, err.Error(), http.StatusBadRequest)
return
}
if config.GetClusterConfig(req.Cluster) == nil {
http.Error(rw, fmt.Sprintf("cluster '%s' does not exist", req.Cluster), http.StatusBadRequest)
return
}
if len(req.Resources) == 0 || len(req.User) == 0 || req.NumNodes == 0 {
http.Error(rw, "required fields are missing", http.StatusBadRequest)
return
}
// Check if combination of (job_id, cluster_id, start_time) already exists:
rows, err := api.DB.Query(`SELECT job.id FROM job WHERE job.job_id = ? AND job.cluster = ? AND job.start_time = ?`,
req.JobID, req.Cluster, req.StartTime)
if err != nil {
http.Error(rw, err.Error(), http.StatusInternalServerError)
return
}
if rows.Next() {
var id int64 = -1
rows.Scan(&id)
http.Error(rw, fmt.Sprintf("a job with that job_id, cluster_id and start_time already exists (database id: %d)", id), http.StatusUnprocessableEntity)
return
}
req.RawResources, err = json.Marshal(req.Resources)
if err != nil {
log.Fatal(err)
}
res, err := api.DB.NamedExec(`INSERT INTO job (
job_id, user, project, cluster, partition, array_job_id, num_nodes, num_hwthreads, num_acc,
exclusive, monitoring_status, smt, job_state, start_time, duration, resources, meta_data
) VALUES (
:job_id, :user, :project, :cluster, :partition, :array_job_id, :num_nodes, :num_hwthreads, :num_acc,
:exclusive, :monitoring_status, :smt, :job_state, :start_time, :duration, :resources, :meta_data
);`, req)
if err != nil {
http.Error(rw, err.Error(), http.StatusInternalServerError)
return
}
id, err := res.LastInsertId()
if err != nil {
http.Error(rw, err.Error(), http.StatusInternalServerError)
return
}
log.Printf("new job (id: %d): cluster=%s, jobId=%d, user=%s, startTime=%d\n", id, req.Cluster, req.JobID, req.User, req.StartTime)
rw.Header().Add("Content-Type", "application/json")
rw.WriteHeader(http.StatusCreated)
json.NewEncoder(rw).Encode(StartJobApiResponse{
DBID: id,
})
}
func (api *RestApi) stopJob(rw http.ResponseWriter, r *http.Request) {
req := StopJobApiRequest{}
if err := json.NewDecoder(r.Body).Decode(&req); err != nil {
http.Error(rw, err.Error(), http.StatusBadRequest)
return
}
var err error
var sql string
var args []interface{}
id, ok := mux.Vars(r)["id"]
if ok {
sql, args, err = sq.Select(schema.JobColumns...).From("job").Where("job.id = ?", id).ToSql()
} else {
sql, args, err = sq.Select(schema.JobColumns...).From("job").
Where("job.job_id = ?", req.JobId).
Where("job.cluster = ?", req.Cluster).
Where("job.start_time = ?", req.StartTime).ToSql()
}
if err != nil {
http.Error(rw, err.Error(), http.StatusBadRequest)
return
}
job, err := schema.ScanJob(api.DB.QueryRowx(sql, args...))
if err != nil {
http.Error(rw, err.Error(), http.StatusBadRequest)
return
}
if job == nil || job.StartTime.Unix() >= req.StopTime || job.State != schema.JobStateRunning {
http.Error(rw, "stop_time must be larger than start_time and only running jobs can be stopped", http.StatusBadRequest)
return
}
if req.State == "" {
req.State = schema.JobStateCompleted
} else if !req.State.Valid() {
http.Error(rw, fmt.Sprintf("invalid job state: '%s'", req.State), http.StatusBadRequest)
return
}
doArchiving := func(job *schema.Job, ctx context.Context) error {
job.Duration = int32(req.StopTime - job.StartTime.Unix())
jobMeta, err := metricdata.ArchiveJob(job, ctx)
if err != nil {
log.Printf("archiving job (dbid: %d) failed: %s\n", job.ID, err.Error())
return err
}
stmt := sq.Update("job").
Set("job_state", req.State).
Set("duration", job.Duration).
Where("job.id = ?", job.ID)
for metric, stats := range jobMeta.Statistics {
switch metric {
case "flops_any":
stmt = stmt.Set("flops_any_avg", stats.Avg)
case "mem_used":
stmt = stmt.Set("mem_used_max", stats.Max)
case "mem_bw":
stmt = stmt.Set("mem_bw_avg", stats.Avg)
case "load":
stmt = stmt.Set("load_avg", stats.Avg)
case "net_bw":
stmt = stmt.Set("net_bw_avg", stats.Avg)
case "file_bw":
stmt = stmt.Set("file_bw_avg", stats.Avg)
}
}
sql, args, err := stmt.ToSql()
if err != nil {
log.Printf("archiving job (dbid: %d) failed: %s\n", job.ID, err.Error())
return err
}
if _, err := api.DB.Exec(sql, args...); err != nil {
log.Printf("archiving job (dbid: %d) failed: %s\n", job.ID, err.Error())
return err
}
log.Printf("job stopped and archived (dbid: %d)\n", job.ID)
return nil
}
log.Printf("archiving job... (dbid: %d): cluster=%s, jobId=%d, user=%s, startTime=%s\n", job.ID, job.Cluster, job.JobID, job.User, job.StartTime)
if api.AsyncArchiving {
rw.Header().Add("Content-Type", "application/json")
rw.WriteHeader(http.StatusOK)
json.NewEncoder(rw).Encode(job)
go doArchiving(job, context.Background())
} else {
err := doArchiving(job, r.Context())
if err != nil {
http.Error(rw, err.Error(), http.StatusInternalServerError)
} else {
rw.Header().Add("Content-Type", "application/json")
rw.WriteHeader(http.StatusOK)
json.NewEncoder(rw).Encode(job)
}
}
}
func (api *RestApi) putMachineState(rw http.ResponseWriter, r *http.Request) {
if api.MachineStateDir == "" {
http.Error(rw, "not enabled", http.StatusNotFound)
return
}
vars := mux.Vars(r)
cluster := vars["cluster"]
host := vars["host"]
dir := filepath.Join(api.MachineStateDir, cluster)
if err := os.MkdirAll(dir, 0755); err != nil {
http.Error(rw, err.Error(), http.StatusInternalServerError)
return
}
filename := filepath.Join(dir, fmt.Sprintf("%s.json", host))
f, err := os.Create(filename)
if err != nil {
http.Error(rw, err.Error(), http.StatusInternalServerError)
return
}
defer f.Close()
if _, err := io.Copy(f, r.Body); err != nil {
http.Error(rw, err.Error(), http.StatusInternalServerError)
return
}
rw.WriteHeader(http.StatusCreated)
}
func (api *RestApi) getMachineState(rw http.ResponseWriter, r *http.Request) {
if api.MachineStateDir == "" {
http.Error(rw, "not enabled", http.StatusNotFound)
return
}
vars := mux.Vars(r)
filename := filepath.Join(api.MachineStateDir, vars["cluster"], fmt.Sprintf("%s.json", vars["host"]))
// http.ServeFile sets the Content-Type and 'Last-Modified' headers automatically
http.ServeFile(rw, r, filename)
}

auth/auth.go (new file, 339 lines)

@ -0,0 +1,339 @@
package auth
import (
"context"
"crypto/ed25519"
"crypto/rand"
"database/sql"
"encoding/base64"
"encoding/json"
"errors"
"fmt"
"log"
"net/http"
"os"
"strings"
"github.com/ClusterCockpit/cc-jobarchive/templates"
sq "github.com/Masterminds/squirrel"
"github.com/golang-jwt/jwt/v4"
"github.com/gorilla/sessions"
"github.com/jmoiron/sqlx"
"golang.org/x/crypto/bcrypt"
)
type User struct {
Username string
Password string
Name string
IsAdmin bool
IsAPIUser bool
ViaLdap bool
Email string
}
type ContextKey string
const ContextUserKey ContextKey = "user"
var JwtPublicKey ed25519.PublicKey
var JwtPrivateKey ed25519.PrivateKey
var sessionStore *sessions.CookieStore
func Init(db *sqlx.DB, ldapConfig *LdapConfig) error {
_, err := db.Exec(`
CREATE TABLE IF NOT EXISTS user (
username varchar(255) PRIMARY KEY,
password varchar(255) DEFAULT NULL,
ldap tinyint DEFAULT 0,
name varchar(255) DEFAULT NULL,
roles varchar(255) DEFAULT NULL,
email varchar(255) DEFAULT NULL);`)
if err != nil {
return err
}
sessKey := os.Getenv("SESSION_KEY")
if sessKey == "" {
log.Println("warning: environment variable 'SESSION_KEY' not set (will use non-persistent random key)")
bytes := make([]byte, 32)
if _, err := rand.Read(bytes); err != nil {
return err
}
sessionStore = sessions.NewCookieStore(bytes)
} else {
bytes, err := base64.StdEncoding.DecodeString(sessKey)
if err != nil {
return err
}
sessionStore = sessions.NewCookieStore(bytes)
}
pubKey, privKey := os.Getenv("JWT_PUBLIC_KEY"), os.Getenv("JWT_PRIVATE_KEY")
if pubKey == "" || privKey == "" {
log.Println("warning: environment variables 'JWT_PUBLIC_KEY' or 'JWT_PRIVATE_KEY' not set (token based authentication will not work)")
} else {
bytes, err := base64.StdEncoding.DecodeString(pubKey)
if err != nil {
return err
}
JwtPublicKey = ed25519.PublicKey(bytes)
bytes, err = base64.StdEncoding.DecodeString(privKey)
if err != nil {
return err
}
JwtPrivateKey = ed25519.PrivateKey(bytes)
}
if ldapConfig != nil {
if err := initLdap(ldapConfig); err != nil {
return err
}
}
return nil
}
// arg must be formatted like this: "<username>:[admin|api]:<password>"
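// Example call (hypothetical credentials):
//
//   err := AddUserToDB(db, "alice:admin:s3cret")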
func AddUserToDB(db *sqlx.DB, arg string) error {
parts := strings.SplitN(arg, ":", 3)
if len(parts) != 3 || len(parts[0]) == 0 || len(parts[2]) == 0 || !(len(parts[1]) == 0 || parts[1] == "admin" || parts[1] == "api") {
return errors.New("invalid argument format")
}
password, err := bcrypt.GenerateFromPassword([]byte(parts[2]), bcrypt.DefaultCost)
if err != nil {
return err
}
roles := "[]"
if parts[1] == "admin" {
roles = "[\"ROLE_ADMIN\"]"
}
if parts[1] == "api" {
roles = "[\"ROLE_API\"]"
}
_, err = sq.Insert("user").Columns("username", "password", "roles").Values(parts[0], string(password), roles).RunWith(db).Exec()
if err != nil {
return err
}
log.Printf("new user '%s' added (roles: %s)\n", parts[0], roles)
return nil
}
func DelUserFromDB(db *sqlx.DB, username string) error {
_, err := db.Exec(`DELETE FROM user WHERE user.username = ?`, username)
return err
}
func FetchUserFromDB(db *sqlx.DB, username string) (*User, error) {
user := &User{Username: username}
var hashedPassword, name, rawRoles, email sql.NullString
if err := sq.Select("password", "ldap", "name", "roles", "email").From("user").
Where("user.username = ?", username).RunWith(db).
QueryRow().Scan(&hashedPassword, &user.ViaLdap, &name, &rawRoles, &email); err != nil {
return nil, fmt.Errorf("user '%s' not found (%s)", username, err.Error())
}
user.Password = hashedPassword.String
user.Name = name.String
user.Email = email.String
var roles []string
if rawRoles.Valid {
json.Unmarshal([]byte(rawRoles.String), &roles)
}
for _, role := range roles {
switch role {
case "ROLE_ADMIN":
user.IsAdmin = true
case "ROLE_API":
user.IsAPIUser = true
}
}
return user, nil
}
// Handle a POST request that should log the user in,
// starting a new session.
func Login(db *sqlx.DB) http.Handler {
return http.HandlerFunc(func(rw http.ResponseWriter, r *http.Request) {
username, password := r.FormValue("username"), r.FormValue("password")
user, err := FetchUserFromDB(db, username)
if err == nil && user.ViaLdap && ldapAuthEnabled {
err = loginViaLdap(user, password)
} else if err == nil && !user.ViaLdap && user.Password != "" {
if e := bcrypt.CompareHashAndPassword([]byte(user.Password), []byte(password)); e != nil {
err = fmt.Errorf("user '%s' provided the wrong password (%s)", username, e.Error())
}
} else {
err = errors.New("could not authenticate user")
}
if err != nil {
log.Printf("login failed: %s\n", err.Error())
rw.WriteHeader(http.StatusUnauthorized)
templates.Render(rw, r, "login.html", &templates.Page{
Title: "Login failed",
Login: &templates.LoginPage{
Error: "Username or password incorrect",
},
})
return
}
session, err := sessionStore.New(r, "session")
if err != nil {
log.Printf("session creation failed: %s\n", err.Error())
http.Error(rw, err.Error(), http.StatusInternalServerError)
return
}
session.Values["username"] = user.Username
session.Values["is_admin"] = user.IsAdmin
if err := sessionStore.Save(r, rw, session); err != nil {
log.Printf("session save failed: %s\n", err.Error())
http.Error(rw, err.Error(), http.StatusInternalServerError)
return
}
log.Printf("login successfull: user: %#v\n", user)
http.Redirect(rw, r, "/", http.StatusTemporaryRedirect)
})
}
var ErrTokenInvalid error = errors.New("invalid token")
func authViaToken(r *http.Request) (*User, error) {
if JwtPublicKey == nil {
return nil, nil
}
rawtoken := r.Header.Get("X-Auth-Token")
if rawtoken == "" {
rawtoken = r.Header.Get("Authorization")
prefix := "Bearer "
if !strings.HasPrefix(rawtoken, prefix) {
return nil, nil
}
rawtoken = rawtoken[len(prefix):]
}
token, err := jwt.Parse(rawtoken, func(t *jwt.Token) (interface{}, error) {
if t.Method != jwt.SigningMethodEdDSA {
return nil, errors.New("only Ed25519/EdDSA supported")
}
return JwtPublicKey, nil
})
if err != nil {
return nil, ErrTokenInvalid
}
if err := token.Claims.Valid(); err != nil {
return nil, ErrTokenInvalid
}
claims := token.Claims.(jwt.MapClaims)
sub, _ := claims["sub"].(string)
isAdmin, _ := claims["is_admin"].(bool)
isAPIUser, _ := claims["is_api"].(bool)
return &User{
Username: sub,
IsAdmin: isAdmin,
IsAPIUser: isAPIUser,
}, nil
}
// Authenticate the user and put a User object in the
// context of the request. If authentication fails,
// do not continue but send the client to the login screen.
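// Typical usage is as router middleware (sketch; `r` is a *mux.Router):
//
//   r.Use(Auth) // every route registered on r now requires a session or JWT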
func Auth(next http.Handler) http.Handler {
return http.HandlerFunc(func(rw http.ResponseWriter, r *http.Request) {
user, err := authViaToken(r)
if err == ErrTokenInvalid {
log.Printf("authentication failed: invalid token\n")
http.Error(rw, err.Error(), http.StatusUnauthorized)
return
}
if user != nil {
ctx := context.WithValue(r.Context(), ContextUserKey, user)
next.ServeHTTP(rw, r.WithContext(ctx))
return
}
session, err := sessionStore.Get(r, "session")
if err != nil {
http.Error(rw, err.Error(), http.StatusInternalServerError)
return
}
if session.IsNew {
log.Printf("authentication failed: no session or jwt found\n")
rw.WriteHeader(http.StatusUnauthorized)
templates.Render(rw, r, "login.html", &templates.Page{
Title: "Authentication failed",
Login: &templates.LoginPage{
Error: "No valid session or JWT provided",
},
})
return
}
ctx := context.WithValue(r.Context(), ContextUserKey, &User{
Username: session.Values["username"].(string),
IsAdmin: session.Values["is_admin"].(bool),
})
next.ServeHTTP(rw, r.WithContext(ctx))
})
}
// Generate a new JWT that can be used for authentication
func ProvideJWT(user *User) (string, error) {
if JwtPrivateKey == nil {
return "", errors.New("environment variable 'JWT_PUBLIC_KEY' not set")
}
tok := jwt.NewWithClaims(jwt.SigningMethodEdDSA, jwt.MapClaims{
"sub": user.Username,
"is_admin": user.IsAdmin,
"is_api": user.IsAPIUser,
})
return tok.SignedString(JwtPrivateKey)
}
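// Example (hypothetical user); the returned token can be sent either as
// "Authorization: Bearer <token>" or in the "X-Auth-Token" header:
//
//   token, err := ProvideJWT(&User{Username: "api-bot", IsAPIUser: true})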
func GetUser(ctx context.Context) *User {
x := ctx.Value(ContextUserKey)
if x == nil {
return nil
}
return x.(*User)
}
// Clears the session cookie
func Logout(rw http.ResponseWriter, r *http.Request) {
session, err := sessionStore.Get(r, "session")
if err != nil {
http.Error(rw, err.Error(), http.StatusInternalServerError)
return
}
if !session.IsNew {
session.Options.MaxAge = -1
if err := sessionStore.Save(r, rw, session); err != nil {
http.Error(rw, err.Error(), http.StatusInternalServerError)
return
}
}
templates.Render(rw, r, "login.html", &templates.Page{
Title: "Logout successful",
Login: &templates.LoginPage{
Info: "Logout successful",
},
})
}

auth/ldap.go (new file, 183 lines)

@ -0,0 +1,183 @@
package auth
import (
"crypto/tls"
"errors"
"fmt"
"log"
"os"
"strings"
"sync"
"github.com/go-ldap/ldap/v3"
"github.com/jmoiron/sqlx"
)
type LdapConfig struct {
Url string `json:"url"`
UserBase string `json:"user_base"`
SearchDN string `json:"search_dn"`
UserBind string `json:"user_bind"`
UserFilter string `json:"user_filter"`
TLS bool `json:"tls"`
}
var ldapAuthEnabled bool = false
var ldapConfig *LdapConfig
var ldapAdminPassword string
func initLdap(config *LdapConfig) error {
ldapAdminPassword = os.Getenv("LDAP_ADMIN_PASSWORD")
if ldapAdminPassword == "" {
log.Println("warning: environment variable 'LDAP_ADMIN_PASSWORD' not set (ldap sync or authentication will not work)")
}
ldapConfig = config
ldapAuthEnabled = true
return nil
}
var ldapConnectionsLock sync.Mutex
var ldapConnections []*ldap.Conn = []*ldap.Conn{}
// TODO: Add a connection pool or something like
// that so that connections can be reused/cached.
func getLdapConnection() (*ldap.Conn, error) {
ldapConnectionsLock.Lock()
n := len(ldapConnections)
if n > 0 {
conn := ldapConnections[n-1]
ldapConnections = ldapConnections[:n-1]
ldapConnectionsLock.Unlock()
return conn, nil
}
ldapConnectionsLock.Unlock()
conn, err := ldap.DialURL(ldapConfig.Url)
if err != nil {
return nil, err
}
if ldapConfig.TLS {
if err := conn.StartTLS(&tls.Config{InsecureSkipVerify: true}); err != nil {
conn.Close()
return nil, err
}
}
if err := conn.Bind(ldapConfig.SearchDN, ldapAdminPassword); err != nil {
conn.Close()
return nil, err
}
return conn, nil
}
func releaseConnection(conn *ldap.Conn) {
// Re-bind to the user we can run queries with
if err := conn.Bind(ldapConfig.SearchDN, ldapAdminPassword); err != nil {
conn.Close()
log.Printf("ldap error: %s", err.Error())
}
ldapConnectionsLock.Lock()
defer ldapConnectionsLock.Unlock()
n := len(ldapConnections)
if n > 2 {
conn.Close()
return
}
ldapConnections = append(ldapConnections, conn)
}
func loginViaLdap(user *User, password string) error {
l, err := getLdapConnection()
if err != nil {
return err
}
defer releaseConnection(l)
userDn := strings.Replace(ldapConfig.UserBind, "{username}", user.Username, -1)
if err := l.Bind(userDn, password); err != nil {
return err
}
user.ViaLdap = true
return nil
}
// Delete users where user.ldap is 1 and that do not show up in the ldap search results.
// Add users to the users table that are new in the ldap search results.
func SyncWithLDAP(db *sqlx.DB) error {
if !ldapAuthEnabled {
return errors.New("ldap not enabled")
}
const IN_DB int = 1
const IN_LDAP int = 2
const IN_BOTH int = 3
users := map[string]int{}
rows, err := db.Query(`SELECT username FROM user WHERE user.ldap = 1`)
if err != nil {
return err
}
for rows.Next() {
var username string
if err := rows.Scan(&username); err != nil {
return err
}
users[username] = IN_DB
}
l, err := getLdapConnection()
if err != nil {
return err
}
defer releaseConnection(l)
ldapResults, err := l.Search(ldap.NewSearchRequest(
ldapConfig.UserBase, ldap.ScopeWholeSubtree, ldap.NeverDerefAliases, 0, 0, false,
ldapConfig.UserFilter, []string{"dn", "uid", "gecos"}, nil))
if err != nil {
return err
}
newnames := map[string]string{}
for _, entry := range ldapResults.Entries {
username := entry.GetAttributeValue("uid")
if username == "" {
return errors.New("no attribute 'uid'")
}
_, ok := users[username]
if !ok {
users[username] = IN_LDAP
newnames[username] = entry.GetAttributeValue("gecos")
} else {
users[username] = IN_BOTH
}
}
for username, where := range users {
if where == IN_DB {
fmt.Printf("ldap-sync: remove '%s' (does not show up in LDAP anymore)\n", username)
if _, err := db.Exec(`DELETE FROM user WHERE user.username = ?`, username); err != nil {
return err
}
} else if where == IN_LDAP {
name := newnames[username]
fmt.Printf("ldap-sync: add '%s' (name: '%s', roles: [], ldap: true)\n", username, name)
if _, err := db.Exec(`INSERT INTO user (username, ldap, name, roles) VALUES (?, ?, ?, ?)`,
username, 1, name, "[]"); err != nil {
return err
}
}
}
return nil
}


@ -3,82 +3,160 @@ package config
import (
"context"
"encoding/json"
"fmt"
"log"
"net/http"
"os"
"path/filepath"
"sync"
"time"
"github.com/ClusterCockpit/cc-jobarchive/auth"
"github.com/ClusterCockpit/cc-jobarchive/graph/model"
"github.com/jmoiron/sqlx"
)
var db *sqlx.DB
var lock sync.RWMutex
var config map[string]interface{}
var uiDefaults map[string]interface{}
var Clusters []*model.Cluster
const configFilePath string = "./var/ui.config.json"
func init() {
lock.Lock()
defer lock.Unlock()
bytes, err := os.ReadFile(configFilePath)
func Init(usersdb *sqlx.DB, authEnabled bool, uiConfig map[string]interface{}, jobArchive string) error {
db = usersdb
uiDefaults = uiConfig
entries, err := os.ReadDir(jobArchive)
if err != nil {
log.Fatal(err)
return err
}
if err := json.Unmarshal(bytes, &config); err != nil {
log.Fatal(err)
Clusters = []*model.Cluster{}
for _, de := range entries {
bytes, err := os.ReadFile(filepath.Join(jobArchive, de.Name(), "cluster.json"))
if err != nil {
return err
}
var cluster model.Cluster
if err := json.Unmarshal(bytes, &cluster); err != nil {
return err
}
if cluster.FilterRanges.StartTime.To.IsZero() {
cluster.FilterRanges.StartTime.To = time.Unix(0, 0)
}
if cluster.Name != de.Name() {
return fmt.Errorf("the file '%s/cluster.json' contains the clusterId '%s'", de.Name(), cluster.Name)
}
Clusters = append(Clusters, &cluster)
}
if authEnabled {
_, err := db.Exec(`
CREATE TABLE IF NOT EXISTS configuration (
username varchar(255),
key varchar(255),
value varchar(255),
PRIMARY KEY (username, key),
FOREIGN KEY (username) REFERENCES user (username) ON DELETE CASCADE ON UPDATE NO ACTION);`)
if err != nil {
return err
}
}
return nil
}
// Call this function to change the current configuration.
// `value` must be valid JSON. This function is thread-safe.
func UpdateConfig(key, value string, ctx context.Context) error {
var v interface{}
if err := json.Unmarshal([]byte(value), &v); err != nil {
return err
// Return the personalised UI config for the currently authenticated
// user or return the plain default config.
func GetUIConfig(r *http.Request) (map[string]interface{}, error) {
lock.RLock()
config := make(map[string]interface{}, len(uiDefaults))
for k, v := range uiDefaults {
config[k] = v
}
lock.RUnlock()
user := auth.GetUser(r.Context())
if user == nil {
return config, nil
}
lock.Lock()
defer lock.Unlock()
config[key] = v
bytes, err := json.Marshal(config)
rows, err := db.Query(`SELECT key, value FROM configuration WHERE configuration.username = ?`, user.Username)
if err != nil {
return err
return nil, err
}
if err := os.WriteFile(configFilePath, bytes, 0644); err != nil {
for rows.Next() {
var key, rawval string
if err := rows.Scan(&key, &rawval); err != nil {
return nil, err
}
var val interface{}
if err := json.Unmarshal([]byte(rawval), &val); err != nil {
return nil, err
}
config[key] = val
}
return config, nil
}
// If the context does not have a user, update the global ui configuration without persisting it!
// If there is an authenticated user, update only that user's configuration.
func UpdateConfig(key, value string, ctx context.Context) error {
user := auth.GetUser(ctx)
if user == nil {
lock.Lock()
defer lock.Unlock()
var val interface{}
if err := json.Unmarshal([]byte(value), &val); err != nil {
return err
}
uiDefaults[key] = val
return nil
}
if _, err := db.Exec(`REPLACE INTO configuration (username, key, value) VALUES (?, ?, ?)`,
user.Username, key, value); err != nil {
log.Printf("db.Exec: %s\n", err.Error())
return err
}
return nil
}
// http.HandlerFunc compatible function that serves the current configuration as JSON
func ServeConfig(rw http.ResponseWriter, r *http.Request) {
lock.RLock()
defer lock.RUnlock()
rw.Header().Set("Content-Type", "application/json")
if err := json.NewEncoder(rw).Encode(config); err != nil {
http.Error(rw, err.Error(), http.StatusInternalServerError)
}
}
func GetClusterConfig(cluster string) *model.Cluster {
for _, c := range Clusters {
if c.ClusterID == cluster {
if c.Name == cluster {
return c
}
}
return nil
}
func GetPartition(cluster, partition string) *model.Partition {
for _, c := range Clusters {
if c.Name == cluster {
for _, p := range c.Partitions {
if p.Name == partition {
return p
}
}
}
}
return nil
}
func GetMetricConfig(cluster, metric string) *model.MetricConfig {
for _, c := range Clusters {
if c.ClusterID == cluster {
if c.Name == cluster {
for _, m := range c.MetricConfig {
if m.Name == metric {
return m

@ -1 +1 @@
Subproject commit b487af3496b46942d9848337bc2821575a1390b2
Subproject commit cc48461a810dbd3565000150fc99332743de92ba

go.mod (8 lines changed)

@ -5,9 +5,15 @@ go 1.15
require (
github.com/99designs/gqlgen v0.13.0
github.com/Masterminds/squirrel v1.5.1
github.com/go-ldap/ldap/v3 v3.4.1
github.com/golang-jwt/jwt/v4 v4.1.0
github.com/gorilla/handlers v1.5.1
github.com/gorilla/mux v1.6.1
github.com/gorilla/mux v1.8.0
github.com/gorilla/sessions v1.2.1
github.com/jmoiron/sqlx v1.3.1
github.com/mattn/go-sqlite3 v1.14.6
github.com/stretchr/testify v1.5.1 // indirect
github.com/vektah/gqlparser/v2 v2.1.0
golang.org/x/crypto v0.0.0-20211117183948-ae814b36b871
gopkg.in/yaml.v2 v2.3.0 // indirect
)

go.sum (32 lines changed)

@ -1,5 +1,7 @@
github.com/99designs/gqlgen v0.13.0 h1:haLTcUp3Vwp80xMVEg5KRNwzfUrgFdRmtBY8fuB8scA=
github.com/99designs/gqlgen v0.13.0/go.mod h1:NV130r6f4tpRWuAI+zsrSdooO/eWUv+Gyyoi3rEfXIk=
github.com/Azure/go-ntlmssp v0.0.0-20200615164410-66371956d46c h1:/IBSNwUN8+eKzUzbJPqhK839ygXJ82sde8x3ogr6R28=
github.com/Azure/go-ntlmssp v0.0.0-20200615164410-66371956d46c/go.mod h1:chxPXzSsl7ZWRAuOIE23GDNzjWuZquvFlgA8xmpunjU=
github.com/BurntSushi/toml v0.3.1/go.mod h1:xHWCNGjB5oqiDr8zfno3MHue2Ht5sIBksp03qcyfWMU=
github.com/Masterminds/squirrel v1.5.1 h1:kWAKlLLJFxZG7N2E0mBMNWVp5AuUX+JUrnhFN74Eg+w=
github.com/Masterminds/squirrel v1.5.1/go.mod h1:NNaOrjSoIDfDA40n7sr2tPNZRfjzjA400rg+riTZj10=
@ -19,16 +21,26 @@ github.com/dgryski/trifles v0.0.0-20190318185328-a8d75aae118c h1:TUuUh0Xgj97tLMN
github.com/dgryski/trifles v0.0.0-20190318185328-a8d75aae118c/go.mod h1:if7Fbed8SFyPtHLHbg49SI7NAdJiC5WIA09pe59rfAA=
github.com/felixge/httpsnoop v1.0.1 h1:lvB5Jl89CsZtGIWuTcDM1E/vkVs49/Ml7JJe07l8SPQ=
github.com/felixge/httpsnoop v1.0.1/go.mod h1:m8KPJKqk1gH5J9DgRY2ASl2lWCfGKXixSwevea8zH2U=
github.com/go-asn1-ber/asn1-ber v1.5.1 h1:pDbRAunXzIUXfx4CB2QJFv5IuPiuoW+sWvr/Us009o8=
github.com/go-asn1-ber/asn1-ber v1.5.1/go.mod h1:hEBeB/ic+5LoWskz+yKT7vGhhPYkProFKoKdwZRWMe0=
github.com/go-chi/chi v3.3.2+incompatible/go.mod h1:eB3wogJHnLi3x/kFX2A+IbTBlXxmMeXJVKy9tTv1XzQ=
github.com/go-ldap/ldap/v3 v3.4.1 h1:fU/0xli6HY02ocbMuozHAYsaHLcnkLjvho2r5a34BUU=
github.com/go-ldap/ldap/v3 v3.4.1/go.mod h1:iYS1MdmrmceOJ1QOTnRXrIs7i3kloqtmGQjRvjKpyMg=
github.com/go-sql-driver/mysql v1.5.0 h1:ozyZYNQW3x3HtqT1jira07DN2PArx2v7/mN66gGcHOs=
github.com/go-sql-driver/mysql v1.5.0/go.mod h1:DCzpHaOWr8IXmIStZouvnhqoel9Qv2LBy8hT2VhHyBg=
github.com/gogo/protobuf v1.0.0/go.mod h1:r8qH/GZQm5c6nD/R0oafs1akxWv10x8SbQlK7atdtwQ=
github.com/gorilla/context v0.0.0-20160226214623-1ea25387ff6f h1:9oNbS1z4rVpbnkHBdPZU4jo9bSmrLpII768arSyMFgk=
github.com/golang-jwt/jwt/v4 v4.1.0 h1:XUgk2Ex5veyVFVeLm0xhusUTQybEbexJXrvPNOKkSY0=
github.com/golang-jwt/jwt/v4 v4.1.0/go.mod h1:/xlHOz8bRuivTWchD4jCa+NbatV+wEUSzwAxVc6locg=
github.com/gorilla/context v0.0.0-20160226214623-1ea25387ff6f/go.mod h1:kBGZzfjB9CEq2AlWe17Uuf7NDRt0dE0s8S51q0aT7Yg=
github.com/gorilla/handlers v1.5.1 h1:9lRY6j8DEeeBT10CvO9hGW0gmky0BprnvDI5vfhUHH4=
github.com/gorilla/handlers v1.5.1/go.mod h1:t8XrUpc4KVXb7HGyJ4/cEnwQiaxrX/hz1Zv/4g96P1Q=
github.com/gorilla/mux v1.6.1 h1:KOwqsTYZdeuMacU7CxjMNYEKeBvLbxW+psodrbcEa3A=
github.com/gorilla/mux v1.6.1/go.mod h1:1lud6UwP+6orDFRuTfBEV8e9/aOM/c4fVVCaMa2zaAs=
github.com/gorilla/mux v1.8.0 h1:i40aqfkR1h2SlN9hojwV5ZA91wcXFOvkdNIeFDP5koI=
github.com/gorilla/mux v1.8.0/go.mod h1:DVbg23sWSpFRCP0SfiEN6jmj59UnW/n46BH5rLB71So=
github.com/gorilla/securecookie v1.1.1 h1:miw7JPhV+b/lAHSXz4qd/nN9jRiAFV5FwjeKyCS8BvQ=
github.com/gorilla/securecookie v1.1.1/go.mod h1:ra0sb63/xPlUeL+yeDciTfxMRAA+MP+HVt/4epWDjd4=
github.com/gorilla/sessions v1.2.1 h1:DHd3rPN5lE3Ts3D8rKkQ8x/0kqfeNmBAaiSi+o7FsgI=
github.com/gorilla/sessions v1.2.1/go.mod h1:dk2InVEVJ0sfLlnXv9EAgkf6ecYs/i80K/zI+bUmuGM=
github.com/gorilla/websocket v1.4.2 h1:+/TMaTYc4QFitKJxsQ7Yye35DkWvkdLcvGKqM+x0Ufc=
github.com/gorilla/websocket v1.4.2/go.mod h1:YR8l580nyteQvAITg2hZ9XVh4b55+EU/adAjf1fMHhE=
github.com/hashicorp/golang-lru v0.5.0 h1:CL2msUPvZTLb5O648aiLNJw3hnBxN2+1Jq8rCOH9wdo=
@ -73,8 +85,9 @@ github.com/shurcooL/vfsgen v0.0.0-20180121065927-ffb13db8def0/go.mod h1:TrYk7fJV
github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
github.com/stretchr/testify v1.2.1/go.mod h1:a8OnRcib4nhh0OaRAV+Yts87kKdq0PP7pXfy6kDkUVs=
github.com/stretchr/testify v1.2.2/go.mod h1:a8OnRcib4nhh0OaRAV+Yts87kKdq0PP7pXfy6kDkUVs=
github.com/stretchr/testify v1.4.0 h1:2E4SXV/wtOkTonXsotYi4li6zVWxYlZuYNCXe9XRJyk=
github.com/stretchr/testify v1.4.0/go.mod h1:j7eGeouHqKxXV5pUuKE4zz7dFj8WfuZ+81PSLYec5m4=
github.com/stretchr/testify v1.5.1 h1:nOGnQDM7FYENwehXlg/kFVnos3rEvtKTjRvOWSzb6H4=
github.com/stretchr/testify v1.5.1/go.mod h1:5W2xD1RspED5o8YsWQXVCued0rvSQ+mT+I5cxcmMvtA=
github.com/urfave/cli/v2 v2.1.1 h1:Qt8FeAtxE/vfdrLmR3rxR6JRE0RoVmbXu8+6kZtYU4k=
github.com/urfave/cli/v2 v2.1.1/go.mod h1:SE9GqnLQmjVa0iPEY0f1w3ygNIYcIJ0OKPMoW2caLfQ=
github.com/vektah/dataloaden v0.2.1-0.20190515034641-a19b9a6e7c9e/go.mod h1:/HUdMve7rvxZma+2ZELQeNh88+003LL7Pf/CZ089j8U=
@ -82,16 +95,26 @@ github.com/vektah/gqlparser/v2 v2.1.0 h1:uiKJ+T5HMGGQM2kRKQ8Pxw8+Zq9qhhZhz/lieYv
github.com/vektah/gqlparser/v2 v2.1.0/go.mod h1:SyUiHgLATUR8BiYURfTirrTcGpcE+4XkV2se04Px1Ms=
golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w=
golang.org/x/crypto v0.0.0-20191011191535-87dc89f01550/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI=
golang.org/x/crypto v0.0.0-20200604202706-70a84ac30bf9/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto=
golang.org/x/crypto v0.0.0-20211117183948-ae814b36b871 h1:/pEO3GD/ABYAjuakUS6xSEmmlyVS4kxBNkeA9tLJiTI=
golang.org/x/crypto v0.0.0-20211117183948-ae814b36b871/go.mod h1:IxCIyHEi3zRg3s0A5j5BB6A9Jmi73HwBIUl50j+osU4=
golang.org/x/mod v0.1.1-0.20191105210325-c90efee705ee/go.mod h1:QqPTAvyqsEbceGzBzNggFXnrqF1CaUcvgkdR5Ot7KZg=
golang.org/x/net v0.0.0-20190311183353-d8887717615a/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg=
golang.org/x/net v0.0.0-20190404232315-eb5bcb51f2a3/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg=
golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s=
golang.org/x/net v0.0.0-20211112202133-69e39bad7dc2/go.mod h1:9nx3DQGgdP8bBQD5qxJ1jj9UTztislL4KSBs9R2vV5Y=
golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
golang.org/x/sys v0.0.0-20190222072716-a9d3bda3a223/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20200116001909-b77594299b42/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20210423082822-04245dca01da/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20210615035016-665e8c7367d1/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo=
golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
golang.org/x/text v0.3.6/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
golang.org/x/tools v0.0.0-20190125232054-d66bd3c5d5a6/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
golang.org/x/tools v0.0.0-20190515012406-7d7faa4812bd/go.mod h1:RgjU9mgBXZiqYHBnxXauZ1Gv1EHHAz9KjViQ78xBX0Q=
golang.org/x/tools v0.0.0-20200114235610-7ae403b6b589 h1:rjUrONFu4kLchcZTfp3/96bR8bW8dIa8uz3cR5n0cgM=
@ -101,7 +124,8 @@ gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8
gopkg.in/check.v1 v1.0.0-20190902080502-41f04d3bba15 h1:YR8cESwS4TdDjEe65xsg0ogRM/Nc3DYOhEAlW+xobZo=
gopkg.in/check.v1 v1.0.0-20190902080502-41f04d3bba15/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
gopkg.in/yaml.v2 v2.2.2/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI=
gopkg.in/yaml.v2 v2.2.4 h1:/eiJrUcujPVeJ3xlSWaiNi3uSVmDGBK1pDHUHAnao1I=
gopkg.in/yaml.v2 v2.2.4/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI=
gopkg.in/yaml.v2 v2.3.0 h1:clyUAQHOM3G0M3f5vQj7LuJrETvjVot3Z5el9nffUtU=
gopkg.in/yaml.v2 v2.3.0/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI=
sourcegraph.com/sourcegraph/appdash v0.0.0-20180110180208-2cc67fd64755/go.mod h1:hI742Nqp5OhwiqlzhgfbWU4mW4yO10fP+LoT9WOswdU=
sourcegraph.com/sourcegraph/appdash-data v0.0.0-20151005221446-73f23eafcf67/go.mod h1:L5q+DGLGOQFpo1snNEkLOJT2d1YTW66rWNzatr3He1k=


@ -55,17 +55,19 @@ models:
- github.com/99designs/gqlgen/graphql.Int64
- github.com/99designs/gqlgen/graphql.Int32
Job:
model: "github.com/ClusterCockpit/cc-jobarchive/schema.Job"
fields:
tags:
resolver: true
JobMetric:
model: "github.com/ClusterCockpit/cc-jobarchive/schema.JobMetric"
JobMetricSeries:
model: "github.com/ClusterCockpit/cc-jobarchive/schema.MetricSeries"
JobMetricStatistics:
model: "github.com/ClusterCockpit/cc-jobarchive/schema.MetricStatistics"
NullableFloat:
model: "github.com/ClusterCockpit/cc-jobarchive/schema.Float"
JobMetricScope:
model: "github.com/ClusterCockpit/cc-jobarchive/schema.MetricScope"
NullableFloat: { model: "github.com/ClusterCockpit/cc-jobarchive/schema.Float" }
MetricScope: { model: "github.com/ClusterCockpit/cc-jobarchive/schema.MetricScope" }
JobStatistics: { model: "github.com/ClusterCockpit/cc-jobarchive/schema.JobStatistics" }
Tag: { model: "github.com/ClusterCockpit/cc-jobarchive/schema.Tag" }
Resource: { model: "github.com/ClusterCockpit/cc-jobarchive/schema.Resource" }
JobState: { model: "github.com/ClusterCockpit/cc-jobarchive/schema.JobState" }
JobMetric: { model: "github.com/ClusterCockpit/cc-jobarchive/schema.JobMetric" }
Series: { model: "github.com/ClusterCockpit/cc-jobarchive/schema.Series" }
MetricStatistics: { model: "github.com/ClusterCockpit/cc-jobarchive/schema.MetricStatistics" }
StatsSeries: { model: "github.com/ClusterCockpit/cc-jobarchive/schema.StatsSeries" }

File diff suppressed because it is too large


@ -1,9 +1,17 @@
package model
// Go look at `gqlgen.yml` and the schema package for other non-generated models.
type Cluster struct {
Name string `json:"name"`
MetricConfig []*MetricConfig `json:"metricConfig"`
FilterRanges *FilterRanges `json:"filterRanges"`
Partitions []*Partition `json:"partitions"`
type JobTag struct {
ID string `json:"id" db:"id"`
TagType string `json:"tagType" db:"tag_type"`
TagName string `json:"tagName" db:"tag_name"`
// NOT part of the API:
MetricDataRepository *MetricDataRepository `json:"metricDataRepository"`
}
type MetricDataRepository struct {
Kind string `json:"kind"`
Url string `json:"url"`
Token string `json:"token"`
}


@ -11,17 +11,10 @@ import (
"github.com/ClusterCockpit/cc-jobarchive/schema"
)
type Cluster struct {
ClusterID string `json:"clusterID"`
ProcessorType string `json:"processorType"`
SocketsPerNode int `json:"socketsPerNode"`
CoresPerSocket int `json:"coresPerSocket"`
ThreadsPerCore int `json:"threadsPerCore"`
FlopRateScalar int `json:"flopRateScalar"`
FlopRateSimd int `json:"flopRateSimd"`
MemoryBandwidth int `json:"memoryBandwidth"`
MetricConfig []*MetricConfig `json:"metricConfig"`
FilterRanges *FilterRanges `json:"filterRanges"`
type Accelerator struct {
ID string `json:"id"`
Type string `json:"type"`
Model string `json:"model"`
}
type FilterRanges struct {
@ -50,41 +43,20 @@ type IntRangeOutput struct {
To int `json:"to"`
}
type Job struct {
ID string `json:"id"`
JobID string `json:"jobId"`
UserID string `json:"userId"`
ProjectID string `json:"projectId"`
ClusterID string `json:"clusterId"`
StartTime time.Time `json:"startTime"`
Duration int `json:"duration"`
NumNodes int `json:"numNodes"`
Nodes []string `json:"nodes"`
HasProfile bool `json:"hasProfile"`
State JobState `json:"state"`
Tags []*JobTag `json:"tags"`
LoadAvg *float64 `json:"loadAvg"`
MemUsedMax *float64 `json:"memUsedMax"`
FlopsAnyAvg *float64 `json:"flopsAnyAvg"`
MemBwAvg *float64 `json:"memBwAvg"`
NetBwAvg *float64 `json:"netBwAvg"`
FileBwAvg *float64 `json:"fileBwAvg"`
}
type JobFilter struct {
Tags []string `json:"tags"`
JobID *StringInput `json:"jobId"`
UserID *StringInput `json:"userId"`
ProjectID *StringInput `json:"projectId"`
ClusterID *StringInput `json:"clusterId"`
Duration *IntRange `json:"duration"`
NumNodes *IntRange `json:"numNodes"`
StartTime *TimeRange `json:"startTime"`
IsRunning *bool `json:"isRunning"`
FlopsAnyAvg *FloatRange `json:"flopsAnyAvg"`
MemBwAvg *FloatRange `json:"memBwAvg"`
LoadAvg *FloatRange `json:"loadAvg"`
MemUsedMax *FloatRange `json:"memUsedMax"`
Tags []string `json:"tags"`
JobID *StringInput `json:"jobId"`
User *StringInput `json:"user"`
Project *StringInput `json:"project"`
Cluster *StringInput `json:"cluster"`
Duration *IntRange `json:"duration"`
NumNodes *IntRange `json:"numNodes"`
StartTime *TimeRange `json:"startTime"`
State []schema.JobState `json:"state"`
FlopsAnyAvg *FloatRange `json:"flopsAnyAvg"`
MemBwAvg *FloatRange `json:"memBwAvg"`
LoadAvg *FloatRange `json:"loadAvg"`
MemUsedMax *FloatRange `json:"memUsedMax"`
}
type JobMetricWithName struct {
@ -93,10 +65,10 @@ type JobMetricWithName struct {
}
type JobResultList struct {
Items []*Job `json:"items"`
Offset *int `json:"offset"`
Limit *int `json:"limit"`
Count *int `json:"count"`
Items []*schema.Job `json:"items"`
Offset *int `json:"offset"`
Limit *int `json:"limit"`
Count *int `json:"count"`
}
type JobsStatistics struct {
@ -110,13 +82,14 @@ type JobsStatistics struct {
}
type MetricConfig struct {
Name string `json:"name"`
Unit string `json:"unit"`
Sampletime int `json:"sampletime"`
Peak int `json:"peak"`
Normal int `json:"normal"`
Caution int `json:"caution"`
Alert int `json:"alert"`
Name string `json:"name"`
Unit string `json:"unit"`
Scope schema.MetricScope `json:"scope"`
Timestep int `json:"timestep"`
Peak float64 `json:"peak"`
Normal float64 `json:"normal"`
Caution float64 `json:"caution"`
Alert float64 `json:"alert"`
}
type MetricFootprints struct {
@ -124,6 +97,16 @@ type MetricFootprints struct {
Footprints []schema.Float `json:"footprints"`
}
type NodeMetric struct {
Name string `json:"name"`
Data []schema.Float `json:"data"`
}
type NodeMetrics struct {
ID string `json:"id"`
Metrics []*NodeMetric `json:"metrics"`
}
type OrderByInput struct {
Field string `json:"field"`
Order SortDirectionEnum `json:"order"`
@ -134,6 +117,18 @@ type PageRequest struct {
Page int `json:"page"`
}
type Partition struct {
Name string `json:"name"`
ProcessorType string `json:"processorType"`
SocketsPerNode int `json:"socketsPerNode"`
CoresPerSocket int `json:"coresPerSocket"`
ThreadsPerCore int `json:"threadsPerCore"`
FlopRateScalar int `json:"flopRateScalar"`
FlopRateSimd int `json:"flopRateSimd"`
MemoryBandwidth int `json:"memoryBandwidth"`
Topology *Topology `json:"topology"`
}
type StringInput struct {
Eq *string `json:"eq"`
Contains *string `json:"contains"`
@ -151,6 +146,15 @@ type TimeRangeOutput struct {
To time.Time `json:"to"`
}
type Topology struct {
Node []int `json:"node"`
Socket [][]int `json:"socket"`
MemoryDomain [][]int `json:"memoryDomain"`
Die [][]int `json:"die"`
Core [][]int `json:"core"`
Accelerators []*Accelerator `json:"accelerators"`
}
type Aggregate string
const (
@ -194,47 +198,6 @@ func (e Aggregate) MarshalGQL(w io.Writer) {
fmt.Fprint(w, strconv.Quote(e.String()))
}
type JobState string
const (
JobStateRunning JobState = "running"
JobStateCompleted JobState = "completed"
)
var AllJobState = []JobState{
JobStateRunning,
JobStateCompleted,
}
func (e JobState) IsValid() bool {
switch e {
case JobStateRunning, JobStateCompleted:
return true
}
return false
}
func (e JobState) String() string {
return string(e)
}
func (e *JobState) UnmarshalGQL(v interface{}) error {
str, ok := v.(string)
if !ok {
return fmt.Errorf("enums must be strings")
}
*e = JobState(str)
if !e.IsValid() {
return fmt.Errorf("%s is not a valid JobState", str)
}
return nil
}
func (e JobState) MarshalGQL(w io.Writer) {
fmt.Fprint(w, strconv.Quote(e.String()))
}
type SortDirectionEnum string
const (


@ -1,12 +1,15 @@
package graph
import (
"context"
"errors"
"fmt"
"regexp"
"strings"
"github.com/ClusterCockpit/cc-jobarchive/auth"
"github.com/ClusterCockpit/cc-jobarchive/graph/model"
"github.com/ClusterCockpit/cc-jobarchive/schema"
sq "github.com/Masterminds/squirrel"
"github.com/jmoiron/sqlx"
)
@ -19,31 +22,10 @@ type Resolver struct {
DB *sqlx.DB
}
var JobTableCols []string = []string{"id", "job_id", "user_id", "project_id", "cluster_id", "start_time", "duration", "job_state", "num_nodes", "node_list", "flops_any_avg", "mem_bw_avg", "net_bw_avg", "file_bw_avg", "load_avg"}
type Scannable interface {
Scan(dest ...interface{}) error
}
// Helper function for scanning jobs with the `jobTableCols` columns selected.
func ScanJob(row Scannable) (*model.Job, error) {
job := &model.Job{HasProfile: true}
var nodeList string
if err := row.Scan(
&job.ID, &job.JobID, &job.UserID, &job.ProjectID, &job.ClusterID,
&job.StartTime, &job.Duration, &job.State, &job.NumNodes, &nodeList,
&job.FlopsAnyAvg, &job.MemBwAvg, &job.NetBwAvg, &job.FileBwAvg, &job.LoadAvg); err != nil {
return nil, err
}
job.Nodes = strings.Split(nodeList, ",")
return job, nil
}
// Helper function for the `jobs` GraphQL query. It is also used elsewhere when a list of jobs is needed.
func (r *Resolver) queryJobs(filters []*model.JobFilter, page *model.PageRequest, order *model.OrderByInput) ([]*model.Job, int, error) {
query := sq.Select(JobTableCols...).From("job")
func (r *Resolver) queryJobs(ctx context.Context, filters []*model.JobFilter, page *model.PageRequest, order *model.OrderByInput) ([]*schema.Job, int, error) {
query := sq.Select(schema.JobColumns...).From("job")
query = securityCheck(ctx, query)
if order != nil {
field := toSnakeCase(order.Field)
@ -67,55 +49,68 @@ func (r *Resolver) queryJobs(filters []*model.JobFilter, page *model.PageRequest
query = buildWhereClause(f, query)
}
rows, err := query.RunWith(r.DB).Query()
sql, args, err := query.ToSql()
if err != nil {
return nil, 0, err
}
defer rows.Close()
jobs := make([]*model.Job, 0, 50)
rows, err := r.DB.Queryx(sql, args...)
if err != nil {
return nil, 0, err
}
jobs := make([]*schema.Job, 0, 50)
for rows.Next() {
job, err := ScanJob(rows)
job, err := schema.ScanJob(rows)
if err != nil {
return nil, 0, err
}
jobs = append(jobs, job)
}
// count all jobs:
query = sq.Select("count(*)").From("job")
for _, f := range filters {
query = buildWhereClause(f, query)
}
rows, err = query.RunWith(r.DB).Query()
if err != nil {
return nil, 0, err
}
defer rows.Close()
var count int
rows.Next()
if err := rows.Scan(&count); err != nil {
if err := query.RunWith(r.DB).Scan(&count); err != nil {
return nil, 0, err
}
return jobs, count, nil
}
// Build a sq.SelectBuilder out of a model.JobFilter.
func securityCheck(ctx context.Context, query sq.SelectBuilder) sq.SelectBuilder {
val := ctx.Value(auth.ContextUserKey)
if val == nil {
return query
}
user := val.(*auth.User)
if user.IsAdmin {
return query
}
return query.Where("job.user_id = ?", user.Username)
}
// Build a sq.SelectBuilder out of a model.JobFilter.
func buildWhereClause(filter *model.JobFilter, query sq.SelectBuilder) sq.SelectBuilder {
if filter.Tags != nil {
query = query.Join("jobtag ON jobtag.job_id = job.id").Where("jobtag.tag_id IN ?", filter.Tags)
query = query.Join("jobtag ON jobtag.job_id = job.id").Where(sq.Eq{"jobtag.tag_id": filter.Tags})
}
if filter.JobID != nil {
query = buildStringCondition("job.job_id", filter.JobID, query)
}
if filter.UserID != nil {
query = buildStringCondition("job.user_id", filter.UserID, query)
if filter.User != nil {
query = buildStringCondition("job.user", filter.User, query)
}
if filter.ProjectID != nil {
query = buildStringCondition("job.project_id", filter.ProjectID, query)
if filter.Project != nil {
query = buildStringCondition("job.project", filter.Project, query)
}
if filter.ClusterID != nil {
query = buildStringCondition("job.cluster_id", filter.ClusterID, query)
if filter.Cluster != nil {
query = buildStringCondition("job.cluster", filter.Cluster, query)
}
if filter.StartTime != nil {
query = buildTimeCondition("job.start_time", filter.StartTime, query)
@ -123,12 +118,13 @@ func buildWhereClause(filter *model.JobFilter, query sq.SelectBuilder) sq.Select
if filter.Duration != nil {
query = buildIntCondition("job.duration", filter.Duration, query)
}
if filter.IsRunning != nil {
if *filter.IsRunning {
query = query.Where("job.job_state = 'running'")
} else {
query = query.Where("job.job_state = 'completed'")
if filter.State != nil {
states := make([]string, len(filter.State))
for i, val := range filter.State {
states[i] = string(val)
}
query = query.Where(sq.Eq{"job.job_state": states})
}
if filter.NumNodes != nil {
query = buildIntCondition("job.num_nodes", filter.NumNodes, query)
@ -173,20 +169,23 @@ func buildStringCondition(field string, cond *model.StringInput, query sq.Select
return query.Where(field+" = ?", *cond.Eq)
}
if cond.StartsWith != nil {
return query.Where(field+"LIKE ?", fmt.Sprint(*cond.StartsWith, "%"))
return query.Where(field+" LIKE ?", fmt.Sprint(*cond.StartsWith, "%"))
}
if cond.EndsWith != nil {
return query.Where(field+"LIKE ?", fmt.Sprint("%", *cond.StartsWith))
return query.Where(field+" LIKE ?", fmt.Sprint("%", *cond.EndsWith))
}
if cond.Contains != nil {
return query.Where(field+"LIKE ?", fmt.Sprint("%", *cond.StartsWith, "%"))
return query.Where(field+" LIKE ?", fmt.Sprint("%", *cond.Contains, "%"))
}
return query
}
var matchFirstCap = regexp.MustCompile("(.)([A-Z][a-z]+)")
var matchAllCap = regexp.MustCompile("([a-z0-9])([A-Z])")
func toSnakeCase(str string) string {
matchFirstCap := regexp.MustCompile("(.)([A-Z][a-z]+)")
matchAllCap := regexp.MustCompile("([a-z0-9])([A-Z])")
str = strings.ReplaceAll(str, "'", "")
str = strings.ReplaceAll(str, "\\", "")
snake := matchFirstCap.ReplaceAllString(str, "${1}_${2}")
snake = matchAllCap.ReplaceAllString(snake, "${1}_${2}")
return strings.ToLower(snake)


@ -1,40 +1,38 @@
scalar Time
scalar NullableFloat
scalar MetricScope
scalar JobState
type Job {
id: ID! # Database ID, unique
jobId: String! # ID given to the job by the cluster scheduler
userId: String! # Username
projectId: String! # Project
clusterId: String! # Name of the cluster this job was running on
startTime: Time! # RFC3339 formated string
duration: Int! # For running jobs, the time it has already run
numNodes: Int! # Number of nodes this job was running on
nodes: [String!]! # List of hostnames
hasProfile: Boolean! # TODO: Could be removed?
state: JobState! # State of the job
tags: [JobTag!]! # List of tags this job has
# Will be null for running jobs.
loadAvg: Float
memUsedMax: Float
flopsAnyAvg: Float
memBwAvg: Float
netBwAvg: Float
fileBwAvg: Float
}
# TODO: Extend by more possible states?
enum JobState {
running
completed
}
type JobTag {
id: ID! # Database ID, unique
tagType: String! # Type
tagName: String! # Name
id: ID!
jobId: Int!
user: String!
project: String!
cluster: String!
startTime: Time!
duration: Int!
numNodes: Int!
numHWThreads: Int!
numAcc: Int!
SMT: Int!
exclusive: Int!
partition: String!
arrayJobId: Int!
monitoringStatus: Int!
state: JobState!
tags: [Tag!]!
resources: [Resource!]!
}
type Cluster {
clusterID: String!
name: String!
metricConfig: [MetricConfig!]!
filterRanges: FilterRanges!
partitions: [Partition!]!
}
type Partition {
name: String!
processorType: String!
socketsPerNode: Int!
coresPerSocket: Int!
@ -42,37 +40,46 @@ type Cluster {
flopRateScalar: Int!
flopRateSimd: Int!
memoryBandwidth: Int!
metricConfig: [MetricConfig!]!
filterRanges: FilterRanges!
topology: Topology!
}
type Topology {
node: [Int!]
socket: [[Int!]!]
memoryDomain: [[Int!]!]
die: [[Int!]!]
core: [[Int!]!]
accelerators: [Accelerator!]
}
type Accelerator {
id: String!
type: String!
model: String!
}
type MetricConfig {
name: String!
unit: String!
sampletime: Int!
peak: Int!
normal: Int!
caution: Int!
alert: Int!
}
type JobMetric {
name: String!
unit: String!
scope: JobMetricScope!
scope: MetricScope!
timestep: Int!
series: [JobMetricSeries!]!
peak: Float!
normal: Float!
caution: Float!
alert: Float!
}
type JobMetricSeries {
node_id: String!
statistics: JobMetricStatistics
data: [NullableFloat!]!
type Tag {
id: ID!
type: String!
name: String!
}
type JobMetricStatistics {
avg: Float!
min: Float!
max: Float!
type Resource {
hostname: String!
hwthreads: [Int!]
accelerators: [Int!]
configuration: String
}
type JobMetricWithName {
@ -80,6 +87,33 @@ type JobMetricWithName {
metric: JobMetric!
}
type JobMetric {
unit: String!
scope: MetricScope!
timestep: Int!
series: [Series!]
statisticsSeries: StatsSeries
}
type Series {
hostname: String!
id: Int
statistics: MetricStatistics
data: [NullableFloat!]!
}
type MetricStatistics {
avg: NullableFloat!
min: NullableFloat!
max: NullableFloat!
}
type StatsSeries {
mean: [NullableFloat!]!
min: [NullableFloat!]!
max: [NullableFloat!]!
}
type MetricFootprints {
name: String!
footprints: [NullableFloat!]!
@ -87,38 +121,43 @@ type MetricFootprints {
enum Aggregate { USER, PROJECT, CLUSTER }
type NodeMetric {
name: String!
data: [NullableFloat!]!
}
type NodeMetrics {
id: String!
metrics: [NodeMetric!]!
}
type Query {
clusters: [Cluster!]! # List of all clusters
tags: [JobTag!]! # List of all tags
tags: [Tag!]! # List of all tags
job(id: ID!): Job
jobMetrics(id: ID!, metrics: [String!]): [JobMetricWithName!]!
jobMetrics(id: ID!, metrics: [String!], scopes: [MetricScope!]): [JobMetricWithName!]!
jobsFootprints(filter: [JobFilter!], metrics: [String!]!): [MetricFootprints]!
jobs(filter: [JobFilter!], page: PageRequest, order: OrderByInput): JobResultList!
jobsStatistics(filter: [JobFilter!], groupBy: Aggregate): [JobsStatistics!]!
rooflineHeatmap(filter: [JobFilter!]!, rows: Int!, cols: Int!, minX: Float!, minY: Float!, maxX: Float!, maxY: Float!): [[Float!]!]!
nodeMetrics(cluster: ID!, nodes: [String!], metrics: [String!], from: Time!, to: Time!): [NodeMetrics!]!
}
type Mutation {
createTag(type: String!, name: String!): JobTag!
createTag(type: String!, name: String!): Tag!
deleteTag(id: ID!): ID!
addTagsToJob(job: ID!, tagIds: [ID!]!): [JobTag!]!
removeTagsFromJob(job: ID!, tagIds: [ID!]!): [JobTag!]!
addTagsToJob(job: ID!, tagIds: [ID!]!): [Tag!]!
removeTagsFromJob(job: ID!, tagIds: [ID!]!): [Tag!]!
updateConfiguration(name: String!, value: String!): String
}
type IntRangeOutput {
from: Int!
to: Int!
}
type TimeRangeOutput {
from: Time!
to: Time!
}
type IntRangeOutput { from: Int!, to: Int! }
type TimeRangeOutput { from: Time!, to: Time! }
type FilterRanges {
duration: IntRangeOutput!
@ -129,13 +168,13 @@ type FilterRanges {
input JobFilter {
tags: [ID!]
jobId: StringInput
userId: StringInput
projectId: StringInput
clusterId: StringInput
user: StringInput
project: StringInput
cluster: StringInput
duration: IntRange
numNodes: IntRange
startTime: TimeRange
isRunning: Boolean
state: [JobState!]
flopsAnyAvg: FloatRange
memBwAvg: FloatRange
loadAvg: FloatRange
@ -159,20 +198,9 @@ input StringInput {
endsWith: String
}
input IntRange {
from: Int!
to: Int!
}
input FloatRange {
from: Float!
to: Float!
}
input TimeRange {
from: Time
to: Time
}
input IntRange { from: Int!, to: Int! }
input FloatRange { from: Float!, to: Float! }
input TimeRange { from: Time, to: Time }
type JobResultList {
items: [Job!]!
@ -200,7 +228,3 @@ input PageRequest {
itemsPerPage: Int!
page: Int!
}
scalar Time
scalar NullableFloat
scalar JobMetricScope
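To make the reworked query surface concrete, a minimal client-side sketch (not part of the commit) follows. The job ID, the `/query` endpoint path, the JWT environment variable and the string form of the MetricScope scalar are assumptions; only the selection set follows the schema above.

```go
package main

import (
	"bytes"
	"encoding/json"
	"fmt"
	"net/http"
	"os"
)

func main() {
	// Hypothetical jobMetrics query for database ID "42" at node scope.
	query := `query {
	  jobMetrics(id: "42", metrics: ["flops_any", "mem_bw"], scopes: ["node"]) {
	    name
	    metric { unit scope timestep series { hostname data } }
	  }
	}`

	body, err := json.Marshal(map[string]string{"query": query})
	if err != nil {
		panic(err)
	}

	req, err := http.NewRequest(http.MethodPost, "http://localhost:8080/query", bytes.NewReader(body))
	if err != nil {
		panic(err)
	}
	req.Header.Set("Content-Type", "application/json")
	req.Header.Set("Authorization", "Bearer "+os.Getenv("JWT")) // assumption: JWT-protected endpoint

	res, err := http.DefaultClient.Do(req)
	if err != nil {
		panic(err)
	}
	defer res.Body.Close()
	fmt.Println(res.Status)
}
```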

View File

@ -5,42 +5,41 @@ package graph
import (
"context"
"errors"
"fmt"
"strconv"
"time"
"github.com/ClusterCockpit/cc-jobarchive/auth"
"github.com/ClusterCockpit/cc-jobarchive/config"
"github.com/ClusterCockpit/cc-jobarchive/graph/generated"
"github.com/ClusterCockpit/cc-jobarchive/graph/model"
"github.com/ClusterCockpit/cc-jobarchive/metricdata"
"github.com/ClusterCockpit/cc-jobarchive/schema"
sq "github.com/Masterminds/squirrel"
)
func (r *jobResolver) Tags(ctx context.Context, obj *model.Job) ([]*model.JobTag, error) {
func (r *jobResolver) Tags(ctx context.Context, obj *schema.Job) ([]*schema.Tag, error) {
query := sq.
Select("tag.id", "tag.tag_type", "tag.tag_name").
From("tag").
Join("jobtag ON jobtag.tag_id = tag.id").
Where("jobtag.job_id = ?", obj.ID)
rows, err := query.RunWith(r.DB).Query()
sql, args, err := query.ToSql()
if err != nil {
return nil, err
}
defer rows.Close()
tags := make([]*model.JobTag, 0)
for rows.Next() {
var tag model.JobTag
if err := rows.Scan(&tag.ID, &tag.TagType, &tag.TagName); err != nil {
return nil, err
}
tags = append(tags, &tag)
tags := make([]*schema.Tag, 0)
if err := r.DB.Select(&tags, sql, args...); err != nil {
return nil, err
}
return tags, nil
}
func (r *mutationResolver) CreateTag(ctx context.Context, typeArg string, name string) (*model.JobTag, error) {
func (r *mutationResolver) CreateTag(ctx context.Context, typeArg string, name string) (*schema.Tag, error) {
res, err := r.DB.Exec("INSERT INTO tag (tag_type, tag_name) VALUES ($1, $2)", typeArg, name)
if err != nil {
return nil, err
@ -51,7 +50,7 @@ func (r *mutationResolver) CreateTag(ctx context.Context, typeArg string, name s
return nil, err
}
return &model.JobTag{ID: strconv.FormatInt(id, 10), TagType: typeArg, TagName: name}, nil
return &schema.Tag{ID: id, Type: typeArg, Name: name}, nil
}
func (r *mutationResolver) DeleteTag(ctx context.Context, id string) (string, error) {
@ -59,7 +58,7 @@ func (r *mutationResolver) DeleteTag(ctx context.Context, id string) (string, er
panic(fmt.Errorf("not implemented"))
}
func (r *mutationResolver) AddTagsToJob(ctx context.Context, job string, tagIds []string) ([]*model.JobTag, error) {
func (r *mutationResolver) AddTagsToJob(ctx context.Context, job string, tagIds []string) ([]*schema.Tag, error) {
jid, err := strconv.Atoi(job)
if err != nil {
return nil, err
@ -76,7 +75,9 @@ func (r *mutationResolver) AddTagsToJob(ctx context.Context, job string, tagIds
}
}
tags, err := r.Job().Tags(ctx, &model.Job{ID: job})
dummyJob := schema.Job{}
dummyJob.ID = int64(jid)
tags, err := r.Job().Tags(ctx, &dummyJob)
if err != nil {
return nil, err
}
@ -89,7 +90,7 @@ func (r *mutationResolver) AddTagsToJob(ctx context.Context, job string, tagIds
return tags, metricdata.UpdateTags(jobObj, tags)
}
func (r *mutationResolver) RemoveTagsFromJob(ctx context.Context, job string, tagIds []string) ([]*model.JobTag, error) {
func (r *mutationResolver) RemoveTagsFromJob(ctx context.Context, job string, tagIds []string) ([]*schema.Tag, error) {
jid, err := strconv.Atoi(job)
if err != nil {
return nil, err
@ -106,7 +107,9 @@ func (r *mutationResolver) RemoveTagsFromJob(ctx context.Context, job string, ta
}
}
tags, err := r.Job().Tags(ctx, &model.Job{ID: job})
dummyJob := schema.Job{}
dummyJob.ID = int64(jid)
tags, err := r.Job().Tags(ctx, &dummyJob)
if err != nil {
return nil, err
}
@ -131,46 +134,53 @@ func (r *queryResolver) Clusters(ctx context.Context) ([]*model.Cluster, error)
return config.Clusters, nil
}
func (r *queryResolver) Tags(ctx context.Context) ([]*model.JobTag, error) {
rows, err := sq.Select("id", "tag_type", "tag_name").From("tag").RunWith(r.DB).Query()
func (r *queryResolver) Tags(ctx context.Context) ([]*schema.Tag, error) {
sql, args, err := sq.Select("id", "tag_type", "tag_name").From("tag").ToSql()
if err != nil {
return nil, err
}
defer rows.Close()
tags := make([]*model.JobTag, 0)
for rows.Next() {
var tag model.JobTag
if err := rows.Scan(&tag.ID, &tag.TagType, &tag.TagName); err != nil {
return nil, err
}
tags = append(tags, &tag)
tags := make([]*schema.Tag, 0)
if err := r.DB.Select(&tags, sql, args...); err != nil {
return nil, err
}
return tags, nil
}
func (r *queryResolver) Job(ctx context.Context, id string) (*model.Job, error) {
return ScanJob(sq.Select(JobTableCols...).From("job").Where("job.id = ?", id).RunWith(r.DB).QueryRow())
func (r *queryResolver) Job(ctx context.Context, id string) (*schema.Job, error) {
query := sq.Select(schema.JobColumns...).From("job").Where("job.id = ?", id)
query = securityCheck(ctx, query)
sql, args, err := query.ToSql()
if err != nil {
return nil, err
}
return schema.ScanJob(r.DB.QueryRowx(sql, args...))
}
func (r *queryResolver) JobMetrics(ctx context.Context, id string, metrics []string) ([]*model.JobMetricWithName, error) {
func (r *queryResolver) JobMetrics(ctx context.Context, id string, metrics []string, scopes []schema.MetricScope) ([]*model.JobMetricWithName, error) {
job, err := r.Query().Job(ctx, id)
if err != nil {
return nil, err
}
data, err := metricdata.LoadData(job, metrics, ctx)
data, err := metricdata.LoadData(job, metrics, scopes, ctx)
if err != nil {
return nil, err
}
res := []*model.JobMetricWithName{}
for name, md := range data {
res = append(res, &model.JobMetricWithName{
Name: name,
Metric: md,
})
for scope, metric := range md {
if metric.Scope != schema.MetricScope(scope) {
panic("WTF?")
}
res = append(res, &model.JobMetricWithName{
Name: name,
Metric: metric,
})
}
}
return res, err
@ -181,7 +191,7 @@ func (r *queryResolver) JobsFootprints(ctx context.Context, filter []*model.JobF
}
func (r *queryResolver) Jobs(ctx context.Context, filter []*model.JobFilter, page *model.PageRequest, order *model.OrderByInput) (*model.JobResultList, error) {
jobs, count, err := r.queryJobs(filter, page, order)
jobs, count, err := r.queryJobs(ctx, filter, page, order)
if err != nil {
return nil, err
}
@ -197,6 +207,36 @@ func (r *queryResolver) RooflineHeatmap(ctx context.Context, filter []*model.Job
return r.rooflineHeatmap(ctx, filter, rows, cols, minX, minY, maxX, maxY)
}
func (r *queryResolver) NodeMetrics(ctx context.Context, cluster string, nodes []string, metrics []string, from time.Time, to time.Time) ([]*model.NodeMetrics, error) {
user := auth.GetUser(ctx)
if user != nil && !user.IsAdmin {
return nil, errors.New("you need to be an administrator for this query")
}
data, err := metricdata.LoadNodeData(cluster, metrics, nodes, from.Unix(), to.Unix(), ctx)
if err != nil {
return nil, err
}
res := make([]*model.NodeMetrics, 0, len(data))
for node, metrics := range data {
nodeMetrics := make([]*model.NodeMetric, 0, len(metrics))
for metric, data := range metrics {
nodeMetrics = append(nodeMetrics, &model.NodeMetric{
Name: metric,
Data: data,
})
}
res = append(res, &model.NodeMetrics{
ID: node,
Metrics: nodeMetrics,
})
}
return res, nil
}
// Job returns generated.JobResolver implementation.
func (r *Resolver) Job() generated.JobResolver { return &jobResolver{r} }

View File

@ -3,6 +3,7 @@ package graph
import (
"context"
"database/sql"
"errors"
"fmt"
"math"
@ -16,9 +17,9 @@ import (
// GraphQL validation should make sure that no unknown values can be specified.
var groupBy2column = map[model.Aggregate]string{
model.AggregateUser: "job.user_id",
model.AggregateProject: "job.project_id",
model.AggregateCluster: "job.cluster_id",
model.AggregateUser: "job.user",
model.AggregateProject: "job.project",
model.AggregateCluster: "job.cluster",
}
// Helper function for the jobsStatistics GraphQL query placed here so that schema.resolvers.go is not too full.
@ -28,52 +29,59 @@ func (r *queryResolver) jobsStatistics(ctx context.Context, filter []*model.JobF
// `socketsPerNode` and `coresPerSocket` can differ from cluster to cluster, so we need to explicitly loop over those.
for _, cluster := range config.Clusters {
corehoursCol := fmt.Sprintf("SUM(job.duration * job.num_nodes * %d * %d) / 3600", cluster.SocketsPerNode, cluster.CoresPerSocket)
var query sq.SelectBuilder
if groupBy == nil {
query = sq.Select(
"''",
"COUNT(job.id)",
"SUM(job.duration) / 3600",
corehoursCol,
).From("job").Where("job.cluster_id = ?", cluster.ClusterID)
} else {
col := groupBy2column[*groupBy]
query = sq.Select(
col,
"COUNT(job.id)",
"SUM(job.duration) / 3600",
corehoursCol,
).From("job").Where("job.cluster_id = ?", cluster.ClusterID).GroupBy(col)
}
for _, partition := range cluster.Partitions {
corehoursCol := fmt.Sprintf("SUM(job.duration * job.num_nodes * %d * %d) / 3600", partition.SocketsPerNode, partition.CoresPerSocket)
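// Example: a 3600 s job on 2 nodes of a partition with 2 sockets per node and
// 16 cores per socket contributes 3600 * 2 * 2 * 16 / 3600 = 64 core-hours.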
var query sq.SelectBuilder
if groupBy == nil {
query = sq.Select(
"''",
"COUNT(job.id)",
"SUM(job.duration) / 3600",
corehoursCol,
).From("job")
} else {
col := groupBy2column[*groupBy]
query = sq.Select(
col,
"COUNT(job.id)",
"SUM(job.duration) / 3600",
corehoursCol,
).From("job").GroupBy(col)
}
for _, f := range filter {
query = buildWhereClause(f, query)
}
query = query.
Where("job.cluster = ?", cluster.Name).
Where("job.partition = ?", partition.Name)
rows, err := query.RunWith(r.DB).Query()
if err != nil {
return nil, err
}
query = securityCheck(ctx, query)
for _, f := range filter {
query = buildWhereClause(f, query)
}
for rows.Next() {
var id sql.NullString
var jobs, walltime, corehours sql.NullInt64
if err := rows.Scan(&id, &jobs, &walltime, &corehours); err != nil {
rows, err := query.RunWith(r.DB).Query()
if err != nil {
return nil, err
}
if id.Valid {
if s, ok := stats[id.String]; ok {
s.TotalJobs += int(jobs.Int64)
s.TotalWalltime += int(walltime.Int64)
s.TotalCoreHours += int(corehours.Int64)
} else {
stats[id.String] = &model.JobsStatistics{
ID: id.String,
TotalJobs: int(jobs.Int64),
TotalWalltime: int(walltime.Int64),
TotalCoreHours: int(corehours.Int64),
for rows.Next() {
var id sql.NullString
var jobs, walltime, corehours sql.NullInt64
if err := rows.Scan(&id, &jobs, &walltime, &corehours); err != nil {
return nil, err
}
if id.Valid {
if s, ok := stats[id.String]; ok {
s.TotalJobs += int(jobs.Int64)
s.TotalWalltime += int(walltime.Int64)
s.TotalCoreHours += int(corehours.Int64)
} else {
stats[id.String] = &model.JobsStatistics{
ID: id.String,
TotalJobs: int(jobs.Int64),
TotalWalltime: int(walltime.Int64),
TotalCoreHours: int(corehours.Int64),
}
}
}
}
@ -82,6 +90,7 @@ func (r *queryResolver) jobsStatistics(ctx context.Context, filter []*model.JobF
if groupBy == nil {
query := sq.Select("COUNT(job.id)").From("job").Where("job.duration < 120")
query = securityCheck(ctx, query)
for _, f := range filter {
query = buildWhereClause(f, query)
}
@ -91,6 +100,7 @@ func (r *queryResolver) jobsStatistics(ctx context.Context, filter []*model.JobF
} else {
col := groupBy2column[*groupBy]
query := sq.Select(col, "COUNT(job.id)").From("job").Where("job.duration < 120")
query = securityCheck(ctx, query)
for _, f := range filter {
query = buildWhereClause(f, query)
}
@ -133,12 +143,12 @@ func (r *queryResolver) jobsStatistics(ctx context.Context, filter []*model.JobF
if histogramsNeeded {
var err error
stat.HistWalltime, err = r.jobsStatisticsHistogram("ROUND(job.duration / 3600) as value", filter, id, col)
stat.HistWalltime, err = r.jobsStatisticsHistogram(ctx, "ROUND(job.duration / 3600) as value", filter, id, col)
if err != nil {
return nil, err
}
stat.HistNumNodes, err = r.jobsStatisticsHistogram("job.num_nodes as value", filter, id, col)
stat.HistNumNodes, err = r.jobsStatisticsHistogram(ctx, "job.num_nodes as value", filter, id, col)
if err != nil {
return nil, err
}
@ -150,8 +160,9 @@ func (r *queryResolver) jobsStatistics(ctx context.Context, filter []*model.JobF
// `value` must be the column grouped by, but renamed to "value". `id` and `col` can optionally be used
// to add a condition to the query of the kind "<col> = <id>".
func (r *queryResolver) jobsStatisticsHistogram(value string, filters []*model.JobFilter, id, col string) ([]*model.HistoPoint, error) {
func (r *queryResolver) jobsStatisticsHistogram(ctx context.Context, value string, filters []*model.JobFilter, id, col string) ([]*model.HistoPoint, error) {
query := sq.Select(value, "COUNT(job.id) AS count").From("job")
query = securityCheck(ctx, query)
for _, f := range filters {
query = buildWhereClause(f, query)
}
@ -179,7 +190,7 @@ func (r *queryResolver) jobsStatisticsHistogram(value string, filters []*model.J
// Helper function for the rooflineHeatmap GraphQL query placed here so that schema.resolvers.go is not too full.
func (r *Resolver) rooflineHeatmap(ctx context.Context, filter []*model.JobFilter, rows int, cols int, minX float64, minY float64, maxX float64, maxY float64) ([][]float64, error) {
jobs, count, err := r.queryJobs(filter, &model.PageRequest{Page: 1, ItemsPerPage: 501}, nil)
jobs, count, err := r.queryJobs(ctx, filter, &model.PageRequest{Page: 1, ItemsPerPage: 501}, nil)
if err != nil {
return nil, err
}
@ -195,14 +206,21 @@ func (r *Resolver) rooflineHeatmap(ctx context.Context, filter []*model.JobFilte
}
for _, job := range jobs {
jobdata, err := metricdata.LoadData(job, []string{"flops_any", "mem_bw"}, ctx)
jobdata, err := metricdata.LoadData(job, []string{"flops_any", "mem_bw"}, []schema.MetricScope{schema.MetricScopeNode}, ctx)
if err != nil {
return nil, err
}
flops, membw := jobdata["flops_any"], jobdata["mem_bw"]
if flops == nil && membw == nil {
return nil, fmt.Errorf("'flops_any' or 'mem_bw' missing for job %s", job.ID)
flops_, membw_ := jobdata["flops_any"], jobdata["mem_bw"]
if flops_ == nil && membw_ == nil {
return nil, fmt.Errorf("'flops_any' or 'mem_bw' missing for job %d", job.ID)
}
flops, ok1 := flops_["node"]
membw, ok2 := membw_["node"]
if !ok1 || !ok2 {
// TODO/FIXME:
return nil, errors.New("todo: rooflineHeatmap() query not implemented for where flops_any or mem_bw not available at 'node' level")
}
for n := 0; n < len(flops.Series); n++ {
@ -232,7 +250,7 @@ func (r *Resolver) rooflineHeatmap(ctx context.Context, filter []*model.JobFilte
// Helper function for the jobsFootprints GraphQL query placed here so that schema.resolvers.go is not too full.
func (r *queryResolver) jobsFootprints(ctx context.Context, filter []*model.JobFilter, metrics []string) ([]*model.MetricFootprints, error) {
jobs, count, err := r.queryJobs(filter, &model.PageRequest{Page: 1, ItemsPerPage: 501}, nil)
jobs, count, err := r.queryJobs(ctx, filter, &model.PageRequest{Page: 1, ItemsPerPage: 501}, nil)
if err != nil {
return nil, err
}

View File

@ -2,18 +2,66 @@ package main
import (
"bufio"
"database/sql"
"encoding/json"
"fmt"
"log"
"os"
"path/filepath"
"strings"
"time"
"github.com/ClusterCockpit/cc-jobarchive/schema"
"github.com/jmoiron/sqlx"
)
const JOBS_DB_SCHEMA string = `
DROP TABLE IF EXISTS job;
DROP TABLE IF EXISTS tag;
DROP TABLE IF EXISTS jobtag;
CREATE TABLE job (
id INTEGER PRIMARY KEY AUTOINCREMENT, -- Not needed in sqlite
job_id BIGINT NOT NULL,
cluster VARCHAR(255) NOT NULL,
start_time TIMESTAMP NOT NULL,
user VARCHAR(255) NOT NULL,
project VARCHAR(255) NOT NULL,
partition VARCHAR(255) NOT NULL,
array_job_id BIGINT NOT NULL,
duration INT,
job_state VARCHAR(255) CHECK(job_state IN ('running', 'completed', 'failed', 'canceled', 'stopped', 'timeout')) NOT NULL,
meta_data TEXT, -- json, but sqlite has no json type
resources TEXT NOT NULL, -- json, but sqlite has no json type
num_nodes INT NOT NULL,
num_hwthreads INT NOT NULL,
num_acc INT NOT NULL,
smt TINYINT CHECK(smt IN (0, 1 )) NOT NULL DEFAULT 1,
exclusive TINYINT CHECK(exclusive IN (0, 1, 2)) NOT NULL DEFAULT 1,
monitoring_status TINYINT CHECK(monitoring_status IN (0, 1 )) NOT NULL DEFAULT 1,
mem_used_max REAL NOT NULL DEFAULT 0.0,
flops_any_avg REAL NOT NULL DEFAULT 0.0,
mem_bw_avg REAL NOT NULL DEFAULT 0.0,
load_avg REAL NOT NULL DEFAULT 0.0,
net_bw_avg REAL NOT NULL DEFAULT 0.0,
net_data_vol_total REAL NOT NULL DEFAULT 0.0,
file_bw_avg REAL NOT NULL DEFAULT 0.0,
file_data_vol_total REAL NOT NULL DEFAULT 0.0);
CREATE TABLE tag (
id INTEGER PRIMARY KEY,
tag_type VARCHAR(255) NOT NULL,
tag_name VARCHAR(255) NOT NULL);
CREATE TABLE jobtag (
job_id INTEGER,
tag_id INTEGER,
PRIMARY KEY (job_id, tag_id),
FOREIGN KEY (job_id) REFERENCES job (id) ON DELETE CASCADE,
FOREIGN KEY (tag_id) REFERENCES tag (id) ON DELETE CASCADE);
`
// Delete the tables "job", "tag" and "jobtag" from the database and
// repopulate them using the jobs found in `archive`.
func initDB(db *sqlx.DB, archive string) error {
@ -21,99 +69,101 @@ func initDB(db *sqlx.DB, archive string) error {
fmt.Println("Building database...")
// Basic database structure:
_, err := db.Exec(`
DROP TABLE IF EXISTS job;
DROP TABLE IF EXISTS tag;
DROP TABLE IF EXISTS jobtag;
CREATE TABLE job (
id INTEGER PRIMARY KEY,
job_id TEXT,
user_id TEXT,
project_id TEXT,
cluster_id TEXT,
start_time TIMESTAMP,
duration INTEGER,
job_state TEXT,
num_nodes INTEGER,
node_list TEXT,
metadata TEXT,
flops_any_avg REAL,
mem_bw_avg REAL,
net_bw_avg REAL,
file_bw_avg REAL,
load_avg REAL);
CREATE TABLE tag (
id INTEGER PRIMARY KEY,
tag_type TEXT,
tag_name TEXT);
CREATE TABLE jobtag (
job_id INTEGER,
tag_id INTEGER,
PRIMARY KEY (job_id, tag_id),
FOREIGN KEY (job_id) REFERENCES job (id) ON DELETE CASCADE ON UPDATE NO ACTION,
FOREIGN KEY (tag_id) REFERENCES tag (id) ON DELETE CASCADE ON UPDATE NO ACTION);`)
_, err := db.Exec(JOBS_DB_SCHEMA)
if err != nil {
return err
}
entries0, err := os.ReadDir(archive)
clustersDir, err := os.ReadDir(archive)
if err != nil {
return err
}
insertstmt, err := db.Prepare(`INSERT INTO job
(job_id, user_id, project_id, cluster_id, start_time, duration, job_state, num_nodes, node_list, metadata, flops_any_avg, mem_bw_avg, net_bw_avg, file_bw_avg, load_avg)
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?);`)
if err != nil {
return err
}
var tx *sql.Tx = nil
var i int = 0
tx, err := db.Beginx()
if err != nil {
return err
}
stmt, err := tx.PrepareNamed(`INSERT INTO job (
job_id, user, project, cluster, partition, array_job_id, num_nodes, num_hwthreads, num_acc,
exclusive, monitoring_status, smt, job_state, start_time, duration, resources, meta_data,
mem_used_max, flops_any_avg, mem_bw_avg, load_avg, net_bw_avg, net_data_vol_total, file_bw_avg, file_data_vol_total
) VALUES (
:job_id, :user, :project, :cluster, :partition, :array_job_id, :num_nodes, :num_hwthreads, :num_acc,
:exclusive, :monitoring_status, :smt, :job_state, :start_time, :duration, :resources, :meta_data,
:mem_used_max, :flops_any_avg, :mem_bw_avg, :load_avg, :net_bw_avg, :net_data_vol_total, :file_bw_avg, :file_data_vol_total
);`)
if err != nil {
return err
}
i := 0
tags := make(map[string]int64)
for _, entry0 := range entries0 {
entries1, err := os.ReadDir(filepath.Join(archive, entry0.Name()))
if err != nil {
return err
}
for _, entry1 := range entries1 {
if !entry1.IsDir() {
continue
handleDirectory := func(filename string) error {
// Bundle 100 inserts into one transaction for better performance:
if i%100 == 0 {
if tx != nil {
if err := tx.Commit(); err != nil {
return err
}
}
entries2, err := os.ReadDir(filepath.Join(archive, entry0.Name(), entry1.Name()))
tx, err = db.Beginx()
if err != nil {
return err
}
for _, entry2 := range entries2 {
// Bundle 200 inserts into one transaction for better performance:
if i%200 == 0 {
if tx != nil {
if err := tx.Commit(); err != nil {
return err
stmt = tx.NamedStmt(stmt)
fmt.Printf("%d jobs inserted...\r", i)
}
err := loadJob(tx, stmt, tags, filename)
if err == nil {
i += 1
}
return err
}
for _, clusterDir := range clustersDir {
lvl1Dirs, err := os.ReadDir(filepath.Join(archive, clusterDir.Name()))
if err != nil {
return err
}
for _, lvl1Dir := range lvl1Dirs {
if !lvl1Dir.IsDir() {
// Could be the cluster.json file
continue
}
lvl2Dirs, err := os.ReadDir(filepath.Join(archive, clusterDir.Name(), lvl1Dir.Name()))
if err != nil {
return err
}
for _, lvl2Dir := range lvl2Dirs {
dirpath := filepath.Join(archive, clusterDir.Name(), lvl1Dir.Name(), lvl2Dir.Name())
startTimeDirs, err := os.ReadDir(dirpath)
if err != nil {
return err
}
for _, startTimeDir := range startTimeDirs {
if startTimeDir.Type().IsRegular() && startTimeDir.Name() == "meta.json" {
if err := handleDirectory(dirpath); err != nil {
log.Printf("in %s: %s\n", dirpath, err.Error())
}
} else if startTimeDir.IsDir() {
if err := handleDirectory(filepath.Join(dirpath, startTimeDir.Name())); err != nil {
log.Printf("in %s: %s\n", filepath.Join(dirpath, startTimeDir.Name()), err.Error())
}
}
tx, err = db.Begin()
if err != nil {
return err
}
insertstmt = tx.Stmt(insertstmt)
fmt.Printf("%d jobs inserted...\r", i)
}
filename := filepath.Join(archive, entry0.Name(), entry1.Name(), entry2.Name())
if err = loadJob(tx, insertstmt, tags, filename); err != nil {
fmt.Printf("failed to load '%s': %s", filename, err.Error())
continue
}
i += 1
}
}
}
@ -125,37 +175,44 @@ func initDB(db *sqlx.DB, archive string) error {
// Create indexes after inserts so that they do not
// need to be continually updated.
if _, err := db.Exec(`
CREATE INDEX job_by_user ON job (user_id);
CREATE INDEX job_by_user ON job (user);
CREATE INDEX job_by_starttime ON job (start_time);`); err != nil {
return err
}
fmt.Printf("A total of %d jobs have been registered in %.3f seconds.\n", i, time.Since(starttime).Seconds())
log.Printf("A total of %d jobs have been registered in %.3f seconds.\n", i, time.Since(starttime).Seconds())
return nil
}
// Read the `meta.json` file at `path` and insert it into the database using the prepared
// insert statement `stmt`. `tags` maps all existing tags to their database ID.
func loadJob(tx *sql.Tx, stmt *sql.Stmt, tags map[string]int64, path string) error {
func loadJob(tx *sqlx.Tx, stmt *sqlx.NamedStmt, tags map[string]int64, path string) error {
f, err := os.Open(filepath.Join(path, "meta.json"))
if err != nil {
return err
}
defer f.Close()
var job schema.JobMeta
if err := json.NewDecoder(bufio.NewReader(f)).Decode(&job); err != nil {
var jobMeta schema.JobMeta = schema.JobMeta{BaseJob: schema.JobDefaults}
if err := json.NewDecoder(bufio.NewReader(f)).Decode(&jobMeta); err != nil {
return err
}
flopsAnyAvg := loadJobStat(&job, "flops_any")
memBwAvg := loadJobStat(&job, "mem_bw")
netBwAvg := loadJobStat(&job, "net_bw")
fileBwAvg := loadJobStat(&job, "file_bw")
loadAvg := loadJobStat(&job, "load_one")
job := schema.Job{
BaseJob: jobMeta.BaseJob,
StartTime: time.Unix(jobMeta.StartTime, 0),
}
res, err := stmt.Exec(job.JobId, job.UserId, job.ProjectId, job.ClusterId, job.StartTime, job.Duration, job.JobState,
job.NumNodes, strings.Join(job.Nodes, ","), nil, flopsAnyAvg, memBwAvg, netBwAvg, fileBwAvg, loadAvg)
// TODO: Other metrics...
job.FlopsAnyAvg = loadJobStat(&jobMeta, "flops_any")
job.MemBwAvg = loadJobStat(&jobMeta, "mem_bw")
job.RawResources, err = json.Marshal(job.Resources)
if err != nil {
return err
}
res, err := stmt.Exec(job)
if err != nil {
return err
}
@ -188,12 +245,10 @@ func loadJob(tx *sql.Tx, stmt *sql.Stmt, tags map[string]int64, path string) err
return nil
}
func loadJobStat(job *schema.JobMeta, metric string) sql.NullFloat64 {
val := sql.NullFloat64{Valid: false}
func loadJobStat(job *schema.JobMeta, metric string) float64 {
if stats, ok := job.Statistics[metric]; ok {
val.Valid = true
val.Float64 = stats.Avg
return stats.Avg
}
return val
return 0.0
}

View File

@ -11,35 +11,30 @@ import (
"path"
"path/filepath"
"strconv"
"strings"
"github.com/ClusterCockpit/cc-jobarchive/config"
"github.com/ClusterCockpit/cc-jobarchive/graph/model"
"github.com/ClusterCockpit/cc-jobarchive/schema"
)
var JobArchivePath string = "./var/job-archive"
// For a given job, return the path of the `data.json`/`meta.json` file.
// TODO: Implement Issue ClusterCockpit/ClusterCockpit#97
func getPath(job *model.Job, file string) (string, error) {
id, err := strconv.Atoi(strings.Split(job.JobID, ".")[0])
if err != nil {
return "", err
func getPath(job *schema.Job, file string, checkLegacy bool) (string, error) {
lvl1, lvl2 := fmt.Sprintf("%d", job.JobID/1000), fmt.Sprintf("%03d", job.JobID%1000)
if !checkLegacy {
return filepath.Join(JobArchivePath, job.Cluster, lvl1, lvl2, strconv.FormatInt(job.StartTime.Unix(), 10), file), nil
}
lvl1, lvl2 := fmt.Sprintf("%d", id/1000), fmt.Sprintf("%03d", id%1000)
legacyPath := filepath.Join(JobArchivePath, job.ClusterID, lvl1, lvl2, file)
legacyPath := filepath.Join(JobArchivePath, job.Cluster, lvl1, lvl2, file)
if _, err := os.Stat(legacyPath); errors.Is(err, os.ErrNotExist) {
return filepath.Join(JobArchivePath, job.ClusterID, lvl1, lvl2, strconv.FormatInt(job.StartTime.Unix(), 10), file), nil
return filepath.Join(JobArchivePath, job.Cluster, lvl1, lvl2, strconv.FormatInt(job.StartTime.Unix(), 10), file), nil
}
return legacyPath, nil
}
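For orientation (not part of the commit): the non-legacy layout splits the scheduler job ID into a thousands prefix and a zero-padded remainder and appends the start time. A standalone sketch with a hypothetical job:

```go
package main

import (
	"fmt"
	"path/filepath"
	"strconv"
)

func main() {
	// Hypothetical job: ID 1234567 on cluster "emmy", started at Unix time 1609459200.
	var jobID int64 = 1234567
	lvl1, lvl2 := fmt.Sprintf("%d", jobID/1000), fmt.Sprintf("%03d", jobID%1000)
	p := filepath.Join("./var/job-archive", "emmy", lvl1, lvl2, strconv.FormatInt(1609459200, 10), "data.json")
	fmt.Println(p) // var/job-archive/emmy/1234/567/1609459200/data.json
}
```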
// Assuming job is completed/archived, return the jobs metric data.
func loadFromArchive(job *model.Job) (schema.JobData, error) {
filename, err := getPath(job, "data.json")
func loadFromArchive(job *schema.Job) (schema.JobData, error) {
filename, err := getPath(job, "data.json", true)
if err != nil {
return nil, err
}
@ -60,12 +55,12 @@ func loadFromArchive(job *model.Job) (schema.JobData, error) {
// If the job is archived, find its `meta.json` file and override the tags list
// in that JSON file. If the job is not archived, nothing is done.
func UpdateTags(job *model.Job, tags []*model.JobTag) error {
if job.State == model.JobStateRunning {
func UpdateTags(job *schema.Job, tags []*schema.Tag) error {
if job.State == schema.JobStateRunning {
return nil
}
filename, err := getPath(job, "meta.json")
filename, err := getPath(job, "meta.json", true)
if err != nil {
return err
}
@ -78,23 +73,19 @@ func UpdateTags(job *model.Job, tags []*model.JobTag) error {
return err
}
var metaFile schema.JobMeta
var metaFile schema.JobMeta = schema.JobMeta{
BaseJob: schema.JobDefaults,
}
if err := json.NewDecoder(f).Decode(&metaFile); err != nil {
return err
}
f.Close()
metaFile.Tags = make([]struct {
Name string "json:\"name\""
Type string "json:\"type\""
}, 0)
metaFile.Tags = make([]*schema.Tag, 0)
for _, tag := range tags {
metaFile.Tags = append(metaFile.Tags, struct {
Name string "json:\"name\""
Type string "json:\"type\""
}{
Name: tag.TagName,
Type: tag.TagType,
metaFile.Tags = append(metaFile.Tags, &schema.Tag{
Name: tag.Name,
Type: tag.Type,
})
}
@ -107,8 +98,8 @@ func UpdateTags(job *model.Job, tags []*model.JobTag) error {
}
// Helper to metricdata.LoadAverages().
func loadAveragesFromArchive(job *model.Job, metrics []string, data [][]schema.Float) error {
filename, err := getPath(job, "meta.json")
func loadAveragesFromArchive(job *schema.Job, metrics []string, data [][]schema.Float) error {
filename, err := getPath(job, "meta.json", true)
if err != nil {
return err
}
@ -135,97 +126,144 @@ func loadAveragesFromArchive(job *model.Job, metrics []string, data [][]schema.F
}
// Archives a running job: computes per-metric statistics and, unless the file-based archive is disabled, writes meta.json and data.json to the job-archive. Returns the resulting JobMeta.
func ArchiveJob(job *model.Job, ctx context.Context) error {
if job.State != model.JobStateRunning {
return errors.New("cannot archive job that is not running")
func ArchiveJob(job *schema.Job, ctx context.Context) (*schema.JobMeta, error) {
if job.State != schema.JobStateRunning {
return nil, errors.New("cannot archive job that is not running")
}
allMetrics := make([]string, 0)
metricConfigs := config.GetClusterConfig(job.ClusterID).MetricConfig
metricConfigs := config.GetClusterConfig(job.Cluster).MetricConfig
for _, mc := range metricConfigs {
allMetrics = append(allMetrics, mc.Name)
}
jobData, err := LoadData(job, allMetrics, ctx)
// TODO: Use more granular resolution on non-exclusive jobs?
scopes := []schema.MetricScope{schema.MetricScopeNode}
jobData, err := LoadData(job, allMetrics, scopes, ctx)
if err != nil {
return err
return nil, err
}
tags := []struct {
Name string `json:"name"`
Type string `json:"type"`
}{}
for _, tag := range job.Tags {
tags = append(tags, struct {
Name string `json:"name"`
Type string `json:"type"`
}{
Name: tag.TagName,
Type: tag.TagType,
})
if err := calcStatisticsSeries(job, jobData); err != nil {
return nil, err
}
metaData := &schema.JobMeta{
JobId: job.JobID,
UserId: job.UserID,
ClusterId: job.ClusterID,
NumNodes: job.NumNodes,
JobState: job.State.String(),
jobMeta := &schema.JobMeta{
BaseJob: job.BaseJob,
StartTime: job.StartTime.Unix(),
Duration: int64(job.Duration),
Nodes: job.Nodes,
Tags: tags,
Statistics: make(map[string]*schema.JobMetaStatistics),
Statistics: make(map[string]schema.JobStatistics),
}
for metric, data := range jobData {
avg, min, max := 0.0, math.MaxFloat32, -math.MaxFloat32
for _, nodedata := range data.Series {
avg += nodedata.Statistics.Avg
min = math.Min(min, nodedata.Statistics.Min)
max = math.Max(max, nodedata.Statistics.Max)
nodeData, ok := data["node"]
if !ok {
// TODO/FIXME: Calc average for non-node metrics as well!
continue
}
metaData.Statistics[metric] = &schema.JobMetaStatistics{
Unit: config.GetMetricConfig(job.ClusterID, metric).Unit,
for _, series := range nodeData.Series {
avg += series.Statistics.Avg
min = math.Min(min, series.Statistics.Min)
max = math.Max(max, series.Statistics.Max)
}
jobMeta.Statistics[metric] = schema.JobStatistics{
Unit: config.GetMetricConfig(job.Cluster, metric).Unit,
Avg: avg / float64(job.NumNodes),
Min: min,
Max: max,
}
}
dirPath, err := getPath(job, "")
// If the file based archive is disabled,
// only return the JobMeta structure as the
// statistics in there are needed.
if !useArchive {
return jobMeta, nil
}
dirPath, err := getPath(job, "", false)
if err != nil {
return err
return nil, err
}
if err := os.MkdirAll(dirPath, 0777); err != nil {
return err
return nil, err
}
f, err := os.Create(path.Join(dirPath, "meta.json"))
if err != nil {
return err
return nil, err
}
defer f.Close()
writer := bufio.NewWriter(f)
if err := json.NewEncoder(writer).Encode(metaData); err != nil {
return err
if err := json.NewEncoder(writer).Encode(jobMeta); err != nil {
return nil, err
}
if err := writer.Flush(); err != nil {
return err
return nil, err
}
f, err = os.Create(path.Join(dirPath, "data.json"))
if err != nil {
return err
return nil, err
}
writer = bufio.NewWriter(f)
if err := json.NewEncoder(writer).Encode(metaData); err != nil {
return err
if err := json.NewEncoder(writer).Encode(jobData); err != nil {
return nil, err
}
if err := writer.Flush(); err != nil {
return err
return nil, err
}
return f.Close()
return jobMeta, f.Close()
}
// Add statisticsSeries fields (per-timestep mean/min/max across all series) to every metric with at least five series
func calcStatisticsSeries(job *schema.Job, jobData schema.JobData) error {
for _, scopes := range jobData {
for _, jobMetric := range scopes {
if jobMetric.StatisticsSeries != nil {
continue
}
if len(jobMetric.Series) < 5 {
continue
}
n := 0
for _, series := range jobMetric.Series {
if len(series.Data) > n {
n = len(series.Data)
}
}
mean, min, max := make([]schema.Float, n), make([]schema.Float, n), make([]schema.Float, n)
for i := 0; i < n; i++ {
sum, smin, smax := schema.Float(0.), math.MaxFloat32, -math.MaxFloat32
for _, series := range jobMetric.Series {
if i >= len(series.Data) {
sum, smin, smax = schema.NaN, math.NaN(), math.NaN()
break
}
x := series.Data[i]
sum += x
smin = math.Min(smin, float64(x))
smax = math.Max(smax, float64(x))
}
sum /= schema.Float(len(jobMetric.Series))
mean[i] = sum
min[i] = schema.Float(smin)
max[i] = schema.Float(smax)
}
jobMetric.StatisticsSeries = &schema.StatsSeries{
Min: min, Mean: mean, Max: max,
}
jobMetric.Series = nil
}
}
return nil
}
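Worked example (not in the commit): for a metric with five node series holding 1.0, 2.0, 3.0, 4.0 and 5.0 at some index, the collapsed statisticsSeries stores mean 3.0, min 1.0 and max 5.0 at that index. Metrics with fewer than five series are left untouched; if any series is shorter than the longest one, mean/min/max become NaN from that index on, and once the statistics are computed the per-host series are dropped (jobMetric.Series = nil).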

View File

@ -1,17 +1,18 @@
package metricdata
import (
"bufio"
"bytes"
"context"
"encoding/json"
"errors"
"fmt"
"log"
"net/http"
"os"
"strconv"
"time"
"github.com/ClusterCockpit/cc-jobarchive/config"
"github.com/ClusterCockpit/cc-jobarchive/graph/model"
"github.com/ClusterCockpit/cc-jobarchive/schema"
)
@ -31,9 +32,9 @@ type ApiMetricData struct {
From int64 `json:"from"`
To int64 `json:"to"`
Data []schema.Float `json:"data"`
Avg *float64 `json:"avg"`
Min *float64 `json:"min"`
Max *float64 `json:"max"`
Avg schema.Float `json:"avg"`
Min schema.Float `json:"min"`
Max schema.Float `json:"max"`
}
type ApiStatsData struct {
@ -46,22 +47,23 @@ type ApiStatsData struct {
Max schema.Float `json:"max"`
}
func (ccms *CCMetricStore) Init() error {
ccms.url = os.Getenv("CCMETRICSTORE_URL")
ccms.jwt = os.Getenv("CCMETRICSTORE_JWT")
if ccms.url == "" || ccms.jwt == "" {
return errors.New("environment variables 'CCMETRICSTORE_URL' or 'CCMETRICSTORE_JWT' not set")
}
func (ccms *CCMetricStore) Init(url, token string) error {
ccms.url = url
ccms.jwt = token
return nil
}
func (ccms *CCMetricStore) LoadData(job *model.Job, metrics []string, ctx context.Context) (schema.JobData, error) {
func (ccms *CCMetricStore) doRequest(job *schema.Job, suffix string, metrics []string, ctx context.Context) (*http.Response, error) {
from, to := job.StartTime.Unix(), job.StartTime.Add(time.Duration(job.Duration)*time.Second).Unix()
reqBody := ApiRequestBody{}
reqBody.Metrics = metrics
for _, node := range job.Nodes {
reqBody.Selectors = append(reqBody.Selectors, []string{job.ClusterID, node})
for _, node := range job.Resources {
if node.Accelerators != nil || node.HWThreads != nil {
// TODO/FIXME:
return nil, errors.New("todo: cc-metric-store resources: Accelerator/HWThreads")
}
reqBody.Selectors = append(reqBody.Selectors, []string{job.Cluster, node.Hostname})
}
reqBodyBytes, err := json.Marshal(reqBody)
@ -69,53 +71,324 @@ func (ccms *CCMetricStore) LoadData(job *model.Job, metrics []string, ctx contex
return nil, err
}
authHeader := fmt.Sprintf("Bearer %s", ccms.jwt)
req, err := http.NewRequestWithContext(ctx, http.MethodPost, fmt.Sprintf("%s/api/%d/%d/timeseries?with-stats=true", ccms.url, from, to), bytes.NewReader(reqBodyBytes))
req, err := http.NewRequestWithContext(ctx, http.MethodPost, fmt.Sprintf("%s/api/%d/%d/%s", ccms.url, from, to, suffix), bytes.NewReader(reqBodyBytes))
if err != nil {
return nil, err
}
req.Header.Add("Authorization", authHeader)
if ccms.jwt != "" {
req.Header.Add("Authorization", fmt.Sprintf("Bearer %s", ccms.jwt))
}
return ccms.client.Do(req)
}
func (ccms *CCMetricStore) LoadData(job *schema.Job, metrics []string, scopes []schema.MetricScope, ctx context.Context) (schema.JobData, error) {
// log.Printf("job: %#v", job)
type ApiQuery struct {
Metric string `json:"metric"`
Hostname string `json:"hostname"`
Type *string `json:"type,omitempty"`
TypeIds []string `json:"type-ids,omitempty"`
SubType *string `json:"subtype,omitempty"`
SubTypeIds []string `json:"subtype-ids,omitempty"`
}
type ApiQueryRequest struct {
Cluster string `json:"cluster"`
From int64 `json:"from"`
To int64 `json:"to"`
Queries []ApiQuery `json:"queries"`
}
type ApiQueryResponse struct {
ApiMetricData
Query *ApiQuery `json:"query"`
}
reqBody := ApiQueryRequest{
Cluster: job.Cluster,
From: job.StartTime.Unix(),
To: job.StartTime.Add(time.Duration(job.Duration) * time.Second).Unix(),
Queries: make([]ApiQuery, 0),
}
if len(scopes) != 1 {
return nil, errors.New("todo: support more than one scope in a query")
}
topology := config.GetPartition(job.Cluster, job.Partition).Topology
scopeForMetric := map[string]schema.MetricScope{}
for _, metric := range metrics {
mc := config.GetMetricConfig(job.Cluster, metric)
if mc == nil {
// return nil, fmt.Errorf("metric '%s' is not specified for cluster '%s'", metric, job.Cluster)
log.Printf("metric '%s' is not specified for cluster '%s'", metric, job.Cluster)
continue
}
nativeScope, requestedScope := mc.Scope, scopes[0]
// case 1: A metric is requested at node scope with a native scope of node as well
// case 2: A metric is requested at node scope and the job has exclusive access to its nodes
// case 3: A metric has native scope node
if (nativeScope == requestedScope && nativeScope == schema.MetricScopeNode) ||
(job.Exclusive == 1 && requestedScope == schema.MetricScopeNode) ||
(nativeScope == schema.MetricScopeNode) {
nodes := map[string]bool{}
for _, resource := range job.Resources {
nodes[resource.Hostname] = true
}
for node := range nodes {
reqBody.Queries = append(reqBody.Queries, ApiQuery{
Metric: metric,
Hostname: node,
})
}
scopeForMetric[metric] = schema.MetricScopeNode
continue
}
// case: Read a metric at hwthread scope with native scope hwthread
if nativeScope == requestedScope && nativeScope == schema.MetricScopeHWThread && job.NumNodes == 1 {
hwthreads := job.Resources[0].HWThreads
if hwthreads == nil {
hwthreads = topology.Node
}
t := "cpu" // TODO/FIXME: inconsistency between cc-metric-collector and ClusterCockpit
for _, hwthread := range hwthreads {
reqBody.Queries = append(reqBody.Queries, ApiQuery{
Metric: metric,
Hostname: job.Resources[0].Hostname,
Type: &t,
TypeIds: []string{strconv.Itoa(hwthread)},
})
}
scopeForMetric[metric] = schema.MetricScopeHWThread
continue
}
// case: A metric is requested at node scope, its native scope is hwthread, the job does not have exclusive node access, and it runs on a single node
if requestedScope == schema.MetricScopeNode && nativeScope == schema.MetricScopeHWThread && job.Exclusive != 1 && job.NumNodes == 1 {
hwthreads := job.Resources[0].HWThreads
if hwthreads == nil {
hwthreads = topology.Node
}
t := "cpu" // TODO/FIXME: inconsistency between cc-metric-collector and ClusterCockpit
ids := make([]string, 0, len(hwthreads))
for _, hwthread := range hwthreads {
ids = append(ids, strconv.Itoa(hwthread))
}
reqBody.Queries = append(reqBody.Queries, ApiQuery{
Metric: metric,
Hostname: job.Resources[0].Hostname,
Type: &t,
TypeIds: ids,
})
scopeForMetric[metric] = schema.MetricScopeNode
continue
}
// TODO: the job shares its nodes and the metric's native scope is smaller than node
panic("todo")
}
// log.Printf("query: %#v", reqBody)
buf := &bytes.Buffer{}
if err := json.NewEncoder(buf).Encode(reqBody); err != nil {
return nil, err
}
req, err := http.NewRequestWithContext(ctx, http.MethodPost, ccms.url+"/api/query", buf)
if err != nil {
return nil, err
}
if ccms.jwt != "" {
req.Header.Add("Authorization", fmt.Sprintf("Bearer %s", ccms.jwt))
}
res, err := ccms.client.Do(req)
if err != nil {
return nil, err
}
if res.StatusCode != http.StatusOK {
return nil, fmt.Errorf("cc-metric-store replied with: %s", res.Status)
}
resdata := make([]map[string]ApiMetricData, 0, len(reqBody.Selectors))
var resBody []ApiQueryResponse
if err := json.NewDecoder(bufio.NewReader(res.Body)).Decode(&resBody); err != nil {
return nil, err
}
// log.Printf("response: %#v", resBody)
var jobData schema.JobData = make(schema.JobData)
for _, res := range resBody {
metric := res.Query.Metric
if _, ok := jobData[metric]; !ok {
jobData[metric] = make(map[schema.MetricScope]*schema.JobMetric)
}
if res.Error != nil {
return nil, fmt.Errorf("cc-metric-store error while fetching %s: %s", metric, *res.Error)
}
mc := config.GetMetricConfig(job.Cluster, metric)
scope := scopeForMetric[metric]
jobMetric, ok := jobData[metric][scope]
if !ok {
jobMetric = &schema.JobMetric{
Unit: mc.Unit,
Scope: scope,
Timestep: mc.Timestep,
Series: make([]schema.Series, 0),
}
jobData[metric][scope] = jobMetric
}
id := (*int)(nil)
if res.Query.Type != nil {
id = new(int)
*id, _ = strconv.Atoi(res.Query.TypeIds[0])
}
if res.Avg.IsNaN() || res.Min.IsNaN() || res.Max.IsNaN() {
// TODO: use schema.Float instead of float64?
// This is done because a plain float64 cannot be marshaled to JSON when it is NaN.
res.Avg = schema.Float(0)
res.Min = schema.Float(0)
res.Max = schema.Float(0)
}
jobMetric.Series = append(jobMetric.Series, schema.Series{
Hostname: res.Query.Hostname,
Id: id,
Statistics: &schema.MetricStatistics{
Avg: float64(res.Avg),
Min: float64(res.Min),
Max: float64(res.Max),
},
Data: res.Data,
})
}
return jobData, nil
}
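For orientation, a sketch (not part of the commit) of what a plain node-scope request body to `<ccms.url>/api/query` looks like; the struct definitions are local copies of the function-local types above, and the cluster, host names and timestamps are made up:

```go
package main

import (
	"encoding/json"
	"fmt"
)

type apiQuery struct {
	Metric   string `json:"metric"`
	Hostname string `json:"hostname"`
}

type apiQueryRequest struct {
	Cluster string     `json:"cluster"`
	From    int64      `json:"from"`
	To      int64      `json:"to"`
	Queries []apiQuery `json:"queries"`
}

func main() {
	// Hypothetical two-node, node-scope request for one metric.
	req := apiQueryRequest{
		Cluster: "emmy",
		From:    1609459200,
		To:      1609462800,
		Queries: []apiQuery{
			{Metric: "flops_any", Hostname: "e0101"},
			{Metric: "flops_any", Hostname: "e0102"},
		},
	}
	out, err := json.MarshalIndent(req, "", "  ")
	if err != nil {
		panic(err)
	}
	fmt.Println(string(out))
}
```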
func (ccms *CCMetricStore) LoadStats(job *schema.Job, metrics []string, ctx context.Context) (map[string]map[string]schema.MetricStatistics, error) {
res, err := ccms.doRequest(job, "stats", metrics, ctx)
if err != nil {
return nil, err
}
resdata := make([]map[string]ApiStatsData, 0, len(job.Resources))
if err := json.NewDecoder(res.Body).Decode(&resdata); err != nil {
return nil, err
}
var jobData schema.JobData = make(schema.JobData)
stats := map[string]map[string]schema.MetricStatistics{}
for _, metric := range metrics {
mc := config.GetMetricConfig(job.ClusterID, metric)
metricData := &schema.JobMetric{
Scope: "node", // TODO: FIXME: Whatever...
Unit: mc.Unit,
Timestep: mc.Sampletime,
Series: make([]*schema.MetricSeries, 0, len(job.Nodes)),
}
for i, node := range job.Nodes {
nodestats := map[string]schema.MetricStatistics{}
for i, node := range job.Resources {
if node.Accelerators != nil || node.HWThreads != nil {
// TODO/FIXME:
return nil, errors.New("todo: cc-metric-store resources: Accelerator/HWThreads")
}
data := resdata[i][metric]
if data.Error != nil {
return nil, errors.New(*data.Error)
}
if data.Avg == nil || data.Min == nil || data.Max == nil {
return nil, errors.New("no data")
if data.Samples == 0 {
return nil, fmt.Errorf("no data for node '%s' and metric '%s'", node.Hostname, metric)
}
metricData.Series = append(metricData.Series, &schema.MetricSeries{
NodeID: node,
Data: data.Data,
Statistics: &schema.MetricStatistics{
Avg: *data.Avg,
Min: *data.Min,
Max: *data.Max,
},
})
nodestats[node.Hostname] = schema.MetricStatistics{
Avg: float64(data.Avg),
Min: float64(data.Min),
Max: float64(data.Max),
}
}
jobData[metric] = metricData
stats[metric] = nodestats
}
return jobData, nil
return stats, nil
}
func (ccms *CCMetricStore) LoadNodeData(clusterId string, metrics, nodes []string, from, to int64, ctx context.Context) (map[string]map[string][]schema.Float, error) {
reqBody := ApiRequestBody{}
reqBody.Metrics = metrics
for _, node := range nodes {
reqBody.Selectors = append(reqBody.Selectors, []string{clusterId, node})
}
reqBodyBytes, err := json.Marshal(reqBody)
if err != nil {
return nil, err
}
var req *http.Request
if nodes == nil {
req, err = http.NewRequestWithContext(ctx, http.MethodPost, fmt.Sprintf("%s/api/%s/%d/%d/all-nodes", ccms.url, clusterId, from, to), bytes.NewReader(reqBodyBytes))
} else {
req, err = http.NewRequestWithContext(ctx, http.MethodPost, fmt.Sprintf("%s/api/%d/%d/timeseries", ccms.url, from, to), bytes.NewReader(reqBodyBytes))
}
if err != nil {
return nil, err
}
if ccms.jwt != "" {
req.Header.Add("Authorization", fmt.Sprintf("Bearer %s", ccms.jwt))
}
res, err := ccms.client.Do(req)
if err != nil {
return nil, err
}
data := map[string]map[string][]schema.Float{}
if nodes == nil {
resdata := map[string]map[string]ApiMetricData{}
if err := json.NewDecoder(res.Body).Decode(&resdata); err != nil {
return nil, err
}
for node, metrics := range resdata {
nodedata := map[string][]schema.Float{}
for metric, data := range metrics {
if data.Error != nil {
return nil, errors.New(*data.Error)
}
nodedata[metric] = data.Data
}
data[node] = nodedata
}
} else {
resdata := make([]map[string]ApiMetricData, 0, len(nodes))
if err := json.NewDecoder(res.Body).Decode(&resdata); err != nil {
return nil, err
}
for i, node := range nodes {
metricsData := map[string][]schema.Float{}
for metric, data := range resdata[i] {
if data.Error != nil {
return nil, errors.New(*data.Error)
}
metricsData[metric] = data.Data
}
data[node] = metricsData
}
}
return data, nil
}

179
metricdata/influxdb-v2.go Normal file
View File

@ -0,0 +1,179 @@
package metricdata
/*
import (
"context"
"errors"
"fmt"
"log"
"os"
"strings"
"time"
"github.com/ClusterCockpit/cc-jobarchive/config"
"github.com/ClusterCockpit/cc-jobarchive/graph/model"
"github.com/ClusterCockpit/cc-jobarchive/schema"
influxdb2 "github.com/influxdata/influxdb-client-go/v2"
influxdb2Api "github.com/influxdata/influxdb-client-go/v2/api"
)
type InfluxDBv2DataRepository struct {
client influxdb2.Client
queryClient influxdb2Api.QueryAPI
bucket, measurement string
}
func (idb *InfluxDBv2DataRepository) Init(url string) error {
token := os.Getenv("INFLUXDB_V2_TOKEN")
if token == "" {
log.Println("warning: environment variable 'INFLUXDB_V2_TOKEN' not set")
}
idb.client = influxdb2.NewClient(url, token)
idb.queryClient = idb.client.QueryAPI("ClusterCockpit")
idb.bucket = "ClusterCockpit/data"
idb.measurement = "data"
return nil
}
func (idb *InfluxDBv2DataRepository) formatTime(t time.Time) string {
return fmt.Sprintf("%d-%02d-%02dT%02d:%02d:%02dZ",
t.Year(), t.Month(), t.Day(), t.Hour(), t.Minute(), t.Second())
}
func (idb *InfluxDBv2DataRepository) LoadData(job *model.Job, metrics []string, ctx context.Context) (schema.JobData, error) {
fieldsConds := make([]string, 0, len(metrics))
for _, m := range metrics {
fieldsConds = append(fieldsConds, fmt.Sprintf(`r._field == "%s"`, m))
}
fieldsCond := strings.Join(fieldsConds, " or ")
hostsConds := make([]string, 0, len(job.Resources))
for _, h := range job.Resources {
if h.HWThreads != nil || h.Accelerators != nil {
// TODO/FIXME...
return nil, errors.New("the InfluxDB metric data repository does not support HWThreads or Accelerators")
}
hostsConds = append(hostsConds, fmt.Sprintf(`r.host == "%s"`, h.Hostname))
}
hostsCond := strings.Join(hostsConds, " or ")
query := fmt.Sprintf(`from(bucket: "%s")
|> range(start: %s, stop: %s)
|> filter(fn: (r) => r._measurement == "%s" and (%s) and (%s))
|> drop(columns: ["_start", "_stop", "_measurement"])`, idb.bucket,
idb.formatTime(job.StartTime), idb.formatTime(job.StartTime.Add(time.Duration(job.Duration)).Add(1*time.Second)),
idb.measurement, hostsCond, fieldsCond)
rows, err := idb.queryClient.Query(ctx, query)
if err != nil {
return nil, err
}
jobData := make(schema.JobData)
var currentSeries *schema.MetricSeries = nil
for rows.Next() {
row := rows.Record()
if currentSeries == nil || rows.TableChanged() {
field, host := row.Field(), row.ValueByKey("host").(string)
jobMetric, ok := jobData[field]
if !ok {
mc := config.GetMetricConfig(job.Cluster, field)
jobMetric = &schema.JobMetric{
Scope: "node", // TODO: FIXME: Whatever...
Unit: mc.Unit,
Timestep: mc.Timestep,
Series: make([]*schema.MetricSeries, 0, len(job.Resources)),
}
jobData[field] = jobMetric
}
currentSeries = &schema.MetricSeries{
Hostname: host,
Statistics: nil,
Data: make([]schema.Float, 0),
}
jobMetric.Series = append(jobMetric.Series, currentSeries)
}
val := row.Value().(float64)
currentSeries.Data = append(currentSeries.Data, schema.Float(val))
}
stats, err := idb.LoadStats(job, metrics, ctx)
if err != nil {
return nil, err
}
for metric, nodes := range stats {
jobMetric := jobData[metric]
for node, stats := range nodes {
for _, series := range jobMetric.Series {
if series.Hostname == node {
series.Statistics = &stats
}
}
}
}
return jobData, nil
}
func (idb *InfluxDBv2DataRepository) LoadStats(job *model.Job, metrics []string, ctx context.Context) (map[string]map[string]schema.MetricStatistics, error) {
stats := map[string]map[string]schema.MetricStatistics{}
hostsConds := make([]string, 0, len(job.Resources))
for _, h := range job.Resources {
if h.HWThreads != nil || h.Accelerators != nil {
// TODO/FIXME...
return nil, errors.New("the InfluxDB metric data repository does not support HWThreads or Accelerators")
}
hostsConds = append(hostsConds, fmt.Sprintf(`r.host == "%s"`, h.Hostname))
}
hostsCond := strings.Join(hostsConds, " or ")
for _, metric := range metrics {
query := fmt.Sprintf(`
data = from(bucket: "%s")
|> range(start: %s, stop: %s)
|> filter(fn: (r) => r._measurement == "%s" and r._field == "%s" and (%s))
union(tables: [
data |> mean(column: "_value") |> set(key: "_field", value: "avg")
data |> min(column: "_value") |> set(key: "_field", value: "min")
data |> max(column: "_value") |> set(key: "_field", value: "max")
])
|> pivot(rowKey: ["host"], columnKey: ["_field"], valueColumn: "_value")
|> group()`, idb.bucket,
idb.formatTime(job.StartTime), idb.formatTime(job.StartTime.Add(time.Duration(job.Duration)).Add(1*time.Second)),
idb.measurement, metric, hostsCond)
rows, err := idb.queryClient.Query(ctx, query)
if err != nil {
return nil, err
}
nodes := map[string]schema.MetricStatistics{}
for rows.Next() {
row := rows.Record()
host := row.ValueByKey("host").(string)
avg, min, max := row.ValueByKey("avg").(float64),
row.ValueByKey("min").(float64),
row.ValueByKey("max").(float64)
nodes[host] = schema.MetricStatistics{
Avg: avg,
Min: min,
Max: max,
}
}
stats[metric] = nodes
}
return stats, nil
}
func (idb *InfluxDBv2DataRepository) LoadNodeData(clusterId string, metrics, nodes []string, from, to int64, ctx context.Context) (map[string]map[string][]schema.Float, error) {
return nil, nil
}
*/

View File

@ -2,31 +2,74 @@ package metricdata
import (
"context"
"errors"
"fmt"
"log"
"github.com/ClusterCockpit/cc-jobarchive/graph/model"
"github.com/ClusterCockpit/cc-jobarchive/config"
"github.com/ClusterCockpit/cc-jobarchive/schema"
)
var runningJobs *CCMetricStore
type MetricDataRepository interface {
// Initialize this MetricDataRepository. One instance of
// this interface will only ever be responsible for one cluster.
Init(url, token string) error
func init() {
runningJobs = &CCMetricStore{}
if err := runningJobs.Init(); err != nil {
log.Fatalln(err)
// Return the JobData for the given job, only with the requested metrics.
LoadData(job *schema.Job, metrics []string, scopes []schema.MetricScope, ctx context.Context) (schema.JobData, error)
// Return a map of metrics to a map of nodes to the metric statistics of the job.
LoadStats(job *schema.Job, metrics []string, ctx context.Context) (map[string]map[string]schema.MetricStatistics, error)
// Return a map of nodes to a map of metrics to the data for the requested time.
LoadNodeData(clusterId string, metrics, nodes []string, from, to int64, ctx context.Context) (map[string]map[string][]schema.Float, error)
}
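For orientation (not part of the commit): adding another backend means satisfying exactly this interface and registering it in the switch inside Init() below. A do-nothing sketch with a hypothetical type name:

```go
package metricdata

import (
	"context"

	"github.com/ClusterCockpit/cc-jobarchive/schema"
)

// nullRepository is a hypothetical MetricDataRepository that returns empty results.
type nullRepository struct{}

func (r *nullRepository) Init(url, token string) error { return nil }

func (r *nullRepository) LoadData(job *schema.Job, metrics []string, scopes []schema.MetricScope, ctx context.Context) (schema.JobData, error) {
	return schema.JobData{}, nil
}

func (r *nullRepository) LoadStats(job *schema.Job, metrics []string, ctx context.Context) (map[string]map[string]schema.MetricStatistics, error) {
	return map[string]map[string]schema.MetricStatistics{}, nil
}

func (r *nullRepository) LoadNodeData(clusterId string, metrics, nodes []string, from, to int64, ctx context.Context) (map[string]map[string][]schema.Float, error) {
	return map[string]map[string][]schema.Float{}, nil
}
```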
var metricDataRepos map[string]MetricDataRepository = map[string]MetricDataRepository{}
var JobArchivePath string
var useArchive bool
func Init(jobArchivePath string, disableArchive bool) error {
useArchive = !disableArchive
JobArchivePath = jobArchivePath
for _, cluster := range config.Clusters {
if cluster.MetricDataRepository != nil {
switch cluster.MetricDataRepository.Kind {
case "cc-metric-store":
ccms := &CCMetricStore{}
if err := ccms.Init(cluster.MetricDataRepository.Url, cluster.MetricDataRepository.Token); err != nil {
return err
}
metricDataRepos[cluster.Name] = ccms
// case "influxdb-v2":
// idb := &InfluxDBv2DataRepository{}
// if err := idb.Init(cluster.MetricDataRepository.Url); err != nil {
// return err
// }
// metricDataRepos[cluster.Name] = idb
default:
return fmt.Errorf("unkown metric data repository '%s' for cluster '%s'", cluster.MetricDataRepository.Kind, cluster.Name)
}
}
}
return nil
}
// Fetches the metric data for a job.
func LoadData(job *model.Job, metrics []string, ctx context.Context) (schema.JobData, error) {
if job.State == model.JobStateRunning {
return runningJobs.LoadData(job, metrics, ctx)
}
func LoadData(job *schema.Job, metrics []string, scopes []schema.MetricScope, ctx context.Context) (schema.JobData, error) {
if job.State == schema.JobStateRunning || !useArchive {
repo, ok := metricDataRepos[job.Cluster]
if !ok {
return nil, fmt.Errorf("no metric data repository configured for '%s'", job.Cluster)
}
if job.State != model.JobStateCompleted {
return nil, fmt.Errorf("job of state '%s' is not supported", job.State)
data, err := repo.LoadData(job, metrics, scopes, ctx)
if err != nil {
return nil, err
}
calcStatisticsSeries(job, data)
return data, nil
}
data, err := loadFromArchive(job)
@ -47,10 +90,58 @@ func LoadData(job *model.Job, metrics []string, ctx context.Context) (schema.Job
}
// Used for the jobsFootprint GraphQL-Query. TODO: Rename/Generalize.
func LoadAverages(job *model.Job, metrics []string, data [][]schema.Float, ctx context.Context) error {
if job.State != model.JobStateCompleted {
return errors.New("only completed jobs are supported")
func LoadAverages(job *schema.Job, metrics []string, data [][]schema.Float, ctx context.Context) error {
if job.State != schema.JobStateRunning && useArchive {
return loadAveragesFromArchive(job, metrics, data)
}
return loadAveragesFromArchive(job, metrics, data)
repo, ok := metricDataRepos[job.Cluster]
if !ok {
return fmt.Errorf("no metric data repository configured for '%s'", job.Cluster)
}
stats, err := repo.LoadStats(job, metrics, ctx)
if err != nil {
return err
}
for i, m := range metrics {
nodes, ok := stats[m]
if !ok {
data[i] = append(data[i], schema.NaN)
continue
}
sum := 0.0
for _, node := range nodes {
sum += node.Avg
}
data[i] = append(data[i], schema.Float(sum))
}
return nil
}
func LoadNodeData(clusterId string, metrics, nodes []string, from, to int64, ctx context.Context) (map[string]map[string][]schema.Float, error) {
repo, ok := metricDataRepos[clusterId]
if !ok {
return nil, fmt.Errorf("no metric data repository configured for '%s'", clusterId)
}
if metrics == nil {
for _, m := range config.GetClusterConfig(clusterId).MetricConfig {
metrics = append(metrics, m.Name)
}
}
data, err := repo.LoadNodeData(clusterId, metrics, nodes, from, to, ctx)
if err != nil {
return nil, err
}
if data == nil {
return nil, fmt.Errorf("the metric data repository for '%s' does not support this query", clusterId)
}
return data, nil
}

View File

@ -1,115 +0,0 @@
package main
import (
"encoding/json"
"fmt"
"log"
"net/http"
"strings"
"github.com/ClusterCockpit/cc-jobarchive/config"
"github.com/ClusterCockpit/cc-jobarchive/graph"
"github.com/ClusterCockpit/cc-jobarchive/graph/model"
"github.com/ClusterCockpit/cc-jobarchive/metricdata"
sq "github.com/Masterminds/squirrel"
)
type StartJobRequestBody struct {
JobId string `json:"job_id"`
UserId string `json:"user_id"`
ProjectId string `json:"project_id"`
ClusterId string `json:"cluster_id"`
StartTime int64 `json:"start_time"`
Nodes []string `json:"nodes"`
Metadata string `json:"metadata"`
}
type StartJobResponeBody struct {
DBID int64 `json:"db_id"`
}
type StopJobRequestBody struct {
DBID *int64 `json:"db_id"`
JobId string `json:"job_id"`
ClusterId string `json:"cluster_id"`
StartTime int64 `json:"start_time"`
StopTime int64 `json:"stop_time"`
}
func startJob(rw http.ResponseWriter, r *http.Request) {
req := StartJobRequestBody{}
if err := json.NewDecoder(r.Body).Decode(&req); err != nil {
http.Error(rw, err.Error(), http.StatusBadRequest)
return
}
if config.GetClusterConfig(req.ClusterId) == nil {
http.Error(rw, fmt.Sprintf("cluster '%s' does not exist", req.ClusterId), http.StatusBadRequest)
return
}
res, err := db.Exec(
`INSERT INTO job (job_id, user_id, cluster_id, start_time, duration, job_state, num_nodes, node_list, metadata) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?);`,
req.JobId, req.UserId, req.ClusterId, req.StartTime, 0, model.JobStateRunning, len(req.Nodes), strings.Join(req.Nodes, ","), req.Metadata)
if err != nil {
http.Error(rw, err.Error(), http.StatusInternalServerError)
return
}
id, err := res.LastInsertId()
if err != nil {
http.Error(rw, err.Error(), http.StatusInternalServerError)
return
}
log.Printf("New job started (db-id=%d)\n", id)
rw.Header().Add("Content-Type", "application/json")
rw.WriteHeader(http.StatusOK)
json.NewEncoder(rw).Encode(StartJobResponeBody{
DBID: id,
})
}
func stopJob(rw http.ResponseWriter, r *http.Request) {
req := StopJobRequestBody{}
if err := json.NewDecoder(r.Body).Decode(&req); err != nil {
http.Error(rw, err.Error(), http.StatusBadRequest)
return
}
var err error
var job *model.Job
if req.DBID != nil {
job, err = graph.ScanJob(sq.Select(graph.JobTableCols...).From("job").Where("job.id = ?", req.DBID).RunWith(db).QueryRow())
} else {
job, err = graph.ScanJob(sq.Select(graph.JobTableCols...).From("job").
Where("job.job_id = ?", req.JobId).
Where("job.cluster_id = ?", req.ClusterId).
Where("job.start_time = ?", req.StartTime).
RunWith(db).QueryRow())
}
if err != nil {
http.Error(rw, err.Error(), http.StatusBadRequest)
return
}
if job == nil || job.StartTime.Unix() >= req.StopTime || job.State != model.JobStateRunning {
http.Error(rw, "stop_time must be larger than start_time and only running jobs can be stopped", http.StatusBadRequest)
return
}
job.Duration = int(req.StopTime - job.StartTime.Unix())
if err := metricdata.ArchiveJob(job, r.Context()); err != nil {
http.Error(rw, err.Error(), http.StatusInternalServerError)
return
}
if _, err := db.Exec(`UPDATE job SET job.duration = ?, job.job_state = ? WHERE job.id = ?;`,
job.Duration, model.JobStateCompleted, job.ID); err != nil {
http.Error(rw, err.Error(), http.StatusInternalServerError)
return
}
rw.WriteHeader(http.StatusOK)
}

150
schema/job.go Normal file
View File

@ -0,0 +1,150 @@
package schema
import (
"encoding/json"
"errors"
"fmt"
"io"
"time"
)
// Common subset of Job and JobMeta. Use one of those, not
// this type directly.
type BaseJob struct {
JobID int64 `json:"jobId" db:"job_id"`
User string `json:"user" db:"user"`
Project string `json:"project" db:"project"`
Cluster string `json:"cluster" db:"cluster"`
Partition string `json:"partition" db:"partition"`
ArrayJobId int32 `json:"arrayJobId" db:"array_job_id"`
NumNodes int32 `json:"numNodes" db:"num_nodes"`
NumHWThreads int32 `json:"numHwthreads" db:"num_hwthreads"`
NumAcc int32 `json:"numAcc" db:"num_acc"`
Exclusive int32 `json:"exclusive" db:"exclusive"`
MonitoringStatus int32 `json:"monitoringStatus" db:"monitoring_status"`
SMT int32 `json:"smt" db:"smt"`
State JobState `json:"jobState" db:"job_state"`
Duration int32 `json:"duration" db:"duration"`
Tags []*Tag `json:"tags"`
RawResources []byte `json:"-" db:"resources"`
Resources []*Resource `json:"resources"`
MetaData interface{} `json:"metaData" db:"meta_data"`
}
// This type is used as the GraphQL interface and, via sqlx, as a database table row.
type Job struct {
ID int64 `json:"id" db:"id"`
BaseJob
StartTime time.Time `json:"startTime" db:"start_time"`
MemUsedMax float64 `json:"-" db:"mem_used_max"`
FlopsAnyAvg float64 `json:"-" db:"flops_any_avg"`
MemBwAvg float64 `json:"-" db:"mem_bw_avg"`
LoadAvg float64 `json:"-" db:"load_avg"`
NetBwAvg float64 `json:"-" db:"net_bw_avg"`
NetDataVolTotal float64 `json:"-" db:"net_data_vol_total"`
FileBwAvg float64 `json:"-" db:"file_bw_avg"`
FileDataVolTotal float64 `json:"-" db:"file_data_vol_total"`
}
// When reading from the database or sending data via GraphQL, the start time can use the much more
// convenient time.Time type. In the `meta.json` files, however, the start time is encoded as a unix
// epoch timestamp. This struct therefore mirrors the regular Job struct, but "overwrites" the
// StartTime field with one of type int64.
type JobMeta struct {
BaseJob
StartTime int64 `json:"startTime" db:"start_time"`
Statistics map[string]JobStatistics `json:"statistics,omitempty"`
}
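
For illustration (not in this diff): decoding a `meta.json` into `JobMeta` and converting the unix epoch start time into a `time.Time`. The file path is a placeholder.

```go
package main // illustrative only

import (
	"encoding/json"
	"log"
	"os"
	"time"

	"github.com/ClusterCockpit/cc-jobarchive/schema"
)

// readJobMeta decodes one meta.json file and returns its start time as time.Time.
func readJobMeta(path string) (*schema.JobMeta, time.Time, error) {
	f, err := os.Open(path)
	if err != nil {
		return nil, time.Time{}, err
	}
	defer f.Close()

	var meta schema.JobMeta
	if err := json.NewDecoder(f).Decode(&meta); err != nil {
		return nil, time.Time{}, err
	}
	return &meta, time.Unix(meta.StartTime, 0), nil
}

func main() {
	meta, start, err := readJobMeta("./path/to/meta.json") // placeholder path
	if err != nil {
		log.Fatal(err)
	}
	log.Printf("job %d started at %s", meta.JobID, start)
}
```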
var JobDefaults BaseJob = BaseJob{
Exclusive: 1,
MonitoringStatus: 1,
MetaData: "",
}
var JobColumns []string = []string{
"job.id", "job.job_id", "job.user", "job.project", "job.cluster", "job.start_time", "job.partition", "job.array_job_id", "job.num_nodes",
"job.num_hwthreads", "job.num_acc", "job.exclusive", "job.monitoring_status", "job.smt", "job.job_state",
"job.duration", "job.resources", "job.meta_data",
}
type Scannable interface {
StructScan(dest interface{}) error
}
// Helper function for scanning jobs with the `JobColumns` columns selected.
func ScanJob(row Scannable) (*Job, error) {
job := &Job{BaseJob: JobDefaults}
if err := row.StructScan(job); err != nil {
return nil, err
}
if err := json.Unmarshal(job.RawResources, &job.Resources); err != nil {
return nil, err
}
if job.Duration == 0 && job.State == JobStateRunning {
job.Duration = int32(time.Since(job.StartTime).Seconds())
}
job.RawResources = nil
return job, nil
}
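
A usage sketch (not in this diff) combining `JobColumns`, squirrel, and sqlx with `ScanJob`, analogous to how the removed REST code used `graph.ScanJob`. The package placement and function name are assumptions.

```go
package api // illustrative only

import (
	sq "github.com/Masterminds/squirrel"
	"github.com/ClusterCockpit/cc-jobarchive/schema"
	"github.com/jmoiron/sqlx"
)

// fetchJob loads a single job by its database id. sqlx's QueryRowx returns a
// *sqlx.Row, which provides StructScan and therefore satisfies schema.Scannable.
func fetchJob(db *sqlx.DB, id int64) (*schema.Job, error) {
	query, args, err := sq.Select(schema.JobColumns...).
		From("job").
		Where("job.id = ?", id).
		ToSql()
	if err != nil {
		return nil, err
	}
	return schema.ScanJob(db.QueryRowx(query, args...))
}
```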
type JobStatistics struct {
Unit string `json:"unit"`
Avg float64 `json:"avg"`
Min float64 `json:"min"`
Max float64 `json:"max"`
}
type Tag struct {
ID int64 `json:"id" db:"id"`
Type string `json:"type" db:"tag_type"`
Name string `json:"name" db:"tag_name"`
}
type Resource struct {
Hostname string `json:"hostname"`
HWThreads []int `json:"hwthreads,omitempty"`
Accelerators []int `json:"accelerators,omitempty"`
Configuration string `json:"configuration,omitempty"`
}
type JobState string
const (
JobStateRunning JobState = "running"
JobStateCompleted JobState = "completed"
JobStateFailed JobState = "failed"
JobStateCanceled JobState = "canceled"
JobStateStopped JobState = "stopped"
JobStateTimeout JobState = "timeout"
)
func (e *JobState) UnmarshalGQL(v interface{}) error {
str, ok := v.(string)
if !ok {
return fmt.Errorf("enums must be strings")
}
*e = JobState(str)
if !e.Valid() {
return errors.New("invalid job state")
}
return nil
}
func (e JobState) MarshalGQL(w io.Writer) {
fmt.Fprintf(w, "\"%s\"", e)
}
func (e JobState) Valid() bool {
return e == JobStateRunning ||
e == JobStateCompleted ||
e == JobStateFailed ||
e == JobStateCanceled ||
e == JobStateStopped ||
e == JobStateTimeout
}

View File

@ -5,39 +5,21 @@ import (
"io"
)
// Format of `data.json` files.
type JobData map[string]*JobMetric
type JobData map[string]map[MetricScope]*JobMetric
type JobMetric struct {
Unit string `json:"unit"`
Scope MetricScope `json:"scope"`
Timestep int `json:"timestep"`
Series []*MetricSeries `json:"series"`
Unit string `json:"unit"`
Scope MetricScope `json:"scope"`
Timestep int `json:"timestep"`
Series []Series `json:"series"`
StatisticsSeries *StatsSeries `json:"statisticsSeries"`
}
type MetricScope string
const (
MetricScopeNode MetricScope = "node"
MetricScopeSocket MetricScope = "socket"
MetricScopeCpu MetricScope = "cpu"
)
func (e *MetricScope) UnmarshalGQL(v interface{}) error {
str, ok := v.(string)
if !ok {
return fmt.Errorf("enums must be strings")
}
*e = MetricScope(str)
if *e != "node" && *e != "socket" && *e != "cpu" {
return fmt.Errorf("%s is not a valid MetricScope", str)
}
return nil
}
func (e MetricScope) MarshalGQL(w io.Writer) {
fmt.Fprintf(w, "\"%s\"", e)
type Series struct {
Hostname string `json:"hostname"`
Id *int `json:"id,omitempty"`
Statistics *MetricStatistics `json:"statistics"`
Data []Float `json:"data"`
}
type MetricStatistics struct {
@ -46,33 +28,51 @@ type MetricStatistics struct {
Max float64 `json:"max"`
}
type MetricSeries struct {
NodeID string `json:"node_id"`
Statistics *MetricStatistics `json:"statistics"`
Data []Float `json:"data"`
type StatsSeries struct {
Mean []Float `json:"mean"`
Min []Float `json:"min"`
Max []Float `json:"max"`
Percentiles map[int][]Float `json:"percentiles,omitempty"`
}
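
`calcStatisticsSeries`, called from `LoadData` above, is not shown in this excerpt. The sketch below only illustrates the kind of per-timestep aggregation a `StatsSeries` holds (mean/min/max over all series of a metric); it ignores NaN values and the `Percentiles` field, and assumes that `Float`'s underlying type is float64 and that all series share the same length.

```go
package schema // sketch: would live alongside the types above

import "math"

// buildStatsSeries aggregates all series of one metric into a StatsSeries.
func buildStatsSeries(series []Series) *StatsSeries {
	if len(series) == 0 {
		return nil
	}
	n := len(series[0].Data)
	ss := &StatsSeries{
		Mean: make([]Float, n),
		Min:  make([]Float, n),
		Max:  make([]Float, n),
	}
	for t := 0; t < n; t++ {
		min, max, sum := math.MaxFloat64, -math.MaxFloat64, 0.0
		for _, s := range series {
			x := float64(s.Data[t])
			sum += x
			if x < min {
				min = x
			}
			if x > max {
				max = x
			}
		}
		ss.Mean[t] = Float(sum / float64(len(series)))
		ss.Min[t] = Float(min)
		ss.Max[t] = Float(max)
	}
	return ss
}
```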
type JobMetaStatistics struct {
Unit string `json:"unit"`
Avg float64 `json:"avg"`
Min float64 `json:"min"`
Max float64 `json:"max"`
type MetricScope string
const (
MetricScopeNode MetricScope = "node"
MetricScopeSocket MetricScope = "socket"
MetricScopeCpu MetricScope = "cpu"
MetricScopeHWThread MetricScope = "hwthread"
)
var metricScopeGranularity map[MetricScope]int = map[MetricScope]int{
MetricScopeNode: 1,
MetricScopeSocket: 2,
MetricScopeCpu: 3,
MetricScopeHWThread: 4,
}
// Format of `meta.json` files.
type JobMeta struct {
JobId string `json:"job_id"`
UserId string `json:"user_id"`
ProjectId string `json:"project_id"`
ClusterId string `json:"cluster_id"`
NumNodes int `json:"num_nodes"`
JobState string `json:"job_state"`
StartTime int64 `json:"start_time"`
Duration int64 `json:"duration"`
Nodes []string `json:"nodes"`
Tags []struct {
Name string `json:"name"`
Type string `json:"type"`
} `json:"tags"`
Statistics map[string]*JobMetaStatistics `json:"statistics"`
func (e *MetricScope) MaxGranularity(other MetricScope) MetricScope {
a := metricScopeGranularity[*e]
b := metricScopeGranularity[other]
if a < b {
return *e
}
return other
}
func (e *MetricScope) UnmarshalGQL(v interface{}) error {
str, ok := v.(string)
if !ok {
return fmt.Errorf("enums must be strings")
}
*e = MetricScope(str)
if _, ok := metricScopeGranularity[*e]; !ok {
return fmt.Errorf("%s is not a valid MetricScope", str)
}
return nil
}
func (e MetricScope) MarshalGQL(w io.Writer) {
fmt.Fprintf(w, "\"%s\"", e)
}
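
The granularity map above orders scopes from coarse (`node` = 1) to fine (`hwthread` = 4), so `MaxGranularity` returns the coarser of its receiver and argument. A small illustrative test (not part of this diff):

```go
package schema_test // illustrative only

import (
	"testing"

	"github.com/ClusterCockpit/cc-jobarchive/schema"
)

func TestMaxGranularity(t *testing.T) {
	node, socket := schema.MetricScopeNode, schema.MetricScopeSocket

	// node (coarser) wins over socket
	if got := node.MaxGranularity(socket); got != schema.MetricScopeNode {
		t.Fatalf("expected node, got %s", got)
	}
	// socket (coarser) wins over hwthread
	if got := socket.MaxGranularity(schema.MetricScopeHWThread); got != schema.MetricScopeSocket {
		t.Fatalf("expected socket, got %s", got)
	}
}
```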

461
server.go
View File

@ -3,19 +3,23 @@ package main
import (
"encoding/json"
"flag"
"fmt"
"log"
"net/http"
"net/url"
"os"
"path/filepath"
"time"
"strconv"
"github.com/99designs/gqlgen/graphql/handler"
"github.com/99designs/gqlgen/graphql/playground"
"github.com/ClusterCockpit/cc-jobarchive/api"
"github.com/ClusterCockpit/cc-jobarchive/auth"
"github.com/ClusterCockpit/cc-jobarchive/config"
"github.com/ClusterCockpit/cc-jobarchive/graph"
"github.com/ClusterCockpit/cc-jobarchive/graph/generated"
"github.com/ClusterCockpit/cc-jobarchive/graph/model"
"github.com/ClusterCockpit/cc-jobarchive/metricdata"
"github.com/ClusterCockpit/cc-jobarchive/schema"
"github.com/ClusterCockpit/cc-jobarchive/templates"
"github.com/gorilla/handlers"
"github.com/gorilla/mux"
"github.com/jmoiron/sqlx"
@ -24,86 +28,423 @@ import (
var db *sqlx.DB
func main() {
var reinitDB bool
var port, staticFiles, jobDBFile string
// Format of the configuration file. See below for the defaults.
type ProgramConfig struct {
// Address where the http (or https) server will listen on (for example: 'localhost:80').
Addr string `json:"addr"`
flag.StringVar(&port, "port", "8080", "Port on which to listen")
flag.StringVar(&staticFiles, "static-files", "./frontend/public", "Directory whose contents shall be served as static files")
flag.StringVar(&jobDBFile, "job-db", "./var/job.db", "SQLite 3 Jobs Database File")
flag.BoolVar(&reinitDB, "init-db", false, "Initialize new SQLite Database")
// Disable authentication (for everything: API, Web-UI, ...)
DisableAuthentication bool `json:"disable-authentication"`
// Folder where static assets can be found, will be served directly
StaticFiles string `json:"static-files"`
// Currently only SQLite3 is supported, so this should be a filename
DB string `json:"db"`
// Path to the job-archive
JobArchive string `json:"job-archive"`
// Make the /api/jobs/stop_job endpoint do the heavy work in the background.
AsyncArchiving bool `json:"async-archive"`
// Keep all metric data in the metric data repositories,
// do not write to the job-archive.
DisableArchive bool `json:"disable-archive"`
// For LDAP authentication and user synchronization.
LdapConfig *auth.LdapConfig `json:"ldap"`
// If both of these options are non-empty, serve HTTPS using these certificates.
HttpsCertFile string `json:"https-cert-file"`
HttpsKeyFile string `json:"https-key-file"`
// If overwritten, at least all the options in the defaults below must
// be provided! Most options here can be overwritten by the user.
UiDefaults map[string]interface{} `json:"ui-defaults"`
// Where to store MachineState files
MachineStateDir string `json:"machine-state-dir"`
}
var programConfig ProgramConfig = ProgramConfig{
Addr: "0.0.0.0:8080",
DisableAuthentication: false,
StaticFiles: "./frontend/public",
DB: "./var/job.db",
JobArchive: "./var/job-archive",
AsyncArchiving: true,
DisableArchive: false,
LdapConfig: &auth.LdapConfig{
Url: "ldap://localhost",
UserBase: "ou=hpc,dc=rrze,dc=uni-erlangen,dc=de",
SearchDN: "cn=admin,dc=rrze,dc=uni-erlangen,dc=de",
UserBind: "uid={username},ou=hpc,dc=rrze,dc=uni-erlangen,dc=de",
UserFilter: "(&(objectclass=posixAccount)(uid=*))",
},
HttpsCertFile: "",
HttpsKeyFile: "",
UiDefaults: map[string]interface{}{
"analysis_view_histogramMetrics": []string{"flops_any", "mem_bw", "mem_used"},
"analysis_view_scatterPlotMetrics": [][]string{{"flops_any", "mem_bw"}, {"flops_any", "cpu_load"}, {"cpu_load", "mem_bw"}},
"job_view_nodestats_selectedMetrics": []string{"flops_any", "mem_bw", "mem_used"},
"job_view_polarPlotMetrics": []string{"flops_any", "mem_bw", "mem_used", "net_bw", "file_bw"},
"job_view_selectedMetrics": []string{"flops_any", "mem_bw", "mem_used"},
"plot_general_colorBackground": true,
"plot_general_colorscheme": []string{"#00bfff", "#0000ff", "#ff00ff", "#ff0000", "#ff8000", "#ffff00", "#80ff00"},
"plot_general_lineWidth": 1,
"plot_list_jobsPerPage": 10,
"plot_list_selectedMetrics": []string{"cpu_load", "mem_used", "flops_any", "mem_bw", "clock"},
"plot_view_plotsPerRow": 4,
"plot_view_showPolarplot": true,
"plot_view_showRoofline": true,
"plot_view_showStatTable": true,
},
MachineStateDir: "./var/machine-state",
}
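
Because the config file is unmarshalled into the pre-populated `programConfig` (see `main` below), keys missing from the file keep their defaults. A minimal, self-contained demonstration of that override behaviour with a hypothetical two-field config:

```go
package main // illustrative only

import (
	"encoding/json"
	"fmt"
)

type demoConfig struct {
	Addr       string `json:"addr"`
	JobArchive string `json:"job-archive"`
}

func main() {
	// Start from the defaults, then unmarshal a partial config on top of them.
	cfg := demoConfig{Addr: "0.0.0.0:8080", JobArchive: "./var/job-archive"}
	if err := json.Unmarshal([]byte(`{"addr": "localhost:8443"}`), &cfg); err != nil {
		panic(err)
	}
	fmt.Println(cfg.Addr, cfg.JobArchive) // localhost:8443 ./var/job-archive
}
```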
func main() {
var flagReinitDB, flagStopImmediately, flagSyncLDAP bool
var flagConfigFile string
var flagNewUser, flagDelUser, flagGenJWT string
flag.BoolVar(&flagReinitDB, "init-db", false, "Go through job-archive and re-initialize `job`, `tag`, and `jobtag` tables")
flag.BoolVar(&flagSyncLDAP, "sync-ldap", false, "Sync the `user` table with ldap")
flag.BoolVar(&flagStopImmediately, "no-server", false, "Do not start a server, stop right after initialization and argument handling")
flag.StringVar(&flagConfigFile, "config", "", "Location of the config file for this server (overwrites the defaults)")
flag.StringVar(&flagNewUser, "add-user", "", "Add a new user. Argument format: `<username>:[admin|api]:<password>`")
flag.StringVar(&flagDelUser, "del-user", "", "Remove user by username")
flag.StringVar(&flagGenJWT, "jwt", "", "Generate and print a JWT for the user specified by the username")
flag.Parse()
var err error
db, err = sqlx.Open("sqlite3", jobDBFile)
if err != nil {
log.Fatal(err)
}
// See https://github.com/mattn/go-sqlite3/issues/274
db.SetMaxOpenConns(1)
defer db.Close()
if reinitDB {
if err = initDB(db, metricdata.JobArchivePath); err != nil {
if flagConfigFile != "" {
data, err := os.ReadFile(flagConfigFile)
if err != nil {
log.Fatal(err)
}
if err := json.Unmarshal(data, &programConfig); err != nil {
log.Fatal(err)
}
}
config.Clusters, err = loadClusters()
var err error
// This might need to change for other databases:
db, err = sqlx.Open("sqlite3", fmt.Sprintf("%s?_foreign_keys=on", programConfig.DB))
if err != nil {
log.Fatal(err)
}
// Only for sqlite, not needed for any other database:
db.SetMaxOpenConns(1)
// Initialize sub-modules...
if !programConfig.DisableAuthentication {
if err := auth.Init(db, programConfig.LdapConfig); err != nil {
log.Fatal(err)
}
if flagNewUser != "" {
if err := auth.AddUserToDB(db, flagNewUser); err != nil {
log.Fatal(err)
}
}
if flagDelUser != "" {
if err := auth.DelUserFromDB(db, flagDelUser); err != nil {
log.Fatal(err)
}
}
if flagSyncLDAP {
auth.SyncWithLDAP(db)
}
if flagGenJWT != "" {
user, err := auth.FetchUserFromDB(db, flagGenJWT)
if err != nil {
log.Fatal(err)
}
if !user.IsAPIUser {
log.Println("warning: that user does not have the API role")
}
jwt, err := auth.ProvideJWT(user)
if err != nil {
log.Fatal(err)
}
fmt.Printf("JWT for '%s': %s\n", user.Username, jwt)
}
} else if flagNewUser != "" || flagDelUser != "" {
log.Fatalln("arguments --add-user and --del-user can only be used if authentication is enabled")
}
if err := config.Init(db, !programConfig.DisableAuthentication, programConfig.UiDefaults, programConfig.JobArchive); err != nil {
log.Fatal(err)
}
if err := metricdata.Init(programConfig.JobArchive, programConfig.DisableArchive); err != nil {
log.Fatal(err)
}
if flagReinitDB {
if err := initDB(db, programConfig.JobArchive); err != nil {
log.Fatal(err)
}
}
if flagStopImmediately {
return
}
// Build routes...
resolver := &graph.Resolver{DB: db}
graphQLEndpoint := handler.NewDefaultServer(generated.NewExecutableSchema(generated.Config{Resolvers: resolver}))
// graphQLEndpoint.SetRecoverFunc(func(ctx context.Context, err interface{}) error {
// switch e := err.(type) {
// case string:
// return fmt.Errorf("panic: %s", e)
// case error:
// return fmt.Errorf("panic caused by: %w", e)
// }
// return errors.New("internal server error (panic)")
// })
graphQLPlayground := playground.Handler("GraphQL playground", "/query")
api := &api.RestApi{
DB: db,
AsyncArchiving: programConfig.AsyncArchiving,
Resolver: resolver,
MachineStateDir: programConfig.MachineStateDir,
}
handleGetLogin := func(rw http.ResponseWriter, r *http.Request) {
templates.Render(rw, r, "login.html", &templates.Page{
Title: "Login",
Login: &templates.LoginPage{},
})
}
r := mux.NewRouter()
loggedRouter := handlers.LoggingHandler(os.Stdout, r)
r.NotFoundHandler = http.HandlerFunc(func(rw http.ResponseWriter, r *http.Request) {
templates.Render(rw, r, "404.html", &templates.Page{
Title: "Not found",
})
})
srv := handler.NewDefaultServer(generated.NewExecutableSchema(generated.Config{
Resolvers: &graph.Resolver{DB: db}}))
r.HandleFunc("/graphql-playground", playground.Handler("GraphQL playground", "/query"))
r.Handle("/query", srv)
r.Handle("/playground", graphQLPlayground)
r.Handle("/login", auth.Login(db)).Methods(http.MethodPost)
r.HandleFunc("/login", handleGetLogin).Methods(http.MethodGet)
r.HandleFunc("/logout", auth.Logout).Methods(http.MethodPost)
r.HandleFunc("/config.json", config.ServeConfig).Methods("GET")
r.HandleFunc("/api/start-job", startJob).Methods("POST")
r.HandleFunc("/api/stop-job", stopJob).Methods("POST")
if len(staticFiles) != 0 {
r.PathPrefix("/").Handler(http.FileServer(http.Dir(staticFiles)))
secured := r.PathPrefix("/").Subrouter()
if !programConfig.DisableAuthentication {
secured.Use(auth.Auth)
}
secured.Handle("/query", graphQLEndpoint)
log.Printf("GraphQL playground: http://localhost:%s/graphql-playground", port)
log.Printf("Home: http://localhost:%s/index.html", port)
log.Fatal(http.ListenAndServe("127.0.0.1:"+port,
handlers.CORS(handlers.AllowedHeaders([]string{"X-Requested-With", "Content-Type", "Authorization"}),
handlers.AllowedMethods([]string{"GET", "POST", "HEAD", "OPTIONS"}),
handlers.AllowedOrigins([]string{"*"}))(loggedRouter)))
}
func loadClusters() ([]*model.Cluster, error) {
entries, err := os.ReadDir(metricdata.JobArchivePath)
if err != nil {
return nil, err
}
clusters := []*model.Cluster{}
for _, de := range entries {
bytes, err := os.ReadFile(filepath.Join(metricdata.JobArchivePath, de.Name(), "cluster.json"))
secured.HandleFunc("/", func(rw http.ResponseWriter, r *http.Request) {
conf, err := config.GetUIConfig(r)
if err != nil {
return nil, err
http.Error(rw, err.Error(), http.StatusInternalServerError)
return
}
var cluster model.Cluster
if err := json.Unmarshal(bytes, &cluster); err != nil {
return nil, err
infos := map[string]interface{}{
"clusters": config.Clusters,
"username": "",
"admin": true,
}
if cluster.FilterRanges.StartTime.To.IsZero() {
cluster.FilterRanges.StartTime.To = time.Unix(0, 0)
if user := auth.GetUser(r.Context()); user != nil {
infos["username"] = user.Username
infos["admin"] = user.IsAdmin
}
clusters = append(clusters, &cluster)
templates.Render(rw, r, "home.html", &templates.Page{
Title: "ClusterCockpit",
Config: conf,
Infos: infos,
})
})
monitoringRoutes(secured, resolver)
api.MountRoutes(secured)
r.PathPrefix("/").Handler(http.FileServer(http.Dir(programConfig.StaticFiles)))
handler := handlers.CORS(
handlers.AllowedHeaders([]string{"X-Requested-With", "Content-Type", "Authorization"}),
handlers.AllowedMethods([]string{"GET", "POST", "HEAD", "OPTIONS"}),
handlers.AllowedOrigins([]string{"*"}))(handlers.LoggingHandler(os.Stdout, handlers.CompressHandler(r)))
// Start http or https server
if programConfig.HttpsCertFile != "" && programConfig.HttpsKeyFile != "" {
log.Printf("HTTPS server running at %s...", programConfig.Addr)
err = http.ListenAndServeTLS(programConfig.Addr, programConfig.HttpsCertFile, programConfig.HttpsKeyFile, handler)
} else {
log.Printf("HTTP server running at %s...", programConfig.Addr)
err = http.ListenAndServe(programConfig.Addr, handler)
}
log.Fatal(err)
}
func monitoringRoutes(router *mux.Router, resolver *graph.Resolver) {
buildFilterPresets := func(query url.Values) map[string]interface{} {
filterPresets := map[string]interface{}{}
if query.Get("cluster") != "" {
filterPresets["cluster"] = query.Get("cluster")
}
if query.Get("project") != "" {
filterPresets["project"] = query.Get("project")
}
if query.Get("state") != "" && schema.JobState(query.Get("state")).Valid() {
filterPresets["state"] = query.Get("state")
}
if rawtags, ok := query["tag"]; ok {
tags := make([]int, len(rawtags))
for i, tid := range rawtags {
var err error
tags[i], err = strconv.Atoi(tid)
if err != nil {
tags[i] = -1
}
}
filterPresets["tags"] = tags
}
return filterPresets
}
return clusters, nil
router.HandleFunc("/monitoring/jobs/", func(rw http.ResponseWriter, r *http.Request) {
conf, err := config.GetUIConfig(r)
if err != nil {
http.Error(rw, err.Error(), http.StatusInternalServerError)
return
}
templates.Render(rw, r, "monitoring/jobs.html", &templates.Page{
Title: "Jobs - ClusterCockpit",
Config: conf,
FilterPresets: buildFilterPresets(r.URL.Query()),
})
})
router.HandleFunc("/monitoring/job/{id:[0-9]+}", func(rw http.ResponseWriter, r *http.Request) {
conf, err := config.GetUIConfig(r)
if err != nil {
http.Error(rw, err.Error(), http.StatusInternalServerError)
return
}
id := mux.Vars(r)["id"]
job, err := resolver.Query().Job(r.Context(), id)
if err != nil {
http.Error(rw, err.Error(), http.StatusNotFound)
return
}
templates.Render(rw, r, "monitoring/job.html", &templates.Page{
Title: fmt.Sprintf("Job %d - ClusterCockpit", job.JobID),
Config: conf,
Infos: map[string]interface{}{
"id": id,
"jobId": job.JobID,
"clusterId": job.Cluster,
},
})
})
router.HandleFunc("/monitoring/users/", func(rw http.ResponseWriter, r *http.Request) {
conf, err := config.GetUIConfig(r)
if err != nil {
http.Error(rw, err.Error(), http.StatusInternalServerError)
return
}
templates.Render(rw, r, "monitoring/users.html", &templates.Page{
Title: "Users - ClusterCockpit",
Config: conf,
})
})
router.HandleFunc("/monitoring/user/{id}", func(rw http.ResponseWriter, r *http.Request) {
conf, err := config.GetUIConfig(r)
if err != nil {
http.Error(rw, err.Error(), http.StatusInternalServerError)
return
}
id := mux.Vars(r)["id"]
// TODO: One could check if the user exists, but that would be unhelpful if authentication
// is disabled or the user does not exist but has started jobs.
templates.Render(rw, r, "monitoring/user.html", &templates.Page{
Title: fmt.Sprintf("User %s - ClusterCockpit", id),
Config: conf,
Infos: map[string]interface{}{"username": id},
FilterPresets: buildFilterPresets(r.URL.Query()),
})
})
router.HandleFunc("/monitoring/analysis/", func(rw http.ResponseWriter, r *http.Request) {
conf, err := config.GetUIConfig(r)
if err != nil {
http.Error(rw, err.Error(), http.StatusInternalServerError)
return
}
filterPresets := map[string]interface{}{}
query := r.URL.Query()
if query.Get("cluster") != "" {
filterPresets["clusterId"] = query.Get("cluster")
}
templates.Render(rw, r, "monitoring/analysis.html", &templates.Page{
Title: "Analysis View - ClusterCockpit",
Config: conf,
FilterPresets: filterPresets,
})
})
router.HandleFunc("/monitoring/systems/", func(rw http.ResponseWriter, r *http.Request) {
conf, err := config.GetUIConfig(r)
if err != nil {
http.Error(rw, err.Error(), http.StatusInternalServerError)
return
}
filterPresets := map[string]interface{}{}
query := r.URL.Query()
if query.Get("cluster") != "" {
filterPresets["clusterId"] = query.Get("cluster")
}
templates.Render(rw, r, "monitoring/systems.html", &templates.Page{
Title: "System View - ClusterCockpit",
Config: conf,
FilterPresets: filterPresets,
})
})
router.HandleFunc("/monitoring/node/{clusterId}/{nodeId}", func(rw http.ResponseWriter, r *http.Request) {
conf, err := config.GetUIConfig(r)
if err != nil {
http.Error(rw, err.Error(), http.StatusInternalServerError)
return
}
vars := mux.Vars(r)
templates.Render(rw, r, "monitoring/node.html", &templates.Page{
Title: fmt.Sprintf("Node %s - ClusterCockpit", vars["nodeId"]),
Config: conf,
Infos: map[string]interface{}{
"nodeId": vars["nodeId"],
"clusterId": vars["clusterId"],
},
})
})
}

10
templates/404.html Normal file
View File

@ -0,0 +1,10 @@
{{template "base.html" .}}
{{define "content"}}
<div class="row">
<div class="col">
<div class="alert alert-error" role="alert">
404: Not found
</div>
</div>
</div>
{{end}}

28
templates/base.html Normal file
View File

@ -0,0 +1,28 @@
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset='utf-8'>
<meta name='viewport' content='width=device-width,initial-scale=1'>
<title>{{.Title}}</title>
<link rel='icon' type='image/png' href='/favicon.png'>
<link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/bootstrap@5.1.1/dist/css/bootstrap.min.css" integrity="sha384-F3w7mX95PdgyTmZZMECAngseQB83DfGTowi0iMjiWaeVhAn4FJkqJByhZMI3AhiU" crossorigin="anonymous">
<link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/bootstrap-icons@1.4.1/font/bootstrap-icons.css">
<link rel='stylesheet' href='/global.css'>
<link rel='stylesheet' href='/uPlot.min.css'>
{{block "stylesheets" .}}{{end}}
</head>
<body>
<div class="container">
<div class="row">
<div class="col">
{{block "content" .}}
Whoops, you should not see this...
{{end}}
</div>
</div>
</div>
{{block "javascript" .}}{{end}}
</body>
</html>

57
templates/home.html Normal file
View File

@ -0,0 +1,57 @@
{{define "content"}}
<div class="row">
<div class="col">
{{if .Infos.username}}
<i class="bi bi-person-circle"></i> {{ .Infos.username }}
{{if .Infos.admin}}
<span class="badge bg-primary">Admin</span>
{{end}}
{{end}}
</div>
<div class="col" style="text-align: right;">
<form method="post" action="/logout">
<button type="submit" class="btn btn-primary">Logout</button>
</form>
</div>
</div>
<div class="row">
{{if .Infos.admin}}
<div class="col-4">
<ul>
<li><a href="/monitoring/jobs/">All jobs</a></li>
<li><a href="/monitoring/users/">All users</a></li>
</ul>
</div>
{{else}}
<div class="col-4">
<ul>
<li><a href="/monitoring/jobs/">My jobs</a></li>
<li><a href="/monitoring/user/{{.Infos.username}}">My user view</a></li>
</ul>
</div>
{{end}}
<div class="col-8">
<h2>Clusters</h2>
<table class="table">
<thead>
<tr>
<th>Name</th>
<th>Jobs</th>
<th>System View</th>
<th>Analysis View</th>
</tr>
</thead>
<tbody>
{{range .Infos.clusters}}
<tr>
<td>{{.Name}}</td>
<td><a href="/monitoring/jobs/?cluster={{.Name}}">Jobs</a></td>
<td><a href="/monitoring/systems/?cluster={{.Name}}">System View</a></td>
<td><a href="/monitoring/analysis/?cluster={{.Name}}">Analysis View</a></td>
</tr>
{{end}}
</tbody>
</table>
</div>
</div>
{{end}}

47
templates/login.html Normal file
View File

@ -0,0 +1,47 @@
{{define "content"}}
<div class="row">
<div class="col">
<h1>
ClusterCockpit Login
</h1>
</div>
</div>
<div class="row">
<div class="col">
{{if .Login.Error}}
<div class="alert alert-warning" role="alert">
{{.Login.Error}}
</div>
{{end}}
{{if .Login.Info}}
<div class="alert alert-success" role="alert">
{{.Login.Info}}
</div>
{{end}}
</div>
</div>
<div class="row">
<div class="col">
<form method="post" action="/login">
<div class="mb-3">
<label class="form-label" for="username">Username</label>
<input class="form-control" type="text" id="username" name="username">
</div>
<div class="mb-3">
<label class="form-label" for="password">Password</label>
<input class="form-control" type="password" id="password" name="password">
</div>
<button type="submit" class="btn btn-primary">Login</button>
</form>
</div>
</div>
<br/>
<div class="row">
<div class="col">
<form method="post" action="/logout">
<button type="submit" class="btn btn-primary">Logout</button>
</form>
</div>
</div>
{{end}}

View File

@ -0,0 +1,18 @@
{{define "content"}}
<div id="svelte-app"></div>
{{end}}
{{define "stylesheets"}}
<link rel='stylesheet' href='/build/analysis.css'>
{{end}}
{{define "javascript"}}
<script>
const filterPresets = {{ .FilterPresets }};
const clusterCockpitConfigPromise = Promise.resolve({
plot_view_plotsPerRow: {{ .Config.plot_view_plotsPerRow }},
analysis_view_histogramMetrics: {{ .Config.analysis_view_histogramMetrics }},
analysis_view_scatterPlotMetrics: {{ .Config.analysis_view_scatterPlotMetrics }}
});
</script>
<script src='/build/analysis.js'></script>
{{end}}

View File

@ -0,0 +1,29 @@
{{define "content"}}
<div id="svelte-app"></div>
{{end}}
{{define "stylesheets"}}
<link rel='stylesheet' href='/build/job.css'>
{{end}}
{{define "javascript"}}
<script>
const jobInfos = {
id: "{{ .Infos.id }}",
jobId: "{{ .Infos.jobId }}",
clusterId: "{{ .Infos.clusterId }}"
};
const clusterCockpitConfigPromise = Promise.resolve({
plot_general_colorscheme: {{ .Config.plot_general_colorscheme }},
plot_general_lineWidth: {{ .Config.plot_general_lineWidth }},
plot_general_colorBackground: {{ .Config.plot_general_colorBackground }},
plot_view_showRoofline: {{ .Config.plot_view_showRoofline }},
plot_view_showPolarplot: {{ .Config.plot_view_showPolarplot }},
plot_view_showStatTable: {{ .Config.plot_view_showStatTable }},
plot_view_plotsPerRow: {{ .Config.plot_view_plotsPerRow }},
job_view_selectedMetrics: {{ .Config.job_view_selectedMetrics }},
job_view_nodestats_selectedMetrics: {{ .Config.job_view_nodestats_selectedMetrics }},
job_view_polarPlotMetrics: {{ .Config.job_view_polarPlotMetrics }},
});
</script>
<script src='/build/job.js'></script>
{{end}}

View File

@ -0,0 +1,14 @@
{{define "content"}}
<div id="svelte-app"></div>
{{end}}
{{define "stylesheets"}}
<link rel='stylesheet' href='/build/jobs.css'>
{{end}}
{{define "javascript"}}
<script>
const filterPresets = {{ .FilterPresets }};
const clusterCockpitConfig = {{ .Config }};
</script>
<script src='/build/jobs.js'></script>
{{end}}

View File

@ -0,0 +1,21 @@
{{define "content"}}
<div id="svelte-app"></div>
{{end}}
{{define "stylesheets"}}
<link rel='stylesheet' href='/build/node.css'>
{{end}}
{{define "javascript"}}
<script>
const nodeInfos = {
nodeId: "{{ .Infos.nodeId }}",
clusterId: "{{ .Infos.clusterId }}"
};
const clusterCockpitConfigPromise = Promise.resolve({
plot_general_colorscheme: {{ .Config.plot_general_colorscheme }},
plot_general_lineWidth: {{ .Config.plot_general_lineWidth }},
plot_general_colorBackground: {{ .Config.plot_general_colorBackground }},
});
</script>
<script src='/build/node.js'></script>
{{end}}

View File

@ -0,0 +1,19 @@
{{define "content"}}
<div id="svelte-app"></div>
{{end}}
{{define "stylesheets"}}
<link rel='stylesheet' href='/build/systems.css'>
{{end}}
{{define "javascript"}}
<script>
const filterPresets = {{ .FilterPresets }};
const clusterCockpitConfigPromise = Promise.resolve({
plot_view_plotsPerRow: {{ .Config.plot_view_plotsPerRow }},
plot_general_colorscheme: {{ .Config.plot_general_colorscheme }},
plot_general_lineWidth: {{ .Config.plot_general_lineWidth }},
plot_general_colorBackground: {{ .Config.plot_general_colorBackground }},
});
</script>
<script src='/build/systems.js'></script>
{{end}}

View File

@ -0,0 +1,15 @@
{{define "content"}}
<div id="svelte-app"></div>
{{end}}
{{define "stylesheets"}}
<link rel='stylesheet' href='/build/user.css'>
{{end}}
{{define "javascript"}}
<script>
const userInfos = {{ .Infos }};
const filterPresets = {{ .FilterPresets }};
const clusterCockpitConfig = {{ .Config }};
</script>
<script src='/build/user.js'></script>
{{end}}

View File

@ -0,0 +1,14 @@
{{define "content"}}
<div id="svelte-app"></div>
{{end}}
{{define "stylesheets"}}
<link rel='stylesheet' href='/build/users.css'>
{{end}}
{{define "javascript"}}
<script>
const filterPresets = null;
const clusterCockpitConfigPromise = Promise.resolve({});
</script>
<script src='/build/users.js'></script>
{{end}}

56
templates/templates.go Normal file
View File

@ -0,0 +1,56 @@
package templates
import (
"html/template"
"log"
"net/http"
)
var templatesDir string
var debugMode bool = true
var templates map[string]*template.Template = map[string]*template.Template{}
type Page struct {
Title string
Login *LoginPage
FilterPresets map[string]interface{}
Infos map[string]interface{}
Config map[string]interface{}
}
type LoginPage struct {
Error string
Info string
}
func init() {
templatesDir = "./templates/"
base := template.Must(template.ParseFiles(templatesDir + "base.html"))
files := []string{
"home.html", "404.html", "login.html",
"monitoring/jobs.html", "monitoring/job.html",
"monitoring/users.html", "monitoring/user.html",
"monitoring/analysis.html",
"monitoring/systems.html",
"monitoring/node.html",
}
for _, file := range files {
templates[file] = template.Must(template.Must(base.Clone()).ParseFiles(templatesDir + file))
}
}
func Render(rw http.ResponseWriter, r *http.Request, file string, page *Page) {
t, ok := templates[file]
if !ok {
panic("templates must be predefinied!")
}
if debugMode {
t = template.Must(template.ParseFiles(templatesDir+"base.html", templatesDir+file))
}
if err := t.Execute(rw, page); err != nil {
log.Printf("template error: %s\n", err.Error())
}
}

40
utils/add-job.mjs Normal file
View File

@ -0,0 +1,40 @@
import fetch from 'node-fetch'
// Just for testing
const job = {
jobId: 123,
user: 'lou',
project: 'testproj',
cluster: 'heidi',
partition: 'default',
arrayJobId: 0,
numNodes: 1,
numHwthreads: 8,
numAcc: 0,
exclusive: 1,
monitoringStatus: 1,
smt: 1,
jobState: 'running',
duration: 2*60*60,
tags: [],
resources: [
{
hostname: 'heidi',
hwthreads: [0, 1, 2, 3, 4, 5, 6, 7]
}
],
metaData: null,
startTime: 1641427200
}
fetch('http://localhost:8080/api/jobs/start_job/', {
method: 'POST',
body: JSON.stringify(job),
headers: {
'Content-Type': 'application/json',
'Authorization': 'Bearer eyJhbGciOiJFZERTQSIsInR5cCI6IkpXVCJ9.eyJpc19hZG1pbiI6dHJ1ZSwiaXNfYXBpIjpmYWxzZSwic3ViIjoibG91In0.nY6dCgLSdm7zXz1xPkrb_3JnnUCgExXeXcrTlAAySs4p72VKJhmzzC1RxgkJE26l8tDYUilM-o-urzlaqK5aDA'
}
})
.then(res => res.status == 200 ? res.json() : res.text())
.then(res => console.log(res))

View File

@ -1 +0,0 @@
{"analysis_view_histogramMetrics":["flops_any","mem_bw","mem_used"],"analysis_view_scatterPlotMetrics":[["flops_any","mem_bw"],["flops_any","cpu_load"],["cpu_load","mem_bw"]],"job_view_nodestats_selectedMetrics":["flops_any","mem_bw","mem_used"],"job_view_polarPlotMetrics":["flops_any","mem_bw","mem_used","net_bw","file_bw"],"job_view_selectedMetrics":["flops_any","mem_bw","mem_used"],"plot_general_colorBackground":true,"plot_general_colorscheme":["#00bfff","#0000ff","#ff00ff","#ff0000","#ff8000","#ffff00","#80ff00"],"plot_general_lineWidth":1,"plot_list_jobsPerPage":10,"plot_list_selectedMetrics":["cpu_load","mem_used","flops_any","mem_bw","clock"],"plot_view_plotsPerRow":4,"plot_view_showPolarplot":true,"plot_view_showRoofline":true,"plot_view_showStatTable":true}