Refactor directory structure

Jan Eitzinger
2022-06-21 17:52:36 +02:00
parent 45359cca9d
commit 81819db436
54 changed files with 767 additions and 454 deletions

internal/api/rest.go Normal file

@@ -0,0 +1,642 @@
package api
import (
"bufio"
"context"
"database/sql"
"encoding/json"
"errors"
"fmt"
"io"
"net/http"
"os"
"path/filepath"
"strconv"
"strings"
"sync"
"time"
"github.com/ClusterCockpit/cc-backend/internal/auth"
"github.com/ClusterCockpit/cc-backend/internal/config"
"github.com/ClusterCockpit/cc-backend/internal/graph"
"github.com/ClusterCockpit/cc-backend/internal/graph/model"
"github.com/ClusterCockpit/cc-backend/internal/metricdata"
"github.com/ClusterCockpit/cc-backend/internal/repository"
"github.com/ClusterCockpit/cc-backend/pkg/log"
"github.com/ClusterCockpit/cc-backend/pkg/schema"
"github.com/gorilla/mux"
)
type RestApi struct {
JobRepository *repository.JobRepository
Resolver *graph.Resolver
Authentication *auth.Authentication
MachineStateDir string
OngoingArchivings sync.WaitGroup
}
func (api *RestApi) MountRoutes(r *mux.Router) {
r = r.PathPrefix("/api").Subrouter()
r.StrictSlash(true)
r.HandleFunc("/jobs/start_job/", api.startJob).Methods(http.MethodPost, http.MethodPut)
r.HandleFunc("/jobs/stop_job/", api.stopJob).Methods(http.MethodPost, http.MethodPut)
r.HandleFunc("/jobs/stop_job/{id}", api.stopJob).Methods(http.MethodPost, http.MethodPut)
r.HandleFunc("/jobs/import/", api.importJob).Methods(http.MethodPost, http.MethodPut)
r.HandleFunc("/jobs/", api.getJobs).Methods(http.MethodGet)
// r.HandleFunc("/jobs/{id}", api.getJob).Methods(http.MethodGet)
r.HandleFunc("/jobs/tag_job/{id}", api.tagJob).Methods(http.MethodPost, http.MethodPatch)
r.HandleFunc("/jobs/metrics/{id}", api.getJobMetrics).Methods(http.MethodGet)
if api.Authentication != nil {
r.HandleFunc("/jwt/", api.getJWT).Methods(http.MethodGet)
r.HandleFunc("/users/", api.createUser).Methods(http.MethodPost, http.MethodPut)
r.HandleFunc("/users/", api.getUsers).Methods(http.MethodGet)
r.HandleFunc("/users/", api.deleteUser).Methods(http.MethodDelete)
r.HandleFunc("/user/{id}", api.updateUser).Methods(http.MethodPost)
r.HandleFunc("/configuration/", api.updateConfiguration).Methods(http.MethodPost)
}
if api.MachineStateDir != "" {
r.HandleFunc("/machine_state/{cluster}/{host}", api.getMachineState).Methods(http.MethodGet)
r.HandleFunc("/machine_state/{cluster}/{host}", api.putMachineState).Methods(http.MethodPut, http.MethodPost)
}
}
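// Added sketch (hypothetical wiring; the real server setup lives outside this file):
// mounting the API under the shared /api prefix. Authentication and MachineStateDir
// are left unset here, so the routes guarded by them above are not registered.
func exampleServe(jobRepo *repository.JobRepository, resolver *graph.Resolver) error {
	restapi := &RestApi{JobRepository: jobRepo, Resolver: resolver}
	r := mux.NewRouter()
	restapi.MountRoutes(r)
	return http.ListenAndServe(":8080", r)
}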
type StartJobApiResponse struct {
DBID int64 `json:"id"`
}
type StopJobApiRequest struct {
// JobId, Cluster and StartTime are optional.
// They are only used if no database id was provided.
JobId *int64 `json:"jobId"`
Cluster *string `json:"cluster"`
StartTime *int64 `json:"startTime"`
// Payload
StopTime int64 `json:"stopTime"`
State schema.JobState `json:"jobState"`
}
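// Added example (all values invented): a stop request that identifies the job by
// jobId+cluster instead of a database id would encode like this:
//
//	jobId, cluster := int64(1337), "emmy"
//	req := StopJobApiRequest{
//		JobId:    &jobId,
//		Cluster:  &cluster,
//		StopTime: 1655827200, // unix seconds
//		State:    schema.JobStateCompleted,
//	}
//	// -> {"jobId":1337,"cluster":"emmy","startTime":null,"stopTime":1655827200,"jobState":"completed"}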
type ErrorResponse struct {
Status string `json:"status"`
Error string `json:"error"`
}
func handleError(err error, statusCode int, rw http.ResponseWriter) {
log.Warnf("REST API: %s", err.Error())
rw.Header().Add("Content-Type", "application/json")
rw.WriteHeader(statusCode)
json.NewEncoder(rw).Encode(ErrorResponse{
Status: http.StatusText(statusCode),
Error: err.Error(),
})
}
func decode(r io.Reader, val interface{}) error {
dec := json.NewDecoder(r)
dec.DisallowUnknownFields()
return dec.Decode(val)
}
type TagJobApiRequest []*struct {
Name string `json:"name"`
Type string `json:"type"`
}
// Return a list of jobs
func (api *RestApi) getJobs(rw http.ResponseWriter, r *http.Request) {
withMetadata := false
filter := &model.JobFilter{}
page := &model.PageRequest{ItemsPerPage: -1, Page: 1}
order := &model.OrderByInput{Field: "startTime", Order: model.SortDirectionEnumDesc}
for key, vals := range r.URL.Query() {
switch key {
case "state":
for _, s := range vals {
state := schema.JobState(s)
if !state.Valid() {
http.Error(rw, "invalid query parameter value: state", http.StatusBadRequest)
return
}
filter.State = append(filter.State, state)
}
case "cluster":
filter.Cluster = &model.StringInput{Eq: &vals[0]}
case "start-time":
st := strings.Split(vals[0], "-")
if len(st) != 2 {
http.Error(rw, "invalid query parameter value: startTime", http.StatusBadRequest)
return
}
from, err := strconv.ParseInt(st[0], 10, 64)
if err != nil {
http.Error(rw, err.Error(), http.StatusBadRequest)
return
}
to, err := strconv.ParseInt(st[1], 10, 64)
if err != nil {
http.Error(rw, err.Error(), http.StatusBadRequest)
return
}
ufrom, uto := time.Unix(from, 0), time.Unix(to, 0)
filter.StartTime = &model.TimeRange{From: &ufrom, To: &uto}
case "page":
x, err := strconv.Atoi(vals[0])
if err != nil {
http.Error(rw, err.Error(), http.StatusBadRequest)
return
}
page.Page = x
case "items-per-page":
x, err := strconv.Atoi(vals[0])
if err != nil {
http.Error(rw, err.Error(), http.StatusBadRequest)
return
}
page.ItemsPerPage = x
case "with-metadata":
withMetadata = true
default:
http.Error(rw, "invalid query parameter: "+key, http.StatusBadRequest)
return
}
}
jobs, err := api.JobRepository.QueryJobs(r.Context(), []*model.JobFilter{filter}, page, order)
if err != nil {
http.Error(rw, err.Error(), http.StatusInternalServerError)
return
}
results := make([]*schema.JobMeta, 0, len(jobs))
for _, job := range jobs {
if withMetadata {
if _, err := api.JobRepository.FetchMetadata(job); err != nil {
http.Error(rw, err.Error(), http.StatusInternalServerError)
return
}
}
res := &schema.JobMeta{
ID: &job.ID,
BaseJob: job.BaseJob,
StartTime: job.StartTime.Unix(),
}
res.Tags, err = api.JobRepository.GetTags(&job.ID)
if err != nil {
http.Error(rw, err.Error(), http.StatusInternalServerError)
return
}
if res.MonitoringStatus == schema.MonitoringStatusArchivingSuccessful {
res.Statistics, err = metricdata.GetStatistics(job)
if err != nil {
http.Error(rw, err.Error(), http.StatusInternalServerError)
return
}
}
results = append(results, res)
}
log.Debugf("/api/jobs: %d jobs returned", len(results))
bw := bufio.NewWriter(rw)
defer bw.Flush()
if err := json.NewEncoder(bw).Encode(map[string]interface{}{
"jobs": results,
}); err != nil {
http.Error(rw, err.Error(), http.StatusInternalServerError)
return
}
}
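// Added example (hypothetical values): combining the query parameters handled above,
// a request against this endpoint could look like:
//
//	GET /api/jobs/?state=running&cluster=emmy&start-time=1655000000-1656000000&page=1&items-per-page=25&with-metadata=true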
// Add a tag to a job
func (api *RestApi) tagJob(rw http.ResponseWriter, r *http.Request) {
iid, err := strconv.ParseInt(mux.Vars(r)["id"], 10, 64)
if err != nil {
http.Error(rw, err.Error(), http.StatusBadRequest)
return
}
job, err := api.JobRepository.FindById(iid)
if err != nil {
http.Error(rw, err.Error(), http.StatusNotFound)
return
}
job.Tags, err = api.JobRepository.GetTags(&job.ID)
if err != nil {
http.Error(rw, err.Error(), http.StatusInternalServerError)
return
}
var req TagJobApiRequest
if err := decode(r.Body, &req); err != nil {
http.Error(rw, err.Error(), http.StatusBadRequest)
return
}
for _, tag := range req {
tagId, err := api.JobRepository.AddTagOrCreate(job.ID, tag.Type, tag.Name)
if err != nil {
http.Error(rw, err.Error(), http.StatusInternalServerError)
return
}
job.Tags = append(job.Tags, &schema.Tag{
ID: tagId,
Type: tag.Type,
Name: tag.Name,
})
}
rw.Header().Add("Content-Type", "application/json")
rw.WriteHeader(http.StatusOK)
json.NewEncoder(rw).Encode(job)
}
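// Added example (invented values): the tag_job body is a JSON array matching TagJobApiRequest:
//
//	POST /api/jobs/tag_job/123
//	[{"name": "stress-test", "type": "debug"}, {"name": "slow-io", "type": "performance"}]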
// A new job started. The body should be in the `meta.json` format, but some fields required
// there are optional here (e.g. `jobState` defaults to "running").
func (api *RestApi) startJob(rw http.ResponseWriter, r *http.Request) {
if user := auth.GetUser(r.Context()); user != nil && !user.HasRole(auth.RoleApi) {
handleError(fmt.Errorf("missing role: %#v", auth.RoleApi), http.StatusForbidden, rw)
return
}
req := schema.JobMeta{BaseJob: schema.JobDefaults}
if err := decode(r.Body, &req); err != nil {
handleError(fmt.Errorf("parsing request body failed: %w", err), http.StatusBadRequest, rw)
return
}
if req.State == "" {
req.State = schema.JobStateRunning
}
if err := repository.SanityChecks(&req.BaseJob); err != nil {
handleError(err, http.StatusBadRequest, rw)
return
}
// Check if combination of (job_id, cluster_id, start_time) already exists:
job, err := api.JobRepository.Find(&req.JobID, &req.Cluster, nil)
if err != nil && err != sql.ErrNoRows {
handleError(fmt.Errorf("checking for duplicate failed: %w", err), http.StatusInternalServerError, rw)
return
} else if err == nil {
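// Reject as duplicate only if the new start time is within 24h (86400s) of the existing one.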
if (req.StartTime - job.StartTimeUnix) < 86400 {
handleError(fmt.Errorf("a job with that jobId, cluster and startTime already exists: dbid: %d", job.ID), http.StatusUnprocessableEntity, rw)
return
}
}
id, err := api.JobRepository.Start(&req)
if err != nil {
handleError(fmt.Errorf("insert into database failed: %w", err), http.StatusInternalServerError, rw)
return
}
for _, tag := range req.Tags {
if _, err := api.JobRepository.AddTagOrCreate(id, tag.Type, tag.Name); err != nil {
handleError(fmt.Errorf("adding tag to new job %d failed: %w", id, err), http.StatusInternalServerError, rw)
return
}
}
log.Printf("new job (id: %d): cluster=%s, jobId=%d, user=%s, startTime=%d", id, req.Cluster, req.JobID, req.User, req.StartTime)
rw.Header().Add("Content-Type", "application/json")
rw.WriteHeader(http.StatusCreated)
json.NewEncoder(rw).Encode(StartJobApiResponse{
DBID: id,
})
}
// A job has stopped and should be archived.
func (api *RestApi) stopJob(rw http.ResponseWriter, r *http.Request) {
if user := auth.GetUser(r.Context()); user != nil && !user.HasRole(auth.RoleApi) {
handleError(fmt.Errorf("missing role: %#v", auth.RoleApi), http.StatusForbidden, rw)
return
}
// Parse request body
req := StopJobApiRequest{}
if err := decode(r.Body, &req); err != nil {
handleError(fmt.Errorf("parsing request body failed: %w", err), http.StatusBadRequest, rw)
return
}
// Fetch job (that will be stopped) from db
id, ok := mux.Vars(r)["id"]
var job *schema.Job
var err error
if ok {
id, e := strconv.ParseInt(id, 10, 64)
if e != nil {
handleError(fmt.Errorf("integer expected in path for id: %w", e), http.StatusBadRequest, rw)
return
}
job, err = api.JobRepository.FindById(id)
} else {
if req.JobId == nil {
handleError(errors.New("the field 'jobId' is required"), http.StatusBadRequest, rw)
return
}
job, err = api.JobRepository.Find(req.JobId, req.Cluster, req.StartTime)
}
if err != nil {
handleError(fmt.Errorf("finding job failed: %w", err), http.StatusUnprocessableEntity, rw)
return
}
// Sanity checks
if job == nil || job.StartTime.Unix() >= req.StopTime || job.State != schema.JobStateRunning {
handleError(errors.New("stopTime must be larger than startTime and only running jobs can be stopped"), http.StatusBadRequest, rw)
return
}
if req.State != "" && !req.State.Valid() {
handleError(fmt.Errorf("invalid job state: %#v", req.State), http.StatusBadRequest, rw)
return
} else {
req.State = schema.JobStateCompleted
}
// Mark job as stopped in the database (update state and duration)
job.Duration = int32(req.StopTime - job.StartTime.Unix())
job.State = req.State
if err := api.JobRepository.Stop(job.ID, job.Duration, job.State, job.MonitoringStatus); err != nil {
handleError(fmt.Errorf("marking job as stopped failed: %w", err), http.StatusInternalServerError, rw)
return
}
log.Printf("archiving job... (dbid: %d): cluster=%s, jobId=%d, user=%s, startTime=%s", job.ID, job.Cluster, job.JobID, job.User, job.StartTime)
// Send a response (with status OK). This means that errors that happen from here on
// can *NOT* be communicated to the client. If reading from a MetricDataRepository or
// writing to the filesystem fails, the client will not know.
rw.Header().Add("Content-Type", "application/json")
rw.WriteHeader(http.StatusOK)
json.NewEncoder(rw).Encode(job)
// Monitoring is disabled...
if job.MonitoringStatus == schema.MonitoringStatusDisabled {
return
}
// We need to start a new goroutine as this function needs to return
// for the response to be flushed to the client.
api.OngoingArchivings.Add(1) // So that a shutdown does not interrupt this goroutine.
go func() {
defer api.OngoingArchivings.Done()
if _, err := api.JobRepository.FetchMetadata(job); err != nil {
log.Errorf("archiving job (dbid: %d) failed: %s", job.ID, err.Error())
api.JobRepository.UpdateMonitoringStatus(job.ID, schema.MonitoringStatusArchivingFailed)
return
}
// metricdata.ArchiveJob will fetch all the data from a MetricDataRepository and create meta.json/data.json files
jobMeta, err := metricdata.ArchiveJob(job, context.Background())
if err != nil {
log.Errorf("archiving job (dbid: %d) failed: %s", job.ID, err.Error())
api.JobRepository.UpdateMonitoringStatus(job.ID, schema.MonitoringStatusArchivingFailed)
return
}
// Update the jobs database entry one last time:
if err := api.JobRepository.Archive(job.ID, schema.MonitoringStatusArchivingSuccessful, jobMeta.Statistics); err != nil {
log.Errorf("archiving job (dbid: %d) failed: %s", job.ID, err.Error())
return
}
log.Printf("archiving job (dbid: %d) successful", job.ID)
}()
}
func (api *RestApi) importJob(rw http.ResponseWriter, r *http.Request) {
if user := auth.GetUser(r.Context()); user != nil && !user.HasRole(auth.RoleApi) {
handleError(fmt.Errorf("missing role: %#v", auth.RoleApi), http.StatusForbidden, rw)
return
}
var body struct {
Meta *schema.JobMeta `json:"meta"`
Data *schema.JobData `json:"data"`
}
if err := decode(r.Body, &body); err != nil {
handleError(fmt.Errorf("import failed: %s", err.Error()), http.StatusBadRequest, rw)
return
}
if err := api.JobRepository.ImportJob(body.Meta, body.Data); err != nil {
handleError(fmt.Errorf("import failed: %s", err.Error()), http.StatusUnprocessableEntity, rw)
return
}
rw.Write([]byte(`{ "status": "OK" }`))
}
func (api *RestApi) getJobMetrics(rw http.ResponseWriter, r *http.Request) {
id := mux.Vars(r)["id"]
metrics := r.URL.Query()["metric"]
var scopes []schema.MetricScope
for _, scope := range r.URL.Query()["scope"] {
var s schema.MetricScope
if err := s.UnmarshalGQL(scope); err != nil {
http.Error(rw, err.Error(), http.StatusBadRequest)
return
}
scopes = append(scopes, s)
}
rw.Header().Add("Content-Type", "application/json")
rw.WriteHeader(http.StatusOK)
type Response struct {
Data *struct {
JobMetrics []*model.JobMetricWithName `json:"jobMetrics"`
} `json:"data"`
Error *struct {
Message string `json:"message"`
} `json:"error"`
}
data, err := api.Resolver.Query().JobMetrics(r.Context(), id, metrics, scopes)
if err != nil {
json.NewEncoder(rw).Encode(Response{
Error: &struct {
Message string `json:"message"`
}{Message: err.Error()},
})
return
}
json.NewEncoder(rw).Encode(Response{
Data: &struct {
JobMetrics []*model.JobMetricWithName `json:"jobMetrics"`
}{JobMetrics: data},
})
}
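// Added example (hypothetical metric names): metrics and scopes are selected via
// repeated query parameters:
//
//	GET /api/jobs/metrics/123?metric=flops_any&metric=mem_bw&scope=node&scope=socket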
func (api *RestApi) getJWT(rw http.ResponseWriter, r *http.Request) {
rw.Header().Set("Content-Type", "text/plain")
username := r.FormValue("username")
me := auth.GetUser(r.Context())
if !me.HasRole(auth.RoleAdmin) {
if username != me.Username {
http.Error(rw, "only admins are allowed to sign JWTs not for themselves", http.StatusForbidden)
return
}
}
user, err := api.Authentication.FetchUser(username)
if err != nil {
http.Error(rw, err.Error(), http.StatusUnprocessableEntity)
return
}
jwt, err := api.Authentication.ProvideJWT(user)
if err != nil {
http.Error(rw, err.Error(), http.StatusUnprocessableEntity)
return
}
rw.WriteHeader(http.StatusOK)
rw.Write([]byte(jwt))
}
func (api *RestApi) createUser(rw http.ResponseWriter, r *http.Request) {
rw.Header().Set("Content-Type", "text/plain")
me := auth.GetUser(r.Context())
if !me.HasRole(auth.RoleAdmin) {
http.Error(rw, "only admins are allowed to create new users", http.StatusForbidden)
return
}
username, password, role, name, email := r.FormValue("username"), r.FormValue("password"), r.FormValue("role"), r.FormValue("name"), r.FormValue("email")
if len(password) == 0 && role != auth.RoleApi {
http.Error(rw, "only API users are allowed to have a blank password (login will be impossible)", http.StatusBadRequest)
return
}
if err := api.Authentication.CreateUser(username, name, password, email, []string{role}); err != nil {
http.Error(rw, err.Error(), http.StatusUnprocessableEntity)
return
}
rw.Write([]byte(fmt.Sprintf("User %#v successfully created!\n", username)))
}
func (api *RestApi) deleteUser(rw http.ResponseWriter, r *http.Request) {
if user := auth.GetUser(r.Context()); !user.HasRole(auth.RoleAdmin) {
http.Error(rw, "only admins are allowed to delete a user", http.StatusForbidden)
return
}
username := r.FormValue("username")
if err := api.Authentication.DelUser(username); err != nil {
http.Error(rw, err.Error(), http.StatusUnprocessableEntity)
return
}
rw.WriteHeader(http.StatusOK)
}
func (api *RestApi) getUsers(rw http.ResponseWriter, r *http.Request) {
if user := auth.GetUser(r.Context()); !user.HasRole(auth.RoleAdmin) {
http.Error(rw, "only admins are allowed to fetch a list of users", http.StatusForbidden)
return
}
users, err := api.Authentication.FetchUsers(
r.URL.Query().Get("via-ldap") == "true",
r.URL.Query().Get("not-just-user") == "true")
if err != nil {
http.Error(rw, err.Error(), http.StatusInternalServerError)
return
}
json.NewEncoder(rw).Encode(users)
}
func (api *RestApi) updateUser(rw http.ResponseWriter, r *http.Request) {
if user := auth.GetUser(r.Context()); !user.HasRole(auth.RoleAdmin) {
http.Error(rw, "only admins are allowed to update a user", http.StatusForbidden)
return
}
// TODO: Handle anything but roles...
newrole := r.FormValue("add-role")
if err := api.Authentication.AddRole(r.Context(), mux.Vars(r)["id"], newrole); err != nil {
http.Error(rw, err.Error(), http.StatusUnprocessableEntity)
return
}
rw.Write([]byte("success"))
}
func (api *RestApi) updateConfiguration(rw http.ResponseWriter, r *http.Request) {
rw.Header().Set("Content-Type", "text/plain")
key, value := r.FormValue("key"), r.FormValue("value")
fmt.Printf("KEY: %#v\nVALUE: %#v\n", key, value)
if err := config.UpdateConfig(key, value, r.Context()); err != nil {
http.Error(rw, err.Error(), http.StatusUnprocessableEntity)
return
}
rw.Write([]byte("success"))
}
func (api *RestApi) putMachineState(rw http.ResponseWriter, r *http.Request) {
if api.MachineStateDir == "" {
http.Error(rw, "not enabled", http.StatusNotFound)
return
}
vars := mux.Vars(r)
cluster := vars["cluster"]
host := vars["host"]
dir := filepath.Join(api.MachineStateDir, cluster)
if err := os.MkdirAll(dir, 0755); err != nil {
http.Error(rw, err.Error(), http.StatusInternalServerError)
return
}
filename := filepath.Join(dir, fmt.Sprintf("%s.json", host))
f, err := os.Create(filename)
if err != nil {
http.Error(rw, err.Error(), http.StatusInternalServerError)
return
}
defer f.Close()
if _, err := io.Copy(f, r.Body); err != nil {
http.Error(rw, err.Error(), http.StatusInternalServerError)
return
}
rw.WriteHeader(http.StatusCreated)
}
func (api *RestApi) getMachineState(rw http.ResponseWriter, r *http.Request) {
if api.MachineStateDir == "" {
http.Error(rw, "not enabled", http.StatusNotFound)
return
}
vars := mux.Vars(r)
filename := filepath.Join(api.MachineStateDir, vars["cluster"], fmt.Sprintf("%s.json", vars["host"]))
// Sets the content-type and 'Last-Modified' Header and so on automatically
http.ServeFile(rw, r, filename)
}

internal/auth/auth.go Normal file

@@ -0,0 +1,473 @@
package auth
import (
"context"
"crypto/ed25519"
"crypto/rand"
"database/sql"
"encoding/base64"
"encoding/json"
"errors"
"fmt"
"net/http"
"os"
"strings"
"time"
"github.com/ClusterCockpit/cc-backend/internal/graph/model"
"github.com/ClusterCockpit/cc-backend/pkg/log"
sq "github.com/Masterminds/squirrel"
"github.com/golang-jwt/jwt/v4"
"github.com/gorilla/sessions"
"github.com/jmoiron/sqlx"
"golang.org/x/crypto/bcrypt"
)
// Only Username and Roles will always be filled in when returned by `GetUser`.
// If Name and Email are needed as well, use auth.FetchUser(), which does a database
// query for all fields.
type User struct {
Username string `json:"username"`
Password string `json:"-"`
Name string `json:"name"`
Roles []string `json:"roles"`
ViaLdap bool `json:"via-ldap"`
Email string `json:"email"`
}
const (
RoleAdmin string = "admin"
RoleApi string = "api"
RoleUser string = "user"
)
func (u *User) HasRole(role string) bool {
for _, r := range u.Roles {
if r == role {
return true
}
}
return false
}
type ContextKey string
const ContextUserKey ContextKey = "user"
type Authentication struct {
db *sqlx.DB
sessionStore *sessions.CookieStore
jwtPublicKey ed25519.PublicKey
jwtPrivateKey ed25519.PrivateKey
ldapConfig *LdapConfig
ldapSyncUserPassword string
// If zero, tokens/sessions do not expire.
SessionMaxAge time.Duration
JwtMaxAge time.Duration
}
func (auth *Authentication) Init(db *sqlx.DB, ldapConfig *LdapConfig) error {
auth.db = db
_, err := db.Exec(`
CREATE TABLE IF NOT EXISTS user (
username varchar(255) PRIMARY KEY NOT NULL,
password varchar(255) DEFAULT NULL,
ldap tinyint NOT NULL DEFAULT 0,
name varchar(255) DEFAULT NULL,
roles varchar(255) NOT NULL DEFAULT "[]",
email varchar(255) DEFAULT NULL);`)
if err != nil {
return err
}
sessKey := os.Getenv("SESSION_KEY")
if sessKey == "" {
log.Warn("environment variable 'SESSION_KEY' not set (will use non-persistent random key)")
bytes := make([]byte, 32)
if _, err := rand.Read(bytes); err != nil {
return err
}
auth.sessionStore = sessions.NewCookieStore(bytes)
} else {
bytes, err := base64.StdEncoding.DecodeString(sessKey)
if err != nil {
return err
}
auth.sessionStore = sessions.NewCookieStore(bytes)
}
pubKey, privKey := os.Getenv("JWT_PUBLIC_KEY"), os.Getenv("JWT_PRIVATE_KEY")
if pubKey == "" || privKey == "" {
log.Warn("environment variables 'JWT_PUBLIC_KEY' or 'JWT_PRIVATE_KEY' not set (token based authentication will not work)")
} else {
bytes, err := base64.StdEncoding.DecodeString(pubKey)
if err != nil {
return err
}
auth.jwtPublicKey = ed25519.PublicKey(bytes)
bytes, err = base64.StdEncoding.DecodeString(privKey)
if err != nil {
return err
}
auth.jwtPrivateKey = ed25519.PrivateKey(bytes)
}
if ldapConfig != nil {
auth.ldapConfig = ldapConfig
if err := auth.initLdap(); err != nil {
return err
}
}
return nil
}
// arg must be formatted like this: "<username>:<comma-separated roles>:<password>"
func (auth *Authentication) AddUser(arg string) error {
parts := strings.SplitN(arg, ":", 3)
if len(parts) != 3 || len(parts[0]) == 0 {
return errors.New("invalid argument format")
}
roles := strings.Split(parts[1], ",")
return auth.CreateUser(parts[0], "", parts[2], "", roles)
}
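// Added example (invented credentials): creates user "fritz" with roles admin and api:
//
//	auth.AddUser("fritz:admin,api:f1d3lio")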
func (auth *Authentication) CreateUser(username, name, password, email string, roles []string) error {
for _, role := range roles {
if role != RoleAdmin && role != RoleApi && role != RoleUser {
return fmt.Errorf("invalid user role: %#v", role)
}
}
if username == "" {
return errors.New("username should not be empty")
}
if password != "" {
bytes, err := bcrypt.GenerateFromPassword([]byte(password), bcrypt.DefaultCost)
if err != nil {
return err
}
password = string(bytes)
}
rolesJson, _ := json.Marshal(roles)
cols := []string{"username", "password", "roles"}
vals := []interface{}{username, password, string(rolesJson)}
if name != "" {
cols = append(cols, "name")
vals = append(vals, name)
}
if email != "" {
cols = append(cols, "email")
vals = append(vals, email)
}
if _, err := sq.Insert("user").Columns(cols...).Values(vals...).RunWith(auth.db).Exec(); err != nil {
return err
}
log.Infof("new user %#v created (roles: %s)", username, roles)
return nil
}
func (auth *Authentication) AddRole(ctx context.Context, username string, role string) error {
user, err := auth.FetchUser(username)
if err != nil {
return err
}
if role != RoleAdmin && role != RoleApi && role != RoleUser {
return fmt.Errorf("invalid user role: %#v", role)
}
for _, r := range user.Roles {
if r == role {
return fmt.Errorf("user %#v already has role %#v", username, role)
}
}
roles, _ := json.Marshal(append(user.Roles, role))
if _, err := sq.Update("user").Set("roles", roles).Where("user.username = ?", username).RunWith(auth.db).Exec(); err != nil {
return err
}
return nil
}
func (auth *Authentication) DelUser(username string) error {
_, err := auth.db.Exec(`DELETE FROM user WHERE user.username = ?`, username)
return err
}
func (auth *Authentication) FetchUsers(viaLdap, notJustUser bool) ([]*User, error) {
q := sq.Select("username", "name", "email", "roles").From("user")
if !viaLdap {
if notJustUser {
q = q.Where("ldap = 0 OR (roles != '[\"user\"]' AND roles != '[]')")
} else {
q = q.Where("ldap = 0")
}
} else {
if notJustUser {
q = q.Where("ldap = 1 OR (roles != '[\"user\"]' AND roles != '[]')")
} else {
q = q.Where("ldap = 1")
}
}
rows, err := q.RunWith(auth.db).Query()
if err != nil {
return nil, err
}
users := make([]*User, 0)
defer rows.Close()
for rows.Next() {
rawroles := ""
user := &User{}
var name, email sql.NullString
if err := rows.Scan(&user.Username, &name, &email, &rawroles); err != nil {
return nil, err
}
if err := json.Unmarshal([]byte(rawroles), &user.Roles); err != nil {
return nil, err
}
user.Name = name.String
user.Email = email.String
users = append(users, user)
}
return users, nil
}
func (auth *Authentication) FetchUser(username string) (*User, error) {
user := &User{Username: username}
var hashedPassword, name, rawRoles, email sql.NullString
if err := sq.Select("password", "ldap", "name", "roles", "email").From("user").
Where("user.username = ?", username).RunWith(auth.db).
QueryRow().Scan(&hashedPassword, &user.ViaLdap, &name, &rawRoles, &email); err != nil {
return nil, fmt.Errorf("user '%s' not found (%s)", username, err.Error())
}
user.Password = hashedPassword.String
user.Name = name.String
user.Email = email.String
if rawRoles.Valid {
if err := json.Unmarshal([]byte(rawRoles.String), &user.Roles); err != nil {
return nil, err
}
}
return user, nil
}
func FetchUser(ctx context.Context, db *sqlx.DB, username string) (*model.User, error) {
me := GetUser(ctx)
if me != nil && !me.HasRole(RoleAdmin) && me.Username != username {
return nil, errors.New("forbidden")
}
user := &model.User{Username: username}
var name, email sql.NullString
if err := sq.Select("name", "email").From("user").Where("user.username = ?", username).
RunWith(db).QueryRow().Scan(&name, &email); err != nil {
if err == sql.ErrNoRows {
return nil, nil
}
return nil, err
}
user.Name = name.String
user.Email = email.String
return user, nil
}
// Handle a POST request that should log the user in, starting a new session.
func (auth *Authentication) Login(onsuccess http.Handler, onfailure func(rw http.ResponseWriter, r *http.Request, loginErr error)) http.Handler {
return http.HandlerFunc(func(rw http.ResponseWriter, r *http.Request) {
username, password := r.FormValue("username"), r.FormValue("password")
user, err := auth.FetchUser(username)
if err == nil && user.ViaLdap && auth.ldapConfig != nil {
err = auth.loginViaLdap(user, password)
} else if err == nil && !user.ViaLdap && user.Password != "" {
if e := bcrypt.CompareHashAndPassword([]byte(user.Password), []byte(password)); e != nil {
err = fmt.Errorf("user '%s' provided the wrong password (%s)", username, e.Error())
}
} else {
err = errors.New("could not authenticate user")
}
if err != nil {
log.Warnf("login of user %#v failed: %s", username, err.Error())
onfailure(rw, r, err)
return
}
session, err := auth.sessionStore.New(r, "session")
if err != nil {
log.Errorf("session creation failed: %s", err.Error())
http.Error(rw, err.Error(), http.StatusInternalServerError)
return
}
if auth.SessionMaxAge != 0 {
session.Options.MaxAge = int(auth.SessionMaxAge.Seconds())
}
session.Values["username"] = user.Username
session.Values["roles"] = user.Roles
if err := auth.sessionStore.Save(r, rw, session); err != nil {
log.Errorf("session save failed: %s", err.Error())
http.Error(rw, err.Error(), http.StatusInternalServerError)
return
}
log.Infof("login successfull: user: %#v (roles: %v)", user.Username, user.Roles)
ctx := context.WithValue(r.Context(), ContextUserKey, user)
onsuccess.ServeHTTP(rw, r.WithContext(ctx))
})
}
var ErrTokenInvalid error = errors.New("invalid token")
func (auth *Authentication) authViaToken(r *http.Request) (*User, error) {
if auth.jwtPublicKey == nil {
return nil, nil
}
rawtoken := r.Header.Get("X-Auth-Token")
if rawtoken == "" {
rawtoken = r.Header.Get("Authorization")
prefix := "Bearer "
if !strings.HasPrefix(rawtoken, prefix) {
return nil, nil
}
rawtoken = rawtoken[len(prefix):]
}
token, err := jwt.Parse(rawtoken, func(t *jwt.Token) (interface{}, error) {
if t.Method != jwt.SigningMethodEdDSA {
return nil, errors.New("only Ed25519/EdDSA supported")
}
return auth.jwtPublicKey, nil
})
if err != nil {
return nil, err
}
if err := token.Claims.Valid(); err != nil {
return nil, err
}
claims := token.Claims.(jwt.MapClaims)
sub, _ := claims["sub"].(string)
var roles []string
if rawroles, ok := claims["roles"].([]interface{}); ok {
for _, rr := range rawroles {
if r, ok := rr.(string); ok {
roles = append(roles, r)
}
}
}
// TODO: Check if sub is still a valid user!
return &User{
Username: sub,
Roles: roles,
}, nil
}
// Authenticate the user and put a User object in the
// context of the request. If authentication fails,
// do not continue but send the client to the login screen.
func (auth *Authentication) Auth(onsuccess http.Handler, onfailure func(rw http.ResponseWriter, r *http.Request, authErr error)) http.Handler {
return http.HandlerFunc(func(rw http.ResponseWriter, r *http.Request) {
user, err := auth.authViaToken(r)
if err != nil {
log.Warnf("authentication failed: %s", err.Error())
http.Error(rw, err.Error(), http.StatusUnauthorized)
return
}
if user != nil {
// Successful authentication using a token
ctx := context.WithValue(r.Context(), ContextUserKey, user)
onsuccess.ServeHTTP(rw, r.WithContext(ctx))
return
}
session, err := auth.sessionStore.Get(r, "session")
if err != nil {
// sessionStore.Get will return a new session if no current one is attached to this request.
http.Error(rw, err.Error(), http.StatusInternalServerError)
return
}
if session.IsNew {
log.Warn("authentication failed: no session or jwt found")
onfailure(rw, r, errors.New("no valid session or JWT provided"))
return
}
username, _ := session.Values["username"].(string)
roles, _ := session.Values["roles"].([]string)
ctx := context.WithValue(r.Context(), ContextUserKey, &User{
Username: username,
Roles: roles,
})
onsuccess.ServeHTTP(rw, r.WithContext(ctx))
})
}
// Generate a new JWT that can be used for authentication
func (auth *Authentication) ProvideJWT(user *User) (string, error) {
if auth.jwtPrivateKey == nil {
return "", errors.New("environment variable 'JWT_PRIVATE_KEY' not set")
}
now := time.Now()
claims := jwt.MapClaims{
"sub": user.Username,
"roles": user.Roles,
"iat": now.Unix(),
}
if auth.JwtMaxAge != 0 {
claims["exp"] = now.Add(auth.JwtMaxAge).Unix()
}
return jwt.NewWithClaims(jwt.SigningMethodEdDSA, claims).SignedString(auth.jwtPrivateKey)
}
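// Added sketch: Init above expects JWT_PUBLIC_KEY/JWT_PRIVATE_KEY to hold base64-encoded
// raw Ed25519 keys; a matching pair can be generated with the standard library alone:
func exampleGenerateJWTKeys() {
	pub, priv, err := ed25519.GenerateKey(rand.Reader)
	if err != nil {
		panic(err)
	}
	fmt.Println("JWT_PUBLIC_KEY=" + base64.StdEncoding.EncodeToString(pub))
	fmt.Println("JWT_PRIVATE_KEY=" + base64.StdEncoding.EncodeToString(priv))
}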
func GetUser(ctx context.Context) *User {
x := ctx.Value(ContextUserKey)
if x == nil {
return nil
}
return x.(*User)
}
// Clears the session cookie
func (auth *Authentication) Logout(onsuccess http.Handler) http.Handler {
return http.HandlerFunc(func(rw http.ResponseWriter, r *http.Request) {
session, err := auth.sessionStore.Get(r, "session")
if err != nil {
http.Error(rw, err.Error(), http.StatusInternalServerError)
return
}
if !session.IsNew {
session.Options.MaxAge = -1
if err := auth.sessionStore.Save(r, rw, session); err != nil {
http.Error(rw, err.Error(), http.StatusInternalServerError)
return
}
}
onsuccess.ServeHTTP(rw, r)
})
}

internal/auth/ldap.go Normal file

@@ -0,0 +1,160 @@
package auth
import (
"errors"
"os"
"strings"
"time"
"github.com/ClusterCockpit/cc-backend/pkg/log"
"github.com/go-ldap/ldap/v3"
)
type LdapConfig struct {
Url string `json:"url"`
UserBase string `json:"user_base"`
SearchDN string `json:"search_dn"`
UserBind string `json:"user_bind"`
UserFilter string `json:"user_filter"`
SyncInterval string `json:"sync_interval"` // Parsed using time.ParseDuration.
SyncDelOldUsers bool `json:"sync_del_old_users"`
}
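// Added example (all values invented): a config matching these fields. The {username}
// placeholder in UserBind is substituted with the login name in loginViaLdap below.
//
//	cfg := LdapConfig{
//		Url:          "ldaps://ldap.example.com",
//		UserBase:     "ou=people,dc=example,dc=com",
//		SearchDN:     "cn=admin,dc=example,dc=com",
//		UserBind:     "uid={username},ou=people,dc=example,dc=com",
//		UserFilter:   "(&(objectclass=posixAccount))",
//		SyncInterval: "24h",
//	}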
func (auth *Authentication) initLdap() error {
auth.ldapSyncUserPassword = os.Getenv("LDAP_ADMIN_PASSWORD")
if auth.ldapSyncUserPassword == "" {
log.Warn("environment variable 'LDAP_ADMIN_PASSWORD' not set (ldap sync or authentication will not work)")
}
if auth.ldapConfig.SyncInterval != "" {
interval, err := time.ParseDuration(auth.ldapConfig.SyncInterval)
if err != nil {
return err
}
if interval == 0 {
return nil
}
go func() {
ticker := time.NewTicker(interval)
for t := range ticker.C {
log.Printf("LDAP sync started at %s", t.Format(time.RFC3339))
if err := auth.SyncWithLDAP(auth.ldapConfig.SyncDelOldUsers); err != nil {
log.Errorf("LDAP sync failed: %s", err.Error())
}
log.Print("LDAP sync done")
}
}()
}
return nil
}
// TODO: Add a connection pool or something like
// that so that connections can be reused/cached.
func (auth *Authentication) getLdapConnection(admin bool) (*ldap.Conn, error) {
conn, err := ldap.DialURL(auth.ldapConfig.Url)
if err != nil {
return nil, err
}
if admin {
if err := conn.Bind(auth.ldapConfig.SearchDN, auth.ldapSyncUserPassword); err != nil {
conn.Close()
return nil, err
}
}
return conn, nil
}
func (auth *Authentication) loginViaLdap(user *User, password string) error {
l, err := auth.getLdapConnection(false)
if err != nil {
return err
}
defer l.Close()
userDn := strings.Replace(auth.ldapConfig.UserBind, "{username}", user.Username, -1)
if err := l.Bind(userDn, password); err != nil {
return err
}
user.ViaLdap = true
return nil
}
// Delete users where user.ldap is 1 and that do not show up in the ldap search results.
// Add users to the users table that are new in the ldap search results.
func (auth *Authentication) SyncWithLDAP(deleteOldUsers bool) error {
if auth.ldapConfig == nil {
return errors.New("ldap not enabled")
}
const IN_DB int = 1
const IN_LDAP int = 2
const IN_BOTH int = 3
users := map[string]int{}
rows, err := auth.db.Query(`SELECT username FROM user WHERE user.ldap = 1`)
if err != nil {
return err
}
for rows.Next() {
var username string
if err := rows.Scan(&username); err != nil {
return err
}
users[username] = IN_DB
}
l, err := auth.getLdapConnection(true)
if err != nil {
return err
}
defer l.Close()
ldapResults, err := l.Search(ldap.NewSearchRequest(
auth.ldapConfig.UserBase, ldap.ScopeWholeSubtree, ldap.NeverDerefAliases, 0, 0, false,
auth.ldapConfig.UserFilter, []string{"dn", "uid", "gecos"}, nil))
if err != nil {
return err
}
newnames := map[string]string{}
for _, entry := range ldapResults.Entries {
username := entry.GetAttributeValue("uid")
if username == "" {
return errors.New("no attribute 'uid'")
}
_, ok := users[username]
if !ok {
users[username] = IN_LDAP
newnames[username] = entry.GetAttributeValue("gecos")
} else {
users[username] = IN_BOTH
}
}
for username, where := range users {
if where == IN_DB && deleteOldUsers {
log.Infof("ldap-sync: remove %#v (does not show up in LDAP anymore)", username)
if _, err := auth.db.Exec(`DELETE FROM user WHERE user.username = ?`, username); err != nil {
return err
}
} else if where == IN_LDAP {
name := newnames[username]
log.Infof("ldap-sync: add %#v (name: %#v, roles: [user], ldap: true)", username, name)
if _, err := auth.db.Exec(`INSERT INTO user (username, ldap, name, roles) VALUES (?, ?, ?, ?)`,
username, 1, name, "[\""+RoleUser+"\"]"); err != nil {
return err
}
}
}
return nil
}

internal/config/config.go Normal file

@@ -0,0 +1,298 @@
package config
import (
"context"
"encoding/json"
"errors"
"fmt"
"net/http"
"os"
"path/filepath"
"sync"
"time"
"github.com/ClusterCockpit/cc-backend/internal/auth"
"github.com/ClusterCockpit/cc-backend/internal/graph/model"
"github.com/ClusterCockpit/cc-backend/pkg/schema"
"github.com/iamlouk/lrucache"
"github.com/jmoiron/sqlx"
)
var db *sqlx.DB
var lookupConfigStmt *sqlx.Stmt
var lock sync.RWMutex
var uiDefaults map[string]interface{}
var cache *lrucache.Cache = lrucache.New(1024)
var Clusters []*model.Cluster
var nodeLists map[string]map[string]NodeList
func Init(usersdb *sqlx.DB, authEnabled bool, uiConfig map[string]interface{}, jobArchive string) error {
db = usersdb
uiDefaults = uiConfig
entries, err := os.ReadDir(jobArchive)
if err != nil {
return err
}
Clusters = []*model.Cluster{}
nodeLists = map[string]map[string]NodeList{}
for _, de := range entries {
raw, err := os.ReadFile(filepath.Join(jobArchive, de.Name(), "cluster.json"))
if err != nil {
return err
}
var cluster model.Cluster
// Disabled because of the historic 'measurement' field.
// dec := json.NewDecoder(bytes.NewBuffer(raw))
// dec.DisallowUnknownFields()
// if err := dec.Decode(&cluster); err != nil {
// return err
// }
if err := json.Unmarshal(raw, &cluster); err != nil {
return err
}
if len(cluster.Name) == 0 || len(cluster.MetricConfig) == 0 || len(cluster.SubClusters) == 0 {
return errors.New("cluster.name, cluster.metricConfig and cluster.SubClusters should not be empty")
}
for _, mc := range cluster.MetricConfig {
if len(mc.Name) == 0 {
return errors.New("cluster.metricConfig.name should not be empty")
}
if mc.Timestep < 1 {
return errors.New("cluster.metricConfig.timestep should not be smaller than one")
}
// For backwards compatibility...
if mc.Scope == "" {
mc.Scope = schema.MetricScopeNode
}
if !mc.Scope.Valid() {
return errors.New("cluster.metricConfig.scope must be a valid scope ('node', 'scocket', ...)")
}
}
if cluster.FilterRanges.StartTime.To.IsZero() {
cluster.FilterRanges.StartTime.To = time.Unix(0, 0)
}
if cluster.Name != de.Name() {
return fmt.Errorf("the file '.../%s/cluster.json' contains the clusterId '%s'", de.Name(), cluster.Name)
}
Clusters = append(Clusters, &cluster)
nodeLists[cluster.Name] = make(map[string]NodeList)
for _, sc := range cluster.SubClusters {
if sc.Nodes == "" {
continue
}
nl, err := ParseNodeList(sc.Nodes)
if err != nil {
return fmt.Errorf("in %s/cluster.json: %w", cluster.Name, err)
}
nodeLists[cluster.Name][sc.Name] = nl
}
}
if authEnabled {
_, err := db.Exec(`
CREATE TABLE IF NOT EXISTS configuration (
username varchar(255),
confkey varchar(255),
value varchar(255),
PRIMARY KEY (username, confkey),
FOREIGN KEY (username) REFERENCES user (username) ON DELETE CASCADE ON UPDATE NO ACTION);`)
if err != nil {
return err
}
lookupConfigStmt, err = db.Preparex(`SELECT confkey, value FROM configuration WHERE configuration.username = ?`)
if err != nil {
return err
}
}
return nil
}
// Return the personalised UI config for the currently authenticated
// user or return the plain default config.
func GetUIConfig(r *http.Request) (map[string]interface{}, error) {
user := auth.GetUser(r.Context())
if user == nil {
lock.RLock()
copy := make(map[string]interface{}, len(uiDefaults))
for k, v := range uiDefaults {
copy[k] = v
}
lock.RUnlock()
return copy, nil
}
data := cache.Get(user.Username, func() (interface{}, time.Duration, int) {
config := make(map[string]interface{}, len(uiDefaults))
for k, v := range uiDefaults {
config[k] = v
}
rows, err := lookupConfigStmt.Query(user.Username)
if err != nil {
return err, 0, 0
}
size := 0
defer rows.Close()
for rows.Next() {
var key, rawval string
if err := rows.Scan(&key, &rawval); err != nil {
return err, 0, 0
}
var val interface{}
if err := json.Unmarshal([]byte(rawval), &val); err != nil {
return err, 0, 0
}
size += len(key)
size += len(rawval)
config[key] = val
}
return config, 24 * time.Hour, size
})
if err, ok := data.(error); ok {
return nil, err
}
return data.(map[string]interface{}), nil
}
// If the context does not have a user, update the global UI configuration without persisting it!
// If there is an authenticated user, update only their configuration.
func UpdateConfig(key, value string, ctx context.Context) error {
user := auth.GetUser(ctx)
if user == nil {
var val interface{}
if err := json.Unmarshal([]byte(value), &val); err != nil {
return err
}
lock.Lock()
defer lock.Unlock()
uiDefaults[key] = val
return nil
}
// Disabled because now `plot_list_selectedMetrics:<cluster>` is possible.
// if _, ok := uiDefaults[key]; !ok {
// return errors.New("this configuration key does not exist")
// }
if _, err := db.Exec(`REPLACE INTO configuration (username, confkey, value) VALUES (?, ?, ?)`,
user.Username, key, value); err != nil {
return err
}
cache.Del(user.Username)
return nil
}
func GetCluster(cluster string) *model.Cluster {
for _, c := range Clusters {
if c.Name == cluster {
return c
}
}
return nil
}
func GetSubCluster(cluster, subcluster string) *model.SubCluster {
for _, c := range Clusters {
if c.Name == cluster {
for _, p := range c.SubClusters {
if p.Name == subcluster {
return p
}
}
}
}
return nil
}
func GetMetricConfig(cluster, metric string) *model.MetricConfig {
for _, c := range Clusters {
if c.Name == cluster {
for _, m := range c.MetricConfig {
if m.Name == metric {
return m
}
}
}
}
return nil
}
// AssignSubCluster sets the `job.subcluster` property of the job based
// on its cluster and resources.
func AssignSubCluster(job *schema.BaseJob) error {
cluster := GetCluster(job.Cluster)
if cluster == nil {
return fmt.Errorf("unkown cluster: %#v", job.Cluster)
}
if job.SubCluster != "" {
for _, sc := range cluster.SubClusters {
if sc.Name == job.SubCluster {
return nil
}
}
return fmt.Errorf("already assigned subcluster %#v unkown (cluster: %#v)", job.SubCluster, job.Cluster)
}
if len(job.Resources) == 0 {
return fmt.Errorf("job without any resources/hosts")
}
host0 := job.Resources[0].Hostname
for sc, nl := range nodeLists[job.Cluster] {
if nl != nil && nl.Contains(host0) {
job.SubCluster = sc
return nil
}
}
if cluster.SubClusters[0].Nodes == "" {
job.SubCluster = cluster.SubClusters[0].Name
return nil
}
return fmt.Errorf("no subcluster found for cluster %#v and host %#v", job.Cluster, host0)
}
func GetSubClusterByNode(cluster, hostname string) (string, error) {
for sc, nl := range nodeLists[cluster] {
if nl != nil && nl.Contains(hostname) {
return sc, nil
}
}
c := GetCluster(cluster)
if c == nil {
return "", fmt.Errorf("unkown cluster: %#v", cluster)
}
if c.SubClusters[0].Nodes == "" {
return c.SubClusters[0].Name, nil
}
return "", fmt.Errorf("no subcluster found for cluster %#v and host %#v", cluster, hostname)
}

internal/config/nodelist.go Normal file

@@ -0,0 +1,171 @@
package config
import (
"fmt"
"strconv"
"strings"
"github.com/ClusterCockpit/cc-backend/pkg/log"
)
type NodeList [][]interface {
consume(input string) (next string, ok bool)
}
func (nl *NodeList) Contains(name string) bool {
var ok bool
for _, term := range *nl {
str := name
for _, expr := range term {
str, ok = expr.consume(str)
if !ok {
break
}
}
if ok && str == "" {
return true
}
}
return false
}
type NLExprString string
func (nle NLExprString) consume(input string) (next string, ok bool) {
str := string(nle)
if strings.HasPrefix(input, str) {
return strings.TrimPrefix(input, str), true
}
return "", false
}
type NLExprIntRanges []NLExprIntRange
func (nles NLExprIntRanges) consume(input string) (next string, ok bool) {
for _, nle := range nles {
if next, ok := nle.consume(input); ok {
return next, ok
}
}
return "", false
}
type NLExprIntRange struct {
start, end int64
zeroPadded bool
digits int
}
func (nle NLExprIntRange) consume(input string) (next string, ok bool) {
if !nle.zeroPadded || nle.digits < 1 {
log.Error("node list: only zero-padded ranges are allowed")
return "", false
}
if len(input) < nle.digits {
return "", false
}
numerals, rest := input[:nle.digits], input[nle.digits:]
for len(numerals) > 1 && numerals[0] == '0' {
numerals = numerals[1:]
}
x, err := strconv.ParseInt(numerals, 10, 32)
if err != nil {
return "", false
}
if nle.start <= x && x <= nle.end {
return rest, true
}
return "", false
}
func ParseNodeList(raw string) (NodeList, error) {
isLetter := func(r byte) bool { return ('a' <= r && r <= 'z') || ('A' <= r && r <= 'Z') }
isDigit := func(r byte) bool { return '0' <= r && r <= '9' }
rawterms := []string{}
prevterm := 0
for i := 0; i < len(raw); i++ {
if raw[i] == '[' {
for i < len(raw) && raw[i] != ']' {
i++
}
if i == len(raw) {
return nil, fmt.Errorf("node list: unclosed '['")
}
} else if raw[i] == ',' {
rawterms = append(rawterms, raw[prevterm:i])
prevterm = i + 1
}
}
if prevterm != len(raw) {
rawterms = append(rawterms, raw[prevterm:])
}
nl := NodeList{}
for _, rawterm := range rawterms {
exprs := []interface {
consume(input string) (next string, ok bool)
}{}
for i := 0; i < len(rawterm); i++ {
c := rawterm[i]
if isLetter(c) || isDigit(c) {
j := i
for j < len(rawterm) && (isLetter(rawterm[j]) || isDigit(rawterm[j])) {
j++
}
exprs = append(exprs, NLExprString(rawterm[i:j]))
i = j - 1
} else if c == '[' {
end := strings.Index(rawterm[i:], "]")
if end == -1 {
return nil, fmt.Errorf("node list: unclosed '['")
}
parts := strings.Split(rawterm[i+1:i+end], ",")
nles := NLExprIntRanges{}
for _, part := range parts {
minus := strings.Index(part, "-")
if minus == -1 {
return nil, fmt.Errorf("node list: no '-' found inside '[...]'")
}
s1, s2 := part[0:minus], part[minus+1:]
if len(s1) != len(s2) || len(s1) == 0 {
return nil, fmt.Errorf("node list: %#v and %#v are not of equal length or of length zero", s1, s2)
}
x1, err := strconv.ParseInt(s1, 10, 32)
if err != nil {
return nil, fmt.Errorf("node list: %w", err)
}
x2, err := strconv.ParseInt(s2, 10, 32)
if err != nil {
return nil, fmt.Errorf("node list: %w", err)
}
nles = append(nles, NLExprIntRange{
start: x1,
end: x2,
digits: len(s1),
zeroPadded: true,
})
}
exprs = append(exprs, nles)
i += end
} else {
return nil, fmt.Errorf("node list: invalid character: %#v", rune(c))
}
}
nl = append(nl, exprs)
}
return nl, nil
}


@@ -0,0 +1,55 @@
package config
import (
"testing"
)
func TestNodeList(t *testing.T) {
nl, err := ParseNodeList("hallo,wel123t,emmy[01-99],fritz[005-500],woody[100-200]")
if err != nil {
t.Fatal(err)
}
if nl.Contains("hello") || nl.Contains("woody") {
t.Fail()
}
if nl.Contains("fritz1") || nl.Contains("fritz9") || nl.Contains("fritz004") || nl.Contains("woody201") {
t.Fail()
}
if !nl.Contains("hallo") || !nl.Contains("wel123t") {
t.Fail()
}
if !nl.Contains("emmy01") || !nl.Contains("emmy42") || !nl.Contains("emmy99") {
t.Fail()
}
if !nl.Contains("woody100") || !nl.Contains("woody199") {
t.Fail()
}
}
func TestNodeListCommasInBrackets(t *testing.T) {
nl, err := ParseNodeList("a[1000-2000,2010-2090,3000-5000]")
if err != nil {
t.Fatal(err)
}
if nl.Contains("hello") || nl.Contains("woody") {
t.Fatal("1")
}
if nl.Contains("a0") || nl.Contains("a0000") || nl.Contains("a5001") || nl.Contains("a2005") {
t.Fatal("2")
}
if !nl.Contains("a1001") || !nl.Contains("a2000") {
t.Fatal("3")
}
if !nl.Contains("a2042") || !nl.Contains("a4321") || !nl.Contains("a3000") {
t.Fatal("4")
}
}

File diff suppressed because it is too large


@@ -0,0 +1,122 @@
package model
import (
"encoding/json"
"strconv"
)
type Cluster struct {
Name string `json:"name"`
MetricConfig []*MetricConfig `json:"metricConfig"`
FilterRanges *FilterRanges `json:"filterRanges"`
SubClusters []*SubCluster `json:"subClusters"`
// NOT part of the GraphQL API. This has to be a JSON object with a field `"kind"`.
// All other fields depend on that kind (e.g. "cc-metric-store", "influxdb-v2").
MetricDataRepository json.RawMessage `json:"metricDataRepository"`
}
// Return a list of socket IDs given a list of hwthread IDs.
// Even if just one hwthread is in that socket, add it to the list.
// If no hwthreads other than those in the argument list are assigned to
// one of the sockets in the first return value, return true as the second value.
// TODO: Optimize this, there must be a more efficient way/algorithm.
func (topo *Topology) GetSocketsFromHWThreads(hwthreads []int) (sockets []int, exclusive bool) {
socketsMap := map[int]int{}
for _, hwthread := range hwthreads {
for socket, hwthreadsInSocket := range topo.Socket {
for _, hwthreadInSocket := range hwthreadsInSocket {
if hwthread == hwthreadInSocket {
socketsMap[socket] += 1
}
}
}
}
exclusive = true
hwthreadsPerSocket := len(topo.Node) / len(topo.Socket)
sockets = make([]int, 0, len(socketsMap))
for socket, count := range socketsMap {
sockets = append(sockets, socket)
exclusive = exclusive && count == hwthreadsPerSocket
}
return sockets, exclusive
}
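// Added example (constructed topology, not from the source): a node with two sockets
// of two hwthreads each illustrates the exclusivity flag:
//
//	topo := &Topology{Node: []int{0, 1, 2, 3}, Socket: [][]int{{0, 1}, {2, 3}}}
//	topo.GetSocketsFromHWThreads([]int{0, 1}) // -> [0], true: socket 0 is fully covered
//	topo.GetSocketsFromHWThreads([]int{0, 2}) // -> [0 1] (map order unspecified), false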
// Return a list of core IDs given a list of hwthread IDs.
// Even if just one hwthread is in that core, add it to the list.
// If no hwthreads other than those in the argument list are assigned to
// one of the cores in the first return value, return true as the second value.
// TODO: Optimize this, there must be a more efficient way/algorithm.
func (topo *Topology) GetCoresFromHWThreads(hwthreads []int) (cores []int, exclusive bool) {
coresMap := map[int]int{}
for _, hwthread := range hwthreads {
for core, hwthreadsInCore := range topo.Core {
for _, hwthreadInCore := range hwthreadsInCore {
if hwthread == hwthreadInCore {
coresMap[core] += 1
}
}
}
}
exclusive = true
hwthreadsPerCore := len(topo.Node) / len(topo.Core)
cores = make([]int, 0, len(coresMap))
for core, count := range coresMap {
cores = append(cores, core)
exclusive = exclusive && count == hwthreadsPerCore
}
return cores, exclusive
}
// Return a list of memory domain IDs given a list of hwthread IDs.
// Even if just one hwthread is in that memory domain, add it to the list.
// If no hwthreads other than those in the argument list are assigned to
// one of the memory domains in the first return value, return true as the second value.
// TODO: Optimize this, there must be a more efficient way/algorithm.
func (topo *Topology) GetMemoryDomainsFromHWThreads(hwthreads []int) (memDoms []int, exclusive bool) {
memDomsMap := map[int]int{}
for _, hwthread := range hwthreads {
for memDom, hwthreadsInmemDom := range topo.MemoryDomain {
for _, hwthreadInmemDom := range hwthreadsInmemDom {
if hwthread == hwthreadInmemDom {
memDomsMap[memDom] += 1
}
}
}
}
exclusive = true
hwthreadsPermemDom := len(topo.Node) / len(topo.MemoryDomain)
memDoms = make([]int, 0, len(memDomsMap))
for memDom, count := range memDomsMap {
memDoms = append(memDoms, memDom)
exclusive = exclusive && count == hwthreadsPermemDom
}
return memDoms, exclusive
}
func (topo *Topology) GetAcceleratorIDs() ([]int, error) {
accels := make([]int, 0)
for _, accel := range topo.Accelerators {
id, err := strconv.Atoi(accel.ID)
if err != nil {
return nil, err
}
accels = append(accels, id)
}
return accels, nil
}
func (topo *Topology) GetAcceleratorIndex(id string) (int, bool) {
for idx, accel := range topo.Accelerators {
if accel.ID == id {
return idx, true
}
}
return -1, false
}


@@ -0,0 +1,310 @@
// Code generated by github.com/99designs/gqlgen, DO NOT EDIT.
package model
import (
"fmt"
"io"
"strconv"
"time"
"github.com/ClusterCockpit/cc-backend/pkg/schema"
)
type Accelerator struct {
ID string `json:"id"`
Type string `json:"type"`
Model string `json:"model"`
}
type Count struct {
Name string `json:"name"`
Count int `json:"count"`
}
type FilterRanges struct {
Duration *IntRangeOutput `json:"duration"`
NumNodes *IntRangeOutput `json:"numNodes"`
StartTime *TimeRangeOutput `json:"startTime"`
}
type FloatRange struct {
From float64 `json:"from"`
To float64 `json:"to"`
}
type Footprints struct {
Nodehours []schema.Float `json:"nodehours"`
Metrics []*MetricFootprints `json:"metrics"`
}
type HistoPoint struct {
Count int `json:"count"`
Value int `json:"value"`
}
type IntRange struct {
From int `json:"from"`
To int `json:"to"`
}
type IntRangeOutput struct {
From int `json:"from"`
To int `json:"to"`
}
type JobFilter struct {
Tags []string `json:"tags"`
JobID *StringInput `json:"jobId"`
ArrayJobID *int `json:"arrayJobId"`
User *StringInput `json:"user"`
Project *StringInput `json:"project"`
Cluster *StringInput `json:"cluster"`
Partition *StringInput `json:"partition"`
Duration *IntRange `json:"duration"`
MinRunningFor *int `json:"minRunningFor"`
NumNodes *IntRange `json:"numNodes"`
NumAccelerators *IntRange `json:"numAccelerators"`
NumHWThreads *IntRange `json:"numHWThreads"`
StartTime *TimeRange `json:"startTime"`
State []schema.JobState `json:"state"`
FlopsAnyAvg *FloatRange `json:"flopsAnyAvg"`
MemBwAvg *FloatRange `json:"memBwAvg"`
LoadAvg *FloatRange `json:"loadAvg"`
MemUsedMax *FloatRange `json:"memUsedMax"`
}
type JobMetricWithName struct {
Name string `json:"name"`
Metric *schema.JobMetric `json:"metric"`
}
type JobResultList struct {
Items []*schema.Job `json:"items"`
Offset *int `json:"offset"`
Limit *int `json:"limit"`
Count *int `json:"count"`
}
type JobsStatistics struct {
ID string `json:"id"`
TotalJobs int `json:"totalJobs"`
ShortJobs int `json:"shortJobs"`
TotalWalltime int `json:"totalWalltime"`
TotalCoreHours int `json:"totalCoreHours"`
HistDuration []*HistoPoint `json:"histDuration"`
HistNumNodes []*HistoPoint `json:"histNumNodes"`
}
type MetricConfig struct {
Name string `json:"name"`
Unit string `json:"unit"`
Scope schema.MetricScope `json:"scope"`
Aggregation *string `json:"aggregation"`
Timestep int `json:"timestep"`
Peak *float64 `json:"peak"`
Normal *float64 `json:"normal"`
Caution *float64 `json:"caution"`
Alert *float64 `json:"alert"`
SubClusters []*SubClusterConfig `json:"subClusters"`
}
type MetricFootprints struct {
Metric string `json:"metric"`
Data []schema.Float `json:"data"`
}
type NodeMetrics struct {
Host string `json:"host"`
SubCluster string `json:"subCluster"`
Metrics []*JobMetricWithName `json:"metrics"`
}
type OrderByInput struct {
Field string `json:"field"`
Order SortDirectionEnum `json:"order"`
}
type PageRequest struct {
ItemsPerPage int `json:"itemsPerPage"`
Page int `json:"page"`
}
type StringInput struct {
Eq *string `json:"eq"`
Contains *string `json:"contains"`
StartsWith *string `json:"startsWith"`
EndsWith *string `json:"endsWith"`
}
type SubCluster struct {
Name string `json:"name"`
Nodes string `json:"nodes"`
NumberOfNodes int `json:"numberOfNodes"`
ProcessorType string `json:"processorType"`
SocketsPerNode int `json:"socketsPerNode"`
CoresPerSocket int `json:"coresPerSocket"`
ThreadsPerCore int `json:"threadsPerCore"`
FlopRateScalar int `json:"flopRateScalar"`
FlopRateSimd int `json:"flopRateSimd"`
MemoryBandwidth int `json:"memoryBandwidth"`
Topology *Topology `json:"topology"`
}
type SubClusterConfig struct {
Name string `json:"name"`
Peak float64 `json:"peak"`
Normal float64 `json:"normal"`
Caution float64 `json:"caution"`
Alert float64 `json:"alert"`
}
type TimeRange struct {
From *time.Time `json:"from"`
To *time.Time `json:"to"`
}
type TimeRangeOutput struct {
From time.Time `json:"from"`
To time.Time `json:"to"`
}
type Topology struct {
Node []int `json:"node"`
Socket [][]int `json:"socket"`
MemoryDomain [][]int `json:"memoryDomain"`
Die [][]int `json:"die"`
Core [][]int `json:"core"`
Accelerators []*Accelerator `json:"accelerators"`
}
type User struct {
Username string `json:"username"`
Name string `json:"name"`
Email string `json:"email"`
}
type Aggregate string
const (
AggregateUser Aggregate = "USER"
AggregateProject Aggregate = "PROJECT"
AggregateCluster Aggregate = "CLUSTER"
)
var AllAggregate = []Aggregate{
AggregateUser,
AggregateProject,
AggregateCluster,
}
func (e Aggregate) IsValid() bool {
switch e {
case AggregateUser, AggregateProject, AggregateCluster:
return true
}
return false
}
func (e Aggregate) String() string {
return string(e)
}
func (e *Aggregate) UnmarshalGQL(v interface{}) error {
str, ok := v.(string)
if !ok {
return fmt.Errorf("enums must be strings")
}
*e = Aggregate(str)
if !e.IsValid() {
return fmt.Errorf("%s is not a valid Aggregate", str)
}
return nil
}
func (e Aggregate) MarshalGQL(w io.Writer) {
fmt.Fprint(w, strconv.Quote(e.String()))
}
type SortDirectionEnum string
const (
SortDirectionEnumDesc SortDirectionEnum = "DESC"
SortDirectionEnumAsc SortDirectionEnum = "ASC"
)
var AllSortDirectionEnum = []SortDirectionEnum{
SortDirectionEnumDesc,
SortDirectionEnumAsc,
}
func (e SortDirectionEnum) IsValid() bool {
switch e {
case SortDirectionEnumDesc, SortDirectionEnumAsc:
return true
}
return false
}
func (e SortDirectionEnum) String() string {
return string(e)
}
func (e *SortDirectionEnum) UnmarshalGQL(v interface{}) error {
str, ok := v.(string)
if !ok {
return fmt.Errorf("enums must be strings")
}
*e = SortDirectionEnum(str)
if !e.IsValid() {
return fmt.Errorf("%s is not a valid SortDirectionEnum", str)
}
return nil
}
func (e SortDirectionEnum) MarshalGQL(w io.Writer) {
fmt.Fprint(w, strconv.Quote(e.String()))
}
type Weights string
const (
WeightsNodeCount Weights = "NODE_COUNT"
WeightsNodeHours Weights = "NODE_HOURS"
)
var AllWeights = []Weights{
WeightsNodeCount,
WeightsNodeHours,
}
func (e Weights) IsValid() bool {
switch e {
case WeightsNodeCount, WeightsNodeHours:
return true
}
return false
}
func (e Weights) String() string {
return string(e)
}
func (e *Weights) UnmarshalGQL(v interface{}) error {
str, ok := v.(string)
if !ok {
return fmt.Errorf("enums must be strings")
}
*e = Weights(str)
if !e.IsValid() {
return fmt.Errorf("%s is not a valid Weights", str)
}
return nil
}
func (e Weights) MarshalGQL(w io.Writer) {
fmt.Fprint(w, strconv.Quote(e.String()))
}


@@ -0,0 +1,15 @@
package graph
import (
"github.com/ClusterCockpit/cc-backend/internal/repository"
"github.com/jmoiron/sqlx"
)
// This file will not be regenerated automatically.
//
// It serves as dependency injection for your app, add any dependencies you require here.
type Resolver struct {
DB *sqlx.DB
Repo *repository.JobRepository
}

View File

@@ -0,0 +1,275 @@
scalar Time
scalar Any
scalar NullableFloat
scalar MetricScope
scalar JobState
type Job {
id: ID!
jobId: Int!
user: String!
project: String!
cluster: String!
subCluster: String!
startTime: Time!
duration: Int!
walltime: Int!
numNodes: Int!
numHWThreads: Int!
numAcc: Int!
SMT: Int!
exclusive: Int!
partition: String!
arrayJobId: Int!
monitoringStatus: Int!
state: JobState!
tags: [Tag!]!
resources: [Resource!]!
metaData: Any
userData: User
}
type Cluster {
name: String!
partitions: [String!]! # Slurm partitions
metricConfig: [MetricConfig!]!
filterRanges: FilterRanges!
subClusters: [SubCluster!]! # Hardware partitions/subclusters
}
type SubCluster {
name: String!
nodes: String!
numberOfNodes: Int!
processorType: String!
socketsPerNode: Int!
coresPerSocket: Int!
threadsPerCore: Int!
flopRateScalar: Int!
flopRateSimd: Int!
memoryBandwidth: Int!
topology: Topology!
}
type Topology {
node: [Int!]
socket: [[Int!]!]
memoryDomain: [[Int!]!]
die: [[Int!]!]
core: [[Int!]!]
accelerators: [Accelerator!]
}
type Accelerator {
id: String!
type: String!
model: String!
}
type SubClusterConfig {
name: String!
peak: Float!
normal: Float!
caution: Float!
alert: Float!
}
type MetricConfig {
name: String!
unit: String!
scope: MetricScope!
aggregation: String
timestep: Int!
peak: Float
normal: Float
caution: Float
alert: Float
subClusters: [SubClusterConfig]
}
type Tag {
id: ID!
type: String!
name: String!
}
type Resource {
hostname: String!
hwthreads: [Int!]
accelerators: [String!]
configuration: String
}
type JobMetricWithName {
name: String!
metric: JobMetric!
}
type JobMetric {
unit: String!
scope: MetricScope!
timestep: Int!
series: [Series!]
statisticsSeries: StatsSeries
}
type Series {
hostname: String!
id: Int
statistics: MetricStatistics
data: [NullableFloat!]!
}
type MetricStatistics {
avg: Float!
min: Float!
max: Float!
}
type StatsSeries {
mean: [NullableFloat!]!
min: [NullableFloat!]!
max: [NullableFloat!]!
}
type MetricFootprints {
metric: String!
data: [NullableFloat!]!
}
type Footprints {
nodehours: [NullableFloat!]!
metrics: [MetricFootprints!]!
}
enum Aggregate { USER, PROJECT, CLUSTER }
enum Weights { NODE_COUNT, NODE_HOURS }
type NodeMetrics {
host: String!
subCluster: String!
metrics: [JobMetricWithName!]!
}
type Count {
name: String!
count: Int!
}
type User {
username: String!
name: String!
email: String!
}
type Query {
clusters: [Cluster!]! # List of all clusters
tags: [Tag!]! # List of all tags
user(username: String!): User
allocatedNodes(cluster: String!): [Count!]!
job(id: ID!): Job
jobMetrics(id: ID!, metrics: [String!], scopes: [MetricScope!]): [JobMetricWithName!]!
jobsFootprints(filter: [JobFilter!], metrics: [String!]!): Footprints
jobs(filter: [JobFilter!], page: PageRequest, order: OrderByInput): JobResultList!
jobsStatistics(filter: [JobFilter!], groupBy: Aggregate): [JobsStatistics!]!
jobsCount(filter: [JobFilter]!, groupBy: Aggregate!, weight: Weights, limit: Int): [Count!]!
rooflineHeatmap(filter: [JobFilter!]!, rows: Int!, cols: Int!, minX: Float!, minY: Float!, maxX: Float!, maxY: Float!): [[Float!]!]!
nodeMetrics(cluster: String!, nodes: [String!], scopes: [MetricScope!], metrics: [String!], from: Time!, to: Time!): [NodeMetrics!]!
}
type Mutation {
createTag(type: String!, name: String!): Tag!
deleteTag(id: ID!): ID!
addTagsToJob(job: ID!, tagIds: [ID!]!): [Tag!]!
removeTagsFromJob(job: ID!, tagIds: [ID!]!): [Tag!]!
updateConfiguration(name: String!, value: String!): String
}
type IntRangeOutput { from: Int!, to: Int! }
type TimeRangeOutput { from: Time!, to: Time! }
type FilterRanges {
duration: IntRangeOutput!
numNodes: IntRangeOutput!
startTime: TimeRangeOutput!
}
input JobFilter {
tags: [ID!]
jobId: StringInput
arrayJobId: Int
user: StringInput
project: StringInput
cluster: StringInput
partition: StringInput
duration: IntRange
minRunningFor: Int
numNodes: IntRange
numAccelerators: IntRange
numHWThreads: IntRange
startTime: TimeRange
state: [JobState!]
flopsAnyAvg: FloatRange
memBwAvg: FloatRange
loadAvg: FloatRange
memUsedMax: FloatRange
}
input OrderByInput {
field: String!
order: SortDirectionEnum! = ASC
}
enum SortDirectionEnum {
DESC
ASC
}
input StringInput {
eq: String
contains: String
startsWith: String
endsWith: String
}
input IntRange { from: Int!, to: Int! }
input FloatRange { from: Float!, to: Float! }
input TimeRange { from: Time, to: Time }
type JobResultList {
items: [Job!]!
offset: Int
limit: Int
count: Int
}
type HistoPoint {
count: Int!
value: Int!
}
type JobsStatistics {
id: ID! # If `groupBy` was used, ID of the user/project/cluster
totalJobs: Int! # Number of jobs that matched
shortJobs: Int! # Number of jobs with a duration of less than five minutes
totalWalltime: Int! # Sum of the duration of all matched jobs in hours
totalCoreHours: Int! # Sum of the core hours of all matched jobs
histDuration: [HistoPoint!]! # value: hour, count: number of jobs with a rounded duration of value
histNumNodes: [HistoPoint!]! # value: number of nodes, count: number of jobs with that number of nodes
}
input PageRequest {
itemsPerPage: Int!
page: Int!
}
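# Editor's sketch of a query against this schema; the cluster name and the
# field selection are placeholders, not part of the original file.
# query {
#   jobs(filter: [{cluster: {eq: "emmy"}}],
#        page: {itemsPerPage: 10, page: 1},
#        order: {field: "startTime", order: DESC}) {
#     items { jobId user duration numNodes }
#     count
#   }
# }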

View File

@@ -0,0 +1,280 @@
package graph
// This file will be automatically regenerated based on the schema, any resolver implementations
// will be copied through when generating and any unknown code will be moved to the end.
import (
"context"
"errors"
"fmt"
"strconv"
"time"
"github.com/ClusterCockpit/cc-backend/internal/auth"
"github.com/ClusterCockpit/cc-backend/internal/config"
"github.com/ClusterCockpit/cc-backend/internal/graph/generated"
"github.com/ClusterCockpit/cc-backend/internal/graph/model"
"github.com/ClusterCockpit/cc-backend/internal/metricdata"
"github.com/ClusterCockpit/cc-backend/pkg/schema"
)
func (r *clusterResolver) Partitions(ctx context.Context, obj *model.Cluster) ([]string, error) {
return r.Repo.Partitions(obj.Name)
}
func (r *jobResolver) Tags(ctx context.Context, obj *schema.Job) ([]*schema.Tag, error) {
return r.Repo.GetTags(&obj.ID)
}
func (r *jobResolver) MetaData(ctx context.Context, obj *schema.Job) (interface{}, error) {
return r.Repo.FetchMetadata(obj)
}
func (r *jobResolver) UserData(ctx context.Context, obj *schema.Job) (*model.User, error) {
return auth.FetchUser(ctx, r.DB, obj.User)
}
func (r *mutationResolver) CreateTag(ctx context.Context, typeArg string, name string) (*schema.Tag, error) {
id, err := r.Repo.CreateTag(typeArg, name)
if err != nil {
return nil, err
}
return &schema.Tag{ID: id, Type: typeArg, Name: name}, nil
}
func (r *mutationResolver) DeleteTag(ctx context.Context, id string) (string, error) {
// The UI currently does not allow this anyway.
panic(fmt.Errorf("not implemented"))
}
func (r *mutationResolver) AddTagsToJob(ctx context.Context, job string, tagIds []string) ([]*schema.Tag, error) {
jid, err := strconv.ParseInt(job, 10, 64)
if err != nil {
return nil, err
}
tags := []*schema.Tag{}
for _, tagId := range tagIds {
tid, err := strconv.ParseInt(tagId, 10, 64)
if err != nil {
return nil, err
}
if tags, err = r.Repo.AddTag(jid, tid); err != nil {
return nil, err
}
}
return tags, nil
}
func (r *mutationResolver) RemoveTagsFromJob(ctx context.Context, job string, tagIds []string) ([]*schema.Tag, error) {
jid, err := strconv.ParseInt(job, 10, 64)
if err != nil {
return nil, err
}
tags := []*schema.Tag{}
for _, tagId := range tagIds {
tid, err := strconv.ParseInt(tagId, 10, 64)
if err != nil {
return nil, err
}
if tags, err = r.Repo.RemoveTag(jid, tid); err != nil {
return nil, err
}
}
return tags, nil
}
func (r *mutationResolver) UpdateConfiguration(ctx context.Context, name string, value string) (*string, error) {
if err := config.UpdateConfig(name, value, ctx); err != nil {
return nil, err
}
return nil, nil
}
func (r *queryResolver) Clusters(ctx context.Context) ([]*model.Cluster, error) {
return config.Clusters, nil
}
func (r *queryResolver) Tags(ctx context.Context) ([]*schema.Tag, error) {
return r.Repo.GetTags(nil)
}
func (r *queryResolver) User(ctx context.Context, username string) (*model.User, error) {
return auth.FetchUser(ctx, r.DB, username)
}
func (r *queryResolver) AllocatedNodes(ctx context.Context, cluster string) ([]*model.Count, error) {
data, err := r.Repo.AllocatedNodes(cluster)
if err != nil {
return nil, err
}
counts := make([]*model.Count, 0, len(data))
for subcluster, hosts := range data {
counts = append(counts, &model.Count{
Name: subcluster,
Count: len(hosts),
})
}
return counts, nil
}
func (r *queryResolver) Job(ctx context.Context, id string) (*schema.Job, error) {
numericId, err := strconv.ParseInt(id, 10, 64)
if err != nil {
return nil, err
}
job, err := r.Repo.FindById(numericId)
if err != nil {
return nil, err
}
if user := auth.GetUser(ctx); user != nil && !user.HasRole(auth.RoleAdmin) && job.User != user.Username {
return nil, errors.New("you are not allowed to see this job")
}
return job, nil
}
func (r *queryResolver) JobMetrics(ctx context.Context, id string, metrics []string, scopes []schema.MetricScope) ([]*model.JobMetricWithName, error) {
job, err := r.Query().Job(ctx, id)
if err != nil {
return nil, err
}
data, err := metricdata.LoadData(job, metrics, scopes, ctx)
if err != nil {
return nil, err
}
res := []*model.JobMetricWithName{}
for name, md := range data {
for scope, metric := range md {
if metric.Scope != schema.MetricScope(scope) {
panic("unexpected scope mismatch in loaded job data")
}
res = append(res, &model.JobMetricWithName{
Name: name,
Metric: metric,
})
}
}
return res, err
}
func (r *queryResolver) JobsFootprints(ctx context.Context, filter []*model.JobFilter, metrics []string) (*model.Footprints, error) {
return r.jobsFootprints(ctx, filter, metrics)
}
func (r *queryResolver) Jobs(ctx context.Context, filter []*model.JobFilter, page *model.PageRequest, order *model.OrderByInput) (*model.JobResultList, error) {
if page == nil {
page = &model.PageRequest{
ItemsPerPage: 50,
Page: 1,
}
}
jobs, err := r.Repo.QueryJobs(ctx, filter, page, order)
if err != nil {
return nil, err
}
count, err := r.Repo.CountJobs(ctx, filter)
if err != nil {
return nil, err
}
return &model.JobResultList{Items: jobs, Count: &count}, nil
}
func (r *queryResolver) JobsStatistics(ctx context.Context, filter []*model.JobFilter, groupBy *model.Aggregate) ([]*model.JobsStatistics, error) {
return r.jobsStatistics(ctx, filter, groupBy)
}
func (r *queryResolver) JobsCount(ctx context.Context, filter []*model.JobFilter, groupBy model.Aggregate, weight *model.Weights, limit *int) ([]*model.Count, error) {
counts, err := r.Repo.CountGroupedJobs(ctx, groupBy, filter, weight, limit)
if err != nil {
return nil, err
}
res := make([]*model.Count, 0, len(counts))
for name, count := range counts {
res = append(res, &model.Count{
Name: name,
Count: count,
})
}
return res, nil
}
func (r *queryResolver) RooflineHeatmap(ctx context.Context, filter []*model.JobFilter, rows int, cols int, minX float64, minY float64, maxX float64, maxY float64) ([][]float64, error) {
return r.rooflineHeatmap(ctx, filter, rows, cols, minX, minY, maxX, maxY)
}
func (r *queryResolver) NodeMetrics(ctx context.Context, cluster string, nodes []string, scopes []schema.MetricScope, metrics []string, from time.Time, to time.Time) ([]*model.NodeMetrics, error) {
user := auth.GetUser(ctx)
if user != nil && !user.HasRole(auth.RoleAdmin) {
return nil, errors.New("you need to be an administrator for this query")
}
if metrics == nil {
for _, mc := range config.GetCluster(cluster).MetricConfig {
metrics = append(metrics, mc.Name)
}
}
data, err := metricdata.LoadNodeData(cluster, metrics, nodes, scopes, from, to, ctx)
if err != nil {
return nil, err
}
nodeMetrics := make([]*model.NodeMetrics, 0, len(data))
for hostname, metrics := range data {
host := &model.NodeMetrics{
Host: hostname,
Metrics: make([]*model.JobMetricWithName, 0, len(metrics)*len(scopes)),
}
host.SubCluster, _ = config.GetSubClusterByNode(cluster, hostname)
for metric, scopedMetrics := range metrics {
for _, scopedMetric := range scopedMetrics {
host.Metrics = append(host.Metrics, &model.JobMetricWithName{
Name: metric,
Metric: scopedMetric,
})
}
}
nodeMetrics = append(nodeMetrics, host)
}
return nodeMetrics, nil
}
// Cluster returns generated.ClusterResolver implementation.
func (r *Resolver) Cluster() generated.ClusterResolver { return &clusterResolver{r} }
// Job returns generated.JobResolver implementation.
func (r *Resolver) Job() generated.JobResolver { return &jobResolver{r} }
// Mutation returns generated.MutationResolver implementation.
func (r *Resolver) Mutation() generated.MutationResolver { return &mutationResolver{r} }
// Query returns generated.QueryResolver implementation.
func (r *Resolver) Query() generated.QueryResolver { return &queryResolver{r} }
type clusterResolver struct{ *Resolver }
type jobResolver struct{ *Resolver }
type mutationResolver struct{ *Resolver }
type queryResolver struct{ *Resolver }

302
internal/graph/stats.go Normal file
View File

@@ -0,0 +1,302 @@
package graph
import (
"context"
"database/sql"
"errors"
"fmt"
"math"
"time"
"github.com/99designs/gqlgen/graphql"
"github.com/ClusterCockpit/cc-backend/internal/config"
"github.com/ClusterCockpit/cc-backend/internal/graph/model"
"github.com/ClusterCockpit/cc-backend/internal/metricdata"
"github.com/ClusterCockpit/cc-backend/internal/repository"
"github.com/ClusterCockpit/cc-backend/pkg/schema"
sq "github.com/Masterminds/squirrel"
)
// GraphQL validation should make sure that no unknown values can be specified.
var groupBy2column = map[model.Aggregate]string{
model.AggregateUser: "job.user",
model.AggregateProject: "job.project",
model.AggregateCluster: "job.cluster",
}
const ShortJobDuration int = 5 * 60
// Helper function for the jobsStatistics GraphQL query placed here so that schema.resolvers.go is not too full.
func (r *queryResolver) jobsStatistics(ctx context.Context, filter []*model.JobFilter, groupBy *model.Aggregate) ([]*model.JobsStatistics, error) {
// In case `groupBy` is nil (not used), the aggregated model.JobsStatistics is stored at the key '' (empty string)
stats := map[string]*model.JobsStatistics{}
// `socketsPerNode` and `coresPerSocket` can differ from subcluster to subcluster, so we have to loop over them explicitly.
for _, cluster := range config.Clusters {
for _, subcluster := range cluster.SubClusters {
corehoursCol := fmt.Sprintf("CAST(ROUND(SUM(job.duration * job.num_nodes * %d * %d) / 3600) as int)", subcluster.SocketsPerNode, subcluster.CoresPerSocket)
var query sq.SelectBuilder
if groupBy == nil {
query = sq.Select(
"''",
"COUNT(job.id)",
"CAST(ROUND(SUM(job.duration) / 3600) as int)",
corehoursCol,
).From("job")
} else {
col := groupBy2column[*groupBy]
query = sq.Select(
col,
"COUNT(job.id)",
"CAST(ROUND(SUM(job.duration) / 3600) as int)",
corehoursCol,
).From("job").GroupBy(col)
}
query = query.
Where("job.cluster = ?", cluster.Name).
Where("job.subcluster = ?", subcluster.Name)
query = repository.SecurityCheck(ctx, query)
for _, f := range filter {
query = repository.BuildWhereClause(f, query)
}
rows, err := query.RunWith(r.DB).Query()
if err != nil {
return nil, err
}
for rows.Next() {
var id sql.NullString
var jobs, walltime, corehours sql.NullInt64
if err := rows.Scan(&id, &jobs, &walltime, &corehours); err != nil {
return nil, err
}
if id.Valid {
if s, ok := stats[id.String]; ok {
s.TotalJobs += int(jobs.Int64)
s.TotalWalltime += int(walltime.Int64)
s.TotalCoreHours += int(corehours.Int64)
} else {
stats[id.String] = &model.JobsStatistics{
ID: id.String,
TotalJobs: int(jobs.Int64),
TotalWalltime: int(walltime.Int64),
TotalCoreHours: int(corehours.Int64),
}
}
}
}
}
}
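// Editor's sketch of the SQL the loop above builds for groupBy == USER
// (roughly; security and filter conditions are appended afterwards):
//   SELECT job.user, COUNT(job.id),
//          CAST(ROUND(SUM(job.duration) / 3600) as int), <corehoursCol>
//   FROM job
//   WHERE job.cluster = ? AND job.subcluster = ?
//   GROUP BY job.user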
if groupBy == nil {
query := sq.Select("COUNT(job.id)").From("job").Where("job.duration < ?", ShortJobDuration)
query = repository.SecurityCheck(ctx, query)
for _, f := range filter {
query = repository.BuildWhereClause(f, query)
}
if err := query.RunWith(r.DB).QueryRow().Scan(&(stats[""].ShortJobs)); err != nil {
return nil, err
}
} else {
col := groupBy2column[*groupBy]
query := sq.Select(col, "COUNT(job.id)").From("job").Where("job.duration < ?", ShortJobDuration)
query = repository.SecurityCheck(ctx, query)
for _, f := range filter {
query = repository.BuildWhereClause(f, query)
}
rows, err := query.RunWith(r.DB).Query()
if err != nil {
return nil, err
}
for rows.Next() {
var id sql.NullString
var shortJobs sql.NullInt64
if err := rows.Scan(&id, &shortJobs); err != nil {
return nil, err
}
if id.Valid {
stats[id.String].ShortJobs = int(shortJobs.Int64)
}
}
}
// Calculating the histogram data is expensive, so only do it if needed.
// An explicit resolver cannot be used because we need to know the filters.
histogramsNeeded := false
fields := graphql.CollectFieldsCtx(ctx, nil)
for _, col := range fields {
if col.Name == "histDuration" || col.Name == "histNumNodes" {
histogramsNeeded = true
}
}
res := make([]*model.JobsStatistics, 0, len(stats))
for _, stat := range stats {
res = append(res, stat)
id, col := "", ""
if groupBy != nil {
id = stat.ID
col = groupBy2column[*groupBy]
}
if histogramsNeeded {
var err error
value := fmt.Sprintf(`CAST(ROUND((CASE WHEN job.job_state = "running" THEN %d - job.start_time ELSE job.duration END) / 3600) as int) as value`, time.Now().Unix())
stat.HistDuration, err = r.jobsStatisticsHistogram(ctx, value, filter, id, col)
if err != nil {
return nil, err
}
stat.HistNumNodes, err = r.jobsStatisticsHistogram(ctx, "job.num_nodes as value", filter, id, col)
if err != nil {
return nil, err
}
}
}
return res, nil
}
// `value` must be the expression to group by, aliased to "value". `id` and `col` can optionally be used
// to add a condition of the form "<col> = <id>" to the query.
func (r *queryResolver) jobsStatisticsHistogram(ctx context.Context, value string, filters []*model.JobFilter, id, col string) ([]*model.HistoPoint, error) {
query := sq.Select(value, "COUNT(job.id) AS count").From("job")
query = repository.SecurityCheck(ctx, query)
for _, f := range filters {
query = repository.BuildWhereClause(f, query)
}
if len(id) != 0 && len(col) != 0 {
query = query.Where(col+" = ?", id)
}
rows, err := query.GroupBy("value").RunWith(r.DB).Query()
if err != nil {
return nil, err
}
points := make([]*model.HistoPoint, 0)
for rows.Next() {
point := model.HistoPoint{}
if err := rows.Scan(&point.Value, &point.Count); err != nil {
return nil, err
}
points = append(points, &point)
}
return points, nil
}
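// Editor's sketch of the SQL jobsStatisticsHistogram builds for histNumNodes
// (roughly):
//   SELECT job.num_nodes as value, COUNT(job.id) AS count
//   FROM job WHERE <security and filter conditions> [AND <col> = ?]
//   GROUP BY value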
const MAX_JOBS_FOR_ANALYSIS = 500
// Helper function for the rooflineHeatmap GraphQL query placed here so that schema.resolvers.go is not too full.
func (r *Resolver) rooflineHeatmap(ctx context.Context, filter []*model.JobFilter, rows int, cols int, minX float64, minY float64, maxX float64, maxY float64) ([][]float64, error) {
jobs, err := r.Repo.QueryJobs(ctx, filter, &model.PageRequest{Page: 1, ItemsPerPage: MAX_JOBS_FOR_ANALYSIS + 1}, nil)
if err != nil {
return nil, err
}
if len(jobs) > MAX_JOBS_FOR_ANALYSIS {
return nil, fmt.Errorf("too many jobs matched (max: %d)", MAX_JOBS_FOR_ANALYSIS)
}
fcols, frows := float64(cols), float64(rows)
minX, minY, maxX, maxY = math.Log10(minX), math.Log10(minY), math.Log10(maxX), math.Log10(maxY)
tiles := make([][]float64, rows)
for i := range tiles {
tiles[i] = make([]float64, cols)
}
for _, job := range jobs {
if job.MonitoringStatus == schema.MonitoringStatusDisabled || job.MonitoringStatus == schema.MonitoringStatusArchivingFailed {
continue
}
jobdata, err := metricdata.LoadData(job, []string{"flops_any", "mem_bw"}, []schema.MetricScope{schema.MetricScopeNode}, ctx)
if err != nil {
return nil, err
}
flops_, membw_ := jobdata["flops_any"], jobdata["mem_bw"]
if flops_ == nil && membw_ == nil {
return nil, fmt.Errorf("'flops_any' or 'mem_bw' missing for job %d", job.ID)
}
flops, ok1 := flops_["node"]
membw, ok2 := membw_["node"]
if !ok1 || !ok2 {
// TODO/FIXME:
return nil, errors.New("todo: rooflineHeatmap() query not implemented for where flops_any or mem_bw not available at 'node' level")
}
for n := 0; n < len(flops.Series); n++ {
flopsSeries, membwSeries := flops.Series[n], membw.Series[n]
for i := 0; i < len(flopsSeries.Data); i++ {
if i >= len(membwSeries.Data) {
break
}
x, y := math.Log10(float64(flopsSeries.Data[i]/membwSeries.Data[i])), math.Log10(float64(flopsSeries.Data[i]))
if math.IsNaN(x) || math.IsNaN(y) || x < minX || x >= maxX || y < minY || y > maxY {
continue
}
x, y = math.Floor(((x-minX)/(maxX-minX))*fcols), math.Floor(((y-minY)/(maxY-minY))*frows)
if x < 0 || x >= fcols || y < 0 || y >= frows {
continue
}
tiles[int(y)][int(x)] += 1
}
}
}
return tiles, nil
}
// Helper function for the jobsFootprints GraphQL query placed here so that schema.resolvers.go is not too full.
func (r *queryResolver) jobsFootprints(ctx context.Context, filter []*model.JobFilter, metrics []string) (*model.Footprints, error) {
jobs, err := r.Repo.QueryJobs(ctx, filter, &model.PageRequest{Page: 1, ItemsPerPage: MAX_JOBS_FOR_ANALYSIS + 1}, nil)
if err != nil {
return nil, err
}
if len(jobs) > MAX_JOBS_FOR_ANALYSIS {
return nil, fmt.Errorf("too many jobs matched (max: %d)", MAX_JOBS_FOR_ANALYSIS)
}
avgs := make([][]schema.Float, len(metrics))
for i := range avgs {
avgs[i] = make([]schema.Float, 0, len(jobs))
}
nodehours := make([]schema.Float, 0, len(jobs))
for _, job := range jobs {
if job.MonitoringStatus == schema.MonitoringStatusDisabled || job.MonitoringStatus == schema.MonitoringStatusArchivingFailed {
continue
}
if err := metricdata.LoadAverages(job, metrics, avgs, ctx); err != nil {
return nil, err
}
nodehours = append(nodehours, schema.Float(float64(job.Duration)/60.0*float64(job.NumNodes)))
}
res := make([]*model.MetricFootprints, len(avgs))
for i, arr := range avgs {
res[i] = &model.MetricFootprints{
Metric: metrics[i],
Data: arr,
}
}
return &model.Footprints{
Nodehours: nodehours,
Metrics: res,
}, nil
}

View File

@@ -0,0 +1,257 @@
package metricdata
import (
"bufio"
"context"
"encoding/json"
"errors"
"fmt"
"math"
"os"
"path"
"path/filepath"
"strconv"
"time"
"github.com/ClusterCockpit/cc-backend/internal/config"
"github.com/ClusterCockpit/cc-backend/pkg/schema"
)
// For a given job, return the path of the `data.json`/`meta.json` file.
// TODO: Implement Issue ClusterCockpit/ClusterCockpit#97
func getPath(job *schema.Job, file string, checkLegacy bool) (string, error) {
lvl1, lvl2 := fmt.Sprintf("%d", job.JobID/1000), fmt.Sprintf("%03d", job.JobID%1000)
if !checkLegacy {
return filepath.Join(JobArchivePath, job.Cluster, lvl1, lvl2, strconv.FormatInt(job.StartTime.Unix(), 10), file), nil
}
legacyPath := filepath.Join(JobArchivePath, job.Cluster, lvl1, lvl2, file)
if _, err := os.Stat(legacyPath); errors.Is(err, os.ErrNotExist) {
return filepath.Join(JobArchivePath, job.Cluster, lvl1, lvl2, strconv.FormatInt(job.StartTime.Unix(), 10), file), nil
}
return legacyPath, nil
}
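// Editor's sketch of the resulting layout for job id 1234567 started at
// unix time 1609459200 (values are placeholders):
//   <JobArchivePath>/<cluster>/1234/567/1609459200/data.json  (current scheme)
//   <JobArchivePath>/<cluster>/1234/567/data.json             (legacy scheme)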
// Assuming job is completed/archived, return the jobs metric data.
func loadFromArchive(job *schema.Job) (schema.JobData, error) {
filename, err := getPath(job, "data.json", true)
if err != nil {
return nil, err
}
data := cache.Get(filename, func() (value interface{}, ttl time.Duration, size int) {
f, err := os.Open(filename)
if err != nil {
return err, 0, 1000
}
defer f.Close()
var data schema.JobData
if err := json.NewDecoder(bufio.NewReader(f)).Decode(&data); err != nil {
return err, 0, 1000
}
return data, 1 * time.Hour, data.Size()
})
if err, ok := data.(error); ok {
return nil, err
}
return data.(schema.JobData), nil
}
func loadMetaJson(job *schema.Job) (*schema.JobMeta, error) {
filename, err := getPath(job, "meta.json", true)
if err != nil {
return nil, err
}
bytes, err := os.ReadFile(filename)
if err != nil {
return nil, err
}
var metaFile schema.JobMeta = schema.JobMeta{
BaseJob: schema.JobDefaults,
}
if err := json.Unmarshal(bytes, &metaFile); err != nil {
return nil, err
}
return &metaFile, nil
}
// If the job is archived, find its `meta.json` file and override the tags list
// in that JSON file. If the job is not archived, nothing is done.
func UpdateTags(job *schema.Job, tags []*schema.Tag) error {
if job.State == schema.JobStateRunning {
return nil
}
filename, err := getPath(job, "meta.json", true)
if err != nil {
return err
}
f, err := os.Open(filename)
if err != nil {
if os.IsNotExist(err) {
return nil
}
return err
}
var metaFile schema.JobMeta = schema.JobMeta{
BaseJob: schema.JobDefaults,
}
if err := json.NewDecoder(f).Decode(&metaFile); err != nil {
return err
}
f.Close()
metaFile.Tags = make([]*schema.Tag, 0)
for _, tag := range tags {
metaFile.Tags = append(metaFile.Tags, &schema.Tag{
Name: tag.Name,
Type: tag.Type,
})
}
bytes, err := json.Marshal(metaFile)
if err != nil {
return err
}
return os.WriteFile(filename, bytes, 0644)
}
// Helper for metricdata.LoadAverages().
func loadAveragesFromArchive(job *schema.Job, metrics []string, data [][]schema.Float) error {
metaFile, err := loadMetaJson(job)
if err != nil {
return err
}
for i, m := range metrics {
if stat, ok := metaFile.Statistics[m]; ok {
data[i] = append(data[i], schema.Float(stat.Avg))
} else {
data[i] = append(data[i], schema.NaN)
}
}
return nil
}
func GetStatistics(job *schema.Job) (map[string]schema.JobStatistics, error) {
metaFile, err := loadMetaJson(job)
if err != nil {
return nil, err
}
return metaFile.Statistics, nil
}
// Writes a running job to the job-archive
func ArchiveJob(job *schema.Job, ctx context.Context) (*schema.JobMeta, error) {
allMetrics := make([]string, 0)
metricConfigs := config.GetCluster(job.Cluster).MetricConfig
for _, mc := range metricConfigs {
allMetrics = append(allMetrics, mc.Name)
}
// TODO: Talk about this! What resolutions to store data at...
scopes := []schema.MetricScope{schema.MetricScopeNode}
if job.NumNodes <= 8 {
scopes = append(scopes, schema.MetricScopeCore)
}
jobData, err := LoadData(job, allMetrics, scopes, ctx)
if err != nil {
return nil, err
}
jobMeta := &schema.JobMeta{
BaseJob: job.BaseJob,
StartTime: job.StartTime.Unix(),
Statistics: make(map[string]schema.JobStatistics),
}
for metric, data := range jobData {
avg, min, max := 0.0, math.MaxFloat32, -math.MaxFloat32
nodeData, ok := data["node"]
if !ok {
// TODO/FIXME: Calc average for non-node metrics as well!
continue
}
for _, series := range nodeData.Series {
avg += series.Statistics.Avg
min = math.Min(min, series.Statistics.Min)
max = math.Max(max, series.Statistics.Max)
}
jobMeta.Statistics[metric] = schema.JobStatistics{
Unit: config.GetMetricConfig(job.Cluster, metric).Unit,
Avg: avg / float64(job.NumNodes),
Min: min,
Max: max,
}
}
// If the file based archive is disabled,
// only return the JobMeta structure as the
// statistics in there are needed.
if !useArchive {
return jobMeta, nil
}
dir, err := getPath(job, "", false)
if err != nil {
return nil, err
}
return jobMeta, writeFiles(dir, jobMeta, &jobData)
}
func writeFiles(dir string, jobMeta *schema.JobMeta, jobData *schema.JobData) error {
if err := os.MkdirAll(dir, 0777); err != nil {
return err
}
f, err := os.Create(path.Join(dir, "meta.json"))
if err != nil {
return err
}
if err := json.NewEncoder(f).Encode(jobMeta); err != nil {
return err
}
if err := f.Close(); err != nil {
return err
}
f, err = os.Create(path.Join(dir, "data.json"))
if err != nil {
return err
}
if err := json.NewEncoder(f).Encode(jobData); err != nil {
return err
}
return f.Close()
}
// Used to import a non-running job into the job-archive.
func ImportJob(job *schema.JobMeta, jobData *schema.JobData) error {
dir, err := getPath(&schema.Job{
BaseJob: job.BaseJob,
StartTimeUnix: job.StartTime,
StartTime: time.Unix(job.StartTime, 0),
}, "", false)
if err != nil {
return err
}
return writeFiles(dir, job, jobData)
}

View File

@@ -0,0 +1,611 @@
package metricdata
import (
"bufio"
"bytes"
"context"
"encoding/json"
"fmt"
"net/http"
"strconv"
"strings"
"time"
"github.com/ClusterCockpit/cc-backend/internal/config"
"github.com/ClusterCockpit/cc-backend/pkg/schema"
)
type CCMetricStoreConfig struct {
Kind string `json:"kind"`
Url string `json:"url"`
Token string `json:"token"`
// If metrics are known to this MetricDataRepository under a different
// name than in the `metricConfig` section of the 'cluster.json',
// provide this optional mapping of local to remote name for this metric.
Renamings map[string]string `json:"metricRenamings"`
}
type CCMetricStore struct {
jwt string
url string
queryEndpoint string
client http.Client
here2there map[string]string
there2here map[string]string
}
type ApiQueryRequest struct {
Cluster string `json:"cluster"`
From int64 `json:"from"`
To int64 `json:"to"`
WithStats bool `json:"with-stats"`
WithData bool `json:"with-data"`
Queries []ApiQuery `json:"queries"`
ForAllNodes []string `json:"for-all-nodes"`
}
type ApiQuery struct {
Metric string `json:"metric"`
Hostname string `json:"host"`
Aggregate bool `json:"aggreg"`
Type *string `json:"type,omitempty"`
TypeIds []string `json:"type-ids,omitempty"`
SubType *string `json:"subtype,omitempty"`
SubTypeIds []string `json:"subtype-ids,omitempty"`
}
type ApiQueryResponse struct {
Queries []ApiQuery `json:"queries,omitempty"`
Results [][]ApiMetricData `json:"results"`
}
type ApiMetricData struct {
Error *string `json:"error"`
From int64 `json:"from"`
To int64 `json:"to"`
Data []schema.Float `json:"data"`
Avg schema.Float `json:"avg"`
Min schema.Float `json:"min"`
Max schema.Float `json:"max"`
}
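// Editor's sketch of the JSON an ApiQueryRequest marshals to, following the
// struct tags above (cluster, host and metric names are placeholders):
//   {
//     "cluster": "emmy", "from": 1609459200, "to": 1609462800,
//     "with-stats": true, "with-data": true,
//     "queries": [ { "metric": "flops_any", "host": "node001", "aggreg": false } ]
//   }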
func (ccms *CCMetricStore) Init(rawConfig json.RawMessage) error {
var config CCMetricStoreConfig
if err := json.Unmarshal(rawConfig, &config); err != nil {
return err
}
ccms.url = config.Url
ccms.queryEndpoint = fmt.Sprintf("%s/api/query", config.Url)
ccms.jwt = config.Token
ccms.client = http.Client{
Timeout: 10 * time.Second,
}
if config.Renamings != nil {
ccms.here2there = config.Renamings
ccms.there2here = make(map[string]string, len(config.Renamings))
for k, v := range ccms.here2there {
ccms.there2here[v] = k
}
} else {
ccms.here2there = make(map[string]string)
ccms.there2here = make(map[string]string)
}
return nil
}
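// Editor's sketch of a matching `metricDataRepository` entry in the cluster
// configuration (URL, token and renaming are placeholders):
//   {
//     "kind": "cc-metric-store",
//     "url": "http://localhost:8081",
//     "token": "<jwt>",
//     "metricRenamings": { "mem_bw": "membw" }
//   }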
func (ccms *CCMetricStore) toRemoteName(metric string) string {
if renamed, ok := ccms.here2there[metric]; ok {
return renamed
}
return metric
}
func (ccms *CCMetricStore) toLocalName(metric string) string {
if renamed, ok := ccms.there2here[metric]; ok {
return renamed
}
return metric
}
func (ccms *CCMetricStore) doRequest(ctx context.Context, body *ApiQueryRequest) (*ApiQueryResponse, error) {
buf := &bytes.Buffer{}
if err := json.NewEncoder(buf).Encode(body); err != nil {
return nil, err
}
req, err := http.NewRequestWithContext(ctx, http.MethodPost, ccms.queryEndpoint, buf)
if err != nil {
return nil, err
}
if ccms.jwt != "" {
req.Header.Add("Authorization", fmt.Sprintf("Bearer %s", ccms.jwt))
}
res, err := ccms.client.Do(req)
if err != nil {
return nil, err
}
if res.StatusCode != http.StatusOK {
return nil, fmt.Errorf("'%s': HTTP Status: %s", ccms.queryEndpoint, res.Status)
}
var resBody ApiQueryResponse
if err := json.NewDecoder(bufio.NewReader(res.Body)).Decode(&resBody); err != nil {
return nil, err
}
return &resBody, nil
}
func (ccms *CCMetricStore) LoadData(job *schema.Job, metrics []string, scopes []schema.MetricScope, ctx context.Context) (schema.JobData, error) {
topology := config.GetSubCluster(job.Cluster, job.SubCluster).Topology
queries, assignedScope, err := ccms.buildQueries(job, metrics, scopes)
if err != nil {
return nil, err
}
req := ApiQueryRequest{
Cluster: job.Cluster,
From: job.StartTime.Unix(),
To: job.StartTime.Add(time.Duration(job.Duration) * time.Second).Unix(),
Queries: queries,
WithStats: true,
WithData: true,
}
resBody, err := ccms.doRequest(ctx, &req)
if err != nil {
return nil, err
}
var errors []string
var jobData schema.JobData = make(schema.JobData)
for i, row := range resBody.Results {
query := req.Queries[i]
metric := ccms.toLocalName(query.Metric)
scope := assignedScope[i]
mc := config.GetMetricConfig(job.Cluster, metric)
if _, ok := jobData[metric]; !ok {
jobData[metric] = make(map[schema.MetricScope]*schema.JobMetric)
}
jobMetric, ok := jobData[metric][scope]
if !ok {
jobMetric = &schema.JobMetric{
Unit: mc.Unit,
Scope: scope,
Timestep: mc.Timestep,
Series: make([]schema.Series, 0),
}
jobData[metric][scope] = jobMetric
}
for _, res := range row {
if res.Error != nil {
errors = append(errors, fmt.Sprintf("failed to fetch '%s' from host '%s': %s", query.Metric, query.Hostname, *res.Error))
continue
}
id := (*int)(nil)
if query.Type != nil {
id = new(int)
*id, err = strconv.Atoi(query.TypeIds[0])
if err != nil || *query.Type == acceleratorString {
*id, _ = topology.GetAcceleratorIndex(query.TypeIds[0])
}
}
if res.Avg.IsNaN() || res.Min.IsNaN() || res.Max.IsNaN() {
// TODO: use schema.Float instead of float64?
// This is done because a regular float64 cannot be marshaled to JSON when it is NaN.
res.Avg = schema.Float(0)
res.Min = schema.Float(0)
res.Max = schema.Float(0)
}
jobMetric.Series = append(jobMetric.Series, schema.Series{
Hostname: query.Hostname,
Id: id,
Statistics: &schema.MetricStatistics{
Avg: float64(res.Avg),
Min: float64(res.Min),
Max: float64(res.Max),
},
Data: res.Data,
})
}
// So that one can later check len(jobData):
if len(jobMetric.Series) == 0 {
delete(jobData[metric], scope)
if len(jobData[metric]) == 0 {
delete(jobData, metric)
}
}
}
if len(errors) != 0 {
return jobData, fmt.Errorf("cc-metric-store: %s", strings.Join(errors, ", "))
}
return jobData, nil
}
var (
hwthreadString = string("cpu") // TODO/FIXME: inconsistency between cc-metric-collector and ClusterCockpit
coreString = string(schema.MetricScopeCore)
memoryDomainString = string(schema.MetricScopeMemoryDomain)
socketString = string(schema.MetricScopeSocket)
acceleratorString = string(schema.MetricScopeAccelerator)
)
func (ccms *CCMetricStore) buildQueries(job *schema.Job, metrics []string, scopes []schema.MetricScope) ([]ApiQuery, []schema.MetricScope, error) {
queries := make([]ApiQuery, 0, len(metrics)*len(scopes)*len(job.Resources))
topology := config.GetSubCluster(job.Cluster, job.SubCluster).Topology
assignedScope := []schema.MetricScope{}
for _, metric := range metrics {
remoteName := ccms.toRemoteName(metric)
mc := config.GetMetricConfig(job.Cluster, metric)
if mc == nil {
// return nil, fmt.Errorf("metric '%s' is not specified for cluster '%s'", metric, job.Cluster)
// log.Printf("metric '%s' is not specified for cluster '%s'", metric, job.Cluster)
continue
}
// Avoid duplicates...
handledScopes := make([]schema.MetricScope, 0, 3)
scopesLoop:
for _, requestedScope := range scopes {
nativeScope := mc.Scope
scope := nativeScope.Max(requestedScope)
for _, s := range handledScopes {
if scope == s {
continue scopesLoop
}
}
handledScopes = append(handledScopes, scope)
for _, host := range job.Resources {
hwthreads := host.HWThreads
if hwthreads == nil {
hwthreads = topology.Node
}
// Accelerator -> Accelerator (Use "accelerator" scope if requested scope is lower than node)
if nativeScope == schema.MetricScopeAccelerator && scope.LT(schema.MetricScopeNode) {
queries = append(queries, ApiQuery{
Metric: remoteName,
Hostname: host.Hostname,
Aggregate: false,
Type: &acceleratorString,
TypeIds: host.Accelerators,
})
assignedScope = append(assignedScope, schema.MetricScopeAccelerator)
continue
}
// Accelerator -> Node
if nativeScope == schema.MetricScopeAccelerator && scope == schema.MetricScopeNode {
if len(host.Accelerators) == 0 {
continue
}
queries = append(queries, ApiQuery{
Metric: remoteName,
Hostname: host.Hostname,
Aggregate: true,
Type: &acceleratorString,
TypeIds: host.Accelerators,
})
assignedScope = append(assignedScope, scope)
continue
}
// HWThread -> HWThread
if nativeScope == schema.MetricScopeHWThread && scope == schema.MetricScopeHWThread {
queries = append(queries, ApiQuery{
Metric: remoteName,
Hostname: host.Hostname,
Aggregate: false,
Type: &hwthreadString,
TypeIds: intToStringSlice(hwthreads),
})
assignedScope = append(assignedScope, scope)
continue
}
// HWThread -> Core
if nativeScope == schema.MetricScopeHWThread && scope == schema.MetricScopeCore {
cores, _ := topology.GetCoresFromHWThreads(hwthreads)
for _, core := range cores {
queries = append(queries, ApiQuery{
Metric: remoteName,
Hostname: host.Hostname,
Aggregate: true,
Type: &hwthreadString,
TypeIds: intToStringSlice(topology.Core[core]),
})
assignedScope = append(assignedScope, scope)
}
continue
}
// HWThread -> Socket
if nativeScope == schema.MetricScopeHWThread && scope == schema.MetricScopeSocket {
sockets, _ := topology.GetSocketsFromHWThreads(hwthreads)
for _, socket := range sockets {
queries = append(queries, ApiQuery{
Metric: remoteName,
Hostname: host.Hostname,
Aggregate: true,
Type: &hwthreadString,
TypeIds: intToStringSlice(topology.Socket[socket]),
})
assignedScope = append(assignedScope, scope)
}
continue
}
// HWThread -> Node
if nativeScope == schema.MetricScopeHWThread && scope == schema.MetricScopeNode {
queries = append(queries, ApiQuery{
Metric: remoteName,
Hostname: host.Hostname,
Aggregate: true,
Type: &hwthreadString,
TypeIds: intToStringSlice(hwthreads),
})
assignedScope = append(assignedScope, scope)
continue
}
// Core -> Core
if nativeScope == schema.MetricScopeCore && scope == schema.MetricScopeCore {
cores, _ := topology.GetCoresFromHWThreads(hwthreads)
queries = append(queries, ApiQuery{
Metric: remoteName,
Hostname: host.Hostname,
Aggregate: false,
Type: &coreString,
TypeIds: intToStringSlice(cores),
})
assignedScope = append(assignedScope, scope)
continue
}
// Core -> Node
if nativeScope == schema.MetricScopeCore && scope == schema.MetricScopeNode {
cores, _ := topology.GetCoresFromHWThreads(hwthreads)
queries = append(queries, ApiQuery{
Metric: remoteName,
Hostname: host.Hostname,
Aggregate: true,
Type: &coreString,
TypeIds: intToStringSlice(cores),
})
assignedScope = append(assignedScope, scope)
continue
}
// MemoryDomain -> MemoryDomain
if nativeScope == schema.MetricScopeMemoryDomain && scope == schema.MetricScopeMemoryDomain {
sockets, _ := topology.GetMemoryDomainsFromHWThreads(hwthreads)
queries = append(queries, ApiQuery{
Metric: remoteName,
Hostname: host.Hostname,
Aggregate: false,
Type: &memoryDomainString,
TypeIds: intToStringSlice(sockets),
})
assignedScope = append(assignedScope, scope)
continue
}
// MemoryDomain -> Node
if nativeScope == schema.MetricScopeMemoryDomain && scope == schema.MetricScopeNode {
sockets, _ := topology.GetMemoryDomainsFromHWThreads(hwthreads)
queries = append(queries, ApiQuery{
Metric: remoteName,
Hostname: host.Hostname,
Aggregate: true,
Type: &memoryDomainString,
TypeIds: intToStringSlice(sockets),
})
assignedScope = append(assignedScope, scope)
continue
}
// Socket -> Socket
if nativeScope == schema.MetricScopeSocket && scope == schema.MetricScopeSocket {
sockets, _ := topology.GetSocketsFromHWThreads(hwthreads)
queries = append(queries, ApiQuery{
Metric: remoteName,
Hostname: host.Hostname,
Aggregate: false,
Type: &socketString,
TypeIds: intToStringSlice(sockets),
})
assignedScope = append(assignedScope, scope)
continue
}
// Socket -> Node
if nativeScope == schema.MetricScopeSocket && scope == schema.MetricScopeNode {
sockets, _ := topology.GetSocketsFromHWThreads(hwthreads)
queries = append(queries, ApiQuery{
Metric: remoteName,
Hostname: host.Hostname,
Aggregate: true,
Type: &socketString,
TypeIds: intToStringSlice(sockets),
})
assignedScope = append(assignedScope, scope)
continue
}
// Node -> Node
if nativeScope == schema.MetricScopeNode && scope == schema.MetricScopeNode {
queries = append(queries, ApiQuery{
Metric: remoteName,
Hostname: host.Hostname,
})
assignedScope = append(assignedScope, scope)
continue
}
return nil, nil, fmt.Errorf("TODO: unhandled case: native-scope=%s, requested-scope=%s", nativeScope, requestedScope)
}
}
}
return queries, assignedScope, nil
}
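// Editor's summary of the native -> requested scope mappings handled in
// buildQueries above:
//   accelerator  -> accelerator, node (aggregated)
//   hwthread     -> hwthread, core, socket, node (aggregated)
//   core         -> core, node (aggregated)
//   memoryDomain -> memoryDomain, node (aggregated)
//   socket       -> socket, node (aggregated)
//   node         -> node
// Any other combination falls through to the error at the end of the loop.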
func (ccms *CCMetricStore) LoadStats(job *schema.Job, metrics []string, ctx context.Context) (map[string]map[string]schema.MetricStatistics, error) {
queries, _, err := ccms.buildQueries(job, metrics, []schema.MetricScope{schema.MetricScopeNode})
if err != nil {
return nil, err
}
req := ApiQueryRequest{
Cluster: job.Cluster,
From: job.StartTime.Unix(),
To: job.StartTime.Add(time.Duration(job.Duration) * time.Second).Unix(),
Queries: queries,
WithStats: true,
WithData: false,
}
resBody, err := ccms.doRequest(ctx, &req)
if err != nil {
return nil, err
}
stats := make(map[string]map[string]schema.MetricStatistics, len(metrics))
for i, res := range resBody.Results {
query := req.Queries[i]
metric := ccms.toLocalName(query.Metric)
data := res[0]
if data.Error != nil {
return nil, fmt.Errorf("fetching %s for node %s failed: %s", metric, query.Hostname, *data.Error)
}
metricdata, ok := stats[metric]
if !ok {
metricdata = make(map[string]schema.MetricStatistics, job.NumNodes)
stats[metric] = metricdata
}
if data.Avg.IsNaN() || data.Min.IsNaN() || data.Max.IsNaN() {
return nil, fmt.Errorf("fetching %s for node %s failed: %s", metric, query.Hostname, "avg/min/max is NaN")
}
metricdata[query.Hostname] = schema.MetricStatistics{
Avg: float64(data.Avg),
Min: float64(data.Min),
Max: float64(data.Max),
}
}
return stats, nil
}
// TODO: Support sub-node-scope metrics! For this, the partition of a node needs to be known!
func (ccms *CCMetricStore) LoadNodeData(cluster string, metrics, nodes []string, scopes []schema.MetricScope, from, to time.Time, ctx context.Context) (map[string]map[string][]*schema.JobMetric, error) {
req := ApiQueryRequest{
Cluster: cluster,
From: from.Unix(),
To: to.Unix(),
WithStats: true,
WithData: true,
}
if nodes == nil {
for _, metric := range metrics {
req.ForAllNodes = append(req.ForAllNodes, ccms.toRemoteName(metric))
}
} else {
for _, node := range nodes {
for _, metric := range metrics {
req.Queries = append(req.Queries, ApiQuery{
Hostname: node,
Metric: ccms.toRemoteName(metric),
})
}
}
}
resBody, err := ccms.doRequest(ctx, &req)
if err != nil {
return nil, err
}
var errors []string
data := make(map[string]map[string][]*schema.JobMetric)
for i, res := range resBody.Results {
var query ApiQuery
if resBody.Queries != nil {
query = resBody.Queries[i]
} else {
query = req.Queries[i]
}
metric := ccms.toLocalName(query.Metric)
qdata := res[0]
if qdata.Error != nil {
errors = append(errors, fmt.Sprintf("fetching %s for node %s failed: %s", metric, query.Hostname, *qdata.Error))
}
if qdata.Avg.IsNaN() || qdata.Min.IsNaN() || qdata.Max.IsNaN() {
// return nil, fmt.Errorf("fetching %s for node %s failed: %s", metric, query.Hostname, "avg/min/max is NaN")
qdata.Avg, qdata.Min, qdata.Max = 0., 0., 0.
}
hostdata, ok := data[query.Hostname]
if !ok {
hostdata = make(map[string][]*schema.JobMetric)
data[query.Hostname] = hostdata
}
mc := config.GetMetricConfig(cluster, metric)
hostdata[metric] = append(hostdata[metric], &schema.JobMetric{
Unit: mc.Unit,
Scope: schema.MetricScopeNode,
Timestep: mc.Timestep,
Series: []schema.Series{
{
Hostname: query.Hostname,
Data: qdata.Data,
Statistics: &schema.MetricStatistics{
Avg: float64(qdata.Avg),
Min: float64(qdata.Min),
Max: float64(qdata.Max),
},
},
},
})
}
if len(errors) != 0 {
return data, fmt.Errorf("cc-metric-store: %s", strings.Join(errors, ", "))
}
return data, nil
}
func intToStringSlice(is []int) []string {
ss := make([]string, len(is))
for i, x := range is {
ss[i] = strconv.Itoa(x)
}
return ss
}

View File

@@ -0,0 +1,308 @@
package metricdata
import (
"context"
"crypto/tls"
"encoding/json"
"errors"
"fmt"
"log"
"strings"
"time"
"github.com/ClusterCockpit/cc-backend/internal/config"
"github.com/ClusterCockpit/cc-backend/pkg/schema"
influxdb2 "github.com/influxdata/influxdb-client-go/v2"
influxdb2Api "github.com/influxdata/influxdb-client-go/v2/api"
)
type InfluxDBv2DataRepositoryConfig struct {
Url string `json:"url"`
Token string `json:"token"`
Bucket string `json:"bucket"`
Org string `json:"org"`
SkipTls bool `json:"skiptls"`
}
type InfluxDBv2DataRepository struct {
client influxdb2.Client
queryClient influxdb2Api.QueryAPI
bucket, measurement string
}
func (idb *InfluxDBv2DataRepository) Init(rawConfig json.RawMessage) error {
var config InfluxDBv2DataRepositoryConfig
if err := json.Unmarshal(rawConfig, &config); err != nil {
return err
}
idb.client = influxdb2.NewClientWithOptions(config.Url, config.Token, influxdb2.DefaultOptions().SetTLSConfig(&tls.Config{InsecureSkipVerify: config.SkipTls}))
idb.queryClient = idb.client.QueryAPI(config.Org)
idb.bucket = config.Bucket
return nil
}
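// Editor's sketch of a matching `metricDataRepository` entry (all values are
// placeholders):
//   {
//     "kind": "influxdb",
//     "url": "https://localhost:8086", "token": "<token>",
//     "bucket": "ClusterCockpit", "org": "<org>", "skiptls": false
//   }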
func (idb *InfluxDBv2DataRepository) formatTime(t time.Time) string {
return t.Format(time.RFC3339) // Like “2006-01-02T15:04:05Z07:00”
}
func (idb *InfluxDBv2DataRepository) epochToTime(epoch int64) time.Time {
return time.Unix(epoch, 0)
}
func (idb *InfluxDBv2DataRepository) LoadData(job *schema.Job, metrics []string, scopes []schema.MetricScope, ctx context.Context) (schema.JobData, error) {
measurementsConds := make([]string, 0, len(metrics))
for _, m := range metrics {
measurementsConds = append(measurementsConds, fmt.Sprintf(`r["_measurement"] == "%s"`, m))
}
measurementsCond := strings.Join(measurementsConds, " or ")
hostsConds := make([]string, 0, len(job.Resources))
for _, h := range job.Resources {
if h.HWThreads != nil || h.Accelerators != nil {
// TODO
return nil, errors.New("the InfluxDB metric data repository does not yet support HWThreads or Accelerators")
}
hostsConds = append(hostsConds, fmt.Sprintf(`r["hostname"] == "%s"`, h.Hostname))
}
hostsCond := strings.Join(hostsConds, " or ")
jobData := make(schema.JobData) // Empty Schema: map[<string>FIELD]map[<MetricScope>SCOPE]<*JobMetric>METRIC
// Requested Scopes
for _, scope := range scopes {
query := ""
switch scope {
case "node":
// Get finest granularity, group by measurement and hostname (== metric/node), calculate mean over 60s windows
// log.Println("Note: Scope 'node' requested. ")
query = fmt.Sprintf(`
from(bucket: "%s")
|> range(start: %s, stop: %s)
|> filter(fn: (r) => (%s) and (%s) )
|> drop(columns: ["_start", "_stop"])
|> group(columns: ["hostname", "_measurement"])
|> aggregateWindow(every: 60s, fn: mean)
|> drop(columns: ["_time"])`,
idb.bucket,
idb.formatTime(job.StartTime), idb.formatTime(idb.epochToTime(job.StartTimeUnix+int64(job.Duration)+int64(1))),
measurementsCond, hostsCond)
case "socket":
log.Println("Note: Scope 'socket' requested, but not yet supported: Will return 'node' scope only. ")
continue
case "core":
log.Println("Note: Scope 'core' requested, but not yet supported: Will return 'node' scope only. ")
continue
// Get Finest Granularity only, Set NULL to 0.0
// query = fmt.Sprintf(`
// from(bucket: "%s")
// |> range(start: %s, stop: %s)
// |> filter(fn: (r) => %s )
// |> filter(fn: (r) => %s )
// |> drop(columns: ["_start", "_stop", "cluster"])
// |> map(fn: (r) => (if exists r._value then {r with _value: r._value} else {r with _value: 0.0}))`,
// idb.bucket,
// idb.formatTime(job.StartTime), idb.formatTime(idb.epochToTime(job.StartTimeUnix + int64(job.Duration) + int64(1) )),
// measurementsCond, hostsCond)
default:
log.Println("Note: Unknown Scope requested: Will return 'node' scope. ")
continue
// return nil, errors.New("the InfluxDB metric data repository does not yet support other scopes than 'node'")
}
rows, err := idb.queryClient.Query(ctx, query)
if err != nil {
return nil, err
}
// Init metrics: only node level for now -> TODO: match/check on scope level ...
for _, metric := range metrics {
jobMetric, ok := jobData[metric]
if !ok {
mc := config.GetMetricConfig(job.Cluster, metric)
jobMetric = map[schema.MetricScope]*schema.JobMetric{
scope: { // uses scope var from above!
Unit: mc.Unit,
Scope: scope,
Timestep: mc.Timestep,
Series: make([]schema.Series, 0, len(job.Resources)),
StatisticsSeries: nil, // Should be: &schema.StatsSeries{},
},
}
}
jobData[metric] = jobMetric
}
// Process Result: Time-Data
field, host, hostSeries := "", "", schema.Series{}
// typeId := 0
switch scope {
case "node":
for rows.Next() {
row := rows.Record()
if host == "" || host != row.ValueByKey("hostname").(string) || rows.TableChanged() {
if host != "" {
// Append Series before reset
jobData[field][scope].Series = append(jobData[field][scope].Series, hostSeries)
}
field, host = row.Measurement(), row.ValueByKey("hostname").(string)
hostSeries = schema.Series{
Hostname: host,
Statistics: nil,
Data: make([]schema.Float, 0),
}
}
val, ok := row.Value().(float64)
if ok {
hostSeries.Data = append(hostSeries.Data, schema.Float(val))
} else {
hostSeries.Data = append(hostSeries.Data, schema.Float(0))
}
}
case "socket":
continue
case "core":
continue
// Include Series.Id in hostSeries
// for rows.Next() {
// row := rows.Record()
// if ( host == "" || host != row.ValueByKey("hostname").(string) || typeId != row.ValueByKey("type-id").(int) || rows.TableChanged() ) {
// if ( host != "" ) {
// // Append Series before reset
// jobData[field][scope].Series = append(jobData[field][scope].Series, hostSeries)
// }
// field, host, typeId = row.Measurement(), row.ValueByKey("hostname").(string), row.ValueByKey("type-id").(int)
// hostSeries = schema.Series{
// Hostname: host,
// Id: &typeId,
// Statistics: nil,
// Data: make([]schema.Float, 0),
// }
// }
// val := row.Value().(float64)
// hostSeries.Data = append(hostSeries.Data, schema.Float(val))
// }
default:
continue
// return nil, errors.New("the InfluxDB metric data repository does not yet support other scopes than 'node, core'")
}
// Append last Series
jobData[field][scope].Series = append(jobData[field][scope].Series, hostSeries)
}
// Get Stats
stats, err := idb.LoadStats(job, metrics, ctx)
if err != nil {
return nil, err
}
for _, scope := range scopes {
if scope == "node" { // No 'socket/core' support yet
for metric, nodes := range stats {
// log.Println(fmt.Sprintf("<< Add Stats for : Field %s >>", metric))
for node, stats := range nodes {
// log.Println(fmt.Sprintf("<< Add Stats for : Host %s : Min %.2f, Max %.2f, Avg %.2f >>", node, stats.Min, stats.Max, stats.Avg ))
for index := range jobData[metric][scope].Series {
// log.Println(fmt.Sprintf("<< Try to add Stats to Series in Position %d >>", index))
if jobData[metric][scope].Series[index].Hostname == node {
// log.Println(fmt.Sprintf("<< Match for Series in Position %d : Host %s >>", index, jobData[metric][scope].Series[index].Hostname))
jobData[metric][scope].Series[index].Statistics = &schema.MetricStatistics{Avg: stats.Avg, Min: stats.Min, Max: stats.Max}
// log.Println(fmt.Sprintf("<< Result Inner: Min %.2f, Max %.2f, Avg %.2f >>", jobData[metric][scope].Series[index].Statistics.Min, jobData[metric][scope].Series[index].Statistics.Max, jobData[metric][scope].Series[index].Statistics.Avg))
}
}
}
}
}
}
// DEBUG:
// for _, scope := range scopes {
// for _, met := range metrics {
// for _, series := range jobData[met][scope].Series {
// log.Println(fmt.Sprintf("<< Result: %d data points for metric %s on %s with scope %s, Stats: Min %.2f, Max %.2f, Avg %.2f >>",
// len(series.Data), met, series.Hostname, scope,
// series.Statistics.Min, series.Statistics.Max, series.Statistics.Avg))
// }
// }
// }
return jobData, nil
}
func (idb *InfluxDBv2DataRepository) LoadStats(job *schema.Job, metrics []string, ctx context.Context) (map[string]map[string]schema.MetricStatistics, error) {
stats := map[string]map[string]schema.MetricStatistics{}
hostsConds := make([]string, 0, len(job.Resources))
for _, h := range job.Resources {
if h.HWThreads != nil || h.Accelerators != nil {
// TODO
return nil, errors.New("the InfluxDB metric data repository does not yet support HWThreads or Accelerators")
}
hostsConds = append(hostsConds, fmt.Sprintf(`r["hostname"] == "%s"`, h.Hostname))
}
hostsCond := strings.Join(hostsConds, " or ")
// lenMet := len(metrics)
for _, metric := range metrics {
// log.Println(fmt.Sprintf("<< You are here: %s (Index %d of %d metrics)", metric, index, lenMet))
query := fmt.Sprintf(`
data = from(bucket: "%s")
|> range(start: %s, stop: %s)
|> filter(fn: (r) => r._measurement == "%s" and r._field == "value" and (%s))
union(tables: [data |> mean(column: "_value") |> set(key: "_field", value: "avg"),
data |> min(column: "_value") |> set(key: "_field", value: "min"),
data |> max(column: "_value") |> set(key: "_field", value: "max")])
|> pivot(rowKey: ["hostname"], columnKey: ["_field"], valueColumn: "_value")
|> group()`,
idb.bucket,
idb.formatTime(job.StartTime), idb.formatTime(idb.epochToTime(job.StartTimeUnix+int64(job.Duration)+int64(1))),
metric, hostsCond)
rows, err := idb.queryClient.Query(ctx, query)
if err != nil {
return nil, err
}
nodes := map[string]schema.MetricStatistics{}
for rows.Next() {
row := rows.Record()
host := row.ValueByKey("hostname").(string)
avg, avgok := row.ValueByKey("avg").(float64)
if !avgok {
// log.Println(fmt.Sprintf(">> Assertion error for metric %s, statistic AVG. Expected 'float64', got %v", metric, avg))
avg = 0.0
}
min, minok := row.ValueByKey("min").(float64)
if !minok {
// log.Println(fmt.Sprintf(">> Assertion error for metric %s, statistic MIN. Expected 'float64', got %v", metric, min))
min = 0.0
}
max, maxok := row.ValueByKey("max").(float64)
if !maxok {
// log.Println(fmt.Sprintf(">> Assertion error for metric %s, statistic MAX. Expected 'float64', got %v", metric, max))
max = 0.0
}
nodes[host] = schema.MetricStatistics{
Avg: avg,
Min: min,
Max: max,
}
}
stats[metric] = nodes
}
return stats, nil
}
func (idb *InfluxDBv2DataRepository) LoadNodeData(cluster string, metrics, nodes []string, scopes []schema.MetricScope, from, to time.Time, ctx context.Context) (map[string]map[string][]*schema.JobMetric, error) {
// TODO: Implement; to be used in the Analysis and System/Node views
log.Printf("LoadNodeData unimplemented for InfluxDBv2DataRepository, args: cluster %s, metrics %v, nodes %v, scopes %v", cluster, metrics, nodes, scopes)
return nil, errors.New("unimplemented for InfluxDBv2DataRepository")
}

View File

@@ -0,0 +1,254 @@
package metricdata
import (
"context"
"encoding/json"
"fmt"
"time"
"github.com/ClusterCockpit/cc-backend/internal/config"
"github.com/ClusterCockpit/cc-backend/pkg/log"
"github.com/ClusterCockpit/cc-backend/pkg/schema"
"github.com/iamlouk/lrucache"
)
type MetricDataRepository interface {
// Initialize this MetricDataRepository. One instance of
// this interface will only ever be responsible for one cluster.
Init(rawConfig json.RawMessage) error
// Return the JobData for the given job, only with the requested metrics.
LoadData(job *schema.Job, metrics []string, scopes []schema.MetricScope, ctx context.Context) (schema.JobData, error)
// Return a map of metrics to a map of nodes to the metric statistics of the job. node scope assumed for now.
LoadStats(job *schema.Job, metrics []string, ctx context.Context) (map[string]map[string]schema.MetricStatistics, error)
// Return a map of hosts to a map of metrics at the requested scopes for that node.
LoadNodeData(cluster string, metrics, nodes []string, scopes []schema.MetricScope, from, to time.Time, ctx context.Context) (map[string]map[string][]*schema.JobMetric, error)
}
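// Editor's note: concrete implementations are selected and initialized in
// Init() below, keyed by the "kind" field of a cluster's metricDataRepository
// config ("cc-metric-store", "influxdb" or "test").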
var metricDataRepos map[string]MetricDataRepository = map[string]MetricDataRepository{}
var JobArchivePath string
var useArchive bool
func Init(jobArchivePath string, disableArchive bool) error {
useArchive = !disableArchive
JobArchivePath = jobArchivePath
for _, cluster := range config.Clusters {
if cluster.MetricDataRepository != nil {
var kind struct {
Kind string `json:"kind"`
}
if err := json.Unmarshal(cluster.MetricDataRepository, &kind); err != nil {
return err
}
var mdr MetricDataRepository
switch kind.Kind {
case "cc-metric-store":
mdr = &CCMetricStore{}
case "influxdb":
mdr = &InfluxDBv2DataRepository{}
case "test":
mdr = &TestMetricDataRepository{}
default:
return fmt.Errorf("unkown metric data repository '%s' for cluster '%s'", kind.Kind, cluster.Name)
}
if err := mdr.Init(cluster.MetricDataRepository); err != nil {
return err
}
metricDataRepos[cluster.Name] = mdr
}
}
return nil
}
var cache *lrucache.Cache = lrucache.New(128 * 1024 * 1024)
// Fetches the metric data for a job.
func LoadData(job *schema.Job, metrics []string, scopes []schema.MetricScope, ctx context.Context) (schema.JobData, error) {
data := cache.Get(cacheKey(job, metrics, scopes), func() (_ interface{}, ttl time.Duration, size int) {
var jd schema.JobData
var err error
if job.State == schema.JobStateRunning ||
job.MonitoringStatus == schema.MonitoringStatusRunningOrArchiving ||
!useArchive {
repo, ok := metricDataRepos[job.Cluster]
if !ok {
return fmt.Errorf("no metric data repository configured for '%s'", job.Cluster), 0, 0
}
if scopes == nil {
scopes = append(scopes, schema.MetricScopeNode)
}
if metrics == nil {
cluster := config.GetCluster(job.Cluster)
for _, mc := range cluster.MetricConfig {
metrics = append(metrics, mc.Name)
}
}
jd, err = repo.LoadData(job, metrics, scopes, ctx)
if err != nil {
if len(jd) != 0 {
log.Errorf("partial error: %s", err.Error())
} else {
return err, 0, 0
}
}
size = jd.Size()
} else {
jd, err = loadFromArchive(job)
if err != nil {
return err, 0, 0
}
// Avoid sending unrequested data to the client:
if metrics != nil || scopes != nil {
if metrics == nil {
metrics = make([]string, 0, len(jd))
for k := range jd {
metrics = append(metrics, k)
}
}
res := schema.JobData{}
for _, metric := range metrics {
if perscope, ok := jd[metric]; ok {
if len(perscope) > 1 {
subset := make(map[schema.MetricScope]*schema.JobMetric)
for _, scope := range scopes {
if jm, ok := perscope[scope]; ok {
subset[scope] = jm
}
}
if len(subset) > 0 {
perscope = subset
}
}
res[metric] = perscope
}
}
jd = res
}
size = 1 // loadFromArchive() caches in the same cache.
}
ttl = 5 * time.Hour
if job.State == schema.JobStateRunning {
ttl = 2 * time.Minute
}
prepareJobData(job, jd, scopes)
return jd, ttl, size
})
if err, ok := data.(error); ok {
return nil, err
}
return data.(schema.JobData), nil
}
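// Editor's note: LoadData relies on the iamlouk/lrucache pattern used above:
// Get() either returns a cached value or runs the closure and caches its
// result with the TTL and size the closure reports; errors are smuggled
// through the value slot and unwrapped by the caller. A minimal sketch (key
// and values are made up for illustration):
//
//	v := cache.Get("some-key", func() (interface{}, time.Duration, int) {
//	    return 42, time.Minute, 1 // value, ttl, size
//	})
//	answer := v.(int) // a second Get with the same key skips the closure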
// Used for the jobsFootprint GraphQL-Query. TODO: Rename/Generalize.
func LoadAverages(job *schema.Job, metrics []string, data [][]schema.Float, ctx context.Context) error {
if job.State != schema.JobStateRunning && useArchive {
return loadAveragesFromArchive(job, metrics, data)
}
repo, ok := metricDataRepos[job.Cluster]
if !ok {
return fmt.Errorf("no metric data repository configured for '%s'", job.Cluster)
}
stats, err := repo.LoadStats(job, metrics, ctx)
if err != nil {
return err
}
for i, m := range metrics {
nodes, ok := stats[m]
if !ok {
data[i] = append(data[i], schema.NaN)
continue
}
sum := 0.0
for _, node := range nodes {
sum += node.Avg
}
data[i] = append(data[i], schema.Float(sum))
}
return nil
}
// Used for the node/system view. Returns a map of nodes to a map of metrics.
func LoadNodeData(cluster string, metrics, nodes []string, scopes []schema.MetricScope, from, to time.Time, ctx context.Context) (map[string]map[string][]*schema.JobMetric, error) {
repo, ok := metricDataRepos[cluster]
if !ok {
return nil, fmt.Errorf("no metric data repository configured for '%s'", cluster)
}
if metrics == nil {
for _, m := range config.GetCluster(cluster).MetricConfig {
metrics = append(metrics, m.Name)
}
}
data, err := repo.LoadNodeData(cluster, metrics, nodes, scopes, from, to, ctx)
if err != nil {
if len(data) != 0 {
log.Errorf("partial error: %s", err.Error())
} else {
return nil, err
}
}
if data == nil {
return nil, fmt.Errorf("the metric data repository for '%s' does not support this query", cluster)
}
return data, nil
}
func cacheKey(job *schema.Job, metrics []string, scopes []schema.MetricScope) string {
// Duration and StartTime do not need to be in the cache key as StartTime is less unique than
// job.ID and the TTL of the cache entry makes sure it does not stay there forever.
return fmt.Sprintf("%d(%s):[%v],[%v]",
job.ID, job.State, metrics, scopes)
}
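// Editor's note: as a concrete example, a running job with database id 1366,
// queried for the metrics flops_any and mem_bw at node scope, produces the
// cache key "1366(running):[[flops_any mem_bw]],[[node]]".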
// For /monitoring/job/<job> and some other places, flops_any and mem_bw need to be available at the scope 'node'.
// If a job has a lot of nodes, statisticsSeries should be available so that a min/mean/max Graph can be used instead of
// a lot of single lines.
func prepareJobData(job *schema.Job, jobData schema.JobData, scopes []schema.MetricScope) {
const maxSeriesSize int = 15
for _, perscope := range jobData {
for _, jm := range perscope {
if jm.StatisticsSeries != nil || len(jm.Series) <= maxSeriesSize {
continue
}
jm.AddStatisticsSeries()
}
}
nodeScopeRequested := false
for _, scope := range scopes {
if scope == schema.MetricScopeNode {
nodeScopeRequested = true
}
}
if nodeScopeRequested {
jobData.AddNodeScope("flops_any")
jobData.AddNodeScope("mem_bw")
}
}

View File

@@ -0,0 +1,32 @@
package metricdata
import (
"context"
"encoding/json"
"time"
"github.com/ClusterCockpit/cc-backend/pkg/schema"
)
var TestLoadDataCallback func(job *schema.Job, metrics []string, scopes []schema.MetricScope, ctx context.Context) (schema.JobData, error) = func(job *schema.Job, metrics []string, scopes []schema.MetricScope, ctx context.Context) (schema.JobData, error) {
panic("TODO")
}
// Only a mock for unit-testing.
type TestMetricDataRepository struct{}
func (tmdr *TestMetricDataRepository) Init(_ json.RawMessage) error {
return nil
}
func (tmdr *TestMetricDataRepository) LoadData(job *schema.Job, metrics []string, scopes []schema.MetricScope, ctx context.Context) (schema.JobData, error) {
return TestLoadDataCallback(job, metrics, scopes, ctx)
}
func (tmdr *TestMetricDataRepository) LoadStats(job *schema.Job, metrics []string, ctx context.Context) (map[string]map[string]schema.MetricStatistics, error) {
panic("TODO")
}
func (tmdr *TestMetricDataRepository) LoadNodeData(cluster string, metrics, nodes []string, scopes []schema.MetricScope, from, to time.Time, ctx context.Context) (map[string]map[string][]*schema.JobMetric, error) {
panic("TODO")
}
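// Editor's note: a unit test wires this mock by swapping the package-level
// callback before exercising code that ends up in LoadData (the body shown is
// an assumption):
//
//	metricdata.TestLoadDataCallback = func(job *schema.Job, metrics []string,
//	    scopes []schema.MetricScope, ctx context.Context) (schema.JobData, error) {
//	    return schema.JobData{}, nil
//	}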

View File

@@ -0,0 +1,58 @@
package repository
import (
"fmt"
"log"
"sync"
"time"
"github.com/jmoiron/sqlx"
)
var (
dbConnOnce sync.Once
dbConnInstance *DBConnection
)
type DBConnection struct {
DB *sqlx.DB
}
func Connect(driver string, db string) {
var err error
var dbHandle *sqlx.DB
dbConnOnce.Do(func() {
if driver == "sqlite3" {
dbHandle, err = sqlx.Open("sqlite3", fmt.Sprintf("%s?_foreign_keys=on", db))
if err != nil {
log.Fatal(err)
}
// sqlite does not multithread. Having more than one connection open would just mean
// waiting for locks.
dbHandle.SetMaxOpenConns(1)
} else if driver == "mysql" {
dbHandle, err = sqlx.Open("mysql", fmt.Sprintf("%s?multiStatements=true", db))
if err != nil {
log.Fatal(err)
}
dbHandle.SetConnMaxLifetime(time.Minute * 3)
dbHandle.SetMaxOpenConns(10)
dbHandle.SetMaxIdleConns(10)
} else {
log.Fatalf("unsupported database driver: %s", driver)
}
dbConnInstance = &DBConnection{DB: dbHandle}
})
}
func GetConnection() *DBConnection {
if dbConnInstance == nil {
log.Fatalf("Database connection not initialized!")
}
return dbConnInstance
}
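// Editor's note: typical startup wiring as a sketch; Connect must run once
// before any repository access, otherwise GetConnection terminates the
// process (the database path is an assumption):
//
//	repository.Connect("sqlite3", "./var/job.db")
//	db := repository.GetConnection()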

View File

@@ -0,0 +1,155 @@
package repository
import (
"bytes"
"database/sql"
"encoding/json"
"fmt"
"os"
"strings"
"time"
"github.com/ClusterCockpit/cc-backend/internal/config"
"github.com/ClusterCockpit/cc-backend/internal/metricdata"
"github.com/ClusterCockpit/cc-backend/pkg/log"
"github.com/ClusterCockpit/cc-backend/pkg/schema"
)
const NamedJobInsert string = `INSERT INTO job (
job_id, user, project, cluster, subcluster, ` + "`partition`" + `, array_job_id, num_nodes, num_hwthreads, num_acc,
exclusive, monitoring_status, smt, job_state, start_time, duration, walltime, resources, meta_data,
mem_used_max, flops_any_avg, mem_bw_avg, load_avg, net_bw_avg, net_data_vol_total, file_bw_avg, file_data_vol_total
) VALUES (
:job_id, :user, :project, :cluster, :subcluster, :partition, :array_job_id, :num_nodes, :num_hwthreads, :num_acc,
:exclusive, :monitoring_status, :smt, :job_state, :start_time, :duration, :walltime, :resources, :meta_data,
:mem_used_max, :flops_any_avg, :mem_bw_avg, :load_avg, :net_bw_avg, :net_data_vol_total, :file_bw_avg, :file_data_vol_total
);`
// Import all jobs specified as `<path-to-meta.json>:<path-to-data.json>,...`
func (r *JobRepository) HandleImportFlag(flag string) error {
for _, pair := range strings.Split(flag, ",") {
files := strings.Split(pair, ":")
if len(files) != 2 {
return fmt.Errorf("invalid import flag format")
}
raw, err := os.ReadFile(files[0])
if err != nil {
return err
}
dec := json.NewDecoder(bytes.NewReader(raw))
dec.DisallowUnknownFields()
jobMeta := schema.JobMeta{BaseJob: schema.JobDefaults}
if err := dec.Decode(&jobMeta); err != nil {
return err
}
raw, err = os.ReadFile(files[1])
if err != nil {
return err
}
dec = json.NewDecoder(bytes.NewReader(raw))
dec.DisallowUnknownFields()
jobData := schema.JobData{}
if err := dec.Decode(&jobData); err != nil {
return err
}
if err := r.ImportJob(&jobMeta, &jobData); err != nil {
return err
}
}
return nil
}
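// Editor's note: a flag value accepted by HandleImportFlag pairs one meta
// file with one data file per job, comma-separated across jobs (paths are
// assumptions):
//
//	./job1-meta.json:./job1-data.json,./job2-meta.json:./job2-data.json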
func (r *JobRepository) ImportJob(jobMeta *schema.JobMeta, jobData *schema.JobData) (err error) {
jobMeta.MonitoringStatus = schema.MonitoringStatusArchivingSuccessful
if err := metricdata.ImportJob(jobMeta, jobData); err != nil {
return err
}
if job, err := r.Find(&jobMeta.JobID, &jobMeta.Cluster, &jobMeta.StartTime); err != sql.ErrNoRows {
if err != nil {
return err
}
return fmt.Errorf("a job with that jobId, cluster and startTime does already exist (dbid: %d)", job.ID)
}
job := schema.Job{
BaseJob: jobMeta.BaseJob,
StartTime: time.Unix(jobMeta.StartTime, 0),
StartTimeUnix: jobMeta.StartTime,
}
// TODO: Other metrics...
job.FlopsAnyAvg = loadJobStat(jobMeta, "flops_any")
job.MemBwAvg = loadJobStat(jobMeta, "mem_bw")
job.NetBwAvg = loadJobStat(jobMeta, "net_bw")
job.FileBwAvg = loadJobStat(jobMeta, "file_bw")
job.RawResources, err = json.Marshal(job.Resources)
if err != nil {
return err
}
job.RawMetaData, err = json.Marshal(job.MetaData)
if err != nil {
return err
}
if err := SanityChecks(&job.BaseJob); err != nil {
return err
}
res, err := r.DB.NamedExec(NamedJobInsert, job)
if err != nil {
return err
}
id, err := res.LastInsertId()
if err != nil {
return err
}
for _, tag := range job.Tags {
if _, err := r.AddTagOrCreate(id, tag.Type, tag.Name); err != nil {
return err
}
}
log.Infof("Successfully imported a new job (jobId: %d, cluster: %s, dbid: %d)", job.JobID, job.Cluster, id)
return nil
}
// This function also sets the subcluster if necessary!
func SanityChecks(job *schema.BaseJob) error {
if c := config.GetCluster(job.Cluster); c == nil {
return fmt.Errorf("no such cluster: %#v", job.Cluster)
}
if err := config.AssignSubCluster(job); err != nil {
return err
}
if !job.State.Valid() {
return fmt.Errorf("not a valid job state: %#v", job.State)
}
if len(job.Resources) == 0 || len(job.User) == 0 {
return fmt.Errorf("'resources' and 'user' should not be empty")
}
if job.NumAcc < 0 || job.NumHWThreads < 0 || job.NumNodes < 1 {
return fmt.Errorf("'numNodes', 'numAcc' or 'numHWThreads' invalid")
}
if len(job.Resources) != int(job.NumNodes) {
return fmt.Errorf("len(resources) does not equal numNodes (%d vs %d)", len(job.Resources), job.NumNodes)
}
return nil
}
func loadJobStat(job *schema.JobMeta, metric string) float64 {
if stats, ok := job.Statistics[metric]; ok {
return stats.Avg
}
return 0.0
}

271
internal/repository/init.go Normal file
View File

@@ -0,0 +1,271 @@
package repository
import (
"bufio"
"encoding/json"
"fmt"
"os"
"path/filepath"
"time"
"github.com/ClusterCockpit/cc-backend/pkg/log"
"github.com/ClusterCockpit/cc-backend/pkg/schema"
"github.com/jmoiron/sqlx"
)
// `AUTO_INCREMENT` is in a comment because of this hack:
// https://stackoverflow.com/a/41028314 (sqlite creates unique ids automatically)
const JobsDBSchema string = `
DROP TABLE IF EXISTS jobtag;
DROP TABLE IF EXISTS job;
DROP TABLE IF EXISTS tag;
CREATE TABLE job (
id INTEGER PRIMARY KEY /*!40101 AUTO_INCREMENT */,
job_id BIGINT NOT NULL,
cluster VARCHAR(255) NOT NULL,
subcluster VARCHAR(255) NOT NULL,
start_time BIGINT NOT NULL, -- Unix timestamp
user VARCHAR(255) NOT NULL,
project VARCHAR(255) NOT NULL,
` + "`partition`" + ` VARCHAR(255) NOT NULL, -- partition is a keyword in mysql -.-
array_job_id BIGINT NOT NULL,
duration INT NOT NULL DEFAULT 0,
walltime INT NOT NULL DEFAULT 0,
job_state VARCHAR(255) NOT NULL CHECK(job_state IN ('running', 'completed', 'failed', 'cancelled', 'stopped', 'timeout', 'preempted', 'out_of_memory')),
meta_data TEXT, -- JSON
resources TEXT NOT NULL, -- JSON
num_nodes INT NOT NULL,
num_hwthreads INT NOT NULL,
num_acc INT NOT NULL,
smt TINYINT NOT NULL DEFAULT 1 CHECK(smt IN (0, 1 )),
exclusive TINYINT NOT NULL DEFAULT 1 CHECK(exclusive IN (0, 1, 2)),
monitoring_status TINYINT NOT NULL DEFAULT 1 CHECK(monitoring_status IN (0, 1, 2, 3)),
mem_used_max REAL NOT NULL DEFAULT 0.0,
flops_any_avg REAL NOT NULL DEFAULT 0.0,
mem_bw_avg REAL NOT NULL DEFAULT 0.0,
load_avg REAL NOT NULL DEFAULT 0.0,
net_bw_avg REAL NOT NULL DEFAULT 0.0,
net_data_vol_total REAL NOT NULL DEFAULT 0.0,
file_bw_avg REAL NOT NULL DEFAULT 0.0,
file_data_vol_total REAL NOT NULL DEFAULT 0.0);
CREATE TABLE tag (
id INTEGER PRIMARY KEY,
tag_type VARCHAR(255) NOT NULL,
tag_name VARCHAR(255) NOT NULL,
CONSTRAINT be_unique UNIQUE (tag_type, tag_name));
CREATE TABLE jobtag (
job_id INTEGER,
tag_id INTEGER,
PRIMARY KEY (job_id, tag_id),
FOREIGN KEY (job_id) REFERENCES job (id) ON DELETE CASCADE,
FOREIGN KEY (tag_id) REFERENCES tag (id) ON DELETE CASCADE);
`
// Indexes are created after the job-archive is traversed for faster inserts.
const JobsDbIndexes string = `
CREATE INDEX job_by_user ON job (user);
CREATE INDEX job_by_starttime ON job (start_time);
CREATE INDEX job_by_job_id ON job (job_id);
CREATE INDEX job_by_state ON job (job_state);
`
// Delete the tables "job", "tag" and "jobtag" from the database and
// repopulate them using the jobs found in `archive`.
func InitDB(db *sqlx.DB, archive string) error {
starttime := time.Now()
log.Print("Building job table...")
// Basic database structure:
_, err := db.Exec(JobsDBSchema)
if err != nil {
return err
}
clustersDir, err := os.ReadDir(archive)
if err != nil {
return err
}
// Inserts are bundled into transactions because in sqlite,
// that speeds up inserts A LOT.
tx, err := db.Beginx()
if err != nil {
return err
}
stmt, err := tx.PrepareNamed(NamedJobInsert)
if err != nil {
return err
}
// Not using log.Print because we want the line to end with `\r` and
// this function is only ever called when a special command line flag
// is passed anyway.
fmt.Printf("%d jobs inserted...\r", 0)
i := 0
tags := make(map[string]int64)
handleDirectory := func(filename string) error {
// Bundle 100 inserts into one transaction for better performance:
if i%100 == 0 {
if tx != nil {
if err := tx.Commit(); err != nil {
return err
}
}
tx, err = db.Beginx()
if err != nil {
return err
}
stmt = tx.NamedStmt(stmt)
fmt.Printf("%d jobs inserted...\r", i)
}
err := loadJob(tx, stmt, tags, filename)
if err == nil {
i += 1
}
return err
}
for _, clusterDir := range clustersDir {
lvl1Dirs, err := os.ReadDir(filepath.Join(archive, clusterDir.Name()))
if err != nil {
return err
}
for _, lvl1Dir := range lvl1Dirs {
if !lvl1Dir.IsDir() {
// Could be the cluster.json file
continue
}
lvl2Dirs, err := os.ReadDir(filepath.Join(archive, clusterDir.Name(), lvl1Dir.Name()))
if err != nil {
return err
}
for _, lvl2Dir := range lvl2Dirs {
dirpath := filepath.Join(archive, clusterDir.Name(), lvl1Dir.Name(), lvl2Dir.Name())
startTimeDirs, err := os.ReadDir(dirpath)
if err != nil {
return err
}
// For compatibility with the old job-archive directory structure where
// there was no start time directory.
for _, startTimeDir := range startTimeDirs {
if startTimeDir.Type().IsRegular() && startTimeDir.Name() == "meta.json" {
if err := handleDirectory(dirpath); err != nil {
log.Errorf("in %s: %s", dirpath, err.Error())
}
} else if startTimeDir.IsDir() {
if err := handleDirectory(filepath.Join(dirpath, startTimeDir.Name())); err != nil {
log.Errorf("in %s: %s", filepath.Join(dirpath, startTimeDir.Name()), err.Error())
}
}
}
}
}
}
if err := tx.Commit(); err != nil {
return err
}
// Create indexes after inserts so that they do not
// need to be continually updated.
if _, err := db.Exec(JobsDbIndexes); err != nil {
return err
}
log.Printf("A total of %d jobs have been registered in %.3f seconds.\n", i, time.Since(starttime).Seconds())
return nil
}
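// Editor's note: the traversal above implies a job archive laid out as
// <archive>/<cluster>/<lvl1>/<lvl2>[/<starttime>]/meta.json, where the
// start-time level is absent in the legacy layout. That the two intermediate
// levels encode the job id is an inference, not stated in this commit.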
// TODO: Remove double logic, use repository/import.go!
// Read the `meta.json` file at `path` and insert it to the database using the prepared
// insert statement `stmt`. `tags` maps all existing tags to their database ID.
func loadJob(tx *sqlx.Tx, stmt *sqlx.NamedStmt, tags map[string]int64, path string) error {
f, err := os.Open(filepath.Join(path, "meta.json"))
if err != nil {
return err
}
defer f.Close()
jobMeta := schema.JobMeta{BaseJob: schema.JobDefaults}
if err := json.NewDecoder(bufio.NewReader(f)).Decode(&jobMeta); err != nil {
return err
}
jobMeta.MonitoringStatus = schema.MonitoringStatusArchivingSuccessful
job := schema.Job{
BaseJob: jobMeta.BaseJob,
StartTime: time.Unix(jobMeta.StartTime, 0),
StartTimeUnix: jobMeta.StartTime,
}
// TODO: Other metrics...
job.FlopsAnyAvg = loadJobStat(&jobMeta, "flops_any")
job.MemBwAvg = loadJobStat(&jobMeta, "mem_bw")
job.NetBwAvg = loadJobStat(&jobMeta, "net_bw")
job.FileBwAvg = loadJobStat(&jobMeta, "file_bw")
job.RawResources, err = json.Marshal(job.Resources)
if err != nil {
return err
}
job.RawMetaData, err = json.Marshal(job.MetaData)
if err != nil {
return err
}
if err := SanityChecks(&job.BaseJob); err != nil {
return err
}
res, err := stmt.Exec(job)
if err != nil {
return err
}
id, err := res.LastInsertId()
if err != nil {
return err
}
for _, tag := range job.Tags {
tagstr := tag.Name + ":" + tag.Type
tagId, ok := tags[tagstr]
if !ok {
res, err := tx.Exec(`INSERT INTO tag (tag_name, tag_type) VALUES (?, ?)`, tag.Name, tag.Type)
if err != nil {
return err
}
tagId, err = res.LastInsertId()
if err != nil {
return err
}
tags[tagstr] = tagId
}
if _, err := tx.Exec(`INSERT INTO jobtag (job_id, tag_id) VALUES (?, ?)`, id, tagId); err != nil {
return err
}
}
return nil
}

411
internal/repository/job.go Normal file
View File

@@ -0,0 +1,411 @@
package repository
import (
"context"
"database/sql"
"encoding/json"
"errors"
"fmt"
"strconv"
"sync"
"time"
"github.com/ClusterCockpit/cc-backend/internal/auth"
"github.com/ClusterCockpit/cc-backend/internal/graph/model"
"github.com/ClusterCockpit/cc-backend/pkg/log"
"github.com/ClusterCockpit/cc-backend/pkg/schema"
sq "github.com/Masterminds/squirrel"
"github.com/iamlouk/lrucache"
"github.com/jmoiron/sqlx"
)
var (
jobRepoOnce sync.Once
jobRepoInstance *JobRepository
)
type JobRepository struct {
DB *sqlx.DB
stmtCache *sq.StmtCache
cache *lrucache.Cache
}
func GetRepository() *JobRepository {
jobRepoOnce.Do(func() {
db := GetConnection()
jobRepoInstance = &JobRepository{
DB: db.DB,
stmtCache: sq.NewStmtCache(db.DB),
cache: lrucache.New(1024 * 1024),
}
})
return jobRepoInstance
}
var jobColumns []string = []string{
"job.id", "job.job_id", "job.user", "job.project", "job.cluster", "job.subcluster", "job.start_time", "job.partition", "job.array_job_id",
"job.num_nodes", "job.num_hwthreads", "job.num_acc", "job.exclusive", "job.monitoring_status", "job.smt", "job.job_state",
"job.duration", "job.walltime", "job.resources", // "job.meta_data",
}
func scanJob(row interface{ Scan(...interface{}) error }) (*schema.Job, error) {
job := &schema.Job{}
if err := row.Scan(
&job.ID, &job.JobID, &job.User, &job.Project, &job.Cluster, &job.SubCluster, &job.StartTimeUnix, &job.Partition, &job.ArrayJobId,
&job.NumNodes, &job.NumHWThreads, &job.NumAcc, &job.Exclusive, &job.MonitoringStatus, &job.SMT, &job.State,
&job.Duration, &job.Walltime, &job.RawResources /*&job.MetaData*/); err != nil {
return nil, err
}
if err := json.Unmarshal(job.RawResources, &job.Resources); err != nil {
return nil, err
}
job.StartTime = time.Unix(job.StartTimeUnix, 0)
if job.Duration == 0 && job.State == schema.JobStateRunning {
job.Duration = int32(time.Since(job.StartTime).Seconds())
}
job.RawResources = nil
return job, nil
}
func (r *JobRepository) FetchMetadata(job *schema.Job) (map[string]string, error) {
cachekey := fmt.Sprintf("metadata:%d", job.ID)
if cached := r.cache.Get(cachekey, nil); cached != nil {
job.MetaData = cached.(map[string]string)
return job.MetaData, nil
}
if err := sq.Select("job.meta_data").From("job").Where("job.id = ?", job.ID).
RunWith(r.stmtCache).QueryRow().Scan(&job.RawMetaData); err != nil {
return nil, err
}
if len(job.RawMetaData) == 0 {
return nil, nil
}
if err := json.Unmarshal(job.RawMetaData, &job.MetaData); err != nil {
return nil, err
}
r.cache.Put(cachekey, job.MetaData, len(job.RawMetaData), 24*time.Hour)
return job.MetaData, nil
}
func (r *JobRepository) UpdateMetadata(job *schema.Job, key, val string) (err error) {
cachekey := fmt.Sprintf("metadata:%d", job.ID)
r.cache.Del(cachekey)
if job.MetaData == nil {
if _, err = r.FetchMetadata(job); err != nil {
return err
}
}
if job.MetaData != nil {
cpy := make(map[string]string, len(job.MetaData)+1)
for k, v := range job.MetaData {
cpy[k] = v
}
cpy[key] = val
job.MetaData = cpy
} else {
job.MetaData = map[string]string{key: val}
}
if job.RawMetaData, err = json.Marshal(job.MetaData); err != nil {
return err
}
if _, err = sq.Update("job").Set("meta_data", job.RawMetaData).Where("job.id = ?", job.ID).RunWith(r.stmtCache).Exec(); err != nil {
return err
}
r.cache.Put(cachekey, job.MetaData, len(job.RawMetaData), 24*time.Hour)
return nil
}
// Find executes a SQL query to find a specific batch job.
// The job is queried using the batch job id, the cluster name,
// and the start time of the job in UNIX epoch time seconds.
// It returns a pointer to a schema.Job data structure and an error variable.
// To check if no job was found test err == sql.ErrNoRows
func (r *JobRepository) Find(
jobId *int64,
cluster *string,
startTime *int64) (*schema.Job, error) {
q := sq.Select(jobColumns...).From("job").
Where("job.job_id = ?", *jobId)
if cluster != nil {
q = q.Where("job.cluster = ?", *cluster)
}
if startTime != nil {
q = q.Where("job.start_time = ?", *startTime)
}
return scanJob(q.RunWith(r.stmtCache).QueryRow())
}
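// Editor's note: per the doc comment above, callers distinguish "no such job"
// via sql.ErrNoRows; a sketch:
//
//	job, err := repo.Find(&jobId, &cluster, &startTime)
//	if err == sql.ErrNoRows {
//	    // no matching job
//	} else if err != nil {
//	    // real database error
//	}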
// FindById executes a SQL query to find a specific batch job.
// The job is queried using the database id.
// It returns a pointer to a schema.Job data structure and an error variable.
// To check if no job was found test err == sql.ErrNoRows
func (r *JobRepository) FindById(
jobId int64) (*schema.Job, error) {
q := sq.Select(jobColumns...).
From("job").Where("job.id = ?", jobId)
return scanJob(q.RunWith(r.stmtCache).QueryRow())
}
// Start inserts a new job in the table, returning the unique job ID.
// Statistics are not transferred!
func (r *JobRepository) Start(job *schema.JobMeta) (id int64, err error) {
job.RawResources, err = json.Marshal(job.Resources)
if err != nil {
return -1, fmt.Errorf("encoding resources field failed: %w", err)
}
job.RawMetaData, err = json.Marshal(job.MetaData)
if err != nil {
return -1, fmt.Errorf("encoding metaData field failed: %w", err)
}
res, err := r.DB.NamedExec(`INSERT INTO job (
job_id, user, project, cluster, subcluster, `+"`partition`"+`, array_job_id, num_nodes, num_hwthreads, num_acc,
exclusive, monitoring_status, smt, job_state, start_time, duration, walltime, resources, meta_data
) VALUES (
:job_id, :user, :project, :cluster, :subcluster, :partition, :array_job_id, :num_nodes, :num_hwthreads, :num_acc,
:exclusive, :monitoring_status, :smt, :job_state, :start_time, :duration, :walltime, :resources, :meta_data
);`, job)
if err != nil {
return -1, err
}
return res.LastInsertId()
}
// Stop updates the job with the database id jobId using the provided arguments.
func (r *JobRepository) Stop(
jobId int64,
duration int32,
state schema.JobState,
monitoringStatus int32) (err error) {
stmt := sq.Update("job").
Set("job_state", state).
Set("duration", duration).
Set("monitoring_status", monitoringStatus).
Where("job.id = ?", jobId)
_, err = stmt.RunWith(r.stmtCache).Exec()
return
}
// TODO: Use node hours instead: SELECT job.user, sum(job.num_nodes * (CASE WHEN job.job_state = "running" THEN CAST(strftime('%s', 'now') AS INTEGER) - job.start_time ELSE job.duration END)) as x FROM job GROUP BY user ORDER BY x DESC;
func (r *JobRepository) CountGroupedJobs(ctx context.Context, aggreg model.Aggregate, filters []*model.JobFilter, weight *model.Weights, limit *int) (map[string]int, error) {
if !aggreg.IsValid() {
return nil, errors.New("invalid aggregate")
}
runner := (sq.BaseRunner)(r.stmtCache)
count := "count(*) as count"
if weight != nil {
switch *weight {
case model.WeightsNodeCount:
count = "sum(job.num_nodes) as count"
case model.WeightsNodeHours:
now := time.Now().Unix()
count = fmt.Sprintf(`sum(job.num_nodes * (CASE WHEN job.job_state = "running" THEN %d - job.start_time ELSE job.duration END)) as count`, now)
runner = r.DB
}
}
q := sq.Select("job."+string(aggreg), count).From("job").GroupBy("job." + string(aggreg)).OrderBy("count DESC")
q = SecurityCheck(ctx, q)
for _, f := range filters {
q = BuildWhereClause(f, q)
}
if limit != nil {
q = q.Limit(uint64(*limit))
}
counts := map[string]int{}
rows, err := q.RunWith(runner).Query()
if err != nil {
return nil, err
}
for rows.Next() {
var group string
var count int
if err := rows.Scan(&group, &count); err != nil {
return nil, err
}
counts[group] = count
}
return counts, nil
}
func (r *JobRepository) UpdateMonitoringStatus(job int64, monitoringStatus int32) (err error) {
stmt := sq.Update("job").
Set("monitoring_status", monitoringStatus).
Where("job.id = ?", job)
_, err = stmt.RunWith(r.stmtCache).Exec()
return
}
// Archive sets the monitoring status of the job with the database id jobId and stores the summary statistics for the given metrics.
func (r *JobRepository) Archive(
jobId int64,
monitoringStatus int32,
metricStats map[string]schema.JobStatistics) error {
stmt := sq.Update("job").
Set("monitoring_status", monitoringStatus).
Where("job.id = ?", jobId)
for metric, stats := range metricStats {
switch metric {
case "flops_any":
stmt = stmt.Set("flops_any_avg", stats.Avg)
case "mem_used":
stmt = stmt.Set("mem_used_max", stats.Max)
case "mem_bw":
stmt = stmt.Set("mem_bw_avg", stats.Avg)
case "load":
stmt = stmt.Set("load_avg", stats.Avg)
case "net_bw":
stmt = stmt.Set("net_bw_avg", stats.Avg)
case "file_bw":
stmt = stmt.Set("file_bw_avg", stats.Avg)
}
}
if _, err := stmt.RunWith(r.stmtCache).Exec(); err != nil {
return err
}
return nil
}
var ErrNotFound = errors.New("no such job or user")
// FindJobOrUser returns a job database ID or a username if a job or user matches the search term.
// As 0 is a valid job id, check whether username is "" instead in order to determine what matched.
// If nothing matches the search, `ErrNotFound` is returned.
func (r *JobRepository) FindJobOrUser(ctx context.Context, searchterm string) (job int64, username string, err error) {
user := auth.GetUser(ctx)
if id, err := strconv.Atoi(searchterm); err == nil {
qb := sq.Select("job.id").From("job").Where("job.job_id = ?", id)
if user != nil && !user.HasRole(auth.RoleAdmin) {
qb = qb.Where("job.user = ?", user.Username)
}
err := qb.RunWith(r.stmtCache).QueryRow().Scan(&job)
if err != nil && err != sql.ErrNoRows {
return 0, "", err
} else if err == nil {
return job, "", nil
}
}
if user == nil || user.HasRole(auth.RoleAdmin) {
err := sq.Select("job.user").Distinct().From("job").
Where("job.user = ?", searchterm).
RunWith(r.stmtCache).QueryRow().Scan(&username)
if err != nil && err != sql.ErrNoRows {
return 0, "", err
} else if err == nil {
return 0, username, nil
}
}
return 0, "", ErrNotFound
}
func (r *JobRepository) Partitions(cluster string) ([]string, error) {
var err error
partitions := r.cache.Get("partitions:"+cluster, func() (interface{}, time.Duration, int) {
parts := []string{}
if err = r.DB.Select(&parts, `SELECT DISTINCT job.partition FROM job WHERE job.cluster = ?;`, cluster); err != nil {
return nil, 0, 1000
}
return parts, 1 * time.Hour, 1
})
if err != nil {
return nil, err
}
return partitions.([]string), nil
}
// AllocatedNodes returns a map of all subclusters to a map of hostnames to the number of jobs running on that host.
// Hosts with zero jobs running on them will not show up!
func (r *JobRepository) AllocatedNodes(cluster string) (map[string]map[string]int, error) {
subclusters := make(map[string]map[string]int)
rows, err := sq.Select("resources", "subcluster").From("job").
Where("job.job_state = 'running'").
Where("job.cluster = ?", cluster).
RunWith(r.stmtCache).Query()
if err != nil {
return nil, err
}
var raw []byte
defer rows.Close()
for rows.Next() {
raw = raw[0:0]
var resources []*schema.Resource
var subcluster string
if err := rows.Scan(&raw, &subcluster); err != nil {
return nil, err
}
if err := json.Unmarshal(raw, &resources); err != nil {
return nil, err
}
hosts, ok := subclusters[subcluster]
if !ok {
hosts = make(map[string]int)
subclusters[subcluster] = hosts
}
for _, resource := range resources {
hosts[resource.Hostname] += 1
}
}
return subclusters, nil
}
func (r *JobRepository) StopJobsExceedingWalltimeBy(seconds int) error {
res, err := sq.Update("job").
Set("monitoring_status", schema.MonitoringStatusArchivingFailed).
Set("duration", 0).
Set("job_state", schema.JobStateFailed).
Where("job.job_state = 'running'").
Where("job.walltime > 0").
Where(fmt.Sprintf("(%d - job.start_time) > (job.walltime + %d)", time.Now().Unix(), seconds)).
RunWith(r.DB).Exec()
if err != nil {
return err
}
rowsAffected, err := res.RowsAffected()
if err != nil {
return err
}
if rowsAffected > 0 {
log.Warnf("%d jobs have been marked as failed due to running too long", rowsAffected)
}
return nil
}

View File

@@ -0,0 +1,66 @@
package repository
import (
"fmt"
"testing"
"github.com/jmoiron/sqlx"
_ "github.com/mattn/go-sqlite3"
)
var db *sqlx.DB
func init() {
Connect("sqlite3", "../../test/test.db")
}
func setup(t *testing.T) *JobRepository {
return GetRepository()
}
func TestFind(t *testing.T) {
r := setup(t)
jobId, cluster, startTime := int64(1404396), "emmy", int64(1609299584)
job, err := r.Find(&jobId, &cluster, &startTime)
if err != nil {
t.Fatal(err)
}
// fmt.Printf("%+v", job)
if job.ID != 1366 {
t.Errorf("wrong summary for diagnostic 3\ngot: %d \nwant: 1366", job.JobID)
}
}
func TestFindById(t *testing.T) {
r := setup(t)
job, err := r.FindById(1366)
if err != nil {
t.Fatal(err)
}
// fmt.Printf("%+v", job)
if job.JobID != 1404396 {
t.Errorf("wrong summary for diagnostic 3\ngot: %d \nwant: 1404396", job.JobID)
}
}
func TestGetTags(t *testing.T) {
r := setup(t)
tags, counts, err := r.CountTags(nil)
if err != nil {
t.Fatal(err)
}
fmt.Printf("TAGS %+v \n", tags)
// fmt.Printf("COUNTS %+v \n", counts)
if counts["bandwidth"] != 6 {
t.Errorf("wrong summary for diagnostic 3\ngot: %d \nwant: 6", counts["load-imbalance"])
}
}

View File

@@ -0,0 +1,217 @@
package repository
import (
"context"
"errors"
"fmt"
"regexp"
"strings"
"time"
"github.com/ClusterCockpit/cc-backend/internal/auth"
"github.com/ClusterCockpit/cc-backend/internal/graph/model"
"github.com/ClusterCockpit/cc-backend/pkg/log"
"github.com/ClusterCockpit/cc-backend/pkg/schema"
sq "github.com/Masterminds/squirrel"
)
// QueryJobs returns a list of jobs matching the provided filters. page and order are optional.
func (r *JobRepository) QueryJobs(
ctx context.Context,
filters []*model.JobFilter,
page *model.PageRequest,
order *model.OrderByInput) ([]*schema.Job, error) {
query := sq.Select(jobColumns...).From("job")
query = SecurityCheck(ctx, query)
if order != nil {
field := toSnakeCase(order.Field)
if order.Order == model.SortDirectionEnumAsc {
query = query.OrderBy(fmt.Sprintf("job.%s ASC", field))
} else if order.Order == model.SortDirectionEnumDesc {
query = query.OrderBy(fmt.Sprintf("job.%s DESC", field))
} else {
return nil, errors.New("invalid sorting order")
}
}
if page != nil && page.ItemsPerPage != -1 {
limit := uint64(page.ItemsPerPage)
query = query.Offset((uint64(page.Page) - 1) * limit).Limit(limit)
}
for _, f := range filters {
query = BuildWhereClause(f, query)
}
sql, args, err := query.ToSql()
if err != nil {
return nil, err
}
log.Debugf("SQL query: `%s`, args: %#v", sql, args)
rows, err := query.RunWith(r.stmtCache).Query()
if err != nil {
return nil, err
}
jobs := make([]*schema.Job, 0, 50)
for rows.Next() {
job, err := scanJob(rows)
if err != nil {
rows.Close()
return nil, err
}
jobs = append(jobs, job)
}
return jobs, nil
}
// CountJobs counts the number of jobs matching the filters.
func (r *JobRepository) CountJobs(
ctx context.Context,
filters []*model.JobFilter) (int, error) {
// count all jobs:
query := sq.Select("count(*)").From("job")
query = SecurityCheck(ctx, query)
for _, f := range filters {
query = BuildWhereClause(f, query)
}
var count int
if err := query.RunWith(r.DB).Scan(&count); err != nil {
return 0, err
}
return count, nil
}
func SecurityCheck(ctx context.Context, query sq.SelectBuilder) sq.SelectBuilder {
user := auth.GetUser(ctx)
if user == nil || user.HasRole(auth.RoleAdmin) || user.HasRole(auth.RoleApi) {
return query
}
return query.Where("job.user = ?", user.Username)
}
// Build a sq.SelectBuilder out of a schema.JobFilter.
func BuildWhereClause(filter *model.JobFilter, query sq.SelectBuilder) sq.SelectBuilder {
if filter.Tags != nil {
query = query.Join("jobtag ON jobtag.job_id = job.id").Where(sq.Eq{"jobtag.tag_id": filter.Tags})
}
if filter.JobID != nil {
query = buildStringCondition("job.job_id", filter.JobID, query)
}
if filter.ArrayJobID != nil {
query = query.Where("job.array_job_id = ?", *filter.ArrayJobID)
}
if filter.User != nil {
query = buildStringCondition("job.user", filter.User, query)
}
if filter.Project != nil {
query = buildStringCondition("job.project", filter.Project, query)
}
if filter.Cluster != nil {
query = buildStringCondition("job.cluster", filter.Cluster, query)
}
if filter.Partition != nil {
query = buildStringCondition("job.partition", filter.Partition, query)
}
if filter.StartTime != nil {
query = buildTimeCondition("job.start_time", filter.StartTime, query)
}
if filter.Duration != nil {
now := time.Now().Unix() // There does not seem to be a portable way to get the current unix timestamp across different DBs.
query = query.Where("(CASE WHEN job.job_state = 'running' THEN (? - job.start_time) ELSE job.duration END) BETWEEN ? AND ?", now, filter.Duration.From, filter.Duration.To)
}
if filter.MinRunningFor != nil {
now := time.Now().Unix() // There does not seem to be a portable way to get the current unix timestamp across different DBs.
query = query.Where("(job.job_state != 'running' OR (? - job.start_time) > ?)", now, *filter.MinRunningFor)
}
if filter.State != nil {
states := make([]string, len(filter.State))
for i, val := range filter.State {
states[i] = string(val)
}
query = query.Where(sq.Eq{"job.job_state": states})
}
if filter.NumNodes != nil {
query = buildIntCondition("job.num_nodes", filter.NumNodes, query)
}
if filter.NumAccelerators != nil {
query = buildIntCondition("job.num_acc", filter.NumAccelerators, query)
}
if filter.NumHWThreads != nil {
query = buildIntCondition("job.num_hwthreads", filter.NumHWThreads, query)
}
if filter.FlopsAnyAvg != nil {
query = buildFloatCondition("job.flops_any_avg", filter.FlopsAnyAvg, query)
}
if filter.MemBwAvg != nil {
query = buildFloatCondition("job.mem_bw_avg", filter.MemBwAvg, query)
}
if filter.LoadAvg != nil {
query = buildFloatCondition("job.load_avg", filter.LoadAvg, query)
}
if filter.MemUsedMax != nil {
query = buildFloatCondition("job.mem_used_max", filter.MemUsedMax, query)
}
return query
}
func buildIntCondition(field string, cond *model.IntRange, query sq.SelectBuilder) sq.SelectBuilder {
return query.Where(field+" BETWEEN ? AND ?", cond.From, cond.To)
}
func buildTimeCondition(field string, cond *model.TimeRange, query sq.SelectBuilder) sq.SelectBuilder {
if cond.From != nil && cond.To != nil {
return query.Where(field+" BETWEEN ? AND ?", cond.From.Unix(), cond.To.Unix())
} else if cond.From != nil {
return query.Where("? <= "+field, cond.From.Unix())
} else if cond.To != nil {
return query.Where(field+" <= ?", cond.To.Unix())
} else {
return query
}
}
func buildFloatCondition(field string, cond *model.FloatRange, query sq.SelectBuilder) sq.SelectBuilder {
return query.Where(field+" BETWEEN ? AND ?", cond.From, cond.To)
}
func buildStringCondition(field string, cond *model.StringInput, query sq.SelectBuilder) sq.SelectBuilder {
if cond.Eq != nil {
return query.Where(field+" = ?", *cond.Eq)
}
if cond.StartsWith != nil {
return query.Where(field+" LIKE ?", fmt.Sprint(*cond.StartsWith, "%"))
}
if cond.EndsWith != nil {
return query.Where(field+" LIKE ?", fmt.Sprint("%", *cond.EndsWith))
}
if cond.Contains != nil {
return query.Where(field+" LIKE ?", fmt.Sprint("%", *cond.Contains, "%"))
}
return query
}
var matchFirstCap = regexp.MustCompile("(.)([A-Z][a-z]+)")
var matchAllCap = regexp.MustCompile("([a-z0-9])([A-Z])")
func toSnakeCase(str string) string {
for _, c := range str {
if c == '\'' || c == '\\' {
panic("A hacker (probably not)!!!")
}
}
str = strings.ReplaceAll(str, "'", "")
str = strings.ReplaceAll(str, "\\", "")
snake := matchFirstCap.ReplaceAllString(str, "${1}_${2}")
snake = matchAllCap.ReplaceAllString(snake, "${1}_${2}")
return strings.ToLower(snake)
}
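// Editor's note: example conversions: toSnakeCase("numNodes") yields
// "num_nodes" and toSnakeCase("arrayJobId") yields "array_job_id", matching
// the column names listed in jobColumns.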

150
internal/repository/tags.go Normal file
View File

@@ -0,0 +1,150 @@
package repository
import (
"github.com/ClusterCockpit/cc-backend/internal/metricdata"
"github.com/ClusterCockpit/cc-backend/pkg/schema"
sq "github.com/Masterminds/squirrel"
)
// Add the tag with id `tagId` to the job with the database id `jobId`.
func (r *JobRepository) AddTag(job int64, tag int64) ([]*schema.Tag, error) {
if _, err := r.stmtCache.Exec(`INSERT INTO jobtag (job_id, tag_id) VALUES ($1, $2)`, job, tag); err != nil {
return nil, err
}
j, err := r.FindById(job)
if err != nil {
return nil, err
}
tags, err := r.GetTags(&job)
if err != nil {
return nil, err
}
return tags, metricdata.UpdateTags(j, tags)
}
// Removes a tag from a job
func (r *JobRepository) RemoveTag(job, tag int64) ([]*schema.Tag, error) {
if _, err := r.stmtCache.Exec("DELETE FROM jobtag WHERE jobtag.job_id = $1 AND jobtag.tag_id = $2", job, tag); err != nil {
return nil, err
}
j, err := r.FindById(job)
if err != nil {
return nil, err
}
tags, err := r.GetTags(&job)
if err != nil {
return nil, err
}
return tags, metricdata.UpdateTags(j, tags)
}
// CreateTag creates a new tag with the specified type and name and returns its database id.
func (r *JobRepository) CreateTag(tagType string, tagName string) (tagId int64, err error) {
res, err := r.stmtCache.Exec("INSERT INTO tag (tag_type, tag_name) VALUES ($1, $2)", tagType, tagName)
if err != nil {
return 0, err
}
return res.LastInsertId()
}
func (r *JobRepository) CountTags(user *string) (tags []schema.Tag, counts map[string]int, err error) {
tags = make([]schema.Tag, 0, 100)
xrows, err := r.DB.Queryx("SELECT * FROM tag")
if err != nil {
return nil, nil, err
}
for xrows.Next() {
var t schema.Tag
if err := xrows.StructScan(&t); err != nil {
return nil, nil, err
}
tags = append(tags, t)
}
q := sq.Select("t.tag_name, count(jt.tag_id)").
From("tag t").
LeftJoin("jobtag jt ON t.id = jt.tag_id").
GroupBy("t.tag_name")
if user != nil {
q = q.Where("jt.job_id IN (SELECT id FROM job WHERE job.user = ?)", *user)
}
rows, err := q.RunWith(r.stmtCache).Query()
if err != nil {
return nil, nil, err
}
counts = make(map[string]int)
for rows.Next() {
var tagName string
var count int
if err := rows.Scan(&tagName, &count); err != nil {
return nil, nil, err
}
counts[tagName] = count
}
err = rows.Err()
return
}
// AddTagOrCreate adds the tag with the specified type and name to the job with the database id `jobId`.
// If such a tag does not yet exist, it is created.
func (r *JobRepository) AddTagOrCreate(jobId int64, tagType string, tagName string) (tagId int64, err error) {
tagId, exists := r.TagId(tagType, tagName)
if !exists {
tagId, err = r.CreateTag(tagType, tagName)
if err != nil {
return 0, err
}
}
if _, err := r.AddTag(jobId, tagId); err != nil {
return 0, err
}
return tagId, nil
}
// TagId returns the database id of the tag with the specified type and name.
func (r *JobRepository) TagId(tagType string, tagName string) (tagId int64, exists bool) {
exists = true
if err := sq.Select("id").From("tag").
Where("tag.tag_type = ?", tagType).Where("tag.tag_name = ?", tagName).
RunWith(r.stmtCache).QueryRow().Scan(&tagId); err != nil {
exists = false
}
return
}
// GetTags returns a list of all tags if job is nil or of the tags that the job with that database ID has.
func (r *JobRepository) GetTags(job *int64) ([]*schema.Tag, error) {
q := sq.Select("id", "tag_type", "tag_name").From("tag")
if job != nil {
q = q.Join("jobtag ON jobtag.tag_id = tag.id").Where("jobtag.job_id = ?", *job)
}
rows, err := q.RunWith(r.stmtCache).Query()
if err != nil {
return nil, err
}
tags := make([]*schema.Tag, 0)
for rows.Next() {
tag := &schema.Tag{}
if err := rows.Scan(&tag.ID, &tag.Type, &tag.Name); err != nil {
return nil, err
}
tags = append(tags, tag)
}
return tags, nil
}

View File

@@ -0,0 +1,288 @@
package routerConfig
import (
"fmt"
"net/http"
"net/url"
"strconv"
"strings"
"time"
"github.com/ClusterCockpit/cc-backend/internal/auth"
"github.com/ClusterCockpit/cc-backend/internal/config"
"github.com/ClusterCockpit/cc-backend/internal/graph"
"github.com/ClusterCockpit/cc-backend/internal/graph/model"
"github.com/ClusterCockpit/cc-backend/internal/repository"
"github.com/ClusterCockpit/cc-backend/internal/templates"
"github.com/ClusterCockpit/cc-backend/pkg/log"
"github.com/ClusterCockpit/cc-backend/pkg/schema"
"github.com/gorilla/mux"
)
type InfoType map[string]interface{}
type Route struct {
Route string
Template string
Title string
Filter bool
Setup func(i InfoType, r *http.Request) InfoType
}
var routes []Route = []Route{
{"/", "home.tmpl", "ClusterCockpit", false, setupHomeRoute},
{"/config", "config.tmpl", "Settings", false, func(i InfoType, r *http.Request) InfoType { return i }},
{"/monitoring/jobs/", "monitoring/jobs.tmpl", "Jobs - ClusterCockpit", true, func(i InfoType, r *http.Request) InfoType { return i }},
{"/monitoring/job/{id:[0-9]+}", "monitoring/job.tmpl", "Job <ID> - ClusterCockpit", false, setupJobRoute},
{"/monitoring/users/", "monitoring/list.tmpl", "Users - ClusterCockpit", true, func(i InfoType, r *http.Request) InfoType { i["listType"] = "USER"; return i }},
{"/monitoring/projects/", "monitoring/list.tmpl", "Projects - ClusterCockpit", true, func(i InfoType, r *http.Request) InfoType { i["listType"] = "PROJECT"; return i }},
{"/monitoring/tags/", "monitoring/taglist.tmpl", "Tags - ClusterCockpit", false, setupTaglistRoute},
{"/monitoring/user/{id}", "monitoring/user.tmpl", "User <ID> - ClusterCockpit", true, setupUserRoute},
{"/monitoring/systems/{cluster}", "monitoring/systems.tmpl", "Cluster <ID> - ClusterCockpit", false, setupClusterRoute},
{"/monitoring/node/{cluster}/{hostname}", "monitoring/node.tmpl", "Node <ID> - ClusterCockpit", false, setupNodeRoute},
{"/monitoring/analysis/{cluster}", "monitoring/analysis.tmpl", "Analaysis - ClusterCockpit", true, setupAnalysisRoute},
{"/monitoring/status/{cluster}", "monitoring/status.tmpl", "Status of <ID> - ClusterCockpit", false, setupClusterRoute},
}
func setupHomeRoute(i InfoType, r *http.Request) InfoType {
type cluster struct {
Name string
RunningJobs int
TotalJobs int
RecentShortJobs int
}
jobRepo := repository.GetRepository()
runningJobs, err := jobRepo.CountGroupedJobs(r.Context(), model.AggregateCluster, []*model.JobFilter{{
State: []schema.JobState{schema.JobStateRunning},
}}, nil, nil)
if err != nil {
log.Errorf("failed to count jobs: %s", err.Error())
runningJobs = map[string]int{}
}
totalJobs, err := jobRepo.CountGroupedJobs(r.Context(), model.AggregateCluster, nil, nil, nil)
if err != nil {
log.Errorf("failed to count jobs: %s", err.Error())
totalJobs = map[string]int{}
}
from := time.Now().Add(-24 * time.Hour)
recentShortJobs, err := jobRepo.CountGroupedJobs(r.Context(), model.AggregateCluster, []*model.JobFilter{{
StartTime: &model.TimeRange{From: &from, To: nil},
Duration: &model.IntRange{From: 0, To: graph.ShortJobDuration},
}}, nil, nil)
if err != nil {
log.Errorf("failed to count jobs: %s", err.Error())
recentShortJobs = map[string]int{}
}
clusters := make([]cluster, 0)
for _, c := range config.Clusters {
clusters = append(clusters, cluster{
Name: c.Name,
RunningJobs: runningJobs[c.Name],
TotalJobs: totalJobs[c.Name],
RecentShortJobs: recentShortJobs[c.Name],
})
}
i["clusters"] = clusters
return i
}
func setupJobRoute(i InfoType, r *http.Request) InfoType {
i["id"] = mux.Vars(r)["id"]
return i
}
func setupUserRoute(i InfoType, r *http.Request) InfoType {
jobRepo := repository.GetRepository()
username := mux.Vars(r)["id"]
i["id"] = username
i["username"] = username
if user, _ := auth.FetchUser(r.Context(), jobRepo.DB, username); user != nil {
i["name"] = user.Name
i["email"] = user.Email
}
return i
}
func setupClusterRoute(i InfoType, r *http.Request) InfoType {
vars := mux.Vars(r)
i["id"] = vars["cluster"]
i["cluster"] = vars["cluster"]
from, to := r.URL.Query().Get("from"), r.URL.Query().Get("to")
if from != "" || to != "" {
i["from"] = from
i["to"] = to
}
return i
}
func setupNodeRoute(i InfoType, r *http.Request) InfoType {
vars := mux.Vars(r)
i["cluster"] = vars["cluster"]
i["hostname"] = vars["hostname"]
i["id"] = fmt.Sprintf("%s (%s)", vars["cluster"], vars["hostname"])
from, to := r.URL.Query().Get("from"), r.URL.Query().Get("to")
if from != "" || to != "" {
i["from"] = from
i["to"] = to
}
return i
}
func setupAnalysisRoute(i InfoType, r *http.Request) InfoType {
i["cluster"] = mux.Vars(r)["cluster"]
return i
}
func setupTaglistRoute(i InfoType, r *http.Request) InfoType {
var username *string = nil
jobRepo := repository.GetRepository()
if user := auth.GetUser(r.Context()); user != nil && !user.HasRole(auth.RoleAdmin) {
username = &user.Username
}
tags, counts, err := jobRepo.CountTags(username)
tagMap := make(map[string][]map[string]interface{})
if err != nil {
log.Errorf("GetTags failed: %s", err.Error())
i["tagmap"] = tagMap
return i
}
for _, tag := range tags {
tagItem := map[string]interface{}{
"id": tag.ID,
"name": tag.Name,
"count": counts[tag.Name],
}
tagMap[tag.Type] = append(tagMap[tag.Type], tagItem)
}
i["tagmap"] = tagMap
return i
}
func buildFilterPresets(query url.Values) map[string]interface{} {
filterPresets := map[string]interface{}{}
if query.Get("cluster") != "" {
filterPresets["cluster"] = query.Get("cluster")
}
if query.Get("partition") != "" {
filterPresets["partition"] = query.Get("partition")
}
if query.Get("project") != "" {
filterPresets["project"] = query.Get("project")
filterPresets["projectMatch"] = "eq"
}
if query.Get("user") != "" {
filterPresets["user"] = query.Get("user")
filterPresets["userMatch"] = "eq"
}
if len(query["state"]) != 0 {
filterPresets["state"] = query["state"]
}
if rawtags, ok := query["tag"]; ok {
tags := make([]int, len(rawtags))
for i, tid := range rawtags {
var err error
tags[i], err = strconv.Atoi(tid)
if err != nil {
tags[i] = -1
}
}
filterPresets["tags"] = tags
}
if query.Get("duration") != "" {
parts := strings.Split(query.Get("duration"), "-")
if len(parts) == 2 {
a, e1 := strconv.Atoi(parts[0])
b, e2 := strconv.Atoi(parts[1])
if e1 == nil && e2 == nil {
filterPresets["duration"] = map[string]int{"from": a, "to": b}
}
}
}
if query.Get("numNodes") != "" {
parts := strings.Split(query.Get("numNodes"), "-")
if len(parts) == 2 {
a, e1 := strconv.Atoi(parts[0])
b, e2 := strconv.Atoi(parts[1])
if e1 == nil && e2 == nil {
filterPresets["numNodes"] = map[string]int{"from": a, "to": b}
}
}
}
if query.Get("numAccelerators") != "" {
parts := strings.Split(query.Get("numAccelerators"), "-")
if len(parts) == 2 {
a, e1 := strconv.Atoi(parts[0])
b, e2 := strconv.Atoi(parts[1])
if e1 == nil && e2 == nil {
filterPresets["numAccelerators"] = map[string]int{"from": a, "to": b}
}
}
}
if query.Get("jobId") != "" {
filterPresets["jobId"] = query.Get("jobId")
}
if query.Get("arrayJobId") != "" {
if num, err := strconv.Atoi(query.Get("arrayJobId")); err == nil {
filterPresets["arrayJobId"] = num
}
}
if query.Get("startTime") != "" {
parts := strings.Split(query.Get("startTime"), "-")
if len(parts) == 2 {
a, e1 := strconv.ParseInt(parts[0], 10, 64)
b, e2 := strconv.ParseInt(parts[1], 10, 64)
if e1 == nil && e2 == nil {
filterPresets["startTime"] = map[string]string{
"from": time.Unix(a, 0).Format(time.RFC3339),
"to": time.Unix(b, 0).Format(time.RFC3339),
}
}
}
}
return filterPresets
}
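// Editor's note: as an example, a request to
// /monitoring/jobs/?cluster=emmy&state=running&duration=0-3600 produces the
// presets {"cluster": "emmy", "state": ["running"], "duration": {"from": 0, "to": 3600}}.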
func SetupRoutes(router *mux.Router) {
for _, route := range routes {
route := route
router.HandleFunc(route.Route, func(rw http.ResponseWriter, r *http.Request) {
conf, err := config.GetUIConfig(r)
if err != nil {
http.Error(rw, err.Error(), http.StatusInternalServerError)
return
}
title := route.Title
infos := route.Setup(map[string]interface{}{}, r)
if id, ok := infos["id"]; ok {
title = strings.Replace(route.Title, "<ID>", id.(string), 1)
}
username, isAdmin := "", true
if user := auth.GetUser(r.Context()); user != nil {
username = user.Username
isAdmin = user.HasRole(auth.RoleAdmin)
}
page := templates.Page{
Title: title,
User: templates.User{Username: username, IsAdmin: isAdmin},
Config: conf,
Infos: infos,
}
if route.Filter {
page.FilterPresets = buildFilterPresets(r.URL.Query())
}
templates.Render(rw, r, route.Template, &page)
})
}
}

View File

@@ -0,0 +1,131 @@
package runtimeEnv
import (
"bufio"
"errors"
"fmt"
"os"
"os/exec"
"os/user"
"strconv"
"strings"
"syscall"
)
// Very simple and limited .env file reader.
// All variable definitions found are directly
// added to the process's environment.
func LoadEnv(file string) error {
f, err := os.Open(file)
if err != nil {
return err
}
defer f.Close()
s := bufio.NewScanner(bufio.NewReader(f))
for s.Scan() {
line := s.Text()
if strings.HasPrefix(line, "#") || len(line) == 0 {
continue
}
if strings.Contains(line, "#") {
return errors.New("'#' are only supported at the start of a line")
}
line = strings.TrimPrefix(line, "export ")
parts := strings.SplitN(line, "=", 2)
if len(parts) != 2 {
return fmt.Errorf("unsupported line: %#v", line)
}
key := strings.TrimSpace(parts[0])
val := strings.TrimSpace(parts[1])
if strings.HasPrefix(val, "\"") {
if !strings.HasSuffix(val, "\"") {
return fmt.Errorf("unsupported line: %#v", line)
}
runes := []rune(val[1 : len(val)-1])
sb := strings.Builder{}
for i := 0; i < len(runes); i++ {
if runes[i] == '\\' {
i++
switch runes[i] {
case 'n':
sb.WriteRune('\n')
case 'r':
sb.WriteRune('\r')
case 't':
sb.WriteRune('\t')
case '"':
sb.WriteRune('"')
default:
return fmt.Errorf("unsupprorted escape sequence in quoted string: backslash %#v", runes[i])
}
continue
}
sb.WriteRune(runes[i])
}
val = sb.String()
}
os.Setenv(key, val)
}
return s.Err()
}
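// Editor's note: a sketch of a .env file the reader above accepts (names and
// values are assumptions). '#' starts a comment only at the beginning of a
// line, and quoted values support the escapes \n, \r, \t and \":
//
//	# database credentials
//	export DB_PASSWORD="s3cret"
//	GREETING="hello\nworld"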
// Changes the process's user and group to those
// specified in the config.json. The Go runtime
// takes care of all threads (not only the calling
// one) executing the underlying system call.
func DropPrivileges(username string, group string) error {
if group != "" {
g, err := user.LookupGroup(group)
if err != nil {
return err
}
gid, _ := strconv.Atoi(g.Gid)
if err := syscall.Setgid(gid); err != nil {
return err
}
}
if username != "" {
u, err := user.Lookup(username)
if err != nil {
return err
}
uid, _ := strconv.Atoi(u.Uid)
if err := syscall.Setuid(uid); err != nil {
return err
}
}
return nil
}
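// Editor's note: the group is dropped before the user on purpose; once the
// process has given up root via Setuid, it would no longer be allowed to call
// Setgid.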
// If started via systemd, inform systemd that we are running:
// https://www.freedesktop.org/software/systemd/man/sd_notify.html
func SystemdNotifiy(ready bool, status string) {
if os.Getenv("NOTIFY_SOCKET") == "" {
// Not started using systemd
return
}
args := []string{fmt.Sprintf("--pid=%d", os.Getpid())}
if ready {
args = append(args, "--ready")
}
if status != "" {
args = append(args, fmt.Sprintf("--status=%s", status))
}
cmd := exec.Command("systemd-notify", args...)
cmd.Run() // errors ignored on purpose, there is not much to do anyway.
}
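// Editor's note: this shells out to the systemd-notify(1) binary and only has
// an effect when the service unit uses Type=notify, which is what makes
// systemd set the NOTIFY_SOCKET environment variable checked above.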

View File

@@ -0,0 +1,80 @@
package templates
import (
"html/template"
"net/http"
"os"
"github.com/ClusterCockpit/cc-backend/internal/config"
"github.com/ClusterCockpit/cc-backend/pkg/log"
)
var templatesDir string
var debugMode bool = os.Getenv("DEBUG") == "1"
var templates map[string]*template.Template = map[string]*template.Template{}
type User struct {
Username string // Username of the currently logged in user
IsAdmin bool
}
type Page struct {
Title string // Page title
Error string // For generic use (e.g. the exact error message on /login)
Info string // For generic use (e.g. "Logout successful" on /login)
User User // Information about the currently logged in user
Clusters []string // List of all clusters for use in the Header
FilterPresets map[string]interface{} // For pages with the Filter component, this can be used to set initial filters.
Infos map[string]interface{} // For generic use (e.g. username for /monitoring/user/<id>, job id for /monitoring/job/<id>)
Config map[string]interface{} // UI settings for the currently logged in user (e.g. line width, ...)
}
func init() {
bp := "./"
ebp := os.Getenv("BASEPATH")
if ebp != "" {
bp = ebp
}
templatesDir = bp + "web/templates/"
base := template.Must(template.ParseFiles(templatesDir + "base.tmpl"))
files := []string{
"home.tmpl", "404.tmpl", "login.tmpl",
"imprint.tmpl", "privacy.tmpl",
"config.tmpl",
"monitoring/jobs.tmpl",
"monitoring/job.tmpl",
"monitoring/taglist.tmpl",
"monitoring/list.tmpl",
"monitoring/user.tmpl",
"monitoring/systems.tmpl",
"monitoring/status.tmpl",
"monitoring/node.tmpl",
"monitoring/analysis.tmpl",
}
for _, file := range files {
templates[file] = template.Must(template.Must(base.Clone()).ParseFiles(templatesDir + file))
}
}
func Render(rw http.ResponseWriter, r *http.Request, file string, page *Page) {
t, ok := templates[file]
if !ok {
panic("templates must be predefinied!")
}
if debugMode {
t = template.Must(template.ParseFiles(templatesDir+"base.tmpl", templatesDir+file))
}
if page.Clusters == nil {
for _, c := range config.Clusters {
page.Clusters = append(page.Clusters, c.Name)
}
}
if err := t.Execute(rw, page); err != nil {
log.Errorf("template error: %s", err.Error())
}
}