mirror of
https://github.com/ClusterCockpit/cc-backend
synced 2025-09-06 16:52:59 +02:00
Refactor directory structure
This commit is contained in:
642
internal/api/rest.go
Normal file
642
internal/api/rest.go
Normal file
@@ -0,0 +1,642 @@
|
||||
package api
|
||||
|
||||
import (
|
||||
"bufio"
|
||||
"context"
|
||||
"database/sql"
|
||||
"encoding/json"
|
||||
"errors"
|
||||
"fmt"
|
||||
"io"
|
||||
"net/http"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"strconv"
|
||||
"strings"
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
"github.com/ClusterCockpit/cc-backend/internal/auth"
|
||||
"github.com/ClusterCockpit/cc-backend/internal/config"
|
||||
"github.com/ClusterCockpit/cc-backend/internal/graph"
|
||||
"github.com/ClusterCockpit/cc-backend/internal/graph/model"
|
||||
"github.com/ClusterCockpit/cc-backend/internal/metricdata"
|
||||
"github.com/ClusterCockpit/cc-backend/internal/repository"
|
||||
"github.com/ClusterCockpit/cc-backend/pkg/log"
|
||||
"github.com/ClusterCockpit/cc-backend/pkg/schema"
|
||||
"github.com/gorilla/mux"
|
||||
)
|
||||
|
||||
type RestApi struct {
|
||||
JobRepository *repository.JobRepository
|
||||
Resolver *graph.Resolver
|
||||
Authentication *auth.Authentication
|
||||
MachineStateDir string
|
||||
OngoingArchivings sync.WaitGroup
|
||||
}
|
||||
|
||||
func (api *RestApi) MountRoutes(r *mux.Router) {
|
||||
r = r.PathPrefix("/api").Subrouter()
|
||||
r.StrictSlash(true)
|
||||
|
||||
r.HandleFunc("/jobs/start_job/", api.startJob).Methods(http.MethodPost, http.MethodPut)
|
||||
r.HandleFunc("/jobs/stop_job/", api.stopJob).Methods(http.MethodPost, http.MethodPut)
|
||||
r.HandleFunc("/jobs/stop_job/{id}", api.stopJob).Methods(http.MethodPost, http.MethodPut)
|
||||
r.HandleFunc("/jobs/import/", api.importJob).Methods(http.MethodPost, http.MethodPut)
|
||||
|
||||
r.HandleFunc("/jobs/", api.getJobs).Methods(http.MethodGet)
|
||||
// r.HandleFunc("/jobs/{id}", api.getJob).Methods(http.MethodGet)
|
||||
r.HandleFunc("/jobs/tag_job/{id}", api.tagJob).Methods(http.MethodPost, http.MethodPatch)
|
||||
|
||||
r.HandleFunc("/jobs/metrics/{id}", api.getJobMetrics).Methods(http.MethodGet)
|
||||
|
||||
if api.Authentication != nil {
|
||||
r.HandleFunc("/jwt/", api.getJWT).Methods(http.MethodGet)
|
||||
r.HandleFunc("/users/", api.createUser).Methods(http.MethodPost, http.MethodPut)
|
||||
r.HandleFunc("/users/", api.getUsers).Methods(http.MethodGet)
|
||||
r.HandleFunc("/users/", api.deleteUser).Methods(http.MethodDelete)
|
||||
r.HandleFunc("/user/{id}", api.updateUser).Methods(http.MethodPost)
|
||||
r.HandleFunc("/configuration/", api.updateConfiguration).Methods(http.MethodPost)
|
||||
}
|
||||
|
||||
if api.MachineStateDir != "" {
|
||||
r.HandleFunc("/machine_state/{cluster}/{host}", api.getMachineState).Methods(http.MethodGet)
|
||||
r.HandleFunc("/machine_state/{cluster}/{host}", api.putMachineState).Methods(http.MethodPut, http.MethodPost)
|
||||
}
|
||||
}
|
||||
|
||||
// StartJobApiResponse is the JSON body returned by startJob on success.
type StartJobApiResponse struct {
	// DBID is the database id of the newly inserted job, encoded as "id".
	DBID int64 `json:"id"`
}
|
||||
|
||||
type StopJobApiRequest struct {
|
||||
// JobId, ClusterId and StartTime are optional.
|
||||
// They are only used if no database id was provided.
|
||||
JobId *int64 `json:"jobId"`
|
||||
Cluster *string `json:"cluster"`
|
||||
StartTime *int64 `json:"startTime"`
|
||||
|
||||
// Payload
|
||||
StopTime int64 `json:"stopTime"`
|
||||
State schema.JobState `json:"jobState"`
|
||||
}
|
||||
|
||||
// ErrorResponse is the JSON body written by handleError.
type ErrorResponse struct {
	// Status is the textual form of the HTTP status code.
	Status string `json:"status"`
	// Error is the message of the error that caused the failure.
	Error string `json:"error"`
}
|
||||
|
||||
func handleError(err error, statusCode int, rw http.ResponseWriter) {
|
||||
log.Warnf("REST API: %s", err.Error())
|
||||
rw.Header().Add("Content-Type", "application/json")
|
||||
rw.WriteHeader(statusCode)
|
||||
json.NewEncoder(rw).Encode(ErrorResponse{
|
||||
Status: http.StatusText(statusCode),
|
||||
Error: err.Error(),
|
||||
})
|
||||
}
|
||||
|
||||
// decode reads one JSON value from r into val. Fields in the input that have
// no counterpart in val are rejected with an error instead of being ignored.
func decode(r io.Reader, val interface{}) error {
	decoder := json.NewDecoder(r)
	decoder.DisallowUnknownFields()
	if err := decoder.Decode(val); err != nil {
		return err
	}
	return nil
}
|
||||
|
||||
// TagJobApiRequest is the JSON body expected by tagJob: a list of tags, each
// given by its name and type.
type TagJobApiRequest []*struct {
	Name string `json:"name"`
	Type string `json:"type"`
}
|
||||
|
||||
// Return a list of jobs
|
||||
func (api *RestApi) getJobs(rw http.ResponseWriter, r *http.Request) {
|
||||
withMetadata := false
|
||||
filter := &model.JobFilter{}
|
||||
page := &model.PageRequest{ItemsPerPage: -1, Page: 1}
|
||||
order := &model.OrderByInput{Field: "startTime", Order: model.SortDirectionEnumDesc}
|
||||
for key, vals := range r.URL.Query() {
|
||||
switch key {
|
||||
case "state":
|
||||
for _, s := range vals {
|
||||
state := schema.JobState(s)
|
||||
if !state.Valid() {
|
||||
http.Error(rw, "invalid query parameter value: state", http.StatusBadRequest)
|
||||
return
|
||||
}
|
||||
filter.State = append(filter.State, state)
|
||||
}
|
||||
case "cluster":
|
||||
filter.Cluster = &model.StringInput{Eq: &vals[0]}
|
||||
case "start-time":
|
||||
st := strings.Split(vals[0], "-")
|
||||
if len(st) != 2 {
|
||||
http.Error(rw, "invalid query parameter value: startTime", http.StatusBadRequest)
|
||||
return
|
||||
}
|
||||
from, err := strconv.ParseInt(st[0], 10, 64)
|
||||
if err != nil {
|
||||
http.Error(rw, err.Error(), http.StatusBadRequest)
|
||||
return
|
||||
}
|
||||
to, err := strconv.ParseInt(st[1], 10, 64)
|
||||
if err != nil {
|
||||
http.Error(rw, err.Error(), http.StatusBadRequest)
|
||||
return
|
||||
}
|
||||
ufrom, uto := time.Unix(from, 0), time.Unix(to, 0)
|
||||
filter.StartTime = &model.TimeRange{From: &ufrom, To: &uto}
|
||||
case "page":
|
||||
x, err := strconv.Atoi(vals[0])
|
||||
if err != nil {
|
||||
http.Error(rw, err.Error(), http.StatusBadRequest)
|
||||
return
|
||||
}
|
||||
page.Page = x
|
||||
case "items-per-page":
|
||||
x, err := strconv.Atoi(vals[0])
|
||||
if err != nil {
|
||||
http.Error(rw, err.Error(), http.StatusBadRequest)
|
||||
return
|
||||
}
|
||||
page.ItemsPerPage = x
|
||||
case "with-metadata":
|
||||
withMetadata = true
|
||||
default:
|
||||
http.Error(rw, "invalid query parameter: "+key, http.StatusBadRequest)
|
||||
return
|
||||
}
|
||||
}
|
||||
|
||||
jobs, err := api.JobRepository.QueryJobs(r.Context(), []*model.JobFilter{filter}, page, order)
|
||||
if err != nil {
|
||||
http.Error(rw, err.Error(), http.StatusInternalServerError)
|
||||
return
|
||||
}
|
||||
|
||||
results := make([]*schema.JobMeta, 0, len(jobs))
|
||||
for _, job := range jobs {
|
||||
if withMetadata {
|
||||
if _, err := api.JobRepository.FetchMetadata(job); err != nil {
|
||||
http.Error(rw, err.Error(), http.StatusInternalServerError)
|
||||
return
|
||||
}
|
||||
}
|
||||
|
||||
res := &schema.JobMeta{
|
||||
ID: &job.ID,
|
||||
BaseJob: job.BaseJob,
|
||||
StartTime: job.StartTime.Unix(),
|
||||
}
|
||||
|
||||
res.Tags, err = api.JobRepository.GetTags(&job.ID)
|
||||
if err != nil {
|
||||
http.Error(rw, err.Error(), http.StatusInternalServerError)
|
||||
return
|
||||
}
|
||||
|
||||
if res.MonitoringStatus == schema.MonitoringStatusArchivingSuccessful {
|
||||
res.Statistics, err = metricdata.GetStatistics(job)
|
||||
if err != nil {
|
||||
if err != nil {
|
||||
http.Error(rw, err.Error(), http.StatusInternalServerError)
|
||||
return
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
results = append(results, res)
|
||||
}
|
||||
|
||||
log.Debugf("/api/jobs: %d jobs returned", len(results))
|
||||
bw := bufio.NewWriter(rw)
|
||||
defer bw.Flush()
|
||||
if err := json.NewEncoder(bw).Encode(map[string]interface{}{
|
||||
"jobs": results,
|
||||
}); err != nil {
|
||||
http.Error(rw, err.Error(), http.StatusInternalServerError)
|
||||
return
|
||||
}
|
||||
}
|
||||
|
||||
// Add a tag to a job
|
||||
func (api *RestApi) tagJob(rw http.ResponseWriter, r *http.Request) {
|
||||
iid, err := strconv.ParseInt(mux.Vars(r)["id"], 10, 64)
|
||||
if err != nil {
|
||||
http.Error(rw, err.Error(), http.StatusBadRequest)
|
||||
return
|
||||
}
|
||||
|
||||
job, err := api.JobRepository.FindById(iid)
|
||||
if err != nil {
|
||||
http.Error(rw, err.Error(), http.StatusNotFound)
|
||||
return
|
||||
}
|
||||
|
||||
job.Tags, err = api.JobRepository.GetTags(&job.ID)
|
||||
if err != nil {
|
||||
http.Error(rw, err.Error(), http.StatusInternalServerError)
|
||||
return
|
||||
}
|
||||
|
||||
var req TagJobApiRequest
|
||||
if err := decode(r.Body, &req); err != nil {
|
||||
http.Error(rw, err.Error(), http.StatusBadRequest)
|
||||
return
|
||||
}
|
||||
|
||||
for _, tag := range req {
|
||||
tagId, err := api.JobRepository.AddTagOrCreate(job.ID, tag.Type, tag.Name)
|
||||
if err != nil {
|
||||
http.Error(rw, err.Error(), http.StatusInternalServerError)
|
||||
return
|
||||
}
|
||||
|
||||
job.Tags = append(job.Tags, &schema.Tag{
|
||||
ID: tagId,
|
||||
Type: tag.Type,
|
||||
Name: tag.Name,
|
||||
})
|
||||
}
|
||||
|
||||
rw.Header().Add("Content-Type", "application/json")
|
||||
rw.WriteHeader(http.StatusOK)
|
||||
json.NewEncoder(rw).Encode(job)
|
||||
}
|
||||
|
||||
// A new job started. The body should be in the `meta.json` format, but some fields required
|
||||
// there are optional here (e.g. `jobState` defaults to "running").
|
||||
func (api *RestApi) startJob(rw http.ResponseWriter, r *http.Request) {
|
||||
if user := auth.GetUser(r.Context()); user != nil && !user.HasRole(auth.RoleApi) {
|
||||
handleError(fmt.Errorf("missing role: %#v", auth.RoleApi), http.StatusForbidden, rw)
|
||||
return
|
||||
}
|
||||
|
||||
req := schema.JobMeta{BaseJob: schema.JobDefaults}
|
||||
if err := decode(r.Body, &req); err != nil {
|
||||
handleError(fmt.Errorf("parsing request body failed: %w", err), http.StatusBadRequest, rw)
|
||||
return
|
||||
}
|
||||
|
||||
if req.State == "" {
|
||||
req.State = schema.JobStateRunning
|
||||
}
|
||||
if err := repository.SanityChecks(&req.BaseJob); err != nil {
|
||||
handleError(err, http.StatusBadRequest, rw)
|
||||
return
|
||||
}
|
||||
|
||||
// Check if combination of (job_id, cluster_id, start_time) already exists:
|
||||
job, err := api.JobRepository.Find(&req.JobID, &req.Cluster, nil)
|
||||
if err != nil && err != sql.ErrNoRows {
|
||||
handleError(fmt.Errorf("checking for duplicate failed: %w", err), http.StatusInternalServerError, rw)
|
||||
return
|
||||
} else if err == nil {
|
||||
if (req.StartTime - job.StartTimeUnix) < 86400 {
|
||||
handleError(fmt.Errorf("a job with that jobId, cluster and startTime already exists: dbid: %d", job.ID), http.StatusUnprocessableEntity, rw)
|
||||
return
|
||||
}
|
||||
}
|
||||
|
||||
id, err := api.JobRepository.Start(&req)
|
||||
if err != nil {
|
||||
handleError(fmt.Errorf("insert into database failed: %w", err), http.StatusInternalServerError, rw)
|
||||
return
|
||||
}
|
||||
|
||||
for _, tag := range req.Tags {
|
||||
if _, err := api.JobRepository.AddTagOrCreate(id, tag.Type, tag.Name); err != nil {
|
||||
http.Error(rw, err.Error(), http.StatusInternalServerError)
|
||||
handleError(fmt.Errorf("adding tag to new job %d failed: %w", id, err), http.StatusInternalServerError, rw)
|
||||
return
|
||||
}
|
||||
}
|
||||
|
||||
log.Printf("new job (id: %d): cluster=%s, jobId=%d, user=%s, startTime=%d", id, req.Cluster, req.JobID, req.User, req.StartTime)
|
||||
rw.Header().Add("Content-Type", "application/json")
|
||||
rw.WriteHeader(http.StatusCreated)
|
||||
json.NewEncoder(rw).Encode(StartJobApiResponse{
|
||||
DBID: id,
|
||||
})
|
||||
}
|
||||
|
||||
// A job has stopped and should be archived.
|
||||
func (api *RestApi) stopJob(rw http.ResponseWriter, r *http.Request) {
|
||||
if user := auth.GetUser(r.Context()); user != nil && !user.HasRole(auth.RoleApi) {
|
||||
handleError(fmt.Errorf("missing role: %#v", auth.RoleApi), http.StatusForbidden, rw)
|
||||
return
|
||||
}
|
||||
|
||||
// Parse request body
|
||||
req := StopJobApiRequest{}
|
||||
if err := decode(r.Body, &req); err != nil {
|
||||
handleError(fmt.Errorf("parsing request body failed: %w", err), http.StatusBadRequest, rw)
|
||||
return
|
||||
}
|
||||
|
||||
// Fetch job (that will be stopped) from db
|
||||
id, ok := mux.Vars(r)["id"]
|
||||
var job *schema.Job
|
||||
var err error
|
||||
if ok {
|
||||
id, e := strconv.ParseInt(id, 10, 64)
|
||||
if e != nil {
|
||||
handleError(fmt.Errorf("integer expected in path for id: %w", e), http.StatusBadRequest, rw)
|
||||
return
|
||||
}
|
||||
|
||||
job, err = api.JobRepository.FindById(id)
|
||||
} else {
|
||||
if req.JobId == nil {
|
||||
handleError(errors.New("the field 'jobId' is required"), http.StatusBadRequest, rw)
|
||||
return
|
||||
}
|
||||
|
||||
job, err = api.JobRepository.Find(req.JobId, req.Cluster, req.StartTime)
|
||||
}
|
||||
if err != nil {
|
||||
handleError(fmt.Errorf("finding job failed: %w", err), http.StatusUnprocessableEntity, rw)
|
||||
return
|
||||
}
|
||||
|
||||
// Sanity checks
|
||||
if job == nil || job.StartTime.Unix() >= req.StopTime || job.State != schema.JobStateRunning {
|
||||
handleError(errors.New("stopTime must be larger than startTime and only running jobs can be stopped"), http.StatusBadRequest, rw)
|
||||
return
|
||||
}
|
||||
if req.State != "" && !req.State.Valid() {
|
||||
handleError(fmt.Errorf("invalid job state: %#v", req.State), http.StatusBadRequest, rw)
|
||||
return
|
||||
} else {
|
||||
req.State = schema.JobStateCompleted
|
||||
}
|
||||
|
||||
// Mark job as stopped in the database (update state and duration)
|
||||
job.Duration = int32(req.StopTime - job.StartTime.Unix())
|
||||
job.State = req.State
|
||||
if err := api.JobRepository.Stop(job.ID, job.Duration, job.State, job.MonitoringStatus); err != nil {
|
||||
handleError(fmt.Errorf("marking job as stopped failed: %w", err), http.StatusInternalServerError, rw)
|
||||
return
|
||||
}
|
||||
|
||||
log.Printf("archiving job... (dbid: %d): cluster=%s, jobId=%d, user=%s, startTime=%s", job.ID, job.Cluster, job.JobID, job.User, job.StartTime)
|
||||
|
||||
// Send a response (with status OK). This means that erros that happen from here on forward
|
||||
// can *NOT* be communicated to the client. If reading from a MetricDataRepository or
|
||||
// writing to the filesystem fails, the client will not know.
|
||||
rw.Header().Add("Content-Type", "application/json")
|
||||
rw.WriteHeader(http.StatusOK)
|
||||
json.NewEncoder(rw).Encode(job)
|
||||
|
||||
// Monitoring is disabled...
|
||||
if job.MonitoringStatus == schema.MonitoringStatusDisabled {
|
||||
return
|
||||
}
|
||||
|
||||
// We need to start a new goroutine as this functions needs to return
|
||||
// for the response to be flushed to the client.
|
||||
api.OngoingArchivings.Add(1) // So that a shutdown does not interrupt this goroutine.
|
||||
go func() {
|
||||
defer api.OngoingArchivings.Done()
|
||||
|
||||
if _, err := api.JobRepository.FetchMetadata(job); err != nil {
|
||||
log.Errorf("archiving job (dbid: %d) failed: %s", job.ID, err.Error())
|
||||
api.JobRepository.UpdateMonitoringStatus(job.ID, schema.MonitoringStatusArchivingFailed)
|
||||
return
|
||||
}
|
||||
|
||||
// metricdata.ArchiveJob will fetch all the data from a MetricDataRepository and create meta.json/data.json files
|
||||
jobMeta, err := metricdata.ArchiveJob(job, context.Background())
|
||||
if err != nil {
|
||||
log.Errorf("archiving job (dbid: %d) failed: %s", job.ID, err.Error())
|
||||
api.JobRepository.UpdateMonitoringStatus(job.ID, schema.MonitoringStatusArchivingFailed)
|
||||
return
|
||||
}
|
||||
|
||||
// Update the jobs database entry one last time:
|
||||
if err := api.JobRepository.Archive(job.ID, schema.MonitoringStatusArchivingSuccessful, jobMeta.Statistics); err != nil {
|
||||
log.Errorf("archiving job (dbid: %d) failed: %s", job.ID, err.Error())
|
||||
return
|
||||
}
|
||||
|
||||
log.Printf("archiving job (dbid: %d) successful", job.ID)
|
||||
}()
|
||||
}
|
||||
|
||||
func (api *RestApi) importJob(rw http.ResponseWriter, r *http.Request) {
|
||||
if user := auth.GetUser(r.Context()); user != nil && !user.HasRole(auth.RoleApi) {
|
||||
handleError(fmt.Errorf("missing role: %#v", auth.RoleApi), http.StatusForbidden, rw)
|
||||
return
|
||||
}
|
||||
|
||||
var body struct {
|
||||
Meta *schema.JobMeta `json:"meta"`
|
||||
Data *schema.JobData `json:"data"`
|
||||
}
|
||||
if err := decode(r.Body, &body); err != nil {
|
||||
handleError(fmt.Errorf("import failed: %s", err.Error()), http.StatusBadRequest, rw)
|
||||
return
|
||||
}
|
||||
|
||||
if err := api.JobRepository.ImportJob(body.Meta, body.Data); err != nil {
|
||||
handleError(fmt.Errorf("import failed: %s", err.Error()), http.StatusUnprocessableEntity, rw)
|
||||
return
|
||||
}
|
||||
|
||||
rw.Write([]byte(`{ "status": "OK" }`))
|
||||
}
|
||||
|
||||
func (api *RestApi) getJobMetrics(rw http.ResponseWriter, r *http.Request) {
|
||||
id := mux.Vars(r)["id"]
|
||||
metrics := r.URL.Query()["metric"]
|
||||
var scopes []schema.MetricScope
|
||||
for _, scope := range r.URL.Query()["scope"] {
|
||||
var s schema.MetricScope
|
||||
if err := s.UnmarshalGQL(scope); err != nil {
|
||||
http.Error(rw, err.Error(), http.StatusBadRequest)
|
||||
return
|
||||
}
|
||||
scopes = append(scopes, s)
|
||||
}
|
||||
|
||||
rw.Header().Add("Content-Type", "application/json")
|
||||
rw.WriteHeader(http.StatusOK)
|
||||
|
||||
type Respone struct {
|
||||
Data *struct {
|
||||
JobMetrics []*model.JobMetricWithName `json:"jobMetrics"`
|
||||
} `json:"data"`
|
||||
Error *struct {
|
||||
Message string `json:"message"`
|
||||
} `json:"error"`
|
||||
}
|
||||
|
||||
data, err := api.Resolver.Query().JobMetrics(r.Context(), id, metrics, scopes)
|
||||
if err != nil {
|
||||
json.NewEncoder(rw).Encode(Respone{
|
||||
Error: &struct {
|
||||
Message string "json:\"message\""
|
||||
}{Message: err.Error()},
|
||||
})
|
||||
return
|
||||
}
|
||||
|
||||
json.NewEncoder(rw).Encode(Respone{
|
||||
Data: &struct {
|
||||
JobMetrics []*model.JobMetricWithName "json:\"jobMetrics\""
|
||||
}{JobMetrics: data},
|
||||
})
|
||||
}
|
||||
|
||||
func (api *RestApi) getJWT(rw http.ResponseWriter, r *http.Request) {
|
||||
rw.Header().Set("Content-Type", "text/plain")
|
||||
username := r.FormValue("username")
|
||||
me := auth.GetUser(r.Context())
|
||||
if !me.HasRole(auth.RoleAdmin) {
|
||||
if username != me.Username {
|
||||
http.Error(rw, "only admins are allowed to sign JWTs not for themselves", http.StatusForbidden)
|
||||
return
|
||||
}
|
||||
}
|
||||
|
||||
user, err := api.Authentication.FetchUser(username)
|
||||
if err != nil {
|
||||
http.Error(rw, err.Error(), http.StatusUnprocessableEntity)
|
||||
return
|
||||
}
|
||||
|
||||
jwt, err := api.Authentication.ProvideJWT(user)
|
||||
if err != nil {
|
||||
http.Error(rw, err.Error(), http.StatusUnprocessableEntity)
|
||||
return
|
||||
}
|
||||
|
||||
rw.WriteHeader(http.StatusOK)
|
||||
rw.Write([]byte(jwt))
|
||||
}
|
||||
|
||||
func (api *RestApi) createUser(rw http.ResponseWriter, r *http.Request) {
|
||||
rw.Header().Set("Content-Type", "text/plain")
|
||||
me := auth.GetUser(r.Context())
|
||||
if !me.HasRole(auth.RoleAdmin) {
|
||||
http.Error(rw, "only admins are allowed to create new users", http.StatusForbidden)
|
||||
return
|
||||
}
|
||||
|
||||
username, password, role, name, email := r.FormValue("username"), r.FormValue("password"), r.FormValue("role"), r.FormValue("name"), r.FormValue("email")
|
||||
if len(password) == 0 && role != auth.RoleApi {
|
||||
http.Error(rw, "only API users are allowed to have a blank password (login will be impossible)", http.StatusBadRequest)
|
||||
return
|
||||
}
|
||||
|
||||
if err := api.Authentication.CreateUser(username, name, password, email, []string{role}); err != nil {
|
||||
http.Error(rw, err.Error(), http.StatusUnprocessableEntity)
|
||||
return
|
||||
}
|
||||
|
||||
rw.Write([]byte(fmt.Sprintf("User %#v successfully created!\n", username)))
|
||||
}
|
||||
|
||||
func (api *RestApi) deleteUser(rw http.ResponseWriter, r *http.Request) {
|
||||
if user := auth.GetUser(r.Context()); !user.HasRole(auth.RoleAdmin) {
|
||||
http.Error(rw, "only admins are allowed to delete a user", http.StatusForbidden)
|
||||
return
|
||||
}
|
||||
|
||||
username := r.FormValue("username")
|
||||
if err := api.Authentication.DelUser(username); err != nil {
|
||||
http.Error(rw, err.Error(), http.StatusUnprocessableEntity)
|
||||
return
|
||||
}
|
||||
|
||||
rw.WriteHeader(http.StatusOK)
|
||||
}
|
||||
|
||||
func (api *RestApi) getUsers(rw http.ResponseWriter, r *http.Request) {
|
||||
if user := auth.GetUser(r.Context()); !user.HasRole(auth.RoleAdmin) {
|
||||
http.Error(rw, "only admins are allowed to fetch a list of users", http.StatusForbidden)
|
||||
return
|
||||
}
|
||||
|
||||
users, err := api.Authentication.FetchUsers(
|
||||
r.URL.Query().Get("via-ldap") == "true",
|
||||
r.URL.Query().Get("not-just-user") == "true")
|
||||
if err != nil {
|
||||
http.Error(rw, err.Error(), http.StatusInternalServerError)
|
||||
return
|
||||
}
|
||||
|
||||
json.NewEncoder(rw).Encode(users)
|
||||
}
|
||||
|
||||
func (api *RestApi) updateUser(rw http.ResponseWriter, r *http.Request) {
|
||||
if user := auth.GetUser(r.Context()); !user.HasRole(auth.RoleAdmin) {
|
||||
http.Error(rw, "only admins are allowed to update a user", http.StatusForbidden)
|
||||
return
|
||||
}
|
||||
|
||||
// TODO: Handle anything but roles...
|
||||
newrole := r.FormValue("add-role")
|
||||
if err := api.Authentication.AddRole(r.Context(), mux.Vars(r)["id"], newrole); err != nil {
|
||||
http.Error(rw, err.Error(), http.StatusUnprocessableEntity)
|
||||
return
|
||||
}
|
||||
|
||||
rw.Write([]byte("success"))
|
||||
}
|
||||
|
||||
func (api *RestApi) updateConfiguration(rw http.ResponseWriter, r *http.Request) {
|
||||
rw.Header().Set("Content-Type", "text/plain")
|
||||
key, value := r.FormValue("key"), r.FormValue("value")
|
||||
|
||||
fmt.Printf("KEY: %#v\nVALUE: %#v\n", key, value)
|
||||
|
||||
if err := config.UpdateConfig(key, value, r.Context()); err != nil {
|
||||
http.Error(rw, err.Error(), http.StatusUnprocessableEntity)
|
||||
return
|
||||
}
|
||||
|
||||
rw.Write([]byte("success"))
|
||||
}
|
||||
|
||||
func (api *RestApi) putMachineState(rw http.ResponseWriter, r *http.Request) {
|
||||
if api.MachineStateDir == "" {
|
||||
http.Error(rw, "not enabled", http.StatusNotFound)
|
||||
return
|
||||
}
|
||||
|
||||
vars := mux.Vars(r)
|
||||
cluster := vars["cluster"]
|
||||
host := vars["host"]
|
||||
dir := filepath.Join(api.MachineStateDir, cluster)
|
||||
if err := os.MkdirAll(dir, 0755); err != nil {
|
||||
http.Error(rw, err.Error(), http.StatusInternalServerError)
|
||||
return
|
||||
}
|
||||
|
||||
filename := filepath.Join(dir, fmt.Sprintf("%s.json", host))
|
||||
f, err := os.Create(filename)
|
||||
if err != nil {
|
||||
http.Error(rw, err.Error(), http.StatusInternalServerError)
|
||||
return
|
||||
}
|
||||
defer f.Close()
|
||||
|
||||
if _, err := io.Copy(f, r.Body); err != nil {
|
||||
http.Error(rw, err.Error(), http.StatusInternalServerError)
|
||||
return
|
||||
}
|
||||
|
||||
rw.WriteHeader(http.StatusCreated)
|
||||
}
|
||||
|
||||
func (api *RestApi) getMachineState(rw http.ResponseWriter, r *http.Request) {
|
||||
if api.MachineStateDir == "" {
|
||||
http.Error(rw, "not enabled", http.StatusNotFound)
|
||||
return
|
||||
}
|
||||
|
||||
vars := mux.Vars(r)
|
||||
filename := filepath.Join(api.MachineStateDir, vars["cluster"], fmt.Sprintf("%s.json", vars["host"]))
|
||||
|
||||
// Sets the content-type and 'Last-Modified' Header and so on automatically
|
||||
http.ServeFile(rw, r, filename)
|
||||
}
|
473
internal/auth/auth.go
Normal file
473
internal/auth/auth.go
Normal file
@@ -0,0 +1,473 @@
|
||||
package auth
|
||||
|
||||
import (
|
||||
"context"
|
||||
"crypto/ed25519"
|
||||
"crypto/rand"
|
||||
"database/sql"
|
||||
"encoding/base64"
|
||||
"encoding/json"
|
||||
"errors"
|
||||
"fmt"
|
||||
"net/http"
|
||||
"os"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"github.com/ClusterCockpit/cc-backend/internal/graph/model"
|
||||
"github.com/ClusterCockpit/cc-backend/pkg/log"
|
||||
sq "github.com/Masterminds/squirrel"
|
||||
"github.com/golang-jwt/jwt/v4"
|
||||
"github.com/gorilla/sessions"
|
||||
"github.com/jmoiron/sqlx"
|
||||
"golang.org/x/crypto/bcrypt"
|
||||
)
|
||||
|
||||
// User describes an account known to the authentication layer.
//
// Only Username and Roles will always be filled in when returned by `GetUser`.
// If Name and Email is needed as well, use auth.FetchUser(), which does a
// database query for all fields.
type User struct {
	Username string   `json:"username"`
	Password string   `json:"-"` // never serialized to JSON
	Name     string   `json:"name"`
	Roles    []string `json:"roles"`
	ViaLdap  bool     `json:"via-ldap"`
	Email    string   `json:"email"`
}

// The roles a user can have.
const (
	RoleAdmin string = "admin"
	RoleApi   string = "api"
	RoleUser  string = "user"
)

// HasRole reports whether role is one of the user's roles.
func (u *User) HasRole(role string) bool {
	for i := range u.Roles {
		if u.Roles[i] == role {
			return true
		}
	}
	return false
}
|
||||
|
||||
// ContextKey is a dedicated string type for context keys.
type ContextKey string

// ContextUserKey is the context key named "user".
// NOTE(review): presumably the key under which the current user is stored in
// the request context — confirm against GetUser.
const ContextUserKey = ContextKey("user")
|
||||
|
||||
type Authentication struct {
|
||||
db *sqlx.DB
|
||||
sessionStore *sessions.CookieStore
|
||||
jwtPublicKey ed25519.PublicKey
|
||||
jwtPrivateKey ed25519.PrivateKey
|
||||
|
||||
ldapConfig *LdapConfig
|
||||
ldapSyncUserPassword string
|
||||
|
||||
// If zero, tokens/sessions do not expire.
|
||||
SessionMaxAge time.Duration
|
||||
JwtMaxAge time.Duration
|
||||
}
|
||||
|
||||
func (auth *Authentication) Init(db *sqlx.DB, ldapConfig *LdapConfig) error {
|
||||
auth.db = db
|
||||
_, err := db.Exec(`
|
||||
CREATE TABLE IF NOT EXISTS user (
|
||||
username varchar(255) PRIMARY KEY NOT NULL,
|
||||
password varchar(255) DEFAULT NULL,
|
||||
ldap tinyint NOT NULL DEFAULT 0,
|
||||
name varchar(255) DEFAULT NULL,
|
||||
roles varchar(255) NOT NULL DEFAULT "[]",
|
||||
email varchar(255) DEFAULT NULL);`)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
sessKey := os.Getenv("SESSION_KEY")
|
||||
if sessKey == "" {
|
||||
log.Warn("environment variable 'SESSION_KEY' not set (will use non-persistent random key)")
|
||||
bytes := make([]byte, 32)
|
||||
if _, err := rand.Read(bytes); err != nil {
|
||||
return err
|
||||
}
|
||||
auth.sessionStore = sessions.NewCookieStore(bytes)
|
||||
} else {
|
||||
bytes, err := base64.StdEncoding.DecodeString(sessKey)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
auth.sessionStore = sessions.NewCookieStore(bytes)
|
||||
}
|
||||
|
||||
pubKey, privKey := os.Getenv("JWT_PUBLIC_KEY"), os.Getenv("JWT_PRIVATE_KEY")
|
||||
if pubKey == "" || privKey == "" {
|
||||
log.Warn("environment variables 'JWT_PUBLIC_KEY' or 'JWT_PRIVATE_KEY' not set (token based authentication will not work)")
|
||||
} else {
|
||||
bytes, err := base64.StdEncoding.DecodeString(pubKey)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
auth.jwtPublicKey = ed25519.PublicKey(bytes)
|
||||
bytes, err = base64.StdEncoding.DecodeString(privKey)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
auth.jwtPrivateKey = ed25519.PrivateKey(bytes)
|
||||
}
|
||||
|
||||
if ldapConfig != nil {
|
||||
auth.ldapConfig = ldapConfig
|
||||
if err := auth.initLdap(); err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
// arg must be formated like this: "<username>:[admin|api|]:<password>"
|
||||
func (auth *Authentication) AddUser(arg string) error {
|
||||
parts := strings.SplitN(arg, ":", 3)
|
||||
if len(parts) != 3 || len(parts[0]) == 0 {
|
||||
return errors.New("invalid argument format")
|
||||
}
|
||||
|
||||
roles := strings.Split(parts[1], ",")
|
||||
return auth.CreateUser(parts[0], "", parts[2], "", roles)
|
||||
}
|
||||
|
||||
func (auth *Authentication) CreateUser(username, name, password, email string, roles []string) error {
|
||||
for _, role := range roles {
|
||||
if role != RoleAdmin && role != RoleApi && role != RoleUser {
|
||||
return fmt.Errorf("invalid user role: %#v", role)
|
||||
}
|
||||
}
|
||||
|
||||
if username == "" {
|
||||
return errors.New("username should not be empty")
|
||||
}
|
||||
|
||||
if password != "" {
|
||||
bytes, err := bcrypt.GenerateFromPassword([]byte(password), bcrypt.DefaultCost)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
password = string(bytes)
|
||||
}
|
||||
|
||||
rolesJson, _ := json.Marshal(roles)
|
||||
cols := []string{"username", "password", "roles"}
|
||||
vals := []interface{}{username, password, string(rolesJson)}
|
||||
if name != "" {
|
||||
cols = append(cols, "name")
|
||||
vals = append(vals, name)
|
||||
}
|
||||
if email != "" {
|
||||
cols = append(cols, "email")
|
||||
vals = append(vals, email)
|
||||
}
|
||||
|
||||
if _, err := sq.Insert("user").Columns(cols...).Values(vals...).RunWith(auth.db).Exec(); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
log.Infof("new user %#v created (roles: %s)", username, roles)
|
||||
return nil
|
||||
}
|
||||
|
||||
func (auth *Authentication) AddRole(ctx context.Context, username string, role string) error {
|
||||
user, err := auth.FetchUser(username)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
if role != RoleAdmin && role != RoleApi && role != RoleUser {
|
||||
return fmt.Errorf("invalid user role: %#v", role)
|
||||
}
|
||||
|
||||
for _, r := range user.Roles {
|
||||
if r == role {
|
||||
return fmt.Errorf("user %#v already has role %#v", username, role)
|
||||
}
|
||||
}
|
||||
|
||||
roles, _ := json.Marshal(append(user.Roles, role))
|
||||
if _, err := sq.Update("user").Set("roles", roles).Where("user.username = ?", username).RunWith(auth.db).Exec(); err != nil {
|
||||
return err
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func (auth *Authentication) DelUser(username string) error {
|
||||
_, err := auth.db.Exec(`DELETE FROM user WHERE user.username = ?`, username)
|
||||
return err
|
||||
}
|
||||
|
||||
func (auth *Authentication) FetchUsers(viaLdap, notJustUser bool) ([]*User, error) {
|
||||
q := sq.Select("username", "name", "email", "roles").From("user")
|
||||
if !viaLdap {
|
||||
if notJustUser {
|
||||
q = q.Where("ldap = 0 OR (roles != '[\"user\"]' AND roles != '[]')")
|
||||
} else {
|
||||
q = q.Where("ldap = 0")
|
||||
}
|
||||
} else {
|
||||
if notJustUser {
|
||||
q = q.Where("ldap = 1 OR (roles != '[\"user\"]' AND roles != '[]')")
|
||||
} else {
|
||||
q = q.Where("ldap = 1")
|
||||
}
|
||||
}
|
||||
|
||||
rows, err := q.RunWith(auth.db).Query()
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
users := make([]*User, 0)
|
||||
defer rows.Close()
|
||||
for rows.Next() {
|
||||
rawroles := ""
|
||||
user := &User{}
|
||||
var name, email sql.NullString
|
||||
if err := rows.Scan(&user.Username, &name, &email, &rawroles); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
if err := json.Unmarshal([]byte(rawroles), &user.Roles); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
user.Name = name.String
|
||||
user.Email = email.String
|
||||
users = append(users, user)
|
||||
}
|
||||
return users, nil
|
||||
}
|
||||
|
||||
func (auth *Authentication) FetchUser(username string) (*User, error) {
|
||||
user := &User{Username: username}
|
||||
var hashedPassword, name, rawRoles, email sql.NullString
|
||||
if err := sq.Select("password", "ldap", "name", "roles", "email").From("user").
|
||||
Where("user.username = ?", username).RunWith(auth.db).
|
||||
QueryRow().Scan(&hashedPassword, &user.ViaLdap, &name, &rawRoles, &email); err != nil {
|
||||
return nil, fmt.Errorf("user '%s' not found (%s)", username, err.Error())
|
||||
}
|
||||
|
||||
user.Password = hashedPassword.String
|
||||
user.Name = name.String
|
||||
user.Email = email.String
|
||||
if rawRoles.Valid {
|
||||
if err := json.Unmarshal([]byte(rawRoles.String), &user.Roles); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
}
|
||||
|
||||
return user, nil
|
||||
}
|
||||
|
||||
func FetchUser(ctx context.Context, db *sqlx.DB, username string) (*model.User, error) {
|
||||
me := GetUser(ctx)
|
||||
if me != nil && !me.HasRole(RoleAdmin) && me.Username != username {
|
||||
return nil, errors.New("forbidden")
|
||||
}
|
||||
|
||||
user := &model.User{Username: username}
|
||||
var name, email sql.NullString
|
||||
if err := sq.Select("name", "email").From("user").Where("user.username = ?", username).
|
||||
RunWith(db).QueryRow().Scan(&name, &email); err != nil {
|
||||
if err == sql.ErrNoRows {
|
||||
return nil, nil
|
||||
}
|
||||
|
||||
return nil, err
|
||||
}
|
||||
|
||||
user.Name = name.String
|
||||
user.Email = email.String
|
||||
return user, nil
|
||||
}
|
||||
|
||||
// Handle a POST request that should log the user in, starting a new session.
|
||||
func (auth *Authentication) Login(onsuccess http.Handler, onfailure func(rw http.ResponseWriter, r *http.Request, loginErr error)) http.Handler {
|
||||
return http.HandlerFunc(func(rw http.ResponseWriter, r *http.Request) {
|
||||
username, password := r.FormValue("username"), r.FormValue("password")
|
||||
user, err := auth.FetchUser(username)
|
||||
if err == nil && user.ViaLdap && auth.ldapConfig != nil {
|
||||
err = auth.loginViaLdap(user, password)
|
||||
} else if err == nil && !user.ViaLdap && user.Password != "" {
|
||||
if e := bcrypt.CompareHashAndPassword([]byte(user.Password), []byte(password)); e != nil {
|
||||
err = fmt.Errorf("user '%s' provided the wrong password (%s)", username, e.Error())
|
||||
}
|
||||
} else {
|
||||
err = errors.New("could not authenticate user")
|
||||
}
|
||||
|
||||
if err != nil {
|
||||
log.Warnf("login of user %#v failed: %s", username, err.Error())
|
||||
onfailure(rw, r, err)
|
||||
return
|
||||
}
|
||||
|
||||
session, err := auth.sessionStore.New(r, "session")
|
||||
if err != nil {
|
||||
log.Errorf("session creation failed: %s", err.Error())
|
||||
http.Error(rw, err.Error(), http.StatusInternalServerError)
|
||||
return
|
||||
}
|
||||
|
||||
if auth.SessionMaxAge != 0 {
|
||||
session.Options.MaxAge = int(auth.SessionMaxAge.Seconds())
|
||||
}
|
||||
session.Values["username"] = user.Username
|
||||
session.Values["roles"] = user.Roles
|
||||
if err := auth.sessionStore.Save(r, rw, session); err != nil {
|
||||
log.Errorf("session save failed: %s", err.Error())
|
||||
http.Error(rw, err.Error(), http.StatusInternalServerError)
|
||||
return
|
||||
}
|
||||
|
||||
log.Infof("login successfull: user: %#v (roles: %v)", user.Username, user.Roles)
|
||||
ctx := context.WithValue(r.Context(), ContextUserKey, user)
|
||||
onsuccess.ServeHTTP(rw, r.WithContext(ctx))
|
||||
})
|
||||
}
|
||||
|
||||
// ErrTokenInvalid is the sentinel error value for an invalid token.
var ErrTokenInvalid = errors.New("invalid token")
|
||||
|
||||
func (auth *Authentication) authViaToken(r *http.Request) (*User, error) {
|
||||
if auth.jwtPublicKey == nil {
|
||||
return nil, nil
|
||||
}
|
||||
|
||||
rawtoken := r.Header.Get("X-Auth-Token")
|
||||
if rawtoken == "" {
|
||||
rawtoken = r.Header.Get("Authorization")
|
||||
prefix := "Bearer "
|
||||
if !strings.HasPrefix(rawtoken, prefix) {
|
||||
return nil, nil
|
||||
}
|
||||
rawtoken = rawtoken[len(prefix):]
|
||||
}
|
||||
|
||||
token, err := jwt.Parse(rawtoken, func(t *jwt.Token) (interface{}, error) {
|
||||
if t.Method != jwt.SigningMethodEdDSA {
|
||||
return nil, errors.New("only Ed25519/EdDSA supported")
|
||||
}
|
||||
return auth.jwtPublicKey, nil
|
||||
})
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
if err := token.Claims.Valid(); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
claims := token.Claims.(jwt.MapClaims)
|
||||
sub, _ := claims["sub"].(string)
|
||||
|
||||
var roles []string
|
||||
if rawroles, ok := claims["roles"].([]interface{}); ok {
|
||||
for _, rr := range rawroles {
|
||||
if r, ok := rr.(string); ok {
|
||||
roles = append(roles, r)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// TODO: Check if sub is still a valid user!
|
||||
return &User{
|
||||
Username: sub,
|
||||
Roles: roles,
|
||||
}, nil
|
||||
}
|
||||
|
||||
// Authenticate the user and put a User object in the
|
||||
// context of the request. If authentication fails,
|
||||
// do not continue but send client to the login screen.
|
||||
func (auth *Authentication) Auth(onsuccess http.Handler, onfailure func(rw http.ResponseWriter, r *http.Request, authErr error)) http.Handler {
|
||||
return http.HandlerFunc(func(rw http.ResponseWriter, r *http.Request) {
|
||||
user, err := auth.authViaToken(r)
|
||||
if err != nil {
|
||||
log.Warnf("authentication failed: %s", err.Error())
|
||||
http.Error(rw, err.Error(), http.StatusUnauthorized)
|
||||
return
|
||||
}
|
||||
if user != nil {
|
||||
// Successfull authentication using a token
|
||||
ctx := context.WithValue(r.Context(), ContextUserKey, user)
|
||||
onsuccess.ServeHTTP(rw, r.WithContext(ctx))
|
||||
return
|
||||
}
|
||||
|
||||
session, err := auth.sessionStore.Get(r, "session")
|
||||
if err != nil {
|
||||
// sessionStore.Get will return a new session if no current one is attached to this request.
|
||||
http.Error(rw, err.Error(), http.StatusInternalServerError)
|
||||
return
|
||||
}
|
||||
|
||||
if session.IsNew {
|
||||
log.Warn("authentication failed: no session or jwt found")
|
||||
onfailure(rw, r, errors.New("no valid session or JWT provided"))
|
||||
return
|
||||
}
|
||||
|
||||
username, _ := session.Values["username"].(string)
|
||||
roles, _ := session.Values["roles"].([]string)
|
||||
ctx := context.WithValue(r.Context(), ContextUserKey, &User{
|
||||
Username: username,
|
||||
Roles: roles,
|
||||
})
|
||||
onsuccess.ServeHTTP(rw, r.WithContext(ctx))
|
||||
})
|
||||
}
|
||||
|
||||
// Generate a new JWT that can be used for authentication
|
||||
func (auth *Authentication) ProvideJWT(user *User) (string, error) {
|
||||
if auth.jwtPrivateKey == nil {
|
||||
return "", errors.New("environment variable 'JWT_PRIVATE_KEY' not set")
|
||||
}
|
||||
|
||||
now := time.Now()
|
||||
claims := jwt.MapClaims{
|
||||
"sub": user.Username,
|
||||
"roles": user.Roles,
|
||||
"iat": now.Unix(),
|
||||
}
|
||||
if auth.JwtMaxAge != 0 {
|
||||
claims["exp"] = now.Add(auth.JwtMaxAge).Unix()
|
||||
}
|
||||
|
||||
return jwt.NewWithClaims(jwt.SigningMethodEdDSA, claims).SignedString(auth.jwtPrivateKey)
|
||||
}
|
||||
|
||||
func GetUser(ctx context.Context) *User {
|
||||
x := ctx.Value(ContextUserKey)
|
||||
if x == nil {
|
||||
return nil
|
||||
}
|
||||
|
||||
return x.(*User)
|
||||
}
|
||||
|
||||
// Clears the session cookie
|
||||
func (auth *Authentication) Logout(onsuccess http.Handler) http.Handler {
|
||||
return http.HandlerFunc(func(rw http.ResponseWriter, r *http.Request) {
|
||||
session, err := auth.sessionStore.Get(r, "session")
|
||||
if err != nil {
|
||||
http.Error(rw, err.Error(), http.StatusInternalServerError)
|
||||
return
|
||||
}
|
||||
|
||||
if !session.IsNew {
|
||||
session.Options.MaxAge = -1
|
||||
if err := auth.sessionStore.Save(r, rw, session); err != nil {
|
||||
http.Error(rw, err.Error(), http.StatusInternalServerError)
|
||||
return
|
||||
}
|
||||
}
|
||||
|
||||
onsuccess.ServeHTTP(rw, r)
|
||||
})
|
||||
}
|
160
internal/auth/ldap.go
Normal file
160
internal/auth/ldap.go
Normal file
@@ -0,0 +1,160 @@
|
||||
package auth
|
||||
|
||||
import (
|
||||
"errors"
|
||||
"os"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"github.com/ClusterCockpit/cc-backend/pkg/log"
|
||||
"github.com/go-ldap/ldap/v3"
|
||||
)
|
||||
|
||||
// LdapConfig holds the JSON-configurable LDAP settings.
type LdapConfig struct {
	// Url is the address passed to ldap.DialURL.
	Url string `json:"url"`
	// UserBase is the base DN of the user search run during a sync.
	UserBase string `json:"user_base"`
	// SearchDN is the DN used to bind for the sync; its password is read
	// from the environment variable LDAP_ADMIN_PASSWORD.
	SearchDN string `json:"search_dn"`
	// UserBind is the bind DN used on login; every "{username}" in it is
	// replaced by the name of the user logging in.
	UserBind string `json:"user_bind"`
	// UserFilter is the filter of the user search run during a sync.
	UserFilter string `json:"user_filter"`
	// SyncInterval is parsed using time.ParseDuration. An empty or zero
	// interval disables the periodic sync.
	SyncInterval string `json:"sync_interval"`
	// SyncDelOldUsers is handed to SyncWithLDAP by the periodic sync.
	SyncDelOldUsers bool `json:"sync_del_old_users"`
}
|
||||
|
||||
func (auth *Authentication) initLdap() error {
|
||||
auth.ldapSyncUserPassword = os.Getenv("LDAP_ADMIN_PASSWORD")
|
||||
if auth.ldapSyncUserPassword == "" {
|
||||
log.Warn("environment variable 'LDAP_ADMIN_PASSWORD' not set (ldap sync or authentication will not work)")
|
||||
}
|
||||
|
||||
if auth.ldapConfig.SyncInterval != "" {
|
||||
interval, err := time.ParseDuration(auth.ldapConfig.SyncInterval)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
if interval == 0 {
|
||||
return nil
|
||||
}
|
||||
|
||||
go func() {
|
||||
ticker := time.NewTicker(interval)
|
||||
for t := range ticker.C {
|
||||
log.Printf("LDAP sync started at %s", t.Format(time.RFC3339))
|
||||
if err := auth.SyncWithLDAP(auth.ldapConfig.SyncDelOldUsers); err != nil {
|
||||
log.Errorf("LDAP sync failed: %s", err.Error())
|
||||
}
|
||||
log.Print("LDAP sync done")
|
||||
}
|
||||
}()
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
// TODO: Add a connection pool or something like
|
||||
// that so that connections can be reused/cached.
|
||||
func (auth *Authentication) getLdapConnection(admin bool) (*ldap.Conn, error) {
|
||||
conn, err := ldap.DialURL(auth.ldapConfig.Url)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
if admin {
|
||||
if err := conn.Bind(auth.ldapConfig.SearchDN, auth.ldapSyncUserPassword); err != nil {
|
||||
conn.Close()
|
||||
return nil, err
|
||||
}
|
||||
}
|
||||
|
||||
return conn, nil
|
||||
}
|
||||
|
||||
func (auth *Authentication) loginViaLdap(user *User, password string) error {
|
||||
l, err := auth.getLdapConnection(false)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
defer l.Close()
|
||||
|
||||
userDn := strings.Replace(auth.ldapConfig.UserBind, "{username}", user.Username, -1)
|
||||
if err := l.Bind(userDn, password); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
user.ViaLdap = true
|
||||
return nil
|
||||
}
|
||||
|
||||
// Delete users where user.ldap is 1 and that do not show up in the ldap search results.
|
||||
// Add users to the users table that are new in the ldap search results.
|
||||
func (auth *Authentication) SyncWithLDAP(deleteOldUsers bool) error {
|
||||
if auth.ldapConfig == nil {
|
||||
return errors.New("ldap not enabled")
|
||||
}
|
||||
|
||||
const IN_DB int = 1
|
||||
const IN_LDAP int = 2
|
||||
const IN_BOTH int = 3
|
||||
|
||||
users := map[string]int{}
|
||||
rows, err := auth.db.Query(`SELECT username FROM user WHERE user.ldap = 1`)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
for rows.Next() {
|
||||
var username string
|
||||
if err := rows.Scan(&username); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
users[username] = IN_DB
|
||||
}
|
||||
|
||||
l, err := auth.getLdapConnection(true)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
defer l.Close()
|
||||
|
||||
ldapResults, err := l.Search(ldap.NewSearchRequest(
|
||||
auth.ldapConfig.UserBase, ldap.ScopeWholeSubtree, ldap.NeverDerefAliases, 0, 0, false,
|
||||
auth.ldapConfig.UserFilter, []string{"dn", "uid", "gecos"}, nil))
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
newnames := map[string]string{}
|
||||
for _, entry := range ldapResults.Entries {
|
||||
username := entry.GetAttributeValue("uid")
|
||||
if username == "" {
|
||||
return errors.New("no attribute 'uid'")
|
||||
}
|
||||
|
||||
_, ok := users[username]
|
||||
if !ok {
|
||||
users[username] = IN_LDAP
|
||||
newnames[username] = entry.GetAttributeValue("gecos")
|
||||
} else {
|
||||
users[username] = IN_BOTH
|
||||
}
|
||||
}
|
||||
|
||||
for username, where := range users {
|
||||
if where == IN_DB && deleteOldUsers {
|
||||
log.Infof("ldap-sync: remove %#v (does not show up in LDAP anymore)", username)
|
||||
if _, err := auth.db.Exec(`DELETE FROM user WHERE user.username = ?`, username); err != nil {
|
||||
return err
|
||||
}
|
||||
} else if where == IN_LDAP {
|
||||
name := newnames[username]
|
||||
log.Infof("ldap-sync: add %#v (name: %#v, roles: [user], ldap: true)", username, name)
|
||||
if _, err := auth.db.Exec(`INSERT INTO user (username, ldap, name, roles) VALUES (?, ?, ?, ?)`,
|
||||
username, 1, name, "[\""+RoleUser+"\"]"); err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
298
internal/config/config.go
Normal file
298
internal/config/config.go
Normal file
@@ -0,0 +1,298 @@
|
||||
package config
|
||||
|
||||
import (
|
||||
"context"
|
||||
"encoding/json"
|
||||
"errors"
|
||||
"fmt"
|
||||
"net/http"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
"github.com/ClusterCockpit/cc-backend/internal/auth"
|
||||
"github.com/ClusterCockpit/cc-backend/internal/graph/model"
|
||||
"github.com/ClusterCockpit/cc-backend/pkg/schema"
|
||||
"github.com/iamlouk/lrucache"
|
||||
"github.com/jmoiron/sqlx"
|
||||
)
|
||||
|
||||
var db *sqlx.DB
|
||||
var lookupConfigStmt *sqlx.Stmt
|
||||
|
||||
var lock sync.RWMutex
|
||||
var uiDefaults map[string]interface{}
|
||||
|
||||
var cache *lrucache.Cache = lrucache.New(1024)
|
||||
|
||||
var Clusters []*model.Cluster
|
||||
var nodeLists map[string]map[string]NodeList
|
||||
|
||||
func Init(usersdb *sqlx.DB, authEnabled bool, uiConfig map[string]interface{}, jobArchive string) error {
|
||||
db = usersdb
|
||||
uiDefaults = uiConfig
|
||||
entries, err := os.ReadDir(jobArchive)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
Clusters = []*model.Cluster{}
|
||||
nodeLists = map[string]map[string]NodeList{}
|
||||
for _, de := range entries {
|
||||
raw, err := os.ReadFile(filepath.Join(jobArchive, de.Name(), "cluster.json"))
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
var cluster model.Cluster
|
||||
|
||||
// Disabled because of the historic 'measurement' field.
|
||||
// dec := json.NewDecoder(bytes.NewBuffer(raw))
|
||||
// dec.DisallowUnknownFields()
|
||||
// if err := dec.Decode(&cluster); err != nil {
|
||||
// return err
|
||||
// }
|
||||
|
||||
if err := json.Unmarshal(raw, &cluster); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
if len(cluster.Name) == 0 || len(cluster.MetricConfig) == 0 || len(cluster.SubClusters) == 0 {
|
||||
return errors.New("cluster.name, cluster.metricConfig and cluster.SubClusters should not be empty")
|
||||
}
|
||||
|
||||
for _, mc := range cluster.MetricConfig {
|
||||
if len(mc.Name) == 0 {
|
||||
return errors.New("cluster.metricConfig.name should not be empty")
|
||||
}
|
||||
if mc.Timestep < 1 {
|
||||
return errors.New("cluster.metricConfig.timestep should not be smaller than one")
|
||||
}
|
||||
|
||||
// For backwards compability...
|
||||
if mc.Scope == "" {
|
||||
mc.Scope = schema.MetricScopeNode
|
||||
}
|
||||
if !mc.Scope.Valid() {
|
||||
return errors.New("cluster.metricConfig.scope must be a valid scope ('node', 'scocket', ...)")
|
||||
}
|
||||
}
|
||||
|
||||
if cluster.FilterRanges.StartTime.To.IsZero() {
|
||||
cluster.FilterRanges.StartTime.To = time.Unix(0, 0)
|
||||
}
|
||||
|
||||
if cluster.Name != de.Name() {
|
||||
return fmt.Errorf("the file '.../%s/cluster.json' contains the clusterId '%s'", de.Name(), cluster.Name)
|
||||
}
|
||||
|
||||
Clusters = append(Clusters, &cluster)
|
||||
|
||||
nodeLists[cluster.Name] = make(map[string]NodeList)
|
||||
for _, sc := range cluster.SubClusters {
|
||||
if sc.Nodes == "" {
|
||||
continue
|
||||
}
|
||||
|
||||
nl, err := ParseNodeList(sc.Nodes)
|
||||
if err != nil {
|
||||
return fmt.Errorf("in %s/cluster.json: %w", cluster.Name, err)
|
||||
}
|
||||
nodeLists[cluster.Name][sc.Name] = nl
|
||||
}
|
||||
}
|
||||
|
||||
if authEnabled {
|
||||
_, err := db.Exec(`
|
||||
CREATE TABLE IF NOT EXISTS configuration (
|
||||
username varchar(255),
|
||||
confkey varchar(255),
|
||||
value varchar(255),
|
||||
PRIMARY KEY (username, confkey),
|
||||
FOREIGN KEY (username) REFERENCES user (username) ON DELETE CASCADE ON UPDATE NO ACTION);`)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
lookupConfigStmt, err = db.Preparex(`SELECT confkey, value FROM configuration WHERE configuration.username = ?`)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
// Return the personalised UI config for the currently authenticated
|
||||
// user or return the plain default config.
|
||||
func GetUIConfig(r *http.Request) (map[string]interface{}, error) {
|
||||
user := auth.GetUser(r.Context())
|
||||
if user == nil {
|
||||
lock.RLock()
|
||||
copy := make(map[string]interface{}, len(uiDefaults))
|
||||
for k, v := range uiDefaults {
|
||||
copy[k] = v
|
||||
}
|
||||
lock.RUnlock()
|
||||
return copy, nil
|
||||
}
|
||||
|
||||
data := cache.Get(user.Username, func() (interface{}, time.Duration, int) {
|
||||
config := make(map[string]interface{}, len(uiDefaults))
|
||||
for k, v := range uiDefaults {
|
||||
config[k] = v
|
||||
}
|
||||
|
||||
rows, err := lookupConfigStmt.Query(user.Username)
|
||||
if err != nil {
|
||||
return err, 0, 0
|
||||
}
|
||||
|
||||
size := 0
|
||||
defer rows.Close()
|
||||
for rows.Next() {
|
||||
var key, rawval string
|
||||
if err := rows.Scan(&key, &rawval); err != nil {
|
||||
return err, 0, 0
|
||||
}
|
||||
|
||||
var val interface{}
|
||||
if err := json.Unmarshal([]byte(rawval), &val); err != nil {
|
||||
return err, 0, 0
|
||||
}
|
||||
|
||||
size += len(key)
|
||||
size += len(rawval)
|
||||
config[key] = val
|
||||
}
|
||||
|
||||
return config, 24 * time.Hour, size
|
||||
})
|
||||
if err, ok := data.(error); ok {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
return data.(map[string]interface{}), nil
|
||||
}
|
||||
|
||||
// If the context does not have a user, update the global ui configuration without persisting it!
|
||||
// If there is a (authenticated) user, update only his configuration.
|
||||
func UpdateConfig(key, value string, ctx context.Context) error {
|
||||
user := auth.GetUser(ctx)
|
||||
if user == nil {
|
||||
var val interface{}
|
||||
if err := json.Unmarshal([]byte(value), &val); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
lock.Lock()
|
||||
defer lock.Unlock()
|
||||
uiDefaults[key] = val
|
||||
return nil
|
||||
}
|
||||
|
||||
// Disabled because now `plot_list_selectedMetrics:<cluster>` is possible.
|
||||
// if _, ok := uiDefaults[key]; !ok {
|
||||
// return errors.New("this configuration key does not exist")
|
||||
// }
|
||||
|
||||
if _, err := db.Exec(`REPLACE INTO configuration (username, confkey, value) VALUES (?, ?, ?)`,
|
||||
user.Username, key, value); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
cache.Del(user.Username)
|
||||
return nil
|
||||
}
|
||||
|
||||
func GetCluster(cluster string) *model.Cluster {
|
||||
for _, c := range Clusters {
|
||||
if c.Name == cluster {
|
||||
return c
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func GetSubCluster(cluster, subcluster string) *model.SubCluster {
|
||||
for _, c := range Clusters {
|
||||
if c.Name == cluster {
|
||||
for _, p := range c.SubClusters {
|
||||
if p.Name == subcluster {
|
||||
return p
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func GetMetricConfig(cluster, metric string) *model.MetricConfig {
|
||||
for _, c := range Clusters {
|
||||
if c.Name == cluster {
|
||||
for _, m := range c.MetricConfig {
|
||||
if m.Name == metric {
|
||||
return m
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// AssignSubCluster sets the `job.subcluster` property of the job based
|
||||
// on its cluster and resources.
|
||||
func AssignSubCluster(job *schema.BaseJob) error {
|
||||
cluster := GetCluster(job.Cluster)
|
||||
if cluster == nil {
|
||||
return fmt.Errorf("unkown cluster: %#v", job.Cluster)
|
||||
}
|
||||
|
||||
if job.SubCluster != "" {
|
||||
for _, sc := range cluster.SubClusters {
|
||||
if sc.Name == job.SubCluster {
|
||||
return nil
|
||||
}
|
||||
}
|
||||
return fmt.Errorf("already assigned subcluster %#v unkown (cluster: %#v)", job.SubCluster, job.Cluster)
|
||||
}
|
||||
|
||||
if len(job.Resources) == 0 {
|
||||
return fmt.Errorf("job without any resources/hosts")
|
||||
}
|
||||
|
||||
host0 := job.Resources[0].Hostname
|
||||
for sc, nl := range nodeLists[job.Cluster] {
|
||||
if nl != nil && nl.Contains(host0) {
|
||||
job.SubCluster = sc
|
||||
return nil
|
||||
}
|
||||
}
|
||||
|
||||
if cluster.SubClusters[0].Nodes == "" {
|
||||
job.SubCluster = cluster.SubClusters[0].Name
|
||||
return nil
|
||||
}
|
||||
|
||||
return fmt.Errorf("no subcluster found for cluster %#v and host %#v", job.Cluster, host0)
|
||||
}
|
||||
|
||||
func GetSubClusterByNode(cluster, hostname string) (string, error) {
|
||||
for sc, nl := range nodeLists[cluster] {
|
||||
if nl != nil && nl.Contains(hostname) {
|
||||
return sc, nil
|
||||
}
|
||||
}
|
||||
|
||||
c := GetCluster(cluster)
|
||||
if c == nil {
|
||||
return "", fmt.Errorf("unkown cluster: %#v", cluster)
|
||||
}
|
||||
|
||||
if c.SubClusters[0].Nodes == "" {
|
||||
return c.SubClusters[0].Name, nil
|
||||
}
|
||||
|
||||
return "", fmt.Errorf("no subcluster found for cluster %#v and host %#v", cluster, hostname)
|
||||
}
|
171
internal/config/nodelist.go
Normal file
171
internal/config/nodelist.go
Normal file
@@ -0,0 +1,171 @@
|
||||
package config
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"strconv"
|
||||
"strings"
|
||||
|
||||
"github.com/ClusterCockpit/cc-backend/pkg/log"
|
||||
)
|
||||
|
||||
// NodeList is a list of terms. Each term is a sequence of expressions that
// consume a host name piece by piece from the left.
type NodeList [][]interface {
	consume(input string) (next string, ok bool)
}

// Contains reports whether name is matched by one of the terms of the list,
// i.e. whether the expressions of some term consume name completely, one
// after the other.
func (nl *NodeList) Contains(name string) bool {
	matched := false
	for _, term := range *nl {
		rest := name
		for _, expr := range term {
			if rest, matched = expr.consume(rest); !matched {
				break
			}
		}

		// A term only matches if nothing of the name is left over.
		if matched && rest == "" {
			return true
		}
	}

	return false
}
|
||||
|
||||
// NLExprString is a node list expression that matches a literal string.
type NLExprString string

// consume strips the literal from the front of input. It reports false (and
// an empty remainder) if input does not start with the literal.
func (nle NLExprString) consume(input string) (next string, ok bool) {
	literal := string(nle)
	if !strings.HasPrefix(input, literal) {
		return "", false
	}
	return input[len(literal):], true
}
|
||||
|
||||
type NLExprIntRanges []NLExprIntRange
|
||||
|
||||
func (nles NLExprIntRanges) consume(input string) (next string, ok bool) {
|
||||
for _, nle := range nles {
|
||||
if next, ok := nle.consume(input); ok {
|
||||
return next, ok
|
||||
}
|
||||
}
|
||||
return "", false
|
||||
}
|
||||
|
||||
type NLExprIntRange struct {
|
||||
start, end int64
|
||||
zeroPadded bool
|
||||
digits int
|
||||
}
|
||||
|
||||
func (nle NLExprIntRange) consume(input string) (next string, ok bool) {
|
||||
if !nle.zeroPadded || nle.digits < 1 {
|
||||
log.Error("node list: only zero-padded ranges are allowed")
|
||||
return "", false
|
||||
}
|
||||
|
||||
if len(input) < nle.digits {
|
||||
return "", false
|
||||
}
|
||||
|
||||
numerals, rest := input[:nle.digits], input[nle.digits:]
|
||||
for len(numerals) > 1 && numerals[0] == '0' {
|
||||
numerals = numerals[1:]
|
||||
}
|
||||
|
||||
x, err := strconv.ParseInt(numerals, 10, 32)
|
||||
if err != nil {
|
||||
return "", false
|
||||
}
|
||||
|
||||
if nle.start <= x && x <= nle.end {
|
||||
return rest, true
|
||||
}
|
||||
|
||||
return "", false
|
||||
}
|
||||
|
||||
func ParseNodeList(raw string) (NodeList, error) {
|
||||
isLetter := func(r byte) bool { return ('a' <= r && r <= 'z') || ('A' <= r && r <= 'Z') }
|
||||
isDigit := func(r byte) bool { return '0' <= r && r <= '9' }
|
||||
|
||||
rawterms := []string{}
|
||||
prevterm := 0
|
||||
for i := 0; i < len(raw); i++ {
|
||||
if raw[i] == '[' {
|
||||
for i < len(raw) && raw[i] != ']' {
|
||||
i++
|
||||
}
|
||||
if i == len(raw) {
|
||||
return nil, fmt.Errorf("node list: unclosed '['")
|
||||
}
|
||||
} else if raw[i] == ',' {
|
||||
rawterms = append(rawterms, raw[prevterm:i])
|
||||
prevterm = i + 1
|
||||
}
|
||||
}
|
||||
if prevterm != len(raw) {
|
||||
rawterms = append(rawterms, raw[prevterm:])
|
||||
}
|
||||
|
||||
nl := NodeList{}
|
||||
for _, rawterm := range rawterms {
|
||||
exprs := []interface {
|
||||
consume(input string) (next string, ok bool)
|
||||
}{}
|
||||
for i := 0; i < len(rawterm); i++ {
|
||||
c := rawterm[i]
|
||||
if isLetter(c) || isDigit(c) {
|
||||
j := i
|
||||
for j < len(rawterm) && (isLetter(rawterm[j]) || isDigit(rawterm[j])) {
|
||||
j++
|
||||
}
|
||||
exprs = append(exprs, NLExprString(rawterm[i:j]))
|
||||
i = j - 1
|
||||
} else if c == '[' {
|
||||
end := strings.Index(rawterm[i:], "]")
|
||||
if end == -1 {
|
||||
return nil, fmt.Errorf("node list: unclosed '['")
|
||||
}
|
||||
|
||||
parts := strings.Split(rawterm[i+1:i+end], ",")
|
||||
nles := NLExprIntRanges{}
|
||||
for _, part := range parts {
|
||||
minus := strings.Index(part, "-")
|
||||
if minus == -1 {
|
||||
return nil, fmt.Errorf("node list: no '-' found inside '[...]'")
|
||||
}
|
||||
|
||||
s1, s2 := part[0:minus], part[minus+1:]
|
||||
if len(s1) != len(s2) || len(s1) == 0 {
|
||||
return nil, fmt.Errorf("node list: %#v and %#v are not of equal length or of length zero", s1, s2)
|
||||
}
|
||||
|
||||
x1, err := strconv.ParseInt(s1, 10, 32)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("node list: %w", err)
|
||||
}
|
||||
x2, err := strconv.ParseInt(s2, 10, 32)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("node list: %w", err)
|
||||
}
|
||||
|
||||
nles = append(nles, NLExprIntRange{
|
||||
start: x1,
|
||||
end: x2,
|
||||
digits: len(s1),
|
||||
zeroPadded: true,
|
||||
})
|
||||
}
|
||||
|
||||
exprs = append(exprs, nles)
|
||||
i += end
|
||||
} else {
|
||||
return nil, fmt.Errorf("node list: invalid character: %#v", rune(c))
|
||||
}
|
||||
}
|
||||
nl = append(nl, exprs)
|
||||
}
|
||||
|
||||
return nl, nil
|
||||
}
|
55
internal/config/nodelist_test.go
Normal file
55
internal/config/nodelist_test.go
Normal file
@@ -0,0 +1,55 @@
|
||||
package config
|
||||
|
||||
import (
|
||||
"testing"
|
||||
)
|
||||
|
||||
func TestNodeList(t *testing.T) {
|
||||
nl, err := ParseNodeList("hallo,wel123t,emmy[01-99],fritz[005-500],woody[100-200]")
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
if nl.Contains("hello") || nl.Contains("woody") {
|
||||
t.Fail()
|
||||
}
|
||||
|
||||
if nl.Contains("fritz1") || nl.Contains("fritz9") || nl.Contains("fritz004") || nl.Contains("woody201") {
|
||||
t.Fail()
|
||||
}
|
||||
|
||||
if !nl.Contains("hallo") || !nl.Contains("wel123t") {
|
||||
t.Fail()
|
||||
}
|
||||
|
||||
if !nl.Contains("emmy01") || !nl.Contains("emmy42") || !nl.Contains("emmy99") {
|
||||
t.Fail()
|
||||
}
|
||||
|
||||
if !nl.Contains("woody100") || !nl.Contains("woody199") {
|
||||
t.Fail()
|
||||
}
|
||||
}
|
||||
|
||||
func TestNodeListCommasInBrackets(t *testing.T) {
|
||||
nl, err := ParseNodeList("a[1000-2000,2010-2090,3000-5000]")
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
if nl.Contains("hello") || nl.Contains("woody") {
|
||||
t.Fatal("1")
|
||||
}
|
||||
|
||||
if nl.Contains("a0") || nl.Contains("a0000") || nl.Contains("a5001") || nl.Contains("a2005") {
|
||||
t.Fatal("2")
|
||||
}
|
||||
|
||||
if !nl.Contains("a1001") || !nl.Contains("a2000") {
|
||||
t.Fatal("3")
|
||||
}
|
||||
|
||||
if !nl.Contains("a2042") || !nl.Contains("a4321") || !nl.Contains("a3000") {
|
||||
t.Fatal("4")
|
||||
}
|
||||
}
|
13264
internal/graph/generated/generated.go
Normal file
13264
internal/graph/generated/generated.go
Normal file
File diff suppressed because it is too large
Load Diff
122
internal/graph/model/models.go
Normal file
122
internal/graph/model/models.go
Normal file
@@ -0,0 +1,122 @@
|
||||
package model
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"strconv"
|
||||
)
|
||||
|
||||
type Cluster struct {
|
||||
Name string `json:"name"`
|
||||
MetricConfig []*MetricConfig `json:"metricConfig"`
|
||||
FilterRanges *FilterRanges `json:"filterRanges"`
|
||||
SubClusters []*SubCluster `json:"subClusters"`
|
||||
|
||||
// NOT part of the GraphQL API. This has to be a JSON object with a field `"kind"`.
|
||||
// All other fields depend on that kind (e.g. "cc-metric-store", "influxdb-v2").
|
||||
MetricDataRepository json.RawMessage `json:"metricDataRepository"`
|
||||
}
|
||||
|
||||
// Return a list of socket IDs given a list of hwthread IDs.
|
||||
// Even if just one hwthread is in that socket, add it to the list.
|
||||
// If no hwthreads other than those in the argument list are assigned to
|
||||
// one of the sockets in the first return value, return true as the second value.
|
||||
// TODO: Optimize this, there must be a more efficient way/algorithm.
|
||||
func (topo *Topology) GetSocketsFromHWThreads(hwthreads []int) (sockets []int, exclusive bool) {
|
||||
socketsMap := map[int]int{}
|
||||
for _, hwthread := range hwthreads {
|
||||
for socket, hwthreadsInSocket := range topo.Socket {
|
||||
for _, hwthreadInSocket := range hwthreadsInSocket {
|
||||
if hwthread == hwthreadInSocket {
|
||||
socketsMap[socket] += 1
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
exclusive = true
|
||||
hwthreadsPerSocket := len(topo.Node) / len(topo.Socket)
|
||||
sockets = make([]int, 0, len(socketsMap))
|
||||
for socket, count := range socketsMap {
|
||||
sockets = append(sockets, socket)
|
||||
exclusive = exclusive && count == hwthreadsPerSocket
|
||||
}
|
||||
|
||||
return sockets, exclusive
|
||||
}
|
||||
|
||||
// Return a list of core IDs given a list of hwthread IDs.
|
||||
// Even if just one hwthread is in that core, add it to the list.
|
||||
// If no hwthreads other than those in the argument list are assigned to
|
||||
// one of the cores in the first return value, return true as the second value.
|
||||
// TODO: Optimize this, there must be a more efficient way/algorithm.
|
||||
func (topo *Topology) GetCoresFromHWThreads(hwthreads []int) (cores []int, exclusive bool) {
|
||||
coresMap := map[int]int{}
|
||||
for _, hwthread := range hwthreads {
|
||||
for core, hwthreadsInCore := range topo.Core {
|
||||
for _, hwthreadInCore := range hwthreadsInCore {
|
||||
if hwthread == hwthreadInCore {
|
||||
coresMap[core] += 1
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
exclusive = true
|
||||
hwthreadsPerCore := len(topo.Node) / len(topo.Core)
|
||||
cores = make([]int, 0, len(coresMap))
|
||||
for core, count := range coresMap {
|
||||
cores = append(cores, core)
|
||||
exclusive = exclusive && count == hwthreadsPerCore
|
||||
}
|
||||
|
||||
return cores, exclusive
|
||||
}
|
||||
|
||||
// Return a list of memory domain IDs given a list of hwthread IDs.
|
||||
// Even if just one hwthread is in that memory domain, add it to the list.
|
||||
// If no hwthreads other than those in the argument list are assigned to
|
||||
// one of the memory domains in the first return value, return true as the second value.
|
||||
// TODO: Optimize this, there must be a more efficient way/algorithm.
|
||||
func (topo *Topology) GetMemoryDomainsFromHWThreads(hwthreads []int) (memDoms []int, exclusive bool) {
|
||||
memDomsMap := map[int]int{}
|
||||
for _, hwthread := range hwthreads {
|
||||
for memDom, hwthreadsInmemDom := range topo.MemoryDomain {
|
||||
for _, hwthreadInmemDom := range hwthreadsInmemDom {
|
||||
if hwthread == hwthreadInmemDom {
|
||||
memDomsMap[memDom] += 1
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
exclusive = true
|
||||
hwthreadsPermemDom := len(topo.Node) / len(topo.MemoryDomain)
|
||||
memDoms = make([]int, 0, len(memDomsMap))
|
||||
for memDom, count := range memDomsMap {
|
||||
memDoms = append(memDoms, memDom)
|
||||
exclusive = exclusive && count == hwthreadsPermemDom
|
||||
}
|
||||
|
||||
return memDoms, exclusive
|
||||
}
|
||||
|
||||
func (topo *Topology) GetAcceleratorIDs() ([]int, error) {
|
||||
accels := make([]int, 0)
|
||||
for _, accel := range topo.Accelerators {
|
||||
id, err := strconv.Atoi(accel.ID)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
accels = append(accels, id)
|
||||
}
|
||||
return accels, nil
|
||||
}
|
||||
|
||||
func (topo *Topology) GetAcceleratorIndex(id string) (int, bool) {
|
||||
for idx, accel := range topo.Accelerators {
|
||||
if accel.ID == id {
|
||||
return idx, true
|
||||
}
|
||||
}
|
||||
return -1, false
|
||||
}
|
310
internal/graph/model/models_gen.go
Normal file
310
internal/graph/model/models_gen.go
Normal file
@@ -0,0 +1,310 @@
|
||||
// Code generated by github.com/99designs/gqlgen, DO NOT EDIT.
|
||||
|
||||
package model
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"io"
|
||||
"strconv"
|
||||
"time"
|
||||
|
||||
"github.com/ClusterCockpit/cc-backend/pkg/schema"
|
||||
)
|
||||
|
||||
type Accelerator struct {
|
||||
ID string `json:"id"`
|
||||
Type string `json:"type"`
|
||||
Model string `json:"model"`
|
||||
}
|
||||
|
||||
type Count struct {
|
||||
Name string `json:"name"`
|
||||
Count int `json:"count"`
|
||||
}
|
||||
|
||||
type FilterRanges struct {
|
||||
Duration *IntRangeOutput `json:"duration"`
|
||||
NumNodes *IntRangeOutput `json:"numNodes"`
|
||||
StartTime *TimeRangeOutput `json:"startTime"`
|
||||
}
|
||||
|
||||
type FloatRange struct {
|
||||
From float64 `json:"from"`
|
||||
To float64 `json:"to"`
|
||||
}
|
||||
|
||||
type Footprints struct {
|
||||
Nodehours []schema.Float `json:"nodehours"`
|
||||
Metrics []*MetricFootprints `json:"metrics"`
|
||||
}
|
||||
|
||||
type HistoPoint struct {
|
||||
Count int `json:"count"`
|
||||
Value int `json:"value"`
|
||||
}
|
||||
|
||||
type IntRange struct {
|
||||
From int `json:"from"`
|
||||
To int `json:"to"`
|
||||
}
|
||||
|
||||
type IntRangeOutput struct {
|
||||
From int `json:"from"`
|
||||
To int `json:"to"`
|
||||
}
|
||||
|
||||
type JobFilter struct {
|
||||
Tags []string `json:"tags"`
|
||||
JobID *StringInput `json:"jobId"`
|
||||
ArrayJobID *int `json:"arrayJobId"`
|
||||
User *StringInput `json:"user"`
|
||||
Project *StringInput `json:"project"`
|
||||
Cluster *StringInput `json:"cluster"`
|
||||
Partition *StringInput `json:"partition"`
|
||||
Duration *IntRange `json:"duration"`
|
||||
MinRunningFor *int `json:"minRunningFor"`
|
||||
NumNodes *IntRange `json:"numNodes"`
|
||||
NumAccelerators *IntRange `json:"numAccelerators"`
|
||||
NumHWThreads *IntRange `json:"numHWThreads"`
|
||||
StartTime *TimeRange `json:"startTime"`
|
||||
State []schema.JobState `json:"state"`
|
||||
FlopsAnyAvg *FloatRange `json:"flopsAnyAvg"`
|
||||
MemBwAvg *FloatRange `json:"memBwAvg"`
|
||||
LoadAvg *FloatRange `json:"loadAvg"`
|
||||
MemUsedMax *FloatRange `json:"memUsedMax"`
|
||||
}
|
||||
|
||||
type JobMetricWithName struct {
|
||||
Name string `json:"name"`
|
||||
Metric *schema.JobMetric `json:"metric"`
|
||||
}
|
||||
|
||||
type JobResultList struct {
|
||||
Items []*schema.Job `json:"items"`
|
||||
Offset *int `json:"offset"`
|
||||
Limit *int `json:"limit"`
|
||||
Count *int `json:"count"`
|
||||
}
|
||||
|
||||
type JobsStatistics struct {
|
||||
ID string `json:"id"`
|
||||
TotalJobs int `json:"totalJobs"`
|
||||
ShortJobs int `json:"shortJobs"`
|
||||
TotalWalltime int `json:"totalWalltime"`
|
||||
TotalCoreHours int `json:"totalCoreHours"`
|
||||
HistDuration []*HistoPoint `json:"histDuration"`
|
||||
HistNumNodes []*HistoPoint `json:"histNumNodes"`
|
||||
}
|
||||
|
||||
type MetricConfig struct {
|
||||
Name string `json:"name"`
|
||||
Unit string `json:"unit"`
|
||||
Scope schema.MetricScope `json:"scope"`
|
||||
Aggregation *string `json:"aggregation"`
|
||||
Timestep int `json:"timestep"`
|
||||
Peak *float64 `json:"peak"`
|
||||
Normal *float64 `json:"normal"`
|
||||
Caution *float64 `json:"caution"`
|
||||
Alert *float64 `json:"alert"`
|
||||
SubClusters []*SubClusterConfig `json:"subClusters"`
|
||||
}
|
||||
|
||||
type MetricFootprints struct {
|
||||
Metric string `json:"metric"`
|
||||
Data []schema.Float `json:"data"`
|
||||
}
|
||||
|
||||
type NodeMetrics struct {
|
||||
Host string `json:"host"`
|
||||
SubCluster string `json:"subCluster"`
|
||||
Metrics []*JobMetricWithName `json:"metrics"`
|
||||
}
|
||||
|
||||
type OrderByInput struct {
|
||||
Field string `json:"field"`
|
||||
Order SortDirectionEnum `json:"order"`
|
||||
}
|
||||
|
||||
type PageRequest struct {
|
||||
ItemsPerPage int `json:"itemsPerPage"`
|
||||
Page int `json:"page"`
|
||||
}
|
||||
|
||||
type StringInput struct {
|
||||
Eq *string `json:"eq"`
|
||||
Contains *string `json:"contains"`
|
||||
StartsWith *string `json:"startsWith"`
|
||||
EndsWith *string `json:"endsWith"`
|
||||
}
|
||||
|
||||
type SubCluster struct {
|
||||
Name string `json:"name"`
|
||||
Nodes string `json:"nodes"`
|
||||
NumberOfNodes int `json:"numberOfNodes"`
|
||||
ProcessorType string `json:"processorType"`
|
||||
SocketsPerNode int `json:"socketsPerNode"`
|
||||
CoresPerSocket int `json:"coresPerSocket"`
|
||||
ThreadsPerCore int `json:"threadsPerCore"`
|
||||
FlopRateScalar int `json:"flopRateScalar"`
|
||||
FlopRateSimd int `json:"flopRateSimd"`
|
||||
MemoryBandwidth int `json:"memoryBandwidth"`
|
||||
Topology *Topology `json:"topology"`
|
||||
}
|
||||
|
||||
type SubClusterConfig struct {
|
||||
Name string `json:"name"`
|
||||
Peak float64 `json:"peak"`
|
||||
Normal float64 `json:"normal"`
|
||||
Caution float64 `json:"caution"`
|
||||
Alert float64 `json:"alert"`
|
||||
}
|
||||
|
||||
type TimeRange struct {
|
||||
From *time.Time `json:"from"`
|
||||
To *time.Time `json:"to"`
|
||||
}
|
||||
|
||||
type TimeRangeOutput struct {
|
||||
From time.Time `json:"from"`
|
||||
To time.Time `json:"to"`
|
||||
}
|
||||
|
||||
type Topology struct {
|
||||
Node []int `json:"node"`
|
||||
Socket [][]int `json:"socket"`
|
||||
MemoryDomain [][]int `json:"memoryDomain"`
|
||||
Die [][]int `json:"die"`
|
||||
Core [][]int `json:"core"`
|
||||
Accelerators []*Accelerator `json:"accelerators"`
|
||||
}
|
||||
|
||||
type User struct {
|
||||
Username string `json:"username"`
|
||||
Name string `json:"name"`
|
||||
Email string `json:"email"`
|
||||
}
|
||||
|
||||
// Aggregate is the GraphQL enum naming the job attribute to group by.
type Aggregate string

// The valid Aggregate values.
const (
	AggregateUser    Aggregate = "USER"
	AggregateProject Aggregate = "PROJECT"
	AggregateCluster Aggregate = "CLUSTER"
)

// AllAggregate lists every valid Aggregate value.
var AllAggregate = []Aggregate{
	AggregateUser,
	AggregateProject,
	AggregateCluster,
}

// IsValid reports whether e is one of the declared Aggregate constants.
func (e Aggregate) IsValid() bool {
	return e == AggregateUser || e == AggregateProject || e == AggregateCluster
}

// String returns the enum value as a plain string.
func (e Aggregate) String() string {
	return string(e)
}

// UnmarshalGQL stores v in e. It fails if v is not a string or not a valid
// Aggregate; in the latter case e has already been overwritten with the
// rejected value.
func (e *Aggregate) UnmarshalGQL(v interface{}) error {
	raw, isString := v.(string)
	if !isString {
		return fmt.Errorf("enums must be strings")
	}

	*e = Aggregate(raw)
	if e.IsValid() {
		return nil
	}
	return fmt.Errorf("%s is not a valid Aggregate", raw)
}

// MarshalGQL writes the enum value to w as a quoted string.
func (e Aggregate) MarshalGQL(w io.Writer) {
	quoted := strconv.Quote(e.String())
	fmt.Fprint(w, quoted)
}
|
||||
|
||||
// SortDirectionEnum is the GraphQL enum for the direction of an ordering.
type SortDirectionEnum string

// The valid SortDirectionEnum values.
const (
	SortDirectionEnumDesc SortDirectionEnum = "DESC"
	SortDirectionEnumAsc  SortDirectionEnum = "ASC"
)

// AllSortDirectionEnum lists every valid SortDirectionEnum value.
var AllSortDirectionEnum = []SortDirectionEnum{
	SortDirectionEnumDesc,
	SortDirectionEnumAsc,
}

// IsValid reports whether e is one of the declared SortDirectionEnum constants.
func (e SortDirectionEnum) IsValid() bool {
	return e == SortDirectionEnumDesc || e == SortDirectionEnumAsc
}

// String returns the enum value as a plain string.
func (e SortDirectionEnum) String() string {
	return string(e)
}

// UnmarshalGQL stores v in e. It fails if v is not a string or not a valid
// SortDirectionEnum; in the latter case e has already been overwritten with
// the rejected value.
func (e *SortDirectionEnum) UnmarshalGQL(v interface{}) error {
	raw, isString := v.(string)
	if !isString {
		return fmt.Errorf("enums must be strings")
	}

	*e = SortDirectionEnum(raw)
	if e.IsValid() {
		return nil
	}
	return fmt.Errorf("%s is not a valid SortDirectionEnum", raw)
}

// MarshalGQL writes the enum value to w as a quoted string.
func (e SortDirectionEnum) MarshalGQL(w io.Writer) {
	quoted := strconv.Quote(e.String())
	fmt.Fprint(w, quoted)
}
|
||||
|
||||
// Weights is the GraphQL enum selecting how counted jobs are weighted.
type Weights string

// The valid Weights values.
const (
	WeightsNodeCount Weights = "NODE_COUNT"
	WeightsNodeHours Weights = "NODE_HOURS"
)

// AllWeights lists every valid Weights value.
var AllWeights = []Weights{
	WeightsNodeCount,
	WeightsNodeHours,
}

// IsValid reports whether e is one of the declared Weights constants.
func (e Weights) IsValid() bool {
	return e == WeightsNodeCount || e == WeightsNodeHours
}

// String returns the enum value as a plain string.
func (e Weights) String() string {
	return string(e)
}

// UnmarshalGQL stores v in e. It fails if v is not a string or not a valid
// Weights value; in the latter case e has already been overwritten with the
// rejected value.
func (e *Weights) UnmarshalGQL(v interface{}) error {
	raw, isString := v.(string)
	if !isString {
		return fmt.Errorf("enums must be strings")
	}

	*e = Weights(raw)
	if e.IsValid() {
		return nil
	}
	return fmt.Errorf("%s is not a valid Weights", raw)
}

// MarshalGQL writes the enum value to w as a quoted string.
func (e Weights) MarshalGQL(w io.Writer) {
	quoted := strconv.Quote(e.String())
	fmt.Fprint(w, quoted)
}
|
15
internal/graph/resolver.go
Normal file
15
internal/graph/resolver.go
Normal file
@@ -0,0 +1,15 @@
|
||||
package graph
|
||||
|
||||
import (
|
||||
"github.com/ClusterCockpit/cc-backend/internal/repository"
|
||||
"github.com/jmoiron/sqlx"
|
||||
)
|
||||
|
||||
// This file will not be regenerated automatically.
|
||||
//
|
||||
// It serves as dependency injection for your app, add any dependencies you require here.
|
||||
|
||||
type Resolver struct {
|
||||
DB *sqlx.DB
|
||||
Repo *repository.JobRepository
|
||||
}
|
275
internal/graph/schema.graphqls
Normal file
275
internal/graph/schema.graphqls
Normal file
@@ -0,0 +1,275 @@
|
||||
scalar Time
|
||||
scalar Any
|
||||
|
||||
scalar NullableFloat
|
||||
scalar MetricScope
|
||||
scalar JobState
|
||||
|
||||
type Job {
|
||||
id: ID!
|
||||
jobId: Int!
|
||||
user: String!
|
||||
project: String!
|
||||
cluster: String!
|
||||
subCluster: String!
|
||||
startTime: Time!
|
||||
duration: Int!
|
||||
walltime: Int!
|
||||
numNodes: Int!
|
||||
numHWThreads: Int!
|
||||
numAcc: Int!
|
||||
SMT: Int!
|
||||
exclusive: Int!
|
||||
partition: String!
|
||||
arrayJobId: Int!
|
||||
monitoringStatus: Int!
|
||||
state: JobState!
|
||||
tags: [Tag!]!
|
||||
resources: [Resource!]!
|
||||
|
||||
metaData: Any
|
||||
userData: User
|
||||
}
|
||||
|
||||
type Cluster {
|
||||
name: String!
|
||||
partitions: [String!]! # Slurm partitions
|
||||
metricConfig: [MetricConfig!]!
|
||||
filterRanges: FilterRanges!
|
||||
subClusters: [SubCluster!]! # Hardware partitions/subclusters
|
||||
}
|
||||
|
||||
type SubCluster {
|
||||
name: String!
|
||||
nodes: String!
|
||||
numberOfNodes: Int!
|
||||
processorType: String!
|
||||
socketsPerNode: Int!
|
||||
coresPerSocket: Int!
|
||||
threadsPerCore: Int!
|
||||
flopRateScalar: Int!
|
||||
flopRateSimd: Int!
|
||||
memoryBandwidth: Int!
|
||||
topology: Topology!
|
||||
}
|
||||
|
||||
type Topology {
|
||||
node: [Int!]
|
||||
socket: [[Int!]!]
|
||||
memoryDomain: [[Int!]!]
|
||||
die: [[Int!]!]
|
||||
core: [[Int!]!]
|
||||
accelerators: [Accelerator!]
|
||||
}
|
||||
|
||||
type Accelerator {
|
||||
id: String!
|
||||
type: String!
|
||||
model: String!
|
||||
}
|
||||
|
||||
type SubClusterConfig {
|
||||
name: String!
|
||||
peak: Float!
|
||||
normal: Float!
|
||||
caution: Float!
|
||||
alert: Float!
|
||||
}
|
||||
|
||||
type MetricConfig {
|
||||
name: String!
|
||||
unit: String!
|
||||
scope: MetricScope!
|
||||
aggregation: String
|
||||
timestep: Int!
|
||||
peak: Float
|
||||
normal: Float
|
||||
caution: Float
|
||||
alert: Float
|
||||
subClusters: [SubClusterConfig]
|
||||
}
|
||||
|
||||
type Tag {
|
||||
id: ID!
|
||||
type: String!
|
||||
name: String!
|
||||
}
|
||||
|
||||
type Resource {
|
||||
hostname: String!
|
||||
hwthreads: [Int!]
|
||||
accelerators: [String!]
|
||||
configuration: String
|
||||
}
|
||||
|
||||
type JobMetricWithName {
|
||||
name: String!
|
||||
metric: JobMetric!
|
||||
}
|
||||
|
||||
type JobMetric {
|
||||
unit: String!
|
||||
scope: MetricScope!
|
||||
timestep: Int!
|
||||
series: [Series!]
|
||||
statisticsSeries: StatsSeries
|
||||
}
|
||||
|
||||
type Series {
|
||||
hostname: String!
|
||||
id: Int
|
||||
statistics: MetricStatistics
|
||||
data: [NullableFloat!]!
|
||||
}
|
||||
|
||||
type MetricStatistics {
|
||||
avg: Float!
|
||||
min: Float!
|
||||
max: Float!
|
||||
}
|
||||
|
||||
type StatsSeries {
|
||||
mean: [NullableFloat!]!
|
||||
min: [NullableFloat!]!
|
||||
max: [NullableFloat!]!
|
||||
}
|
||||
|
||||
type MetricFootprints {
|
||||
metric: String!
|
||||
data: [NullableFloat!]!
|
||||
}
|
||||
|
||||
type Footprints {
|
||||
nodehours: [NullableFloat!]!
|
||||
metrics: [MetricFootprints!]!
|
||||
}
|
||||
|
||||
enum Aggregate { USER, PROJECT, CLUSTER }
|
||||
enum Weights { NODE_COUNT, NODE_HOURS }
|
||||
|
||||
type NodeMetrics {
|
||||
host: String!
|
||||
subCluster: String!
|
||||
metrics: [JobMetricWithName!]!
|
||||
}
|
||||
|
||||
type Count {
|
||||
name: String!
|
||||
count: Int!
|
||||
}
|
||||
|
||||
type User {
|
||||
username: String!
|
||||
name: String!
|
||||
email: String!
|
||||
}
|
||||
|
||||
type Query {
|
||||
clusters: [Cluster!]! # List of all clusters
|
||||
tags: [Tag!]! # List of all tags
|
||||
|
||||
user(username: String!): User
|
||||
allocatedNodes(cluster: String!): [Count!]!
|
||||
|
||||
job(id: ID!): Job
|
||||
jobMetrics(id: ID!, metrics: [String!], scopes: [MetricScope!]): [JobMetricWithName!]!
|
||||
jobsFootprints(filter: [JobFilter!], metrics: [String!]!): Footprints
|
||||
|
||||
jobs(filter: [JobFilter!], page: PageRequest, order: OrderByInput): JobResultList!
|
||||
jobsStatistics(filter: [JobFilter!], groupBy: Aggregate): [JobsStatistics!]!
|
||||
jobsCount(filter: [JobFilter]!, groupBy: Aggregate!, weight: Weights, limit: Int): [Count!]!
|
||||
|
||||
rooflineHeatmap(filter: [JobFilter!]!, rows: Int!, cols: Int!, minX: Float!, minY: Float!, maxX: Float!, maxY: Float!): [[Float!]!]!
|
||||
|
||||
nodeMetrics(cluster: String!, nodes: [String!], scopes: [MetricScope!], metrics: [String!], from: Time!, to: Time!): [NodeMetrics!]!
|
||||
}
|
||||
|
||||
type Mutation {
|
||||
createTag(type: String!, name: String!): Tag!
|
||||
deleteTag(id: ID!): ID!
|
||||
addTagsToJob(job: ID!, tagIds: [ID!]!): [Tag!]!
|
||||
removeTagsFromJob(job: ID!, tagIds: [ID!]!): [Tag!]!
|
||||
|
||||
updateConfiguration(name: String!, value: String!): String
|
||||
}
|
||||
|
||||
type IntRangeOutput { from: Int!, to: Int! }
|
||||
type TimeRangeOutput { from: Time!, to: Time! }
|
||||
|
||||
type FilterRanges {
|
||||
duration: IntRangeOutput!
|
||||
numNodes: IntRangeOutput!
|
||||
startTime: TimeRangeOutput!
|
||||
}
|
||||
|
||||
input JobFilter {
|
||||
tags: [ID!]
|
||||
jobId: StringInput
|
||||
arrayJobId: Int
|
||||
user: StringInput
|
||||
project: StringInput
|
||||
cluster: StringInput
|
||||
partition: StringInput
|
||||
duration: IntRange
|
||||
|
||||
minRunningFor: Int
|
||||
|
||||
numNodes: IntRange
|
||||
numAccelerators: IntRange
|
||||
numHWThreads: IntRange
|
||||
|
||||
startTime: TimeRange
|
||||
state: [JobState!]
|
||||
flopsAnyAvg: FloatRange
|
||||
memBwAvg: FloatRange
|
||||
loadAvg: FloatRange
|
||||
memUsedMax: FloatRange
|
||||
}
|
||||
|
||||
input OrderByInput {
|
||||
field: String!
|
||||
order: SortDirectionEnum! = ASC
|
||||
}
|
||||
|
||||
enum SortDirectionEnum {
|
||||
DESC
|
||||
ASC
|
||||
}
|
||||
|
||||
input StringInput {
|
||||
eq: String
|
||||
contains: String
|
||||
startsWith: String
|
||||
endsWith: String
|
||||
}
|
||||
|
||||
input IntRange { from: Int!, to: Int! }
|
||||
input FloatRange { from: Float!, to: Float! }
|
||||
input TimeRange { from: Time, to: Time }
|
||||
|
||||
type JobResultList {
|
||||
items: [Job!]!
|
||||
offset: Int
|
||||
limit: Int
|
||||
count: Int
|
||||
}
|
||||
|
||||
type HistoPoint {
|
||||
count: Int!
|
||||
value: Int!
|
||||
}
|
||||
|
||||
type JobsStatistics {
|
||||
id: ID! # If `groupBy` was used, ID of the user/project/cluster
|
||||
totalJobs: Int! # Number of jobs that matched
|
||||
shortJobs: Int! # Number of jobs with a duration of less than 5 minutes
|
||||
totalWalltime: Int! # Sum of the duration of all matched jobs in hours
|
||||
totalCoreHours: Int! # Sum of the core hours of all matched jobs
|
||||
histDuration: [HistoPoint!]! # value: hour, count: number of jobs with a rounded duration of value
|
||||
histNumNodes: [HistoPoint!]! # value: number of nodes, count: number of jobs with that number of nodes
|
||||
}
|
||||
|
||||
input PageRequest {
|
||||
itemsPerPage: Int!
|
||||
page: Int!
|
||||
}
|
280
internal/graph/schema.resolvers.go
Normal file
280
internal/graph/schema.resolvers.go
Normal file
@@ -0,0 +1,280 @@
|
||||
package graph
|
||||
|
||||
// This file will be automatically regenerated based on the schema, any resolver implementations
|
||||
// will be copied through when generating and any unknown code will be moved to the end.
|
||||
|
||||
import (
|
||||
"context"
|
||||
"errors"
|
||||
"fmt"
|
||||
"strconv"
|
||||
"time"
|
||||
|
||||
"github.com/ClusterCockpit/cc-backend/internal/auth"
|
||||
"github.com/ClusterCockpit/cc-backend/internal/config"
|
||||
"github.com/ClusterCockpit/cc-backend/internal/graph/generated"
|
||||
"github.com/ClusterCockpit/cc-backend/internal/graph/model"
|
||||
"github.com/ClusterCockpit/cc-backend/internal/metricdata"
|
||||
"github.com/ClusterCockpit/cc-backend/pkg/schema"
|
||||
)
|
||||
|
||||
func (r *clusterResolver) Partitions(ctx context.Context, obj *model.Cluster) ([]string, error) {
|
||||
return r.Repo.Partitions(obj.Name)
|
||||
}
|
||||
|
||||
func (r *jobResolver) Tags(ctx context.Context, obj *schema.Job) ([]*schema.Tag, error) {
|
||||
return r.Repo.GetTags(&obj.ID)
|
||||
}
|
||||
|
||||
func (r *jobResolver) MetaData(ctx context.Context, obj *schema.Job) (interface{}, error) {
|
||||
return r.Repo.FetchMetadata(obj)
|
||||
}
|
||||
|
||||
func (r *jobResolver) UserData(ctx context.Context, obj *schema.Job) (*model.User, error) {
|
||||
return auth.FetchUser(ctx, r.DB, obj.User)
|
||||
}
|
||||
|
||||
func (r *mutationResolver) CreateTag(ctx context.Context, typeArg string, name string) (*schema.Tag, error) {
|
||||
id, err := r.Repo.CreateTag(typeArg, name)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
return &schema.Tag{ID: id, Type: typeArg, Name: name}, nil
|
||||
}
|
||||
|
||||
func (r *mutationResolver) DeleteTag(ctx context.Context, id string) (string, error) {
|
||||
// The UI does not allow this currently anyways.
|
||||
panic(fmt.Errorf("not implemented"))
|
||||
}
|
||||
|
||||
func (r *mutationResolver) AddTagsToJob(ctx context.Context, job string, tagIds []string) ([]*schema.Tag, error) {
|
||||
jid, err := strconv.ParseInt(job, 10, 64)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
tags := []*schema.Tag{}
|
||||
for _, tagId := range tagIds {
|
||||
tid, err := strconv.ParseInt(tagId, 10, 64)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
if tags, err = r.Repo.AddTag(jid, tid); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
}
|
||||
|
||||
return tags, nil
|
||||
}
|
||||
|
||||
func (r *mutationResolver) RemoveTagsFromJob(ctx context.Context, job string, tagIds []string) ([]*schema.Tag, error) {
|
||||
jid, err := strconv.ParseInt(job, 10, 64)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
tags := []*schema.Tag{}
|
||||
for _, tagId := range tagIds {
|
||||
tid, err := strconv.ParseInt(tagId, 10, 64)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
if tags, err = r.Repo.RemoveTag(jid, tid); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
}
|
||||
|
||||
return tags, nil
|
||||
}
|
||||
|
||||
func (r *mutationResolver) UpdateConfiguration(ctx context.Context, name string, value string) (*string, error) {
|
||||
if err := config.UpdateConfig(name, value, ctx); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
return nil, nil
|
||||
}
|
||||
|
||||
func (r *queryResolver) Clusters(ctx context.Context) ([]*model.Cluster, error) {
|
||||
return config.Clusters, nil
|
||||
}
|
||||
|
||||
func (r *queryResolver) Tags(ctx context.Context) ([]*schema.Tag, error) {
|
||||
return r.Repo.GetTags(nil)
|
||||
}
|
||||
|
||||
func (r *queryResolver) User(ctx context.Context, username string) (*model.User, error) {
|
||||
return auth.FetchUser(ctx, r.DB, username)
|
||||
}
|
||||
|
||||
func (r *queryResolver) AllocatedNodes(ctx context.Context, cluster string) ([]*model.Count, error) {
|
||||
data, err := r.Repo.AllocatedNodes(cluster)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
counts := make([]*model.Count, 0, len(data))
|
||||
for subcluster, hosts := range data {
|
||||
counts = append(counts, &model.Count{
|
||||
Name: subcluster,
|
||||
Count: len(hosts),
|
||||
})
|
||||
}
|
||||
|
||||
return counts, nil
|
||||
}
|
||||
|
||||
func (r *queryResolver) Job(ctx context.Context, id string) (*schema.Job, error) {
|
||||
numericId, err := strconv.ParseInt(id, 10, 64)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
job, err := r.Repo.FindById(numericId)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
if user := auth.GetUser(ctx); user != nil && !user.HasRole(auth.RoleAdmin) && job.User != user.Username {
|
||||
return nil, errors.New("you are not allowed to see this job")
|
||||
}
|
||||
|
||||
return job, nil
|
||||
}
|
||||
|
||||
func (r *queryResolver) JobMetrics(ctx context.Context, id string, metrics []string, scopes []schema.MetricScope) ([]*model.JobMetricWithName, error) {
|
||||
job, err := r.Query().Job(ctx, id)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
data, err := metricdata.LoadData(job, metrics, scopes, ctx)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
res := []*model.JobMetricWithName{}
|
||||
for name, md := range data {
|
||||
for scope, metric := range md {
|
||||
if metric.Scope != schema.MetricScope(scope) {
|
||||
panic("WTF?")
|
||||
}
|
||||
|
||||
res = append(res, &model.JobMetricWithName{
|
||||
Name: name,
|
||||
Metric: metric,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
return res, err
|
||||
}
|
||||
|
||||
func (r *queryResolver) JobsFootprints(ctx context.Context, filter []*model.JobFilter, metrics []string) (*model.Footprints, error) {
|
||||
return r.jobsFootprints(ctx, filter, metrics)
|
||||
}
|
||||
|
||||
func (r *queryResolver) Jobs(ctx context.Context, filter []*model.JobFilter, page *model.PageRequest, order *model.OrderByInput) (*model.JobResultList, error) {
|
||||
if page == nil {
|
||||
page = &model.PageRequest{
|
||||
ItemsPerPage: 50,
|
||||
Page: 1,
|
||||
}
|
||||
}
|
||||
|
||||
jobs, err := r.Repo.QueryJobs(ctx, filter, page, order)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
count, err := r.Repo.CountJobs(ctx, filter)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
return &model.JobResultList{Items: jobs, Count: &count}, nil
|
||||
}
|
||||
|
||||
func (r *queryResolver) JobsStatistics(ctx context.Context, filter []*model.JobFilter, groupBy *model.Aggregate) ([]*model.JobsStatistics, error) {
|
||||
return r.jobsStatistics(ctx, filter, groupBy)
|
||||
}
|
||||
|
||||
func (r *queryResolver) JobsCount(ctx context.Context, filter []*model.JobFilter, groupBy model.Aggregate, weight *model.Weights, limit *int) ([]*model.Count, error) {
|
||||
counts, err := r.Repo.CountGroupedJobs(ctx, groupBy, filter, weight, limit)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
res := make([]*model.Count, 0, len(counts))
|
||||
for name, count := range counts {
|
||||
res = append(res, &model.Count{
|
||||
Name: name,
|
||||
Count: count,
|
||||
})
|
||||
}
|
||||
return res, nil
|
||||
}
|
||||
|
||||
func (r *queryResolver) RooflineHeatmap(ctx context.Context, filter []*model.JobFilter, rows int, cols int, minX float64, minY float64, maxX float64, maxY float64) ([][]float64, error) {
|
||||
return r.rooflineHeatmap(ctx, filter, rows, cols, minX, minY, maxX, maxY)
|
||||
}
|
||||
|
||||
func (r *queryResolver) NodeMetrics(ctx context.Context, cluster string, nodes []string, scopes []schema.MetricScope, metrics []string, from time.Time, to time.Time) ([]*model.NodeMetrics, error) {
|
||||
user := auth.GetUser(ctx)
|
||||
if user != nil && !user.HasRole(auth.RoleAdmin) {
|
||||
return nil, errors.New("you need to be an administrator for this query")
|
||||
}
|
||||
|
||||
if metrics == nil {
|
||||
for _, mc := range config.GetCluster(cluster).MetricConfig {
|
||||
metrics = append(metrics, mc.Name)
|
||||
}
|
||||
}
|
||||
|
||||
data, err := metricdata.LoadNodeData(cluster, metrics, nodes, scopes, from, to, ctx)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
nodeMetrics := make([]*model.NodeMetrics, 0, len(data))
|
||||
for hostname, metrics := range data {
|
||||
host := &model.NodeMetrics{
|
||||
Host: hostname,
|
||||
Metrics: make([]*model.JobMetricWithName, 0, len(metrics)*len(scopes)),
|
||||
}
|
||||
host.SubCluster, _ = config.GetSubClusterByNode(cluster, hostname)
|
||||
|
||||
for metric, scopedMetrics := range metrics {
|
||||
for _, scopedMetric := range scopedMetrics {
|
||||
host.Metrics = append(host.Metrics, &model.JobMetricWithName{
|
||||
Name: metric,
|
||||
Metric: scopedMetric,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
nodeMetrics = append(nodeMetrics, host)
|
||||
}
|
||||
|
||||
return nodeMetrics, nil
|
||||
}
|
||||
|
||||
// Cluster returns generated.ClusterResolver implementation.
|
||||
func (r *Resolver) Cluster() generated.ClusterResolver { return &clusterResolver{r} }
|
||||
|
||||
// Job returns generated.JobResolver implementation.
|
||||
func (r *Resolver) Job() generated.JobResolver { return &jobResolver{r} }
|
||||
|
||||
// Mutation returns generated.MutationResolver implementation.
|
||||
func (r *Resolver) Mutation() generated.MutationResolver { return &mutationResolver{r} }
|
||||
|
||||
// Query returns generated.QueryResolver implementation.
|
||||
func (r *Resolver) Query() generated.QueryResolver { return &queryResolver{r} }
|
||||
|
||||
type clusterResolver struct{ *Resolver }
|
||||
type jobResolver struct{ *Resolver }
|
||||
type mutationResolver struct{ *Resolver }
|
||||
type queryResolver struct{ *Resolver }
|
302
internal/graph/stats.go
Normal file
302
internal/graph/stats.go
Normal file
@@ -0,0 +1,302 @@
|
||||
package graph
|
||||
|
||||
import (
|
||||
"context"
|
||||
"database/sql"
|
||||
"errors"
|
||||
"fmt"
|
||||
"math"
|
||||
"time"
|
||||
|
||||
"github.com/99designs/gqlgen/graphql"
|
||||
"github.com/ClusterCockpit/cc-backend/internal/config"
|
||||
"github.com/ClusterCockpit/cc-backend/internal/graph/model"
|
||||
"github.com/ClusterCockpit/cc-backend/internal/metricdata"
|
||||
"github.com/ClusterCockpit/cc-backend/internal/repository"
|
||||
"github.com/ClusterCockpit/cc-backend/pkg/schema"
|
||||
sq "github.com/Masterminds/squirrel"
|
||||
)
|
||||
|
||||
// GraphQL validation should make sure that no unknown values can be specified.
|
||||
var groupBy2column = map[model.Aggregate]string{
|
||||
model.AggregateUser: "job.user",
|
||||
model.AggregateProject: "job.project",
|
||||
model.AggregateCluster: "job.cluster",
|
||||
}
|
||||
|
||||
const ShortJobDuration int = 5 * 60
|
||||
|
||||
// Helper function for the jobsStatistics GraphQL query placed here so that schema.resolvers.go is not too full.
|
||||
func (r *queryResolver) jobsStatistics(ctx context.Context, filter []*model.JobFilter, groupBy *model.Aggregate) ([]*model.JobsStatistics, error) {
|
||||
// In case `groupBy` is nil (not used), the model.JobsStatistics used is at the key '' (empty string)
|
||||
stats := map[string]*model.JobsStatistics{}
|
||||
|
||||
// `socketsPerNode` and `coresPerSocket` can differ from cluster to cluster, so we need to explicitly loop over those.
|
||||
for _, cluster := range config.Clusters {
|
||||
for _, subcluster := range cluster.SubClusters {
|
||||
corehoursCol := fmt.Sprintf("CAST(ROUND(SUM(job.duration * job.num_nodes * %d * %d) / 3600) as int)", subcluster.SocketsPerNode, subcluster.CoresPerSocket)
|
||||
var query sq.SelectBuilder
|
||||
if groupBy == nil {
|
||||
query = sq.Select(
|
||||
"''",
|
||||
"COUNT(job.id)",
|
||||
"CAST(ROUND(SUM(job.duration) / 3600) as int)",
|
||||
corehoursCol,
|
||||
).From("job")
|
||||
} else {
|
||||
col := groupBy2column[*groupBy]
|
||||
query = sq.Select(
|
||||
col,
|
||||
"COUNT(job.id)",
|
||||
"CAST(ROUND(SUM(job.duration) / 3600) as int)",
|
||||
corehoursCol,
|
||||
).From("job").GroupBy(col)
|
||||
}
|
||||
|
||||
query = query.
|
||||
Where("job.cluster = ?", cluster.Name).
|
||||
Where("job.subcluster = ?", subcluster.Name)
|
||||
|
||||
query = repository.SecurityCheck(ctx, query)
|
||||
for _, f := range filter {
|
||||
query = repository.BuildWhereClause(f, query)
|
||||
}
|
||||
|
||||
rows, err := query.RunWith(r.DB).Query()
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
for rows.Next() {
|
||||
var id sql.NullString
|
||||
var jobs, walltime, corehours sql.NullInt64
|
||||
if err := rows.Scan(&id, &jobs, &walltime, &corehours); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
if id.Valid {
|
||||
if s, ok := stats[id.String]; ok {
|
||||
s.TotalJobs += int(jobs.Int64)
|
||||
s.TotalWalltime += int(walltime.Int64)
|
||||
s.TotalCoreHours += int(corehours.Int64)
|
||||
} else {
|
||||
stats[id.String] = &model.JobsStatistics{
|
||||
ID: id.String,
|
||||
TotalJobs: int(jobs.Int64),
|
||||
TotalWalltime: int(walltime.Int64),
|
||||
TotalCoreHours: int(corehours.Int64),
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if groupBy == nil {
|
||||
query := sq.Select("COUNT(job.id)").From("job").Where("job.duration < ?", ShortJobDuration)
|
||||
query = repository.SecurityCheck(ctx, query)
|
||||
for _, f := range filter {
|
||||
query = repository.BuildWhereClause(f, query)
|
||||
}
|
||||
if err := query.RunWith(r.DB).QueryRow().Scan(&(stats[""].ShortJobs)); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
} else {
|
||||
col := groupBy2column[*groupBy]
|
||||
query := sq.Select(col, "COUNT(job.id)").From("job").Where("job.duration < ?", ShortJobDuration)
|
||||
query = repository.SecurityCheck(ctx, query)
|
||||
for _, f := range filter {
|
||||
query = repository.BuildWhereClause(f, query)
|
||||
}
|
||||
rows, err := query.RunWith(r.DB).Query()
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
for rows.Next() {
|
||||
var id sql.NullString
|
||||
var shortJobs sql.NullInt64
|
||||
if err := rows.Scan(&id, &shortJobs); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
if id.Valid {
|
||||
stats[id.String].ShortJobs = int(shortJobs.Int64)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Calculating the histogram data is expensive, so only do it if needed.
|
||||
// An explicit resolver can not be used because we need to know the filters.
|
||||
histogramsNeeded := false
|
||||
fields := graphql.CollectFieldsCtx(ctx, nil)
|
||||
for _, col := range fields {
|
||||
if col.Name == "histDuration" || col.Name == "histNumNodes" {
|
||||
histogramsNeeded = true
|
||||
}
|
||||
}
|
||||
|
||||
res := make([]*model.JobsStatistics, 0, len(stats))
|
||||
for _, stat := range stats {
|
||||
res = append(res, stat)
|
||||
id, col := "", ""
|
||||
if groupBy != nil {
|
||||
id = stat.ID
|
||||
col = groupBy2column[*groupBy]
|
||||
}
|
||||
|
||||
if histogramsNeeded {
|
||||
var err error
|
||||
value := fmt.Sprintf(`CAST(ROUND((CASE WHEN job.job_state = "running" THEN %d - job.start_time ELSE job.duration END) / 3600) as int) as value`, time.Now().Unix())
|
||||
stat.HistDuration, err = r.jobsStatisticsHistogram(ctx, value, filter, id, col)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
stat.HistNumNodes, err = r.jobsStatisticsHistogram(ctx, "job.num_nodes as value", filter, id, col)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return res, nil
|
||||
}
|
||||
|
||||
// `value` must be the column grouped by, but renamed to "value". `id` and `col` can optionally be used
|
||||
// to add a condition to the query of the kind "<col> = <id>".
|
||||
func (r *queryResolver) jobsStatisticsHistogram(ctx context.Context, value string, filters []*model.JobFilter, id, col string) ([]*model.HistoPoint, error) {
|
||||
query := sq.Select(value, "COUNT(job.id) AS count").From("job")
|
||||
query = repository.SecurityCheck(ctx, query)
|
||||
for _, f := range filters {
|
||||
query = repository.BuildWhereClause(f, query)
|
||||
}
|
||||
|
||||
if len(id) != 0 && len(col) != 0 {
|
||||
query = query.Where(col+" = ?", id)
|
||||
}
|
||||
|
||||
rows, err := query.GroupBy("value").RunWith(r.DB).Query()
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
points := make([]*model.HistoPoint, 0)
|
||||
for rows.Next() {
|
||||
point := model.HistoPoint{}
|
||||
if err := rows.Scan(&point.Value, &point.Count); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
points = append(points, &point)
|
||||
}
|
||||
return points, nil
|
||||
}
|
||||
|
||||
// MAX_JOBS_FOR_ANALYSIS is the upper bound on the number of jobs that
// rooflineHeatmap and jobsFootprints process; both return an error when the
// filter matches more jobs than this.
const MAX_JOBS_FOR_ANALYSIS = 500
|
||||
|
||||
// Helper function for the rooflineHeatmap GraphQL query placed here so that schema.resolvers.go is not too full.
|
||||
func (r *Resolver) rooflineHeatmap(ctx context.Context, filter []*model.JobFilter, rows int, cols int, minX float64, minY float64, maxX float64, maxY float64) ([][]float64, error) {
|
||||
jobs, err := r.Repo.QueryJobs(ctx, filter, &model.PageRequest{Page: 1, ItemsPerPage: MAX_JOBS_FOR_ANALYSIS + 1}, nil)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
if len(jobs) > MAX_JOBS_FOR_ANALYSIS {
|
||||
return nil, fmt.Errorf("too many jobs matched (max: %d)", MAX_JOBS_FOR_ANALYSIS)
|
||||
}
|
||||
|
||||
fcols, frows := float64(cols), float64(rows)
|
||||
minX, minY, maxX, maxY = math.Log10(minX), math.Log10(minY), math.Log10(maxX), math.Log10(maxY)
|
||||
tiles := make([][]float64, rows)
|
||||
for i := range tiles {
|
||||
tiles[i] = make([]float64, cols)
|
||||
}
|
||||
|
||||
for _, job := range jobs {
|
||||
if job.MonitoringStatus == schema.MonitoringStatusDisabled || job.MonitoringStatus == schema.MonitoringStatusArchivingFailed {
|
||||
continue
|
||||
}
|
||||
|
||||
jobdata, err := metricdata.LoadData(job, []string{"flops_any", "mem_bw"}, []schema.MetricScope{schema.MetricScopeNode}, ctx)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
flops_, membw_ := jobdata["flops_any"], jobdata["mem_bw"]
|
||||
if flops_ == nil && membw_ == nil {
|
||||
return nil, fmt.Errorf("'flops_any' or 'mem_bw' missing for job %d", job.ID)
|
||||
}
|
||||
|
||||
flops, ok1 := flops_["node"]
|
||||
membw, ok2 := membw_["node"]
|
||||
if !ok1 || !ok2 {
|
||||
// TODO/FIXME:
|
||||
return nil, errors.New("todo: rooflineHeatmap() query not implemented for where flops_any or mem_bw not available at 'node' level")
|
||||
}
|
||||
|
||||
for n := 0; n < len(flops.Series); n++ {
|
||||
flopsSeries, membwSeries := flops.Series[n], membw.Series[n]
|
||||
for i := 0; i < len(flopsSeries.Data); i++ {
|
||||
if i >= len(membwSeries.Data) {
|
||||
break
|
||||
}
|
||||
|
||||
x, y := math.Log10(float64(flopsSeries.Data[i]/membwSeries.Data[i])), math.Log10(float64(flopsSeries.Data[i]))
|
||||
if math.IsNaN(x) || math.IsNaN(y) || x < minX || x >= maxX || y < minY || y > maxY {
|
||||
continue
|
||||
}
|
||||
|
||||
x, y = math.Floor(((x-minX)/(maxX-minX))*fcols), math.Floor(((y-minY)/(maxY-minY))*frows)
|
||||
if x < 0 || x >= fcols || y < 0 || y >= frows {
|
||||
continue
|
||||
}
|
||||
|
||||
tiles[int(y)][int(x)] += 1
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return tiles, nil
|
||||
}
|
||||
|
||||
// Helper function for the jobsFootprints GraphQL query placed here so that schema.resolvers.go is not too full.
|
||||
func (r *queryResolver) jobsFootprints(ctx context.Context, filter []*model.JobFilter, metrics []string) (*model.Footprints, error) {
|
||||
jobs, err := r.Repo.QueryJobs(ctx, filter, &model.PageRequest{Page: 1, ItemsPerPage: MAX_JOBS_FOR_ANALYSIS + 1}, nil)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
if len(jobs) > MAX_JOBS_FOR_ANALYSIS {
|
||||
return nil, fmt.Errorf("too many jobs matched (max: %d)", MAX_JOBS_FOR_ANALYSIS)
|
||||
}
|
||||
|
||||
avgs := make([][]schema.Float, len(metrics))
|
||||
for i := range avgs {
|
||||
avgs[i] = make([]schema.Float, 0, len(jobs))
|
||||
}
|
||||
|
||||
nodehours := make([]schema.Float, 0, len(jobs))
|
||||
for _, job := range jobs {
|
||||
if job.MonitoringStatus == schema.MonitoringStatusDisabled || job.MonitoringStatus == schema.MonitoringStatusArchivingFailed {
|
||||
continue
|
||||
}
|
||||
|
||||
if err := metricdata.LoadAverages(job, metrics, avgs, ctx); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
nodehours = append(nodehours, schema.Float(float64(job.Duration)/60.0*float64(job.NumNodes)))
|
||||
}
|
||||
|
||||
res := make([]*model.MetricFootprints, len(avgs))
|
||||
for i, arr := range avgs {
|
||||
res[i] = &model.MetricFootprints{
|
||||
Metric: metrics[i],
|
||||
Data: arr,
|
||||
}
|
||||
}
|
||||
|
||||
return &model.Footprints{
|
||||
Nodehours: nodehours,
|
||||
Metrics: res,
|
||||
}, nil
|
||||
}
|
257
internal/metricdata/archive.go
Normal file
257
internal/metricdata/archive.go
Normal file
@@ -0,0 +1,257 @@
|
||||
package metricdata
|
||||
|
||||
import (
|
||||
"bufio"
|
||||
"context"
|
||||
"encoding/json"
|
||||
"errors"
|
||||
"fmt"
|
||||
"math"
|
||||
"os"
|
||||
"path"
|
||||
"path/filepath"
|
||||
"strconv"
|
||||
"time"
|
||||
|
||||
"github.com/ClusterCockpit/cc-backend/internal/config"
|
||||
"github.com/ClusterCockpit/cc-backend/pkg/schema"
|
||||
)
|
||||
|
||||
// For a given job, return the path of the `data.json`/`meta.json` file.
|
||||
// TODO: Implement Issue ClusterCockpit/ClusterCockpit#97
|
||||
func getPath(job *schema.Job, file string, checkLegacy bool) (string, error) {
|
||||
lvl1, lvl2 := fmt.Sprintf("%d", job.JobID/1000), fmt.Sprintf("%03d", job.JobID%1000)
|
||||
if !checkLegacy {
|
||||
return filepath.Join(JobArchivePath, job.Cluster, lvl1, lvl2, strconv.FormatInt(job.StartTime.Unix(), 10), file), nil
|
||||
}
|
||||
|
||||
legacyPath := filepath.Join(JobArchivePath, job.Cluster, lvl1, lvl2, file)
|
||||
if _, err := os.Stat(legacyPath); errors.Is(err, os.ErrNotExist) {
|
||||
return filepath.Join(JobArchivePath, job.Cluster, lvl1, lvl2, strconv.FormatInt(job.StartTime.Unix(), 10), file), nil
|
||||
}
|
||||
|
||||
return legacyPath, nil
|
||||
}
|
||||
|
||||
// Assuming job is completed/archived, return the jobs metric data.
|
||||
func loadFromArchive(job *schema.Job) (schema.JobData, error) {
|
||||
filename, err := getPath(job, "data.json", true)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
data := cache.Get(filename, func() (value interface{}, ttl time.Duration, size int) {
|
||||
f, err := os.Open(filename)
|
||||
if err != nil {
|
||||
return err, 0, 1000
|
||||
}
|
||||
defer f.Close()
|
||||
|
||||
var data schema.JobData
|
||||
if err := json.NewDecoder(bufio.NewReader(f)).Decode(&data); err != nil {
|
||||
return err, 0, 1000
|
||||
}
|
||||
|
||||
return data, 1 * time.Hour, data.Size()
|
||||
})
|
||||
|
||||
if err, ok := data.(error); ok {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
return data.(schema.JobData), nil
|
||||
}
|
||||
|
||||
func loadMetaJson(job *schema.Job) (*schema.JobMeta, error) {
|
||||
filename, err := getPath(job, "meta.json", true)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
bytes, err := os.ReadFile(filename)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
var metaFile schema.JobMeta = schema.JobMeta{
|
||||
BaseJob: schema.JobDefaults,
|
||||
}
|
||||
if err := json.Unmarshal(bytes, &metaFile); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
return &metaFile, nil
|
||||
}
|
||||
|
||||
// If the job is archived, find its `meta.json` file and override the tags list
|
||||
// in that JSON file. If the job is not archived, nothing is done.
|
||||
func UpdateTags(job *schema.Job, tags []*schema.Tag) error {
|
||||
if job.State == schema.JobStateRunning {
|
||||
return nil
|
||||
}
|
||||
|
||||
filename, err := getPath(job, "meta.json", true)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
f, err := os.Open(filename)
|
||||
if err != nil {
|
||||
if os.IsNotExist(err) {
|
||||
return nil
|
||||
}
|
||||
return err
|
||||
}
|
||||
|
||||
var metaFile schema.JobMeta = schema.JobMeta{
|
||||
BaseJob: schema.JobDefaults,
|
||||
}
|
||||
if err := json.NewDecoder(f).Decode(&metaFile); err != nil {
|
||||
return err
|
||||
}
|
||||
f.Close()
|
||||
|
||||
metaFile.Tags = make([]*schema.Tag, 0)
|
||||
for _, tag := range tags {
|
||||
metaFile.Tags = append(metaFile.Tags, &schema.Tag{
|
||||
Name: tag.Name,
|
||||
Type: tag.Type,
|
||||
})
|
||||
}
|
||||
|
||||
bytes, err := json.Marshal(metaFile)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
return os.WriteFile(filename, bytes, 0644)
|
||||
}
|
||||
|
||||
// Helper to metricdata.LoadAverages().
|
||||
func loadAveragesFromArchive(job *schema.Job, metrics []string, data [][]schema.Float) error {
|
||||
metaFile, err := loadMetaJson(job)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
for i, m := range metrics {
|
||||
if stat, ok := metaFile.Statistics[m]; ok {
|
||||
data[i] = append(data[i], schema.Float(stat.Avg))
|
||||
} else {
|
||||
data[i] = append(data[i], schema.NaN)
|
||||
}
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func GetStatistics(job *schema.Job) (map[string]schema.JobStatistics, error) {
|
||||
metaFile, err := loadMetaJson(job)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
return metaFile.Statistics, nil
|
||||
}
|
||||
|
||||
// Writes a running job to the job-archive
|
||||
func ArchiveJob(job *schema.Job, ctx context.Context) (*schema.JobMeta, error) {
|
||||
allMetrics := make([]string, 0)
|
||||
metricConfigs := config.GetCluster(job.Cluster).MetricConfig
|
||||
for _, mc := range metricConfigs {
|
||||
allMetrics = append(allMetrics, mc.Name)
|
||||
}
|
||||
|
||||
// TODO: Talk about this! What resolutions to store data at...
|
||||
scopes := []schema.MetricScope{schema.MetricScopeNode}
|
||||
if job.NumNodes <= 8 {
|
||||
scopes = append(scopes, schema.MetricScopeCore)
|
||||
}
|
||||
|
||||
jobData, err := LoadData(job, allMetrics, scopes, ctx)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
jobMeta := &schema.JobMeta{
|
||||
BaseJob: job.BaseJob,
|
||||
StartTime: job.StartTime.Unix(),
|
||||
Statistics: make(map[string]schema.JobStatistics),
|
||||
}
|
||||
|
||||
for metric, data := range jobData {
|
||||
avg, min, max := 0.0, math.MaxFloat32, -math.MaxFloat32
|
||||
nodeData, ok := data["node"]
|
||||
if !ok {
|
||||
// TODO/FIXME: Calc average for non-node metrics as well!
|
||||
continue
|
||||
}
|
||||
|
||||
for _, series := range nodeData.Series {
|
||||
avg += series.Statistics.Avg
|
||||
min = math.Min(min, series.Statistics.Min)
|
||||
max = math.Max(max, series.Statistics.Max)
|
||||
}
|
||||
|
||||
jobMeta.Statistics[metric] = schema.JobStatistics{
|
||||
Unit: config.GetMetricConfig(job.Cluster, metric).Unit,
|
||||
Avg: avg / float64(job.NumNodes),
|
||||
Min: min,
|
||||
Max: max,
|
||||
}
|
||||
}
|
||||
|
||||
// If the file based archive is disabled,
|
||||
// only return the JobMeta structure as the
|
||||
// statistics in there are needed.
|
||||
if !useArchive {
|
||||
return jobMeta, nil
|
||||
}
|
||||
|
||||
dir, err := getPath(job, "", false)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
return jobMeta, writeFiles(dir, jobMeta, &jobData)
|
||||
}
|
||||
|
||||
func writeFiles(dir string, jobMeta *schema.JobMeta, jobData *schema.JobData) error {
|
||||
if err := os.MkdirAll(dir, 0777); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
f, err := os.Create(path.Join(dir, "meta.json"))
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
if err := json.NewEncoder(f).Encode(jobMeta); err != nil {
|
||||
return err
|
||||
}
|
||||
if err := f.Close(); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
f, err = os.Create(path.Join(dir, "data.json"))
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
if err := json.NewEncoder(f).Encode(jobData); err != nil {
|
||||
return err
|
||||
}
|
||||
return f.Close()
|
||||
}
|
||||
|
||||
// Used to import a non-running job into the job-archive.
|
||||
func ImportJob(job *schema.JobMeta, jobData *schema.JobData) error {
|
||||
dir, err := getPath(&schema.Job{
|
||||
BaseJob: job.BaseJob,
|
||||
StartTimeUnix: job.StartTime,
|
||||
StartTime: time.Unix(job.StartTime, 0),
|
||||
}, "", false)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
return writeFiles(dir, job, jobData)
|
||||
}
|
611
internal/metricdata/cc-metric-store.go
Normal file
611
internal/metricdata/cc-metric-store.go
Normal file
@@ -0,0 +1,611 @@
|
||||
package metricdata
|
||||
|
||||
import (
|
||||
"bufio"
|
||||
"bytes"
|
||||
"context"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"net/http"
|
||||
"strconv"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"github.com/ClusterCockpit/cc-backend/internal/config"
|
||||
"github.com/ClusterCockpit/cc-backend/pkg/schema"
|
||||
)
|
||||
|
||||
// CCMetricStoreConfig is the JSON configuration decoded by CCMetricStore.Init.
type CCMetricStoreConfig struct {
	// Kind is read from the "kind" key; it is not used by CCMetricStore.Init.
	Kind string `json:"kind"`
	// Url is the base URL of the cc-metric-store; "/api/query" is appended to
	// it to form the query endpoint.
	Url string `json:"url"`
	// Token, if not empty, is sent as bearer token with every request.
	Token string `json:"token"`

	// If metrics are known to this MetricDataRepository under a different
	// name than in the `metricConfig` section of the 'cluster.json',
	// provide this optional mapping of local to remote name for this metric.
	Renamings map[string]string `json:"metricRenamings"`
}
|
||||
|
||||
// CCMetricStore loads metric data from a cc-metric-store instance over HTTP.
// All fields are set by Init.
type CCMetricStore struct {
	// jwt is sent as bearer token with every request unless it is empty.
	jwt string
	// url is the base URL as given in the configuration.
	url string
	// queryEndpoint is url with "/api/query" appended.
	queryEndpoint string
	// client is the HTTP client used for all requests.
	client http.Client
	// here2there maps local metric names to the names used by the store.
	here2there map[string]string
	// there2here is the inverse of here2there.
	there2here map[string]string
}
|
||||
|
||||
type ApiQueryRequest struct {
|
||||
Cluster string `json:"cluster"`
|
||||
From int64 `json:"from"`
|
||||
To int64 `json:"to"`
|
||||
WithStats bool `json:"with-stats"`
|
||||
WithData bool `json:"with-data"`
|
||||
Queries []ApiQuery `json:"queries"`
|
||||
ForAllNodes []string `json:"for-all-nodes"`
|
||||
}
|
||||
|
||||
// ApiQuery is a single query of an ApiQueryRequest: one metric on one host,
// optionally restricted to a set of sub-node entities.
type ApiQuery struct {
	// Metric is the name of the metric as known to the cc-metric-store.
	Metric string `json:"metric"`
	// Hostname is the host the metric is requested for.
	Hostname string `json:"host"`
	// Aggregate is sent as "aggreg"; presumably it asks the store to combine
	// the entities in TypeIds into one series — confirm against
	// cc-metric-store.
	Aggregate bool `json:"aggreg"`
	// Type and TypeIds name the kind of entity (e.g. "cpu") and the ids of
	// the entities queried; both are omitted for node level queries.
	Type    *string  `json:"type,omitempty"`
	TypeIds []string `json:"type-ids,omitempty"`
	// SubType and SubTypeIds are sent as "subtype" and "subtype-ids" when set.
	SubType    *string  `json:"subtype,omitempty"`
	SubTypeIds []string `json:"subtype-ids,omitempty"`
}
|
||||
|
||||
type ApiQueryResponse struct {
|
||||
Queries []ApiQuery `json:"queries,omitempty"`
|
||||
Results [][]ApiMetricData `json:"results"`
|
||||
}
|
||||
|
||||
type ApiMetricData struct {
|
||||
Error *string `json:"error"`
|
||||
From int64 `json:"from"`
|
||||
To int64 `json:"to"`
|
||||
Data []schema.Float `json:"data"`
|
||||
Avg schema.Float `json:"avg"`
|
||||
Min schema.Float `json:"min"`
|
||||
Max schema.Float `json:"max"`
|
||||
}
|
||||
|
||||
func (ccms *CCMetricStore) Init(rawConfig json.RawMessage) error {
|
||||
var config CCMetricStoreConfig
|
||||
if err := json.Unmarshal(rawConfig, &config); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
ccms.url = config.Url
|
||||
ccms.queryEndpoint = fmt.Sprintf("%s/api/query", config.Url)
|
||||
ccms.jwt = config.Token
|
||||
ccms.client = http.Client{
|
||||
Timeout: 10 * time.Second,
|
||||
}
|
||||
|
||||
if config.Renamings != nil {
|
||||
ccms.here2there = config.Renamings
|
||||
ccms.there2here = make(map[string]string, len(config.Renamings))
|
||||
for k, v := range ccms.here2there {
|
||||
ccms.there2here[v] = k
|
||||
}
|
||||
} else {
|
||||
ccms.here2there = make(map[string]string)
|
||||
ccms.there2here = make(map[string]string)
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func (ccms *CCMetricStore) toRemoteName(metric string) string {
|
||||
if renamed, ok := ccms.here2there[metric]; ok {
|
||||
return renamed
|
||||
}
|
||||
|
||||
return metric
|
||||
}
|
||||
|
||||
func (ccms *CCMetricStore) toLocalName(metric string) string {
|
||||
if renamed, ok := ccms.there2here[metric]; ok {
|
||||
return renamed
|
||||
}
|
||||
|
||||
return metric
|
||||
}
|
||||
|
||||
func (ccms *CCMetricStore) doRequest(ctx context.Context, body *ApiQueryRequest) (*ApiQueryResponse, error) {
|
||||
buf := &bytes.Buffer{}
|
||||
if err := json.NewEncoder(buf).Encode(body); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
req, err := http.NewRequestWithContext(ctx, http.MethodPost, ccms.queryEndpoint, buf)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
if ccms.jwt != "" {
|
||||
req.Header.Add("Authorization", fmt.Sprintf("Bearer %s", ccms.jwt))
|
||||
}
|
||||
|
||||
res, err := ccms.client.Do(req)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
if res.StatusCode != http.StatusOK {
|
||||
return nil, fmt.Errorf("'%s': HTTP Status: %s", ccms.queryEndpoint, res.Status)
|
||||
}
|
||||
|
||||
var resBody ApiQueryResponse
|
||||
if err := json.NewDecoder(bufio.NewReader(res.Body)).Decode(&resBody); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
return &resBody, nil
|
||||
}
|
||||
|
||||
func (ccms *CCMetricStore) LoadData(job *schema.Job, metrics []string, scopes []schema.MetricScope, ctx context.Context) (schema.JobData, error) {
|
||||
topology := config.GetSubCluster(job.Cluster, job.SubCluster).Topology
|
||||
queries, assignedScope, err := ccms.buildQueries(job, metrics, scopes)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
req := ApiQueryRequest{
|
||||
Cluster: job.Cluster,
|
||||
From: job.StartTime.Unix(),
|
||||
To: job.StartTime.Add(time.Duration(job.Duration) * time.Second).Unix(),
|
||||
Queries: queries,
|
||||
WithStats: true,
|
||||
WithData: true,
|
||||
}
|
||||
|
||||
resBody, err := ccms.doRequest(ctx, &req)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
var errors []string
|
||||
var jobData schema.JobData = make(schema.JobData)
|
||||
for i, row := range resBody.Results {
|
||||
query := req.Queries[i]
|
||||
metric := ccms.toLocalName(query.Metric)
|
||||
scope := assignedScope[i]
|
||||
mc := config.GetMetricConfig(job.Cluster, metric)
|
||||
if _, ok := jobData[metric]; !ok {
|
||||
jobData[metric] = make(map[schema.MetricScope]*schema.JobMetric)
|
||||
}
|
||||
|
||||
jobMetric, ok := jobData[metric][scope]
|
||||
if !ok {
|
||||
jobMetric = &schema.JobMetric{
|
||||
Unit: mc.Unit,
|
||||
Scope: scope,
|
||||
Timestep: mc.Timestep,
|
||||
Series: make([]schema.Series, 0),
|
||||
}
|
||||
jobData[metric][scope] = jobMetric
|
||||
}
|
||||
|
||||
for _, res := range row {
|
||||
if res.Error != nil {
|
||||
errors = append(errors, fmt.Sprintf("failed to fetch '%s' from host '%s': %s", query.Metric, query.Hostname, *res.Error))
|
||||
continue
|
||||
}
|
||||
|
||||
id := (*int)(nil)
|
||||
if query.Type != nil {
|
||||
id = new(int)
|
||||
*id, err = strconv.Atoi(query.TypeIds[0])
|
||||
if err != nil || *query.Type == acceleratorString {
|
||||
*id, _ = topology.GetAcceleratorIndex(query.TypeIds[0])
|
||||
}
|
||||
}
|
||||
|
||||
if res.Avg.IsNaN() || res.Min.IsNaN() || res.Max.IsNaN() {
|
||||
// TODO: use schema.Float instead of float64?
|
||||
// This is done because regular float64 can not be JSONed when NaN.
|
||||
res.Avg = schema.Float(0)
|
||||
res.Min = schema.Float(0)
|
||||
res.Max = schema.Float(0)
|
||||
}
|
||||
|
||||
jobMetric.Series = append(jobMetric.Series, schema.Series{
|
||||
Hostname: query.Hostname,
|
||||
Id: id,
|
||||
Statistics: &schema.MetricStatistics{
|
||||
Avg: float64(res.Avg),
|
||||
Min: float64(res.Min),
|
||||
Max: float64(res.Max),
|
||||
},
|
||||
Data: res.Data,
|
||||
})
|
||||
}
|
||||
|
||||
// So that one can later check len(jobData):
|
||||
if len(jobMetric.Series) == 0 {
|
||||
delete(jobData[metric], scope)
|
||||
if len(jobData[metric]) == 0 {
|
||||
delete(jobData, metric)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if len(errors) != 0 {
|
||||
return jobData, fmt.Errorf("cc-metric-store: %s", strings.Join(errors, ", "))
|
||||
}
|
||||
|
||||
return jobData, nil
|
||||
}
|
||||
|
||||
var (
|
||||
hwthreadString = string("cpu") // TODO/FIXME: inconsistency between cc-metric-collector and ClusterCockpit
|
||||
coreString = string(schema.MetricScopeCore)
|
||||
memoryDomainString = string(schema.MetricScopeMemoryDomain)
|
||||
socketString = string(schema.MetricScopeSocket)
|
||||
acceleratorString = string(schema.MetricScopeAccelerator)
|
||||
)
|
||||
|
||||
func (ccms *CCMetricStore) buildQueries(job *schema.Job, metrics []string, scopes []schema.MetricScope) ([]ApiQuery, []schema.MetricScope, error) {
|
||||
queries := make([]ApiQuery, 0, len(metrics)*len(scopes)*len(job.Resources))
|
||||
topology := config.GetSubCluster(job.Cluster, job.SubCluster).Topology
|
||||
assignedScope := []schema.MetricScope{}
|
||||
|
||||
for _, metric := range metrics {
|
||||
remoteName := ccms.toRemoteName(metric)
|
||||
mc := config.GetMetricConfig(job.Cluster, metric)
|
||||
if mc == nil {
|
||||
// return nil, fmt.Errorf("metric '%s' is not specified for cluster '%s'", metric, job.Cluster)
|
||||
// log.Printf("metric '%s' is not specified for cluster '%s'", metric, job.Cluster)
|
||||
continue
|
||||
}
|
||||
|
||||
// Avoid duplicates...
|
||||
handledScopes := make([]schema.MetricScope, 0, 3)
|
||||
|
||||
scopesLoop:
|
||||
for _, requestedScope := range scopes {
|
||||
nativeScope := mc.Scope
|
||||
scope := nativeScope.Max(requestedScope)
|
||||
for _, s := range handledScopes {
|
||||
if scope == s {
|
||||
continue scopesLoop
|
||||
}
|
||||
}
|
||||
handledScopes = append(handledScopes, scope)
|
||||
|
||||
for _, host := range job.Resources {
|
||||
hwthreads := host.HWThreads
|
||||
if hwthreads == nil {
|
||||
hwthreads = topology.Node
|
||||
}
|
||||
|
||||
// Accelerator -> Accelerator (Use "accelerator" scope if requested scope is lower than node)
|
||||
if nativeScope == schema.MetricScopeAccelerator && scope.LT(schema.MetricScopeNode) {
|
||||
queries = append(queries, ApiQuery{
|
||||
Metric: remoteName,
|
||||
Hostname: host.Hostname,
|
||||
Aggregate: false,
|
||||
Type: &acceleratorString,
|
||||
TypeIds: host.Accelerators,
|
||||
})
|
||||
assignedScope = append(assignedScope, schema.MetricScopeAccelerator)
|
||||
continue
|
||||
}
|
||||
|
||||
// Accelerator -> Node
|
||||
if nativeScope == schema.MetricScopeAccelerator && scope == schema.MetricScopeNode {
|
||||
if len(host.Accelerators) == 0 {
|
||||
continue
|
||||
}
|
||||
|
||||
queries = append(queries, ApiQuery{
|
||||
Metric: remoteName,
|
||||
Hostname: host.Hostname,
|
||||
Aggregate: true,
|
||||
Type: &acceleratorString,
|
||||
TypeIds: host.Accelerators,
|
||||
})
|
||||
assignedScope = append(assignedScope, scope)
|
||||
continue
|
||||
}
|
||||
|
||||
// HWThread -> HWThead
|
||||
if nativeScope == schema.MetricScopeHWThread && scope == schema.MetricScopeHWThread {
|
||||
queries = append(queries, ApiQuery{
|
||||
Metric: remoteName,
|
||||
Hostname: host.Hostname,
|
||||
Aggregate: false,
|
||||
Type: &hwthreadString,
|
||||
TypeIds: intToStringSlice(hwthreads),
|
||||
})
|
||||
assignedScope = append(assignedScope, scope)
|
||||
continue
|
||||
}
|
||||
|
||||
// HWThread -> Core
|
||||
if nativeScope == schema.MetricScopeHWThread && scope == schema.MetricScopeCore {
|
||||
cores, _ := topology.GetCoresFromHWThreads(hwthreads)
|
||||
for _, core := range cores {
|
||||
queries = append(queries, ApiQuery{
|
||||
Metric: remoteName,
|
||||
Hostname: host.Hostname,
|
||||
Aggregate: true,
|
||||
Type: &hwthreadString,
|
||||
TypeIds: intToStringSlice(topology.Core[core]),
|
||||
})
|
||||
assignedScope = append(assignedScope, scope)
|
||||
}
|
||||
continue
|
||||
}
|
||||
|
||||
// HWThread -> Socket
|
||||
if nativeScope == schema.MetricScopeHWThread && scope == schema.MetricScopeSocket {
|
||||
sockets, _ := topology.GetSocketsFromHWThreads(hwthreads)
|
||||
for _, socket := range sockets {
|
||||
queries = append(queries, ApiQuery{
|
||||
Metric: remoteName,
|
||||
Hostname: host.Hostname,
|
||||
Aggregate: true,
|
||||
Type: &hwthreadString,
|
||||
TypeIds: intToStringSlice(topology.Socket[socket]),
|
||||
})
|
||||
assignedScope = append(assignedScope, scope)
|
||||
}
|
||||
continue
|
||||
}
|
||||
|
||||
// HWThread -> Node
|
||||
if nativeScope == schema.MetricScopeHWThread && scope == schema.MetricScopeNode {
|
||||
queries = append(queries, ApiQuery{
|
||||
Metric: remoteName,
|
||||
Hostname: host.Hostname,
|
||||
Aggregate: true,
|
||||
Type: &hwthreadString,
|
||||
TypeIds: intToStringSlice(hwthreads),
|
||||
})
|
||||
assignedScope = append(assignedScope, scope)
|
||||
continue
|
||||
}
|
||||
|
||||
// Core -> Core
|
||||
if nativeScope == schema.MetricScopeCore && scope == schema.MetricScopeCore {
|
||||
cores, _ := topology.GetCoresFromHWThreads(hwthreads)
|
||||
queries = append(queries, ApiQuery{
|
||||
Metric: remoteName,
|
||||
Hostname: host.Hostname,
|
||||
Aggregate: false,
|
||||
Type: &coreString,
|
||||
TypeIds: intToStringSlice(cores),
|
||||
})
|
||||
assignedScope = append(assignedScope, scope)
|
||||
continue
|
||||
}
|
||||
|
||||
// Core -> Node
|
||||
if nativeScope == schema.MetricScopeCore && scope == schema.MetricScopeNode {
|
||||
cores, _ := topology.GetCoresFromHWThreads(hwthreads)
|
||||
queries = append(queries, ApiQuery{
|
||||
Metric: remoteName,
|
||||
Hostname: host.Hostname,
|
||||
Aggregate: true,
|
||||
Type: &coreString,
|
||||
TypeIds: intToStringSlice(cores),
|
||||
})
|
||||
assignedScope = append(assignedScope, scope)
|
||||
continue
|
||||
}
|
||||
|
||||
// MemoryDomain -> MemoryDomain
|
||||
if nativeScope == schema.MetricScopeMemoryDomain && scope == schema.MetricScopeMemoryDomain {
|
||||
sockets, _ := topology.GetMemoryDomainsFromHWThreads(hwthreads)
|
||||
queries = append(queries, ApiQuery{
|
||||
Metric: remoteName,
|
||||
Hostname: host.Hostname,
|
||||
Aggregate: false,
|
||||
Type: &memoryDomainString,
|
||||
TypeIds: intToStringSlice(sockets),
|
||||
})
|
||||
assignedScope = append(assignedScope, scope)
|
||||
continue
|
||||
}
|
||||
|
||||
// MemoryDoman -> Node
|
||||
if nativeScope == schema.MetricScopeMemoryDomain && scope == schema.MetricScopeNode {
|
||||
sockets, _ := topology.GetMemoryDomainsFromHWThreads(hwthreads)
|
||||
queries = append(queries, ApiQuery{
|
||||
Metric: remoteName,
|
||||
Hostname: host.Hostname,
|
||||
Aggregate: true,
|
||||
Type: &memoryDomainString,
|
||||
TypeIds: intToStringSlice(sockets),
|
||||
})
|
||||
assignedScope = append(assignedScope, scope)
|
||||
continue
|
||||
}
|
||||
|
||||
// Socket -> Socket
|
||||
if nativeScope == schema.MetricScopeSocket && scope == schema.MetricScopeSocket {
|
||||
sockets, _ := topology.GetSocketsFromHWThreads(hwthreads)
|
||||
queries = append(queries, ApiQuery{
|
||||
Metric: remoteName,
|
||||
Hostname: host.Hostname,
|
||||
Aggregate: false,
|
||||
Type: &socketString,
|
||||
TypeIds: intToStringSlice(sockets),
|
||||
})
|
||||
assignedScope = append(assignedScope, scope)
|
||||
continue
|
||||
}
|
||||
|
||||
// Socket -> Node
|
||||
if nativeScope == schema.MetricScopeSocket && scope == schema.MetricScopeNode {
|
||||
sockets, _ := topology.GetSocketsFromHWThreads(hwthreads)
|
||||
queries = append(queries, ApiQuery{
|
||||
Metric: remoteName,
|
||||
Hostname: host.Hostname,
|
||||
Aggregate: true,
|
||||
Type: &socketString,
|
||||
TypeIds: intToStringSlice(sockets),
|
||||
})
|
||||
assignedScope = append(assignedScope, scope)
|
||||
continue
|
||||
}
|
||||
|
||||
// Node -> Node
|
||||
if nativeScope == schema.MetricScopeNode && scope == schema.MetricScopeNode {
|
||||
queries = append(queries, ApiQuery{
|
||||
Metric: remoteName,
|
||||
Hostname: host.Hostname,
|
||||
})
|
||||
assignedScope = append(assignedScope, scope)
|
||||
continue
|
||||
}
|
||||
|
||||
return nil, nil, fmt.Errorf("TODO: unhandled case: native-scope=%s, requested-scope=%s", nativeScope, requestedScope)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return queries, assignedScope, nil
|
||||
}
|
||||
|
||||
func (ccms *CCMetricStore) LoadStats(job *schema.Job, metrics []string, ctx context.Context) (map[string]map[string]schema.MetricStatistics, error) {
|
||||
queries, _, err := ccms.buildQueries(job, metrics, []schema.MetricScope{schema.MetricScopeNode})
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
req := ApiQueryRequest{
|
||||
Cluster: job.Cluster,
|
||||
From: job.StartTime.Unix(),
|
||||
To: job.StartTime.Add(time.Duration(job.Duration) * time.Second).Unix(),
|
||||
Queries: queries,
|
||||
WithStats: true,
|
||||
WithData: false,
|
||||
}
|
||||
|
||||
resBody, err := ccms.doRequest(ctx, &req)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
stats := make(map[string]map[string]schema.MetricStatistics, len(metrics))
|
||||
for i, res := range resBody.Results {
|
||||
query := req.Queries[i]
|
||||
metric := ccms.toLocalName(query.Metric)
|
||||
data := res[0]
|
||||
if data.Error != nil {
|
||||
return nil, fmt.Errorf("fetching %s for node %s failed: %s", metric, query.Hostname, *data.Error)
|
||||
}
|
||||
|
||||
metricdata, ok := stats[metric]
|
||||
if !ok {
|
||||
metricdata = make(map[string]schema.MetricStatistics, job.NumNodes)
|
||||
stats[metric] = metricdata
|
||||
}
|
||||
|
||||
if data.Avg.IsNaN() || data.Min.IsNaN() || data.Max.IsNaN() {
|
||||
return nil, fmt.Errorf("fetching %s for node %s failed: %s", metric, query.Hostname, "avg/min/max is NaN")
|
||||
}
|
||||
|
||||
metricdata[query.Hostname] = schema.MetricStatistics{
|
||||
Avg: float64(data.Avg),
|
||||
Min: float64(data.Min),
|
||||
Max: float64(data.Max),
|
||||
}
|
||||
}
|
||||
|
||||
return stats, nil
|
||||
}
|
||||
|
||||
// TODO: Support sub-node-scope metrics! For this, the partition of a node needs to be known!
|
||||
func (ccms *CCMetricStore) LoadNodeData(cluster string, metrics, nodes []string, scopes []schema.MetricScope, from, to time.Time, ctx context.Context) (map[string]map[string][]*schema.JobMetric, error) {
|
||||
req := ApiQueryRequest{
|
||||
Cluster: cluster,
|
||||
From: from.Unix(),
|
||||
To: to.Unix(),
|
||||
WithStats: true,
|
||||
WithData: true,
|
||||
}
|
||||
|
||||
if nodes == nil {
|
||||
for _, metric := range metrics {
|
||||
req.ForAllNodes = append(req.ForAllNodes, ccms.toRemoteName(metric))
|
||||
}
|
||||
} else {
|
||||
for _, node := range nodes {
|
||||
for _, metric := range metrics {
|
||||
req.Queries = append(req.Queries, ApiQuery{
|
||||
Hostname: node,
|
||||
Metric: ccms.toRemoteName(metric),
|
||||
})
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
resBody, err := ccms.doRequest(ctx, &req)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
var errors []string
|
||||
data := make(map[string]map[string][]*schema.JobMetric)
|
||||
for i, res := range resBody.Results {
|
||||
var query ApiQuery
|
||||
if resBody.Queries != nil {
|
||||
query = resBody.Queries[i]
|
||||
} else {
|
||||
query = req.Queries[i]
|
||||
}
|
||||
|
||||
metric := ccms.toLocalName(query.Metric)
|
||||
qdata := res[0]
|
||||
if qdata.Error != nil {
|
||||
errors = append(errors, fmt.Sprintf("fetching %s for node %s failed: %s", metric, query.Hostname, *qdata.Error))
|
||||
}
|
||||
|
||||
if qdata.Avg.IsNaN() || qdata.Min.IsNaN() || qdata.Max.IsNaN() {
|
||||
// return nil, fmt.Errorf("fetching %s for node %s failed: %s", metric, query.Hostname, "avg/min/max is NaN")
|
||||
qdata.Avg, qdata.Min, qdata.Max = 0., 0., 0.
|
||||
}
|
||||
|
||||
hostdata, ok := data[query.Hostname]
|
||||
if !ok {
|
||||
hostdata = make(map[string][]*schema.JobMetric)
|
||||
data[query.Hostname] = hostdata
|
||||
}
|
||||
|
||||
mc := config.GetMetricConfig(cluster, metric)
|
||||
hostdata[metric] = append(hostdata[metric], &schema.JobMetric{
|
||||
Unit: mc.Unit,
|
||||
Scope: schema.MetricScopeNode,
|
||||
Timestep: mc.Timestep,
|
||||
Series: []schema.Series{
|
||||
{
|
||||
Hostname: query.Hostname,
|
||||
Data: qdata.Data,
|
||||
Statistics: &schema.MetricStatistics{
|
||||
Avg: float64(qdata.Avg),
|
||||
Min: float64(qdata.Min),
|
||||
Max: float64(qdata.Max),
|
||||
},
|
||||
},
|
||||
},
|
||||
})
|
||||
}
|
||||
|
||||
if len(errors) != 0 {
|
||||
return data, fmt.Errorf("cc-metric-store: %s", strings.Join(errors, ", "))
|
||||
}
|
||||
|
||||
return data, nil
|
||||
}
|
||||
|
||||
// intToStringSlice returns the base-10 string form of every element of is,
// in the same order. The result is never nil and has len(is) elements.
func intToStringSlice(is []int) []string {
	out := make([]string, 0, len(is))
	for _, v := range is {
		out = append(out, strconv.Itoa(v))
	}
	return out
}
|
308
internal/metricdata/influxdb-v2.go
Normal file
308
internal/metricdata/influxdb-v2.go
Normal file
@@ -0,0 +1,308 @@
|
||||
package metricdata
|
||||
|
||||
import (
|
||||
"context"
|
||||
"crypto/tls"
|
||||
"encoding/json"
|
||||
"errors"
|
||||
"fmt"
|
||||
"log"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"github.com/ClusterCockpit/cc-backend/internal/config"
|
||||
"github.com/ClusterCockpit/cc-backend/pkg/schema"
|
||||
influxdb2 "github.com/influxdata/influxdb-client-go/v2"
|
||||
influxdb2Api "github.com/influxdata/influxdb-client-go/v2/api"
|
||||
)
|
||||
|
||||
// InfluxDBv2DataRepositoryConfig is the JSON configuration decoded by
// InfluxDBv2DataRepository.Init.
type InfluxDBv2DataRepositoryConfig struct {
	// Url is the server URL handed to the InfluxDB client.
	Url string `json:"url"`
	// Token is the token handed to the InfluxDB client.
	Token string `json:"token"`
	// Bucket is the bucket all Flux queries of the repository read from.
	Bucket string `json:"bucket"`
	// Org is the organization the query API is created for.
	Org string `json:"org"`
	// SkipTls is copied into tls.Config.InsecureSkipVerify of the client.
	SkipTls bool `json:"skiptls"`
}
|
||||
|
||||
type InfluxDBv2DataRepository struct {
|
||||
client influxdb2.Client
|
||||
queryClient influxdb2Api.QueryAPI
|
||||
bucket, measurement string
|
||||
}
|
||||
|
||||
func (idb *InfluxDBv2DataRepository) Init(rawConfig json.RawMessage) error {
|
||||
var config InfluxDBv2DataRepositoryConfig
|
||||
if err := json.Unmarshal(rawConfig, &config); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
idb.client = influxdb2.NewClientWithOptions(config.Url, config.Token, influxdb2.DefaultOptions().SetTLSConfig(&tls.Config{InsecureSkipVerify: config.SkipTls}))
|
||||
idb.queryClient = idb.client.QueryAPI(config.Org)
|
||||
idb.bucket = config.Bucket
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func (idb *InfluxDBv2DataRepository) formatTime(t time.Time) string {
|
||||
return t.Format(time.RFC3339) // Like “2006-01-02T15:04:05Z07:00”
|
||||
}
|
||||
|
||||
func (idb *InfluxDBv2DataRepository) epochToTime(epoch int64) time.Time {
|
||||
return time.Unix(epoch, 0)
|
||||
}
|
||||
|
||||
func (idb *InfluxDBv2DataRepository) LoadData(job *schema.Job, metrics []string, scopes []schema.MetricScope, ctx context.Context) (schema.JobData, error) {
|
||||
|
||||
measurementsConds := make([]string, 0, len(metrics))
|
||||
for _, m := range metrics {
|
||||
measurementsConds = append(measurementsConds, fmt.Sprintf(`r["_measurement"] == "%s"`, m))
|
||||
}
|
||||
measurementsCond := strings.Join(measurementsConds, " or ")
|
||||
|
||||
hostsConds := make([]string, 0, len(job.Resources))
|
||||
for _, h := range job.Resources {
|
||||
if h.HWThreads != nil || h.Accelerators != nil {
|
||||
// TODO
|
||||
return nil, errors.New("the InfluxDB metric data repository does not yet support HWThreads or Accelerators")
|
||||
}
|
||||
hostsConds = append(hostsConds, fmt.Sprintf(`r["hostname"] == "%s"`, h.Hostname))
|
||||
}
|
||||
hostsCond := strings.Join(hostsConds, " or ")
|
||||
|
||||
jobData := make(schema.JobData) // Empty Schema: map[<string>FIELD]map[<MetricScope>SCOPE]<*JobMetric>METRIC
|
||||
// Requested Scopes
|
||||
for _, scope := range scopes {
|
||||
query := ""
|
||||
switch scope {
|
||||
case "node":
|
||||
// Get Finest Granularity, Groupy By Measurement and Hostname (== Metric / Node), Calculate Mean for 60s windows
|
||||
// log.Println("Note: Scope 'node' requested. ")
|
||||
query = fmt.Sprintf(`
|
||||
from(bucket: "%s")
|
||||
|> range(start: %s, stop: %s)
|
||||
|> filter(fn: (r) => (%s) and (%s) )
|
||||
|> drop(columns: ["_start", "_stop"])
|
||||
|> group(columns: ["hostname", "_measurement"])
|
||||
|> aggregateWindow(every: 60s, fn: mean)
|
||||
|> drop(columns: ["_time"])`,
|
||||
idb.bucket,
|
||||
idb.formatTime(job.StartTime), idb.formatTime(idb.epochToTime(job.StartTimeUnix+int64(job.Duration)+int64(1))),
|
||||
measurementsCond, hostsCond)
|
||||
case "socket":
|
||||
log.Println("Note: Scope 'socket' requested, but not yet supported: Will return 'node' scope only. ")
|
||||
continue
|
||||
case "core":
|
||||
log.Println("Note: Scope 'core' requested, but not yet supported: Will return 'node' scope only. ")
|
||||
continue
|
||||
// Get Finest Granularity only, Set NULL to 0.0
|
||||
// query = fmt.Sprintf(`
|
||||
// from(bucket: "%s")
|
||||
// |> range(start: %s, stop: %s)
|
||||
// |> filter(fn: (r) => %s )
|
||||
// |> filter(fn: (r) => %s )
|
||||
// |> drop(columns: ["_start", "_stop", "cluster"])
|
||||
// |> map(fn: (r) => (if exists r._value then {r with _value: r._value} else {r with _value: 0.0}))`,
|
||||
// idb.bucket,
|
||||
// idb.formatTime(job.StartTime), idb.formatTime(idb.epochToTime(job.StartTimeUnix + int64(job.Duration) + int64(1) )),
|
||||
// measurementsCond, hostsCond)
|
||||
default:
|
||||
log.Println("Note: Unknown Scope requested: Will return 'node' scope. ")
|
||||
continue
|
||||
// return nil, errors.New("the InfluxDB metric data repository does not yet support other scopes than 'node'")
|
||||
}
|
||||
|
||||
rows, err := idb.queryClient.Query(ctx, query)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
// Init Metrics: Only Node level now -> TODO: Matching /check on scope level ...
|
||||
for _, metric := range metrics {
|
||||
jobMetric, ok := jobData[metric]
|
||||
if !ok {
|
||||
mc := config.GetMetricConfig(job.Cluster, metric)
|
||||
jobMetric = map[schema.MetricScope]*schema.JobMetric{
|
||||
scope: { // uses scope var from above!
|
||||
Unit: mc.Unit,
|
||||
Scope: scope,
|
||||
Timestep: mc.Timestep,
|
||||
Series: make([]schema.Series, 0, len(job.Resources)),
|
||||
StatisticsSeries: nil, // Should be: &schema.StatsSeries{},
|
||||
},
|
||||
}
|
||||
}
|
||||
jobData[metric] = jobMetric
|
||||
}
|
||||
|
||||
// Process Result: Time-Data
|
||||
field, host, hostSeries := "", "", schema.Series{}
|
||||
// typeId := 0
|
||||
switch scope {
|
||||
case "node":
|
||||
for rows.Next() {
|
||||
row := rows.Record()
|
||||
if host == "" || host != row.ValueByKey("hostname").(string) || rows.TableChanged() {
|
||||
if host != "" {
|
||||
// Append Series before reset
|
||||
jobData[field][scope].Series = append(jobData[field][scope].Series, hostSeries)
|
||||
}
|
||||
field, host = row.Measurement(), row.ValueByKey("hostname").(string)
|
||||
hostSeries = schema.Series{
|
||||
Hostname: host,
|
||||
Statistics: nil,
|
||||
Data: make([]schema.Float, 0),
|
||||
}
|
||||
}
|
||||
val, ok := row.Value().(float64)
|
||||
if ok {
|
||||
hostSeries.Data = append(hostSeries.Data, schema.Float(val))
|
||||
} else {
|
||||
hostSeries.Data = append(hostSeries.Data, schema.Float(0))
|
||||
}
|
||||
}
|
||||
case "socket":
|
||||
continue
|
||||
case "core":
|
||||
continue
|
||||
// Include Series.Id in hostSeries
|
||||
// for rows.Next() {
|
||||
// row := rows.Record()
|
||||
// if ( host == "" || host != row.ValueByKey("hostname").(string) || typeId != row.ValueByKey("type-id").(int) || rows.TableChanged() ) {
|
||||
// if ( host != "" ) {
|
||||
// // Append Series before reset
|
||||
// jobData[field][scope].Series = append(jobData[field][scope].Series, hostSeries)
|
||||
// }
|
||||
// field, host, typeId = row.Measurement(), row.ValueByKey("hostname").(string), row.ValueByKey("type-id").(int)
|
||||
// hostSeries = schema.Series{
|
||||
// Hostname: host,
|
||||
// Id: &typeId,
|
||||
// Statistics: nil,
|
||||
// Data: make([]schema.Float, 0),
|
||||
// }
|
||||
// }
|
||||
// val := row.Value().(float64)
|
||||
// hostSeries.Data = append(hostSeries.Data, schema.Float(val))
|
||||
// }
|
||||
default:
|
||||
continue
|
||||
// return nil, errors.New("the InfluxDB metric data repository does not yet support other scopes than 'node, core'")
|
||||
}
|
||||
// Append last Series
|
||||
jobData[field][scope].Series = append(jobData[field][scope].Series, hostSeries)
|
||||
}
|
||||
|
||||
// Get Stats
|
||||
stats, err := idb.LoadStats(job, metrics, ctx)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
for _, scope := range scopes {
|
||||
if scope == "node" { // No 'socket/core' support yet
|
||||
for metric, nodes := range stats {
|
||||
// log.Println(fmt.Sprintf("<< Add Stats for : Field %s >>", metric))
|
||||
for node, stats := range nodes {
|
||||
// log.Println(fmt.Sprintf("<< Add Stats for : Host %s : Min %.2f, Max %.2f, Avg %.2f >>", node, stats.Min, stats.Max, stats.Avg ))
|
||||
for index, _ := range jobData[metric][scope].Series {
|
||||
// log.Println(fmt.Sprintf("<< Try to add Stats to Series in Position %d >>", index))
|
||||
if jobData[metric][scope].Series[index].Hostname == node {
|
||||
// log.Println(fmt.Sprintf("<< Match for Series in Position %d : Host %s >>", index, jobData[metric][scope].Series[index].Hostname))
|
||||
jobData[metric][scope].Series[index].Statistics = &schema.MetricStatistics{Avg: stats.Avg, Min: stats.Min, Max: stats.Max}
|
||||
// log.Println(fmt.Sprintf("<< Result Inner: Min %.2f, Max %.2f, Avg %.2f >>", jobData[metric][scope].Series[index].Statistics.Min, jobData[metric][scope].Series[index].Statistics.Max, jobData[metric][scope].Series[index].Statistics.Avg))
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// DEBUG:
|
||||
// for _, scope := range scopes {
|
||||
// for _, met := range metrics {
|
||||
// for _, series := range jobData[met][scope].Series {
|
||||
// log.Println(fmt.Sprintf("<< Result: %d data points for metric %s on %s with scope %s, Stats: Min %.2f, Max %.2f, Avg %.2f >>",
|
||||
// len(series.Data), met, series.Hostname, scope,
|
||||
// series.Statistics.Min, series.Statistics.Max, series.Statistics.Avg))
|
||||
// }
|
||||
// }
|
||||
// }
|
||||
|
||||
return jobData, nil
|
||||
}
|
||||
|
||||
func (idb *InfluxDBv2DataRepository) LoadStats(job *schema.Job, metrics []string, ctx context.Context) (map[string]map[string]schema.MetricStatistics, error) {
|
||||
|
||||
stats := map[string]map[string]schema.MetricStatistics{}
|
||||
|
||||
hostsConds := make([]string, 0, len(job.Resources))
|
||||
for _, h := range job.Resources {
|
||||
if h.HWThreads != nil || h.Accelerators != nil {
|
||||
// TODO
|
||||
return nil, errors.New("the InfluxDB metric data repository does not yet support HWThreads or Accelerators")
|
||||
}
|
||||
hostsConds = append(hostsConds, fmt.Sprintf(`r["hostname"] == "%s"`, h.Hostname))
|
||||
}
|
||||
hostsCond := strings.Join(hostsConds, " or ")
|
||||
|
||||
// lenMet := len(metrics)
|
||||
|
||||
for _, metric := range metrics {
|
||||
// log.Println(fmt.Sprintf("<< You are here: %s (Index %d of %d metrics)", metric, index, lenMet))
|
||||
|
||||
query := fmt.Sprintf(`
|
||||
data = from(bucket: "%s")
|
||||
|> range(start: %s, stop: %s)
|
||||
|> filter(fn: (r) => r._measurement == "%s" and r._field == "value" and (%s))
|
||||
union(tables: [data |> mean(column: "_value") |> set(key: "_field", value: "avg"),
|
||||
data |> min(column: "_value") |> set(key: "_field", value: "min"),
|
||||
data |> max(column: "_value") |> set(key: "_field", value: "max")])
|
||||
|> pivot(rowKey: ["hostname"], columnKey: ["_field"], valueColumn: "_value")
|
||||
|> group()`,
|
||||
idb.bucket,
|
||||
idb.formatTime(job.StartTime), idb.formatTime(idb.epochToTime(job.StartTimeUnix+int64(job.Duration)+int64(1))),
|
||||
metric, hostsCond)
|
||||
|
||||
rows, err := idb.queryClient.Query(ctx, query)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
nodes := map[string]schema.MetricStatistics{}
|
||||
for rows.Next() {
|
||||
row := rows.Record()
|
||||
host := row.ValueByKey("hostname").(string)
|
||||
|
||||
avg, avgok := row.ValueByKey("avg").(float64)
|
||||
if !avgok {
|
||||
// log.Println(fmt.Sprintf(">> Assertion error for metric %s, statistic AVG. Expected 'float64', got %v", metric, avg))
|
||||
avg = 0.0
|
||||
}
|
||||
min, minok := row.ValueByKey("min").(float64)
|
||||
if !minok {
|
||||
// log.Println(fmt.Sprintf(">> Assertion error for metric %s, statistic MIN. Expected 'float64', got %v", metric, min))
|
||||
min = 0.0
|
||||
}
|
||||
max, maxok := row.ValueByKey("max").(float64)
|
||||
if !maxok {
|
||||
// log.Println(fmt.Sprintf(">> Assertion error for metric %s, statistic MAX. Expected 'float64', got %v", metric, max))
|
||||
max = 0.0
|
||||
}
|
||||
|
||||
nodes[host] = schema.MetricStatistics{
|
||||
Avg: avg,
|
||||
Min: min,
|
||||
Max: max,
|
||||
}
|
||||
}
|
||||
stats[metric] = nodes
|
||||
}
|
||||
|
||||
return stats, nil
|
||||
}
|
||||
|
||||
func (idb *InfluxDBv2DataRepository) LoadNodeData(cluster string, metrics, nodes []string, scopes []schema.MetricScope, from, to time.Time, ctx context.Context) (map[string]map[string][]*schema.JobMetric, error) {
|
||||
// TODO : Implement to be used in Analysis- und System/Node-View
|
||||
log.Println(fmt.Sprintf("LoadNodeData unimplemented for InfluxDBv2DataRepository, Args: cluster %s, metrics %v, nodes %v, scopes %v", cluster, metrics, nodes, scopes))
|
||||
|
||||
return nil, errors.New("unimplemented for InfluxDBv2DataRepository")
|
||||
}
|
254
internal/metricdata/metricdata.go
Normal file
254
internal/metricdata/metricdata.go
Normal file
@@ -0,0 +1,254 @@
|
||||
package metricdata
|
||||
|
||||
import (
|
||||
"context"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"time"
|
||||
|
||||
"github.com/ClusterCockpit/cc-backend/internal/config"
|
||||
"github.com/ClusterCockpit/cc-backend/pkg/log"
|
||||
"github.com/ClusterCockpit/cc-backend/pkg/schema"
|
||||
"github.com/iamlouk/lrucache"
|
||||
)
|
||||
|
||||
type MetricDataRepository interface {
|
||||
// Initialize this MetricDataRepository. One instance of
|
||||
// this interface will only ever be responsible for one cluster.
|
||||
Init(rawConfig json.RawMessage) error
|
||||
|
||||
// Return the JobData for the given job, only with the requested metrics.
|
||||
LoadData(job *schema.Job, metrics []string, scopes []schema.MetricScope, ctx context.Context) (schema.JobData, error)
|
||||
|
||||
// Return a map of metrics to a map of nodes to the metric statistics of the job. node scope assumed for now.
|
||||
LoadStats(job *schema.Job, metrics []string, ctx context.Context) (map[string]map[string]schema.MetricStatistics, error)
|
||||
|
||||
// Return a map of hosts to a map of metrics at the requested scopes for that node.
|
||||
LoadNodeData(cluster string, metrics, nodes []string, scopes []schema.MetricScope, from, to time.Time, ctx context.Context) (map[string]map[string][]*schema.JobMetric, error)
|
||||
}
|
||||
|
||||
var metricDataRepos map[string]MetricDataRepository = map[string]MetricDataRepository{}
|
||||
|
||||
var JobArchivePath string
|
||||
|
||||
var useArchive bool
|
||||
|
||||
func Init(jobArchivePath string, disableArchive bool) error {
|
||||
useArchive = !disableArchive
|
||||
JobArchivePath = jobArchivePath
|
||||
for _, cluster := range config.Clusters {
|
||||
if cluster.MetricDataRepository != nil {
|
||||
var kind struct {
|
||||
Kind string `json:"kind"`
|
||||
}
|
||||
if err := json.Unmarshal(cluster.MetricDataRepository, &kind); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
var mdr MetricDataRepository
|
||||
switch kind.Kind {
|
||||
case "cc-metric-store":
|
||||
mdr = &CCMetricStore{}
|
||||
case "influxdb":
|
||||
mdr = &InfluxDBv2DataRepository{}
|
||||
case "test":
|
||||
mdr = &TestMetricDataRepository{}
|
||||
default:
|
||||
return fmt.Errorf("unkown metric data repository '%s' for cluster '%s'", kind.Kind, cluster.Name)
|
||||
}
|
||||
|
||||
if err := mdr.Init(cluster.MetricDataRepository); err != nil {
|
||||
return err
|
||||
}
|
||||
metricDataRepos[cluster.Name] = mdr
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// cache is the package-wide LRU cache used by LoadData. It is created with a
// limit of 128 * 1024 * 1024 (presumably bytes; confirm against lrucache.New).
var cache = lrucache.New(128 << 20)
|
||||
|
||||
// Fetches the metric data for a job.
|
||||
func LoadData(job *schema.Job, metrics []string, scopes []schema.MetricScope, ctx context.Context) (schema.JobData, error) {
|
||||
data := cache.Get(cacheKey(job, metrics, scopes), func() (_ interface{}, ttl time.Duration, size int) {
|
||||
var jd schema.JobData
|
||||
var err error
|
||||
if job.State == schema.JobStateRunning ||
|
||||
job.MonitoringStatus == schema.MonitoringStatusRunningOrArchiving ||
|
||||
!useArchive {
|
||||
repo, ok := metricDataRepos[job.Cluster]
|
||||
if !ok {
|
||||
return fmt.Errorf("no metric data repository configured for '%s'", job.Cluster), 0, 0
|
||||
}
|
||||
|
||||
if scopes == nil {
|
||||
scopes = append(scopes, schema.MetricScopeNode)
|
||||
}
|
||||
|
||||
if metrics == nil {
|
||||
cluster := config.GetCluster(job.Cluster)
|
||||
for _, mc := range cluster.MetricConfig {
|
||||
metrics = append(metrics, mc.Name)
|
||||
}
|
||||
}
|
||||
|
||||
jd, err = repo.LoadData(job, metrics, scopes, ctx)
|
||||
if err != nil {
|
||||
if len(jd) != 0 {
|
||||
log.Errorf("partial error: %s", err.Error())
|
||||
} else {
|
||||
return err, 0, 0
|
||||
}
|
||||
}
|
||||
size = jd.Size()
|
||||
} else {
|
||||
jd, err = loadFromArchive(job)
|
||||
if err != nil {
|
||||
return err, 0, 0
|
||||
}
|
||||
|
||||
// Avoid sending unrequested data to the client:
|
||||
if metrics != nil || scopes != nil {
|
||||
if metrics == nil {
|
||||
metrics = make([]string, 0, len(jd))
|
||||
for k := range jd {
|
||||
metrics = append(metrics, k)
|
||||
}
|
||||
}
|
||||
|
||||
res := schema.JobData{}
|
||||
for _, metric := range metrics {
|
||||
if perscope, ok := jd[metric]; ok {
|
||||
if len(perscope) > 1 {
|
||||
subset := make(map[schema.MetricScope]*schema.JobMetric)
|
||||
for _, scope := range scopes {
|
||||
if jm, ok := perscope[scope]; ok {
|
||||
subset[scope] = jm
|
||||
}
|
||||
}
|
||||
|
||||
if len(subset) > 0 {
|
||||
perscope = subset
|
||||
}
|
||||
}
|
||||
|
||||
res[metric] = perscope
|
||||
}
|
||||
}
|
||||
jd = res
|
||||
}
|
||||
size = 1 // loadFromArchive() caches in the same cache.
|
||||
}
|
||||
|
||||
ttl = 5 * time.Hour
|
||||
if job.State == schema.JobStateRunning {
|
||||
ttl = 2 * time.Minute
|
||||
}
|
||||
|
||||
prepareJobData(job, jd, scopes)
|
||||
return jd, ttl, size
|
||||
})
|
||||
|
||||
if err, ok := data.(error); ok {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
return data.(schema.JobData), nil
|
||||
}
|
||||
|
||||
// Used for the jobsFootprint GraphQL-Query. TODO: Rename/Generalize.
|
||||
func LoadAverages(job *schema.Job, metrics []string, data [][]schema.Float, ctx context.Context) error {
|
||||
if job.State != schema.JobStateRunning && useArchive {
|
||||
return loadAveragesFromArchive(job, metrics, data)
|
||||
}
|
||||
|
||||
repo, ok := metricDataRepos[job.Cluster]
|
||||
if !ok {
|
||||
return fmt.Errorf("no metric data repository configured for '%s'", job.Cluster)
|
||||
}
|
||||
|
||||
stats, err := repo.LoadStats(job, metrics, ctx)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
for i, m := range metrics {
|
||||
nodes, ok := stats[m]
|
||||
if !ok {
|
||||
data[i] = append(data[i], schema.NaN)
|
||||
continue
|
||||
}
|
||||
|
||||
sum := 0.0
|
||||
for _, node := range nodes {
|
||||
sum += node.Avg
|
||||
}
|
||||
data[i] = append(data[i], schema.Float(sum))
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
// Used for the node/system view. Returns a map of nodes to a map of metrics.
|
||||
func LoadNodeData(cluster string, metrics, nodes []string, scopes []schema.MetricScope, from, to time.Time, ctx context.Context) (map[string]map[string][]*schema.JobMetric, error) {
|
||||
repo, ok := metricDataRepos[cluster]
|
||||
if !ok {
|
||||
return nil, fmt.Errorf("no metric data repository configured for '%s'", cluster)
|
||||
}
|
||||
|
||||
if metrics == nil {
|
||||
for _, m := range config.GetCluster(cluster).MetricConfig {
|
||||
metrics = append(metrics, m.Name)
|
||||
}
|
||||
}
|
||||
|
||||
data, err := repo.LoadNodeData(cluster, metrics, nodes, scopes, from, to, ctx)
|
||||
if err != nil {
|
||||
if len(data) != 0 {
|
||||
log.Errorf("partial error: %s", err.Error())
|
||||
} else {
|
||||
return nil, err
|
||||
}
|
||||
}
|
||||
|
||||
if data == nil {
|
||||
return nil, fmt.Errorf("the metric data repository for '%s' does not support this query", cluster)
|
||||
}
|
||||
|
||||
return data, nil
|
||||
}
|
||||
|
||||
func cacheKey(job *schema.Job, metrics []string, scopes []schema.MetricScope) string {
|
||||
// Duration and StartTime do not need to be in the cache key as StartTime is less unique than
|
||||
// job.ID and the TTL of the cache entry makes sure it does not stay there forever.
|
||||
return fmt.Sprintf("%d(%s):[%v],[%v]",
|
||||
job.ID, job.State, metrics, scopes)
|
||||
}
|
||||
|
||||
// For /monitoring/job/<job> and some other places, flops_any and mem_bw need to be available at the scope 'node'.
|
||||
// If a job has a lot of nodes, statisticsSeries should be available so that a min/mean/max Graph can be used instead of
|
||||
// a lot of single lines.
|
||||
func prepareJobData(job *schema.Job, jobData schema.JobData, scopes []schema.MetricScope) {
|
||||
const maxSeriesSize int = 15
|
||||
for _, scopes := range jobData {
|
||||
for _, jm := range scopes {
|
||||
if jm.StatisticsSeries != nil || len(jm.Series) <= maxSeriesSize {
|
||||
continue
|
||||
}
|
||||
|
||||
jm.AddStatisticsSeries()
|
||||
}
|
||||
}
|
||||
|
||||
nodeScopeRequested := false
|
||||
for _, scope := range scopes {
|
||||
if scope == schema.MetricScopeNode {
|
||||
nodeScopeRequested = true
|
||||
}
|
||||
}
|
||||
|
||||
if nodeScopeRequested {
|
||||
jobData.AddNodeScope("flops_any")
|
||||
jobData.AddNodeScope("mem_bw")
|
||||
}
|
||||
}
|
32
internal/metricdata/utils.go
Normal file
32
internal/metricdata/utils.go
Normal file
@@ -0,0 +1,32 @@
|
||||
package metricdata
|
||||
|
||||
import (
|
||||
"context"
|
||||
"encoding/json"
|
||||
"time"
|
||||
|
||||
"github.com/ClusterCockpit/cc-backend/pkg/schema"
|
||||
)
|
||||
|
||||
var TestLoadDataCallback func(job *schema.Job, metrics []string, scopes []schema.MetricScope, ctx context.Context) (schema.JobData, error) = func(job *schema.Job, metrics []string, scopes []schema.MetricScope, ctx context.Context) (schema.JobData, error) {
|
||||
panic("TODO")
|
||||
}
|
||||
|
||||
// Only a mock for unit-testing.
|
||||
type TestMetricDataRepository struct{}
|
||||
|
||||
func (tmdr *TestMetricDataRepository) Init(_ json.RawMessage) error {
|
||||
return nil
|
||||
}
|
||||
|
||||
func (tmdr *TestMetricDataRepository) LoadData(job *schema.Job, metrics []string, scopes []schema.MetricScope, ctx context.Context) (schema.JobData, error) {
|
||||
return TestLoadDataCallback(job, metrics, scopes, ctx)
|
||||
}
|
||||
|
||||
func (tmdr *TestMetricDataRepository) LoadStats(job *schema.Job, metrics []string, ctx context.Context) (map[string]map[string]schema.MetricStatistics, error) {
|
||||
panic("TODO")
|
||||
}
|
||||
|
||||
func (tmdr *TestMetricDataRepository) LoadNodeData(cluster string, metrics, nodes []string, scopes []schema.MetricScope, from, to time.Time, ctx context.Context) (map[string]map[string][]*schema.JobMetric, error) {
|
||||
panic("TODO")
|
||||
}
|
58
internal/repository/dbConnection.go
Normal file
58
internal/repository/dbConnection.go
Normal file
@@ -0,0 +1,58 @@
|
||||
package repository
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"log"
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
"github.com/jmoiron/sqlx"
|
||||
)
|
||||
|
||||
var (
|
||||
dbConnOnce sync.Once
|
||||
dbConnInstance *DBConnection
|
||||
)
|
||||
|
||||
type DBConnection struct {
|
||||
DB *sqlx.DB
|
||||
}
|
||||
|
||||
func Connect(driver string, db string) {
|
||||
var err error
|
||||
var dbHandle *sqlx.DB
|
||||
|
||||
dbConnOnce.Do(func() {
|
||||
if driver == "sqlite3" {
|
||||
dbHandle, err = sqlx.Open("sqlite3", fmt.Sprintf("%s?_foreign_keys=on", db))
|
||||
if err != nil {
|
||||
log.Fatal(err)
|
||||
}
|
||||
|
||||
// sqlite does not multithread. Having more than one connection open would just mean
|
||||
// waiting for locks.
|
||||
dbHandle.SetMaxOpenConns(1)
|
||||
} else if driver == "mysql" {
|
||||
dbHandle, err = sqlx.Open("mysql", fmt.Sprintf("%s?multiStatements=true", db))
|
||||
if err != nil {
|
||||
log.Fatal(err)
|
||||
}
|
||||
|
||||
dbHandle.SetConnMaxLifetime(time.Minute * 3)
|
||||
dbHandle.SetMaxOpenConns(10)
|
||||
dbHandle.SetMaxIdleConns(10)
|
||||
} else {
|
||||
log.Fatalf("unsupported database driver: %s", driver)
|
||||
}
|
||||
|
||||
dbConnInstance = &DBConnection{DB: dbHandle}
|
||||
})
|
||||
}
|
||||
|
||||
func GetConnection() *DBConnection {
|
||||
if dbConnInstance == nil {
|
||||
log.Fatalf("Database connection not initialized!")
|
||||
}
|
||||
|
||||
return dbConnInstance
|
||||
}
|
155
internal/repository/import.go
Normal file
155
internal/repository/import.go
Normal file
@@ -0,0 +1,155 @@
|
||||
package repository
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"database/sql"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"os"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"github.com/ClusterCockpit/cc-backend/internal/config"
|
||||
"github.com/ClusterCockpit/cc-backend/internal/metricdata"
|
||||
"github.com/ClusterCockpit/cc-backend/pkg/log"
|
||||
"github.com/ClusterCockpit/cc-backend/pkg/schema"
|
||||
)
|
||||
|
||||
// NamedJobInsert is the named-parameter INSERT statement for one row of the
// job table. ImportJob executes it with a schema.Job as the source of the
// parameter values. The partition column is quoted with backticks.
const NamedJobInsert string = `INSERT INTO job (
	job_id, user, project, cluster, subcluster, ` + "`partition`" + `, array_job_id, num_nodes, num_hwthreads, num_acc,
	exclusive, monitoring_status, smt, job_state, start_time, duration, walltime, resources, meta_data,
	mem_used_max, flops_any_avg, mem_bw_avg, load_avg, net_bw_avg, net_data_vol_total, file_bw_avg, file_data_vol_total
) VALUES (
	:job_id, :user, :project, :cluster, :subcluster, :partition, :array_job_id, :num_nodes, :num_hwthreads, :num_acc,
	:exclusive, :monitoring_status, :smt, :job_state, :start_time, :duration, :walltime, :resources, :meta_data,
	:mem_used_max, :flops_any_avg, :mem_bw_avg, :load_avg, :net_bw_avg, :net_data_vol_total, :file_bw_avg, :file_data_vol_total
);`
|
||||
|
||||
// Import all jobs specified as `<path-to-meta.json>:<path-to-data.json>,...`
|
||||
func (r *JobRepository) HandleImportFlag(flag string) error {
|
||||
for _, pair := range strings.Split(flag, ",") {
|
||||
files := strings.Split(pair, ":")
|
||||
if len(files) != 2 {
|
||||
return fmt.Errorf("invalid import flag format")
|
||||
}
|
||||
|
||||
raw, err := os.ReadFile(files[0])
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
dec := json.NewDecoder(bytes.NewReader(raw))
|
||||
dec.DisallowUnknownFields()
|
||||
jobMeta := schema.JobMeta{BaseJob: schema.JobDefaults}
|
||||
if err := dec.Decode(&jobMeta); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
raw, err = os.ReadFile(files[1])
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
dec = json.NewDecoder(bytes.NewReader(raw))
|
||||
dec.DisallowUnknownFields()
|
||||
jobData := schema.JobData{}
|
||||
if err := dec.Decode(&jobData); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
if err := r.ImportJob(&jobMeta, &jobData); err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func (r *JobRepository) ImportJob(jobMeta *schema.JobMeta, jobData *schema.JobData) (err error) {
|
||||
jobMeta.MonitoringStatus = schema.MonitoringStatusArchivingSuccessful
|
||||
if err := metricdata.ImportJob(jobMeta, jobData); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
if job, err := r.Find(&jobMeta.JobID, &jobMeta.Cluster, &jobMeta.StartTime); err != sql.ErrNoRows {
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
return fmt.Errorf("a job with that jobId, cluster and startTime does already exist (dbid: %d)", job.ID)
|
||||
}
|
||||
|
||||
job := schema.Job{
|
||||
BaseJob: jobMeta.BaseJob,
|
||||
StartTime: time.Unix(jobMeta.StartTime, 0),
|
||||
StartTimeUnix: jobMeta.StartTime,
|
||||
}
|
||||
|
||||
// TODO: Other metrics...
|
||||
job.FlopsAnyAvg = loadJobStat(jobMeta, "flops_any")
|
||||
job.MemBwAvg = loadJobStat(jobMeta, "mem_bw")
|
||||
job.NetBwAvg = loadJobStat(jobMeta, "net_bw")
|
||||
job.FileBwAvg = loadJobStat(jobMeta, "file_bw")
|
||||
job.RawResources, err = json.Marshal(job.Resources)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
job.RawMetaData, err = json.Marshal(job.MetaData)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
if err := SanityChecks(&job.BaseJob); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
res, err := r.DB.NamedExec(NamedJobInsert, job)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
id, err := res.LastInsertId()
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
for _, tag := range job.Tags {
|
||||
if _, err := r.AddTagOrCreate(id, tag.Type, tag.Name); err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
|
||||
log.Infof("Successfully imported a new job (jobId: %d, cluster: %s, dbid: %d)", job.JobID, job.Cluster, id)
|
||||
return nil
|
||||
}
|
||||
|
||||
// This function also sets the subcluster if necessary!
|
||||
func SanityChecks(job *schema.BaseJob) error {
|
||||
if c := config.GetCluster(job.Cluster); c == nil {
|
||||
return fmt.Errorf("no such cluster: %#v", job.Cluster)
|
||||
}
|
||||
if err := config.AssignSubCluster(job); err != nil {
|
||||
return err
|
||||
}
|
||||
if !job.State.Valid() {
|
||||
return fmt.Errorf("not a valid job state: %#v", job.State)
|
||||
}
|
||||
if len(job.Resources) == 0 || len(job.User) == 0 {
|
||||
return fmt.Errorf("'resources' and 'user' should not be empty")
|
||||
}
|
||||
if job.NumAcc < 0 || job.NumHWThreads < 0 || job.NumNodes < 1 {
|
||||
return fmt.Errorf("'numNodes', 'numAcc' or 'numHWThreads' invalid")
|
||||
}
|
||||
if len(job.Resources) != int(job.NumNodes) {
|
||||
return fmt.Errorf("len(resources) does not equal numNodes (%d vs %d)", len(job.Resources), job.NumNodes)
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func loadJobStat(job *schema.JobMeta, metric string) float64 {
|
||||
if stats, ok := job.Statistics[metric]; ok {
|
||||
return stats.Avg
|
||||
}
|
||||
|
||||
return 0.0
|
||||
}
|
271
internal/repository/init.go
Normal file
271
internal/repository/init.go
Normal file
@@ -0,0 +1,271 @@
|
||||
package repository
|
||||
|
||||
import (
|
||||
"bufio"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"time"
|
||||
|
||||
"github.com/ClusterCockpit/cc-backend/pkg/log"
|
||||
"github.com/ClusterCockpit/cc-backend/pkg/schema"
|
||||
"github.com/jmoiron/sqlx"
|
||||
)
|
||||
|
||||
// JobsDBSchema is the SQL script that drops the tables `jobtag`, `job` and
// `tag` (if they exist) and creates them again from scratch.
//
// `AUTO_INCREMENT` is in a comment because of this hack:
// https://stackoverflow.com/a/41028314 (sqlite creates unique ids automatically)
const JobsDBSchema string = `
DROP TABLE IF EXISTS jobtag;
DROP TABLE IF EXISTS job;
DROP TABLE IF EXISTS tag;

CREATE TABLE job (
id INTEGER PRIMARY KEY /*!40101 AUTO_INCREMENT */,
job_id BIGINT NOT NULL,
cluster VARCHAR(255) NOT NULL,
subcluster VARCHAR(255) NOT NULL,
start_time BIGINT NOT NULL, -- Unix timestamp

user VARCHAR(255) NOT NULL,
project VARCHAR(255) NOT NULL,
` + "`partition`" + ` VARCHAR(255) NOT NULL, -- partition is a keyword in mysql -.-
array_job_id BIGINT NOT NULL,
duration INT NOT NULL DEFAULT 0,
walltime INT NOT NULL DEFAULT 0,
job_state VARCHAR(255) NOT NULL CHECK(job_state IN ('running', 'completed', 'failed', 'cancelled', 'stopped', 'timeout', 'preempted', 'out_of_memory')),
meta_data TEXT, -- JSON
resources TEXT NOT NULL, -- JSON

num_nodes INT NOT NULL,
num_hwthreads INT NOT NULL,
num_acc INT NOT NULL,
smt TINYINT NOT NULL DEFAULT 1 CHECK(smt IN (0, 1 )),
exclusive TINYINT NOT NULL DEFAULT 1 CHECK(exclusive IN (0, 1, 2)),
monitoring_status TINYINT NOT NULL DEFAULT 1 CHECK(monitoring_status IN (0, 1, 2, 3)),

mem_used_max REAL NOT NULL DEFAULT 0.0,
flops_any_avg REAL NOT NULL DEFAULT 0.0,
mem_bw_avg REAL NOT NULL DEFAULT 0.0,
load_avg REAL NOT NULL DEFAULT 0.0,
net_bw_avg REAL NOT NULL DEFAULT 0.0,
net_data_vol_total REAL NOT NULL DEFAULT 0.0,
file_bw_avg REAL NOT NULL DEFAULT 0.0,
file_data_vol_total REAL NOT NULL DEFAULT 0.0);

CREATE TABLE tag (
id INTEGER PRIMARY KEY,
tag_type VARCHAR(255) NOT NULL,
tag_name VARCHAR(255) NOT NULL,
CONSTRAINT be_unique UNIQUE (tag_type, tag_name));

CREATE TABLE jobtag (
job_id INTEGER,
tag_id INTEGER,
PRIMARY KEY (job_id, tag_id),
FOREIGN KEY (job_id) REFERENCES job (id) ON DELETE CASCADE,
FOREIGN KEY (tag_id) REFERENCES tag (id) ON DELETE CASCADE);
`
|
||||
|
||||
// JobsDbIndexes is the SQL script that creates the indexes on the job table.
// Indexes are created after the job-archive is traversed for faster inserts.
const JobsDbIndexes string = `
CREATE INDEX job_by_user ON job (user);
CREATE INDEX job_by_starttime ON job (start_time);
CREATE INDEX job_by_job_id ON job (job_id);
CREATE INDEX job_by_state ON job (job_state);
`
|
||||
|
||||
// Delete the tables "job", "tag" and "jobtag" from the database and
|
||||
// repopulate them using the jobs found in `archive`.
|
||||
func InitDB(db *sqlx.DB, archive string) error {
|
||||
starttime := time.Now()
|
||||
log.Print("Building job table...")
|
||||
|
||||
// Basic database structure:
|
||||
_, err := db.Exec(JobsDBSchema)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
clustersDir, err := os.ReadDir(archive)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
// Inserts are bundled into transactions because in sqlite,
|
||||
// that speeds up inserts A LOT.
|
||||
tx, err := db.Beginx()
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
stmt, err := tx.PrepareNamed(NamedJobInsert)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
// Not using log.Print because we want the line to end with `\r` and
|
||||
// this function is only ever called when a special command line flag
|
||||
// is passed anyways.
|
||||
fmt.Printf("%d jobs inserted...\r", 0)
|
||||
i := 0
|
||||
tags := make(map[string]int64)
|
||||
handleDirectory := func(filename string) error {
|
||||
// Bundle 100 inserts into one transaction for better performance:
|
||||
if i%100 == 0 {
|
||||
if tx != nil {
|
||||
if err := tx.Commit(); err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
|
||||
tx, err = db.Beginx()
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
stmt = tx.NamedStmt(stmt)
|
||||
fmt.Printf("%d jobs inserted...\r", i)
|
||||
}
|
||||
|
||||
err := loadJob(tx, stmt, tags, filename)
|
||||
if err == nil {
|
||||
i += 1
|
||||
}
|
||||
|
||||
return err
|
||||
}
|
||||
|
||||
for _, clusterDir := range clustersDir {
|
||||
lvl1Dirs, err := os.ReadDir(filepath.Join(archive, clusterDir.Name()))
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
for _, lvl1Dir := range lvl1Dirs {
|
||||
if !lvl1Dir.IsDir() {
|
||||
// Could be the cluster.json file
|
||||
continue
|
||||
}
|
||||
|
||||
lvl2Dirs, err := os.ReadDir(filepath.Join(archive, clusterDir.Name(), lvl1Dir.Name()))
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
for _, lvl2Dir := range lvl2Dirs {
|
||||
dirpath := filepath.Join(archive, clusterDir.Name(), lvl1Dir.Name(), lvl2Dir.Name())
|
||||
startTimeDirs, err := os.ReadDir(dirpath)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
// For compability with the old job-archive directory structure where
|
||||
// there was no start time directory.
|
||||
for _, startTimeDir := range startTimeDirs {
|
||||
if startTimeDir.Type().IsRegular() && startTimeDir.Name() == "meta.json" {
|
||||
if err := handleDirectory(dirpath); err != nil {
|
||||
log.Errorf("in %s: %s", dirpath, err.Error())
|
||||
}
|
||||
} else if startTimeDir.IsDir() {
|
||||
if err := handleDirectory(filepath.Join(dirpath, startTimeDir.Name())); err != nil {
|
||||
log.Errorf("in %s: %s", filepath.Join(dirpath, startTimeDir.Name()), err.Error())
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if err := tx.Commit(); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
// Create indexes after inserts so that they do not
|
||||
// need to be continually updated.
|
||||
if _, err := db.Exec(JobsDbIndexes); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
log.Printf("A total of %d jobs have been registered in %.3f seconds.\n", i, time.Since(starttime).Seconds())
|
||||
return nil
|
||||
}
|
||||
|
||||
// TODO: Remove double logic, use repository/import.go!
|
||||
// Read the `meta.json` file at `path` and insert it to the database using the prepared
|
||||
// insert statement `stmt`. `tags` maps all existing tags to their database ID.
|
||||
func loadJob(tx *sqlx.Tx, stmt *sqlx.NamedStmt, tags map[string]int64, path string) error {
|
||||
f, err := os.Open(filepath.Join(path, "meta.json"))
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
defer f.Close()
|
||||
|
||||
jobMeta := schema.JobMeta{BaseJob: schema.JobDefaults}
|
||||
if err := json.NewDecoder(bufio.NewReader(f)).Decode(&jobMeta); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
jobMeta.MonitoringStatus = schema.MonitoringStatusArchivingSuccessful
|
||||
job := schema.Job{
|
||||
BaseJob: jobMeta.BaseJob,
|
||||
StartTime: time.Unix(jobMeta.StartTime, 0),
|
||||
StartTimeUnix: jobMeta.StartTime,
|
||||
}
|
||||
|
||||
// TODO: Other metrics...
|
||||
job.FlopsAnyAvg = loadJobStat(&jobMeta, "flops_any")
|
||||
job.MemBwAvg = loadJobStat(&jobMeta, "mem_bw")
|
||||
job.NetBwAvg = loadJobStat(&jobMeta, "net_bw")
|
||||
job.FileBwAvg = loadJobStat(&jobMeta, "file_bw")
|
||||
|
||||
job.RawResources, err = json.Marshal(job.Resources)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
job.RawMetaData, err = json.Marshal(job.MetaData)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
if err := SanityChecks(&job.BaseJob); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
res, err := stmt.Exec(job)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
id, err := res.LastInsertId()
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
for _, tag := range job.Tags {
|
||||
tagstr := tag.Name + ":" + tag.Type
|
||||
tagId, ok := tags[tagstr]
|
||||
if !ok {
|
||||
res, err := tx.Exec(`INSERT INTO tag (tag_name, tag_type) VALUES (?, ?)`, tag.Name, tag.Type)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
tagId, err = res.LastInsertId()
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
tags[tagstr] = tagId
|
||||
}
|
||||
|
||||
if _, err := tx.Exec(`INSERT INTO jobtag (job_id, tag_id) VALUES (?, ?)`, id, tagId); err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
411
internal/repository/job.go
Normal file
411
internal/repository/job.go
Normal file
@@ -0,0 +1,411 @@
|
||||
package repository
|
||||
|
||||
import (
|
||||
"context"
|
||||
"database/sql"
|
||||
"encoding/json"
|
||||
"errors"
|
||||
"fmt"
|
||||
"strconv"
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
"github.com/ClusterCockpit/cc-backend/internal/auth"
|
||||
"github.com/ClusterCockpit/cc-backend/internal/graph/model"
|
||||
"github.com/ClusterCockpit/cc-backend/pkg/log"
|
||||
"github.com/ClusterCockpit/cc-backend/pkg/schema"
|
||||
sq "github.com/Masterminds/squirrel"
|
||||
"github.com/iamlouk/lrucache"
|
||||
"github.com/jmoiron/sqlx"
|
||||
)
|
||||
|
||||
var (
|
||||
jobRepoOnce sync.Once
|
||||
jobRepoInstance *JobRepository
|
||||
)
|
||||
|
||||
type JobRepository struct {
|
||||
DB *sqlx.DB
|
||||
|
||||
stmtCache *sq.StmtCache
|
||||
cache *lrucache.Cache
|
||||
}
|
||||
|
||||
func GetRepository() *JobRepository {
|
||||
jobRepoOnce.Do(func() {
|
||||
db := GetConnection()
|
||||
|
||||
jobRepoInstance = &JobRepository{
|
||||
DB: db.DB,
|
||||
stmtCache: sq.NewStmtCache(db.DB),
|
||||
cache: lrucache.New(1024 * 1024),
|
||||
}
|
||||
})
|
||||
|
||||
return jobRepoInstance
|
||||
}
|
||||
|
||||
// jobColumns lists the columns selected for every job query. The order must
// stay in sync with the destinations passed to Scan in scanJob.
var jobColumns = []string{
	"job.id",
	"job.job_id",
	"job.user",
	"job.project",
	"job.cluster",
	"job.subcluster",
	"job.start_time",
	"job.partition",
	"job.array_job_id",
	"job.num_nodes",
	"job.num_hwthreads",
	"job.num_acc",
	"job.exclusive",
	"job.monitoring_status",
	"job.smt",
	"job.job_state",
	"job.duration",
	"job.walltime",
	"job.resources", // "job.meta_data",
}
|
||||
|
||||
func scanJob(row interface{ Scan(...interface{}) error }) (*schema.Job, error) {
|
||||
job := &schema.Job{}
|
||||
if err := row.Scan(
|
||||
&job.ID, &job.JobID, &job.User, &job.Project, &job.Cluster, &job.SubCluster, &job.StartTimeUnix, &job.Partition, &job.ArrayJobId,
|
||||
&job.NumNodes, &job.NumHWThreads, &job.NumAcc, &job.Exclusive, &job.MonitoringStatus, &job.SMT, &job.State,
|
||||
&job.Duration, &job.Walltime, &job.RawResources /*&job.MetaData*/); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
if err := json.Unmarshal(job.RawResources, &job.Resources); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
job.StartTime = time.Unix(job.StartTimeUnix, 0)
|
||||
if job.Duration == 0 && job.State == schema.JobStateRunning {
|
||||
job.Duration = int32(time.Since(job.StartTime).Seconds())
|
||||
}
|
||||
|
||||
job.RawResources = nil
|
||||
return job, nil
|
||||
}
|
||||
|
||||
func (r *JobRepository) FetchMetadata(job *schema.Job) (map[string]string, error) {
|
||||
cachekey := fmt.Sprintf("metadata:%d", job.ID)
|
||||
if cached := r.cache.Get(cachekey, nil); cached != nil {
|
||||
job.MetaData = cached.(map[string]string)
|
||||
return job.MetaData, nil
|
||||
}
|
||||
|
||||
if err := sq.Select("job.meta_data").From("job").Where("job.id = ?", job.ID).
|
||||
RunWith(r.stmtCache).QueryRow().Scan(&job.RawMetaData); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
if len(job.RawMetaData) == 0 {
|
||||
return nil, nil
|
||||
}
|
||||
|
||||
if err := json.Unmarshal(job.RawMetaData, &job.MetaData); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
r.cache.Put(cachekey, job.MetaData, len(job.RawMetaData), 24*time.Hour)
|
||||
return job.MetaData, nil
|
||||
}
|
||||
|
||||
func (r *JobRepository) UpdateMetadata(job *schema.Job, key, val string) (err error) {
|
||||
cachekey := fmt.Sprintf("metadata:%d", job.ID)
|
||||
r.cache.Del(cachekey)
|
||||
if job.MetaData == nil {
|
||||
if _, err = r.FetchMetadata(job); err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
|
||||
if job.MetaData != nil {
|
||||
cpy := make(map[string]string, len(job.MetaData)+1)
|
||||
for k, v := range job.MetaData {
|
||||
cpy[k] = v
|
||||
}
|
||||
cpy[key] = val
|
||||
job.MetaData = cpy
|
||||
} else {
|
||||
job.MetaData = map[string]string{key: val}
|
||||
}
|
||||
|
||||
if job.RawMetaData, err = json.Marshal(job.MetaData); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
if _, err = sq.Update("job").Set("meta_data", job.RawMetaData).Where("job.id = ?", job.ID).RunWith(r.stmtCache).Exec(); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
r.cache.Put(cachekey, job.MetaData, len(job.RawMetaData), 24*time.Hour)
|
||||
return nil
|
||||
}
|
||||
|
||||
// Find executes a SQL query to find a specific batch job.
|
||||
// The job is queried using the batch job id, the cluster name,
|
||||
// and the start time of the job in UNIX epoch time seconds.
|
||||
// It returns a pointer to a schema.Job data structure and an error variable.
|
||||
// To check if no job was found test err == sql.ErrNoRows
|
||||
func (r *JobRepository) Find(
|
||||
jobId *int64,
|
||||
cluster *string,
|
||||
startTime *int64) (*schema.Job, error) {
|
||||
|
||||
q := sq.Select(jobColumns...).From("job").
|
||||
Where("job.job_id = ?", *jobId)
|
||||
|
||||
if cluster != nil {
|
||||
q = q.Where("job.cluster = ?", *cluster)
|
||||
}
|
||||
if startTime != nil {
|
||||
q = q.Where("job.start_time = ?", *startTime)
|
||||
}
|
||||
|
||||
return scanJob(q.RunWith(r.stmtCache).QueryRow())
|
||||
}
|
||||
|
||||
// FindById executes a SQL query to find a specific batch job.
|
||||
// The job is queried using the database id.
|
||||
// It returns a pointer to a schema.Job data structure and an error variable.
|
||||
// To check if no job was found test err == sql.ErrNoRows
|
||||
func (r *JobRepository) FindById(
|
||||
jobId int64) (*schema.Job, error) {
|
||||
q := sq.Select(jobColumns...).
|
||||
From("job").Where("job.id = ?", jobId)
|
||||
return scanJob(q.RunWith(r.stmtCache).QueryRow())
|
||||
}
|
||||
|
||||
// Start inserts a new job in the table, returning the unique job ID.
|
||||
// Statistics are not transfered!
|
||||
func (r *JobRepository) Start(job *schema.JobMeta) (id int64, err error) {
|
||||
job.RawResources, err = json.Marshal(job.Resources)
|
||||
if err != nil {
|
||||
return -1, fmt.Errorf("encoding resources field failed: %w", err)
|
||||
}
|
||||
|
||||
job.RawMetaData, err = json.Marshal(job.MetaData)
|
||||
if err != nil {
|
||||
return -1, fmt.Errorf("encoding metaData field failed: %w", err)
|
||||
}
|
||||
|
||||
res, err := r.DB.NamedExec(`INSERT INTO job (
|
||||
job_id, user, project, cluster, subcluster, `+"`partition`"+`, array_job_id, num_nodes, num_hwthreads, num_acc,
|
||||
exclusive, monitoring_status, smt, job_state, start_time, duration, walltime, resources, meta_data
|
||||
) VALUES (
|
||||
:job_id, :user, :project, :cluster, :subcluster, :partition, :array_job_id, :num_nodes, :num_hwthreads, :num_acc,
|
||||
:exclusive, :monitoring_status, :smt, :job_state, :start_time, :duration, :walltime, :resources, :meta_data
|
||||
);`, job)
|
||||
if err != nil {
|
||||
return -1, err
|
||||
}
|
||||
|
||||
return res.LastInsertId()
|
||||
}
|
||||
|
||||
// Stop updates the job with the database id jobId using the provided arguments.
|
||||
func (r *JobRepository) Stop(
|
||||
jobId int64,
|
||||
duration int32,
|
||||
state schema.JobState,
|
||||
monitoringStatus int32) (err error) {
|
||||
|
||||
stmt := sq.Update("job").
|
||||
Set("job_state", state).
|
||||
Set("duration", duration).
|
||||
Set("monitoring_status", monitoringStatus).
|
||||
Where("job.id = ?", jobId)
|
||||
|
||||
_, err = stmt.RunWith(r.stmtCache).Exec()
|
||||
return
|
||||
}
|
||||
|
||||
// TODO: Use node hours instead: SELECT job.user, sum(job.num_nodes * (CASE WHEN job.job_state = "running" THEN CAST(strftime('%s', 'now') AS INTEGER) - job.start_time ELSE job.duration END)) as x FROM job GROUP BY user ORDER BY x DESC;
|
||||
func (r *JobRepository) CountGroupedJobs(ctx context.Context, aggreg model.Aggregate, filters []*model.JobFilter, weight *model.Weights, limit *int) (map[string]int, error) {
|
||||
if !aggreg.IsValid() {
|
||||
return nil, errors.New("invalid aggregate")
|
||||
}
|
||||
|
||||
runner := (sq.BaseRunner)(r.stmtCache)
|
||||
count := "count(*) as count"
|
||||
if weight != nil {
|
||||
switch *weight {
|
||||
case model.WeightsNodeCount:
|
||||
count = "sum(job.num_nodes) as count"
|
||||
case model.WeightsNodeHours:
|
||||
now := time.Now().Unix()
|
||||
count = fmt.Sprintf(`sum(job.num_nodes * (CASE WHEN job.job_state = "running" THEN %d - job.start_time ELSE job.duration END)) as count`, now)
|
||||
runner = r.DB
|
||||
}
|
||||
}
|
||||
|
||||
q := sq.Select("job."+string(aggreg), count).From("job").GroupBy("job." + string(aggreg)).OrderBy("count DESC")
|
||||
q = SecurityCheck(ctx, q)
|
||||
for _, f := range filters {
|
||||
q = BuildWhereClause(f, q)
|
||||
}
|
||||
if limit != nil {
|
||||
q = q.Limit(uint64(*limit))
|
||||
}
|
||||
|
||||
counts := map[string]int{}
|
||||
rows, err := q.RunWith(runner).Query()
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
for rows.Next() {
|
||||
var group string
|
||||
var count int
|
||||
if err := rows.Scan(&group, &count); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
counts[group] = count
|
||||
}
|
||||
|
||||
return counts, nil
|
||||
}
|
||||
|
||||
func (r *JobRepository) UpdateMonitoringStatus(job int64, monitoringStatus int32) (err error) {
|
||||
stmt := sq.Update("job").
|
||||
Set("monitoring_status", monitoringStatus).
|
||||
Where("job.id = ?", job)
|
||||
|
||||
_, err = stmt.RunWith(r.stmtCache).Exec()
|
||||
return
|
||||
}
|
||||
|
||||
// Stop updates the job with the database id jobId using the provided arguments.
|
||||
func (r *JobRepository) Archive(
|
||||
jobId int64,
|
||||
monitoringStatus int32,
|
||||
metricStats map[string]schema.JobStatistics) error {
|
||||
|
||||
stmt := sq.Update("job").
|
||||
Set("monitoring_status", monitoringStatus).
|
||||
Where("job.id = ?", jobId)
|
||||
|
||||
for metric, stats := range metricStats {
|
||||
switch metric {
|
||||
case "flops_any":
|
||||
stmt = stmt.Set("flops_any_avg", stats.Avg)
|
||||
case "mem_used":
|
||||
stmt = stmt.Set("mem_used_max", stats.Max)
|
||||
case "mem_bw":
|
||||
stmt = stmt.Set("mem_bw_avg", stats.Avg)
|
||||
case "load":
|
||||
stmt = stmt.Set("load_avg", stats.Avg)
|
||||
case "net_bw":
|
||||
stmt = stmt.Set("net_bw_avg", stats.Avg)
|
||||
case "file_bw":
|
||||
stmt = stmt.Set("file_bw_avg", stats.Avg)
|
||||
}
|
||||
}
|
||||
|
||||
if _, err := stmt.RunWith(r.stmtCache).Exec(); err != nil {
|
||||
return err
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// ErrNotFound is returned by FindJobOrUser when the search term matches
// neither a job nor a user.
var ErrNotFound = errors.New("no such job or user")
|
||||
|
||||
// FindJobOrUser returns a job database ID or a username if a job or user machtes the search term.
|
||||
// As 0 is a valid job id, check if username is "" instead in order to check what machted.
|
||||
// If nothing matches the search, `ErrNotFound` is returned.
|
||||
func (r *JobRepository) FindJobOrUser(ctx context.Context, searchterm string) (job int64, username string, err error) {
|
||||
user := auth.GetUser(ctx)
|
||||
if id, err := strconv.Atoi(searchterm); err == nil {
|
||||
qb := sq.Select("job.id").From("job").Where("job.job_id = ?", id)
|
||||
if user != nil && !user.HasRole(auth.RoleAdmin) {
|
||||
qb = qb.Where("job.user = ?", user.Username)
|
||||
}
|
||||
|
||||
err := qb.RunWith(r.stmtCache).QueryRow().Scan(&job)
|
||||
if err != nil && err != sql.ErrNoRows {
|
||||
return 0, "", err
|
||||
} else if err == nil {
|
||||
return job, "", nil
|
||||
}
|
||||
}
|
||||
|
||||
if user == nil || user.HasRole(auth.RoleAdmin) {
|
||||
err := sq.Select("job.user").Distinct().From("job").
|
||||
Where("job.user = ?", searchterm).
|
||||
RunWith(r.stmtCache).QueryRow().Scan(&username)
|
||||
if err != nil && err != sql.ErrNoRows {
|
||||
return 0, "", err
|
||||
} else if err == nil {
|
||||
return 0, username, nil
|
||||
}
|
||||
}
|
||||
|
||||
return 0, "", ErrNotFound
|
||||
}
|
||||
|
||||
func (r *JobRepository) Partitions(cluster string) ([]string, error) {
|
||||
var err error
|
||||
partitions := r.cache.Get("partitions:"+cluster, func() (interface{}, time.Duration, int) {
|
||||
parts := []string{}
|
||||
if err = r.DB.Select(&parts, `SELECT DISTINCT job.partition FROM job WHERE job.cluster = ?;`, cluster); err != nil {
|
||||
return nil, 0, 1000
|
||||
}
|
||||
|
||||
return parts, 1 * time.Hour, 1
|
||||
})
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
return partitions.([]string), nil
|
||||
}
|
||||
|
||||
// AllocatedNodes returns a map of all subclusters to a map of hostnames to the amount of jobs running on that host.
|
||||
// Hosts with zero jobs running on them will not show up!
|
||||
func (r *JobRepository) AllocatedNodes(cluster string) (map[string]map[string]int, error) {
|
||||
subclusters := make(map[string]map[string]int)
|
||||
rows, err := sq.Select("resources", "subcluster").From("job").
|
||||
Where("job.job_state = 'running'").
|
||||
Where("job.cluster = ?", cluster).
|
||||
RunWith(r.stmtCache).Query()
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
var raw []byte
|
||||
defer rows.Close()
|
||||
for rows.Next() {
|
||||
raw = raw[0:0]
|
||||
var resources []*schema.Resource
|
||||
var subcluster string
|
||||
if err := rows.Scan(&raw, &subcluster); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
if err := json.Unmarshal(raw, &resources); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
hosts, ok := subclusters[subcluster]
|
||||
if !ok {
|
||||
hosts = make(map[string]int)
|
||||
subclusters[subcluster] = hosts
|
||||
}
|
||||
|
||||
for _, resource := range resources {
|
||||
hosts[resource.Hostname] += 1
|
||||
}
|
||||
}
|
||||
|
||||
return subclusters, nil
|
||||
}
|
||||
|
||||
func (r *JobRepository) StopJobsExceedingWalltimeBy(seconds int) error {
|
||||
res, err := sq.Update("job").
|
||||
Set("monitoring_status", schema.MonitoringStatusArchivingFailed).
|
||||
Set("duration", 0).
|
||||
Set("job_state", schema.JobStateFailed).
|
||||
Where("job.job_state = 'running'").
|
||||
Where("job.walltime > 0").
|
||||
Where(fmt.Sprintf("(%d - job.start_time) > (job.walltime + %d)", time.Now().Unix(), seconds)).
|
||||
RunWith(r.DB).Exec()
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
rowsAffected, err := res.RowsAffected()
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
if rowsAffected > 0 {
|
||||
log.Warnf("%d jobs have been marked as failed due to running too long", rowsAffected)
|
||||
}
|
||||
return nil
|
||||
}
|
66
internal/repository/job_test.go
Normal file
66
internal/repository/job_test.go
Normal file
@@ -0,0 +1,66 @@
|
||||
package repository
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"testing"
|
||||
|
||||
"github.com/jmoiron/sqlx"
|
||||
_ "github.com/mattn/go-sqlite3"
|
||||
)
|
||||
|
||||
// db is a package-level database handle for the tests.
// NOTE(review): none of the tests visible here reference it — confirm whether it is still needed.
var db *sqlx.DB
|
||||
|
||||
func init() {
|
||||
Connect("sqlite3", "../../test/test.db")
|
||||
}
|
||||
|
||||
func setup(t *testing.T) *JobRepository {
|
||||
return GetRepository()
|
||||
}
|
||||
|
||||
func TestFind(t *testing.T) {
|
||||
r := setup(t)
|
||||
|
||||
jobId, cluster, startTime := int64(1404396), "emmy", int64(1609299584)
|
||||
job, err := r.Find(&jobId, &cluster, &startTime)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
// fmt.Printf("%+v", job)
|
||||
|
||||
if job.ID != 1366 {
|
||||
t.Errorf("wrong summary for diagnostic 3\ngot: %d \nwant: 1366", job.JobID)
|
||||
}
|
||||
}
|
||||
|
||||
func TestFindById(t *testing.T) {
|
||||
r := setup(t)
|
||||
|
||||
job, err := r.FindById(1366)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
// fmt.Printf("%+v", job)
|
||||
|
||||
if job.JobID != 1404396 {
|
||||
t.Errorf("wrong summary for diagnostic 3\ngot: %d \nwant: 1404396", job.JobID)
|
||||
}
|
||||
}
|
||||
|
||||
func TestGetTags(t *testing.T) {
|
||||
r := setup(t)
|
||||
|
||||
tags, counts, err := r.CountTags(nil)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
fmt.Printf("TAGS %+v \n", tags)
|
||||
// fmt.Printf("COUNTS %+v \n", counts)
|
||||
|
||||
if counts["bandwidth"] != 6 {
|
||||
t.Errorf("wrong summary for diagnostic 3\ngot: %d \nwant: 6", counts["load-imbalance"])
|
||||
}
|
||||
}
|
217
internal/repository/query.go
Normal file
217
internal/repository/query.go
Normal file
@@ -0,0 +1,217 @@
|
||||
package repository
|
||||
|
||||
import (
|
||||
"context"
|
||||
"errors"
|
||||
"fmt"
|
||||
"regexp"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"github.com/ClusterCockpit/cc-backend/internal/auth"
|
||||
"github.com/ClusterCockpit/cc-backend/internal/graph/model"
|
||||
"github.com/ClusterCockpit/cc-backend/pkg/log"
|
||||
"github.com/ClusterCockpit/cc-backend/pkg/schema"
|
||||
sq "github.com/Masterminds/squirrel"
|
||||
)
|
||||
|
||||
// QueryJobs returns a list of jobs matching the provided filters. page and order are optional-
|
||||
func (r *JobRepository) QueryJobs(
|
||||
ctx context.Context,
|
||||
filters []*model.JobFilter,
|
||||
page *model.PageRequest,
|
||||
order *model.OrderByInput) ([]*schema.Job, error) {
|
||||
|
||||
query := sq.Select(jobColumns...).From("job")
|
||||
query = SecurityCheck(ctx, query)
|
||||
|
||||
if order != nil {
|
||||
field := toSnakeCase(order.Field)
|
||||
if order.Order == model.SortDirectionEnumAsc {
|
||||
query = query.OrderBy(fmt.Sprintf("job.%s ASC", field))
|
||||
} else if order.Order == model.SortDirectionEnumDesc {
|
||||
query = query.OrderBy(fmt.Sprintf("job.%s DESC", field))
|
||||
} else {
|
||||
return nil, errors.New("invalid sorting order")
|
||||
}
|
||||
}
|
||||
|
||||
if page != nil && page.ItemsPerPage != -1 {
|
||||
limit := uint64(page.ItemsPerPage)
|
||||
query = query.Offset((uint64(page.Page) - 1) * limit).Limit(limit)
|
||||
}
|
||||
|
||||
for _, f := range filters {
|
||||
query = BuildWhereClause(f, query)
|
||||
}
|
||||
|
||||
sql, args, err := query.ToSql()
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
log.Debugf("SQL query: `%s`, args: %#v", sql, args)
|
||||
rows, err := query.RunWith(r.stmtCache).Query()
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
jobs := make([]*schema.Job, 0, 50)
|
||||
for rows.Next() {
|
||||
job, err := scanJob(rows)
|
||||
if err != nil {
|
||||
rows.Close()
|
||||
return nil, err
|
||||
}
|
||||
jobs = append(jobs, job)
|
||||
}
|
||||
|
||||
return jobs, nil
|
||||
}
|
||||
|
||||
// CountJobs counts the number of jobs matching the filters.
|
||||
func (r *JobRepository) CountJobs(
|
||||
ctx context.Context,
|
||||
filters []*model.JobFilter) (int, error) {
|
||||
|
||||
// count all jobs:
|
||||
query := sq.Select("count(*)").From("job")
|
||||
query = SecurityCheck(ctx, query)
|
||||
for _, f := range filters {
|
||||
query = BuildWhereClause(f, query)
|
||||
}
|
||||
var count int
|
||||
if err := query.RunWith(r.DB).Scan(&count); err != nil {
|
||||
return 0, err
|
||||
}
|
||||
|
||||
return count, nil
|
||||
}
|
||||
|
||||
func SecurityCheck(ctx context.Context, query sq.SelectBuilder) sq.SelectBuilder {
|
||||
user := auth.GetUser(ctx)
|
||||
if user == nil || user.HasRole(auth.RoleAdmin) || user.HasRole(auth.RoleApi) {
|
||||
return query
|
||||
}
|
||||
|
||||
return query.Where("job.user = ?", user.Username)
|
||||
}
|
||||
|
||||
// Build a sq.SelectBuilder out of a schema.JobFilter.
|
||||
func BuildWhereClause(filter *model.JobFilter, query sq.SelectBuilder) sq.SelectBuilder {
|
||||
if filter.Tags != nil {
|
||||
query = query.Join("jobtag ON jobtag.job_id = job.id").Where(sq.Eq{"jobtag.tag_id": filter.Tags})
|
||||
}
|
||||
if filter.JobID != nil {
|
||||
query = buildStringCondition("job.job_id", filter.JobID, query)
|
||||
}
|
||||
if filter.ArrayJobID != nil {
|
||||
query = query.Where("job.array_job_id = ?", *filter.ArrayJobID)
|
||||
}
|
||||
if filter.User != nil {
|
||||
query = buildStringCondition("job.user", filter.User, query)
|
||||
}
|
||||
if filter.Project != nil {
|
||||
query = buildStringCondition("job.project", filter.Project, query)
|
||||
}
|
||||
if filter.Cluster != nil {
|
||||
query = buildStringCondition("job.cluster", filter.Cluster, query)
|
||||
}
|
||||
if filter.Partition != nil {
|
||||
query = buildStringCondition("job.partition", filter.Partition, query)
|
||||
}
|
||||
if filter.StartTime != nil {
|
||||
query = buildTimeCondition("job.start_time", filter.StartTime, query)
|
||||
}
|
||||
if filter.Duration != nil {
|
||||
now := time.Now().Unix() // There does not seam to be a portable way to get the current unix timestamp accross different DBs.
|
||||
query = query.Where("(CASE WHEN job.job_state = 'running' THEN (? - job.start_time) ELSE job.duration END) BETWEEN ? AND ?", now, filter.Duration.From, filter.Duration.To)
|
||||
}
|
||||
if filter.MinRunningFor != nil {
|
||||
now := time.Now().Unix() // There does not seam to be a portable way to get the current unix timestamp accross different DBs.
|
||||
query = query.Where("(job.job_state != 'running' OR (? - job.start_time) > ?)", now, *filter.MinRunningFor)
|
||||
}
|
||||
if filter.State != nil {
|
||||
states := make([]string, len(filter.State))
|
||||
for i, val := range filter.State {
|
||||
states[i] = string(val)
|
||||
}
|
||||
|
||||
query = query.Where(sq.Eq{"job.job_state": states})
|
||||
}
|
||||
if filter.NumNodes != nil {
|
||||
query = buildIntCondition("job.num_nodes", filter.NumNodes, query)
|
||||
}
|
||||
if filter.NumAccelerators != nil {
|
||||
query = buildIntCondition("job.num_acc", filter.NumAccelerators, query)
|
||||
}
|
||||
if filter.NumHWThreads != nil {
|
||||
query = buildIntCondition("job.num_hwthreads", filter.NumHWThreads, query)
|
||||
}
|
||||
if filter.FlopsAnyAvg != nil {
|
||||
query = buildFloatCondition("job.flops_any_avg", filter.FlopsAnyAvg, query)
|
||||
}
|
||||
if filter.MemBwAvg != nil {
|
||||
query = buildFloatCondition("job.mem_bw_avg", filter.MemBwAvg, query)
|
||||
}
|
||||
if filter.LoadAvg != nil {
|
||||
query = buildFloatCondition("job.load_avg", filter.LoadAvg, query)
|
||||
}
|
||||
if filter.MemUsedMax != nil {
|
||||
query = buildFloatCondition("job.mem_used_max", filter.MemUsedMax, query)
|
||||
}
|
||||
return query
|
||||
}
|
||||
|
||||
func buildIntCondition(field string, cond *model.IntRange, query sq.SelectBuilder) sq.SelectBuilder {
|
||||
return query.Where(field+" BETWEEN ? AND ?", cond.From, cond.To)
|
||||
}
|
||||
|
||||
func buildTimeCondition(field string, cond *model.TimeRange, query sq.SelectBuilder) sq.SelectBuilder {
|
||||
if cond.From != nil && cond.To != nil {
|
||||
return query.Where(field+" BETWEEN ? AND ?", cond.From.Unix(), cond.To.Unix())
|
||||
} else if cond.From != nil {
|
||||
return query.Where("? <= "+field, cond.From.Unix())
|
||||
} else if cond.To != nil {
|
||||
return query.Where(field+" <= ?", cond.To.Unix())
|
||||
} else {
|
||||
return query
|
||||
}
|
||||
}
|
||||
|
||||
func buildFloatCondition(field string, cond *model.FloatRange, query sq.SelectBuilder) sq.SelectBuilder {
|
||||
return query.Where(field+" BETWEEN ? AND ?", cond.From, cond.To)
|
||||
}
|
||||
|
||||
func buildStringCondition(field string, cond *model.StringInput, query sq.SelectBuilder) sq.SelectBuilder {
|
||||
if cond.Eq != nil {
|
||||
return query.Where(field+" = ?", *cond.Eq)
|
||||
}
|
||||
if cond.StartsWith != nil {
|
||||
return query.Where(field+" LIKE ?", fmt.Sprint(*cond.StartsWith, "%"))
|
||||
}
|
||||
if cond.EndsWith != nil {
|
||||
return query.Where(field+" LIKE ?", fmt.Sprint("%", *cond.EndsWith))
|
||||
}
|
||||
if cond.Contains != nil {
|
||||
return query.Where(field+" LIKE ?", fmt.Sprint("%", *cond.Contains, "%"))
|
||||
}
|
||||
return query
|
||||
}
|
||||
|
||||
// Patterns used by toSnakeCase to find the positions where an underscore
// has to be inserted.
var (
	// matchFirstCap matches any character followed by a capitalized word.
	matchFirstCap = regexp.MustCompile("(.)([A-Z][a-z]+)")
	// matchAllCap matches a lowercase letter or digit followed by an uppercase letter.
	matchAllCap = regexp.MustCompile("([a-z0-9])([A-Z])")
)

// toSnakeCase converts a camelCase identifier such as "startTime" into its
// snake_case form ("start_time").
//
// It panics if str contains a single quote or a backslash.
func toSnakeCase(str string) string {
	if strings.ContainsAny(str, "'\\") {
		panic("A hacker (probably not)!!!")
	}

	// First split before capitalized words, then between a lowercase
	// letter or digit and the uppercase letter that follows it.
	withWords := matchFirstCap.ReplaceAllString(str, "${1}_${2}")
	withCaps := matchAllCap.ReplaceAllString(withWords, "${1}_${2}")
	return strings.ToLower(withCaps)
}
|
150
internal/repository/tags.go
Normal file
150
internal/repository/tags.go
Normal file
@@ -0,0 +1,150 @@
|
||||
package repository
|
||||
|
||||
import (
|
||||
"github.com/ClusterCockpit/cc-backend/internal/metricdata"
|
||||
"github.com/ClusterCockpit/cc-backend/pkg/schema"
|
||||
sq "github.com/Masterminds/squirrel"
|
||||
)
|
||||
|
||||
// Add the tag with id `tagId` to the job with the database id `jobId`.
|
||||
func (r *JobRepository) AddTag(job int64, tag int64) ([]*schema.Tag, error) {
|
||||
if _, err := r.stmtCache.Exec(`INSERT INTO jobtag (job_id, tag_id) VALUES ($1, $2)`, job, tag); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
j, err := r.FindById(job)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
tags, err := r.GetTags(&job)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
return tags, metricdata.UpdateTags(j, tags)
|
||||
}
|
||||
|
||||
// Removes a tag from a job
|
||||
func (r *JobRepository) RemoveTag(job, tag int64) ([]*schema.Tag, error) {
|
||||
if _, err := r.stmtCache.Exec("DELETE FROM jobtag WHERE jobtag.job_id = $1 AND jobtag.tag_id = $2", job, tag); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
j, err := r.FindById(job)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
tags, err := r.GetTags(&job)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
return tags, metricdata.UpdateTags(j, tags)
|
||||
}
|
||||
|
||||
// CreateTag creates a new tag with the specified type and name and returns its database id.
|
||||
func (r *JobRepository) CreateTag(tagType string, tagName string) (tagId int64, err error) {
|
||||
res, err := r.stmtCache.Exec("INSERT INTO tag (tag_type, tag_name) VALUES ($1, $2)", tagType, tagName)
|
||||
if err != nil {
|
||||
return 0, err
|
||||
}
|
||||
|
||||
return res.LastInsertId()
|
||||
}
|
||||
|
||||
func (r *JobRepository) CountTags(user *string) (tags []schema.Tag, counts map[string]int, err error) {
|
||||
tags = make([]schema.Tag, 0, 100)
|
||||
xrows, err := r.DB.Queryx("SELECT * FROM tag")
|
||||
if err != nil {
|
||||
return nil, nil, err
|
||||
}
|
||||
|
||||
for xrows.Next() {
|
||||
var t schema.Tag
|
||||
if err := xrows.StructScan(&t); err != nil {
|
||||
return nil, nil, err
|
||||
}
|
||||
tags = append(tags, t)
|
||||
}
|
||||
|
||||
q := sq.Select("t.tag_name, count(jt.tag_id)").
|
||||
From("tag t").
|
||||
LeftJoin("jobtag jt ON t.id = jt.tag_id").
|
||||
GroupBy("t.tag_name")
|
||||
if user != nil {
|
||||
q = q.Where("jt.job_id IN (SELECT id FROM job WHERE job.user = ?)", *user)
|
||||
}
|
||||
|
||||
rows, err := q.RunWith(r.stmtCache).Query()
|
||||
if err != nil {
|
||||
return nil, nil, err
|
||||
}
|
||||
|
||||
counts = make(map[string]int)
|
||||
for rows.Next() {
|
||||
var tagName string
|
||||
var count int
|
||||
if err := rows.Scan(&tagName, &count); err != nil {
|
||||
return nil, nil, err
|
||||
}
|
||||
counts[tagName] = count
|
||||
}
|
||||
err = rows.Err()
|
||||
|
||||
return
|
||||
}
|
||||
|
||||
// AddTagOrCreate adds the tag with the specified type and name to the job with the database id `jobId`.
|
||||
// If such a tag does not yet exist, it is created.
|
||||
func (r *JobRepository) AddTagOrCreate(jobId int64, tagType string, tagName string) (tagId int64, err error) {
|
||||
tagId, exists := r.TagId(tagType, tagName)
|
||||
if !exists {
|
||||
tagId, err = r.CreateTag(tagType, tagName)
|
||||
if err != nil {
|
||||
return 0, err
|
||||
}
|
||||
}
|
||||
|
||||
if _, err := r.AddTag(jobId, tagId); err != nil {
|
||||
return 0, err
|
||||
}
|
||||
|
||||
return tagId, nil
|
||||
}
|
||||
|
||||
// TagId returns the database id of the tag with the specified type and name.
|
||||
func (r *JobRepository) TagId(tagType string, tagName string) (tagId int64, exists bool) {
|
||||
exists = true
|
||||
if err := sq.Select("id").From("tag").
|
||||
Where("tag.tag_type = ?", tagType).Where("tag.tag_name = ?", tagName).
|
||||
RunWith(r.stmtCache).QueryRow().Scan(&tagId); err != nil {
|
||||
exists = false
|
||||
}
|
||||
return
|
||||
}
|
||||
|
||||
// GetTags returns a list of all tags if job is nil or of the tags that the job with that database ID has.
|
||||
func (r *JobRepository) GetTags(job *int64) ([]*schema.Tag, error) {
|
||||
q := sq.Select("id", "tag_type", "tag_name").From("tag")
|
||||
if job != nil {
|
||||
q = q.Join("jobtag ON jobtag.tag_id = tag.id").Where("jobtag.job_id = ?", *job)
|
||||
}
|
||||
|
||||
rows, err := q.RunWith(r.stmtCache).Query()
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
tags := make([]*schema.Tag, 0)
|
||||
for rows.Next() {
|
||||
tag := &schema.Tag{}
|
||||
if err := rows.Scan(&tag.ID, &tag.Type, &tag.Name); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
tags = append(tags, tag)
|
||||
}
|
||||
|
||||
return tags, nil
|
||||
}
|
288
internal/routerConfig/routes.go
Normal file
288
internal/routerConfig/routes.go
Normal file
@@ -0,0 +1,288 @@
|
||||
package routerConfig
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"net/http"
|
||||
"net/url"
|
||||
"strconv"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"github.com/ClusterCockpit/cc-backend/internal/auth"
|
||||
"github.com/ClusterCockpit/cc-backend/internal/config"
|
||||
"github.com/ClusterCockpit/cc-backend/internal/graph"
|
||||
"github.com/ClusterCockpit/cc-backend/internal/graph/model"
|
||||
"github.com/ClusterCockpit/cc-backend/internal/repository"
|
||||
"github.com/ClusterCockpit/cc-backend/internal/templates"
|
||||
"github.com/ClusterCockpit/cc-backend/pkg/log"
|
||||
"github.com/ClusterCockpit/cc-backend/pkg/schema"
|
||||
"github.com/gorilla/mux"
|
||||
)
|
||||
|
||||
// InfoType is the key/value bag that a route's Setup function fills with
// page-specific values (e.g. "id", "cluster", "tagmap").
type InfoType map[string]interface{}
|
||||
|
||||
type Route struct {
|
||||
Route string
|
||||
Template string
|
||||
Title string
|
||||
Filter bool
|
||||
Setup func(i InfoType, r *http.Request) InfoType
|
||||
}
|
||||
|
||||
var routes []Route = []Route{
|
||||
{"/", "home.tmpl", "ClusterCockpit", false, setupHomeRoute},
|
||||
{"/config", "config.tmpl", "Settings", false, func(i InfoType, r *http.Request) InfoType { return i }},
|
||||
{"/monitoring/jobs/", "monitoring/jobs.tmpl", "Jobs - ClusterCockpit", true, func(i InfoType, r *http.Request) InfoType { return i }},
|
||||
{"/monitoring/job/{id:[0-9]+}", "monitoring/job.tmpl", "Job <ID> - ClusterCockpit", false, setupJobRoute},
|
||||
{"/monitoring/users/", "monitoring/list.tmpl", "Users - ClusterCockpit", true, func(i InfoType, r *http.Request) InfoType { i["listType"] = "USER"; return i }},
|
||||
{"/monitoring/projects/", "monitoring/list.tmpl", "Projects - ClusterCockpit", true, func(i InfoType, r *http.Request) InfoType { i["listType"] = "PROJECT"; return i }},
|
||||
{"/monitoring/tags/", "monitoring/taglist.tmpl", "Tags - ClusterCockpit", false, setupTaglistRoute},
|
||||
{"/monitoring/user/{id}", "monitoring/user.tmpl", "User <ID> - ClusterCockpit", true, setupUserRoute},
|
||||
{"/monitoring/systems/{cluster}", "monitoring/systems.tmpl", "Cluster <ID> - ClusterCockpit", false, setupClusterRoute},
|
||||
{"/monitoring/node/{cluster}/{hostname}", "monitoring/node.tmpl", "Node <ID> - ClusterCockpit", false, setupNodeRoute},
|
||||
{"/monitoring/analysis/{cluster}", "monitoring/analysis.tmpl", "Analaysis - ClusterCockpit", true, setupAnalysisRoute},
|
||||
{"/monitoring/status/{cluster}", "monitoring/status.tmpl", "Status of <ID> - ClusterCockpit", false, setupClusterRoute},
|
||||
}
|
||||
|
||||
func setupHomeRoute(i InfoType, r *http.Request) InfoType {
|
||||
type cluster struct {
|
||||
Name string
|
||||
RunningJobs int
|
||||
TotalJobs int
|
||||
RecentShortJobs int
|
||||
}
|
||||
jobRepo := repository.GetRepository()
|
||||
|
||||
runningJobs, err := jobRepo.CountGroupedJobs(r.Context(), model.AggregateCluster, []*model.JobFilter{{
|
||||
State: []schema.JobState{schema.JobStateRunning},
|
||||
}}, nil, nil)
|
||||
if err != nil {
|
||||
log.Errorf("failed to count jobs: %s", err.Error())
|
||||
runningJobs = map[string]int{}
|
||||
}
|
||||
totalJobs, err := jobRepo.CountGroupedJobs(r.Context(), model.AggregateCluster, nil, nil, nil)
|
||||
if err != nil {
|
||||
log.Errorf("failed to count jobs: %s", err.Error())
|
||||
totalJobs = map[string]int{}
|
||||
}
|
||||
from := time.Now().Add(-24 * time.Hour)
|
||||
recentShortJobs, err := jobRepo.CountGroupedJobs(r.Context(), model.AggregateCluster, []*model.JobFilter{{
|
||||
StartTime: &model.TimeRange{From: &from, To: nil},
|
||||
Duration: &model.IntRange{From: 0, To: graph.ShortJobDuration},
|
||||
}}, nil, nil)
|
||||
if err != nil {
|
||||
log.Errorf("failed to count jobs: %s", err.Error())
|
||||
recentShortJobs = map[string]int{}
|
||||
}
|
||||
|
||||
clusters := make([]cluster, 0)
|
||||
for _, c := range config.Clusters {
|
||||
clusters = append(clusters, cluster{
|
||||
Name: c.Name,
|
||||
RunningJobs: runningJobs[c.Name],
|
||||
TotalJobs: totalJobs[c.Name],
|
||||
RecentShortJobs: recentShortJobs[c.Name],
|
||||
})
|
||||
}
|
||||
|
||||
i["clusters"] = clusters
|
||||
return i
|
||||
}
|
||||
|
||||
func setupJobRoute(i InfoType, r *http.Request) InfoType {
|
||||
i["id"] = mux.Vars(r)["id"]
|
||||
return i
|
||||
}
|
||||
|
||||
func setupUserRoute(i InfoType, r *http.Request) InfoType {
|
||||
jobRepo := repository.GetRepository()
|
||||
username := mux.Vars(r)["id"]
|
||||
i["id"] = username
|
||||
i["username"] = username
|
||||
if user, _ := auth.FetchUser(r.Context(), jobRepo.DB, username); user != nil {
|
||||
i["name"] = user.Name
|
||||
i["email"] = user.Email
|
||||
}
|
||||
return i
|
||||
}
|
||||
|
||||
func setupClusterRoute(i InfoType, r *http.Request) InfoType {
|
||||
vars := mux.Vars(r)
|
||||
i["id"] = vars["cluster"]
|
||||
i["cluster"] = vars["cluster"]
|
||||
from, to := r.URL.Query().Get("from"), r.URL.Query().Get("to")
|
||||
if from != "" || to != "" {
|
||||
i["from"] = from
|
||||
i["to"] = to
|
||||
}
|
||||
return i
|
||||
}
|
||||
|
||||
func setupNodeRoute(i InfoType, r *http.Request) InfoType {
|
||||
vars := mux.Vars(r)
|
||||
i["cluster"] = vars["cluster"]
|
||||
i["hostname"] = vars["hostname"]
|
||||
i["id"] = fmt.Sprintf("%s (%s)", vars["cluster"], vars["hostname"])
|
||||
from, to := r.URL.Query().Get("from"), r.URL.Query().Get("to")
|
||||
if from != "" || to != "" {
|
||||
i["from"] = from
|
||||
i["to"] = to
|
||||
}
|
||||
return i
|
||||
}
|
||||
|
||||
func setupAnalysisRoute(i InfoType, r *http.Request) InfoType {
|
||||
i["cluster"] = mux.Vars(r)["cluster"]
|
||||
return i
|
||||
}
|
||||
|
||||
func setupTaglistRoute(i InfoType, r *http.Request) InfoType {
|
||||
var username *string = nil
|
||||
jobRepo := repository.GetRepository()
|
||||
if user := auth.GetUser(r.Context()); user != nil && !user.HasRole(auth.RoleAdmin) {
|
||||
username = &user.Username
|
||||
}
|
||||
|
||||
tags, counts, err := jobRepo.CountTags(username)
|
||||
tagMap := make(map[string][]map[string]interface{})
|
||||
if err != nil {
|
||||
log.Errorf("GetTags failed: %s", err.Error())
|
||||
i["tagmap"] = tagMap
|
||||
return i
|
||||
}
|
||||
|
||||
for _, tag := range tags {
|
||||
tagItem := map[string]interface{}{
|
||||
"id": tag.ID,
|
||||
"name": tag.Name,
|
||||
"count": counts[tag.Name],
|
||||
}
|
||||
tagMap[tag.Type] = append(tagMap[tag.Type], tagItem)
|
||||
}
|
||||
i["tagmap"] = tagMap
|
||||
return i
|
||||
}
|
||||
|
||||
// splitRangePreset splits a "<from>-<to>" query value into its two parts.
// ok is false unless raw consists of exactly two "-"-separated parts.
func splitRangePreset(raw string) (from, to string, ok bool) {
	parts := strings.Split(raw, "-")
	if len(parts) != 2 {
		return "", "", false
	}
	return parts[0], parts[1], true
}

// parseIntRangePreset parses a "<from>-<to>" query value with integer bounds
// into a {"from": a, "to": b} map. ok is false if raw is malformed.
func parseIntRangePreset(raw string) (rng map[string]int, ok bool) {
	from, to, ok := splitRangePreset(raw)
	if !ok {
		return nil, false
	}
	a, e1 := strconv.Atoi(from)
	b, e2 := strconv.Atoi(to)
	if e1 != nil || e2 != nil {
		return nil, false
	}
	return map[string]int{"from": a, "to": b}, true
}

// buildFilterPresets translates URL query parameters into the initial filter
// settings handed to the page template.
//
// Recognized parameters:
//   - cluster, partition, jobId: copied as strings
//   - project, user: copied as strings, plus "<key>Match" set to "eq"
//   - state: all given values, as []string
//   - tag: all given values parsed as ints and stored under "tags";
//     unparsable values become -1
//   - duration, numNodes, numAccelerators: "<from>-<to>" integer ranges,
//     stored as map[string]int with keys "from" and "to"
//   - arrayJobId: parsed as int
//   - startTime: "<from>-<to>" Unix timestamps, stored as RFC 3339 strings
//
// Empty or malformed values are silently skipped.
func buildFilterPresets(query url.Values) map[string]interface{} {
	filterPresets := map[string]interface{}{}

	for _, key := range []string{"cluster", "partition", "jobId"} {
		if v := query.Get(key); v != "" {
			filterPresets[key] = v
		}
	}
	for _, key := range []string{"project", "user"} {
		if v := query.Get(key); v != "" {
			filterPresets[key] = v
			filterPresets[key+"Match"] = "eq"
		}
	}

	if states := query["state"]; len(states) != 0 {
		filterPresets["state"] = states
	}

	if rawtags, ok := query["tag"]; ok {
		tags := make([]int, len(rawtags))
		for i, tid := range rawtags {
			id, err := strconv.Atoi(tid)
			if err != nil {
				id = -1 // keep the slot so positions match the query
			}
			tags[i] = id
		}
		filterPresets["tags"] = tags
	}

	// An empty value never splits into two parts, so no separate
	// emptiness check is needed for the range parameters.
	for _, key := range []string{"duration", "numNodes", "numAccelerators"} {
		if rng, ok := parseIntRangePreset(query.Get(key)); ok {
			filterPresets[key] = rng
		}
	}

	if raw := query.Get("arrayJobId"); raw != "" {
		if num, err := strconv.Atoi(raw); err == nil {
			filterPresets["arrayJobId"] = num
		}
	}

	if from, to, ok := splitRangePreset(query.Get("startTime")); ok {
		a, e1 := strconv.ParseInt(from, 10, 64)
		b, e2 := strconv.ParseInt(to, 10, 64)
		if e1 == nil && e2 == nil {
			filterPresets["startTime"] = map[string]string{
				"from": time.Unix(a, 0).Format(time.RFC3339),
				"to":   time.Unix(b, 0).Format(time.RFC3339),
			}
		}
	}

	return filterPresets
}
|
||||
|
||||
func SetupRoutes(router *mux.Router) {
|
||||
for _, route := range routes {
|
||||
route := route
|
||||
router.HandleFunc(route.Route, func(rw http.ResponseWriter, r *http.Request) {
|
||||
conf, err := config.GetUIConfig(r)
|
||||
if err != nil {
|
||||
http.Error(rw, err.Error(), http.StatusInternalServerError)
|
||||
return
|
||||
}
|
||||
|
||||
title := route.Title
|
||||
infos := route.Setup(map[string]interface{}{}, r)
|
||||
if id, ok := infos["id"]; ok {
|
||||
title = strings.Replace(route.Title, "<ID>", id.(string), 1)
|
||||
}
|
||||
|
||||
username, isAdmin := "", true
|
||||
if user := auth.GetUser(r.Context()); user != nil {
|
||||
username = user.Username
|
||||
isAdmin = user.HasRole(auth.RoleAdmin)
|
||||
}
|
||||
|
||||
page := templates.Page{
|
||||
Title: title,
|
||||
User: templates.User{Username: username, IsAdmin: isAdmin},
|
||||
Config: conf,
|
||||
Infos: infos,
|
||||
}
|
||||
|
||||
if route.Filter {
|
||||
page.FilterPresets = buildFilterPresets(r.URL.Query())
|
||||
}
|
||||
|
||||
templates.Render(rw, r, route.Template, &page)
|
||||
})
|
||||
}
|
||||
}
|
131
internal/runtimeEnv/setup.go
Normal file
131
internal/runtimeEnv/setup.go
Normal file
@@ -0,0 +1,131 @@
|
||||
package runtimeEnv
|
||||
|
||||
import (
|
||||
"bufio"
|
||||
"errors"
|
||||
"fmt"
|
||||
"os"
|
||||
"os/exec"
|
||||
"os/user"
|
||||
"strconv"
|
||||
"strings"
|
||||
"syscall"
|
||||
)
|
||||
|
||||
// unescapeEnvValue resolves the backslash escapes \n, \r, \t and \" in the
// content of a double-quoted .env value. Any other escape, or a backslash at
// the very end of the content, is reported as an error.
func unescapeEnvValue(quoted string) (string, error) {
	runes := []rune(quoted)
	sb := strings.Builder{}
	for i := 0; i < len(runes); i++ {
		if runes[i] != '\\' {
			sb.WriteRune(runes[i])
			continue
		}

		i++
		if i >= len(runes) {
			return "", errors.New("unsupported escape sequence in quoted string: trailing backslash")
		}
		switch runes[i] {
		case 'n':
			sb.WriteRune('\n')
		case 'r':
			sb.WriteRune('\r')
		case 't':
			sb.WriteRune('\t')
		case '"':
			sb.WriteRune('"')
		default:
			return "", fmt.Errorf("unsupported escape sequence in quoted string: backslash %q", runes[i])
		}
	}
	return sb.String(), nil
}

// LoadEnv is a very simple and limited .env file reader.
// All variable definitions found are directly added to the process
// environment.
//
// Supported syntax, one definition per line:
//   - empty lines and lines starting with '#' are skipped
//   - an optional leading "export " is stripped
//   - KEY=VALUE, with surrounding whitespace trimmed from both parts
//   - VALUE may be enclosed in double quotes, in which case the escapes
//     \n, \r, \t and \" are resolved
//
// A '#' anywhere else in a line, a line without '=', an unterminated or
// badly escaped quoted value, or a failing os.Setenv aborts loading with an
// error. Variables set by earlier lines stay set in that case.
func LoadEnv(file string) error {
	f, err := os.Open(file)
	if err != nil {
		return err
	}
	defer f.Close()

	s := bufio.NewScanner(f)
	for s.Scan() {
		line := s.Text()
		if strings.HasPrefix(line, "#") || len(line) == 0 {
			continue
		}

		if strings.Contains(line, "#") {
			return errors.New("'#' are only supported at the start of a line")
		}

		line = strings.TrimPrefix(line, "export ")
		parts := strings.SplitN(line, "=", 2)
		if len(parts) != 2 {
			return fmt.Errorf("unsupported line: %#v", line)
		}

		key := strings.TrimSpace(parts[0])
		val := strings.TrimSpace(parts[1])
		if strings.HasPrefix(val, "\"") {
			// The length check rejects a value that is just a single `"`,
			// which would otherwise count as both prefix and suffix.
			if len(val) < 2 || !strings.HasSuffix(val, "\"") {
				return fmt.Errorf("unsupported line: %#v", line)
			}

			val, err = unescapeEnvValue(val[1 : len(val)-1])
			if err != nil {
				return err
			}
		}

		if err := os.Setenv(key, val); err != nil {
			return fmt.Errorf("setting environment variable %q: %w", key, err)
		}
	}

	return s.Err()
}
|
||||
|
||||
// DropPrivileges changes the process's group and user to the ones given
// (as specified in the config.json). The group is switched first, then the
// user. An empty group or username leaves the respective id untouched.
// The Go runtime takes care of all threads (and not only the calling one)
// executing the underlying system call.
//
// An error is returned if a lookup fails, if the looked-up id is not a
// decimal number, or if the system call fails. Ids are never defaulted:
// an unparsable id must not silently turn into 0 (root).
func DropPrivileges(username string, group string) error {
	if group != "" {
		g, err := user.LookupGroup(group)
		if err != nil {
			return err
		}

		gid, err := strconv.Atoi(g.Gid)
		if err != nil {
			return fmt.Errorf("group %q has non-numeric gid %q: %w", group, g.Gid, err)
		}
		if err := syscall.Setgid(gid); err != nil {
			return err
		}
	}

	if username != "" {
		u, err := user.Lookup(username)
		if err != nil {
			return err
		}

		uid, err := strconv.Atoi(u.Uid)
		if err != nil {
			return fmt.Errorf("user %q has non-numeric uid %q: %w", username, u.Uid, err)
		}
		if err := syscall.Setuid(uid); err != nil {
			return err
		}
	}

	return nil
}
|
||||
|
||||
// SystemdNotifiy reports the service state to systemd by running the
// `systemd-notify` command with this process's pid:
// https://www.freedesktop.org/software/systemd/man/sd_notify.html
//
// It does nothing when the NOTIFY_SOCKET environment variable is empty
// (i.e. the process was not started using systemd). `--ready` is passed when
// ready is true, `--status=<status>` when status is non-empty.
func SystemdNotifiy(ready bool, status string) {
	if socket := os.Getenv("NOTIFY_SOCKET"); socket == "" {
		// Not started using systemd
		return
	}

	notifyArgs := make([]string, 0, 3)
	notifyArgs = append(notifyArgs, fmt.Sprintf("--pid=%d", os.Getpid()))
	if ready {
		notifyArgs = append(notifyArgs, "--ready")
	}
	if status != "" {
		notifyArgs = append(notifyArgs, fmt.Sprintf("--status=%s", status))
	}

	// errors ignored on purpose, there is not much to do anyways.
	exec.Command("systemd-notify", notifyArgs...).Run()
}
|
80
internal/templates/templates.go
Normal file
80
internal/templates/templates.go
Normal file
@@ -0,0 +1,80 @@
|
||||
package templates
|
||||
|
||||
import (
|
||||
"html/template"
|
||||
"net/http"
|
||||
"os"
|
||||
|
||||
"github.com/ClusterCockpit/cc-backend/internal/config"
|
||||
"github.com/ClusterCockpit/cc-backend/pkg/log"
|
||||
)
|
||||
|
||||
var templatesDir string
|
||||
var debugMode bool = os.Getenv("DEBUG") == "1"
|
||||
var templates map[string]*template.Template = map[string]*template.Template{}
|
||||
|
||||
// User carries what a template needs to know about the requesting user.
type User struct {
	// Username of the currently logged in user.
	Username string
	// IsAdmin tells the template whether to treat the user as an admin.
	IsAdmin bool
}
|
||||
|
||||
type Page struct {
|
||||
Title string // Page title
|
||||
Error string // For generic use (e.g. the exact error message on /login)
|
||||
Info string // For generic use (e.g. "Logout successfull" on /login)
|
||||
User User // Information about the currently logged in user
|
||||
Clusters []string // List of all clusters for use in the Header
|
||||
FilterPresets map[string]interface{} // For pages with the Filter component, this can be used to set initial filters.
|
||||
Infos map[string]interface{} // For generic use (e.g. username for /monitoring/user/<id>, job id for /monitoring/job/<id>)
|
||||
Config map[string]interface{} // UI settings for the currently logged in user (e.g. line width, ...)
|
||||
}
|
||||
|
||||
func init() {
|
||||
bp := "./"
|
||||
ebp := os.Getenv("BASEPATH")
|
||||
|
||||
if ebp != "" {
|
||||
bp = ebp
|
||||
}
|
||||
templatesDir = bp + "web/templates/"
|
||||
base := template.Must(template.ParseFiles(templatesDir + "base.tmpl"))
|
||||
files := []string{
|
||||
"home.tmpl", "404.tmpl", "login.tmpl",
|
||||
"imprint.tmpl", "privacy.tmpl",
|
||||
"config.tmpl",
|
||||
"monitoring/jobs.tmpl",
|
||||
"monitoring/job.tmpl",
|
||||
"monitoring/taglist.tmpl",
|
||||
"monitoring/list.tmpl",
|
||||
"monitoring/user.tmpl",
|
||||
"monitoring/systems.tmpl",
|
||||
"monitoring/status.tmpl",
|
||||
"monitoring/node.tmpl",
|
||||
"monitoring/analysis.tmpl",
|
||||
}
|
||||
|
||||
for _, file := range files {
|
||||
templates[file] = template.Must(template.Must(base.Clone()).ParseFiles(templatesDir + file))
|
||||
}
|
||||
}
|
||||
|
||||
func Render(rw http.ResponseWriter, r *http.Request, file string, page *Page) {
|
||||
t, ok := templates[file]
|
||||
if !ok {
|
||||
panic("templates must be predefinied!")
|
||||
}
|
||||
|
||||
if debugMode {
|
||||
t = template.Must(template.ParseFiles(templatesDir+"base.tmpl", templatesDir+file))
|
||||
}
|
||||
|
||||
if page.Clusters == nil {
|
||||
for _, c := range config.Clusters {
|
||||
page.Clusters = append(page.Clusters, c.Name)
|
||||
}
|
||||
}
|
||||
|
||||
if err := t.Execute(rw, page); err != nil {
|
||||
log.Errorf("template error: %s", err.Error())
|
||||
}
|
||||
}
|
Reference in New Issue
Block a user