Merge branch 'hotfix' of github.com:ClusterCockpit/cc-backend into hotfix

2026-03-11 05:06:26 +01:00
317 changed files with 32717 additions and 15040 deletions


@@ -23,47 +23,45 @@ import (
"github.com/ClusterCockpit/cc-backend/internal/auth"
"github.com/ClusterCockpit/cc-backend/internal/config"
"github.com/ClusterCockpit/cc-backend/internal/graph"
"github.com/ClusterCockpit/cc-backend/internal/metricDataDispatcher"
"github.com/ClusterCockpit/cc-backend/internal/metricdata"
"github.com/ClusterCockpit/cc-backend/internal/metricdispatch"
"github.com/ClusterCockpit/cc-backend/internal/repository"
"github.com/ClusterCockpit/cc-backend/pkg/archive"
ccconf "github.com/ClusterCockpit/cc-lib/ccConfig"
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
"github.com/ClusterCockpit/cc-lib/schema"
"github.com/gorilla/mux"
"github.com/ClusterCockpit/cc-backend/pkg/metricstore"
ccconf "github.com/ClusterCockpit/cc-lib/v2/ccConfig"
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
"github.com/ClusterCockpit/cc-lib/v2/schema"
"github.com/go-chi/chi/v5"
_ "github.com/mattn/go-sqlite3"
)
func setup(t *testing.T) *api.RestAPI {
repository.ResetConnection()
const testconfig = `{
"main": {
"addr": "0.0.0.0:8080",
"validate": false,
"apiAllowedIPs": [
"*"
]
},
"main": {
"addr": "0.0.0.0:8080",
"validate": false,
"api-allowed-ips": [
"*"
]
},
"metric-store": {
"checkpoints": {
"interval": "12h"
},
"retention-in-memory": "48h",
"memory-cap": 100
},
"archive": {
"kind": "file",
"path": "./var/job-archive"
"kind": "file",
"path": "./var/job-archive"
},
"auth": {
"jwts": {
"max-age": "2m"
"jwts": {
"max-age": "2m"
}
}
},
"clusters": [
{
"name": "testcluster",
"metricDataRepository": {"kind": "test", "url": "bla:8081"},
"filterRanges": {
"numNodes": { "from": 1, "to": 64 },
"duration": { "from": 0, "to": 86400 },
"startTime": { "from": "2022-01-01T00:00:00Z", "to": null }
}
}
]
}`
const testclusterJSON = `{
"name": "testcluster",
@@ -141,7 +139,7 @@ func setup(t *testing.T) *api.RestAPI {
}
dbfilepath := filepath.Join(tmpdir, "test.db")
err := repository.MigrateDB("sqlite3", dbfilepath)
err := repository.MigrateDB(dbfilepath)
if err != nil {
t.Fatal(err)
}
@@ -152,28 +150,23 @@ func setup(t *testing.T) *api.RestAPI {
}
ccconf.Init(cfgFilePath)
metricstore.MetricStoreHandle = &metricstore.InternalMetricStore{}
// Load and check main configuration
if cfg := ccconf.GetPackageConfig("main"); cfg != nil {
if clustercfg := ccconf.GetPackageConfig("clusters"); clustercfg != nil {
config.Init(cfg, clustercfg)
} else {
cclog.Abort("Cluster configuration must be present")
}
config.Init(cfg)
} else {
cclog.Abort("Main configuration must be present")
}
archiveCfg := fmt.Sprintf("{\"kind\": \"file\",\"path\": \"%s\"}", jobarchive)
repository.Connect("sqlite3", dbfilepath)
repository.Connect(dbfilepath)
if err := archive.Init(json.RawMessage(archiveCfg), config.Keys.DisableArchive); err != nil {
if err := archive.Init(json.RawMessage(archiveCfg)); err != nil {
t.Fatal(err)
}
if err := metricdata.Init(); err != nil {
t.Fatal(err)
}
// metricstore initialization removed - it's initialized via callback in tests
archiver.Start(repository.GetJobRepository(), context.Background())
@@ -190,11 +183,9 @@ func setup(t *testing.T) *api.RestAPI {
}
func cleanup() {
// Gracefully shut down the archiver with a timeout
if err := archiver.Shutdown(5 * time.Second); err != nil {
cclog.Warnf("Archiver shutdown timeout in tests: %v", err)
}
// TODO: Clear all caches, reset all modules, etc...
}
/*
@@ -221,16 +212,14 @@ func TestRestApi(t *testing.T) {
},
}
metricdata.TestLoadDataCallback = func(job *schema.Job, metrics []string, scopes []schema.MetricScope, ctx context.Context, resolution int) (schema.JobData, error) {
metricstore.TestLoadDataCallback = func(job *schema.Job, metrics []string, scopes []schema.MetricScope, ctx context.Context, resolution int) (schema.JobData, error) {
return testData, nil
}
r := mux.NewRouter()
r.PathPrefix("/api").Subrouter()
r.StrictSlash(true)
r := chi.NewRouter()
restapi.MountAPIRoutes(r)
var TestJobId int64 = 123
var TestJobID int64 = 123
TestClusterName := "testcluster"
var TestStartTime int64 = 123456789
@@ -280,7 +269,7 @@ func TestRestApi(t *testing.T) {
}
// resolver := graph.GetResolverInstance()
restapi.JobRepository.SyncJobs()
job, err := restapi.JobRepository.Find(&TestJobId, &TestClusterName, &TestStartTime)
job, err := restapi.JobRepository.Find(&TestJobID, &TestClusterName, &TestStartTime)
if err != nil {
t.Fatal(err)
}
@@ -338,7 +327,7 @@ func TestRestApi(t *testing.T) {
}
// Archiving happens asynchronously, will be completed in cleanup
job, err := restapi.JobRepository.Find(&TestJobId, &TestClusterName, &TestStartTime)
job, err := restapi.JobRepository.Find(&TestJobID, &TestClusterName, &TestStartTime)
if err != nil {
t.Fatal(err)
}
@@ -366,7 +355,7 @@ func TestRestApi(t *testing.T) {
}
t.Run("CheckArchive", func(t *testing.T) {
data, err := metricDataDispatcher.LoadData(stoppedJob, []string{"load_one"}, []schema.MetricScope{schema.MetricScopeNode}, context.Background(), 60)
data, err := metricdispatch.LoadData(stoppedJob, []string{"load_one"}, []schema.MetricScope{schema.MetricScopeNode}, context.Background(), 60)
if err != nil {
t.Fatal(err)
}
@@ -464,4 +453,198 @@ func TestRestApi(t *testing.T) {
if !ok {
t.Fatal("subtest failed")
}
t.Run("GetUsedNodesNoRunning", func(t *testing.T) {
contextUserValue := &schema.User{
Username: "testuser",
Projects: make([]string, 0),
Roles: []string{"api"},
AuthType: 0,
AuthSource: 2,
}
req := httptest.NewRequest(http.MethodGet, "/jobs/used_nodes?ts=123456790", nil)
recorder := httptest.NewRecorder()
ctx := context.WithValue(req.Context(), contextUserKey, contextUserValue)
r.ServeHTTP(recorder, req.WithContext(ctx))
response := recorder.Result()
if response.StatusCode != http.StatusOK {
t.Fatal(response.Status, recorder.Body.String())
}
var result api.GetUsedNodesAPIResponse
if err := json.NewDecoder(response.Body).Decode(&result); err != nil {
t.Fatal(err)
}
if result.UsedNodes == nil {
t.Fatal("expected usedNodes to be non-nil")
}
if len(result.UsedNodes) != 0 {
t.Fatalf("expected no used nodes for stopped jobs, got: %v", result.UsedNodes)
}
})
}
// TestStopJobWithReusedJobId verifies that stopping a recently started job works
// even when an older job with the same jobId exists in the job table (e.g. with
// state "failed"). This is a regression test for the bug where Find() on the job
// table would match the old job instead of the new one still in job_cache.
func TestStopJobWithReusedJobId(t *testing.T) {
restapi := setup(t)
t.Cleanup(cleanup)
testData := schema.JobData{
"load_one": map[schema.MetricScope]*schema.JobMetric{
schema.MetricScopeNode: {
Unit: schema.Unit{Base: "load"},
Timestep: 60,
Series: []schema.Series{
{
Hostname: "host123",
Statistics: schema.MetricStatistics{Min: 0.1, Avg: 0.2, Max: 0.3},
Data: []schema.Float{0.1, 0.1, 0.1, 0.2, 0.2, 0.2, 0.3, 0.3, 0.3},
},
},
},
},
}
metricstore.TestLoadDataCallback = func(job *schema.Job, metrics []string, scopes []schema.MetricScope, ctx context.Context, resolution int) (schema.JobData, error) {
return testData, nil
}
r := chi.NewRouter()
restapi.MountAPIRoutes(r)
const contextUserKey repository.ContextKey = "user"
contextUserValue := &schema.User{
Username: "testuser",
Projects: make([]string, 0),
Roles: []string{"user"},
AuthType: 0,
AuthSource: 2,
}
// Step 1: Start the first job (jobId=999)
const startJobBody1 string = `{
"jobId": 999,
"user": "testuser",
"project": "testproj",
"cluster": "testcluster",
"partition": "default",
"walltime": 3600,
"numNodes": 1,
"numHwthreads": 8,
"numAcc": 0,
"shared": "none",
"monitoringStatus": 1,
"smt": 1,
"resources": [{"hostname": "host123", "hwthreads": [0, 1, 2, 3, 4, 5, 6, 7]}],
"startTime": 200000000
}`
if ok := t.Run("StartFirstJob", func(t *testing.T) {
req := httptest.NewRequest(http.MethodPost, "/jobs/start_job/", bytes.NewBuffer([]byte(startJobBody1)))
recorder := httptest.NewRecorder()
ctx := context.WithValue(req.Context(), contextUserKey, contextUserValue)
r.ServeHTTP(recorder, req.WithContext(ctx))
if recorder.Result().StatusCode != http.StatusCreated {
t.Fatal(recorder.Result().Status, recorder.Body.String())
}
}); !ok {
return
}
// Step 2: Sync to move job from cache to job table, then stop it as "failed"
time.Sleep(1 * time.Second)
restapi.JobRepository.SyncJobs()
const stopJobBody1 string = `{
"jobId": 999,
"startTime": 200000000,
"cluster": "testcluster",
"jobState": "failed",
"stopTime": 200001000
}`
if ok := t.Run("StopFirstJobAsFailed", func(t *testing.T) {
req := httptest.NewRequest(http.MethodPost, "/jobs/stop_job/", bytes.NewBuffer([]byte(stopJobBody1)))
recorder := httptest.NewRecorder()
ctx := context.WithValue(req.Context(), contextUserKey, contextUserValue)
r.ServeHTTP(recorder, req.WithContext(ctx))
if recorder.Result().StatusCode != http.StatusOK {
t.Fatal(recorder.Result().Status, recorder.Body.String())
}
jobid, cluster := int64(999), "testcluster"
job, err := restapi.JobRepository.Find(&jobid, &cluster, nil)
if err != nil {
t.Fatal(err)
}
if job.State != schema.JobStateFailed {
t.Fatalf("expected first job to be failed, got: %s", job.State)
}
}); !ok {
return
}
// Wait for archiving to complete
time.Sleep(1 * time.Second)
// Step 3: Start a NEW job with the same jobId=999 but different startTime.
// This job will sit in job_cache (not yet synced).
const startJobBody2 string = `{
"jobId": 999,
"user": "testuser",
"project": "testproj",
"cluster": "testcluster",
"partition": "default",
"walltime": 3600,
"numNodes": 1,
"numHwthreads": 8,
"numAcc": 0,
"shared": "none",
"monitoringStatus": 1,
"smt": 1,
"resources": [{"hostname": "host123", "hwthreads": [0, 1, 2, 3, 4, 5, 6, 7]}],
"startTime": 300000000
}`
if ok := t.Run("StartSecondJob", func(t *testing.T) {
req := httptest.NewRequest(http.MethodPost, "/jobs/start_job/", bytes.NewBuffer([]byte(startJobBody2)))
recorder := httptest.NewRecorder()
ctx := context.WithValue(req.Context(), contextUserKey, contextUserValue)
r.ServeHTTP(recorder, req.WithContext(ctx))
if recorder.Result().StatusCode != http.StatusCreated {
t.Fatal(recorder.Result().Status, recorder.Body.String())
}
}); !ok {
return
}
// Step 4: Stop the second job WITHOUT syncing first.
// Before the fix, this would fail because Find() on the job table would
// match the old failed job (jobId=999) and reject with "already stopped".
const stopJobBody2 string = `{
"jobId": 999,
"startTime": 300000000,
"cluster": "testcluster",
"jobState": "completed",
"stopTime": 300001000
}`
t.Run("StopSecondJobBeforeSync", func(t *testing.T) {
req := httptest.NewRequest(http.MethodPost, "/jobs/stop_job/", bytes.NewBuffer([]byte(stopJobBody2)))
recorder := httptest.NewRecorder()
ctx := context.WithValue(req.Context(), contextUserKey, contextUserValue)
r.ServeHTTP(recorder, req.WithContext(ctx))
if recorder.Result().StatusCode != http.StatusOK {
t.Fatalf("expected stop to succeed for cached job, got: %s %s",
recorder.Result().Status, recorder.Body.String())
}
})
}


@@ -13,7 +13,7 @@ import (
"github.com/ClusterCockpit/cc-backend/internal/repository"
"github.com/ClusterCockpit/cc-backend/pkg/archive"
"github.com/ClusterCockpit/cc-lib/schema"
"github.com/ClusterCockpit/cc-lib/v2/schema"
)
// GetClustersAPIResponse model
@@ -27,7 +27,7 @@ type GetClustersAPIResponse struct {
// @description Get a list of all cluster configs. Specific cluster can be requested using query parameter.
// @produce json
// @param cluster query string false "Job Cluster"
// @success 200 {object} api.GetClustersApiResponse "Array of clusters"
// @success 200 {object} api.GetClustersAPIResponse "Array of clusters"
// @failure 400 {object} api.ErrorResponse "Bad Request"
// @failure 401 {object} api.ErrorResponse "Unauthorized"
// @failure 403 {object} api.ErrorResponse "Forbidden"
@@ -36,9 +36,9 @@ type GetClustersAPIResponse struct {
// @router /api/clusters/ [get]
func (api *RestAPI) getClusters(rw http.ResponseWriter, r *http.Request) {
if user := repository.GetUserFromContext(r.Context()); user != nil &&
!user.HasRole(schema.RoleApi) {
!user.HasRole(schema.RoleAPI) {
handleError(fmt.Errorf("missing role: %v", schema.GetRoleString(schema.RoleApi)), http.StatusForbidden, rw)
handleError(fmt.Errorf("missing role: %v", schema.GetRoleString(schema.RoleAPI)), http.StatusForbidden, rw)
return
}
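A minimal client-side sketch of calling the clusters endpoint documented above; the host address and the bearer-token auth header are assumptions about the deployment, not part of this diff:

package main

import (
	"fmt"
	"io"
	"net/http"
)

func main() {
	// Hypothetical server address and JWT; adjust for your deployment.
	req, err := http.NewRequest(http.MethodGet,
		"http://localhost:8080/api/clusters/?cluster=testcluster", nil)
	if err != nil {
		panic(err)
	}
	req.Header.Set("Authorization", "Bearer <JWT>")
	resp, err := http.DefaultClient.Do(req)
	if err != nil {
		panic(err)
	}
	defer resp.Body.Close()
	body, _ := io.ReadAll(resp.Body)
	fmt.Println(resp.Status, string(body))
}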

File diff suppressed because it is too large.


@@ -22,12 +22,12 @@ import (
"github.com/ClusterCockpit/cc-backend/internal/graph"
"github.com/ClusterCockpit/cc-backend/internal/graph/model"
"github.com/ClusterCockpit/cc-backend/internal/importer"
"github.com/ClusterCockpit/cc-backend/internal/metricDataDispatcher"
"github.com/ClusterCockpit/cc-backend/internal/metricdispatch"
"github.com/ClusterCockpit/cc-backend/internal/repository"
"github.com/ClusterCockpit/cc-backend/pkg/archive"
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
"github.com/ClusterCockpit/cc-lib/schema"
"github.com/gorilla/mux"
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
"github.com/ClusterCockpit/cc-lib/v2/schema"
"github.com/go-chi/chi/v5"
)
const (
@@ -72,6 +72,14 @@ type EditMetaRequest struct {
Value string `json:"value" example:"bash script"`
}
// JobMetaRequest model
type JobMetaRequest struct {
JobId *int64 `json:"jobId" validate:"required" example:"123000"` // Cluster Job ID of job
Cluster *string `json:"cluster" example:"fritz"` // Cluster of job
StartTime *int64 `json:"startTime" example:"1649723812"` // Start Time of job as epoch
Payload EditMetaRequest `json:"payload"` // Content to Add to Job Meta_Data
}
type TagJobAPIRequest []*APITag
type GetJobAPIRequest []string
@@ -104,7 +112,7 @@ type JobMetricWithName struct {
// @param items-per-page query int false "Items per page (Default: 25)"
// @param page query int false "Page Number (Default: 1)"
// @param with-metadata query bool false "Include metadata (e.g. jobScript) in response"
// @success 200 {object} api.GetJobsApiResponse "Job array and page info"
// @success 200 {object} api.GetJobsAPIResponse "Job array and page info"
// @failure 400 {object} api.ErrorResponse "Bad Request"
// @failure 401 {object} api.ErrorResponse "Unauthorized"
// @failure 403 {object} api.ErrorResponse "Forbidden"
@@ -232,7 +240,7 @@ func (api *RestAPI) getJobs(rw http.ResponseWriter, r *http.Request) {
// @produce json
// @param id path int true "Database ID of Job"
// @param all-metrics query bool false "Include all available metrics"
// @success 200 {object} api.GetJobApiResponse "Job resource"
// @success 200 {object} api.GetJobAPIResponse "Job resource"
// @failure 400 {object} api.ErrorResponse "Bad Request"
// @failure 401 {object} api.ErrorResponse "Unauthorized"
// @failure 403 {object} api.ErrorResponse "Forbidden"
@@ -243,17 +251,17 @@ func (api *RestAPI) getJobs(rw http.ResponseWriter, r *http.Request) {
// @router /api/jobs/{id} [get]
func (api *RestAPI) getCompleteJobByID(rw http.ResponseWriter, r *http.Request) {
// Fetch job from db
id, ok := mux.Vars(r)["id"]
id := chi.URLParam(r, "id")
var job *schema.Job
var err error
if ok {
if id != "" {
id, e := strconv.ParseInt(id, 10, 64)
if e != nil {
handleError(fmt.Errorf("integer expected in path for id: %w", e), http.StatusBadRequest, rw)
return
}
job, err = api.JobRepository.FindById(r.Context(), id) // Get Job from Repo by ID
job, err = api.JobRepository.FindByID(r.Context(), id) // Get Job from Repo by ID
} else {
handleError(fmt.Errorf("the parameter 'id' is required"), http.StatusBadRequest, rw)
return
@@ -293,7 +301,7 @@ func (api *RestAPI) getCompleteJobByID(rw http.ResponseWriter, r *http.Request)
}
if r.URL.Query().Get("all-metrics") == "true" {
data, err = metricDataDispatcher.LoadData(job, nil, scopes, r.Context(), resolution)
data, err = metricdispatch.LoadData(job, nil, scopes, r.Context(), resolution)
if err != nil {
cclog.Warnf("REST: error while loading all-metrics job data for JobID %d on %s", job.JobID, job.Cluster)
return
@@ -324,8 +332,8 @@ func (api *RestAPI) getCompleteJobByID(rw http.ResponseWriter, r *http.Request)
// @accept json
// @produce json
// @param id path int true "Database ID of Job"
// @param request body api.GetJobApiRequest true "Array of metric names"
// @success 200 {object} api.GetJobApiResponse "Job resource"
// @param request body api.GetJobAPIRequest true "Array of metric names"
// @success 200 {object} api.GetJobAPIResponse "Job resource"
// @failure 400 {object} api.ErrorResponse "Bad Request"
// @failure 401 {object} api.ErrorResponse "Unauthorized"
// @failure 403 {object} api.ErrorResponse "Forbidden"
@@ -336,17 +344,17 @@ func (api *RestAPI) getCompleteJobByID(rw http.ResponseWriter, r *http.Request)
// @router /api/jobs/{id} [post]
func (api *RestAPI) getJobByID(rw http.ResponseWriter, r *http.Request) {
// Fetch job from db
id, ok := mux.Vars(r)["id"]
id := chi.URLParam(r, "id")
var job *schema.Job
var err error
if ok {
if id != "" {
id, e := strconv.ParseInt(id, 10, 64)
if e != nil {
handleError(fmt.Errorf("integer expected in path for id: %w", e), http.StatusBadRequest, rw)
return
}
job, err = api.JobRepository.FindById(r.Context(), id)
job, err = api.JobRepository.FindByID(r.Context(), id)
} else {
handleError(errors.New("the parameter 'id' is required"), http.StatusBadRequest, rw)
return
@@ -389,7 +397,7 @@ func (api *RestAPI) getJobByID(rw http.ResponseWriter, r *http.Request) {
resolution = max(resolution, mc.Timestep)
}
data, err := metricDataDispatcher.LoadData(job, metrics, scopes, r.Context(), resolution)
data, err := metricdispatch.LoadData(job, metrics, scopes, r.Context(), resolution)
if err != nil {
cclog.Warnf("REST: error while loading job data for JobID %d on %s", job.JobID, job.Cluster)
return
@@ -423,29 +431,29 @@ func (api *RestAPI) getJobByID(rw http.ResponseWriter, r *http.Request) {
}
// editMeta godoc
// @summary Edit meta-data json
// @summary Edit meta-data json of job identified by database id
// @tags Job add and modify
// @description Edit key value pairs in job metadata json
// @description Edit key value pairs in job metadata json of job specified by database id
// @description If a key already exists its content will be overwritten
// @accept json
// @produce json
// @param id path int true "Job Database ID"
// @param request body api.EditMetaRequest true "Kay value pair to add"
// @param request body api.EditMetaRequest true "Metadata Key value pair to add or update"
// @success 200 {object} schema.Job "Updated job resource"
// @failure 400 {object} api.ErrorResponse "Bad Request"
// @failure 401 {object} api.ErrorResponse "Unauthorized"
// @failure 404 {object} api.ErrorResponse "Job does not exist"
// @failure 500 {object} api.ErrorResponse "Internal Server Error"
// @security ApiKeyAuth
// @router /api/jobs/edit_meta/{id} [post]
// @router /api/jobs/edit_meta/{id} [patch]
func (api *RestAPI) editMeta(rw http.ResponseWriter, r *http.Request) {
id, err := strconv.ParseInt(mux.Vars(r)["id"], 10, 64)
id, err := strconv.ParseInt(chi.URLParam(r, "id"), 10, 64)
if err != nil {
handleError(fmt.Errorf("parsing job ID failed: %w", err), http.StatusBadRequest, rw)
return
}
job, err := api.JobRepository.FindById(r.Context(), id)
job, err := api.JobRepository.FindByID(r.Context(), id)
if err != nil {
handleError(fmt.Errorf("finding job failed: %w", err), http.StatusNotFound, rw)
return
@@ -469,6 +477,54 @@ func (api *RestAPI) editMeta(rw http.ResponseWriter, r *http.Request) {
}
}
// editMetaByRequest godoc
// @summary Edit meta-data json of job identified by request
// @tags Job add and modify
// @description Edit key value pairs in metadata json of job specified by jobId, startTime and cluster
// @description If a key already exists its content will be overwritten
// @accept json
// @produce json
// @param request body api.JobMetaRequest true "Specifies job and payload to add or update"
// @success 200 {object} schema.Job "Updated job resource"
// @failure 400 {object} api.ErrorResponse "Bad Request"
// @failure 401 {object} api.ErrorResponse "Unauthorized"
// @failure 404 {object} api.ErrorResponse "Job does not exist"
// @failure 500 {object} api.ErrorResponse "Internal Server Error"
// @security ApiKeyAuth
// @router /api/jobs/edit_meta/ [patch]
func (api *RestAPI) editMetaByRequest(rw http.ResponseWriter, r *http.Request) {
// Parse request body
req := JobMetaRequest{}
if err := decode(r.Body, &req); err != nil {
handleError(fmt.Errorf("parsing request body failed: %w", err), http.StatusBadRequest, rw)
return
}
// Fetch job (that will have its meta_data edited) from db
var job *schema.Job
var err error
if req.JobId == nil {
handleError(errors.New("the field 'jobId' is required"), http.StatusBadRequest, rw)
return
}
// log.Printf("loading db job for editMetaByRequest... : JobMetaRequest=%v", req)
job, err = api.JobRepository.Find(req.JobId, req.Cluster, req.StartTime)
if err != nil {
handleError(fmt.Errorf("finding job failed: %w", err), http.StatusUnprocessableEntity, rw)
return
}
if err := api.JobRepository.UpdateMetadata(job, req.Payload.Key, req.Payload.Value); err != nil {
http.Error(rw, err.Error(), http.StatusInternalServerError)
return
}
rw.Header().Add("Content-Type", "application/json")
rw.WriteHeader(http.StatusOK)
json.NewEncoder(rw).Encode(job)
}
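For illustration, a hedged sketch of a request against the new PATCH route. The jobId/cluster/startTime values are the examples from the JobMetaRequest struct tags above; the "key" field name inside payload is inferred from req.Payload.Key, and host/token are placeholders:

package main

import (
	"fmt"
	"net/http"
	"strings"
)

func main() {
	// Field names follow JobMetaRequest/EditMetaRequest above; "key" is inferred.
	body := `{
	  "jobId": 123000,
	  "cluster": "fritz",
	  "startTime": 1649723812,
	  "payload": {"key": "jobScript", "value": "bash script"}
	}`
	req, err := http.NewRequest(http.MethodPatch,
		"http://localhost:8080/api/jobs/edit_meta/", strings.NewReader(body))
	if err != nil {
		panic(err)
	}
	req.Header.Set("Content-Type", "application/json")
	req.Header.Set("Authorization", "Bearer <JWT>")
	resp, err := http.DefaultClient.Do(req)
	if err != nil {
		panic(err)
	}
	defer resp.Body.Close()
	fmt.Println(resp.Status)
}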
// tagJob godoc
// @summary Adds one or more tags to a job
// @tags Job add and modify
@@ -478,7 +534,7 @@ func (api *RestAPI) editMeta(rw http.ResponseWriter, r *http.Request) {
// @accept json
// @produce json
// @param id path int true "Job Database ID"
// @param request body api.TagJobApiRequest true "Array of tag-objects to add"
// @param request body api.TagJobAPIRequest true "Array of tag-objects to add"
// @success 200 {object} schema.Job "Updated job resource"
// @failure 400 {object} api.ErrorResponse "Bad Request"
// @failure 401 {object} api.ErrorResponse "Unauthorized"
@@ -487,13 +543,13 @@ func (api *RestAPI) editMeta(rw http.ResponseWriter, r *http.Request) {
// @security ApiKeyAuth
// @router /api/jobs/tag_job/{id} [post]
func (api *RestAPI) tagJob(rw http.ResponseWriter, r *http.Request) {
id, err := strconv.ParseInt(mux.Vars(r)["id"], 10, 64)
id, err := strconv.ParseInt(chi.URLParam(r, "id"), 10, 64)
if err != nil {
handleError(fmt.Errorf("parsing job ID failed: %w", err), http.StatusBadRequest, rw)
return
}
job, err := api.JobRepository.FindById(r.Context(), id)
job, err := api.JobRepository.FindByID(r.Context(), id)
if err != nil {
handleError(fmt.Errorf("finding job failed: %w", err), http.StatusNotFound, rw)
return
@@ -542,7 +598,7 @@ func (api *RestAPI) tagJob(rw http.ResponseWriter, r *http.Request) {
// @accept json
// @produce json
// @param id path int true "Job Database ID"
// @param request body api.TagJobApiRequest true "Array of tag-objects to remove"
// @param request body api.TagJobAPIRequest true "Array of tag-objects to remove"
// @success 200 {object} schema.Job "Updated job resource"
// @failure 400 {object} api.ErrorResponse "Bad Request"
// @failure 401 {object} api.ErrorResponse "Unauthorized"
@@ -551,13 +607,13 @@ func (api *RestAPI) tagJob(rw http.ResponseWriter, r *http.Request) {
// @security ApiKeyAuth
// @router /jobs/tag_job/{id} [delete]
func (api *RestAPI) removeTagJob(rw http.ResponseWriter, r *http.Request) {
id, err := strconv.ParseInt(mux.Vars(r)["id"], 10, 64)
id, err := strconv.ParseInt(chi.URLParam(r, "id"), 10, 64)
if err != nil {
handleError(fmt.Errorf("parsing job ID failed: %w", err), http.StatusBadRequest, rw)
return
}
job, err := api.JobRepository.FindById(r.Context(), id)
job, err := api.JobRepository.FindByID(r.Context(), id)
if err != nil {
handleError(fmt.Errorf("finding job failed: %w", err), http.StatusNotFound, rw)
return
@@ -606,7 +662,7 @@ func (api *RestAPI) removeTagJob(rw http.ResponseWriter, r *http.Request) {
// @description Tags will be removed from the respective archive files.
// @accept json
// @produce plain
// @param request body api.TagJobApiRequest true "Array of tag-objects to remove"
// @param request body api.TagJobAPIRequest true "Array of tag-objects to remove"
// @success 200 {string} string "Success Response"
// @failure 400 {object} api.ErrorResponse "Bad Request"
// @failure 401 {object} api.ErrorResponse "Unauthorized"
@@ -650,7 +706,7 @@ func (api *RestAPI) removeTags(rw http.ResponseWriter, r *http.Request) {
// @accept json
// @produce json
// @param request body schema.Job true "Job to add"
// @success 201 {object} api.DefaultApiResponse "Job added successfully"
// @success 201 {object} api.DefaultAPIResponse "Job added successfully"
// @failure 400 {object} api.ErrorResponse "Bad Request"
// @failure 401 {object} api.ErrorResponse "Unauthorized"
// @failure 403 {object} api.ErrorResponse "Forbidden"
@@ -691,13 +747,21 @@ func (api *RestAPI) startJob(rw http.ResponseWriter, r *http.Request) {
for _, job := range jobs {
// Check if jobs are within the same day (prevent duplicates)
if (req.StartTime - job.StartTime) < secondsPerDay {
handleError(fmt.Errorf("a job with that jobId, cluster and startTime already exists: dbid: %d, jobid: %d", job.ID, job.JobID), http.StatusUnprocessableEntity, rw)
handleError(fmt.Errorf("a job with that jobId, cluster and startTime already exists: dbid: %d, jobid: %d", *job.ID, job.JobID), http.StatusUnprocessableEntity, rw)
return
}
}
}
id, err := api.JobRepository.Start(&req)
// When tags are present, insert directly into the job table so that the
// returned ID can be used with AddTagOrCreate (which queries the job table).
// Jobs without tags use the cache path as before.
var id int64
if len(req.Tags) > 0 {
id, err = api.JobRepository.StartDirect(&req)
} else {
id, err = api.JobRepository.Start(&req)
}
if err != nil {
handleError(fmt.Errorf("insert into database failed: %w", err), http.StatusInternalServerError, rw)
return
@@ -728,7 +792,7 @@ func (api *RestAPI) startJob(rw http.ResponseWriter, r *http.Request) {
// @description Job to stop is specified by request body. All fields are required in this case.
// @description Returns full job resource information according to the 'Job' schema.
// @produce json
// @param request body api.StopJobApiRequest true "All fields required"
// @param request body api.StopJobAPIRequest true "All fields required"
// @success 200 {object} schema.Job "Success message"
// @failure 400 {object} api.ErrorResponse "Bad Request"
// @failure 401 {object} api.ErrorResponse "Unauthorized"
@@ -754,20 +818,20 @@ func (api *RestAPI) stopJobByRequest(rw http.ResponseWriter, r *http.Request) {
return
}
// cclog.Printf("loading db job for stopJobByRequest... : stopJobApiRequest=%v", req)
job, err = api.JobRepository.Find(req.JobID, req.Cluster, req.StartTime)
isCached := false
job, err = api.JobRepository.FindCached(req.JobID, req.Cluster, req.StartTime)
if err != nil {
// Try cached jobs if not found in main repository
cachedJob, cachedErr := api.JobRepository.FindCached(req.JobID, req.Cluster, req.StartTime)
if cachedErr != nil {
// Combine both errors for better debugging
handleError(fmt.Errorf("finding job failed: %w (cached lookup also failed: %v)", err, cachedErr), http.StatusNotFound, rw)
// Not in cache, try main job table
job, err = api.JobRepository.Find(req.JobID, req.Cluster, req.StartTime)
if err != nil {
handleError(fmt.Errorf("finding job failed: %w", err), http.StatusNotFound, rw)
return
}
job = cachedJob
} else {
isCached = true
}
api.checkAndHandleStopJob(rw, job, req)
api.checkAndHandleStopJob(rw, job, req, isCached)
}
// deleteJobByID godoc
@@ -776,7 +840,7 @@ func (api *RestAPI) stopJobByRequest(rw http.ResponseWriter, r *http.Request) {
// @description Job to remove is specified by database ID. This will not remove the job from the job archive.
// @produce json
// @param id path int true "Database ID of Job"
// @success 200 {object} api.DefaultApiResponse "Success message"
// @success 200 {object} api.DefaultAPIResponse "Success message"
// @failure 400 {object} api.ErrorResponse "Bad Request"
// @failure 401 {object} api.ErrorResponse "Unauthorized"
// @failure 403 {object} api.ErrorResponse "Forbidden"
@@ -787,16 +851,16 @@ func (api *RestAPI) stopJobByRequest(rw http.ResponseWriter, r *http.Request) {
// @router /api/jobs/delete_job/{id} [delete]
func (api *RestAPI) deleteJobByID(rw http.ResponseWriter, r *http.Request) {
// Fetch job (that will be deleted) from db
id, ok := mux.Vars(r)["id"]
id := chi.URLParam(r, "id")
var err error
if ok {
if id != "" {
id, e := strconv.ParseInt(id, 10, 64)
if e != nil {
handleError(fmt.Errorf("integer expected in path for id: %w", e), http.StatusBadRequest, rw)
return
}
err = api.JobRepository.DeleteJobById(id)
err = api.JobRepository.DeleteJobByID(id)
} else {
handleError(errors.New("the parameter 'id' is required"), http.StatusBadRequest, rw)
return
@@ -820,8 +884,8 @@ func (api *RestAPI) deleteJobByID(rw http.ResponseWriter, r *http.Request) {
// @description Job to delete is specified by request body. All fields are required in this case.
// @accept json
// @produce json
// @param request body api.DeleteJobApiRequest true "All fields required"
// @success 200 {object} api.DefaultApiResponse "Success message"
// @param request body api.DeleteJobAPIRequest true "All fields required"
// @success 200 {object} api.DefaultAPIResponse "Success message"
// @failure 400 {object} api.ErrorResponse "Bad Request"
// @failure 401 {object} api.ErrorResponse "Unauthorized"
// @failure 403 {object} api.ErrorResponse "Forbidden"
@@ -852,7 +916,7 @@ func (api *RestAPI) deleteJobByRequest(rw http.ResponseWriter, r *http.Request)
return
}
err = api.JobRepository.DeleteJobById(*job.ID)
err = api.JobRepository.DeleteJobByID(*job.ID)
if err != nil {
handleError(fmt.Errorf("deleting job failed: %w", err), http.StatusUnprocessableEntity, rw)
return
@@ -861,7 +925,7 @@ func (api *RestAPI) deleteJobByRequest(rw http.ResponseWriter, r *http.Request)
rw.Header().Add("Content-Type", "application/json")
rw.WriteHeader(http.StatusOK)
if err := json.NewEncoder(rw).Encode(DefaultAPIResponse{
Message: fmt.Sprintf("Successfully deleted job %d", job.ID),
Message: fmt.Sprintf("Successfully deleted job %d", *job.ID),
}); err != nil {
cclog.Errorf("Failed to encode response: %v", err)
}
@@ -873,7 +937,7 @@ func (api *RestAPI) deleteJobByRequest(rw http.ResponseWriter, r *http.Request)
// @description Remove all jobs with start time before timestamp. The jobs will not be removed from the job archive.
// @produce json
// @param ts path int true "Unix epoch timestamp"
// @success 200 {object} api.DefaultApiResponse "Success message"
// @success 200 {object} api.DefaultAPIResponse "Success message"
// @failure 400 {object} api.ErrorResponse "Bad Request"
// @failure 401 {object} api.ErrorResponse "Unauthorized"
// @failure 403 {object} api.ErrorResponse "Forbidden"
@@ -886,9 +950,9 @@ func (api *RestAPI) deleteJobByRequest(rw http.ResponseWriter, r *http.Request)
func (api *RestAPI) deleteJobBefore(rw http.ResponseWriter, r *http.Request) {
var cnt int
// Parse cutoff timestamp from the URL path
id, ok := mux.Vars(r)["ts"]
id := chi.URLParam(r, "ts")
var err error
if ok {
if id != "" {
ts, e := strconv.ParseInt(id, 10, 64)
if e != nil {
handleError(fmt.Errorf("integer expected in path for ts: %w", e), http.StatusBadRequest, rw)
@@ -896,11 +960,13 @@ func (api *RestAPI) deleteJobBefore(rw http.ResponseWriter, r *http.Request) {
}
// Check for omit-tagged query parameter
omitTagged := false
omitTagged := "none"
if omitTaggedStr := r.URL.Query().Get("omit-tagged"); omitTaggedStr != "" {
omitTagged, e = strconv.ParseBool(omitTaggedStr)
if e != nil {
handleError(fmt.Errorf("boolean expected for omit-tagged parameter: %w", e), http.StatusBadRequest, rw)
switch omitTaggedStr {
case "none", "all", "user":
omitTagged = omitTaggedStr
default:
handleError(fmt.Errorf("omit-tagged must be one of: none, all, user"), http.StatusBadRequest, rw)
return
}
}
@@ -924,20 +990,20 @@ func (api *RestAPI) deleteJobBefore(rw http.ResponseWriter, r *http.Request) {
}
}
func (api *RestAPI) checkAndHandleStopJob(rw http.ResponseWriter, job *schema.Job, req StopJobAPIRequest) {
func (api *RestAPI) checkAndHandleStopJob(rw http.ResponseWriter, job *schema.Job, req StopJobAPIRequest, isCached bool) {
// Sanity checks
if job.State != schema.JobStateRunning {
handleError(fmt.Errorf("jobId %d (id %d) on %s : job has already been stopped (state is: %s)", job.JobID, job.ID, job.Cluster, job.State), http.StatusUnprocessableEntity, rw)
handleError(fmt.Errorf("jobId %d (id %d) on %s : job has already been stopped (state is: %s)", job.JobID, *job.ID, job.Cluster, job.State), http.StatusUnprocessableEntity, rw)
return
}
if job.StartTime > req.StopTime {
handleError(fmt.Errorf("jobId %d (id %d) on %s : stopTime %d must be larger/equal than startTime %d", job.JobID, job.ID, job.Cluster, req.StopTime, job.StartTime), http.StatusBadRequest, rw)
handleError(fmt.Errorf("jobId %d (id %d) on %s : stopTime %d must be larger/equal than startTime %d", job.JobID, *job.ID, job.Cluster, req.StopTime, job.StartTime), http.StatusBadRequest, rw)
return
}
if req.State != "" && !req.State.Valid() {
handleError(fmt.Errorf("jobId %d (id %d) on %s : invalid requested job state: %#v", job.JobID, job.ID, job.Cluster, req.State), http.StatusBadRequest, rw)
handleError(fmt.Errorf("jobId %d (id %d) on %s : invalid requested job state: %#v", job.JobID, *job.ID, job.Cluster, req.State), http.StatusBadRequest, rw)
return
} else if req.State == "" {
req.State = schema.JobStateCompleted
@@ -949,14 +1015,24 @@ func (api *RestAPI) checkAndHandleStopJob(rw http.ResponseWriter, job *schema.Jo
api.JobRepository.Mutex.Lock()
defer api.JobRepository.Mutex.Unlock()
if err := api.JobRepository.Stop(*job.ID, job.Duration, job.State, job.MonitoringStatus); err != nil {
if err := api.JobRepository.StopCached(*job.ID, job.Duration, job.State, job.MonitoringStatus); err != nil {
handleError(fmt.Errorf("jobId %d (id %d) on %s : marking job as '%s' (duration: %d) in DB failed: %w", job.JobID, job.ID, job.Cluster, job.State, job.Duration, err), http.StatusInternalServerError, rw)
// If the job is still in job_cache, transfer it to the job table first
// so that job.ID always points to the job table for downstream code
if isCached {
newID, err := api.JobRepository.TransferCachedJobToMain(*job.ID)
if err != nil {
handleError(fmt.Errorf("jobId %d (id %d) on %s : transferring cached job failed: %w", job.JobID, *job.ID, job.Cluster, err), http.StatusInternalServerError, rw)
return
}
cclog.Infof("transferred cached job to main table: old id %d -> new id %d (jobId=%d)", *job.ID, newID, job.JobID)
job.ID = &newID
}
cclog.Infof("archiving job... (dbid: %d): cluster=%s, jobId=%d, user=%s, startTime=%d, duration=%d, state=%s", job.ID, job.Cluster, job.JobID, job.User, job.StartTime, job.Duration, job.State)
if err := api.JobRepository.Stop(*job.ID, job.Duration, job.State, job.MonitoringStatus); err != nil {
handleError(fmt.Errorf("jobId %d (id %d) on %s : marking job as '%s' (duration: %d) in DB failed: %w", job.JobID, *job.ID, job.Cluster, job.State, job.Duration, err), http.StatusInternalServerError, rw)
return
}
cclog.Infof("archiving job... (dbid: %d): cluster=%s, jobId=%d, user=%s, startTime=%d, duration=%d, state=%s", *job.ID, job.Cluster, job.JobID, job.User, job.StartTime, job.Duration, job.State)
// Send a response (with status OK). This means that errors that happen from here on forward
// can *NOT* be communicated to the client. If reading from a MetricDataRepository or
@@ -977,7 +1053,7 @@ func (api *RestAPI) checkAndHandleStopJob(rw http.ResponseWriter, job *schema.Jo
}
func (api *RestAPI) getJobMetrics(rw http.ResponseWriter, r *http.Request) {
id := mux.Vars(r)["id"]
id := chi.URLParam(r, "id")
metrics := r.URL.Query()["metric"]
var scopes []schema.MetricScope
for _, scope := range r.URL.Query()["scope"] {
@@ -1022,3 +1098,57 @@ func (api *RestAPI) getJobMetrics(rw http.ResponseWriter, r *http.Request) {
cclog.Errorf("Failed to encode response: %v", err)
}
}
// GetUsedNodesAPIResponse model
type GetUsedNodesAPIResponse struct {
UsedNodes map[string][]string `json:"usedNodes"` // Map of cluster names to lists of used node hostnames
}
// getUsedNodes godoc
// @summary Lists used nodes by cluster
// @tags Job query
// @description Get a map of cluster names to lists of unique hostnames that are currently in use by running jobs that started before the specified timestamp.
// @produce json
// @param ts query int true "Unix timestamp to filter jobs (jobs with start_time < ts)"
// @success 200 {object} api.GetUsedNodesAPIResponse "Map of cluster names to hostname lists"
// @failure 400 {object} api.ErrorResponse "Bad Request"
// @failure 401 {object} api.ErrorResponse "Unauthorized"
// @failure 403 {object} api.ErrorResponse "Forbidden"
// @failure 500 {object} api.ErrorResponse "Internal Server Error"
// @security ApiKeyAuth
// @router /api/jobs/used_nodes [get]
func (api *RestAPI) getUsedNodes(rw http.ResponseWriter, r *http.Request) {
if user := repository.GetUserFromContext(r.Context()); user != nil &&
!user.HasRole(schema.RoleAPI) {
handleError(fmt.Errorf("missing role: %v", schema.GetRoleString(schema.RoleAPI)), http.StatusForbidden, rw)
return
}
tsStr := r.URL.Query().Get("ts")
if tsStr == "" {
handleError(fmt.Errorf("missing required query parameter: ts"), http.StatusBadRequest, rw)
return
}
ts, err := strconv.ParseInt(tsStr, 10, 64)
if err != nil {
handleError(fmt.Errorf("invalid timestamp format: %w", err), http.StatusBadRequest, rw)
return
}
usedNodes, err := api.JobRepository.GetUsedNodes(ts)
if err != nil {
handleError(fmt.Errorf("failed to get used nodes: %w", err), http.StatusInternalServerError, rw)
return
}
rw.Header().Add("Content-Type", "application/json")
payload := GetUsedNodesAPIResponse{
UsedNodes: usedNodes,
}
if err := json.NewEncoder(rw).Encode(payload); err != nil {
handleError(err, http.StatusInternalServerError, rw)
return
}
}
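A short sketch of querying the new used_nodes endpoint and decoding its response into the shape of GetUsedNodesAPIResponse above; host and token are placeholders:

package main

import (
	"encoding/json"
	"fmt"
	"net/http"
)

func main() {
	req, err := http.NewRequest(http.MethodGet,
		"http://localhost:8080/api/jobs/used_nodes?ts=1700000000", nil)
	if err != nil {
		panic(err)
	}
	req.Header.Set("Authorization", "Bearer <JWT>")
	resp, err := http.DefaultClient.Do(req)
	if err != nil {
		panic(err)
	}
	defer resp.Body.Close()
	// Mirrors GetUsedNodesAPIResponse: cluster name -> list of hostnames.
	var result struct {
		UsedNodes map[string][]string `json:"usedNodes"`
	}
	if err := json.NewDecoder(resp.Body).Decode(&result); err != nil {
		panic(err)
	}
	for cluster, hosts := range result.UsedNodes {
		fmt.Println(cluster, hosts)
	}
}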

internal/api/log.go (new file, 165 lines)

@@ -0,0 +1,165 @@
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved. This file is part of cc-backend.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
package api
import (
"bufio"
"encoding/json"
"fmt"
"net/http"
"os/exec"
"regexp"
"strconv"
"strings"
"github.com/ClusterCockpit/cc-backend/internal/config"
"github.com/ClusterCockpit/cc-backend/internal/repository"
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
"github.com/ClusterCockpit/cc-lib/v2/schema"
)
type LogEntry struct {
Timestamp string `json:"timestamp"`
Priority int `json:"priority"`
Message string `json:"message"`
Unit string `json:"unit"`
}
var safePattern = regexp.MustCompile(`^[a-zA-Z0-9 :\-\.]+$`)
func (api *RestAPI) getJournalLog(rw http.ResponseWriter, r *http.Request) {
user := repository.GetUserFromContext(r.Context())
if !user.HasRole(schema.RoleAdmin) {
handleError(fmt.Errorf("only admins are allowed to view logs"), http.StatusForbidden, rw)
return
}
since := r.URL.Query().Get("since")
if since == "" {
since = "1 hour ago"
}
if !safePattern.MatchString(since) {
handleError(fmt.Errorf("invalid 'since' parameter"), http.StatusBadRequest, rw)
return
}
lines := 200
if l := r.URL.Query().Get("lines"); l != "" {
n, err := strconv.Atoi(l)
if err != nil || n < 1 {
handleError(fmt.Errorf("invalid 'lines' parameter"), http.StatusBadRequest, rw)
return
}
if n > 1000 {
n = 1000
}
lines = n
}
unit := config.Keys.SystemdUnit
if unit == "" {
unit = "clustercockpit.service"
}
args := []string{
"--output=json",
"--no-pager",
"-n", fmt.Sprintf("%d", lines),
"--since", since,
"-u", unit,
}
if level := r.URL.Query().Get("level"); level != "" {
n, err := strconv.Atoi(level)
if err != nil || n < 0 || n > 7 {
handleError(fmt.Errorf("invalid 'level' parameter (must be 0-7)"), http.StatusBadRequest, rw)
return
}
args = append(args, "--priority", fmt.Sprintf("%d", n))
}
if search := r.URL.Query().Get("search"); search != "" {
if !safePattern.MatchString(search) {
handleError(fmt.Errorf("invalid 'search' parameter"), http.StatusBadRequest, rw)
return
}
args = append(args, "--grep", search)
}
cclog.Debugf("calling journalctl with %s", strings.Join(args, " "))
cmd := exec.CommandContext(r.Context(), "journalctl", args...)
stdout, err := cmd.StdoutPipe()
if err != nil {
handleError(fmt.Errorf("failed to create pipe: %w", err), http.StatusInternalServerError, rw)
return
}
if err := cmd.Start(); err != nil {
handleError(fmt.Errorf("failed to start journalctl: %w", err), http.StatusInternalServerError, rw)
return
}
entries := make([]LogEntry, 0, lines)
scanner := bufio.NewScanner(stdout)
for scanner.Scan() {
var raw map[string]any
if err := json.Unmarshal(scanner.Bytes(), &raw); err != nil {
cclog.Debugf("error unmarshal log output: %v", err)
continue
}
priority := 6 // default info
if p, ok := raw["PRIORITY"]; ok {
switch v := p.(type) {
case string:
if n, err := strconv.Atoi(v); err == nil {
priority = n
}
case float64:
priority = int(v)
}
}
msg := ""
if m, ok := raw["MESSAGE"]; ok {
if s, ok := m.(string); ok {
msg = s
}
}
ts := ""
if t, ok := raw["__REALTIME_TIMESTAMP"]; ok {
if s, ok := t.(string); ok {
ts = s
}
}
unitName := ""
if u, ok := raw["_SYSTEMD_UNIT"]; ok {
if s, ok := u.(string); ok {
unitName = s
}
}
entries = append(entries, LogEntry{
Timestamp: ts,
Priority: priority,
Message: msg,
Unit: unitName,
})
}
if err := cmd.Wait(); err != nil {
// journalctl returns exit code 1 when --grep matches nothing
if len(entries) == 0 {
cclog.Debugf("journalctl exited with: %v", err)
}
}
rw.Header().Set("Content-Type", "application/json")
if err := json.NewEncoder(rw).Encode(entries); err != nil {
cclog.Errorf("Failed to encode log entries: %v", err)
}
}
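To make the argument assembly above concrete: for a hypothetical request with since=2026-03-10, lines=100, level=3 and search=error, the handler spawns roughly this command (a sketch reproducing the same args slice):

package main

import (
	"fmt"
	"strings"
)

func main() {
	// Mirrors the args built in getJournalLog for the sample query above.
	args := []string{
		"--output=json",
		"--no-pager",
		"-n", "100",
		"--since", "2026-03-10",
		"-u", "clustercockpit.service", // default when config.Keys.SystemdUnit is empty
		"--priority", "3",
		"--grep", "error",
	}
	fmt.Println("journalctl " + strings.Join(args, " "))
}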


@@ -10,15 +10,14 @@ import (
"encoding/json"
"errors"
"fmt"
"io"
"net/http"
"strconv"
"strings"
"github.com/ClusterCockpit/cc-backend/internal/memorystore"
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
"github.com/ClusterCockpit/cc-backend/pkg/metricstore"
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
"github.com/influxdata/line-protocol/v2/lineprotocol"
"github.com/ClusterCockpit/cc-line-protocol/v2/lineprotocol"
)
// handleFree godoc
@@ -58,7 +57,7 @@ func freeMetrics(rw http.ResponseWriter, r *http.Request) {
return
}
ms := memorystore.GetMemoryStore()
ms := metricstore.GetMemoryStore()
n := 0
for _, sel := range selectors {
bn, err := ms.Free(sel, to)
@@ -90,16 +89,17 @@ func freeMetrics(rw http.ResponseWriter, r *http.Request) {
// @security ApiKeyAuth
// @router /write/ [post]
func writeMetrics(rw http.ResponseWriter, r *http.Request) {
bytes, err := io.ReadAll(r.Body)
rw.Header().Add("Content-Type", "application/json")
if err != nil {
handleError(err, http.StatusInternalServerError, rw)
return
}
ms := memorystore.GetMemoryStore()
dec := lineprotocol.NewDecoderWithBytes(bytes)
if err := memorystore.DecodeLine(dec, ms, r.URL.Query().Get("cluster")); err != nil {
// Extract the "cluster" query parameter without allocating a url.Values map.
cluster := queryParam(r.URL.RawQuery, "cluster")
// Stream directly from the request body instead of copying it into a
// temporary buffer via io.ReadAll. The line-protocol decoder supports
// io.Reader natively, so this avoids the largest heap allocation.
ms := metricstore.GetMemoryStore()
dec := lineprotocol.NewDecoder(r.Body)
if err := metricstore.DecodeLine(dec, ms, cluster); err != nil {
cclog.Errorf("/api/write error: %s", err.Error())
handleError(err, http.StatusBadRequest, rw)
return
@@ -107,6 +107,20 @@ func writeMetrics(rw http.ResponseWriter, r *http.Request) {
rw.WriteHeader(http.StatusOK)
}
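As a usage sketch for the /write/ endpoint above: the metric name and tag set are illustrative (the expected tag keys depend on metricstore.DecodeLine), and the auth scheme is an assumption:

package main

import (
	"fmt"
	"net/http"
	"strings"
)

func main() {
	// One line of InfluxDB line protocol; the cluster is also passed as a
	// query parameter, matching the handler above.
	body := "load_one,hostname=host123,type=node value=0.42 1700000000000000000\n"
	req, err := http.NewRequest(http.MethodPost,
		"http://localhost:8080/write/?cluster=testcluster", strings.NewReader(body))
	if err != nil {
		panic(err)
	}
	req.Header.Set("Authorization", "Bearer <JWT>") // auth scheme is an assumption
	resp, err := http.DefaultClient.Do(req)
	if err != nil {
		panic(err)
	}
	defer resp.Body.Close()
	fmt.Println(resp.Status)
}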
// queryParam extracts a single query-parameter value from a raw query string
// without allocating a url.Values map. Returns "" if the key is not present.
func queryParam(raw, key string) string {
for raw != "" {
var kv string
kv, raw, _ = strings.Cut(raw, "&")
k, v, _ := strings.Cut(kv, "=")
if k == key {
return v
}
}
return ""
}
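A quick usage note on queryParam, derived directly from the code above: values come back verbatim, so unlike url.Values there is no percent-decoding.

package main

import (
	"fmt"
	"strings"
)

// Copy of queryParam from above, repeated only so this sketch compiles standalone.
func queryParam(raw, key string) string {
	for raw != "" {
		var kv string
		kv, raw, _ = strings.Cut(raw, "&")
		k, v, _ := strings.Cut(kv, "=")
		if k == key {
			return v
		}
	}
	return ""
}

func main() {
	fmt.Println(queryParam("cluster=fritz&node=f0123", "cluster")) // "fritz"
	fmt.Println(queryParam("node=f0123", "cluster"))               // "" (absent)
	// Values are returned verbatim: "a%20b" stays "a%20b"; no percent-decoding.
	fmt.Println(queryParam("cluster=a%20b", "cluster")) // "a%20b"
}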
// handleDebug godoc
// @summary Debug endpoint
// @tags debug
@@ -129,42 +143,9 @@ func debugMetrics(rw http.ResponseWriter, r *http.Request) {
selector = strings.Split(raw, ":")
}
ms := memorystore.GetMemoryStore()
ms := metricstore.GetMemoryStore()
if err := ms.DebugDump(bufio.NewWriter(rw), selector); err != nil {
handleError(err, http.StatusBadRequest, rw)
return
}
}
// handleHealthCheck godoc
// @summary HealthCheck endpoint
// @tags healthcheck
// @description This endpoint allows the users to check if a node is healthy
// @produce json
// @param selector query string false "Selector"
// @success 200 {string} string "Debug dump"
// @failure 400 {object} api.ErrorResponse "Bad Request"
// @failure 401 {object} api.ErrorResponse "Unauthorized"
// @failure 403 {object} api.ErrorResponse "Forbidden"
// @failure 500 {object} api.ErrorResponse "Internal Server Error"
// @security ApiKeyAuth
// @router /healthcheck/ [get]
func metricsHealth(rw http.ResponseWriter, r *http.Request) {
rawCluster := r.URL.Query().Get("cluster")
rawNode := r.URL.Query().Get("node")
if rawCluster == "" || rawNode == "" {
handleError(errors.New("'cluster' and 'node' are required query parameter"), http.StatusBadRequest, rw)
return
}
rw.Header().Add("Content-Type", "application/json")
selector := []string{rawCluster, rawNode}
ms := memorystore.GetMemoryStore()
if err := ms.HealthCheck(bufio.NewWriter(rw), selector); err != nil {
handleError(err, http.StatusBadRequest, rw)
return
}
}

internal/api/nats.go (new file, 400 lines)

@@ -0,0 +1,400 @@
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved. This file is part of cc-backend.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
package api
import (
"database/sql"
"encoding/json"
"strings"
"sync"
"time"
"github.com/ClusterCockpit/cc-backend/internal/archiver"
"github.com/ClusterCockpit/cc-backend/internal/config"
"github.com/ClusterCockpit/cc-backend/internal/importer"
"github.com/ClusterCockpit/cc-backend/internal/repository"
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
lp "github.com/ClusterCockpit/cc-lib/v2/ccMessage"
"github.com/ClusterCockpit/cc-lib/v2/nats"
"github.com/ClusterCockpit/cc-lib/v2/receivers"
"github.com/ClusterCockpit/cc-lib/v2/schema"
influx "github.com/ClusterCockpit/cc-line-protocol/v2/lineprotocol"
)
// NatsAPI provides NATS subscription-based handlers for Job and Node operations.
// It mirrors the functionality of the REST API but uses NATS messaging with
// InfluxDB line protocol as the message format.
//
// # Message Format
//
// All NATS messages use InfluxDB line protocol format (https://docs.influxdata.com/influxdb/v2.0/reference/syntax/line-protocol/)
// with the following structure:
//
// measurement,tag1=value1,tag2=value2 field1=value1,field2=value2 timestamp
//
// # Job Events
//
// Job start/stop events use the "job" measurement with a "function" tag to distinguish operations:
//
// job,function=start_job event="{...JSON payload...}" <timestamp>
// job,function=stop_job event="{...JSON payload...}" <timestamp>
//
// The JSON payload in the "event" field follows the schema.Job or StopJobAPIRequest structure.
//
// Example job start message:
//
// job,function=start_job event="{\"jobId\":1001,\"user\":\"testuser\",\"cluster\":\"testcluster\",...}" 1234567890000000000
//
// # Node State Events
//
// Node state updates use the "nodestate" measurement with cluster information:
//
// nodestate event="{...JSON payload...}" <timestamp>
//
// The JSON payload follows the UpdateNodeStatesRequest structure.
//
// Example node state message:
//
// nodestate event="{\"cluster\":\"testcluster\",\"nodes\":[{\"hostname\":\"node01\",\"states\":[\"idle\"]}]}" 1234567890000000000
type NatsAPI struct {
JobRepository *repository.JobRepository
// RepositoryMutex protects job creation operations from race conditions
// when checking for duplicate jobs during startJob calls.
RepositoryMutex sync.Mutex
}
// NewNatsAPI creates a new NatsAPI instance with default dependencies.
func NewNatsAPI() *NatsAPI {
return &NatsAPI{
JobRepository: repository.GetJobRepository(),
}
}
// StartSubscriptions registers all NATS subscriptions for Job and Node APIs.
// Returns an error if the NATS client is not available or subscription fails.
func (api *NatsAPI) StartSubscriptions() error {
client := nats.GetClient()
if client == nil {
cclog.Warn("NATS client not available, skipping API subscriptions")
return nil
}
if config.Keys.APISubjects != nil {
s := config.Keys.APISubjects
if err := client.Subscribe(s.SubjectJobEvent, api.handleJobEvent); err != nil {
return err
}
if err := client.Subscribe(s.SubjectNodeState, api.handleNodeState); err != nil {
return err
}
cclog.Info("NATS API subscriptions started")
}
return nil
}
// processJobEvent routes job event messages to the appropriate handler based on the "function" tag.
// Validates that required tags and fields are present before processing.
func (api *NatsAPI) processJobEvent(msg lp.CCMessage) {
function, ok := msg.GetTag("function")
if !ok {
cclog.Errorf("Job event is missing required tag 'function': measurement=%s", msg.Name())
return
}
switch function {
case "start_job":
v, ok := msg.GetEventValue()
if !ok {
cclog.Errorf("Job start event is missing event field with JSON payload")
return
}
api.handleStartJob(v)
case "stop_job":
v, ok := msg.GetEventValue()
if !ok {
cclog.Errorf("Job stop event is missing event field with JSON payload")
return
}
api.handleStopJob(v)
default:
cclog.Warnf("Unknown job event function '%s', expected 'start_job' or 'stop_job'", function)
}
}
// handleJobEvent processes job-related messages received via NATS using InfluxDB line protocol.
// The message must be in line protocol format with measurement="job" and include:
// - tag "function" with value "start_job" or "stop_job"
// - field "event" containing JSON payload (schema.Job or StopJobAPIRequest)
//
// Example: job,function=start_job event="{\"jobId\":1001,...}" 1234567890000000000
func (api *NatsAPI) handleJobEvent(subject string, data []byte) {
if len(data) == 0 {
cclog.Warnf("NATS %s: received empty message", subject)
return
}
d := influx.NewDecoderWithBytes(data)
for d.Next() {
m, err := receivers.DecodeInfluxMessage(d)
if err != nil {
cclog.Errorf("NATS %s: failed to decode InfluxDB line protocol message: %v", subject, err)
return
}
if !m.IsEvent() {
cclog.Debugf("NATS %s: received non-event message, skipping", subject)
continue
}
if m.Name() == "job" {
api.processJobEvent(m)
} else {
cclog.Debugf("NATS %s: unexpected measurement name '%s', expected 'job'", subject, m.Name())
}
}
}
// handleStartJob processes job start messages received via NATS.
// The payload parameter contains JSON following the schema.Job structure.
// Jobs are validated, checked for duplicates, and inserted into the database.
func (api *NatsAPI) handleStartJob(payload string) {
if payload == "" {
cclog.Error("NATS start job: payload is empty")
return
}
req := schema.Job{
Shared: "none",
MonitoringStatus: schema.MonitoringStatusRunningOrArchiving,
}
dec := json.NewDecoder(strings.NewReader(payload))
dec.DisallowUnknownFields()
if err := dec.Decode(&req); err != nil {
cclog.Errorf("NATS start job: parsing request failed: %v", err)
return
}
cclog.Debugf("NATS start job: %s", req.GoString())
req.State = schema.JobStateRunning
if err := importer.SanityChecks(&req); err != nil {
cclog.Errorf("NATS start job: sanity check failed: %v", err)
return
}
var unlockOnce sync.Once
api.RepositoryMutex.Lock()
defer unlockOnce.Do(api.RepositoryMutex.Unlock)
jobs, err := api.JobRepository.FindAll(&req.JobID, &req.Cluster, nil)
if err != nil && err != sql.ErrNoRows {
cclog.Errorf("NATS start job: checking for duplicate failed: %v", err)
return
}
if err == nil {
for _, job := range jobs {
if (req.StartTime - job.StartTime) < secondsPerDay {
cclog.Errorf("NATS start job: job with jobId %d, cluster %s already exists (dbid: %d)",
req.JobID, req.Cluster, job.ID)
return
}
}
}
// When tags are present, insert directly into the job table so that the
// returned ID can be used with AddTagOrCreate (which queries the job table).
var id int64
if len(req.Tags) > 0 {
id, err = api.JobRepository.StartDirect(&req)
} else {
id, err = api.JobRepository.Start(&req)
}
if err != nil {
cclog.Errorf("NATS start job: insert into database failed: %v", err)
return
}
unlockOnce.Do(api.RepositoryMutex.Unlock)
for _, tag := range req.Tags {
if _, err := api.JobRepository.AddTagOrCreate(nil, id, tag.Type, tag.Name, tag.Scope); err != nil {
cclog.Errorf("NATS start job: adding tag to new job %d failed: %v", id, err)
return
}
}
cclog.Infof("NATS: new job (id: %d): cluster=%s, jobId=%d, user=%s, startTime=%d",
id, req.Cluster, req.JobID, req.User, req.StartTime)
}
// handleStopJob processes job stop messages received via NATS.
// The payload parameter contains JSON following the StopJobAPIRequest structure.
// The job is marked as stopped in the database and archiving is triggered if monitoring is enabled.
func (api *NatsAPI) handleStopJob(payload string) {
if payload == "" {
cclog.Error("NATS stop job: payload is empty")
return
}
var req StopJobAPIRequest
dec := json.NewDecoder(strings.NewReader(payload))
dec.DisallowUnknownFields()
if err := dec.Decode(&req); err != nil {
cclog.Errorf("NATS job stop: parsing request failed: %v", err)
return
}
if req.JobID == nil {
cclog.Errorf("NATS job stop: the field 'jobId' is required")
return
}
isCached := false
job, err := api.JobRepository.FindCached(req.JobID, req.Cluster, req.StartTime)
if err != nil {
// Not in cache, try main job table
job, err = api.JobRepository.Find(req.JobID, req.Cluster, req.StartTime)
if err != nil {
cclog.Errorf("NATS job stop: finding job failed: %v", err)
return
}
} else {
isCached = true
}
if job.State != schema.JobStateRunning {
cclog.Errorf("NATS job stop: jobId %d (id %d) on %s: job has already been stopped (state is: %s)",
job.JobID, job.ID, job.Cluster, job.State)
return
}
if job.StartTime > req.StopTime {
cclog.Errorf("NATS job stop: jobId %d (id %d) on %s: stopTime %d must be >= startTime %d",
job.JobID, job.ID, job.Cluster, req.StopTime, job.StartTime)
return
}
if req.State != "" && !req.State.Valid() {
cclog.Errorf("NATS job stop: jobId %d (id %d) on %s: invalid job state: %#v",
job.JobID, job.ID, job.Cluster, req.State)
return
} else if req.State == "" {
req.State = schema.JobStateCompleted
}
job.Duration = int32(req.StopTime - job.StartTime)
job.State = req.State
api.JobRepository.Mutex.Lock()
defer api.JobRepository.Mutex.Unlock()
// If the job is still in job_cache, transfer it to the job table first
if isCached {
newID, err := api.JobRepository.TransferCachedJobToMain(*job.ID)
if err != nil {
cclog.Errorf("NATS job stop: jobId %d (id %d) on %s: transferring cached job failed: %v",
job.JobID, *job.ID, job.Cluster, err)
return
}
cclog.Infof("NATS: transferred cached job to main table: old id %d -> new id %d (jobId=%d)", *job.ID, newID, job.JobID)
job.ID = &newID
}
if err := api.JobRepository.Stop(*job.ID, job.Duration, job.State, job.MonitoringStatus); err != nil {
cclog.Errorf("NATS job stop: jobId %d (id %d) on %s: marking job as '%s' failed: %v",
job.JobID, *job.ID, job.Cluster, job.State, err)
return
}
cclog.Infof("NATS: archiving job (dbid: %d): cluster=%s, jobId=%d, user=%s, startTime=%d, duration=%d, state=%s",
*job.ID, job.Cluster, job.JobID, job.User, job.StartTime, job.Duration, job.State)
if job.MonitoringStatus == schema.MonitoringStatusDisabled {
return
}
archiver.TriggerArchiving(job)
}
// processNodestateEvent extracts and processes node state data from the InfluxDB message.
// Updates node states in the repository for all nodes in the payload.
func (api *NatsAPI) processNodestateEvent(msg lp.CCMessage) {
v, ok := msg.GetEventValue()
if !ok {
cclog.Errorf("Nodestate event is missing event field with JSON payload")
return
}
var req UpdateNodeStatesRequest
dec := json.NewDecoder(strings.NewReader(v))
dec.DisallowUnknownFields()
if err := dec.Decode(&req); err != nil {
cclog.Errorf("NATS nodestate: parsing request failed: %v", err)
return
}
repo := repository.GetNodeRepository()
requestReceived := time.Now().Unix()
for _, node := range req.Nodes {
state := determineState(node.States)
nodeState := schema.NodeStateDB{
TimeStamp: requestReceived,
NodeState: state,
CpusAllocated: node.CpusAllocated,
MemoryAllocated: node.MemoryAllocated,
GpusAllocated: node.GpusAllocated,
HealthState: schema.MonitoringStateFull,
JobsRunning: node.JobsRunning,
}
if err := repo.UpdateNodeState(node.Hostname, req.Cluster, &nodeState); err != nil {
cclog.Errorf("NATS nodestate: updating node state for %s on %s failed: %v",
node.Hostname, req.Cluster, err)
}
}
cclog.Debugf("NATS nodestate: updated %d node states for cluster %s", len(req.Nodes), req.Cluster)
}
// handleNodeState processes node state update messages received via NATS using InfluxDB line protocol.
// The message must be in line protocol format with measurement="nodestate" and include:
// - field "event" containing JSON payload (UpdateNodeStatesRequest)
//
// Example: nodestate event="{\"cluster\":\"testcluster\",\"nodes\":[...]}" 1234567890000000000
func (api *NatsAPI) handleNodeState(subject string, data []byte) {
if len(data) == 0 {
cclog.Warnf("NATS %s: received empty message", subject)
return
}
d := influx.NewDecoderWithBytes(data)
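// A single NATS message may carry multiple line protocol records; process each one.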
for d.Next() {
m, err := receivers.DecodeInfluxMessage(d)
if err != nil {
cclog.Errorf("NATS %s: failed to decode InfluxDB line protocol message: %v", subject, err)
return
}
if !m.IsEvent() {
cclog.Warnf("NATS %s: received non-event message, skipping", subject)
continue
}
if m.Name() == "nodestate" {
api.processNodestateEvent(m)
} else {
cclog.Warnf("NATS %s: unexpected measurement name '%s', expected 'nodestate'", subject, m.Name())
}
}
}

947
internal/api/nats_test.go Normal file
View File

@@ -0,0 +1,947 @@
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved. This file is part of cc-backend.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
package api
import (
"context"
"database/sql"
"encoding/json"
"fmt"
"os"
"path/filepath"
"testing"
"time"
"github.com/ClusterCockpit/cc-backend/internal/archiver"
"github.com/ClusterCockpit/cc-backend/internal/auth"
"github.com/ClusterCockpit/cc-backend/internal/config"
"github.com/ClusterCockpit/cc-backend/internal/graph"
"github.com/ClusterCockpit/cc-backend/internal/repository"
"github.com/ClusterCockpit/cc-backend/pkg/archive"
"github.com/ClusterCockpit/cc-backend/pkg/metricstore"
ccconf "github.com/ClusterCockpit/cc-lib/v2/ccConfig"
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
lp "github.com/ClusterCockpit/cc-lib/v2/ccMessage"
"github.com/ClusterCockpit/cc-lib/v2/schema"
_ "github.com/mattn/go-sqlite3"
)
func setupNatsTest(t *testing.T) *NatsAPI {
repository.ResetConnection()
const testconfig = `{
"main": {
"addr": "0.0.0.0:8080",
"validate": false,
"api-allowed-ips": [
"*"
]
},
"archive": {
"kind": "file",
"path": "./var/job-archive"
},
"auth": {
"jwts": {
"max-age": "2m"
}
}
}`
const testclusterJSON = `{
"name": "testcluster",
"subClusters": [
{
"name": "sc1",
"nodes": "host123,host124,host125",
"processorType": "Intel Core i7-4770",
"socketsPerNode": 1,
"coresPerSocket": 4,
"threadsPerCore": 2,
"flopRateScalar": {
"unit": {
"prefix": "G",
"base": "F/s"
},
"value": 14
},
"flopRateSimd": {
"unit": {
"prefix": "G",
"base": "F/s"
},
"value": 112
},
"memoryBandwidth": {
"unit": {
"prefix": "G",
"base": "B/s"
},
"value": 24
},
"numberOfNodes": 70,
"topology": {
"node": [0, 1, 2, 3, 4, 5, 6, 7],
"socket": [[0, 1, 2, 3, 4, 5, 6, 7]],
"memoryDomain": [[0, 1, 2, 3, 4, 5, 6, 7]],
"die": [[0, 1, 2, 3, 4, 5, 6, 7]],
"core": [[0], [1], [2], [3], [4], [5], [6], [7]]
}
}
],
"metricConfig": [
{
"name": "load_one",
"unit": { "base": ""},
"scope": "node",
"timestep": 60,
"aggregation": "avg",
"peak": 8,
"normal": 0,
"caution": 0,
"alert": 0
}
]
}`
cclog.Init("info", true)
tmpdir := t.TempDir()
jobarchive := filepath.Join(tmpdir, "job-archive")
if err := os.Mkdir(jobarchive, 0o777); err != nil {
t.Fatal(err)
}
if err := os.WriteFile(filepath.Join(jobarchive, "version.txt"), fmt.Appendf(nil, "%d", 3), 0o666); err != nil {
t.Fatal(err)
}
if err := os.Mkdir(filepath.Join(jobarchive, "testcluster"), 0o777); err != nil {
t.Fatal(err)
}
if err := os.WriteFile(filepath.Join(jobarchive, "testcluster", "cluster.json"), []byte(testclusterJSON), 0o666); err != nil {
t.Fatal(err)
}
dbfilepath := filepath.Join(tmpdir, "test.db")
err := repository.MigrateDB(dbfilepath)
if err != nil {
t.Fatal(err)
}
cfgFilePath := filepath.Join(tmpdir, "config.json")
if err := os.WriteFile(cfgFilePath, []byte(testconfig), 0o666); err != nil {
t.Fatal(err)
}
ccconf.Init(cfgFilePath)
// Load and check main configuration
if cfg := ccconf.GetPackageConfig("main"); cfg != nil {
config.Init(cfg)
} else {
cclog.Abort("Main configuration must be present")
}
archiveCfg := fmt.Sprintf("{\"kind\": \"file\",\"path\": \"%s\"}", jobarchive)
repository.Connect(dbfilepath)
if err := archive.Init(json.RawMessage(archiveCfg)); err != nil {
t.Fatal(err)
}
// metricstore initialization removed - it's initialized via callback in tests
archiver.Start(repository.GetJobRepository(), context.Background())
if cfg := ccconf.GetPackageConfig("auth"); cfg != nil {
auth.Init(&cfg)
} else {
cclog.Warn("Authentication disabled due to missing configuration")
auth.Init(nil)
}
graph.Init()
return NewNatsAPI()
}
func cleanupNatsTest() {
if err := archiver.Shutdown(5 * time.Second); err != nil {
cclog.Warnf("Archiver shutdown timeout in tests: %v", err)
}
}
func TestNatsHandleStartJob(t *testing.T) {
natsAPI := setupNatsTest(t)
t.Cleanup(cleanupNatsTest)
tests := []struct {
name string
payload string
expectError bool
validateJob func(t *testing.T, job *schema.Job)
shouldFindJob bool
}{
{
name: "valid job start",
payload: `{
"jobId": 1001,
"user": "testuser1",
"project": "testproj1",
"cluster": "testcluster",
"partition": "main",
"walltime": 7200,
"numNodes": 1,
"numHwthreads": 8,
"numAcc": 0,
"shared": "none",
"monitoringStatus": 1,
"smt": 1,
"resources": [
{
"hostname": "host123",
"hwthreads": [0, 1, 2, 3, 4, 5, 6, 7]
}
],
"startTime": 1234567890
}`,
expectError: false,
shouldFindJob: true,
validateJob: func(t *testing.T, job *schema.Job) {
if job.JobID != 1001 {
t.Errorf("expected JobID 1001, got %d", job.JobID)
}
if job.User != "testuser1" {
t.Errorf("expected user testuser1, got %s", job.User)
}
if job.State != schema.JobStateRunning {
t.Errorf("expected state running, got %s", job.State)
}
},
},
{
name: "invalid JSON",
payload: `{
"jobId": "not a number",
"user": "testuser2"
}`,
expectError: true,
shouldFindJob: false,
},
{
name: "missing required fields",
payload: `{
"jobId": 1002
}`,
expectError: true,
shouldFindJob: false,
},
{
name: "job with unknown fields (should fail due to DisallowUnknownFields)",
payload: `{
"jobId": 1003,
"user": "testuser3",
"project": "testproj3",
"cluster": "testcluster",
"partition": "main",
"walltime": 3600,
"numNodes": 1,
"numHwthreads": 8,
"unknownField": "should cause error",
"startTime": 1234567900
}`,
expectError: true,
shouldFindJob: false,
},
{
name: "job with tags",
payload: `{
"jobId": 1004,
"user": "testuser4",
"project": "testproj4",
"cluster": "testcluster",
"partition": "main",
"walltime": 3600,
"numNodes": 1,
"numHwthreads": 8,
"numAcc": 0,
"shared": "none",
"monitoringStatus": 1,
"smt": 1,
"resources": [
{
"hostname": "host123",
"hwthreads": [0, 1, 2, 3]
}
],
"tags": [
{
"type": "test",
"name": "testtag",
"scope": "testuser4"
}
],
"startTime": 1234567910
}`,
expectError: false,
shouldFindJob: true,
validateJob: func(t *testing.T, job *schema.Job) {
if job.JobID != 1004 {
t.Errorf("expected JobID 1004, got %d", job.JobID)
}
},
},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
natsAPI.handleStartJob(tt.payload)
natsAPI.JobRepository.SyncJobs()
// Allow some time for async operations
time.Sleep(100 * time.Millisecond)
if tt.shouldFindJob {
// Extract jobId from payload
var payloadMap map[string]any
json.Unmarshal([]byte(tt.payload), &payloadMap)
jobID := int64(payloadMap["jobId"].(float64))
cluster := payloadMap["cluster"].(string)
startTime := int64(payloadMap["startTime"].(float64))
job, err := natsAPI.JobRepository.Find(&jobID, &cluster, &startTime)
if err != nil {
if !tt.expectError {
t.Fatalf("expected to find job, but got error: %v", err)
}
return
}
if tt.validateJob != nil {
tt.validateJob(t, job)
}
}
})
}
}
func TestNatsHandleStopJob(t *testing.T) {
natsAPI := setupNatsTest(t)
t.Cleanup(cleanupNatsTest)
// First, create a running job
startPayload := `{
"jobId": 2001,
"user": "testuser",
"project": "testproj",
"cluster": "testcluster",
"partition": "main",
"walltime": 3600,
"numNodes": 1,
"numHwthreads": 8,
"numAcc": 0,
"shared": "none",
"monitoringStatus": 1,
"smt": 1,
"resources": [
{
"hostname": "host123",
"hwthreads": [0, 1, 2, 3, 4, 5, 6, 7]
}
],
"startTime": 1234567890
}`
natsAPI.handleStartJob(startPayload)
natsAPI.JobRepository.SyncJobs()
time.Sleep(100 * time.Millisecond)
tests := []struct {
name string
payload string
expectError bool
validateJob func(t *testing.T, job *schema.Job)
setupJobFunc func() // Optional: create specific test job
}{
{
name: "valid job stop - completed",
payload: `{
"jobId": 2001,
"cluster": "testcluster",
"startTime": 1234567890,
"jobState": "completed",
"stopTime": 1234571490
}`,
expectError: false,
validateJob: func(t *testing.T, job *schema.Job) {
if job.State != schema.JobStateCompleted {
t.Errorf("expected state completed, got %s", job.State)
}
expectedDuration := int32(1234571490 - 1234567890)
if job.Duration != expectedDuration {
t.Errorf("expected duration %d, got %d", expectedDuration, job.Duration)
}
},
},
{
name: "valid job stop - failed",
setupJobFunc: func() {
startPayloadFailed := `{
"jobId": 2002,
"user": "testuser",
"project": "testproj",
"cluster": "testcluster",
"partition": "main",
"walltime": 3600,
"numNodes": 1,
"numHwthreads": 8,
"numAcc": 0,
"shared": "none",
"monitoringStatus": 1,
"smt": 1,
"resources": [
{
"hostname": "host123",
"hwthreads": [0, 1, 2, 3]
}
],
"startTime": 1234567900
}`
natsAPI.handleStartJob(startPayloadFailed)
natsAPI.JobRepository.SyncJobs()
time.Sleep(100 * time.Millisecond)
},
payload: `{
"jobId": 2002,
"cluster": "testcluster",
"startTime": 1234567900,
"jobState": "failed",
"stopTime": 1234569900
}`,
expectError: false,
validateJob: func(t *testing.T, job *schema.Job) {
if job.State != schema.JobStateFailed {
t.Errorf("expected state failed, got %s", job.State)
}
},
},
{
name: "invalid JSON",
payload: `{
"jobId": "not a number"
}`,
expectError: true,
},
{
name: "missing jobId",
payload: `{
"cluster": "testcluster",
"jobState": "completed",
"stopTime": 1234571490
}`,
expectError: true,
},
{
name: "invalid job state",
setupJobFunc: func() {
startPayloadInvalid := `{
"jobId": 2003,
"user": "testuser",
"project": "testproj",
"cluster": "testcluster",
"partition": "main",
"walltime": 3600,
"numNodes": 1,
"numHwthreads": 8,
"numAcc": 0,
"shared": "none",
"monitoringStatus": 1,
"smt": 1,
"resources": [
{
"hostname": "host123",
"hwthreads": [0, 1]
}
],
"startTime": 1234567910
}`
natsAPI.handleStartJob(startPayloadInvalid)
natsAPI.JobRepository.SyncJobs()
time.Sleep(100 * time.Millisecond)
},
payload: `{
"jobId": 2003,
"cluster": "testcluster",
"startTime": 1234567910,
"jobState": "invalid_state",
"stopTime": 1234571510
}`,
expectError: true,
},
{
name: "stopTime before startTime",
setupJobFunc: func() {
startPayloadTime := `{
"jobId": 2004,
"user": "testuser",
"project": "testproj",
"cluster": "testcluster",
"partition": "main",
"walltime": 3600,
"numNodes": 1,
"numHwthreads": 8,
"numAcc": 0,
"shared": "none",
"monitoringStatus": 1,
"smt": 1,
"resources": [
{
"hostname": "host123",
"hwthreads": [0]
}
],
"startTime": 1234567920
}`
natsAPI.handleStartJob(startPayloadTime)
natsAPI.JobRepository.SyncJobs()
time.Sleep(100 * time.Millisecond)
},
payload: `{
"jobId": 2004,
"cluster": "testcluster",
"startTime": 1234567920,
"jobState": "completed",
"stopTime": 1234567900
}`,
expectError: true,
},
{
name: "job not found",
payload: `{
"jobId": 99999,
"cluster": "testcluster",
"startTime": 1234567890,
"jobState": "completed",
"stopTime": 1234571490
}`,
expectError: true,
},
}
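// Synthetic metric data returned by the stubbed metric store during archiving.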
testData := schema.JobData{
"load_one": map[schema.MetricScope]*schema.JobMetric{
schema.MetricScopeNode: {
Unit: schema.Unit{Base: "load"},
Timestep: 60,
Series: []schema.Series{
{
Hostname: "host123",
Statistics: schema.MetricStatistics{Min: 0.1, Avg: 0.2, Max: 0.3},
Data: []schema.Float{0.1, 0.1, 0.1, 0.2, 0.2, 0.2, 0.3, 0.3, 0.3},
},
},
},
},
}
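// Stub the metric store load path so job archiving works without a live backend.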
metricstore.TestLoadDataCallback = func(job *schema.Job, metrics []string, scopes []schema.MetricScope, ctx context.Context, resolution int) (schema.JobData, error) {
return testData, nil
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
if tt.setupJobFunc != nil {
tt.setupJobFunc()
}
natsAPI.handleStopJob(tt.payload)
// Allow some time for async operations
time.Sleep(100 * time.Millisecond)
if !tt.expectError && tt.validateJob != nil {
// Extract job details from payload
var payloadMap map[string]any
json.Unmarshal([]byte(tt.payload), &payloadMap)
jobID := int64(payloadMap["jobId"].(float64))
cluster := payloadMap["cluster"].(string)
var startTime *int64
if st, ok := payloadMap["startTime"]; ok {
t := int64(st.(float64))
startTime = &t
}
job, err := natsAPI.JobRepository.Find(&jobID, &cluster, startTime)
if err != nil {
t.Fatalf("expected to find job, but got error: %v", err)
}
tt.validateJob(t, job)
}
})
}
}
func TestNatsHandleNodeState(t *testing.T) {
natsAPI := setupNatsTest(t)
t.Cleanup(cleanupNatsTest)
tests := []struct {
name string
data []byte
expectError bool
validateFn func(t *testing.T)
}{
{
name: "valid node state update",
data: []byte(`nodestate event="{\"cluster\":\"testcluster\",\"nodes\":[{\"hostname\":\"host123\",\"states\":[\"allocated\"],\"cpusAllocated\":8,\"memoryAllocated\":16384,\"gpusAllocated\":0,\"jobsRunning\":1}]}" 1234567890000000000`),
expectError: false,
validateFn: func(t *testing.T) {
// In a full test, we would verify the node state was updated in the database
// For now, just ensure no error occurred
},
},
{
name: "multiple nodes",
data: []byte(`nodestate event="{\"cluster\":\"testcluster\",\"nodes\":[{\"hostname\":\"host123\",\"states\":[\"idle\"],\"cpusAllocated\":0,\"memoryAllocated\":0,\"gpusAllocated\":0,\"jobsRunning\":0},{\"hostname\":\"host124\",\"states\":[\"allocated\"],\"cpusAllocated\":4,\"memoryAllocated\":8192,\"gpusAllocated\":1,\"jobsRunning\":1}]}" 1234567890000000000`),
expectError: false,
},
{
name: "invalid JSON in event field",
data: []byte(`nodestate event="{\"cluster\":\"testcluster\",\"nodes\":\"not an array\"}" 1234567890000000000`),
expectError: true,
},
{
name: "empty nodes array",
data: []byte(`nodestate event="{\"cluster\":\"testcluster\",\"nodes\":[]}" 1234567890000000000`),
expectError: false, // Empty array should not cause error
},
{
name: "invalid line protocol format",
data: []byte(`invalid line protocol format`),
expectError: true,
},
{
name: "empty data",
data: []byte(``),
expectError: false, // Should be handled gracefully with warning
},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
natsAPI.handleNodeState("test.subject", tt.data)
// Allow some time for async operations
time.Sleep(50 * time.Millisecond)
if tt.validateFn != nil {
tt.validateFn(t)
}
})
}
}
func TestNatsProcessJobEvent(t *testing.T) {
natsAPI := setupNatsTest(t)
t.Cleanup(cleanupNatsTest)
msgStartJob, err := lp.NewMessage(
"job",
map[string]string{"function": "start_job"},
nil,
map[string]any{
"event": `{
"jobId": 3001,
"user": "testuser",
"project": "testproj",
"cluster": "testcluster",
"partition": "main",
"walltime": 3600,
"numNodes": 1,
"numHwthreads": 8,
"numAcc": 0,
"shared": "none",
"monitoringStatus": 1,
"smt": 1,
"resources": [
{
"hostname": "host123",
"hwthreads": [0, 1, 2, 3]
}
],
"startTime": 1234567890
}`,
},
time.Now(),
)
if err != nil {
t.Fatalf("failed to create test message: %v", err)
}
msgMissingTag, err := lp.NewMessage(
"job",
map[string]string{},
nil,
map[string]any{
"event": `{}`,
},
time.Now(),
)
if err != nil {
t.Fatalf("failed to create test message: %v", err)
}
msgUnknownFunc, err := lp.NewMessage(
"job",
map[string]string{"function": "unknown_function"},
nil,
map[string]any{
"event": `{}`,
},
time.Now(),
)
if err != nil {
t.Fatalf("failed to create test message: %v", err)
}
tests := []struct {
name string
message lp.CCMessage
expectError bool
}{
{
name: "start_job function",
message: msgStartJob,
expectError: false,
},
{
name: "missing function tag",
message: msgMissingTag,
expectError: true,
},
{
name: "unknown function",
message: msgUnknownFunc,
expectError: false,
},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
natsAPI.processJobEvent(tt.message)
time.Sleep(50 * time.Millisecond)
})
}
}
func TestNatsHandleJobEvent(t *testing.T) {
natsAPI := setupNatsTest(t)
t.Cleanup(cleanupNatsTest)
tests := []struct {
name string
data []byte
expectError bool
}{
{
name: "valid influx line protocol",
data: []byte(`job,function=start_job event="{\"jobId\":4001,\"user\":\"testuser\",\"project\":\"testproj\",\"cluster\":\"testcluster\",\"partition\":\"main\",\"walltime\":3600,\"numNodes\":1,\"numHwthreads\":8,\"numAcc\":0,\"shared\":\"none\",\"monitoringStatus\":1,\"smt\":1,\"resources\":[{\"hostname\":\"host123\",\"hwthreads\":[0,1,2,3]}],\"startTime\":1234567890}" 1234567890000000000`),
expectError: false,
},
{
name: "invalid influx line protocol",
data: []byte(`invalid line protocol format`),
expectError: true,
},
{
name: "empty data",
data: []byte(``),
expectError: false, // Decoder should handle empty input gracefully
},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
// handleJobEvent doesn't return errors; it logs them
// We're just ensuring it doesn't panic
natsAPI.handleJobEvent("test.subject", tt.data)
time.Sleep(50 * time.Millisecond)
})
}
}
func TestNatsHandleJobEventEdgeCases(t *testing.T) {
natsAPI := setupNatsTest(t)
t.Cleanup(cleanupNatsTest)
tests := []struct {
name string
data []byte
expectError bool
description string
}{
{
name: "non-event message (metric data)",
data: []byte(`job,function=start_job value=123.45 1234567890000000000`),
expectError: false,
description: "Should skip non-event messages gracefully",
},
{
name: "wrong measurement name",
data: []byte(`wrongmeasurement,function=start_job event="{}" 1234567890000000000`),
expectError: false,
description: "Should warn about unexpected measurement but not fail",
},
{
name: "missing event field",
data: []byte(`job,function=start_job other_field="value" 1234567890000000000`),
expectError: true,
description: "Should error when event field is missing",
},
{
name: "multiple measurements in one message",
data: []byte("job,function=start_job event=\"{}\" 1234567890000000000\njob,function=stop_job event=\"{}\" 1234567890000000000"),
expectError: false,
description: "Should process multiple lines",
},
{
name: "escaped quotes in JSON payload",
data: []byte(`job,function=start_job event="{\"jobId\":6001,\"user\":\"test\\\"user\",\"cluster\":\"test\"}" 1234567890000000000`),
expectError: true,
description: "Should handle escaped quotes (though JSON parsing may fail)",
},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
natsAPI.handleJobEvent("test.subject", tt.data)
time.Sleep(50 * time.Millisecond)
})
}
}
func TestNatsHandleNodeStateEdgeCases(t *testing.T) {
natsAPI := setupNatsTest(t)
t.Cleanup(cleanupNatsTest)
tests := []struct {
name string
data []byte
expectError bool
description string
}{
{
name: "missing cluster field in JSON",
data: []byte(`nodestate event="{\"nodes\":[]}" 1234567890000000000`),
expectError: true,
description: "Should fail when cluster is missing",
},
{
name: "malformed JSON with unescaped quotes",
data: []byte(`nodestate event="{\"cluster\":\"test"cluster\",\"nodes\":[]}" 1234567890000000000`),
expectError: true,
description: "Should fail on malformed JSON",
},
{
name: "unicode characters in hostname",
data: []byte(`nodestate event="{\"cluster\":\"testcluster\",\"nodes\":[{\"hostname\":\"host-ñ123\",\"states\":[\"idle\"],\"cpusAllocated\":0,\"memoryAllocated\":0,\"gpusAllocated\":0,\"jobsRunning\":0}]}" 1234567890000000000`),
expectError: false,
description: "Should handle unicode characters",
},
{
name: "very large node count",
data: []byte(`nodestate event="{\"cluster\":\"testcluster\",\"nodes\":[{\"hostname\":\"node1\",\"states\":[\"idle\"],\"cpusAllocated\":0,\"memoryAllocated\":0,\"gpusAllocated\":0,\"jobsRunning\":0},{\"hostname\":\"node2\",\"states\":[\"idle\"],\"cpusAllocated\":0,\"memoryAllocated\":0,\"gpusAllocated\":0,\"jobsRunning\":0},{\"hostname\":\"node3\",\"states\":[\"idle\"],\"cpusAllocated\":0,\"memoryAllocated\":0,\"gpusAllocated\":0,\"jobsRunning\":0}]}" 1234567890000000000`),
expectError: false,
description: "Should handle multiple nodes efficiently",
},
{
name: "timestamp in past",
data: []byte(`nodestate event="{\"cluster\":\"testcluster\",\"nodes\":[]}" 1000000000000000000`),
expectError: false,
description: "Should accept any valid timestamp",
},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
natsAPI.handleNodeState("test.subject", tt.data)
time.Sleep(50 * time.Millisecond)
})
}
}
func TestNatsHandleStartJobDuplicatePrevention(t *testing.T) {
natsAPI := setupNatsTest(t)
t.Cleanup(cleanupNatsTest)
// Start a job
payload := `{
"jobId": 5001,
"user": "testuser",
"project": "testproj",
"cluster": "testcluster",
"partition": "main",
"walltime": 3600,
"numNodes": 1,
"numHwthreads": 8,
"numAcc": 0,
"shared": "none",
"monitoringStatus": 1,
"smt": 1,
"resources": [
{
"hostname": "host123",
"hwthreads": [0, 1, 2, 3]
}
],
"startTime": 1234567890
}`
natsAPI.handleStartJob(payload)
natsAPI.JobRepository.SyncJobs()
time.Sleep(100 * time.Millisecond)
// Try to start the same job again (within 24 hours)
duplicatePayload := `{
"jobId": 5001,
"user": "testuser",
"project": "testproj",
"cluster": "testcluster",
"partition": "main",
"walltime": 3600,
"numNodes": 1,
"numHwthreads": 8,
"numAcc": 0,
"shared": "none",
"monitoringStatus": 1,
"smt": 1,
"resources": [
{
"hostname": "host123",
"hwthreads": [0, 1, 2, 3]
}
],
"startTime": 1234567900
}`
natsAPI.handleStartJob(duplicatePayload)
natsAPI.JobRepository.SyncJobs()
time.Sleep(100 * time.Millisecond)
// Verify only one job exists
jobID := int64(5001)
cluster := "testcluster"
jobs, err := natsAPI.JobRepository.FindAll(&jobID, &cluster, nil)
if err != nil && err != sql.ErrNoRows {
t.Fatalf("unexpected error: %v", err)
}
if len(jobs) != 1 {
t.Errorf("expected 1 job, got %d", len(jobs))
}
}

View File

@@ -7,12 +7,17 @@ package api
import (
"fmt"
"maps"
"net/http"
"strings"
"time"
"github.com/ClusterCockpit/cc-backend/internal/metricdispatch"
"github.com/ClusterCockpit/cc-backend/internal/repository"
"github.com/ClusterCockpit/cc-lib/schema"
"github.com/ClusterCockpit/cc-backend/pkg/archive"
"github.com/ClusterCockpit/cc-backend/pkg/metricstore"
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
"github.com/ClusterCockpit/cc-lib/v2/schema"
)
type UpdateNodeStatesRequest struct {
@@ -20,6 +25,15 @@ type UpdateNodeStatesRequest struct {
Cluster string `json:"cluster" example:"fritz"`
}
// metricListToNames converts a map of metric configurations to a list of metric names
func metricListToNames(metricList map[string]*schema.Metric) []string {
names := make([]string, 0, len(metricList))
for name := range metricList {
names = append(names, name)
}
return names
}
// this routine assumes that only one scheduler state is reported per node
func determineState(states []string) schema.SchedulerState {
for _, state := range states {
@@ -47,7 +61,7 @@ func determineState(states []string) schema.SchedulerState {
// @description Required query-parameter defines if all users or only users with additional special roles are returned.
// @produce json
// @param request body UpdateNodeStatesRequest true "Request body containing nodes and their states"
// @success 200 {object} api.DefaultApiResponse "Success message"
// @success 200 {object} api.DefaultAPIResponse "Success message"
// @failure 400 {object} api.ErrorResponse "Bad Request"
// @failure 401 {object} api.ErrorResponse "Unauthorized"
// @failure 403 {object} api.ErrorResponse "Forbidden"
@@ -62,19 +76,70 @@ func (api *RestAPI) updateNodeStates(rw http.ResponseWriter, r *http.Request) {
http.StatusBadRequest, rw)
return
}
requestReceived := time.Now().Unix()
repo := repository.GetNodeRepository()
m := make(map[string][]string)
metricNames := make(map[string][]string)
healthResults := make(map[string]metricstore.HealthCheckResult)
startMs := time.Now()
// Step 1: Build nodeList and metricList per subcluster
for _, node := range req.Nodes {
if sc, err := archive.GetSubClusterByNode(req.Cluster, node.Hostname); err == nil {
m[sc] = append(m[sc], node.Hostname)
}
}
for sc := range m {
if sc != "" {
metricList := archive.GetMetricConfigSubCluster(req.Cluster, sc)
metricNames[sc] = metricListToNames(metricList)
}
}
// Step 2: Determine which metric store to query and perform health check
healthRepo, err := metricdispatch.GetHealthCheckRepo(req.Cluster)
if err != nil {
cclog.Warnf("updateNodeStates: no metric store for cluster %s, skipping health check: %v", req.Cluster, err)
} else {
for sc, nl := range m {
if sc != "" {
if results, err := healthRepo.HealthCheck(req.Cluster, nl, metricNames[sc]); err == nil {
maps.Copy(healthResults, results)
}
}
}
}
cclog.Debugf("Timer updateNodeStates, MemStore HealthCheck: %s", time.Since(startMs))
startDB := time.Now()
for _, node := range req.Nodes {
state := determineState(node.States)
healthState := schema.MonitoringStateFailed
var healthMetrics string
if result, ok := healthResults[node.Hostname]; ok {
healthState = result.State
healthMetrics = result.HealthMetrics
}
nodeState := schema.NodeStateDB{
TimeStamp: time.Now().Unix(), NodeState: state,
TimeStamp: requestReceived,
NodeState: state,
CpusAllocated: node.CpusAllocated,
MemoryAllocated: node.MemoryAllocated,
GpusAllocated: node.GpusAllocated,
HealthState: schema.MonitoringStateFull,
HealthState: healthState,
HealthMetrics: healthMetrics,
JobsRunning: node.JobsRunning,
}
repo.UpdateNodeState(node.Hostname, req.Cluster, &nodeState)
if err := repo.UpdateNodeState(node.Hostname, req.Cluster, &nodeState); err != nil {
cclog.Errorf("updateNodeStates: updating node state for %s on %s failed: %v",
node.Hostname, req.Cluster, err)
}
}
cclog.Debugf("Timer updateNodeStates, SQLite Inserts: %s", time.Since(startDB))
}

View File

@@ -22,10 +22,11 @@ import (
"github.com/ClusterCockpit/cc-backend/internal/auth"
"github.com/ClusterCockpit/cc-backend/internal/config"
"github.com/ClusterCockpit/cc-backend/internal/repository"
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
"github.com/ClusterCockpit/cc-lib/schema"
"github.com/ClusterCockpit/cc-lib/util"
"github.com/gorilla/mux"
"github.com/ClusterCockpit/cc-backend/internal/tagger"
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
"github.com/ClusterCockpit/cc-lib/v2/schema"
"github.com/ClusterCockpit/cc-lib/v2/util"
"github.com/go-chi/chi/v5"
)
// @title ClusterCockpit REST API
@@ -48,6 +49,7 @@ import (
const (
noticeFilePath = "./var/notice.txt"
noticeFilePerms = 0o644
maxNoticeLength = 10000 // Maximum allowed notice content length in characters
)
type RestAPI struct {
@@ -61,6 +63,7 @@ type RestAPI struct {
RepositoryMutex sync.Mutex
}
// New creates and initializes a new RestAPI instance with configured dependencies.
func New() *RestAPI {
return &RestAPI{
JobRepository: repository.GetJobRepository(),
@@ -69,79 +72,100 @@ func New() *RestAPI {
}
}
func (api *RestAPI) MountAPIRoutes(r *mux.Router) {
r.StrictSlash(true)
// MountAPIRoutes registers REST API endpoints for job and cluster management.
// These routes use JWT token authentication via the X-Auth-Token header.
func (api *RestAPI) MountAPIRoutes(r chi.Router) {
// REST API Uses TokenAuth
// User List
r.HandleFunc("/users/", api.getUsers).Methods(http.MethodGet)
r.Get("/users/", api.getUsers)
// Cluster List
r.HandleFunc("/clusters/", api.getClusters).Methods(http.MethodGet)
r.Get("/clusters/", api.getClusters)
// Slurm node state
r.HandleFunc("/nodestate/", api.updateNodeStates).Methods(http.MethodPost, http.MethodPut)
r.Post("/nodestate/", api.updateNodeStates)
r.Put("/nodestate/", api.updateNodeStates)
// Job Handler
r.HandleFunc("/jobs/start_job/", api.startJob).Methods(http.MethodPost, http.MethodPut)
r.HandleFunc("/jobs/stop_job/", api.stopJobByRequest).Methods(http.MethodPost, http.MethodPut)
r.HandleFunc("/jobs/", api.getJobs).Methods(http.MethodGet)
r.HandleFunc("/jobs/{id}", api.getJobByID).Methods(http.MethodPost)
r.HandleFunc("/jobs/{id}", api.getCompleteJobByID).Methods(http.MethodGet)
r.HandleFunc("/jobs/tag_job/{id}", api.tagJob).Methods(http.MethodPost, http.MethodPatch)
r.HandleFunc("/jobs/tag_job/{id}", api.removeTagJob).Methods(http.MethodDelete)
r.HandleFunc("/jobs/edit_meta/{id}", api.editMeta).Methods(http.MethodPost, http.MethodPatch)
r.HandleFunc("/jobs/metrics/{id}", api.getJobMetrics).Methods(http.MethodGet)
r.HandleFunc("/jobs/delete_job/", api.deleteJobByRequest).Methods(http.MethodDelete)
r.HandleFunc("/jobs/delete_job/{id}", api.deleteJobByID).Methods(http.MethodDelete)
r.HandleFunc("/jobs/delete_job_before/{ts}", api.deleteJobBefore).Methods(http.MethodDelete)
if config.Keys.APISubjects == nil {
cclog.Info("Enabling REST start/stop job API")
r.Post("/jobs/start_job/", api.startJob)
r.Put("/jobs/start_job/", api.startJob)
r.Post("/jobs/stop_job/", api.stopJobByRequest)
r.Put("/jobs/stop_job/", api.stopJobByRequest)
}
r.Get("/jobs/", api.getJobs)
r.Get("/jobs/used_nodes", api.getUsedNodes)
r.Post("/jobs/tag_job/{id}", api.tagJob)
r.Patch("/jobs/tag_job/{id}", api.tagJob)
r.Delete("/jobs/tag_job/{id}", api.removeTagJob)
r.Patch("/jobs/edit_meta/{id}", api.editMeta)
r.Patch("/jobs/edit_meta/", api.editMetaByRequest)
r.Get("/jobs/metrics/{id}", api.getJobMetrics)
r.Delete("/jobs/delete_job/", api.deleteJobByRequest)
r.Delete("/jobs/delete_job/{id}", api.deleteJobByID)
r.Delete("/jobs/delete_job_before/{ts}", api.deleteJobBefore)
r.Post("/jobs/{id}", api.getJobByID)
r.Get("/jobs/{id}", api.getCompleteJobByID)
r.HandleFunc("/tags/", api.removeTags).Methods(http.MethodDelete)
r.Delete("/tags/", api.removeTags)
if api.MachineStateDir != "" {
r.HandleFunc("/machine_state/{cluster}/{host}", api.getMachineState).Methods(http.MethodGet)
r.HandleFunc("/machine_state/{cluster}/{host}", api.putMachineState).Methods(http.MethodPut, http.MethodPost)
r.Get("/machine_state/{cluster}/{host}", api.getMachineState)
r.Put("/machine_state/{cluster}/{host}", api.putMachineState)
r.Post("/machine_state/{cluster}/{host}", api.putMachineState)
}
}
func (api *RestAPI) MountUserAPIRoutes(r *mux.Router) {
r.StrictSlash(true)
// MountUserAPIRoutes registers user-accessible REST API endpoints.
// These are limited endpoints for regular users with JWT token authentication.
func (api *RestAPI) MountUserAPIRoutes(r chi.Router) {
// REST API Uses TokenAuth
r.HandleFunc("/jobs/", api.getJobs).Methods(http.MethodGet)
r.HandleFunc("/jobs/{id}", api.getJobByID).Methods(http.MethodPost)
r.HandleFunc("/jobs/{id}", api.getCompleteJobByID).Methods(http.MethodGet)
r.HandleFunc("/jobs/metrics/{id}", api.getJobMetrics).Methods(http.MethodGet)
r.Get("/jobs/", api.getJobs)
r.Post("/jobs/{id}", api.getJobByID)
r.Get("/jobs/{id}", api.getCompleteJobByID)
r.Get("/jobs/metrics/{id}", api.getJobMetrics)
}
func (api *RestAPI) MountMetricStoreAPIRoutes(r *mux.Router) {
// MountMetricStoreAPIRoutes registers metric storage API endpoints.
// These endpoints handle metric data ingestion and health checks with JWT token authentication.
func (api *RestAPI) MountMetricStoreAPIRoutes(r chi.Router) {
// REST API Uses TokenAuth
// Note: chi has no StrictSlash equivalent, so both slash variants are registered explicitly
r.HandleFunc("/api/free", freeMetrics).Methods(http.MethodPost)
r.HandleFunc("/api/write", writeMetrics).Methods(http.MethodPost)
r.HandleFunc("/api/debug", debugMetrics).Methods(http.MethodGet)
r.HandleFunc("/api/healthcheck", metricsHealth).Methods(http.MethodGet)
r.Post("/free", freeMetrics)
r.Post("/write", writeMetrics)
r.Get("/debug", debugMetrics)
r.Post("/healthcheck", api.updateNodeStates)
// Same endpoints but with trailing slash
r.HandleFunc("/api/free/", freeMetrics).Methods(http.MethodPost)
r.HandleFunc("/api/write/", writeMetrics).Methods(http.MethodPost)
r.HandleFunc("/api/debug/", debugMetrics).Methods(http.MethodGet)
r.HandleFunc("/api/healthcheck/", metricsHealth).Methods(http.MethodGet)
r.Post("/free/", freeMetrics)
r.Post("/write/", writeMetrics)
r.Get("/debug/", debugMetrics)
r.Post("/healthcheck/", api.updateNodeStates)
}
func (api *RestAPI) MountConfigAPIRoutes(r *mux.Router) {
r.StrictSlash(true)
// MountConfigAPIRoutes registers configuration and user management endpoints.
// These routes use session-based authentication and require admin privileges.
// Routes use full paths (including /config prefix) to avoid conflicting with
// the /config page route when registered via Group instead of Route.
func (api *RestAPI) MountConfigAPIRoutes(r chi.Router) {
// Settings Frontend Uses SessionAuth
if api.Authentication != nil {
r.HandleFunc("/roles/", api.getRoles).Methods(http.MethodGet)
r.HandleFunc("/users/", api.createUser).Methods(http.MethodPost, http.MethodPut)
r.HandleFunc("/users/", api.getUsers).Methods(http.MethodGet)
r.HandleFunc("/users/", api.deleteUser).Methods(http.MethodDelete)
r.HandleFunc("/user/{id}", api.updateUser).Methods(http.MethodPost)
r.HandleFunc("/notice/", api.editNotice).Methods(http.MethodPost)
r.Get("/config/roles/", api.getRoles)
r.Post("/config/users/", api.createUser)
r.Put("/config/users/", api.createUser)
r.Get("/config/users/", api.getUsers)
r.Delete("/config/users/", api.deleteUser)
r.Post("/config/user/{id}", api.updateUser)
r.Post("/config/notice/", api.editNotice)
r.Get("/config/taggers/", api.getTaggers)
r.Post("/config/taggers/run/", api.runTagger)
}
}
func (api *RestAPI) MountFrontendAPIRoutes(r *mux.Router) {
r.StrictSlash(true)
// MountFrontendAPIRoutes registers frontend-specific API endpoints.
// These routes support JWT generation and user configuration updates with session authentication.
func (api *RestAPI) MountFrontendAPIRoutes(r chi.Router) {
r.Get("/logs/", api.getJournalLog)
// Settings Frontend Uses SessionAuth
if api.Authentication != nil {
r.HandleFunc("/jwt/", api.getJWT).Methods(http.MethodGet)
r.HandleFunc("/configuration/", api.updateConfiguration).Methods(http.MethodPost)
r.Get("/jwt/", api.getJWT)
r.Post("/configuration/", api.updateConfiguration)
}
}
@@ -157,6 +181,8 @@ type DefaultAPIResponse struct {
Message string `json:"msg"`
}
// handleError writes a standardized JSON error response with the given status code.
// It logs the error at WARN level and ensures proper Content-Type headers are set.
func handleError(err error, statusCode int, rw http.ResponseWriter) {
cclog.Warnf("REST ERROR : %s", err.Error())
rw.Header().Add("Content-Type", "application/json")
@@ -169,15 +195,38 @@ func handleError(err error, statusCode int, rw http.ResponseWriter) {
}
}
// decode reads JSON from r into val with strict validation that rejects unknown fields.
func decode(r io.Reader, val any) error {
dec := json.NewDecoder(r)
dec.DisallowUnknownFields()
return dec.Decode(val)
}
func (api *RestAPI) editNotice(rw http.ResponseWriter, r *http.Request) {
// SecuredCheck() only worked with TokenAuth: Removed
// validatePathComponent checks if a path component contains potentially malicious patterns
// that could be used for path traversal attacks. Returns an error if validation fails.
func validatePathComponent(component, componentName string) error {
if strings.Contains(component, "..") ||
strings.Contains(component, "/") ||
strings.Contains(component, "\\") {
return fmt.Errorf("invalid %s", componentName)
}
return nil
}
// editNotice godoc
// @summary Update system notice
// @tags Config
// @description Updates the notice.txt file content. Only admins are allowed. Content is limited to 10000 characters.
// @accept mpfd
// @produce plain
// @param new-content formData string true "New notice content (max 10000 characters)"
// @success 200 {string} string "Update Notice Content Success"
// @failure 400 {object} ErrorResponse "Bad Request"
// @failure 403 {object} ErrorResponse "Forbidden"
// @failure 500 {object} ErrorResponse "Internal Server Error"
// @security ApiKeyAuth
// @router /notice/ [post]
func (api *RestAPI) editNotice(rw http.ResponseWriter, r *http.Request) {
if user := repository.GetUserFromContext(r.Context()); !user.HasRole(schema.RoleAdmin) {
handleError(fmt.Errorf("only admins are allowed to update the notice.txt file"), http.StatusForbidden, rw)
return
@@ -186,9 +235,8 @@ func (api *RestAPI) editNotice(rw http.ResponseWriter, r *http.Request) {
// Get Value
newContent := r.FormValue("new-content")
// Validate content length to prevent DoS
if len(newContent) > 10000 {
handleError(fmt.Errorf("notice content exceeds maximum length of 10000 characters"), http.StatusBadRequest, rw)
if len(newContent) > maxNoticeLength {
handleError(fmt.Errorf("notice content exceeds maximum length of %d characters", maxNoticeLength), http.StatusBadRequest, rw)
return
}
@@ -200,7 +248,9 @@ func (api *RestAPI) editNotice(rw http.ResponseWriter, r *http.Request) {
handleError(fmt.Errorf("creating notice file failed: %w", err), http.StatusInternalServerError, rw)
return
}
ntxt.Close()
if err := ntxt.Close(); err != nil {
cclog.Warnf("Failed to close notice file: %v", err)
}
}
if err := os.WriteFile(noticeFilePath, []byte(newContent), noticeFilePerms); err != nil {
@@ -210,13 +260,66 @@ func (api *RestAPI) editNotice(rw http.ResponseWriter, r *http.Request) {
rw.Header().Set("Content-Type", "text/plain")
rw.WriteHeader(http.StatusOK)
var msg []byte
if newContent != "" {
rw.Write([]byte("Update Notice Content Success"))
msg = []byte("Update Notice Content Success")
} else {
rw.Write([]byte("Empty Notice Content Success"))
msg = []byte("Empty Notice Content Success")
}
if _, err := rw.Write(msg); err != nil {
cclog.Errorf("Failed to write response: %v", err)
}
}
func (api *RestAPI) getTaggers(rw http.ResponseWriter, r *http.Request) {
if user := repository.GetUserFromContext(r.Context()); !user.HasRole(schema.RoleAdmin) {
handleError(fmt.Errorf("only admins are allowed to list taggers"), http.StatusForbidden, rw)
return
}
rw.Header().Set("Content-Type", "application/json")
if err := json.NewEncoder(rw).Encode(tagger.ListTaggers()); err != nil {
cclog.Errorf("Failed to encode tagger list: %v", err)
}
}
func (api *RestAPI) runTagger(rw http.ResponseWriter, r *http.Request) {
if user := repository.GetUserFromContext(r.Context()); !user.HasRole(schema.RoleAdmin) {
handleError(fmt.Errorf("only admins are allowed to run taggers"), http.StatusForbidden, rw)
return
}
name := r.FormValue("name")
if name == "" {
handleError(fmt.Errorf("missing required parameter: name"), http.StatusBadRequest, rw)
return
}
if err := tagger.RunTaggerByName(name); err != nil {
handleError(err, http.StatusConflict, rw)
return
}
rw.Header().Set("Content-Type", "text/plain")
rw.WriteHeader(http.StatusOK)
if _, err := rw.Write(fmt.Appendf(nil, "Tagger %s started", name)); err != nil {
cclog.Errorf("Failed to write response: %v", err)
}
}
// getJWT godoc
// @summary Generate JWT token
// @tags Frontend
// @description Generates a JWT token for a user. Admins can generate tokens for any user, regular users only for themselves.
// @accept mpfd
// @produce plain
// @param username formData string true "Username to generate JWT for"
// @success 200 {string} string "JWT token"
// @failure 403 {object} ErrorResponse "Forbidden"
// @failure 404 {object} ErrorResponse "User Not Found"
// @failure 500 {object} ErrorResponse "Internal Server Error"
// @security ApiKeyAuth
// @router /jwt/ [get]
func (api *RestAPI) getJWT(rw http.ResponseWriter, r *http.Request) {
rw.Header().Set("Content-Type", "text/plain")
username := r.FormValue("username")
@@ -241,12 +344,22 @@ func (api *RestAPI) getJWT(rw http.ResponseWriter, r *http.Request) {
}
rw.WriteHeader(http.StatusOK)
rw.Write([]byte(jwt))
if _, err := rw.Write([]byte(jwt)); err != nil {
cclog.Errorf("Failed to write JWT response: %v", err)
}
}
// getRoles godoc
// @summary Get available roles
// @tags Config
// @description Returns a list of valid user roles. Only admins are allowed.
// @produce json
// @success 200 {array} string "List of role names"
// @failure 403 {object} ErrorResponse "Forbidden"
// @failure 500 {object} ErrorResponse "Internal Server Error"
// @security ApiKeyAuth
// @router /roles/ [get]
func (api *RestAPI) getRoles(rw http.ResponseWriter, r *http.Request) {
// SecuredCheck() only worked with TokenAuth: Removed
user := repository.GetUserFromContext(r.Context())
if !user.HasRole(schema.RoleAdmin) {
handleError(fmt.Errorf("only admins are allowed to fetch a list of roles"), http.StatusForbidden, rw)
@@ -265,6 +378,18 @@ func (api *RestAPI) getRoles(rw http.ResponseWriter, r *http.Request) {
}
}
// updateConfiguration godoc
// @summary Update user configuration
// @tags Frontend
// @description Updates a user's configuration key-value pair.
// @accept mpfd
// @produce plain
// @param key formData string true "Configuration key"
// @param value formData string true "Configuration value"
// @success 200 {string} string "success"
// @failure 500 {object} ErrorResponse "Internal Server Error"
// @security ApiKeyAuth
// @router /configuration/ [post]
func (api *RestAPI) updateConfiguration(rw http.ResponseWriter, r *http.Request) {
rw.Header().Set("Content-Type", "text/plain")
key, value := r.FormValue("key"), r.FormValue("value")
@@ -275,26 +400,40 @@ func (api *RestAPI) updateConfiguration(rw http.ResponseWriter, r *http.Request)
}
rw.WriteHeader(http.StatusOK)
rw.Write([]byte("success"))
if _, err := rw.Write([]byte("success")); err != nil {
cclog.Errorf("Failed to write response: %v", err)
}
}
// putMachineState godoc
// @summary Store machine state
// @tags Machine State
// @description Stores machine state data for a specific cluster node. Validates cluster and host names to prevent path traversal.
// @accept json
// @produce plain
// @param cluster path string true "Cluster name"
// @param host path string true "Host name"
// @success 201 "Created"
// @failure 400 {object} ErrorResponse "Bad Request"
// @failure 404 {object} ErrorResponse "Machine state not enabled"
// @failure 500 {object} ErrorResponse "Internal Server Error"
// @security ApiKeyAuth
// @router /machine_state/{cluster}/{host} [put]
func (api *RestAPI) putMachineState(rw http.ResponseWriter, r *http.Request) {
if api.MachineStateDir == "" {
handleError(fmt.Errorf("machine state not enabled"), http.StatusNotFound, rw)
return
}
vars := mux.Vars(r)
cluster := vars["cluster"]
host := vars["host"]
cluster := chi.URLParam(r, "cluster")
host := chi.URLParam(r, "host")
// Validate cluster and host to prevent path traversal attacks
if strings.Contains(cluster, "..") || strings.Contains(cluster, "/") || strings.Contains(cluster, "\\") {
handleError(fmt.Errorf("invalid cluster name"), http.StatusBadRequest, rw)
if err := validatePathComponent(cluster, "cluster name"); err != nil {
handleError(err, http.StatusBadRequest, rw)
return
}
if strings.Contains(host, "..") || strings.Contains(host, "/") || strings.Contains(host, "\\") {
handleError(fmt.Errorf("invalid host name"), http.StatusBadRequest, rw)
if err := validatePathComponent(host, "host name"); err != nil {
handleError(err, http.StatusBadRequest, rw)
return
}
@@ -320,23 +459,33 @@ func (api *RestAPI) putMachineState(rw http.ResponseWriter, r *http.Request) {
rw.WriteHeader(http.StatusCreated)
}
// getMachineState godoc
// @summary Retrieve machine state
// @tags Machine State
// @description Retrieves stored machine state data for a specific cluster node. Validates cluster and host names to prevent path traversal.
// @produce json
// @param cluster path string true "Cluster name"
// @param host path string true "Host name"
// @success 200 {object} object "Machine state JSON data"
// @failure 400 {object} ErrorResponse "Bad Request"
// @failure 404 {object} ErrorResponse "Machine state not enabled or file not found"
// @security ApiKeyAuth
// @router /machine_state/{cluster}/{host} [get]
func (api *RestAPI) getMachineState(rw http.ResponseWriter, r *http.Request) {
if api.MachineStateDir == "" {
handleError(fmt.Errorf("machine state not enabled"), http.StatusNotFound, rw)
return
}
vars := mux.Vars(r)
cluster := vars["cluster"]
host := vars["host"]
cluster := chi.URLParam(r, "cluster")
host := chi.URLParam(r, "host")
// Validate cluster and host to prevent path traversal attacks
if strings.Contains(cluster, "..") || strings.Contains(cluster, "/") || strings.Contains(cluster, "\\") {
handleError(fmt.Errorf("invalid cluster name"), http.StatusBadRequest, rw)
if err := validatePathComponent(cluster, "cluster name"); err != nil {
handleError(err, http.StatusBadRequest, rw)
return
}
if strings.Contains(host, "..") || strings.Contains(host, "/") || strings.Contains(host, "\\") {
handleError(fmt.Errorf("invalid host name"), http.StatusBadRequest, rw)
if err := validatePathComponent(host, "host name"); err != nil {
handleError(err, http.StatusBadRequest, rw)
return
}

View File

@@ -11,9 +11,9 @@ import (
"net/http"
"github.com/ClusterCockpit/cc-backend/internal/repository"
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
"github.com/ClusterCockpit/cc-lib/schema"
"github.com/gorilla/mux"
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
"github.com/ClusterCockpit/cc-lib/v2/schema"
"github.com/go-chi/chi/v5"
)
type APIReturnedUser struct {
@@ -31,7 +31,7 @@ type APIReturnedUser struct {
// @description Required query-parameter defines if all users or only users with additional special roles are returned.
// @produce json
// @param not-just-user query bool true "If returned list should contain all users or only users with additional special roles"
// @success 200 {array} api.ApiReturnedUser "List of users returned successfully"
// @success 200 {array} api.APIReturnedUser "List of users returned successfully"
// @failure 400 {string} string "Bad Request"
// @failure 401 {string} string "Unauthorized"
// @failure 403 {string} string "Forbidden"
@@ -91,7 +91,7 @@ func (api *RestAPI) updateUser(rw http.ResponseWriter, r *http.Request) {
// Handle role updates
if newrole != "" {
if err := repository.GetUserRepository().AddRole(r.Context(), mux.Vars(r)["id"], newrole); err != nil {
if err := repository.GetUserRepository().AddRole(r.Context(), chi.URLParam(r, "id"), newrole); err != nil {
handleError(fmt.Errorf("adding role failed: %w", err), http.StatusUnprocessableEntity, rw)
return
}
@@ -99,7 +99,7 @@ func (api *RestAPI) updateUser(rw http.ResponseWriter, r *http.Request) {
cclog.Errorf("Failed to encode response: %v", err)
}
} else if delrole != "" {
if err := repository.GetUserRepository().RemoveRole(r.Context(), mux.Vars(r)["id"], delrole); err != nil {
if err := repository.GetUserRepository().RemoveRole(r.Context(), chi.URLParam(r, "id"), delrole); err != nil {
handleError(fmt.Errorf("removing role failed: %w", err), http.StatusUnprocessableEntity, rw)
return
}
@@ -107,7 +107,7 @@ func (api *RestAPI) updateUser(rw http.ResponseWriter, r *http.Request) {
cclog.Errorf("Failed to encode response: %v", err)
}
} else if newproj != "" {
if err := repository.GetUserRepository().AddProject(r.Context(), mux.Vars(r)["id"], newproj); err != nil {
if err := repository.GetUserRepository().AddProject(r.Context(), chi.URLParam(r, "id"), newproj); err != nil {
handleError(fmt.Errorf("adding project failed: %w", err), http.StatusUnprocessableEntity, rw)
return
}
@@ -115,7 +115,7 @@ func (api *RestAPI) updateUser(rw http.ResponseWriter, r *http.Request) {
cclog.Errorf("Failed to encode response: %v", err)
}
} else if delproj != "" {
if err := repository.GetUserRepository().RemoveProject(r.Context(), mux.Vars(r)["id"], delproj); err != nil {
if err := repository.GetUserRepository().RemoveProject(r.Context(), chi.URLParam(r, "id"), delproj); err != nil {
handleError(fmt.Errorf("removing project failed: %w", err), http.StatusUnprocessableEntity, rw)
return
}
@@ -164,7 +164,7 @@ func (api *RestAPI) createUser(rw http.ResponseWriter, r *http.Request) {
return
}
if len(password) == 0 && role != schema.GetRoleString(schema.RoleApi) {
if len(password) == 0 && role != schema.GetRoleString(schema.RoleAPI) {
handleError(fmt.Errorf("only API users are allowed to have a blank password (login will be impossible)"), http.StatusBadRequest, rw)
return
}

View File

@@ -106,7 +106,7 @@ Data is archived at the highest available resolution (typically 60s intervals).
```go
// In archiver.go ArchiveJob() function
jobData, err := metricDataDispatcher.LoadData(job, allMetrics, scopes, ctx, 300)
jobData, err := metricdispatch.LoadData(job, allMetrics, scopes, ctx, 300)
// 0 = highest resolution
// 300 = 5-minute resolution
```
@@ -170,7 +170,6 @@ All exported functions are safe for concurrent use:
- `Start()` - Safe to call once
- `TriggerArchiving()` - Safe from multiple goroutines
- `Shutdown()` - Safe to call once
- `WaitForArchiving()` - Deprecated, but safe
Internal state is protected by:
- Channel synchronization (`archiveChannel`)
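A minimal lifecycle sketch, assuming the call signatures that appear elsewhere in this diff (`Start(repo, ctx)`, `TriggerArchiving(job)`, `Shutdown(timeout)`):

```go
// Sketch: typical archiver lifecycle under the signatures shown in this diff.
archiver.Start(repository.GetJobRepository(), context.Background())
defer func() {
	// Bounded shutdown so in-flight archiving can drain.
	if err := archiver.Shutdown(5 * time.Second); err != nil {
		cclog.Warnf("archiver shutdown timed out: %v", err)
	}
}()

// job is a *schema.Job that has already been marked stopped.
// Safe to call from any goroutine once Start has returned.
archiver.TriggerArchiving(job)
```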
@@ -185,6 +184,6 @@ Internal state is protected by:
## Dependencies
- `internal/repository`: Database operations for job metadata
- `internal/metricDataDispatcher`: Loading metric data from various backends
- `internal/metricdispatch`: Loading metric data from various backends
- `pkg/archive`: Archive backend abstraction (filesystem, S3, SQLite)
- `cc-lib/schema`: Job and metric data structures

View File

@@ -54,8 +54,8 @@ import (
"time"
"github.com/ClusterCockpit/cc-backend/internal/repository"
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
"github.com/ClusterCockpit/cc-lib/schema"
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
"github.com/ClusterCockpit/cc-lib/v2/schema"
sq "github.com/Masterminds/squirrel"
)
@@ -126,7 +126,7 @@ func archivingWorker() {
// Metadata is not used directly here; this call warms the JobMeta cache
// and fails if the job metadata is not present in the repository
if _, err := jobRepo.FetchMetadata(job); err != nil {
cclog.Errorf("archiving job (dbid: %d) failed at check metadata step: %s", job.ID, err.Error())
cclog.Errorf("archiving job (dbid: %d) failed at check metadata step: %s", *job.ID, err.Error())
jobRepo.UpdateMonitoringStatus(*job.ID, schema.MonitoringStatusArchivingFailed)
archivePending.Done()
continue
@@ -136,7 +136,7 @@ func archivingWorker() {
// Use shutdown context to allow cancellation
jobMeta, err := ArchiveJob(job, shutdownCtx)
if err != nil {
cclog.Errorf("archiving job (dbid: %d) failed at archiving job step: %s", job.ID, err.Error())
cclog.Errorf("archiving job (dbid: %d) failed at archiving job step: %s", *job.ID, err.Error())
jobRepo.UpdateMonitoringStatus(*job.ID, schema.MonitoringStatusArchivingFailed)
archivePending.Done()
continue
@@ -145,24 +145,24 @@ func archivingWorker() {
stmt := sq.Update("job").Where("job.id = ?", job.ID)
if stmt, err = jobRepo.UpdateFootprint(stmt, jobMeta); err != nil {
cclog.Errorf("archiving job (dbid: %d) failed at update Footprint step: %s", job.ID, err.Error())
cclog.Errorf("archiving job (dbid: %d) failed at update Footprint step: %s", *job.ID, err.Error())
archivePending.Done()
continue
}
if stmt, err = jobRepo.UpdateEnergy(stmt, jobMeta); err != nil {
cclog.Errorf("archiving job (dbid: %d) failed at update Energy step: %s", job.ID, err.Error())
cclog.Errorf("archiving job (dbid: %d) failed at update Energy step: %s", *job.ID, err.Error())
archivePending.Done()
continue
}
// Update the jobs database entry one last time:
stmt = jobRepo.MarkArchived(stmt, schema.MonitoringStatusArchivingSuccessful)
if err := jobRepo.Execute(stmt); err != nil {
cclog.Errorf("archiving job (dbid: %d) failed at db execute: %s", job.ID, err.Error())
cclog.Errorf("archiving job (dbid: %d) failed at db execute: %s", *job.ID, err.Error())
archivePending.Done()
continue
}
cclog.Debugf("archiving job %d took %s", job.JobID, time.Since(start))
cclog.Infof("archiving job (dbid: %d) successful", job.ID)
cclog.Infof("archiving job (dbid: %d) successful", *job.ID)
repository.CallJobStopHooks(job)
archivePending.Done()

View File

@@ -9,11 +9,10 @@ import (
"context"
"math"
"github.com/ClusterCockpit/cc-backend/internal/config"
"github.com/ClusterCockpit/cc-backend/internal/metricDataDispatcher"
"github.com/ClusterCockpit/cc-backend/internal/metricdispatch"
"github.com/ClusterCockpit/cc-backend/pkg/archive"
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
"github.com/ClusterCockpit/cc-lib/schema"
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
"github.com/ClusterCockpit/cc-lib/v2/schema"
)
// ArchiveJob archives a completed job's metric data to the configured archive backend.
@@ -60,7 +59,7 @@ func ArchiveJob(job *schema.Job, ctx context.Context) (*schema.Job, error) {
scopes = append(scopes, schema.MetricScopeAccelerator)
}
jobData, err := metricDataDispatcher.LoadData(job, allMetrics, scopes, ctx, 0) // 0 Resulotion-Value retrieves highest res (60s)
jobData, err := metricdispatch.LoadData(job, allMetrics, scopes, ctx, 0) // resolution value 0 retrieves highest res (60s)
if err != nil {
cclog.Error("Error wile loading job data for archiving")
return nil, err
@@ -94,12 +93,5 @@ func ArchiveJob(job *schema.Job, ctx context.Context) (*schema.Job, error) {
}
}
// If the file based archive is disabled,
// only return the JobMeta structure as the
// statistics in there are needed.
if config.Keys.DisableArchive {
return job, nil
}
return job, archive.GetHandle().ImportJob(job, &jobData)
}

View File

@@ -25,9 +25,9 @@ import (
"github.com/ClusterCockpit/cc-backend/internal/config"
"github.com/ClusterCockpit/cc-backend/internal/repository"
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
"github.com/ClusterCockpit/cc-lib/schema"
"github.com/ClusterCockpit/cc-lib/util"
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
"github.com/ClusterCockpit/cc-lib/v2/schema"
"github.com/ClusterCockpit/cc-lib/v2/util"
"github.com/gorilla/sessions"
)
@@ -40,7 +40,7 @@ type Authenticator interface {
// authenticator should attempt the login. This method should not perform
// expensive operations or actual authentication.
CanLogin(user *schema.User, username string, rw http.ResponseWriter, r *http.Request) (*schema.User, bool)
// Login performs the actual authentication for the user.
// It returns the authenticated user or an error if authentication fails.
// The user parameter may be nil if the user doesn't exist in the database yet.
@@ -65,13 +65,13 @@ var ipUserLimiters sync.Map
func getIPUserLimiter(ip, username string) *rate.Limiter {
key := ip + ":" + username
now := time.Now()
if entry, ok := ipUserLimiters.Load(key); ok {
rle := entry.(*rateLimiterEntry)
rle.lastUsed = now
return rle.limiter
}
// More aggressive rate limiting: 5 attempts per 15 minutes
newLimiter := rate.NewLimiter(rate.Every(15*time.Minute/5), 5)
ipUserLimiters.Store(key, &rateLimiterEntry{
@@ -176,7 +176,7 @@ func (auth *Authentication) AuthViaSession(
func Init(authCfg *json.RawMessage) {
initOnce.Do(func() {
authInstance = &Authentication{}
// Start background cleanup of rate limiters
startRateLimiterCleanup()
@@ -263,7 +263,7 @@ func GetAuthInstance() *Authentication {
}
// handleUserSync syncs or updates a user in the database based on configuration.
// This is used for both JWT and OIDC authentication when syncUserOnLogin or updateUserOnLogin is enabled.
// This is used for LDAP, JWT, and OIDC authentication when syncUserOnLogin or updateUserOnLogin is enabled.
func handleUserSync(user *schema.User, syncUserOnLogin, updateUserOnLogin bool) {
r := repository.GetUserRepository()
dbUser, err := r.GetUser(user.Username)
@@ -272,7 +272,7 @@ func handleUserSync(user *schema.User, syncUserOnLogin, updateUserOnLogin bool)
cclog.Errorf("Error while loading user '%s': %v", user.Username, err)
return
}
if err == sql.ErrNoRows && syncUserOnLogin { // Add new user
if err := r.AddUser(user); err != nil {
cclog.Errorf("Error while adding user '%s' to DB: %v", user.Username, err)
@@ -294,6 +294,11 @@ func handleOIDCUser(OIDCUser *schema.User) {
handleUserSync(OIDCUser, Keys.OpenIDConfig.SyncUserOnLogin, Keys.OpenIDConfig.UpdateUserOnLogin)
}
// handleLdapUser syncs LDAP user with database
func handleLdapUser(ldapUser *schema.User) {
handleUserSync(ldapUser, Keys.LdapConfig.SyncUserOnLogin, Keys.LdapConfig.UpdateUserOnLogin)
}
func (auth *Authentication) SaveSession(rw http.ResponseWriter, r *http.Request, user *schema.User) error {
session, err := auth.sessionStore.New(r, "session")
if err != nil {
@@ -305,8 +310,13 @@ func (auth *Authentication) SaveSession(rw http.ResponseWriter, r *http.Request,
if auth.SessionMaxAge != 0 {
session.Options.MaxAge = int(auth.SessionMaxAge.Seconds())
}
if config.Keys.HTTPSCertFile == "" && config.Keys.HTTPSKeyFile == "" {
cclog.Warn("HTTPS not configured - session cookies will not have Secure flag set (insecure for production)")
if r.TLS == nil && r.Header.Get("X-Forwarded-Proto") != "https" {
// If neither TLS nor an encrypted reverse proxy is used, do not mark cookies as Secure.
cclog.Warn("Authenticating with unencrypted request. Session cookies will not have Secure flag set (insecure for production)")
if r.Header.Get("X-Forwarded-Proto") == "" {
// This warning will not be printed if e.g. X-Forwarded-Proto == http
cclog.Warn("If you are using a reverse proxy, make sure X-Forwarded-Proto is set")
}
session.Options.Secure = false
}
session.Options.SameSite = http.SameSiteStrictMode
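For context, the secure-cookie decision above can be restated as a small predicate. This is an illustrative sketch, not code from the diff; it assumes the reverse proxy is trusted to set X-Forwarded-Proto honestly:

// requestIsHTTPS reports whether the original client connection was encrypted,
// either by TLS terminated in this process or by a reverse proxy in front of it.
func requestIsHTTPS(r *http.Request) bool {
	if r.TLS != nil {
		return true // TLS terminated here
	}
	// TLS terminated by a reverse proxy that forwards the original scheme.
	return r.Header.Get("X-Forwarded-Proto") == "https"
}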
@@ -438,13 +448,13 @@ func (auth *Authentication) AuthAPI(
if user != nil {
switch {
case len(user.Roles) == 1:
if user.HasRole(schema.RoleApi) {
if user.HasRole(schema.RoleAPI) {
ctx := context.WithValue(r.Context(), repository.ContextUserKey, user)
onsuccess.ServeHTTP(rw, r.WithContext(ctx))
return
}
case len(user.Roles) >= 2:
if user.HasAllRoles([]schema.Role{schema.RoleAdmin, schema.RoleApi}) {
if user.HasAllRoles([]schema.Role{schema.RoleAdmin, schema.RoleAPI}) {
ctx := context.WithValue(r.Context(), repository.ContextUserKey, user)
onsuccess.ServeHTTP(rw, r.WithContext(ctx))
return
@@ -474,13 +484,13 @@ func (auth *Authentication) AuthUserAPI(
if user != nil {
switch {
case len(user.Roles) == 1:
if user.HasRole(schema.RoleApi) {
if user.HasRole(schema.RoleAPI) {
ctx := context.WithValue(r.Context(), repository.ContextUserKey, user)
onsuccess.ServeHTTP(rw, r.WithContext(ctx))
return
}
case len(user.Roles) >= 2:
if user.HasRole(schema.RoleApi) && user.HasAnyRole([]schema.Role{schema.RoleUser, schema.RoleManager, schema.RoleSupport, schema.RoleAdmin}) {
if user.HasRole(schema.RoleAPI) && user.HasAnyRole([]schema.Role{schema.RoleUser, schema.RoleManager, schema.RoleSupport, schema.RoleAdmin}) {
ctx := context.WithValue(r.Context(), repository.ContextUserKey, user)
onsuccess.ServeHTTP(rw, r.WithContext(ctx))
return
@@ -510,13 +520,13 @@ func (auth *Authentication) AuthMetricStoreAPI(
if user != nil {
switch {
case len(user.Roles) == 1:
if user.HasRole(schema.RoleApi) {
if user.HasRole(schema.RoleAPI) {
ctx := context.WithValue(r.Context(), repository.ContextUserKey, user)
onsuccess.ServeHTTP(rw, r.WithContext(ctx))
return
}
case len(user.Roles) >= 2:
if user.HasRole(schema.RoleApi) && user.HasAnyRole([]schema.Role{schema.RoleUser, schema.RoleManager, schema.RoleAdmin}) {
if user.HasRole(schema.RoleAPI) && user.HasAnyRole([]schema.Role{schema.RoleUser, schema.RoleManager, schema.RoleAdmin}) {
ctx := context.WithValue(r.Context(), repository.ContextUserKey, user)
onsuccess.ServeHTTP(rw, r.WithContext(ctx))
return
@@ -616,9 +626,9 @@ func securedCheck(user *schema.User, r *http.Request) error {
}
// If SplitHostPort fails, IPAddress is already just a host (no port)
// If nothing declared in config: deny all request to this api endpoint
// If nothing declared in config: Continue
if len(config.Keys.APIAllowedIPs) == 0 {
return fmt.Errorf("missing configuration key ApiAllowedIPs")
return nil
}
// If wildcard declared in config: Continue
if config.Keys.APIAllowedIPs[0] == "*" {
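The allowlist semantics after this change, restated as a hedged sketch (the real securedCheck continues beyond the truncated hunk; the helper name here is illustrative only):

// ipAllowed mirrors the checks above: strip an optional ":port", then allow
// on an empty list, on the "*" wildcard, or on an exact allowlist match.
func ipAllowed(remoteAddr string, allowed []string) bool {
	host := remoteAddr
	if h, _, err := net.SplitHostPort(remoteAddr); err == nil {
		host = h
	}
	if len(allowed) == 0 || allowed[0] == "*" {
		return true
	}
	return slices.Contains(allowed, host)
}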

View File

@@ -15,25 +15,25 @@ import (
func TestGetIPUserLimiter(t *testing.T) {
ip := "192.168.1.1"
username := "testuser"
// Get limiter for the first time
limiter1 := getIPUserLimiter(ip, username)
if limiter1 == nil {
t.Fatal("Expected limiter to be created")
}
// Get the same limiter again
limiter2 := getIPUserLimiter(ip, username)
if limiter1 != limiter2 {
t.Error("Expected to get the same limiter instance")
}
// Get a different limiter for different user
limiter3 := getIPUserLimiter(ip, "otheruser")
if limiter1 == limiter3 {
t.Error("Expected different limiter for different user")
}
// Get a different limiter for different IP
limiter4 := getIPUserLimiter("192.168.1.2", username)
if limiter1 == limiter4 {
@@ -45,16 +45,16 @@ func TestGetIPUserLimiter(t *testing.T) {
func TestRateLimiterBehavior(t *testing.T) {
ip := "10.0.0.1"
username := "ratelimituser"
limiter := getIPUserLimiter(ip, username)
// Should allow first 5 attempts
for i := 0; i < 5; i++ {
for i := range 5 {
if !limiter.Allow() {
t.Errorf("Request %d should be allowed within rate limit", i+1)
}
}
// 6th attempt should be blocked
if limiter.Allow() {
t.Error("Request 6 should be blocked by rate limiter")
@@ -65,19 +65,19 @@ func TestRateLimiterBehavior(t *testing.T) {
func TestCleanupOldRateLimiters(t *testing.T) {
// Clear all existing limiters first to avoid interference from other tests
cleanupOldRateLimiters(time.Now().Add(24 * time.Hour))
// Create some new rate limiters
limiter1 := getIPUserLimiter("1.1.1.1", "user1")
limiter2 := getIPUserLimiter("2.2.2.2", "user2")
if limiter1 == nil || limiter2 == nil {
t.Fatal("Failed to create test limiters")
}
// Clean up with a cutoff one second in the past (should keep both)
time.Sleep(10 * time.Millisecond) // Small delay to ensure timestamp difference
cleanupOldRateLimiters(time.Now().Add(-1 * time.Second))
// Verify they still exist (should get same instance)
if getIPUserLimiter("1.1.1.1", "user1") != limiter1 {
t.Error("Limiter 1 was incorrectly cleaned up")
@@ -85,10 +85,10 @@ func TestCleanupOldRateLimiters(t *testing.T) {
if getIPUserLimiter("2.2.2.2", "user2") != limiter2 {
t.Error("Limiter 2 was incorrectly cleaned up")
}
// Clean up with a cutoff two hours in the future (should remove both, since they were last used just now)
cleanupOldRateLimiters(time.Now().Add(2 * time.Hour))
// Getting them again should create new instances
newLimiter1 := getIPUserLimiter("1.1.1.1", "user1")
if newLimiter1 == limiter1 {
@@ -107,14 +107,14 @@ func TestIPv4Extraction(t *testing.T) {
{"IPv4 without port", "192.168.1.1", "192.168.1.1"},
{"Localhost with port", "127.0.0.1:3000", "127.0.0.1"},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
result := tt.input
if host, _, err := net.SplitHostPort(result); err == nil {
result = host
}
if result != tt.expected {
t.Errorf("Expected %s, got %s", tt.expected, result)
}
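These tests lean on net.SplitHostPort returning an error for inputs without a port, in which case the raw input is kept as-is. A quick illustration of that fallback:

for _, in := range []string{"192.168.1.1:8080", "192.168.1.1", "[::1]:8080", "::1"} {
	host := in
	if h, _, err := net.SplitHostPort(in); err == nil {
		host = h // port present: keep only the host part
	}
	fmt.Println(in, "->", host) // -> 192.168.1.1, 192.168.1.1, ::1, ::1
}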
@@ -122,7 +122,7 @@ func TestIPv4Extraction(t *testing.T) {
}
}
// TestIPv6Extraction tests extracting IPv6 addresses
func TestIPv6Extraction(t *testing.T) {
tests := []struct {
name string
@@ -134,14 +134,14 @@ func TestIPv6Extraction(t *testing.T) {
{"IPv6 without port", "2001:db8::1", "2001:db8::1"},
{"IPv6 localhost", "::1", "::1"},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
result := tt.input
if host, _, err := net.SplitHostPort(result); err == nil {
result = host
}
if result != tt.expected {
t.Errorf("Expected %s, got %s", tt.expected, result)
}
@@ -160,14 +160,14 @@ func TestIPExtractionEdgeCases(t *testing.T) {
{"Empty string", "", ""},
{"Just port", ":8080", ""},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
result := tt.input
if host, _, err := net.SplitHostPort(result); err == nil {
result = host
}
if result != tt.expected {
t.Errorf("Expected %s, got %s", tt.expected, result)
}

View File

@@ -14,8 +14,8 @@ import (
"strings"
"time"
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
"github.com/ClusterCockpit/cc-lib/schema"
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
"github.com/ClusterCockpit/cc-lib/v2/schema"
"github.com/golang-jwt/jwt/v5"
)
@@ -25,20 +25,20 @@ type JWTAuthConfig struct {
MaxAge string `json:"max-age"`
// Specifies which cookie should be checked for a JWT token (if no authorization header is present)
CookieName string `json:"cookieName"`
CookieName string `json:"cookie-name"`
// Deny login for users not in database (but defined in JWT).
// Ignore user roles defined in JWTs ('roles' claim), get them from db.
ValidateUser bool `json:"validateUser"`
ValidateUser bool `json:"validate-user"`
// Specifies which issuer should be accepted when validating external JWTs ('iss' claim)
TrustedIssuer string `json:"trustedIssuer"`
TrustedIssuer string `json:"trusted-issuer"`
// Should a non-existent user be added to the DB based on the information in the token
SyncUserOnLogin bool `json:"syncUserOnLogin"`
SyncUserOnLogin bool `json:"sync-user-on-login"`
// Should an existing user be updated in the DB based on the information in the token
UpdateUserOnLogin bool `json:"updateUserOnLogin"`
UpdateUserOnLogin bool `json:"update-user-on-login"`
}
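With the tags renamed, a config fragment for this struct uses kebab-case keys throughout. The values below are placeholders for illustration, not defaults from the code:

"jwts": {
  "max-age": "2h",
  "cookie-name": "token",
  "validate-user": false,
  "trusted-issuer": "https://idp.example.com",
  "sync-user-on-login": true,
  "update-user-on-login": true
}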
type JWTAuthenticator struct {
@@ -101,20 +101,20 @@ func (ja *JWTAuthenticator) AuthViaJWT(
// Token is valid, extract payload
claims := token.Claims.(jwt.MapClaims)
// Use shared helper to get user from JWT claims
var user *schema.User
user, err = getUserFromJWT(claims, Keys.JwtConfig.ValidateUser, schema.AuthToken, -1)
if err != nil {
return nil, err
}
// If not validating user, we only get roles from JWT (no projects for this auth method)
if !Keys.JwtConfig.ValidateUser {
user.Roles = extractRolesFromClaims(claims, false)
user.Projects = nil // Standard JWT auth doesn't include projects
}
return user, nil
}

View File

@@ -12,8 +12,8 @@ import (
"net/http"
"os"
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
"github.com/ClusterCockpit/cc-lib/schema"
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
"github.com/ClusterCockpit/cc-lib/v2/schema"
"github.com/golang-jwt/jwt/v5"
)
@@ -146,13 +146,13 @@ func (ja *JWTCookieSessionAuthenticator) Login(
}
claims := token.Claims.(jwt.MapClaims)
// Use shared helper to get user from JWT claims
user, err = getUserFromJWT(claims, jc.ValidateUser, schema.AuthSession, schema.AuthViaToken)
if err != nil {
return nil, err
}
// Sync or update user if configured
if !jc.ValidateUser && (jc.SyncUserOnLogin || jc.UpdateUserOnLogin) {
handleTokenUser(user)

View File

@@ -9,10 +9,11 @@ import (
"database/sql"
"errors"
"fmt"
"strings"
"github.com/ClusterCockpit/cc-backend/internal/repository"
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
"github.com/ClusterCockpit/cc-lib/schema"
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
"github.com/ClusterCockpit/cc-lib/v2/schema"
"github.com/golang-jwt/jwt/v5"
)
@@ -28,7 +29,7 @@ func extractStringFromClaims(claims jwt.MapClaims, key string) string {
// If validateRoles is true, only valid roles are returned
func extractRolesFromClaims(claims jwt.MapClaims, validateRoles bool) []string {
var roles []string
if rawroles, ok := claims["roles"].([]any); ok {
for _, rr := range rawroles {
if r, ok := rr.(string); ok {
@@ -42,14 +43,14 @@ func extractRolesFromClaims(claims jwt.MapClaims, validateRoles bool) []string {
}
}
}
return roles
}
// extractProjectsFromClaims extracts projects from JWT claims
func extractProjectsFromClaims(claims jwt.MapClaims) []string {
projects := make([]string, 0)
if rawprojs, ok := claims["projects"].([]any); ok {
for _, pp := range rawprojs {
if p, ok := pp.(string); ok {
@@ -61,7 +62,7 @@ func extractProjectsFromClaims(claims jwt.MapClaims) []string {
projects = append(projects, projSlice...)
}
}
return projects
}
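Both helpers expect the claim values as []any of strings, which is what encoding/json produces for JSON arrays. An illustrative input/output pair, mirroring the claim shapes used in the tests further down:

claims := jwt.MapClaims{
	"roles":    []any{"user", "admin"},
	"projects": []any{"project1"},
}
roles := extractRolesFromClaims(claims, true) // -> ["user", "admin"]; invalid roles dropped
projects := extractProjectsFromClaims(claims) // -> ["project1"]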
@@ -72,22 +73,23 @@ func extractNameFromClaims(claims jwt.MapClaims) string {
if name, ok := claims["name"].(string); ok {
return name
}
// Try nested structure: {name: {values: [...]}}
if wrap, ok := claims["name"].(map[string]any); ok {
if vals, ok := wrap["values"].([]any); ok {
if len(vals) == 0 {
return ""
}
name := fmt.Sprintf("%v", vals[0])
var name strings.Builder
name.WriteString(fmt.Sprintf("%v", vals[0]))
for i := 1; i < len(vals); i++ {
name += fmt.Sprintf(" %v", vals[i])
name.WriteString(fmt.Sprintf(" %v", vals[i]))
}
return name
return name.String()
}
}
return ""
}
@@ -100,7 +102,7 @@ func getUserFromJWT(claims jwt.MapClaims, validateUser bool, authType schema.Aut
if sub == "" {
return nil, errors.New("missing 'sub' claim in JWT")
}
if validateUser {
// Validate user against database
ur := repository.GetUserRepository()
@@ -109,22 +111,22 @@ func getUserFromJWT(claims jwt.MapClaims, validateUser bool, authType schema.Aut
cclog.Errorf("Error while loading user '%v': %v", sub, err)
return nil, fmt.Errorf("database error: %w", err)
}
// Deny any logins for unknown usernames
if user == nil || err == sql.ErrNoRows {
cclog.Warn("Could not find user from JWT in internal database.")
return nil, errors.New("unknown user")
}
// Return database user (with database roles)
return user, nil
}
// Create user from JWT claims
name := extractNameFromClaims(claims)
roles := extractRolesFromClaims(claims, true) // Validate roles
projects := extractProjectsFromClaims(claims)
return &schema.User{
Username: sub,
Name: name,

View File

@@ -8,7 +8,7 @@ package auth
import (
"testing"
"github.com/ClusterCockpit/cc-lib/schema"
"github.com/ClusterCockpit/cc-lib/v2/schema"
"github.com/golang-jwt/jwt/v5"
)
@@ -19,7 +19,7 @@ func TestExtractStringFromClaims(t *testing.T) {
"email": "test@example.com",
"age": 25, // not a string
}
tests := []struct {
name string
key string
@@ -30,7 +30,7 @@ func TestExtractStringFromClaims(t *testing.T) {
{"Non-existent key", "missing", ""},
{"Non-string value", "age", ""},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
result := extractStringFromClaims(claims, tt.key)
@@ -88,16 +88,16 @@ func TestExtractRolesFromClaims(t *testing.T) {
expected: []string{},
},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
result := extractRolesFromClaims(tt.claims, tt.validateRoles)
if len(result) != len(tt.expected) {
t.Errorf("Expected %d roles, got %d", len(tt.expected), len(result))
return
}
for i, role := range result {
if i >= len(tt.expected) || role != tt.expected[i] {
t.Errorf("Expected role %s at position %d, got %s", tt.expected[i], i, role)
@@ -141,16 +141,16 @@ func TestExtractProjectsFromClaims(t *testing.T) {
expected: []string{"project1", "project2"}, // Should skip non-strings
},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
result := extractProjectsFromClaims(tt.claims)
if len(result) != len(tt.expected) {
t.Errorf("Expected %d projects, got %d", len(tt.expected), len(result))
return
}
for i, project := range result {
if i >= len(tt.expected) || project != tt.expected[i] {
t.Errorf("Expected project %s at position %d, got %s", tt.expected[i], i, project)
@@ -216,7 +216,7 @@ func TestExtractNameFromClaims(t *testing.T) {
expected: "123 Smith", // Should convert to string
},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
result := extractNameFromClaims(tt.claims)
@@ -235,29 +235,28 @@ func TestGetUserFromJWT_NoValidation(t *testing.T) {
"roles": []any{"user", "admin"},
"projects": []any{"project1", "project2"},
}
user, err := getUserFromJWT(claims, false, schema.AuthToken, -1)
if err != nil {
t.Fatalf("Unexpected error: %v", err)
}
if user.Username != "testuser" {
t.Errorf("Expected username 'testuser', got '%s'", user.Username)
}
if user.Name != "Test User" {
t.Errorf("Expected name 'Test User', got '%s'", user.Name)
}
if len(user.Roles) != 2 {
t.Errorf("Expected 2 roles, got %d", len(user.Roles))
}
if len(user.Projects) != 2 {
t.Errorf("Expected 2 projects, got %d", len(user.Projects))
}
if user.AuthType != schema.AuthToken {
t.Errorf("Expected AuthType %v, got %v", schema.AuthToken, user.AuthType)
}
@@ -268,13 +267,13 @@ func TestGetUserFromJWT_MissingSub(t *testing.T) {
claims := jwt.MapClaims{
"name": "Test User",
}
_, err := getUserFromJWT(claims, false, schema.AuthToken, -1)
if err == nil {
t.Error("Expected error for missing sub claim")
}
if err.Error() != "missing 'sub' claim in JWT" {
t.Errorf("Expected specific error message, got: %v", err)
}

View File

@@ -13,8 +13,8 @@ import (
"os"
"strings"
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
"github.com/ClusterCockpit/cc-lib/schema"
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
"github.com/ClusterCockpit/cc-lib/v2/schema"
"github.com/golang-jwt/jwt/v5"
)
@@ -75,13 +75,13 @@ func (ja *JWTSessionAuthenticator) Login(
}
claims := token.Claims.(jwt.MapClaims)
// Use shared helper to get user from JWT claims
user, err = getUserFromJWT(claims, Keys.JwtConfig.ValidateUser, schema.AuthSession, schema.AuthViaToken)
if err != nil {
return nil, err
}
// Sync or update user if configured
if !Keys.JwtConfig.ValidateUser && (Keys.JwtConfig.SyncUserOnLogin || Keys.JwtConfig.UpdateUserOnLogin) {
handleTokenUser(user)

View File

@@ -6,35 +6,39 @@
package auth
import (
"errors"
"fmt"
"net"
"net/http"
"os"
"strings"
"time"
"github.com/ClusterCockpit/cc-backend/internal/repository"
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
"github.com/ClusterCockpit/cc-lib/schema"
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
"github.com/ClusterCockpit/cc-lib/v2/schema"
"github.com/go-ldap/ldap/v3"
)
type LdapConfig struct {
URL string `json:"url"`
UserBase string `json:"user_base"`
SearchDN string `json:"search_dn"`
UserBind string `json:"user_bind"`
UserFilter string `json:"user_filter"`
UserAttr string `json:"username_attr"`
SyncInterval string `json:"sync_interval"` // Parsed using time.ParseDuration.
SyncDelOldUsers bool `json:"sync_del_old_users"`
UserBase string `json:"user-base"`
SearchDN string `json:"search-dn"`
UserBind string `json:"user-bind"`
UserFilter string `json:"user-filter"`
UserAttr string `json:"username-attr"`
UIDAttr string `json:"uid-attr"`
SyncInterval string `json:"sync-interval"` // Parsed using time.ParseDuration.
SyncDelOldUsers bool `json:"sync-del-old-users"`
// Should an non-existent user be added to the DB if user exists in ldap directory
SyncUserOnLogin bool `json:"syncUserOnLogin"`
// Should a non-existent user be added to the DB if the user exists in the LDAP directory
SyncUserOnLogin bool `json:"sync-user-on-login"`
UpdateUserOnLogin bool `json:"update-user-on-login"`
}
type LdapAuthenticator struct {
syncPassword string
UserAttr string
UIDAttr string
}
var _ Authenticator = (*LdapAuthenticator)(nil)
@@ -51,6 +55,12 @@ func (la *LdapAuthenticator) Init() error {
la.UserAttr = "gecos"
}
if Keys.LdapConfig.UIDAttr != "" {
la.UIDAttr = Keys.LdapConfig.UIDAttr
} else {
la.UIDAttr = "uid"
}
return nil
}
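A matching LDAP config fragment with the renamed keys; the values are placeholders, and uid-attr may be omitted to get the "uid" fallback from Init above:

"ldap": {
  "url": "ldaps://ldap.example.com",
  "user-base": "ou=people,dc=example,dc=com",
  "search-dn": "cn=admin,dc=example,dc=com",
  "user-bind": "uid={username},ou=people,dc=example,dc=com",
  "user-filter": "(objectClass=posixAccount)",
  "uid-attr": "uid",
  "sync-interval": "24h",
  "sync-user-on-login": true
}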
@@ -66,55 +76,44 @@ func (la *LdapAuthenticator) CanLogin(
if user.AuthSource == schema.AuthViaLDAP {
return user, true
}
} else {
if lc.SyncUserOnLogin {
l, err := la.getLdapConnection(true)
if err != nil {
cclog.Error("LDAP connection error")
return nil, false
}
defer l.Close()
// Search for the given username
searchRequest := ldap.NewSearchRequest(
lc.UserBase,
ldap.ScopeWholeSubtree, ldap.NeverDerefAliases, 0, 0, false,
fmt.Sprintf("(&%s(uid=%s))", lc.UserFilter, username),
[]string{"dn", "uid", la.UserAttr}, nil)
sr, err := l.Search(searchRequest)
if err != nil {
cclog.Warn(err)
return nil, false
}
if len(sr.Entries) != 1 {
cclog.Warn("LDAP: User does not exist or too many entries returned")
return nil, false
}
entry := sr.Entries[0]
name := entry.GetAttributeValue(la.UserAttr)
var roles []string
roles = append(roles, schema.GetRoleString(schema.RoleUser))
projects := make([]string, 0)
user = &schema.User{
Username: username,
Name: name,
Roles: roles,
Projects: projects,
AuthType: schema.AuthSession,
AuthSource: schema.AuthViaLDAP,
}
if err := repository.GetUserRepository().AddUser(user); err != nil {
cclog.Errorf("User '%s' LDAP: Insert into DB failed", username)
return nil, false
}
return user, true
} else if lc.SyncUserOnLogin {
l, err := la.getLdapConnection(true)
if err != nil {
cclog.Error("LDAP connection error")
return nil, false
}
defer l.Close()
// Search for the given username
searchRequest := ldap.NewSearchRequest(
lc.UserBase,
ldap.ScopeWholeSubtree, ldap.NeverDerefAliases, 0, 0, false,
fmt.Sprintf("(&%s(%s=%s))", lc.UserFilter, la.UIDAttr, ldap.EscapeFilter(username)),
[]string{"dn", la.UIDAttr, la.UserAttr}, nil)
sr, err := l.Search(searchRequest)
if err != nil {
cclog.Warn(err)
return nil, false
}
if len(sr.Entries) != 1 {
cclog.Warn("LDAP: User does not exist or too many entries returned")
return nil, false
}
entry := sr.Entries[0]
user = &schema.User{
Username: username,
Name: entry.GetAttributeValue(la.UserAttr),
Roles: []string{schema.GetRoleString(schema.RoleUser)},
Projects: make([]string, 0),
AuthType: schema.AuthSession,
AuthSource: schema.AuthViaLDAP,
}
handleLdapUser(user)
return user, true
}
return nil, false
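The new ldap.EscapeFilter call in the search above closes an LDAP filter injection hole. A sketch of what it prevents (outputs shown in comments):

raw := "alice)(uid=*"
fmt.Printf("(&(objectClass=person)(uid=%s))\n", raw)
// (&(objectClass=person)(uid=alice)(uid=*))       <- attacker changed the filter structure
fmt.Printf("(&(objectClass=person)(uid=%s))\n", ldap.EscapeFilter(raw))
// (&(objectClass=person)(uid=alice\29\28uid=\2a)) <- metacharacters neutralized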
@@ -132,7 +131,7 @@ func (la *LdapAuthenticator) Login(
}
defer l.Close()
userDn := strings.ReplaceAll(Keys.LdapConfig.UserBind, "{username}", user.Username)
userDn := strings.ReplaceAll(Keys.LdapConfig.UserBind, "{username}", ldap.EscapeDN(user.Username))
if err := l.Bind(userDn, r.FormValue("password")); err != nil {
cclog.Errorf("AUTH/LDAP > Authentication for user %s failed: %v",
user.Username, err)
@@ -170,7 +169,7 @@ func (la *LdapAuthenticator) Sync() error {
lc.UserBase,
ldap.ScopeWholeSubtree, ldap.NeverDerefAliases, 0, 0, false,
lc.UserFilter,
[]string{"dn", "uid", la.UserAttr}, nil))
[]string{"dn", la.UIDAttr, la.UserAttr}, nil))
if err != nil {
cclog.Warn("LDAP search error")
return err
@@ -178,9 +177,9 @@ func (la *LdapAuthenticator) Sync() error {
newnames := map[string]string{}
for _, entry := range ldapResults.Entries {
username := entry.GetAttributeValue("uid")
username := entry.GetAttributeValue(la.UIDAttr)
if username == "" {
return errors.New("no attribute 'uid'")
return fmt.Errorf("no attribute '%s'", la.UIDAttr)
}
_, ok := users[username]
@@ -194,20 +193,19 @@ func (la *LdapAuthenticator) Sync() error {
for username, where := range users {
if where == InDB && lc.SyncDelOldUsers {
ur.DelUser(username)
if err := ur.DelUser(username); err != nil {
cclog.Errorf("User '%s' LDAP: Delete from DB failed: %v", username, err)
return err
}
cclog.Debugf("sync: remove %v (does not show up in LDAP anymore)", username)
} else if where == InLdap {
name := newnames[username]
var roles []string
roles = append(roles, schema.GetRoleString(schema.RoleUser))
projects := make([]string, 0)
user := &schema.User{
Username: username,
Name: name,
Roles: roles,
Projects: projects,
Roles: []string{schema.GetRoleString(schema.RoleUser)},
Projects: make([]string, 0),
AuthSource: schema.AuthViaLDAP,
}
@@ -224,11 +222,13 @@ func (la *LdapAuthenticator) Sync() error {
func (la *LdapAuthenticator) getLdapConnection(admin bool) (*ldap.Conn, error) {
lc := Keys.LdapConfig
conn, err := ldap.DialURL(lc.URL)
conn, err := ldap.DialURL(lc.URL,
ldap.DialWithDialer(&net.Dialer{Timeout: 10 * time.Second}))
if err != nil {
cclog.Warn("LDAP URL dial failed")
return nil, err
}
conn.SetTimeout(30 * time.Second)
if admin {
if err := conn.Bind(lc.SearchDN, la.syncPassword); err != nil {

View File

@@ -9,8 +9,8 @@ import (
"fmt"
"net/http"
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
"github.com/ClusterCockpit/cc-lib/schema"
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
"github.com/ClusterCockpit/cc-lib/v2/schema"
"golang.org/x/crypto/bcrypt"
)

View File

@@ -9,23 +9,24 @@ import (
"context"
"crypto/rand"
"encoding/base64"
"fmt"
"io"
"net/http"
"os"
"time"
"github.com/ClusterCockpit/cc-backend/internal/repository"
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
"github.com/ClusterCockpit/cc-lib/schema"
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
"github.com/ClusterCockpit/cc-lib/v2/schema"
"github.com/coreos/go-oidc/v3/oidc"
"github.com/gorilla/mux"
"github.com/go-chi/chi/v5"
"golang.org/x/oauth2"
)
type OpenIDConfig struct {
Provider string `json:"provider"`
SyncUserOnLogin bool `json:"syncUserOnLogin"`
UpdateUserOnLogin bool `json:"updateUserOnLogin"`
SyncUserOnLogin bool `json:"sync-user-on-login"`
UpdateUserOnLogin bool `json:"update-user-on-login"`
}
type OIDC struct {
@@ -50,6 +51,7 @@ func setCallbackCookie(w http.ResponseWriter, r *http.Request, name, value strin
MaxAge: int(time.Hour.Seconds()),
Secure: r.TLS != nil,
HttpOnly: true,
SameSite: http.SameSiteLaxMode,
}
http.SetCookie(w, c)
}
@@ -59,7 +61,7 @@ func NewOIDC(a *Authentication) *OIDC {
// Use context with timeout for provider initialization
ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
defer cancel()
provider, err := oidc.NewProvider(ctx, Keys.OpenIDConfig.Provider)
if err != nil {
cclog.Fatal(err)
@@ -77,8 +79,7 @@ func NewOIDC(a *Authentication) *OIDC {
ClientID: clientID,
ClientSecret: clientSecret,
Endpoint: provider.Endpoint(),
RedirectURL: "oidc-callback",
Scopes: []string{oidc.ScopeOpenID, "profile", "email"},
Scopes: []string{oidc.ScopeOpenID, "profile"},
}
oa := &OIDC{provider: provider, client: client, clientID: clientID, authentication: a}
@@ -86,7 +87,7 @@ func NewOIDC(a *Authentication) *OIDC {
return oa
}
func (oa *OIDC) RegisterEndpoints(r *mux.Router) {
func (oa *OIDC) RegisterEndpoints(r chi.Router) {
r.HandleFunc("/oidc-login", oa.OAuth2Login)
r.HandleFunc("/oidc-callback", oa.OAuth2Callback)
}
@@ -119,57 +120,96 @@ func (oa *OIDC) OAuth2Callback(rw http.ResponseWriter, r *http.Request) {
// Exchange authorization code for token with timeout
ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
defer cancel()
token, err := oa.client.Exchange(ctx, code, oauth2.VerifierOption(codeVerifier))
if err != nil {
http.Error(rw, "Failed to exchange token: "+err.Error(), http.StatusInternalServerError)
cclog.Errorf("token exchange failed: %s", err.Error())
http.Error(rw, "Authentication failed during token exchange", http.StatusInternalServerError)
return
}
// Get user info from OIDC provider with same timeout
userInfo, err := oa.provider.UserInfo(ctx, oauth2.StaticTokenSource(token))
if err != nil {
http.Error(rw, "Failed to get userinfo: "+err.Error(), http.StatusInternalServerError)
cclog.Errorf("failed to get userinfo: %s", err.Error())
http.Error(rw, "Failed to retrieve user information", http.StatusInternalServerError)
return
}
// // Extract the ID Token from OAuth2 token.
// rawIDToken, ok := token.Extra("id_token").(string)
// if !ok {
// http.Error(rw, "Cannot access idToken", http.StatusInternalServerError)
// }
//
// verifier := oa.provider.Verifier(&oidc.Config{ClientID: oa.clientID})
// // Parse and verify ID Token payload.
// idToken, err := verifier.Verify(context.Background(), rawIDToken)
// if err != nil {
// http.Error(rw, "Failed to extract idToken: "+err.Error(), http.StatusInternalServerError)
// }
// Verify ID token and nonce to prevent replay attacks
rawIDToken, ok := token.Extra("id_token").(string)
if !ok {
http.Error(rw, "ID token not found in response", http.StatusInternalServerError)
return
}
nonceCookie, err := r.Cookie("nonce")
if err != nil {
http.Error(rw, "nonce cookie not found", http.StatusBadRequest)
return
}
verifier := oa.provider.Verifier(&oidc.Config{ClientID: oa.clientID})
idToken, err := verifier.Verify(ctx, rawIDToken)
if err != nil {
cclog.Errorf("ID token verification failed: %s", err.Error())
http.Error(rw, "ID token verification failed", http.StatusInternalServerError)
return
}
if idToken.Nonce != nonceCookie.Value {
http.Error(rw, "Nonce mismatch", http.StatusBadRequest)
return
}
projects := make([]string, 0)
// Extract custom claims
// Extract custom claims from userinfo
var claims struct {
Username string `json:"preferred_username"`
Name string `json:"name"`
Profile struct {
// Keycloak realm-level roles
RealmAccess struct {
Roles []string `json:"roles"`
} `json:"realm_access"`
// Keycloak client-level roles
ResourceAccess struct {
Client struct {
Roles []string `json:"roles"`
} `json:"clustercockpit"`
} `json:"resource_access"`
}
if err := userInfo.Claims(&claims); err != nil {
http.Error(rw, "Failed to extract Claims: "+err.Error(), http.StatusInternalServerError)
cclog.Errorf("failed to extract claims: %s", err.Error())
http.Error(rw, "Failed to extract user claims", http.StatusInternalServerError)
return
}
if claims.Username == "" {
http.Error(rw, "Username claim missing from OIDC provider", http.StatusBadRequest)
return
}
// Merge roles from both client-level and realm-level access
oidcRoles := append(claims.ResourceAccess.Client.Roles, claims.RealmAccess.Roles...)
roleSet := make(map[string]bool)
for _, r := range oidcRoles {
switch r {
case "user":
roleSet[schema.GetRoleString(schema.RoleUser)] = true
case "admin":
roleSet[schema.GetRoleString(schema.RoleAdmin)] = true
case "manager":
roleSet[schema.GetRoleString(schema.RoleManager)] = true
case "support":
roleSet[schema.GetRoleString(schema.RoleSupport)] = true
}
}
var roles []string
for _, r := range claims.Profile.Client.Roles {
switch r {
case "user":
roles = append(roles, schema.GetRoleString(schema.RoleUser))
case "admin":
roles = append(roles, schema.GetRoleString(schema.RoleAdmin))
}
for role := range roleSet {
roles = append(roles, role)
}
if len(roles) == 0 {
@@ -188,8 +228,12 @@ func (oa *OIDC) OAuth2Callback(rw http.ResponseWriter, r *http.Request) {
handleOIDCUser(user)
}
oa.authentication.SaveSession(rw, r, user)
cclog.Infof("login successfull: user: %#v (roles: %v, projects: %v)", user.Username, user.Roles, user.Projects)
if err := oa.authentication.SaveSession(rw, r, user); err != nil {
cclog.Errorf("session save failed for user %q: %s", user.Username, err.Error())
http.Error(rw, "Failed to create session", http.StatusInternalServerError)
return
}
cclog.Infof("login successful: user: %#v (roles: %v, projects: %v)", user.Username, user.Roles, user.Projects)
userCtx := context.WithValue(r.Context(), repository.ContextUserKey, user)
http.RedirectHandler("/", http.StatusTemporaryRedirect).ServeHTTP(rw, r.WithContext(userCtx))
}
@@ -206,7 +250,24 @@ func (oa *OIDC) OAuth2Login(rw http.ResponseWriter, r *http.Request) {
codeVerifier := oauth2.GenerateVerifier()
setCallbackCookie(rw, r, "verifier", codeVerifier)
// Generate nonce for ID token replay protection
nonce, err := randString(16)
if err != nil {
http.Error(rw, "Internal error", http.StatusInternalServerError)
return
}
setCallbackCookie(rw, r, "nonce", nonce)
// Build redirect URL from the incoming request
scheme := "https"
if r.TLS == nil && r.Header.Get("X-Forwarded-Proto") != "https" {
scheme = "http"
}
oa.client.RedirectURL = fmt.Sprintf("%s://%s/oidc-callback", scheme, r.Host)
// Redirect user to consent page to ask for permission
url := oa.client.AuthCodeURL(state, oauth2.AccessTypeOffline, oauth2.S256ChallengeOption(codeVerifier))
url := oa.client.AuthCodeURL(state, oauth2.AccessTypeOffline,
oauth2.S256ChallengeOption(codeVerifier),
oidc.Nonce(nonce))
http.Redirect(rw, r, url, http.StatusFound)
}
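Taken together with the callback above, the login flow now carries two one-time secrets. A condensed, hedged sketch of the round trip using the same APIs as this file:

// 1. Login (this handler):
//      verifier := oauth2.GenerateVerifier()          // PKCE secret, stored in a cookie
//      nonce, _ := randString(16)                     // replay guard, stored in a cookie
//      url := client.AuthCodeURL(state,
//               oauth2.S256ChallengeOption(verifier), // only the S256 hash leaves the server
//               oidc.Nonce(nonce))                    // provider echoes nonce into the ID token
// 2. Callback (OAuth2Callback above):
//      token, _ := client.Exchange(ctx, code, oauth2.VerifierOption(verifier))
//      idToken, _ := provider.Verifier(&oidc.Config{ClientID: clientID}).Verify(ctx, rawIDToken)
//      // reject unless idToken.Nonce matches the nonce cookie value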

View File

@@ -15,37 +15,44 @@ var configSchema = `
"description": "Configure how long a token is valid. As string parsable by time.ParseDuration()",
"type": "string"
},
"cookieName": {
"cookie-name": {
"description": "Cookie that should be checked for a JWT token.",
"type": "string"
},
"validateUser": {
"validate-user": {
"description": "Deny login for users not in database (but defined in JWT). Overwrite roles in JWT with database roles.",
"type": "boolean"
},
"trustedIssuer": {
"trusted-issuer": {
"description": "Issuer that should be accepted when validating external JWTs ",
"type": "string"
},
"syncUserOnLogin": {
"sync-user-on-login": {
"description": "Add non-existent user to DB at login attempt with values provided in JWT.",
"type": "boolean"
},
"update-user-on-login": {
"description": "Should an existent user attributes in the DB be updated at login attempt with values provided in JWT.",
"type": "boolean"
}
},
"required": ["max-age"]
},
"oidc": {
"provider": {
"description": "",
"type": "string"
},
"syncUserOnLogin": {
"description": "",
"type": "boolean"
},
"updateUserOnLogin": {
"description": "",
"type": "boolean"
"type": "object",
"properties": {
"provider": {
"description": "OpenID Connect provider URL.",
"type": "string"
},
"sync-user-on-login": {
"description": "Add non-existent user to DB at login attempt with values provided.",
"type": "boolean"
},
"update-user-on-login": {
"description": "Should an existent user attributes in the DB be updated at login attempt with values provided.",
"type": "boolean"
}
},
"required": ["provider"]
},
@@ -57,40 +64,48 @@ var configSchema = `
"description": "URL of LDAP directory server.",
"type": "string"
},
"user_base": {
"user-base": {
"description": "Base DN of user tree root.",
"type": "string"
},
"search_dn": {
"search-dn": {
"description": "DN for authenticating LDAP admin account with general read rights.",
"type": "string"
},
"user_bind": {
"user-bind": {
"description": "Expression used to authenticate users via LDAP bind. Must contain uid={username}.",
"type": "string"
},
"user_filter": {
"user-filter": {
"description": "Filter to extract users for syncing.",
"type": "string"
},
"username_attr": {
"username-attr": {
"description": "Attribute with full username. Default: gecos",
"type": "string"
},
"sync_interval": {
"sync-interval": {
"description": "Interval used for syncing local user table with LDAP directory. Parsed using time.ParseDuration.",
"type": "string"
},
"sync_del_old_users": {
"sync-del-old-users": {
"description": "Delete obsolete users in database.",
"type": "boolean"
},
"syncUserOnLogin": {
"uid-attr": {
"description": "LDAP attribute used as login username. Default: uid",
"type": "string"
},
"sync-user-on-login": {
"description": "Add non-existent user to DB at login attempt if user exists in Ldap directory",
"type": "boolean"
},
"update-user-on-login": {
"description": "Should an existent user attributes in the DB be updated at login attempt with values from LDAP.",
"type": "boolean"
}
},
"required": ["url", "user_base", "search_dn", "user_bind", "user_filter"]
"required": ["url", "user-base", "search-dn", "user-bind", "user-filter"]
},
"required": ["jwts"]
}`
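An auth config fragment validated by this schema might look as follows (the provider URL and realm name are placeholders):

"auth": {
  "jwts": {
    "max-age": "2m"
  },
  "oidc": {
    "provider": "https://keycloak.example.com/realms/clustercockpit",
    "sync-user-on-login": true,
    "update-user-on-login": false
  }
}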

View File

@@ -11,8 +11,8 @@ import (
"encoding/json"
"time"
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
"github.com/ClusterCockpit/cc-lib/resampler"
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
"github.com/ClusterCockpit/cc-lib/v2/resampler"
)
type ProgramConfig struct {
@@ -20,7 +20,9 @@ type ProgramConfig struct {
Addr string `json:"addr"`
// Addresses from which secured admin API endpoints can be reached; can be the wildcard "*"
APIAllowedIPs []string `json:"apiAllowedIPs"`
APIAllowedIPs []string `json:"api-allowed-ips"`
APISubjects *NATSConfig `json:"api-subjects"`
// Drop root permissions once .env was read and the port was taken.
User string `json:"user"`
@@ -35,16 +37,9 @@ type ProgramConfig struct {
EmbedStaticFiles bool `json:"embed-static-files"`
StaticFiles string `json:"static-files"`
// 'sqlite3' or 'mysql' (mysql will work for mariadb as well)
DBDriver string `json:"db-driver"`
// For sqlite3 a filename, for mysql a DSN in this format: https://github.com/go-sql-driver/mysql#dsn-data-source-name (Without query parameters!).
// Path to SQLite database file
DB string `json:"db"`
// Keep all metric data in the metric data repositories,
// do not write to the job-archive.
DisableArchive bool `json:"disable-archive"`
EnableJobTaggers bool `json:"enable-job-taggers"`
// Validate json input against schema
@@ -76,17 +71,42 @@ type ProgramConfig struct {
// If exists, will enable dynamic zoom in frontend metric plots using the configured values
EnableResampling *ResampleConfig `json:"resampling"`
// Systemd unit name for log viewer (default: "clustercockpit")
SystemdUnit string `json:"systemd-unit"`
// Node state retention configuration
NodeStateRetention *NodeStateRetention `json:"nodestate-retention"`
}
type NodeStateRetention struct {
Policy string `json:"policy"` // "delete" or "move"
Age int `json:"age"` // hours, default 24
TargetKind string `json:"target-kind"` // "file" or "s3"
TargetPath string `json:"target-path"`
TargetEndpoint string `json:"target-endpoint"`
TargetBucket string `json:"target-bucket"`
TargetAccessKey string `json:"target-access-key"`
TargetSecretKey string `json:"target-secret-key"`
TargetRegion string `json:"target-region"`
TargetUsePathStyle bool `json:"target-use-path-style"`
MaxFileSizeMB int `json:"max-file-size-mb"`
}
type ResampleConfig struct {
// Minimum number of points to trigger resampling of data
MinimumPoints int `json:"minimumPoints"`
MinimumPoints int `json:"minimum-points"`
// Array of resampling target resolutions, in seconds; Example: [600,300,60]
Resolutions []int `json:"resolutions"`
// Trigger next zoom level at less than this many visible datapoints
Trigger int `json:"trigger"`
}
type NATSConfig struct {
SubjectJobEvent string `json:"subject-job-event"`
SubjectNodeState string `json:"subject-node-state"`
}
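Putting the two new sections together, a main config fragment could look like this. The NATS subject names are purely illustrative, while age and max-file-size-mb show the defaults documented in the schema below:

"api-subjects": {
  "subject-job-event": "cc.job.event",
  "subject-node-state": "cc.node.state"
},
"nodestate-retention": {
  "policy": "move",
  "age": 24,
  "target-kind": "file",
  "target-path": "./var/nodestate-archive",
  "max-file-size-mb": 128
}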
type IntRange struct {
From int `json:"from"`
To int `json:"to"`
@@ -100,32 +120,20 @@ type TimeRange struct {
type FilterRanges struct {
Duration *IntRange `json:"duration"`
NumNodes *IntRange `json:"numNodes"`
StartTime *TimeRange `json:"startTime"`
NumNodes *IntRange `json:"num-nodes"`
StartTime *TimeRange `json:"start-time"`
}
type ClusterConfig struct {
Name string `json:"name"`
FilterRanges *FilterRanges `json:"filterRanges"`
MetricDataRepository json.RawMessage `json:"metricDataRepository"`
}
var Clusters []*ClusterConfig
var Keys ProgramConfig = ProgramConfig{
Addr: "localhost:8080",
DisableAuthentication: false,
EmbedStaticFiles: true,
DBDriver: "sqlite3",
DB: "./var/job.db",
DisableArchive: false,
Validate: false,
SessionMaxAge: "168h",
StopJobsExceedingWalltime: 0,
ShortRunningJobsDuration: 5 * 60,
}
func Init(mainConfig json.RawMessage, clusterConfig json.RawMessage) {
func Init(mainConfig json.RawMessage) {
Validate(configSchema, mainConfig)
dec := json.NewDecoder(bytes.NewReader(mainConfig))
dec.DisallowUnknownFields()
@@ -133,17 +141,6 @@ func Init(mainConfig json.RawMessage, clusterConfig json.RawMessage) {
cclog.Abortf("Config Init: Could not decode config file '%s'.\nError: %s\n", mainConfig, err.Error())
}
Validate(clustersSchema, clusterConfig)
dec = json.NewDecoder(bytes.NewReader(clusterConfig))
dec.DisallowUnknownFields()
if err := dec.Decode(&Clusters); err != nil {
cclog.Abortf("Config Init: Could not decode config file '%s'.\nError: %s\n", mainConfig, err.Error())
}
if len(Clusters) < 1 {
cclog.Abort("Config Init: At least one cluster required in config. Exited with error.")
}
if Keys.EnableResampling != nil && Keys.EnableResampling.MinimumPoints > 0 {
resampler.SetMinimumRequiredPoints(Keys.EnableResampling.MinimumPoints)
}

View File

@@ -8,19 +8,15 @@ package config
import (
"testing"
ccconf "github.com/ClusterCockpit/cc-lib/ccConfig"
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
ccconf "github.com/ClusterCockpit/cc-lib/v2/ccConfig"
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
)
func TestInit(t *testing.T) {
fp := "../../configs/config.json"
ccconf.Init(fp)
if cfg := ccconf.GetPackageConfig("main"); cfg != nil {
if clustercfg := ccconf.GetPackageConfig("clusters"); clustercfg != nil {
Init(cfg, clustercfg)
} else {
cclog.Abort("Cluster configuration must be present")
}
Init(cfg)
} else {
cclog.Abort("Main configuration must be present")
}
@@ -34,11 +30,7 @@ func TestInitMinimal(t *testing.T) {
fp := "../../configs/config-demo.json"
ccconf.Init(fp)
if cfg := ccconf.GetPackageConfig("main"); cfg != nil {
if clustercfg := ccconf.GetPackageConfig("clusters"); clustercfg != nil {
Init(cfg, clustercfg)
} else {
cclog.Abort("Cluster configuration must be present")
}
Init(cfg)
} else {
cclog.Abort("Main configuration must be present")
}

View File

@@ -15,7 +15,7 @@ import (
type DefaultMetricsCluster struct {
Name string `json:"name"`
DefaultMetrics string `json:"default_metrics"`
DefaultMetrics string `json:"default-metrics"`
}
type DefaultMetricsConfig struct {

View File

@@ -6,14 +6,14 @@
package config
var configSchema = `
{
{
"type": "object",
"properties": {
"addr": {
"description": "Address where the http (or https) server will listen on (for example: 'localhost:80').",
"type": "string"
},
"apiAllowedIPs": {
"api-allowed-ips": {
"description": "Addresses from which secured API endpoints can be reached",
"type": "array",
"items": {
@@ -41,13 +41,9 @@ var configSchema = `
"type": "string"
},
"db": {
"description": "For sqlite3 a filename, for mysql a DSN in this format: https://github.com/go-sql-driver/mysql#dsn-data-source-name (Without query parameters!).",
"description": "Path to SQLite database file (e.g., './var/job.db')",
"type": "string"
},
"disable-archive": {
"description": "Keep all metric data in the metric data repositories, do not write to the job-archive.",
"type": "boolean"
},
"enable-job-taggers": {
"description": "Turn on automatic application and jobclass taggers",
"type": "boolean"
@@ -81,28 +77,22 @@ var configSchema = `
"type": "integer"
},
"emission-constant": {
"description": ".",
"description": "Energy mix CO2 emission constant [g/kWh]. If set, displays estimated CO2 emission for jobs.",
"type": "integer"
},
"cron-frequency": {
"description": "Frequency of cron job workers.",
"type": "object",
"properties": {
"duration-worker": {
"description": "Duration Update Worker [Defaults to '5m']",
"type": "string"
},
"footprint-worker": {
"description": "Metric-Footprint Update Worker [Defaults to '10m']",
"type": "string"
}
}
"machine-state-dir": {
"description": "Where to store MachineState files.",
"type": "string"
},
"enable-resampling": {
"systemd-unit": {
"description": "Systemd unit name for log viewer (default: 'clustercockpit').",
"type": "string"
},
"resampling": {
"description": "Enable dynamic zoom in frontend metric plots.",
"type": "object",
"properties": {
"minimumPoints": {
"minimum-points": {
"description": "Minimum points to trigger resampling of time-series data.",
"type": "integer"
},
@@ -119,87 +109,74 @@ var configSchema = `
}
},
"required": ["trigger", "resolutions"]
}
},
"required": ["apiAllowedIPs"]
}`
var clustersSchema = `
{
"type": "array",
"items": {
},
"api-subjects": {
"description": "NATS subjects configuration for subscribing to job and node events.",
"type": "object",
"properties": {
"name": {
"description": "The name of the cluster.",
"subject-job-event": {
"description": "NATS subject for job events (start_job, stop_job)",
"type": "string"
},
"metricDataRepository": {
"description": "Type of the metric data repository for this cluster",
"type": "object",
"properties": {
"kind": {
"type": "string",
"enum": ["influxdb", "prometheus", "cc-metric-store", "cc-metric-store-internal", "test"]
},
"url": {
"type": "string"
},
"token": {
"type": "string"
}
},
"required": ["kind"]
},
"filterRanges": {
"description": "This option controls the slider ranges for the UI controls of numNodes, duration, and startTime.",
"type": "object",
"properties": {
"numNodes": {
"description": "UI slider range for number of nodes",
"type": "object",
"properties": {
"from": {
"type": "integer"
},
"to": {
"type": "integer"
}
},
"required": ["from", "to"]
},
"duration": {
"description": "UI slider range for duration",
"type": "object",
"properties": {
"from": {
"type": "integer"
},
"to": {
"type": "integer"
}
},
"required": ["from", "to"]
},
"startTime": {
"description": "UI slider range for start time",
"type": "object",
"properties": {
"from": {
"type": "string",
"format": "date-time"
},
"to": {
"type": "null"
}
},
"required": ["from", "to"]
}
},
"required": ["numNodes", "duration", "startTime"]
"subject-node-state": {
"description": "NATS subject for node state updates",
"type": "string"
}
},
"required": ["name", "metricDataRepository", "filterRanges"],
"minItems": 1
"required": ["subject-job-event", "subject-node-state"]
},
"nodestate-retention": {
"description": "Node state retention configuration for cleaning up old node_state rows.",
"type": "object",
"properties": {
"policy": {
"description": "Retention policy: 'delete' to remove old rows, 'move' to archive to Parquet then delete.",
"type": "string",
"enum": ["delete", "move"]
},
"age": {
"description": "Retention age in hours (default: 24).",
"type": "integer"
},
"target-kind": {
"description": "Target kind for parquet archiving: 'file' or 's3'.",
"type": "string",
"enum": ["file", "s3"]
},
"target-path": {
"description": "Filesystem path for parquet file target.",
"type": "string"
},
"target-endpoint": {
"description": "S3 endpoint URL.",
"type": "string"
},
"target-bucket": {
"description": "S3 bucket name.",
"type": "string"
},
"target-access-key": {
"description": "S3 access key.",
"type": "string"
},
"target-secret-key": {
"description": "S3 secret key.",
"type": "string"
},
"target-region": {
"description": "S3 region.",
"type": "string"
},
"target-use-path-style": {
"description": "Use path-style S3 addressing.",
"type": "boolean"
},
"max-file-size-mb": {
"description": "Maximum parquet file size in MB (default: 128).",
"type": "integer"
}
},
"required": ["policy"]
}
}`
}
}`

View File

@@ -8,7 +8,7 @@ package config
import (
"encoding/json"
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
"github.com/santhosh-tekuri/jsonschema/v5"
)

File diff suppressed because it is too large Load Diff

View File

@@ -10,9 +10,21 @@ import (
"time"
"github.com/ClusterCockpit/cc-backend/internal/config"
"github.com/ClusterCockpit/cc-lib/schema"
"github.com/ClusterCockpit/cc-lib/v2/schema"
)
type ClusterMetricWithName struct {
Name string `json:"name"`
Unit *schema.Unit `json:"unit,omitempty"`
Timestep int `json:"timestep"`
Data []schema.Float `json:"data"`
}
type ClusterMetrics struct {
NodeCount int `json:"nodeCount"`
Metrics []*ClusterMetricWithName `json:"metrics"`
}
type Count struct {
Name string `json:"name"`
Count int `json:"count"`
@@ -59,6 +71,7 @@ type JobFilter struct {
Project *StringInput `json:"project,omitempty"`
JobName *StringInput `json:"jobName,omitempty"`
Cluster *StringInput `json:"cluster,omitempty"`
SubCluster *StringInput `json:"subCluster,omitempty"`
Partition *StringInput `json:"partition,omitempty"`
Duration *config.IntRange `json:"duration,omitempty"`
Energy *FloatRange `json:"energy,omitempty"`
@@ -70,6 +83,7 @@ type JobFilter struct {
State []schema.JobState `json:"state,omitempty"`
MetricStats []*MetricStatItem `json:"metricStats,omitempty"`
Shared *string `json:"shared,omitempty"`
Schedule *string `json:"schedule,omitempty"`
Node *StringInput `json:"node,omitempty"`
}
@@ -173,7 +187,7 @@ type NamedStatsWithScope struct {
type NodeFilter struct {
Hostname *StringInput `json:"hostname,omitempty"`
Cluster *StringInput `json:"cluster,omitempty"`
Subcluster *StringInput `json:"subcluster,omitempty"`
SubCluster *StringInput `json:"subCluster,omitempty"`
SchedulerState *schema.SchedulerState `json:"schedulerState,omitempty"`
HealthState *string `json:"healthState,omitempty"`
TimeStart *int `json:"timeStart,omitempty"`

View File

@@ -4,7 +4,7 @@ import (
"sync"
"github.com/ClusterCockpit/cc-backend/internal/repository"
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
"github.com/jmoiron/sqlx"
)

View File

@@ -1,13 +1,15 @@
package graph
// This file will be automatically regenerated based on the schema, any resolver implementations
// This file will be automatically regenerated based on the schema, any resolver
// implementations
// will be copied through when generating and any unknown code will be moved to the end.
// Code generated by github.com/99designs/gqlgen version v0.17.81
// Code generated by github.com/99designs/gqlgen version v0.17.87
import (
"context"
"errors"
"fmt"
"math"
"regexp"
"slices"
"strconv"
@@ -17,11 +19,12 @@ import (
"github.com/ClusterCockpit/cc-backend/internal/config"
"github.com/ClusterCockpit/cc-backend/internal/graph/generated"
"github.com/ClusterCockpit/cc-backend/internal/graph/model"
"github.com/ClusterCockpit/cc-backend/internal/metricDataDispatcher"
"github.com/ClusterCockpit/cc-backend/internal/metricdispatch"
"github.com/ClusterCockpit/cc-backend/internal/repository"
"github.com/ClusterCockpit/cc-backend/pkg/archive"
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
"github.com/ClusterCockpit/cc-lib/schema"
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
ccunit "github.com/ClusterCockpit/cc-lib/v2/ccUnits"
"github.com/ClusterCockpit/cc-lib/v2/schema"
)
// Partitions is the resolver for the partitions field.
@@ -86,14 +89,14 @@ func (r *jobResolver) EnergyFootprint(ctx context.Context, obj *schema.Job) ([]*
res := []*model.EnergyFootprintValue{}
for name, value := range rawEnergyFootprint {
// Suboptimal: Nearly hardcoded metric name expectations
matchCpu := regexp.MustCompile(`cpu|Cpu|CPU`)
matchCPU := regexp.MustCompile(`cpu|Cpu|CPU`)
matchAcc := regexp.MustCompile(`acc|Acc|ACC`)
matchMem := regexp.MustCompile(`mem|Mem|MEM`)
matchCore := regexp.MustCompile(`core|Core|CORE`)
hwType := ""
switch test := name; { // Notice ';' for var declaration
case matchCpu.MatchString(test):
case matchCPU.MatchString(test):
hwType = "CPU"
case matchAcc.MatchString(test):
hwType = "Accelerator"
@@ -173,9 +176,9 @@ func (r *mutationResolver) AddTagsToJob(ctx context.Context, job string, tagIds
}
tags := []*schema.Tag{}
for _, tagId := range tagIds {
for _, tagID := range tagIds {
// Get ID
tid, err := strconv.ParseInt(tagId, 10, 64)
tid, err := strconv.ParseInt(tagID, 10, 64)
if err != nil {
cclog.Warn("Error while parsing tag id")
return nil, err
@@ -220,9 +223,9 @@ func (r *mutationResolver) RemoveTagsFromJob(ctx context.Context, job string, ta
}
tags := []*schema.Tag{}
for _, tagId := range tagIds {
for _, tagID := range tagIds {
// Get ID
tid, err := strconv.ParseInt(tagId, 10, 64)
tid, err := strconv.ParseInt(tagID, 10, 64)
if err != nil {
cclog.Warn("Error while parsing tag id")
return nil, err
@@ -263,9 +266,9 @@ func (r *mutationResolver) RemoveTagFromList(ctx context.Context, tagIds []strin
}
tags := []int{}
for _, tagId := range tagIds {
for _, tagID := range tagIds {
// Get ID
tid, err := strconv.ParseInt(tagId, 10, 64)
tid, err := strconv.ParseInt(tagID, 10, 64)
if err != nil {
cclog.Warn("Error while parsing tag id for removal")
return nil, err
@@ -281,7 +284,7 @@ func (r *mutationResolver) RemoveTagFromList(ctx context.Context, tagIds []strin
// Test Access: Admins && Admin Tag OR Everyone && Private Tag
if user.HasRole(schema.RoleAdmin) && (tscope == "global" || tscope == "admin") || user.Username == tscope {
// Remove from DB
if err = r.Repo.RemoveTagById(tid); err != nil {
if err = r.Repo.RemoveTagByID(tid); err != nil {
cclog.Warn("Error while removing tag")
return nil, err
} else {
@@ -315,18 +318,39 @@ func (r *nodeResolver) SchedulerState(ctx context.Context, obj *schema.Node) (sc
if obj.NodeState != "" {
return obj.NodeState, nil
} else {
return "", fmt.Errorf("No SchedulerState (NodeState) on Object")
return "", fmt.Errorf("resolver: no SchedulerState (NodeState) on node object")
}
}
// HealthState is the resolver for the healthState field.
func (r *nodeResolver) HealthState(ctx context.Context, obj *schema.Node) (string, error) {
panic(fmt.Errorf("not implemented: HealthState - healthState"))
if obj.HealthState != "" {
return string(obj.HealthState), nil
} else {
return "", fmt.Errorf("resolver: no HealthState (NodeState) on node object")
}
}
// MetaData is the resolver for the metaData field.
func (r *nodeResolver) MetaData(ctx context.Context, obj *schema.Node) (any, error) {
panic(fmt.Errorf("not implemented: MetaData - metaData"))
if obj.MetaData != nil {
return obj.MetaData, nil
} else {
cclog.Debug("resolver: no MetaData (NodeState) on node object")
emptyMeta := make(map[string]string, 0)
return emptyMeta, nil
}
}
// HealthData is the resolver for the healthData field.
func (r *nodeResolver) HealthData(ctx context.Context, obj *schema.Node) (any, error) {
if obj.HealthData != nil {
return obj.HealthData, nil
} else {
cclog.Debug("resolver: no HealthData (NodeState) on node object")
emptyHealth := make(map[string][]string, 0)
return emptyHealth, nil
}
}
// Clusters is the resolver for the clusters field.
@@ -341,6 +365,14 @@ func (r *queryResolver) Tags(ctx context.Context) ([]*schema.Tag, error) {
// GlobalMetrics is the resolver for the globalMetrics field.
func (r *queryResolver) GlobalMetrics(ctx context.Context) ([]*schema.GlobalMetricListItem, error) {
user := repository.GetUserFromContext(ctx)
if user != nil {
if user.HasRole(schema.RoleUser) || user.HasRole(schema.RoleManager) {
return archive.GlobalUserMetricList, nil
}
}
return archive.GlobalMetricList, nil
}
@@ -371,12 +403,12 @@ func (r *queryResolver) AllocatedNodes(ctx context.Context, cluster string) ([]*
// Node is the resolver for the node field.
func (r *queryResolver) Node(ctx context.Context, id string) (*schema.Node, error) {
repo := repository.GetNodeRepository()
numericId, err := strconv.ParseInt(id, 10, 64)
numericID, err := strconv.ParseInt(id, 10, 64)
if err != nil {
cclog.Warn("Error while parsing job id")
return nil, err
}
return repo.GetNodeById(numericId, false)
return repo.GetNodeByID(numericID, false)
}
// Nodes is the resolver for the nodes field.
@@ -387,6 +419,15 @@ func (r *queryResolver) Nodes(ctx context.Context, filter []*model.NodeFilter, o
return &model.NodeStateResultList{Items: nodes, Count: &count}, err
}
// NodesWithMeta is the resolver for the nodesWithMeta field.
func (r *queryResolver) NodesWithMeta(ctx context.Context, filter []*model.NodeFilter, order *model.OrderByInput) (*model.NodeStateResultList, error) {
// Why an extra handler? graphql.CollectAllFields(ctx) only returns top-level fields (i.e. items, count), not subfields like item.metaData
repo := repository.GetNodeRepository()
nodes, err := repo.QueryNodesWithMeta(ctx, filter, nil, order) // Ignore Paging, Order Unused
count := len(nodes)
return &model.NodeStateResultList{Items: nodes, Count: &count}, err
}
// NodeStates is the resolver for the nodeStates field.
func (r *queryResolver) NodeStates(ctx context.Context, filter []*model.NodeFilter) ([]*model.NodeStates, error) {
repo := repository.GetNodeRepository()
@@ -403,8 +444,7 @@ func (r *queryResolver) NodeStates(ctx context.Context, filter []*model.NodeFilt
return nil, herr
}
allCounts := make([]*model.NodeStates, 0)
allCounts = append(stateCounts, healthCounts...)
allCounts := append(stateCounts, healthCounts...)
return allCounts, nil
}
@@ -431,18 +471,18 @@ func (r *queryResolver) NodeStatesTimed(ctx context.Context, filter []*model.Nod
return healthCounts, nil
}
return nil, errors.New("Unknown Node State Query Type")
return nil, errors.New("unknown Node State Query Type")
}
// Job is the resolver for the job field.
func (r *queryResolver) Job(ctx context.Context, id string) (*schema.Job, error) {
numericId, err := strconv.ParseInt(id, 10, 64)
numericID, err := strconv.ParseInt(id, 10, 64)
if err != nil {
cclog.Warn("Error while parsing job id")
return nil, err
}
job, err := r.Repo.FindById(ctx, numericId)
job, err := r.Repo.FindByID(ctx, numericID)
if err != nil {
cclog.Warn("Error while finding job by id")
return nil, err
@@ -475,7 +515,7 @@ func (r *queryResolver) JobMetrics(ctx context.Context, id string, metrics []str
return nil, err
}
data, err := metricDataDispatcher.LoadData(job, metrics, scopes, ctx, *resolution)
data, err := metricdispatch.LoadData(job, metrics, scopes, ctx, *resolution)
if err != nil {
cclog.Warn("Error while loading job data")
return nil, err
@@ -503,7 +543,7 @@ func (r *queryResolver) JobStats(ctx context.Context, id string, metrics []strin
return nil, err
}
data, err := metricDataDispatcher.LoadJobStats(job, metrics, ctx)
data, err := metricdispatch.LoadJobStats(job, metrics, ctx)
if err != nil {
cclog.Warnf("Error while loading jobStats data for job id %s", id)
return nil, err
@@ -528,7 +568,7 @@ func (r *queryResolver) ScopedJobStats(ctx context.Context, id string, metrics [
return nil, err
}
data, err := metricDataDispatcher.LoadScopedJobStats(job, metrics, scopes, ctx)
data, err := metricdispatch.LoadScopedJobStats(job, metrics, scopes, ctx)
if err != nil {
cclog.Warnf("Error while loading scopedJobStats data for job id %s", id)
return nil, err
@@ -542,7 +582,7 @@ func (r *queryResolver) ScopedJobStats(ctx context.Context, id string, metrics [
for _, stat := range stats {
mdlStats = append(mdlStats, &model.ScopedStats{
Hostname: stat.Hostname,
ID: stat.Id,
ID: stat.ID,
Data: stat.Data,
})
}
@@ -581,21 +621,24 @@ func (r *queryResolver) Jobs(ctx context.Context, filter []*model.JobFilter, pag
// Note: Even if App-Default 'config.Keys.UiDefaults["job_list_usePaging"]' is set, always return hasNextPage boolean.
// Users can decide in frontend to use continuous scroll, even if app-default is paging!
// Skip if page.ItemsPerPage == -1 ("Load All" -> No Next Page required, Status Dashboards)
/*
Example: Page 4 @ 10 IpP : Does item 41 exist?
Probe: Page 41 @ 1 IpP : If len(result) is 1, Page 5 @ 10 IpP exists.
*/
nextPage := &model.PageRequest{
ItemsPerPage: 1,
Page: ((page.Page * page.ItemsPerPage) + 1),
hasNextPage := false
if page.ItemsPerPage != -1 {
nextPage := &model.PageRequest{
ItemsPerPage: 1,
Page: ((page.Page * page.ItemsPerPage) + 1),
}
nextJobs, err := r.Repo.QueryJobs(ctx, filter, nextPage, order)
if err != nil {
cclog.Warn("Error while querying next jobs")
return nil, err
}
hasNextPage = len(nextJobs) == 1
}
nextJobs, err := r.Repo.QueryJobs(ctx, filter, nextPage, order)
if err != nil {
cclog.Warn("Error while querying next jobs")
return nil, err
}
hasNextPage := len(nextJobs) == 1
return &model.JobResultList{Items: jobs, Count: &count, HasNextPage: &hasNextPage}, nil
}
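A worked check of the probe arithmetic above (pages are 1-indexed): page 4 at 10 items per page shows items 31-40, and

probe := (page.Page * page.ItemsPerPage) + 1 // 4*10 + 1 = 41

read as "page 41 at 1 item per page" fetches exactly item 41; getting one row back means page 5 at 10 items per page is non-empty.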
@@ -693,7 +736,7 @@ func (r *queryResolver) JobsMetricStats(ctx context.Context, filter []*model.Job
res := []*model.JobStats{}
for _, job := range jobs {
data, err := metricDataDispatcher.LoadJobStats(job, metrics, ctx)
data, err := metricdispatch.LoadJobStats(job, metrics, ctx)
if err != nil {
cclog.Warnf("Error while loading comparison jobStats data for job id %d", job.JobID)
continue
@@ -744,13 +787,19 @@ func (r *queryResolver) NodeMetrics(ctx context.Context, cluster string, nodes [
return nil, errors.New("you need to be administrator or support staff for this query")
}
defaultMetrics := make([]string, 0)
for _, mc := range archive.GetCluster(cluster).MetricConfig {
defaultMetrics = append(defaultMetrics, mc.Name)
}
if metrics == nil {
for _, mc := range archive.GetCluster(cluster).MetricConfig {
metrics = append(metrics, mc.Name)
}
metrics = defaultMetrics
} else {
metrics = slices.DeleteFunc(metrics, func(metric string) bool {
return !slices.Contains(defaultMetrics, metric) // Remove undefined metrics.
})
}
data, err := metricDataDispatcher.LoadNodeData(cluster, metrics, nodes, scopes, from, to, ctx)
data, err := metricdispatch.LoadNodeData(cluster, metrics, nodes, scopes, from, to, ctx)
if err != nil {
cclog.Warn("error while loading node data")
return nil, err
@@ -804,153 +853,39 @@ func (r *queryResolver) NodeMetricsList(ctx context.Context, cluster string, sub
return nil, errors.New("you need to be administrator or support staff for this query")
}
nodeRepo := repository.GetNodeRepository()
// nodes -> array of hostnames
nodes, stateMap, countNodes, hasNextPage, nerr := nodeRepo.GetNodesForList(ctx, cluster, subCluster, stateFilter, nodeFilter, page)
if nerr != nil {
return nil, errors.New("could not retrieve node list required for resolving NodeMetricsList")
}
if metrics == nil {
for _, mc := range archive.GetCluster(cluster).MetricConfig {
metrics = append(metrics, mc.Name)
}
}
// Build Filters
queryFilters := make([]*model.NodeFilter, 0)
if cluster != "" {
queryFilters = append(queryFilters, &model.NodeFilter{Cluster: &model.StringInput{Eq: &cluster}})
}
if subCluster != "" {
queryFilters = append(queryFilters, &model.NodeFilter{Subcluster: &model.StringInput{Eq: &subCluster}})
}
if nodeFilter != "" && stateFilter != "notindb" {
queryFilters = append(queryFilters, &model.NodeFilter{Hostname: &model.StringInput{Contains: &nodeFilter}})
}
if stateFilter != "all" && stateFilter != "notindb" {
var queryState schema.SchedulerState = schema.SchedulerState(stateFilter)
queryFilters = append(queryFilters, &model.NodeFilter{SchedulerState: &queryState})
}
// if healthFilter != "all" {
// filters = append(filters, &model.NodeFilter{HealthState: &healthFilter})
// }
// Special Case: Disable Paging for missing nodes filter, save IPP for later
var backupItems int
if stateFilter == "notindb" {
backupItems = page.ItemsPerPage
page.ItemsPerPage = -1
}
// Query Nodes From DB
nodeRepo := repository.GetNodeRepository()
rawNodes, serr := nodeRepo.QueryNodes(ctx, queryFilters, page, nil) // Order not Used
if serr != nil {
cclog.Warn("error while loading node database data (Resolver.NodeMetricsList)")
return nil, serr
}
// Intermediate Node Result Info
nodes := make([]string, 0)
stateMap := make(map[string]string)
for _, node := range rawNodes {
nodes = append(nodes, node.Hostname)
stateMap[node.Hostname] = string(node.NodeState)
}
// Setup Vars
var countNodes int
var cerr error
var hasNextPage bool
// Special Case: Find Nodes not in DB node table but in metricStore only
if stateFilter == "notindb" {
// Reapply Original Paging
page.ItemsPerPage = backupItems
// Get Nodes From Topology
var topoNodes []string
if subCluster != "" {
scNodes := archive.NodeLists[cluster][subCluster]
topoNodes = scNodes.PrintList()
} else {
subClusterNodeLists := archive.NodeLists[cluster]
for _, nodeList := range subClusterNodeLists {
topoNodes = append(topoNodes, nodeList.PrintList()...)
}
}
// Compare to all nodes from cluster/subcluster in DB
var missingNodes []string
for _, scanNode := range topoNodes {
if !slices.Contains(nodes, scanNode) {
missingNodes = append(missingNodes, scanNode)
}
}
// Filter nodes by name
if nodeFilter != "" {
filteredNodesByName := []string{}
for _, missingNode := range missingNodes {
if strings.Contains(missingNode, nodeFilter) {
filteredNodesByName = append(filteredNodesByName, missingNode)
}
}
missingNodes = filteredNodesByName
}
// Sort Missing Nodes Alphanumerically
slices.Sort(missingNodes)
// Total Missing
countNodes = len(missingNodes)
// Apply paging
if countNodes > page.ItemsPerPage {
start := (page.Page - 1) * page.ItemsPerPage
end := start + page.ItemsPerPage
if end > countNodes {
end = countNodes
hasNextPage = false
} else {
hasNextPage = true
}
nodes = missingNodes[start:end]
} else {
nodes = missingNodes
}
} else {
// DB Nodes: Count and Find Next Page
countNodes, cerr = nodeRepo.CountNodes(ctx, queryFilters)
if cerr != nil {
cclog.Warn("error while counting node database data (Resolver.NodeMetricsList)")
return nil, cerr
}
// Example Page 4 @ 10 IpP : Does item 41 exist?
// Minimal Page 41 @ 1 IpP : If len(result) is 1, Page 5 exists.
nextPage := &model.PageRequest{
ItemsPerPage: 1,
Page: ((page.Page * page.ItemsPerPage) + 1),
}
nextNodes, err := nodeRepo.QueryNodes(ctx, queryFilters, nextPage, nil) // Order not Used
if err != nil {
cclog.Warn("Error while querying next nodes")
return nil, err
}
hasNextPage = len(nextNodes) == 1
}
// Load Metric Data For Specified Nodes Only
data, err := metricDataDispatcher.LoadNodeListData(cluster, subCluster, nodes, metrics, scopes, *resolution, from, to, ctx)
// data -> map hostname:jobdata
data, err := metricdispatch.LoadNodeListData(cluster, subCluster, nodes, metrics, scopes, *resolution, from, to, ctx)
if err != nil {
cclog.Warn("error while loading node data (Resolver.NodeMetricsList")
return nil, err
}
// Build Result
nodeMetricsList := make([]*model.NodeMetrics, 0, len(data))
for hostname, metrics := range data {
for _, hostname := range nodes {
host := &model.NodeMetrics{
Host: hostname,
State: stateMap[hostname],
Metrics: make([]*model.JobMetricWithName, 0, len(metrics)*len(scopes)),
Metrics: make([]*model.JobMetricWithName, 0),
}
host.SubCluster, err = archive.GetSubClusterByNode(cluster, hostname)
if err != nil {
cclog.Warnf("error in nodeMetrics resolver: %s", err)
}
for metric, scopedMetrics := range metrics {
for metric, scopedMetrics := range data[hostname] {
for scope, scopedMetric := range scopedMetrics {
host.Metrics = append(host.Metrics, &model.JobMetricWithName{
Name: metric,
@@ -963,9 +898,9 @@ func (r *queryResolver) NodeMetricsList(ctx context.Context, cluster string, sub
nodeMetricsList = append(nodeMetricsList, host)
}
// Final Return
nodeMetricsListResult := &model.NodesResultList{
Items: nodeMetricsList,
Items: nodeMetricsList,
// TotalNodes depends on sum of nodes grouped on latest timestamp, see repo/node.go:357
TotalNodes: &countNodes,
HasNextPage: &hasNextPage,
}
@@ -973,6 +908,99 @@ func (r *queryResolver) NodeMetricsList(ctx context.Context, cluster string, sub
return nodeMetricsListResult, nil
}
// ClusterMetrics is the resolver for the clusterMetrics field.
func (r *queryResolver) ClusterMetrics(ctx context.Context, cluster string, metrics []string, from time.Time, to time.Time) (*model.ClusterMetrics, error) {
user := repository.GetUserFromContext(ctx)
if user != nil && !user.HasAnyRole([]schema.Role{schema.RoleAdmin, schema.RoleSupport}) {
return nil, errors.New("you need to be administrator or support staff for this query")
}
if metrics == nil {
for _, mc := range archive.GetCluster(cluster).MetricConfig {
metrics = append(metrics, mc.Name)
}
}
// 'nodes' == nil -> Defaults to all nodes of cluster for existing query workflow
scopes := []schema.MetricScope{"node"}
data, err := metricdispatch.LoadNodeData(cluster, metrics, nil, scopes, from, to, ctx)
if err != nil {
cclog.Warn("error while loading node data")
return nil, err
}
clusterMetricData := make([]*model.ClusterMetricWithName, 0)
clusterMetrics := model.ClusterMetrics{NodeCount: 0, Metrics: clusterMetricData}
collectorTimestep := make(map[string]int)
collectorUnit := make(map[string]schema.Unit)
collectorData := make(map[string][]schema.Float)
for _, metrics := range data {
clusterMetrics.NodeCount += 1
for metric, scopedMetrics := range metrics {
for _, scopedMetric := range scopedMetrics {
// Collect Info Once
_, okTimestep := collectorTimestep[metric]
if !okTimestep {
collectorTimestep[metric] = scopedMetric.Timestep
}
_, okUnit := collectorUnit[metric]
if !okUnit {
collectorUnit[metric] = scopedMetric.Unit
}
// Collect Data
for _, ser := range scopedMetric.Series {
_, okData := collectorData[metric]
// Init With Datasize > 0
if !okData && len(ser.Data) != 0 {
collectorData[metric] = make([]schema.Float, len(ser.Data))
} else if !okData {
cclog.Debugf("[SCHEMARESOLVER] clusterMetrics skip init: no data -> %s at %s; size %d", metric, ser.Hostname, len(ser.Data))
}
// Sum if init'd and matching size
if okData && len(ser.Data) == len(collectorData[metric]) {
for i, val := range ser.Data {
if val.IsNaN() {
continue
} else {
collectorData[metric][i] += val
}
}
} else if okData {
cclog.Debugf("[SCHEMARESOLVER] clusterMetrics skip sum: data diff -> %s at %s; want size %d, have size %d", metric, ser.Hostname, len(collectorData[metric]), len(ser.Data))
}
}
}
}
}
for metricName, data := range collectorData {
// use ccUnits for backend normalization to "Tera"
p_old := ccunit.NewPrefix(collectorUnit[metricName].Prefix)
p_new := ccunit.NewPrefix("T")
convFunc := ccunit.GetPrefixPrefixFactor(p_old, p_new)
u_new := schema.Unit{Prefix: p_new.Prefix(), Base: collectorUnit[metricName].Base}
roundedData := make([]schema.Float, 0)
for _, v_old := range data {
v_new := math.Round(convFunc(float64(v_old)).(float64)*100.0) / 100.0
roundedData = append(roundedData, schema.Float(v_new))
}
cm := model.ClusterMetricWithName{
Name: metricName,
Unit: &u_new,
Timestep: collectorTimestep[metricName],
Data: roundedData,
}
clusterMetrics.Metrics = append(clusterMetrics.Metrics, &cm)
}
return &clusterMetrics, nil
}
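// Worked example for the normalization above (values hypothetical): a
// summed metric reported with prefix "G" is scaled by 10^(9-12) = 1e-3
// towards "T", so 1234.5 G becomes 1.2345 T, which the
// math.Round(x*100.0)/100.0 step rounds to 1.23.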
// NumberOfNodes is the resolver for the numberOfNodes field.
func (r *subClusterResolver) NumberOfNodes(ctx context.Context, obj *schema.SubCluster) (int, error) {
nodeList, err := archive.ParseNodeList(obj.Nodes)

View File

@@ -2,18 +2,20 @@
// All rights reserved. This file is part of cc-backend.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
package graph
import (
"context"
"fmt"
"math"
"slices"
"github.com/99designs/gqlgen/graphql"
"github.com/ClusterCockpit/cc-backend/internal/graph/model"
"github.com/ClusterCockpit/cc-backend/internal/metricDataDispatcher"
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
"github.com/ClusterCockpit/cc-lib/schema"
"github.com/ClusterCockpit/cc-backend/internal/metricdispatch"
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
"github.com/ClusterCockpit/cc-lib/v2/schema"
)
const MAX_JOBS_FOR_ANALYSIS = 500
@@ -53,15 +55,15 @@ func (r *queryResolver) rooflineHeatmap(
// resolution = max(resolution, mc.Timestep)
// }
jobdata, err := metricDataDispatcher.LoadData(job, []string{"flops_any", "mem_bw"}, []schema.MetricScope{schema.MetricScopeNode}, ctx, 0)
jobdata, err := metricdispatch.LoadData(job, []string{"flops_any", "mem_bw"}, []schema.MetricScope{schema.MetricScopeNode}, ctx, 0)
if err != nil {
cclog.Errorf("Error while loading roofline metrics for job %d", job.ID)
cclog.Warnf("Error while loading roofline metrics for job %d", *job.ID)
return nil, err
}
flops_, membw_ := jobdata["flops_any"], jobdata["mem_bw"]
if flops_ == nil && membw_ == nil {
cclog.Infof("rooflineHeatmap(): 'flops_any' or 'mem_bw' missing for job %d", job.ID)
cclog.Warnf("rooflineHeatmap(): 'flops_any' or 'mem_bw' missing for job %d", *job.ID)
continue
// return nil, fmt.Errorf("GRAPH/UTIL > 'flops_any' or 'mem_bw' missing for job %d", job.ID)
}
@@ -126,7 +128,7 @@ func (r *queryResolver) jobsFootprints(ctx context.Context, filter []*model.JobF
continue
}
if err := metricDataDispatcher.LoadAverages(job, metrics, avgs, ctx); err != nil {
if err := metricdispatch.LoadAverages(job, metrics, avgs, ctx); err != nil {
cclog.Error("Error while loading averages for footprint")
return nil, err
}
@@ -185,11 +187,5 @@ func (r *queryResolver) jobsFootprints(ctx context.Context, filter []*model.JobF
func requireField(ctx context.Context, name string) bool {
fields := graphql.CollectAllFields(ctx)
for _, f := range fields {
if f == name {
return true
}
}
return false
return slices.Contains(fields, name)
}

View File

@@ -2,6 +2,7 @@
// All rights reserved. This file is part of cc-backend.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
package importer
import (
@@ -14,8 +15,8 @@ import (
"github.com/ClusterCockpit/cc-backend/internal/config"
"github.com/ClusterCockpit/cc-backend/internal/repository"
"github.com/ClusterCockpit/cc-backend/pkg/archive"
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
"github.com/ClusterCockpit/cc-lib/schema"
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
"github.com/ClusterCockpit/cc-lib/v2/schema"
)
// HandleImportFlag imports jobs from file pairs specified in a comma-separated flag string.
@@ -37,7 +38,7 @@ import (
func HandleImportFlag(flag string) error {
r := repository.GetJobRepository()
for _, pair := range strings.Split(flag, ",") {
for pair := range strings.SplitSeq(flag, ",") {
files := strings.Split(pair, ":")
if len(files) != 2 {
return fmt.Errorf("REPOSITORY/INIT > invalid import flag format")
@@ -101,7 +102,7 @@ func HandleImportFlag(flag string) error {
return err
}
id, err := r.InsertJob(&job)
id, err := r.InsertJobDirect(&job)
if err != nil {
cclog.Warn("Error while job db insert")
return err

View File

@@ -16,8 +16,8 @@ import (
"github.com/ClusterCockpit/cc-backend/internal/importer"
"github.com/ClusterCockpit/cc-backend/internal/repository"
"github.com/ClusterCockpit/cc-backend/pkg/archive"
ccconf "github.com/ClusterCockpit/cc-lib/ccConfig"
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
ccconf "github.com/ClusterCockpit/cc-lib/v2/ccConfig"
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
)
// copyFile copies a file from source path to destination path.
@@ -50,42 +50,14 @@ func setup(t *testing.T) *repository.JobRepository {
"main": {
"addr": "0.0.0.0:8080",
"validate": false,
"apiAllowedIPs": [
"api-allowed-ips": [
"*"
]},
"archive": {
"kind": "file",
"path": "./var/job-archive"
},
"clusters": [
{
"name": "testcluster",
"metricDataRepository": {"kind": "test", "url": "bla:8081"},
"filterRanges": {
"numNodes": { "from": 1, "to": 64 },
"duration": { "from": 0, "to": 86400 },
"startTime": { "from": "2022-01-01T00:00:00Z", "to": null }
}
},
{
"name": "fritz",
"metricDataRepository": {"kind": "test", "url": "bla:8081"},
"filterRanges": {
"numNodes": { "from": 1, "to": 944 },
"duration": { "from": 0, "to": 86400 },
"startTime": { "from": "2022-01-01T00:00:00Z", "to": null }
}
},
{
"name": "taurus",
"metricDataRepository": {"kind": "test", "url": "bla:8081"},
"filterRanges": {
"numNodes": { "from": 1, "to": 4000 },
"duration": { "from": 0, "to": 604800 },
"startTime": { "from": "2010-01-01T00:00:00Z", "to": null }
}
}
]}`
}
}`
cclog.Init("info", true)
tmpdir := t.TempDir()
@@ -107,7 +79,7 @@ func setup(t *testing.T) *repository.JobRepository {
}
dbfilepath := filepath.Join(tmpdir, "test.db")
err := repository.MigrateDB("sqlite3", dbfilepath)
err := repository.MigrateDB(dbfilepath)
if err != nil {
t.Fatal(err)
}
@@ -121,22 +93,18 @@ func setup(t *testing.T) *repository.JobRepository {
// Load and check main configuration
if cfg := ccconf.GetPackageConfig("main"); cfg != nil {
if clustercfg := ccconf.GetPackageConfig("clusters"); clustercfg != nil {
config.Init(cfg, clustercfg)
} else {
t.Fatal("Cluster configuration must be present")
}
config.Init(cfg)
} else {
t.Fatal("Main configuration must be present")
}
archiveCfg := fmt.Sprintf("{\"kind\": \"file\",\"path\": \"%s\"}", jobarchive)
if err := archive.Init(json.RawMessage(archiveCfg), config.Keys.DisableArchive); err != nil {
if err := archive.Init(json.RawMessage(archiveCfg)); err != nil {
t.Fatal(err)
}
repository.Connect("sqlite3", dbfilepath)
repository.Connect(dbfilepath)
return repository.GetJobRepository()
}
@@ -197,7 +165,7 @@ func TestHandleImportFlag(t *testing.T) {
}
result := readResult(t, testname)
job, err := r.FindCached(&result.JobId, &result.Cluster, &result.StartTime)
job, err := r.Find(&result.JobId, &result.Cluster, &result.StartTime)
if err != nil {
t.Fatal(err)
}

View File

@@ -22,8 +22,8 @@ import (
"github.com/ClusterCockpit/cc-backend/internal/repository"
"github.com/ClusterCockpit/cc-backend/pkg/archive"
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
"github.com/ClusterCockpit/cc-lib/schema"
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
"github.com/ClusterCockpit/cc-lib/v2/schema"
)
const (
@@ -111,18 +111,22 @@ func InitDB() error {
continue
}
id, err := r.TransactionAddNamed(t,
id, jobErr := r.TransactionAddNamed(t,
repository.NamedJobInsert, jobMeta)
if err != nil {
cclog.Errorf("repository initDB(): %v", err)
if jobErr != nil {
cclog.Errorf("repository initDB(): %v", jobErr)
errorOccured++
continue
}
// Job successfully inserted, increment counter
i += 1
for _, tag := range jobMeta.Tags {
tagstr := tag.Name + ":" + tag.Type
tagID, ok := tags[tagstr]
if !ok {
var err error
tagID, err = r.TransactionAdd(t,
addTagQuery,
tag.Name, tag.Type)
@@ -138,10 +142,6 @@ func InitDB() error {
setTagQuery,
id, tagID)
}
if err == nil {
i += 1
}
}
if errorOccured > 0 {
@@ -216,7 +216,7 @@ func enrichJobMetadata(job *schema.Job) error {
metricEnergy = math.Round(rawEnergy*100.0) / 100.0
}
} else {
cclog.Warnf("Error while collecting energy metric %s for job, DB ID '%v', return '0.0'", fp, job.ID)
cclog.Warnf("Error while collecting energy metric %s for job, DB ID '%v', return '0.0'", fp, *job.ID)
}
job.EnergyFootprint[fp] = metricEnergy
@@ -225,7 +225,7 @@ func enrichJobMetadata(job *schema.Job) error {
job.Energy = (math.Round(totalEnergy*100.0) / 100.0)
if job.RawEnergyFootprint, err = json.Marshal(job.EnergyFootprint); err != nil {
cclog.Warnf("Error while marshaling energy footprint for job INTO BYTES, DB ID '%v'", job.ID)
cclog.Warnf("Error while marshaling energy footprint for job INTO BYTES, DB ID '%v'", *job.ID)
return err
}

View File

@@ -2,12 +2,13 @@
// All rights reserved. This file is part of cc-backend.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
package importer
import (
"math"
ccunits "github.com/ClusterCockpit/cc-lib/ccUnits"
ccunits "github.com/ClusterCockpit/cc-lib/v2/ccUnits"
)
// getNormalizationFactor calculates the scaling factor needed to normalize a value

View File

@@ -8,7 +8,7 @@ import (
"fmt"
"testing"
ccunits "github.com/ClusterCockpit/cc-lib/ccUnits"
ccunits "github.com/ClusterCockpit/cc-lib/v2/ccUnits"
)
// TestNormalizeFactor tests the normalization of large byte values to gigabyte prefix.

View File

@@ -1,217 +0,0 @@
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved. This file is part of cc-backend.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
package memorystore
import (
"math"
"github.com/ClusterCockpit/cc-lib/schema"
"github.com/ClusterCockpit/cc-lib/util"
)
type APIMetricData struct {
Error *string `json:"error,omitempty"`
Data schema.FloatArray `json:"data,omitempty"`
From int64 `json:"from"`
To int64 `json:"to"`
Resolution int64 `json:"resolution"`
Avg schema.Float `json:"avg"`
Min schema.Float `json:"min"`
Max schema.Float `json:"max"`
}
type APIQueryRequest struct {
Cluster string `json:"cluster"`
Queries []APIQuery `json:"queries"`
ForAllNodes []string `json:"for-all-nodes"`
From int64 `json:"from"`
To int64 `json:"to"`
WithStats bool `json:"with-stats"`
WithData bool `json:"with-data"`
WithPadding bool `json:"with-padding"`
}
type APIQueryResponse struct {
Queries []APIQuery `json:"queries,omitempty"`
Results [][]APIMetricData `json:"results"`
}
type APIQuery struct {
Type *string `json:"type,omitempty"`
SubType *string `json:"subtype,omitempty"`
Metric string `json:"metric"`
Hostname string `json:"host"`
Resolution int64 `json:"resolution"`
TypeIds []string `json:"type-ids,omitempty"`
SubTypeIds []string `json:"subtype-ids,omitempty"`
ScaleFactor schema.Float `json:"scale-by,omitempty"`
Aggregate bool `json:"aggreg"`
}
// TODO: Optimize this, just like the stats endpoint!
func (data *APIMetricData) AddStats() {
n := 0
sum, min, max := 0.0, math.MaxFloat64, -math.MaxFloat64
for _, x := range data.Data {
if x.IsNaN() {
continue
}
n += 1
sum += float64(x)
min = math.Min(min, float64(x))
max = math.Max(max, float64(x))
}
if n > 0 {
avg := sum / float64(n)
data.Avg = schema.Float(avg)
data.Min = schema.Float(min)
data.Max = schema.Float(max)
} else {
data.Avg, data.Min, data.Max = schema.NaN, schema.NaN, schema.NaN
}
}
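// exampleAddStats is a minimal sketch of how AddStats behaves: NaN
// gaps are skipped, so only the two real samples contribute.
func exampleAddStats() {
d := APIMetricData{Data: schema.FloatArray{1.0, schema.NaN, 3.0}}
d.AddStats() // d.Avg == 2.0, d.Min == 1.0, d.Max == 3.0
}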
func (data *APIMetricData) ScaleBy(f schema.Float) {
if f == 0 || f == 1 {
return
}
data.Avg *= f
data.Min *= f
data.Max *= f
for i := 0; i < len(data.Data); i++ {
data.Data[i] *= f
}
}
func (data *APIMetricData) PadDataWithNull(ms *MemoryStore, from, to int64, metric string) {
minfo, ok := ms.Metrics[metric]
if !ok {
return
}
if (data.From / minfo.Frequency) > (from / minfo.Frequency) {
padfront := int((data.From / minfo.Frequency) - (from / minfo.Frequency))
ndata := make([]schema.Float, 0, padfront+len(data.Data))
for range padfront {
ndata = append(ndata, schema.NaN)
}
for j := 0; j < len(data.Data); j++ {
ndata = append(ndata, data.Data[j])
}
data.Data = ndata
}
}
func FetchData(req APIQueryRequest) (*APIQueryResponse, error) {
req.WithData = true
ms := GetMemoryStore()
response := APIQueryResponse{
Results: make([][]APIMetricData, 0, len(req.Queries)),
}
if req.ForAllNodes != nil {
nodes := ms.ListChildren([]string{req.Cluster})
for _, node := range nodes {
for _, metric := range req.ForAllNodes {
q := APIQuery{
Metric: metric,
Hostname: node,
}
req.Queries = append(req.Queries, q)
response.Queries = append(response.Queries, q)
}
}
}
for _, query := range req.Queries {
sels := make([]util.Selector, 0, 1)
if query.Aggregate || query.Type == nil {
sel := util.Selector{{String: req.Cluster}, {String: query.Hostname}}
if query.Type != nil {
if len(query.TypeIds) == 1 {
sel = append(sel, util.SelectorElement{String: *query.Type + query.TypeIds[0]})
} else {
ids := make([]string, len(query.TypeIds))
for i, id := range query.TypeIds {
ids[i] = *query.Type + id
}
sel = append(sel, util.SelectorElement{Group: ids})
}
if query.SubType != nil {
if len(query.SubTypeIds) == 1 {
sel = append(sel, util.SelectorElement{String: *query.SubType + query.SubTypeIds[0]})
} else {
ids := make([]string, len(query.SubTypeIds))
for i, id := range query.SubTypeIds {
ids[i] = *query.SubType + id
}
sel = append(sel, util.SelectorElement{Group: ids})
}
}
}
sels = append(sels, sel)
} else {
for _, typeID := range query.TypeIds {
if query.SubType != nil {
for _, subTypeID := range query.SubTypeIds {
sels = append(sels, util.Selector{
{String: req.Cluster},
{String: query.Hostname},
{String: *query.Type + typeID},
{String: *query.SubType + subTypeID},
})
}
} else {
sels = append(sels, util.Selector{
{String: req.Cluster},
{String: query.Hostname},
{String: *query.Type + typeID},
})
}
}
}
// log.Printf("query: %#v\n", query)
// log.Printf("sels: %#v\n", sels)
var err error
res := make([]APIMetricData, 0, len(sels))
for _, sel := range sels {
data := APIMetricData{}
data.Data, data.From, data.To, data.Resolution, err = ms.Read(sel, query.Metric, req.From, req.To, query.Resolution)
if err != nil {
msg := err.Error()
data.Error = &msg
res = append(res, data)
continue
}
if req.WithStats {
data.AddStats()
}
if query.ScaleFactor != 0 {
data.ScaleBy(query.ScaleFactor)
}
if req.WithPadding {
data.PadDataWithNull(ms, req.From, req.To, query.Metric)
}
if !req.WithData {
data.Data = nil
}
res = append(res, data)
}
response.Results = append(response.Results, res)
}
return &response, nil
}
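// exampleFetchData is a minimal sketch of a request this function
// serves; cluster, host and metric names are hypothetical.
func exampleFetchData() (*APIQueryResponse, error) {
req := APIQueryRequest{
Cluster:   "testcluster",
From:      1700000000,
To:        1700003600,
WithStats: true,
Queries: []APIQuery{
{Metric: "flops_any", Hostname: "node001", Resolution: 60},
},
}
return FetchData(req)
}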

View File

@@ -1,191 +0,0 @@
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved. This file is part of cc-backend.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
package memorystore
import (
"archive/zip"
"bufio"
"context"
"errors"
"fmt"
"io"
"os"
"path/filepath"
"sync"
"sync/atomic"
"time"
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
)
func Archiving(wg *sync.WaitGroup, ctx context.Context) {
go func() {
defer wg.Done()
d, err := time.ParseDuration(Keys.Archive.Interval)
if err != nil {
cclog.Fatalf("[METRICSTORE]> error parsing archive interval duration: %v\n", err)
}
if d <= 0 {
return
}
ticks := func() <-chan time.Time {
if d <= 0 {
return nil
}
return time.NewTicker(d).C
}()
for {
select {
case <-ctx.Done():
return
case <-ticks:
t := time.Now().Add(-d)
cclog.Infof("[METRICSTORE]> start archiving checkpoints (older than %s)...", t.Format(time.RFC3339))
n, err := ArchiveCheckpoints(Keys.Checkpoints.RootDir,
Keys.Archive.RootDir, t.Unix(), Keys.Archive.DeleteInstead)
if err != nil {
cclog.Errorf("[METRICSTORE]> archiving failed: %s", err.Error())
} else {
cclog.Infof("[METRICSTORE]> done: %d files zipped and moved to archive", n)
}
}
}
}()
}
var ErrNoNewArchiveData error = errors.New("all data already archived")
// ZIP all checkpoint files older than `from` together and write them to the `archiveDir`,
// deleting them from the `checkpointsDir`.
func ArchiveCheckpoints(checkpointsDir, archiveDir string, from int64, deleteInstead bool) (int, error) {
entries1, err := os.ReadDir(checkpointsDir)
if err != nil {
return 0, err
}
type workItem struct {
cdir, adir string
cluster, host string
}
var wg sync.WaitGroup
n, errs := int32(0), int32(0)
work := make(chan workItem, Keys.NumWorkers)
wg.Add(Keys.NumWorkers)
for worker := 0; worker < Keys.NumWorkers; worker++ {
go func() {
defer wg.Done()
for workItem := range work {
m, err := archiveCheckpoints(workItem.cdir, workItem.adir, from, deleteInstead)
if err != nil {
cclog.Errorf("error while archiving %s/%s: %s", workItem.cluster, workItem.host, err.Error())
atomic.AddInt32(&errs, 1)
}
atomic.AddInt32(&n, int32(m))
}
}()
}
for _, de1 := range entries1 {
entries2, e := os.ReadDir(filepath.Join(checkpointsDir, de1.Name()))
if e != nil {
err = e
}
for _, de2 := range entries2 {
cdir := filepath.Join(checkpointsDir, de1.Name(), de2.Name())
adir := filepath.Join(archiveDir, de1.Name(), de2.Name())
work <- workItem{
adir: adir, cdir: cdir,
cluster: de1.Name(), host: de2.Name(),
}
}
}
close(work)
wg.Wait()
if err != nil {
return int(n), err
}
if errs > 0 {
return int(n), fmt.Errorf("%d errors happened while archiving (%d successes)", errs, n)
}
return int(n), nil
}
// Helper function for `ArchiveCheckpoints`.
func archiveCheckpoints(dir string, archiveDir string, from int64, deleteInstead bool) (int, error) {
entries, err := os.ReadDir(dir)
if err != nil {
return 0, err
}
extension := Keys.Checkpoints.FileFormat
files, err := findFiles(entries, from, extension, false)
if err != nil {
return 0, err
}
if deleteInstead {
n := 0
for _, checkpoint := range files {
filename := filepath.Join(dir, checkpoint)
if err = os.Remove(filename); err != nil {
return n, err
}
n += 1
}
return n, nil
}
filename := filepath.Join(archiveDir, fmt.Sprintf("%d.zip", from))
f, err := os.OpenFile(filename, os.O_CREATE|os.O_WRONLY, CheckpointFilePerms)
if err != nil && os.IsNotExist(err) {
err = os.MkdirAll(archiveDir, CheckpointDirPerms)
if err == nil {
f, err = os.OpenFile(filename, os.O_CREATE|os.O_WRONLY, CheckpointFilePerms)
}
}
if err != nil {
return 0, err
}
defer f.Close()
bw := bufio.NewWriter(f)
defer bw.Flush()
zw := zip.NewWriter(bw)
defer zw.Close()
n := 0
for _, checkpoint := range files {
filename := filepath.Join(dir, checkpoint)
r, err := os.Open(filename)
if err != nil {
return n, err
}
w, err := zw.Create(checkpoint)
if err != nil {
r.Close()
return n, err
}
if _, err = io.Copy(w, r); err != nil {
r.Close()
return n, err
}
// Close inside the loop: a deferred close here would pile up open
// file handles until the function returns.
r.Close()
if err = os.Remove(filename); err != nil {
return n, err
}
n += 1
}
return n, nil
}

View File

@@ -1,482 +0,0 @@
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved. This file is part of cc-backend.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
package memorystore
import (
"bufio"
"encoding/json"
"errors"
"fmt"
"os"
"path"
"sort"
"strconv"
"strings"
"sync"
"sync/atomic"
"time"
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
"github.com/ClusterCockpit/cc-lib/schema"
"github.com/linkedin/goavro/v2"
)
var NumAvroWorkers int = 4
var startUp bool = true
var ErrNoNewData error = errors.New("no data in the pool")
func (as *AvroStore) ToCheckpoint(dir string, dumpAll bool) (int, error) {
levels := make([]*AvroLevel, 0)
selectors := make([][]string, 0)
as.root.lock.RLock()
// Cluster
for sel1, l1 := range as.root.children {
l1.lock.RLock()
// Node
for sel2, l2 := range l1.children {
l2.lock.RLock()
// Frequency
for sel3, l3 := range l2.children {
levels = append(levels, l3)
selectors = append(selectors, []string{sel1, sel2, sel3})
}
l2.lock.RUnlock()
}
l1.lock.RUnlock()
}
as.root.lock.RUnlock()
type workItem struct {
level *AvroLevel
dir string
selector []string
}
n, errs := int32(0), int32(0)
var wg sync.WaitGroup
wg.Add(NumAvroWorkers)
work := make(chan workItem, NumAvroWorkers*2)
for range NumAvroWorkers {
go func() {
defer wg.Done()
for workItem := range work {
from := getTimestamp(workItem.dir)
if err := workItem.level.toCheckpoint(workItem.dir, from, dumpAll); err != nil {
if err == ErrNoNewArchiveData {
continue
}
cclog.Errorf("error while checkpointing %#v: %s", workItem.selector, err.Error())
atomic.AddInt32(&errs, 1)
} else {
atomic.AddInt32(&n, 1)
}
}
}()
}
for i := range len(levels) {
dir := path.Join(dir, path.Join(selectors[i]...))
work <- workItem{
level: levels[i],
dir: dir,
selector: selectors[i],
}
}
close(work)
wg.Wait()
if errs > 0 {
return int(n), fmt.Errorf("%d errors happend while creating avro checkpoints (%d successes)", errs, n)
}
startUp = false
return int(n), nil
}
// getTimestamp returns the timestamp from the directory name
func getTimestamp(dir string) int64 {
// Extract the resolution and timestamp from the directory name
// The existing avro file will be in epoch timestamp format
// iterate over all the files in the directory and find the maximum timestamp
// and return it
resolution := path.Base(dir)
dir = path.Dir(dir)
files, err := os.ReadDir(dir)
if err != nil {
return 0
}
var maxTS int64 = 0
if len(files) == 0 {
return 0
}
for _, file := range files {
if file.IsDir() {
continue
}
name := file.Name()
if len(name) < 5 || !strings.HasSuffix(name, ".avro") || !strings.HasPrefix(name, resolution+"_") {
continue
}
ts, err := strconv.ParseInt(name[strings.Index(name, "_")+1:len(name)-5], 10, 64)
if err != nil {
fmt.Printf("error while parsing timestamp: %s\n", err.Error())
continue
}
if ts > maxTS {
maxTS = ts
}
}
interval, _ := time.ParseDuration(Keys.Checkpoints.Interval)
updateTime := time.Unix(maxTS, 0).Add(interval).Add(time.Duration(CheckpointBufferMinutes-1) * time.Minute).Unix()
if startUp {
return 0
}
if updateTime < time.Now().Unix() {
return 0
}
return maxTS
}
func (l *AvroLevel) toCheckpoint(dir string, from int64, dumpAll bool) error {
l.lock.Lock()
defer l.lock.Unlock()
// fmt.Printf("Checkpointing directory: %s\n", dir)
// filepath contains the resolution
intRes, _ := strconv.Atoi(path.Base(dir))
// find smallest overall timestamp in l.data map and delete it from l.data
minTS := int64(1<<63 - 1)
for ts, dat := range l.data {
if ts < minTS && len(dat) != 0 {
minTS = ts
}
}
if from == 0 && minTS != int64(1<<63-1) {
from = minTS
}
if from == 0 {
return ErrNoNewArchiveData
}
var schema string
var codec *goavro.Codec
recordList := make([]map[string]any, 0)
var f *os.File
filePath := dir + fmt.Sprintf("_%d.avro", from)
var err error
fp_, err_ := os.Stat(filePath)
if errors.Is(err_, os.ErrNotExist) {
err = os.MkdirAll(path.Dir(dir), 0o755)
if err != nil {
return fmt.Errorf("failed to create directory: %v", err)
}
} else if fp_.Size() != 0 {
f, err = os.Open(filePath)
if err != nil {
return fmt.Errorf("failed to open existing avro file: %v", err)
}
br := bufio.NewReader(f)
reader, err := goavro.NewOCFReader(br)
if err != nil {
return fmt.Errorf("failed to create OCF reader: %v", err)
}
codec = reader.Codec()
schema = codec.Schema()
f.Close()
}
timeRef := time.Now().Add(time.Duration(-CheckpointBufferMinutes+1) * time.Minute).Unix()
if dumpAll {
timeRef = time.Now().Unix()
}
// Empty values
if len(l.data) == 0 {
// we checkpoint avro files every 60 seconds
repeat := 60 / intRes
for range repeat {
recordList = append(recordList, make(map[string]any))
}
}
readFlag := true
for ts := range l.data {
flag := false
if ts < timeRef {
data := l.data[ts]
schemaGen, err := generateSchema(data)
if err != nil {
return err
}
flag, schema, err = compareSchema(schema, schemaGen)
if err != nil {
return fmt.Errorf("failed to compare read and generated schema: %v", err)
}
if flag && readFlag && !errors.Is(err_, os.ErrNotExist) {
f.Close()
f, err = os.Open(filePath)
if err != nil {
return fmt.Errorf("failed to open Avro file: %v", err)
}
br := bufio.NewReader(f)
ocfReader, err := goavro.NewOCFReader(br)
if err != nil {
return fmt.Errorf("failed to create OCF reader while changing schema: %v", err)
}
for ocfReader.Scan() {
record, err := ocfReader.Read()
if err != nil {
return fmt.Errorf("failed to read record: %v", err)
}
recordList = append(recordList, record.(map[string]any))
}
f.Close()
err = os.Remove(filePath)
if err != nil {
return fmt.Errorf("failed to delete file: %v", err)
}
readFlag = false
}
codec, err = goavro.NewCodec(schema)
if err != nil {
return fmt.Errorf("failed to create codec after merged schema: %v", err)
}
recordList = append(recordList, generateRecord(data))
delete(l.data, ts)
}
}
if len(recordList) == 0 {
return ErrNoNewArchiveData
}
f, err = os.OpenFile(filePath, os.O_CREATE|os.O_APPEND|os.O_RDWR, 0o644)
if err != nil {
return fmt.Errorf("failed to append new avro file: %v", err)
}
// fmt.Printf("Codec : %#v\n", codec)
writer, err := goavro.NewOCFWriter(goavro.OCFConfig{
W: f,
Codec: codec,
CompressionName: goavro.CompressionDeflateLabel,
})
if err != nil {
return fmt.Errorf("failed to create OCF writer: %v", err)
}
// Append the new record
if err := writer.Append(recordList); err != nil {
return fmt.Errorf("failed to append record: %v", err)
}
f.Close()
return nil
}
func compareSchema(schemaRead, schemaGen string) (bool, string, error) {
var genSchema, readSchema AvroSchema
if schemaRead == "" {
return false, schemaGen, nil
}
// Unmarshal the schema strings into AvroSchema structs
if err := json.Unmarshal([]byte(schemaGen), &genSchema); err != nil {
return false, "", fmt.Errorf("failed to parse generated schema: %v", err)
}
if err := json.Unmarshal([]byte(schemaRead), &readSchema); err != nil {
return false, "", fmt.Errorf("failed to parse read schema: %v", err)
}
sort.Slice(genSchema.Fields, func(i, j int) bool {
return genSchema.Fields[i].Name < genSchema.Fields[j].Name
})
sort.Slice(readSchema.Fields, func(i, j int) bool {
return readSchema.Fields[i].Name < readSchema.Fields[j].Name
})
// Check if schemas are identical
schemasEqual := true
if len(genSchema.Fields) <= len(readSchema.Fields) {
for i := range genSchema.Fields {
if genSchema.Fields[i].Name != readSchema.Fields[i].Name {
schemasEqual = false
break
}
}
// If schemas are identical, return the read schema
if schemasEqual {
return false, schemaRead, nil
}
}
// Create a map to hold unique fields from both schemas
fieldMap := make(map[string]AvroField)
// Add fields from the read schema
for _, field := range readSchema.Fields {
fieldMap[field.Name] = field
}
// Add or update fields from the generated schema
for _, field := range genSchema.Fields {
fieldMap[field.Name] = field
}
// Create a union schema by collecting fields from the map
var mergedFields []AvroField
for _, field := range fieldMap {
mergedFields = append(mergedFields, field)
}
// Sort fields by name for consistency
sort.Slice(mergedFields, func(i, j int) bool {
return mergedFields[i].Name < mergedFields[j].Name
})
// Create the merged schema
mergedSchema := AvroSchema{
Type: "record",
Name: genSchema.Name,
Fields: mergedFields,
}
// Check if schemas are identical
schemasEqual = len(mergedSchema.Fields) == len(readSchema.Fields)
if schemasEqual {
for i := range mergedSchema.Fields {
if mergedSchema.Fields[i].Name != readSchema.Fields[i].Name {
schemasEqual = false
break
}
}
if schemasEqual {
return false, schemaRead, nil
}
}
// Marshal the merged schema back to JSON
mergedSchemaJSON, err := json.Marshal(mergedSchema)
if err != nil {
return false, "", fmt.Errorf("failed to marshal merged schema: %v", err)
}
return true, string(mergedSchemaJSON), nil
}
func generateSchema(data map[string]schema.Float) (string, error) {
// Define the Avro schema structure
schema := map[string]any{
"type": "record",
"name": "DataRecord",
"fields": []map[string]any{},
}
fieldTracker := make(map[string]struct{})
for key := range data {
if _, exists := fieldTracker[key]; !exists {
key = correctKey(key)
field := map[string]any{
"name": key,
"type": "double",
"default": -1.0,
}
schema["fields"] = append(schema["fields"].([]map[string]any), field)
fieldTracker[key] = struct{}{}
}
}
schemaString, err := json.Marshal(schema)
if err != nil {
return "", fmt.Errorf("failed to marshal schema: %v", err)
}
return string(schemaString), nil
}
func generateRecord(data map[string]schema.Float) map[string]any {
record := make(map[string]any)
// Iterate through each map in data
for key, value := range data {
key = correctKey(key)
// Set the value in the record
// avro only accepts basic types
record[key] = value.Double()
}
return record
}
func correctKey(key string) string {
// Replace any invalid characters in the key
// For example, replace spaces with underscores
key = strings.ReplaceAll(key, ":", "___")
key = strings.ReplaceAll(key, ".", "__")
return key
}
func ReplaceKey(key string) string {
// Replace any invalid characters in the key
// For example, replace spaces with underscores
key = strings.ReplaceAll(key, "___", ":")
key = strings.ReplaceAll(key, "__", ".")
return key
}
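// exampleKeyRoundTrip is a minimal sketch of the sanitization round
// trip above; the metric name is hypothetical.
func exampleKeyRoundTrip() {
sanitized := correctKey("cpu_load:core.0") // "cpu_load___core__0"
restored := ReplaceKey(sanitized)          // "cpu_load:core.0"
_ = restored
}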

View File

@@ -1,84 +0,0 @@
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved. This file is part of cc-backend.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
package memorystore
import (
"context"
"slices"
"strconv"
"sync"
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
)
func DataStaging(wg *sync.WaitGroup, ctx context.Context) {
// AvroPool is a pool of Avro writers.
go func() {
if Keys.Checkpoints.FileFormat == "json" {
wg.Done() // Mark this goroutine as done
return // Exit the goroutine
}
defer wg.Done()
var avroLevel *AvroLevel
oldSelector := make([]string, 0)
for {
select {
case <-ctx.Done():
return
case val := <-LineProtocolMessages:
// Fetch the frequency of the metric from the global configuration
freq, err := GetMetricFrequency(val.MetricName)
if err != nil {
cclog.Errorf("Error fetching metric frequency: %s\n", err)
continue
}
metricName := ""
for _, selectorName := range val.Selector {
metricName += selectorName + Delimiter
}
metricName += val.MetricName
// Create a new selector for the Avro level
// The selector is a slice of strings that represents the path to the
// Avro level. It is created by appending the cluster, node, and metric
// name to the selector.
var selector []string
selector = append(selector, val.Cluster, val.Node, strconv.FormatInt(freq, 10))
if !testEq(oldSelector, selector) {
// Get the Avro level for the metric
avroLevel = avroStore.root.findAvroLevelOrCreate(selector)
// If the Avro level is nil, create a new one
if avroLevel == nil {
cclog.Errorf("Error creating or finding the level with cluster : %s, node : %s, metric : %s\n", val.Cluster, val.Node, val.MetricName)
}
oldSelector = slices.Clone(selector)
}
avroLevel.addMetric(metricName, val.Value, val.Timestamp, int(freq))
}
}
}()
}
func testEq(a, b []string) bool {
if len(a) != len(b) {
return false
}
for i := range a {
if a[i] != b[i] {
return false
}
}
return true
}
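// Note: since "slices" is already imported above, testEq(a, b) is
// equivalent to slices.Equal(a, b).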

View File

@@ -1,168 +0,0 @@
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved. This file is part of cc-backend.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
package memorystore
import (
"sync"
"github.com/ClusterCockpit/cc-lib/schema"
)
var (
LineProtocolMessages = make(chan *AvroStruct)
Delimiter = "ZZZZZ"
)
// CheckpointBufferMinutes should always be in minutes.
// It controls the amount of data to hold for a given amount of time.
var CheckpointBufferMinutes = 3
type AvroStruct struct {
MetricName string
Cluster string
Node string
Selector []string
Value schema.Float
Timestamp int64
}
type AvroStore struct {
root AvroLevel
}
var avroStore AvroStore
type AvroLevel struct {
children map[string]*AvroLevel
data map[int64]map[string]schema.Float
lock sync.RWMutex
}
type AvroField struct {
Name string `json:"name"`
Type any `json:"type"`
Default any `json:"default,omitempty"`
}
type AvroSchema struct {
Type string `json:"type"`
Name string `json:"name"`
Fields []AvroField `json:"fields"`
}
func (l *AvroLevel) findAvroLevelOrCreate(selector []string) *AvroLevel {
if len(selector) == 0 {
return l
}
// Allow concurrent reads:
l.lock.RLock()
var child *AvroLevel
var ok bool
if l.children == nil {
// Children map needs to be created...
l.lock.RUnlock()
} else {
child, ok = l.children[selector[0]]
l.lock.RUnlock()
if ok {
return child.findAvroLevelOrCreate(selector[1:])
}
}
// The level does not exist, take write lock for unique access:
l.lock.Lock()
// While this thread waited for the write lock, another thread
// could have created the child node.
if l.children != nil {
child, ok = l.children[selector[0]]
if ok {
l.lock.Unlock()
return child.findAvroLevelOrCreate(selector[1:])
}
}
child = &AvroLevel{
data: make(map[int64]map[string]schema.Float, 0),
children: nil,
}
if l.children != nil {
l.children[selector[0]] = child
} else {
l.children = map[string]*AvroLevel{selector[0]: child}
}
l.lock.Unlock()
return child.findAvroLevelOrCreate(selector[1:])
}
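// The RLock fast path plus write-lock re-check above is the classic
// double-checked locking pattern: most lookups never take the write
// lock, and the re-check is required because another goroutine may
// have inserted the child between RUnlock and Lock.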
func (l *AvroLevel) addMetric(metricName string, value schema.Float, timestamp int64, Freq int) {
l.lock.Lock()
defer l.lock.Unlock()
KeyCounter := int(CheckpointBufferMinutes * 60 / Freq)
// Create keys in advance for the given amount of time
if len(l.data) != KeyCounter {
if len(l.data) == 0 {
for i := range KeyCounter {
l.data[timestamp+int64(i*Freq)] = make(map[string]schema.Float, 0)
}
} else {
// Get the last timestamp
var lastTS int64
for ts := range l.data {
if ts > lastTS {
lastTS = ts
}
}
// Extend the map by one key at the next timestamp
l.data[lastTS+int64(Freq)] = make(map[string]schema.Float, 0)
}
}
closestTS := int64(0)
minDiff := int64(Freq) + 1 // Start with diff just outside the valid range
found := false
// Iterate over timestamps and choose the one which is within range.
// Since it's epoch time, we check if the difference is within one frequency interval.
for ts, dat := range l.data {
// Check if timestamp is within range
diff := timestamp - ts
if diff < -int64(Freq) || diff > int64(Freq) {
continue
}
// Metric already present at this timestamp — skip
if _, ok := dat[metricName]; ok {
continue
}
// Check if this is the closest timestamp so far
if Abs(diff) < minDiff {
minDiff = Abs(diff)
closestTS = ts
found = true
}
}
if found {
l.data[closestTS][metricName] = value
}
}
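// Worked example for the bucket selection above (values hypothetical):
// with Freq = 60 and pre-created keys t0, t0+60 and t0+120, a value
// stamped t0+70 has diffs 70, 10 and -50; t0 falls outside the
// [-60, 60] window, and |10| < |-50|, so the value lands in the t0+60
// bucket.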
func GetAvroStore() *AvroStore {
return &avroStore
}
// Abs returns the absolute value of x.
func Abs(x int64) int64 {
if x < 0 {
return -x
}
return x
}

View File

@@ -1,198 +0,0 @@
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved. This file is part of cc-backend.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
package memorystore
import (
"errors"
"sync"
"github.com/ClusterCockpit/cc-lib/schema"
)
// Default buffer capacity.
// `buffer.data` will only ever grow up to its capacity and a new link
// in the buffer chain will be created if needed so that no copying
// of data or reallocation needs to happen on writes.
const (
BufferCap int = 512
)
// So that we can reuse allocations
var bufferPool sync.Pool = sync.Pool{
New: func() any {
return &buffer{
data: make([]schema.Float, 0, BufferCap),
}
},
}
var (
ErrNoData error = errors.New("[METRICSTORE]> no data for this metric/level")
ErrDataDoesNotAlign error = errors.New("[METRICSTORE]> data from lower granularities does not align")
)
// Each metric on each level has its own buffer.
// This is where the actual values go.
// If `cap(data)` is reached, a new buffer is created and
// becomes the new head of a buffer list.
type buffer struct {
prev *buffer
next *buffer
data []schema.Float
frequency int64
start int64
archived bool
closed bool
}
func newBuffer(ts, freq int64) *buffer {
b := bufferPool.Get().(*buffer)
b.frequency = freq
b.start = ts - (freq / 2)
b.prev = nil
b.next = nil
b.archived = false
b.closed = false
b.data = b.data[:0]
return b
}
// If a new buffer was created, the new head is returned.
// Otherwise, the existing buffer is returned.
// Normally, only "newer" data should be written, but if the value would
// end up in the same buffer anyways it is allowed.
func (b *buffer) write(ts int64, value schema.Float) (*buffer, error) {
if ts < b.start {
return nil, errors.New("[METRICSTORE]> cannot write value to buffer from past")
}
// idx := int((ts - b.start + (b.frequency / 3)) / b.frequency)
idx := int((ts - b.start) / b.frequency)
if idx >= cap(b.data) {
newbuf := newBuffer(ts, b.frequency)
newbuf.prev = b
b.next = newbuf
b.close()
b = newbuf
idx = 0
}
// Overwriting value or writing value from past
if idx < len(b.data) {
b.data[idx] = value
return b, nil
}
// Fill up unwritten slots with NaN
for i := len(b.data); i < idx; i++ {
b.data = append(b.data, schema.NaN)
}
b.data = append(b.data, value)
return b, nil
}
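// Worked example for the index math above: a buffer created for a
// first write at ts = 100 with frequency 10 gets start = 95, so a
// later write at ts = 130 lands at idx = (130-95)/10 = 3 and the
// unwritten slots 0..2 are padded with NaN first.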
func (b *buffer) end() int64 {
return b.firstWrite() + int64(len(b.data))*b.frequency
}
func (b *buffer) firstWrite() int64 {
return b.start + (b.frequency / 2)
}
func (b *buffer) close() {}
// Return all known values from `from` to `to`. Gaps of information are represented as NaN.
// Simple linear interpolation is done between the two neighboring cells if possible.
// If values at the start or end are missing, instead of NaN values, the second and third
// return values contain the actual `from`/`to`.
// This function walks back the buffer chain if `from` is older than the current buffer's start.
// The loaded values are added to `data` and `data` is returned, possibly with a shorter length.
// If `data` is not long enough to hold all values, this function will panic!
func (b *buffer) read(from, to int64, data []schema.Float) ([]schema.Float, int64, int64, error) {
if from < b.firstWrite() {
if b.prev != nil {
return b.prev.read(from, to, data)
}
from = b.firstWrite()
}
i := 0
t := from
for ; t < to; t += b.frequency {
idx := int((t - b.start) / b.frequency)
if idx >= cap(b.data) {
if b.next == nil {
break
}
b = b.next
idx = 0
}
if idx >= len(b.data) {
if b.next == nil || to <= b.next.start {
break
}
data[i] += schema.NaN
} else if t < b.start {
data[i] += schema.NaN
// } else if b.data[idx].IsNaN() {
// data[i] += interpolate(idx, b.data)
} else {
data[i] += b.data[idx]
}
i++
}
return data[:i], from, t, nil
}
// Returns true if this buffer needs to be freed.
func (b *buffer) free(t int64) (delme bool, n int) {
if b.prev != nil {
delme, m := b.prev.free(t)
n += m
if delme {
b.prev.next = nil
if cap(b.prev.data) == BufferCap {
bufferPool.Put(b.prev)
}
b.prev = nil
}
}
end := b.end()
if end < t {
return true, n + 1
}
return false, n
}
// Call `callback` on every buffer that contains data in the range from `from` to `to`.
func (b *buffer) iterFromTo(from, to int64, callback func(b *buffer) error) error {
if b == nil {
return nil
}
if err := b.prev.iterFromTo(from, to, callback); err != nil {
return err
}
if from <= b.end() && b.start <= to {
return callback(b)
}
return nil
}
func (b *buffer) count() int64 {
res := int64(len(b.data))
if b.prev != nil {
res += b.prev.count()
}
return res
}

View File

@@ -1,783 +0,0 @@
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved. This file is part of cc-backend.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
package memorystore
import (
"bufio"
"context"
"encoding/json"
"errors"
"fmt"
"io/fs"
"os"
"path"
"path/filepath"
"runtime"
"sort"
"strconv"
"strings"
"sync"
"sync/atomic"
"time"
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
"github.com/ClusterCockpit/cc-lib/schema"
"github.com/linkedin/goavro/v2"
)
// File operation constants
const (
// CheckpointFilePerms defines default permissions for checkpoint files
CheckpointFilePerms = 0o644
// CheckpointDirPerms defines default permissions for checkpoint directories
CheckpointDirPerms = 0o755
// GCTriggerInterval determines how often GC is forced during checkpoint loading
// GC is triggered every GCTriggerInterval*NumWorkers loaded hosts
GCTriggerInterval = 100
)
// Whenever changed, update MarshalJSON as well!
type CheckpointMetrics struct {
Data []schema.Float `json:"data"`
Frequency int64 `json:"frequency"`
Start int64 `json:"start"`
}
type CheckpointFile struct {
Metrics map[string]*CheckpointMetrics `json:"metrics"`
Children map[string]*CheckpointFile `json:"children"`
From int64 `json:"from"`
To int64 `json:"to"`
}
var lastCheckpoint time.Time
func Checkpointing(wg *sync.WaitGroup, ctx context.Context) {
lastCheckpoint = time.Now()
if Keys.Checkpoints.FileFormat == "json" {
ms := GetMemoryStore()
go func() {
defer wg.Done()
d, err := time.ParseDuration(Keys.Checkpoints.Interval)
if err != nil {
cclog.Fatal(err)
}
if d <= 0 {
return
}
ticks := func() <-chan time.Time {
if d <= 0 {
return nil
}
return time.NewTicker(d).C
}()
for {
select {
case <-ctx.Done():
return
case <-ticks:
cclog.Infof("[METRICSTORE]> start checkpointing (starting at %s)...", lastCheckpoint.Format(time.RFC3339))
now := time.Now()
n, err := ms.ToCheckpoint(Keys.Checkpoints.RootDir,
lastCheckpoint.Unix(), now.Unix())
if err != nil {
cclog.Errorf("[METRICSTORE]> checkpointing failed: %s", err.Error())
} else {
cclog.Infof("[METRICSTORE]> done: %d checkpoint files created", n)
lastCheckpoint = now
}
}
}
}()
} else {
go func() {
defer wg.Done()
d, _ := time.ParseDuration("1m")
select {
case <-ctx.Done():
return
case <-time.After(time.Duration(CheckpointBufferMinutes) * time.Minute):
// This is the first tick: wait until we have collected data for the given number of minutes.
GetAvroStore().ToCheckpoint(Keys.Checkpoints.RootDir, false)
// log.Printf("Checkpointing %d avro files", count)
}
ticks := func() <-chan time.Time {
if d <= 0 {
return nil
}
return time.NewTicker(d).C
}()
for {
select {
case <-ctx.Done():
return
case <-ticks:
// Regular ticks of 1 minute to write data.
GetAvroStore().ToCheckpoint(Keys.Checkpoints.RootDir, false)
// log.Printf("Checkpointing %d avro files", count)
}
}
}()
}
}
// As `Float` implements a custom MarshalJSON() function,
// serializing an array of such types has more overhead
// than one would assume (because of extra allocations, interfaces and so on).
func (cm *CheckpointMetrics) MarshalJSON() ([]byte, error) {
buf := make([]byte, 0, 128+len(cm.Data)*8)
buf = append(buf, `{"frequency":`...)
buf = strconv.AppendInt(buf, cm.Frequency, 10)
buf = append(buf, `,"start":`...)
buf = strconv.AppendInt(buf, cm.Start, 10)
buf = append(buf, `,"data":[`...)
for i, x := range cm.Data {
if i != 0 {
buf = append(buf, ',')
}
if x.IsNaN() {
buf = append(buf, `null`...)
} else {
buf = strconv.AppendFloat(buf, float64(x), 'f', 1, 32)
}
}
buf = append(buf, `]}`...)
return buf, nil
}
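// Example encoding: frequency 60, start 1700000000 and the series
// [0.1, NaN, 0.3] serialize as
// {"frequency":60,"start":1700000000,"data":[0.1,null,0.3]}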
// Metrics stored at the lowest 2 levels are not stored away (root and cluster)!
// On a per-host basis a new JSON file is created. I have no idea if this will scale.
// The good thing: Only one host at a time is locked, so this function can run
// in parallel to writes/reads.
func (m *MemoryStore) ToCheckpoint(dir string, from, to int64) (int, error) {
levels := make([]*Level, 0)
selectors := make([][]string, 0)
m.root.lock.RLock()
for sel1, l1 := range m.root.children {
l1.lock.RLock()
for sel2, l2 := range l1.children {
levels = append(levels, l2)
selectors = append(selectors, []string{sel1, sel2})
}
l1.lock.RUnlock()
}
m.root.lock.RUnlock()
type workItem struct {
level *Level
dir string
selector []string
}
n, errs := int32(0), int32(0)
var wg sync.WaitGroup
wg.Add(Keys.NumWorkers)
work := make(chan workItem, Keys.NumWorkers*2)
for worker := 0; worker < Keys.NumWorkers; worker++ {
go func() {
defer wg.Done()
for workItem := range work {
if err := workItem.level.toCheckpoint(workItem.dir, from, to, m); err != nil {
if err == ErrNoNewArchiveData {
continue
}
cclog.Errorf("[METRICSTORE]> error while checkpointing %#v: %s", workItem.selector, err.Error())
atomic.AddInt32(&errs, 1)
} else {
atomic.AddInt32(&n, 1)
}
}
}()
}
for i := 0; i < len(levels); i++ {
dir := path.Join(dir, path.Join(selectors[i]...))
work <- workItem{
level: levels[i],
dir: dir,
selector: selectors[i],
}
}
close(work)
wg.Wait()
if errs > 0 {
return int(n), fmt.Errorf("[METRICSTORE]> %d errors happened while creating checkpoints (%d successes)", errs, n)
}
return int(n), nil
}
func (l *Level) toCheckpointFile(from, to int64, m *MemoryStore) (*CheckpointFile, error) {
l.lock.RLock()
defer l.lock.RUnlock()
retval := &CheckpointFile{
From: from,
To: to,
Metrics: make(map[string]*CheckpointMetrics),
Children: make(map[string]*CheckpointFile),
}
for metric, minfo := range m.Metrics {
b := l.metrics[minfo.offset]
if b == nil {
continue
}
allArchived := true
b.iterFromTo(from, to, func(b *buffer) error {
if !b.archived {
allArchived = false
}
return nil
})
if allArchived {
continue
}
data := make([]schema.Float, (to-from)/b.frequency+1)
data, start, end, err := b.read(from, to, data)
if err != nil {
return nil, err
}
for i := int((end - start) / b.frequency); i < len(data); i++ {
data[i] = schema.NaN
}
retval.Metrics[metric] = &CheckpointMetrics{
Frequency: b.frequency,
Start: start,
Data: data,
}
}
for name, child := range l.children {
val, err := child.toCheckpointFile(from, to, m)
if err != nil {
return nil, err
}
if val != nil {
retval.Children[name] = val
}
}
if len(retval.Children) == 0 && len(retval.Metrics) == 0 {
return nil, nil
}
return retval, nil
}
func (l *Level) toCheckpoint(dir string, from, to int64, m *MemoryStore) error {
cf, err := l.toCheckpointFile(from, to, m)
if err != nil {
return err
}
if cf == nil {
return ErrNoNewArchiveData
}
filepath := path.Join(dir, fmt.Sprintf("%d.json", from))
f, err := os.OpenFile(filepath, os.O_CREATE|os.O_WRONLY, CheckpointFilePerms)
if err != nil && os.IsNotExist(err) {
err = os.MkdirAll(dir, CheckpointDirPerms)
if err == nil {
f, err = os.OpenFile(filepath, os.O_CREATE|os.O_WRONLY, CheckpointFilePerms)
}
}
if err != nil {
return err
}
defer f.Close()
bw := bufio.NewWriter(f)
if err = json.NewEncoder(bw).Encode(cf); err != nil {
return err
}
return bw.Flush()
}
func (m *MemoryStore) FromCheckpoint(dir string, from int64, extension string) (int, error) {
var wg sync.WaitGroup
work := make(chan [2]string, Keys.NumWorkers)
n, errs := int32(0), int32(0)
wg.Add(Keys.NumWorkers)
for worker := 0; worker < Keys.NumWorkers; worker++ {
go func() {
defer wg.Done()
for host := range work {
lvl := m.root.findLevelOrCreate(host[:], len(m.Metrics))
nn, err := lvl.fromCheckpoint(m, filepath.Join(dir, host[0], host[1]), from, extension)
if err != nil {
cclog.Fatalf("[METRICSTORE]> error while loading checkpoints: %s", err.Error())
atomic.AddInt32(&errs, 1)
}
atomic.AddInt32(&n, int32(nn))
}
}()
}
i := 0
clustersDir, err := os.ReadDir(dir)
for _, clusterDir := range clustersDir {
if !clusterDir.IsDir() {
err = errors.New("[METRICSTORE]> expected only directories at first level of checkpoints/ directory")
goto done
}
hostsDir, e := os.ReadDir(filepath.Join(dir, clusterDir.Name()))
if e != nil {
err = e
goto done
}
for _, hostDir := range hostsDir {
if !hostDir.IsDir() {
err = errors.New("[METRICSTORE]> expected only directories at second level of checkpoints/ directory")
goto done
}
i++
if i%Keys.NumWorkers == 0 && i > GCTriggerInterval {
// Forcing garbage collection runs here regularly during the loading of checkpoints
// will decrease the total heap size after loading everything back to memory is done.
// While loading data, the heap will grow fast, so the GC target size will double
// almost always. By forcing GCs here, we can keep it growing more slowly so that
// at the end, less memory is wasted.
runtime.GC()
}
work <- [2]string{clusterDir.Name(), hostDir.Name()}
}
}
done:
close(work)
wg.Wait()
if err != nil {
return int(n), err
}
if errs > 0 {
return int(n), fmt.Errorf("[METRICSTORE]> %d errors happened while creating checkpoints (%d successes)", errs, n)
}
return int(n), nil
}
// Metrics stored at the lowest 2 levels are not loaded (root and cluster)!
// This function can only be called once and before the very first write or read.
// Different host's data is loaded to memory in parallel.
func (m *MemoryStore) FromCheckpointFiles(dir string, from int64) (int, error) {
if _, err := os.Stat(dir); os.IsNotExist(err) {
// The directory does not exist, create it (including parents)
err := os.MkdirAll(dir, CheckpointDirPerms)
if err != nil {
cclog.Fatalf("[METRICSTORE]> Error creating directory: %v\n", err)
}
cclog.Debugf("[METRICSTORE]> Directory %s created successfully", dir)
}
// Determine the configured checkpoint file format, defaulting to "avro"
fileFormat := Keys.Checkpoints.FileFormat
if fileFormat == "" {
fileFormat = "avro"
}
// Map to easily get the fallback format
oppositeFormat := map[string]string{
"json": "avro",
"avro": "json",
}
// First, attempt to load the specified format
if found, err := checkFilesWithExtension(dir, fileFormat); err != nil {
return 0, fmt.Errorf("[METRICSTORE]> error checking files with extension: %v", err)
} else if found {
cclog.Infof("[METRICSTORE]> Loading %s files because fileformat is %s", fileFormat, fileFormat)
return m.FromCheckpoint(dir, from, fileFormat)
}
// If not found, attempt the opposite format
altFormat := oppositeFormat[fileFormat]
if found, err := checkFilesWithExtension(dir, altFormat); err != nil {
return 0, fmt.Errorf("[METRICSTORE]> error checking files with extension: %v", err)
} else if found {
cclog.Infof("[METRICSTORE]> Loading %s files but fileformat is %s", altFormat, fileFormat)
return m.FromCheckpoint(dir, from, altFormat)
}
cclog.Print("[METRICSTORE]> No valid checkpoint files found in the directory")
return 0, nil
}
func checkFilesWithExtension(dir string, extension string) (bool, error) {
found := false
err := filepath.Walk(dir, func(path string, info os.FileInfo, err error) error {
if err != nil {
return fmt.Errorf("[METRICSTORE]> error accessing path %s: %v", path, err)
}
if !info.IsDir() && filepath.Ext(info.Name()) == "."+extension {
found = true
return nil
}
return nil
})
if err != nil {
return false, fmt.Errorf("[METRICSTORE]> error walking through directories: %s", err)
}
return found, nil
}
func (l *Level) loadAvroFile(m *MemoryStore, f *os.File, from int64) error {
br := bufio.NewReader(f)
fileName := f.Name()[strings.LastIndex(f.Name(), "/")+1:]
resolution, err := strconv.ParseInt(fileName[0:strings.Index(fileName, "_")], 10, 64)
if err != nil {
return fmt.Errorf("[METRICSTORE]> error while reading avro file (resolution parsing) : %s", err)
}
fromTimestamp, err := strconv.ParseInt(fileName[strings.Index(fileName, "_")+1:len(fileName)-5], 10, 64)
if err != nil {
return fmt.Errorf("[METRICSTORE]> error converting timestamp from the avro file : %s", err)
}
// Shift by half a resolution step, matching the offset used on the line-protocol write path
fromTimestamp -= (resolution / 2)
// fmt.Printf("File : %s with resolution : %d\n", fileName, resolution)
var recordCounter int64 = 0
// Create a new OCF reader from the buffered reader
ocfReader, err := goavro.NewOCFReader(br)
if err != nil {
return fmt.Errorf("[METRICSTORE]> error creating OCF reader: %w", err)
}
metricsData := make(map[string]schema.FloatArray)
for ocfReader.Scan() {
datum, err := ocfReader.Read()
if err != nil {
return fmt.Errorf("[METRICSTORE]> error while reading avro file : %s", err)
}
record, ok := datum.(map[string]any)
if !ok {
return fmt.Errorf("[METRICSTORE]> failed to assert datum as map[string]interface{}")
}
for key, value := range record {
fv, ok := value.(float64)
if !ok {
return fmt.Errorf("[METRICSTORE]> unexpected value type %T for key %s in avro record", value, key)
}
metricsData[key] = append(metricsData[key], schema.ConvertToFloat(fv))
}
recordCounter += 1
}
// Assumes resolution divides 60; 60/resolution is the number of records per minute.
to := (fromTimestamp + (recordCounter / (60 / resolution) * 60))
if to < from {
return nil
}
for key, floatArray := range metricsData {
metricName := ReplaceKey(key)
if strings.Contains(metricName, Delimiter) {
subString := strings.Split(metricName, Delimiter)
lvl := l
for i := 0; i < len(subString)-1; i++ {
sel := subString[i]
if lvl.children == nil {
lvl.children = make(map[string]*Level)
}
child, ok := lvl.children[sel]
if !ok {
child = &Level{
metrics: make([]*buffer, len(m.Metrics)),
children: nil,
}
lvl.children[sel] = child
}
lvl = child
}
leafMetricName := subString[len(subString)-1]
err = lvl.createBuffer(m, leafMetricName, floatArray, fromTimestamp, resolution)
if err != nil {
return fmt.Errorf("[METRICSTORE]> error while creating buffers from avroReader : %s", err)
}
} else {
err = l.createBuffer(m, metricName, floatArray, fromTimestamp, resolution)
if err != nil {
return fmt.Errorf("[METRICSTORE]> error while creating buffers from avroReader : %s", err)
}
}
}
return nil
}
func (l *Level) createBuffer(m *MemoryStore, metricName string, floatArray schema.FloatArray, from int64, resolution int64) error {
n := len(floatArray)
b := &buffer{
frequency: resolution,
start: from,
data: floatArray[0:n:n],
prev: nil,
next: nil,
archived: true,
}
b.close()
minfo, ok := m.Metrics[metricName]
if !ok {
return nil
// return errors.New("Unkown metric: " + name)
}
prev := l.metrics[minfo.offset]
if prev != nil {
if prev.start > b.start {
return fmt.Errorf("[METRICSTORE]> buffer start time %d is before previous buffer start %d", b.start, prev.start)
}
b.prev = prev
prev.next = b
missingCount := ((int(b.start) - int(prev.start)) - len(prev.data)*int(b.frequency))
if missingCount > 0 {
missingCount /= int(b.frequency)
for range missingCount {
prev.data = append(prev.data, schema.NaN)
}
prev.data = prev.data[0:len(prev.data):len(prev.data)]
}
}
l.metrics[minfo.offset] = b
return nil
}
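// Worked example for the gap handling above (illustrative numbers): if the
// previous buffer starts at t=1000 and holds 10 samples at frequency 60, it
// covers up to t=1600. A new buffer starting at t=1780 leaves
// missingCount = (1780-1000) - 10*60 = 180, i.e. 180/60 = 3 missing samples,
// so three NaN values are appended to the previous buffer before linking.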
func (l *Level) loadJSONFile(m *MemoryStore, f *os.File, from int64) error {
br := bufio.NewReader(f)
cf := &CheckpointFile{}
if err := json.NewDecoder(br).Decode(cf); err != nil {
return err
}
if cf.To != 0 && cf.To < from {
return nil
}
if err := l.loadFile(cf, m); err != nil {
return err
}
return nil
}
func (l *Level) loadFile(cf *CheckpointFile, m *MemoryStore) error {
for name, metric := range cf.Metrics {
n := len(metric.Data)
b := &buffer{
frequency: metric.Frequency,
start: metric.Start,
data: metric.Data[0:n:n], // Space is wasted here :(
prev: nil,
next: nil,
archived: true,
}
b.close()
minfo, ok := m.Metrics[name]
if !ok {
continue
// return errors.New("Unkown metric: " + name)
}
prev := l.metrics[minfo.offset]
if prev != nil {
if prev.start > b.start {
return fmt.Errorf("[METRICSTORE]> buffer start time %d is before previous buffer start %d", b.start, prev.start)
}
b.prev = prev
prev.next = b
}
l.metrics[minfo.offset] = b
}
if len(cf.Children) > 0 && l.children == nil {
l.children = make(map[string]*Level)
}
for sel, childCf := range cf.Children {
child, ok := l.children[sel]
if !ok {
child = &Level{
metrics: make([]*buffer, len(m.Metrics)),
children: nil,
}
l.children[sel] = child
}
if err := child.loadFile(childCf, m); err != nil {
return err
}
}
return nil
}
func (l *Level) fromCheckpoint(m *MemoryStore, dir string, from int64, extension string) (int, error) {
direntries, err := os.ReadDir(dir)
if err != nil {
if os.IsNotExist(err) {
return 0, nil
}
return 0, err
}
allFiles := make([]fs.DirEntry, 0)
filesLoaded := 0
for _, e := range direntries {
if e.IsDir() {
child := &Level{
metrics: make([]*buffer, len(m.Metrics)),
children: make(map[string]*Level),
}
files, err := child.fromCheckpoint(m, path.Join(dir, e.Name()), from, extension)
filesLoaded += files
if err != nil {
return filesLoaded, err
}
l.children[e.Name()] = child
} else if strings.HasSuffix(e.Name(), "."+extension) {
allFiles = append(allFiles, e)
} else {
continue
}
}
files, err := findFiles(allFiles, from, extension, true)
if err != nil {
return filesLoaded, err
}
loaders := map[string]func(*MemoryStore, *os.File, int64) error{
"json": l.loadJSONFile,
"avro": l.loadAvroFile,
}
loader := loaders[extension]
for _, filename := range files {
// Use a closure to ensure file is closed immediately after use
err := func() error {
f, err := os.Open(path.Join(dir, filename))
if err != nil {
return err
}
defer f.Close()
return loader(m, f, from)
}()
if err != nil {
return filesLoaded, err
}
filesLoaded += 1
}
return filesLoaded, nil
}
// This will probably get very slow over time!
// A solution could be some sort of index file in which all other files
// and the timespans they cover are listed.
func findFiles(direntries []fs.DirEntry, t int64, extension string, findMoreRecentFiles bool) ([]string, error) {
nums := map[string]int64{}
for _, e := range direntries {
if !strings.HasSuffix(e.Name(), "."+extension) {
continue
}
// Works for both "<resolution>_<ts>.avro" and "<ts>.json" names: strings.Index
// returns -1 when there is no "_", so the timestamp slice then starts at 0.
ts, err := strconv.ParseInt(e.Name()[strings.Index(e.Name(), "_")+1:len(e.Name())-5], 10, 64)
if err != nil {
return nil, err
}
nums[e.Name()] = ts
}
sort.Slice(direntries, func(i, j int) bool {
a, b := direntries[i], direntries[j]
return nums[a.Name()] < nums[b.Name()]
})
filenames := make([]string, 0)
for i := range direntries {
e := direntries[i]
ts1 := nums[e.Name()]
if findMoreRecentFiles && t <= ts1 {
filenames = append(filenames, e.Name())
}
if i == len(direntries)-1 {
continue
}
enext := direntries[i+1]
ts2 := nums[enext.Name()]
if findMoreRecentFiles {
if ts1 < t && t < ts2 {
filenames = append(filenames, e.Name())
}
} else {
if ts2 < t {
filenames = append(filenames, e.Name())
}
}
}
return filenames, nil
}
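// Worked example (hypothetical file names): given 100.json, 200.json and
// 300.json with t=250 and findMoreRecentFiles=true, 200.json is selected
// because 200 < 250 < 300 (it may still cover t), and 300.json is selected
// because t <= 300. With findMoreRecentFiles=false, only 100.json is
// selected, since its successor starts at 200 < 250, meaning it lies
// entirely before t.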

View File

@@ -1,121 +0,0 @@
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved. This file is part of cc-backend.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
package memorystore
import (
"fmt"
)
var InternalCCMSFlag bool = false
type MetricStoreConfig struct {
// Number of concurrent workers for checkpoint and archive operations.
// If not set or 0, defaults to min(runtime.NumCPU()/2+1, 10)
NumWorkers int `json:"num-workers"`
Checkpoints struct {
FileFormat string `json:"file-format"`
Interval string `json:"interval"`
RootDir string `json:"directory"`
Restore string `json:"restore"`
} `json:"checkpoints"`
Debug struct {
DumpToFile string `json:"dump-to-file"`
EnableGops bool `json:"gops"`
} `json:"debug"`
RetentionInMemory string `json:"retention-in-memory"`
Archive struct {
Interval string `json:"interval"`
RootDir string `json:"directory"`
DeleteInstead bool `json:"delete-instead"`
} `json:"archive"`
Nats []*NatsConfig `json:"nats"`
}
type NatsConfig struct {
// Address of the nats server
Address string `json:"address"`
// Username/Password, optional
Username string `json:"username"`
Password string `json:"password"`
// Creds file path
Credsfilepath string `json:"creds-file-path"`
Subscriptions []struct {
// Channel name
SubscribeTo string `json:"subscribe-to"`
// Allow lines without a cluster tag, use this as default, optional
ClusterTag string `json:"cluster-tag"`
} `json:"subscriptions"`
}
var Keys MetricStoreConfig
// AggregationStrategy for aggregation over multiple values at different cpus/sockets/..., not time!
type AggregationStrategy int
const (
NoAggregation AggregationStrategy = iota
SumAggregation
AvgAggregation
)
func AssignAggregationStrategy(str string) (AggregationStrategy, error) {
switch str {
case "":
return NoAggregation, nil
case "sum":
return SumAggregation, nil
case "avg":
return AvgAggregation, nil
default:
return NoAggregation, fmt.Errorf("[METRICSTORE]> unknown aggregation strategy: %s", str)
}
}
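// Usage sketch (hypothetical): mapping the aggregation string from a metric
// configuration onto the internal constant:
//
//	agg, err := AssignAggregationStrategy("avg")
//	// err == nil, agg == AvgAggregation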
type MetricConfig struct {
// Interval in seconds at which measurements are stored
Frequency int64
// Can be 'sum', 'avg' or null. Describes how to aggregate metrics from the same timestep over the hierarchy.
Aggregation AggregationStrategy
// Private, used internally...
offset int
}
var Metrics map[string]MetricConfig
func GetMetricFrequency(metricName string) (int64, error) {
if metric, ok := Metrics[metricName]; ok {
return metric.Frequency, nil
}
return 0, fmt.Errorf("[METRICSTORE]> metric %s not found", metricName)
}
// AddMetric registers a metric in the Metrics map. If a metric with the same
// name already exists, its entry is kept but upgraded to the higher of the two
// frequencies; it is never downgraded.
func AddMetric(name string, metric MetricConfig) error {
if Metrics == nil {
Metrics = make(map[string]MetricConfig)
}
if existingMetric, ok := Metrics[name]; ok {
if existingMetric.Frequency != metric.Frequency {
if existingMetric.Frequency < metric.Frequency {
existingMetric.Frequency = metric.Frequency
Metrics[name] = existingMetric
}
}
} else {
Metrics[name] = metric
}
return nil
}
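// Usage sketch (hypothetical values): registering the same metric twice keeps
// the higher frequency, so redundant cluster/subcluster definitions converge:
//
//	AddMetric("flops_any", MetricConfig{Frequency: 30, Aggregation: AvgAggregation})
//	AddMetric("flops_any", MetricConfig{Frequency: 60, Aggregation: AvgAggregation})
//	// Metrics["flops_any"].Frequency == 60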

View File

@@ -1,95 +0,0 @@
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved. This file is part of cc-backend.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
package memorystore
const configSchema = `{
"type": "object",
"description": "Configuration specific to built-in metric-store.",
"properties": {
"checkpoints": {
"description": "Configuration for checkpointing the metrics within metric-store",
"type": "object",
"properties": {
"file-format": {
"description": "Specify the type of checkpoint file. There are 2 variants: 'avro' and 'json'. If nothing is specified, 'avro' is default.",
"type": "string"
},
"interval": {
"description": "Interval at which the metrics should be checkpointed.",
"type": "string"
},
"directory": {
"description": "Specify the parent directy in which the checkpointed files should be placed.",
"type": "string"
},
"restore": {
"description": "When cc-backend starts up, look for checkpointed files that are less than X hours old and load metrics from these selected checkpoint files.",
"type": "string"
}
}
},
"archive": {
"description": "Configuration for archiving the already checkpointed files.",
"type": "object",
"properties": {
"interval": {
"description": "Interval at which the checkpointed files should be archived.",
"type": "string"
},
"directory": {
"description": "Specify the parent directy in which the archived files should be placed.",
"type": "string"
}
}
},
"retention-in-memory": {
"description": "Keep the metrics within memory for given time interval. Retention for X hours, then the metrics would be freed.",
"type": "string"
},
"nats": {
"description": "Configuration for accepting published data through NATS.",
"type": "array",
"items": {
"type": "object",
"properties": {
"address": {
"description": "Address of the NATS server.",
"type": "string"
},
"username": {
"description": "Optional: If configured with username/password method.",
"type": "string"
},
"password": {
"description": "Optional: If configured with username/password method.",
"type": "string"
},
"creds-file-path": {
"description": "Optional: If configured with Credential File method. Path to your NATS cred file.",
"type": "string"
},
"subscriptions": {
"description": "Array of various subscriptions. Allows to subscibe to different subjects and publishers.",
"type": "array",
"items": {
"type": "object",
"properties": {
"subscribe-to": {
"description": "Channel name",
"type": "string"
},
"cluster-tag": {
"description": "Optional: Allow lines without a cluster tag, use this as default",
"type": "string"
}
}
}
}
}
}
}
}
}`
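// Example configuration matching the schema above (illustrative values only):
//
//	"metric-store": {
//	  "checkpoints": {
//	    "file-format": "avro",
//	    "interval": "12h",
//	    "directory": "./var/checkpoints",
//	    "restore": "48h"
//	  },
//	  "archive": {
//	    "interval": "168h",
//	    "directory": "./var/archive"
//	  },
//	  "retention-in-memory": "48h",
//	  "nats": [
//	    {
//	      "address": "nats://localhost:4222",
//	      "subscriptions": [
//	        { "subscribe-to": "updates", "cluster-tag": "testcluster" }
//	      ]
//	    }
//	  ]
//	}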

View File

@@ -1,112 +0,0 @@
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved. This file is part of cc-backend.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
package memorystore
import (
"bufio"
"fmt"
"strconv"
)
func (b *buffer) debugDump(buf []byte) []byte {
if b.prev != nil {
buf = b.prev.debugDump(buf)
}
start, n, end := b.start, len(b.data), b.start+b.frequency*int64(len(b.data))
buf = append(buf, `{"start":`...)
buf = strconv.AppendInt(buf, start, 10)
buf = append(buf, `,"len":`...)
buf = strconv.AppendInt(buf, int64(n), 10)
buf = append(buf, `,"end":`...)
buf = strconv.AppendInt(buf, end, 10)
if b.archived {
buf = append(buf, `,"saved":true`...)
}
if b.next != nil {
buf = append(buf, `},`...)
} else {
buf = append(buf, `}`...)
}
return buf
}
func (l *Level) debugDump(m *MemoryStore, w *bufio.Writer, lvlname string, buf []byte, depth int) ([]byte, error) {
l.lock.RLock()
defer l.lock.RUnlock()
for i := 0; i < depth; i++ {
buf = append(buf, '\t')
}
buf = append(buf, '"')
buf = append(buf, lvlname...)
buf = append(buf, "\":{\n"...)
depth += 1
objitems := 0
for name, mc := range m.Metrics {
if b := l.metrics[mc.offset]; b != nil {
for i := 0; i < depth; i++ {
buf = append(buf, '\t')
}
buf = append(buf, '"')
buf = append(buf, name...)
buf = append(buf, `":[`...)
buf = b.debugDump(buf)
buf = append(buf, "],\n"...)
objitems++
}
}
for name, lvl := range l.children {
_, err := w.Write(buf)
if err != nil {
return nil, err
}
buf = buf[0:0]
buf, err = lvl.debugDump(m, w, name, buf, depth)
if err != nil {
return nil, err
}
buf = append(buf, ',', '\n')
objitems++
}
// remove final `,`:
if objitems > 0 {
buf = append(buf[0:len(buf)-1], '\n')
}
depth -= 1
for i := 0; i < depth; i++ {
buf = append(buf, '\t')
}
buf = append(buf, '}')
return buf, nil
}
func (m *MemoryStore) DebugDump(w *bufio.Writer, selector []string) error {
lvl := m.root.findLevel(selector)
if lvl == nil {
return fmt.Errorf("[METRICSTORE]> not found: %#v", selector)
}
buf := make([]byte, 0, 2048)
buf = append(buf, "{"...)
buf, err := lvl.debugDump(m, w, "data", buf, 0)
if err != nil {
return err
}
buf = append(buf, "}\n"...)
if _, err = w.Write(buf); err != nil {
return err
}
return w.Flush()
}

View File

@@ -1,92 +0,0 @@
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved. This file is part of cc-backend.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
package memorystore
import (
"bufio"
"fmt"
"time"
)
// MaxMissingDataPoints allows a node to be considered healthy even though a
// certain number of data points are missing. If a buffer's newest entry is at
// most MaxMissingDataPoints steps old, the healthCheck endpoint still treats
// that metric as healthy; anything older marks the metric as stale.
const MaxMissingDataPoints int64 = 5
// MaxUnhealthyMetrics works together with MaxMissingDataPoints: as long as
// fewer than MaxUnhealthyMetrics metrics (including submetrics) on a node are
// stale, the node is still deemed healthy; at or above that count the node is
// reported as unhealthy.
const MaxUnhealthyMetrics int64 = 5
func (b *buffer) healthCheck() int64 {
// Check if the buffer is empty
if b.data == nil {
return 1
}
bufferEnd := b.start + b.frequency*int64(len(b.data))
t := time.Now().Unix()
// Check if the buffer is too old
if t-bufferEnd > MaxMissingDataPoints*b.frequency {
return 1
}
return 0
}
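// Worked example (illustrative numbers): for a metric with a frequency of 60s
// and MaxMissingDataPoints = 5, a buffer whose last sample ended more than
// 5*60 = 300 seconds ago counts as stale (returns 1); anything newer counts
// as healthy (returns 0).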
func (l *Level) healthCheck(m *MemoryStore, count int64) (int64, error) {
l.lock.RLock()
defer l.lock.RUnlock()
for _, mc := range m.Metrics {
if b := l.metrics[mc.offset]; b != nil {
count += b.healthCheck()
}
}
for _, lvl := range l.children {
c, err := lvl.healthCheck(m, 0)
if err != nil {
return 0, err
}
count += c
}
return count, nil
}
func (m *MemoryStore) HealthCheck(w *bufio.Writer, selector []string) error {
lvl := m.root.findLevel(selector)
if lvl == nil {
return fmt.Errorf("[METRICSTORE]> not found: %#v", selector)
}
buf := make([]byte, 0, 25)
// buf = append(buf, "{"...)
var count int64 = 0
unhealthyMetricsCount, err := lvl.healthCheck(m, count)
if err != nil {
return err
}
if unhealthyMetricsCount < MaxUnhealthyMetrics {
buf = append(buf, "Healthy"...)
} else {
buf = append(buf, "Unhealthy"...)
}
// buf = append(buf, "}\n"...)
if _, err = w.Write(buf); err != nil {
return err
}
return w.Flush()
}

View File

@@ -1,192 +0,0 @@
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved. This file is part of cc-backend.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
package memorystore
import (
"sync"
"unsafe"
"github.com/ClusterCockpit/cc-lib/util"
)
// Could also be called "node" as this forms a node in a tree structure.
// Called Level because "node" might be confusing here.
// Can be either a leaf or an inner node. In this tree structure, inner nodes
// can also hold data (in `metrics`).
type Level struct {
children map[string]*Level
metrics []*buffer
lock sync.RWMutex
}
// Find the correct level for the given selector, creating it if
// it does not exist. Example selector in the context of the
// ClusterCockpit could be: []string{ "emmy", "host123", "cpu0" }.
// This function would probably benefit a lot from `level.children` being a `sync.Map`.
func (l *Level) findLevelOrCreate(selector []string, nMetrics int) *Level {
if len(selector) == 0 {
return l
}
// Allow concurrent reads:
l.lock.RLock()
var child *Level
var ok bool
if l.children == nil {
// Children map needs to be created...
l.lock.RUnlock()
} else {
child, ok = l.children[selector[0]]
l.lock.RUnlock()
if ok {
return child.findLevelOrCreate(selector[1:], nMetrics)
}
}
// The level does not exist, take the write lock for unique access:
l.lock.Lock()
// While this thread waited for the write lock, another thread
// could have created the child node.
if l.children != nil {
child, ok = l.children[selector[0]]
if ok {
l.lock.Unlock()
return child.findLevelOrCreate(selector[1:], nMetrics)
}
}
child = &Level{
metrics: make([]*buffer, nMetrics),
children: nil,
}
if l.children != nil {
l.children[selector[0]] = child
} else {
l.children = map[string]*Level{selector[0]: child}
}
l.lock.Unlock()
return child.findLevelOrCreate(selector[1:], nMetrics)
}
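// Usage sketch (hypothetical selector): resolving the level for cpu0 on
// host123 of cluster emmy, creating missing intermediate levels on the way:
//
//	lvl := root.findLevelOrCreate([]string{"emmy", "host123", "cpu0"}, nMetrics)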
func (l *Level) free(t int64) (int, error) {
l.lock.Lock()
defer l.lock.Unlock()
n := 0
for i, b := range l.metrics {
if b != nil {
delme, m := b.free(t)
n += m
if delme {
if cap(b.data) == BufferCap {
bufferPool.Put(b)
}
l.metrics[i] = nil
}
}
}
for _, l := range l.children {
m, err := l.free(t)
n += m
if err != nil {
return n, err
}
}
return n, nil
}
func (l *Level) sizeInBytes() int64 {
l.lock.RLock()
defer l.lock.RUnlock()
size := int64(0)
for _, b := range l.metrics {
if b != nil {
size += b.count() * int64(unsafe.Sizeof(util.Float(0)))
}
}
for _, child := range l.children {
size += child.sizeInBytes()
}
return size
}
func (l *Level) findLevel(selector []string) *Level {
if len(selector) == 0 {
return l
}
l.lock.RLock()
defer l.lock.RUnlock()
lvl := l.children[selector[0]]
if lvl == nil {
return nil
}
return lvl.findLevel(selector[1:])
}
func (l *Level) findBuffers(selector util.Selector, offset int, f func(b *buffer) error) error {
l.lock.RLock()
defer l.lock.RUnlock()
if len(selector) == 0 {
b := l.metrics[offset]
if b != nil {
return f(b)
}
for _, lvl := range l.children {
err := lvl.findBuffers(nil, offset, f)
if err != nil {
return err
}
}
return nil
}
sel := selector[0]
if len(sel.String) != 0 && l.children != nil {
lvl, ok := l.children[sel.String]
if ok {
err := lvl.findBuffers(selector[1:], offset, f)
if err != nil {
return err
}
}
return nil
}
if sel.Group != nil && l.children != nil {
for _, key := range sel.Group {
lvl, ok := l.children[key]
if ok {
err := lvl.findBuffers(selector[1:], offset, f)
if err != nil {
return err
}
}
}
return nil
}
if sel.Any && l.children != nil {
for _, lvl := range l.children {
if err := lvl.findBuffers(selector[1:], offset, f); err != nil {
return err
}
}
return nil
}
return nil
}

View File

@@ -1,351 +0,0 @@
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved. This file is part of cc-backend.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
package memorystore
import (
"context"
"fmt"
"sync"
"time"
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
"github.com/ClusterCockpit/cc-lib/schema"
"github.com/influxdata/line-protocol/v2/lineprotocol"
"github.com/nats-io/nats.go"
)
// Each connection is handled in its own goroutine. This is a blocking function.
// func ReceiveRaw(ctx context.Context,
// listener net.Listener,
// handleLine func(*lineprotocol.Decoder, string) error,
// ) error {
// var wg sync.WaitGroup
// wg.Add(1)
// go func() {
// defer wg.Done()
// <-ctx.Done()
// if err := listener.Close(); err != nil {
// log.Printf("listener.Close(): %s", err.Error())
// }
// }()
// for {
// conn, err := listener.Accept()
// if err != nil {
// if errors.Is(err, net.ErrClosed) {
// break
// }
// log.Printf("listener.Accept(): %s", err.Error())
// }
// wg.Add(2)
// go func() {
// defer wg.Done()
// defer conn.Close()
// dec := lineprotocol.NewDecoder(conn)
// connctx, cancel := context.WithCancel(context.Background())
// defer cancel()
// go func() {
// defer wg.Done()
// select {
// case <-connctx.Done():
// conn.Close()
// case <-ctx.Done():
// conn.Close()
// }
// }()
// if err := handleLine(dec, "default"); err != nil {
// if errors.Is(err, net.ErrClosed) {
// return
// }
// log.Printf("%s: %s", conn.RemoteAddr().String(), err.Error())
// errmsg := make([]byte, 128)
// errmsg = append(errmsg, `error: `...)
// errmsg = append(errmsg, err.Error()...)
// errmsg = append(errmsg, '\n')
// conn.Write(errmsg)
// }
// }()
// }
// wg.Wait()
// return nil
// }
// ReceiveNats connects to a NATS server and subscribes to the configured
// subjects. This is a blocking function. DecodeLine is called for each line
// received via NATS. Cancel the context for graceful termination.
func ReceiveNats(conf *(NatsConfig),
ms *MemoryStore,
workers int,
ctx context.Context,
) error {
var opts []nats.Option
if conf.Username != "" && conf.Password != "" {
opts = append(opts, nats.UserInfo(conf.Username, conf.Password))
}
if conf.Credsfilepath != "" {
opts = append(opts, nats.UserCredentials(conf.Credsfilepath))
}
nc, err := nats.Connect(conf.Address, opts...)
if err != nil {
return err
}
defer nc.Close()
var wg sync.WaitGroup
var subs []*nats.Subscription
msgs := make(chan *nats.Msg, workers*2)
for _, sc := range conf.Subscriptions {
clusterTag := sc.ClusterTag
var sub *nats.Subscription
if workers > 1 {
wg.Add(workers)
for range workers {
go func() {
for m := range msgs {
dec := lineprotocol.NewDecoderWithBytes(m.Data)
if err := DecodeLine(dec, ms, clusterTag); err != nil {
cclog.Errorf("error: %s", err.Error())
}
}
wg.Done()
}()
}
sub, err = nc.Subscribe(sc.SubscribeTo, func(m *nats.Msg) {
msgs <- m
})
} else {
sub, err = nc.Subscribe(sc.SubscribeTo, func(m *nats.Msg) {
dec := lineprotocol.NewDecoderWithBytes(m.Data)
if err := DecodeLine(dec, ms, clusterTag); err != nil {
cclog.Errorf("error: %s", err.Error())
}
})
}
if err != nil {
return err
}
cclog.Infof("NATS subscription to '%s' on '%s' established", sc.SubscribeTo, conf.Address)
subs = append(subs, sub)
}
<-ctx.Done()
for _, sub := range subs {
err = sub.Unsubscribe()
if err != nil {
cclog.Errorf("NATS unsubscribe failed: %s", err.Error())
}
}
close(msgs)
wg.Wait()
nc.Close() // explicit close; the deferred Close above then becomes a no-op
cclog.Print("NATS connection closed")
return nil
}
// Place `prefix` in front of `buf`, doing it in place in `buf` if possible.
func reorder(buf, prefix []byte) []byte {
n := len(prefix)
m := len(buf)
if cap(buf) < m+n {
return append(prefix[:n:n], buf...)
} else {
buf = buf[:n+m]
for i := m - 1; i >= 0; i-- {
buf[i+n] = buf[i]
}
for i := range n {
buf[i] = prefix[i]
}
return buf
}
}
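// Worked example: reorder([]byte("42"), []byte("cpu")) yields "cpu42". If the
// backing array of buf has enough spare capacity, the existing bytes are
// shifted right in place and the prefix is copied into the freed space,
// avoiding an allocation.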
// Decode lines using dec and make write calls to the MemoryStore.
// If a line is missing its cluster tag, use clusterDefault as default.
func DecodeLine(dec *lineprotocol.Decoder,
ms *MemoryStore,
clusterDefault string,
) error {
// Reduce allocations in loop:
t := time.Now()
metric, metricBuf := Metric{}, make([]byte, 0, 16)
selector := make([]string, 0, 4)
typeBuf, subTypeBuf := make([]byte, 0, 16), make([]byte, 0)
// Optimize for the case where all lines in a "batch" are about the same
// cluster and host. By using `WriteToLevel` (level = host), we do not need
// to take the root- and cluster-level lock as often.
var lvl *Level = nil
prevCluster, prevHost := "", ""
var ok bool
for dec.Next() {
rawmeasurement, err := dec.Measurement()
if err != nil {
return err
}
// Needs to be copied because another call to dec.* would
// invalidate the returned slice.
metricBuf = append(metricBuf[:0], rawmeasurement...)
// The go compiler optimizes map[string(byteslice)] lookups:
metric.MetricConfig, ok = ms.Metrics[string(rawmeasurement)]
if !ok {
continue
}
typeBuf, subTypeBuf := typeBuf[:0], subTypeBuf[:0]
cluster, host := clusterDefault, ""
for {
key, val, err := dec.NextTag()
if err != nil {
return err
}
if key == nil {
break
}
// The go compiler optimizes string([]byte{...}) == "...":
switch string(key) {
case "cluster":
if string(val) == prevCluster {
cluster = prevCluster
} else {
cluster = string(val)
lvl = nil
}
case "hostname", "host":
if string(val) == prevHost {
host = prevHost
} else {
host = string(val)
lvl = nil
}
case "type":
if string(val) == "node" {
break
}
// We cannot be sure that the "type" tag comes before the "type-id" tag:
if len(typeBuf) == 0 {
typeBuf = append(typeBuf, val...)
} else {
typeBuf = reorder(typeBuf, val)
}
case "type-id":
typeBuf = append(typeBuf, val...)
case "subtype":
// We cannot be sure that the "subtype" tag comes before the "stype-id" tag:
if len(subTypeBuf) == 0 {
subTypeBuf = append(subTypeBuf, val...)
} else {
subTypeBuf = reorder(subTypeBuf, val)
}
case "stype-id":
subTypeBuf = append(subTypeBuf, val...)
default:
// Ignore unknown tags (cc-metric-collector might send us a unit, for example, that we do not need)
// return fmt.Errorf("unknown tag: '%s' (value: '%s')", string(key), string(val))
}
}
// If the cluster or host changed, the lvl was set to nil
if lvl == nil {
selector = selector[:2]
selector[0], selector[1] = cluster, host
lvl = ms.GetLevel(selector)
prevCluster, prevHost = cluster, host
}
// subtypes:
selector = selector[:0]
if len(typeBuf) > 0 {
selector = append(selector, string(typeBuf)) // <- Allocation :(
if len(subTypeBuf) > 0 {
selector = append(selector, string(subTypeBuf))
}
}
for {
key, val, err := dec.NextField()
if err != nil {
return err
}
if key == nil {
break
}
if string(key) != "value" {
return fmt.Errorf("host %s: unknown field: '%s' (value: %#v)", host, string(key), val)
}
if val.Kind() == lineprotocol.Float {
metric.Value = schema.Float(val.FloatV())
} else if val.Kind() == lineprotocol.Int {
metric.Value = schema.Float(val.IntV())
} else if val.Kind() == lineprotocol.Uint {
metric.Value = schema.Float(val.UintV())
} else {
return fmt.Errorf("host %s: unsupported value type in message: %s", host, val.Kind().String())
}
}
// Try decreasing timestamp precisions until one of them parses:
if t, err = dec.Time(lineprotocol.Second, t); err != nil {
t = time.Now()
if t, err = dec.Time(lineprotocol.Millisecond, t); err != nil {
t = time.Now()
if t, err = dec.Time(lineprotocol.Microsecond, t); err != nil {
t = time.Now()
if t, err = dec.Time(lineprotocol.Nanosecond, t); err != nil {
return fmt.Errorf("host %s: timestamp : %#v with error : %#v", host, t, err.Error())
}
}
}
}
ts := t.Unix()
if Keys.Checkpoints.FileFormat != "json" {
LineProtocolMessages <- &AvroStruct{
MetricName: string(metricBuf),
Cluster: cluster,
Node: host,
Selector: append([]string{}, selector...),
Value: metric.Value,
Timestamp: ts,
}
}
if err := ms.WriteToLevel(lvl, selector, ts, []Metric{metric}); err != nil {
return err
}
}
return nil
}
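// Example of an accepted line (hypothetical metric and host names) in
// InfluxDB line protocol, as sent by e.g. cc-metric-collector:
//
//	flops_any,cluster=testcluster,hostname=host123,type=cpu,type-id=0 value=42.0 1700000000
//
// The type/type-id tags form the sub-level selector ("cpu0") below the host
// level, and the single "value" field carries the measurement.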

View File

@@ -1,437 +0,0 @@
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved. This file is part of cc-backend.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
// Package memorystore provides an efficient in-memory time-series metric storage system
// with support for hierarchical data organization, checkpointing, and archiving.
//
// The package organizes metrics in a tree structure (cluster → host → component) and
// provides concurrent read/write access to metric data with configurable aggregation strategies.
// Background goroutines handle periodic checkpointing (JSON or Avro format), archiving old data,
// and enforcing retention policies.
//
// Key features:
// - In-memory metric storage with configurable retention
// - Hierarchical data organization (selectors)
// - Concurrent checkpoint/archive workers
// - Support for sum and average aggregation
// - NATS integration for metric ingestion
package memorystore
import (
"bytes"
"context"
"encoding/json"
"errors"
"runtime"
"sync"
"time"
"github.com/ClusterCockpit/cc-backend/internal/config"
"github.com/ClusterCockpit/cc-backend/pkg/archive"
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
"github.com/ClusterCockpit/cc-lib/resampler"
"github.com/ClusterCockpit/cc-lib/schema"
"github.com/ClusterCockpit/cc-lib/util"
)
var (
singleton sync.Once
msInstance *MemoryStore
// shutdownFunc stores the context cancellation function created in Init
// and is called during Shutdown to cancel all background goroutines
shutdownFunc context.CancelFunc
)
type Metric struct {
Name string
Value schema.Float
MetricConfig MetricConfig
}
type MemoryStore struct {
Metrics map[string]MetricConfig
root Level
}
func Init(rawConfig json.RawMessage, wg *sync.WaitGroup) {
startupTime := time.Now()
if rawConfig != nil {
config.Validate(configSchema, rawConfig)
dec := json.NewDecoder(bytes.NewReader(rawConfig))
// dec.DisallowUnknownFields()
if err := dec.Decode(&Keys); err != nil {
cclog.Abortf("[METRICSTORE]> Metric Store Config Init: Could not decode config file '%s'.\nError: %s\n", rawConfig, err.Error())
}
}
// Set NumWorkers from config or use default
if Keys.NumWorkers <= 0 {
maxWorkers := 10
Keys.NumWorkers = min(runtime.NumCPU()/2+1, maxWorkers)
}
cclog.Debugf("[METRICSTORE]> Using %d workers for checkpoint/archive operations\n", Keys.NumWorkers)
// Helper function to add metric configuration
addMetricConfig := func(mc schema.MetricConfig) {
agg, err := AssignAggregationStrategy(mc.Aggregation)
if err != nil {
cclog.Warnf("Could not find aggregation strategy for metric config '%s': %s", mc.Name, err.Error())
}
AddMetric(mc.Name, MetricConfig{
Frequency: int64(mc.Timestep),
Aggregation: agg,
})
}
for _, c := range archive.Clusters {
for _, mc := range c.MetricConfig {
addMetricConfig(*mc)
}
for _, sc := range c.SubClusters {
for _, mc := range sc.MetricConfig {
addMetricConfig(mc)
}
}
}
// Initialize the MemoryStore singleton from the collected metric configs
InitMetrics(Metrics)
ms := GetMemoryStore()
d, err := time.ParseDuration(Keys.Checkpoints.Restore)
if err != nil {
cclog.Fatal(err)
}
restoreFrom := startupTime.Add(-d)
cclog.Infof("[METRICSTORE]> Loading checkpoints newer than %s\n", restoreFrom.Format(time.RFC3339))
files, err := ms.FromCheckpointFiles(Keys.Checkpoints.RootDir, restoreFrom.Unix())
loadedData := ms.SizeInBytes() / 1024 / 1024 // In MB
if err != nil {
cclog.Fatalf("[METRICSTORE]> Loading checkpoints failed: %s\n", err.Error())
} else {
cclog.Infof("[METRICSTORE]> Checkpoints loaded (%d files, %d MB, that took %fs)\n", files, loadedData, time.Since(startupTime).Seconds())
}
// Try to use less memory by forcing a GC run here and then
// lowering the target percentage. The default of 100 means
// that only once the ratio of new allocations exceeds the
// previously active heap, a GC is triggered.
// Forcing a GC here will set the "previously active heap"
// to a minimum.
runtime.GC()
ctx, shutdown := context.WithCancel(context.Background())
wg.Add(4)
Retention(wg, ctx)
Checkpointing(wg, ctx)
Archiving(wg, ctx)
DataStaging(wg, ctx)
// Note: Signal handling has been removed from this function.
// The caller is responsible for handling shutdown signals and calling
// the shutdown() function when appropriate.
// Store the shutdown function for later use by Shutdown()
shutdownFunc = shutdown
if Keys.Nats != nil {
for _, natsConf := range Keys.Nats {
// TODO: When multiple nats configs share a URL, do a single connect.
wg.Add(1)
nc := natsConf
go func() {
// err := ReceiveNats(conf.Nats, decodeLine, runtime.NumCPU()-1, ctx)
err := ReceiveNats(nc, ms, 1, ctx)
if err != nil {
cclog.Fatal(err)
}
wg.Done()
}()
}
}
}
// InitMetrics creates a new, initialized instance of a MemoryStore.
// Will panic if values in the metric configurations are invalid.
func InitMetrics(metrics map[string]MetricConfig) {
singleton.Do(func() {
offset := 0
for key, cfg := range metrics {
if cfg.Frequency == 0 {
panic("[METRICSTORE]> invalid frequency")
}
metrics[key] = MetricConfig{
Frequency: cfg.Frequency,
Aggregation: cfg.Aggregation,
offset: offset,
}
offset += 1
}
msInstance = &MemoryStore{
root: Level{
metrics: make([]*buffer, len(metrics)),
children: make(map[string]*Level),
},
Metrics: metrics,
}
})
}
func GetMemoryStore() *MemoryStore {
if msInstance == nil {
cclog.Fatalf("[METRICSTORE]> MemoryStore not initialized!")
}
return msInstance
}
func Shutdown() {
// Cancel the context to signal all background goroutines to stop
if shutdownFunc != nil {
shutdownFunc()
}
cclog.Infof("[METRICSTORE]> Writing to '%s'...\n", Keys.Checkpoints.RootDir)
var files int
var err error
ms := GetMemoryStore()
if Keys.Checkpoints.FileFormat == "json" {
files, err = ms.ToCheckpoint(Keys.Checkpoints.RootDir, lastCheckpoint.Unix(), time.Now().Unix())
} else {
files, err = GetAvroStore().ToCheckpoint(Keys.Checkpoints.RootDir, true)
close(LineProtocolMessages)
}
if err != nil {
cclog.Errorf("[METRICSTORE]> Writing checkpoint failed: %s\n", err.Error())
}
cclog.Infof("[METRICSTORE]> Done! (%d files written)\n", files)
}
func getName(m *MemoryStore, i int) string {
for key, val := range m.Metrics {
if val.offset == i {
return key
}
}
return ""
}
func Retention(wg *sync.WaitGroup, ctx context.Context) {
ms := GetMemoryStore()
go func() {
defer wg.Done()
d, err := time.ParseDuration(Keys.RetentionInMemory)
if err != nil {
cclog.Fatal(err)
}
if d <= 0 {
return
}
ticks := func() <-chan time.Time {
d := d / 2
if d <= 0 {
return nil
}
return time.NewTicker(d).C
}()
for {
select {
case <-ctx.Done():
return
case <-ticks:
t := time.Now().Add(-d)
cclog.Infof("[METRICSTORE]> start freeing buffers (older than %s)...\n", t.Format(time.RFC3339))
freed, err := ms.Free(nil, t.Unix())
if err != nil {
cclog.Errorf("[METRICSTORE]> freeing up buffers failed: %s\n", err.Error())
} else {
cclog.Infof("[METRICSTORE]> done: %d buffers freed\n", freed)
}
}
}
}()
}
// Write all values in `metrics` to the level specified by `selector` for time `ts`.
// Look at `findLevelOrCreate` for how selectors work.
func (m *MemoryStore) Write(selector []string, ts int64, metrics []Metric) error {
var ok bool
for i, metric := range metrics {
if metric.MetricConfig.Frequency == 0 {
metric.MetricConfig, ok = m.Metrics[metric.Name]
if !ok {
metric.MetricConfig.Frequency = 0
}
metrics[i] = metric
}
}
return m.WriteToLevel(&m.root, selector, ts, metrics)
}
func (m *MemoryStore) GetLevel(selector []string) *Level {
return m.root.findLevelOrCreate(selector, len(m.Metrics))
}
// WriteToLevel assumes that the MetricConfig of each entry in `metrics` is filled in
func (m *MemoryStore) WriteToLevel(l *Level, selector []string, ts int64, metrics []Metric) error {
l = l.findLevelOrCreate(selector, len(m.Metrics))
l.lock.Lock()
defer l.lock.Unlock()
for _, metric := range metrics {
if metric.MetricConfig.Frequency == 0 {
continue
}
b := l.metrics[metric.MetricConfig.offset]
if b == nil {
// First write to this metric and level
b = newBuffer(ts, metric.MetricConfig.Frequency)
l.metrics[metric.MetricConfig.offset] = b
}
nb, err := b.write(ts, metric.Value)
if err != nil {
return err
}
// Last write created a new buffer...
if b != nb {
l.metrics[metric.MetricConfig.offset] = nb
}
}
return nil
}
// Read returns all values for metric `metric` from `from` to `to` for the selected level(s).
// If the level does not hold the metric itself, the data will be aggregated recursively from the children.
// The second and third return value are the actual from/to for the data. Those can be different from
// the range asked for if no data was available.
func (m *MemoryStore) Read(selector util.Selector, metric string, from, to, resolution int64) ([]schema.Float, int64, int64, int64, error) {
if from > to {
return nil, 0, 0, 0, errors.New("[METRICSTORE]> invalid time range")
}
minfo, ok := m.Metrics[metric]
if !ok {
return nil, 0, 0, 0, errors.New("[METRICSTORE]> unkown metric: " + metric)
}
n, data := 0, make([]schema.Float, (to-from)/minfo.Frequency+1)
err := m.root.findBuffers(selector, minfo.offset, func(b *buffer) error {
cdata, cfrom, cto, err := b.read(from, to, data)
if err != nil {
return err
}
if n == 0 {
from, to = cfrom, cto
} else if from != cfrom || to != cto || len(data) != len(cdata) {
missingfront, missingback := int((from-cfrom)/minfo.Frequency), int((to-cto)/minfo.Frequency)
if missingfront != 0 {
return ErrDataDoesNotAlign
}
newlen := len(cdata) - missingback
if newlen < 1 {
return ErrDataDoesNotAlign
}
cdata = cdata[0:newlen]
if len(cdata) != len(data) {
return ErrDataDoesNotAlign
}
from, to = cfrom, cto
}
data = cdata
n += 1
return nil
})
if err != nil {
return nil, 0, 0, 0, err
} else if n == 0 {
return nil, 0, 0, 0, errors.New("[METRICSTORE]> metric or host not found")
} else if n > 1 {
if minfo.Aggregation == AvgAggregation {
normalize := 1. / schema.Float(n)
for i := 0; i < len(data); i++ {
data[i] *= normalize
}
} else if minfo.Aggregation != SumAggregation {
return nil, 0, 0, 0, errors.New("[METRICSTORE]> invalid aggregation")
}
}
data, resolution, err = resampler.LargestTriangleThreeBucket(data, minfo.Frequency, resolution)
if err != nil {
return nil, 0, 0, 0, err
}
return data, from, to, resolution, nil
}
// Free releases all buffers for the selected level and all its children that
// contain only values older than `t`.
func (m *MemoryStore) Free(selector []string, t int64) (int, error) {
return m.GetLevel(selector).free(t)
}
func (m *MemoryStore) FreeAll() error {
for k := range m.root.children {
delete(m.root.children, k)
}
return nil
}
func (m *MemoryStore) SizeInBytes() int64 {
return m.root.sizeInBytes()
}
// ListChildren, given a selector, returns a list of all children of the
// selected level.
func (m *MemoryStore) ListChildren(selector []string) []string {
lvl := &m.root
for lvl != nil && len(selector) != 0 {
lvl.lock.RLock()
next := lvl.children[selector[0]]
lvl.lock.RUnlock()
lvl = next
selector = selector[1:]
}
if lvl == nil {
return nil
}
lvl.lock.RLock()
defer lvl.lock.RUnlock()
children := make([]string, 0, len(lvl.children))
for child := range lvl.children {
children = append(children, child)
}
return children
}
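// Usage sketch (hypothetical names): listing all hosts known for a cluster:
//
//	hosts := ms.ListChildren([]string{"testcluster"})
//	// e.g. ["host123", "host124"]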

View File

@@ -1,156 +0,0 @@
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved. This file is part of cc-backend.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
package memorystore
import (
"testing"
"github.com/ClusterCockpit/cc-lib/schema"
)
func TestAssignAggregationStrategy(t *testing.T) {
tests := []struct {
name string
input string
expected AggregationStrategy
wantErr bool
}{
{"empty string", "", NoAggregation, false},
{"sum", "sum", SumAggregation, false},
{"avg", "avg", AvgAggregation, false},
{"invalid", "invalid", NoAggregation, true},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
result, err := AssignAggregationStrategy(tt.input)
if (err != nil) != tt.wantErr {
t.Errorf("AssignAggregationStrategy(%q) error = %v, wantErr %v", tt.input, err, tt.wantErr)
return
}
if result != tt.expected {
t.Errorf("AssignAggregationStrategy(%q) = %v, want %v", tt.input, result, tt.expected)
}
})
}
}
func TestAddMetric(t *testing.T) {
// Reset Metrics before test
Metrics = make(map[string]MetricConfig)
err := AddMetric("test_metric", MetricConfig{
Frequency: 60,
Aggregation: SumAggregation,
})
if err != nil {
t.Errorf("AddMetric() error = %v", err)
}
if _, ok := Metrics["test_metric"]; !ok {
t.Error("AddMetric() did not add metric to Metrics map")
}
// Test updating with higher frequency
err = AddMetric("test_metric", MetricConfig{
Frequency: 120,
Aggregation: SumAggregation,
})
if err != nil {
t.Errorf("AddMetric() error = %v", err)
}
if Metrics["test_metric"].Frequency != 120 {
t.Errorf("AddMetric() frequency = %d, want 120", Metrics["test_metric"].Frequency)
}
// Test updating with lower frequency (should not update)
err = AddMetric("test_metric", MetricConfig{
Frequency: 30,
Aggregation: SumAggregation,
})
if err != nil {
t.Errorf("AddMetric() error = %v", err)
}
if Metrics["test_metric"].Frequency != 120 {
t.Errorf("AddMetric() frequency = %d, want 120 (should not downgrade)", Metrics["test_metric"].Frequency)
}
}
func TestGetMetricFrequency(t *testing.T) {
// Reset Metrics before test
Metrics = map[string]MetricConfig{
"test_metric": {
Frequency: 60,
Aggregation: SumAggregation,
},
}
freq, err := GetMetricFrequency("test_metric")
if err != nil {
t.Errorf("GetMetricFrequency() error = %v", err)
}
if freq != 60 {
t.Errorf("GetMetricFrequency() = %d, want 60", freq)
}
_, err = GetMetricFrequency("nonexistent")
if err == nil {
t.Error("GetMetricFrequency() expected error for nonexistent metric")
}
}
func TestBufferWrite(t *testing.T) {
b := newBuffer(100, 10)
// Test writing value
nb, err := b.write(100, schema.Float(42.0))
if err != nil {
t.Errorf("buffer.write() error = %v", err)
}
if nb != b {
t.Error("buffer.write() created new buffer unexpectedly")
}
if len(b.data) != 1 {
t.Errorf("buffer.write() len(data) = %d, want 1", len(b.data))
}
if b.data[0] != schema.Float(42.0) {
t.Errorf("buffer.write() data[0] = %v, want 42.0", b.data[0])
}
// Test writing value from past (should error)
_, err = b.write(50, schema.Float(10.0))
if err == nil {
t.Error("buffer.write() expected error for past timestamp")
}
}
func TestBufferRead(t *testing.T) {
b := newBuffer(100, 10)
// Write some test data
b.write(100, schema.Float(1.0))
b.write(110, schema.Float(2.0))
b.write(120, schema.Float(3.0))
// Read data
data := make([]schema.Float, 3)
result, from, to, err := b.read(100, 130, data)
if err != nil {
t.Errorf("buffer.read() error = %v", err)
}
// Buffer read should return from as firstWrite (start + freq/2)
if from != 100 {
t.Errorf("buffer.read() from = %d, want 100", from)
}
if to != 130 {
t.Errorf("buffer.read() to = %d, want 130", to)
}
if len(result) != 3 {
t.Errorf("buffer.read() len(result) = %d, want 3", len(result))
}
}

View File

@@ -1,124 +0,0 @@
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved. This file is part of cc-backend.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
package memorystore
import (
"errors"
"math"
"github.com/ClusterCockpit/cc-lib/util"
)
type Stats struct {
Samples int
Avg util.Float
Min util.Float
Max util.Float
}
func (b *buffer) stats(from, to int64) (Stats, int64, int64, error) {
if from < b.start {
if b.prev != nil {
return b.prev.stats(from, to)
}
from = b.start
}
// TODO: Check if b.closed and if so and the full buffer is queried,
// use b.statistics instead of iterating over the buffer.
samples := 0
sum, min, max := 0.0, math.MaxFloat32, -math.MaxFloat32
var t int64
for t = from; t < to; t += b.frequency {
idx := int((t - b.start) / b.frequency)
if idx >= cap(b.data) {
b = b.next
if b == nil {
break
}
idx = 0
}
if t < b.start || idx >= len(b.data) {
continue
}
xf := float64(b.data[idx])
if math.IsNaN(xf) {
continue
}
samples += 1
sum += xf
min = math.Min(min, xf)
max = math.Max(max, xf)
}
return Stats{
Samples: samples,
Avg: util.Float(sum) / util.Float(samples),
Min: util.Float(min),
Max: util.Float(max),
}, from, t, nil
}
// Returns statistics for the requested metric on the selected node/level.
// Data is aggregated to the selected level the same way as in `MemoryStore.Read`.
// If `Stats.Samples` is zero, the statistics should not be considered as valid.
func (m *MemoryStore) Stats(selector util.Selector, metric string, from, to int64) (*Stats, int64, int64, error) {
if from > to {
return nil, 0, 0, errors.New("invalid time range")
}
minfo, ok := m.Metrics[metric]
if !ok {
return nil, 0, 0, errors.New("unkown metric: " + metric)
}
n, samples := 0, 0
avg, min, max := util.Float(0), math.MaxFloat32, -math.MaxFloat32
err := m.root.findBuffers(selector, minfo.offset, func(b *buffer) error {
stats, cfrom, cto, err := b.stats(from, to)
if err != nil {
return err
}
if n == 0 {
from, to = cfrom, cto
} else if from != cfrom || to != cto {
return ErrDataDoesNotAlign
}
samples += stats.Samples
avg += stats.Avg
min = math.Min(min, float64(stats.Min))
max = math.Max(max, float64(stats.Max))
n += 1
return nil
})
if err != nil {
return nil, 0, 0, err
}
if n == 0 {
return nil, 0, 0, ErrNoData
}
if minfo.Aggregation == AvgAggregation {
avg /= util.Float(n)
} else if n > 1 && minfo.Aggregation != SumAggregation {
return nil, 0, 0, errors.New("invalid aggregation")
}
return &Stats{
Samples: samples,
Avg: avg,
Min: util.Float(min),
Max: util.Float(max),
}, from, to, nil
}

View File

@@ -1,381 +0,0 @@
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved. This file is part of cc-backend.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
package metricDataDispatcher
import (
"context"
"fmt"
"math"
"time"
"github.com/ClusterCockpit/cc-backend/internal/config"
"github.com/ClusterCockpit/cc-backend/internal/metricdata"
"github.com/ClusterCockpit/cc-backend/pkg/archive"
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
"github.com/ClusterCockpit/cc-lib/lrucache"
"github.com/ClusterCockpit/cc-lib/resampler"
"github.com/ClusterCockpit/cc-lib/schema"
)
var cache *lrucache.Cache = lrucache.New(128 * 1024 * 1024)
func cacheKey(
job *schema.Job,
metrics []string,
scopes []schema.MetricScope,
resolution int,
) string {
// Duration and StartTime do not need to be in the cache key as StartTime is less unique than
// job.ID and the TTL of the cache entry makes sure it does not stay there forever.
return fmt.Sprintf("%d(%s):[%v],[%v]-%d",
job.ID, job.State, metrics, scopes, resolution)
}
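// Example key (illustrative values): a running job with database ID 1337,
// queried for two metrics at node scope with resolution 600, yields
// "1337(running):[[flops_any mem_bw]],[[node]]-600".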
// LoadData fetches the metric data for a job.
func LoadData(job *schema.Job,
metrics []string,
scopes []schema.MetricScope,
ctx context.Context,
resolution int,
) (schema.JobData, error) {
data := cache.Get(cacheKey(job, metrics, scopes, resolution), func() (_ any, ttl time.Duration, size int) {
var jd schema.JobData
var err error
if job.State == schema.JobStateRunning ||
job.MonitoringStatus == schema.MonitoringStatusRunningOrArchiving ||
config.Keys.DisableArchive {
repo, err := metricdata.GetMetricDataRepo(job.Cluster)
if err != nil {
return fmt.Errorf("METRICDATA/METRICDATA > no metric data repository configured for '%s'", job.Cluster), 0, 0
}
if scopes == nil {
scopes = append(scopes, schema.MetricScopeNode)
}
if metrics == nil {
cluster := archive.GetCluster(job.Cluster)
for _, mc := range cluster.MetricConfig {
metrics = append(metrics, mc.Name)
}
}
jd, err = repo.LoadData(job, metrics, scopes, ctx, resolution)
if err != nil {
if len(jd) != 0 {
cclog.Warnf("partial error: %s", err.Error())
// return err, 0, 0 // Reactivating will block archiving on one partial error
} else {
cclog.Error("Error while loading job data from metric repository")
return err, 0, 0
}
}
size = jd.Size()
} else {
var jdTemp schema.JobData
jdTemp, err = archive.GetHandle().LoadJobData(job)
if err != nil {
cclog.Error("Error while loading job data from archive")
return err, 0, 0
}
// Deep copy the cached archive hashmap
jd = metricdata.DeepCopy(jdTemp)
// Resampling for archived data.
// Pass the resolution from frontend here.
for _, perScope := range jd {
for _, jm := range perScope {
timestep := int64(0)
for i := 0; i < len(jm.Series); i++ {
jm.Series[i].Data, timestep, err = resampler.LargestTriangleThreeBucket(jm.Series[i].Data, int64(jm.Timestep), int64(resolution))
if err != nil {
return err, 0, 0
}
}
jm.Timestep = int(timestep)
}
}
// Avoid sending unrequested data to the client:
if metrics != nil || scopes != nil {
if metrics == nil {
metrics = make([]string, 0, len(jd))
for k := range jd {
metrics = append(metrics, k)
}
}
res := schema.JobData{}
for _, metric := range metrics {
if perscope, ok := jd[metric]; ok {
if len(perscope) > 1 {
subset := make(map[schema.MetricScope]*schema.JobMetric)
for _, scope := range scopes {
if jm, ok := perscope[scope]; ok {
subset[scope] = jm
}
}
if len(subset) > 0 {
perscope = subset
}
}
res[metric] = perscope
}
}
jd = res
}
size = jd.Size()
}
ttl = 5 * time.Hour
if job.State == schema.JobStateRunning {
ttl = 2 * time.Minute
}
// FIXME: Review: Is this really necessary or correct?
// Note: Lines 147-170 formerly known as prepareJobData(jobData, scopes)
// For /monitoring/job/<job> and some other places, flops_any and mem_bw need
// to be available at the scope 'node'. If a job has a lot of nodes,
// statisticsSeries should be available so that a min/median/max Graph can be
// used instead of a lot of single lines.
// NOTE: New StatsSeries will always be calculated as 'min/median/max'
// Existing (archived) StatsSeries can be 'min/mean/max'!
const maxSeriesSize int = 15
for _, scopes := range jd {
for _, jm := range scopes {
if jm.StatisticsSeries != nil || len(jm.Series) <= maxSeriesSize {
continue
}
jm.AddStatisticsSeries()
}
}
nodeScopeRequested := false
for _, scope := range scopes {
if scope == schema.MetricScopeNode {
nodeScopeRequested = true
}
}
if nodeScopeRequested {
jd.AddNodeScope("flops_any")
jd.AddNodeScope("mem_bw")
}
// Round Resulting Stat Values
jd.RoundMetricStats()
return jd, ttl, size
})
if err, ok := data.(error); ok {
cclog.Error("Error in returned dataset")
return nil, err
}
return data.(schema.JobData), nil
}
// Used for the jobsFootprint GraphQL-Query. TODO: Rename/Generalize.
func LoadAverages(
job *schema.Job,
metrics []string,
data [][]schema.Float,
ctx context.Context,
) error {
if job.State != schema.JobStateRunning && !config.Keys.DisableArchive {
return archive.LoadAveragesFromArchive(job, metrics, data) // #166 change also here?
}
repo, err := metricdata.GetMetricDataRepo(job.Cluster)
if err != nil {
return fmt.Errorf("METRICDATA/METRICDATA > no metric data repository configured for '%s'", job.Cluster)
}
stats, err := repo.LoadStats(job, metrics, ctx) // #166 how to handle stats for acc normalization?
if err != nil {
cclog.Errorf("Error while loading statistics for job %v (User %v, Project %v)", job.JobID, job.User, job.Project)
return err
}
for i, m := range metrics {
nodes, ok := stats[m]
if !ok {
data[i] = append(data[i], schema.NaN)
continue
}
sum := 0.0
for _, node := range nodes {
sum += node.Avg
}
data[i] = append(data[i], schema.Float(sum))
}
return nil
}
// Used for statsTable in frontend: Return scoped statistics by metric.
func LoadScopedJobStats(
job *schema.Job,
metrics []string,
scopes []schema.MetricScope,
ctx context.Context,
) (schema.ScopedJobStats, error) {
if job.State != schema.JobStateRunning && !config.Keys.DisableArchive {
return archive.LoadScopedStatsFromArchive(job, metrics, scopes)
}
repo, err := metricdata.GetMetricDataRepo(job.Cluster)
if err != nil {
return nil, fmt.Errorf("job %d: no metric data repository configured for '%s'", job.JobID, job.Cluster)
}
scopedStats, err := repo.LoadScopedStats(job, metrics, scopes, ctx)
if err != nil {
cclog.Errorf("error while loading scoped statistics for job %d (User %s, Project %s)", job.JobID, job.User, job.Project)
return nil, err
}
return scopedStats, nil
}
// Used for polar plots in frontend: Aggregates statistics for all nodes to single values for job per metric.
func LoadJobStats(
job *schema.Job,
metrics []string,
ctx context.Context,
) (map[string]schema.MetricStatistics, error) {
if job.State != schema.JobStateRunning && !config.Keys.DisableArchive {
return archive.LoadStatsFromArchive(job, metrics)
}
data := make(map[string]schema.MetricStatistics, len(metrics))
repo, err := metricdata.GetMetricDataRepo(job.Cluster)
if err != nil {
return data, fmt.Errorf("job %d: no metric data repository configured for '%s'", job.JobID, job.Cluster)
}
stats, err := repo.LoadStats(job, metrics, ctx)
if err != nil {
cclog.Errorf("error while loading statistics for job %d (User %s, Project %s)", job.JobID, job.User, job.Project)
return data, err
}
for _, m := range metrics {
sum := 0.0
nodes, ok := stats[m]
if !ok {
data[m] = schema.MetricStatistics{Min: 0.0, Avg: 0.0, Max: 0.0}
continue
}
// Start min/max at the extremes so the first node's values are always taken
min, max := math.MaxFloat64, -math.MaxFloat64
for _, node := range nodes {
sum += node.Avg
min = math.Min(min, node.Min)
max = math.Max(max, node.Max)
}
data[m] = schema.MetricStatistics{
Avg: (math.Round((sum/float64(job.NumNodes))*100) / 100),
Min: (math.Round(min*100) / 100),
Max: (math.Round(max*100) / 100),
}
}
return data, nil
}
// Used for the classic node/system view. Returns a map of nodes to a map of metrics.
func LoadNodeData(
cluster string,
metrics, nodes []string,
scopes []schema.MetricScope,
from, to time.Time,
ctx context.Context,
) (map[string]map[string][]*schema.JobMetric, error) {
repo, err := metricdata.GetMetricDataRepo(cluster)
if err != nil {
return nil, fmt.Errorf("METRICDATA/METRICDATA > no metric data repository configured for '%s'", cluster)
}
if metrics == nil {
for _, m := range archive.GetCluster(cluster).MetricConfig {
metrics = append(metrics, m.Name)
}
}
data, err := repo.LoadNodeData(cluster, metrics, nodes, scopes, from, to, ctx)
if err != nil {
if len(data) != 0 {
cclog.Warnf("partial error: %s", err.Error())
} else {
cclog.Error("Error while loading node data from metric repository")
return nil, err
}
}
if data == nil {
return nil, fmt.Errorf("METRICDATA/METRICDATA > the metric data repository for '%s' does not support this query", cluster)
}
return data, nil
}
func LoadNodeListData(
cluster, subCluster string,
nodes []string,
metrics []string,
scopes []schema.MetricScope,
resolution int,
from, to time.Time,
ctx context.Context,
) (map[string]schema.JobData, error) {
repo, err := metricdata.GetMetricDataRepo(cluster)
if err != nil {
return nil, fmt.Errorf("METRICDATA/METRICDATA > no metric data repository configured for '%s'", cluster)
}
if metrics == nil {
for _, m := range archive.GetCluster(cluster).MetricConfig {
metrics = append(metrics, m.Name)
}
}
data, err := repo.LoadNodeListData(cluster, subCluster, nodes, metrics, scopes, resolution, from, to, ctx)
if err != nil {
if len(data) != 0 {
cclog.Warnf("partial error: %s", err.Error())
} else {
cclog.Error("Error while loading node data from metric repository")
return nil, err
}
}
// NOTE: New StatsSeries will always be calculated as 'min/median/max'
const maxSeriesSize int = 8
for _, jd := range data {
for _, scopes := range jd {
for _, jm := range scopes {
if jm.StatisticsSeries != nil || len(jm.Series) < maxSeriesSize {
continue
}
jm.AddStatisticsSeries()
}
}
}
if data == nil {
return nil, fmt.Errorf("METRICDATA/METRICDATA > the metric data repository for '%s' does not support this query", cluster)
}
return data, nil
}

File diff suppressed because it is too large

File diff suppressed because it is too large

View File

@@ -1,88 +0,0 @@
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved. This file is part of cc-backend.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
package metricdata
import (
"context"
"encoding/json"
"fmt"
"time"
"github.com/ClusterCockpit/cc-backend/internal/config"
"github.com/ClusterCockpit/cc-backend/internal/memorystore"
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
"github.com/ClusterCockpit/cc-lib/schema"
)
type MetricDataRepository interface {
// Initialize this MetricDataRepository. One instance of
// this interface will only ever be responsible for one cluster.
Init(rawConfig json.RawMessage) error
// Return the JobData for the given job, only with the requested metrics.
LoadData(job *schema.Job, metrics []string, scopes []schema.MetricScope, ctx context.Context, resolution int) (schema.JobData, error)
// Return a map of metrics to a map of nodes to the metric statistics of the job. node scope only.
LoadStats(job *schema.Job, metrics []string, ctx context.Context) (map[string]map[string]schema.MetricStatistics, error)
// Return a map of metrics to a map of scopes to the scoped metric statistics of the job.
LoadScopedStats(job *schema.Job, metrics []string, scopes []schema.MetricScope, ctx context.Context) (schema.ScopedJobStats, error)
// Return a map of hosts to a map of metrics at the requested scopes (currently only node) for that node.
LoadNodeData(cluster string, metrics, nodes []string, scopes []schema.MetricScope, from, to time.Time, ctx context.Context) (map[string]map[string][]*schema.JobMetric, error)
// Return a map of hosts to a map of metrics to a map of scopes for multiple nodes.
LoadNodeListData(cluster, subCluster string, nodes, metrics []string, scopes []schema.MetricScope, resolution int, from, to time.Time, ctx context.Context) (map[string]schema.JobData, error)
}
var metricDataRepos map[string]MetricDataRepository = map[string]MetricDataRepository{}
func Init() error {
for _, cluster := range config.Clusters {
if cluster.MetricDataRepository != nil {
var kind struct {
Kind string `json:"kind"`
}
if err := json.Unmarshal(cluster.MetricDataRepository, &kind); err != nil {
cclog.Warn("Error while unmarshaling raw json MetricDataRepository")
return err
}
var mdr MetricDataRepository
switch kind.Kind {
case "cc-metric-store":
mdr = &CCMetricStore{}
case "cc-metric-store-internal":
mdr = &CCMetricStoreInternal{}
memorystore.InternalCCMSFlag = true
case "prometheus":
mdr = &PrometheusDataRepository{}
case "test":
mdr = &TestMetricDataRepository{}
default:
return fmt.Errorf("METRICDATA/METRICDATA > Unknown MetricDataRepository %v for cluster %v", kind.Kind, cluster.Name)
}
if err := mdr.Init(cluster.MetricDataRepository); err != nil {
cclog.Errorf("Error initializing MetricDataRepository %v for cluster %v", kind.Kind, cluster.Name)
return err
}
metricDataRepos[cluster.Name] = mdr
}
}
return nil
}
func GetMetricDataRepo(cluster string) (MetricDataRepository, error) {
var err error
repo, ok := metricDataRepos[cluster]
if !ok {
err = fmt.Errorf("METRICDATA/METRICDATA > no metric data repository configured for '%s'", cluster)
}
return repo, err
}

View File

@@ -1,587 +0,0 @@
// Copyright (C) 2022 DKRZ
// All rights reserved. This file is part of cc-backend.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
package metricdata
import (
"bytes"
"context"
"encoding/json"
"errors"
"fmt"
"math"
"net/http"
"os"
"regexp"
"sort"
"strings"
"sync"
"text/template"
"time"
"github.com/ClusterCockpit/cc-backend/pkg/archive"
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
"github.com/ClusterCockpit/cc-lib/schema"
promapi "github.com/prometheus/client_golang/api"
promv1 "github.com/prometheus/client_golang/api/prometheus/v1"
promcfg "github.com/prometheus/common/config"
promm "github.com/prometheus/common/model"
)
type PrometheusDataRepositoryConfig struct {
Url string `json:"url"`
Username string `json:"username,omitempty"`
Suffix string `json:"suffix,omitempty"`
Templates map[string]string `json:"query-templates"`
}
type PrometheusDataRepository struct {
client promapi.Client
queryClient promv1.API
suffix string
templates map[string]*template.Template
}
type PromQLArgs struct {
Nodes string
}
type Trie map[rune]Trie
var logOnce sync.Once
func contains(s []schema.MetricScope, str schema.MetricScope) bool {
for _, v := range s {
if v == str {
return true
}
}
return false
}
func MinMaxMean(data []schema.Float) (float64, float64, float64) {
if len(data) == 0 {
return 0.0, 0.0, 0.0
}
min := math.MaxFloat64
max := -math.MaxFloat64
var sum float64
var n float64
for _, val := range data {
if val.IsNaN() {
continue
}
sum += float64(val)
n += 1
if float64(val) > max {
max = float64(val)
}
if float64(val) < min {
min = float64(val)
}
}
return min, max, sum / n
}
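// For example (illustrative): MinMaxMean([]schema.Float{1.0, schema.NaN, 3.0})
// skips the NaN sample and returns (1.0, 3.0, 2.0). If every sample is NaN,
// n stays 0 and the mean becomes NaN through division by zero.
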
// Rewritten from
// https://github.com/ermanh/trieregex/blob/master/trieregex/trieregex.py
func nodeRegex(nodes []string) string {
root := Trie{}
// add runes of each compute node to trie
for _, node := range nodes {
_trie := root
for _, c := range node {
if _, ok := _trie[c]; !ok {
_trie[c] = Trie{}
}
_trie = _trie[c]
}
_trie['*'] = Trie{}
}
// recursively build regex from rune trie
var trieRegex func(trie Trie, reset bool) string
trieRegex = func(trie Trie, reset bool) string {
if reset {
trie = root
}
if len(trie) == 0 {
return ""
}
if len(trie) == 1 {
for key, _trie := range trie {
if key == '*' {
return ""
}
return regexp.QuoteMeta(string(key)) + trieRegex(_trie, false)
}
} else {
sequences := []string{}
for key, _trie := range trie {
if key != '*' {
sequences = append(sequences, regexp.QuoteMeta(string(key))+trieRegex(_trie, false))
}
}
sort.Slice(sequences, func(i, j int) bool {
return (-len(sequences[i]) < -len(sequences[j])) || (sequences[i] < sequences[j])
})
var result string
// single edge from this tree node
if len(sequences) == 1 {
result = sequences[0]
if len(result) > 1 {
result = "(?:" + result + ")"
}
// multiple edges, each length 1
} else if s := strings.Join(sequences, ""); len(s) == len(sequences) {
// char or numeric range
if len(s)-1 == int(s[len(s)-1])-int(s[0]) {
result = fmt.Sprintf("[%c-%c]", s[0], s[len(s)-1])
// char or numeric set
} else {
result = "[" + s + "]"
}
// multiple edges of different lengths
} else {
result = "(?:" + strings.Join(sequences, "|") + ")"
}
if _, ok := trie['*']; ok {
result += "?"
}
return result
}
return ""
}
return trieRegex(root, true)
}
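// For example (illustrative), a node list sharing a common prefix collapses
// into a compact alternation:
//
//	nodeRegex([]string{"node001", "node002", "node003"}) // "node00[1-3]"
//
// FormatQuery then parenthesizes this regex and appends the configured suffix
// when building the PromQL label matcher.
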
func (pdb *PrometheusDataRepository) Init(rawConfig json.RawMessage) error {
var config PrometheusDataRepositoryConfig
// parse config
if err := json.Unmarshal(rawConfig, &config); err != nil {
cclog.Warn("Error while unmarshaling raw json config")
return err
}
// support basic authentication
var rt http.RoundTripper = nil
if prom_pw := os.Getenv("PROMETHEUS_PASSWORD"); prom_pw != "" && config.Username != "" {
prom_pw := promcfg.Secret(prom_pw)
rt = promcfg.NewBasicAuthRoundTripper(promcfg.NewInlineSecret(config.Username), promcfg.NewInlineSecret(string(prom_pw)), promapi.DefaultRoundTripper)
} else {
if config.Username != "" {
return errors.New("METRICDATA/PROMETHEUS > Prometheus username provided, but PROMETHEUS_PASSWORD not set")
}
}
// init client
client, err := promapi.NewClient(promapi.Config{
Address: config.Url,
RoundTripper: rt,
})
if err != nil {
cclog.Error("Error while initializing new prometheus client")
return err
}
// init query client
pdb.client = client
pdb.queryClient = promv1.NewAPI(pdb.client)
// site config
pdb.suffix = config.Suffix
// init query templates
pdb.templates = make(map[string]*template.Template)
for metric, templ := range config.Templates {
pdb.templates[metric], err = template.New(metric).Parse(templ)
if err == nil {
cclog.Debugf("Added PromQL template for %s: %s", metric, templ)
} else {
cclog.Warnf("Failed to parse PromQL template %s for metric %s", templ, metric)
}
}
return nil
}
// TODO: respect scope argument
func (pdb *PrometheusDataRepository) FormatQuery(
metric string,
scope schema.MetricScope,
nodes []string,
cluster string,
) (string, error) {
args := PromQLArgs{}
if len(nodes) > 0 {
args.Nodes = fmt.Sprintf("(%s)%s", nodeRegex(nodes), pdb.suffix)
} else {
args.Nodes = fmt.Sprintf(".*%s", pdb.suffix)
}
buf := &bytes.Buffer{}
if templ, ok := pdb.templates[metric]; ok {
err := templ.Execute(buf, args)
if err != nil {
return "", errors.New(fmt.Sprintf("METRICDATA/PROMETHEUS > Error compiling template %v", templ))
} else {
query := buf.String()
cclog.Debugf("PromQL: %s", query)
return query, nil
}
} else {
return "", errors.New(fmt.Sprintf("METRICDATA/PROMETHEUS > No PromQL for metric %s configured.", metric))
}
}
// Convert PromAPI row to CC schema.Series
func (pdb *PrometheusDataRepository) RowToSeries(
from time.Time,
step int64,
steps int64,
row *promm.SampleStream,
) schema.Series {
ts := from.Unix()
hostname := strings.TrimSuffix(string(row.Metric["exported_instance"]), pdb.suffix)
// init array of expected length with NaN
values := make([]schema.Float, steps+1)
for i := range values {
values[i] = schema.NaN
}
// copy recorded values from prom sample pair
for _, v := range row.Values {
idx := (v.Timestamp.Unix() - ts) / step
values[idx] = schema.Float(v.Value)
}
min, max, mean := MinMaxMean(values)
// output struct
return schema.Series{
Hostname: hostname,
Data: values,
Statistics: schema.MetricStatistics{
Avg: mean,
Min: min,
Max: max,
},
}
}
func (pdb *PrometheusDataRepository) LoadData(
job *schema.Job,
metrics []string,
scopes []schema.MetricScope,
ctx context.Context,
resolution int,
) (schema.JobData, error) {
// TODO respect requested scope
if len(scopes) == 0 || !contains(scopes, schema.MetricScopeNode) {
scopes = append(scopes, schema.MetricScopeNode)
}
jobData := make(schema.JobData)
// parse job specs
nodes := make([]string, len(job.Resources))
for i, resource := range job.Resources {
nodes[i] = resource.Hostname
}
from := time.Unix(job.StartTime, 0)
to := time.Unix(job.StartTime+int64(job.Duration), 0)
for _, scope := range scopes {
if scope != schema.MetricScopeNode {
logOnce.Do(func() {
cclog.Infof("Scope '%s' requested, but not yet supported: Will return 'node' scope only.", scope)
})
continue
}
for _, metric := range metrics {
metricConfig := archive.GetMetricConfig(job.Cluster, metric)
if metricConfig == nil {
cclog.Warnf("Error in LoadData: Metric %s for cluster %s not configured", metric, job.Cluster)
return nil, errors.New("Prometheus config error")
}
query, err := pdb.FormatQuery(metric, scope, nodes, job.Cluster)
if err != nil {
cclog.Warn("Error while formatting prometheus query")
return nil, err
}
// ranged query over all job nodes
r := promv1.Range{
Start: from,
End: to,
Step: time.Duration(metricConfig.Timestep * 1e9),
}
result, warnings, err := pdb.queryClient.QueryRange(ctx, query, r)
if err != nil {
cclog.Errorf("Prometheus query error in LoadData: %v\nQuery: %s", err, query)
return nil, errors.New("Prometheus query error")
}
if len(warnings) > 0 {
cclog.Warnf("Warnings: %v\n", warnings)
}
// init data structures
if _, ok := jobData[metric]; !ok {
jobData[metric] = make(map[schema.MetricScope]*schema.JobMetric)
}
jobMetric, ok := jobData[metric][scope]
if !ok {
jobMetric = &schema.JobMetric{
Unit: metricConfig.Unit,
Timestep: metricConfig.Timestep,
Series: make([]schema.Series, 0),
}
}
step := int64(metricConfig.Timestep)
steps := int64(to.Sub(from).Seconds()) / step
// iter rows of host, metric, values
for _, row := range result.(promm.Matrix) {
jobMetric.Series = append(jobMetric.Series,
pdb.RowToSeries(from, step, steps, row))
}
// only add metric if at least one host returned data
if !ok && len(jobMetric.Series) > 0 {
jobData[metric][scope] = jobMetric
}
// sort by hostname to get uniform coloring
sort.Slice(jobMetric.Series, func(i, j int) bool {
return (jobMetric.Series[i].Hostname < jobMetric.Series[j].Hostname)
})
}
}
return jobData, nil
}
// TODO change implementation to precomputed/cached stats
func (pdb *PrometheusDataRepository) LoadStats(
job *schema.Job,
metrics []string,
ctx context.Context,
) (map[string]map[string]schema.MetricStatistics, error) {
// map of metrics of nodes of stats
stats := map[string]map[string]schema.MetricStatistics{}
data, err := pdb.LoadData(job, metrics, []schema.MetricScope{schema.MetricScopeNode}, ctx, 0 /*resolution here*/)
if err != nil {
cclog.Warn("Error while loading job for stats")
return nil, err
}
for metric, metricData := range data {
stats[metric] = make(map[string]schema.MetricStatistics)
for _, series := range metricData[schema.MetricScopeNode].Series {
stats[metric][series.Hostname] = series.Statistics
}
}
return stats, nil
}
func (pdb *PrometheusDataRepository) LoadNodeData(
cluster string,
metrics, nodes []string,
scopes []schema.MetricScope,
from, to time.Time,
ctx context.Context,
) (map[string]map[string][]*schema.JobMetric, error) {
t0 := time.Now()
// Map of hosts of metrics of value slices
data := make(map[string]map[string][]*schema.JobMetric)
// query db for each metric
// TODO: scopes seems to be always empty
if len(scopes) == 0 || !contains(scopes, schema.MetricScopeNode) {
scopes = append(scopes, schema.MetricScopeNode)
}
for _, scope := range scopes {
if scope != schema.MetricScopeNode {
logOnce.Do(func() {
cclog.Infof("Note: Scope '%s' requested, but not yet supported: Will return 'node' scope only.", scope)
})
continue
}
for _, metric := range metrics {
metricConfig := archive.GetMetricConfig(cluster, metric)
if metricConfig == nil {
cclog.Warnf("Error in LoadNodeData: Metric %s for cluster %s not configured", metric, cluster)
return nil, errors.New("Prometheus config error")
}
query, err := pdb.FormatQuery(metric, scope, nodes, cluster)
if err != nil {
cclog.Warn("Error while formatting prometheus query")
return nil, err
}
// ranged query over all nodes
r := promv1.Range{
Start: from,
End: to,
Step: time.Duration(metricConfig.Timestep * 1e9),
}
result, warnings, err := pdb.queryClient.QueryRange(ctx, query, r)
if err != nil {
cclog.Errorf("Prometheus query error in LoadNodeData: %v\n", err)
return nil, errors.New("Prometheus query error")
}
if len(warnings) > 0 {
cclog.Warnf("Warnings: %v\n", warnings)
}
step := int64(metricConfig.Timestep)
steps := int64(to.Sub(from).Seconds()) / step
// iter rows of host, metric, values
for _, row := range result.(promm.Matrix) {
hostname := strings.TrimSuffix(string(row.Metric["exported_instance"]), pdb.suffix)
hostdata, ok := data[hostname]
if !ok {
hostdata = make(map[string][]*schema.JobMetric)
data[hostname] = hostdata
}
// output per host and metric
hostdata[metric] = append(hostdata[metric], &schema.JobMetric{
Unit: metricConfig.Unit,
Timestep: metricConfig.Timestep,
Series: []schema.Series{pdb.RowToSeries(from, step, steps, row)},
},
)
}
}
}
t1 := time.Since(t0)
cclog.Debugf("LoadNodeData of %v nodes took %s", len(data), t1)
return data, nil
}
// Implemented by NHR@FAU; Used in Job-View StatsTable
func (pdb *PrometheusDataRepository) LoadScopedStats(
job *schema.Job,
metrics []string,
scopes []schema.MetricScope,
ctx context.Context,
) (schema.ScopedJobStats, error) {
// Assumption: pdb.LoadData() only returns node-scope series - use node scope for statsTable
scopedJobStats := make(schema.ScopedJobStats)
data, err := pdb.LoadData(job, metrics, []schema.MetricScope{schema.MetricScopeNode}, ctx, 0 /*resolution here*/)
if err != nil {
cclog.Warn("Error while loading job for scopedJobStats")
return nil, err
}
for metric, metricData := range data {
for _, scope := range scopes {
if scope != schema.MetricScopeNode {
logOnce.Do(func() {
cclog.Infof("Note: Scope '%s' requested, but not yet supported: Will return 'node' scope only.", scope)
})
continue
}
if _, ok := scopedJobStats[metric]; !ok {
scopedJobStats[metric] = make(map[schema.MetricScope][]*schema.ScopedStats)
}
if _, ok := scopedJobStats[metric][scope]; !ok {
scopedJobStats[metric][scope] = make([]*schema.ScopedStats, 0)
}
for _, series := range metricData[scope].Series {
scopedJobStats[metric][scope] = append(scopedJobStats[metric][scope], &schema.ScopedStats{
Hostname: series.Hostname,
Data: &series.Statistics,
})
}
}
}
return scopedJobStats, nil
}
// Implemented by NHR@FAU; Used in NodeList-View
func (pdb *PrometheusDataRepository) LoadNodeListData(
cluster, subCluster string,
nodes []string,
metrics []string,
scopes []schema.MetricScope,
resolution int,
from, to time.Time,
ctx context.Context,
) (map[string]schema.JobData, error) {
// Assumption: pdb.LoadData() only returns node-scope series - use node scope for NodeList
// Fetch Data, based on pdb.LoadNodeData()
t0 := time.Now()
// Map of hosts of jobData
data := make(map[string]schema.JobData)
// query db for each metric
// TODO: scopes seems to be always empty
if len(scopes) == 0 || !contains(scopes, schema.MetricScopeNode) {
scopes = append(scopes, schema.MetricScopeNode)
}
for _, scope := range scopes {
if scope != schema.MetricScopeNode {
logOnce.Do(func() {
cclog.Infof("Note: Scope '%s' requested, but not yet supported: Will return 'node' scope only.", scope)
})
continue
}
for _, metric := range metrics {
metricConfig := archive.GetMetricConfig(cluster, metric)
if metricConfig == nil {
cclog.Warnf("Error in LoadNodeListData: Metric %s for cluster %s not configured", metric, cluster)
return nil, errors.New("Prometheus config error")
}
query, err := pdb.FormatQuery(metric, scope, nodes, cluster)
if err != nil {
cclog.Warn("Error while formatting prometheus query")
return nil, err
}
// ranged query over all nodes
r := promv1.Range{
Start: from,
End: to,
Step: time.Duration(metricConfig.Timestep * 1e9),
}
result, warnings, err := pdb.queryClient.QueryRange(ctx, query, r)
if err != nil {
cclog.Errorf("Prometheus query error in LoadNodeData: %v\n", err)
return nil, errors.New("Prometheus query error")
}
if len(warnings) > 0 {
cclog.Warnf("Warnings: %v\n", warnings)
}
step := int64(metricConfig.Timestep)
steps := int64(to.Sub(from).Seconds()) / step
// iter rows of host, metric, values
for _, row := range result.(promm.Matrix) {
hostname := strings.TrimSuffix(string(row.Metric["exported_instance"]), pdb.suffix)
hostdata, ok := data[hostname]
if !ok {
hostdata = make(schema.JobData)
data[hostname] = hostdata
}
metricdata, ok := hostdata[metric]
if !ok {
metricdata = make(map[schema.MetricScope]*schema.JobMetric)
data[hostname][metric] = metricdata
}
// output per host, metric and scope
scopeData, ok := metricdata[scope]
if !ok {
scopeData = &schema.JobMetric{
Unit: metricConfig.Unit,
Timestep: metricConfig.Timestep,
Series: []schema.Series{pdb.RowToSeries(from, step, steps, row)},
}
data[hostname][metric][scope] = scopeData
}
}
}
}
t1 := time.Since(t0)
cclog.Debugf("LoadNodeListData of %v nodes took %s", len(data), t1)
return data, nil
}

View File

@@ -1,118 +0,0 @@
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved. This file is part of cc-backend.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
package metricdata
import (
"context"
"encoding/json"
"time"
"github.com/ClusterCockpit/cc-lib/schema"
)
var TestLoadDataCallback func(job *schema.Job, metrics []string, scopes []schema.MetricScope, ctx context.Context, resolution int) (schema.JobData, error) = func(job *schema.Job, metrics []string, scopes []schema.MetricScope, ctx context.Context, resolution int) (schema.JobData, error) {
panic("TODO")
}
// TestMetricDataRepository is only a mock for unit-testing.
type TestMetricDataRepository struct{}
func (tmdr *TestMetricDataRepository) Init(_ json.RawMessage) error {
return nil
}
func (tmdr *TestMetricDataRepository) LoadData(
job *schema.Job,
metrics []string,
scopes []schema.MetricScope,
ctx context.Context,
resolution int,
) (schema.JobData, error) {
return TestLoadDataCallback(job, metrics, scopes, ctx, resolution)
}
func (tmdr *TestMetricDataRepository) LoadStats(
job *schema.Job,
metrics []string,
ctx context.Context,
) (map[string]map[string]schema.MetricStatistics, error) {
panic("TODO")
}
func (tmdr *TestMetricDataRepository) LoadScopedStats(
job *schema.Job,
metrics []string,
scopes []schema.MetricScope,
ctx context.Context,
) (schema.ScopedJobStats, error) {
panic("TODO")
}
func (tmdr *TestMetricDataRepository) LoadNodeData(
cluster string,
metrics, nodes []string,
scopes []schema.MetricScope,
from, to time.Time,
ctx context.Context,
) (map[string]map[string][]*schema.JobMetric, error) {
panic("TODO")
}
func (tmdr *TestMetricDataRepository) LoadNodeListData(
cluster, subCluster string,
nodes []string,
metrics []string,
scopes []schema.MetricScope,
resolution int,
from, to time.Time,
ctx context.Context,
) (map[string]schema.JobData, error) {
panic("TODO")
}
func DeepCopy(jdTemp schema.JobData) schema.JobData {
jd := make(schema.JobData, len(jdTemp))
for k, v := range jdTemp {
jd[k] = make(map[schema.MetricScope]*schema.JobMetric, len(jdTemp[k]))
for k_, v_ := range v {
jd[k][k_] = new(schema.JobMetric)
jd[k][k_].Series = make([]schema.Series, len(v_.Series))
for i := 0; i < len(v_.Series); i += 1 {
jd[k][k_].Series[i].Data = make([]schema.Float, len(v_.Series[i].Data))
copy(jd[k][k_].Series[i].Data, v_.Series[i].Data)
jd[k][k_].Series[i].Hostname = v_.Series[i].Hostname
jd[k][k_].Series[i].Id = v_.Series[i].Id
jd[k][k_].Series[i].Statistics.Avg = v_.Series[i].Statistics.Avg
jd[k][k_].Series[i].Statistics.Min = v_.Series[i].Statistics.Min
jd[k][k_].Series[i].Statistics.Max = v_.Series[i].Statistics.Max
}
jd[k][k_].Timestep = v_.Timestep
jd[k][k_].Unit.Base = v_.Unit.Base
jd[k][k_].Unit.Prefix = v_.Unit.Prefix
if v_.StatisticsSeries != nil {
// Init Slices
jd[k][k_].StatisticsSeries = new(schema.StatsSeries)
jd[k][k_].StatisticsSeries.Max = make([]schema.Float, len(v_.StatisticsSeries.Max))
jd[k][k_].StatisticsSeries.Min = make([]schema.Float, len(v_.StatisticsSeries.Min))
jd[k][k_].StatisticsSeries.Median = make([]schema.Float, len(v_.StatisticsSeries.Median))
jd[k][k_].StatisticsSeries.Mean = make([]schema.Float, len(v_.StatisticsSeries.Mean))
// Copy Data
copy(jd[k][k_].StatisticsSeries.Max, v_.StatisticsSeries.Max)
copy(jd[k][k_].StatisticsSeries.Min, v_.StatisticsSeries.Min)
copy(jd[k][k_].StatisticsSeries.Median, v_.StatisticsSeries.Median)
copy(jd[k][k_].StatisticsSeries.Mean, v_.StatisticsSeries.Mean)
// Handle Percentiles
for k__, v__ := range v_.StatisticsSeries.Percentiles {
jd[k][k_].StatisticsSeries.Percentiles[k__] = make([]schema.Float, len(v__))
copy(jd[k][k_].StatisticsSeries.Percentiles[k__], v__)
}
} else {
jd[k][k_].StatisticsSeries = v_.StatisticsSeries
}
}
}
return jd
}

View File

@@ -0,0 +1,29 @@
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved. This file is part of cc-backend.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
package metricdispatch
const configSchema = `{
"type": "array",
"description": "Array of metric store configurations with scope-based routing.",
"items": {
"type": "object",
"properties": {
"scope": {
"description": "Scope identifier for routing metrics (e.g., cluster name, '*' for default)",
"type": "string"
},
"url": {
"description": "URL of the metric store endpoint",
"type": "string"
},
"token": {
"description": "Authentication token for the metric store",
"type": "string"
}
},
"required": ["scope", "url", "token"]
}
}`
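// A minimal configuration matching this schema might look as follows; the URLs,
// tokens, and the "fritz" cluster name are placeholders, not defaults:
//
//	[
//	  { "scope": "*", "url": "http://localhost:8082", "token": "<jwt>" },
//	  { "scope": "fritz", "url": "http://ccms.example.com:8082", "token": "<jwt>" }
//	]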

View File

@@ -0,0 +1,533 @@
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved. This file is part of cc-backend.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
// Package metricdispatch provides a unified interface for loading and caching job metric data.
//
// This package serves as a central dispatcher that routes metric data requests to the appropriate
// backend based on job state. For running jobs, data is fetched from the metric store (e.g., cc-metric-store).
// For completed jobs, data is retrieved from the file-based job archive.
//
// # Key Features
//
// - Automatic backend selection based on job state (running vs. archived)
// - LRU cache for performance optimization (128 MB default cache size)
// - Data resampling using Largest Triangle Three Bucket algorithm for archived data
// - Automatic statistics series generation for jobs with many nodes
// - Support for scoped metrics (node, socket, accelerator, core)
//
// # Cache Behavior
//
// Cached data has different TTL (time-to-live) values depending on job state:
// - Running jobs: 2 minutes (data changes frequently)
// - Completed jobs: 5 hours (data is static)
//
// The cache key is based on job ID, state, requested metrics, scopes, and resolution.
//
// # Usage
//
// The primary entry point is LoadData, which automatically handles both running and archived jobs:
//
// jobData, err := metricdispatch.LoadData(job, metrics, scopes, ctx, resolution)
// if err != nil {
// // Handle error
// }
//
// For statistics only, use LoadJobStats, LoadScopedJobStats, or LoadAverages depending on the required format.
package metricdispatch
import (
"context"
"fmt"
"math"
"time"
"github.com/ClusterCockpit/cc-backend/pkg/archive"
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
"github.com/ClusterCockpit/cc-lib/v2/lrucache"
"github.com/ClusterCockpit/cc-lib/v2/resampler"
"github.com/ClusterCockpit/cc-lib/v2/schema"
)
// cache is an LRU cache with 128 MB capacity for storing loaded job metric data.
// The cache reduces load on both the metric store and archive backends.
var cache *lrucache.Cache = lrucache.New(128 * 1024 * 1024)
// cacheKey generates a unique cache key for a job's metric data based on job ID, state,
// requested metrics, scopes, and resolution. Duration and StartTime are intentionally excluded
// because job.ID is already unique and the cache TTL ensures entries don't persist indefinitely.
func cacheKey(
job *schema.Job,
metrics []string,
scopes []schema.MetricScope,
resolution int,
) string {
return fmt.Sprintf("%d(%s):[%v],[%v]-%d",
*job.ID, job.State, metrics, scopes, resolution)
}
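// For example (illustrative), a running job with *job.ID == 1234, metrics
// ["flops_any", "mem_bw"], node scope, and resolution 600 yields a key like
// "1234(running):[[flops_any mem_bw]],[[node]]-600".
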
// LoadData retrieves metric data for a job from the appropriate backend (memory store for running jobs,
// archive for completed jobs) and applies caching, resampling, and statistics generation as needed.
//
// For running jobs or when archive is disabled, data is fetched from the metric store.
// For completed archived jobs, data is loaded from the job archive and resampled if needed.
//
// Parameters:
// - job: The job for which to load metric data
// - metrics: List of metric names to load (nil loads all metrics for the cluster)
// - scopes: Metric scopes to include (nil defaults to node scope)
// - ctx: Context for cancellation and timeouts
// - resolution: Target number of data points for resampling (only applies to archived data)
//
// Returns the loaded job data and any error encountered. For partial errors (some metrics failed),
// the function returns the successfully loaded data with a warning logged.
func LoadData(job *schema.Job,
metrics []string,
scopes []schema.MetricScope,
ctx context.Context,
resolution int,
) (schema.JobData, error) {
data := cache.Get(cacheKey(job, metrics, scopes, resolution), func() (_ any, ttl time.Duration, size int) {
var jd schema.JobData
var err error
if job.State == schema.JobStateRunning ||
job.MonitoringStatus == schema.MonitoringStatusRunningOrArchiving {
ms, err := GetMetricDataRepo(job.Cluster, job.SubCluster)
if err != nil {
cclog.Errorf("failed to access metricDataRepo for cluster %s-%s: %s",
job.Cluster, job.SubCluster, err.Error())
return err, 0, 0
}
if scopes == nil {
scopes = append(scopes, schema.MetricScopeNode)
}
if metrics == nil {
cluster := archive.GetCluster(job.Cluster)
for _, mc := range cluster.MetricConfig {
metrics = append(metrics, mc.Name)
}
}
jd, err = ms.LoadData(job, metrics, scopes, ctx, resolution)
if err != nil {
if len(jd) != 0 {
cclog.Warnf("partial error loading metrics from store for job %d (user: %s, project: %s, cluster: %s-%s): %s",
job.JobID, job.User, job.Project, job.Cluster, job.SubCluster, err.Error())
} else {
cclog.Warnf("failed to load job data from metric store for job %d (user: %s, project: %s, cluster: %s-%s): %s",
job.JobID, job.User, job.Project, job.Cluster, job.SubCluster, err.Error())
return err, 0, 0
}
}
size = jd.Size()
} else {
var jdTemp schema.JobData
jdTemp, err = archive.GetHandle().LoadJobData(job)
if err != nil {
cclog.Warnf("failed to load job data from archive for job %d (user: %s, project: %s, cluster: %s-%s): %s",
job.JobID, job.User, job.Project, job.Cluster, job.SubCluster, err.Error())
return err, 0, 0
}
jd = deepCopy(jdTemp)
// Resample archived data using Largest Triangle Three Bucket algorithm to reduce data points
// to the requested resolution, improving transfer performance and client-side rendering.
for _, v := range jd {
for _, v_ := range v {
timestep := int64(0)
for i := 0; i < len(v_.Series); i += 1 {
v_.Series[i].Data, timestep, err = resampler.LargestTriangleThreeBucket(v_.Series[i].Data, int64(v_.Timestep), int64(resolution))
if err != nil {
return err, 0, 0
}
}
v_.Timestep = int(timestep)
}
}
// Filter job data to only include requested metrics and scopes, avoiding unnecessary data transfer.
if metrics != nil || scopes != nil {
if metrics == nil {
metrics = make([]string, 0, len(jd))
for k := range jd {
metrics = append(metrics, k)
}
}
res := schema.JobData{}
for _, metric := range metrics {
if perscope, ok := jd[metric]; ok {
if len(perscope) > 1 {
subset := make(map[schema.MetricScope]*schema.JobMetric)
for _, scope := range scopes {
if jm, ok := perscope[scope]; ok {
subset[scope] = jm
}
}
if len(subset) > 0 {
perscope = subset
}
}
res[metric] = perscope
}
}
jd = res
}
size = jd.Size()
}
ttl = 5 * time.Hour
if job.State == schema.JobStateRunning {
ttl = 2 * time.Minute
}
// Generate statistics series for jobs with many nodes to enable min/median/max graphs
// instead of overwhelming the UI with individual node lines. Note that newly calculated
// statistics use min/median/max, while archived statistics may use min/mean/max.
const maxSeriesSize int = 15
for _, scopes := range jd {
for _, jm := range scopes {
if jm.StatisticsSeries != nil || len(jm.Series) <= maxSeriesSize {
continue
}
jm.AddStatisticsSeries()
}
}
nodeScopeRequested := false
for _, scope := range scopes {
if scope == schema.MetricScopeNode {
nodeScopeRequested = true
}
}
if nodeScopeRequested {
jd.AddNodeScope("flops_any")
jd.AddNodeScope("mem_bw")
}
// Round Resulting Stat Values
jd.RoundMetricStats()
return jd, ttl, size
})
if err, ok := data.(error); ok {
cclog.Errorf("error in cached dataset for job %d: %s", job.JobID, err.Error())
return nil, err
}
return data.(schema.JobData), nil
}
// LoadAverages computes average values for the specified metrics across all nodes of a job.
// For running jobs, it loads statistics from the metric store. For completed jobs, it uses
// the pre-calculated averages from the job archive. The results are appended to the data slice.
func LoadAverages(
job *schema.Job,
metrics []string,
data [][]schema.Float,
ctx context.Context,
) error {
if job.State != schema.JobStateRunning {
return archive.LoadAveragesFromArchive(job, metrics, data) // #166 change also here?
}
ms, err := GetMetricDataRepo(job.Cluster, job.SubCluster)
if err != nil {
cclog.Errorf("failed to access metricDataRepo for cluster %s-%s: %s",
job.Cluster, job.SubCluster, err.Error())
return err
}
stats, err := ms.LoadStats(job, metrics, ctx)
if err != nil {
cclog.Warnf("failed to load statistics from metric store for job %d (user: %s, project: %s, cluster: %s-%s): %s",
job.JobID, job.User, job.Project, job.Cluster, job.SubCluster, err.Error())
return err
}
for i, m := range metrics {
nodes, ok := stats[m]
if !ok {
data[i] = append(data[i], schema.NaN)
continue
}
sum := 0.0
for _, node := range nodes {
sum += node.Avg
}
data[i] = append(data[i], schema.Float(sum))
}
return nil
}
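// For example (illustrative), a caller collecting footprint values can prepare
// one row per metric and let LoadAverages append a single value to each row:
//
//	metrics := []string{"flops_any", "mem_bw"}
//	data := make([][]schema.Float, len(metrics))
//	if err := metricdispatch.LoadAverages(job, metrics, data, ctx); err == nil {
//		// data[0] and data[1] each now hold one value: the sum of the
//		// per-node averages for the respective metric.
//	}
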
// LoadScopedJobStats retrieves job statistics organized by metric scope (node, socket, core, accelerator).
// For running jobs, statistics are computed from the metric store. For completed jobs, pre-calculated
// statistics are loaded from the job archive.
func LoadScopedJobStats(
job *schema.Job,
metrics []string,
scopes []schema.MetricScope,
ctx context.Context,
) (schema.ScopedJobStats, error) {
if job.State != schema.JobStateRunning {
return archive.LoadScopedStatsFromArchive(job, metrics, scopes)
}
ms, err := GetMetricDataRepo(job.Cluster, job.SubCluster)
if err != nil {
cclog.Errorf("failed to access metricDataRepo for cluster %s-%s: %s",
job.Cluster, job.SubCluster, err.Error())
return nil, err
}
scopedStats, err := ms.LoadScopedStats(job, metrics, scopes, ctx)
if err != nil {
cclog.Warnf("failed to load scoped statistics from metric store for job %d (user: %s, project: %s, cluster: %s-%s): %s",
job.JobID, job.User, job.Project, job.Cluster, job.SubCluster, err.Error())
return nil, err
}
// Round Resulting Stat Values
scopedStats.RoundScopedMetricStats()
return scopedStats, nil
}
// LoadJobStats retrieves aggregated statistics (min/avg/max) for each requested metric across all job nodes.
// For running jobs, statistics are computed from the metric store. For completed jobs, pre-calculated
// statistics are loaded from the job archive.
func LoadJobStats(
job *schema.Job,
metrics []string,
ctx context.Context,
) (map[string]schema.MetricStatistics, error) {
if job.State != schema.JobStateRunning {
return archive.LoadStatsFromArchive(job, metrics)
}
ms, err := GetMetricDataRepo(job.Cluster, job.SubCluster)
if err != nil {
cclog.Errorf("failed to access metricDataRepo for cluster %s-%s: %s",
job.Cluster, job.SubCluster, err.Error())
return nil, err
}
data := make(map[string]schema.MetricStatistics, len(metrics))
stats, err := ms.LoadStats(job, metrics, ctx)
if err != nil {
cclog.Warnf("failed to load statistics from metric store for job %d (user: %s, project: %s, cluster: %s-%s): %s",
job.JobID, job.User, job.Project, job.Cluster, job.SubCluster, err.Error())
return data, err
}
for _, m := range metrics {
nodes, ok := stats[m]
if !ok || len(nodes) == 0 {
data[m] = schema.MetricStatistics{Min: 0.0, Avg: 0.0, Max: 0.0}
continue
}
// Start min/max at the opposite extremes so the first node's values are
// always adopted; initializing both to 0.0 would clamp Min at <= 0.
sum, min, max := 0.0, math.MaxFloat64, -math.MaxFloat64
for _, node := range nodes {
sum += node.Avg
min = math.Min(min, node.Min)
max = math.Max(max, node.Max)
}
data[m] = schema.MetricStatistics{
Avg: (math.Round((sum/float64(job.NumNodes))*100) / 100),
Min: (math.Round(min*100) / 100),
Max: (math.Round(max*100) / 100),
}
}
return data, nil
}
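// For example (illustrative):
//
//	stats, err := metricdispatch.LoadJobStats(job, []string{"cpu_load"}, ctx)
//	// On success, stats["cpu_load"] holds the job-wide Min/Avg/Max,
//	// rounded to two decimal places.
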
// LoadNodeData retrieves metric data for specific nodes in a cluster within a time range.
// This is used for node monitoring views and system status pages. Data is always fetched from
// the metric store (not the archive) since it's for current/recent node status monitoring.
//
// Returns a nested map structure: node -> metric -> scoped data.
// FIXME: Add support for subcluster specific cc-metric-stores
func LoadNodeData(
cluster string,
metrics, nodes []string,
scopes []schema.MetricScope,
from, to time.Time,
ctx context.Context,
) (map[string]map[string][]*schema.JobMetric, error) {
if metrics == nil {
for _, m := range archive.GetCluster(cluster).MetricConfig {
metrics = append(metrics, m.Name)
}
}
ms, err := GetMetricDataRepo(cluster, "")
if err != nil {
cclog.Errorf("failed to access metricDataRepo for cluster %s: %s",
cluster, err.Error())
return nil, err
}
data, err := ms.LoadNodeData(cluster, metrics, nodes, scopes, from, to, ctx)
if err != nil {
if len(data) != 0 {
cclog.Warnf("partial error loading node data from metric store for cluster %s: %s", cluster, err.Error())
} else {
cclog.Warnf("failed to load node data from metric store for cluster %s: %s", cluster, err.Error())
return nil, err
}
}
if data == nil {
return nil, fmt.Errorf("metric store for cluster '%s' does not support node data queries", cluster)
}
return data, nil
}
// LoadNodeListData retrieves time-series metric data for multiple nodes within a time range,
// with optional resampling and automatic statistics generation for large datasets.
// This is used for comparing multiple nodes or displaying node status over time.
//
// Returns a map of node names to their job-like metric data structures.
func LoadNodeListData(
cluster, subCluster string,
nodes []string,
metrics []string,
scopes []schema.MetricScope,
resolution int,
from, to time.Time,
ctx context.Context,
) (map[string]schema.JobData, error) {
if metrics == nil {
for _, m := range archive.GetCluster(cluster).MetricConfig {
metrics = append(metrics, m.Name)
}
}
ms, err := GetMetricDataRepo(cluster, subCluster)
if err != nil {
cclog.Errorf("failed to access metricDataRepo for cluster %s-%s: %s",
cluster, subCluster, err.Error())
return nil, err
}
data, err := ms.LoadNodeListData(cluster, subCluster, nodes, metrics, scopes, resolution, from, to, ctx)
if err != nil {
if len(data) != 0 {
cclog.Warnf("partial error loading node list data from metric store for cluster %s, subcluster %s: %s",
cluster, subCluster, err.Error())
} else {
cclog.Warnf("failed to load node list data from metric store for cluster %s, subcluster %s: %s",
cluster, subCluster, err.Error())
return nil, err
}
}
// Generate statistics series for datasets with many series to improve visualization performance.
// Statistics are calculated as min/median/max.
const maxSeriesSize int = 8
for _, jd := range data {
for _, scopes := range jd {
for _, jm := range scopes {
if jm.StatisticsSeries != nil || len(jm.Series) < maxSeriesSize {
continue
}
jm.AddStatisticsSeries()
}
}
}
if data == nil {
return nil, fmt.Errorf("metric store for cluster '%s' does not support node list queries", cluster)
}
return data, nil
}
// deepCopy creates a deep copy of JobData to prevent cache corruption when modifying
// archived data (e.g., during resampling). This ensures the cached archive data remains
// immutable while allowing per-request transformations.
func deepCopy(source schema.JobData) schema.JobData {
result := make(schema.JobData, len(source))
for metricName, scopeMap := range source {
result[metricName] = make(map[schema.MetricScope]*schema.JobMetric, len(scopeMap))
for scope, jobMetric := range scopeMap {
result[metricName][scope] = copyJobMetric(jobMetric)
}
}
return result
}
func copyJobMetric(src *schema.JobMetric) *schema.JobMetric {
dst := &schema.JobMetric{
Timestep: src.Timestep,
Unit: src.Unit,
Series: make([]schema.Series, len(src.Series)),
}
for i := range src.Series {
dst.Series[i] = copySeries(&src.Series[i])
}
if src.StatisticsSeries != nil {
dst.StatisticsSeries = copyStatisticsSeries(src.StatisticsSeries)
}
return dst
}
func copySeries(src *schema.Series) schema.Series {
dst := schema.Series{
Hostname: src.Hostname,
ID: src.ID,
Statistics: src.Statistics,
Data: make([]schema.Float, len(src.Data)),
}
copy(dst.Data, src.Data)
return dst
}
func copyStatisticsSeries(src *schema.StatsSeries) *schema.StatsSeries {
dst := &schema.StatsSeries{
Min: make([]schema.Float, len(src.Min)),
Mean: make([]schema.Float, len(src.Mean)),
Median: make([]schema.Float, len(src.Median)),
Max: make([]schema.Float, len(src.Max)),
}
copy(dst.Min, src.Min)
copy(dst.Mean, src.Mean)
copy(dst.Median, src.Median)
copy(dst.Max, src.Max)
if len(src.Percentiles) > 0 {
dst.Percentiles = make(map[int][]schema.Float, len(src.Percentiles))
for percentile, values := range src.Percentiles {
dst.Percentiles[percentile] = make([]schema.Float, len(values))
copy(dst.Percentiles[percentile], values)
}
}
return dst
}

View File

@@ -0,0 +1,125 @@
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved. This file is part of cc-backend.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
package metricdispatch
import (
"testing"
"github.com/ClusterCockpit/cc-lib/v2/schema"
)
func TestDeepCopy(t *testing.T) {
nodeId := "0"
original := schema.JobData{
"cpu_load": {
schema.MetricScopeNode: &schema.JobMetric{
Timestep: 60,
Unit: schema.Unit{Base: "load", Prefix: ""},
Series: []schema.Series{
{
Hostname: "node001",
ID: &nodeId,
Data: []schema.Float{1.0, 2.0, 3.0},
Statistics: schema.MetricStatistics{
Min: 1.0,
Avg: 2.0,
Max: 3.0,
},
},
},
StatisticsSeries: &schema.StatsSeries{
Min: []schema.Float{1.0, 1.5, 2.0},
Mean: []schema.Float{2.0, 2.5, 3.0},
Median: []schema.Float{2.0, 2.5, 3.0},
Max: []schema.Float{3.0, 3.5, 4.0},
Percentiles: map[int][]schema.Float{
25: {1.5, 2.0, 2.5},
75: {2.5, 3.0, 3.5},
},
},
},
},
}
copied := deepCopy(original)
original["cpu_load"][schema.MetricScopeNode].Series[0].Data[0] = 999.0
original["cpu_load"][schema.MetricScopeNode].StatisticsSeries.Min[0] = 888.0
original["cpu_load"][schema.MetricScopeNode].StatisticsSeries.Percentiles[25][0] = 777.0
if copied["cpu_load"][schema.MetricScopeNode].Series[0].Data[0] != 1.0 {
t.Errorf("Series data was not deeply copied: got %v, want 1.0",
copied["cpu_load"][schema.MetricScopeNode].Series[0].Data[0])
}
if copied["cpu_load"][schema.MetricScopeNode].StatisticsSeries.Min[0] != 1.0 {
t.Errorf("StatisticsSeries was not deeply copied: got %v, want 1.0",
copied["cpu_load"][schema.MetricScopeNode].StatisticsSeries.Min[0])
}
if copied["cpu_load"][schema.MetricScopeNode].StatisticsSeries.Percentiles[25][0] != 1.5 {
t.Errorf("Percentiles was not deeply copied: got %v, want 1.5",
copied["cpu_load"][schema.MetricScopeNode].StatisticsSeries.Percentiles[25][0])
}
if copied["cpu_load"][schema.MetricScopeNode].Timestep != 60 {
t.Errorf("Timestep not copied correctly: got %v, want 60",
copied["cpu_load"][schema.MetricScopeNode].Timestep)
}
if copied["cpu_load"][schema.MetricScopeNode].Series[0].Hostname != "node001" {
t.Errorf("Hostname not copied correctly: got %v, want node001",
copied["cpu_load"][schema.MetricScopeNode].Series[0].Hostname)
}
}
func TestDeepCopyNilStatisticsSeries(t *testing.T) {
original := schema.JobData{
"mem_used": {
schema.MetricScopeNode: &schema.JobMetric{
Timestep: 60,
Series: []schema.Series{
{
Hostname: "node001",
Data: []schema.Float{1.0, 2.0},
},
},
StatisticsSeries: nil,
},
},
}
copied := deepCopy(original)
if copied["mem_used"][schema.MetricScopeNode].StatisticsSeries != nil {
t.Errorf("StatisticsSeries should be nil, got %v",
copied["mem_used"][schema.MetricScopeNode].StatisticsSeries)
}
}
func TestDeepCopyEmptyPercentiles(t *testing.T) {
original := schema.JobData{
"cpu_load": {
schema.MetricScopeNode: &schema.JobMetric{
Timestep: 60,
Series: []schema.Series{},
StatisticsSeries: &schema.StatsSeries{
Min: []schema.Float{1.0},
Mean: []schema.Float{2.0},
Median: []schema.Float{2.0},
Max: []schema.Float{3.0},
Percentiles: nil,
},
},
},
}
copied := deepCopy(original)
if copied["cpu_load"][schema.MetricScopeNode].StatisticsSeries.Percentiles != nil {
t.Errorf("Percentiles should be nil when source is nil/empty")
}
}

View File

@@ -0,0 +1,123 @@
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
package metricdispatch
import (
"bytes"
"context"
"encoding/json"
"fmt"
"time"
"github.com/ClusterCockpit/cc-backend/internal/config"
ccms "github.com/ClusterCockpit/cc-backend/internal/metricstoreclient"
"github.com/ClusterCockpit/cc-backend/pkg/metricstore"
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
"github.com/ClusterCockpit/cc-lib/v2/schema"
)
type MetricDataRepository interface {
// Return the JobData for the given job, only with the requested metrics.
LoadData(job *schema.Job,
metrics []string,
scopes []schema.MetricScope,
ctx context.Context,
resolution int) (schema.JobData, error)
// Return a map of metrics to a map of nodes to the metric statistics of the job. node scope only.
LoadStats(job *schema.Job,
metrics []string,
ctx context.Context) (map[string]map[string]schema.MetricStatistics, error)
// Return a map of metrics to a map of scopes to the scoped metric statistics of the job.
LoadScopedStats(job *schema.Job,
metrics []string,
scopes []schema.MetricScope,
ctx context.Context) (schema.ScopedJobStats, error)
// Return a map of hosts to a map of metrics at the requested scopes (currently only node) for that node.
LoadNodeData(cluster string,
metrics, nodes []string,
scopes []schema.MetricScope,
from, to time.Time,
ctx context.Context) (map[string]map[string][]*schema.JobMetric, error)
// Return a map of hosts to a map of metrics to a map of scopes for multiple nodes.
LoadNodeListData(cluster, subCluster string,
nodes []string,
metrics []string,
scopes []schema.MetricScope,
resolution int,
from, to time.Time,
ctx context.Context) (map[string]schema.JobData, error)
// HealthCheck evaluates the monitoring state for a set of nodes against expected metrics.
HealthCheck(cluster string,
nodes []string,
metrics []string) (map[string]metricstore.HealthCheckResult, error)
}
type CCMetricStoreConfig struct {
Scope string `json:"scope"`
URL string `json:"url"`
Token string `json:"token"`
}
var metricDataRepos map[string]MetricDataRepository = map[string]MetricDataRepository{}
func Init(rawConfig json.RawMessage) error {
if rawConfig != nil {
var configs []CCMetricStoreConfig
config.Validate(configSchema, rawConfig)
dec := json.NewDecoder(bytes.NewReader(rawConfig))
dec.DisallowUnknownFields()
if err := dec.Decode(&configs); err != nil {
return fmt.Errorf("[METRICDISPATCH]> External Metric Store Config Init: Could not decode config file '%s' Error: %s", rawConfig, err.Error())
}
if len(configs) == 0 {
return fmt.Errorf("[METRICDISPATCH]> No external metric store configurations found in config file")
}
for _, config := range configs {
metricDataRepos[config.Scope] = ccms.NewCCMetricStore(config.URL, config.Token)
}
}
return nil
}
func GetMetricDataRepo(cluster string, subcluster string) (MetricDataRepository, error) {
var repo MetricDataRepository
var ok bool
key := cluster + "-" + subcluster
repo, ok = metricDataRepos[key]
if !ok {
repo, ok = metricDataRepos[cluster]
if !ok {
repo, ok = metricDataRepos["*"]
if !ok {
if metricstore.MetricStoreHandle == nil {
return nil, fmt.Errorf("[METRICDISPATCH]> no metric data repository configured '%s'", key)
}
repo = metricstore.MetricStoreHandle
cclog.Debugf("[METRICDISPATCH]> Using internal metric data repository for '%s'", key)
}
}
}
return repo, nil
}
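// For example (illustrative, with hypothetical cluster/subcluster names),
// GetMetricDataRepo("fritz", "spr") resolves a repository in this order:
//
//	1. metricDataRepos["fritz-spr"]  // subcluster-specific entry
//	2. metricDataRepos["fritz"]      // cluster-wide entry
//	3. metricDataRepos["*"]          // wildcard entry
//	4. metricstore.MetricStoreHandle // built-in internal metric store
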
// GetHealthCheckRepo returns the MetricDataRepository for performing health checks on a cluster.
// It uses the same fallback logic as GetMetricDataRepo: cluster → wildcard → internal.
func GetHealthCheckRepo(cluster string) (MetricDataRepository, error) {
return GetMetricDataRepo(cluster, "")
}

View File

@@ -0,0 +1,239 @@
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
// Package metricstoreclient - Query Building
//
// This file contains the query construction and scope transformation logic for cc-metric-store queries.
// It handles the complex mapping between requested metric scopes and native hardware topology,
// automatically aggregating or filtering metrics as needed.
//
// # Scope Transformations
//
// The buildScopeQueries function implements the core scope transformation algorithm.
// It handles 25+ different transformation cases, mapping between:
// - Accelerator (GPU) scope
// - HWThread (hardware thread/SMT) scope
// - Core (CPU core) scope
// - Socket (CPU package) scope
// - MemoryDomain (NUMA domain) scope
// - Node (full system) scope
//
// Transformations follow these rules:
// - Same scope: Return data as-is (e.g., Core → Core)
// - Coarser scope: Aggregate data (e.g., Core → Socket with Aggregate=true)
// - Finer scope: Error - cannot increase granularity
//
// # Query Building
//
// buildQueries and buildNodeQueries are the main entry points, handling job-specific
// and node-specific query construction respectively. They:
// - Validate metric configurations
// - Handle subcluster-specific metric filtering
// - Detect and skip duplicate scope requests
// - Call buildScopeQueries for each metric/scope/host combination
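//
// For example (illustrative): a metric recorded natively at hwthread scope but
// requested at socket scope is emitted as one aggregated query per socket
// (Aggregate=true over that socket's hwthread IDs), whereas requesting hwthread
// granularity for a socket-native metric fails as an unsupported transformation.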
package metricstoreclient
import (
"fmt"
"github.com/ClusterCockpit/cc-backend/pkg/archive"
"github.com/ClusterCockpit/cc-backend/pkg/metricstore"
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
"github.com/ClusterCockpit/cc-lib/v2/schema"
)
// buildQueries constructs API queries for job-specific metric data.
// It iterates through metrics, scopes, and job resources to build the complete query set.
//
// The function handles:
// - Metric configuration validation and subcluster filtering
// - Scope deduplication to avoid redundant queries
// - Hardware thread list resolution (job-allocated vs full node)
// - Delegation to buildScopeQueries for scope transformations
//
// Returns queries and their corresponding assigned scopes (which may differ from requested scopes).
func (ccms *CCMetricStore) buildQueries(
job *schema.Job,
metrics []string,
scopes []schema.MetricScope,
resolution int,
) ([]APIQuery, []schema.MetricScope, error) {
// Initialize both slices together
queries := make([]APIQuery, 0, len(metrics)*len(scopes)*len(job.Resources))
assignedScope := make([]schema.MetricScope, 0, len(metrics)*len(scopes)*len(job.Resources))
topology, err := ccms.getTopology(job.Cluster, job.SubCluster)
if err != nil {
cclog.Errorf("could not load cluster %s subCluster %s topology: %s", job.Cluster, job.SubCluster, err.Error())
return nil, nil, err
}
for _, metric := range metrics {
remoteName := metric
mc := archive.GetMetricConfig(job.Cluster, metric)
if mc == nil {
cclog.Warnf("metric '%s' is not specified for cluster '%s' - skipping", metric, job.Cluster)
continue
}
// Skip if metric is removed for subcluster
if len(mc.SubClusters) != 0 && metricstore.IsMetricRemovedForSubCluster(mc, job.SubCluster) {
continue
}
// Avoid duplicates...
handledScopes := make([]schema.MetricScope, 0, 3)
scopesLoop:
for _, requestedScope := range scopes {
nativeScope := mc.Scope
if nativeScope == schema.MetricScopeAccelerator && job.NumAcc == 0 {
continue
}
scope := nativeScope.Max(requestedScope)
for _, s := range handledScopes {
if scope == s {
continue scopesLoop
}
}
handledScopes = append(handledScopes, scope)
for _, host := range job.Resources {
hwthreads := host.HWThreads
if hwthreads == nil {
hwthreads = topology.Node
}
scopeResults, ok := metricstore.BuildScopeQueries(
nativeScope, requestedScope,
remoteName, host.Hostname,
topology, hwthreads, host.Accelerators,
)
if !ok {
return nil, nil, fmt.Errorf("METRICDATA/EXTERNAL-CCMS > unsupported scope transformation: native-scope=%s, requested-scope=%s", nativeScope, requestedScope)
}
for _, sr := range scopeResults {
queries = append(queries, APIQuery{
Metric: sr.Metric,
Hostname: sr.Hostname,
Aggregate: sr.Aggregate,
Type: sr.Type,
TypeIds: sr.TypeIds,
Resolution: resolution,
})
assignedScope = append(assignedScope, sr.Scope)
}
}
}
}
return queries, assignedScope, nil
}
// buildNodeQueries constructs API queries for node-specific metric data (Systems View).
// Similar to buildQueries but uses full node topology instead of job-allocated resources.
//
// The function handles:
// - SubCluster topology resolution (either pre-loaded or per-node lookup)
// - Full node hardware thread lists (not job-specific subsets)
// - All accelerators on each node
// - Metric configuration validation with subcluster filtering
//
// Returns queries and their corresponding assigned scopes.
func (ccms *CCMetricStore) buildNodeQueries(
cluster string,
subCluster string,
nodes []string,
metrics []string,
scopes []schema.MetricScope,
resolution int,
) ([]APIQuery, []schema.MetricScope, error) {
// Initialize both slices together
queries := make([]APIQuery, 0, len(metrics)*len(scopes)*len(nodes))
assignedScope := make([]schema.MetricScope, 0, len(metrics)*len(scopes)*len(nodes))
for _, metric := range metrics {
remoteName := metric
mc := archive.GetMetricConfig(cluster, metric)
if mc == nil {
cclog.Warnf("metric '%s' is not specified for cluster '%s'", metric, cluster)
continue
}
// Skip if metric is removed for subcluster
if mc.SubClusters != nil && metricstore.IsMetricRemovedForSubCluster(mc, subCluster) {
continue
}
// Avoid duplicates...
handledScopes := make([]schema.MetricScope, 0, 3)
scopesLoop:
for _, requestedScope := range scopes {
nativeScope := mc.Scope
scope := nativeScope.Max(requestedScope)
for _, s := range handledScopes {
if scope == s {
continue scopesLoop
}
}
handledScopes = append(handledScopes, scope)
for _, hostname := range nodes {
var topology *schema.Topology
var err error
// If no subCluster given, get it by node
if subCluster == "" {
topology, err = ccms.getTopologyByNode(cluster, hostname)
} else {
topology, err = ccms.getTopology(cluster, subCluster)
}
if err != nil {
return nil, nil, err
}
// Always full node hwthread id list, no partial queries expected -> Use "topology.Node" directly where applicable
// Always full accelerator id list, no partial queries expected -> Use "acceleratorIds" directly where applicable
acceleratorIds := topology.GetAcceleratorIDs()
// Moved check here if metric matches hardware specs
if nativeScope == schema.MetricScopeAccelerator && len(acceleratorIds) == 0 {
continue scopesLoop
}
scopeResults, ok := metricstore.BuildScopeQueries(
nativeScope, requestedScope,
remoteName, hostname,
topology, topology.Node, acceleratorIds,
)
if !ok {
return nil, nil, fmt.Errorf("METRICDATA/EXTERNAL-CCMS > unsupported scope transformation: native-scope=%s, requested-scope=%s", nativeScope, requestedScope)
}
for _, sr := range scopeResults {
queries = append(queries, APIQuery{
Metric: sr.Metric,
Hostname: sr.Hostname,
Aggregate: sr.Aggregate,
Type: sr.Type,
TypeIds: sr.TypeIds,
Resolution: resolution,
})
assignedScope = append(assignedScope, sr.Scope)
}
}
}
}
return queries, assignedScope, nil
}

View File

@@ -0,0 +1,796 @@
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
// Package metricstoreclient provides a client for querying the cc-metric-store time series database.
//
// The cc-metric-store is a high-performance time series database optimized for HPC metric data.
// This client handles HTTP communication, query construction, scope transformations, and data retrieval
// for job and node metrics across different metric scopes (node, socket, core, hwthread, accelerator).
//
// # Architecture
//
// The package is split into two main components:
// - Client Operations (cc-metric-store.go): HTTP client, request handling, data loading methods
// - Query Building (cc-metric-store-queries.go): Query construction and scope transformation logic
//
// # Basic Usage
//
// store := NewCCMetricStore("http://localhost:8080", "jwt-token")
//
// // Load job data
// jobData, err := store.LoadData(job, metrics, scopes, ctx, resolution)
// if err != nil {
// log.Fatal(err)
// }
//
// # Metric Scopes
//
// The client supports hierarchical metric scopes that map to HPC hardware topology:
// - MetricScopeAccelerator: GPU/accelerator level metrics
// - MetricScopeHWThread: Hardware thread (SMT) level metrics
// - MetricScopeCore: CPU core level metrics
// - MetricScopeSocket: CPU socket level metrics
// - MetricScopeMemoryDomain: NUMA domain level metrics
// - MetricScopeNode: Full node level metrics
//
// The client automatically handles scope transformations, aggregating finer-grained metrics
// to coarser scopes when needed (e.g., aggregating core metrics to socket level).
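//
// A minimal sketch of such a transformation (the metric name is illustrative
// and assumed to be core-native on the target cluster):
//
// // Requesting socket scope for a core-native metric: the client
// // aggregates the per-core series up to socket level.
// jobData, err := store.LoadData(job, []string{"flops_any"},
// []schema.MetricScope{schema.MetricScopeSocket}, ctx, 60)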
//
// # Error Handling
//
// The client supports partial errors - if some queries fail, it returns both the successful
// data and an error listing the failed queries. This allows processing partial results
// when some nodes or metrics are temporarily unavailable.
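//
// A sketch of consuming partial results (the logging call is illustrative):
//
// jobData, err := store.LoadData(job, metrics, scopes, ctx, 0)
// if err != nil && len(jobData) > 0 {
// // Partial failure: report it, but keep the series that did load.
// cclog.Warnf("partial metric data: %v", err)
// }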
//
// # API Versioning
//
// The client uses cc-metric-store API v2, which includes support for:
// - Data resampling for bandwidth optimization
// - Multi-scope queries in a single request
// - Aggregation across hardware topology levels
package metricstoreclient
import (
"bufio"
"bytes"
"context"
"encoding/json"
"fmt"
"net/http"
"strings"
"time"
"github.com/ClusterCockpit/cc-backend/pkg/archive"
ms "github.com/ClusterCockpit/cc-backend/pkg/metricstore"
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
"github.com/ClusterCockpit/cc-lib/v2/schema"
)
// CCMetricStore is the HTTP client for communicating with cc-metric-store.
// It manages connection details, authentication, and provides methods for querying metrics.
type CCMetricStore struct {
client http.Client // HTTP client with 10-second timeout
jwt string // JWT Bearer token for authentication
url string // Base URL of cc-metric-store instance
queryEndpoint string // Full URL to query API endpoint
topologyCache map[string]*schema.Topology // cluster -> topology cache
}
// APIQueryRequest represents a request to the cc-metric-store query API.
// It supports both explicit queries and "for-all-nodes" bulk queries.
type APIQueryRequest struct {
Cluster string `json:"cluster"` // Target cluster name
Queries []APIQuery `json:"queries"` // Explicit list of metric queries
ForAllNodes []string `json:"for-all-nodes"` // Metrics to query for all nodes
From int64 `json:"from"` // Start time (Unix timestamp)
To int64 `json:"to"` // End time (Unix timestamp)
WithStats bool `json:"with-stats"` // Include min/avg/max statistics
WithData bool `json:"with-data"` // Include time series data points
}
// APIQuery specifies a single metric query with optional scope filtering.
// Type and TypeIds define the hardware scope (core, socket, accelerator, etc.).
type APIQuery struct {
Type *string `json:"type,omitempty"` // Scope type (e.g., "core", "socket")
SubType *string `json:"subtype,omitempty"` // Sub-scope type (reserved for future use)
Metric string `json:"metric"` // Metric name
Hostname string `json:"host"` // Target hostname
Resolution int `json:"resolution"` // Data resolution in seconds (0 = native)
TypeIds []string `json:"type-ids,omitempty"` // IDs for the scope type (e.g., core IDs)
SubTypeIds []string `json:"subtype-ids,omitempty"` // IDs for sub-scope (reserved)
Aggregate bool `json:"aggreg"` // Aggregate across TypeIds
}
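// An illustrative wire-format instance of the APIQuery above, aggregating
// four cores into one series (hostname and IDs are placeholders):
//
// {
// "metric": "cpu_load", "host": "node01", "resolution": 60,
// "type": "core", "type-ids": ["0", "1", "2", "3"], "aggreg": true
// }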
// APIQueryResponse contains the results from a cc-metric-store query.
// Results align with the Queries slice by index.
type APIQueryResponse struct {
Queries []APIQuery `json:"queries,omitempty"` // Echoed queries (for bulk requests)
Results [][]APIMetricData `json:"results"` // Result data, indexed by query
}
// APIMetricData represents time series data and statistics for a single metric series.
// Error is set if this particular series failed to load.
type APIMetricData struct {
Error *string `json:"error"` // Error message if query failed
Data []schema.Float `json:"data"` // Time series data points
From int64 `json:"from"` // Actual start time of data
To int64 `json:"to"` // Actual end time of data
Resolution int `json:"resolution"` // Actual resolution of data in seconds
Avg schema.Float `json:"avg"` // Average value across time range
Min schema.Float `json:"min"` // Minimum value in time range
Max schema.Float `json:"max"` // Maximum value in time range
}
// NewCCMetricStore creates and initializes a new (external) CCMetricStore client.
// The url parameter should include the protocol and port (e.g., "http://localhost:8080").
// The token parameter is a JWT used for Bearer authentication; pass empty string if auth is disabled.
func NewCCMetricStore(url string, token string) *CCMetricStore {
return &CCMetricStore{
url: url,
queryEndpoint: fmt.Sprintf("%s/api/query", url),
jwt: token,
client: http.Client{
Timeout: 10 * time.Second,
},
topologyCache: make(map[string]*schema.Topology),
}
}
// doRequest executes an HTTP GET request with a JSON body against the cc-metric-store query API.
// It handles JSON encoding/decoding, authentication, and API versioning.
// The response body is closed after decoding to prevent resource leaks.
func (ccms *CCMetricStore) doRequest(
ctx context.Context,
body *APIQueryRequest,
) (*APIQueryResponse, error) {
buf := &bytes.Buffer{}
if err := json.NewEncoder(buf).Encode(body); err != nil {
cclog.Errorf("Error while encoding request body: %s", err.Error())
return nil, err
}
req, err := http.NewRequestWithContext(ctx, http.MethodGet, ccms.queryEndpoint, buf)
if err != nil {
cclog.Errorf("Error while building request body: %s", err.Error())
return nil, err
}
if ccms.jwt != "" {
req.Header.Add("Authorization", fmt.Sprintf("Bearer %s", ccms.jwt))
}
// Select the cc-metric-store query API version:
// v2 = data with resampling
// v1 = data without resampling
q := req.URL.Query()
q.Add("version", "v2")
req.URL.RawQuery = q.Encode()
res, err := ccms.client.Do(req)
if err != nil {
cclog.Errorf("Error while performing request: %s", err.Error())
return nil, err
}
defer res.Body.Close()
if res.StatusCode != http.StatusOK {
return nil, fmt.Errorf("'%s': HTTP Status: %s", ccms.queryEndpoint, res.Status)
}
var resBody APIQueryResponse
if err := json.NewDecoder(bufio.NewReader(res.Body)).Decode(&resBody); err != nil {
cclog.Errorf("Error while decoding result body: %s", err.Error())
return nil, err
}
return &resBody, nil
}
// getTopology returns the topology for a given cluster and subcluster, caching it if not already present
func (ccms *CCMetricStore) getTopology(cluster, subCluster string) (*schema.Topology, error) {
cacheKey := fmt.Sprintf("%s:%s", cluster, subCluster)
if topology, ok := ccms.topologyCache[cacheKey]; ok {
return topology, nil
}
subcluster, err := archive.GetSubCluster(cluster, subCluster)
if err != nil {
return nil, err
}
ccms.topologyCache[cacheKey] = &subcluster.Topology
return &subcluster.Topology, nil
}
// getTopologyByNode returns the topology for a given cluster and node, caching it if not already present
func (ccms *CCMetricStore) getTopologyByNode(cluster, node string) (*schema.Topology, error) {
subCluster, err := archive.GetSubClusterByNode(cluster, node)
if err != nil {
return nil, err
}
return ccms.getTopology(cluster, subCluster)
}
// LoadData retrieves time series data and statistics for the specified job and metrics.
// It queries data for the job's time range and resources, handling scope transformations automatically.
//
// Parameters:
// - job: Job metadata including cluster, time range, and allocated resources
// - metrics: List of metric names to retrieve
// - scopes: Requested metric scopes (node, socket, core, etc.)
// - ctx: Context for cancellation and timeouts
// - resolution: Data resolution in seconds (0 for native resolution)
//
// Returns JobData organized as: metric -> scope -> series list.
// Supports partial errors: returns available data even if some queries fail.
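//
// A sketch of walking the returned structure:
//
// for metric, perScope := range jobData {
// for scope, jm := range perScope {
// fmt.Printf("%s@%s: %d series\n", metric, scope, len(jm.Series))
// }
// }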
func (ccms *CCMetricStore) LoadData(
job *schema.Job,
metrics []string,
scopes []schema.MetricScope,
ctx context.Context,
resolution int,
) (schema.JobData, error) {
queries, assignedScope, err := ccms.buildQueries(job, metrics, scopes, resolution)
if err != nil {
cclog.Errorf("Error while building queries for jobId %d, Metrics %v, Scopes %v: %s", job.JobID, metrics, scopes, err.Error())
return nil, err
}
// Verify assignment is correct - log any inconsistencies for debugging
if len(queries) != len(assignedScope) {
cclog.Errorf("Critical error: queries and assignedScope have different lengths after buildQueries: %d vs %d",
len(queries), len(assignedScope))
}
req := APIQueryRequest{
Cluster: job.Cluster,
From: job.StartTime,
To: job.StartTime + int64(job.Duration),
Queries: queries,
WithStats: true,
WithData: true,
}
resBody, err := ccms.doRequest(ctx, &req)
if err != nil {
cclog.Errorf("Error while performing request for job %d: %s", job.JobID, err.Error())
return nil, err
}
var errors []string
jobData := make(schema.JobData)
// Add safety check for potential index out of range errors
if len(resBody.Results) != len(req.Queries) || len(assignedScope) != len(req.Queries) {
cclog.Warnf("Mismatch in query results count: queries=%d, results=%d, assignedScope=%d",
len(req.Queries), len(resBody.Results), len(assignedScope))
if len(resBody.Results) > len(req.Queries) {
resBody.Results = resBody.Results[:len(req.Queries)]
}
if len(assignedScope) > len(req.Queries) {
assignedScope = assignedScope[:len(req.Queries)]
}
}
for i, row := range resBody.Results {
query := req.Queries[i]
metric := query.Metric
scope := assignedScope[i]
mc := archive.GetMetricConfig(job.Cluster, metric)
if mc == nil {
cclog.Warnf("Metric config not found for %s on cluster %s", metric, job.Cluster)
continue
}
if _, ok := jobData[metric]; !ok {
jobData[metric] = make(map[schema.MetricScope]*schema.JobMetric)
}
res := mc.Timestep
if len(row) > 0 {
res = row[0].Resolution
}
jobMetric, ok := jobData[metric][scope]
if !ok {
jobMetric = &schema.JobMetric{
Unit: mc.Unit,
Timestep: res,
Series: make([]schema.Series, 0),
}
jobData[metric][scope] = jobMetric
}
for ndx, res := range row {
if res.Error != nil {
/* Build list for "partial errors", if any */
errors = append(errors, fmt.Sprintf("failed to fetch '%s' from host '%s': %s", query.Metric, query.Hostname, *res.Error))
continue
}
id := ms.ExtractTypeID(query.Type, query.TypeIds, ndx, query.Metric, query.Hostname)
ms.SanitizeStats(&res.Avg, &res.Min, &res.Max)
jobMetric.Series = append(jobMetric.Series, schema.Series{
Hostname: query.Hostname,
ID: id,
Statistics: schema.MetricStatistics{
Avg: float64(res.Avg),
Min: float64(res.Min),
Max: float64(res.Max),
},
Data: res.Data,
})
}
// So that one can later check len(jobData): remove from map if empty
if len(jobMetric.Series) == 0 {
delete(jobData[metric], scope)
if len(jobData[metric]) == 0 {
delete(jobData, metric)
}
}
}
if len(errors) != 0 {
/* Returns list for "partial errors" */
return jobData, fmt.Errorf("METRICDATA/EXTERNAL-CCMS > Errors: %s", strings.Join(errors, ", "))
}
return jobData, nil
}
// LoadStats retrieves min/avg/max statistics for job metrics at node scope.
// This is faster than LoadData when only statistical summaries are needed (no time series data).
//
// Returns statistics organized as: metric -> hostname -> statistics.
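//
// A sketch of consuming the result (the metric name is illustrative):
//
// stats, err := store.LoadStats(job, []string{"mem_used"}, ctx)
// if err == nil {
// for host, s := range stats["mem_used"] {
// fmt.Printf("%s: avg=%.1f min=%.1f max=%.1f\n", host, s.Avg, s.Min, s.Max)
// }
// }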
func (ccms *CCMetricStore) LoadStats(
job *schema.Job,
metrics []string,
ctx context.Context,
) (map[string]map[string]schema.MetricStatistics, error) {
queries, _, err := ccms.buildQueries(job, metrics, []schema.MetricScope{schema.MetricScopeNode}, 0) // #166 Add scope here for analysis view accelerator normalization?
if err != nil {
cclog.Errorf("Error while building queries for jobId %d, Metrics %v: %s", job.JobID, metrics, err.Error())
return nil, err
}
req := APIQueryRequest{
Cluster: job.Cluster,
From: job.StartTime,
To: job.StartTime + int64(job.Duration),
Queries: queries,
WithStats: true,
WithData: false,
}
resBody, err := ccms.doRequest(ctx, &req)
if err != nil {
cclog.Errorf("Error while performing request for job %d: %s", job.JobID, err.Error())
return nil, err
}
stats := make(map[string]map[string]schema.MetricStatistics, len(metrics))
for i, res := range resBody.Results {
if i >= len(req.Queries) {
cclog.Warnf("LoadStats: result index %d exceeds queries length %d", i, len(req.Queries))
break
}
if len(res) == 0 {
// No data found for this metric; a warning is logged in FetchData
continue
}
query := req.Queries[i]
metric := query.Metric
data := res[0]
if data.Error != nil {
cclog.Warnf("fetching %s for node %s failed: %s", metric, query.Hostname, *data.Error)
continue
}
metricdata, ok := stats[metric]
if !ok {
metricdata = make(map[string]schema.MetricStatistics, job.NumNodes)
stats[metric] = metricdata
}
if hasNaNStats(data.Avg, data.Min, data.Max) {
cclog.Warnf("fetching %s for node %s failed: one of avg/min/max is NaN", metric, query.Hostname)
continue
}
metricdata[query.Hostname] = schema.MetricStatistics{
Avg: float64(data.Avg),
Min: float64(data.Min),
Max: float64(data.Max),
}
}
return stats, nil
}
// LoadScopedStats retrieves statistics for job metrics across multiple scopes.
// Used for the Job-View Statistics Table to display per-scope breakdowns.
//
// Returns statistics organized as: metric -> scope -> list of scoped statistics.
// Each scoped statistic includes hostname, hardware ID (if applicable), and min/avg/max values.
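//
// A sketch of reading one metric at core scope (the metric name is illustrative):
//
// scoped, err := store.LoadScopedStats(job, []string{"cpu_load"}, scopes, ctx)
// if err == nil {
// for _, s := range scoped["cpu_load"][schema.MetricScopeCore] {
// fmt.Printf("%s: avg=%.1f\n", s.Hostname, s.Data.Avg)
// }
// }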
func (ccms *CCMetricStore) LoadScopedStats(
job *schema.Job,
metrics []string,
scopes []schema.MetricScope,
ctx context.Context,
) (schema.ScopedJobStats, error) {
queries, assignedScope, err := ccms.buildQueries(job, metrics, scopes, 0)
if err != nil {
cclog.Errorf("Error while building queries for jobId %d, Metrics %v, Scopes %v: %s", job.JobID, metrics, scopes, err.Error())
return nil, err
}
req := APIQueryRequest{
Cluster: job.Cluster,
From: job.StartTime,
To: job.StartTime + int64(job.Duration),
Queries: queries,
WithStats: true,
WithData: false,
}
resBody, err := ccms.doRequest(ctx, &req)
if err != nil {
cclog.Errorf("Error while performing request for job %d: %s", job.JobID, err.Error())
return nil, err
}
var errors []string
scopedJobStats := make(schema.ScopedJobStats)
for i, row := range resBody.Results {
query := req.Queries[i]
metric := query.Metric
scope := assignedScope[i]
if _, ok := scopedJobStats[metric]; !ok {
scopedJobStats[metric] = make(map[schema.MetricScope][]*schema.ScopedStats)
}
if _, ok := scopedJobStats[metric][scope]; !ok {
scopedJobStats[metric][scope] = make([]*schema.ScopedStats, 0)
}
for ndx, res := range row {
if res.Error != nil {
/* Build list for "partial errors", if any */
errors = append(errors, fmt.Sprintf("failed to fetch '%s' from host '%s': %s", query.Metric, query.Hostname, *res.Error))
continue
}
id := ms.ExtractTypeID(query.Type, query.TypeIds, ndx, query.Metric, query.Hostname)
ms.SanitizeStats(&res.Avg, &res.Min, &res.Max)
scopedJobStats[metric][scope] = append(scopedJobStats[metric][scope], &schema.ScopedStats{
Hostname: query.Hostname,
ID: id,
Data: &schema.MetricStatistics{
Avg: float64(res.Avg),
Min: float64(res.Min),
Max: float64(res.Max),
},
})
}
// So that one can later check len(scopedJobStats[metric][scope]): Remove from map if empty
if len(scopedJobStats[metric][scope]) == 0 {
delete(scopedJobStats[metric], scope)
if len(scopedJobStats[metric]) == 0 {
delete(scopedJobStats, metric)
}
}
}
if len(errors) != 0 {
/* Returns list for "partial errors" */
return scopedJobStats, fmt.Errorf("METRICDATA/EXTERNAL-CCMS > Errors: %s", strings.Join(errors, ", "))
}
return scopedJobStats, nil
}
// LoadNodeData retrieves current metric data for specified nodes in a cluster.
// Used for the Systems-View Node-Overview to display real-time node status.
//
// If nodes is nil, queries all metrics for all nodes in the cluster (bulk query).
// Returns data organized as: hostname -> metric -> list of JobMetric (with time series and stats).
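//
// An illustrative bulk query for all nodes of a cluster (names are placeholders):
//
// nodeData, err := store.LoadNodeData("testcluster", []string{"cpu_load"},
// nil, // nil node list triggers the "for-all-nodes" bulk query
// []schema.MetricScope{schema.MetricScopeNode}, from, to, ctx)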
func (ccms *CCMetricStore) LoadNodeData(
cluster string,
metrics, nodes []string,
scopes []schema.MetricScope,
from, to time.Time,
ctx context.Context,
) (map[string]map[string][]*schema.JobMetric, error) {
req := APIQueryRequest{
Cluster: cluster,
From: from.Unix(),
To: to.Unix(),
WithStats: true,
WithData: true,
}
if nodes == nil {
req.ForAllNodes = append(req.ForAllNodes, metrics...)
} else {
for _, node := range nodes {
for _, metric := range metrics {
req.Queries = append(req.Queries, APIQuery{
Hostname: node,
Metric: metric,
Resolution: 0, // Default for node queries: data is returned at the metric's native timestep resolution
})
}
}
}
resBody, err := ccms.doRequest(ctx, &req)
if err != nil {
cclog.Errorf("Error while performing request for cluster %s: %s", cluster, err.Error())
return nil, err
}
var errors []string
data := make(map[string]map[string][]*schema.JobMetric)
for i, res := range resBody.Results {
if len(res) == 0 {
// No data found for this metric; a warning is logged in FetchData
continue
}
var query APIQuery
if resBody.Queries != nil {
query = resBody.Queries[i]
} else {
query = req.Queries[i]
}
metric := query.Metric
qdata := res[0]
if qdata.Error != nil {
errors = append(errors, fmt.Sprintf("fetching %s for node %s failed: %s", metric, query.Hostname, *qdata.Error))
continue
}
mc := archive.GetMetricConfig(cluster, metric)
if mc == nil {
cclog.Warnf("Metric config not found for %s on cluster %s", metric, cluster)
continue
}
ms.SanitizeStats(&qdata.Avg, &qdata.Min, &qdata.Max)
hostdata, ok := data[query.Hostname]
if !ok {
hostdata = make(map[string][]*schema.JobMetric)
data[query.Hostname] = hostdata
}
hostdata[metric] = append(hostdata[metric], &schema.JobMetric{
Unit: mc.Unit,
Timestep: mc.Timestep,
Series: []schema.Series{
{
Hostname: query.Hostname,
Data: qdata.Data,
Statistics: schema.MetricStatistics{
Avg: float64(qdata.Avg),
Min: float64(qdata.Min),
Max: float64(qdata.Max),
},
},
},
})
}
if len(errors) != 0 {
/* Returns list of "partial errors" */
return data, fmt.Errorf("METRICDATA/EXTERNAL-CCMS > Errors: %s", strings.Join(errors, ", "))
}
return data, nil
}
// LoadNodeListData retrieves node metrics for the Systems-View Node-List.
//
// Filtering by subcluster and node name, as well as pagination, happen before
// this call: the caller supplies the already selected node list.
//
// Returns:
// - Node data organized as: hostname -> JobData (metric -> scope -> series)
// - Error (may be a partial error with some data returned)
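//
// An illustrative call for one page of nodes (names are placeholders):
//
// data, err := store.LoadNodeListData("testcluster", "main",
// []string{"node01", "node02"}, metrics, scopes, 60, from, to, ctx)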
func (ccms *CCMetricStore) LoadNodeListData(
cluster, subCluster string,
nodes []string,
metrics []string,
scopes []schema.MetricScope,
resolution int,
from, to time.Time,
ctx context.Context,
) (map[string]schema.JobData, error) {
queries, assignedScope, err := ccms.buildNodeQueries(cluster, subCluster, nodes, metrics, scopes, resolution)
if err != nil {
cclog.Errorf("Error while building node queries for Cluster %s, SubCluster %s, Metrics %v, Scopes %v: %s", cluster, subCluster, metrics, scopes, err.Error())
return nil, err
}
// Verify assignment is correct - log any inconsistencies for debugging
if len(queries) != len(assignedScope) {
cclog.Errorf("Critical error: queries and assignedScope have different lengths after buildNodeQueries: %d vs %d",
len(queries), len(assignedScope))
}
req := APIQueryRequest{
Cluster: cluster,
Queries: queries,
From: from.Unix(),
To: to.Unix(),
WithStats: true,
WithData: true,
}
resBody, err := ccms.doRequest(ctx, &req)
if err != nil {
cclog.Errorf("Error while performing request for cluster %s: %s", cluster, err.Error())
return nil, err
}
var errors []string
data := make(map[string]schema.JobData)
// Add safety check for index out of range issues
if len(resBody.Results) != len(req.Queries) || len(assignedScope) != len(req.Queries) {
cclog.Warnf("Mismatch in query results count: queries=%d, results=%d, assignedScope=%d",
len(req.Queries), len(resBody.Results), len(assignedScope))
if len(resBody.Results) > len(req.Queries) {
resBody.Results = resBody.Results[:len(req.Queries)]
}
if len(assignedScope) > len(req.Queries) {
assignedScope = assignedScope[:len(req.Queries)]
}
}
for i, row := range resBody.Results {
var query APIQuery
if resBody.Queries != nil {
if i < len(resBody.Queries) {
query = resBody.Queries[i]
} else {
cclog.Warnf("Index out of range prevented for resBody.Queries: i=%d, len=%d",
i, len(resBody.Queries))
continue
}
} else {
query = req.Queries[i]
}
metric := query.Metric
scope := assignedScope[i]
mc := archive.GetMetricConfig(cluster, metric)
if mc == nil {
cclog.Warnf("Metric config not found for %s on cluster %s", metric, cluster)
continue
}
res := mc.Timestep
if len(row) > 0 {
res = row[0].Resolution
}
// Initialize nested map data structures if not present
hostData, ok := data[query.Hostname]
if !ok {
hostData = make(schema.JobData)
data[query.Hostname] = hostData
}
metricData, ok := hostData[metric]
if !ok {
metricData = make(map[schema.MetricScope]*schema.JobMetric)
data[query.Hostname][metric] = metricData
}
scopeData, ok := metricData[scope]
if !ok {
scopeData = &schema.JobMetric{
Unit: mc.Unit,
Timestep: res,
Series: make([]schema.Series, 0),
}
data[query.Hostname][metric][scope] = scopeData
}
for ndx, res := range row {
if res.Error != nil {
/* Build list for "partial errors", if any */
errors = append(errors, fmt.Sprintf("failed to fetch '%s' from host '%s': %s", query.Metric, query.Hostname, *res.Error))
continue
}
id := ms.ExtractTypeID(query.Type, query.TypeIds, ndx, query.Metric, query.Hostname)
ms.SanitizeStats(&res.Avg, &res.Min, &res.Max)
scopeData.Series = append(scopeData.Series, schema.Series{
Hostname: query.Hostname,
ID: id,
Statistics: schema.MetricStatistics{
Avg: float64(res.Avg),
Min: float64(res.Min),
Max: float64(res.Max),
},
Data: res.Data,
})
}
}
if len(errors) != 0 {
/* Returns list of "partial errors" */
return data, fmt.Errorf("METRICDATA/EXTERNAL-CCMS > Errors: %s", strings.Join(errors, ", "))
}
return data, nil
}
// HealthCheck queries the external cc-metric-store's health check endpoint.
// It sends a HealthCheckReq as the request body to /api/healthcheck and
// returns the per-node health check results.
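//
// An illustrative call (cluster, node, and metric names are placeholders):
//
// results, err := store.HealthCheck("testcluster",
// []string{"node01"}, []string{"cpu_load"})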
func (ccms *CCMetricStore) HealthCheck(cluster string,
nodes []string, metrics []string,
) (map[string]ms.HealthCheckResult, error) {
req := ms.HealthCheckReq{
Cluster: cluster,
Nodes: nodes,
MetricNames: metrics,
}
buf := &bytes.Buffer{}
if err := json.NewEncoder(buf).Encode(req); err != nil {
cclog.Errorf("Error while encoding health check request body: %s", err.Error())
return nil, err
}
endpoint := fmt.Sprintf("%s/api/healthcheck", ccms.url)
httpReq, err := http.NewRequest(http.MethodGet, endpoint, buf)
if err != nil {
cclog.Errorf("Error while building health check request: %s", err.Error())
return nil, err
}
if ccms.jwt != "" {
httpReq.Header.Add("Authorization", fmt.Sprintf("Bearer %s", ccms.jwt))
}
res, err := ccms.client.Do(httpReq)
if err != nil {
cclog.Errorf("Error while performing health check request: %s", err.Error())
return nil, err
}
defer res.Body.Close()
if res.StatusCode != http.StatusOK {
return nil, fmt.Errorf("'%s': HTTP Status: %s", endpoint, res.Status)
}
var results map[string]ms.HealthCheckResult
if err := json.NewDecoder(bufio.NewReader(res.Body)).Decode(&results); err != nil {
cclog.Errorf("Error while decoding health check response: %s", err.Error())
return nil, err
}
return results, nil
}
// hasNaNStats returns true if any of the statistics contain NaN values.
func hasNaNStats(avg, min, max schema.Float) bool {
return avg.IsNaN() || min.IsNaN() || max.IsNaN()
}

View File

@@ -12,7 +12,7 @@ import (
"sync"
"time"
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
"github.com/jmoiron/sqlx"
"github.com/mattn/go-sqlite3"
"github.com/qustavo/sqlhooks/v2"
@@ -51,7 +51,7 @@ func setupSqlite(db *sql.DB) error {
return nil
}
func Connect(driver string, db string) {
func Connect(db string) {
var err error
var dbHandle *sqlx.DB
@@ -64,39 +64,31 @@ func Connect(driver string, db string) {
ConnectionMaxIdleTime: repoConfig.ConnectionMaxIdleTime,
}
switch driver {
case "sqlite3":
// TODO: Have separate DB handles for Writes and Reads
// Optimize SQLite connection: https://kerkour.com/sqlite-for-servers
connectionURLParams := make(url.Values)
connectionURLParams.Add("_txlock", "immediate")
connectionURLParams.Add("_journal_mode", "WAL")
connectionURLParams.Add("_busy_timeout", "5000")
connectionURLParams.Add("_synchronous", "NORMAL")
connectionURLParams.Add("_cache_size", "1000000000")
connectionURLParams.Add("_foreign_keys", "true")
opts.URL = fmt.Sprintf("file:%s?%s", opts.URL, connectionURLParams.Encode())
// TODO: Have separate DB handles for Writes and Reads
// Optimize SQLite connection: https://kerkour.com/sqlite-for-servers
connectionURLParams := make(url.Values)
connectionURLParams.Add("_txlock", "immediate")
connectionURLParams.Add("_journal_mode", "WAL")
connectionURLParams.Add("_busy_timeout", "5000")
connectionURLParams.Add("_synchronous", "NORMAL")
connectionURLParams.Add("_cache_size", "1000000000")
connectionURLParams.Add("_foreign_keys", "true")
opts.URL = fmt.Sprintf("file:%s?%s", opts.URL, connectionURLParams.Encode())
if cclog.Loglevel() == "debug" {
sql.Register("sqlite3WithHooks", sqlhooks.Wrap(&sqlite3.SQLiteDriver{}, &Hooks{}))
dbHandle, err = sqlx.Open("sqlite3WithHooks", opts.URL)
} else {
dbHandle, err = sqlx.Open("sqlite3", opts.URL)
}
err = setupSqlite(dbHandle.DB)
if err != nil {
cclog.Abortf("Failed sqlite db setup.\nError: %s\n", err.Error())
}
case "mysql":
opts.URL += "?multiStatements=true"
dbHandle, err = sqlx.Open("mysql", opts.URL)
default:
cclog.Abortf("DB Connection: Unsupported database driver '%s'.\n", driver)
if cclog.Loglevel() == "debug" {
sql.Register("sqlite3WithHooks", sqlhooks.Wrap(&sqlite3.SQLiteDriver{}, &Hooks{}))
dbHandle, err = sqlx.Open("sqlite3WithHooks", opts.URL)
} else {
dbHandle, err = sqlx.Open("sqlite3", opts.URL)
}
if err != nil {
cclog.Abortf("DB Connection: Could not connect to '%s' database with sqlx.Open().\nError: %s\n", driver, err.Error())
cclog.Abortf("DB Connection: Could not connect to SQLite database with sqlx.Open().\nError: %s\n", err.Error())
}
err = setupSqlite(dbHandle.DB)
if err != nil {
cclog.Abortf("Failed sqlite db setup.\nError: %s\n", err.Error())
}
dbHandle.SetMaxOpenConns(opts.MaxOpenConnections)
@@ -104,8 +96,8 @@ func Connect(driver string, db string) {
dbHandle.SetConnMaxLifetime(opts.ConnectionMaxLifetime)
dbHandle.SetConnMaxIdleTime(opts.ConnectionMaxIdleTime)
dbConnInstance = &DBConnection{DB: dbHandle, Driver: driver}
err = checkDBVersion(driver, dbHandle.DB)
dbConnInstance = &DBConnection{DB: dbHandle}
err = checkDBVersion(dbHandle.DB)
if err != nil {
cclog.Abortf("DB Connection: Failed DB version check.\nError: %s\n", err.Error())
}
@@ -119,3 +111,26 @@ func GetConnection() *DBConnection {
return dbConnInstance
}
// ResetConnection closes the current database connection and resets the connection state.
// This function is intended for testing purposes only to allow test isolation.
func ResetConnection() error {
if dbConnInstance != nil && dbConnInstance.DB != nil {
if err := dbConnInstance.DB.Close(); err != nil {
return fmt.Errorf("failed to close database connection: %w", err)
}
}
dbConnInstance = nil
dbConnOnce = sync.Once{}
jobRepoInstance = nil
jobRepoOnce = sync.Once{}
nodeRepoInstance = nil
nodeRepoOnce = sync.Once{}
userRepoInstance = nil
userRepoOnce = sync.Once{}
userCfgRepoInstance = nil
userCfgRepoOnce = sync.Once{}
return nil
}

View File

@@ -2,13 +2,14 @@
// All rights reserved. This file is part of cc-backend.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
package repository
import (
"context"
"time"
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
)
// Hooks satisfies the sqlhook.Hooks interface

View File

@@ -0,0 +1,274 @@
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved. This file is part of cc-backend.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
package repository
import (
"context"
"testing"
"time"
"github.com/ClusterCockpit/cc-lib/v2/schema"
_ "github.com/mattn/go-sqlite3"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
)
type MockJobHook struct {
startCalled bool
stopCalled bool
startJobs []*schema.Job
stopJobs []*schema.Job
}
func (m *MockJobHook) JobStartCallback(job *schema.Job) {
m.startCalled = true
m.startJobs = append(m.startJobs, job)
}
func (m *MockJobHook) JobStopCallback(job *schema.Job) {
m.stopCalled = true
m.stopJobs = append(m.stopJobs, job)
}
func TestRegisterJobHook(t *testing.T) {
t.Run("register single hook", func(t *testing.T) {
hooks = nil
mock := &MockJobHook{}
RegisterJobHook(mock)
assert.NotNil(t, hooks)
assert.Len(t, hooks, 1)
assert.Equal(t, mock, hooks[0])
hooks = nil
})
t.Run("register multiple hooks", func(t *testing.T) {
hooks = nil
mock1 := &MockJobHook{}
mock2 := &MockJobHook{}
RegisterJobHook(mock1)
RegisterJobHook(mock2)
assert.Len(t, hooks, 2)
assert.Equal(t, mock1, hooks[0])
assert.Equal(t, mock2, hooks[1])
hooks = nil
})
t.Run("register nil hook does not add to hooks", func(t *testing.T) {
hooks = nil
RegisterJobHook(nil)
if hooks != nil {
assert.Len(t, hooks, 0, "Nil hook should not be added")
}
hooks = nil
})
}
func TestCallJobStartHooks(t *testing.T) {
t.Run("call start hooks with single job", func(t *testing.T) {
hooks = nil
mock := &MockJobHook{}
RegisterJobHook(mock)
job := &schema.Job{
JobID: 123,
User: "testuser",
Cluster: "testcluster",
}
CallJobStartHooks([]*schema.Job{job})
assert.True(t, mock.startCalled)
assert.False(t, mock.stopCalled)
assert.Len(t, mock.startJobs, 1)
assert.Equal(t, int64(123), mock.startJobs[0].JobID)
hooks = nil
})
t.Run("call start hooks with multiple jobs", func(t *testing.T) {
hooks = nil
mock := &MockJobHook{}
RegisterJobHook(mock)
jobs := []*schema.Job{
{JobID: 1, User: "user1", Cluster: "cluster1"},
{JobID: 2, User: "user2", Cluster: "cluster2"},
{JobID: 3, User: "user3", Cluster: "cluster3"},
}
CallJobStartHooks(jobs)
assert.True(t, mock.startCalled)
assert.Len(t, mock.startJobs, 3)
assert.Equal(t, int64(1), mock.startJobs[0].JobID)
assert.Equal(t, int64(2), mock.startJobs[1].JobID)
assert.Equal(t, int64(3), mock.startJobs[2].JobID)
hooks = nil
})
t.Run("call start hooks with multiple registered hooks", func(t *testing.T) {
hooks = nil
mock1 := &MockJobHook{}
mock2 := &MockJobHook{}
RegisterJobHook(mock1)
RegisterJobHook(mock2)
job := &schema.Job{
JobID: 456, User: "testuser", Cluster: "testcluster",
}
CallJobStartHooks([]*schema.Job{job})
assert.True(t, mock1.startCalled)
assert.True(t, mock2.startCalled)
assert.Len(t, mock1.startJobs, 1)
assert.Len(t, mock2.startJobs, 1)
hooks = nil
})
t.Run("call start hooks with nil hooks", func(t *testing.T) {
hooks = nil
job := &schema.Job{
JobID: 789, User: "testuser", Cluster: "testcluster",
}
CallJobStartHooks([]*schema.Job{job})
hooks = nil
})
t.Run("call start hooks with empty job list", func(t *testing.T) {
hooks = nil
mock := &MockJobHook{}
RegisterJobHook(mock)
CallJobStartHooks([]*schema.Job{})
assert.False(t, mock.startCalled)
assert.Len(t, mock.startJobs, 0)
hooks = nil
})
}
func TestCallJobStopHooks(t *testing.T) {
t.Run("call stop hooks with single job", func(t *testing.T) {
hooks = nil
mock := &MockJobHook{}
RegisterJobHook(mock)
job := &schema.Job{
JobID: 123,
User: "testuser",
Cluster: "testcluster",
}
CallJobStopHooks(job)
assert.True(t, mock.stopCalled)
assert.False(t, mock.startCalled)
assert.Len(t, mock.stopJobs, 1)
assert.Equal(t, int64(123), mock.stopJobs[0].JobID)
hooks = nil
})
t.Run("call stop hooks with multiple registered hooks", func(t *testing.T) {
hooks = nil
mock1 := &MockJobHook{}
mock2 := &MockJobHook{}
RegisterJobHook(mock1)
RegisterJobHook(mock2)
job := &schema.Job{
JobID: 456, User: "testuser", Cluster: "testcluster",
}
CallJobStopHooks(job)
assert.True(t, mock1.stopCalled)
assert.True(t, mock2.stopCalled)
assert.Len(t, mock1.stopJobs, 1)
assert.Len(t, mock2.stopJobs, 1)
hooks = nil
})
t.Run("call stop hooks with nil hooks", func(t *testing.T) {
hooks = nil
job := &schema.Job{
JobID: 789, User: "testuser", Cluster: "testcluster",
}
CallJobStopHooks(job)
hooks = nil
})
}
func TestSQLHooks(t *testing.T) {
_ = setup(t)
t.Run("hooks log queries in debug mode", func(t *testing.T) {
h := &Hooks{}
ctx := context.Background()
query := "SELECT * FROM job WHERE job_id = ?"
args := []any{123}
ctxWithTime, err := h.Before(ctx, query, args...)
require.NoError(t, err)
assert.NotNil(t, ctxWithTime)
beginTime := ctxWithTime.Value("begin")
require.NotNil(t, beginTime)
_, ok := beginTime.(time.Time)
assert.True(t, ok, "Begin time should be time.Time")
time.Sleep(10 * time.Millisecond)
ctxAfter, err := h.After(ctxWithTime, query, args...)
require.NoError(t, err)
assert.NotNil(t, ctxAfter)
})
}
func TestHookIntegration(t *testing.T) {
t.Run("hooks are called during job lifecycle", func(t *testing.T) {
hooks = nil
mock := &MockJobHook{}
RegisterJobHook(mock)
job := &schema.Job{
JobID: 999,
User: "integrationuser",
Cluster: "integrationcluster",
}
CallJobStartHooks([]*schema.Job{job})
assert.True(t, mock.startCalled)
assert.Equal(t, 1, len(mock.startJobs))
CallJobStopHooks(job)
assert.True(t, mock.stopCalled)
assert.Equal(t, 1, len(mock.stopJobs))
assert.Equal(t, mock.startJobs[0].JobID, mock.stopJobs[0].JobID)
hooks = nil
})
}

File diff suppressed because it is too large

View File

@@ -2,14 +2,15 @@
// All rights reserved. This file is part of cc-backend.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
package repository
import (
"encoding/json"
"fmt"
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
"github.com/ClusterCockpit/cc-lib/schema"
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
"github.com/ClusterCockpit/cc-lib/v2/schema"
sq "github.com/Masterminds/squirrel"
)
@@ -29,6 +30,27 @@ const NamedJobInsert string = `INSERT INTO job (
:shared, :monitoring_status, :smt, :job_state, :start_time, :duration, :walltime, :footprint, :energy, :energy_footprint, :resources, :meta_data
);`
// InsertJobDirect inserts a job directly into the job table (not job_cache).
// Use this when the returned ID will be used for operations on the job table
// (e.g., adding tags), or for imported jobs that are already completed.
func (r *JobRepository) InsertJobDirect(job *schema.Job) (int64, error) {
r.Mutex.Lock()
defer r.Mutex.Unlock()
res, err := r.DB.NamedExec(NamedJobInsert, job)
if err != nil {
cclog.Warn("Error while NamedJobInsert (direct)")
return 0, err
}
id, err := res.LastInsertId()
if err != nil {
cclog.Warn("Error while getting last insert ID (direct)")
return 0, err
}
return id, nil
}
func (r *JobRepository) InsertJob(job *schema.Job) (int64, error) {
r.Mutex.Lock()
defer r.Mutex.Unlock()
@@ -70,8 +92,9 @@ func (r *JobRepository) SyncJobs() ([]*schema.Job, error) {
jobs = append(jobs, job)
}
// Use INSERT OR IGNORE to skip jobs already transferred by the stop path
_, err = r.DB.Exec(
"INSERT INTO job (job_id, cluster, subcluster, start_time, hpc_user, project, cluster_partition, array_job_id, num_nodes, num_hwthreads, num_acc, shared, monitoring_status, smt, job_state, duration, walltime, footprint, energy, energy_footprint, resources, meta_data) SELECT job_id, cluster, subcluster, start_time, hpc_user, project, cluster_partition, array_job_id, num_nodes, num_hwthreads, num_acc, shared, monitoring_status, smt, job_state, duration, walltime, footprint, energy, energy_footprint, resources, meta_data FROM job_cache")
"INSERT OR IGNORE INTO job (job_id, cluster, subcluster, start_time, hpc_user, project, cluster_partition, array_job_id, num_nodes, num_hwthreads, num_acc, shared, monitoring_status, smt, job_state, duration, walltime, footprint, energy, energy_footprint, resources, meta_data) SELECT job_id, cluster, subcluster, start_time, hpc_user, project, cluster_partition, array_job_id, num_nodes, num_hwthreads, num_acc, shared, monitoring_status, smt, job_state, duration, walltime, footprint, energy, energy_footprint, resources, meta_data FROM job_cache")
if err != nil {
cclog.Warnf("Error while Job sync: %v", err)
return nil, err
@@ -83,9 +106,48 @@ func (r *JobRepository) SyncJobs() ([]*schema.Job, error) {
return nil, err
}
// Resolve correct job.id from the job table. The IDs read from job_cache
// are from a different auto-increment sequence and must not be used to
// query the job table.
for _, job := range jobs {
var newID int64
if err := sq.Select("job.id").From("job").
Where("job.job_id = ? AND job.cluster = ? AND job.start_time = ?",
job.JobID, job.Cluster, job.StartTime).
RunWith(r.stmtCache).QueryRow().Scan(&newID); err != nil {
cclog.Warnf("SyncJobs: could not resolve job table id for job %d on %s: %v",
job.JobID, job.Cluster, err)
continue
}
job.ID = &newID
}
return jobs, nil
}
// TransferCachedJobToMain moves a job from job_cache to the job table.
// Caller must hold r.Mutex. Returns the new job table ID.
func (r *JobRepository) TransferCachedJobToMain(cacheID int64) (int64, error) {
res, err := r.DB.Exec(
"INSERT INTO job (job_id, cluster, subcluster, start_time, hpc_user, project, cluster_partition, array_job_id, num_nodes, num_hwthreads, num_acc, shared, monitoring_status, smt, job_state, duration, walltime, footprint, energy, energy_footprint, resources, meta_data) SELECT job_id, cluster, subcluster, start_time, hpc_user, project, cluster_partition, array_job_id, num_nodes, num_hwthreads, num_acc, shared, monitoring_status, smt, job_state, duration, walltime, footprint, energy, energy_footprint, resources, meta_data FROM job_cache WHERE id = ?",
cacheID)
if err != nil {
return 0, fmt.Errorf("transferring cached job %d to main table failed: %w", cacheID, err)
}
newID, err := res.LastInsertId()
if err != nil {
return 0, fmt.Errorf("getting new job ID after transfer failed: %w", err)
}
_, err = r.DB.Exec("DELETE FROM job_cache WHERE id = ?", cacheID)
if err != nil {
return 0, fmt.Errorf("deleting cached job %d after transfer failed: %w", cacheID, err)
}
return newID, nil
}
// Start inserts a new job in the table, returning the unique job ID.
// Statistics are not transferred!
func (r *JobRepository) Start(job *schema.Job) (id int64, err error) {
@@ -107,41 +169,46 @@ func (r *JobRepository) Start(job *schema.Job) (id int64, err error) {
return r.InsertJob(job)
}
// StartDirect inserts a new job directly into the job table (not job_cache).
// Use this when the returned ID will immediately be used for job table
// operations such as adding tags.
func (r *JobRepository) StartDirect(job *schema.Job) (id int64, err error) {
job.RawFootprint, err = json.Marshal(job.Footprint)
if err != nil {
return -1, fmt.Errorf("REPOSITORY/JOB > encoding footprint field failed: %w", err)
}
job.RawResources, err = json.Marshal(job.Resources)
if err != nil {
return -1, fmt.Errorf("REPOSITORY/JOB > encoding resources field failed: %w", err)
}
job.RawMetaData, err = json.Marshal(job.MetaData)
if err != nil {
return -1, fmt.Errorf("REPOSITORY/JOB > encoding metaData field failed: %w", err)
}
return r.InsertJobDirect(job)
}
// Stop updates the job with the database id jobId using the provided arguments.
func (r *JobRepository) Stop(
jobId int64,
jobID int64,
duration int32,
state schema.JobState,
monitoringStatus int32,
) (err error) {
// Invalidate cache entries as job state is changing
r.cache.Del(fmt.Sprintf("metadata:%d", jobId))
r.cache.Del(fmt.Sprintf("energyFootprint:%d", jobId))
r.cache.Del(fmt.Sprintf("metadata:%d", jobID))
r.cache.Del(fmt.Sprintf("energyFootprint:%d", jobID))
stmt := sq.Update("job").
Set("job_state", state).
Set("duration", duration).
Set("monitoring_status", monitoringStatus).
Where("job.id = ?", jobId)
Where("job.id = ?", jobID)
_, err = stmt.RunWith(r.stmtCache).Exec()
return err
}
func (r *JobRepository) StopCached(
jobId int64,
duration int32,
state schema.JobState,
monitoringStatus int32,
) (err error) {
// Note: StopCached updates job_cache table, not the main job table
// Cache invalidation happens when job is synced to main table
stmt := sq.Update("job_cache").
Set("job_state", state).
Set("duration", duration).
Set("monitoring_status", monitoringStatus).
Where("job_cache.id = ?", jobId)
_, err = stmt.RunWith(r.stmtCache).Exec()
return err
}

View File

@@ -0,0 +1,607 @@
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved. This file is part of cc-backend.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
package repository
import (
"encoding/json"
"testing"
"github.com/ClusterCockpit/cc-lib/v2/schema"
_ "github.com/mattn/go-sqlite3"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
)
// createTestJob creates a minimal valid job for testing
func createTestJob(jobID int64, cluster string) *schema.Job {
return &schema.Job{
JobID: jobID,
User: "testuser",
Project: "testproject",
Cluster: cluster,
SubCluster: "main",
Partition: "batch",
NumNodes: 1,
NumHWThreads: 4,
NumAcc: 0,
Shared: "none",
MonitoringStatus: schema.MonitoringStatusRunningOrArchiving,
SMT: 1,
State: schema.JobStateRunning,
StartTime: 1234567890,
Duration: 0,
Walltime: 3600,
Resources: []*schema.Resource{
{
Hostname: "node01",
HWThreads: []int{0, 1, 2, 3},
},
},
Footprint: map[string]float64{
"cpu_load": 50.0,
"mem_used": 8000.0,
"flops_any": 0.5,
"mem_bw": 10.0,
"net_bw": 2.0,
"file_bw": 1.0,
"cpu_used": 2.0,
"cpu_load_core": 12.5,
},
MetaData: map[string]string{
"jobName": "test_job",
"queue": "normal",
"qosName": "default",
"accountName": "testaccount",
},
}
}
func TestInsertJob(t *testing.T) {
r := setup(t)
t.Run("successful insertion", func(t *testing.T) {
job := createTestJob(999001, "testcluster")
job.RawResources, _ = json.Marshal(job.Resources)
job.RawFootprint, _ = json.Marshal(job.Footprint)
job.RawMetaData, _ = json.Marshal(job.MetaData)
id, err := r.InsertJob(job)
require.NoError(t, err, "InsertJob should succeed")
assert.Greater(t, id, int64(0), "Should return valid insert ID")
// Verify job was inserted into job_cache
var count int
err = r.DB.QueryRow("SELECT COUNT(*) FROM job_cache WHERE job_id = ? AND cluster = ?",
job.JobID, job.Cluster).Scan(&count)
require.NoError(t, err)
assert.Equal(t, 1, count, "Job should be in job_cache table")
// Clean up
_, err = r.DB.Exec("DELETE FROM job_cache WHERE job_id = ? AND cluster = ?", job.JobID, job.Cluster)
require.NoError(t, err)
})
t.Run("insertion with all fields", func(t *testing.T) {
job := createTestJob(999002, "testcluster")
job.ArrayJobID = 5000
job.Energy = 1500.5
job.RawResources, _ = json.Marshal(job.Resources)
job.RawFootprint, _ = json.Marshal(job.Footprint)
job.RawMetaData, _ = json.Marshal(job.MetaData)
id, err := r.InsertJob(job)
require.NoError(t, err)
assert.Greater(t, id, int64(0))
// Verify all fields were stored correctly
var retrievedJob schema.Job
err = r.DB.QueryRow(`SELECT job_id, hpc_user, project, cluster, array_job_id, energy
FROM job_cache WHERE id = ?`, id).Scan(
&retrievedJob.JobID, &retrievedJob.User, &retrievedJob.Project,
&retrievedJob.Cluster, &retrievedJob.ArrayJobID, &retrievedJob.Energy)
require.NoError(t, err)
assert.Equal(t, job.JobID, retrievedJob.JobID)
assert.Equal(t, job.User, retrievedJob.User)
assert.Equal(t, job.Project, retrievedJob.Project)
assert.Equal(t, job.Cluster, retrievedJob.Cluster)
assert.Equal(t, job.ArrayJobID, retrievedJob.ArrayJobID)
assert.Equal(t, job.Energy, retrievedJob.Energy)
// Clean up
_, err = r.DB.Exec("DELETE FROM job_cache WHERE id = ?", id)
require.NoError(t, err)
})
}
func TestStart(t *testing.T) {
r := setup(t)
t.Run("successful job start with JSON encoding", func(t *testing.T) {
job := createTestJob(999003, "testcluster")
id, err := r.Start(job)
require.NoError(t, err, "Start should succeed")
assert.Greater(t, id, int64(0), "Should return valid insert ID")
// Verify job was inserted and JSON fields were encoded
var rawResources, rawFootprint, rawMetaData []byte
err = r.DB.QueryRow(`SELECT resources, footprint, meta_data FROM job_cache WHERE id = ?`, id).Scan(
&rawResources, &rawFootprint, &rawMetaData)
require.NoError(t, err)
// Verify resources JSON
var resources []*schema.Resource
err = json.Unmarshal(rawResources, &resources)
require.NoError(t, err, "Resources should be valid JSON")
assert.Equal(t, 1, len(resources))
assert.Equal(t, "node01", resources[0].Hostname)
// Verify footprint JSON
var footprint map[string]float64
err = json.Unmarshal(rawFootprint, &footprint)
require.NoError(t, err, "Footprint should be valid JSON")
assert.Equal(t, 50.0, footprint["cpu_load"])
assert.Equal(t, 8000.0, footprint["mem_used"])
// Verify metadata JSON
var metaData map[string]string
err = json.Unmarshal(rawMetaData, &metaData)
require.NoError(t, err, "MetaData should be valid JSON")
assert.Equal(t, "test_job", metaData["jobName"])
// Clean up
_, err = r.DB.Exec("DELETE FROM job_cache WHERE id = ?", id)
require.NoError(t, err)
})
t.Run("job start with empty footprint", func(t *testing.T) {
job := createTestJob(999004, "testcluster")
job.Footprint = map[string]float64{}
id, err := r.Start(job)
require.NoError(t, err)
assert.Greater(t, id, int64(0))
// Verify empty footprint was encoded as empty JSON object
var rawFootprint []byte
err = r.DB.QueryRow(`SELECT footprint FROM job_cache WHERE id = ?`, id).Scan(&rawFootprint)
require.NoError(t, err)
assert.Equal(t, []byte("{}"), rawFootprint)
// Clean up
_, err = r.DB.Exec("DELETE FROM job_cache WHERE id = ?", id)
require.NoError(t, err)
})
t.Run("job start with nil metadata", func(t *testing.T) {
job := createTestJob(999005, "testcluster")
job.MetaData = nil
id, err := r.Start(job)
require.NoError(t, err)
assert.Greater(t, id, int64(0))
// Clean up
_, err = r.DB.Exec("DELETE FROM job_cache WHERE id = ?", id)
require.NoError(t, err)
})
}
func TestStop(t *testing.T) {
r := setup(t)
t.Run("successful job stop", func(t *testing.T) {
// First insert a job using Start
job := createTestJob(999106, "testcluster")
id, err := r.Start(job)
require.NoError(t, err)
// Move from job_cache to job table (simulate SyncJobs) - exclude id to let it auto-increment
_, err = r.DB.Exec(`INSERT INTO job (job_id, cluster, subcluster, submit_time, start_time, hpc_user, project,
cluster_partition, array_job_id, duration, walltime, job_state, meta_data, resources, num_nodes,
num_hwthreads, num_acc, smt, shared, monitoring_status, energy, energy_footprint, footprint)
SELECT job_id, cluster, subcluster, submit_time, start_time, hpc_user, project,
cluster_partition, array_job_id, duration, walltime, job_state, meta_data, resources, num_nodes,
num_hwthreads, num_acc, smt, shared, monitoring_status, energy, energy_footprint, footprint
FROM job_cache WHERE id = ?`, id)
require.NoError(t, err)
_, err = r.DB.Exec("DELETE FROM job_cache WHERE id = ?", id)
require.NoError(t, err)
// Get the new job id in the job table
err = r.DB.QueryRow("SELECT id FROM job WHERE job_id = ? AND cluster = ? AND start_time = ?",
job.JobID, job.Cluster, job.StartTime).Scan(&id)
require.NoError(t, err)
// Stop the job
duration := int32(3600)
state := schema.JobStateCompleted
monitoringStatus := int32(schema.MonitoringStatusArchivingSuccessful)
err = r.Stop(id, duration, state, monitoringStatus)
require.NoError(t, err, "Stop should succeed")
// Verify job was updated
var retrievedDuration int32
var retrievedState string
var retrievedMonStatus int32
err = r.DB.QueryRow(`SELECT duration, job_state, monitoring_status FROM job WHERE id = ?`, id).Scan(
&retrievedDuration, &retrievedState, &retrievedMonStatus)
require.NoError(t, err)
assert.Equal(t, duration, retrievedDuration)
assert.Equal(t, string(state), retrievedState)
assert.Equal(t, monitoringStatus, retrievedMonStatus)
// Clean up
_, err = r.DB.Exec("DELETE FROM job WHERE id = ?", id)
require.NoError(t, err)
})
t.Run("stop updates job state transitions", func(t *testing.T) {
// Insert a job
job := createTestJob(999107, "testcluster")
id, err := r.Start(job)
require.NoError(t, err)
// Move to job table
_, err = r.DB.Exec(`INSERT INTO job (job_id, cluster, subcluster, submit_time, start_time, hpc_user, project,
cluster_partition, array_job_id, duration, walltime, job_state, meta_data, resources, num_nodes,
num_hwthreads, num_acc, smt, shared, monitoring_status, energy, energy_footprint, footprint)
SELECT job_id, cluster, subcluster, submit_time, start_time, hpc_user, project,
cluster_partition, array_job_id, duration, walltime, job_state, meta_data, resources, num_nodes,
num_hwthreads, num_acc, smt, shared, monitoring_status, energy, energy_footprint, footprint
FROM job_cache WHERE id = ?`, id)
require.NoError(t, err)
_, err = r.DB.Exec("DELETE FROM job_cache WHERE id = ?", id)
require.NoError(t, err)
// Get the new job id in the job table
err = r.DB.QueryRow("SELECT id FROM job WHERE job_id = ? AND cluster = ? AND start_time = ?",
job.JobID, job.Cluster, job.StartTime).Scan(&id)
require.NoError(t, err)
// Stop the job with different duration
err = r.Stop(id, 7200, schema.JobStateCompleted, int32(schema.MonitoringStatusArchivingSuccessful))
require.NoError(t, err)
// Verify the duration was updated correctly
var duration int32
err = r.DB.QueryRow(`SELECT duration FROM job WHERE id = ?`, id).Scan(&duration)
require.NoError(t, err)
assert.Equal(t, int32(7200), duration, "Duration should be updated to 7200")
// Clean up
_, err = r.DB.Exec("DELETE FROM job WHERE id = ?", id)
require.NoError(t, err)
})
t.Run("stop with different states", func(t *testing.T) {
testCases := []struct {
name string
jobID int64
state schema.JobState
monitoringStatus int32
}{
{"completed", 999108, schema.JobStateCompleted, int32(schema.MonitoringStatusArchivingSuccessful)},
{"failed", 999118, schema.JobStateFailed, int32(schema.MonitoringStatusArchivingSuccessful)},
{"cancelled", 999119, schema.JobStateCancelled, int32(schema.MonitoringStatusArchivingSuccessful)},
{"timeout", 999120, schema.JobStateTimeout, int32(schema.MonitoringStatusArchivingSuccessful)},
}
for _, tc := range testCases {
t.Run(tc.name, func(t *testing.T) {
job := createTestJob(tc.jobID, "testcluster")
id, err := r.Start(job)
require.NoError(t, err)
// Move to job table
_, err = r.DB.Exec(`INSERT INTO job (job_id, cluster, subcluster, submit_time, start_time, hpc_user, project,
cluster_partition, array_job_id, duration, walltime, job_state, meta_data, resources, num_nodes,
num_hwthreads, num_acc, smt, shared, monitoring_status, energy, energy_footprint, footprint)
SELECT job_id, cluster, subcluster, submit_time, start_time, hpc_user, project,
cluster_partition, array_job_id, duration, walltime, job_state, meta_data, resources, num_nodes,
num_hwthreads, num_acc, smt, shared, monitoring_status, energy, energy_footprint, footprint
FROM job_cache WHERE id = ?`, id)
require.NoError(t, err)
_, err = r.DB.Exec("DELETE FROM job_cache WHERE id = ?", id)
require.NoError(t, err)
// Get the new job id in the job table
err = r.DB.QueryRow("SELECT id FROM job WHERE job_id = ? AND cluster = ? AND start_time = ?",
job.JobID, job.Cluster, job.StartTime).Scan(&id)
require.NoError(t, err)
// Stop with specific state
err = r.Stop(id, 1800, tc.state, tc.monitoringStatus)
require.NoError(t, err)
// Verify state was set correctly
var retrievedState string
err = r.DB.QueryRow(`SELECT job_state FROM job WHERE id = ?`, id).Scan(&retrievedState)
require.NoError(t, err)
assert.Equal(t, string(tc.state), retrievedState)
// Clean up
_, err = r.DB.Exec("DELETE FROM job WHERE id = ?", id)
require.NoError(t, err)
})
}
})
}
func TestTransferCachedJobToMain(t *testing.T) {
r := setup(t)
t.Run("successful transfer from cache to main", func(t *testing.T) {
// Insert a job in job_cache
job := createTestJob(999009, "testcluster")
cacheID, err := r.Start(job)
require.NoError(t, err)
// Transfer the cached job to the main table
r.Mutex.Lock()
newID, err := r.TransferCachedJobToMain(cacheID)
r.Mutex.Unlock()
require.NoError(t, err, "TransferCachedJobToMain should succeed")
assert.NotEqual(t, cacheID, newID, "New ID should differ from cache ID")
// Verify job exists in job table
var count int
err = r.DB.QueryRow(`SELECT COUNT(*) FROM job WHERE id = ?`, newID).Scan(&count)
require.NoError(t, err)
assert.Equal(t, 1, count, "Job should exist in main table")
// Verify job was removed from job_cache
err = r.DB.QueryRow(`SELECT COUNT(*) FROM job_cache WHERE id = ?`, cacheID).Scan(&count)
require.NoError(t, err)
assert.Equal(t, 0, count, "Job should be removed from cache")
// Clean up
_, err = r.DB.Exec("DELETE FROM job WHERE id = ?", newID)
require.NoError(t, err)
})
t.Run("transfer preserves job data", func(t *testing.T) {
// Insert a job in job_cache
job := createTestJob(999010, "testcluster")
cacheID, err := r.Start(job)
require.NoError(t, err)
// Transfer the cached job
r.Mutex.Lock()
newID, err := r.TransferCachedJobToMain(cacheID)
r.Mutex.Unlock()
require.NoError(t, err)
// Verify the transferred job has the correct data
var jobID int64
var cluster string
err = r.DB.QueryRow(`SELECT job_id, cluster FROM job WHERE id = ?`, newID).Scan(&jobID, &cluster)
require.NoError(t, err)
assert.Equal(t, job.JobID, jobID)
assert.Equal(t, job.Cluster, cluster)
// Clean up
_, err = r.DB.Exec("DELETE FROM job WHERE id = ?", newID)
require.NoError(t, err)
})
}
func TestSyncJobs(t *testing.T) {
r := setup(t)
t.Run("sync jobs from cache to main table", func(t *testing.T) {
// Ensure cache is empty first
_, err := r.DB.Exec("DELETE FROM job_cache")
require.NoError(t, err)
// Insert multiple jobs in job_cache
job1 := createTestJob(999011, "testcluster")
job2 := createTestJob(999012, "testcluster")
job3 := createTestJob(999013, "testcluster")
_, err = r.Start(job1)
require.NoError(t, err)
_, err = r.Start(job2)
require.NoError(t, err)
_, err = r.Start(job3)
require.NoError(t, err)
// Verify jobs are in job_cache
var cacheCount int
err = r.DB.QueryRow("SELECT COUNT(*) FROM job_cache WHERE job_id IN (?, ?, ?)",
job1.JobID, job2.JobID, job3.JobID).Scan(&cacheCount)
require.NoError(t, err)
assert.Equal(t, 3, cacheCount, "All jobs should be in job_cache")
// Sync jobs
jobs, err := r.SyncJobs()
require.NoError(t, err, "SyncJobs should succeed")
assert.Equal(t, 3, len(jobs), "Should return 3 synced jobs")
// Verify jobs were moved to job table
var jobCount int
err = r.DB.QueryRow("SELECT COUNT(*) FROM job WHERE job_id IN (?, ?, ?)",
job1.JobID, job2.JobID, job3.JobID).Scan(&jobCount)
require.NoError(t, err)
assert.Equal(t, 3, jobCount, "All jobs should be in job table")
// Verify job_cache was cleared
err = r.DB.QueryRow("SELECT COUNT(*) FROM job_cache WHERE job_id IN (?, ?, ?)",
job1.JobID, job2.JobID, job3.JobID).Scan(&cacheCount)
require.NoError(t, err)
assert.Equal(t, 0, cacheCount, "job_cache should be empty after sync")
// Clean up
_, err = r.DB.Exec("DELETE FROM job WHERE job_id IN (?, ?, ?)", job1.JobID, job2.JobID, job3.JobID)
require.NoError(t, err)
})
t.Run("sync preserves job data", func(t *testing.T) {
// Ensure cache is empty first
_, err := r.DB.Exec("DELETE FROM job_cache")
require.NoError(t, err)
// Insert a job with specific data
job := createTestJob(999014, "testcluster")
job.ArrayJobID = 7777
job.Energy = 2500.75
job.Duration = 1800
id, err := r.Start(job)
require.NoError(t, err)
// Update some fields to simulate job progress
result, err := r.DB.Exec(`UPDATE job_cache SET duration = ?, energy = ? WHERE id = ?`,
3600, 3000.5, id)
require.NoError(t, err)
rowsAffected, _ := result.RowsAffected()
require.Equal(t, int64(1), rowsAffected, "UPDATE should affect exactly 1 row")
// Verify the update worked
var checkDuration int32
var checkEnergy float64
err = r.DB.QueryRow(`SELECT duration, energy FROM job_cache WHERE id = ?`, id).Scan(&checkDuration, &checkEnergy)
require.NoError(t, err)
require.Equal(t, int32(3600), checkDuration, "Duration should be updated to 3600 before sync")
require.Equal(t, 3000.5, checkEnergy, "Energy should be updated to 3000.5 before sync")
// Sync jobs
jobs, err := r.SyncJobs()
require.NoError(t, err)
require.Equal(t, 1, len(jobs), "Should return exactly 1 synced job")
// Verify in database
var dbJob schema.Job
err = r.DB.QueryRow(`SELECT job_id, hpc_user, project, cluster, array_job_id, duration, energy
FROM job WHERE job_id = ? AND cluster = ?`, job.JobID, job.Cluster).Scan(
&dbJob.JobID, &dbJob.User, &dbJob.Project, &dbJob.Cluster,
&dbJob.ArrayJobID, &dbJob.Duration, &dbJob.Energy)
require.NoError(t, err)
assert.Equal(t, job.JobID, dbJob.JobID)
assert.Equal(t, int32(3600), dbJob.Duration)
assert.Equal(t, 3000.5, dbJob.Energy)
// Clean up
_, err = r.DB.Exec("DELETE FROM job WHERE job_id = ? AND cluster = ?", job.JobID, job.Cluster)
require.NoError(t, err)
})
t.Run("sync returns job table IDs not cache IDs", func(t *testing.T) {
// Ensure cache is empty first
_, err := r.DB.Exec("DELETE FROM job_cache")
require.NoError(t, err)
// Insert a job into job_cache
job := createTestJob(999015, "testcluster")
cacheID, err := r.Start(job)
require.NoError(t, err)
// Sync jobs
jobs, err := r.SyncJobs()
require.NoError(t, err)
require.Equal(t, 1, len(jobs))
// The returned ID must refer to the job table, not job_cache
var jobTableID int64
err = r.DB.QueryRow("SELECT id FROM job WHERE job_id = ? AND cluster = ? AND start_time = ?",
jobs[0].JobID, jobs[0].Cluster, jobs[0].StartTime).Scan(&jobTableID)
require.NoError(t, err)
assert.Equal(t, jobTableID, *jobs[0].ID,
"returned ID should match the job table row, not the cache ID (%d)", cacheID)
// Clean up
_, err = r.DB.Exec("DELETE FROM job WHERE job_id = ? AND cluster = ?", job.JobID, job.Cluster)
require.NoError(t, err)
})
t.Run("sync with empty cache returns empty list", func(t *testing.T) {
// Ensure cache is empty
_, err := r.DB.Exec("DELETE FROM job_cache")
require.NoError(t, err)
// Sync should return empty list
jobs, err := r.SyncJobs()
require.NoError(t, err)
assert.Equal(t, 0, len(jobs), "Should return empty list when cache is empty")
})
}
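These subtests pin down the SyncJobs contract: cached rows move to the job table, field updates made while in the cache survive the move, and the returned IDs reference the destination table. A minimal sketch of a periodic driver a caller might run on top of this contract — the ticker interval, logging, and function name are illustrative, not part of this commit:

// runSyncLoop is a hypothetical caller of SyncJobs, assuming the
// repository and cclog imports used elsewhere in this codebase.
func runSyncLoop(ctx context.Context, r *repository.JobRepository) {
	ticker := time.NewTicker(time.Minute)
	defer ticker.Stop()
	for {
		select {
		case <-ctx.Done():
			return
		case <-ticker.C:
			jobs, err := r.SyncJobs()
			if err != nil {
				cclog.Errorf("job cache sync failed: %v", err)
				continue
			}
			// IDs in jobs now reference the job table, so they are
			// safe to pass to tag or archive operations.
			cclog.Debugf("synced %d jobs from job_cache", len(jobs))
		}
	}
}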
func TestInsertJobDirect(t *testing.T) {
r := setup(t)
t.Run("inserts into job table not cache", func(t *testing.T) {
job := createTestJob(999020, "testcluster")
job.RawResources, _ = json.Marshal(job.Resources)
job.RawFootprint, _ = json.Marshal(job.Footprint)
job.RawMetaData, _ = json.Marshal(job.MetaData)
id, err := r.InsertJobDirect(job)
require.NoError(t, err, "InsertJobDirect should succeed")
assert.Greater(t, id, int64(0), "Should return valid insert ID")
// Verify job is in job table
var count int
err = r.DB.QueryRow("SELECT COUNT(*) FROM job WHERE id = ?", id).Scan(&count)
require.NoError(t, err)
assert.Equal(t, 1, count, "Job should be in job table")
// Verify job is NOT in job_cache
err = r.DB.QueryRow("SELECT COUNT(*) FROM job_cache WHERE job_id = ? AND cluster = ?",
job.JobID, job.Cluster).Scan(&count)
require.NoError(t, err)
assert.Equal(t, 0, count, "Job should NOT be in job_cache")
// Clean up
_, err = r.DB.Exec("DELETE FROM job WHERE id = ?", id)
require.NoError(t, err)
})
t.Run("returned ID works for tag operations", func(t *testing.T) {
job := createTestJob(999021, "testcluster")
job.RawResources, _ = json.Marshal(job.Resources)
job.RawFootprint, _ = json.Marshal(job.Footprint)
job.RawMetaData, _ = json.Marshal(job.MetaData)
id, err := r.InsertJobDirect(job)
require.NoError(t, err)
// Adding a tag using the returned ID should succeed (FK constraint on jobtag)
err = r.ImportTag(id, "test_type", "test_name", "global")
require.NoError(t, err, "ImportTag should succeed with direct insert ID")
// Clean up
_, err = r.DB.Exec("DELETE FROM jobtag WHERE job_id = ?", id)
require.NoError(t, err)
_, err = r.DB.Exec("DELETE FROM job WHERE id = ?", id)
require.NoError(t, err)
})
}
func TestStartDirect(t *testing.T) {
r := setup(t)
t.Run("inserts into job table with JSON encoding", func(t *testing.T) {
job := createTestJob(999022, "testcluster")
id, err := r.StartDirect(job)
require.NoError(t, err, "StartDirect should succeed")
assert.Greater(t, id, int64(0))
// Verify job is in job table with encoded JSON
var rawResources []byte
err = r.DB.QueryRow("SELECT resources FROM job WHERE id = ?", id).Scan(&rawResources)
require.NoError(t, err)
var resources []*schema.Resource
err = json.Unmarshal(rawResources, &resources)
require.NoError(t, err, "Resources should be valid JSON")
assert.Equal(t, "node01", resources[0].Hostname)
// Clean up
_, err = r.DB.Exec("DELETE FROM job WHERE id = ?", id)
require.NoError(t, err)
})
}
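Together, TestInsertJobDirect and TestStartDirect document the split write path: Start buffers rows in job_cache until SyncJobs moves them, while the *Direct variants write straight to the job table so the returned ID immediately satisfies foreign keys such as jobtag.job_id. A hedged sketch of choosing a path — the fromImport flag and helper name are invented for illustration:

// startJob is hypothetical; only Start and StartDirect are part of the repository.
func startJob(r *repository.JobRepository, job *schema.Job, fromImport bool) (int64, error) {
	if fromImport {
		// Imports need a job-table ID right away (e.g. for ImportTag).
		return r.StartDirect(job)
	}
	// Normal path: buffer in job_cache; a later SyncJobs moves the row.
	return r.Start(job)
}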

View File

@@ -2,6 +2,7 @@
// All rights reserved. This file is part of cc-backend.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
package repository
import (
@@ -11,8 +12,8 @@ import (
"time"
"github.com/ClusterCockpit/cc-backend/internal/graph/model"
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
"github.com/ClusterCockpit/cc-lib/schema"
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
"github.com/ClusterCockpit/cc-lib/v2/schema"
sq "github.com/Masterminds/squirrel"
)
@@ -22,13 +23,17 @@ import (
// It returns a pointer to a schema.Job data structure and an error variable.
// To check if no job was found test err == sql.ErrNoRows
func (r *JobRepository) Find(
jobId *int64,
jobID *int64,
cluster *string,
startTime *int64,
) (*schema.Job, error) {
if jobID == nil {
return nil, fmt.Errorf("jobID cannot be nil")
}
start := time.Now()
q := sq.Select(jobColumns...).From("job").
Where("job.job_id = ?", *jobId)
Where("job.job_id = ?", *jobID)
if cluster != nil {
q = q.Where("job.cluster = ?", *cluster)
@@ -37,19 +42,29 @@ func (r *JobRepository) Find(
q = q.Where("job.start_time = ?", *startTime)
}
q = q.OrderBy("job.id DESC") // always use newest matching job by db id if more than one match
q = q.OrderBy("job.id DESC").Limit(1) // always use newest matching job by db id if more than one match
cclog.Debugf("Timer Find %s", time.Since(start))
return scanJob(q.RunWith(r.stmtCache).QueryRow())
}
// FindCached executes a SQL query to find a specific batch job from the job_cache table.
// The job is queried using the batch job id, and optionally filtered by cluster name
// and start time (UNIX epoch time seconds). This method uses cached job data which
// may be stale but provides faster access than Find().
// It returns a pointer to a schema.Job data structure and an error variable.
// To check if no job was found test err == sql.ErrNoRows
func (r *JobRepository) FindCached(
jobId *int64,
jobID *int64,
cluster *string,
startTime *int64,
) (*schema.Job, error) {
if jobID == nil {
return nil, fmt.Errorf("jobID cannot be nil")
}
q := sq.Select(jobCacheColumns...).From("job_cache").
Where("job_cache.job_id = ?", *jobId)
Where("job_cache.job_id = ?", *jobID)
if cluster != nil {
q = q.Where("job_cache.cluster = ?", *cluster)
@@ -58,24 +73,28 @@ func (r *JobRepository) FindCached(
q = q.Where("job_cache.start_time = ?", *startTime)
}
q = q.OrderBy("job_cache.id DESC") // always use newest matching job by db id if more than one match
q = q.OrderBy("job_cache.id DESC").Limit(1) // always use newest matching job by db id if more than one match
return scanJob(q.RunWith(r.stmtCache).QueryRow())
}
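Both finders now reject a nil jobID up front and cap the result with Limit(1). A sketch of the cache-then-main lookup a caller might build on top of them, keeping the documented err == sql.ErrNoRows convention (the fallback ordering is an assumption, not prescribed by this commit):

func lookupJob(r *repository.JobRepository, jobID int64, cluster string) (*schema.Job, error) {
	// Try the write buffer first: recently started jobs live in job_cache.
	job, err := r.FindCached(&jobID, &cluster, nil)
	if err == nil {
		return job, nil
	}
	if err != sql.ErrNoRows {
		return nil, err
	}
	// Not cached (or already synced): consult the main job table.
	return r.Find(&jobID, &cluster, nil)
}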
// Find executes a SQL query to find a specific batch job.
// The job is queried using the batch job id, the cluster name,
// and the start time of the job in UNIX epoch time seconds.
// It returns a pointer to a schema.Job data structure and an error variable.
// To check if no job was found test err == sql.ErrNoRows
// FindAll executes a SQL query to find all batch jobs matching the given criteria.
// Jobs are queried using the batch job id, and optionally filtered by cluster name
// and start time (UNIX epoch time seconds).
// It returns a slice of pointers to schema.Job data structures and an error variable.
// An empty slice is returned if no matching jobs are found.
func (r *JobRepository) FindAll(
jobId *int64,
jobID *int64,
cluster *string,
startTime *int64,
) ([]*schema.Job, error) {
if jobID == nil {
return nil, fmt.Errorf("jobID cannot be nil")
}
start := time.Now()
q := sq.Select(jobColumns...).From("job").
Where("job.job_id = ?", *jobId)
Where("job.job_id = ?", *jobID)
if cluster != nil {
q = q.Where("job.cluster = ?", *cluster)
@@ -86,8 +105,8 @@ func (r *JobRepository) FindAll(
rows, err := q.RunWith(r.stmtCache).Query()
if err != nil {
cclog.Error("Error while running query")
return nil, err
cclog.Errorf("Error while running FindAll query for jobID=%d: %v", *jobID, err)
return nil, fmt.Errorf("failed to execute FindAll query: %w", err)
}
defer rows.Close()
@@ -95,8 +114,8 @@ func (r *JobRepository) FindAll(
for rows.Next() {
job, err := scanJob(rows)
if err != nil {
cclog.Warn("Error while scanning rows")
return nil, err
cclog.Warnf("Error while scanning rows in FindAll: %v", err)
return nil, fmt.Errorf("failed to scan job row: %w", err)
}
jobs = append(jobs, job)
}
@@ -119,8 +138,8 @@ func (r *JobRepository) GetJobList(limit int, offset int) ([]int64, error) {
rows, err := query.RunWith(r.stmtCache).Query()
if err != nil {
cclog.Error("Error while running query")
return nil, err
cclog.Errorf("Error while running GetJobList query (limit=%d, offset=%d): %v", limit, offset, err)
return nil, fmt.Errorf("failed to execute GetJobList query: %w", err)
}
defer rows.Close()
@@ -129,23 +148,23 @@ func (r *JobRepository) GetJobList(limit int, offset int) ([]int64, error) {
var id int64
err := rows.Scan(&id)
if err != nil {
cclog.Warn("Error while scanning rows")
return nil, err
cclog.Warnf("Error while scanning rows in GetJobList: %v", err)
return nil, fmt.Errorf("failed to scan job ID: %w", err)
}
jl = append(jl, id)
}
cclog.Infof("Return job count %d", len(jl))
cclog.Debugf("JobRepository.GetJobList(): Return job count %d", len(jl))
return jl, nil
}
// FindById executes a SQL query to find a specific batch job.
// FindByID executes a SQL query to find a specific batch job.
// The job is queried using the database id.
// It returns a pointer to a schema.Job data structure and an error variable.
// To check if no job was found test err == sql.ErrNoRows
func (r *JobRepository) FindById(ctx context.Context, jobId int64) (*schema.Job, error) {
func (r *JobRepository) FindByID(ctx context.Context, jobID int64) (*schema.Job, error) {
q := sq.Select(jobColumns...).
From("job").Where("job.id = ?", jobId)
From("job").Where("job.id = ?", jobID)
q, qerr := SecurityCheck(ctx, q)
if qerr != nil {
@@ -155,14 +174,14 @@ func (r *JobRepository) FindById(ctx context.Context, jobId int64) (*schema.Job,
return scanJob(q.RunWith(r.stmtCache).QueryRow())
}
// FindByIdWithUser executes a SQL query to find a specific batch job.
// FindByIDWithUser executes a SQL query to find a specific batch job.
// The job is queried using the database id. The user is passed directly,
// instead as part of the context.
// It returns a pointer to a schema.Job data structure and an error variable.
// To check if no job was found test err == sql.ErrNoRows
func (r *JobRepository) FindByIdWithUser(user *schema.User, jobId int64) (*schema.Job, error) {
func (r *JobRepository) FindByIDWithUser(user *schema.User, jobID int64) (*schema.Job, error) {
q := sq.Select(jobColumns...).
From("job").Where("job.id = ?", jobId)
From("job").Where("job.id = ?", jobID)
q, qerr := SecurityCheckWithUser(user, q)
if qerr != nil {
@@ -172,24 +191,24 @@ func (r *JobRepository) FindByIdWithUser(user *schema.User, jobId int64) (*schem
return scanJob(q.RunWith(r.stmtCache).QueryRow())
}
// FindByIdDirect executes a SQL query to find a specific batch job.
// FindByIDDirect executes a SQL query to find a specific batch job.
// The job is queried using the database id.
// It returns a pointer to a schema.Job data structure and an error variable.
// To check if no job was found test err == sql.ErrNoRows
func (r *JobRepository) FindByIdDirect(jobId int64) (*schema.Job, error) {
func (r *JobRepository) FindByIDDirect(jobID int64) (*schema.Job, error) {
q := sq.Select(jobColumns...).
From("job").Where("job.id = ?", jobId)
From("job").Where("job.id = ?", jobID)
return scanJob(q.RunWith(r.stmtCache).QueryRow())
}
// FindByJobId executes a SQL query to find a specific batch job.
// FindByJobID executes a SQL query to find a specific batch job.
// The job is queried using the slurm id and the clustername.
// It returns a pointer to a schema.Job data structure and an error variable.
// To check if no job was found test err == sql.ErrNoRows
func (r *JobRepository) FindByJobId(ctx context.Context, jobId int64, startTime int64, cluster string) (*schema.Job, error) {
func (r *JobRepository) FindByJobID(ctx context.Context, jobID int64, startTime int64, cluster string) (*schema.Job, error) {
q := sq.Select(jobColumns...).
From("job").
Where("job.job_id = ?", jobId).
Where("job.job_id = ?", jobID).
Where("job.cluster = ?", cluster).
Where("job.start_time = ?", startTime)
@@ -201,19 +220,22 @@ func (r *JobRepository) FindByJobId(ctx context.Context, jobId int64, startTime
return scanJob(q.RunWith(r.stmtCache).QueryRow())
}
// IsJobOwner executes a SQL query to find a specific batch job.
// The job is queried using the slurm id, a username, and the cluster.
// It returns a bool.
// If job was found, user is owner: test err != sql.ErrNoRows
func (r *JobRepository) IsJobOwner(jobId int64, startTime int64, user string, cluster string) bool {
// IsJobOwner checks if the specified user owns the batch job identified by jobID,
// startTime, and cluster. Returns true if the user is the owner, false otherwise.
// This method does not return errors; it returns false for both non-existent jobs
// and jobs owned by other users.
func (r *JobRepository) IsJobOwner(jobID int64, startTime int64, user string, cluster string) bool {
q := sq.Select("id").
From("job").
Where("job.job_id = ?", jobId).
Where("job.job_id = ?", jobID).
Where("job.hpc_user = ?", user).
Where("job.cluster = ?", cluster).
Where("job.start_time = ?", startTime)
_, err := scanJob(q.RunWith(r.stmtCache).QueryRow())
if err != nil && err != sql.ErrNoRows {
cclog.Warnf("IsJobOwner: unexpected error for jobID=%d, user=%s, cluster=%s: %v", jobID, user, cluster, err)
}
return err != sql.ErrNoRows
}
@@ -231,6 +253,11 @@ func (r *JobRepository) FindConcurrentJobs(
}
query = query.Where("cluster = ?", job.Cluster)
if len(job.Resources) == 0 {
return nil, fmt.Errorf("job has no resources defined")
}
var startTime int64
var stopTime int64
@@ -243,25 +270,28 @@ func (r *JobRepository) FindConcurrentJobs(
stopTime = startTime + int64(job.Duration)
}
// Add 200s overlap for job start times at the end
startTimeTail := startTime + 10
stopTimeTail := stopTime - 200
startTimeFront := startTime + 200
// Time buffer constant for finding overlapping jobs
// overlapBufferEnd: 200s buffer at job end to account for scheduling/cleanup overlap
const overlapBufferEnd = 200
queryRunning := query.Where("job.job_state = ?").Where("(job.start_time BETWEEN ? AND ? OR job.start_time < ?)",
"running", startTimeTail, stopTimeTail, startTime)
stopTimeTail := stopTime - overlapBufferEnd
startTimeFront := startTime + overlapBufferEnd
queryRunning := query.Where("job.job_state = ?", "running").
Where("job.start_time <= ?", stopTimeTail)
// Get At Least One Exact Hostname Match from JSON Resources Array in Database
queryRunning = queryRunning.Where("EXISTS (SELECT 1 FROM json_each(job.resources) WHERE json_extract(value, '$.hostname') = ?)", hostname)
query = query.Where("job.job_state != ?").Where("((job.start_time BETWEEN ? AND ?) OR (job.start_time + job.duration) BETWEEN ? AND ? OR (job.start_time < ?) AND (job.start_time + job.duration) > ?)",
"running", startTimeTail, stopTimeTail, startTimeFront, stopTimeTail, startTime, stopTime)
query = query.Where("job.job_state != ?", "running").
Where("job.start_time < ?", stopTimeTail).
Where("(job.start_time + job.duration) > ?", startTimeFront)
// Get At Least One Exact Hostname Match from JSON Resources Array in Database
query = query.Where("EXISTS (SELECT 1 FROM json_each(job.resources) WHERE json_extract(value, '$.hostname') = ?)", hostname)
rows, err := query.RunWith(r.stmtCache).Query()
if err != nil {
cclog.Errorf("Error while running query: %v", err)
return nil, err
cclog.Errorf("Error while running concurrent jobs query: %v", err)
return nil, fmt.Errorf("failed to execute concurrent jobs query: %w", err)
}
defer rows.Close()
@@ -269,44 +299,44 @@ func (r *JobRepository) FindConcurrentJobs(
queryString := fmt.Sprintf("cluster=%s", job.Cluster)
for rows.Next() {
var id, jobId, startTime sql.NullInt64
var id, jobID, startTime sql.NullInt64
if err = rows.Scan(&id, &jobId, &startTime); err != nil {
cclog.Warn("Error while scanning rows")
return nil, err
if err = rows.Scan(&id, &jobID, &startTime); err != nil {
cclog.Warnf("Error while scanning concurrent job rows: %v", err)
return nil, fmt.Errorf("failed to scan concurrent job row: %w", err)
}
if id.Valid {
queryString += fmt.Sprintf("&jobId=%d", int(jobId.Int64))
queryString += fmt.Sprintf("&jobId=%d", int(jobID.Int64))
items = append(items,
&model.JobLink{
ID: fmt.Sprint(id.Int64),
JobID: int(jobId.Int64),
JobID: int(jobID.Int64),
})
}
}
rows, err = queryRunning.RunWith(r.stmtCache).Query()
if err != nil {
cclog.Errorf("Error while running query: %v", err)
return nil, err
cclog.Errorf("Error while running concurrent running jobs query: %v", err)
return nil, fmt.Errorf("failed to execute concurrent running jobs query: %w", err)
}
defer rows.Close()
for rows.Next() {
var id, jobId, startTime sql.NullInt64
var id, jobID, startTime sql.NullInt64
if err := rows.Scan(&id, &jobId, &startTime); err != nil {
cclog.Warn("Error while scanning rows")
return nil, err
if err := rows.Scan(&id, &jobID, &startTime); err != nil {
cclog.Warnf("Error while scanning running concurrent job rows: %v", err)
return nil, fmt.Errorf("failed to scan running concurrent job row: %w", err)
}
if id.Valid {
queryString += fmt.Sprintf("&jobId=%d", int(jobId.Int64))
queryString += fmt.Sprintf("&jobId=%d", int(jobID.Int64))
items = append(items,
&model.JobLink{
ID: fmt.Sprint(id.Int64),
JobID: int(jobId.Int64),
JobID: int(jobID.Int64),
})
}
}
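The rewrite replaces the old BETWEEN windows with two half-open comparisons: a completed job overlaps when it starts before stopTimeTail and ends after startTimeFront, with overlapBufferEnd trimming scheduling and cleanup slack at both edges (running jobs only need the start-side check, since their end is unknown). The same predicate in plain Go, mirroring the completed-job query above; the helper itself is illustrative:

// overlaps reports whether a completed candidate job shares wall time
// with [startTime, stopTime], ignoring overlapBufferEnd seconds of slack
// at either end — the Go analogue of the SQL conditions above.
func overlaps(candStart, candDuration, startTime, stopTime int64) bool {
	stopTimeTail := stopTime - overlapBufferEnd
	startTimeFront := startTime + overlapBufferEnd
	candEnd := candStart + candDuration
	return candStart < stopTimeTail && candEnd > startTimeFront
}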

View File

@@ -2,16 +2,45 @@
// All rights reserved. This file is part of cc-backend.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
package repository
import (
"sync"
"github.com/ClusterCockpit/cc-lib/schema"
"github.com/ClusterCockpit/cc-lib/v2/schema"
)
// JobHook interface allows external components to hook into job lifecycle events.
// Implementations can perform actions when jobs start or stop, such as tagging,
// logging, notifications, or triggering external workflows.
//
// Example implementation:
//
// type MyJobTagger struct{}
//
// func (t *MyJobTagger) JobStartCallback(job *schema.Job) {
// if job.NumNodes > 100 {
// // Tag large jobs automatically
// }
// }
//
// func (t *MyJobTagger) JobStopCallback(job *schema.Job) {
// if job.State == schema.JobStateFailed {
// // Log or alert on failed jobs
// }
// }
//
// Register hooks during application initialization:
//
// repository.RegisterJobHook(&MyJobTagger{})
type JobHook interface {
// JobStartCallback is invoked when one or more jobs start.
// This is called synchronously, so implementations should be fast.
JobStartCallback(job *schema.Job)
// JobStopCallback is invoked when a job completes.
// This is called synchronously, so implementations should be fast.
JobStopCallback(job *schema.Job)
}
@@ -20,7 +49,13 @@ var (
hooks []JobHook
)
func RegisterJobJook(hook JobHook) {
// RegisterJobHook registers a JobHook to receive job lifecycle callbacks.
// Multiple hooks can be registered and will be called in registration order.
// This function is safe to call multiple times and is typically called during
// application initialization.
//
// Nil hooks are silently ignored to simplify conditional registration.
func RegisterJobHook(hook JobHook) {
initOnce.Do(func() {
hooks = make([]JobHook, 0)
})
@@ -30,6 +65,12 @@ func RegisterJobJook(hook JobHook) {
}
}
// CallJobStartHooks invokes all registered JobHook.JobStartCallback methods
// for each job in the provided slice. This is called internally by the repository
// when jobs are started (e.g., via StartJob or batch job imports).
//
// Hooks are called synchronously in registration order. If a hook panics,
// the panic will propagate to the caller.
func CallJobStartHooks(jobs []*schema.Job) {
if hooks == nil {
return
@@ -44,6 +85,12 @@ func CallJobStartHooks(jobs []*schema.Job) {
}
}
// CallJobStopHooks invokes all registered JobHook.JobStopCallback methods
// for the provided job. This is called internally by the repository when a
// job completes (e.g., via StopJob or job state updates).
//
// Hooks are called synchronously in registration order. If a hook panics,
// the panic will propagate to the caller.
func CallJobStopHooks(job *schema.Job) {
if hooks == nil {
return

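With the exported name fixed from RegisterJobJook to RegisterJobHook, wiring a hook follows the doc-comment example above. A compact hedged variant that only logs — the LoggingHook type is invented for illustration:

type LoggingHook struct{}

func (h *LoggingHook) JobStartCallback(job *schema.Job) {
	cclog.Infof("job %d started on cluster %s", job.JobID, job.Cluster)
}

func (h *LoggingHook) JobStopCallback(job *schema.Job) {
	cclog.Infof("job %d stopped in state %s", job.JobID, job.State)
}

// During application initialization:
//	repository.RegisterJobHook(&LoggingHook{})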
View File

@@ -2,6 +2,10 @@
// All rights reserved. This file is part of cc-backend.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
// Package repository provides job query functionality with filtering, pagination,
// and security controls. This file contains the main query builders and security
// checks for job retrieval operations.
package repository
import (
@@ -14,11 +18,27 @@ import (
"github.com/ClusterCockpit/cc-backend/internal/config"
"github.com/ClusterCockpit/cc-backend/internal/graph/model"
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
"github.com/ClusterCockpit/cc-lib/schema"
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
"github.com/ClusterCockpit/cc-lib/v2/schema"
sq "github.com/Masterminds/squirrel"
)
const (
// Default initial capacity for job result slices
defaultJobsCapacity = 50
)
// QueryJobs retrieves jobs from the database with optional filtering, pagination,
// and sorting. Security controls are automatically applied based on the user context.
//
// Parameters:
// - ctx: Context containing user authentication information
// - filters: Optional job filters (cluster, state, user, time ranges, etc.)
// - page: Optional pagination parameters (page number and items per page)
// - order: Optional sorting specification (column or footprint field)
//
// Returns a slice of jobs matching the criteria, or an error if the query fails.
// The function enforces role-based access control through SecurityCheck.
func (r *JobRepository) QueryJobs(
ctx context.Context,
filters []*model.JobFilter,
@@ -33,26 +53,24 @@ func (r *JobRepository) QueryJobs(
if order != nil {
field := toSnakeCase(order.Field)
if order.Type == "col" {
// "col": Fixed column name query
switch order.Order {
case model.SortDirectionEnumAsc:
query = query.OrderBy(fmt.Sprintf("job.%s ASC", field))
case model.SortDirectionEnumDesc:
query = query.OrderBy(fmt.Sprintf("job.%s DESC", field))
default:
return nil, errors.New("REPOSITORY/QUERY > invalid sorting order for column")
return nil, errors.New("invalid sorting order for column")
}
} else {
// "foot": Order by footprint JSON field values
// Verify and Search Only in Valid Jsons
query = query.Where("JSON_VALID(meta_data)")
// Order by footprint JSON field values
query = query.Where("JSON_VALID(footprint)")
switch order.Order {
case model.SortDirectionEnumAsc:
query = query.OrderBy(fmt.Sprintf("JSON_EXTRACT(footprint, \"$.%s\") ASC", field))
case model.SortDirectionEnumDesc:
query = query.OrderBy(fmt.Sprintf("JSON_EXTRACT(footprint, \"$.%s\") DESC", field))
default:
return nil, errors.New("REPOSITORY/QUERY > invalid sorting order for footprint")
return nil, errors.New("invalid sorting order for footprint")
}
}
}
@@ -69,29 +87,35 @@ func (r *JobRepository) QueryJobs(
rows, err := query.RunWith(r.stmtCache).Query()
if err != nil {
queryString, queryVars, _ := query.ToSql()
cclog.Errorf("Error while running query '%s' %v: %v", queryString, queryVars, err)
return nil, err
return nil, fmt.Errorf("query failed [%s] %v: %w", queryString, queryVars, err)
}
defer rows.Close()
jobs := make([]*schema.Job, 0, 50)
jobs := make([]*schema.Job, 0, defaultJobsCapacity)
for rows.Next() {
job, err := scanJob(rows)
if err != nil {
rows.Close()
cclog.Warn("Error while scanning rows (Jobs)")
return nil, err
cclog.Warnf("Error scanning job row: %v", err)
return nil, fmt.Errorf("failed to scan job row: %w", err)
}
jobs = append(jobs, job)
}
if err := rows.Err(); err != nil {
return nil, fmt.Errorf("error iterating job rows: %w", err)
}
return jobs, nil
}
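A sketch of driving QueryJobs with a cluster filter, pagination, and a column sort. The filter shape follows model.JobFilter and model.StringInput as used below; the PageRequest and OrderByInput field names are assumptions based on how order.Field, order.Type, and order.Order are read above:

func recentClusterJobs(ctx context.Context, r *repository.JobRepository) ([]*schema.Job, error) {
	cluster := "testcluster" // hypothetical cluster name
	filters := []*model.JobFilter{
		{Cluster: &model.StringInput{Eq: &cluster}},
	}
	page := &model.PageRequest{Page: 1, ItemsPerPage: 25}
	order := &model.OrderByInput{
		Field: "startTime", // toSnakeCase turns this into job.start_time
		Type:  "col",
		Order: model.SortDirectionEnumDesc,
	}
	return r.QueryJobs(ctx, filters, page, order)
}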
// CountJobs returns the total number of jobs matching the given filters.
// Security controls are automatically applied based on the user context.
// Uses DISTINCT count to handle tag filters correctly (jobs may appear multiple
// times when joined with the tag table).
func (r *JobRepository) CountJobs(
ctx context.Context,
filters []*model.JobFilter,
) (int, error) {
// DISTINCT count for tag filters; does not affect other queries
query, qerr := SecurityCheck(ctx, sq.Select("count(DISTINCT job.id)").From("job"))
if qerr != nil {
return 0, qerr
@@ -103,12 +127,22 @@ func (r *JobRepository) CountJobs(
var count int
if err := query.RunWith(r.DB).Scan(&count); err != nil {
return 0, err
return 0, fmt.Errorf("failed to count jobs: %w", err)
}
return count, nil
}
// SecurityCheckWithUser applies role-based access control filters to a job query
// based on the provided user's roles and permissions.
//
// Access rules by role:
// - API role (exclusive): Full access to all jobs
// - Admin/Support roles: Full access to all jobs
// - Manager role: Access to jobs in managed projects plus own jobs
// - User role: Access only to own jobs
//
// Returns an error if the user is nil or has no recognized roles.
func SecurityCheckWithUser(user *schema.User, query sq.SelectBuilder) (sq.SelectBuilder, error) {
if user == nil {
var qnil sq.SelectBuilder
@@ -116,84 +150,68 @@ func SecurityCheckWithUser(user *schema.User, query sq.SelectBuilder) (sq.Select
}
switch {
case len(user.Roles) == 1 && user.HasRole(schema.RoleApi): // API-User : All jobs
case len(user.Roles) == 1 && user.HasRole(schema.RoleAPI):
return query, nil
case user.HasAnyRole([]schema.Role{schema.RoleAdmin, schema.RoleSupport}): // Admin & Support : All jobs
case user.HasAnyRole([]schema.Role{schema.RoleAdmin, schema.RoleSupport}):
return query, nil
case user.HasRole(schema.RoleManager): // Manager : Add filter for managed projects' jobs only + personal jobs
case user.HasRole(schema.RoleManager):
if len(user.Projects) != 0 {
return query.Where(sq.Or{sq.Eq{"job.project": user.Projects}, sq.Eq{"job.hpc_user": user.Username}}), nil
} else {
cclog.Debugf("Manager-User '%s' has no defined projects to lookup! Query only personal jobs ...", user.Username)
return query.Where("job.hpc_user = ?", user.Username), nil
}
case user.HasRole(schema.RoleUser): // User : Only personal jobs
cclog.Debugf("Manager '%s' has no assigned projects, restricting to personal jobs", user.Username)
return query.Where("job.hpc_user = ?", user.Username), nil
default: // No known Role, return error
case user.HasRole(schema.RoleUser):
return query.Where("job.hpc_user = ?", user.Username), nil
default:
var qnil sq.SelectBuilder
return qnil, fmt.Errorf("user has no or unknown roles")
}
}
// SecurityCheck extracts the user from the context and applies role-based access
// control filters to the query. This is a convenience wrapper around SecurityCheckWithUser.
func SecurityCheck(ctx context.Context, query sq.SelectBuilder) (sq.SelectBuilder, error) {
user := GetUserFromContext(ctx)
return SecurityCheckWithUser(user, query)
}
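The role ladder is easiest to see from the WHERE clauses it produces. A hedged sketch (schema.User construction is simplified; only the rules listed in the doc comment are asserted):

func scopedQuerySQL(u *schema.User) (string, error) {
	q, err := repository.SecurityCheckWithUser(u, sq.Select("id").From("job"))
	if err != nil {
		return "", err // nil user or no recognized role
	}
	// API-only, Admin, Support: unchanged query.
	// Manager with projects:    WHERE (job.project IN (...) OR job.hpc_user = ?)
	// Manager without projects: WHERE job.hpc_user = ?
	// User:                     WHERE job.hpc_user = ?
	sqlStr, _, err := q.ToSql()
	return sqlStr, err
}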
// Build a sq.SelectBuilder out of a schema.JobFilter.
// BuildWhereClause constructs SQL WHERE conditions from a JobFilter and applies
// them to the query. Supports filtering by job properties (cluster, state, user),
// time ranges, resource usage, tags, and JSON field searches in meta_data,
// footprint, and resources columns.
func BuildWhereClause(filter *model.JobFilter, query sq.SelectBuilder) sq.SelectBuilder {
if filter.Tags != nil {
// This is an OR-Logic query: Returns all distinct jobs with at least one of the requested tags; TODO: AND-Logic query?
query = query.Join("jobtag ON jobtag.job_id = job.id").Where(sq.Eq{"jobtag.tag_id": filter.Tags}).Distinct()
}
// Primary Key
if filter.DbID != nil {
dbIDs := make([]string, len(filter.DbID))
copy(dbIDs, filter.DbID)
query = query.Where(sq.Eq{"job.id": dbIDs})
}
if filter.JobID != nil {
query = buildStringCondition("job.job_id", filter.JobID, query)
}
if filter.ArrayJobID != nil {
query = query.Where("job.array_job_id = ?", *filter.ArrayJobID)
}
if filter.User != nil {
query = buildStringCondition("job.hpc_user", filter.User, query)
}
if filter.Project != nil {
query = buildStringCondition("job.project", filter.Project, query)
}
if filter.JobName != nil {
query = buildMetaJsonCondition("jobName", filter.JobName, query)
}
// Explicit indices
if filter.Cluster != nil {
query = buildStringCondition("job.cluster", filter.Cluster, query)
}
if filter.SubCluster != nil {
query = buildStringCondition("job.subcluster", filter.SubCluster, query)
}
if filter.Partition != nil {
query = buildStringCondition("job.cluster_partition", filter.Partition, query)
}
if filter.StartTime != nil {
query = buildTimeCondition("job.start_time", filter.StartTime, query)
}
if filter.Duration != nil {
query = buildIntCondition("job.duration", filter.Duration, query)
}
if filter.MinRunningFor != nil {
now := time.Now().Unix() // There does not seem to be a portable way to get the current unix timestamp across different DBs.
query = query.Where("(job.job_state != 'running' OR (? - job.start_time) > ?)", now, *filter.MinRunningFor)
}
if filter.Shared != nil {
query = query.Where("job.shared = ?", *filter.Shared)
}
if filter.State != nil {
states := make([]string, len(filter.State))
for i, val := range filter.State {
states[i] = string(val)
}
query = query.Where(sq.Eq{"job.job_state": states})
}
if filter.Shared != nil {
query = query.Where("job.shared = ?", *filter.Shared)
}
if filter.Project != nil {
query = buildStringCondition("job.project", filter.Project, query)
}
if filter.User != nil {
query = buildStringCondition("job.hpc_user", filter.User, query)
}
if filter.NumNodes != nil {
query = buildIntCondition("job.num_nodes", filter.NumNodes, query)
}
@@ -203,33 +221,95 @@ func BuildWhereClause(filter *model.JobFilter, query sq.SelectBuilder) sq.Select
if filter.NumHWThreads != nil {
query = buildIntCondition("job.num_hwthreads", filter.NumHWThreads, query)
}
if filter.Node != nil {
query = buildResourceJsonCondition("hostname", filter.Node, query)
if filter.ArrayJobID != nil {
query = query.Where("job.array_job_id = ?", *filter.ArrayJobID)
}
if filter.StartTime != nil {
query = buildTimeCondition("job.start_time", filter.StartTime, query)
}
if filter.Duration != nil {
query = buildIntCondition("job.duration", filter.Duration, query)
}
if filter.Energy != nil {
query = buildFloatCondition("job.energy", filter.Energy, query)
}
// Indices on Tag Table
if filter.Tags != nil {
// This is an OR-Logic query: Returns all distinct jobs with at least one of the requested tags; TODO: AND-Logic query?
query = query.Join("jobtag ON jobtag.job_id = job.id").Where(sq.Eq{"jobtag.tag_id": filter.Tags}).Distinct()
}
// No explicit Indices
if filter.JobID != nil {
query = buildStringCondition("job.job_id", filter.JobID, query)
}
// Queries Within JSONs
if filter.MetricStats != nil {
for _, ms := range filter.MetricStats {
query = buildFloatJsonCondition(ms.MetricName, ms.Range, query)
query = buildFloatJSONCondition(ms.MetricName, ms.Range, query)
}
}
if filter.Node != nil {
query = buildResourceJSONCondition("hostname", filter.Node, query)
}
if filter.JobName != nil {
query = buildMetaJSONCondition("jobName", filter.JobName, query)
}
if filter.Schedule != nil {
interactiveJobname := "interactive"
switch *filter.Schedule {
case "interactive":
iFilter := model.StringInput{Eq: &interactiveJobname}
query = buildMetaJSONCondition("jobName", &iFilter, query)
case "batch":
sFilter := model.StringInput{Neq: &interactiveJobname}
query = buildMetaJSONCondition("jobName", &sFilter, query)
}
}
// Configurable Filter to exclude recently started jobs, see config.go: ShortRunningJobsDuration
if filter.MinRunningFor != nil {
now := time.Now().Unix()
// Only jobs whose start timestamp is more than MinRunningFor seconds in the past
// A job that completed within the configured timeframe will still show up once its start_time satisfies the condition!
query = query.Where(sq.Lt{"job.start_time": (now - int64(*filter.MinRunningFor))})
}
return query
}
// buildIntCondition creates clauses for integer range filters, using BETWEEN only if required.
func buildIntCondition(field string, cond *config.IntRange, query sq.SelectBuilder) sq.SelectBuilder {
return query.Where(field+" BETWEEN ? AND ?", cond.From, cond.To)
if cond.From != 1 && cond.To != 0 {
return query.Where(field+" BETWEEN ? AND ?", cond.From, cond.To)
} else if cond.From != 1 && cond.To == 0 {
return query.Where(field+" >= ?", cond.From)
} else if cond.From == 1 && cond.To != 0 {
return query.Where(field+" <= ?", cond.To)
} else {
return query
}
}
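The branches imply sentinel semantics the commit does not spell out: From == 1 and To == 0 each read as "bound unset", so BETWEEN is emitted only when both bounds carry information. Assuming exactly that, the emitted SQL looks like this (illustrative, in-package sketch since buildIntCondition is unexported):

func demoIntCondition() {
	q := sq.Select("id").From("job")
	ranges := []config.IntRange{
		{From: 100, To: 3600}, // both set   -> job.duration BETWEEN ? AND ?
		{From: 100, To: 0},    // To unset   -> job.duration >= ?
		{From: 1, To: 3600},   // From unset -> job.duration <= ?
	}
	for i := range ranges {
		sqlStr, _, _ := buildIntCondition("job.duration", &ranges[i], q).ToSql()
		fmt.Println(sqlStr)
	}
}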
// buildFloatCondition creates clauses for float range filters, using BETWEEN only if required.
func buildFloatCondition(field string, cond *model.FloatRange, query sq.SelectBuilder) sq.SelectBuilder {
return query.Where(field+" BETWEEN ? AND ?", cond.From, cond.To)
if cond.From != 1.0 && cond.To != 0.0 {
return query.Where(field+" BETWEEN ? AND ?", cond.From, cond.To)
} else if cond.From != 1.0 && cond.To == 0.0 {
return query.Where(field+" >= ?", cond.From)
} else if cond.From == 1.0 && cond.To != 0.0 {
return query.Where(field+" <= ?", cond.To)
} else {
return query
}
}
// buildTimeCondition creates time range filters supporting absolute timestamps,
// relative time ranges (last6h, last24h, last7d, last30d), or open-ended ranges.
// Reminder: BETWEEN queries are slower and don't use indices as frequently; only use when both bounds are required
func buildTimeCondition(field string, cond *config.TimeRange, query sq.SelectBuilder) sq.SelectBuilder {
if cond.From != nil && cond.To != nil {
return query.Where(field+" BETWEEN ? AND ?", cond.From.Unix(), cond.To.Unix())
} else if cond.From != nil {
return query.Where("? <= "+field, cond.From.Unix())
return query.Where(field+" >= ?", cond.From.Unix())
} else if cond.To != nil {
return query.Where(field+" <= ?", cond.To.Unix())
} else if cond.Range != "" {
@@ -248,18 +328,28 @@ func buildTimeCondition(field string, cond *config.TimeRange, query sq.SelectBui
cclog.Debugf("No known named timeRange: startTime.range = %s", cond.Range)
return query
}
return query.Where(field+" BETWEEN ? AND ?", then, now)
return query.Where(field+" >= ?", then)
} else {
return query
}
}
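Named ranges now emit a single lower bound instead of a BETWEEN spanning then..now, so an "everything since" window can use the start_time index directly. A brief in-package sketch (config.TimeRange field names taken from the code above):

func demoTimeCondition() {
	cond := &config.TimeRange{Range: "last24h"}
	q := buildTimeCondition("job.start_time", cond, sq.Select("id").From("job"))
	sqlStr, args, _ := q.ToSql()
	fmt.Println(sqlStr, args) // ... WHERE job.start_time >= ?  [then]
}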
func buildFloatJsonCondition(condName string, condRange *model.FloatRange, query sq.SelectBuilder) sq.SelectBuilder {
// Verify and Search Only in Valid Jsons
// buildFloatJSONCondition creates a filter on a numeric field within the footprint JSON column, using BETWEEN only if required.
func buildFloatJSONCondition(jsonField string, cond *model.FloatRange, query sq.SelectBuilder) sq.SelectBuilder {
query = query.Where("JSON_VALID(footprint)")
return query.Where("JSON_EXTRACT(footprint, \"$."+condName+"\") BETWEEN ? AND ?", condRange.From, condRange.To)
if cond.From != 1.0 && cond.To != 0.0 {
return query.Where("JSON_EXTRACT(footprint, \"$."+jsonField+"\") BETWEEN ? AND ?", cond.From, cond.To)
} else if cond.From != 1.0 && cond.To == 0.0 {
return query.Where("JSON_EXTRACT(footprint, \"$."+jsonField+"\") >= ?", cond.From)
} else if cond.From == 1.0 && cond.To != 0.0 {
return query.Where("JSON_EXTRACT(footprint, \"$."+jsonField+"\") <= ?", cond.To)
} else {
return query
}
}
// buildStringCondition creates filters for string fields supporting equality,
// inequality, prefix, suffix, substring, and IN list matching.
func buildStringCondition(field string, cond *model.StringInput, query sq.SelectBuilder) sq.SelectBuilder {
if cond.Eq != nil {
return query.Where(field+" = ?", *cond.Eq)
@@ -284,10 +374,9 @@ func buildStringCondition(field string, cond *model.StringInput, query sq.Select
return query
}
func buildMetaJsonCondition(jsonField string, cond *model.StringInput, query sq.SelectBuilder) sq.SelectBuilder {
// Verify and Search Only in Valid Jsons
// buildMetaJSONCondition creates filters on fields within the meta_data JSON column.
func buildMetaJSONCondition(jsonField string, cond *model.StringInput, query sq.SelectBuilder) sq.SelectBuilder {
query = query.Where("JSON_VALID(meta_data)")
// add "AND" Sql query Block for field match
if cond.Eq != nil {
return query.Where("JSON_EXTRACT(meta_data, \"$."+jsonField+"\") = ?", *cond.Eq)
}
@@ -306,10 +395,10 @@ func buildMetaJsonCondition(jsonField string, cond *model.StringInput, query sq.
return query
}
func buildResourceJsonCondition(jsonField string, cond *model.StringInput, query sq.SelectBuilder) sq.SelectBuilder {
// Verify and Search Only in Valid Jsons
// buildResourceJSONCondition creates filters on fields within the resources JSON array column.
// Uses json_each to search within array elements.
func buildResourceJSONCondition(jsonField string, cond *model.StringInput, query sq.SelectBuilder) sq.SelectBuilder {
query = query.Where("JSON_VALID(resources)")
// add "AND" Sql query Block for field match
if cond.Eq != nil {
return query.Where("EXISTS (SELECT 1 FROM json_each(job.resources) WHERE json_extract(value, \"$."+jsonField+"\") = ?)", *cond.Eq)
}
@@ -333,15 +422,16 @@ var (
matchAllCap = regexp.MustCompile("([a-z0-9])([A-Z])")
)
// toSnakeCase converts camelCase strings to snake_case for SQL column names.
// Includes security checks to prevent SQL injection attempts.
// Panics if potentially dangerous characters are detected.
func toSnakeCase(str string) string {
for _, c := range str {
if c == '\'' || c == '\\' {
cclog.Panic("toSnakeCase() attack vector!")
if c == '\'' || c == '\\' || c == '"' || c == ';' || c == '-' || c == ' ' {
cclog.Panicf("toSnakeCase: potentially dangerous character detected in input: %q", str)
}
}
str = strings.ReplaceAll(str, "'", "")
str = strings.ReplaceAll(str, "\\", "")
snake := matchFirstCap.ReplaceAllString(str, "${1}_${2}")
snake = matchAllCap.ReplaceAllString(snake, "${1}_${2}")
return strings.ToLower(snake)
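A few expected conversions, as an in-package test sketch; the exact outputs follow from the two regexes above, and the panic behaviour of cclog.Panicf is assumed from its name:

func TestToSnakeCaseExamples(t *testing.T) {
	cases := map[string]string{
		"startTime": "start_time",
		"numNodes":  "num_nodes",
		"jobState":  "job_state",
	}
	for in, want := range cases {
		if got := toSnakeCase(in); got != want {
			t.Errorf("toSnakeCase(%q) = %q, want %q", in, got, want)
		}
	}
	// Inputs containing ' \ " ; - or space now hit cclog.Panicf,
	// so never pass raw user input through this function.
}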

View File

@@ -10,7 +10,7 @@ import (
"testing"
"time"
"github.com/ClusterCockpit/cc-lib/schema"
"github.com/ClusterCockpit/cc-lib/v2/schema"
_ "github.com/mattn/go-sqlite3"
)
@@ -33,7 +33,7 @@ func TestFind(t *testing.T) {
func TestFindById(t *testing.T) {
r := setup(t)
job, err := r.FindById(getContext(t), 338)
job, err := r.FindByID(getContext(t), 338)
if err != nil {
t.Fatal(err)
}
@@ -78,7 +78,7 @@ func TestFindJobsBetween(t *testing.T) {
// 1. Find a job to use (Find all jobs)
// We use a large time range to ensure we get something if it exists
jobs, err := r.FindJobsBetween(0, 9999999999, false)
jobs, err := r.FindJobsBetween(0, 9999999999, "none")
if err != nil {
t.Fatal(err)
}
@@ -88,21 +88,21 @@ func TestFindJobsBetween(t *testing.T) {
targetJob := jobs[0]
// 2. Create a tag
tagName := fmt.Sprintf("testtag_%d", time.Now().UnixNano())
tagId, err := r.CreateTag("testtype", tagName, "global")
// 2. Create an auto-tagger tag (type "app")
appTagName := fmt.Sprintf("apptag_%d", time.Now().UnixNano())
appTagID, err := r.CreateTag("app", appTagName, "global")
if err != nil {
t.Fatal(err)
}
// 3. Link Tag (Manually to avoid archive dependency side-effects in unit test)
_, err = r.DB.Exec("INSERT INTO jobtag (job_id, tag_id) VALUES (?, ?)", *targetJob.ID, tagId)
// 3. Link auto-tagger tag to job
_, err = r.DB.Exec("INSERT INTO jobtag (job_id, tag_id) VALUES (?, ?)", *targetJob.ID, appTagID)
if err != nil {
t.Fatal(err)
}
// 4. Search with omitTagged = false (Should find the job)
jobsFound, err := r.FindJobsBetween(0, 9999999999, false)
// 4. Search with omitTagged = "none" (Should find the job)
jobsFound, err := r.FindJobsBetween(0, 9999999999, "none")
if err != nil {
t.Fatal(err)
}
@@ -115,18 +115,58 @@ func TestFindJobsBetween(t *testing.T) {
}
}
if !found {
t.Errorf("Target job %d should be found when omitTagged=false", *targetJob.ID)
t.Errorf("Target job %d should be found when omitTagged=none", *targetJob.ID)
}
// 5. Search with omitTagged = true (Should NOT find the job)
jobsFiltered, err := r.FindJobsBetween(0, 9999999999, true)
// 5. Search with omitTagged = "all" (Should NOT find the job — it has a tag)
jobsFiltered, err := r.FindJobsBetween(0, 9999999999, "all")
if err != nil {
t.Fatal(err)
}
for _, j := range jobsFiltered {
if *j.ID == *targetJob.ID {
t.Errorf("Target job %d should NOT be found when omitTagged=true", *targetJob.ID)
t.Errorf("Target job %d should NOT be found when omitTagged=all", *targetJob.ID)
}
}
// 6. Search with omitTagged = "user": auto-tagger tag ("app") should NOT exclude the job
jobsUserFilter, err := r.FindJobsBetween(0, 9999999999, "user")
if err != nil {
t.Fatal(err)
}
found = false
for _, j := range jobsUserFilter {
if *j.ID == *targetJob.ID {
found = true
break
}
}
if !found {
t.Errorf("Target job %d should be found when omitTagged=user (only has auto-tagger tag)", *targetJob.ID)
}
// 7. Add a user-created tag (type "testtype") to the same job
userTagName := fmt.Sprintf("usertag_%d", time.Now().UnixNano())
userTagID, err := r.CreateTag("testtype", userTagName, "global")
if err != nil {
t.Fatal(err)
}
_, err = r.DB.Exec("INSERT INTO jobtag (job_id, tag_id) VALUES (?, ?)", *targetJob.ID, userTagID)
if err != nil {
t.Fatal(err)
}
// 8. Now omitTagged = "user" should exclude the job (has a user-created tag)
jobsUserFilter2, err := r.FindJobsBetween(0, 9999999999, "user")
if err != nil {
t.Fatal(err)
}
for _, j := range jobsUserFilter2 {
if *j.ID == *targetJob.ID {
t.Errorf("Target job %d should NOT be found when omitTagged=user (has user-created tag)", *targetJob.ID)
}
}
}
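For reference, the three omitTagged modes exercised above (with tag type "app" standing in for auto-tagger tags):

//	r.FindJobsBetween(lo, hi, "none") // no tag filtering at all
//	r.FindJobsBetween(lo, hi, "all")  // exclude jobs carrying any tag
//	r.FindJobsBetween(lo, hi, "user") // exclude only jobs with user-created tags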

View File

@@ -10,52 +10,48 @@ import (
"embed"
"fmt"
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
"github.com/golang-migrate/migrate/v4"
"github.com/golang-migrate/migrate/v4/database/mysql"
"github.com/golang-migrate/migrate/v4/database/sqlite3"
"github.com/golang-migrate/migrate/v4/source/iofs"
)
// Version is the current database schema version required by this version of cc-backend.
// When the database schema changes, this version is incremented and a new migration file
// is added to internal/repository/migrations/sqlite3/.
//
// Version history:
// - Version 10: Current version
//
// Migration files are embedded at build time from the migrations directory.
const Version uint = 10
//go:embed migrations/*
var migrationFiles embed.FS
func checkDBVersion(backend string, db *sql.DB) error {
var m *migrate.Migrate
// checkDBVersion verifies that the database schema version matches the expected version.
// This is called automatically during Connect() to ensure schema compatibility.
//
// Returns an error if:
// - Database version is older than expected (needs migration)
// - Database version is newer than expected (needs app upgrade)
// - Database is in a dirty state (failed migration)
//
// A "dirty" database indicates a migration was started but not completed successfully.
// This requires manual intervention to fix the database and force the version.
func checkDBVersion(db *sql.DB) error {
driver, err := sqlite3.WithInstance(db, &sqlite3.Config{})
if err != nil {
return err
}
d, err := iofs.New(migrationFiles, "migrations/sqlite3")
if err != nil {
return err
}
switch backend {
case "sqlite3":
driver, err := sqlite3.WithInstance(db, &sqlite3.Config{})
if err != nil {
return err
}
d, err := iofs.New(migrationFiles, "migrations/sqlite3")
if err != nil {
return err
}
m, err = migrate.NewWithInstance("iofs", d, "sqlite3", driver)
if err != nil {
return err
}
case "mysql":
driver, err := mysql.WithInstance(db, &mysql.Config{})
if err != nil {
return err
}
d, err := iofs.New(migrationFiles, "migrations/mysql")
if err != nil {
return err
}
m, err = migrate.NewWithInstance("iofs", d, "mysql", driver)
if err != nil {
return err
}
default:
cclog.Abortf("Migration: Unsupported database backend '%s'.\n", backend)
m, err := migrate.NewWithInstance("iofs", d, "sqlite3", driver)
if err != nil {
return err
}
v, dirty, err := m.Version()
@@ -80,37 +76,41 @@ func checkDBVersion(backend string, db *sql.DB) error {
return nil
}
func getMigrateInstance(backend string, db string) (m *migrate.Migrate, err error) {
switch backend {
case "sqlite3":
d, err := iofs.New(migrationFiles, "migrations/sqlite3")
if err != nil {
cclog.Fatal(err)
}
// getMigrateInstance creates a new migration instance for the given database file.
// This is used internally by MigrateDB, RevertDB, and ForceDB.
func getMigrateInstance(db string) (m *migrate.Migrate, err error) {
d, err := iofs.New(migrationFiles, "migrations/sqlite3")
if err != nil {
return nil, err
}
m, err = migrate.NewWithSourceInstance("iofs", d, fmt.Sprintf("sqlite3://%s?_foreign_keys=on", db))
if err != nil {
return m, err
}
case "mysql":
d, err := iofs.New(migrationFiles, "migrations/mysql")
if err != nil {
return m, err
}
m, err = migrate.NewWithSourceInstance("iofs", d, fmt.Sprintf("mysql://%s?multiStatements=true", db))
if err != nil {
return m, err
}
default:
cclog.Abortf("Migration: Unsupported database backend '%s'.\n", backend)
m, err = migrate.NewWithSourceInstance("iofs", d, fmt.Sprintf("sqlite3://%s?_foreign_keys=on", db))
if err != nil {
return nil, err
}
return m, nil
}
func MigrateDB(backend string, db string) error {
m, err := getMigrateInstance(backend, db)
// MigrateDB applies all pending database migrations to bring the schema up to date.
// This should be run with the -migrate-db flag before starting the application
// after upgrading to a new version that requires schema changes.
//
// Process:
// 1. Checks current database version
// 2. Applies all migrations from current version to target Version
// 3. Updates schema_migrations table to track applied migrations
//
// Important:
// - Always backup your database before running migrations
// - Migrations are irreversible without manual intervention
// - If a migration fails, the database is marked "dirty" and requires manual fix
//
// Usage:
//
// cc-backend -migrate-db
func MigrateDB(db string) error {
m, err := getMigrateInstance(db)
if err != nil {
return err
}
@@ -118,7 +118,7 @@ func MigrateDB(backend string, db string) error {
v, dirty, err := m.Version()
if err != nil {
if err == migrate.ErrNilVersion {
cclog.Warn("Legacy database without version or missing database file!")
cclog.Info("Legacy database without version or missing database file!")
} else {
return err
}
@@ -144,8 +144,19 @@ func MigrateDB(backend string, db string) error {
return nil
}
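MigrateDB is now sqlite-only and takes just the database path. A hedged sketch of the documented -migrate-db flow; the flag wiring and log calls are illustrative, and Connect is assumed to run checkDBVersion internally as its doc comment states:

func main() {
	dbPath := flag.String("db", "./var/job.db", "sqlite database file")
	migrateDB := flag.Bool("migrate-db", false, "apply pending migrations and exit")
	flag.Parse()

	if *migrateDB {
		if err := repository.MigrateDB(*dbPath); err != nil {
			log.Fatalf("migration failed: %v", err)
		}
		return
	}
	repository.Connect(*dbPath) // schema version is checked during Connect
}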
func RevertDB(backend string, db string) error {
m, err := getMigrateInstance(backend, db)
// RevertDB rolls back the database schema to the previous version (Version - 1).
// This is primarily used for testing or emergency rollback scenarios.
//
// Warning:
// - This may cause data loss if newer schema added columns/tables
// - Always backup before reverting
// - Not all migrations are safely reversible
//
// Usage:
//
// cc-backend -revert-db
func RevertDB(db string) error {
m, err := getMigrateInstance(db)
if err != nil {
return err
}
@@ -162,8 +173,23 @@ func RevertDB(backend string, db string) error {
return nil
}
func ForceDB(backend string, db string) error {
m, err := getMigrateInstance(backend, db)
// ForceDB forces the database schema version to the current Version without running migrations.
// This is only used to recover from failed migrations that left the database in a "dirty" state.
//
// When to use:
// - After manually fixing a failed migration
// - When you've manually applied schema changes and need to update the version marker
//
// Warning:
// - This does NOT apply any schema changes
// - Only use after manually verifying the schema is correct
// - Improper use can cause schema/version mismatch
//
// Usage:
//
// cc-backend -force-db
func ForceDB(db string) error {
m, err := getMigrateInstance(db)
if err != nil {
return err
}

View File

@@ -1,5 +0,0 @@
DROP TABLE IF EXISTS job;
DROP TABLE IF EXISTS tags;
DROP TABLE IF EXISTS jobtag;
DROP TABLE IF EXISTS configuration;
DROP TABLE IF EXISTS user;

View File

@@ -1,66 +0,0 @@
CREATE TABLE IF NOT EXISTS job (
id INTEGER AUTO_INCREMENT PRIMARY KEY ,
job_id BIGINT NOT NULL,
cluster VARCHAR(255) NOT NULL,
subcluster VARCHAR(255) NOT NULL,
start_time BIGINT NOT NULL, -- Unix timestamp
user VARCHAR(255) NOT NULL,
project VARCHAR(255) NOT NULL,
`partition` VARCHAR(255) NOT NULL,
array_job_id BIGINT NOT NULL,
duration INT NOT NULL DEFAULT 0,
walltime INT NOT NULL DEFAULT 0,
job_state VARCHAR(255) NOT NULL
CHECK(job_state IN ('running', 'completed', 'failed', 'cancelled',
'stopped', 'timeout', 'preempted', 'out_of_memory')),
meta_data TEXT, -- JSON
resources TEXT NOT NULL, -- JSON
num_nodes INT NOT NULL,
num_hwthreads INT NOT NULL,
num_acc INT NOT NULL,
smt TINYINT NOT NULL DEFAULT 1 CHECK(smt IN (0, 1 )),
exclusive TINYINT NOT NULL DEFAULT 1 CHECK(exclusive IN (0, 1, 2)),
monitoring_status TINYINT NOT NULL DEFAULT 1 CHECK(monitoring_status IN (0, 1, 2, 3)),
mem_used_max REAL NOT NULL DEFAULT 0.0,
flops_any_avg REAL NOT NULL DEFAULT 0.0,
mem_bw_avg REAL NOT NULL DEFAULT 0.0,
load_avg REAL NOT NULL DEFAULT 0.0,
net_bw_avg REAL NOT NULL DEFAULT 0.0,
net_data_vol_total REAL NOT NULL DEFAULT 0.0,
file_bw_avg REAL NOT NULL DEFAULT 0.0,
file_data_vol_total REAL NOT NULL DEFAULT 0.0,
UNIQUE (job_id, cluster, start_time)
);
CREATE TABLE IF NOT EXISTS tag (
id INTEGER PRIMARY KEY,
tag_type VARCHAR(255) NOT NULL,
tag_name VARCHAR(255) NOT NULL,
UNIQUE (tag_type, tag_name));
CREATE TABLE IF NOT EXISTS jobtag (
job_id INTEGER,
tag_id INTEGER,
PRIMARY KEY (job_id, tag_id),
FOREIGN KEY (job_id) REFERENCES job (id) ON DELETE CASCADE,
FOREIGN KEY (tag_id) REFERENCES tag (id) ON DELETE CASCADE);
CREATE TABLE IF NOT EXISTS user (
username varchar(255) PRIMARY KEY NOT NULL,
password varchar(255) DEFAULT NULL,
ldap tinyint NOT NULL DEFAULT 0, /* col called "ldap" for historic reasons, fills the "AuthSource" */
name varchar(255) DEFAULT NULL,
roles varchar(255) NOT NULL DEFAULT "[]",
email varchar(255) DEFAULT NULL);
CREATE TABLE IF NOT EXISTS configuration (
username varchar(255),
confkey varchar(255),
value varchar(255),
PRIMARY KEY (username, confkey),
FOREIGN KEY (username) REFERENCES user (username) ON DELETE CASCADE ON UPDATE NO ACTION);

View File

@@ -1,8 +0,0 @@
DROP INDEX IF EXISTS job_stats;
DROP INDEX IF EXISTS job_by_user;
DROP INDEX IF EXISTS job_by_starttime;
DROP INDEX IF EXISTS job_by_job_id;
DROP INDEX IF EXISTS job_list;
DROP INDEX IF EXISTS job_list_user;
DROP INDEX IF EXISTS job_list_users;
DROP INDEX IF EXISTS job_list_users_start;

View File

@@ -1,8 +0,0 @@
CREATE INDEX IF NOT EXISTS job_stats ON job (cluster,subcluster,user);
CREATE INDEX IF NOT EXISTS job_by_user ON job (user);
CREATE INDEX IF NOT EXISTS job_by_starttime ON job (start_time);
CREATE INDEX IF NOT EXISTS job_by_job_id ON job (job_id);
CREATE INDEX IF NOT EXISTS job_list ON job (cluster, job_state);
CREATE INDEX IF NOT EXISTS job_list_user ON job (user, cluster, job_state);
CREATE INDEX IF NOT EXISTS job_list_users ON job (user, job_state);
CREATE INDEX IF NOT EXISTS job_list_users_start ON job (start_time, user, job_state);

View File

@@ -1 +0,0 @@
ALTER TABLE user DROP COLUMN projects;

View File

@@ -1 +0,0 @@
ALTER TABLE user ADD COLUMN projects varchar(255) NOT NULL DEFAULT "[]";

View File

@@ -1,5 +0,0 @@
ALTER TABLE job
MODIFY `partition` VARCHAR(255) NOT NULL,
MODIFY array_job_id BIGINT NOT NULL,
MODIFY num_hwthreads INT NOT NULL,
MODIFY num_acc INT NOT NULL;

View File

@@ -1,5 +0,0 @@
ALTER TABLE job
MODIFY `partition` VARCHAR(255),
MODIFY array_job_id BIGINT,
MODIFY num_hwthreads INT,
MODIFY num_acc INT;

View File

@@ -1,2 +0,0 @@
ALTER TABLE tag DROP COLUMN insert_time;
ALTER TABLE jobtag DROP COLUMN insert_time;

View File

@@ -1,2 +0,0 @@
ALTER TABLE tag ADD COLUMN insert_time TIMESTAMP DEFAULT CURRENT_TIMESTAMP;
ALTER TABLE jobtag ADD COLUMN insert_time TIMESTAMP DEFAULT CURRENT_TIMESTAMP;

View File

@@ -1 +0,0 @@
ALTER TABLE configuration MODIFY value VARCHAR(255);

View File

@@ -1 +0,0 @@
ALTER TABLE configuration MODIFY value TEXT;

View File

@@ -1,3 +0,0 @@
SET FOREIGN_KEY_CHECKS = 0;
ALTER TABLE tag MODIFY id INTEGER;
SET FOREIGN_KEY_CHECKS = 1;

View File

@@ -1,3 +0,0 @@
SET FOREIGN_KEY_CHECKS = 0;
ALTER TABLE tag MODIFY id INTEGER AUTO_INCREMENT;
SET FOREIGN_KEY_CHECKS = 1;

View File

@@ -1,83 +0,0 @@
ALTER TABLE job DROP energy;
ALTER TABLE job DROP energy_footprint;
ALTER TABLE job ADD COLUMN flops_any_avg;
ALTER TABLE job ADD COLUMN mem_bw_avg;
ALTER TABLE job ADD COLUMN mem_used_max;
ALTER TABLE job ADD COLUMN load_avg;
ALTER TABLE job ADD COLUMN net_bw_avg;
ALTER TABLE job ADD COLUMN net_data_vol_total;
ALTER TABLE job ADD COLUMN file_bw_avg;
ALTER TABLE job ADD COLUMN file_data_vol_total;
UPDATE job SET flops_any_avg = json_extract(footprint, '$.flops_any_avg');
UPDATE job SET mem_bw_avg = json_extract(footprint, '$.mem_bw_avg');
UPDATE job SET mem_used_max = json_extract(footprint, '$.mem_used_max');
UPDATE job SET load_avg = json_extract(footprint, '$.cpu_load_avg');
UPDATE job SET net_bw_avg = json_extract(footprint, '$.net_bw_avg');
UPDATE job SET net_data_vol_total = json_extract(footprint, '$.net_data_vol_total');
UPDATE job SET file_bw_avg = json_extract(footprint, '$.file_bw_avg');
UPDATE job SET file_data_vol_total = json_extract(footprint, '$.file_data_vol_total');
ALTER TABLE job DROP footprint;
-- Do not use reserved keywords anymore
RENAME TABLE hpc_user TO `user`;
ALTER TABLE job RENAME COLUMN hpc_user TO `user`;
ALTER TABLE job RENAME COLUMN cluster_partition TO `partition`;
DROP INDEX IF EXISTS jobs_cluster;
DROP INDEX IF EXISTS jobs_cluster_user;
DROP INDEX IF EXISTS jobs_cluster_project;
DROP INDEX IF EXISTS jobs_cluster_subcluster;
DROP INDEX IF EXISTS jobs_cluster_starttime;
DROP INDEX IF EXISTS jobs_cluster_duration;
DROP INDEX IF EXISTS jobs_cluster_numnodes;
DROP INDEX IF EXISTS jobs_cluster_partition;
DROP INDEX IF EXISTS jobs_cluster_partition_starttime;
DROP INDEX IF EXISTS jobs_cluster_partition_duration;
DROP INDEX IF EXISTS jobs_cluster_partition_numnodes;
DROP INDEX IF EXISTS jobs_cluster_partition_jobstate;
DROP INDEX IF EXISTS jobs_cluster_partition_jobstate_user;
DROP INDEX IF EXISTS jobs_cluster_partition_jobstate_project;
DROP INDEX IF EXISTS jobs_cluster_partition_jobstate_starttime;
DROP INDEX IF EXISTS jobs_cluster_partition_jobstate_duration;
DROP INDEX IF EXISTS jobs_cluster_partition_jobstate_numnodes;
DROP INDEX IF EXISTS jobs_cluster_jobstate;
DROP INDEX IF EXISTS jobs_cluster_jobstate_user;
DROP INDEX IF EXISTS jobs_cluster_jobstate_project;
DROP INDEX IF EXISTS jobs_cluster_jobstate_starttime;
DROP INDEX IF EXISTS jobs_cluster_jobstate_duration;
DROP INDEX IF EXISTS jobs_cluster_jobstate_numnodes;
DROP INDEX IF EXISTS jobs_user;
DROP INDEX IF EXISTS jobs_user_starttime;
DROP INDEX IF EXISTS jobs_user_duration;
DROP INDEX IF EXISTS jobs_user_numnodes;
DROP INDEX IF EXISTS jobs_project;
DROP INDEX IF EXISTS jobs_project_user;
DROP INDEX IF EXISTS jobs_project_starttime;
DROP INDEX IF EXISTS jobs_project_duration;
DROP INDEX IF EXISTS jobs_project_numnodes;
DROP INDEX IF EXISTS jobs_jobstate;
DROP INDEX IF EXISTS jobs_jobstate_user;
DROP INDEX IF EXISTS jobs_jobstate_project;
DROP INDEX IF EXISTS jobs_jobstate_starttime;
DROP INDEX IF EXISTS jobs_jobstate_duration;
DROP INDEX IF EXISTS jobs_jobstate_numnodes;
DROP INDEX IF EXISTS jobs_arrayjobid_starttime;
DROP INDEX IF EXISTS jobs_cluster_arrayjobid_starttime;
DROP INDEX IF EXISTS jobs_starttime;
DROP INDEX IF EXISTS jobs_duration;
DROP INDEX IF EXISTS jobs_numnodes;
DROP INDEX IF EXISTS jobs_duration_starttime;
DROP INDEX IF EXISTS jobs_numnodes_starttime;
DROP INDEX IF EXISTS jobs_numacc_starttime;
DROP INDEX IF EXISTS jobs_energy_starttime;

View File

@@ -1,123 +0,0 @@
DROP INDEX IF EXISTS job_stats ON job;
DROP INDEX IF EXISTS job_by_user ON job;
DROP INDEX IF EXISTS job_by_starttime ON job;
DROP INDEX IF EXISTS job_by_job_id ON job;
DROP INDEX IF EXISTS job_list ON job;
DROP INDEX IF EXISTS job_list_user ON job;
DROP INDEX IF EXISTS job_list_users ON job;
DROP INDEX IF EXISTS job_list_users_start ON job;
ALTER TABLE job ADD COLUMN energy REAL NOT NULL DEFAULT 0.0;
ALTER TABLE job ADD COLUMN energy_footprint JSON;
ALTER TABLE job ADD COLUMN footprint JSON;
ALTER TABLE tag ADD COLUMN tag_scope TEXT NOT NULL DEFAULT 'global';
-- Do not use reserved keywords anymore
RENAME TABLE `user` TO hpc_user;
ALTER TABLE job RENAME COLUMN `user` TO hpc_user;
ALTER TABLE job RENAME COLUMN `partition` TO cluster_partition;
ALTER TABLE job MODIFY COLUMN cluster VARCHAR(50);
ALTER TABLE job MODIFY COLUMN hpc_user VARCHAR(50);
ALTER TABLE job MODIFY COLUMN subcluster VARCHAR(50);
ALTER TABLE job MODIFY COLUMN project VARCHAR(50);
ALTER TABLE job MODIFY COLUMN cluster_partition VARCHAR(50);
ALTER TABLE job MODIFY COLUMN job_state VARCHAR(25);
UPDATE job SET footprint = '{"flops_any_avg": 0.0}';
UPDATE job SET footprint = json_replace(footprint, '$.flops_any_avg', job.flops_any_avg);
UPDATE job SET footprint = json_insert(footprint, '$.mem_bw_avg', job.mem_bw_avg);
UPDATE job SET footprint = json_insert(footprint, '$.mem_used_max', job.mem_used_max);
UPDATE job SET footprint = json_insert(footprint, '$.cpu_load_avg', job.load_avg);
UPDATE job SET footprint = json_insert(footprint, '$.net_bw_avg', job.net_bw_avg) WHERE job.net_bw_avg != 0;
UPDATE job SET footprint = json_insert(footprint, '$.net_data_vol_total', job.net_data_vol_total) WHERE job.net_data_vol_total != 0;
UPDATE job SET footprint = json_insert(footprint, '$.file_bw_avg', job.file_bw_avg) WHERE job.file_bw_avg != 0;
UPDATE job SET footprint = json_insert(footprint, '$.file_data_vol_total', job.file_data_vol_total) WHERE job.file_data_vol_total != 0;
ALTER TABLE job DROP flops_any_avg;
ALTER TABLE job DROP mem_bw_avg;
ALTER TABLE job DROP mem_used_max;
ALTER TABLE job DROP load_avg;
ALTER TABLE job DROP net_bw_avg;
ALTER TABLE job DROP net_data_vol_total;
ALTER TABLE job DROP file_bw_avg;
ALTER TABLE job DROP file_data_vol_total;
-- Indices for: Single filters, combined filters, sorting, sorting with filters
-- Cluster Filter
CREATE INDEX IF NOT EXISTS jobs_cluster ON job (cluster);
CREATE INDEX IF NOT EXISTS jobs_cluster_user ON job (cluster, hpc_user);
CREATE INDEX IF NOT EXISTS jobs_cluster_project ON job (cluster, project);
CREATE INDEX IF NOT EXISTS jobs_cluster_subcluster ON job (cluster, subcluster);
-- Cluster Filter Sorting
CREATE INDEX IF NOT EXISTS jobs_cluster_starttime ON job (cluster, start_time);
CREATE INDEX IF NOT EXISTS jobs_cluster_duration ON job (cluster, duration);
CREATE INDEX IF NOT EXISTS jobs_cluster_numnodes ON job (cluster, num_nodes);
-- Cluster+Partition Filter
CREATE INDEX IF NOT EXISTS jobs_cluster_partition ON job (cluster, cluster_partition);
-- Cluster+Partition Filter Sorting
CREATE INDEX IF NOT EXISTS jobs_cluster_partition_starttime ON job (cluster, cluster_partition, start_time);
CREATE INDEX IF NOT EXISTS jobs_cluster_partition_duration ON job (cluster, cluster_partition, duration);
CREATE INDEX IF NOT EXISTS jobs_cluster_partition_numnodes ON job (cluster, cluster_partition, num_nodes);
-- Cluster+Partition+Jobstate Filter
CREATE INDEX IF NOT EXISTS jobs_cluster_partition_jobstate ON job (cluster, cluster_partition, job_state);
CREATE INDEX IF NOT EXISTS jobs_cluster_partition_jobstate_user ON job (cluster, cluster_partition, job_state, hpc_user);
CREATE INDEX IF NOT EXISTS jobs_cluster_partition_jobstate_project ON job (cluster, cluster_partition, job_state, project);
-- Cluster+Partition+Jobstate Filter Sorting
CREATE INDEX IF NOT EXISTS jobs_cluster_partition_jobstate_starttime ON job (cluster, cluster_partition, job_state, start_time);
CREATE INDEX IF NOT EXISTS jobs_cluster_partition_jobstate_duration ON job (cluster, cluster_partition, job_state, duration);
CREATE INDEX IF NOT EXISTS jobs_cluster_partition_jobstate_numnodes ON job (cluster, cluster_partition, job_state, num_nodes);
-- Cluster+JobState Filter
CREATE INDEX IF NOT EXISTS jobs_cluster_jobstate ON job (cluster, job_state);
CREATE INDEX IF NOT EXISTS jobs_cluster_jobstate_user ON job (cluster, job_state, hpc_user);
CREATE INDEX IF NOT EXISTS jobs_cluster_jobstate_project ON job (cluster, job_state, project);
-- Cluster+JobState Filter Sorting
CREATE INDEX IF NOT EXISTS jobs_cluster_jobstate_starttime ON job (cluster, job_state, start_time);
CREATE INDEX IF NOT EXISTS jobs_cluster_jobstate_duration ON job (cluster, job_state, duration);
CREATE INDEX IF NOT EXISTS jobs_cluster_jobstate_numnodes ON job (cluster, job_state, num_nodes);
-- User Filter
CREATE INDEX IF NOT EXISTS jobs_user ON job (hpc_user);
-- User Filter Sorting
CREATE INDEX IF NOT EXISTS jobs_user_starttime ON job (hpc_user, start_time);
CREATE INDEX IF NOT EXISTS jobs_user_duration ON job (hpc_user, duration);
CREATE INDEX IF NOT EXISTS jobs_user_numnodes ON job (hpc_user, num_nodes);
-- Project Filter
CREATE INDEX IF NOT EXISTS jobs_project ON job (project);
CREATE INDEX IF NOT EXISTS jobs_project_user ON job (project, hpc_user);
-- Project Filter Sorting
CREATE INDEX IF NOT EXISTS jobs_project_starttime ON job (project, start_time);
CREATE INDEX IF NOT EXISTS jobs_project_duration ON job (project, duration);
CREATE INDEX IF NOT EXISTS jobs_project_numnodes ON job (project, num_nodes);
-- JobState Filter
CREATE INDEX IF NOT EXISTS jobs_jobstate ON job (job_state);
CREATE INDEX IF NOT EXISTS jobs_jobstate_user ON job (job_state, hpc_user);
CREATE INDEX IF NOT EXISTS jobs_jobstate_project ON job (job_state, project);
CREATE INDEX IF NOT EXISTS jobs_jobstate_cluster ON job (job_state, cluster);
-- JobState Filter Sorting
CREATE INDEX IF NOT EXISTS jobs_jobstate_starttime ON job (job_state, start_time);
CREATE INDEX IF NOT EXISTS jobs_jobstate_duration ON job (job_state, duration);
CREATE INDEX IF NOT EXISTS jobs_jobstate_numnodes ON job (job_state, num_nodes);
-- ArrayJob Filter
CREATE INDEX IF NOT EXISTS jobs_arrayjobid_starttime ON job (array_job_id, start_time);
CREATE INDEX IF NOT EXISTS jobs_cluster_arrayjobid_starttime ON job (cluster, array_job_id, start_time);
-- Sorting without active filters
CREATE INDEX IF NOT EXISTS jobs_starttime ON job (start_time);
CREATE INDEX IF NOT EXISTS jobs_duration ON job (duration);
CREATE INDEX IF NOT EXISTS jobs_numnodes ON job (num_nodes);
-- Single filters with default starttime sorting
CREATE INDEX IF NOT EXISTS jobs_duration_starttime ON job (duration, start_time);
CREATE INDEX IF NOT EXISTS jobs_numnodes_starttime ON job (num_nodes, start_time);
CREATE INDEX IF NOT EXISTS jobs_numacc_starttime ON job (num_acc, start_time);
CREATE INDEX IF NOT EXISTS jobs_energy_starttime ON job (energy, start_time);
-- Optimize DB index usage

View File

@@ -118,104 +118,116 @@ DROP TABLE lookup_exclusive;
DROP TABLE job; -- Deletes All Existing 'job' Indices; Recreate after Renaming
ALTER TABLE job_new RENAME TO job;
-- Recreate Indices from 08_add-footprint, include new submit_time indices
-- Recreate Indices from 08_add-footprint; include new 'shared' column
-- Cluster Filter
CREATE INDEX IF NOT EXISTS jobs_cluster ON job (cluster);
CREATE INDEX IF NOT EXISTS jobs_cluster_user ON job (cluster, hpc_user);
CREATE INDEX IF NOT EXISTS jobs_cluster_project ON job (cluster, project);
CREATE INDEX IF NOT EXISTS jobs_cluster_subcluster ON job (cluster, subcluster);
-- Cluster Filter Sorting
CREATE INDEX IF NOT EXISTS jobs_cluster_starttime ON job (cluster, start_time);
CREATE INDEX IF NOT EXISTS jobs_cluster_submittime ON job (cluster, submit_time);
CREATE INDEX IF NOT EXISTS jobs_cluster_duration ON job (cluster, duration);
CREATE INDEX IF NOT EXISTS jobs_cluster_numnodes ON job (cluster, num_nodes);
CREATE INDEX IF NOT EXISTS jobs_cluster_numhwthreads ON job (cluster, num_hwthreads);
CREATE INDEX IF NOT EXISTS jobs_cluster_numacc ON job (cluster, num_acc);
CREATE INDEX IF NOT EXISTS jobs_cluster_energy ON job (cluster, energy);
-- Cluster Time Filter Sorting
CREATE INDEX IF NOT EXISTS jobs_cluster_duration_starttime ON job (cluster, duration, start_time);
CREATE INDEX IF NOT EXISTS jobs_cluster_starttime_duration ON job (cluster, start_time, duration);
-- Cluster+Partition Filter
CREATE INDEX IF NOT EXISTS jobs_cluster_partition ON job (cluster, cluster_partition);
CREATE INDEX IF NOT EXISTS jobs_cluster_partition_user ON job (cluster, cluster_partition, hpc_user);
CREATE INDEX IF NOT EXISTS jobs_cluster_partition_project ON job (cluster, cluster_partition, project);
CREATE INDEX IF NOT EXISTS jobs_cluster_partition_jobstate ON job (cluster, cluster_partition, job_state);
CREATE INDEX IF NOT EXISTS jobs_cluster_partition_shared ON job (cluster, cluster_partition, shared);
-- Cluster+Partition Filter Sorting
CREATE INDEX IF NOT EXISTS jobs_cluster_partition_starttime ON job (cluster, cluster_partition, start_time);
CREATE INDEX IF NOT EXISTS jobs_cluster_partition_submittime ON job (cluster, cluster_partition, submit_time);
CREATE INDEX IF NOT EXISTS jobs_cluster_partition_duration ON job (cluster, cluster_partition, duration);
CREATE INDEX IF NOT EXISTS jobs_cluster_partition_numnodes ON job (cluster, cluster_partition, num_nodes);
CREATE INDEX IF NOT EXISTS jobs_cluster_partition_numhwthreads ON job (cluster, cluster_partition, num_hwthreads);
CREATE INDEX IF NOT EXISTS jobs_cluster_partition_numacc ON job (cluster, cluster_partition, num_acc);
CREATE INDEX IF NOT EXISTS jobs_cluster_partition_energy ON job (cluster, cluster_partition, energy);
-- Cluster+Partition+Jobstate Filter
CREATE INDEX IF NOT EXISTS jobs_cluster_partition_jobstate ON job (cluster, cluster_partition, job_state);
CREATE INDEX IF NOT EXISTS jobs_cluster_partition_jobstate_user ON job (cluster, cluster_partition, job_state, hpc_user);
CREATE INDEX IF NOT EXISTS jobs_cluster_partition_jobstate_project ON job (cluster, cluster_partition, job_state, project);
-- Cluster+Partition+Jobstate Filter Sorting
CREATE INDEX IF NOT EXISTS jobs_cluster_partition_jobstate_starttime ON job (cluster, cluster_partition, job_state, start_time);
CREATE INDEX IF NOT EXISTS jobs_cluster_partition_jobstate_submittime ON job (cluster, cluster_partition, job_state, submit_time);
CREATE INDEX IF NOT EXISTS jobs_cluster_partition_jobstate_duration ON job (cluster, cluster_partition, job_state, duration);
CREATE INDEX IF NOT EXISTS jobs_cluster_partition_jobstate_numnodes ON job (cluster, cluster_partition, job_state, num_nodes);
CREATE INDEX IF NOT EXISTS jobs_cluster_partition_jobstate_numhwthreads ON job (cluster, cluster_partition, job_state, num_hwthreads);
CREATE INDEX IF NOT EXISTS jobs_cluster_partition_jobstate_numacc ON job (cluster, cluster_partition, job_state, num_acc);
CREATE INDEX IF NOT EXISTS jobs_cluster_partition_jobstate_energy ON job (cluster, cluster_partition, job_state, energy);
-- Cluster+Partition Time Filter Sorting
CREATE INDEX IF NOT EXISTS jobs_cluster_partition_duration_starttime ON job (cluster, cluster_partition, duration, start_time);
CREATE INDEX IF NOT EXISTS jobs_cluster_partition_starttime_duration ON job (cluster, cluster_partition, start_time, duration);
-- Cluster+JobState Filter
CREATE INDEX IF NOT EXISTS jobs_cluster_jobstate ON job (cluster, job_state);
CREATE INDEX IF NOT EXISTS jobs_cluster_jobstate_user ON job (cluster, job_state, hpc_user);
CREATE INDEX IF NOT EXISTS jobs_cluster_jobstate_project ON job (cluster, job_state, project);
-- Cluster+JobState Filter Sorting
CREATE INDEX IF NOT EXISTS jobs_cluster_jobstate_starttime ON job (cluster, job_state, start_time);
CREATE INDEX IF NOT EXISTS jobs_cluster_jobstate_submittime ON job (cluster, job_state, submit_time);
CREATE INDEX IF NOT EXISTS jobs_cluster_jobstate_duration ON job (cluster, job_state, duration);
CREATE INDEX IF NOT EXISTS jobs_cluster_jobstate_numnodes ON job (cluster, job_state, num_nodes);
CREATE INDEX IF NOT EXISTS jobs_cluster_jobstate_numhwthreads ON job (cluster, job_state, num_hwthreads);
CREATE INDEX IF NOT EXISTS jobs_cluster_jobstate_numacc ON job (cluster, job_state, num_acc);
CREATE INDEX IF NOT EXISTS jobs_cluster_jobstate_energy ON job (cluster, job_state, energy);
-- Cluster+JobState Time Filter Sorting
CREATE INDEX IF NOT EXISTS jobs_cluster_jobstate_starttime_duration ON job (cluster, job_state, start_time, duration);
CREATE INDEX IF NOT EXISTS jobs_cluster_jobstate_duration_starttime ON job (cluster, job_state, duration, start_time);
-- Cluster+Shared Filter
CREATE INDEX IF NOT EXISTS jobs_cluster_shared_user ON job (cluster, shared, hpc_user);
CREATE INDEX IF NOT EXISTS jobs_cluster_shared_project ON job (cluster, shared, project);
-- Cluster+Shared Filter Sorting
CREATE INDEX IF NOT EXISTS jobs_cluster_shared_numnodes ON job (cluster, shared, num_nodes);
CREATE INDEX IF NOT EXISTS jobs_cluster_shared_numhwthreads ON job (cluster, shared, num_hwthreads);
CREATE INDEX IF NOT EXISTS jobs_cluster_shared_numacc ON job (cluster, shared, num_acc);
CREATE INDEX IF NOT EXISTS jobs_cluster_shared_energy ON job (cluster, shared, energy);
-- Cluster+Shared Time Filter Sorting
CREATE INDEX IF NOT EXISTS jobs_cluster_shared_starttime_duration ON job (cluster, shared, start_time, duration);
CREATE INDEX IF NOT EXISTS jobs_cluster_shared_duration_starttime ON job (cluster, shared, duration, start_time);
-- User Filter
CREATE INDEX IF NOT EXISTS jobs_user ON job (hpc_user);
-- User Filter Sorting
CREATE INDEX IF NOT EXISTS jobs_user_starttime ON job (hpc_user, start_time);
CREATE INDEX IF NOT EXISTS jobs_user_duration ON job (hpc_user, duration);
CREATE INDEX IF NOT EXISTS jobs_user_numnodes ON job (hpc_user, num_nodes);
CREATE INDEX IF NOT EXISTS jobs_user_numhwthreads ON job (hpc_user, num_hwthreads);
CREATE INDEX IF NOT EXISTS jobs_user_numacc ON job (hpc_user, num_acc);
CREATE INDEX IF NOT EXISTS jobs_user_energy ON job (hpc_user, energy);
-- User Time Filter Sorting
CREATE INDEX IF NOT EXISTS jobs_user_starttime_duration ON job (hpc_user, start_time, duration);
CREATE INDEX IF NOT EXISTS jobs_user_duration_starttime ON job (hpc_user, duration, start_time);
-- Project Filter
CREATE INDEX IF NOT EXISTS jobs_project ON job (project);
CREATE INDEX IF NOT EXISTS jobs_project_user ON job (project, hpc_user);
-- Project Filter Sorting
CREATE INDEX IF NOT EXISTS jobs_project_starttime ON job (project, start_time);
CREATE INDEX IF NOT EXISTS jobs_project_duration ON job (project, duration);
CREATE INDEX IF NOT EXISTS jobs_project_numnodes ON job (project, num_nodes);
CREATE INDEX IF NOT EXISTS jobs_project_numhwthreads ON job (project, num_hwthreads);
CREATE INDEX IF NOT EXISTS jobs_project_numacc ON job (project, num_acc);
CREATE INDEX IF NOT EXISTS jobs_project_energy ON job (project, energy);
-- Project Time Filter Sorting
CREATE INDEX IF NOT EXISTS jobs_project_starttime_duration ON job (project, start_time, duration);
CREATE INDEX IF NOT EXISTS jobs_project_duration_starttime ON job (project, duration, start_time);
-- JobState Filter
CREATE INDEX IF NOT EXISTS jobs_jobstate ON job (job_state);
CREATE INDEX IF NOT EXISTS jobs_jobstate_user ON job (job_state, hpc_user);
CREATE INDEX IF NOT EXISTS jobs_jobstate_project ON job (job_state, project);
CREATE INDEX IF NOT EXISTS jobs_jobstate_cluster ON job (job_state, cluster);
-- JobState Filter Sorting
CREATE INDEX IF NOT EXISTS jobs_jobstate_starttime ON job (job_state, start_time);
CREATE INDEX IF NOT EXISTS jobs_jobstate_duration ON job (job_state, duration);
CREATE INDEX IF NOT EXISTS jobs_jobstate_numnodes ON job (job_state, num_nodes);
CREATE INDEX IF NOT EXISTS jobs_jobstate_numhwthreads ON job (job_state, num_hwthreads);
CREATE INDEX IF NOT EXISTS jobs_jobstate_numacc ON job (job_state, num_acc);
CREATE INDEX IF NOT EXISTS jobs_jobstate_energy ON job (job_state, energy);
-- JobState Time Filter Sorting
CREATE INDEX IF NOT EXISTS jobs_jobstate_starttime_duration ON job (job_state, start_time, duration);
CREATE INDEX IF NOT EXISTS jobs_jobstate_duration_starttime ON job (job_state, duration, start_time);
-- Shared Filter
CREATE INDEX IF NOT EXISTS jobs_shared_user ON job (shared, hpc_user);
CREATE INDEX IF NOT EXISTS jobs_shared_project ON job (shared, project);
-- Shared Filter Sorting
CREATE INDEX IF NOT EXISTS jobs_shared_numnodes ON job (shared, num_nodes);
CREATE INDEX IF NOT EXISTS jobs_shared_numhwthreads ON job (shared, num_hwthreads);
CREATE INDEX IF NOT EXISTS jobs_shared_numacc ON job (shared, num_acc);
CREATE INDEX IF NOT EXISTS jobs_shared_energy ON job (shared, energy);
-- Shared Time Filter Sorting
CREATE INDEX IF NOT EXISTS jobs_shared_starttime_duration ON job (shared, start_time, duration);
CREATE INDEX IF NOT EXISTS jobs_shared_duration_starttime ON job (shared, duration, start_time);
-- ArrayJob Filter
CREATE INDEX IF NOT EXISTS jobs_arrayjobid_starttime ON job (array_job_id, start_time);
CREATE INDEX IF NOT EXISTS jobs_cluster_arrayjobid_starttime ON job (cluster, array_job_id, start_time);
-- Sorting without active filters
CREATE INDEX IF NOT EXISTS jobs_starttime ON job (start_time);
CREATE INDEX IF NOT EXISTS jobs_duration ON job (duration);
CREATE INDEX IF NOT EXISTS jobs_numnodes ON job (num_nodes);
CREATE INDEX IF NOT EXISTS jobs_numhwthreads ON job (num_hwthreads);
CREATE INDEX IF NOT EXISTS jobs_numacc ON job (num_acc);
CREATE INDEX IF NOT EXISTS jobs_energy ON job (energy);
-- Single filters with default starttime sorting
CREATE INDEX IF NOT EXISTS jobs_duration_starttime ON job (duration, start_time);
CREATE INDEX IF NOT EXISTS jobs_numnodes_starttime ON job (num_nodes, start_time);
@@ -223,6 +235,22 @@ CREATE INDEX IF NOT EXISTS jobs_numhwthreads_starttime ON job (num_hwthreads, st
CREATE INDEX IF NOT EXISTS jobs_numacc_starttime ON job (num_acc, start_time);
CREATE INDEX IF NOT EXISTS jobs_energy_starttime ON job (energy, start_time);
-- Single filters with duration sorting
CREATE INDEX IF NOT EXISTS jobs_starttime_duration ON job (start_time, duration);
CREATE INDEX IF NOT EXISTS jobs_numnodes_duration ON job (num_nodes, duration);
CREATE INDEX IF NOT EXISTS jobs_numhwthreads_duration ON job (num_hwthreads, duration);
CREATE INDEX IF NOT EXISTS jobs_numacc_duration ON job (num_acc, duration);
CREATE INDEX IF NOT EXISTS jobs_energy_duration ON job (energy, duration);
-- Backup Indices For High Variety Columns
CREATE INDEX IF NOT EXISTS jobs_starttime ON job (start_time);
CREATE INDEX IF NOT EXISTS jobs_duration ON job (duration);
-- Notes:
-- Cluster+Partition+Jobstate Filter: Tested -> Full Array Of Combinations Not Required
-- Cluster+JobState+Shared Filter: Tested -> No further timing improvement
-- JobState+Shared Filter: Tested -> No further timing improvement
-- Optimize DB index usage
PRAGMA optimize;
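As a quick sanity check outside the migration itself, SQLite's EXPLAIN QUERY PLAN shows whether a filtered and sorted job query actually uses one of the composite indices above; the cluster and state values below are illustrative:
-- Illustrative check, not part of the migration: if the planner picks the
-- composite index, the plan reports
-- SEARCH job USING INDEX jobs_cluster_jobstate_starttime instead of SCAN job.
EXPLAIN QUERY PLAN
SELECT id FROM job
WHERE cluster = 'fritz' AND job_state = 'completed'
ORDER BY start_time DESC;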

View File

@@ -23,6 +23,7 @@ CREATE TABLE "node_state" (
CHECK (health_state IN (
'full', 'partial', 'failed'
)),
health_metrics TEXT, -- JSON array of strings
node_id INTEGER,
FOREIGN KEY (node_id) REFERENCES node (id)
);
@@ -33,12 +34,11 @@ CREATE INDEX IF NOT EXISTS nodes_cluster_subcluster ON node (cluster, subcluster
-- Add NEW Indices For New Node_State Table Fields
CREATE INDEX IF NOT EXISTS nodestates_timestamp ON node_state (time_stamp);
CREATE INDEX IF NOT EXISTS nodestates_state ON node_state (node_state);
CREATE INDEX IF NOT EXISTS nodestates_health ON node_state (health_state);
CREATE INDEX IF NOT EXISTS nodestates_state_timestamp ON node_state (node_state, time_stamp);
CREATE INDEX IF NOT EXISTS nodestates_health_timestamp ON node_state (health_state, time_stamp);
CREATE INDEX IF NOT EXISTS nodestates_nodeid_state ON node_state (node_id, node_state);
CREATE INDEX IF NOT EXISTS nodestates_nodeid_health ON node_state (node_id, health_state);
CREATE INDEX IF NOT EXISTS nodestates_nodeid_timestamp ON node_state (node_id, time_stamp DESC);
-- Add NEW Indices For Increased Amounts of Tags
CREATE INDEX IF NOT EXISTS tags_jobid ON jobtag (job_id);

View File

@@ -10,14 +10,17 @@ import (
"database/sql"
"encoding/json"
"fmt"
"slices"
"sort"
"strings"
"sync"
"time"
"github.com/ClusterCockpit/cc-backend/internal/graph/model"
"github.com/ClusterCockpit/cc-backend/pkg/archive"
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
"github.com/ClusterCockpit/cc-lib/lrucache"
"github.com/ClusterCockpit/cc-lib/schema"
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
"github.com/ClusterCockpit/cc-lib/v2/lrucache"
"github.com/ClusterCockpit/cc-lib/v2/schema"
sq "github.com/Masterminds/squirrel"
"github.com/jmoiron/sqlx"
)
@@ -49,6 +52,38 @@ func GetNodeRepository() *NodeRepository {
return nodeRepoInstance
}
// latestStateCondition returns a squirrel expression that restricts node_state
// rows to the latest per node_id using a correlated subquery.
// Requires the query to join node and node_state tables.
func latestStateCondition() sq.Sqlizer {
return sq.Expr(
"node_state.id = (SELECT ns2.id FROM node_state ns2 WHERE ns2.node_id = node.id ORDER BY ns2.time_stamp DESC LIMIT 1)",
)
}
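Since the condition is a raw expression, it composes with any builder that joins the two tables; a small illustrative sketch (not from this diff) of inspecting the SQL it produces:
// Illustrative only: build a latest-state query and inspect the generated SQL.
q := sq.Select("node.hostname", "node_state.node_state").
From("node").
Join("node_state ON node_state.node_id = node.id").
Where(latestStateCondition())
sqlStr, args, err := q.ToSql()
// sqlStr contains the correlated subquery:
//   WHERE node_state.id = (SELECT ns2.id FROM node_state ns2
//     WHERE ns2.node_id = node.id ORDER BY ns2.time_stamp DESC LIMIT 1)
_, _, _ = sqlStr, args, err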
// applyNodeFilters applies common NodeFilter conditions to a query that joins
// the node and node_state tables with latestStateCondition.
func applyNodeFilters(query sq.SelectBuilder, filters []*model.NodeFilter) sq.SelectBuilder {
for _, f := range filters {
if f.Cluster != nil {
query = buildStringCondition("node.cluster", f.Cluster, query)
}
if f.SubCluster != nil {
query = buildStringCondition("node.subcluster", f.SubCluster, query)
}
if f.Hostname != nil {
query = buildStringCondition("node.hostname", f.Hostname, query)
}
if f.SchedulerState != nil {
query = query.Where("node_state.node_state = ?", f.SchedulerState)
}
if f.HealthState != nil {
query = query.Where("node_state.health_state = ?", f.HealthState)
}
}
return query
}
func (r *NodeRepository) FetchMetadata(hostname string, cluster string) (map[string]string, error) {
start := time.Now()
@@ -79,17 +114,16 @@ func (r *NodeRepository) FetchMetadata(hostname string, cluster string) (map[str
func (r *NodeRepository) GetNode(hostname string, cluster string, withMeta bool) (*schema.Node, error) {
node := &schema.Node{}
var timestamp int
if err := sq.Select("node.hostname", "node.cluster", "node.subcluster", "node_state.node_state",
"node_state.health_state", "MAX(node_state.time_stamp) as time").
From("node_state").
Join("node ON node_state.node_id = node.id").
if err := sq.Select("node.hostname", "node.cluster", "node.subcluster",
"node_state.node_state", "node_state.health_state").
From("node").
Join("node_state ON node_state.node_id = node.id").
Where(latestStateCondition()).
Where("node.hostname = ?", hostname).
Where("node.cluster = ?", cluster).
GroupBy("node_state.node_id").
RunWith(r.DB).
QueryRow().Scan(&node.Hostname, &node.Cluster, &node.SubCluster, &node.NodeState, &node.HealthState, &timestamp); err != nil {
cclog.Warnf("Error while querying node '%s' at time '%d' from database: %v", hostname, timestamp, err)
QueryRow().Scan(&node.Hostname, &node.Cluster, &node.SubCluster, &node.NodeState, &node.HealthState); err != nil {
cclog.Warnf("Error while querying node '%s' from database: %v", hostname, err)
return nil, err
}
@@ -106,31 +140,28 @@ func (r *NodeRepository) GetNode(hostname string, cluster string, withMeta bool)
return node, nil
}
func (r *NodeRepository) GetNodeById(id int64, withMeta bool) (*schema.Node, error) {
func (r *NodeRepository) GetNodeByID(id int64, withMeta bool) (*schema.Node, error) {
node := &schema.Node{}
var timestamp int
if err := sq.Select("node.hostname", "node.cluster", "node.subcluster", "node_state.node_state",
"node_state.health_state", "MAX(node_state.time_stamp) as time").
From("node_state").
Join("node ON node_state.node_id = node.id").
if err := sq.Select("node.hostname", "node.cluster", "node.subcluster",
"node_state.node_state", "node_state.health_state").
From("node").
Join("node_state ON node_state.node_id = node.id").
Where(latestStateCondition()).
Where("node.id = ?", id).
GroupBy("node_state.node_id").
RunWith(r.DB).
QueryRow().Scan(&node.Hostname, &node.Cluster, &node.SubCluster, &node.NodeState, &node.HealthState, &timestamp); err != nil {
cclog.Warnf("Error while querying node ID '%d' at time '%d' from database: %v", id, timestamp, err)
QueryRow().Scan(&node.Hostname, &node.Cluster, &node.SubCluster, &node.NodeState, &node.HealthState); err != nil {
cclog.Warnf("Error while querying node ID '%d' from database: %v", id, err)
return nil, err
}
// NEEDS METADATA BY ID
// if withMeta {
// var err error
// var meta map[string]string
// if meta, err = r.FetchMetadata(hostname, cluster); err != nil {
// cclog.Warnf("Error while fetching metadata for node '%s'", hostname)
// return nil, err
// }
// node.MetaData = meta
// }
if withMeta {
meta, metaErr := r.FetchMetadata(node.Hostname, node.Cluster)
if metaErr != nil {
cclog.Warnf("Error while fetching metadata for node ID '%d': %v", id, metaErr)
return nil, metaErr
}
node.MetaData = meta
}
return node, nil
}
@@ -166,9 +197,10 @@ func (r *NodeRepository) AddNode(node *schema.NodeDB) (int64, error) {
}
const NamedNodeStateInsert string = `
INSERT INTO node_state (time_stamp, node_state, health_state, cpus_allocated,
memory_allocated, gpus_allocated, jobs_running, node_id)
VALUES (:time_stamp, :node_state, :health_state, :cpus_allocated, :memory_allocated, :gpus_allocated, :jobs_running, :node_id);`
INSERT INTO node_state (time_stamp, node_state, health_state, health_metrics,
cpus_allocated, memory_allocated, gpus_allocated, jobs_running, node_id)
VALUES (:time_stamp, :node_state, :health_state, :health_metrics,
:cpus_allocated, :memory_allocated, :gpus_allocated, :jobs_running, :node_id);`
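Because the statement uses named parameters, it pairs with sqlx's NamedExec; a minimal sketch of a call site, assuming NodeStateDB's db struct tags match the named columns (including a node_id-tagged field, which this diff does not show):
// Minimal sketch, not the repository's actual call site.
func insertNodeState(db *sqlx.DB, ns *schema.NodeStateDB) error {
if _, err := db.NamedExec(NamedNodeStateInsert, ns); err != nil {
return fmt.Errorf("inserting node state: %w", err)
}
return nil
}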
// TODO: Add real Monitoring Health State
@@ -194,8 +226,7 @@ func (r *NodeRepository) UpdateNodeState(hostname string, cluster string, nodeSt
return err
}
cclog.Infof("Added node '%s' to database", hostname)
return nil
cclog.Debugf("Added node '%s' to database", hostname)
} else {
cclog.Warnf("Error while querying node '%v' from database", id)
return err
@@ -209,7 +240,7 @@ func (r *NodeRepository) UpdateNodeState(hostname string, cluster string, nodeSt
cclog.Errorf("Error while adding node state for '%v' to database", hostname)
return err
}
cclog.Infof("Updated node state for '%s' in database", hostname)
cclog.Debugf("Updated node state for '%s' in database", hostname)
return nil
}
@@ -222,6 +253,77 @@ func (r *NodeRepository) UpdateNodeState(hostname string, cluster string, nodeSt
// return nil
// }
// NodeStateWithNode combines a node state row with denormalized node info.
type NodeStateWithNode struct {
ID int64 `db:"id"`
TimeStamp int64 `db:"time_stamp"`
NodeState string `db:"node_state"`
HealthState string `db:"health_state"`
HealthMetrics string `db:"health_metrics"`
CpusAllocated int `db:"cpus_allocated"`
MemoryAllocated int64 `db:"memory_allocated"`
GpusAllocated int `db:"gpus_allocated"`
JobsRunning int `db:"jobs_running"`
Hostname string `db:"hostname"`
Cluster string `db:"cluster"`
SubCluster string `db:"subcluster"`
}
// FindNodeStatesBefore returns all node_state rows with time_stamp < cutoff,
// joined with node info for denormalized archiving.
func (r *NodeRepository) FindNodeStatesBefore(cutoff int64) ([]NodeStateWithNode, error) {
rows, err := sq.Select(
"node_state.id", "node_state.time_stamp", "node_state.node_state",
"node_state.health_state", "node_state.health_metrics",
"node_state.cpus_allocated", "node_state.memory_allocated",
"node_state.gpus_allocated", "node_state.jobs_running",
"node.hostname", "node.cluster", "node.subcluster",
).
From("node_state").
Join("node ON node_state.node_id = node.id").
Where(sq.Lt{"node_state.time_stamp": cutoff}).
Where("node_state.id NOT IN (SELECT ns2.id FROM node_state ns2 WHERE ns2.time_stamp = (SELECT MAX(ns3.time_stamp) FROM node_state ns3 WHERE ns3.node_id = ns2.node_id))").
OrderBy("node.cluster ASC", "node.subcluster ASC", "node.hostname ASC", "node_state.time_stamp ASC").
RunWith(r.DB).Query()
if err != nil {
return nil, err
}
defer rows.Close()
var result []NodeStateWithNode
for rows.Next() {
var ns NodeStateWithNode
var healthMetrics sql.NullString
if err := rows.Scan(&ns.ID, &ns.TimeStamp, &ns.NodeState,
&ns.HealthState, &healthMetrics,
&ns.CpusAllocated, &ns.MemoryAllocated,
&ns.GpusAllocated, &ns.JobsRunning,
&ns.Hostname, &ns.Cluster, &ns.SubCluster); err != nil {
return nil, err
}
ns.HealthMetrics = healthMetrics.String
result = append(result, ns)
}
return result, nil
}
// DeleteNodeStatesBefore removes node_state rows with time_stamp < cutoff,
// but always preserves the row with the latest timestamp per node_id.
func (r *NodeRepository) DeleteNodeStatesBefore(cutoff int64) (int64, error) {
res, err := r.DB.Exec(
`DELETE FROM node_state WHERE time_stamp < ?
AND id NOT IN (
SELECT id FROM node_state ns2
WHERE ns2.time_stamp = (SELECT MAX(ns3.time_stamp) FROM node_state ns3 WHERE ns3.node_id = ns2.node_id)
)`,
cutoff,
)
if err != nil {
return 0, err
}
return res.RowsAffected()
}
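Together the two functions support an archive-then-prune retention pass; a hypothetical periodic job might look like the sketch below, where the retention window and the archiveStates sink are made up for illustration:
// Hypothetical retention pass, illustrative only: archive node states older
// than the cutoff, then delete them. archiveStates is a made-up sink.
func pruneNodeStates(repo *NodeRepository, retention time.Duration,
archiveStates func([]NodeStateWithNode) error) error {
cutoff := time.Now().Add(-retention).Unix()
old, err := repo.FindNodeStatesBefore(cutoff)
if err != nil {
return err
}
if len(old) > 0 {
if err := archiveStates(old); err != nil {
return err
}
}
// Both queries skip the newest row per node, so the current state survives.
deleted, err := repo.DeleteNodeStatesBefore(cutoff)
if err != nil {
return err
}
cclog.Debugf("pruned %d node_state rows older than %v", deleted, retention)
return nil
}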
func (r *NodeRepository) DeleteNode(id int64) error {
_, err := r.DB.Exec(`DELETE FROM node WHERE node.id = ?`, id)
if err != nil {
@@ -241,38 +343,17 @@ func (r *NodeRepository) QueryNodes(
order *model.OrderByInput, // Currently unused!
) ([]*schema.Node, error) {
query, qerr := AccessCheck(ctx,
sq.Select("hostname", "cluster", "subcluster", "node_state", "health_state", "MAX(time_stamp) as time").
sq.Select("node.hostname", "node.cluster", "node.subcluster",
"node_state.node_state", "node_state.health_state").
From("node").
Join("node_state ON node_state.node_id = node.id"))
Join("node_state ON node_state.node_id = node.id").
Where(latestStateCondition()))
if qerr != nil {
return nil, qerr
}
for _, f := range filters {
if f.Cluster != nil {
query = buildStringCondition("cluster", f.Cluster, query)
}
if f.Subcluster != nil {
query = buildStringCondition("subcluster", f.Subcluster, query)
}
if f.Hostname != nil {
query = buildStringCondition("hostname", f.Hostname, query)
}
if f.SchedulerState != nil {
query = query.Where("node_state = ?", f.SchedulerState)
// Requires Additional time_stamp Filter: Else the last (past!) time_stamp with queried state will be returned
now := time.Now().Unix()
query = query.Where(sq.Gt{"time_stamp": (now - 60)})
}
if f.HealthState != nil {
query = query.Where("health_state = ?", f.HealthState)
// Requires Additional time_stamp Filter: Else the last (past!) time_stamp with queried state will be returned
now := time.Now().Unix()
query = query.Where(sq.Gt{"time_stamp": (now - 60)})
}
}
query = query.GroupBy("node_id").OrderBy("hostname ASC")
query = applyNodeFilters(query, filters)
query = query.OrderBy("node.hostname ASC")
if page != nil && page.ItemsPerPage != -1 {
limit := uint64(page.ItemsPerPage)
@@ -290,11 +371,10 @@ func (r *NodeRepository) QueryNodes(
nodes := make([]*schema.Node, 0)
for rows.Next() {
node := schema.Node{}
var timestamp int
if err := rows.Scan(&node.Hostname, &node.Cluster, &node.SubCluster,
&node.NodeState, &node.HealthState, &timestamp); err != nil {
&node.NodeState, &node.HealthState); err != nil {
rows.Close()
cclog.Warnf("Error while scanning rows (QueryNodes) at time '%d'", timestamp)
cclog.Warn("Error while scanning rows (QueryNodes)")
return nil, err
}
nodes = append(nodes, &node)
@@ -386,73 +466,115 @@ func (r *NodeRepository) QueryNodesWithMeta(
return nodes, nil
}
// CountNodes returns the total matched nodes based on a node filter. It always operates
// on the last state (largest timestamp).
func (r *NodeRepository) CountNodes(
// QueryNodesWithMeta returns a list of nodes based on a node filter. It always operates
// on the last state (largest timestamp). It includes both (!) optional JSON column data
func (r *NodeRepository) QueryNodesWithMeta(
ctx context.Context,
filters []*model.NodeFilter,
) (int, error) {
page *model.PageRequest,
order *model.OrderByInput, // Currently unused!
) ([]*schema.Node, error) {
query, qerr := AccessCheck(ctx,
sq.Select("time_stamp", "count(*) as countRes").
sq.Select("node.hostname", "node.cluster", "node.subcluster",
"node_state.node_state", "node_state.health_state",
"node.meta_data", "node_state.health_metrics").
From("node").
Join("node_state ON node_state.node_id = node.id"))
Join("node_state ON node_state.node_id = node.id").
Where(latestStateCondition()))
if qerr != nil {
return 0, qerr
return nil, qerr
}
for _, f := range filters {
if f.Cluster != nil {
query = buildStringCondition("cluster", f.Cluster, query)
}
if f.Subcluster != nil {
query = buildStringCondition("subcluster", f.Subcluster, query)
}
if f.Hostname != nil {
query = buildStringCondition("hostname", f.Hostname, query)
}
if f.SchedulerState != nil {
query = query.Where("node_state = ?", f.SchedulerState)
// Requires Additional time_stamp Filter: Else the last (past!) time_stamp with queried state will be returned
now := time.Now().Unix()
query = query.Where(sq.Gt{"time_stamp": (now - 60)})
}
if f.HealthState != nil {
query = query.Where("health_state = ?", f.HealthState)
// Requires Additional time_stamp Filter: Else the last (past!) time_stamp with queried state will be returned
now := time.Now().Unix()
query = query.Where(sq.Gt{"time_stamp": (now - 60)})
}
}
query = applyNodeFilters(query, filters)
query = query.OrderBy("node.hostname ASC")
query = query.GroupBy("time_stamp").OrderBy("time_stamp DESC").Limit(1)
if page != nil && page.ItemsPerPage != -1 {
limit := uint64(page.ItemsPerPage)
query = query.Offset((uint64(page.Page) - 1) * limit).Limit(limit)
}
rows, err := query.RunWith(r.stmtCache).Query()
if err != nil {
queryString, queryVars, _ := query.ToSql()
cclog.Errorf("Error while running query '%s' %v: %v", queryString, queryVars, err)
return nil, err
}
nodes := make([]*schema.Node, 0)
for rows.Next() {
node := schema.Node{}
rawMetaData := make([]byte, 0)
rawMetricHealth := make([]byte, 0)
if err := rows.Scan(&node.Hostname, &node.Cluster, &node.SubCluster,
&node.NodeState, &node.HealthState, &rawMetaData, &rawMetricHealth); err != nil {
rows.Close()
cclog.Warn("Error while scanning rows (QueryNodesWithMeta)")
return nil, err
}
if len(rawMetaData) == 0 {
node.MetaData = nil
} else {
metaData := make(map[string]string)
if err := json.Unmarshal(rawMetaData, &metaData); err != nil {
cclog.Warn("Error while unmarshaling raw metadata json")
return nil, err
}
node.MetaData = metaData
}
if len(rawMetricHealth) == 0 {
node.HealthData = nil
} else {
healthData := make(map[string][]string)
if err := json.Unmarshal(rawMetricHealth, &healthData); err != nil {
cclog.Warn("Error while unmarshaling raw healthdata json")
return nil, err
}
node.HealthData = healthData
}
nodes = append(nodes, &node)
}
return nodes, nil
}
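Note the two JSON columns decode into different shapes: node.meta_data into map[string]string and node_state.health_metrics into map[string][]string, i.e. a JSON object rather than a bare array. Made-up payloads that would round-trip through this scan:
// Made-up payloads, illustrative only; shapes match the unmarshal targets above.
metaJSON := []byte(`{"rack":"r01","bmc":"10.0.0.5"}`)
healthJSON := []byte(`{"failed":["mem_bw"],"missing":["ib_recv"]}`)
var meta map[string]string
var health map[string][]string
if err := json.Unmarshal(metaJSON, &meta); err != nil {
cclog.Warn("bad meta_data payload")
}
if err := json.Unmarshal(healthJSON, &health); err != nil {
cclog.Warn("bad health_metrics payload")
}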
// CountNodes returns the total matched nodes based on a node filter. It always operates
// on the last state (largest timestamp) per node.
func (r *NodeRepository) CountNodes(
ctx context.Context,
filters []*model.NodeFilter,
) (int, error) {
query, qerr := AccessCheck(ctx,
sq.Select("COUNT(*)").
From("node").
Join("node_state ON node_state.node_id = node.id").
Where(latestStateCondition()))
if qerr != nil {
return 0, qerr
}
query = applyNodeFilters(query, filters)
var count int
if err := query.RunWith(r.stmtCache).QueryRow().Scan(&count); err != nil {
queryString, queryVars, _ := query.ToSql()
cclog.Errorf("Error while running query '%s' %v: %v", queryString, queryVars, err)
return 0, err
}
var totalNodes int
for rows.Next() {
var timestamp int
if err := rows.Scan(&timestamp, &totalNodes); err != nil {
rows.Close()
cclog.Warnf("Error while scanning rows (CountNodes) at time '%d'", timestamp)
return 0, err
}
}
return totalNodes, nil
return count, nil
}
func (r *NodeRepository) ListNodes(cluster string) ([]*schema.Node, error) {
q := sq.Select("node.hostname", "node.cluster", "node.subcluster", "node_state.node_state",
"node_state.health_state", "MAX(node_state.time_stamp) as time").
q := sq.Select("node.hostname", "node.cluster", "node.subcluster",
"node_state.node_state", "node_state.health_state").
From("node").
Join("node_state ON node_state.node_id = node.id").
Where(latestStateCondition()).
Where("node.cluster = ?", cluster).
GroupBy("node_state.node_id").
OrderBy("node.hostname ASC")
rows, err := q.RunWith(r.DB).Query()
@@ -464,10 +586,9 @@ func (r *NodeRepository) ListNodes(cluster string) ([]*schema.Node, error) {
defer rows.Close()
for rows.Next() {
node := &schema.Node{}
var timestamp int
if err := rows.Scan(&node.Hostname, &node.Cluster,
&node.SubCluster, &node.NodeState, &node.HealthState, &timestamp); err != nil {
cclog.Warnf("Error while scanning node list (ListNodes) at time '%d'", timestamp)
&node.SubCluster, &node.NodeState, &node.HealthState); err != nil {
cclog.Warn("Error while scanning node list (ListNodes)")
return nil, err
}
@@ -478,11 +599,11 @@ func (r *NodeRepository) ListNodes(cluster string) ([]*schema.Node, error) {
}
func (r *NodeRepository) MapNodes(cluster string) (map[string]string, error) {
q := sq.Select("node.hostname", "node_state.node_state", "MAX(node_state.time_stamp) as time").
q := sq.Select("node.hostname", "node_state.node_state").
From("node").
Join("node_state ON node_state.node_id = node.id").
Where(latestStateCondition()).
Where("node.cluster = ?", cluster).
GroupBy("node_state.node_id").
OrderBy("node.hostname ASC")
rows, err := q.RunWith(r.DB).Query()
@@ -495,9 +616,8 @@ func (r *NodeRepository) MapNodes(cluster string) (map[string]string, error) {
defer rows.Close()
for rows.Next() {
var hostname, nodestate string
var timestamp int
if err := rows.Scan(&hostname, &nodestate, &timestamp); err != nil {
cclog.Warnf("Error while scanning node list (MapNodes) at time '%d'", timestamp)
if err := rows.Scan(&hostname, &nodestate); err != nil {
cclog.Warn("Error while scanning node list (MapNodes)")
return nil, err
}
@@ -509,37 +629,15 @@ func (r *NodeRepository) MapNodes(cluster string) (map[string]string, error) {
func (r *NodeRepository) CountStates(ctx context.Context, filters []*model.NodeFilter, column string) ([]*model.NodeStates, error) {
query, qerr := AccessCheck(ctx,
sq.Select(column, "COUNT(*) as count").
sq.Select(column).
From("node").
Join("node_state ON node_state.node_id = node.id").
Where(latestStateCondition()).
GroupBy(column))
Where(latestStateCondition()))
if qerr != nil {
return nil, qerr
}
query = query.Join("node_state ON node_state.node_id = node.id")
for _, f := range filters {
if f.Hostname != nil {
query = buildStringCondition("hostname", f.Hostname, query)
}
if f.Cluster != nil {
query = buildStringCondition("cluster", f.Cluster, query)
}
if f.Subcluster != nil {
query = buildStringCondition("subcluster", f.Subcluster, query)
}
if f.SchedulerState != nil {
query = query.Where("node_state = ?", f.SchedulerState)
}
if f.HealthState != nil {
query = query.Where("health_state = ?", f.HealthState)
}
}
// Add Group and Order
query = query.GroupBy("hostname").OrderBy("hostname DESC")
query = applyNodeFilters(query, filters)
rows, err := query.RunWith(r.stmtCache).Query()
if err != nil {
@@ -549,6 +647,18 @@ func (r *NodeRepository) CountStates(ctx context.Context, filters []*model.NodeF
}
defer rows.Close()
stateMap := map[string]int{}
for rows.Next() {
var state string
if err := rows.Scan(&state); err != nil {
rows.Close()
cclog.Warn("Error while scanning rows (CountStates)")
return nil, err
}
stateMap[state] += 1
}
nodes := make([]*model.NodeStates, 0)
for rows.Next() {
var state string
@@ -587,8 +697,8 @@ func (r *NodeRepository) CountStatesTimed(ctx context.Context, filters []*model.
if f.Cluster != nil {
query = buildStringCondition("cluster", f.Cluster, query)
}
if f.Subcluster != nil {
query = buildStringCondition("subcluster", f.Subcluster, query)
if f.SubCluster != nil {
query = buildStringCondition("subcluster", f.SubCluster, query)
}
if f.SchedulerState != nil {
query = query.Where("node_state = ?", f.SchedulerState)
@@ -640,6 +750,132 @@ func (r *NodeRepository) CountStatesTimed(ctx context.Context, filters []*model.
return timedStates, nil
}
func (r *NodeRepository) GetNodesForList(
ctx context.Context,
cluster string,
subCluster string,
stateFilter string,
nodeFilter string,
page *model.PageRequest,
) ([]string, map[string]string, int, bool, error) {
// Init Return Vars
nodes := make([]string, 0)
stateMap := make(map[string]string)
countNodes := 0
hasNextPage := false
// Build Filters
queryFilters := make([]*model.NodeFilter, 0)
if cluster != "" {
queryFilters = append(queryFilters, &model.NodeFilter{Cluster: &model.StringInput{Eq: &cluster}})
}
if subCluster != "" {
queryFilters = append(queryFilters, &model.NodeFilter{SubCluster: &model.StringInput{Eq: &subCluster}})
}
if nodeFilter != "" && stateFilter != "notindb" {
queryFilters = append(queryFilters, &model.NodeFilter{Hostname: &model.StringInput{Contains: &nodeFilter}})
}
if stateFilter != "all" && stateFilter != "notindb" {
queryState := schema.SchedulerState(stateFilter)
queryFilters = append(queryFilters, &model.NodeFilter{SchedulerState: &queryState})
}
// if healthFilter != "all" {
// filters = append(filters, &model.NodeFilter{HealthState: &healthFilter})
// }
// Special Case: Disable Paging for missing nodes filter, save IPP for later
var backupItems int
if stateFilter == "notindb" {
backupItems = page.ItemsPerPage
page.ItemsPerPage = -1
}
// Query Nodes From DB
rawNodes, serr := r.QueryNodes(ctx, queryFilters, page, nil) // Order not Used
if serr != nil {
cclog.Warn("error while loading node database data (Resolver.NodeMetricsList)")
return nil, nil, 0, false, serr
}
// Intermediate Node Result Info
for _, node := range rawNodes {
if node == nil {
continue
}
nodes = append(nodes, node.Hostname)
stateMap[node.Hostname] = string(node.NodeState)
}
// Special Case: Find Nodes not in DB node table but in metricStore only
if stateFilter == "notindb" {
// Reapply Original Paging
page.ItemsPerPage = backupItems
// Get Nodes From Topology
var topoNodes []string
if subCluster != "" {
scNodes := archive.NodeLists[cluster][subCluster]
topoNodes = scNodes.PrintList()
} else {
subClusterNodeLists := archive.NodeLists[cluster]
for _, nodeList := range subClusterNodeLists {
topoNodes = append(topoNodes, nodeList.PrintList()...)
}
}
// Compare to all nodes from cluster/subcluster in DB
var missingNodes []string
for _, scanNode := range topoNodes {
if !slices.Contains(nodes, scanNode) {
missingNodes = append(missingNodes, scanNode)
}
}
// Filter nodes by name
if nodeFilter != "" {
filteredNodesByName := []string{}
for _, missingNode := range missingNodes {
if strings.Contains(missingNode, nodeFilter) {
filteredNodesByName = append(filteredNodesByName, missingNode)
}
}
missingNodes = filteredNodesByName
}
// Sort Missing Nodes Alphanumerically
slices.Sort(missingNodes)
// Total Missing
countNodes = len(missingNodes)
// Apply paging
if countNodes > page.ItemsPerPage {
start := (page.Page - 1) * page.ItemsPerPage
if start > countNodes {
start = countNodes // Guard against out-of-range pages
}
end := start + page.ItemsPerPage
if end > countNodes {
end = countNodes
hasNextPage = false
} else {
hasNextPage = true
}
nodes = missingNodes[start:end]
} else {
nodes = missingNodes
}
} else {
// DB Nodes: Count and derive hasNextPage from count
var cerr error
countNodes, cerr = r.CountNodes(ctx, queryFilters)
if cerr != nil {
cclog.Warn("error while counting node database data (Resolver.NodeMetricsList)")
return nil, nil, 0, false, cerr
}
hasNextPage = page.Page*page.ItemsPerPage < countNodes
}
// Fallback for a non-initialized node table in the DB; ignores stateFilter
if stateFilter == "all" && countNodes == 0 {
nodes, countNodes, hasNextPage = getNodesFromTopol(cluster, subCluster, nodeFilter, page)
}
return nodes, stateMap, countNodes, hasNextPage, nil
}
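A caller such as a GraphQL resolver consumes the five return values roughly as sketched below; the cluster, state filter, and page size are illustrative, not taken from this diff:
// Illustrative caller sketch (hypothetical), assuming context and the node
// repository are in scope.
func listAllocatedNodes(ctx context.Context, repo *NodeRepository) error {
page := &model.PageRequest{Page: 1, ItemsPerPage: 25}
hosts, states, total, more, err := repo.GetNodesForList(ctx, "fritz", "", "allocated", "", page)
if err != nil {
return err
}
for _, h := range hosts {
cclog.Debugf("node %s: scheduler state %s", h, states[h])
}
cclog.Debugf("%d of %d matching nodes on this page, hasNextPage=%v", len(hosts), total, more)
return nil
}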
func AccessCheck(ctx context.Context, query sq.SelectBuilder) (sq.SelectBuilder, error) {
user := GetUserFromContext(ctx)
return AccessCheckWithUser(user, query)
@@ -661,3 +897,51 @@ func AccessCheckWithUser(user *schema.User, query sq.SelectBuilder) (sq.SelectBu
return qnil, fmt.Errorf("user has no or unknown roles")
}
}
func getNodesFromTopol(cluster string, subCluster string, nodeFilter string, page *model.PageRequest) ([]string, int, bool) {
// 0) Init additional vars
hasNextPage := false
totalNodes := 0
// 1) Get list of all nodes
var topolNodes []string
if subCluster != "" {
scNodes := archive.NodeLists[cluster][subCluster]
topolNodes = scNodes.PrintList()
} else {
subClusterNodeLists := archive.NodeLists[cluster]
for _, nodeList := range subClusterNodeLists {
topolNodes = append(topolNodes, nodeList.PrintList()...)
}
}
// 2) Filter nodes
if nodeFilter != "" {
filteredNodes := []string{}
for _, node := range topolNodes {
if strings.Contains(node, nodeFilter) {
filteredNodes = append(filteredNodes, node)
}
}
topolNodes = filteredNodes
}
// 2.1) Count total nodes and sort them (sorting is invalidated after the ccms return ...)
totalNodes = len(topolNodes)
sort.Strings(topolNodes)
// 3) Apply paging
if len(topolNodes) > page.ItemsPerPage {
start := (page.Page - 1) * page.ItemsPerPage
if start > len(topolNodes) {
start = len(topolNodes) // Guard against out-of-range pages
}
end := start + page.ItemsPerPage
if end >= len(topolNodes) {
end = len(topolNodes)
hasNextPage = false
} else {
hasNextPage = true
}
topolNodes = topolNodes[start:end]
}
return topolNodes, totalNodes, hasNextPage
}

View File

@@ -15,9 +15,9 @@ import (
"github.com/ClusterCockpit/cc-backend/internal/config"
"github.com/ClusterCockpit/cc-backend/pkg/archive"
ccconf "github.com/ClusterCockpit/cc-lib/ccConfig"
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
"github.com/ClusterCockpit/cc-lib/schema"
ccconf "github.com/ClusterCockpit/cc-lib/v2/ccConfig"
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
"github.com/ClusterCockpit/cc-lib/v2/schema"
_ "github.com/mattn/go-sqlite3"
)
@@ -26,7 +26,7 @@ func nodeTestSetup(t *testing.T) {
"main": {
"addr": "0.0.0.0:8080",
"validate": false,
"apiAllowedIPs": [
"api-allowed-ips": [
"*"
]
},
@@ -38,18 +38,7 @@ func nodeTestSetup(t *testing.T) {
"jwts": {
"max-age": "2m"
}
},
"clusters": [
{
"name": "testcluster",
"metricDataRepository": {"kind": "test", "url": "bla:8081"},
"filterRanges": {
"numNodes": { "from": 1, "to": 64 },
"duration": { "from": 0, "to": 86400 },
"startTime": { "from": "2022-01-01T00:00:00Z", "to": null }
}
}
]
}`
const testclusterJSON = `{
"name": "testcluster",
@@ -130,7 +119,7 @@ func nodeTestSetup(t *testing.T) {
}
dbfilepath := filepath.Join(tmpdir, "test.db")
err := MigrateDB("sqlite3", dbfilepath)
err := MigrateDB(dbfilepath)
if err != nil {
t.Fatal(err)
}
@@ -144,19 +133,22 @@ func nodeTestSetup(t *testing.T) {
// Load and check main configuration
if cfg := ccconf.GetPackageConfig("main"); cfg != nil {
if clustercfg := ccconf.GetPackageConfig("clusters"); clustercfg != nil {
config.Init(cfg, clustercfg)
} else {
cclog.Abort("Cluster configuration must be present")
}
config.Init(cfg)
} else {
cclog.Abort("Main configuration must be present")
}
archiveCfg := fmt.Sprintf("{\"kind\": \"file\",\"path\": \"%s\"}", jobarchive)
Connect("sqlite3", dbfilepath)
if err := ResetConnection(); err != nil {
t.Fatal(err)
}
t.Cleanup(func() {
ResetConnection()
})
if err := archive.Init(json.RawMessage(archiveCfg), config.Keys.DisableArchive); err != nil {
Connect(dbfilepath)
if err := archive.Init(json.RawMessage(archiveCfg)); err != nil {
t.Fatal(err)
}
}
@@ -164,8 +156,12 @@ func nodeTestSetup(t *testing.T) {
func TestUpdateNodeState(t *testing.T) {
nodeTestSetup(t)
repo := GetNodeRepository()
now := time.Now().Unix()
nodeState := schema.NodeStateDB{
TimeStamp: time.Now().Unix(), NodeState: "allocated",
TimeStamp: now,
NodeState: "allocated",
CpusAllocated: 72,
MemoryAllocated: 480,
GpusAllocated: 0,
@@ -173,18 +169,152 @@ func TestUpdateNodeState(t *testing.T) {
JobsRunning: 1,
}
repo := GetNodeRepository()
err := repo.UpdateNodeState("host124", "testcluster", &nodeState)
if err != nil {
return
t.Fatal(err)
}
node, err := repo.GetNode("host124", "testcluster", false)
if err != nil {
return
t.Fatal(err)
}
if node.NodeState != "allocated" {
t.Errorf("wrong node state\ngot: %s \nwant: allocated ", node.NodeState)
}
t.Run("FindBeforeEmpty", func(t *testing.T) {
// Only the current-timestamp row exists, so nothing should be found before now
rows, err := repo.FindNodeStatesBefore(now)
if err != nil {
t.Fatal(err)
}
if len(rows) != 0 {
t.Errorf("expected 0 rows, got %d", len(rows))
}
})
t.Run("DeleteOldRows", func(t *testing.T) {
// Insert 2 more old rows for host124
for i, ts := range []int64{now - 7200, now - 3600} {
ns := schema.NodeStateDB{
TimeStamp: ts,
NodeState: "allocated",
HealthState: schema.MonitoringStateFull,
CpusAllocated: 72,
MemoryAllocated: 480,
JobsRunning: i,
}
if err := repo.UpdateNodeState("host124", "testcluster", &ns); err != nil {
t.Fatal(err)
}
}
// Delete rows older than 30 minutes
cutoff := now - 1800
cnt, err := repo.DeleteNodeStatesBefore(cutoff)
if err != nil {
t.Fatal(err)
}
// Should delete the 2 old rows
if cnt != 2 {
t.Errorf("expected 2 deleted rows, got %d", cnt)
}
// Latest row should still exist
node, err := repo.GetNode("host124", "testcluster", false)
if err != nil {
t.Fatal(err)
}
if node.NodeState != "allocated" {
t.Errorf("expected node state 'allocated', got %s", node.NodeState)
}
})
t.Run("PreservesLatestPerNode", func(t *testing.T) {
// Insert a single old row for host125 — it's the latest per node so it must survive
ns := schema.NodeStateDB{
TimeStamp: now - 7200,
NodeState: "idle",
HealthState: schema.MonitoringStateFull,
CpusAllocated: 0,
MemoryAllocated: 0,
JobsRunning: 0,
}
if err := repo.UpdateNodeState("host125", "testcluster", &ns); err != nil {
t.Fatal(err)
}
// Delete everything older than now — the latest per node should be preserved
_, err := repo.DeleteNodeStatesBefore(now)
if err != nil {
t.Fatal(err)
}
// The latest row for host125 must still exist
node, err := repo.GetNode("host125", "testcluster", false)
if err != nil {
t.Fatal(err)
}
if node.NodeState != "idle" {
t.Errorf("expected node state 'idle', got %s", node.NodeState)
}
// Verify exactly 1 row remains for host125
var countAfter int
if err := repo.DB.QueryRow(
"SELECT COUNT(*) FROM node_state WHERE node_id = (SELECT id FROM node WHERE hostname = 'host125')").
Scan(&countAfter); err != nil {
t.Fatal(err)
}
if countAfter != 1 {
t.Errorf("expected 1 row remaining for host125, got %d", countAfter)
}
})
t.Run("FindBeforeWithJoin", func(t *testing.T) {
// Insert old and current rows for host123
for _, ts := range []int64{now - 7200, now} {
ns := schema.NodeStateDB{
TimeStamp: ts,
NodeState: "allocated",
HealthState: schema.MonitoringStateFull,
CpusAllocated: 8,
MemoryAllocated: 1024,
GpusAllocated: 1,
JobsRunning: 1,
}
if err := repo.UpdateNodeState("host123", "testcluster", &ns); err != nil {
t.Fatal(err)
}
}
// Find rows older than 30 minutes, excluding latest per node
cutoff := now - 1800
rows, err := repo.FindNodeStatesBefore(cutoff)
if err != nil {
t.Fatal(err)
}
// Should find the old host123 row
found := false
for _, row := range rows {
if row.Hostname == "host123" && row.TimeStamp == now-7200 {
found = true
if row.Cluster != "testcluster" {
t.Errorf("expected cluster 'testcluster', got %s", row.Cluster)
}
if row.SubCluster != "sc1" {
t.Errorf("expected subcluster 'sc1', got %s", row.SubCluster)
}
if row.CpusAllocated != 8 {
t.Errorf("expected cpus_allocated 8, got %d", row.CpusAllocated)
}
}
}
if !found {
t.Errorf("expected to find old host123 row among %d results", len(rows))
}
})
}

View File

@@ -6,11 +6,13 @@ package repository
import (
"context"
"os"
"path/filepath"
"testing"
"github.com/ClusterCockpit/cc-backend/internal/graph/model"
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
"github.com/ClusterCockpit/cc-lib/schema"
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
"github.com/ClusterCockpit/cc-lib/v2/schema"
_ "github.com/mattn/go-sqlite3"
)
@@ -46,7 +48,7 @@ func BenchmarkSelect1(b *testing.B) {
}
func BenchmarkDB_FindJobById(b *testing.B) {
var jobId int64 = 1677322
var jobID int64 = 1677322
b.Run("FindJobById", func(b *testing.B) {
db := setup(b)
@@ -55,7 +57,7 @@ func BenchmarkDB_FindJobById(b *testing.B) {
b.RunParallel(func(pb *testing.PB) {
for pb.Next() {
_, err := db.FindById(getContext(b), jobId)
_, err := db.FindByID(getContext(b), jobID)
noErr(b, err)
}
})
@@ -63,7 +65,7 @@ func BenchmarkDB_FindJobById(b *testing.B) {
}
func BenchmarkDB_FindJob(b *testing.B) {
var jobId int64 = 107266
var jobID int64 = 107266
var startTime int64 = 1657557241
cluster := "fritz"
@@ -74,7 +76,7 @@ func BenchmarkDB_FindJob(b *testing.B) {
b.RunParallel(func(pb *testing.PB) {
for pb.Next() {
_, err := db.Find(&jobId, &cluster, &startTime)
_, err := db.Find(&jobID, &cluster, &startTime)
noErr(b, err)
}
})
@@ -148,10 +150,24 @@ func getContext(tb testing.TB) context.Context {
func setup(tb testing.TB) *JobRepository {
tb.Helper()
cclog.Init("warn", true)
dbfile := "testdata/job.db"
err := MigrateDB("sqlite3", dbfile)
// Copy test DB to a temp file for test isolation
srcData, err := os.ReadFile("testdata/job.db")
noErr(tb, err)
Connect("sqlite3", dbfile)
dbfile := filepath.Join(tb.TempDir(), "job.db")
err = os.WriteFile(dbfile, srcData, 0o644)
noErr(tb, err)
// Reset singletons so Connect uses the new temp DB
err = ResetConnection()
noErr(tb, err)
tb.Cleanup(func() {
ResetConnection()
})
err = MigrateDB(dbfile)
noErr(tb, err)
Connect(dbfile)
return GetJobRepository()
}

Some files were not shown because too many files have changed in this diff.