mirror of
https://github.com/ClusterCockpit/cc-backend
synced 2026-03-20 06:47:30 +01:00
Merge branch 'hotfix' of github.com:ClusterCockpit/cc-backend into hotfix
This commit is contained in:
@@ -23,47 +23,45 @@ import (
|
||||
"github.com/ClusterCockpit/cc-backend/internal/auth"
|
||||
"github.com/ClusterCockpit/cc-backend/internal/config"
|
||||
"github.com/ClusterCockpit/cc-backend/internal/graph"
|
||||
"github.com/ClusterCockpit/cc-backend/internal/metricDataDispatcher"
|
||||
"github.com/ClusterCockpit/cc-backend/internal/metricdata"
|
||||
"github.com/ClusterCockpit/cc-backend/internal/metricdispatch"
|
||||
"github.com/ClusterCockpit/cc-backend/internal/repository"
|
||||
"github.com/ClusterCockpit/cc-backend/pkg/archive"
|
||||
ccconf "github.com/ClusterCockpit/cc-lib/ccConfig"
|
||||
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
|
||||
"github.com/ClusterCockpit/cc-lib/schema"
|
||||
"github.com/gorilla/mux"
|
||||
"github.com/ClusterCockpit/cc-backend/pkg/metricstore"
|
||||
ccconf "github.com/ClusterCockpit/cc-lib/v2/ccConfig"
|
||||
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
|
||||
"github.com/ClusterCockpit/cc-lib/v2/schema"
|
||||
"github.com/go-chi/chi/v5"
|
||||
|
||||
_ "github.com/mattn/go-sqlite3"
|
||||
)
|
||||
|
||||
func setup(t *testing.T) *api.RestAPI {
|
||||
repository.ResetConnection()
|
||||
|
||||
const testconfig = `{
|
||||
"main": {
|
||||
"addr": "0.0.0.0:8080",
|
||||
"validate": false,
|
||||
"apiAllowedIPs": [
|
||||
"*"
|
||||
]
|
||||
},
|
||||
"main": {
|
||||
"addr": "0.0.0.0:8080",
|
||||
"validate": false,
|
||||
"api-allowed-ips": [
|
||||
"*"
|
||||
]
|
||||
},
|
||||
"metric-store": {
|
||||
"checkpoints": {
|
||||
"interval": "12h"
|
||||
},
|
||||
"retention-in-memory": "48h",
|
||||
"memory-cap": 100
|
||||
},
|
||||
"archive": {
|
||||
"kind": "file",
|
||||
"path": "./var/job-archive"
|
||||
"kind": "file",
|
||||
"path": "./var/job-archive"
|
||||
},
|
||||
"auth": {
|
||||
"jwts": {
|
||||
"max-age": "2m"
|
||||
"jwts": {
|
||||
"max-age": "2m"
|
||||
}
|
||||
}
|
||||
},
|
||||
"clusters": [
|
||||
{
|
||||
"name": "testcluster",
|
||||
"metricDataRepository": {"kind": "test", "url": "bla:8081"},
|
||||
"filterRanges": {
|
||||
"numNodes": { "from": 1, "to": 64 },
|
||||
"duration": { "from": 0, "to": 86400 },
|
||||
"startTime": { "from": "2022-01-01T00:00:00Z", "to": null }
|
||||
}
|
||||
}
|
||||
]
|
||||
}`
|
||||
const testclusterJSON = `{
|
||||
"name": "testcluster",
|
||||
@@ -141,7 +139,7 @@ func setup(t *testing.T) *api.RestAPI {
|
||||
}
|
||||
|
||||
dbfilepath := filepath.Join(tmpdir, "test.db")
|
||||
err := repository.MigrateDB("sqlite3", dbfilepath)
|
||||
err := repository.MigrateDB(dbfilepath)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
@@ -152,28 +150,23 @@ func setup(t *testing.T) *api.RestAPI {
|
||||
}
|
||||
|
||||
ccconf.Init(cfgFilePath)
|
||||
metricstore.MetricStoreHandle = &metricstore.InternalMetricStore{}
|
||||
|
||||
// Load and check main configuration
|
||||
if cfg := ccconf.GetPackageConfig("main"); cfg != nil {
|
||||
if clustercfg := ccconf.GetPackageConfig("clusters"); clustercfg != nil {
|
||||
config.Init(cfg, clustercfg)
|
||||
} else {
|
||||
cclog.Abort("Cluster configuration must be present")
|
||||
}
|
||||
config.Init(cfg)
|
||||
} else {
|
||||
cclog.Abort("Main configuration must be present")
|
||||
}
|
||||
archiveCfg := fmt.Sprintf("{\"kind\": \"file\",\"path\": \"%s\"}", jobarchive)
|
||||
|
||||
repository.Connect("sqlite3", dbfilepath)
|
||||
repository.Connect(dbfilepath)
|
||||
|
||||
if err := archive.Init(json.RawMessage(archiveCfg), config.Keys.DisableArchive); err != nil {
|
||||
if err := archive.Init(json.RawMessage(archiveCfg)); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
if err := metricdata.Init(); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
// metricstore initialization removed - it's initialized via callback in tests
|
||||
|
||||
archiver.Start(repository.GetJobRepository(), context.Background())
|
||||
|
||||
@@ -190,11 +183,9 @@ func setup(t *testing.T) *api.RestAPI {
|
||||
}
|
||||
|
||||
func cleanup() {
|
||||
// Gracefully shutdown archiver with timeout
|
||||
if err := archiver.Shutdown(5 * time.Second); err != nil {
|
||||
cclog.Warnf("Archiver shutdown timeout in tests: %v", err)
|
||||
}
|
||||
// TODO: Clear all caches, reset all modules, etc...
|
||||
}
|
||||
|
||||
/*
|
||||
@@ -221,16 +212,14 @@ func TestRestApi(t *testing.T) {
|
||||
},
|
||||
}
|
||||
|
||||
metricdata.TestLoadDataCallback = func(job *schema.Job, metrics []string, scopes []schema.MetricScope, ctx context.Context, resolution int) (schema.JobData, error) {
|
||||
metricstore.TestLoadDataCallback = func(job *schema.Job, metrics []string, scopes []schema.MetricScope, ctx context.Context, resolution int) (schema.JobData, error) {
|
||||
return testData, nil
|
||||
}
|
||||
|
||||
r := mux.NewRouter()
|
||||
r.PathPrefix("/api").Subrouter()
|
||||
r.StrictSlash(true)
|
||||
r := chi.NewRouter()
|
||||
restapi.MountAPIRoutes(r)
|
||||
|
||||
var TestJobId int64 = 123
|
||||
var TestJobID int64 = 123
|
||||
TestClusterName := "testcluster"
|
||||
var TestStartTime int64 = 123456789
|
||||
|
||||
@@ -280,7 +269,7 @@ func TestRestApi(t *testing.T) {
|
||||
}
|
||||
// resolver := graph.GetResolverInstance()
|
||||
restapi.JobRepository.SyncJobs()
|
||||
job, err := restapi.JobRepository.Find(&TestJobId, &TestClusterName, &TestStartTime)
|
||||
job, err := restapi.JobRepository.Find(&TestJobID, &TestClusterName, &TestStartTime)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
@@ -338,7 +327,7 @@ func TestRestApi(t *testing.T) {
|
||||
}
|
||||
|
||||
// Archiving happens asynchronously, will be completed in cleanup
|
||||
job, err := restapi.JobRepository.Find(&TestJobId, &TestClusterName, &TestStartTime)
|
||||
job, err := restapi.JobRepository.Find(&TestJobID, &TestClusterName, &TestStartTime)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
@@ -366,7 +355,7 @@ func TestRestApi(t *testing.T) {
|
||||
}
|
||||
|
||||
t.Run("CheckArchive", func(t *testing.T) {
|
||||
data, err := metricDataDispatcher.LoadData(stoppedJob, []string{"load_one"}, []schema.MetricScope{schema.MetricScopeNode}, context.Background(), 60)
|
||||
data, err := metricdispatch.LoadData(stoppedJob, []string{"load_one"}, []schema.MetricScope{schema.MetricScopeNode}, context.Background(), 60)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
@@ -464,4 +453,198 @@ func TestRestApi(t *testing.T) {
|
||||
if !ok {
|
||||
t.Fatal("subtest failed")
|
||||
}
|
||||
|
||||
t.Run("GetUsedNodesNoRunning", func(t *testing.T) {
|
||||
contextUserValue := &schema.User{
|
||||
Username: "testuser",
|
||||
Projects: make([]string, 0),
|
||||
Roles: []string{"api"},
|
||||
AuthType: 0,
|
||||
AuthSource: 2,
|
||||
}
|
||||
|
||||
req := httptest.NewRequest(http.MethodGet, "/jobs/used_nodes?ts=123456790", nil)
|
||||
recorder := httptest.NewRecorder()
|
||||
|
||||
ctx := context.WithValue(req.Context(), contextUserKey, contextUserValue)
|
||||
|
||||
r.ServeHTTP(recorder, req.WithContext(ctx))
|
||||
response := recorder.Result()
|
||||
if response.StatusCode != http.StatusOK {
|
||||
t.Fatal(response.Status, recorder.Body.String())
|
||||
}
|
||||
|
||||
var result api.GetUsedNodesAPIResponse
|
||||
if err := json.NewDecoder(response.Body).Decode(&result); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
if result.UsedNodes == nil {
|
||||
t.Fatal("expected usedNodes to be non-nil")
|
||||
}
|
||||
|
||||
if len(result.UsedNodes) != 0 {
|
||||
t.Fatalf("expected no used nodes for stopped jobs, got: %v", result.UsedNodes)
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
// TestStopJobWithReusedJobId verifies that stopping a recently started job works
|
||||
// even when an older job with the same jobId exists in the job table (e.g. with
|
||||
// state "failed"). This is a regression test for the bug where Find() on the job
|
||||
// table would match the old job instead of the new one still in job_cache.
|
||||
func TestStopJobWithReusedJobId(t *testing.T) {
|
||||
restapi := setup(t)
|
||||
t.Cleanup(cleanup)
|
||||
|
||||
testData := schema.JobData{
|
||||
"load_one": map[schema.MetricScope]*schema.JobMetric{
|
||||
schema.MetricScopeNode: {
|
||||
Unit: schema.Unit{Base: "load"},
|
||||
Timestep: 60,
|
||||
Series: []schema.Series{
|
||||
{
|
||||
Hostname: "host123",
|
||||
Statistics: schema.MetricStatistics{Min: 0.1, Avg: 0.2, Max: 0.3},
|
||||
Data: []schema.Float{0.1, 0.1, 0.1, 0.2, 0.2, 0.2, 0.3, 0.3, 0.3},
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
metricstore.TestLoadDataCallback = func(job *schema.Job, metrics []string, scopes []schema.MetricScope, ctx context.Context, resolution int) (schema.JobData, error) {
|
||||
return testData, nil
|
||||
}
|
||||
|
||||
r := chi.NewRouter()
|
||||
restapi.MountAPIRoutes(r)
|
||||
|
||||
const contextUserKey repository.ContextKey = "user"
|
||||
contextUserValue := &schema.User{
|
||||
Username: "testuser",
|
||||
Projects: make([]string, 0),
|
||||
Roles: []string{"user"},
|
||||
AuthType: 0,
|
||||
AuthSource: 2,
|
||||
}
|
||||
|
||||
// Step 1: Start the first job (jobId=999)
|
||||
const startJobBody1 string = `{
|
||||
"jobId": 999,
|
||||
"user": "testuser",
|
||||
"project": "testproj",
|
||||
"cluster": "testcluster",
|
||||
"partition": "default",
|
||||
"walltime": 3600,
|
||||
"numNodes": 1,
|
||||
"numHwthreads": 8,
|
||||
"numAcc": 0,
|
||||
"shared": "none",
|
||||
"monitoringStatus": 1,
|
||||
"smt": 1,
|
||||
"resources": [{"hostname": "host123", "hwthreads": [0, 1, 2, 3, 4, 5, 6, 7]}],
|
||||
"startTime": 200000000
|
||||
}`
|
||||
|
||||
if ok := t.Run("StartFirstJob", func(t *testing.T) {
|
||||
req := httptest.NewRequest(http.MethodPost, "/jobs/start_job/", bytes.NewBuffer([]byte(startJobBody1)))
|
||||
recorder := httptest.NewRecorder()
|
||||
ctx := context.WithValue(req.Context(), contextUserKey, contextUserValue)
|
||||
r.ServeHTTP(recorder, req.WithContext(ctx))
|
||||
if recorder.Result().StatusCode != http.StatusCreated {
|
||||
t.Fatal(recorder.Result().Status, recorder.Body.String())
|
||||
}
|
||||
}); !ok {
|
||||
return
|
||||
}
|
||||
|
||||
// Step 2: Sync to move job from cache to job table, then stop it as "failed"
|
||||
time.Sleep(1 * time.Second)
|
||||
restapi.JobRepository.SyncJobs()
|
||||
|
||||
const stopJobBody1 string = `{
|
||||
"jobId": 999,
|
||||
"startTime": 200000000,
|
||||
"cluster": "testcluster",
|
||||
"jobState": "failed",
|
||||
"stopTime": 200001000
|
||||
}`
|
||||
|
||||
if ok := t.Run("StopFirstJobAsFailed", func(t *testing.T) {
|
||||
req := httptest.NewRequest(http.MethodPost, "/jobs/stop_job/", bytes.NewBuffer([]byte(stopJobBody1)))
|
||||
recorder := httptest.NewRecorder()
|
||||
ctx := context.WithValue(req.Context(), contextUserKey, contextUserValue)
|
||||
r.ServeHTTP(recorder, req.WithContext(ctx))
|
||||
if recorder.Result().StatusCode != http.StatusOK {
|
||||
t.Fatal(recorder.Result().Status, recorder.Body.String())
|
||||
}
|
||||
|
||||
jobid, cluster := int64(999), "testcluster"
|
||||
job, err := restapi.JobRepository.Find(&jobid, &cluster, nil)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if job.State != schema.JobStateFailed {
|
||||
t.Fatalf("expected first job to be failed, got: %s", job.State)
|
||||
}
|
||||
}); !ok {
|
||||
return
|
||||
}
|
||||
|
||||
// Wait for archiving to complete
|
||||
time.Sleep(1 * time.Second)
|
||||
|
||||
// Step 3: Start a NEW job with the same jobId=999 but different startTime.
|
||||
// This job will sit in job_cache (not yet synced).
|
||||
const startJobBody2 string = `{
|
||||
"jobId": 999,
|
||||
"user": "testuser",
|
||||
"project": "testproj",
|
||||
"cluster": "testcluster",
|
||||
"partition": "default",
|
||||
"walltime": 3600,
|
||||
"numNodes": 1,
|
||||
"numHwthreads": 8,
|
||||
"numAcc": 0,
|
||||
"shared": "none",
|
||||
"monitoringStatus": 1,
|
||||
"smt": 1,
|
||||
"resources": [{"hostname": "host123", "hwthreads": [0, 1, 2, 3, 4, 5, 6, 7]}],
|
||||
"startTime": 300000000
|
||||
}`
|
||||
|
||||
if ok := t.Run("StartSecondJob", func(t *testing.T) {
|
||||
req := httptest.NewRequest(http.MethodPost, "/jobs/start_job/", bytes.NewBuffer([]byte(startJobBody2)))
|
||||
recorder := httptest.NewRecorder()
|
||||
ctx := context.WithValue(req.Context(), contextUserKey, contextUserValue)
|
||||
r.ServeHTTP(recorder, req.WithContext(ctx))
|
||||
if recorder.Result().StatusCode != http.StatusCreated {
|
||||
t.Fatal(recorder.Result().Status, recorder.Body.String())
|
||||
}
|
||||
}); !ok {
|
||||
return
|
||||
}
|
||||
|
||||
// Step 4: Stop the second job WITHOUT syncing first.
|
||||
// Before the fix, this would fail because Find() on the job table would
|
||||
// match the old failed job (jobId=999) and reject with "already stopped".
|
||||
const stopJobBody2 string = `{
|
||||
"jobId": 999,
|
||||
"startTime": 300000000,
|
||||
"cluster": "testcluster",
|
||||
"jobState": "completed",
|
||||
"stopTime": 300001000
|
||||
}`
|
||||
|
||||
t.Run("StopSecondJobBeforeSync", func(t *testing.T) {
|
||||
req := httptest.NewRequest(http.MethodPost, "/jobs/stop_job/", bytes.NewBuffer([]byte(stopJobBody2)))
|
||||
recorder := httptest.NewRecorder()
|
||||
ctx := context.WithValue(req.Context(), contextUserKey, contextUserValue)
|
||||
r.ServeHTTP(recorder, req.WithContext(ctx))
|
||||
if recorder.Result().StatusCode != http.StatusOK {
|
||||
t.Fatalf("expected stop to succeed for cached job, got: %s %s",
|
||||
recorder.Result().Status, recorder.Body.String())
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
@@ -13,7 +13,7 @@ import (
|
||||
|
||||
"github.com/ClusterCockpit/cc-backend/internal/repository"
|
||||
"github.com/ClusterCockpit/cc-backend/pkg/archive"
|
||||
"github.com/ClusterCockpit/cc-lib/schema"
|
||||
"github.com/ClusterCockpit/cc-lib/v2/schema"
|
||||
)
|
||||
|
||||
// GetClustersAPIResponse model
|
||||
@@ -27,7 +27,7 @@ type GetClustersAPIResponse struct {
|
||||
// @description Get a list of all cluster configs. Specific cluster can be requested using query parameter.
|
||||
// @produce json
|
||||
// @param cluster query string false "Job Cluster"
|
||||
// @success 200 {object} api.GetClustersApiResponse "Array of clusters"
|
||||
// @success 200 {object} api.GetClustersAPIResponse "Array of clusters"
|
||||
// @failure 400 {object} api.ErrorResponse "Bad Request"
|
||||
// @failure 401 {object} api.ErrorResponse "Unauthorized"
|
||||
// @failure 403 {object} api.ErrorResponse "Forbidden"
|
||||
@@ -36,9 +36,9 @@ type GetClustersAPIResponse struct {
|
||||
// @router /api/clusters/ [get]
|
||||
func (api *RestAPI) getClusters(rw http.ResponseWriter, r *http.Request) {
|
||||
if user := repository.GetUserFromContext(r.Context()); user != nil &&
|
||||
!user.HasRole(schema.RoleApi) {
|
||||
!user.HasRole(schema.RoleAPI) {
|
||||
|
||||
handleError(fmt.Errorf("missing role: %v", schema.GetRoleString(schema.RoleApi)), http.StatusForbidden, rw)
|
||||
handleError(fmt.Errorf("missing role: %v", schema.GetRoleString(schema.RoleAPI)), http.StatusForbidden, rw)
|
||||
return
|
||||
}
|
||||
|
||||
|
||||
1235
internal/api/docs.go
1235
internal/api/docs.go
File diff suppressed because it is too large
Load Diff
@@ -22,12 +22,12 @@ import (
|
||||
"github.com/ClusterCockpit/cc-backend/internal/graph"
|
||||
"github.com/ClusterCockpit/cc-backend/internal/graph/model"
|
||||
"github.com/ClusterCockpit/cc-backend/internal/importer"
|
||||
"github.com/ClusterCockpit/cc-backend/internal/metricDataDispatcher"
|
||||
"github.com/ClusterCockpit/cc-backend/internal/metricdispatch"
|
||||
"github.com/ClusterCockpit/cc-backend/internal/repository"
|
||||
"github.com/ClusterCockpit/cc-backend/pkg/archive"
|
||||
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
|
||||
"github.com/ClusterCockpit/cc-lib/schema"
|
||||
"github.com/gorilla/mux"
|
||||
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
|
||||
"github.com/ClusterCockpit/cc-lib/v2/schema"
|
||||
"github.com/go-chi/chi/v5"
|
||||
)
|
||||
|
||||
const (
|
||||
@@ -72,6 +72,14 @@ type EditMetaRequest struct {
|
||||
Value string `json:"value" example:"bash script"`
|
||||
}
|
||||
|
||||
// JobMetaRequest model
|
||||
type JobMetaRequest struct {
|
||||
JobId *int64 `json:"jobId" validate:"required" example:"123000"` // Cluster Job ID of job
|
||||
Cluster *string `json:"cluster" example:"fritz"` // Cluster of job
|
||||
StartTime *int64 `json:"startTime" example:"1649723812"` // Start Time of job as epoch
|
||||
Payload EditMetaRequest `json:"payload"` // Content to Add to Job Meta_Data
|
||||
}
|
||||
|
||||
type TagJobAPIRequest []*APITag
|
||||
|
||||
type GetJobAPIRequest []string
|
||||
@@ -104,7 +112,7 @@ type JobMetricWithName struct {
|
||||
// @param items-per-page query int false "Items per page (Default: 25)"
|
||||
// @param page query int false "Page Number (Default: 1)"
|
||||
// @param with-metadata query bool false "Include metadata (e.g. jobScript) in response"
|
||||
// @success 200 {object} api.GetJobsApiResponse "Job array and page info"
|
||||
// @success 200 {object} api.GetJobsAPIResponse "Job array and page info"
|
||||
// @failure 400 {object} api.ErrorResponse "Bad Request"
|
||||
// @failure 401 {object} api.ErrorResponse "Unauthorized"
|
||||
// @failure 403 {object} api.ErrorResponse "Forbidden"
|
||||
@@ -232,7 +240,7 @@ func (api *RestAPI) getJobs(rw http.ResponseWriter, r *http.Request) {
|
||||
// @produce json
|
||||
// @param id path int true "Database ID of Job"
|
||||
// @param all-metrics query bool false "Include all available metrics"
|
||||
// @success 200 {object} api.GetJobApiResponse "Job resource"
|
||||
// @success 200 {object} api.GetJobAPIResponse "Job resource"
|
||||
// @failure 400 {object} api.ErrorResponse "Bad Request"
|
||||
// @failure 401 {object} api.ErrorResponse "Unauthorized"
|
||||
// @failure 403 {object} api.ErrorResponse "Forbidden"
|
||||
@@ -243,17 +251,17 @@ func (api *RestAPI) getJobs(rw http.ResponseWriter, r *http.Request) {
|
||||
// @router /api/jobs/{id} [get]
|
||||
func (api *RestAPI) getCompleteJobByID(rw http.ResponseWriter, r *http.Request) {
|
||||
// Fetch job from db
|
||||
id, ok := mux.Vars(r)["id"]
|
||||
id := chi.URLParam(r, "id")
|
||||
var job *schema.Job
|
||||
var err error
|
||||
if ok {
|
||||
if id != "" {
|
||||
id, e := strconv.ParseInt(id, 10, 64)
|
||||
if e != nil {
|
||||
handleError(fmt.Errorf("integer expected in path for id: %w", e), http.StatusBadRequest, rw)
|
||||
return
|
||||
}
|
||||
|
||||
job, err = api.JobRepository.FindById(r.Context(), id) // Get Job from Repo by ID
|
||||
job, err = api.JobRepository.FindByID(r.Context(), id) // Get Job from Repo by ID
|
||||
} else {
|
||||
handleError(fmt.Errorf("the parameter 'id' is required"), http.StatusBadRequest, rw)
|
||||
return
|
||||
@@ -293,7 +301,7 @@ func (api *RestAPI) getCompleteJobByID(rw http.ResponseWriter, r *http.Request)
|
||||
}
|
||||
|
||||
if r.URL.Query().Get("all-metrics") == "true" {
|
||||
data, err = metricDataDispatcher.LoadData(job, nil, scopes, r.Context(), resolution)
|
||||
data, err = metricdispatch.LoadData(job, nil, scopes, r.Context(), resolution)
|
||||
if err != nil {
|
||||
cclog.Warnf("REST: error while loading all-metrics job data for JobID %d on %s", job.JobID, job.Cluster)
|
||||
return
|
||||
@@ -324,8 +332,8 @@ func (api *RestAPI) getCompleteJobByID(rw http.ResponseWriter, r *http.Request)
|
||||
// @accept json
|
||||
// @produce json
|
||||
// @param id path int true "Database ID of Job"
|
||||
// @param request body api.GetJobApiRequest true "Array of metric names"
|
||||
// @success 200 {object} api.GetJobApiResponse "Job resource"
|
||||
// @param request body api.GetJobAPIRequest true "Array of metric names"
|
||||
// @success 200 {object} api.GetJobAPIResponse "Job resource"
|
||||
// @failure 400 {object} api.ErrorResponse "Bad Request"
|
||||
// @failure 401 {object} api.ErrorResponse "Unauthorized"
|
||||
// @failure 403 {object} api.ErrorResponse "Forbidden"
|
||||
@@ -336,17 +344,17 @@ func (api *RestAPI) getCompleteJobByID(rw http.ResponseWriter, r *http.Request)
|
||||
// @router /api/jobs/{id} [post]
|
||||
func (api *RestAPI) getJobByID(rw http.ResponseWriter, r *http.Request) {
|
||||
// Fetch job from db
|
||||
id, ok := mux.Vars(r)["id"]
|
||||
id := chi.URLParam(r, "id")
|
||||
var job *schema.Job
|
||||
var err error
|
||||
if ok {
|
||||
if id != "" {
|
||||
id, e := strconv.ParseInt(id, 10, 64)
|
||||
if e != nil {
|
||||
handleError(fmt.Errorf("integer expected in path for id: %w", e), http.StatusBadRequest, rw)
|
||||
return
|
||||
}
|
||||
|
||||
job, err = api.JobRepository.FindById(r.Context(), id)
|
||||
job, err = api.JobRepository.FindByID(r.Context(), id)
|
||||
} else {
|
||||
handleError(errors.New("the parameter 'id' is required"), http.StatusBadRequest, rw)
|
||||
return
|
||||
@@ -389,7 +397,7 @@ func (api *RestAPI) getJobByID(rw http.ResponseWriter, r *http.Request) {
|
||||
resolution = max(resolution, mc.Timestep)
|
||||
}
|
||||
|
||||
data, err := metricDataDispatcher.LoadData(job, metrics, scopes, r.Context(), resolution)
|
||||
data, err := metricdispatch.LoadData(job, metrics, scopes, r.Context(), resolution)
|
||||
if err != nil {
|
||||
cclog.Warnf("REST: error while loading job data for JobID %d on %s", job.JobID, job.Cluster)
|
||||
return
|
||||
@@ -423,29 +431,29 @@ func (api *RestAPI) getJobByID(rw http.ResponseWriter, r *http.Request) {
|
||||
}
|
||||
|
||||
// editMeta godoc
|
||||
// @summary Edit meta-data json
|
||||
// @summary Edit meta-data json of job identified by database id
|
||||
// @tags Job add and modify
|
||||
// @description Edit key value pairs in job metadata json
|
||||
// @description Edit key value pairs in job metadata json of job specified by database id
|
||||
// @description If a key already exists its content will be overwritten
|
||||
// @accept json
|
||||
// @produce json
|
||||
// @param id path int true "Job Database ID"
|
||||
// @param request body api.EditMetaRequest true "Kay value pair to add"
|
||||
// @param request body api.EditMetaRequest true "Metadata Key value pair to add or update"
|
||||
// @success 200 {object} schema.Job "Updated job resource"
|
||||
// @failure 400 {object} api.ErrorResponse "Bad Request"
|
||||
// @failure 401 {object} api.ErrorResponse "Unauthorized"
|
||||
// @failure 404 {object} api.ErrorResponse "Job does not exist"
|
||||
// @failure 500 {object} api.ErrorResponse "Internal Server Error"
|
||||
// @security ApiKeyAuth
|
||||
// @router /api/jobs/edit_meta/{id} [post]
|
||||
// @router /api/jobs/edit_meta/{id} [patch]
|
||||
func (api *RestAPI) editMeta(rw http.ResponseWriter, r *http.Request) {
|
||||
id, err := strconv.ParseInt(mux.Vars(r)["id"], 10, 64)
|
||||
id, err := strconv.ParseInt(chi.URLParam(r, "id"), 10, 64)
|
||||
if err != nil {
|
||||
handleError(fmt.Errorf("parsing job ID failed: %w", err), http.StatusBadRequest, rw)
|
||||
return
|
||||
}
|
||||
|
||||
job, err := api.JobRepository.FindById(r.Context(), id)
|
||||
job, err := api.JobRepository.FindByID(r.Context(), id)
|
||||
if err != nil {
|
||||
handleError(fmt.Errorf("finding job failed: %w", err), http.StatusNotFound, rw)
|
||||
return
|
||||
@@ -469,6 +477,54 @@ func (api *RestAPI) editMeta(rw http.ResponseWriter, r *http.Request) {
|
||||
}
|
||||
}
|
||||
|
||||
// editMetaByRequest godoc
|
||||
// @summary Edit meta-data json of job identified by request
|
||||
// @tags Job add and modify
|
||||
// @description Edit key value pairs in metadata json of job specified by jobID, StartTime and Cluster
|
||||
// @description If a key already exists its content will be overwritten
|
||||
// @accept json
|
||||
// @produce json
|
||||
// @param request body api.JobMetaRequest true "Specifies job and payload to add or update"
|
||||
// @success 200 {object} schema.Job "Updated job resource"
|
||||
// @failure 400 {object} api.ErrorResponse "Bad Request"
|
||||
// @failure 401 {object} api.ErrorResponse "Unauthorized"
|
||||
// @failure 404 {object} api.ErrorResponse "Job does not exist"
|
||||
// @failure 500 {object} api.ErrorResponse "Internal Server Error"
|
||||
// @security ApiKeyAuth
|
||||
// @router /api/jobs/edit_meta/ [patch]
|
||||
func (api *RestAPI) editMetaByRequest(rw http.ResponseWriter, r *http.Request) {
|
||||
// Parse request body
|
||||
req := JobMetaRequest{}
|
||||
if err := decode(r.Body, &req); err != nil {
|
||||
handleError(fmt.Errorf("parsing request body failed: %w", err), http.StatusBadRequest, rw)
|
||||
return
|
||||
}
|
||||
|
||||
// Fetch job (that will have its meta_data edited) from db
|
||||
var job *schema.Job
|
||||
var err error
|
||||
if req.JobId == nil {
|
||||
handleError(errors.New("the field 'jobId' is required"), http.StatusBadRequest, rw)
|
||||
return
|
||||
}
|
||||
|
||||
// log.Printf("loading db job for editMetaByRequest... : JobMetaRequest=%v", req)
|
||||
job, err = api.JobRepository.Find(req.JobId, req.Cluster, req.StartTime)
|
||||
if err != nil {
|
||||
handleError(fmt.Errorf("finding job failed: %w", err), http.StatusUnprocessableEntity, rw)
|
||||
return
|
||||
}
|
||||
|
||||
if err := api.JobRepository.UpdateMetadata(job, req.Payload.Key, req.Payload.Value); err != nil {
|
||||
http.Error(rw, err.Error(), http.StatusInternalServerError)
|
||||
return
|
||||
}
|
||||
|
||||
rw.Header().Add("Content-Type", "application/json")
|
||||
rw.WriteHeader(http.StatusOK)
|
||||
json.NewEncoder(rw).Encode(job)
|
||||
}
|
||||
|
||||
// tagJob godoc
|
||||
// @summary Adds one or more tags to a job
|
||||
// @tags Job add and modify
|
||||
@@ -478,7 +534,7 @@ func (api *RestAPI) editMeta(rw http.ResponseWriter, r *http.Request) {
|
||||
// @accept json
|
||||
// @produce json
|
||||
// @param id path int true "Job Database ID"
|
||||
// @param request body api.TagJobApiRequest true "Array of tag-objects to add"
|
||||
// @param request body api.TagJobAPIRequest true "Array of tag-objects to add"
|
||||
// @success 200 {object} schema.Job "Updated job resource"
|
||||
// @failure 400 {object} api.ErrorResponse "Bad Request"
|
||||
// @failure 401 {object} api.ErrorResponse "Unauthorized"
|
||||
@@ -487,13 +543,13 @@ func (api *RestAPI) editMeta(rw http.ResponseWriter, r *http.Request) {
|
||||
// @security ApiKeyAuth
|
||||
// @router /api/jobs/tag_job/{id} [post]
|
||||
func (api *RestAPI) tagJob(rw http.ResponseWriter, r *http.Request) {
|
||||
id, err := strconv.ParseInt(mux.Vars(r)["id"], 10, 64)
|
||||
id, err := strconv.ParseInt(chi.URLParam(r, "id"), 10, 64)
|
||||
if err != nil {
|
||||
handleError(fmt.Errorf("parsing job ID failed: %w", err), http.StatusBadRequest, rw)
|
||||
return
|
||||
}
|
||||
|
||||
job, err := api.JobRepository.FindById(r.Context(), id)
|
||||
job, err := api.JobRepository.FindByID(r.Context(), id)
|
||||
if err != nil {
|
||||
handleError(fmt.Errorf("finding job failed: %w", err), http.StatusNotFound, rw)
|
||||
return
|
||||
@@ -542,7 +598,7 @@ func (api *RestAPI) tagJob(rw http.ResponseWriter, r *http.Request) {
|
||||
// @accept json
|
||||
// @produce json
|
||||
// @param id path int true "Job Database ID"
|
||||
// @param request body api.TagJobApiRequest true "Array of tag-objects to remove"
|
||||
// @param request body api.TagJobAPIRequest true "Array of tag-objects to remove"
|
||||
// @success 200 {object} schema.Job "Updated job resource"
|
||||
// @failure 400 {object} api.ErrorResponse "Bad Request"
|
||||
// @failure 401 {object} api.ErrorResponse "Unauthorized"
|
||||
@@ -551,13 +607,13 @@ func (api *RestAPI) tagJob(rw http.ResponseWriter, r *http.Request) {
|
||||
// @security ApiKeyAuth
|
||||
// @router /jobs/tag_job/{id} [delete]
|
||||
func (api *RestAPI) removeTagJob(rw http.ResponseWriter, r *http.Request) {
|
||||
id, err := strconv.ParseInt(mux.Vars(r)["id"], 10, 64)
|
||||
id, err := strconv.ParseInt(chi.URLParam(r, "id"), 10, 64)
|
||||
if err != nil {
|
||||
handleError(fmt.Errorf("parsing job ID failed: %w", err), http.StatusBadRequest, rw)
|
||||
return
|
||||
}
|
||||
|
||||
job, err := api.JobRepository.FindById(r.Context(), id)
|
||||
job, err := api.JobRepository.FindByID(r.Context(), id)
|
||||
if err != nil {
|
||||
handleError(fmt.Errorf("finding job failed: %w", err), http.StatusNotFound, rw)
|
||||
return
|
||||
@@ -606,7 +662,7 @@ func (api *RestAPI) removeTagJob(rw http.ResponseWriter, r *http.Request) {
|
||||
// @description Tag wills be removed from respective archive files.
|
||||
// @accept json
|
||||
// @produce plain
|
||||
// @param request body api.TagJobApiRequest true "Array of tag-objects to remove"
|
||||
// @param request body api.TagJobAPIRequest true "Array of tag-objects to remove"
|
||||
// @success 200 {string} string "Success Response"
|
||||
// @failure 400 {object} api.ErrorResponse "Bad Request"
|
||||
// @failure 401 {object} api.ErrorResponse "Unauthorized"
|
||||
@@ -650,7 +706,7 @@ func (api *RestAPI) removeTags(rw http.ResponseWriter, r *http.Request) {
|
||||
// @accept json
|
||||
// @produce json
|
||||
// @param request body schema.Job true "Job to add"
|
||||
// @success 201 {object} api.DefaultApiResponse "Job added successfully"
|
||||
// @success 201 {object} api.DefaultAPIResponse "Job added successfully"
|
||||
// @failure 400 {object} api.ErrorResponse "Bad Request"
|
||||
// @failure 401 {object} api.ErrorResponse "Unauthorized"
|
||||
// @failure 403 {object} api.ErrorResponse "Forbidden"
|
||||
@@ -691,13 +747,21 @@ func (api *RestAPI) startJob(rw http.ResponseWriter, r *http.Request) {
|
||||
for _, job := range jobs {
|
||||
// Check if jobs are within the same day (prevent duplicates)
|
||||
if (req.StartTime - job.StartTime) < secondsPerDay {
|
||||
handleError(fmt.Errorf("a job with that jobId, cluster and startTime already exists: dbid: %d, jobid: %d", job.ID, job.JobID), http.StatusUnprocessableEntity, rw)
|
||||
handleError(fmt.Errorf("a job with that jobId, cluster and startTime already exists: dbid: %d, jobid: %d", *job.ID, job.JobID), http.StatusUnprocessableEntity, rw)
|
||||
return
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
id, err := api.JobRepository.Start(&req)
|
||||
// When tags are present, insert directly into the job table so that the
|
||||
// returned ID can be used with AddTagOrCreate (which queries the job table).
|
||||
// Jobs without tags use the cache path as before.
|
||||
var id int64
|
||||
if len(req.Tags) > 0 {
|
||||
id, err = api.JobRepository.StartDirect(&req)
|
||||
} else {
|
||||
id, err = api.JobRepository.Start(&req)
|
||||
}
|
||||
if err != nil {
|
||||
handleError(fmt.Errorf("insert into database failed: %w", err), http.StatusInternalServerError, rw)
|
||||
return
|
||||
@@ -728,7 +792,7 @@ func (api *RestAPI) startJob(rw http.ResponseWriter, r *http.Request) {
|
||||
// @description Job to stop is specified by request body. All fields are required in this case.
|
||||
// @description Returns full job resource information according to 'Job' scheme.
|
||||
// @produce json
|
||||
// @param request body api.StopJobApiRequest true "All fields required"
|
||||
// @param request body api.StopJobAPIRequest true "All fields required"
|
||||
// @success 200 {object} schema.Job "Success message"
|
||||
// @failure 400 {object} api.ErrorResponse "Bad Request"
|
||||
// @failure 401 {object} api.ErrorResponse "Unauthorized"
|
||||
@@ -754,20 +818,20 @@ func (api *RestAPI) stopJobByRequest(rw http.ResponseWriter, r *http.Request) {
|
||||
return
|
||||
}
|
||||
|
||||
// cclog.Printf("loading db job for stopJobByRequest... : stopJobApiRequest=%v", req)
|
||||
job, err = api.JobRepository.Find(req.JobID, req.Cluster, req.StartTime)
|
||||
isCached := false
|
||||
job, err = api.JobRepository.FindCached(req.JobID, req.Cluster, req.StartTime)
|
||||
if err != nil {
|
||||
// Try cached jobs if not found in main repository
|
||||
cachedJob, cachedErr := api.JobRepository.FindCached(req.JobID, req.Cluster, req.StartTime)
|
||||
if cachedErr != nil {
|
||||
// Combine both errors for better debugging
|
||||
handleError(fmt.Errorf("finding job failed: %w (cached lookup also failed: %v)", err, cachedErr), http.StatusNotFound, rw)
|
||||
// Not in cache, try main job table
|
||||
job, err = api.JobRepository.Find(req.JobID, req.Cluster, req.StartTime)
|
||||
if err != nil {
|
||||
handleError(fmt.Errorf("finding job failed: %w", err), http.StatusNotFound, rw)
|
||||
return
|
||||
}
|
||||
job = cachedJob
|
||||
} else {
|
||||
isCached = true
|
||||
}
|
||||
|
||||
api.checkAndHandleStopJob(rw, job, req)
|
||||
api.checkAndHandleStopJob(rw, job, req, isCached)
|
||||
}
|
||||
|
||||
// deleteJobByID godoc
|
||||
@@ -776,7 +840,7 @@ func (api *RestAPI) stopJobByRequest(rw http.ResponseWriter, r *http.Request) {
|
||||
// @description Job to remove is specified by database ID. This will not remove the job from the job archive.
|
||||
// @produce json
|
||||
// @param id path int true "Database ID of Job"
|
||||
// @success 200 {object} api.DefaultApiResponse "Success message"
|
||||
// @success 200 {object} api.DefaultAPIResponse "Success message"
|
||||
// @failure 400 {object} api.ErrorResponse "Bad Request"
|
||||
// @failure 401 {object} api.ErrorResponse "Unauthorized"
|
||||
// @failure 403 {object} api.ErrorResponse "Forbidden"
|
||||
@@ -787,16 +851,16 @@ func (api *RestAPI) stopJobByRequest(rw http.ResponseWriter, r *http.Request) {
|
||||
// @router /api/jobs/delete_job/{id} [delete]
|
||||
func (api *RestAPI) deleteJobByID(rw http.ResponseWriter, r *http.Request) {
|
||||
// Fetch job (that will be stopped) from db
|
||||
id, ok := mux.Vars(r)["id"]
|
||||
id := chi.URLParam(r, "id")
|
||||
var err error
|
||||
if ok {
|
||||
if id != "" {
|
||||
id, e := strconv.ParseInt(id, 10, 64)
|
||||
if e != nil {
|
||||
handleError(fmt.Errorf("integer expected in path for id: %w", e), http.StatusBadRequest, rw)
|
||||
return
|
||||
}
|
||||
|
||||
err = api.JobRepository.DeleteJobById(id)
|
||||
err = api.JobRepository.DeleteJobByID(id)
|
||||
} else {
|
||||
handleError(errors.New("the parameter 'id' is required"), http.StatusBadRequest, rw)
|
||||
return
|
||||
@@ -820,8 +884,8 @@ func (api *RestAPI) deleteJobByID(rw http.ResponseWriter, r *http.Request) {
|
||||
// @description Job to delete is specified by request body. All fields are required in this case.
|
||||
// @accept json
|
||||
// @produce json
|
||||
// @param request body api.DeleteJobApiRequest true "All fields required"
|
||||
// @success 200 {object} api.DefaultApiResponse "Success message"
|
||||
// @param request body api.DeleteJobAPIRequest true "All fields required"
|
||||
// @success 200 {object} api.DefaultAPIResponse "Success message"
|
||||
// @failure 400 {object} api.ErrorResponse "Bad Request"
|
||||
// @failure 401 {object} api.ErrorResponse "Unauthorized"
|
||||
// @failure 403 {object} api.ErrorResponse "Forbidden"
|
||||
@@ -852,7 +916,7 @@ func (api *RestAPI) deleteJobByRequest(rw http.ResponseWriter, r *http.Request)
|
||||
return
|
||||
}
|
||||
|
||||
err = api.JobRepository.DeleteJobById(*job.ID)
|
||||
err = api.JobRepository.DeleteJobByID(*job.ID)
|
||||
if err != nil {
|
||||
handleError(fmt.Errorf("deleting job failed: %w", err), http.StatusUnprocessableEntity, rw)
|
||||
return
|
||||
@@ -861,7 +925,7 @@ func (api *RestAPI) deleteJobByRequest(rw http.ResponseWriter, r *http.Request)
|
||||
rw.Header().Add("Content-Type", "application/json")
|
||||
rw.WriteHeader(http.StatusOK)
|
||||
if err := json.NewEncoder(rw).Encode(DefaultAPIResponse{
|
||||
Message: fmt.Sprintf("Successfully deleted job %d", job.ID),
|
||||
Message: fmt.Sprintf("Successfully deleted job %d", *job.ID),
|
||||
}); err != nil {
|
||||
cclog.Errorf("Failed to encode response: %v", err)
|
||||
}
|
||||
@@ -873,7 +937,7 @@ func (api *RestAPI) deleteJobByRequest(rw http.ResponseWriter, r *http.Request)
|
||||
// @description Remove all jobs with start time before timestamp. The jobs will not be removed from the job archive.
|
||||
// @produce json
|
||||
// @param ts path int true "Unix epoch timestamp"
|
||||
// @success 200 {object} api.DefaultApiResponse "Success message"
|
||||
// @success 200 {object} api.DefaultAPIResponse "Success message"
|
||||
// @failure 400 {object} api.ErrorResponse "Bad Request"
|
||||
// @failure 401 {object} api.ErrorResponse "Unauthorized"
|
||||
// @failure 403 {object} api.ErrorResponse "Forbidden"
|
||||
@@ -886,9 +950,9 @@ func (api *RestAPI) deleteJobByRequest(rw http.ResponseWriter, r *http.Request)
|
||||
func (api *RestAPI) deleteJobBefore(rw http.ResponseWriter, r *http.Request) {
|
||||
var cnt int
|
||||
// Fetch job (that will be stopped) from db
|
||||
id, ok := mux.Vars(r)["ts"]
|
||||
id := chi.URLParam(r, "ts")
|
||||
var err error
|
||||
if ok {
|
||||
if id != "" {
|
||||
ts, e := strconv.ParseInt(id, 10, 64)
|
||||
if e != nil {
|
||||
handleError(fmt.Errorf("integer expected in path for ts: %w", e), http.StatusBadRequest, rw)
|
||||
@@ -896,11 +960,13 @@ func (api *RestAPI) deleteJobBefore(rw http.ResponseWriter, r *http.Request) {
|
||||
}
|
||||
|
||||
// Check for omit-tagged query parameter
|
||||
omitTagged := false
|
||||
omitTagged := "none"
|
||||
if omitTaggedStr := r.URL.Query().Get("omit-tagged"); omitTaggedStr != "" {
|
||||
omitTagged, e = strconv.ParseBool(omitTaggedStr)
|
||||
if e != nil {
|
||||
handleError(fmt.Errorf("boolean expected for omit-tagged parameter: %w", e), http.StatusBadRequest, rw)
|
||||
switch omitTaggedStr {
|
||||
case "none", "all", "user":
|
||||
omitTagged = omitTaggedStr
|
||||
default:
|
||||
handleError(fmt.Errorf("omit-tagged must be one of: none, all, user"), http.StatusBadRequest, rw)
|
||||
return
|
||||
}
|
||||
}
|
||||
@@ -924,20 +990,20 @@ func (api *RestAPI) deleteJobBefore(rw http.ResponseWriter, r *http.Request) {
|
||||
}
|
||||
}
|
||||
|
||||
func (api *RestAPI) checkAndHandleStopJob(rw http.ResponseWriter, job *schema.Job, req StopJobAPIRequest) {
|
||||
func (api *RestAPI) checkAndHandleStopJob(rw http.ResponseWriter, job *schema.Job, req StopJobAPIRequest, isCached bool) {
|
||||
// Sanity checks
|
||||
if job.State != schema.JobStateRunning {
|
||||
handleError(fmt.Errorf("jobId %d (id %d) on %s : job has already been stopped (state is: %s)", job.JobID, job.ID, job.Cluster, job.State), http.StatusUnprocessableEntity, rw)
|
||||
handleError(fmt.Errorf("jobId %d (id %d) on %s : job has already been stopped (state is: %s)", job.JobID, *job.ID, job.Cluster, job.State), http.StatusUnprocessableEntity, rw)
|
||||
return
|
||||
}
|
||||
|
||||
if job.StartTime > req.StopTime {
|
||||
handleError(fmt.Errorf("jobId %d (id %d) on %s : stopTime %d must be larger/equal than startTime %d", job.JobID, job.ID, job.Cluster, req.StopTime, job.StartTime), http.StatusBadRequest, rw)
|
||||
handleError(fmt.Errorf("jobId %d (id %d) on %s : stopTime %d must be larger/equal than startTime %d", job.JobID, *job.ID, job.Cluster, req.StopTime, job.StartTime), http.StatusBadRequest, rw)
|
||||
return
|
||||
}
|
||||
|
||||
if req.State != "" && !req.State.Valid() {
|
||||
handleError(fmt.Errorf("jobId %d (id %d) on %s : invalid requested job state: %#v", job.JobID, job.ID, job.Cluster, req.State), http.StatusBadRequest, rw)
|
||||
handleError(fmt.Errorf("jobId %d (id %d) on %s : invalid requested job state: %#v", job.JobID, *job.ID, job.Cluster, req.State), http.StatusBadRequest, rw)
|
||||
return
|
||||
} else if req.State == "" {
|
||||
req.State = schema.JobStateCompleted
|
||||
@@ -949,14 +1015,24 @@ func (api *RestAPI) checkAndHandleStopJob(rw http.ResponseWriter, job *schema.Jo
|
||||
api.JobRepository.Mutex.Lock()
|
||||
defer api.JobRepository.Mutex.Unlock()
|
||||
|
||||
if err := api.JobRepository.Stop(*job.ID, job.Duration, job.State, job.MonitoringStatus); err != nil {
|
||||
if err := api.JobRepository.StopCached(*job.ID, job.Duration, job.State, job.MonitoringStatus); err != nil {
|
||||
handleError(fmt.Errorf("jobId %d (id %d) on %s : marking job as '%s' (duration: %d) in DB failed: %w", job.JobID, job.ID, job.Cluster, job.State, job.Duration, err), http.StatusInternalServerError, rw)
|
||||
// If the job is still in job_cache, transfer it to the job table first
|
||||
// so that job.ID always points to the job table for downstream code
|
||||
if isCached {
|
||||
newID, err := api.JobRepository.TransferCachedJobToMain(*job.ID)
|
||||
if err != nil {
|
||||
handleError(fmt.Errorf("jobId %d (id %d) on %s : transferring cached job failed: %w", job.JobID, *job.ID, job.Cluster, err), http.StatusInternalServerError, rw)
|
||||
return
|
||||
}
|
||||
cclog.Infof("transferred cached job to main table: old id %d -> new id %d (jobId=%d)", *job.ID, newID, job.JobID)
|
||||
job.ID = &newID
|
||||
}
|
||||
|
||||
cclog.Infof("archiving job... (dbid: %d): cluster=%s, jobId=%d, user=%s, startTime=%d, duration=%d, state=%s", job.ID, job.Cluster, job.JobID, job.User, job.StartTime, job.Duration, job.State)
|
||||
if err := api.JobRepository.Stop(*job.ID, job.Duration, job.State, job.MonitoringStatus); err != nil {
|
||||
handleError(fmt.Errorf("jobId %d (id %d) on %s : marking job as '%s' (duration: %d) in DB failed: %w", job.JobID, *job.ID, job.Cluster, job.State, job.Duration, err), http.StatusInternalServerError, rw)
|
||||
return
|
||||
}
|
||||
|
||||
cclog.Infof("archiving job... (dbid: %d): cluster=%s, jobId=%d, user=%s, startTime=%d, duration=%d, state=%s", *job.ID, job.Cluster, job.JobID, job.User, job.StartTime, job.Duration, job.State)
|
||||
|
||||
// Send a response (with status OK). This means that errors that happen from here on forward
|
||||
// can *NOT* be communicated to the client. If reading from a MetricDataRepository or
|
||||
@@ -977,7 +1053,7 @@ func (api *RestAPI) checkAndHandleStopJob(rw http.ResponseWriter, job *schema.Jo
|
||||
}
|
||||
|
||||
func (api *RestAPI) getJobMetrics(rw http.ResponseWriter, r *http.Request) {
|
||||
id := mux.Vars(r)["id"]
|
||||
id := chi.URLParam(r, "id")
|
||||
metrics := r.URL.Query()["metric"]
|
||||
var scopes []schema.MetricScope
|
||||
for _, scope := range r.URL.Query()["scope"] {
|
||||
@@ -1022,3 +1098,57 @@ func (api *RestAPI) getJobMetrics(rw http.ResponseWriter, r *http.Request) {
|
||||
cclog.Errorf("Failed to encode response: %v", err)
|
||||
}
|
||||
}
|
||||
|
||||
// GetUsedNodesAPIResponse model
|
||||
type GetUsedNodesAPIResponse struct {
|
||||
UsedNodes map[string][]string `json:"usedNodes"` // Map of cluster names to lists of used node hostnames
|
||||
}
|
||||
|
||||
// getUsedNodes godoc
|
||||
// @summary Lists used nodes by cluster
|
||||
// @tags Job query
|
||||
// @description Get a map of cluster names to lists of unique hostnames that are currently in use by running jobs that started before the specified timestamp.
|
||||
// @produce json
|
||||
// @param ts query int true "Unix timestamp to filter jobs (jobs with start_time < ts)"
|
||||
// @success 200 {object} api.GetUsedNodesAPIResponse "Map of cluster names to hostname lists"
|
||||
// @failure 400 {object} api.ErrorResponse "Bad Request"
|
||||
// @failure 401 {object} api.ErrorResponse "Unauthorized"
|
||||
// @failure 403 {object} api.ErrorResponse "Forbidden"
|
||||
// @failure 500 {object} api.ErrorResponse "Internal Server Error"
|
||||
// @security ApiKeyAuth
|
||||
// @router /api/jobs/used_nodes [get]
|
||||
func (api *RestAPI) getUsedNodes(rw http.ResponseWriter, r *http.Request) {
|
||||
if user := repository.GetUserFromContext(r.Context()); user != nil &&
|
||||
!user.HasRole(schema.RoleAPI) {
|
||||
handleError(fmt.Errorf("missing role: %v", schema.GetRoleString(schema.RoleAPI)), http.StatusForbidden, rw)
|
||||
return
|
||||
}
|
||||
|
||||
tsStr := r.URL.Query().Get("ts")
|
||||
if tsStr == "" {
|
||||
handleError(fmt.Errorf("missing required query parameter: ts"), http.StatusBadRequest, rw)
|
||||
return
|
||||
}
|
||||
|
||||
ts, err := strconv.ParseInt(tsStr, 10, 64)
|
||||
if err != nil {
|
||||
handleError(fmt.Errorf("invalid timestamp format: %w", err), http.StatusBadRequest, rw)
|
||||
return
|
||||
}
|
||||
|
||||
usedNodes, err := api.JobRepository.GetUsedNodes(ts)
|
||||
if err != nil {
|
||||
handleError(fmt.Errorf("failed to get used nodes: %w", err), http.StatusInternalServerError, rw)
|
||||
return
|
||||
}
|
||||
|
||||
rw.Header().Add("Content-Type", "application/json")
|
||||
payload := GetUsedNodesAPIResponse{
|
||||
UsedNodes: usedNodes,
|
||||
}
|
||||
|
||||
if err := json.NewEncoder(rw).Encode(payload); err != nil {
|
||||
handleError(err, http.StatusInternalServerError, rw)
|
||||
return
|
||||
}
|
||||
}
|
||||
|
||||
165
internal/api/log.go
Normal file
165
internal/api/log.go
Normal file
@@ -0,0 +1,165 @@
|
||||
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||
// All rights reserved. This file is part of cc-backend.
|
||||
// Use of this source code is governed by a MIT-style
|
||||
// license that can be found in the LICENSE file.
|
||||
package api
|
||||
|
||||
import (
|
||||
"bufio"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"net/http"
|
||||
"os/exec"
|
||||
"regexp"
|
||||
"strconv"
|
||||
"strings"
|
||||
|
||||
"github.com/ClusterCockpit/cc-backend/internal/config"
|
||||
"github.com/ClusterCockpit/cc-backend/internal/repository"
|
||||
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
|
||||
"github.com/ClusterCockpit/cc-lib/v2/schema"
|
||||
)
|
||||
|
||||
type LogEntry struct {
|
||||
Timestamp string `json:"timestamp"`
|
||||
Priority int `json:"priority"`
|
||||
Message string `json:"message"`
|
||||
Unit string `json:"unit"`
|
||||
}
|
||||
|
||||
var safePattern = regexp.MustCompile(`^[a-zA-Z0-9 :\-\.]+$`)
|
||||
|
||||
func (api *RestAPI) getJournalLog(rw http.ResponseWriter, r *http.Request) {
|
||||
user := repository.GetUserFromContext(r.Context())
|
||||
if !user.HasRole(schema.RoleAdmin) {
|
||||
handleError(fmt.Errorf("only admins are allowed to view logs"), http.StatusForbidden, rw)
|
||||
return
|
||||
}
|
||||
|
||||
since := r.URL.Query().Get("since")
|
||||
if since == "" {
|
||||
since = "1 hour ago"
|
||||
}
|
||||
if !safePattern.MatchString(since) {
|
||||
handleError(fmt.Errorf("invalid 'since' parameter"), http.StatusBadRequest, rw)
|
||||
return
|
||||
}
|
||||
|
||||
lines := 200
|
||||
if l := r.URL.Query().Get("lines"); l != "" {
|
||||
n, err := strconv.Atoi(l)
|
||||
if err != nil || n < 1 {
|
||||
handleError(fmt.Errorf("invalid 'lines' parameter"), http.StatusBadRequest, rw)
|
||||
return
|
||||
}
|
||||
if n > 1000 {
|
||||
n = 1000
|
||||
}
|
||||
lines = n
|
||||
}
|
||||
|
||||
unit := config.Keys.SystemdUnit
|
||||
if unit == "" {
|
||||
unit = "clustercockpit.service"
|
||||
}
|
||||
|
||||
args := []string{
|
||||
"--output=json",
|
||||
"--no-pager",
|
||||
"-n", fmt.Sprintf("%d", lines),
|
||||
"--since", since,
|
||||
"-u", unit,
|
||||
}
|
||||
|
||||
if level := r.URL.Query().Get("level"); level != "" {
|
||||
n, err := strconv.Atoi(level)
|
||||
if err != nil || n < 0 || n > 7 {
|
||||
handleError(fmt.Errorf("invalid 'level' parameter (must be 0-7)"), http.StatusBadRequest, rw)
|
||||
return
|
||||
}
|
||||
args = append(args, "--priority", fmt.Sprintf("%d", n))
|
||||
}
|
||||
|
||||
if search := r.URL.Query().Get("search"); search != "" {
|
||||
if !safePattern.MatchString(search) {
|
||||
handleError(fmt.Errorf("invalid 'search' parameter"), http.StatusBadRequest, rw)
|
||||
return
|
||||
}
|
||||
args = append(args, "--grep", search)
|
||||
}
|
||||
|
||||
cclog.Debugf("calling journalctl with %s", strings.Join(args, " "))
|
||||
cmd := exec.CommandContext(r.Context(), "journalctl", args...)
|
||||
stdout, err := cmd.StdoutPipe()
|
||||
if err != nil {
|
||||
handleError(fmt.Errorf("failed to create pipe: %w", err), http.StatusInternalServerError, rw)
|
||||
return
|
||||
}
|
||||
|
||||
if err := cmd.Start(); err != nil {
|
||||
handleError(fmt.Errorf("failed to start journalctl: %w", err), http.StatusInternalServerError, rw)
|
||||
return
|
||||
}
|
||||
|
||||
entries := make([]LogEntry, 0, lines)
|
||||
scanner := bufio.NewScanner(stdout)
|
||||
for scanner.Scan() {
|
||||
var raw map[string]any
|
||||
if err := json.Unmarshal(scanner.Bytes(), &raw); err != nil {
|
||||
cclog.Debugf("error unmarshal log output: %v", err)
|
||||
continue
|
||||
}
|
||||
|
||||
priority := 6 // default info
|
||||
if p, ok := raw["PRIORITY"]; ok {
|
||||
switch v := p.(type) {
|
||||
case string:
|
||||
if n, err := strconv.Atoi(v); err == nil {
|
||||
priority = n
|
||||
}
|
||||
case float64:
|
||||
priority = int(v)
|
||||
}
|
||||
}
|
||||
|
||||
msg := ""
|
||||
if m, ok := raw["MESSAGE"]; ok {
|
||||
if s, ok := m.(string); ok {
|
||||
msg = s
|
||||
}
|
||||
}
|
||||
|
||||
ts := ""
|
||||
if t, ok := raw["__REALTIME_TIMESTAMP"]; ok {
|
||||
if s, ok := t.(string); ok {
|
||||
ts = s
|
||||
}
|
||||
}
|
||||
|
||||
unitName := ""
|
||||
if u, ok := raw["_SYSTEMD_UNIT"]; ok {
|
||||
if s, ok := u.(string); ok {
|
||||
unitName = s
|
||||
}
|
||||
}
|
||||
|
||||
entries = append(entries, LogEntry{
|
||||
Timestamp: ts,
|
||||
Priority: priority,
|
||||
Message: msg,
|
||||
Unit: unitName,
|
||||
})
|
||||
}
|
||||
|
||||
if err := cmd.Wait(); err != nil {
|
||||
// journalctl returns exit code 1 when --grep matches nothing
|
||||
if len(entries) == 0 {
|
||||
cclog.Debugf("journalctl exited with: %v", err)
|
||||
}
|
||||
}
|
||||
|
||||
rw.Header().Set("Content-Type", "application/json")
|
||||
if err := json.NewEncoder(rw).Encode(entries); err != nil {
|
||||
cclog.Errorf("Failed to encode log entries: %v", err)
|
||||
}
|
||||
}
|
||||
@@ -10,15 +10,14 @@ import (
|
||||
"encoding/json"
|
||||
"errors"
|
||||
"fmt"
|
||||
"io"
|
||||
"net/http"
|
||||
"strconv"
|
||||
"strings"
|
||||
|
||||
"github.com/ClusterCockpit/cc-backend/internal/memorystore"
|
||||
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
|
||||
"github.com/ClusterCockpit/cc-backend/pkg/metricstore"
|
||||
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
|
||||
|
||||
"github.com/influxdata/line-protocol/v2/lineprotocol"
|
||||
"github.com/ClusterCockpit/cc-line-protocol/v2/lineprotocol"
|
||||
)
|
||||
|
||||
// handleFree godoc
|
||||
@@ -58,7 +57,7 @@ func freeMetrics(rw http.ResponseWriter, r *http.Request) {
|
||||
return
|
||||
}
|
||||
|
||||
ms := memorystore.GetMemoryStore()
|
||||
ms := metricstore.GetMemoryStore()
|
||||
n := 0
|
||||
for _, sel := range selectors {
|
||||
bn, err := ms.Free(sel, to)
|
||||
@@ -90,16 +89,17 @@ func freeMetrics(rw http.ResponseWriter, r *http.Request) {
|
||||
// @security ApiKeyAuth
|
||||
// @router /write/ [post]
|
||||
func writeMetrics(rw http.ResponseWriter, r *http.Request) {
|
||||
bytes, err := io.ReadAll(r.Body)
|
||||
rw.Header().Add("Content-Type", "application/json")
|
||||
if err != nil {
|
||||
handleError(err, http.StatusInternalServerError, rw)
|
||||
return
|
||||
}
|
||||
|
||||
ms := memorystore.GetMemoryStore()
|
||||
dec := lineprotocol.NewDecoderWithBytes(bytes)
|
||||
if err := memorystore.DecodeLine(dec, ms, r.URL.Query().Get("cluster")); err != nil {
|
||||
// Extract the "cluster" query parameter without allocating a url.Values map.
|
||||
cluster := queryParam(r.URL.RawQuery, "cluster")
|
||||
|
||||
// Stream directly from the request body instead of copying it into a
|
||||
// temporary buffer via io.ReadAll. The line-protocol decoder supports
|
||||
// io.Reader natively, so this avoids the largest heap allocation.
|
||||
ms := metricstore.GetMemoryStore()
|
||||
dec := lineprotocol.NewDecoder(r.Body)
|
||||
if err := metricstore.DecodeLine(dec, ms, cluster); err != nil {
|
||||
cclog.Errorf("/api/write error: %s", err.Error())
|
||||
handleError(err, http.StatusBadRequest, rw)
|
||||
return
|
||||
@@ -107,6 +107,20 @@ func writeMetrics(rw http.ResponseWriter, r *http.Request) {
|
||||
rw.WriteHeader(http.StatusOK)
|
||||
}
|
||||
|
||||
// queryParam extracts a single query-parameter value from a raw query string
|
||||
// without allocating a url.Values map. Returns "" if the key is not present.
|
||||
func queryParam(raw, key string) string {
|
||||
for raw != "" {
|
||||
var kv string
|
||||
kv, raw, _ = strings.Cut(raw, "&")
|
||||
k, v, _ := strings.Cut(kv, "=")
|
||||
if k == key {
|
||||
return v
|
||||
}
|
||||
}
|
||||
return ""
|
||||
}
|
||||
|
||||
// handleDebug godoc
|
||||
// @summary Debug endpoint
|
||||
// @tags debug
|
||||
@@ -129,42 +143,9 @@ func debugMetrics(rw http.ResponseWriter, r *http.Request) {
|
||||
selector = strings.Split(raw, ":")
|
||||
}
|
||||
|
||||
ms := memorystore.GetMemoryStore()
|
||||
ms := metricstore.GetMemoryStore()
|
||||
if err := ms.DebugDump(bufio.NewWriter(rw), selector); err != nil {
|
||||
handleError(err, http.StatusBadRequest, rw)
|
||||
return
|
||||
}
|
||||
}
|
||||
|
||||
// handleHealthCheck godoc
|
||||
// @summary HealthCheck endpoint
|
||||
// @tags healthcheck
|
||||
// @description This endpoint allows the users to check if a node is healthy
|
||||
// @produce json
|
||||
// @param selector query string false "Selector"
|
||||
// @success 200 {string} string "Debug dump"
|
||||
// @failure 400 {object} api.ErrorResponse "Bad Request"
|
||||
// @failure 401 {object} api.ErrorResponse "Unauthorized"
|
||||
// @failure 403 {object} api.ErrorResponse "Forbidden"
|
||||
// @failure 500 {object} api.ErrorResponse "Internal Server Error"
|
||||
// @security ApiKeyAuth
|
||||
// @router /healthcheck/ [get]
|
||||
func metricsHealth(rw http.ResponseWriter, r *http.Request) {
|
||||
rawCluster := r.URL.Query().Get("cluster")
|
||||
rawNode := r.URL.Query().Get("node")
|
||||
|
||||
if rawCluster == "" || rawNode == "" {
|
||||
handleError(errors.New("'cluster' and 'node' are required query parameter"), http.StatusBadRequest, rw)
|
||||
return
|
||||
}
|
||||
|
||||
rw.Header().Add("Content-Type", "application/json")
|
||||
|
||||
selector := []string{rawCluster, rawNode}
|
||||
|
||||
ms := memorystore.GetMemoryStore()
|
||||
if err := ms.HealthCheck(bufio.NewWriter(rw), selector); err != nil {
|
||||
handleError(err, http.StatusBadRequest, rw)
|
||||
return
|
||||
}
|
||||
}
|
||||
400
internal/api/nats.go
Normal file
400
internal/api/nats.go
Normal file
@@ -0,0 +1,400 @@
|
||||
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||
// All rights reserved. This file is part of cc-backend.
|
||||
// Use of this source code is governed by a MIT-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
package api
|
||||
|
||||
import (
|
||||
"database/sql"
|
||||
"encoding/json"
|
||||
"strings"
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
"github.com/ClusterCockpit/cc-backend/internal/archiver"
|
||||
"github.com/ClusterCockpit/cc-backend/internal/config"
|
||||
"github.com/ClusterCockpit/cc-backend/internal/importer"
|
||||
"github.com/ClusterCockpit/cc-backend/internal/repository"
|
||||
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
|
||||
lp "github.com/ClusterCockpit/cc-lib/v2/ccMessage"
|
||||
"github.com/ClusterCockpit/cc-lib/v2/nats"
|
||||
"github.com/ClusterCockpit/cc-lib/v2/receivers"
|
||||
"github.com/ClusterCockpit/cc-lib/v2/schema"
|
||||
influx "github.com/ClusterCockpit/cc-line-protocol/v2/lineprotocol"
|
||||
)
|
||||
|
||||
// NatsAPI provides NATS subscription-based handlers for Job and Node operations.
|
||||
// It mirrors the functionality of the REST API but uses NATS messaging with
|
||||
// InfluxDB line protocol as the message format.
|
||||
//
|
||||
// # Message Format
|
||||
//
|
||||
// All NATS messages use InfluxDB line protocol format (https://docs.influxdata.com/influxdb/v2.0/reference/syntax/line-protocol/)
|
||||
// with the following structure:
|
||||
//
|
||||
// measurement,tag1=value1,tag2=value2 field1=value1,field2=value2 timestamp
|
||||
//
|
||||
// # Job Events
|
||||
//
|
||||
// Job start/stop events use the "job" measurement with a "function" tag to distinguish operations:
|
||||
//
|
||||
// job,function=start_job event="{...JSON payload...}" <timestamp>
|
||||
// job,function=stop_job event="{...JSON payload...}" <timestamp>
|
||||
//
|
||||
// The JSON payload in the "event" field follows the schema.Job or StopJobAPIRequest structure.
|
||||
//
|
||||
// Example job start message:
|
||||
//
|
||||
// job,function=start_job event="{\"jobId\":1001,\"user\":\"testuser\",\"cluster\":\"testcluster\",...}" 1234567890000000000
|
||||
//
|
||||
// # Node State Events
|
||||
//
|
||||
// Node state updates use the "nodestate" measurement with cluster information:
|
||||
//
|
||||
// nodestate event="{...JSON payload...}" <timestamp>
|
||||
//
|
||||
// The JSON payload follows the UpdateNodeStatesRequest structure.
|
||||
//
|
||||
// Example node state message:
|
||||
//
|
||||
// nodestate event="{\"cluster\":\"testcluster\",\"nodes\":[{\"hostname\":\"node01\",\"states\":[\"idle\"]}]}" 1234567890000000000
|
||||
type NatsAPI struct {
|
||||
JobRepository *repository.JobRepository
|
||||
// RepositoryMutex protects job creation operations from race conditions
|
||||
// when checking for duplicate jobs during startJob calls.
|
||||
RepositoryMutex sync.Mutex
|
||||
}
|
||||
|
||||
// NewNatsAPI creates a new NatsAPI instance with default dependencies.
|
||||
func NewNatsAPI() *NatsAPI {
|
||||
return &NatsAPI{
|
||||
JobRepository: repository.GetJobRepository(),
|
||||
}
|
||||
}
|
||||
|
||||
// StartSubscriptions registers all NATS subscriptions for Job and Node APIs.
|
||||
// Returns an error if the NATS client is not available or subscription fails.
|
||||
func (api *NatsAPI) StartSubscriptions() error {
|
||||
client := nats.GetClient()
|
||||
if client == nil {
|
||||
cclog.Warn("NATS client not available, skipping API subscriptions")
|
||||
return nil
|
||||
}
|
||||
|
||||
if config.Keys.APISubjects != nil {
|
||||
|
||||
s := config.Keys.APISubjects
|
||||
|
||||
if err := client.Subscribe(s.SubjectJobEvent, api.handleJobEvent); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
if err := client.Subscribe(s.SubjectNodeState, api.handleNodeState); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
cclog.Info("NATS API subscriptions started")
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// processJobEvent routes job event messages to the appropriate handler based on the "function" tag.
|
||||
// Validates that required tags and fields are present before processing.
|
||||
func (api *NatsAPI) processJobEvent(msg lp.CCMessage) {
|
||||
function, ok := msg.GetTag("function")
|
||||
if !ok {
|
||||
cclog.Errorf("Job event is missing required tag 'function': measurement=%s", msg.Name())
|
||||
return
|
||||
}
|
||||
|
||||
switch function {
|
||||
case "start_job":
|
||||
v, ok := msg.GetEventValue()
|
||||
if !ok {
|
||||
cclog.Errorf("Job start event is missing event field with JSON payload")
|
||||
return
|
||||
}
|
||||
api.handleStartJob(v)
|
||||
|
||||
case "stop_job":
|
||||
v, ok := msg.GetEventValue()
|
||||
if !ok {
|
||||
cclog.Errorf("Job stop event is missing event field with JSON payload")
|
||||
return
|
||||
}
|
||||
api.handleStopJob(v)
|
||||
|
||||
default:
|
||||
cclog.Warnf("Unknown job event function '%s', expected 'start_job' or 'stop_job'", function)
|
||||
}
|
||||
}
|
||||
|
||||
// handleJobEvent processes job-related messages received via NATS using InfluxDB line protocol.
|
||||
// The message must be in line protocol format with measurement="job" and include:
|
||||
// - tag "function" with value "start_job" or "stop_job"
|
||||
// - field "event" containing JSON payload (schema.Job or StopJobAPIRequest)
|
||||
//
|
||||
// Example: job,function=start_job event="{\"jobId\":1001,...}" 1234567890000000000
|
||||
func (api *NatsAPI) handleJobEvent(subject string, data []byte) {
|
||||
if len(data) == 0 {
|
||||
cclog.Warnf("NATS %s: received empty message", subject)
|
||||
return
|
||||
}
|
||||
|
||||
d := influx.NewDecoderWithBytes(data)
|
||||
|
||||
for d.Next() {
|
||||
m, err := receivers.DecodeInfluxMessage(d)
|
||||
if err != nil {
|
||||
cclog.Errorf("NATS %s: failed to decode InfluxDB line protocol message: %v", subject, err)
|
||||
return
|
||||
}
|
||||
|
||||
if !m.IsEvent() {
|
||||
cclog.Debugf("NATS %s: received non-event message, skipping", subject)
|
||||
continue
|
||||
}
|
||||
|
||||
if m.Name() == "job" {
|
||||
api.processJobEvent(m)
|
||||
} else {
|
||||
cclog.Debugf("NATS %s: unexpected measurement name '%s', expected 'job'", subject, m.Name())
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// handleStartJob processes job start messages received via NATS.
|
||||
// The payload parameter contains JSON following the schema.Job structure.
|
||||
// Jobs are validated, checked for duplicates, and inserted into the database.
|
||||
func (api *NatsAPI) handleStartJob(payload string) {
|
||||
if payload == "" {
|
||||
cclog.Error("NATS start job: payload is empty")
|
||||
return
|
||||
}
|
||||
req := schema.Job{
|
||||
Shared: "none",
|
||||
MonitoringStatus: schema.MonitoringStatusRunningOrArchiving,
|
||||
}
|
||||
|
||||
dec := json.NewDecoder(strings.NewReader(payload))
|
||||
dec.DisallowUnknownFields()
|
||||
if err := dec.Decode(&req); err != nil {
|
||||
cclog.Errorf("NATS start job: parsing request failed: %v", err)
|
||||
return
|
||||
}
|
||||
|
||||
cclog.Debugf("NATS start job: %s", req.GoString())
|
||||
req.State = schema.JobStateRunning
|
||||
|
||||
if err := importer.SanityChecks(&req); err != nil {
|
||||
cclog.Errorf("NATS start job: sanity check failed: %v", err)
|
||||
return
|
||||
}
|
||||
|
||||
var unlockOnce sync.Once
|
||||
api.RepositoryMutex.Lock()
|
||||
defer unlockOnce.Do(api.RepositoryMutex.Unlock)
|
||||
|
||||
jobs, err := api.JobRepository.FindAll(&req.JobID, &req.Cluster, nil)
|
||||
if err != nil && err != sql.ErrNoRows {
|
||||
cclog.Errorf("NATS start job: checking for duplicate failed: %v", err)
|
||||
return
|
||||
}
|
||||
if err == nil {
|
||||
for _, job := range jobs {
|
||||
if (req.StartTime - job.StartTime) < secondsPerDay {
|
||||
cclog.Errorf("NATS start job: job with jobId %d, cluster %s already exists (dbid: %d)",
|
||||
req.JobID, req.Cluster, job.ID)
|
||||
return
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// When tags are present, insert directly into the job table so that the
|
||||
// returned ID can be used with AddTagOrCreate (which queries the job table).
|
||||
var id int64
|
||||
if len(req.Tags) > 0 {
|
||||
id, err = api.JobRepository.StartDirect(&req)
|
||||
} else {
|
||||
id, err = api.JobRepository.Start(&req)
|
||||
}
|
||||
if err != nil {
|
||||
cclog.Errorf("NATS start job: insert into database failed: %v", err)
|
||||
return
|
||||
}
|
||||
unlockOnce.Do(api.RepositoryMutex.Unlock)
|
||||
|
||||
for _, tag := range req.Tags {
|
||||
if _, err := api.JobRepository.AddTagOrCreate(nil, id, tag.Type, tag.Name, tag.Scope); err != nil {
|
||||
cclog.Errorf("NATS start job: adding tag to new job %d failed: %v", id, err)
|
||||
return
|
||||
}
|
||||
}
|
||||
|
||||
cclog.Infof("NATS: new job (id: %d): cluster=%s, jobId=%d, user=%s, startTime=%d",
|
||||
id, req.Cluster, req.JobID, req.User, req.StartTime)
|
||||
}
|
||||
|
||||
// handleStopJob processes job stop messages received via NATS.
|
||||
// The payload parameter contains JSON following the StopJobAPIRequest structure.
|
||||
// The job is marked as stopped in the database and archiving is triggered if monitoring is enabled.
|
||||
func (api *NatsAPI) handleStopJob(payload string) {
|
||||
if payload == "" {
|
||||
cclog.Error("NATS stop job: payload is empty")
|
||||
return
|
||||
}
|
||||
var req StopJobAPIRequest
|
||||
|
||||
dec := json.NewDecoder(strings.NewReader(payload))
|
||||
dec.DisallowUnknownFields()
|
||||
if err := dec.Decode(&req); err != nil {
|
||||
cclog.Errorf("NATS job stop: parsing request failed: %v", err)
|
||||
return
|
||||
}
|
||||
|
||||
if req.JobID == nil {
|
||||
cclog.Errorf("NATS job stop: the field 'jobId' is required")
|
||||
return
|
||||
}
|
||||
|
||||
isCached := false
|
||||
job, err := api.JobRepository.FindCached(req.JobID, req.Cluster, req.StartTime)
|
||||
if err != nil {
|
||||
// Not in cache, try main job table
|
||||
job, err = api.JobRepository.Find(req.JobID, req.Cluster, req.StartTime)
|
||||
if err != nil {
|
||||
cclog.Errorf("NATS job stop: finding job failed: %v", err)
|
||||
return
|
||||
}
|
||||
} else {
|
||||
isCached = true
|
||||
}
|
||||
|
||||
if job.State != schema.JobStateRunning {
|
||||
cclog.Errorf("NATS job stop: jobId %d (id %d) on %s: job has already been stopped (state is: %s)",
|
||||
job.JobID, job.ID, job.Cluster, job.State)
|
||||
return
|
||||
}
|
||||
|
||||
if job.StartTime > req.StopTime {
|
||||
cclog.Errorf("NATS job stop: jobId %d (id %d) on %s: stopTime %d must be >= startTime %d",
|
||||
job.JobID, job.ID, job.Cluster, req.StopTime, job.StartTime)
|
||||
return
|
||||
}
|
||||
|
||||
if req.State != "" && !req.State.Valid() {
|
||||
cclog.Errorf("NATS job stop: jobId %d (id %d) on %s: invalid job state: %#v",
|
||||
job.JobID, job.ID, job.Cluster, req.State)
|
||||
return
|
||||
} else if req.State == "" {
|
||||
req.State = schema.JobStateCompleted
|
||||
}
|
||||
|
||||
job.Duration = int32(req.StopTime - job.StartTime)
|
||||
job.State = req.State
|
||||
api.JobRepository.Mutex.Lock()
|
||||
defer api.JobRepository.Mutex.Unlock()
|
||||
|
||||
// If the job is still in job_cache, transfer it to the job table first
|
||||
if isCached {
|
||||
newID, err := api.JobRepository.TransferCachedJobToMain(*job.ID)
|
||||
if err != nil {
|
||||
cclog.Errorf("NATS job stop: jobId %d (id %d) on %s: transferring cached job failed: %v",
|
||||
job.JobID, *job.ID, job.Cluster, err)
|
||||
return
|
||||
}
|
||||
cclog.Infof("NATS: transferred cached job to main table: old id %d -> new id %d (jobId=%d)", *job.ID, newID, job.JobID)
|
||||
job.ID = &newID
|
||||
}
|
||||
|
||||
if err := api.JobRepository.Stop(*job.ID, job.Duration, job.State, job.MonitoringStatus); err != nil {
|
||||
cclog.Errorf("NATS job stop: jobId %d (id %d) on %s: marking job as '%s' failed: %v",
|
||||
job.JobID, *job.ID, job.Cluster, job.State, err)
|
||||
return
|
||||
}
|
||||
|
||||
cclog.Infof("NATS: archiving job (dbid: %d): cluster=%s, jobId=%d, user=%s, startTime=%d, duration=%d, state=%s",
|
||||
*job.ID, job.Cluster, job.JobID, job.User, job.StartTime, job.Duration, job.State)
|
||||
|
||||
if job.MonitoringStatus == schema.MonitoringStatusDisabled {
|
||||
return
|
||||
}
|
||||
|
||||
archiver.TriggerArchiving(job)
|
||||
}
|
||||
|
||||
// processNodestateEvent extracts and processes node state data from the InfluxDB message.
|
||||
// Updates node states in the repository for all nodes in the payload.
|
||||
func (api *NatsAPI) processNodestateEvent(msg lp.CCMessage) {
|
||||
v, ok := msg.GetEventValue()
|
||||
if !ok {
|
||||
cclog.Errorf("Nodestate event is missing event field with JSON payload")
|
||||
return
|
||||
}
|
||||
|
||||
var req UpdateNodeStatesRequest
|
||||
|
||||
dec := json.NewDecoder(strings.NewReader(v))
|
||||
dec.DisallowUnknownFields()
|
||||
if err := dec.Decode(&req); err != nil {
|
||||
cclog.Errorf("NATS nodestate: parsing request failed: %v", err)
|
||||
return
|
||||
}
|
||||
|
||||
repo := repository.GetNodeRepository()
|
||||
requestReceived := time.Now().Unix()
|
||||
|
||||
for _, node := range req.Nodes {
|
||||
state := determineState(node.States)
|
||||
nodeState := schema.NodeStateDB{
|
||||
TimeStamp: requestReceived,
|
||||
NodeState: state,
|
||||
CpusAllocated: node.CpusAllocated,
|
||||
MemoryAllocated: node.MemoryAllocated,
|
||||
GpusAllocated: node.GpusAllocated,
|
||||
HealthState: schema.MonitoringStateFull,
|
||||
JobsRunning: node.JobsRunning,
|
||||
}
|
||||
|
||||
if err := repo.UpdateNodeState(node.Hostname, req.Cluster, &nodeState); err != nil {
|
||||
cclog.Errorf("NATS nodestate: updating node state for %s on %s failed: %v",
|
||||
node.Hostname, req.Cluster, err)
|
||||
}
|
||||
}
|
||||
|
||||
cclog.Debugf("NATS nodestate: updated %d node states for cluster %s", len(req.Nodes), req.Cluster)
|
||||
}
|
||||
|
||||
// handleNodeState processes node state update messages received via NATS using InfluxDB line protocol.
|
||||
// The message must be in line protocol format with measurement="nodestate" and include:
|
||||
// - field "event" containing JSON payload (UpdateNodeStatesRequest)
|
||||
//
|
||||
// Example: nodestate event="{\"cluster\":\"testcluster\",\"nodes\":[...]}" 1234567890000000000
|
||||
func (api *NatsAPI) handleNodeState(subject string, data []byte) {
|
||||
if len(data) == 0 {
|
||||
cclog.Warnf("NATS %s: received empty message", subject)
|
||||
return
|
||||
}
|
||||
|
||||
d := influx.NewDecoderWithBytes(data)
|
||||
|
||||
for d.Next() {
|
||||
m, err := receivers.DecodeInfluxMessage(d)
|
||||
if err != nil {
|
||||
cclog.Errorf("NATS %s: failed to decode InfluxDB line protocol message: %v", subject, err)
|
||||
return
|
||||
}
|
||||
|
||||
if !m.IsEvent() {
|
||||
cclog.Warnf("NATS %s: received non-event message, skipping", subject)
|
||||
continue
|
||||
}
|
||||
|
||||
if m.Name() == "nodestate" {
|
||||
api.processNodestateEvent(m)
|
||||
} else {
|
||||
cclog.Warnf("NATS %s: unexpected measurement name '%s', expected 'nodestate'", subject, m.Name())
|
||||
}
|
||||
}
|
||||
}
|
||||
947
internal/api/nats_test.go
Normal file
947
internal/api/nats_test.go
Normal file
@@ -0,0 +1,947 @@
|
||||
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||
// All rights reserved. This file is part of cc-backend.
|
||||
// Use of this source code is governed by a MIT-style
|
||||
// license that can be found in the LICENSE file.
|
||||
package api
|
||||
|
||||
import (
|
||||
"context"
|
||||
"database/sql"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"github.com/ClusterCockpit/cc-backend/internal/archiver"
|
||||
"github.com/ClusterCockpit/cc-backend/internal/auth"
|
||||
"github.com/ClusterCockpit/cc-backend/internal/config"
|
||||
"github.com/ClusterCockpit/cc-backend/internal/graph"
|
||||
"github.com/ClusterCockpit/cc-backend/internal/repository"
|
||||
"github.com/ClusterCockpit/cc-backend/pkg/archive"
|
||||
"github.com/ClusterCockpit/cc-backend/pkg/metricstore"
|
||||
ccconf "github.com/ClusterCockpit/cc-lib/v2/ccConfig"
|
||||
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
|
||||
lp "github.com/ClusterCockpit/cc-lib/v2/ccMessage"
|
||||
"github.com/ClusterCockpit/cc-lib/v2/schema"
|
||||
|
||||
_ "github.com/mattn/go-sqlite3"
|
||||
)
|
||||
|
||||
func setupNatsTest(t *testing.T) *NatsAPI {
|
||||
repository.ResetConnection()
|
||||
|
||||
const testconfig = `{
|
||||
"main": {
|
||||
"addr": "0.0.0.0:8080",
|
||||
"validate": false,
|
||||
"api-allowed-ips": [
|
||||
"*"
|
||||
]
|
||||
},
|
||||
"archive": {
|
||||
"kind": "file",
|
||||
"path": "./var/job-archive"
|
||||
},
|
||||
"auth": {
|
||||
"jwts": {
|
||||
"max-age": "2m"
|
||||
}
|
||||
}
|
||||
}`
|
||||
const testclusterJSON = `{
|
||||
"name": "testcluster",
|
||||
"subClusters": [
|
||||
{
|
||||
"name": "sc1",
|
||||
"nodes": "host123,host124,host125",
|
||||
"processorType": "Intel Core i7-4770",
|
||||
"socketsPerNode": 1,
|
||||
"coresPerSocket": 4,
|
||||
"threadsPerCore": 2,
|
||||
"flopRateScalar": {
|
||||
"unit": {
|
||||
"prefix": "G",
|
||||
"base": "F/s"
|
||||
},
|
||||
"value": 14
|
||||
},
|
||||
"flopRateSimd": {
|
||||
"unit": {
|
||||
"prefix": "G",
|
||||
"base": "F/s"
|
||||
},
|
||||
"value": 112
|
||||
},
|
||||
"memoryBandwidth": {
|
||||
"unit": {
|
||||
"prefix": "G",
|
||||
"base": "B/s"
|
||||
},
|
||||
"value": 24
|
||||
},
|
||||
"numberOfNodes": 70,
|
||||
"topology": {
|
||||
"node": [0, 1, 2, 3, 4, 5, 6, 7],
|
||||
"socket": [[0, 1, 2, 3, 4, 5, 6, 7]],
|
||||
"memoryDomain": [[0, 1, 2, 3, 4, 5, 6, 7]],
|
||||
"die": [[0, 1, 2, 3, 4, 5, 6, 7]],
|
||||
"core": [[0], [1], [2], [3], [4], [5], [6], [7]]
|
||||
}
|
||||
}
|
||||
],
|
||||
"metricConfig": [
|
||||
{
|
||||
"name": "load_one",
|
||||
"unit": { "base": ""},
|
||||
"scope": "node",
|
||||
"timestep": 60,
|
||||
"aggregation": "avg",
|
||||
"peak": 8,
|
||||
"normal": 0,
|
||||
"caution": 0,
|
||||
"alert": 0
|
||||
}
|
||||
]
|
||||
}`
|
||||
|
||||
cclog.Init("info", true)
|
||||
tmpdir := t.TempDir()
|
||||
jobarchive := filepath.Join(tmpdir, "job-archive")
|
||||
if err := os.Mkdir(jobarchive, 0o777); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
if err := os.WriteFile(filepath.Join(jobarchive, "version.txt"), fmt.Appendf(nil, "%d", 3), 0o666); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
if err := os.Mkdir(filepath.Join(jobarchive, "testcluster"), 0o777); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
if err := os.WriteFile(filepath.Join(jobarchive, "testcluster", "cluster.json"), []byte(testclusterJSON), 0o666); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
dbfilepath := filepath.Join(tmpdir, "test.db")
|
||||
err := repository.MigrateDB(dbfilepath)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
cfgFilePath := filepath.Join(tmpdir, "config.json")
|
||||
if err := os.WriteFile(cfgFilePath, []byte(testconfig), 0o666); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
ccconf.Init(cfgFilePath)
|
||||
|
||||
// Load and check main configuration
|
||||
if cfg := ccconf.GetPackageConfig("main"); cfg != nil {
|
||||
config.Init(cfg)
|
||||
} else {
|
||||
cclog.Abort("Main configuration must be present")
|
||||
}
|
||||
archiveCfg := fmt.Sprintf("{\"kind\": \"file\",\"path\": \"%s\"}", jobarchive)
|
||||
|
||||
repository.Connect(dbfilepath)
|
||||
|
||||
if err := archive.Init(json.RawMessage(archiveCfg)); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
// metricstore initialization removed - it's initialized via callback in tests
|
||||
|
||||
archiver.Start(repository.GetJobRepository(), context.Background())
|
||||
|
||||
if cfg := ccconf.GetPackageConfig("auth"); cfg != nil {
|
||||
auth.Init(&cfg)
|
||||
} else {
|
||||
cclog.Warn("Authentication disabled due to missing configuration")
|
||||
auth.Init(nil)
|
||||
}
|
||||
|
||||
graph.Init()
|
||||
|
||||
return NewNatsAPI()
|
||||
}
|
||||
|
||||
func cleanupNatsTest() {
|
||||
if err := archiver.Shutdown(5 * time.Second); err != nil {
|
||||
cclog.Warnf("Archiver shutdown timeout in tests: %v", err)
|
||||
}
|
||||
}
|
||||
|
||||
func TestNatsHandleStartJob(t *testing.T) {
|
||||
natsAPI := setupNatsTest(t)
|
||||
t.Cleanup(cleanupNatsTest)
|
||||
|
||||
tests := []struct {
|
||||
name string
|
||||
payload string
|
||||
expectError bool
|
||||
validateJob func(t *testing.T, job *schema.Job)
|
||||
shouldFindJob bool
|
||||
}{
|
||||
{
|
||||
name: "valid job start",
|
||||
payload: `{
|
||||
"jobId": 1001,
|
||||
"user": "testuser1",
|
||||
"project": "testproj1",
|
||||
"cluster": "testcluster",
|
||||
"partition": "main",
|
||||
"walltime": 7200,
|
||||
"numNodes": 1,
|
||||
"numHwthreads": 8,
|
||||
"numAcc": 0,
|
||||
"shared": "none",
|
||||
"monitoringStatus": 1,
|
||||
"smt": 1,
|
||||
"resources": [
|
||||
{
|
||||
"hostname": "host123",
|
||||
"hwthreads": [0, 1, 2, 3, 4, 5, 6, 7]
|
||||
}
|
||||
],
|
||||
"startTime": 1234567890
|
||||
}`,
|
||||
expectError: false,
|
||||
shouldFindJob: true,
|
||||
validateJob: func(t *testing.T, job *schema.Job) {
|
||||
if job.JobID != 1001 {
|
||||
t.Errorf("expected JobID 1001, got %d", job.JobID)
|
||||
}
|
||||
if job.User != "testuser1" {
|
||||
t.Errorf("expected user testuser1, got %s", job.User)
|
||||
}
|
||||
if job.State != schema.JobStateRunning {
|
||||
t.Errorf("expected state running, got %s", job.State)
|
||||
}
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "invalid JSON",
|
||||
payload: `{
|
||||
"jobId": "not a number",
|
||||
"user": "testuser2"
|
||||
}`,
|
||||
expectError: true,
|
||||
shouldFindJob: false,
|
||||
},
|
||||
{
|
||||
name: "missing required fields",
|
||||
payload: `{
|
||||
"jobId": 1002
|
||||
}`,
|
||||
expectError: true,
|
||||
shouldFindJob: false,
|
||||
},
|
||||
{
|
||||
name: "job with unknown fields (should fail due to DisallowUnknownFields)",
|
||||
payload: `{
|
||||
"jobId": 1003,
|
||||
"user": "testuser3",
|
||||
"project": "testproj3",
|
||||
"cluster": "testcluster",
|
||||
"partition": "main",
|
||||
"walltime": 3600,
|
||||
"numNodes": 1,
|
||||
"numHwthreads": 8,
|
||||
"unknownField": "should cause error",
|
||||
"startTime": 1234567900
|
||||
}`,
|
||||
expectError: true,
|
||||
shouldFindJob: false,
|
||||
},
|
||||
{
|
||||
name: "job with tags",
|
||||
payload: `{
|
||||
"jobId": 1004,
|
||||
"user": "testuser4",
|
||||
"project": "testproj4",
|
||||
"cluster": "testcluster",
|
||||
"partition": "main",
|
||||
"walltime": 3600,
|
||||
"numNodes": 1,
|
||||
"numHwthreads": 8,
|
||||
"numAcc": 0,
|
||||
"shared": "none",
|
||||
"monitoringStatus": 1,
|
||||
"smt": 1,
|
||||
"resources": [
|
||||
{
|
||||
"hostname": "host123",
|
||||
"hwthreads": [0, 1, 2, 3]
|
||||
}
|
||||
],
|
||||
"tags": [
|
||||
{
|
||||
"type": "test",
|
||||
"name": "testtag",
|
||||
"scope": "testuser4"
|
||||
}
|
||||
],
|
||||
"startTime": 1234567910
|
||||
}`,
|
||||
expectError: false,
|
||||
shouldFindJob: true,
|
||||
validateJob: func(t *testing.T, job *schema.Job) {
|
||||
if job.JobID != 1004 {
|
||||
t.Errorf("expected JobID 1004, got %d", job.JobID)
|
||||
}
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
for _, tt := range tests {
|
||||
t.Run(tt.name, func(t *testing.T) {
|
||||
natsAPI.handleStartJob(tt.payload)
|
||||
natsAPI.JobRepository.SyncJobs()
|
||||
|
||||
// Allow some time for async operations
|
||||
time.Sleep(100 * time.Millisecond)
|
||||
|
||||
if tt.shouldFindJob {
|
||||
// Extract jobId from payload
|
||||
var payloadMap map[string]any
|
||||
json.Unmarshal([]byte(tt.payload), &payloadMap)
|
||||
jobID := int64(payloadMap["jobId"].(float64))
|
||||
cluster := payloadMap["cluster"].(string)
|
||||
startTime := int64(payloadMap["startTime"].(float64))
|
||||
|
||||
job, err := natsAPI.JobRepository.Find(&jobID, &cluster, &startTime)
|
||||
if err != nil {
|
||||
if !tt.expectError {
|
||||
t.Fatalf("expected to find job, but got error: %v", err)
|
||||
}
|
||||
return
|
||||
}
|
||||
|
||||
if tt.validateJob != nil {
|
||||
tt.validateJob(t, job)
|
||||
}
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestNatsHandleStopJob(t *testing.T) {
|
||||
natsAPI := setupNatsTest(t)
|
||||
t.Cleanup(cleanupNatsTest)
|
||||
|
||||
// First, create a running job
|
||||
startPayload := `{
|
||||
"jobId": 2001,
|
||||
"user": "testuser",
|
||||
"project": "testproj",
|
||||
"cluster": "testcluster",
|
||||
"partition": "main",
|
||||
"walltime": 3600,
|
||||
"numNodes": 1,
|
||||
"numHwthreads": 8,
|
||||
"numAcc": 0,
|
||||
"shared": "none",
|
||||
"monitoringStatus": 1,
|
||||
"smt": 1,
|
||||
"resources": [
|
||||
{
|
||||
"hostname": "host123",
|
||||
"hwthreads": [0, 1, 2, 3, 4, 5, 6, 7]
|
||||
}
|
||||
],
|
||||
"startTime": 1234567890
|
||||
}`
|
||||
|
||||
natsAPI.handleStartJob(startPayload)
|
||||
natsAPI.JobRepository.SyncJobs()
|
||||
time.Sleep(100 * time.Millisecond)
|
||||
|
||||
tests := []struct {
|
||||
name string
|
||||
payload string
|
||||
expectError bool
|
||||
validateJob func(t *testing.T, job *schema.Job)
|
||||
setupJobFunc func() // Optional: create specific test job
|
||||
}{
|
||||
{
|
||||
name: "valid job stop - completed",
|
||||
payload: `{
|
||||
"jobId": 2001,
|
||||
"cluster": "testcluster",
|
||||
"startTime": 1234567890,
|
||||
"jobState": "completed",
|
||||
"stopTime": 1234571490
|
||||
}`,
|
||||
expectError: false,
|
||||
validateJob: func(t *testing.T, job *schema.Job) {
|
||||
if job.State != schema.JobStateCompleted {
|
||||
t.Errorf("expected state completed, got %s", job.State)
|
||||
}
|
||||
expectedDuration := int32(1234571490 - 1234567890)
|
||||
if job.Duration != expectedDuration {
|
||||
t.Errorf("expected duration %d, got %d", expectedDuration, job.Duration)
|
||||
}
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "valid job stop - failed",
|
||||
setupJobFunc: func() {
|
||||
startPayloadFailed := `{
|
||||
"jobId": 2002,
|
||||
"user": "testuser",
|
||||
"project": "testproj",
|
||||
"cluster": "testcluster",
|
||||
"partition": "main",
|
||||
"walltime": 3600,
|
||||
"numNodes": 1,
|
||||
"numHwthreads": 8,
|
||||
"numAcc": 0,
|
||||
"shared": "none",
|
||||
"monitoringStatus": 1,
|
||||
"smt": 1,
|
||||
"resources": [
|
||||
{
|
||||
"hostname": "host123",
|
||||
"hwthreads": [0, 1, 2, 3]
|
||||
}
|
||||
],
|
||||
"startTime": 1234567900
|
||||
}`
|
||||
natsAPI.handleStartJob(startPayloadFailed)
|
||||
natsAPI.JobRepository.SyncJobs()
|
||||
time.Sleep(100 * time.Millisecond)
|
||||
},
|
||||
payload: `{
|
||||
"jobId": 2002,
|
||||
"cluster": "testcluster",
|
||||
"startTime": 1234567900,
|
||||
"jobState": "failed",
|
||||
"stopTime": 1234569900
|
||||
}`,
|
||||
expectError: false,
|
||||
validateJob: func(t *testing.T, job *schema.Job) {
|
||||
if job.State != schema.JobStateFailed {
|
||||
t.Errorf("expected state failed, got %s", job.State)
|
||||
}
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "invalid JSON",
|
||||
payload: `{
|
||||
"jobId": "not a number"
|
||||
}`,
|
||||
expectError: true,
|
||||
},
|
||||
{
|
||||
name: "missing jobId",
|
||||
payload: `{
|
||||
"cluster": "testcluster",
|
||||
"jobState": "completed",
|
||||
"stopTime": 1234571490
|
||||
}`,
|
||||
expectError: true,
|
||||
},
|
||||
{
|
||||
name: "invalid job state",
|
||||
setupJobFunc: func() {
|
||||
startPayloadInvalid := `{
|
||||
"jobId": 2003,
|
||||
"user": "testuser",
|
||||
"project": "testproj",
|
||||
"cluster": "testcluster",
|
||||
"partition": "main",
|
||||
"walltime": 3600,
|
||||
"numNodes": 1,
|
||||
"numHwthreads": 8,
|
||||
"numAcc": 0,
|
||||
"shared": "none",
|
||||
"monitoringStatus": 1,
|
||||
"smt": 1,
|
||||
"resources": [
|
||||
{
|
||||
"hostname": "host123",
|
||||
"hwthreads": [0, 1]
|
||||
}
|
||||
],
|
||||
"startTime": 1234567910
|
||||
}`
|
||||
natsAPI.handleStartJob(startPayloadInvalid)
|
||||
natsAPI.JobRepository.SyncJobs()
|
||||
time.Sleep(100 * time.Millisecond)
|
||||
},
|
||||
payload: `{
|
||||
"jobId": 2003,
|
||||
"cluster": "testcluster",
|
||||
"startTime": 1234567910,
|
||||
"jobState": "invalid_state",
|
||||
"stopTime": 1234571510
|
||||
}`,
|
||||
expectError: true,
|
||||
},
|
||||
{
|
||||
name: "stopTime before startTime",
|
||||
setupJobFunc: func() {
|
||||
startPayloadTime := `{
|
||||
"jobId": 2004,
|
||||
"user": "testuser",
|
||||
"project": "testproj",
|
||||
"cluster": "testcluster",
|
||||
"partition": "main",
|
||||
"walltime": 3600,
|
||||
"numNodes": 1,
|
||||
"numHwthreads": 8,
|
||||
"numAcc": 0,
|
||||
"shared": "none",
|
||||
"monitoringStatus": 1,
|
||||
"smt": 1,
|
||||
"resources": [
|
||||
{
|
||||
"hostname": "host123",
|
||||
"hwthreads": [0]
|
||||
}
|
||||
],
|
||||
"startTime": 1234567920
|
||||
}`
|
||||
natsAPI.handleStartJob(startPayloadTime)
|
||||
natsAPI.JobRepository.SyncJobs()
|
||||
time.Sleep(100 * time.Millisecond)
|
||||
},
|
||||
payload: `{
|
||||
"jobId": 2004,
|
||||
"cluster": "testcluster",
|
||||
"startTime": 1234567920,
|
||||
"jobState": "completed",
|
||||
"stopTime": 1234567900
|
||||
}`,
|
||||
expectError: true,
|
||||
},
|
||||
{
|
||||
name: "job not found",
|
||||
payload: `{
|
||||
"jobId": 99999,
|
||||
"cluster": "testcluster",
|
||||
"startTime": 1234567890,
|
||||
"jobState": "completed",
|
||||
"stopTime": 1234571490
|
||||
}`,
|
||||
expectError: true,
|
||||
},
|
||||
}
|
||||
|
||||
testData := schema.JobData{
|
||||
"load_one": map[schema.MetricScope]*schema.JobMetric{
|
||||
schema.MetricScopeNode: {
|
||||
Unit: schema.Unit{Base: "load"},
|
||||
Timestep: 60,
|
||||
Series: []schema.Series{
|
||||
{
|
||||
Hostname: "host123",
|
||||
Statistics: schema.MetricStatistics{Min: 0.1, Avg: 0.2, Max: 0.3},
|
||||
Data: []schema.Float{0.1, 0.1, 0.1, 0.2, 0.2, 0.2, 0.3, 0.3, 0.3},
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
metricstore.TestLoadDataCallback = func(job *schema.Job, metrics []string, scopes []schema.MetricScope, ctx context.Context, resolution int) (schema.JobData, error) {
|
||||
return testData, nil
|
||||
}
|
||||
|
||||
for _, tt := range tests {
|
||||
t.Run(tt.name, func(t *testing.T) {
|
||||
if tt.setupJobFunc != nil {
|
||||
tt.setupJobFunc()
|
||||
}
|
||||
|
||||
natsAPI.handleStopJob(tt.payload)
|
||||
|
||||
// Allow some time for async operations
|
||||
time.Sleep(100 * time.Millisecond)
|
||||
|
||||
if !tt.expectError && tt.validateJob != nil {
|
||||
// Extract job details from payload
|
||||
var payloadMap map[string]any
|
||||
json.Unmarshal([]byte(tt.payload), &payloadMap)
|
||||
jobID := int64(payloadMap["jobId"].(float64))
|
||||
cluster := payloadMap["cluster"].(string)
|
||||
|
||||
var startTime *int64
|
||||
if st, ok := payloadMap["startTime"]; ok {
|
||||
t := int64(st.(float64))
|
||||
startTime = &t
|
||||
}
|
||||
|
||||
job, err := natsAPI.JobRepository.Find(&jobID, &cluster, startTime)
|
||||
if err != nil {
|
||||
t.Fatalf("expected to find job, but got error: %v", err)
|
||||
}
|
||||
|
||||
tt.validateJob(t, job)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestNatsHandleNodeState(t *testing.T) {
|
||||
natsAPI := setupNatsTest(t)
|
||||
t.Cleanup(cleanupNatsTest)
|
||||
|
||||
tests := []struct {
|
||||
name string
|
||||
data []byte
|
||||
expectError bool
|
||||
validateFn func(t *testing.T)
|
||||
}{
|
||||
{
|
||||
name: "valid node state update",
|
||||
data: []byte(`nodestate event="{\"cluster\":\"testcluster\",\"nodes\":[{\"hostname\":\"host123\",\"states\":[\"allocated\"],\"cpusAllocated\":8,\"memoryAllocated\":16384,\"gpusAllocated\":0,\"jobsRunning\":1}]}" 1234567890000000000`),
|
||||
expectError: false,
|
||||
validateFn: func(t *testing.T) {
|
||||
// In a full test, we would verify the node state was updated in the database
|
||||
// For now, just ensure no error occurred
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "multiple nodes",
|
||||
data: []byte(`nodestate event="{\"cluster\":\"testcluster\",\"nodes\":[{\"hostname\":\"host123\",\"states\":[\"idle\"],\"cpusAllocated\":0,\"memoryAllocated\":0,\"gpusAllocated\":0,\"jobsRunning\":0},{\"hostname\":\"host124\",\"states\":[\"allocated\"],\"cpusAllocated\":4,\"memoryAllocated\":8192,\"gpusAllocated\":1,\"jobsRunning\":1}]}" 1234567890000000000`),
|
||||
expectError: false,
|
||||
},
|
||||
{
|
||||
name: "invalid JSON in event field",
|
||||
data: []byte(`nodestate event="{\"cluster\":\"testcluster\",\"nodes\":\"not an array\"}" 1234567890000000000`),
|
||||
expectError: true,
|
||||
},
|
||||
{
|
||||
name: "empty nodes array",
|
||||
data: []byte(`nodestate event="{\"cluster\":\"testcluster\",\"nodes\":[]}" 1234567890000000000`),
|
||||
expectError: false, // Empty array should not cause error
|
||||
},
|
||||
{
|
||||
name: "invalid line protocol format",
|
||||
data: []byte(`invalid line protocol format`),
|
||||
expectError: true,
|
||||
},
|
||||
{
|
||||
name: "empty data",
|
||||
data: []byte(``),
|
||||
expectError: false, // Should be handled gracefully with warning
|
||||
},
|
||||
}
|
||||
|
||||
for _, tt := range tests {
|
||||
t.Run(tt.name, func(t *testing.T) {
|
||||
natsAPI.handleNodeState("test.subject", tt.data)
|
||||
|
||||
// Allow some time for async operations
|
||||
time.Sleep(50 * time.Millisecond)
|
||||
|
||||
if tt.validateFn != nil {
|
||||
tt.validateFn(t)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestNatsProcessJobEvent(t *testing.T) {
|
||||
natsAPI := setupNatsTest(t)
|
||||
t.Cleanup(cleanupNatsTest)
|
||||
|
||||
msgStartJob, err := lp.NewMessage(
|
||||
"job",
|
||||
map[string]string{"function": "start_job"},
|
||||
nil,
|
||||
map[string]any{
|
||||
"event": `{
|
||||
"jobId": 3001,
|
||||
"user": "testuser",
|
||||
"project": "testproj",
|
||||
"cluster": "testcluster",
|
||||
"partition": "main",
|
||||
"walltime": 3600,
|
||||
"numNodes": 1,
|
||||
"numHwthreads": 8,
|
||||
"numAcc": 0,
|
||||
"shared": "none",
|
||||
"monitoringStatus": 1,
|
||||
"smt": 1,
|
||||
"resources": [
|
||||
{
|
||||
"hostname": "host123",
|
||||
"hwthreads": [0, 1, 2, 3]
|
||||
}
|
||||
],
|
||||
"startTime": 1234567890
|
||||
}`,
|
||||
},
|
||||
time.Now(),
|
||||
)
|
||||
if err != nil {
|
||||
t.Fatalf("failed to create test message: %v", err)
|
||||
}
|
||||
|
||||
msgMissingTag, err := lp.NewMessage(
|
||||
"job",
|
||||
map[string]string{},
|
||||
nil,
|
||||
map[string]any{
|
||||
"event": `{}`,
|
||||
},
|
||||
time.Now(),
|
||||
)
|
||||
if err != nil {
|
||||
t.Fatalf("failed to create test message: %v", err)
|
||||
}
|
||||
|
||||
msgUnknownFunc, err := lp.NewMessage(
|
||||
"job",
|
||||
map[string]string{"function": "unknown_function"},
|
||||
nil,
|
||||
map[string]any{
|
||||
"event": `{}`,
|
||||
},
|
||||
time.Now(),
|
||||
)
|
||||
if err != nil {
|
||||
t.Fatalf("failed to create test message: %v", err)
|
||||
}
|
||||
|
||||
tests := []struct {
|
||||
name string
|
||||
message lp.CCMessage
|
||||
expectError bool
|
||||
}{
|
||||
{
|
||||
name: "start_job function",
|
||||
message: msgStartJob,
|
||||
expectError: false,
|
||||
},
|
||||
{
|
||||
name: "missing function tag",
|
||||
message: msgMissingTag,
|
||||
expectError: true,
|
||||
},
|
||||
{
|
||||
name: "unknown function",
|
||||
message: msgUnknownFunc,
|
||||
expectError: false,
|
||||
},
|
||||
}
|
||||
|
||||
for _, tt := range tests {
|
||||
t.Run(tt.name, func(t *testing.T) {
|
||||
natsAPI.processJobEvent(tt.message)
|
||||
time.Sleep(50 * time.Millisecond)
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestNatsHandleJobEvent(t *testing.T) {
|
||||
natsAPI := setupNatsTest(t)
|
||||
t.Cleanup(cleanupNatsTest)
|
||||
|
||||
tests := []struct {
|
||||
name string
|
||||
data []byte
|
||||
expectError bool
|
||||
}{
|
||||
{
|
||||
name: "valid influx line protocol",
|
||||
data: []byte(`job,function=start_job event="{\"jobId\":4001,\"user\":\"testuser\",\"project\":\"testproj\",\"cluster\":\"testcluster\",\"partition\":\"main\",\"walltime\":3600,\"numNodes\":1,\"numHwthreads\":8,\"numAcc\":0,\"shared\":\"none\",\"monitoringStatus\":1,\"smt\":1,\"resources\":[{\"hostname\":\"host123\",\"hwthreads\":[0,1,2,3]}],\"startTime\":1234567890}" 1234567890000000000`),
|
||||
expectError: false,
|
||||
},
|
||||
{
|
||||
name: "invalid influx line protocol",
|
||||
data: []byte(`invalid line protocol format`),
|
||||
expectError: true,
|
||||
},
|
||||
{
|
||||
name: "empty data",
|
||||
data: []byte(``),
|
||||
expectError: false, // Decoder should handle empty input gracefully
|
||||
},
|
||||
}
|
||||
|
||||
for _, tt := range tests {
|
||||
t.Run(tt.name, func(t *testing.T) {
|
||||
// HandleJobEvent doesn't return errors, it logs them
|
||||
// We're just ensuring it doesn't panic
|
||||
natsAPI.handleJobEvent("test.subject", tt.data)
|
||||
time.Sleep(50 * time.Millisecond)
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestNatsHandleJobEventEdgeCases(t *testing.T) {
|
||||
natsAPI := setupNatsTest(t)
|
||||
t.Cleanup(cleanupNatsTest)
|
||||
|
||||
tests := []struct {
|
||||
name string
|
||||
data []byte
|
||||
expectError bool
|
||||
description string
|
||||
}{
|
||||
{
|
||||
name: "non-event message (metric data)",
|
||||
data: []byte(`job,function=start_job value=123.45 1234567890000000000`),
|
||||
expectError: false,
|
||||
description: "Should skip non-event messages gracefully",
|
||||
},
|
||||
{
|
||||
name: "wrong measurement name",
|
||||
data: []byte(`wrongmeasurement,function=start_job event="{}" 1234567890000000000`),
|
||||
expectError: false,
|
||||
description: "Should warn about unexpected measurement but not fail",
|
||||
},
|
||||
{
|
||||
name: "missing event field",
|
||||
data: []byte(`job,function=start_job other_field="value" 1234567890000000000`),
|
||||
expectError: true,
|
||||
description: "Should error when event field is missing",
|
||||
},
|
||||
{
|
||||
name: "multiple measurements in one message",
|
||||
data: []byte("job,function=start_job event=\"{}\" 1234567890000000000\njob,function=stop_job event=\"{}\" 1234567890000000000"),
|
||||
expectError: false,
|
||||
description: "Should process multiple lines",
|
||||
},
|
||||
{
|
||||
name: "escaped quotes in JSON payload",
|
||||
data: []byte(`job,function=start_job event="{\"jobId\":6001,\"user\":\"test\\\"user\",\"cluster\":\"test\"}" 1234567890000000000`),
|
||||
expectError: true,
|
||||
description: "Should handle escaped quotes (though JSON parsing may fail)",
|
||||
},
|
||||
}
|
||||
|
||||
for _, tt := range tests {
|
||||
t.Run(tt.name, func(t *testing.T) {
|
||||
natsAPI.handleJobEvent("test.subject", tt.data)
|
||||
time.Sleep(50 * time.Millisecond)
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestNatsHandleNodeStateEdgeCases(t *testing.T) {
|
||||
natsAPI := setupNatsTest(t)
|
||||
t.Cleanup(cleanupNatsTest)
|
||||
|
||||
tests := []struct {
|
||||
name string
|
||||
data []byte
|
||||
expectError bool
|
||||
description string
|
||||
}{
|
||||
{
|
||||
name: "missing cluster field in JSON",
|
||||
data: []byte(`nodestate event="{\"nodes\":[]}" 1234567890000000000`),
|
||||
expectError: true,
|
||||
description: "Should fail when cluster is missing",
|
||||
},
|
||||
{
|
||||
name: "malformed JSON with unescaped quotes",
|
||||
data: []byte(`nodestate event="{\"cluster\":\"test"cluster\",\"nodes\":[]}" 1234567890000000000`),
|
||||
expectError: true,
|
||||
description: "Should fail on malformed JSON",
|
||||
},
|
||||
{
|
||||
name: "unicode characters in hostname",
|
||||
data: []byte(`nodestate event="{\"cluster\":\"testcluster\",\"nodes\":[{\"hostname\":\"host-ñ123\",\"states\":[\"idle\"],\"cpusAllocated\":0,\"memoryAllocated\":0,\"gpusAllocated\":0,\"jobsRunning\":0}]}" 1234567890000000000`),
|
||||
expectError: false,
|
||||
description: "Should handle unicode characters",
|
||||
},
|
||||
{
|
||||
name: "very large node count",
|
||||
data: []byte(`nodestate event="{\"cluster\":\"testcluster\",\"nodes\":[{\"hostname\":\"node1\",\"states\":[\"idle\"],\"cpusAllocated\":0,\"memoryAllocated\":0,\"gpusAllocated\":0,\"jobsRunning\":0},{\"hostname\":\"node2\",\"states\":[\"idle\"],\"cpusAllocated\":0,\"memoryAllocated\":0,\"gpusAllocated\":0,\"jobsRunning\":0},{\"hostname\":\"node3\",\"states\":[\"idle\"],\"cpusAllocated\":0,\"memoryAllocated\":0,\"gpusAllocated\":0,\"jobsRunning\":0}]}" 1234567890000000000`),
|
||||
expectError: false,
|
||||
description: "Should handle multiple nodes efficiently",
|
||||
},
|
||||
{
|
||||
name: "timestamp in past",
|
||||
data: []byte(`nodestate event="{\"cluster\":\"testcluster\",\"nodes\":[]}" 1000000000000000000`),
|
||||
expectError: false,
|
||||
description: "Should accept any valid timestamp",
|
||||
},
|
||||
}
|
||||
|
||||
for _, tt := range tests {
|
||||
t.Run(tt.name, func(t *testing.T) {
|
||||
natsAPI.handleNodeState("test.subject", tt.data)
|
||||
time.Sleep(50 * time.Millisecond)
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestNatsHandleStartJobDuplicatePrevention(t *testing.T) {
|
||||
natsAPI := setupNatsTest(t)
|
||||
t.Cleanup(cleanupNatsTest)
|
||||
|
||||
// Start a job
|
||||
payload := `{
|
||||
"jobId": 5001,
|
||||
"user": "testuser",
|
||||
"project": "testproj",
|
||||
"cluster": "testcluster",
|
||||
"partition": "main",
|
||||
"walltime": 3600,
|
||||
"numNodes": 1,
|
||||
"numHwthreads": 8,
|
||||
"numAcc": 0,
|
||||
"shared": "none",
|
||||
"monitoringStatus": 1,
|
||||
"smt": 1,
|
||||
"resources": [
|
||||
{
|
||||
"hostname": "host123",
|
||||
"hwthreads": [0, 1, 2, 3]
|
||||
}
|
||||
],
|
||||
"startTime": 1234567890
|
||||
}`
|
||||
|
||||
natsAPI.handleStartJob(payload)
|
||||
natsAPI.JobRepository.SyncJobs()
|
||||
time.Sleep(100 * time.Millisecond)
|
||||
|
||||
// Try to start the same job again (within 24 hours)
|
||||
duplicatePayload := `{
|
||||
"jobId": 5001,
|
||||
"user": "testuser",
|
||||
"project": "testproj",
|
||||
"cluster": "testcluster",
|
||||
"partition": "main",
|
||||
"walltime": 3600,
|
||||
"numNodes": 1,
|
||||
"numHwthreads": 8,
|
||||
"numAcc": 0,
|
||||
"shared": "none",
|
||||
"monitoringStatus": 1,
|
||||
"smt": 1,
|
||||
"resources": [
|
||||
{
|
||||
"hostname": "host123",
|
||||
"hwthreads": [0, 1, 2, 3]
|
||||
}
|
||||
],
|
||||
"startTime": 1234567900
|
||||
}`
|
||||
|
||||
natsAPI.handleStartJob(duplicatePayload)
|
||||
natsAPI.JobRepository.SyncJobs()
|
||||
time.Sleep(100 * time.Millisecond)
|
||||
|
||||
// Verify only one job exists
|
||||
jobID := int64(5001)
|
||||
cluster := "testcluster"
|
||||
jobs, err := natsAPI.JobRepository.FindAll(&jobID, &cluster, nil)
|
||||
if err != nil && err != sql.ErrNoRows {
|
||||
t.Fatalf("unexpected error: %v", err)
|
||||
}
|
||||
|
||||
if len(jobs) != 1 {
|
||||
t.Errorf("expected 1 job, got %d", len(jobs))
|
||||
}
|
||||
}
|
||||
@@ -7,12 +7,17 @@ package api
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"maps"
|
||||
"net/http"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"github.com/ClusterCockpit/cc-backend/internal/metricdispatch"
|
||||
"github.com/ClusterCockpit/cc-backend/internal/repository"
|
||||
"github.com/ClusterCockpit/cc-lib/schema"
|
||||
"github.com/ClusterCockpit/cc-backend/pkg/archive"
|
||||
"github.com/ClusterCockpit/cc-backend/pkg/metricstore"
|
||||
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
|
||||
"github.com/ClusterCockpit/cc-lib/v2/schema"
|
||||
)
|
||||
|
||||
type UpdateNodeStatesRequest struct {
|
||||
@@ -20,6 +25,15 @@ type UpdateNodeStatesRequest struct {
|
||||
Cluster string `json:"cluster" example:"fritz"`
|
||||
}
|
||||
|
||||
// metricListToNames converts a map of metric configurations to a list of metric names
|
||||
func metricListToNames(metricList map[string]*schema.Metric) []string {
|
||||
names := make([]string, 0, len(metricList))
|
||||
for name := range metricList {
|
||||
names = append(names, name)
|
||||
}
|
||||
return names
|
||||
}
|
||||
|
||||
// this routine assumes that only one of them exists per node
|
||||
func determineState(states []string) schema.SchedulerState {
|
||||
for _, state := range states {
|
||||
@@ -47,7 +61,7 @@ func determineState(states []string) schema.SchedulerState {
|
||||
// @description Required query-parameter defines if all users or only users with additional special roles are returned.
|
||||
// @produce json
|
||||
// @param request body UpdateNodeStatesRequest true "Request body containing nodes and their states"
|
||||
// @success 200 {object} api.DefaultApiResponse "Success message"
|
||||
// @success 200 {object} api.DefaultAPIResponse "Success message"
|
||||
// @failure 400 {object} api.ErrorResponse "Bad Request"
|
||||
// @failure 401 {object} api.ErrorResponse "Unauthorized"
|
||||
// @failure 403 {object} api.ErrorResponse "Forbidden"
|
||||
@@ -62,19 +76,70 @@ func (api *RestAPI) updateNodeStates(rw http.ResponseWriter, r *http.Request) {
|
||||
http.StatusBadRequest, rw)
|
||||
return
|
||||
}
|
||||
requestReceived := time.Now().Unix()
|
||||
repo := repository.GetNodeRepository()
|
||||
|
||||
m := make(map[string][]string)
|
||||
metricNames := make(map[string][]string)
|
||||
healthResults := make(map[string]metricstore.HealthCheckResult)
|
||||
|
||||
startMs := time.Now()
|
||||
|
||||
// Step 1: Build nodeList and metricList per subcluster
|
||||
for _, node := range req.Nodes {
|
||||
if sc, err := archive.GetSubClusterByNode(req.Cluster, node.Hostname); err == nil {
|
||||
m[sc] = append(m[sc], node.Hostname)
|
||||
}
|
||||
}
|
||||
|
||||
for sc := range m {
|
||||
if sc != "" {
|
||||
metricList := archive.GetMetricConfigSubCluster(req.Cluster, sc)
|
||||
metricNames[sc] = metricListToNames(metricList)
|
||||
}
|
||||
}
|
||||
|
||||
// Step 2: Determine which metric store to query and perform health check
|
||||
healthRepo, err := metricdispatch.GetHealthCheckRepo(req.Cluster)
|
||||
if err != nil {
|
||||
cclog.Warnf("updateNodeStates: no metric store for cluster %s, skipping health check: %v", req.Cluster, err)
|
||||
} else {
|
||||
for sc, nl := range m {
|
||||
if sc != "" {
|
||||
if results, err := healthRepo.HealthCheck(req.Cluster, nl, metricNames[sc]); err == nil {
|
||||
maps.Copy(healthResults, results)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
cclog.Debugf("Timer updateNodeStates, MemStore HealthCheck: %s", time.Since(startMs))
|
||||
startDB := time.Now()
|
||||
|
||||
for _, node := range req.Nodes {
|
||||
state := determineState(node.States)
|
||||
healthState := schema.MonitoringStateFailed
|
||||
var healthMetrics string
|
||||
if result, ok := healthResults[node.Hostname]; ok {
|
||||
healthState = result.State
|
||||
healthMetrics = result.HealthMetrics
|
||||
}
|
||||
nodeState := schema.NodeStateDB{
|
||||
TimeStamp: time.Now().Unix(), NodeState: state,
|
||||
TimeStamp: requestReceived,
|
||||
NodeState: state,
|
||||
CpusAllocated: node.CpusAllocated,
|
||||
MemoryAllocated: node.MemoryAllocated,
|
||||
GpusAllocated: node.GpusAllocated,
|
||||
HealthState: schema.MonitoringStateFull,
|
||||
HealthState: healthState,
|
||||
HealthMetrics: healthMetrics,
|
||||
JobsRunning: node.JobsRunning,
|
||||
}
|
||||
|
||||
repo.UpdateNodeState(node.Hostname, req.Cluster, &nodeState)
|
||||
if err := repo.UpdateNodeState(node.Hostname, req.Cluster, &nodeState); err != nil {
|
||||
cclog.Errorf("updateNodeStates: updating node state for %s on %s failed: %v",
|
||||
node.Hostname, req.Cluster, err)
|
||||
}
|
||||
}
|
||||
|
||||
cclog.Debugf("Timer updateNodeStates, SQLite Inserts: %s", time.Since(startDB))
|
||||
}
|
||||
|
||||
@@ -22,10 +22,11 @@ import (
|
||||
"github.com/ClusterCockpit/cc-backend/internal/auth"
|
||||
"github.com/ClusterCockpit/cc-backend/internal/config"
|
||||
"github.com/ClusterCockpit/cc-backend/internal/repository"
|
||||
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
|
||||
"github.com/ClusterCockpit/cc-lib/schema"
|
||||
"github.com/ClusterCockpit/cc-lib/util"
|
||||
"github.com/gorilla/mux"
|
||||
"github.com/ClusterCockpit/cc-backend/internal/tagger"
|
||||
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
|
||||
"github.com/ClusterCockpit/cc-lib/v2/schema"
|
||||
"github.com/ClusterCockpit/cc-lib/v2/util"
|
||||
"github.com/go-chi/chi/v5"
|
||||
)
|
||||
|
||||
// @title ClusterCockpit REST API
|
||||
@@ -48,6 +49,7 @@ import (
|
||||
const (
|
||||
noticeFilePath = "./var/notice.txt"
|
||||
noticeFilePerms = 0o644
|
||||
maxNoticeLength = 10000 // Maximum allowed notice content length in characters
|
||||
)
|
||||
|
||||
type RestAPI struct {
|
||||
@@ -61,6 +63,7 @@ type RestAPI struct {
|
||||
RepositoryMutex sync.Mutex
|
||||
}
|
||||
|
||||
// New creates and initializes a new RestAPI instance with configured dependencies.
|
||||
func New() *RestAPI {
|
||||
return &RestAPI{
|
||||
JobRepository: repository.GetJobRepository(),
|
||||
@@ -69,79 +72,100 @@ func New() *RestAPI {
|
||||
}
|
||||
}
|
||||
|
||||
func (api *RestAPI) MountAPIRoutes(r *mux.Router) {
|
||||
r.StrictSlash(true)
|
||||
// MountAPIRoutes registers REST API endpoints for job and cluster management.
|
||||
// These routes use JWT token authentication via the X-Auth-Token header.
|
||||
func (api *RestAPI) MountAPIRoutes(r chi.Router) {
|
||||
// REST API Uses TokenAuth
|
||||
// User List
|
||||
r.HandleFunc("/users/", api.getUsers).Methods(http.MethodGet)
|
||||
r.Get("/users/", api.getUsers)
|
||||
// Cluster List
|
||||
r.HandleFunc("/clusters/", api.getClusters).Methods(http.MethodGet)
|
||||
r.Get("/clusters/", api.getClusters)
|
||||
// Slurm node state
|
||||
r.HandleFunc("/nodestate/", api.updateNodeStates).Methods(http.MethodPost, http.MethodPut)
|
||||
r.Post("/nodestate/", api.updateNodeStates)
|
||||
r.Put("/nodestate/", api.updateNodeStates)
|
||||
// Job Handler
|
||||
r.HandleFunc("/jobs/start_job/", api.startJob).Methods(http.MethodPost, http.MethodPut)
|
||||
r.HandleFunc("/jobs/stop_job/", api.stopJobByRequest).Methods(http.MethodPost, http.MethodPut)
|
||||
r.HandleFunc("/jobs/", api.getJobs).Methods(http.MethodGet)
|
||||
r.HandleFunc("/jobs/{id}", api.getJobByID).Methods(http.MethodPost)
|
||||
r.HandleFunc("/jobs/{id}", api.getCompleteJobByID).Methods(http.MethodGet)
|
||||
r.HandleFunc("/jobs/tag_job/{id}", api.tagJob).Methods(http.MethodPost, http.MethodPatch)
|
||||
r.HandleFunc("/jobs/tag_job/{id}", api.removeTagJob).Methods(http.MethodDelete)
|
||||
r.HandleFunc("/jobs/edit_meta/{id}", api.editMeta).Methods(http.MethodPost, http.MethodPatch)
|
||||
r.HandleFunc("/jobs/metrics/{id}", api.getJobMetrics).Methods(http.MethodGet)
|
||||
r.HandleFunc("/jobs/delete_job/", api.deleteJobByRequest).Methods(http.MethodDelete)
|
||||
r.HandleFunc("/jobs/delete_job/{id}", api.deleteJobByID).Methods(http.MethodDelete)
|
||||
r.HandleFunc("/jobs/delete_job_before/{ts}", api.deleteJobBefore).Methods(http.MethodDelete)
|
||||
if config.Keys.APISubjects == nil {
|
||||
cclog.Info("Enabling REST start/stop job API")
|
||||
r.Post("/jobs/start_job/", api.startJob)
|
||||
r.Put("/jobs/start_job/", api.startJob)
|
||||
r.Post("/jobs/stop_job/", api.stopJobByRequest)
|
||||
r.Put("/jobs/stop_job/", api.stopJobByRequest)
|
||||
}
|
||||
r.Get("/jobs/", api.getJobs)
|
||||
r.Get("/jobs/used_nodes", api.getUsedNodes)
|
||||
r.Post("/jobs/tag_job/{id}", api.tagJob)
|
||||
r.Patch("/jobs/tag_job/{id}", api.tagJob)
|
||||
r.Delete("/jobs/tag_job/{id}", api.removeTagJob)
|
||||
r.Patch("/jobs/edit_meta/{id}", api.editMeta)
|
||||
r.Patch("/jobs/edit_meta/", api.editMetaByRequest)
|
||||
r.Get("/jobs/metrics/{id}", api.getJobMetrics)
|
||||
r.Delete("/jobs/delete_job/", api.deleteJobByRequest)
|
||||
r.Delete("/jobs/delete_job/{id}", api.deleteJobByID)
|
||||
r.Delete("/jobs/delete_job_before/{ts}", api.deleteJobBefore)
|
||||
r.Post("/jobs/{id}", api.getJobByID)
|
||||
r.Get("/jobs/{id}", api.getCompleteJobByID)
|
||||
|
||||
r.HandleFunc("/tags/", api.removeTags).Methods(http.MethodDelete)
|
||||
r.Delete("/tags/", api.removeTags)
|
||||
|
||||
if api.MachineStateDir != "" {
|
||||
r.HandleFunc("/machine_state/{cluster}/{host}", api.getMachineState).Methods(http.MethodGet)
|
||||
r.HandleFunc("/machine_state/{cluster}/{host}", api.putMachineState).Methods(http.MethodPut, http.MethodPost)
|
||||
r.Get("/machine_state/{cluster}/{host}", api.getMachineState)
|
||||
r.Put("/machine_state/{cluster}/{host}", api.putMachineState)
|
||||
r.Post("/machine_state/{cluster}/{host}", api.putMachineState)
|
||||
}
|
||||
}
|
||||
|
||||
func (api *RestAPI) MountUserAPIRoutes(r *mux.Router) {
|
||||
r.StrictSlash(true)
|
||||
// MountUserAPIRoutes registers user-accessible REST API endpoints.
|
||||
// These are limited endpoints for regular users with JWT token authentication.
|
||||
func (api *RestAPI) MountUserAPIRoutes(r chi.Router) {
|
||||
// REST API Uses TokenAuth
|
||||
r.HandleFunc("/jobs/", api.getJobs).Methods(http.MethodGet)
|
||||
r.HandleFunc("/jobs/{id}", api.getJobByID).Methods(http.MethodPost)
|
||||
r.HandleFunc("/jobs/{id}", api.getCompleteJobByID).Methods(http.MethodGet)
|
||||
r.HandleFunc("/jobs/metrics/{id}", api.getJobMetrics).Methods(http.MethodGet)
|
||||
r.Get("/jobs/", api.getJobs)
|
||||
r.Post("/jobs/{id}", api.getJobByID)
|
||||
r.Get("/jobs/{id}", api.getCompleteJobByID)
|
||||
r.Get("/jobs/metrics/{id}", api.getJobMetrics)
|
||||
}
|
||||
|
||||
func (api *RestAPI) MountMetricStoreAPIRoutes(r *mux.Router) {
|
||||
// MountMetricStoreAPIRoutes registers metric storage API endpoints.
|
||||
// These endpoints handle metric data ingestion and health checks with JWT token authentication.
|
||||
func (api *RestAPI) MountMetricStoreAPIRoutes(r chi.Router) {
|
||||
// REST API Uses TokenAuth
|
||||
// Note: StrictSlash handles trailing slash variations automatically
|
||||
r.HandleFunc("/api/free", freeMetrics).Methods(http.MethodPost)
|
||||
r.HandleFunc("/api/write", writeMetrics).Methods(http.MethodPost)
|
||||
r.HandleFunc("/api/debug", debugMetrics).Methods(http.MethodGet)
|
||||
r.HandleFunc("/api/healthcheck", metricsHealth).Methods(http.MethodGet)
|
||||
r.Post("/free", freeMetrics)
|
||||
r.Post("/write", writeMetrics)
|
||||
r.Get("/debug", debugMetrics)
|
||||
r.Post("/healthcheck", api.updateNodeStates)
|
||||
// Same endpoints but with trailing slash
|
||||
r.HandleFunc("/api/free/", freeMetrics).Methods(http.MethodPost)
|
||||
r.HandleFunc("/api/write/", writeMetrics).Methods(http.MethodPost)
|
||||
r.HandleFunc("/api/debug/", debugMetrics).Methods(http.MethodGet)
|
||||
r.HandleFunc("/api/healthcheck/", metricsHealth).Methods(http.MethodGet)
|
||||
r.Post("/free/", freeMetrics)
|
||||
r.Post("/write/", writeMetrics)
|
||||
r.Get("/debug/", debugMetrics)
|
||||
r.Post("/healthcheck/", api.updateNodeStates)
|
||||
}
|
||||
|
||||
func (api *RestAPI) MountConfigAPIRoutes(r *mux.Router) {
|
||||
r.StrictSlash(true)
|
||||
// MountConfigAPIRoutes registers configuration and user management endpoints.
|
||||
// These routes use session-based authentication and require admin privileges.
|
||||
// Routes use full paths (including /config prefix) to avoid conflicting with
|
||||
// the /config page route when registered via Group instead of Route.
|
||||
func (api *RestAPI) MountConfigAPIRoutes(r chi.Router) {
|
||||
// Settings Frontend Uses SessionAuth
|
||||
if api.Authentication != nil {
|
||||
r.HandleFunc("/roles/", api.getRoles).Methods(http.MethodGet)
|
||||
r.HandleFunc("/users/", api.createUser).Methods(http.MethodPost, http.MethodPut)
|
||||
r.HandleFunc("/users/", api.getUsers).Methods(http.MethodGet)
|
||||
r.HandleFunc("/users/", api.deleteUser).Methods(http.MethodDelete)
|
||||
r.HandleFunc("/user/{id}", api.updateUser).Methods(http.MethodPost)
|
||||
r.HandleFunc("/notice/", api.editNotice).Methods(http.MethodPost)
|
||||
r.Get("/config/roles/", api.getRoles)
|
||||
r.Post("/config/users/", api.createUser)
|
||||
r.Put("/config/users/", api.createUser)
|
||||
r.Get("/config/users/", api.getUsers)
|
||||
r.Delete("/config/users/", api.deleteUser)
|
||||
r.Post("/config/user/{id}", api.updateUser)
|
||||
r.Post("/config/notice/", api.editNotice)
|
||||
r.Get("/config/taggers/", api.getTaggers)
|
||||
r.Post("/config/taggers/run/", api.runTagger)
|
||||
}
|
||||
}
|
||||
|
||||
func (api *RestAPI) MountFrontendAPIRoutes(r *mux.Router) {
|
||||
r.StrictSlash(true)
|
||||
// MountFrontendAPIRoutes registers frontend-specific API endpoints.
|
||||
// These routes support JWT generation and user configuration updates with session authentication.
|
||||
func (api *RestAPI) MountFrontendAPIRoutes(r chi.Router) {
|
||||
r.Get("/logs/", api.getJournalLog)
|
||||
// Settings Frontend Uses SessionAuth
|
||||
if api.Authentication != nil {
|
||||
r.HandleFunc("/jwt/", api.getJWT).Methods(http.MethodGet)
|
||||
r.HandleFunc("/configuration/", api.updateConfiguration).Methods(http.MethodPost)
|
||||
r.Get("/jwt/", api.getJWT)
|
||||
r.Post("/configuration/", api.updateConfiguration)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -157,6 +181,8 @@ type DefaultAPIResponse struct {
|
||||
Message string `json:"msg"`
|
||||
}
|
||||
|
||||
// handleError writes a standardized JSON error response with the given status code.
|
||||
// It logs the error at WARN level and ensures proper Content-Type headers are set.
|
||||
func handleError(err error, statusCode int, rw http.ResponseWriter) {
|
||||
cclog.Warnf("REST ERROR : %s", err.Error())
|
||||
rw.Header().Add("Content-Type", "application/json")
|
||||
@@ -169,15 +195,38 @@ func handleError(err error, statusCode int, rw http.ResponseWriter) {
|
||||
}
|
||||
}
|
||||
|
||||
// decode reads JSON from r into val with strict validation that rejects unknown fields.
|
||||
func decode(r io.Reader, val any) error {
|
||||
dec := json.NewDecoder(r)
|
||||
dec.DisallowUnknownFields()
|
||||
return dec.Decode(val)
|
||||
}
|
||||
|
||||
func (api *RestAPI) editNotice(rw http.ResponseWriter, r *http.Request) {
|
||||
// SecuredCheck() only worked with TokenAuth: Removed
|
||||
// validatePathComponent checks if a path component contains potentially malicious patterns
|
||||
// that could be used for path traversal attacks. Returns an error if validation fails.
|
||||
func validatePathComponent(component, componentName string) error {
|
||||
if strings.Contains(component, "..") ||
|
||||
strings.Contains(component, "/") ||
|
||||
strings.Contains(component, "\\") {
|
||||
return fmt.Errorf("invalid %s", componentName)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// editNotice godoc
|
||||
// @summary Update system notice
|
||||
// @tags Config
|
||||
// @description Updates the notice.txt file content. Only admins are allowed. Content is limited to 10000 characters.
|
||||
// @accept mpfd
|
||||
// @produce plain
|
||||
// @param new-content formData string true "New notice content (max 10000 characters)"
|
||||
// @success 200 {string} string "Update Notice Content Success"
|
||||
// @failure 400 {object} ErrorResponse "Bad Request"
|
||||
// @failure 403 {object} ErrorResponse "Forbidden"
|
||||
// @failure 500 {object} ErrorResponse "Internal Server Error"
|
||||
// @security ApiKeyAuth
|
||||
// @router /notice/ [post]
|
||||
func (api *RestAPI) editNotice(rw http.ResponseWriter, r *http.Request) {
|
||||
if user := repository.GetUserFromContext(r.Context()); !user.HasRole(schema.RoleAdmin) {
|
||||
handleError(fmt.Errorf("only admins are allowed to update the notice.txt file"), http.StatusForbidden, rw)
|
||||
return
|
||||
@@ -186,9 +235,8 @@ func (api *RestAPI) editNotice(rw http.ResponseWriter, r *http.Request) {
|
||||
// Get Value
|
||||
newContent := r.FormValue("new-content")
|
||||
|
||||
// Validate content length to prevent DoS
|
||||
if len(newContent) > 10000 {
|
||||
handleError(fmt.Errorf("notice content exceeds maximum length of 10000 characters"), http.StatusBadRequest, rw)
|
||||
if len(newContent) > maxNoticeLength {
|
||||
handleError(fmt.Errorf("notice content exceeds maximum length of %d characters", maxNoticeLength), http.StatusBadRequest, rw)
|
||||
return
|
||||
}
|
||||
|
||||
@@ -200,7 +248,9 @@ func (api *RestAPI) editNotice(rw http.ResponseWriter, r *http.Request) {
|
||||
handleError(fmt.Errorf("creating notice file failed: %w", err), http.StatusInternalServerError, rw)
|
||||
return
|
||||
}
|
||||
ntxt.Close()
|
||||
if err := ntxt.Close(); err != nil {
|
||||
cclog.Warnf("Failed to close notice file: %v", err)
|
||||
}
|
||||
}
|
||||
|
||||
if err := os.WriteFile(noticeFilePath, []byte(newContent), noticeFilePerms); err != nil {
|
||||
@@ -210,13 +260,66 @@ func (api *RestAPI) editNotice(rw http.ResponseWriter, r *http.Request) {
|
||||
|
||||
rw.Header().Set("Content-Type", "text/plain")
|
||||
rw.WriteHeader(http.StatusOK)
|
||||
var msg []byte
|
||||
if newContent != "" {
|
||||
rw.Write([]byte("Update Notice Content Success"))
|
||||
msg = []byte("Update Notice Content Success")
|
||||
} else {
|
||||
rw.Write([]byte("Empty Notice Content Success"))
|
||||
msg = []byte("Empty Notice Content Success")
|
||||
}
|
||||
if _, err := rw.Write(msg); err != nil {
|
||||
cclog.Errorf("Failed to write response: %v", err)
|
||||
}
|
||||
}
|
||||
|
||||
func (api *RestAPI) getTaggers(rw http.ResponseWriter, r *http.Request) {
|
||||
if user := repository.GetUserFromContext(r.Context()); !user.HasRole(schema.RoleAdmin) {
|
||||
handleError(fmt.Errorf("only admins are allowed to list taggers"), http.StatusForbidden, rw)
|
||||
return
|
||||
}
|
||||
|
||||
rw.Header().Set("Content-Type", "application/json")
|
||||
if err := json.NewEncoder(rw).Encode(tagger.ListTaggers()); err != nil {
|
||||
cclog.Errorf("Failed to encode tagger list: %v", err)
|
||||
}
|
||||
}
|
||||
|
||||
func (api *RestAPI) runTagger(rw http.ResponseWriter, r *http.Request) {
|
||||
if user := repository.GetUserFromContext(r.Context()); !user.HasRole(schema.RoleAdmin) {
|
||||
handleError(fmt.Errorf("only admins are allowed to run taggers"), http.StatusForbidden, rw)
|
||||
return
|
||||
}
|
||||
|
||||
name := r.FormValue("name")
|
||||
if name == "" {
|
||||
handleError(fmt.Errorf("missing required parameter: name"), http.StatusBadRequest, rw)
|
||||
return
|
||||
}
|
||||
|
||||
if err := tagger.RunTaggerByName(name); err != nil {
|
||||
handleError(err, http.StatusConflict, rw)
|
||||
return
|
||||
}
|
||||
|
||||
rw.Header().Set("Content-Type", "text/plain")
|
||||
rw.WriteHeader(http.StatusOK)
|
||||
if _, err := rw.Write(fmt.Appendf(nil, "Tagger %s started", name)); err != nil {
|
||||
cclog.Errorf("Failed to write response: %v", err)
|
||||
}
|
||||
}
|
||||
|
||||
// getJWT godoc
|
||||
// @summary Generate JWT token
|
||||
// @tags Frontend
|
||||
// @description Generates a JWT token for a user. Admins can generate tokens for any user, regular users only for themselves.
|
||||
// @accept mpfd
|
||||
// @produce plain
|
||||
// @param username formData string true "Username to generate JWT for"
|
||||
// @success 200 {string} string "JWT token"
|
||||
// @failure 403 {object} ErrorResponse "Forbidden"
|
||||
// @failure 404 {object} ErrorResponse "User Not Found"
|
||||
// @failure 500 {object} ErrorResponse "Internal Server Error"
|
||||
// @security ApiKeyAuth
|
||||
// @router /jwt/ [get]
|
||||
func (api *RestAPI) getJWT(rw http.ResponseWriter, r *http.Request) {
|
||||
rw.Header().Set("Content-Type", "text/plain")
|
||||
username := r.FormValue("username")
|
||||
@@ -241,12 +344,22 @@ func (api *RestAPI) getJWT(rw http.ResponseWriter, r *http.Request) {
|
||||
}
|
||||
|
||||
rw.WriteHeader(http.StatusOK)
|
||||
rw.Write([]byte(jwt))
|
||||
if _, err := rw.Write([]byte(jwt)); err != nil {
|
||||
cclog.Errorf("Failed to write JWT response: %v", err)
|
||||
}
|
||||
}
|
||||
|
||||
// getRoles godoc
|
||||
// @summary Get available roles
|
||||
// @tags Config
|
||||
// @description Returns a list of valid user roles. Only admins are allowed.
|
||||
// @produce json
|
||||
// @success 200 {array} string "List of role names"
|
||||
// @failure 403 {object} ErrorResponse "Forbidden"
|
||||
// @failure 500 {object} ErrorResponse "Internal Server Error"
|
||||
// @security ApiKeyAuth
|
||||
// @router /roles/ [get]
|
||||
func (api *RestAPI) getRoles(rw http.ResponseWriter, r *http.Request) {
|
||||
// SecuredCheck() only worked with TokenAuth: Removed
|
||||
|
||||
user := repository.GetUserFromContext(r.Context())
|
||||
if !user.HasRole(schema.RoleAdmin) {
|
||||
handleError(fmt.Errorf("only admins are allowed to fetch a list of roles"), http.StatusForbidden, rw)
|
||||
@@ -265,6 +378,18 @@ func (api *RestAPI) getRoles(rw http.ResponseWriter, r *http.Request) {
|
||||
}
|
||||
}
|
||||
|
||||
// updateConfiguration godoc
|
||||
// @summary Update user configuration
|
||||
// @tags Frontend
|
||||
// @description Updates a user's configuration key-value pair.
|
||||
// @accept mpfd
|
||||
// @produce plain
|
||||
// @param key formData string true "Configuration key"
|
||||
// @param value formData string true "Configuration value"
|
||||
// @success 200 {string} string "success"
|
||||
// @failure 500 {object} ErrorResponse "Internal Server Error"
|
||||
// @security ApiKeyAuth
|
||||
// @router /configuration/ [post]
|
||||
func (api *RestAPI) updateConfiguration(rw http.ResponseWriter, r *http.Request) {
|
||||
rw.Header().Set("Content-Type", "text/plain")
|
||||
key, value := r.FormValue("key"), r.FormValue("value")
|
||||
@@ -275,26 +400,40 @@ func (api *RestAPI) updateConfiguration(rw http.ResponseWriter, r *http.Request)
|
||||
}
|
||||
|
||||
rw.WriteHeader(http.StatusOK)
|
||||
rw.Write([]byte("success"))
|
||||
if _, err := rw.Write([]byte("success")); err != nil {
|
||||
cclog.Errorf("Failed to write response: %v", err)
|
||||
}
|
||||
}
|
||||
|
||||
// putMachineState godoc
|
||||
// @summary Store machine state
|
||||
// @tags Machine State
|
||||
// @description Stores machine state data for a specific cluster node. Validates cluster and host names to prevent path traversal.
|
||||
// @accept json
|
||||
// @produce plain
|
||||
// @param cluster path string true "Cluster name"
|
||||
// @param host path string true "Host name"
|
||||
// @success 201 "Created"
|
||||
// @failure 400 {object} ErrorResponse "Bad Request"
|
||||
// @failure 404 {object} ErrorResponse "Machine state not enabled"
|
||||
// @failure 500 {object} ErrorResponse "Internal Server Error"
|
||||
// @security ApiKeyAuth
|
||||
// @router /machine_state/{cluster}/{host} [put]
|
||||
func (api *RestAPI) putMachineState(rw http.ResponseWriter, r *http.Request) {
|
||||
if api.MachineStateDir == "" {
|
||||
handleError(fmt.Errorf("machine state not enabled"), http.StatusNotFound, rw)
|
||||
return
|
||||
}
|
||||
|
||||
vars := mux.Vars(r)
|
||||
cluster := vars["cluster"]
|
||||
host := vars["host"]
|
||||
cluster := chi.URLParam(r, "cluster")
|
||||
host := chi.URLParam(r, "host")
|
||||
|
||||
// Validate cluster and host to prevent path traversal attacks
|
||||
if strings.Contains(cluster, "..") || strings.Contains(cluster, "/") || strings.Contains(cluster, "\\") {
|
||||
handleError(fmt.Errorf("invalid cluster name"), http.StatusBadRequest, rw)
|
||||
if err := validatePathComponent(cluster, "cluster name"); err != nil {
|
||||
handleError(err, http.StatusBadRequest, rw)
|
||||
return
|
||||
}
|
||||
if strings.Contains(host, "..") || strings.Contains(host, "/") || strings.Contains(host, "\\") {
|
||||
handleError(fmt.Errorf("invalid host name"), http.StatusBadRequest, rw)
|
||||
if err := validatePathComponent(host, "host name"); err != nil {
|
||||
handleError(err, http.StatusBadRequest, rw)
|
||||
return
|
||||
}
|
||||
|
||||
@@ -320,23 +459,33 @@ func (api *RestAPI) putMachineState(rw http.ResponseWriter, r *http.Request) {
|
||||
rw.WriteHeader(http.StatusCreated)
|
||||
}
|
||||
|
||||
// getMachineState godoc
|
||||
// @summary Retrieve machine state
|
||||
// @tags Machine State
|
||||
// @description Retrieves stored machine state data for a specific cluster node. Validates cluster and host names to prevent path traversal.
|
||||
// @produce json
|
||||
// @param cluster path string true "Cluster name"
|
||||
// @param host path string true "Host name"
|
||||
// @success 200 {object} object "Machine state JSON data"
|
||||
// @failure 400 {object} ErrorResponse "Bad Request"
|
||||
// @failure 404 {object} ErrorResponse "Machine state not enabled or file not found"
|
||||
// @security ApiKeyAuth
|
||||
// @router /machine_state/{cluster}/{host} [get]
|
||||
func (api *RestAPI) getMachineState(rw http.ResponseWriter, r *http.Request) {
|
||||
if api.MachineStateDir == "" {
|
||||
handleError(fmt.Errorf("machine state not enabled"), http.StatusNotFound, rw)
|
||||
return
|
||||
}
|
||||
|
||||
vars := mux.Vars(r)
|
||||
cluster := vars["cluster"]
|
||||
host := vars["host"]
|
||||
cluster := chi.URLParam(r, "cluster")
|
||||
host := chi.URLParam(r, "host")
|
||||
|
||||
// Validate cluster and host to prevent path traversal attacks
|
||||
if strings.Contains(cluster, "..") || strings.Contains(cluster, "/") || strings.Contains(cluster, "\\") {
|
||||
handleError(fmt.Errorf("invalid cluster name"), http.StatusBadRequest, rw)
|
||||
if err := validatePathComponent(cluster, "cluster name"); err != nil {
|
||||
handleError(err, http.StatusBadRequest, rw)
|
||||
return
|
||||
}
|
||||
if strings.Contains(host, "..") || strings.Contains(host, "/") || strings.Contains(host, "\\") {
|
||||
handleError(fmt.Errorf("invalid host name"), http.StatusBadRequest, rw)
|
||||
if err := validatePathComponent(host, "host name"); err != nil {
|
||||
handleError(err, http.StatusBadRequest, rw)
|
||||
return
|
||||
}
|
||||
|
||||
|
||||
@@ -11,9 +11,9 @@ import (
|
||||
"net/http"
|
||||
|
||||
"github.com/ClusterCockpit/cc-backend/internal/repository"
|
||||
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
|
||||
"github.com/ClusterCockpit/cc-lib/schema"
|
||||
"github.com/gorilla/mux"
|
||||
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
|
||||
"github.com/ClusterCockpit/cc-lib/v2/schema"
|
||||
"github.com/go-chi/chi/v5"
|
||||
)
|
||||
|
||||
type APIReturnedUser struct {
|
||||
@@ -31,7 +31,7 @@ type APIReturnedUser struct {
|
||||
// @description Required query-parameter defines if all users or only users with additional special roles are returned.
|
||||
// @produce json
|
||||
// @param not-just-user query bool true "If returned list should contain all users or only users with additional special roles"
|
||||
// @success 200 {array} api.ApiReturnedUser "List of users returned successfully"
|
||||
// @success 200 {array} api.APIReturnedUser "List of users returned successfully"
|
||||
// @failure 400 {string} string "Bad Request"
|
||||
// @failure 401 {string} string "Unauthorized"
|
||||
// @failure 403 {string} string "Forbidden"
|
||||
@@ -91,7 +91,7 @@ func (api *RestAPI) updateUser(rw http.ResponseWriter, r *http.Request) {
|
||||
|
||||
// Handle role updates
|
||||
if newrole != "" {
|
||||
if err := repository.GetUserRepository().AddRole(r.Context(), mux.Vars(r)["id"], newrole); err != nil {
|
||||
if err := repository.GetUserRepository().AddRole(r.Context(), chi.URLParam(r, "id"), newrole); err != nil {
|
||||
handleError(fmt.Errorf("adding role failed: %w", err), http.StatusUnprocessableEntity, rw)
|
||||
return
|
||||
}
|
||||
@@ -99,7 +99,7 @@ func (api *RestAPI) updateUser(rw http.ResponseWriter, r *http.Request) {
|
||||
cclog.Errorf("Failed to encode response: %v", err)
|
||||
}
|
||||
} else if delrole != "" {
|
||||
if err := repository.GetUserRepository().RemoveRole(r.Context(), mux.Vars(r)["id"], delrole); err != nil {
|
||||
if err := repository.GetUserRepository().RemoveRole(r.Context(), chi.URLParam(r, "id"), delrole); err != nil {
|
||||
handleError(fmt.Errorf("removing role failed: %w", err), http.StatusUnprocessableEntity, rw)
|
||||
return
|
||||
}
|
||||
@@ -107,7 +107,7 @@ func (api *RestAPI) updateUser(rw http.ResponseWriter, r *http.Request) {
|
||||
cclog.Errorf("Failed to encode response: %v", err)
|
||||
}
|
||||
} else if newproj != "" {
|
||||
if err := repository.GetUserRepository().AddProject(r.Context(), mux.Vars(r)["id"], newproj); err != nil {
|
||||
if err := repository.GetUserRepository().AddProject(r.Context(), chi.URLParam(r, "id"), newproj); err != nil {
|
||||
handleError(fmt.Errorf("adding project failed: %w", err), http.StatusUnprocessableEntity, rw)
|
||||
return
|
||||
}
|
||||
@@ -115,7 +115,7 @@ func (api *RestAPI) updateUser(rw http.ResponseWriter, r *http.Request) {
|
||||
cclog.Errorf("Failed to encode response: %v", err)
|
||||
}
|
||||
} else if delproj != "" {
|
||||
if err := repository.GetUserRepository().RemoveProject(r.Context(), mux.Vars(r)["id"], delproj); err != nil {
|
||||
if err := repository.GetUserRepository().RemoveProject(r.Context(), chi.URLParam(r, "id"), delproj); err != nil {
|
||||
handleError(fmt.Errorf("removing project failed: %w", err), http.StatusUnprocessableEntity, rw)
|
||||
return
|
||||
}
|
||||
@@ -164,7 +164,7 @@ func (api *RestAPI) createUser(rw http.ResponseWriter, r *http.Request) {
|
||||
return
|
||||
}
|
||||
|
||||
if len(password) == 0 && role != schema.GetRoleString(schema.RoleApi) {
|
||||
if len(password) == 0 && role != schema.GetRoleString(schema.RoleAPI) {
|
||||
handleError(fmt.Errorf("only API users are allowed to have a blank password (login will be impossible)"), http.StatusBadRequest, rw)
|
||||
return
|
||||
}
|
||||
|
||||
@@ -106,7 +106,7 @@ Data is archived at the highest available resolution (typically 60s intervals).
|
||||
|
||||
```go
|
||||
// In archiver.go ArchiveJob() function
|
||||
jobData, err := metricDataDispatcher.LoadData(job, allMetrics, scopes, ctx, 300)
|
||||
jobData, err := metricdispatch.LoadData(job, allMetrics, scopes, ctx, 300)
|
||||
// 0 = highest resolution
|
||||
// 300 = 5-minute resolution
|
||||
```
|
||||
@@ -170,7 +170,6 @@ All exported functions are safe for concurrent use:
|
||||
- `Start()` - Safe to call once
|
||||
- `TriggerArchiving()` - Safe from multiple goroutines
|
||||
- `Shutdown()` - Safe to call once
|
||||
- `WaitForArchiving()` - Deprecated, but safe
|
||||
|
||||
Internal state is protected by:
|
||||
- Channel synchronization (`archiveChannel`)
|
||||
@@ -185,6 +184,6 @@ Internal state is protected by:
|
||||
## Dependencies
|
||||
|
||||
- `internal/repository`: Database operations for job metadata
|
||||
- `internal/metricDataDispatcher`: Loading metric data from various backends
|
||||
- `internal/metricdispatch`: Loading metric data from various backends
|
||||
- `pkg/archive`: Archive backend abstraction (filesystem, S3, SQLite)
|
||||
- `cc-lib/schema`: Job and metric data structures
|
||||
|
||||
@@ -54,8 +54,8 @@ import (
|
||||
"time"
|
||||
|
||||
"github.com/ClusterCockpit/cc-backend/internal/repository"
|
||||
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
|
||||
"github.com/ClusterCockpit/cc-lib/schema"
|
||||
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
|
||||
"github.com/ClusterCockpit/cc-lib/v2/schema"
|
||||
sq "github.com/Masterminds/squirrel"
|
||||
)
|
||||
|
||||
@@ -126,7 +126,7 @@ func archivingWorker() {
|
||||
// not using meta data, called to load JobMeta into Cache?
|
||||
// will fail if job meta not in repository
|
||||
if _, err := jobRepo.FetchMetadata(job); err != nil {
|
||||
cclog.Errorf("archiving job (dbid: %d) failed at check metadata step: %s", job.ID, err.Error())
|
||||
cclog.Errorf("archiving job (dbid: %d) failed at check metadata step: %s", *job.ID, err.Error())
|
||||
jobRepo.UpdateMonitoringStatus(*job.ID, schema.MonitoringStatusArchivingFailed)
|
||||
archivePending.Done()
|
||||
continue
|
||||
@@ -136,7 +136,7 @@ func archivingWorker() {
|
||||
// Use shutdown context to allow cancellation
|
||||
jobMeta, err := ArchiveJob(job, shutdownCtx)
|
||||
if err != nil {
|
||||
cclog.Errorf("archiving job (dbid: %d) failed at archiving job step: %s", job.ID, err.Error())
|
||||
cclog.Errorf("archiving job (dbid: %d) failed at archiving job step: %s", *job.ID, err.Error())
|
||||
jobRepo.UpdateMonitoringStatus(*job.ID, schema.MonitoringStatusArchivingFailed)
|
||||
archivePending.Done()
|
||||
continue
|
||||
@@ -145,24 +145,24 @@ func archivingWorker() {
|
||||
stmt := sq.Update("job").Where("job.id = ?", job.ID)
|
||||
|
||||
if stmt, err = jobRepo.UpdateFootprint(stmt, jobMeta); err != nil {
|
||||
cclog.Errorf("archiving job (dbid: %d) failed at update Footprint step: %s", job.ID, err.Error())
|
||||
cclog.Errorf("archiving job (dbid: %d) failed at update Footprint step: %s", *job.ID, err.Error())
|
||||
archivePending.Done()
|
||||
continue
|
||||
}
|
||||
if stmt, err = jobRepo.UpdateEnergy(stmt, jobMeta); err != nil {
|
||||
cclog.Errorf("archiving job (dbid: %d) failed at update Energy step: %s", job.ID, err.Error())
|
||||
cclog.Errorf("archiving job (dbid: %d) failed at update Energy step: %s", *job.ID, err.Error())
|
||||
archivePending.Done()
|
||||
continue
|
||||
}
|
||||
// Update the jobs database entry one last time:
|
||||
stmt = jobRepo.MarkArchived(stmt, schema.MonitoringStatusArchivingSuccessful)
|
||||
if err := jobRepo.Execute(stmt); err != nil {
|
||||
cclog.Errorf("archiving job (dbid: %d) failed at db execute: %s", job.ID, err.Error())
|
||||
cclog.Errorf("archiving job (dbid: %d) failed at db execute: %s", *job.ID, err.Error())
|
||||
archivePending.Done()
|
||||
continue
|
||||
}
|
||||
cclog.Debugf("archiving job %d took %s", job.JobID, time.Since(start))
|
||||
cclog.Infof("archiving job (dbid: %d) successful", job.ID)
|
||||
cclog.Infof("archiving job (dbid: %d) successful", *job.ID)
|
||||
|
||||
repository.CallJobStopHooks(job)
|
||||
archivePending.Done()
|
||||
|
||||
@@ -9,11 +9,10 @@ import (
|
||||
"context"
|
||||
"math"
|
||||
|
||||
"github.com/ClusterCockpit/cc-backend/internal/config"
|
||||
"github.com/ClusterCockpit/cc-backend/internal/metricDataDispatcher"
|
||||
"github.com/ClusterCockpit/cc-backend/internal/metricdispatch"
|
||||
"github.com/ClusterCockpit/cc-backend/pkg/archive"
|
||||
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
|
||||
"github.com/ClusterCockpit/cc-lib/schema"
|
||||
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
|
||||
"github.com/ClusterCockpit/cc-lib/v2/schema"
|
||||
)
|
||||
|
||||
// ArchiveJob archives a completed job's metric data to the configured archive backend.
|
||||
@@ -60,7 +59,7 @@ func ArchiveJob(job *schema.Job, ctx context.Context) (*schema.Job, error) {
|
||||
scopes = append(scopes, schema.MetricScopeAccelerator)
|
||||
}
|
||||
|
||||
jobData, err := metricDataDispatcher.LoadData(job, allMetrics, scopes, ctx, 0) // 0 Resulotion-Value retrieves highest res (60s)
|
||||
jobData, err := metricdispatch.LoadData(job, allMetrics, scopes, ctx, 0) // 0 Resulotion-Value retrieves highest res (60s)
|
||||
if err != nil {
|
||||
cclog.Error("Error wile loading job data for archiving")
|
||||
return nil, err
|
||||
@@ -94,12 +93,5 @@ func ArchiveJob(job *schema.Job, ctx context.Context) (*schema.Job, error) {
|
||||
}
|
||||
}
|
||||
|
||||
// If the file based archive is disabled,
|
||||
// only return the JobMeta structure as the
|
||||
// statistics in there are needed.
|
||||
if config.Keys.DisableArchive {
|
||||
return job, nil
|
||||
}
|
||||
|
||||
return job, archive.GetHandle().ImportJob(job, &jobData)
|
||||
}
|
||||
|
||||
@@ -25,9 +25,9 @@ import (
|
||||
|
||||
"github.com/ClusterCockpit/cc-backend/internal/config"
|
||||
"github.com/ClusterCockpit/cc-backend/internal/repository"
|
||||
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
|
||||
"github.com/ClusterCockpit/cc-lib/schema"
|
||||
"github.com/ClusterCockpit/cc-lib/util"
|
||||
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
|
||||
"github.com/ClusterCockpit/cc-lib/v2/schema"
|
||||
"github.com/ClusterCockpit/cc-lib/v2/util"
|
||||
"github.com/gorilla/sessions"
|
||||
)
|
||||
|
||||
@@ -40,7 +40,7 @@ type Authenticator interface {
|
||||
// authenticator should attempt the login. This method should not perform
|
||||
// expensive operations or actual authentication.
|
||||
CanLogin(user *schema.User, username string, rw http.ResponseWriter, r *http.Request) (*schema.User, bool)
|
||||
|
||||
|
||||
// Login performs the actually authentication for the user.
|
||||
// It returns the authenticated user or an error if authentication fails.
|
||||
// The user parameter may be nil if the user doesn't exist in the database yet.
|
||||
@@ -65,13 +65,13 @@ var ipUserLimiters sync.Map
|
||||
func getIPUserLimiter(ip, username string) *rate.Limiter {
|
||||
key := ip + ":" + username
|
||||
now := time.Now()
|
||||
|
||||
|
||||
if entry, ok := ipUserLimiters.Load(key); ok {
|
||||
rle := entry.(*rateLimiterEntry)
|
||||
rle.lastUsed = now
|
||||
return rle.limiter
|
||||
}
|
||||
|
||||
|
||||
// More aggressive rate limiting: 5 attempts per 15 minutes
|
||||
newLimiter := rate.NewLimiter(rate.Every(15*time.Minute/5), 5)
|
||||
ipUserLimiters.Store(key, &rateLimiterEntry{
|
||||
@@ -176,7 +176,7 @@ func (auth *Authentication) AuthViaSession(
|
||||
func Init(authCfg *json.RawMessage) {
|
||||
initOnce.Do(func() {
|
||||
authInstance = &Authentication{}
|
||||
|
||||
|
||||
// Start background cleanup of rate limiters
|
||||
startRateLimiterCleanup()
|
||||
|
||||
@@ -263,7 +263,7 @@ func GetAuthInstance() *Authentication {
|
||||
}
|
||||
|
||||
// handleUserSync syncs or updates a user in the database based on configuration.
|
||||
// This is used for both JWT and OIDC authentication when syncUserOnLogin or updateUserOnLogin is enabled.
|
||||
// This is used for LDAP, JWT and OIDC authentications when syncUserOnLogin or updateUserOnLogin is enabled.
|
||||
func handleUserSync(user *schema.User, syncUserOnLogin, updateUserOnLogin bool) {
|
||||
r := repository.GetUserRepository()
|
||||
dbUser, err := r.GetUser(user.Username)
|
||||
@@ -272,7 +272,7 @@ func handleUserSync(user *schema.User, syncUserOnLogin, updateUserOnLogin bool)
|
||||
cclog.Errorf("Error while loading user '%s': %v", user.Username, err)
|
||||
return
|
||||
}
|
||||
|
||||
|
||||
if err == sql.ErrNoRows && syncUserOnLogin { // Add new user
|
||||
if err := r.AddUser(user); err != nil {
|
||||
cclog.Errorf("Error while adding user '%s' to DB: %v", user.Username, err)
|
||||
@@ -294,6 +294,11 @@ func handleOIDCUser(OIDCUser *schema.User) {
|
||||
handleUserSync(OIDCUser, Keys.OpenIDConfig.SyncUserOnLogin, Keys.OpenIDConfig.UpdateUserOnLogin)
|
||||
}
|
||||
|
||||
// handleLdapUser syncs LDAP user with database
|
||||
func handleLdapUser(ldapUser *schema.User) {
|
||||
handleUserSync(ldapUser, Keys.LdapConfig.SyncUserOnLogin, Keys.LdapConfig.UpdateUserOnLogin)
|
||||
}
|
||||
|
||||
func (auth *Authentication) SaveSession(rw http.ResponseWriter, r *http.Request, user *schema.User) error {
|
||||
session, err := auth.sessionStore.New(r, "session")
|
||||
if err != nil {
|
||||
@@ -305,8 +310,13 @@ func (auth *Authentication) SaveSession(rw http.ResponseWriter, r *http.Request,
|
||||
if auth.SessionMaxAge != 0 {
|
||||
session.Options.MaxAge = int(auth.SessionMaxAge.Seconds())
|
||||
}
|
||||
if config.Keys.HTTPSCertFile == "" && config.Keys.HTTPSKeyFile == "" {
|
||||
cclog.Warn("HTTPS not configured - session cookies will not have Secure flag set (insecure for production)")
|
||||
if r.TLS == nil && r.Header.Get("X-Forwarded-Proto") != "https" {
|
||||
// If neither TLS or an encrypted reverse proxy are used, do not mark cookies as secure.
|
||||
cclog.Warn("Authenticating with unencrypted request. Session cookies will not have Secure flag set (insecure for production)")
|
||||
if r.Header.Get("X-Forwarded-Proto") == "" {
|
||||
// This warning will not be printed if e.g. X-Forwarded-Proto == http
|
||||
cclog.Warn("If you are using a reverse proxy, make sure X-Forwarded-Proto is set")
|
||||
}
|
||||
session.Options.Secure = false
|
||||
}
|
||||
session.Options.SameSite = http.SameSiteStrictMode
|
||||
@@ -438,13 +448,13 @@ func (auth *Authentication) AuthAPI(
|
||||
if user != nil {
|
||||
switch {
|
||||
case len(user.Roles) == 1:
|
||||
if user.HasRole(schema.RoleApi) {
|
||||
if user.HasRole(schema.RoleAPI) {
|
||||
ctx := context.WithValue(r.Context(), repository.ContextUserKey, user)
|
||||
onsuccess.ServeHTTP(rw, r.WithContext(ctx))
|
||||
return
|
||||
}
|
||||
case len(user.Roles) >= 2:
|
||||
if user.HasAllRoles([]schema.Role{schema.RoleAdmin, schema.RoleApi}) {
|
||||
if user.HasAllRoles([]schema.Role{schema.RoleAdmin, schema.RoleAPI}) {
|
||||
ctx := context.WithValue(r.Context(), repository.ContextUserKey, user)
|
||||
onsuccess.ServeHTTP(rw, r.WithContext(ctx))
|
||||
return
|
||||
@@ -474,13 +484,13 @@ func (auth *Authentication) AuthUserAPI(
|
||||
if user != nil {
|
||||
switch {
|
||||
case len(user.Roles) == 1:
|
||||
if user.HasRole(schema.RoleApi) {
|
||||
if user.HasRole(schema.RoleAPI) {
|
||||
ctx := context.WithValue(r.Context(), repository.ContextUserKey, user)
|
||||
onsuccess.ServeHTTP(rw, r.WithContext(ctx))
|
||||
return
|
||||
}
|
||||
case len(user.Roles) >= 2:
|
||||
if user.HasRole(schema.RoleApi) && user.HasAnyRole([]schema.Role{schema.RoleUser, schema.RoleManager, schema.RoleSupport, schema.RoleAdmin}) {
|
||||
if user.HasRole(schema.RoleAPI) && user.HasAnyRole([]schema.Role{schema.RoleUser, schema.RoleManager, schema.RoleSupport, schema.RoleAdmin}) {
|
||||
ctx := context.WithValue(r.Context(), repository.ContextUserKey, user)
|
||||
onsuccess.ServeHTTP(rw, r.WithContext(ctx))
|
||||
return
|
||||
@@ -510,13 +520,13 @@ func (auth *Authentication) AuthMetricStoreAPI(
|
||||
if user != nil {
|
||||
switch {
|
||||
case len(user.Roles) == 1:
|
||||
if user.HasRole(schema.RoleApi) {
|
||||
if user.HasRole(schema.RoleAPI) {
|
||||
ctx := context.WithValue(r.Context(), repository.ContextUserKey, user)
|
||||
onsuccess.ServeHTTP(rw, r.WithContext(ctx))
|
||||
return
|
||||
}
|
||||
case len(user.Roles) >= 2:
|
||||
if user.HasRole(schema.RoleApi) && user.HasAnyRole([]schema.Role{schema.RoleUser, schema.RoleManager, schema.RoleAdmin}) {
|
||||
if user.HasRole(schema.RoleAPI) && user.HasAnyRole([]schema.Role{schema.RoleUser, schema.RoleManager, schema.RoleAdmin}) {
|
||||
ctx := context.WithValue(r.Context(), repository.ContextUserKey, user)
|
||||
onsuccess.ServeHTTP(rw, r.WithContext(ctx))
|
||||
return
|
||||
@@ -616,9 +626,9 @@ func securedCheck(user *schema.User, r *http.Request) error {
|
||||
}
|
||||
// If SplitHostPort fails, IPAddress is already just a host (no port)
|
||||
|
||||
// If nothing declared in config: deny all request to this api endpoint
|
||||
// If nothing declared in config: Continue
|
||||
if len(config.Keys.APIAllowedIPs) == 0 {
|
||||
return fmt.Errorf("missing configuration key ApiAllowedIPs")
|
||||
return nil
|
||||
}
|
||||
// If wildcard declared in config: Continue
|
||||
if config.Keys.APIAllowedIPs[0] == "*" {
|
||||
|
||||
@@ -15,25 +15,25 @@ import (
|
||||
func TestGetIPUserLimiter(t *testing.T) {
|
||||
ip := "192.168.1.1"
|
||||
username := "testuser"
|
||||
|
||||
|
||||
// Get limiter for the first time
|
||||
limiter1 := getIPUserLimiter(ip, username)
|
||||
if limiter1 == nil {
|
||||
t.Fatal("Expected limiter to be created")
|
||||
}
|
||||
|
||||
|
||||
// Get the same limiter again
|
||||
limiter2 := getIPUserLimiter(ip, username)
|
||||
if limiter1 != limiter2 {
|
||||
t.Error("Expected to get the same limiter instance")
|
||||
}
|
||||
|
||||
|
||||
// Get a different limiter for different user
|
||||
limiter3 := getIPUserLimiter(ip, "otheruser")
|
||||
if limiter1 == limiter3 {
|
||||
t.Error("Expected different limiter for different user")
|
||||
}
|
||||
|
||||
|
||||
// Get a different limiter for different IP
|
||||
limiter4 := getIPUserLimiter("192.168.1.2", username)
|
||||
if limiter1 == limiter4 {
|
||||
@@ -45,16 +45,16 @@ func TestGetIPUserLimiter(t *testing.T) {
|
||||
func TestRateLimiterBehavior(t *testing.T) {
|
||||
ip := "10.0.0.1"
|
||||
username := "ratelimituser"
|
||||
|
||||
|
||||
limiter := getIPUserLimiter(ip, username)
|
||||
|
||||
|
||||
// Should allow first 5 attempts
|
||||
for i := 0; i < 5; i++ {
|
||||
for i := range 5 {
|
||||
if !limiter.Allow() {
|
||||
t.Errorf("Request %d should be allowed within rate limit", i+1)
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
// 6th attempt should be blocked
|
||||
if limiter.Allow() {
|
||||
t.Error("Request 6 should be blocked by rate limiter")
|
||||
@@ -65,19 +65,19 @@ func TestRateLimiterBehavior(t *testing.T) {
|
||||
func TestCleanupOldRateLimiters(t *testing.T) {
|
||||
// Clear all existing limiters first to avoid interference from other tests
|
||||
cleanupOldRateLimiters(time.Now().Add(24 * time.Hour))
|
||||
|
||||
|
||||
// Create some new rate limiters
|
||||
limiter1 := getIPUserLimiter("1.1.1.1", "user1")
|
||||
limiter2 := getIPUserLimiter("2.2.2.2", "user2")
|
||||
|
||||
|
||||
if limiter1 == nil || limiter2 == nil {
|
||||
t.Fatal("Failed to create test limiters")
|
||||
}
|
||||
|
||||
|
||||
// Cleanup limiters older than 1 second from now (should keep both)
|
||||
time.Sleep(10 * time.Millisecond) // Small delay to ensure timestamp difference
|
||||
cleanupOldRateLimiters(time.Now().Add(-1 * time.Second))
|
||||
|
||||
|
||||
// Verify they still exist (should get same instance)
|
||||
if getIPUserLimiter("1.1.1.1", "user1") != limiter1 {
|
||||
t.Error("Limiter 1 was incorrectly cleaned up")
|
||||
@@ -85,10 +85,10 @@ func TestCleanupOldRateLimiters(t *testing.T) {
|
||||
if getIPUserLimiter("2.2.2.2", "user2") != limiter2 {
|
||||
t.Error("Limiter 2 was incorrectly cleaned up")
|
||||
}
|
||||
|
||||
|
||||
// Cleanup limiters older than 1 hour from now (should remove both)
|
||||
cleanupOldRateLimiters(time.Now().Add(2 * time.Hour))
|
||||
|
||||
|
||||
// Getting them again should create new instances
|
||||
newLimiter1 := getIPUserLimiter("1.1.1.1", "user1")
|
||||
if newLimiter1 == limiter1 {
|
||||
@@ -107,14 +107,14 @@ func TestIPv4Extraction(t *testing.T) {
|
||||
{"IPv4 without port", "192.168.1.1", "192.168.1.1"},
|
||||
{"Localhost with port", "127.0.0.1:3000", "127.0.0.1"},
|
||||
}
|
||||
|
||||
|
||||
for _, tt := range tests {
|
||||
t.Run(tt.name, func(t *testing.T) {
|
||||
result := tt.input
|
||||
if host, _, err := net.SplitHostPort(result); err == nil {
|
||||
result = host
|
||||
}
|
||||
|
||||
|
||||
if result != tt.expected {
|
||||
t.Errorf("Expected %s, got %s", tt.expected, result)
|
||||
}
|
||||
@@ -122,7 +122,7 @@ func TestIPv4Extraction(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
// TestIPv6Extraction tests extracting IPv6 addresses
|
||||
// TestIPv6Extraction tests extracting IPv6 addresses
|
||||
func TestIPv6Extraction(t *testing.T) {
|
||||
tests := []struct {
|
||||
name string
|
||||
@@ -134,14 +134,14 @@ func TestIPv6Extraction(t *testing.T) {
|
||||
{"IPv6 without port", "2001:db8::1", "2001:db8::1"},
|
||||
{"IPv6 localhost", "::1", "::1"},
|
||||
}
|
||||
|
||||
|
||||
for _, tt := range tests {
|
||||
t.Run(tt.name, func(t *testing.T) {
|
||||
result := tt.input
|
||||
if host, _, err := net.SplitHostPort(result); err == nil {
|
||||
result = host
|
||||
}
|
||||
|
||||
|
||||
if result != tt.expected {
|
||||
t.Errorf("Expected %s, got %s", tt.expected, result)
|
||||
}
|
||||
@@ -160,14 +160,14 @@ func TestIPExtractionEdgeCases(t *testing.T) {
|
||||
{"Empty string", "", ""},
|
||||
{"Just port", ":8080", ""},
|
||||
}
|
||||
|
||||
|
||||
for _, tt := range tests {
|
||||
t.Run(tt.name, func(t *testing.T) {
|
||||
result := tt.input
|
||||
if host, _, err := net.SplitHostPort(result); err == nil {
|
||||
result = host
|
||||
}
|
||||
|
||||
|
||||
if result != tt.expected {
|
||||
t.Errorf("Expected %s, got %s", tt.expected, result)
|
||||
}
|
||||
|
||||
@@ -14,8 +14,8 @@ import (
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
|
||||
"github.com/ClusterCockpit/cc-lib/schema"
|
||||
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
|
||||
"github.com/ClusterCockpit/cc-lib/v2/schema"
|
||||
"github.com/golang-jwt/jwt/v5"
|
||||
)
|
||||
|
||||
@@ -25,20 +25,20 @@ type JWTAuthConfig struct {
|
||||
MaxAge string `json:"max-age"`
|
||||
|
||||
// Specifies which cookie should be checked for a JWT token (if no authorization header is present)
|
||||
CookieName string `json:"cookieName"`
|
||||
CookieName string `json:"cookie-name"`
|
||||
|
||||
// Deny login for users not in database (but defined in JWT).
|
||||
// Ignore user roles defined in JWTs ('roles' claim), get them from db.
|
||||
ValidateUser bool `json:"validateUser"`
|
||||
ValidateUser bool `json:"validate-user"`
|
||||
|
||||
// Specifies which issuer should be accepted when validating external JWTs ('iss' claim)
|
||||
TrustedIssuer string `json:"trustedIssuer"`
|
||||
TrustedIssuer string `json:"trusted-issuer"`
|
||||
|
||||
// Should an non-existent user be added to the DB based on the information in the token
|
||||
SyncUserOnLogin bool `json:"syncUserOnLogin"`
|
||||
SyncUserOnLogin bool `json:"sync-user-on-login"`
|
||||
|
||||
// Should an existent user be updated in the DB based on the information in the token
|
||||
UpdateUserOnLogin bool `json:"updateUserOnLogin"`
|
||||
UpdateUserOnLogin bool `json:"update-user-on-login"`
|
||||
}
|
||||
|
||||
type JWTAuthenticator struct {
|
||||
@@ -101,20 +101,20 @@ func (ja *JWTAuthenticator) AuthViaJWT(
|
||||
|
||||
// Token is valid, extract payload
|
||||
claims := token.Claims.(jwt.MapClaims)
|
||||
|
||||
|
||||
// Use shared helper to get user from JWT claims
|
||||
var user *schema.User
|
||||
user, err = getUserFromJWT(claims, Keys.JwtConfig.ValidateUser, schema.AuthToken, -1)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
|
||||
// If not validating user, we only get roles from JWT (no projects for this auth method)
|
||||
if !Keys.JwtConfig.ValidateUser {
|
||||
user.Roles = extractRolesFromClaims(claims, false)
|
||||
user.Projects = nil // Standard JWT auth doesn't include projects
|
||||
}
|
||||
|
||||
|
||||
return user, nil
|
||||
}
|
||||
|
||||
|
||||
@@ -12,8 +12,8 @@ import (
|
||||
"net/http"
|
||||
"os"
|
||||
|
||||
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
|
||||
"github.com/ClusterCockpit/cc-lib/schema"
|
||||
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
|
||||
"github.com/ClusterCockpit/cc-lib/v2/schema"
|
||||
"github.com/golang-jwt/jwt/v5"
|
||||
)
|
||||
|
||||
@@ -146,13 +146,13 @@ func (ja *JWTCookieSessionAuthenticator) Login(
|
||||
}
|
||||
|
||||
claims := token.Claims.(jwt.MapClaims)
|
||||
|
||||
|
||||
// Use shared helper to get user from JWT claims
|
||||
user, err = getUserFromJWT(claims, jc.ValidateUser, schema.AuthSession, schema.AuthViaToken)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
|
||||
// Sync or update user if configured
|
||||
if !jc.ValidateUser && (jc.SyncUserOnLogin || jc.UpdateUserOnLogin) {
|
||||
handleTokenUser(user)
|
||||
|
||||
@@ -9,10 +9,11 @@ import (
|
||||
"database/sql"
|
||||
"errors"
|
||||
"fmt"
|
||||
"strings"
|
||||
|
||||
"github.com/ClusterCockpit/cc-backend/internal/repository"
|
||||
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
|
||||
"github.com/ClusterCockpit/cc-lib/schema"
|
||||
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
|
||||
"github.com/ClusterCockpit/cc-lib/v2/schema"
|
||||
"github.com/golang-jwt/jwt/v5"
|
||||
)
|
||||
|
||||
@@ -28,7 +29,7 @@ func extractStringFromClaims(claims jwt.MapClaims, key string) string {
|
||||
// If validateRoles is true, only valid roles are returned
|
||||
func extractRolesFromClaims(claims jwt.MapClaims, validateRoles bool) []string {
|
||||
var roles []string
|
||||
|
||||
|
||||
if rawroles, ok := claims["roles"].([]any); ok {
|
||||
for _, rr := range rawroles {
|
||||
if r, ok := rr.(string); ok {
|
||||
@@ -42,14 +43,14 @@ func extractRolesFromClaims(claims jwt.MapClaims, validateRoles bool) []string {
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
return roles
|
||||
}
|
||||
|
||||
// extractProjectsFromClaims extracts projects from JWT claims
|
||||
func extractProjectsFromClaims(claims jwt.MapClaims) []string {
|
||||
projects := make([]string, 0)
|
||||
|
||||
|
||||
if rawprojs, ok := claims["projects"].([]any); ok {
|
||||
for _, pp := range rawprojs {
|
||||
if p, ok := pp.(string); ok {
|
||||
@@ -61,7 +62,7 @@ func extractProjectsFromClaims(claims jwt.MapClaims) []string {
|
||||
projects = append(projects, projSlice...)
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
return projects
|
||||
}
|
||||
|
||||
@@ -72,22 +73,23 @@ func extractNameFromClaims(claims jwt.MapClaims) string {
|
||||
if name, ok := claims["name"].(string); ok {
|
||||
return name
|
||||
}
|
||||
|
||||
|
||||
// Try nested structure: {name: {values: [...]}}
|
||||
if wrap, ok := claims["name"].(map[string]any); ok {
|
||||
if vals, ok := wrap["values"].([]any); ok {
|
||||
if len(vals) == 0 {
|
||||
return ""
|
||||
}
|
||||
|
||||
name := fmt.Sprintf("%v", vals[0])
|
||||
|
||||
var name strings.Builder
|
||||
name.WriteString(fmt.Sprintf("%v", vals[0]))
|
||||
for i := 1; i < len(vals); i++ {
|
||||
name += fmt.Sprintf(" %v", vals[i])
|
||||
name.WriteString(fmt.Sprintf(" %v", vals[i]))
|
||||
}
|
||||
return name
|
||||
return name.String()
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
return ""
|
||||
}
|
||||
|
||||
@@ -100,7 +102,7 @@ func getUserFromJWT(claims jwt.MapClaims, validateUser bool, authType schema.Aut
|
||||
if sub == "" {
|
||||
return nil, errors.New("missing 'sub' claim in JWT")
|
||||
}
|
||||
|
||||
|
||||
if validateUser {
|
||||
// Validate user against database
|
||||
ur := repository.GetUserRepository()
|
||||
@@ -109,22 +111,22 @@ func getUserFromJWT(claims jwt.MapClaims, validateUser bool, authType schema.Aut
|
||||
cclog.Errorf("Error while loading user '%v': %v", sub, err)
|
||||
return nil, fmt.Errorf("database error: %w", err)
|
||||
}
|
||||
|
||||
|
||||
// Deny any logins for unknown usernames
|
||||
if user == nil || err == sql.ErrNoRows {
|
||||
cclog.Warn("Could not find user from JWT in internal database.")
|
||||
return nil, errors.New("unknown user")
|
||||
}
|
||||
|
||||
|
||||
// Return database user (with database roles)
|
||||
return user, nil
|
||||
}
|
||||
|
||||
|
||||
// Create user from JWT claims
|
||||
name := extractNameFromClaims(claims)
|
||||
roles := extractRolesFromClaims(claims, true) // Validate roles
|
||||
projects := extractProjectsFromClaims(claims)
|
||||
|
||||
|
||||
return &schema.User{
|
||||
Username: sub,
|
||||
Name: name,
|
||||
|
||||
@@ -8,7 +8,7 @@ package auth
|
||||
import (
|
||||
"testing"
|
||||
|
||||
"github.com/ClusterCockpit/cc-lib/schema"
|
||||
"github.com/ClusterCockpit/cc-lib/v2/schema"
|
||||
"github.com/golang-jwt/jwt/v5"
|
||||
)
|
||||
|
||||
@@ -19,7 +19,7 @@ func TestExtractStringFromClaims(t *testing.T) {
|
||||
"email": "test@example.com",
|
||||
"age": 25, // not a string
|
||||
}
|
||||
|
||||
|
||||
tests := []struct {
|
||||
name string
|
||||
key string
|
||||
@@ -30,7 +30,7 @@ func TestExtractStringFromClaims(t *testing.T) {
|
||||
{"Non-existent key", "missing", ""},
|
||||
{"Non-string value", "age", ""},
|
||||
}
|
||||
|
||||
|
||||
for _, tt := range tests {
|
||||
t.Run(tt.name, func(t *testing.T) {
|
||||
result := extractStringFromClaims(claims, tt.key)
|
||||
@@ -88,16 +88,16 @@ func TestExtractRolesFromClaims(t *testing.T) {
|
||||
expected: []string{},
|
||||
},
|
||||
}
|
||||
|
||||
|
||||
for _, tt := range tests {
|
||||
t.Run(tt.name, func(t *testing.T) {
|
||||
result := extractRolesFromClaims(tt.claims, tt.validateRoles)
|
||||
|
||||
|
||||
if len(result) != len(tt.expected) {
|
||||
t.Errorf("Expected %d roles, got %d", len(tt.expected), len(result))
|
||||
return
|
||||
}
|
||||
|
||||
|
||||
for i, role := range result {
|
||||
if i >= len(tt.expected) || role != tt.expected[i] {
|
||||
t.Errorf("Expected role %s at position %d, got %s", tt.expected[i], i, role)
|
||||
@@ -141,16 +141,16 @@ func TestExtractProjectsFromClaims(t *testing.T) {
|
||||
expected: []string{"project1", "project2"}, // Should skip non-strings
|
||||
},
|
||||
}
|
||||
|
||||
|
||||
for _, tt := range tests {
|
||||
t.Run(tt.name, func(t *testing.T) {
|
||||
result := extractProjectsFromClaims(tt.claims)
|
||||
|
||||
|
||||
if len(result) != len(tt.expected) {
|
||||
t.Errorf("Expected %d projects, got %d", len(tt.expected), len(result))
|
||||
return
|
||||
}
|
||||
|
||||
|
||||
for i, project := range result {
|
||||
if i >= len(tt.expected) || project != tt.expected[i] {
|
||||
t.Errorf("Expected project %s at position %d, got %s", tt.expected[i], i, project)
|
||||
@@ -216,7 +216,7 @@ func TestExtractNameFromClaims(t *testing.T) {
|
||||
expected: "123 Smith", // Should convert to string
|
||||
},
|
||||
}
|
||||
|
||||
|
||||
for _, tt := range tests {
|
||||
t.Run(tt.name, func(t *testing.T) {
|
||||
result := extractNameFromClaims(tt.claims)
|
||||
@@ -235,29 +235,28 @@ func TestGetUserFromJWT_NoValidation(t *testing.T) {
|
||||
"roles": []any{"user", "admin"},
|
||||
"projects": []any{"project1", "project2"},
|
||||
}
|
||||
|
||||
|
||||
user, err := getUserFromJWT(claims, false, schema.AuthToken, -1)
|
||||
|
||||
if err != nil {
|
||||
t.Fatalf("Unexpected error: %v", err)
|
||||
}
|
||||
|
||||
|
||||
if user.Username != "testuser" {
|
||||
t.Errorf("Expected username 'testuser', got '%s'", user.Username)
|
||||
}
|
||||
|
||||
|
||||
if user.Name != "Test User" {
|
||||
t.Errorf("Expected name 'Test User', got '%s'", user.Name)
|
||||
}
|
||||
|
||||
|
||||
if len(user.Roles) != 2 {
|
||||
t.Errorf("Expected 2 roles, got %d", len(user.Roles))
|
||||
}
|
||||
|
||||
|
||||
if len(user.Projects) != 2 {
|
||||
t.Errorf("Expected 2 projects, got %d", len(user.Projects))
|
||||
}
|
||||
|
||||
|
||||
if user.AuthType != schema.AuthToken {
|
||||
t.Errorf("Expected AuthType %v, got %v", schema.AuthToken, user.AuthType)
|
||||
}
|
||||
@@ -268,13 +267,13 @@ func TestGetUserFromJWT_MissingSub(t *testing.T) {
|
||||
claims := jwt.MapClaims{
|
||||
"name": "Test User",
|
||||
}
|
||||
|
||||
|
||||
_, err := getUserFromJWT(claims, false, schema.AuthToken, -1)
|
||||
|
||||
|
||||
if err == nil {
|
||||
t.Error("Expected error for missing sub claim")
|
||||
}
|
||||
|
||||
|
||||
if err.Error() != "missing 'sub' claim in JWT" {
|
||||
t.Errorf("Expected specific error message, got: %v", err)
|
||||
}
|
||||
|
||||
@@ -13,8 +13,8 @@ import (
|
||||
"os"
|
||||
"strings"
|
||||
|
||||
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
|
||||
"github.com/ClusterCockpit/cc-lib/schema"
|
||||
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
|
||||
"github.com/ClusterCockpit/cc-lib/v2/schema"
|
||||
"github.com/golang-jwt/jwt/v5"
|
||||
)
|
||||
|
||||
@@ -75,13 +75,13 @@ func (ja *JWTSessionAuthenticator) Login(
|
||||
}
|
||||
|
||||
claims := token.Claims.(jwt.MapClaims)
|
||||
|
||||
|
||||
// Use shared helper to get user from JWT claims
|
||||
user, err = getUserFromJWT(claims, Keys.JwtConfig.ValidateUser, schema.AuthSession, schema.AuthViaToken)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
|
||||
// Sync or update user if configured
|
||||
if !Keys.JwtConfig.ValidateUser && (Keys.JwtConfig.SyncUserOnLogin || Keys.JwtConfig.UpdateUserOnLogin) {
|
||||
handleTokenUser(user)
|
||||
|
||||
@@ -6,35 +6,39 @@
|
||||
package auth
|
||||
|
||||
import (
|
||||
"errors"
|
||||
"fmt"
|
||||
"net"
|
||||
"net/http"
|
||||
"os"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"github.com/ClusterCockpit/cc-backend/internal/repository"
|
||||
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
|
||||
"github.com/ClusterCockpit/cc-lib/schema"
|
||||
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
|
||||
"github.com/ClusterCockpit/cc-lib/v2/schema"
|
||||
"github.com/go-ldap/ldap/v3"
|
||||
)
|
||||
|
||||
type LdapConfig struct {
|
||||
URL string `json:"url"`
|
||||
UserBase string `json:"user_base"`
|
||||
SearchDN string `json:"search_dn"`
|
||||
UserBind string `json:"user_bind"`
|
||||
UserFilter string `json:"user_filter"`
|
||||
UserAttr string `json:"username_attr"`
|
||||
SyncInterval string `json:"sync_interval"` // Parsed using time.ParseDuration.
|
||||
SyncDelOldUsers bool `json:"sync_del_old_users"`
|
||||
UserBase string `json:"user-base"`
|
||||
SearchDN string `json:"search-dn"`
|
||||
UserBind string `json:"user-bind"`
|
||||
UserFilter string `json:"user-filter"`
|
||||
UserAttr string `json:"username-attr"`
|
||||
UIDAttr string `json:"uid-attr"`
|
||||
SyncInterval string `json:"sync-interval"` // Parsed using time.ParseDuration.
|
||||
SyncDelOldUsers bool `json:"sync-del-old-users"`
|
||||
|
||||
// Should an non-existent user be added to the DB if user exists in ldap directory
|
||||
SyncUserOnLogin bool `json:"syncUserOnLogin"`
|
||||
// Should a non-existent user be added to the DB if user exists in ldap directory
|
||||
SyncUserOnLogin bool `json:"sync-user-on-login"`
|
||||
UpdateUserOnLogin bool `json:"update-user-on-login"`
|
||||
}
|
||||
|
||||
type LdapAuthenticator struct {
|
||||
syncPassword string
|
||||
UserAttr string
|
||||
UIDAttr string
|
||||
}
|
||||
|
||||
var _ Authenticator = (*LdapAuthenticator)(nil)
|
||||
@@ -51,6 +55,12 @@ func (la *LdapAuthenticator) Init() error {
|
||||
la.UserAttr = "gecos"
|
||||
}
|
||||
|
||||
if Keys.LdapConfig.UIDAttr != "" {
|
||||
la.UIDAttr = Keys.LdapConfig.UIDAttr
|
||||
} else {
|
||||
la.UIDAttr = "uid"
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
@@ -66,55 +76,44 @@ func (la *LdapAuthenticator) CanLogin(
|
||||
if user.AuthSource == schema.AuthViaLDAP {
|
||||
return user, true
|
||||
}
|
||||
} else {
|
||||
if lc.SyncUserOnLogin {
|
||||
l, err := la.getLdapConnection(true)
|
||||
if err != nil {
|
||||
cclog.Error("LDAP connection error")
|
||||
return nil, false
|
||||
}
|
||||
defer l.Close()
|
||||
|
||||
// Search for the given username
|
||||
searchRequest := ldap.NewSearchRequest(
|
||||
lc.UserBase,
|
||||
ldap.ScopeWholeSubtree, ldap.NeverDerefAliases, 0, 0, false,
|
||||
fmt.Sprintf("(&%s(uid=%s))", lc.UserFilter, username),
|
||||
[]string{"dn", "uid", la.UserAttr}, nil)
|
||||
|
||||
sr, err := l.Search(searchRequest)
|
||||
if err != nil {
|
||||
cclog.Warn(err)
|
||||
return nil, false
|
||||
}
|
||||
|
||||
if len(sr.Entries) != 1 {
|
||||
cclog.Warn("LDAP: User does not exist or too many entries returned")
|
||||
return nil, false
|
||||
}
|
||||
|
||||
entry := sr.Entries[0]
|
||||
name := entry.GetAttributeValue(la.UserAttr)
|
||||
var roles []string
|
||||
roles = append(roles, schema.GetRoleString(schema.RoleUser))
|
||||
projects := make([]string, 0)
|
||||
|
||||
user = &schema.User{
|
||||
Username: username,
|
||||
Name: name,
|
||||
Roles: roles,
|
||||
Projects: projects,
|
||||
AuthType: schema.AuthSession,
|
||||
AuthSource: schema.AuthViaLDAP,
|
||||
}
|
||||
|
||||
if err := repository.GetUserRepository().AddUser(user); err != nil {
|
||||
cclog.Errorf("User '%s' LDAP: Insert into DB failed", username)
|
||||
return nil, false
|
||||
}
|
||||
|
||||
return user, true
|
||||
} else if lc.SyncUserOnLogin {
|
||||
l, err := la.getLdapConnection(true)
|
||||
if err != nil {
|
||||
cclog.Error("LDAP connection error")
|
||||
return nil, false
|
||||
}
|
||||
defer l.Close()
|
||||
|
||||
// Search for the given username
|
||||
searchRequest := ldap.NewSearchRequest(
|
||||
lc.UserBase,
|
||||
ldap.ScopeWholeSubtree, ldap.NeverDerefAliases, 0, 0, false,
|
||||
fmt.Sprintf("(&%s(%s=%s))", lc.UserFilter, la.UIDAttr, ldap.EscapeFilter(username)),
|
||||
[]string{"dn", la.UIDAttr, la.UserAttr}, nil)
|
||||
|
||||
sr, err := l.Search(searchRequest)
|
||||
if err != nil {
|
||||
cclog.Warn(err)
|
||||
return nil, false
|
||||
}
|
||||
|
||||
if len(sr.Entries) != 1 {
|
||||
cclog.Warn("LDAP: User does not exist or too many entries returned")
|
||||
return nil, false
|
||||
}
|
||||
|
||||
entry := sr.Entries[0]
|
||||
user = &schema.User{
|
||||
Username: username,
|
||||
Name: entry.GetAttributeValue(la.UserAttr),
|
||||
Roles: []string{schema.GetRoleString(schema.RoleUser)},
|
||||
Projects: make([]string, 0),
|
||||
AuthType: schema.AuthSession,
|
||||
AuthSource: schema.AuthViaLDAP,
|
||||
}
|
||||
|
||||
handleLdapUser(user)
|
||||
return user, true
|
||||
}
|
||||
|
||||
return nil, false
|
||||
@@ -132,7 +131,7 @@ func (la *LdapAuthenticator) Login(
|
||||
}
|
||||
defer l.Close()
|
||||
|
||||
userDn := strings.ReplaceAll(Keys.LdapConfig.UserBind, "{username}", user.Username)
|
||||
userDn := strings.ReplaceAll(Keys.LdapConfig.UserBind, "{username}", ldap.EscapeDN(user.Username))
|
||||
if err := l.Bind(userDn, r.FormValue("password")); err != nil {
|
||||
cclog.Errorf("AUTH/LDAP > Authentication for user %s failed: %v",
|
||||
user.Username, err)
|
||||
@@ -170,7 +169,7 @@ func (la *LdapAuthenticator) Sync() error {
|
||||
lc.UserBase,
|
||||
ldap.ScopeWholeSubtree, ldap.NeverDerefAliases, 0, 0, false,
|
||||
lc.UserFilter,
|
||||
[]string{"dn", "uid", la.UserAttr}, nil))
|
||||
[]string{"dn", la.UIDAttr, la.UserAttr}, nil))
|
||||
if err != nil {
|
||||
cclog.Warn("LDAP search error")
|
||||
return err
|
||||
@@ -178,9 +177,9 @@ func (la *LdapAuthenticator) Sync() error {
|
||||
|
||||
newnames := map[string]string{}
|
||||
for _, entry := range ldapResults.Entries {
|
||||
username := entry.GetAttributeValue("uid")
|
||||
username := entry.GetAttributeValue(la.UIDAttr)
|
||||
if username == "" {
|
||||
return errors.New("no attribute 'uid'")
|
||||
return fmt.Errorf("no attribute '%s'", la.UIDAttr)
|
||||
}
|
||||
|
||||
_, ok := users[username]
|
||||
@@ -194,20 +193,19 @@ func (la *LdapAuthenticator) Sync() error {
|
||||
|
||||
for username, where := range users {
|
||||
if where == InDB && lc.SyncDelOldUsers {
|
||||
ur.DelUser(username)
|
||||
if err := ur.DelUser(username); err != nil {
|
||||
cclog.Errorf("User '%s' LDAP: Delete from DB failed: %v", username, err)
|
||||
return err
|
||||
}
|
||||
cclog.Debugf("sync: remove %v (does not show up in LDAP anymore)", username)
|
||||
} else if where == InLdap {
|
||||
name := newnames[username]
|
||||
|
||||
var roles []string
|
||||
roles = append(roles, schema.GetRoleString(schema.RoleUser))
|
||||
projects := make([]string, 0)
|
||||
|
||||
user := &schema.User{
|
||||
Username: username,
|
||||
Name: name,
|
||||
Roles: roles,
|
||||
Projects: projects,
|
||||
Roles: []string{schema.GetRoleString(schema.RoleUser)},
|
||||
Projects: make([]string, 0),
|
||||
AuthSource: schema.AuthViaLDAP,
|
||||
}
|
||||
|
||||
@@ -224,11 +222,13 @@ func (la *LdapAuthenticator) Sync() error {
|
||||
|
||||
func (la *LdapAuthenticator) getLdapConnection(admin bool) (*ldap.Conn, error) {
|
||||
lc := Keys.LdapConfig
|
||||
conn, err := ldap.DialURL(lc.URL)
|
||||
conn, err := ldap.DialURL(lc.URL,
|
||||
ldap.DialWithDialer(&net.Dialer{Timeout: 10 * time.Second}))
|
||||
if err != nil {
|
||||
cclog.Warn("LDAP URL dial failed")
|
||||
return nil, err
|
||||
}
|
||||
conn.SetTimeout(30 * time.Second)
|
||||
|
||||
if admin {
|
||||
if err := conn.Bind(lc.SearchDN, la.syncPassword); err != nil {
|
||||
|
||||
@@ -9,8 +9,8 @@ import (
|
||||
"fmt"
|
||||
"net/http"
|
||||
|
||||
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
|
||||
"github.com/ClusterCockpit/cc-lib/schema"
|
||||
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
|
||||
"github.com/ClusterCockpit/cc-lib/v2/schema"
|
||||
"golang.org/x/crypto/bcrypt"
|
||||
)
|
||||
|
||||
|
||||
@@ -9,23 +9,24 @@ import (
|
||||
"context"
|
||||
"crypto/rand"
|
||||
"encoding/base64"
|
||||
"fmt"
|
||||
"io"
|
||||
"net/http"
|
||||
"os"
|
||||
"time"
|
||||
|
||||
"github.com/ClusterCockpit/cc-backend/internal/repository"
|
||||
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
|
||||
"github.com/ClusterCockpit/cc-lib/schema"
|
||||
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
|
||||
"github.com/ClusterCockpit/cc-lib/v2/schema"
|
||||
"github.com/coreos/go-oidc/v3/oidc"
|
||||
"github.com/gorilla/mux"
|
||||
"github.com/go-chi/chi/v5"
|
||||
"golang.org/x/oauth2"
|
||||
)
|
||||
|
||||
type OpenIDConfig struct {
|
||||
Provider string `json:"provider"`
|
||||
SyncUserOnLogin bool `json:"syncUserOnLogin"`
|
||||
UpdateUserOnLogin bool `json:"updateUserOnLogin"`
|
||||
SyncUserOnLogin bool `json:"sync-user-on-login"`
|
||||
UpdateUserOnLogin bool `json:"update-user-on-login"`
|
||||
}
|
||||
|
||||
type OIDC struct {
|
||||
@@ -50,6 +51,7 @@ func setCallbackCookie(w http.ResponseWriter, r *http.Request, name, value strin
|
||||
MaxAge: int(time.Hour.Seconds()),
|
||||
Secure: r.TLS != nil,
|
||||
HttpOnly: true,
|
||||
SameSite: http.SameSiteLaxMode,
|
||||
}
|
||||
http.SetCookie(w, c)
|
||||
}
|
||||
@@ -59,7 +61,7 @@ func NewOIDC(a *Authentication) *OIDC {
|
||||
// Use context with timeout for provider initialization
|
||||
ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
|
||||
defer cancel()
|
||||
|
||||
|
||||
provider, err := oidc.NewProvider(ctx, Keys.OpenIDConfig.Provider)
|
||||
if err != nil {
|
||||
cclog.Fatal(err)
|
||||
@@ -77,8 +79,7 @@ func NewOIDC(a *Authentication) *OIDC {
|
||||
ClientID: clientID,
|
||||
ClientSecret: clientSecret,
|
||||
Endpoint: provider.Endpoint(),
|
||||
RedirectURL: "oidc-callback",
|
||||
Scopes: []string{oidc.ScopeOpenID, "profile", "email"},
|
||||
Scopes: []string{oidc.ScopeOpenID, "profile"},
|
||||
}
|
||||
|
||||
oa := &OIDC{provider: provider, client: client, clientID: clientID, authentication: a}
|
||||
@@ -86,7 +87,7 @@ func NewOIDC(a *Authentication) *OIDC {
|
||||
return oa
|
||||
}
|
||||
|
||||
func (oa *OIDC) RegisterEndpoints(r *mux.Router) {
|
||||
func (oa *OIDC) RegisterEndpoints(r chi.Router) {
|
||||
r.HandleFunc("/oidc-login", oa.OAuth2Login)
|
||||
r.HandleFunc("/oidc-callback", oa.OAuth2Callback)
|
||||
}
|
||||
@@ -119,57 +120,96 @@ func (oa *OIDC) OAuth2Callback(rw http.ResponseWriter, r *http.Request) {
|
||||
// Exchange authorization code for token with timeout
|
||||
ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
|
||||
defer cancel()
|
||||
|
||||
|
||||
token, err := oa.client.Exchange(ctx, code, oauth2.VerifierOption(codeVerifier))
|
||||
if err != nil {
|
||||
http.Error(rw, "Failed to exchange token: "+err.Error(), http.StatusInternalServerError)
|
||||
cclog.Errorf("token exchange failed: %s", err.Error())
|
||||
http.Error(rw, "Authentication failed during token exchange", http.StatusInternalServerError)
|
||||
return
|
||||
}
|
||||
|
||||
// Get user info from OIDC provider with same timeout
|
||||
userInfo, err := oa.provider.UserInfo(ctx, oauth2.StaticTokenSource(token))
|
||||
if err != nil {
|
||||
http.Error(rw, "Failed to get userinfo: "+err.Error(), http.StatusInternalServerError)
|
||||
cclog.Errorf("failed to get userinfo: %s", err.Error())
|
||||
http.Error(rw, "Failed to retrieve user information", http.StatusInternalServerError)
|
||||
return
|
||||
}
|
||||
|
||||
// // Extract the ID Token from OAuth2 token.
|
||||
// rawIDToken, ok := token.Extra("id_token").(string)
|
||||
// if !ok {
|
||||
// http.Error(rw, "Cannot access idToken", http.StatusInternalServerError)
|
||||
// }
|
||||
//
|
||||
// verifier := oa.provider.Verifier(&oidc.Config{ClientID: oa.clientID})
|
||||
// // Parse and verify ID Token payload.
|
||||
// idToken, err := verifier.Verify(context.Background(), rawIDToken)
|
||||
// if err != nil {
|
||||
// http.Error(rw, "Failed to extract idToken: "+err.Error(), http.StatusInternalServerError)
|
||||
// }
|
||||
// Verify ID token and nonce to prevent replay attacks
|
||||
rawIDToken, ok := token.Extra("id_token").(string)
|
||||
if !ok {
|
||||
http.Error(rw, "ID token not found in response", http.StatusInternalServerError)
|
||||
return
|
||||
}
|
||||
|
||||
nonceCookie, err := r.Cookie("nonce")
|
||||
if err != nil {
|
||||
http.Error(rw, "nonce cookie not found", http.StatusBadRequest)
|
||||
return
|
||||
}
|
||||
|
||||
verifier := oa.provider.Verifier(&oidc.Config{ClientID: oa.clientID})
|
||||
idToken, err := verifier.Verify(ctx, rawIDToken)
|
||||
if err != nil {
|
||||
cclog.Errorf("ID token verification failed: %s", err.Error())
|
||||
http.Error(rw, "ID token verification failed", http.StatusInternalServerError)
|
||||
return
|
||||
}
|
||||
|
||||
if idToken.Nonce != nonceCookie.Value {
|
||||
http.Error(rw, "Nonce mismatch", http.StatusBadRequest)
|
||||
return
|
||||
}
|
||||
|
||||
projects := make([]string, 0)
|
||||
|
||||
// Extract custom claims
|
||||
// Extract custom claims from userinfo
|
||||
var claims struct {
|
||||
Username string `json:"preferred_username"`
|
||||
Name string `json:"name"`
|
||||
Profile struct {
|
||||
// Keycloak realm-level roles
|
||||
RealmAccess struct {
|
||||
Roles []string `json:"roles"`
|
||||
} `json:"realm_access"`
|
||||
// Keycloak client-level roles
|
||||
ResourceAccess struct {
|
||||
Client struct {
|
||||
Roles []string `json:"roles"`
|
||||
} `json:"clustercockpit"`
|
||||
} `json:"resource_access"`
|
||||
}
|
||||
if err := userInfo.Claims(&claims); err != nil {
|
||||
http.Error(rw, "Failed to extract Claims: "+err.Error(), http.StatusInternalServerError)
|
||||
cclog.Errorf("failed to extract claims: %s", err.Error())
|
||||
http.Error(rw, "Failed to extract user claims", http.StatusInternalServerError)
|
||||
return
|
||||
}
|
||||
|
||||
if claims.Username == "" {
|
||||
http.Error(rw, "Username claim missing from OIDC provider", http.StatusBadRequest)
|
||||
return
|
||||
}
|
||||
|
||||
// Merge roles from both client-level and realm-level access
|
||||
oidcRoles := append(claims.ResourceAccess.Client.Roles, claims.RealmAccess.Roles...)
|
||||
|
||||
roleSet := make(map[string]bool)
|
||||
for _, r := range oidcRoles {
|
||||
switch r {
|
||||
case "user":
|
||||
roleSet[schema.GetRoleString(schema.RoleUser)] = true
|
||||
case "admin":
|
||||
roleSet[schema.GetRoleString(schema.RoleAdmin)] = true
|
||||
case "manager":
|
||||
roleSet[schema.GetRoleString(schema.RoleManager)] = true
|
||||
case "support":
|
||||
roleSet[schema.GetRoleString(schema.RoleSupport)] = true
|
||||
}
|
||||
}
|
||||
|
||||
var roles []string
|
||||
for _, r := range claims.Profile.Client.Roles {
|
||||
switch r {
|
||||
case "user":
|
||||
roles = append(roles, schema.GetRoleString(schema.RoleUser))
|
||||
case "admin":
|
||||
roles = append(roles, schema.GetRoleString(schema.RoleAdmin))
|
||||
}
|
||||
for role := range roleSet {
|
||||
roles = append(roles, role)
|
||||
}
|
||||
|
||||
if len(roles) == 0 {
|
||||
@@ -188,8 +228,12 @@ func (oa *OIDC) OAuth2Callback(rw http.ResponseWriter, r *http.Request) {
|
||||
handleOIDCUser(user)
|
||||
}
|
||||
|
||||
oa.authentication.SaveSession(rw, r, user)
|
||||
cclog.Infof("login successfull: user: %#v (roles: %v, projects: %v)", user.Username, user.Roles, user.Projects)
|
||||
if err := oa.authentication.SaveSession(rw, r, user); err != nil {
|
||||
cclog.Errorf("session save failed for user %q: %s", user.Username, err.Error())
|
||||
http.Error(rw, "Failed to create session", http.StatusInternalServerError)
|
||||
return
|
||||
}
|
||||
cclog.Infof("login successful: user: %#v (roles: %v, projects: %v)", user.Username, user.Roles, user.Projects)
|
||||
userCtx := context.WithValue(r.Context(), repository.ContextUserKey, user)
|
||||
http.RedirectHandler("/", http.StatusTemporaryRedirect).ServeHTTP(rw, r.WithContext(userCtx))
|
||||
}
|
||||
@@ -206,7 +250,24 @@ func (oa *OIDC) OAuth2Login(rw http.ResponseWriter, r *http.Request) {
|
||||
codeVerifier := oauth2.GenerateVerifier()
|
||||
setCallbackCookie(rw, r, "verifier", codeVerifier)
|
||||
|
||||
// Generate nonce for ID token replay protection
|
||||
nonce, err := randString(16)
|
||||
if err != nil {
|
||||
http.Error(rw, "Internal error", http.StatusInternalServerError)
|
||||
return
|
||||
}
|
||||
setCallbackCookie(rw, r, "nonce", nonce)
|
||||
|
||||
// Build redirect URL from the incoming request
|
||||
scheme := "https"
|
||||
if r.TLS == nil && r.Header.Get("X-Forwarded-Proto") != "https" {
|
||||
scheme = "http"
|
||||
}
|
||||
oa.client.RedirectURL = fmt.Sprintf("%s://%s/oidc-callback", scheme, r.Host)
|
||||
|
||||
// Redirect user to consent page to ask for permission
|
||||
url := oa.client.AuthCodeURL(state, oauth2.AccessTypeOffline, oauth2.S256ChallengeOption(codeVerifier))
|
||||
url := oa.client.AuthCodeURL(state, oauth2.AccessTypeOffline,
|
||||
oauth2.S256ChallengeOption(codeVerifier),
|
||||
oidc.Nonce(nonce))
|
||||
http.Redirect(rw, r, url, http.StatusFound)
|
||||
}
|
||||
|
||||
@@ -15,37 +15,44 @@ var configSchema = `
|
||||
"description": "Configure how long a token is valid. As string parsable by time.ParseDuration()",
|
||||
"type": "string"
|
||||
},
|
||||
"cookieName": {
|
||||
"cookie-name": {
|
||||
"description": "Cookie that should be checked for a JWT token.",
|
||||
"type": "string"
|
||||
},
|
||||
"validateUser": {
|
||||
"validate-user": {
|
||||
"description": "Deny login for users not in database (but defined in JWT). Overwrite roles in JWT with database roles.",
|
||||
"type": "boolean"
|
||||
},
|
||||
"trustedIssuer": {
|
||||
"trusted-issuer": {
|
||||
"description": "Issuer that should be accepted when validating external JWTs ",
|
||||
"type": "string"
|
||||
},
|
||||
"syncUserOnLogin": {
|
||||
"sync-user-on-login": {
|
||||
"description": "Add non-existent user to DB at login attempt with values provided in JWT.",
|
||||
"type": "boolean"
|
||||
},
|
||||
"update-user-on-login": {
|
||||
"description": "Should an existent user attributes in the DB be updated at login attempt with values provided in JWT.",
|
||||
"type": "boolean"
|
||||
}
|
||||
},
|
||||
"required": ["max-age"]
|
||||
},
|
||||
"oidc": {
|
||||
"provider": {
|
||||
"description": "",
|
||||
"type": "string"
|
||||
},
|
||||
"syncUserOnLogin": {
|
||||
"description": "",
|
||||
"type": "boolean"
|
||||
},
|
||||
"updateUserOnLogin": {
|
||||
"description": "",
|
||||
"type": "boolean"
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"provider": {
|
||||
"description": "OpenID Connect provider URL.",
|
||||
"type": "string"
|
||||
},
|
||||
"sync-user-on-login": {
|
||||
"description": "Add non-existent user to DB at login attempt with values provided.",
|
||||
"type": "boolean"
|
||||
},
|
||||
"update-user-on-login": {
|
||||
"description": "Should an existent user attributes in the DB be updated at login attempt with values provided.",
|
||||
"type": "boolean"
|
||||
}
|
||||
},
|
||||
"required": ["provider"]
|
||||
},
|
||||
@@ -57,40 +64,48 @@ var configSchema = `
|
||||
"description": "URL of LDAP directory server.",
|
||||
"type": "string"
|
||||
},
|
||||
"user_base": {
|
||||
"user-base": {
|
||||
"description": "Base DN of user tree root.",
|
||||
"type": "string"
|
||||
},
|
||||
"search_dn": {
|
||||
"search-dn": {
|
||||
"description": "DN for authenticating LDAP admin account with general read rights.",
|
||||
"type": "string"
|
||||
},
|
||||
"user_bind": {
|
||||
"user-bind": {
|
||||
"description": "Expression used to authenticate users via LDAP bind. Must contain uid={username}.",
|
||||
"type": "string"
|
||||
},
|
||||
"user_filter": {
|
||||
"user-filter": {
|
||||
"description": "Filter to extract users for syncing.",
|
||||
"type": "string"
|
||||
},
|
||||
"username_attr": {
|
||||
"username-attr": {
|
||||
"description": "Attribute with full username. Default: gecos",
|
||||
"type": "string"
|
||||
},
|
||||
"sync_interval": {
|
||||
"sync-interval": {
|
||||
"description": "Interval used for syncing local user table with LDAP directory. Parsed using time.ParseDuration.",
|
||||
"type": "string"
|
||||
},
|
||||
"sync_del_old_users": {
|
||||
"sync-del-old-users": {
|
||||
"description": "Delete obsolete users in database.",
|
||||
"type": "boolean"
|
||||
},
|
||||
"syncUserOnLogin": {
|
||||
"uid-attr": {
|
||||
"description": "LDAP attribute used as login username. Default: uid",
|
||||
"type": "string"
|
||||
},
|
||||
"sync-user-on-login": {
|
||||
"description": "Add non-existent user to DB at login attempt if user exists in Ldap directory",
|
||||
"type": "boolean"
|
||||
},
|
||||
"update-user-on-login": {
|
||||
"description": "Should an existent user attributes in the DB be updated at login attempt with values from LDAP.",
|
||||
"type": "boolean"
|
||||
}
|
||||
},
|
||||
"required": ["url", "user_base", "search_dn", "user_bind", "user_filter"]
|
||||
"required": ["url", "user-base", "search-dn", "user-bind", "user-filter"]
|
||||
},
|
||||
"required": ["jwts"]
|
||||
}`
|
||||
|
||||
@@ -11,8 +11,8 @@ import (
|
||||
"encoding/json"
|
||||
"time"
|
||||
|
||||
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
|
||||
"github.com/ClusterCockpit/cc-lib/resampler"
|
||||
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
|
||||
"github.com/ClusterCockpit/cc-lib/v2/resampler"
|
||||
)
|
||||
|
||||
type ProgramConfig struct {
|
||||
@@ -20,7 +20,9 @@ type ProgramConfig struct {
|
||||
Addr string `json:"addr"`
|
||||
|
||||
// Addresses from which secured admin API endpoints can be reached, can be wildcard "*"
|
||||
APIAllowedIPs []string `json:"apiAllowedIPs"`
|
||||
APIAllowedIPs []string `json:"api-allowed-ips"`
|
||||
|
||||
APISubjects *NATSConfig `json:"api-subjects"`
|
||||
|
||||
// Drop root permissions once .env was read and the port was taken.
|
||||
User string `json:"user"`
|
||||
@@ -35,16 +37,9 @@ type ProgramConfig struct {
|
||||
EmbedStaticFiles bool `json:"embed-static-files"`
|
||||
StaticFiles string `json:"static-files"`
|
||||
|
||||
// 'sqlite3' or 'mysql' (mysql will work for mariadb as well)
|
||||
DBDriver string `json:"db-driver"`
|
||||
|
||||
// For sqlite3 a filename, for mysql a DSN in this format: https://github.com/go-sql-driver/mysql#dsn-data-source-name (Without query parameters!).
|
||||
// Path to SQLite database file
|
||||
DB string `json:"db"`
|
||||
|
||||
// Keep all metric data in the metric data repositories,
|
||||
// do not write to the job-archive.
|
||||
DisableArchive bool `json:"disable-archive"`
|
||||
|
||||
EnableJobTaggers bool `json:"enable-job-taggers"`
|
||||
|
||||
// Validate json input against schema
|
||||
@@ -76,17 +71,42 @@ type ProgramConfig struct {
|
||||
|
||||
// If exists, will enable dynamic zoom in frontend metric plots using the configured values
|
||||
EnableResampling *ResampleConfig `json:"resampling"`
|
||||
|
||||
// Systemd unit name for log viewer (default: "clustercockpit")
|
||||
SystemdUnit string `json:"systemd-unit"`
|
||||
|
||||
// Node state retention configuration
|
||||
NodeStateRetention *NodeStateRetention `json:"nodestate-retention"`
|
||||
}
|
||||
|
||||
type NodeStateRetention struct {
|
||||
Policy string `json:"policy"` // "delete" or "move"
|
||||
Age int `json:"age"` // hours, default 24
|
||||
TargetKind string `json:"target-kind"` // "file" or "s3"
|
||||
TargetPath string `json:"target-path"`
|
||||
TargetEndpoint string `json:"target-endpoint"`
|
||||
TargetBucket string `json:"target-bucket"`
|
||||
TargetAccessKey string `json:"target-access-key"`
|
||||
TargetSecretKey string `json:"target-secret-key"`
|
||||
TargetRegion string `json:"target-region"`
|
||||
TargetUsePathStyle bool `json:"target-use-path-style"`
|
||||
MaxFileSizeMB int `json:"max-file-size-mb"`
|
||||
}
|
||||
|
||||
type ResampleConfig struct {
|
||||
// Minimum number of points to trigger resampling of data
|
||||
MinimumPoints int `json:"minimumPoints"`
|
||||
MinimumPoints int `json:"minimum-points"`
|
||||
// Array of resampling target resolutions, in seconds; Example: [600,300,60]
|
||||
Resolutions []int `json:"resolutions"`
|
||||
// Trigger next zoom level at less than this many visible datapoints
|
||||
Trigger int `json:"trigger"`
|
||||
}
|
||||
|
||||
type NATSConfig struct {
|
||||
SubjectJobEvent string `json:"subject-job-event"`
|
||||
SubjectNodeState string `json:"subject-node-state"`
|
||||
}
|
||||
|
||||
type IntRange struct {
|
||||
From int `json:"from"`
|
||||
To int `json:"to"`
|
||||
@@ -100,32 +120,20 @@ type TimeRange struct {
|
||||
|
||||
type FilterRanges struct {
|
||||
Duration *IntRange `json:"duration"`
|
||||
NumNodes *IntRange `json:"numNodes"`
|
||||
StartTime *TimeRange `json:"startTime"`
|
||||
NumNodes *IntRange `json:"num-nodes"`
|
||||
StartTime *TimeRange `json:"start-time"`
|
||||
}
|
||||
|
||||
type ClusterConfig struct {
|
||||
Name string `json:"name"`
|
||||
FilterRanges *FilterRanges `json:"filterRanges"`
|
||||
MetricDataRepository json.RawMessage `json:"metricDataRepository"`
|
||||
}
|
||||
|
||||
var Clusters []*ClusterConfig
|
||||
|
||||
var Keys ProgramConfig = ProgramConfig{
|
||||
Addr: "localhost:8080",
|
||||
DisableAuthentication: false,
|
||||
EmbedStaticFiles: true,
|
||||
DBDriver: "sqlite3",
|
||||
DB: "./var/job.db",
|
||||
DisableArchive: false,
|
||||
Validate: false,
|
||||
SessionMaxAge: "168h",
|
||||
StopJobsExceedingWalltime: 0,
|
||||
ShortRunningJobsDuration: 5 * 60,
|
||||
}
|
||||
|
||||
func Init(mainConfig json.RawMessage, clusterConfig json.RawMessage) {
|
||||
func Init(mainConfig json.RawMessage) {
|
||||
Validate(configSchema, mainConfig)
|
||||
dec := json.NewDecoder(bytes.NewReader(mainConfig))
|
||||
dec.DisallowUnknownFields()
|
||||
@@ -133,17 +141,6 @@ func Init(mainConfig json.RawMessage, clusterConfig json.RawMessage) {
|
||||
cclog.Abortf("Config Init: Could not decode config file '%s'.\nError: %s\n", mainConfig, err.Error())
|
||||
}
|
||||
|
||||
Validate(clustersSchema, clusterConfig)
|
||||
dec = json.NewDecoder(bytes.NewReader(clusterConfig))
|
||||
dec.DisallowUnknownFields()
|
||||
if err := dec.Decode(&Clusters); err != nil {
|
||||
cclog.Abortf("Config Init: Could not decode config file '%s'.\nError: %s\n", mainConfig, err.Error())
|
||||
}
|
||||
|
||||
if len(Clusters) < 1 {
|
||||
cclog.Abort("Config Init: At least one cluster required in config. Exited with error.")
|
||||
}
|
||||
|
||||
if Keys.EnableResampling != nil && Keys.EnableResampling.MinimumPoints > 0 {
|
||||
resampler.SetMinimumRequiredPoints(Keys.EnableResampling.MinimumPoints)
|
||||
}
|
||||
|
||||
@@ -8,19 +8,15 @@ package config
|
||||
import (
|
||||
"testing"
|
||||
|
||||
ccconf "github.com/ClusterCockpit/cc-lib/ccConfig"
|
||||
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
|
||||
ccconf "github.com/ClusterCockpit/cc-lib/v2/ccConfig"
|
||||
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
|
||||
)
|
||||
|
||||
func TestInit(t *testing.T) {
|
||||
fp := "../../configs/config.json"
|
||||
ccconf.Init(fp)
|
||||
if cfg := ccconf.GetPackageConfig("main"); cfg != nil {
|
||||
if clustercfg := ccconf.GetPackageConfig("clusters"); clustercfg != nil {
|
||||
Init(cfg, clustercfg)
|
||||
} else {
|
||||
cclog.Abort("Cluster configuration must be present")
|
||||
}
|
||||
Init(cfg)
|
||||
} else {
|
||||
cclog.Abort("Main configuration must be present")
|
||||
}
|
||||
@@ -34,11 +30,7 @@ func TestInitMinimal(t *testing.T) {
|
||||
fp := "../../configs/config-demo.json"
|
||||
ccconf.Init(fp)
|
||||
if cfg := ccconf.GetPackageConfig("main"); cfg != nil {
|
||||
if clustercfg := ccconf.GetPackageConfig("clusters"); clustercfg != nil {
|
||||
Init(cfg, clustercfg)
|
||||
} else {
|
||||
cclog.Abort("Cluster configuration must be present")
|
||||
}
|
||||
Init(cfg)
|
||||
} else {
|
||||
cclog.Abort("Main configuration must be present")
|
||||
}
|
||||
|
||||
@@ -15,7 +15,7 @@ import (
|
||||
|
||||
type DefaultMetricsCluster struct {
|
||||
Name string `json:"name"`
|
||||
DefaultMetrics string `json:"default_metrics"`
|
||||
DefaultMetrics string `json:"default-metrics"`
|
||||
}
|
||||
|
||||
type DefaultMetricsConfig struct {
|
||||
|
||||
@@ -6,14 +6,14 @@
|
||||
package config
|
||||
|
||||
var configSchema = `
|
||||
{
|
||||
{
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"addr": {
|
||||
"description": "Address where the http (or https) server will listen on (for example: 'localhost:80').",
|
||||
"type": "string"
|
||||
},
|
||||
"apiAllowedIPs": {
|
||||
"api-allowed-ips": {
|
||||
"description": "Addresses from which secured API endpoints can be reached",
|
||||
"type": "array",
|
||||
"items": {
|
||||
@@ -41,13 +41,9 @@ var configSchema = `
|
||||
"type": "string"
|
||||
},
|
||||
"db": {
|
||||
"description": "For sqlite3 a filename, for mysql a DSN in this format: https://github.com/go-sql-driver/mysql#dsn-data-source-name (Without query parameters!).",
|
||||
"description": "Path to SQLite database file (e.g., './var/job.db')",
|
||||
"type": "string"
|
||||
},
|
||||
"disable-archive": {
|
||||
"description": "Keep all metric data in the metric data repositories, do not write to the job-archive.",
|
||||
"type": "boolean"
|
||||
},
|
||||
"enable-job-taggers": {
|
||||
"description": "Turn on automatic application and jobclass taggers",
|
||||
"type": "boolean"
|
||||
@@ -81,28 +77,22 @@ var configSchema = `
|
||||
"type": "integer"
|
||||
},
|
||||
"emission-constant": {
|
||||
"description": ".",
|
||||
"description": "Energy mix CO2 emission constant [g/kWh]. If set, displays estimated CO2 emission for jobs.",
|
||||
"type": "integer"
|
||||
},
|
||||
"cron-frequency": {
|
||||
"description": "Frequency of cron job workers.",
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"duration-worker": {
|
||||
"description": "Duration Update Worker [Defaults to '5m']",
|
||||
"type": "string"
|
||||
},
|
||||
"footprint-worker": {
|
||||
"description": "Metric-Footprint Update Worker [Defaults to '10m']",
|
||||
"type": "string"
|
||||
}
|
||||
}
|
||||
"machine-state-dir": {
|
||||
"description": "Where to store MachineState files.",
|
||||
"type": "string"
|
||||
},
|
||||
"enable-resampling": {
|
||||
"systemd-unit": {
|
||||
"description": "Systemd unit name for log viewer (default: 'clustercockpit').",
|
||||
"type": "string"
|
||||
},
|
||||
"resampling": {
|
||||
"description": "Enable dynamic zoom in frontend metric plots.",
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"minimumPoints": {
|
||||
"minimum-points": {
|
||||
"description": "Minimum points to trigger resampling of time-series data.",
|
||||
"type": "integer"
|
||||
},
|
||||
@@ -119,87 +109,74 @@ var configSchema = `
|
||||
}
|
||||
},
|
||||
"required": ["trigger", "resolutions"]
|
||||
}
|
||||
},
|
||||
"required": ["apiAllowedIPs"]
|
||||
}`
|
||||
|
||||
var clustersSchema = `
|
||||
{
|
||||
"type": "array",
|
||||
"items": {
|
||||
},
|
||||
"api-subjects": {
|
||||
"description": "NATS subjects configuration for subscribing to job and node events.",
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"name": {
|
||||
"description": "The name of the cluster.",
|
||||
"subject-job-event": {
|
||||
"description": "NATS subject for job events (start_job, stop_job)",
|
||||
"type": "string"
|
||||
},
|
||||
"metricDataRepository": {
|
||||
"description": "Type of the metric data repository for this cluster",
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"kind": {
|
||||
"type": "string",
|
||||
"enum": ["influxdb", "prometheus", "cc-metric-store", "cc-metric-store-internal", "test"]
|
||||
},
|
||||
"url": {
|
||||
"type": "string"
|
||||
},
|
||||
"token": {
|
||||
"type": "string"
|
||||
}
|
||||
},
|
||||
"required": ["kind"]
|
||||
},
|
||||
"filterRanges": {
|
||||
"description": "This option controls the slider ranges for the UI controls of numNodes, duration, and startTime.",
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"numNodes": {
|
||||
"description": "UI slider range for number of nodes",
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"from": {
|
||||
"type": "integer"
|
||||
},
|
||||
"to": {
|
||||
"type": "integer"
|
||||
}
|
||||
},
|
||||
"required": ["from", "to"]
|
||||
},
|
||||
"duration": {
|
||||
"description": "UI slider range for duration",
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"from": {
|
||||
"type": "integer"
|
||||
},
|
||||
"to": {
|
||||
"type": "integer"
|
||||
}
|
||||
},
|
||||
"required": ["from", "to"]
|
||||
},
|
||||
"startTime": {
|
||||
"description": "UI slider range for start time",
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"from": {
|
||||
"type": "string",
|
||||
"format": "date-time"
|
||||
},
|
||||
"to": {
|
||||
"type": "null"
|
||||
}
|
||||
},
|
||||
"required": ["from", "to"]
|
||||
}
|
||||
},
|
||||
"required": ["numNodes", "duration", "startTime"]
|
||||
"subject-node-state": {
|
||||
"description": "NATS subject for node state updates",
|
||||
"type": "string"
|
||||
}
|
||||
},
|
||||
"required": ["name", "metricDataRepository", "filterRanges"],
|
||||
"minItems": 1
|
||||
"required": ["subject-job-event", "subject-node-state"]
|
||||
},
|
||||
"nodestate-retention": {
|
||||
"description": "Node state retention configuration for cleaning up old node_state rows.",
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"policy": {
|
||||
"description": "Retention policy: 'delete' to remove old rows, 'move' to archive to Parquet then delete.",
|
||||
"type": "string",
|
||||
"enum": ["delete", "move"]
|
||||
},
|
||||
"age": {
|
||||
"description": "Retention age in hours (default: 24).",
|
||||
"type": "integer"
|
||||
},
|
||||
"target-kind": {
|
||||
"description": "Target kind for parquet archiving: 'file' or 's3'.",
|
||||
"type": "string",
|
||||
"enum": ["file", "s3"]
|
||||
},
|
||||
"target-path": {
|
||||
"description": "Filesystem path for parquet file target.",
|
||||
"type": "string"
|
||||
},
|
||||
"target-endpoint": {
|
||||
"description": "S3 endpoint URL.",
|
||||
"type": "string"
|
||||
},
|
||||
"target-bucket": {
|
||||
"description": "S3 bucket name.",
|
||||
"type": "string"
|
||||
},
|
||||
"target-access-key": {
|
||||
"description": "S3 access key.",
|
||||
"type": "string"
|
||||
},
|
||||
"target-secret-key": {
|
||||
"description": "S3 secret key.",
|
||||
"type": "string"
|
||||
},
|
||||
"target-region": {
|
||||
"description": "S3 region.",
|
||||
"type": "string"
|
||||
},
|
||||
"target-use-path-style": {
|
||||
"description": "Use path-style S3 addressing.",
|
||||
"type": "boolean"
|
||||
},
|
||||
"max-file-size-mb": {
|
||||
"description": "Maximum parquet file size in MB (default: 128).",
|
||||
"type": "integer"
|
||||
}
|
||||
},
|
||||
"required": ["policy"]
|
||||
}
|
||||
}`
|
||||
}
|
||||
}`
|
||||
|
||||
@@ -8,7 +8,7 @@ package config
|
||||
import (
|
||||
"encoding/json"
|
||||
|
||||
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
|
||||
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
|
||||
"github.com/santhosh-tekuri/jsonschema/v5"
|
||||
)
|
||||
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -10,9 +10,21 @@ import (
|
||||
"time"
|
||||
|
||||
"github.com/ClusterCockpit/cc-backend/internal/config"
|
||||
"github.com/ClusterCockpit/cc-lib/schema"
|
||||
"github.com/ClusterCockpit/cc-lib/v2/schema"
|
||||
)
|
||||
|
||||
type ClusterMetricWithName struct {
|
||||
Name string `json:"name"`
|
||||
Unit *schema.Unit `json:"unit,omitempty"`
|
||||
Timestep int `json:"timestep"`
|
||||
Data []schema.Float `json:"data"`
|
||||
}
|
||||
|
||||
type ClusterMetrics struct {
|
||||
NodeCount int `json:"nodeCount"`
|
||||
Metrics []*ClusterMetricWithName `json:"metrics"`
|
||||
}
|
||||
|
||||
type Count struct {
|
||||
Name string `json:"name"`
|
||||
Count int `json:"count"`
|
||||
@@ -59,6 +71,7 @@ type JobFilter struct {
|
||||
Project *StringInput `json:"project,omitempty"`
|
||||
JobName *StringInput `json:"jobName,omitempty"`
|
||||
Cluster *StringInput `json:"cluster,omitempty"`
|
||||
SubCluster *StringInput `json:"subCluster,omitempty"`
|
||||
Partition *StringInput `json:"partition,omitempty"`
|
||||
Duration *config.IntRange `json:"duration,omitempty"`
|
||||
Energy *FloatRange `json:"energy,omitempty"`
|
||||
@@ -70,6 +83,7 @@ type JobFilter struct {
|
||||
State []schema.JobState `json:"state,omitempty"`
|
||||
MetricStats []*MetricStatItem `json:"metricStats,omitempty"`
|
||||
Shared *string `json:"shared,omitempty"`
|
||||
Schedule *string `json:"schedule,omitempty"`
|
||||
Node *StringInput `json:"node,omitempty"`
|
||||
}
|
||||
|
||||
@@ -173,7 +187,7 @@ type NamedStatsWithScope struct {
|
||||
type NodeFilter struct {
|
||||
Hostname *StringInput `json:"hostname,omitempty"`
|
||||
Cluster *StringInput `json:"cluster,omitempty"`
|
||||
Subcluster *StringInput `json:"subcluster,omitempty"`
|
||||
SubCluster *StringInput `json:"subCluster,omitempty"`
|
||||
SchedulerState *schema.SchedulerState `json:"schedulerState,omitempty"`
|
||||
HealthState *string `json:"healthState,omitempty"`
|
||||
TimeStart *int `json:"timeStart,omitempty"`
|
||||
|
||||
@@ -4,7 +4,7 @@ import (
|
||||
"sync"
|
||||
|
||||
"github.com/ClusterCockpit/cc-backend/internal/repository"
|
||||
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
|
||||
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
|
||||
"github.com/jmoiron/sqlx"
|
||||
)
|
||||
|
||||
|
||||
@@ -1,13 +1,15 @@
|
||||
package graph
|
||||
|
||||
// This file will be automatically regenerated based on the schema, any resolver implementations
|
||||
// This file will be automatically regenerated based on the schema, any resolver
|
||||
// implementations
|
||||
// will be copied through when generating and any unknown code will be moved to the end.
|
||||
// Code generated by github.com/99designs/gqlgen version v0.17.81
|
||||
// Code generated by github.com/99designs/gqlgen version v0.17.87
|
||||
|
||||
import (
|
||||
"context"
|
||||
"errors"
|
||||
"fmt"
|
||||
"math"
|
||||
"regexp"
|
||||
"slices"
|
||||
"strconv"
|
||||
@@ -17,11 +19,12 @@ import (
|
||||
"github.com/ClusterCockpit/cc-backend/internal/config"
|
||||
"github.com/ClusterCockpit/cc-backend/internal/graph/generated"
|
||||
"github.com/ClusterCockpit/cc-backend/internal/graph/model"
|
||||
"github.com/ClusterCockpit/cc-backend/internal/metricDataDispatcher"
|
||||
"github.com/ClusterCockpit/cc-backend/internal/metricdispatch"
|
||||
"github.com/ClusterCockpit/cc-backend/internal/repository"
|
||||
"github.com/ClusterCockpit/cc-backend/pkg/archive"
|
||||
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
|
||||
"github.com/ClusterCockpit/cc-lib/schema"
|
||||
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
|
||||
ccunit "github.com/ClusterCockpit/cc-lib/v2/ccUnits"
|
||||
"github.com/ClusterCockpit/cc-lib/v2/schema"
|
||||
)
|
||||
|
||||
// Partitions is the resolver for the partitions field.
|
||||
@@ -86,14 +89,14 @@ func (r *jobResolver) EnergyFootprint(ctx context.Context, obj *schema.Job) ([]*
|
||||
res := []*model.EnergyFootprintValue{}
|
||||
for name, value := range rawEnergyFootprint {
|
||||
// Suboptimal: Nearly hardcoded metric name expectations
|
||||
matchCpu := regexp.MustCompile(`cpu|Cpu|CPU`)
|
||||
matchCPU := regexp.MustCompile(`cpu|Cpu|CPU`)
|
||||
matchAcc := regexp.MustCompile(`acc|Acc|ACC`)
|
||||
matchMem := regexp.MustCompile(`mem|Mem|MEM`)
|
||||
matchCore := regexp.MustCompile(`core|Core|CORE`)
|
||||
|
||||
hwType := ""
|
||||
switch test := name; { // NOtice ';' for var declaration
|
||||
case matchCpu.MatchString(test):
|
||||
case matchCPU.MatchString(test):
|
||||
hwType = "CPU"
|
||||
case matchAcc.MatchString(test):
|
||||
hwType = "Accelerator"
|
||||
@@ -173,9 +176,9 @@ func (r *mutationResolver) AddTagsToJob(ctx context.Context, job string, tagIds
|
||||
}
|
||||
|
||||
tags := []*schema.Tag{}
|
||||
for _, tagId := range tagIds {
|
||||
for _, tagID := range tagIds {
|
||||
// Get ID
|
||||
tid, err := strconv.ParseInt(tagId, 10, 64)
|
||||
tid, err := strconv.ParseInt(tagID, 10, 64)
|
||||
if err != nil {
|
||||
cclog.Warn("Error while parsing tag id")
|
||||
return nil, err
|
||||
@@ -220,9 +223,9 @@ func (r *mutationResolver) RemoveTagsFromJob(ctx context.Context, job string, ta
|
||||
}
|
||||
|
||||
tags := []*schema.Tag{}
|
||||
for _, tagId := range tagIds {
|
||||
for _, tagID := range tagIds {
|
||||
// Get ID
|
||||
tid, err := strconv.ParseInt(tagId, 10, 64)
|
||||
tid, err := strconv.ParseInt(tagID, 10, 64)
|
||||
if err != nil {
|
||||
cclog.Warn("Error while parsing tag id")
|
||||
return nil, err
|
||||
@@ -263,9 +266,9 @@ func (r *mutationResolver) RemoveTagFromList(ctx context.Context, tagIds []strin
|
||||
}
|
||||
|
||||
tags := []int{}
|
||||
for _, tagId := range tagIds {
|
||||
for _, tagID := range tagIds {
|
||||
// Get ID
|
||||
tid, err := strconv.ParseInt(tagId, 10, 64)
|
||||
tid, err := strconv.ParseInt(tagID, 10, 64)
|
||||
if err != nil {
|
||||
cclog.Warn("Error while parsing tag id for removal")
|
||||
return nil, err
|
||||
@@ -281,7 +284,7 @@ func (r *mutationResolver) RemoveTagFromList(ctx context.Context, tagIds []strin
|
||||
// Test Access: Admins && Admin Tag OR Everyone && Private Tag
|
||||
if user.HasRole(schema.RoleAdmin) && (tscope == "global" || tscope == "admin") || user.Username == tscope {
|
||||
// Remove from DB
|
||||
if err = r.Repo.RemoveTagById(tid); err != nil {
|
||||
if err = r.Repo.RemoveTagByID(tid); err != nil {
|
||||
cclog.Warn("Error while removing tag")
|
||||
return nil, err
|
||||
} else {
|
||||
@@ -315,18 +318,39 @@ func (r *nodeResolver) SchedulerState(ctx context.Context, obj *schema.Node) (sc
|
||||
if obj.NodeState != "" {
|
||||
return obj.NodeState, nil
|
||||
} else {
|
||||
return "", fmt.Errorf("No SchedulerState (NodeState) on Object")
|
||||
return "", fmt.Errorf("resolver: no SchedulerState (NodeState) on node object")
|
||||
}
|
||||
}
|
||||
|
||||
// HealthState is the resolver for the healthState field.
|
||||
func (r *nodeResolver) HealthState(ctx context.Context, obj *schema.Node) (string, error) {
|
||||
panic(fmt.Errorf("not implemented: HealthState - healthState"))
|
||||
if obj.HealthState != "" {
|
||||
return string(obj.HealthState), nil
|
||||
} else {
|
||||
return "", fmt.Errorf("resolver: no HealthState (NodeState) on node object")
|
||||
}
|
||||
}
|
||||
|
||||
// MetaData is the resolver for the metaData field.
|
||||
func (r *nodeResolver) MetaData(ctx context.Context, obj *schema.Node) (any, error) {
|
||||
panic(fmt.Errorf("not implemented: MetaData - metaData"))
|
||||
if obj.MetaData != nil {
|
||||
return obj.MetaData, nil
|
||||
} else {
|
||||
cclog.Debug("resolver: no MetaData (NodeState) on node object")
|
||||
emptyMeta := make(map[string]string, 0)
|
||||
return emptyMeta, nil
|
||||
}
|
||||
}
|
||||
|
||||
// HealthData is the resolver for the healthData field.
|
||||
func (r *nodeResolver) HealthData(ctx context.Context, obj *schema.Node) (any, error) {
|
||||
if obj.HealthData != nil {
|
||||
return obj.HealthData, nil
|
||||
} else {
|
||||
cclog.Debug("resolver: no HealthData (NodeState) on node object")
|
||||
emptyHealth := make(map[string][]string, 0)
|
||||
return emptyHealth, nil
|
||||
}
|
||||
}
|
||||
|
||||
// Clusters is the resolver for the clusters field.
|
||||
@@ -341,6 +365,14 @@ func (r *queryResolver) Tags(ctx context.Context) ([]*schema.Tag, error) {
|
||||
|
||||
// GlobalMetrics is the resolver for the globalMetrics field.
|
||||
func (r *queryResolver) GlobalMetrics(ctx context.Context) ([]*schema.GlobalMetricListItem, error) {
|
||||
user := repository.GetUserFromContext(ctx)
|
||||
|
||||
if user != nil {
|
||||
if user.HasRole(schema.RoleUser) || user.HasRole(schema.RoleManager) {
|
||||
return archive.GlobalUserMetricList, nil
|
||||
}
|
||||
}
|
||||
|
||||
return archive.GlobalMetricList, nil
|
||||
}
|
||||
|
||||
@@ -371,12 +403,12 @@ func (r *queryResolver) AllocatedNodes(ctx context.Context, cluster string) ([]*
|
||||
// Node is the resolver for the node field.
|
||||
func (r *queryResolver) Node(ctx context.Context, id string) (*schema.Node, error) {
|
||||
repo := repository.GetNodeRepository()
|
||||
numericId, err := strconv.ParseInt(id, 10, 64)
|
||||
numericID, err := strconv.ParseInt(id, 10, 64)
|
||||
if err != nil {
|
||||
cclog.Warn("Error while parsing job id")
|
||||
return nil, err
|
||||
}
|
||||
return repo.GetNodeById(numericId, false)
|
||||
return repo.GetNodeByID(numericID, false)
|
||||
}
|
||||
|
||||
// Nodes is the resolver for the nodes field.
|
||||
@@ -387,6 +419,15 @@ func (r *queryResolver) Nodes(ctx context.Context, filter []*model.NodeFilter, o
|
||||
return &model.NodeStateResultList{Items: nodes, Count: &count}, err
|
||||
}
|
||||
|
||||
// NodesWithMeta is the resolver for the nodesWithMeta field.
|
||||
func (r *queryResolver) NodesWithMeta(ctx context.Context, filter []*model.NodeFilter, order *model.OrderByInput) (*model.NodeStateResultList, error) {
|
||||
// Why Extra Handler? -> graphql.CollectAllFields(ctx) only returns toplevel fields (i.e.: items, count), and not subfields like item.metaData
|
||||
repo := repository.GetNodeRepository()
|
||||
nodes, err := repo.QueryNodesWithMeta(ctx, filter, nil, order) // Ignore Paging, Order Unused
|
||||
count := len(nodes)
|
||||
return &model.NodeStateResultList{Items: nodes, Count: &count}, err
|
||||
}
|
||||
|
||||
// NodeStates is the resolver for the nodeStates field.
|
||||
func (r *queryResolver) NodeStates(ctx context.Context, filter []*model.NodeFilter) ([]*model.NodeStates, error) {
|
||||
repo := repository.GetNodeRepository()
|
||||
@@ -403,8 +444,7 @@ func (r *queryResolver) NodeStates(ctx context.Context, filter []*model.NodeFilt
|
||||
return nil, herr
|
||||
}
|
||||
|
||||
allCounts := make([]*model.NodeStates, 0)
|
||||
allCounts = append(stateCounts, healthCounts...)
|
||||
allCounts := append(stateCounts, healthCounts...)
|
||||
|
||||
return allCounts, nil
|
||||
}
|
||||
@@ -431,18 +471,18 @@ func (r *queryResolver) NodeStatesTimed(ctx context.Context, filter []*model.Nod
|
||||
return healthCounts, nil
|
||||
}
|
||||
|
||||
return nil, errors.New("Unknown Node State Query Type")
|
||||
return nil, errors.New("unknown Node State Query Type")
|
||||
}
|
||||
|
||||
// Job is the resolver for the job field.
|
||||
func (r *queryResolver) Job(ctx context.Context, id string) (*schema.Job, error) {
|
||||
numericId, err := strconv.ParseInt(id, 10, 64)
|
||||
numericID, err := strconv.ParseInt(id, 10, 64)
|
||||
if err != nil {
|
||||
cclog.Warn("Error while parsing job id")
|
||||
return nil, err
|
||||
}
|
||||
|
||||
job, err := r.Repo.FindById(ctx, numericId)
|
||||
job, err := r.Repo.FindByID(ctx, numericID)
|
||||
if err != nil {
|
||||
cclog.Warn("Error while finding job by id")
|
||||
return nil, err
|
||||
@@ -475,7 +515,7 @@ func (r *queryResolver) JobMetrics(ctx context.Context, id string, metrics []str
|
||||
return nil, err
|
||||
}
|
||||
|
||||
data, err := metricDataDispatcher.LoadData(job, metrics, scopes, ctx, *resolution)
|
||||
data, err := metricdispatch.LoadData(job, metrics, scopes, ctx, *resolution)
|
||||
if err != nil {
|
||||
cclog.Warn("Error while loading job data")
|
||||
return nil, err
|
||||
@@ -503,7 +543,7 @@ func (r *queryResolver) JobStats(ctx context.Context, id string, metrics []strin
|
||||
return nil, err
|
||||
}
|
||||
|
||||
data, err := metricDataDispatcher.LoadJobStats(job, metrics, ctx)
|
||||
data, err := metricdispatch.LoadJobStats(job, metrics, ctx)
|
||||
if err != nil {
|
||||
cclog.Warnf("Error while loading jobStats data for job id %s", id)
|
||||
return nil, err
|
||||
@@ -528,7 +568,7 @@ func (r *queryResolver) ScopedJobStats(ctx context.Context, id string, metrics [
|
||||
return nil, err
|
||||
}
|
||||
|
||||
data, err := metricDataDispatcher.LoadScopedJobStats(job, metrics, scopes, ctx)
|
||||
data, err := metricdispatch.LoadScopedJobStats(job, metrics, scopes, ctx)
|
||||
if err != nil {
|
||||
cclog.Warnf("Error while loading scopedJobStats data for job id %s", id)
|
||||
return nil, err
|
||||
@@ -542,7 +582,7 @@ func (r *queryResolver) ScopedJobStats(ctx context.Context, id string, metrics [
|
||||
for _, stat := range stats {
|
||||
mdlStats = append(mdlStats, &model.ScopedStats{
|
||||
Hostname: stat.Hostname,
|
||||
ID: stat.Id,
|
||||
ID: stat.ID,
|
||||
Data: stat.Data,
|
||||
})
|
||||
}
|
||||
@@ -581,21 +621,24 @@ func (r *queryResolver) Jobs(ctx context.Context, filter []*model.JobFilter, pag
|
||||
|
||||
// Note: Even if App-Default 'config.Keys.UiDefaults["job_list_usePaging"]' is set, always return hasNextPage boolean.
|
||||
// Users can decide in frontend to use continuous scroll, even if app-default is paging!
|
||||
// Skip if page.ItemsPerPage == -1 ("Load All" -> No Next Page required, Status Dashboards)
|
||||
/*
|
||||
Example Page 4 @ 10 IpP : Does item 41 exist?
|
||||
Minimal Page 41 @ 1 IpP : If len(result) is 1, Page 5 @ 10 IpP exists.
|
||||
*/
|
||||
nextPage := &model.PageRequest{
|
||||
ItemsPerPage: 1,
|
||||
Page: ((page.Page * page.ItemsPerPage) + 1),
|
||||
hasNextPage := false
|
||||
if page.ItemsPerPage != -1 {
|
||||
nextPage := &model.PageRequest{
|
||||
ItemsPerPage: 1,
|
||||
Page: ((page.Page * page.ItemsPerPage) + 1),
|
||||
}
|
||||
nextJobs, err := r.Repo.QueryJobs(ctx, filter, nextPage, order)
|
||||
if err != nil {
|
||||
cclog.Warn("Error while querying next jobs")
|
||||
return nil, err
|
||||
}
|
||||
hasNextPage = len(nextJobs) == 1
|
||||
}
|
||||
nextJobs, err := r.Repo.QueryJobs(ctx, filter, nextPage, order)
|
||||
if err != nil {
|
||||
cclog.Warn("Error while querying next jobs")
|
||||
return nil, err
|
||||
}
|
||||
|
||||
hasNextPage := len(nextJobs) == 1
|
||||
|
||||
return &model.JobResultList{Items: jobs, Count: &count, HasNextPage: &hasNextPage}, nil
|
||||
}
|
||||
@@ -693,7 +736,7 @@ func (r *queryResolver) JobsMetricStats(ctx context.Context, filter []*model.Job
|
||||
|
||||
res := []*model.JobStats{}
|
||||
for _, job := range jobs {
|
||||
data, err := metricDataDispatcher.LoadJobStats(job, metrics, ctx)
|
||||
data, err := metricdispatch.LoadJobStats(job, metrics, ctx)
|
||||
if err != nil {
|
||||
cclog.Warnf("Error while loading comparison jobStats data for job id %d", job.JobID)
|
||||
continue
|
||||
@@ -744,13 +787,19 @@ func (r *queryResolver) NodeMetrics(ctx context.Context, cluster string, nodes [
|
||||
return nil, errors.New("you need to be administrator or support staff for this query")
|
||||
}
|
||||
|
||||
defaultMetrics := make([]string, 0)
|
||||
for _, mc := range archive.GetCluster(cluster).MetricConfig {
|
||||
defaultMetrics = append(defaultMetrics, mc.Name)
|
||||
}
|
||||
if metrics == nil {
|
||||
for _, mc := range archive.GetCluster(cluster).MetricConfig {
|
||||
metrics = append(metrics, mc.Name)
|
||||
}
|
||||
metrics = defaultMetrics
|
||||
} else {
|
||||
metrics = slices.DeleteFunc(metrics, func(metric string) bool {
|
||||
return !slices.Contains(defaultMetrics, metric) // Remove undefined metrics.
|
||||
})
|
||||
}
|
||||
|
||||
data, err := metricDataDispatcher.LoadNodeData(cluster, metrics, nodes, scopes, from, to, ctx)
|
||||
data, err := metricdispatch.LoadNodeData(cluster, metrics, nodes, scopes, from, to, ctx)
|
||||
if err != nil {
|
||||
cclog.Warn("error while loading node data")
|
||||
return nil, err
|
||||
@@ -804,153 +853,39 @@ func (r *queryResolver) NodeMetricsList(ctx context.Context, cluster string, sub
|
||||
return nil, errors.New("you need to be administrator or support staff for this query")
|
||||
}
|
||||
|
||||
nodeRepo := repository.GetNodeRepository()
|
||||
// nodes -> array hostname
|
||||
nodes, stateMap, countNodes, hasNextPage, nerr := nodeRepo.GetNodesForList(ctx, cluster, subCluster, stateFilter, nodeFilter, page)
|
||||
if nerr != nil {
|
||||
return nil, errors.New("could not retrieve node list required for resolving NodeMetricsList")
|
||||
}
|
||||
|
||||
if metrics == nil {
|
||||
for _, mc := range archive.GetCluster(cluster).MetricConfig {
|
||||
metrics = append(metrics, mc.Name)
|
||||
}
|
||||
}
|
||||
|
||||
// Build Filters
|
||||
queryFilters := make([]*model.NodeFilter, 0)
|
||||
if cluster != "" {
|
||||
queryFilters = append(queryFilters, &model.NodeFilter{Cluster: &model.StringInput{Eq: &cluster}})
|
||||
}
|
||||
if subCluster != "" {
|
||||
queryFilters = append(queryFilters, &model.NodeFilter{Subcluster: &model.StringInput{Eq: &subCluster}})
|
||||
}
|
||||
if nodeFilter != "" && stateFilter != "notindb" {
|
||||
queryFilters = append(queryFilters, &model.NodeFilter{Hostname: &model.StringInput{Contains: &nodeFilter}})
|
||||
}
|
||||
if stateFilter != "all" && stateFilter != "notindb" {
|
||||
var queryState schema.SchedulerState = schema.SchedulerState(stateFilter)
|
||||
queryFilters = append(queryFilters, &model.NodeFilter{SchedulerState: &queryState})
|
||||
}
|
||||
// if healthFilter != "all" {
|
||||
// filters = append(filters, &model.NodeFilter{HealthState: &healthFilter})
|
||||
// }
|
||||
|
||||
// Special Case: Disable Paging for missing nodes filter, save IPP for later
|
||||
var backupItems int
|
||||
if stateFilter == "notindb" {
|
||||
backupItems = page.ItemsPerPage
|
||||
page.ItemsPerPage = -1
|
||||
}
|
||||
|
||||
// Query Nodes From DB
|
||||
nodeRepo := repository.GetNodeRepository()
|
||||
rawNodes, serr := nodeRepo.QueryNodes(ctx, queryFilters, page, nil) // Order not Used
|
||||
if serr != nil {
|
||||
cclog.Warn("error while loading node database data (Resolver.NodeMetricsList)")
|
||||
return nil, serr
|
||||
}
|
||||
|
||||
// Intermediate Node Result Info
|
||||
nodes := make([]string, 0)
|
||||
stateMap := make(map[string]string)
|
||||
for _, node := range rawNodes {
|
||||
nodes = append(nodes, node.Hostname)
|
||||
stateMap[node.Hostname] = string(node.NodeState)
|
||||
}
|
||||
|
||||
// Setup Vars
|
||||
var countNodes int
|
||||
var cerr error
|
||||
var hasNextPage bool
|
||||
|
||||
// Special Case: Find Nodes not in DB node table but in metricStore only
|
||||
if stateFilter == "notindb" {
|
||||
// Reapply Original Paging
|
||||
page.ItemsPerPage = backupItems
|
||||
// Get Nodes From Topology
|
||||
var topoNodes []string
|
||||
if subCluster != "" {
|
||||
scNodes := archive.NodeLists[cluster][subCluster]
|
||||
topoNodes = scNodes.PrintList()
|
||||
} else {
|
||||
subClusterNodeLists := archive.NodeLists[cluster]
|
||||
for _, nodeList := range subClusterNodeLists {
|
||||
topoNodes = append(topoNodes, nodeList.PrintList()...)
|
||||
}
|
||||
}
|
||||
// Compare to all nodes from cluster/subcluster in DB
|
||||
var missingNodes []string
|
||||
for _, scanNode := range topoNodes {
|
||||
if !slices.Contains(nodes, scanNode) {
|
||||
missingNodes = append(missingNodes, scanNode)
|
||||
}
|
||||
}
|
||||
// Filter nodes by name
|
||||
if nodeFilter != "" {
|
||||
filteredNodesByName := []string{}
|
||||
for _, missingNode := range missingNodes {
|
||||
if strings.Contains(missingNode, nodeFilter) {
|
||||
filteredNodesByName = append(filteredNodesByName, missingNode)
|
||||
}
|
||||
}
|
||||
missingNodes = filteredNodesByName
|
||||
}
|
||||
// Sort Missing Nodes Alphanumerically
|
||||
slices.Sort(missingNodes)
|
||||
// Total Missing
|
||||
countNodes = len(missingNodes)
|
||||
// Apply paging
|
||||
if countNodes > page.ItemsPerPage {
|
||||
start := (page.Page - 1) * page.ItemsPerPage
|
||||
end := start + page.ItemsPerPage
|
||||
if end > countNodes {
|
||||
end = countNodes
|
||||
hasNextPage = false
|
||||
} else {
|
||||
hasNextPage = true
|
||||
}
|
||||
nodes = missingNodes[start:end]
|
||||
} else {
|
||||
nodes = missingNodes
|
||||
}
|
||||
|
||||
} else {
|
||||
// DB Nodes: Count and Find Next Page
|
||||
countNodes, cerr = nodeRepo.CountNodes(ctx, queryFilters)
|
||||
if cerr != nil {
|
||||
cclog.Warn("error while counting node database data (Resolver.NodeMetricsList)")
|
||||
return nil, cerr
|
||||
}
|
||||
|
||||
// Example Page 4 @ 10 IpP : Does item 41 exist?
|
||||
// Minimal Page 41 @ 1 IpP : If len(result) is 1, Page 5 exists.
|
||||
nextPage := &model.PageRequest{
|
||||
ItemsPerPage: 1,
|
||||
Page: ((page.Page * page.ItemsPerPage) + 1),
|
||||
}
|
||||
nextNodes, err := nodeRepo.QueryNodes(ctx, queryFilters, nextPage, nil) // Order not Used
|
||||
if err != nil {
|
||||
cclog.Warn("Error while querying next nodes")
|
||||
return nil, err
|
||||
}
|
||||
hasNextPage = len(nextNodes) == 1
|
||||
}
|
||||
|
||||
// Load Metric Data For Specified Nodes Only
|
||||
data, err := metricDataDispatcher.LoadNodeListData(cluster, subCluster, nodes, metrics, scopes, *resolution, from, to, ctx)
|
||||
// data -> map hostname:jobdata
|
||||
data, err := metricdispatch.LoadNodeListData(cluster, subCluster, nodes, metrics, scopes, *resolution, from, to, ctx)
|
||||
if err != nil {
|
||||
cclog.Warn("error while loading node data (Resolver.NodeMetricsList")
|
||||
return nil, err
|
||||
}
|
||||
|
||||
// Build Result
|
||||
nodeMetricsList := make([]*model.NodeMetrics, 0, len(data))
|
||||
for hostname, metrics := range data {
|
||||
for _, hostname := range nodes {
|
||||
host := &model.NodeMetrics{
|
||||
Host: hostname,
|
||||
State: stateMap[hostname],
|
||||
Metrics: make([]*model.JobMetricWithName, 0, len(metrics)*len(scopes)),
|
||||
Metrics: make([]*model.JobMetricWithName, 0),
|
||||
}
|
||||
host.SubCluster, err = archive.GetSubClusterByNode(cluster, hostname)
|
||||
if err != nil {
|
||||
cclog.Warnf("error in nodeMetrics resolver: %s", err)
|
||||
}
|
||||
|
||||
for metric, scopedMetrics := range metrics {
|
||||
for metric, scopedMetrics := range data[hostname] {
|
||||
for scope, scopedMetric := range scopedMetrics {
|
||||
host.Metrics = append(host.Metrics, &model.JobMetricWithName{
|
||||
Name: metric,
|
||||
@@ -963,9 +898,9 @@ func (r *queryResolver) NodeMetricsList(ctx context.Context, cluster string, sub
|
||||
nodeMetricsList = append(nodeMetricsList, host)
|
||||
}
|
||||
|
||||
// Final Return
|
||||
nodeMetricsListResult := &model.NodesResultList{
|
||||
Items: nodeMetricsList,
|
||||
Items: nodeMetricsList,
|
||||
// TotalNodes depends on sum of nodes grouped on latest timestamp, see repo/node.go:357
|
||||
TotalNodes: &countNodes,
|
||||
HasNextPage: &hasNextPage,
|
||||
}
|
||||
@@ -973,6 +908,99 @@ func (r *queryResolver) NodeMetricsList(ctx context.Context, cluster string, sub
|
||||
return nodeMetricsListResult, nil
|
||||
}
|
||||
|
||||
// ClusterMetrics is the resolver for the clusterMetrics field.
|
||||
func (r *queryResolver) ClusterMetrics(ctx context.Context, cluster string, metrics []string, from time.Time, to time.Time) (*model.ClusterMetrics, error) {
|
||||
user := repository.GetUserFromContext(ctx)
|
||||
if user != nil && !user.HasAnyRole([]schema.Role{schema.RoleAdmin, schema.RoleSupport}) {
|
||||
return nil, errors.New("you need to be administrator or support staff for this query")
|
||||
}
|
||||
|
||||
if metrics == nil {
|
||||
for _, mc := range archive.GetCluster(cluster).MetricConfig {
|
||||
metrics = append(metrics, mc.Name)
|
||||
}
|
||||
}
|
||||
|
||||
// 'nodes' == nil -> Defaults to all nodes of cluster for existing query workflow
|
||||
scopes := []schema.MetricScope{"node"}
|
||||
data, err := metricdispatch.LoadNodeData(cluster, metrics, nil, scopes, from, to, ctx)
|
||||
if err != nil {
|
||||
cclog.Warn("error while loading node data")
|
||||
return nil, err
|
||||
}
|
||||
|
||||
clusterMetricData := make([]*model.ClusterMetricWithName, 0)
|
||||
clusterMetrics := model.ClusterMetrics{NodeCount: 0, Metrics: clusterMetricData}
|
||||
|
||||
collectorTimestep := make(map[string]int)
|
||||
collectorUnit := make(map[string]schema.Unit)
|
||||
collectorData := make(map[string][]schema.Float)
|
||||
|
||||
for _, metrics := range data {
|
||||
clusterMetrics.NodeCount += 1
|
||||
for metric, scopedMetrics := range metrics {
|
||||
for _, scopedMetric := range scopedMetrics {
|
||||
// Collect Info Once
|
||||
_, okTimestep := collectorTimestep[metric]
|
||||
if !okTimestep {
|
||||
collectorTimestep[metric] = scopedMetric.Timestep
|
||||
}
|
||||
_, okUnit := collectorUnit[metric]
|
||||
if !okUnit {
|
||||
collectorUnit[metric] = scopedMetric.Unit
|
||||
}
|
||||
// Collect Data
|
||||
for _, ser := range scopedMetric.Series {
|
||||
_, okData := collectorData[metric]
|
||||
// Init With Datasize > 0
|
||||
if !okData && len(ser.Data) != 0 {
|
||||
collectorData[metric] = make([]schema.Float, len(ser.Data))
|
||||
} else if !okData {
|
||||
cclog.Debugf("[SCHEMARESOLVER] clusterMetrics skip init: no data -> %s at %s; size %d", metric, ser.Hostname, len(ser.Data))
|
||||
}
|
||||
// Sum if init'd and matching size
|
||||
if okData && len(ser.Data) == len(collectorData[metric]) {
|
||||
for i, val := range ser.Data {
|
||||
if val.IsNaN() {
|
||||
continue
|
||||
} else {
|
||||
collectorData[metric][i] += val
|
||||
}
|
||||
}
|
||||
} else if okData {
|
||||
cclog.Debugf("[SCHEMARESOLVER] clusterMetrics skip sum: data diff -> %s at %s; want size %d, have size %d", metric, ser.Hostname, len(collectorData[metric]), len(ser.Data))
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
for metricName, data := range collectorData {
|
||||
// use ccUnits for backend normalization to "Tera"
|
||||
p_old := ccunit.NewPrefix(collectorUnit[metricName].Prefix)
|
||||
p_new := ccunit.NewPrefix("T")
|
||||
convFunc := ccunit.GetPrefixPrefixFactor(p_old, p_new)
|
||||
u_new := schema.Unit{Prefix: p_new.Prefix(), Base: collectorUnit[metricName].Base}
|
||||
|
||||
roundedData := make([]schema.Float, 0)
|
||||
for _, v_old := range data {
|
||||
v_new := math.Round(convFunc(float64(v_old)).(float64)*100.0) / 100.0
|
||||
roundedData = append(roundedData, schema.Float(v_new))
|
||||
}
|
||||
|
||||
cm := model.ClusterMetricWithName{
|
||||
Name: metricName,
|
||||
Unit: &u_new,
|
||||
Timestep: collectorTimestep[metricName],
|
||||
Data: roundedData,
|
||||
}
|
||||
|
||||
clusterMetrics.Metrics = append(clusterMetrics.Metrics, &cm)
|
||||
}
|
||||
|
||||
return &clusterMetrics, nil
|
||||
}
|
||||
|
||||
// NumberOfNodes is the resolver for the numberOfNodes field.
|
||||
func (r *subClusterResolver) NumberOfNodes(ctx context.Context, obj *schema.SubCluster) (int, error) {
|
||||
nodeList, err := archive.ParseNodeList(obj.Nodes)
|
||||
|
||||
@@ -2,18 +2,20 @@
|
||||
// All rights reserved. This file is part of cc-backend.
|
||||
// Use of this source code is governed by a MIT-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
package graph
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"math"
|
||||
"slices"
|
||||
|
||||
"github.com/99designs/gqlgen/graphql"
|
||||
"github.com/ClusterCockpit/cc-backend/internal/graph/model"
|
||||
"github.com/ClusterCockpit/cc-backend/internal/metricDataDispatcher"
|
||||
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
|
||||
"github.com/ClusterCockpit/cc-lib/schema"
|
||||
"github.com/ClusterCockpit/cc-backend/internal/metricdispatch"
|
||||
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
|
||||
"github.com/ClusterCockpit/cc-lib/v2/schema"
|
||||
)
|
||||
|
||||
const MAX_JOBS_FOR_ANALYSIS = 500
|
||||
@@ -53,15 +55,15 @@ func (r *queryResolver) rooflineHeatmap(
|
||||
// resolution = max(resolution, mc.Timestep)
|
||||
// }
|
||||
|
||||
jobdata, err := metricDataDispatcher.LoadData(job, []string{"flops_any", "mem_bw"}, []schema.MetricScope{schema.MetricScopeNode}, ctx, 0)
|
||||
jobdata, err := metricdispatch.LoadData(job, []string{"flops_any", "mem_bw"}, []schema.MetricScope{schema.MetricScopeNode}, ctx, 0)
|
||||
if err != nil {
|
||||
cclog.Errorf("Error while loading roofline metrics for job %d", job.ID)
|
||||
cclog.Warnf("Error while loading roofline metrics for job %d", *job.ID)
|
||||
return nil, err
|
||||
}
|
||||
|
||||
flops_, membw_ := jobdata["flops_any"], jobdata["mem_bw"]
|
||||
if flops_ == nil && membw_ == nil {
|
||||
cclog.Infof("rooflineHeatmap(): 'flops_any' or 'mem_bw' missing for job %d", job.ID)
|
||||
cclog.Warnf("rooflineHeatmap(): 'flops_any' or 'mem_bw' missing for job %d", *job.ID)
|
||||
continue
|
||||
// return nil, fmt.Errorf("GRAPH/UTIL > 'flops_any' or 'mem_bw' missing for job %d", job.ID)
|
||||
}
|
||||
@@ -126,7 +128,7 @@ func (r *queryResolver) jobsFootprints(ctx context.Context, filter []*model.JobF
|
||||
continue
|
||||
}
|
||||
|
||||
if err := metricDataDispatcher.LoadAverages(job, metrics, avgs, ctx); err != nil {
|
||||
if err := metricdispatch.LoadAverages(job, metrics, avgs, ctx); err != nil {
|
||||
cclog.Error("Error while loading averages for footprint")
|
||||
return nil, err
|
||||
}
|
||||
@@ -185,11 +187,5 @@ func (r *queryResolver) jobsFootprints(ctx context.Context, filter []*model.JobF
|
||||
func requireField(ctx context.Context, name string) bool {
|
||||
fields := graphql.CollectAllFields(ctx)
|
||||
|
||||
for _, f := range fields {
|
||||
if f == name {
|
||||
return true
|
||||
}
|
||||
}
|
||||
|
||||
return false
|
||||
return slices.Contains(fields, name)
|
||||
}
|
||||
|
||||
@@ -2,6 +2,7 @@
|
||||
// All rights reserved. This file is part of cc-backend.
|
||||
// Use of this source code is governed by a MIT-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
package importer
|
||||
|
||||
import (
|
||||
@@ -14,8 +15,8 @@ import (
|
||||
"github.com/ClusterCockpit/cc-backend/internal/config"
|
||||
"github.com/ClusterCockpit/cc-backend/internal/repository"
|
||||
"github.com/ClusterCockpit/cc-backend/pkg/archive"
|
||||
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
|
||||
"github.com/ClusterCockpit/cc-lib/schema"
|
||||
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
|
||||
"github.com/ClusterCockpit/cc-lib/v2/schema"
|
||||
)
|
||||
|
||||
// HandleImportFlag imports jobs from file pairs specified in a comma-separated flag string.
|
||||
@@ -37,7 +38,7 @@ import (
|
||||
func HandleImportFlag(flag string) error {
|
||||
r := repository.GetJobRepository()
|
||||
|
||||
for _, pair := range strings.Split(flag, ",") {
|
||||
for pair := range strings.SplitSeq(flag, ",") {
|
||||
files := strings.Split(pair, ":")
|
||||
if len(files) != 2 {
|
||||
return fmt.Errorf("REPOSITORY/INIT > invalid import flag format")
|
||||
@@ -101,7 +102,7 @@ func HandleImportFlag(flag string) error {
|
||||
return err
|
||||
}
|
||||
|
||||
id, err := r.InsertJob(&job)
|
||||
id, err := r.InsertJobDirect(&job)
|
||||
if err != nil {
|
||||
cclog.Warn("Error while job db insert")
|
||||
return err
|
||||
|
||||
@@ -16,8 +16,8 @@ import (
|
||||
"github.com/ClusterCockpit/cc-backend/internal/importer"
|
||||
"github.com/ClusterCockpit/cc-backend/internal/repository"
|
||||
"github.com/ClusterCockpit/cc-backend/pkg/archive"
|
||||
ccconf "github.com/ClusterCockpit/cc-lib/ccConfig"
|
||||
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
|
||||
ccconf "github.com/ClusterCockpit/cc-lib/v2/ccConfig"
|
||||
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
|
||||
)
|
||||
|
||||
// copyFile copies a file from source path to destination path.
|
||||
@@ -50,42 +50,14 @@ func setup(t *testing.T) *repository.JobRepository {
|
||||
"main": {
|
||||
"addr": "0.0.0.0:8080",
|
||||
"validate": false,
|
||||
"apiAllowedIPs": [
|
||||
"api-allowed-ips": [
|
||||
"*"
|
||||
]},
|
||||
"archive": {
|
||||
"kind": "file",
|
||||
"path": "./var/job-archive"
|
||||
},
|
||||
"clusters": [
|
||||
{
|
||||
"name": "testcluster",
|
||||
"metricDataRepository": {"kind": "test", "url": "bla:8081"},
|
||||
"filterRanges": {
|
||||
"numNodes": { "from": 1, "to": 64 },
|
||||
"duration": { "from": 0, "to": 86400 },
|
||||
"startTime": { "from": "2022-01-01T00:00:00Z", "to": null }
|
||||
}
|
||||
},
|
||||
{
|
||||
"name": "fritz",
|
||||
"metricDataRepository": {"kind": "test", "url": "bla:8081"},
|
||||
"filterRanges": {
|
||||
"numNodes": { "from": 1, "to": 944 },
|
||||
"duration": { "from": 0, "to": 86400 },
|
||||
"startTime": { "from": "2022-01-01T00:00:00Z", "to": null }
|
||||
}
|
||||
},
|
||||
{
|
||||
"name": "taurus",
|
||||
"metricDataRepository": {"kind": "test", "url": "bla:8081"},
|
||||
"filterRanges": {
|
||||
"numNodes": { "from": 1, "to": 4000 },
|
||||
"duration": { "from": 0, "to": 604800 },
|
||||
"startTime": { "from": "2010-01-01T00:00:00Z", "to": null }
|
||||
}
|
||||
}
|
||||
]}`
|
||||
}
|
||||
}`
|
||||
|
||||
cclog.Init("info", true)
|
||||
tmpdir := t.TempDir()
|
||||
@@ -107,7 +79,7 @@ func setup(t *testing.T) *repository.JobRepository {
|
||||
}
|
||||
|
||||
dbfilepath := filepath.Join(tmpdir, "test.db")
|
||||
err := repository.MigrateDB("sqlite3", dbfilepath)
|
||||
err := repository.MigrateDB(dbfilepath)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
@@ -121,22 +93,18 @@ func setup(t *testing.T) *repository.JobRepository {
|
||||
|
||||
// Load and check main configuration
|
||||
if cfg := ccconf.GetPackageConfig("main"); cfg != nil {
|
||||
if clustercfg := ccconf.GetPackageConfig("clusters"); clustercfg != nil {
|
||||
config.Init(cfg, clustercfg)
|
||||
} else {
|
||||
t.Fatal("Cluster configuration must be present")
|
||||
}
|
||||
config.Init(cfg)
|
||||
} else {
|
||||
t.Fatal("Main configuration must be present")
|
||||
}
|
||||
|
||||
archiveCfg := fmt.Sprintf("{\"kind\": \"file\",\"path\": \"%s\"}", jobarchive)
|
||||
|
||||
if err := archive.Init(json.RawMessage(archiveCfg), config.Keys.DisableArchive); err != nil {
|
||||
if err := archive.Init(json.RawMessage(archiveCfg)); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
repository.Connect("sqlite3", dbfilepath)
|
||||
repository.Connect(dbfilepath)
|
||||
return repository.GetJobRepository()
|
||||
}
|
||||
|
||||
@@ -197,7 +165,7 @@ func TestHandleImportFlag(t *testing.T) {
|
||||
}
|
||||
|
||||
result := readResult(t, testname)
|
||||
job, err := r.FindCached(&result.JobId, &result.Cluster, &result.StartTime)
|
||||
job, err := r.Find(&result.JobId, &result.Cluster, &result.StartTime)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
@@ -22,8 +22,8 @@ import (
|
||||
|
||||
"github.com/ClusterCockpit/cc-backend/internal/repository"
|
||||
"github.com/ClusterCockpit/cc-backend/pkg/archive"
|
||||
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
|
||||
"github.com/ClusterCockpit/cc-lib/schema"
|
||||
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
|
||||
"github.com/ClusterCockpit/cc-lib/v2/schema"
|
||||
)
|
||||
|
||||
const (
|
||||
@@ -111,18 +111,22 @@ func InitDB() error {
|
||||
continue
|
||||
}
|
||||
|
||||
id, err := r.TransactionAddNamed(t,
|
||||
id, jobErr := r.TransactionAddNamed(t,
|
||||
repository.NamedJobInsert, jobMeta)
|
||||
if err != nil {
|
||||
cclog.Errorf("repository initDB(): %v", err)
|
||||
if jobErr != nil {
|
||||
cclog.Errorf("repository initDB(): %v", jobErr)
|
||||
errorOccured++
|
||||
continue
|
||||
}
|
||||
|
||||
// Job successfully inserted, increment counter
|
||||
i += 1
|
||||
|
||||
for _, tag := range jobMeta.Tags {
|
||||
tagstr := tag.Name + ":" + tag.Type
|
||||
tagID, ok := tags[tagstr]
|
||||
if !ok {
|
||||
var err error
|
||||
tagID, err = r.TransactionAdd(t,
|
||||
addTagQuery,
|
||||
tag.Name, tag.Type)
|
||||
@@ -138,10 +142,6 @@ func InitDB() error {
|
||||
setTagQuery,
|
||||
id, tagID)
|
||||
}
|
||||
|
||||
if err == nil {
|
||||
i += 1
|
||||
}
|
||||
}
|
||||
|
||||
if errorOccured > 0 {
|
||||
@@ -216,7 +216,7 @@ func enrichJobMetadata(job *schema.Job) error {
|
||||
metricEnergy = math.Round(rawEnergy*100.0) / 100.0
|
||||
}
|
||||
} else {
|
||||
cclog.Warnf("Error while collecting energy metric %s for job, DB ID '%v', return '0.0'", fp, job.ID)
|
||||
cclog.Warnf("Error while collecting energy metric %s for job, DB ID '%v', return '0.0'", fp, *job.ID)
|
||||
}
|
||||
|
||||
job.EnergyFootprint[fp] = metricEnergy
|
||||
@@ -225,7 +225,7 @@ func enrichJobMetadata(job *schema.Job) error {
|
||||
|
||||
job.Energy = (math.Round(totalEnergy*100.0) / 100.0)
|
||||
if job.RawEnergyFootprint, err = json.Marshal(job.EnergyFootprint); err != nil {
|
||||
cclog.Warnf("Error while marshaling energy footprint for job INTO BYTES, DB ID '%v'", job.ID)
|
||||
cclog.Warnf("Error while marshaling energy footprint for job INTO BYTES, DB ID '%v'", *job.ID)
|
||||
return err
|
||||
}
|
||||
|
||||
|
||||
@@ -2,12 +2,13 @@
|
||||
// All rights reserved. This file is part of cc-backend.
|
||||
// Use of this source code is governed by a MIT-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
package importer
|
||||
|
||||
import (
|
||||
"math"
|
||||
|
||||
ccunits "github.com/ClusterCockpit/cc-lib/ccUnits"
|
||||
ccunits "github.com/ClusterCockpit/cc-lib/v2/ccUnits"
|
||||
)
|
||||
|
||||
// getNormalizationFactor calculates the scaling factor needed to normalize a value
|
||||
|
||||
@@ -8,7 +8,7 @@ import (
|
||||
"fmt"
|
||||
"testing"
|
||||
|
||||
ccunits "github.com/ClusterCockpit/cc-lib/ccUnits"
|
||||
ccunits "github.com/ClusterCockpit/cc-lib/v2/ccUnits"
|
||||
)
|
||||
|
||||
// TestNormalizeFactor tests the normalization of large byte values to gigabyte prefix.
|
||||
|
||||
@@ -1,217 +0,0 @@
|
||||
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||
// All rights reserved. This file is part of cc-backend.
|
||||
// Use of this source code is governed by a MIT-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
package memorystore
|
||||
|
||||
import (
|
||||
"math"
|
||||
|
||||
"github.com/ClusterCockpit/cc-lib/schema"
|
||||
"github.com/ClusterCockpit/cc-lib/util"
|
||||
)
|
||||
|
||||
type APIMetricData struct {
|
||||
Error *string `json:"error,omitempty"`
|
||||
Data schema.FloatArray `json:"data,omitempty"`
|
||||
From int64 `json:"from"`
|
||||
To int64 `json:"to"`
|
||||
Resolution int64 `json:"resolution"`
|
||||
Avg schema.Float `json:"avg"`
|
||||
Min schema.Float `json:"min"`
|
||||
Max schema.Float `json:"max"`
|
||||
}
|
||||
|
||||
type APIQueryRequest struct {
|
||||
Cluster string `json:"cluster"`
|
||||
Queries []APIQuery `json:"queries"`
|
||||
ForAllNodes []string `json:"for-all-nodes"`
|
||||
From int64 `json:"from"`
|
||||
To int64 `json:"to"`
|
||||
WithStats bool `json:"with-stats"`
|
||||
WithData bool `json:"with-data"`
|
||||
WithPadding bool `json:"with-padding"`
|
||||
}
|
||||
|
||||
type APIQueryResponse struct {
|
||||
Queries []APIQuery `json:"queries,omitempty"`
|
||||
Results [][]APIMetricData `json:"results"`
|
||||
}
|
||||
|
||||
type APIQuery struct {
|
||||
Type *string `json:"type,omitempty"`
|
||||
SubType *string `json:"subtype,omitempty"`
|
||||
Metric string `json:"metric"`
|
||||
Hostname string `json:"host"`
|
||||
Resolution int64 `json:"resolution"`
|
||||
TypeIds []string `json:"type-ids,omitempty"`
|
||||
SubTypeIds []string `json:"subtype-ids,omitempty"`
|
||||
ScaleFactor schema.Float `json:"scale-by,omitempty"`
|
||||
Aggregate bool `json:"aggreg"`
|
||||
}
|
||||
|
||||
// TODO: Optimize this, just like the stats endpoint!
|
||||
func (data *APIMetricData) AddStats() {
|
||||
n := 0
|
||||
sum, min, max := 0.0, math.MaxFloat64, -math.MaxFloat64
|
||||
for _, x := range data.Data {
|
||||
if x.IsNaN() {
|
||||
continue
|
||||
}
|
||||
|
||||
n += 1
|
||||
sum += float64(x)
|
||||
min = math.Min(min, float64(x))
|
||||
max = math.Max(max, float64(x))
|
||||
}
|
||||
|
||||
if n > 0 {
|
||||
avg := sum / float64(n)
|
||||
data.Avg = schema.Float(avg)
|
||||
data.Min = schema.Float(min)
|
||||
data.Max = schema.Float(max)
|
||||
} else {
|
||||
data.Avg, data.Min, data.Max = schema.NaN, schema.NaN, schema.NaN
|
||||
}
|
||||
}
|
||||
|
||||
func (data *APIMetricData) ScaleBy(f schema.Float) {
|
||||
if f == 0 || f == 1 {
|
||||
return
|
||||
}
|
||||
|
||||
data.Avg *= f
|
||||
data.Min *= f
|
||||
data.Max *= f
|
||||
for i := 0; i < len(data.Data); i++ {
|
||||
data.Data[i] *= f
|
||||
}
|
||||
}
|
||||
|
||||
func (data *APIMetricData) PadDataWithNull(ms *MemoryStore, from, to int64, metric string) {
|
||||
minfo, ok := ms.Metrics[metric]
|
||||
if !ok {
|
||||
return
|
||||
}
|
||||
|
||||
if (data.From / minfo.Frequency) > (from / minfo.Frequency) {
|
||||
padfront := int((data.From / minfo.Frequency) - (from / minfo.Frequency))
|
||||
ndata := make([]schema.Float, 0, padfront+len(data.Data))
|
||||
for range padfront {
|
||||
ndata = append(ndata, schema.NaN)
|
||||
}
|
||||
for j := 0; j < len(data.Data); j++ {
|
||||
ndata = append(ndata, data.Data[j])
|
||||
}
|
||||
data.Data = ndata
|
||||
}
|
||||
}
|
||||
|
||||
func FetchData(req APIQueryRequest) (*APIQueryResponse, error) {
|
||||
req.WithData = true
|
||||
req.WithData = true
|
||||
req.WithData = true
|
||||
|
||||
ms := GetMemoryStore()
|
||||
|
||||
response := APIQueryResponse{
|
||||
Results: make([][]APIMetricData, 0, len(req.Queries)),
|
||||
}
|
||||
if req.ForAllNodes != nil {
|
||||
nodes := ms.ListChildren([]string{req.Cluster})
|
||||
for _, node := range nodes {
|
||||
for _, metric := range req.ForAllNodes {
|
||||
q := APIQuery{
|
||||
Metric: metric,
|
||||
Hostname: node,
|
||||
}
|
||||
req.Queries = append(req.Queries, q)
|
||||
response.Queries = append(response.Queries, q)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
for _, query := range req.Queries {
|
||||
sels := make([]util.Selector, 0, 1)
|
||||
if query.Aggregate || query.Type == nil {
|
||||
sel := util.Selector{{String: req.Cluster}, {String: query.Hostname}}
|
||||
if query.Type != nil {
|
||||
if len(query.TypeIds) == 1 {
|
||||
sel = append(sel, util.SelectorElement{String: *query.Type + query.TypeIds[0]})
|
||||
} else {
|
||||
ids := make([]string, len(query.TypeIds))
|
||||
for i, id := range query.TypeIds {
|
||||
ids[i] = *query.Type + id
|
||||
}
|
||||
sel = append(sel, util.SelectorElement{Group: ids})
|
||||
}
|
||||
|
||||
if query.SubType != nil {
|
||||
if len(query.SubTypeIds) == 1 {
|
||||
sel = append(sel, util.SelectorElement{String: *query.SubType + query.SubTypeIds[0]})
|
||||
} else {
|
||||
ids := make([]string, len(query.SubTypeIds))
|
||||
for i, id := range query.SubTypeIds {
|
||||
ids[i] = *query.SubType + id
|
||||
}
|
||||
sel = append(sel, util.SelectorElement{Group: ids})
|
||||
}
|
||||
}
|
||||
}
|
||||
sels = append(sels, sel)
|
||||
} else {
|
||||
for _, typeID := range query.TypeIds {
|
||||
if query.SubType != nil {
|
||||
for _, subTypeID := range query.SubTypeIds {
|
||||
sels = append(sels, util.Selector{
|
||||
{String: req.Cluster},
|
||||
{String: query.Hostname},
|
||||
{String: *query.Type + typeID},
|
||||
{String: *query.SubType + subTypeID},
|
||||
})
|
||||
}
|
||||
} else {
|
||||
sels = append(sels, util.Selector{
|
||||
{String: req.Cluster},
|
||||
{String: query.Hostname},
|
||||
{String: *query.Type + typeID},
|
||||
})
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// log.Printf("query: %#v\n", query)
|
||||
// log.Printf("sels: %#v\n", sels)
|
||||
var err error
|
||||
res := make([]APIMetricData, 0, len(sels))
|
||||
for _, sel := range sels {
|
||||
data := APIMetricData{}
|
||||
|
||||
data.Data, data.From, data.To, data.Resolution, err = ms.Read(sel, query.Metric, req.From, req.To, query.Resolution)
|
||||
if err != nil {
|
||||
msg := err.Error()
|
||||
data.Error = &msg
|
||||
res = append(res, data)
|
||||
continue
|
||||
}
|
||||
|
||||
if req.WithStats {
|
||||
data.AddStats()
|
||||
}
|
||||
if query.ScaleFactor != 0 {
|
||||
data.ScaleBy(query.ScaleFactor)
|
||||
}
|
||||
if req.WithPadding {
|
||||
data.PadDataWithNull(ms, req.From, req.To, query.Metric)
|
||||
}
|
||||
if !req.WithData {
|
||||
data.Data = nil
|
||||
}
|
||||
res = append(res, data)
|
||||
}
|
||||
response.Results = append(response.Results, res)
|
||||
}
|
||||
|
||||
return &response, nil
|
||||
}
|
||||
@@ -1,191 +0,0 @@
|
||||
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||
// All rights reserved. This file is part of cc-backend.
|
||||
// Use of this source code is governed by a MIT-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
package memorystore
|
||||
|
||||
import (
|
||||
"archive/zip"
|
||||
"bufio"
|
||||
"context"
|
||||
"errors"
|
||||
"fmt"
|
||||
"io"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"sync"
|
||||
"sync/atomic"
|
||||
"time"
|
||||
|
||||
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
|
||||
)
|
||||
|
||||
func Archiving(wg *sync.WaitGroup, ctx context.Context) {
|
||||
go func() {
|
||||
defer wg.Done()
|
||||
d, err := time.ParseDuration(Keys.Archive.Interval)
|
||||
if err != nil {
|
||||
cclog.Fatalf("[METRICSTORE]> error parsing archive interval duration: %v\n", err)
|
||||
}
|
||||
if d <= 0 {
|
||||
return
|
||||
}
|
||||
|
||||
ticks := func() <-chan time.Time {
|
||||
if d <= 0 {
|
||||
return nil
|
||||
}
|
||||
return time.NewTicker(d).C
|
||||
}()
|
||||
for {
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
return
|
||||
case <-ticks:
|
||||
t := time.Now().Add(-d)
|
||||
cclog.Infof("[METRICSTORE]> start archiving checkpoints (older than %s)...", t.Format(time.RFC3339))
|
||||
n, err := ArchiveCheckpoints(Keys.Checkpoints.RootDir,
|
||||
Keys.Archive.RootDir, t.Unix(), Keys.Archive.DeleteInstead)
|
||||
|
||||
if err != nil {
|
||||
cclog.Errorf("[METRICSTORE]> archiving failed: %s", err.Error())
|
||||
} else {
|
||||
cclog.Infof("[METRICSTORE]> done: %d files zipped and moved to archive", n)
|
||||
}
|
||||
}
|
||||
}
|
||||
}()
|
||||
}
|
||||
|
||||
var ErrNoNewArchiveData error = errors.New("all data already archived")
|
||||
|
||||
// ZIP all checkpoint files older than `from` together and write them to the `archiveDir`,
|
||||
// deleting them from the `checkpointsDir`.
|
||||
func ArchiveCheckpoints(checkpointsDir, archiveDir string, from int64, deleteInstead bool) (int, error) {
|
||||
entries1, err := os.ReadDir(checkpointsDir)
|
||||
if err != nil {
|
||||
return 0, err
|
||||
}
|
||||
|
||||
type workItem struct {
|
||||
cdir, adir string
|
||||
cluster, host string
|
||||
}
|
||||
|
||||
var wg sync.WaitGroup
|
||||
n, errs := int32(0), int32(0)
|
||||
work := make(chan workItem, Keys.NumWorkers)
|
||||
|
||||
wg.Add(Keys.NumWorkers)
|
||||
for worker := 0; worker < Keys.NumWorkers; worker++ {
|
||||
go func() {
|
||||
defer wg.Done()
|
||||
for workItem := range work {
|
||||
m, err := archiveCheckpoints(workItem.cdir, workItem.adir, from, deleteInstead)
|
||||
if err != nil {
|
||||
cclog.Errorf("error while archiving %s/%s: %s", workItem.cluster, workItem.host, err.Error())
|
||||
atomic.AddInt32(&errs, 1)
|
||||
}
|
||||
atomic.AddInt32(&n, int32(m))
|
||||
}
|
||||
}()
|
||||
}
|
||||
|
||||
for _, de1 := range entries1 {
|
||||
entries2, e := os.ReadDir(filepath.Join(checkpointsDir, de1.Name()))
|
||||
if e != nil {
|
||||
err = e
|
||||
}
|
||||
|
||||
for _, de2 := range entries2 {
|
||||
cdir := filepath.Join(checkpointsDir, de1.Name(), de2.Name())
|
||||
adir := filepath.Join(archiveDir, de1.Name(), de2.Name())
|
||||
work <- workItem{
|
||||
adir: adir, cdir: cdir,
|
||||
cluster: de1.Name(), host: de2.Name(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
close(work)
|
||||
wg.Wait()
|
||||
|
||||
if err != nil {
|
||||
return int(n), err
|
||||
}
|
||||
|
||||
if errs > 0 {
|
||||
return int(n), fmt.Errorf("%d errors happened while archiving (%d successes)", errs, n)
|
||||
}
|
||||
return int(n), nil
|
||||
}
|
||||
|
||||
// Helper function for `ArchiveCheckpoints`.
|
||||
func archiveCheckpoints(dir string, archiveDir string, from int64, deleteInstead bool) (int, error) {
|
||||
entries, err := os.ReadDir(dir)
|
||||
if err != nil {
|
||||
return 0, err
|
||||
}
|
||||
|
||||
extension := Keys.Checkpoints.FileFormat
|
||||
files, err := findFiles(entries, from, extension, false)
|
||||
if err != nil {
|
||||
return 0, err
|
||||
}
|
||||
|
||||
if deleteInstead {
|
||||
n := 0
|
||||
for _, checkpoint := range files {
|
||||
filename := filepath.Join(dir, checkpoint)
|
||||
if err = os.Remove(filename); err != nil {
|
||||
return n, err
|
||||
}
|
||||
n += 1
|
||||
}
|
||||
return n, nil
|
||||
}
|
||||
|
||||
filename := filepath.Join(archiveDir, fmt.Sprintf("%d.zip", from))
|
||||
f, err := os.OpenFile(filename, os.O_CREATE|os.O_WRONLY, CheckpointFilePerms)
|
||||
if err != nil && os.IsNotExist(err) {
|
||||
err = os.MkdirAll(archiveDir, CheckpointDirPerms)
|
||||
if err == nil {
|
||||
f, err = os.OpenFile(filename, os.O_CREATE|os.O_WRONLY, CheckpointFilePerms)
|
||||
}
|
||||
}
|
||||
if err != nil {
|
||||
return 0, err
|
||||
}
|
||||
defer f.Close()
|
||||
bw := bufio.NewWriter(f)
|
||||
defer bw.Flush()
|
||||
zw := zip.NewWriter(bw)
|
||||
defer zw.Close()
|
||||
|
||||
n := 0
|
||||
for _, checkpoint := range files {
|
||||
filename := filepath.Join(dir, checkpoint)
|
||||
r, err := os.Open(filename)
|
||||
if err != nil {
|
||||
return n, err
|
||||
}
|
||||
defer r.Close()
|
||||
|
||||
w, err := zw.Create(checkpoint)
|
||||
if err != nil {
|
||||
return n, err
|
||||
}
|
||||
|
||||
if _, err = io.Copy(w, r); err != nil {
|
||||
return n, err
|
||||
}
|
||||
|
||||
if err = os.Remove(filename); err != nil {
|
||||
return n, err
|
||||
}
|
||||
n += 1
|
||||
}
|
||||
|
||||
return n, nil
|
||||
}
|
||||
@@ -1,482 +0,0 @@
|
||||
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||
// All rights reserved. This file is part of cc-backend.
|
||||
// Use of this source code is governed by a MIT-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
package memorystore
|
||||
|
||||
import (
|
||||
"bufio"
|
||||
"encoding/json"
|
||||
"errors"
|
||||
"fmt"
|
||||
"os"
|
||||
"path"
|
||||
"sort"
|
||||
"strconv"
|
||||
"strings"
|
||||
"sync"
|
||||
"sync/atomic"
|
||||
"time"
|
||||
|
||||
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
|
||||
"github.com/ClusterCockpit/cc-lib/schema"
|
||||
"github.com/linkedin/goavro/v2"
|
||||
)
|
||||
|
||||
var NumAvroWorkers int = 4
|
||||
var startUp bool = true
|
||||
var ErrNoNewData error = errors.New("no data in the pool")
|
||||
|
||||
func (as *AvroStore) ToCheckpoint(dir string, dumpAll bool) (int, error) {
|
||||
levels := make([]*AvroLevel, 0)
|
||||
selectors := make([][]string, 0)
|
||||
as.root.lock.RLock()
|
||||
// Cluster
|
||||
for sel1, l1 := range as.root.children {
|
||||
l1.lock.RLock()
|
||||
// Node
|
||||
for sel2, l2 := range l1.children {
|
||||
l2.lock.RLock()
|
||||
// Frequency
|
||||
for sel3, l3 := range l2.children {
|
||||
levels = append(levels, l3)
|
||||
selectors = append(selectors, []string{sel1, sel2, sel3})
|
||||
}
|
||||
l2.lock.RUnlock()
|
||||
}
|
||||
l1.lock.RUnlock()
|
||||
}
|
||||
as.root.lock.RUnlock()
|
||||
|
||||
type workItem struct {
|
||||
level *AvroLevel
|
||||
dir string
|
||||
selector []string
|
||||
}
|
||||
|
||||
n, errs := int32(0), int32(0)
|
||||
|
||||
var wg sync.WaitGroup
|
||||
wg.Add(NumAvroWorkers)
|
||||
work := make(chan workItem, NumAvroWorkers*2)
|
||||
for range NumAvroWorkers {
|
||||
go func() {
|
||||
defer wg.Done()
|
||||
|
||||
for workItem := range work {
|
||||
from := getTimestamp(workItem.dir)
|
||||
|
||||
if err := workItem.level.toCheckpoint(workItem.dir, from, dumpAll); err != nil {
|
||||
if err == ErrNoNewArchiveData {
|
||||
continue
|
||||
}
|
||||
|
||||
cclog.Errorf("error while checkpointing %#v: %s", workItem.selector, err.Error())
|
||||
atomic.AddInt32(&errs, 1)
|
||||
} else {
|
||||
atomic.AddInt32(&n, 1)
|
||||
}
|
||||
}
|
||||
}()
|
||||
}
|
||||
|
||||
for i := range len(levels) {
|
||||
dir := path.Join(dir, path.Join(selectors[i]...))
|
||||
work <- workItem{
|
||||
level: levels[i],
|
||||
dir: dir,
|
||||
selector: selectors[i],
|
||||
}
|
||||
}
|
||||
|
||||
close(work)
|
||||
wg.Wait()
|
||||
|
||||
if errs > 0 {
|
||||
return int(n), fmt.Errorf("%d errors happend while creating avro checkpoints (%d successes)", errs, n)
|
||||
}
|
||||
|
||||
startUp = false
|
||||
|
||||
return int(n), nil
|
||||
}
|
||||
|
||||
// getTimestamp returns the timestamp from the directory name
|
||||
func getTimestamp(dir string) int64 {
|
||||
// Extract the resolution and timestamp from the directory name
|
||||
// The existing avro file will be in epoch timestamp format
|
||||
// iterate over all the files in the directory and find the maximum timestamp
|
||||
// and return it
|
||||
|
||||
resolution := path.Base(dir)
|
||||
dir = path.Dir(dir)
|
||||
|
||||
files, err := os.ReadDir(dir)
|
||||
if err != nil {
|
||||
return 0
|
||||
}
|
||||
var maxTS int64 = 0
|
||||
|
||||
if len(files) == 0 {
|
||||
return 0
|
||||
}
|
||||
|
||||
for _, file := range files {
|
||||
if file.IsDir() {
|
||||
continue
|
||||
}
|
||||
name := file.Name()
|
||||
|
||||
if len(name) < 5 || !strings.HasSuffix(name, ".avro") || !strings.HasPrefix(name, resolution+"_") {
|
||||
continue
|
||||
}
|
||||
|
||||
ts, err := strconv.ParseInt(name[strings.Index(name, "_")+1:len(name)-5], 10, 64)
|
||||
if err != nil {
|
||||
fmt.Printf("error while parsing timestamp: %s\n", err.Error())
|
||||
continue
|
||||
}
|
||||
|
||||
if ts > maxTS {
|
||||
maxTS = ts
|
||||
}
|
||||
}
|
||||
|
||||
interval, _ := time.ParseDuration(Keys.Checkpoints.Interval)
|
||||
updateTime := time.Unix(maxTS, 0).Add(interval).Add(time.Duration(CheckpointBufferMinutes-1) * time.Minute).Unix()
|
||||
|
||||
if startUp {
|
||||
return 0
|
||||
}
|
||||
|
||||
if updateTime < time.Now().Unix() {
|
||||
return 0
|
||||
}
|
||||
|
||||
return maxTS
|
||||
}
|
||||
|
||||
func (l *AvroLevel) toCheckpoint(dir string, from int64, dumpAll bool) error {
|
||||
l.lock.Lock()
|
||||
defer l.lock.Unlock()
|
||||
|
||||
// fmt.Printf("Checkpointing directory: %s\n", dir)
|
||||
// filepath contains the resolution
|
||||
intRes, _ := strconv.Atoi(path.Base(dir))
|
||||
|
||||
// find smallest overall timestamp in l.data map and delete it from l.data
|
||||
minTS := int64(1<<63 - 1)
|
||||
for ts, dat := range l.data {
|
||||
if ts < minTS && len(dat) != 0 {
|
||||
minTS = ts
|
||||
}
|
||||
}
|
||||
|
||||
if from == 0 && minTS != int64(1<<63-1) {
|
||||
from = minTS
|
||||
}
|
||||
|
||||
if from == 0 {
|
||||
return ErrNoNewArchiveData
|
||||
}
|
||||
|
||||
var schema string
|
||||
var codec *goavro.Codec
|
||||
recordList := make([]map[string]any, 0)
|
||||
|
||||
var f *os.File
|
||||
|
||||
filePath := dir + fmt.Sprintf("_%d.avro", from)
|
||||
|
||||
var err error
|
||||
|
||||
fp_, err_ := os.Stat(filePath)
|
||||
if errors.Is(err_, os.ErrNotExist) {
|
||||
err = os.MkdirAll(path.Dir(dir), 0o755)
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to create directory: %v", err)
|
||||
}
|
||||
} else if fp_.Size() != 0 {
|
||||
f, err = os.Open(filePath)
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to open existing avro file: %v", err)
|
||||
}
|
||||
|
||||
br := bufio.NewReader(f)
|
||||
|
||||
reader, err := goavro.NewOCFReader(br)
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to create OCF reader: %v", err)
|
||||
}
|
||||
codec = reader.Codec()
|
||||
schema = codec.Schema()
|
||||
|
||||
f.Close()
|
||||
}
|
||||
|
||||
timeRef := time.Now().Add(time.Duration(-CheckpointBufferMinutes+1) * time.Minute).Unix()
|
||||
|
||||
if dumpAll {
|
||||
timeRef = time.Now().Unix()
|
||||
}
|
||||
|
||||
// Empty values
|
||||
if len(l.data) == 0 {
|
||||
// we checkpoint avro files every 60 seconds
|
||||
repeat := 60 / intRes
|
||||
|
||||
for range repeat {
|
||||
recordList = append(recordList, make(map[string]any))
|
||||
}
|
||||
}
|
||||
|
||||
readFlag := true
|
||||
|
||||
for ts := range l.data {
|
||||
flag := false
|
||||
if ts < timeRef {
|
||||
data := l.data[ts]
|
||||
|
||||
schemaGen, err := generateSchema(data)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
flag, schema, err = compareSchema(schema, schemaGen)
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to compare read and generated schema: %v", err)
|
||||
}
|
||||
if flag && readFlag && !errors.Is(err_, os.ErrNotExist) {
|
||||
|
||||
f.Close()
|
||||
|
||||
f, err = os.Open(filePath)
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to open Avro file: %v", err)
|
||||
}
|
||||
|
||||
br := bufio.NewReader(f)
|
||||
|
||||
ocfReader, err := goavro.NewOCFReader(br)
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to create OCF reader while changing schema: %v", err)
|
||||
}
|
||||
|
||||
for ocfReader.Scan() {
|
||||
record, err := ocfReader.Read()
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to read record: %v", err)
|
||||
}
|
||||
|
||||
recordList = append(recordList, record.(map[string]any))
|
||||
}
|
||||
|
||||
f.Close()
|
||||
|
||||
err = os.Remove(filePath)
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to delete file: %v", err)
|
||||
}
|
||||
|
||||
readFlag = false
|
||||
}
|
||||
codec, err = goavro.NewCodec(schema)
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to create codec after merged schema: %v", err)
|
||||
}
|
||||
|
||||
recordList = append(recordList, generateRecord(data))
|
||||
delete(l.data, ts)
|
||||
}
|
||||
}
|
||||
|
||||
if len(recordList) == 0 {
|
||||
return ErrNoNewArchiveData
|
||||
}
|
||||
|
||||
f, err = os.OpenFile(filePath, os.O_CREATE|os.O_APPEND|os.O_RDWR, 0o644)
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to append new avro file: %v", err)
|
||||
}
|
||||
|
||||
// fmt.Printf("Codec : %#v\n", codec)
|
||||
|
||||
writer, err := goavro.NewOCFWriter(goavro.OCFConfig{
|
||||
W: f,
|
||||
Codec: codec,
|
||||
CompressionName: goavro.CompressionDeflateLabel,
|
||||
})
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to create OCF writer: %v", err)
|
||||
}
|
||||
|
||||
// Append the new record
|
||||
if err := writer.Append(recordList); err != nil {
|
||||
return fmt.Errorf("failed to append record: %v", err)
|
||||
}
|
||||
|
||||
f.Close()
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func compareSchema(schemaRead, schemaGen string) (bool, string, error) {
|
||||
var genSchema, readSchema AvroSchema
|
||||
|
||||
if schemaRead == "" {
|
||||
return false, schemaGen, nil
|
||||
}
|
||||
|
||||
// Unmarshal the schema strings into AvroSchema structs
|
||||
if err := json.Unmarshal([]byte(schemaGen), &genSchema); err != nil {
|
||||
return false, "", fmt.Errorf("failed to parse generated schema: %v", err)
|
||||
}
|
||||
if err := json.Unmarshal([]byte(schemaRead), &readSchema); err != nil {
|
||||
return false, "", fmt.Errorf("failed to parse read schema: %v", err)
|
||||
}
|
||||
|
||||
sort.Slice(genSchema.Fields, func(i, j int) bool {
|
||||
return genSchema.Fields[i].Name < genSchema.Fields[j].Name
|
||||
})
|
||||
|
||||
sort.Slice(readSchema.Fields, func(i, j int) bool {
|
||||
return readSchema.Fields[i].Name < readSchema.Fields[j].Name
|
||||
})
|
||||
|
||||
// Check if schemas are identical
|
||||
schemasEqual := true
|
||||
if len(genSchema.Fields) <= len(readSchema.Fields) {
|
||||
|
||||
for i := range genSchema.Fields {
|
||||
if genSchema.Fields[i].Name != readSchema.Fields[i].Name {
|
||||
schemasEqual = false
|
||||
break
|
||||
}
|
||||
}
|
||||
|
||||
// If schemas are identical, return the read schema
|
||||
if schemasEqual {
|
||||
return false, schemaRead, nil
|
||||
}
|
||||
}
|
||||
|
||||
// Create a map to hold unique fields from both schemas
|
||||
fieldMap := make(map[string]AvroField)
|
||||
|
||||
// Add fields from the read schema
|
||||
for _, field := range readSchema.Fields {
|
||||
fieldMap[field.Name] = field
|
||||
}
|
||||
|
||||
// Add or update fields from the generated schema
|
||||
for _, field := range genSchema.Fields {
|
||||
fieldMap[field.Name] = field
|
||||
}
|
||||
|
||||
// Create a union schema by collecting fields from the map
|
||||
var mergedFields []AvroField
|
||||
for _, field := range fieldMap {
|
||||
mergedFields = append(mergedFields, field)
|
||||
}
|
||||
|
||||
// Sort fields by name for consistency
|
||||
sort.Slice(mergedFields, func(i, j int) bool {
|
||||
return mergedFields[i].Name < mergedFields[j].Name
|
||||
})
|
||||
|
||||
// Create the merged schema
|
||||
mergedSchema := AvroSchema{
|
||||
Type: "record",
|
||||
Name: genSchema.Name,
|
||||
Fields: mergedFields,
|
||||
}
|
||||
|
||||
// Check if schemas are identical
|
||||
schemasEqual = len(mergedSchema.Fields) == len(readSchema.Fields)
|
||||
if schemasEqual {
|
||||
for i := range mergedSchema.Fields {
|
||||
if mergedSchema.Fields[i].Name != readSchema.Fields[i].Name {
|
||||
schemasEqual = false
|
||||
break
|
||||
}
|
||||
}
|
||||
|
||||
if schemasEqual {
|
||||
return false, schemaRead, nil
|
||||
}
|
||||
}
|
||||
|
||||
// Marshal the merged schema back to JSON
|
||||
mergedSchemaJSON, err := json.Marshal(mergedSchema)
|
||||
if err != nil {
|
||||
return false, "", fmt.Errorf("failed to marshal merged schema: %v", err)
|
||||
}
|
||||
|
||||
return true, string(mergedSchemaJSON), nil
|
||||
}
|
||||
|
||||
func generateSchema(data map[string]schema.Float) (string, error) {
|
||||
// Define the Avro schema structure
|
||||
schema := map[string]any{
|
||||
"type": "record",
|
||||
"name": "DataRecord",
|
||||
"fields": []map[string]any{},
|
||||
}
|
||||
|
||||
fieldTracker := make(map[string]struct{})
|
||||
|
||||
for key := range data {
|
||||
if _, exists := fieldTracker[key]; !exists {
|
||||
key = correctKey(key)
|
||||
|
||||
field := map[string]any{
|
||||
"name": key,
|
||||
"type": "double",
|
||||
"default": -1.0,
|
||||
}
|
||||
schema["fields"] = append(schema["fields"].([]map[string]any), field)
|
||||
fieldTracker[key] = struct{}{}
|
||||
}
|
||||
}
|
||||
|
||||
schemaString, err := json.Marshal(schema)
|
||||
if err != nil {
|
||||
return "", fmt.Errorf("failed to marshal schema: %v", err)
|
||||
}
|
||||
|
||||
return string(schemaString), nil
|
||||
}
|
||||
|
||||
func generateRecord(data map[string]schema.Float) map[string]any {
|
||||
record := make(map[string]any)
|
||||
|
||||
// Iterate through each map in data
|
||||
for key, value := range data {
|
||||
key = correctKey(key)
|
||||
|
||||
// Set the value in the record
|
||||
// avro only accepts basic types
|
||||
record[key] = value.Double()
|
||||
}
|
||||
|
||||
return record
|
||||
}
|
||||
|
||||
func correctKey(key string) string {
|
||||
// Replace any invalid characters in the key
|
||||
// For example, replace spaces with underscores
|
||||
key = strings.ReplaceAll(key, ":", "___")
|
||||
key = strings.ReplaceAll(key, ".", "__")
|
||||
|
||||
return key
|
||||
}
|
||||
|
||||
func ReplaceKey(key string) string {
|
||||
// Replace any invalid characters in the key
|
||||
// For example, replace spaces with underscores
|
||||
key = strings.ReplaceAll(key, "___", ":")
|
||||
key = strings.ReplaceAll(key, "__", ".")
|
||||
|
||||
return key
|
||||
}
|
||||
@@ -1,84 +0,0 @@
|
||||
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||
// All rights reserved. This file is part of cc-backend.
|
||||
// Use of this source code is governed by a MIT-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
package memorystore
|
||||
|
||||
import (
|
||||
"context"
|
||||
"slices"
|
||||
"strconv"
|
||||
"sync"
|
||||
|
||||
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
|
||||
)
|
||||
|
||||
func DataStaging(wg *sync.WaitGroup, ctx context.Context) {
|
||||
// AvroPool is a pool of Avro writers.
|
||||
go func() {
|
||||
if Keys.Checkpoints.FileFormat == "json" {
|
||||
wg.Done() // Mark this goroutine as done
|
||||
return // Exit the goroutine
|
||||
}
|
||||
|
||||
defer wg.Done()
|
||||
|
||||
var avroLevel *AvroLevel
|
||||
oldSelector := make([]string, 0)
|
||||
|
||||
for {
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
return
|
||||
case val := <-LineProtocolMessages:
|
||||
// Fetch the frequency of the metric from the global configuration
|
||||
freq, err := GetMetricFrequency(val.MetricName)
|
||||
if err != nil {
|
||||
cclog.Errorf("Error fetching metric frequency: %s\n", err)
|
||||
continue
|
||||
}
|
||||
|
||||
metricName := ""
|
||||
|
||||
for _, selectorName := range val.Selector {
|
||||
metricName += selectorName + Delimiter
|
||||
}
|
||||
|
||||
metricName += val.MetricName
|
||||
|
||||
// Create a new selector for the Avro level
|
||||
// The selector is a slice of strings that represents the path to the
|
||||
// Avro level. It is created by appending the cluster, node, and metric
|
||||
// name to the selector.
|
||||
var selector []string
|
||||
selector = append(selector, val.Cluster, val.Node, strconv.FormatInt(freq, 10))
|
||||
|
||||
if !testEq(oldSelector, selector) {
|
||||
// Get the Avro level for the metric
|
||||
avroLevel = avroStore.root.findAvroLevelOrCreate(selector)
|
||||
|
||||
// If the Avro level is nil, create a new one
|
||||
if avroLevel == nil {
|
||||
cclog.Errorf("Error creating or finding the level with cluster : %s, node : %s, metric : %s\n", val.Cluster, val.Node, val.MetricName)
|
||||
}
|
||||
oldSelector = slices.Clone(selector)
|
||||
}
|
||||
|
||||
avroLevel.addMetric(metricName, val.Value, val.Timestamp, int(freq))
|
||||
}
|
||||
}
|
||||
}()
|
||||
}
|
||||
|
||||
func testEq(a, b []string) bool {
|
||||
if len(a) != len(b) {
|
||||
return false
|
||||
}
|
||||
for i := range a {
|
||||
if a[i] != b[i] {
|
||||
return false
|
||||
}
|
||||
}
|
||||
return true
|
||||
}
|
||||
@@ -1,168 +0,0 @@
|
||||
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||
// All rights reserved. This file is part of cc-backend.
|
||||
// Use of this source code is governed by a MIT-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
package memorystore
|
||||
|
||||
import (
|
||||
"sync"
|
||||
|
||||
"github.com/ClusterCockpit/cc-lib/schema"
|
||||
)
|
||||
|
||||
var (
|
||||
LineProtocolMessages = make(chan *AvroStruct)
|
||||
Delimiter = "ZZZZZ"
|
||||
)
|
||||
|
||||
// CheckpointBufferMinutes should always be in minutes.
|
||||
// Its controls the amount of data to hold for given amount of time.
|
||||
var CheckpointBufferMinutes = 3
|
||||
|
||||
type AvroStruct struct {
|
||||
MetricName string
|
||||
Cluster string
|
||||
Node string
|
||||
Selector []string
|
||||
Value schema.Float
|
||||
Timestamp int64
|
||||
}
|
||||
|
||||
type AvroStore struct {
|
||||
root AvroLevel
|
||||
}
|
||||
|
||||
var avroStore AvroStore
|
||||
|
||||
type AvroLevel struct {
|
||||
children map[string]*AvroLevel
|
||||
data map[int64]map[string]schema.Float
|
||||
lock sync.RWMutex
|
||||
}
|
||||
|
||||
type AvroField struct {
|
||||
Name string `json:"name"`
|
||||
Type any `json:"type"`
|
||||
Default any `json:"default,omitempty"`
|
||||
}
|
||||
|
||||
type AvroSchema struct {
|
||||
Type string `json:"type"`
|
||||
Name string `json:"name"`
|
||||
Fields []AvroField `json:"fields"`
|
||||
}
|
||||
|
||||
func (l *AvroLevel) findAvroLevelOrCreate(selector []string) *AvroLevel {
|
||||
if len(selector) == 0 {
|
||||
return l
|
||||
}
|
||||
|
||||
// Allow concurrent reads:
|
||||
l.lock.RLock()
|
||||
var child *AvroLevel
|
||||
var ok bool
|
||||
if l.children == nil {
|
||||
// Children map needs to be created...
|
||||
l.lock.RUnlock()
|
||||
} else {
|
||||
child, ok := l.children[selector[0]]
|
||||
l.lock.RUnlock()
|
||||
if ok {
|
||||
return child.findAvroLevelOrCreate(selector[1:])
|
||||
}
|
||||
}
|
||||
|
||||
// The level does not exist, take write lock for unqiue access:
|
||||
l.lock.Lock()
|
||||
// While this thread waited for the write lock, another thread
|
||||
// could have created the child node.
|
||||
if l.children != nil {
|
||||
child, ok = l.children[selector[0]]
|
||||
if ok {
|
||||
l.lock.Unlock()
|
||||
return child.findAvroLevelOrCreate(selector[1:])
|
||||
}
|
||||
}
|
||||
|
||||
child = &AvroLevel{
|
||||
data: make(map[int64]map[string]schema.Float, 0),
|
||||
children: nil,
|
||||
}
|
||||
|
||||
if l.children != nil {
|
||||
l.children[selector[0]] = child
|
||||
} else {
|
||||
l.children = map[string]*AvroLevel{selector[0]: child}
|
||||
}
|
||||
l.lock.Unlock()
|
||||
return child.findAvroLevelOrCreate(selector[1:])
|
||||
}
|
||||
|
||||
func (l *AvroLevel) addMetric(metricName string, value schema.Float, timestamp int64, Freq int) {
|
||||
l.lock.Lock()
|
||||
defer l.lock.Unlock()
|
||||
|
||||
KeyCounter := int(CheckpointBufferMinutes * 60 / Freq)
|
||||
|
||||
// Create keys in advance for the given amount of time
|
||||
if len(l.data) != KeyCounter {
|
||||
if len(l.data) == 0 {
|
||||
for i := range KeyCounter {
|
||||
l.data[timestamp+int64(i*Freq)] = make(map[string]schema.Float, 0)
|
||||
}
|
||||
} else {
|
||||
// Get the last timestamp
|
||||
var lastTS int64
|
||||
for ts := range l.data {
|
||||
if ts > lastTS {
|
||||
lastTS = ts
|
||||
}
|
||||
}
|
||||
// Create keys for the next KeyCounter timestamps
|
||||
l.data[lastTS+int64(Freq)] = make(map[string]schema.Float, 0)
|
||||
}
|
||||
}
|
||||
|
||||
closestTS := int64(0)
|
||||
minDiff := int64(Freq) + 1 // Start with diff just outside the valid range
|
||||
found := false
|
||||
|
||||
// Iterate over timestamps and choose the one which is within range.
|
||||
// Since its epoch time, we check if the difference is less than 60 seconds.
|
||||
for ts, dat := range l.data {
|
||||
// Check if timestamp is within range
|
||||
diff := timestamp - ts
|
||||
if diff < -int64(Freq) || diff > int64(Freq) {
|
||||
continue
|
||||
}
|
||||
|
||||
// Metric already present at this timestamp — skip
|
||||
if _, ok := dat[metricName]; ok {
|
||||
continue
|
||||
}
|
||||
|
||||
// Check if this is the closest timestamp so far
|
||||
if Abs(diff) < minDiff {
|
||||
minDiff = Abs(diff)
|
||||
closestTS = ts
|
||||
found = true
|
||||
}
|
||||
}
|
||||
|
||||
if found {
|
||||
l.data[closestTS][metricName] = value
|
||||
}
|
||||
}
|
||||
|
||||
func GetAvroStore() *AvroStore {
|
||||
return &avroStore
|
||||
}
|
||||
|
||||
// Abs returns the absolute value of x.
|
||||
func Abs(x int64) int64 {
|
||||
if x < 0 {
|
||||
return -x
|
||||
}
|
||||
return x
|
||||
}
|
||||
@@ -1,198 +0,0 @@
|
||||
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||
// All rights reserved. This file is part of cc-backend.
|
||||
// Use of this source code is governed by a MIT-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
package memorystore
|
||||
|
||||
import (
|
||||
"errors"
|
||||
"sync"
|
||||
|
||||
"github.com/ClusterCockpit/cc-lib/schema"
|
||||
)
|
||||
|
||||
// Default buffer capacity.
|
||||
// `buffer.data` will only ever grow up to it's capacity and a new link
|
||||
// in the buffer chain will be created if needed so that no copying
|
||||
// of data or reallocation needs to happen on writes.
|
||||
const (
|
||||
BufferCap int = 512
|
||||
)
|
||||
|
||||
// So that we can reuse allocations
|
||||
var bufferPool sync.Pool = sync.Pool{
|
||||
New: func() any {
|
||||
return &buffer{
|
||||
data: make([]schema.Float, 0, BufferCap),
|
||||
}
|
||||
},
|
||||
}
|
||||
|
||||
var (
|
||||
ErrNoData error = errors.New("[METRICSTORE]> no data for this metric/level")
|
||||
ErrDataDoesNotAlign error = errors.New("[METRICSTORE]> data from lower granularities does not align")
|
||||
)
|
||||
|
||||
// Each metric on each level has it's own buffer.
|
||||
// This is where the actual values go.
|
||||
// If `cap(data)` is reached, a new buffer is created and
|
||||
// becomes the new head of a buffer list.
|
||||
type buffer struct {
|
||||
prev *buffer
|
||||
next *buffer
|
||||
data []schema.Float
|
||||
frequency int64
|
||||
start int64
|
||||
archived bool
|
||||
closed bool
|
||||
}
|
||||
|
||||
func newBuffer(ts, freq int64) *buffer {
|
||||
b := bufferPool.Get().(*buffer)
|
||||
b.frequency = freq
|
||||
b.start = ts - (freq / 2)
|
||||
b.prev = nil
|
||||
b.next = nil
|
||||
b.archived = false
|
||||
b.closed = false
|
||||
b.data = b.data[:0]
|
||||
return b
|
||||
}
|
||||
|
||||
// If a new buffer was created, the new head is returnd.
|
||||
// Otherwise, the existing buffer is returnd.
|
||||
// Normaly, only "newer" data should be written, but if the value would
|
||||
// end up in the same buffer anyways it is allowed.
|
||||
func (b *buffer) write(ts int64, value schema.Float) (*buffer, error) {
|
||||
if ts < b.start {
|
||||
return nil, errors.New("[METRICSTORE]> cannot write value to buffer from past")
|
||||
}
|
||||
|
||||
// idx := int((ts - b.start + (b.frequency / 3)) / b.frequency)
|
||||
idx := int((ts - b.start) / b.frequency)
|
||||
if idx >= cap(b.data) {
|
||||
newbuf := newBuffer(ts, b.frequency)
|
||||
newbuf.prev = b
|
||||
b.next = newbuf
|
||||
b.close()
|
||||
b = newbuf
|
||||
idx = 0
|
||||
}
|
||||
|
||||
// Overwriting value or writing value from past
|
||||
if idx < len(b.data) {
|
||||
b.data[idx] = value
|
||||
return b, nil
|
||||
}
|
||||
|
||||
// Fill up unwritten slots with NaN
|
||||
for i := len(b.data); i < idx; i++ {
|
||||
b.data = append(b.data, schema.NaN)
|
||||
}
|
||||
|
||||
b.data = append(b.data, value)
|
||||
return b, nil
|
||||
}
|
||||
|
||||
func (b *buffer) end() int64 {
|
||||
return b.firstWrite() + int64(len(b.data))*b.frequency
|
||||
}
|
||||
|
||||
func (b *buffer) firstWrite() int64 {
|
||||
return b.start + (b.frequency / 2)
|
||||
}
|
||||
|
||||
func (b *buffer) close() {}
|
||||
|
||||
// Return all known values from `from` to `to`. Gaps of information are represented as NaN.
|
||||
// Simple linear interpolation is done between the two neighboring cells if possible.
|
||||
// If values at the start or end are missing, instead of NaN values, the second and thrid
|
||||
// return values contain the actual `from`/`to`.
|
||||
// This function goes back the buffer chain if `from` is older than the currents buffer start.
|
||||
// The loaded values are added to `data` and `data` is returned, possibly with a shorter length.
|
||||
// If `data` is not long enough to hold all values, this function will panic!
|
||||
func (b *buffer) read(from, to int64, data []schema.Float) ([]schema.Float, int64, int64, error) {
|
||||
if from < b.firstWrite() {
|
||||
if b.prev != nil {
|
||||
return b.prev.read(from, to, data)
|
||||
}
|
||||
from = b.firstWrite()
|
||||
}
|
||||
|
||||
i := 0
|
||||
t := from
|
||||
for ; t < to; t += b.frequency {
|
||||
idx := int((t - b.start) / b.frequency)
|
||||
if idx >= cap(b.data) {
|
||||
if b.next == nil {
|
||||
break
|
||||
}
|
||||
b = b.next
|
||||
idx = 0
|
||||
}
|
||||
|
||||
if idx >= len(b.data) {
|
||||
if b.next == nil || to <= b.next.start {
|
||||
break
|
||||
}
|
||||
data[i] += schema.NaN
|
||||
} else if t < b.start {
|
||||
data[i] += schema.NaN
|
||||
// } else if b.data[idx].IsNaN() {
|
||||
// data[i] += interpolate(idx, b.data)
|
||||
} else {
|
||||
data[i] += b.data[idx]
|
||||
}
|
||||
i++
|
||||
}
|
||||
|
||||
return data[:i], from, t, nil
|
||||
}
|
||||
|
||||
// Returns true if this buffer needs to be freed.
|
||||
func (b *buffer) free(t int64) (delme bool, n int) {
|
||||
if b.prev != nil {
|
||||
delme, m := b.prev.free(t)
|
||||
n += m
|
||||
if delme {
|
||||
b.prev.next = nil
|
||||
if cap(b.prev.data) == BufferCap {
|
||||
bufferPool.Put(b.prev)
|
||||
}
|
||||
b.prev = nil
|
||||
}
|
||||
}
|
||||
|
||||
end := b.end()
|
||||
if end < t {
|
||||
return true, n + 1
|
||||
}
|
||||
|
||||
return false, n
|
||||
}
|
||||
|
||||
// Call `callback` on every buffer that contains data in the range from `from` to `to`.
|
||||
func (b *buffer) iterFromTo(from, to int64, callback func(b *buffer) error) error {
|
||||
if b == nil {
|
||||
return nil
|
||||
}
|
||||
|
||||
if err := b.prev.iterFromTo(from, to, callback); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
if from <= b.end() && b.start <= to {
|
||||
return callback(b)
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func (b *buffer) count() int64 {
|
||||
res := int64(len(b.data))
|
||||
if b.prev != nil {
|
||||
res += b.prev.count()
|
||||
}
|
||||
return res
|
||||
}
|
||||
@@ -1,783 +0,0 @@
|
||||
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||
// All rights reserved. This file is part of cc-backend.
|
||||
// Use of this source code is governed by a MIT-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
package memorystore
|
||||
|
||||
import (
|
||||
"bufio"
|
||||
"context"
|
||||
"encoding/json"
|
||||
"errors"
|
||||
"fmt"
|
||||
"io/fs"
|
||||
"os"
|
||||
"path"
|
||||
"path/filepath"
|
||||
"runtime"
|
||||
"sort"
|
||||
"strconv"
|
||||
"strings"
|
||||
"sync"
|
||||
"sync/atomic"
|
||||
"time"
|
||||
|
||||
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
|
||||
"github.com/ClusterCockpit/cc-lib/schema"
|
||||
"github.com/linkedin/goavro/v2"
|
||||
)
|
||||
|
||||
// File operation constants
|
||||
const (
|
||||
// CheckpointFilePerms defines default permissions for checkpoint files
|
||||
CheckpointFilePerms = 0o644
|
||||
// CheckpointDirPerms defines default permissions for checkpoint directories
|
||||
CheckpointDirPerms = 0o755
|
||||
// GCTriggerInterval determines how often GC is forced during checkpoint loading
|
||||
// GC is triggered every GCTriggerInterval*NumWorkers loaded hosts
|
||||
GCTriggerInterval = 100
|
||||
)
|
||||
|
||||
// Whenever changed, update MarshalJSON as well!
|
||||
type CheckpointMetrics struct {
|
||||
Data []schema.Float `json:"data"`
|
||||
Frequency int64 `json:"frequency"`
|
||||
Start int64 `json:"start"`
|
||||
}
|
||||
|
||||
type CheckpointFile struct {
|
||||
Metrics map[string]*CheckpointMetrics `json:"metrics"`
|
||||
Children map[string]*CheckpointFile `json:"children"`
|
||||
From int64 `json:"from"`
|
||||
To int64 `json:"to"`
|
||||
}
|
||||
|
||||
var lastCheckpoint time.Time
|
||||
|
||||
func Checkpointing(wg *sync.WaitGroup, ctx context.Context) {
|
||||
lastCheckpoint = time.Now()
|
||||
|
||||
if Keys.Checkpoints.FileFormat == "json" {
|
||||
ms := GetMemoryStore()
|
||||
|
||||
go func() {
|
||||
defer wg.Done()
|
||||
d, err := time.ParseDuration(Keys.Checkpoints.Interval)
|
||||
if err != nil {
|
||||
cclog.Fatal(err)
|
||||
}
|
||||
if d <= 0 {
|
||||
return
|
||||
}
|
||||
|
||||
ticks := func() <-chan time.Time {
|
||||
if d <= 0 {
|
||||
return nil
|
||||
}
|
||||
return time.NewTicker(d).C
|
||||
}()
|
||||
for {
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
return
|
||||
case <-ticks:
|
||||
cclog.Infof("[METRICSTORE]> start checkpointing (starting at %s)...", lastCheckpoint.Format(time.RFC3339))
|
||||
now := time.Now()
|
||||
n, err := ms.ToCheckpoint(Keys.Checkpoints.RootDir,
|
||||
lastCheckpoint.Unix(), now.Unix())
|
||||
if err != nil {
|
||||
cclog.Errorf("[METRICSTORE]> checkpointing failed: %s", err.Error())
|
||||
} else {
|
||||
cclog.Infof("[METRICSTORE]> done: %d checkpoint files created", n)
|
||||
lastCheckpoint = now
|
||||
}
|
||||
}
|
||||
}
|
||||
}()
|
||||
} else {
|
||||
go func() {
|
||||
defer wg.Done()
|
||||
d, _ := time.ParseDuration("1m")
|
||||
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
return
|
||||
case <-time.After(time.Duration(CheckpointBufferMinutes) * time.Minute):
|
||||
// This is the first tick untill we collect the data for given minutes.
|
||||
GetAvroStore().ToCheckpoint(Keys.Checkpoints.RootDir, false)
|
||||
// log.Printf("Checkpointing %d avro files", count)
|
||||
|
||||
}
|
||||
|
||||
ticks := func() <-chan time.Time {
|
||||
if d <= 0 {
|
||||
return nil
|
||||
}
|
||||
return time.NewTicker(d).C
|
||||
}()
|
||||
|
||||
for {
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
return
|
||||
case <-ticks:
|
||||
// Regular ticks of 1 minute to write data.
|
||||
GetAvroStore().ToCheckpoint(Keys.Checkpoints.RootDir, false)
|
||||
// log.Printf("Checkpointing %d avro files", count)
|
||||
}
|
||||
}
|
||||
}()
|
||||
}
|
||||
}
|
||||
|
||||
// As `Float` implements a custom MarshalJSON() function,
|
||||
// serializing an array of such types has more overhead
|
||||
// than one would assume (because of extra allocations, interfaces and so on).
|
||||
func (cm *CheckpointMetrics) MarshalJSON() ([]byte, error) {
|
||||
buf := make([]byte, 0, 128+len(cm.Data)*8)
|
||||
buf = append(buf, `{"frequency":`...)
|
||||
buf = strconv.AppendInt(buf, cm.Frequency, 10)
|
||||
buf = append(buf, `,"start":`...)
|
||||
buf = strconv.AppendInt(buf, cm.Start, 10)
|
||||
buf = append(buf, `,"data":[`...)
|
||||
for i, x := range cm.Data {
|
||||
if i != 0 {
|
||||
buf = append(buf, ',')
|
||||
}
|
||||
if x.IsNaN() {
|
||||
buf = append(buf, `null`...)
|
||||
} else {
|
||||
buf = strconv.AppendFloat(buf, float64(x), 'f', 1, 32)
|
||||
}
|
||||
}
|
||||
buf = append(buf, `]}`...)
|
||||
return buf, nil
|
||||
}
|
||||
|
||||
// Metrics stored at the lowest 2 levels are not stored away (root and cluster)!
|
||||
// On a per-host basis a new JSON file is created. I have no idea if this will scale.
|
||||
// The good thing: Only a host at a time is locked, so this function can run
|
||||
// in parallel to writes/reads.
|
||||
func (m *MemoryStore) ToCheckpoint(dir string, from, to int64) (int, error) {
|
||||
levels := make([]*Level, 0)
|
||||
selectors := make([][]string, 0)
|
||||
m.root.lock.RLock()
|
||||
for sel1, l1 := range m.root.children {
|
||||
l1.lock.RLock()
|
||||
for sel2, l2 := range l1.children {
|
||||
levels = append(levels, l2)
|
||||
selectors = append(selectors, []string{sel1, sel2})
|
||||
}
|
||||
l1.lock.RUnlock()
|
||||
}
|
||||
m.root.lock.RUnlock()
|
||||
|
||||
type workItem struct {
|
||||
level *Level
|
||||
dir string
|
||||
selector []string
|
||||
}
|
||||
|
||||
n, errs := int32(0), int32(0)
|
||||
|
||||
var wg sync.WaitGroup
|
||||
wg.Add(Keys.NumWorkers)
|
||||
work := make(chan workItem, Keys.NumWorkers*2)
|
||||
for worker := 0; worker < Keys.NumWorkers; worker++ {
|
||||
go func() {
|
||||
defer wg.Done()
|
||||
|
||||
for workItem := range work {
|
||||
if err := workItem.level.toCheckpoint(workItem.dir, from, to, m); err != nil {
|
||||
if err == ErrNoNewArchiveData {
|
||||
continue
|
||||
}
|
||||
|
||||
cclog.Errorf("[METRICSTORE]> error while checkpointing %#v: %s", workItem.selector, err.Error())
|
||||
atomic.AddInt32(&errs, 1)
|
||||
} else {
|
||||
atomic.AddInt32(&n, 1)
|
||||
}
|
||||
}
|
||||
}()
|
||||
}
|
||||
|
||||
for i := 0; i < len(levels); i++ {
|
||||
dir := path.Join(dir, path.Join(selectors[i]...))
|
||||
work <- workItem{
|
||||
level: levels[i],
|
||||
dir: dir,
|
||||
selector: selectors[i],
|
||||
}
|
||||
}
|
||||
|
||||
close(work)
|
||||
wg.Wait()
|
||||
|
||||
if errs > 0 {
|
||||
return int(n), fmt.Errorf("[METRICSTORE]> %d errors happened while creating checkpoints (%d successes)", errs, n)
|
||||
}
|
||||
return int(n), nil
|
||||
}
|
||||
|
||||
func (l *Level) toCheckpointFile(from, to int64, m *MemoryStore) (*CheckpointFile, error) {
|
||||
l.lock.RLock()
|
||||
defer l.lock.RUnlock()
|
||||
|
||||
retval := &CheckpointFile{
|
||||
From: from,
|
||||
To: to,
|
||||
Metrics: make(map[string]*CheckpointMetrics),
|
||||
Children: make(map[string]*CheckpointFile),
|
||||
}
|
||||
|
||||
for metric, minfo := range m.Metrics {
|
||||
b := l.metrics[minfo.offset]
|
||||
if b == nil {
|
||||
continue
|
||||
}
|
||||
|
||||
allArchived := true
|
||||
b.iterFromTo(from, to, func(b *buffer) error {
|
||||
if !b.archived {
|
||||
allArchived = false
|
||||
}
|
||||
return nil
|
||||
})
|
||||
|
||||
if allArchived {
|
||||
continue
|
||||
}
|
||||
|
||||
data := make([]schema.Float, (to-from)/b.frequency+1)
|
||||
data, start, end, err := b.read(from, to, data)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
for i := int((end - start) / b.frequency); i < len(data); i++ {
|
||||
data[i] = schema.NaN
|
||||
}
|
||||
|
||||
retval.Metrics[metric] = &CheckpointMetrics{
|
||||
Frequency: b.frequency,
|
||||
Start: start,
|
||||
Data: data,
|
||||
}
|
||||
}
|
||||
|
||||
for name, child := range l.children {
|
||||
val, err := child.toCheckpointFile(from, to, m)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
if val != nil {
|
||||
retval.Children[name] = val
|
||||
}
|
||||
}
|
||||
|
||||
if len(retval.Children) == 0 && len(retval.Metrics) == 0 {
|
||||
return nil, nil
|
||||
}
|
||||
|
||||
return retval, nil
|
||||
}
|
||||
|
||||
func (l *Level) toCheckpoint(dir string, from, to int64, m *MemoryStore) error {
|
||||
cf, err := l.toCheckpointFile(from, to, m)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
if cf == nil {
|
||||
return ErrNoNewArchiveData
|
||||
}
|
||||
|
||||
filepath := path.Join(dir, fmt.Sprintf("%d.json", from))
|
||||
f, err := os.OpenFile(filepath, os.O_CREATE|os.O_WRONLY, CheckpointFilePerms)
|
||||
if err != nil && os.IsNotExist(err) {
|
||||
err = os.MkdirAll(dir, CheckpointDirPerms)
|
||||
if err == nil {
|
||||
f, err = os.OpenFile(filepath, os.O_CREATE|os.O_WRONLY, CheckpointFilePerms)
|
||||
}
|
||||
}
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
defer f.Close()
|
||||
|
||||
bw := bufio.NewWriter(f)
|
||||
if err = json.NewEncoder(bw).Encode(cf); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
return bw.Flush()
|
||||
}
|
||||
|
||||
func (m *MemoryStore) FromCheckpoint(dir string, from int64, extension string) (int, error) {
|
||||
var wg sync.WaitGroup
|
||||
work := make(chan [2]string, Keys.NumWorkers)
|
||||
n, errs := int32(0), int32(0)
|
||||
|
||||
wg.Add(Keys.NumWorkers)
|
||||
for worker := 0; worker < Keys.NumWorkers; worker++ {
|
||||
go func() {
|
||||
defer wg.Done()
|
||||
for host := range work {
|
||||
lvl := m.root.findLevelOrCreate(host[:], len(m.Metrics))
|
||||
nn, err := lvl.fromCheckpoint(m, filepath.Join(dir, host[0], host[1]), from, extension)
|
||||
if err != nil {
|
||||
cclog.Fatalf("[METRICSTORE]> error while loading checkpoints: %s", err.Error())
|
||||
atomic.AddInt32(&errs, 1)
|
||||
}
|
||||
atomic.AddInt32(&n, int32(nn))
|
||||
}
|
||||
}()
|
||||
}
|
||||
|
||||
i := 0
|
||||
clustersDir, err := os.ReadDir(dir)
|
||||
for _, clusterDir := range clustersDir {
|
||||
if !clusterDir.IsDir() {
|
||||
err = errors.New("[METRICSTORE]> expected only directories at first level of checkpoints/ directory")
|
||||
goto done
|
||||
}
|
||||
|
||||
hostsDir, e := os.ReadDir(filepath.Join(dir, clusterDir.Name()))
|
||||
if e != nil {
|
||||
err = e
|
||||
goto done
|
||||
}
|
||||
|
||||
for _, hostDir := range hostsDir {
|
||||
if !hostDir.IsDir() {
|
||||
err = errors.New("[METRICSTORE]> expected only directories at second level of checkpoints/ directory")
|
||||
goto done
|
||||
}
|
||||
|
||||
i++
|
||||
if i%Keys.NumWorkers == 0 && i > GCTriggerInterval {
|
||||
// Forcing garbage collection runs here regulary during the loading of checkpoints
|
||||
// will decrease the total heap size after loading everything back to memory is done.
|
||||
// While loading data, the heap will grow fast, so the GC target size will double
|
||||
// almost always. By forcing GCs here, we can keep it growing more slowly so that
|
||||
// at the end, less memory is wasted.
|
||||
runtime.GC()
|
||||
}
|
||||
|
||||
work <- [2]string{clusterDir.Name(), hostDir.Name()}
|
||||
}
|
||||
}
|
||||
done:
|
||||
close(work)
|
||||
wg.Wait()
|
||||
|
||||
if err != nil {
|
||||
return int(n), err
|
||||
}
|
||||
|
||||
if errs > 0 {
|
||||
return int(n), fmt.Errorf("[METRICSTORE]> %d errors happened while creating checkpoints (%d successes)", errs, n)
|
||||
}
|
||||
return int(n), nil
|
||||
}
|
||||
|
||||
// Metrics stored at the lowest 2 levels are not loaded (root and cluster)!
|
||||
// This function can only be called once and before the very first write or read.
|
||||
// Different host's data is loaded to memory in parallel.
|
||||
func (m *MemoryStore) FromCheckpointFiles(dir string, from int64) (int, error) {
|
||||
if _, err := os.Stat(dir); os.IsNotExist(err) {
|
||||
// The directory does not exist, so create it using os.MkdirAll()
|
||||
err := os.MkdirAll(dir, CheckpointDirPerms) // CheckpointDirPerms sets the permissions for the directory
|
||||
if err != nil {
|
||||
cclog.Fatalf("[METRICSTORE]> Error creating directory: %#v\n", err)
|
||||
}
|
||||
cclog.Debugf("[METRICSTORE]> %#v Directory created successfully", dir)
|
||||
}
|
||||
|
||||
// Config read (replace with your actual config read)
|
||||
fileFormat := Keys.Checkpoints.FileFormat
|
||||
if fileFormat == "" {
|
||||
fileFormat = "avro"
|
||||
}
|
||||
|
||||
// Map to easily get the fallback format
|
||||
oppositeFormat := map[string]string{
|
||||
"json": "avro",
|
||||
"avro": "json",
|
||||
}
|
||||
|
||||
// First, attempt to load the specified format
|
||||
if found, err := checkFilesWithExtension(dir, fileFormat); err != nil {
|
||||
return 0, fmt.Errorf("[METRICSTORE]> error checking files with extension: %v", err)
|
||||
} else if found {
|
||||
cclog.Infof("[METRICSTORE]> Loading %s files because fileformat is %s", fileFormat, fileFormat)
|
||||
return m.FromCheckpoint(dir, from, fileFormat)
|
||||
}
|
||||
|
||||
// If not found, attempt the opposite format
|
||||
altFormat := oppositeFormat[fileFormat]
|
||||
if found, err := checkFilesWithExtension(dir, altFormat); err != nil {
|
||||
return 0, fmt.Errorf("[METRICSTORE]> error checking files with extension: %v", err)
|
||||
} else if found {
|
||||
cclog.Infof("[METRICSTORE]> Loading %s files but fileformat is %s", altFormat, fileFormat)
|
||||
return m.FromCheckpoint(dir, from, altFormat)
|
||||
}
|
||||
|
||||
cclog.Print("[METRICSTORE]> No valid checkpoint files found in the directory")
|
||||
return 0, nil
|
||||
}
|
||||
|
||||
func checkFilesWithExtension(dir string, extension string) (bool, error) {
|
||||
found := false
|
||||
|
||||
err := filepath.Walk(dir, func(path string, info os.FileInfo, err error) error {
|
||||
if err != nil {
|
||||
return fmt.Errorf("[METRICSTORE]> error accessing path %s: %v", path, err)
|
||||
}
|
||||
if !info.IsDir() && filepath.Ext(info.Name()) == "."+extension {
|
||||
found = true
|
||||
return nil
|
||||
}
|
||||
return nil
|
||||
})
|
||||
if err != nil {
|
||||
return false, fmt.Errorf("[METRICSTORE]> error walking through directories: %s", err)
|
||||
}
|
||||
|
||||
return found, nil
|
||||
}
|
||||
|
||||
func (l *Level) loadAvroFile(m *MemoryStore, f *os.File, from int64) error {
|
||||
br := bufio.NewReader(f)
|
||||
|
||||
fileName := f.Name()[strings.LastIndex(f.Name(), "/")+1:]
|
||||
resolution, err := strconv.ParseInt(fileName[0:strings.Index(fileName, "_")], 10, 64)
|
||||
if err != nil {
|
||||
return fmt.Errorf("[METRICSTORE]> error while reading avro file (resolution parsing) : %s", err)
|
||||
}
|
||||
|
||||
fromTimestamp, err := strconv.ParseInt(fileName[strings.Index(fileName, "_")+1:len(fileName)-5], 10, 64)
|
||||
|
||||
// Same logic according to lineprotocol
|
||||
fromTimestamp -= (resolution / 2)
|
||||
|
||||
if err != nil {
|
||||
return fmt.Errorf("[METRICSTORE]> error converting timestamp from the avro file : %s", err)
|
||||
}
|
||||
|
||||
// fmt.Printf("File : %s with resolution : %d\n", fileName, resolution)
|
||||
|
||||
var recordCounter int64 = 0
|
||||
|
||||
// Create a new OCF reader from the buffered reader
|
||||
ocfReader, err := goavro.NewOCFReader(br)
|
||||
if err != nil {
|
||||
return fmt.Errorf("[METRICSTORE]> error creating OCF reader: %w", err)
|
||||
}
|
||||
|
||||
metricsData := make(map[string]schema.FloatArray)
|
||||
|
||||
for ocfReader.Scan() {
|
||||
datum, err := ocfReader.Read()
|
||||
if err != nil {
|
||||
return fmt.Errorf("[METRICSTORE]> error while reading avro file : %s", err)
|
||||
}
|
||||
|
||||
record, ok := datum.(map[string]any)
|
||||
if !ok {
|
||||
return fmt.Errorf("[METRICSTORE]> failed to assert datum as map[string]interface{}")
|
||||
}
|
||||
|
||||
for key, value := range record {
|
||||
metricsData[key] = append(metricsData[key], schema.ConvertToFloat(value.(float64)))
|
||||
}
|
||||
|
||||
recordCounter += 1
|
||||
}
|
||||
|
||||
to := (fromTimestamp + (recordCounter / (60 / resolution) * 60))
|
||||
if to < from {
|
||||
return nil
|
||||
}
|
||||
|
||||
for key, floatArray := range metricsData {
|
||||
metricName := ReplaceKey(key)
|
||||
|
||||
if strings.Contains(metricName, Delimiter) {
|
||||
subString := strings.Split(metricName, Delimiter)
|
||||
|
||||
lvl := l
|
||||
|
||||
for i := 0; i < len(subString)-1; i++ {
|
||||
|
||||
sel := subString[i]
|
||||
|
||||
if lvl.children == nil {
|
||||
lvl.children = make(map[string]*Level)
|
||||
}
|
||||
|
||||
child, ok := lvl.children[sel]
|
||||
if !ok {
|
||||
child = &Level{
|
||||
metrics: make([]*buffer, len(m.Metrics)),
|
||||
children: nil,
|
||||
}
|
||||
lvl.children[sel] = child
|
||||
}
|
||||
lvl = child
|
||||
}
|
||||
|
||||
leafMetricName := subString[len(subString)-1]
|
||||
err = lvl.createBuffer(m, leafMetricName, floatArray, fromTimestamp, resolution)
|
||||
if err != nil {
|
||||
return fmt.Errorf("[METRICSTORE]> error while creating buffers from avroReader : %s", err)
|
||||
}
|
||||
} else {
|
||||
err = l.createBuffer(m, metricName, floatArray, fromTimestamp, resolution)
|
||||
if err != nil {
|
||||
return fmt.Errorf("[METRICSTORE]> error while creating buffers from avroReader : %s", err)
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func (l *Level) createBuffer(m *MemoryStore, metricName string, floatArray schema.FloatArray, from int64, resolution int64) error {
|
||||
n := len(floatArray)
|
||||
b := &buffer{
|
||||
frequency: resolution,
|
||||
start: from,
|
||||
data: floatArray[0:n:n],
|
||||
prev: nil,
|
||||
next: nil,
|
||||
archived: true,
|
||||
}
|
||||
b.close()
|
||||
|
||||
minfo, ok := m.Metrics[metricName]
|
||||
if !ok {
|
||||
return nil
|
||||
// return errors.New("Unkown metric: " + name)
|
||||
}
|
||||
|
||||
prev := l.metrics[minfo.offset]
|
||||
if prev == nil {
|
||||
l.metrics[minfo.offset] = b
|
||||
} else {
|
||||
if prev.start > b.start {
|
||||
return fmt.Errorf("[METRICSTORE]> buffer start time %d is before previous buffer start %d", b.start, prev.start)
|
||||
}
|
||||
|
||||
b.prev = prev
|
||||
prev.next = b
|
||||
|
||||
missingCount := ((int(b.start) - int(prev.start)) - len(prev.data)*int(b.frequency))
|
||||
if missingCount > 0 {
|
||||
missingCount /= int(b.frequency)
|
||||
|
||||
for range missingCount {
|
||||
prev.data = append(prev.data, schema.NaN)
|
||||
}
|
||||
|
||||
prev.data = prev.data[0:len(prev.data):len(prev.data)]
|
||||
}
|
||||
}
|
||||
l.metrics[minfo.offset] = b
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func (l *Level) loadJSONFile(m *MemoryStore, f *os.File, from int64) error {
|
||||
br := bufio.NewReader(f)
|
||||
cf := &CheckpointFile{}
|
||||
if err := json.NewDecoder(br).Decode(cf); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
if cf.To != 0 && cf.To < from {
|
||||
return nil
|
||||
}
|
||||
|
||||
if err := l.loadFile(cf, m); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func (l *Level) loadFile(cf *CheckpointFile, m *MemoryStore) error {
|
||||
for name, metric := range cf.Metrics {
|
||||
n := len(metric.Data)
|
||||
b := &buffer{
|
||||
frequency: metric.Frequency,
|
||||
start: metric.Start,
|
||||
data: metric.Data[0:n:n], // Space is wasted here :(
|
||||
prev: nil,
|
||||
next: nil,
|
||||
archived: true,
|
||||
}
|
||||
b.close()
|
||||
|
||||
minfo, ok := m.Metrics[name]
|
||||
if !ok {
|
||||
continue
|
||||
// return errors.New("Unkown metric: " + name)
|
||||
}
|
||||
|
||||
prev := l.metrics[minfo.offset]
|
||||
if prev == nil {
|
||||
l.metrics[minfo.offset] = b
|
||||
} else {
|
||||
if prev.start > b.start {
|
||||
return fmt.Errorf("[METRICSTORE]> buffer start time %d is before previous buffer start %d", b.start, prev.start)
|
||||
}
|
||||
|
||||
b.prev = prev
|
||||
prev.next = b
|
||||
}
|
||||
l.metrics[minfo.offset] = b
|
||||
}
|
||||
|
||||
if len(cf.Children) > 0 && l.children == nil {
|
||||
l.children = make(map[string]*Level)
|
||||
}
|
||||
|
||||
for sel, childCf := range cf.Children {
|
||||
child, ok := l.children[sel]
|
||||
if !ok {
|
||||
child = &Level{
|
||||
metrics: make([]*buffer, len(m.Metrics)),
|
||||
children: nil,
|
||||
}
|
||||
l.children[sel] = child
|
||||
}
|
||||
|
||||
if err := child.loadFile(childCf, m); err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func (l *Level) fromCheckpoint(m *MemoryStore, dir string, from int64, extension string) (int, error) {
|
||||
direntries, err := os.ReadDir(dir)
|
||||
if err != nil {
|
||||
if os.IsNotExist(err) {
|
||||
return 0, nil
|
||||
}
|
||||
|
||||
return 0, err
|
||||
}
|
||||
|
||||
allFiles := make([]fs.DirEntry, 0)
|
||||
filesLoaded := 0
|
||||
for _, e := range direntries {
|
||||
if e.IsDir() {
|
||||
child := &Level{
|
||||
metrics: make([]*buffer, len(m.Metrics)),
|
||||
children: make(map[string]*Level),
|
||||
}
|
||||
|
||||
files, err := child.fromCheckpoint(m, path.Join(dir, e.Name()), from, extension)
|
||||
filesLoaded += files
|
||||
if err != nil {
|
||||
return filesLoaded, err
|
||||
}
|
||||
|
||||
l.children[e.Name()] = child
|
||||
} else if strings.HasSuffix(e.Name(), "."+extension) {
|
||||
allFiles = append(allFiles, e)
|
||||
} else {
|
||||
continue
|
||||
}
|
||||
}
|
||||
|
||||
files, err := findFiles(allFiles, from, extension, true)
|
||||
if err != nil {
|
||||
return filesLoaded, err
|
||||
}
|
||||
|
||||
loaders := map[string]func(*MemoryStore, *os.File, int64) error{
|
||||
"json": l.loadJSONFile,
|
||||
"avro": l.loadAvroFile,
|
||||
}
|
||||
|
||||
loader := loaders[extension]
|
||||
|
||||
for _, filename := range files {
|
||||
// Use a closure to ensure file is closed immediately after use
|
||||
err := func() error {
|
||||
f, err := os.Open(path.Join(dir, filename))
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
defer f.Close()
|
||||
|
||||
return loader(m, f, from)
|
||||
}()
|
||||
if err != nil {
|
||||
return filesLoaded, err
|
||||
}
|
||||
|
||||
filesLoaded += 1
|
||||
}
|
||||
|
||||
return filesLoaded, nil
|
||||
}
|
||||
|
||||
// This will probably get very slow over time!
|
||||
// A solution could be some sort of an index file in which all other files
|
||||
// and the timespan they contain is listed.
|
||||
func findFiles(direntries []fs.DirEntry, t int64, extension string, findMoreRecentFiles bool) ([]string, error) {
|
||||
nums := map[string]int64{}
|
||||
for _, e := range direntries {
|
||||
if !strings.HasSuffix(e.Name(), "."+extension) {
|
||||
continue
|
||||
}
|
||||
|
||||
ts, err := strconv.ParseInt(e.Name()[strings.Index(e.Name(), "_")+1:len(e.Name())-5], 10, 64)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
nums[e.Name()] = ts
|
||||
}
|
||||
|
||||
sort.Slice(direntries, func(i, j int) bool {
|
||||
a, b := direntries[i], direntries[j]
|
||||
return nums[a.Name()] < nums[b.Name()]
|
||||
})
|
||||
|
||||
filenames := make([]string, 0)
|
||||
for i := range direntries {
|
||||
e := direntries[i]
|
||||
ts1 := nums[e.Name()]
|
||||
|
||||
if findMoreRecentFiles && t <= ts1 {
|
||||
filenames = append(filenames, e.Name())
|
||||
}
|
||||
if i == len(direntries)-1 {
|
||||
continue
|
||||
}
|
||||
|
||||
enext := direntries[i+1]
|
||||
ts2 := nums[enext.Name()]
|
||||
|
||||
if findMoreRecentFiles {
|
||||
if ts1 < t && t < ts2 {
|
||||
filenames = append(filenames, e.Name())
|
||||
}
|
||||
} else {
|
||||
if ts2 < t {
|
||||
filenames = append(filenames, e.Name())
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return filenames, nil
|
||||
}
|
||||
@@ -1,121 +0,0 @@
|
||||
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||
// All rights reserved. This file is part of cc-backend.
|
||||
// Use of this source code is governed by a MIT-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
package memorystore
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
)
|
||||
|
||||
var InternalCCMSFlag bool = false
|
||||
|
||||
type MetricStoreConfig struct {
|
||||
// Number of concurrent workers for checkpoint and archive operations.
|
||||
// If not set or 0, defaults to min(runtime.NumCPU()/2+1, 10)
|
||||
NumWorkers int `json:"num-workers"`
|
||||
Checkpoints struct {
|
||||
FileFormat string `json:"file-format"`
|
||||
Interval string `json:"interval"`
|
||||
RootDir string `json:"directory"`
|
||||
Restore string `json:"restore"`
|
||||
} `json:"checkpoints"`
|
||||
Debug struct {
|
||||
DumpToFile string `json:"dump-to-file"`
|
||||
EnableGops bool `json:"gops"`
|
||||
} `json:"debug"`
|
||||
RetentionInMemory string `json:"retention-in-memory"`
|
||||
Archive struct {
|
||||
Interval string `json:"interval"`
|
||||
RootDir string `json:"directory"`
|
||||
DeleteInstead bool `json:"delete-instead"`
|
||||
} `json:"archive"`
|
||||
Nats []*NatsConfig `json:"nats"`
|
||||
}
|
||||
|
||||
type NatsConfig struct {
|
||||
// Address of the nats server
|
||||
Address string `json:"address"`
|
||||
|
||||
// Username/Password, optional
|
||||
Username string `json:"username"`
|
||||
Password string `json:"password"`
|
||||
|
||||
// Creds file path
|
||||
Credsfilepath string `json:"creds-file-path"`
|
||||
|
||||
Subscriptions []struct {
|
||||
// Channel name
|
||||
SubscribeTo string `json:"subscribe-to"`
|
||||
|
||||
// Allow lines without a cluster tag, use this as default, optional
|
||||
ClusterTag string `json:"cluster-tag"`
|
||||
} `json:"subscriptions"`
|
||||
}
|
||||
|
||||
var Keys MetricStoreConfig
|
||||
|
||||
// AggregationStrategy for aggregation over multiple values at different cpus/sockets/..., not time!
|
||||
type AggregationStrategy int
|
||||
|
||||
const (
|
||||
NoAggregation AggregationStrategy = iota
|
||||
SumAggregation
|
||||
AvgAggregation
|
||||
)
|
||||
|
||||
func AssignAggregationStrategy(str string) (AggregationStrategy, error) {
|
||||
switch str {
|
||||
case "":
|
||||
return NoAggregation, nil
|
||||
case "sum":
|
||||
return SumAggregation, nil
|
||||
case "avg":
|
||||
return AvgAggregation, nil
|
||||
default:
|
||||
return NoAggregation, fmt.Errorf("[METRICSTORE]> unknown aggregation strategy: %s", str)
|
||||
}
|
||||
}
|
||||
|
||||
type MetricConfig struct {
|
||||
// Interval in seconds at which measurements are stored
|
||||
Frequency int64
|
||||
|
||||
// Can be 'sum', 'avg' or null. Describes how to aggregate metrics from the same timestep over the hierarchy.
|
||||
Aggregation AggregationStrategy
|
||||
|
||||
// Private, used internally...
|
||||
offset int
|
||||
}
|
||||
|
||||
var Metrics map[string]MetricConfig
|
||||
|
||||
func GetMetricFrequency(metricName string) (int64, error) {
|
||||
if metric, ok := Metrics[metricName]; ok {
|
||||
return metric.Frequency, nil
|
||||
}
|
||||
return 0, fmt.Errorf("[METRICSTORE]> metric %s not found", metricName)
|
||||
}
|
||||
|
||||
// AddMetric adds logic to add metrics. Redundant metrics should be updated with max frequency.
|
||||
// use metric.Name to check if the metric already exists.
|
||||
// if not, add it to the Metrics map.
|
||||
func AddMetric(name string, metric MetricConfig) error {
|
||||
if Metrics == nil {
|
||||
Metrics = make(map[string]MetricConfig, 0)
|
||||
}
|
||||
|
||||
if existingMetric, ok := Metrics[name]; ok {
|
||||
if existingMetric.Frequency != metric.Frequency {
|
||||
if existingMetric.Frequency < metric.Frequency {
|
||||
existingMetric.Frequency = metric.Frequency
|
||||
Metrics[name] = existingMetric
|
||||
}
|
||||
}
|
||||
} else {
|
||||
Metrics[name] = metric
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
@@ -1,95 +0,0 @@
|
||||
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||
// All rights reserved. This file is part of cc-backend.
|
||||
// Use of this source code is governed by a MIT-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
package memorystore
|
||||
|
||||
const configSchema = `{
|
||||
"type": "object",
|
||||
"description": "Configuration specific to built-in metric-store.",
|
||||
"properties": {
|
||||
"checkpoints": {
|
||||
"description": "Configuration for checkpointing the metrics within metric-store",
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"file-format": {
|
||||
"description": "Specify the type of checkpoint file. There are 2 variants: 'avro' and 'json'. If nothing is specified, 'avro' is default.",
|
||||
"type": "string"
|
||||
},
|
||||
"interval": {
|
||||
"description": "Interval at which the metrics should be checkpointed.",
|
||||
"type": "string"
|
||||
},
|
||||
"directory": {
|
||||
"description": "Specify the parent directy in which the checkpointed files should be placed.",
|
||||
"type": "string"
|
||||
},
|
||||
"restore": {
|
||||
"description": "When cc-backend starts up, look for checkpointed files that are less than X hours old and load metrics from these selected checkpoint files.",
|
||||
"type": "string"
|
||||
}
|
||||
}
|
||||
},
|
||||
"archive": {
|
||||
"description": "Configuration for archiving the already checkpointed files.",
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"interval": {
|
||||
"description": "Interval at which the checkpointed files should be archived.",
|
||||
"type": "string"
|
||||
},
|
||||
"directory": {
|
||||
"description": "Specify the parent directy in which the archived files should be placed.",
|
||||
"type": "string"
|
||||
}
|
||||
}
|
||||
},
|
||||
"retention-in-memory": {
|
||||
"description": "Keep the metrics within memory for given time interval. Retention for X hours, then the metrics would be freed.",
|
||||
"type": "string"
|
||||
},
|
||||
"nats": {
|
||||
"description": "Configuration for accepting published data through NATS.",
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"address": {
|
||||
"description": "Address of the NATS server.",
|
||||
"type": "string"
|
||||
},
|
||||
"username": {
|
||||
"description": "Optional: If configured with username/password method.",
|
||||
"type": "string"
|
||||
},
|
||||
"password": {
|
||||
"description": "Optional: If configured with username/password method.",
|
||||
"type": "string"
|
||||
},
|
||||
"creds-file-path": {
|
||||
"description": "Optional: If configured with Credential File method. Path to your NATS cred file.",
|
||||
"type": "string"
|
||||
},
|
||||
"subscriptions": {
|
||||
"description": "Array of various subscriptions. Allows to subscibe to different subjects and publishers.",
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"subscribe-to": {
|
||||
"description": "Channel name",
|
||||
"type": "string"
|
||||
},
|
||||
"cluster-tag": {
|
||||
"description": "Optional: Allow lines without a cluster tag, use this as default",
|
||||
"type": "string"
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}`
|
||||
@@ -1,112 +0,0 @@
|
||||
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||
// All rights reserved. This file is part of cc-backend.
|
||||
// Use of this source code is governed by a MIT-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
package memorystore
|
||||
|
||||
import (
|
||||
"bufio"
|
||||
"fmt"
|
||||
"strconv"
|
||||
)
|
||||
|
||||
func (b *buffer) debugDump(buf []byte) []byte {
|
||||
if b.prev != nil {
|
||||
buf = b.prev.debugDump(buf)
|
||||
}
|
||||
|
||||
start, len, end := b.start, len(b.data), b.start+b.frequency*int64(len(b.data))
|
||||
buf = append(buf, `{"start":`...)
|
||||
buf = strconv.AppendInt(buf, start, 10)
|
||||
buf = append(buf, `,"len":`...)
|
||||
buf = strconv.AppendInt(buf, int64(len), 10)
|
||||
buf = append(buf, `,"end":`...)
|
||||
buf = strconv.AppendInt(buf, end, 10)
|
||||
if b.archived {
|
||||
buf = append(buf, `,"saved":true`...)
|
||||
}
|
||||
if b.next != nil {
|
||||
buf = append(buf, `},`...)
|
||||
} else {
|
||||
buf = append(buf, `}`...)
|
||||
}
|
||||
return buf
|
||||
}
|
||||
|
||||
func (l *Level) debugDump(m *MemoryStore, w *bufio.Writer, lvlname string, buf []byte, depth int) ([]byte, error) {
|
||||
l.lock.RLock()
|
||||
defer l.lock.RUnlock()
|
||||
for i := 0; i < depth; i++ {
|
||||
buf = append(buf, '\t')
|
||||
}
|
||||
buf = append(buf, '"')
|
||||
buf = append(buf, lvlname...)
|
||||
buf = append(buf, "\":{\n"...)
|
||||
depth += 1
|
||||
objitems := 0
|
||||
for name, mc := range m.Metrics {
|
||||
if b := l.metrics[mc.offset]; b != nil {
|
||||
for i := 0; i < depth; i++ {
|
||||
buf = append(buf, '\t')
|
||||
}
|
||||
|
||||
buf = append(buf, '"')
|
||||
buf = append(buf, name...)
|
||||
buf = append(buf, `":[`...)
|
||||
buf = b.debugDump(buf)
|
||||
buf = append(buf, "],\n"...)
|
||||
objitems++
|
||||
}
|
||||
}
|
||||
|
||||
for name, lvl := range l.children {
|
||||
_, err := w.Write(buf)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
buf = buf[0:0]
|
||||
buf, err = lvl.debugDump(m, w, name, buf, depth)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
buf = append(buf, ',', '\n')
|
||||
objitems++
|
||||
}
|
||||
|
||||
// remove final `,`:
|
||||
if objitems > 0 {
|
||||
buf = append(buf[0:len(buf)-1], '\n')
|
||||
}
|
||||
|
||||
depth -= 1
|
||||
for i := 0; i < depth; i++ {
|
||||
buf = append(buf, '\t')
|
||||
}
|
||||
buf = append(buf, '}')
|
||||
return buf, nil
|
||||
}
|
||||
|
||||
func (m *MemoryStore) DebugDump(w *bufio.Writer, selector []string) error {
|
||||
lvl := m.root.findLevel(selector)
|
||||
if lvl == nil {
|
||||
return fmt.Errorf("[METRICSTORE]> not found: %#v", selector)
|
||||
}
|
||||
|
||||
buf := make([]byte, 0, 2048)
|
||||
buf = append(buf, "{"...)
|
||||
|
||||
buf, err := lvl.debugDump(m, w, "data", buf, 0)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
buf = append(buf, "}\n"...)
|
||||
if _, err = w.Write(buf); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
return w.Flush()
|
||||
}
|
||||
@@ -1,92 +0,0 @@
|
||||
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||
// All rights reserved. This file is part of cc-backend.
|
||||
// Use of this source code is governed by a MIT-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
package memorystore
|
||||
|
||||
import (
|
||||
"bufio"
|
||||
"fmt"
|
||||
"time"
|
||||
)
|
||||
|
||||
// MaxMissingDataPoints is a threshold that allows a node to be healthy with certain number of data points missing.
|
||||
// Suppose a node does not receive last 5 data points, then healthCheck endpoint will still say a
|
||||
// node is healthy. Anything more than 5 missing points in metrics of the node will deem the node unhealthy.
|
||||
const MaxMissingDataPoints int64 = 5
|
||||
|
||||
// MaxUnhealthyMetrics is a threshold which allows upto certain number of metrics in a node to be unhealthly.
|
||||
// Works with MaxMissingDataPoints. Say 5 metrics (including submetrics) do not receive the last
|
||||
// MaxMissingDataPoints data points, then the node will be deemed healthy. Any more metrics that does
|
||||
// not receive data for MaxMissingDataPoints data points will deem the node unhealthy.
|
||||
const MaxUnhealthyMetrics int64 = 5
|
||||
|
||||
func (b *buffer) healthCheck() int64 {
|
||||
// Check if the buffer is empty
|
||||
if b.data == nil {
|
||||
return 1
|
||||
}
|
||||
|
||||
bufferEnd := b.start + b.frequency*int64(len(b.data))
|
||||
t := time.Now().Unix()
|
||||
|
||||
// Check if the buffer is too old
|
||||
if t-bufferEnd > MaxMissingDataPoints*b.frequency {
|
||||
return 1
|
||||
}
|
||||
|
||||
return 0
|
||||
}
|
||||
|
||||
func (l *Level) healthCheck(m *MemoryStore, count int64) (int64, error) {
|
||||
l.lock.RLock()
|
||||
defer l.lock.RUnlock()
|
||||
|
||||
for _, mc := range m.Metrics {
|
||||
if b := l.metrics[mc.offset]; b != nil {
|
||||
count += b.healthCheck()
|
||||
}
|
||||
}
|
||||
|
||||
for _, lvl := range l.children {
|
||||
c, err := lvl.healthCheck(m, 0)
|
||||
if err != nil {
|
||||
return 0, err
|
||||
}
|
||||
count += c
|
||||
}
|
||||
|
||||
return count, nil
|
||||
}
|
||||
|
||||
func (m *MemoryStore) HealthCheck(w *bufio.Writer, selector []string) error {
|
||||
lvl := m.root.findLevel(selector)
|
||||
if lvl == nil {
|
||||
return fmt.Errorf("[METRICSTORE]> not found: %#v", selector)
|
||||
}
|
||||
|
||||
buf := make([]byte, 0, 25)
|
||||
// buf = append(buf, "{"...)
|
||||
|
||||
var count int64 = 0
|
||||
|
||||
unhealthyMetricsCount, err := lvl.healthCheck(m, count)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
if unhealthyMetricsCount < MaxUnhealthyMetrics {
|
||||
buf = append(buf, "Healthy"...)
|
||||
} else {
|
||||
buf = append(buf, "Unhealthy"...)
|
||||
}
|
||||
|
||||
// buf = append(buf, "}\n"...)
|
||||
|
||||
if _, err = w.Write(buf); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
return w.Flush()
|
||||
}
|
||||
@@ -1,192 +0,0 @@
|
||||
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||
// All rights reserved. This file is part of cc-backend.
|
||||
// Use of this source code is governed by a MIT-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
package memorystore
|
||||
|
||||
import (
|
||||
"sync"
|
||||
"unsafe"
|
||||
|
||||
"github.com/ClusterCockpit/cc-lib/util"
|
||||
)
|
||||
|
||||
// Could also be called "node" as this forms a node in a tree structure.
|
||||
// Called Level because "node" might be confusing here.
|
||||
// Can be both a leaf or a inner node. In this tree structue, inner nodes can
|
||||
// also hold data (in `metrics`).
|
||||
type Level struct {
|
||||
children map[string]*Level
|
||||
metrics []*buffer
|
||||
lock sync.RWMutex
|
||||
}
|
||||
|
||||
// Find the correct level for the given selector, creating it if
|
||||
// it does not exist. Example selector in the context of the
|
||||
// ClusterCockpit could be: []string{ "emmy", "host123", "cpu0" }.
|
||||
// This function would probably benefit a lot from `level.children` beeing a `sync.Map`?
|
||||
func (l *Level) findLevelOrCreate(selector []string, nMetrics int) *Level {
|
||||
if len(selector) == 0 {
|
||||
return l
|
||||
}
|
||||
|
||||
// Allow concurrent reads:
|
||||
l.lock.RLock()
|
||||
var child *Level
|
||||
var ok bool
|
||||
if l.children == nil {
|
||||
// Children map needs to be created...
|
||||
l.lock.RUnlock()
|
||||
} else {
|
||||
child, ok = l.children[selector[0]]
|
||||
l.lock.RUnlock()
|
||||
if ok {
|
||||
return child.findLevelOrCreate(selector[1:], nMetrics)
|
||||
}
|
||||
}
|
||||
|
||||
// The level does not exist, take write lock for unqiue access:
|
||||
l.lock.Lock()
|
||||
// While this thread waited for the write lock, another thread
|
||||
// could have created the child node.
|
||||
if l.children != nil {
|
||||
child, ok = l.children[selector[0]]
|
||||
if ok {
|
||||
l.lock.Unlock()
|
||||
return child.findLevelOrCreate(selector[1:], nMetrics)
|
||||
}
|
||||
}
|
||||
|
||||
child = &Level{
|
||||
metrics: make([]*buffer, nMetrics),
|
||||
children: nil,
|
||||
}
|
||||
|
||||
if l.children != nil {
|
||||
l.children[selector[0]] = child
|
||||
} else {
|
||||
l.children = map[string]*Level{selector[0]: child}
|
||||
}
|
||||
l.lock.Unlock()
|
||||
return child.findLevelOrCreate(selector[1:], nMetrics)
|
||||
}
|
||||
|
||||
func (l *Level) free(t int64) (int, error) {
|
||||
l.lock.Lock()
|
||||
defer l.lock.Unlock()
|
||||
|
||||
n := 0
|
||||
for i, b := range l.metrics {
|
||||
if b != nil {
|
||||
delme, m := b.free(t)
|
||||
n += m
|
||||
if delme {
|
||||
if cap(b.data) == BufferCap {
|
||||
bufferPool.Put(b)
|
||||
}
|
||||
l.metrics[i] = nil
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
for _, l := range l.children {
|
||||
m, err := l.free(t)
|
||||
n += m
|
||||
if err != nil {
|
||||
return n, err
|
||||
}
|
||||
}
|
||||
|
||||
return n, nil
|
||||
}
|
||||
|
||||
func (l *Level) sizeInBytes() int64 {
|
||||
l.lock.RLock()
|
||||
defer l.lock.RUnlock()
|
||||
size := int64(0)
|
||||
|
||||
for _, b := range l.metrics {
|
||||
if b != nil {
|
||||
size += b.count() * int64(unsafe.Sizeof(util.Float(0)))
|
||||
}
|
||||
}
|
||||
|
||||
for _, child := range l.children {
|
||||
size += child.sizeInBytes()
|
||||
}
|
||||
|
||||
return size
|
||||
}
|
||||
|
||||
func (l *Level) findLevel(selector []string) *Level {
|
||||
if len(selector) == 0 {
|
||||
return l
|
||||
}
|
||||
|
||||
l.lock.RLock()
|
||||
defer l.lock.RUnlock()
|
||||
|
||||
lvl := l.children[selector[0]]
|
||||
if lvl == nil {
|
||||
return nil
|
||||
}
|
||||
|
||||
return lvl.findLevel(selector[1:])
|
||||
}
|
||||
|
||||
func (l *Level) findBuffers(selector util.Selector, offset int, f func(b *buffer) error) error {
|
||||
l.lock.RLock()
|
||||
defer l.lock.RUnlock()
|
||||
|
||||
if len(selector) == 0 {
|
||||
b := l.metrics[offset]
|
||||
if b != nil {
|
||||
return f(b)
|
||||
}
|
||||
|
||||
for _, lvl := range l.children {
|
||||
err := lvl.findBuffers(nil, offset, f)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
sel := selector[0]
|
||||
if len(sel.String) != 0 && l.children != nil {
|
||||
lvl, ok := l.children[sel.String]
|
||||
if ok {
|
||||
err := lvl.findBuffers(selector[1:], offset, f)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
if sel.Group != nil && l.children != nil {
|
||||
for _, key := range sel.Group {
|
||||
lvl, ok := l.children[key]
|
||||
if ok {
|
||||
err := lvl.findBuffers(selector[1:], offset, f)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
if sel.Any && l.children != nil {
|
||||
for _, lvl := range l.children {
|
||||
if err := lvl.findBuffers(selector[1:], offset, f); err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
@@ -1,351 +0,0 @@
|
||||
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||
// All rights reserved. This file is part of cc-backend.
|
||||
// Use of this source code is governed by a MIT-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
package memorystore
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
|
||||
"github.com/ClusterCockpit/cc-lib/schema"
|
||||
"github.com/influxdata/line-protocol/v2/lineprotocol"
|
||||
"github.com/nats-io/nats.go"
|
||||
)
|
||||
|
||||
// Each connection is handled in it's own goroutine. This is a blocking function.
|
||||
// func ReceiveRaw(ctx context.Context,
|
||||
// listener net.Listener,
|
||||
// handleLine func(*lineprotocol.Decoder, string) error,
|
||||
// ) error {
|
||||
// var wg sync.WaitGroup
|
||||
|
||||
// wg.Add(1)
|
||||
// go func() {
|
||||
// defer wg.Done()
|
||||
// <-ctx.Done()
|
||||
// if err := listener.Close(); err != nil {
|
||||
// log.Printf("listener.Close(): %s", err.Error())
|
||||
// }
|
||||
// }()
|
||||
|
||||
// for {
|
||||
// conn, err := listener.Accept()
|
||||
// if err != nil {
|
||||
// if errors.Is(err, net.ErrClosed) {
|
||||
// break
|
||||
// }
|
||||
|
||||
// log.Printf("listener.Accept(): %s", err.Error())
|
||||
// }
|
||||
|
||||
// wg.Add(2)
|
||||
// go func() {
|
||||
// defer wg.Done()
|
||||
// defer conn.Close()
|
||||
|
||||
// dec := lineprotocol.NewDecoder(conn)
|
||||
// connctx, cancel := context.WithCancel(context.Background())
|
||||
// defer cancel()
|
||||
// go func() {
|
||||
// defer wg.Done()
|
||||
// select {
|
||||
// case <-connctx.Done():
|
||||
// conn.Close()
|
||||
// case <-ctx.Done():
|
||||
// conn.Close()
|
||||
// }
|
||||
// }()
|
||||
|
||||
// if err := handleLine(dec, "default"); err != nil {
|
||||
// if errors.Is(err, net.ErrClosed) {
|
||||
// return
|
||||
// }
|
||||
|
||||
// log.Printf("%s: %s", conn.RemoteAddr().String(), err.Error())
|
||||
// errmsg := make([]byte, 128)
|
||||
// errmsg = append(errmsg, `error: `...)
|
||||
// errmsg = append(errmsg, err.Error()...)
|
||||
// errmsg = append(errmsg, '\n')
|
||||
// conn.Write(errmsg)
|
||||
// }
|
||||
// }()
|
||||
// }
|
||||
|
||||
// wg.Wait()
|
||||
// return nil
|
||||
// }
|
||||
|
||||
// ReceiveNats connects to a nats server and subscribes to "updates". This is a
|
||||
// blocking function. handleLine will be called for each line recieved via
|
||||
// nats. Send `true` through the done channel for gracefull termination.
|
||||
func ReceiveNats(conf *(NatsConfig),
|
||||
ms *MemoryStore,
|
||||
workers int,
|
||||
ctx context.Context,
|
||||
) error {
|
||||
var opts []nats.Option
|
||||
if conf.Username != "" && conf.Password != "" {
|
||||
opts = append(opts, nats.UserInfo(conf.Username, conf.Password))
|
||||
}
|
||||
|
||||
if conf.Credsfilepath != "" {
|
||||
opts = append(opts, nats.UserCredentials(conf.Credsfilepath))
|
||||
}
|
||||
|
||||
nc, err := nats.Connect(conf.Address, opts...)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
defer nc.Close()
|
||||
|
||||
var wg sync.WaitGroup
|
||||
var subs []*nats.Subscription
|
||||
|
||||
msgs := make(chan *nats.Msg, workers*2)
|
||||
|
||||
for _, sc := range conf.Subscriptions {
|
||||
clusterTag := sc.ClusterTag
|
||||
var sub *nats.Subscription
|
||||
if workers > 1 {
|
||||
wg.Add(workers)
|
||||
|
||||
for range workers {
|
||||
go func() {
|
||||
for m := range msgs {
|
||||
dec := lineprotocol.NewDecoderWithBytes(m.Data)
|
||||
if err := DecodeLine(dec, ms, clusterTag); err != nil {
|
||||
cclog.Errorf("error: %s", err.Error())
|
||||
}
|
||||
}
|
||||
|
||||
wg.Done()
|
||||
}()
|
||||
}
|
||||
|
||||
sub, err = nc.Subscribe(sc.SubscribeTo, func(m *nats.Msg) {
|
||||
msgs <- m
|
||||
})
|
||||
} else {
|
||||
sub, err = nc.Subscribe(sc.SubscribeTo, func(m *nats.Msg) {
|
||||
dec := lineprotocol.NewDecoderWithBytes(m.Data)
|
||||
if err := DecodeLine(dec, ms, clusterTag); err != nil {
|
||||
cclog.Errorf("error: %s", err.Error())
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
cclog.Infof("NATS subscription to '%s' on '%s' established", sc.SubscribeTo, conf.Address)
|
||||
subs = append(subs, sub)
|
||||
}
|
||||
|
||||
<-ctx.Done()
|
||||
for _, sub := range subs {
|
||||
err = sub.Unsubscribe()
|
||||
if err != nil {
|
||||
cclog.Errorf("NATS unsubscribe failed: %s", err.Error())
|
||||
}
|
||||
}
|
||||
close(msgs)
|
||||
wg.Wait()
|
||||
|
||||
nc.Close()
|
||||
cclog.Print("NATS connection closed")
|
||||
return nil
|
||||
}
|
||||
|
||||
// Place `prefix` in front of `buf` but if possible,
|
||||
// do that inplace in `buf`.
|
||||
func reorder(buf, prefix []byte) []byte {
|
||||
n := len(prefix)
|
||||
m := len(buf)
|
||||
if cap(buf) < m+n {
|
||||
return append(prefix[:n:n], buf...)
|
||||
} else {
|
||||
buf = buf[:n+m]
|
||||
for i := m - 1; i >= 0; i-- {
|
||||
buf[i+n] = buf[i]
|
||||
}
|
||||
for i := range n {
|
||||
buf[i] = prefix[i]
|
||||
}
|
||||
return buf
|
||||
}
|
||||
}
|
||||
|
||||
// Decode lines using dec and make write calls to the MemoryStore.
|
||||
// If a line is missing its cluster tag, use clusterDefault as default.
|
||||
func DecodeLine(dec *lineprotocol.Decoder,
|
||||
ms *MemoryStore,
|
||||
clusterDefault string,
|
||||
) error {
|
||||
// Reduce allocations in loop:
|
||||
t := time.Now()
|
||||
metric, metricBuf := Metric{}, make([]byte, 0, 16)
|
||||
selector := make([]string, 0, 4)
|
||||
typeBuf, subTypeBuf := make([]byte, 0, 16), make([]byte, 0)
|
||||
|
||||
// Optimize for the case where all lines in a "batch" are about the same
|
||||
// cluster and host. By using `WriteToLevel` (level = host), we do not need
|
||||
// to take the root- and cluster-level lock as often.
|
||||
var lvl *Level = nil
|
||||
prevCluster, prevHost := "", ""
|
||||
|
||||
var ok bool
|
||||
for dec.Next() {
|
||||
rawmeasurement, err := dec.Measurement()
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
// Needs to be copied because another call to dec.* would
|
||||
// invalidate the returned slice.
|
||||
metricBuf = append(metricBuf[:0], rawmeasurement...)
|
||||
|
||||
// The go compiler optimizes map[string(byteslice)] lookups:
|
||||
metric.MetricConfig, ok = ms.Metrics[string(rawmeasurement)]
|
||||
if !ok {
|
||||
continue
|
||||
}
|
||||
|
||||
typeBuf, subTypeBuf := typeBuf[:0], subTypeBuf[:0]
|
||||
cluster, host := clusterDefault, ""
|
||||
for {
|
||||
key, val, err := dec.NextTag()
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
if key == nil {
|
||||
break
|
||||
}
|
||||
|
||||
// The go compiler optimizes string([]byte{...}) == "...":
|
||||
switch string(key) {
|
||||
case "cluster":
|
||||
if string(val) == prevCluster {
|
||||
cluster = prevCluster
|
||||
} else {
|
||||
cluster = string(val)
|
||||
lvl = nil
|
||||
}
|
||||
case "hostname", "host":
|
||||
if string(val) == prevHost {
|
||||
host = prevHost
|
||||
} else {
|
||||
host = string(val)
|
||||
lvl = nil
|
||||
}
|
||||
case "type":
|
||||
if string(val) == "node" {
|
||||
break
|
||||
}
|
||||
|
||||
// We cannot be sure that the "type" tag comes before the "type-id" tag:
|
||||
if len(typeBuf) == 0 {
|
||||
typeBuf = append(typeBuf, val...)
|
||||
} else {
|
||||
typeBuf = reorder(typeBuf, val)
|
||||
}
|
||||
case "type-id":
|
||||
typeBuf = append(typeBuf, val...)
|
||||
case "subtype":
|
||||
// We cannot be sure that the "subtype" tag comes before the "stype-id" tag:
|
||||
if len(subTypeBuf) == 0 {
|
||||
subTypeBuf = append(subTypeBuf, val...)
|
||||
} else {
|
||||
subTypeBuf = reorder(subTypeBuf, val)
|
||||
// subTypeBuf = reorder(typeBuf, val)
|
||||
}
|
||||
case "stype-id":
|
||||
subTypeBuf = append(subTypeBuf, val...)
|
||||
default:
|
||||
// Ignore unkown tags (cc-metric-collector might send us a unit for example that we do not need)
|
||||
// return fmt.Errorf("unkown tag: '%s' (value: '%s')", string(key), string(val))
|
||||
}
|
||||
}
|
||||
|
||||
// If the cluster or host changed, the lvl was set to nil
|
||||
if lvl == nil {
|
||||
selector = selector[:2]
|
||||
selector[0], selector[1] = cluster, host
|
||||
lvl = ms.GetLevel(selector)
|
||||
prevCluster, prevHost = cluster, host
|
||||
}
|
||||
|
||||
// subtypes:
|
||||
selector = selector[:0]
|
||||
if len(typeBuf) > 0 {
|
||||
selector = append(selector, string(typeBuf)) // <- Allocation :(
|
||||
if len(subTypeBuf) > 0 {
|
||||
selector = append(selector, string(subTypeBuf))
|
||||
}
|
||||
}
|
||||
|
||||
for {
|
||||
key, val, err := dec.NextField()
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
if key == nil {
|
||||
break
|
||||
}
|
||||
|
||||
if string(key) != "value" {
|
||||
return fmt.Errorf("host %s: unknown field: '%s' (value: %#v)", host, string(key), val)
|
||||
}
|
||||
|
||||
if val.Kind() == lineprotocol.Float {
|
||||
metric.Value = schema.Float(val.FloatV())
|
||||
} else if val.Kind() == lineprotocol.Int {
|
||||
metric.Value = schema.Float(val.IntV())
|
||||
} else if val.Kind() == lineprotocol.Uint {
|
||||
metric.Value = schema.Float(val.UintV())
|
||||
} else {
|
||||
return fmt.Errorf("host %s: unsupported value type in message: %s", host, val.Kind().String())
|
||||
}
|
||||
}
|
||||
|
||||
if t, err = dec.Time(lineprotocol.Second, t); err != nil {
|
||||
t = time.Now()
|
||||
if t, err = dec.Time(lineprotocol.Millisecond, t); err != nil {
|
||||
t = time.Now()
|
||||
if t, err = dec.Time(lineprotocol.Microsecond, t); err != nil {
|
||||
t = time.Now()
|
||||
if t, err = dec.Time(lineprotocol.Nanosecond, t); err != nil {
|
||||
return fmt.Errorf("host %s: timestamp : %#v with error : %#v", host, t, err.Error())
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if err != nil {
|
||||
return fmt.Errorf("host %s: timestamp : %#v with error : %#v", host, t, err.Error())
|
||||
}
|
||||
|
||||
time := t.Unix()
|
||||
|
||||
if Keys.Checkpoints.FileFormat != "json" {
|
||||
LineProtocolMessages <- &AvroStruct{
|
||||
MetricName: string(metricBuf),
|
||||
Cluster: cluster,
|
||||
Node: host,
|
||||
Selector: append([]string{}, selector...),
|
||||
Value: metric.Value,
|
||||
Timestamp: time,
|
||||
}
|
||||
}
|
||||
|
||||
if err := ms.WriteToLevel(lvl, selector, time, []Metric{metric}); err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
@@ -1,437 +0,0 @@
|
||||
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||
// All rights reserved. This file is part of cc-backend.
|
||||
// Use of this source code is governed by a MIT-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
// Package memorystore provides an efficient in-memory time-series metric storage system
|
||||
// with support for hierarchical data organization, checkpointing, and archiving.
|
||||
//
|
||||
// The package organizes metrics in a tree structure (cluster → host → component) and
|
||||
// provides concurrent read/write access to metric data with configurable aggregation strategies.
|
||||
// Background goroutines handle periodic checkpointing (JSON or Avro format), archiving old data,
|
||||
// and enforcing retention policies.
|
||||
//
|
||||
// Key features:
|
||||
// - In-memory metric storage with configurable retention
|
||||
// - Hierarchical data organization (selectors)
|
||||
// - Concurrent checkpoint/archive workers
|
||||
// - Support for sum and average aggregation
|
||||
// - NATS integration for metric ingestion
|
||||
package memorystore
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"context"
|
||||
"encoding/json"
|
||||
"errors"
|
||||
"runtime"
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
"github.com/ClusterCockpit/cc-backend/internal/config"
|
||||
"github.com/ClusterCockpit/cc-backend/pkg/archive"
|
||||
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
|
||||
"github.com/ClusterCockpit/cc-lib/resampler"
|
||||
"github.com/ClusterCockpit/cc-lib/schema"
|
||||
"github.com/ClusterCockpit/cc-lib/util"
|
||||
)
|
||||
|
||||
var (
|
||||
singleton sync.Once
|
||||
msInstance *MemoryStore
|
||||
// shutdownFunc stores the context cancellation function created in Init
|
||||
// and is called during Shutdown to cancel all background goroutines
|
||||
shutdownFunc context.CancelFunc
|
||||
)
|
||||
|
||||
|
||||
|
||||
type Metric struct {
|
||||
Name string
|
||||
Value schema.Float
|
||||
MetricConfig MetricConfig
|
||||
}
|
||||
|
||||
type MemoryStore struct {
|
||||
Metrics map[string]MetricConfig
|
||||
root Level
|
||||
}
|
||||
|
||||
func Init(rawConfig json.RawMessage, wg *sync.WaitGroup) {
|
||||
startupTime := time.Now()
|
||||
|
||||
if rawConfig != nil {
|
||||
config.Validate(configSchema, rawConfig)
|
||||
dec := json.NewDecoder(bytes.NewReader(rawConfig))
|
||||
// dec.DisallowUnknownFields()
|
||||
if err := dec.Decode(&Keys); err != nil {
|
||||
cclog.Abortf("[METRICSTORE]> Metric Store Config Init: Could not decode config file '%s'.\nError: %s\n", rawConfig, err.Error())
|
||||
}
|
||||
}
|
||||
|
||||
// Set NumWorkers from config or use default
|
||||
if Keys.NumWorkers <= 0 {
|
||||
maxWorkers := 10
|
||||
Keys.NumWorkers = min(runtime.NumCPU()/2+1, maxWorkers)
|
||||
}
|
||||
cclog.Debugf("[METRICSTORE]> Using %d workers for checkpoint/archive operations\n", Keys.NumWorkers)
|
||||
|
||||
// Helper function to add metric configuration
|
||||
addMetricConfig := func(mc schema.MetricConfig) {
|
||||
agg, err := AssignAggregationStrategy(mc.Aggregation)
|
||||
if err != nil {
|
||||
cclog.Warnf("Could not find aggregation strategy for metric config '%s': %s", mc.Name, err.Error())
|
||||
}
|
||||
|
||||
AddMetric(mc.Name, MetricConfig{
|
||||
Frequency: int64(mc.Timestep),
|
||||
Aggregation: agg,
|
||||
})
|
||||
}
|
||||
|
||||
for _, c := range archive.Clusters {
|
||||
for _, mc := range c.MetricConfig {
|
||||
addMetricConfig(*mc)
|
||||
}
|
||||
|
||||
for _, sc := range c.SubClusters {
|
||||
for _, mc := range sc.MetricConfig {
|
||||
addMetricConfig(mc)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Pass the config.MetricStoreKeys
|
||||
InitMetrics(Metrics)
|
||||
|
||||
ms := GetMemoryStore()
|
||||
|
||||
d, err := time.ParseDuration(Keys.Checkpoints.Restore)
|
||||
if err != nil {
|
||||
cclog.Fatal(err)
|
||||
}
|
||||
|
||||
restoreFrom := startupTime.Add(-d)
|
||||
cclog.Infof("[METRICSTORE]> Loading checkpoints newer than %s\n", restoreFrom.Format(time.RFC3339))
|
||||
files, err := ms.FromCheckpointFiles(Keys.Checkpoints.RootDir, restoreFrom.Unix())
|
||||
loadedData := ms.SizeInBytes() / 1024 / 1024 // In MB
|
||||
if err != nil {
|
||||
cclog.Fatalf("[METRICSTORE]> Loading checkpoints failed: %s\n", err.Error())
|
||||
} else {
|
||||
cclog.Infof("[METRICSTORE]> Checkpoints loaded (%d files, %d MB, that took %fs)\n", files, loadedData, time.Since(startupTime).Seconds())
|
||||
}
|
||||
|
||||
// Try to use less memory by forcing a GC run here and then
|
||||
// lowering the target percentage. The default of 100 means
|
||||
// that only once the ratio of new allocations execeds the
|
||||
// previously active heap, a GC is triggered.
|
||||
// Forcing a GC here will set the "previously active heap"
|
||||
// to a minumum.
|
||||
runtime.GC()
|
||||
|
||||
ctx, shutdown := context.WithCancel(context.Background())
|
||||
|
||||
wg.Add(4)
|
||||
|
||||
Retention(wg, ctx)
|
||||
Checkpointing(wg, ctx)
|
||||
Archiving(wg, ctx)
|
||||
DataStaging(wg, ctx)
|
||||
|
||||
// Note: Signal handling has been removed from this function.
|
||||
// The caller is responsible for handling shutdown signals and calling
|
||||
// the shutdown() function when appropriate.
|
||||
// Store the shutdown function for later use by Shutdown()
|
||||
shutdownFunc = shutdown
|
||||
|
||||
if Keys.Nats != nil {
|
||||
for _, natsConf := range Keys.Nats {
|
||||
// TODO: When multiple nats configs share a URL, do a single connect.
|
||||
wg.Add(1)
|
||||
nc := natsConf
|
||||
go func() {
|
||||
// err := ReceiveNats(conf.Nats, decodeLine, runtime.NumCPU()-1, ctx)
|
||||
err := ReceiveNats(nc, ms, 1, ctx)
|
||||
if err != nil {
|
||||
cclog.Fatal(err)
|
||||
}
|
||||
wg.Done()
|
||||
}()
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// InitMetrics creates a new, initialized instance of a MemoryStore.
|
||||
// Will panic if values in the metric configurations are invalid.
|
||||
func InitMetrics(metrics map[string]MetricConfig) {
|
||||
singleton.Do(func() {
|
||||
offset := 0
|
||||
for key, cfg := range metrics {
|
||||
if cfg.Frequency == 0 {
|
||||
panic("[METRICSTORE]> invalid frequency")
|
||||
}
|
||||
|
||||
metrics[key] = MetricConfig{
|
||||
Frequency: cfg.Frequency,
|
||||
Aggregation: cfg.Aggregation,
|
||||
offset: offset,
|
||||
}
|
||||
offset += 1
|
||||
}
|
||||
|
||||
msInstance = &MemoryStore{
|
||||
root: Level{
|
||||
metrics: make([]*buffer, len(metrics)),
|
||||
children: make(map[string]*Level),
|
||||
},
|
||||
Metrics: metrics,
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
func GetMemoryStore() *MemoryStore {
|
||||
if msInstance == nil {
|
||||
cclog.Fatalf("[METRICSTORE]> MemoryStore not initialized!")
|
||||
}
|
||||
|
||||
return msInstance
|
||||
}
|
||||
|
||||
func Shutdown() {
|
||||
// Cancel the context to signal all background goroutines to stop
|
||||
if shutdownFunc != nil {
|
||||
shutdownFunc()
|
||||
}
|
||||
|
||||
cclog.Infof("[METRICSTORE]> Writing to '%s'...\n", Keys.Checkpoints.RootDir)
|
||||
var files int
|
||||
var err error
|
||||
|
||||
ms := GetMemoryStore()
|
||||
|
||||
if Keys.Checkpoints.FileFormat == "json" {
|
||||
files, err = ms.ToCheckpoint(Keys.Checkpoints.RootDir, lastCheckpoint.Unix(), time.Now().Unix())
|
||||
} else {
|
||||
files, err = GetAvroStore().ToCheckpoint(Keys.Checkpoints.RootDir, true)
|
||||
close(LineProtocolMessages)
|
||||
}
|
||||
|
||||
if err != nil {
|
||||
cclog.Errorf("[METRICSTORE]> Writing checkpoint failed: %s\n", err.Error())
|
||||
}
|
||||
cclog.Infof("[METRICSTORE]> Done! (%d files written)\n", files)
|
||||
}
|
||||
|
||||
func getName(m *MemoryStore, i int) string {
|
||||
for key, val := range m.Metrics {
|
||||
if val.offset == i {
|
||||
return key
|
||||
}
|
||||
}
|
||||
return ""
|
||||
}
|
||||
|
||||
func Retention(wg *sync.WaitGroup, ctx context.Context) {
|
||||
ms := GetMemoryStore()
|
||||
|
||||
go func() {
|
||||
defer wg.Done()
|
||||
d, err := time.ParseDuration(Keys.RetentionInMemory)
|
||||
if err != nil {
|
||||
cclog.Fatal(err)
|
||||
}
|
||||
if d <= 0 {
|
||||
return
|
||||
}
|
||||
|
||||
ticks := func() <-chan time.Time {
|
||||
d := d / 2
|
||||
if d <= 0 {
|
||||
return nil
|
||||
}
|
||||
return time.NewTicker(d).C
|
||||
}()
|
||||
for {
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
return
|
||||
case <-ticks:
|
||||
t := time.Now().Add(-d)
|
||||
cclog.Infof("[METRICSTORE]> start freeing buffers (older than %s)...\n", t.Format(time.RFC3339))
|
||||
freed, err := ms.Free(nil, t.Unix())
|
||||
if err != nil {
|
||||
cclog.Errorf("[METRICSTORE]> freeing up buffers failed: %s\n", err.Error())
|
||||
} else {
|
||||
cclog.Infof("[METRICSTORE]> done: %d buffers freed\n", freed)
|
||||
}
|
||||
}
|
||||
}
|
||||
}()
|
||||
}
|
||||
|
||||
// Write all values in `metrics` to the level specified by `selector` for time `ts`.
|
||||
// Look at `findLevelOrCreate` for how selectors work.
|
||||
func (m *MemoryStore) Write(selector []string, ts int64, metrics []Metric) error {
|
||||
var ok bool
|
||||
for i, metric := range metrics {
|
||||
if metric.MetricConfig.Frequency == 0 {
|
||||
metric.MetricConfig, ok = m.Metrics[metric.Name]
|
||||
if !ok {
|
||||
metric.MetricConfig.Frequency = 0
|
||||
}
|
||||
metrics[i] = metric
|
||||
}
|
||||
}
|
||||
|
||||
return m.WriteToLevel(&m.root, selector, ts, metrics)
|
||||
}
|
||||
|
||||
func (m *MemoryStore) GetLevel(selector []string) *Level {
|
||||
return m.root.findLevelOrCreate(selector, len(m.Metrics))
|
||||
}
|
||||
|
||||
// WriteToLevel assumes that `minfo` in `metrics` is filled in
|
||||
func (m *MemoryStore) WriteToLevel(l *Level, selector []string, ts int64, metrics []Metric) error {
|
||||
l = l.findLevelOrCreate(selector, len(m.Metrics))
|
||||
l.lock.Lock()
|
||||
defer l.lock.Unlock()
|
||||
|
||||
for _, metric := range metrics {
|
||||
if metric.MetricConfig.Frequency == 0 {
|
||||
continue
|
||||
}
|
||||
|
||||
b := l.metrics[metric.MetricConfig.offset]
|
||||
if b == nil {
|
||||
// First write to this metric and level
|
||||
b = newBuffer(ts, metric.MetricConfig.Frequency)
|
||||
l.metrics[metric.MetricConfig.offset] = b
|
||||
}
|
||||
|
||||
nb, err := b.write(ts, metric.Value)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
// Last write created a new buffer...
|
||||
if b != nb {
|
||||
l.metrics[metric.MetricConfig.offset] = nb
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// Read returns all values for metric `metric` from `from` to `to` for the selected level(s).
|
||||
// If the level does not hold the metric itself, the data will be aggregated recursively from the children.
|
||||
// The second and third return value are the actual from/to for the data. Those can be different from
|
||||
// the range asked for if no data was available.
|
||||
func (m *MemoryStore) Read(selector util.Selector, metric string, from, to, resolution int64) ([]schema.Float, int64, int64, int64, error) {
|
||||
if from > to {
|
||||
return nil, 0, 0, 0, errors.New("[METRICSTORE]> invalid time range")
|
||||
}
|
||||
|
||||
minfo, ok := m.Metrics[metric]
|
||||
if !ok {
|
||||
return nil, 0, 0, 0, errors.New("[METRICSTORE]> unkown metric: " + metric)
|
||||
}
|
||||
|
||||
n, data := 0, make([]schema.Float, (to-from)/minfo.Frequency+1)
|
||||
|
||||
err := m.root.findBuffers(selector, minfo.offset, func(b *buffer) error {
|
||||
cdata, cfrom, cto, err := b.read(from, to, data)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
if n == 0 {
|
||||
from, to = cfrom, cto
|
||||
} else if from != cfrom || to != cto || len(data) != len(cdata) {
|
||||
missingfront, missingback := int((from-cfrom)/minfo.Frequency), int((to-cto)/minfo.Frequency)
|
||||
if missingfront != 0 {
|
||||
return ErrDataDoesNotAlign
|
||||
}
|
||||
|
||||
newlen := len(cdata) - missingback
|
||||
if newlen < 1 {
|
||||
return ErrDataDoesNotAlign
|
||||
}
|
||||
cdata = cdata[0:newlen]
|
||||
if len(cdata) != len(data) {
|
||||
return ErrDataDoesNotAlign
|
||||
}
|
||||
|
||||
from, to = cfrom, cto
|
||||
}
|
||||
|
||||
data = cdata
|
||||
n += 1
|
||||
return nil
|
||||
})
|
||||
|
||||
if err != nil {
|
||||
return nil, 0, 0, 0, err
|
||||
} else if n == 0 {
|
||||
return nil, 0, 0, 0, errors.New("[METRICSTORE]> metric or host not found")
|
||||
} else if n > 1 {
|
||||
if minfo.Aggregation == AvgAggregation {
|
||||
normalize := 1. / schema.Float(n)
|
||||
for i := 0; i < len(data); i++ {
|
||||
data[i] *= normalize
|
||||
}
|
||||
} else if minfo.Aggregation != SumAggregation {
|
||||
return nil, 0, 0, 0, errors.New("[METRICSTORE]> invalid aggregation")
|
||||
}
|
||||
}
|
||||
|
||||
data, resolution, err = resampler.LargestTriangleThreeBucket(data, minfo.Frequency, resolution)
|
||||
if err != nil {
|
||||
return nil, 0, 0, 0, err
|
||||
}
|
||||
|
||||
return data, from, to, resolution, nil
|
||||
}
|
||||
|
||||
// Free releases all buffers for the selected level and all its children that
|
||||
// contain only values older than `t`.
|
||||
func (m *MemoryStore) Free(selector []string, t int64) (int, error) {
|
||||
return m.GetLevel(selector).free(t)
|
||||
}
|
||||
|
||||
func (m *MemoryStore) FreeAll() error {
|
||||
for k := range m.root.children {
|
||||
delete(m.root.children, k)
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func (m *MemoryStore) SizeInBytes() int64 {
|
||||
return m.root.sizeInBytes()
|
||||
}
|
||||
|
||||
// ListChildren , given a selector, returns a list of all children of the level
|
||||
// selected.
|
||||
func (m *MemoryStore) ListChildren(selector []string) []string {
|
||||
lvl := &m.root
|
||||
for lvl != nil && len(selector) != 0 {
|
||||
lvl.lock.RLock()
|
||||
next := lvl.children[selector[0]]
|
||||
lvl.lock.RUnlock()
|
||||
lvl = next
|
||||
selector = selector[1:]
|
||||
}
|
||||
|
||||
if lvl == nil {
|
||||
return nil
|
||||
}
|
||||
|
||||
lvl.lock.RLock()
|
||||
defer lvl.lock.RUnlock()
|
||||
|
||||
children := make([]string, 0, len(lvl.children))
|
||||
for child := range lvl.children {
|
||||
children = append(children, child)
|
||||
}
|
||||
|
||||
return children
|
||||
}
|
||||
@@ -1,156 +0,0 @@
|
||||
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||
// All rights reserved. This file is part of cc-backend.
|
||||
// Use of this source code is governed by a MIT-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
package memorystore
|
||||
|
||||
import (
|
||||
"testing"
|
||||
|
||||
"github.com/ClusterCockpit/cc-lib/schema"
|
||||
)
|
||||
|
||||
func TestAssignAggregationStrategy(t *testing.T) {
|
||||
tests := []struct {
|
||||
name string
|
||||
input string
|
||||
expected AggregationStrategy
|
||||
wantErr bool
|
||||
}{
|
||||
{"empty string", "", NoAggregation, false},
|
||||
{"sum", "sum", SumAggregation, false},
|
||||
{"avg", "avg", AvgAggregation, false},
|
||||
{"invalid", "invalid", NoAggregation, true},
|
||||
}
|
||||
|
||||
for _, tt := range tests {
|
||||
t.Run(tt.name, func(t *testing.T) {
|
||||
result, err := AssignAggregationStrategy(tt.input)
|
||||
if (err != nil) != tt.wantErr {
|
||||
t.Errorf("AssignAggregationStrategy(%q) error = %v, wantErr %v", tt.input, err, tt.wantErr)
|
||||
return
|
||||
}
|
||||
if result != tt.expected {
|
||||
t.Errorf("AssignAggregationStrategy(%q) = %v, want %v", tt.input, result, tt.expected)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestAddMetric(t *testing.T) {
|
||||
// Reset Metrics before test
|
||||
Metrics = make(map[string]MetricConfig)
|
||||
|
||||
err := AddMetric("test_metric", MetricConfig{
|
||||
Frequency: 60,
|
||||
Aggregation: SumAggregation,
|
||||
})
|
||||
if err != nil {
|
||||
t.Errorf("AddMetric() error = %v", err)
|
||||
}
|
||||
|
||||
if _, ok := Metrics["test_metric"]; !ok {
|
||||
t.Error("AddMetric() did not add metric to Metrics map")
|
||||
}
|
||||
|
||||
// Test updating with higher frequency
|
||||
err = AddMetric("test_metric", MetricConfig{
|
||||
Frequency: 120,
|
||||
Aggregation: SumAggregation,
|
||||
})
|
||||
if err != nil {
|
||||
t.Errorf("AddMetric() error = %v", err)
|
||||
}
|
||||
|
||||
if Metrics["test_metric"].Frequency != 120 {
|
||||
t.Errorf("AddMetric() frequency = %d, want 120", Metrics["test_metric"].Frequency)
|
||||
}
|
||||
|
||||
// Test updating with lower frequency (should not update)
|
||||
err = AddMetric("test_metric", MetricConfig{
|
||||
Frequency: 30,
|
||||
Aggregation: SumAggregation,
|
||||
})
|
||||
if err != nil {
|
||||
t.Errorf("AddMetric() error = %v", err)
|
||||
}
|
||||
|
||||
if Metrics["test_metric"].Frequency != 120 {
|
||||
t.Errorf("AddMetric() frequency = %d, want 120 (should not downgrade)", Metrics["test_metric"].Frequency)
|
||||
}
|
||||
}
|
||||
|
||||
func TestGetMetricFrequency(t *testing.T) {
|
||||
// Reset Metrics before test
|
||||
Metrics = map[string]MetricConfig{
|
||||
"test_metric": {
|
||||
Frequency: 60,
|
||||
Aggregation: SumAggregation,
|
||||
},
|
||||
}
|
||||
|
||||
freq, err := GetMetricFrequency("test_metric")
|
||||
if err != nil {
|
||||
t.Errorf("GetMetricFrequency() error = %v", err)
|
||||
}
|
||||
if freq != 60 {
|
||||
t.Errorf("GetMetricFrequency() = %d, want 60", freq)
|
||||
}
|
||||
|
||||
_, err = GetMetricFrequency("nonexistent")
|
||||
if err == nil {
|
||||
t.Error("GetMetricFrequency() expected error for nonexistent metric")
|
||||
}
|
||||
}
|
||||
|
||||
func TestBufferWrite(t *testing.T) {
|
||||
b := newBuffer(100, 10)
|
||||
|
||||
// Test writing value
|
||||
nb, err := b.write(100, schema.Float(42.0))
|
||||
if err != nil {
|
||||
t.Errorf("buffer.write() error = %v", err)
|
||||
}
|
||||
if nb != b {
|
||||
t.Error("buffer.write() created new buffer unexpectedly")
|
||||
}
|
||||
if len(b.data) != 1 {
|
||||
t.Errorf("buffer.write() len(data) = %d, want 1", len(b.data))
|
||||
}
|
||||
if b.data[0] != schema.Float(42.0) {
|
||||
t.Errorf("buffer.write() data[0] = %v, want 42.0", b.data[0])
|
||||
}
|
||||
|
||||
// Test writing value from past (should error)
|
||||
_, err = b.write(50, schema.Float(10.0))
|
||||
if err == nil {
|
||||
t.Error("buffer.write() expected error for past timestamp")
|
||||
}
|
||||
}
|
||||
|
||||
func TestBufferRead(t *testing.T) {
|
||||
b := newBuffer(100, 10)
|
||||
|
||||
// Write some test data
|
||||
b.write(100, schema.Float(1.0))
|
||||
b.write(110, schema.Float(2.0))
|
||||
b.write(120, schema.Float(3.0))
|
||||
|
||||
// Read data
|
||||
data := make([]schema.Float, 3)
|
||||
result, from, to, err := b.read(100, 130, data)
|
||||
if err != nil {
|
||||
t.Errorf("buffer.read() error = %v", err)
|
||||
}
|
||||
// Buffer read should return from as firstWrite (start + freq/2)
|
||||
if from != 100 {
|
||||
t.Errorf("buffer.read() from = %d, want 100", from)
|
||||
}
|
||||
if to != 130 {
|
||||
t.Errorf("buffer.read() to = %d, want 130", to)
|
||||
}
|
||||
if len(result) != 3 {
|
||||
t.Errorf("buffer.read() len(result) = %d, want 3", len(result))
|
||||
}
|
||||
}
|
||||
@@ -1,124 +0,0 @@
|
||||
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||
// All rights reserved. This file is part of cc-backend.
|
||||
// Use of this source code is governed by a MIT-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
package memorystore
|
||||
|
||||
import (
|
||||
"errors"
|
||||
"math"
|
||||
|
||||
"github.com/ClusterCockpit/cc-lib/util"
|
||||
)
|
||||
|
||||
type Stats struct {
|
||||
Samples int
|
||||
Avg util.Float
|
||||
Min util.Float
|
||||
Max util.Float
|
||||
}
|
||||
|
||||
func (b *buffer) stats(from, to int64) (Stats, int64, int64, error) {
|
||||
if from < b.start {
|
||||
if b.prev != nil {
|
||||
return b.prev.stats(from, to)
|
||||
}
|
||||
from = b.start
|
||||
}
|
||||
|
||||
// TODO: Check if b.closed and if so and the full buffer is queried,
|
||||
// use b.statistics instead of iterating over the buffer.
|
||||
|
||||
samples := 0
|
||||
sum, min, max := 0.0, math.MaxFloat32, -math.MaxFloat32
|
||||
|
||||
var t int64
|
||||
for t = from; t < to; t += b.frequency {
|
||||
idx := int((t - b.start) / b.frequency)
|
||||
if idx >= cap(b.data) {
|
||||
b = b.next
|
||||
if b == nil {
|
||||
break
|
||||
}
|
||||
idx = 0
|
||||
}
|
||||
|
||||
if t < b.start || idx >= len(b.data) {
|
||||
continue
|
||||
}
|
||||
|
||||
xf := float64(b.data[idx])
|
||||
if math.IsNaN(xf) {
|
||||
continue
|
||||
}
|
||||
|
||||
samples += 1
|
||||
sum += xf
|
||||
min = math.Min(min, xf)
|
||||
max = math.Max(max, xf)
|
||||
}
|
||||
|
||||
return Stats{
|
||||
Samples: samples,
|
||||
Avg: util.Float(sum) / util.Float(samples),
|
||||
Min: util.Float(min),
|
||||
Max: util.Float(max),
|
||||
}, from, t, nil
|
||||
}
|
||||
|
||||
// Returns statistics for the requested metric on the selected node/level.
|
||||
// Data is aggregated to the selected level the same way as in `MemoryStore.Read`.
|
||||
// If `Stats.Samples` is zero, the statistics should not be considered as valid.
|
||||
func (m *MemoryStore) Stats(selector util.Selector, metric string, from, to int64) (*Stats, int64, int64, error) {
|
||||
if from > to {
|
||||
return nil, 0, 0, errors.New("invalid time range")
|
||||
}
|
||||
|
||||
minfo, ok := m.Metrics[metric]
|
||||
if !ok {
|
||||
return nil, 0, 0, errors.New("unkown metric: " + metric)
|
||||
}
|
||||
|
||||
n, samples := 0, 0
|
||||
avg, min, max := util.Float(0), math.MaxFloat32, -math.MaxFloat32
|
||||
err := m.root.findBuffers(selector, minfo.offset, func(b *buffer) error {
|
||||
stats, cfrom, cto, err := b.stats(from, to)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
if n == 0 {
|
||||
from, to = cfrom, cto
|
||||
} else if from != cfrom || to != cto {
|
||||
return ErrDataDoesNotAlign
|
||||
}
|
||||
|
||||
samples += stats.Samples
|
||||
avg += stats.Avg
|
||||
min = math.Min(min, float64(stats.Min))
|
||||
max = math.Max(max, float64(stats.Max))
|
||||
n += 1
|
||||
return nil
|
||||
})
|
||||
if err != nil {
|
||||
return nil, 0, 0, err
|
||||
}
|
||||
|
||||
if n == 0 {
|
||||
return nil, 0, 0, ErrNoData
|
||||
}
|
||||
|
||||
if minfo.Aggregation == AvgAggregation {
|
||||
avg /= util.Float(n)
|
||||
} else if n > 1 && minfo.Aggregation != SumAggregation {
|
||||
return nil, 0, 0, errors.New("invalid aggregation")
|
||||
}
|
||||
|
||||
return &Stats{
|
||||
Samples: samples,
|
||||
Avg: avg,
|
||||
Min: util.Float(min),
|
||||
Max: util.Float(max),
|
||||
}, from, to, nil
|
||||
}
|
||||
@@ -1,381 +0,0 @@
|
||||
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||
// All rights reserved. This file is part of cc-backend.
|
||||
// Use of this source code is governed by a MIT-style
|
||||
// license that can be found in the LICENSE file.
|
||||
package metricDataDispatcher
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"math"
|
||||
"time"
|
||||
|
||||
"github.com/ClusterCockpit/cc-backend/internal/config"
|
||||
"github.com/ClusterCockpit/cc-backend/internal/metricdata"
|
||||
"github.com/ClusterCockpit/cc-backend/pkg/archive"
|
||||
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
|
||||
"github.com/ClusterCockpit/cc-lib/lrucache"
|
||||
"github.com/ClusterCockpit/cc-lib/resampler"
|
||||
"github.com/ClusterCockpit/cc-lib/schema"
|
||||
)
|
||||
|
||||
var cache *lrucache.Cache = lrucache.New(128 * 1024 * 1024)
|
||||
|
||||
func cacheKey(
|
||||
job *schema.Job,
|
||||
metrics []string,
|
||||
scopes []schema.MetricScope,
|
||||
resolution int,
|
||||
) string {
|
||||
// Duration and StartTime do not need to be in the cache key as StartTime is less unique than
|
||||
// job.ID and the TTL of the cache entry makes sure it does not stay there forever.
|
||||
return fmt.Sprintf("%d(%s):[%v],[%v]-%d",
|
||||
job.ID, job.State, metrics, scopes, resolution)
|
||||
}
|
||||
|
||||
// Fetches the metric data for a job.
|
||||
func LoadData(job *schema.Job,
|
||||
metrics []string,
|
||||
scopes []schema.MetricScope,
|
||||
ctx context.Context,
|
||||
resolution int,
|
||||
) (schema.JobData, error) {
|
||||
data := cache.Get(cacheKey(job, metrics, scopes, resolution), func() (_ any, ttl time.Duration, size int) {
|
||||
var jd schema.JobData
|
||||
var err error
|
||||
|
||||
if job.State == schema.JobStateRunning ||
|
||||
job.MonitoringStatus == schema.MonitoringStatusRunningOrArchiving ||
|
||||
config.Keys.DisableArchive {
|
||||
|
||||
repo, err := metricdata.GetMetricDataRepo(job.Cluster)
|
||||
if err != nil {
|
||||
return fmt.Errorf("METRICDATA/METRICDATA > no metric data repository configured for '%s'", job.Cluster), 0, 0
|
||||
}
|
||||
|
||||
if scopes == nil {
|
||||
scopes = append(scopes, schema.MetricScopeNode)
|
||||
}
|
||||
|
||||
if metrics == nil {
|
||||
cluster := archive.GetCluster(job.Cluster)
|
||||
for _, mc := range cluster.MetricConfig {
|
||||
metrics = append(metrics, mc.Name)
|
||||
}
|
||||
}
|
||||
|
||||
jd, err = repo.LoadData(job, metrics, scopes, ctx, resolution)
|
||||
if err != nil {
|
||||
if len(jd) != 0 {
|
||||
cclog.Warnf("partial error: %s", err.Error())
|
||||
// return err, 0, 0 // Reactivating will block archiving on one partial error
|
||||
} else {
|
||||
cclog.Error("Error while loading job data from metric repository")
|
||||
return err, 0, 0
|
||||
}
|
||||
}
|
||||
size = jd.Size()
|
||||
} else {
|
||||
var jd_temp schema.JobData
|
||||
jd_temp, err = archive.GetHandle().LoadJobData(job)
|
||||
if err != nil {
|
||||
cclog.Error("Error while loading job data from archive")
|
||||
return err, 0, 0
|
||||
}
|
||||
|
||||
// Deep copy the cached archive hashmap
|
||||
jd = metricdata.DeepCopy(jd_temp)
|
||||
|
||||
// Resampling for archived data.
|
||||
// Pass the resolution from frontend here.
|
||||
for _, v := range jd {
|
||||
for _, v_ := range v {
|
||||
timestep := int64(0)
|
||||
for i := 0; i < len(v_.Series); i += 1 {
|
||||
v_.Series[i].Data, timestep, err = resampler.LargestTriangleThreeBucket(v_.Series[i].Data, int64(v_.Timestep), int64(resolution))
|
||||
if err != nil {
|
||||
return err, 0, 0
|
||||
}
|
||||
}
|
||||
v_.Timestep = int(timestep)
|
||||
}
|
||||
}
|
||||
|
||||
// Avoid sending unrequested data to the client:
|
||||
if metrics != nil || scopes != nil {
|
||||
if metrics == nil {
|
||||
metrics = make([]string, 0, len(jd))
|
||||
for k := range jd {
|
||||
metrics = append(metrics, k)
|
||||
}
|
||||
}
|
||||
|
||||
res := schema.JobData{}
|
||||
for _, metric := range metrics {
|
||||
if perscope, ok := jd[metric]; ok {
|
||||
if len(perscope) > 1 {
|
||||
subset := make(map[schema.MetricScope]*schema.JobMetric)
|
||||
for _, scope := range scopes {
|
||||
if jm, ok := perscope[scope]; ok {
|
||||
subset[scope] = jm
|
||||
}
|
||||
}
|
||||
|
||||
if len(subset) > 0 {
|
||||
perscope = subset
|
||||
}
|
||||
}
|
||||
|
||||
res[metric] = perscope
|
||||
}
|
||||
}
|
||||
jd = res
|
||||
}
|
||||
size = jd.Size()
|
||||
}
|
||||
|
||||
ttl = 5 * time.Hour
|
||||
if job.State == schema.JobStateRunning {
|
||||
ttl = 2 * time.Minute
|
||||
}
|
||||
|
||||
// FIXME: Review: Is this really necessary or correct.
|
||||
// Note: Lines 147-170 formerly known as prepareJobData(jobData, scopes)
|
||||
// For /monitoring/job/<job> and some other places, flops_any and mem_bw need
|
||||
// to be available at the scope 'node'. If a job has a lot of nodes,
|
||||
// statisticsSeries should be available so that a min/median/max Graph can be
|
||||
// used instead of a lot of single lines.
|
||||
// NOTE: New StatsSeries will always be calculated as 'min/median/max'
|
||||
// Existing (archived) StatsSeries can be 'min/mean/max'!
|
||||
const maxSeriesSize int = 15
|
||||
for _, scopes := range jd {
|
||||
for _, jm := range scopes {
|
||||
if jm.StatisticsSeries != nil || len(jm.Series) <= maxSeriesSize {
|
||||
continue
|
||||
}
|
||||
|
||||
jm.AddStatisticsSeries()
|
||||
}
|
||||
}
|
||||
|
||||
nodeScopeRequested := false
|
||||
for _, scope := range scopes {
|
||||
if scope == schema.MetricScopeNode {
|
||||
nodeScopeRequested = true
|
||||
}
|
||||
}
|
||||
|
||||
if nodeScopeRequested {
|
||||
jd.AddNodeScope("flops_any")
|
||||
jd.AddNodeScope("mem_bw")
|
||||
}
|
||||
|
||||
// Round Resulting Stat Values
|
||||
jd.RoundMetricStats()
|
||||
|
||||
return jd, ttl, size
|
||||
})
|
||||
|
||||
if err, ok := data.(error); ok {
|
||||
cclog.Error("Error in returned dataset")
|
||||
return nil, err
|
||||
}
|
||||
|
||||
return data.(schema.JobData), nil
|
||||
}
|
||||
|
||||
// Used for the jobsFootprint GraphQL-Query. TODO: Rename/Generalize.
|
||||
func LoadAverages(
|
||||
job *schema.Job,
|
||||
metrics []string,
|
||||
data [][]schema.Float,
|
||||
ctx context.Context,
|
||||
) error {
|
||||
if job.State != schema.JobStateRunning && !config.Keys.DisableArchive {
|
||||
return archive.LoadAveragesFromArchive(job, metrics, data) // #166 change also here?
|
||||
}
|
||||
|
||||
repo, err := metricdata.GetMetricDataRepo(job.Cluster)
|
||||
if err != nil {
|
||||
return fmt.Errorf("METRICDATA/METRICDATA > no metric data repository configured for '%s'", job.Cluster)
|
||||
}
|
||||
|
||||
stats, err := repo.LoadStats(job, metrics, ctx) // #166 how to handle stats for acc normalizazion?
|
||||
if err != nil {
|
||||
cclog.Errorf("Error while loading statistics for job %v (User %v, Project %v)", job.JobID, job.User, job.Project)
|
||||
return err
|
||||
}
|
||||
|
||||
for i, m := range metrics {
|
||||
nodes, ok := stats[m]
|
||||
if !ok {
|
||||
data[i] = append(data[i], schema.NaN)
|
||||
continue
|
||||
}
|
||||
|
||||
sum := 0.0
|
||||
for _, node := range nodes {
|
||||
sum += node.Avg
|
||||
}
|
||||
data[i] = append(data[i], schema.Float(sum))
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
// Used for statsTable in frontend: Return scoped statistics by metric.
|
||||
func LoadScopedJobStats(
|
||||
job *schema.Job,
|
||||
metrics []string,
|
||||
scopes []schema.MetricScope,
|
||||
ctx context.Context,
|
||||
) (schema.ScopedJobStats, error) {
|
||||
if job.State != schema.JobStateRunning && !config.Keys.DisableArchive {
|
||||
return archive.LoadScopedStatsFromArchive(job, metrics, scopes)
|
||||
}
|
||||
|
||||
repo, err := metricdata.GetMetricDataRepo(job.Cluster)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("job %d: no metric data repository configured for '%s'", job.JobID, job.Cluster)
|
||||
}
|
||||
|
||||
scopedStats, err := repo.LoadScopedStats(job, metrics, scopes, ctx)
|
||||
if err != nil {
|
||||
cclog.Errorf("error while loading scoped statistics for job %d (User %s, Project %s)", job.JobID, job.User, job.Project)
|
||||
return nil, err
|
||||
}
|
||||
|
||||
return scopedStats, nil
|
||||
}
|
||||
|
||||
// Used for polar plots in frontend: Aggregates statistics for all nodes to single values for job per metric.
|
||||
func LoadJobStats(
|
||||
job *schema.Job,
|
||||
metrics []string,
|
||||
ctx context.Context,
|
||||
) (map[string]schema.MetricStatistics, error) {
|
||||
if job.State != schema.JobStateRunning && !config.Keys.DisableArchive {
|
||||
return archive.LoadStatsFromArchive(job, metrics)
|
||||
}
|
||||
|
||||
data := make(map[string]schema.MetricStatistics, len(metrics))
|
||||
repo, err := metricdata.GetMetricDataRepo(job.Cluster)
|
||||
if err != nil {
|
||||
return data, fmt.Errorf("job %d: no metric data repository configured for '%s'", job.JobID, job.Cluster)
|
||||
}
|
||||
|
||||
stats, err := repo.LoadStats(job, metrics, ctx)
|
||||
if err != nil {
|
||||
cclog.Errorf("error while loading statistics for job %d (User %s, Project %s)", job.JobID, job.User, job.Project)
|
||||
return data, err
|
||||
}
|
||||
|
||||
for _, m := range metrics {
|
||||
sum, avg, min, max := 0.0, 0.0, 0.0, 0.0
|
||||
nodes, ok := stats[m]
|
||||
if !ok {
|
||||
data[m] = schema.MetricStatistics{Min: min, Avg: avg, Max: max}
|
||||
continue
|
||||
}
|
||||
|
||||
for _, node := range nodes {
|
||||
sum += node.Avg
|
||||
min = math.Min(min, node.Min)
|
||||
max = math.Max(max, node.Max)
|
||||
}
|
||||
|
||||
data[m] = schema.MetricStatistics{
|
||||
Avg: (math.Round((sum/float64(job.NumNodes))*100) / 100),
|
||||
Min: (math.Round(min*100) / 100),
|
||||
Max: (math.Round(max*100) / 100),
|
||||
}
|
||||
}
|
||||
|
||||
return data, nil
|
||||
}
|
||||
|
||||
// Used for the classic node/system view. Returns a map of nodes to a map of metrics.
|
||||
func LoadNodeData(
|
||||
cluster string,
|
||||
metrics, nodes []string,
|
||||
scopes []schema.MetricScope,
|
||||
from, to time.Time,
|
||||
ctx context.Context,
|
||||
) (map[string]map[string][]*schema.JobMetric, error) {
|
||||
repo, err := metricdata.GetMetricDataRepo(cluster)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("METRICDATA/METRICDATA > no metric data repository configured for '%s'", cluster)
|
||||
}
|
||||
|
||||
if metrics == nil {
|
||||
for _, m := range archive.GetCluster(cluster).MetricConfig {
|
||||
metrics = append(metrics, m.Name)
|
||||
}
|
||||
}
|
||||
|
||||
data, err := repo.LoadNodeData(cluster, metrics, nodes, scopes, from, to, ctx)
|
||||
if err != nil {
|
||||
if len(data) != 0 {
|
||||
cclog.Warnf("partial error: %s", err.Error())
|
||||
} else {
|
||||
cclog.Error("Error while loading node data from metric repository")
|
||||
return nil, err
|
||||
}
|
||||
}
|
||||
|
||||
if data == nil {
|
||||
return nil, fmt.Errorf("METRICDATA/METRICDATA > the metric data repository for '%s' does not support this query", cluster)
|
||||
}
|
||||
|
||||
return data, nil
|
||||
}
|
||||
|
||||
func LoadNodeListData(
|
||||
cluster, subCluster string,
|
||||
nodes []string,
|
||||
metrics []string,
|
||||
scopes []schema.MetricScope,
|
||||
resolution int,
|
||||
from, to time.Time,
|
||||
ctx context.Context,
|
||||
) (map[string]schema.JobData, error) {
|
||||
repo, err := metricdata.GetMetricDataRepo(cluster)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("METRICDATA/METRICDATA > no metric data repository configured for '%s'", cluster)
|
||||
}
|
||||
|
||||
if metrics == nil {
|
||||
for _, m := range archive.GetCluster(cluster).MetricConfig {
|
||||
metrics = append(metrics, m.Name)
|
||||
}
|
||||
}
|
||||
|
||||
data, err := repo.LoadNodeListData(cluster, subCluster, nodes, metrics, scopes, resolution, from, to, ctx)
|
||||
if err != nil {
|
||||
if len(data) != 0 {
|
||||
cclog.Warnf("partial error: %s", err.Error())
|
||||
} else {
|
||||
cclog.Error("Error while loading node data from metric repository")
|
||||
return nil, err
|
||||
}
|
||||
}
|
||||
|
||||
// NOTE: New StatsSeries will always be calculated as 'min/median/max'
|
||||
const maxSeriesSize int = 8
|
||||
for _, jd := range data {
|
||||
for _, scopes := range jd {
|
||||
for _, jm := range scopes {
|
||||
if jm.StatisticsSeries != nil || len(jm.Series) < maxSeriesSize {
|
||||
continue
|
||||
}
|
||||
jm.AddStatisticsSeries()
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if data == nil {
|
||||
return nil, fmt.Errorf("METRICDATA/METRICDATA > the metric data repository for '%s' does not support this query", cluster)
|
||||
}
|
||||
|
||||
return data, nil
|
||||
}
|
||||
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
@@ -1,88 +0,0 @@
|
||||
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||
// All rights reserved. This file is part of cc-backend.
|
||||
// Use of this source code is governed by a MIT-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
package metricdata
|
||||
|
||||
import (
|
||||
"context"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"time"
|
||||
|
||||
"github.com/ClusterCockpit/cc-backend/internal/config"
|
||||
"github.com/ClusterCockpit/cc-backend/internal/memorystore"
|
||||
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
|
||||
"github.com/ClusterCockpit/cc-lib/schema"
|
||||
)
|
||||
|
||||
type MetricDataRepository interface {
|
||||
// Initialize this MetricDataRepository. One instance of
|
||||
// this interface will only ever be responsible for one cluster.
|
||||
Init(rawConfig json.RawMessage) error
|
||||
|
||||
// Return the JobData for the given job, only with the requested metrics.
|
||||
LoadData(job *schema.Job, metrics []string, scopes []schema.MetricScope, ctx context.Context, resolution int) (schema.JobData, error)
|
||||
|
||||
// Return a map of metrics to a map of nodes to the metric statistics of the job. node scope only.
|
||||
LoadStats(job *schema.Job, metrics []string, ctx context.Context) (map[string]map[string]schema.MetricStatistics, error)
|
||||
|
||||
// Return a map of metrics to a map of scopes to the scoped metric statistics of the job.
|
||||
LoadScopedStats(job *schema.Job, metrics []string, scopes []schema.MetricScope, ctx context.Context) (schema.ScopedJobStats, error)
|
||||
|
||||
// Return a map of hosts to a map of metrics at the requested scopes (currently only node) for that node.
|
||||
LoadNodeData(cluster string, metrics, nodes []string, scopes []schema.MetricScope, from, to time.Time, ctx context.Context) (map[string]map[string][]*schema.JobMetric, error)
|
||||
|
||||
// Return a map of hosts to a map of metrics to a map of scopes for multiple nodes.
|
||||
LoadNodeListData(cluster, subCluster string, nodes, metrics []string, scopes []schema.MetricScope, resolution int, from, to time.Time, ctx context.Context) (map[string]schema.JobData, error)
|
||||
}
|
||||
|
||||
var metricDataRepos map[string]MetricDataRepository = map[string]MetricDataRepository{}
|
||||
|
||||
func Init() error {
|
||||
for _, cluster := range config.Clusters {
|
||||
if cluster.MetricDataRepository != nil {
|
||||
var kind struct {
|
||||
Kind string `json:"kind"`
|
||||
}
|
||||
if err := json.Unmarshal(cluster.MetricDataRepository, &kind); err != nil {
|
||||
cclog.Warn("Error while unmarshaling raw json MetricDataRepository")
|
||||
return err
|
||||
}
|
||||
|
||||
var mdr MetricDataRepository
|
||||
switch kind.Kind {
|
||||
case "cc-metric-store":
|
||||
mdr = &CCMetricStore{}
|
||||
case "cc-metric-store-internal":
|
||||
mdr = &CCMetricStoreInternal{}
|
||||
memorystore.InternalCCMSFlag = true
|
||||
case "prometheus":
|
||||
mdr = &PrometheusDataRepository{}
|
||||
case "test":
|
||||
mdr = &TestMetricDataRepository{}
|
||||
default:
|
||||
return fmt.Errorf("METRICDATA/METRICDATA > Unknown MetricDataRepository %v for cluster %v", kind.Kind, cluster.Name)
|
||||
}
|
||||
|
||||
if err := mdr.Init(cluster.MetricDataRepository); err != nil {
|
||||
cclog.Errorf("Error initializing MetricDataRepository %v for cluster %v", kind.Kind, cluster.Name)
|
||||
return err
|
||||
}
|
||||
metricDataRepos[cluster.Name] = mdr
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func GetMetricDataRepo(cluster string) (MetricDataRepository, error) {
|
||||
var err error
|
||||
repo, ok := metricDataRepos[cluster]
|
||||
|
||||
if !ok {
|
||||
err = fmt.Errorf("METRICDATA/METRICDATA > no metric data repository configured for '%s'", cluster)
|
||||
}
|
||||
|
||||
return repo, err
|
||||
}
|
||||
@@ -1,587 +0,0 @@
|
||||
// Copyright (C) 2022 DKRZ
|
||||
// All rights reserved. This file is part of cc-backend.
|
||||
// Use of this source code is governed by a MIT-style
|
||||
// license that can be found in the LICENSE file.
|
||||
package metricdata
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"context"
|
||||
"encoding/json"
|
||||
"errors"
|
||||
"fmt"
|
||||
"math"
|
||||
"net/http"
|
||||
"os"
|
||||
"regexp"
|
||||
"sort"
|
||||
"strings"
|
||||
"sync"
|
||||
"text/template"
|
||||
"time"
|
||||
|
||||
"github.com/ClusterCockpit/cc-backend/pkg/archive"
|
||||
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
|
||||
"github.com/ClusterCockpit/cc-lib/schema"
|
||||
promapi "github.com/prometheus/client_golang/api"
|
||||
promv1 "github.com/prometheus/client_golang/api/prometheus/v1"
|
||||
promcfg "github.com/prometheus/common/config"
|
||||
promm "github.com/prometheus/common/model"
|
||||
)
|
||||
|
||||
type PrometheusDataRepositoryConfig struct {
|
||||
Url string `json:"url"`
|
||||
Username string `json:"username,omitempty"`
|
||||
Suffix string `json:"suffix,omitempty"`
|
||||
Templates map[string]string `json:"query-templates"`
|
||||
}
|
||||
|
||||
type PrometheusDataRepository struct {
|
||||
client promapi.Client
|
||||
queryClient promv1.API
|
||||
suffix string
|
||||
templates map[string]*template.Template
|
||||
}
|
||||
|
||||
type PromQLArgs struct {
|
||||
Nodes string
|
||||
}
|
||||
|
||||
type Trie map[rune]Trie
|
||||
|
||||
var logOnce sync.Once
|
||||
|
||||
func contains(s []schema.MetricScope, str schema.MetricScope) bool {
|
||||
for _, v := range s {
|
||||
if v == str {
|
||||
return true
|
||||
}
|
||||
}
|
||||
return false
|
||||
}
|
||||
|
||||
func MinMaxMean(data []schema.Float) (float64, float64, float64) {
|
||||
if len(data) == 0 {
|
||||
return 0.0, 0.0, 0.0
|
||||
}
|
||||
min := math.MaxFloat64
|
||||
max := -math.MaxFloat64
|
||||
var sum float64
|
||||
var n float64
|
||||
for _, val := range data {
|
||||
if val.IsNaN() {
|
||||
continue
|
||||
}
|
||||
sum += float64(val)
|
||||
n += 1
|
||||
if float64(val) > max {
|
||||
max = float64(val)
|
||||
}
|
||||
if float64(val) < min {
|
||||
min = float64(val)
|
||||
}
|
||||
}
|
||||
return min, max, sum / n
|
||||
}
|
||||
|
||||
// Rewritten from
|
||||
// https://github.com/ermanh/trieregex/blob/master/trieregex/trieregex.py
|
||||
func nodeRegex(nodes []string) string {
|
||||
root := Trie{}
|
||||
// add runes of each compute node to trie
|
||||
for _, node := range nodes {
|
||||
_trie := root
|
||||
for _, c := range node {
|
||||
if _, ok := _trie[c]; !ok {
|
||||
_trie[c] = Trie{}
|
||||
}
|
||||
_trie = _trie[c]
|
||||
}
|
||||
_trie['*'] = Trie{}
|
||||
}
|
||||
// recursively build regex from rune trie
|
||||
var trieRegex func(trie Trie, reset bool) string
|
||||
trieRegex = func(trie Trie, reset bool) string {
|
||||
if reset == true {
|
||||
trie = root
|
||||
}
|
||||
if len(trie) == 0 {
|
||||
return ""
|
||||
}
|
||||
if len(trie) == 1 {
|
||||
for key, _trie := range trie {
|
||||
if key == '*' {
|
||||
return ""
|
||||
}
|
||||
return regexp.QuoteMeta(string(key)) + trieRegex(_trie, false)
|
||||
}
|
||||
} else {
|
||||
sequences := []string{}
|
||||
for key, _trie := range trie {
|
||||
if key != '*' {
|
||||
sequences = append(sequences, regexp.QuoteMeta(string(key))+trieRegex(_trie, false))
|
||||
}
|
||||
}
|
||||
sort.Slice(sequences, func(i, j int) bool {
|
||||
return (-len(sequences[i]) < -len(sequences[j])) || (sequences[i] < sequences[j])
|
||||
})
|
||||
var result string
|
||||
// single edge from this tree node
|
||||
if len(sequences) == 1 {
|
||||
result = sequences[0]
|
||||
if len(result) > 1 {
|
||||
result = "(?:" + result + ")"
|
||||
}
|
||||
// multiple edges, each length 1
|
||||
} else if s := strings.Join(sequences, ""); len(s) == len(sequences) {
|
||||
// char or numeric range
|
||||
if len(s)-1 == int(s[len(s)-1])-int(s[0]) {
|
||||
result = fmt.Sprintf("[%c-%c]", s[0], s[len(s)-1])
|
||||
// char or numeric set
|
||||
} else {
|
||||
result = "[" + s + "]"
|
||||
}
|
||||
// multiple edges of different lengths
|
||||
} else {
|
||||
result = "(?:" + strings.Join(sequences, "|") + ")"
|
||||
}
|
||||
if _, ok := trie['*']; ok {
|
||||
result += "?"
|
||||
}
|
||||
return result
|
||||
}
|
||||
return ""
|
||||
}
|
||||
return trieRegex(root, true)
|
||||
}
|
||||
|
||||
func (pdb *PrometheusDataRepository) Init(rawConfig json.RawMessage) error {
|
||||
var config PrometheusDataRepositoryConfig
|
||||
// parse config
|
||||
if err := json.Unmarshal(rawConfig, &config); err != nil {
|
||||
cclog.Warn("Error while unmarshaling raw json config")
|
||||
return err
|
||||
}
|
||||
// support basic authentication
|
||||
var rt http.RoundTripper = nil
|
||||
if prom_pw := os.Getenv("PROMETHEUS_PASSWORD"); prom_pw != "" && config.Username != "" {
|
||||
prom_pw := promcfg.Secret(prom_pw)
|
||||
rt = promcfg.NewBasicAuthRoundTripper(promcfg.NewInlineSecret(config.Username), promcfg.NewInlineSecret(string(prom_pw)), promapi.DefaultRoundTripper)
|
||||
} else {
|
||||
if config.Username != "" {
|
||||
return errors.New("METRICDATA/PROMETHEUS > Prometheus username provided, but PROMETHEUS_PASSWORD not set")
|
||||
}
|
||||
}
|
||||
// init client
|
||||
client, err := promapi.NewClient(promapi.Config{
|
||||
Address: config.Url,
|
||||
RoundTripper: rt,
|
||||
})
|
||||
if err != nil {
|
||||
cclog.Error("Error while initializing new prometheus client")
|
||||
return err
|
||||
}
|
||||
// init query client
|
||||
pdb.client = client
|
||||
pdb.queryClient = promv1.NewAPI(pdb.client)
|
||||
// site config
|
||||
pdb.suffix = config.Suffix
|
||||
// init query templates
|
||||
pdb.templates = make(map[string]*template.Template)
|
||||
for metric, templ := range config.Templates {
|
||||
pdb.templates[metric], err = template.New(metric).Parse(templ)
|
||||
if err == nil {
|
||||
cclog.Debugf("Added PromQL template for %s: %s", metric, templ)
|
||||
} else {
|
||||
cclog.Warnf("Failed to parse PromQL template %s for metric %s", templ, metric)
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// TODO: respect scope argument
|
||||
func (pdb *PrometheusDataRepository) FormatQuery(
|
||||
metric string,
|
||||
scope schema.MetricScope,
|
||||
nodes []string,
|
||||
cluster string,
|
||||
) (string, error) {
|
||||
args := PromQLArgs{}
|
||||
if len(nodes) > 0 {
|
||||
args.Nodes = fmt.Sprintf("(%s)%s", nodeRegex(nodes), pdb.suffix)
|
||||
} else {
|
||||
args.Nodes = fmt.Sprintf(".*%s", pdb.suffix)
|
||||
}
|
||||
|
||||
buf := &bytes.Buffer{}
|
||||
if templ, ok := pdb.templates[metric]; ok {
|
||||
err := templ.Execute(buf, args)
|
||||
if err != nil {
|
||||
return "", errors.New(fmt.Sprintf("METRICDATA/PROMETHEUS > Error compiling template %v", templ))
|
||||
} else {
|
||||
query := buf.String()
|
||||
cclog.Debugf("PromQL: %s", query)
|
||||
return query, nil
|
||||
}
|
||||
} else {
|
||||
return "", errors.New(fmt.Sprintf("METRICDATA/PROMETHEUS > No PromQL for metric %s configured.", metric))
|
||||
}
|
||||
}
|
||||
|
||||
// Convert PromAPI row to CC schema.Series
|
||||
func (pdb *PrometheusDataRepository) RowToSeries(
|
||||
from time.Time,
|
||||
step int64,
|
||||
steps int64,
|
||||
row *promm.SampleStream,
|
||||
) schema.Series {
|
||||
ts := from.Unix()
|
||||
hostname := strings.TrimSuffix(string(row.Metric["exported_instance"]), pdb.suffix)
|
||||
// init array of expected length with NaN
|
||||
values := make([]schema.Float, steps+1)
|
||||
for i := range values {
|
||||
values[i] = schema.NaN
|
||||
}
|
||||
// copy recorded values from prom sample pair
|
||||
for _, v := range row.Values {
|
||||
idx := (v.Timestamp.Unix() - ts) / step
|
||||
values[idx] = schema.Float(v.Value)
|
||||
}
|
||||
min, max, mean := MinMaxMean(values)
|
||||
// output struct
|
||||
return schema.Series{
|
||||
Hostname: hostname,
|
||||
Data: values,
|
||||
Statistics: schema.MetricStatistics{
|
||||
Avg: mean,
|
||||
Min: min,
|
||||
Max: max,
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
func (pdb *PrometheusDataRepository) LoadData(
|
||||
job *schema.Job,
|
||||
metrics []string,
|
||||
scopes []schema.MetricScope,
|
||||
ctx context.Context,
|
||||
resolution int,
|
||||
) (schema.JobData, error) {
|
||||
// TODO respect requested scope
|
||||
if len(scopes) == 0 || !contains(scopes, schema.MetricScopeNode) {
|
||||
scopes = append(scopes, schema.MetricScopeNode)
|
||||
}
|
||||
|
||||
jobData := make(schema.JobData)
|
||||
// parse job specs
|
||||
nodes := make([]string, len(job.Resources))
|
||||
for i, resource := range job.Resources {
|
||||
nodes[i] = resource.Hostname
|
||||
}
|
||||
from := time.Unix(job.StartTime, 0)
|
||||
to := time.Unix(job.StartTime+int64(job.Duration), 0)
|
||||
|
||||
for _, scope := range scopes {
|
||||
if scope != schema.MetricScopeNode {
|
||||
logOnce.Do(func() {
|
||||
cclog.Infof("Scope '%s' requested, but not yet supported: Will return 'node' scope only.", scope)
|
||||
})
|
||||
continue
|
||||
}
|
||||
|
||||
for _, metric := range metrics {
|
||||
metricConfig := archive.GetMetricConfig(job.Cluster, metric)
|
||||
if metricConfig == nil {
|
||||
cclog.Warnf("Error in LoadData: Metric %s for cluster %s not configured", metric, job.Cluster)
|
||||
return nil, errors.New("Prometheus config error")
|
||||
}
|
||||
query, err := pdb.FormatQuery(metric, scope, nodes, job.Cluster)
|
||||
if err != nil {
|
||||
cclog.Warn("Error while formatting prometheus query")
|
||||
return nil, err
|
||||
}
|
||||
|
||||
// ranged query over all job nodes
|
||||
r := promv1.Range{
|
||||
Start: from,
|
||||
End: to,
|
||||
Step: time.Duration(metricConfig.Timestep * 1e9),
|
||||
}
|
||||
result, warnings, err := pdb.queryClient.QueryRange(ctx, query, r)
|
||||
if err != nil {
|
||||
cclog.Errorf("Prometheus query error in LoadData: %v\nQuery: %s", err, query)
|
||||
return nil, errors.New("Prometheus query error")
|
||||
}
|
||||
if len(warnings) > 0 {
|
||||
cclog.Warnf("Warnings: %v\n", warnings)
|
||||
}
|
||||
|
||||
// init data structures
|
||||
if _, ok := jobData[metric]; !ok {
|
||||
jobData[metric] = make(map[schema.MetricScope]*schema.JobMetric)
|
||||
}
|
||||
jobMetric, ok := jobData[metric][scope]
|
||||
if !ok {
|
||||
jobMetric = &schema.JobMetric{
|
||||
Unit: metricConfig.Unit,
|
||||
Timestep: metricConfig.Timestep,
|
||||
Series: make([]schema.Series, 0),
|
||||
}
|
||||
}
|
||||
step := int64(metricConfig.Timestep)
|
||||
steps := int64(to.Sub(from).Seconds()) / step
|
||||
// iter rows of host, metric, values
|
||||
for _, row := range result.(promm.Matrix) {
|
||||
jobMetric.Series = append(jobMetric.Series,
|
||||
pdb.RowToSeries(from, step, steps, row))
|
||||
}
|
||||
// only add metric if at least one host returned data
|
||||
if !ok && len(jobMetric.Series) > 0 {
|
||||
jobData[metric][scope] = jobMetric
|
||||
}
|
||||
// sort by hostname to get uniform coloring
|
||||
sort.Slice(jobMetric.Series, func(i, j int) bool {
|
||||
return (jobMetric.Series[i].Hostname < jobMetric.Series[j].Hostname)
|
||||
})
|
||||
}
|
||||
}
|
||||
return jobData, nil
|
||||
}
|
||||
|
||||
// TODO change implementation to precomputed/cached stats
|
||||
func (pdb *PrometheusDataRepository) LoadStats(
|
||||
job *schema.Job,
|
||||
metrics []string,
|
||||
ctx context.Context,
|
||||
) (map[string]map[string]schema.MetricStatistics, error) {
|
||||
// map of metrics of nodes of stats
|
||||
stats := map[string]map[string]schema.MetricStatistics{}
|
||||
|
||||
data, err := pdb.LoadData(job, metrics, []schema.MetricScope{schema.MetricScopeNode}, ctx, 0 /*resolution here*/)
|
||||
if err != nil {
|
||||
cclog.Warn("Error while loading job for stats")
|
||||
return nil, err
|
||||
}
|
||||
for metric, metricData := range data {
|
||||
stats[metric] = make(map[string]schema.MetricStatistics)
|
||||
for _, series := range metricData[schema.MetricScopeNode].Series {
|
||||
stats[metric][series.Hostname] = series.Statistics
|
||||
}
|
||||
}
|
||||
|
||||
return stats, nil
|
||||
}
|
||||
|
||||
func (pdb *PrometheusDataRepository) LoadNodeData(
|
||||
cluster string,
|
||||
metrics, nodes []string,
|
||||
scopes []schema.MetricScope,
|
||||
from, to time.Time,
|
||||
ctx context.Context,
|
||||
) (map[string]map[string][]*schema.JobMetric, error) {
|
||||
t0 := time.Now()
|
||||
// Map of hosts of metrics of value slices
|
||||
data := make(map[string]map[string][]*schema.JobMetric)
|
||||
// query db for each metric
|
||||
// TODO: scopes seems to be always empty
|
||||
if len(scopes) == 0 || !contains(scopes, schema.MetricScopeNode) {
|
||||
scopes = append(scopes, schema.MetricScopeNode)
|
||||
}
|
||||
for _, scope := range scopes {
|
||||
if scope != schema.MetricScopeNode {
|
||||
logOnce.Do(func() {
|
||||
cclog.Infof("Note: Scope '%s' requested, but not yet supported: Will return 'node' scope only.", scope)
|
||||
})
|
||||
continue
|
||||
}
|
||||
for _, metric := range metrics {
|
||||
metricConfig := archive.GetMetricConfig(cluster, metric)
|
||||
if metricConfig == nil {
|
||||
cclog.Warnf("Error in LoadNodeData: Metric %s for cluster %s not configured", metric, cluster)
|
||||
return nil, errors.New("Prometheus config error")
|
||||
}
|
||||
query, err := pdb.FormatQuery(metric, scope, nodes, cluster)
|
||||
if err != nil {
|
||||
cclog.Warn("Error while formatting prometheus query")
|
||||
return nil, err
|
||||
}
|
||||
|
||||
// ranged query over all nodes
|
||||
r := promv1.Range{
|
||||
Start: from,
|
||||
End: to,
|
||||
Step: time.Duration(metricConfig.Timestep * 1e9),
|
||||
}
|
||||
result, warnings, err := pdb.queryClient.QueryRange(ctx, query, r)
|
||||
if err != nil {
|
||||
cclog.Errorf("Prometheus query error in LoadNodeData: %v\n", err)
|
||||
return nil, errors.New("Prometheus query error")
|
||||
}
|
||||
if len(warnings) > 0 {
|
||||
cclog.Warnf("Warnings: %v\n", warnings)
|
||||
}
|
||||
|
||||
step := int64(metricConfig.Timestep)
|
||||
steps := int64(to.Sub(from).Seconds()) / step
|
||||
|
||||
// iter rows of host, metric, values
|
||||
for _, row := range result.(promm.Matrix) {
|
||||
hostname := strings.TrimSuffix(string(row.Metric["exported_instance"]), pdb.suffix)
|
||||
hostdata, ok := data[hostname]
|
||||
if !ok {
|
||||
hostdata = make(map[string][]*schema.JobMetric)
|
||||
data[hostname] = hostdata
|
||||
}
|
||||
// output per host and metric
|
||||
hostdata[metric] = append(hostdata[metric], &schema.JobMetric{
|
||||
Unit: metricConfig.Unit,
|
||||
Timestep: metricConfig.Timestep,
|
||||
Series: []schema.Series{pdb.RowToSeries(from, step, steps, row)},
|
||||
},
|
||||
)
|
||||
}
|
||||
}
|
||||
}
|
||||
t1 := time.Since(t0)
|
||||
cclog.Debugf("LoadNodeData of %v nodes took %s", len(data), t1)
|
||||
return data, nil
|
||||
}
|
||||
|
||||
// Implemented by NHR@FAU; Used in Job-View StatsTable
|
||||
func (pdb *PrometheusDataRepository) LoadScopedStats(
|
||||
job *schema.Job,
|
||||
metrics []string,
|
||||
scopes []schema.MetricScope,
|
||||
ctx context.Context,
|
||||
) (schema.ScopedJobStats, error) {
|
||||
// Assumption: pdb.loadData() only returns series node-scope - use node scope for statsTable
|
||||
scopedJobStats := make(schema.ScopedJobStats)
|
||||
data, err := pdb.LoadData(job, metrics, []schema.MetricScope{schema.MetricScopeNode}, ctx, 0 /*resolution here*/)
|
||||
if err != nil {
|
||||
cclog.Warn("Error while loading job for scopedJobStats")
|
||||
return nil, err
|
||||
}
|
||||
|
||||
for metric, metricData := range data {
|
||||
for _, scope := range scopes {
|
||||
if scope != schema.MetricScopeNode {
|
||||
logOnce.Do(func() {
|
||||
cclog.Infof("Note: Scope '%s' requested, but not yet supported: Will return 'node' scope only.", scope)
|
||||
})
|
||||
continue
|
||||
}
|
||||
|
||||
if _, ok := scopedJobStats[metric]; !ok {
|
||||
scopedJobStats[metric] = make(map[schema.MetricScope][]*schema.ScopedStats)
|
||||
}
|
||||
|
||||
if _, ok := scopedJobStats[metric][scope]; !ok {
|
||||
scopedJobStats[metric][scope] = make([]*schema.ScopedStats, 0)
|
||||
}
|
||||
|
||||
for _, series := range metricData[scope].Series {
|
||||
scopedJobStats[metric][scope] = append(scopedJobStats[metric][scope], &schema.ScopedStats{
|
||||
Hostname: series.Hostname,
|
||||
Data: &series.Statistics,
|
||||
})
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return scopedJobStats, nil
|
||||
}
|
||||
|
||||
// Implemented by NHR@FAU; Used in NodeList-View
|
||||
func (pdb *PrometheusDataRepository) LoadNodeListData(
|
||||
cluster, subCluster string,
|
||||
nodes []string,
|
||||
metrics []string,
|
||||
scopes []schema.MetricScope,
|
||||
resolution int,
|
||||
from, to time.Time,
|
||||
ctx context.Context,
|
||||
) (map[string]schema.JobData, error) {
|
||||
// Assumption: pdb.loadData() only returns series node-scope - use node scope for NodeList
|
||||
|
||||
// Fetch Data, based on pdb.LoadNodeData()
|
||||
t0 := time.Now()
|
||||
// Map of hosts of jobData
|
||||
data := make(map[string]schema.JobData)
|
||||
|
||||
// query db for each metric
|
||||
// TODO: scopes seems to be always empty
|
||||
if len(scopes) == 0 || !contains(scopes, schema.MetricScopeNode) {
|
||||
scopes = append(scopes, schema.MetricScopeNode)
|
||||
}
|
||||
|
||||
for _, scope := range scopes {
|
||||
if scope != schema.MetricScopeNode {
|
||||
logOnce.Do(func() {
|
||||
cclog.Infof("Note: Scope '%s' requested, but not yet supported: Will return 'node' scope only.", scope)
|
||||
})
|
||||
continue
|
||||
}
|
||||
|
||||
for _, metric := range metrics {
|
||||
metricConfig := archive.GetMetricConfig(cluster, metric)
|
||||
if metricConfig == nil {
|
||||
cclog.Warnf("Error in LoadNodeListData: Metric %s for cluster %s not configured", metric, cluster)
|
||||
return nil, errors.New("Prometheus config error")
|
||||
}
|
||||
query, err := pdb.FormatQuery(metric, scope, nodes, cluster)
|
||||
if err != nil {
|
||||
cclog.Warn("Error while formatting prometheus query")
|
||||
return nil, err
|
||||
}
|
||||
|
||||
// ranged query over all nodes
|
||||
r := promv1.Range{
|
||||
Start: from,
|
||||
End: to,
|
||||
Step: time.Duration(metricConfig.Timestep * 1e9),
|
||||
}
|
||||
result, warnings, err := pdb.queryClient.QueryRange(ctx, query, r)
|
||||
if err != nil {
|
||||
cclog.Errorf("Prometheus query error in LoadNodeData: %v\n", err)
|
||||
return nil, errors.New("Prometheus query error")
|
||||
}
|
||||
if len(warnings) > 0 {
|
||||
cclog.Warnf("Warnings: %v\n", warnings)
|
||||
}
|
||||
|
||||
step := int64(metricConfig.Timestep)
|
||||
steps := int64(to.Sub(from).Seconds()) / step
|
||||
|
||||
// iter rows of host, metric, values
|
||||
for _, row := range result.(promm.Matrix) {
|
||||
hostname := strings.TrimSuffix(string(row.Metric["exported_instance"]), pdb.suffix)
|
||||
|
||||
hostdata, ok := data[hostname]
|
||||
if !ok {
|
||||
hostdata = make(schema.JobData)
|
||||
data[hostname] = hostdata
|
||||
}
|
||||
|
||||
metricdata, ok := hostdata[metric]
|
||||
if !ok {
|
||||
metricdata = make(map[schema.MetricScope]*schema.JobMetric)
|
||||
data[hostname][metric] = metricdata
|
||||
}
|
||||
|
||||
// output per host, metric and scope
|
||||
scopeData, ok := metricdata[scope]
|
||||
if !ok {
|
||||
scopeData = &schema.JobMetric{
|
||||
Unit: metricConfig.Unit,
|
||||
Timestep: metricConfig.Timestep,
|
||||
Series: []schema.Series{pdb.RowToSeries(from, step, steps, row)},
|
||||
}
|
||||
data[hostname][metric][scope] = scopeData
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
t1 := time.Since(t0)
|
||||
cclog.Debugf("LoadNodeListData of %v nodes took %s", len(data), t1)
|
||||
return data, nil
|
||||
}
|
||||
@@ -1,118 +0,0 @@
|
||||
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||
// All rights reserved. This file is part of cc-backend.
|
||||
// Use of this source code is governed by a MIT-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
package metricdata
|
||||
|
||||
import (
|
||||
"context"
|
||||
"encoding/json"
|
||||
"time"
|
||||
|
||||
"github.com/ClusterCockpit/cc-lib/schema"
|
||||
)
|
||||
|
||||
var TestLoadDataCallback func(job *schema.Job, metrics []string, scopes []schema.MetricScope, ctx context.Context, resolution int) (schema.JobData, error) = func(job *schema.Job, metrics []string, scopes []schema.MetricScope, ctx context.Context, resolution int) (schema.JobData, error) {
|
||||
panic("TODO")
|
||||
}
|
||||
|
||||
// TestMetricDataRepository is only a mock for unit-testing.
|
||||
type TestMetricDataRepository struct{}
|
||||
|
||||
func (tmdr *TestMetricDataRepository) Init(_ json.RawMessage) error {
|
||||
return nil
|
||||
}
|
||||
|
||||
func (tmdr *TestMetricDataRepository) LoadData(
|
||||
job *schema.Job,
|
||||
metrics []string,
|
||||
scopes []schema.MetricScope,
|
||||
ctx context.Context,
|
||||
resolution int,
|
||||
) (schema.JobData, error) {
|
||||
return TestLoadDataCallback(job, metrics, scopes, ctx, resolution)
|
||||
}
|
||||
|
||||
func (tmdr *TestMetricDataRepository) LoadStats(
|
||||
job *schema.Job,
|
||||
metrics []string,
|
||||
ctx context.Context,
|
||||
) (map[string]map[string]schema.MetricStatistics, error) {
|
||||
panic("TODO")
|
||||
}
|
||||
|
||||
func (tmdr *TestMetricDataRepository) LoadScopedStats(
|
||||
job *schema.Job,
|
||||
metrics []string,
|
||||
scopes []schema.MetricScope,
|
||||
ctx context.Context,
|
||||
) (schema.ScopedJobStats, error) {
|
||||
panic("TODO")
|
||||
}
|
||||
|
||||
func (tmdr *TestMetricDataRepository) LoadNodeData(
|
||||
cluster string,
|
||||
metrics, nodes []string,
|
||||
scopes []schema.MetricScope,
|
||||
from, to time.Time,
|
||||
ctx context.Context,
|
||||
) (map[string]map[string][]*schema.JobMetric, error) {
|
||||
panic("TODO")
|
||||
}
|
||||
|
||||
func (tmdr *TestMetricDataRepository) LoadNodeListData(
|
||||
cluster, subCluster string,
|
||||
nodes []string,
|
||||
metrics []string,
|
||||
scopes []schema.MetricScope,
|
||||
resolution int,
|
||||
from, to time.Time,
|
||||
ctx context.Context,
|
||||
) (map[string]schema.JobData, error) {
|
||||
panic("TODO")
|
||||
}
|
||||
|
||||
func DeepCopy(jdTemp schema.JobData) schema.JobData {
|
||||
jd := make(schema.JobData, len(jdTemp))
|
||||
for k, v := range jdTemp {
|
||||
jd[k] = make(map[schema.MetricScope]*schema.JobMetric, len(jdTemp[k]))
|
||||
for k_, v_ := range v {
|
||||
jd[k][k_] = new(schema.JobMetric)
|
||||
jd[k][k_].Series = make([]schema.Series, len(v_.Series))
|
||||
for i := 0; i < len(v_.Series); i += 1 {
|
||||
jd[k][k_].Series[i].Data = make([]schema.Float, len(v_.Series[i].Data))
|
||||
copy(jd[k][k_].Series[i].Data, v_.Series[i].Data)
|
||||
jd[k][k_].Series[i].Hostname = v_.Series[i].Hostname
|
||||
jd[k][k_].Series[i].Id = v_.Series[i].Id
|
||||
jd[k][k_].Series[i].Statistics.Avg = v_.Series[i].Statistics.Avg
|
||||
jd[k][k_].Series[i].Statistics.Min = v_.Series[i].Statistics.Min
|
||||
jd[k][k_].Series[i].Statistics.Max = v_.Series[i].Statistics.Max
|
||||
}
|
||||
jd[k][k_].Timestep = v_.Timestep
|
||||
jd[k][k_].Unit.Base = v_.Unit.Base
|
||||
jd[k][k_].Unit.Prefix = v_.Unit.Prefix
|
||||
if v_.StatisticsSeries != nil {
|
||||
// Init Slices
|
||||
jd[k][k_].StatisticsSeries = new(schema.StatsSeries)
|
||||
jd[k][k_].StatisticsSeries.Max = make([]schema.Float, len(v_.StatisticsSeries.Max))
|
||||
jd[k][k_].StatisticsSeries.Min = make([]schema.Float, len(v_.StatisticsSeries.Min))
|
||||
jd[k][k_].StatisticsSeries.Median = make([]schema.Float, len(v_.StatisticsSeries.Median))
|
||||
jd[k][k_].StatisticsSeries.Mean = make([]schema.Float, len(v_.StatisticsSeries.Mean))
|
||||
// Copy Data
|
||||
copy(jd[k][k_].StatisticsSeries.Max, v_.StatisticsSeries.Max)
|
||||
copy(jd[k][k_].StatisticsSeries.Min, v_.StatisticsSeries.Min)
|
||||
copy(jd[k][k_].StatisticsSeries.Median, v_.StatisticsSeries.Median)
|
||||
copy(jd[k][k_].StatisticsSeries.Mean, v_.StatisticsSeries.Mean)
|
||||
// Handle Percentiles
|
||||
for k__, v__ := range v_.StatisticsSeries.Percentiles {
|
||||
jd[k][k_].StatisticsSeries.Percentiles[k__] = make([]schema.Float, len(v__))
|
||||
copy(jd[k][k_].StatisticsSeries.Percentiles[k__], v__)
|
||||
}
|
||||
} else {
|
||||
jd[k][k_].StatisticsSeries = v_.StatisticsSeries
|
||||
}
|
||||
}
|
||||
}
|
||||
return jd
|
||||
}
|
||||
29
internal/metricdispatch/configSchema.go
Normal file
29
internal/metricdispatch/configSchema.go
Normal file
@@ -0,0 +1,29 @@
|
||||
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||
// All rights reserved. This file is part of cc-backend.
|
||||
// Use of this source code is governed by a MIT-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
package metricdispatch
|
||||
|
||||
const configSchema = `{
|
||||
"type": "array",
|
||||
"description": "Array of metric store configurations with scope-based routing.",
|
||||
"items": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"scope": {
|
||||
"description": "Scope identifier for routing metrics (e.g., cluster name, '*' for default)",
|
||||
"type": "string"
|
||||
},
|
||||
"url": {
|
||||
"description": "URL of the metric store endpoint",
|
||||
"type": "string"
|
||||
},
|
||||
"token": {
|
||||
"description": "Authentication token for the metric store",
|
||||
"type": "string"
|
||||
}
|
||||
},
|
||||
"required": ["scope", "url", "token"]
|
||||
}
|
||||
}`
|
||||
533
internal/metricdispatch/dataLoader.go
Normal file
533
internal/metricdispatch/dataLoader.go
Normal file
@@ -0,0 +1,533 @@
|
||||
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||
// All rights reserved. This file is part of cc-backend.
|
||||
// Use of this source code is governed by a MIT-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
// Package metricdispatch provides a unified interface for loading and caching job metric data.
|
||||
//
|
||||
// This package serves as a central dispatcher that routes metric data requests to the appropriate
|
||||
// backend based on job state. For running jobs, data is fetched from the metric store (e.g., cc-metric-store).
|
||||
// For completed jobs, data is retrieved from the file-based job archive.
|
||||
//
|
||||
// # Key Features
|
||||
//
|
||||
// - Automatic backend selection based on job state (running vs. archived)
|
||||
// - LRU cache for performance optimization (128 MB default cache size)
|
||||
// - Data resampling using Largest Triangle Three Bucket algorithm for archived data
|
||||
// - Automatic statistics series generation for jobs with many nodes
|
||||
// - Support for scoped metrics (node, socket, accelerator, core)
|
||||
//
|
||||
// # Cache Behavior
|
||||
//
|
||||
// Cached data has different TTL (time-to-live) values depending on job state:
|
||||
// - Running jobs: 2 minutes (data changes frequently)
|
||||
// - Completed jobs: 5 hours (data is static)
|
||||
//
|
||||
// The cache key is based on job ID, state, requested metrics, scopes, and resolution.
|
||||
//
|
||||
// # Usage
|
||||
//
|
||||
// The primary entry point is LoadData, which automatically handles both running and archived jobs:
|
||||
//
|
||||
// jobData, err := metricdispatch.LoadData(job, metrics, scopes, ctx, resolution)
|
||||
// if err != nil {
|
||||
// // Handle error
|
||||
// }
|
||||
//
|
||||
// For statistics only, use LoadJobStats, LoadScopedJobStats, or LoadAverages depending on the required format.
|
||||
package metricdispatch
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"math"
|
||||
"time"
|
||||
|
||||
"github.com/ClusterCockpit/cc-backend/pkg/archive"
|
||||
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
|
||||
"github.com/ClusterCockpit/cc-lib/v2/lrucache"
|
||||
"github.com/ClusterCockpit/cc-lib/v2/resampler"
|
||||
"github.com/ClusterCockpit/cc-lib/v2/schema"
|
||||
)
|
||||
|
||||
// cache is an LRU cache with 128 MB capacity for storing loaded job metric data.
|
||||
// The cache reduces load on both the metric store and archive backends.
|
||||
var cache *lrucache.Cache = lrucache.New(128 * 1024 * 1024)
|
||||
|
||||
// cacheKey generates a unique cache key for a job's metric data based on job ID, state,
|
||||
// requested metrics, scopes, and resolution. Duration and StartTime are intentionally excluded
|
||||
// because job.ID is more unique and the cache TTL ensures entries don't persist indefinitely.
|
||||
func cacheKey(
|
||||
job *schema.Job,
|
||||
metrics []string,
|
||||
scopes []schema.MetricScope,
|
||||
resolution int,
|
||||
) string {
|
||||
return fmt.Sprintf("%d(%s):[%v],[%v]-%d",
|
||||
*job.ID, job.State, metrics, scopes, resolution)
|
||||
}
|
||||
|
||||
// LoadData retrieves metric data for a job from the appropriate backend (memory store for running jobs,
|
||||
// archive for completed jobs) and applies caching, resampling, and statistics generation as needed.
|
||||
//
|
||||
// For running jobs or when archive is disabled, data is fetched from the metric store.
|
||||
// For completed archived jobs, data is loaded from the job archive and resampled if needed.
|
||||
//
|
||||
// Parameters:
|
||||
// - job: The job for which to load metric data
|
||||
// - metrics: List of metric names to load (nil loads all metrics for the cluster)
|
||||
// - scopes: Metric scopes to include (nil defaults to node scope)
|
||||
// - ctx: Context for cancellation and timeouts
|
||||
// - resolution: Target number of data points for resampling (only applies to archived data)
|
||||
//
|
||||
// Returns the loaded job data and any error encountered. For partial errors (some metrics failed),
|
||||
// the function returns the successfully loaded data with a warning logged.
|
||||
func LoadData(job *schema.Job,
|
||||
metrics []string,
|
||||
scopes []schema.MetricScope,
|
||||
ctx context.Context,
|
||||
resolution int,
|
||||
) (schema.JobData, error) {
|
||||
data := cache.Get(cacheKey(job, metrics, scopes, resolution), func() (_ any, ttl time.Duration, size int) {
|
||||
var jd schema.JobData
|
||||
var err error
|
||||
|
||||
if job.State == schema.JobStateRunning ||
|
||||
job.MonitoringStatus == schema.MonitoringStatusRunningOrArchiving {
|
||||
|
||||
ms, err := GetMetricDataRepo(job.Cluster, job.SubCluster)
|
||||
if err != nil {
|
||||
cclog.Errorf("failed to access metricDataRepo for cluster %s-%s: %s",
|
||||
job.Cluster, job.SubCluster, err.Error())
|
||||
return err, 0, 0
|
||||
}
|
||||
|
||||
if scopes == nil {
|
||||
scopes = append(scopes, schema.MetricScopeNode)
|
||||
}
|
||||
|
||||
if metrics == nil {
|
||||
cluster := archive.GetCluster(job.Cluster)
|
||||
for _, mc := range cluster.MetricConfig {
|
||||
metrics = append(metrics, mc.Name)
|
||||
}
|
||||
}
|
||||
|
||||
jd, err = ms.LoadData(job, metrics, scopes, ctx, resolution)
|
||||
if err != nil {
|
||||
if len(jd) != 0 {
|
||||
cclog.Warnf("partial error loading metrics from store for job %d (user: %s, project: %s, cluster: %s-%s): %s",
|
||||
job.JobID, job.User, job.Project, job.Cluster, job.SubCluster, err.Error())
|
||||
} else {
|
||||
cclog.Warnf("failed to load job data from metric store for job %d (user: %s, project: %s, cluster: %s-%s): %s",
|
||||
job.JobID, job.User, job.Project, job.Cluster, job.SubCluster, err.Error())
|
||||
return err, 0, 0
|
||||
}
|
||||
}
|
||||
size = jd.Size()
|
||||
} else {
|
||||
var jdTemp schema.JobData
|
||||
jdTemp, err = archive.GetHandle().LoadJobData(job)
|
||||
if err != nil {
|
||||
cclog.Warnf("failed to load job data from archive for job %d (user: %s, project: %s, cluster: %s-%s): %s",
|
||||
job.JobID, job.User, job.Project, job.Cluster, job.SubCluster, err.Error())
|
||||
return err, 0, 0
|
||||
}
|
||||
|
||||
jd = deepCopy(jdTemp)
|
||||
|
||||
// Resample archived data using Largest Triangle Three Bucket algorithm to reduce data points
|
||||
// to the requested resolution, improving transfer performance and client-side rendering.
|
||||
for _, v := range jd {
|
||||
for _, v_ := range v {
|
||||
timestep := int64(0)
|
||||
for i := 0; i < len(v_.Series); i += 1 {
|
||||
v_.Series[i].Data, timestep, err = resampler.LargestTriangleThreeBucket(v_.Series[i].Data, int64(v_.Timestep), int64(resolution))
|
||||
if err != nil {
|
||||
return err, 0, 0
|
||||
}
|
||||
}
|
||||
v_.Timestep = int(timestep)
|
||||
}
|
||||
}
|
||||
|
||||
// Filter job data to only include requested metrics and scopes, avoiding unnecessary data transfer.
|
||||
if metrics != nil || scopes != nil {
|
||||
if metrics == nil {
|
||||
metrics = make([]string, 0, len(jd))
|
||||
for k := range jd {
|
||||
metrics = append(metrics, k)
|
||||
}
|
||||
}
|
||||
|
||||
res := schema.JobData{}
|
||||
for _, metric := range metrics {
|
||||
if perscope, ok := jd[metric]; ok {
|
||||
if len(perscope) > 1 {
|
||||
subset := make(map[schema.MetricScope]*schema.JobMetric)
|
||||
for _, scope := range scopes {
|
||||
if jm, ok := perscope[scope]; ok {
|
||||
subset[scope] = jm
|
||||
}
|
||||
}
|
||||
|
||||
if len(subset) > 0 {
|
||||
perscope = subset
|
||||
}
|
||||
}
|
||||
|
||||
res[metric] = perscope
|
||||
}
|
||||
}
|
||||
jd = res
|
||||
}
|
||||
size = jd.Size()
|
||||
}
|
||||
|
||||
ttl = 5 * time.Hour
|
||||
if job.State == schema.JobStateRunning {
|
||||
ttl = 2 * time.Minute
|
||||
}
|
||||
|
||||
// Generate statistics series for jobs with many nodes to enable min/median/max graphs
|
||||
// instead of overwhelming the UI with individual node lines. Note that newly calculated
|
||||
// statistics use min/median/max, while archived statistics may use min/mean/max.
|
||||
const maxSeriesSize int = 15
|
||||
for _, scopes := range jd {
|
||||
for _, jm := range scopes {
|
||||
if jm.StatisticsSeries != nil || len(jm.Series) <= maxSeriesSize {
|
||||
continue
|
||||
}
|
||||
|
||||
jm.AddStatisticsSeries()
|
||||
}
|
||||
}
|
||||
|
||||
nodeScopeRequested := false
|
||||
for _, scope := range scopes {
|
||||
if scope == schema.MetricScopeNode {
|
||||
nodeScopeRequested = true
|
||||
}
|
||||
}
|
||||
|
||||
if nodeScopeRequested {
|
||||
jd.AddNodeScope("flops_any")
|
||||
jd.AddNodeScope("mem_bw")
|
||||
}
|
||||
|
||||
// Round Resulting Stat Values
|
||||
jd.RoundMetricStats()
|
||||
|
||||
return jd, ttl, size
|
||||
})
|
||||
|
||||
if err, ok := data.(error); ok {
|
||||
cclog.Errorf("error in cached dataset for job %d: %s", job.JobID, err.Error())
|
||||
return nil, err
|
||||
}
|
||||
|
||||
return data.(schema.JobData), nil
|
||||
}
|
||||
|
||||
// LoadAverages computes average values for the specified metrics across all nodes of a job.
|
||||
// For running jobs, it loads statistics from the metric store. For completed jobs, it uses
|
||||
// the pre-calculated averages from the job archive. The results are appended to the data slice.
|
||||
func LoadAverages(
|
||||
job *schema.Job,
|
||||
metrics []string,
|
||||
data [][]schema.Float,
|
||||
ctx context.Context,
|
||||
) error {
|
||||
if job.State != schema.JobStateRunning {
|
||||
return archive.LoadAveragesFromArchive(job, metrics, data) // #166 change also here?
|
||||
}
|
||||
|
||||
ms, err := GetMetricDataRepo(job.Cluster, job.SubCluster)
|
||||
if err != nil {
|
||||
cclog.Errorf("failed to access metricDataRepo for cluster %s-%s: %s",
|
||||
job.Cluster, job.SubCluster, err.Error())
|
||||
return err
|
||||
}
|
||||
|
||||
stats, err := ms.LoadStats(job, metrics, ctx)
|
||||
if err != nil {
|
||||
cclog.Warnf("failed to load statistics from metric store for job %d (user: %s, project: %s, cluster: %s-%s): %s",
|
||||
job.JobID, job.User, job.Project, job.Cluster, job.SubCluster, err.Error())
|
||||
return err
|
||||
}
|
||||
|
||||
for i, m := range metrics {
|
||||
nodes, ok := stats[m]
|
||||
if !ok {
|
||||
data[i] = append(data[i], schema.NaN)
|
||||
continue
|
||||
}
|
||||
|
||||
sum := 0.0
|
||||
for _, node := range nodes {
|
||||
sum += node.Avg
|
||||
}
|
||||
data[i] = append(data[i], schema.Float(sum))
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
// LoadScopedJobStats retrieves job statistics organized by metric scope (node, socket, core, accelerator).
|
||||
// For running jobs, statistics are computed from the metric store. For completed jobs, pre-calculated
|
||||
// statistics are loaded from the job archive.
|
||||
func LoadScopedJobStats(
|
||||
job *schema.Job,
|
||||
metrics []string,
|
||||
scopes []schema.MetricScope,
|
||||
ctx context.Context,
|
||||
) (schema.ScopedJobStats, error) {
|
||||
if job.State != schema.JobStateRunning {
|
||||
return archive.LoadScopedStatsFromArchive(job, metrics, scopes)
|
||||
}
|
||||
|
||||
ms, err := GetMetricDataRepo(job.Cluster, job.SubCluster)
|
||||
if err != nil {
|
||||
cclog.Errorf("failed to access metricDataRepo for cluster %s-%s: %s",
|
||||
job.Cluster, job.SubCluster, err.Error())
|
||||
return nil, err
|
||||
}
|
||||
|
||||
scopedStats, err := ms.LoadScopedStats(job, metrics, scopes, ctx)
|
||||
if err != nil {
|
||||
cclog.Warnf("failed to load scoped statistics from metric store for job %d (user: %s, project: %s, cluster: %s-%s): %s",
|
||||
job.JobID, job.User, job.Project, job.Cluster, job.SubCluster, err.Error())
|
||||
return nil, err
|
||||
}
|
||||
|
||||
// Round Resulting Stat Values
|
||||
scopedStats.RoundScopedMetricStats()
|
||||
|
||||
return scopedStats, nil
|
||||
}
|
||||
|
||||
// LoadJobStats retrieves aggregated statistics (min/avg/max) for each requested metric across all job nodes.
|
||||
// For running jobs, statistics are computed from the metric store. For completed jobs, pre-calculated
|
||||
// statistics are loaded from the job archive.
|
||||
func LoadJobStats(
|
||||
job *schema.Job,
|
||||
metrics []string,
|
||||
ctx context.Context,
|
||||
) (map[string]schema.MetricStatistics, error) {
|
||||
if job.State != schema.JobStateRunning {
|
||||
return archive.LoadStatsFromArchive(job, metrics)
|
||||
}
|
||||
|
||||
ms, err := GetMetricDataRepo(job.Cluster, job.SubCluster)
|
||||
if err != nil {
|
||||
cclog.Errorf("failed to access metricDataRepo for cluster %s-%s: %s",
|
||||
job.Cluster, job.SubCluster, err.Error())
|
||||
return nil, err
|
||||
}
|
||||
|
||||
data := make(map[string]schema.MetricStatistics, len(metrics))
|
||||
|
||||
stats, err := ms.LoadStats(job, metrics, ctx)
|
||||
if err != nil {
|
||||
cclog.Warnf("failed to load statistics from metric store for job %d (user: %s, project: %s, cluster: %s-%s): %s",
|
||||
job.JobID, job.User, job.Project, job.Cluster, job.SubCluster, err.Error())
|
||||
return data, err
|
||||
}
|
||||
|
||||
for _, m := range metrics {
|
||||
sum, avg, min, max := 0.0, 0.0, 0.0, 0.0
|
||||
nodes, ok := stats[m]
|
||||
if !ok {
|
||||
data[m] = schema.MetricStatistics{Min: min, Avg: avg, Max: max}
|
||||
continue
|
||||
}
|
||||
|
||||
for _, node := range nodes {
|
||||
sum += node.Avg
|
||||
min = math.Min(min, node.Min)
|
||||
max = math.Max(max, node.Max)
|
||||
}
|
||||
|
||||
data[m] = schema.MetricStatistics{
|
||||
Avg: (math.Round((sum/float64(job.NumNodes))*100) / 100),
|
||||
Min: (math.Round(min*100) / 100),
|
||||
Max: (math.Round(max*100) / 100),
|
||||
}
|
||||
}
|
||||
|
||||
return data, nil
|
||||
}
|
||||
|
||||
// LoadNodeData retrieves metric data for specific nodes in a cluster within a time range.
|
||||
// This is used for node monitoring views and system status pages. Data is always fetched from
|
||||
// the metric store (not the archive) since it's for current/recent node status monitoring.
|
||||
//
|
||||
// Returns a nested map structure: node -> metric -> scoped data.
|
||||
// FIXME: Add support for subcluster specific cc-metric-stores
|
||||
func LoadNodeData(
|
||||
cluster string,
|
||||
metrics, nodes []string,
|
||||
scopes []schema.MetricScope,
|
||||
from, to time.Time,
|
||||
ctx context.Context,
|
||||
) (map[string]map[string][]*schema.JobMetric, error) {
|
||||
if metrics == nil {
|
||||
for _, m := range archive.GetCluster(cluster).MetricConfig {
|
||||
metrics = append(metrics, m.Name)
|
||||
}
|
||||
}
|
||||
|
||||
ms, err := GetMetricDataRepo(cluster, "")
|
||||
if err != nil {
|
||||
cclog.Errorf("failed to access metricDataRepo for cluster %s: %s",
|
||||
cluster, err.Error())
|
||||
return nil, err
|
||||
}
|
||||
|
||||
data, err := ms.LoadNodeData(cluster, metrics, nodes, scopes, from, to, ctx)
|
||||
if err != nil {
|
||||
if len(data) != 0 {
|
||||
cclog.Warnf("partial error loading node data from metric store for cluster %s: %s", cluster, err.Error())
|
||||
} else {
|
||||
cclog.Warnf("failed to load node data from metric store for cluster %s: %s", cluster, err.Error())
|
||||
return nil, err
|
||||
}
|
||||
}
|
||||
|
||||
if data == nil {
|
||||
return nil, fmt.Errorf("metric store for cluster '%s' does not support node data queries", cluster)
|
||||
}
|
||||
|
||||
return data, nil
|
||||
}
|
||||
|
||||
// LoadNodeListData retrieves time-series metric data for multiple nodes within a time range,
|
||||
// with optional resampling and automatic statistics generation for large datasets.
|
||||
// This is used for comparing multiple nodes or displaying node status over time.
|
||||
//
|
||||
// Returns a map of node names to their job-like metric data structures.
|
||||
func LoadNodeListData(
|
||||
cluster, subCluster string,
|
||||
nodes []string,
|
||||
metrics []string,
|
||||
scopes []schema.MetricScope,
|
||||
resolution int,
|
||||
from, to time.Time,
|
||||
ctx context.Context,
|
||||
) (map[string]schema.JobData, error) {
|
||||
if metrics == nil {
|
||||
for _, m := range archive.GetCluster(cluster).MetricConfig {
|
||||
metrics = append(metrics, m.Name)
|
||||
}
|
||||
}
|
||||
|
||||
ms, err := GetMetricDataRepo(cluster, subCluster)
|
||||
if err != nil {
|
||||
cclog.Errorf("failed to access metricDataRepo for cluster %s-%s: %s",
|
||||
cluster, subCluster, err.Error())
|
||||
return nil, err
|
||||
}
|
||||
|
||||
data, err := ms.LoadNodeListData(cluster, subCluster, nodes, metrics, scopes, resolution, from, to, ctx)
|
||||
if err != nil {
|
||||
if len(data) != 0 {
|
||||
cclog.Warnf("partial error loading node list data from metric store for cluster %s, subcluster %s: %s",
|
||||
cluster, subCluster, err.Error())
|
||||
} else {
|
||||
cclog.Warnf("failed to load node list data from metric store for cluster %s, subcluster %s: %s",
|
||||
cluster, subCluster, err.Error())
|
||||
return nil, err
|
||||
}
|
||||
}
|
||||
|
||||
// Generate statistics series for datasets with many series to improve visualization performance.
|
||||
// Statistics are calculated as min/median/max.
|
||||
const maxSeriesSize int = 8
|
||||
for _, jd := range data {
|
||||
for _, scopes := range jd {
|
||||
for _, jm := range scopes {
|
||||
if jm.StatisticsSeries != nil || len(jm.Series) < maxSeriesSize {
|
||||
continue
|
||||
}
|
||||
jm.AddStatisticsSeries()
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if data == nil {
|
||||
return nil, fmt.Errorf("metric store for cluster '%s' does not support node list queries", cluster)
|
||||
}
|
||||
|
||||
return data, nil
|
||||
}
|
||||
|
||||
// deepCopy creates a deep copy of JobData to prevent cache corruption when modifying
|
||||
// archived data (e.g., during resampling). This ensures the cached archive data remains
|
||||
// immutable while allowing per-request transformations.
|
||||
func deepCopy(source schema.JobData) schema.JobData {
|
||||
result := make(schema.JobData, len(source))
|
||||
|
||||
for metricName, scopeMap := range source {
|
||||
result[metricName] = make(map[schema.MetricScope]*schema.JobMetric, len(scopeMap))
|
||||
|
||||
for scope, jobMetric := range scopeMap {
|
||||
result[metricName][scope] = copyJobMetric(jobMetric)
|
||||
}
|
||||
}
|
||||
|
||||
return result
|
||||
}
|
||||
|
||||
func copyJobMetric(src *schema.JobMetric) *schema.JobMetric {
|
||||
dst := &schema.JobMetric{
|
||||
Timestep: src.Timestep,
|
||||
Unit: src.Unit,
|
||||
Series: make([]schema.Series, len(src.Series)),
|
||||
}
|
||||
|
||||
for i := range src.Series {
|
||||
dst.Series[i] = copySeries(&src.Series[i])
|
||||
}
|
||||
|
||||
if src.StatisticsSeries != nil {
|
||||
dst.StatisticsSeries = copyStatisticsSeries(src.StatisticsSeries)
|
||||
}
|
||||
|
||||
return dst
|
||||
}
|
||||
|
||||
func copySeries(src *schema.Series) schema.Series {
|
||||
dst := schema.Series{
|
||||
Hostname: src.Hostname,
|
||||
ID: src.ID,
|
||||
Statistics: src.Statistics,
|
||||
Data: make([]schema.Float, len(src.Data)),
|
||||
}
|
||||
|
||||
copy(dst.Data, src.Data)
|
||||
return dst
|
||||
}
|
||||
|
||||
func copyStatisticsSeries(src *schema.StatsSeries) *schema.StatsSeries {
|
||||
dst := &schema.StatsSeries{
|
||||
Min: make([]schema.Float, len(src.Min)),
|
||||
Mean: make([]schema.Float, len(src.Mean)),
|
||||
Median: make([]schema.Float, len(src.Median)),
|
||||
Max: make([]schema.Float, len(src.Max)),
|
||||
}
|
||||
|
||||
copy(dst.Min, src.Min)
|
||||
copy(dst.Mean, src.Mean)
|
||||
copy(dst.Median, src.Median)
|
||||
copy(dst.Max, src.Max)
|
||||
|
||||
if len(src.Percentiles) > 0 {
|
||||
dst.Percentiles = make(map[int][]schema.Float, len(src.Percentiles))
|
||||
for percentile, values := range src.Percentiles {
|
||||
dst.Percentiles[percentile] = make([]schema.Float, len(values))
|
||||
copy(dst.Percentiles[percentile], values)
|
||||
}
|
||||
}
|
||||
|
||||
return dst
|
||||
}
|
||||
125
internal/metricdispatch/dataLoader_test.go
Normal file
125
internal/metricdispatch/dataLoader_test.go
Normal file
@@ -0,0 +1,125 @@
|
||||
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||
// All rights reserved. This file is part of cc-backend.
|
||||
// Use of this source code is governed by a MIT-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
package metricdispatch
|
||||
|
||||
import (
|
||||
"testing"
|
||||
|
||||
"github.com/ClusterCockpit/cc-lib/v2/schema"
|
||||
)
|
||||
|
||||
func TestDeepCopy(t *testing.T) {
|
||||
nodeId := "0"
|
||||
original := schema.JobData{
|
||||
"cpu_load": {
|
||||
schema.MetricScopeNode: &schema.JobMetric{
|
||||
Timestep: 60,
|
||||
Unit: schema.Unit{Base: "load", Prefix: ""},
|
||||
Series: []schema.Series{
|
||||
{
|
||||
Hostname: "node001",
|
||||
ID: &nodeId,
|
||||
Data: []schema.Float{1.0, 2.0, 3.0},
|
||||
Statistics: schema.MetricStatistics{
|
||||
Min: 1.0,
|
||||
Avg: 2.0,
|
||||
Max: 3.0,
|
||||
},
|
||||
},
|
||||
},
|
||||
StatisticsSeries: &schema.StatsSeries{
|
||||
Min: []schema.Float{1.0, 1.5, 2.0},
|
||||
Mean: []schema.Float{2.0, 2.5, 3.0},
|
||||
Median: []schema.Float{2.0, 2.5, 3.0},
|
||||
Max: []schema.Float{3.0, 3.5, 4.0},
|
||||
Percentiles: map[int][]schema.Float{
|
||||
25: {1.5, 2.0, 2.5},
|
||||
75: {2.5, 3.0, 3.5},
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
copied := deepCopy(original)
|
||||
|
||||
original["cpu_load"][schema.MetricScopeNode].Series[0].Data[0] = 999.0
|
||||
original["cpu_load"][schema.MetricScopeNode].StatisticsSeries.Min[0] = 888.0
|
||||
original["cpu_load"][schema.MetricScopeNode].StatisticsSeries.Percentiles[25][0] = 777.0
|
||||
|
||||
if copied["cpu_load"][schema.MetricScopeNode].Series[0].Data[0] != 1.0 {
|
||||
t.Errorf("Series data was not deeply copied: got %v, want 1.0",
|
||||
copied["cpu_load"][schema.MetricScopeNode].Series[0].Data[0])
|
||||
}
|
||||
|
||||
if copied["cpu_load"][schema.MetricScopeNode].StatisticsSeries.Min[0] != 1.0 {
|
||||
t.Errorf("StatisticsSeries was not deeply copied: got %v, want 1.0",
|
||||
copied["cpu_load"][schema.MetricScopeNode].StatisticsSeries.Min[0])
|
||||
}
|
||||
|
||||
if copied["cpu_load"][schema.MetricScopeNode].StatisticsSeries.Percentiles[25][0] != 1.5 {
|
||||
t.Errorf("Percentiles was not deeply copied: got %v, want 1.5",
|
||||
copied["cpu_load"][schema.MetricScopeNode].StatisticsSeries.Percentiles[25][0])
|
||||
}
|
||||
|
||||
if copied["cpu_load"][schema.MetricScopeNode].Timestep != 60 {
|
||||
t.Errorf("Timestep not copied correctly: got %v, want 60",
|
||||
copied["cpu_load"][schema.MetricScopeNode].Timestep)
|
||||
}
|
||||
|
||||
if copied["cpu_load"][schema.MetricScopeNode].Series[0].Hostname != "node001" {
|
||||
t.Errorf("Hostname not copied correctly: got %v, want node001",
|
||||
copied["cpu_load"][schema.MetricScopeNode].Series[0].Hostname)
|
||||
}
|
||||
}
|
||||
|
||||
func TestDeepCopyNilStatisticsSeries(t *testing.T) {
|
||||
original := schema.JobData{
|
||||
"mem_used": {
|
||||
schema.MetricScopeNode: &schema.JobMetric{
|
||||
Timestep: 60,
|
||||
Series: []schema.Series{
|
||||
{
|
||||
Hostname: "node001",
|
||||
Data: []schema.Float{1.0, 2.0},
|
||||
},
|
||||
},
|
||||
StatisticsSeries: nil,
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
copied := deepCopy(original)
|
||||
|
||||
if copied["mem_used"][schema.MetricScopeNode].StatisticsSeries != nil {
|
||||
t.Errorf("StatisticsSeries should be nil, got %v",
|
||||
copied["mem_used"][schema.MetricScopeNode].StatisticsSeries)
|
||||
}
|
||||
}
|
||||
|
||||
func TestDeepCopyEmptyPercentiles(t *testing.T) {
|
||||
original := schema.JobData{
|
||||
"cpu_load": {
|
||||
schema.MetricScopeNode: &schema.JobMetric{
|
||||
Timestep: 60,
|
||||
Series: []schema.Series{},
|
||||
StatisticsSeries: &schema.StatsSeries{
|
||||
Min: []schema.Float{1.0},
|
||||
Mean: []schema.Float{2.0},
|
||||
Median: []schema.Float{2.0},
|
||||
Max: []schema.Float{3.0},
|
||||
Percentiles: nil,
|
||||
},
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
copied := deepCopy(original)
|
||||
|
||||
if copied["cpu_load"][schema.MetricScopeNode].StatisticsSeries.Percentiles != nil {
|
||||
t.Errorf("Percentiles should be nil when source is nil/empty")
|
||||
}
|
||||
}
|
||||
123
internal/metricdispatch/metricdata.go
Executable file
123
internal/metricdispatch/metricdata.go
Executable file
@@ -0,0 +1,123 @@
|
||||
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||
// All rights reserved.
|
||||
// Use of this source code is governed by a MIT-style
|
||||
// license that can be found in the LICENSE file.
|
||||
package metricdispatch
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"context"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"time"
|
||||
|
||||
"github.com/ClusterCockpit/cc-backend/internal/config"
|
||||
ccms "github.com/ClusterCockpit/cc-backend/internal/metricstoreclient"
|
||||
"github.com/ClusterCockpit/cc-backend/pkg/metricstore"
|
||||
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
|
||||
"github.com/ClusterCockpit/cc-lib/v2/schema"
|
||||
)
|
||||
|
||||
type MetricDataRepository interface {
|
||||
// Return the JobData for the given job, only with the requested metrics.
|
||||
LoadData(job *schema.Job,
|
||||
metrics []string,
|
||||
scopes []schema.MetricScope,
|
||||
ctx context.Context,
|
||||
resolution int) (schema.JobData, error)
|
||||
|
||||
// Return a map of metrics to a map of nodes to the metric statistics of the job. node scope only.
|
||||
LoadStats(job *schema.Job,
|
||||
metrics []string,
|
||||
ctx context.Context) (map[string]map[string]schema.MetricStatistics, error)
|
||||
|
||||
// Return a map of metrics to a map of scopes to the scoped metric statistics of the job.
|
||||
LoadScopedStats(job *schema.Job,
|
||||
metrics []string,
|
||||
scopes []schema.MetricScope,
|
||||
ctx context.Context) (schema.ScopedJobStats, error)
|
||||
|
||||
// Return a map of hosts to a map of metrics at the requested scopes (currently only node) for that node.
|
||||
LoadNodeData(cluster string,
|
||||
metrics, nodes []string,
|
||||
scopes []schema.MetricScope,
|
||||
from, to time.Time,
|
||||
ctx context.Context) (map[string]map[string][]*schema.JobMetric, error)
|
||||
|
||||
// Return a map of hosts to a map of metrics to a map of scopes for multiple nodes.
|
||||
LoadNodeListData(cluster, subCluster string,
|
||||
nodes []string,
|
||||
metrics []string,
|
||||
scopes []schema.MetricScope,
|
||||
resolution int,
|
||||
from, to time.Time,
|
||||
ctx context.Context) (map[string]schema.JobData, error)
|
||||
|
||||
// HealthCheck evaluates the monitoring state for a set of nodes against expected metrics.
|
||||
HealthCheck(cluster string,
|
||||
nodes []string,
|
||||
metrics []string) (map[string]metricstore.HealthCheckResult, error)
|
||||
}
|
||||
|
||||
type CCMetricStoreConfig struct {
|
||||
Scope string `json:"scope"`
|
||||
URL string `json:"url"`
|
||||
Token string `json:"token"`
|
||||
}
|
||||
|
||||
var metricDataRepos map[string]MetricDataRepository = map[string]MetricDataRepository{}
|
||||
|
||||
func Init(rawConfig json.RawMessage) error {
|
||||
if rawConfig != nil {
|
||||
var configs []CCMetricStoreConfig
|
||||
config.Validate(configSchema, rawConfig)
|
||||
dec := json.NewDecoder(bytes.NewReader(rawConfig))
|
||||
dec.DisallowUnknownFields()
|
||||
if err := dec.Decode(&configs); err != nil {
|
||||
return fmt.Errorf("[METRICDISPATCH]> External Metric Store Config Init: Could not decode config file '%s' Error: %s", rawConfig, err.Error())
|
||||
}
|
||||
|
||||
if len(configs) == 0 {
|
||||
return fmt.Errorf("[METRICDISPATCH]> No external metric store configurations found in config file")
|
||||
}
|
||||
|
||||
for _, config := range configs {
|
||||
metricDataRepos[config.Scope] = ccms.NewCCMetricStore(config.URL, config.Token)
|
||||
}
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func GetMetricDataRepo(cluster string, subcluster string) (MetricDataRepository, error) {
|
||||
var repo MetricDataRepository
|
||||
var ok bool
|
||||
|
||||
key := cluster + "-" + subcluster
|
||||
repo, ok = metricDataRepos[key]
|
||||
|
||||
if !ok {
|
||||
repo, ok = metricDataRepos[cluster]
|
||||
|
||||
if !ok {
|
||||
repo, ok = metricDataRepos["*"]
|
||||
|
||||
if !ok {
|
||||
if metricstore.MetricStoreHandle == nil {
|
||||
return nil, fmt.Errorf("[METRICDISPATCH]> no metric data repository configured '%s'", key)
|
||||
}
|
||||
|
||||
repo = metricstore.MetricStoreHandle
|
||||
cclog.Debugf("[METRICDISPATCH]> Using internal metric data repository for '%s'", key)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return repo, nil
|
||||
}
|
||||
|
||||
// GetHealthCheckRepo returns the MetricDataRepository for performing health checks on a cluster.
|
||||
// It uses the same fallback logic as GetMetricDataRepo: cluster → wildcard → internal.
|
||||
func GetHealthCheckRepo(cluster string) (MetricDataRepository, error) {
|
||||
return GetMetricDataRepo(cluster, "")
|
||||
}
|
||||
239
internal/metricstoreclient/cc-metric-store-queries.go
Normal file
239
internal/metricstoreclient/cc-metric-store-queries.go
Normal file
@@ -0,0 +1,239 @@
|
||||
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||
// All rights reserved.
|
||||
// Use of this source code is governed by a MIT-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
// Package metricstoreclient - Query Building
|
||||
//
|
||||
// This file contains the query construction and scope transformation logic for cc-metric-store queries.
|
||||
// It handles the complex mapping between requested metric scopes and native hardware topology,
|
||||
// automatically aggregating or filtering metrics as needed.
|
||||
//
|
||||
// # Scope Transformations
|
||||
//
|
||||
// The buildScopeQueries function implements the core scope transformation algorithm.
|
||||
// It handles 25+ different transformation cases, mapping between:
|
||||
// - Accelerator (GPU) scope
|
||||
// - HWThread (hardware thread/SMT) scope
|
||||
// - Core (CPU core) scope
|
||||
// - Socket (CPU package) scope
|
||||
// - MemoryDomain (NUMA domain) scope
|
||||
// - Node (full system) scope
|
||||
//
|
||||
// Transformations follow these rules:
|
||||
// - Same scope: Return data as-is (e.g., Core → Core)
|
||||
// - Coarser scope: Aggregate data (e.g., Core → Socket with Aggregate=true)
|
||||
// - Finer scope: Error - cannot increase granularity
|
||||
//
|
||||
// # Query Building
|
||||
//
|
||||
// buildQueries and buildNodeQueries are the main entry points, handling job-specific
|
||||
// and node-specific query construction respectively. They:
|
||||
// - Validate metric configurations
|
||||
// - Handle subcluster-specific metric filtering
|
||||
// - Detect and skip duplicate scope requests
|
||||
// - Call buildScopeQueries for each metric/scope/host combination
|
||||
package metricstoreclient
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
|
||||
"github.com/ClusterCockpit/cc-backend/pkg/archive"
|
||||
"github.com/ClusterCockpit/cc-backend/pkg/metricstore"
|
||||
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
|
||||
"github.com/ClusterCockpit/cc-lib/v2/schema"
|
||||
)
|
||||
|
||||
// buildQueries constructs API queries for job-specific metric data.
|
||||
// It iterates through metrics, scopes, and job resources to build the complete query set.
|
||||
//
|
||||
// The function handles:
|
||||
// - Metric configuration validation and subcluster filtering
|
||||
// - Scope deduplication to avoid redundant queries
|
||||
// - Hardware thread list resolution (job-allocated vs full node)
|
||||
// - Delegation to buildScopeQueries for scope transformations
|
||||
//
|
||||
// Returns queries and their corresponding assigned scopes (which may differ from requested scopes).
|
||||
func (ccms *CCMetricStore) buildQueries(
|
||||
job *schema.Job,
|
||||
metrics []string,
|
||||
scopes []schema.MetricScope,
|
||||
resolution int,
|
||||
) ([]APIQuery, []schema.MetricScope, error) {
|
||||
// Initialize both slices together
|
||||
queries := make([]APIQuery, 0, len(metrics)*len(scopes)*len(job.Resources))
|
||||
assignedScope := make([]schema.MetricScope, 0, len(metrics)*len(scopes)*len(job.Resources))
|
||||
|
||||
topology, err := ccms.getTopology(job.Cluster, job.SubCluster)
|
||||
if err != nil {
|
||||
cclog.Errorf("could not load cluster %s subCluster %s topology: %s", job.Cluster, job.SubCluster, err.Error())
|
||||
return nil, nil, err
|
||||
}
|
||||
|
||||
for _, metric := range metrics {
|
||||
remoteName := metric
|
||||
mc := archive.GetMetricConfig(job.Cluster, metric)
|
||||
if mc == nil {
|
||||
cclog.Warnf("metric '%s' is not specified for cluster '%s' - skipping", metric, job.Cluster)
|
||||
continue
|
||||
}
|
||||
|
||||
// Skip if metric is removed for subcluster
|
||||
if len(mc.SubClusters) != 0 && metricstore.IsMetricRemovedForSubCluster(mc, job.SubCluster) {
|
||||
continue
|
||||
}
|
||||
|
||||
// Avoid duplicates...
|
||||
handledScopes := make([]schema.MetricScope, 0, 3)
|
||||
|
||||
scopesLoop:
|
||||
for _, requestedScope := range scopes {
|
||||
nativeScope := mc.Scope
|
||||
if nativeScope == schema.MetricScopeAccelerator && job.NumAcc == 0 {
|
||||
continue
|
||||
}
|
||||
|
||||
scope := nativeScope.Max(requestedScope)
|
||||
for _, s := range handledScopes {
|
||||
if scope == s {
|
||||
continue scopesLoop
|
||||
}
|
||||
}
|
||||
handledScopes = append(handledScopes, scope)
|
||||
|
||||
for _, host := range job.Resources {
|
||||
hwthreads := host.HWThreads
|
||||
if hwthreads == nil {
|
||||
hwthreads = topology.Node
|
||||
}
|
||||
|
||||
scopeResults, ok := metricstore.BuildScopeQueries(
|
||||
nativeScope, requestedScope,
|
||||
remoteName, host.Hostname,
|
||||
topology, hwthreads, host.Accelerators,
|
||||
)
|
||||
|
||||
if !ok {
|
||||
return nil, nil, fmt.Errorf("METRICDATA/EXTERNAL-CCMS > unsupported scope transformation: native-scope=%s, requested-scope=%s", nativeScope, requestedScope)
|
||||
}
|
||||
|
||||
for _, sr := range scopeResults {
|
||||
queries = append(queries, APIQuery{
|
||||
Metric: sr.Metric,
|
||||
Hostname: sr.Hostname,
|
||||
Aggregate: sr.Aggregate,
|
||||
Type: sr.Type,
|
||||
TypeIds: sr.TypeIds,
|
||||
Resolution: resolution,
|
||||
})
|
||||
assignedScope = append(assignedScope, sr.Scope)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return queries, assignedScope, nil
|
||||
}
|
||||
|
||||
// buildNodeQueries constructs API queries for node-specific metric data (Systems View).
|
||||
// Similar to buildQueries but uses full node topology instead of job-allocated resources.
|
||||
//
|
||||
// The function handles:
|
||||
// - SubCluster topology resolution (either pre-loaded or per-node lookup)
|
||||
// - Full node hardware thread lists (not job-specific subsets)
|
||||
// - All accelerators on each node
|
||||
// - Metric configuration validation with subcluster filtering
|
||||
//
|
||||
// Returns queries and their corresponding assigned scopes.
|
||||
func (ccms *CCMetricStore) buildNodeQueries(
|
||||
cluster string,
|
||||
subCluster string,
|
||||
nodes []string,
|
||||
metrics []string,
|
||||
scopes []schema.MetricScope,
|
||||
resolution int,
|
||||
) ([]APIQuery, []schema.MetricScope, error) {
|
||||
// Initialize both slices together
|
||||
queries := make([]APIQuery, 0, len(metrics)*len(scopes)*len(nodes))
|
||||
assignedScope := make([]schema.MetricScope, 0, len(metrics)*len(scopes)*len(nodes))
|
||||
|
||||
for _, metric := range metrics {
|
||||
remoteName := metric
|
||||
mc := archive.GetMetricConfig(cluster, metric)
|
||||
if mc == nil {
|
||||
cclog.Warnf("metric '%s' is not specified for cluster '%s'", metric, cluster)
|
||||
continue
|
||||
}
|
||||
|
||||
// Skip if metric is removed for subcluster
|
||||
if mc.SubClusters != nil && metricstore.IsMetricRemovedForSubCluster(mc, subCluster) {
|
||||
continue
|
||||
}
|
||||
|
||||
// Avoid duplicates...
|
||||
handledScopes := make([]schema.MetricScope, 0, 3)
|
||||
|
||||
scopesLoop:
|
||||
for _, requestedScope := range scopes {
|
||||
nativeScope := mc.Scope
|
||||
|
||||
scope := nativeScope.Max(requestedScope)
|
||||
for _, s := range handledScopes {
|
||||
if scope == s {
|
||||
continue scopesLoop
|
||||
}
|
||||
}
|
||||
handledScopes = append(handledScopes, scope)
|
||||
|
||||
for _, hostname := range nodes {
|
||||
var topology *schema.Topology
|
||||
var err error
|
||||
|
||||
// If no subCluster given, get it by node
|
||||
if subCluster == "" {
|
||||
topology, err = ccms.getTopologyByNode(cluster, hostname)
|
||||
} else {
|
||||
topology, err = ccms.getTopology(cluster, subCluster)
|
||||
}
|
||||
|
||||
if err != nil {
|
||||
return nil, nil, err
|
||||
}
|
||||
|
||||
// Always full node hwthread id list, no partial queries expected -> Use "topology.Node" directly where applicable
|
||||
// Always full accelerator id list, no partial queries expected -> Use "acceleratorIds" directly where applicable
|
||||
acceleratorIds := topology.GetAcceleratorIDs()
|
||||
|
||||
// Moved check here if metric matches hardware specs
|
||||
if nativeScope == schema.MetricScopeAccelerator && len(acceleratorIds) == 0 {
|
||||
continue scopesLoop
|
||||
}
|
||||
|
||||
scopeResults, ok := metricstore.BuildScopeQueries(
|
||||
nativeScope, requestedScope,
|
||||
remoteName, hostname,
|
||||
topology, topology.Node, acceleratorIds,
|
||||
)
|
||||
|
||||
if !ok {
|
||||
return nil, nil, fmt.Errorf("METRICDATA/EXTERNAL-CCMS > unsupported scope transformation: native-scope=%s, requested-scope=%s", nativeScope, requestedScope)
|
||||
}
|
||||
|
||||
for _, sr := range scopeResults {
|
||||
queries = append(queries, APIQuery{
|
||||
Metric: sr.Metric,
|
||||
Hostname: sr.Hostname,
|
||||
Aggregate: sr.Aggregate,
|
||||
Type: sr.Type,
|
||||
TypeIds: sr.TypeIds,
|
||||
Resolution: resolution,
|
||||
})
|
||||
assignedScope = append(assignedScope, sr.Scope)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return queries, assignedScope, nil
|
||||
}
|
||||
|
||||
796
internal/metricstoreclient/cc-metric-store.go
Normal file
796
internal/metricstoreclient/cc-metric-store.go
Normal file
@@ -0,0 +1,796 @@
|
||||
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||
// All rights reserved.
|
||||
// Use of this source code is governed by a MIT-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
// Package metricstoreclient provides a client for querying the cc-metric-store time series database.
|
||||
//
|
||||
// The cc-metric-store is a high-performance time series database optimized for HPC metric data.
|
||||
// This client handles HTTP communication, query construction, scope transformations, and data retrieval
|
||||
// for job and node metrics across different metric scopes (node, socket, core, hwthread, accelerator).
|
||||
//
|
||||
// # Architecture
|
||||
//
|
||||
// The package is split into two main components:
|
||||
// - Client Operations (cc-metric-store.go): HTTP client, request handling, data loading methods
|
||||
// - Query Building (cc-metric-store-queries.go): Query construction and scope transformation logic
|
||||
//
|
||||
// # Basic Usage
|
||||
//
|
||||
// store := NewCCMetricStore("http://localhost:8080", "jwt-token")
|
||||
//
|
||||
// // Load job data
|
||||
// jobData, err := store.LoadData(job, metrics, scopes, ctx, resolution)
|
||||
// if err != nil {
|
||||
// log.Fatal(err)
|
||||
// }
|
||||
//
|
||||
// # Metric Scopes
|
||||
//
|
||||
// The client supports hierarchical metric scopes that map to HPC hardware topology:
|
||||
// - MetricScopeAccelerator: GPU/accelerator level metrics
|
||||
// - MetricScopeHWThread: Hardware thread (SMT) level metrics
|
||||
// - MetricScopeCore: CPU core level metrics
|
||||
// - MetricScopeSocket: CPU socket level metrics
|
||||
// - MetricScopeMemoryDomain: NUMA domain level metrics
|
||||
// - MetricScopeNode: Full node level metrics
|
||||
//
|
||||
// The client automatically handles scope transformations, aggregating finer-grained metrics
|
||||
// to coarser scopes when needed (e.g., aggregating core metrics to socket level).
|
||||
//
|
||||
// # Error Handling
|
||||
//
|
||||
// The client supports partial errors - if some queries fail, it returns both the successful
|
||||
// data and an error listing the failed queries. This allows processing partial results
|
||||
// when some nodes or metrics are temporarily unavailable.
|
||||
//
|
||||
// # API Versioning
|
||||
//
|
||||
// The client uses cc-metric-store API v2, which includes support for:
|
||||
// - Data resampling for bandwidth optimization
|
||||
// - Multi-scope queries in a single request
|
||||
// - Aggregation across hardware topology levels
|
||||
package metricstoreclient
|
||||
|
||||
import (
|
||||
"bufio"
|
||||
"bytes"
|
||||
"context"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"net/http"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"github.com/ClusterCockpit/cc-backend/pkg/archive"
|
||||
ms "github.com/ClusterCockpit/cc-backend/pkg/metricstore"
|
||||
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
|
||||
"github.com/ClusterCockpit/cc-lib/v2/schema"
|
||||
)
|
||||
|
||||
// CCMetricStore is the HTTP client for communicating with cc-metric-store.
|
||||
// It manages connection details, authentication, and provides methods for querying metrics.
|
||||
type CCMetricStore struct {
|
||||
client http.Client // HTTP client with 10-second timeout
|
||||
jwt string // JWT Bearer token for authentication
|
||||
url string // Base URL of cc-metric-store instance
|
||||
queryEndpoint string // Full URL to query API endpoint
|
||||
topologyCache map[string]*schema.Topology // cluster -> topology cache
|
||||
}
|
||||
|
||||
// APIQueryRequest represents a request to the cc-metric-store query API.
|
||||
// It supports both explicit queries and "for-all-nodes" bulk queries.
|
||||
type APIQueryRequest struct {
|
||||
Cluster string `json:"cluster"` // Target cluster name
|
||||
Queries []APIQuery `json:"queries"` // Explicit list of metric queries
|
||||
ForAllNodes []string `json:"for-all-nodes"` // Metrics to query for all nodes
|
||||
From int64 `json:"from"` // Start time (Unix timestamp)
|
||||
To int64 `json:"to"` // End time (Unix timestamp)
|
||||
WithStats bool `json:"with-stats"` // Include min/avg/max statistics
|
||||
WithData bool `json:"with-data"` // Include time series data points
|
||||
}
|
||||
|
||||
// APIQuery specifies a single metric query with optional scope filtering.
|
||||
// Type and TypeIds define the hardware scope (core, socket, accelerator, etc.).
|
||||
type APIQuery struct {
|
||||
Type *string `json:"type,omitempty"` // Scope type (e.g., "core", "socket")
|
||||
SubType *string `json:"subtype,omitempty"` // Sub-scope type (reserved for future use)
|
||||
Metric string `json:"metric"` // Metric name
|
||||
Hostname string `json:"host"` // Target hostname
|
||||
Resolution int `json:"resolution"` // Data resolution in seconds (0 = native)
|
||||
TypeIds []string `json:"type-ids,omitempty"` // IDs for the scope type (e.g., core IDs)
|
||||
SubTypeIds []string `json:"subtype-ids,omitempty"` // IDs for sub-scope (reserved)
|
||||
Aggregate bool `json:"aggreg"` // Aggregate across TypeIds
|
||||
}
|
||||
|
||||
// APIQueryResponse contains the results from a cc-metric-store query.
|
||||
// Results align with the Queries slice by index.
|
||||
type APIQueryResponse struct {
|
||||
Queries []APIQuery `json:"queries,omitempty"` // Echoed queries (for bulk requests)
|
||||
Results [][]APIMetricData `json:"results"` // Result data, indexed by query
|
||||
}
|
||||
|
||||
// APIMetricData represents time series data and statistics for a single metric series.
|
||||
// Error is set if this particular series failed to load.
|
||||
type APIMetricData struct {
|
||||
Error *string `json:"error"` // Error message if query failed
|
||||
Data []schema.Float `json:"data"` // Time series data points
|
||||
From int64 `json:"from"` // Actual start time of data
|
||||
To int64 `json:"to"` // Actual end time of data
|
||||
Resolution int `json:"resolution"` // Actual resolution of data in seconds
|
||||
Avg schema.Float `json:"avg"` // Average value across time range
|
||||
Min schema.Float `json:"min"` // Minimum value in time range
|
||||
Max schema.Float `json:"max"` // Maximum value in time range
|
||||
}
|
||||
|
||||
// NewCCMetricStore creates and initializes a new (external) CCMetricStore client.
|
||||
// The url parameter should include the protocol and port (e.g., "http://localhost:8080").
|
||||
// The token parameter is a JWT used for Bearer authentication; pass empty string if auth is disabled.
|
||||
func NewCCMetricStore(url string, token string) *CCMetricStore {
|
||||
return &CCMetricStore{
|
||||
url: url,
|
||||
queryEndpoint: fmt.Sprintf("%s/api/query", url),
|
||||
jwt: token,
|
||||
client: http.Client{
|
||||
Timeout: 10 * time.Second,
|
||||
},
|
||||
topologyCache: make(map[string]*schema.Topology),
|
||||
}
|
||||
}
|
||||
|
||||
// doRequest executes an HTTP POST request to the cc-metric-store query API.
|
||||
// It handles JSON encoding/decoding, authentication, and API versioning.
|
||||
// The request body is automatically closed to prevent resource leaks.
|
||||
func (ccms *CCMetricStore) doRequest(
|
||||
ctx context.Context,
|
||||
body *APIQueryRequest,
|
||||
) (*APIQueryResponse, error) {
|
||||
buf := &bytes.Buffer{}
|
||||
if err := json.NewEncoder(buf).Encode(body); err != nil {
|
||||
cclog.Errorf("Error while encoding request body: %s", err.Error())
|
||||
return nil, err
|
||||
}
|
||||
|
||||
req, err := http.NewRequestWithContext(ctx, http.MethodGet, ccms.queryEndpoint, buf)
|
||||
if err != nil {
|
||||
cclog.Errorf("Error while building request body: %s", err.Error())
|
||||
return nil, err
|
||||
}
|
||||
if ccms.jwt != "" {
|
||||
req.Header.Add("Authorization", fmt.Sprintf("Bearer %s", ccms.jwt))
|
||||
}
|
||||
|
||||
// versioning the cc-metric-store query API.
|
||||
// v2 = data with resampling
|
||||
// v1 = data without resampling
|
||||
q := req.URL.Query()
|
||||
q.Add("version", "v2")
|
||||
req.URL.RawQuery = q.Encode()
|
||||
|
||||
res, err := ccms.client.Do(req)
|
||||
if err != nil {
|
||||
cclog.Errorf("Error while performing request: %s", err.Error())
|
||||
return nil, err
|
||||
}
|
||||
defer res.Body.Close()
|
||||
|
||||
if res.StatusCode != http.StatusOK {
|
||||
return nil, fmt.Errorf("'%s': HTTP Status: %s", ccms.queryEndpoint, res.Status)
|
||||
}
|
||||
|
||||
var resBody APIQueryResponse
|
||||
if err := json.NewDecoder(bufio.NewReader(res.Body)).Decode(&resBody); err != nil {
|
||||
cclog.Errorf("Error while decoding result body: %s", err.Error())
|
||||
return nil, err
|
||||
}
|
||||
|
||||
return &resBody, nil
|
||||
}
|
||||
|
||||
// getTopology returns the topology for a given cluster and subcluster, caching it if not already present
|
||||
func (ccms *CCMetricStore) getTopology(cluster, subCluster string) (*schema.Topology, error) {
|
||||
cacheKey := fmt.Sprintf("%s:%s", cluster, subCluster)
|
||||
if topology, ok := ccms.topologyCache[cacheKey]; ok {
|
||||
return topology, nil
|
||||
}
|
||||
|
||||
subcluster, err := archive.GetSubCluster(cluster, subCluster)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
ccms.topologyCache[cacheKey] = &subcluster.Topology
|
||||
return &subcluster.Topology, nil
|
||||
}
|
||||
|
||||
// getTopologyByNode returns the topology for a given cluster and node, caching it if not already present
|
||||
func (ccms *CCMetricStore) getTopologyByNode(cluster, node string) (*schema.Topology, error) {
|
||||
subCluster, err := archive.GetSubClusterByNode(cluster, node)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
return ccms.getTopology(cluster, subCluster)
|
||||
}
|
||||
|
||||
// LoadData retrieves time series data and statistics for the specified job and metrics.
|
||||
// It queries data for the job's time range and resources, handling scope transformations automatically.
|
||||
//
|
||||
// Parameters:
|
||||
// - job: Job metadata including cluster, time range, and allocated resources
|
||||
// - metrics: List of metric names to retrieve
|
||||
// - scopes: Requested metric scopes (node, socket, core, etc.)
|
||||
// - ctx: Context for cancellation and timeouts
|
||||
// - resolution: Data resolution in seconds (0 for native resolution)
|
||||
//
|
||||
// Returns JobData organized as: metric -> scope -> series list.
|
||||
// Supports partial errors: returns available data even if some queries fail.
|
||||
func (ccms *CCMetricStore) LoadData(
|
||||
job *schema.Job,
|
||||
metrics []string,
|
||||
scopes []schema.MetricScope,
|
||||
ctx context.Context,
|
||||
resolution int,
|
||||
) (schema.JobData, error) {
|
||||
queries, assignedScope, err := ccms.buildQueries(job, metrics, scopes, resolution)
|
||||
if err != nil {
|
||||
cclog.Errorf("Error while building queries for jobId %d, Metrics %v, Scopes %v: %s", job.JobID, metrics, scopes, err.Error())
|
||||
return nil, err
|
||||
}
|
||||
|
||||
// Verify assignment is correct - log any inconsistencies for debugging
|
||||
if len(queries) != len(assignedScope) {
|
||||
cclog.Errorf("Critical error: queries and assignedScope have different lengths after buildQueries: %d vs %d",
|
||||
len(queries), len(assignedScope))
|
||||
}
|
||||
|
||||
req := APIQueryRequest{
|
||||
Cluster: job.Cluster,
|
||||
From: job.StartTime,
|
||||
To: job.StartTime + int64(job.Duration),
|
||||
Queries: queries,
|
||||
WithStats: true,
|
||||
WithData: true,
|
||||
}
|
||||
|
||||
resBody, err := ccms.doRequest(ctx, &req)
|
||||
if err != nil {
|
||||
cclog.Errorf("Error while performing request for job %d: %s", job.JobID, err.Error())
|
||||
return nil, err
|
||||
}
|
||||
|
||||
var errors []string
|
||||
jobData := make(schema.JobData)
|
||||
|
||||
// Add safety check for potential index out of range errors
|
||||
if len(resBody.Results) != len(req.Queries) || len(assignedScope) != len(req.Queries) {
|
||||
cclog.Warnf("Mismatch in query results count: queries=%d, results=%d, assignedScope=%d",
|
||||
len(req.Queries), len(resBody.Results), len(assignedScope))
|
||||
if len(resBody.Results) > len(req.Queries) {
|
||||
resBody.Results = resBody.Results[:len(req.Queries)]
|
||||
}
|
||||
if len(assignedScope) > len(req.Queries) {
|
||||
assignedScope = assignedScope[:len(req.Queries)]
|
||||
}
|
||||
}
|
||||
|
||||
for i, row := range resBody.Results {
|
||||
query := req.Queries[i]
|
||||
metric := query.Metric
|
||||
scope := assignedScope[i]
|
||||
mc := archive.GetMetricConfig(job.Cluster, metric)
|
||||
|
||||
if mc == nil {
|
||||
cclog.Warnf("Metric config not found for %s on cluster %s", metric, job.Cluster)
|
||||
continue
|
||||
}
|
||||
|
||||
if _, ok := jobData[metric]; !ok {
|
||||
jobData[metric] = make(map[schema.MetricScope]*schema.JobMetric)
|
||||
}
|
||||
|
||||
res := mc.Timestep
|
||||
if len(row) > 0 {
|
||||
res = row[0].Resolution
|
||||
}
|
||||
|
||||
jobMetric, ok := jobData[metric][scope]
|
||||
if !ok {
|
||||
jobMetric = &schema.JobMetric{
|
||||
Unit: mc.Unit,
|
||||
Timestep: res,
|
||||
Series: make([]schema.Series, 0),
|
||||
}
|
||||
jobData[metric][scope] = jobMetric
|
||||
}
|
||||
|
||||
for ndx, res := range row {
|
||||
if res.Error != nil {
|
||||
/* Build list for "partial errors", if any */
|
||||
errors = append(errors, fmt.Sprintf("failed to fetch '%s' from host '%s': %s", query.Metric, query.Hostname, *res.Error))
|
||||
continue
|
||||
}
|
||||
|
||||
id := ms.ExtractTypeID(query.Type, query.TypeIds, ndx, query.Metric, query.Hostname)
|
||||
|
||||
ms.SanitizeStats(&res.Avg, &res.Min, &res.Max)
|
||||
|
||||
jobMetric.Series = append(jobMetric.Series, schema.Series{
|
||||
Hostname: query.Hostname,
|
||||
ID: id,
|
||||
Statistics: schema.MetricStatistics{
|
||||
Avg: float64(res.Avg),
|
||||
Min: float64(res.Min),
|
||||
Max: float64(res.Max),
|
||||
},
|
||||
Data: res.Data,
|
||||
})
|
||||
}
|
||||
|
||||
// So that one can later check len(jobData):
|
||||
if len(jobMetric.Series) == 0 {
|
||||
delete(jobData[metric], scope)
|
||||
if len(jobData[metric]) == 0 {
|
||||
delete(jobData, metric)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if len(errors) != 0 {
|
||||
/* Returns list for "partial errors" */
|
||||
return jobData, fmt.Errorf("METRICDATA/EXTERNAL-CCMS > Errors: %s", strings.Join(errors, ", "))
|
||||
}
|
||||
return jobData, nil
|
||||
}
|
||||
|
||||
// LoadStats retrieves min/avg/max statistics for job metrics at node scope.
|
||||
// This is faster than LoadData when only statistical summaries are needed (no time series data).
|
||||
//
|
||||
// Returns statistics organized as: metric -> hostname -> statistics.
|
||||
func (ccms *CCMetricStore) LoadStats(
|
||||
job *schema.Job,
|
||||
metrics []string,
|
||||
ctx context.Context,
|
||||
) (map[string]map[string]schema.MetricStatistics, error) {
|
||||
queries, _, err := ccms.buildQueries(job, metrics, []schema.MetricScope{schema.MetricScopeNode}, 0) // #166 Add scope shere for analysis view accelerator normalization?
|
||||
if err != nil {
|
||||
cclog.Errorf("Error while building queries for jobId %d, Metrics %v: %s", job.JobID, metrics, err.Error())
|
||||
return nil, err
|
||||
}
|
||||
|
||||
req := APIQueryRequest{
|
||||
Cluster: job.Cluster,
|
||||
From: job.StartTime,
|
||||
To: job.StartTime + int64(job.Duration),
|
||||
Queries: queries,
|
||||
WithStats: true,
|
||||
WithData: false,
|
||||
}
|
||||
|
||||
resBody, err := ccms.doRequest(ctx, &req)
|
||||
if err != nil {
|
||||
cclog.Errorf("Error while performing request for job %d: %s", job.JobID, err.Error())
|
||||
return nil, err
|
||||
}
|
||||
|
||||
stats := make(map[string]map[string]schema.MetricStatistics, len(metrics))
|
||||
for i, res := range resBody.Results {
|
||||
if i >= len(req.Queries) {
|
||||
cclog.Warnf("LoadStats: result index %d exceeds queries length %d", i, len(req.Queries))
|
||||
break
|
||||
}
|
||||
if len(res) == 0 {
|
||||
// No Data Found For Metric, Logged in FetchData to Warn
|
||||
continue
|
||||
}
|
||||
query := req.Queries[i]
|
||||
metric := query.Metric
|
||||
data := res[0]
|
||||
if data.Error != nil {
|
||||
cclog.Warnf("fetching %s for node %s failed: %s", metric, query.Hostname, *data.Error)
|
||||
continue
|
||||
}
|
||||
|
||||
metricdata, ok := stats[metric]
|
||||
if !ok {
|
||||
metricdata = make(map[string]schema.MetricStatistics, job.NumNodes)
|
||||
stats[metric] = metricdata
|
||||
}
|
||||
|
||||
if hasNaNStats(data.Avg, data.Min, data.Max) {
|
||||
cclog.Warnf("fetching %s for node %s failed: one of avg/min/max is NaN", metric, query.Hostname)
|
||||
continue
|
||||
}
|
||||
|
||||
metricdata[query.Hostname] = schema.MetricStatistics{
|
||||
Avg: float64(data.Avg),
|
||||
Min: float64(data.Min),
|
||||
Max: float64(data.Max),
|
||||
}
|
||||
}
|
||||
|
||||
return stats, nil
|
||||
}
|
||||
|
||||
// LoadScopedStats retrieves statistics for job metrics across multiple scopes.
|
||||
// Used for the Job-View Statistics Table to display per-scope breakdowns.
|
||||
//
|
||||
// Returns statistics organized as: metric -> scope -> list of scoped statistics.
|
||||
// Each scoped statistic includes hostname, hardware ID (if applicable), and min/avg/max values.
|
||||
func (ccms *CCMetricStore) LoadScopedStats(
|
||||
job *schema.Job,
|
||||
metrics []string,
|
||||
scopes []schema.MetricScope,
|
||||
ctx context.Context,
|
||||
) (schema.ScopedJobStats, error) {
|
||||
queries, assignedScope, err := ccms.buildQueries(job, metrics, scopes, 0)
|
||||
if err != nil {
|
||||
cclog.Errorf("Error while building queries for jobId %d, Metrics %v, Scopes %v: %s", job.JobID, metrics, scopes, err.Error())
|
||||
return nil, err
|
||||
}
|
||||
|
||||
req := APIQueryRequest{
|
||||
Cluster: job.Cluster,
|
||||
From: job.StartTime,
|
||||
To: job.StartTime + int64(job.Duration),
|
||||
Queries: queries,
|
||||
WithStats: true,
|
||||
WithData: false,
|
||||
}
|
||||
|
||||
resBody, err := ccms.doRequest(ctx, &req)
|
||||
if err != nil {
|
||||
cclog.Errorf("Error while performing request for job %d: %s", job.JobID, err.Error())
|
||||
return nil, err
|
||||
}
|
||||
|
||||
var errors []string
|
||||
scopedJobStats := make(schema.ScopedJobStats)
|
||||
|
||||
for i, row := range resBody.Results {
|
||||
query := req.Queries[i]
|
||||
metric := query.Metric
|
||||
scope := assignedScope[i]
|
||||
|
||||
if _, ok := scopedJobStats[metric]; !ok {
|
||||
scopedJobStats[metric] = make(map[schema.MetricScope][]*schema.ScopedStats)
|
||||
}
|
||||
|
||||
if _, ok := scopedJobStats[metric][scope]; !ok {
|
||||
scopedJobStats[metric][scope] = make([]*schema.ScopedStats, 0)
|
||||
}
|
||||
|
||||
for ndx, res := range row {
|
||||
if res.Error != nil {
|
||||
/* Build list for "partial errors", if any */
|
||||
errors = append(errors, fmt.Sprintf("failed to fetch '%s' from host '%s': %s", query.Metric, query.Hostname, *res.Error))
|
||||
continue
|
||||
}
|
||||
|
||||
id := ms.ExtractTypeID(query.Type, query.TypeIds, ndx, query.Metric, query.Hostname)
|
||||
|
||||
ms.SanitizeStats(&res.Avg, &res.Min, &res.Max)
|
||||
|
||||
scopedJobStats[metric][scope] = append(scopedJobStats[metric][scope], &schema.ScopedStats{
|
||||
Hostname: query.Hostname,
|
||||
ID: id,
|
||||
Data: &schema.MetricStatistics{
|
||||
Avg: float64(res.Avg),
|
||||
Min: float64(res.Min),
|
||||
Max: float64(res.Max),
|
||||
},
|
||||
})
|
||||
}
|
||||
|
||||
// So that one can later check len(scopedJobStats[metric][scope]): Remove from map if empty
|
||||
if len(scopedJobStats[metric][scope]) == 0 {
|
||||
delete(scopedJobStats[metric], scope)
|
||||
if len(scopedJobStats[metric]) == 0 {
|
||||
delete(scopedJobStats, metric)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if len(errors) != 0 {
|
||||
/* Returns list for "partial errors" */
|
||||
return scopedJobStats, fmt.Errorf("METRICDATA/EXTERNAL-CCMS > Errors: %s", strings.Join(errors, ", "))
|
||||
}
|
||||
return scopedJobStats, nil
|
||||
}
|
||||
|
||||
// LoadNodeData retrieves current metric data for specified nodes in a cluster.
|
||||
// Used for the Systems-View Node-Overview to display real-time node status.
|
||||
//
|
||||
// If nodes is nil, queries all metrics for all nodes in the cluster (bulk query).
|
||||
// Returns data organized as: hostname -> metric -> list of JobMetric (with time series and stats).
|
||||
func (ccms *CCMetricStore) LoadNodeData(
|
||||
cluster string,
|
||||
metrics, nodes []string,
|
||||
scopes []schema.MetricScope,
|
||||
from, to time.Time,
|
||||
ctx context.Context,
|
||||
) (map[string]map[string][]*schema.JobMetric, error) {
|
||||
req := APIQueryRequest{
|
||||
Cluster: cluster,
|
||||
From: from.Unix(),
|
||||
To: to.Unix(),
|
||||
WithStats: true,
|
||||
WithData: true,
|
||||
}
|
||||
|
||||
if nodes == nil {
|
||||
req.ForAllNodes = append(req.ForAllNodes, metrics...)
|
||||
} else {
|
||||
for _, node := range nodes {
|
||||
for _, metric := range metrics {
|
||||
req.Queries = append(req.Queries, APIQuery{
|
||||
Hostname: node,
|
||||
Metric: metric,
|
||||
Resolution: 0, // Default for Node Queries: Will return metric $Timestep Resolution
|
||||
})
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
resBody, err := ccms.doRequest(ctx, &req)
|
||||
if err != nil {
|
||||
cclog.Errorf("Error while performing request for cluster %s: %s", cluster, err.Error())
|
||||
return nil, err
|
||||
}
|
||||
|
||||
var errors []string
|
||||
data := make(map[string]map[string][]*schema.JobMetric)
|
||||
for i, res := range resBody.Results {
|
||||
if len(res) == 0 {
|
||||
// No Data Found For Metric, Logged in FetchData to Warn
|
||||
continue
|
||||
}
|
||||
|
||||
var query APIQuery
|
||||
if resBody.Queries != nil {
|
||||
query = resBody.Queries[i]
|
||||
} else {
|
||||
query = req.Queries[i]
|
||||
}
|
||||
|
||||
metric := query.Metric
|
||||
qdata := res[0]
|
||||
if qdata.Error != nil {
|
||||
errors = append(errors, fmt.Sprintf("fetching %s for node %s failed: %s", metric, query.Hostname, *qdata.Error))
|
||||
continue
|
||||
}
|
||||
|
||||
mc := archive.GetMetricConfig(cluster, metric)
|
||||
if mc == nil {
|
||||
cclog.Warnf("Metric config not found for %s on cluster %s", metric, cluster)
|
||||
continue
|
||||
}
|
||||
|
||||
ms.SanitizeStats(&qdata.Avg, &qdata.Min, &qdata.Max)
|
||||
|
||||
hostdata, ok := data[query.Hostname]
|
||||
if !ok {
|
||||
hostdata = make(map[string][]*schema.JobMetric)
|
||||
data[query.Hostname] = hostdata
|
||||
}
|
||||
|
||||
hostdata[metric] = append(hostdata[metric], &schema.JobMetric{
|
||||
Unit: mc.Unit,
|
||||
Timestep: mc.Timestep,
|
||||
Series: []schema.Series{
|
||||
{
|
||||
Hostname: query.Hostname,
|
||||
Data: qdata.Data,
|
||||
Statistics: schema.MetricStatistics{
|
||||
Avg: float64(qdata.Avg),
|
||||
Min: float64(qdata.Min),
|
||||
Max: float64(qdata.Max),
|
||||
},
|
||||
},
|
||||
},
|
||||
})
|
||||
}
|
||||
|
||||
if len(errors) != 0 {
|
||||
/* Returns list of "partial errors" */
|
||||
return data, fmt.Errorf("METRICDATA/EXTERNAL-CCMS > Errors: %s", strings.Join(errors, ", "))
|
||||
}
|
||||
|
||||
return data, nil
|
||||
}
|
||||
|
||||
// LoadNodeListData retrieves paginated node metrics for the Systems-View Node-List.
|
||||
//
|
||||
// Supports filtering by subcluster and node name pattern. The nodeFilter performs
|
||||
// substring matching on hostnames.
|
||||
//
|
||||
// Returns:
|
||||
// - Node data organized as: hostname -> JobData (metric -> scope -> series)
|
||||
// - Total node count (before pagination)
|
||||
// - HasNextPage flag indicating if more pages are available
|
||||
// - Error (may be partial error with some data returned)
|
||||
func (ccms *CCMetricStore) LoadNodeListData(
|
||||
cluster, subCluster string,
|
||||
nodes []string,
|
||||
metrics []string,
|
||||
scopes []schema.MetricScope,
|
||||
resolution int,
|
||||
from, to time.Time,
|
||||
ctx context.Context,
|
||||
) (map[string]schema.JobData, error) {
|
||||
queries, assignedScope, err := ccms.buildNodeQueries(cluster, subCluster, nodes, metrics, scopes, resolution)
|
||||
if err != nil {
|
||||
cclog.Errorf("Error while building node queries for Cluster %s, SubCluster %s, Metrics %v, Scopes %v: %s", cluster, subCluster, metrics, scopes, err.Error())
|
||||
return nil, err
|
||||
}
|
||||
|
||||
// Verify assignment is correct - log any inconsistencies for debugging
|
||||
if len(queries) != len(assignedScope) {
|
||||
cclog.Errorf("Critical error: queries and assignedScope have different lengths after buildNodeQueries: %d vs %d",
|
||||
len(queries), len(assignedScope))
|
||||
}
|
||||
|
||||
req := APIQueryRequest{
|
||||
Cluster: cluster,
|
||||
Queries: queries,
|
||||
From: from.Unix(),
|
||||
To: to.Unix(),
|
||||
WithStats: true,
|
||||
WithData: true,
|
||||
}
|
||||
|
||||
resBody, err := ccms.doRequest(ctx, &req)
|
||||
if err != nil {
|
||||
cclog.Errorf("Error while performing request for cluster %s: %s", cluster, err.Error())
|
||||
return nil, err
|
||||
}
|
||||
|
||||
var errors []string
|
||||
data := make(map[string]schema.JobData)
|
||||
|
||||
// Add safety check for index out of range issues
|
||||
if len(resBody.Results) != len(req.Queries) || len(assignedScope) != len(req.Queries) {
|
||||
cclog.Warnf("Mismatch in query results count: queries=%d, results=%d, assignedScope=%d",
|
||||
len(req.Queries), len(resBody.Results), len(assignedScope))
|
||||
if len(resBody.Results) > len(req.Queries) {
|
||||
resBody.Results = resBody.Results[:len(req.Queries)]
|
||||
}
|
||||
if len(assignedScope) > len(req.Queries) {
|
||||
assignedScope = assignedScope[:len(req.Queries)]
|
||||
}
|
||||
}
|
||||
|
||||
for i, row := range resBody.Results {
|
||||
var query APIQuery
|
||||
if resBody.Queries != nil {
|
||||
if i < len(resBody.Queries) {
|
||||
query = resBody.Queries[i]
|
||||
} else {
|
||||
cclog.Warnf("Index out of range prevented for resBody.Queries: i=%d, len=%d",
|
||||
i, len(resBody.Queries))
|
||||
continue
|
||||
}
|
||||
} else {
|
||||
query = req.Queries[i]
|
||||
}
|
||||
|
||||
metric := query.Metric
|
||||
scope := assignedScope[i]
|
||||
mc := archive.GetMetricConfig(cluster, metric)
|
||||
if mc == nil {
|
||||
cclog.Warnf("Metric config not found for %s on cluster %s", metric, cluster)
|
||||
continue
|
||||
}
|
||||
|
||||
res := mc.Timestep
|
||||
if len(row) > 0 {
|
||||
res = row[0].Resolution
|
||||
}
|
||||
|
||||
// Init Nested Map Data Structures If Not Found
|
||||
hostData, ok := data[query.Hostname]
|
||||
if !ok {
|
||||
hostData = make(schema.JobData)
|
||||
data[query.Hostname] = hostData
|
||||
}
|
||||
|
||||
metricData, ok := hostData[metric]
|
||||
if !ok {
|
||||
metricData = make(map[schema.MetricScope]*schema.JobMetric)
|
||||
data[query.Hostname][metric] = metricData
|
||||
}
|
||||
|
||||
scopeData, ok := metricData[scope]
|
||||
if !ok {
|
||||
scopeData = &schema.JobMetric{
|
||||
Unit: mc.Unit,
|
||||
Timestep: res,
|
||||
Series: make([]schema.Series, 0),
|
||||
}
|
||||
data[query.Hostname][metric][scope] = scopeData
|
||||
}
|
||||
|
||||
for ndx, res := range row {
|
||||
if res.Error != nil {
|
||||
/* Build list for "partial errors", if any */
|
||||
errors = append(errors, fmt.Sprintf("failed to fetch '%s' from host '%s': %s", query.Metric, query.Hostname, *res.Error))
|
||||
continue
|
||||
}
|
||||
|
||||
id := ms.ExtractTypeID(query.Type, query.TypeIds, ndx, query.Metric, query.Hostname)
|
||||
|
||||
ms.SanitizeStats(&res.Avg, &res.Min, &res.Max)
|
||||
|
||||
scopeData.Series = append(scopeData.Series, schema.Series{
|
||||
Hostname: query.Hostname,
|
||||
ID: id,
|
||||
Statistics: schema.MetricStatistics{
|
||||
Avg: float64(res.Avg),
|
||||
Min: float64(res.Min),
|
||||
Max: float64(res.Max),
|
||||
},
|
||||
Data: res.Data,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
if len(errors) != 0 {
|
||||
/* Returns list of "partial errors" */
|
||||
return data, fmt.Errorf("METRICDATA/EXTERNAL-CCMS > Errors: %s", strings.Join(errors, ", "))
|
||||
}
|
||||
|
||||
return data, nil
|
||||
}
|
||||
|
||||
// HealthCheck queries the external cc-metric-store's health check endpoint.
|
||||
// It sends a HealthCheckReq as the request body to /api/healthcheck and
|
||||
// returns the per-node health check results.
|
||||
func (ccms *CCMetricStore) HealthCheck(cluster string,
|
||||
nodes []string, metrics []string,
|
||||
) (map[string]ms.HealthCheckResult, error) {
|
||||
req := ms.HealthCheckReq{
|
||||
Cluster: cluster,
|
||||
Nodes: nodes,
|
||||
MetricNames: metrics,
|
||||
}
|
||||
|
||||
buf := &bytes.Buffer{}
|
||||
if err := json.NewEncoder(buf).Encode(req); err != nil {
|
||||
cclog.Errorf("Error while encoding health check request body: %s", err.Error())
|
||||
return nil, err
|
||||
}
|
||||
|
||||
endpoint := fmt.Sprintf("%s/api/healthcheck", ccms.url)
|
||||
httpReq, err := http.NewRequest(http.MethodGet, endpoint, buf)
|
||||
if err != nil {
|
||||
cclog.Errorf("Error while building health check request: %s", err.Error())
|
||||
return nil, err
|
||||
}
|
||||
if ccms.jwt != "" {
|
||||
httpReq.Header.Add("Authorization", fmt.Sprintf("Bearer %s", ccms.jwt))
|
||||
}
|
||||
|
||||
res, err := ccms.client.Do(httpReq)
|
||||
if err != nil {
|
||||
cclog.Errorf("Error while performing health check request: %s", err.Error())
|
||||
return nil, err
|
||||
}
|
||||
defer res.Body.Close()
|
||||
|
||||
if res.StatusCode != http.StatusOK {
|
||||
return nil, fmt.Errorf("'%s': HTTP Status: %s", endpoint, res.Status)
|
||||
}
|
||||
|
||||
var results map[string]ms.HealthCheckResult
|
||||
if err := json.NewDecoder(bufio.NewReader(res.Body)).Decode(&results); err != nil {
|
||||
cclog.Errorf("Error while decoding health check response: %s", err.Error())
|
||||
return nil, err
|
||||
}
|
||||
|
||||
return results, nil
|
||||
}
|
||||
|
||||
// hasNaNStats returns true if any of the statistics contain NaN values.
|
||||
func hasNaNStats(avg, min, max schema.Float) bool {
|
||||
return avg.IsNaN() || min.IsNaN() || max.IsNaN()
|
||||
}
|
||||
@@ -12,7 +12,7 @@ import (
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
|
||||
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
|
||||
"github.com/jmoiron/sqlx"
|
||||
"github.com/mattn/go-sqlite3"
|
||||
"github.com/qustavo/sqlhooks/v2"
|
||||
@@ -51,7 +51,7 @@ func setupSqlite(db *sql.DB) error {
|
||||
return nil
|
||||
}
|
||||
|
||||
func Connect(driver string, db string) {
|
||||
func Connect(db string) {
|
||||
var err error
|
||||
var dbHandle *sqlx.DB
|
||||
|
||||
@@ -64,39 +64,31 @@ func Connect(driver string, db string) {
|
||||
ConnectionMaxIdleTime: repoConfig.ConnectionMaxIdleTime,
|
||||
}
|
||||
|
||||
switch driver {
|
||||
case "sqlite3":
|
||||
// TODO: Have separate DB handles for Writes and Reads
|
||||
// Optimize SQLite connection: https://kerkour.com/sqlite-for-servers
|
||||
connectionURLParams := make(url.Values)
|
||||
connectionURLParams.Add("_txlock", "immediate")
|
||||
connectionURLParams.Add("_journal_mode", "WAL")
|
||||
connectionURLParams.Add("_busy_timeout", "5000")
|
||||
connectionURLParams.Add("_synchronous", "NORMAL")
|
||||
connectionURLParams.Add("_cache_size", "1000000000")
|
||||
connectionURLParams.Add("_foreign_keys", "true")
|
||||
opts.URL = fmt.Sprintf("file:%s?%s", opts.URL, connectionURLParams.Encode())
|
||||
// TODO: Have separate DB handles for Writes and Reads
|
||||
// Optimize SQLite connection: https://kerkour.com/sqlite-for-servers
|
||||
connectionURLParams := make(url.Values)
|
||||
connectionURLParams.Add("_txlock", "immediate")
|
||||
connectionURLParams.Add("_journal_mode", "WAL")
|
||||
connectionURLParams.Add("_busy_timeout", "5000")
|
||||
connectionURLParams.Add("_synchronous", "NORMAL")
|
||||
connectionURLParams.Add("_cache_size", "1000000000")
|
||||
connectionURLParams.Add("_foreign_keys", "true")
|
||||
opts.URL = fmt.Sprintf("file:%s?%s", opts.URL, connectionURLParams.Encode())
|
||||
|
||||
if cclog.Loglevel() == "debug" {
|
||||
sql.Register("sqlite3WithHooks", sqlhooks.Wrap(&sqlite3.SQLiteDriver{}, &Hooks{}))
|
||||
dbHandle, err = sqlx.Open("sqlite3WithHooks", opts.URL)
|
||||
} else {
|
||||
dbHandle, err = sqlx.Open("sqlite3", opts.URL)
|
||||
}
|
||||
|
||||
err = setupSqlite(dbHandle.DB)
|
||||
if err != nil {
|
||||
cclog.Abortf("Failed sqlite db setup.\nError: %s\n", err.Error())
|
||||
}
|
||||
case "mysql":
|
||||
opts.URL += "?multiStatements=true"
|
||||
dbHandle, err = sqlx.Open("mysql", opts.URL)
|
||||
default:
|
||||
cclog.Abortf("DB Connection: Unsupported database driver '%s'.\n", driver)
|
||||
if cclog.Loglevel() == "debug" {
|
||||
sql.Register("sqlite3WithHooks", sqlhooks.Wrap(&sqlite3.SQLiteDriver{}, &Hooks{}))
|
||||
dbHandle, err = sqlx.Open("sqlite3WithHooks", opts.URL)
|
||||
} else {
|
||||
dbHandle, err = sqlx.Open("sqlite3", opts.URL)
|
||||
}
|
||||
|
||||
if err != nil {
|
||||
cclog.Abortf("DB Connection: Could not connect to '%s' database with sqlx.Open().\nError: %s\n", driver, err.Error())
|
||||
cclog.Abortf("DB Connection: Could not connect to SQLite database with sqlx.Open().\nError: %s\n", err.Error())
|
||||
}
|
||||
|
||||
err = setupSqlite(dbHandle.DB)
|
||||
if err != nil {
|
||||
cclog.Abortf("Failed sqlite db setup.\nError: %s\n", err.Error())
|
||||
}
|
||||
|
||||
dbHandle.SetMaxOpenConns(opts.MaxOpenConnections)
|
||||
@@ -104,8 +96,8 @@ func Connect(driver string, db string) {
|
||||
dbHandle.SetConnMaxLifetime(opts.ConnectionMaxLifetime)
|
||||
dbHandle.SetConnMaxIdleTime(opts.ConnectionMaxIdleTime)
|
||||
|
||||
dbConnInstance = &DBConnection{DB: dbHandle, Driver: driver}
|
||||
err = checkDBVersion(driver, dbHandle.DB)
|
||||
dbConnInstance = &DBConnection{DB: dbHandle}
|
||||
err = checkDBVersion(dbHandle.DB)
|
||||
if err != nil {
|
||||
cclog.Abortf("DB Connection: Failed DB version check.\nError: %s\n", err.Error())
|
||||
}
|
||||
@@ -119,3 +111,26 @@ func GetConnection() *DBConnection {
|
||||
|
||||
return dbConnInstance
|
||||
}
|
||||
|
||||
// ResetConnection closes the current database connection and resets the connection state.
|
||||
// This function is intended for testing purposes only to allow test isolation.
|
||||
func ResetConnection() error {
|
||||
if dbConnInstance != nil && dbConnInstance.DB != nil {
|
||||
if err := dbConnInstance.DB.Close(); err != nil {
|
||||
return fmt.Errorf("failed to close database connection: %w", err)
|
||||
}
|
||||
}
|
||||
|
||||
dbConnInstance = nil
|
||||
dbConnOnce = sync.Once{}
|
||||
jobRepoInstance = nil
|
||||
jobRepoOnce = sync.Once{}
|
||||
nodeRepoInstance = nil
|
||||
nodeRepoOnce = sync.Once{}
|
||||
userRepoInstance = nil
|
||||
userRepoOnce = sync.Once{}
|
||||
userCfgRepoInstance = nil
|
||||
userCfgRepoOnce = sync.Once{}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
@@ -2,13 +2,14 @@
|
||||
// All rights reserved. This file is part of cc-backend.
|
||||
// Use of this source code is governed by a MIT-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
package repository
|
||||
|
||||
import (
|
||||
"context"
|
||||
"time"
|
||||
|
||||
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
|
||||
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
|
||||
)
|
||||
|
||||
// Hooks satisfies the sqlhook.Hooks interface
|
||||
|
||||
274
internal/repository/hooks_test.go
Normal file
274
internal/repository/hooks_test.go
Normal file
@@ -0,0 +1,274 @@
|
||||
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||
// All rights reserved. This file is part of cc-backend.
|
||||
// Use of this source code is governed by a MIT-style
|
||||
// license that can be found in the LICENSE file.
|
||||
package repository
|
||||
|
||||
import (
|
||||
"context"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"github.com/ClusterCockpit/cc-lib/v2/schema"
|
||||
_ "github.com/mattn/go-sqlite3"
|
||||
"github.com/stretchr/testify/assert"
|
||||
"github.com/stretchr/testify/require"
|
||||
)
|
||||
|
||||
type MockJobHook struct {
|
||||
startCalled bool
|
||||
stopCalled bool
|
||||
startJobs []*schema.Job
|
||||
stopJobs []*schema.Job
|
||||
}
|
||||
|
||||
func (m *MockJobHook) JobStartCallback(job *schema.Job) {
|
||||
m.startCalled = true
|
||||
m.startJobs = append(m.startJobs, job)
|
||||
}
|
||||
|
||||
func (m *MockJobHook) JobStopCallback(job *schema.Job) {
|
||||
m.stopCalled = true
|
||||
m.stopJobs = append(m.stopJobs, job)
|
||||
}
|
||||
|
||||
func TestRegisterJobHook(t *testing.T) {
|
||||
t.Run("register single hook", func(t *testing.T) {
|
||||
hooks = nil
|
||||
mock := &MockJobHook{}
|
||||
|
||||
RegisterJobHook(mock)
|
||||
|
||||
assert.NotNil(t, hooks)
|
||||
assert.Len(t, hooks, 1)
|
||||
assert.Equal(t, mock, hooks[0])
|
||||
|
||||
hooks = nil
|
||||
})
|
||||
|
||||
t.Run("register multiple hooks", func(t *testing.T) {
|
||||
hooks = nil
|
||||
mock1 := &MockJobHook{}
|
||||
mock2 := &MockJobHook{}
|
||||
|
||||
RegisterJobHook(mock1)
|
||||
RegisterJobHook(mock2)
|
||||
|
||||
assert.Len(t, hooks, 2)
|
||||
assert.Equal(t, mock1, hooks[0])
|
||||
assert.Equal(t, mock2, hooks[1])
|
||||
|
||||
hooks = nil
|
||||
})
|
||||
|
||||
t.Run("register nil hook does not add to hooks", func(t *testing.T) {
|
||||
hooks = nil
|
||||
RegisterJobHook(nil)
|
||||
|
||||
if hooks != nil {
|
||||
assert.Len(t, hooks, 0, "Nil hook should not be added")
|
||||
}
|
||||
|
||||
hooks = nil
|
||||
})
|
||||
}
|
||||
|
||||
func TestCallJobStartHooks(t *testing.T) {
|
||||
t.Run("call start hooks with single job", func(t *testing.T) {
|
||||
hooks = nil
|
||||
mock := &MockJobHook{}
|
||||
RegisterJobHook(mock)
|
||||
|
||||
job := &schema.Job{
|
||||
JobID: 123,
|
||||
User: "testuser",
|
||||
Cluster: "testcluster",
|
||||
}
|
||||
|
||||
CallJobStartHooks([]*schema.Job{job})
|
||||
|
||||
assert.True(t, mock.startCalled)
|
||||
assert.False(t, mock.stopCalled)
|
||||
assert.Len(t, mock.startJobs, 1)
|
||||
assert.Equal(t, int64(123), mock.startJobs[0].JobID)
|
||||
|
||||
hooks = nil
|
||||
})
|
||||
|
||||
t.Run("call start hooks with multiple jobs", func(t *testing.T) {
|
||||
hooks = nil
|
||||
mock := &MockJobHook{}
|
||||
RegisterJobHook(mock)
|
||||
|
||||
jobs := []*schema.Job{
|
||||
{JobID: 1, User: "user1", Cluster: "cluster1"},
|
||||
{JobID: 2, User: "user2", Cluster: "cluster2"},
|
||||
{JobID: 3, User: "user3", Cluster: "cluster3"},
|
||||
}
|
||||
|
||||
CallJobStartHooks(jobs)
|
||||
|
||||
assert.True(t, mock.startCalled)
|
||||
assert.Len(t, mock.startJobs, 3)
|
||||
assert.Equal(t, int64(1), mock.startJobs[0].JobID)
|
||||
assert.Equal(t, int64(2), mock.startJobs[1].JobID)
|
||||
assert.Equal(t, int64(3), mock.startJobs[2].JobID)
|
||||
|
||||
hooks = nil
|
||||
})
|
||||
|
||||
t.Run("call start hooks with multiple registered hooks", func(t *testing.T) {
|
||||
hooks = nil
|
||||
mock1 := &MockJobHook{}
|
||||
mock2 := &MockJobHook{}
|
||||
RegisterJobHook(mock1)
|
||||
RegisterJobHook(mock2)
|
||||
|
||||
job := &schema.Job{
|
||||
JobID: 456, User: "testuser", Cluster: "testcluster",
|
||||
}
|
||||
|
||||
CallJobStartHooks([]*schema.Job{job})
|
||||
|
||||
assert.True(t, mock1.startCalled)
|
||||
assert.True(t, mock2.startCalled)
|
||||
assert.Len(t, mock1.startJobs, 1)
|
||||
assert.Len(t, mock2.startJobs, 1)
|
||||
|
||||
hooks = nil
|
||||
})
|
||||
|
||||
t.Run("call start hooks with nil hooks", func(t *testing.T) {
|
||||
hooks = nil
|
||||
|
||||
job := &schema.Job{
|
||||
JobID: 789, User: "testuser", Cluster: "testcluster",
|
||||
}
|
||||
|
||||
CallJobStartHooks([]*schema.Job{job})
|
||||
|
||||
hooks = nil
|
||||
})
|
||||
|
||||
t.Run("call start hooks with empty job list", func(t *testing.T) {
|
||||
hooks = nil
|
||||
mock := &MockJobHook{}
|
||||
RegisterJobHook(mock)
|
||||
|
||||
CallJobStartHooks([]*schema.Job{})
|
||||
|
||||
assert.False(t, mock.startCalled)
|
||||
assert.Len(t, mock.startJobs, 0)
|
||||
|
||||
hooks = nil
|
||||
})
|
||||
}
|
||||
|
||||
func TestCallJobStopHooks(t *testing.T) {
|
||||
t.Run("call stop hooks with single job", func(t *testing.T) {
|
||||
hooks = nil
|
||||
mock := &MockJobHook{}
|
||||
RegisterJobHook(mock)
|
||||
|
||||
job := &schema.Job{
|
||||
JobID: 123,
|
||||
User: "testuser",
|
||||
Cluster: "testcluster",
|
||||
}
|
||||
|
||||
CallJobStopHooks(job)
|
||||
|
||||
assert.True(t, mock.stopCalled)
|
||||
assert.False(t, mock.startCalled)
|
||||
assert.Len(t, mock.stopJobs, 1)
|
||||
assert.Equal(t, int64(123), mock.stopJobs[0].JobID)
|
||||
|
||||
hooks = nil
|
||||
})
|
||||
|
||||
t.Run("call stop hooks with multiple registered hooks", func(t *testing.T) {
|
||||
hooks = nil
|
||||
mock1 := &MockJobHook{}
|
||||
mock2 := &MockJobHook{}
|
||||
RegisterJobHook(mock1)
|
||||
RegisterJobHook(mock2)
|
||||
|
||||
job := &schema.Job{
|
||||
JobID: 456, User: "testuser", Cluster: "testcluster",
|
||||
}
|
||||
|
||||
CallJobStopHooks(job)
|
||||
|
||||
assert.True(t, mock1.stopCalled)
|
||||
assert.True(t, mock2.stopCalled)
|
||||
assert.Len(t, mock1.stopJobs, 1)
|
||||
assert.Len(t, mock2.stopJobs, 1)
|
||||
|
||||
hooks = nil
|
||||
})
|
||||
|
||||
t.Run("call stop hooks with nil hooks", func(t *testing.T) {
|
||||
hooks = nil
|
||||
|
||||
job := &schema.Job{
|
||||
JobID: 789, User: "testuser", Cluster: "testcluster",
|
||||
}
|
||||
|
||||
CallJobStopHooks(job)
|
||||
|
||||
hooks = nil
|
||||
})
|
||||
}
|
||||
|
||||
func TestSQLHooks(t *testing.T) {
|
||||
_ = setup(t)
|
||||
|
||||
t.Run("hooks log queries in debug mode", func(t *testing.T) {
|
||||
h := &Hooks{}
|
||||
|
||||
ctx := context.Background()
|
||||
query := "SELECT * FROM job WHERE job_id = ?"
|
||||
args := []any{123}
|
||||
|
||||
ctxWithTime, err := h.Before(ctx, query, args...)
|
||||
require.NoError(t, err)
|
||||
assert.NotNil(t, ctxWithTime)
|
||||
|
||||
beginTime := ctxWithTime.Value("begin")
|
||||
require.NotNil(t, beginTime)
|
||||
_, ok := beginTime.(time.Time)
|
||||
assert.True(t, ok, "Begin time should be time.Time")
|
||||
|
||||
time.Sleep(10 * time.Millisecond)
|
||||
|
||||
ctxAfter, err := h.After(ctxWithTime, query, args...)
|
||||
require.NoError(t, err)
|
||||
assert.NotNil(t, ctxAfter)
|
||||
})
|
||||
}
|
||||
|
||||
func TestHookIntegration(t *testing.T) {
|
||||
t.Run("hooks are called during job lifecycle", func(t *testing.T) {
|
||||
hooks = nil
|
||||
mock := &MockJobHook{}
|
||||
RegisterJobHook(mock)
|
||||
|
||||
job := &schema.Job{
|
||||
JobID: 999,
|
||||
User: "integrationuser",
|
||||
Cluster: "integrationcluster",
|
||||
}
|
||||
|
||||
CallJobStartHooks([]*schema.Job{job})
|
||||
assert.True(t, mock.startCalled)
|
||||
assert.Equal(t, 1, len(mock.startJobs))
|
||||
|
||||
CallJobStopHooks(job)
|
||||
assert.True(t, mock.stopCalled)
|
||||
assert.Equal(t, 1, len(mock.stopJobs))
|
||||
|
||||
assert.Equal(t, mock.startJobs[0].JobID, mock.stopJobs[0].JobID)
|
||||
|
||||
hooks = nil
|
||||
})
|
||||
}
|
||||
File diff suppressed because it is too large
Load Diff
@@ -2,14 +2,15 @@
|
||||
// All rights reserved. This file is part of cc-backend.
|
||||
// Use of this source code is governed by a MIT-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
package repository
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
|
||||
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
|
||||
"github.com/ClusterCockpit/cc-lib/schema"
|
||||
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
|
||||
"github.com/ClusterCockpit/cc-lib/v2/schema"
|
||||
sq "github.com/Masterminds/squirrel"
|
||||
)
|
||||
|
||||
@@ -29,6 +30,27 @@ const NamedJobInsert string = `INSERT INTO job (
|
||||
:shared, :monitoring_status, :smt, :job_state, :start_time, :duration, :walltime, :footprint, :energy, :energy_footprint, :resources, :meta_data
|
||||
);`
|
||||
|
||||
// InsertJobDirect inserts a job directly into the job table (not job_cache).
|
||||
// Use this when the returned ID will be used for operations on the job table
|
||||
// (e.g., adding tags), or for imported jobs that are already completed.
|
||||
func (r *JobRepository) InsertJobDirect(job *schema.Job) (int64, error) {
|
||||
r.Mutex.Lock()
|
||||
defer r.Mutex.Unlock()
|
||||
|
||||
res, err := r.DB.NamedExec(NamedJobInsert, job)
|
||||
if err != nil {
|
||||
cclog.Warn("Error while NamedJobInsert (direct)")
|
||||
return 0, err
|
||||
}
|
||||
id, err := res.LastInsertId()
|
||||
if err != nil {
|
||||
cclog.Warn("Error while getting last insert ID (direct)")
|
||||
return 0, err
|
||||
}
|
||||
|
||||
return id, nil
|
||||
}
|
||||
|
||||
func (r *JobRepository) InsertJob(job *schema.Job) (int64, error) {
|
||||
r.Mutex.Lock()
|
||||
defer r.Mutex.Unlock()
|
||||
@@ -70,8 +92,9 @@ func (r *JobRepository) SyncJobs() ([]*schema.Job, error) {
|
||||
jobs = append(jobs, job)
|
||||
}
|
||||
|
||||
// Use INSERT OR IGNORE to skip jobs already transferred by the stop path
|
||||
_, err = r.DB.Exec(
|
||||
"INSERT INTO job (job_id, cluster, subcluster, start_time, hpc_user, project, cluster_partition, array_job_id, num_nodes, num_hwthreads, num_acc, shared, monitoring_status, smt, job_state, duration, walltime, footprint, energy, energy_footprint, resources, meta_data) SELECT job_id, cluster, subcluster, start_time, hpc_user, project, cluster_partition, array_job_id, num_nodes, num_hwthreads, num_acc, shared, monitoring_status, smt, job_state, duration, walltime, footprint, energy, energy_footprint, resources, meta_data FROM job_cache")
|
||||
"INSERT OR IGNORE INTO job (job_id, cluster, subcluster, start_time, hpc_user, project, cluster_partition, array_job_id, num_nodes, num_hwthreads, num_acc, shared, monitoring_status, smt, job_state, duration, walltime, footprint, energy, energy_footprint, resources, meta_data) SELECT job_id, cluster, subcluster, start_time, hpc_user, project, cluster_partition, array_job_id, num_nodes, num_hwthreads, num_acc, shared, monitoring_status, smt, job_state, duration, walltime, footprint, energy, energy_footprint, resources, meta_data FROM job_cache")
|
||||
if err != nil {
|
||||
cclog.Warnf("Error while Job sync: %v", err)
|
||||
return nil, err
|
||||
@@ -83,9 +106,48 @@ func (r *JobRepository) SyncJobs() ([]*schema.Job, error) {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
// Resolve correct job.id from the job table. The IDs read from job_cache
|
||||
// are from a different auto-increment sequence and must not be used to
|
||||
// query the job table.
|
||||
for _, job := range jobs {
|
||||
var newID int64
|
||||
if err := sq.Select("job.id").From("job").
|
||||
Where("job.job_id = ? AND job.cluster = ? AND job.start_time = ?",
|
||||
job.JobID, job.Cluster, job.StartTime).
|
||||
RunWith(r.stmtCache).QueryRow().Scan(&newID); err != nil {
|
||||
cclog.Warnf("SyncJobs: could not resolve job table id for job %d on %s: %v",
|
||||
job.JobID, job.Cluster, err)
|
||||
continue
|
||||
}
|
||||
job.ID = &newID
|
||||
}
|
||||
|
||||
return jobs, nil
|
||||
}
|
||||
|
||||
// TransferCachedJobToMain moves a job from job_cache to the job table.
|
||||
// Caller must hold r.Mutex. Returns the new job table ID.
|
||||
func (r *JobRepository) TransferCachedJobToMain(cacheID int64) (int64, error) {
|
||||
res, err := r.DB.Exec(
|
||||
"INSERT INTO job (job_id, cluster, subcluster, start_time, hpc_user, project, cluster_partition, array_job_id, num_nodes, num_hwthreads, num_acc, shared, monitoring_status, smt, job_state, duration, walltime, footprint, energy, energy_footprint, resources, meta_data) SELECT job_id, cluster, subcluster, start_time, hpc_user, project, cluster_partition, array_job_id, num_nodes, num_hwthreads, num_acc, shared, monitoring_status, smt, job_state, duration, walltime, footprint, energy, energy_footprint, resources, meta_data FROM job_cache WHERE id = ?",
|
||||
cacheID)
|
||||
if err != nil {
|
||||
return 0, fmt.Errorf("transferring cached job %d to main table failed: %w", cacheID, err)
|
||||
}
|
||||
|
||||
newID, err := res.LastInsertId()
|
||||
if err != nil {
|
||||
return 0, fmt.Errorf("getting new job ID after transfer failed: %w", err)
|
||||
}
|
||||
|
||||
_, err = r.DB.Exec("DELETE FROM job_cache WHERE id = ?", cacheID)
|
||||
if err != nil {
|
||||
return 0, fmt.Errorf("deleting cached job %d after transfer failed: %w", cacheID, err)
|
||||
}
|
||||
|
||||
return newID, nil
|
||||
}
|
||||
|
||||
// Start inserts a new job in the table, returning the unique job ID.
|
||||
// Statistics are not transfered!
|
||||
func (r *JobRepository) Start(job *schema.Job) (id int64, err error) {
|
||||
@@ -107,41 +169,46 @@ func (r *JobRepository) Start(job *schema.Job) (id int64, err error) {
|
||||
return r.InsertJob(job)
|
||||
}
|
||||
|
||||
// StartDirect inserts a new job directly into the job table (not job_cache).
|
||||
// Use this when the returned ID will immediately be used for job table
|
||||
// operations such as adding tags.
|
||||
func (r *JobRepository) StartDirect(job *schema.Job) (id int64, err error) {
|
||||
job.RawFootprint, err = json.Marshal(job.Footprint)
|
||||
if err != nil {
|
||||
return -1, fmt.Errorf("REPOSITORY/JOB > encoding footprint field failed: %w", err)
|
||||
}
|
||||
|
||||
job.RawResources, err = json.Marshal(job.Resources)
|
||||
if err != nil {
|
||||
return -1, fmt.Errorf("REPOSITORY/JOB > encoding resources field failed: %w", err)
|
||||
}
|
||||
|
||||
job.RawMetaData, err = json.Marshal(job.MetaData)
|
||||
if err != nil {
|
||||
return -1, fmt.Errorf("REPOSITORY/JOB > encoding metaData field failed: %w", err)
|
||||
}
|
||||
|
||||
return r.InsertJobDirect(job)
|
||||
}
|
||||
|
||||
// Stop updates the job with the database id jobId using the provided arguments.
|
||||
func (r *JobRepository) Stop(
|
||||
jobId int64,
|
||||
jobID int64,
|
||||
duration int32,
|
||||
state schema.JobState,
|
||||
monitoringStatus int32,
|
||||
) (err error) {
|
||||
// Invalidate cache entries as job state is changing
|
||||
r.cache.Del(fmt.Sprintf("metadata:%d", jobId))
|
||||
r.cache.Del(fmt.Sprintf("energyFootprint:%d", jobId))
|
||||
r.cache.Del(fmt.Sprintf("metadata:%d", jobID))
|
||||
r.cache.Del(fmt.Sprintf("energyFootprint:%d", jobID))
|
||||
|
||||
stmt := sq.Update("job").
|
||||
Set("job_state", state).
|
||||
Set("duration", duration).
|
||||
Set("monitoring_status", monitoringStatus).
|
||||
Where("job.id = ?", jobId)
|
||||
Where("job.id = ?", jobID)
|
||||
|
||||
_, err = stmt.RunWith(r.stmtCache).Exec()
|
||||
return err
|
||||
}
|
||||
|
||||
func (r *JobRepository) StopCached(
|
||||
jobId int64,
|
||||
duration int32,
|
||||
state schema.JobState,
|
||||
monitoringStatus int32,
|
||||
) (err error) {
|
||||
// Note: StopCached updates job_cache table, not the main job table
|
||||
// Cache invalidation happens when job is synced to main table
|
||||
stmt := sq.Update("job_cache").
|
||||
Set("job_state", state).
|
||||
Set("duration", duration).
|
||||
Set("monitoring_status", monitoringStatus).
|
||||
Where("job_cache.id = ?", jobId)
|
||||
|
||||
_, err = stmt.RunWith(r.stmtCache).Exec()
|
||||
return err
|
||||
}
|
||||
|
||||
607
internal/repository/jobCreate_test.go
Normal file
607
internal/repository/jobCreate_test.go
Normal file
@@ -0,0 +1,607 @@
|
||||
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||
// All rights reserved. This file is part of cc-backend.
|
||||
// Use of this source code is governed by a MIT-style
|
||||
// license that can be found in the LICENSE file.
|
||||
package repository
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"testing"
|
||||
|
||||
"github.com/ClusterCockpit/cc-lib/v2/schema"
|
||||
_ "github.com/mattn/go-sqlite3"
|
||||
"github.com/stretchr/testify/assert"
|
||||
"github.com/stretchr/testify/require"
|
||||
)
|
||||
|
||||
// createTestJob creates a minimal valid job for testing
|
||||
func createTestJob(jobID int64, cluster string) *schema.Job {
|
||||
return &schema.Job{
|
||||
JobID: jobID,
|
||||
User: "testuser",
|
||||
Project: "testproject",
|
||||
Cluster: cluster,
|
||||
SubCluster: "main",
|
||||
Partition: "batch",
|
||||
NumNodes: 1,
|
||||
NumHWThreads: 4,
|
||||
NumAcc: 0,
|
||||
Shared: "none",
|
||||
MonitoringStatus: schema.MonitoringStatusRunningOrArchiving,
|
||||
SMT: 1,
|
||||
State: schema.JobStateRunning,
|
||||
StartTime: 1234567890,
|
||||
Duration: 0,
|
||||
Walltime: 3600,
|
||||
Resources: []*schema.Resource{
|
||||
{
|
||||
Hostname: "node01",
|
||||
HWThreads: []int{0, 1, 2, 3},
|
||||
},
|
||||
},
|
||||
Footprint: map[string]float64{
|
||||
"cpu_load": 50.0,
|
||||
"mem_used": 8000.0,
|
||||
"flops_any": 0.5,
|
||||
"mem_bw": 10.0,
|
||||
"net_bw": 2.0,
|
||||
"file_bw": 1.0,
|
||||
"cpu_used": 2.0,
|
||||
"cpu_load_core": 12.5,
|
||||
},
|
||||
MetaData: map[string]string{
|
||||
"jobName": "test_job",
|
||||
"queue": "normal",
|
||||
"qosName": "default",
|
||||
"accountName": "testaccount",
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
func TestInsertJob(t *testing.T) {
|
||||
r := setup(t)
|
||||
|
||||
t.Run("successful insertion", func(t *testing.T) {
|
||||
job := createTestJob(999001, "testcluster")
|
||||
job.RawResources, _ = json.Marshal(job.Resources)
|
||||
job.RawFootprint, _ = json.Marshal(job.Footprint)
|
||||
job.RawMetaData, _ = json.Marshal(job.MetaData)
|
||||
|
||||
id, err := r.InsertJob(job)
|
||||
require.NoError(t, err, "InsertJob should succeed")
|
||||
assert.Greater(t, id, int64(0), "Should return valid insert ID")
|
||||
|
||||
// Verify job was inserted into job_cache
|
||||
var count int
|
||||
err = r.DB.QueryRow("SELECT COUNT(*) FROM job_cache WHERE job_id = ? AND cluster = ?",
|
||||
job.JobID, job.Cluster).Scan(&count)
|
||||
require.NoError(t, err)
|
||||
assert.Equal(t, 1, count, "Job should be in job_cache table")
|
||||
|
||||
// Clean up
|
||||
_, err = r.DB.Exec("DELETE FROM job_cache WHERE job_id = ? AND cluster = ?", job.JobID, job.Cluster)
|
||||
require.NoError(t, err)
|
||||
})
|
||||
|
||||
t.Run("insertion with all fields", func(t *testing.T) {
|
||||
job := createTestJob(999002, "testcluster")
|
||||
job.ArrayJobID = 5000
|
||||
job.Energy = 1500.5
|
||||
job.RawResources, _ = json.Marshal(job.Resources)
|
||||
job.RawFootprint, _ = json.Marshal(job.Footprint)
|
||||
job.RawMetaData, _ = json.Marshal(job.MetaData)
|
||||
|
||||
id, err := r.InsertJob(job)
|
||||
require.NoError(t, err)
|
||||
assert.Greater(t, id, int64(0))
|
||||
|
||||
// Verify all fields were stored correctly
|
||||
var retrievedJob schema.Job
|
||||
err = r.DB.QueryRow(`SELECT job_id, hpc_user, project, cluster, array_job_id, energy
|
||||
FROM job_cache WHERE id = ?`, id).Scan(
|
||||
&retrievedJob.JobID, &retrievedJob.User, &retrievedJob.Project,
|
||||
&retrievedJob.Cluster, &retrievedJob.ArrayJobID, &retrievedJob.Energy)
|
||||
require.NoError(t, err)
|
||||
assert.Equal(t, job.JobID, retrievedJob.JobID)
|
||||
assert.Equal(t, job.User, retrievedJob.User)
|
||||
assert.Equal(t, job.Project, retrievedJob.Project)
|
||||
assert.Equal(t, job.Cluster, retrievedJob.Cluster)
|
||||
assert.Equal(t, job.ArrayJobID, retrievedJob.ArrayJobID)
|
||||
assert.Equal(t, job.Energy, retrievedJob.Energy)
|
||||
|
||||
// Clean up
|
||||
_, err = r.DB.Exec("DELETE FROM job_cache WHERE id = ?", id)
|
||||
require.NoError(t, err)
|
||||
})
|
||||
}
|
||||
|
||||
func TestStart(t *testing.T) {
|
||||
r := setup(t)
|
||||
|
||||
t.Run("successful job start with JSON encoding", func(t *testing.T) {
|
||||
job := createTestJob(999003, "testcluster")
|
||||
|
||||
id, err := r.Start(job)
|
||||
require.NoError(t, err, "Start should succeed")
|
||||
assert.Greater(t, id, int64(0), "Should return valid insert ID")
|
||||
|
||||
// Verify job was inserted and JSON fields were encoded
|
||||
var rawResources, rawFootprint, rawMetaData []byte
|
||||
err = r.DB.QueryRow(`SELECT resources, footprint, meta_data FROM job_cache WHERE id = ?`, id).Scan(
|
||||
&rawResources, &rawFootprint, &rawMetaData)
|
||||
require.NoError(t, err)
|
||||
|
||||
// Verify resources JSON
|
||||
var resources []*schema.Resource
|
||||
err = json.Unmarshal(rawResources, &resources)
|
||||
require.NoError(t, err, "Resources should be valid JSON")
|
||||
assert.Equal(t, 1, len(resources))
|
||||
assert.Equal(t, "node01", resources[0].Hostname)
|
||||
|
||||
// Verify footprint JSON
|
||||
var footprint map[string]float64
|
||||
err = json.Unmarshal(rawFootprint, &footprint)
|
||||
require.NoError(t, err, "Footprint should be valid JSON")
|
||||
assert.Equal(t, 50.0, footprint["cpu_load"])
|
||||
assert.Equal(t, 8000.0, footprint["mem_used"])
|
||||
|
||||
// Verify metadata JSON
|
||||
var metaData map[string]string
|
||||
err = json.Unmarshal(rawMetaData, &metaData)
|
||||
require.NoError(t, err, "MetaData should be valid JSON")
|
||||
assert.Equal(t, "test_job", metaData["jobName"])
|
||||
|
||||
// Clean up
|
||||
_, err = r.DB.Exec("DELETE FROM job_cache WHERE id = ?", id)
|
||||
require.NoError(t, err)
|
||||
})
|
||||
|
||||
t.Run("job start with empty footprint", func(t *testing.T) {
|
||||
job := createTestJob(999004, "testcluster")
|
||||
job.Footprint = map[string]float64{}
|
||||
|
||||
id, err := r.Start(job)
|
||||
require.NoError(t, err)
|
||||
assert.Greater(t, id, int64(0))
|
||||
|
||||
// Verify empty footprint was encoded as empty JSON object
|
||||
var rawFootprint []byte
|
||||
err = r.DB.QueryRow(`SELECT footprint FROM job_cache WHERE id = ?`, id).Scan(&rawFootprint)
|
||||
require.NoError(t, err)
|
||||
assert.Equal(t, []byte("{}"), rawFootprint)
|
||||
|
||||
// Clean up
|
||||
_, err = r.DB.Exec("DELETE FROM job_cache WHERE id = ?", id)
|
||||
require.NoError(t, err)
|
||||
})
|
||||
|
||||
t.Run("job start with nil metadata", func(t *testing.T) {
|
||||
job := createTestJob(999005, "testcluster")
|
||||
job.MetaData = nil
|
||||
|
||||
id, err := r.Start(job)
|
||||
require.NoError(t, err)
|
||||
assert.Greater(t, id, int64(0))
|
||||
|
||||
// Clean up
|
||||
_, err = r.DB.Exec("DELETE FROM job_cache WHERE id = ?", id)
|
||||
require.NoError(t, err)
|
||||
})
|
||||
}
|
||||
|
||||
func TestStop(t *testing.T) {
|
||||
r := setup(t)
|
||||
|
||||
t.Run("successful job stop", func(t *testing.T) {
|
||||
// First insert a job using Start
|
||||
job := createTestJob(999106, "testcluster")
|
||||
id, err := r.Start(job)
|
||||
require.NoError(t, err)
|
||||
|
||||
// Move from job_cache to job table (simulate SyncJobs) - exclude id to let it auto-increment
|
||||
_, err = r.DB.Exec(`INSERT INTO job (job_id, cluster, subcluster, submit_time, start_time, hpc_user, project,
|
||||
cluster_partition, array_job_id, duration, walltime, job_state, meta_data, resources, num_nodes,
|
||||
num_hwthreads, num_acc, smt, shared, monitoring_status, energy, energy_footprint, footprint)
|
||||
SELECT job_id, cluster, subcluster, submit_time, start_time, hpc_user, project,
|
||||
cluster_partition, array_job_id, duration, walltime, job_state, meta_data, resources, num_nodes,
|
||||
num_hwthreads, num_acc, smt, shared, monitoring_status, energy, energy_footprint, footprint
|
||||
FROM job_cache WHERE id = ?`, id)
|
||||
require.NoError(t, err)
|
||||
_, err = r.DB.Exec("DELETE FROM job_cache WHERE id = ?", id)
|
||||
require.NoError(t, err)
|
||||
|
||||
// Get the new job id in the job table
|
||||
err = r.DB.QueryRow("SELECT id FROM job WHERE job_id = ? AND cluster = ? AND start_time = ?",
|
||||
job.JobID, job.Cluster, job.StartTime).Scan(&id)
|
||||
require.NoError(t, err)
|
||||
|
||||
// Stop the job
|
||||
duration := int32(3600)
|
||||
state := schema.JobStateCompleted
|
||||
monitoringStatus := int32(schema.MonitoringStatusArchivingSuccessful)
|
||||
|
||||
err = r.Stop(id, duration, state, monitoringStatus)
|
||||
require.NoError(t, err, "Stop should succeed")
|
||||
|
||||
// Verify job was updated
|
||||
var retrievedDuration int32
|
||||
var retrievedState string
|
||||
var retrievedMonStatus int32
|
||||
err = r.DB.QueryRow(`SELECT duration, job_state, monitoring_status FROM job WHERE id = ?`, id).Scan(
|
||||
&retrievedDuration, &retrievedState, &retrievedMonStatus)
|
||||
require.NoError(t, err)
|
||||
assert.Equal(t, duration, retrievedDuration)
|
||||
assert.Equal(t, string(state), retrievedState)
|
||||
assert.Equal(t, monitoringStatus, retrievedMonStatus)
|
||||
|
||||
// Clean up
|
||||
_, err = r.DB.Exec("DELETE FROM job WHERE id = ?", id)
|
||||
require.NoError(t, err)
|
||||
})
|
||||
|
||||
t.Run("stop updates job state transitions", func(t *testing.T) {
|
||||
// Insert a job
|
||||
job := createTestJob(999107, "testcluster")
|
||||
id, err := r.Start(job)
|
||||
require.NoError(t, err)
|
||||
|
||||
// Move to job table
|
||||
_, err = r.DB.Exec(`INSERT INTO job (job_id, cluster, subcluster, submit_time, start_time, hpc_user, project,
|
||||
cluster_partition, array_job_id, duration, walltime, job_state, meta_data, resources, num_nodes,
|
||||
num_hwthreads, num_acc, smt, shared, monitoring_status, energy, energy_footprint, footprint)
|
||||
SELECT job_id, cluster, subcluster, submit_time, start_time, hpc_user, project,
|
||||
cluster_partition, array_job_id, duration, walltime, job_state, meta_data, resources, num_nodes,
|
||||
num_hwthreads, num_acc, smt, shared, monitoring_status, energy, energy_footprint, footprint
|
||||
FROM job_cache WHERE id = ?`, id)
|
||||
require.NoError(t, err)
|
||||
_, err = r.DB.Exec("DELETE FROM job_cache WHERE id = ?", id)
|
||||
require.NoError(t, err)
|
||||
|
||||
// Get the new job id in the job table
|
||||
err = r.DB.QueryRow("SELECT id FROM job WHERE job_id = ? AND cluster = ? AND start_time = ?",
|
||||
job.JobID, job.Cluster, job.StartTime).Scan(&id)
|
||||
require.NoError(t, err)
|
||||
|
||||
// Stop the job with different duration
|
||||
err = r.Stop(id, 7200, schema.JobStateCompleted, int32(schema.MonitoringStatusArchivingSuccessful))
|
||||
require.NoError(t, err)
|
||||
|
||||
// Verify the duration was updated correctly
|
||||
var duration int32
|
||||
err = r.DB.QueryRow(`SELECT duration FROM job WHERE id = ?`, id).Scan(&duration)
|
||||
require.NoError(t, err)
|
||||
assert.Equal(t, int32(7200), duration, "Duration should be updated to 7200")
|
||||
|
||||
// Clean up
|
||||
_, err = r.DB.Exec("DELETE FROM job WHERE id = ?", id)
|
||||
require.NoError(t, err)
|
||||
})
|
||||
|
||||
t.Run("stop with different states", func(t *testing.T) {
|
||||
testCases := []struct {
|
||||
name string
|
||||
jobID int64
|
||||
state schema.JobState
|
||||
monitoringStatus int32
|
||||
}{
|
||||
{"completed", 999108, schema.JobStateCompleted, int32(schema.MonitoringStatusArchivingSuccessful)},
|
||||
{"failed", 999118, schema.JobStateFailed, int32(schema.MonitoringStatusArchivingSuccessful)},
|
||||
{"cancelled", 999119, schema.JobStateCancelled, int32(schema.MonitoringStatusArchivingSuccessful)},
|
||||
{"timeout", 999120, schema.JobStateTimeout, int32(schema.MonitoringStatusArchivingSuccessful)},
|
||||
}
|
||||
|
||||
for _, tc := range testCases {
|
||||
t.Run(tc.name, func(t *testing.T) {
|
||||
job := createTestJob(tc.jobID, "testcluster")
|
||||
id, err := r.Start(job)
|
||||
require.NoError(t, err)
|
||||
|
||||
// Move to job table
|
||||
_, err = r.DB.Exec(`INSERT INTO job (job_id, cluster, subcluster, submit_time, start_time, hpc_user, project,
|
||||
cluster_partition, array_job_id, duration, walltime, job_state, meta_data, resources, num_nodes,
|
||||
num_hwthreads, num_acc, smt, shared, monitoring_status, energy, energy_footprint, footprint)
|
||||
SELECT job_id, cluster, subcluster, submit_time, start_time, hpc_user, project,
|
||||
cluster_partition, array_job_id, duration, walltime, job_state, meta_data, resources, num_nodes,
|
||||
num_hwthreads, num_acc, smt, shared, monitoring_status, energy, energy_footprint, footprint
|
||||
FROM job_cache WHERE id = ?`, id)
|
||||
require.NoError(t, err)
|
||||
_, err = r.DB.Exec("DELETE FROM job_cache WHERE id = ?", id)
|
||||
require.NoError(t, err)
|
||||
|
||||
// Get the new job id in the job table
|
||||
err = r.DB.QueryRow("SELECT id FROM job WHERE job_id = ? AND cluster = ? AND start_time = ?",
|
||||
job.JobID, job.Cluster, job.StartTime).Scan(&id)
|
||||
require.NoError(t, err)
|
||||
|
||||
// Stop with specific state
|
||||
err = r.Stop(id, 1800, tc.state, tc.monitoringStatus)
|
||||
require.NoError(t, err)
|
||||
|
||||
// Verify state was set correctly
|
||||
var retrievedState string
|
||||
err = r.DB.QueryRow(`SELECT job_state FROM job WHERE id = ?`, id).Scan(&retrievedState)
|
||||
require.NoError(t, err)
|
||||
assert.Equal(t, string(tc.state), retrievedState)
|
||||
|
||||
// Clean up
|
||||
_, err = r.DB.Exec("DELETE FROM job WHERE id = ?", id)
|
||||
require.NoError(t, err)
|
||||
})
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
func TestTransferCachedJobToMain(t *testing.T) {
|
||||
r := setup(t)
|
||||
|
||||
t.Run("successful transfer from cache to main", func(t *testing.T) {
|
||||
// Insert a job in job_cache
|
||||
job := createTestJob(999009, "testcluster")
|
||||
cacheID, err := r.Start(job)
|
||||
require.NoError(t, err)
|
||||
|
||||
// Transfer the cached job to the main table
|
||||
r.Mutex.Lock()
|
||||
newID, err := r.TransferCachedJobToMain(cacheID)
|
||||
r.Mutex.Unlock()
|
||||
require.NoError(t, err, "TransferCachedJobToMain should succeed")
|
||||
assert.NotEqual(t, cacheID, newID, "New ID should differ from cache ID")
|
||||
|
||||
// Verify job exists in job table
|
||||
var count int
|
||||
err = r.DB.QueryRow(`SELECT COUNT(*) FROM job WHERE id = ?`, newID).Scan(&count)
|
||||
require.NoError(t, err)
|
||||
assert.Equal(t, 1, count, "Job should exist in main table")
|
||||
|
||||
// Verify job was removed from job_cache
|
||||
err = r.DB.QueryRow(`SELECT COUNT(*) FROM job_cache WHERE id = ?`, cacheID).Scan(&count)
|
||||
require.NoError(t, err)
|
||||
assert.Equal(t, 0, count, "Job should be removed from cache")
|
||||
|
||||
// Clean up
|
||||
_, err = r.DB.Exec("DELETE FROM job WHERE id = ?", newID)
|
||||
require.NoError(t, err)
|
||||
})
|
||||
|
||||
t.Run("transfer preserves job data", func(t *testing.T) {
|
||||
// Insert a job in job_cache
|
||||
job := createTestJob(999010, "testcluster")
|
||||
cacheID, err := r.Start(job)
|
||||
require.NoError(t, err)
|
||||
|
||||
// Transfer the cached job
|
||||
r.Mutex.Lock()
|
||||
newID, err := r.TransferCachedJobToMain(cacheID)
|
||||
r.Mutex.Unlock()
|
||||
require.NoError(t, err)
|
||||
|
||||
// Verify the transferred job has the correct data
|
||||
var jobID int64
|
||||
var cluster string
|
||||
err = r.DB.QueryRow(`SELECT job_id, cluster FROM job WHERE id = ?`, newID).Scan(&jobID, &cluster)
|
||||
require.NoError(t, err)
|
||||
assert.Equal(t, job.JobID, jobID)
|
||||
assert.Equal(t, job.Cluster, cluster)
|
||||
|
||||
// Clean up
|
||||
_, err = r.DB.Exec("DELETE FROM job WHERE id = ?", newID)
|
||||
require.NoError(t, err)
|
||||
})
|
||||
}
|
||||
|
||||
func TestSyncJobs(t *testing.T) {
|
||||
r := setup(t)
|
||||
|
||||
t.Run("sync jobs from cache to main table", func(t *testing.T) {
|
||||
// Ensure cache is empty first
|
||||
_, err := r.DB.Exec("DELETE FROM job_cache")
|
||||
require.NoError(t, err)
|
||||
|
||||
// Insert multiple jobs in job_cache
|
||||
job1 := createTestJob(999011, "testcluster")
|
||||
job2 := createTestJob(999012, "testcluster")
|
||||
job3 := createTestJob(999013, "testcluster")
|
||||
|
||||
_, err = r.Start(job1)
|
||||
require.NoError(t, err)
|
||||
_, err = r.Start(job2)
|
||||
require.NoError(t, err)
|
||||
_, err = r.Start(job3)
|
||||
require.NoError(t, err)
|
||||
|
||||
// Verify jobs are in job_cache
|
||||
var cacheCount int
|
||||
err = r.DB.QueryRow("SELECT COUNT(*) FROM job_cache WHERE job_id IN (?, ?, ?)",
|
||||
job1.JobID, job2.JobID, job3.JobID).Scan(&cacheCount)
|
||||
require.NoError(t, err)
|
||||
assert.Equal(t, 3, cacheCount, "All jobs should be in job_cache")
|
||||
|
||||
// Sync jobs
|
||||
jobs, err := r.SyncJobs()
|
||||
require.NoError(t, err, "SyncJobs should succeed")
|
||||
assert.Equal(t, 3, len(jobs), "Should return 3 synced jobs")
|
||||
|
||||
// Verify jobs were moved to job table
|
||||
var jobCount int
|
||||
err = r.DB.QueryRow("SELECT COUNT(*) FROM job WHERE job_id IN (?, ?, ?)",
|
||||
job1.JobID, job2.JobID, job3.JobID).Scan(&jobCount)
|
||||
require.NoError(t, err)
|
||||
assert.Equal(t, 3, jobCount, "All jobs should be in job table")
|
||||
|
||||
// Verify job_cache was cleared
|
||||
err = r.DB.QueryRow("SELECT COUNT(*) FROM job_cache WHERE job_id IN (?, ?, ?)",
|
||||
job1.JobID, job2.JobID, job3.JobID).Scan(&cacheCount)
|
||||
require.NoError(t, err)
|
||||
assert.Equal(t, 0, cacheCount, "job_cache should be empty after sync")
|
||||
|
||||
// Clean up
|
||||
_, err = r.DB.Exec("DELETE FROM job WHERE job_id IN (?, ?, ?)", job1.JobID, job2.JobID, job3.JobID)
|
||||
require.NoError(t, err)
|
||||
})
|
||||
|
||||
t.Run("sync preserves job data", func(t *testing.T) {
|
||||
// Ensure cache is empty first
|
||||
_, err := r.DB.Exec("DELETE FROM job_cache")
|
||||
require.NoError(t, err)
|
||||
|
||||
// Insert a job with specific data
|
||||
job := createTestJob(999014, "testcluster")
|
||||
job.ArrayJobID = 7777
|
||||
job.Energy = 2500.75
|
||||
job.Duration = 1800
|
||||
|
||||
id, err := r.Start(job)
|
||||
require.NoError(t, err)
|
||||
|
||||
// Update some fields to simulate job progress
|
||||
result, err := r.DB.Exec(`UPDATE job_cache SET duration = ?, energy = ? WHERE id = ?`,
|
||||
3600, 3000.5, id)
|
||||
require.NoError(t, err)
|
||||
rowsAffected, _ := result.RowsAffected()
|
||||
require.Equal(t, int64(1), rowsAffected, "UPDATE should affect exactly 1 row")
|
||||
|
||||
// Verify the update worked
|
||||
var checkDuration int32
|
||||
var checkEnergy float64
|
||||
err = r.DB.QueryRow(`SELECT duration, energy FROM job_cache WHERE id = ?`, id).Scan(&checkDuration, &checkEnergy)
|
||||
require.NoError(t, err)
|
||||
require.Equal(t, int32(3600), checkDuration, "Duration should be updated to 3600 before sync")
|
||||
require.Equal(t, 3000.5, checkEnergy, "Energy should be updated to 3000.5 before sync")
|
||||
|
||||
// Sync jobs
|
||||
jobs, err := r.SyncJobs()
|
||||
require.NoError(t, err)
|
||||
require.Equal(t, 1, len(jobs), "Should return exactly 1 synced job")
|
||||
|
||||
// Verify in database
|
||||
var dbJob schema.Job
|
||||
err = r.DB.QueryRow(`SELECT job_id, hpc_user, project, cluster, array_job_id, duration, energy
|
||||
FROM job WHERE job_id = ? AND cluster = ?`, job.JobID, job.Cluster).Scan(
|
||||
&dbJob.JobID, &dbJob.User, &dbJob.Project, &dbJob.Cluster,
|
||||
&dbJob.ArrayJobID, &dbJob.Duration, &dbJob.Energy)
|
||||
require.NoError(t, err)
|
||||
assert.Equal(t, job.JobID, dbJob.JobID)
|
||||
assert.Equal(t, int32(3600), dbJob.Duration)
|
||||
assert.Equal(t, 3000.5, dbJob.Energy)
|
||||
|
||||
// Clean up
|
||||
_, err = r.DB.Exec("DELETE FROM job WHERE job_id = ? AND cluster = ?", job.JobID, job.Cluster)
|
||||
require.NoError(t, err)
|
||||
})
|
||||
|
||||
t.Run("sync returns job table IDs not cache IDs", func(t *testing.T) {
|
||||
// Ensure cache is empty first
|
||||
_, err := r.DB.Exec("DELETE FROM job_cache")
|
||||
require.NoError(t, err)
|
||||
|
||||
// Insert a job into job_cache
|
||||
job := createTestJob(999015, "testcluster")
|
||||
cacheID, err := r.Start(job)
|
||||
require.NoError(t, err)
|
||||
|
||||
// Sync jobs
|
||||
jobs, err := r.SyncJobs()
|
||||
require.NoError(t, err)
|
||||
require.Equal(t, 1, len(jobs))
|
||||
|
||||
// The returned ID must refer to the job table, not job_cache
|
||||
var jobTableID int64
|
||||
err = r.DB.QueryRow("SELECT id FROM job WHERE job_id = ? AND cluster = ? AND start_time = ?",
|
||||
jobs[0].JobID, jobs[0].Cluster, jobs[0].StartTime).Scan(&jobTableID)
|
||||
require.NoError(t, err)
|
||||
assert.Equal(t, jobTableID, *jobs[0].ID,
|
||||
"returned ID should match the job table row, not the cache ID (%d)", cacheID)
|
||||
|
||||
// Clean up
|
||||
_, err = r.DB.Exec("DELETE FROM job WHERE job_id = ? AND cluster = ?", job.JobID, job.Cluster)
|
||||
require.NoError(t, err)
|
||||
})
|
||||
|
||||
t.Run("sync with empty cache returns empty list", func(t *testing.T) {
|
||||
// Ensure cache is empty
|
||||
_, err := r.DB.Exec("DELETE FROM job_cache")
|
||||
require.NoError(t, err)
|
||||
|
||||
// Sync should return empty list
|
||||
jobs, err := r.SyncJobs()
|
||||
require.NoError(t, err)
|
||||
assert.Equal(t, 0, len(jobs), "Should return empty list when cache is empty")
|
||||
})
|
||||
}
|
||||
|
||||
func TestInsertJobDirect(t *testing.T) {
|
||||
r := setup(t)
|
||||
|
||||
t.Run("inserts into job table not cache", func(t *testing.T) {
|
||||
job := createTestJob(999020, "testcluster")
|
||||
job.RawResources, _ = json.Marshal(job.Resources)
|
||||
job.RawFootprint, _ = json.Marshal(job.Footprint)
|
||||
job.RawMetaData, _ = json.Marshal(job.MetaData)
|
||||
|
||||
id, err := r.InsertJobDirect(job)
|
||||
require.NoError(t, err, "InsertJobDirect should succeed")
|
||||
assert.Greater(t, id, int64(0), "Should return valid insert ID")
|
||||
|
||||
// Verify job is in job table
|
||||
var count int
|
||||
err = r.DB.QueryRow("SELECT COUNT(*) FROM job WHERE id = ?", id).Scan(&count)
|
||||
require.NoError(t, err)
|
||||
assert.Equal(t, 1, count, "Job should be in job table")
|
||||
|
||||
// Verify job is NOT in job_cache
|
||||
err = r.DB.QueryRow("SELECT COUNT(*) FROM job_cache WHERE job_id = ? AND cluster = ?",
|
||||
job.JobID, job.Cluster).Scan(&count)
|
||||
require.NoError(t, err)
|
||||
assert.Equal(t, 0, count, "Job should NOT be in job_cache")
|
||||
|
||||
// Clean up
|
||||
_, err = r.DB.Exec("DELETE FROM job WHERE id = ?", id)
|
||||
require.NoError(t, err)
|
||||
})
|
||||
|
||||
t.Run("returned ID works for tag operations", func(t *testing.T) {
|
||||
job := createTestJob(999021, "testcluster")
|
||||
job.RawResources, _ = json.Marshal(job.Resources)
|
||||
job.RawFootprint, _ = json.Marshal(job.Footprint)
|
||||
job.RawMetaData, _ = json.Marshal(job.MetaData)
|
||||
|
||||
id, err := r.InsertJobDirect(job)
|
||||
require.NoError(t, err)
|
||||
|
||||
// Adding a tag using the returned ID should succeed (FK constraint on jobtag)
|
||||
err = r.ImportTag(id, "test_type", "test_name", "global")
|
||||
require.NoError(t, err, "ImportTag should succeed with direct insert ID")
|
||||
|
||||
// Clean up
|
||||
_, err = r.DB.Exec("DELETE FROM jobtag WHERE job_id = ?", id)
|
||||
require.NoError(t, err)
|
||||
_, err = r.DB.Exec("DELETE FROM job WHERE id = ?", id)
|
||||
require.NoError(t, err)
|
||||
})
|
||||
}
|
||||
|
||||
func TestStartDirect(t *testing.T) {
|
||||
r := setup(t)
|
||||
|
||||
t.Run("inserts into job table with JSON encoding", func(t *testing.T) {
|
||||
job := createTestJob(999022, "testcluster")
|
||||
|
||||
id, err := r.StartDirect(job)
|
||||
require.NoError(t, err, "StartDirect should succeed")
|
||||
assert.Greater(t, id, int64(0))
|
||||
|
||||
// Verify job is in job table with encoded JSON
|
||||
var rawResources []byte
|
||||
err = r.DB.QueryRow("SELECT resources FROM job WHERE id = ?", id).Scan(&rawResources)
|
||||
require.NoError(t, err)
|
||||
|
||||
var resources []*schema.Resource
|
||||
err = json.Unmarshal(rawResources, &resources)
|
||||
require.NoError(t, err, "Resources should be valid JSON")
|
||||
assert.Equal(t, "node01", resources[0].Hostname)
|
||||
|
||||
// Clean up
|
||||
_, err = r.DB.Exec("DELETE FROM job WHERE id = ?", id)
|
||||
require.NoError(t, err)
|
||||
})
|
||||
}
|
||||
@@ -2,6 +2,7 @@
|
||||
// All rights reserved. This file is part of cc-backend.
|
||||
// Use of this source code is governed by a MIT-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
package repository
|
||||
|
||||
import (
|
||||
@@ -11,8 +12,8 @@ import (
|
||||
"time"
|
||||
|
||||
"github.com/ClusterCockpit/cc-backend/internal/graph/model"
|
||||
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
|
||||
"github.com/ClusterCockpit/cc-lib/schema"
|
||||
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
|
||||
"github.com/ClusterCockpit/cc-lib/v2/schema"
|
||||
sq "github.com/Masterminds/squirrel"
|
||||
)
|
||||
|
||||
@@ -22,13 +23,17 @@ import (
|
||||
// It returns a pointer to a schema.Job data structure and an error variable.
|
||||
// To check if no job was found test err == sql.ErrNoRows
|
||||
func (r *JobRepository) Find(
|
||||
jobId *int64,
|
||||
jobID *int64,
|
||||
cluster *string,
|
||||
startTime *int64,
|
||||
) (*schema.Job, error) {
|
||||
if jobID == nil {
|
||||
return nil, fmt.Errorf("jobID cannot be nil")
|
||||
}
|
||||
|
||||
start := time.Now()
|
||||
q := sq.Select(jobColumns...).From("job").
|
||||
Where("job.job_id = ?", *jobId)
|
||||
Where("job.job_id = ?", *jobID)
|
||||
|
||||
if cluster != nil {
|
||||
q = q.Where("job.cluster = ?", *cluster)
|
||||
@@ -37,19 +42,29 @@ func (r *JobRepository) Find(
|
||||
q = q.Where("job.start_time = ?", *startTime)
|
||||
}
|
||||
|
||||
q = q.OrderBy("job.id DESC") // always use newest matching job by db id if more than one match
|
||||
q = q.OrderBy("job.id DESC").Limit(1) // always use newest matching job by db id if more than one match
|
||||
|
||||
cclog.Debugf("Timer Find %s", time.Since(start))
|
||||
return scanJob(q.RunWith(r.stmtCache).QueryRow())
|
||||
}
|
||||
|
||||
// FindCached executes a SQL query to find a specific batch job from the job_cache table.
|
||||
// The job is queried using the batch job id, and optionally filtered by cluster name
|
||||
// and start time (UNIX epoch time seconds). This method uses cached job data which
|
||||
// may be stale but provides faster access than Find().
|
||||
// It returns a pointer to a schema.Job data structure and an error variable.
|
||||
// To check if no job was found test err == sql.ErrNoRows
|
||||
func (r *JobRepository) FindCached(
|
||||
jobId *int64,
|
||||
jobID *int64,
|
||||
cluster *string,
|
||||
startTime *int64,
|
||||
) (*schema.Job, error) {
|
||||
if jobID == nil {
|
||||
return nil, fmt.Errorf("jobID cannot be nil")
|
||||
}
|
||||
|
||||
q := sq.Select(jobCacheColumns...).From("job_cache").
|
||||
Where("job_cache.job_id = ?", *jobId)
|
||||
Where("job_cache.job_id = ?", *jobID)
|
||||
|
||||
if cluster != nil {
|
||||
q = q.Where("job_cache.cluster = ?", *cluster)
|
||||
@@ -58,24 +73,28 @@ func (r *JobRepository) FindCached(
|
||||
q = q.Where("job_cache.start_time = ?", *startTime)
|
||||
}
|
||||
|
||||
q = q.OrderBy("job_cache.id DESC") // always use newest matching job by db id if more than one match
|
||||
q = q.OrderBy("job_cache.id DESC").Limit(1) // always use newest matching job by db id if more than one match
|
||||
|
||||
return scanJob(q.RunWith(r.stmtCache).QueryRow())
|
||||
}
|
||||
|
||||
// Find executes a SQL query to find a specific batch job.
|
||||
// The job is queried using the batch job id, the cluster name,
|
||||
// and the start time of the job in UNIX epoch time seconds.
|
||||
// It returns a pointer to a schema.Job data structure and an error variable.
|
||||
// To check if no job was found test err == sql.ErrNoRows
|
||||
// FindAll executes a SQL query to find all batch jobs matching the given criteria.
|
||||
// Jobs are queried using the batch job id, and optionally filtered by cluster name
|
||||
// and start time (UNIX epoch time seconds).
|
||||
// It returns a slice of pointers to schema.Job data structures and an error variable.
|
||||
// An empty slice is returned if no matching jobs are found.
|
||||
func (r *JobRepository) FindAll(
|
||||
jobId *int64,
|
||||
jobID *int64,
|
||||
cluster *string,
|
||||
startTime *int64,
|
||||
) ([]*schema.Job, error) {
|
||||
if jobID == nil {
|
||||
return nil, fmt.Errorf("jobID cannot be nil")
|
||||
}
|
||||
|
||||
start := time.Now()
|
||||
q := sq.Select(jobColumns...).From("job").
|
||||
Where("job.job_id = ?", *jobId)
|
||||
Where("job.job_id = ?", *jobID)
|
||||
|
||||
if cluster != nil {
|
||||
q = q.Where("job.cluster = ?", *cluster)
|
||||
@@ -86,8 +105,8 @@ func (r *JobRepository) FindAll(
|
||||
|
||||
rows, err := q.RunWith(r.stmtCache).Query()
|
||||
if err != nil {
|
||||
cclog.Error("Error while running query")
|
||||
return nil, err
|
||||
cclog.Errorf("Error while running FindAll query for jobID=%d: %v", *jobID, err)
|
||||
return nil, fmt.Errorf("failed to execute FindAll query: %w", err)
|
||||
}
|
||||
defer rows.Close()
|
||||
|
||||
@@ -95,8 +114,8 @@ func (r *JobRepository) FindAll(
|
||||
for rows.Next() {
|
||||
job, err := scanJob(rows)
|
||||
if err != nil {
|
||||
cclog.Warn("Error while scanning rows")
|
||||
return nil, err
|
||||
cclog.Warnf("Error while scanning rows in FindAll: %v", err)
|
||||
return nil, fmt.Errorf("failed to scan job row: %w", err)
|
||||
}
|
||||
jobs = append(jobs, job)
|
||||
}
|
||||
@@ -119,8 +138,8 @@ func (r *JobRepository) GetJobList(limit int, offset int) ([]int64, error) {
|
||||
|
||||
rows, err := query.RunWith(r.stmtCache).Query()
|
||||
if err != nil {
|
||||
cclog.Error("Error while running query")
|
||||
return nil, err
|
||||
cclog.Errorf("Error while running GetJobList query (limit=%d, offset=%d): %v", limit, offset, err)
|
||||
return nil, fmt.Errorf("failed to execute GetJobList query: %w", err)
|
||||
}
|
||||
defer rows.Close()
|
||||
|
||||
@@ -129,23 +148,23 @@ func (r *JobRepository) GetJobList(limit int, offset int) ([]int64, error) {
|
||||
var id int64
|
||||
err := rows.Scan(&id)
|
||||
if err != nil {
|
||||
cclog.Warn("Error while scanning rows")
|
||||
return nil, err
|
||||
cclog.Warnf("Error while scanning rows in GetJobList: %v", err)
|
||||
return nil, fmt.Errorf("failed to scan job ID: %w", err)
|
||||
}
|
||||
jl = append(jl, id)
|
||||
}
|
||||
|
||||
cclog.Infof("Return job count %d", len(jl))
|
||||
cclog.Debugf("JobRepository.GetJobList(): Return job count %d", len(jl))
|
||||
return jl, nil
|
||||
}
|
||||
|
||||
// FindById executes a SQL query to find a specific batch job.
|
||||
// FindByID executes a SQL query to find a specific batch job.
|
||||
// The job is queried using the database id.
|
||||
// It returns a pointer to a schema.Job data structure and an error variable.
|
||||
// To check if no job was found test err == sql.ErrNoRows
|
||||
func (r *JobRepository) FindById(ctx context.Context, jobId int64) (*schema.Job, error) {
|
||||
func (r *JobRepository) FindByID(ctx context.Context, jobID int64) (*schema.Job, error) {
|
||||
q := sq.Select(jobColumns...).
|
||||
From("job").Where("job.id = ?", jobId)
|
||||
From("job").Where("job.id = ?", jobID)
|
||||
|
||||
q, qerr := SecurityCheck(ctx, q)
|
||||
if qerr != nil {
|
||||
@@ -155,14 +174,14 @@ func (r *JobRepository) FindById(ctx context.Context, jobId int64) (*schema.Job,
|
||||
return scanJob(q.RunWith(r.stmtCache).QueryRow())
|
||||
}
|
||||
|
||||
// FindByIdWithUser executes a SQL query to find a specific batch job.
|
||||
// FindByIDWithUser executes a SQL query to find a specific batch job.
|
||||
// The job is queried using the database id. The user is passed directly,
|
||||
// instead as part of the context.
|
||||
// It returns a pointer to a schema.Job data structure and an error variable.
|
||||
// To check if no job was found test err == sql.ErrNoRows
|
||||
func (r *JobRepository) FindByIdWithUser(user *schema.User, jobId int64) (*schema.Job, error) {
|
||||
func (r *JobRepository) FindByIDWithUser(user *schema.User, jobID int64) (*schema.Job, error) {
|
||||
q := sq.Select(jobColumns...).
|
||||
From("job").Where("job.id = ?", jobId)
|
||||
From("job").Where("job.id = ?", jobID)
|
||||
|
||||
q, qerr := SecurityCheckWithUser(user, q)
|
||||
if qerr != nil {
|
||||
@@ -172,24 +191,24 @@ func (r *JobRepository) FindByIdWithUser(user *schema.User, jobId int64) (*schem
|
||||
return scanJob(q.RunWith(r.stmtCache).QueryRow())
|
||||
}
|
||||
|
||||
// FindByIdDirect executes a SQL query to find a specific batch job.
|
||||
// FindByIDDirect executes a SQL query to find a specific batch job.
|
||||
// The job is queried using the database id.
|
||||
// It returns a pointer to a schema.Job data structure and an error variable.
|
||||
// To check if no job was found test err == sql.ErrNoRows
|
||||
func (r *JobRepository) FindByIdDirect(jobId int64) (*schema.Job, error) {
|
||||
func (r *JobRepository) FindByIDDirect(jobID int64) (*schema.Job, error) {
|
||||
q := sq.Select(jobColumns...).
|
||||
From("job").Where("job.id = ?", jobId)
|
||||
From("job").Where("job.id = ?", jobID)
|
||||
return scanJob(q.RunWith(r.stmtCache).QueryRow())
|
||||
}
|
||||
|
||||
// FindByJobId executes a SQL query to find a specific batch job.
|
||||
// FindByJobID executes a SQL query to find a specific batch job.
|
||||
// The job is queried using the slurm id and the clustername.
|
||||
// It returns a pointer to a schema.Job data structure and an error variable.
|
||||
// To check if no job was found test err == sql.ErrNoRows
|
||||
func (r *JobRepository) FindByJobId(ctx context.Context, jobId int64, startTime int64, cluster string) (*schema.Job, error) {
|
||||
func (r *JobRepository) FindByJobID(ctx context.Context, jobID int64, startTime int64, cluster string) (*schema.Job, error) {
|
||||
q := sq.Select(jobColumns...).
|
||||
From("job").
|
||||
Where("job.job_id = ?", jobId).
|
||||
Where("job.job_id = ?", jobID).
|
||||
Where("job.cluster = ?", cluster).
|
||||
Where("job.start_time = ?", startTime)
|
||||
|
||||
@@ -201,19 +220,22 @@ func (r *JobRepository) FindByJobId(ctx context.Context, jobId int64, startTime
|
||||
return scanJob(q.RunWith(r.stmtCache).QueryRow())
|
||||
}
|
||||
|
||||
// IsJobOwner executes a SQL query to find a specific batch job.
|
||||
// The job is queried using the slurm id,a username and the cluster.
|
||||
// It returns a bool.
|
||||
// If job was found, user is owner: test err != sql.ErrNoRows
|
||||
func (r *JobRepository) IsJobOwner(jobId int64, startTime int64, user string, cluster string) bool {
|
||||
// IsJobOwner checks if the specified user owns the batch job identified by jobID,
|
||||
// startTime, and cluster. Returns true if the user is the owner, false otherwise.
|
||||
// This method does not return errors; it returns false for both non-existent jobs
|
||||
// and jobs owned by other users.
|
||||
func (r *JobRepository) IsJobOwner(jobID int64, startTime int64, user string, cluster string) bool {
|
||||
q := sq.Select("id").
|
||||
From("job").
|
||||
Where("job.job_id = ?", jobId).
|
||||
Where("job.job_id = ?", jobID).
|
||||
Where("job.hpc_user = ?", user).
|
||||
Where("job.cluster = ?", cluster).
|
||||
Where("job.start_time = ?", startTime)
|
||||
|
||||
_, err := scanJob(q.RunWith(r.stmtCache).QueryRow())
|
||||
if err != nil && err != sql.ErrNoRows {
|
||||
cclog.Warnf("IsJobOwner: unexpected error for jobID=%d, user=%s, cluster=%s: %v", jobID, user, cluster, err)
|
||||
}
|
||||
return err != sql.ErrNoRows
|
||||
}
|
||||
|
||||
@@ -231,6 +253,11 @@ func (r *JobRepository) FindConcurrentJobs(
|
||||
}
|
||||
|
||||
query = query.Where("cluster = ?", job.Cluster)
|
||||
|
||||
if len(job.Resources) == 0 {
|
||||
return nil, fmt.Errorf("job has no resources defined")
|
||||
}
|
||||
|
||||
var startTime int64
|
||||
var stopTime int64
|
||||
|
||||
@@ -243,25 +270,28 @@ func (r *JobRepository) FindConcurrentJobs(
|
||||
stopTime = startTime + int64(job.Duration)
|
||||
}
|
||||
|
||||
// Add 200s overlap for jobs start time at the end
|
||||
startTimeTail := startTime + 10
|
||||
stopTimeTail := stopTime - 200
|
||||
startTimeFront := startTime + 200
|
||||
// Time buffer constant for finding overlapping jobs
|
||||
// overlapBufferEnd: 200s buffer at job end to account for scheduling/cleanup overlap
|
||||
const overlapBufferEnd = 200
|
||||
|
||||
queryRunning := query.Where("job.job_state = ?").Where("(job.start_time BETWEEN ? AND ? OR job.start_time < ?)",
|
||||
"running", startTimeTail, stopTimeTail, startTime)
|
||||
stopTimeTail := stopTime - overlapBufferEnd
|
||||
startTimeFront := startTime + overlapBufferEnd
|
||||
|
||||
queryRunning := query.Where("job.job_state = ?", "running").
|
||||
Where("job.start_time <= ?", stopTimeTail)
|
||||
// Get At Least One Exact Hostname Match from JSON Resources Array in Database
|
||||
queryRunning = queryRunning.Where("EXISTS (SELECT 1 FROM json_each(job.resources) WHERE json_extract(value, '$.hostname') = ?)", hostname)
|
||||
|
||||
query = query.Where("job.job_state != ?").Where("((job.start_time BETWEEN ? AND ?) OR (job.start_time + job.duration) BETWEEN ? AND ? OR (job.start_time < ?) AND (job.start_time + job.duration) > ?)",
|
||||
"running", startTimeTail, stopTimeTail, startTimeFront, stopTimeTail, startTime, stopTime)
|
||||
query = query.Where("job.job_state != ?", "running").
|
||||
Where("job.start_time < ?", stopTimeTail).
|
||||
Where("(job.start_time + job.duration) > ?", startTimeFront)
|
||||
// Get At Least One Exact Hostname Match from JSON Resources Array in Database
|
||||
query = query.Where("EXISTS (SELECT 1 FROM json_each(job.resources) WHERE json_extract(value, '$.hostname') = ?)", hostname)
|
||||
|
||||
rows, err := query.RunWith(r.stmtCache).Query()
|
||||
if err != nil {
|
||||
cclog.Errorf("Error while running query: %v", err)
|
||||
return nil, err
|
||||
cclog.Errorf("Error while running concurrent jobs query: %v", err)
|
||||
return nil, fmt.Errorf("failed to execute concurrent jobs query: %w", err)
|
||||
}
|
||||
defer rows.Close()
|
||||
|
||||
@@ -269,44 +299,44 @@ func (r *JobRepository) FindConcurrentJobs(
|
||||
queryString := fmt.Sprintf("cluster=%s", job.Cluster)
|
||||
|
||||
for rows.Next() {
|
||||
var id, jobId, startTime sql.NullInt64
|
||||
var id, jobID, startTime sql.NullInt64
|
||||
|
||||
if err = rows.Scan(&id, &jobId, &startTime); err != nil {
|
||||
cclog.Warn("Error while scanning rows")
|
||||
return nil, err
|
||||
if err = rows.Scan(&id, &jobID, &startTime); err != nil {
|
||||
cclog.Warnf("Error while scanning concurrent job rows: %v", err)
|
||||
return nil, fmt.Errorf("failed to scan concurrent job row: %w", err)
|
||||
}
|
||||
|
||||
if id.Valid {
|
||||
queryString += fmt.Sprintf("&jobId=%d", int(jobId.Int64))
|
||||
queryString += fmt.Sprintf("&jobId=%d", int(jobID.Int64))
|
||||
items = append(items,
|
||||
&model.JobLink{
|
||||
ID: fmt.Sprint(id.Int64),
|
||||
JobID: int(jobId.Int64),
|
||||
JobID: int(jobID.Int64),
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
rows, err = queryRunning.RunWith(r.stmtCache).Query()
|
||||
if err != nil {
|
||||
cclog.Errorf("Error while running query: %v", err)
|
||||
return nil, err
|
||||
cclog.Errorf("Error while running concurrent running jobs query: %v", err)
|
||||
return nil, fmt.Errorf("failed to execute concurrent running jobs query: %w", err)
|
||||
}
|
||||
defer rows.Close()
|
||||
|
||||
for rows.Next() {
|
||||
var id, jobId, startTime sql.NullInt64
|
||||
var id, jobID, startTime sql.NullInt64
|
||||
|
||||
if err := rows.Scan(&id, &jobId, &startTime); err != nil {
|
||||
cclog.Warn("Error while scanning rows")
|
||||
return nil, err
|
||||
if err := rows.Scan(&id, &jobID, &startTime); err != nil {
|
||||
cclog.Warnf("Error while scanning running concurrent job rows: %v", err)
|
||||
return nil, fmt.Errorf("failed to scan running concurrent job row: %w", err)
|
||||
}
|
||||
|
||||
if id.Valid {
|
||||
queryString += fmt.Sprintf("&jobId=%d", int(jobId.Int64))
|
||||
queryString += fmt.Sprintf("&jobId=%d", int(jobID.Int64))
|
||||
items = append(items,
|
||||
&model.JobLink{
|
||||
ID: fmt.Sprint(id.Int64),
|
||||
JobID: int(jobId.Int64),
|
||||
JobID: int(jobID.Int64),
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
@@ -2,16 +2,45 @@
|
||||
// All rights reserved. This file is part of cc-backend.
|
||||
// Use of this source code is governed by a MIT-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
package repository
|
||||
|
||||
import (
|
||||
"sync"
|
||||
|
||||
"github.com/ClusterCockpit/cc-lib/schema"
|
||||
"github.com/ClusterCockpit/cc-lib/v2/schema"
|
||||
)
|
||||
|
||||
// JobHook interface allows external components to hook into job lifecycle events.
|
||||
// Implementations can perform actions when jobs start or stop, such as tagging,
|
||||
// logging, notifications, or triggering external workflows.
|
||||
//
|
||||
// Example implementation:
|
||||
//
|
||||
// type MyJobTagger struct{}
|
||||
//
|
||||
// func (t *MyJobTagger) JobStartCallback(job *schema.Job) {
|
||||
// if job.NumNodes > 100 {
|
||||
// // Tag large jobs automatically
|
||||
// }
|
||||
// }
|
||||
//
|
||||
// func (t *MyJobTagger) JobStopCallback(job *schema.Job) {
|
||||
// if job.State == schema.JobStateFailed {
|
||||
// // Log or alert on failed jobs
|
||||
// }
|
||||
// }
|
||||
//
|
||||
// Register hooks during application initialization:
|
||||
//
|
||||
// repository.RegisterJobHook(&MyJobTagger{})
|
||||
type JobHook interface {
|
||||
// JobStartCallback is invoked when one or more jobs start.
|
||||
// This is called synchronously, so implementations should be fast.
|
||||
JobStartCallback(job *schema.Job)
|
||||
|
||||
// JobStopCallback is invoked when a job completes.
|
||||
// This is called synchronously, so implementations should be fast.
|
||||
JobStopCallback(job *schema.Job)
|
||||
}
|
||||
|
||||
@@ -20,7 +49,13 @@ var (
|
||||
hooks []JobHook
|
||||
)
|
||||
|
||||
func RegisterJobJook(hook JobHook) {
|
||||
// RegisterJobHook registers a JobHook to receive job lifecycle callbacks.
|
||||
// Multiple hooks can be registered and will be called in registration order.
|
||||
// This function is safe to call multiple times and is typically called during
|
||||
// application initialization.
|
||||
//
|
||||
// Nil hooks are silently ignored to simplify conditional registration.
|
||||
func RegisterJobHook(hook JobHook) {
|
||||
initOnce.Do(func() {
|
||||
hooks = make([]JobHook, 0)
|
||||
})
|
||||
@@ -30,6 +65,12 @@ func RegisterJobJook(hook JobHook) {
|
||||
}
|
||||
}
|
||||
|
||||
// CallJobStartHooks invokes all registered JobHook.JobStartCallback methods
|
||||
// for each job in the provided slice. This is called internally by the repository
|
||||
// when jobs are started (e.g., via StartJob or batch job imports).
|
||||
//
|
||||
// Hooks are called synchronously in registration order. If a hook panics,
|
||||
// the panic will propagate to the caller.
|
||||
func CallJobStartHooks(jobs []*schema.Job) {
|
||||
if hooks == nil {
|
||||
return
|
||||
@@ -44,6 +85,12 @@ func CallJobStartHooks(jobs []*schema.Job) {
|
||||
}
|
||||
}
|
||||
|
||||
// CallJobStopHooks invokes all registered JobHook.JobStopCallback methods
|
||||
// for the provided job. This is called internally by the repository when a
|
||||
// job completes (e.g., via StopJob or job state updates).
|
||||
//
|
||||
// Hooks are called synchronously in registration order. If a hook panics,
|
||||
// the panic will propagate to the caller.
|
||||
func CallJobStopHooks(job *schema.Job) {
|
||||
if hooks == nil {
|
||||
return
|
||||
|
||||
@@ -2,6 +2,10 @@
|
||||
// All rights reserved. This file is part of cc-backend.
|
||||
// Use of this source code is governed by a MIT-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
// Package repository provides job query functionality with filtering, pagination,
|
||||
// and security controls. This file contains the main query builders and security
|
||||
// checks for job retrieval operations.
|
||||
package repository
|
||||
|
||||
import (
|
||||
@@ -14,11 +18,27 @@ import (
|
||||
|
||||
"github.com/ClusterCockpit/cc-backend/internal/config"
|
||||
"github.com/ClusterCockpit/cc-backend/internal/graph/model"
|
||||
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
|
||||
"github.com/ClusterCockpit/cc-lib/schema"
|
||||
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
|
||||
"github.com/ClusterCockpit/cc-lib/v2/schema"
|
||||
sq "github.com/Masterminds/squirrel"
|
||||
)
|
||||
|
||||
const (
|
||||
// Default initial capacity for job result slices
|
||||
defaultJobsCapacity = 50
|
||||
)
|
||||
|
||||
// QueryJobs retrieves jobs from the database with optional filtering, pagination,
|
||||
// and sorting. Security controls are automatically applied based on the user context.
|
||||
//
|
||||
// Parameters:
|
||||
// - ctx: Context containing user authentication information
|
||||
// - filters: Optional job filters (cluster, state, user, time ranges, etc.)
|
||||
// - page: Optional pagination parameters (page number and items per page)
|
||||
// - order: Optional sorting specification (column or footprint field)
|
||||
//
|
||||
// Returns a slice of jobs matching the criteria, or an error if the query fails.
|
||||
// The function enforces role-based access control through SecurityCheck.
|
||||
func (r *JobRepository) QueryJobs(
|
||||
ctx context.Context,
|
||||
filters []*model.JobFilter,
|
||||
@@ -33,26 +53,24 @@ func (r *JobRepository) QueryJobs(
|
||||
if order != nil {
|
||||
field := toSnakeCase(order.Field)
|
||||
if order.Type == "col" {
|
||||
// "col": Fixed column name query
|
||||
switch order.Order {
|
||||
case model.SortDirectionEnumAsc:
|
||||
query = query.OrderBy(fmt.Sprintf("job.%s ASC", field))
|
||||
case model.SortDirectionEnumDesc:
|
||||
query = query.OrderBy(fmt.Sprintf("job.%s DESC", field))
|
||||
default:
|
||||
return nil, errors.New("REPOSITORY/QUERY > invalid sorting order for column")
|
||||
return nil, errors.New("invalid sorting order for column")
|
||||
}
|
||||
} else {
|
||||
// "foot": Order by footprint JSON field values
|
||||
// Verify and Search Only in Valid Jsons
|
||||
query = query.Where("JSON_VALID(meta_data)")
|
||||
// Order by footprint JSON field values
|
||||
query = query.Where("JSON_VALID(footprint)")
|
||||
switch order.Order {
|
||||
case model.SortDirectionEnumAsc:
|
||||
query = query.OrderBy(fmt.Sprintf("JSON_EXTRACT(footprint, \"$.%s\") ASC", field))
|
||||
case model.SortDirectionEnumDesc:
|
||||
query = query.OrderBy(fmt.Sprintf("JSON_EXTRACT(footprint, \"$.%s\") DESC", field))
|
||||
default:
|
||||
return nil, errors.New("REPOSITORY/QUERY > invalid sorting order for footprint")
|
||||
return nil, errors.New("invalid sorting order for footprint")
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -69,29 +87,35 @@ func (r *JobRepository) QueryJobs(
|
||||
rows, err := query.RunWith(r.stmtCache).Query()
|
||||
if err != nil {
|
||||
queryString, queryVars, _ := query.ToSql()
|
||||
cclog.Errorf("Error while running query '%s' %v: %v", queryString, queryVars, err)
|
||||
return nil, err
|
||||
return nil, fmt.Errorf("query failed [%s] %v: %w", queryString, queryVars, err)
|
||||
}
|
||||
defer rows.Close()
|
||||
|
||||
jobs := make([]*schema.Job, 0, 50)
|
||||
jobs := make([]*schema.Job, 0, defaultJobsCapacity)
|
||||
for rows.Next() {
|
||||
job, err := scanJob(rows)
|
||||
if err != nil {
|
||||
rows.Close()
|
||||
cclog.Warn("Error while scanning rows (Jobs)")
|
||||
return nil, err
|
||||
cclog.Warnf("Error scanning job row: %v", err)
|
||||
return nil, fmt.Errorf("failed to scan job row: %w", err)
|
||||
}
|
||||
jobs = append(jobs, job)
|
||||
}
|
||||
|
||||
if err := rows.Err(); err != nil {
|
||||
return nil, fmt.Errorf("error iterating job rows: %w", err)
|
||||
}
|
||||
|
||||
return jobs, nil
|
||||
}
|
||||
|
||||
// CountJobs returns the total number of jobs matching the given filters.
|
||||
// Security controls are automatically applied based on the user context.
|
||||
// Uses DISTINCT count to handle tag filters correctly (jobs may appear multiple
|
||||
// times when joined with the tag table).
|
||||
func (r *JobRepository) CountJobs(
|
||||
ctx context.Context,
|
||||
filters []*model.JobFilter,
|
||||
) (int, error) {
|
||||
// DISTICT count for tags filters, does not affect other queries
|
||||
query, qerr := SecurityCheck(ctx, sq.Select("count(DISTINCT job.id)").From("job"))
|
||||
if qerr != nil {
|
||||
return 0, qerr
|
||||
@@ -103,12 +127,22 @@ func (r *JobRepository) CountJobs(
|
||||
|
||||
var count int
|
||||
if err := query.RunWith(r.DB).Scan(&count); err != nil {
|
||||
return 0, err
|
||||
return 0, fmt.Errorf("failed to count jobs: %w", err)
|
||||
}
|
||||
|
||||
return count, nil
|
||||
}
|
||||
|
||||
// SecurityCheckWithUser applies role-based access control filters to a job query
|
||||
// based on the provided user's roles and permissions.
|
||||
//
|
||||
// Access rules by role:
|
||||
// - API role (exclusive): Full access to all jobs
|
||||
// - Admin/Support roles: Full access to all jobs
|
||||
// - Manager role: Access to jobs in managed projects plus own jobs
|
||||
// - User role: Access only to own jobs
|
||||
//
|
||||
// Returns an error if the user is nil or has no recognized roles.
|
||||
func SecurityCheckWithUser(user *schema.User, query sq.SelectBuilder) (sq.SelectBuilder, error) {
|
||||
if user == nil {
|
||||
var qnil sq.SelectBuilder
|
||||
@@ -116,84 +150,68 @@ func SecurityCheckWithUser(user *schema.User, query sq.SelectBuilder) (sq.Select
|
||||
}
|
||||
|
||||
switch {
|
||||
case len(user.Roles) == 1 && user.HasRole(schema.RoleApi): // API-User : All jobs
|
||||
case len(user.Roles) == 1 && user.HasRole(schema.RoleAPI):
|
||||
return query, nil
|
||||
case user.HasAnyRole([]schema.Role{schema.RoleAdmin, schema.RoleSupport}): // Admin & Support : All jobs
|
||||
case user.HasAnyRole([]schema.Role{schema.RoleAdmin, schema.RoleSupport}):
|
||||
return query, nil
|
||||
case user.HasRole(schema.RoleManager): // Manager : Add filter for managed projects' jobs only + personal jobs
|
||||
case user.HasRole(schema.RoleManager):
|
||||
if len(user.Projects) != 0 {
|
||||
return query.Where(sq.Or{sq.Eq{"job.project": user.Projects}, sq.Eq{"job.hpc_user": user.Username}}), nil
|
||||
} else {
|
||||
cclog.Debugf("Manager-User '%s' has no defined projects to lookup! Query only personal jobs ...", user.Username)
|
||||
return query.Where("job.hpc_user = ?", user.Username), nil
|
||||
}
|
||||
case user.HasRole(schema.RoleUser): // User : Only personal jobs
|
||||
cclog.Debugf("Manager '%s' has no assigned projects, restricting to personal jobs", user.Username)
|
||||
return query.Where("job.hpc_user = ?", user.Username), nil
|
||||
default: // No known Role, return error
|
||||
case user.HasRole(schema.RoleUser):
|
||||
return query.Where("job.hpc_user = ?", user.Username), nil
|
||||
default:
|
||||
var qnil sq.SelectBuilder
|
||||
return qnil, fmt.Errorf("user has no or unknown roles")
|
||||
}
|
||||
}
|
||||
|
||||
// SecurityCheck extracts the user from the context and applies role-based access
|
||||
// control filters to the query. This is a convenience wrapper around SecurityCheckWithUser.
|
||||
func SecurityCheck(ctx context.Context, query sq.SelectBuilder) (sq.SelectBuilder, error) {
|
||||
user := GetUserFromContext(ctx)
|
||||
|
||||
return SecurityCheckWithUser(user, query)
|
||||
}
|
||||
|
||||
// Build a sq.SelectBuilder out of a schema.JobFilter.
|
||||
// BuildWhereClause constructs SQL WHERE conditions from a JobFilter and applies
|
||||
// them to the query. Supports filtering by job properties (cluster, state, user),
|
||||
// time ranges, resource usage, tags, and JSON field searches in meta_data,
|
||||
// footprint, and resources columns.
|
||||
func BuildWhereClause(filter *model.JobFilter, query sq.SelectBuilder) sq.SelectBuilder {
|
||||
if filter.Tags != nil {
|
||||
// This is an OR-Logic query: Returns all distinct jobs with at least one of the requested tags; TODO: AND-Logic query?
|
||||
query = query.Join("jobtag ON jobtag.job_id = job.id").Where(sq.Eq{"jobtag.tag_id": filter.Tags}).Distinct()
|
||||
}
|
||||
// Primary Key
|
||||
if filter.DbID != nil {
|
||||
dbIDs := make([]string, len(filter.DbID))
|
||||
copy(dbIDs, filter.DbID)
|
||||
query = query.Where(sq.Eq{"job.id": dbIDs})
|
||||
}
|
||||
if filter.JobID != nil {
|
||||
query = buildStringCondition("job.job_id", filter.JobID, query)
|
||||
}
|
||||
if filter.ArrayJobID != nil {
|
||||
query = query.Where("job.array_job_id = ?", *filter.ArrayJobID)
|
||||
}
|
||||
if filter.User != nil {
|
||||
query = buildStringCondition("job.hpc_user", filter.User, query)
|
||||
}
|
||||
if filter.Project != nil {
|
||||
query = buildStringCondition("job.project", filter.Project, query)
|
||||
}
|
||||
if filter.JobName != nil {
|
||||
query = buildMetaJsonCondition("jobName", filter.JobName, query)
|
||||
}
|
||||
// Explicit indices
|
||||
if filter.Cluster != nil {
|
||||
query = buildStringCondition("job.cluster", filter.Cluster, query)
|
||||
}
|
||||
if filter.SubCluster != nil {
|
||||
query = buildStringCondition("job.subcluster", filter.SubCluster, query)
|
||||
}
|
||||
if filter.Partition != nil {
|
||||
query = buildStringCondition("job.cluster_partition", filter.Partition, query)
|
||||
}
|
||||
if filter.StartTime != nil {
|
||||
query = buildTimeCondition("job.start_time", filter.StartTime, query)
|
||||
}
|
||||
if filter.Duration != nil {
|
||||
query = buildIntCondition("job.duration", filter.Duration, query)
|
||||
}
|
||||
if filter.MinRunningFor != nil {
|
||||
now := time.Now().Unix() // There does not seam to be a portable way to get the current unix timestamp accross different DBs.
|
||||
query = query.Where("(job.job_state != 'running' OR (? - job.start_time) > ?)", now, *filter.MinRunningFor)
|
||||
}
|
||||
if filter.Shared != nil {
|
||||
query = query.Where("job.shared = ?", *filter.Shared)
|
||||
}
|
||||
if filter.State != nil {
|
||||
states := make([]string, len(filter.State))
|
||||
for i, val := range filter.State {
|
||||
states[i] = string(val)
|
||||
}
|
||||
|
||||
query = query.Where(sq.Eq{"job.job_state": states})
|
||||
}
|
||||
if filter.Shared != nil {
|
||||
query = query.Where("job.shared = ?", *filter.Shared)
|
||||
}
|
||||
if filter.Project != nil {
|
||||
query = buildStringCondition("job.project", filter.Project, query)
|
||||
}
|
||||
if filter.User != nil {
|
||||
query = buildStringCondition("job.hpc_user", filter.User, query)
|
||||
}
|
||||
if filter.NumNodes != nil {
|
||||
query = buildIntCondition("job.num_nodes", filter.NumNodes, query)
|
||||
}
|
||||
@@ -203,33 +221,95 @@ func BuildWhereClause(filter *model.JobFilter, query sq.SelectBuilder) sq.Select
|
||||
if filter.NumHWThreads != nil {
|
||||
query = buildIntCondition("job.num_hwthreads", filter.NumHWThreads, query)
|
||||
}
|
||||
if filter.Node != nil {
|
||||
query = buildResourceJsonCondition("hostname", filter.Node, query)
|
||||
if filter.ArrayJobID != nil {
|
||||
query = query.Where("job.array_job_id = ?", *filter.ArrayJobID)
|
||||
}
|
||||
if filter.StartTime != nil {
|
||||
query = buildTimeCondition("job.start_time", filter.StartTime, query)
|
||||
}
|
||||
if filter.Duration != nil {
|
||||
query = buildIntCondition("job.duration", filter.Duration, query)
|
||||
}
|
||||
if filter.Energy != nil {
|
||||
query = buildFloatCondition("job.energy", filter.Energy, query)
|
||||
}
|
||||
// Indices on Tag Table
|
||||
if filter.Tags != nil {
|
||||
// This is an OR-Logic query: Returns all distinct jobs with at least one of the requested tags; TODO: AND-Logic query?
|
||||
query = query.Join("jobtag ON jobtag.job_id = job.id").Where(sq.Eq{"jobtag.tag_id": filter.Tags}).Distinct()
|
||||
}
|
||||
// No explicit Indices
|
||||
if filter.JobID != nil {
|
||||
query = buildStringCondition("job.job_id", filter.JobID, query)
|
||||
}
|
||||
// Queries Within JSONs
|
||||
if filter.MetricStats != nil {
|
||||
for _, ms := range filter.MetricStats {
|
||||
query = buildFloatJsonCondition(ms.MetricName, ms.Range, query)
|
||||
query = buildFloatJSONCondition(ms.MetricName, ms.Range, query)
|
||||
}
|
||||
}
|
||||
if filter.Node != nil {
|
||||
query = buildResourceJSONCondition("hostname", filter.Node, query)
|
||||
}
|
||||
if filter.JobName != nil {
|
||||
query = buildMetaJSONCondition("jobName", filter.JobName, query)
|
||||
}
|
||||
if filter.Schedule != nil {
|
||||
interactiveJobname := "interactive"
|
||||
switch *filter.Schedule {
|
||||
case "interactive":
|
||||
iFilter := model.StringInput{Eq: &interactiveJobname}
|
||||
query = buildMetaJSONCondition("jobName", &iFilter, query)
|
||||
case "batch":
|
||||
sFilter := model.StringInput{Neq: &interactiveJobname}
|
||||
query = buildMetaJSONCondition("jobName", &sFilter, query)
|
||||
}
|
||||
}
|
||||
|
||||
// Configurable Filter to exclude recently started jobs, see config.go: ShortRunningJobsDuration
|
||||
if filter.MinRunningFor != nil {
|
||||
now := time.Now().Unix()
|
||||
// Only jobs whose start timestamp is more than MinRunningFor seconds in the past
|
||||
// If a job completed within the configured timeframe, it will still show up after the start_time matches the condition!
|
||||
query = query.Where(sq.Lt{"job.start_time": (now - int64(*filter.MinRunningFor))})
|
||||
}
|
||||
return query
|
||||
}
|
||||
|
||||
// buildIntCondition creates clauses for integer range filters, using BETWEEN only if required.
|
||||
func buildIntCondition(field string, cond *config.IntRange, query sq.SelectBuilder) sq.SelectBuilder {
|
||||
return query.Where(field+" BETWEEN ? AND ?", cond.From, cond.To)
|
||||
if cond.From != 1 && cond.To != 0 {
|
||||
return query.Where(field+" BETWEEN ? AND ?", cond.From, cond.To)
|
||||
} else if cond.From != 1 && cond.To == 0 {
|
||||
return query.Where(field+" >= ?", cond.From)
|
||||
} else if cond.From == 1 && cond.To != 0 {
|
||||
return query.Where(field+" <= ?", cond.To)
|
||||
} else {
|
||||
return query
|
||||
}
|
||||
}
|
||||
|
||||
// buildFloatCondition creates a clauses for float range filters, using BETWEEN only if required.
|
||||
func buildFloatCondition(field string, cond *model.FloatRange, query sq.SelectBuilder) sq.SelectBuilder {
|
||||
return query.Where(field+" BETWEEN ? AND ?", cond.From, cond.To)
|
||||
if cond.From != 1.0 && cond.To != 0.0 {
|
||||
return query.Where(field+" BETWEEN ? AND ?", cond.From, cond.To)
|
||||
} else if cond.From != 1.0 && cond.To == 0.0 {
|
||||
return query.Where(field+" >= ?", cond.From)
|
||||
} else if cond.From == 1.0 && cond.To != 0.0 {
|
||||
return query.Where(field+" <= ?", cond.To)
|
||||
} else {
|
||||
return query
|
||||
}
|
||||
}
|
||||
|
||||
// buildTimeCondition creates time range filters supporting absolute timestamps,
|
||||
// relative time ranges (last6h, last24h, last7d, last30d), or open-ended ranges.
|
||||
// Reminder: BETWEEN Queries are slower and dont use indices as frequently: Only use if both conditions required
|
||||
func buildTimeCondition(field string, cond *config.TimeRange, query sq.SelectBuilder) sq.SelectBuilder {
|
||||
if cond.From != nil && cond.To != nil {
|
||||
return query.Where(field+" BETWEEN ? AND ?", cond.From.Unix(), cond.To.Unix())
|
||||
} else if cond.From != nil {
|
||||
return query.Where("? <= "+field, cond.From.Unix())
|
||||
return query.Where(field+" >= ?", cond.From.Unix())
|
||||
} else if cond.To != nil {
|
||||
return query.Where(field+" <= ?", cond.To.Unix())
|
||||
} else if cond.Range != "" {
|
||||
@@ -248,18 +328,28 @@ func buildTimeCondition(field string, cond *config.TimeRange, query sq.SelectBui
|
||||
cclog.Debugf("No known named timeRange: startTime.range = %s", cond.Range)
|
||||
return query
|
||||
}
|
||||
return query.Where(field+" BETWEEN ? AND ?", then, now)
|
||||
return query.Where(field+" >= ?", then)
|
||||
} else {
|
||||
return query
|
||||
}
|
||||
}
|
||||
|
||||
func buildFloatJsonCondition(condName string, condRange *model.FloatRange, query sq.SelectBuilder) sq.SelectBuilder {
|
||||
// Verify and Search Only in Valid Jsons
|
||||
// buildFloatJSONCondition creates a filter on a numeric field within the footprint JSON column, using BETWEEN only if required.
|
||||
func buildFloatJSONCondition(jsonField string, cond *model.FloatRange, query sq.SelectBuilder) sq.SelectBuilder {
|
||||
query = query.Where("JSON_VALID(footprint)")
|
||||
return query.Where("JSON_EXTRACT(footprint, \"$."+condName+"\") BETWEEN ? AND ?", condRange.From, condRange.To)
|
||||
if cond.From != 1.0 && cond.To != 0.0 {
|
||||
return query.Where("JSON_EXTRACT(footprint, \"$."+jsonField+"\") BETWEEN ? AND ?", cond.From, cond.To)
|
||||
} else if cond.From != 1.0 && cond.To == 0.0 {
|
||||
return query.Where("JSON_EXTRACT(footprint, \"$."+jsonField+"\") >= ?", cond.From)
|
||||
} else if cond.From == 1.0 && cond.To != 0.0 {
|
||||
return query.Where("JSON_EXTRACT(footprint, \"$."+jsonField+"\") <= ?", cond.To)
|
||||
} else {
|
||||
return query
|
||||
}
|
||||
}
|
||||
|
||||
// buildStringCondition creates filters for string fields supporting equality,
|
||||
// inequality, prefix, suffix, substring, and IN list matching.
|
||||
func buildStringCondition(field string, cond *model.StringInput, query sq.SelectBuilder) sq.SelectBuilder {
|
||||
if cond.Eq != nil {
|
||||
return query.Where(field+" = ?", *cond.Eq)
|
||||
@@ -284,10 +374,9 @@ func buildStringCondition(field string, cond *model.StringInput, query sq.Select
|
||||
return query
|
||||
}
|
||||
|
||||
func buildMetaJsonCondition(jsonField string, cond *model.StringInput, query sq.SelectBuilder) sq.SelectBuilder {
|
||||
// Verify and Search Only in Valid Jsons
|
||||
// buildMetaJSONCondition creates filters on fields within the meta_data JSON column.
|
||||
func buildMetaJSONCondition(jsonField string, cond *model.StringInput, query sq.SelectBuilder) sq.SelectBuilder {
|
||||
query = query.Where("JSON_VALID(meta_data)")
|
||||
// add "AND" Sql query Block for field match
|
||||
if cond.Eq != nil {
|
||||
return query.Where("JSON_EXTRACT(meta_data, \"$."+jsonField+"\") = ?", *cond.Eq)
|
||||
}
|
||||
@@ -306,10 +395,10 @@ func buildMetaJsonCondition(jsonField string, cond *model.StringInput, query sq.
|
||||
return query
|
||||
}
|
||||
|
||||
func buildResourceJsonCondition(jsonField string, cond *model.StringInput, query sq.SelectBuilder) sq.SelectBuilder {
|
||||
// Verify and Search Only in Valid Jsons
|
||||
// buildResourceJSONCondition creates filters on fields within the resources JSON array column.
|
||||
// Uses json_each to search within array elements.
|
||||
func buildResourceJSONCondition(jsonField string, cond *model.StringInput, query sq.SelectBuilder) sq.SelectBuilder {
|
||||
query = query.Where("JSON_VALID(resources)")
|
||||
// add "AND" Sql query Block for field match
|
||||
if cond.Eq != nil {
|
||||
return query.Where("EXISTS (SELECT 1 FROM json_each(job.resources) WHERE json_extract(value, \"$."+jsonField+"\") = ?)", *cond.Eq)
|
||||
}
|
||||
@@ -333,15 +422,16 @@ var (
|
||||
matchAllCap = regexp.MustCompile("([a-z0-9])([A-Z])")
|
||||
)
|
||||
|
||||
// toSnakeCase converts camelCase strings to snake_case for SQL column names.
|
||||
// Includes security checks to prevent SQL injection attempts.
|
||||
// Panics if potentially dangerous characters are detected.
|
||||
func toSnakeCase(str string) string {
|
||||
for _, c := range str {
|
||||
if c == '\'' || c == '\\' {
|
||||
cclog.Panic("toSnakeCase() attack vector!")
|
||||
if c == '\'' || c == '\\' || c == '"' || c == ';' || c == '-' || c == ' ' {
|
||||
cclog.Panicf("toSnakeCase: potentially dangerous character detected in input: %q", str)
|
||||
}
|
||||
}
|
||||
|
||||
str = strings.ReplaceAll(str, "'", "")
|
||||
str = strings.ReplaceAll(str, "\\", "")
|
||||
snake := matchFirstCap.ReplaceAllString(str, "${1}_${2}")
|
||||
snake = matchAllCap.ReplaceAllString(snake, "${1}_${2}")
|
||||
return strings.ToLower(snake)
|
||||
|
||||
@@ -10,7 +10,7 @@ import (
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"github.com/ClusterCockpit/cc-lib/schema"
|
||||
"github.com/ClusterCockpit/cc-lib/v2/schema"
|
||||
_ "github.com/mattn/go-sqlite3"
|
||||
)
|
||||
|
||||
@@ -33,7 +33,7 @@ func TestFind(t *testing.T) {
|
||||
func TestFindById(t *testing.T) {
|
||||
r := setup(t)
|
||||
|
||||
job, err := r.FindById(getContext(t), 338)
|
||||
job, err := r.FindByID(getContext(t), 338)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
@@ -78,7 +78,7 @@ func TestFindJobsBetween(t *testing.T) {
|
||||
|
||||
// 1. Find a job to use (Find all jobs)
|
||||
// We use a large time range to ensure we get something if it exists
|
||||
jobs, err := r.FindJobsBetween(0, 9999999999, false)
|
||||
jobs, err := r.FindJobsBetween(0, 9999999999, "none")
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
@@ -88,21 +88,21 @@ func TestFindJobsBetween(t *testing.T) {
|
||||
|
||||
targetJob := jobs[0]
|
||||
|
||||
// 2. Create a tag
|
||||
tagName := fmt.Sprintf("testtag_%d", time.Now().UnixNano())
|
||||
tagId, err := r.CreateTag("testtype", tagName, "global")
|
||||
// 2. Create an auto-tagger tag (type "app")
|
||||
appTagName := fmt.Sprintf("apptag_%d", time.Now().UnixNano())
|
||||
appTagID, err := r.CreateTag("app", appTagName, "global")
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
// 3. Link Tag (Manually to avoid archive dependency side-effects in unit test)
|
||||
_, err = r.DB.Exec("INSERT INTO jobtag (job_id, tag_id) VALUES (?, ?)", *targetJob.ID, tagId)
|
||||
// 3. Link auto-tagger tag to job
|
||||
_, err = r.DB.Exec("INSERT INTO jobtag (job_id, tag_id) VALUES (?, ?)", *targetJob.ID, appTagID)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
// 4. Search with omitTagged = false (Should find the job)
|
||||
jobsFound, err := r.FindJobsBetween(0, 9999999999, false)
|
||||
// 4. Search with omitTagged = "none" (Should find the job)
|
||||
jobsFound, err := r.FindJobsBetween(0, 9999999999, "none")
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
@@ -115,18 +115,58 @@ func TestFindJobsBetween(t *testing.T) {
|
||||
}
|
||||
}
|
||||
if !found {
|
||||
t.Errorf("Target job %d should be found when omitTagged=false", *targetJob.ID)
|
||||
t.Errorf("Target job %d should be found when omitTagged=none", *targetJob.ID)
|
||||
}
|
||||
|
||||
// 5. Search with omitTagged = true (Should NOT find the job)
|
||||
jobsFiltered, err := r.FindJobsBetween(0, 9999999999, true)
|
||||
// 5. Search with omitTagged = "all" (Should NOT find the job — it has a tag)
|
||||
jobsFiltered, err := r.FindJobsBetween(0, 9999999999, "all")
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
for _, j := range jobsFiltered {
|
||||
if *j.ID == *targetJob.ID {
|
||||
t.Errorf("Target job %d should NOT be found when omitTagged=true", *targetJob.ID)
|
||||
t.Errorf("Target job %d should NOT be found when omitTagged=all", *targetJob.ID)
|
||||
}
|
||||
}
|
||||
|
||||
// 6. Search with omitTagged = "user": auto-tagger tag ("app") should NOT exclude the job
|
||||
jobsUserFilter, err := r.FindJobsBetween(0, 9999999999, "user")
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
found = false
|
||||
for _, j := range jobsUserFilter {
|
||||
if *j.ID == *targetJob.ID {
|
||||
found = true
|
||||
break
|
||||
}
|
||||
}
|
||||
if !found {
|
||||
t.Errorf("Target job %d should be found when omitTagged=user (only has auto-tagger tag)", *targetJob.ID)
|
||||
}
|
||||
|
||||
// 7. Add a user-created tag (type "testtype") to the same job
|
||||
userTagName := fmt.Sprintf("usertag_%d", time.Now().UnixNano())
|
||||
userTagID, err := r.CreateTag("testtype", userTagName, "global")
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
_, err = r.DB.Exec("INSERT INTO jobtag (job_id, tag_id) VALUES (?, ?)", *targetJob.ID, userTagID)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
// 8. Now omitTagged = "user" should exclude the job (has a user-created tag)
|
||||
jobsUserFilter2, err := r.FindJobsBetween(0, 9999999999, "user")
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
for _, j := range jobsUserFilter2 {
|
||||
if *j.ID == *targetJob.ID {
|
||||
t.Errorf("Target job %d should NOT be found when omitTagged=user (has user-created tag)", *targetJob.ID)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -10,52 +10,48 @@ import (
|
||||
"embed"
|
||||
"fmt"
|
||||
|
||||
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
|
||||
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
|
||||
"github.com/golang-migrate/migrate/v4"
|
||||
"github.com/golang-migrate/migrate/v4/database/mysql"
|
||||
"github.com/golang-migrate/migrate/v4/database/sqlite3"
|
||||
"github.com/golang-migrate/migrate/v4/source/iofs"
|
||||
)
|
||||
|
||||
// Version is the current database schema version required by this version of cc-backend.
|
||||
// When the database schema changes, this version is incremented and a new migration file
|
||||
// is added to internal/repository/migrations/sqlite3/.
|
||||
//
|
||||
// Version history:
|
||||
// - Version 10: Current version
|
||||
//
|
||||
// Migration files are embedded at build time from the migrations directory.
|
||||
const Version uint = 10
|
||||
|
||||
//go:embed migrations/*
|
||||
var migrationFiles embed.FS
|
||||
|
||||
func checkDBVersion(backend string, db *sql.DB) error {
|
||||
var m *migrate.Migrate
|
||||
// checkDBVersion verifies that the database schema version matches the expected version.
|
||||
// This is called automatically during Connect() to ensure schema compatibility.
|
||||
//
|
||||
// Returns an error if:
|
||||
// - Database version is older than expected (needs migration)
|
||||
// - Database version is newer than expected (needs app upgrade)
|
||||
// - Database is in a dirty state (failed migration)
|
||||
//
|
||||
// A "dirty" database indicates a migration was started but not completed successfully.
|
||||
// This requires manual intervention to fix the database and force the version.
|
||||
func checkDBVersion(db *sql.DB) error {
|
||||
driver, err := sqlite3.WithInstance(db, &sqlite3.Config{})
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
d, err := iofs.New(migrationFiles, "migrations/sqlite3")
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
switch backend {
|
||||
case "sqlite3":
|
||||
driver, err := sqlite3.WithInstance(db, &sqlite3.Config{})
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
d, err := iofs.New(migrationFiles, "migrations/sqlite3")
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
m, err = migrate.NewWithInstance("iofs", d, "sqlite3", driver)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
case "mysql":
|
||||
driver, err := mysql.WithInstance(db, &mysql.Config{})
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
d, err := iofs.New(migrationFiles, "migrations/mysql")
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
m, err = migrate.NewWithInstance("iofs", d, "mysql", driver)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
default:
|
||||
cclog.Abortf("Migration: Unsupported database backend '%s'.\n", backend)
|
||||
m, err := migrate.NewWithInstance("iofs", d, "sqlite3", driver)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
v, dirty, err := m.Version()
|
||||
@@ -80,37 +76,41 @@ func checkDBVersion(backend string, db *sql.DB) error {
|
||||
return nil
|
||||
}
|
||||
|
||||
func getMigrateInstance(backend string, db string) (m *migrate.Migrate, err error) {
|
||||
switch backend {
|
||||
case "sqlite3":
|
||||
d, err := iofs.New(migrationFiles, "migrations/sqlite3")
|
||||
if err != nil {
|
||||
cclog.Fatal(err)
|
||||
}
|
||||
// getMigrateInstance creates a new migration instance for the given database file.
|
||||
// This is used internally by MigrateDB, RevertDB, and ForceDB.
|
||||
func getMigrateInstance(db string) (m *migrate.Migrate, err error) {
|
||||
d, err := iofs.New(migrationFiles, "migrations/sqlite3")
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
m, err = migrate.NewWithSourceInstance("iofs", d, fmt.Sprintf("sqlite3://%s?_foreign_keys=on", db))
|
||||
if err != nil {
|
||||
return m, err
|
||||
}
|
||||
case "mysql":
|
||||
d, err := iofs.New(migrationFiles, "migrations/mysql")
|
||||
if err != nil {
|
||||
return m, err
|
||||
}
|
||||
|
||||
m, err = migrate.NewWithSourceInstance("iofs", d, fmt.Sprintf("mysql://%s?multiStatements=true", db))
|
||||
if err != nil {
|
||||
return m, err
|
||||
}
|
||||
default:
|
||||
cclog.Abortf("Migration: Unsupported database backend '%s'.\n", backend)
|
||||
m, err = migrate.NewWithSourceInstance("iofs", d, fmt.Sprintf("sqlite3://%s?_foreign_keys=on", db))
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
return m, nil
|
||||
}
|
||||
|
||||
func MigrateDB(backend string, db string) error {
|
||||
m, err := getMigrateInstance(backend, db)
|
||||
// MigrateDB applies all pending database migrations to bring the schema up to date.
|
||||
// This should be run with the -migrate-db flag before starting the application
|
||||
// after upgrading to a new version that requires schema changes.
|
||||
//
|
||||
// Process:
|
||||
// 1. Checks current database version
|
||||
// 2. Applies all migrations from current version to target Version
|
||||
// 3. Updates schema_migrations table to track applied migrations
|
||||
//
|
||||
// Important:
|
||||
// - Always backup your database before running migrations
|
||||
// - Migrations are irreversible without manual intervention
|
||||
// - If a migration fails, the database is marked "dirty" and requires manual fix
|
||||
//
|
||||
// Usage:
|
||||
//
|
||||
// cc-backend -migrate-db
|
||||
func MigrateDB(db string) error {
|
||||
m, err := getMigrateInstance(db)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
@@ -118,7 +118,7 @@ func MigrateDB(backend string, db string) error {
|
||||
v, dirty, err := m.Version()
|
||||
if err != nil {
|
||||
if err == migrate.ErrNilVersion {
|
||||
cclog.Warn("Legacy database without version or missing database file!")
|
||||
cclog.Info("Legacy database without version or missing database file!")
|
||||
} else {
|
||||
return err
|
||||
}
|
||||
@@ -144,8 +144,19 @@ func MigrateDB(backend string, db string) error {
|
||||
return nil
|
||||
}
|
||||
|
||||
func RevertDB(backend string, db string) error {
|
||||
m, err := getMigrateInstance(backend, db)
|
||||
// RevertDB rolls back the database schema to the previous version (Version - 1).
|
||||
// This is primarily used for testing or emergency rollback scenarios.
|
||||
//
|
||||
// Warning:
|
||||
// - This may cause data loss if newer schema added columns/tables
|
||||
// - Always backup before reverting
|
||||
// - Not all migrations are safely reversible
|
||||
//
|
||||
// Usage:
|
||||
//
|
||||
// cc-backend -revert-db
|
||||
func RevertDB(db string) error {
|
||||
m, err := getMigrateInstance(db)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
@@ -162,8 +173,23 @@ func RevertDB(backend string, db string) error {
|
||||
return nil
|
||||
}
|
||||
|
||||
func ForceDB(backend string, db string) error {
|
||||
m, err := getMigrateInstance(backend, db)
|
||||
// ForceDB forces the database schema version to the current Version without running migrations.
|
||||
// This is only used to recover from failed migrations that left the database in a "dirty" state.
|
||||
//
|
||||
// When to use:
|
||||
// - After manually fixing a failed migration
|
||||
// - When you've manually applied schema changes and need to update the version marker
|
||||
//
|
||||
// Warning:
|
||||
// - This does NOT apply any schema changes
|
||||
// - Only use after manually verifying the schema is correct
|
||||
// - Improper use can cause schema/version mismatch
|
||||
//
|
||||
// Usage:
|
||||
//
|
||||
// cc-backend -force-db
|
||||
func ForceDB(db string) error {
|
||||
m, err := getMigrateInstance(db)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
@@ -1,5 +0,0 @@
|
||||
DROP TABLE IF EXISTS job;
|
||||
DROP TABLE IF EXISTS tags;
|
||||
DROP TABLE IF EXISTS jobtag;
|
||||
DROP TABLE IF EXISTS configuration;
|
||||
DROP TABLE IF EXISTS user;
|
||||
@@ -1,66 +0,0 @@
|
||||
CREATE TABLE IF NOT EXISTS job (
|
||||
id INTEGER AUTO_INCREMENT PRIMARY KEY ,
|
||||
job_id BIGINT NOT NULL,
|
||||
cluster VARCHAR(255) NOT NULL,
|
||||
subcluster VARCHAR(255) NOT NULL,
|
||||
start_time BIGINT NOT NULL, -- Unix timestamp
|
||||
|
||||
user VARCHAR(255) NOT NULL,
|
||||
project VARCHAR(255) NOT NULL,
|
||||
`partition` VARCHAR(255) NOT NULL,
|
||||
array_job_id BIGINT NOT NULL,
|
||||
duration INT NOT NULL DEFAULT 0,
|
||||
walltime INT NOT NULL DEFAULT 0,
|
||||
job_state VARCHAR(255) NOT NULL
|
||||
CHECK(job_state IN ('running', 'completed', 'failed', 'cancelled',
|
||||
'stopped', 'timeout', 'preempted', 'out_of_memory')),
|
||||
meta_data TEXT, -- JSON
|
||||
resources TEXT NOT NULL, -- JSON
|
||||
|
||||
num_nodes INT NOT NULL,
|
||||
num_hwthreads INT NOT NULL,
|
||||
num_acc INT NOT NULL,
|
||||
smt TINYINT NOT NULL DEFAULT 1 CHECK(smt IN (0, 1 )),
|
||||
exclusive TINYINT NOT NULL DEFAULT 1 CHECK(exclusive IN (0, 1, 2)),
|
||||
monitoring_status TINYINT NOT NULL DEFAULT 1 CHECK(monitoring_status IN (0, 1, 2, 3)),
|
||||
|
||||
mem_used_max REAL NOT NULL DEFAULT 0.0,
|
||||
flops_any_avg REAL NOT NULL DEFAULT 0.0,
|
||||
mem_bw_avg REAL NOT NULL DEFAULT 0.0,
|
||||
load_avg REAL NOT NULL DEFAULT 0.0,
|
||||
net_bw_avg REAL NOT NULL DEFAULT 0.0,
|
||||
net_data_vol_total REAL NOT NULL DEFAULT 0.0,
|
||||
file_bw_avg REAL NOT NULL DEFAULT 0.0,
|
||||
file_data_vol_total REAL NOT NULL DEFAULT 0.0,
|
||||
UNIQUE (job_id, cluster, start_time)
|
||||
);
|
||||
|
||||
CREATE TABLE IF NOT EXISTS tag (
|
||||
id INTEGER PRIMARY KEY,
|
||||
tag_type VARCHAR(255) NOT NULL,
|
||||
tag_name VARCHAR(255) NOT NULL,
|
||||
UNIQUE (tag_type, tag_name));
|
||||
|
||||
CREATE TABLE IF NOT EXISTS jobtag (
|
||||
job_id INTEGER,
|
||||
tag_id INTEGER,
|
||||
PRIMARY KEY (job_id, tag_id),
|
||||
FOREIGN KEY (job_id) REFERENCES job (id) ON DELETE CASCADE,
|
||||
FOREIGN KEY (tag_id) REFERENCES tag (id) ON DELETE CASCADE);
|
||||
|
||||
CREATE TABLE IF NOT EXISTS user (
|
||||
username varchar(255) PRIMARY KEY NOT NULL,
|
||||
password varchar(255) DEFAULT NULL,
|
||||
ldap tinyint NOT NULL DEFAULT 0, /* col called "ldap" for historic reasons, fills the "AuthSource" */
|
||||
name varchar(255) DEFAULT NULL,
|
||||
roles varchar(255) NOT NULL DEFAULT "[]",
|
||||
email varchar(255) DEFAULT NULL);
|
||||
|
||||
CREATE TABLE IF NOT EXISTS configuration (
|
||||
username varchar(255),
|
||||
confkey varchar(255),
|
||||
value varchar(255),
|
||||
PRIMARY KEY (username, confkey),
|
||||
FOREIGN KEY (username) REFERENCES user (username) ON DELETE CASCADE ON UPDATE NO ACTION);
|
||||
|
||||
|
||||
@@ -1,8 +0,0 @@
|
||||
DROP INDEX IF EXISTS job_stats;
|
||||
DROP INDEX IF EXISTS job_by_user;
|
||||
DROP INDEX IF EXISTS job_by_starttime;
|
||||
DROP INDEX IF EXISTS job_by_job_id;
|
||||
DROP INDEX IF EXISTS job_list;
|
||||
DROP INDEX IF EXISTS job_list_user;
|
||||
DROP INDEX IF EXISTS job_list_users;
|
||||
DROP INDEX IF EXISTS job_list_users_start;
|
||||
@@ -1,8 +0,0 @@
|
||||
CREATE INDEX IF NOT EXISTS job_stats ON job (cluster,subcluster,user);
|
||||
CREATE INDEX IF NOT EXISTS job_by_user ON job (user);
|
||||
CREATE INDEX IF NOT EXISTS job_by_starttime ON job (start_time);
|
||||
CREATE INDEX IF NOT EXISTS job_by_job_id ON job (job_id);
|
||||
CREATE INDEX IF NOT EXISTS job_list ON job (cluster, job_state);
|
||||
CREATE INDEX IF NOT EXISTS job_list_user ON job (user, cluster, job_state);
|
||||
CREATE INDEX IF NOT EXISTS job_list_users ON job (user, job_state);
|
||||
CREATE INDEX IF NOT EXISTS job_list_users_start ON job (start_time, user, job_state);
|
||||
@@ -1 +0,0 @@
|
||||
ALTER TABLE user DROP COLUMN projects;
|
||||
@@ -1 +0,0 @@
|
||||
ALTER TABLE user ADD COLUMN projects varchar(255) NOT NULL DEFAULT "[]";
|
||||
@@ -1,5 +0,0 @@
|
||||
ALTER TABLE job
|
||||
MODIFY `partition` VARCHAR(255) NOT NULL,
|
||||
MODIFY array_job_id BIGINT NOT NULL,
|
||||
MODIFY num_hwthreads INT NOT NULL,
|
||||
MODIFY num_acc INT NOT NULL;
|
||||
@@ -1,5 +0,0 @@
|
||||
ALTER TABLE job
|
||||
MODIFY `partition` VARCHAR(255),
|
||||
MODIFY array_job_id BIGINT,
|
||||
MODIFY num_hwthreads INT,
|
||||
MODIFY num_acc INT;
|
||||
@@ -1,2 +0,0 @@
|
||||
ALTER TABLE tag DROP COLUMN insert_time;
|
||||
ALTER TABLE jobtag DROP COLUMN insert_time;
|
||||
@@ -1,2 +0,0 @@
|
||||
ALTER TABLE tag ADD COLUMN insert_time TIMESTAMP DEFAULT CURRENT_TIMESTAMP;
|
||||
ALTER TABLE jobtag ADD COLUMN insert_time TIMESTAMP DEFAULT CURRENT_TIMESTAMP;
|
||||
@@ -1 +0,0 @@
|
||||
ALTER TABLE configuration MODIFY value VARCHAR(255);
|
||||
@@ -1 +0,0 @@
|
||||
ALTER TABLE configuration MODIFY value TEXT;
|
||||
@@ -1,3 +0,0 @@
|
||||
SET FOREIGN_KEY_CHECKS = 0;
|
||||
ALTER TABLE tag MODIFY id INTEGER;
|
||||
SET FOREIGN_KEY_CHECKS = 1;
|
||||
@@ -1,3 +0,0 @@
|
||||
SET FOREIGN_KEY_CHECKS = 0;
|
||||
ALTER TABLE tag MODIFY id INTEGER AUTO_INCREMENT;
|
||||
SET FOREIGN_KEY_CHECKS = 1;
|
||||
@@ -1,83 +0,0 @@
|
||||
ALTER TABLE job DROP energy;
|
||||
ALTER TABLE job DROP energy_footprint;
|
||||
ALTER TABLE job ADD COLUMN flops_any_avg;
|
||||
ALTER TABLE job ADD COLUMN mem_bw_avg;
|
||||
ALTER TABLE job ADD COLUMN mem_used_max;
|
||||
ALTER TABLE job ADD COLUMN load_avg;
|
||||
ALTER TABLE job ADD COLUMN net_bw_avg;
|
||||
ALTER TABLE job ADD COLUMN net_data_vol_total;
|
||||
ALTER TABLE job ADD COLUMN file_bw_avg;
|
||||
ALTER TABLE job ADD COLUMN file_data_vol_total;
|
||||
|
||||
UPDATE job SET flops_any_avg = json_extract(footprint, '$.flops_any_avg');
|
||||
UPDATE job SET mem_bw_avg = json_extract(footprint, '$.mem_bw_avg');
|
||||
UPDATE job SET mem_used_max = json_extract(footprint, '$.mem_used_max');
|
||||
UPDATE job SET load_avg = json_extract(footprint, '$.cpu_load_avg');
|
||||
UPDATE job SET net_bw_avg = json_extract(footprint, '$.net_bw_avg');
|
||||
UPDATE job SET net_data_vol_total = json_extract(footprint, '$.net_data_vol_total');
|
||||
UPDATE job SET file_bw_avg = json_extract(footprint, '$.file_bw_avg');
|
||||
UPDATE job SET file_data_vol_total = json_extract(footprint, '$.file_data_vol_total');
|
||||
|
||||
ALTER TABLE job DROP footprint;
|
||||
-- Do not use reserved keywords anymore
|
||||
RENAME TABLE hpc_user TO `user`;
|
||||
ALTER TABLE job RENAME COLUMN hpc_user TO `user`;
|
||||
ALTER TABLE job RENAME COLUMN cluster_partition TO `partition`;
|
||||
|
||||
DROP INDEX IF EXISTS jobs_cluster;
|
||||
DROP INDEX IF EXISTS jobs_cluster_user;
|
||||
DROP INDEX IF EXISTS jobs_cluster_project;
|
||||
DROP INDEX IF EXISTS jobs_cluster_subcluster;
|
||||
DROP INDEX IF EXISTS jobs_cluster_starttime;
|
||||
DROP INDEX IF EXISTS jobs_cluster_duration;
|
||||
DROP INDEX IF EXISTS jobs_cluster_numnodes;
|
||||
|
||||
DROP INDEX IF EXISTS jobs_cluster_partition;
|
||||
DROP INDEX IF EXISTS jobs_cluster_partition_starttime;
|
||||
DROP INDEX IF EXISTS jobs_cluster_partition_duration;
|
||||
DROP INDEX IF EXISTS jobs_cluster_partition_numnodes;
|
||||
|
||||
DROP INDEX IF EXISTS jobs_cluster_partition_jobstate;
|
||||
DROP INDEX IF EXISTS jobs_cluster_partition_jobstate_user;
|
||||
DROP INDEX IF EXISTS jobs_cluster_partition_jobstate_project;
|
||||
DROP INDEX IF EXISTS jobs_cluster_partition_jobstate_starttime;
|
||||
DROP INDEX IF EXISTS jobs_cluster_partition_jobstate_duration;
|
||||
DROP INDEX IF EXISTS jobs_cluster_partition_jobstate_numnodes;
|
||||
|
||||
DROP INDEX IF EXISTS jobs_cluster_jobstate;
|
||||
DROP INDEX IF EXISTS jobs_cluster_jobstate_user;
|
||||
DROP INDEX IF EXISTS jobs_cluster_jobstate_project;
|
||||
|
||||
DROP INDEX IF EXISTS jobs_cluster_jobstate_starttime;
|
||||
DROP INDEX IF EXISTS jobs_cluster_jobstate_duration;
|
||||
DROP INDEX IF EXISTS jobs_cluster_jobstate_numnodes;
|
||||
|
||||
DROP INDEX IF EXISTS jobs_user;
|
||||
DROP INDEX IF EXISTS jobs_user_starttime;
|
||||
DROP INDEX IF EXISTS jobs_user_duration;
|
||||
DROP INDEX IF EXISTS jobs_user_numnodes;
|
||||
|
||||
DROP INDEX IF EXISTS jobs_project;
|
||||
DROP INDEX IF EXISTS jobs_project_user;
|
||||
DROP INDEX IF EXISTS jobs_project_starttime;
|
||||
DROP INDEX IF EXISTS jobs_project_duration;
|
||||
DROP INDEX IF EXISTS jobs_project_numnodes;
|
||||
|
||||
DROP INDEX IF EXISTS jobs_jobstate;
|
||||
DROP INDEX IF EXISTS jobs_jobstate_user;
|
||||
DROP INDEX IF EXISTS jobs_jobstate_project;
|
||||
DROP INDEX IF EXISTS jobs_jobstate_starttime;
|
||||
DROP INDEX IF EXISTS jobs_jobstate_duration;
|
||||
DROP INDEX IF EXISTS jobs_jobstate_numnodes;
|
||||
|
||||
DROP INDEX IF EXISTS jobs_arrayjobid_starttime;
|
||||
DROP INDEX IF EXISTS jobs_cluster_arrayjobid_starttime;
|
||||
|
||||
DROP INDEX IF EXISTS jobs_starttime;
|
||||
DROP INDEX IF EXISTS jobs_duration;
|
||||
DROP INDEX IF EXISTS jobs_numnodes;
|
||||
|
||||
DROP INDEX IF EXISTS jobs_duration_starttime;
|
||||
DROP INDEX IF EXISTS jobs_numnodes_starttime;
|
||||
DROP INDEX IF EXISTS jobs_numacc_starttime;
|
||||
DROP INDEX IF EXISTS jobs_energy_starttime;
|
||||
@@ -1,123 +0,0 @@
|
||||
DROP INDEX IF EXISTS job_stats ON job;
|
||||
DROP INDEX IF EXISTS job_by_user ON job;
|
||||
DROP INDEX IF EXISTS job_by_starttime ON job;
|
||||
DROP INDEX IF EXISTS job_by_job_id ON job;
|
||||
DROP INDEX IF EXISTS job_list ON job;
|
||||
DROP INDEX IF EXISTS job_list_user ON job;
|
||||
DROP INDEX IF EXISTS job_list_users ON job;
|
||||
DROP INDEX IF EXISTS job_list_users_start ON job;
|
||||
|
||||
ALTER TABLE job ADD COLUMN energy REAL NOT NULL DEFAULT 0.0;
|
||||
ALTER TABLE job ADD COLUMN energy_footprint JSON;
|
||||
|
||||
ALTER TABLE job ADD COLUMN footprint JSON;
|
||||
ALTER TABLE tag ADD COLUMN tag_scope TEXT NOT NULL DEFAULT 'global';
|
||||
|
||||
-- Do not use reserved keywords anymore
|
||||
RENAME TABLE `user` TO hpc_user;
|
||||
ALTER TABLE job RENAME COLUMN `user` TO hpc_user;
|
||||
ALTER TABLE job RENAME COLUMN `partition` TO cluster_partition;
|
||||
|
||||
ALTER TABLE job MODIFY COLUMN cluster VARCHAR(50);
|
||||
ALTER TABLE job MODIFY COLUMN hpc_user VARCHAR(50);
|
||||
ALTER TABLE job MODIFY COLUMN subcluster VARCHAR(50);
|
||||
ALTER TABLE job MODIFY COLUMN project VARCHAR(50);
|
||||
ALTER TABLE job MODIFY COLUMN cluster_partition VARCHAR(50);
|
||||
ALTER TABLE job MODIFY COLUMN job_state VARCHAR(25);
|
||||
|
||||
UPDATE job SET footprint = '{"flops_any_avg": 0.0}';
|
||||
UPDATE job SET footprint = json_replace(footprint, '$.flops_any_avg', job.flops_any_avg);
|
||||
UPDATE job SET footprint = json_insert(footprint, '$.mem_bw_avg', job.mem_bw_avg);
|
||||
UPDATE job SET footprint = json_insert(footprint, '$.mem_used_max', job.mem_used_max);
|
||||
UPDATE job SET footprint = json_insert(footprint, '$.cpu_load_avg', job.load_avg);
|
||||
UPDATE job SET footprint = json_insert(footprint, '$.net_bw_avg', job.net_bw_avg) WHERE job.net_bw_avg != 0;
|
||||
UPDATE job SET footprint = json_insert(footprint, '$.net_data_vol_total', job.net_data_vol_total) WHERE job.net_data_vol_total != 0;
|
||||
UPDATE job SET footprint = json_insert(footprint, '$.file_bw_avg', job.file_bw_avg) WHERE job.file_bw_avg != 0;
|
||||
UPDATE job SET footprint = json_insert(footprint, '$.file_data_vol_total', job.file_data_vol_total) WHERE job.file_data_vol_total != 0;
|
||||
|
||||
ALTER TABLE job DROP flops_any_avg;
|
||||
ALTER TABLE job DROP mem_bw_avg;
|
||||
ALTER TABLE job DROP mem_used_max;
|
||||
ALTER TABLE job DROP load_avg;
|
||||
ALTER TABLE job DROP net_bw_avg;
|
||||
ALTER TABLE job DROP net_data_vol_total;
|
||||
ALTER TABLE job DROP file_bw_avg;
|
||||
ALTER TABLE job DROP file_data_vol_total;
|
||||
|
||||
-- Indices for: Single filters, combined filters, sorting, sorting with filters
|
||||
-- Cluster Filter
|
||||
CREATE INDEX IF NOT EXISTS jobs_cluster ON job (cluster);
|
||||
CREATE INDEX IF NOT EXISTS jobs_cluster_user ON job (cluster, hpc_user);
|
||||
CREATE INDEX IF NOT EXISTS jobs_cluster_project ON job (cluster, project);
|
||||
CREATE INDEX IF NOT EXISTS jobs_cluster_subcluster ON job (cluster, subcluster);
|
||||
-- Cluster Filter Sorting
|
||||
CREATE INDEX IF NOT EXISTS jobs_cluster_starttime ON job (cluster, start_time);
|
||||
CREATE INDEX IF NOT EXISTS jobs_cluster_duration ON job (cluster, duration);
|
||||
CREATE INDEX IF NOT EXISTS jobs_cluster_numnodes ON job (cluster, num_nodes);
|
||||
|
||||
-- Cluster+Partition Filter
|
||||
CREATE INDEX IF NOT EXISTS jobs_cluster_partition ON job (cluster, cluster_partition);
|
||||
-- Cluster+Partition Filter Sorting
|
||||
CREATE INDEX IF NOT EXISTS jobs_cluster_partition_starttime ON job (cluster, cluster_partition, start_time);
|
||||
CREATE INDEX IF NOT EXISTS jobs_cluster_partition_duration ON job (cluster, cluster_partition, duration);
|
||||
CREATE INDEX IF NOT EXISTS jobs_cluster_partition_numnodes ON job (cluster, cluster_partition, num_nodes);
|
||||
|
||||
-- Cluster+Partition+Jobstate Filter
|
||||
CREATE INDEX IF NOT EXISTS jobs_cluster_partition_jobstate ON job (cluster, cluster_partition, job_state);
|
||||
CREATE INDEX IF NOT EXISTS jobs_cluster_partition_jobstate_user ON job (cluster, cluster_partition, job_state, hpc_user);
|
||||
CREATE INDEX IF NOT EXISTS jobs_cluster_partition_jobstate_project ON job (cluster, cluster_partition, job_state, project);
|
||||
-- Cluster+Partition+Jobstate Filter Sorting
|
||||
CREATE INDEX IF NOT EXISTS jobs_cluster_partition_jobstate_starttime ON job (cluster, cluster_partition, job_state, start_time);
|
||||
CREATE INDEX IF NOT EXISTS jobs_cluster_partition_jobstate_duration ON job (cluster, cluster_partition, job_state, duration);
|
||||
CREATE INDEX IF NOT EXISTS jobs_cluster_partition_jobstate_numnodes ON job (cluster, cluster_partition, job_state, num_nodes);
|
||||
|
||||
-- Cluster+JobState Filter
|
||||
CREATE INDEX IF NOT EXISTS jobs_cluster_jobstate ON job (cluster, job_state);
|
||||
CREATE INDEX IF NOT EXISTS jobs_cluster_jobstate_user ON job (cluster, job_state, hpc_user);
|
||||
CREATE INDEX IF NOT EXISTS jobs_cluster_jobstate_project ON job (cluster, job_state, project);
|
||||
-- Cluster+JobState Filter Sorting
|
||||
CREATE INDEX IF NOT EXISTS jobs_cluster_jobstate_starttime ON job (cluster, job_state, start_time);
|
||||
CREATE INDEX IF NOT EXISTS jobs_cluster_jobstate_duration ON job (cluster, job_state, duration);
|
||||
CREATE INDEX IF NOT EXISTS jobs_cluster_jobstate_numnodes ON job (cluster, job_state, num_nodes);
|
||||
|
||||
-- User Filter
|
||||
CREATE INDEX IF NOT EXISTS jobs_user ON job (hpc_user);
|
||||
-- User Filter Sorting
|
||||
CREATE INDEX IF NOT EXISTS jobs_user_starttime ON job (hpc_user, start_time);
|
||||
CREATE INDEX IF NOT EXISTS jobs_user_duration ON job (hpc_user, duration);
|
||||
CREATE INDEX IF NOT EXISTS jobs_user_numnodes ON job (hpc_user, num_nodes);
|
||||
|
||||
-- Project Filter
|
||||
CREATE INDEX IF NOT EXISTS jobs_project ON job (project);
|
||||
CREATE INDEX IF NOT EXISTS jobs_project_user ON job (project, hpc_user);
|
||||
-- Project Filter Sorting
|
||||
CREATE INDEX IF NOT EXISTS jobs_project_starttime ON job (project, start_time);
|
||||
CREATE INDEX IF NOT EXISTS jobs_project_duration ON job (project, duration);
|
||||
CREATE INDEX IF NOT EXISTS jobs_project_numnodes ON job (project, num_nodes);
|
||||
|
||||
-- JobState Filter
|
||||
CREATE INDEX IF NOT EXISTS jobs_jobstate ON job (job_state);
|
||||
CREATE INDEX IF NOT EXISTS jobs_jobstate_user ON job (job_state, hpc_user);
|
||||
CREATE INDEX IF NOT EXISTS jobs_jobstate_project ON job (job_state, project);
|
||||
CREATE INDEX IF NOT EXISTS jobs_jobstate_cluster ON job (job_state, cluster);
|
||||
-- JobState Filter Sorting
|
||||
CREATE INDEX IF NOT EXISTS jobs_jobstate_starttime ON job (job_state, start_time);
|
||||
CREATE INDEX IF NOT EXISTS jobs_jobstate_duration ON job (job_state, duration);
|
||||
CREATE INDEX IF NOT EXISTS jobs_jobstate_numnodes ON job (job_state, num_nodes);
|
||||
|
||||
-- ArrayJob Filter
|
||||
CREATE INDEX IF NOT EXISTS jobs_arrayjobid_starttime ON job (array_job_id, start_time);
|
||||
CREATE INDEX IF NOT EXISTS jobs_cluster_arrayjobid_starttime ON job (cluster, array_job_id, start_time);
|
||||
|
||||
-- Sorting without active filters
|
||||
CREATE INDEX IF NOT EXISTS jobs_starttime ON job (start_time);
|
||||
CREATE INDEX IF NOT EXISTS jobs_duration ON job (duration);
|
||||
CREATE INDEX IF NOT EXISTS jobs_numnodes ON job (num_nodes);
|
||||
|
||||
-- Single filters with default starttime sorting
|
||||
CREATE INDEX IF NOT EXISTS jobs_duration_starttime ON job (duration, start_time);
|
||||
CREATE INDEX IF NOT EXISTS jobs_numnodes_starttime ON job (num_nodes, start_time);
|
||||
CREATE INDEX IF NOT EXISTS jobs_numacc_starttime ON job (num_acc, start_time);
|
||||
CREATE INDEX IF NOT EXISTS jobs_energy_starttime ON job (energy, start_time);
|
||||
|
||||
-- Optimize DB index usage
|
||||
@@ -118,104 +118,116 @@ DROP TABLE lookup_exclusive;
|
||||
DROP TABLE job; -- Deletes All Existing 'job' Indices; Recreate after Renaming
|
||||
ALTER TABLE job_new RENAME TO job;
|
||||
|
||||
-- Recreate Indices from 08_add-footprint, include new submit_time indices
|
||||
-- Recreate Indices from 08_add-footprint; include new 'shared' column
|
||||
-- Cluster Filter
|
||||
CREATE INDEX IF NOT EXISTS jobs_cluster ON job (cluster);
|
||||
CREATE INDEX IF NOT EXISTS jobs_cluster_user ON job (cluster, hpc_user);
|
||||
CREATE INDEX IF NOT EXISTS jobs_cluster_project ON job (cluster, project);
|
||||
CREATE INDEX IF NOT EXISTS jobs_cluster_subcluster ON job (cluster, subcluster);
|
||||
-- Cluster Filter Sorting
|
||||
CREATE INDEX IF NOT EXISTS jobs_cluster_starttime ON job (cluster, start_time);
|
||||
CREATE INDEX IF NOT EXISTS jobs_cluster_submittime ON job (cluster, submit_time);
|
||||
CREATE INDEX IF NOT EXISTS jobs_cluster_duration ON job (cluster, duration);
|
||||
CREATE INDEX IF NOT EXISTS jobs_cluster_numnodes ON job (cluster, num_nodes);
|
||||
CREATE INDEX IF NOT EXISTS jobs_cluster_numhwthreads ON job (cluster, num_hwthreads);
|
||||
CREATE INDEX IF NOT EXISTS jobs_cluster_numacc ON job (cluster, num_acc);
|
||||
CREATE INDEX IF NOT EXISTS jobs_cluster_energy ON job (cluster, energy);
|
||||
|
||||
-- Cluster Time Filter Sorting
|
||||
CREATE INDEX IF NOT EXISTS jobs_cluster_duration_starttime ON job (cluster, duration, start_time);
|
||||
CREATE INDEX IF NOT EXISTS jobs_cluster_starttime_duration ON job (cluster, start_time, duration);
|
||||
|
||||
-- Cluster+Partition Filter
|
||||
CREATE INDEX IF NOT EXISTS jobs_cluster_partition ON job (cluster, cluster_partition);
|
||||
CREATE INDEX IF NOT EXISTS jobs_cluster_partition_user ON job (cluster, cluster_partition, hpc_user);
|
||||
CREATE INDEX IF NOT EXISTS jobs_cluster_partition_project ON job (cluster, cluster_partition, project);
|
||||
CREATE INDEX IF NOT EXISTS jobs_cluster_partition_jobstate ON job (cluster, cluster_partition, job_state);
|
||||
CREATE INDEX IF NOT EXISTS jobs_cluster_partition_shared ON job (cluster, cluster_partition, shared);
|
||||
|
||||
-- Cluster+Partition Filter Sorting
|
||||
CREATE INDEX IF NOT EXISTS jobs_cluster_partition_starttime ON job (cluster, cluster_partition, start_time);
|
||||
CREATE INDEX IF NOT EXISTS jobs_cluster_partition_submittime ON job (cluster, cluster_partition, submit_time);
|
||||
CREATE INDEX IF NOT EXISTS jobs_cluster_partition_duration ON job (cluster, cluster_partition, duration);
|
||||
CREATE INDEX IF NOT EXISTS jobs_cluster_partition_numnodes ON job (cluster, cluster_partition, num_nodes);
|
||||
CREATE INDEX IF NOT EXISTS jobs_cluster_partition_numhwthreads ON job (cluster, cluster_partition, num_hwthreads);
|
||||
CREATE INDEX IF NOT EXISTS jobs_cluster_partition_numacc ON job (cluster, cluster_partition, num_acc);
|
||||
CREATE INDEX IF NOT EXISTS jobs_cluster_partition_energy ON job (cluster, cluster_partition, energy);
|
||||
|
||||
-- Cluster+Partition+Jobstate Filter
|
||||
CREATE INDEX IF NOT EXISTS jobs_cluster_partition_jobstate ON job (cluster, cluster_partition, job_state);
|
||||
CREATE INDEX IF NOT EXISTS jobs_cluster_partition_jobstate_user ON job (cluster, cluster_partition, job_state, hpc_user);
|
||||
CREATE INDEX IF NOT EXISTS jobs_cluster_partition_jobstate_project ON job (cluster, cluster_partition, job_state, project);
|
||||
-- Cluster+Partition+Jobstate Filter Sorting
|
||||
CREATE INDEX IF NOT EXISTS jobs_cluster_partition_jobstate_starttime ON job (cluster, cluster_partition, job_state, start_time);
|
||||
CREATE INDEX IF NOT EXISTS jobs_cluster_partition_jobstate_submittime ON job (cluster, cluster_partition, job_state, submit_time);
|
||||
CREATE INDEX IF NOT EXISTS jobs_cluster_partition_jobstate_duration ON job (cluster, cluster_partition, job_state, duration);
|
||||
CREATE INDEX IF NOT EXISTS jobs_cluster_partition_jobstate_numnodes ON job (cluster, cluster_partition, job_state, num_nodes);
|
||||
CREATE INDEX IF NOT EXISTS jobs_cluster_partition_jobstate_numhwthreads ON job (cluster, cluster_partition, job_state, num_hwthreads);
|
||||
CREATE INDEX IF NOT EXISTS jobs_cluster_partition_jobstate_numacc ON job (cluster, cluster_partition, job_state, num_acc);
|
||||
CREATE INDEX IF NOT EXISTS jobs_cluster_partition_jobstate_energy ON job (cluster, cluster_partition, job_state, energy);
|
||||
-- Cluster+Partition Time Filter Sorting
|
||||
CREATE INDEX IF NOT EXISTS jobs_cluster_partition_duration_starttime ON job (cluster, cluster_partition, duration, start_time);
|
||||
CREATE INDEX IF NOT EXISTS jobs_cluster_partition_starttime_duration ON job (cluster, cluster_partition, start_time, duration);
|
||||
|
||||
-- Cluster+JobState Filter
|
||||
CREATE INDEX IF NOT EXISTS jobs_cluster_jobstate ON job (cluster, job_state);
|
||||
CREATE INDEX IF NOT EXISTS jobs_cluster_jobstate_user ON job (cluster, job_state, hpc_user);
|
||||
CREATE INDEX IF NOT EXISTS jobs_cluster_jobstate_project ON job (cluster, job_state, project);
|
||||
-- Cluster+JobState Filter Sorting
|
||||
CREATE INDEX IF NOT EXISTS jobs_cluster_jobstate_starttime ON job (cluster, job_state, start_time);
|
||||
CREATE INDEX IF NOT EXISTS jobs_cluster_jobstate_submittime ON job (cluster, job_state, submit_time);
|
||||
CREATE INDEX IF NOT EXISTS jobs_cluster_jobstate_duration ON job (cluster, job_state, duration);
|
||||
CREATE INDEX IF NOT EXISTS jobs_cluster_jobstate_numnodes ON job (cluster, job_state, num_nodes);
|
||||
CREATE INDEX IF NOT EXISTS jobs_cluster_jobstate_numhwthreads ON job (cluster, job_state, num_hwthreads);
|
||||
CREATE INDEX IF NOT EXISTS jobs_cluster_jobstate_numacc ON job (cluster, job_state, num_acc);
|
||||
CREATE INDEX IF NOT EXISTS jobs_cluster_jobstate_energy ON job (cluster, job_state, energy);
|
||||
|
||||
-- Cluster+JobState Time Filter Sorting
|
||||
CREATE INDEX IF NOT EXISTS jobs_cluster_jobstate_starttime_duration ON job (cluster, job_state, start_time, duration);
|
||||
CREATE INDEX IF NOT EXISTS jobs_cluster_jobstate_duration_starttime ON job (cluster, job_state, duration, start_time);
|
||||
|
||||
-- Cluster+Shared Filter
|
||||
CREATE INDEX IF NOT EXISTS jobs_cluster_shared_user ON job (cluster, shared, hpc_user);
|
||||
CREATE INDEX IF NOT EXISTS jobs_cluster_shared_project ON job (cluster, shared, project);
|
||||
-- Cluster+Shared Filter Sorting
|
||||
CREATE INDEX IF NOT EXISTS jobs_cluster_shared_numnodes ON job (cluster, shared, num_nodes);
|
||||
CREATE INDEX IF NOT EXISTS jobs_cluster_shared_numhwthreads ON job (cluster, shared, num_hwthreads);
|
||||
CREATE INDEX IF NOT EXISTS jobs_cluster_shared_numacc ON job (cluster, shared, num_acc);
|
||||
CREATE INDEX IF NOT EXISTS jobs_cluster_shared_energy ON job (cluster, shared, energy);
|
||||
|
||||
-- Cluster+Shared Time Filter Sorting
|
||||
CREATE INDEX IF NOT EXISTS jobs_cluster_shared_starttime_duration ON job (cluster, shared, start_time, duration);
|
||||
CREATE INDEX IF NOT EXISTS jobs_cluster_shared_duration_starttime ON job (cluster, shared, duration, start_time);
|
||||
|
||||
-- User Filter
|
||||
CREATE INDEX IF NOT EXISTS jobs_user ON job (hpc_user);
|
||||
-- User Filter Sorting
|
||||
CREATE INDEX IF NOT EXISTS jobs_user_starttime ON job (hpc_user, start_time);
|
||||
CREATE INDEX IF NOT EXISTS jobs_user_duration ON job (hpc_user, duration);
|
||||
CREATE INDEX IF NOT EXISTS jobs_user_numnodes ON job (hpc_user, num_nodes);
|
||||
CREATE INDEX IF NOT EXISTS jobs_user_numhwthreads ON job (hpc_user, num_hwthreads);
|
||||
CREATE INDEX IF NOT EXISTS jobs_user_numacc ON job (hpc_user, num_acc);
|
||||
CREATE INDEX IF NOT EXISTS jobs_user_energy ON job (hpc_user, energy);
|
||||
|
||||
-- Cluster+Shared Time Filter Sorting
|
||||
CREATE INDEX IF NOT EXISTS jobs_user_starttime_duration ON job (hpc_user, start_time, duration);
|
||||
CREATE INDEX IF NOT EXISTS jobs_user_duration_starttime ON job (hpc_user, duration, start_time);
|
||||
|
||||
-- Project Filter
|
||||
CREATE INDEX IF NOT EXISTS jobs_project ON job (project);
|
||||
CREATE INDEX IF NOT EXISTS jobs_project_user ON job (project, hpc_user);
|
||||
-- Project Filter Sorting
|
||||
CREATE INDEX IF NOT EXISTS jobs_project_starttime ON job (project, start_time);
|
||||
CREATE INDEX IF NOT EXISTS jobs_project_duration ON job (project, duration);
|
||||
CREATE INDEX IF NOT EXISTS jobs_project_numnodes ON job (project, num_nodes);
|
||||
CREATE INDEX IF NOT EXISTS jobs_project_numhwthreads ON job (project, num_hwthreads);
|
||||
CREATE INDEX IF NOT EXISTS jobs_project_numacc ON job (project, num_acc);
|
||||
CREATE INDEX IF NOT EXISTS jobs_project_energy ON job (project, energy);
|
||||
|
||||
-- Cluster+Shared Time Filter Sorting
|
||||
CREATE INDEX IF NOT EXISTS jobs_project_starttime_duration ON job (project, start_time, duration);
|
||||
CREATE INDEX IF NOT EXISTS jobs_project_duration_starttime ON job (project, duration, start_time);
|
||||
|
||||
-- JobState Filter
|
||||
CREATE INDEX IF NOT EXISTS jobs_jobstate ON job (job_state);
|
||||
CREATE INDEX IF NOT EXISTS jobs_jobstate_user ON job (job_state, hpc_user);
|
||||
CREATE INDEX IF NOT EXISTS jobs_jobstate_project ON job (job_state, project);
|
||||
CREATE INDEX IF NOT EXISTS jobs_jobstate_cluster ON job (job_state, cluster);
|
||||
-- JobState Filter Sorting
|
||||
CREATE INDEX IF NOT EXISTS jobs_jobstate_starttime ON job (job_state, start_time);
|
||||
CREATE INDEX IF NOT EXISTS jobs_jobstate_duration ON job (job_state, duration);
|
||||
CREATE INDEX IF NOT EXISTS jobs_jobstate_numnodes ON job (job_state, num_nodes);
|
||||
CREATE INDEX IF NOT EXISTS jobs_jobstate_numhwthreads ON job (job_state, num_hwthreads);
|
||||
CREATE INDEX IF NOT EXISTS jobs_jobstate_numacc ON job (job_state, num_acc);
|
||||
CREATE INDEX IF NOT EXISTS jobs_jobstate_energy ON job (job_state, energy);
|
||||
|
||||
-- Cluster+Shared Time Filter Sorting
|
||||
CREATE INDEX IF NOT EXISTS jobs_jobstate_starttime_duration ON job (job_state, start_time, duration);
|
||||
CREATE INDEX IF NOT EXISTS jobs_jobstate_duration_starttime ON job (job_state, duration, start_time);
|
||||
|
||||
-- Shared Filter
|
||||
CREATE INDEX IF NOT EXISTS jobs_shared_user ON job (shared, hpc_user);
|
||||
CREATE INDEX IF NOT EXISTS jobs_shared_project ON job (shared, project);
|
||||
-- Shared Filter Sorting
|
||||
CREATE INDEX IF NOT EXISTS jobs_shared_numnodes ON job (shared, num_nodes);
|
||||
CREATE INDEX IF NOT EXISTS jobs_shared_numhwthreads ON job (shared, num_hwthreads);
|
||||
CREATE INDEX IF NOT EXISTS jobs_shared_numacc ON job (shared, num_acc);
|
||||
CREATE INDEX IF NOT EXISTS jobs_shared_energy ON job (shared, energy);
|
||||
|
||||
-- Cluster+Shared Time Filter Sorting
|
||||
CREATE INDEX IF NOT EXISTS jobs_shared_starttime_duration ON job (shared, start_time, duration);
|
||||
CREATE INDEX IF NOT EXISTS jobs_shared_duration_starttime ON job (shared, duration, start_time);
|
||||
|
||||
-- ArrayJob Filter
|
||||
CREATE INDEX IF NOT EXISTS jobs_arrayjobid_starttime ON job (array_job_id, start_time);
|
||||
CREATE INDEX IF NOT EXISTS jobs_cluster_arrayjobid_starttime ON job (cluster, array_job_id, start_time);
|
||||
|
||||
-- Sorting without active filters
|
||||
CREATE INDEX IF NOT EXISTS jobs_starttime ON job (start_time);
|
||||
CREATE INDEX IF NOT EXISTS jobs_duration ON job (duration);
|
||||
CREATE INDEX IF NOT EXISTS jobs_numnodes ON job (num_nodes);
|
||||
CREATE INDEX IF NOT EXISTS jobs_numhwthreads ON job (num_hwthreads);
|
||||
CREATE INDEX IF NOT EXISTS jobs_numacc ON job (num_acc);
|
||||
CREATE INDEX IF NOT EXISTS jobs_energy ON job (energy);
|
||||
|
||||
-- Single filters with default starttime sorting
|
||||
CREATE INDEX IF NOT EXISTS jobs_duration_starttime ON job (duration, start_time);
|
||||
CREATE INDEX IF NOT EXISTS jobs_numnodes_starttime ON job (num_nodes, start_time);
|
||||
@@ -223,6 +235,22 @@ CREATE INDEX IF NOT EXISTS jobs_numhwthreads_starttime ON job (num_hwthreads, st
|
||||
CREATE INDEX IF NOT EXISTS jobs_numacc_starttime ON job (num_acc, start_time);
|
||||
CREATE INDEX IF NOT EXISTS jobs_energy_starttime ON job (energy, start_time);
|
||||
|
||||
-- Single filters with duration sorting
|
||||
CREATE INDEX IF NOT EXISTS jobs_starttime_duration ON job (start_time, duration);
|
||||
CREATE INDEX IF NOT EXISTS jobs_numnodes_duration ON job (num_nodes, duration);
|
||||
CREATE INDEX IF NOT EXISTS jobs_numhwthreads_duration ON job (num_hwthreads, duration);
|
||||
CREATE INDEX IF NOT EXISTS jobs_numacc_duration ON job (num_acc, duration);
|
||||
CREATE INDEX IF NOT EXISTS jobs_energy_duration ON job (energy, duration);
|
||||
|
||||
-- Backup Indices For High Variety Columns
|
||||
CREATE INDEX IF NOT EXISTS jobs_starttime ON job (start_time);
|
||||
CREATE INDEX IF NOT EXISTS jobs_duration ON job (duration);
|
||||
|
||||
-- Notes:
|
||||
-- Cluster+Partition+Jobstate Filter: Tested -> Full Array Of Combinations non-required
|
||||
-- Cluster+JobState+Shared Filter: Tested -> No further timing improvement
|
||||
-- JobState+Shared Filter: Tested -> No further timing improvement
|
||||
|
||||
-- Optimize DB index usage
|
||||
PRAGMA optimize;
|
||||
|
||||
|
||||
@@ -23,6 +23,7 @@ CREATE TABLE "node_state" (
|
||||
CHECK (health_state IN (
|
||||
'full', 'partial', 'failed'
|
||||
)),
|
||||
health_metrics TEXT, -- JSON array of strings
|
||||
node_id INTEGER,
|
||||
FOREIGN KEY (node_id) REFERENCES node (id)
|
||||
);
|
||||
@@ -33,12 +34,11 @@ CREATE INDEX IF NOT EXISTS nodes_cluster_subcluster ON node (cluster, subcluster
|
||||
|
||||
-- Add NEW Indices For New Node_State Table Fields
|
||||
CREATE INDEX IF NOT EXISTS nodestates_timestamp ON node_state (time_stamp);
|
||||
CREATE INDEX IF NOT EXISTS nodestates_state ON node_state (node_state);
|
||||
CREATE INDEX IF NOT EXISTS nodestates_health ON node_state (health_state);
|
||||
CREATE INDEX IF NOT EXISTS nodestates_state_timestamp ON node_state (node_state, time_stamp);
|
||||
CREATE INDEX IF NOT EXISTS nodestates_health_timestamp ON node_state (health_state, time_stamp);
|
||||
CREATE INDEX IF NOT EXISTS nodestates_nodeid_state ON node_state (node_id, node_state);
|
||||
CREATE INDEX IF NOT EXISTS nodestates_nodeid_health ON node_state (node_id, health_state);
|
||||
CREATE INDEX IF NOT EXISTS nodestates_nodeid_timestamp ON node_state (node_id, time_stamp DESC);
|
||||
|
||||
-- Add NEW Indices For Increased Amounts of Tags
|
||||
CREATE INDEX IF NOT EXISTS tags_jobid ON jobtag (job_id);
|
||||
|
||||
@@ -10,14 +10,17 @@ import (
|
||||
"database/sql"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"slices"
|
||||
"sort"
|
||||
"strings"
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
"github.com/ClusterCockpit/cc-backend/internal/graph/model"
|
||||
"github.com/ClusterCockpit/cc-backend/pkg/archive"
|
||||
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
|
||||
"github.com/ClusterCockpit/cc-lib/lrucache"
|
||||
"github.com/ClusterCockpit/cc-lib/schema"
|
||||
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
|
||||
"github.com/ClusterCockpit/cc-lib/v2/lrucache"
|
||||
"github.com/ClusterCockpit/cc-lib/v2/schema"
|
||||
sq "github.com/Masterminds/squirrel"
|
||||
"github.com/jmoiron/sqlx"
|
||||
)
|
||||
@@ -49,6 +52,38 @@ func GetNodeRepository() *NodeRepository {
|
||||
return nodeRepoInstance
|
||||
}
|
||||
|
||||
// latestStateCondition returns a squirrel expression that restricts node_state
|
||||
// rows to the latest per node_id using a correlated subquery.
|
||||
// Requires the query to join node and node_state tables.
|
||||
func latestStateCondition() sq.Sqlizer {
|
||||
return sq.Expr(
|
||||
"node_state.id = (SELECT ns2.id FROM node_state ns2 WHERE ns2.node_id = node.id ORDER BY ns2.time_stamp DESC LIMIT 1)",
|
||||
)
|
||||
}
|
||||
|
||||
// applyNodeFilters applies common NodeFilter conditions to a query that joins
|
||||
// the node and node_state tables with latestStateCondition.
|
||||
func applyNodeFilters(query sq.SelectBuilder, filters []*model.NodeFilter) sq.SelectBuilder {
|
||||
for _, f := range filters {
|
||||
if f.Cluster != nil {
|
||||
query = buildStringCondition("node.cluster", f.Cluster, query)
|
||||
}
|
||||
if f.SubCluster != nil {
|
||||
query = buildStringCondition("node.subcluster", f.SubCluster, query)
|
||||
}
|
||||
if f.Hostname != nil {
|
||||
query = buildStringCondition("node.hostname", f.Hostname, query)
|
||||
}
|
||||
if f.SchedulerState != nil {
|
||||
query = query.Where("node_state.node_state = ?", f.SchedulerState)
|
||||
}
|
||||
if f.HealthState != nil {
|
||||
query = query.Where("node_state.health_state = ?", f.HealthState)
|
||||
}
|
||||
}
|
||||
return query
|
||||
}
|
||||
|
||||
func (r *NodeRepository) FetchMetadata(hostname string, cluster string) (map[string]string, error) {
|
||||
start := time.Now()
|
||||
|
||||
@@ -79,17 +114,16 @@ func (r *NodeRepository) FetchMetadata(hostname string, cluster string) (map[str
|
||||
|
||||
func (r *NodeRepository) GetNode(hostname string, cluster string, withMeta bool) (*schema.Node, error) {
|
||||
node := &schema.Node{}
|
||||
var timestamp int
|
||||
if err := sq.Select("node.hostname", "node.cluster", "node.subcluster", "node_state.node_state",
|
||||
"node_state.health_state", "MAX(node_state.time_stamp) as time").
|
||||
From("node_state").
|
||||
Join("node ON node_state.node_id = node.id").
|
||||
if err := sq.Select("node.hostname", "node.cluster", "node.subcluster",
|
||||
"node_state.node_state", "node_state.health_state").
|
||||
From("node").
|
||||
Join("node_state ON node_state.node_id = node.id").
|
||||
Where(latestStateCondition()).
|
||||
Where("node.hostname = ?", hostname).
|
||||
Where("node.cluster = ?", cluster).
|
||||
GroupBy("node_state.node_id").
|
||||
RunWith(r.DB).
|
||||
QueryRow().Scan(&node.Hostname, &node.Cluster, &node.SubCluster, &node.NodeState, &node.HealthState, ×tamp); err != nil {
|
||||
cclog.Warnf("Error while querying node '%s' at time '%d' from database: %v", hostname, timestamp, err)
|
||||
QueryRow().Scan(&node.Hostname, &node.Cluster, &node.SubCluster, &node.NodeState, &node.HealthState); err != nil {
|
||||
cclog.Warnf("Error while querying node '%s' from database: %v", hostname, err)
|
||||
return nil, err
|
||||
}
|
||||
|
||||
@@ -106,31 +140,28 @@ func (r *NodeRepository) GetNode(hostname string, cluster string, withMeta bool)
|
||||
return node, nil
|
||||
}
|
||||
|
||||
func (r *NodeRepository) GetNodeById(id int64, withMeta bool) (*schema.Node, error) {
|
||||
func (r *NodeRepository) GetNodeByID(id int64, withMeta bool) (*schema.Node, error) {
|
||||
node := &schema.Node{}
|
||||
var timestamp int
|
||||
if err := sq.Select("node.hostname", "node.cluster", "node.subcluster", "node_state.node_state",
|
||||
"node_state.health_state", "MAX(node_state.time_stamp) as time").
|
||||
From("node_state").
|
||||
Join("node ON node_state.node_id = node.id").
|
||||
if err := sq.Select("node.hostname", "node.cluster", "node.subcluster",
|
||||
"node_state.node_state", "node_state.health_state").
|
||||
From("node").
|
||||
Join("node_state ON node_state.node_id = node.id").
|
||||
Where(latestStateCondition()).
|
||||
Where("node.id = ?", id).
|
||||
GroupBy("node_state.node_id").
|
||||
RunWith(r.DB).
|
||||
QueryRow().Scan(&node.Hostname, &node.Cluster, &node.SubCluster, &node.NodeState, &node.HealthState, ×tamp); err != nil {
|
||||
cclog.Warnf("Error while querying node ID '%d' at time '%d' from database: %v", id, timestamp, err)
|
||||
QueryRow().Scan(&node.Hostname, &node.Cluster, &node.SubCluster, &node.NodeState, &node.HealthState); err != nil {
|
||||
cclog.Warnf("Error while querying node ID '%d' from database: %v", id, err)
|
||||
return nil, err
|
||||
}
|
||||
|
||||
// NEEDS METADATA BY ID
|
||||
// if withMeta {
|
||||
// var err error
|
||||
// var meta map[string]string
|
||||
// if meta, err = r.FetchMetadata(hostname, cluster); err != nil {
|
||||
// cclog.Warnf("Error while fetching metadata for node '%s'", hostname)
|
||||
// return nil, err
|
||||
// }
|
||||
// node.MetaData = meta
|
||||
// }
|
||||
if withMeta {
|
||||
meta, metaErr := r.FetchMetadata(node.Hostname, node.Cluster)
|
||||
if metaErr != nil {
|
||||
cclog.Warnf("Error while fetching metadata for node ID '%d': %v", id, metaErr)
|
||||
return nil, metaErr
|
||||
}
|
||||
node.MetaData = meta
|
||||
}
|
||||
|
||||
return node, nil
|
||||
}
|
||||
@@ -166,9 +197,10 @@ func (r *NodeRepository) AddNode(node *schema.NodeDB) (int64, error) {
|
||||
}
|
||||
|
||||
const NamedNodeStateInsert string = `
|
||||
INSERT INTO node_state (time_stamp, node_state, health_state, cpus_allocated,
|
||||
memory_allocated, gpus_allocated, jobs_running, node_id)
|
||||
VALUES (:time_stamp, :node_state, :health_state, :cpus_allocated, :memory_allocated, :gpus_allocated, :jobs_running, :node_id);`
|
||||
INSERT INTO node_state (time_stamp, node_state, health_state, health_metrics,
|
||||
cpus_allocated, memory_allocated, gpus_allocated, jobs_running, node_id)
|
||||
VALUES (:time_stamp, :node_state, :health_state, :health_metrics,
|
||||
:cpus_allocated, :memory_allocated, :gpus_allocated, :jobs_running, :node_id);`
|
||||
|
||||
// TODO: Add real Monitoring Health State
|
||||
|
||||
@@ -194,8 +226,7 @@ func (r *NodeRepository) UpdateNodeState(hostname string, cluster string, nodeSt
|
||||
return err
|
||||
}
|
||||
|
||||
cclog.Infof("Added node '%s' to database", hostname)
|
||||
return nil
|
||||
cclog.Debugf("Added node '%s' to database", hostname)
|
||||
} else {
|
||||
cclog.Warnf("Error while querying node '%v' from database", id)
|
||||
return err
|
||||
@@ -209,7 +240,7 @@ func (r *NodeRepository) UpdateNodeState(hostname string, cluster string, nodeSt
|
||||
cclog.Errorf("Error while adding node state for '%v' to database", hostname)
|
||||
return err
|
||||
}
|
||||
cclog.Infof("Updated node state for '%s' in database", hostname)
|
||||
cclog.Debugf("Updated node state for '%s' in database", hostname)
|
||||
return nil
|
||||
}
|
||||
|
||||
@@ -222,6 +253,77 @@ func (r *NodeRepository) UpdateNodeState(hostname string, cluster string, nodeSt
|
||||
// return nil
|
||||
// }
|
||||
|
||||
// NodeStateWithNode combines a node state row with denormalized node info.
|
||||
type NodeStateWithNode struct {
|
||||
ID int64 `db:"id"`
|
||||
TimeStamp int64 `db:"time_stamp"`
|
||||
NodeState string `db:"node_state"`
|
||||
HealthState string `db:"health_state"`
|
||||
HealthMetrics string `db:"health_metrics"`
|
||||
CpusAllocated int `db:"cpus_allocated"`
|
||||
MemoryAllocated int64 `db:"memory_allocated"`
|
||||
GpusAllocated int `db:"gpus_allocated"`
|
||||
JobsRunning int `db:"jobs_running"`
|
||||
Hostname string `db:"hostname"`
|
||||
Cluster string `db:"cluster"`
|
||||
SubCluster string `db:"subcluster"`
|
||||
}
|
||||
|
||||
// FindNodeStatesBefore returns all node_state rows with time_stamp < cutoff,
|
||||
// joined with node info for denormalized archiving.
|
||||
func (r *NodeRepository) FindNodeStatesBefore(cutoff int64) ([]NodeStateWithNode, error) {
|
||||
rows, err := sq.Select(
|
||||
"node_state.id", "node_state.time_stamp", "node_state.node_state",
|
||||
"node_state.health_state", "node_state.health_metrics",
|
||||
"node_state.cpus_allocated", "node_state.memory_allocated",
|
||||
"node_state.gpus_allocated", "node_state.jobs_running",
|
||||
"node.hostname", "node.cluster", "node.subcluster",
|
||||
).
|
||||
From("node_state").
|
||||
Join("node ON node_state.node_id = node.id").
|
||||
Where(sq.Lt{"node_state.time_stamp": cutoff}).
|
||||
Where("node_state.id NOT IN (SELECT ns2.id FROM node_state ns2 WHERE ns2.time_stamp = (SELECT MAX(ns3.time_stamp) FROM node_state ns3 WHERE ns3.node_id = ns2.node_id))").
|
||||
OrderBy("node.cluster ASC", "node.subcluster ASC", "node.hostname ASC", "node_state.time_stamp ASC").
|
||||
RunWith(r.DB).Query()
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
defer rows.Close()
|
||||
|
||||
var result []NodeStateWithNode
|
||||
for rows.Next() {
|
||||
var ns NodeStateWithNode
|
||||
var healthMetrics sql.NullString
|
||||
if err := rows.Scan(&ns.ID, &ns.TimeStamp, &ns.NodeState,
|
||||
&ns.HealthState, &healthMetrics,
|
||||
&ns.CpusAllocated, &ns.MemoryAllocated,
|
||||
&ns.GpusAllocated, &ns.JobsRunning,
|
||||
&ns.Hostname, &ns.Cluster, &ns.SubCluster); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
ns.HealthMetrics = healthMetrics.String
|
||||
result = append(result, ns)
|
||||
}
|
||||
return result, nil
|
||||
}
|
||||
|
||||
// DeleteNodeStatesBefore removes node_state rows with time_stamp < cutoff,
|
||||
// but always preserves the row with the latest timestamp per node_id.
|
||||
func (r *NodeRepository) DeleteNodeStatesBefore(cutoff int64) (int64, error) {
|
||||
res, err := r.DB.Exec(
|
||||
`DELETE FROM node_state WHERE time_stamp < ?
|
||||
AND id NOT IN (
|
||||
SELECT id FROM node_state ns2
|
||||
WHERE ns2.time_stamp = (SELECT MAX(ns3.time_stamp) FROM node_state ns3 WHERE ns3.node_id = ns2.node_id)
|
||||
)`,
|
||||
cutoff,
|
||||
)
|
||||
if err != nil {
|
||||
return 0, err
|
||||
}
|
||||
return res.RowsAffected()
|
||||
}
|
||||
|
||||
func (r *NodeRepository) DeleteNode(id int64) error {
|
||||
_, err := r.DB.Exec(`DELETE FROM node WHERE node.id = ?`, id)
|
||||
if err != nil {
|
||||
@@ -241,38 +343,17 @@ func (r *NodeRepository) QueryNodes(
|
||||
order *model.OrderByInput, // Currently unused!
|
||||
) ([]*schema.Node, error) {
|
||||
query, qerr := AccessCheck(ctx,
|
||||
sq.Select("hostname", "cluster", "subcluster", "node_state", "health_state", "MAX(time_stamp) as time").
|
||||
sq.Select("node.hostname", "node.cluster", "node.subcluster",
|
||||
"node_state.node_state", "node_state.health_state").
|
||||
From("node").
|
||||
Join("node_state ON node_state.node_id = node.id"))
|
||||
Join("node_state ON node_state.node_id = node.id").
|
||||
Where(latestStateCondition()))
|
||||
if qerr != nil {
|
||||
return nil, qerr
|
||||
}
|
||||
|
||||
for _, f := range filters {
|
||||
if f.Cluster != nil {
|
||||
query = buildStringCondition("cluster", f.Cluster, query)
|
||||
}
|
||||
if f.Subcluster != nil {
|
||||
query = buildStringCondition("subcluster", f.Subcluster, query)
|
||||
}
|
||||
if f.Hostname != nil {
|
||||
query = buildStringCondition("hostname", f.Hostname, query)
|
||||
}
|
||||
if f.SchedulerState != nil {
|
||||
query = query.Where("node_state = ?", f.SchedulerState)
|
||||
// Requires Additional time_stamp Filter: Else the last (past!) time_stamp with queried state will be returned
|
||||
now := time.Now().Unix()
|
||||
query = query.Where(sq.Gt{"time_stamp": (now - 60)})
|
||||
}
|
||||
if f.HealthState != nil {
|
||||
query = query.Where("health_state = ?", f.HealthState)
|
||||
// Requires Additional time_stamp Filter: Else the last (past!) time_stamp with queried state will be returned
|
||||
now := time.Now().Unix()
|
||||
query = query.Where(sq.Gt{"time_stamp": (now - 60)})
|
||||
}
|
||||
}
|
||||
|
||||
query = query.GroupBy("node_id").OrderBy("hostname ASC")
|
||||
query = applyNodeFilters(query, filters)
|
||||
query = query.OrderBy("node.hostname ASC")
|
||||
|
||||
if page != nil && page.ItemsPerPage != -1 {
|
||||
limit := uint64(page.ItemsPerPage)
|
||||
@@ -290,11 +371,10 @@ func (r *NodeRepository) QueryNodes(
|
||||
nodes := make([]*schema.Node, 0)
|
||||
for rows.Next() {
|
||||
node := schema.Node{}
|
||||
var timestamp int
|
||||
if err := rows.Scan(&node.Hostname, &node.Cluster, &node.SubCluster,
|
||||
&node.NodeState, &node.HealthState, ×tamp); err != nil {
|
||||
&node.NodeState, &node.HealthState); err != nil {
|
||||
rows.Close()
|
||||
cclog.Warnf("Error while scanning rows (QueryNodes) at time '%d'", timestamp)
|
||||
cclog.Warn("Error while scanning rows (QueryNodes)")
|
||||
return nil, err
|
||||
}
|
||||
nodes = append(nodes, &node)
|
||||
@@ -386,73 +466,115 @@ func (r *NodeRepository) QueryNodesWithMeta(
|
||||
return nodes, nil
|
||||
}
|
||||
|
||||
// CountNodes returns the total matched nodes based on a node filter. It always operates
|
||||
// on the last state (largest timestamp).
|
||||
func (r *NodeRepository) CountNodes(
|
||||
// QueryNodesWithMeta returns a list of nodes based on a node filter. It always operates
|
||||
// on the last state (largest timestamp). It includes both (!) optional JSON column data
|
||||
func (r *NodeRepository) QueryNodesWithMeta(
|
||||
ctx context.Context,
|
||||
filters []*model.NodeFilter,
|
||||
) (int, error) {
|
||||
page *model.PageRequest,
|
||||
order *model.OrderByInput, // Currently unused!
|
||||
) ([]*schema.Node, error) {
|
||||
query, qerr := AccessCheck(ctx,
|
||||
sq.Select("time_stamp", "count(*) as countRes").
|
||||
sq.Select("node.hostname", "node.cluster", "node.subcluster",
|
||||
"node_state.node_state", "node_state.health_state",
|
||||
"node.meta_data", "node_state.health_metrics").
|
||||
From("node").
|
||||
Join("node_state ON node_state.node_id = node.id"))
|
||||
Join("node_state ON node_state.node_id = node.id").
|
||||
Where(latestStateCondition()))
|
||||
if qerr != nil {
|
||||
return 0, qerr
|
||||
return nil, qerr
|
||||
}
|
||||
|
||||
for _, f := range filters {
|
||||
if f.Cluster != nil {
|
||||
query = buildStringCondition("cluster", f.Cluster, query)
|
||||
}
|
||||
if f.Subcluster != nil {
|
||||
query = buildStringCondition("subcluster", f.Subcluster, query)
|
||||
}
|
||||
if f.Hostname != nil {
|
||||
query = buildStringCondition("hostname", f.Hostname, query)
|
||||
}
|
||||
if f.SchedulerState != nil {
|
||||
query = query.Where("node_state = ?", f.SchedulerState)
|
||||
// Requires Additional time_stamp Filter: Else the last (past!) time_stamp with queried state will be returned
|
||||
now := time.Now().Unix()
|
||||
query = query.Where(sq.Gt{"time_stamp": (now - 60)})
|
||||
}
|
||||
if f.HealthState != nil {
|
||||
query = query.Where("health_state = ?", f.HealthState)
|
||||
// Requires Additional time_stamp Filter: Else the last (past!) time_stamp with queried state will be returned
|
||||
now := time.Now().Unix()
|
||||
query = query.Where(sq.Gt{"time_stamp": (now - 60)})
|
||||
}
|
||||
}
|
||||
query = applyNodeFilters(query, filters)
|
||||
query = query.OrderBy("node.hostname ASC")
|
||||
|
||||
query = query.GroupBy("time_stamp").OrderBy("time_stamp DESC").Limit(1)
|
||||
if page != nil && page.ItemsPerPage != -1 {
|
||||
limit := uint64(page.ItemsPerPage)
|
||||
query = query.Offset((uint64(page.Page) - 1) * limit).Limit(limit)
|
||||
}
|
||||
|
||||
rows, err := query.RunWith(r.stmtCache).Query()
|
||||
if err != nil {
|
||||
queryString, queryVars, _ := query.ToSql()
|
||||
cclog.Errorf("Error while running query '%s' %v: %v", queryString, queryVars, err)
|
||||
return nil, err
|
||||
}
|
||||
|
||||
nodes := make([]*schema.Node, 0)
|
||||
for rows.Next() {
|
||||
node := schema.Node{}
|
||||
RawMetaData := make([]byte, 0)
|
||||
RawMetricHealth := make([]byte, 0)
|
||||
|
||||
if err := rows.Scan(&node.Hostname, &node.Cluster, &node.SubCluster,
|
||||
&node.NodeState, &node.HealthState, &RawMetaData, &RawMetricHealth); err != nil {
|
||||
rows.Close()
|
||||
cclog.Warn("Error while scanning rows (QueryNodes)")
|
||||
return nil, err
|
||||
}
|
||||
|
||||
if len(RawMetaData) == 0 {
|
||||
node.MetaData = nil
|
||||
} else {
|
||||
metaData := make(map[string]string)
|
||||
if err := json.Unmarshal(RawMetaData, &metaData); err != nil {
|
||||
cclog.Warn("Error while unmarshaling raw metadata json")
|
||||
return nil, err
|
||||
}
|
||||
node.MetaData = metaData
|
||||
}
|
||||
|
||||
if len(RawMetricHealth) == 0 {
|
||||
node.HealthData = nil
|
||||
} else {
|
||||
healthData := make(map[string][]string)
|
||||
if err := json.Unmarshal(RawMetricHealth, &healthData); err != nil {
|
||||
cclog.Warn("Error while unmarshaling raw healthdata json")
|
||||
return nil, err
|
||||
}
|
||||
node.HealthData = healthData
|
||||
}
|
||||
|
||||
nodes = append(nodes, &node)
|
||||
}
|
||||
|
||||
return nodes, nil
|
||||
}
|
||||
|
||||
// CountNodes returns the total matched nodes based on a node filter. It always operates
|
||||
// on the last state (largest timestamp) per node.
|
||||
func (r *NodeRepository) CountNodes(
|
||||
ctx context.Context,
|
||||
filters []*model.NodeFilter,
|
||||
) (int, error) {
|
||||
query, qerr := AccessCheck(ctx,
|
||||
sq.Select("COUNT(*)").
|
||||
From("node").
|
||||
Join("node_state ON node_state.node_id = node.id").
|
||||
Where(latestStateCondition()))
|
||||
if qerr != nil {
|
||||
return 0, qerr
|
||||
}
|
||||
|
||||
query = applyNodeFilters(query, filters)
|
||||
|
||||
var count int
|
||||
if err := query.RunWith(r.stmtCache).QueryRow().Scan(&count); err != nil {
|
||||
queryString, queryVars, _ := query.ToSql()
|
||||
cclog.Errorf("Error while running query '%s' %v: %v", queryString, queryVars, err)
|
||||
return 0, err
|
||||
}
|
||||
|
||||
var totalNodes int
|
||||
for rows.Next() {
|
||||
var timestamp int
|
||||
if err := rows.Scan(×tamp, &totalNodes); err != nil {
|
||||
rows.Close()
|
||||
cclog.Warnf("Error while scanning rows (CountNodes) at time '%d'", timestamp)
|
||||
return 0, err
|
||||
}
|
||||
}
|
||||
|
||||
return totalNodes, nil
|
||||
return count, nil
|
||||
}
|
||||
|
||||
func (r *NodeRepository) ListNodes(cluster string) ([]*schema.Node, error) {
|
||||
q := sq.Select("node.hostname", "node.cluster", "node.subcluster", "node_state.node_state",
|
||||
"node_state.health_state", "MAX(node_state.time_stamp) as time").
|
||||
q := sq.Select("node.hostname", "node.cluster", "node.subcluster",
|
||||
"node_state.node_state", "node_state.health_state").
|
||||
From("node").
|
||||
Join("node_state ON node_state.node_id = node.id").
|
||||
Where(latestStateCondition()).
|
||||
Where("node.cluster = ?", cluster).
|
||||
GroupBy("node_state.node_id").
|
||||
OrderBy("node.hostname ASC")
|
||||
|
||||
rows, err := q.RunWith(r.DB).Query()
|
||||
@@ -464,10 +586,9 @@ func (r *NodeRepository) ListNodes(cluster string) ([]*schema.Node, error) {
|
||||
defer rows.Close()
|
||||
for rows.Next() {
|
||||
node := &schema.Node{}
|
||||
var timestamp int
|
||||
if err := rows.Scan(&node.Hostname, &node.Cluster,
|
||||
&node.SubCluster, &node.NodeState, &node.HealthState, ×tamp); err != nil {
|
||||
cclog.Warnf("Error while scanning node list (ListNodes) at time '%d'", timestamp)
|
||||
&node.SubCluster, &node.NodeState, &node.HealthState); err != nil {
|
||||
cclog.Warn("Error while scanning node list (ListNodes)")
|
||||
return nil, err
|
||||
}
|
||||
|
||||
@@ -478,11 +599,11 @@ func (r *NodeRepository) ListNodes(cluster string) ([]*schema.Node, error) {
|
||||
}
|
||||
|
||||
func (r *NodeRepository) MapNodes(cluster string) (map[string]string, error) {
|
||||
q := sq.Select("node.hostname", "node_state.node_state", "MAX(node_state.time_stamp) as time").
|
||||
q := sq.Select("node.hostname", "node_state.node_state").
|
||||
From("node").
|
||||
Join("node_state ON node_state.node_id = node.id").
|
||||
Where(latestStateCondition()).
|
||||
Where("node.cluster = ?", cluster).
|
||||
GroupBy("node_state.node_id").
|
||||
OrderBy("node.hostname ASC")
|
||||
|
||||
rows, err := q.RunWith(r.DB).Query()
|
||||
@@ -495,9 +616,8 @@ func (r *NodeRepository) MapNodes(cluster string) (map[string]string, error) {
|
||||
defer rows.Close()
|
||||
for rows.Next() {
|
||||
var hostname, nodestate string
|
||||
var timestamp int
|
||||
if err := rows.Scan(&hostname, &nodestate, ×tamp); err != nil {
|
||||
cclog.Warnf("Error while scanning node list (MapNodes) at time '%d'", timestamp)
|
||||
if err := rows.Scan(&hostname, &nodestate); err != nil {
|
||||
cclog.Warn("Error while scanning node list (MapNodes)")
|
||||
return nil, err
|
||||
}
|
||||
|
||||
@@ -509,37 +629,15 @@ func (r *NodeRepository) MapNodes(cluster string) (map[string]string, error) {
|
||||
|
||||
func (r *NodeRepository) CountStates(ctx context.Context, filters []*model.NodeFilter, column string) ([]*model.NodeStates, error) {
|
||||
query, qerr := AccessCheck(ctx,
|
||||
sq.Select(column, "COUNT(*) as count").
|
||||
sq.Select(column).
|
||||
From("node").
|
||||
Join("node_state ON node_state.node_id = node.id").
|
||||
Where(latestStateCondition()).
|
||||
GroupBy(column))
|
||||
Where(latestStateCondition()))
|
||||
if qerr != nil {
|
||||
return nil, qerr
|
||||
}
|
||||
|
||||
query = query.Join("node_state ON node_state.node_id = node.id")
|
||||
|
||||
for _, f := range filters {
|
||||
if f.Hostname != nil {
|
||||
query = buildStringCondition("hostname", f.Hostname, query)
|
||||
}
|
||||
if f.Cluster != nil {
|
||||
query = buildStringCondition("cluster", f.Cluster, query)
|
||||
}
|
||||
if f.Subcluster != nil {
|
||||
query = buildStringCondition("subcluster", f.Subcluster, query)
|
||||
}
|
||||
if f.SchedulerState != nil {
|
||||
query = query.Where("node_state = ?", f.SchedulerState)
|
||||
}
|
||||
if f.HealthState != nil {
|
||||
query = query.Where("health_state = ?", f.HealthState)
|
||||
}
|
||||
}
|
||||
|
||||
// Add Group and Order
|
||||
query = query.GroupBy("hostname").OrderBy("hostname DESC")
|
||||
query = applyNodeFilters(query, filters)
|
||||
|
||||
rows, err := query.RunWith(r.stmtCache).Query()
|
||||
if err != nil {
|
||||
@@ -549,6 +647,18 @@ func (r *NodeRepository) CountStates(ctx context.Context, filters []*model.NodeF
|
||||
}
|
||||
defer rows.Close()
|
||||
|
||||
stateMap := map[string]int{}
|
||||
for rows.Next() {
|
||||
var state string
|
||||
if err := rows.Scan(&state); err != nil {
|
||||
rows.Close()
|
||||
cclog.Warn("Error while scanning rows (CountStates)")
|
||||
return nil, err
|
||||
}
|
||||
|
||||
stateMap[state] += 1
|
||||
}
|
||||
|
||||
nodes := make([]*model.NodeStates, 0)
|
||||
for rows.Next() {
|
||||
var state string
|
||||
@@ -587,8 +697,8 @@ func (r *NodeRepository) CountStatesTimed(ctx context.Context, filters []*model.
|
||||
if f.Cluster != nil {
|
||||
query = buildStringCondition("cluster", f.Cluster, query)
|
||||
}
|
||||
if f.Subcluster != nil {
|
||||
query = buildStringCondition("subcluster", f.Subcluster, query)
|
||||
if f.SubCluster != nil {
|
||||
query = buildStringCondition("subcluster", f.SubCluster, query)
|
||||
}
|
||||
if f.SchedulerState != nil {
|
||||
query = query.Where("node_state = ?", f.SchedulerState)
|
||||
@@ -640,6 +750,132 @@ func (r *NodeRepository) CountStatesTimed(ctx context.Context, filters []*model.
|
||||
return timedStates, nil
|
||||
}
|
||||
|
||||
func (r *NodeRepository) GetNodesForList(
|
||||
ctx context.Context,
|
||||
cluster string,
|
||||
subCluster string,
|
||||
stateFilter string,
|
||||
nodeFilter string,
|
||||
page *model.PageRequest,
|
||||
) ([]string, map[string]string, int, bool, error) {
|
||||
// Init Return Vars
|
||||
nodes := make([]string, 0)
|
||||
stateMap := make(map[string]string)
|
||||
countNodes := 0
|
||||
hasNextPage := false
|
||||
|
||||
// Build Filters
|
||||
queryFilters := make([]*model.NodeFilter, 0)
|
||||
if cluster != "" {
|
||||
queryFilters = append(queryFilters, &model.NodeFilter{Cluster: &model.StringInput{Eq: &cluster}})
|
||||
}
|
||||
if subCluster != "" {
|
||||
queryFilters = append(queryFilters, &model.NodeFilter{SubCluster: &model.StringInput{Eq: &subCluster}})
|
||||
}
|
||||
if nodeFilter != "" && stateFilter != "notindb" {
|
||||
queryFilters = append(queryFilters, &model.NodeFilter{Hostname: &model.StringInput{Contains: &nodeFilter}})
|
||||
}
|
||||
if stateFilter != "all" && stateFilter != "notindb" {
|
||||
queryState := schema.SchedulerState(stateFilter)
|
||||
queryFilters = append(queryFilters, &model.NodeFilter{SchedulerState: &queryState})
|
||||
}
|
||||
// if healthFilter != "all" {
|
||||
// filters = append(filters, &model.NodeFilter{HealthState: &healthFilter})
|
||||
// }
|
||||
|
||||
// Special Case: Disable Paging for missing nodes filter, save IPP for later
|
||||
var backupItems int
|
||||
if stateFilter == "notindb" {
|
||||
backupItems = page.ItemsPerPage
|
||||
page.ItemsPerPage = -1
|
||||
}
|
||||
|
||||
// Query Nodes From DB
|
||||
rawNodes, serr := r.QueryNodes(ctx, queryFilters, page, nil) // Order not Used
|
||||
if serr != nil {
|
||||
cclog.Warn("error while loading node database data (Resolver.NodeMetricsList)")
|
||||
return nil, nil, 0, false, serr
|
||||
}
|
||||
|
||||
// Intermediate Node Result Info
|
||||
for _, node := range rawNodes {
|
||||
if node == nil {
|
||||
continue
|
||||
}
|
||||
nodes = append(nodes, node.Hostname)
|
||||
stateMap[node.Hostname] = string(node.NodeState)
|
||||
}
|
||||
|
||||
// Special Case: Find Nodes not in DB node table but in metricStore only
|
||||
if stateFilter == "notindb" {
|
||||
// Reapply Original Paging
|
||||
page.ItemsPerPage = backupItems
|
||||
// Get Nodes From Topology
|
||||
var topoNodes []string
|
||||
if subCluster != "" {
|
||||
scNodes := archive.NodeLists[cluster][subCluster]
|
||||
topoNodes = scNodes.PrintList()
|
||||
} else {
|
||||
subClusterNodeLists := archive.NodeLists[cluster]
|
||||
for _, nodeList := range subClusterNodeLists {
|
||||
topoNodes = append(topoNodes, nodeList.PrintList()...)
|
||||
}
|
||||
}
|
||||
// Compare to all nodes from cluster/subcluster in DB
|
||||
var missingNodes []string
|
||||
for _, scanNode := range topoNodes {
|
||||
if !slices.Contains(nodes, scanNode) {
|
||||
missingNodes = append(missingNodes, scanNode)
|
||||
}
|
||||
}
|
||||
// Filter nodes by name
|
||||
if nodeFilter != "" {
|
||||
filteredNodesByName := []string{}
|
||||
for _, missingNode := range missingNodes {
|
||||
if strings.Contains(missingNode, nodeFilter) {
|
||||
filteredNodesByName = append(filteredNodesByName, missingNode)
|
||||
}
|
||||
}
|
||||
missingNodes = filteredNodesByName
|
||||
}
|
||||
// Sort Missing Nodes Alphanumerically
|
||||
slices.Sort(missingNodes)
|
||||
// Total Missing
|
||||
countNodes = len(missingNodes)
|
||||
// Apply paging
|
||||
if countNodes > page.ItemsPerPage {
|
||||
start := (page.Page - 1) * page.ItemsPerPage
|
||||
end := start + page.ItemsPerPage
|
||||
if end > countNodes {
|
||||
end = countNodes
|
||||
hasNextPage = false
|
||||
} else {
|
||||
hasNextPage = true
|
||||
}
|
||||
nodes = missingNodes[start:end]
|
||||
} else {
|
||||
nodes = missingNodes
|
||||
}
|
||||
|
||||
} else {
|
||||
// DB Nodes: Count and derive hasNextPage from count
|
||||
var cerr error
|
||||
countNodes, cerr = r.CountNodes(ctx, queryFilters)
|
||||
if cerr != nil {
|
||||
cclog.Warn("error while counting node database data (Resolver.NodeMetricsList)")
|
||||
return nil, nil, 0, false, cerr
|
||||
}
|
||||
hasNextPage = page.Page*page.ItemsPerPage < countNodes
|
||||
}
|
||||
|
||||
// Fallback for non-init'd node table in DB; Ignores stateFilter
|
||||
if stateFilter == "all" && countNodes == 0 {
|
||||
nodes, countNodes, hasNextPage = getNodesFromTopol(cluster, subCluster, nodeFilter, page)
|
||||
}
|
||||
|
||||
return nodes, stateMap, countNodes, hasNextPage, nil
|
||||
}
|
||||
|
||||
func AccessCheck(ctx context.Context, query sq.SelectBuilder) (sq.SelectBuilder, error) {
|
||||
user := GetUserFromContext(ctx)
|
||||
return AccessCheckWithUser(user, query)
|
||||
@@ -661,3 +897,51 @@ func AccessCheckWithUser(user *schema.User, query sq.SelectBuilder) (sq.SelectBu
|
||||
return qnil, fmt.Errorf("user has no or unknown roles")
|
||||
}
|
||||
}
|
||||
|
||||
func getNodesFromTopol(cluster string, subCluster string, nodeFilter string, page *model.PageRequest) ([]string, int, bool) {
|
||||
// 0) Init additional vars
|
||||
hasNextPage := false
|
||||
totalNodes := 0
|
||||
|
||||
// 1) Get list of all nodes
|
||||
var topolNodes []string
|
||||
if subCluster != "" {
|
||||
scNodes := archive.NodeLists[cluster][subCluster]
|
||||
topolNodes = scNodes.PrintList()
|
||||
} else {
|
||||
subClusterNodeLists := archive.NodeLists[cluster]
|
||||
for _, nodeList := range subClusterNodeLists {
|
||||
topolNodes = append(topolNodes, nodeList.PrintList()...)
|
||||
}
|
||||
}
|
||||
|
||||
// 2) Filter nodes
|
||||
if nodeFilter != "" {
|
||||
filteredNodes := []string{}
|
||||
for _, node := range topolNodes {
|
||||
if strings.Contains(node, nodeFilter) {
|
||||
filteredNodes = append(filteredNodes, node)
|
||||
}
|
||||
}
|
||||
topolNodes = filteredNodes
|
||||
}
|
||||
|
||||
// 2.1) Count total nodes && Sort nodes -> Sorting invalidated after ccms return ...
|
||||
totalNodes = len(topolNodes)
|
||||
sort.Strings(topolNodes)
|
||||
|
||||
// 3) Apply paging
|
||||
if len(topolNodes) > page.ItemsPerPage {
|
||||
start := (page.Page - 1) * page.ItemsPerPage
|
||||
end := start + page.ItemsPerPage
|
||||
if end >= len(topolNodes) {
|
||||
end = len(topolNodes)
|
||||
hasNextPage = false
|
||||
} else {
|
||||
hasNextPage = true
|
||||
}
|
||||
topolNodes = topolNodes[start:end]
|
||||
}
|
||||
|
||||
return topolNodes, totalNodes, hasNextPage
|
||||
}
|
||||
|
||||
@@ -15,9 +15,9 @@ import (
|
||||
|
||||
"github.com/ClusterCockpit/cc-backend/internal/config"
|
||||
"github.com/ClusterCockpit/cc-backend/pkg/archive"
|
||||
ccconf "github.com/ClusterCockpit/cc-lib/ccConfig"
|
||||
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
|
||||
"github.com/ClusterCockpit/cc-lib/schema"
|
||||
ccconf "github.com/ClusterCockpit/cc-lib/v2/ccConfig"
|
||||
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
|
||||
"github.com/ClusterCockpit/cc-lib/v2/schema"
|
||||
_ "github.com/mattn/go-sqlite3"
|
||||
)
|
||||
|
||||
@@ -26,7 +26,7 @@ func nodeTestSetup(t *testing.T) {
|
||||
"main": {
|
||||
"addr": "0.0.0.0:8080",
|
||||
"validate": false,
|
||||
"apiAllowedIPs": [
|
||||
"api-allowed-ips": [
|
||||
"*"
|
||||
]
|
||||
},
|
||||
@@ -38,18 +38,7 @@ func nodeTestSetup(t *testing.T) {
|
||||
"jwts": {
|
||||
"max-age": "2m"
|
||||
}
|
||||
},
|
||||
"clusters": [
|
||||
{
|
||||
"name": "testcluster",
|
||||
"metricDataRepository": {"kind": "test", "url": "bla:8081"},
|
||||
"filterRanges": {
|
||||
"numNodes": { "from": 1, "to": 64 },
|
||||
"duration": { "from": 0, "to": 86400 },
|
||||
"startTime": { "from": "2022-01-01T00:00:00Z", "to": null }
|
||||
}
|
||||
}
|
||||
]
|
||||
}`
|
||||
const testclusterJSON = `{
|
||||
"name": "testcluster",
|
||||
@@ -130,7 +119,7 @@ func nodeTestSetup(t *testing.T) {
|
||||
}
|
||||
|
||||
dbfilepath := filepath.Join(tmpdir, "test.db")
|
||||
err := MigrateDB("sqlite3", dbfilepath)
|
||||
err := MigrateDB(dbfilepath)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
@@ -144,19 +133,22 @@ func nodeTestSetup(t *testing.T) {
|
||||
|
||||
// Load and check main configuration
|
||||
if cfg := ccconf.GetPackageConfig("main"); cfg != nil {
|
||||
if clustercfg := ccconf.GetPackageConfig("clusters"); clustercfg != nil {
|
||||
config.Init(cfg, clustercfg)
|
||||
} else {
|
||||
cclog.Abort("Cluster configuration must be present")
|
||||
}
|
||||
config.Init(cfg)
|
||||
} else {
|
||||
cclog.Abort("Main configuration must be present")
|
||||
}
|
||||
archiveCfg := fmt.Sprintf("{\"kind\": \"file\",\"path\": \"%s\"}", jobarchive)
|
||||
|
||||
Connect("sqlite3", dbfilepath)
|
||||
if err := ResetConnection(); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
t.Cleanup(func() {
|
||||
ResetConnection()
|
||||
})
|
||||
|
||||
if err := archive.Init(json.RawMessage(archiveCfg), config.Keys.DisableArchive); err != nil {
|
||||
Connect(dbfilepath)
|
||||
|
||||
if err := archive.Init(json.RawMessage(archiveCfg)); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
}
|
||||
@@ -164,8 +156,12 @@ func nodeTestSetup(t *testing.T) {
|
||||
func TestUpdateNodeState(t *testing.T) {
|
||||
nodeTestSetup(t)
|
||||
|
||||
repo := GetNodeRepository()
|
||||
now := time.Now().Unix()
|
||||
|
||||
nodeState := schema.NodeStateDB{
|
||||
TimeStamp: time.Now().Unix(), NodeState: "allocated",
|
||||
TimeStamp: now,
|
||||
NodeState: "allocated",
|
||||
CpusAllocated: 72,
|
||||
MemoryAllocated: 480,
|
||||
GpusAllocated: 0,
|
||||
@@ -173,18 +169,152 @@ func TestUpdateNodeState(t *testing.T) {
|
||||
JobsRunning: 1,
|
||||
}
|
||||
|
||||
repo := GetNodeRepository()
|
||||
err := repo.UpdateNodeState("host124", "testcluster", &nodeState)
|
||||
if err != nil {
|
||||
return
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
node, err := repo.GetNode("host124", "testcluster", false)
|
||||
if err != nil {
|
||||
return
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
if node.NodeState != "allocated" {
|
||||
t.Errorf("wrong node state\ngot: %s \nwant: allocated ", node.NodeState)
|
||||
}
|
||||
|
||||
t.Run("FindBeforeEmpty", func(t *testing.T) {
|
||||
// Only the current-timestamp row exists, so nothing should be found before now
|
||||
rows, err := repo.FindNodeStatesBefore(now)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if len(rows) != 0 {
|
||||
t.Errorf("expected 0 rows, got %d", len(rows))
|
||||
}
|
||||
})
|
||||
|
||||
t.Run("DeleteOldRows", func(t *testing.T) {
|
||||
// Insert 2 more old rows for host124
|
||||
for i, ts := range []int64{now - 7200, now - 3600} {
|
||||
ns := schema.NodeStateDB{
|
||||
TimeStamp: ts,
|
||||
NodeState: "allocated",
|
||||
HealthState: schema.MonitoringStateFull,
|
||||
CpusAllocated: 72,
|
||||
MemoryAllocated: 480,
|
||||
JobsRunning: i,
|
||||
}
|
||||
if err := repo.UpdateNodeState("host124", "testcluster", &ns); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
}
|
||||
|
||||
// Delete rows older than 30 minutes
|
||||
cutoff := now - 1800
|
||||
cnt, err := repo.DeleteNodeStatesBefore(cutoff)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
// Should delete the 2 old rows
|
||||
if cnt != 2 {
|
||||
t.Errorf("expected 2 deleted rows, got %d", cnt)
|
||||
}
|
||||
|
||||
// Latest row should still exist
|
||||
node, err := repo.GetNode("host124", "testcluster", false)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if node.NodeState != "allocated" {
|
||||
t.Errorf("expected node state 'allocated', got %s", node.NodeState)
|
||||
}
|
||||
})
|
||||
|
||||
t.Run("PreservesLatestPerNode", func(t *testing.T) {
|
||||
// Insert a single old row for host125 — it's the latest per node so it must survive
|
||||
ns := schema.NodeStateDB{
|
||||
TimeStamp: now - 7200,
|
||||
NodeState: "idle",
|
||||
HealthState: schema.MonitoringStateFull,
|
||||
CpusAllocated: 0,
|
||||
MemoryAllocated: 0,
|
||||
JobsRunning: 0,
|
||||
}
|
||||
if err := repo.UpdateNodeState("host125", "testcluster", &ns); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
// Delete everything older than now — the latest per node should be preserved
|
||||
_, err := repo.DeleteNodeStatesBefore(now)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
// The latest row for host125 must still exist
|
||||
node, err := repo.GetNode("host125", "testcluster", false)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if node.NodeState != "idle" {
|
||||
t.Errorf("expected node state 'idle', got %s", node.NodeState)
|
||||
}
|
||||
|
||||
// Verify exactly 1 row remains for host125
|
||||
var countAfter int
|
||||
if err := repo.DB.QueryRow(
|
||||
"SELECT COUNT(*) FROM node_state WHERE node_id = (SELECT id FROM node WHERE hostname = 'host125')").
|
||||
Scan(&countAfter); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if countAfter != 1 {
|
||||
t.Errorf("expected 1 row remaining for host125, got %d", countAfter)
|
||||
}
|
||||
})
|
||||
|
||||
t.Run("FindBeforeWithJoin", func(t *testing.T) {
|
||||
// Insert old and current rows for host123
|
||||
for _, ts := range []int64{now - 7200, now} {
|
||||
ns := schema.NodeStateDB{
|
||||
TimeStamp: ts,
|
||||
NodeState: "allocated",
|
||||
HealthState: schema.MonitoringStateFull,
|
||||
CpusAllocated: 8,
|
||||
MemoryAllocated: 1024,
|
||||
GpusAllocated: 1,
|
||||
JobsRunning: 1,
|
||||
}
|
||||
if err := repo.UpdateNodeState("host123", "testcluster", &ns); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
}
|
||||
|
||||
// Find rows older than 30 minutes, excluding latest per node
|
||||
cutoff := now - 1800
|
||||
rows, err := repo.FindNodeStatesBefore(cutoff)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
// Should find the old host123 row
|
||||
found := false
|
||||
for _, row := range rows {
|
||||
if row.Hostname == "host123" && row.TimeStamp == now-7200 {
|
||||
found = true
|
||||
if row.Cluster != "testcluster" {
|
||||
t.Errorf("expected cluster 'testcluster', got %s", row.Cluster)
|
||||
}
|
||||
if row.SubCluster != "sc1" {
|
||||
t.Errorf("expected subcluster 'sc1', got %s", row.SubCluster)
|
||||
}
|
||||
if row.CpusAllocated != 8 {
|
||||
t.Errorf("expected cpus_allocated 8, got %d", row.CpusAllocated)
|
||||
}
|
||||
}
|
||||
}
|
||||
if !found {
|
||||
t.Errorf("expected to find old host123 row among %d results", len(rows))
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
@@ -6,11 +6,13 @@ package repository
|
||||
|
||||
import (
|
||||
"context"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"testing"
|
||||
|
||||
"github.com/ClusterCockpit/cc-backend/internal/graph/model"
|
||||
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
|
||||
"github.com/ClusterCockpit/cc-lib/schema"
|
||||
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
|
||||
"github.com/ClusterCockpit/cc-lib/v2/schema"
|
||||
_ "github.com/mattn/go-sqlite3"
|
||||
)
|
||||
|
||||
@@ -46,7 +48,7 @@ func BenchmarkSelect1(b *testing.B) {
|
||||
}
|
||||
|
||||
func BenchmarkDB_FindJobById(b *testing.B) {
|
||||
var jobId int64 = 1677322
|
||||
var jobID int64 = 1677322
|
||||
|
||||
b.Run("FindJobById", func(b *testing.B) {
|
||||
db := setup(b)
|
||||
@@ -55,7 +57,7 @@ func BenchmarkDB_FindJobById(b *testing.B) {
|
||||
|
||||
b.RunParallel(func(pb *testing.PB) {
|
||||
for pb.Next() {
|
||||
_, err := db.FindById(getContext(b), jobId)
|
||||
_, err := db.FindByID(getContext(b), jobID)
|
||||
noErr(b, err)
|
||||
}
|
||||
})
|
||||
@@ -63,7 +65,7 @@ func BenchmarkDB_FindJobById(b *testing.B) {
|
||||
}
|
||||
|
||||
func BenchmarkDB_FindJob(b *testing.B) {
|
||||
var jobId int64 = 107266
|
||||
var jobID int64 = 107266
|
||||
var startTime int64 = 1657557241
|
||||
cluster := "fritz"
|
||||
|
||||
@@ -74,7 +76,7 @@ func BenchmarkDB_FindJob(b *testing.B) {
|
||||
|
||||
b.RunParallel(func(pb *testing.PB) {
|
||||
for pb.Next() {
|
||||
_, err := db.Find(&jobId, &cluster, &startTime)
|
||||
_, err := db.Find(&jobID, &cluster, &startTime)
|
||||
noErr(b, err)
|
||||
}
|
||||
})
|
||||
@@ -148,10 +150,24 @@ func getContext(tb testing.TB) context.Context {
|
||||
func setup(tb testing.TB) *JobRepository {
|
||||
tb.Helper()
|
||||
cclog.Init("warn", true)
|
||||
dbfile := "testdata/job.db"
|
||||
err := MigrateDB("sqlite3", dbfile)
|
||||
|
||||
// Copy test DB to a temp file for test isolation
|
||||
srcData, err := os.ReadFile("testdata/job.db")
|
||||
noErr(tb, err)
|
||||
Connect("sqlite3", dbfile)
|
||||
dbfile := filepath.Join(tb.TempDir(), "job.db")
|
||||
err = os.WriteFile(dbfile, srcData, 0o644)
|
||||
noErr(tb, err)
|
||||
|
||||
// Reset singletons so Connect uses the new temp DB
|
||||
err = ResetConnection()
|
||||
noErr(tb, err)
|
||||
tb.Cleanup(func() {
|
||||
ResetConnection()
|
||||
})
|
||||
|
||||
err = MigrateDB(dbfile)
|
||||
noErr(tb, err)
|
||||
Connect(dbfile)
|
||||
return GetJobRepository()
|
||||
}
|
||||
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user