Merge branch 'hotfix' of github.com:ClusterCockpit/cc-backend into hotfix

2026-03-11 05:06:26 +01:00
317 changed files with 32717 additions and 15040 deletions


@@ -23,47 +23,45 @@ import (
"github.com/ClusterCockpit/cc-backend/internal/auth"
"github.com/ClusterCockpit/cc-backend/internal/config"
"github.com/ClusterCockpit/cc-backend/internal/graph"
"github.com/ClusterCockpit/cc-backend/internal/metricDataDispatcher"
"github.com/ClusterCockpit/cc-backend/internal/metricdata"
"github.com/ClusterCockpit/cc-backend/internal/metricdispatch"
"github.com/ClusterCockpit/cc-backend/internal/repository"
"github.com/ClusterCockpit/cc-backend/pkg/archive"
ccconf "github.com/ClusterCockpit/cc-lib/ccConfig"
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
"github.com/ClusterCockpit/cc-lib/schema"
"github.com/gorilla/mux"
"github.com/ClusterCockpit/cc-backend/pkg/metricstore"
ccconf "github.com/ClusterCockpit/cc-lib/v2/ccConfig"
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
"github.com/ClusterCockpit/cc-lib/v2/schema"
"github.com/go-chi/chi/v5"
_ "github.com/mattn/go-sqlite3"
)
func setup(t *testing.T) *api.RestAPI {
repository.ResetConnection()
const testconfig = `{
"main": {
"addr": "0.0.0.0:8080",
"validate": false,
"apiAllowedIPs": [
"*"
]
},
"main": {
"addr": "0.0.0.0:8080",
"validate": false,
"api-allowed-ips": [
"*"
]
},
"metric-store": {
"checkpoints": {
"interval": "12h"
},
"retention-in-memory": "48h",
"memory-cap": 100
},
"archive": {
"kind": "file",
"path": "./var/job-archive"
"kind": "file",
"path": "./var/job-archive"
},
"auth": {
"jwts": {
"max-age": "2m"
"jwts": {
"max-age": "2m"
}
}
},
"clusters": [
{
"name": "testcluster",
"metricDataRepository": {"kind": "test", "url": "bla:8081"},
"filterRanges": {
"numNodes": { "from": 1, "to": 64 },
"duration": { "from": 0, "to": 86400 },
"startTime": { "from": "2022-01-01T00:00:00Z", "to": null }
}
}
]
}`
const testclusterJSON = `{
"name": "testcluster",
@@ -141,7 +139,7 @@ func setup(t *testing.T) *api.RestAPI {
}
dbfilepath := filepath.Join(tmpdir, "test.db")
err := repository.MigrateDB("sqlite3", dbfilepath)
err := repository.MigrateDB(dbfilepath)
if err != nil {
t.Fatal(err)
}
@@ -152,28 +150,23 @@ func setup(t *testing.T) *api.RestAPI {
}
ccconf.Init(cfgFilePath)
metricstore.MetricStoreHandle = &metricstore.InternalMetricStore{}
// Load and check main configuration
if cfg := ccconf.GetPackageConfig("main"); cfg != nil {
if clustercfg := ccconf.GetPackageConfig("clusters"); clustercfg != nil {
config.Init(cfg, clustercfg)
} else {
cclog.Abort("Cluster configuration must be present")
}
config.Init(cfg)
} else {
cclog.Abort("Main configuration must be present")
}
archiveCfg := fmt.Sprintf("{\"kind\": \"file\",\"path\": \"%s\"}", jobarchive)
repository.Connect("sqlite3", dbfilepath)
repository.Connect(dbfilepath)
if err := archive.Init(json.RawMessage(archiveCfg), config.Keys.DisableArchive); err != nil {
if err := archive.Init(json.RawMessage(archiveCfg)); err != nil {
t.Fatal(err)
}
if err := metricdata.Init(); err != nil {
t.Fatal(err)
}
// metricstore initialization removed - it's initialized via callback in tests
archiver.Start(repository.GetJobRepository(), context.Background())
@@ -190,11 +183,9 @@ func setup(t *testing.T) *api.RestAPI {
}
func cleanup() {
// Gracefully shut down the archiver with a timeout
if err := archiver.Shutdown(5 * time.Second); err != nil {
cclog.Warnf("Archiver shutdown timeout in tests: %v", err)
}
// TODO: Clear all caches, reset all modules, etc...
}
/*
@@ -221,16 +212,14 @@ func TestRestApi(t *testing.T) {
},
}
metricdata.TestLoadDataCallback = func(job *schema.Job, metrics []string, scopes []schema.MetricScope, ctx context.Context, resolution int) (schema.JobData, error) {
metricstore.TestLoadDataCallback = func(job *schema.Job, metrics []string, scopes []schema.MetricScope, ctx context.Context, resolution int) (schema.JobData, error) {
return testData, nil
}
r := mux.NewRouter()
r.PathPrefix("/api").Subrouter()
r.StrictSlash(true)
r := chi.NewRouter()
restapi.MountAPIRoutes(r)
var TestJobId int64 = 123
var TestJobID int64 = 123
TestClusterName := "testcluster"
var TestStartTime int64 = 123456789
@@ -280,7 +269,7 @@ func TestRestApi(t *testing.T) {
}
// resolver := graph.GetResolverInstance()
restapi.JobRepository.SyncJobs()
job, err := restapi.JobRepository.Find(&TestJobId, &TestClusterName, &TestStartTime)
job, err := restapi.JobRepository.Find(&TestJobID, &TestClusterName, &TestStartTime)
if err != nil {
t.Fatal(err)
}
@@ -338,7 +327,7 @@ func TestRestApi(t *testing.T) {
}
// Archiving happens asynchronously, will be completed in cleanup
job, err := restapi.JobRepository.Find(&TestJobId, &TestClusterName, &TestStartTime)
job, err := restapi.JobRepository.Find(&TestJobID, &TestClusterName, &TestStartTime)
if err != nil {
t.Fatal(err)
}
@@ -366,7 +355,7 @@ func TestRestApi(t *testing.T) {
}
t.Run("CheckArchive", func(t *testing.T) {
data, err := metricDataDispatcher.LoadData(stoppedJob, []string{"load_one"}, []schema.MetricScope{schema.MetricScopeNode}, context.Background(), 60)
data, err := metricdispatch.LoadData(stoppedJob, []string{"load_one"}, []schema.MetricScope{schema.MetricScopeNode}, context.Background(), 60)
if err != nil {
t.Fatal(err)
}
@@ -464,4 +453,198 @@ func TestRestApi(t *testing.T) {
if !ok {
t.Fatal("subtest failed")
}
t.Run("GetUsedNodesNoRunning", func(t *testing.T) {
contextUserValue := &schema.User{
Username: "testuser",
Projects: make([]string, 0),
Roles: []string{"api"},
AuthType: 0,
AuthSource: 2,
}
req := httptest.NewRequest(http.MethodGet, "/jobs/used_nodes?ts=123456790", nil)
recorder := httptest.NewRecorder()
ctx := context.WithValue(req.Context(), contextUserKey, contextUserValue)
r.ServeHTTP(recorder, req.WithContext(ctx))
response := recorder.Result()
if response.StatusCode != http.StatusOK {
t.Fatal(response.Status, recorder.Body.String())
}
var result api.GetUsedNodesAPIResponse
if err := json.NewDecoder(response.Body).Decode(&result); err != nil {
t.Fatal(err)
}
if result.UsedNodes == nil {
t.Fatal("expected usedNodes to be non-nil")
}
if len(result.UsedNodes) != 0 {
t.Fatalf("expected no used nodes for stopped jobs, got: %v", result.UsedNodes)
}
})
}
// TestStopJobWithReusedJobId verifies that stopping a recently started job works
// even when an older job with the same jobId exists in the job table (e.g. with
// state "failed"). This is a regression test for the bug where Find() on the job
// table would match the old job instead of the new one still in job_cache.
func TestStopJobWithReusedJobId(t *testing.T) {
restapi := setup(t)
t.Cleanup(cleanup)
testData := schema.JobData{
"load_one": map[schema.MetricScope]*schema.JobMetric{
schema.MetricScopeNode: {
Unit: schema.Unit{Base: "load"},
Timestep: 60,
Series: []schema.Series{
{
Hostname: "host123",
Statistics: schema.MetricStatistics{Min: 0.1, Avg: 0.2, Max: 0.3},
Data: []schema.Float{0.1, 0.1, 0.1, 0.2, 0.2, 0.2, 0.3, 0.3, 0.3},
},
},
},
},
}
metricstore.TestLoadDataCallback = func(job *schema.Job, metrics []string, scopes []schema.MetricScope, ctx context.Context, resolution int) (schema.JobData, error) {
return testData, nil
}
r := chi.NewRouter()
restapi.MountAPIRoutes(r)
const contextUserKey repository.ContextKey = "user"
contextUserValue := &schema.User{
Username: "testuser",
Projects: make([]string, 0),
Roles: []string{"user"},
AuthType: 0,
AuthSource: 2,
}
// Step 1: Start the first job (jobId=999)
const startJobBody1 string = `{
"jobId": 999,
"user": "testuser",
"project": "testproj",
"cluster": "testcluster",
"partition": "default",
"walltime": 3600,
"numNodes": 1,
"numHwthreads": 8,
"numAcc": 0,
"shared": "none",
"monitoringStatus": 1,
"smt": 1,
"resources": [{"hostname": "host123", "hwthreads": [0, 1, 2, 3, 4, 5, 6, 7]}],
"startTime": 200000000
}`
if ok := t.Run("StartFirstJob", func(t *testing.T) {
req := httptest.NewRequest(http.MethodPost, "/jobs/start_job/", bytes.NewBuffer([]byte(startJobBody1)))
recorder := httptest.NewRecorder()
ctx := context.WithValue(req.Context(), contextUserKey, contextUserValue)
r.ServeHTTP(recorder, req.WithContext(ctx))
if recorder.Result().StatusCode != http.StatusCreated {
t.Fatal(recorder.Result().Status, recorder.Body.String())
}
}); !ok {
return
}
// Step 2: Sync to move job from cache to job table, then stop it as "failed"
time.Sleep(1 * time.Second)
restapi.JobRepository.SyncJobs()
const stopJobBody1 string = `{
"jobId": 999,
"startTime": 200000000,
"cluster": "testcluster",
"jobState": "failed",
"stopTime": 200001000
}`
if ok := t.Run("StopFirstJobAsFailed", func(t *testing.T) {
req := httptest.NewRequest(http.MethodPost, "/jobs/stop_job/", bytes.NewBuffer([]byte(stopJobBody1)))
recorder := httptest.NewRecorder()
ctx := context.WithValue(req.Context(), contextUserKey, contextUserValue)
r.ServeHTTP(recorder, req.WithContext(ctx))
if recorder.Result().StatusCode != http.StatusOK {
t.Fatal(recorder.Result().Status, recorder.Body.String())
}
jobid, cluster := int64(999), "testcluster"
job, err := restapi.JobRepository.Find(&jobid, &cluster, nil)
if err != nil {
t.Fatal(err)
}
if job.State != schema.JobStateFailed {
t.Fatalf("expected first job to be failed, got: %s", job.State)
}
}); !ok {
return
}
// Wait for archiving to complete
time.Sleep(1 * time.Second)
// Step 3: Start a NEW job with the same jobId=999 but different startTime.
// This job will sit in job_cache (not yet synced).
const startJobBody2 string = `{
"jobId": 999,
"user": "testuser",
"project": "testproj",
"cluster": "testcluster",
"partition": "default",
"walltime": 3600,
"numNodes": 1,
"numHwthreads": 8,
"numAcc": 0,
"shared": "none",
"monitoringStatus": 1,
"smt": 1,
"resources": [{"hostname": "host123", "hwthreads": [0, 1, 2, 3, 4, 5, 6, 7]}],
"startTime": 300000000
}`
if ok := t.Run("StartSecondJob", func(t *testing.T) {
req := httptest.NewRequest(http.MethodPost, "/jobs/start_job/", bytes.NewBuffer([]byte(startJobBody2)))
recorder := httptest.NewRecorder()
ctx := context.WithValue(req.Context(), contextUserKey, contextUserValue)
r.ServeHTTP(recorder, req.WithContext(ctx))
if recorder.Result().StatusCode != http.StatusCreated {
t.Fatal(recorder.Result().Status, recorder.Body.String())
}
}); !ok {
return
}
// Step 4: Stop the second job WITHOUT syncing first.
// Before the fix, this would fail because Find() on the job table would
// match the old failed job (jobId=999) and reject with "already stopped".
const stopJobBody2 string = `{
"jobId": 999,
"startTime": 300000000,
"cluster": "testcluster",
"jobState": "completed",
"stopTime": 300001000
}`
t.Run("StopSecondJobBeforeSync", func(t *testing.T) {
req := httptest.NewRequest(http.MethodPost, "/jobs/stop_job/", bytes.NewBuffer([]byte(stopJobBody2)))
recorder := httptest.NewRecorder()
ctx := context.WithValue(req.Context(), contextUserKey, contextUserValue)
r.ServeHTTP(recorder, req.WithContext(ctx))
if recorder.Result().StatusCode != http.StatusOK {
t.Fatalf("expected stop to succeed for cached job, got: %s %s",
recorder.Result().Status, recorder.Body.String())
}
})
}


@@ -13,7 +13,7 @@ import (
"github.com/ClusterCockpit/cc-backend/internal/repository"
"github.com/ClusterCockpit/cc-backend/pkg/archive"
"github.com/ClusterCockpit/cc-lib/schema"
"github.com/ClusterCockpit/cc-lib/v2/schema"
)
// GetClustersAPIResponse model
@@ -27,7 +27,7 @@ type GetClustersAPIResponse struct {
// @description Get a list of all cluster configs. Specific cluster can be requested using query parameter.
// @produce json
// @param cluster query string false "Job Cluster"
// @success 200 {object} api.GetClustersApiResponse "Array of clusters"
// @success 200 {object} api.GetClustersAPIResponse "Array of clusters"
// @failure 400 {object} api.ErrorResponse "Bad Request"
// @failure 401 {object} api.ErrorResponse "Unauthorized"
// @failure 403 {object} api.ErrorResponse "Forbidden"
@@ -36,9 +36,9 @@ type GetClustersAPIResponse struct {
// @router /api/clusters/ [get]
func (api *RestAPI) getClusters(rw http.ResponseWriter, r *http.Request) {
if user := repository.GetUserFromContext(r.Context()); user != nil &&
!user.HasRole(schema.RoleApi) {
!user.HasRole(schema.RoleAPI) {
handleError(fmt.Errorf("missing role: %v", schema.GetRoleString(schema.RoleApi)), http.StatusForbidden, rw)
handleError(fmt.Errorf("missing role: %v", schema.GetRoleString(schema.RoleAPI)), http.StatusForbidden, rw)
return
}
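A minimal client-side sketch of calling the clusters endpoint documented above; the host address and the bearer-token auth header are assumptions about the deployment, not part of this diff:

package main

import (
	"fmt"
	"io"
	"net/http"
)

func main() {
	// Hypothetical server address and JWT; adjust for your deployment.
	req, err := http.NewRequest(http.MethodGet,
		"http://localhost:8080/api/clusters/?cluster=testcluster", nil)
	if err != nil {
		panic(err)
	}
	req.Header.Set("Authorization", "Bearer <JWT>")
	resp, err := http.DefaultClient.Do(req)
	if err != nil {
		panic(err)
	}
	defer resp.Body.Close()
	body, _ := io.ReadAll(resp.Body)
	fmt.Println(resp.Status, string(body))
}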

File diff suppressed because it is too large.


@@ -22,12 +22,12 @@ import (
"github.com/ClusterCockpit/cc-backend/internal/graph"
"github.com/ClusterCockpit/cc-backend/internal/graph/model"
"github.com/ClusterCockpit/cc-backend/internal/importer"
"github.com/ClusterCockpit/cc-backend/internal/metricDataDispatcher"
"github.com/ClusterCockpit/cc-backend/internal/metricdispatch"
"github.com/ClusterCockpit/cc-backend/internal/repository"
"github.com/ClusterCockpit/cc-backend/pkg/archive"
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
"github.com/ClusterCockpit/cc-lib/schema"
"github.com/gorilla/mux"
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
"github.com/ClusterCockpit/cc-lib/v2/schema"
"github.com/go-chi/chi/v5"
)
const (
@@ -72,6 +72,14 @@ type EditMetaRequest struct {
Value string `json:"value" example:"bash script"`
}
// JobMetaRequest model
type JobMetaRequest struct {
JobId *int64 `json:"jobId" validate:"required" example:"123000"` // Cluster Job ID of job
Cluster *string `json:"cluster" example:"fritz"` // Cluster of job
StartTime *int64 `json:"startTime" example:"1649723812"` // Start Time of job as epoch
Payload EditMetaRequest `json:"payload"` // Content to Add to Job Meta_Data
}
type TagJobAPIRequest []*APITag
type GetJobAPIRequest []string
@@ -104,7 +112,7 @@ type JobMetricWithName struct {
// @param items-per-page query int false "Items per page (Default: 25)"
// @param page query int false "Page Number (Default: 1)"
// @param with-metadata query bool false "Include metadata (e.g. jobScript) in response"
// @success 200 {object} api.GetJobsApiResponse "Job array and page info"
// @success 200 {object} api.GetJobsAPIResponse "Job array and page info"
// @failure 400 {object} api.ErrorResponse "Bad Request"
// @failure 401 {object} api.ErrorResponse "Unauthorized"
// @failure 403 {object} api.ErrorResponse "Forbidden"
@@ -232,7 +240,7 @@ func (api *RestAPI) getJobs(rw http.ResponseWriter, r *http.Request) {
// @produce json
// @param id path int true "Database ID of Job"
// @param all-metrics query bool false "Include all available metrics"
// @success 200 {object} api.GetJobApiResponse "Job resource"
// @success 200 {object} api.GetJobAPIResponse "Job resource"
// @failure 400 {object} api.ErrorResponse "Bad Request"
// @failure 401 {object} api.ErrorResponse "Unauthorized"
// @failure 403 {object} api.ErrorResponse "Forbidden"
@@ -243,17 +251,17 @@ func (api *RestAPI) getJobs(rw http.ResponseWriter, r *http.Request) {
// @router /api/jobs/{id} [get]
func (api *RestAPI) getCompleteJobByID(rw http.ResponseWriter, r *http.Request) {
// Fetch job from db
id, ok := mux.Vars(r)["id"]
id := chi.URLParam(r, "id")
var job *schema.Job
var err error
if ok {
if id != "" {
id, e := strconv.ParseInt(id, 10, 64)
if e != nil {
handleError(fmt.Errorf("integer expected in path for id: %w", e), http.StatusBadRequest, rw)
return
}
job, err = api.JobRepository.FindById(r.Context(), id) // Get Job from Repo by ID
job, err = api.JobRepository.FindByID(r.Context(), id) // Get Job from Repo by ID
} else {
handleError(fmt.Errorf("the parameter 'id' is required"), http.StatusBadRequest, rw)
return
@@ -293,7 +301,7 @@ func (api *RestAPI) getCompleteJobByID(rw http.ResponseWriter, r *http.Request)
}
if r.URL.Query().Get("all-metrics") == "true" {
data, err = metricDataDispatcher.LoadData(job, nil, scopes, r.Context(), resolution)
data, err = metricdispatch.LoadData(job, nil, scopes, r.Context(), resolution)
if err != nil {
cclog.Warnf("REST: error while loading all-metrics job data for JobID %d on %s", job.JobID, job.Cluster)
return
@@ -324,8 +332,8 @@ func (api *RestAPI) getCompleteJobByID(rw http.ResponseWriter, r *http.Request)
// @accept json
// @produce json
// @param id path int true "Database ID of Job"
// @param request body api.GetJobApiRequest true "Array of metric names"
// @success 200 {object} api.GetJobApiResponse "Job resource"
// @param request body api.GetJobAPIRequest true "Array of metric names"
// @success 200 {object} api.GetJobAPIResponse "Job resource"
// @failure 400 {object} api.ErrorResponse "Bad Request"
// @failure 401 {object} api.ErrorResponse "Unauthorized"
// @failure 403 {object} api.ErrorResponse "Forbidden"
@@ -336,17 +344,17 @@ func (api *RestAPI) getCompleteJobByID(rw http.ResponseWriter, r *http.Request)
// @router /api/jobs/{id} [post]
func (api *RestAPI) getJobByID(rw http.ResponseWriter, r *http.Request) {
// Fetch job from db
id, ok := mux.Vars(r)["id"]
id := chi.URLParam(r, "id")
var job *schema.Job
var err error
if ok {
if id != "" {
id, e := strconv.ParseInt(id, 10, 64)
if e != nil {
handleError(fmt.Errorf("integer expected in path for id: %w", e), http.StatusBadRequest, rw)
return
}
job, err = api.JobRepository.FindById(r.Context(), id)
job, err = api.JobRepository.FindByID(r.Context(), id)
} else {
handleError(errors.New("the parameter 'id' is required"), http.StatusBadRequest, rw)
return
@@ -389,7 +397,7 @@ func (api *RestAPI) getJobByID(rw http.ResponseWriter, r *http.Request) {
resolution = max(resolution, mc.Timestep)
}
data, err := metricDataDispatcher.LoadData(job, metrics, scopes, r.Context(), resolution)
data, err := metricdispatch.LoadData(job, metrics, scopes, r.Context(), resolution)
if err != nil {
cclog.Warnf("REST: error while loading job data for JobID %d on %s", job.JobID, job.Cluster)
return
@@ -423,29 +431,29 @@ func (api *RestAPI) getJobByID(rw http.ResponseWriter, r *http.Request) {
}
// editMeta godoc
// @summary Edit meta-data json
// @summary Edit meta-data json of job identified by database id
// @tags Job add and modify
// @description Edit key value pairs in job metadata json
// @description Edit key value pairs in job metadata json of job specified by database id
// @description If a key already exists its content will be overwritten
// @accept json
// @produce json
// @param id path int true "Job Database ID"
// @param request body api.EditMetaRequest true "Kay value pair to add"
// @param request body api.EditMetaRequest true "Metadata Key value pair to add or update"
// @success 200 {object} schema.Job "Updated job resource"
// @failure 400 {object} api.ErrorResponse "Bad Request"
// @failure 401 {object} api.ErrorResponse "Unauthorized"
// @failure 404 {object} api.ErrorResponse "Job does not exist"
// @failure 500 {object} api.ErrorResponse "Internal Server Error"
// @security ApiKeyAuth
// @router /api/jobs/edit_meta/{id} [post]
// @router /api/jobs/edit_meta/{id} [patch]
func (api *RestAPI) editMeta(rw http.ResponseWriter, r *http.Request) {
id, err := strconv.ParseInt(mux.Vars(r)["id"], 10, 64)
id, err := strconv.ParseInt(chi.URLParam(r, "id"), 10, 64)
if err != nil {
handleError(fmt.Errorf("parsing job ID failed: %w", err), http.StatusBadRequest, rw)
return
}
job, err := api.JobRepository.FindById(r.Context(), id)
job, err := api.JobRepository.FindByID(r.Context(), id)
if err != nil {
handleError(fmt.Errorf("finding job failed: %w", err), http.StatusNotFound, rw)
return
@@ -469,6 +477,54 @@ func (api *RestAPI) editMeta(rw http.ResponseWriter, r *http.Request) {
}
}
// editMetaByRequest godoc
// @summary Edit meta-data json of job identified by request
// @tags Job add and modify
// @description Edit key value pairs in metadata json of job specified by jobId, startTime and cluster
// @description If a key already exists its content will be overwritten
// @accept json
// @produce json
// @param request body api.JobMetaRequest true "Specifies job and payload to add or update"
// @success 200 {object} schema.Job "Updated job resource"
// @failure 400 {object} api.ErrorResponse "Bad Request"
// @failure 401 {object} api.ErrorResponse "Unauthorized"
// @failure 404 {object} api.ErrorResponse "Job does not exist"
// @failure 500 {object} api.ErrorResponse "Internal Server Error"
// @security ApiKeyAuth
// @router /api/jobs/edit_meta/ [patch]
func (api *RestAPI) editMetaByRequest(rw http.ResponseWriter, r *http.Request) {
// Parse request body
req := JobMetaRequest{}
if err := decode(r.Body, &req); err != nil {
handleError(fmt.Errorf("parsing request body failed: %w", err), http.StatusBadRequest, rw)
return
}
// Fetch job (that will have its meta_data edited) from db
var job *schema.Job
var err error
if req.JobId == nil {
handleError(errors.New("the field 'jobId' is required"), http.StatusBadRequest, rw)
return
}
// log.Printf("loading db job for editMetaByRequest... : JobMetaRequest=%v", req)
job, err = api.JobRepository.Find(req.JobId, req.Cluster, req.StartTime)
if err != nil {
handleError(fmt.Errorf("finding job failed: %w", err), http.StatusUnprocessableEntity, rw)
return
}
if err := api.JobRepository.UpdateMetadata(job, req.Payload.Key, req.Payload.Value); err != nil {
http.Error(rw, err.Error(), http.StatusInternalServerError)
return
}
rw.Header().Add("Content-Type", "application/json")
rw.WriteHeader(http.StatusOK)
json.NewEncoder(rw).Encode(job)
}
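For illustration, a hedged sketch of a request against the new PATCH route. The jobId/cluster/startTime values are the examples from the JobMetaRequest struct tags above; the "key" field name inside payload is inferred from req.Payload.Key, and host/token are placeholders:

package main

import (
	"fmt"
	"net/http"
	"strings"
)

func main() {
	// Field names follow JobMetaRequest/EditMetaRequest above; "key" is inferred.
	body := `{
	  "jobId": 123000,
	  "cluster": "fritz",
	  "startTime": 1649723812,
	  "payload": {"key": "jobScript", "value": "bash script"}
	}`
	req, err := http.NewRequest(http.MethodPatch,
		"http://localhost:8080/api/jobs/edit_meta/", strings.NewReader(body))
	if err != nil {
		panic(err)
	}
	req.Header.Set("Content-Type", "application/json")
	req.Header.Set("Authorization", "Bearer <JWT>")
	resp, err := http.DefaultClient.Do(req)
	if err != nil {
		panic(err)
	}
	defer resp.Body.Close()
	fmt.Println(resp.Status)
}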
// tagJob godoc
// @summary Adds one or more tags to a job
// @tags Job add and modify
@@ -478,7 +534,7 @@ func (api *RestAPI) editMeta(rw http.ResponseWriter, r *http.Request) {
// @accept json
// @produce json
// @param id path int true "Job Database ID"
// @param request body api.TagJobApiRequest true "Array of tag-objects to add"
// @param request body api.TagJobAPIRequest true "Array of tag-objects to add"
// @success 200 {object} schema.Job "Updated job resource"
// @failure 400 {object} api.ErrorResponse "Bad Request"
// @failure 401 {object} api.ErrorResponse "Unauthorized"
@@ -487,13 +543,13 @@ func (api *RestAPI) editMeta(rw http.ResponseWriter, r *http.Request) {
// @security ApiKeyAuth
// @router /api/jobs/tag_job/{id} [post]
func (api *RestAPI) tagJob(rw http.ResponseWriter, r *http.Request) {
id, err := strconv.ParseInt(mux.Vars(r)["id"], 10, 64)
id, err := strconv.ParseInt(chi.URLParam(r, "id"), 10, 64)
if err != nil {
handleError(fmt.Errorf("parsing job ID failed: %w", err), http.StatusBadRequest, rw)
return
}
job, err := api.JobRepository.FindById(r.Context(), id)
job, err := api.JobRepository.FindByID(r.Context(), id)
if err != nil {
handleError(fmt.Errorf("finding job failed: %w", err), http.StatusNotFound, rw)
return
@@ -542,7 +598,7 @@ func (api *RestAPI) tagJob(rw http.ResponseWriter, r *http.Request) {
// @accept json
// @produce json
// @param id path int true "Job Database ID"
// @param request body api.TagJobApiRequest true "Array of tag-objects to remove"
// @param request body api.TagJobAPIRequest true "Array of tag-objects to remove"
// @success 200 {object} schema.Job "Updated job resource"
// @failure 400 {object} api.ErrorResponse "Bad Request"
// @failure 401 {object} api.ErrorResponse "Unauthorized"
@@ -551,13 +607,13 @@ func (api *RestAPI) tagJob(rw http.ResponseWriter, r *http.Request) {
// @security ApiKeyAuth
// @router /jobs/tag_job/{id} [delete]
func (api *RestAPI) removeTagJob(rw http.ResponseWriter, r *http.Request) {
id, err := strconv.ParseInt(mux.Vars(r)["id"], 10, 64)
id, err := strconv.ParseInt(chi.URLParam(r, "id"), 10, 64)
if err != nil {
handleError(fmt.Errorf("parsing job ID failed: %w", err), http.StatusBadRequest, rw)
return
}
job, err := api.JobRepository.FindById(r.Context(), id)
job, err := api.JobRepository.FindByID(r.Context(), id)
if err != nil {
handleError(fmt.Errorf("finding job failed: %w", err), http.StatusNotFound, rw)
return
@@ -606,7 +662,7 @@ func (api *RestAPI) removeTagJob(rw http.ResponseWriter, r *http.Request) {
// @description Tags will be removed from the respective archive files.
// @accept json
// @produce plain
// @param request body api.TagJobApiRequest true "Array of tag-objects to remove"
// @param request body api.TagJobAPIRequest true "Array of tag-objects to remove"
// @success 200 {string} string "Success Response"
// @failure 400 {object} api.ErrorResponse "Bad Request"
// @failure 401 {object} api.ErrorResponse "Unauthorized"
@@ -650,7 +706,7 @@ func (api *RestAPI) removeTags(rw http.ResponseWriter, r *http.Request) {
// @accept json
// @produce json
// @param request body schema.Job true "Job to add"
// @success 201 {object} api.DefaultApiResponse "Job added successfully"
// @success 201 {object} api.DefaultAPIResponse "Job added successfully"
// @failure 400 {object} api.ErrorResponse "Bad Request"
// @failure 401 {object} api.ErrorResponse "Unauthorized"
// @failure 403 {object} api.ErrorResponse "Forbidden"
@@ -691,13 +747,21 @@ func (api *RestAPI) startJob(rw http.ResponseWriter, r *http.Request) {
for _, job := range jobs {
// Check if jobs are within the same day (prevent duplicates)
if (req.StartTime - job.StartTime) < secondsPerDay {
handleError(fmt.Errorf("a job with that jobId, cluster and startTime already exists: dbid: %d, jobid: %d", job.ID, job.JobID), http.StatusUnprocessableEntity, rw)
handleError(fmt.Errorf("a job with that jobId, cluster and startTime already exists: dbid: %d, jobid: %d", *job.ID, job.JobID), http.StatusUnprocessableEntity, rw)
return
}
}
}
id, err := api.JobRepository.Start(&req)
// When tags are present, insert directly into the job table so that the
// returned ID can be used with AddTagOrCreate (which queries the job table).
// Jobs without tags use the cache path as before.
var id int64
if len(req.Tags) > 0 {
id, err = api.JobRepository.StartDirect(&req)
} else {
id, err = api.JobRepository.Start(&req)
}
if err != nil {
handleError(fmt.Errorf("insert into database failed: %w", err), http.StatusInternalServerError, rw)
return
@@ -728,7 +792,7 @@ func (api *RestAPI) startJob(rw http.ResponseWriter, r *http.Request) {
// @description Job to stop is specified by request body. All fields are required in this case.
// @description Returns full job resource information according to the 'Job' schema.
// @produce json
// @param request body api.StopJobApiRequest true "All fields required"
// @param request body api.StopJobAPIRequest true "All fields required"
// @success 200 {object} schema.Job "Success message"
// @failure 400 {object} api.ErrorResponse "Bad Request"
// @failure 401 {object} api.ErrorResponse "Unauthorized"
@@ -754,20 +818,20 @@ func (api *RestAPI) stopJobByRequest(rw http.ResponseWriter, r *http.Request) {
return
}
// cclog.Printf("loading db job for stopJobByRequest... : stopJobApiRequest=%v", req)
job, err = api.JobRepository.Find(req.JobID, req.Cluster, req.StartTime)
isCached := false
job, err = api.JobRepository.FindCached(req.JobID, req.Cluster, req.StartTime)
if err != nil {
// Try cached jobs if not found in main repository
cachedJob, cachedErr := api.JobRepository.FindCached(req.JobID, req.Cluster, req.StartTime)
if cachedErr != nil {
// Combine both errors for better debugging
handleError(fmt.Errorf("finding job failed: %w (cached lookup also failed: %v)", err, cachedErr), http.StatusNotFound, rw)
// Not in cache, try main job table
job, err = api.JobRepository.Find(req.JobID, req.Cluster, req.StartTime)
if err != nil {
handleError(fmt.Errorf("finding job failed: %w", err), http.StatusNotFound, rw)
return
}
job = cachedJob
} else {
isCached = true
}
api.checkAndHandleStopJob(rw, job, req)
api.checkAndHandleStopJob(rw, job, req, isCached)
}
// deleteJobByID godoc
@@ -776,7 +840,7 @@ func (api *RestAPI) stopJobByRequest(rw http.ResponseWriter, r *http.Request) {
// @description Job to remove is specified by database ID. This will not remove the job from the job archive.
// @produce json
// @param id path int true "Database ID of Job"
// @success 200 {object} api.DefaultApiResponse "Success message"
// @success 200 {object} api.DefaultAPIResponse "Success message"
// @failure 400 {object} api.ErrorResponse "Bad Request"
// @failure 401 {object} api.ErrorResponse "Unauthorized"
// @failure 403 {object} api.ErrorResponse "Forbidden"
@@ -787,16 +851,16 @@ func (api *RestAPI) stopJobByRequest(rw http.ResponseWriter, r *http.Request) {
// @router /api/jobs/delete_job/{id} [delete]
func (api *RestAPI) deleteJobByID(rw http.ResponseWriter, r *http.Request) {
// Fetch job (that will be deleted) from db
id, ok := mux.Vars(r)["id"]
id := chi.URLParam(r, "id")
var err error
if ok {
if id != "" {
id, e := strconv.ParseInt(id, 10, 64)
if e != nil {
handleError(fmt.Errorf("integer expected in path for id: %w", e), http.StatusBadRequest, rw)
return
}
err = api.JobRepository.DeleteJobById(id)
err = api.JobRepository.DeleteJobByID(id)
} else {
handleError(errors.New("the parameter 'id' is required"), http.StatusBadRequest, rw)
return
@@ -820,8 +884,8 @@ func (api *RestAPI) deleteJobByID(rw http.ResponseWriter, r *http.Request) {
// @description Job to delete is specified by request body. All fields are required in this case.
// @accept json
// @produce json
// @param request body api.DeleteJobApiRequest true "All fields required"
// @success 200 {object} api.DefaultApiResponse "Success message"
// @param request body api.DeleteJobAPIRequest true "All fields required"
// @success 200 {object} api.DefaultAPIResponse "Success message"
// @failure 400 {object} api.ErrorResponse "Bad Request"
// @failure 401 {object} api.ErrorResponse "Unauthorized"
// @failure 403 {object} api.ErrorResponse "Forbidden"
@@ -852,7 +916,7 @@ func (api *RestAPI) deleteJobByRequest(rw http.ResponseWriter, r *http.Request)
return
}
err = api.JobRepository.DeleteJobById(*job.ID)
err = api.JobRepository.DeleteJobByID(*job.ID)
if err != nil {
handleError(fmt.Errorf("deleting job failed: %w", err), http.StatusUnprocessableEntity, rw)
return
@@ -861,7 +925,7 @@ func (api *RestAPI) deleteJobByRequest(rw http.ResponseWriter, r *http.Request)
rw.Header().Add("Content-Type", "application/json")
rw.WriteHeader(http.StatusOK)
if err := json.NewEncoder(rw).Encode(DefaultAPIResponse{
Message: fmt.Sprintf("Successfully deleted job %d", job.ID),
Message: fmt.Sprintf("Successfully deleted job %d", *job.ID),
}); err != nil {
cclog.Errorf("Failed to encode response: %v", err)
}
@@ -873,7 +937,7 @@ func (api *RestAPI) deleteJobByRequest(rw http.ResponseWriter, r *http.Request)
// @description Remove all jobs with start time before timestamp. The jobs will not be removed from the job archive.
// @produce json
// @param ts path int true "Unix epoch timestamp"
// @success 200 {object} api.DefaultApiResponse "Success message"
// @success 200 {object} api.DefaultAPIResponse "Success message"
// @failure 400 {object} api.ErrorResponse "Bad Request"
// @failure 401 {object} api.ErrorResponse "Unauthorized"
// @failure 403 {object} api.ErrorResponse "Forbidden"
@@ -886,9 +950,9 @@ func (api *RestAPI) deleteJobByRequest(rw http.ResponseWriter, r *http.Request)
func (api *RestAPI) deleteJobBefore(rw http.ResponseWriter, r *http.Request) {
var cnt int
// Parse cutoff timestamp from the URL path
id, ok := mux.Vars(r)["ts"]
id := chi.URLParam(r, "ts")
var err error
if ok {
if id != "" {
ts, e := strconv.ParseInt(id, 10, 64)
if e != nil {
handleError(fmt.Errorf("integer expected in path for ts: %w", e), http.StatusBadRequest, rw)
@@ -896,11 +960,13 @@ func (api *RestAPI) deleteJobBefore(rw http.ResponseWriter, r *http.Request) {
}
// Check for omit-tagged query parameter
omitTagged := false
omitTagged := "none"
if omitTaggedStr := r.URL.Query().Get("omit-tagged"); omitTaggedStr != "" {
omitTagged, e = strconv.ParseBool(omitTaggedStr)
if e != nil {
handleError(fmt.Errorf("boolean expected for omit-tagged parameter: %w", e), http.StatusBadRequest, rw)
switch omitTaggedStr {
case "none", "all", "user":
omitTagged = omitTaggedStr
default:
handleError(fmt.Errorf("omit-tagged must be one of: none, all, user"), http.StatusBadRequest, rw)
return
}
}
@@ -924,20 +990,20 @@ func (api *RestAPI) deleteJobBefore(rw http.ResponseWriter, r *http.Request) {
}
}
func (api *RestAPI) checkAndHandleStopJob(rw http.ResponseWriter, job *schema.Job, req StopJobAPIRequest) {
func (api *RestAPI) checkAndHandleStopJob(rw http.ResponseWriter, job *schema.Job, req StopJobAPIRequest, isCached bool) {
// Sanity checks
if job.State != schema.JobStateRunning {
handleError(fmt.Errorf("jobId %d (id %d) on %s : job has already been stopped (state is: %s)", job.JobID, job.ID, job.Cluster, job.State), http.StatusUnprocessableEntity, rw)
handleError(fmt.Errorf("jobId %d (id %d) on %s : job has already been stopped (state is: %s)", job.JobID, *job.ID, job.Cluster, job.State), http.StatusUnprocessableEntity, rw)
return
}
if job.StartTime > req.StopTime {
handleError(fmt.Errorf("jobId %d (id %d) on %s : stopTime %d must be larger/equal than startTime %d", job.JobID, job.ID, job.Cluster, req.StopTime, job.StartTime), http.StatusBadRequest, rw)
handleError(fmt.Errorf("jobId %d (id %d) on %s : stopTime %d must be larger/equal than startTime %d", job.JobID, *job.ID, job.Cluster, req.StopTime, job.StartTime), http.StatusBadRequest, rw)
return
}
if req.State != "" && !req.State.Valid() {
handleError(fmt.Errorf("jobId %d (id %d) on %s : invalid requested job state: %#v", job.JobID, job.ID, job.Cluster, req.State), http.StatusBadRequest, rw)
handleError(fmt.Errorf("jobId %d (id %d) on %s : invalid requested job state: %#v", job.JobID, *job.ID, job.Cluster, req.State), http.StatusBadRequest, rw)
return
} else if req.State == "" {
req.State = schema.JobStateCompleted
@@ -949,14 +1015,24 @@ func (api *RestAPI) checkAndHandleStopJob(rw http.ResponseWriter, job *schema.Jo
api.JobRepository.Mutex.Lock()
defer api.JobRepository.Mutex.Unlock()
if err := api.JobRepository.Stop(*job.ID, job.Duration, job.State, job.MonitoringStatus); err != nil {
if err := api.JobRepository.StopCached(*job.ID, job.Duration, job.State, job.MonitoringStatus); err != nil {
handleError(fmt.Errorf("jobId %d (id %d) on %s : marking job as '%s' (duration: %d) in DB failed: %w", job.JobID, job.ID, job.Cluster, job.State, job.Duration, err), http.StatusInternalServerError, rw)
// If the job is still in job_cache, transfer it to the job table first
// so that job.ID always points to the job table for downstream code
if isCached {
newID, err := api.JobRepository.TransferCachedJobToMain(*job.ID)
if err != nil {
handleError(fmt.Errorf("jobId %d (id %d) on %s : transferring cached job failed: %w", job.JobID, *job.ID, job.Cluster, err), http.StatusInternalServerError, rw)
return
}
cclog.Infof("transferred cached job to main table: old id %d -> new id %d (jobId=%d)", *job.ID, newID, job.JobID)
job.ID = &newID
}
cclog.Infof("archiving job... (dbid: %d): cluster=%s, jobId=%d, user=%s, startTime=%d, duration=%d, state=%s", job.ID, job.Cluster, job.JobID, job.User, job.StartTime, job.Duration, job.State)
if err := api.JobRepository.Stop(*job.ID, job.Duration, job.State, job.MonitoringStatus); err != nil {
handleError(fmt.Errorf("jobId %d (id %d) on %s : marking job as '%s' (duration: %d) in DB failed: %w", job.JobID, *job.ID, job.Cluster, job.State, job.Duration, err), http.StatusInternalServerError, rw)
return
}
cclog.Infof("archiving job... (dbid: %d): cluster=%s, jobId=%d, user=%s, startTime=%d, duration=%d, state=%s", *job.ID, job.Cluster, job.JobID, job.User, job.StartTime, job.Duration, job.State)
// Send a response (with status OK). This means that errors that happen from here on forward
// can *NOT* be communicated to the client. If reading from a MetricDataRepository or
@@ -977,7 +1053,7 @@ func (api *RestAPI) checkAndHandleStopJob(rw http.ResponseWriter, job *schema.Jo
}
func (api *RestAPI) getJobMetrics(rw http.ResponseWriter, r *http.Request) {
id := mux.Vars(r)["id"]
id := chi.URLParam(r, "id")
metrics := r.URL.Query()["metric"]
var scopes []schema.MetricScope
for _, scope := range r.URL.Query()["scope"] {
@@ -1022,3 +1098,57 @@ func (api *RestAPI) getJobMetrics(rw http.ResponseWriter, r *http.Request) {
cclog.Errorf("Failed to encode response: %v", err)
}
}
// GetUsedNodesAPIResponse model
type GetUsedNodesAPIResponse struct {
UsedNodes map[string][]string `json:"usedNodes"` // Map of cluster names to lists of used node hostnames
}
// getUsedNodes godoc
// @summary Lists used nodes by cluster
// @tags Job query
// @description Get a map of cluster names to lists of unique hostnames that are currently in use by running jobs that started before the specified timestamp.
// @produce json
// @param ts query int true "Unix timestamp to filter jobs (jobs with start_time < ts)"
// @success 200 {object} api.GetUsedNodesAPIResponse "Map of cluster names to hostname lists"
// @failure 400 {object} api.ErrorResponse "Bad Request"
// @failure 401 {object} api.ErrorResponse "Unauthorized"
// @failure 403 {object} api.ErrorResponse "Forbidden"
// @failure 500 {object} api.ErrorResponse "Internal Server Error"
// @security ApiKeyAuth
// @router /api/jobs/used_nodes [get]
func (api *RestAPI) getUsedNodes(rw http.ResponseWriter, r *http.Request) {
if user := repository.GetUserFromContext(r.Context()); user != nil &&
!user.HasRole(schema.RoleAPI) {
handleError(fmt.Errorf("missing role: %v", schema.GetRoleString(schema.RoleAPI)), http.StatusForbidden, rw)
return
}
tsStr := r.URL.Query().Get("ts")
if tsStr == "" {
handleError(fmt.Errorf("missing required query parameter: ts"), http.StatusBadRequest, rw)
return
}
ts, err := strconv.ParseInt(tsStr, 10, 64)
if err != nil {
handleError(fmt.Errorf("invalid timestamp format: %w", err), http.StatusBadRequest, rw)
return
}
usedNodes, err := api.JobRepository.GetUsedNodes(ts)
if err != nil {
handleError(fmt.Errorf("failed to get used nodes: %w", err), http.StatusInternalServerError, rw)
return
}
rw.Header().Add("Content-Type", "application/json")
payload := GetUsedNodesAPIResponse{
UsedNodes: usedNodes,
}
if err := json.NewEncoder(rw).Encode(payload); err != nil {
handleError(err, http.StatusInternalServerError, rw)
return
}
}
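A short sketch of querying the new used_nodes endpoint and decoding its response into the shape of GetUsedNodesAPIResponse above; host and token are placeholders:

package main

import (
	"encoding/json"
	"fmt"
	"net/http"
)

func main() {
	req, err := http.NewRequest(http.MethodGet,
		"http://localhost:8080/api/jobs/used_nodes?ts=1700000000", nil)
	if err != nil {
		panic(err)
	}
	req.Header.Set("Authorization", "Bearer <JWT>")
	resp, err := http.DefaultClient.Do(req)
	if err != nil {
		panic(err)
	}
	defer resp.Body.Close()
	// Mirrors GetUsedNodesAPIResponse: cluster name -> list of hostnames.
	var result struct {
		UsedNodes map[string][]string `json:"usedNodes"`
	}
	if err := json.NewDecoder(resp.Body).Decode(&result); err != nil {
		panic(err)
	}
	for cluster, hosts := range result.UsedNodes {
		fmt.Println(cluster, hosts)
	}
}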

internal/api/log.go (new file, 165 lines)

@@ -0,0 +1,165 @@
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved. This file is part of cc-backend.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
package api
import (
"bufio"
"encoding/json"
"fmt"
"net/http"
"os/exec"
"regexp"
"strconv"
"strings"
"github.com/ClusterCockpit/cc-backend/internal/config"
"github.com/ClusterCockpit/cc-backend/internal/repository"
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
"github.com/ClusterCockpit/cc-lib/v2/schema"
)
type LogEntry struct {
Timestamp string `json:"timestamp"`
Priority int `json:"priority"`
Message string `json:"message"`
Unit string `json:"unit"`
}
var safePattern = regexp.MustCompile(`^[a-zA-Z0-9 :\-\.]+$`)
func (api *RestAPI) getJournalLog(rw http.ResponseWriter, r *http.Request) {
user := repository.GetUserFromContext(r.Context())
if !user.HasRole(schema.RoleAdmin) {
handleError(fmt.Errorf("only admins are allowed to view logs"), http.StatusForbidden, rw)
return
}
since := r.URL.Query().Get("since")
if since == "" {
since = "1 hour ago"
}
if !safePattern.MatchString(since) {
handleError(fmt.Errorf("invalid 'since' parameter"), http.StatusBadRequest, rw)
return
}
lines := 200
if l := r.URL.Query().Get("lines"); l != "" {
n, err := strconv.Atoi(l)
if err != nil || n < 1 {
handleError(fmt.Errorf("invalid 'lines' parameter"), http.StatusBadRequest, rw)
return
}
if n > 1000 {
n = 1000
}
lines = n
}
unit := config.Keys.SystemdUnit
if unit == "" {
unit = "clustercockpit.service"
}
args := []string{
"--output=json",
"--no-pager",
"-n", fmt.Sprintf("%d", lines),
"--since", since,
"-u", unit,
}
if level := r.URL.Query().Get("level"); level != "" {
n, err := strconv.Atoi(level)
if err != nil || n < 0 || n > 7 {
handleError(fmt.Errorf("invalid 'level' parameter (must be 0-7)"), http.StatusBadRequest, rw)
return
}
args = append(args, "--priority", fmt.Sprintf("%d", n))
}
if search := r.URL.Query().Get("search"); search != "" {
if !safePattern.MatchString(search) {
handleError(fmt.Errorf("invalid 'search' parameter"), http.StatusBadRequest, rw)
return
}
args = append(args, "--grep", search)
}
cclog.Debugf("calling journalctl with %s", strings.Join(args, " "))
cmd := exec.CommandContext(r.Context(), "journalctl", args...)
stdout, err := cmd.StdoutPipe()
if err != nil {
handleError(fmt.Errorf("failed to create pipe: %w", err), http.StatusInternalServerError, rw)
return
}
if err := cmd.Start(); err != nil {
handleError(fmt.Errorf("failed to start journalctl: %w", err), http.StatusInternalServerError, rw)
return
}
entries := make([]LogEntry, 0, lines)
scanner := bufio.NewScanner(stdout)
for scanner.Scan() {
var raw map[string]any
if err := json.Unmarshal(scanner.Bytes(), &raw); err != nil {
cclog.Debugf("error unmarshal log output: %v", err)
continue
}
priority := 6 // default info
if p, ok := raw["PRIORITY"]; ok {
switch v := p.(type) {
case string:
if n, err := strconv.Atoi(v); err == nil {
priority = n
}
case float64:
priority = int(v)
}
}
msg := ""
if m, ok := raw["MESSAGE"]; ok {
if s, ok := m.(string); ok {
msg = s
}
}
ts := ""
if t, ok := raw["__REALTIME_TIMESTAMP"]; ok {
if s, ok := t.(string); ok {
ts = s
}
}
unitName := ""
if u, ok := raw["_SYSTEMD_UNIT"]; ok {
if s, ok := u.(string); ok {
unitName = s
}
}
entries = append(entries, LogEntry{
Timestamp: ts,
Priority: priority,
Message: msg,
Unit: unitName,
})
}
if err := cmd.Wait(); err != nil {
// journalctl returns exit code 1 when --grep matches nothing
if len(entries) == 0 {
cclog.Debugf("journalctl exited with: %v", err)
}
}
rw.Header().Set("Content-Type", "application/json")
if err := json.NewEncoder(rw).Encode(entries); err != nil {
cclog.Errorf("Failed to encode log entries: %v", err)
}
}
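To make the argument assembly above concrete: for a hypothetical request with since=2026-03-10, lines=100, level=3 and search=error, the handler spawns roughly this command (a sketch reproducing the same args slice):

package main

import (
	"fmt"
	"strings"
)

func main() {
	// Mirrors the args built in getJournalLog for the sample query above.
	args := []string{
		"--output=json",
		"--no-pager",
		"-n", "100",
		"--since", "2026-03-10",
		"-u", "clustercockpit.service", // default when config.Keys.SystemdUnit is empty
		"--priority", "3",
		"--grep", "error",
	}
	fmt.Println("journalctl " + strings.Join(args, " "))
}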


@@ -10,15 +10,14 @@ import (
"encoding/json"
"errors"
"fmt"
"io"
"net/http"
"strconv"
"strings"
"github.com/ClusterCockpit/cc-backend/internal/memorystore"
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
"github.com/ClusterCockpit/cc-backend/pkg/metricstore"
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
"github.com/influxdata/line-protocol/v2/lineprotocol"
"github.com/ClusterCockpit/cc-line-protocol/v2/lineprotocol"
)
// handleFree godoc
@@ -58,7 +57,7 @@ func freeMetrics(rw http.ResponseWriter, r *http.Request) {
return
}
ms := memorystore.GetMemoryStore()
ms := metricstore.GetMemoryStore()
n := 0
for _, sel := range selectors {
bn, err := ms.Free(sel, to)
@@ -90,16 +89,17 @@ func freeMetrics(rw http.ResponseWriter, r *http.Request) {
// @security ApiKeyAuth
// @router /write/ [post]
func writeMetrics(rw http.ResponseWriter, r *http.Request) {
bytes, err := io.ReadAll(r.Body)
rw.Header().Add("Content-Type", "application/json")
if err != nil {
handleError(err, http.StatusInternalServerError, rw)
return
}
ms := memorystore.GetMemoryStore()
dec := lineprotocol.NewDecoderWithBytes(bytes)
if err := memorystore.DecodeLine(dec, ms, r.URL.Query().Get("cluster")); err != nil {
// Extract the "cluster" query parameter without allocating a url.Values map.
cluster := queryParam(r.URL.RawQuery, "cluster")
// Stream directly from the request body instead of copying it into a
// temporary buffer via io.ReadAll. The line-protocol decoder supports
// io.Reader natively, so this avoids the largest heap allocation.
ms := metricstore.GetMemoryStore()
dec := lineprotocol.NewDecoder(r.Body)
if err := metricstore.DecodeLine(dec, ms, cluster); err != nil {
cclog.Errorf("/api/write error: %s", err.Error())
handleError(err, http.StatusBadRequest, rw)
return
@@ -107,6 +107,20 @@ func writeMetrics(rw http.ResponseWriter, r *http.Request) {
rw.WriteHeader(http.StatusOK)
}
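As a usage sketch for the /write/ endpoint above: the metric name and tag set are illustrative (the expected tag keys depend on metricstore.DecodeLine), and the auth scheme is an assumption:

package main

import (
	"fmt"
	"net/http"
	"strings"
)

func main() {
	// One line of InfluxDB line protocol; the cluster is also passed as a
	// query parameter, matching the handler above.
	body := "load_one,hostname=host123,type=node value=0.42 1700000000000000000\n"
	req, err := http.NewRequest(http.MethodPost,
		"http://localhost:8080/write/?cluster=testcluster", strings.NewReader(body))
	if err != nil {
		panic(err)
	}
	req.Header.Set("Authorization", "Bearer <JWT>") // auth scheme is an assumption
	resp, err := http.DefaultClient.Do(req)
	if err != nil {
		panic(err)
	}
	defer resp.Body.Close()
	fmt.Println(resp.Status)
}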
// queryParam extracts a single query-parameter value from a raw query string
// without allocating a url.Values map. Returns "" if the key is not present.
func queryParam(raw, key string) string {
for raw != "" {
var kv string
kv, raw, _ = strings.Cut(raw, "&")
k, v, _ := strings.Cut(kv, "=")
if k == key {
return v
}
}
return ""
}
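A quick usage note on queryParam, derived directly from the code above: values come back verbatim, so unlike url.Values there is no percent-decoding.

package main

import (
	"fmt"
	"strings"
)

// Copy of queryParam from above, repeated only so this sketch compiles standalone.
func queryParam(raw, key string) string {
	for raw != "" {
		var kv string
		kv, raw, _ = strings.Cut(raw, "&")
		k, v, _ := strings.Cut(kv, "=")
		if k == key {
			return v
		}
	}
	return ""
}

func main() {
	fmt.Println(queryParam("cluster=fritz&node=f0123", "cluster")) // "fritz"
	fmt.Println(queryParam("node=f0123", "cluster"))               // "" (absent)
	// Values are returned verbatim: "a%20b" stays "a%20b"; no percent-decoding.
	fmt.Println(queryParam("cluster=a%20b", "cluster")) // "a%20b"
}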
// handleDebug godoc
// @summary Debug endpoint
// @tags debug
@@ -129,42 +143,9 @@ func debugMetrics(rw http.ResponseWriter, r *http.Request) {
selector = strings.Split(raw, ":")
}
ms := memorystore.GetMemoryStore()
ms := metricstore.GetMemoryStore()
if err := ms.DebugDump(bufio.NewWriter(rw), selector); err != nil {
handleError(err, http.StatusBadRequest, rw)
return
}
}
// handleHealthCheck godoc
// @summary HealthCheck endpoint
// @tags healthcheck
// @description This endpoint allows the users to check if a node is healthy
// @produce json
// @param selector query string false "Selector"
// @success 200 {string} string "Debug dump"
// @failure 400 {object} api.ErrorResponse "Bad Request"
// @failure 401 {object} api.ErrorResponse "Unauthorized"
// @failure 403 {object} api.ErrorResponse "Forbidden"
// @failure 500 {object} api.ErrorResponse "Internal Server Error"
// @security ApiKeyAuth
// @router /healthcheck/ [get]
func metricsHealth(rw http.ResponseWriter, r *http.Request) {
rawCluster := r.URL.Query().Get("cluster")
rawNode := r.URL.Query().Get("node")
if rawCluster == "" || rawNode == "" {
handleError(errors.New("'cluster' and 'node' are required query parameter"), http.StatusBadRequest, rw)
return
}
rw.Header().Add("Content-Type", "application/json")
selector := []string{rawCluster, rawNode}
ms := memorystore.GetMemoryStore()
if err := ms.HealthCheck(bufio.NewWriter(rw), selector); err != nil {
handleError(err, http.StatusBadRequest, rw)
return
}
}

internal/api/nats.go (new file, 400 lines)

@@ -0,0 +1,400 @@
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved. This file is part of cc-backend.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
package api
import (
"database/sql"
"encoding/json"
"strings"
"sync"
"time"
"github.com/ClusterCockpit/cc-backend/internal/archiver"
"github.com/ClusterCockpit/cc-backend/internal/config"
"github.com/ClusterCockpit/cc-backend/internal/importer"
"github.com/ClusterCockpit/cc-backend/internal/repository"
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
lp "github.com/ClusterCockpit/cc-lib/v2/ccMessage"
"github.com/ClusterCockpit/cc-lib/v2/nats"
"github.com/ClusterCockpit/cc-lib/v2/receivers"
"github.com/ClusterCockpit/cc-lib/v2/schema"
influx "github.com/ClusterCockpit/cc-line-protocol/v2/lineprotocol"
)
// NatsAPI provides NATS subscription-based handlers for Job and Node operations.
// It mirrors the functionality of the REST API but uses NATS messaging with
// InfluxDB line protocol as the message format.
//
// # Message Format
//
// All NATS messages use InfluxDB line protocol format (https://docs.influxdata.com/influxdb/v2.0/reference/syntax/line-protocol/)
// with the following structure:
//
// measurement,tag1=value1,tag2=value2 field1=value1,field2=value2 timestamp
//
// # Job Events
//
// Job start/stop events use the "job" measurement with a "function" tag to distinguish operations:
//
// job,function=start_job event="{...JSON payload...}" <timestamp>
// job,function=stop_job event="{...JSON payload...}" <timestamp>
//
// The JSON payload in the "event" field follows the schema.Job or StopJobAPIRequest structure.
//
// Example job start message:
//
// job,function=start_job event="{\"jobId\":1001,\"user\":\"testuser\",\"cluster\":\"testcluster\",...}" 1234567890000000000
//
// # Node State Events
//
// Node state updates use the "nodestate" measurement with cluster information:
//
// nodestate event="{...JSON payload...}" <timestamp>
//
// The JSON payload follows the UpdateNodeStatesRequest structure.
//
// Example node state message:
//
// nodestate event="{\"cluster\":\"testcluster\",\"nodes\":[{\"hostname\":\"node01\",\"states\":[\"idle\"]}]}" 1234567890000000000
type NatsAPI struct {
JobRepository *repository.JobRepository
// RepositoryMutex protects job creation operations from race conditions
// when checking for duplicate jobs during startJob calls.
RepositoryMutex sync.Mutex
}
// NewNatsAPI creates a new NatsAPI instance with default dependencies.
func NewNatsAPI() *NatsAPI {
return &NatsAPI{
JobRepository: repository.GetJobRepository(),
}
}
// StartSubscriptions registers all NATS subscriptions for Job and Node APIs.
// Returns an error if the NATS client is not available or subscription fails.
func (api *NatsAPI) StartSubscriptions() error {
client := nats.GetClient()
if client == nil {
cclog.Warn("NATS client not available, skipping API subscriptions")
return nil
}
if config.Keys.APISubjects != nil {
s := config.Keys.APISubjects
if err := client.Subscribe(s.SubjectJobEvent, api.handleJobEvent); err != nil {
return err
}
if err := client.Subscribe(s.SubjectNodeState, api.handleNodeState); err != nil {
return err
}
cclog.Info("NATS API subscriptions started")
}
return nil
}
// processJobEvent routes job event messages to the appropriate handler based on the "function" tag.
// Validates that required tags and fields are present before processing.
func (api *NatsAPI) processJobEvent(msg lp.CCMessage) {
function, ok := msg.GetTag("function")
if !ok {
cclog.Errorf("Job event is missing required tag 'function': measurement=%s", msg.Name())
return
}
switch function {
case "start_job":
v, ok := msg.GetEventValue()
if !ok {
cclog.Errorf("Job start event is missing event field with JSON payload")
return
}
api.handleStartJob(v)
case "stop_job":
v, ok := msg.GetEventValue()
if !ok {
cclog.Errorf("Job stop event is missing event field with JSON payload")
return
}
api.handleStopJob(v)
default:
cclog.Warnf("Unknown job event function '%s', expected 'start_job' or 'stop_job'", function)
}
}
// handleJobEvent processes job-related messages received via NATS using InfluxDB line protocol.
// The message must be in line protocol format with measurement="job" and include:
// - tag "function" with value "start_job" or "stop_job"
// - field "event" containing JSON payload (schema.Job or StopJobAPIRequest)
//
// Example: job,function=start_job event="{\"jobId\":1001,...}" 1234567890000000000
func (api *NatsAPI) handleJobEvent(subject string, data []byte) {
if len(data) == 0 {
cclog.Warnf("NATS %s: received empty message", subject)
return
}
d := influx.NewDecoderWithBytes(data)
for d.Next() {
m, err := receivers.DecodeInfluxMessage(d)
if err != nil {
cclog.Errorf("NATS %s: failed to decode InfluxDB line protocol message: %v", subject, err)
return
}
if !m.IsEvent() {
cclog.Debugf("NATS %s: received non-event message, skipping", subject)
continue
}
if m.Name() == "job" {
api.processJobEvent(m)
} else {
cclog.Debugf("NATS %s: unexpected measurement name '%s', expected 'job'", subject, m.Name())
}
}
}
// handleStartJob processes job start messages received via NATS.
// The payload parameter contains JSON following the schema.Job structure.
// Jobs are validated, checked for duplicates, and inserted into the database.
func (api *NatsAPI) handleStartJob(payload string) {
if payload == "" {
cclog.Error("NATS start job: payload is empty")
return
}
req := schema.Job{
Shared: "none",
MonitoringStatus: schema.MonitoringStatusRunningOrArchiving,
}
dec := json.NewDecoder(strings.NewReader(payload))
dec.DisallowUnknownFields()
if err := dec.Decode(&req); err != nil {
cclog.Errorf("NATS start job: parsing request failed: %v", err)
return
}
cclog.Debugf("NATS start job: %s", req.GoString())
req.State = schema.JobStateRunning
if err := importer.SanityChecks(&req); err != nil {
cclog.Errorf("NATS start job: sanity check failed: %v", err)
return
}
var unlockOnce sync.Once
api.RepositoryMutex.Lock()
defer unlockOnce.Do(api.RepositoryMutex.Unlock)
jobs, err := api.JobRepository.FindAll(&req.JobID, &req.Cluster, nil)
if err != nil && err != sql.ErrNoRows {
cclog.Errorf("NATS start job: checking for duplicate failed: %v", err)
return
}
if err == nil {
for _, job := range jobs {
if (req.StartTime - job.StartTime) < secondsPerDay {
cclog.Errorf("NATS start job: job with jobId %d, cluster %s already exists (dbid: %d)",
req.JobID, req.Cluster, job.ID)
return
}
}
}
// When tags are present, insert directly into the job table so that the
// returned ID can be used with AddTagOrCreate (which queries the job table).
var id int64
if len(req.Tags) > 0 {
id, err = api.JobRepository.StartDirect(&req)
} else {
id, err = api.JobRepository.Start(&req)
}
if err != nil {
cclog.Errorf("NATS start job: insert into database failed: %v", err)
return
}
unlockOnce.Do(api.RepositoryMutex.Unlock)
for _, tag := range req.Tags {
if _, err := api.JobRepository.AddTagOrCreate(nil, id, tag.Type, tag.Name, tag.Scope); err != nil {
cclog.Errorf("NATS start job: adding tag to new job %d failed: %v", id, err)
return
}
}
cclog.Infof("NATS: new job (id: %d): cluster=%s, jobId=%d, user=%s, startTime=%d",
id, req.Cluster, req.JobID, req.User, req.StartTime)
}
// handleStopJob processes job stop messages received via NATS.
// The payload parameter contains JSON following the StopJobAPIRequest structure.
// The job is marked as stopped in the database and archiving is triggered if monitoring is enabled.
func (api *NatsAPI) handleStopJob(payload string) {
if payload == "" {
cclog.Error("NATS stop job: payload is empty")
return
}
var req StopJobAPIRequest
dec := json.NewDecoder(strings.NewReader(payload))
dec.DisallowUnknownFields()
if err := dec.Decode(&req); err != nil {
cclog.Errorf("NATS job stop: parsing request failed: %v", err)
return
}
if req.JobID == nil {
cclog.Errorf("NATS job stop: the field 'jobId' is required")
return
}
isCached := false
job, err := api.JobRepository.FindCached(req.JobID, req.Cluster, req.StartTime)
if err != nil {
// Not in cache, try main job table
job, err = api.JobRepository.Find(req.JobID, req.Cluster, req.StartTime)
if err != nil {
cclog.Errorf("NATS job stop: finding job failed: %v", err)
return
}
} else {
isCached = true
}
if job.State != schema.JobStateRunning {
cclog.Errorf("NATS job stop: jobId %d (id %d) on %s: job has already been stopped (state is: %s)",
job.JobID, job.ID, job.Cluster, job.State)
return
}
if job.StartTime > req.StopTime {
cclog.Errorf("NATS job stop: jobId %d (id %d) on %s: stopTime %d must be >= startTime %d",
job.JobID, job.ID, job.Cluster, req.StopTime, job.StartTime)
return
}
if req.State != "" && !req.State.Valid() {
cclog.Errorf("NATS job stop: jobId %d (id %d) on %s: invalid job state: %#v",
job.JobID, job.ID, job.Cluster, req.State)
return
} else if req.State == "" {
req.State = schema.JobStateCompleted
}
job.Duration = int32(req.StopTime - job.StartTime)
job.State = req.State
api.JobRepository.Mutex.Lock()
defer api.JobRepository.Mutex.Unlock()
// If the job is still in job_cache, transfer it to the job table first
if isCached {
newID, err := api.JobRepository.TransferCachedJobToMain(*job.ID)
if err != nil {
cclog.Errorf("NATS job stop: jobId %d (id %d) on %s: transferring cached job failed: %v",
job.JobID, *job.ID, job.Cluster, err)
return
}
cclog.Infof("NATS: transferred cached job to main table: old id %d -> new id %d (jobId=%d)", *job.ID, newID, job.JobID)
job.ID = &newID
}
if err := api.JobRepository.Stop(*job.ID, job.Duration, job.State, job.MonitoringStatus); err != nil {
cclog.Errorf("NATS job stop: jobId %d (id %d) on %s: marking job as '%s' failed: %v",
job.JobID, *job.ID, job.Cluster, job.State, err)
return
}
cclog.Infof("NATS: archiving job (dbid: %d): cluster=%s, jobId=%d, user=%s, startTime=%d, duration=%d, state=%s",
*job.ID, job.Cluster, job.JobID, job.User, job.StartTime, job.Duration, job.State)
if job.MonitoringStatus == schema.MonitoringStatusDisabled {
return
}
archiver.TriggerArchiving(job)
}
// processNodestateEvent extracts and processes node state data from the InfluxDB message.
// Updates node states in the repository for all nodes in the payload.
func (api *NatsAPI) processNodestateEvent(msg lp.CCMessage) {
v, ok := msg.GetEventValue()
if !ok {
cclog.Errorf("Nodestate event is missing event field with JSON payload")
return
}
var req UpdateNodeStatesRequest
dec := json.NewDecoder(strings.NewReader(v))
dec.DisallowUnknownFields()
if err := dec.Decode(&req); err != nil {
cclog.Errorf("NATS nodestate: parsing request failed: %v", err)
return
}
repo := repository.GetNodeRepository()
requestReceived := time.Now().Unix()
for _, node := range req.Nodes {
state := determineState(node.States)
nodeState := schema.NodeStateDB{
TimeStamp: requestReceived,
NodeState: state,
CpusAllocated: node.CpusAllocated,
MemoryAllocated: node.MemoryAllocated,
GpusAllocated: node.GpusAllocated,
HealthState: schema.MonitoringStateFull,
JobsRunning: node.JobsRunning,
}
if err := repo.UpdateNodeState(node.Hostname, req.Cluster, &nodeState); err != nil {
cclog.Errorf("NATS nodestate: updating node state for %s on %s failed: %v",
node.Hostname, req.Cluster, err)
}
}
cclog.Debugf("NATS nodestate: updated %d node states for cluster %s", len(req.Nodes), req.Cluster)
}
// handleNodeState processes node state update messages received via NATS using InfluxDB line protocol.
// The message must be in line protocol format with measurement="nodestate" and include:
// - field "event" containing JSON payload (UpdateNodeStatesRequest)
//
// Example: nodestate event="{\"cluster\":\"testcluster\",\"nodes\":[...]}" 1234567890000000000
func (api *NatsAPI) handleNodeState(subject string, data []byte) {
if len(data) == 0 {
cclog.Warnf("NATS %s: received empty message", subject)
return
}
d := influx.NewDecoderWithBytes(data)
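// A single NATS message may carry multiple line protocol records; process each one.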
for d.Next() {
m, err := receivers.DecodeInfluxMessage(d)
if err != nil {
cclog.Errorf("NATS %s: failed to decode InfluxDB line protocol message: %v", subject, err)
return
}
if !m.IsEvent() {
cclog.Warnf("NATS %s: received non-event message, skipping", subject)
continue
}
if m.Name() == "nodestate" {
api.processNodestateEvent(m)
} else {
cclog.Warnf("NATS %s: unexpected measurement name '%s', expected 'nodestate'", subject, m.Name())
}
}
}

947
internal/api/nats_test.go Normal file
View File

@@ -0,0 +1,947 @@
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved. This file is part of cc-backend.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
package api
import (
"context"
"database/sql"
"encoding/json"
"fmt"
"os"
"path/filepath"
"testing"
"time"
"github.com/ClusterCockpit/cc-backend/internal/archiver"
"github.com/ClusterCockpit/cc-backend/internal/auth"
"github.com/ClusterCockpit/cc-backend/internal/config"
"github.com/ClusterCockpit/cc-backend/internal/graph"
"github.com/ClusterCockpit/cc-backend/internal/repository"
"github.com/ClusterCockpit/cc-backend/pkg/archive"
"github.com/ClusterCockpit/cc-backend/pkg/metricstore"
ccconf "github.com/ClusterCockpit/cc-lib/v2/ccConfig"
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
lp "github.com/ClusterCockpit/cc-lib/v2/ccMessage"
"github.com/ClusterCockpit/cc-lib/v2/schema"
_ "github.com/mattn/go-sqlite3"
)
func setupNatsTest(t *testing.T) *NatsAPI {
repository.ResetConnection()
const testconfig = `{
"main": {
"addr": "0.0.0.0:8080",
"validate": false,
"api-allowed-ips": [
"*"
]
},
"archive": {
"kind": "file",
"path": "./var/job-archive"
},
"auth": {
"jwts": {
"max-age": "2m"
}
}
}`
const testclusterJSON = `{
"name": "testcluster",
"subClusters": [
{
"name": "sc1",
"nodes": "host123,host124,host125",
"processorType": "Intel Core i7-4770",
"socketsPerNode": 1,
"coresPerSocket": 4,
"threadsPerCore": 2,
"flopRateScalar": {
"unit": {
"prefix": "G",
"base": "F/s"
},
"value": 14
},
"flopRateSimd": {
"unit": {
"prefix": "G",
"base": "F/s"
},
"value": 112
},
"memoryBandwidth": {
"unit": {
"prefix": "G",
"base": "B/s"
},
"value": 24
},
"numberOfNodes": 70,
"topology": {
"node": [0, 1, 2, 3, 4, 5, 6, 7],
"socket": [[0, 1, 2, 3, 4, 5, 6, 7]],
"memoryDomain": [[0, 1, 2, 3, 4, 5, 6, 7]],
"die": [[0, 1, 2, 3, 4, 5, 6, 7]],
"core": [[0], [1], [2], [3], [4], [5], [6], [7]]
}
}
],
"metricConfig": [
{
"name": "load_one",
"unit": { "base": ""},
"scope": "node",
"timestep": 60,
"aggregation": "avg",
"peak": 8,
"normal": 0,
"caution": 0,
"alert": 0
}
]
}`
cclog.Init("info", true)
tmpdir := t.TempDir()
jobarchive := filepath.Join(tmpdir, "job-archive")
if err := os.Mkdir(jobarchive, 0o777); err != nil {
t.Fatal(err)
}
if err := os.WriteFile(filepath.Join(jobarchive, "version.txt"), fmt.Appendf(nil, "%d", 3), 0o666); err != nil {
t.Fatal(err)
}
if err := os.Mkdir(filepath.Join(jobarchive, "testcluster"), 0o777); err != nil {
t.Fatal(err)
}
if err := os.WriteFile(filepath.Join(jobarchive, "testcluster", "cluster.json"), []byte(testclusterJSON), 0o666); err != nil {
t.Fatal(err)
}
dbfilepath := filepath.Join(tmpdir, "test.db")
err := repository.MigrateDB(dbfilepath)
if err != nil {
t.Fatal(err)
}
cfgFilePath := filepath.Join(tmpdir, "config.json")
if err := os.WriteFile(cfgFilePath, []byte(testconfig), 0o666); err != nil {
t.Fatal(err)
}
ccconf.Init(cfgFilePath)
// Load and check main configuration
if cfg := ccconf.GetPackageConfig("main"); cfg != nil {
config.Init(cfg)
} else {
cclog.Abort("Main configuration must be present")
}
archiveCfg := fmt.Sprintf("{\"kind\": \"file\",\"path\": \"%s\"}", jobarchive)
repository.Connect(dbfilepath)
if err := archive.Init(json.RawMessage(archiveCfg)); err != nil {
t.Fatal(err)
}
// metricstore initialization removed - it's initialized via callback in tests
archiver.Start(repository.GetJobRepository(), context.Background())
if cfg := ccconf.GetPackageConfig("auth"); cfg != nil {
auth.Init(&cfg)
} else {
cclog.Warn("Authentication disabled due to missing configuration")
auth.Init(nil)
}
graph.Init()
return NewNatsAPI()
}
func cleanupNatsTest() {
if err := archiver.Shutdown(5 * time.Second); err != nil {
cclog.Warnf("Archiver shutdown timeout in tests: %v", err)
}
}
func TestNatsHandleStartJob(t *testing.T) {
natsAPI := setupNatsTest(t)
t.Cleanup(cleanupNatsTest)
tests := []struct {
name string
payload string
expectError bool
validateJob func(t *testing.T, job *schema.Job)
shouldFindJob bool
}{
{
name: "valid job start",
payload: `{
"jobId": 1001,
"user": "testuser1",
"project": "testproj1",
"cluster": "testcluster",
"partition": "main",
"walltime": 7200,
"numNodes": 1,
"numHwthreads": 8,
"numAcc": 0,
"shared": "none",
"monitoringStatus": 1,
"smt": 1,
"resources": [
{
"hostname": "host123",
"hwthreads": [0, 1, 2, 3, 4, 5, 6, 7]
}
],
"startTime": 1234567890
}`,
expectError: false,
shouldFindJob: true,
validateJob: func(t *testing.T, job *schema.Job) {
if job.JobID != 1001 {
t.Errorf("expected JobID 1001, got %d", job.JobID)
}
if job.User != "testuser1" {
t.Errorf("expected user testuser1, got %s", job.User)
}
if job.State != schema.JobStateRunning {
t.Errorf("expected state running, got %s", job.State)
}
},
},
{
name: "invalid JSON",
payload: `{
"jobId": "not a number",
"user": "testuser2"
}`,
expectError: true,
shouldFindJob: false,
},
{
name: "missing required fields",
payload: `{
"jobId": 1002
}`,
expectError: true,
shouldFindJob: false,
},
{
name: "job with unknown fields (should fail due to DisallowUnknownFields)",
payload: `{
"jobId": 1003,
"user": "testuser3",
"project": "testproj3",
"cluster": "testcluster",
"partition": "main",
"walltime": 3600,
"numNodes": 1,
"numHwthreads": 8,
"unknownField": "should cause error",
"startTime": 1234567900
}`,
expectError: true,
shouldFindJob: false,
},
{
name: "job with tags",
payload: `{
"jobId": 1004,
"user": "testuser4",
"project": "testproj4",
"cluster": "testcluster",
"partition": "main",
"walltime": 3600,
"numNodes": 1,
"numHwthreads": 8,
"numAcc": 0,
"shared": "none",
"monitoringStatus": 1,
"smt": 1,
"resources": [
{
"hostname": "host123",
"hwthreads": [0, 1, 2, 3]
}
],
"tags": [
{
"type": "test",
"name": "testtag",
"scope": "testuser4"
}
],
"startTime": 1234567910
}`,
expectError: false,
shouldFindJob: true,
validateJob: func(t *testing.T, job *schema.Job) {
if job.JobID != 1004 {
t.Errorf("expected JobID 1004, got %d", job.JobID)
}
},
},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
natsAPI.handleStartJob(tt.payload)
natsAPI.JobRepository.SyncJobs()
// Allow some time for async operations
time.Sleep(100 * time.Millisecond)
if tt.shouldFindJob {
// Extract jobId from payload
var payloadMap map[string]any
json.Unmarshal([]byte(tt.payload), &payloadMap)
jobID := int64(payloadMap["jobId"].(float64))
cluster := payloadMap["cluster"].(string)
startTime := int64(payloadMap["startTime"].(float64))
job, err := natsAPI.JobRepository.Find(&jobID, &cluster, &startTime)
if err != nil {
if !tt.expectError {
t.Fatalf("expected to find job, but got error: %v", err)
}
return
}
if tt.validateJob != nil {
tt.validateJob(t, job)
}
}
})
}
}
func TestNatsHandleStopJob(t *testing.T) {
natsAPI := setupNatsTest(t)
t.Cleanup(cleanupNatsTest)
// First, create a running job
startPayload := `{
"jobId": 2001,
"user": "testuser",
"project": "testproj",
"cluster": "testcluster",
"partition": "main",
"walltime": 3600,
"numNodes": 1,
"numHwthreads": 8,
"numAcc": 0,
"shared": "none",
"monitoringStatus": 1,
"smt": 1,
"resources": [
{
"hostname": "host123",
"hwthreads": [0, 1, 2, 3, 4, 5, 6, 7]
}
],
"startTime": 1234567890
}`
natsAPI.handleStartJob(startPayload)
natsAPI.JobRepository.SyncJobs()
time.Sleep(100 * time.Millisecond)
tests := []struct {
name string
payload string
expectError bool
validateJob func(t *testing.T, job *schema.Job)
setupJobFunc func() // Optional: create specific test job
}{
{
name: "valid job stop - completed",
payload: `{
"jobId": 2001,
"cluster": "testcluster",
"startTime": 1234567890,
"jobState": "completed",
"stopTime": 1234571490
}`,
expectError: false,
validateJob: func(t *testing.T, job *schema.Job) {
if job.State != schema.JobStateCompleted {
t.Errorf("expected state completed, got %s", job.State)
}
expectedDuration := int32(1234571490 - 1234567890)
if job.Duration != expectedDuration {
t.Errorf("expected duration %d, got %d", expectedDuration, job.Duration)
}
},
},
{
name: "valid job stop - failed",
setupJobFunc: func() {
startPayloadFailed := `{
"jobId": 2002,
"user": "testuser",
"project": "testproj",
"cluster": "testcluster",
"partition": "main",
"walltime": 3600,
"numNodes": 1,
"numHwthreads": 8,
"numAcc": 0,
"shared": "none",
"monitoringStatus": 1,
"smt": 1,
"resources": [
{
"hostname": "host123",
"hwthreads": [0, 1, 2, 3]
}
],
"startTime": 1234567900
}`
natsAPI.handleStartJob(startPayloadFailed)
natsAPI.JobRepository.SyncJobs()
time.Sleep(100 * time.Millisecond)
},
payload: `{
"jobId": 2002,
"cluster": "testcluster",
"startTime": 1234567900,
"jobState": "failed",
"stopTime": 1234569900
}`,
expectError: false,
validateJob: func(t *testing.T, job *schema.Job) {
if job.State != schema.JobStateFailed {
t.Errorf("expected state failed, got %s", job.State)
}
},
},
{
name: "invalid JSON",
payload: `{
"jobId": "not a number"
}`,
expectError: true,
},
{
name: "missing jobId",
payload: `{
"cluster": "testcluster",
"jobState": "completed",
"stopTime": 1234571490
}`,
expectError: true,
},
{
name: "invalid job state",
setupJobFunc: func() {
startPayloadInvalid := `{
"jobId": 2003,
"user": "testuser",
"project": "testproj",
"cluster": "testcluster",
"partition": "main",
"walltime": 3600,
"numNodes": 1,
"numHwthreads": 8,
"numAcc": 0,
"shared": "none",
"monitoringStatus": 1,
"smt": 1,
"resources": [
{
"hostname": "host123",
"hwthreads": [0, 1]
}
],
"startTime": 1234567910
}`
natsAPI.handleStartJob(startPayloadInvalid)
natsAPI.JobRepository.SyncJobs()
time.Sleep(100 * time.Millisecond)
},
payload: `{
"jobId": 2003,
"cluster": "testcluster",
"startTime": 1234567910,
"jobState": "invalid_state",
"stopTime": 1234571510
}`,
expectError: true,
},
{
name: "stopTime before startTime",
setupJobFunc: func() {
startPayloadTime := `{
"jobId": 2004,
"user": "testuser",
"project": "testproj",
"cluster": "testcluster",
"partition": "main",
"walltime": 3600,
"numNodes": 1,
"numHwthreads": 8,
"numAcc": 0,
"shared": "none",
"monitoringStatus": 1,
"smt": 1,
"resources": [
{
"hostname": "host123",
"hwthreads": [0]
}
],
"startTime": 1234567920
}`
natsAPI.handleStartJob(startPayloadTime)
natsAPI.JobRepository.SyncJobs()
time.Sleep(100 * time.Millisecond)
},
payload: `{
"jobId": 2004,
"cluster": "testcluster",
"startTime": 1234567920,
"jobState": "completed",
"stopTime": 1234567900
}`,
expectError: true,
},
{
name: "job not found",
payload: `{
"jobId": 99999,
"cluster": "testcluster",
"startTime": 1234567890,
"jobState": "completed",
"stopTime": 1234571490
}`,
expectError: true,
},
}
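// Synthetic metric data returned by the stubbed metric store during archiving.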
testData := schema.JobData{
"load_one": map[schema.MetricScope]*schema.JobMetric{
schema.MetricScopeNode: {
Unit: schema.Unit{Base: "load"},
Timestep: 60,
Series: []schema.Series{
{
Hostname: "host123",
Statistics: schema.MetricStatistics{Min: 0.1, Avg: 0.2, Max: 0.3},
Data: []schema.Float{0.1, 0.1, 0.1, 0.2, 0.2, 0.2, 0.3, 0.3, 0.3},
},
},
},
},
}
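// Stub the metric store load path so job archiving works without a live backend.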
metricstore.TestLoadDataCallback = func(job *schema.Job, metrics []string, scopes []schema.MetricScope, ctx context.Context, resolution int) (schema.JobData, error) {
return testData, nil
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
if tt.setupJobFunc != nil {
tt.setupJobFunc()
}
natsAPI.handleStopJob(tt.payload)
// Allow some time for async operations
time.Sleep(100 * time.Millisecond)
if !tt.expectError && tt.validateJob != nil {
// Extract job details from payload
var payloadMap map[string]any
json.Unmarshal([]byte(tt.payload), &payloadMap)
jobID := int64(payloadMap["jobId"].(float64))
cluster := payloadMap["cluster"].(string)
var startTime *int64
if st, ok := payloadMap["startTime"]; ok {
t := int64(st.(float64))
startTime = &t
}
job, err := natsAPI.JobRepository.Find(&jobID, &cluster, startTime)
if err != nil {
t.Fatalf("expected to find job, but got error: %v", err)
}
tt.validateJob(t, job)
}
})
}
}
func TestNatsHandleNodeState(t *testing.T) {
natsAPI := setupNatsTest(t)
t.Cleanup(cleanupNatsTest)
tests := []struct {
name string
data []byte
expectError bool
validateFn func(t *testing.T)
}{
{
name: "valid node state update",
data: []byte(`nodestate event="{\"cluster\":\"testcluster\",\"nodes\":[{\"hostname\":\"host123\",\"states\":[\"allocated\"],\"cpusAllocated\":8,\"memoryAllocated\":16384,\"gpusAllocated\":0,\"jobsRunning\":1}]}" 1234567890000000000`),
expectError: false,
validateFn: func(t *testing.T) {
// In a full test, we would verify the node state was updated in the database
// For now, just ensure no error occurred
},
},
{
name: "multiple nodes",
data: []byte(`nodestate event="{\"cluster\":\"testcluster\",\"nodes\":[{\"hostname\":\"host123\",\"states\":[\"idle\"],\"cpusAllocated\":0,\"memoryAllocated\":0,\"gpusAllocated\":0,\"jobsRunning\":0},{\"hostname\":\"host124\",\"states\":[\"allocated\"],\"cpusAllocated\":4,\"memoryAllocated\":8192,\"gpusAllocated\":1,\"jobsRunning\":1}]}" 1234567890000000000`),
expectError: false,
},
{
name: "invalid JSON in event field",
data: []byte(`nodestate event="{\"cluster\":\"testcluster\",\"nodes\":\"not an array\"}" 1234567890000000000`),
expectError: true,
},
{
name: "empty nodes array",
data: []byte(`nodestate event="{\"cluster\":\"testcluster\",\"nodes\":[]}" 1234567890000000000`),
expectError: false, // Empty array should not cause error
},
{
name: "invalid line protocol format",
data: []byte(`invalid line protocol format`),
expectError: true,
},
{
name: "empty data",
data: []byte(``),
expectError: false, // Should be handled gracefully with warning
},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
natsAPI.handleNodeState("test.subject", tt.data)
// Allow some time for async operations
time.Sleep(50 * time.Millisecond)
if tt.validateFn != nil {
tt.validateFn(t)
}
})
}
}
func TestNatsProcessJobEvent(t *testing.T) {
natsAPI := setupNatsTest(t)
t.Cleanup(cleanupNatsTest)
msgStartJob, err := lp.NewMessage(
"job",
map[string]string{"function": "start_job"},
nil,
map[string]any{
"event": `{
"jobId": 3001,
"user": "testuser",
"project": "testproj",
"cluster": "testcluster",
"partition": "main",
"walltime": 3600,
"numNodes": 1,
"numHwthreads": 8,
"numAcc": 0,
"shared": "none",
"monitoringStatus": 1,
"smt": 1,
"resources": [
{
"hostname": "host123",
"hwthreads": [0, 1, 2, 3]
}
],
"startTime": 1234567890
}`,
},
time.Now(),
)
if err != nil {
t.Fatalf("failed to create test message: %v", err)
}
msgMissingTag, err := lp.NewMessage(
"job",
map[string]string{},
nil,
map[string]any{
"event": `{}`,
},
time.Now(),
)
if err != nil {
t.Fatalf("failed to create test message: %v", err)
}
msgUnknownFunc, err := lp.NewMessage(
"job",
map[string]string{"function": "unknown_function"},
nil,
map[string]any{
"event": `{}`,
},
time.Now(),
)
if err != nil {
t.Fatalf("failed to create test message: %v", err)
}
tests := []struct {
name string
message lp.CCMessage
expectError bool
}{
{
name: "start_job function",
message: msgStartJob,
expectError: false,
},
{
name: "missing function tag",
message: msgMissingTag,
expectError: true,
},
{
name: "unknown function",
message: msgUnknownFunc,
expectError: false,
},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
natsAPI.processJobEvent(tt.message)
time.Sleep(50 * time.Millisecond)
})
}
}
func TestNatsHandleJobEvent(t *testing.T) {
natsAPI := setupNatsTest(t)
t.Cleanup(cleanupNatsTest)
tests := []struct {
name string
data []byte
expectError bool
}{
{
name: "valid influx line protocol",
data: []byte(`job,function=start_job event="{\"jobId\":4001,\"user\":\"testuser\",\"project\":\"testproj\",\"cluster\":\"testcluster\",\"partition\":\"main\",\"walltime\":3600,\"numNodes\":1,\"numHwthreads\":8,\"numAcc\":0,\"shared\":\"none\",\"monitoringStatus\":1,\"smt\":1,\"resources\":[{\"hostname\":\"host123\",\"hwthreads\":[0,1,2,3]}],\"startTime\":1234567890}" 1234567890000000000`),
expectError: false,
},
{
name: "invalid influx line protocol",
data: []byte(`invalid line protocol format`),
expectError: true,
},
{
name: "empty data",
data: []byte(``),
expectError: false, // Decoder should handle empty input gracefully
},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
// handleJobEvent doesn't return errors; it logs them
// We're just ensuring it doesn't panic
natsAPI.handleJobEvent("test.subject", tt.data)
time.Sleep(50 * time.Millisecond)
})
}
}
func TestNatsHandleJobEventEdgeCases(t *testing.T) {
natsAPI := setupNatsTest(t)
t.Cleanup(cleanupNatsTest)
tests := []struct {
name string
data []byte
expectError bool
description string
}{
{
name: "non-event message (metric data)",
data: []byte(`job,function=start_job value=123.45 1234567890000000000`),
expectError: false,
description: "Should skip non-event messages gracefully",
},
{
name: "wrong measurement name",
data: []byte(`wrongmeasurement,function=start_job event="{}" 1234567890000000000`),
expectError: false,
description: "Should warn about unexpected measurement but not fail",
},
{
name: "missing event field",
data: []byte(`job,function=start_job other_field="value" 1234567890000000000`),
expectError: true,
description: "Should error when event field is missing",
},
{
name: "multiple measurements in one message",
data: []byte("job,function=start_job event=\"{}\" 1234567890000000000\njob,function=stop_job event=\"{}\" 1234567890000000000"),
expectError: false,
description: "Should process multiple lines",
},
{
name: "escaped quotes in JSON payload",
data: []byte(`job,function=start_job event="{\"jobId\":6001,\"user\":\"test\\\"user\",\"cluster\":\"test\"}" 1234567890000000000`),
expectError: true,
description: "Should handle escaped quotes (though JSON parsing may fail)",
},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
natsAPI.handleJobEvent("test.subject", tt.data)
time.Sleep(50 * time.Millisecond)
})
}
}
func TestNatsHandleNodeStateEdgeCases(t *testing.T) {
natsAPI := setupNatsTest(t)
t.Cleanup(cleanupNatsTest)
tests := []struct {
name string
data []byte
expectError bool
description string
}{
{
name: "missing cluster field in JSON",
data: []byte(`nodestate event="{\"nodes\":[]}" 1234567890000000000`),
expectError: true,
description: "Should fail when cluster is missing",
},
{
name: "malformed JSON with unescaped quotes",
data: []byte(`nodestate event="{\"cluster\":\"test"cluster\",\"nodes\":[]}" 1234567890000000000`),
expectError: true,
description: "Should fail on malformed JSON",
},
{
name: "unicode characters in hostname",
data: []byte(`nodestate event="{\"cluster\":\"testcluster\",\"nodes\":[{\"hostname\":\"host-ñ123\",\"states\":[\"idle\"],\"cpusAllocated\":0,\"memoryAllocated\":0,\"gpusAllocated\":0,\"jobsRunning\":0}]}" 1234567890000000000`),
expectError: false,
description: "Should handle unicode characters",
},
{
name: "very large node count",
data: []byte(`nodestate event="{\"cluster\":\"testcluster\",\"nodes\":[{\"hostname\":\"node1\",\"states\":[\"idle\"],\"cpusAllocated\":0,\"memoryAllocated\":0,\"gpusAllocated\":0,\"jobsRunning\":0},{\"hostname\":\"node2\",\"states\":[\"idle\"],\"cpusAllocated\":0,\"memoryAllocated\":0,\"gpusAllocated\":0,\"jobsRunning\":0},{\"hostname\":\"node3\",\"states\":[\"idle\"],\"cpusAllocated\":0,\"memoryAllocated\":0,\"gpusAllocated\":0,\"jobsRunning\":0}]}" 1234567890000000000`),
expectError: false,
description: "Should handle multiple nodes efficiently",
},
{
name: "timestamp in past",
data: []byte(`nodestate event="{\"cluster\":\"testcluster\",\"nodes\":[]}" 1000000000000000000`),
expectError: false,
description: "Should accept any valid timestamp",
},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
natsAPI.handleNodeState("test.subject", tt.data)
time.Sleep(50 * time.Millisecond)
})
}
}
func TestNatsHandleStartJobDuplicatePrevention(t *testing.T) {
natsAPI := setupNatsTest(t)
t.Cleanup(cleanupNatsTest)
// Start a job
payload := `{
"jobId": 5001,
"user": "testuser",
"project": "testproj",
"cluster": "testcluster",
"partition": "main",
"walltime": 3600,
"numNodes": 1,
"numHwthreads": 8,
"numAcc": 0,
"shared": "none",
"monitoringStatus": 1,
"smt": 1,
"resources": [
{
"hostname": "host123",
"hwthreads": [0, 1, 2, 3]
}
],
"startTime": 1234567890
}`
natsAPI.handleStartJob(payload)
natsAPI.JobRepository.SyncJobs()
time.Sleep(100 * time.Millisecond)
// Try to start the same job again (within 24 hours)
duplicatePayload := `{
"jobId": 5001,
"user": "testuser",
"project": "testproj",
"cluster": "testcluster",
"partition": "main",
"walltime": 3600,
"numNodes": 1,
"numHwthreads": 8,
"numAcc": 0,
"shared": "none",
"monitoringStatus": 1,
"smt": 1,
"resources": [
{
"hostname": "host123",
"hwthreads": [0, 1, 2, 3]
}
],
"startTime": 1234567900
}`
natsAPI.handleStartJob(duplicatePayload)
natsAPI.JobRepository.SyncJobs()
time.Sleep(100 * time.Millisecond)
// Verify only one job exists
jobID := int64(5001)
cluster := "testcluster"
jobs, err := natsAPI.JobRepository.FindAll(&jobID, &cluster, nil)
if err != nil && err != sql.ErrNoRows {
t.Fatalf("unexpected error: %v", err)
}
if len(jobs) != 1 {
t.Errorf("expected 1 job, got %d", len(jobs))
}
}

View File

@@ -7,12 +7,17 @@ package api
import (
"fmt"
"maps"
"net/http"
"strings"
"time"
"github.com/ClusterCockpit/cc-backend/internal/metricdispatch"
"github.com/ClusterCockpit/cc-backend/internal/repository"
"github.com/ClusterCockpit/cc-lib/schema"
"github.com/ClusterCockpit/cc-backend/pkg/archive"
"github.com/ClusterCockpit/cc-backend/pkg/metricstore"
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
"github.com/ClusterCockpit/cc-lib/v2/schema"
)
type UpdateNodeStatesRequest struct {
@@ -20,6 +25,15 @@ type UpdateNodeStatesRequest struct {
Cluster string `json:"cluster" example:"fritz"`
}
// metricListToNames converts a map of metric configurations to a list of metric names
func metricListToNames(metricList map[string]*schema.Metric) []string {
names := make([]string, 0, len(metricList))
for name := range metricList {
names = append(names, name)
}
return names
}
// this routine assumes that only one scheduler state is reported per node
func determineState(states []string) schema.SchedulerState {
for _, state := range states {
@@ -47,7 +61,7 @@ func determineState(states []string) schema.SchedulerState {
// @description Required query-parameter defines if all users or only users with additional special roles are returned.
// @produce json
// @param request body UpdateNodeStatesRequest true "Request body containing nodes and their states"
// @success 200 {object} api.DefaultApiResponse "Success message"
// @success 200 {object} api.DefaultAPIResponse "Success message"
// @failure 400 {object} api.ErrorResponse "Bad Request"
// @failure 401 {object} api.ErrorResponse "Unauthorized"
// @failure 403 {object} api.ErrorResponse "Forbidden"
@@ -62,19 +76,70 @@ func (api *RestAPI) updateNodeStates(rw http.ResponseWriter, r *http.Request) {
http.StatusBadRequest, rw)
return
}
requestReceived := time.Now().Unix()
repo := repository.GetNodeRepository()
m := make(map[string][]string)
metricNames := make(map[string][]string)
healthResults := make(map[string]metricstore.HealthCheckResult)
startMs := time.Now()
// Step 1: Build nodeList and metricList per subcluster
for _, node := range req.Nodes {
if sc, err := archive.GetSubClusterByNode(req.Cluster, node.Hostname); err == nil {
m[sc] = append(m[sc], node.Hostname)
}
}
for sc := range m {
if sc != "" {
metricList := archive.GetMetricConfigSubCluster(req.Cluster, sc)
metricNames[sc] = metricListToNames(metricList)
}
}
// Step 2: Determine which metric store to query and perform health check
healthRepo, err := metricdispatch.GetHealthCheckRepo(req.Cluster)
if err != nil {
cclog.Warnf("updateNodeStates: no metric store for cluster %s, skipping health check: %v", req.Cluster, err)
} else {
for sc, nl := range m {
if sc != "" {
if results, err := healthRepo.HealthCheck(req.Cluster, nl, metricNames[sc]); err == nil {
maps.Copy(healthResults, results)
}
}
}
}
cclog.Debugf("Timer updateNodeStates, MemStore HealthCheck: %s", time.Since(startMs))
startDB := time.Now()
for _, node := range req.Nodes {
state := determineState(node.States)
healthState := schema.MonitoringStateFailed
var healthMetrics string
if result, ok := healthResults[node.Hostname]; ok {
healthState = result.State
healthMetrics = result.HealthMetrics
}
nodeState := schema.NodeStateDB{
TimeStamp: time.Now().Unix(), NodeState: state,
TimeStamp: requestReceived,
NodeState: state,
CpusAllocated: node.CpusAllocated,
MemoryAllocated: node.MemoryAllocated,
GpusAllocated: node.GpusAllocated,
HealthState: schema.MonitoringStateFull,
HealthState: healthState,
HealthMetrics: healthMetrics,
JobsRunning: node.JobsRunning,
}
repo.UpdateNodeState(node.Hostname, req.Cluster, &nodeState)
if err := repo.UpdateNodeState(node.Hostname, req.Cluster, &nodeState); err != nil {
cclog.Errorf("updateNodeStates: updating node state for %s on %s failed: %v",
node.Hostname, req.Cluster, err)
}
}
cclog.Debugf("Timer updateNodeStates, SQLite Inserts: %s", time.Since(startDB))
}

View File

@@ -22,10 +22,11 @@ import (
"github.com/ClusterCockpit/cc-backend/internal/auth"
"github.com/ClusterCockpit/cc-backend/internal/config"
"github.com/ClusterCockpit/cc-backend/internal/repository"
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
"github.com/ClusterCockpit/cc-lib/schema"
"github.com/ClusterCockpit/cc-lib/util"
"github.com/gorilla/mux"
"github.com/ClusterCockpit/cc-backend/internal/tagger"
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
"github.com/ClusterCockpit/cc-lib/v2/schema"
"github.com/ClusterCockpit/cc-lib/v2/util"
"github.com/go-chi/chi/v5"
)
// @title ClusterCockpit REST API
@@ -48,6 +49,7 @@ import (
const (
noticeFilePath = "./var/notice.txt"
noticeFilePerms = 0o644
maxNoticeLength = 10000 // Maximum allowed notice content length in characters
)
type RestAPI struct {
@@ -61,6 +63,7 @@ type RestAPI struct {
RepositoryMutex sync.Mutex
}
// New creates and initializes a new RestAPI instance with configured dependencies.
func New() *RestAPI {
return &RestAPI{
JobRepository: repository.GetJobRepository(),
@@ -69,79 +72,100 @@ func New() *RestAPI {
}
}
func (api *RestAPI) MountAPIRoutes(r *mux.Router) {
r.StrictSlash(true)
// MountAPIRoutes registers REST API endpoints for job and cluster management.
// These routes use JWT token authentication via the X-Auth-Token header.
func (api *RestAPI) MountAPIRoutes(r chi.Router) {
// REST API Uses TokenAuth
// User List
r.HandleFunc("/users/", api.getUsers).Methods(http.MethodGet)
r.Get("/users/", api.getUsers)
// Cluster List
r.HandleFunc("/clusters/", api.getClusters).Methods(http.MethodGet)
r.Get("/clusters/", api.getClusters)
// Slurm node state
r.HandleFunc("/nodestate/", api.updateNodeStates).Methods(http.MethodPost, http.MethodPut)
r.Post("/nodestate/", api.updateNodeStates)
r.Put("/nodestate/", api.updateNodeStates)
// Job Handler
r.HandleFunc("/jobs/start_job/", api.startJob).Methods(http.MethodPost, http.MethodPut)
r.HandleFunc("/jobs/stop_job/", api.stopJobByRequest).Methods(http.MethodPost, http.MethodPut)
r.HandleFunc("/jobs/", api.getJobs).Methods(http.MethodGet)
r.HandleFunc("/jobs/{id}", api.getJobByID).Methods(http.MethodPost)
r.HandleFunc("/jobs/{id}", api.getCompleteJobByID).Methods(http.MethodGet)
r.HandleFunc("/jobs/tag_job/{id}", api.tagJob).Methods(http.MethodPost, http.MethodPatch)
r.HandleFunc("/jobs/tag_job/{id}", api.removeTagJob).Methods(http.MethodDelete)
r.HandleFunc("/jobs/edit_meta/{id}", api.editMeta).Methods(http.MethodPost, http.MethodPatch)
r.HandleFunc("/jobs/metrics/{id}", api.getJobMetrics).Methods(http.MethodGet)
r.HandleFunc("/jobs/delete_job/", api.deleteJobByRequest).Methods(http.MethodDelete)
r.HandleFunc("/jobs/delete_job/{id}", api.deleteJobByID).Methods(http.MethodDelete)
r.HandleFunc("/jobs/delete_job_before/{ts}", api.deleteJobBefore).Methods(http.MethodDelete)
if config.Keys.APISubjects == nil {
cclog.Info("Enabling REST start/stop job API")
r.Post("/jobs/start_job/", api.startJob)
r.Put("/jobs/start_job/", api.startJob)
r.Post("/jobs/stop_job/", api.stopJobByRequest)
r.Put("/jobs/stop_job/", api.stopJobByRequest)
}
r.Get("/jobs/", api.getJobs)
r.Get("/jobs/used_nodes", api.getUsedNodes)
r.Post("/jobs/tag_job/{id}", api.tagJob)
r.Patch("/jobs/tag_job/{id}", api.tagJob)
r.Delete("/jobs/tag_job/{id}", api.removeTagJob)
r.Patch("/jobs/edit_meta/{id}", api.editMeta)
r.Patch("/jobs/edit_meta/", api.editMetaByRequest)
r.Get("/jobs/metrics/{id}", api.getJobMetrics)
r.Delete("/jobs/delete_job/", api.deleteJobByRequest)
r.Delete("/jobs/delete_job/{id}", api.deleteJobByID)
r.Delete("/jobs/delete_job_before/{ts}", api.deleteJobBefore)
r.Post("/jobs/{id}", api.getJobByID)
r.Get("/jobs/{id}", api.getCompleteJobByID)
r.HandleFunc("/tags/", api.removeTags).Methods(http.MethodDelete)
r.Delete("/tags/", api.removeTags)
if api.MachineStateDir != "" {
r.HandleFunc("/machine_state/{cluster}/{host}", api.getMachineState).Methods(http.MethodGet)
r.HandleFunc("/machine_state/{cluster}/{host}", api.putMachineState).Methods(http.MethodPut, http.MethodPost)
r.Get("/machine_state/{cluster}/{host}", api.getMachineState)
r.Put("/machine_state/{cluster}/{host}", api.putMachineState)
r.Post("/machine_state/{cluster}/{host}", api.putMachineState)
}
}
func (api *RestAPI) MountUserAPIRoutes(r *mux.Router) {
r.StrictSlash(true)
// MountUserAPIRoutes registers user-accessible REST API endpoints.
// These are limited endpoints for regular users with JWT token authentication.
func (api *RestAPI) MountUserAPIRoutes(r chi.Router) {
// REST API Uses TokenAuth
r.HandleFunc("/jobs/", api.getJobs).Methods(http.MethodGet)
r.HandleFunc("/jobs/{id}", api.getJobByID).Methods(http.MethodPost)
r.HandleFunc("/jobs/{id}", api.getCompleteJobByID).Methods(http.MethodGet)
r.HandleFunc("/jobs/metrics/{id}", api.getJobMetrics).Methods(http.MethodGet)
r.Get("/jobs/", api.getJobs)
r.Post("/jobs/{id}", api.getJobByID)
r.Get("/jobs/{id}", api.getCompleteJobByID)
r.Get("/jobs/metrics/{id}", api.getJobMetrics)
}
func (api *RestAPI) MountMetricStoreAPIRoutes(r *mux.Router) {
// MountMetricStoreAPIRoutes registers metric storage API endpoints.
// These endpoints handle metric data ingestion and health checks with JWT token authentication.
func (api *RestAPI) MountMetricStoreAPIRoutes(r chi.Router) {
// REST API Uses TokenAuth
// Note: chi has no StrictSlash equivalent, so both slash variants are registered explicitly
r.HandleFunc("/api/free", freeMetrics).Methods(http.MethodPost)
r.HandleFunc("/api/write", writeMetrics).Methods(http.MethodPost)
r.HandleFunc("/api/debug", debugMetrics).Methods(http.MethodGet)
r.HandleFunc("/api/healthcheck", metricsHealth).Methods(http.MethodGet)
r.Post("/free", freeMetrics)
r.Post("/write", writeMetrics)
r.Get("/debug", debugMetrics)
r.Post("/healthcheck", api.updateNodeStates)
// Same endpoints but with trailing slash
r.HandleFunc("/api/free/", freeMetrics).Methods(http.MethodPost)
r.HandleFunc("/api/write/", writeMetrics).Methods(http.MethodPost)
r.HandleFunc("/api/debug/", debugMetrics).Methods(http.MethodGet)
r.HandleFunc("/api/healthcheck/", metricsHealth).Methods(http.MethodGet)
r.Post("/free/", freeMetrics)
r.Post("/write/", writeMetrics)
r.Get("/debug/", debugMetrics)
r.Post("/healthcheck/", api.updateNodeStates)
}
func (api *RestAPI) MountConfigAPIRoutes(r *mux.Router) {
r.StrictSlash(true)
// MountConfigAPIRoutes registers configuration and user management endpoints.
// These routes use session-based authentication and require admin privileges.
// Routes use full paths (including /config prefix) to avoid conflicting with
// the /config page route when registered via Group instead of Route.
func (api *RestAPI) MountConfigAPIRoutes(r chi.Router) {
// Settings Frontend Uses SessionAuth
if api.Authentication != nil {
r.HandleFunc("/roles/", api.getRoles).Methods(http.MethodGet)
r.HandleFunc("/users/", api.createUser).Methods(http.MethodPost, http.MethodPut)
r.HandleFunc("/users/", api.getUsers).Methods(http.MethodGet)
r.HandleFunc("/users/", api.deleteUser).Methods(http.MethodDelete)
r.HandleFunc("/user/{id}", api.updateUser).Methods(http.MethodPost)
r.HandleFunc("/notice/", api.editNotice).Methods(http.MethodPost)
r.Get("/config/roles/", api.getRoles)
r.Post("/config/users/", api.createUser)
r.Put("/config/users/", api.createUser)
r.Get("/config/users/", api.getUsers)
r.Delete("/config/users/", api.deleteUser)
r.Post("/config/user/{id}", api.updateUser)
r.Post("/config/notice/", api.editNotice)
r.Get("/config/taggers/", api.getTaggers)
r.Post("/config/taggers/run/", api.runTagger)
}
}
func (api *RestAPI) MountFrontendAPIRoutes(r *mux.Router) {
r.StrictSlash(true)
// MountFrontendAPIRoutes registers frontend-specific API endpoints.
// These routes support JWT generation and user configuration updates with session authentication.
func (api *RestAPI) MountFrontendAPIRoutes(r chi.Router) {
r.Get("/logs/", api.getJournalLog)
// Settings Frontend Uses SessionAuth
if api.Authentication != nil {
r.HandleFunc("/jwt/", api.getJWT).Methods(http.MethodGet)
r.HandleFunc("/configuration/", api.updateConfiguration).Methods(http.MethodPost)
r.Get("/jwt/", api.getJWT)
r.Post("/configuration/", api.updateConfiguration)
}
}
@@ -157,6 +181,8 @@ type DefaultAPIResponse struct {
Message string `json:"msg"`
}
// handleError writes a standardized JSON error response with the given status code.
// It logs the error at WARN level and ensures proper Content-Type headers are set.
func handleError(err error, statusCode int, rw http.ResponseWriter) {
cclog.Warnf("REST ERROR : %s", err.Error())
rw.Header().Add("Content-Type", "application/json")
@@ -169,15 +195,38 @@ func handleError(err error, statusCode int, rw http.ResponseWriter) {
}
}
// decode reads JSON from r into val with strict validation that rejects unknown fields.
func decode(r io.Reader, val any) error {
dec := json.NewDecoder(r)
dec.DisallowUnknownFields()
return dec.Decode(val)
}
func (api *RestAPI) editNotice(rw http.ResponseWriter, r *http.Request) {
// SecuredCheck() only worked with TokenAuth: Removed
// validatePathComponent checks if a path component contains potentially malicious patterns
// that could be used for path traversal attacks. Returns an error if validation fails.
func validatePathComponent(component, componentName string) error {
if strings.Contains(component, "..") ||
strings.Contains(component, "/") ||
strings.Contains(component, "\\") {
return fmt.Errorf("invalid %s", componentName)
}
return nil
}
// editNotice godoc
// @summary Update system notice
// @tags Config
// @description Updates the notice.txt file content. Only admins are allowed. Content is limited to 10000 characters.
// @accept mpfd
// @produce plain
// @param new-content formData string true "New notice content (max 10000 characters)"
// @success 200 {string} string "Update Notice Content Success"
// @failure 400 {object} ErrorResponse "Bad Request"
// @failure 403 {object} ErrorResponse "Forbidden"
// @failure 500 {object} ErrorResponse "Internal Server Error"
// @security ApiKeyAuth
// @router /notice/ [post]
func (api *RestAPI) editNotice(rw http.ResponseWriter, r *http.Request) {
if user := repository.GetUserFromContext(r.Context()); !user.HasRole(schema.RoleAdmin) {
handleError(fmt.Errorf("only admins are allowed to update the notice.txt file"), http.StatusForbidden, rw)
return
@@ -186,9 +235,8 @@ func (api *RestAPI) editNotice(rw http.ResponseWriter, r *http.Request) {
// Get Value
newContent := r.FormValue("new-content")
// Validate content length to prevent DoS
if len(newContent) > 10000 {
handleError(fmt.Errorf("notice content exceeds maximum length of 10000 characters"), http.StatusBadRequest, rw)
if len(newContent) > maxNoticeLength {
handleError(fmt.Errorf("notice content exceeds maximum length of %d characters", maxNoticeLength), http.StatusBadRequest, rw)
return
}
@@ -200,7 +248,9 @@ func (api *RestAPI) editNotice(rw http.ResponseWriter, r *http.Request) {
handleError(fmt.Errorf("creating notice file failed: %w", err), http.StatusInternalServerError, rw)
return
}
ntxt.Close()
if err := ntxt.Close(); err != nil {
cclog.Warnf("Failed to close notice file: %v", err)
}
}
if err := os.WriteFile(noticeFilePath, []byte(newContent), noticeFilePerms); err != nil {
@@ -210,13 +260,66 @@ func (api *RestAPI) editNotice(rw http.ResponseWriter, r *http.Request) {
rw.Header().Set("Content-Type", "text/plain")
rw.WriteHeader(http.StatusOK)
var msg []byte
if newContent != "" {
rw.Write([]byte("Update Notice Content Success"))
msg = []byte("Update Notice Content Success")
} else {
rw.Write([]byte("Empty Notice Content Success"))
msg = []byte("Empty Notice Content Success")
}
if _, err := rw.Write(msg); err != nil {
cclog.Errorf("Failed to write response: %v", err)
}
}
func (api *RestAPI) getTaggers(rw http.ResponseWriter, r *http.Request) {
if user := repository.GetUserFromContext(r.Context()); !user.HasRole(schema.RoleAdmin) {
handleError(fmt.Errorf("only admins are allowed to list taggers"), http.StatusForbidden, rw)
return
}
rw.Header().Set("Content-Type", "application/json")
if err := json.NewEncoder(rw).Encode(tagger.ListTaggers()); err != nil {
cclog.Errorf("Failed to encode tagger list: %v", err)
}
}
func (api *RestAPI) runTagger(rw http.ResponseWriter, r *http.Request) {
if user := repository.GetUserFromContext(r.Context()); !user.HasRole(schema.RoleAdmin) {
handleError(fmt.Errorf("only admins are allowed to run taggers"), http.StatusForbidden, rw)
return
}
name := r.FormValue("name")
if name == "" {
handleError(fmt.Errorf("missing required parameter: name"), http.StatusBadRequest, rw)
return
}
if err := tagger.RunTaggerByName(name); err != nil {
handleError(err, http.StatusConflict, rw)
return
}
rw.Header().Set("Content-Type", "text/plain")
rw.WriteHeader(http.StatusOK)
if _, err := rw.Write(fmt.Appendf(nil, "Tagger %s started", name)); err != nil {
cclog.Errorf("Failed to write response: %v", err)
}
}
// getJWT godoc
// @summary Generate JWT token
// @tags Frontend
// @description Generates a JWT token for a user. Admins can generate tokens for any user, regular users only for themselves.
// @accept mpfd
// @produce plain
// @param username formData string true "Username to generate JWT for"
// @success 200 {string} string "JWT token"
// @failure 403 {object} ErrorResponse "Forbidden"
// @failure 404 {object} ErrorResponse "User Not Found"
// @failure 500 {object} ErrorResponse "Internal Server Error"
// @security ApiKeyAuth
// @router /jwt/ [get]
func (api *RestAPI) getJWT(rw http.ResponseWriter, r *http.Request) {
rw.Header().Set("Content-Type", "text/plain")
username := r.FormValue("username")
@@ -241,12 +344,22 @@ func (api *RestAPI) getJWT(rw http.ResponseWriter, r *http.Request) {
}
rw.WriteHeader(http.StatusOK)
rw.Write([]byte(jwt))
if _, err := rw.Write([]byte(jwt)); err != nil {
cclog.Errorf("Failed to write JWT response: %v", err)
}
}
// getRoles godoc
// @summary Get available roles
// @tags Config
// @description Returns a list of valid user roles. Only admins are allowed.
// @produce json
// @success 200 {array} string "List of role names"
// @failure 403 {object} ErrorResponse "Forbidden"
// @failure 500 {object} ErrorResponse "Internal Server Error"
// @security ApiKeyAuth
// @router /roles/ [get]
func (api *RestAPI) getRoles(rw http.ResponseWriter, r *http.Request) {
// SecuredCheck() only worked with TokenAuth: Removed
user := repository.GetUserFromContext(r.Context())
if !user.HasRole(schema.RoleAdmin) {
handleError(fmt.Errorf("only admins are allowed to fetch a list of roles"), http.StatusForbidden, rw)
@@ -265,6 +378,18 @@ func (api *RestAPI) getRoles(rw http.ResponseWriter, r *http.Request) {
}
}
// updateConfiguration godoc
// @summary Update user configuration
// @tags Frontend
// @description Updates a user's configuration key-value pair.
// @accept mpfd
// @produce plain
// @param key formData string true "Configuration key"
// @param value formData string true "Configuration value"
// @success 200 {string} string "success"
// @failure 500 {object} ErrorResponse "Internal Server Error"
// @security ApiKeyAuth
// @router /configuration/ [post]
func (api *RestAPI) updateConfiguration(rw http.ResponseWriter, r *http.Request) {
rw.Header().Set("Content-Type", "text/plain")
key, value := r.FormValue("key"), r.FormValue("value")
@@ -275,26 +400,40 @@ func (api *RestAPI) updateConfiguration(rw http.ResponseWriter, r *http.Request)
}
rw.WriteHeader(http.StatusOK)
rw.Write([]byte("success"))
if _, err := rw.Write([]byte("success")); err != nil {
cclog.Errorf("Failed to write response: %v", err)
}
}
// putMachineState godoc
// @summary Store machine state
// @tags Machine State
// @description Stores machine state data for a specific cluster node. Validates cluster and host names to prevent path traversal.
// @accept json
// @produce plain
// @param cluster path string true "Cluster name"
// @param host path string true "Host name"
// @success 201 "Created"
// @failure 400 {object} ErrorResponse "Bad Request"
// @failure 404 {object} ErrorResponse "Machine state not enabled"
// @failure 500 {object} ErrorResponse "Internal Server Error"
// @security ApiKeyAuth
// @router /machine_state/{cluster}/{host} [put]
func (api *RestAPI) putMachineState(rw http.ResponseWriter, r *http.Request) {
if api.MachineStateDir == "" {
handleError(fmt.Errorf("machine state not enabled"), http.StatusNotFound, rw)
return
}
vars := mux.Vars(r)
cluster := vars["cluster"]
host := vars["host"]
cluster := chi.URLParam(r, "cluster")
host := chi.URLParam(r, "host")
// Validate cluster and host to prevent path traversal attacks
if strings.Contains(cluster, "..") || strings.Contains(cluster, "/") || strings.Contains(cluster, "\\") {
handleError(fmt.Errorf("invalid cluster name"), http.StatusBadRequest, rw)
if err := validatePathComponent(cluster, "cluster name"); err != nil {
handleError(err, http.StatusBadRequest, rw)
return
}
if strings.Contains(host, "..") || strings.Contains(host, "/") || strings.Contains(host, "\\") {
handleError(fmt.Errorf("invalid host name"), http.StatusBadRequest, rw)
if err := validatePathComponent(host, "host name"); err != nil {
handleError(err, http.StatusBadRequest, rw)
return
}
@@ -320,23 +459,33 @@ func (api *RestAPI) putMachineState(rw http.ResponseWriter, r *http.Request) {
rw.WriteHeader(http.StatusCreated)
}
// getMachineState godoc
// @summary Retrieve machine state
// @tags Machine State
// @description Retrieves stored machine state data for a specific cluster node. Validates cluster and host names to prevent path traversal.
// @produce json
// @param cluster path string true "Cluster name"
// @param host path string true "Host name"
// @success 200 {object} object "Machine state JSON data"
// @failure 400 {object} ErrorResponse "Bad Request"
// @failure 404 {object} ErrorResponse "Machine state not enabled or file not found"
// @security ApiKeyAuth
// @router /machine_state/{cluster}/{host} [get]
func (api *RestAPI) getMachineState(rw http.ResponseWriter, r *http.Request) {
if api.MachineStateDir == "" {
handleError(fmt.Errorf("machine state not enabled"), http.StatusNotFound, rw)
return
}
vars := mux.Vars(r)
cluster := vars["cluster"]
host := vars["host"]
cluster := chi.URLParam(r, "cluster")
host := chi.URLParam(r, "host")
// Validate cluster and host to prevent path traversal attacks
if strings.Contains(cluster, "..") || strings.Contains(cluster, "/") || strings.Contains(cluster, "\\") {
handleError(fmt.Errorf("invalid cluster name"), http.StatusBadRequest, rw)
if err := validatePathComponent(cluster, "cluster name"); err != nil {
handleError(err, http.StatusBadRequest, rw)
return
}
if strings.Contains(host, "..") || strings.Contains(host, "/") || strings.Contains(host, "\\") {
handleError(fmt.Errorf("invalid host name"), http.StatusBadRequest, rw)
if err := validatePathComponent(host, "host name"); err != nil {
handleError(err, http.StatusBadRequest, rw)
return
}

View File

@@ -11,9 +11,9 @@ import (
"net/http"
"github.com/ClusterCockpit/cc-backend/internal/repository"
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
"github.com/ClusterCockpit/cc-lib/schema"
"github.com/gorilla/mux"
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
"github.com/ClusterCockpit/cc-lib/v2/schema"
"github.com/go-chi/chi/v5"
)
type APIReturnedUser struct {
@@ -31,7 +31,7 @@ type APIReturnedUser struct {
// @description Required query-parameter defines if all users or only users with additional special roles are returned.
// @produce json
// @param not-just-user query bool true "If returned list should contain all users or only users with additional special roles"
// @success 200 {array} api.ApiReturnedUser "List of users returned successfully"
// @success 200 {array} api.APIReturnedUser "List of users returned successfully"
// @failure 400 {string} string "Bad Request"
// @failure 401 {string} string "Unauthorized"
// @failure 403 {string} string "Forbidden"
@@ -91,7 +91,7 @@ func (api *RestAPI) updateUser(rw http.ResponseWriter, r *http.Request) {
// Handle role updates
if newrole != "" {
if err := repository.GetUserRepository().AddRole(r.Context(), mux.Vars(r)["id"], newrole); err != nil {
if err := repository.GetUserRepository().AddRole(r.Context(), chi.URLParam(r, "id"), newrole); err != nil {
handleError(fmt.Errorf("adding role failed: %w", err), http.StatusUnprocessableEntity, rw)
return
}
@@ -99,7 +99,7 @@ func (api *RestAPI) updateUser(rw http.ResponseWriter, r *http.Request) {
cclog.Errorf("Failed to encode response: %v", err)
}
} else if delrole != "" {
if err := repository.GetUserRepository().RemoveRole(r.Context(), mux.Vars(r)["id"], delrole); err != nil {
if err := repository.GetUserRepository().RemoveRole(r.Context(), chi.URLParam(r, "id"), delrole); err != nil {
handleError(fmt.Errorf("removing role failed: %w", err), http.StatusUnprocessableEntity, rw)
return
}
@@ -107,7 +107,7 @@ func (api *RestAPI) updateUser(rw http.ResponseWriter, r *http.Request) {
cclog.Errorf("Failed to encode response: %v", err)
}
} else if newproj != "" {
if err := repository.GetUserRepository().AddProject(r.Context(), mux.Vars(r)["id"], newproj); err != nil {
if err := repository.GetUserRepository().AddProject(r.Context(), chi.URLParam(r, "id"), newproj); err != nil {
handleError(fmt.Errorf("adding project failed: %w", err), http.StatusUnprocessableEntity, rw)
return
}
@@ -115,7 +115,7 @@ func (api *RestAPI) updateUser(rw http.ResponseWriter, r *http.Request) {
cclog.Errorf("Failed to encode response: %v", err)
}
} else if delproj != "" {
if err := repository.GetUserRepository().RemoveProject(r.Context(), mux.Vars(r)["id"], delproj); err != nil {
if err := repository.GetUserRepository().RemoveProject(r.Context(), chi.URLParam(r, "id"), delproj); err != nil {
handleError(fmt.Errorf("removing project failed: %w", err), http.StatusUnprocessableEntity, rw)
return
}
@@ -164,7 +164,7 @@ func (api *RestAPI) createUser(rw http.ResponseWriter, r *http.Request) {
return
}
if len(password) == 0 && role != schema.GetRoleString(schema.RoleApi) {
if len(password) == 0 && role != schema.GetRoleString(schema.RoleAPI) {
handleError(fmt.Errorf("only API users are allowed to have a blank password (login will be impossible)"), http.StatusBadRequest, rw)
return
}

View File

@@ -106,7 +106,7 @@ Data is archived at the highest available resolution (typically 60s intervals).
```go
// In archiver.go ArchiveJob() function
jobData, err := metricDataDispatcher.LoadData(job, allMetrics, scopes, ctx, 300)
jobData, err := metricdispatch.LoadData(job, allMetrics, scopes, ctx, 300)
// 0 = highest resolution
// 300 = 5-minute resolution
```
@@ -170,7 +170,6 @@ All exported functions are safe for concurrent use:
- `Start()` - Safe to call once
- `TriggerArchiving()` - Safe from multiple goroutines
- `Shutdown()` - Safe to call once
- `WaitForArchiving()` - Deprecated, but safe
Internal state is protected by:
- Channel synchronization (`archiveChannel`)
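A minimal lifecycle sketch, assuming the call signatures that appear elsewhere in this diff (`Start(repo, ctx)`, `TriggerArchiving(job)`, `Shutdown(timeout)`):

```go
// Sketch: typical archiver lifecycle under the signatures shown in this diff.
archiver.Start(repository.GetJobRepository(), context.Background())
defer func() {
	// Bounded shutdown so in-flight archiving can drain.
	if err := archiver.Shutdown(5 * time.Second); err != nil {
		cclog.Warnf("archiver shutdown timed out: %v", err)
	}
}()

// job is a *schema.Job that has already been marked stopped.
// Safe to call from any goroutine once Start has returned.
archiver.TriggerArchiving(job)
```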
@@ -185,6 +184,6 @@ Internal state is protected by:
## Dependencies
- `internal/repository`: Database operations for job metadata
- `internal/metricDataDispatcher`: Loading metric data from various backends
- `internal/metricdispatch`: Loading metric data from various backends
- `pkg/archive`: Archive backend abstraction (filesystem, S3, SQLite)
- `cc-lib/schema`: Job and metric data structures

View File

@@ -54,8 +54,8 @@ import (
"time"
"github.com/ClusterCockpit/cc-backend/internal/repository"
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
"github.com/ClusterCockpit/cc-lib/schema"
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
"github.com/ClusterCockpit/cc-lib/v2/schema"
sq "github.com/Masterminds/squirrel"
)
@@ -126,7 +126,7 @@ func archivingWorker() {
// Metadata is not used directly here; this call warms the JobMeta cache
// and fails if the job metadata is not present in the repository
if _, err := jobRepo.FetchMetadata(job); err != nil {
cclog.Errorf("archiving job (dbid: %d) failed at check metadata step: %s", job.ID, err.Error())
cclog.Errorf("archiving job (dbid: %d) failed at check metadata step: %s", *job.ID, err.Error())
jobRepo.UpdateMonitoringStatus(*job.ID, schema.MonitoringStatusArchivingFailed)
archivePending.Done()
continue
@@ -136,7 +136,7 @@ func archivingWorker() {
// Use shutdown context to allow cancellation
jobMeta, err := ArchiveJob(job, shutdownCtx)
if err != nil {
cclog.Errorf("archiving job (dbid: %d) failed at archiving job step: %s", job.ID, err.Error())
cclog.Errorf("archiving job (dbid: %d) failed at archiving job step: %s", *job.ID, err.Error())
jobRepo.UpdateMonitoringStatus(*job.ID, schema.MonitoringStatusArchivingFailed)
archivePending.Done()
continue
@@ -145,24 +145,24 @@ func archivingWorker() {
stmt := sq.Update("job").Where("job.id = ?", job.ID)
if stmt, err = jobRepo.UpdateFootprint(stmt, jobMeta); err != nil {
cclog.Errorf("archiving job (dbid: %d) failed at update Footprint step: %s", job.ID, err.Error())
cclog.Errorf("archiving job (dbid: %d) failed at update Footprint step: %s", *job.ID, err.Error())
archivePending.Done()
continue
}
if stmt, err = jobRepo.UpdateEnergy(stmt, jobMeta); err != nil {
cclog.Errorf("archiving job (dbid: %d) failed at update Energy step: %s", job.ID, err.Error())
cclog.Errorf("archiving job (dbid: %d) failed at update Energy step: %s", *job.ID, err.Error())
archivePending.Done()
continue
}
// Update the jobs database entry one last time:
stmt = jobRepo.MarkArchived(stmt, schema.MonitoringStatusArchivingSuccessful)
if err := jobRepo.Execute(stmt); err != nil {
cclog.Errorf("archiving job (dbid: %d) failed at db execute: %s", job.ID, err.Error())
cclog.Errorf("archiving job (dbid: %d) failed at db execute: %s", *job.ID, err.Error())
archivePending.Done()
continue
}
cclog.Debugf("archiving job %d took %s", job.JobID, time.Since(start))
cclog.Infof("archiving job (dbid: %d) successful", job.ID)
cclog.Infof("archiving job (dbid: %d) successful", *job.ID)
repository.CallJobStopHooks(job)
archivePending.Done()

View File

@@ -9,11 +9,10 @@ import (
"context"
"math"
"github.com/ClusterCockpit/cc-backend/internal/config"
"github.com/ClusterCockpit/cc-backend/internal/metricDataDispatcher"
"github.com/ClusterCockpit/cc-backend/internal/metricdispatch"
"github.com/ClusterCockpit/cc-backend/pkg/archive"
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
"github.com/ClusterCockpit/cc-lib/schema"
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
"github.com/ClusterCockpit/cc-lib/v2/schema"
)
// ArchiveJob archives a completed job's metric data to the configured archive backend.
@@ -60,7 +59,7 @@ func ArchiveJob(job *schema.Job, ctx context.Context) (*schema.Job, error) {
scopes = append(scopes, schema.MetricScopeAccelerator)
}
jobData, err := metricDataDispatcher.LoadData(job, allMetrics, scopes, ctx, 0) // 0 Resulotion-Value retrieves highest res (60s)
jobData, err := metricdispatch.LoadData(job, allMetrics, scopes, ctx, 0) // resolution value 0 retrieves highest res (60s)
if err != nil {
cclog.Error("Error wile loading job data for archiving")
return nil, err
@@ -94,12 +93,5 @@ func ArchiveJob(job *schema.Job, ctx context.Context) (*schema.Job, error) {
}
}
// If the file based archive is disabled,
// only return the JobMeta structure as the
// statistics in there are needed.
if config.Keys.DisableArchive {
return job, nil
}
return job, archive.GetHandle().ImportJob(job, &jobData)
}

View File

@@ -25,9 +25,9 @@ import (
"github.com/ClusterCockpit/cc-backend/internal/config"
"github.com/ClusterCockpit/cc-backend/internal/repository"
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
"github.com/ClusterCockpit/cc-lib/schema"
"github.com/ClusterCockpit/cc-lib/util"
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
"github.com/ClusterCockpit/cc-lib/v2/schema"
"github.com/ClusterCockpit/cc-lib/v2/util"
"github.com/gorilla/sessions"
)
@@ -40,7 +40,7 @@ type Authenticator interface {
// authenticator should attempt the login. This method should not perform
// expensive operations or actual authentication.
CanLogin(user *schema.User, username string, rw http.ResponseWriter, r *http.Request) (*schema.User, bool)
// Login performs the actual authentication for the user.
// It returns the authenticated user or an error if authentication fails.
// The user parameter may be nil if the user doesn't exist in the database yet.
@@ -65,13 +65,13 @@ var ipUserLimiters sync.Map
func getIPUserLimiter(ip, username string) *rate.Limiter {
key := ip + ":" + username
now := time.Now()
if entry, ok := ipUserLimiters.Load(key); ok {
rle := entry.(*rateLimiterEntry)
rle.lastUsed = now
return rle.limiter
}
// More aggressive rate limiting: 5 attempts per 15 minutes
newLimiter := rate.NewLimiter(rate.Every(15*time.Minute/5), 5)
ipUserLimiters.Store(key, &rateLimiterEntry{
@@ -176,7 +176,7 @@ func (auth *Authentication) AuthViaSession(
func Init(authCfg *json.RawMessage) {
initOnce.Do(func() {
authInstance = &Authentication{}
// Start background cleanup of rate limiters
startRateLimiterCleanup()
@@ -263,7 +263,7 @@ func GetAuthInstance() *Authentication {
}
// handleUserSync syncs or updates a user in the database based on configuration.
// This is used for both JWT and OIDC authentication when syncUserOnLogin or updateUserOnLogin is enabled.
// This is used for LDAP, JWT, and OIDC authentication when syncUserOnLogin or updateUserOnLogin is enabled.
func handleUserSync(user *schema.User, syncUserOnLogin, updateUserOnLogin bool) {
r := repository.GetUserRepository()
dbUser, err := r.GetUser(user.Username)
@@ -272,7 +272,7 @@ func handleUserSync(user *schema.User, syncUserOnLogin, updateUserOnLogin bool)
cclog.Errorf("Error while loading user '%s': %v", user.Username, err)
return
}
if err == sql.ErrNoRows && syncUserOnLogin { // Add new user
if err := r.AddUser(user); err != nil {
cclog.Errorf("Error while adding user '%s' to DB: %v", user.Username, err)
@@ -294,6 +294,11 @@ func handleOIDCUser(OIDCUser *schema.User) {
handleUserSync(OIDCUser, Keys.OpenIDConfig.SyncUserOnLogin, Keys.OpenIDConfig.UpdateUserOnLogin)
}
// handleLdapUser syncs LDAP user with database
func handleLdapUser(ldapUser *schema.User) {
handleUserSync(ldapUser, Keys.LdapConfig.SyncUserOnLogin, Keys.LdapConfig.UpdateUserOnLogin)
}
func (auth *Authentication) SaveSession(rw http.ResponseWriter, r *http.Request, user *schema.User) error {
session, err := auth.sessionStore.New(r, "session")
if err != nil {
@@ -305,8 +310,13 @@ func (auth *Authentication) SaveSession(rw http.ResponseWriter, r *http.Request,
if auth.SessionMaxAge != 0 {
session.Options.MaxAge = int(auth.SessionMaxAge.Seconds())
}
if config.Keys.HTTPSCertFile == "" && config.Keys.HTTPSKeyFile == "" {
cclog.Warn("HTTPS not configured - session cookies will not have Secure flag set (insecure for production)")
if r.TLS == nil && r.Header.Get("X-Forwarded-Proto") != "https" {
// If neither TLS nor an encrypted reverse proxy is used, do not mark cookies as Secure.
cclog.Warn("Authenticating with unencrypted request. Session cookies will not have Secure flag set (insecure for production)")
if r.Header.Get("X-Forwarded-Proto") == "" {
// This warning will not be printed if e.g. X-Forwarded-Proto == http
cclog.Warn("If you are using a reverse proxy, make sure X-Forwarded-Proto is set")
}
session.Options.Secure = false
}
session.Options.SameSite = http.SameSiteStrictMode
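For context, the secure-cookie decision above can be restated as a small predicate. This is an illustrative sketch, not code from the diff; it assumes the reverse proxy is trusted to set X-Forwarded-Proto honestly:

// requestIsHTTPS reports whether the original client connection was encrypted,
// either by TLS terminated in this process or by a reverse proxy in front of it.
func requestIsHTTPS(r *http.Request) bool {
	if r.TLS != nil {
		return true // TLS terminated here
	}
	// TLS terminated by a reverse proxy that forwards the original scheme.
	return r.Header.Get("X-Forwarded-Proto") == "https"
}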
@@ -438,13 +448,13 @@ func (auth *Authentication) AuthAPI(
if user != nil {
switch {
case len(user.Roles) == 1:
if user.HasRole(schema.RoleApi) {
if user.HasRole(schema.RoleAPI) {
ctx := context.WithValue(r.Context(), repository.ContextUserKey, user)
onsuccess.ServeHTTP(rw, r.WithContext(ctx))
return
}
case len(user.Roles) >= 2:
if user.HasAllRoles([]schema.Role{schema.RoleAdmin, schema.RoleApi}) {
if user.HasAllRoles([]schema.Role{schema.RoleAdmin, schema.RoleAPI}) {
ctx := context.WithValue(r.Context(), repository.ContextUserKey, user)
onsuccess.ServeHTTP(rw, r.WithContext(ctx))
return
@@ -474,13 +484,13 @@ func (auth *Authentication) AuthUserAPI(
if user != nil {
switch {
case len(user.Roles) == 1:
if user.HasRole(schema.RoleApi) {
if user.HasRole(schema.RoleAPI) {
ctx := context.WithValue(r.Context(), repository.ContextUserKey, user)
onsuccess.ServeHTTP(rw, r.WithContext(ctx))
return
}
case len(user.Roles) >= 2:
if user.HasRole(schema.RoleApi) && user.HasAnyRole([]schema.Role{schema.RoleUser, schema.RoleManager, schema.RoleSupport, schema.RoleAdmin}) {
if user.HasRole(schema.RoleAPI) && user.HasAnyRole([]schema.Role{schema.RoleUser, schema.RoleManager, schema.RoleSupport, schema.RoleAdmin}) {
ctx := context.WithValue(r.Context(), repository.ContextUserKey, user)
onsuccess.ServeHTTP(rw, r.WithContext(ctx))
return
@@ -510,13 +520,13 @@ func (auth *Authentication) AuthMetricStoreAPI(
if user != nil {
switch {
case len(user.Roles) == 1:
if user.HasRole(schema.RoleApi) {
if user.HasRole(schema.RoleAPI) {
ctx := context.WithValue(r.Context(), repository.ContextUserKey, user)
onsuccess.ServeHTTP(rw, r.WithContext(ctx))
return
}
case len(user.Roles) >= 2:
if user.HasRole(schema.RoleApi) && user.HasAnyRole([]schema.Role{schema.RoleUser, schema.RoleManager, schema.RoleAdmin}) {
if user.HasRole(schema.RoleAPI) && user.HasAnyRole([]schema.Role{schema.RoleUser, schema.RoleManager, schema.RoleAdmin}) {
ctx := context.WithValue(r.Context(), repository.ContextUserKey, user)
onsuccess.ServeHTTP(rw, r.WithContext(ctx))
return
@@ -616,9 +626,9 @@ func securedCheck(user *schema.User, r *http.Request) error {
}
// If SplitHostPort fails, IPAddress is already just a host (no port)
// If nothing declared in config: deny all request to this api endpoint
// If nothing declared in config: Continue
if len(config.Keys.APIAllowedIPs) == 0 {
return fmt.Errorf("missing configuration key ApiAllowedIPs")
return nil
}
// If wildcard declared in config: Continue
if config.Keys.APIAllowedIPs[0] == "*" {
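The allowlist semantics after this change, restated as a hedged sketch (the real securedCheck continues beyond the truncated hunk; the helper name here is illustrative only):

// ipAllowed mirrors the checks above: strip an optional ":port", then allow
// on an empty list, on the "*" wildcard, or on an exact allowlist match.
func ipAllowed(remoteAddr string, allowed []string) bool {
	host := remoteAddr
	if h, _, err := net.SplitHostPort(remoteAddr); err == nil {
		host = h
	}
	if len(allowed) == 0 || allowed[0] == "*" {
		return true
	}
	return slices.Contains(allowed, host)
}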

View File

@@ -15,25 +15,25 @@ import (
func TestGetIPUserLimiter(t *testing.T) {
ip := "192.168.1.1"
username := "testuser"
// Get limiter for the first time
limiter1 := getIPUserLimiter(ip, username)
if limiter1 == nil {
t.Fatal("Expected limiter to be created")
}
// Get the same limiter again
limiter2 := getIPUserLimiter(ip, username)
if limiter1 != limiter2 {
t.Error("Expected to get the same limiter instance")
}
// Get a different limiter for different user
limiter3 := getIPUserLimiter(ip, "otheruser")
if limiter1 == limiter3 {
t.Error("Expected different limiter for different user")
}
// Get a different limiter for different IP
limiter4 := getIPUserLimiter("192.168.1.2", username)
if limiter1 == limiter4 {
@@ -45,16 +45,16 @@ func TestGetIPUserLimiter(t *testing.T) {
func TestRateLimiterBehavior(t *testing.T) {
ip := "10.0.0.1"
username := "ratelimituser"
limiter := getIPUserLimiter(ip, username)
// Should allow first 5 attempts
for i := 0; i < 5; i++ {
for i := range 5 {
if !limiter.Allow() {
t.Errorf("Request %d should be allowed within rate limit", i+1)
}
}
// 6th attempt should be blocked
if limiter.Allow() {
t.Error("Request 6 should be blocked by rate limiter")
@@ -65,19 +65,19 @@ func TestRateLimiterBehavior(t *testing.T) {
func TestCleanupOldRateLimiters(t *testing.T) {
// Clear all existing limiters first to avoid interference from other tests
cleanupOldRateLimiters(time.Now().Add(24 * time.Hour))
// Create some new rate limiters
limiter1 := getIPUserLimiter("1.1.1.1", "user1")
limiter2 := getIPUserLimiter("2.2.2.2", "user2")
if limiter1 == nil || limiter2 == nil {
t.Fatal("Failed to create test limiters")
}
// Clean up with a cutoff one second in the past (should keep both)
time.Sleep(10 * time.Millisecond) // Small delay to ensure timestamp difference
cleanupOldRateLimiters(time.Now().Add(-1 * time.Second))
// Verify they still exist (should get same instance)
if getIPUserLimiter("1.1.1.1", "user1") != limiter1 {
t.Error("Limiter 1 was incorrectly cleaned up")
@@ -85,10 +85,10 @@ func TestCleanupOldRateLimiters(t *testing.T) {
if getIPUserLimiter("2.2.2.2", "user2") != limiter2 {
t.Error("Limiter 2 was incorrectly cleaned up")
}
// Clean up with a cutoff two hours in the future (should remove both, since they were last used just now)
cleanupOldRateLimiters(time.Now().Add(2 * time.Hour))
// Getting them again should create new instances
newLimiter1 := getIPUserLimiter("1.1.1.1", "user1")
if newLimiter1 == limiter1 {
@@ -107,14 +107,14 @@ func TestIPv4Extraction(t *testing.T) {
{"IPv4 without port", "192.168.1.1", "192.168.1.1"},
{"Localhost with port", "127.0.0.1:3000", "127.0.0.1"},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
result := tt.input
if host, _, err := net.SplitHostPort(result); err == nil {
result = host
}
if result != tt.expected {
t.Errorf("Expected %s, got %s", tt.expected, result)
}
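These tests lean on net.SplitHostPort returning an error for inputs without a port, in which case the raw input is kept as-is. A quick illustration of that fallback:

for _, in := range []string{"192.168.1.1:8080", "192.168.1.1", "[::1]:8080", "::1"} {
	host := in
	if h, _, err := net.SplitHostPort(in); err == nil {
		host = h // port present: keep only the host part
	}
	fmt.Println(in, "->", host) // -> 192.168.1.1, 192.168.1.1, ::1, ::1
}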
@@ -122,7 +122,7 @@ func TestIPv4Extraction(t *testing.T) {
}
}
// TestIPv6Extraction tests extracting IPv6 addresses
func TestIPv6Extraction(t *testing.T) {
tests := []struct {
name string
@@ -134,14 +134,14 @@ func TestIPv6Extraction(t *testing.T) {
{"IPv6 without port", "2001:db8::1", "2001:db8::1"},
{"IPv6 localhost", "::1", "::1"},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
result := tt.input
if host, _, err := net.SplitHostPort(result); err == nil {
result = host
}
if result != tt.expected {
t.Errorf("Expected %s, got %s", tt.expected, result)
}
@@ -160,14 +160,14 @@ func TestIPExtractionEdgeCases(t *testing.T) {
{"Empty string", "", ""},
{"Just port", ":8080", ""},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
result := tt.input
if host, _, err := net.SplitHostPort(result); err == nil {
result = host
}
if result != tt.expected {
t.Errorf("Expected %s, got %s", tt.expected, result)
}

View File

@@ -14,8 +14,8 @@ import (
"strings"
"time"
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
"github.com/ClusterCockpit/cc-lib/schema"
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
"github.com/ClusterCockpit/cc-lib/v2/schema"
"github.com/golang-jwt/jwt/v5"
)
@@ -25,20 +25,20 @@ type JWTAuthConfig struct {
MaxAge string `json:"max-age"`
// Specifies which cookie should be checked for a JWT token (if no authorization header is present)
CookieName string `json:"cookieName"`
CookieName string `json:"cookie-name"`
// Deny login for users not in database (but defined in JWT).
// Ignore user roles defined in JWTs ('roles' claim), get them from db.
ValidateUser bool `json:"validateUser"`
ValidateUser bool `json:"validate-user"`
// Specifies which issuer should be accepted when validating external JWTs ('iss' claim)
TrustedIssuer string `json:"trustedIssuer"`
TrustedIssuer string `json:"trusted-issuer"`
// Should a non-existent user be added to the DB based on the information in the token
SyncUserOnLogin bool `json:"syncUserOnLogin"`
SyncUserOnLogin bool `json:"sync-user-on-login"`
// Should an existing user be updated in the DB based on the information in the token
UpdateUserOnLogin bool `json:"updateUserOnLogin"`
UpdateUserOnLogin bool `json:"update-user-on-login"`
}
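With the tags renamed, a config fragment for this struct uses kebab-case keys throughout. The values below are placeholders for illustration, not defaults from the code:

"jwts": {
  "max-age": "2h",
  "cookie-name": "token",
  "validate-user": false,
  "trusted-issuer": "https://idp.example.com",
  "sync-user-on-login": true,
  "update-user-on-login": true
}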
type JWTAuthenticator struct {
@@ -101,20 +101,20 @@ func (ja *JWTAuthenticator) AuthViaJWT(
// Token is valid, extract payload
claims := token.Claims.(jwt.MapClaims)
// Use shared helper to get user from JWT claims
var user *schema.User
user, err = getUserFromJWT(claims, Keys.JwtConfig.ValidateUser, schema.AuthToken, -1)
if err != nil {
return nil, err
}
// If not validating user, we only get roles from JWT (no projects for this auth method)
if !Keys.JwtConfig.ValidateUser {
user.Roles = extractRolesFromClaims(claims, false)
user.Projects = nil // Standard JWT auth doesn't include projects
}
return user, nil
}

View File

@@ -12,8 +12,8 @@ import (
"net/http"
"os"
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
"github.com/ClusterCockpit/cc-lib/schema"
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
"github.com/ClusterCockpit/cc-lib/v2/schema"
"github.com/golang-jwt/jwt/v5"
)
@@ -146,13 +146,13 @@ func (ja *JWTCookieSessionAuthenticator) Login(
}
claims := token.Claims.(jwt.MapClaims)
// Use shared helper to get user from JWT claims
user, err = getUserFromJWT(claims, jc.ValidateUser, schema.AuthSession, schema.AuthViaToken)
if err != nil {
return nil, err
}
// Sync or update user if configured
if !jc.ValidateUser && (jc.SyncUserOnLogin || jc.UpdateUserOnLogin) {
handleTokenUser(user)

View File

@@ -9,10 +9,11 @@ import (
"database/sql"
"errors"
"fmt"
"strings"
"github.com/ClusterCockpit/cc-backend/internal/repository"
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
"github.com/ClusterCockpit/cc-lib/schema"
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
"github.com/ClusterCockpit/cc-lib/v2/schema"
"github.com/golang-jwt/jwt/v5"
)
@@ -28,7 +29,7 @@ func extractStringFromClaims(claims jwt.MapClaims, key string) string {
// If validateRoles is true, only valid roles are returned
func extractRolesFromClaims(claims jwt.MapClaims, validateRoles bool) []string {
var roles []string
if rawroles, ok := claims["roles"].([]any); ok {
for _, rr := range rawroles {
if r, ok := rr.(string); ok {
@@ -42,14 +43,14 @@ func extractRolesFromClaims(claims jwt.MapClaims, validateRoles bool) []string {
}
}
}
return roles
}
// extractProjectsFromClaims extracts projects from JWT claims
func extractProjectsFromClaims(claims jwt.MapClaims) []string {
projects := make([]string, 0)
if rawprojs, ok := claims["projects"].([]any); ok {
for _, pp := range rawprojs {
if p, ok := pp.(string); ok {
@@ -61,7 +62,7 @@ func extractProjectsFromClaims(claims jwt.MapClaims) []string {
projects = append(projects, projSlice...)
}
}
return projects
}
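Both helpers expect the claim values as []any of strings, which is what encoding/json produces for JSON arrays. An illustrative input/output pair, mirroring the claim shapes used in the tests further down:

claims := jwt.MapClaims{
	"roles":    []any{"user", "admin"},
	"projects": []any{"project1"},
}
roles := extractRolesFromClaims(claims, true) // -> ["user", "admin"]; invalid roles dropped
projects := extractProjectsFromClaims(claims) // -> ["project1"]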
@@ -72,22 +73,23 @@ func extractNameFromClaims(claims jwt.MapClaims) string {
if name, ok := claims["name"].(string); ok {
return name
}
// Try nested structure: {name: {values: [...]}}
if wrap, ok := claims["name"].(map[string]any); ok {
if vals, ok := wrap["values"].([]any); ok {
if len(vals) == 0 {
return ""
}
name := fmt.Sprintf("%v", vals[0])
var name strings.Builder
name.WriteString(fmt.Sprintf("%v", vals[0]))
for i := 1; i < len(vals); i++ {
name += fmt.Sprintf(" %v", vals[i])
name.WriteString(fmt.Sprintf(" %v", vals[i]))
}
return name
return name.String()
}
}
return ""
}
@@ -100,7 +102,7 @@ func getUserFromJWT(claims jwt.MapClaims, validateUser bool, authType schema.Aut
if sub == "" {
return nil, errors.New("missing 'sub' claim in JWT")
}
if validateUser {
// Validate user against database
ur := repository.GetUserRepository()
@@ -109,22 +111,22 @@ func getUserFromJWT(claims jwt.MapClaims, validateUser bool, authType schema.Aut
cclog.Errorf("Error while loading user '%v': %v", sub, err)
return nil, fmt.Errorf("database error: %w", err)
}
// Deny any logins for unknown usernames
if user == nil || err == sql.ErrNoRows {
cclog.Warn("Could not find user from JWT in internal database.")
return nil, errors.New("unknown user")
}
// Return database user (with database roles)
return user, nil
}
// Create user from JWT claims
name := extractNameFromClaims(claims)
roles := extractRolesFromClaims(claims, true) // Validate roles
projects := extractProjectsFromClaims(claims)
return &schema.User{
Username: sub,
Name: name,

View File

@@ -8,7 +8,7 @@ package auth
import (
"testing"
"github.com/ClusterCockpit/cc-lib/schema"
"github.com/ClusterCockpit/cc-lib/v2/schema"
"github.com/golang-jwt/jwt/v5"
)
@@ -19,7 +19,7 @@ func TestExtractStringFromClaims(t *testing.T) {
"email": "test@example.com",
"age": 25, // not a string
}
tests := []struct {
name string
key string
@@ -30,7 +30,7 @@ func TestExtractStringFromClaims(t *testing.T) {
{"Non-existent key", "missing", ""},
{"Non-string value", "age", ""},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
result := extractStringFromClaims(claims, tt.key)
@@ -88,16 +88,16 @@ func TestExtractRolesFromClaims(t *testing.T) {
expected: []string{},
},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
result := extractRolesFromClaims(tt.claims, tt.validateRoles)
if len(result) != len(tt.expected) {
t.Errorf("Expected %d roles, got %d", len(tt.expected), len(result))
return
}
for i, role := range result {
if i >= len(tt.expected) || role != tt.expected[i] {
t.Errorf("Expected role %s at position %d, got %s", tt.expected[i], i, role)
@@ -141,16 +141,16 @@ func TestExtractProjectsFromClaims(t *testing.T) {
expected: []string{"project1", "project2"}, // Should skip non-strings
},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
result := extractProjectsFromClaims(tt.claims)
if len(result) != len(tt.expected) {
t.Errorf("Expected %d projects, got %d", len(tt.expected), len(result))
return
}
for i, project := range result {
if i >= len(tt.expected) || project != tt.expected[i] {
t.Errorf("Expected project %s at position %d, got %s", tt.expected[i], i, project)
@@ -216,7 +216,7 @@ func TestExtractNameFromClaims(t *testing.T) {
expected: "123 Smith", // Should convert to string
},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
result := extractNameFromClaims(tt.claims)
@@ -235,29 +235,28 @@ func TestGetUserFromJWT_NoValidation(t *testing.T) {
"roles": []any{"user", "admin"},
"projects": []any{"project1", "project2"},
}
user, err := getUserFromJWT(claims, false, schema.AuthToken, -1)
if err != nil {
t.Fatalf("Unexpected error: %v", err)
}
if user.Username != "testuser" {
t.Errorf("Expected username 'testuser', got '%s'", user.Username)
}
if user.Name != "Test User" {
t.Errorf("Expected name 'Test User', got '%s'", user.Name)
}
if len(user.Roles) != 2 {
t.Errorf("Expected 2 roles, got %d", len(user.Roles))
}
if len(user.Projects) != 2 {
t.Errorf("Expected 2 projects, got %d", len(user.Projects))
}
if user.AuthType != schema.AuthToken {
t.Errorf("Expected AuthType %v, got %v", schema.AuthToken, user.AuthType)
}
@@ -268,13 +267,13 @@ func TestGetUserFromJWT_MissingSub(t *testing.T) {
claims := jwt.MapClaims{
"name": "Test User",
}
_, err := getUserFromJWT(claims, false, schema.AuthToken, -1)
if err == nil {
t.Error("Expected error for missing sub claim")
}
if err.Error() != "missing 'sub' claim in JWT" {
t.Errorf("Expected specific error message, got: %v", err)
}

View File

@@ -13,8 +13,8 @@ import (
"os"
"strings"
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
"github.com/ClusterCockpit/cc-lib/schema"
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
"github.com/ClusterCockpit/cc-lib/v2/schema"
"github.com/golang-jwt/jwt/v5"
)
@@ -75,13 +75,13 @@ func (ja *JWTSessionAuthenticator) Login(
}
claims := token.Claims.(jwt.MapClaims)
// Use shared helper to get user from JWT claims
user, err = getUserFromJWT(claims, Keys.JwtConfig.ValidateUser, schema.AuthSession, schema.AuthViaToken)
if err != nil {
return nil, err
}
// Sync or update user if configured
if !Keys.JwtConfig.ValidateUser && (Keys.JwtConfig.SyncUserOnLogin || Keys.JwtConfig.UpdateUserOnLogin) {
handleTokenUser(user)

View File

@@ -6,35 +6,39 @@
package auth
import (
"errors"
"fmt"
"net"
"net/http"
"os"
"strings"
"time"
"github.com/ClusterCockpit/cc-backend/internal/repository"
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
"github.com/ClusterCockpit/cc-lib/schema"
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
"github.com/ClusterCockpit/cc-lib/v2/schema"
"github.com/go-ldap/ldap/v3"
)
type LdapConfig struct {
URL string `json:"url"`
UserBase string `json:"user_base"`
SearchDN string `json:"search_dn"`
UserBind string `json:"user_bind"`
UserFilter string `json:"user_filter"`
UserAttr string `json:"username_attr"`
SyncInterval string `json:"sync_interval"` // Parsed using time.ParseDuration.
SyncDelOldUsers bool `json:"sync_del_old_users"`
UserBase string `json:"user-base"`
SearchDN string `json:"search-dn"`
UserBind string `json:"user-bind"`
UserFilter string `json:"user-filter"`
UserAttr string `json:"username-attr"`
UIDAttr string `json:"uid-attr"`
SyncInterval string `json:"sync-interval"` // Parsed using time.ParseDuration.
SyncDelOldUsers bool `json:"sync-del-old-users"`
// Should an non-existent user be added to the DB if user exists in ldap directory
SyncUserOnLogin bool `json:"syncUserOnLogin"`
// Should a non-existent user be added to the DB if the user exists in the LDAP directory
SyncUserOnLogin bool `json:"sync-user-on-login"`
UpdateUserOnLogin bool `json:"update-user-on-login"`
}
type LdapAuthenticator struct {
syncPassword string
UserAttr string
UIDAttr string
}
var _ Authenticator = (*LdapAuthenticator)(nil)
@@ -51,6 +55,12 @@ func (la *LdapAuthenticator) Init() error {
la.UserAttr = "gecos"
}
if Keys.LdapConfig.UIDAttr != "" {
la.UIDAttr = Keys.LdapConfig.UIDAttr
} else {
la.UIDAttr = "uid"
}
return nil
}
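A matching LDAP config fragment with the renamed keys; the values are placeholders, and uid-attr may be omitted to get the "uid" fallback from Init above:

"ldap": {
  "url": "ldaps://ldap.example.com",
  "user-base": "ou=people,dc=example,dc=com",
  "search-dn": "cn=admin,dc=example,dc=com",
  "user-bind": "uid={username},ou=people,dc=example,dc=com",
  "user-filter": "(objectClass=posixAccount)",
  "uid-attr": "uid",
  "sync-interval": "24h",
  "sync-user-on-login": true
}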
@@ -66,55 +76,44 @@ func (la *LdapAuthenticator) CanLogin(
if user.AuthSource == schema.AuthViaLDAP {
return user, true
}
} else {
if lc.SyncUserOnLogin {
l, err := la.getLdapConnection(true)
if err != nil {
cclog.Error("LDAP connection error")
return nil, false
}
defer l.Close()
// Search for the given username
searchRequest := ldap.NewSearchRequest(
lc.UserBase,
ldap.ScopeWholeSubtree, ldap.NeverDerefAliases, 0, 0, false,
fmt.Sprintf("(&%s(uid=%s))", lc.UserFilter, username),
[]string{"dn", "uid", la.UserAttr}, nil)
sr, err := l.Search(searchRequest)
if err != nil {
cclog.Warn(err)
return nil, false
}
if len(sr.Entries) != 1 {
cclog.Warn("LDAP: User does not exist or too many entries returned")
return nil, false
}
entry := sr.Entries[0]
name := entry.GetAttributeValue(la.UserAttr)
var roles []string
roles = append(roles, schema.GetRoleString(schema.RoleUser))
projects := make([]string, 0)
user = &schema.User{
Username: username,
Name: name,
Roles: roles,
Projects: projects,
AuthType: schema.AuthSession,
AuthSource: schema.AuthViaLDAP,
}
if err := repository.GetUserRepository().AddUser(user); err != nil {
cclog.Errorf("User '%s' LDAP: Insert into DB failed", username)
return nil, false
}
return user, true
} else if lc.SyncUserOnLogin {
l, err := la.getLdapConnection(true)
if err != nil {
cclog.Error("LDAP connection error")
return nil, false
}
defer l.Close()
// Search for the given username
searchRequest := ldap.NewSearchRequest(
lc.UserBase,
ldap.ScopeWholeSubtree, ldap.NeverDerefAliases, 0, 0, false,
fmt.Sprintf("(&%s(%s=%s))", lc.UserFilter, la.UIDAttr, ldap.EscapeFilter(username)),
[]string{"dn", la.UIDAttr, la.UserAttr}, nil)
sr, err := l.Search(searchRequest)
if err != nil {
cclog.Warn(err)
return nil, false
}
if len(sr.Entries) != 1 {
cclog.Warn("LDAP: User does not exist or too many entries returned")
return nil, false
}
entry := sr.Entries[0]
user = &schema.User{
Username: username,
Name: entry.GetAttributeValue(la.UserAttr),
Roles: []string{schema.GetRoleString(schema.RoleUser)},
Projects: make([]string, 0),
AuthType: schema.AuthSession,
AuthSource: schema.AuthViaLDAP,
}
handleLdapUser(user)
return user, true
}
return nil, false
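The new ldap.EscapeFilter call in the search above closes an LDAP filter injection hole. A sketch of what it prevents (outputs shown in comments):

raw := "alice)(uid=*"
fmt.Printf("(&(objectClass=person)(uid=%s))\n", raw)
// (&(objectClass=person)(uid=alice)(uid=*))       <- attacker changed the filter structure
fmt.Printf("(&(objectClass=person)(uid=%s))\n", ldap.EscapeFilter(raw))
// (&(objectClass=person)(uid=alice\29\28uid=\2a)) <- metacharacters neutralized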
@@ -132,7 +131,7 @@ func (la *LdapAuthenticator) Login(
}
defer l.Close()
userDn := strings.ReplaceAll(Keys.LdapConfig.UserBind, "{username}", user.Username)
userDn := strings.ReplaceAll(Keys.LdapConfig.UserBind, "{username}", ldap.EscapeDN(user.Username))
if err := l.Bind(userDn, r.FormValue("password")); err != nil {
cclog.Errorf("AUTH/LDAP > Authentication for user %s failed: %v",
user.Username, err)
@@ -170,7 +169,7 @@ func (la *LdapAuthenticator) Sync() error {
lc.UserBase,
ldap.ScopeWholeSubtree, ldap.NeverDerefAliases, 0, 0, false,
lc.UserFilter,
[]string{"dn", "uid", la.UserAttr}, nil))
[]string{"dn", la.UIDAttr, la.UserAttr}, nil))
if err != nil {
cclog.Warn("LDAP search error")
return err
@@ -178,9 +177,9 @@ func (la *LdapAuthenticator) Sync() error {
newnames := map[string]string{}
for _, entry := range ldapResults.Entries {
username := entry.GetAttributeValue("uid")
username := entry.GetAttributeValue(la.UIDAttr)
if username == "" {
return errors.New("no attribute 'uid'")
return fmt.Errorf("no attribute '%s'", la.UIDAttr)
}
_, ok := users[username]
@@ -194,20 +193,19 @@ func (la *LdapAuthenticator) Sync() error {
for username, where := range users {
if where == InDB && lc.SyncDelOldUsers {
ur.DelUser(username)
if err := ur.DelUser(username); err != nil {
cclog.Errorf("User '%s' LDAP: Delete from DB failed: %v", username, err)
return err
}
cclog.Debugf("sync: remove %v (does not show up in LDAP anymore)", username)
} else if where == InLdap {
name := newnames[username]
var roles []string
roles = append(roles, schema.GetRoleString(schema.RoleUser))
projects := make([]string, 0)
user := &schema.User{
Username: username,
Name: name,
Roles: roles,
Projects: projects,
Roles: []string{schema.GetRoleString(schema.RoleUser)},
Projects: make([]string, 0),
AuthSource: schema.AuthViaLDAP,
}
@@ -224,11 +222,13 @@ func (la *LdapAuthenticator) Sync() error {
func (la *LdapAuthenticator) getLdapConnection(admin bool) (*ldap.Conn, error) {
lc := Keys.LdapConfig
conn, err := ldap.DialURL(lc.URL)
conn, err := ldap.DialURL(lc.URL,
ldap.DialWithDialer(&net.Dialer{Timeout: 10 * time.Second}))
if err != nil {
cclog.Warn("LDAP URL dial failed")
return nil, err
}
conn.SetTimeout(30 * time.Second)
if admin {
if err := conn.Bind(lc.SearchDN, la.syncPassword); err != nil {

View File

@@ -9,8 +9,8 @@ import (
"fmt"
"net/http"
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
"github.com/ClusterCockpit/cc-lib/schema"
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
"github.com/ClusterCockpit/cc-lib/v2/schema"
"golang.org/x/crypto/bcrypt"
)

View File

@@ -9,23 +9,24 @@ import (
"context"
"crypto/rand"
"encoding/base64"
"fmt"
"io"
"net/http"
"os"
"time"
"github.com/ClusterCockpit/cc-backend/internal/repository"
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
"github.com/ClusterCockpit/cc-lib/schema"
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
"github.com/ClusterCockpit/cc-lib/v2/schema"
"github.com/coreos/go-oidc/v3/oidc"
"github.com/gorilla/mux"
"github.com/go-chi/chi/v5"
"golang.org/x/oauth2"
)
type OpenIDConfig struct {
Provider string `json:"provider"`
SyncUserOnLogin bool `json:"syncUserOnLogin"`
UpdateUserOnLogin bool `json:"updateUserOnLogin"`
SyncUserOnLogin bool `json:"sync-user-on-login"`
UpdateUserOnLogin bool `json:"update-user-on-login"`
}
type OIDC struct {
@@ -50,6 +51,7 @@ func setCallbackCookie(w http.ResponseWriter, r *http.Request, name, value strin
MaxAge: int(time.Hour.Seconds()),
Secure: r.TLS != nil,
HttpOnly: true,
SameSite: http.SameSiteLaxMode,
}
http.SetCookie(w, c)
}
@@ -59,7 +61,7 @@ func NewOIDC(a *Authentication) *OIDC {
// Use context with timeout for provider initialization
ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
defer cancel()
provider, err := oidc.NewProvider(ctx, Keys.OpenIDConfig.Provider)
if err != nil {
cclog.Fatal(err)
@@ -77,8 +79,7 @@ func NewOIDC(a *Authentication) *OIDC {
ClientID: clientID,
ClientSecret: clientSecret,
Endpoint: provider.Endpoint(),
RedirectURL: "oidc-callback",
Scopes: []string{oidc.ScopeOpenID, "profile", "email"},
Scopes: []string{oidc.ScopeOpenID, "profile"},
}
oa := &OIDC{provider: provider, client: client, clientID: clientID, authentication: a}
@@ -86,7 +87,7 @@ func NewOIDC(a *Authentication) *OIDC {
return oa
}
func (oa *OIDC) RegisterEndpoints(r *mux.Router) {
func (oa *OIDC) RegisterEndpoints(r chi.Router) {
r.HandleFunc("/oidc-login", oa.OAuth2Login)
r.HandleFunc("/oidc-callback", oa.OAuth2Callback)
}
@@ -119,57 +120,96 @@ func (oa *OIDC) OAuth2Callback(rw http.ResponseWriter, r *http.Request) {
// Exchange authorization code for token with timeout
ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
defer cancel()
token, err := oa.client.Exchange(ctx, code, oauth2.VerifierOption(codeVerifier))
if err != nil {
http.Error(rw, "Failed to exchange token: "+err.Error(), http.StatusInternalServerError)
cclog.Errorf("token exchange failed: %s", err.Error())
http.Error(rw, "Authentication failed during token exchange", http.StatusInternalServerError)
return
}
// Get user info from OIDC provider with same timeout
userInfo, err := oa.provider.UserInfo(ctx, oauth2.StaticTokenSource(token))
if err != nil {
http.Error(rw, "Failed to get userinfo: "+err.Error(), http.StatusInternalServerError)
cclog.Errorf("failed to get userinfo: %s", err.Error())
http.Error(rw, "Failed to retrieve user information", http.StatusInternalServerError)
return
}
// // Extract the ID Token from OAuth2 token.
// rawIDToken, ok := token.Extra("id_token").(string)
// if !ok {
// http.Error(rw, "Cannot access idToken", http.StatusInternalServerError)
// }
//
// verifier := oa.provider.Verifier(&oidc.Config{ClientID: oa.clientID})
// // Parse and verify ID Token payload.
// idToken, err := verifier.Verify(context.Background(), rawIDToken)
// if err != nil {
// http.Error(rw, "Failed to extract idToken: "+err.Error(), http.StatusInternalServerError)
// }
// Verify ID token and nonce to prevent replay attacks
rawIDToken, ok := token.Extra("id_token").(string)
if !ok {
http.Error(rw, "ID token not found in response", http.StatusInternalServerError)
return
}
nonceCookie, err := r.Cookie("nonce")
if err != nil {
http.Error(rw, "nonce cookie not found", http.StatusBadRequest)
return
}
verifier := oa.provider.Verifier(&oidc.Config{ClientID: oa.clientID})
idToken, err := verifier.Verify(ctx, rawIDToken)
if err != nil {
cclog.Errorf("ID token verification failed: %s", err.Error())
http.Error(rw, "ID token verification failed", http.StatusInternalServerError)
return
}
if idToken.Nonce != nonceCookie.Value {
http.Error(rw, "Nonce mismatch", http.StatusBadRequest)
return
}
projects := make([]string, 0)
// Extract custom claims
// Extract custom claims from userinfo
var claims struct {
Username string `json:"preferred_username"`
Name string `json:"name"`
Profile struct {
// Keycloak realm-level roles
RealmAccess struct {
Roles []string `json:"roles"`
} `json:"realm_access"`
// Keycloak client-level roles
ResourceAccess struct {
Client struct {
Roles []string `json:"roles"`
} `json:"clustercockpit"`
} `json:"resource_access"`
}
if err := userInfo.Claims(&claims); err != nil {
http.Error(rw, "Failed to extract Claims: "+err.Error(), http.StatusInternalServerError)
cclog.Errorf("failed to extract claims: %s", err.Error())
http.Error(rw, "Failed to extract user claims", http.StatusInternalServerError)
return
}
if claims.Username == "" {
http.Error(rw, "Username claim missing from OIDC provider", http.StatusBadRequest)
return
}
// Merge roles from both client-level and realm-level access
oidcRoles := append(claims.ResourceAccess.Client.Roles, claims.RealmAccess.Roles...)
roleSet := make(map[string]bool)
for _, r := range oidcRoles {
switch r {
case "user":
roleSet[schema.GetRoleString(schema.RoleUser)] = true
case "admin":
roleSet[schema.GetRoleString(schema.RoleAdmin)] = true
case "manager":
roleSet[schema.GetRoleString(schema.RoleManager)] = true
case "support":
roleSet[schema.GetRoleString(schema.RoleSupport)] = true
}
}
var roles []string
for _, r := range claims.Profile.Client.Roles {
switch r {
case "user":
roles = append(roles, schema.GetRoleString(schema.RoleUser))
case "admin":
roles = append(roles, schema.GetRoleString(schema.RoleAdmin))
}
for role := range roleSet {
roles = append(roles, role)
}
if len(roles) == 0 {
@@ -188,8 +228,12 @@ func (oa *OIDC) OAuth2Callback(rw http.ResponseWriter, r *http.Request) {
handleOIDCUser(user)
}
oa.authentication.SaveSession(rw, r, user)
cclog.Infof("login successfull: user: %#v (roles: %v, projects: %v)", user.Username, user.Roles, user.Projects)
if err := oa.authentication.SaveSession(rw, r, user); err != nil {
cclog.Errorf("session save failed for user %q: %s", user.Username, err.Error())
http.Error(rw, "Failed to create session", http.StatusInternalServerError)
return
}
cclog.Infof("login successful: user: %#v (roles: %v, projects: %v)", user.Username, user.Roles, user.Projects)
userCtx := context.WithValue(r.Context(), repository.ContextUserKey, user)
http.RedirectHandler("/", http.StatusTemporaryRedirect).ServeHTTP(rw, r.WithContext(userCtx))
}
@@ -206,7 +250,24 @@ func (oa *OIDC) OAuth2Login(rw http.ResponseWriter, r *http.Request) {
codeVerifier := oauth2.GenerateVerifier()
setCallbackCookie(rw, r, "verifier", codeVerifier)
// Generate nonce for ID token replay protection
nonce, err := randString(16)
if err != nil {
http.Error(rw, "Internal error", http.StatusInternalServerError)
return
}
setCallbackCookie(rw, r, "nonce", nonce)
// Build redirect URL from the incoming request
scheme := "https"
if r.TLS == nil && r.Header.Get("X-Forwarded-Proto") != "https" {
scheme = "http"
}
oa.client.RedirectURL = fmt.Sprintf("%s://%s/oidc-callback", scheme, r.Host)
// Redirect user to consent page to ask for permission
url := oa.client.AuthCodeURL(state, oauth2.AccessTypeOffline, oauth2.S256ChallengeOption(codeVerifier))
url := oa.client.AuthCodeURL(state, oauth2.AccessTypeOffline,
oauth2.S256ChallengeOption(codeVerifier),
oidc.Nonce(nonce))
http.Redirect(rw, r, url, http.StatusFound)
}
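Taken together with the callback above, the login flow now carries two one-time secrets. A condensed, hedged sketch of the round trip using the same APIs as this file:

// 1. Login (this handler):
//      verifier := oauth2.GenerateVerifier()          // PKCE secret, stored in a cookie
//      nonce, _ := randString(16)                     // replay guard, stored in a cookie
//      url := client.AuthCodeURL(state,
//               oauth2.S256ChallengeOption(verifier), // only the S256 hash leaves the server
//               oidc.Nonce(nonce))                    // provider echoes nonce into the ID token
// 2. Callback (OAuth2Callback above):
//      token, _ := client.Exchange(ctx, code, oauth2.VerifierOption(verifier))
//      idToken, _ := provider.Verifier(&oidc.Config{ClientID: clientID}).Verify(ctx, rawIDToken)
//      // reject unless idToken.Nonce matches the nonce cookie value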

View File

@@ -15,37 +15,44 @@ var configSchema = `
"description": "Configure how long a token is valid. As string parsable by time.ParseDuration()",
"type": "string"
},
"cookieName": {
"cookie-name": {
"description": "Cookie that should be checked for a JWT token.",
"type": "string"
},
"validateUser": {
"validate-user": {
"description": "Deny login for users not in database (but defined in JWT). Overwrite roles in JWT with database roles.",
"type": "boolean"
},
"trustedIssuer": {
"trusted-issuer": {
"description": "Issuer that should be accepted when validating external JWTs ",
"type": "string"
},
"syncUserOnLogin": {
"sync-user-on-login": {
"description": "Add non-existent user to DB at login attempt with values provided in JWT.",
"type": "boolean"
},
"update-user-on-login": {
"description": "Should an existent user attributes in the DB be updated at login attempt with values provided in JWT.",
"type": "boolean"
}
},
"required": ["max-age"]
},
"oidc": {
"provider": {
"description": "",
"type": "string"
},
"syncUserOnLogin": {
"description": "",
"type": "boolean"
},
"updateUserOnLogin": {
"description": "",
"type": "boolean"
"type": "object",
"properties": {
"provider": {
"description": "OpenID Connect provider URL.",
"type": "string"
},
"sync-user-on-login": {
"description": "Add non-existent user to DB at login attempt with values provided.",
"type": "boolean"
},
"update-user-on-login": {
"description": "Should an existent user attributes in the DB be updated at login attempt with values provided.",
"type": "boolean"
}
},
"required": ["provider"]
},
@@ -57,40 +64,48 @@ var configSchema = `
"description": "URL of LDAP directory server.",
"type": "string"
},
"user_base": {
"user-base": {
"description": "Base DN of user tree root.",
"type": "string"
},
"search_dn": {
"search-dn": {
"description": "DN for authenticating LDAP admin account with general read rights.",
"type": "string"
},
"user_bind": {
"user-bind": {
"description": "Expression used to authenticate users via LDAP bind. Must contain uid={username}.",
"type": "string"
},
"user_filter": {
"user-filter": {
"description": "Filter to extract users for syncing.",
"type": "string"
},
"username_attr": {
"username-attr": {
"description": "Attribute with full username. Default: gecos",
"type": "string"
},
"sync_interval": {
"sync-interval": {
"description": "Interval used for syncing local user table with LDAP directory. Parsed using time.ParseDuration.",
"type": "string"
},
"sync_del_old_users": {
"sync-del-old-users": {
"description": "Delete obsolete users in database.",
"type": "boolean"
},
"syncUserOnLogin": {
"uid-attr": {
"description": "LDAP attribute used as login username. Default: uid",
"type": "string"
},
"sync-user-on-login": {
"description": "Add non-existent user to DB at login attempt if user exists in Ldap directory",
"type": "boolean"
},
"update-user-on-login": {
"description": "Should an existent user attributes in the DB be updated at login attempt with values from LDAP.",
"type": "boolean"
}
},
"required": ["url", "user_base", "search_dn", "user_bind", "user_filter"]
"required": ["url", "user-base", "search-dn", "user-bind", "user-filter"]
},
"required": ["jwts"]
}`
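An auth config fragment validated by this schema might look as follows (the provider URL and realm name are placeholders):

"auth": {
  "jwts": {
    "max-age": "2m"
  },
  "oidc": {
    "provider": "https://keycloak.example.com/realms/clustercockpit",
    "sync-user-on-login": true,
    "update-user-on-login": false
  }
}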

View File

@@ -11,8 +11,8 @@ import (
"encoding/json"
"time"
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
"github.com/ClusterCockpit/cc-lib/resampler"
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
"github.com/ClusterCockpit/cc-lib/v2/resampler"
)
type ProgramConfig struct {
@@ -20,7 +20,9 @@ type ProgramConfig struct {
Addr string `json:"addr"`
// Addresses from which secured admin API endpoints can be reached; can be the wildcard "*"
APIAllowedIPs []string `json:"apiAllowedIPs"`
APIAllowedIPs []string `json:"api-allowed-ips"`
APISubjects *NATSConfig `json:"api-subjects"`
// Drop root permissions once .env was read and the port was taken.
User string `json:"user"`
@@ -35,16 +37,9 @@ type ProgramConfig struct {
EmbedStaticFiles bool `json:"embed-static-files"`
StaticFiles string `json:"static-files"`
// 'sqlite3' or 'mysql' (mysql will work for mariadb as well)
DBDriver string `json:"db-driver"`
// For sqlite3 a filename, for mysql a DSN in this format: https://github.com/go-sql-driver/mysql#dsn-data-source-name (Without query parameters!).
// Path to SQLite database file
DB string `json:"db"`
// Keep all metric data in the metric data repositories,
// do not write to the job-archive.
DisableArchive bool `json:"disable-archive"`
EnableJobTaggers bool `json:"enable-job-taggers"`
// Validate json input against schema
@@ -76,17 +71,42 @@ type ProgramConfig struct {
// If exists, will enable dynamic zoom in frontend metric plots using the configured values
EnableResampling *ResampleConfig `json:"resampling"`
// Systemd unit name for log viewer (default: "clustercockpit")
SystemdUnit string `json:"systemd-unit"`
// Node state retention configuration
NodeStateRetention *NodeStateRetention `json:"nodestate-retention"`
}
type NodeStateRetention struct {
Policy string `json:"policy"` // "delete" or "move"
Age int `json:"age"` // hours, default 24
TargetKind string `json:"target-kind"` // "file" or "s3"
TargetPath string `json:"target-path"`
TargetEndpoint string `json:"target-endpoint"`
TargetBucket string `json:"target-bucket"`
TargetAccessKey string `json:"target-access-key"`
TargetSecretKey string `json:"target-secret-key"`
TargetRegion string `json:"target-region"`
TargetUsePathStyle bool `json:"target-use-path-style"`
MaxFileSizeMB int `json:"max-file-size-mb"`
}
type ResampleConfig struct {
// Minimum number of points to trigger resampling of data
MinimumPoints int `json:"minimumPoints"`
MinimumPoints int `json:"minimum-points"`
// Array of resampling target resolutions, in seconds; Example: [600,300,60]
Resolutions []int `json:"resolutions"`
// Trigger next zoom level at less than this many visible datapoints
Trigger int `json:"trigger"`
}
type NATSConfig struct {
SubjectJobEvent string `json:"subject-job-event"`
SubjectNodeState string `json:"subject-node-state"`
}
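Putting the two new sections together, a main config fragment could look like this. The NATS subject names are purely illustrative, while age and max-file-size-mb show the defaults documented in the schema below:

"api-subjects": {
  "subject-job-event": "cc.job.event",
  "subject-node-state": "cc.node.state"
},
"nodestate-retention": {
  "policy": "move",
  "age": 24,
  "target-kind": "file",
  "target-path": "./var/nodestate-archive",
  "max-file-size-mb": 128
}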
type IntRange struct {
From int `json:"from"`
To int `json:"to"`
@@ -100,32 +120,20 @@ type TimeRange struct {
type FilterRanges struct {
Duration *IntRange `json:"duration"`
NumNodes *IntRange `json:"numNodes"`
StartTime *TimeRange `json:"startTime"`
NumNodes *IntRange `json:"num-nodes"`
StartTime *TimeRange `json:"start-time"`
}
type ClusterConfig struct {
Name string `json:"name"`
FilterRanges *FilterRanges `json:"filterRanges"`
MetricDataRepository json.RawMessage `json:"metricDataRepository"`
}
var Clusters []*ClusterConfig
var Keys ProgramConfig = ProgramConfig{
Addr: "localhost:8080",
DisableAuthentication: false,
EmbedStaticFiles: true,
DBDriver: "sqlite3",
DB: "./var/job.db",
DisableArchive: false,
Validate: false,
SessionMaxAge: "168h",
StopJobsExceedingWalltime: 0,
ShortRunningJobsDuration: 5 * 60,
}
func Init(mainConfig json.RawMessage, clusterConfig json.RawMessage) {
func Init(mainConfig json.RawMessage) {
Validate(configSchema, mainConfig)
dec := json.NewDecoder(bytes.NewReader(mainConfig))
dec.DisallowUnknownFields()
@@ -133,17 +141,6 @@ func Init(mainConfig json.RawMessage, clusterConfig json.RawMessage) {
cclog.Abortf("Config Init: Could not decode config file '%s'.\nError: %s\n", mainConfig, err.Error())
}
Validate(clustersSchema, clusterConfig)
dec = json.NewDecoder(bytes.NewReader(clusterConfig))
dec.DisallowUnknownFields()
if err := dec.Decode(&Clusters); err != nil {
cclog.Abortf("Config Init: Could not decode config file '%s'.\nError: %s\n", mainConfig, err.Error())
}
if len(Clusters) < 1 {
cclog.Abort("Config Init: At least one cluster required in config. Exited with error.")
}
if Keys.EnableResampling != nil && Keys.EnableResampling.MinimumPoints > 0 {
resampler.SetMinimumRequiredPoints(Keys.EnableResampling.MinimumPoints)
}

View File

@@ -8,19 +8,15 @@ package config
import (
"testing"
ccconf "github.com/ClusterCockpit/cc-lib/ccConfig"
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
ccconf "github.com/ClusterCockpit/cc-lib/v2/ccConfig"
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
)
func TestInit(t *testing.T) {
fp := "../../configs/config.json"
ccconf.Init(fp)
if cfg := ccconf.GetPackageConfig("main"); cfg != nil {
if clustercfg := ccconf.GetPackageConfig("clusters"); clustercfg != nil {
Init(cfg, clustercfg)
} else {
cclog.Abort("Cluster configuration must be present")
}
Init(cfg)
} else {
cclog.Abort("Main configuration must be present")
}
@@ -34,11 +30,7 @@ func TestInitMinimal(t *testing.T) {
fp := "../../configs/config-demo.json"
ccconf.Init(fp)
if cfg := ccconf.GetPackageConfig("main"); cfg != nil {
if clustercfg := ccconf.GetPackageConfig("clusters"); clustercfg != nil {
Init(cfg, clustercfg)
} else {
cclog.Abort("Cluster configuration must be present")
}
Init(cfg)
} else {
cclog.Abort("Main configuration must be present")
}

View File

@@ -15,7 +15,7 @@ import (
type DefaultMetricsCluster struct {
Name string `json:"name"`
DefaultMetrics string `json:"default_metrics"`
DefaultMetrics string `json:"default-metrics"`
}
type DefaultMetricsConfig struct {

View File

@@ -6,14 +6,14 @@
package config
var configSchema = `
{
{
"type": "object",
"properties": {
"addr": {
"description": "Address where the http (or https) server will listen on (for example: 'localhost:80').",
"type": "string"
},
"apiAllowedIPs": {
"api-allowed-ips": {
"description": "Addresses from which secured API endpoints can be reached",
"type": "array",
"items": {
@@ -41,13 +41,9 @@ var configSchema = `
"type": "string"
},
"db": {
"description": "For sqlite3 a filename, for mysql a DSN in this format: https://github.com/go-sql-driver/mysql#dsn-data-source-name (Without query parameters!).",
"description": "Path to SQLite database file (e.g., './var/job.db')",
"type": "string"
},
"disable-archive": {
"description": "Keep all metric data in the metric data repositories, do not write to the job-archive.",
"type": "boolean"
},
"enable-job-taggers": {
"description": "Turn on automatic application and jobclass taggers",
"type": "boolean"
@@ -81,28 +77,22 @@ var configSchema = `
"type": "integer"
},
"emission-constant": {
"description": ".",
"description": "Energy mix CO2 emission constant [g/kWh]. If set, displays estimated CO2 emission for jobs.",
"type": "integer"
},
"cron-frequency": {
"description": "Frequency of cron job workers.",
"type": "object",
"properties": {
"duration-worker": {
"description": "Duration Update Worker [Defaults to '5m']",
"type": "string"
},
"footprint-worker": {
"description": "Metric-Footprint Update Worker [Defaults to '10m']",
"type": "string"
}
}
"machine-state-dir": {
"description": "Where to store MachineState files.",
"type": "string"
},
"enable-resampling": {
"systemd-unit": {
"description": "Systemd unit name for log viewer (default: 'clustercockpit').",
"type": "string"
},
"resampling": {
"description": "Enable dynamic zoom in frontend metric plots.",
"type": "object",
"properties": {
"minimumPoints": {
"minimum-points": {
"description": "Minimum points to trigger resampling of time-series data.",
"type": "integer"
},
@@ -119,87 +109,74 @@ var configSchema = `
}
},
"required": ["trigger", "resolutions"]
}
},
"required": ["apiAllowedIPs"]
}`
var clustersSchema = `
{
"type": "array",
"items": {
},
"api-subjects": {
"description": "NATS subjects configuration for subscribing to job and node events.",
"type": "object",
"properties": {
"name": {
"description": "The name of the cluster.",
"subject-job-event": {
"description": "NATS subject for job events (start_job, stop_job)",
"type": "string"
},
"metricDataRepository": {
"description": "Type of the metric data repository for this cluster",
"type": "object",
"properties": {
"kind": {
"type": "string",
"enum": ["influxdb", "prometheus", "cc-metric-store", "cc-metric-store-internal", "test"]
},
"url": {
"type": "string"
},
"token": {
"type": "string"
}
},
"required": ["kind"]
},
"filterRanges": {
"description": "This option controls the slider ranges for the UI controls of numNodes, duration, and startTime.",
"type": "object",
"properties": {
"numNodes": {
"description": "UI slider range for number of nodes",
"type": "object",
"properties": {
"from": {
"type": "integer"
},
"to": {
"type": "integer"
}
},
"required": ["from", "to"]
},
"duration": {
"description": "UI slider range for duration",
"type": "object",
"properties": {
"from": {
"type": "integer"
},
"to": {
"type": "integer"
}
},
"required": ["from", "to"]
},
"startTime": {
"description": "UI slider range for start time",
"type": "object",
"properties": {
"from": {
"type": "string",
"format": "date-time"
},
"to": {
"type": "null"
}
},
"required": ["from", "to"]
}
},
"required": ["numNodes", "duration", "startTime"]
"subject-node-state": {
"description": "NATS subject for node state updates",
"type": "string"
}
},
"required": ["name", "metricDataRepository", "filterRanges"],
"minItems": 1
"required": ["subject-job-event", "subject-node-state"]
},
"nodestate-retention": {
"description": "Node state retention configuration for cleaning up old node_state rows.",
"type": "object",
"properties": {
"policy": {
"description": "Retention policy: 'delete' to remove old rows, 'move' to archive to Parquet then delete.",
"type": "string",
"enum": ["delete", "move"]
},
"age": {
"description": "Retention age in hours (default: 24).",
"type": "integer"
},
"target-kind": {
"description": "Target kind for parquet archiving: 'file' or 's3'.",
"type": "string",
"enum": ["file", "s3"]
},
"target-path": {
"description": "Filesystem path for parquet file target.",
"type": "string"
},
"target-endpoint": {
"description": "S3 endpoint URL.",
"type": "string"
},
"target-bucket": {
"description": "S3 bucket name.",
"type": "string"
},
"target-access-key": {
"description": "S3 access key.",
"type": "string"
},
"target-secret-key": {
"description": "S3 secret key.",
"type": "string"
},
"target-region": {
"description": "S3 region.",
"type": "string"
},
"target-use-path-style": {
"description": "Use path-style S3 addressing.",
"type": "boolean"
},
"max-file-size-mb": {
"description": "Maximum parquet file size in MB (default: 128).",
"type": "integer"
}
},
"required": ["policy"]
}
}`
}
}`

View File

@@ -8,7 +8,7 @@ package config
import (
"encoding/json"
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
"github.com/santhosh-tekuri/jsonschema/v5"
)

File diff suppressed because it is too large Load Diff

View File

@@ -10,9 +10,21 @@ import (
"time"
"github.com/ClusterCockpit/cc-backend/internal/config"
"github.com/ClusterCockpit/cc-lib/schema"
"github.com/ClusterCockpit/cc-lib/v2/schema"
)
type ClusterMetricWithName struct {
Name string `json:"name"`
Unit *schema.Unit `json:"unit,omitempty"`
Timestep int `json:"timestep"`
Data []schema.Float `json:"data"`
}
type ClusterMetrics struct {
NodeCount int `json:"nodeCount"`
Metrics []*ClusterMetricWithName `json:"metrics"`
}
type Count struct {
Name string `json:"name"`
Count int `json:"count"`
@@ -59,6 +71,7 @@ type JobFilter struct {
Project *StringInput `json:"project,omitempty"`
JobName *StringInput `json:"jobName,omitempty"`
Cluster *StringInput `json:"cluster,omitempty"`
SubCluster *StringInput `json:"subCluster,omitempty"`
Partition *StringInput `json:"partition,omitempty"`
Duration *config.IntRange `json:"duration,omitempty"`
Energy *FloatRange `json:"energy,omitempty"`
@@ -70,6 +83,7 @@ type JobFilter struct {
State []schema.JobState `json:"state,omitempty"`
MetricStats []*MetricStatItem `json:"metricStats,omitempty"`
Shared *string `json:"shared,omitempty"`
Schedule *string `json:"schedule,omitempty"`
Node *StringInput `json:"node,omitempty"`
}
@@ -173,7 +187,7 @@ type NamedStatsWithScope struct {
type NodeFilter struct {
Hostname *StringInput `json:"hostname,omitempty"`
Cluster *StringInput `json:"cluster,omitempty"`
Subcluster *StringInput `json:"subcluster,omitempty"`
SubCluster *StringInput `json:"subCluster,omitempty"`
SchedulerState *schema.SchedulerState `json:"schedulerState,omitempty"`
HealthState *string `json:"healthState,omitempty"`
TimeStart *int `json:"timeStart,omitempty"`

View File

@@ -4,7 +4,7 @@ import (
"sync"
"github.com/ClusterCockpit/cc-backend/internal/repository"
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
"github.com/jmoiron/sqlx"
)

View File

@@ -1,13 +1,15 @@
package graph
// This file will be automatically regenerated based on the schema, any resolver implementations
// This file will be automatically regenerated based on the schema, any resolver
// implementations
// will be copied through when generating and any unknown code will be moved to the end.
// Code generated by github.com/99designs/gqlgen version v0.17.81
// Code generated by github.com/99designs/gqlgen version v0.17.87
import (
"context"
"errors"
"fmt"
"math"
"regexp"
"slices"
"strconv"
@@ -17,11 +19,12 @@ import (
"github.com/ClusterCockpit/cc-backend/internal/config"
"github.com/ClusterCockpit/cc-backend/internal/graph/generated"
"github.com/ClusterCockpit/cc-backend/internal/graph/model"
"github.com/ClusterCockpit/cc-backend/internal/metricDataDispatcher"
"github.com/ClusterCockpit/cc-backend/internal/metricdispatch"
"github.com/ClusterCockpit/cc-backend/internal/repository"
"github.com/ClusterCockpit/cc-backend/pkg/archive"
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
"github.com/ClusterCockpit/cc-lib/schema"
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
ccunit "github.com/ClusterCockpit/cc-lib/v2/ccUnits"
"github.com/ClusterCockpit/cc-lib/v2/schema"
)
// Partitions is the resolver for the partitions field.
@@ -86,14 +89,14 @@ func (r *jobResolver) EnergyFootprint(ctx context.Context, obj *schema.Job) ([]*
res := []*model.EnergyFootprintValue{}
for name, value := range rawEnergyFootprint {
// Suboptimal: Nearly hardcoded metric name expectations
matchCpu := regexp.MustCompile(`cpu|Cpu|CPU`)
matchCPU := regexp.MustCompile(`cpu|Cpu|CPU`)
matchAcc := regexp.MustCompile(`acc|Acc|ACC`)
matchMem := regexp.MustCompile(`mem|Mem|MEM`)
matchCore := regexp.MustCompile(`core|Core|CORE`)
hwType := ""
switch test := name; { // Notice ';' for var declaration
case matchCpu.MatchString(test):
case matchCPU.MatchString(test):
hwType = "CPU"
case matchAcc.MatchString(test):
hwType = "Accelerator"
@@ -173,9 +176,9 @@ func (r *mutationResolver) AddTagsToJob(ctx context.Context, job string, tagIds
}
tags := []*schema.Tag{}
for _, tagId := range tagIds {
for _, tagID := range tagIds {
// Get ID
tid, err := strconv.ParseInt(tagId, 10, 64)
tid, err := strconv.ParseInt(tagID, 10, 64)
if err != nil {
cclog.Warn("Error while parsing tag id")
return nil, err
@@ -220,9 +223,9 @@ func (r *mutationResolver) RemoveTagsFromJob(ctx context.Context, job string, ta
}
tags := []*schema.Tag{}
for _, tagId := range tagIds {
for _, tagID := range tagIds {
// Get ID
tid, err := strconv.ParseInt(tagId, 10, 64)
tid, err := strconv.ParseInt(tagID, 10, 64)
if err != nil {
cclog.Warn("Error while parsing tag id")
return nil, err
@@ -263,9 +266,9 @@ func (r *mutationResolver) RemoveTagFromList(ctx context.Context, tagIds []strin
}
tags := []int{}
for _, tagId := range tagIds {
for _, tagID := range tagIds {
// Get ID
tid, err := strconv.ParseInt(tagId, 10, 64)
tid, err := strconv.ParseInt(tagID, 10, 64)
if err != nil {
cclog.Warn("Error while parsing tag id for removal")
return nil, err
@@ -281,7 +284,7 @@ func (r *mutationResolver) RemoveTagFromList(ctx context.Context, tagIds []strin
// Test Access: Admins && Admin Tag OR Everyone && Private Tag
if user.HasRole(schema.RoleAdmin) && (tscope == "global" || tscope == "admin") || user.Username == tscope {
// Remove from DB
if err = r.Repo.RemoveTagById(tid); err != nil {
if err = r.Repo.RemoveTagByID(tid); err != nil {
cclog.Warn("Error while removing tag")
return nil, err
} else {
@@ -315,18 +318,39 @@ func (r *nodeResolver) SchedulerState(ctx context.Context, obj *schema.Node) (sc
if obj.NodeState != "" {
return obj.NodeState, nil
} else {
return "", fmt.Errorf("No SchedulerState (NodeState) on Object")
return "", fmt.Errorf("resolver: no SchedulerState (NodeState) on node object")
}
}
// HealthState is the resolver for the healthState field.
func (r *nodeResolver) HealthState(ctx context.Context, obj *schema.Node) (string, error) {
panic(fmt.Errorf("not implemented: HealthState - healthState"))
if obj.HealthState != "" {
return string(obj.HealthState), nil
} else {
return "", fmt.Errorf("resolver: no HealthState (NodeState) on node object")
}
}
// MetaData is the resolver for the metaData field.
func (r *nodeResolver) MetaData(ctx context.Context, obj *schema.Node) (any, error) {
panic(fmt.Errorf("not implemented: MetaData - metaData"))
if obj.MetaData != nil {
return obj.MetaData, nil
} else {
cclog.Debug("resolver: no MetaData (NodeState) on node object")
emptyMeta := make(map[string]string, 0)
return emptyMeta, nil
}
}
// HealthData is the resolver for the healthData field.
func (r *nodeResolver) HealthData(ctx context.Context, obj *schema.Node) (any, error) {
if obj.HealthData != nil {
return obj.HealthData, nil
} else {
cclog.Debug("resolver: no HealthData (NodeState) on node object")
emptyHealth := make(map[string][]string, 0)
return emptyHealth, nil
}
}
// Clusters is the resolver for the clusters field.
@@ -341,6 +365,14 @@ func (r *queryResolver) Tags(ctx context.Context) ([]*schema.Tag, error) {
// GlobalMetrics is the resolver for the globalMetrics field.
func (r *queryResolver) GlobalMetrics(ctx context.Context) ([]*schema.GlobalMetricListItem, error) {
user := repository.GetUserFromContext(ctx)
if user != nil {
if user.HasRole(schema.RoleUser) || user.HasRole(schema.RoleManager) {
return archive.GlobalUserMetricList, nil
}
}
return archive.GlobalMetricList, nil
}
@@ -371,12 +403,12 @@ func (r *queryResolver) AllocatedNodes(ctx context.Context, cluster string) ([]*
// Node is the resolver for the node field.
func (r *queryResolver) Node(ctx context.Context, id string) (*schema.Node, error) {
repo := repository.GetNodeRepository()
numericId, err := strconv.ParseInt(id, 10, 64)
numericID, err := strconv.ParseInt(id, 10, 64)
if err != nil {
cclog.Warn("Error while parsing job id")
return nil, err
}
return repo.GetNodeById(numericId, false)
return repo.GetNodeByID(numericID, false)
}
// Nodes is the resolver for the nodes field.
@@ -387,6 +419,15 @@ func (r *queryResolver) Nodes(ctx context.Context, filter []*model.NodeFilter, o
return &model.NodeStateResultList{Items: nodes, Count: &count}, err
}
// NodesWithMeta is the resolver for the nodesWithMeta field.
func (r *queryResolver) NodesWithMeta(ctx context.Context, filter []*model.NodeFilter, order *model.OrderByInput) (*model.NodeStateResultList, error) {
// Why an extra handler? graphql.CollectAllFields(ctx) only returns top-level fields (i.e. items, count), not subfields like item.metaData
repo := repository.GetNodeRepository()
nodes, err := repo.QueryNodesWithMeta(ctx, filter, nil, order) // Ignore Paging, Order Unused
count := len(nodes)
return &model.NodeStateResultList{Items: nodes, Count: &count}, err
}
// NodeStates is the resolver for the nodeStates field.
func (r *queryResolver) NodeStates(ctx context.Context, filter []*model.NodeFilter) ([]*model.NodeStates, error) {
repo := repository.GetNodeRepository()
@@ -403,8 +444,7 @@ func (r *queryResolver) NodeStates(ctx context.Context, filter []*model.NodeFilt
return nil, herr
}
allCounts := make([]*model.NodeStates, 0)
allCounts = append(stateCounts, healthCounts...)
allCounts := append(stateCounts, healthCounts...)
return allCounts, nil
}
@@ -431,18 +471,18 @@ func (r *queryResolver) NodeStatesTimed(ctx context.Context, filter []*model.Nod
return healthCounts, nil
}
return nil, errors.New("Unknown Node State Query Type")
return nil, errors.New("unknown Node State Query Type")
}
// Job is the resolver for the job field.
func (r *queryResolver) Job(ctx context.Context, id string) (*schema.Job, error) {
numericId, err := strconv.ParseInt(id, 10, 64)
numericID, err := strconv.ParseInt(id, 10, 64)
if err != nil {
cclog.Warn("Error while parsing job id")
return nil, err
}
job, err := r.Repo.FindById(ctx, numericId)
job, err := r.Repo.FindByID(ctx, numericID)
if err != nil {
cclog.Warn("Error while finding job by id")
return nil, err
@@ -475,7 +515,7 @@ func (r *queryResolver) JobMetrics(ctx context.Context, id string, metrics []str
return nil, err
}
data, err := metricDataDispatcher.LoadData(job, metrics, scopes, ctx, *resolution)
data, err := metricdispatch.LoadData(job, metrics, scopes, ctx, *resolution)
if err != nil {
cclog.Warn("Error while loading job data")
return nil, err
@@ -503,7 +543,7 @@ func (r *queryResolver) JobStats(ctx context.Context, id string, metrics []strin
return nil, err
}
data, err := metricDataDispatcher.LoadJobStats(job, metrics, ctx)
data, err := metricdispatch.LoadJobStats(job, metrics, ctx)
if err != nil {
cclog.Warnf("Error while loading jobStats data for job id %s", id)
return nil, err
@@ -528,7 +568,7 @@ func (r *queryResolver) ScopedJobStats(ctx context.Context, id string, metrics [
return nil, err
}
data, err := metricDataDispatcher.LoadScopedJobStats(job, metrics, scopes, ctx)
data, err := metricdispatch.LoadScopedJobStats(job, metrics, scopes, ctx)
if err != nil {
cclog.Warnf("Error while loading scopedJobStats data for job id %s", id)
return nil, err
@@ -542,7 +582,7 @@ func (r *queryResolver) ScopedJobStats(ctx context.Context, id string, metrics [
for _, stat := range stats {
mdlStats = append(mdlStats, &model.ScopedStats{
Hostname: stat.Hostname,
ID: stat.Id,
ID: stat.ID,
Data: stat.Data,
})
}
@@ -581,21 +621,24 @@ func (r *queryResolver) Jobs(ctx context.Context, filter []*model.JobFilter, pag
// Note: Even if App-Default 'config.Keys.UiDefaults["job_list_usePaging"]' is set, always return hasNextPage boolean.
// Users can decide in frontend to use continuous scroll, even if app-default is paging!
// Skip if page.ItemsPerPage == -1 ("Load All" -> No Next Page required, Status Dashboards)
/*
Example: Page 4 @ 10 IpP : Does item 41 exist?
Probe: Page 41 @ 1 IpP : If len(result) is 1, Page 5 @ 10 IpP exists.
*/
nextPage := &model.PageRequest{
ItemsPerPage: 1,
Page: ((page.Page * page.ItemsPerPage) + 1),
hasNextPage := false
if page.ItemsPerPage != -1 {
nextPage := &model.PageRequest{
ItemsPerPage: 1,
Page: ((page.Page * page.ItemsPerPage) + 1),
}
nextJobs, err := r.Repo.QueryJobs(ctx, filter, nextPage, order)
if err != nil {
cclog.Warn("Error while querying next jobs")
return nil, err
}
hasNextPage = len(nextJobs) == 1
}
nextJobs, err := r.Repo.QueryJobs(ctx, filter, nextPage, order)
if err != nil {
cclog.Warn("Error while querying next jobs")
return nil, err
}
hasNextPage := len(nextJobs) == 1
return &model.JobResultList{Items: jobs, Count: &count, HasNextPage: &hasNextPage}, nil
}
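A worked check of the probe arithmetic above (pages are 1-indexed): page 4 at 10 items per page shows items 31-40, and

probe := (page.Page * page.ItemsPerPage) + 1 // 4*10 + 1 = 41

read as "page 41 at 1 item per page" fetches exactly item 41; getting one row back means page 5 at 10 items per page is non-empty.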
@@ -693,7 +736,7 @@ func (r *queryResolver) JobsMetricStats(ctx context.Context, filter []*model.Job
res := []*model.JobStats{}
for _, job := range jobs {
data, err := metricDataDispatcher.LoadJobStats(job, metrics, ctx)
data, err := metricdispatch.LoadJobStats(job, metrics, ctx)
if err != nil {
cclog.Warnf("Error while loading comparison jobStats data for job id %d", job.JobID)
continue
@@ -744,13 +787,19 @@ func (r *queryResolver) NodeMetrics(ctx context.Context, cluster string, nodes [
return nil, errors.New("you need to be administrator or support staff for this query")
}
defaultMetrics := make([]string, 0)
for _, mc := range archive.GetCluster(cluster).MetricConfig {
defaultMetrics = append(defaultMetrics, mc.Name)
}
if metrics == nil {
for _, mc := range archive.GetCluster(cluster).MetricConfig {
metrics = append(metrics, mc.Name)
}
metrics = defaultMetrics
} else {
metrics = slices.DeleteFunc(metrics, func(metric string) bool {
return !slices.Contains(defaultMetrics, metric) // Remove undefined metrics.
})
}
data, err := metricDataDispatcher.LoadNodeData(cluster, metrics, nodes, scopes, from, to, ctx)
data, err := metricdispatch.LoadNodeData(cluster, metrics, nodes, scopes, from, to, ctx)
if err != nil {
cclog.Warn("error while loading node data")
return nil, err
@@ -804,153 +853,39 @@ func (r *queryResolver) NodeMetricsList(ctx context.Context, cluster string, sub
return nil, errors.New("you need to be administrator or support staff for this query")
}
nodeRepo := repository.GetNodeRepository()
// nodes -> array of hostnames
nodes, stateMap, countNodes, hasNextPage, nerr := nodeRepo.GetNodesForList(ctx, cluster, subCluster, stateFilter, nodeFilter, page)
if nerr != nil {
return nil, errors.New("could not retrieve node list required for resolving NodeMetricsList")
}
if metrics == nil {
for _, mc := range archive.GetCluster(cluster).MetricConfig {
metrics = append(metrics, mc.Name)
}
}
// Build Filters
queryFilters := make([]*model.NodeFilter, 0)
if cluster != "" {
queryFilters = append(queryFilters, &model.NodeFilter{Cluster: &model.StringInput{Eq: &cluster}})
}
if subCluster != "" {
queryFilters = append(queryFilters, &model.NodeFilter{Subcluster: &model.StringInput{Eq: &subCluster}})
}
if nodeFilter != "" && stateFilter != "notindb" {
queryFilters = append(queryFilters, &model.NodeFilter{Hostname: &model.StringInput{Contains: &nodeFilter}})
}
if stateFilter != "all" && stateFilter != "notindb" {
var queryState schema.SchedulerState = schema.SchedulerState(stateFilter)
queryFilters = append(queryFilters, &model.NodeFilter{SchedulerState: &queryState})
}
// if healthFilter != "all" {
// filters = append(filters, &model.NodeFilter{HealthState: &healthFilter})
// }
// Special Case: Disable Paging for missing nodes filter, save IPP for later
var backupItems int
if stateFilter == "notindb" {
backupItems = page.ItemsPerPage
page.ItemsPerPage = -1
}
// Query Nodes From DB
nodeRepo := repository.GetNodeRepository()
rawNodes, serr := nodeRepo.QueryNodes(ctx, queryFilters, page, nil) // Order not Used
if serr != nil {
cclog.Warn("error while loading node database data (Resolver.NodeMetricsList)")
return nil, serr
}
// Intermediate Node Result Info
nodes := make([]string, 0)
stateMap := make(map[string]string)
for _, node := range rawNodes {
nodes = append(nodes, node.Hostname)
stateMap[node.Hostname] = string(node.NodeState)
}
// Setup Vars
var countNodes int
var cerr error
var hasNextPage bool
// Special Case: Find Nodes not in DB node table but in metricStore only
if stateFilter == "notindb" {
// Reapply Original Paging
page.ItemsPerPage = backupItems
// Get Nodes From Topology
var topoNodes []string
if subCluster != "" {
scNodes := archive.NodeLists[cluster][subCluster]
topoNodes = scNodes.PrintList()
} else {
subClusterNodeLists := archive.NodeLists[cluster]
for _, nodeList := range subClusterNodeLists {
topoNodes = append(topoNodes, nodeList.PrintList()...)
}
}
// Compare to all nodes from cluster/subcluster in DB
var missingNodes []string
for _, scanNode := range topoNodes {
if !slices.Contains(nodes, scanNode) {
missingNodes = append(missingNodes, scanNode)
}
}
// Filter nodes by name
if nodeFilter != "" {
filteredNodesByName := []string{}
for _, missingNode := range missingNodes {
if strings.Contains(missingNode, nodeFilter) {
filteredNodesByName = append(filteredNodesByName, missingNode)
}
}
missingNodes = filteredNodesByName
}
// Sort Missing Nodes Alphanumerically
slices.Sort(missingNodes)
// Total Missing
countNodes = len(missingNodes)
// Apply paging
if countNodes > page.ItemsPerPage {
start := (page.Page - 1) * page.ItemsPerPage
end := start + page.ItemsPerPage
if end > countNodes {
end = countNodes
hasNextPage = false
} else {
hasNextPage = true
}
nodes = missingNodes[start:end]
} else {
nodes = missingNodes
}
} else {
// DB Nodes: Count and Find Next Page
countNodes, cerr = nodeRepo.CountNodes(ctx, queryFilters)
if cerr != nil {
cclog.Warn("error while counting node database data (Resolver.NodeMetricsList)")
return nil, cerr
}
// Example Page 4 @ 10 IpP : Does item 41 exist?
// Minimal Page 41 @ 1 IpP : If len(result) is 1, Page 5 exists.
nextPage := &model.PageRequest{
ItemsPerPage: 1,
Page: ((page.Page * page.ItemsPerPage) + 1),
}
nextNodes, err := nodeRepo.QueryNodes(ctx, queryFilters, nextPage, nil) // Order not Used
if err != nil {
cclog.Warn("Error while querying next nodes")
return nil, err
}
hasNextPage = len(nextNodes) == 1
}
// Load Metric Data For Specified Nodes Only
data, err := metricDataDispatcher.LoadNodeListData(cluster, subCluster, nodes, metrics, scopes, *resolution, from, to, ctx)
// data -> map hostname:jobdata
data, err := metricdispatch.LoadNodeListData(cluster, subCluster, nodes, metrics, scopes, *resolution, from, to, ctx)
if err != nil {
cclog.Warn("error while loading node data (Resolver.NodeMetricsList")
return nil, err
}
// Build Result
nodeMetricsList := make([]*model.NodeMetrics, 0, len(data))
for hostname, metrics := range data {
for _, hostname := range nodes {
host := &model.NodeMetrics{
Host: hostname,
State: stateMap[hostname],
Metrics: make([]*model.JobMetricWithName, 0, len(metrics)*len(scopes)),
Metrics: make([]*model.JobMetricWithName, 0),
}
host.SubCluster, err = archive.GetSubClusterByNode(cluster, hostname)
if err != nil {
cclog.Warnf("error in nodeMetrics resolver: %s", err)
}
for metric, scopedMetrics := range metrics {
for metric, scopedMetrics := range data[hostname] {
for scope, scopedMetric := range scopedMetrics {
host.Metrics = append(host.Metrics, &model.JobMetricWithName{
Name: metric,
@@ -963,9 +898,9 @@ func (r *queryResolver) NodeMetricsList(ctx context.Context, cluster string, sub
nodeMetricsList = append(nodeMetricsList, host)
}
// Final Return
nodeMetricsListResult := &model.NodesResultList{
Items: nodeMetricsList,
Items: nodeMetricsList,
// TotalNodes depends on sum of nodes grouped on latest timestamp, see repo/node.go:357
TotalNodes: &countNodes,
HasNextPage: &hasNextPage,
}
@@ -973,6 +908,99 @@ func (r *queryResolver) NodeMetricsList(ctx context.Context, cluster string, sub
return nodeMetricsListResult, nil
}
// ClusterMetrics is the resolver for the clusterMetrics field.
func (r *queryResolver) ClusterMetrics(ctx context.Context, cluster string, metrics []string, from time.Time, to time.Time) (*model.ClusterMetrics, error) {
user := repository.GetUserFromContext(ctx)
if user != nil && !user.HasAnyRole([]schema.Role{schema.RoleAdmin, schema.RoleSupport}) {
return nil, errors.New("you need to be administrator or support staff for this query")
}
if metrics == nil {
for _, mc := range archive.GetCluster(cluster).MetricConfig {
metrics = append(metrics, mc.Name)
}
}
// 'nodes' == nil -> Defaults to all nodes of cluster for existing query workflow
scopes := []schema.MetricScope{"node"}
data, err := metricdispatch.LoadNodeData(cluster, metrics, nil, scopes, from, to, ctx)
if err != nil {
cclog.Warn("error while loading node data")
return nil, err
}
clusterMetricData := make([]*model.ClusterMetricWithName, 0)
clusterMetrics := model.ClusterMetrics{NodeCount: 0, Metrics: clusterMetricData}
collectorTimestep := make(map[string]int)
collectorUnit := make(map[string]schema.Unit)
collectorData := make(map[string][]schema.Float)
for _, metrics := range data {
clusterMetrics.NodeCount += 1
for metric, scopedMetrics := range metrics {
for _, scopedMetric := range scopedMetrics {
// Collect Info Once
_, okTimestep := collectorTimestep[metric]
if !okTimestep {
collectorTimestep[metric] = scopedMetric.Timestep
}
_, okUnit := collectorUnit[metric]
if !okUnit {
collectorUnit[metric] = scopedMetric.Unit
}
// Collect Data
for _, ser := range scopedMetric.Series {
_, okData := collectorData[metric]
// Init With Datasize > 0
if !okData && len(ser.Data) != 0 {
collectorData[metric] = make([]schema.Float, len(ser.Data))
} else if !okData {
cclog.Debugf("[SCHEMARESOLVER] clusterMetrics skip init: no data -> %s at %s; size %d", metric, ser.Hostname, len(ser.Data))
}
// Sum if init'd and matching size
if okData && len(ser.Data) == len(collectorData[metric]) {
for i, val := range ser.Data {
if val.IsNaN() {
continue
} else {
collectorData[metric][i] += val
}
}
} else if okData {
cclog.Debugf("[SCHEMARESOLVER] clusterMetrics skip sum: data diff -> %s at %s; want size %d, have size %d", metric, ser.Hostname, len(collectorData[metric]), len(ser.Data))
}
}
}
}
}
for metricName, data := range collectorData {
// use ccUnits for backend normalization to "Tera"
p_old := ccunit.NewPrefix(collectorUnit[metricName].Prefix)
p_new := ccunit.NewPrefix("T")
convFunc := ccunit.GetPrefixPrefixFactor(p_old, p_new)
u_new := schema.Unit{Prefix: p_new.Prefix(), Base: collectorUnit[metricName].Base}
roundedData := make([]schema.Float, 0)
for _, v_old := range data {
v_new := math.Round(convFunc(float64(v_old)).(float64)*100.0) / 100.0
roundedData = append(roundedData, schema.Float(v_new))
}
cm := model.ClusterMetricWithName{
Name: metricName,
Unit: &u_new,
Timestep: collectorTimestep[metricName],
Data: roundedData,
}
clusterMetrics.Metrics = append(clusterMetrics.Metrics, &cm)
}
return &clusterMetrics, nil
}
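// Worked example for the normalization above (values hypothetical): a
// summed metric reported with prefix "G" is scaled by 10^(9-12) = 1e-3
// towards "T", so 1234.5 G becomes 1.2345 T, which the
// math.Round(x*100.0)/100.0 step rounds to 1.23.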
// NumberOfNodes is the resolver for the numberOfNodes field.
func (r *subClusterResolver) NumberOfNodes(ctx context.Context, obj *schema.SubCluster) (int, error) {
nodeList, err := archive.ParseNodeList(obj.Nodes)

View File

@@ -2,18 +2,20 @@
// All rights reserved. This file is part of cc-backend.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
package graph
import (
"context"
"fmt"
"math"
"slices"
"github.com/99designs/gqlgen/graphql"
"github.com/ClusterCockpit/cc-backend/internal/graph/model"
"github.com/ClusterCockpit/cc-backend/internal/metricDataDispatcher"
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
"github.com/ClusterCockpit/cc-lib/schema"
"github.com/ClusterCockpit/cc-backend/internal/metricdispatch"
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
"github.com/ClusterCockpit/cc-lib/v2/schema"
)
const MAX_JOBS_FOR_ANALYSIS = 500
@@ -53,15 +55,15 @@ func (r *queryResolver) rooflineHeatmap(
// resolution = max(resolution, mc.Timestep)
// }
jobdata, err := metricDataDispatcher.LoadData(job, []string{"flops_any", "mem_bw"}, []schema.MetricScope{schema.MetricScopeNode}, ctx, 0)
jobdata, err := metricdispatch.LoadData(job, []string{"flops_any", "mem_bw"}, []schema.MetricScope{schema.MetricScopeNode}, ctx, 0)
if err != nil {
cclog.Errorf("Error while loading roofline metrics for job %d", job.ID)
cclog.Warnf("Error while loading roofline metrics for job %d", *job.ID)
return nil, err
}
flops_, membw_ := jobdata["flops_any"], jobdata["mem_bw"]
if flops_ == nil && membw_ == nil {
cclog.Infof("rooflineHeatmap(): 'flops_any' or 'mem_bw' missing for job %d", job.ID)
cclog.Warnf("rooflineHeatmap(): 'flops_any' or 'mem_bw' missing for job %d", *job.ID)
continue
// return nil, fmt.Errorf("GRAPH/UTIL > 'flops_any' or 'mem_bw' missing for job %d", job.ID)
}
@@ -126,7 +128,7 @@ func (r *queryResolver) jobsFootprints(ctx context.Context, filter []*model.JobF
continue
}
if err := metricDataDispatcher.LoadAverages(job, metrics, avgs, ctx); err != nil {
if err := metricdispatch.LoadAverages(job, metrics, avgs, ctx); err != nil {
cclog.Error("Error while loading averages for footprint")
return nil, err
}
@@ -185,11 +187,5 @@ func (r *queryResolver) jobsFootprints(ctx context.Context, filter []*model.JobF
func requireField(ctx context.Context, name string) bool {
fields := graphql.CollectAllFields(ctx)
for _, f := range fields {
if f == name {
return true
}
}
return false
return slices.Contains(fields, name)
}

View File

@@ -2,6 +2,7 @@
// All rights reserved. This file is part of cc-backend.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
package importer
import (
@@ -14,8 +15,8 @@ import (
"github.com/ClusterCockpit/cc-backend/internal/config"
"github.com/ClusterCockpit/cc-backend/internal/repository"
"github.com/ClusterCockpit/cc-backend/pkg/archive"
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
"github.com/ClusterCockpit/cc-lib/schema"
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
"github.com/ClusterCockpit/cc-lib/v2/schema"
)
// HandleImportFlag imports jobs from file pairs specified in a comma-separated flag string.
@@ -37,7 +38,7 @@ import (
func HandleImportFlag(flag string) error {
r := repository.GetJobRepository()
for _, pair := range strings.Split(flag, ",") {
for pair := range strings.SplitSeq(flag, ",") {
files := strings.Split(pair, ":")
if len(files) != 2 {
return fmt.Errorf("REPOSITORY/INIT > invalid import flag format")
@@ -101,7 +102,7 @@ func HandleImportFlag(flag string) error {
return err
}
id, err := r.InsertJob(&job)
id, err := r.InsertJobDirect(&job)
if err != nil {
cclog.Warn("Error while job db insert")
return err

View File

@@ -16,8 +16,8 @@ import (
"github.com/ClusterCockpit/cc-backend/internal/importer"
"github.com/ClusterCockpit/cc-backend/internal/repository"
"github.com/ClusterCockpit/cc-backend/pkg/archive"
ccconf "github.com/ClusterCockpit/cc-lib/ccConfig"
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
ccconf "github.com/ClusterCockpit/cc-lib/v2/ccConfig"
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
)
// copyFile copies a file from source path to destination path.
@@ -50,42 +50,14 @@ func setup(t *testing.T) *repository.JobRepository {
"main": {
"addr": "0.0.0.0:8080",
"validate": false,
"apiAllowedIPs": [
"api-allowed-ips": [
"*"
]},
"archive": {
"kind": "file",
"path": "./var/job-archive"
},
"clusters": [
{
"name": "testcluster",
"metricDataRepository": {"kind": "test", "url": "bla:8081"},
"filterRanges": {
"numNodes": { "from": 1, "to": 64 },
"duration": { "from": 0, "to": 86400 },
"startTime": { "from": "2022-01-01T00:00:00Z", "to": null }
}
},
{
"name": "fritz",
"metricDataRepository": {"kind": "test", "url": "bla:8081"},
"filterRanges": {
"numNodes": { "from": 1, "to": 944 },
"duration": { "from": 0, "to": 86400 },
"startTime": { "from": "2022-01-01T00:00:00Z", "to": null }
}
},
{
"name": "taurus",
"metricDataRepository": {"kind": "test", "url": "bla:8081"},
"filterRanges": {
"numNodes": { "from": 1, "to": 4000 },
"duration": { "from": 0, "to": 604800 },
"startTime": { "from": "2010-01-01T00:00:00Z", "to": null }
}
}
]}`
}
}`
cclog.Init("info", true)
tmpdir := t.TempDir()
@@ -107,7 +79,7 @@ func setup(t *testing.T) *repository.JobRepository {
}
dbfilepath := filepath.Join(tmpdir, "test.db")
err := repository.MigrateDB("sqlite3", dbfilepath)
err := repository.MigrateDB(dbfilepath)
if err != nil {
t.Fatal(err)
}
@@ -121,22 +93,18 @@ func setup(t *testing.T) *repository.JobRepository {
// Load and check main configuration
if cfg := ccconf.GetPackageConfig("main"); cfg != nil {
if clustercfg := ccconf.GetPackageConfig("clusters"); clustercfg != nil {
config.Init(cfg, clustercfg)
} else {
t.Fatal("Cluster configuration must be present")
}
config.Init(cfg)
} else {
t.Fatal("Main configuration must be present")
}
archiveCfg := fmt.Sprintf("{\"kind\": \"file\",\"path\": \"%s\"}", jobarchive)
if err := archive.Init(json.RawMessage(archiveCfg), config.Keys.DisableArchive); err != nil {
if err := archive.Init(json.RawMessage(archiveCfg)); err != nil {
t.Fatal(err)
}
repository.Connect("sqlite3", dbfilepath)
repository.Connect(dbfilepath)
return repository.GetJobRepository()
}
@@ -197,7 +165,7 @@ func TestHandleImportFlag(t *testing.T) {
}
result := readResult(t, testname)
job, err := r.FindCached(&result.JobId, &result.Cluster, &result.StartTime)
job, err := r.Find(&result.JobId, &result.Cluster, &result.StartTime)
if err != nil {
t.Fatal(err)
}

View File

@@ -22,8 +22,8 @@ import (
"github.com/ClusterCockpit/cc-backend/internal/repository"
"github.com/ClusterCockpit/cc-backend/pkg/archive"
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
"github.com/ClusterCockpit/cc-lib/schema"
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
"github.com/ClusterCockpit/cc-lib/v2/schema"
)
const (
@@ -111,18 +111,22 @@ func InitDB() error {
continue
}
id, err := r.TransactionAddNamed(t,
id, jobErr := r.TransactionAddNamed(t,
repository.NamedJobInsert, jobMeta)
if err != nil {
cclog.Errorf("repository initDB(): %v", err)
if jobErr != nil {
cclog.Errorf("repository initDB(): %v", jobErr)
errorOccured++
continue
}
// Job successfully inserted, increment counter
i += 1
for _, tag := range jobMeta.Tags {
tagstr := tag.Name + ":" + tag.Type
tagID, ok := tags[tagstr]
if !ok {
var err error
tagID, err = r.TransactionAdd(t,
addTagQuery,
tag.Name, tag.Type)
@@ -138,10 +142,6 @@ func InitDB() error {
setTagQuery,
id, tagID)
}
if err == nil {
i += 1
}
}
if errorOccured > 0 {
@@ -216,7 +216,7 @@ func enrichJobMetadata(job *schema.Job) error {
metricEnergy = math.Round(rawEnergy*100.0) / 100.0
}
} else {
cclog.Warnf("Error while collecting energy metric %s for job, DB ID '%v', return '0.0'", fp, job.ID)
cclog.Warnf("Error while collecting energy metric %s for job, DB ID '%v', return '0.0'", fp, *job.ID)
}
job.EnergyFootprint[fp] = metricEnergy
@@ -225,7 +225,7 @@ func enrichJobMetadata(job *schema.Job) error {
job.Energy = (math.Round(totalEnergy*100.0) / 100.0)
if job.RawEnergyFootprint, err = json.Marshal(job.EnergyFootprint); err != nil {
cclog.Warnf("Error while marshaling energy footprint for job INTO BYTES, DB ID '%v'", job.ID)
cclog.Warnf("Error while marshaling energy footprint for job INTO BYTES, DB ID '%v'", *job.ID)
return err
}

View File

@@ -2,12 +2,13 @@
// All rights reserved. This file is part of cc-backend.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
package importer
import (
"math"
ccunits "github.com/ClusterCockpit/cc-lib/ccUnits"
ccunits "github.com/ClusterCockpit/cc-lib/v2/ccUnits"
)
// getNormalizationFactor calculates the scaling factor needed to normalize a value

View File

@@ -8,7 +8,7 @@ import (
"fmt"
"testing"
ccunits "github.com/ClusterCockpit/cc-lib/ccUnits"
ccunits "github.com/ClusterCockpit/cc-lib/v2/ccUnits"
)
// TestNormalizeFactor tests the normalization of large byte values to gigabyte prefix.

View File

@@ -1,217 +0,0 @@
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved. This file is part of cc-backend.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
package memorystore
import (
"math"
"github.com/ClusterCockpit/cc-lib/schema"
"github.com/ClusterCockpit/cc-lib/util"
)
type APIMetricData struct {
Error *string `json:"error,omitempty"`
Data schema.FloatArray `json:"data,omitempty"`
From int64 `json:"from"`
To int64 `json:"to"`
Resolution int64 `json:"resolution"`
Avg schema.Float `json:"avg"`
Min schema.Float `json:"min"`
Max schema.Float `json:"max"`
}
type APIQueryRequest struct {
Cluster string `json:"cluster"`
Queries []APIQuery `json:"queries"`
ForAllNodes []string `json:"for-all-nodes"`
From int64 `json:"from"`
To int64 `json:"to"`
WithStats bool `json:"with-stats"`
WithData bool `json:"with-data"`
WithPadding bool `json:"with-padding"`
}
type APIQueryResponse struct {
Queries []APIQuery `json:"queries,omitempty"`
Results [][]APIMetricData `json:"results"`
}
type APIQuery struct {
Type *string `json:"type,omitempty"`
SubType *string `json:"subtype,omitempty"`
Metric string `json:"metric"`
Hostname string `json:"host"`
Resolution int64 `json:"resolution"`
TypeIds []string `json:"type-ids,omitempty"`
SubTypeIds []string `json:"subtype-ids,omitempty"`
ScaleFactor schema.Float `json:"scale-by,omitempty"`
Aggregate bool `json:"aggreg"`
}
// TODO: Optimize this, just like the stats endpoint!
func (data *APIMetricData) AddStats() {
n := 0
sum, min, max := 0.0, math.MaxFloat64, -math.MaxFloat64
for _, x := range data.Data {
if x.IsNaN() {
continue
}
n += 1
sum += float64(x)
min = math.Min(min, float64(x))
max = math.Max(max, float64(x))
}
if n > 0 {
avg := sum / float64(n)
data.Avg = schema.Float(avg)
data.Min = schema.Float(min)
data.Max = schema.Float(max)
} else {
data.Avg, data.Min, data.Max = schema.NaN, schema.NaN, schema.NaN
}
}
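// exampleAddStats is a minimal sketch of how AddStats behaves: NaN
// gaps are skipped, so only the two real samples contribute.
func exampleAddStats() {
d := APIMetricData{Data: schema.FloatArray{1.0, schema.NaN, 3.0}}
d.AddStats() // d.Avg == 2.0, d.Min == 1.0, d.Max == 3.0
}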
func (data *APIMetricData) ScaleBy(f schema.Float) {
if f == 0 || f == 1 {
return
}
data.Avg *= f
data.Min *= f
data.Max *= f
for i := 0; i < len(data.Data); i++ {
data.Data[i] *= f
}
}
func (data *APIMetricData) PadDataWithNull(ms *MemoryStore, from, to int64, metric string) {
minfo, ok := ms.Metrics[metric]
if !ok {
return
}
if (data.From / minfo.Frequency) > (from / minfo.Frequency) {
padfront := int((data.From / minfo.Frequency) - (from / minfo.Frequency))
ndata := make([]schema.Float, 0, padfront+len(data.Data))
for range padfront {
ndata = append(ndata, schema.NaN)
}
for j := 0; j < len(data.Data); j++ {
ndata = append(ndata, data.Data[j])
}
data.Data = ndata
}
}
func FetchData(req APIQueryRequest) (*APIQueryResponse, error) {
req.WithData = true
ms := GetMemoryStore()
response := APIQueryResponse{
Results: make([][]APIMetricData, 0, len(req.Queries)),
}
if req.ForAllNodes != nil {
nodes := ms.ListChildren([]string{req.Cluster})
for _, node := range nodes {
for _, metric := range req.ForAllNodes {
q := APIQuery{
Metric: metric,
Hostname: node,
}
req.Queries = append(req.Queries, q)
response.Queries = append(response.Queries, q)
}
}
}
for _, query := range req.Queries {
sels := make([]util.Selector, 0, 1)
if query.Aggregate || query.Type == nil {
sel := util.Selector{{String: req.Cluster}, {String: query.Hostname}}
if query.Type != nil {
if len(query.TypeIds) == 1 {
sel = append(sel, util.SelectorElement{String: *query.Type + query.TypeIds[0]})
} else {
ids := make([]string, len(query.TypeIds))
for i, id := range query.TypeIds {
ids[i] = *query.Type + id
}
sel = append(sel, util.SelectorElement{Group: ids})
}
if query.SubType != nil {
if len(query.SubTypeIds) == 1 {
sel = append(sel, util.SelectorElement{String: *query.SubType + query.SubTypeIds[0]})
} else {
ids := make([]string, len(query.SubTypeIds))
for i, id := range query.SubTypeIds {
ids[i] = *query.SubType + id
}
sel = append(sel, util.SelectorElement{Group: ids})
}
}
}
sels = append(sels, sel)
} else {
for _, typeID := range query.TypeIds {
if query.SubType != nil {
for _, subTypeID := range query.SubTypeIds {
sels = append(sels, util.Selector{
{String: req.Cluster},
{String: query.Hostname},
{String: *query.Type + typeID},
{String: *query.SubType + subTypeID},
})
}
} else {
sels = append(sels, util.Selector{
{String: req.Cluster},
{String: query.Hostname},
{String: *query.Type + typeID},
})
}
}
}
// log.Printf("query: %#v\n", query)
// log.Printf("sels: %#v\n", sels)
var err error
res := make([]APIMetricData, 0, len(sels))
for _, sel := range sels {
data := APIMetricData{}
data.Data, data.From, data.To, data.Resolution, err = ms.Read(sel, query.Metric, req.From, req.To, query.Resolution)
if err != nil {
msg := err.Error()
data.Error = &msg
res = append(res, data)
continue
}
if req.WithStats {
data.AddStats()
}
if query.ScaleFactor != 0 {
data.ScaleBy(query.ScaleFactor)
}
if req.WithPadding {
data.PadDataWithNull(ms, req.From, req.To, query.Metric)
}
if !req.WithData {
data.Data = nil
}
res = append(res, data)
}
response.Results = append(response.Results, res)
}
return &response, nil
}
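// exampleFetchData is a minimal sketch of a request this function
// serves; cluster, host and metric names are hypothetical.
func exampleFetchData() (*APIQueryResponse, error) {
req := APIQueryRequest{
Cluster:   "testcluster",
From:      1700000000,
To:        1700003600,
WithStats: true,
Queries: []APIQuery{
{Metric: "flops_any", Hostname: "node001", Resolution: 60},
},
}
return FetchData(req)
}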

View File

@@ -1,191 +0,0 @@
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved. This file is part of cc-backend.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
package memorystore
import (
"archive/zip"
"bufio"
"context"
"errors"
"fmt"
"io"
"os"
"path/filepath"
"sync"
"sync/atomic"
"time"
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
)
func Archiving(wg *sync.WaitGroup, ctx context.Context) {
go func() {
defer wg.Done()
d, err := time.ParseDuration(Keys.Archive.Interval)
if err != nil {
cclog.Fatalf("[METRICSTORE]> error parsing archive interval duration: %v\n", err)
}
if d <= 0 {
return
}
ticks := func() <-chan time.Time {
if d <= 0 {
return nil
}
return time.NewTicker(d).C
}()
for {
select {
case <-ctx.Done():
return
case <-ticks:
t := time.Now().Add(-d)
cclog.Infof("[METRICSTORE]> start archiving checkpoints (older than %s)...", t.Format(time.RFC3339))
n, err := ArchiveCheckpoints(Keys.Checkpoints.RootDir,
Keys.Archive.RootDir, t.Unix(), Keys.Archive.DeleteInstead)
if err != nil {
cclog.Errorf("[METRICSTORE]> archiving failed: %s", err.Error())
} else {
cclog.Infof("[METRICSTORE]> done: %d files zipped and moved to archive", n)
}
}
}
}()
}
var ErrNoNewArchiveData error = errors.New("all data already archived")
// ZIP all checkpoint files older than `from` together and write them to the `archiveDir`,
// deleting them from the `checkpointsDir`.
func ArchiveCheckpoints(checkpointsDir, archiveDir string, from int64, deleteInstead bool) (int, error) {
entries1, err := os.ReadDir(checkpointsDir)
if err != nil {
return 0, err
}
type workItem struct {
cdir, adir string
cluster, host string
}
var wg sync.WaitGroup
n, errs := int32(0), int32(0)
work := make(chan workItem, Keys.NumWorkers)
wg.Add(Keys.NumWorkers)
for worker := 0; worker < Keys.NumWorkers; worker++ {
go func() {
defer wg.Done()
for workItem := range work {
m, err := archiveCheckpoints(workItem.cdir, workItem.adir, from, deleteInstead)
if err != nil {
cclog.Errorf("error while archiving %s/%s: %s", workItem.cluster, workItem.host, err.Error())
atomic.AddInt32(&errs, 1)
}
atomic.AddInt32(&n, int32(m))
}
}()
}
for _, de1 := range entries1 {
entries2, e := os.ReadDir(filepath.Join(checkpointsDir, de1.Name()))
if e != nil {
err = e
}
for _, de2 := range entries2 {
cdir := filepath.Join(checkpointsDir, de1.Name(), de2.Name())
adir := filepath.Join(archiveDir, de1.Name(), de2.Name())
work <- workItem{
adir: adir, cdir: cdir,
cluster: de1.Name(), host: de2.Name(),
}
}
}
close(work)
wg.Wait()
if err != nil {
return int(n), err
}
if errs > 0 {
return int(n), fmt.Errorf("%d errors happened while archiving (%d successes)", errs, n)
}
return int(n), nil
}
// Helper function for `ArchiveCheckpoints`.
func archiveCheckpoints(dir string, archiveDir string, from int64, deleteInstead bool) (int, error) {
entries, err := os.ReadDir(dir)
if err != nil {
return 0, err
}
extension := Keys.Checkpoints.FileFormat
files, err := findFiles(entries, from, extension, false)
if err != nil {
return 0, err
}
if deleteInstead {
n := 0
for _, checkpoint := range files {
filename := filepath.Join(dir, checkpoint)
if err = os.Remove(filename); err != nil {
return n, err
}
n += 1
}
return n, nil
}
filename := filepath.Join(archiveDir, fmt.Sprintf("%d.zip", from))
f, err := os.OpenFile(filename, os.O_CREATE|os.O_WRONLY, CheckpointFilePerms)
if err != nil && os.IsNotExist(err) {
err = os.MkdirAll(archiveDir, CheckpointDirPerms)
if err == nil {
f, err = os.OpenFile(filename, os.O_CREATE|os.O_WRONLY, CheckpointFilePerms)
}
}
if err != nil {
return 0, err
}
defer f.Close()
bw := bufio.NewWriter(f)
defer bw.Flush()
zw := zip.NewWriter(bw)
defer zw.Close()
n := 0
for _, checkpoint := range files {
filename := filepath.Join(dir, checkpoint)
r, err := os.Open(filename)
if err != nil {
return n, err
}
w, err := zw.Create(checkpoint)
if err != nil {
r.Close()
return n, err
}
if _, err = io.Copy(w, r); err != nil {
r.Close()
return n, err
}
// Close inside the loop: a deferred close here would pile up open
// file handles until the function returns.
r.Close()
if err = os.Remove(filename); err != nil {
return n, err
}
n += 1
}
return n, nil
}

View File

@@ -1,482 +0,0 @@
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved. This file is part of cc-backend.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
package memorystore
import (
"bufio"
"encoding/json"
"errors"
"fmt"
"os"
"path"
"sort"
"strconv"
"strings"
"sync"
"sync/atomic"
"time"
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
"github.com/ClusterCockpit/cc-lib/schema"
"github.com/linkedin/goavro/v2"
)
var NumAvroWorkers int = 4
var startUp bool = true
var ErrNoNewData error = errors.New("no data in the pool")
func (as *AvroStore) ToCheckpoint(dir string, dumpAll bool) (int, error) {
levels := make([]*AvroLevel, 0)
selectors := make([][]string, 0)
as.root.lock.RLock()
// Cluster
for sel1, l1 := range as.root.children {
l1.lock.RLock()
// Node
for sel2, l2 := range l1.children {
l2.lock.RLock()
// Frequency
for sel3, l3 := range l2.children {
levels = append(levels, l3)
selectors = append(selectors, []string{sel1, sel2, sel3})
}
l2.lock.RUnlock()
}
l1.lock.RUnlock()
}
as.root.lock.RUnlock()
type workItem struct {
level *AvroLevel
dir string
selector []string
}
n, errs := int32(0), int32(0)
var wg sync.WaitGroup
wg.Add(NumAvroWorkers)
work := make(chan workItem, NumAvroWorkers*2)
for range NumAvroWorkers {
go func() {
defer wg.Done()
for workItem := range work {
from := getTimestamp(workItem.dir)
if err := workItem.level.toCheckpoint(workItem.dir, from, dumpAll); err != nil {
if err == ErrNoNewArchiveData {
continue
}
cclog.Errorf("error while checkpointing %#v: %s", workItem.selector, err.Error())
atomic.AddInt32(&errs, 1)
} else {
atomic.AddInt32(&n, 1)
}
}
}()
}
for i := range len(levels) {
dir := path.Join(dir, path.Join(selectors[i]...))
work <- workItem{
level: levels[i],
dir: dir,
selector: selectors[i],
}
}
close(work)
wg.Wait()
if errs > 0 {
return int(n), fmt.Errorf("%d errors happend while creating avro checkpoints (%d successes)", errs, n)
}
startUp = false
return int(n), nil
}
// getTimestamp returns the timestamp from the directory name
func getTimestamp(dir string) int64 {
// Extract the resolution and timestamp from the directory name
// The existing avro file will be in epoch timestamp format
// iterate over all the files in the directory and find the maximum timestamp
// and return it
resolution := path.Base(dir)
dir = path.Dir(dir)
files, err := os.ReadDir(dir)
if err != nil {
return 0
}
var maxTS int64 = 0
if len(files) == 0 {
return 0
}
for _, file := range files {
if file.IsDir() {
continue
}
name := file.Name()
if len(name) < 5 || !strings.HasSuffix(name, ".avro") || !strings.HasPrefix(name, resolution+"_") {
continue
}
ts, err := strconv.ParseInt(name[strings.Index(name, "_")+1:len(name)-5], 10, 64)
if err != nil {
fmt.Printf("error while parsing timestamp: %s\n", err.Error())
continue
}
if ts > maxTS {
maxTS = ts
}
}
interval, _ := time.ParseDuration(Keys.Checkpoints.Interval)
updateTime := time.Unix(maxTS, 0).Add(interval).Add(time.Duration(CheckpointBufferMinutes-1) * time.Minute).Unix()
if startUp {
return 0
}
if updateTime < time.Now().Unix() {
return 0
}
return maxTS
}
func (l *AvroLevel) toCheckpoint(dir string, from int64, dumpAll bool) error {
l.lock.Lock()
defer l.lock.Unlock()
// fmt.Printf("Checkpointing directory: %s\n", dir)
// filepath contains the resolution
intRes, _ := strconv.Atoi(path.Base(dir))
// find smallest overall timestamp in l.data map and delete it from l.data
minTS := int64(1<<63 - 1)
for ts, dat := range l.data {
if ts < minTS && len(dat) != 0 {
minTS = ts
}
}
if from == 0 && minTS != int64(1<<63-1) {
from = minTS
}
if from == 0 {
return ErrNoNewArchiveData
}
var schema string
var codec *goavro.Codec
recordList := make([]map[string]any, 0)
var f *os.File
filePath := dir + fmt.Sprintf("_%d.avro", from)
var err error
fp_, err_ := os.Stat(filePath)
if errors.Is(err_, os.ErrNotExist) {
err = os.MkdirAll(path.Dir(dir), 0o755)
if err != nil {
return fmt.Errorf("failed to create directory: %v", err)
}
} else if fp_.Size() != 0 {
f, err = os.Open(filePath)
if err != nil {
return fmt.Errorf("failed to open existing avro file: %v", err)
}
br := bufio.NewReader(f)
reader, err := goavro.NewOCFReader(br)
if err != nil {
return fmt.Errorf("failed to create OCF reader: %v", err)
}
codec = reader.Codec()
schema = codec.Schema()
f.Close()
}
timeRef := time.Now().Add(time.Duration(-CheckpointBufferMinutes+1) * time.Minute).Unix()
if dumpAll {
timeRef = time.Now().Unix()
}
// Empty values
if len(l.data) == 0 {
// we checkpoint avro files every 60 seconds
repeat := 60 / intRes
for range repeat {
recordList = append(recordList, make(map[string]any))
}
}
readFlag := true
for ts := range l.data {
flag := false
if ts < timeRef {
data := l.data[ts]
schemaGen, err := generateSchema(data)
if err != nil {
return err
}
flag, schema, err = compareSchema(schema, schemaGen)
if err != nil {
return fmt.Errorf("failed to compare read and generated schema: %v", err)
}
if flag && readFlag && !errors.Is(err_, os.ErrNotExist) {
f.Close()
f, err = os.Open(filePath)
if err != nil {
return fmt.Errorf("failed to open Avro file: %v", err)
}
br := bufio.NewReader(f)
ocfReader, err := goavro.NewOCFReader(br)
if err != nil {
return fmt.Errorf("failed to create OCF reader while changing schema: %v", err)
}
for ocfReader.Scan() {
record, err := ocfReader.Read()
if err != nil {
return fmt.Errorf("failed to read record: %v", err)
}
recordList = append(recordList, record.(map[string]any))
}
f.Close()
err = os.Remove(filePath)
if err != nil {
return fmt.Errorf("failed to delete file: %v", err)
}
readFlag = false
}
codec, err = goavro.NewCodec(schema)
if err != nil {
return fmt.Errorf("failed to create codec after merged schema: %v", err)
}
recordList = append(recordList, generateRecord(data))
delete(l.data, ts)
}
}
if len(recordList) == 0 {
return ErrNoNewArchiveData
}
f, err = os.OpenFile(filePath, os.O_CREATE|os.O_APPEND|os.O_RDWR, 0o644)
if err != nil {
return fmt.Errorf("failed to append new avro file: %v", err)
}
// fmt.Printf("Codec : %#v\n", codec)
writer, err := goavro.NewOCFWriter(goavro.OCFConfig{
W: f,
Codec: codec,
CompressionName: goavro.CompressionDeflateLabel,
})
if err != nil {
return fmt.Errorf("failed to create OCF writer: %v", err)
}
// Append the new record
if err := writer.Append(recordList); err != nil {
return fmt.Errorf("failed to append record: %v", err)
}
f.Close()
return nil
}
func compareSchema(schemaRead, schemaGen string) (bool, string, error) {
var genSchema, readSchema AvroSchema
if schemaRead == "" {
return false, schemaGen, nil
}
// Unmarshal the schema strings into AvroSchema structs
if err := json.Unmarshal([]byte(schemaGen), &genSchema); err != nil {
return false, "", fmt.Errorf("failed to parse generated schema: %v", err)
}
if err := json.Unmarshal([]byte(schemaRead), &readSchema); err != nil {
return false, "", fmt.Errorf("failed to parse read schema: %v", err)
}
sort.Slice(genSchema.Fields, func(i, j int) bool {
return genSchema.Fields[i].Name < genSchema.Fields[j].Name
})
sort.Slice(readSchema.Fields, func(i, j int) bool {
return readSchema.Fields[i].Name < readSchema.Fields[j].Name
})
// Check if schemas are identical
schemasEqual := true
if len(genSchema.Fields) <= len(readSchema.Fields) {
for i := range genSchema.Fields {
if genSchema.Fields[i].Name != readSchema.Fields[i].Name {
schemasEqual = false
break
}
}
// If schemas are identical, return the read schema
if schemasEqual {
return false, schemaRead, nil
}
}
// Create a map to hold unique fields from both schemas
fieldMap := make(map[string]AvroField)
// Add fields from the read schema
for _, field := range readSchema.Fields {
fieldMap[field.Name] = field
}
// Add or update fields from the generated schema
for _, field := range genSchema.Fields {
fieldMap[field.Name] = field
}
// Create a union schema by collecting fields from the map
var mergedFields []AvroField
for _, field := range fieldMap {
mergedFields = append(mergedFields, field)
}
// Sort fields by name for consistency
sort.Slice(mergedFields, func(i, j int) bool {
return mergedFields[i].Name < mergedFields[j].Name
})
// Create the merged schema
mergedSchema := AvroSchema{
Type: "record",
Name: genSchema.Name,
Fields: mergedFields,
}
// Check if schemas are identical
schemasEqual = len(mergedSchema.Fields) == len(readSchema.Fields)
if schemasEqual {
for i := range mergedSchema.Fields {
if mergedSchema.Fields[i].Name != readSchema.Fields[i].Name {
schemasEqual = false
break
}
}
if schemasEqual {
return false, schemaRead, nil
}
}
// Marshal the merged schema back to JSON
mergedSchemaJSON, err := json.Marshal(mergedSchema)
if err != nil {
return false, "", fmt.Errorf("failed to marshal merged schema: %v", err)
}
return true, string(mergedSchemaJSON), nil
}
func generateSchema(data map[string]schema.Float) (string, error) {
// Define the Avro schema structure
schema := map[string]any{
"type": "record",
"name": "DataRecord",
"fields": []map[string]any{},
}
fieldTracker := make(map[string]struct{})
for key := range data {
if _, exists := fieldTracker[key]; !exists {
key = correctKey(key)
field := map[string]any{
"name": key,
"type": "double",
"default": -1.0,
}
schema["fields"] = append(schema["fields"].([]map[string]any), field)
fieldTracker[key] = struct{}{}
}
}
schemaString, err := json.Marshal(schema)
if err != nil {
return "", fmt.Errorf("failed to marshal schema: %v", err)
}
return string(schemaString), nil
}
func generateRecord(data map[string]schema.Float) map[string]any {
record := make(map[string]any)
// Iterate through each map in data
for key, value := range data {
key = correctKey(key)
// Set the value in the record
// avro only accepts basic types
record[key] = value.Double()
}
return record
}
func correctKey(key string) string {
// Replace any invalid characters in the key
// For example, replace spaces with underscores
key = strings.ReplaceAll(key, ":", "___")
key = strings.ReplaceAll(key, ".", "__")
return key
}
func ReplaceKey(key string) string {
// Replace any invalid characters in the key
// For example, replace spaces with underscores
key = strings.ReplaceAll(key, "___", ":")
key = strings.ReplaceAll(key, "__", ".")
return key
}
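// exampleKeyRoundTrip is a minimal sketch of the sanitization round
// trip above; the metric name is hypothetical.
func exampleKeyRoundTrip() {
sanitized := correctKey("cpu_load:core.0") // "cpu_load___core__0"
restored := ReplaceKey(sanitized)          // "cpu_load:core.0"
_ = restored
}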

View File

@@ -1,84 +0,0 @@
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved. This file is part of cc-backend.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
package memorystore
import (
"context"
"slices"
"strconv"
"sync"
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
)
func DataStaging(wg *sync.WaitGroup, ctx context.Context) {
// AvroPool is a pool of Avro writers.
go func() {
if Keys.Checkpoints.FileFormat == "json" {
wg.Done() // Mark this goroutine as done
return // Exit the goroutine
}
defer wg.Done()
var avroLevel *AvroLevel
oldSelector := make([]string, 0)
for {
select {
case <-ctx.Done():
return
case val := <-LineProtocolMessages:
// Fetch the frequency of the metric from the global configuration
freq, err := GetMetricFrequency(val.MetricName)
if err != nil {
cclog.Errorf("Error fetching metric frequency: %s\n", err)
continue
}
metricName := ""
for _, selectorName := range val.Selector {
metricName += selectorName + Delimiter
}
metricName += val.MetricName
// Create a new selector for the Avro level
// The selector is a slice of strings that represents the path to the
// Avro level. It is created by appending the cluster, node, and metric
// name to the selector.
var selector []string
selector = append(selector, val.Cluster, val.Node, strconv.FormatInt(freq, 10))
if !testEq(oldSelector, selector) {
// Get the Avro level for the metric
avroLevel = avroStore.root.findAvroLevelOrCreate(selector)
// If the Avro level is nil, create a new one
if avroLevel == nil {
cclog.Errorf("Error creating or finding the level with cluster : %s, node : %s, metric : %s\n", val.Cluster, val.Node, val.MetricName)
}
oldSelector = slices.Clone(selector)
}
avroLevel.addMetric(metricName, val.Value, val.Timestamp, int(freq))
}
}
}()
}
func testEq(a, b []string) bool {
if len(a) != len(b) {
return false
}
for i := range a {
if a[i] != b[i] {
return false
}
}
return true
}
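// Note: since "slices" is already imported above, testEq(a, b) is
// equivalent to slices.Equal(a, b).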

View File

@@ -1,168 +0,0 @@
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved. This file is part of cc-backend.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
package memorystore
import (
"sync"
"github.com/ClusterCockpit/cc-lib/schema"
)
var (
LineProtocolMessages = make(chan *AvroStruct)
Delimiter = "ZZZZZ"
)
// CheckpointBufferMinutes should always be in minutes.
// It controls the amount of data to hold for a given amount of time.
var CheckpointBufferMinutes = 3
type AvroStruct struct {
MetricName string
Cluster string
Node string
Selector []string
Value schema.Float
Timestamp int64
}
type AvroStore struct {
root AvroLevel
}
var avroStore AvroStore
type AvroLevel struct {
children map[string]*AvroLevel
data map[int64]map[string]schema.Float
lock sync.RWMutex
}
type AvroField struct {
Name string `json:"name"`
Type any `json:"type"`
Default any `json:"default,omitempty"`
}
type AvroSchema struct {
Type string `json:"type"`
Name string `json:"name"`
Fields []AvroField `json:"fields"`
}
func (l *AvroLevel) findAvroLevelOrCreate(selector []string) *AvroLevel {
if len(selector) == 0 {
return l
}
// Allow concurrent reads:
l.lock.RLock()
var child *AvroLevel
var ok bool
if l.children == nil {
// Children map needs to be created...
l.lock.RUnlock()
} else {
child, ok = l.children[selector[0]]
l.lock.RUnlock()
if ok {
return child.findAvroLevelOrCreate(selector[1:])
}
}
// The level does not exist, take write lock for unique access:
l.lock.Lock()
// While this thread waited for the write lock, another thread
// could have created the child node.
if l.children != nil {
child, ok = l.children[selector[0]]
if ok {
l.lock.Unlock()
return child.findAvroLevelOrCreate(selector[1:])
}
}
child = &AvroLevel{
data: make(map[int64]map[string]schema.Float, 0),
children: nil,
}
if l.children != nil {
l.children[selector[0]] = child
} else {
l.children = map[string]*AvroLevel{selector[0]: child}
}
l.lock.Unlock()
return child.findAvroLevelOrCreate(selector[1:])
}
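// The RLock fast path plus write-lock re-check above is the classic
// double-checked locking pattern: most lookups never take the write
// lock, and the re-check is required because another goroutine may
// have inserted the child between RUnlock and Lock.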
func (l *AvroLevel) addMetric(metricName string, value schema.Float, timestamp int64, Freq int) {
l.lock.Lock()
defer l.lock.Unlock()
KeyCounter := int(CheckpointBufferMinutes * 60 / Freq)
// Create keys in advance for the given amount of time
if len(l.data) != KeyCounter {
if len(l.data) == 0 {
for i := range KeyCounter {
l.data[timestamp+int64(i*Freq)] = make(map[string]schema.Float, 0)
}
} else {
// Get the last timestamp
var lastTS int64
for ts := range l.data {
if ts > lastTS {
lastTS = ts
}
}
// Extend the map by one key at the next timestamp
l.data[lastTS+int64(Freq)] = make(map[string]schema.Float, 0)
}
}
closestTS := int64(0)
minDiff := int64(Freq) + 1 // Start with diff just outside the valid range
found := false
// Iterate over timestamps and choose the one which is within range.
// Since it's epoch time, we check if the difference is within one frequency interval.
for ts, dat := range l.data {
// Check if timestamp is within range
diff := timestamp - ts
if diff < -int64(Freq) || diff > int64(Freq) {
continue
}
// Metric already present at this timestamp — skip
if _, ok := dat[metricName]; ok {
continue
}
// Check if this is the closest timestamp so far
if Abs(diff) < minDiff {
minDiff = Abs(diff)
closestTS = ts
found = true
}
}
if found {
l.data[closestTS][metricName] = value
}
}
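// Worked example for the bucket selection above (values hypothetical):
// with Freq = 60 and pre-created keys t0, t0+60 and t0+120, a value
// stamped t0+70 has diffs 70, 10 and -50; t0 falls outside the
// [-60, 60] window, and |10| < |-50|, so the value lands in the t0+60
// bucket.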
func GetAvroStore() *AvroStore {
return &avroStore
}
// Abs returns the absolute value of x.
func Abs(x int64) int64 {
if x < 0 {
return -x
}
return x
}

View File

@@ -1,198 +0,0 @@
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved. This file is part of cc-backend.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
package memorystore
import (
"errors"
"sync"
"github.com/ClusterCockpit/cc-lib/schema"
)
// Default buffer capacity.
// `buffer.data` will only ever grow up to its capacity and a new link
// in the buffer chain will be created if needed so that no copying
// of data or reallocation needs to happen on writes.
const (
BufferCap int = 512
)
// So that we can reuse allocations
var bufferPool sync.Pool = sync.Pool{
New: func() any {
return &buffer{
data: make([]schema.Float, 0, BufferCap),
}
},
}
var (
ErrNoData error = errors.New("[METRICSTORE]> no data for this metric/level")
ErrDataDoesNotAlign error = errors.New("[METRICSTORE]> data from lower granularities does not align")
)
// Each metric on each level has its own buffer.
// This is where the actual values go.
// If `cap(data)` is reached, a new buffer is created and
// becomes the new head of a buffer list.
type buffer struct {
prev *buffer
next *buffer
data []schema.Float
frequency int64
start int64
archived bool
closed bool
}
func newBuffer(ts, freq int64) *buffer {
b := bufferPool.Get().(*buffer)
b.frequency = freq
b.start = ts - (freq / 2)
b.prev = nil
b.next = nil
b.archived = false
b.closed = false
b.data = b.data[:0]
return b
}
// If a new buffer was created, the new head is returned.
// Otherwise, the existing buffer is returned.
// Normally, only "newer" data should be written, but if the value would
// end up in the same buffer anyways it is allowed.
func (b *buffer) write(ts int64, value schema.Float) (*buffer, error) {
if ts < b.start {
return nil, errors.New("[METRICSTORE]> cannot write value to buffer from past")
}
// idx := int((ts - b.start + (b.frequency / 3)) / b.frequency)
idx := int((ts - b.start) / b.frequency)
if idx >= cap(b.data) {
newbuf := newBuffer(ts, b.frequency)
newbuf.prev = b
b.next = newbuf
b.close()
b = newbuf
idx = 0
}
// Overwriting value or writing value from past
if idx < len(b.data) {
b.data[idx] = value
return b, nil
}
// Fill up unwritten slots with NaN
for i := len(b.data); i < idx; i++ {
b.data = append(b.data, schema.NaN)
}
b.data = append(b.data, value)
return b, nil
}
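// Worked example for the index math above: a buffer created for a
// first write at ts = 100 with frequency 10 gets start = 95, so a
// later write at ts = 130 lands at idx = (130-95)/10 = 3 and the
// unwritten slots 0..2 are padded with NaN first.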
func (b *buffer) end() int64 {
return b.firstWrite() + int64(len(b.data))*b.frequency
}
func (b *buffer) firstWrite() int64 {
return b.start + (b.frequency / 2)
}
func (b *buffer) close() {}
// Return all known values from `from` to `to`. Gaps of information are represented as NaN.
// Simple linear interpolation is done between the two neighboring cells if possible.
// If values at the start or end are missing, instead of NaN values, the second and third
// return values contain the actual `from`/`to`.
// This function walks back the buffer chain if `from` is older than the current buffer's start.
// The loaded values are added to `data` and `data` is returned, possibly with a shorter length.
// If `data` is not long enough to hold all values, this function will panic!
func (b *buffer) read(from, to int64, data []schema.Float) ([]schema.Float, int64, int64, error) {
if from < b.firstWrite() {
if b.prev != nil {
return b.prev.read(from, to, data)
}
from = b.firstWrite()
}
i := 0
t := from
for ; t < to; t += b.frequency {
idx := int((t - b.start) / b.frequency)
if idx >= cap(b.data) {
if b.next == nil {
break
}
b = b.next
idx = 0
}
if idx >= len(b.data) {
if b.next == nil || to <= b.next.start {
break
}
data[i] += schema.NaN
} else if t < b.start {
data[i] += schema.NaN
// } else if b.data[idx].IsNaN() {
// data[i] += interpolate(idx, b.data)
} else {
data[i] += b.data[idx]
}
i++
}
return data[:i], from, t, nil
}
// Returns true if this buffer needs to be freed.
func (b *buffer) free(t int64) (delme bool, n int) {
if b.prev != nil {
delme, m := b.prev.free(t)
n += m
if delme {
b.prev.next = nil
if cap(b.prev.data) == BufferCap {
bufferPool.Put(b.prev)
}
b.prev = nil
}
}
end := b.end()
if end < t {
return true, n + 1
}
return false, n
}
// Call `callback` on every buffer that contains data in the range from `from` to `to`.
func (b *buffer) iterFromTo(from, to int64, callback func(b *buffer) error) error {
if b == nil {
return nil
}
if err := b.prev.iterFromTo(from, to, callback); err != nil {
return err
}
if from <= b.end() && b.start <= to {
return callback(b)
}
return nil
}
func (b *buffer) count() int64 {
res := int64(len(b.data))
if b.prev != nil {
res += b.prev.count()
}
return res
}

View File

@@ -1,783 +0,0 @@
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved. This file is part of cc-backend.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
package memorystore
import (
"bufio"
"context"
"encoding/json"
"errors"
"fmt"
"io/fs"
"os"
"path"
"path/filepath"
"runtime"
"sort"
"strconv"
"strings"
"sync"
"sync/atomic"
"time"
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
"github.com/ClusterCockpit/cc-lib/schema"
"github.com/linkedin/goavro/v2"
)
// File operation constants
const (
// CheckpointFilePerms defines default permissions for checkpoint files
CheckpointFilePerms = 0o644
// CheckpointDirPerms defines default permissions for checkpoint directories
CheckpointDirPerms = 0o755
// GCTriggerInterval determines how often GC is forced during checkpoint loading
// GC is triggered every GCTriggerInterval*NumWorkers loaded hosts
GCTriggerInterval = 100
)
// Whenever changed, update MarshalJSON as well!
type CheckpointMetrics struct {
Data []schema.Float `json:"data"`
Frequency int64 `json:"frequency"`
Start int64 `json:"start"`
}
type CheckpointFile struct {
Metrics map[string]*CheckpointMetrics `json:"metrics"`
Children map[string]*CheckpointFile `json:"children"`
From int64 `json:"from"`
To int64 `json:"to"`
}
var lastCheckpoint time.Time
func Checkpointing(wg *sync.WaitGroup, ctx context.Context) {
lastCheckpoint = time.Now()
if Keys.Checkpoints.FileFormat == "json" {
ms := GetMemoryStore()
go func() {
defer wg.Done()
d, err := time.ParseDuration(Keys.Checkpoints.Interval)
if err != nil {
cclog.Fatal(err)
}
if d <= 0 {
return
}
ticks := func() <-chan time.Time {
if d <= 0 {
return nil
}
return time.NewTicker(d).C
}()
for {
select {
case <-ctx.Done():
return
case <-ticks:
cclog.Infof("[METRICSTORE]> start checkpointing (starting at %s)...", lastCheckpoint.Format(time.RFC3339))
now := time.Now()
n, err := ms.ToCheckpoint(Keys.Checkpoints.RootDir,
lastCheckpoint.Unix(), now.Unix())
if err != nil {
cclog.Errorf("[METRICSTORE]> checkpointing failed: %s", err.Error())
} else {
cclog.Infof("[METRICSTORE]> done: %d checkpoint files created", n)
lastCheckpoint = now
}
}
}
}()
} else {
go func() {
defer wg.Done()
d, _ := time.ParseDuration("1m")
select {
case <-ctx.Done():
return
case <-time.After(time.Duration(CheckpointBufferMinutes) * time.Minute):
// This is the first tick: wait until we have collected data for the given number of minutes.
GetAvroStore().ToCheckpoint(Keys.Checkpoints.RootDir, false)
// log.Printf("Checkpointing %d avro files", count)
}
ticks := func() <-chan time.Time {
if d <= 0 {
return nil
}
return time.NewTicker(d).C
}()
for {
select {
case <-ctx.Done():
return
case <-ticks:
// Regular ticks of 1 minute to write data.
GetAvroStore().ToCheckpoint(Keys.Checkpoints.RootDir, false)
// log.Printf("Checkpointing %d avro files", count)
}
}
}()
}
}
// As `Float` implements a custom MarshalJSON() function,
// serializing an array of such types has more overhead
// than one would assume (because of extra allocations, interfaces and so on).
func (cm *CheckpointMetrics) MarshalJSON() ([]byte, error) {
buf := make([]byte, 0, 128+len(cm.Data)*8)
buf = append(buf, `{"frequency":`...)
buf = strconv.AppendInt(buf, cm.Frequency, 10)
buf = append(buf, `,"start":`...)
buf = strconv.AppendInt(buf, cm.Start, 10)
buf = append(buf, `,"data":[`...)
for i, x := range cm.Data {
if i != 0 {
buf = append(buf, ',')
}
if x.IsNaN() {
buf = append(buf, `null`...)
} else {
buf = strconv.AppendFloat(buf, float64(x), 'f', 1, 32)
}
}
buf = append(buf, `]}`...)
return buf, nil
}
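// Example encoding: frequency 60, start 1700000000 and the series
// [0.1, NaN, 0.3] serialize as
// {"frequency":60,"start":1700000000,"data":[0.1,null,0.3]}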
// Metrics stored at the lowest 2 levels are not stored away (root and cluster)!
// On a per-host basis a new JSON file is created. I have no idea if this will scale.
// The good thing: Only one host at a time is locked, so this function can run
// in parallel to writes/reads.
func (m *MemoryStore) ToCheckpoint(dir string, from, to int64) (int, error) {
levels := make([]*Level, 0)
selectors := make([][]string, 0)
m.root.lock.RLock()
for sel1, l1 := range m.root.children {
l1.lock.RLock()
for sel2, l2 := range l1.children {
levels = append(levels, l2)
selectors = append(selectors, []string{sel1, sel2})
}
l1.lock.RUnlock()
}
m.root.lock.RUnlock()
type workItem struct {
level *Level
dir string
selector []string
}
n, errs := int32(0), int32(0)
var wg sync.WaitGroup
wg.Add(Keys.NumWorkers)
work := make(chan workItem, Keys.NumWorkers*2)
for worker := 0; worker < Keys.NumWorkers; worker++ {
go func() {
defer wg.Done()
for workItem := range work {
if err := workItem.level.toCheckpoint(workItem.dir, from, to, m); err != nil {
if err == ErrNoNewArchiveData {
continue
}
cclog.Errorf("[METRICSTORE]> error while checkpointing %#v: %s", workItem.selector, err.Error())
atomic.AddInt32(&errs, 1)
} else {
atomic.AddInt32(&n, 1)
}
}
}()
}
for i := 0; i < len(levels); i++ {
dir := path.Join(dir, path.Join(selectors[i]...))
work <- workItem{
level: levels[i],
dir: dir,
selector: selectors[i],
}
}
close(work)
wg.Wait()
if errs > 0 {
return int(n), fmt.Errorf("[METRICSTORE]> %d errors happened while creating checkpoints (%d successes)", errs, n)
}
return int(n), nil
}
func (l *Level) toCheckpointFile(from, to int64, m *MemoryStore) (*CheckpointFile, error) {
l.lock.RLock()
defer l.lock.RUnlock()
retval := &CheckpointFile{
From: from,
To: to,
Metrics: make(map[string]*CheckpointMetrics),
Children: make(map[string]*CheckpointFile),
}
for metric, minfo := range m.Metrics {
b := l.metrics[minfo.offset]
if b == nil {
continue
}
allArchived := true
b.iterFromTo(from, to, func(b *buffer) error {
if !b.archived {
allArchived = false
}
return nil
})
if allArchived {
continue
}
data := make([]schema.Float, (to-from)/b.frequency+1)
data, start, end, err := b.read(from, to, data)
if err != nil {
return nil, err
}
for i := int((end - start) / b.frequency); i < len(data); i++ {
data[i] = schema.NaN
}
retval.Metrics[metric] = &CheckpointMetrics{
Frequency: b.frequency,
Start: start,
Data: data,
}
}
for name, child := range l.children {
val, err := child.toCheckpointFile(from, to, m)
if err != nil {
return nil, err
}
if val != nil {
retval.Children[name] = val
}
}
if len(retval.Children) == 0 && len(retval.Metrics) == 0 {
return nil, nil
}
return retval, nil
}
func (l *Level) toCheckpoint(dir string, from, to int64, m *MemoryStore) error {
cf, err := l.toCheckpointFile(from, to, m)
if err != nil {
return err
}
if cf == nil {
return ErrNoNewArchiveData
}
filepath := path.Join(dir, fmt.Sprintf("%d.json", from))
f, err := os.OpenFile(filepath, os.O_CREATE|os.O_WRONLY, CheckpointFilePerms)
if err != nil && os.IsNotExist(err) {
err = os.MkdirAll(dir, CheckpointDirPerms)
if err == nil {
f, err = os.OpenFile(filepath, os.O_CREATE|os.O_WRONLY, CheckpointFilePerms)
}
}
if err != nil {
return err
}
defer f.Close()
bw := bufio.NewWriter(f)
if err = json.NewEncoder(bw).Encode(cf); err != nil {
return err
}
return bw.Flush()
}
func (m *MemoryStore) FromCheckpoint(dir string, from int64, extension string) (int, error) {
var wg sync.WaitGroup
work := make(chan [2]string, Keys.NumWorkers)
n, errs := int32(0), int32(0)
wg.Add(Keys.NumWorkers)
for worker := 0; worker < Keys.NumWorkers; worker++ {
go func() {
defer wg.Done()
for host := range work {
lvl := m.root.findLevelOrCreate(host[:], len(m.Metrics))
nn, err := lvl.fromCheckpoint(m, filepath.Join(dir, host[0], host[1]), from, extension)
if err != nil {
cclog.Fatalf("[METRICSTORE]> error while loading checkpoints: %s", err.Error())
atomic.AddInt32(&errs, 1)
}
atomic.AddInt32(&n, int32(nn))
}
}()
}
i := 0
clustersDir, err := os.ReadDir(dir)
for _, clusterDir := range clustersDir {
if !clusterDir.IsDir() {
err = errors.New("[METRICSTORE]> expected only directories at first level of checkpoints/ directory")
goto done
}
hostsDir, e := os.ReadDir(filepath.Join(dir, clusterDir.Name()))
if e != nil {
err = e
goto done
}
for _, hostDir := range hostsDir {
if !hostDir.IsDir() {
err = errors.New("[METRICSTORE]> expected only directories at second level of checkpoints/ directory")
goto done
}
i++
if i%Keys.NumWorkers == 0 && i > GCTriggerInterval {
// Forcing garbage collection runs here regularly during the loading of checkpoints
// will decrease the total heap size after loading everything back to memory is done.
// While loading data, the heap will grow fast, so the GC target size will double
// almost always. By forcing GCs here, we can keep it growing more slowly so that
// at the end, less memory is wasted.
runtime.GC()
}
work <- [2]string{clusterDir.Name(), hostDir.Name()}
}
}
done:
close(work)
wg.Wait()
if err != nil {
return int(n), err
}
if errs > 0 {
return int(n), fmt.Errorf("[METRICSTORE]> %d errors happened while creating checkpoints (%d successes)", errs, n)
}
return int(n), nil
}
// Metrics stored at the lowest 2 levels are not loaded (root and cluster)!
// This function can only be called once and before the very first write or read.
// Different host's data is loaded to memory in parallel.
func (m *MemoryStore) FromCheckpointFiles(dir string, from int64) (int, error) {
if _, err := os.Stat(dir); os.IsNotExist(err) {
// The directory does not exist, create it (including parents)
err := os.MkdirAll(dir, CheckpointDirPerms)
if err != nil {
cclog.Fatalf("[METRICSTORE]> Error creating directory: %v\n", err)
}
cclog.Debugf("[METRICSTORE]> Directory %s created successfully", dir)
}
// Determine the configured checkpoint file format, defaulting to "avro"
fileFormat := Keys.Checkpoints.FileFormat
if fileFormat == "" {
fileFormat = "avro"
}
// Map to easily get the fallback format
oppositeFormat := map[string]string{
"json": "avro",
"avro": "json",
}
// First, attempt to load the specified format
if found, err := checkFilesWithExtension(dir, fileFormat); err != nil {
return 0, fmt.Errorf("[METRICSTORE]> error checking files with extension: %v", err)
} else if found {
cclog.Infof("[METRICSTORE]> Loading %s files because fileformat is %s", fileFormat, fileFormat)
return m.FromCheckpoint(dir, from, fileFormat)
}
// If not found, attempt the opposite format
altFormat := oppositeFormat[fileFormat]
if found, err := checkFilesWithExtension(dir, altFormat); err != nil {
return 0, fmt.Errorf("[METRICSTORE]> error checking files with extension: %v", err)
} else if found {
cclog.Infof("[METRICSTORE]> Loading %s files but fileformat is %s", altFormat, fileFormat)
return m.FromCheckpoint(dir, from, altFormat)
}
cclog.Print("[METRICSTORE]> No valid checkpoint files found in the directory")
return 0, nil
}
func checkFilesWithExtension(dir string, extension string) (bool, error) {
found := false
err := filepath.Walk(dir, func(path string, info os.FileInfo, err error) error {
if err != nil {
return fmt.Errorf("[METRICSTORE]> error accessing path %s: %v", path, err)
}
if !info.IsDir() && filepath.Ext(info.Name()) == "."+extension {
found = true
return nil
}
return nil
})
if err != nil {
return false, fmt.Errorf("[METRICSTORE]> error walking through directories: %s", err)
}
return found, nil
}
func (l *Level) loadAvroFile(m *MemoryStore, f *os.File, from int64) error {
br := bufio.NewReader(f)
fileName := f.Name()[strings.LastIndex(f.Name(), "/")+1:]
resolution, err := strconv.ParseInt(fileName[0:strings.Index(fileName, "_")], 10, 64)
if err != nil {
return fmt.Errorf("[METRICSTORE]> error while reading avro file (resolution parsing) : %s", err)
}
fromTimestamp, err := strconv.ParseInt(fileName[strings.Index(fileName, "_")+1:len(fileName)-5], 10, 64)
if err != nil {
return fmt.Errorf("[METRICSTORE]> error converting timestamp from the avro file : %s", err)
}
// Shift by half a resolution step, matching the offset used on the line-protocol write path
fromTimestamp -= (resolution / 2)
// fmt.Printf("File : %s with resolution : %d\n", fileName, resolution)
var recordCounter int64 = 0
// Create a new OCF reader from the buffered reader
ocfReader, err := goavro.NewOCFReader(br)
if err != nil {
return fmt.Errorf("[METRICSTORE]> error creating OCF reader: %w", err)
}
metricsData := make(map[string]schema.FloatArray)
for ocfReader.Scan() {
datum, err := ocfReader.Read()
if err != nil {
return fmt.Errorf("[METRICSTORE]> error while reading avro file : %s", err)
}
record, ok := datum.(map[string]any)
if !ok {
return fmt.Errorf("[METRICSTORE]> failed to assert datum as map[string]interface{}")
}
for key, value := range record {
fv, ok := value.(float64)
if !ok {
return fmt.Errorf("[METRICSTORE]> unexpected value type %T for key %s in avro record", value, key)
}
metricsData[key] = append(metricsData[key], schema.ConvertToFloat(fv))
}
recordCounter += 1
}
// Assumes resolution divides 60; 60/resolution is the number of records per minute.
to := (fromTimestamp + (recordCounter / (60 / resolution) * 60))
if to < from {
return nil
}
for key, floatArray := range metricsData {
metricName := ReplaceKey(key)
if strings.Contains(metricName, Delimiter) {
subString := strings.Split(metricName, Delimiter)
lvl := l
for i := 0; i < len(subString)-1; i++ {
sel := subString[i]
if lvl.children == nil {
lvl.children = make(map[string]*Level)
}
child, ok := lvl.children[sel]
if !ok {
child = &Level{
metrics: make([]*buffer, len(m.Metrics)),
children: nil,
}
lvl.children[sel] = child
}
lvl = child
}
leafMetricName := subString[len(subString)-1]
err = lvl.createBuffer(m, leafMetricName, floatArray, fromTimestamp, resolution)
if err != nil {
return fmt.Errorf("[METRICSTORE]> error while creating buffers from avroReader : %s", err)
}
} else {
err = l.createBuffer(m, metricName, floatArray, fromTimestamp, resolution)
if err != nil {
return fmt.Errorf("[METRICSTORE]> error while creating buffers from avroReader : %s", err)
}
}
}
return nil
}
func (l *Level) createBuffer(m *MemoryStore, metricName string, floatArray schema.FloatArray, from int64, resolution int64) error {
n := len(floatArray)
b := &buffer{
frequency: resolution,
start: from,
data: floatArray[0:n:n],
prev: nil,
next: nil,
archived: true,
}
b.close()
minfo, ok := m.Metrics[metricName]
if !ok {
return nil
// return errors.New("Unkown metric: " + name)
}
prev := l.metrics[minfo.offset]
if prev != nil {
if prev.start > b.start {
return fmt.Errorf("[METRICSTORE]> buffer start time %d is before previous buffer start %d", b.start, prev.start)
}
b.prev = prev
prev.next = b
missingCount := ((int(b.start) - int(prev.start)) - len(prev.data)*int(b.frequency))
if missingCount > 0 {
missingCount /= int(b.frequency)
for range missingCount {
prev.data = append(prev.data, schema.NaN)
}
prev.data = prev.data[0:len(prev.data):len(prev.data)]
}
}
l.metrics[minfo.offset] = b
return nil
}
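// Worked example for the gap handling above (illustrative numbers): if the
// previous buffer starts at t=1000 and holds 10 samples at frequency 60, it
// covers up to t=1600. A new buffer starting at t=1780 leaves
// missingCount = (1780-1000) - 10*60 = 180, i.e. 180/60 = 3 missing samples,
// so three NaN values are appended to the previous buffer before linking.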
func (l *Level) loadJSONFile(m *MemoryStore, f *os.File, from int64) error {
br := bufio.NewReader(f)
cf := &CheckpointFile{}
if err := json.NewDecoder(br).Decode(cf); err != nil {
return err
}
if cf.To != 0 && cf.To < from {
return nil
}
if err := l.loadFile(cf, m); err != nil {
return err
}
return nil
}
func (l *Level) loadFile(cf *CheckpointFile, m *MemoryStore) error {
for name, metric := range cf.Metrics {
n := len(metric.Data)
b := &buffer{
frequency: metric.Frequency,
start: metric.Start,
data: metric.Data[0:n:n], // Space is wasted here :(
prev: nil,
next: nil,
archived: true,
}
b.close()
minfo, ok := m.Metrics[name]
if !ok {
continue
// return errors.New("Unkown metric: " + name)
}
prev := l.metrics[minfo.offset]
if prev != nil {
if prev.start > b.start {
return fmt.Errorf("[METRICSTORE]> buffer start time %d is before previous buffer start %d", b.start, prev.start)
}
b.prev = prev
prev.next = b
}
l.metrics[minfo.offset] = b
}
if len(cf.Children) > 0 && l.children == nil {
l.children = make(map[string]*Level)
}
for sel, childCf := range cf.Children {
child, ok := l.children[sel]
if !ok {
child = &Level{
metrics: make([]*buffer, len(m.Metrics)),
children: nil,
}
l.children[sel] = child
}
if err := child.loadFile(childCf, m); err != nil {
return err
}
}
return nil
}
func (l *Level) fromCheckpoint(m *MemoryStore, dir string, from int64, extension string) (int, error) {
direntries, err := os.ReadDir(dir)
if err != nil {
if os.IsNotExist(err) {
return 0, nil
}
return 0, err
}
allFiles := make([]fs.DirEntry, 0)
filesLoaded := 0
for _, e := range direntries {
if e.IsDir() {
child := &Level{
metrics: make([]*buffer, len(m.Metrics)),
children: make(map[string]*Level),
}
files, err := child.fromCheckpoint(m, path.Join(dir, e.Name()), from, extension)
filesLoaded += files
if err != nil {
return filesLoaded, err
}
l.children[e.Name()] = child
} else if strings.HasSuffix(e.Name(), "."+extension) {
allFiles = append(allFiles, e)
} else {
continue
}
}
files, err := findFiles(allFiles, from, extension, true)
if err != nil {
return filesLoaded, err
}
loaders := map[string]func(*MemoryStore, *os.File, int64) error{
"json": l.loadJSONFile,
"avro": l.loadAvroFile,
}
loader := loaders[extension]
for _, filename := range files {
// Use a closure to ensure file is closed immediately after use
err := func() error {
f, err := os.Open(path.Join(dir, filename))
if err != nil {
return err
}
defer f.Close()
return loader(m, f, from)
}()
if err != nil {
return filesLoaded, err
}
filesLoaded += 1
}
return filesLoaded, nil
}
// This will probably get very slow over time!
// A solution could be some sort of index file in which all other files
// and the timespans they cover are listed.
func findFiles(direntries []fs.DirEntry, t int64, extension string, findMoreRecentFiles bool) ([]string, error) {
nums := map[string]int64{}
for _, e := range direntries {
if !strings.HasSuffix(e.Name(), "."+extension) {
continue
}
// Works for both "<resolution>_<ts>.avro" and "<ts>.json" names: strings.Index
// returns -1 when there is no "_", so the timestamp slice then starts at 0.
ts, err := strconv.ParseInt(e.Name()[strings.Index(e.Name(), "_")+1:len(e.Name())-5], 10, 64)
if err != nil {
return nil, err
}
nums[e.Name()] = ts
}
sort.Slice(direntries, func(i, j int) bool {
a, b := direntries[i], direntries[j]
return nums[a.Name()] < nums[b.Name()]
})
filenames := make([]string, 0)
for i := range direntries {
e := direntries[i]
ts1 := nums[e.Name()]
if findMoreRecentFiles && t <= ts1 {
filenames = append(filenames, e.Name())
}
if i == len(direntries)-1 {
continue
}
enext := direntries[i+1]
ts2 := nums[enext.Name()]
if findMoreRecentFiles {
if ts1 < t && t < ts2 {
filenames = append(filenames, e.Name())
}
} else {
if ts2 < t {
filenames = append(filenames, e.Name())
}
}
}
return filenames, nil
}
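// Worked example (hypothetical file names): given 100.json, 200.json and
// 300.json with t=250 and findMoreRecentFiles=true, 200.json is selected
// because 200 < 250 < 300 (it may still cover t), and 300.json is selected
// because t <= 300. With findMoreRecentFiles=false, only 100.json is
// selected, since its successor starts at 200 < 250, meaning it lies
// entirely before t.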

View File

@@ -1,121 +0,0 @@
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved. This file is part of cc-backend.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
package memorystore
import (
"fmt"
)
var InternalCCMSFlag bool = false
type MetricStoreConfig struct {
// Number of concurrent workers for checkpoint and archive operations.
// If not set or 0, defaults to min(runtime.NumCPU()/2+1, 10)
NumWorkers int `json:"num-workers"`
Checkpoints struct {
FileFormat string `json:"file-format"`
Interval string `json:"interval"`
RootDir string `json:"directory"`
Restore string `json:"restore"`
} `json:"checkpoints"`
Debug struct {
DumpToFile string `json:"dump-to-file"`
EnableGops bool `json:"gops"`
} `json:"debug"`
RetentionInMemory string `json:"retention-in-memory"`
Archive struct {
Interval string `json:"interval"`
RootDir string `json:"directory"`
DeleteInstead bool `json:"delete-instead"`
} `json:"archive"`
Nats []*NatsConfig `json:"nats"`
}
type NatsConfig struct {
// Address of the nats server
Address string `json:"address"`
// Username/Password, optional
Username string `json:"username"`
Password string `json:"password"`
// Creds file path
Credsfilepath string `json:"creds-file-path"`
Subscriptions []struct {
// Channel name
SubscribeTo string `json:"subscribe-to"`
// Allow lines without a cluster tag, use this as default, optional
ClusterTag string `json:"cluster-tag"`
} `json:"subscriptions"`
}
var Keys MetricStoreConfig
// AggregationStrategy for aggregation over multiple values at different cpus/sockets/..., not time!
type AggregationStrategy int
const (
NoAggregation AggregationStrategy = iota
SumAggregation
AvgAggregation
)
func AssignAggregationStrategy(str string) (AggregationStrategy, error) {
switch str {
case "":
return NoAggregation, nil
case "sum":
return SumAggregation, nil
case "avg":
return AvgAggregation, nil
default:
return NoAggregation, fmt.Errorf("[METRICSTORE]> unknown aggregation strategy: %s", str)
}
}
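// Usage sketch (hypothetical): mapping the aggregation string from a metric
// configuration onto the internal constant:
//
//	agg, err := AssignAggregationStrategy("avg")
//	// err == nil, agg == AvgAggregation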
type MetricConfig struct {
// Interval in seconds at which measurements are stored
Frequency int64
// Can be 'sum', 'avg' or null. Describes how to aggregate metrics from the same timestep over the hierarchy.
Aggregation AggregationStrategy
// Private, used internally...
offset int
}
var Metrics map[string]MetricConfig
func GetMetricFrequency(metricName string) (int64, error) {
if metric, ok := Metrics[metricName]; ok {
return metric.Frequency, nil
}
return 0, fmt.Errorf("[METRICSTORE]> metric %s not found", metricName)
}
// AddMetric registers a metric in the Metrics map. If a metric with the same
// name already exists, its entry is kept but upgraded to the higher of the two
// frequencies; it is never downgraded.
func AddMetric(name string, metric MetricConfig) error {
if Metrics == nil {
Metrics = make(map[string]MetricConfig)
}
if existingMetric, ok := Metrics[name]; ok {
if existingMetric.Frequency != metric.Frequency {
if existingMetric.Frequency < metric.Frequency {
existingMetric.Frequency = metric.Frequency
Metrics[name] = existingMetric
}
}
} else {
Metrics[name] = metric
}
return nil
}
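// Usage sketch (hypothetical values): registering the same metric twice keeps
// the higher frequency, so redundant cluster/subcluster definitions converge:
//
//	AddMetric("flops_any", MetricConfig{Frequency: 30, Aggregation: AvgAggregation})
//	AddMetric("flops_any", MetricConfig{Frequency: 60, Aggregation: AvgAggregation})
//	// Metrics["flops_any"].Frequency == 60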

View File

@@ -1,95 +0,0 @@
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved. This file is part of cc-backend.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
package memorystore
const configSchema = `{
"type": "object",
"description": "Configuration specific to built-in metric-store.",
"properties": {
"checkpoints": {
"description": "Configuration for checkpointing the metrics within metric-store",
"type": "object",
"properties": {
"file-format": {
"description": "Specify the type of checkpoint file. There are 2 variants: 'avro' and 'json'. If nothing is specified, 'avro' is default.",
"type": "string"
},
"interval": {
"description": "Interval at which the metrics should be checkpointed.",
"type": "string"
},
"directory": {
"description": "Specify the parent directy in which the checkpointed files should be placed.",
"type": "string"
},
"restore": {
"description": "When cc-backend starts up, look for checkpointed files that are less than X hours old and load metrics from these selected checkpoint files.",
"type": "string"
}
}
},
"archive": {
"description": "Configuration for archiving the already checkpointed files.",
"type": "object",
"properties": {
"interval": {
"description": "Interval at which the checkpointed files should be archived.",
"type": "string"
},
"directory": {
"description": "Specify the parent directy in which the archived files should be placed.",
"type": "string"
}
}
},
"retention-in-memory": {
"description": "Keep the metrics within memory for given time interval. Retention for X hours, then the metrics would be freed.",
"type": "string"
},
"nats": {
"description": "Configuration for accepting published data through NATS.",
"type": "array",
"items": {
"type": "object",
"properties": {
"address": {
"description": "Address of the NATS server.",
"type": "string"
},
"username": {
"description": "Optional: If configured with username/password method.",
"type": "string"
},
"password": {
"description": "Optional: If configured with username/password method.",
"type": "string"
},
"creds-file-path": {
"description": "Optional: If configured with Credential File method. Path to your NATS cred file.",
"type": "string"
},
"subscriptions": {
"description": "Array of various subscriptions. Allows to subscibe to different subjects and publishers.",
"type": "array",
"items": {
"type": "object",
"properties": {
"subscribe-to": {
"description": "Channel name",
"type": "string"
},
"cluster-tag": {
"description": "Optional: Allow lines without a cluster tag, use this as default",
"type": "string"
}
}
}
}
}
}
}
}
}`
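// Example configuration matching the schema above (illustrative values only):
//
//	"metric-store": {
//	  "checkpoints": {
//	    "file-format": "avro",
//	    "interval": "12h",
//	    "directory": "./var/checkpoints",
//	    "restore": "48h"
//	  },
//	  "archive": {
//	    "interval": "168h",
//	    "directory": "./var/archive"
//	  },
//	  "retention-in-memory": "48h",
//	  "nats": [
//	    {
//	      "address": "nats://localhost:4222",
//	      "subscriptions": [
//	        { "subscribe-to": "updates", "cluster-tag": "testcluster" }
//	      ]
//	    }
//	  ]
//	}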

View File

@@ -1,112 +0,0 @@
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved. This file is part of cc-backend.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
package memorystore
import (
"bufio"
"fmt"
"strconv"
)
func (b *buffer) debugDump(buf []byte) []byte {
if b.prev != nil {
buf = b.prev.debugDump(buf)
}
start, n, end := b.start, len(b.data), b.start+b.frequency*int64(len(b.data))
buf = append(buf, `{"start":`...)
buf = strconv.AppendInt(buf, start, 10)
buf = append(buf, `,"len":`...)
buf = strconv.AppendInt(buf, int64(n), 10)
buf = append(buf, `,"end":`...)
buf = strconv.AppendInt(buf, end, 10)
if b.archived {
buf = append(buf, `,"saved":true`...)
}
if b.next != nil {
buf = append(buf, `},`...)
} else {
buf = append(buf, `}`...)
}
return buf
}
func (l *Level) debugDump(m *MemoryStore, w *bufio.Writer, lvlname string, buf []byte, depth int) ([]byte, error) {
l.lock.RLock()
defer l.lock.RUnlock()
for i := 0; i < depth; i++ {
buf = append(buf, '\t')
}
buf = append(buf, '"')
buf = append(buf, lvlname...)
buf = append(buf, "\":{\n"...)
depth += 1
objitems := 0
for name, mc := range m.Metrics {
if b := l.metrics[mc.offset]; b != nil {
for i := 0; i < depth; i++ {
buf = append(buf, '\t')
}
buf = append(buf, '"')
buf = append(buf, name...)
buf = append(buf, `":[`...)
buf = b.debugDump(buf)
buf = append(buf, "],\n"...)
objitems++
}
}
for name, lvl := range l.children {
_, err := w.Write(buf)
if err != nil {
return nil, err
}
buf = buf[0:0]
buf, err = lvl.debugDump(m, w, name, buf, depth)
if err != nil {
return nil, err
}
buf = append(buf, ',', '\n')
objitems++
}
// remove final `,`:
if objitems > 0 {
buf = append(buf[0:len(buf)-1], '\n')
}
depth -= 1
for i := 0; i < depth; i++ {
buf = append(buf, '\t')
}
buf = append(buf, '}')
return buf, nil
}
func (m *MemoryStore) DebugDump(w *bufio.Writer, selector []string) error {
lvl := m.root.findLevel(selector)
if lvl == nil {
return fmt.Errorf("[METRICSTORE]> not found: %#v", selector)
}
buf := make([]byte, 0, 2048)
buf = append(buf, "{"...)
buf, err := lvl.debugDump(m, w, "data", buf, 0)
if err != nil {
return err
}
buf = append(buf, "}\n"...)
if _, err = w.Write(buf); err != nil {
return err
}
return w.Flush()
}

View File

@@ -1,92 +0,0 @@
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved. This file is part of cc-backend.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
package memorystore
import (
"bufio"
"fmt"
"time"
)
// MaxMissingDataPoints allows a node to be considered healthy even though a
// certain number of data points are missing. If a buffer's newest entry is at
// most MaxMissingDataPoints steps old, the healthCheck endpoint still treats
// that metric as healthy; anything older marks the metric as stale.
const MaxMissingDataPoints int64 = 5
// MaxUnhealthyMetrics works together with MaxMissingDataPoints: as long as
// fewer than MaxUnhealthyMetrics metrics (including submetrics) on a node are
// stale, the node is still deemed healthy; at or above that count the node is
// reported as unhealthy.
const MaxUnhealthyMetrics int64 = 5
func (b *buffer) healthCheck() int64 {
// Check if the buffer is empty
if b.data == nil {
return 1
}
bufferEnd := b.start + b.frequency*int64(len(b.data))
t := time.Now().Unix()
// Check if the buffer is too old
if t-bufferEnd > MaxMissingDataPoints*b.frequency {
return 1
}
return 0
}
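// Worked example (illustrative numbers): for a metric with a frequency of 60s
// and MaxMissingDataPoints = 5, a buffer whose last sample ended more than
// 5*60 = 300 seconds ago counts as stale (returns 1); anything newer counts
// as healthy (returns 0).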
func (l *Level) healthCheck(m *MemoryStore, count int64) (int64, error) {
l.lock.RLock()
defer l.lock.RUnlock()
for _, mc := range m.Metrics {
if b := l.metrics[mc.offset]; b != nil {
count += b.healthCheck()
}
}
for _, lvl := range l.children {
c, err := lvl.healthCheck(m, 0)
if err != nil {
return 0, err
}
count += c
}
return count, nil
}
func (m *MemoryStore) HealthCheck(w *bufio.Writer, selector []string) error {
lvl := m.root.findLevel(selector)
if lvl == nil {
return fmt.Errorf("[METRICSTORE]> not found: %#v", selector)
}
buf := make([]byte, 0, 25)
// buf = append(buf, "{"...)
var count int64 = 0
unhealthyMetricsCount, err := lvl.healthCheck(m, count)
if err != nil {
return err
}
if unhealthyMetricsCount < MaxUnhealthyMetrics {
buf = append(buf, "Healthy"...)
} else {
buf = append(buf, "Unhealthy"...)
}
// buf = append(buf, "}\n"...)
if _, err = w.Write(buf); err != nil {
return err
}
return w.Flush()
}

View File

@@ -1,192 +0,0 @@
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved. This file is part of cc-backend.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
package memorystore
import (
"sync"
"unsafe"
"github.com/ClusterCockpit/cc-lib/util"
)
// Could also be called "node" as this forms a node in a tree structure.
// Called Level because "node" might be confusing here.
// Can be either a leaf or an inner node. In this tree structure, inner nodes
// can also hold data (in `metrics`).
type Level struct {
children map[string]*Level
metrics []*buffer
lock sync.RWMutex
}
// Find the correct level for the given selector, creating it if
// it does not exist. Example selector in the context of the
// ClusterCockpit could be: []string{ "emmy", "host123", "cpu0" }.
// This function would probably benefit a lot from `level.children` being a `sync.Map`.
func (l *Level) findLevelOrCreate(selector []string, nMetrics int) *Level {
if len(selector) == 0 {
return l
}
// Allow concurrent reads:
l.lock.RLock()
var child *Level
var ok bool
if l.children == nil {
// Children map needs to be created...
l.lock.RUnlock()
} else {
child, ok = l.children[selector[0]]
l.lock.RUnlock()
if ok {
return child.findLevelOrCreate(selector[1:], nMetrics)
}
}
// The level does not exist, take the write lock for unique access:
l.lock.Lock()
// While this thread waited for the write lock, another thread
// could have created the child node.
if l.children != nil {
child, ok = l.children[selector[0]]
if ok {
l.lock.Unlock()
return child.findLevelOrCreate(selector[1:], nMetrics)
}
}
child = &Level{
metrics: make([]*buffer, nMetrics),
children: nil,
}
if l.children != nil {
l.children[selector[0]] = child
} else {
l.children = map[string]*Level{selector[0]: child}
}
l.lock.Unlock()
return child.findLevelOrCreate(selector[1:], nMetrics)
}
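// Usage sketch (hypothetical selector): resolving the level for cpu0 on
// host123 of cluster emmy, creating missing intermediate levels on the way:
//
//	lvl := root.findLevelOrCreate([]string{"emmy", "host123", "cpu0"}, nMetrics)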
func (l *Level) free(t int64) (int, error) {
l.lock.Lock()
defer l.lock.Unlock()
n := 0
for i, b := range l.metrics {
if b != nil {
delme, m := b.free(t)
n += m
if delme {
if cap(b.data) == BufferCap {
bufferPool.Put(b)
}
l.metrics[i] = nil
}
}
}
for _, l := range l.children {
m, err := l.free(t)
n += m
if err != nil {
return n, err
}
}
return n, nil
}
func (l *Level) sizeInBytes() int64 {
l.lock.RLock()
defer l.lock.RUnlock()
size := int64(0)
for _, b := range l.metrics {
if b != nil {
size += b.count() * int64(unsafe.Sizeof(util.Float(0)))
}
}
for _, child := range l.children {
size += child.sizeInBytes()
}
return size
}
func (l *Level) findLevel(selector []string) *Level {
if len(selector) == 0 {
return l
}
l.lock.RLock()
defer l.lock.RUnlock()
lvl := l.children[selector[0]]
if lvl == nil {
return nil
}
return lvl.findLevel(selector[1:])
}
func (l *Level) findBuffers(selector util.Selector, offset int, f func(b *buffer) error) error {
l.lock.RLock()
defer l.lock.RUnlock()
if len(selector) == 0 {
b := l.metrics[offset]
if b != nil {
return f(b)
}
for _, lvl := range l.children {
err := lvl.findBuffers(nil, offset, f)
if err != nil {
return err
}
}
return nil
}
sel := selector[0]
if len(sel.String) != 0 && l.children != nil {
lvl, ok := l.children[sel.String]
if ok {
err := lvl.findBuffers(selector[1:], offset, f)
if err != nil {
return err
}
}
return nil
}
if sel.Group != nil && l.children != nil {
for _, key := range sel.Group {
lvl, ok := l.children[key]
if ok {
err := lvl.findBuffers(selector[1:], offset, f)
if err != nil {
return err
}
}
}
return nil
}
if sel.Any && l.children != nil {
for _, lvl := range l.children {
if err := lvl.findBuffers(selector[1:], offset, f); err != nil {
return err
}
}
return nil
}
return nil
}

View File

@@ -1,351 +0,0 @@
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved. This file is part of cc-backend.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
package memorystore
import (
"context"
"fmt"
"sync"
"time"
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
"github.com/ClusterCockpit/cc-lib/schema"
"github.com/influxdata/line-protocol/v2/lineprotocol"
"github.com/nats-io/nats.go"
)
// Each connection is handled in its own goroutine. This is a blocking function.
// func ReceiveRaw(ctx context.Context,
// listener net.Listener,
// handleLine func(*lineprotocol.Decoder, string) error,
// ) error {
// var wg sync.WaitGroup
// wg.Add(1)
// go func() {
// defer wg.Done()
// <-ctx.Done()
// if err := listener.Close(); err != nil {
// log.Printf("listener.Close(): %s", err.Error())
// }
// }()
// for {
// conn, err := listener.Accept()
// if err != nil {
// if errors.Is(err, net.ErrClosed) {
// break
// }
// log.Printf("listener.Accept(): %s", err.Error())
// }
// wg.Add(2)
// go func() {
// defer wg.Done()
// defer conn.Close()
// dec := lineprotocol.NewDecoder(conn)
// connctx, cancel := context.WithCancel(context.Background())
// defer cancel()
// go func() {
// defer wg.Done()
// select {
// case <-connctx.Done():
// conn.Close()
// case <-ctx.Done():
// conn.Close()
// }
// }()
// if err := handleLine(dec, "default"); err != nil {
// if errors.Is(err, net.ErrClosed) {
// return
// }
// log.Printf("%s: %s", conn.RemoteAddr().String(), err.Error())
// errmsg := make([]byte, 128)
// errmsg = append(errmsg, `error: `...)
// errmsg = append(errmsg, err.Error()...)
// errmsg = append(errmsg, '\n')
// conn.Write(errmsg)
// }
// }()
// }
// wg.Wait()
// return nil
// }
// ReceiveNats connects to a NATS server and subscribes to the configured
// subjects. This is a blocking function. DecodeLine is called for each line
// received via NATS. Cancel the context for graceful termination.
func ReceiveNats(conf *(NatsConfig),
ms *MemoryStore,
workers int,
ctx context.Context,
) error {
var opts []nats.Option
if conf.Username != "" && conf.Password != "" {
opts = append(opts, nats.UserInfo(conf.Username, conf.Password))
}
if conf.Credsfilepath != "" {
opts = append(opts, nats.UserCredentials(conf.Credsfilepath))
}
nc, err := nats.Connect(conf.Address, opts...)
if err != nil {
return err
}
defer nc.Close()
var wg sync.WaitGroup
var subs []*nats.Subscription
msgs := make(chan *nats.Msg, workers*2)
for _, sc := range conf.Subscriptions {
clusterTag := sc.ClusterTag
var sub *nats.Subscription
if workers > 1 {
wg.Add(workers)
for range workers {
go func() {
for m := range msgs {
dec := lineprotocol.NewDecoderWithBytes(m.Data)
if err := DecodeLine(dec, ms, clusterTag); err != nil {
cclog.Errorf("error: %s", err.Error())
}
}
wg.Done()
}()
}
sub, err = nc.Subscribe(sc.SubscribeTo, func(m *nats.Msg) {
msgs <- m
})
} else {
sub, err = nc.Subscribe(sc.SubscribeTo, func(m *nats.Msg) {
dec := lineprotocol.NewDecoderWithBytes(m.Data)
if err := DecodeLine(dec, ms, clusterTag); err != nil {
cclog.Errorf("error: %s", err.Error())
}
})
}
if err != nil {
return err
}
cclog.Infof("NATS subscription to '%s' on '%s' established", sc.SubscribeTo, conf.Address)
subs = append(subs, sub)
}
<-ctx.Done()
for _, sub := range subs {
err = sub.Unsubscribe()
if err != nil {
cclog.Errorf("NATS unsubscribe failed: %s", err.Error())
}
}
close(msgs)
wg.Wait()
nc.Close() // explicit close; the deferred Close above then becomes a no-op
cclog.Print("NATS connection closed")
return nil
}
// Place `prefix` in front of `buf`, doing it in place in `buf` if possible.
func reorder(buf, prefix []byte) []byte {
n := len(prefix)
m := len(buf)
if cap(buf) < m+n {
return append(prefix[:n:n], buf...)
} else {
buf = buf[:n+m]
for i := m - 1; i >= 0; i-- {
buf[i+n] = buf[i]
}
for i := range n {
buf[i] = prefix[i]
}
return buf
}
}
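// Worked example: reorder([]byte("42"), []byte("cpu")) yields "cpu42". If the
// backing array of buf has enough spare capacity, the existing bytes are
// shifted right in place and the prefix is copied into the freed space,
// avoiding an allocation.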
// Decode lines using dec and make write calls to the MemoryStore.
// If a line is missing its cluster tag, use clusterDefault as default.
func DecodeLine(dec *lineprotocol.Decoder,
ms *MemoryStore,
clusterDefault string,
) error {
// Reduce allocations in loop:
t := time.Now()
metric, metricBuf := Metric{}, make([]byte, 0, 16)
selector := make([]string, 0, 4)
typeBuf, subTypeBuf := make([]byte, 0, 16), make([]byte, 0)
// Optimize for the case where all lines in a "batch" are about the same
// cluster and host. By using `WriteToLevel` (level = host), we do not need
// to take the root- and cluster-level lock as often.
var lvl *Level = nil
prevCluster, prevHost := "", ""
var ok bool
for dec.Next() {
rawmeasurement, err := dec.Measurement()
if err != nil {
return err
}
// Needs to be copied because another call to dec.* would
// invalidate the returned slice.
metricBuf = append(metricBuf[:0], rawmeasurement...)
// The go compiler optimizes map[string(byteslice)] lookups:
metric.MetricConfig, ok = ms.Metrics[string(rawmeasurement)]
if !ok {
continue
}
typeBuf, subTypeBuf := typeBuf[:0], subTypeBuf[:0]
cluster, host := clusterDefault, ""
for {
key, val, err := dec.NextTag()
if err != nil {
return err
}
if key == nil {
break
}
// The go compiler optimizes string([]byte{...}) == "...":
switch string(key) {
case "cluster":
if string(val) == prevCluster {
cluster = prevCluster
} else {
cluster = string(val)
lvl = nil
}
case "hostname", "host":
if string(val) == prevHost {
host = prevHost
} else {
host = string(val)
lvl = nil
}
case "type":
if string(val) == "node" {
break
}
// We cannot be sure that the "type" tag comes before the "type-id" tag:
if len(typeBuf) == 0 {
typeBuf = append(typeBuf, val...)
} else {
typeBuf = reorder(typeBuf, val)
}
case "type-id":
typeBuf = append(typeBuf, val...)
case "subtype":
// We cannot be sure that the "subtype" tag comes before the "stype-id" tag:
if len(subTypeBuf) == 0 {
subTypeBuf = append(subTypeBuf, val...)
} else {
subTypeBuf = reorder(subTypeBuf, val)
}
case "stype-id":
subTypeBuf = append(subTypeBuf, val...)
default:
// Ignore unknown tags (cc-metric-collector might send us a unit, for example, that we do not need)
// return fmt.Errorf("unknown tag: '%s' (value: '%s')", string(key), string(val))
}
}
// If the cluster or host changed, the lvl was set to nil
if lvl == nil {
selector = selector[:2]
selector[0], selector[1] = cluster, host
lvl = ms.GetLevel(selector)
prevCluster, prevHost = cluster, host
}
// subtypes:
selector = selector[:0]
if len(typeBuf) > 0 {
selector = append(selector, string(typeBuf)) // <- Allocation :(
if len(subTypeBuf) > 0 {
selector = append(selector, string(subTypeBuf))
}
}
for {
key, val, err := dec.NextField()
if err != nil {
return err
}
if key == nil {
break
}
if string(key) != "value" {
return fmt.Errorf("host %s: unknown field: '%s' (value: %#v)", host, string(key), val)
}
if val.Kind() == lineprotocol.Float {
metric.Value = schema.Float(val.FloatV())
} else if val.Kind() == lineprotocol.Int {
metric.Value = schema.Float(val.IntV())
} else if val.Kind() == lineprotocol.Uint {
metric.Value = schema.Float(val.UintV())
} else {
return fmt.Errorf("host %s: unsupported value type in message: %s", host, val.Kind().String())
}
}
// Try decreasing timestamp precisions until one of them parses:
if t, err = dec.Time(lineprotocol.Second, t); err != nil {
t = time.Now()
if t, err = dec.Time(lineprotocol.Millisecond, t); err != nil {
t = time.Now()
if t, err = dec.Time(lineprotocol.Microsecond, t); err != nil {
t = time.Now()
if t, err = dec.Time(lineprotocol.Nanosecond, t); err != nil {
return fmt.Errorf("host %s: timestamp : %#v with error : %#v", host, t, err.Error())
}
}
}
}
ts := t.Unix()
if Keys.Checkpoints.FileFormat != "json" {
LineProtocolMessages <- &AvroStruct{
MetricName: string(metricBuf),
Cluster: cluster,
Node: host,
Selector: append([]string{}, selector...),
Value: metric.Value,
Timestamp: ts,
}
}
if err := ms.WriteToLevel(lvl, selector, ts, []Metric{metric}); err != nil {
return err
}
}
return nil
}
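// Example of an accepted line (hypothetical metric and host names) in
// InfluxDB line protocol, as sent by e.g. cc-metric-collector:
//
//	flops_any,cluster=testcluster,hostname=host123,type=cpu,type-id=0 value=42.0 1700000000
//
// The type/type-id tags form the sub-level selector ("cpu0") below the host
// level, and the single "value" field carries the measurement.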

View File

@@ -1,437 +0,0 @@
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved. This file is part of cc-backend.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
// Package memorystore provides an efficient in-memory time-series metric storage system
// with support for hierarchical data organization, checkpointing, and archiving.
//
// The package organizes metrics in a tree structure (cluster → host → component) and
// provides concurrent read/write access to metric data with configurable aggregation strategies.
// Background goroutines handle periodic checkpointing (JSON or Avro format), archiving old data,
// and enforcing retention policies.
//
// Key features:
// - In-memory metric storage with configurable retention
// - Hierarchical data organization (selectors)
// - Concurrent checkpoint/archive workers
// - Support for sum and average aggregation
// - NATS integration for metric ingestion
package memorystore
import (
"bytes"
"context"
"encoding/json"
"errors"
"runtime"
"sync"
"time"
"github.com/ClusterCockpit/cc-backend/internal/config"
"github.com/ClusterCockpit/cc-backend/pkg/archive"
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
"github.com/ClusterCockpit/cc-lib/resampler"
"github.com/ClusterCockpit/cc-lib/schema"
"github.com/ClusterCockpit/cc-lib/util"
)
var (
singleton sync.Once
msInstance *MemoryStore
// shutdownFunc stores the context cancellation function created in Init
// and is called during Shutdown to cancel all background goroutines
shutdownFunc context.CancelFunc
)
type Metric struct {
Name string
Value schema.Float
MetricConfig MetricConfig
}
type MemoryStore struct {
Metrics map[string]MetricConfig
root Level
}
func Init(rawConfig json.RawMessage, wg *sync.WaitGroup) {
startupTime := time.Now()
if rawConfig != nil {
config.Validate(configSchema, rawConfig)
dec := json.NewDecoder(bytes.NewReader(rawConfig))
// dec.DisallowUnknownFields()
if err := dec.Decode(&Keys); err != nil {
cclog.Abortf("[METRICSTORE]> Metric Store Config Init: Could not decode config file '%s'.\nError: %s\n", rawConfig, err.Error())
}
}
// Set NumWorkers from config or use default
if Keys.NumWorkers <= 0 {
maxWorkers := 10
Keys.NumWorkers = min(runtime.NumCPU()/2+1, maxWorkers)
}
cclog.Debugf("[METRICSTORE]> Using %d workers for checkpoint/archive operations\n", Keys.NumWorkers)
// Helper function to add metric configuration
addMetricConfig := func(mc schema.MetricConfig) {
agg, err := AssignAggregationStrategy(mc.Aggregation)
if err != nil {
cclog.Warnf("Could not find aggregation strategy for metric config '%s': %s", mc.Name, err.Error())
}
AddMetric(mc.Name, MetricConfig{
Frequency: int64(mc.Timestep),
Aggregation: agg,
})
}
for _, c := range archive.Clusters {
for _, mc := range c.MetricConfig {
addMetricConfig(*mc)
}
for _, sc := range c.SubClusters {
for _, mc := range sc.MetricConfig {
addMetricConfig(mc)
}
}
}
// Initialize the MemoryStore singleton from the collected metric configs
InitMetrics(Metrics)
ms := GetMemoryStore()
d, err := time.ParseDuration(Keys.Checkpoints.Restore)
if err != nil {
cclog.Fatal(err)
}
restoreFrom := startupTime.Add(-d)
cclog.Infof("[METRICSTORE]> Loading checkpoints newer than %s\n", restoreFrom.Format(time.RFC3339))
files, err := ms.FromCheckpointFiles(Keys.Checkpoints.RootDir, restoreFrom.Unix())
loadedData := ms.SizeInBytes() / 1024 / 1024 // In MB
if err != nil {
cclog.Fatalf("[METRICSTORE]> Loading checkpoints failed: %s\n", err.Error())
} else {
cclog.Infof("[METRICSTORE]> Checkpoints loaded (%d files, %d MB, that took %fs)\n", files, loadedData, time.Since(startupTime).Seconds())
}
// Try to use less memory by forcing a GC run here and then
// lowering the target percentage. The default of 100 means
// that only once the ratio of new allocations exceeds the
// previously active heap, a GC is triggered.
// Forcing a GC here will set the "previously active heap"
// to a minimum.
runtime.GC()
ctx, shutdown := context.WithCancel(context.Background())
wg.Add(4)
Retention(wg, ctx)
Checkpointing(wg, ctx)
Archiving(wg, ctx)
DataStaging(wg, ctx)
// Note: Signal handling has been removed from this function.
// The caller is responsible for handling shutdown signals and calling
// the shutdown() function when appropriate.
// Store the shutdown function for later use by Shutdown()
shutdownFunc = shutdown
if Keys.Nats != nil {
for _, natsConf := range Keys.Nats {
// TODO: When multiple nats configs share a URL, do a single connect.
wg.Add(1)
nc := natsConf
go func() {
// err := ReceiveNats(conf.Nats, decodeLine, runtime.NumCPU()-1, ctx)
err := ReceiveNats(nc, ms, 1, ctx)
if err != nil {
cclog.Fatal(err)
}
wg.Done()
}()
}
}
}
// InitMetrics creates a new, initialized instance of a MemoryStore.
// Will panic if values in the metric configurations are invalid.
func InitMetrics(metrics map[string]MetricConfig) {
singleton.Do(func() {
offset := 0
for key, cfg := range metrics {
if cfg.Frequency == 0 {
panic("[METRICSTORE]> invalid frequency")
}
metrics[key] = MetricConfig{
Frequency: cfg.Frequency,
Aggregation: cfg.Aggregation,
offset: offset,
}
offset += 1
}
msInstance = &MemoryStore{
root: Level{
metrics: make([]*buffer, len(metrics)),
children: make(map[string]*Level),
},
Metrics: metrics,
}
})
}
func GetMemoryStore() *MemoryStore {
if msInstance == nil {
cclog.Fatalf("[METRICSTORE]> MemoryStore not initialized!")
}
return msInstance
}
func Shutdown() {
// Cancel the context to signal all background goroutines to stop
if shutdownFunc != nil {
shutdownFunc()
}
cclog.Infof("[METRICSTORE]> Writing to '%s'...\n", Keys.Checkpoints.RootDir)
var files int
var err error
ms := GetMemoryStore()
if Keys.Checkpoints.FileFormat == "json" {
files, err = ms.ToCheckpoint(Keys.Checkpoints.RootDir, lastCheckpoint.Unix(), time.Now().Unix())
} else {
files, err = GetAvroStore().ToCheckpoint(Keys.Checkpoints.RootDir, true)
close(LineProtocolMessages)
}
if err != nil {
cclog.Errorf("[METRICSTORE]> Writing checkpoint failed: %s\n", err.Error())
}
cclog.Infof("[METRICSTORE]> Done! (%d files written)\n", files)
}
func getName(m *MemoryStore, i int) string {
for key, val := range m.Metrics {
if val.offset == i {
return key
}
}
return ""
}
func Retention(wg *sync.WaitGroup, ctx context.Context) {
ms := GetMemoryStore()
go func() {
defer wg.Done()
d, err := time.ParseDuration(Keys.RetentionInMemory)
if err != nil {
cclog.Fatal(err)
}
if d <= 0 {
return
}
ticks := func() <-chan time.Time {
d := d / 2
if d <= 0 {
return nil
}
return time.NewTicker(d).C
}()
for {
select {
case <-ctx.Done():
return
case <-ticks:
t := time.Now().Add(-d)
cclog.Infof("[METRICSTORE]> start freeing buffers (older than %s)...\n", t.Format(time.RFC3339))
freed, err := ms.Free(nil, t.Unix())
if err != nil {
cclog.Errorf("[METRICSTORE]> freeing up buffers failed: %s\n", err.Error())
} else {
cclog.Infof("[METRICSTORE]> done: %d buffers freed\n", freed)
}
}
}
}()
}
// Write all values in `metrics` to the level specified by `selector` for time `ts`.
// Look at `findLevelOrCreate` for how selectors work.
func (m *MemoryStore) Write(selector []string, ts int64, metrics []Metric) error {
var ok bool
for i, metric := range metrics {
if metric.MetricConfig.Frequency == 0 {
metric.MetricConfig, ok = m.Metrics[metric.Name]
if !ok {
metric.MetricConfig.Frequency = 0
}
metrics[i] = metric
}
}
return m.WriteToLevel(&m.root, selector, ts, metrics)
}
func (m *MemoryStore) GetLevel(selector []string) *Level {
return m.root.findLevelOrCreate(selector, len(m.Metrics))
}
// WriteToLevel assumes that the MetricConfig of each entry in `metrics` is filled in
func (m *MemoryStore) WriteToLevel(l *Level, selector []string, ts int64, metrics []Metric) error {
l = l.findLevelOrCreate(selector, len(m.Metrics))
l.lock.Lock()
defer l.lock.Unlock()
for _, metric := range metrics {
if metric.MetricConfig.Frequency == 0 {
continue
}
b := l.metrics[metric.MetricConfig.offset]
if b == nil {
// First write to this metric and level
b = newBuffer(ts, metric.MetricConfig.Frequency)
l.metrics[metric.MetricConfig.offset] = b
}
nb, err := b.write(ts, metric.Value)
if err != nil {
return err
}
// Last write created a new buffer...
if b != nb {
l.metrics[metric.MetricConfig.offset] = nb
}
}
return nil
}
// Read returns all values for metric `metric` from `from` to `to` for the selected level(s).
// If the level does not hold the metric itself, the data will be aggregated recursively from the children.
// The second and third return value are the actual from/to for the data. Those can be different from
// the range asked for if no data was available.
func (m *MemoryStore) Read(selector util.Selector, metric string, from, to, resolution int64) ([]schema.Float, int64, int64, int64, error) {
if from > to {
return nil, 0, 0, 0, errors.New("[METRICSTORE]> invalid time range")
}
minfo, ok := m.Metrics[metric]
if !ok {
return nil, 0, 0, 0, errors.New("[METRICSTORE]> unkown metric: " + metric)
}
n, data := 0, make([]schema.Float, (to-from)/minfo.Frequency+1)
err := m.root.findBuffers(selector, minfo.offset, func(b *buffer) error {
cdata, cfrom, cto, err := b.read(from, to, data)
if err != nil {
return err
}
if n == 0 {
from, to = cfrom, cto
} else if from != cfrom || to != cto || len(data) != len(cdata) {
missingfront, missingback := int((from-cfrom)/minfo.Frequency), int((to-cto)/minfo.Frequency)
if missingfront != 0 {
return ErrDataDoesNotAlign
}
newlen := len(cdata) - missingback
if newlen < 1 {
return ErrDataDoesNotAlign
}
cdata = cdata[0:newlen]
if len(cdata) != len(data) {
return ErrDataDoesNotAlign
}
from, to = cfrom, cto
}
data = cdata
n += 1
return nil
})
if err != nil {
return nil, 0, 0, 0, err
} else if n == 0 {
return nil, 0, 0, 0, errors.New("[METRICSTORE]> metric or host not found")
} else if n > 1 {
if minfo.Aggregation == AvgAggregation {
normalize := 1. / schema.Float(n)
for i := 0; i < len(data); i++ {
data[i] *= normalize
}
} else if minfo.Aggregation != SumAggregation {
return nil, 0, 0, 0, errors.New("[METRICSTORE]> invalid aggregation")
}
}
data, resolution, err = resampler.LargestTriangleThreeBucket(data, minfo.Frequency, resolution)
if err != nil {
return nil, 0, 0, 0, err
}
return data, from, to, resolution, nil
}
// Free releases all buffers for the selected level and all its children that
// contain only values older than `t`.
func (m *MemoryStore) Free(selector []string, t int64) (int, error) {
return m.GetLevel(selector).free(t)
}
func (m *MemoryStore) FreeAll() error {
for k := range m.root.children {
delete(m.root.children, k)
}
return nil
}
func (m *MemoryStore) SizeInBytes() int64 {
return m.root.sizeInBytes()
}
// ListChildren, given a selector, returns a list of all children of the
// selected level.
func (m *MemoryStore) ListChildren(selector []string) []string {
lvl := &m.root
for lvl != nil && len(selector) != 0 {
lvl.lock.RLock()
next := lvl.children[selector[0]]
lvl.lock.RUnlock()
lvl = next
selector = selector[1:]
}
if lvl == nil {
return nil
}
lvl.lock.RLock()
defer lvl.lock.RUnlock()
children := make([]string, 0, len(lvl.children))
for child := range lvl.children {
children = append(children, child)
}
return children
}
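// Usage sketch (hypothetical names): listing all hosts known for a cluster:
//
//	hosts := ms.ListChildren([]string{"testcluster"})
//	// e.g. ["host123", "host124"]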

View File

@@ -1,156 +0,0 @@
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved. This file is part of cc-backend.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
package memorystore
import (
"testing"
"github.com/ClusterCockpit/cc-lib/schema"
)
func TestAssignAggregationStrategy(t *testing.T) {
tests := []struct {
name string
input string
expected AggregationStrategy
wantErr bool
}{
{"empty string", "", NoAggregation, false},
{"sum", "sum", SumAggregation, false},
{"avg", "avg", AvgAggregation, false},
{"invalid", "invalid", NoAggregation, true},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
result, err := AssignAggregationStrategy(tt.input)
if (err != nil) != tt.wantErr {
t.Errorf("AssignAggregationStrategy(%q) error = %v, wantErr %v", tt.input, err, tt.wantErr)
return
}
if result != tt.expected {
t.Errorf("AssignAggregationStrategy(%q) = %v, want %v", tt.input, result, tt.expected)
}
})
}
}
func TestAddMetric(t *testing.T) {
// Reset Metrics before test
Metrics = make(map[string]MetricConfig)
err := AddMetric("test_metric", MetricConfig{
Frequency: 60,
Aggregation: SumAggregation,
})
if err != nil {
t.Errorf("AddMetric() error = %v", err)
}
if _, ok := Metrics["test_metric"]; !ok {
t.Error("AddMetric() did not add metric to Metrics map")
}
// Test updating with higher frequency
err = AddMetric("test_metric", MetricConfig{
Frequency: 120,
Aggregation: SumAggregation,
})
if err != nil {
t.Errorf("AddMetric() error = %v", err)
}
if Metrics["test_metric"].Frequency != 120 {
t.Errorf("AddMetric() frequency = %d, want 120", Metrics["test_metric"].Frequency)
}
// Test updating with lower frequency (should not update)
err = AddMetric("test_metric", MetricConfig{
Frequency: 30,
Aggregation: SumAggregation,
})
if err != nil {
t.Errorf("AddMetric() error = %v", err)
}
if Metrics["test_metric"].Frequency != 120 {
t.Errorf("AddMetric() frequency = %d, want 120 (should not downgrade)", Metrics["test_metric"].Frequency)
}
}
func TestGetMetricFrequency(t *testing.T) {
// Reset Metrics before test
Metrics = map[string]MetricConfig{
"test_metric": {
Frequency: 60,
Aggregation: SumAggregation,
},
}
freq, err := GetMetricFrequency("test_metric")
if err != nil {
t.Errorf("GetMetricFrequency() error = %v", err)
}
if freq != 60 {
t.Errorf("GetMetricFrequency() = %d, want 60", freq)
}
_, err = GetMetricFrequency("nonexistent")
if err == nil {
t.Error("GetMetricFrequency() expected error for nonexistent metric")
}
}
func TestBufferWrite(t *testing.T) {
b := newBuffer(100, 10)
// Test writing value
nb, err := b.write(100, schema.Float(42.0))
if err != nil {
t.Errorf("buffer.write() error = %v", err)
}
if nb != b {
t.Error("buffer.write() created new buffer unexpectedly")
}
if len(b.data) != 1 {
t.Errorf("buffer.write() len(data) = %d, want 1", len(b.data))
}
if b.data[0] != schema.Float(42.0) {
t.Errorf("buffer.write() data[0] = %v, want 42.0", b.data[0])
}
// Test writing value from past (should error)
_, err = b.write(50, schema.Float(10.0))
if err == nil {
t.Error("buffer.write() expected error for past timestamp")
}
}
func TestBufferRead(t *testing.T) {
b := newBuffer(100, 10)
// Write some test data
b.write(100, schema.Float(1.0))
b.write(110, schema.Float(2.0))
b.write(120, schema.Float(3.0))
// Read data
data := make([]schema.Float, 3)
result, from, to, err := b.read(100, 130, data)
if err != nil {
t.Errorf("buffer.read() error = %v", err)
}
// Buffer read should return from as firstWrite (start + freq/2)
if from != 100 {
t.Errorf("buffer.read() from = %d, want 100", from)
}
if to != 130 {
t.Errorf("buffer.read() to = %d, want 130", to)
}
if len(result) != 3 {
t.Errorf("buffer.read() len(result) = %d, want 3", len(result))
}
}

View File

@@ -1,124 +0,0 @@
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved. This file is part of cc-backend.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
package memorystore
import (
"errors"
"math"
"github.com/ClusterCockpit/cc-lib/util"
)
type Stats struct {
Samples int
Avg util.Float
Min util.Float
Max util.Float
}
func (b *buffer) stats(from, to int64) (Stats, int64, int64, error) {
if from < b.start {
if b.prev != nil {
return b.prev.stats(from, to)
}
from = b.start
}
// TODO: Check if b.closed and if so and the full buffer is queried,
// use b.statistics instead of iterating over the buffer.
samples := 0
sum, min, max := 0.0, math.MaxFloat32, -math.MaxFloat32
var t int64
for t = from; t < to; t += b.frequency {
idx := int((t - b.start) / b.frequency)
if idx >= cap(b.data) {
b = b.next
if b == nil {
break
}
idx = 0
}
if t < b.start || idx >= len(b.data) {
continue
}
xf := float64(b.data[idx])
if math.IsNaN(xf) {
continue
}
samples += 1
sum += xf
min = math.Min(min, xf)
max = math.Max(max, xf)
}
return Stats{
Samples: samples,
Avg: util.Float(sum) / util.Float(samples),
Min: util.Float(min),
Max: util.Float(max),
}, from, t, nil
}
// Returns statistics for the requested metric on the selected node/level.
// Data is aggregated to the selected level the same way as in `MemoryStore.Read`.
// If `Stats.Samples` is zero, the statistics should not be considered as valid.
func (m *MemoryStore) Stats(selector util.Selector, metric string, from, to int64) (*Stats, int64, int64, error) {
if from > to {
return nil, 0, 0, errors.New("invalid time range")
}
minfo, ok := m.Metrics[metric]
if !ok {
return nil, 0, 0, errors.New("unkown metric: " + metric)
}
n, samples := 0, 0
avg, min, max := util.Float(0), math.MaxFloat32, -math.MaxFloat32
err := m.root.findBuffers(selector, minfo.offset, func(b *buffer) error {
stats, cfrom, cto, err := b.stats(from, to)
if err != nil {
return err
}
if n == 0 {
from, to = cfrom, cto
} else if from != cfrom || to != cto {
return ErrDataDoesNotAlign
}
samples += stats.Samples
avg += stats.Avg
min = math.Min(min, float64(stats.Min))
max = math.Max(max, float64(stats.Max))
n += 1
return nil
})
if err != nil {
return nil, 0, 0, err
}
if n == 0 {
return nil, 0, 0, ErrNoData
}
if minfo.Aggregation == AvgAggregation {
avg /= util.Float(n)
} else if n > 1 && minfo.Aggregation != SumAggregation {
return nil, 0, 0, errors.New("invalid aggregation")
}
return &Stats{
Samples: samples,
Avg: avg,
Min: util.Float(min),
Max: util.Float(max),
}, from, to, nil
}

View File

@@ -1,381 +0,0 @@
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved. This file is part of cc-backend.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
package metricDataDispatcher
import (
"context"
"fmt"
"math"
"time"
"github.com/ClusterCockpit/cc-backend/internal/config"
"github.com/ClusterCockpit/cc-backend/internal/metricdata"
"github.com/ClusterCockpit/cc-backend/pkg/archive"
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
"github.com/ClusterCockpit/cc-lib/lrucache"
"github.com/ClusterCockpit/cc-lib/resampler"
"github.com/ClusterCockpit/cc-lib/schema"
)
var cache *lrucache.Cache = lrucache.New(128 * 1024 * 1024)
func cacheKey(
job *schema.Job,
metrics []string,
scopes []schema.MetricScope,
resolution int,
) string {
// Duration and StartTime do not need to be in the cache key as StartTime is less unique than
// job.ID and the TTL of the cache entry makes sure it does not stay there forever.
return fmt.Sprintf("%d(%s):[%v],[%v]-%d",
job.ID, job.State, metrics, scopes, resolution)
}
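// Example key (illustrative values): a running job with database ID 1337,
// queried for two metrics at node scope with resolution 600, yields
// "1337(running):[[flops_any mem_bw]],[[node]]-600".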
// LoadData fetches the metric data for a job.
func LoadData(job *schema.Job,
metrics []string,
scopes []schema.MetricScope,
ctx context.Context,
resolution int,
) (schema.JobData, error) {
data := cache.Get(cacheKey(job, metrics, scopes, resolution), func() (_ any, ttl time.Duration, size int) {
var jd schema.JobData
var err error
if job.State == schema.JobStateRunning ||
job.MonitoringStatus == schema.MonitoringStatusRunningOrArchiving ||
config.Keys.DisableArchive {
repo, err := metricdata.GetMetricDataRepo(job.Cluster)
if err != nil {
return fmt.Errorf("METRICDATA/METRICDATA > no metric data repository configured for '%s'", job.Cluster), 0, 0
}
if scopes == nil {
scopes = append(scopes, schema.MetricScopeNode)
}
if metrics == nil {
cluster := archive.GetCluster(job.Cluster)
for _, mc := range cluster.MetricConfig {
metrics = append(metrics, mc.Name)
}
}
jd, err = repo.LoadData(job, metrics, scopes, ctx, resolution)
if err != nil {
if len(jd) != 0 {
cclog.Warnf("partial error: %s", err.Error())
// return err, 0, 0 // Reactivating will block archiving on one partial error
} else {
cclog.Error("Error while loading job data from metric repository")
return err, 0, 0
}
}
size = jd.Size()
} else {
var jdTemp schema.JobData
jdTemp, err = archive.GetHandle().LoadJobData(job)
if err != nil {
cclog.Error("Error while loading job data from archive")
return err, 0, 0
}
// Deep copy the cached archive hashmap
jd = metricdata.DeepCopy(jdTemp)
// Resampling for archived data.
// Pass the resolution from frontend here.
for _, perScope := range jd {
for _, jm := range perScope {
timestep := int64(0)
for i := 0; i < len(jm.Series); i++ {
jm.Series[i].Data, timestep, err = resampler.LargestTriangleThreeBucket(jm.Series[i].Data, int64(jm.Timestep), int64(resolution))
if err != nil {
return err, 0, 0
}
}
jm.Timestep = int(timestep)
}
}
// Avoid sending unrequested data to the client:
if metrics != nil || scopes != nil {
if metrics == nil {
metrics = make([]string, 0, len(jd))
for k := range jd {
metrics = append(metrics, k)
}
}
res := schema.JobData{}
for _, metric := range metrics {
if perscope, ok := jd[metric]; ok {
if len(perscope) > 1 {
subset := make(map[schema.MetricScope]*schema.JobMetric)
for _, scope := range scopes {
if jm, ok := perscope[scope]; ok {
subset[scope] = jm
}
}
if len(subset) > 0 {
perscope = subset
}
}
res[metric] = perscope
}
}
jd = res
}
size = jd.Size()
}
ttl = 5 * time.Hour
if job.State == schema.JobStateRunning {
ttl = 2 * time.Minute
}
// FIXME: Review: Is this really necessary or correct?
// Note: Lines 147-170 formerly known as prepareJobData(jobData, scopes)
// For /monitoring/job/<job> and some other places, flops_any and mem_bw need
// to be available at the scope 'node'. If a job has a lot of nodes,
// statisticsSeries should be available so that a min/median/max Graph can be
// used instead of a lot of single lines.
// NOTE: New StatsSeries will always be calculated as 'min/median/max'
// Existing (archived) StatsSeries can be 'min/mean/max'!
const maxSeriesSize int = 15
for _, scopes := range jd {
for _, jm := range scopes {
if jm.StatisticsSeries != nil || len(jm.Series) <= maxSeriesSize {
continue
}
jm.AddStatisticsSeries()
}
}
nodeScopeRequested := false
for _, scope := range scopes {
if scope == schema.MetricScopeNode {
nodeScopeRequested = true
}
}
if nodeScopeRequested {
jd.AddNodeScope("flops_any")
jd.AddNodeScope("mem_bw")
}
// Round Resulting Stat Values
jd.RoundMetricStats()
return jd, ttl, size
})
if err, ok := data.(error); ok {
cclog.Error("Error in returned dataset")
return nil, err
}
return data.(schema.JobData), nil
}
// Used for the jobsFootprint GraphQL-Query. TODO: Rename/Generalize.
func LoadAverages(
job *schema.Job,
metrics []string,
data [][]schema.Float,
ctx context.Context,
) error {
if job.State != schema.JobStateRunning && !config.Keys.DisableArchive {
return archive.LoadAveragesFromArchive(job, metrics, data) // #166 change also here?
}
repo, err := metricdata.GetMetricDataRepo(job.Cluster)
if err != nil {
return fmt.Errorf("METRICDATA/METRICDATA > no metric data repository configured for '%s'", job.Cluster)
}
stats, err := repo.LoadStats(job, metrics, ctx) // #166 how to handle stats for acc normalization?
if err != nil {
cclog.Errorf("Error while loading statistics for job %v (User %v, Project %v)", job.JobID, job.User, job.Project)
return err
}
for i, m := range metrics {
nodes, ok := stats[m]
if !ok {
data[i] = append(data[i], schema.NaN)
continue
}
sum := 0.0
for _, node := range nodes {
sum += node.Avg
}
data[i] = append(data[i], schema.Float(sum))
}
return nil
}
// Used for statsTable in frontend: Return scoped statistics by metric.
func LoadScopedJobStats(
job *schema.Job,
metrics []string,
scopes []schema.MetricScope,
ctx context.Context,
) (schema.ScopedJobStats, error) {
if job.State != schema.JobStateRunning && !config.Keys.DisableArchive {
return archive.LoadScopedStatsFromArchive(job, metrics, scopes)
}
repo, err := metricdata.GetMetricDataRepo(job.Cluster)
if err != nil {
return nil, fmt.Errorf("job %d: no metric data repository configured for '%s'", job.JobID, job.Cluster)
}
scopedStats, err := repo.LoadScopedStats(job, metrics, scopes, ctx)
if err != nil {
cclog.Errorf("error while loading scoped statistics for job %d (User %s, Project %s)", job.JobID, job.User, job.Project)
return nil, err
}
return scopedStats, nil
}
// Used for polar plots in frontend: Aggregates statistics for all nodes to single values for job per metric.
func LoadJobStats(
job *schema.Job,
metrics []string,
ctx context.Context,
) (map[string]schema.MetricStatistics, error) {
if job.State != schema.JobStateRunning && !config.Keys.DisableArchive {
return archive.LoadStatsFromArchive(job, metrics)
}
data := make(map[string]schema.MetricStatistics, len(metrics))
repo, err := metricdata.GetMetricDataRepo(job.Cluster)
if err != nil {
return data, fmt.Errorf("job %d: no metric data repository configured for '%s'", job.JobID, job.Cluster)
}
stats, err := repo.LoadStats(job, metrics, ctx)
if err != nil {
cclog.Errorf("error while loading statistics for job %d (User %s, Project %s)", job.JobID, job.User, job.Project)
return data, err
}
for _, m := range metrics {
sum := 0.0
nodes, ok := stats[m]
if !ok {
data[m] = schema.MetricStatistics{Min: 0.0, Avg: 0.0, Max: 0.0}
continue
}
// Start min/max at the extremes so the first node's values are always taken
min, max := math.MaxFloat64, -math.MaxFloat64
for _, node := range nodes {
sum += node.Avg
min = math.Min(min, node.Min)
max = math.Max(max, node.Max)
}
data[m] = schema.MetricStatistics{
Avg: (math.Round((sum/float64(job.NumNodes))*100) / 100),
Min: (math.Round(min*100) / 100),
Max: (math.Round(max*100) / 100),
}
}
return data, nil
}
// Used for the classic node/system view. Returns a map of nodes to a map of metrics.
func LoadNodeData(
cluster string,
metrics, nodes []string,
scopes []schema.MetricScope,
from, to time.Time,
ctx context.Context,
) (map[string]map[string][]*schema.JobMetric, error) {
repo, err := metricdata.GetMetricDataRepo(cluster)
if err != nil {
return nil, fmt.Errorf("METRICDATA/METRICDATA > no metric data repository configured for '%s'", cluster)
}
if metrics == nil {
for _, m := range archive.GetCluster(cluster).MetricConfig {
metrics = append(metrics, m.Name)
}
}
data, err := repo.LoadNodeData(cluster, metrics, nodes, scopes, from, to, ctx)
if err != nil {
if len(data) != 0 {
cclog.Warnf("partial error: %s", err.Error())
} else {
cclog.Error("Error while loading node data from metric repository")
return nil, err
}
}
if data == nil {
return nil, fmt.Errorf("METRICDATA/METRICDATA > the metric data repository for '%s' does not support this query", cluster)
}
return data, nil
}
func LoadNodeListData(
cluster, subCluster string,
nodes []string,
metrics []string,
scopes []schema.MetricScope,
resolution int,
from, to time.Time,
ctx context.Context,
) (map[string]schema.JobData, error) {
repo, err := metricdata.GetMetricDataRepo(cluster)
if err != nil {
return nil, fmt.Errorf("METRICDATA/METRICDATA > no metric data repository configured for '%s'", cluster)
}
if metrics == nil {
for _, m := range archive.GetCluster(cluster).MetricConfig {
metrics = append(metrics, m.Name)
}
}
data, err := repo.LoadNodeListData(cluster, subCluster, nodes, metrics, scopes, resolution, from, to, ctx)
if err != nil {
if len(data) != 0 {
cclog.Warnf("partial error: %s", err.Error())
} else {
cclog.Error("Error while loading node data from metric repository")
return nil, err
}
}
// NOTE: New StatsSeries will always be calculated as 'min/median/max'
const maxSeriesSize int = 8
for _, jd := range data {
for _, scopes := range jd {
for _, jm := range scopes {
if jm.StatisticsSeries != nil || len(jm.Series) < maxSeriesSize {
continue
}
jm.AddStatisticsSeries()
}
}
}
if data == nil {
return nil, fmt.Errorf("METRICDATA/METRICDATA > the metric data repository for '%s' does not support this query", cluster)
}
return data, nil
}

File diff suppressed because it is too large

File diff suppressed because it is too large

View File

@@ -1,88 +0,0 @@
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved. This file is part of cc-backend.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
package metricdata
import (
"context"
"encoding/json"
"fmt"
"time"
"github.com/ClusterCockpit/cc-backend/internal/config"
"github.com/ClusterCockpit/cc-backend/internal/memorystore"
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
"github.com/ClusterCockpit/cc-lib/schema"
)
type MetricDataRepository interface {
// Initialize this MetricDataRepository. One instance of
// this interface will only ever be responsible for one cluster.
Init(rawConfig json.RawMessage) error
// Return the JobData for the given job, only with the requested metrics.
LoadData(job *schema.Job, metrics []string, scopes []schema.MetricScope, ctx context.Context, resolution int) (schema.JobData, error)
// Return a map of metrics to a map of nodes to the metric statistics of the job. node scope only.
LoadStats(job *schema.Job, metrics []string, ctx context.Context) (map[string]map[string]schema.MetricStatistics, error)
// Return a map of metrics to a map of scopes to the scoped metric statistics of the job.
LoadScopedStats(job *schema.Job, metrics []string, scopes []schema.MetricScope, ctx context.Context) (schema.ScopedJobStats, error)
// Return a map of hosts to a map of metrics at the requested scopes (currently only node) for that node.
LoadNodeData(cluster string, metrics, nodes []string, scopes []schema.MetricScope, from, to time.Time, ctx context.Context) (map[string]map[string][]*schema.JobMetric, error)
// Return a map of hosts to a map of metrics to a map of scopes for multiple nodes.
LoadNodeListData(cluster, subCluster string, nodes, metrics []string, scopes []schema.MetricScope, resolution int, from, to time.Time, ctx context.Context) (map[string]schema.JobData, error)
}
var metricDataRepos map[string]MetricDataRepository = map[string]MetricDataRepository{}
func Init() error {
for _, cluster := range config.Clusters {
if cluster.MetricDataRepository != nil {
var kind struct {
Kind string `json:"kind"`
}
if err := json.Unmarshal(cluster.MetricDataRepository, &kind); err != nil {
cclog.Warn("Error while unmarshaling raw json MetricDataRepository")
return err
}
var mdr MetricDataRepository
switch kind.Kind {
case "cc-metric-store":
mdr = &CCMetricStore{}
case "cc-metric-store-internal":
mdr = &CCMetricStoreInternal{}
memorystore.InternalCCMSFlag = true
case "prometheus":
mdr = &PrometheusDataRepository{}
case "test":
mdr = &TestMetricDataRepository{}
default:
return fmt.Errorf("METRICDATA/METRICDATA > Unknown MetricDataRepository %v for cluster %v", kind.Kind, cluster.Name)
}
if err := mdr.Init(cluster.MetricDataRepository); err != nil {
cclog.Errorf("Error initializing MetricDataRepository %v for cluster %v", kind.Kind, cluster.Name)
return err
}
metricDataRepos[cluster.Name] = mdr
}
}
return nil
}
func GetMetricDataRepo(cluster string) (MetricDataRepository, error) {
var err error
repo, ok := metricDataRepos[cluster]
if !ok {
err = fmt.Errorf("METRICDATA/METRICDATA > no metric data repository configured for '%s'", cluster)
}
return repo, err
}

View File

@@ -1,587 +0,0 @@
// Copyright (C) 2022 DKRZ
// All rights reserved. This file is part of cc-backend.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
package metricdata
import (
"bytes"
"context"
"encoding/json"
"errors"
"fmt"
"math"
"net/http"
"os"
"regexp"
"sort"
"strings"
"sync"
"text/template"
"time"
"github.com/ClusterCockpit/cc-backend/pkg/archive"
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
"github.com/ClusterCockpit/cc-lib/schema"
promapi "github.com/prometheus/client_golang/api"
promv1 "github.com/prometheus/client_golang/api/prometheus/v1"
promcfg "github.com/prometheus/common/config"
promm "github.com/prometheus/common/model"
)
type PrometheusDataRepositoryConfig struct {
Url string `json:"url"`
Username string `json:"username,omitempty"`
Suffix string `json:"suffix,omitempty"`
Templates map[string]string `json:"query-templates"`
}
type PrometheusDataRepository struct {
client promapi.Client
queryClient promv1.API
suffix string
templates map[string]*template.Template
}
type PromQLArgs struct {
Nodes string
}
type Trie map[rune]Trie
var logOnce sync.Once
func contains(s []schema.MetricScope, str schema.MetricScope) bool {
for _, v := range s {
if v == str {
return true
}
}
return false
}
func MinMaxMean(data []schema.Float) (float64, float64, float64) {
if len(data) == 0 {
return 0.0, 0.0, 0.0
}
min := math.MaxFloat64
max := -math.MaxFloat64
var sum float64
var n float64
for _, val := range data {
if val.IsNaN() {
continue
}
sum += float64(val)
n += 1
if float64(val) > max {
max = float64(val)
}
if float64(val) < min {
min = float64(val)
}
}
return min, max, sum / n
}
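// For example (illustrative): MinMaxMean([]schema.Float{1.0, schema.NaN, 3.0})
// skips the NaN sample and returns (1.0, 3.0, 2.0). If every sample is NaN,
// n stays 0 and the mean becomes NaN through division by zero.
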
// Rewritten from
// https://github.com/ermanh/trieregex/blob/master/trieregex/trieregex.py
func nodeRegex(nodes []string) string {
root := Trie{}
// add runes of each compute node to trie
for _, node := range nodes {
_trie := root
for _, c := range node {
if _, ok := _trie[c]; !ok {
_trie[c] = Trie{}
}
_trie = _trie[c]
}
_trie['*'] = Trie{}
}
// recursively build regex from rune trie
var trieRegex func(trie Trie, reset bool) string
trieRegex = func(trie Trie, reset bool) string {
if reset {
trie = root
}
if len(trie) == 0 {
return ""
}
if len(trie) == 1 {
for key, _trie := range trie {
if key == '*' {
return ""
}
return regexp.QuoteMeta(string(key)) + trieRegex(_trie, false)
}
} else {
sequences := []string{}
for key, _trie := range trie {
if key != '*' {
sequences = append(sequences, regexp.QuoteMeta(string(key))+trieRegex(_trie, false))
}
}
sort.Slice(sequences, func(i, j int) bool {
return (-len(sequences[i]) < -len(sequences[j])) || (sequences[i] < sequences[j])
})
var result string
// single edge from this tree node
if len(sequences) == 1 {
result = sequences[0]
if len(result) > 1 {
result = "(?:" + result + ")"
}
// multiple edges, each length 1
} else if s := strings.Join(sequences, ""); len(s) == len(sequences) {
// char or numeric range
if len(s)-1 == int(s[len(s)-1])-int(s[0]) {
result = fmt.Sprintf("[%c-%c]", s[0], s[len(s)-1])
// char or numeric set
} else {
result = "[" + s + "]"
}
// multiple edges of different lengths
} else {
result = "(?:" + strings.Join(sequences, "|") + ")"
}
if _, ok := trie['*']; ok {
result += "?"
}
return result
}
return ""
}
return trieRegex(root, true)
}
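// For example (illustrative), a node list sharing a common prefix collapses
// into a compact alternation:
//
//	nodeRegex([]string{"node001", "node002", "node003"}) // "node00[1-3]"
//
// FormatQuery then parenthesizes this regex and appends the configured suffix
// when building the PromQL label matcher.
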
func (pdb *PrometheusDataRepository) Init(rawConfig json.RawMessage) error {
var config PrometheusDataRepositoryConfig
// parse config
if err := json.Unmarshal(rawConfig, &config); err != nil {
cclog.Warn("Error while unmarshaling raw json config")
return err
}
// support basic authentication
var rt http.RoundTripper = nil
if prom_pw := os.Getenv("PROMETHEUS_PASSWORD"); prom_pw != "" && config.Username != "" {
prom_pw := promcfg.Secret(prom_pw)
rt = promcfg.NewBasicAuthRoundTripper(promcfg.NewInlineSecret(config.Username), promcfg.NewInlineSecret(string(prom_pw)), promapi.DefaultRoundTripper)
} else {
if config.Username != "" {
return errors.New("METRICDATA/PROMETHEUS > Prometheus username provided, but PROMETHEUS_PASSWORD not set")
}
}
// init client
client, err := promapi.NewClient(promapi.Config{
Address: config.Url,
RoundTripper: rt,
})
if err != nil {
cclog.Error("Error while initializing new prometheus client")
return err
}
// init query client
pdb.client = client
pdb.queryClient = promv1.NewAPI(pdb.client)
// site config
pdb.suffix = config.Suffix
// init query templates
pdb.templates = make(map[string]*template.Template)
for metric, templ := range config.Templates {
pdb.templates[metric], err = template.New(metric).Parse(templ)
if err == nil {
cclog.Debugf("Added PromQL template for %s: %s", metric, templ)
} else {
cclog.Warnf("Failed to parse PromQL template %s for metric %s", templ, metric)
}
}
return nil
}
// TODO: respect scope argument
func (pdb *PrometheusDataRepository) FormatQuery(
metric string,
scope schema.MetricScope,
nodes []string,
cluster string,
) (string, error) {
args := PromQLArgs{}
if len(nodes) > 0 {
args.Nodes = fmt.Sprintf("(%s)%s", nodeRegex(nodes), pdb.suffix)
} else {
args.Nodes = fmt.Sprintf(".*%s", pdb.suffix)
}
buf := &bytes.Buffer{}
if templ, ok := pdb.templates[metric]; ok {
err := templ.Execute(buf, args)
if err != nil {
return "", errors.New(fmt.Sprintf("METRICDATA/PROMETHEUS > Error compiling template %v", templ))
} else {
query := buf.String()
cclog.Debugf("PromQL: %s", query)
return query, nil
}
} else {
return "", errors.New(fmt.Sprintf("METRICDATA/PROMETHEUS > No PromQL for metric %s configured.", metric))
}
}
// Convert PromAPI row to CC schema.Series
func (pdb *PrometheusDataRepository) RowToSeries(
from time.Time,
step int64,
steps int64,
row *promm.SampleStream,
) schema.Series {
ts := from.Unix()
hostname := strings.TrimSuffix(string(row.Metric["exported_instance"]), pdb.suffix)
// init array of expected length with NaN
values := make([]schema.Float, steps+1)
for i := range values {
values[i] = schema.NaN
}
// copy recorded values from prom sample pair
for _, v := range row.Values {
idx := (v.Timestamp.Unix() - ts) / step
values[idx] = schema.Float(v.Value)
}
min, max, mean := MinMaxMean(values)
// output struct
return schema.Series{
Hostname: hostname,
Data: values,
Statistics: schema.MetricStatistics{
Avg: mean,
Min: min,
Max: max,
},
}
}
func (pdb *PrometheusDataRepository) LoadData(
job *schema.Job,
metrics []string,
scopes []schema.MetricScope,
ctx context.Context,
resolution int,
) (schema.JobData, error) {
// TODO respect requested scope
if len(scopes) == 0 || !contains(scopes, schema.MetricScopeNode) {
scopes = append(scopes, schema.MetricScopeNode)
}
jobData := make(schema.JobData)
// parse job specs
nodes := make([]string, len(job.Resources))
for i, resource := range job.Resources {
nodes[i] = resource.Hostname
}
from := time.Unix(job.StartTime, 0)
to := time.Unix(job.StartTime+int64(job.Duration), 0)
for _, scope := range scopes {
if scope != schema.MetricScopeNode {
logOnce.Do(func() {
cclog.Infof("Scope '%s' requested, but not yet supported: Will return 'node' scope only.", scope)
})
continue
}
for _, metric := range metrics {
metricConfig := archive.GetMetricConfig(job.Cluster, metric)
if metricConfig == nil {
cclog.Warnf("Error in LoadData: Metric %s for cluster %s not configured", metric, job.Cluster)
return nil, errors.New("Prometheus config error")
}
query, err := pdb.FormatQuery(metric, scope, nodes, job.Cluster)
if err != nil {
cclog.Warn("Error while formatting prometheus query")
return nil, err
}
// ranged query over all job nodes
r := promv1.Range{
Start: from,
End: to,
Step: time.Duration(metricConfig.Timestep * 1e9),
}
result, warnings, err := pdb.queryClient.QueryRange(ctx, query, r)
if err != nil {
cclog.Errorf("Prometheus query error in LoadData: %v\nQuery: %s", err, query)
return nil, errors.New("Prometheus query error")
}
if len(warnings) > 0 {
cclog.Warnf("Warnings: %v\n", warnings)
}
// init data structures
if _, ok := jobData[metric]; !ok {
jobData[metric] = make(map[schema.MetricScope]*schema.JobMetric)
}
jobMetric, ok := jobData[metric][scope]
if !ok {
jobMetric = &schema.JobMetric{
Unit: metricConfig.Unit,
Timestep: metricConfig.Timestep,
Series: make([]schema.Series, 0),
}
}
step := int64(metricConfig.Timestep)
steps := int64(to.Sub(from).Seconds()) / step
// iter rows of host, metric, values
for _, row := range result.(promm.Matrix) {
jobMetric.Series = append(jobMetric.Series,
pdb.RowToSeries(from, step, steps, row))
}
// only add metric if at least one host returned data
if !ok && len(jobMetric.Series) > 0 {
jobData[metric][scope] = jobMetric
}
// sort by hostname to get uniform coloring
sort.Slice(jobMetric.Series, func(i, j int) bool {
return (jobMetric.Series[i].Hostname < jobMetric.Series[j].Hostname)
})
}
}
return jobData, nil
}
// TODO change implementation to precomputed/cached stats
func (pdb *PrometheusDataRepository) LoadStats(
job *schema.Job,
metrics []string,
ctx context.Context,
) (map[string]map[string]schema.MetricStatistics, error) {
// map of metrics of nodes of stats
stats := map[string]map[string]schema.MetricStatistics{}
data, err := pdb.LoadData(job, metrics, []schema.MetricScope{schema.MetricScopeNode}, ctx, 0 /*resolution here*/)
if err != nil {
cclog.Warn("Error while loading job for stats")
return nil, err
}
for metric, metricData := range data {
stats[metric] = make(map[string]schema.MetricStatistics)
for _, series := range metricData[schema.MetricScopeNode].Series {
stats[metric][series.Hostname] = series.Statistics
}
}
return stats, nil
}
func (pdb *PrometheusDataRepository) LoadNodeData(
cluster string,
metrics, nodes []string,
scopes []schema.MetricScope,
from, to time.Time,
ctx context.Context,
) (map[string]map[string][]*schema.JobMetric, error) {
t0 := time.Now()
// Map of hosts of metrics of value slices
data := make(map[string]map[string][]*schema.JobMetric)
// query db for each metric
// TODO: scopes seems to be always empty
if len(scopes) == 0 || !contains(scopes, schema.MetricScopeNode) {
scopes = append(scopes, schema.MetricScopeNode)
}
for _, scope := range scopes {
if scope != schema.MetricScopeNode {
logOnce.Do(func() {
cclog.Infof("Note: Scope '%s' requested, but not yet supported: Will return 'node' scope only.", scope)
})
continue
}
for _, metric := range metrics {
metricConfig := archive.GetMetricConfig(cluster, metric)
if metricConfig == nil {
cclog.Warnf("Error in LoadNodeData: Metric %s for cluster %s not configured", metric, cluster)
return nil, errors.New("Prometheus config error")
}
query, err := pdb.FormatQuery(metric, scope, nodes, cluster)
if err != nil {
cclog.Warn("Error while formatting prometheus query")
return nil, err
}
// ranged query over all nodes
r := promv1.Range{
Start: from,
End: to,
Step: time.Duration(metricConfig.Timestep * 1e9),
}
result, warnings, err := pdb.queryClient.QueryRange(ctx, query, r)
if err != nil {
cclog.Errorf("Prometheus query error in LoadNodeData: %v\n", err)
return nil, errors.New("Prometheus query error")
}
if len(warnings) > 0 {
cclog.Warnf("Warnings: %v\n", warnings)
}
step := int64(metricConfig.Timestep)
steps := int64(to.Sub(from).Seconds()) / step
// iter rows of host, metric, values
for _, row := range result.(promm.Matrix) {
hostname := strings.TrimSuffix(string(row.Metric["exported_instance"]), pdb.suffix)
hostdata, ok := data[hostname]
if !ok {
hostdata = make(map[string][]*schema.JobMetric)
data[hostname] = hostdata
}
// output per host and metric
hostdata[metric] = append(hostdata[metric], &schema.JobMetric{
Unit: metricConfig.Unit,
Timestep: metricConfig.Timestep,
Series: []schema.Series{pdb.RowToSeries(from, step, steps, row)},
},
)
}
}
}
t1 := time.Since(t0)
cclog.Debugf("LoadNodeData of %v nodes took %s", len(data), t1)
return data, nil
}
// Implemented by NHR@FAU; Used in Job-View StatsTable
func (pdb *PrometheusDataRepository) LoadScopedStats(
job *schema.Job,
metrics []string,
scopes []schema.MetricScope,
ctx context.Context,
) (schema.ScopedJobStats, error) {
// Assumption: pdb.LoadData() only returns node-scope series - use node scope for statsTable
scopedJobStats := make(schema.ScopedJobStats)
data, err := pdb.LoadData(job, metrics, []schema.MetricScope{schema.MetricScopeNode}, ctx, 0 /*resolution here*/)
if err != nil {
cclog.Warn("Error while loading job for scopedJobStats")
return nil, err
}
for metric, metricData := range data {
for _, scope := range scopes {
if scope != schema.MetricScopeNode {
logOnce.Do(func() {
cclog.Infof("Note: Scope '%s' requested, but not yet supported: Will return 'node' scope only.", scope)
})
continue
}
if _, ok := scopedJobStats[metric]; !ok {
scopedJobStats[metric] = make(map[schema.MetricScope][]*schema.ScopedStats)
}
if _, ok := scopedJobStats[metric][scope]; !ok {
scopedJobStats[metric][scope] = make([]*schema.ScopedStats, 0)
}
for _, series := range metricData[scope].Series {
scopedJobStats[metric][scope] = append(scopedJobStats[metric][scope], &schema.ScopedStats{
Hostname: series.Hostname,
Data: &series.Statistics,
})
}
}
}
return scopedJobStats, nil
}
// Implemented by NHR@FAU; Used in NodeList-View
func (pdb *PrometheusDataRepository) LoadNodeListData(
cluster, subCluster string,
nodes []string,
metrics []string,
scopes []schema.MetricScope,
resolution int,
from, to time.Time,
ctx context.Context,
) (map[string]schema.JobData, error) {
// Assumption: pdb.LoadData() only returns node-scope series - use node scope for NodeList
// Fetch Data, based on pdb.LoadNodeData()
t0 := time.Now()
// Map of hosts of jobData
data := make(map[string]schema.JobData)
// query db for each metric
// TODO: scopes seems to be always empty
if len(scopes) == 0 || !contains(scopes, schema.MetricScopeNode) {
scopes = append(scopes, schema.MetricScopeNode)
}
for _, scope := range scopes {
if scope != schema.MetricScopeNode {
logOnce.Do(func() {
cclog.Infof("Note: Scope '%s' requested, but not yet supported: Will return 'node' scope only.", scope)
})
continue
}
for _, metric := range metrics {
metricConfig := archive.GetMetricConfig(cluster, metric)
if metricConfig == nil {
cclog.Warnf("Error in LoadNodeListData: Metric %s for cluster %s not configured", metric, cluster)
return nil, errors.New("Prometheus config error")
}
query, err := pdb.FormatQuery(metric, scope, nodes, cluster)
if err != nil {
cclog.Warn("Error while formatting prometheus query")
return nil, err
}
// ranged query over all nodes
r := promv1.Range{
Start: from,
End: to,
Step: time.Duration(metricConfig.Timestep * 1e9),
}
result, warnings, err := pdb.queryClient.QueryRange(ctx, query, r)
if err != nil {
cclog.Errorf("Prometheus query error in LoadNodeData: %v\n", err)
return nil, errors.New("Prometheus query error")
}
if len(warnings) > 0 {
cclog.Warnf("Warnings: %v\n", warnings)
}
step := int64(metricConfig.Timestep)
steps := int64(to.Sub(from).Seconds()) / step
// iter rows of host, metric, values
for _, row := range result.(promm.Matrix) {
hostname := strings.TrimSuffix(string(row.Metric["exported_instance"]), pdb.suffix)
hostdata, ok := data[hostname]
if !ok {
hostdata = make(schema.JobData)
data[hostname] = hostdata
}
metricdata, ok := hostdata[metric]
if !ok {
metricdata = make(map[schema.MetricScope]*schema.JobMetric)
data[hostname][metric] = metricdata
}
// output per host, metric and scope
scopeData, ok := metricdata[scope]
if !ok {
scopeData = &schema.JobMetric{
Unit: metricConfig.Unit,
Timestep: metricConfig.Timestep,
Series: []schema.Series{pdb.RowToSeries(from, step, steps, row)},
}
data[hostname][metric][scope] = scopeData
}
}
}
}
t1 := time.Since(t0)
cclog.Debugf("LoadNodeListData of %v nodes took %s", len(data), t1)
return data, nil
}

View File

@@ -1,118 +0,0 @@
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved. This file is part of cc-backend.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
package metricdata
import (
"context"
"encoding/json"
"time"
"github.com/ClusterCockpit/cc-lib/schema"
)
var TestLoadDataCallback func(job *schema.Job, metrics []string, scopes []schema.MetricScope, ctx context.Context, resolution int) (schema.JobData, error) = func(job *schema.Job, metrics []string, scopes []schema.MetricScope, ctx context.Context, resolution int) (schema.JobData, error) {
panic("TODO")
}
// TestMetricDataRepository is only a mock for unit-testing.
type TestMetricDataRepository struct{}
func (tmdr *TestMetricDataRepository) Init(_ json.RawMessage) error {
return nil
}
func (tmdr *TestMetricDataRepository) LoadData(
job *schema.Job,
metrics []string,
scopes []schema.MetricScope,
ctx context.Context,
resolution int,
) (schema.JobData, error) {
return TestLoadDataCallback(job, metrics, scopes, ctx, resolution)
}
func (tmdr *TestMetricDataRepository) LoadStats(
job *schema.Job,
metrics []string,
ctx context.Context,
) (map[string]map[string]schema.MetricStatistics, error) {
panic("TODO")
}
func (tmdr *TestMetricDataRepository) LoadScopedStats(
job *schema.Job,
metrics []string,
scopes []schema.MetricScope,
ctx context.Context,
) (schema.ScopedJobStats, error) {
panic("TODO")
}
func (tmdr *TestMetricDataRepository) LoadNodeData(
cluster string,
metrics, nodes []string,
scopes []schema.MetricScope,
from, to time.Time,
ctx context.Context,
) (map[string]map[string][]*schema.JobMetric, error) {
panic("TODO")
}
func (tmdr *TestMetricDataRepository) LoadNodeListData(
cluster, subCluster string,
nodes []string,
metrics []string,
scopes []schema.MetricScope,
resolution int,
from, to time.Time,
ctx context.Context,
) (map[string]schema.JobData, error) {
panic("TODO")
}
func DeepCopy(jdTemp schema.JobData) schema.JobData {
jd := make(schema.JobData, len(jdTemp))
for k, v := range jdTemp {
jd[k] = make(map[schema.MetricScope]*schema.JobMetric, len(jdTemp[k]))
for k_, v_ := range v {
jd[k][k_] = new(schema.JobMetric)
jd[k][k_].Series = make([]schema.Series, len(v_.Series))
for i := 0; i < len(v_.Series); i += 1 {
jd[k][k_].Series[i].Data = make([]schema.Float, len(v_.Series[i].Data))
copy(jd[k][k_].Series[i].Data, v_.Series[i].Data)
jd[k][k_].Series[i].Hostname = v_.Series[i].Hostname
jd[k][k_].Series[i].Id = v_.Series[i].Id
jd[k][k_].Series[i].Statistics.Avg = v_.Series[i].Statistics.Avg
jd[k][k_].Series[i].Statistics.Min = v_.Series[i].Statistics.Min
jd[k][k_].Series[i].Statistics.Max = v_.Series[i].Statistics.Max
}
jd[k][k_].Timestep = v_.Timestep
jd[k][k_].Unit.Base = v_.Unit.Base
jd[k][k_].Unit.Prefix = v_.Unit.Prefix
if v_.StatisticsSeries != nil {
// Init Slices
jd[k][k_].StatisticsSeries = new(schema.StatsSeries)
jd[k][k_].StatisticsSeries.Max = make([]schema.Float, len(v_.StatisticsSeries.Max))
jd[k][k_].StatisticsSeries.Min = make([]schema.Float, len(v_.StatisticsSeries.Min))
jd[k][k_].StatisticsSeries.Median = make([]schema.Float, len(v_.StatisticsSeries.Median))
jd[k][k_].StatisticsSeries.Mean = make([]schema.Float, len(v_.StatisticsSeries.Mean))
// Copy Data
copy(jd[k][k_].StatisticsSeries.Max, v_.StatisticsSeries.Max)
copy(jd[k][k_].StatisticsSeries.Min, v_.StatisticsSeries.Min)
copy(jd[k][k_].StatisticsSeries.Median, v_.StatisticsSeries.Median)
copy(jd[k][k_].StatisticsSeries.Mean, v_.StatisticsSeries.Mean)
// Handle Percentiles
for k__, v__ := range v_.StatisticsSeries.Percentiles {
jd[k][k_].StatisticsSeries.Percentiles[k__] = make([]schema.Float, len(v__))
copy(jd[k][k_].StatisticsSeries.Percentiles[k__], v__)
}
} else {
jd[k][k_].StatisticsSeries = v_.StatisticsSeries
}
}
}
return jd
}

View File

@@ -0,0 +1,29 @@
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved. This file is part of cc-backend.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
package metricdispatch
const configSchema = `{
"type": "array",
"description": "Array of metric store configurations with scope-based routing.",
"items": {
"type": "object",
"properties": {
"scope": {
"description": "Scope identifier for routing metrics (e.g., cluster name, '*' for default)",
"type": "string"
},
"url": {
"description": "URL of the metric store endpoint",
"type": "string"
},
"token": {
"description": "Authentication token for the metric store",
"type": "string"
}
},
"required": ["scope", "url", "token"]
}
}`
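// A minimal configuration matching this schema might look as follows; the URLs,
// tokens, and the "fritz" cluster name are placeholders, not defaults:
//
//	[
//	  { "scope": "*", "url": "http://localhost:8082", "token": "<jwt>" },
//	  { "scope": "fritz", "url": "http://ccms.example.com:8082", "token": "<jwt>" }
//	]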

View File

@@ -0,0 +1,533 @@
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved. This file is part of cc-backend.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
// Package metricdispatch provides a unified interface for loading and caching job metric data.
//
// This package serves as a central dispatcher that routes metric data requests to the appropriate
// backend based on job state. For running jobs, data is fetched from the metric store (e.g., cc-metric-store).
// For completed jobs, data is retrieved from the file-based job archive.
//
// # Key Features
//
// - Automatic backend selection based on job state (running vs. archived)
// - LRU cache for performance optimization (128 MB default cache size)
// - Data resampling using Largest Triangle Three Bucket algorithm for archived data
// - Automatic statistics series generation for jobs with many nodes
// - Support for scoped metrics (node, socket, accelerator, core)
//
// # Cache Behavior
//
// Cached data has different TTL (time-to-live) values depending on job state:
// - Running jobs: 2 minutes (data changes frequently)
// - Completed jobs: 5 hours (data is static)
//
// The cache key is based on job ID, state, requested metrics, scopes, and resolution.
//
// # Usage
//
// The primary entry point is LoadData, which automatically handles both running and archived jobs:
//
// jobData, err := metricdispatch.LoadData(job, metrics, scopes, ctx, resolution)
// if err != nil {
// // Handle error
// }
//
// For statistics only, use LoadJobStats, LoadScopedJobStats, or LoadAverages depending on the required format.
package metricdispatch
import (
"context"
"fmt"
"math"
"time"
"github.com/ClusterCockpit/cc-backend/pkg/archive"
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
"github.com/ClusterCockpit/cc-lib/v2/lrucache"
"github.com/ClusterCockpit/cc-lib/v2/resampler"
"github.com/ClusterCockpit/cc-lib/v2/schema"
)
// cache is an LRU cache with 128 MB capacity for storing loaded job metric data.
// The cache reduces load on both the metric store and archive backends.
var cache *lrucache.Cache = lrucache.New(128 * 1024 * 1024)
// cacheKey generates a unique cache key for a job's metric data based on job ID, state,
// requested metrics, scopes, and resolution. Duration and StartTime are intentionally excluded
// because job.ID is already unique and the cache TTL ensures entries don't persist indefinitely.
func cacheKey(
job *schema.Job,
metrics []string,
scopes []schema.MetricScope,
resolution int,
) string {
return fmt.Sprintf("%d(%s):[%v],[%v]-%d",
*job.ID, job.State, metrics, scopes, resolution)
}
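// For example (illustrative), a running job with *job.ID == 1234, metrics
// ["flops_any", "mem_bw"], node scope, and resolution 600 yields a key like
// "1234(running):[[flops_any mem_bw]],[[node]]-600".
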
// LoadData retrieves metric data for a job from the appropriate backend (memory store for running jobs,
// archive for completed jobs) and applies caching, resampling, and statistics generation as needed.
//
// For running jobs or when archive is disabled, data is fetched from the metric store.
// For completed archived jobs, data is loaded from the job archive and resampled if needed.
//
// Parameters:
// - job: The job for which to load metric data
// - metrics: List of metric names to load (nil loads all metrics for the cluster)
// - scopes: Metric scopes to include (nil defaults to node scope)
// - ctx: Context for cancellation and timeouts
// - resolution: Target number of data points for resampling (only applies to archived data)
//
// Returns the loaded job data and any error encountered. For partial errors (some metrics failed),
// the function returns the successfully loaded data with a warning logged.
func LoadData(job *schema.Job,
metrics []string,
scopes []schema.MetricScope,
ctx context.Context,
resolution int,
) (schema.JobData, error) {
data := cache.Get(cacheKey(job, metrics, scopes, resolution), func() (_ any, ttl time.Duration, size int) {
var jd schema.JobData
var err error
if job.State == schema.JobStateRunning ||
job.MonitoringStatus == schema.MonitoringStatusRunningOrArchiving {
ms, err := GetMetricDataRepo(job.Cluster, job.SubCluster)
if err != nil {
cclog.Errorf("failed to access metricDataRepo for cluster %s-%s: %s",
job.Cluster, job.SubCluster, err.Error())
return err, 0, 0
}
if scopes == nil {
scopes = append(scopes, schema.MetricScopeNode)
}
if metrics == nil {
cluster := archive.GetCluster(job.Cluster)
for _, mc := range cluster.MetricConfig {
metrics = append(metrics, mc.Name)
}
}
jd, err = ms.LoadData(job, metrics, scopes, ctx, resolution)
if err != nil {
if len(jd) != 0 {
cclog.Warnf("partial error loading metrics from store for job %d (user: %s, project: %s, cluster: %s-%s): %s",
job.JobID, job.User, job.Project, job.Cluster, job.SubCluster, err.Error())
} else {
cclog.Warnf("failed to load job data from metric store for job %d (user: %s, project: %s, cluster: %s-%s): %s",
job.JobID, job.User, job.Project, job.Cluster, job.SubCluster, err.Error())
return err, 0, 0
}
}
size = jd.Size()
} else {
var jdTemp schema.JobData
jdTemp, err = archive.GetHandle().LoadJobData(job)
if err != nil {
cclog.Warnf("failed to load job data from archive for job %d (user: %s, project: %s, cluster: %s-%s): %s",
job.JobID, job.User, job.Project, job.Cluster, job.SubCluster, err.Error())
return err, 0, 0
}
jd = deepCopy(jdTemp)
// Resample archived data using Largest Triangle Three Bucket algorithm to reduce data points
// to the requested resolution, improving transfer performance and client-side rendering.
for _, v := range jd {
for _, v_ := range v {
timestep := int64(0)
for i := 0; i < len(v_.Series); i += 1 {
v_.Series[i].Data, timestep, err = resampler.LargestTriangleThreeBucket(v_.Series[i].Data, int64(v_.Timestep), int64(resolution))
if err != nil {
return err, 0, 0
}
}
v_.Timestep = int(timestep)
}
}
// Filter job data to only include requested metrics and scopes, avoiding unnecessary data transfer.
if metrics != nil || scopes != nil {
if metrics == nil {
metrics = make([]string, 0, len(jd))
for k := range jd {
metrics = append(metrics, k)
}
}
res := schema.JobData{}
for _, metric := range metrics {
if perscope, ok := jd[metric]; ok {
if len(perscope) > 1 {
subset := make(map[schema.MetricScope]*schema.JobMetric)
for _, scope := range scopes {
if jm, ok := perscope[scope]; ok {
subset[scope] = jm
}
}
if len(subset) > 0 {
perscope = subset
}
}
res[metric] = perscope
}
}
jd = res
}
size = jd.Size()
}
ttl = 5 * time.Hour
if job.State == schema.JobStateRunning {
ttl = 2 * time.Minute
}
// Generate statistics series for jobs with many nodes to enable min/median/max graphs
// instead of overwhelming the UI with individual node lines. Note that newly calculated
// statistics use min/median/max, while archived statistics may use min/mean/max.
const maxSeriesSize int = 15
for _, scopes := range jd {
for _, jm := range scopes {
if jm.StatisticsSeries != nil || len(jm.Series) <= maxSeriesSize {
continue
}
jm.AddStatisticsSeries()
}
}
nodeScopeRequested := false
for _, scope := range scopes {
if scope == schema.MetricScopeNode {
nodeScopeRequested = true
}
}
if nodeScopeRequested {
jd.AddNodeScope("flops_any")
jd.AddNodeScope("mem_bw")
}
// Round Resulting Stat Values
jd.RoundMetricStats()
return jd, ttl, size
})
if err, ok := data.(error); ok {
cclog.Errorf("error in cached dataset for job %d: %s", job.JobID, err.Error())
return nil, err
}
return data.(schema.JobData), nil
}
// LoadAverages computes average values for the specified metrics across all nodes of a job.
// For running jobs, it loads statistics from the metric store. For completed jobs, it uses
// the pre-calculated averages from the job archive. The results are appended to the data slice.
func LoadAverages(
job *schema.Job,
metrics []string,
data [][]schema.Float,
ctx context.Context,
) error {
if job.State != schema.JobStateRunning {
return archive.LoadAveragesFromArchive(job, metrics, data) // #166 change also here?
}
ms, err := GetMetricDataRepo(job.Cluster, job.SubCluster)
if err != nil {
cclog.Errorf("failed to access metricDataRepo for cluster %s-%s: %s",
job.Cluster, job.SubCluster, err.Error())
return err
}
stats, err := ms.LoadStats(job, metrics, ctx)
if err != nil {
cclog.Warnf("failed to load statistics from metric store for job %d (user: %s, project: %s, cluster: %s-%s): %s",
job.JobID, job.User, job.Project, job.Cluster, job.SubCluster, err.Error())
return err
}
for i, m := range metrics {
nodes, ok := stats[m]
if !ok {
data[i] = append(data[i], schema.NaN)
continue
}
sum := 0.0
for _, node := range nodes {
sum += node.Avg
}
data[i] = append(data[i], schema.Float(sum))
}
return nil
}
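// For example (illustrative), a caller collecting footprint values can prepare
// one row per metric and let LoadAverages append a single value to each row:
//
//	metrics := []string{"flops_any", "mem_bw"}
//	data := make([][]schema.Float, len(metrics))
//	if err := metricdispatch.LoadAverages(job, metrics, data, ctx); err == nil {
//		// data[0] and data[1] each now hold one value: the sum of the
//		// per-node averages for the respective metric.
//	}
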
// LoadScopedJobStats retrieves job statistics organized by metric scope (node, socket, core, accelerator).
// For running jobs, statistics are computed from the metric store. For completed jobs, pre-calculated
// statistics are loaded from the job archive.
func LoadScopedJobStats(
job *schema.Job,
metrics []string,
scopes []schema.MetricScope,
ctx context.Context,
) (schema.ScopedJobStats, error) {
if job.State != schema.JobStateRunning {
return archive.LoadScopedStatsFromArchive(job, metrics, scopes)
}
ms, err := GetMetricDataRepo(job.Cluster, job.SubCluster)
if err != nil {
cclog.Errorf("failed to access metricDataRepo for cluster %s-%s: %s",
job.Cluster, job.SubCluster, err.Error())
return nil, err
}
scopedStats, err := ms.LoadScopedStats(job, metrics, scopes, ctx)
if err != nil {
cclog.Warnf("failed to load scoped statistics from metric store for job %d (user: %s, project: %s, cluster: %s-%s): %s",
job.JobID, job.User, job.Project, job.Cluster, job.SubCluster, err.Error())
return nil, err
}
// Round Resulting Stat Values
scopedStats.RoundScopedMetricStats()
return scopedStats, nil
}
// LoadJobStats retrieves aggregated statistics (min/avg/max) for each requested metric across all job nodes.
// For running jobs, statistics are computed from the metric store. For completed jobs, pre-calculated
// statistics are loaded from the job archive.
func LoadJobStats(
job *schema.Job,
metrics []string,
ctx context.Context,
) (map[string]schema.MetricStatistics, error) {
if job.State != schema.JobStateRunning {
return archive.LoadStatsFromArchive(job, metrics)
}
ms, err := GetMetricDataRepo(job.Cluster, job.SubCluster)
if err != nil {
cclog.Errorf("failed to access metricDataRepo for cluster %s-%s: %s",
job.Cluster, job.SubCluster, err.Error())
return nil, err
}
data := make(map[string]schema.MetricStatistics, len(metrics))
stats, err := ms.LoadStats(job, metrics, ctx)
if err != nil {
cclog.Warnf("failed to load statistics from metric store for job %d (user: %s, project: %s, cluster: %s-%s): %s",
job.JobID, job.User, job.Project, job.Cluster, job.SubCluster, err.Error())
return data, err
}
for _, m := range metrics {
nodes, ok := stats[m]
if !ok || len(nodes) == 0 {
data[m] = schema.MetricStatistics{Min: 0.0, Avg: 0.0, Max: 0.0}
continue
}
// Start min/max at the opposite extremes so the first node's values are
// always adopted; initializing both to 0.0 would clamp Min at <= 0.
sum, min, max := 0.0, math.MaxFloat64, -math.MaxFloat64
for _, node := range nodes {
sum += node.Avg
min = math.Min(min, node.Min)
max = math.Max(max, node.Max)
}
data[m] = schema.MetricStatistics{
Avg: (math.Round((sum/float64(job.NumNodes))*100) / 100),
Min: (math.Round(min*100) / 100),
Max: (math.Round(max*100) / 100),
}
}
return data, nil
}
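// For example (illustrative):
//
//	stats, err := metricdispatch.LoadJobStats(job, []string{"cpu_load"}, ctx)
//	// On success, stats["cpu_load"] holds the job-wide Min/Avg/Max,
//	// rounded to two decimal places.
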
// LoadNodeData retrieves metric data for specific nodes in a cluster within a time range.
// This is used for node monitoring views and system status pages. Data is always fetched from
// the metric store (not the archive) since it's for current/recent node status monitoring.
//
// Returns a nested map structure: node -> metric -> scoped data.
// FIXME: Add support for subcluster specific cc-metric-stores
func LoadNodeData(
cluster string,
metrics, nodes []string,
scopes []schema.MetricScope,
from, to time.Time,
ctx context.Context,
) (map[string]map[string][]*schema.JobMetric, error) {
if metrics == nil {
for _, m := range archive.GetCluster(cluster).MetricConfig {
metrics = append(metrics, m.Name)
}
}
ms, err := GetMetricDataRepo(cluster, "")
if err != nil {
cclog.Errorf("failed to access metricDataRepo for cluster %s: %s",
cluster, err.Error())
return nil, err
}
data, err := ms.LoadNodeData(cluster, metrics, nodes, scopes, from, to, ctx)
if err != nil {
if len(data) != 0 {
cclog.Warnf("partial error loading node data from metric store for cluster %s: %s", cluster, err.Error())
} else {
cclog.Warnf("failed to load node data from metric store for cluster %s: %s", cluster, err.Error())
return nil, err
}
}
if data == nil {
return nil, fmt.Errorf("metric store for cluster '%s' does not support node data queries", cluster)
}
return data, nil
}
// LoadNodeListData retrieves time-series metric data for multiple nodes within a time range,
// with optional resampling and automatic statistics generation for large datasets.
// This is used for comparing multiple nodes or displaying node status over time.
//
// Returns a map of node names to their job-like metric data structures.
func LoadNodeListData(
cluster, subCluster string,
nodes []string,
metrics []string,
scopes []schema.MetricScope,
resolution int,
from, to time.Time,
ctx context.Context,
) (map[string]schema.JobData, error) {
if metrics == nil {
for _, m := range archive.GetCluster(cluster).MetricConfig {
metrics = append(metrics, m.Name)
}
}
ms, err := GetMetricDataRepo(cluster, subCluster)
if err != nil {
cclog.Errorf("failed to access metricDataRepo for cluster %s-%s: %s",
cluster, subCluster, err.Error())
return nil, err
}
data, err := ms.LoadNodeListData(cluster, subCluster, nodes, metrics, scopes, resolution, from, to, ctx)
if err != nil {
if len(data) != 0 {
cclog.Warnf("partial error loading node list data from metric store for cluster %s, subcluster %s: %s",
cluster, subCluster, err.Error())
} else {
cclog.Warnf("failed to load node list data from metric store for cluster %s, subcluster %s: %s",
cluster, subCluster, err.Error())
return nil, err
}
}
// Generate statistics series for datasets with many series to improve visualization performance.
// Statistics are calculated as min/median/max.
const maxSeriesSize int = 8
for _, jd := range data {
for _, scopes := range jd {
for _, jm := range scopes {
if jm.StatisticsSeries != nil || len(jm.Series) < maxSeriesSize {
continue
}
jm.AddStatisticsSeries()
}
}
}
if data == nil {
return nil, fmt.Errorf("metric store for cluster '%s' does not support node list queries", cluster)
}
return data, nil
}
// deepCopy creates a deep copy of JobData to prevent cache corruption when modifying
// archived data (e.g., during resampling). This ensures the cached archive data remains
// immutable while allowing per-request transformations.
func deepCopy(source schema.JobData) schema.JobData {
result := make(schema.JobData, len(source))
for metricName, scopeMap := range source {
result[metricName] = make(map[schema.MetricScope]*schema.JobMetric, len(scopeMap))
for scope, jobMetric := range scopeMap {
result[metricName][scope] = copyJobMetric(jobMetric)
}
}
return result
}
func copyJobMetric(src *schema.JobMetric) *schema.JobMetric {
dst := &schema.JobMetric{
Timestep: src.Timestep,
Unit: src.Unit,
Series: make([]schema.Series, len(src.Series)),
}
for i := range src.Series {
dst.Series[i] = copySeries(&src.Series[i])
}
if src.StatisticsSeries != nil {
dst.StatisticsSeries = copyStatisticsSeries(src.StatisticsSeries)
}
return dst
}
func copySeries(src *schema.Series) schema.Series {
dst := schema.Series{
Hostname: src.Hostname,
ID: src.ID,
Statistics: src.Statistics,
Data: make([]schema.Float, len(src.Data)),
}
copy(dst.Data, src.Data)
return dst
}
func copyStatisticsSeries(src *schema.StatsSeries) *schema.StatsSeries {
dst := &schema.StatsSeries{
Min: make([]schema.Float, len(src.Min)),
Mean: make([]schema.Float, len(src.Mean)),
Median: make([]schema.Float, len(src.Median)),
Max: make([]schema.Float, len(src.Max)),
}
copy(dst.Min, src.Min)
copy(dst.Mean, src.Mean)
copy(dst.Median, src.Median)
copy(dst.Max, src.Max)
if len(src.Percentiles) > 0 {
dst.Percentiles = make(map[int][]schema.Float, len(src.Percentiles))
for percentile, values := range src.Percentiles {
dst.Percentiles[percentile] = make([]schema.Float, len(values))
copy(dst.Percentiles[percentile], values)
}
}
return dst
}

View File

@@ -0,0 +1,125 @@
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved. This file is part of cc-backend.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
package metricdispatch
import (
"testing"
"github.com/ClusterCockpit/cc-lib/v2/schema"
)
func TestDeepCopy(t *testing.T) {
nodeId := "0"
original := schema.JobData{
"cpu_load": {
schema.MetricScopeNode: &schema.JobMetric{
Timestep: 60,
Unit: schema.Unit{Base: "load", Prefix: ""},
Series: []schema.Series{
{
Hostname: "node001",
ID: &nodeId,
Data: []schema.Float{1.0, 2.0, 3.0},
Statistics: schema.MetricStatistics{
Min: 1.0,
Avg: 2.0,
Max: 3.0,
},
},
},
StatisticsSeries: &schema.StatsSeries{
Min: []schema.Float{1.0, 1.5, 2.0},
Mean: []schema.Float{2.0, 2.5, 3.0},
Median: []schema.Float{2.0, 2.5, 3.0},
Max: []schema.Float{3.0, 3.5, 4.0},
Percentiles: map[int][]schema.Float{
25: {1.5, 2.0, 2.5},
75: {2.5, 3.0, 3.5},
},
},
},
},
}
copied := deepCopy(original)
original["cpu_load"][schema.MetricScopeNode].Series[0].Data[0] = 999.0
original["cpu_load"][schema.MetricScopeNode].StatisticsSeries.Min[0] = 888.0
original["cpu_load"][schema.MetricScopeNode].StatisticsSeries.Percentiles[25][0] = 777.0
if copied["cpu_load"][schema.MetricScopeNode].Series[0].Data[0] != 1.0 {
t.Errorf("Series data was not deeply copied: got %v, want 1.0",
copied["cpu_load"][schema.MetricScopeNode].Series[0].Data[0])
}
if copied["cpu_load"][schema.MetricScopeNode].StatisticsSeries.Min[0] != 1.0 {
t.Errorf("StatisticsSeries was not deeply copied: got %v, want 1.0",
copied["cpu_load"][schema.MetricScopeNode].StatisticsSeries.Min[0])
}
if copied["cpu_load"][schema.MetricScopeNode].StatisticsSeries.Percentiles[25][0] != 1.5 {
t.Errorf("Percentiles was not deeply copied: got %v, want 1.5",
copied["cpu_load"][schema.MetricScopeNode].StatisticsSeries.Percentiles[25][0])
}
if copied["cpu_load"][schema.MetricScopeNode].Timestep != 60 {
t.Errorf("Timestep not copied correctly: got %v, want 60",
copied["cpu_load"][schema.MetricScopeNode].Timestep)
}
if copied["cpu_load"][schema.MetricScopeNode].Series[0].Hostname != "node001" {
t.Errorf("Hostname not copied correctly: got %v, want node001",
copied["cpu_load"][schema.MetricScopeNode].Series[0].Hostname)
}
}
func TestDeepCopyNilStatisticsSeries(t *testing.T) {
original := schema.JobData{
"mem_used": {
schema.MetricScopeNode: &schema.JobMetric{
Timestep: 60,
Series: []schema.Series{
{
Hostname: "node001",
Data: []schema.Float{1.0, 2.0},
},
},
StatisticsSeries: nil,
},
},
}
copied := deepCopy(original)
if copied["mem_used"][schema.MetricScopeNode].StatisticsSeries != nil {
t.Errorf("StatisticsSeries should be nil, got %v",
copied["mem_used"][schema.MetricScopeNode].StatisticsSeries)
}
}
func TestDeepCopyEmptyPercentiles(t *testing.T) {
original := schema.JobData{
"cpu_load": {
schema.MetricScopeNode: &schema.JobMetric{
Timestep: 60,
Series: []schema.Series{},
StatisticsSeries: &schema.StatsSeries{
Min: []schema.Float{1.0},
Mean: []schema.Float{2.0},
Median: []schema.Float{2.0},
Max: []schema.Float{3.0},
Percentiles: nil,
},
},
},
}
copied := deepCopy(original)
if copied["cpu_load"][schema.MetricScopeNode].StatisticsSeries.Percentiles != nil {
t.Errorf("Percentiles should be nil when source is nil/empty")
}
}

View File

@@ -0,0 +1,123 @@
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
package metricdispatch
import (
"bytes"
"context"
"encoding/json"
"fmt"
"time"
"github.com/ClusterCockpit/cc-backend/internal/config"
ccms "github.com/ClusterCockpit/cc-backend/internal/metricstoreclient"
"github.com/ClusterCockpit/cc-backend/pkg/metricstore"
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
"github.com/ClusterCockpit/cc-lib/v2/schema"
)
type MetricDataRepository interface {
// Return the JobData for the given job, only with the requested metrics.
LoadData(job *schema.Job,
metrics []string,
scopes []schema.MetricScope,
ctx context.Context,
resolution int) (schema.JobData, error)
// Return a map of metrics to a map of nodes to the metric statistics of the job. node scope only.
LoadStats(job *schema.Job,
metrics []string,
ctx context.Context) (map[string]map[string]schema.MetricStatistics, error)
// Return a map of metrics to a map of scopes to the scoped metric statistics of the job.
LoadScopedStats(job *schema.Job,
metrics []string,
scopes []schema.MetricScope,
ctx context.Context) (schema.ScopedJobStats, error)
// Return a map of hosts to a map of metrics at the requested scopes (currently only node) for that node.
LoadNodeData(cluster string,
metrics, nodes []string,
scopes []schema.MetricScope,
from, to time.Time,
ctx context.Context) (map[string]map[string][]*schema.JobMetric, error)
// Return a map of hosts to a map of metrics to a map of scopes for multiple nodes.
LoadNodeListData(cluster, subCluster string,
nodes []string,
metrics []string,
scopes []schema.MetricScope,
resolution int,
from, to time.Time,
ctx context.Context) (map[string]schema.JobData, error)
// HealthCheck evaluates the monitoring state for a set of nodes against expected metrics.
HealthCheck(cluster string,
nodes []string,
metrics []string) (map[string]metricstore.HealthCheckResult, error)
}
type CCMetricStoreConfig struct {
Scope string `json:"scope"`
URL string `json:"url"`
Token string `json:"token"`
}
var metricDataRepos map[string]MetricDataRepository = map[string]MetricDataRepository{}
func Init(rawConfig json.RawMessage) error {
if rawConfig != nil {
var configs []CCMetricStoreConfig
config.Validate(configSchema, rawConfig)
dec := json.NewDecoder(bytes.NewReader(rawConfig))
dec.DisallowUnknownFields()
if err := dec.Decode(&configs); err != nil {
return fmt.Errorf("[METRICDISPATCH]> External Metric Store Config Init: Could not decode config file '%s' Error: %s", rawConfig, err.Error())
}
if len(configs) == 0 {
return fmt.Errorf("[METRICDISPATCH]> No external metric store configurations found in config file")
}
for _, config := range configs {
metricDataRepos[config.Scope] = ccms.NewCCMetricStore(config.URL, config.Token)
}
}
return nil
}
func GetMetricDataRepo(cluster string, subcluster string) (MetricDataRepository, error) {
var repo MetricDataRepository
var ok bool
key := cluster + "-" + subcluster
repo, ok = metricDataRepos[key]
if !ok {
repo, ok = metricDataRepos[cluster]
if !ok {
repo, ok = metricDataRepos["*"]
if !ok {
if metricstore.MetricStoreHandle == nil {
return nil, fmt.Errorf("[METRICDISPATCH]> no metric data repository configured '%s'", key)
}
repo = metricstore.MetricStoreHandle
cclog.Debugf("[METRICDISPATCH]> Using internal metric data repository for '%s'", key)
}
}
}
return repo, nil
}
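// For example (illustrative, with hypothetical cluster/subcluster names),
// GetMetricDataRepo("fritz", "spr") resolves a repository in this order:
//
//	1. metricDataRepos["fritz-spr"]  // subcluster-specific entry
//	2. metricDataRepos["fritz"]      // cluster-wide entry
//	3. metricDataRepos["*"]          // wildcard entry
//	4. metricstore.MetricStoreHandle // built-in internal metric store
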
// GetHealthCheckRepo returns the MetricDataRepository for performing health checks on a cluster.
// It uses the same fallback logic as GetMetricDataRepo: cluster → wildcard → internal.
func GetHealthCheckRepo(cluster string) (MetricDataRepository, error) {
return GetMetricDataRepo(cluster, "")
}

View File

@@ -0,0 +1,239 @@
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
// Package metricstoreclient - Query Building
//
// This file contains the query construction and scope transformation logic for cc-metric-store queries.
// It handles the complex mapping between requested metric scopes and native hardware topology,
// automatically aggregating or filtering metrics as needed.
//
// # Scope Transformations
//
// The buildScopeQueries function implements the core scope transformation algorithm.
// It handles 25+ different transformation cases, mapping between:
// - Accelerator (GPU) scope
// - HWThread (hardware thread/SMT) scope
// - Core (CPU core) scope
// - Socket (CPU package) scope
// - MemoryDomain (NUMA domain) scope
// - Node (full system) scope
//
// Transformations follow these rules:
// - Same scope: Return data as-is (e.g., Core → Core)
// - Coarser scope: Aggregate data (e.g., Core → Socket with Aggregate=true)
// - Finer scope: Error - cannot increase granularity
//
// # Query Building
//
// buildQueries and buildNodeQueries are the main entry points, handling job-specific
// and node-specific query construction respectively. They:
// - Validate metric configurations
// - Handle subcluster-specific metric filtering
// - Detect and skip duplicate scope requests
// - Call buildScopeQueries for each metric/scope/host combination
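//
// For example (illustrative): a metric recorded natively at hwthread scope but
// requested at socket scope is emitted as one aggregated query per socket
// (Aggregate=true over that socket's hwthread IDs), whereas requesting hwthread
// granularity for a socket-native metric fails as an unsupported transformation.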
package metricstoreclient
import (
"fmt"
"github.com/ClusterCockpit/cc-backend/pkg/archive"
"github.com/ClusterCockpit/cc-backend/pkg/metricstore"
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
"github.com/ClusterCockpit/cc-lib/v2/schema"
)
// buildQueries constructs API queries for job-specific metric data.
// It iterates through metrics, scopes, and job resources to build the complete query set.
//
// The function handles:
// - Metric configuration validation and subcluster filtering
// - Scope deduplication to avoid redundant queries
// - Hardware thread list resolution (job-allocated vs full node)
// - Delegation to buildScopeQueries for scope transformations
//
// Returns queries and their corresponding assigned scopes (which may differ from requested scopes).
func (ccms *CCMetricStore) buildQueries(
job *schema.Job,
metrics []string,
scopes []schema.MetricScope,
resolution int,
) ([]APIQuery, []schema.MetricScope, error) {
// Initialize both slices together
queries := make([]APIQuery, 0, len(metrics)*len(scopes)*len(job.Resources))
assignedScope := make([]schema.MetricScope, 0, len(metrics)*len(scopes)*len(job.Resources))
topology, err := ccms.getTopology(job.Cluster, job.SubCluster)
if err != nil {
cclog.Errorf("could not load cluster %s subCluster %s topology: %s", job.Cluster, job.SubCluster, err.Error())
return nil, nil, err
}
for _, metric := range metrics {
remoteName := metric
mc := archive.GetMetricConfig(job.Cluster, metric)
if mc == nil {
cclog.Warnf("metric '%s' is not specified for cluster '%s' - skipping", metric, job.Cluster)
continue
}
// Skip if metric is removed for subcluster
if len(mc.SubClusters) != 0 && metricstore.IsMetricRemovedForSubCluster(mc, job.SubCluster) {
continue
}
// Avoid duplicates...
handledScopes := make([]schema.MetricScope, 0, 3)
scopesLoop:
for _, requestedScope := range scopes {
nativeScope := mc.Scope
if nativeScope == schema.MetricScopeAccelerator && job.NumAcc == 0 {
continue
}
scope := nativeScope.Max(requestedScope)
for _, s := range handledScopes {
if scope == s {
continue scopesLoop
}
}
handledScopes = append(handledScopes, scope)
for _, host := range job.Resources {
hwthreads := host.HWThreads
if hwthreads == nil {
hwthreads = topology.Node
}
scopeResults, ok := metricstore.BuildScopeQueries(
nativeScope, requestedScope,
remoteName, host.Hostname,
topology, hwthreads, host.Accelerators,
)
if !ok {
return nil, nil, fmt.Errorf("METRICDATA/EXTERNAL-CCMS > unsupported scope transformation: native-scope=%s, requested-scope=%s", nativeScope, requestedScope)
}
for _, sr := range scopeResults {
queries = append(queries, APIQuery{
Metric: sr.Metric,
Hostname: sr.Hostname,
Aggregate: sr.Aggregate,
Type: sr.Type,
TypeIds: sr.TypeIds,
Resolution: resolution,
})
assignedScope = append(assignedScope, sr.Scope)
}
}
}
}
return queries, assignedScope, nil
}
// buildNodeQueries constructs API queries for node-specific metric data (Systems View).
// Similar to buildQueries but uses full node topology instead of job-allocated resources.
//
// The function handles:
// - SubCluster topology resolution (either pre-loaded or per-node lookup)
// - Full node hardware thread lists (not job-specific subsets)
// - All accelerators on each node
// - Metric configuration validation with subcluster filtering
//
// Returns queries and their corresponding assigned scopes.
func (ccms *CCMetricStore) buildNodeQueries(
cluster string,
subCluster string,
nodes []string,
metrics []string,
scopes []schema.MetricScope,
resolution int,
) ([]APIQuery, []schema.MetricScope, error) {
// Initialize both slices together
queries := make([]APIQuery, 0, len(metrics)*len(scopes)*len(nodes))
assignedScope := make([]schema.MetricScope, 0, len(metrics)*len(scopes)*len(nodes))
for _, metric := range metrics {
remoteName := metric
mc := archive.GetMetricConfig(cluster, metric)
if mc == nil {
cclog.Warnf("metric '%s' is not specified for cluster '%s'", metric, cluster)
continue
}
// Skip if metric is removed for subcluster
if mc.SubClusters != nil && metricstore.IsMetricRemovedForSubCluster(mc, subCluster) {
continue
}
// Avoid duplicates...
handledScopes := make([]schema.MetricScope, 0, 3)
scopesLoop:
for _, requestedScope := range scopes {
nativeScope := mc.Scope
scope := nativeScope.Max(requestedScope)
for _, s := range handledScopes {
if scope == s {
continue scopesLoop
}
}
handledScopes = append(handledScopes, scope)
for _, hostname := range nodes {
var topology *schema.Topology
var err error
// If no subCluster given, get it by node
if subCluster == "" {
topology, err = ccms.getTopologyByNode(cluster, hostname)
} else {
topology, err = ccms.getTopology(cluster, subCluster)
}
if err != nil {
return nil, nil, err
}
// Always full node hwthread id list, no partial queries expected -> Use "topology.Node" directly where applicable
// Always full accelerator id list, no partial queries expected -> Use "acceleratorIds" directly where applicable
acceleratorIds := topology.GetAcceleratorIDs()
// Moved check here if metric matches hardware specs
if nativeScope == schema.MetricScopeAccelerator && len(acceleratorIds) == 0 {
continue scopesLoop
}
scopeResults, ok := metricstore.BuildScopeQueries(
nativeScope, requestedScope,
remoteName, hostname,
topology, topology.Node, acceleratorIds,
)
if !ok {
return nil, nil, fmt.Errorf("METRICDATA/EXTERNAL-CCMS > unsupported scope transformation: native-scope=%s, requested-scope=%s", nativeScope, requestedScope)
}
for _, sr := range scopeResults {
queries = append(queries, APIQuery{
Metric: sr.Metric,
Hostname: sr.Hostname,
Aggregate: sr.Aggregate,
Type: sr.Type,
TypeIds: sr.TypeIds,
Resolution: resolution,
})
assignedScope = append(assignedScope, sr.Scope)
}
}
}
}
return queries, assignedScope, nil
}

View File

@@ -0,0 +1,796 @@
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
// Package metricstoreclient provides a client for querying the cc-metric-store time series database.
//
// The cc-metric-store is a high-performance time series database optimized for HPC metric data.
// This client handles HTTP communication, query construction, scope transformations, and data retrieval
// for job and node metrics across different metric scopes (node, socket, core, hwthread, accelerator).
//
// # Architecture
//
// The package is split into two main components:
// - Client Operations (cc-metric-store.go): HTTP client, request handling, data loading methods
// - Query Building (cc-metric-store-queries.go): Query construction and scope transformation logic
//
// # Basic Usage
//
// store := NewCCMetricStore("http://localhost:8080", "jwt-token")
//
// // Load job data
// jobData, err := store.LoadData(job, metrics, scopes, ctx, resolution)
// if err != nil {
// log.Fatal(err)
// }
//
// # Metric Scopes
//
// The client supports hierarchical metric scopes that map to HPC hardware topology:
// - MetricScopeAccelerator: GPU/accelerator level metrics
// - MetricScopeHWThread: Hardware thread (SMT) level metrics
// - MetricScopeCore: CPU core level metrics
// - MetricScopeSocket: CPU socket level metrics
// - MetricScopeMemoryDomain: NUMA domain level metrics
// - MetricScopeNode: Full node level metrics
//
// The client automatically handles scope transformations, aggregating finer-grained metrics
// to coarser scopes when needed (e.g., aggregating core metrics to socket level).
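//
// A minimal sketch of such a transformation (the metric name is illustrative
// and assumed to be core-native on the target cluster):
//
// // Requesting socket scope for a core-native metric: the client
// // aggregates the per-core series up to socket level.
// jobData, err := store.LoadData(job, []string{"flops_any"},
// []schema.MetricScope{schema.MetricScopeSocket}, ctx, 60)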
//
// # Error Handling
//
// The client supports partial errors - if some queries fail, it returns both the successful
// data and an error listing the failed queries. This allows processing partial results
// when some nodes or metrics are temporarily unavailable.
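//
// A sketch of consuming partial results (the logging call is illustrative):
//
// jobData, err := store.LoadData(job, metrics, scopes, ctx, 0)
// if err != nil && len(jobData) > 0 {
// // Partial failure: report it, but keep the series that did load.
// cclog.Warnf("partial metric data: %v", err)
// }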
//
// # API Versioning
//
// The client uses cc-metric-store API v2, which includes support for:
// - Data resampling for bandwidth optimization
// - Multi-scope queries in a single request
// - Aggregation across hardware topology levels
package metricstoreclient
import (
"bufio"
"bytes"
"context"
"encoding/json"
"fmt"
"net/http"
"strings"
"time"
"github.com/ClusterCockpit/cc-backend/pkg/archive"
ms "github.com/ClusterCockpit/cc-backend/pkg/metricstore"
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
"github.com/ClusterCockpit/cc-lib/v2/schema"
)
// CCMetricStore is the HTTP client for communicating with cc-metric-store.
// It manages connection details, authentication, and provides methods for querying metrics.
type CCMetricStore struct {
client http.Client // HTTP client with 10-second timeout
jwt string // JWT Bearer token for authentication
url string // Base URL of cc-metric-store instance
queryEndpoint string // Full URL to query API endpoint
topologyCache map[string]*schema.Topology // cluster -> topology cache
}
// APIQueryRequest represents a request to the cc-metric-store query API.
// It supports both explicit queries and "for-all-nodes" bulk queries.
type APIQueryRequest struct {
Cluster string `json:"cluster"` // Target cluster name
Queries []APIQuery `json:"queries"` // Explicit list of metric queries
ForAllNodes []string `json:"for-all-nodes"` // Metrics to query for all nodes
From int64 `json:"from"` // Start time (Unix timestamp)
To int64 `json:"to"` // End time (Unix timestamp)
WithStats bool `json:"with-stats"` // Include min/avg/max statistics
WithData bool `json:"with-data"` // Include time series data points
}
// APIQuery specifies a single metric query with optional scope filtering.
// Type and TypeIds define the hardware scope (core, socket, accelerator, etc.).
type APIQuery struct {
Type *string `json:"type,omitempty"` // Scope type (e.g., "core", "socket")
SubType *string `json:"subtype,omitempty"` // Sub-scope type (reserved for future use)
Metric string `json:"metric"` // Metric name
Hostname string `json:"host"` // Target hostname
Resolution int `json:"resolution"` // Data resolution in seconds (0 = native)
TypeIds []string `json:"type-ids,omitempty"` // IDs for the scope type (e.g., core IDs)
SubTypeIds []string `json:"subtype-ids,omitempty"` // IDs for sub-scope (reserved)
Aggregate bool `json:"aggreg"` // Aggregate across TypeIds
}
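// An illustrative wire-format instance of the APIQuery above, aggregating
// four cores into one series (hostname and IDs are placeholders):
//
// {
// "metric": "cpu_load", "host": "node01", "resolution": 60,
// "type": "core", "type-ids": ["0", "1", "2", "3"], "aggreg": true
// }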
// APIQueryResponse contains the results from a cc-metric-store query.
// Results align with the Queries slice by index.
type APIQueryResponse struct {
Queries []APIQuery `json:"queries,omitempty"` // Echoed queries (for bulk requests)
Results [][]APIMetricData `json:"results"` // Result data, indexed by query
}
// APIMetricData represents time series data and statistics for a single metric series.
// Error is set if this particular series failed to load.
type APIMetricData struct {
Error *string `json:"error"` // Error message if query failed
Data []schema.Float `json:"data"` // Time series data points
From int64 `json:"from"` // Actual start time of data
To int64 `json:"to"` // Actual end time of data
Resolution int `json:"resolution"` // Actual resolution of data in seconds
Avg schema.Float `json:"avg"` // Average value across time range
Min schema.Float `json:"min"` // Minimum value in time range
Max schema.Float `json:"max"` // Maximum value in time range
}
// NewCCMetricStore creates and initializes a new (external) CCMetricStore client.
// The url parameter should include the protocol and port (e.g., "http://localhost:8080").
// The token parameter is a JWT used for Bearer authentication; pass empty string if auth is disabled.
func NewCCMetricStore(url string, token string) *CCMetricStore {
return &CCMetricStore{
url: url,
queryEndpoint: fmt.Sprintf("%s/api/query", url),
jwt: token,
client: http.Client{
Timeout: 10 * time.Second,
},
topologyCache: make(map[string]*schema.Topology),
}
}
// doRequest executes an HTTP GET request with a JSON body against the cc-metric-store query API.
// It handles JSON encoding/decoding, authentication, and API versioning.
// The response body is closed after decoding to prevent resource leaks.
func (ccms *CCMetricStore) doRequest(
ctx context.Context,
body *APIQueryRequest,
) (*APIQueryResponse, error) {
buf := &bytes.Buffer{}
if err := json.NewEncoder(buf).Encode(body); err != nil {
cclog.Errorf("Error while encoding request body: %s", err.Error())
return nil, err
}
req, err := http.NewRequestWithContext(ctx, http.MethodGet, ccms.queryEndpoint, buf)
if err != nil {
cclog.Errorf("Error while building request body: %s", err.Error())
return nil, err
}
if ccms.jwt != "" {
req.Header.Add("Authorization", fmt.Sprintf("Bearer %s", ccms.jwt))
}
// Select the cc-metric-store query API version:
// v2 = data with resampling
// v1 = data without resampling
q := req.URL.Query()
q.Add("version", "v2")
req.URL.RawQuery = q.Encode()
res, err := ccms.client.Do(req)
if err != nil {
cclog.Errorf("Error while performing request: %s", err.Error())
return nil, err
}
defer res.Body.Close()
if res.StatusCode != http.StatusOK {
return nil, fmt.Errorf("'%s': HTTP Status: %s", ccms.queryEndpoint, res.Status)
}
var resBody APIQueryResponse
if err := json.NewDecoder(bufio.NewReader(res.Body)).Decode(&resBody); err != nil {
cclog.Errorf("Error while decoding result body: %s", err.Error())
return nil, err
}
return &resBody, nil
}
// getTopology returns the topology for a given cluster and subcluster, caching it if not already present
func (ccms *CCMetricStore) getTopology(cluster, subCluster string) (*schema.Topology, error) {
cacheKey := fmt.Sprintf("%s:%s", cluster, subCluster)
if topology, ok := ccms.topologyCache[cacheKey]; ok {
return topology, nil
}
subcluster, err := archive.GetSubCluster(cluster, subCluster)
if err != nil {
return nil, err
}
ccms.topologyCache[cacheKey] = &subcluster.Topology
return &subcluster.Topology, nil
}
// getTopologyByNode returns the topology for a given cluster and node, caching it if not already present
func (ccms *CCMetricStore) getTopologyByNode(cluster, node string) (*schema.Topology, error) {
subCluster, err := archive.GetSubClusterByNode(cluster, node)
if err != nil {
return nil, err
}
return ccms.getTopology(cluster, subCluster)
}
// LoadData retrieves time series data and statistics for the specified job and metrics.
// It queries data for the job's time range and resources, handling scope transformations automatically.
//
// Parameters:
// - job: Job metadata including cluster, time range, and allocated resources
// - metrics: List of metric names to retrieve
// - scopes: Requested metric scopes (node, socket, core, etc.)
// - ctx: Context for cancellation and timeouts
// - resolution: Data resolution in seconds (0 for native resolution)
//
// Returns JobData organized as: metric -> scope -> series list.
// Supports partial errors: returns available data even if some queries fail.
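//
// A sketch of walking the returned structure:
//
// for metric, perScope := range jobData {
// for scope, jm := range perScope {
// fmt.Printf("%s@%s: %d series\n", metric, scope, len(jm.Series))
// }
// }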
func (ccms *CCMetricStore) LoadData(
job *schema.Job,
metrics []string,
scopes []schema.MetricScope,
ctx context.Context,
resolution int,
) (schema.JobData, error) {
queries, assignedScope, err := ccms.buildQueries(job, metrics, scopes, resolution)
if err != nil {
cclog.Errorf("Error while building queries for jobId %d, Metrics %v, Scopes %v: %s", job.JobID, metrics, scopes, err.Error())
return nil, err
}
// Verify assignment is correct - log any inconsistencies for debugging
if len(queries) != len(assignedScope) {
cclog.Errorf("Critical error: queries and assignedScope have different lengths after buildQueries: %d vs %d",
len(queries), len(assignedScope))
}
req := APIQueryRequest{
Cluster: job.Cluster,
From: job.StartTime,
To: job.StartTime + int64(job.Duration),
Queries: queries,
WithStats: true,
WithData: true,
}
resBody, err := ccms.doRequest(ctx, &req)
if err != nil {
cclog.Errorf("Error while performing request for job %d: %s", job.JobID, err.Error())
return nil, err
}
var errors []string
jobData := make(schema.JobData)
// Add safety check for potential index out of range errors
if len(resBody.Results) != len(req.Queries) || len(assignedScope) != len(req.Queries) {
cclog.Warnf("Mismatch in query results count: queries=%d, results=%d, assignedScope=%d",
len(req.Queries), len(resBody.Results), len(assignedScope))
if len(resBody.Results) > len(req.Queries) {
resBody.Results = resBody.Results[:len(req.Queries)]
}
if len(assignedScope) > len(req.Queries) {
assignedScope = assignedScope[:len(req.Queries)]
}
}
for i, row := range resBody.Results {
query := req.Queries[i]
metric := query.Metric
scope := assignedScope[i]
mc := archive.GetMetricConfig(job.Cluster, metric)
if mc == nil {
cclog.Warnf("Metric config not found for %s on cluster %s", metric, job.Cluster)
continue
}
if _, ok := jobData[metric]; !ok {
jobData[metric] = make(map[schema.MetricScope]*schema.JobMetric)
}
res := mc.Timestep
if len(row) > 0 {
res = row[0].Resolution
}
jobMetric, ok := jobData[metric][scope]
if !ok {
jobMetric = &schema.JobMetric{
Unit: mc.Unit,
Timestep: res,
Series: make([]schema.Series, 0),
}
jobData[metric][scope] = jobMetric
}
for ndx, res := range row {
if res.Error != nil {
/* Build list for "partial errors", if any */
errors = append(errors, fmt.Sprintf("failed to fetch '%s' from host '%s': %s", query.Metric, query.Hostname, *res.Error))
continue
}
id := ms.ExtractTypeID(query.Type, query.TypeIds, ndx, query.Metric, query.Hostname)
ms.SanitizeStats(&res.Avg, &res.Min, &res.Max)
jobMetric.Series = append(jobMetric.Series, schema.Series{
Hostname: query.Hostname,
ID: id,
Statistics: schema.MetricStatistics{
Avg: float64(res.Avg),
Min: float64(res.Min),
Max: float64(res.Max),
},
Data: res.Data,
})
}
// So that one can later check len(jobData): remove from map if empty
if len(jobMetric.Series) == 0 {
delete(jobData[metric], scope)
if len(jobData[metric]) == 0 {
delete(jobData, metric)
}
}
}
if len(errors) != 0 {
/* Returns list for "partial errors" */
return jobData, fmt.Errorf("METRICDATA/EXTERNAL-CCMS > Errors: %s", strings.Join(errors, ", "))
}
return jobData, nil
}
// LoadStats retrieves min/avg/max statistics for job metrics at node scope.
// This is faster than LoadData when only statistical summaries are needed (no time series data).
//
// Returns statistics organized as: metric -> hostname -> statistics.
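//
// A sketch of consuming the result (the metric name is illustrative):
//
// stats, err := store.LoadStats(job, []string{"mem_used"}, ctx)
// if err == nil {
// for host, s := range stats["mem_used"] {
// fmt.Printf("%s: avg=%.1f min=%.1f max=%.1f\n", host, s.Avg, s.Min, s.Max)
// }
// }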
func (ccms *CCMetricStore) LoadStats(
job *schema.Job,
metrics []string,
ctx context.Context,
) (map[string]map[string]schema.MetricStatistics, error) {
queries, _, err := ccms.buildQueries(job, metrics, []schema.MetricScope{schema.MetricScopeNode}, 0) // #166 Add scope here for analysis view accelerator normalization?
if err != nil {
cclog.Errorf("Error while building queries for jobId %d, Metrics %v: %s", job.JobID, metrics, err.Error())
return nil, err
}
req := APIQueryRequest{
Cluster: job.Cluster,
From: job.StartTime,
To: job.StartTime + int64(job.Duration),
Queries: queries,
WithStats: true,
WithData: false,
}
resBody, err := ccms.doRequest(ctx, &req)
if err != nil {
cclog.Errorf("Error while performing request for job %d: %s", job.JobID, err.Error())
return nil, err
}
stats := make(map[string]map[string]schema.MetricStatistics, len(metrics))
for i, res := range resBody.Results {
if i >= len(req.Queries) {
cclog.Warnf("LoadStats: result index %d exceeds queries length %d", i, len(req.Queries))
break
}
if len(res) == 0 {
// No data found for this metric; a warning is logged in FetchData
continue
}
query := req.Queries[i]
metric := query.Metric
data := res[0]
if data.Error != nil {
cclog.Warnf("fetching %s for node %s failed: %s", metric, query.Hostname, *data.Error)
continue
}
metricdata, ok := stats[metric]
if !ok {
metricdata = make(map[string]schema.MetricStatistics, job.NumNodes)
stats[metric] = metricdata
}
if hasNaNStats(data.Avg, data.Min, data.Max) {
cclog.Warnf("fetching %s for node %s failed: one of avg/min/max is NaN", metric, query.Hostname)
continue
}
metricdata[query.Hostname] = schema.MetricStatistics{
Avg: float64(data.Avg),
Min: float64(data.Min),
Max: float64(data.Max),
}
}
return stats, nil
}
// LoadScopedStats retrieves statistics for job metrics across multiple scopes.
// Used for the Job-View Statistics Table to display per-scope breakdowns.
//
// Returns statistics organized as: metric -> scope -> list of scoped statistics.
// Each scoped statistic includes hostname, hardware ID (if applicable), and min/avg/max values.
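//
// A sketch of reading one metric at core scope (the metric name is illustrative):
//
// scoped, err := store.LoadScopedStats(job, []string{"cpu_load"}, scopes, ctx)
// if err == nil {
// for _, s := range scoped["cpu_load"][schema.MetricScopeCore] {
// fmt.Printf("%s: avg=%.1f\n", s.Hostname, s.Data.Avg)
// }
// }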
func (ccms *CCMetricStore) LoadScopedStats(
job *schema.Job,
metrics []string,
scopes []schema.MetricScope,
ctx context.Context,
) (schema.ScopedJobStats, error) {
queries, assignedScope, err := ccms.buildQueries(job, metrics, scopes, 0)
if err != nil {
cclog.Errorf("Error while building queries for jobId %d, Metrics %v, Scopes %v: %s", job.JobID, metrics, scopes, err.Error())
return nil, err
}
req := APIQueryRequest{
Cluster: job.Cluster,
From: job.StartTime,
To: job.StartTime + int64(job.Duration),
Queries: queries,
WithStats: true,
WithData: false,
}
resBody, err := ccms.doRequest(ctx, &req)
if err != nil {
cclog.Errorf("Error while performing request for job %d: %s", job.JobID, err.Error())
return nil, err
}
var errors []string
scopedJobStats := make(schema.ScopedJobStats)
for i, row := range resBody.Results {
query := req.Queries[i]
metric := query.Metric
scope := assignedScope[i]
if _, ok := scopedJobStats[metric]; !ok {
scopedJobStats[metric] = make(map[schema.MetricScope][]*schema.ScopedStats)
}
if _, ok := scopedJobStats[metric][scope]; !ok {
scopedJobStats[metric][scope] = make([]*schema.ScopedStats, 0)
}
for ndx, res := range row {
if res.Error != nil {
/* Build list for "partial errors", if any */
errors = append(errors, fmt.Sprintf("failed to fetch '%s' from host '%s': %s", query.Metric, query.Hostname, *res.Error))
continue
}
id := ms.ExtractTypeID(query.Type, query.TypeIds, ndx, query.Metric, query.Hostname)
ms.SanitizeStats(&res.Avg, &res.Min, &res.Max)
scopedJobStats[metric][scope] = append(scopedJobStats[metric][scope], &schema.ScopedStats{
Hostname: query.Hostname,
ID: id,
Data: &schema.MetricStatistics{
Avg: float64(res.Avg),
Min: float64(res.Min),
Max: float64(res.Max),
},
})
}
// So that one can later check len(scopedJobStats[metric][scope]): Remove from map if empty
if len(scopedJobStats[metric][scope]) == 0 {
delete(scopedJobStats[metric], scope)
if len(scopedJobStats[metric]) == 0 {
delete(scopedJobStats, metric)
}
}
}
if len(errors) != 0 {
/* Returns list for "partial errors" */
return scopedJobStats, fmt.Errorf("METRICDATA/EXTERNAL-CCMS > Errors: %s", strings.Join(errors, ", "))
}
return scopedJobStats, nil
}
// LoadNodeData retrieves current metric data for specified nodes in a cluster.
// Used for the Systems-View Node-Overview to display real-time node status.
//
// If nodes is nil, queries all metrics for all nodes in the cluster (bulk query).
// Returns data organized as: hostname -> metric -> list of JobMetric (with time series and stats).
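//
// An illustrative bulk query for all nodes of a cluster (names are placeholders):
//
// nodeData, err := store.LoadNodeData("testcluster", []string{"cpu_load"},
// nil, // nil node list triggers the "for-all-nodes" bulk query
// []schema.MetricScope{schema.MetricScopeNode}, from, to, ctx)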
func (ccms *CCMetricStore) LoadNodeData(
cluster string,
metrics, nodes []string,
scopes []schema.MetricScope,
from, to time.Time,
ctx context.Context,
) (map[string]map[string][]*schema.JobMetric, error) {
req := APIQueryRequest{
Cluster: cluster,
From: from.Unix(),
To: to.Unix(),
WithStats: true,
WithData: true,
}
if nodes == nil {
req.ForAllNodes = append(req.ForAllNodes, metrics...)
} else {
for _, node := range nodes {
for _, metric := range metrics {
req.Queries = append(req.Queries, APIQuery{
Hostname: node,
Metric: metric,
Resolution: 0, // Default for node queries: data is returned at the metric's native timestep resolution
})
}
}
}
resBody, err := ccms.doRequest(ctx, &req)
if err != nil {
cclog.Errorf("Error while performing request for cluster %s: %s", cluster, err.Error())
return nil, err
}
var errors []string
data := make(map[string]map[string][]*schema.JobMetric)
for i, res := range resBody.Results {
if len(res) == 0 {
// No data found for this metric; a warning is logged in FetchData
continue
}
var query APIQuery
if resBody.Queries != nil {
query = resBody.Queries[i]
} else {
query = req.Queries[i]
}
metric := query.Metric
qdata := res[0]
if qdata.Error != nil {
errors = append(errors, fmt.Sprintf("fetching %s for node %s failed: %s", metric, query.Hostname, *qdata.Error))
continue
}
mc := archive.GetMetricConfig(cluster, metric)
if mc == nil {
cclog.Warnf("Metric config not found for %s on cluster %s", metric, cluster)
continue
}
ms.SanitizeStats(&qdata.Avg, &qdata.Min, &qdata.Max)
hostdata, ok := data[query.Hostname]
if !ok {
hostdata = make(map[string][]*schema.JobMetric)
data[query.Hostname] = hostdata
}
hostdata[metric] = append(hostdata[metric], &schema.JobMetric{
Unit: mc.Unit,
Timestep: mc.Timestep,
Series: []schema.Series{
{
Hostname: query.Hostname,
Data: qdata.Data,
Statistics: schema.MetricStatistics{
Avg: float64(qdata.Avg),
Min: float64(qdata.Min),
Max: float64(qdata.Max),
},
},
},
})
}
if len(errors) != 0 {
/* Returns list of "partial errors" */
return data, fmt.Errorf("METRICDATA/EXTERNAL-CCMS > Errors: %s", strings.Join(errors, ", "))
}
return data, nil
}
// LoadNodeListData retrieves node metrics for the Systems-View Node-List.
//
// Filtering by subcluster and node name, as well as pagination, happen before
// this call: the caller supplies the already selected node list.
//
// Returns:
// - Node data organized as: hostname -> JobData (metric -> scope -> series)
// - Error (may be a partial error with some data returned)
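//
// An illustrative call for one page of nodes (names are placeholders):
//
// data, err := store.LoadNodeListData("testcluster", "main",
// []string{"node01", "node02"}, metrics, scopes, 60, from, to, ctx)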
func (ccms *CCMetricStore) LoadNodeListData(
cluster, subCluster string,
nodes []string,
metrics []string,
scopes []schema.MetricScope,
resolution int,
from, to time.Time,
ctx context.Context,
) (map[string]schema.JobData, error) {
queries, assignedScope, err := ccms.buildNodeQueries(cluster, subCluster, nodes, metrics, scopes, resolution)
if err != nil {
cclog.Errorf("Error while building node queries for Cluster %s, SubCluster %s, Metrics %v, Scopes %v: %s", cluster, subCluster, metrics, scopes, err.Error())
return nil, err
}
// Verify assignment is correct - log any inconsistencies for debugging
if len(queries) != len(assignedScope) {
cclog.Errorf("Critical error: queries and assignedScope have different lengths after buildNodeQueries: %d vs %d",
len(queries), len(assignedScope))
}
req := APIQueryRequest{
Cluster: cluster,
Queries: queries,
From: from.Unix(),
To: to.Unix(),
WithStats: true,
WithData: true,
}
resBody, err := ccms.doRequest(ctx, &req)
if err != nil {
cclog.Errorf("Error while performing request for cluster %s: %s", cluster, err.Error())
return nil, err
}
var errors []string
data := make(map[string]schema.JobData)
// Add safety check for index out of range issues
if len(resBody.Results) != len(req.Queries) || len(assignedScope) != len(req.Queries) {
cclog.Warnf("Mismatch in query results count: queries=%d, results=%d, assignedScope=%d",
len(req.Queries), len(resBody.Results), len(assignedScope))
if len(resBody.Results) > len(req.Queries) {
resBody.Results = resBody.Results[:len(req.Queries)]
}
if len(assignedScope) > len(req.Queries) {
assignedScope = assignedScope[:len(req.Queries)]
}
}
for i, row := range resBody.Results {
var query APIQuery
if resBody.Queries != nil {
if i < len(resBody.Queries) {
query = resBody.Queries[i]
} else {
cclog.Warnf("Index out of range prevented for resBody.Queries: i=%d, len=%d",
i, len(resBody.Queries))
continue
}
} else {
query = req.Queries[i]
}
metric := query.Metric
scope := assignedScope[i]
mc := archive.GetMetricConfig(cluster, metric)
if mc == nil {
cclog.Warnf("Metric config not found for %s on cluster %s", metric, cluster)
continue
}
res := mc.Timestep
if len(row) > 0 {
res = row[0].Resolution
}
// Initialize nested map data structures if not present
hostData, ok := data[query.Hostname]
if !ok {
hostData = make(schema.JobData)
data[query.Hostname] = hostData
}
metricData, ok := hostData[metric]
if !ok {
metricData = make(map[schema.MetricScope]*schema.JobMetric)
data[query.Hostname][metric] = metricData
}
scopeData, ok := metricData[scope]
if !ok {
scopeData = &schema.JobMetric{
Unit: mc.Unit,
Timestep: res,
Series: make([]schema.Series, 0),
}
data[query.Hostname][metric][scope] = scopeData
}
for ndx, res := range row {
if res.Error != nil {
/* Build list for "partial errors", if any */
errors = append(errors, fmt.Sprintf("failed to fetch '%s' from host '%s': %s", query.Metric, query.Hostname, *res.Error))
continue
}
id := ms.ExtractTypeID(query.Type, query.TypeIds, ndx, query.Metric, query.Hostname)
ms.SanitizeStats(&res.Avg, &res.Min, &res.Max)
scopeData.Series = append(scopeData.Series, schema.Series{
Hostname: query.Hostname,
ID: id,
Statistics: schema.MetricStatistics{
Avg: float64(res.Avg),
Min: float64(res.Min),
Max: float64(res.Max),
},
Data: res.Data,
})
}
}
if len(errors) != 0 {
/* Returns list of "partial errors" */
return data, fmt.Errorf("METRICDATA/EXTERNAL-CCMS > Errors: %s", strings.Join(errors, ", "))
}
return data, nil
}
// HealthCheck queries the external cc-metric-store's health check endpoint.
// It sends a HealthCheckReq as the request body to /api/healthcheck and
// returns the per-node health check results.
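//
// An illustrative call (cluster, node, and metric names are placeholders):
//
// results, err := store.HealthCheck("testcluster",
// []string{"node01"}, []string{"cpu_load"})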
func (ccms *CCMetricStore) HealthCheck(cluster string,
nodes []string, metrics []string,
) (map[string]ms.HealthCheckResult, error) {
req := ms.HealthCheckReq{
Cluster: cluster,
Nodes: nodes,
MetricNames: metrics,
}
buf := &bytes.Buffer{}
if err := json.NewEncoder(buf).Encode(req); err != nil {
cclog.Errorf("Error while encoding health check request body: %s", err.Error())
return nil, err
}
endpoint := fmt.Sprintf("%s/api/healthcheck", ccms.url)
httpReq, err := http.NewRequest(http.MethodGet, endpoint, buf)
if err != nil {
cclog.Errorf("Error while building health check request: %s", err.Error())
return nil, err
}
if ccms.jwt != "" {
httpReq.Header.Add("Authorization", fmt.Sprintf("Bearer %s", ccms.jwt))
}
res, err := ccms.client.Do(httpReq)
if err != nil {
cclog.Errorf("Error while performing health check request: %s", err.Error())
return nil, err
}
defer res.Body.Close()
if res.StatusCode != http.StatusOK {
return nil, fmt.Errorf("'%s': HTTP Status: %s", endpoint, res.Status)
}
var results map[string]ms.HealthCheckResult
if err := json.NewDecoder(bufio.NewReader(res.Body)).Decode(&results); err != nil {
cclog.Errorf("Error while decoding health check response: %s", err.Error())
return nil, err
}
return results, nil
}
// hasNaNStats returns true if any of the statistics contain NaN values.
func hasNaNStats(avg, min, max schema.Float) bool {
return avg.IsNaN() || min.IsNaN() || max.IsNaN()
}

View File

@@ -12,7 +12,7 @@ import (
"sync"
"time"
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
"github.com/jmoiron/sqlx"
"github.com/mattn/go-sqlite3"
"github.com/qustavo/sqlhooks/v2"
@@ -51,7 +51,7 @@ func setupSqlite(db *sql.DB) error {
return nil
}
func Connect(driver string, db string) {
func Connect(db string) {
var err error
var dbHandle *sqlx.DB
@@ -64,39 +64,31 @@ func Connect(driver string, db string) {
ConnectionMaxIdleTime: repoConfig.ConnectionMaxIdleTime,
}
switch driver {
case "sqlite3":
// TODO: Have separate DB handles for Writes and Reads
// Optimize SQLite connection: https://kerkour.com/sqlite-for-servers
connectionURLParams := make(url.Values)
connectionURLParams.Add("_txlock", "immediate")
connectionURLParams.Add("_journal_mode", "WAL")
connectionURLParams.Add("_busy_timeout", "5000")
connectionURLParams.Add("_synchronous", "NORMAL")
connectionURLParams.Add("_cache_size", "1000000000")
connectionURLParams.Add("_foreign_keys", "true")
opts.URL = fmt.Sprintf("file:%s?%s", opts.URL, connectionURLParams.Encode())
// TODO: Have separate DB handles for Writes and Reads
// Optimize SQLite connection: https://kerkour.com/sqlite-for-servers
connectionURLParams := make(url.Values)
connectionURLParams.Add("_txlock", "immediate")
connectionURLParams.Add("_journal_mode", "WAL")
connectionURLParams.Add("_busy_timeout", "5000")
connectionURLParams.Add("_synchronous", "NORMAL")
connectionURLParams.Add("_cache_size", "1000000000")
connectionURLParams.Add("_foreign_keys", "true")
opts.URL = fmt.Sprintf("file:%s?%s", opts.URL, connectionURLParams.Encode())
if cclog.Loglevel() == "debug" {
sql.Register("sqlite3WithHooks", sqlhooks.Wrap(&sqlite3.SQLiteDriver{}, &Hooks{}))
dbHandle, err = sqlx.Open("sqlite3WithHooks", opts.URL)
} else {
dbHandle, err = sqlx.Open("sqlite3", opts.URL)
}
err = setupSqlite(dbHandle.DB)
if err != nil {
cclog.Abortf("Failed sqlite db setup.\nError: %s\n", err.Error())
}
case "mysql":
opts.URL += "?multiStatements=true"
dbHandle, err = sqlx.Open("mysql", opts.URL)
default:
cclog.Abortf("DB Connection: Unsupported database driver '%s'.\n", driver)
if cclog.Loglevel() == "debug" {
sql.Register("sqlite3WithHooks", sqlhooks.Wrap(&sqlite3.SQLiteDriver{}, &Hooks{}))
dbHandle, err = sqlx.Open("sqlite3WithHooks", opts.URL)
} else {
dbHandle, err = sqlx.Open("sqlite3", opts.URL)
}
if err != nil {
cclog.Abortf("DB Connection: Could not connect to '%s' database with sqlx.Open().\nError: %s\n", driver, err.Error())
cclog.Abortf("DB Connection: Could not connect to SQLite database with sqlx.Open().\nError: %s\n", err.Error())
}
err = setupSqlite(dbHandle.DB)
if err != nil {
cclog.Abortf("Failed sqlite db setup.\nError: %s\n", err.Error())
}
dbHandle.SetMaxOpenConns(opts.MaxOpenConnections)
@@ -104,8 +96,8 @@ func Connect(driver string, db string) {
dbHandle.SetConnMaxLifetime(opts.ConnectionMaxLifetime)
dbHandle.SetConnMaxIdleTime(opts.ConnectionMaxIdleTime)
dbConnInstance = &DBConnection{DB: dbHandle, Driver: driver}
err = checkDBVersion(driver, dbHandle.DB)
dbConnInstance = &DBConnection{DB: dbHandle}
err = checkDBVersion(dbHandle.DB)
if err != nil {
cclog.Abortf("DB Connection: Failed DB version check.\nError: %s\n", err.Error())
}
@@ -119,3 +111,26 @@ func GetConnection() *DBConnection {
return dbConnInstance
}
// ResetConnection closes the current database connection and resets the connection state.
// This function is intended for testing purposes only to allow test isolation.
func ResetConnection() error {
if dbConnInstance != nil && dbConnInstance.DB != nil {
if err := dbConnInstance.DB.Close(); err != nil {
return fmt.Errorf("failed to close database connection: %w", err)
}
}
dbConnInstance = nil
dbConnOnce = sync.Once{}
jobRepoInstance = nil
jobRepoOnce = sync.Once{}
nodeRepoInstance = nil
nodeRepoOnce = sync.Once{}
userRepoInstance = nil
userRepoOnce = sync.Once{}
userCfgRepoInstance = nil
userCfgRepoOnce = sync.Once{}
return nil
}

View File

@@ -2,13 +2,14 @@
// All rights reserved. This file is part of cc-backend.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
package repository
import (
"context"
"time"
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
)
// Hooks satisfies the sqlhook.Hooks interface

View File

@@ -0,0 +1,274 @@
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved. This file is part of cc-backend.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
package repository
import (
"context"
"testing"
"time"
"github.com/ClusterCockpit/cc-lib/v2/schema"
_ "github.com/mattn/go-sqlite3"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
)
type MockJobHook struct {
startCalled bool
stopCalled bool
startJobs []*schema.Job
stopJobs []*schema.Job
}
func (m *MockJobHook) JobStartCallback(job *schema.Job) {
m.startCalled = true
m.startJobs = append(m.startJobs, job)
}
func (m *MockJobHook) JobStopCallback(job *schema.Job) {
m.stopCalled = true
m.stopJobs = append(m.stopJobs, job)
}
func TestRegisterJobHook(t *testing.T) {
t.Run("register single hook", func(t *testing.T) {
hooks = nil
mock := &MockJobHook{}
RegisterJobHook(mock)
assert.NotNil(t, hooks)
assert.Len(t, hooks, 1)
assert.Equal(t, mock, hooks[0])
hooks = nil
})
t.Run("register multiple hooks", func(t *testing.T) {
hooks = nil
mock1 := &MockJobHook{}
mock2 := &MockJobHook{}
RegisterJobHook(mock1)
RegisterJobHook(mock2)
assert.Len(t, hooks, 2)
assert.Equal(t, mock1, hooks[0])
assert.Equal(t, mock2, hooks[1])
hooks = nil
})
t.Run("register nil hook does not add to hooks", func(t *testing.T) {
hooks = nil
RegisterJobHook(nil)
if hooks != nil {
assert.Len(t, hooks, 0, "Nil hook should not be added")
}
hooks = nil
})
}
func TestCallJobStartHooks(t *testing.T) {
t.Run("call start hooks with single job", func(t *testing.T) {
hooks = nil
mock := &MockJobHook{}
RegisterJobHook(mock)
job := &schema.Job{
JobID: 123,
User: "testuser",
Cluster: "testcluster",
}
CallJobStartHooks([]*schema.Job{job})
assert.True(t, mock.startCalled)
assert.False(t, mock.stopCalled)
assert.Len(t, mock.startJobs, 1)
assert.Equal(t, int64(123), mock.startJobs[0].JobID)
hooks = nil
})
t.Run("call start hooks with multiple jobs", func(t *testing.T) {
hooks = nil
mock := &MockJobHook{}
RegisterJobHook(mock)
jobs := []*schema.Job{
{JobID: 1, User: "user1", Cluster: "cluster1"},
{JobID: 2, User: "user2", Cluster: "cluster2"},
{JobID: 3, User: "user3", Cluster: "cluster3"},
}
CallJobStartHooks(jobs)
assert.True(t, mock.startCalled)
assert.Len(t, mock.startJobs, 3)
assert.Equal(t, int64(1), mock.startJobs[0].JobID)
assert.Equal(t, int64(2), mock.startJobs[1].JobID)
assert.Equal(t, int64(3), mock.startJobs[2].JobID)
hooks = nil
})
t.Run("call start hooks with multiple registered hooks", func(t *testing.T) {
hooks = nil
mock1 := &MockJobHook{}
mock2 := &MockJobHook{}
RegisterJobHook(mock1)
RegisterJobHook(mock2)
job := &schema.Job{
JobID: 456, User: "testuser", Cluster: "testcluster",
}
CallJobStartHooks([]*schema.Job{job})
assert.True(t, mock1.startCalled)
assert.True(t, mock2.startCalled)
assert.Len(t, mock1.startJobs, 1)
assert.Len(t, mock2.startJobs, 1)
hooks = nil
})
t.Run("call start hooks with nil hooks", func(t *testing.T) {
hooks = nil
job := &schema.Job{
JobID: 789, User: "testuser", Cluster: "testcluster",
}
CallJobStartHooks([]*schema.Job{job})
hooks = nil
})
t.Run("call start hooks with empty job list", func(t *testing.T) {
hooks = nil
mock := &MockJobHook{}
RegisterJobHook(mock)
CallJobStartHooks([]*schema.Job{})
assert.False(t, mock.startCalled)
assert.Len(t, mock.startJobs, 0)
hooks = nil
})
}
func TestCallJobStopHooks(t *testing.T) {
t.Run("call stop hooks with single job", func(t *testing.T) {
hooks = nil
mock := &MockJobHook{}
RegisterJobHook(mock)
job := &schema.Job{
JobID: 123,
User: "testuser",
Cluster: "testcluster",
}
CallJobStopHooks(job)
assert.True(t, mock.stopCalled)
assert.False(t, mock.startCalled)
assert.Len(t, mock.stopJobs, 1)
assert.Equal(t, int64(123), mock.stopJobs[0].JobID)
hooks = nil
})
t.Run("call stop hooks with multiple registered hooks", func(t *testing.T) {
hooks = nil
mock1 := &MockJobHook{}
mock2 := &MockJobHook{}
RegisterJobHook(mock1)
RegisterJobHook(mock2)
job := &schema.Job{
JobID: 456, User: "testuser", Cluster: "testcluster",
}
CallJobStopHooks(job)
assert.True(t, mock1.stopCalled)
assert.True(t, mock2.stopCalled)
assert.Len(t, mock1.stopJobs, 1)
assert.Len(t, mock2.stopJobs, 1)
hooks = nil
})
t.Run("call stop hooks with nil hooks", func(t *testing.T) {
hooks = nil
job := &schema.Job{
JobID: 789, User: "testuser", Cluster: "testcluster",
}
CallJobStopHooks(job)
hooks = nil
})
}
func TestSQLHooks(t *testing.T) {
_ = setup(t)
t.Run("hooks log queries in debug mode", func(t *testing.T) {
h := &Hooks{}
ctx := context.Background()
query := "SELECT * FROM job WHERE job_id = ?"
args := []any{123}
ctxWithTime, err := h.Before(ctx, query, args...)
require.NoError(t, err)
assert.NotNil(t, ctxWithTime)
beginTime := ctxWithTime.Value("begin")
require.NotNil(t, beginTime)
_, ok := beginTime.(time.Time)
assert.True(t, ok, "Begin time should be time.Time")
time.Sleep(10 * time.Millisecond)
ctxAfter, err := h.After(ctxWithTime, query, args...)
require.NoError(t, err)
assert.NotNil(t, ctxAfter)
})
}
func TestHookIntegration(t *testing.T) {
t.Run("hooks are called during job lifecycle", func(t *testing.T) {
hooks = nil
mock := &MockJobHook{}
RegisterJobHook(mock)
job := &schema.Job{
JobID: 999,
User: "integrationuser",
Cluster: "integrationcluster",
}
CallJobStartHooks([]*schema.Job{job})
assert.True(t, mock.startCalled)
assert.Equal(t, 1, len(mock.startJobs))
CallJobStopHooks(job)
assert.True(t, mock.stopCalled)
assert.Equal(t, 1, len(mock.stopJobs))
assert.Equal(t, mock.startJobs[0].JobID, mock.stopJobs[0].JobID)
hooks = nil
})
}

File diff suppressed because it is too large

View File

@@ -2,14 +2,15 @@
// All rights reserved. This file is part of cc-backend.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
package repository
import (
"encoding/json"
"fmt"
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
"github.com/ClusterCockpit/cc-lib/schema"
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
"github.com/ClusterCockpit/cc-lib/v2/schema"
sq "github.com/Masterminds/squirrel"
)
@@ -29,6 +30,27 @@ const NamedJobInsert string = `INSERT INTO job (
:shared, :monitoring_status, :smt, :job_state, :start_time, :duration, :walltime, :footprint, :energy, :energy_footprint, :resources, :meta_data
);`
// InsertJobDirect inserts a job directly into the job table (not job_cache).
// Use this when the returned ID will be used for operations on the job table
// (e.g., adding tags), or for imported jobs that are already completed.
func (r *JobRepository) InsertJobDirect(job *schema.Job) (int64, error) {
r.Mutex.Lock()
defer r.Mutex.Unlock()
res, err := r.DB.NamedExec(NamedJobInsert, job)
if err != nil {
cclog.Warn("Error while NamedJobInsert (direct)")
return 0, err
}
id, err := res.LastInsertId()
if err != nil {
cclog.Warn("Error while getting last insert ID (direct)")
return 0, err
}
return id, nil
}
func (r *JobRepository) InsertJob(job *schema.Job) (int64, error) {
r.Mutex.Lock()
defer r.Mutex.Unlock()
@@ -70,8 +92,9 @@ func (r *JobRepository) SyncJobs() ([]*schema.Job, error) {
jobs = append(jobs, job)
}
// Use INSERT OR IGNORE to skip jobs already transferred by the stop path
_, err = r.DB.Exec(
"INSERT INTO job (job_id, cluster, subcluster, start_time, hpc_user, project, cluster_partition, array_job_id, num_nodes, num_hwthreads, num_acc, shared, monitoring_status, smt, job_state, duration, walltime, footprint, energy, energy_footprint, resources, meta_data) SELECT job_id, cluster, subcluster, start_time, hpc_user, project, cluster_partition, array_job_id, num_nodes, num_hwthreads, num_acc, shared, monitoring_status, smt, job_state, duration, walltime, footprint, energy, energy_footprint, resources, meta_data FROM job_cache")
"INSERT OR IGNORE INTO job (job_id, cluster, subcluster, start_time, hpc_user, project, cluster_partition, array_job_id, num_nodes, num_hwthreads, num_acc, shared, monitoring_status, smt, job_state, duration, walltime, footprint, energy, energy_footprint, resources, meta_data) SELECT job_id, cluster, subcluster, start_time, hpc_user, project, cluster_partition, array_job_id, num_nodes, num_hwthreads, num_acc, shared, monitoring_status, smt, job_state, duration, walltime, footprint, energy, energy_footprint, resources, meta_data FROM job_cache")
if err != nil {
cclog.Warnf("Error while Job sync: %v", err)
return nil, err
@@ -83,9 +106,48 @@ func (r *JobRepository) SyncJobs() ([]*schema.Job, error) {
return nil, err
}
// Resolve correct job.id from the job table. The IDs read from job_cache
// are from a different auto-increment sequence and must not be used to
// query the job table.
for _, job := range jobs {
var newID int64
if err := sq.Select("job.id").From("job").
Where("job.job_id = ? AND job.cluster = ? AND job.start_time = ?",
job.JobID, job.Cluster, job.StartTime).
RunWith(r.stmtCache).QueryRow().Scan(&newID); err != nil {
cclog.Warnf("SyncJobs: could not resolve job table id for job %d on %s: %v",
job.JobID, job.Cluster, err)
continue
}
job.ID = &newID
}
return jobs, nil
}
// TransferCachedJobToMain moves a job from job_cache to the job table.
// Caller must hold r.Mutex. Returns the new job table ID.
func (r *JobRepository) TransferCachedJobToMain(cacheID int64) (int64, error) {
res, err := r.DB.Exec(
"INSERT INTO job (job_id, cluster, subcluster, start_time, hpc_user, project, cluster_partition, array_job_id, num_nodes, num_hwthreads, num_acc, shared, monitoring_status, smt, job_state, duration, walltime, footprint, energy, energy_footprint, resources, meta_data) SELECT job_id, cluster, subcluster, start_time, hpc_user, project, cluster_partition, array_job_id, num_nodes, num_hwthreads, num_acc, shared, monitoring_status, smt, job_state, duration, walltime, footprint, energy, energy_footprint, resources, meta_data FROM job_cache WHERE id = ?",
cacheID)
if err != nil {
return 0, fmt.Errorf("transferring cached job %d to main table failed: %w", cacheID, err)
}
newID, err := res.LastInsertId()
if err != nil {
return 0, fmt.Errorf("getting new job ID after transfer failed: %w", err)
}
_, err = r.DB.Exec("DELETE FROM job_cache WHERE id = ?", cacheID)
if err != nil {
return 0, fmt.Errorf("deleting cached job %d after transfer failed: %w", cacheID, err)
}
return newID, nil
}
// Start inserts a new job in the table, returning the unique job ID.
// Statistics are not transferred!
func (r *JobRepository) Start(job *schema.Job) (id int64, err error) {
@@ -107,41 +169,46 @@ func (r *JobRepository) Start(job *schema.Job) (id int64, err error) {
return r.InsertJob(job)
}
// StartDirect inserts a new job directly into the job table (not job_cache).
// Use this when the returned ID will immediately be used for job table
// operations such as adding tags.
func (r *JobRepository) StartDirect(job *schema.Job) (id int64, err error) {
job.RawFootprint, err = json.Marshal(job.Footprint)
if err != nil {
return -1, fmt.Errorf("REPOSITORY/JOB > encoding footprint field failed: %w", err)
}
job.RawResources, err = json.Marshal(job.Resources)
if err != nil {
return -1, fmt.Errorf("REPOSITORY/JOB > encoding resources field failed: %w", err)
}
job.RawMetaData, err = json.Marshal(job.MetaData)
if err != nil {
return -1, fmt.Errorf("REPOSITORY/JOB > encoding metaData field failed: %w", err)
}
return r.InsertJobDirect(job)
}
// Stop updates the job with the database id jobId using the provided arguments.
func (r *JobRepository) Stop(
jobId int64,
jobID int64,
duration int32,
state schema.JobState,
monitoringStatus int32,
) (err error) {
// Invalidate cache entries as job state is changing
r.cache.Del(fmt.Sprintf("metadata:%d", jobId))
r.cache.Del(fmt.Sprintf("energyFootprint:%d", jobId))
r.cache.Del(fmt.Sprintf("metadata:%d", jobID))
r.cache.Del(fmt.Sprintf("energyFootprint:%d", jobID))
stmt := sq.Update("job").
Set("job_state", state).
Set("duration", duration).
Set("monitoring_status", monitoringStatus).
Where("job.id = ?", jobId)
Where("job.id = ?", jobID)
_, err = stmt.RunWith(r.stmtCache).Exec()
return err
}
func (r *JobRepository) StopCached(
jobId int64,
duration int32,
state schema.JobState,
monitoringStatus int32,
) (err error) {
// Note: StopCached updates job_cache table, not the main job table
// Cache invalidation happens when job is synced to main table
stmt := sq.Update("job_cache").
Set("job_state", state).
Set("duration", duration).
Set("monitoring_status", monitoringStatus).
Where("job_cache.id = ?", jobId)
_, err = stmt.RunWith(r.stmtCache).Exec()
return err
}

View File

@@ -0,0 +1,607 @@
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved. This file is part of cc-backend.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
package repository
import (
"encoding/json"
"testing"
"github.com/ClusterCockpit/cc-lib/v2/schema"
_ "github.com/mattn/go-sqlite3"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
)
// createTestJob creates a minimal valid job for testing
func createTestJob(jobID int64, cluster string) *schema.Job {
return &schema.Job{
JobID: jobID,
User: "testuser",
Project: "testproject",
Cluster: cluster,
SubCluster: "main",
Partition: "batch",
NumNodes: 1,
NumHWThreads: 4,
NumAcc: 0,
Shared: "none",
MonitoringStatus: schema.MonitoringStatusRunningOrArchiving,
SMT: 1,
State: schema.JobStateRunning,
StartTime: 1234567890,
Duration: 0,
Walltime: 3600,
Resources: []*schema.Resource{
{
Hostname: "node01",
HWThreads: []int{0, 1, 2, 3},
},
},
Footprint: map[string]float64{
"cpu_load": 50.0,
"mem_used": 8000.0,
"flops_any": 0.5,
"mem_bw": 10.0,
"net_bw": 2.0,
"file_bw": 1.0,
"cpu_used": 2.0,
"cpu_load_core": 12.5,
},
MetaData: map[string]string{
"jobName": "test_job",
"queue": "normal",
"qosName": "default",
"accountName": "testaccount",
},
}
}
func TestInsertJob(t *testing.T) {
r := setup(t)
t.Run("successful insertion", func(t *testing.T) {
job := createTestJob(999001, "testcluster")
job.RawResources, _ = json.Marshal(job.Resources)
job.RawFootprint, _ = json.Marshal(job.Footprint)
job.RawMetaData, _ = json.Marshal(job.MetaData)
id, err := r.InsertJob(job)
require.NoError(t, err, "InsertJob should succeed")
assert.Greater(t, id, int64(0), "Should return valid insert ID")
// Verify job was inserted into job_cache
var count int
err = r.DB.QueryRow("SELECT COUNT(*) FROM job_cache WHERE job_id = ? AND cluster = ?",
job.JobID, job.Cluster).Scan(&count)
require.NoError(t, err)
assert.Equal(t, 1, count, "Job should be in job_cache table")
// Clean up
_, err = r.DB.Exec("DELETE FROM job_cache WHERE job_id = ? AND cluster = ?", job.JobID, job.Cluster)
require.NoError(t, err)
})
t.Run("insertion with all fields", func(t *testing.T) {
job := createTestJob(999002, "testcluster")
job.ArrayJobID = 5000
job.Energy = 1500.5
job.RawResources, _ = json.Marshal(job.Resources)
job.RawFootprint, _ = json.Marshal(job.Footprint)
job.RawMetaData, _ = json.Marshal(job.MetaData)
id, err := r.InsertJob(job)
require.NoError(t, err)
assert.Greater(t, id, int64(0))
// Verify all fields were stored correctly
var retrievedJob schema.Job
err = r.DB.QueryRow(`SELECT job_id, hpc_user, project, cluster, array_job_id, energy
FROM job_cache WHERE id = ?`, id).Scan(
&retrievedJob.JobID, &retrievedJob.User, &retrievedJob.Project,
&retrievedJob.Cluster, &retrievedJob.ArrayJobID, &retrievedJob.Energy)
require.NoError(t, err)
assert.Equal(t, job.JobID, retrievedJob.JobID)
assert.Equal(t, job.User, retrievedJob.User)
assert.Equal(t, job.Project, retrievedJob.Project)
assert.Equal(t, job.Cluster, retrievedJob.Cluster)
assert.Equal(t, job.ArrayJobID, retrievedJob.ArrayJobID)
assert.Equal(t, job.Energy, retrievedJob.Energy)
// Clean up
_, err = r.DB.Exec("DELETE FROM job_cache WHERE id = ?", id)
require.NoError(t, err)
})
}
func TestStart(t *testing.T) {
r := setup(t)
t.Run("successful job start with JSON encoding", func(t *testing.T) {
job := createTestJob(999003, "testcluster")
id, err := r.Start(job)
require.NoError(t, err, "Start should succeed")
assert.Greater(t, id, int64(0), "Should return valid insert ID")
// Verify job was inserted and JSON fields were encoded
var rawResources, rawFootprint, rawMetaData []byte
err = r.DB.QueryRow(`SELECT resources, footprint, meta_data FROM job_cache WHERE id = ?`, id).Scan(
&rawResources, &rawFootprint, &rawMetaData)
require.NoError(t, err)
// Verify resources JSON
var resources []*schema.Resource
err = json.Unmarshal(rawResources, &resources)
require.NoError(t, err, "Resources should be valid JSON")
assert.Equal(t, 1, len(resources))
assert.Equal(t, "node01", resources[0].Hostname)
// Verify footprint JSON
var footprint map[string]float64
err = json.Unmarshal(rawFootprint, &footprint)
require.NoError(t, err, "Footprint should be valid JSON")
assert.Equal(t, 50.0, footprint["cpu_load"])
assert.Equal(t, 8000.0, footprint["mem_used"])
// Verify metadata JSON
var metaData map[string]string
err = json.Unmarshal(rawMetaData, &metaData)
require.NoError(t, err, "MetaData should be valid JSON")
assert.Equal(t, "test_job", metaData["jobName"])
// Clean up
_, err = r.DB.Exec("DELETE FROM job_cache WHERE id = ?", id)
require.NoError(t, err)
})
t.Run("job start with empty footprint", func(t *testing.T) {
job := createTestJob(999004, "testcluster")
job.Footprint = map[string]float64{}
id, err := r.Start(job)
require.NoError(t, err)
assert.Greater(t, id, int64(0))
// Verify empty footprint was encoded as empty JSON object
var rawFootprint []byte
err = r.DB.QueryRow(`SELECT footprint FROM job_cache WHERE id = ?`, id).Scan(&rawFootprint)
require.NoError(t, err)
assert.Equal(t, []byte("{}"), rawFootprint)
// Clean up
_, err = r.DB.Exec("DELETE FROM job_cache WHERE id = ?", id)
require.NoError(t, err)
})
t.Run("job start with nil metadata", func(t *testing.T) {
job := createTestJob(999005, "testcluster")
job.MetaData = nil
id, err := r.Start(job)
require.NoError(t, err)
assert.Greater(t, id, int64(0))
// Clean up
_, err = r.DB.Exec("DELETE FROM job_cache WHERE id = ?", id)
require.NoError(t, err)
})
}
func TestStop(t *testing.T) {
r := setup(t)
t.Run("successful job stop", func(t *testing.T) {
// First insert a job using Start
job := createTestJob(999106, "testcluster")
id, err := r.Start(job)
require.NoError(t, err)
// Move from job_cache to job table (simulate SyncJobs) - exclude id to let it auto-increment
_, err = r.DB.Exec(`INSERT INTO job (job_id, cluster, subcluster, submit_time, start_time, hpc_user, project,
cluster_partition, array_job_id, duration, walltime, job_state, meta_data, resources, num_nodes,
num_hwthreads, num_acc, smt, shared, monitoring_status, energy, energy_footprint, footprint)
SELECT job_id, cluster, subcluster, submit_time, start_time, hpc_user, project,
cluster_partition, array_job_id, duration, walltime, job_state, meta_data, resources, num_nodes,
num_hwthreads, num_acc, smt, shared, monitoring_status, energy, energy_footprint, footprint
FROM job_cache WHERE id = ?`, id)
require.NoError(t, err)
_, err = r.DB.Exec("DELETE FROM job_cache WHERE id = ?", id)
require.NoError(t, err)
// Get the new job id in the job table
err = r.DB.QueryRow("SELECT id FROM job WHERE job_id = ? AND cluster = ? AND start_time = ?",
job.JobID, job.Cluster, job.StartTime).Scan(&id)
require.NoError(t, err)
// Stop the job
duration := int32(3600)
state := schema.JobStateCompleted
monitoringStatus := int32(schema.MonitoringStatusArchivingSuccessful)
err = r.Stop(id, duration, state, monitoringStatus)
require.NoError(t, err, "Stop should succeed")
// Verify job was updated
var retrievedDuration int32
var retrievedState string
var retrievedMonStatus int32
err = r.DB.QueryRow(`SELECT duration, job_state, monitoring_status FROM job WHERE id = ?`, id).Scan(
&retrievedDuration, &retrievedState, &retrievedMonStatus)
require.NoError(t, err)
assert.Equal(t, duration, retrievedDuration)
assert.Equal(t, string(state), retrievedState)
assert.Equal(t, monitoringStatus, retrievedMonStatus)
// Clean up
_, err = r.DB.Exec("DELETE FROM job WHERE id = ?", id)
require.NoError(t, err)
})
t.Run("stop updates job state transitions", func(t *testing.T) {
// Insert a job
job := createTestJob(999107, "testcluster")
id, err := r.Start(job)
require.NoError(t, err)
// Move to job table
_, err = r.DB.Exec(`INSERT INTO job (job_id, cluster, subcluster, submit_time, start_time, hpc_user, project,
cluster_partition, array_job_id, duration, walltime, job_state, meta_data, resources, num_nodes,
num_hwthreads, num_acc, smt, shared, monitoring_status, energy, energy_footprint, footprint)
SELECT job_id, cluster, subcluster, submit_time, start_time, hpc_user, project,
cluster_partition, array_job_id, duration, walltime, job_state, meta_data, resources, num_nodes,
num_hwthreads, num_acc, smt, shared, monitoring_status, energy, energy_footprint, footprint
FROM job_cache WHERE id = ?`, id)
require.NoError(t, err)
_, err = r.DB.Exec("DELETE FROM job_cache WHERE id = ?", id)
require.NoError(t, err)
// Get the new job id in the job table
err = r.DB.QueryRow("SELECT id FROM job WHERE job_id = ? AND cluster = ? AND start_time = ?",
job.JobID, job.Cluster, job.StartTime).Scan(&id)
require.NoError(t, err)
// Stop the job with different duration
err = r.Stop(id, 7200, schema.JobStateCompleted, int32(schema.MonitoringStatusArchivingSuccessful))
require.NoError(t, err)
// Verify the duration was updated correctly
var duration int32
err = r.DB.QueryRow(`SELECT duration FROM job WHERE id = ?`, id).Scan(&duration)
require.NoError(t, err)
assert.Equal(t, int32(7200), duration, "Duration should be updated to 7200")
// Clean up
_, err = r.DB.Exec("DELETE FROM job WHERE id = ?", id)
require.NoError(t, err)
})
t.Run("stop with different states", func(t *testing.T) {
testCases := []struct {
name string
jobID int64
state schema.JobState
monitoringStatus int32
}{
{"completed", 999108, schema.JobStateCompleted, int32(schema.MonitoringStatusArchivingSuccessful)},
{"failed", 999118, schema.JobStateFailed, int32(schema.MonitoringStatusArchivingSuccessful)},
{"cancelled", 999119, schema.JobStateCancelled, int32(schema.MonitoringStatusArchivingSuccessful)},
{"timeout", 999120, schema.JobStateTimeout, int32(schema.MonitoringStatusArchivingSuccessful)},
}
for _, tc := range testCases {
t.Run(tc.name, func(t *testing.T) {
job := createTestJob(tc.jobID, "testcluster")
id, err := r.Start(job)
require.NoError(t, err)
// Move to job table
_, err = r.DB.Exec(`INSERT INTO job (job_id, cluster, subcluster, submit_time, start_time, hpc_user, project,
cluster_partition, array_job_id, duration, walltime, job_state, meta_data, resources, num_nodes,
num_hwthreads, num_acc, smt, shared, monitoring_status, energy, energy_footprint, footprint)
SELECT job_id, cluster, subcluster, submit_time, start_time, hpc_user, project,
cluster_partition, array_job_id, duration, walltime, job_state, meta_data, resources, num_nodes,
num_hwthreads, num_acc, smt, shared, monitoring_status, energy, energy_footprint, footprint
FROM job_cache WHERE id = ?`, id)
require.NoError(t, err)
_, err = r.DB.Exec("DELETE FROM job_cache WHERE id = ?", id)
require.NoError(t, err)
// Get the new job id in the job table
err = r.DB.QueryRow("SELECT id FROM job WHERE job_id = ? AND cluster = ? AND start_time = ?",
job.JobID, job.Cluster, job.StartTime).Scan(&id)
require.NoError(t, err)
// Stop with specific state
err = r.Stop(id, 1800, tc.state, tc.monitoringStatus)
require.NoError(t, err)
// Verify state was set correctly
var retrievedState string
err = r.DB.QueryRow(`SELECT job_state FROM job WHERE id = ?`, id).Scan(&retrievedState)
require.NoError(t, err)
assert.Equal(t, string(tc.state), retrievedState)
// Clean up
_, err = r.DB.Exec("DELETE FROM job WHERE id = ?", id)
require.NoError(t, err)
})
}
})
}
func TestTransferCachedJobToMain(t *testing.T) {
r := setup(t)
t.Run("successful transfer from cache to main", func(t *testing.T) {
// Insert a job in job_cache
job := createTestJob(999009, "testcluster")
cacheID, err := r.Start(job)
require.NoError(t, err)
// Transfer the cached job to the main table
r.Mutex.Lock()
newID, err := r.TransferCachedJobToMain(cacheID)
r.Mutex.Unlock()
require.NoError(t, err, "TransferCachedJobToMain should succeed")
assert.NotEqual(t, cacheID, newID, "New ID should differ from cache ID")
// Verify job exists in job table
var count int
err = r.DB.QueryRow(`SELECT COUNT(*) FROM job WHERE id = ?`, newID).Scan(&count)
require.NoError(t, err)
assert.Equal(t, 1, count, "Job should exist in main table")
// Verify job was removed from job_cache
err = r.DB.QueryRow(`SELECT COUNT(*) FROM job_cache WHERE id = ?`, cacheID).Scan(&count)
require.NoError(t, err)
assert.Equal(t, 0, count, "Job should be removed from cache")
// Clean up
_, err = r.DB.Exec("DELETE FROM job WHERE id = ?", newID)
require.NoError(t, err)
})
t.Run("transfer preserves job data", func(t *testing.T) {
// Insert a job in job_cache
job := createTestJob(999010, "testcluster")
cacheID, err := r.Start(job)
require.NoError(t, err)
// Transfer the cached job
r.Mutex.Lock()
newID, err := r.TransferCachedJobToMain(cacheID)
r.Mutex.Unlock()
require.NoError(t, err)
// Verify the transferred job has the correct data
var jobID int64
var cluster string
err = r.DB.QueryRow(`SELECT job_id, cluster FROM job WHERE id = ?`, newID).Scan(&jobID, &cluster)
require.NoError(t, err)
assert.Equal(t, job.JobID, jobID)
assert.Equal(t, job.Cluster, cluster)
// Clean up
_, err = r.DB.Exec("DELETE FROM job WHERE id = ?", newID)
require.NoError(t, err)
})
}
func TestSyncJobs(t *testing.T) {
r := setup(t)
t.Run("sync jobs from cache to main table", func(t *testing.T) {
// Ensure cache is empty first
_, err := r.DB.Exec("DELETE FROM job_cache")
require.NoError(t, err)
// Insert multiple jobs in job_cache
job1 := createTestJob(999011, "testcluster")
job2 := createTestJob(999012, "testcluster")
job3 := createTestJob(999013, "testcluster")
_, err = r.Start(job1)
require.NoError(t, err)
_, err = r.Start(job2)
require.NoError(t, err)
_, err = r.Start(job3)
require.NoError(t, err)
// Verify jobs are in job_cache
var cacheCount int
err = r.DB.QueryRow("SELECT COUNT(*) FROM job_cache WHERE job_id IN (?, ?, ?)",
job1.JobID, job2.JobID, job3.JobID).Scan(&cacheCount)
require.NoError(t, err)
assert.Equal(t, 3, cacheCount, "All jobs should be in job_cache")
// Sync jobs
jobs, err := r.SyncJobs()
require.NoError(t, err, "SyncJobs should succeed")
assert.Equal(t, 3, len(jobs), "Should return 3 synced jobs")
// Verify jobs were moved to job table
var jobCount int
err = r.DB.QueryRow("SELECT COUNT(*) FROM job WHERE job_id IN (?, ?, ?)",
job1.JobID, job2.JobID, job3.JobID).Scan(&jobCount)
require.NoError(t, err)
assert.Equal(t, 3, jobCount, "All jobs should be in job table")
// Verify job_cache was cleared
err = r.DB.QueryRow("SELECT COUNT(*) FROM job_cache WHERE job_id IN (?, ?, ?)",
job1.JobID, job2.JobID, job3.JobID).Scan(&cacheCount)
require.NoError(t, err)
assert.Equal(t, 0, cacheCount, "job_cache should be empty after sync")
// Clean up
_, err = r.DB.Exec("DELETE FROM job WHERE job_id IN (?, ?, ?)", job1.JobID, job2.JobID, job3.JobID)
require.NoError(t, err)
})
t.Run("sync preserves job data", func(t *testing.T) {
// Ensure cache is empty first
_, err := r.DB.Exec("DELETE FROM job_cache")
require.NoError(t, err)
// Insert a job with specific data
job := createTestJob(999014, "testcluster")
job.ArrayJobID = 7777
job.Energy = 2500.75
job.Duration = 1800
id, err := r.Start(job)
require.NoError(t, err)
// Update some fields to simulate job progress
result, err := r.DB.Exec(`UPDATE job_cache SET duration = ?, energy = ? WHERE id = ?`,
3600, 3000.5, id)
require.NoError(t, err)
rowsAffected, _ := result.RowsAffected()
require.Equal(t, int64(1), rowsAffected, "UPDATE should affect exactly 1 row")
// Verify the update worked
var checkDuration int32
var checkEnergy float64
err = r.DB.QueryRow(`SELECT duration, energy FROM job_cache WHERE id = ?`, id).Scan(&checkDuration, &checkEnergy)
require.NoError(t, err)
require.Equal(t, int32(3600), checkDuration, "Duration should be updated to 3600 before sync")
require.Equal(t, 3000.5, checkEnergy, "Energy should be updated to 3000.5 before sync")
// Sync jobs
jobs, err := r.SyncJobs()
require.NoError(t, err)
require.Equal(t, 1, len(jobs), "Should return exactly 1 synced job")
// Verify in database
var dbJob schema.Job
err = r.DB.QueryRow(`SELECT job_id, hpc_user, project, cluster, array_job_id, duration, energy
FROM job WHERE job_id = ? AND cluster = ?`, job.JobID, job.Cluster).Scan(
&dbJob.JobID, &dbJob.User, &dbJob.Project, &dbJob.Cluster,
&dbJob.ArrayJobID, &dbJob.Duration, &dbJob.Energy)
require.NoError(t, err)
assert.Equal(t, job.JobID, dbJob.JobID)
assert.Equal(t, int32(3600), dbJob.Duration)
assert.Equal(t, 3000.5, dbJob.Energy)
// Clean up
_, err = r.DB.Exec("DELETE FROM job WHERE job_id = ? AND cluster = ?", job.JobID, job.Cluster)
require.NoError(t, err)
})
t.Run("sync returns job table IDs not cache IDs", func(t *testing.T) {
// Ensure cache is empty first
_, err := r.DB.Exec("DELETE FROM job_cache")
require.NoError(t, err)
// Insert a job into job_cache
job := createTestJob(999015, "testcluster")
cacheID, err := r.Start(job)
require.NoError(t, err)
// Sync jobs
jobs, err := r.SyncJobs()
require.NoError(t, err)
require.Equal(t, 1, len(jobs))
// The returned ID must refer to the job table, not job_cache
var jobTableID int64
err = r.DB.QueryRow("SELECT id FROM job WHERE job_id = ? AND cluster = ? AND start_time = ?",
jobs[0].JobID, jobs[0].Cluster, jobs[0].StartTime).Scan(&jobTableID)
require.NoError(t, err)
assert.Equal(t, jobTableID, *jobs[0].ID,
"returned ID should match the job table row, not the cache ID (%d)", cacheID)
// Clean up
_, err = r.DB.Exec("DELETE FROM job WHERE job_id = ? AND cluster = ?", job.JobID, job.Cluster)
require.NoError(t, err)
})
t.Run("sync with empty cache returns empty list", func(t *testing.T) {
// Ensure cache is empty
_, err := r.DB.Exec("DELETE FROM job_cache")
require.NoError(t, err)
// Sync should return empty list
jobs, err := r.SyncJobs()
require.NoError(t, err)
assert.Equal(t, 0, len(jobs), "Should return empty list when cache is empty")
})
}
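These subtests pin down the SyncJobs contract: cached rows move to the job table, field updates made while in the cache survive the move, and the returned IDs reference the destination table. A minimal sketch of a periodic driver a caller might run on top of this contract — the ticker interval, logging, and function name are illustrative, not part of this commit:

// runSyncLoop is a hypothetical caller of SyncJobs, assuming the
// repository and cclog imports used elsewhere in this codebase.
func runSyncLoop(ctx context.Context, r *repository.JobRepository) {
	ticker := time.NewTicker(time.Minute)
	defer ticker.Stop()
	for {
		select {
		case <-ctx.Done():
			return
		case <-ticker.C:
			jobs, err := r.SyncJobs()
			if err != nil {
				cclog.Errorf("job cache sync failed: %v", err)
				continue
			}
			// IDs in jobs now reference the job table, so they are
			// safe to pass to tag or archive operations.
			cclog.Debugf("synced %d jobs from job_cache", len(jobs))
		}
	}
}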
func TestInsertJobDirect(t *testing.T) {
r := setup(t)
t.Run("inserts into job table not cache", func(t *testing.T) {
job := createTestJob(999020, "testcluster")
job.RawResources, _ = json.Marshal(job.Resources)
job.RawFootprint, _ = json.Marshal(job.Footprint)
job.RawMetaData, _ = json.Marshal(job.MetaData)
id, err := r.InsertJobDirect(job)
require.NoError(t, err, "InsertJobDirect should succeed")
assert.Greater(t, id, int64(0), "Should return valid insert ID")
// Verify job is in job table
var count int
err = r.DB.QueryRow("SELECT COUNT(*) FROM job WHERE id = ?", id).Scan(&count)
require.NoError(t, err)
assert.Equal(t, 1, count, "Job should be in job table")
// Verify job is NOT in job_cache
err = r.DB.QueryRow("SELECT COUNT(*) FROM job_cache WHERE job_id = ? AND cluster = ?",
job.JobID, job.Cluster).Scan(&count)
require.NoError(t, err)
assert.Equal(t, 0, count, "Job should NOT be in job_cache")
// Clean up
_, err = r.DB.Exec("DELETE FROM job WHERE id = ?", id)
require.NoError(t, err)
})
t.Run("returned ID works for tag operations", func(t *testing.T) {
job := createTestJob(999021, "testcluster")
job.RawResources, _ = json.Marshal(job.Resources)
job.RawFootprint, _ = json.Marshal(job.Footprint)
job.RawMetaData, _ = json.Marshal(job.MetaData)
id, err := r.InsertJobDirect(job)
require.NoError(t, err)
// Adding a tag using the returned ID should succeed (FK constraint on jobtag)
err = r.ImportTag(id, "test_type", "test_name", "global")
require.NoError(t, err, "ImportTag should succeed with direct insert ID")
// Clean up
_, err = r.DB.Exec("DELETE FROM jobtag WHERE job_id = ?", id)
require.NoError(t, err)
_, err = r.DB.Exec("DELETE FROM job WHERE id = ?", id)
require.NoError(t, err)
})
}
func TestStartDirect(t *testing.T) {
r := setup(t)
t.Run("inserts into job table with JSON encoding", func(t *testing.T) {
job := createTestJob(999022, "testcluster")
id, err := r.StartDirect(job)
require.NoError(t, err, "StartDirect should succeed")
assert.Greater(t, id, int64(0))
// Verify job is in job table with encoded JSON
var rawResources []byte
err = r.DB.QueryRow("SELECT resources FROM job WHERE id = ?", id).Scan(&rawResources)
require.NoError(t, err)
var resources []*schema.Resource
err = json.Unmarshal(rawResources, &resources)
require.NoError(t, err, "Resources should be valid JSON")
assert.Equal(t, "node01", resources[0].Hostname)
// Clean up
_, err = r.DB.Exec("DELETE FROM job WHERE id = ?", id)
require.NoError(t, err)
})
}
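Together, TestInsertJobDirect and TestStartDirect document the split write path: Start buffers rows in job_cache until SyncJobs moves them, while the *Direct variants write straight to the job table so the returned ID immediately satisfies foreign keys such as jobtag.job_id. A hedged sketch of choosing a path — the fromImport flag and helper name are invented for illustration:

// startJob is hypothetical; only Start and StartDirect are part of the repository.
func startJob(r *repository.JobRepository, job *schema.Job, fromImport bool) (int64, error) {
	if fromImport {
		// Imports need a job-table ID right away (e.g. for ImportTag).
		return r.StartDirect(job)
	}
	// Normal path: buffer in job_cache; a later SyncJobs moves the row.
	return r.Start(job)
}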

View File

@@ -2,6 +2,7 @@
// All rights reserved. This file is part of cc-backend.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
package repository
import (
@@ -11,8 +12,8 @@ import (
"time"
"github.com/ClusterCockpit/cc-backend/internal/graph/model"
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
"github.com/ClusterCockpit/cc-lib/schema"
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
"github.com/ClusterCockpit/cc-lib/v2/schema"
sq "github.com/Masterminds/squirrel"
)
@@ -22,13 +23,17 @@ import (
// It returns a pointer to a schema.Job data structure and an error variable.
// To check if no job was found test err == sql.ErrNoRows
func (r *JobRepository) Find(
jobId *int64,
jobID *int64,
cluster *string,
startTime *int64,
) (*schema.Job, error) {
if jobID == nil {
return nil, fmt.Errorf("jobID cannot be nil")
}
start := time.Now()
q := sq.Select(jobColumns...).From("job").
Where("job.job_id = ?", *jobId)
Where("job.job_id = ?", *jobID)
if cluster != nil {
q = q.Where("job.cluster = ?", *cluster)
@@ -37,19 +42,29 @@ func (r *JobRepository) Find(
q = q.Where("job.start_time = ?", *startTime)
}
q = q.OrderBy("job.id DESC") // always use newest matching job by db id if more than one match
q = q.OrderBy("job.id DESC").Limit(1) // always use newest matching job by db id if more than one match
cclog.Debugf("Timer Find %s", time.Since(start))
return scanJob(q.RunWith(r.stmtCache).QueryRow())
}
// FindCached executes a SQL query to find a specific batch job from the job_cache table.
// The job is queried using the batch job id, and optionally filtered by cluster name
// and start time (UNIX epoch time seconds). This method uses cached job data which
// may be stale but provides faster access than Find().
// It returns a pointer to a schema.Job data structure and an error variable.
// To check if no job was found test err == sql.ErrNoRows
func (r *JobRepository) FindCached(
jobId *int64,
jobID *int64,
cluster *string,
startTime *int64,
) (*schema.Job, error) {
if jobID == nil {
return nil, fmt.Errorf("jobID cannot be nil")
}
q := sq.Select(jobCacheColumns...).From("job_cache").
Where("job_cache.job_id = ?", *jobId)
Where("job_cache.job_id = ?", *jobID)
if cluster != nil {
q = q.Where("job_cache.cluster = ?", *cluster)
@@ -58,24 +73,28 @@ func (r *JobRepository) FindCached(
q = q.Where("job_cache.start_time = ?", *startTime)
}
q = q.OrderBy("job_cache.id DESC") // always use newest matching job by db id if more than one match
q = q.OrderBy("job_cache.id DESC").Limit(1) // always use newest matching job by db id if more than one match
return scanJob(q.RunWith(r.stmtCache).QueryRow())
}
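Both finders now reject a nil jobID up front and cap the result with Limit(1). A sketch of the cache-then-main lookup a caller might build on top of them, keeping the documented err == sql.ErrNoRows convention (the fallback ordering is an assumption, not prescribed by this commit):

func lookupJob(r *repository.JobRepository, jobID int64, cluster string) (*schema.Job, error) {
	// Try the write buffer first: recently started jobs live in job_cache.
	job, err := r.FindCached(&jobID, &cluster, nil)
	if err == nil {
		return job, nil
	}
	if err != sql.ErrNoRows {
		return nil, err
	}
	// Not cached (or already synced): consult the main job table.
	return r.Find(&jobID, &cluster, nil)
}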
// Find executes a SQL query to find a specific batch job.
// The job is queried using the batch job id, the cluster name,
// and the start time of the job in UNIX epoch time seconds.
// It returns a pointer to a schema.Job data structure and an error variable.
// To check if no job was found test err == sql.ErrNoRows
// FindAll executes a SQL query to find all batch jobs matching the given criteria.
// Jobs are queried using the batch job id, and optionally filtered by cluster name
// and start time (UNIX epoch time seconds).
// It returns a slice of pointers to schema.Job data structures and an error variable.
// An empty slice is returned if no matching jobs are found.
func (r *JobRepository) FindAll(
jobId *int64,
jobID *int64,
cluster *string,
startTime *int64,
) ([]*schema.Job, error) {
if jobID == nil {
return nil, fmt.Errorf("jobID cannot be nil")
}
start := time.Now()
q := sq.Select(jobColumns...).From("job").
Where("job.job_id = ?", *jobId)
Where("job.job_id = ?", *jobID)
if cluster != nil {
q = q.Where("job.cluster = ?", *cluster)
@@ -86,8 +105,8 @@ func (r *JobRepository) FindAll(
rows, err := q.RunWith(r.stmtCache).Query()
if err != nil {
cclog.Error("Error while running query")
return nil, err
cclog.Errorf("Error while running FindAll query for jobID=%d: %v", *jobID, err)
return nil, fmt.Errorf("failed to execute FindAll query: %w", err)
}
defer rows.Close()
@@ -95,8 +114,8 @@ func (r *JobRepository) FindAll(
for rows.Next() {
job, err := scanJob(rows)
if err != nil {
cclog.Warn("Error while scanning rows")
return nil, err
cclog.Warnf("Error while scanning rows in FindAll: %v", err)
return nil, fmt.Errorf("failed to scan job row: %w", err)
}
jobs = append(jobs, job)
}
@@ -119,8 +138,8 @@ func (r *JobRepository) GetJobList(limit int, offset int) ([]int64, error) {
rows, err := query.RunWith(r.stmtCache).Query()
if err != nil {
cclog.Error("Error while running query")
return nil, err
cclog.Errorf("Error while running GetJobList query (limit=%d, offset=%d): %v", limit, offset, err)
return nil, fmt.Errorf("failed to execute GetJobList query: %w", err)
}
defer rows.Close()
@@ -129,23 +148,23 @@ func (r *JobRepository) GetJobList(limit int, offset int) ([]int64, error) {
var id int64
err := rows.Scan(&id)
if err != nil {
cclog.Warn("Error while scanning rows")
return nil, err
cclog.Warnf("Error while scanning rows in GetJobList: %v", err)
return nil, fmt.Errorf("failed to scan job ID: %w", err)
}
jl = append(jl, id)
}
cclog.Infof("Return job count %d", len(jl))
cclog.Debugf("JobRepository.GetJobList(): Return job count %d", len(jl))
return jl, nil
}
// FindById executes a SQL query to find a specific batch job.
// FindByID executes a SQL query to find a specific batch job.
// The job is queried using the database id.
// It returns a pointer to a schema.Job data structure and an error variable.
// To check if no job was found test err == sql.ErrNoRows
func (r *JobRepository) FindById(ctx context.Context, jobId int64) (*schema.Job, error) {
func (r *JobRepository) FindByID(ctx context.Context, jobID int64) (*schema.Job, error) {
q := sq.Select(jobColumns...).
From("job").Where("job.id = ?", jobId)
From("job").Where("job.id = ?", jobID)
q, qerr := SecurityCheck(ctx, q)
if qerr != nil {
@@ -155,14 +174,14 @@ func (r *JobRepository) FindById(ctx context.Context, jobId int64) (*schema.Job,
return scanJob(q.RunWith(r.stmtCache).QueryRow())
}
// FindByIdWithUser executes a SQL query to find a specific batch job.
// FindByIDWithUser executes a SQL query to find a specific batch job.
// The job is queried using the database id. The user is passed directly,
// instead as part of the context.
// It returns a pointer to a schema.Job data structure and an error variable.
// To check if no job was found test err == sql.ErrNoRows
func (r *JobRepository) FindByIdWithUser(user *schema.User, jobId int64) (*schema.Job, error) {
func (r *JobRepository) FindByIDWithUser(user *schema.User, jobID int64) (*schema.Job, error) {
q := sq.Select(jobColumns...).
From("job").Where("job.id = ?", jobId)
From("job").Where("job.id = ?", jobID)
q, qerr := SecurityCheckWithUser(user, q)
if qerr != nil {
@@ -172,24 +191,24 @@ func (r *JobRepository) FindByIdWithUser(user *schema.User, jobId int64) (*schem
return scanJob(q.RunWith(r.stmtCache).QueryRow())
}
// FindByIdDirect executes a SQL query to find a specific batch job.
// FindByIDDirect executes a SQL query to find a specific batch job.
// The job is queried using the database id.
// It returns a pointer to a schema.Job data structure and an error variable.
// To check if no job was found test err == sql.ErrNoRows
func (r *JobRepository) FindByIdDirect(jobId int64) (*schema.Job, error) {
func (r *JobRepository) FindByIDDirect(jobID int64) (*schema.Job, error) {
q := sq.Select(jobColumns...).
From("job").Where("job.id = ?", jobId)
From("job").Where("job.id = ?", jobID)
return scanJob(q.RunWith(r.stmtCache).QueryRow())
}
// FindByJobId executes a SQL query to find a specific batch job.
// FindByJobID executes a SQL query to find a specific batch job.
// The job is queried using the slurm id and the clustername.
// It returns a pointer to a schema.Job data structure and an error variable.
// To check if no job was found test err == sql.ErrNoRows
func (r *JobRepository) FindByJobId(ctx context.Context, jobId int64, startTime int64, cluster string) (*schema.Job, error) {
func (r *JobRepository) FindByJobID(ctx context.Context, jobID int64, startTime int64, cluster string) (*schema.Job, error) {
q := sq.Select(jobColumns...).
From("job").
Where("job.job_id = ?", jobId).
Where("job.job_id = ?", jobID).
Where("job.cluster = ?", cluster).
Where("job.start_time = ?", startTime)
@@ -201,19 +220,22 @@ func (r *JobRepository) FindByJobId(ctx context.Context, jobId int64, startTime
return scanJob(q.RunWith(r.stmtCache).QueryRow())
}
// IsJobOwner executes a SQL query to find a specific batch job.
// The job is queried using the slurm id, a username, and the cluster.
// It returns a bool.
// If job was found, user is owner: test err != sql.ErrNoRows
func (r *JobRepository) IsJobOwner(jobId int64, startTime int64, user string, cluster string) bool {
// IsJobOwner checks if the specified user owns the batch job identified by jobID,
// startTime, and cluster. Returns true if the user is the owner, false otherwise.
// This method does not return errors; it returns false for both non-existent jobs
// and jobs owned by other users.
func (r *JobRepository) IsJobOwner(jobID int64, startTime int64, user string, cluster string) bool {
q := sq.Select("id").
From("job").
Where("job.job_id = ?", jobId).
Where("job.job_id = ?", jobID).
Where("job.hpc_user = ?", user).
Where("job.cluster = ?", cluster).
Where("job.start_time = ?", startTime)
_, err := scanJob(q.RunWith(r.stmtCache).QueryRow())
if err != nil && err != sql.ErrNoRows {
cclog.Warnf("IsJobOwner: unexpected error for jobID=%d, user=%s, cluster=%s: %v", jobID, user, cluster, err)
}
return err != sql.ErrNoRows
}
@@ -231,6 +253,11 @@ func (r *JobRepository) FindConcurrentJobs(
}
query = query.Where("cluster = ?", job.Cluster)
if len(job.Resources) == 0 {
return nil, fmt.Errorf("job has no resources defined")
}
var startTime int64
var stopTime int64
@@ -243,25 +270,28 @@ func (r *JobRepository) FindConcurrentJobs(
stopTime = startTime + int64(job.Duration)
}
// Add 200s overlap for job start times at the end
startTimeTail := startTime + 10
stopTimeTail := stopTime - 200
startTimeFront := startTime + 200
// Time buffer constant for finding overlapping jobs
// overlapBufferEnd: 200s buffer at job end to account for scheduling/cleanup overlap
const overlapBufferEnd = 200
queryRunning := query.Where("job.job_state = ?").Where("(job.start_time BETWEEN ? AND ? OR job.start_time < ?)",
"running", startTimeTail, stopTimeTail, startTime)
stopTimeTail := stopTime - overlapBufferEnd
startTimeFront := startTime + overlapBufferEnd
queryRunning := query.Where("job.job_state = ?", "running").
Where("job.start_time <= ?", stopTimeTail)
// Get At Least One Exact Hostname Match from JSON Resources Array in Database
queryRunning = queryRunning.Where("EXISTS (SELECT 1 FROM json_each(job.resources) WHERE json_extract(value, '$.hostname') = ?)", hostname)
query = query.Where("job.job_state != ?").Where("((job.start_time BETWEEN ? AND ?) OR (job.start_time + job.duration) BETWEEN ? AND ? OR (job.start_time < ?) AND (job.start_time + job.duration) > ?)",
"running", startTimeTail, stopTimeTail, startTimeFront, stopTimeTail, startTime, stopTime)
query = query.Where("job.job_state != ?", "running").
Where("job.start_time < ?", stopTimeTail).
Where("(job.start_time + job.duration) > ?", startTimeFront)
// Get At Least One Exact Hostname Match from JSON Resources Array in Database
query = query.Where("EXISTS (SELECT 1 FROM json_each(job.resources) WHERE json_extract(value, '$.hostname') = ?)", hostname)
rows, err := query.RunWith(r.stmtCache).Query()
if err != nil {
cclog.Errorf("Error while running query: %v", err)
return nil, err
cclog.Errorf("Error while running concurrent jobs query: %v", err)
return nil, fmt.Errorf("failed to execute concurrent jobs query: %w", err)
}
defer rows.Close()
@@ -269,44 +299,44 @@ func (r *JobRepository) FindConcurrentJobs(
queryString := fmt.Sprintf("cluster=%s", job.Cluster)
for rows.Next() {
var id, jobId, startTime sql.NullInt64
var id, jobID, startTime sql.NullInt64
if err = rows.Scan(&id, &jobId, &startTime); err != nil {
cclog.Warn("Error while scanning rows")
return nil, err
if err = rows.Scan(&id, &jobID, &startTime); err != nil {
cclog.Warnf("Error while scanning concurrent job rows: %v", err)
return nil, fmt.Errorf("failed to scan concurrent job row: %w", err)
}
if id.Valid {
queryString += fmt.Sprintf("&jobId=%d", int(jobId.Int64))
queryString += fmt.Sprintf("&jobId=%d", int(jobID.Int64))
items = append(items,
&model.JobLink{
ID: fmt.Sprint(id.Int64),
JobID: int(jobId.Int64),
JobID: int(jobID.Int64),
})
}
}
rows, err = queryRunning.RunWith(r.stmtCache).Query()
if err != nil {
cclog.Errorf("Error while running query: %v", err)
return nil, err
cclog.Errorf("Error while running concurrent running jobs query: %v", err)
return nil, fmt.Errorf("failed to execute concurrent running jobs query: %w", err)
}
defer rows.Close()
for rows.Next() {
var id, jobId, startTime sql.NullInt64
var id, jobID, startTime sql.NullInt64
if err := rows.Scan(&id, &jobId, &startTime); err != nil {
cclog.Warn("Error while scanning rows")
return nil, err
if err := rows.Scan(&id, &jobID, &startTime); err != nil {
cclog.Warnf("Error while scanning running concurrent job rows: %v", err)
return nil, fmt.Errorf("failed to scan running concurrent job row: %w", err)
}
if id.Valid {
queryString += fmt.Sprintf("&jobId=%d", int(jobId.Int64))
queryString += fmt.Sprintf("&jobId=%d", int(jobID.Int64))
items = append(items,
&model.JobLink{
ID: fmt.Sprint(id.Int64),
JobID: int(jobId.Int64),
JobID: int(jobID.Int64),
})
}
}
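The rewrite replaces the old BETWEEN windows with two half-open comparisons: a completed job overlaps when it starts before stopTimeTail and ends after startTimeFront, with overlapBufferEnd trimming scheduling and cleanup slack at both edges (running jobs only need the start-side check, since their end is unknown). The same predicate in plain Go, mirroring the completed-job query above; the helper itself is illustrative:

// overlaps reports whether a completed candidate job shares wall time
// with [startTime, stopTime], ignoring overlapBufferEnd seconds of slack
// at either end — the Go analogue of the SQL conditions above.
func overlaps(candStart, candDuration, startTime, stopTime int64) bool {
	stopTimeTail := stopTime - overlapBufferEnd
	startTimeFront := startTime + overlapBufferEnd
	candEnd := candStart + candDuration
	return candStart < stopTimeTail && candEnd > startTimeFront
}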

View File

@@ -2,16 +2,45 @@
// All rights reserved. This file is part of cc-backend.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
package repository
import (
"sync"
"github.com/ClusterCockpit/cc-lib/schema"
"github.com/ClusterCockpit/cc-lib/v2/schema"
)
// JobHook interface allows external components to hook into job lifecycle events.
// Implementations can perform actions when jobs start or stop, such as tagging,
// logging, notifications, or triggering external workflows.
//
// Example implementation:
//
// type MyJobTagger struct{}
//
// func (t *MyJobTagger) JobStartCallback(job *schema.Job) {
// if job.NumNodes > 100 {
// // Tag large jobs automatically
// }
// }
//
// func (t *MyJobTagger) JobStopCallback(job *schema.Job) {
// if job.State == schema.JobStateFailed {
// // Log or alert on failed jobs
// }
// }
//
// Register hooks during application initialization:
//
// repository.RegisterJobHook(&MyJobTagger{})
type JobHook interface {
// JobStartCallback is invoked when one or more jobs start.
// This is called synchronously, so implementations should be fast.
JobStartCallback(job *schema.Job)
// JobStopCallback is invoked when a job completes.
// This is called synchronously, so implementations should be fast.
JobStopCallback(job *schema.Job)
}
@@ -20,7 +49,13 @@ var (
hooks []JobHook
)
func RegisterJobJook(hook JobHook) {
// RegisterJobHook registers a JobHook to receive job lifecycle callbacks.
// Multiple hooks can be registered and will be called in registration order.
// This function is safe to call multiple times and is typically called during
// application initialization.
//
// Nil hooks are silently ignored to simplify conditional registration.
func RegisterJobHook(hook JobHook) {
initOnce.Do(func() {
hooks = make([]JobHook, 0)
})
@@ -30,6 +65,12 @@ func RegisterJobJook(hook JobHook) {
}
}
// CallJobStartHooks invokes all registered JobHook.JobStartCallback methods
// for each job in the provided slice. This is called internally by the repository
// when jobs are started (e.g., via StartJob or batch job imports).
//
// Hooks are called synchronously in registration order. If a hook panics,
// the panic will propagate to the caller.
func CallJobStartHooks(jobs []*schema.Job) {
if hooks == nil {
return
@@ -44,6 +85,12 @@ func CallJobStartHooks(jobs []*schema.Job) {
}
}
// CallJobStopHooks invokes all registered JobHook.JobStopCallback methods
// for the provided job. This is called internally by the repository when a
// job completes (e.g., via StopJob or job state updates).
//
// Hooks are called synchronously in registration order. If a hook panics,
// the panic will propagate to the caller.
func CallJobStopHooks(job *schema.Job) {
if hooks == nil {
return

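With the exported name fixed from RegisterJobJook to RegisterJobHook, wiring a hook follows the doc-comment example above. A compact hedged variant that only logs — the LoggingHook type is invented for illustration:

type LoggingHook struct{}

func (h *LoggingHook) JobStartCallback(job *schema.Job) {
	cclog.Infof("job %d started on cluster %s", job.JobID, job.Cluster)
}

func (h *LoggingHook) JobStopCallback(job *schema.Job) {
	cclog.Infof("job %d stopped in state %s", job.JobID, job.State)
}

// During application initialization:
//	repository.RegisterJobHook(&LoggingHook{})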
View File

@@ -2,6 +2,10 @@
// All rights reserved. This file is part of cc-backend.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
// Package repository provides job query functionality with filtering, pagination,
// and security controls. This file contains the main query builders and security
// checks for job retrieval operations.
package repository
import (
@@ -14,11 +18,27 @@ import (
"github.com/ClusterCockpit/cc-backend/internal/config"
"github.com/ClusterCockpit/cc-backend/internal/graph/model"
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
"github.com/ClusterCockpit/cc-lib/schema"
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
"github.com/ClusterCockpit/cc-lib/v2/schema"
sq "github.com/Masterminds/squirrel"
)
const (
// Default initial capacity for job result slices
defaultJobsCapacity = 50
)
// QueryJobs retrieves jobs from the database with optional filtering, pagination,
// and sorting. Security controls are automatically applied based on the user context.
//
// Parameters:
// - ctx: Context containing user authentication information
// - filters: Optional job filters (cluster, state, user, time ranges, etc.)
// - page: Optional pagination parameters (page number and items per page)
// - order: Optional sorting specification (column or footprint field)
//
// Returns a slice of jobs matching the criteria, or an error if the query fails.
// The function enforces role-based access control through SecurityCheck.
func (r *JobRepository) QueryJobs(
ctx context.Context,
filters []*model.JobFilter,
@@ -33,26 +53,24 @@ func (r *JobRepository) QueryJobs(
if order != nil {
field := toSnakeCase(order.Field)
if order.Type == "col" {
// "col": Fixed column name query
switch order.Order {
case model.SortDirectionEnumAsc:
query = query.OrderBy(fmt.Sprintf("job.%s ASC", field))
case model.SortDirectionEnumDesc:
query = query.OrderBy(fmt.Sprintf("job.%s DESC", field))
default:
return nil, errors.New("REPOSITORY/QUERY > invalid sorting order for column")
return nil, errors.New("invalid sorting order for column")
}
} else {
// "foot": Order by footprint JSON field values
// Verify and Search Only in Valid Jsons
query = query.Where("JSON_VALID(meta_data)")
// Order by footprint JSON field values
query = query.Where("JSON_VALID(footprint)")
switch order.Order {
case model.SortDirectionEnumAsc:
query = query.OrderBy(fmt.Sprintf("JSON_EXTRACT(footprint, \"$.%s\") ASC", field))
case model.SortDirectionEnumDesc:
query = query.OrderBy(fmt.Sprintf("JSON_EXTRACT(footprint, \"$.%s\") DESC", field))
default:
return nil, errors.New("REPOSITORY/QUERY > invalid sorting order for footprint")
return nil, errors.New("invalid sorting order for footprint")
}
}
}
@@ -69,29 +87,35 @@ func (r *JobRepository) QueryJobs(
rows, err := query.RunWith(r.stmtCache).Query()
if err != nil {
queryString, queryVars, _ := query.ToSql()
cclog.Errorf("Error while running query '%s' %v: %v", queryString, queryVars, err)
return nil, err
return nil, fmt.Errorf("query failed [%s] %v: %w", queryString, queryVars, err)
}
defer rows.Close()
jobs := make([]*schema.Job, 0, 50)
jobs := make([]*schema.Job, 0, defaultJobsCapacity)
for rows.Next() {
job, err := scanJob(rows)
if err != nil {
rows.Close()
cclog.Warn("Error while scanning rows (Jobs)")
return nil, err
cclog.Warnf("Error scanning job row: %v", err)
return nil, fmt.Errorf("failed to scan job row: %w", err)
}
jobs = append(jobs, job)
}
if err := rows.Err(); err != nil {
return nil, fmt.Errorf("error iterating job rows: %w", err)
}
return jobs, nil
}
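A sketch of driving QueryJobs with a cluster filter, pagination, and a column sort. The filter shape follows model.JobFilter and model.StringInput as used below; the PageRequest and OrderByInput field names are assumptions based on how order.Field, order.Type, and order.Order are read above:

func recentClusterJobs(ctx context.Context, r *repository.JobRepository) ([]*schema.Job, error) {
	cluster := "testcluster" // hypothetical cluster name
	filters := []*model.JobFilter{
		{Cluster: &model.StringInput{Eq: &cluster}},
	}
	page := &model.PageRequest{Page: 1, ItemsPerPage: 25}
	order := &model.OrderByInput{
		Field: "startTime", // toSnakeCase turns this into job.start_time
		Type:  "col",
		Order: model.SortDirectionEnumDesc,
	}
	return r.QueryJobs(ctx, filters, page, order)
}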
// CountJobs returns the total number of jobs matching the given filters.
// Security controls are automatically applied based on the user context.
// Uses DISTINCT count to handle tag filters correctly (jobs may appear multiple
// times when joined with the tag table).
func (r *JobRepository) CountJobs(
ctx context.Context,
filters []*model.JobFilter,
) (int, error) {
// DISTINCT count for tag filters; does not affect other queries
query, qerr := SecurityCheck(ctx, sq.Select("count(DISTINCT job.id)").From("job"))
if qerr != nil {
return 0, qerr
@@ -103,12 +127,22 @@ func (r *JobRepository) CountJobs(
var count int
if err := query.RunWith(r.DB).Scan(&count); err != nil {
return 0, err
return 0, fmt.Errorf("failed to count jobs: %w", err)
}
return count, nil
}
// SecurityCheckWithUser applies role-based access control filters to a job query
// based on the provided user's roles and permissions.
//
// Access rules by role:
// - API role (exclusive): Full access to all jobs
// - Admin/Support roles: Full access to all jobs
// - Manager role: Access to jobs in managed projects plus own jobs
// - User role: Access only to own jobs
//
// Returns an error if the user is nil or has no recognized roles.
func SecurityCheckWithUser(user *schema.User, query sq.SelectBuilder) (sq.SelectBuilder, error) {
if user == nil {
var qnil sq.SelectBuilder
@@ -116,84 +150,68 @@ func SecurityCheckWithUser(user *schema.User, query sq.SelectBuilder) (sq.Select
}
switch {
case len(user.Roles) == 1 && user.HasRole(schema.RoleApi): // API-User : All jobs
case len(user.Roles) == 1 && user.HasRole(schema.RoleAPI):
return query, nil
case user.HasAnyRole([]schema.Role{schema.RoleAdmin, schema.RoleSupport}): // Admin & Support : All jobs
case user.HasAnyRole([]schema.Role{schema.RoleAdmin, schema.RoleSupport}):
return query, nil
case user.HasRole(schema.RoleManager): // Manager : Add filter for managed projects' jobs only + personal jobs
case user.HasRole(schema.RoleManager):
if len(user.Projects) != 0 {
return query.Where(sq.Or{sq.Eq{"job.project": user.Projects}, sq.Eq{"job.hpc_user": user.Username}}), nil
} else {
cclog.Debugf("Manager-User '%s' has no defined projects to lookup! Query only personal jobs ...", user.Username)
return query.Where("job.hpc_user = ?", user.Username), nil
}
case user.HasRole(schema.RoleUser): // User : Only personal jobs
cclog.Debugf("Manager '%s' has no assigned projects, restricting to personal jobs", user.Username)
return query.Where("job.hpc_user = ?", user.Username), nil
default: // No known Role, return error
case user.HasRole(schema.RoleUser):
return query.Where("job.hpc_user = ?", user.Username), nil
default:
var qnil sq.SelectBuilder
return qnil, fmt.Errorf("user has no or unknown roles")
}
}
// SecurityCheck extracts the user from the context and applies role-based access
// control filters to the query. This is a convenience wrapper around SecurityCheckWithUser.
func SecurityCheck(ctx context.Context, query sq.SelectBuilder) (sq.SelectBuilder, error) {
user := GetUserFromContext(ctx)
return SecurityCheckWithUser(user, query)
}
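The role ladder is easiest to see from the WHERE clauses it produces. A hedged sketch (schema.User construction is simplified; only the rules listed in the doc comment are asserted):

func scopedQuerySQL(u *schema.User) (string, error) {
	q, err := repository.SecurityCheckWithUser(u, sq.Select("id").From("job"))
	if err != nil {
		return "", err // nil user or no recognized role
	}
	// API-only, Admin, Support: unchanged query.
	// Manager with projects:    WHERE (job.project IN (...) OR job.hpc_user = ?)
	// Manager without projects: WHERE job.hpc_user = ?
	// User:                     WHERE job.hpc_user = ?
	sqlStr, _, err := q.ToSql()
	return sqlStr, err
}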
// Build a sq.SelectBuilder out of a schema.JobFilter.
// BuildWhereClause constructs SQL WHERE conditions from a JobFilter and applies
// them to the query. Supports filtering by job properties (cluster, state, user),
// time ranges, resource usage, tags, and JSON field searches in meta_data,
// footprint, and resources columns.
func BuildWhereClause(filter *model.JobFilter, query sq.SelectBuilder) sq.SelectBuilder {
if filter.Tags != nil {
// This is an OR-Logic query: Returns all distinct jobs with at least one of the requested tags; TODO: AND-Logic query?
query = query.Join("jobtag ON jobtag.job_id = job.id").Where(sq.Eq{"jobtag.tag_id": filter.Tags}).Distinct()
}
// Primary Key
if filter.DbID != nil {
dbIDs := make([]string, len(filter.DbID))
copy(dbIDs, filter.DbID)
query = query.Where(sq.Eq{"job.id": dbIDs})
}
if filter.JobID != nil {
query = buildStringCondition("job.job_id", filter.JobID, query)
}
if filter.ArrayJobID != nil {
query = query.Where("job.array_job_id = ?", *filter.ArrayJobID)
}
if filter.User != nil {
query = buildStringCondition("job.hpc_user", filter.User, query)
}
if filter.Project != nil {
query = buildStringCondition("job.project", filter.Project, query)
}
if filter.JobName != nil {
query = buildMetaJsonCondition("jobName", filter.JobName, query)
}
// Explicit indices
if filter.Cluster != nil {
query = buildStringCondition("job.cluster", filter.Cluster, query)
}
if filter.SubCluster != nil {
query = buildStringCondition("job.subcluster", filter.SubCluster, query)
}
if filter.Partition != nil {
query = buildStringCondition("job.cluster_partition", filter.Partition, query)
}
if filter.StartTime != nil {
query = buildTimeCondition("job.start_time", filter.StartTime, query)
}
if filter.Duration != nil {
query = buildIntCondition("job.duration", filter.Duration, query)
}
if filter.MinRunningFor != nil {
now := time.Now().Unix() // There does not seem to be a portable way to get the current unix timestamp across different DBs.
query = query.Where("(job.job_state != 'running' OR (? - job.start_time) > ?)", now, *filter.MinRunningFor)
}
if filter.Shared != nil {
query = query.Where("job.shared = ?", *filter.Shared)
}
if filter.State != nil {
states := make([]string, len(filter.State))
for i, val := range filter.State {
states[i] = string(val)
}
query = query.Where(sq.Eq{"job.job_state": states})
}
if filter.Shared != nil {
query = query.Where("job.shared = ?", *filter.Shared)
}
if filter.Project != nil {
query = buildStringCondition("job.project", filter.Project, query)
}
if filter.User != nil {
query = buildStringCondition("job.hpc_user", filter.User, query)
}
if filter.NumNodes != nil {
query = buildIntCondition("job.num_nodes", filter.NumNodes, query)
}
@@ -203,33 +221,95 @@ func BuildWhereClause(filter *model.JobFilter, query sq.SelectBuilder) sq.Select
if filter.NumHWThreads != nil {
query = buildIntCondition("job.num_hwthreads", filter.NumHWThreads, query)
}
if filter.Node != nil {
query = buildResourceJsonCondition("hostname", filter.Node, query)
if filter.ArrayJobID != nil {
query = query.Where("job.array_job_id = ?", *filter.ArrayJobID)
}
if filter.StartTime != nil {
query = buildTimeCondition("job.start_time", filter.StartTime, query)
}
if filter.Duration != nil {
query = buildIntCondition("job.duration", filter.Duration, query)
}
if filter.Energy != nil {
query = buildFloatCondition("job.energy", filter.Energy, query)
}
// Indices on Tag Table
if filter.Tags != nil {
// This is an OR-Logic query: Returns all distinct jobs with at least one of the requested tags; TODO: AND-Logic query?
query = query.Join("jobtag ON jobtag.job_id = job.id").Where(sq.Eq{"jobtag.tag_id": filter.Tags}).Distinct()
}
// No explicit Indices
if filter.JobID != nil {
query = buildStringCondition("job.job_id", filter.JobID, query)
}
// Queries Within JSONs
if filter.MetricStats != nil {
for _, ms := range filter.MetricStats {
query = buildFloatJsonCondition(ms.MetricName, ms.Range, query)
query = buildFloatJSONCondition(ms.MetricName, ms.Range, query)
}
}
if filter.Node != nil {
query = buildResourceJSONCondition("hostname", filter.Node, query)
}
if filter.JobName != nil {
query = buildMetaJSONCondition("jobName", filter.JobName, query)
}
if filter.Schedule != nil {
interactiveJobname := "interactive"
switch *filter.Schedule {
case "interactive":
iFilter := model.StringInput{Eq: &interactiveJobname}
query = buildMetaJSONCondition("jobName", &iFilter, query)
case "batch":
sFilter := model.StringInput{Neq: &interactiveJobname}
query = buildMetaJSONCondition("jobName", &sFilter, query)
}
}
// Configurable Filter to exclude recently started jobs, see config.go: ShortRunningJobsDuration
if filter.MinRunningFor != nil {
now := time.Now().Unix()
// Only jobs whose start timestamp is more than MinRunningFor seconds in the past
// A job that completed within the configured timeframe will still show up once its start_time satisfies the condition!
query = query.Where(sq.Lt{"job.start_time": (now - int64(*filter.MinRunningFor))})
}
return query
}
// buildIntCondition creates clauses for integer range filters, using BETWEEN only if required.
func buildIntCondition(field string, cond *config.IntRange, query sq.SelectBuilder) sq.SelectBuilder {
return query.Where(field+" BETWEEN ? AND ?", cond.From, cond.To)
if cond.From != 1 && cond.To != 0 {
return query.Where(field+" BETWEEN ? AND ?", cond.From, cond.To)
} else if cond.From != 1 && cond.To == 0 {
return query.Where(field+" >= ?", cond.From)
} else if cond.From == 1 && cond.To != 0 {
return query.Where(field+" <= ?", cond.To)
} else {
return query
}
}
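The branches imply sentinel semantics the commit does not spell out: From == 1 and To == 0 each read as "bound unset", so BETWEEN is emitted only when both bounds carry information. Assuming exactly that, the emitted SQL looks like this (illustrative, in-package sketch since buildIntCondition is unexported):

func demoIntCondition() {
	q := sq.Select("id").From("job")
	ranges := []config.IntRange{
		{From: 100, To: 3600}, // both set   -> job.duration BETWEEN ? AND ?
		{From: 100, To: 0},    // To unset   -> job.duration >= ?
		{From: 1, To: 3600},   // From unset -> job.duration <= ?
	}
	for i := range ranges {
		sqlStr, _, _ := buildIntCondition("job.duration", &ranges[i], q).ToSql()
		fmt.Println(sqlStr)
	}
}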
// buildFloatCondition creates clauses for float range filters, using BETWEEN only if required.
func buildFloatCondition(field string, cond *model.FloatRange, query sq.SelectBuilder) sq.SelectBuilder {
return query.Where(field+" BETWEEN ? AND ?", cond.From, cond.To)
if cond.From != 1.0 && cond.To != 0.0 {
return query.Where(field+" BETWEEN ? AND ?", cond.From, cond.To)
} else if cond.From != 1.0 && cond.To == 0.0 {
return query.Where(field+" >= ?", cond.From)
} else if cond.From == 1.0 && cond.To != 0.0 {
return query.Where(field+" <= ?", cond.To)
} else {
return query
}
}
// buildTimeCondition creates time range filters supporting absolute timestamps,
// relative time ranges (last6h, last24h, last7d, last30d), or open-ended ranges.
// Reminder: BETWEEN queries are slower and don't use indices as frequently; only use when both bounds are required
func buildTimeCondition(field string, cond *config.TimeRange, query sq.SelectBuilder) sq.SelectBuilder {
if cond.From != nil && cond.To != nil {
return query.Where(field+" BETWEEN ? AND ?", cond.From.Unix(), cond.To.Unix())
} else if cond.From != nil {
return query.Where("? <= "+field, cond.From.Unix())
return query.Where(field+" >= ?", cond.From.Unix())
} else if cond.To != nil {
return query.Where(field+" <= ?", cond.To.Unix())
} else if cond.Range != "" {
@@ -248,18 +328,28 @@ func buildTimeCondition(field string, cond *config.TimeRange, query sq.SelectBui
cclog.Debugf("No known named timeRange: startTime.range = %s", cond.Range)
return query
}
return query.Where(field+" BETWEEN ? AND ?", then, now)
return query.Where(field+" >= ?", then)
} else {
return query
}
}
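Named ranges now emit a single lower bound instead of a BETWEEN spanning then..now, so an "everything since" window can use the start_time index directly. A brief in-package sketch (config.TimeRange field names taken from the code above):

func demoTimeCondition() {
	cond := &config.TimeRange{Range: "last24h"}
	q := buildTimeCondition("job.start_time", cond, sq.Select("id").From("job"))
	sqlStr, args, _ := q.ToSql()
	fmt.Println(sqlStr, args) // ... WHERE job.start_time >= ?  [then]
}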
func buildFloatJsonCondition(condName string, condRange *model.FloatRange, query sq.SelectBuilder) sq.SelectBuilder {
// Verify and Search Only in Valid Jsons
// buildFloatJSONCondition creates a filter on a numeric field within the footprint JSON column, using BETWEEN only if required.
func buildFloatJSONCondition(jsonField string, cond *model.FloatRange, query sq.SelectBuilder) sq.SelectBuilder {
query = query.Where("JSON_VALID(footprint)")
return query.Where("JSON_EXTRACT(footprint, \"$."+condName+"\") BETWEEN ? AND ?", condRange.From, condRange.To)
if cond.From != 1.0 && cond.To != 0.0 {
return query.Where("JSON_EXTRACT(footprint, \"$."+jsonField+"\") BETWEEN ? AND ?", cond.From, cond.To)
} else if cond.From != 1.0 && cond.To == 0.0 {
return query.Where("JSON_EXTRACT(footprint, \"$."+jsonField+"\") >= ?", cond.From)
} else if cond.From == 1.0 && cond.To != 0.0 {
return query.Where("JSON_EXTRACT(footprint, \"$."+jsonField+"\") <= ?", cond.To)
} else {
return query
}
}
// buildStringCondition creates filters for string fields supporting equality,
// inequality, prefix, suffix, substring, and IN list matching.
func buildStringCondition(field string, cond *model.StringInput, query sq.SelectBuilder) sq.SelectBuilder {
if cond.Eq != nil {
return query.Where(field+" = ?", *cond.Eq)
@@ -284,10 +374,9 @@ func buildStringCondition(field string, cond *model.StringInput, query sq.Select
return query
}
func buildMetaJsonCondition(jsonField string, cond *model.StringInput, query sq.SelectBuilder) sq.SelectBuilder {
// Verify and Search Only in Valid Jsons
// buildMetaJSONCondition creates filters on fields within the meta_data JSON column.
func buildMetaJSONCondition(jsonField string, cond *model.StringInput, query sq.SelectBuilder) sq.SelectBuilder {
query = query.Where("JSON_VALID(meta_data)")
// add "AND" Sql query Block for field match
if cond.Eq != nil {
return query.Where("JSON_EXTRACT(meta_data, \"$."+jsonField+"\") = ?", *cond.Eq)
}
@@ -306,10 +395,10 @@ func buildMetaJsonCondition(jsonField string, cond *model.StringInput, query sq.
return query
}
func buildResourceJsonCondition(jsonField string, cond *model.StringInput, query sq.SelectBuilder) sq.SelectBuilder {
// Verify and Search Only in Valid Jsons
// buildResourceJSONCondition creates filters on fields within the resources JSON array column.
// Uses json_each to search within array elements.
func buildResourceJSONCondition(jsonField string, cond *model.StringInput, query sq.SelectBuilder) sq.SelectBuilder {
query = query.Where("JSON_VALID(resources)")
// add "AND" Sql query Block for field match
if cond.Eq != nil {
return query.Where("EXISTS (SELECT 1 FROM json_each(job.resources) WHERE json_extract(value, \"$."+jsonField+"\") = ?)", *cond.Eq)
}
@@ -333,15 +422,16 @@ var (
matchAllCap = regexp.MustCompile("([a-z0-9])([A-Z])")
)
// toSnakeCase converts camelCase strings to snake_case for SQL column names.
// Includes security checks to prevent SQL injection attempts.
// Panics if potentially dangerous characters are detected.
func toSnakeCase(str string) string {
for _, c := range str {
if c == '\'' || c == '\\' {
cclog.Panic("toSnakeCase() attack vector!")
if c == '\'' || c == '\\' || c == '"' || c == ';' || c == '-' || c == ' ' {
cclog.Panicf("toSnakeCase: potentially dangerous character detected in input: %q", str)
}
}
str = strings.ReplaceAll(str, "'", "")
str = strings.ReplaceAll(str, "\\", "")
snake := matchFirstCap.ReplaceAllString(str, "${1}_${2}")
snake = matchAllCap.ReplaceAllString(snake, "${1}_${2}")
return strings.ToLower(snake)
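A few expected conversions, as an in-package test sketch; the exact outputs follow from the two regexes above, and the panic behaviour of cclog.Panicf is assumed from its name:

func TestToSnakeCaseExamples(t *testing.T) {
	cases := map[string]string{
		"startTime": "start_time",
		"numNodes":  "num_nodes",
		"jobState":  "job_state",
	}
	for in, want := range cases {
		if got := toSnakeCase(in); got != want {
			t.Errorf("toSnakeCase(%q) = %q, want %q", in, got, want)
		}
	}
	// Inputs containing ' \ " ; - or space now hit cclog.Panicf,
	// so never pass raw user input through this function.
}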

View File

@@ -10,7 +10,7 @@ import (
"testing"
"time"
"github.com/ClusterCockpit/cc-lib/schema"
"github.com/ClusterCockpit/cc-lib/v2/schema"
_ "github.com/mattn/go-sqlite3"
)
@@ -33,7 +33,7 @@ func TestFind(t *testing.T) {
func TestFindById(t *testing.T) {
r := setup(t)
job, err := r.FindById(getContext(t), 338)
job, err := r.FindByID(getContext(t), 338)
if err != nil {
t.Fatal(err)
}
@@ -78,7 +78,7 @@ func TestFindJobsBetween(t *testing.T) {
// 1. Find a job to use (Find all jobs)
// We use a large time range to ensure we get something if it exists
jobs, err := r.FindJobsBetween(0, 9999999999, false)
jobs, err := r.FindJobsBetween(0, 9999999999, "none")
if err != nil {
t.Fatal(err)
}
@@ -88,21 +88,21 @@ func TestFindJobsBetween(t *testing.T) {
targetJob := jobs[0]
// 2. Create a tag
tagName := fmt.Sprintf("testtag_%d", time.Now().UnixNano())
tagId, err := r.CreateTag("testtype", tagName, "global")
// 2. Create an auto-tagger tag (type "app")
appTagName := fmt.Sprintf("apptag_%d", time.Now().UnixNano())
appTagID, err := r.CreateTag("app", appTagName, "global")
if err != nil {
t.Fatal(err)
}
// 3. Link Tag (Manually to avoid archive dependency side-effects in unit test)
_, err = r.DB.Exec("INSERT INTO jobtag (job_id, tag_id) VALUES (?, ?)", *targetJob.ID, tagId)
// 3. Link auto-tagger tag to job
_, err = r.DB.Exec("INSERT INTO jobtag (job_id, tag_id) VALUES (?, ?)", *targetJob.ID, appTagID)
if err != nil {
t.Fatal(err)
}
// 4. Search with omitTagged = false (Should find the job)
jobsFound, err := r.FindJobsBetween(0, 9999999999, false)
// 4. Search with omitTagged = "none" (Should find the job)
jobsFound, err := r.FindJobsBetween(0, 9999999999, "none")
if err != nil {
t.Fatal(err)
}
@@ -115,18 +115,58 @@ func TestFindJobsBetween(t *testing.T) {
}
}
if !found {
t.Errorf("Target job %d should be found when omitTagged=false", *targetJob.ID)
t.Errorf("Target job %d should be found when omitTagged=none", *targetJob.ID)
}
// 5. Search with omitTagged = true (Should NOT find the job)
jobsFiltered, err := r.FindJobsBetween(0, 9999999999, true)
// 5. Search with omitTagged = "all" (Should NOT find the job — it has a tag)
jobsFiltered, err := r.FindJobsBetween(0, 9999999999, "all")
if err != nil {
t.Fatal(err)
}
for _, j := range jobsFiltered {
if *j.ID == *targetJob.ID {
t.Errorf("Target job %d should NOT be found when omitTagged=true", *targetJob.ID)
t.Errorf("Target job %d should NOT be found when omitTagged=all", *targetJob.ID)
}
}
// 6. Search with omitTagged = "user": auto-tagger tag ("app") should NOT exclude the job
jobsUserFilter, err := r.FindJobsBetween(0, 9999999999, "user")
if err != nil {
t.Fatal(err)
}
found = false
for _, j := range jobsUserFilter {
if *j.ID == *targetJob.ID {
found = true
break
}
}
if !found {
t.Errorf("Target job %d should be found when omitTagged=user (only has auto-tagger tag)", *targetJob.ID)
}
// 7. Add a user-created tag (type "testtype") to the same job
userTagName := fmt.Sprintf("usertag_%d", time.Now().UnixNano())
userTagID, err := r.CreateTag("testtype", userTagName, "global")
if err != nil {
t.Fatal(err)
}
_, err = r.DB.Exec("INSERT INTO jobtag (job_id, tag_id) VALUES (?, ?)", *targetJob.ID, userTagID)
if err != nil {
t.Fatal(err)
}
// 8. Now omitTagged = "user" should exclude the job (has a user-created tag)
jobsUserFilter2, err := r.FindJobsBetween(0, 9999999999, "user")
if err != nil {
t.Fatal(err)
}
for _, j := range jobsUserFilter2 {
if *j.ID == *targetJob.ID {
t.Errorf("Target job %d should NOT be found when omitTagged=user (has user-created tag)", *targetJob.ID)
}
}
}
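For reference, the three omitTagged modes exercised above (with tag type "app" standing in for auto-tagger tags):

//	r.FindJobsBetween(lo, hi, "none") // no tag filtering at all
//	r.FindJobsBetween(lo, hi, "all")  // exclude jobs carrying any tag
//	r.FindJobsBetween(lo, hi, "user") // exclude only jobs with user-created tags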

View File

@@ -10,52 +10,48 @@ import (
"embed"
"fmt"
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
"github.com/golang-migrate/migrate/v4"
"github.com/golang-migrate/migrate/v4/database/mysql"
"github.com/golang-migrate/migrate/v4/database/sqlite3"
"github.com/golang-migrate/migrate/v4/source/iofs"
)
// Version is the current database schema version required by this version of cc-backend.
// When the database schema changes, this version is incremented and a new migration file
// is added to internal/repository/migrations/sqlite3/.
//
// Version history:
// - Version 10: Current version
//
// Migration files are embedded at build time from the migrations directory.
const Version uint = 10
//go:embed migrations/*
var migrationFiles embed.FS
func checkDBVersion(backend string, db *sql.DB) error {
var m *migrate.Migrate
// checkDBVersion verifies that the database schema version matches the expected version.
// This is called automatically during Connect() to ensure schema compatibility.
//
// Returns an error if:
// - Database version is older than expected (needs migration)
// - Database version is newer than expected (needs app upgrade)
// - Database is in a dirty state (failed migration)
//
// A "dirty" database indicates a migration was started but not completed successfully.
// This requires manual intervention to fix the database and force the version.
func checkDBVersion(db *sql.DB) error {
driver, err := sqlite3.WithInstance(db, &sqlite3.Config{})
if err != nil {
return err
}
d, err := iofs.New(migrationFiles, "migrations/sqlite3")
if err != nil {
return err
}
switch backend {
case "sqlite3":
driver, err := sqlite3.WithInstance(db, &sqlite3.Config{})
if err != nil {
return err
}
d, err := iofs.New(migrationFiles, "migrations/sqlite3")
if err != nil {
return err
}
m, err = migrate.NewWithInstance("iofs", d, "sqlite3", driver)
if err != nil {
return err
}
case "mysql":
driver, err := mysql.WithInstance(db, &mysql.Config{})
if err != nil {
return err
}
d, err := iofs.New(migrationFiles, "migrations/mysql")
if err != nil {
return err
}
m, err = migrate.NewWithInstance("iofs", d, "mysql", driver)
if err != nil {
return err
}
default:
cclog.Abortf("Migration: Unsupported database backend '%s'.\n", backend)
m, err := migrate.NewWithInstance("iofs", d, "sqlite3", driver)
if err != nil {
return err
}
v, dirty, err := m.Version()
@@ -80,37 +76,41 @@ func checkDBVersion(backend string, db *sql.DB) error {
return nil
}
func getMigrateInstance(backend string, db string) (m *migrate.Migrate, err error) {
switch backend {
case "sqlite3":
d, err := iofs.New(migrationFiles, "migrations/sqlite3")
if err != nil {
cclog.Fatal(err)
}
// getMigrateInstance creates a new migration instance for the given database file.
// This is used internally by MigrateDB, RevertDB, and ForceDB.
func getMigrateInstance(db string) (m *migrate.Migrate, err error) {
d, err := iofs.New(migrationFiles, "migrations/sqlite3")
if err != nil {
return nil, err
}
m, err = migrate.NewWithSourceInstance("iofs", d, fmt.Sprintf("sqlite3://%s?_foreign_keys=on", db))
if err != nil {
return m, err
}
case "mysql":
d, err := iofs.New(migrationFiles, "migrations/mysql")
if err != nil {
return m, err
}
m, err = migrate.NewWithSourceInstance("iofs", d, fmt.Sprintf("mysql://%s?multiStatements=true", db))
if err != nil {
return m, err
}
default:
cclog.Abortf("Migration: Unsupported database backend '%s'.\n", backend)
m, err = migrate.NewWithSourceInstance("iofs", d, fmt.Sprintf("sqlite3://%s?_foreign_keys=on", db))
if err != nil {
return nil, err
}
return m, nil
}
func MigrateDB(backend string, db string) error {
m, err := getMigrateInstance(backend, db)
// MigrateDB applies all pending database migrations to bring the schema up to date.
// This should be run with the -migrate-db flag before starting the application
// after upgrading to a new version that requires schema changes.
//
// Process:
// 1. Checks current database version
// 2. Applies all migrations from current version to target Version
// 3. Updates schema_migrations table to track applied migrations
//
// Important:
// - Always backup your database before running migrations
// - Migrations are irreversible without manual intervention
// - If a migration fails, the database is marked "dirty" and requires manual fix
//
// Usage:
//
// cc-backend -migrate-db
func MigrateDB(db string) error {
m, err := getMigrateInstance(db)
if err != nil {
return err
}
@@ -118,7 +118,7 @@ func MigrateDB(backend string, db string) error {
v, dirty, err := m.Version()
if err != nil {
if err == migrate.ErrNilVersion {
cclog.Warn("Legacy database without version or missing database file!")
cclog.Info("Legacy database without version or missing database file!")
} else {
return err
}
@@ -144,8 +144,19 @@ func MigrateDB(backend string, db string) error {
return nil
}
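MigrateDB is now sqlite-only and takes just the database path. A hedged sketch of the documented -migrate-db flow; the flag wiring and log calls are illustrative, and Connect is assumed to run checkDBVersion internally as its doc comment states:

func main() {
	dbPath := flag.String("db", "./var/job.db", "sqlite database file")
	migrateDB := flag.Bool("migrate-db", false, "apply pending migrations and exit")
	flag.Parse()

	if *migrateDB {
		if err := repository.MigrateDB(*dbPath); err != nil {
			log.Fatalf("migration failed: %v", err)
		}
		return
	}
	repository.Connect(*dbPath) // schema version is checked during Connect
}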
func RevertDB(backend string, db string) error {
m, err := getMigrateInstance(backend, db)
// RevertDB rolls back the database schema to the previous version (Version - 1).
// This is primarily used for testing or emergency rollback scenarios.
//
// Warning:
// - This may cause data loss if newer schema added columns/tables
// - Always backup before reverting
// - Not all migrations are safely reversible
//
// Usage:
//
// cc-backend -revert-db
func RevertDB(db string) error {
m, err := getMigrateInstance(db)
if err != nil {
return err
}
@@ -162,8 +173,23 @@ func RevertDB(backend string, db string) error {
return nil
}
func ForceDB(backend string, db string) error {
m, err := getMigrateInstance(backend, db)
// ForceDB forces the database schema version to the current Version without running migrations.
// This is only used to recover from failed migrations that left the database in a "dirty" state.
//
// When to use:
// - After manually fixing a failed migration
// - When you've manually applied schema changes and need to update the version marker
//
// Warning:
// - This does NOT apply any schema changes
// - Only use after manually verifying the schema is correct
// - Improper use can cause schema/version mismatch
//
// Usage:
//
// cc-backend -force-db
func ForceDB(db string) error {
m, err := getMigrateInstance(db)
if err != nil {
return err
}

View File

@@ -1,5 +0,0 @@
DROP TABLE IF EXISTS job;
DROP TABLE IF EXISTS tags;
DROP TABLE IF EXISTS jobtag;
DROP TABLE IF EXISTS configuration;
DROP TABLE IF EXISTS user;

View File

@@ -1,66 +0,0 @@
CREATE TABLE IF NOT EXISTS job (
id INTEGER AUTO_INCREMENT PRIMARY KEY ,
job_id BIGINT NOT NULL,
cluster VARCHAR(255) NOT NULL,
subcluster VARCHAR(255) NOT NULL,
start_time BIGINT NOT NULL, -- Unix timestamp
user VARCHAR(255) NOT NULL,
project VARCHAR(255) NOT NULL,
`partition` VARCHAR(255) NOT NULL,
array_job_id BIGINT NOT NULL,
duration INT NOT NULL DEFAULT 0,
walltime INT NOT NULL DEFAULT 0,
job_state VARCHAR(255) NOT NULL
CHECK(job_state IN ('running', 'completed', 'failed', 'cancelled',
'stopped', 'timeout', 'preempted', 'out_of_memory')),
meta_data TEXT, -- JSON
resources TEXT NOT NULL, -- JSON
num_nodes INT NOT NULL,
num_hwthreads INT NOT NULL,
num_acc INT NOT NULL,
smt TINYINT NOT NULL DEFAULT 1 CHECK(smt IN (0, 1 )),
exclusive TINYINT NOT NULL DEFAULT 1 CHECK(exclusive IN (0, 1, 2)),
monitoring_status TINYINT NOT NULL DEFAULT 1 CHECK(monitoring_status IN (0, 1, 2, 3)),
mem_used_max REAL NOT NULL DEFAULT 0.0,
flops_any_avg REAL NOT NULL DEFAULT 0.0,
mem_bw_avg REAL NOT NULL DEFAULT 0.0,
load_avg REAL NOT NULL DEFAULT 0.0,
net_bw_avg REAL NOT NULL DEFAULT 0.0,
net_data_vol_total REAL NOT NULL DEFAULT 0.0,
file_bw_avg REAL NOT NULL DEFAULT 0.0,
file_data_vol_total REAL NOT NULL DEFAULT 0.0,
UNIQUE (job_id, cluster, start_time)
);
CREATE TABLE IF NOT EXISTS tag (
id INTEGER PRIMARY KEY,
tag_type VARCHAR(255) NOT NULL,
tag_name VARCHAR(255) NOT NULL,
UNIQUE (tag_type, tag_name));
CREATE TABLE IF NOT EXISTS jobtag (
job_id INTEGER,
tag_id INTEGER,
PRIMARY KEY (job_id, tag_id),
FOREIGN KEY (job_id) REFERENCES job (id) ON DELETE CASCADE,
FOREIGN KEY (tag_id) REFERENCES tag (id) ON DELETE CASCADE);
CREATE TABLE IF NOT EXISTS user (
username varchar(255) PRIMARY KEY NOT NULL,
password varchar(255) DEFAULT NULL,
ldap tinyint NOT NULL DEFAULT 0, /* col called "ldap" for historic reasons, fills the "AuthSource" */
name varchar(255) DEFAULT NULL,
roles varchar(255) NOT NULL DEFAULT "[]",
email varchar(255) DEFAULT NULL);
CREATE TABLE IF NOT EXISTS configuration (
username varchar(255),
confkey varchar(255),
value varchar(255),
PRIMARY KEY (username, confkey),
FOREIGN KEY (username) REFERENCES user (username) ON DELETE CASCADE ON UPDATE NO ACTION);

View File

@@ -1,8 +0,0 @@
DROP INDEX IF EXISTS job_stats;
DROP INDEX IF EXISTS job_by_user;
DROP INDEX IF EXISTS job_by_starttime;
DROP INDEX IF EXISTS job_by_job_id;
DROP INDEX IF EXISTS job_list;
DROP INDEX IF EXISTS job_list_user;
DROP INDEX IF EXISTS job_list_users;
DROP INDEX IF EXISTS job_list_users_start;

View File

@@ -1,8 +0,0 @@
CREATE INDEX IF NOT EXISTS job_stats ON job (cluster,subcluster,user);
CREATE INDEX IF NOT EXISTS job_by_user ON job (user);
CREATE INDEX IF NOT EXISTS job_by_starttime ON job (start_time);
CREATE INDEX IF NOT EXISTS job_by_job_id ON job (job_id);
CREATE INDEX IF NOT EXISTS job_list ON job (cluster, job_state);
CREATE INDEX IF NOT EXISTS job_list_user ON job (user, cluster, job_state);
CREATE INDEX IF NOT EXISTS job_list_users ON job (user, job_state);
CREATE INDEX IF NOT EXISTS job_list_users_start ON job (start_time, user, job_state);

View File

@@ -1 +0,0 @@
ALTER TABLE user DROP COLUMN projects;

View File

@@ -1 +0,0 @@
ALTER TABLE user ADD COLUMN projects varchar(255) NOT NULL DEFAULT "[]";

View File

@@ -1,5 +0,0 @@
ALTER TABLE job
MODIFY `partition` VARCHAR(255) NOT NULL,
MODIFY array_job_id BIGINT NOT NULL,
MODIFY num_hwthreads INT NOT NULL,
MODIFY num_acc INT NOT NULL;

View File

@@ -1,5 +0,0 @@
ALTER TABLE job
MODIFY `partition` VARCHAR(255),
MODIFY array_job_id BIGINT,
MODIFY num_hwthreads INT,
MODIFY num_acc INT;

View File

@@ -1,2 +0,0 @@
ALTER TABLE tag DROP COLUMN insert_time;
ALTER TABLE jobtag DROP COLUMN insert_time;

View File

@@ -1,2 +0,0 @@
ALTER TABLE tag ADD COLUMN insert_time TIMESTAMP DEFAULT CURRENT_TIMESTAMP;
ALTER TABLE jobtag ADD COLUMN insert_time TIMESTAMP DEFAULT CURRENT_TIMESTAMP;

View File

@@ -1 +0,0 @@
ALTER TABLE configuration MODIFY value VARCHAR(255);

View File

@@ -1 +0,0 @@
ALTER TABLE configuration MODIFY value TEXT;

View File

@@ -1,3 +0,0 @@
SET FOREIGN_KEY_CHECKS = 0;
ALTER TABLE tag MODIFY id INTEGER;
SET FOREIGN_KEY_CHECKS = 1;

View File

@@ -1,3 +0,0 @@
SET FOREIGN_KEY_CHECKS = 0;
ALTER TABLE tag MODIFY id INTEGER AUTO_INCREMENT;
SET FOREIGN_KEY_CHECKS = 1;

View File

@@ -1,83 +0,0 @@
ALTER TABLE job DROP energy;
ALTER TABLE job DROP energy_footprint;
ALTER TABLE job ADD COLUMN flops_any_avg;
ALTER TABLE job ADD COLUMN mem_bw_avg;
ALTER TABLE job ADD COLUMN mem_used_max;
ALTER TABLE job ADD COLUMN load_avg;
ALTER TABLE job ADD COLUMN net_bw_avg;
ALTER TABLE job ADD COLUMN net_data_vol_total;
ALTER TABLE job ADD COLUMN file_bw_avg;
ALTER TABLE job ADD COLUMN file_data_vol_total;
UPDATE job SET flops_any_avg = json_extract(footprint, '$.flops_any_avg');
UPDATE job SET mem_bw_avg = json_extract(footprint, '$.mem_bw_avg');
UPDATE job SET mem_used_max = json_extract(footprint, '$.mem_used_max');
UPDATE job SET load_avg = json_extract(footprint, '$.cpu_load_avg');
UPDATE job SET net_bw_avg = json_extract(footprint, '$.net_bw_avg');
UPDATE job SET net_data_vol_total = json_extract(footprint, '$.net_data_vol_total');
UPDATE job SET file_bw_avg = json_extract(footprint, '$.file_bw_avg');
UPDATE job SET file_data_vol_total = json_extract(footprint, '$.file_data_vol_total');
ALTER TABLE job DROP footprint;
-- Do not use reserved keywords anymore
RENAME TABLE hpc_user TO `user`;
ALTER TABLE job RENAME COLUMN hpc_user TO `user`;
ALTER TABLE job RENAME COLUMN cluster_partition TO `partition`;
DROP INDEX IF EXISTS jobs_cluster;
DROP INDEX IF EXISTS jobs_cluster_user;
DROP INDEX IF EXISTS jobs_cluster_project;
DROP INDEX IF EXISTS jobs_cluster_subcluster;
DROP INDEX IF EXISTS jobs_cluster_starttime;
DROP INDEX IF EXISTS jobs_cluster_duration;
DROP INDEX IF EXISTS jobs_cluster_numnodes;
DROP INDEX IF EXISTS jobs_cluster_partition;
DROP INDEX IF EXISTS jobs_cluster_partition_starttime;
DROP INDEX IF EXISTS jobs_cluster_partition_duration;
DROP INDEX IF EXISTS jobs_cluster_partition_numnodes;
DROP INDEX IF EXISTS jobs_cluster_partition_jobstate;
DROP INDEX IF EXISTS jobs_cluster_partition_jobstate_user;
DROP INDEX IF EXISTS jobs_cluster_partition_jobstate_project;
DROP INDEX IF EXISTS jobs_cluster_partition_jobstate_starttime;
DROP INDEX IF EXISTS jobs_cluster_partition_jobstate_duration;
DROP INDEX IF EXISTS jobs_cluster_partition_jobstate_numnodes;
DROP INDEX IF EXISTS jobs_cluster_jobstate;
DROP INDEX IF EXISTS jobs_cluster_jobstate_user;
DROP INDEX IF EXISTS jobs_cluster_jobstate_project;
DROP INDEX IF EXISTS jobs_cluster_jobstate_starttime;
DROP INDEX IF EXISTS jobs_cluster_jobstate_duration;
DROP INDEX IF EXISTS jobs_cluster_jobstate_numnodes;
DROP INDEX IF EXISTS jobs_user;
DROP INDEX IF EXISTS jobs_user_starttime;
DROP INDEX IF EXISTS jobs_user_duration;
DROP INDEX IF EXISTS jobs_user_numnodes;
DROP INDEX IF EXISTS jobs_project;
DROP INDEX IF EXISTS jobs_project_user;
DROP INDEX IF EXISTS jobs_project_starttime;
DROP INDEX IF EXISTS jobs_project_duration;
DROP INDEX IF EXISTS jobs_project_numnodes;
DROP INDEX IF EXISTS jobs_jobstate;
DROP INDEX IF EXISTS jobs_jobstate_user;
DROP INDEX IF EXISTS jobs_jobstate_project;
DROP INDEX IF EXISTS jobs_jobstate_starttime;
DROP INDEX IF EXISTS jobs_jobstate_duration;
DROP INDEX IF EXISTS jobs_jobstate_numnodes;
DROP INDEX IF EXISTS jobs_arrayjobid_starttime;
DROP INDEX IF EXISTS jobs_cluster_arrayjobid_starttime;
DROP INDEX IF EXISTS jobs_starttime;
DROP INDEX IF EXISTS jobs_duration;
DROP INDEX IF EXISTS jobs_numnodes;
DROP INDEX IF EXISTS jobs_duration_starttime;
DROP INDEX IF EXISTS jobs_numnodes_starttime;
DROP INDEX IF EXISTS jobs_numacc_starttime;
DROP INDEX IF EXISTS jobs_energy_starttime;

View File

@@ -1,123 +0,0 @@
DROP INDEX IF EXISTS job_stats ON job;
DROP INDEX IF EXISTS job_by_user ON job;
DROP INDEX IF EXISTS job_by_starttime ON job;
DROP INDEX IF EXISTS job_by_job_id ON job;
DROP INDEX IF EXISTS job_list ON job;
DROP INDEX IF EXISTS job_list_user ON job;
DROP INDEX IF EXISTS job_list_users ON job;
DROP INDEX IF EXISTS job_list_users_start ON job;
ALTER TABLE job ADD COLUMN energy REAL NOT NULL DEFAULT 0.0;
ALTER TABLE job ADD COLUMN energy_footprint JSON;
ALTER TABLE job ADD COLUMN footprint JSON;
ALTER TABLE tag ADD COLUMN tag_scope TEXT NOT NULL DEFAULT 'global';
-- Do not use reserved keywords anymore
RENAME TABLE `user` TO hpc_user;
ALTER TABLE job RENAME COLUMN `user` TO hpc_user;
ALTER TABLE job RENAME COLUMN `partition` TO cluster_partition;
ALTER TABLE job MODIFY COLUMN cluster VARCHAR(50);
ALTER TABLE job MODIFY COLUMN hpc_user VARCHAR(50);
ALTER TABLE job MODIFY COLUMN subcluster VARCHAR(50);
ALTER TABLE job MODIFY COLUMN project VARCHAR(50);
ALTER TABLE job MODIFY COLUMN cluster_partition VARCHAR(50);
ALTER TABLE job MODIFY COLUMN job_state VARCHAR(25);
UPDATE job SET footprint = '{"flops_any_avg": 0.0}';
UPDATE job SET footprint = json_replace(footprint, '$.flops_any_avg', job.flops_any_avg);
UPDATE job SET footprint = json_insert(footprint, '$.mem_bw_avg', job.mem_bw_avg);
UPDATE job SET footprint = json_insert(footprint, '$.mem_used_max', job.mem_used_max);
UPDATE job SET footprint = json_insert(footprint, '$.cpu_load_avg', job.load_avg);
UPDATE job SET footprint = json_insert(footprint, '$.net_bw_avg', job.net_bw_avg) WHERE job.net_bw_avg != 0;
UPDATE job SET footprint = json_insert(footprint, '$.net_data_vol_total', job.net_data_vol_total) WHERE job.net_data_vol_total != 0;
UPDATE job SET footprint = json_insert(footprint, '$.file_bw_avg', job.file_bw_avg) WHERE job.file_bw_avg != 0;
UPDATE job SET footprint = json_insert(footprint, '$.file_data_vol_total', job.file_data_vol_total) WHERE job.file_data_vol_total != 0;
ALTER TABLE job DROP flops_any_avg;
ALTER TABLE job DROP mem_bw_avg;
ALTER TABLE job DROP mem_used_max;
ALTER TABLE job DROP load_avg;
ALTER TABLE job DROP net_bw_avg;
ALTER TABLE job DROP net_data_vol_total;
ALTER TABLE job DROP file_bw_avg;
ALTER TABLE job DROP file_data_vol_total;
-- Indices for: Single filters, combined filters, sorting, sorting with filters
-- Cluster Filter
CREATE INDEX IF NOT EXISTS jobs_cluster ON job (cluster);
CREATE INDEX IF NOT EXISTS jobs_cluster_user ON job (cluster, hpc_user);
CREATE INDEX IF NOT EXISTS jobs_cluster_project ON job (cluster, project);
CREATE INDEX IF NOT EXISTS jobs_cluster_subcluster ON job (cluster, subcluster);
-- Cluster Filter Sorting
CREATE INDEX IF NOT EXISTS jobs_cluster_starttime ON job (cluster, start_time);
CREATE INDEX IF NOT EXISTS jobs_cluster_duration ON job (cluster, duration);
CREATE INDEX IF NOT EXISTS jobs_cluster_numnodes ON job (cluster, num_nodes);
-- Cluster+Partition Filter
CREATE INDEX IF NOT EXISTS jobs_cluster_partition ON job (cluster, cluster_partition);
-- Cluster+Partition Filter Sorting
CREATE INDEX IF NOT EXISTS jobs_cluster_partition_starttime ON job (cluster, cluster_partition, start_time);
CREATE INDEX IF NOT EXISTS jobs_cluster_partition_duration ON job (cluster, cluster_partition, duration);
CREATE INDEX IF NOT EXISTS jobs_cluster_partition_numnodes ON job (cluster, cluster_partition, num_nodes);
-- Cluster+Partition+Jobstate Filter
CREATE INDEX IF NOT EXISTS jobs_cluster_partition_jobstate ON job (cluster, cluster_partition, job_state);
CREATE INDEX IF NOT EXISTS jobs_cluster_partition_jobstate_user ON job (cluster, cluster_partition, job_state, hpc_user);
CREATE INDEX IF NOT EXISTS jobs_cluster_partition_jobstate_project ON job (cluster, cluster_partition, job_state, project);
-- Cluster+Partition+Jobstate Filter Sorting
CREATE INDEX IF NOT EXISTS jobs_cluster_partition_jobstate_starttime ON job (cluster, cluster_partition, job_state, start_time);
CREATE INDEX IF NOT EXISTS jobs_cluster_partition_jobstate_duration ON job (cluster, cluster_partition, job_state, duration);
CREATE INDEX IF NOT EXISTS jobs_cluster_partition_jobstate_numnodes ON job (cluster, cluster_partition, job_state, num_nodes);
-- Cluster+JobState Filter
CREATE INDEX IF NOT EXISTS jobs_cluster_jobstate ON job (cluster, job_state);
CREATE INDEX IF NOT EXISTS jobs_cluster_jobstate_user ON job (cluster, job_state, hpc_user);
CREATE INDEX IF NOT EXISTS jobs_cluster_jobstate_project ON job (cluster, job_state, project);
-- Cluster+JobState Filter Sorting
CREATE INDEX IF NOT EXISTS jobs_cluster_jobstate_starttime ON job (cluster, job_state, start_time);
CREATE INDEX IF NOT EXISTS jobs_cluster_jobstate_duration ON job (cluster, job_state, duration);
CREATE INDEX IF NOT EXISTS jobs_cluster_jobstate_numnodes ON job (cluster, job_state, num_nodes);
-- User Filter
CREATE INDEX IF NOT EXISTS jobs_user ON job (hpc_user);
-- User Filter Sorting
CREATE INDEX IF NOT EXISTS jobs_user_starttime ON job (hpc_user, start_time);
CREATE INDEX IF NOT EXISTS jobs_user_duration ON job (hpc_user, duration);
CREATE INDEX IF NOT EXISTS jobs_user_numnodes ON job (hpc_user, num_nodes);
-- Project Filter
CREATE INDEX IF NOT EXISTS jobs_project ON job (project);
CREATE INDEX IF NOT EXISTS jobs_project_user ON job (project, hpc_user);
-- Project Filter Sorting
CREATE INDEX IF NOT EXISTS jobs_project_starttime ON job (project, start_time);
CREATE INDEX IF NOT EXISTS jobs_project_duration ON job (project, duration);
CREATE INDEX IF NOT EXISTS jobs_project_numnodes ON job (project, num_nodes);
-- JobState Filter
CREATE INDEX IF NOT EXISTS jobs_jobstate ON job (job_state);
CREATE INDEX IF NOT EXISTS jobs_jobstate_user ON job (job_state, hpc_user);
CREATE INDEX IF NOT EXISTS jobs_jobstate_project ON job (job_state, project);
CREATE INDEX IF NOT EXISTS jobs_jobstate_cluster ON job (job_state, cluster);
-- JobState Filter Sorting
CREATE INDEX IF NOT EXISTS jobs_jobstate_starttime ON job (job_state, start_time);
CREATE INDEX IF NOT EXISTS jobs_jobstate_duration ON job (job_state, duration);
CREATE INDEX IF NOT EXISTS jobs_jobstate_numnodes ON job (job_state, num_nodes);
-- ArrayJob Filter
CREATE INDEX IF NOT EXISTS jobs_arrayjobid_starttime ON job (array_job_id, start_time);
CREATE INDEX IF NOT EXISTS jobs_cluster_arrayjobid_starttime ON job (cluster, array_job_id, start_time);
-- Sorting without active filters
CREATE INDEX IF NOT EXISTS jobs_starttime ON job (start_time);
CREATE INDEX IF NOT EXISTS jobs_duration ON job (duration);
CREATE INDEX IF NOT EXISTS jobs_numnodes ON job (num_nodes);
-- Single filters with default starttime sorting
CREATE INDEX IF NOT EXISTS jobs_duration_starttime ON job (duration, start_time);
CREATE INDEX IF NOT EXISTS jobs_numnodes_starttime ON job (num_nodes, start_time);
CREATE INDEX IF NOT EXISTS jobs_numacc_starttime ON job (num_acc, start_time);
CREATE INDEX IF NOT EXISTS jobs_energy_starttime ON job (energy, start_time);
-- Optimize DB index usage

View File

@@ -118,104 +118,116 @@ DROP TABLE lookup_exclusive;
DROP TABLE job; -- Deletes All Existing 'job' Indices; Recreate after Renaming
ALTER TABLE job_new RENAME TO job;
-- Recreate Indices from 08_add-footprint, include new submit_time indices
-- Recreate Indices from 08_add-footprint; include new 'shared' column
-- Cluster Filter
CREATE INDEX IF NOT EXISTS jobs_cluster ON job (cluster);
CREATE INDEX IF NOT EXISTS jobs_cluster_user ON job (cluster, hpc_user);
CREATE INDEX IF NOT EXISTS jobs_cluster_project ON job (cluster, project);
CREATE INDEX IF NOT EXISTS jobs_cluster_subcluster ON job (cluster, subcluster);
-- Cluster Filter Sorting
CREATE INDEX IF NOT EXISTS jobs_cluster_starttime ON job (cluster, start_time);
CREATE INDEX IF NOT EXISTS jobs_cluster_submittime ON job (cluster, submit_time);
CREATE INDEX IF NOT EXISTS jobs_cluster_duration ON job (cluster, duration);
CREATE INDEX IF NOT EXISTS jobs_cluster_numnodes ON job (cluster, num_nodes);
CREATE INDEX IF NOT EXISTS jobs_cluster_numhwthreads ON job (cluster, num_hwthreads);
CREATE INDEX IF NOT EXISTS jobs_cluster_numacc ON job (cluster, num_acc);
CREATE INDEX IF NOT EXISTS jobs_cluster_energy ON job (cluster, energy);
-- Cluster Time Filter Sorting
CREATE INDEX IF NOT EXISTS jobs_cluster_duration_starttime ON job (cluster, duration, start_time);
CREATE INDEX IF NOT EXISTS jobs_cluster_starttime_duration ON job (cluster, start_time, duration);
-- Cluster+Partition Filter
CREATE INDEX IF NOT EXISTS jobs_cluster_partition ON job (cluster, cluster_partition);
CREATE INDEX IF NOT EXISTS jobs_cluster_partition_user ON job (cluster, cluster_partition, hpc_user);
CREATE INDEX IF NOT EXISTS jobs_cluster_partition_project ON job (cluster, cluster_partition, project);
CREATE INDEX IF NOT EXISTS jobs_cluster_partition_jobstate ON job (cluster, cluster_partition, job_state);
CREATE INDEX IF NOT EXISTS jobs_cluster_partition_shared ON job (cluster, cluster_partition, shared);
-- Cluster+Partition Filter Sorting
CREATE INDEX IF NOT EXISTS jobs_cluster_partition_starttime ON job (cluster, cluster_partition, start_time);
CREATE INDEX IF NOT EXISTS jobs_cluster_partition_submittime ON job (cluster, cluster_partition, submit_time);
CREATE INDEX IF NOT EXISTS jobs_cluster_partition_duration ON job (cluster, cluster_partition, duration);
CREATE INDEX IF NOT EXISTS jobs_cluster_partition_numnodes ON job (cluster, cluster_partition, num_nodes);
CREATE INDEX IF NOT EXISTS jobs_cluster_partition_numhwthreads ON job (cluster, cluster_partition, num_hwthreads);
CREATE INDEX IF NOT EXISTS jobs_cluster_partition_numacc ON job (cluster, cluster_partition, num_acc);
CREATE INDEX IF NOT EXISTS jobs_cluster_partition_energy ON job (cluster, cluster_partition, energy);
-- Cluster+Partition+Jobstate Filter
CREATE INDEX IF NOT EXISTS jobs_cluster_partition_jobstate ON job (cluster, cluster_partition, job_state);
CREATE INDEX IF NOT EXISTS jobs_cluster_partition_jobstate_user ON job (cluster, cluster_partition, job_state, hpc_user);
CREATE INDEX IF NOT EXISTS jobs_cluster_partition_jobstate_project ON job (cluster, cluster_partition, job_state, project);
-- Cluster+Partition+Jobstate Filter Sorting
CREATE INDEX IF NOT EXISTS jobs_cluster_partition_jobstate_starttime ON job (cluster, cluster_partition, job_state, start_time);
CREATE INDEX IF NOT EXISTS jobs_cluster_partition_jobstate_submittime ON job (cluster, cluster_partition, job_state, submit_time);
CREATE INDEX IF NOT EXISTS jobs_cluster_partition_jobstate_duration ON job (cluster, cluster_partition, job_state, duration);
CREATE INDEX IF NOT EXISTS jobs_cluster_partition_jobstate_numnodes ON job (cluster, cluster_partition, job_state, num_nodes);
CREATE INDEX IF NOT EXISTS jobs_cluster_partition_jobstate_numhwthreads ON job (cluster, cluster_partition, job_state, num_hwthreads);
CREATE INDEX IF NOT EXISTS jobs_cluster_partition_jobstate_numacc ON job (cluster, cluster_partition, job_state, num_acc);
CREATE INDEX IF NOT EXISTS jobs_cluster_partition_jobstate_energy ON job (cluster, cluster_partition, job_state, energy);
-- Cluster+Partition Time Filter Sorting
CREATE INDEX IF NOT EXISTS jobs_cluster_partition_duration_starttime ON job (cluster, cluster_partition, duration, start_time);
CREATE INDEX IF NOT EXISTS jobs_cluster_partition_starttime_duration ON job (cluster, cluster_partition, start_time, duration);
-- Cluster+JobState Filter
CREATE INDEX IF NOT EXISTS jobs_cluster_jobstate ON job (cluster, job_state);
CREATE INDEX IF NOT EXISTS jobs_cluster_jobstate_user ON job (cluster, job_state, hpc_user);
CREATE INDEX IF NOT EXISTS jobs_cluster_jobstate_project ON job (cluster, job_state, project);
-- Cluster+JobState Filter Sorting
CREATE INDEX IF NOT EXISTS jobs_cluster_jobstate_starttime ON job (cluster, job_state, start_time);
CREATE INDEX IF NOT EXISTS jobs_cluster_jobstate_submittime ON job (cluster, job_state, submit_time);
CREATE INDEX IF NOT EXISTS jobs_cluster_jobstate_duration ON job (cluster, job_state, duration);
CREATE INDEX IF NOT EXISTS jobs_cluster_jobstate_numnodes ON job (cluster, job_state, num_nodes);
CREATE INDEX IF NOT EXISTS jobs_cluster_jobstate_numhwthreads ON job (cluster, job_state, num_hwthreads);
CREATE INDEX IF NOT EXISTS jobs_cluster_jobstate_numacc ON job (cluster, job_state, num_acc);
CREATE INDEX IF NOT EXISTS jobs_cluster_jobstate_energy ON job (cluster, job_state, energy);
-- Cluster+JobState Time Filter Sorting
CREATE INDEX IF NOT EXISTS jobs_cluster_jobstate_starttime_duration ON job (cluster, job_state, start_time, duration);
CREATE INDEX IF NOT EXISTS jobs_cluster_jobstate_duration_starttime ON job (cluster, job_state, duration, start_time);
-- Cluster+Shared Filter
CREATE INDEX IF NOT EXISTS jobs_cluster_shared_user ON job (cluster, shared, hpc_user);
CREATE INDEX IF NOT EXISTS jobs_cluster_shared_project ON job (cluster, shared, project);
-- Cluster+Shared Filter Sorting
CREATE INDEX IF NOT EXISTS jobs_cluster_shared_numnodes ON job (cluster, shared, num_nodes);
CREATE INDEX IF NOT EXISTS jobs_cluster_shared_numhwthreads ON job (cluster, shared, num_hwthreads);
CREATE INDEX IF NOT EXISTS jobs_cluster_shared_numacc ON job (cluster, shared, num_acc);
CREATE INDEX IF NOT EXISTS jobs_cluster_shared_energy ON job (cluster, shared, energy);
-- Cluster+Shared Time Filter Sorting
CREATE INDEX IF NOT EXISTS jobs_cluster_shared_starttime_duration ON job (cluster, shared, start_time, duration);
CREATE INDEX IF NOT EXISTS jobs_cluster_shared_duration_starttime ON job (cluster, shared, duration, start_time);
-- User Filter
CREATE INDEX IF NOT EXISTS jobs_user ON job (hpc_user);
-- User Filter Sorting
CREATE INDEX IF NOT EXISTS jobs_user_starttime ON job (hpc_user, start_time);
CREATE INDEX IF NOT EXISTS jobs_user_duration ON job (hpc_user, duration);
CREATE INDEX IF NOT EXISTS jobs_user_numnodes ON job (hpc_user, num_nodes);
CREATE INDEX IF NOT EXISTS jobs_user_numhwthreads ON job (hpc_user, num_hwthreads);
CREATE INDEX IF NOT EXISTS jobs_user_numacc ON job (hpc_user, num_acc);
CREATE INDEX IF NOT EXISTS jobs_user_energy ON job (hpc_user, energy);
-- User Time Filter Sorting
CREATE INDEX IF NOT EXISTS jobs_user_starttime_duration ON job (hpc_user, start_time, duration);
CREATE INDEX IF NOT EXISTS jobs_user_duration_starttime ON job (hpc_user, duration, start_time);
-- Project Filter
CREATE INDEX IF NOT EXISTS jobs_project ON job (project);
CREATE INDEX IF NOT EXISTS jobs_project_user ON job (project, hpc_user);
-- Project Filter Sorting
CREATE INDEX IF NOT EXISTS jobs_project_starttime ON job (project, start_time);
CREATE INDEX IF NOT EXISTS jobs_project_duration ON job (project, duration);
CREATE INDEX IF NOT EXISTS jobs_project_numnodes ON job (project, num_nodes);
CREATE INDEX IF NOT EXISTS jobs_project_numhwthreads ON job (project, num_hwthreads);
CREATE INDEX IF NOT EXISTS jobs_project_numacc ON job (project, num_acc);
CREATE INDEX IF NOT EXISTS jobs_project_energy ON job (project, energy);
-- Project Time Filter Sorting
CREATE INDEX IF NOT EXISTS jobs_project_starttime_duration ON job (project, start_time, duration);
CREATE INDEX IF NOT EXISTS jobs_project_duration_starttime ON job (project, duration, start_time);
-- JobState Filter
CREATE INDEX IF NOT EXISTS jobs_jobstate ON job (job_state);
CREATE INDEX IF NOT EXISTS jobs_jobstate_user ON job (job_state, hpc_user);
CREATE INDEX IF NOT EXISTS jobs_jobstate_project ON job (job_state, project);
CREATE INDEX IF NOT EXISTS jobs_jobstate_cluster ON job (job_state, cluster);
-- JobState Filter Sorting
CREATE INDEX IF NOT EXISTS jobs_jobstate_starttime ON job (job_state, start_time);
CREATE INDEX IF NOT EXISTS jobs_jobstate_duration ON job (job_state, duration);
CREATE INDEX IF NOT EXISTS jobs_jobstate_numnodes ON job (job_state, num_nodes);
CREATE INDEX IF NOT EXISTS jobs_jobstate_numhwthreads ON job (job_state, num_hwthreads);
CREATE INDEX IF NOT EXISTS jobs_jobstate_numacc ON job (job_state, num_acc);
CREATE INDEX IF NOT EXISTS jobs_jobstate_energy ON job (job_state, energy);
-- JobState Time Filter Sorting
CREATE INDEX IF NOT EXISTS jobs_jobstate_starttime_duration ON job (job_state, start_time, duration);
CREATE INDEX IF NOT EXISTS jobs_jobstate_duration_starttime ON job (job_state, duration, start_time);
-- Shared Filter
CREATE INDEX IF NOT EXISTS jobs_shared_user ON job (shared, hpc_user);
CREATE INDEX IF NOT EXISTS jobs_shared_project ON job (shared, project);
-- Shared Filter Sorting
CREATE INDEX IF NOT EXISTS jobs_shared_numnodes ON job (shared, num_nodes);
CREATE INDEX IF NOT EXISTS jobs_shared_numhwthreads ON job (shared, num_hwthreads);
CREATE INDEX IF NOT EXISTS jobs_shared_numacc ON job (shared, num_acc);
CREATE INDEX IF NOT EXISTS jobs_shared_energy ON job (shared, energy);
-- Shared Time Filter Sorting
CREATE INDEX IF NOT EXISTS jobs_shared_starttime_duration ON job (shared, start_time, duration);
CREATE INDEX IF NOT EXISTS jobs_shared_duration_starttime ON job (shared, duration, start_time);
-- ArrayJob Filter
CREATE INDEX IF NOT EXISTS jobs_arrayjobid_starttime ON job (array_job_id, start_time);
CREATE INDEX IF NOT EXISTS jobs_cluster_arrayjobid_starttime ON job (cluster, array_job_id, start_time);
-- Sorting without active filters
CREATE INDEX IF NOT EXISTS jobs_starttime ON job (start_time);
CREATE INDEX IF NOT EXISTS jobs_duration ON job (duration);
CREATE INDEX IF NOT EXISTS jobs_numnodes ON job (num_nodes);
CREATE INDEX IF NOT EXISTS jobs_numhwthreads ON job (num_hwthreads);
CREATE INDEX IF NOT EXISTS jobs_numacc ON job (num_acc);
CREATE INDEX IF NOT EXISTS jobs_energy ON job (energy);
-- Single filters with default starttime sorting
CREATE INDEX IF NOT EXISTS jobs_duration_starttime ON job (duration, start_time);
CREATE INDEX IF NOT EXISTS jobs_numnodes_starttime ON job (num_nodes, start_time);
@@ -223,6 +235,22 @@ CREATE INDEX IF NOT EXISTS jobs_numhwthreads_starttime ON job (num_hwthreads, st
CREATE INDEX IF NOT EXISTS jobs_numacc_starttime ON job (num_acc, start_time);
CREATE INDEX IF NOT EXISTS jobs_energy_starttime ON job (energy, start_time);
-- Single filters with duration sorting
CREATE INDEX IF NOT EXISTS jobs_starttime_duration ON job (start_time, duration);
CREATE INDEX IF NOT EXISTS jobs_numnodes_duration ON job (num_nodes, duration);
CREATE INDEX IF NOT EXISTS jobs_numhwthreads_duration ON job (num_hwthreads, duration);
CREATE INDEX IF NOT EXISTS jobs_numacc_duration ON job (num_acc, duration);
CREATE INDEX IF NOT EXISTS jobs_energy_duration ON job (energy, duration);
-- Backup Indices For High Variety Columns
CREATE INDEX IF NOT EXISTS jobs_starttime ON job (start_time);
CREATE INDEX IF NOT EXISTS jobs_duration ON job (duration);
-- Notes:
-- Cluster+Partition+Jobstate Filter: Tested -> Full Array Of Combinations Not Required
-- Cluster+JobState+Shared Filter: Tested -> No further timing improvement
-- JobState+Shared Filter: Tested -> No further timing improvement
-- Optimize DB index usage
PRAGMA optimize;
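As a quick sanity check outside the migration itself, SQLite's EXPLAIN QUERY PLAN shows whether a filtered and sorted job query actually uses one of the composite indices above; the cluster and state values below are illustrative:
-- Illustrative check, not part of the migration: if the planner picks the
-- composite index, the plan reports
-- SEARCH job USING INDEX jobs_cluster_jobstate_starttime instead of SCAN job.
EXPLAIN QUERY PLAN
SELECT id FROM job
WHERE cluster = 'fritz' AND job_state = 'completed'
ORDER BY start_time DESC;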

View File

@@ -23,6 +23,7 @@ CREATE TABLE "node_state" (
CHECK (health_state IN (
'full', 'partial', 'failed'
)),
health_metrics TEXT, -- JSON array of strings
node_id INTEGER,
FOREIGN KEY (node_id) REFERENCES node (id)
);
@@ -33,12 +34,11 @@ CREATE INDEX IF NOT EXISTS nodes_cluster_subcluster ON node (cluster, subcluster
-- Add NEW Indices For New Node_State Table Fields
CREATE INDEX IF NOT EXISTS nodestates_timestamp ON node_state (time_stamp);
CREATE INDEX IF NOT EXISTS nodestates_state ON node_state (node_state);
CREATE INDEX IF NOT EXISTS nodestates_health ON node_state (health_state);
CREATE INDEX IF NOT EXISTS nodestates_state_timestamp ON node_state (node_state, time_stamp);
CREATE INDEX IF NOT EXISTS nodestates_health_timestamp ON node_state (health_state, time_stamp);
CREATE INDEX IF NOT EXISTS nodestates_nodeid_state ON node_state (node_id, node_state);
CREATE INDEX IF NOT EXISTS nodestates_nodeid_health ON node_state (node_id, health_state);
CREATE INDEX IF NOT EXISTS nodestates_nodeid_timestamp ON node_state (node_id, time_stamp DESC);
-- Add NEW Indices For Increased Amounts of Tags
CREATE INDEX IF NOT EXISTS tags_jobid ON jobtag (job_id);

View File

@@ -10,14 +10,17 @@ import (
"database/sql"
"encoding/json"
"fmt"
"slices"
"sort"
"strings"
"sync"
"time"
"github.com/ClusterCockpit/cc-backend/internal/graph/model"
"github.com/ClusterCockpit/cc-backend/pkg/archive"
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
"github.com/ClusterCockpit/cc-lib/lrucache"
"github.com/ClusterCockpit/cc-lib/schema"
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
"github.com/ClusterCockpit/cc-lib/v2/lrucache"
"github.com/ClusterCockpit/cc-lib/v2/schema"
sq "github.com/Masterminds/squirrel"
"github.com/jmoiron/sqlx"
)
@@ -49,6 +52,38 @@ func GetNodeRepository() *NodeRepository {
return nodeRepoInstance
}
// latestStateCondition returns a squirrel expression that restricts node_state
// rows to the latest per node_id using a correlated subquery.
// Requires the query to join node and node_state tables.
func latestStateCondition() sq.Sqlizer {
return sq.Expr(
"node_state.id = (SELECT ns2.id FROM node_state ns2 WHERE ns2.node_id = node.id ORDER BY ns2.time_stamp DESC LIMIT 1)",
)
}
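Since the condition is a raw expression, it composes with any builder that joins the two tables; a small illustrative sketch (not from this diff) of inspecting the SQL it produces:
// Illustrative only: build a latest-state query and inspect the generated SQL.
q := sq.Select("node.hostname", "node_state.node_state").
From("node").
Join("node_state ON node_state.node_id = node.id").
Where(latestStateCondition())
sqlStr, args, err := q.ToSql()
// sqlStr contains the correlated subquery:
//   WHERE node_state.id = (SELECT ns2.id FROM node_state ns2
//     WHERE ns2.node_id = node.id ORDER BY ns2.time_stamp DESC LIMIT 1)
_, _, _ = sqlStr, args, err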
// applyNodeFilters applies common NodeFilter conditions to a query that joins
// the node and node_state tables with latestStateCondition.
func applyNodeFilters(query sq.SelectBuilder, filters []*model.NodeFilter) sq.SelectBuilder {
for _, f := range filters {
if f.Cluster != nil {
query = buildStringCondition("node.cluster", f.Cluster, query)
}
if f.SubCluster != nil {
query = buildStringCondition("node.subcluster", f.SubCluster, query)
}
if f.Hostname != nil {
query = buildStringCondition("node.hostname", f.Hostname, query)
}
if f.SchedulerState != nil {
query = query.Where("node_state.node_state = ?", f.SchedulerState)
}
if f.HealthState != nil {
query = query.Where("node_state.health_state = ?", f.HealthState)
}
}
return query
}
func (r *NodeRepository) FetchMetadata(hostname string, cluster string) (map[string]string, error) {
start := time.Now()
@@ -79,17 +114,16 @@ func (r *NodeRepository) FetchMetadata(hostname string, cluster string) (map[str
func (r *NodeRepository) GetNode(hostname string, cluster string, withMeta bool) (*schema.Node, error) {
node := &schema.Node{}
var timestamp int
if err := sq.Select("node.hostname", "node.cluster", "node.subcluster", "node_state.node_state",
"node_state.health_state", "MAX(node_state.time_stamp) as time").
From("node_state").
Join("node ON node_state.node_id = node.id").
if err := sq.Select("node.hostname", "node.cluster", "node.subcluster",
"node_state.node_state", "node_state.health_state").
From("node").
Join("node_state ON node_state.node_id = node.id").
Where(latestStateCondition()).
Where("node.hostname = ?", hostname).
Where("node.cluster = ?", cluster).
GroupBy("node_state.node_id").
RunWith(r.DB).
QueryRow().Scan(&node.Hostname, &node.Cluster, &node.SubCluster, &node.NodeState, &node.HealthState, &timestamp); err != nil {
cclog.Warnf("Error while querying node '%s' at time '%d' from database: %v", hostname, timestamp, err)
QueryRow().Scan(&node.Hostname, &node.Cluster, &node.SubCluster, &node.NodeState, &node.HealthState); err != nil {
cclog.Warnf("Error while querying node '%s' from database: %v", hostname, err)
return nil, err
}
@@ -106,31 +140,28 @@ func (r *NodeRepository) GetNode(hostname string, cluster string, withMeta bool)
return node, nil
}
func (r *NodeRepository) GetNodeById(id int64, withMeta bool) (*schema.Node, error) {
func (r *NodeRepository) GetNodeByID(id int64, withMeta bool) (*schema.Node, error) {
node := &schema.Node{}
var timestamp int
if err := sq.Select("node.hostname", "node.cluster", "node.subcluster", "node_state.node_state",
"node_state.health_state", "MAX(node_state.time_stamp) as time").
From("node_state").
Join("node ON node_state.node_id = node.id").
if err := sq.Select("node.hostname", "node.cluster", "node.subcluster",
"node_state.node_state", "node_state.health_state").
From("node").
Join("node_state ON node_state.node_id = node.id").
Where(latestStateCondition()).
Where("node.id = ?", id).
GroupBy("node_state.node_id").
RunWith(r.DB).
QueryRow().Scan(&node.Hostname, &node.Cluster, &node.SubCluster, &node.NodeState, &node.HealthState, &timestamp); err != nil {
cclog.Warnf("Error while querying node ID '%d' at time '%d' from database: %v", id, timestamp, err)
QueryRow().Scan(&node.Hostname, &node.Cluster, &node.SubCluster, &node.NodeState, &node.HealthState); err != nil {
cclog.Warnf("Error while querying node ID '%d' from database: %v", id, err)
return nil, err
}
// NEEDS METADATA BY ID
// if withMeta {
// var err error
// var meta map[string]string
// if meta, err = r.FetchMetadata(hostname, cluster); err != nil {
// cclog.Warnf("Error while fetching metadata for node '%s'", hostname)
// return nil, err
// }
// node.MetaData = meta
// }
if withMeta {
meta, metaErr := r.FetchMetadata(node.Hostname, node.Cluster)
if metaErr != nil {
cclog.Warnf("Error while fetching metadata for node ID '%d': %v", id, metaErr)
return nil, metaErr
}
node.MetaData = meta
}
return node, nil
}
@@ -166,9 +197,10 @@ func (r *NodeRepository) AddNode(node *schema.NodeDB) (int64, error) {
}
const NamedNodeStateInsert string = `
INSERT INTO node_state (time_stamp, node_state, health_state, cpus_allocated,
memory_allocated, gpus_allocated, jobs_running, node_id)
VALUES (:time_stamp, :node_state, :health_state, :cpus_allocated, :memory_allocated, :gpus_allocated, :jobs_running, :node_id);`
INSERT INTO node_state (time_stamp, node_state, health_state, health_metrics,
cpus_allocated, memory_allocated, gpus_allocated, jobs_running, node_id)
VALUES (:time_stamp, :node_state, :health_state, :health_metrics,
:cpus_allocated, :memory_allocated, :gpus_allocated, :jobs_running, :node_id);`
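Because the statement uses named parameters, it pairs with sqlx's NamedExec; a minimal sketch of a call site, assuming NodeStateDB's db struct tags match the named columns (including a node_id-tagged field, which this diff does not show):
// Minimal sketch, not the repository's actual call site.
func insertNodeState(db *sqlx.DB, ns *schema.NodeStateDB) error {
if _, err := db.NamedExec(NamedNodeStateInsert, ns); err != nil {
return fmt.Errorf("inserting node state: %w", err)
}
return nil
}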
// TODO: Add real Monitoring Health State
@@ -194,8 +226,7 @@ func (r *NodeRepository) UpdateNodeState(hostname string, cluster string, nodeSt
return err
}
cclog.Infof("Added node '%s' to database", hostname)
return nil
cclog.Debugf("Added node '%s' to database", hostname)
} else {
cclog.Warnf("Error while querying node '%v' from database", id)
return err
@@ -209,7 +240,7 @@ func (r *NodeRepository) UpdateNodeState(hostname string, cluster string, nodeSt
cclog.Errorf("Error while adding node state for '%v' to database", hostname)
return err
}
cclog.Infof("Updated node state for '%s' in database", hostname)
cclog.Debugf("Updated node state for '%s' in database", hostname)
return nil
}
@@ -222,6 +253,77 @@ func (r *NodeRepository) UpdateNodeState(hostname string, cluster string, nodeSt
// return nil
// }
// NodeStateWithNode combines a node state row with denormalized node info.
type NodeStateWithNode struct {
ID int64 `db:"id"`
TimeStamp int64 `db:"time_stamp"`
NodeState string `db:"node_state"`
HealthState string `db:"health_state"`
HealthMetrics string `db:"health_metrics"`
CpusAllocated int `db:"cpus_allocated"`
MemoryAllocated int64 `db:"memory_allocated"`
GpusAllocated int `db:"gpus_allocated"`
JobsRunning int `db:"jobs_running"`
Hostname string `db:"hostname"`
Cluster string `db:"cluster"`
SubCluster string `db:"subcluster"`
}
// FindNodeStatesBefore returns all node_state rows with time_stamp < cutoff,
// joined with node info for denormalized archiving.
func (r *NodeRepository) FindNodeStatesBefore(cutoff int64) ([]NodeStateWithNode, error) {
rows, err := sq.Select(
"node_state.id", "node_state.time_stamp", "node_state.node_state",
"node_state.health_state", "node_state.health_metrics",
"node_state.cpus_allocated", "node_state.memory_allocated",
"node_state.gpus_allocated", "node_state.jobs_running",
"node.hostname", "node.cluster", "node.subcluster",
).
From("node_state").
Join("node ON node_state.node_id = node.id").
Where(sq.Lt{"node_state.time_stamp": cutoff}).
Where("node_state.id NOT IN (SELECT ns2.id FROM node_state ns2 WHERE ns2.time_stamp = (SELECT MAX(ns3.time_stamp) FROM node_state ns3 WHERE ns3.node_id = ns2.node_id))").
OrderBy("node.cluster ASC", "node.subcluster ASC", "node.hostname ASC", "node_state.time_stamp ASC").
RunWith(r.DB).Query()
if err != nil {
return nil, err
}
defer rows.Close()
var result []NodeStateWithNode
for rows.Next() {
var ns NodeStateWithNode
var healthMetrics sql.NullString
if err := rows.Scan(&ns.ID, &ns.TimeStamp, &ns.NodeState,
&ns.HealthState, &healthMetrics,
&ns.CpusAllocated, &ns.MemoryAllocated,
&ns.GpusAllocated, &ns.JobsRunning,
&ns.Hostname, &ns.Cluster, &ns.SubCluster); err != nil {
return nil, err
}
ns.HealthMetrics = healthMetrics.String
result = append(result, ns)
}
return result, nil
}
// DeleteNodeStatesBefore removes node_state rows with time_stamp < cutoff,
// but always preserves the row with the latest timestamp per node_id.
func (r *NodeRepository) DeleteNodeStatesBefore(cutoff int64) (int64, error) {
res, err := r.DB.Exec(
`DELETE FROM node_state WHERE time_stamp < ?
AND id NOT IN (
SELECT id FROM node_state ns2
WHERE ns2.time_stamp = (SELECT MAX(ns3.time_stamp) FROM node_state ns3 WHERE ns3.node_id = ns2.node_id)
)`,
cutoff,
)
if err != nil {
return 0, err
}
return res.RowsAffected()
}
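Together the two functions support an archive-then-prune retention pass; a hypothetical periodic job might look like the sketch below, where the retention window and the archiveStates sink are made up for illustration:
// Hypothetical retention pass, illustrative only: archive node states older
// than the cutoff, then delete them. archiveStates is a made-up sink.
func pruneNodeStates(repo *NodeRepository, retention time.Duration,
archiveStates func([]NodeStateWithNode) error) error {
cutoff := time.Now().Add(-retention).Unix()
old, err := repo.FindNodeStatesBefore(cutoff)
if err != nil {
return err
}
if len(old) > 0 {
if err := archiveStates(old); err != nil {
return err
}
}
// Both queries skip the newest row per node, so the current state survives.
deleted, err := repo.DeleteNodeStatesBefore(cutoff)
if err != nil {
return err
}
cclog.Debugf("pruned %d node_state rows older than %v", deleted, retention)
return nil
}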
func (r *NodeRepository) DeleteNode(id int64) error {
_, err := r.DB.Exec(`DELETE FROM node WHERE node.id = ?`, id)
if err != nil {
@@ -241,38 +343,17 @@ func (r *NodeRepository) QueryNodes(
order *model.OrderByInput, // Currently unused!
) ([]*schema.Node, error) {
query, qerr := AccessCheck(ctx,
sq.Select("hostname", "cluster", "subcluster", "node_state", "health_state", "MAX(time_stamp) as time").
sq.Select("node.hostname", "node.cluster", "node.subcluster",
"node_state.node_state", "node_state.health_state").
From("node").
Join("node_state ON node_state.node_id = node.id"))
Join("node_state ON node_state.node_id = node.id").
Where(latestStateCondition()))
if qerr != nil {
return nil, qerr
}
for _, f := range filters {
if f.Cluster != nil {
query = buildStringCondition("cluster", f.Cluster, query)
}
if f.Subcluster != nil {
query = buildStringCondition("subcluster", f.Subcluster, query)
}
if f.Hostname != nil {
query = buildStringCondition("hostname", f.Hostname, query)
}
if f.SchedulerState != nil {
query = query.Where("node_state = ?", f.SchedulerState)
// Requires Additional time_stamp Filter: Else the last (past!) time_stamp with queried state will be returned
now := time.Now().Unix()
query = query.Where(sq.Gt{"time_stamp": (now - 60)})
}
if f.HealthState != nil {
query = query.Where("health_state = ?", f.HealthState)
// Requires Additional time_stamp Filter: Else the last (past!) time_stamp with queried state will be returned
now := time.Now().Unix()
query = query.Where(sq.Gt{"time_stamp": (now - 60)})
}
}
query = query.GroupBy("node_id").OrderBy("hostname ASC")
query = applyNodeFilters(query, filters)
query = query.OrderBy("node.hostname ASC")
if page != nil && page.ItemsPerPage != -1 {
limit := uint64(page.ItemsPerPage)
@@ -290,11 +371,10 @@ func (r *NodeRepository) QueryNodes(
nodes := make([]*schema.Node, 0)
for rows.Next() {
node := schema.Node{}
var timestamp int
if err := rows.Scan(&node.Hostname, &node.Cluster, &node.SubCluster,
&node.NodeState, &node.HealthState, &timestamp); err != nil {
&node.NodeState, &node.HealthState); err != nil {
rows.Close()
cclog.Warnf("Error while scanning rows (QueryNodes) at time '%d'", timestamp)
cclog.Warn("Error while scanning rows (QueryNodes)")
return nil, err
}
nodes = append(nodes, &node)
@@ -386,73 +466,115 @@ func (r *NodeRepository) QueryNodesWithMeta(
return nodes, nil
}
// CountNodes returns the total matched nodes based on a node filter. It always operates
// on the last state (largest timestamp).
func (r *NodeRepository) CountNodes(
// QueryNodesWithMeta returns a list of nodes based on a node filter. It always operates
// on the last state (largest timestamp). It includes both (!) optional JSON column data
func (r *NodeRepository) QueryNodesWithMeta(
ctx context.Context,
filters []*model.NodeFilter,
) (int, error) {
page *model.PageRequest,
order *model.OrderByInput, // Currently unused!
) ([]*schema.Node, error) {
query, qerr := AccessCheck(ctx,
sq.Select("time_stamp", "count(*) as countRes").
sq.Select("node.hostname", "node.cluster", "node.subcluster",
"node_state.node_state", "node_state.health_state",
"node.meta_data", "node_state.health_metrics").
From("node").
Join("node_state ON node_state.node_id = node.id"))
Join("node_state ON node_state.node_id = node.id").
Where(latestStateCondition()))
if qerr != nil {
return 0, qerr
return nil, qerr
}
for _, f := range filters {
if f.Cluster != nil {
query = buildStringCondition("cluster", f.Cluster, query)
}
if f.Subcluster != nil {
query = buildStringCondition("subcluster", f.Subcluster, query)
}
if f.Hostname != nil {
query = buildStringCondition("hostname", f.Hostname, query)
}
if f.SchedulerState != nil {
query = query.Where("node_state = ?", f.SchedulerState)
// Requires Additional time_stamp Filter: Else the last (past!) time_stamp with queried state will be returned
now := time.Now().Unix()
query = query.Where(sq.Gt{"time_stamp": (now - 60)})
}
if f.HealthState != nil {
query = query.Where("health_state = ?", f.HealthState)
// Requires Additional time_stamp Filter: Else the last (past!) time_stamp with queried state will be returned
now := time.Now().Unix()
query = query.Where(sq.Gt{"time_stamp": (now - 60)})
}
}
query = applyNodeFilters(query, filters)
query = query.OrderBy("node.hostname ASC")
query = query.GroupBy("time_stamp").OrderBy("time_stamp DESC").Limit(1)
if page != nil && page.ItemsPerPage != -1 {
limit := uint64(page.ItemsPerPage)
query = query.Offset((uint64(page.Page) - 1) * limit).Limit(limit)
}
rows, err := query.RunWith(r.stmtCache).Query()
if err != nil {
queryString, queryVars, _ := query.ToSql()
cclog.Errorf("Error while running query '%s' %v: %v", queryString, queryVars, err)
return nil, err
}
nodes := make([]*schema.Node, 0)
for rows.Next() {
node := schema.Node{}
rawMetaData := make([]byte, 0)
rawMetricHealth := make([]byte, 0)
if err := rows.Scan(&node.Hostname, &node.Cluster, &node.SubCluster,
&node.NodeState, &node.HealthState, &rawMetaData, &rawMetricHealth); err != nil {
rows.Close()
cclog.Warn("Error while scanning rows (QueryNodesWithMeta)")
return nil, err
}
if len(rawMetaData) == 0 {
node.MetaData = nil
} else {
metaData := make(map[string]string)
if err := json.Unmarshal(rawMetaData, &metaData); err != nil {
cclog.Warn("Error while unmarshaling raw metadata json")
return nil, err
}
node.MetaData = metaData
}
if len(rawMetricHealth) == 0 {
node.HealthData = nil
} else {
healthData := make(map[string][]string)
if err := json.Unmarshal(rawMetricHealth, &healthData); err != nil {
cclog.Warn("Error while unmarshaling raw healthdata json")
return nil, err
}
node.HealthData = healthData
}
nodes = append(nodes, &node)
}
return nodes, nil
}
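Note the two JSON columns decode into different shapes: node.meta_data into map[string]string and node_state.health_metrics into map[string][]string, i.e. a JSON object rather than a bare array. Made-up payloads that would round-trip through this scan:
// Made-up payloads, illustrative only; shapes match the unmarshal targets above.
metaJSON := []byte(`{"rack":"r01","bmc":"10.0.0.5"}`)
healthJSON := []byte(`{"failed":["mem_bw"],"missing":["ib_recv"]}`)
var meta map[string]string
var health map[string][]string
if err := json.Unmarshal(metaJSON, &meta); err != nil {
cclog.Warn("bad meta_data payload")
}
if err := json.Unmarshal(healthJSON, &health); err != nil {
cclog.Warn("bad health_metrics payload")
}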
// CountNodes returns the total matched nodes based on a node filter. It always operates
// on the last state (largest timestamp) per node.
func (r *NodeRepository) CountNodes(
ctx context.Context,
filters []*model.NodeFilter,
) (int, error) {
query, qerr := AccessCheck(ctx,
sq.Select("COUNT(*)").
From("node").
Join("node_state ON node_state.node_id = node.id").
Where(latestStateCondition()))
if qerr != nil {
return 0, qerr
}
query = applyNodeFilters(query, filters)
var count int
if err := query.RunWith(r.stmtCache).QueryRow().Scan(&count); err != nil {
queryString, queryVars, _ := query.ToSql()
cclog.Errorf("Error while running query '%s' %v: %v", queryString, queryVars, err)
return 0, err
}
var totalNodes int
for rows.Next() {
var timestamp int
if err := rows.Scan(&timestamp, &totalNodes); err != nil {
rows.Close()
cclog.Warnf("Error while scanning rows (CountNodes) at time '%d'", timestamp)
return 0, err
}
}
return totalNodes, nil
return count, nil
}
func (r *NodeRepository) ListNodes(cluster string) ([]*schema.Node, error) {
q := sq.Select("node.hostname", "node.cluster", "node.subcluster", "node_state.node_state",
"node_state.health_state", "MAX(node_state.time_stamp) as time").
q := sq.Select("node.hostname", "node.cluster", "node.subcluster",
"node_state.node_state", "node_state.health_state").
From("node").
Join("node_state ON node_state.node_id = node.id").
Where(latestStateCondition()).
Where("node.cluster = ?", cluster).
GroupBy("node_state.node_id").
OrderBy("node.hostname ASC")
rows, err := q.RunWith(r.DB).Query()
@@ -464,10 +586,9 @@ func (r *NodeRepository) ListNodes(cluster string) ([]*schema.Node, error) {
defer rows.Close()
for rows.Next() {
node := &schema.Node{}
var timestamp int
if err := rows.Scan(&node.Hostname, &node.Cluster,
&node.SubCluster, &node.NodeState, &node.HealthState, &timestamp); err != nil {
cclog.Warnf("Error while scanning node list (ListNodes) at time '%d'", timestamp)
&node.SubCluster, &node.NodeState, &node.HealthState); err != nil {
cclog.Warn("Error while scanning node list (ListNodes)")
return nil, err
}
@@ -478,11 +599,11 @@ func (r *NodeRepository) ListNodes(cluster string) ([]*schema.Node, error) {
}
func (r *NodeRepository) MapNodes(cluster string) (map[string]string, error) {
q := sq.Select("node.hostname", "node_state.node_state", "MAX(node_state.time_stamp) as time").
q := sq.Select("node.hostname", "node_state.node_state").
From("node").
Join("node_state ON node_state.node_id = node.id").
Where(latestStateCondition()).
Where("node.cluster = ?", cluster).
GroupBy("node_state.node_id").
OrderBy("node.hostname ASC")
rows, err := q.RunWith(r.DB).Query()
@@ -495,9 +616,8 @@ func (r *NodeRepository) MapNodes(cluster string) (map[string]string, error) {
defer rows.Close()
for rows.Next() {
var hostname, nodestate string
var timestamp int
if err := rows.Scan(&hostname, &nodestate, &timestamp); err != nil {
cclog.Warnf("Error while scanning node list (MapNodes) at time '%d'", timestamp)
if err := rows.Scan(&hostname, &nodestate); err != nil {
cclog.Warn("Error while scanning node list (MapNodes)")
return nil, err
}
@@ -509,37 +629,15 @@ func (r *NodeRepository) MapNodes(cluster string) (map[string]string, error) {
func (r *NodeRepository) CountStates(ctx context.Context, filters []*model.NodeFilter, column string) ([]*model.NodeStates, error) {
query, qerr := AccessCheck(ctx,
sq.Select(column, "COUNT(*) as count").
sq.Select(column).
From("node").
Join("node_state ON node_state.node_id = node.id").
Where(latestStateCondition()).
GroupBy(column))
Where(latestStateCondition()))
if qerr != nil {
return nil, qerr
}
query = query.Join("node_state ON node_state.node_id = node.id")
for _, f := range filters {
if f.Hostname != nil {
query = buildStringCondition("hostname", f.Hostname, query)
}
if f.Cluster != nil {
query = buildStringCondition("cluster", f.Cluster, query)
}
if f.Subcluster != nil {
query = buildStringCondition("subcluster", f.Subcluster, query)
}
if f.SchedulerState != nil {
query = query.Where("node_state = ?", f.SchedulerState)
}
if f.HealthState != nil {
query = query.Where("health_state = ?", f.HealthState)
}
}
// Add Group and Order
query = query.GroupBy("hostname").OrderBy("hostname DESC")
query = applyNodeFilters(query, filters)
rows, err := query.RunWith(r.stmtCache).Query()
if err != nil {
@@ -549,6 +647,18 @@ func (r *NodeRepository) CountStates(ctx context.Context, filters []*model.NodeF
}
defer rows.Close()
stateMap := map[string]int{}
for rows.Next() {
var state string
if err := rows.Scan(&state); err != nil {
rows.Close()
cclog.Warn("Error while scanning rows (CountStates)")
return nil, err
}
stateMap[state] += 1
}
nodes := make([]*model.NodeStates, 0)
for rows.Next() {
var state string
@@ -587,8 +697,8 @@ func (r *NodeRepository) CountStatesTimed(ctx context.Context, filters []*model.
if f.Cluster != nil {
query = buildStringCondition("cluster", f.Cluster, query)
}
if f.Subcluster != nil {
query = buildStringCondition("subcluster", f.Subcluster, query)
if f.SubCluster != nil {
query = buildStringCondition("subcluster", f.SubCluster, query)
}
if f.SchedulerState != nil {
query = query.Where("node_state = ?", f.SchedulerState)
@@ -640,6 +750,132 @@ func (r *NodeRepository) CountStatesTimed(ctx context.Context, filters []*model.
return timedStates, nil
}
func (r *NodeRepository) GetNodesForList(
ctx context.Context,
cluster string,
subCluster string,
stateFilter string,
nodeFilter string,
page *model.PageRequest,
) ([]string, map[string]string, int, bool, error) {
// Init Return Vars
nodes := make([]string, 0)
stateMap := make(map[string]string)
countNodes := 0
hasNextPage := false
// Build Filters
queryFilters := make([]*model.NodeFilter, 0)
if cluster != "" {
queryFilters = append(queryFilters, &model.NodeFilter{Cluster: &model.StringInput{Eq: &cluster}})
}
if subCluster != "" {
queryFilters = append(queryFilters, &model.NodeFilter{SubCluster: &model.StringInput{Eq: &subCluster}})
}
if nodeFilter != "" && stateFilter != "notindb" {
queryFilters = append(queryFilters, &model.NodeFilter{Hostname: &model.StringInput{Contains: &nodeFilter}})
}
if stateFilter != "all" && stateFilter != "notindb" {
queryState := schema.SchedulerState(stateFilter)
queryFilters = append(queryFilters, &model.NodeFilter{SchedulerState: &queryState})
}
// if healthFilter != "all" {
// filters = append(filters, &model.NodeFilter{HealthState: &healthFilter})
// }
// Special Case: Disable Paging for missing nodes filter, save IPP for later
var backupItems int
if stateFilter == "notindb" {
backupItems = page.ItemsPerPage
page.ItemsPerPage = -1
}
// Query Nodes From DB
rawNodes, serr := r.QueryNodes(ctx, queryFilters, page, nil) // Order not Used
if serr != nil {
cclog.Warn("error while loading node database data (Resolver.NodeMetricsList)")
return nil, nil, 0, false, serr
}
// Intermediate Node Result Info
for _, node := range rawNodes {
if node == nil {
continue
}
nodes = append(nodes, node.Hostname)
stateMap[node.Hostname] = string(node.NodeState)
}
// Special Case: Find Nodes not in DB node table but in metricStore only
if stateFilter == "notindb" {
// Reapply Original Paging
page.ItemsPerPage = backupItems
// Get Nodes From Topology
var topoNodes []string
if subCluster != "" {
scNodes := archive.NodeLists[cluster][subCluster]
topoNodes = scNodes.PrintList()
} else {
subClusterNodeLists := archive.NodeLists[cluster]
for _, nodeList := range subClusterNodeLists {
topoNodes = append(topoNodes, nodeList.PrintList()...)
}
}
// Compare to all nodes from cluster/subcluster in DB
var missingNodes []string
for _, scanNode := range topoNodes {
if !slices.Contains(nodes, scanNode) {
missingNodes = append(missingNodes, scanNode)
}
}
// Filter nodes by name
if nodeFilter != "" {
filteredNodesByName := []string{}
for _, missingNode := range missingNodes {
if strings.Contains(missingNode, nodeFilter) {
filteredNodesByName = append(filteredNodesByName, missingNode)
}
}
missingNodes = filteredNodesByName
}
// Sort Missing Nodes Alphanumerically
slices.Sort(missingNodes)
// Total Missing
countNodes = len(missingNodes)
// Apply paging
if countNodes > page.ItemsPerPage {
start := (page.Page - 1) * page.ItemsPerPage
if start > countNodes {
start = countNodes // Guard against out-of-range pages
}
end := start + page.ItemsPerPage
if end > countNodes {
end = countNodes
hasNextPage = false
} else {
hasNextPage = true
}
nodes = missingNodes[start:end]
} else {
nodes = missingNodes
}
} else {
// DB Nodes: Count and derive hasNextPage from count
var cerr error
countNodes, cerr = r.CountNodes(ctx, queryFilters)
if cerr != nil {
cclog.Warn("error while counting node database data (Resolver.NodeMetricsList)")
return nil, nil, 0, false, cerr
}
hasNextPage = page.Page*page.ItemsPerPage < countNodes
}
// Fallback for a non-initialized node table in the DB; ignores stateFilter
if stateFilter == "all" && countNodes == 0 {
nodes, countNodes, hasNextPage = getNodesFromTopol(cluster, subCluster, nodeFilter, page)
}
return nodes, stateMap, countNodes, hasNextPage, nil
}
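A caller such as a GraphQL resolver consumes the five return values roughly as sketched below; the cluster, state filter, and page size are illustrative, not taken from this diff:
// Illustrative caller sketch (hypothetical), assuming context and the node
// repository are in scope.
func listAllocatedNodes(ctx context.Context, repo *NodeRepository) error {
page := &model.PageRequest{Page: 1, ItemsPerPage: 25}
hosts, states, total, more, err := repo.GetNodesForList(ctx, "fritz", "", "allocated", "", page)
if err != nil {
return err
}
for _, h := range hosts {
cclog.Debugf("node %s: scheduler state %s", h, states[h])
}
cclog.Debugf("%d of %d matching nodes on this page, hasNextPage=%v", len(hosts), total, more)
return nil
}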
func AccessCheck(ctx context.Context, query sq.SelectBuilder) (sq.SelectBuilder, error) {
user := GetUserFromContext(ctx)
return AccessCheckWithUser(user, query)
@@ -661,3 +897,51 @@ func AccessCheckWithUser(user *schema.User, query sq.SelectBuilder) (sq.SelectBu
return qnil, fmt.Errorf("user has no or unknown roles")
}
}
func getNodesFromTopol(cluster string, subCluster string, nodeFilter string, page *model.PageRequest) ([]string, int, bool) {
// 0) Init additional vars
hasNextPage := false
totalNodes := 0
// 1) Get list of all nodes
var topolNodes []string
if subCluster != "" {
scNodes := archive.NodeLists[cluster][subCluster]
topolNodes = scNodes.PrintList()
} else {
subClusterNodeLists := archive.NodeLists[cluster]
for _, nodeList := range subClusterNodeLists {
topolNodes = append(topolNodes, nodeList.PrintList()...)
}
}
// 2) Filter nodes
if nodeFilter != "" {
filteredNodes := []string{}
for _, node := range topolNodes {
if strings.Contains(node, nodeFilter) {
filteredNodes = append(filteredNodes, node)
}
}
topolNodes = filteredNodes
}
// 2.1) Count total nodes and sort them (sorting is invalidated after the ccms return ...)
totalNodes = len(topolNodes)
sort.Strings(topolNodes)
// 3) Apply paging
if len(topolNodes) > page.ItemsPerPage {
start := (page.Page - 1) * page.ItemsPerPage
if start > len(topolNodes) {
start = len(topolNodes) // Guard against out-of-range pages
}
end := start + page.ItemsPerPage
if end >= len(topolNodes) {
end = len(topolNodes)
hasNextPage = false
} else {
hasNextPage = true
}
topolNodes = topolNodes[start:end]
}
return topolNodes, totalNodes, hasNextPage
}

View File

@@ -15,9 +15,9 @@ import (
"github.com/ClusterCockpit/cc-backend/internal/config"
"github.com/ClusterCockpit/cc-backend/pkg/archive"
ccconf "github.com/ClusterCockpit/cc-lib/ccConfig"
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
"github.com/ClusterCockpit/cc-lib/schema"
ccconf "github.com/ClusterCockpit/cc-lib/v2/ccConfig"
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
"github.com/ClusterCockpit/cc-lib/v2/schema"
_ "github.com/mattn/go-sqlite3"
)
@@ -26,7 +26,7 @@ func nodeTestSetup(t *testing.T) {
"main": {
"addr": "0.0.0.0:8080",
"validate": false,
"apiAllowedIPs": [
"api-allowed-ips": [
"*"
]
},
@@ -38,18 +38,7 @@ func nodeTestSetup(t *testing.T) {
"jwts": {
"max-age": "2m"
}
},
"clusters": [
{
"name": "testcluster",
"metricDataRepository": {"kind": "test", "url": "bla:8081"},
"filterRanges": {
"numNodes": { "from": 1, "to": 64 },
"duration": { "from": 0, "to": 86400 },
"startTime": { "from": "2022-01-01T00:00:00Z", "to": null }
}
}
]
}`
const testclusterJSON = `{
"name": "testcluster",
@@ -130,7 +119,7 @@ func nodeTestSetup(t *testing.T) {
}
dbfilepath := filepath.Join(tmpdir, "test.db")
err := MigrateDB("sqlite3", dbfilepath)
err := MigrateDB(dbfilepath)
if err != nil {
t.Fatal(err)
}
@@ -144,19 +133,22 @@ func nodeTestSetup(t *testing.T) {
// Load and check main configuration
if cfg := ccconf.GetPackageConfig("main"); cfg != nil {
if clustercfg := ccconf.GetPackageConfig("clusters"); clustercfg != nil {
config.Init(cfg, clustercfg)
} else {
cclog.Abort("Cluster configuration must be present")
}
config.Init(cfg)
} else {
cclog.Abort("Main configuration must be present")
}
archiveCfg := fmt.Sprintf("{\"kind\": \"file\",\"path\": \"%s\"}", jobarchive)
Connect("sqlite3", dbfilepath)
if err := ResetConnection(); err != nil {
t.Fatal(err)
}
t.Cleanup(func() {
ResetConnection()
})
if err := archive.Init(json.RawMessage(archiveCfg), config.Keys.DisableArchive); err != nil {
Connect(dbfilepath)
if err := archive.Init(json.RawMessage(archiveCfg)); err != nil {
t.Fatal(err)
}
}
@@ -164,8 +156,12 @@ func nodeTestSetup(t *testing.T) {
func TestUpdateNodeState(t *testing.T) {
nodeTestSetup(t)
repo := GetNodeRepository()
now := time.Now().Unix()
nodeState := schema.NodeStateDB{
TimeStamp: time.Now().Unix(), NodeState: "allocated",
TimeStamp: now,
NodeState: "allocated",
CpusAllocated: 72,
MemoryAllocated: 480,
GpusAllocated: 0,
@@ -173,18 +169,152 @@ func TestUpdateNodeState(t *testing.T) {
JobsRunning: 1,
}
repo := GetNodeRepository()
err := repo.UpdateNodeState("host124", "testcluster", &nodeState)
if err != nil {
return
t.Fatal(err)
}
node, err := repo.GetNode("host124", "testcluster", false)
if err != nil {
return
t.Fatal(err)
}
if node.NodeState != "allocated" {
t.Errorf("wrong node state\ngot: %s \nwant: allocated ", node.NodeState)
}
t.Run("FindBeforeEmpty", func(t *testing.T) {
// Only the current-timestamp row exists, so nothing should be found before now
rows, err := repo.FindNodeStatesBefore(now)
if err != nil {
t.Fatal(err)
}
if len(rows) != 0 {
t.Errorf("expected 0 rows, got %d", len(rows))
}
})
t.Run("DeleteOldRows", func(t *testing.T) {
// Insert 2 more old rows for host124
for i, ts := range []int64{now - 7200, now - 3600} {
ns := schema.NodeStateDB{
TimeStamp: ts,
NodeState: "allocated",
HealthState: schema.MonitoringStateFull,
CpusAllocated: 72,
MemoryAllocated: 480,
JobsRunning: i,
}
if err := repo.UpdateNodeState("host124", "testcluster", &ns); err != nil {
t.Fatal(err)
}
}
// Delete rows older than 30 minutes
cutoff := now - 1800
cnt, err := repo.DeleteNodeStatesBefore(cutoff)
if err != nil {
t.Fatal(err)
}
// Should delete the 2 old rows
if cnt != 2 {
t.Errorf("expected 2 deleted rows, got %d", cnt)
}
// Latest row should still exist
node, err := repo.GetNode("host124", "testcluster", false)
if err != nil {
t.Fatal(err)
}
if node.NodeState != "allocated" {
t.Errorf("expected node state 'allocated', got %s", node.NodeState)
}
})
t.Run("PreservesLatestPerNode", func(t *testing.T) {
// Insert a single old row for host125 — it's the latest per node so it must survive
ns := schema.NodeStateDB{
TimeStamp: now - 7200,
NodeState: "idle",
HealthState: schema.MonitoringStateFull,
CpusAllocated: 0,
MemoryAllocated: 0,
JobsRunning: 0,
}
if err := repo.UpdateNodeState("host125", "testcluster", &ns); err != nil {
t.Fatal(err)
}
// Delete everything older than now — the latest per node should be preserved
_, err := repo.DeleteNodeStatesBefore(now)
if err != nil {
t.Fatal(err)
}
// The latest row for host125 must still exist
node, err := repo.GetNode("host125", "testcluster", false)
if err != nil {
t.Fatal(err)
}
if node.NodeState != "idle" {
t.Errorf("expected node state 'idle', got %s", node.NodeState)
}
// Verify exactly 1 row remains for host125
var countAfter int
if err := repo.DB.QueryRow(
"SELECT COUNT(*) FROM node_state WHERE node_id = (SELECT id FROM node WHERE hostname = 'host125')").
Scan(&countAfter); err != nil {
t.Fatal(err)
}
if countAfter != 1 {
t.Errorf("expected 1 row remaining for host125, got %d", countAfter)
}
})
t.Run("FindBeforeWithJoin", func(t *testing.T) {
// Insert old and current rows for host123
for _, ts := range []int64{now - 7200, now} {
ns := schema.NodeStateDB{
TimeStamp: ts,
NodeState: "allocated",
HealthState: schema.MonitoringStateFull,
CpusAllocated: 8,
MemoryAllocated: 1024,
GpusAllocated: 1,
JobsRunning: 1,
}
if err := repo.UpdateNodeState("host123", "testcluster", &ns); err != nil {
t.Fatal(err)
}
}
// Find rows older than 30 minutes, excluding latest per node
cutoff := now - 1800
rows, err := repo.FindNodeStatesBefore(cutoff)
if err != nil {
t.Fatal(err)
}
// Should find the old host123 row
found := false
for _, row := range rows {
if row.Hostname == "host123" && row.TimeStamp == now-7200 {
found = true
if row.Cluster != "testcluster" {
t.Errorf("expected cluster 'testcluster', got %s", row.Cluster)
}
if row.SubCluster != "sc1" {
t.Errorf("expected subcluster 'sc1', got %s", row.SubCluster)
}
if row.CpusAllocated != 8 {
t.Errorf("expected cpus_allocated 8, got %d", row.CpusAllocated)
}
}
}
if !found {
t.Errorf("expected to find old host123 row among %d results", len(rows))
}
})
}

View File

@@ -6,11 +6,13 @@ package repository
import (
"context"
"os"
"path/filepath"
"testing"
"github.com/ClusterCockpit/cc-backend/internal/graph/model"
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
"github.com/ClusterCockpit/cc-lib/schema"
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
"github.com/ClusterCockpit/cc-lib/v2/schema"
_ "github.com/mattn/go-sqlite3"
)
@@ -46,7 +48,7 @@ func BenchmarkSelect1(b *testing.B) {
}
func BenchmarkDB_FindJobById(b *testing.B) {
var jobId int64 = 1677322
var jobID int64 = 1677322
b.Run("FindJobById", func(b *testing.B) {
db := setup(b)
@@ -55,7 +57,7 @@ func BenchmarkDB_FindJobById(b *testing.B) {
b.RunParallel(func(pb *testing.PB) {
for pb.Next() {
_, err := db.FindById(getContext(b), jobId)
_, err := db.FindByID(getContext(b), jobID)
noErr(b, err)
}
})
@@ -63,7 +65,7 @@ func BenchmarkDB_FindJobById(b *testing.B) {
}
func BenchmarkDB_FindJob(b *testing.B) {
var jobId int64 = 107266
var jobID int64 = 107266
var startTime int64 = 1657557241
cluster := "fritz"
@@ -74,7 +76,7 @@ func BenchmarkDB_FindJob(b *testing.B) {
b.RunParallel(func(pb *testing.PB) {
for pb.Next() {
_, err := db.Find(&jobId, &cluster, &startTime)
_, err := db.Find(&jobID, &cluster, &startTime)
noErr(b, err)
}
})
@@ -148,10 +150,24 @@ func getContext(tb testing.TB) context.Context {
func setup(tb testing.TB) *JobRepository {
tb.Helper()
cclog.Init("warn", true)
dbfile := "testdata/job.db"
err := MigrateDB("sqlite3", dbfile)
// Copy test DB to a temp file for test isolation
srcData, err := os.ReadFile("testdata/job.db")
noErr(tb, err)
Connect("sqlite3", dbfile)
dbfile := filepath.Join(tb.TempDir(), "job.db")
err = os.WriteFile(dbfile, srcData, 0o644)
noErr(tb, err)
// Reset singletons so Connect uses the new temp DB
err = ResetConnection()
noErr(tb, err)
tb.Cleanup(func() {
ResetConnection()
})
err = MigrateDB(dbfile)
noErr(tb, err)
Connect(dbfile)
return GetJobRepository()
}

Some files were not shown because too many files have changed in this diff.