feat: Add support for multiple external metric stores

This commit is contained in:
2026-01-27 10:02:07 +01:00
parent 4853814228
commit b307e885ce
9 changed files with 280 additions and 87 deletions

View File

@@ -0,0 +1,29 @@
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved. This file is part of cc-backend.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
package metricdispatch
// configSchema is the JSON schema for the metric store configuration.
//
// It describes an array of objects. Every object must carry the three string
// properties "scope", "url" and "token".
const configSchema = `{
"type": "array",
"description": "Array of metric store configurations with scope-based routing.",
"items": {
"type": "object",
"properties": {
"scope": {
"description": "Scope identifier for routing metrics (e.g., cluster name, '*' for default)",
"type": "string"
},
"url": {
"description": "URL of the metric store endpoint",
"type": "string"
},
"token": {
"description": "Authentication token for the metric store",
"type": "string"
}
},
"required": ["scope", "url", "token"]
}
}`

View File

@@ -44,7 +44,6 @@ import (
"time"
"github.com/ClusterCockpit/cc-backend/pkg/archive"
"github.com/ClusterCockpit/cc-backend/pkg/metricstore"
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
"github.com/ClusterCockpit/cc-lib/v2/lrucache"
"github.com/ClusterCockpit/cc-lib/v2/resampler"
@@ -96,6 +95,13 @@ func LoadData(job *schema.Job,
if job.State == schema.JobStateRunning ||
job.MonitoringStatus == schema.MonitoringStatusRunningOrArchiving {
ms, err := GetMetricDataRepo(job.Cluster, job.SubCluster)
if err != nil {
cclog.Errorf("failed to load job data from metric store for job %d (user: %s, project: %s): %s",
job.JobID, job.User, job.Project, err.Error())
return err, 0, 0
}
if scopes == nil {
scopes = append(scopes, schema.MetricScopeNode)
}
@@ -107,7 +113,7 @@ func LoadData(job *schema.Job,
}
}
jd, err = metricstore.LoadData(job, metrics, scopes, ctx, resolution)
jd, err = ms.LoadData(job, metrics, scopes, ctx, resolution)
if err != nil {
if len(jd) != 0 {
cclog.Warnf("partial error loading metrics from store for job %d (user: %s, project: %s): %s",
@@ -236,7 +242,14 @@ func LoadAverages(
return archive.LoadAveragesFromArchive(job, metrics, data) // #166 change also here?
}
stats, err := metricstore.LoadStats(job, metrics, ctx)
ms, err := GetMetricDataRepo(job.Cluster, job.SubCluster)
if err != nil {
cclog.Errorf("failed to load job data from metric store for job %d (user: %s, project: %s): %s",
job.JobID, job.User, job.Project, err.Error())
return err
}
stats, err := ms.LoadStats(job, metrics, ctx)
if err != nil {
cclog.Errorf("failed to load statistics from metric store for job %d (user: %s, project: %s): %s",
job.JobID, job.User, job.Project, err.Error())
@@ -273,7 +286,14 @@ func LoadScopedJobStats(
return archive.LoadScopedStatsFromArchive(job, metrics, scopes)
}
scopedStats, err := metricstore.LoadScopedStats(job, metrics, scopes, ctx)
ms, err := GetMetricDataRepo(job.Cluster, job.SubCluster)
if err != nil {
cclog.Errorf("failed to load job data from metric store for job %d (user: %s, project: %s): %s",
job.JobID, job.User, job.Project, err.Error())
return nil, err
}
scopedStats, err := ms.LoadScopedStats(job, metrics, scopes, ctx)
if err != nil {
cclog.Errorf("failed to load scoped statistics from metric store for job %d (user: %s, project: %s): %s",
job.JobID, job.User, job.Project, err.Error())
@@ -295,9 +315,16 @@ func LoadJobStats(
return archive.LoadStatsFromArchive(job, metrics)
}
ms, err := GetMetricDataRepo(job.Cluster, job.SubCluster)
if err != nil {
cclog.Errorf("failed to load job data from metric store for job %d (user: %s, project: %s): %s",
job.JobID, job.User, job.Project, err.Error())
return nil, err
}
data := make(map[string]schema.MetricStatistics, len(metrics))
stats, err := metricstore.LoadStats(job, metrics, ctx)
stats, err := ms.LoadStats(job, metrics, ctx)
if err != nil {
cclog.Errorf("failed to load statistics from metric store for job %d (user: %s, project: %s): %s",
job.JobID, job.User, job.Project, err.Error())
@@ -333,6 +360,7 @@ func LoadJobStats(
// the metric store (not the archive) since it's for current/recent node status monitoring.
//
// Returns a nested map structure: node -> metric -> scoped data.
// FIXME: Add support for subcluster-specific cc-metric-stores
func LoadNodeData(
cluster string,
metrics, nodes []string,
@@ -346,7 +374,14 @@ func LoadNodeData(
}
}
data, err := metricstore.LoadNodeData(cluster, metrics, nodes, scopes, from, to, ctx)
ms, err := GetMetricDataRepo(cluster, "")
if err != nil {
cclog.Errorf("failed to load node data from metric store: %s",
err.Error())
return nil, err
}
data, err := ms.LoadNodeData(cluster, metrics, nodes, scopes, from, to, ctx)
if err != nil {
if len(data) != 0 {
cclog.Warnf("partial error loading node data from metric store for cluster %s: %s", cluster, err.Error())
@@ -383,7 +418,14 @@ func LoadNodeListData(
}
}
data, err := metricstore.LoadNodeListData(cluster, subCluster, nodes, metrics, scopes, resolution, from, to, ctx)
ms, err := GetMetricDataRepo(cluster, subCluster)
if err != nil {
cclog.Errorf("failed to load node data from metric store: %s",
err.Error())
return nil, err
}
data, err := ms.LoadNodeListData(cluster, subCluster, nodes, metrics, scopes, resolution, from, to, ctx)
if err != nil {
if len(data) != 0 {
cclog.Warnf("partial error loading node list data from metric store for cluster %s, subcluster %s: %s",

View File

@@ -0,0 +1,112 @@
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
package metricdispatch
import (
"bytes"
"context"
"encoding/json"
"fmt"
"time"
"github.com/ClusterCockpit/cc-backend/internal/config"
ccms "github.com/ClusterCockpit/cc-backend/internal/metricstoreclient"
"github.com/ClusterCockpit/cc-backend/pkg/metricstore"
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
"github.com/ClusterCockpit/cc-lib/v2/schema"
)
// MetricDataRepository is the set of query operations that a metric data
// backend has to provide. GetMetricDataRepo returns values of this type.
type MetricDataRepository interface {
// LoadData returns the JobData for the given job, only with the requested metrics.
LoadData(job *schema.Job,
metrics []string,
scopes []schema.MetricScope,
ctx context.Context,
resolution int) (schema.JobData, error)
// LoadStats returns a map of metrics to a map of nodes to the metric
// statistics of the job. Node scope only.
LoadStats(job *schema.Job,
metrics []string,
ctx context.Context) (map[string]map[string]schema.MetricStatistics, error)
// LoadScopedStats returns a map of metrics to a map of scopes to the scoped
// metric statistics of the job.
LoadScopedStats(job *schema.Job,
metrics []string,
scopes []schema.MetricScope,
ctx context.Context) (schema.ScopedJobStats, error)
// LoadNodeData returns a map of hosts to a map of metrics at the requested
// scopes (currently only node) for that node.
LoadNodeData(cluster string,
metrics, nodes []string,
scopes []schema.MetricScope,
from, to time.Time,
ctx context.Context) (map[string]map[string][]*schema.JobMetric, error)
// LoadNodeListData returns a map of hosts to a map of metrics to a map of
// scopes for multiple nodes.
LoadNodeListData(cluster, subCluster string,
nodes []string,
metrics []string,
scopes []schema.MetricScope,
resolution int,
from, to time.Time,
ctx context.Context) (map[string]schema.JobData, error)
}
// CCMetricStoreConfig is one entry of the metric store configuration array
// that Init decodes.
type CCMetricStoreConfig struct {
// Scope is the key under which Init registers the store. GetMetricDataRepo
// looks up "<cluster>-<subcluster>", then "<cluster>", then "*".
Scope string `json:"scope"`
// URL is handed to ccms.NewCCMetricStore as the store's endpoint URL.
URL string `json:"url"`
// Token is handed to ccms.NewCCMetricStore as the authentication token.
Token string `json:"token"`
}
// metricDataRepos maps a configured scope string to the metric data
// repository registered for it. Init fills it; GetMetricDataRepo reads it.
var metricDataRepos = make(map[string]MetricDataRepository)
func Init(rawConfig json.RawMessage) error {
if rawConfig != nil {
var configs []CCMetricStoreConfig
config.Validate(configSchema, rawConfig)
dec := json.NewDecoder(bytes.NewReader(rawConfig))
dec.DisallowUnknownFields()
if err := dec.Decode(&configs); err != nil {
return fmt.Errorf("[METRICDISPATCH]> Metric Store Config Init: Could not decode config file '%s' Error: %s", rawConfig, err.Error())
}
if len(configs) == 0 {
return fmt.Errorf("[METRICDISPATCH]> No metric store configurations found in config file")
}
for _, config := range configs {
metricDataRepos[config.Scope] = ccms.NewCCMetricStore(config.URL, config.Token)
}
}
return nil
}
// GetMetricDataRepo selects the metric data repository for the given cluster
// and subcluster.
//
// The registered scopes are tried from most to least specific:
// "<cluster>-<subcluster>", then "<cluster>", then the wildcard "*". If none
// of them is registered, metricstore.MetricStoreHandle is used instead. An
// error is returned only when that handle is nil as well.
func GetMetricDataRepo(cluster string, subcluster string) (MetricDataRepository, error) {
	key := cluster + "-" + subcluster

	// Candidate scopes in lookup order; the first registered one wins.
	for _, scope := range [...]string{key, cluster, "*"} {
		if found, registered := metricDataRepos[scope]; registered {
			return found, nil
		}
	}

	if metricstore.MetricStoreHandle == nil {
		return nil, fmt.Errorf("[METRICDISPATCH]> no metric data repository configured '%s'", key)
	}

	var fallback MetricDataRepository = metricstore.MetricStoreHandle
	cclog.Debugf("[METRICDISPATCH]> Using internal metric data repository for '%s'", key)
	return fallback, nil
}