2021-10-26 10:24:43 +02:00
|
|
|
package metricdata
|
|
|
|
|
|
|
|
import (
|
|
|
|
"context"
|
2022-03-17 16:15:35 +01:00
|
|
|
"encoding/json"
|
2021-11-26 10:32:36 +01:00
|
|
|
"fmt"
|
2022-01-17 13:33:35 +01:00
|
|
|
"time"
|
2021-10-26 10:24:43 +02:00
|
|
|
|
2022-01-27 09:40:59 +01:00
|
|
|
"github.com/ClusterCockpit/cc-backend/config"
|
2022-03-01 16:01:25 +01:00
|
|
|
"github.com/ClusterCockpit/cc-backend/log"
|
2022-01-27 09:40:59 +01:00
|
|
|
"github.com/ClusterCockpit/cc-backend/schema"
|
2022-01-17 13:33:35 +01:00
|
|
|
"github.com/iamlouk/lrucache"
|
2021-10-26 10:24:43 +02:00
|
|
|
)
|
|
|
|
|
2021-12-08 10:14:45 +01:00
|
|
|
type MetricDataRepository interface {
|
2021-12-09 16:25:48 +01:00
|
|
|
// Initialize this MetricDataRepository. One instance of
|
|
|
|
// this interface will only ever be responsible for one cluster.
|
2022-03-17 16:15:35 +01:00
|
|
|
Init(rawConfig json.RawMessage) error
|
2021-12-09 16:25:48 +01:00
|
|
|
|
|
|
|
// Return the JobData for the given job, only with the requested metrics.
|
2022-01-07 09:47:41 +01:00
|
|
|
LoadData(job *schema.Job, metrics []string, scopes []schema.MetricScope, ctx context.Context) (schema.JobData, error)
|
2021-12-09 16:25:48 +01:00
|
|
|
|
2022-01-12 13:03:01 +01:00
|
|
|
// Return a map of metrics to a map of nodes to the metric statistics of the job. node scope assumed for now.
|
2021-12-17 15:49:22 +01:00
|
|
|
LoadStats(job *schema.Job, metrics []string, ctx context.Context) (map[string]map[string]schema.MetricStatistics, error)
|
2021-12-09 16:25:48 +01:00
|
|
|
|
2022-01-31 15:16:05 +01:00
|
|
|
// Return a map of hosts to a map of metrics at the requested scopes for that node.
|
2022-03-24 14:34:42 +01:00
|
|
|
LoadNodeData(cluster string, metrics, nodes []string, scopes []schema.MetricScope, from, to time.Time, ctx context.Context) (map[string]map[string][]*schema.JobMetric, error)
|
2021-12-08 10:14:45 +01:00
|
|
|
}
|
|
|
|
|
|
|
|
var metricDataRepos map[string]MetricDataRepository = map[string]MetricDataRepository{}
|
2021-11-26 10:32:36 +01:00
|
|
|
|
2021-12-08 10:14:45 +01:00
|
|
|
var JobArchivePath string
|
|
|
|
|
2021-12-16 09:35:03 +01:00
|
|
|
var useArchive bool
|
|
|
|
|
|
|
|
func Init(jobArchivePath string, disableArchive bool) error {
|
|
|
|
useArchive = !disableArchive
|
2021-12-08 10:14:45 +01:00
|
|
|
JobArchivePath = jobArchivePath
|
|
|
|
for _, cluster := range config.Clusters {
|
|
|
|
if cluster.MetricDataRepository != nil {
|
2022-03-17 16:15:35 +01:00
|
|
|
var kind struct {
|
|
|
|
Kind string `json:"kind"`
|
|
|
|
}
|
|
|
|
if err := json.Unmarshal(cluster.MetricDataRepository, &kind); err != nil {
|
|
|
|
return err
|
|
|
|
}
|
|
|
|
|
2022-01-24 10:06:25 +01:00
|
|
|
var mdr MetricDataRepository
|
2022-03-17 16:15:35 +01:00
|
|
|
switch kind.Kind {
|
2021-12-08 10:14:45 +01:00
|
|
|
case "cc-metric-store":
|
2022-01-24 10:06:25 +01:00
|
|
|
mdr = &CCMetricStore{}
|
2022-03-15 18:35:27 +01:00
|
|
|
case "influxdb":
|
|
|
|
mdr = &InfluxDBv2DataRepository{}
|
2022-01-24 10:06:25 +01:00
|
|
|
case "test":
|
|
|
|
mdr = &TestMetricDataRepository{}
|
2021-12-08 10:14:45 +01:00
|
|
|
default:
|
2022-03-17 16:15:35 +01:00
|
|
|
return fmt.Errorf("unkown metric data repository '%s' for cluster '%s'", kind.Kind, cluster.Name)
|
2021-12-08 10:14:45 +01:00
|
|
|
}
|
2022-01-24 10:06:25 +01:00
|
|
|
|
2022-03-17 16:15:35 +01:00
|
|
|
if err := mdr.Init(cluster.MetricDataRepository); err != nil {
|
2022-01-24 10:06:25 +01:00
|
|
|
return err
|
|
|
|
}
|
|
|
|
metricDataRepos[cluster.Name] = mdr
|
2021-12-08 10:14:45 +01:00
|
|
|
}
|
2021-11-26 10:32:36 +01:00
|
|
|
}
|
2021-12-08 10:14:45 +01:00
|
|
|
return nil
|
2021-11-26 10:32:36 +01:00
|
|
|
}
|
|
|
|
|
2022-01-20 10:08:50 +01:00
|
|
|
// cache is the package-wide LRU cache, created with a limit of
// 512 * 1024 * 1024.
var cache = lrucache.New(512 * 1024 * 1024)
|
2022-01-17 13:33:35 +01:00
|
|
|
|
2021-10-26 10:24:43 +02:00
|
|
|
// Fetches the metric data for a job.
|
2022-01-07 09:47:41 +01:00
|
|
|
func LoadData(job *schema.Job, metrics []string, scopes []schema.MetricScope, ctx context.Context) (schema.JobData, error) {
|
2022-03-21 13:30:19 +01:00
|
|
|
data := cache.Get(cacheKey(job, metrics, scopes), func() (_ interface{}, ttl time.Duration, size int) {
|
2022-01-20 10:08:50 +01:00
|
|
|
var jd schema.JobData
|
|
|
|
var err error
|
2022-02-16 09:06:23 +01:00
|
|
|
if job.State == schema.JobStateRunning ||
|
|
|
|
job.MonitoringStatus == schema.MonitoringStatusRunningOrArchiving ||
|
|
|
|
!useArchive {
|
2022-01-20 10:08:50 +01:00
|
|
|
repo, ok := metricDataRepos[job.Cluster]
|
|
|
|
if !ok {
|
|
|
|
return fmt.Errorf("no metric data repository configured for '%s'", job.Cluster), 0, 0
|
|
|
|
}
|
2022-01-17 13:33:35 +01:00
|
|
|
|
2022-01-20 10:08:50 +01:00
|
|
|
if scopes == nil {
|
|
|
|
scopes = append(scopes, schema.MetricScopeNode)
|
|
|
|
}
|
2021-12-08 10:14:45 +01:00
|
|
|
|
2022-01-20 10:08:50 +01:00
|
|
|
if metrics == nil {
|
2022-03-14 10:18:56 +01:00
|
|
|
cluster := config.GetCluster(job.Cluster)
|
2022-01-20 10:08:50 +01:00
|
|
|
for _, mc := range cluster.MetricConfig {
|
|
|
|
metrics = append(metrics, mc.Name)
|
|
|
|
}
|
|
|
|
}
|
2022-01-17 13:33:35 +01:00
|
|
|
|
2022-01-20 10:08:50 +01:00
|
|
|
jd, err = repo.LoadData(job, metrics, scopes, ctx)
|
|
|
|
if err != nil {
|
2022-03-01 16:01:25 +01:00
|
|
|
if len(jd) != 0 {
|
2022-03-09 14:27:47 +01:00
|
|
|
log.Errorf("partial error: %s", err.Error())
|
2022-03-01 16:01:25 +01:00
|
|
|
} else {
|
|
|
|
return err, 0, 0
|
|
|
|
}
|
2022-01-20 10:08:50 +01:00
|
|
|
}
|
2022-03-21 13:30:19 +01:00
|
|
|
size = jd.Size()
|
2022-01-20 10:08:50 +01:00
|
|
|
} else {
|
|
|
|
jd, err = loadFromArchive(job)
|
|
|
|
if err != nil {
|
|
|
|
return err, 0, 0
|
|
|
|
}
|
|
|
|
|
2022-03-21 13:30:19 +01:00
|
|
|
// Avoid sending unrequested data to the client:
|
2022-01-20 10:08:50 +01:00
|
|
|
if metrics != nil {
|
|
|
|
res := schema.JobData{}
|
|
|
|
for _, metric := range metrics {
|
2022-03-21 13:30:19 +01:00
|
|
|
if perscope, ok := jd[metric]; ok {
|
|
|
|
if len(scopes) > 1 {
|
|
|
|
subset := make(map[schema.MetricScope]*schema.JobMetric)
|
|
|
|
for _, scope := range scopes {
|
|
|
|
if jm, ok := perscope[scope]; ok {
|
|
|
|
subset[scope] = jm
|
|
|
|
}
|
|
|
|
}
|
|
|
|
perscope = subset
|
|
|
|
}
|
|
|
|
|
|
|
|
res[metric] = perscope
|
2022-01-20 10:08:50 +01:00
|
|
|
}
|
|
|
|
}
|
|
|
|
jd = res
|
2022-01-17 13:33:35 +01:00
|
|
|
}
|
2022-03-21 13:30:19 +01:00
|
|
|
size = 1 // loadFromArchive() caches in the same cache.
|
2022-01-17 13:33:35 +01:00
|
|
|
}
|
|
|
|
|
2022-03-21 13:30:19 +01:00
|
|
|
ttl = 5 * time.Hour
|
2022-01-20 10:08:50 +01:00
|
|
|
if job.State == schema.JobStateRunning {
|
|
|
|
ttl = 2 * time.Minute
|
2022-01-10 16:13:40 +01:00
|
|
|
}
|
|
|
|
|
2022-01-20 10:08:50 +01:00
|
|
|
prepareJobData(job, jd, scopes)
|
2022-03-21 13:30:19 +01:00
|
|
|
return jd, ttl, size
|
2022-01-20 10:08:50 +01:00
|
|
|
})
|
2021-11-26 10:32:36 +01:00
|
|
|
|
2022-01-20 10:08:50 +01:00
|
|
|
if err, ok := data.(error); ok {
|
2021-10-26 10:24:43 +02:00
|
|
|
return nil, err
|
|
|
|
}
|
|
|
|
|
2022-01-20 10:08:50 +01:00
|
|
|
return data.(schema.JobData), nil
|
2021-10-26 10:24:43 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
// Used for the jobsFootprint GraphQL-Query. TODO: Rename/Generalize.
|
2021-12-17 15:49:22 +01:00
|
|
|
func LoadAverages(job *schema.Job, metrics []string, data [][]schema.Float, ctx context.Context) error {
|
|
|
|
if job.State != schema.JobStateRunning && useArchive {
|
2021-12-08 11:50:16 +01:00
|
|
|
return loadAveragesFromArchive(job, metrics, data)
|
|
|
|
}
|
|
|
|
|
2021-12-16 13:17:48 +01:00
|
|
|
repo, ok := metricDataRepos[job.Cluster]
|
2021-12-08 11:50:16 +01:00
|
|
|
if !ok {
|
2021-12-16 13:17:48 +01:00
|
|
|
return fmt.Errorf("no metric data repository configured for '%s'", job.Cluster)
|
2021-12-08 11:50:16 +01:00
|
|
|
}
|
|
|
|
|
|
|
|
stats, err := repo.LoadStats(job, metrics, ctx)
|
|
|
|
if err != nil {
|
|
|
|
return err
|
|
|
|
}
|
|
|
|
|
|
|
|
for i, m := range metrics {
|
|
|
|
nodes, ok := stats[m]
|
|
|
|
if !ok {
|
|
|
|
data[i] = append(data[i], schema.NaN)
|
|
|
|
continue
|
|
|
|
}
|
|
|
|
|
|
|
|
sum := 0.0
|
|
|
|
for _, node := range nodes {
|
|
|
|
sum += node.Avg
|
|
|
|
}
|
|
|
|
data[i] = append(data[i], schema.Float(sum))
|
2021-10-26 10:24:43 +02:00
|
|
|
}
|
|
|
|
|
2021-12-08 11:50:16 +01:00
|
|
|
return nil
|
|
|
|
}
|
|
|
|
|
2022-01-31 15:16:05 +01:00
|
|
|
// Used for the node/system view. Returns a map of nodes to a map of metrics.
|
2022-03-24 14:34:42 +01:00
|
|
|
func LoadNodeData(cluster string, metrics, nodes []string, scopes []schema.MetricScope, from, to time.Time, ctx context.Context) (map[string]map[string][]*schema.JobMetric, error) {
|
2022-01-31 15:16:05 +01:00
|
|
|
repo, ok := metricDataRepos[cluster]
|
2021-12-08 11:50:16 +01:00
|
|
|
if !ok {
|
2022-01-31 15:16:05 +01:00
|
|
|
return nil, fmt.Errorf("no metric data repository configured for '%s'", cluster)
|
2021-12-08 11:50:16 +01:00
|
|
|
}
|
|
|
|
|
2021-12-16 09:35:03 +01:00
|
|
|
if metrics == nil {
|
2022-03-14 10:18:56 +01:00
|
|
|
for _, m := range config.GetCluster(cluster).MetricConfig {
|
2021-12-16 09:35:03 +01:00
|
|
|
metrics = append(metrics, m.Name)
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2022-03-24 14:34:42 +01:00
|
|
|
data, err := repo.LoadNodeData(cluster, metrics, nodes, scopes, from, to, ctx)
|
2021-12-08 11:50:16 +01:00
|
|
|
if err != nil {
|
2022-03-09 14:27:47 +01:00
|
|
|
if len(data) != 0 {
|
|
|
|
log.Errorf("partial error: %s", err.Error())
|
|
|
|
} else {
|
|
|
|
return nil, err
|
|
|
|
}
|
2021-12-08 11:50:16 +01:00
|
|
|
}
|
|
|
|
|
|
|
|
if data == nil {
|
2022-01-31 15:16:05 +01:00
|
|
|
return nil, fmt.Errorf("the metric data repository for '%s' does not support this query", cluster)
|
2021-12-08 11:50:16 +01:00
|
|
|
}
|
|
|
|
|
|
|
|
return data, nil
|
2021-10-26 10:24:43 +02:00
|
|
|
}
|
2022-01-17 13:33:35 +01:00
|
|
|
|
|
|
|
func cacheKey(job *schema.Job, metrics []string, scopes []schema.MetricScope) string {
|
|
|
|
// Duration and StartTime do not need to be in the cache key as StartTime is less unique than
|
|
|
|
// job.ID and the TTL of the cache entry makes sure it does not stay there forever.
|
2022-01-20 10:08:50 +01:00
|
|
|
return fmt.Sprintf("%d(%s):[%v],[%v]",
|
|
|
|
job.ID, job.State, metrics, scopes)
|
|
|
|
}
|
|
|
|
|
|
|
|
// For /monitoring/job/<job> and some other places, flops_any and mem_bw need to be available at the scope 'node'.
|
|
|
|
// If a job has a lot of nodes, statisticsSeries should be available so that a min/mean/max Graph can be used instead of
|
|
|
|
// a lot of single lines.
|
|
|
|
func prepareJobData(job *schema.Job, jobData schema.JobData, scopes []schema.MetricScope) {
|
|
|
|
const maxSeriesSize int = 15
|
|
|
|
for _, scopes := range jobData {
|
|
|
|
for _, jm := range scopes {
|
|
|
|
if jm.StatisticsSeries != nil || len(jm.Series) <= maxSeriesSize {
|
|
|
|
continue
|
|
|
|
}
|
|
|
|
|
|
|
|
jm.AddStatisticsSeries()
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
nodeScopeRequested := false
|
|
|
|
for _, scope := range scopes {
|
|
|
|
if scope == schema.MetricScopeNode {
|
|
|
|
nodeScopeRequested = true
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
if nodeScopeRequested {
|
|
|
|
jobData.AddNodeScope("flops_any")
|
|
|
|
jobData.AddNodeScope("mem_bw")
|
|
|
|
}
|
2022-01-17 13:33:35 +01:00
|
|
|
}
|