cc-backend/internal/metricdata/cc-metric-store.go

1140 lines
32 KiB
Go
Raw Normal View History

2024-04-11 23:04:30 +02:00
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
2021-11-26 10:32:36 +01:00
package metricdata
import (
"bufio"
2021-11-26 10:32:36 +01:00
"bytes"
"context"
"encoding/json"
"fmt"
"net/http"
"sort"
2022-05-04 09:22:55 +02:00
"strconv"
"strings"
2021-11-26 10:32:36 +01:00
"time"
"github.com/ClusterCockpit/cc-backend/internal/graph/model"
"github.com/ClusterCockpit/cc-backend/pkg/archive"
2023-01-24 12:03:36 +01:00
"github.com/ClusterCockpit/cc-backend/pkg/log"
2022-06-21 17:52:36 +02:00
"github.com/ClusterCockpit/cc-backend/pkg/schema"
2021-11-26 10:32:36 +01:00
)
type CCMetricStoreConfig struct {
Kind string `json:"kind"`
Url string `json:"url"`
Token string `json:"token"`
// If metrics are known to this MetricDataRepository under a different
// name than in the `metricConfig` section of the 'cluster.json',
// provide this optional mapping of local to remote name for this metric.
Renamings map[string]string `json:"metricRenamings"`
}
2021-11-26 10:32:36 +01:00
type CCMetricStore struct {
here2there map[string]string
there2here map[string]string
client http.Client
2022-01-20 10:43:46 +01:00
jwt string
url string
queryEndpoint string
2021-11-26 10:32:36 +01:00
}
2022-01-20 10:43:46 +01:00
type ApiQueryRequest struct {
Cluster string `json:"cluster"`
Queries []ApiQuery `json:"queries"`
ForAllNodes []string `json:"for-all-nodes"`
2022-01-20 10:43:46 +01:00
From int64 `json:"from"`
To int64 `json:"to"`
WithStats bool `json:"with-stats"`
WithData bool `json:"with-data"`
2021-11-26 10:32:36 +01:00
}
2022-01-12 13:03:01 +01:00
type ApiQuery struct {
Type *string `json:"type,omitempty"`
SubType *string `json:"subtype,omitempty"`
2022-05-04 09:22:55 +02:00
Metric string `json:"metric"`
Hostname string `json:"host"`
Resolution int `json:"resolution"`
2022-05-04 09:22:55 +02:00
TypeIds []string `json:"type-ids,omitempty"`
SubTypeIds []string `json:"subtype-ids,omitempty"`
Aggregate bool `json:"aggreg"`
2022-01-12 13:03:01 +01:00
}
type ApiQueryResponse struct {
Queries []ApiQuery `json:"queries,omitempty"`
Results [][]ApiMetricData `json:"results"`
}
2021-11-26 10:32:36 +01:00
type ApiMetricData struct {
Error *string `json:"error"`
Data []schema.Float `json:"data"`
From int64 `json:"from"`
To int64 `json:"to"`
Resolution int `json:"resolution"`
Avg schema.Float `json:"avg"`
Min schema.Float `json:"min"`
Max schema.Float `json:"max"`
2021-11-26 10:32:36 +01:00
}
func (ccms *CCMetricStore) Init(rawConfig json.RawMessage) error {
var config CCMetricStoreConfig
if err := json.Unmarshal(rawConfig, &config); err != nil {
log.Warn("Error while unmarshaling raw json config")
return err
}
ccms.url = config.Url
ccms.queryEndpoint = fmt.Sprintf("%s/api/query", config.Url)
ccms.jwt = config.Token
2022-01-12 13:03:01 +01:00
ccms.client = http.Client{
2022-03-29 15:22:17 +02:00
Timeout: 10 * time.Second,
2022-01-12 13:03:01 +01:00
}
2022-01-24 10:06:25 +01:00
if config.Renamings != nil {
ccms.here2there = config.Renamings
ccms.there2here = make(map[string]string, len(config.Renamings))
2022-01-24 10:06:25 +01:00
for k, v := range ccms.here2there {
ccms.there2here[v] = k
}
} else {
ccms.here2there = make(map[string]string)
ccms.there2here = make(map[string]string)
}
2021-11-26 10:32:36 +01:00
return nil
}
2022-01-24 10:06:25 +01:00
func (ccms *CCMetricStore) toRemoteName(metric string) string {
if renamed, ok := ccms.here2there[metric]; ok {
return renamed
}
return metric
}
func (ccms *CCMetricStore) toLocalName(metric string) string {
if renamed, ok := ccms.there2here[metric]; ok {
return renamed
}
return metric
}
func (ccms *CCMetricStore) doRequest(
ctx context.Context,
body *ApiQueryRequest,
) (*ApiQueryResponse, error) {
2022-01-20 10:43:46 +01:00
buf := &bytes.Buffer{}
if err := json.NewEncoder(buf).Encode(body); err != nil {
log.Warn("Error while encoding request body")
2021-11-26 10:32:36 +01:00
return nil, err
}
req, err := http.NewRequestWithContext(ctx, http.MethodGet, ccms.queryEndpoint, buf)
2021-11-26 10:32:36 +01:00
if err != nil {
log.Warn("Error while building request body")
2021-11-26 10:32:36 +01:00
return nil, err
}
2021-12-09 16:26:59 +01:00
if ccms.jwt != "" {
req.Header.Add("Authorization", fmt.Sprintf("Bearer %s", ccms.jwt))
}
2021-11-26 10:32:36 +01:00
2024-09-17 14:36:42 +02:00
// versioning the cc-metric-store query API.
// v2 = data with resampling
// v1 = data without resampling
q := req.URL.Query()
q.Add("version", "v2")
req.URL.RawQuery = q.Encode()
2022-01-20 10:43:46 +01:00
res, err := ccms.client.Do(req)
2022-01-12 13:03:01 +01:00
if err != nil {
log.Error("Error while performing request")
2022-01-12 13:03:01 +01:00
return nil, err
}
2022-01-20 10:43:46 +01:00
if res.StatusCode != http.StatusOK {
return nil, fmt.Errorf("'%s': HTTP Status: %s", ccms.queryEndpoint, res.Status)
}
var resBody ApiQueryResponse
2022-01-20 10:43:46 +01:00
if err := json.NewDecoder(bufio.NewReader(res.Body)).Decode(&resBody); err != nil {
log.Warn("Error while decoding result body")
return nil, err
}
return &resBody, nil
2022-01-20 10:43:46 +01:00
}
func (ccms *CCMetricStore) LoadData(
job *schema.Job,
metrics []string,
scopes []schema.MetricScope,
ctx context.Context,
resolution int,
) (schema.JobData, error) {
queries, assignedScope, err := ccms.buildQueries(job, metrics, scopes, resolution)
if err != nil {
log.Warn("Error while building queries")
return nil, err
}
2022-01-20 10:43:46 +01:00
req := ApiQueryRequest{
Cluster: job.Cluster,
From: job.StartTime.Unix(),
To: job.StartTime.Add(time.Duration(job.Duration) * time.Second).Unix(),
Queries: queries,
WithStats: true,
WithData: true,
}
2022-01-20 10:43:46 +01:00
resBody, err := ccms.doRequest(ctx, &req)
if err != nil {
log.Error("Error while performing request")
return nil, err
}
var errors []string
jobData := make(schema.JobData)
for i, row := range resBody.Results {
2022-01-20 10:43:46 +01:00
query := req.Queries[i]
2022-01-24 10:06:25 +01:00
metric := ccms.toLocalName(query.Metric)
2022-01-20 10:43:46 +01:00
scope := assignedScope[i]
mc := archive.GetMetricConfig(job.Cluster, metric)
2022-01-24 10:06:25 +01:00
if _, ok := jobData[metric]; !ok {
jobData[metric] = make(map[schema.MetricScope]*schema.JobMetric)
}
2024-09-17 14:36:42 +02:00
res := row[0].Resolution
if res == 0 {
res = mc.Timestep
}
2022-01-24 10:06:25 +01:00
jobMetric, ok := jobData[metric][scope]
if !ok {
jobMetric = &schema.JobMetric{
Unit: mc.Unit,
2024-09-17 14:36:42 +02:00
Timestep: res,
Series: make([]schema.Series, 0),
}
2022-01-24 10:06:25 +01:00
jobData[metric][scope] = jobMetric
}
for ndx, res := range row {
2022-01-20 10:43:46 +01:00
if res.Error != nil {
/* Build list for "partial errors", if any */
errors = append(errors, fmt.Sprintf("failed to fetch '%s' from host '%s': %s", query.Metric, query.Hostname, *res.Error))
continue
2022-01-20 10:43:46 +01:00
}
id := (*string)(nil)
2022-01-20 10:43:46 +01:00
if query.Type != nil {
id = new(string)
*id = query.TypeIds[ndx]
2022-01-20 10:43:46 +01:00
}
if res.Avg.IsNaN() || res.Min.IsNaN() || res.Max.IsNaN() {
// "schema.Float()" because regular float64 can not be JSONed when NaN.
2022-01-20 10:43:46 +01:00
res.Avg = schema.Float(0)
res.Min = schema.Float(0)
res.Max = schema.Float(0)
}
2022-01-20 10:43:46 +01:00
jobMetric.Series = append(jobMetric.Series, schema.Series{
Hostname: query.Hostname,
Id: id,
Statistics: schema.MetricStatistics{
2022-01-20 10:43:46 +01:00
Avg: float64(res.Avg),
Min: float64(res.Min),
Max: float64(res.Max),
},
Data: res.Data,
})
}
// So that one can later check len(jobData):
if len(jobMetric.Series) == 0 {
delete(jobData[metric], scope)
if len(jobData[metric]) == 0 {
delete(jobData, metric)
}
}
2021-11-26 10:32:36 +01:00
}
if len(errors) != 0 {
/* Returns list for "partial errors" */
return jobData, fmt.Errorf("METRICDATA/CCMS > Errors: %s", strings.Join(errors, ", "))
}
2021-11-26 10:32:36 +01:00
return jobData, nil
}
2022-01-12 13:03:01 +01:00
var (
hwthreadString = string(schema.MetricScopeHWThread)
coreString = string(schema.MetricScopeCore)
memoryDomainString = string(schema.MetricScopeMemoryDomain)
socketString = string(schema.MetricScopeSocket)
acceleratorString = string(schema.MetricScopeAccelerator)
2022-01-12 13:03:01 +01:00
)
func (ccms *CCMetricStore) buildQueries(
job *schema.Job,
metrics []string,
scopes []schema.MetricScope,
resolution int,
) ([]ApiQuery, []schema.MetricScope, error) {
2022-01-12 13:03:01 +01:00
queries := make([]ApiQuery, 0, len(metrics)*len(scopes)*len(job.Resources))
assignedScope := []schema.MetricScope{}
2022-01-12 13:03:01 +01:00
subcluster, scerr := archive.GetSubCluster(job.Cluster, job.SubCluster)
if scerr != nil {
return nil, nil, scerr
}
topology := subcluster.Topology
2022-01-12 13:03:01 +01:00
for _, metric := range metrics {
2022-01-24 10:06:25 +01:00
remoteName := ccms.toRemoteName(metric)
mc := archive.GetMetricConfig(job.Cluster, metric)
2022-01-12 13:03:01 +01:00
if mc == nil {
// return nil, fmt.Errorf("METRICDATA/CCMS > metric '%s' is not specified for cluster '%s'", metric, job.Cluster)
2023-02-15 11:50:51 +01:00
log.Infof("metric '%s' is not specified for cluster '%s'", metric, job.Cluster)
2022-01-12 13:03:01 +01:00
continue
}
// Avoid duplicates...
handledScopes := make([]schema.MetricScope, 0, 3)
2022-01-12 13:03:01 +01:00
scopesLoop:
for _, requestedScope := range scopes {
nativeScope := mc.Scope
2023-05-04 07:00:30 +02:00
if nativeScope == schema.MetricScopeAccelerator && job.NumAcc == 0 {
continue
}
scope := nativeScope.Max(requestedScope)
for _, s := range handledScopes {
if scope == s {
continue scopesLoop
}
2022-01-12 13:03:01 +01:00
}
handledScopes = append(handledScopes, scope)
2022-01-12 13:03:01 +01:00
for _, host := range job.Resources {
hwthreads := host.HWThreads
if hwthreads == nil {
hwthreads = topology.Node
}
2022-01-12 13:03:01 +01:00
// Accelerator -> Accelerator (Use "accelerator" scope if requested scope is lower than node)
if nativeScope == schema.MetricScopeAccelerator && scope.LT(schema.MetricScopeNode) {
if scope != schema.MetricScopeAccelerator {
// Skip all other catched cases
continue
}
2022-01-20 10:43:46 +01:00
queries = append(queries, ApiQuery{
Metric: remoteName,
Hostname: host.Hostname,
Aggregate: false,
Type: &acceleratorString,
TypeIds: host.Accelerators,
Resolution: resolution,
2022-01-20 10:43:46 +01:00
})
assignedScope = append(assignedScope, schema.MetricScopeAccelerator)
continue
}
2022-01-12 13:03:01 +01:00
// Accelerator -> Node
if nativeScope == schema.MetricScopeAccelerator && scope == schema.MetricScopeNode {
if len(host.Accelerators) == 0 {
continue
}
2022-01-12 13:03:01 +01:00
queries = append(queries, ApiQuery{
Metric: remoteName,
Hostname: host.Hostname,
Aggregate: true,
Type: &acceleratorString,
TypeIds: host.Accelerators,
Resolution: resolution,
})
2022-01-20 10:43:46 +01:00
assignedScope = append(assignedScope, scope)
continue
}
2022-01-12 13:03:01 +01:00
// HWThread -> HWThead
if nativeScope == schema.MetricScopeHWThread && scope == schema.MetricScopeHWThread {
2022-01-20 10:43:46 +01:00
queries = append(queries, ApiQuery{
Metric: remoteName,
Hostname: host.Hostname,
Aggregate: false,
Type: &hwthreadString,
TypeIds: intToStringSlice(hwthreads),
Resolution: resolution,
2022-01-20 10:43:46 +01:00
})
assignedScope = append(assignedScope, scope)
continue
}
2022-01-12 13:03:01 +01:00
// HWThread -> Core
if nativeScope == schema.MetricScopeHWThread && scope == schema.MetricScopeCore {
cores, _ := topology.GetCoresFromHWThreads(hwthreads)
for _, core := range cores {
queries = append(queries, ApiQuery{
Metric: remoteName,
Hostname: host.Hostname,
Aggregate: true,
Type: &hwthreadString,
TypeIds: intToStringSlice(topology.Core[core]),
Resolution: resolution,
})
2022-01-20 10:43:46 +01:00
assignedScope = append(assignedScope, scope)
}
continue
}
2022-01-12 13:03:01 +01:00
// HWThread -> Socket
if nativeScope == schema.MetricScopeHWThread && scope == schema.MetricScopeSocket {
sockets, _ := topology.GetSocketsFromHWThreads(hwthreads)
for _, socket := range sockets {
queries = append(queries, ApiQuery{
Metric: remoteName,
Hostname: host.Hostname,
Aggregate: true,
Type: &hwthreadString,
TypeIds: intToStringSlice(topology.Socket[socket]),
Resolution: resolution,
})
2022-01-20 10:43:46 +01:00
assignedScope = append(assignedScope, scope)
}
continue
}
2022-01-12 13:03:01 +01:00
// HWThread -> Node
if nativeScope == schema.MetricScopeHWThread && scope == schema.MetricScopeNode {
queries = append(queries, ApiQuery{
Metric: remoteName,
Hostname: host.Hostname,
Aggregate: true,
Type: &hwthreadString,
TypeIds: intToStringSlice(hwthreads),
Resolution: resolution,
2022-01-20 10:43:46 +01:00
})
assignedScope = append(assignedScope, scope)
continue
}
// Core -> Core
if nativeScope == schema.MetricScopeCore && scope == schema.MetricScopeCore {
cores, _ := topology.GetCoresFromHWThreads(hwthreads)
queries = append(queries, ApiQuery{
Metric: remoteName,
Hostname: host.Hostname,
Aggregate: false,
Type: &coreString,
TypeIds: intToStringSlice(cores),
Resolution: resolution,
2022-01-20 10:43:46 +01:00
})
assignedScope = append(assignedScope, scope)
continue
}
// Core -> Node
if nativeScope == schema.MetricScopeCore && scope == schema.MetricScopeNode {
cores, _ := topology.GetCoresFromHWThreads(hwthreads)
queries = append(queries, ApiQuery{
Metric: remoteName,
Hostname: host.Hostname,
Aggregate: true,
Type: &coreString,
TypeIds: intToStringSlice(cores),
Resolution: resolution,
})
2022-01-20 10:43:46 +01:00
assignedScope = append(assignedScope, scope)
continue
}
2022-01-12 13:03:01 +01:00
// MemoryDomain -> MemoryDomain
if nativeScope == schema.MetricScopeMemoryDomain && scope == schema.MetricScopeMemoryDomain {
sockets, _ := topology.GetMemoryDomainsFromHWThreads(hwthreads)
queries = append(queries, ApiQuery{
Metric: remoteName,
Hostname: host.Hostname,
Aggregate: false,
Type: &memoryDomainString,
TypeIds: intToStringSlice(sockets),
Resolution: resolution,
})
assignedScope = append(assignedScope, scope)
continue
}
// MemoryDoman -> Node
if nativeScope == schema.MetricScopeMemoryDomain && scope == schema.MetricScopeNode {
sockets, _ := topology.GetMemoryDomainsFromHWThreads(hwthreads)
queries = append(queries, ApiQuery{
Metric: remoteName,
Hostname: host.Hostname,
Aggregate: true,
Type: &memoryDomainString,
TypeIds: intToStringSlice(sockets),
Resolution: resolution,
})
assignedScope = append(assignedScope, scope)
continue
}
// Socket -> Socket
if nativeScope == schema.MetricScopeSocket && scope == schema.MetricScopeSocket {
sockets, _ := topology.GetSocketsFromHWThreads(hwthreads)
2022-01-20 10:43:46 +01:00
queries = append(queries, ApiQuery{
Metric: remoteName,
Hostname: host.Hostname,
Aggregate: false,
Type: &socketString,
TypeIds: intToStringSlice(sockets),
Resolution: resolution,
2022-01-20 10:43:46 +01:00
})
assignedScope = append(assignedScope, scope)
continue
}
2022-01-12 13:03:01 +01:00
// Socket -> Node
if nativeScope == schema.MetricScopeSocket && scope == schema.MetricScopeNode {
sockets, _ := topology.GetSocketsFromHWThreads(hwthreads)
queries = append(queries, ApiQuery{
Metric: remoteName,
Hostname: host.Hostname,
Aggregate: true,
Type: &socketString,
TypeIds: intToStringSlice(sockets),
Resolution: resolution,
})
2022-01-20 10:43:46 +01:00
assignedScope = append(assignedScope, scope)
continue
}
2022-01-12 13:03:01 +01:00
// Node -> Node
if nativeScope == schema.MetricScopeNode && scope == schema.MetricScopeNode {
2022-01-12 13:03:01 +01:00
queries = append(queries, ApiQuery{
Metric: remoteName,
Hostname: host.Hostname,
Resolution: resolution,
2022-01-12 13:03:01 +01:00
})
2022-01-20 10:43:46 +01:00
assignedScope = append(assignedScope, scope)
continue
2022-01-12 13:03:01 +01:00
}
return nil, nil, fmt.Errorf("METRICDATA/CCMS > TODO: unhandled case: native-scope=%s, requested-scope=%s", nativeScope, requestedScope)
2022-01-12 13:03:01 +01:00
}
}
}
return queries, assignedScope, nil
}
func (ccms *CCMetricStore) LoadStats(
job *schema.Job,
metrics []string,
ctx context.Context,
) (map[string]map[string]schema.MetricStatistics, error) {
2024-08-25 16:13:43 +02:00
// metricConfigs := archive.GetCluster(job.Cluster).MetricConfig
// resolution := 9000
2024-08-25 16:13:43 +02:00
// for _, mc := range metricConfigs {
// resolution = min(resolution, mc.Timestep)
// }
2024-08-25 16:13:43 +02:00
queries, _, err := ccms.buildQueries(job, metrics, []schema.MetricScope{schema.MetricScopeNode}, 0) // #166 Add scope shere for analysis view accelerator normalization?
if err != nil {
log.Warn("Error while building query")
return nil, err
}
2022-01-20 10:43:46 +01:00
req := ApiQueryRequest{
Cluster: job.Cluster,
From: job.StartTime.Unix(),
To: job.StartTime.Add(time.Duration(job.Duration) * time.Second).Unix(),
Queries: queries,
WithStats: true,
WithData: false,
}
2022-01-20 10:43:46 +01:00
resBody, err := ccms.doRequest(ctx, &req)
if err != nil {
log.Error("Error while performing request")
2022-01-20 10:43:46 +01:00
return nil, err
}
2022-01-20 10:43:46 +01:00
stats := make(map[string]map[string]schema.MetricStatistics, len(metrics))
for i, res := range resBody.Results {
2022-01-20 10:43:46 +01:00
query := req.Queries[i]
2022-01-24 10:06:25 +01:00
metric := ccms.toLocalName(query.Metric)
2022-01-20 10:43:46 +01:00
data := res[0]
if data.Error != nil {
log.Infof("fetching %s for node %s failed: %s", metric, query.Hostname, *data.Error)
continue
// return nil, fmt.Errorf("METRICDATA/CCMS > fetching %s for node %s failed: %s", metric, query.Hostname, *data.Error)
2022-01-20 10:43:46 +01:00
}
2022-01-24 10:06:25 +01:00
metricdata, ok := stats[metric]
2022-01-20 10:43:46 +01:00
if !ok {
metricdata = make(map[string]schema.MetricStatistics, job.NumNodes)
2022-01-24 10:06:25 +01:00
stats[metric] = metricdata
2022-01-20 10:43:46 +01:00
}
2022-01-20 10:43:46 +01:00
if data.Avg.IsNaN() || data.Min.IsNaN() || data.Max.IsNaN() {
log.Infof("fetching %s for node %s failed: one of avg/min/max is NaN", metric, query.Hostname)
continue
// return nil, fmt.Errorf("METRICDATA/CCMS > fetching %s for node %s failed: %s", metric, query.Hostname, "avg/min/max is NaN")
}
2022-01-20 10:43:46 +01:00
metricdata[query.Hostname] = schema.MetricStatistics{
Avg: float64(data.Avg),
Min: float64(data.Min),
Max: float64(data.Max),
}
}
return stats, nil
}
// TODO: Support sub-node-scope metrics! For this, the partition of a node needs to be known!
func (ccms *CCMetricStore) LoadNodeData(
cluster string,
metrics, nodes []string,
scopes []schema.MetricScope,
from, to time.Time,
ctx context.Context,
) (map[string]map[string][]*schema.JobMetric, error) {
2022-01-20 10:43:46 +01:00
req := ApiQueryRequest{
Cluster: cluster,
From: from.Unix(),
To: to.Unix(),
WithStats: true,
2022-01-20 10:43:46 +01:00
WithData: true,
}
if nodes == nil {
for _, metric := range metrics {
req.ForAllNodes = append(req.ForAllNodes, ccms.toRemoteName(metric))
}
} else {
2022-01-20 10:43:46 +01:00
for _, node := range nodes {
for _, metric := range metrics {
req.Queries = append(req.Queries, ApiQuery{
Hostname: node,
Metric: ccms.toRemoteName(metric),
Resolution: 60, // Default for Node Queries
2022-01-20 10:43:46 +01:00
})
}
}
2021-12-09 16:26:59 +01:00
}
2022-01-20 10:43:46 +01:00
resBody, err := ccms.doRequest(ctx, &req)
if err != nil {
2024-09-17 14:36:42 +02:00
log.Error(fmt.Sprintf("Error while performing request %#v\n", err))
return nil, err
}
var errors []string
data := make(map[string]map[string][]*schema.JobMetric)
for i, res := range resBody.Results {
var query ApiQuery
if resBody.Queries != nil {
query = resBody.Queries[i]
} else {
query = req.Queries[i]
}
metric := ccms.toLocalName(query.Metric)
2022-01-20 10:43:46 +01:00
qdata := res[0]
if qdata.Error != nil {
/* Build list for "partial errors", if any */
errors = append(errors, fmt.Sprintf("fetching %s for node %s failed: %s", metric, query.Hostname, *qdata.Error))
}
if qdata.Avg.IsNaN() || qdata.Min.IsNaN() || qdata.Max.IsNaN() {
// return nil, fmt.Errorf("METRICDATA/CCMS > fetching %s for node %s failed: %s", metric, query.Hostname, "avg/min/max is NaN")
qdata.Avg, qdata.Min, qdata.Max = 0., 0., 0.
}
hostdata, ok := data[query.Hostname]
2022-01-20 10:43:46 +01:00
if !ok {
hostdata = make(map[string][]*schema.JobMetric)
data[query.Hostname] = hostdata
}
mc := archive.GetMetricConfig(cluster, metric)
2022-02-08 11:03:32 +01:00
hostdata[metric] = append(hostdata[metric], &schema.JobMetric{
Unit: mc.Unit,
Timestep: mc.Timestep,
Series: []schema.Series{
{
Hostname: query.Hostname,
Data: qdata.Data,
Statistics: schema.MetricStatistics{
Avg: float64(qdata.Avg),
Min: float64(qdata.Min),
Max: float64(qdata.Max),
},
},
},
})
}
if len(errors) != 0 {
/* Returns list of "partial errors" */
return data, fmt.Errorf("METRICDATA/CCMS > Errors: %s", strings.Join(errors, ", "))
}
return data, nil
}
2022-05-04 09:22:55 +02:00
func (ccms *CCMetricStore) LoadNodeListData(
cluster, subCluster, nodeFilter string,
metrics []string,
scopes []schema.MetricScope,
resolution int,
from, to time.Time,
page *model.PageRequest,
ctx context.Context,
) (map[string]map[string]map[schema.MetricScope]*schema.JobMetric, int, bool, error) {
// 0) Init additional vars
var totalNodes int = 0
var hasNextPage bool = false
// 1) Get list of all nodes
var nodes []string
if subCluster != "" {
scNodes := archive.NodeLists[cluster][subCluster]
nodes = scNodes.PrintList()
} else {
subClusterNodeLists := archive.NodeLists[cluster]
for _, nodeList := range subClusterNodeLists {
nodes = append(nodes, nodeList.PrintList()...)
}
}
// 2) Filter nodes
if nodeFilter != "" {
filteredNodes := []string{}
for _, node := range nodes {
if strings.Contains(node, nodeFilter) {
filteredNodes = append(filteredNodes, node)
}
}
nodes = filteredNodes
}
// 2.1) Count total nodes && Sort nodes -> Sorting invalidated after ccms return ...
totalNodes = len(nodes)
sort.Strings(nodes)
// 3) Apply paging
if len(nodes) > page.ItemsPerPage {
start := (page.Page - 1) * page.ItemsPerPage
end := start + page.ItemsPerPage
if end > len(nodes) {
end = len(nodes)
hasNextPage = false
} else {
hasNextPage = true
}
nodes = nodes[start:end]
}
queries, assignedScope, err := ccms.buildNodeQueries(cluster, subCluster, nodes, metrics, scopes, resolution)
if err != nil {
log.Warn("Error while building queries")
return nil, totalNodes, hasNextPage, err
}
req := ApiQueryRequest{
Cluster: cluster,
Queries: queries,
From: from.Unix(),
To: to.Unix(),
WithStats: true,
WithData: true,
}
resBody, err := ccms.doRequest(ctx, &req)
if err != nil {
log.Error(fmt.Sprintf("Error while performing request %#v\n", err))
return nil, totalNodes, hasNextPage, err
}
var errors []string
data := make(map[string]map[string]map[schema.MetricScope]*schema.JobMetric)
for i, row := range resBody.Results {
var query ApiQuery
if resBody.Queries != nil {
query = resBody.Queries[i]
} else {
query = req.Queries[i]
}
// qdata := res[0]
metric := ccms.toLocalName(query.Metric)
scope := assignedScope[i]
mc := archive.GetMetricConfig(cluster, metric)
res := row[0].Resolution
if res == 0 {
res = mc.Timestep
}
// Init Nested Map Data Structures If Not Found
hostData, ok := data[query.Hostname]
if !ok {
hostData = make(map[string]map[schema.MetricScope]*schema.JobMetric)
data[query.Hostname] = hostData
}
metricData, ok := hostData[metric]
if !ok {
metricData = make(map[schema.MetricScope]*schema.JobMetric)
data[query.Hostname][metric] = metricData
}
scopeData, ok := metricData[scope]
if !ok {
scopeData = &schema.JobMetric{
Unit: mc.Unit,
Timestep: res,
Series: make([]schema.Series, 0),
}
data[query.Hostname][metric][scope] = scopeData
}
for ndx, res := range row {
if res.Error != nil {
/* Build list for "partial errors", if any */
errors = append(errors, fmt.Sprintf("failed to fetch '%s' from host '%s': %s", query.Metric, query.Hostname, *res.Error))
continue
}
id := (*string)(nil)
if query.Type != nil {
id = new(string)
*id = query.TypeIds[ndx]
}
if res.Avg.IsNaN() || res.Min.IsNaN() || res.Max.IsNaN() {
// "schema.Float()" because regular float64 can not be JSONed when NaN.
res.Avg = schema.Float(0)
res.Min = schema.Float(0)
res.Max = schema.Float(0)
}
scopeData.Series = append(scopeData.Series, schema.Series{
Hostname: query.Hostname,
Id: id,
Statistics: schema.MetricStatistics{
Avg: float64(res.Avg),
Min: float64(res.Min),
Max: float64(res.Max),
},
Data: res.Data,
})
}
}
if len(errors) != 0 {
/* Returns list of "partial errors" */
return data, totalNodes, hasNextPage, fmt.Errorf("METRICDATA/CCMS > Errors: %s", strings.Join(errors, ", "))
}
return data, totalNodes, hasNextPage, nil
}
func (ccms *CCMetricStore) buildNodeQueries(
cluster string,
subCluster string,
nodes []string,
metrics []string,
scopes []schema.MetricScope,
resolution int,
) ([]ApiQuery, []schema.MetricScope, error) {
queries := make([]ApiQuery, 0, len(metrics)*len(scopes)*len(nodes))
assignedScope := []schema.MetricScope{}
// Get Topol before loop if subCluster given
var subClusterTopol *schema.SubCluster
var scterr error
if subCluster != "" {
subClusterTopol, scterr = archive.GetSubCluster(cluster, subCluster)
if scterr != nil {
// TODO: Log
return nil, nil, scterr
}
}
for _, metric := range metrics {
remoteName := ccms.toRemoteName(metric)
mc := archive.GetMetricConfig(cluster, metric)
if mc == nil {
// return nil, fmt.Errorf("METRICDATA/CCMS > metric '%s' is not specified for cluster '%s'", metric, cluster)
log.Infof("metric '%s' is not specified for cluster '%s'", metric, cluster)
continue
}
// Avoid duplicates...
handledScopes := make([]schema.MetricScope, 0, 3)
scopesLoop:
for _, requestedScope := range scopes {
nativeScope := mc.Scope
scope := nativeScope.Max(requestedScope)
for _, s := range handledScopes {
if scope == s {
continue scopesLoop
}
}
handledScopes = append(handledScopes, scope)
for _, hostname := range nodes {
// If no subCluster given, get it by node
if subCluster == "" {
subClusterName, scnerr := archive.GetSubClusterByNode(cluster, hostname)
if scnerr != nil {
return nil, nil, scnerr
}
subClusterTopol, scterr = archive.GetSubCluster(cluster, subClusterName)
if scterr != nil {
return nil, nil, scterr
}
}
// Always full node hwthread id list, no partial queries expected -> Use "topology.Node" directly where applicable
// Always full accelerator id list, no partial queries expected -> Use "acceleratorIds" directly where applicable
topology := subClusterTopol.Topology
acceleratorIds := topology.GetAcceleratorIDs()
// Moved check here if metric matches hardware specs
if nativeScope == schema.MetricScopeAccelerator && len(acceleratorIds) == 0 {
continue scopesLoop
}
// Accelerator -> Accelerator (Use "accelerator" scope if requested scope is lower than node)
if nativeScope == schema.MetricScopeAccelerator && scope.LT(schema.MetricScopeNode) {
if scope != schema.MetricScopeAccelerator {
// Skip all other catched cases
continue
}
queries = append(queries, ApiQuery{
Metric: remoteName,
Hostname: hostname,
Aggregate: false,
Type: &acceleratorString,
TypeIds: acceleratorIds,
Resolution: resolution,
})
assignedScope = append(assignedScope, schema.MetricScopeAccelerator)
continue
}
// Accelerator -> Node
if nativeScope == schema.MetricScopeAccelerator && scope == schema.MetricScopeNode {
if len(acceleratorIds) == 0 {
continue
}
queries = append(queries, ApiQuery{
Metric: remoteName,
Hostname: hostname,
Aggregate: true,
Type: &acceleratorString,
TypeIds: acceleratorIds,
Resolution: resolution,
})
assignedScope = append(assignedScope, scope)
continue
}
// HWThread -> HWThead
if nativeScope == schema.MetricScopeHWThread && scope == schema.MetricScopeHWThread {
queries = append(queries, ApiQuery{
Metric: remoteName,
Hostname: hostname,
Aggregate: false,
Type: &hwthreadString,
TypeIds: intToStringSlice(topology.Node),
Resolution: resolution,
})
assignedScope = append(assignedScope, scope)
continue
}
// HWThread -> Core
if nativeScope == schema.MetricScopeHWThread && scope == schema.MetricScopeCore {
cores, _ := topology.GetCoresFromHWThreads(topology.Node)
for _, core := range cores {
queries = append(queries, ApiQuery{
Metric: remoteName,
Hostname: hostname,
Aggregate: true,
Type: &hwthreadString,
TypeIds: intToStringSlice(topology.Core[core]),
Resolution: resolution,
})
assignedScope = append(assignedScope, scope)
}
continue
}
// HWThread -> Socket
if nativeScope == schema.MetricScopeHWThread && scope == schema.MetricScopeSocket {
sockets, _ := topology.GetSocketsFromHWThreads(topology.Node)
for _, socket := range sockets {
queries = append(queries, ApiQuery{
Metric: remoteName,
Hostname: hostname,
Aggregate: true,
Type: &hwthreadString,
TypeIds: intToStringSlice(topology.Socket[socket]),
Resolution: resolution,
})
assignedScope = append(assignedScope, scope)
}
continue
}
// HWThread -> Node
if nativeScope == schema.MetricScopeHWThread && scope == schema.MetricScopeNode {
queries = append(queries, ApiQuery{
Metric: remoteName,
Hostname: hostname,
Aggregate: true,
Type: &hwthreadString,
TypeIds: intToStringSlice(topology.Node),
Resolution: resolution,
})
assignedScope = append(assignedScope, scope)
continue
}
// Core -> Core
if nativeScope == schema.MetricScopeCore && scope == schema.MetricScopeCore {
cores, _ := topology.GetCoresFromHWThreads(topology.Node)
queries = append(queries, ApiQuery{
Metric: remoteName,
Hostname: hostname,
Aggregate: false,
Type: &coreString,
TypeIds: intToStringSlice(cores),
Resolution: resolution,
})
assignedScope = append(assignedScope, scope)
continue
}
// Core -> Node
if nativeScope == schema.MetricScopeCore && scope == schema.MetricScopeNode {
cores, _ := topology.GetCoresFromHWThreads(topology.Node)
queries = append(queries, ApiQuery{
Metric: remoteName,
Hostname: hostname,
Aggregate: true,
Type: &coreString,
TypeIds: intToStringSlice(cores),
Resolution: resolution,
})
assignedScope = append(assignedScope, scope)
continue
}
// MemoryDomain -> MemoryDomain
if nativeScope == schema.MetricScopeMemoryDomain && scope == schema.MetricScopeMemoryDomain {
sockets, _ := topology.GetMemoryDomainsFromHWThreads(topology.Node)
queries = append(queries, ApiQuery{
Metric: remoteName,
Hostname: hostname,
Aggregate: false,
Type: &memoryDomainString,
TypeIds: intToStringSlice(sockets),
Resolution: resolution,
})
assignedScope = append(assignedScope, scope)
continue
}
// MemoryDoman -> Node
if nativeScope == schema.MetricScopeMemoryDomain && scope == schema.MetricScopeNode {
sockets, _ := topology.GetMemoryDomainsFromHWThreads(topology.Node)
queries = append(queries, ApiQuery{
Metric: remoteName,
Hostname: hostname,
Aggregate: true,
Type: &memoryDomainString,
TypeIds: intToStringSlice(sockets),
Resolution: resolution,
})
assignedScope = append(assignedScope, scope)
continue
}
// Socket -> Socket
if nativeScope == schema.MetricScopeSocket && scope == schema.MetricScopeSocket {
sockets, _ := topology.GetSocketsFromHWThreads(topology.Node)
queries = append(queries, ApiQuery{
Metric: remoteName,
Hostname: hostname,
Aggregate: false,
Type: &socketString,
TypeIds: intToStringSlice(sockets),
Resolution: resolution,
})
assignedScope = append(assignedScope, scope)
continue
}
// Socket -> Node
if nativeScope == schema.MetricScopeSocket && scope == schema.MetricScopeNode {
sockets, _ := topology.GetSocketsFromHWThreads(topology.Node)
queries = append(queries, ApiQuery{
Metric: remoteName,
Hostname: hostname,
Aggregate: true,
Type: &socketString,
TypeIds: intToStringSlice(sockets),
Resolution: resolution,
})
assignedScope = append(assignedScope, scope)
continue
}
// Node -> Node
if nativeScope == schema.MetricScopeNode && scope == schema.MetricScopeNode {
queries = append(queries, ApiQuery{
Metric: remoteName,
Hostname: hostname,
Resolution: resolution,
})
assignedScope = append(assignedScope, scope)
continue
}
return nil, nil, fmt.Errorf("METRICDATA/CCMS > TODO: unhandled case: native-scope=%s, requested-scope=%s", nativeScope, requestedScope)
}
}
}
return queries, assignedScope, nil
}
2022-05-04 09:22:55 +02:00
func intToStringSlice(is []int) []string {
2022-05-05 10:03:54 +02:00
ss := make([]string, len(is))
2022-05-04 09:22:55 +02:00
for i, x := range is {
ss[i] = strconv.Itoa(x)
}
return ss
}