// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
2021-11-26 10:32:36 +01:00
|
|
|
package metricdata
|
|
|
|
|
|
|
|
import (
|
2022-01-07 09:47:41 +01:00
|
|
|
"bufio"
|
2021-11-26 10:32:36 +01:00
|
|
|
"bytes"
|
|
|
|
"context"
|
|
|
|
"encoding/json"
|
|
|
|
"fmt"
|
|
|
|
"net/http"
|
2022-05-04 09:22:55 +02:00
|
|
|
"strconv"
|
2022-03-01 14:29:04 +01:00
|
|
|
"strings"
|
2021-11-26 10:32:36 +01:00
|
|
|
"time"
|
|
|
|
|
2022-09-05 17:46:38 +02:00
|
|
|
"github.com/ClusterCockpit/cc-backend/pkg/archive"
|
2023-01-24 12:03:36 +01:00
|
|
|
"github.com/ClusterCockpit/cc-backend/pkg/log"
|
2022-06-21 17:52:36 +02:00
|
|
|
"github.com/ClusterCockpit/cc-backend/pkg/schema"
|
2021-11-26 10:32:36 +01:00
|
|
|
)
|
|
|
|
|
2022-03-17 16:15:35 +01:00
|
|
|
// CCMetricStoreConfig is the JSON configuration from which a CCMetricStore is
// initialized (see Init).
type CCMetricStoreConfig struct {
	// Kind is read from the "kind" key; it is not evaluated by Init.
	Kind string `json:"kind"`

	// Url is the base URL of the cc-metric-store; Init appends "/api/query"
	// to it to obtain the query endpoint.
	Url string `json:"url"`

	// Token is sent as bearer token with every request when non-empty.
	Token string `json:"token"`

	// If metrics are known to this MetricDataRepository under a different
	// name than in the `metricConfig` section of the 'cluster.json',
	// provide this optional mapping of local to remote name for this metric.
	Renamings map[string]string `json:"metricRenamings"`
}
|
|
|
|
|
2021-11-26 10:32:36 +01:00
|
|
|
// CCMetricStore is a metric data repository that fetches its data from a
// cc-metric-store instance via HTTP. All fields are populated by Init.
type CCMetricStore struct {
	// here2there maps local metric names to the names used by the remote
	// store; there2here is the inverse mapping.
	here2there map[string]string
	there2here map[string]string

	// client performs all HTTP requests.
	client http.Client

	// jwt is sent as "Authorization: Bearer <jwt>" when non-empty.
	jwt string

	// url is the configured base URL, queryEndpoint is url + "/api/query".
	url           string
	queryEndpoint string
}
|
|
|
|
|
2022-01-20 10:43:46 +01:00
|
|
|
type ApiQueryRequest struct {
|
|
|
|
Cluster string `json:"cluster"`
|
2024-03-22 08:59:35 +01:00
|
|
|
Queries []ApiQuery `json:"queries"`
|
|
|
|
ForAllNodes []string `json:"for-all-nodes"`
|
2022-01-20 10:43:46 +01:00
|
|
|
From int64 `json:"from"`
|
|
|
|
To int64 `json:"to"`
|
|
|
|
WithStats bool `json:"with-stats"`
|
|
|
|
WithData bool `json:"with-data"`
|
2021-11-26 10:32:36 +01:00
|
|
|
}
|
|
|
|
|
2022-01-12 13:03:01 +01:00
|
|
|
// ApiQuery describes a single query for one metric on one host.
//
// Type and TypeIds optionally restrict the query to sub-node entities
// (hwthreads, cores, sockets, memory domains, accelerators). The field order
// determines the order of the keys in the encoded JSON.
type ApiQuery struct {
	Type    *string `json:"type,omitempty"`
	SubType *string `json:"subtype,omitempty"`

	// Metric is the name of the metric as known to the remote store.
	Metric   string `json:"metric"`
	Hostname string `json:"host"`

	// Resolution is the resolution requested from the store.
	Resolution int `json:"resolution"`

	TypeIds    []string `json:"type-ids,omitempty"`
	SubTypeIds []string `json:"subtype-ids,omitempty"`

	// Aggregate is set by buildQueries whenever the requested scope is
	// coarser than the scope the selected entities are measured at.
	Aggregate bool `json:"aggreg"`
}
|
|
|
|
|
2022-02-02 13:04:38 +01:00
|
|
|
type ApiQueryResponse struct {
|
|
|
|
Queries []ApiQuery `json:"queries,omitempty"`
|
|
|
|
Results [][]ApiMetricData `json:"results"`
|
|
|
|
}
|
|
|
|
|
2021-11-26 10:32:36 +01:00
|
|
|
type ApiMetricData struct {
|
2024-08-22 14:29:51 +02:00
|
|
|
Error *string `json:"error"`
|
|
|
|
Data []schema.Float `json:"data"`
|
|
|
|
From int64 `json:"from"`
|
|
|
|
To int64 `json:"to"`
|
|
|
|
Resolution int `json:"resolution"`
|
|
|
|
Avg schema.Float `json:"avg"`
|
|
|
|
Min schema.Float `json:"min"`
|
|
|
|
Max schema.Float `json:"max"`
|
2021-11-26 10:32:36 +01:00
|
|
|
}
|
|
|
|
|
2022-03-17 16:15:35 +01:00
|
|
|
func (ccms *CCMetricStore) Init(rawConfig json.RawMessage) error {
|
|
|
|
var config CCMetricStoreConfig
|
|
|
|
if err := json.Unmarshal(rawConfig, &config); err != nil {
|
2023-02-01 11:58:27 +01:00
|
|
|
log.Warn("Error while unmarshaling raw json config")
|
2022-03-17 16:15:35 +01:00
|
|
|
return err
|
|
|
|
}
|
|
|
|
|
|
|
|
ccms.url = config.Url
|
2024-09-30 15:27:49 +02:00
|
|
|
ccms.queryEndpoint = fmt.Sprintf("%s/api/query", config.Url)
|
2022-03-17 16:15:35 +01:00
|
|
|
ccms.jwt = config.Token
|
2022-01-12 13:03:01 +01:00
|
|
|
ccms.client = http.Client{
|
2022-03-29 15:22:17 +02:00
|
|
|
Timeout: 10 * time.Second,
|
2022-01-12 13:03:01 +01:00
|
|
|
}
|
2022-01-24 10:06:25 +01:00
|
|
|
|
2022-03-17 16:15:35 +01:00
|
|
|
if config.Renamings != nil {
|
|
|
|
ccms.here2there = config.Renamings
|
|
|
|
ccms.there2here = make(map[string]string, len(config.Renamings))
|
2022-01-24 10:06:25 +01:00
|
|
|
for k, v := range ccms.here2there {
|
|
|
|
ccms.there2here[v] = k
|
|
|
|
}
|
|
|
|
} else {
|
|
|
|
ccms.here2there = make(map[string]string)
|
|
|
|
ccms.there2here = make(map[string]string)
|
|
|
|
}
|
|
|
|
|
2021-11-26 10:32:36 +01:00
|
|
|
return nil
|
|
|
|
}
|
|
|
|
|
2022-01-24 10:06:25 +01:00
|
|
|
func (ccms *CCMetricStore) toRemoteName(metric string) string {
|
|
|
|
if renamed, ok := ccms.here2there[metric]; ok {
|
|
|
|
return renamed
|
|
|
|
}
|
|
|
|
|
|
|
|
return metric
|
|
|
|
}
|
|
|
|
|
|
|
|
func (ccms *CCMetricStore) toLocalName(metric string) string {
|
|
|
|
if renamed, ok := ccms.there2here[metric]; ok {
|
|
|
|
return renamed
|
|
|
|
}
|
|
|
|
|
|
|
|
return metric
|
|
|
|
}
|
|
|
|
|
2022-09-07 12:24:45 +02:00
|
|
|
func (ccms *CCMetricStore) doRequest(
|
|
|
|
ctx context.Context,
|
2024-03-22 08:59:35 +01:00
|
|
|
body *ApiQueryRequest,
|
|
|
|
) (*ApiQueryResponse, error) {
|
2022-01-20 10:43:46 +01:00
|
|
|
buf := &bytes.Buffer{}
|
|
|
|
if err := json.NewEncoder(buf).Encode(body); err != nil {
|
2023-02-01 11:58:27 +01:00
|
|
|
log.Warn("Error while encoding request body")
|
2021-11-26 10:32:36 +01:00
|
|
|
return nil, err
|
|
|
|
}
|
|
|
|
|
2024-08-22 14:29:51 +02:00
|
|
|
req, err := http.NewRequestWithContext(ctx, http.MethodGet, ccms.queryEndpoint, buf)
|
2021-11-26 10:32:36 +01:00
|
|
|
if err != nil {
|
2023-02-01 11:58:27 +01:00
|
|
|
log.Warn("Error while building request body")
|
2021-11-26 10:32:36 +01:00
|
|
|
return nil, err
|
|
|
|
}
|
2021-12-09 16:26:59 +01:00
|
|
|
if ccms.jwt != "" {
|
|
|
|
req.Header.Add("Authorization", fmt.Sprintf("Bearer %s", ccms.jwt))
|
|
|
|
}
|
2021-11-26 10:32:36 +01:00
|
|
|
|
2024-09-17 14:36:42 +02:00
|
|
|
// versioning the cc-metric-store query API.
|
|
|
|
// v2 = data with resampling
|
|
|
|
// v1 = data without resampling
|
|
|
|
q := req.URL.Query()
|
|
|
|
q.Add("version", "v2")
|
|
|
|
req.URL.RawQuery = q.Encode()
|
|
|
|
|
2022-01-20 10:43:46 +01:00
|
|
|
res, err := ccms.client.Do(req)
|
2022-01-12 13:03:01 +01:00
|
|
|
if err != nil {
|
2023-01-31 18:28:44 +01:00
|
|
|
log.Error("Error while performing request")
|
2022-01-12 13:03:01 +01:00
|
|
|
return nil, err
|
|
|
|
}
|
|
|
|
|
2022-01-20 10:43:46 +01:00
|
|
|
if res.StatusCode != http.StatusOK {
|
|
|
|
return nil, fmt.Errorf("'%s': HTTP Status: %s", ccms.queryEndpoint, res.Status)
|
2022-01-07 09:47:41 +01:00
|
|
|
}
|
|
|
|
|
2022-02-02 13:04:38 +01:00
|
|
|
var resBody ApiQueryResponse
|
2022-01-20 10:43:46 +01:00
|
|
|
if err := json.NewDecoder(bufio.NewReader(res.Body)).Decode(&resBody); err != nil {
|
2023-02-01 11:58:27 +01:00
|
|
|
log.Warn("Error while decoding result body")
|
2022-01-07 09:47:41 +01:00
|
|
|
return nil, err
|
|
|
|
}
|
|
|
|
|
2022-02-02 13:04:38 +01:00
|
|
|
return &resBody, nil
|
2022-01-20 10:43:46 +01:00
|
|
|
}
|
|
|
|
|
2022-09-07 12:24:45 +02:00
|
|
|
func (ccms *CCMetricStore) LoadData(
|
|
|
|
job *schema.Job,
|
|
|
|
metrics []string,
|
|
|
|
scopes []schema.MetricScope,
|
2024-03-22 08:59:35 +01:00
|
|
|
ctx context.Context,
|
2024-08-22 14:29:51 +02:00
|
|
|
resolution int,
|
2024-03-22 08:59:35 +01:00
|
|
|
) (schema.JobData, error) {
|
2024-08-22 14:29:51 +02:00
|
|
|
queries, assignedScope, err := ccms.buildQueries(job, metrics, scopes, resolution)
|
2022-01-07 09:47:41 +01:00
|
|
|
if err != nil {
|
2023-02-01 11:58:27 +01:00
|
|
|
log.Warn("Error while building queries")
|
2022-01-07 09:47:41 +01:00
|
|
|
return nil, err
|
|
|
|
}
|
2022-01-20 10:43:46 +01:00
|
|
|
|
|
|
|
req := ApiQueryRequest{
|
|
|
|
Cluster: job.Cluster,
|
|
|
|
From: job.StartTime.Unix(),
|
|
|
|
To: job.StartTime.Add(time.Duration(job.Duration) * time.Second).Unix(),
|
|
|
|
Queries: queries,
|
|
|
|
WithStats: true,
|
|
|
|
WithData: true,
|
2022-01-07 09:47:41 +01:00
|
|
|
}
|
|
|
|
|
2022-01-20 10:43:46 +01:00
|
|
|
resBody, err := ccms.doRequest(ctx, &req)
|
|
|
|
if err != nil {
|
2023-01-31 18:28:44 +01:00
|
|
|
log.Error("Error while performing request")
|
2022-01-07 09:47:41 +01:00
|
|
|
return nil, err
|
|
|
|
}
|
|
|
|
|
2022-03-01 14:29:04 +01:00
|
|
|
var errors []string
|
2024-03-22 08:59:35 +01:00
|
|
|
jobData := make(schema.JobData)
|
2022-02-02 13:04:38 +01:00
|
|
|
for i, row := range resBody.Results {
|
2022-01-20 10:43:46 +01:00
|
|
|
query := req.Queries[i]
|
2022-01-24 10:06:25 +01:00
|
|
|
metric := ccms.toLocalName(query.Metric)
|
2022-01-20 10:43:46 +01:00
|
|
|
scope := assignedScope[i]
|
2022-09-05 17:46:38 +02:00
|
|
|
mc := archive.GetMetricConfig(job.Cluster, metric)
|
2022-01-24 10:06:25 +01:00
|
|
|
if _, ok := jobData[metric]; !ok {
|
|
|
|
jobData[metric] = make(map[schema.MetricScope]*schema.JobMetric)
|
2022-01-07 09:47:41 +01:00
|
|
|
}
|
|
|
|
|
2024-09-17 14:36:42 +02:00
|
|
|
res := row[0].Resolution
|
|
|
|
if res == 0 {
|
|
|
|
res = mc.Timestep
|
|
|
|
}
|
|
|
|
|
2022-01-24 10:06:25 +01:00
|
|
|
jobMetric, ok := jobData[metric][scope]
|
2024-08-22 14:29:51 +02:00
|
|
|
|
2022-01-07 09:47:41 +01:00
|
|
|
if !ok {
|
|
|
|
jobMetric = &schema.JobMetric{
|
|
|
|
Unit: mc.Unit,
|
2024-09-17 14:36:42 +02:00
|
|
|
Timestep: res,
|
2022-01-07 09:47:41 +01:00
|
|
|
Series: make([]schema.Series, 0),
|
|
|
|
}
|
2022-01-24 10:06:25 +01:00
|
|
|
jobData[metric][scope] = jobMetric
|
2022-01-07 09:47:41 +01:00
|
|
|
}
|
|
|
|
|
2024-03-22 16:10:30 +01:00
|
|
|
for ndx, res := range row {
|
2022-01-20 10:43:46 +01:00
|
|
|
if res.Error != nil {
|
2023-02-01 11:58:27 +01:00
|
|
|
/* Build list for "partial errors", if any */
|
2022-03-01 14:29:04 +01:00
|
|
|
errors = append(errors, fmt.Sprintf("failed to fetch '%s' from host '%s': %s", query.Metric, query.Hostname, *res.Error))
|
|
|
|
continue
|
2022-01-20 10:43:46 +01:00
|
|
|
}
|
2022-01-07 09:47:41 +01:00
|
|
|
|
2023-03-22 19:21:11 +01:00
|
|
|
id := (*string)(nil)
|
2022-01-20 10:43:46 +01:00
|
|
|
if query.Type != nil {
|
2023-03-22 19:21:11 +01:00
|
|
|
id = new(string)
|
2024-03-22 16:10:30 +01:00
|
|
|
*id = query.TypeIds[ndx]
|
2022-01-20 10:43:46 +01:00
|
|
|
}
|
|
|
|
|
|
|
|
if res.Avg.IsNaN() || res.Min.IsNaN() || res.Max.IsNaN() {
|
|
|
|
// TODO: use schema.Float instead of float64?
|
|
|
|
// This is done because regular float64 can not be JSONed when NaN.
|
|
|
|
res.Avg = schema.Float(0)
|
|
|
|
res.Min = schema.Float(0)
|
|
|
|
res.Max = schema.Float(0)
|
|
|
|
}
|
2022-01-10 16:13:40 +01:00
|
|
|
|
2022-01-20 10:43:46 +01:00
|
|
|
jobMetric.Series = append(jobMetric.Series, schema.Series{
|
|
|
|
Hostname: query.Hostname,
|
|
|
|
Id: id,
|
2023-03-22 19:21:11 +01:00
|
|
|
Statistics: schema.MetricStatistics{
|
2022-01-20 10:43:46 +01:00
|
|
|
Avg: float64(res.Avg),
|
|
|
|
Min: float64(res.Min),
|
|
|
|
Max: float64(res.Max),
|
|
|
|
},
|
|
|
|
Data: res.Data,
|
|
|
|
})
|
|
|
|
}
|
2022-03-01 16:01:25 +01:00
|
|
|
|
|
|
|
// So that one can later check len(jobData):
|
|
|
|
if len(jobMetric.Series) == 0 {
|
|
|
|
delete(jobData[metric], scope)
|
|
|
|
if len(jobData[metric]) == 0 {
|
|
|
|
delete(jobData, metric)
|
|
|
|
}
|
|
|
|
}
|
2021-11-26 10:32:36 +01:00
|
|
|
}
|
|
|
|
|
2022-03-01 14:29:04 +01:00
|
|
|
if len(errors) != 0 {
|
2023-02-01 11:58:27 +01:00
|
|
|
/* Returns list for "partial errors" */
|
2023-01-19 16:59:14 +01:00
|
|
|
return jobData, fmt.Errorf("METRICDATA/CCMS > Errors: %s", strings.Join(errors, ", "))
|
2022-03-01 14:29:04 +01:00
|
|
|
}
|
2021-11-26 10:32:36 +01:00
|
|
|
return jobData, nil
|
|
|
|
}
|
2021-12-08 11:50:16 +01:00
|
|
|
|
2022-01-12 13:03:01 +01:00
|
|
|
var (
|
2022-07-28 18:07:30 +02:00
|
|
|
hwthreadString = string(schema.MetricScopeHWThread)
|
2022-02-15 13:19:26 +01:00
|
|
|
coreString = string(schema.MetricScopeCore)
|
|
|
|
memoryDomainString = string(schema.MetricScopeMemoryDomain)
|
|
|
|
socketString = string(schema.MetricScopeSocket)
|
|
|
|
acceleratorString = string(schema.MetricScopeAccelerator)
|
2022-01-12 13:03:01 +01:00
|
|
|
)
|
|
|
|
|
2022-09-07 12:24:45 +02:00
|
|
|
func (ccms *CCMetricStore) buildQueries(
|
|
|
|
job *schema.Job,
|
|
|
|
metrics []string,
|
2024-03-22 08:59:35 +01:00
|
|
|
scopes []schema.MetricScope,
|
2024-08-22 14:29:51 +02:00
|
|
|
resolution int,
|
2024-03-22 08:59:35 +01:00
|
|
|
) ([]ApiQuery, []schema.MetricScope, error) {
|
2022-01-12 13:03:01 +01:00
|
|
|
queries := make([]ApiQuery, 0, len(metrics)*len(scopes)*len(job.Resources))
|
2022-01-17 13:33:35 +01:00
|
|
|
assignedScope := []schema.MetricScope{}
|
2022-01-12 13:03:01 +01:00
|
|
|
|
2023-03-29 10:39:31 +02:00
|
|
|
subcluster, scerr := archive.GetSubCluster(job.Cluster, job.SubCluster)
|
|
|
|
if scerr != nil {
|
|
|
|
return nil, nil, scerr
|
|
|
|
}
|
|
|
|
topology := subcluster.Topology
|
|
|
|
|
2022-01-12 13:03:01 +01:00
|
|
|
for _, metric := range metrics {
|
2022-01-24 10:06:25 +01:00
|
|
|
remoteName := ccms.toRemoteName(metric)
|
2022-09-05 17:46:38 +02:00
|
|
|
mc := archive.GetMetricConfig(job.Cluster, metric)
|
2022-01-12 13:03:01 +01:00
|
|
|
if mc == nil {
|
2023-01-19 16:59:14 +01:00
|
|
|
// return nil, fmt.Errorf("METRICDATA/CCMS > metric '%s' is not specified for cluster '%s'", metric, job.Cluster)
|
2023-02-15 11:50:51 +01:00
|
|
|
log.Infof("metric '%s' is not specified for cluster '%s'", metric, job.Cluster)
|
2022-01-12 13:03:01 +01:00
|
|
|
continue
|
|
|
|
}
|
|
|
|
|
2022-01-17 13:33:35 +01:00
|
|
|
// Avoid duplicates...
|
|
|
|
handledScopes := make([]schema.MetricScope, 0, 3)
|
2022-01-12 13:03:01 +01:00
|
|
|
|
2022-01-17 13:33:35 +01:00
|
|
|
scopesLoop:
|
|
|
|
for _, requestedScope := range scopes {
|
|
|
|
nativeScope := mc.Scope
|
2023-05-04 07:00:30 +02:00
|
|
|
if nativeScope == schema.MetricScopeAccelerator && job.NumAcc == 0 {
|
2022-07-28 18:07:30 +02:00
|
|
|
continue
|
|
|
|
}
|
|
|
|
|
2022-01-17 13:33:35 +01:00
|
|
|
scope := nativeScope.Max(requestedScope)
|
|
|
|
for _, s := range handledScopes {
|
|
|
|
if scope == s {
|
|
|
|
continue scopesLoop
|
|
|
|
}
|
2022-01-12 13:03:01 +01:00
|
|
|
}
|
2022-01-17 13:33:35 +01:00
|
|
|
handledScopes = append(handledScopes, scope)
|
2022-01-12 13:03:01 +01:00
|
|
|
|
2022-01-17 13:33:35 +01:00
|
|
|
for _, host := range job.Resources {
|
|
|
|
hwthreads := host.HWThreads
|
|
|
|
if hwthreads == nil {
|
|
|
|
hwthreads = topology.Node
|
|
|
|
}
|
2022-01-12 13:03:01 +01:00
|
|
|
|
2022-01-17 13:33:35 +01:00
|
|
|
// Accelerator -> Accelerator (Use "accelerator" scope if requested scope is lower than node)
|
|
|
|
if nativeScope == schema.MetricScopeAccelerator && scope.LT(schema.MetricScopeNode) {
|
2024-03-22 16:10:30 +01:00
|
|
|
if scope != schema.MetricScopeAccelerator {
|
|
|
|
// Skip all other catched cases
|
|
|
|
continue
|
|
|
|
}
|
|
|
|
|
2022-01-20 10:43:46 +01:00
|
|
|
queries = append(queries, ApiQuery{
|
2024-08-22 14:29:51 +02:00
|
|
|
Metric: remoteName,
|
|
|
|
Hostname: host.Hostname,
|
|
|
|
Aggregate: false,
|
|
|
|
Type: &acceleratorString,
|
|
|
|
TypeIds: host.Accelerators,
|
|
|
|
Resolution: resolution,
|
2022-01-20 10:43:46 +01:00
|
|
|
})
|
|
|
|
assignedScope = append(assignedScope, schema.MetricScopeAccelerator)
|
2022-01-17 13:33:35 +01:00
|
|
|
continue
|
|
|
|
}
|
2022-01-12 13:03:01 +01:00
|
|
|
|
2022-01-17 13:33:35 +01:00
|
|
|
// Accelerator -> Node
|
|
|
|
if nativeScope == schema.MetricScopeAccelerator && scope == schema.MetricScopeNode {
|
|
|
|
if len(host.Accelerators) == 0 {
|
|
|
|
continue
|
|
|
|
}
|
2022-01-12 13:03:01 +01:00
|
|
|
|
2022-01-17 13:33:35 +01:00
|
|
|
queries = append(queries, ApiQuery{
|
2024-08-22 14:29:51 +02:00
|
|
|
Metric: remoteName,
|
|
|
|
Hostname: host.Hostname,
|
|
|
|
Aggregate: true,
|
|
|
|
Type: &acceleratorString,
|
|
|
|
TypeIds: host.Accelerators,
|
|
|
|
Resolution: resolution,
|
2022-01-17 13:33:35 +01:00
|
|
|
})
|
2022-01-20 10:43:46 +01:00
|
|
|
assignedScope = append(assignedScope, scope)
|
2022-01-17 13:33:35 +01:00
|
|
|
continue
|
|
|
|
}
|
2022-01-12 13:03:01 +01:00
|
|
|
|
2022-01-17 13:33:35 +01:00
|
|
|
// HWThread -> HWThead
|
|
|
|
if nativeScope == schema.MetricScopeHWThread && scope == schema.MetricScopeHWThread {
|
2022-01-20 10:43:46 +01:00
|
|
|
queries = append(queries, ApiQuery{
|
2024-08-22 14:29:51 +02:00
|
|
|
Metric: remoteName,
|
|
|
|
Hostname: host.Hostname,
|
|
|
|
Aggregate: false,
|
|
|
|
Type: &hwthreadString,
|
|
|
|
TypeIds: intToStringSlice(hwthreads),
|
|
|
|
Resolution: resolution,
|
2022-01-20 10:43:46 +01:00
|
|
|
})
|
|
|
|
assignedScope = append(assignedScope, scope)
|
2022-01-17 13:33:35 +01:00
|
|
|
continue
|
|
|
|
}
|
2022-01-12 13:03:01 +01:00
|
|
|
|
2022-01-17 13:33:35 +01:00
|
|
|
// HWThread -> Core
|
|
|
|
if nativeScope == schema.MetricScopeHWThread && scope == schema.MetricScopeCore {
|
|
|
|
cores, _ := topology.GetCoresFromHWThreads(hwthreads)
|
|
|
|
for _, core := range cores {
|
|
|
|
queries = append(queries, ApiQuery{
|
2024-08-22 14:29:51 +02:00
|
|
|
Metric: remoteName,
|
|
|
|
Hostname: host.Hostname,
|
|
|
|
Aggregate: true,
|
|
|
|
Type: &hwthreadString,
|
|
|
|
TypeIds: intToStringSlice(topology.Core[core]),
|
|
|
|
Resolution: resolution,
|
2022-01-17 13:33:35 +01:00
|
|
|
})
|
2022-01-20 10:43:46 +01:00
|
|
|
assignedScope = append(assignedScope, scope)
|
2022-01-17 13:33:35 +01:00
|
|
|
}
|
|
|
|
continue
|
|
|
|
}
|
2022-01-12 13:03:01 +01:00
|
|
|
|
2022-01-17 13:33:35 +01:00
|
|
|
// HWThread -> Socket
|
|
|
|
if nativeScope == schema.MetricScopeHWThread && scope == schema.MetricScopeSocket {
|
|
|
|
sockets, _ := topology.GetSocketsFromHWThreads(hwthreads)
|
|
|
|
for _, socket := range sockets {
|
|
|
|
queries = append(queries, ApiQuery{
|
2024-08-22 14:29:51 +02:00
|
|
|
Metric: remoteName,
|
|
|
|
Hostname: host.Hostname,
|
|
|
|
Aggregate: true,
|
|
|
|
Type: &hwthreadString,
|
|
|
|
TypeIds: intToStringSlice(topology.Socket[socket]),
|
|
|
|
Resolution: resolution,
|
2022-01-17 13:33:35 +01:00
|
|
|
})
|
2022-01-20 10:43:46 +01:00
|
|
|
assignedScope = append(assignedScope, scope)
|
2022-01-17 13:33:35 +01:00
|
|
|
}
|
|
|
|
continue
|
|
|
|
}
|
2022-01-12 13:03:01 +01:00
|
|
|
|
2022-01-17 13:33:35 +01:00
|
|
|
// HWThread -> Node
|
|
|
|
if nativeScope == schema.MetricScopeHWThread && scope == schema.MetricScopeNode {
|
|
|
|
queries = append(queries, ApiQuery{
|
2024-08-22 14:29:51 +02:00
|
|
|
Metric: remoteName,
|
|
|
|
Hostname: host.Hostname,
|
|
|
|
Aggregate: true,
|
|
|
|
Type: &hwthreadString,
|
|
|
|
TypeIds: intToStringSlice(hwthreads),
|
|
|
|
Resolution: resolution,
|
2022-01-20 10:43:46 +01:00
|
|
|
})
|
|
|
|
assignedScope = append(assignedScope, scope)
|
|
|
|
continue
|
|
|
|
}
|
|
|
|
|
|
|
|
// Core -> Core
|
|
|
|
if nativeScope == schema.MetricScopeCore && scope == schema.MetricScopeCore {
|
|
|
|
cores, _ := topology.GetCoresFromHWThreads(hwthreads)
|
|
|
|
queries = append(queries, ApiQuery{
|
2024-08-22 14:29:51 +02:00
|
|
|
Metric: remoteName,
|
|
|
|
Hostname: host.Hostname,
|
|
|
|
Aggregate: false,
|
|
|
|
Type: &coreString,
|
|
|
|
TypeIds: intToStringSlice(cores),
|
|
|
|
Resolution: resolution,
|
2022-01-20 10:43:46 +01:00
|
|
|
})
|
|
|
|
assignedScope = append(assignedScope, scope)
|
|
|
|
continue
|
|
|
|
}
|
|
|
|
|
|
|
|
// Core -> Node
|
|
|
|
if nativeScope == schema.MetricScopeCore && scope == schema.MetricScopeNode {
|
|
|
|
cores, _ := topology.GetCoresFromHWThreads(hwthreads)
|
|
|
|
queries = append(queries, ApiQuery{
|
2024-08-22 14:29:51 +02:00
|
|
|
Metric: remoteName,
|
|
|
|
Hostname: host.Hostname,
|
|
|
|
Aggregate: true,
|
|
|
|
Type: &coreString,
|
|
|
|
TypeIds: intToStringSlice(cores),
|
|
|
|
Resolution: resolution,
|
2022-01-17 13:33:35 +01:00
|
|
|
})
|
2022-01-20 10:43:46 +01:00
|
|
|
assignedScope = append(assignedScope, scope)
|
2022-01-17 13:33:35 +01:00
|
|
|
continue
|
|
|
|
}
|
2022-01-12 13:03:01 +01:00
|
|
|
|
2022-02-15 13:19:26 +01:00
|
|
|
// MemoryDomain -> MemoryDomain
|
|
|
|
if nativeScope == schema.MetricScopeMemoryDomain && scope == schema.MetricScopeMemoryDomain {
|
|
|
|
sockets, _ := topology.GetMemoryDomainsFromHWThreads(hwthreads)
|
|
|
|
queries = append(queries, ApiQuery{
|
2024-08-22 14:29:51 +02:00
|
|
|
Metric: remoteName,
|
|
|
|
Hostname: host.Hostname,
|
|
|
|
Aggregate: false,
|
|
|
|
Type: &memoryDomainString,
|
|
|
|
TypeIds: intToStringSlice(sockets),
|
|
|
|
Resolution: resolution,
|
2022-02-15 13:19:26 +01:00
|
|
|
})
|
|
|
|
assignedScope = append(assignedScope, scope)
|
|
|
|
continue
|
|
|
|
}
|
|
|
|
|
|
|
|
// MemoryDoman -> Node
|
|
|
|
if nativeScope == schema.MetricScopeMemoryDomain && scope == schema.MetricScopeNode {
|
|
|
|
sockets, _ := topology.GetMemoryDomainsFromHWThreads(hwthreads)
|
|
|
|
queries = append(queries, ApiQuery{
|
2024-08-22 14:29:51 +02:00
|
|
|
Metric: remoteName,
|
|
|
|
Hostname: host.Hostname,
|
|
|
|
Aggregate: true,
|
|
|
|
Type: &memoryDomainString,
|
|
|
|
TypeIds: intToStringSlice(sockets),
|
|
|
|
Resolution: resolution,
|
2022-02-15 13:19:26 +01:00
|
|
|
})
|
|
|
|
assignedScope = append(assignedScope, scope)
|
|
|
|
continue
|
|
|
|
}
|
|
|
|
|
2022-01-17 13:33:35 +01:00
|
|
|
// Socket -> Socket
|
|
|
|
if nativeScope == schema.MetricScopeSocket && scope == schema.MetricScopeSocket {
|
|
|
|
sockets, _ := topology.GetSocketsFromHWThreads(hwthreads)
|
2022-01-20 10:43:46 +01:00
|
|
|
queries = append(queries, ApiQuery{
|
2024-08-22 14:29:51 +02:00
|
|
|
Metric: remoteName,
|
|
|
|
Hostname: host.Hostname,
|
|
|
|
Aggregate: false,
|
|
|
|
Type: &socketString,
|
|
|
|
TypeIds: intToStringSlice(sockets),
|
|
|
|
Resolution: resolution,
|
2022-01-20 10:43:46 +01:00
|
|
|
})
|
|
|
|
assignedScope = append(assignedScope, scope)
|
2022-01-17 13:33:35 +01:00
|
|
|
continue
|
|
|
|
}
|
2022-01-12 13:03:01 +01:00
|
|
|
|
2022-01-17 13:33:35 +01:00
|
|
|
// Socket -> Node
|
|
|
|
if nativeScope == schema.MetricScopeSocket && scope == schema.MetricScopeNode {
|
|
|
|
sockets, _ := topology.GetSocketsFromHWThreads(hwthreads)
|
|
|
|
queries = append(queries, ApiQuery{
|
2024-08-22 14:29:51 +02:00
|
|
|
Metric: remoteName,
|
|
|
|
Hostname: host.Hostname,
|
|
|
|
Aggregate: true,
|
|
|
|
Type: &socketString,
|
|
|
|
TypeIds: intToStringSlice(sockets),
|
|
|
|
Resolution: resolution,
|
2022-01-17 13:33:35 +01:00
|
|
|
})
|
2022-01-20 10:43:46 +01:00
|
|
|
assignedScope = append(assignedScope, scope)
|
2022-01-17 13:33:35 +01:00
|
|
|
continue
|
|
|
|
}
|
2022-01-12 13:03:01 +01:00
|
|
|
|
2022-01-17 13:33:35 +01:00
|
|
|
// Node -> Node
|
|
|
|
if nativeScope == schema.MetricScopeNode && scope == schema.MetricScopeNode {
|
2022-01-12 13:03:01 +01:00
|
|
|
queries = append(queries, ApiQuery{
|
2024-08-22 14:29:51 +02:00
|
|
|
Metric: remoteName,
|
|
|
|
Hostname: host.Hostname,
|
|
|
|
Resolution: resolution,
|
2022-01-12 13:03:01 +01:00
|
|
|
})
|
2022-01-20 10:43:46 +01:00
|
|
|
assignedScope = append(assignedScope, scope)
|
2022-01-17 13:33:35 +01:00
|
|
|
continue
|
2022-01-12 13:03:01 +01:00
|
|
|
}
|
2022-01-17 13:33:35 +01:00
|
|
|
|
2023-01-19 16:59:14 +01:00
|
|
|
return nil, nil, fmt.Errorf("METRICDATA/CCMS > TODO: unhandled case: native-scope=%s, requested-scope=%s", nativeScope, requestedScope)
|
2022-01-12 13:03:01 +01:00
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2022-01-17 13:33:35 +01:00
|
|
|
return queries, assignedScope, nil
|
|
|
|
}
|
|
|
|
|
2022-09-07 12:24:45 +02:00
|
|
|
func (ccms *CCMetricStore) LoadStats(
|
|
|
|
job *schema.Job,
|
|
|
|
metrics []string,
|
2024-03-22 08:59:35 +01:00
|
|
|
ctx context.Context,
|
|
|
|
) (map[string]map[string]schema.MetricStatistics, error) {
|
2024-08-22 14:29:51 +02:00
|
|
|
|
2024-08-25 16:13:43 +02:00
|
|
|
// metricConfigs := archive.GetCluster(job.Cluster).MetricConfig
|
|
|
|
// resolution := 9000
|
2024-08-22 14:29:51 +02:00
|
|
|
|
2024-08-25 16:13:43 +02:00
|
|
|
// for _, mc := range metricConfigs {
|
|
|
|
// resolution = min(resolution, mc.Timestep)
|
|
|
|
// }
|
2024-08-22 14:29:51 +02:00
|
|
|
|
2024-08-25 16:13:43 +02:00
|
|
|
queries, _, err := ccms.buildQueries(job, metrics, []schema.MetricScope{schema.MetricScopeNode}, 0) // #166 Add scope shere for analysis view accelerator normalization?
|
2021-12-08 11:50:16 +01:00
|
|
|
if err != nil {
|
2023-02-01 11:58:27 +01:00
|
|
|
log.Warn("Error while building query")
|
2021-12-08 11:50:16 +01:00
|
|
|
return nil, err
|
|
|
|
}
|
|
|
|
|
2022-01-20 10:43:46 +01:00
|
|
|
req := ApiQueryRequest{
|
|
|
|
Cluster: job.Cluster,
|
|
|
|
From: job.StartTime.Unix(),
|
|
|
|
To: job.StartTime.Add(time.Duration(job.Duration) * time.Second).Unix(),
|
|
|
|
Queries: queries,
|
|
|
|
WithStats: true,
|
|
|
|
WithData: false,
|
2021-12-08 11:50:16 +01:00
|
|
|
}
|
|
|
|
|
2022-01-20 10:43:46 +01:00
|
|
|
resBody, err := ccms.doRequest(ctx, &req)
|
|
|
|
if err != nil {
|
2023-01-31 18:28:44 +01:00
|
|
|
log.Error("Error while performing request")
|
2022-01-20 10:43:46 +01:00
|
|
|
return nil, err
|
|
|
|
}
|
2021-12-16 13:17:48 +01:00
|
|
|
|
2022-01-20 10:43:46 +01:00
|
|
|
stats := make(map[string]map[string]schema.MetricStatistics, len(metrics))
|
2022-02-02 13:04:38 +01:00
|
|
|
for i, res := range resBody.Results {
|
2022-01-20 10:43:46 +01:00
|
|
|
query := req.Queries[i]
|
2022-01-24 10:06:25 +01:00
|
|
|
metric := ccms.toLocalName(query.Metric)
|
2022-01-20 10:43:46 +01:00
|
|
|
data := res[0]
|
|
|
|
if data.Error != nil {
|
2023-08-31 15:17:40 +02:00
|
|
|
log.Infof("fetching %s for node %s failed: %s", metric, query.Hostname, *data.Error)
|
|
|
|
continue
|
|
|
|
// return nil, fmt.Errorf("METRICDATA/CCMS > fetching %s for node %s failed: %s", metric, query.Hostname, *data.Error)
|
2022-01-20 10:43:46 +01:00
|
|
|
}
|
2021-12-08 11:50:16 +01:00
|
|
|
|
2022-01-24 10:06:25 +01:00
|
|
|
metricdata, ok := stats[metric]
|
2022-01-20 10:43:46 +01:00
|
|
|
if !ok {
|
|
|
|
metricdata = make(map[string]schema.MetricStatistics, job.NumNodes)
|
2022-01-24 10:06:25 +01:00
|
|
|
stats[metric] = metricdata
|
2022-01-20 10:43:46 +01:00
|
|
|
}
|
2021-12-08 11:50:16 +01:00
|
|
|
|
2022-01-20 10:43:46 +01:00
|
|
|
if data.Avg.IsNaN() || data.Min.IsNaN() || data.Max.IsNaN() {
|
2023-08-31 15:17:40 +02:00
|
|
|
log.Infof("fetching %s for node %s failed: one of avg/min/max is NaN", metric, query.Hostname)
|
|
|
|
continue
|
|
|
|
// return nil, fmt.Errorf("METRICDATA/CCMS > fetching %s for node %s failed: %s", metric, query.Hostname, "avg/min/max is NaN")
|
2021-12-08 11:50:16 +01:00
|
|
|
}
|
|
|
|
|
2022-01-20 10:43:46 +01:00
|
|
|
metricdata[query.Hostname] = schema.MetricStatistics{
|
|
|
|
Avg: float64(data.Avg),
|
|
|
|
Min: float64(data.Min),
|
|
|
|
Max: float64(data.Max),
|
|
|
|
}
|
2021-12-08 11:50:16 +01:00
|
|
|
}
|
|
|
|
|
|
|
|
return stats, nil
|
|
|
|
}
|
|
|
|
|
2022-01-31 15:16:05 +01:00
|
|
|
// TODO: Support sub-node-scope metrics! For this, the partition of a node needs to be known!
|
2022-09-07 12:24:45 +02:00
|
|
|
func (ccms *CCMetricStore) LoadNodeData(
|
|
|
|
cluster string,
|
|
|
|
metrics, nodes []string,
|
|
|
|
scopes []schema.MetricScope,
|
|
|
|
from, to time.Time,
|
2024-03-22 08:59:35 +01:00
|
|
|
ctx context.Context,
|
|
|
|
) (map[string]map[string][]*schema.JobMetric, error) {
|
2022-01-20 10:43:46 +01:00
|
|
|
req := ApiQueryRequest{
|
2022-01-31 15:16:05 +01:00
|
|
|
Cluster: cluster,
|
|
|
|
From: from.Unix(),
|
|
|
|
To: to.Unix(),
|
2022-02-02 13:04:38 +01:00
|
|
|
WithStats: true,
|
2022-01-20 10:43:46 +01:00
|
|
|
WithData: true,
|
2021-12-08 11:50:16 +01:00
|
|
|
}
|
|
|
|
|
|
|
|
if nodes == nil {
|
2022-02-02 13:04:38 +01:00
|
|
|
for _, metric := range metrics {
|
|
|
|
req.ForAllNodes = append(req.ForAllNodes, ccms.toRemoteName(metric))
|
|
|
|
}
|
2021-12-08 11:50:16 +01:00
|
|
|
} else {
|
2022-01-20 10:43:46 +01:00
|
|
|
for _, node := range nodes {
|
|
|
|
for _, metric := range metrics {
|
|
|
|
req.Queries = append(req.Queries, ApiQuery{
|
2024-08-26 09:55:33 +02:00
|
|
|
Hostname: node,
|
|
|
|
Metric: ccms.toRemoteName(metric),
|
|
|
|
Resolution: 60, // Default for Node Queries
|
2022-01-20 10:43:46 +01:00
|
|
|
})
|
|
|
|
}
|
|
|
|
}
|
2021-12-09 16:26:59 +01:00
|
|
|
}
|
2022-01-20 10:43:46 +01:00
|
|
|
|
|
|
|
resBody, err := ccms.doRequest(ctx, &req)
|
2021-12-08 11:50:16 +01:00
|
|
|
if err != nil {
|
2024-09-17 14:36:42 +02:00
|
|
|
log.Error(fmt.Sprintf("Error while performing request %#v\n", err))
|
2021-12-08 11:50:16 +01:00
|
|
|
return nil, err
|
|
|
|
}
|
|
|
|
|
2022-03-09 14:27:47 +01:00
|
|
|
var errors []string
|
2022-01-31 15:16:05 +01:00
|
|
|
data := make(map[string]map[string][]*schema.JobMetric)
|
2022-02-02 13:04:38 +01:00
|
|
|
for i, res := range resBody.Results {
|
|
|
|
var query ApiQuery
|
|
|
|
if resBody.Queries != nil {
|
|
|
|
query = resBody.Queries[i]
|
|
|
|
} else {
|
|
|
|
query = req.Queries[i]
|
|
|
|
}
|
|
|
|
|
2022-01-31 15:16:05 +01:00
|
|
|
metric := ccms.toLocalName(query.Metric)
|
2022-01-20 10:43:46 +01:00
|
|
|
qdata := res[0]
|
|
|
|
if qdata.Error != nil {
|
2023-02-01 11:58:27 +01:00
|
|
|
/* Build list for "partial errors", if any */
|
2022-03-09 14:27:47 +01:00
|
|
|
errors = append(errors, fmt.Sprintf("fetching %s for node %s failed: %s", metric, query.Hostname, *qdata.Error))
|
2021-12-08 11:50:16 +01:00
|
|
|
}
|
|
|
|
|
2022-02-02 13:04:38 +01:00
|
|
|
if qdata.Avg.IsNaN() || qdata.Min.IsNaN() || qdata.Max.IsNaN() {
|
2023-01-19 16:59:14 +01:00
|
|
|
// return nil, fmt.Errorf("METRICDATA/CCMS > fetching %s for node %s failed: %s", metric, query.Hostname, "avg/min/max is NaN")
|
2022-03-01 10:23:08 +01:00
|
|
|
qdata.Avg, qdata.Min, qdata.Max = 0., 0., 0.
|
2022-02-02 13:04:38 +01:00
|
|
|
}
|
|
|
|
|
2022-01-31 15:16:05 +01:00
|
|
|
hostdata, ok := data[query.Hostname]
|
2022-01-20 10:43:46 +01:00
|
|
|
if !ok {
|
2022-01-31 15:16:05 +01:00
|
|
|
hostdata = make(map[string][]*schema.JobMetric)
|
|
|
|
data[query.Hostname] = hostdata
|
2021-12-08 11:50:16 +01:00
|
|
|
}
|
|
|
|
|
2022-09-05 17:46:38 +02:00
|
|
|
mc := archive.GetMetricConfig(cluster, metric)
|
2022-02-08 11:03:32 +01:00
|
|
|
hostdata[metric] = append(hostdata[metric], &schema.JobMetric{
|
2022-01-31 15:16:05 +01:00
|
|
|
Unit: mc.Unit,
|
|
|
|
Timestep: mc.Timestep,
|
|
|
|
Series: []schema.Series{
|
|
|
|
{
|
|
|
|
Hostname: query.Hostname,
|
|
|
|
Data: qdata.Data,
|
2023-03-22 19:21:11 +01:00
|
|
|
Statistics: schema.MetricStatistics{
|
2022-02-02 13:04:38 +01:00
|
|
|
Avg: float64(qdata.Avg),
|
|
|
|
Min: float64(qdata.Min),
|
|
|
|
Max: float64(qdata.Max),
|
|
|
|
},
|
2022-01-31 15:16:05 +01:00
|
|
|
},
|
|
|
|
},
|
|
|
|
})
|
2021-12-08 11:50:16 +01:00
|
|
|
}
|
|
|
|
|
2022-03-09 14:27:47 +01:00
|
|
|
if len(errors) != 0 {
|
2023-02-01 11:58:27 +01:00
|
|
|
/* Returns list of "partial errors" */
|
|
|
|
return data, fmt.Errorf("METRICDATA/CCMS > Errors: %s", strings.Join(errors, ", "))
|
2022-03-09 14:27:47 +01:00
|
|
|
}
|
|
|
|
|
2021-12-08 11:50:16 +01:00
|
|
|
return data, nil
|
|
|
|
}
|
2022-05-04 09:22:55 +02:00
|
|
|
|
|
|
|
// intToStringSlice returns the decimal string representation of every element
// of is, in the same order. The result is never nil, even for nil input.
func intToStringSlice(is []int) []string {
	out := make([]string, 0, len(is))
	for _, v := range is {
		out = append(out, strconv.Itoa(v))
	}
	return out
}
|