// Source file: cc-backend/metricdata/cc-metric-store.go

package metricdata
import (
"bufio"
2021-11-26 10:32:36 +01:00
"bytes"
"context"
"encoding/json"
"fmt"
"net/http"
"time"
"github.com/ClusterCockpit/cc-jobarchive/config"
"github.com/ClusterCockpit/cc-jobarchive/schema"
)
// CCMetricStore is a client for a cc-metric-store instance that is queried
// over HTTP. All fields are filled in by Init.
type CCMetricStore struct {
	// jwt is sent as a bearer token; when empty no Authorization header is added.
	jwt string
	// url is the base URL exactly as it was passed to Init.
	url string
	// queryEndpoint is the query URL that Init derives from url.
	queryEndpoint string
	// client performs every request issued by this store.
	client http.Client
}
type ApiQueryRequest struct {
Cluster string `json:"cluster"`
From int64 `json:"from"`
To int64 `json:"to"`
WithStats bool `json:"with-stats"`
WithData bool `json:"with-data"`
Queries []ApiQuery `json:"queries"`
ForAllNodes []string `json:"for-all-nodes"`
2021-11-26 10:32:36 +01:00
}
2022-01-12 13:03:01 +01:00
// ApiQuery describes one metric query for one host. The optional Type and
// SubType fields, with their id lists, narrow the query to components
// inside the host; they are left out of the JSON when unset.
type ApiQuery struct {
	// Metric is the metric name.
	Metric string `json:"metric"`
	// Hostname is the host the metric is read from.
	Hostname string `json:"host"`
	// Aggregate is sent as "aggreg"; buildQueries sets it when the ids in
	// TypeIds should be combined into a single result.
	Aggregate bool `json:"aggreg"`

	Type    *string `json:"type,omitempty"`
	TypeIds []int   `json:"type-ids,omitempty"`

	SubType    *string `json:"subtype,omitempty"`
	SubTypeIds []int   `json:"subtype-ids,omitempty"`
}
type ApiMetricData struct {
Error *string `json:"error"`
From int64 `json:"from"`
To int64 `json:"to"`
Data []schema.Float `json:"data"`
Avg schema.Float `json:"avg"`
Min schema.Float `json:"min"`
Max schema.Float `json:"max"`
2021-11-26 10:32:36 +01:00
}
2021-12-20 10:49:46 +01:00
func (ccms *CCMetricStore) Init(url, token string) error {
ccms.url = url
2022-01-20 10:43:46 +01:00
ccms.queryEndpoint = fmt.Sprintf("%s/api/query", url)
2021-12-20 10:49:46 +01:00
ccms.jwt = token
2022-01-12 13:03:01 +01:00
ccms.client = http.Client{
Timeout: 5 * time.Second,
}
2021-11-26 10:32:36 +01:00
return nil
}
2022-01-20 10:43:46 +01:00
func (ccms *CCMetricStore) doRequest(ctx context.Context, body *ApiQueryRequest) ([][]ApiMetricData, error) {
buf := &bytes.Buffer{}
if err := json.NewEncoder(buf).Encode(body); err != nil {
2021-11-26 10:32:36 +01:00
return nil, err
}
2022-01-20 10:43:46 +01:00
req, err := http.NewRequestWithContext(ctx, http.MethodPost, ccms.queryEndpoint, buf)
2021-11-26 10:32:36 +01:00
if err != nil {
return nil, err
}
2021-12-09 16:26:59 +01:00
if ccms.jwt != "" {
req.Header.Add("Authorization", fmt.Sprintf("Bearer %s", ccms.jwt))
}
2021-11-26 10:32:36 +01:00
2022-01-20 10:43:46 +01:00
res, err := ccms.client.Do(req)
2022-01-12 13:03:01 +01:00
if err != nil {
return nil, err
}
2022-01-20 10:43:46 +01:00
if res.StatusCode != http.StatusOK {
return nil, fmt.Errorf("'%s': HTTP Status: %s", ccms.queryEndpoint, res.Status)
}
2022-01-20 10:43:46 +01:00
var resBody [][]ApiMetricData
if err := json.NewDecoder(bufio.NewReader(res.Body)).Decode(&resBody); err != nil {
return nil, err
}
2022-01-20 10:43:46 +01:00
return resBody, nil
}
func (ccms *CCMetricStore) LoadData(job *schema.Job, metrics []string, scopes []schema.MetricScope, ctx context.Context) (schema.JobData, error) {
queries, assignedScope, err := ccms.buildQueries(job, metrics, scopes)
if err != nil {
return nil, err
}
2022-01-20 10:43:46 +01:00
req := ApiQueryRequest{
Cluster: job.Cluster,
From: job.StartTime.Unix(),
To: job.StartTime.Add(time.Duration(job.Duration) * time.Second).Unix(),
Queries: queries,
WithStats: true,
WithData: true,
}
2022-01-20 10:43:46 +01:00
resBody, err := ccms.doRequest(ctx, &req)
if err != nil {
return nil, err
}
var jobData schema.JobData = make(schema.JobData)
2022-01-20 10:43:46 +01:00
for i, row := range resBody {
query := req.Queries[i]
scope := assignedScope[i]
mc := config.GetMetricConfig(job.Cluster, query.Metric)
if _, ok := jobData[query.Metric]; !ok {
jobData[query.Metric] = make(map[schema.MetricScope]*schema.JobMetric)
}
2022-01-20 10:43:46 +01:00
jobMetric, ok := jobData[query.Metric][scope]
if !ok {
jobMetric = &schema.JobMetric{
Unit: mc.Unit,
Scope: scope,
Timestep: mc.Timestep,
Series: make([]schema.Series, 0),
}
2022-01-20 10:43:46 +01:00
jobData[query.Metric][scope] = jobMetric
}
2022-01-20 10:43:46 +01:00
for _, res := range row {
if res.Error != nil {
return nil, fmt.Errorf("cc-metric-store error while fetching %s: %s", query.Metric, *res.Error)
}
2022-01-20 10:43:46 +01:00
id := (*int)(nil)
if query.Type != nil {
id = new(int)
*id = query.TypeIds[0]
}
if res.Avg.IsNaN() || res.Min.IsNaN() || res.Max.IsNaN() {
// TODO: use schema.Float instead of float64?
// This is done because regular float64 can not be JSONed when NaN.
res.Avg = schema.Float(0)
res.Min = schema.Float(0)
res.Max = schema.Float(0)
}
2022-01-20 10:43:46 +01:00
jobMetric.Series = append(jobMetric.Series, schema.Series{
Hostname: query.Hostname,
Id: id,
Statistics: &schema.MetricStatistics{
Avg: float64(res.Avg),
Min: float64(res.Min),
Max: float64(res.Max),
},
Data: res.Data,
})
}
2021-11-26 10:32:36 +01:00
}
return jobData, nil
}
2022-01-12 13:03:01 +01:00
var (
2022-01-20 10:43:46 +01:00
hwthreadString = string("cpu") // TODO/FIXME: inconsistency between cc-metric-collector and ClusterCockpit
coreString = string(schema.MetricScopeCore)
2022-01-12 13:03:01 +01:00
socketString = string(schema.MetricScopeSocket)
acceleratorString = string(schema.MetricScopeAccelerator)
)
func (ccms *CCMetricStore) buildQueries(job *schema.Job, metrics []string, scopes []schema.MetricScope) ([]ApiQuery, []schema.MetricScope, error) {
2022-01-12 13:03:01 +01:00
queries := make([]ApiQuery, 0, len(metrics)*len(scopes)*len(job.Resources))
topology := config.GetPartition(job.Cluster, job.Partition).Topology
assignedScope := []schema.MetricScope{}
2022-01-12 13:03:01 +01:00
for _, metric := range metrics {
mc := config.GetMetricConfig(job.Cluster, metric)
if mc == nil {
// return nil, fmt.Errorf("metric '%s' is not specified for cluster '%s'", metric, job.Cluster)
// log.Printf("metric '%s' is not specified for cluster '%s'", metric, job.Cluster)
continue
}
// Avoid duplicates...
handledScopes := make([]schema.MetricScope, 0, 3)
2022-01-12 13:03:01 +01:00
scopesLoop:
for _, requestedScope := range scopes {
nativeScope := mc.Scope
scope := nativeScope.Max(requestedScope)
for _, s := range handledScopes {
if scope == s {
continue scopesLoop
}
2022-01-12 13:03:01 +01:00
}
handledScopes = append(handledScopes, scope)
2022-01-12 13:03:01 +01:00
for _, host := range job.Resources {
hwthreads := host.HWThreads
if hwthreads == nil {
hwthreads = topology.Node
}
2022-01-12 13:03:01 +01:00
// Accelerator -> Accelerator (Use "accelerator" scope if requested scope is lower than node)
if nativeScope == schema.MetricScopeAccelerator && scope.LT(schema.MetricScopeNode) {
2022-01-20 10:43:46 +01:00
queries = append(queries, ApiQuery{
Metric: metric,
Hostname: host.Hostname,
Aggregate: false,
Type: &acceleratorString,
TypeIds: host.Accelerators,
})
assignedScope = append(assignedScope, schema.MetricScopeAccelerator)
continue
}
2022-01-12 13:03:01 +01:00
// Accelerator -> Node
if nativeScope == schema.MetricScopeAccelerator && scope == schema.MetricScopeNode {
if len(host.Accelerators) == 0 {
continue
}
2022-01-12 13:03:01 +01:00
queries = append(queries, ApiQuery{
2022-01-20 10:43:46 +01:00
Metric: metric,
Hostname: host.Hostname,
Aggregate: true,
Type: &acceleratorString,
TypeIds: host.Accelerators,
})
2022-01-20 10:43:46 +01:00
assignedScope = append(assignedScope, scope)
continue
}
2022-01-12 13:03:01 +01:00
// HWThread -> HWThead
if nativeScope == schema.MetricScopeHWThread && scope == schema.MetricScopeHWThread {
2022-01-20 10:43:46 +01:00
queries = append(queries, ApiQuery{
Metric: metric,
Hostname: host.Hostname,
Aggregate: false,
Type: &hwthreadString,
TypeIds: hwthreads,
})
assignedScope = append(assignedScope, scope)
continue
}
2022-01-12 13:03:01 +01:00
// HWThread -> Core
if nativeScope == schema.MetricScopeHWThread && scope == schema.MetricScopeCore {
cores, _ := topology.GetCoresFromHWThreads(hwthreads)
for _, core := range cores {
queries = append(queries, ApiQuery{
2022-01-20 10:43:46 +01:00
Metric: metric,
Hostname: host.Hostname,
Aggregate: true,
Type: &hwthreadString,
TypeIds: topology.Core[core],
})
2022-01-20 10:43:46 +01:00
assignedScope = append(assignedScope, scope)
}
continue
}
2022-01-12 13:03:01 +01:00
// HWThread -> Socket
if nativeScope == schema.MetricScopeHWThread && scope == schema.MetricScopeSocket {
sockets, _ := topology.GetSocketsFromHWThreads(hwthreads)
for _, socket := range sockets {
queries = append(queries, ApiQuery{
2022-01-20 10:43:46 +01:00
Metric: metric,
Hostname: host.Hostname,
Aggregate: true,
Type: &hwthreadString,
TypeIds: topology.Socket[socket],
})
2022-01-20 10:43:46 +01:00
assignedScope = append(assignedScope, scope)
}
continue
}
2022-01-12 13:03:01 +01:00
// HWThread -> Node
if nativeScope == schema.MetricScopeHWThread && scope == schema.MetricScopeNode {
queries = append(queries, ApiQuery{
2022-01-20 10:43:46 +01:00
Metric: metric,
Hostname: host.Hostname,
Aggregate: true,
Type: &hwthreadString,
TypeIds: hwthreads,
})
assignedScope = append(assignedScope, scope)
continue
}
// Core -> Core
if nativeScope == schema.MetricScopeCore && scope == schema.MetricScopeCore {
cores, _ := topology.GetCoresFromHWThreads(hwthreads)
queries = append(queries, ApiQuery{
Metric: metric,
Hostname: host.Hostname,
Aggregate: false,
Type: &coreString,
TypeIds: cores,
})
assignedScope = append(assignedScope, scope)
continue
}
// Core -> Node
if nativeScope == schema.MetricScopeCore && scope == schema.MetricScopeNode {
cores, _ := topology.GetCoresFromHWThreads(hwthreads)
queries = append(queries, ApiQuery{
Metric: metric,
Hostname: host.Hostname,
Aggregate: true,
Type: &coreString,
TypeIds: cores,
})
2022-01-20 10:43:46 +01:00
assignedScope = append(assignedScope, scope)
continue
}
2022-01-12 13:03:01 +01:00
// Socket -> Socket
if nativeScope == schema.MetricScopeSocket && scope == schema.MetricScopeSocket {
sockets, _ := topology.GetSocketsFromHWThreads(hwthreads)
2022-01-20 10:43:46 +01:00
queries = append(queries, ApiQuery{
Metric: metric,
Hostname: host.Hostname,
Aggregate: false,
Type: &socketString,
TypeIds: sockets,
})
assignedScope = append(assignedScope, scope)
continue
}
2022-01-12 13:03:01 +01:00
// Socket -> Node
if nativeScope == schema.MetricScopeSocket && scope == schema.MetricScopeNode {
sockets, _ := topology.GetSocketsFromHWThreads(hwthreads)
queries = append(queries, ApiQuery{
2022-01-20 10:43:46 +01:00
Metric: metric,
Hostname: host.Hostname,
Aggregate: true,
Type: &socketString,
TypeIds: sockets,
})
2022-01-20 10:43:46 +01:00
assignedScope = append(assignedScope, scope)
continue
}
2022-01-12 13:03:01 +01:00
// Node -> Node
if nativeScope == schema.MetricScopeNode && scope == schema.MetricScopeNode {
2022-01-12 13:03:01 +01:00
queries = append(queries, ApiQuery{
Metric: metric,
Hostname: host.Hostname,
2022-01-12 13:03:01 +01:00
})
2022-01-20 10:43:46 +01:00
assignedScope = append(assignedScope, scope)
continue
2022-01-12 13:03:01 +01:00
}
return nil, nil, fmt.Errorf("TODO: unhandled case: native-scope=%s, requested-scope=%s", nativeScope, requestedScope)
2022-01-12 13:03:01 +01:00
}
}
}
return queries, assignedScope, nil
}
2021-12-17 15:49:22 +01:00
func (ccms *CCMetricStore) LoadStats(job *schema.Job, metrics []string, ctx context.Context) (map[string]map[string]schema.MetricStatistics, error) {
2022-01-20 10:43:46 +01:00
queries, _, err := ccms.buildQueries(job, metrics, []schema.MetricScope{schema.MetricScopeNode})
if err != nil {
return nil, err
}
2022-01-20 10:43:46 +01:00
req := ApiQueryRequest{
Cluster: job.Cluster,
From: job.StartTime.Unix(),
To: job.StartTime.Add(time.Duration(job.Duration) * time.Second).Unix(),
Queries: queries,
WithStats: true,
WithData: false,
}
2022-01-20 10:43:46 +01:00
resBody, err := ccms.doRequest(ctx, &req)
if err != nil {
return nil, err
}
2022-01-20 10:43:46 +01:00
stats := make(map[string]map[string]schema.MetricStatistics, len(metrics))
for i, res := range resBody {
query := req.Queries[i]
data := res[0]
if data.Error != nil {
return nil, fmt.Errorf("fetching %s for node %s failed: %s", query.Metric, query.Hostname, *data.Error)
}
2022-01-20 10:43:46 +01:00
metricdata, ok := stats[query.Metric]
if !ok {
metricdata = make(map[string]schema.MetricStatistics, job.NumNodes)
stats[query.Metric] = metricdata
}
2022-01-20 10:43:46 +01:00
if data.Avg.IsNaN() || data.Min.IsNaN() || data.Max.IsNaN() {
return nil, fmt.Errorf("fetching %s for node %s failed: %s", query.Metric, query.Hostname, "avg/min/max is NaN")
}
2022-01-20 10:43:46 +01:00
metricdata[query.Hostname] = schema.MetricStatistics{
Avg: float64(data.Avg),
Min: float64(data.Min),
Max: float64(data.Max),
}
}
return stats, nil
}
func (ccms *CCMetricStore) LoadNodeData(clusterId string, metrics, nodes []string, from, to int64, ctx context.Context) (map[string]map[string][]schema.Float, error) {
2022-01-20 10:43:46 +01:00
req := ApiQueryRequest{
Cluster: clusterId,
From: from,
To: to,
WithStats: false,
WithData: true,
}
if nodes == nil {
2022-01-20 10:43:46 +01:00
req.ForAllNodes = metrics
} else {
2022-01-20 10:43:46 +01:00
for _, node := range nodes {
for _, metric := range metrics {
req.Queries = append(req.Queries, ApiQuery{
Hostname: node,
Metric: metric,
})
}
}
2021-12-09 16:26:59 +01:00
}
2022-01-20 10:43:46 +01:00
resBody, err := ccms.doRequest(ctx, &req)
if err != nil {
return nil, err
}
2022-01-20 10:43:46 +01:00
data := make(map[string]map[string][]schema.Float)
for i, res := range resBody {
query := req.Queries[i]
qdata := res[0]
if qdata.Error != nil {
return nil, fmt.Errorf("fetching %s for node %s failed: %s", query.Metric, query.Hostname, *qdata.Error)
}
2022-01-20 10:43:46 +01:00
nodedata, ok := data[query.Hostname]
if !ok {
nodedata = make(map[string][]schema.Float)
data[query.Hostname] = nodedata
}
2022-01-20 10:43:46 +01:00
nodedata[query.Metric] = qdata.Data
}
return data, nil
}