Use new simpler cc-metric-store API

This commit is contained in:
Lou Knauer 2022-01-20 10:43:46 +01:00
parent c254c689af
commit 6743d94b0e

View File

@ -5,10 +5,8 @@ import (
"bytes" "bytes"
"context" "context"
"encoding/json" "encoding/json"
"errors"
"fmt" "fmt"
"net/http" "net/http"
"strconv"
"time" "time"
"github.com/ClusterCockpit/cc-jobarchive/config" "github.com/ClusterCockpit/cc-jobarchive/config"
@ -18,21 +16,28 @@ import (
type CCMetricStore struct { type CCMetricStore struct {
jwt string jwt string
url string url string
queryEndpoint string
client http.Client client http.Client
} }
type ApiRequestBody struct { type ApiQueryRequest struct {
Metrics []string `json:"metrics"` Cluster string `json:"cluster"`
Selectors [][]string `json:"selectors"` From int64 `json:"from"`
To int64 `json:"to"`
WithStats bool `json:"with-stats"`
WithData bool `json:"with-data"`
Queries []ApiQuery `json:"queries"`
ForAllNodes []string `json:"for-all-nodes"`
} }
type ApiQuery struct { type ApiQuery struct {
Metric string `json:"metric"` Metric string `json:"metric"`
Hostname string `json:"hostname"` Hostname string `json:"host"`
Aggregate bool `json:"aggreg"`
Type *string `json:"type,omitempty"` Type *string `json:"type,omitempty"`
TypeIds []string `json:"type-ids,omitempty"` TypeIds []int `json:"type-ids,omitempty"`
SubType *string `json:"subtype,omitempty"` SubType *string `json:"subtype,omitempty"`
SubTypeIds []string `json:"subtype-ids,omitempty"` SubTypeIds []int `json:"subtype-ids,omitempty"`
} }
type ApiMetricData struct { type ApiMetricData struct {
@ -45,18 +50,9 @@ type ApiMetricData struct {
Max schema.Float `json:"max"` Max schema.Float `json:"max"`
} }
type ApiStatsData struct {
Error *string `json:"error"`
From int64 `json:"from"`
To int64 `json:"to"`
Samples int `json:"samples"`
Avg schema.Float `json:"avg"`
Min schema.Float `json:"min"`
Max schema.Float `json:"max"`
}
func (ccms *CCMetricStore) Init(url, token string) error { func (ccms *CCMetricStore) Init(url, token string) error {
ccms.url = url ccms.url = url
ccms.queryEndpoint = fmt.Sprintf("%s/api/query", url)
ccms.jwt = token ccms.jwt = token
ccms.client = http.Client{ ccms.client = http.Client{
Timeout: 5 * time.Second, Timeout: 5 * time.Second,
@ -64,100 +60,67 @@ func (ccms *CCMetricStore) Init(url, token string) error {
return nil return nil
} }
func (ccms *CCMetricStore) doRequest(job *schema.Job, suffix string, metrics []string, ctx context.Context) (*http.Response, error) { func (ccms *CCMetricStore) doRequest(ctx context.Context, body *ApiQueryRequest) ([][]ApiMetricData, error) {
from, to := job.StartTime.Unix(), job.StartTime.Add(time.Duration(job.Duration)*time.Second).Unix() buf := &bytes.Buffer{}
reqBody := ApiRequestBody{} if err := json.NewEncoder(buf).Encode(body); err != nil {
reqBody.Metrics = metrics
for _, node := range job.Resources {
if node.Accelerators != nil || node.HWThreads != nil {
// TODO/FIXME:
return nil, errors.New("todo: cc-metric-store resources: Accelerator/HWThreads")
}
reqBody.Selectors = append(reqBody.Selectors, []string{job.Cluster, node.Hostname})
}
reqBodyBytes, err := json.Marshal(reqBody)
if err != nil {
return nil, err return nil, err
} }
req, err := http.NewRequestWithContext(ctx, http.MethodPost, fmt.Sprintf("%s/api/%d/%d/%s", ccms.url, from, to, suffix), bytes.NewReader(reqBodyBytes)) req, err := http.NewRequestWithContext(ctx, http.MethodPost, ccms.queryEndpoint, buf)
if err != nil { if err != nil {
return nil, err return nil, err
} }
if ccms.jwt != "" { if ccms.jwt != "" {
req.Header.Add("Authorization", fmt.Sprintf("Bearer %s", ccms.jwt)) req.Header.Add("Authorization", fmt.Sprintf("Bearer %s", ccms.jwt))
} }
return ccms.client.Do(req)
res, err := ccms.client.Do(req)
if err != nil {
return nil, err
}
if res.StatusCode != http.StatusOK {
return nil, fmt.Errorf("'%s': HTTP Status: %s", ccms.queryEndpoint, res.Status)
}
var resBody [][]ApiMetricData
if err := json.NewDecoder(bufio.NewReader(res.Body)).Decode(&resBody); err != nil {
return nil, err
}
return resBody, nil
} }
func (ccms *CCMetricStore) LoadData(job *schema.Job, metrics []string, scopes []schema.MetricScope, ctx context.Context) (schema.JobData, error) { func (ccms *CCMetricStore) LoadData(job *schema.Job, metrics []string, scopes []schema.MetricScope, ctx context.Context) (schema.JobData, error) {
type ApiQueryRequest struct {
Cluster string `json:"cluster"`
From int64 `json:"from"`
To int64 `json:"to"`
Queries []ApiQuery `json:"queries"`
}
type ApiQueryResponse struct {
ApiMetricData
Query *ApiQuery `json:"query"`
}
queries, assignedScope, err := ccms.buildQueries(job, metrics, scopes) queries, assignedScope, err := ccms.buildQueries(job, metrics, scopes)
if err != nil { if err != nil {
return nil, err return nil, err
} }
reqBody := ApiQueryRequest{ req := ApiQueryRequest{
Cluster: job.Cluster, Cluster: job.Cluster,
From: job.StartTime.Unix(), From: job.StartTime.Unix(),
To: job.StartTime.Add(time.Duration(job.Duration) * time.Second).Unix(), To: job.StartTime.Add(time.Duration(job.Duration) * time.Second).Unix(),
Queries: queries, Queries: queries,
WithStats: true,
WithData: true,
} }
buf := &bytes.Buffer{} resBody, err := ccms.doRequest(ctx, &req)
if err := json.NewEncoder(buf).Encode(reqBody); err != nil {
return nil, err
}
req, err := http.NewRequestWithContext(ctx, http.MethodPost, ccms.url+"/api/query", buf)
if err != nil { if err != nil {
return nil, err return nil, err
} }
if ccms.jwt != "" {
req.Header.Add("Authorization", fmt.Sprintf("Bearer %s", ccms.jwt))
}
res, err := ccms.client.Do(req)
if err != nil {
return nil, err
}
if res.StatusCode != http.StatusOK {
return nil, fmt.Errorf("cc-metric-store replied with: %s", res.Status)
}
var resBody []ApiQueryResponse
if err := json.NewDecoder(bufio.NewReader(res.Body)).Decode(&resBody); err != nil {
return nil, err
}
// log.Printf("response: %#v", resBody)
var jobData schema.JobData = make(schema.JobData) var jobData schema.JobData = make(schema.JobData)
for i, res := range resBody { for i, row := range resBody {
metric := res.Query.Metric query := req.Queries[i]
if _, ok := jobData[metric]; !ok {
jobData[metric] = make(map[schema.MetricScope]*schema.JobMetric)
}
if res.Error != nil {
return nil, fmt.Errorf("cc-metric-store error while fetching %s: %s", metric, *res.Error)
}
scope := assignedScope[i] scope := assignedScope[i]
mc := config.GetMetricConfig(job.Cluster, metric) mc := config.GetMetricConfig(job.Cluster, query.Metric)
jobMetric, ok := jobData[metric][scope] if _, ok := jobData[query.Metric]; !ok {
jobData[query.Metric] = make(map[schema.MetricScope]*schema.JobMetric)
}
jobMetric, ok := jobData[query.Metric][scope]
if !ok { if !ok {
jobMetric = &schema.JobMetric{ jobMetric = &schema.JobMetric{
Unit: mc.Unit, Unit: mc.Unit,
@ -165,13 +128,18 @@ func (ccms *CCMetricStore) LoadData(job *schema.Job, metrics []string, scopes []
Timestep: mc.Timestep, Timestep: mc.Timestep,
Series: make([]schema.Series, 0), Series: make([]schema.Series, 0),
} }
jobData[metric][scope] = jobMetric jobData[query.Metric][scope] = jobMetric
}
for _, res := range row {
if res.Error != nil {
return nil, fmt.Errorf("cc-metric-store error while fetching %s: %s", query.Metric, *res.Error)
} }
id := (*int)(nil) id := (*int)(nil)
if res.Query.Type != nil { if query.Type != nil {
id = new(int) id = new(int)
*id, _ = strconv.Atoi(res.Query.TypeIds[0]) *id = query.TypeIds[0]
} }
if res.Avg.IsNaN() || res.Min.IsNaN() || res.Max.IsNaN() { if res.Avg.IsNaN() || res.Min.IsNaN() || res.Max.IsNaN() {
@ -183,7 +151,7 @@ func (ccms *CCMetricStore) LoadData(job *schema.Job, metrics []string, scopes []
} }
jobMetric.Series = append(jobMetric.Series, schema.Series{ jobMetric.Series = append(jobMetric.Series, schema.Series{
Hostname: res.Query.Hostname, Hostname: query.Hostname,
Id: id, Id: id,
Statistics: &schema.MetricStatistics{ Statistics: &schema.MetricStatistics{
Avg: float64(res.Avg), Avg: float64(res.Avg),
@ -193,13 +161,14 @@ func (ccms *CCMetricStore) LoadData(job *schema.Job, metrics []string, scopes []
Data: res.Data, Data: res.Data,
}) })
} }
}
return jobData, nil return jobData, nil
} }
var ( var (
hwthreadString = string("cpu") // TODO/FIXME: inconsistency between cc-metric-collector and ClusterCockpit hwthreadString = string("cpu") // TODO/FIXME: inconsistency between cc-metric-collector and ClusterCockpit
// coreString = string(schema.MetricScopeCore) coreString = string(schema.MetricScopeCore)
socketString = string(schema.MetricScopeSocket) socketString = string(schema.MetricScopeSocket)
acceleratorString = string(schema.MetricScopeAccelerator) acceleratorString = string(schema.MetricScopeAccelerator)
) )
@ -239,15 +208,14 @@ func (ccms *CCMetricStore) buildQueries(job *schema.Job, metrics []string, scope
// Accelerator -> Accelerator (Use "accelerator" scope if requested scope is lower than node) // Accelerator -> Accelerator (Use "accelerator" scope if requested scope is lower than node)
if nativeScope == schema.MetricScopeAccelerator && scope.LT(schema.MetricScopeNode) { if nativeScope == schema.MetricScopeAccelerator && scope.LT(schema.MetricScopeNode) {
for _, accel := range host.Accelerators {
queries = append(queries, ApiQuery{ queries = append(queries, ApiQuery{
Metric: metric, Metric: metric,
Hostname: host.Hostname, Hostname: host.Hostname,
Aggregate: false,
Type: &acceleratorString, Type: &acceleratorString,
TypeIds: []string{strconv.Itoa(accel)}, TypeIds: host.Accelerators,
}) })
assignedScope = append(assignedScope, schema.MetricScopeAccelerator) assignedScope = append(assignedScope, schema.MetricScopeAccelerator)
}
continue continue
} }
@ -260,24 +228,24 @@ func (ccms *CCMetricStore) buildQueries(job *schema.Job, metrics []string, scope
queries = append(queries, ApiQuery{ queries = append(queries, ApiQuery{
Metric: metric, Metric: metric,
Hostname: host.Hostname, Hostname: host.Hostname,
Aggregate: true,
Type: &acceleratorString, Type: &acceleratorString,
TypeIds: toStringSlice(host.Accelerators), TypeIds: host.Accelerators,
}) })
assignedScope = append(assignedScope, schema.MetricScopeNode) assignedScope = append(assignedScope, scope)
continue continue
} }
// HWThread -> HWThead // HWThread -> HWThead
if nativeScope == schema.MetricScopeHWThread && scope == schema.MetricScopeHWThread { if nativeScope == schema.MetricScopeHWThread && scope == schema.MetricScopeHWThread {
for _, hwthread := range hwthreads {
queries = append(queries, ApiQuery{ queries = append(queries, ApiQuery{
Metric: metric, Metric: metric,
Hostname: host.Hostname, Hostname: host.Hostname,
Aggregate: false,
Type: &hwthreadString, Type: &hwthreadString,
TypeIds: []string{strconv.Itoa(hwthread)}, TypeIds: hwthreads,
}) })
assignedScope = append(assignedScope, schema.MetricScopeHWThread) assignedScope = append(assignedScope, scope)
}
continue continue
} }
@ -288,10 +256,11 @@ func (ccms *CCMetricStore) buildQueries(job *schema.Job, metrics []string, scope
queries = append(queries, ApiQuery{ queries = append(queries, ApiQuery{
Metric: metric, Metric: metric,
Hostname: host.Hostname, Hostname: host.Hostname,
Aggregate: true,
Type: &hwthreadString, Type: &hwthreadString,
TypeIds: toStringSlice(topology.Core[core]), TypeIds: topology.Core[core],
}) })
assignedScope = append(assignedScope, schema.MetricScopeCore) assignedScope = append(assignedScope, scope)
} }
continue continue
} }
@ -303,10 +272,11 @@ func (ccms *CCMetricStore) buildQueries(job *schema.Job, metrics []string, scope
queries = append(queries, ApiQuery{ queries = append(queries, ApiQuery{
Metric: metric, Metric: metric,
Hostname: host.Hostname, Hostname: host.Hostname,
Aggregate: true,
Type: &hwthreadString, Type: &hwthreadString,
TypeIds: toStringSlice(topology.Socket[socket]), TypeIds: topology.Socket[socket],
}) })
assignedScope = append(assignedScope, schema.MetricScopeSocket) assignedScope = append(assignedScope, scope)
} }
continue continue
} }
@ -316,25 +286,53 @@ func (ccms *CCMetricStore) buildQueries(job *schema.Job, metrics []string, scope
queries = append(queries, ApiQuery{ queries = append(queries, ApiQuery{
Metric: metric, Metric: metric,
Hostname: host.Hostname, Hostname: host.Hostname,
Aggregate: true,
Type: &hwthreadString, Type: &hwthreadString,
TypeIds: toStringSlice(hwthreads), TypeIds: hwthreads,
}) })
assignedScope = append(assignedScope, schema.MetricScopeNode) assignedScope = append(assignedScope, scope)
continue
}
// Core -> Core
if nativeScope == schema.MetricScopeCore && scope == schema.MetricScopeCore {
cores, _ := topology.GetCoresFromHWThreads(hwthreads)
queries = append(queries, ApiQuery{
Metric: metric,
Hostname: host.Hostname,
Aggregate: false,
Type: &coreString,
TypeIds: cores,
})
assignedScope = append(assignedScope, scope)
continue
}
// Core -> Node
if nativeScope == schema.MetricScopeCore && scope == schema.MetricScopeNode {
cores, _ := topology.GetCoresFromHWThreads(hwthreads)
queries = append(queries, ApiQuery{
Metric: metric,
Hostname: host.Hostname,
Aggregate: true,
Type: &coreString,
TypeIds: cores,
})
assignedScope = append(assignedScope, scope)
continue continue
} }
// Socket -> Socket // Socket -> Socket
if nativeScope == schema.MetricScopeSocket && scope == schema.MetricScopeSocket { if nativeScope == schema.MetricScopeSocket && scope == schema.MetricScopeSocket {
sockets, _ := topology.GetSocketsFromHWThreads(hwthreads) sockets, _ := topology.GetSocketsFromHWThreads(hwthreads)
for _, socket := range sockets {
queries = append(queries, ApiQuery{ queries = append(queries, ApiQuery{
Metric: metric, Metric: metric,
Hostname: host.Hostname, Hostname: host.Hostname,
Type: &acceleratorString, Aggregate: false,
TypeIds: []string{strconv.Itoa(socket)}, Type: &socketString,
TypeIds: sockets,
}) })
assignedScope = append(assignedScope, schema.MetricScopeSocket) assignedScope = append(assignedScope, scope)
}
continue continue
} }
@ -344,10 +342,11 @@ func (ccms *CCMetricStore) buildQueries(job *schema.Job, metrics []string, scope
queries = append(queries, ApiQuery{ queries = append(queries, ApiQuery{
Metric: metric, Metric: metric,
Hostname: host.Hostname, Hostname: host.Hostname,
Aggregate: true,
Type: &socketString, Type: &socketString,
TypeIds: toStringSlice(sockets), TypeIds: sockets,
}) })
assignedScope = append(assignedScope, schema.MetricScopeNode) assignedScope = append(assignedScope, scope)
continue continue
} }
@ -357,7 +356,7 @@ func (ccms *CCMetricStore) buildQueries(job *schema.Job, metrics []string, scope
Metric: metric, Metric: metric,
Hostname: host.Hostname, Hostname: host.Hostname,
}) })
assignedScope = append(assignedScope, schema.MetricScopeNode) assignedScope = append(assignedScope, scope)
continue continue
} }
@ -369,121 +368,96 @@ func (ccms *CCMetricStore) buildQueries(job *schema.Job, metrics []string, scope
return queries, assignedScope, nil return queries, assignedScope, nil
} }
func toStringSlice(s []int) []string {
ret := make([]string, len(s))
for i, val := range s {
ret[i] = strconv.Itoa(val)
}
return ret
}
func (ccms *CCMetricStore) LoadStats(job *schema.Job, metrics []string, ctx context.Context) (map[string]map[string]schema.MetricStatistics, error) { func (ccms *CCMetricStore) LoadStats(job *schema.Job, metrics []string, ctx context.Context) (map[string]map[string]schema.MetricStatistics, error) {
res, err := ccms.doRequest(job, "stats", metrics, ctx) queries, _, err := ccms.buildQueries(job, metrics, []schema.MetricScope{schema.MetricScopeNode})
if err != nil { if err != nil {
return nil, err return nil, err
} }
resdata := make([]map[string]ApiStatsData, 0, len(job.Resources)) req := ApiQueryRequest{
if err := json.NewDecoder(res.Body).Decode(&resdata); err != nil { Cluster: job.Cluster,
From: job.StartTime.Unix(),
To: job.StartTime.Add(time.Duration(job.Duration) * time.Second).Unix(),
Queries: queries,
WithStats: true,
WithData: false,
}
resBody, err := ccms.doRequest(ctx, &req)
if err != nil {
return nil, err return nil, err
} }
stats := map[string]map[string]schema.MetricStatistics{} stats := make(map[string]map[string]schema.MetricStatistics, len(metrics))
for _, metric := range metrics { for i, res := range resBody {
nodestats := map[string]schema.MetricStatistics{} query := req.Queries[i]
for i, node := range job.Resources { data := res[0]
if node.Accelerators != nil || node.HWThreads != nil {
// TODO/FIXME:
return nil, errors.New("todo: cc-metric-store resources: Accelerator/HWThreads")
}
data := resdata[i][metric]
if data.Error != nil { if data.Error != nil {
return nil, errors.New(*data.Error) return nil, fmt.Errorf("fetching %s for node %s failed: %s", query.Metric, query.Hostname, *data.Error)
} }
if data.Samples == 0 { metricdata, ok := stats[query.Metric]
return nil, fmt.Errorf("no data for node '%s' and metric '%s'", node.Hostname, metric) if !ok {
metricdata = make(map[string]schema.MetricStatistics, job.NumNodes)
stats[query.Metric] = metricdata
} }
nodestats[node.Hostname] = schema.MetricStatistics{ if data.Avg.IsNaN() || data.Min.IsNaN() || data.Max.IsNaN() {
return nil, fmt.Errorf("fetching %s for node %s failed: %s", query.Metric, query.Hostname, "avg/min/max is NaN")
}
metricdata[query.Hostname] = schema.MetricStatistics{
Avg: float64(data.Avg), Avg: float64(data.Avg),
Min: float64(data.Min), Min: float64(data.Min),
Max: float64(data.Max), Max: float64(data.Max),
} }
} }
stats[metric] = nodestats
}
return stats, nil return stats, nil
} }
func (ccms *CCMetricStore) LoadNodeData(clusterId string, metrics, nodes []string, from, to int64, ctx context.Context) (map[string]map[string][]schema.Float, error) { func (ccms *CCMetricStore) LoadNodeData(clusterId string, metrics, nodes []string, from, to int64, ctx context.Context) (map[string]map[string][]schema.Float, error) {
reqBody := ApiRequestBody{} req := ApiQueryRequest{
reqBody.Metrics = metrics Cluster: clusterId,
From: from,
To: to,
WithStats: false,
WithData: true,
}
if nodes == nil {
req.ForAllNodes = metrics
} else {
for _, node := range nodes { for _, node := range nodes {
reqBody.Selectors = append(reqBody.Selectors, []string{clusterId, node}) for _, metric := range metrics {
req.Queries = append(req.Queries, ApiQuery{
Hostname: node,
Metric: metric,
})
}
}
} }
reqBodyBytes, err := json.Marshal(reqBody) resBody, err := ccms.doRequest(ctx, &req)
if err != nil { if err != nil {
return nil, err return nil, err
} }
var req *http.Request data := make(map[string]map[string][]schema.Float)
if nodes == nil { for i, res := range resBody {
req, err = http.NewRequestWithContext(ctx, http.MethodPost, fmt.Sprintf("%s/api/%s/%d/%d/all-nodes", ccms.url, clusterId, from, to), bytes.NewReader(reqBodyBytes)) query := req.Queries[i]
} else { qdata := res[0]
req, err = http.NewRequestWithContext(ctx, http.MethodPost, fmt.Sprintf("%s/api/%d/%d/timeseries", ccms.url, from, to), bytes.NewReader(reqBodyBytes)) if qdata.Error != nil {
} return nil, fmt.Errorf("fetching %s for node %s failed: %s", query.Metric, query.Hostname, *qdata.Error)
if err != nil {
return nil, err
}
if ccms.jwt != "" {
req.Header.Add("Authorization", fmt.Sprintf("Bearer %s", ccms.jwt))
}
res, err := ccms.client.Do(req)
if err != nil {
return nil, err
} }
data := map[string]map[string][]schema.Float{} nodedata, ok := data[query.Hostname]
if nodes == nil { if !ok {
resdata := map[string]map[string]ApiMetricData{} nodedata = make(map[string][]schema.Float)
if err := json.NewDecoder(res.Body).Decode(&resdata); err != nil { data[query.Hostname] = nodedata
return nil, err
} }
for node, metrics := range resdata { nodedata[query.Metric] = qdata.Data
nodedata := map[string][]schema.Float{}
for metric, data := range metrics {
if data.Error != nil {
return nil, errors.New(*data.Error)
}
nodedata[metric] = data.Data
}
data[node] = nodedata
}
} else {
resdata := make([]map[string]ApiMetricData, 0, len(nodes))
if err := json.NewDecoder(res.Body).Decode(&resdata); err != nil {
return nil, err
}
for i, node := range nodes {
metricsData := map[string][]schema.Float{}
for metric, data := range resdata[i] {
if data.Error != nil {
return nil, errors.New(*data.Error)
}
metricsData[metric] = data.Data
}
data[node] = metricsData
}
} }
return data, nil return data, nil