mirror of
https://github.com/ClusterCockpit/cc-backend
synced 2024-12-27 05:49:04 +01:00
start work on supporting metrics with a scope of hwthread
This commit is contained in:
parent
3f88e512f0
commit
e581bfc70f
@ -156,6 +156,19 @@ func GetClusterConfig(cluster string) *model.Cluster {
|
|||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func GetPartition(cluster, partition string) *model.Partition {
|
||||||
|
for _, c := range Clusters {
|
||||||
|
if c.Name == cluster {
|
||||||
|
for _, p := range c.Partitions {
|
||||||
|
if p.Name == partition {
|
||||||
|
return p
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
func GetMetricConfig(cluster, metric string) *model.MetricConfig {
|
func GetMetricConfig(cluster, metric string) *model.MetricConfig {
|
||||||
for _, c := range Clusters {
|
for _, c := range Clusters {
|
||||||
if c.Name == cluster {
|
if c.Name == cluster {
|
||||||
|
@ -136,11 +136,18 @@ func ArchiveJob(job *schema.Job, ctx context.Context) (*schema.JobMeta, error) {
|
|||||||
for _, mc := range metricConfigs {
|
for _, mc := range metricConfigs {
|
||||||
allMetrics = append(allMetrics, mc.Name)
|
allMetrics = append(allMetrics, mc.Name)
|
||||||
}
|
}
|
||||||
jobData, err := LoadData(job, allMetrics, ctx)
|
|
||||||
|
// TODO: Use more granular resolution on non-exclusive jobs?
|
||||||
|
scopes := []schema.MetricScope{schema.MetricScopeNode}
|
||||||
|
jobData, err := LoadData(job, allMetrics, scopes, ctx)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return nil, err
|
return nil, err
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if err := calcStatisticsSeries(job, jobData); err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
|
||||||
jobMeta := &schema.JobMeta{
|
jobMeta := &schema.JobMeta{
|
||||||
BaseJob: job.BaseJob,
|
BaseJob: job.BaseJob,
|
||||||
StartTime: job.StartTime.Unix(),
|
StartTime: job.StartTime.Unix(),
|
||||||
@ -212,3 +219,51 @@ func ArchiveJob(job *schema.Job, ctx context.Context) (*schema.JobMeta, error) {
|
|||||||
|
|
||||||
return jobMeta, f.Close()
|
return jobMeta, f.Close()
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Add statisticsSeries fields
|
||||||
|
func calcStatisticsSeries(job *schema.Job, jobData schema.JobData) error {
|
||||||
|
for _, scopes := range jobData {
|
||||||
|
for _, jobMetric := range scopes {
|
||||||
|
if jobMetric.StatisticsSeries != nil {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
|
||||||
|
if len(jobMetric.Series) < 5 {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
|
||||||
|
n := 0
|
||||||
|
for _, series := range jobMetric.Series {
|
||||||
|
if len(series.Data) > n {
|
||||||
|
n = len(series.Data)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
mean, min, max := make([]schema.Float, n), make([]schema.Float, n), make([]schema.Float, n)
|
||||||
|
for i := 0; i < n; i++ {
|
||||||
|
sum, smin, smax := schema.Float(0.), math.MaxFloat32, -math.MaxFloat32
|
||||||
|
for _, series := range jobMetric.Series {
|
||||||
|
if len(series.Data) >= i {
|
||||||
|
sum, smin, smax = schema.NaN, math.NaN(), math.NaN()
|
||||||
|
break
|
||||||
|
}
|
||||||
|
x := series.Data[i]
|
||||||
|
sum += x
|
||||||
|
smin = math.Min(smin, float64(x))
|
||||||
|
smax = math.Max(smax, float64(x))
|
||||||
|
}
|
||||||
|
sum /= schema.Float(len(jobMetric.Series))
|
||||||
|
mean[i] = sum
|
||||||
|
min[i] = schema.Float(smin)
|
||||||
|
max[i] = schema.Float(smax)
|
||||||
|
}
|
||||||
|
|
||||||
|
jobMetric.StatisticsSeries.Mean = mean
|
||||||
|
jobMetric.StatisticsSeries.Min = min
|
||||||
|
jobMetric.StatisticsSeries.Max = max
|
||||||
|
jobMetric.Series = nil
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
@ -1,12 +1,14 @@
|
|||||||
package metricdata
|
package metricdata
|
||||||
|
|
||||||
import (
|
import (
|
||||||
|
"bufio"
|
||||||
"bytes"
|
"bytes"
|
||||||
"context"
|
"context"
|
||||||
"encoding/json"
|
"encoding/json"
|
||||||
"errors"
|
"errors"
|
||||||
"fmt"
|
"fmt"
|
||||||
"net/http"
|
"net/http"
|
||||||
|
"strconv"
|
||||||
"time"
|
"time"
|
||||||
|
|
||||||
"github.com/ClusterCockpit/cc-jobarchive/config"
|
"github.com/ClusterCockpit/cc-jobarchive/config"
|
||||||
@ -29,9 +31,9 @@ type ApiMetricData struct {
|
|||||||
From int64 `json:"from"`
|
From int64 `json:"from"`
|
||||||
To int64 `json:"to"`
|
To int64 `json:"to"`
|
||||||
Data []schema.Float `json:"data"`
|
Data []schema.Float `json:"data"`
|
||||||
Avg *float64 `json:"avg"`
|
Avg schema.Float `json:"avg"`
|
||||||
Min *float64 `json:"min"`
|
Min schema.Float `json:"min"`
|
||||||
Max *float64 `json:"max"`
|
Max schema.Float `json:"max"`
|
||||||
}
|
}
|
||||||
|
|
||||||
type ApiStatsData struct {
|
type ApiStatsData struct {
|
||||||
@ -78,54 +80,176 @@ func (ccms *CCMetricStore) doRequest(job *schema.Job, suffix string, metrics []s
|
|||||||
return ccms.client.Do(req)
|
return ccms.client.Do(req)
|
||||||
}
|
}
|
||||||
|
|
||||||
func (ccms *CCMetricStore) LoadData(job *schema.Job, metrics []string, ctx context.Context) (schema.JobData, error) {
|
func (ccms *CCMetricStore) LoadData(job *schema.Job, metrics []string, scopes []schema.MetricScope, ctx context.Context) (schema.JobData, error) {
|
||||||
res, err := ccms.doRequest(job, "timeseries?with-stats=true", metrics, ctx)
|
|
||||||
if err != nil {
|
type ApiQuery struct {
|
||||||
|
Metric string `json:"metric"`
|
||||||
|
Hostname string `json:"hostname"`
|
||||||
|
Type *string `json:"type,omitempty"`
|
||||||
|
TypeIds []string `json:"type-ids,omitempty"`
|
||||||
|
SubType *string `json:"subtype,omitempty"`
|
||||||
|
SubTypeIds []string `json:"subtype-ids,omitempty"`
|
||||||
|
}
|
||||||
|
|
||||||
|
type ApiQueryRequest struct {
|
||||||
|
Cluster string `json:"cluster"`
|
||||||
|
From int64 `json:"from"`
|
||||||
|
To int64 `json:"to"`
|
||||||
|
Queries []ApiQuery `json:"queries"`
|
||||||
|
}
|
||||||
|
|
||||||
|
type ApiQueryResponse struct {
|
||||||
|
ApiMetricData
|
||||||
|
Query *ApiQuery `json:"query"`
|
||||||
|
}
|
||||||
|
|
||||||
|
reqBody := ApiQueryRequest{
|
||||||
|
Cluster: job.Cluster,
|
||||||
|
From: job.StartTime.Unix(),
|
||||||
|
To: job.StartTime.Add(time.Duration(job.Duration)).Unix(),
|
||||||
|
Queries: make([]ApiQuery, 0),
|
||||||
|
}
|
||||||
|
|
||||||
|
if len(scopes) != 1 {
|
||||||
|
return nil, errors.New("todo: support more than one scope in a query")
|
||||||
|
}
|
||||||
|
|
||||||
|
topology := config.GetPartition(job.Cluster, job.Partition).Topology
|
||||||
|
scopeForMetric := map[string]schema.MetricScope{}
|
||||||
|
for _, metric := range metrics {
|
||||||
|
mc := config.GetMetricConfig(job.Cluster, metric)
|
||||||
|
nativeScope, requestedScope := mc.Scope, scopes[0]
|
||||||
|
|
||||||
|
// case 1: A metric is requested at node scope with a native scope of node as well
|
||||||
|
// case 2: A metric is requested at node scope and node is exclusive
|
||||||
|
if (nativeScope == requestedScope && nativeScope == schema.MetricScopeNode) ||
|
||||||
|
(job.Exclusive == 1 && requestedScope == schema.MetricScopeNode) {
|
||||||
|
nodes := map[string]bool{}
|
||||||
|
for _, resource := range job.Resources {
|
||||||
|
nodes[resource.Hostname] = true
|
||||||
|
}
|
||||||
|
|
||||||
|
for node := range nodes {
|
||||||
|
reqBody.Queries = append(reqBody.Queries, ApiQuery{
|
||||||
|
Metric: metric,
|
||||||
|
Hostname: node,
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
scopeForMetric[metric] = schema.MetricScopeNode
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
|
||||||
|
// case: Read a metric at hwthread scope with native scope hwthread
|
||||||
|
if nativeScope == requestedScope && nativeScope == schema.MetricScopeHWThread && job.NumNodes == 1 {
|
||||||
|
hwthreads := job.Resources[0].HWThreads
|
||||||
|
if hwthreads == nil {
|
||||||
|
hwthreads = topology.Node
|
||||||
|
}
|
||||||
|
|
||||||
|
t := "cpu" // TODO/FIXME: inconsistency between cc-metric-collector and ClusterCockpit
|
||||||
|
for _, hwthread := range hwthreads {
|
||||||
|
reqBody.Queries = append(reqBody.Queries, ApiQuery{
|
||||||
|
Metric: metric,
|
||||||
|
Hostname: job.Resources[0].Hostname,
|
||||||
|
Type: &t,
|
||||||
|
TypeIds: []string{strconv.Itoa(hwthread)},
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
scopeForMetric[metric] = schema.MetricScopeHWThread
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
|
||||||
|
// case: A metric is requested at node scope, has a hwthread scope and node is not exclusive and runs on a single node
|
||||||
|
if requestedScope == schema.MetricScopeNode && nativeScope == schema.MetricScopeHWThread && job.Exclusive != 1 && job.NumNodes == 1 {
|
||||||
|
hwthreads := job.Resources[0].HWThreads
|
||||||
|
if hwthreads == nil {
|
||||||
|
hwthreads = topology.Node
|
||||||
|
}
|
||||||
|
|
||||||
|
t := "cpu" // TODO/FIXME: inconsistency between cc-metric-collector and ClusterCockpit
|
||||||
|
ids := make([]string, 0, len(hwthreads))
|
||||||
|
for _, hwthread := range hwthreads {
|
||||||
|
ids = append(ids, strconv.Itoa(hwthread))
|
||||||
|
}
|
||||||
|
|
||||||
|
reqBody.Queries = append(reqBody.Queries, ApiQuery{
|
||||||
|
Metric: metric,
|
||||||
|
Hostname: job.Resources[0].Hostname,
|
||||||
|
Type: &t,
|
||||||
|
TypeIds: ids,
|
||||||
|
})
|
||||||
|
scopeForMetric[metric] = schema.MetricScopeNode
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
|
||||||
|
// TODO: Job teilt sich knoten und metric native scope ist kleiner als node
|
||||||
|
panic("todo")
|
||||||
|
}
|
||||||
|
|
||||||
|
buf := &bytes.Buffer{}
|
||||||
|
if err := json.NewEncoder(buf).Encode(reqBody); err != nil {
|
||||||
return nil, err
|
return nil, err
|
||||||
}
|
}
|
||||||
|
|
||||||
resdata := make([]map[string]ApiMetricData, 0, len(job.Resources))
|
req, err := http.NewRequestWithContext(ctx, http.MethodPost, ccms.url+"/api/query", buf)
|
||||||
if err := json.NewDecoder(res.Body).Decode(&resdata); err != nil {
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
if ccms.jwt != "" {
|
||||||
|
req.Header.Add("Authorization", fmt.Sprintf("Bearer %s", ccms.jwt))
|
||||||
|
}
|
||||||
|
res, err := ccms.client.Do(req)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
if res.StatusCode != http.StatusOK {
|
||||||
|
return nil, fmt.Errorf("cc-metric-store replied with: %s", res.Status)
|
||||||
|
}
|
||||||
|
|
||||||
|
var resBody []ApiQueryResponse
|
||||||
|
if err := json.NewDecoder(bufio.NewReader(res.Body)).Decode(&resBody); err != nil {
|
||||||
return nil, err
|
return nil, err
|
||||||
}
|
}
|
||||||
|
|
||||||
var jobData schema.JobData = make(schema.JobData)
|
var jobData schema.JobData = make(schema.JobData)
|
||||||
for _, metric := range metrics {
|
for _, res := range resBody {
|
||||||
|
metric := res.Query.Metric
|
||||||
|
if res.Error != nil {
|
||||||
|
return nil, fmt.Errorf("cc-metric-store error while fetching %s: %s", metric, *res.Error)
|
||||||
|
}
|
||||||
|
|
||||||
mc := config.GetMetricConfig(job.Cluster, metric)
|
mc := config.GetMetricConfig(job.Cluster, metric)
|
||||||
metricData := &schema.JobMetric{
|
scope := scopeForMetric[metric]
|
||||||
Scope: "node", // TODO: FIXME: Whatever...
|
jobMetric, ok := jobData[metric][scope]
|
||||||
|
if !ok {
|
||||||
|
jobMetric = &schema.JobMetric{
|
||||||
Unit: mc.Unit,
|
Unit: mc.Unit,
|
||||||
|
Scope: scope,
|
||||||
Timestep: mc.Timestep,
|
Timestep: mc.Timestep,
|
||||||
Series: make([]schema.Series, 0, len(job.Resources)),
|
Series: make([]schema.Series, 0),
|
||||||
|
}
|
||||||
|
jobData[metric][scope] = jobMetric
|
||||||
}
|
}
|
||||||
|
|
||||||
for i, node := range job.Resources {
|
id := (*int)(nil)
|
||||||
if node.Accelerators != nil || node.HWThreads != nil {
|
if res.Query.Type != nil {
|
||||||
// TODO/FIXME:
|
id = new(int)
|
||||||
return nil, errors.New("todo: cc-metric-store resources: Accelerator/HWThreads")
|
*id, _ = strconv.Atoi(res.Query.TypeIds[0])
|
||||||
}
|
}
|
||||||
|
|
||||||
data := resdata[i][metric]
|
jobMetric.Series = append(jobMetric.Series, schema.Series{
|
||||||
if data.Error != nil {
|
Hostname: res.Query.Hostname,
|
||||||
return nil, errors.New(*data.Error)
|
Id: id,
|
||||||
}
|
|
||||||
|
|
||||||
if data.Avg == nil || data.Min == nil || data.Max == nil {
|
|
||||||
return nil, fmt.Errorf("no data for node '%s' and metric '%s'", node.Hostname, metric)
|
|
||||||
}
|
|
||||||
|
|
||||||
metricData.Series = append(metricData.Series, schema.Series{
|
|
||||||
Hostname: node.Hostname,
|
|
||||||
Data: data.Data,
|
|
||||||
Statistics: &schema.MetricStatistics{
|
Statistics: &schema.MetricStatistics{
|
||||||
Avg: *data.Avg,
|
Avg: float64(res.Avg),
|
||||||
Min: *data.Min,
|
Min: float64(res.Min),
|
||||||
Max: *data.Max,
|
Max: float64(res.Max),
|
||||||
},
|
},
|
||||||
|
Data: res.Data,
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
jobData[metric] = map[string]*schema.JobMetric{"node": metricData}
|
|
||||||
}
|
|
||||||
|
|
||||||
return jobData, nil
|
return jobData, nil
|
||||||
}
|
}
|
||||||
|
@ -14,7 +14,7 @@ type MetricDataRepository interface {
|
|||||||
Init(url, token string) error
|
Init(url, token string) error
|
||||||
|
|
||||||
// Return the JobData for the given job, only with the requested metrics.
|
// Return the JobData for the given job, only with the requested metrics.
|
||||||
LoadData(job *schema.Job, metrics []string, ctx context.Context) (schema.JobData, error)
|
LoadData(job *schema.Job, metrics []string, scopes []schema.MetricScope, ctx context.Context) (schema.JobData, error)
|
||||||
|
|
||||||
// Return a map of metrics to a map of nodes to the metric statistics of the job.
|
// Return a map of metrics to a map of nodes to the metric statistics of the job.
|
||||||
LoadStats(job *schema.Job, metrics []string, ctx context.Context) (map[string]map[string]schema.MetricStatistics, error)
|
LoadStats(job *schema.Job, metrics []string, ctx context.Context) (map[string]map[string]schema.MetricStatistics, error)
|
||||||
@ -56,14 +56,14 @@ func Init(jobArchivePath string, disableArchive bool) error {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Fetches the metric data for a job.
|
// Fetches the metric data for a job.
|
||||||
func LoadData(job *schema.Job, metrics []string, ctx context.Context) (schema.JobData, error) {
|
func LoadData(job *schema.Job, metrics []string, scopes []schema.MetricScope, ctx context.Context) (schema.JobData, error) {
|
||||||
if job.State == schema.JobStateRunning || !useArchive {
|
if job.State == schema.JobStateRunning || !useArchive {
|
||||||
repo, ok := metricDataRepos[job.Cluster]
|
repo, ok := metricDataRepos[job.Cluster]
|
||||||
if !ok {
|
if !ok {
|
||||||
return nil, fmt.Errorf("no metric data repository configured for '%s'", job.Cluster)
|
return nil, fmt.Errorf("no metric data repository configured for '%s'", job.Cluster)
|
||||||
}
|
}
|
||||||
|
|
||||||
return repo.LoadData(job, metrics, ctx)
|
return repo.LoadData(job, metrics, scopes, ctx)
|
||||||
}
|
}
|
||||||
|
|
||||||
data, err := loadFromArchive(job)
|
data, err := loadFromArchive(job)
|
||||||
|
Loading…
Reference in New Issue
Block a user