From 98c4de65a727990a022869978079ea89761c486c Mon Sep 17 00:00:00 2001
From: Lou Knauer
Date: Wed, 12 Jan 2022 13:03:01 +0100
Subject: [PATCH] work on socket scoped metrics

---
 graph/model/models.go         |  28 ++++
 metricdata/archive.go         |   6 +-
 metricdata/cc-metric-store.go | 255 ++++++++++++++++++++--------------
 metricdata/metricdata.go      |   5 +-
 schema/metrics.go             |  19 +--
 5 files changed, 198 insertions(+), 115 deletions(-)

diff --git a/graph/model/models.go b/graph/model/models.go
index 96f0f7e..1705696 100644
--- a/graph/model/models.go
+++ b/graph/model/models.go
@@ -15,3 +15,31 @@ type MetricDataRepository struct {
 	Url   string `json:"url"`
 	Token string `json:"token"`
 }
+
+// GetSockets returns the socket IDs that the given hwthread IDs belong to.
+// A socket is included even if only one of its hwthreads appears in the list.
+// The second return value is true if the returned sockets contain no hwthreads
+// other than those in the argument list, i.e. the hwthreads cover them exclusively.
+// TODO: Optimize this, there must be a more efficient way/algorithm.
+func (topo *Topology) GetSockets(hwthreads []int) (sockets []int, exclusive bool) {
+	socketsMap := map[int]int{}
+	for _, hwthread := range hwthreads {
+		for socket, hwthreadsInSocket := range topo.Socket {
+			for _, hwthreadInSocket := range hwthreadsInSocket {
+				if hwthread == hwthreadInSocket {
+					socketsMap[socket] += 1
+				}
+			}
+		}
+	}
+
+	exclusive = true
+	hwthreadsPerSocket := len(topo.Node) / len(topo.Socket)
+	sockets = make([]int, 0, len(socketsMap))
+	for socket, count := range socketsMap {
+		sockets = append(sockets, socket)
+		exclusive = exclusive && count == hwthreadsPerSocket
+	}
+
+	return sockets, exclusive
+}
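
For illustration, a minimal stand-alone sketch (not part of the patch) of what GetSockets is expected to return. The Topology stand-in below only models the two fields the method relies on, and the topology values and hwthread lists are made up:

	package main

	import "fmt"

	// Hypothetical stand-in for the Topology type from graph/model; only the
	// fields GetSockets uses are modelled here.
	type Topology struct {
		Node   []int   // all hwthread IDs of the node
		Socket [][]int // hwthread IDs grouped by socket
	}

	// Same algorithm as in the patch, copied so the sketch runs on its own.
	func (topo *Topology) GetSockets(hwthreads []int) (sockets []int, exclusive bool) {
		socketsMap := map[int]int{}
		for _, hwthread := range hwthreads {
			for socket, hwthreadsInSocket := range topo.Socket {
				for _, hwthreadInSocket := range hwthreadsInSocket {
					if hwthread == hwthreadInSocket {
						socketsMap[socket] += 1
					}
				}
			}
		}

		exclusive = true
		hwthreadsPerSocket := len(topo.Node) / len(topo.Socket)
		sockets = make([]int, 0, len(socketsMap))
		for socket, count := range socketsMap {
			sockets = append(sockets, socket)
			exclusive = exclusive && count == hwthreadsPerSocket
		}
		return sockets, exclusive
	}

	func main() {
		topo := &Topology{
			Node:   []int{0, 1, 2, 3, 4, 5, 6, 7},
			Socket: [][]int{{0, 1, 2, 3}, {4, 5, 6, 7}},
		}

		fmt.Println(topo.GetSockets([]int{0, 1, 2, 3})) // [0] true: socket 0 is fully covered
		fmt.Println(topo.GetSockets([]int{0, 5}))       // both sockets, not exclusive (socket order may vary)
	}

The nested loops scan the whole topology once per hwthread, which is what the TODO in the doc comment refers to.
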
diff --git a/metricdata/archive.go b/metricdata/archive.go
index 53f87b1..d894c4e 100644
--- a/metricdata/archive.go
+++ b/metricdata/archive.go
@@ -144,7 +144,7 @@ func ArchiveJob(job *schema.Job, ctx context.Context) (*schema.JobMeta, error) {
 		return nil, err
 	}
 
-	if err := calcStatisticsSeries(job, jobData); err != nil {
+	if err := calcStatisticsSeries(job, jobData, 7); err != nil {
 		return nil, err
 	}
 
@@ -221,14 +221,14 @@ func ArchiveJob(job *schema.Job, ctx context.Context) (*schema.JobMeta, error) {
 }
 
 // Add statisticsSeries fields
-func calcStatisticsSeries(job *schema.Job, jobData schema.JobData) error {
+func calcStatisticsSeries(job *schema.Job, jobData schema.JobData, maxSeries int) error {
 	for _, scopes := range jobData {
 		for _, jobMetric := range scopes {
 			if jobMetric.StatisticsSeries != nil {
 				continue
 			}
 
-			if len(jobMetric.Series) < 5 {
+			if len(jobMetric.Series) <= maxSeries {
 				continue
 			}
 
diff --git a/metricdata/cc-metric-store.go b/metricdata/cc-metric-store.go
index 28a0069..22018f9 100644
--- a/metricdata/cc-metric-store.go
+++ b/metricdata/cc-metric-store.go
@@ -7,7 +7,6 @@ import (
 	"encoding/json"
 	"errors"
 	"fmt"
-	"log"
 	"net/http"
 	"strconv"
 	"time"
@@ -27,6 +26,15 @@ type ApiRequestBody struct {
 	Selectors [][]string `json:"selectors"`
 }
 
+type ApiQuery struct {
+	Metric     string   `json:"metric"`
+	Hostname   string   `json:"hostname"`
+	Type       *string  `json:"type,omitempty"`
+	TypeIds    []string `json:"type-ids,omitempty"`
+	SubType    *string  `json:"subtype,omitempty"`
+	SubTypeIds []string `json:"subtype-ids,omitempty"`
+}
+
 type ApiMetricData struct {
 	Error *string `json:"error"`
 	From  int64   `json:"from"`
@@ -50,6 +58,9 @@ type ApiStatsData struct {
 func (ccms *CCMetricStore) Init(url, token string) error {
 	ccms.url = url
 	ccms.jwt = token
+	ccms.client = http.Client{
+		Timeout: 5 * time.Second,
+	}
 
 	return nil
 }
@@ -82,17 +93,6 @@ func (ccms *CCMetricStore) doRequest(job *schema.Job, suffix string, metrics []s
 }
 
 func (ccms *CCMetricStore) LoadData(job *schema.Job, metrics []string, scopes []schema.MetricScope, ctx context.Context) (schema.JobData, error) {
-	// log.Printf("job: %#v", job)
-
-	type ApiQuery struct {
-		Metric     string   `json:"metric"`
-		Hostname   string   `json:"hostname"`
-		Type       *string  `json:"type,omitempty"`
-		TypeIds    []string `json:"type-ids,omitempty"`
-		SubType    *string  `json:"subtype,omitempty"`
-		SubTypeIds []string `json:"subtype-ids,omitempty"`
-	}
-
 	type ApiQueryRequest struct {
 		Cluster string `json:"cluster"`
 		From    int64  `json:"from"`
@@ -105,101 +105,18 @@ func (ccms *CCMetricStore) LoadData(job *schema.Job, metrics []string, scopes []
 		Query   *ApiQuery  `json:"query"`
 	}
 
+	queries, scopeForMetric, err := ccms.buildQueries(job, metrics, scopes)
+	if err != nil {
+		return nil, err
+	}
+
 	reqBody := ApiQueryRequest{
 		Cluster: job.Cluster,
 		From:    job.StartTime.Unix(),
 		To:      job.StartTime.Add(time.Duration(job.Duration) * time.Second).Unix(),
-		Queries: make([]ApiQuery, 0),
+		Queries: queries,
 	}
 
-	if len(scopes) != 1 {
-		return nil, errors.New("todo: support more than one scope in a query")
-	}
-
-	topology := config.GetPartition(job.Cluster, job.Partition).Topology
-	scopeForMetric := map[string]schema.MetricScope{}
-	for _, metric := range metrics {
-		mc := config.GetMetricConfig(job.Cluster, metric)
-		if mc == nil {
-			// return nil, fmt.Errorf("metric '%s' is not specified for cluster '%s'", metric, job.Cluster)
-			log.Printf("metric '%s' is not specified for cluster '%s'", metric, job.Cluster)
-			continue
-		}
-
-		nativeScope, requestedScope := mc.Scope, scopes[0]
-
-		// case 1: A metric is requested at node scope with a native scope of node as well
-		// case 2: A metric is requested at node scope and node is exclusive
-		// case 3: A metric has native scope node
-		if (nativeScope == requestedScope && nativeScope == schema.MetricScopeNode) ||
-			(job.Exclusive == 1 && requestedScope == schema.MetricScopeNode) ||
-			(nativeScope == schema.MetricScopeNode) {
-			nodes := map[string]bool{}
-			for _, resource := range job.Resources {
-				nodes[resource.Hostname] = true
-			}
-
-			for node := range nodes {
-				reqBody.Queries = append(reqBody.Queries, ApiQuery{
-					Metric:   metric,
-					Hostname: node,
-				})
-			}
-
-			scopeForMetric[metric] = schema.MetricScopeNode
-			continue
-		}
-
-		// case: Read a metric at hwthread scope with native scope hwthread
-		if nativeScope == requestedScope && nativeScope == schema.MetricScopeHWThread && job.NumNodes == 1 {
-			hwthreads := job.Resources[0].HWThreads
-			if hwthreads == nil {
-				hwthreads = topology.Node
-			}
-
-			t := "cpu" // TODO/FIXME: inconsistency between cc-metric-collector and ClusterCockpit
-			for _, hwthread := range hwthreads {
-				reqBody.Queries = append(reqBody.Queries, ApiQuery{
-					Metric:   metric,
-					Hostname: job.Resources[0].Hostname,
-					Type:     &t,
-					TypeIds:  []string{strconv.Itoa(hwthread)},
-				})
-			}
-
-			scopeForMetric[metric] = schema.MetricScopeHWThread
-			continue
-		}
-
-		// case: A metric is requested at node scope, has a hwthread scope and node is not exclusive and runs on a single node
-		if requestedScope == schema.MetricScopeNode && nativeScope == schema.MetricScopeHWThread && job.Exclusive != 1 && job.NumNodes == 1 {
-			hwthreads := job.Resources[0].HWThreads
-			if hwthreads == nil {
-				hwthreads = topology.Node
-			}
-
-			t := "cpu" // TODO/FIXME: inconsistency between cc-metric-collector and ClusterCockpit
-			ids := make([]string, 0, len(hwthreads))
-			for _, hwthread := range hwthreads {
-				ids = append(ids, strconv.Itoa(hwthread))
-			}
-
-			reqBody.Queries = append(reqBody.Queries, ApiQuery{
-				Metric:   metric,
-				Hostname: job.Resources[0].Hostname,
-				Type:     &t,
-				TypeIds:  ids,
-			})
-			scopeForMetric[metric] = schema.MetricScopeNode
-			continue
-		}
-
-		// TODO: Job teilt sich knoten und metric native scope ist kleiner als node
-		panic("todo")
-	}
-
-	// log.Printf("query: %#v", reqBody)
-
 	buf := &bytes.Buffer{}
 	if err := json.NewEncoder(buf).Encode(reqBody); err != nil {
 		return nil, err
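
As a reference for the request format, a small stand-alone sketch (not part of the patch) of what a single socket-scoped ApiQuery serializes to. The ApiQuery struct is copied from the hunk above; the metric name "mem_bw" and the hostname "host001" are made up:

	package main

	import (
		"encoding/json"
		"fmt"
	)

	// Copied from the patch so the sketch compiles on its own.
	type ApiQuery struct {
		Metric     string   `json:"metric"`
		Hostname   string   `json:"hostname"`
		Type       *string  `json:"type,omitempty"`
		TypeIds    []string `json:"type-ids,omitempty"`
		SubType    *string  `json:"subtype,omitempty"`
		SubTypeIds []string `json:"subtype-ids,omitempty"`
	}

	func main() {
		socket := "socket"
		q := ApiQuery{
			Metric:   "mem_bw",           // hypothetical metric name
			Hostname: "host001",          // hypothetical hostname
			Type:     &socket,
			TypeIds:  []string{"0", "1"}, // e.g. the socket IDs from topology.GetSockets
		}

		out, _ := json.Marshal(q)
		fmt.Println(string(out))
		// {"metric":"mem_bw","hostname":"host001","type":"socket","type-ids":["0","1"]}
	}

Unset Type, SubType and SubTypeIds fields are dropped by omitempty, so a node-scoped query only carries the metric and hostname.
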
@@ -281,6 +198,142 @@ func (ccms *CCMetricStore) LoadData(job *schema.Job, metrics []string, scopes []
 	return jobData, nil
 }
 
+var (
+	cpuString         = string(schema.MetricScopeCpu)
+	socketString      = string(schema.MetricScopeSocket)
+	acceleratorString = string(schema.MetricScopeAccelerator)
+)
+
+func (ccms *CCMetricStore) buildQueries(job *schema.Job, metrics []string, scopes []schema.MetricScope) ([]ApiQuery, map[string]schema.MetricScope, error) {
+	queries := make([]ApiQuery, 0, len(metrics)*len(scopes)*len(job.Resources))
+	assignedScopes := make(map[string]schema.MetricScope, len(metrics))
+	topology := config.GetPartition(job.Cluster, job.Partition).Topology
+
+	if len(scopes) != 1 {
+		return nil, nil, errors.New("todo: support more than one scope in a query")
+	}
+
+	_ = topology
+
+	for _, metric := range metrics {
+		mc := config.GetMetricConfig(job.Cluster, metric)
+		if mc == nil {
+			// return nil, nil, fmt.Errorf("metric '%s' is not specified for cluster '%s'", metric, job.Cluster)
+			// log.Printf("metric '%s' is not specified for cluster '%s'", metric, job.Cluster)
+			continue
+		}
+
+		nativeScope, requestedScope := mc.Scope, scopes[0]
+
+		// case 1: A metric is requested at node scope with a native scope of node as well
+		// case 2: A metric is requested at node scope and the node is exclusive to this job
+		// case 3: A metric has native scope node
+		if (nativeScope == requestedScope && nativeScope == schema.MetricScopeNode) ||
+			(job.Exclusive == 1 && requestedScope == schema.MetricScopeNode) ||
+			(nativeScope == schema.MetricScopeNode) {
+			nodes := map[string]bool{}
+			for _, resource := range job.Resources {
+				nodes[resource.Hostname] = true
+			}
+
+			for node := range nodes {
+				queries = append(queries, ApiQuery{
+					Metric:   metric,
+					Hostname: node,
+				})
+			}
+
+			assignedScopes[metric] = schema.MetricScopeNode
+			continue
+		}
+
+		// case: Read a metric at hwthread scope with native scope hwthread
+		if nativeScope == requestedScope && nativeScope == schema.MetricScopeHWThread && job.NumNodes == 1 {
+			hwthreads := job.Resources[0].HWThreads
+			if hwthreads == nil {
+				hwthreads = topology.Node
+			}
+
+			for _, hwthread := range hwthreads {
+				queries = append(queries, ApiQuery{
+					Metric:   metric,
+					Hostname: job.Resources[0].Hostname,
+					Type:     &cpuString, // TODO/FIXME: inconsistency between cc-metric-collector and ClusterCockpit
+					TypeIds:  []string{strconv.Itoa(hwthread)},
+				})
+			}
+
+			assignedScopes[metric] = schema.MetricScopeHWThread
+			continue
+		}
+
+		// case: A metric with native hwthread scope is requested at node scope, the node is not exclusive and the job runs on a single node
+		if requestedScope == schema.MetricScopeNode && nativeScope == schema.MetricScopeHWThread && job.Exclusive != 1 && job.NumNodes == 1 {
+			hwthreads := job.Resources[0].HWThreads
+			if hwthreads == nil {
+				hwthreads = topology.Node
+			}
+
+			ids := make([]string, 0, len(hwthreads))
+			for _, hwthread := range hwthreads {
+				ids = append(ids, strconv.Itoa(hwthread))
+			}
+
+			queries = append(queries, ApiQuery{
+				Metric:   metric,
+				Hostname: job.Resources[0].Hostname,
+				Type:     &cpuString, // TODO/FIXME: inconsistency between cc-metric-collector and ClusterCockpit
+				TypeIds:  ids,
+			})
+			assignedScopes[metric] = schema.MetricScopeNode
+			continue
+		}
+
+		// case: A metric of native scope socket is requested at any scope lower than node and the job runs on a single node
+		if requestedScope.LowerThan(schema.MetricScopeNode) && nativeScope == schema.MetricScopeSocket && job.NumNodes == 1 {
+			hwthreads := job.Resources[0].HWThreads
+			if hwthreads == nil {
+				hwthreads = topology.Node
+			}
+
+			sockets, _ := topology.GetSockets(hwthreads)
+			ids := make([]string, 0, len(sockets))
+			for _, socket := range sockets {
+				ids = append(ids, strconv.Itoa(socket))
+			}
+
+			queries = append(queries, ApiQuery{
+				Metric:   metric,
+				Hostname: job.Resources[0].Hostname,
+				Type:     &socketString,
+				TypeIds:  ids,
+			})
+			assignedScopes[metric] = schema.MetricScopeNode
+			continue
+		}
+
+		// case: A metric of native scope accelerator is requested at a sub-node scope
+		if requestedScope.LowerThan(schema.MetricScopeNode) && nativeScope == schema.MetricScopeAccelerator {
+			for _, resource := range job.Resources {
+				for _, acc := range resource.Accelerators {
+					queries = append(queries, ApiQuery{
+						Metric:   metric,
+						Hostname: resource.Hostname,
+						Type:     &acceleratorString,
+						TypeIds:  []string{strconv.Itoa(acc)},
+					})
+				}
+			}
+			assignedScopes[metric] = schema.MetricScopeAccelerator
+			continue
+		}
+
+		// TODO: The job shares its nodes and the metric's native scope is smaller than node
+		panic("todo")
+	}
+
+	return queries, assignedScopes, nil
+}
+
 func (ccms *CCMetricStore) LoadStats(job *schema.Job, metrics []string, ctx context.Context) (map[string]map[string]schema.MetricStatistics, error) {
 	res, err := ccms.doRequest(job, "stats", metrics, ctx)
 	if err != nil {
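
To summarize the dispatch in buildQueries above, here is a compact, hypothetical sketch (not the actual implementation; the names queryShape and lowerThanNode are made up, and the conditions are simplified) of which query shape each combination of native scope, requested scope and job shape produces:

	package main

	import "fmt"

	type scope string

	const (
		node        scope = "node"
		socket      scope = "socket"
		hwthread    scope = "hwthread"
		accelerator scope = "accelerator"
	)

	// With the granularities in schema/metrics.go, every scope except node is lower than node.
	func lowerThanNode(s scope) bool { return s != node }

	// queryShape mirrors the case order of buildQueries; exclusive stands for
	// job.Exclusive == 1 and numNodes for job.NumNodes.
	func queryShape(native, requested scope, exclusive bool, numNodes int) string {
		switch {
		case native == node || (requested == node && exclusive):
			return "one query per node, no type filter (assigned scope: node)"
		case native == hwthread && requested == hwthread && numNodes == 1:
			return "one query per hwthread with type \"cpu\" (assigned scope: hwthread)"
		case native == hwthread && requested == node && !exclusive && numNodes == 1:
			return "one query listing all hwthread IDs with type \"cpu\" (assigned scope: node)"
		case native == socket && lowerThanNode(requested) && numNodes == 1:
			return "one query listing the socket IDs from GetSockets with type \"socket\" (assigned scope: node)"
		case native == accelerator && lowerThanNode(requested):
			return "one query per accelerator with type \"accelerator\" (assigned scope: accelerator)"
		default:
			return "not handled yet (buildQueries panics)"
		}
	}

	func main() {
		fmt.Println(queryShape(socket, hwthread, false, 1)) // the new socket case
		fmt.Println(queryShape(hwthread, node, false, 2))   // shared multi-node job: the remaining TODO
	}

The default branch is the combination this patch still leaves open: a job that shares its nodes while the metric's native scope is finer than node and none of the special cases above apply.
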
diff --git a/metricdata/metricdata.go b/metricdata/metricdata.go
index 25f4925..32faea0 100644
--- a/metricdata/metricdata.go
+++ b/metricdata/metricdata.go
@@ -16,7 +16,7 @@ type MetricDataRepository interface {
 	// Return the JobData for the given job, only with the requested metrics.
 	LoadData(job *schema.Job, metrics []string, scopes []schema.MetricScope, ctx context.Context) (schema.JobData, error)
 
-	// Return a map of metrics to a map of nodes to the metric statistics of the job.
+	// Return a map of metrics to a map of nodes to the metric statistics of the job. Node scope is assumed for now.
 	LoadStats(job *schema.Job, metrics []string, ctx context.Context) (map[string]map[string]schema.MetricStatistics, error)
 
 	// Return a map of nodes to a map of metrics to the data for the requested time.
@@ -68,7 +68,7 @@ func LoadData(job *schema.Job, metrics []string, scopes []schema.MetricScope, ct
 		return nil, err
 	}
 
-	calcStatisticsSeries(job, data)
+	calcStatisticsSeries(job, data, 7)
 
 	return data, nil
 }
@@ -122,6 +122,7 @@ func LoadAverages(job *schema.Job, metrics []string, data [][]schema.Float, ctx
 	return nil
 }
 
+// Used for the node/system view. Returns a map of nodes to a map of metrics (at node scope).
 func LoadNodeData(clusterId string, metrics, nodes []string, from, to int64, ctx context.Context) (map[string]map[string][]schema.Float, error) {
 	repo, ok := metricDataRepos[clusterId]
 	if !ok {
diff --git a/schema/metrics.go b/schema/metrics.go
index 384f65d..300c23c 100644
--- a/schema/metrics.go
+++ b/schema/metrics.go
@@ -42,22 +42,23 @@ const (
 	MetricScopeSocket   MetricScope = "socket"
 	MetricScopeCpu      MetricScope = "cpu"
 	MetricScopeHWThread MetricScope = "hwthread"
+
+	MetricScopeAccelerator MetricScope = "accelerator"
 )
 
 var metricScopeGranularity map[MetricScope]int = map[MetricScope]int{
-	MetricScopeNode:     1,
-	MetricScopeSocket:   2,
-	MetricScopeCpu:      3,
-	MetricScopeHWThread: 4,
+	MetricScopeNode:     10,
+	MetricScopeSocket:   5,
+	MetricScopeCpu:      2,
+	MetricScopeHWThread: 1,
+
+	MetricScopeAccelerator: 5, // Special/randomly chosen
 }
 
-func (e *MetricScope) MaxGranularity(other MetricScope) MetricScope {
+func (e *MetricScope) LowerThan(other MetricScope) bool {
 	a := metricScopeGranularity[*e]
 	b := metricScopeGranularity[other]
-	if a < b {
-		return *e
-	}
-	return other
+	return a < b
 }
 
 func (e *MetricScope) UnmarshalGQL(v interface{}) error {
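
Finally, a small stand-alone sketch (not part of the patch) of the new LowerThan semantics, with the MetricScope values and granularities copied from the hunk above. Since accelerator and socket share the granularity 5, neither compares as lower than the other:

	package main

	import "fmt"

	type MetricScope string

	const (
		MetricScopeNode        MetricScope = "node"
		MetricScopeSocket      MetricScope = "socket"
		MetricScopeCpu         MetricScope = "cpu"
		MetricScopeHWThread    MetricScope = "hwthread"
		MetricScopeAccelerator MetricScope = "accelerator"
	)

	// Values copied from schema/metrics.go as changed by this patch.
	var metricScopeGranularity map[MetricScope]int = map[MetricScope]int{
		MetricScopeNode:        10,
		MetricScopeSocket:      5,
		MetricScopeCpu:         2,
		MetricScopeHWThread:    1,
		MetricScopeAccelerator: 5,
	}

	func (e *MetricScope) LowerThan(other MetricScope) bool {
		a := metricScopeGranularity[*e]
		b := metricScopeGranularity[other]
		return a < b
	}

	func main() {
		socket, accelerator, hwthread := MetricScopeSocket, MetricScopeAccelerator, MetricScopeHWThread

		fmt.Println(socket.LowerThan(MetricScopeNode)) // true
		fmt.Println(hwthread.LowerThan(socket))        // true
		fmt.Println(accelerator.LowerThan(socket))     // false: equal granularity
		fmt.Println(socket.LowerThan(accelerator))     // false
	}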