mirror of
https://github.com/ClusterCockpit/cc-backend
synced 2025-01-26 03:19:06 +01:00
work on socket scoped metrics
This commit is contained in:
parent
f185d12078
commit
98c4de65a7
@ -15,3 +15,31 @@ type MetricDataRepository struct {
|
||||
Url string `json:"url"`
|
||||
Token string `json:"token"`
|
||||
}
|
||||
|
||||
// Return a list of socket IDs given a list of hwthread IDs.
|
||||
// Even if just one hwthread is in that socket, add it to the list.
|
||||
// If no hwthreads other than those in the argument list are assigned to
|
||||
// one of the sockets in the first return value, return true as the second value.
|
||||
// TODO: Optimize this, there must be a more efficient way/algorithm.
|
||||
func (topo *Topology) GetSockets(hwthreads []int) (sockets []int, exclusive bool) {
|
||||
socketsMap := map[int]int{}
|
||||
for _, hwthread := range hwthreads {
|
||||
for socket, hwthreadsInSocket := range topo.Socket {
|
||||
for _, hwthreadInSocket := range hwthreadsInSocket {
|
||||
if hwthread == hwthreadInSocket {
|
||||
socketsMap[socket] += 1
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
exclusive = true
|
||||
hwthreadsPerSocket := len(topo.Node) / len(topo.Socket)
|
||||
sockets = make([]int, 0, len(socketsMap))
|
||||
for socket, count := range socketsMap {
|
||||
sockets = append(sockets, socket)
|
||||
exclusive = exclusive && count == hwthreadsPerSocket
|
||||
}
|
||||
|
||||
return sockets, exclusive
|
||||
}
|
||||
|
@ -144,7 +144,7 @@ func ArchiveJob(job *schema.Job, ctx context.Context) (*schema.JobMeta, error) {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
if err := calcStatisticsSeries(job, jobData); err != nil {
|
||||
if err := calcStatisticsSeries(job, jobData, 7); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
@ -221,14 +221,14 @@ func ArchiveJob(job *schema.Job, ctx context.Context) (*schema.JobMeta, error) {
|
||||
}
|
||||
|
||||
// Add statisticsSeries fields
|
||||
func calcStatisticsSeries(job *schema.Job, jobData schema.JobData) error {
|
||||
func calcStatisticsSeries(job *schema.Job, jobData schema.JobData, maxSeries int) error {
|
||||
for _, scopes := range jobData {
|
||||
for _, jobMetric := range scopes {
|
||||
if jobMetric.StatisticsSeries != nil {
|
||||
continue
|
||||
}
|
||||
|
||||
if len(jobMetric.Series) < 5 {
|
||||
if len(jobMetric.Series) <= maxSeries {
|
||||
continue
|
||||
}
|
||||
|
||||
|
@ -7,7 +7,6 @@ import (
|
||||
"encoding/json"
|
||||
"errors"
|
||||
"fmt"
|
||||
"log"
|
||||
"net/http"
|
||||
"strconv"
|
||||
"time"
|
||||
@ -27,6 +26,15 @@ type ApiRequestBody struct {
|
||||
Selectors [][]string `json:"selectors"`
|
||||
}
|
||||
|
||||
// ApiQuery describes a single query sent to the metric store: one metric on
// one host, optionally narrowed down to specific sub-node entities.
//
// All fields except Metric and Hostname are optional and left out of the JSON
// encoding when unset.
type ApiQuery struct {
	// Name of the metric to read.
	Metric string `json:"metric"`
	// Host the metric is read from.
	Hostname string `json:"hostname"`

	// Kind of sub-node entity the query is restricted to; nil selects no
	// sub-node entity.
	Type *string `json:"type,omitempty"`
	// IDs of the entities of kind Type.
	TypeIds []string `json:"type-ids,omitempty"`

	// Optional second selection level below Type.
	// NOTE(review): no code visible here sets it, confirm intended use.
	SubType *string `json:"subtype,omitempty"`
	// IDs of the entities of kind SubType.
	SubTypeIds []string `json:"subtype-ids,omitempty"`
}
|
||||
|
||||
type ApiMetricData struct {
|
||||
Error *string `json:"error"`
|
||||
From int64 `json:"from"`
|
||||
@ -50,6 +58,9 @@ type ApiStatsData struct {
|
||||
func (ccms *CCMetricStore) Init(url, token string) error {
|
||||
ccms.url = url
|
||||
ccms.jwt = token
|
||||
ccms.client = http.Client{
|
||||
Timeout: 5 * time.Second,
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
@ -82,17 +93,6 @@ func (ccms *CCMetricStore) doRequest(job *schema.Job, suffix string, metrics []s
|
||||
}
|
||||
|
||||
func (ccms *CCMetricStore) LoadData(job *schema.Job, metrics []string, scopes []schema.MetricScope, ctx context.Context) (schema.JobData, error) {
|
||||
// log.Printf("job: %#v", job)
|
||||
|
||||
type ApiQuery struct {
|
||||
Metric string `json:"metric"`
|
||||
Hostname string `json:"hostname"`
|
||||
Type *string `json:"type,omitempty"`
|
||||
TypeIds []string `json:"type-ids,omitempty"`
|
||||
SubType *string `json:"subtype,omitempty"`
|
||||
SubTypeIds []string `json:"subtype-ids,omitempty"`
|
||||
}
|
||||
|
||||
type ApiQueryRequest struct {
|
||||
Cluster string `json:"cluster"`
|
||||
From int64 `json:"from"`
|
||||
@ -105,101 +105,18 @@ func (ccms *CCMetricStore) LoadData(job *schema.Job, metrics []string, scopes []
|
||||
Query *ApiQuery `json:"query"`
|
||||
}
|
||||
|
||||
queries, scopeForMetric, err := ccms.buildQueries(job, metrics, scopes)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
reqBody := ApiQueryRequest{
|
||||
Cluster: job.Cluster,
|
||||
From: job.StartTime.Unix(),
|
||||
To: job.StartTime.Add(time.Duration(job.Duration) * time.Second).Unix(),
|
||||
Queries: make([]ApiQuery, 0),
|
||||
Queries: queries,
|
||||
}
|
||||
|
||||
if len(scopes) != 1 {
|
||||
return nil, errors.New("todo: support more than one scope in a query")
|
||||
}
|
||||
|
||||
topology := config.GetPartition(job.Cluster, job.Partition).Topology
|
||||
scopeForMetric := map[string]schema.MetricScope{}
|
||||
for _, metric := range metrics {
|
||||
mc := config.GetMetricConfig(job.Cluster, metric)
|
||||
if mc == nil {
|
||||
// return nil, fmt.Errorf("metric '%s' is not specified for cluster '%s'", metric, job.Cluster)
|
||||
log.Printf("metric '%s' is not specified for cluster '%s'", metric, job.Cluster)
|
||||
continue
|
||||
}
|
||||
|
||||
nativeScope, requestedScope := mc.Scope, scopes[0]
|
||||
|
||||
// case 1: A metric is requested at node scope with a native scope of node as well
|
||||
// case 2: A metric is requested at node scope and node is exclusive
|
||||
// case 3: A metric has native scope node
|
||||
if (nativeScope == requestedScope && nativeScope == schema.MetricScopeNode) ||
|
||||
(job.Exclusive == 1 && requestedScope == schema.MetricScopeNode) ||
|
||||
(nativeScope == schema.MetricScopeNode) {
|
||||
nodes := map[string]bool{}
|
||||
for _, resource := range job.Resources {
|
||||
nodes[resource.Hostname] = true
|
||||
}
|
||||
|
||||
for node := range nodes {
|
||||
reqBody.Queries = append(reqBody.Queries, ApiQuery{
|
||||
Metric: metric,
|
||||
Hostname: node,
|
||||
})
|
||||
}
|
||||
|
||||
scopeForMetric[metric] = schema.MetricScopeNode
|
||||
continue
|
||||
}
|
||||
|
||||
// case: Read a metric at hwthread scope with native scope hwthread
|
||||
if nativeScope == requestedScope && nativeScope == schema.MetricScopeHWThread && job.NumNodes == 1 {
|
||||
hwthreads := job.Resources[0].HWThreads
|
||||
if hwthreads == nil {
|
||||
hwthreads = topology.Node
|
||||
}
|
||||
|
||||
t := "cpu" // TODO/FIXME: inconsistency between cc-metric-collector and ClusterCockpit
|
||||
for _, hwthread := range hwthreads {
|
||||
reqBody.Queries = append(reqBody.Queries, ApiQuery{
|
||||
Metric: metric,
|
||||
Hostname: job.Resources[0].Hostname,
|
||||
Type: &t,
|
||||
TypeIds: []string{strconv.Itoa(hwthread)},
|
||||
})
|
||||
}
|
||||
|
||||
scopeForMetric[metric] = schema.MetricScopeHWThread
|
||||
continue
|
||||
}
|
||||
|
||||
// case: A metric is requested at node scope, has a hwthread scope and node is not exclusive and runs on a single node
|
||||
if requestedScope == schema.MetricScopeNode && nativeScope == schema.MetricScopeHWThread && job.Exclusive != 1 && job.NumNodes == 1 {
|
||||
hwthreads := job.Resources[0].HWThreads
|
||||
if hwthreads == nil {
|
||||
hwthreads = topology.Node
|
||||
}
|
||||
|
||||
t := "cpu" // TODO/FIXME: inconsistency between cc-metric-collector and ClusterCockpit
|
||||
ids := make([]string, 0, len(hwthreads))
|
||||
for _, hwthread := range hwthreads {
|
||||
ids = append(ids, strconv.Itoa(hwthread))
|
||||
}
|
||||
|
||||
reqBody.Queries = append(reqBody.Queries, ApiQuery{
|
||||
Metric: metric,
|
||||
Hostname: job.Resources[0].Hostname,
|
||||
Type: &t,
|
||||
TypeIds: ids,
|
||||
})
|
||||
scopeForMetric[metric] = schema.MetricScopeNode
|
||||
continue
|
||||
}
|
||||
|
||||
// TODO: Job teilt sich knoten und metric native scope ist kleiner als node
|
||||
panic("todo")
|
||||
}
|
||||
|
||||
// log.Printf("query: %#v", reqBody)
|
||||
|
||||
buf := &bytes.Buffer{}
|
||||
if err := json.NewEncoder(buf).Encode(reqBody); err != nil {
|
||||
return nil, err
|
||||
@ -281,6 +198,142 @@ func (ccms *CCMetricStore) LoadData(job *schema.Job, metrics []string, scopes []
|
||||
return jobData, nil
|
||||
}
|
||||
|
||||
var (
|
||||
cpuString = string(schema.MetricScopeCpu)
|
||||
socketString = string(schema.MetricScopeSocket)
|
||||
acceleratorString = string(schema.MetricScopeAccelerator)
|
||||
)
|
||||
|
||||
func (ccms *CCMetricStore) buildQueries(job *schema.Job, metrics []string, scopes []schema.MetricScope) ([]ApiQuery, map[string]schema.MetricScope, error) {
|
||||
queries := make([]ApiQuery, 0, len(metrics)*len(scopes)*len(job.Resources))
|
||||
assignedScopes := make(map[string]schema.MetricScope, len(metrics))
|
||||
topology := config.GetPartition(job.Cluster, job.Partition).Topology
|
||||
|
||||
if len(scopes) != 1 {
|
||||
return nil, nil, errors.New("todo: support more than one scope in a query")
|
||||
}
|
||||
|
||||
_ = topology
|
||||
|
||||
for _, metric := range metrics {
|
||||
mc := config.GetMetricConfig(job.Cluster, metric)
|
||||
if mc == nil {
|
||||
// return nil, fmt.Errorf("metric '%s' is not specified for cluster '%s'", metric, job.Cluster)
|
||||
// log.Printf("metric '%s' is not specified for cluster '%s'", metric, job.Cluster)
|
||||
continue
|
||||
}
|
||||
|
||||
nativeScope, requestedScope := mc.Scope, scopes[0]
|
||||
|
||||
// case 1: A metric is requested at node scope with a native scope of node as well
|
||||
// case 2: A metric is requested at node scope and node is exclusive
|
||||
// case 3: A metric has native scope node
|
||||
if (nativeScope == requestedScope && nativeScope == schema.MetricScopeNode) ||
|
||||
(job.Exclusive == 1 && requestedScope == schema.MetricScopeNode) ||
|
||||
(nativeScope == schema.MetricScopeNode) {
|
||||
nodes := map[string]bool{}
|
||||
for _, resource := range job.Resources {
|
||||
nodes[resource.Hostname] = true
|
||||
}
|
||||
|
||||
for node := range nodes {
|
||||
queries = append(queries, ApiQuery{
|
||||
Metric: metric,
|
||||
Hostname: node,
|
||||
})
|
||||
}
|
||||
|
||||
assignedScopes[metric] = schema.MetricScopeNode
|
||||
continue
|
||||
}
|
||||
|
||||
// case: Read a metric at hwthread scope with native scope hwthread
|
||||
if nativeScope == requestedScope && nativeScope == schema.MetricScopeHWThread && job.NumNodes == 1 {
|
||||
hwthreads := job.Resources[0].HWThreads
|
||||
if hwthreads == nil {
|
||||
hwthreads = topology.Node
|
||||
}
|
||||
|
||||
for _, hwthread := range hwthreads {
|
||||
queries = append(queries, ApiQuery{
|
||||
Metric: metric,
|
||||
Hostname: job.Resources[0].Hostname,
|
||||
Type: &cpuString, // TODO/FIXME: inconsistency between cc-metric-collector and ClusterCockpit
|
||||
TypeIds: []string{strconv.Itoa(hwthread)},
|
||||
})
|
||||
}
|
||||
|
||||
assignedScopes[metric] = schema.MetricScopeHWThread
|
||||
continue
|
||||
}
|
||||
|
||||
// case: A metric is requested at node scope, has a hwthread scope and node is not exclusive and runs on a single node
|
||||
if requestedScope == schema.MetricScopeNode && nativeScope == schema.MetricScopeHWThread && job.Exclusive != 1 && job.NumNodes == 1 {
|
||||
hwthreads := job.Resources[0].HWThreads
|
||||
if hwthreads == nil {
|
||||
hwthreads = topology.Node
|
||||
}
|
||||
|
||||
ids := make([]string, 0, len(hwthreads))
|
||||
for _, hwthread := range hwthreads {
|
||||
ids = append(ids, strconv.Itoa(hwthread))
|
||||
}
|
||||
|
||||
queries = append(queries, ApiQuery{
|
||||
Metric: metric,
|
||||
Hostname: job.Resources[0].Hostname,
|
||||
Type: &cpuString, // TODO/FIXME: inconsistency between cc-metric-collector and ClusterCockpit
|
||||
TypeIds: ids,
|
||||
})
|
||||
assignedScopes[metric] = schema.MetricScopeNode
|
||||
continue
|
||||
}
|
||||
|
||||
// case: A metric of native scope socket is requested at any scope lower than node and runs on a single node
|
||||
if requestedScope.LowerThan(schema.MetricScopeNode) && nativeScope == schema.MetricScopeSocket && job.NumNodes == 1 {
|
||||
hwthreads := job.Resources[0].HWThreads
|
||||
if hwthreads == nil {
|
||||
hwthreads = topology.Node
|
||||
}
|
||||
|
||||
sockets, _ := topology.GetSockets(hwthreads)
|
||||
ids := make([]string, 0, len(sockets))
|
||||
for _, socket := range sockets {
|
||||
ids = append(ids, strconv.Itoa(socket))
|
||||
}
|
||||
|
||||
queries = append(queries, ApiQuery{
|
||||
Metric: metric,
|
||||
Hostname: job.Resources[0].Hostname,
|
||||
Type: &socketString,
|
||||
TypeIds: ids,
|
||||
})
|
||||
assignedScopes[metric] = schema.MetricScopeNode
|
||||
continue
|
||||
}
|
||||
|
||||
// case: A metric of native scope accelerator is requested at a sub-node scope
|
||||
if requestedScope.LowerThan(schema.MetricScopeNode) && nativeScope == schema.MetricScopeAccelerator {
|
||||
for _, resource := range job.Resources {
|
||||
for _, acc := range resource.Accelerators {
|
||||
queries = append(queries, ApiQuery{
|
||||
Metric: metric,
|
||||
Hostname: job.Resources[0].Hostname,
|
||||
Type: &acceleratorString,
|
||||
TypeIds: []string{strconv.Itoa(acc)},
|
||||
})
|
||||
}
|
||||
}
|
||||
assignedScopes[metric] = schema.MetricScopeAccelerator
|
||||
}
|
||||
|
||||
// TODO: Job teilt sich knoten und metric native scope ist kleiner als node
|
||||
panic("todo")
|
||||
}
|
||||
|
||||
return queries, assignedScopes, nil
|
||||
}
|
||||
|
||||
func (ccms *CCMetricStore) LoadStats(job *schema.Job, metrics []string, ctx context.Context) (map[string]map[string]schema.MetricStatistics, error) {
|
||||
res, err := ccms.doRequest(job, "stats", metrics, ctx)
|
||||
if err != nil {
|
||||
|
@ -16,7 +16,7 @@ type MetricDataRepository interface {
|
||||
// Return the JobData for the given job, only with the requested metrics.
|
||||
LoadData(job *schema.Job, metrics []string, scopes []schema.MetricScope, ctx context.Context) (schema.JobData, error)
|
||||
|
||||
// Return a map of metrics to a map of nodes to the metric statistics of the job.
|
||||
// Return a map of metrics to a map of nodes to the metric statistics of the job. Node scope is assumed for now.
|
||||
LoadStats(job *schema.Job, metrics []string, ctx context.Context) (map[string]map[string]schema.MetricStatistics, error)
|
||||
|
||||
// Return a map of nodes to a map of metrics to the data for the requested time.
|
||||
@ -68,7 +68,7 @@ func LoadData(job *schema.Job, metrics []string, scopes []schema.MetricScope, ct
|
||||
return nil, err
|
||||
}
|
||||
|
||||
calcStatisticsSeries(job, data)
|
||||
calcStatisticsSeries(job, data, 7)
|
||||
return data, nil
|
||||
}
|
||||
|
||||
@ -122,6 +122,7 @@ func LoadAverages(job *schema.Job, metrics []string, data [][]schema.Float, ctx
|
||||
return nil
|
||||
}
|
||||
|
||||
// Used for the node/system view. Returns a map of nodes to a map of metrics (at node scope).
|
||||
func LoadNodeData(clusterId string, metrics, nodes []string, from, to int64, ctx context.Context) (map[string]map[string][]schema.Float, error) {
|
||||
repo, ok := metricDataRepos[clusterId]
|
||||
if !ok {
|
||||
|
@ -42,22 +42,23 @@ const (
|
||||
MetricScopeSocket MetricScope = "socket"
|
||||
MetricScopeCpu MetricScope = "cpu"
|
||||
MetricScopeHWThread MetricScope = "hwthread"
|
||||
|
||||
MetricScopeAccelerator MetricScope = "accelerator"
|
||||
)
|
||||
|
||||
var metricScopeGranularity map[MetricScope]int = map[MetricScope]int{
|
||||
MetricScopeNode: 1,
|
||||
MetricScopeSocket: 2,
|
||||
MetricScopeCpu: 3,
|
||||
MetricScopeHWThread: 4,
|
||||
MetricScopeNode: 10,
|
||||
MetricScopeSocket: 5,
|
||||
MetricScopeCpu: 2,
|
||||
MetricScopeHWThread: 1,
|
||||
|
||||
MetricScopeAccelerator: 5, // Special/Randomly chosen
|
||||
}
|
||||
|
||||
func (e *MetricScope) MaxGranularity(other MetricScope) MetricScope {
|
||||
func (e *MetricScope) LowerThan(other MetricScope) bool {
|
||||
a := metricScopeGranularity[*e]
|
||||
b := metricScopeGranularity[other]
|
||||
if a < b {
|
||||
return *e
|
||||
}
|
||||
return other
|
||||
return a < b
|
||||
}
|
||||
|
||||
func (e *MetricScope) UnmarshalGQL(v interface{}) error {
|
||||
|
Loading…
Reference in New Issue
Block a user