Refactor directory structure

This commit is contained in:
Jan Eitzinger
2022-06-21 17:52:36 +02:00
parent 45359cca9d
commit 81819db436
54 changed files with 767 additions and 454 deletions

View File

@@ -0,0 +1,257 @@
package metricdata
import (
"bufio"
"context"
"encoding/json"
"errors"
"fmt"
"math"
"os"
"path"
"path/filepath"
"strconv"
"time"
"github.com/ClusterCockpit/cc-backend/internal/config"
"github.com/ClusterCockpit/cc-backend/pkg/schema"
)
// For a given job, return the path of the `data.json`/`meta.json` file.
// TODO: Implement Issue ClusterCockpit/ClusterCockpit#97
func getPath(job *schema.Job, file string, checkLegacy bool) (string, error) {
lvl1, lvl2 := fmt.Sprintf("%d", job.JobID/1000), fmt.Sprintf("%03d", job.JobID%1000)
if !checkLegacy {
return filepath.Join(JobArchivePath, job.Cluster, lvl1, lvl2, strconv.FormatInt(job.StartTime.Unix(), 10), file), nil
}
legacyPath := filepath.Join(JobArchivePath, job.Cluster, lvl1, lvl2, file)
if _, err := os.Stat(legacyPath); errors.Is(err, os.ErrNotExist) {
return filepath.Join(JobArchivePath, job.Cluster, lvl1, lvl2, strconv.FormatInt(job.StartTime.Unix(), 10), file), nil
}
return legacyPath, nil
}
// Assuming job is completed/archived, return the jobs metric data.
func loadFromArchive(job *schema.Job) (schema.JobData, error) {
filename, err := getPath(job, "data.json", true)
if err != nil {
return nil, err
}
data := cache.Get(filename, func() (value interface{}, ttl time.Duration, size int) {
f, err := os.Open(filename)
if err != nil {
return err, 0, 1000
}
defer f.Close()
var data schema.JobData
if err := json.NewDecoder(bufio.NewReader(f)).Decode(&data); err != nil {
return err, 0, 1000
}
return data, 1 * time.Hour, data.Size()
})
if err, ok := data.(error); ok {
return nil, err
}
return data.(schema.JobData), nil
}
func loadMetaJson(job *schema.Job) (*schema.JobMeta, error) {
filename, err := getPath(job, "meta.json", true)
if err != nil {
return nil, err
}
bytes, err := os.ReadFile(filename)
if err != nil {
return nil, err
}
var metaFile schema.JobMeta = schema.JobMeta{
BaseJob: schema.JobDefaults,
}
if err := json.Unmarshal(bytes, &metaFile); err != nil {
return nil, err
}
return &metaFile, nil
}
// If the job is archived, find its `meta.json` file and override the tags list
// in that JSON file. If the job is not archived, nothing is done.
func UpdateTags(job *schema.Job, tags []*schema.Tag) error {
if job.State == schema.JobStateRunning {
return nil
}
filename, err := getPath(job, "meta.json", true)
if err != nil {
return err
}
f, err := os.Open(filename)
if err != nil {
if os.IsNotExist(err) {
return nil
}
return err
}
var metaFile schema.JobMeta = schema.JobMeta{
BaseJob: schema.JobDefaults,
}
if err := json.NewDecoder(f).Decode(&metaFile); err != nil {
return err
}
f.Close()
metaFile.Tags = make([]*schema.Tag, 0)
for _, tag := range tags {
metaFile.Tags = append(metaFile.Tags, &schema.Tag{
Name: tag.Name,
Type: tag.Type,
})
}
bytes, err := json.Marshal(metaFile)
if err != nil {
return err
}
return os.WriteFile(filename, bytes, 0644)
}
// Helper to metricdata.LoadAverages().
func loadAveragesFromArchive(job *schema.Job, metrics []string, data [][]schema.Float) error {
metaFile, err := loadMetaJson(job)
if err != nil {
return err
}
for i, m := range metrics {
if stat, ok := metaFile.Statistics[m]; ok {
data[i] = append(data[i], schema.Float(stat.Avg))
} else {
data[i] = append(data[i], schema.NaN)
}
}
return nil
}
func GetStatistics(job *schema.Job) (map[string]schema.JobStatistics, error) {
metaFile, err := loadMetaJson(job)
if err != nil {
return nil, err
}
return metaFile.Statistics, nil
}
// Writes a running job to the job-archive
func ArchiveJob(job *schema.Job, ctx context.Context) (*schema.JobMeta, error) {
allMetrics := make([]string, 0)
metricConfigs := config.GetCluster(job.Cluster).MetricConfig
for _, mc := range metricConfigs {
allMetrics = append(allMetrics, mc.Name)
}
// TODO: Talk about this! What resolutions to store data at...
scopes := []schema.MetricScope{schema.MetricScopeNode}
if job.NumNodes <= 8 {
scopes = append(scopes, schema.MetricScopeCore)
}
jobData, err := LoadData(job, allMetrics, scopes, ctx)
if err != nil {
return nil, err
}
jobMeta := &schema.JobMeta{
BaseJob: job.BaseJob,
StartTime: job.StartTime.Unix(),
Statistics: make(map[string]schema.JobStatistics),
}
for metric, data := range jobData {
avg, min, max := 0.0, math.MaxFloat32, -math.MaxFloat32
nodeData, ok := data["node"]
if !ok {
// TODO/FIXME: Calc average for non-node metrics as well!
continue
}
for _, series := range nodeData.Series {
avg += series.Statistics.Avg
min = math.Min(min, series.Statistics.Min)
max = math.Max(max, series.Statistics.Max)
}
jobMeta.Statistics[metric] = schema.JobStatistics{
Unit: config.GetMetricConfig(job.Cluster, metric).Unit,
Avg: avg / float64(job.NumNodes),
Min: min,
Max: max,
}
}
// If the file based archive is disabled,
// only return the JobMeta structure as the
// statistics in there are needed.
if !useArchive {
return jobMeta, nil
}
dir, err := getPath(job, "", false)
if err != nil {
return nil, err
}
return jobMeta, writeFiles(dir, jobMeta, &jobData)
}
func writeFiles(dir string, jobMeta *schema.JobMeta, jobData *schema.JobData) error {
if err := os.MkdirAll(dir, 0777); err != nil {
return err
}
f, err := os.Create(path.Join(dir, "meta.json"))
if err != nil {
return err
}
if err := json.NewEncoder(f).Encode(jobMeta); err != nil {
return err
}
if err := f.Close(); err != nil {
return err
}
f, err = os.Create(path.Join(dir, "data.json"))
if err != nil {
return err
}
if err := json.NewEncoder(f).Encode(jobData); err != nil {
return err
}
return f.Close()
}
// Used to import a non-running job into the job-archive.
func ImportJob(job *schema.JobMeta, jobData *schema.JobData) error {
dir, err := getPath(&schema.Job{
BaseJob: job.BaseJob,
StartTimeUnix: job.StartTime,
StartTime: time.Unix(job.StartTime, 0),
}, "", false)
if err != nil {
return err
}
return writeFiles(dir, job, jobData)
}

View File

@@ -0,0 +1,611 @@
package metricdata
import (
"bufio"
"bytes"
"context"
"encoding/json"
"fmt"
"net/http"
"strconv"
"strings"
"time"
"github.com/ClusterCockpit/cc-backend/internal/config"
"github.com/ClusterCockpit/cc-backend/pkg/schema"
)
type CCMetricStoreConfig struct {
Kind string `json:"kind"`
Url string `json:"url"`
Token string `json:"token"`
// If metrics are known to this MetricDataRepository under a different
// name than in the `metricConfig` section of the 'cluster.json',
// provide this optional mapping of local to remote name for this metric.
Renamings map[string]string `json:"metricRenamings"`
}
type CCMetricStore struct {
jwt string
url string
queryEndpoint string
client http.Client
here2there map[string]string
there2here map[string]string
}
type ApiQueryRequest struct {
Cluster string `json:"cluster"`
From int64 `json:"from"`
To int64 `json:"to"`
WithStats bool `json:"with-stats"`
WithData bool `json:"with-data"`
Queries []ApiQuery `json:"queries"`
ForAllNodes []string `json:"for-all-nodes"`
}
type ApiQuery struct {
Metric string `json:"metric"`
Hostname string `json:"host"`
Aggregate bool `json:"aggreg"`
Type *string `json:"type,omitempty"`
TypeIds []string `json:"type-ids,omitempty"`
SubType *string `json:"subtype,omitempty"`
SubTypeIds []string `json:"subtype-ids,omitempty"`
}
type ApiQueryResponse struct {
Queries []ApiQuery `json:"queries,omitempty"`
Results [][]ApiMetricData `json:"results"`
}
type ApiMetricData struct {
Error *string `json:"error"`
From int64 `json:"from"`
To int64 `json:"to"`
Data []schema.Float `json:"data"`
Avg schema.Float `json:"avg"`
Min schema.Float `json:"min"`
Max schema.Float `json:"max"`
}
func (ccms *CCMetricStore) Init(rawConfig json.RawMessage) error {
var config CCMetricStoreConfig
if err := json.Unmarshal(rawConfig, &config); err != nil {
return err
}
ccms.url = config.Url
ccms.queryEndpoint = fmt.Sprintf("%s/api/query", config.Url)
ccms.jwt = config.Token
ccms.client = http.Client{
Timeout: 10 * time.Second,
}
if config.Renamings != nil {
ccms.here2there = config.Renamings
ccms.there2here = make(map[string]string, len(config.Renamings))
for k, v := range ccms.here2there {
ccms.there2here[v] = k
}
} else {
ccms.here2there = make(map[string]string)
ccms.there2here = make(map[string]string)
}
return nil
}
func (ccms *CCMetricStore) toRemoteName(metric string) string {
if renamed, ok := ccms.here2there[metric]; ok {
return renamed
}
return metric
}
func (ccms *CCMetricStore) toLocalName(metric string) string {
if renamed, ok := ccms.there2here[metric]; ok {
return renamed
}
return metric
}
func (ccms *CCMetricStore) doRequest(ctx context.Context, body *ApiQueryRequest) (*ApiQueryResponse, error) {
buf := &bytes.Buffer{}
if err := json.NewEncoder(buf).Encode(body); err != nil {
return nil, err
}
req, err := http.NewRequestWithContext(ctx, http.MethodPost, ccms.queryEndpoint, buf)
if err != nil {
return nil, err
}
if ccms.jwt != "" {
req.Header.Add("Authorization", fmt.Sprintf("Bearer %s", ccms.jwt))
}
res, err := ccms.client.Do(req)
if err != nil {
return nil, err
}
if res.StatusCode != http.StatusOK {
return nil, fmt.Errorf("'%s': HTTP Status: %s", ccms.queryEndpoint, res.Status)
}
var resBody ApiQueryResponse
if err := json.NewDecoder(bufio.NewReader(res.Body)).Decode(&resBody); err != nil {
return nil, err
}
return &resBody, nil
}
func (ccms *CCMetricStore) LoadData(job *schema.Job, metrics []string, scopes []schema.MetricScope, ctx context.Context) (schema.JobData, error) {
topology := config.GetSubCluster(job.Cluster, job.SubCluster).Topology
queries, assignedScope, err := ccms.buildQueries(job, metrics, scopes)
if err != nil {
return nil, err
}
req := ApiQueryRequest{
Cluster: job.Cluster,
From: job.StartTime.Unix(),
To: job.StartTime.Add(time.Duration(job.Duration) * time.Second).Unix(),
Queries: queries,
WithStats: true,
WithData: true,
}
resBody, err := ccms.doRequest(ctx, &req)
if err != nil {
return nil, err
}
var errors []string
var jobData schema.JobData = make(schema.JobData)
for i, row := range resBody.Results {
query := req.Queries[i]
metric := ccms.toLocalName(query.Metric)
scope := assignedScope[i]
mc := config.GetMetricConfig(job.Cluster, metric)
if _, ok := jobData[metric]; !ok {
jobData[metric] = make(map[schema.MetricScope]*schema.JobMetric)
}
jobMetric, ok := jobData[metric][scope]
if !ok {
jobMetric = &schema.JobMetric{
Unit: mc.Unit,
Scope: scope,
Timestep: mc.Timestep,
Series: make([]schema.Series, 0),
}
jobData[metric][scope] = jobMetric
}
for _, res := range row {
if res.Error != nil {
errors = append(errors, fmt.Sprintf("failed to fetch '%s' from host '%s': %s", query.Metric, query.Hostname, *res.Error))
continue
}
id := (*int)(nil)
if query.Type != nil {
id = new(int)
*id, err = strconv.Atoi(query.TypeIds[0])
if err != nil || *query.Type == acceleratorString {
*id, _ = topology.GetAcceleratorIndex(query.TypeIds[0])
}
}
if res.Avg.IsNaN() || res.Min.IsNaN() || res.Max.IsNaN() {
// TODO: use schema.Float instead of float64?
// This is done because regular float64 can not be JSONed when NaN.
res.Avg = schema.Float(0)
res.Min = schema.Float(0)
res.Max = schema.Float(0)
}
jobMetric.Series = append(jobMetric.Series, schema.Series{
Hostname: query.Hostname,
Id: id,
Statistics: &schema.MetricStatistics{
Avg: float64(res.Avg),
Min: float64(res.Min),
Max: float64(res.Max),
},
Data: res.Data,
})
}
// So that one can later check len(jobData):
if len(jobMetric.Series) == 0 {
delete(jobData[metric], scope)
if len(jobData[metric]) == 0 {
delete(jobData, metric)
}
}
}
if len(errors) != 0 {
return jobData, fmt.Errorf("cc-metric-store: %s", strings.Join(errors, ", "))
}
return jobData, nil
}
var (
hwthreadString = string("cpu") // TODO/FIXME: inconsistency between cc-metric-collector and ClusterCockpit
coreString = string(schema.MetricScopeCore)
memoryDomainString = string(schema.MetricScopeMemoryDomain)
socketString = string(schema.MetricScopeSocket)
acceleratorString = string(schema.MetricScopeAccelerator)
)
func (ccms *CCMetricStore) buildQueries(job *schema.Job, metrics []string, scopes []schema.MetricScope) ([]ApiQuery, []schema.MetricScope, error) {
queries := make([]ApiQuery, 0, len(metrics)*len(scopes)*len(job.Resources))
topology := config.GetSubCluster(job.Cluster, job.SubCluster).Topology
assignedScope := []schema.MetricScope{}
for _, metric := range metrics {
remoteName := ccms.toRemoteName(metric)
mc := config.GetMetricConfig(job.Cluster, metric)
if mc == nil {
// return nil, fmt.Errorf("metric '%s' is not specified for cluster '%s'", metric, job.Cluster)
// log.Printf("metric '%s' is not specified for cluster '%s'", metric, job.Cluster)
continue
}
// Avoid duplicates...
handledScopes := make([]schema.MetricScope, 0, 3)
scopesLoop:
for _, requestedScope := range scopes {
nativeScope := mc.Scope
scope := nativeScope.Max(requestedScope)
for _, s := range handledScopes {
if scope == s {
continue scopesLoop
}
}
handledScopes = append(handledScopes, scope)
for _, host := range job.Resources {
hwthreads := host.HWThreads
if hwthreads == nil {
hwthreads = topology.Node
}
// Accelerator -> Accelerator (Use "accelerator" scope if requested scope is lower than node)
if nativeScope == schema.MetricScopeAccelerator && scope.LT(schema.MetricScopeNode) {
queries = append(queries, ApiQuery{
Metric: remoteName,
Hostname: host.Hostname,
Aggregate: false,
Type: &acceleratorString,
TypeIds: host.Accelerators,
})
assignedScope = append(assignedScope, schema.MetricScopeAccelerator)
continue
}
// Accelerator -> Node
if nativeScope == schema.MetricScopeAccelerator && scope == schema.MetricScopeNode {
if len(host.Accelerators) == 0 {
continue
}
queries = append(queries, ApiQuery{
Metric: remoteName,
Hostname: host.Hostname,
Aggregate: true,
Type: &acceleratorString,
TypeIds: host.Accelerators,
})
assignedScope = append(assignedScope, scope)
continue
}
// HWThread -> HWThead
if nativeScope == schema.MetricScopeHWThread && scope == schema.MetricScopeHWThread {
queries = append(queries, ApiQuery{
Metric: remoteName,
Hostname: host.Hostname,
Aggregate: false,
Type: &hwthreadString,
TypeIds: intToStringSlice(hwthreads),
})
assignedScope = append(assignedScope, scope)
continue
}
// HWThread -> Core
if nativeScope == schema.MetricScopeHWThread && scope == schema.MetricScopeCore {
cores, _ := topology.GetCoresFromHWThreads(hwthreads)
for _, core := range cores {
queries = append(queries, ApiQuery{
Metric: remoteName,
Hostname: host.Hostname,
Aggregate: true,
Type: &hwthreadString,
TypeIds: intToStringSlice(topology.Core[core]),
})
assignedScope = append(assignedScope, scope)
}
continue
}
// HWThread -> Socket
if nativeScope == schema.MetricScopeHWThread && scope == schema.MetricScopeSocket {
sockets, _ := topology.GetSocketsFromHWThreads(hwthreads)
for _, socket := range sockets {
queries = append(queries, ApiQuery{
Metric: remoteName,
Hostname: host.Hostname,
Aggregate: true,
Type: &hwthreadString,
TypeIds: intToStringSlice(topology.Socket[socket]),
})
assignedScope = append(assignedScope, scope)
}
continue
}
// HWThread -> Node
if nativeScope == schema.MetricScopeHWThread && scope == schema.MetricScopeNode {
queries = append(queries, ApiQuery{
Metric: remoteName,
Hostname: host.Hostname,
Aggregate: true,
Type: &hwthreadString,
TypeIds: intToStringSlice(hwthreads),
})
assignedScope = append(assignedScope, scope)
continue
}
// Core -> Core
if nativeScope == schema.MetricScopeCore && scope == schema.MetricScopeCore {
cores, _ := topology.GetCoresFromHWThreads(hwthreads)
queries = append(queries, ApiQuery{
Metric: remoteName,
Hostname: host.Hostname,
Aggregate: false,
Type: &coreString,
TypeIds: intToStringSlice(cores),
})
assignedScope = append(assignedScope, scope)
continue
}
// Core -> Node
if nativeScope == schema.MetricScopeCore && scope == schema.MetricScopeNode {
cores, _ := topology.GetCoresFromHWThreads(hwthreads)
queries = append(queries, ApiQuery{
Metric: remoteName,
Hostname: host.Hostname,
Aggregate: true,
Type: &coreString,
TypeIds: intToStringSlice(cores),
})
assignedScope = append(assignedScope, scope)
continue
}
// MemoryDomain -> MemoryDomain
if nativeScope == schema.MetricScopeMemoryDomain && scope == schema.MetricScopeMemoryDomain {
sockets, _ := topology.GetMemoryDomainsFromHWThreads(hwthreads)
queries = append(queries, ApiQuery{
Metric: remoteName,
Hostname: host.Hostname,
Aggregate: false,
Type: &memoryDomainString,
TypeIds: intToStringSlice(sockets),
})
assignedScope = append(assignedScope, scope)
continue
}
// MemoryDoman -> Node
if nativeScope == schema.MetricScopeMemoryDomain && scope == schema.MetricScopeNode {
sockets, _ := topology.GetMemoryDomainsFromHWThreads(hwthreads)
queries = append(queries, ApiQuery{
Metric: remoteName,
Hostname: host.Hostname,
Aggregate: true,
Type: &memoryDomainString,
TypeIds: intToStringSlice(sockets),
})
assignedScope = append(assignedScope, scope)
continue
}
// Socket -> Socket
if nativeScope == schema.MetricScopeSocket && scope == schema.MetricScopeSocket {
sockets, _ := topology.GetSocketsFromHWThreads(hwthreads)
queries = append(queries, ApiQuery{
Metric: remoteName,
Hostname: host.Hostname,
Aggregate: false,
Type: &socketString,
TypeIds: intToStringSlice(sockets),
})
assignedScope = append(assignedScope, scope)
continue
}
// Socket -> Node
if nativeScope == schema.MetricScopeSocket && scope == schema.MetricScopeNode {
sockets, _ := topology.GetSocketsFromHWThreads(hwthreads)
queries = append(queries, ApiQuery{
Metric: remoteName,
Hostname: host.Hostname,
Aggregate: true,
Type: &socketString,
TypeIds: intToStringSlice(sockets),
})
assignedScope = append(assignedScope, scope)
continue
}
// Node -> Node
if nativeScope == schema.MetricScopeNode && scope == schema.MetricScopeNode {
queries = append(queries, ApiQuery{
Metric: remoteName,
Hostname: host.Hostname,
})
assignedScope = append(assignedScope, scope)
continue
}
return nil, nil, fmt.Errorf("TODO: unhandled case: native-scope=%s, requested-scope=%s", nativeScope, requestedScope)
}
}
}
return queries, assignedScope, nil
}
func (ccms *CCMetricStore) LoadStats(job *schema.Job, metrics []string, ctx context.Context) (map[string]map[string]schema.MetricStatistics, error) {
queries, _, err := ccms.buildQueries(job, metrics, []schema.MetricScope{schema.MetricScopeNode})
if err != nil {
return nil, err
}
req := ApiQueryRequest{
Cluster: job.Cluster,
From: job.StartTime.Unix(),
To: job.StartTime.Add(time.Duration(job.Duration) * time.Second).Unix(),
Queries: queries,
WithStats: true,
WithData: false,
}
resBody, err := ccms.doRequest(ctx, &req)
if err != nil {
return nil, err
}
stats := make(map[string]map[string]schema.MetricStatistics, len(metrics))
for i, res := range resBody.Results {
query := req.Queries[i]
metric := ccms.toLocalName(query.Metric)
data := res[0]
if data.Error != nil {
return nil, fmt.Errorf("fetching %s for node %s failed: %s", metric, query.Hostname, *data.Error)
}
metricdata, ok := stats[metric]
if !ok {
metricdata = make(map[string]schema.MetricStatistics, job.NumNodes)
stats[metric] = metricdata
}
if data.Avg.IsNaN() || data.Min.IsNaN() || data.Max.IsNaN() {
return nil, fmt.Errorf("fetching %s for node %s failed: %s", metric, query.Hostname, "avg/min/max is NaN")
}
metricdata[query.Hostname] = schema.MetricStatistics{
Avg: float64(data.Avg),
Min: float64(data.Min),
Max: float64(data.Max),
}
}
return stats, nil
}
// TODO: Support sub-node-scope metrics! For this, the partition of a node needs to be known!
func (ccms *CCMetricStore) LoadNodeData(cluster string, metrics, nodes []string, scopes []schema.MetricScope, from, to time.Time, ctx context.Context) (map[string]map[string][]*schema.JobMetric, error) {
req := ApiQueryRequest{
Cluster: cluster,
From: from.Unix(),
To: to.Unix(),
WithStats: true,
WithData: true,
}
if nodes == nil {
for _, metric := range metrics {
req.ForAllNodes = append(req.ForAllNodes, ccms.toRemoteName(metric))
}
} else {
for _, node := range nodes {
for _, metric := range metrics {
req.Queries = append(req.Queries, ApiQuery{
Hostname: node,
Metric: ccms.toRemoteName(metric),
})
}
}
}
resBody, err := ccms.doRequest(ctx, &req)
if err != nil {
return nil, err
}
var errors []string
data := make(map[string]map[string][]*schema.JobMetric)
for i, res := range resBody.Results {
var query ApiQuery
if resBody.Queries != nil {
query = resBody.Queries[i]
} else {
query = req.Queries[i]
}
metric := ccms.toLocalName(query.Metric)
qdata := res[0]
if qdata.Error != nil {
errors = append(errors, fmt.Sprintf("fetching %s for node %s failed: %s", metric, query.Hostname, *qdata.Error))
}
if qdata.Avg.IsNaN() || qdata.Min.IsNaN() || qdata.Max.IsNaN() {
// return nil, fmt.Errorf("fetching %s for node %s failed: %s", metric, query.Hostname, "avg/min/max is NaN")
qdata.Avg, qdata.Min, qdata.Max = 0., 0., 0.
}
hostdata, ok := data[query.Hostname]
if !ok {
hostdata = make(map[string][]*schema.JobMetric)
data[query.Hostname] = hostdata
}
mc := config.GetMetricConfig(cluster, metric)
hostdata[metric] = append(hostdata[metric], &schema.JobMetric{
Unit: mc.Unit,
Scope: schema.MetricScopeNode,
Timestep: mc.Timestep,
Series: []schema.Series{
{
Hostname: query.Hostname,
Data: qdata.Data,
Statistics: &schema.MetricStatistics{
Avg: float64(qdata.Avg),
Min: float64(qdata.Min),
Max: float64(qdata.Max),
},
},
},
})
}
if len(errors) != 0 {
return data, fmt.Errorf("cc-metric-store: %s", strings.Join(errors, ", "))
}
return data, nil
}
func intToStringSlice(is []int) []string {
ss := make([]string, len(is))
for i, x := range is {
ss[i] = strconv.Itoa(x)
}
return ss
}

View File

@@ -0,0 +1,308 @@
package metricdata
import (
"context"
"crypto/tls"
"encoding/json"
"errors"
"fmt"
"log"
"strings"
"time"
"github.com/ClusterCockpit/cc-backend/internal/config"
"github.com/ClusterCockpit/cc-backend/pkg/schema"
influxdb2 "github.com/influxdata/influxdb-client-go/v2"
influxdb2Api "github.com/influxdata/influxdb-client-go/v2/api"
)
type InfluxDBv2DataRepositoryConfig struct {
Url string `json:"url"`
Token string `json:"token"`
Bucket string `json:"bucket"`
Org string `json:"org"`
SkipTls bool `json:"skiptls"`
}
type InfluxDBv2DataRepository struct {
client influxdb2.Client
queryClient influxdb2Api.QueryAPI
bucket, measurement string
}
func (idb *InfluxDBv2DataRepository) Init(rawConfig json.RawMessage) error {
var config InfluxDBv2DataRepositoryConfig
if err := json.Unmarshal(rawConfig, &config); err != nil {
return err
}
idb.client = influxdb2.NewClientWithOptions(config.Url, config.Token, influxdb2.DefaultOptions().SetTLSConfig(&tls.Config{InsecureSkipVerify: config.SkipTls}))
idb.queryClient = idb.client.QueryAPI(config.Org)
idb.bucket = config.Bucket
return nil
}
func (idb *InfluxDBv2DataRepository) formatTime(t time.Time) string {
return t.Format(time.RFC3339) // Like “2006-01-02T15:04:05Z07:00”
}
func (idb *InfluxDBv2DataRepository) epochToTime(epoch int64) time.Time {
return time.Unix(epoch, 0)
}
func (idb *InfluxDBv2DataRepository) LoadData(job *schema.Job, metrics []string, scopes []schema.MetricScope, ctx context.Context) (schema.JobData, error) {
measurementsConds := make([]string, 0, len(metrics))
for _, m := range metrics {
measurementsConds = append(measurementsConds, fmt.Sprintf(`r["_measurement"] == "%s"`, m))
}
measurementsCond := strings.Join(measurementsConds, " or ")
hostsConds := make([]string, 0, len(job.Resources))
for _, h := range job.Resources {
if h.HWThreads != nil || h.Accelerators != nil {
// TODO
return nil, errors.New("the InfluxDB metric data repository does not yet support HWThreads or Accelerators")
}
hostsConds = append(hostsConds, fmt.Sprintf(`r["hostname"] == "%s"`, h.Hostname))
}
hostsCond := strings.Join(hostsConds, " or ")
jobData := make(schema.JobData) // Empty Schema: map[<string>FIELD]map[<MetricScope>SCOPE]<*JobMetric>METRIC
// Requested Scopes
for _, scope := range scopes {
query := ""
switch scope {
case "node":
// Get Finest Granularity, Groupy By Measurement and Hostname (== Metric / Node), Calculate Mean for 60s windows
// log.Println("Note: Scope 'node' requested. ")
query = fmt.Sprintf(`
from(bucket: "%s")
|> range(start: %s, stop: %s)
|> filter(fn: (r) => (%s) and (%s) )
|> drop(columns: ["_start", "_stop"])
|> group(columns: ["hostname", "_measurement"])
|> aggregateWindow(every: 60s, fn: mean)
|> drop(columns: ["_time"])`,
idb.bucket,
idb.formatTime(job.StartTime), idb.formatTime(idb.epochToTime(job.StartTimeUnix+int64(job.Duration)+int64(1))),
measurementsCond, hostsCond)
case "socket":
log.Println("Note: Scope 'socket' requested, but not yet supported: Will return 'node' scope only. ")
continue
case "core":
log.Println("Note: Scope 'core' requested, but not yet supported: Will return 'node' scope only. ")
continue
// Get Finest Granularity only, Set NULL to 0.0
// query = fmt.Sprintf(`
// from(bucket: "%s")
// |> range(start: %s, stop: %s)
// |> filter(fn: (r) => %s )
// |> filter(fn: (r) => %s )
// |> drop(columns: ["_start", "_stop", "cluster"])
// |> map(fn: (r) => (if exists r._value then {r with _value: r._value} else {r with _value: 0.0}))`,
// idb.bucket,
// idb.formatTime(job.StartTime), idb.formatTime(idb.epochToTime(job.StartTimeUnix + int64(job.Duration) + int64(1) )),
// measurementsCond, hostsCond)
default:
log.Println("Note: Unknown Scope requested: Will return 'node' scope. ")
continue
// return nil, errors.New("the InfluxDB metric data repository does not yet support other scopes than 'node'")
}
rows, err := idb.queryClient.Query(ctx, query)
if err != nil {
return nil, err
}
// Init Metrics: Only Node level now -> TODO: Matching /check on scope level ...
for _, metric := range metrics {
jobMetric, ok := jobData[metric]
if !ok {
mc := config.GetMetricConfig(job.Cluster, metric)
jobMetric = map[schema.MetricScope]*schema.JobMetric{
scope: { // uses scope var from above!
Unit: mc.Unit,
Scope: scope,
Timestep: mc.Timestep,
Series: make([]schema.Series, 0, len(job.Resources)),
StatisticsSeries: nil, // Should be: &schema.StatsSeries{},
},
}
}
jobData[metric] = jobMetric
}
// Process Result: Time-Data
field, host, hostSeries := "", "", schema.Series{}
// typeId := 0
switch scope {
case "node":
for rows.Next() {
row := rows.Record()
if host == "" || host != row.ValueByKey("hostname").(string) || rows.TableChanged() {
if host != "" {
// Append Series before reset
jobData[field][scope].Series = append(jobData[field][scope].Series, hostSeries)
}
field, host = row.Measurement(), row.ValueByKey("hostname").(string)
hostSeries = schema.Series{
Hostname: host,
Statistics: nil,
Data: make([]schema.Float, 0),
}
}
val, ok := row.Value().(float64)
if ok {
hostSeries.Data = append(hostSeries.Data, schema.Float(val))
} else {
hostSeries.Data = append(hostSeries.Data, schema.Float(0))
}
}
case "socket":
continue
case "core":
continue
// Include Series.Id in hostSeries
// for rows.Next() {
// row := rows.Record()
// if ( host == "" || host != row.ValueByKey("hostname").(string) || typeId != row.ValueByKey("type-id").(int) || rows.TableChanged() ) {
// if ( host != "" ) {
// // Append Series before reset
// jobData[field][scope].Series = append(jobData[field][scope].Series, hostSeries)
// }
// field, host, typeId = row.Measurement(), row.ValueByKey("hostname").(string), row.ValueByKey("type-id").(int)
// hostSeries = schema.Series{
// Hostname: host,
// Id: &typeId,
// Statistics: nil,
// Data: make([]schema.Float, 0),
// }
// }
// val := row.Value().(float64)
// hostSeries.Data = append(hostSeries.Data, schema.Float(val))
// }
default:
continue
// return nil, errors.New("the InfluxDB metric data repository does not yet support other scopes than 'node, core'")
}
// Append last Series
jobData[field][scope].Series = append(jobData[field][scope].Series, hostSeries)
}
// Get Stats
stats, err := idb.LoadStats(job, metrics, ctx)
if err != nil {
return nil, err
}
for _, scope := range scopes {
if scope == "node" { // No 'socket/core' support yet
for metric, nodes := range stats {
// log.Println(fmt.Sprintf("<< Add Stats for : Field %s >>", metric))
for node, stats := range nodes {
// log.Println(fmt.Sprintf("<< Add Stats for : Host %s : Min %.2f, Max %.2f, Avg %.2f >>", node, stats.Min, stats.Max, stats.Avg ))
for index, _ := range jobData[metric][scope].Series {
// log.Println(fmt.Sprintf("<< Try to add Stats to Series in Position %d >>", index))
if jobData[metric][scope].Series[index].Hostname == node {
// log.Println(fmt.Sprintf("<< Match for Series in Position %d : Host %s >>", index, jobData[metric][scope].Series[index].Hostname))
jobData[metric][scope].Series[index].Statistics = &schema.MetricStatistics{Avg: stats.Avg, Min: stats.Min, Max: stats.Max}
// log.Println(fmt.Sprintf("<< Result Inner: Min %.2f, Max %.2f, Avg %.2f >>", jobData[metric][scope].Series[index].Statistics.Min, jobData[metric][scope].Series[index].Statistics.Max, jobData[metric][scope].Series[index].Statistics.Avg))
}
}
}
}
}
}
// DEBUG:
// for _, scope := range scopes {
// for _, met := range metrics {
// for _, series := range jobData[met][scope].Series {
// log.Println(fmt.Sprintf("<< Result: %d data points for metric %s on %s with scope %s, Stats: Min %.2f, Max %.2f, Avg %.2f >>",
// len(series.Data), met, series.Hostname, scope,
// series.Statistics.Min, series.Statistics.Max, series.Statistics.Avg))
// }
// }
// }
return jobData, nil
}
func (idb *InfluxDBv2DataRepository) LoadStats(job *schema.Job, metrics []string, ctx context.Context) (map[string]map[string]schema.MetricStatistics, error) {
stats := map[string]map[string]schema.MetricStatistics{}
hostsConds := make([]string, 0, len(job.Resources))
for _, h := range job.Resources {
if h.HWThreads != nil || h.Accelerators != nil {
// TODO
return nil, errors.New("the InfluxDB metric data repository does not yet support HWThreads or Accelerators")
}
hostsConds = append(hostsConds, fmt.Sprintf(`r["hostname"] == "%s"`, h.Hostname))
}
hostsCond := strings.Join(hostsConds, " or ")
// lenMet := len(metrics)
for _, metric := range metrics {
// log.Println(fmt.Sprintf("<< You are here: %s (Index %d of %d metrics)", metric, index, lenMet))
query := fmt.Sprintf(`
data = from(bucket: "%s")
|> range(start: %s, stop: %s)
|> filter(fn: (r) => r._measurement == "%s" and r._field == "value" and (%s))
union(tables: [data |> mean(column: "_value") |> set(key: "_field", value: "avg"),
data |> min(column: "_value") |> set(key: "_field", value: "min"),
data |> max(column: "_value") |> set(key: "_field", value: "max")])
|> pivot(rowKey: ["hostname"], columnKey: ["_field"], valueColumn: "_value")
|> group()`,
idb.bucket,
idb.formatTime(job.StartTime), idb.formatTime(idb.epochToTime(job.StartTimeUnix+int64(job.Duration)+int64(1))),
metric, hostsCond)
rows, err := idb.queryClient.Query(ctx, query)
if err != nil {
return nil, err
}
nodes := map[string]schema.MetricStatistics{}
for rows.Next() {
row := rows.Record()
host := row.ValueByKey("hostname").(string)
avg, avgok := row.ValueByKey("avg").(float64)
if !avgok {
// log.Println(fmt.Sprintf(">> Assertion error for metric %s, statistic AVG. Expected 'float64', got %v", metric, avg))
avg = 0.0
}
min, minok := row.ValueByKey("min").(float64)
if !minok {
// log.Println(fmt.Sprintf(">> Assertion error for metric %s, statistic MIN. Expected 'float64', got %v", metric, min))
min = 0.0
}
max, maxok := row.ValueByKey("max").(float64)
if !maxok {
// log.Println(fmt.Sprintf(">> Assertion error for metric %s, statistic MAX. Expected 'float64', got %v", metric, max))
max = 0.0
}
nodes[host] = schema.MetricStatistics{
Avg: avg,
Min: min,
Max: max,
}
}
stats[metric] = nodes
}
return stats, nil
}
func (idb *InfluxDBv2DataRepository) LoadNodeData(cluster string, metrics, nodes []string, scopes []schema.MetricScope, from, to time.Time, ctx context.Context) (map[string]map[string][]*schema.JobMetric, error) {
// TODO : Implement to be used in Analysis- und System/Node-View
log.Println(fmt.Sprintf("LoadNodeData unimplemented for InfluxDBv2DataRepository, Args: cluster %s, metrics %v, nodes %v, scopes %v", cluster, metrics, nodes, scopes))
return nil, errors.New("unimplemented for InfluxDBv2DataRepository")
}

View File

@@ -0,0 +1,254 @@
package metricdata
import (
"context"
"encoding/json"
"fmt"
"time"
"github.com/ClusterCockpit/cc-backend/internal/config"
"github.com/ClusterCockpit/cc-backend/pkg/log"
"github.com/ClusterCockpit/cc-backend/pkg/schema"
"github.com/iamlouk/lrucache"
)
type MetricDataRepository interface {
// Initialize this MetricDataRepository. One instance of
// this interface will only ever be responsible for one cluster.
Init(rawConfig json.RawMessage) error
// Return the JobData for the given job, only with the requested metrics.
LoadData(job *schema.Job, metrics []string, scopes []schema.MetricScope, ctx context.Context) (schema.JobData, error)
// Return a map of metrics to a map of nodes to the metric statistics of the job. node scope assumed for now.
LoadStats(job *schema.Job, metrics []string, ctx context.Context) (map[string]map[string]schema.MetricStatistics, error)
// Return a map of hosts to a map of metrics at the requested scopes for that node.
LoadNodeData(cluster string, metrics, nodes []string, scopes []schema.MetricScope, from, to time.Time, ctx context.Context) (map[string]map[string][]*schema.JobMetric, error)
}
var metricDataRepos map[string]MetricDataRepository = map[string]MetricDataRepository{}
var JobArchivePath string
var useArchive bool
func Init(jobArchivePath string, disableArchive bool) error {
useArchive = !disableArchive
JobArchivePath = jobArchivePath
for _, cluster := range config.Clusters {
if cluster.MetricDataRepository != nil {
var kind struct {
Kind string `json:"kind"`
}
if err := json.Unmarshal(cluster.MetricDataRepository, &kind); err != nil {
return err
}
var mdr MetricDataRepository
switch kind.Kind {
case "cc-metric-store":
mdr = &CCMetricStore{}
case "influxdb":
mdr = &InfluxDBv2DataRepository{}
case "test":
mdr = &TestMetricDataRepository{}
default:
return fmt.Errorf("unkown metric data repository '%s' for cluster '%s'", kind.Kind, cluster.Name)
}
if err := mdr.Init(cluster.MetricDataRepository); err != nil {
return err
}
metricDataRepos[cluster.Name] = mdr
}
}
return nil
}
var cache *lrucache.Cache = lrucache.New(128 * 1024 * 1024)
// Fetches the metric data for a job.
func LoadData(job *schema.Job, metrics []string, scopes []schema.MetricScope, ctx context.Context) (schema.JobData, error) {
data := cache.Get(cacheKey(job, metrics, scopes), func() (_ interface{}, ttl time.Duration, size int) {
var jd schema.JobData
var err error
if job.State == schema.JobStateRunning ||
job.MonitoringStatus == schema.MonitoringStatusRunningOrArchiving ||
!useArchive {
repo, ok := metricDataRepos[job.Cluster]
if !ok {
return fmt.Errorf("no metric data repository configured for '%s'", job.Cluster), 0, 0
}
if scopes == nil {
scopes = append(scopes, schema.MetricScopeNode)
}
if metrics == nil {
cluster := config.GetCluster(job.Cluster)
for _, mc := range cluster.MetricConfig {
metrics = append(metrics, mc.Name)
}
}
jd, err = repo.LoadData(job, metrics, scopes, ctx)
if err != nil {
if len(jd) != 0 {
log.Errorf("partial error: %s", err.Error())
} else {
return err, 0, 0
}
}
size = jd.Size()
} else {
jd, err = loadFromArchive(job)
if err != nil {
return err, 0, 0
}
// Avoid sending unrequested data to the client:
if metrics != nil || scopes != nil {
if metrics == nil {
metrics = make([]string, 0, len(jd))
for k := range jd {
metrics = append(metrics, k)
}
}
res := schema.JobData{}
for _, metric := range metrics {
if perscope, ok := jd[metric]; ok {
if len(perscope) > 1 {
subset := make(map[schema.MetricScope]*schema.JobMetric)
for _, scope := range scopes {
if jm, ok := perscope[scope]; ok {
subset[scope] = jm
}
}
if len(subset) > 0 {
perscope = subset
}
}
res[metric] = perscope
}
}
jd = res
}
size = 1 // loadFromArchive() caches in the same cache.
}
ttl = 5 * time.Hour
if job.State == schema.JobStateRunning {
ttl = 2 * time.Minute
}
prepareJobData(job, jd, scopes)
return jd, ttl, size
})
if err, ok := data.(error); ok {
return nil, err
}
return data.(schema.JobData), nil
}
// Used for the jobsFootprint GraphQL-Query. TODO: Rename/Generalize.
func LoadAverages(job *schema.Job, metrics []string, data [][]schema.Float, ctx context.Context) error {
if job.State != schema.JobStateRunning && useArchive {
return loadAveragesFromArchive(job, metrics, data)
}
repo, ok := metricDataRepos[job.Cluster]
if !ok {
return fmt.Errorf("no metric data repository configured for '%s'", job.Cluster)
}
stats, err := repo.LoadStats(job, metrics, ctx)
if err != nil {
return err
}
for i, m := range metrics {
nodes, ok := stats[m]
if !ok {
data[i] = append(data[i], schema.NaN)
continue
}
sum := 0.0
for _, node := range nodes {
sum += node.Avg
}
data[i] = append(data[i], schema.Float(sum))
}
return nil
}
// Used for the node/system view. Returns a map of nodes to a map of metrics.
func LoadNodeData(cluster string, metrics, nodes []string, scopes []schema.MetricScope, from, to time.Time, ctx context.Context) (map[string]map[string][]*schema.JobMetric, error) {
repo, ok := metricDataRepos[cluster]
if !ok {
return nil, fmt.Errorf("no metric data repository configured for '%s'", cluster)
}
if metrics == nil {
for _, m := range config.GetCluster(cluster).MetricConfig {
metrics = append(metrics, m.Name)
}
}
data, err := repo.LoadNodeData(cluster, metrics, nodes, scopes, from, to, ctx)
if err != nil {
if len(data) != 0 {
log.Errorf("partial error: %s", err.Error())
} else {
return nil, err
}
}
if data == nil {
return nil, fmt.Errorf("the metric data repository for '%s' does not support this query", cluster)
}
return data, nil
}
func cacheKey(job *schema.Job, metrics []string, scopes []schema.MetricScope) string {
// Duration and StartTime do not need to be in the cache key as StartTime is less unique than
// job.ID and the TTL of the cache entry makes sure it does not stay there forever.
return fmt.Sprintf("%d(%s):[%v],[%v]",
job.ID, job.State, metrics, scopes)
}
// For /monitoring/job/<job> and some other places, flops_any and mem_bw need to be available at the scope 'node'.
// If a job has a lot of nodes, statisticsSeries should be available so that a min/mean/max Graph can be used instead of
// a lot of single lines.
func prepareJobData(job *schema.Job, jobData schema.JobData, scopes []schema.MetricScope) {
const maxSeriesSize int = 15
for _, scopes := range jobData {
for _, jm := range scopes {
if jm.StatisticsSeries != nil || len(jm.Series) <= maxSeriesSize {
continue
}
jm.AddStatisticsSeries()
}
}
nodeScopeRequested := false
for _, scope := range scopes {
if scope == schema.MetricScopeNode {
nodeScopeRequested = true
}
}
if nodeScopeRequested {
jobData.AddNodeScope("flops_any")
jobData.AddNodeScope("mem_bw")
}
}

View File

@@ -0,0 +1,32 @@
package metricdata
import (
"context"
"encoding/json"
"time"
"github.com/ClusterCockpit/cc-backend/pkg/schema"
)
var TestLoadDataCallback func(job *schema.Job, metrics []string, scopes []schema.MetricScope, ctx context.Context) (schema.JobData, error) = func(job *schema.Job, metrics []string, scopes []schema.MetricScope, ctx context.Context) (schema.JobData, error) {
panic("TODO")
}
// Only a mock for unit-testing.
type TestMetricDataRepository struct{}
func (tmdr *TestMetricDataRepository) Init(_ json.RawMessage) error {
return nil
}
func (tmdr *TestMetricDataRepository) LoadData(job *schema.Job, metrics []string, scopes []schema.MetricScope, ctx context.Context) (schema.JobData, error) {
return TestLoadDataCallback(job, metrics, scopes, ctx)
}
func (tmdr *TestMetricDataRepository) LoadStats(job *schema.Job, metrics []string, ctx context.Context) (map[string]map[string]schema.MetricStatistics, error) {
panic("TODO")
}
func (tmdr *TestMetricDataRepository) LoadNodeData(cluster string, metrics, nodes []string, scopes []schema.MetricScope, from, to time.Time, ctx context.Context) (map[string]map[string][]*schema.JobMetric, error) {
panic("TODO")
}