2021-10-26 10:24:43 +02:00
|
|
|
package metricdata
|
|
|
|
|
|
|
|
import (
|
|
|
|
"bufio"
|
2021-11-26 10:34:29 +01:00
|
|
|
"context"
|
2021-10-26 10:24:43 +02:00
|
|
|
"encoding/json"
|
2021-11-26 10:34:29 +01:00
|
|
|
"errors"
|
2021-10-26 10:24:43 +02:00
|
|
|
"fmt"
|
2021-11-26 10:34:29 +01:00
|
|
|
"math"
|
2021-10-26 10:24:43 +02:00
|
|
|
"os"
|
2021-11-26 10:34:29 +01:00
|
|
|
"path"
|
2021-10-26 10:24:43 +02:00
|
|
|
"path/filepath"
|
|
|
|
"strconv"
|
|
|
|
|
2021-11-26 10:34:29 +01:00
|
|
|
"github.com/ClusterCockpit/cc-jobarchive/config"
|
2021-10-26 10:24:43 +02:00
|
|
|
"github.com/ClusterCockpit/cc-jobarchive/schema"
|
|
|
|
)
|
|
|
|
|
|
|
|
// For a given job, return the path of the `data.json`/`meta.json` file.
|
|
|
|
// TODO: Implement Issue ClusterCockpit/ClusterCockpit#97
|
2021-12-17 15:49:22 +01:00
|
|
|
func getPath(job *schema.Job, file string, checkLegacy bool) (string, error) {
|
2021-12-16 13:17:48 +01:00
|
|
|
lvl1, lvl2 := fmt.Sprintf("%d", job.JobID/1000), fmt.Sprintf("%03d", job.JobID%1000)
|
2021-12-08 11:50:47 +01:00
|
|
|
if !checkLegacy {
|
2021-12-16 13:17:48 +01:00
|
|
|
return filepath.Join(JobArchivePath, job.Cluster, lvl1, lvl2, strconv.FormatInt(job.StartTime.Unix(), 10), file), nil
|
2021-12-08 11:50:47 +01:00
|
|
|
}
|
|
|
|
|
2021-12-16 13:17:48 +01:00
|
|
|
legacyPath := filepath.Join(JobArchivePath, job.Cluster, lvl1, lvl2, file)
|
2021-11-26 10:34:29 +01:00
|
|
|
if _, err := os.Stat(legacyPath); errors.Is(err, os.ErrNotExist) {
|
2021-12-16 13:17:48 +01:00
|
|
|
return filepath.Join(JobArchivePath, job.Cluster, lvl1, lvl2, strconv.FormatInt(job.StartTime.Unix(), 10), file), nil
|
2021-11-26 10:34:29 +01:00
|
|
|
}
|
|
|
|
|
|
|
|
return legacyPath, nil
|
2021-10-26 10:24:43 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
// Assuming job is completed/archived, return the jobs metric data.
|
2021-12-17 15:49:22 +01:00
|
|
|
func loadFromArchive(job *schema.Job) (schema.JobData, error) {
|
2021-12-08 11:50:47 +01:00
|
|
|
filename, err := getPath(job, "data.json", true)
|
2021-10-26 10:24:43 +02:00
|
|
|
if err != nil {
|
|
|
|
return nil, err
|
|
|
|
}
|
|
|
|
|
|
|
|
f, err := os.Open(filename)
|
|
|
|
if err != nil {
|
|
|
|
return nil, err
|
|
|
|
}
|
|
|
|
defer f.Close()
|
|
|
|
|
|
|
|
var data schema.JobData
|
|
|
|
if err := json.NewDecoder(bufio.NewReader(f)).Decode(&data); err != nil {
|
|
|
|
return nil, err
|
|
|
|
}
|
|
|
|
|
|
|
|
return data, nil
|
|
|
|
}
|
|
|
|
|
|
|
|
// If the job is archived, find its `meta.json` file and override the tags list
|
|
|
|
// in that JSON file. If the job is not archived, nothing is done.
|
2021-12-17 15:49:22 +01:00
|
|
|
func UpdateTags(job *schema.Job, tags []*schema.Tag) error {
|
|
|
|
if job.State == schema.JobStateRunning {
|
2021-10-26 10:24:43 +02:00
|
|
|
return nil
|
|
|
|
}
|
|
|
|
|
2021-12-08 11:50:47 +01:00
|
|
|
filename, err := getPath(job, "meta.json", true)
|
2021-10-26 10:24:43 +02:00
|
|
|
if err != nil {
|
|
|
|
return err
|
|
|
|
}
|
|
|
|
|
|
|
|
f, err := os.Open(filename)
|
|
|
|
if err != nil {
|
|
|
|
if os.IsNotExist(err) {
|
|
|
|
return nil
|
|
|
|
}
|
|
|
|
return err
|
|
|
|
}
|
|
|
|
|
2021-12-17 15:49:22 +01:00
|
|
|
var metaFile schema.JobMeta = schema.JobMeta{
|
|
|
|
BaseJob: schema.JobDefaults,
|
|
|
|
}
|
2021-10-26 10:24:43 +02:00
|
|
|
if err := json.NewDecoder(f).Decode(&metaFile); err != nil {
|
|
|
|
return err
|
|
|
|
}
|
|
|
|
f.Close()
|
|
|
|
|
2021-12-17 15:49:22 +01:00
|
|
|
metaFile.Tags = make([]*schema.Tag, 0)
|
2021-10-26 10:24:43 +02:00
|
|
|
for _, tag := range tags {
|
2021-12-17 15:49:22 +01:00
|
|
|
metaFile.Tags = append(metaFile.Tags, &schema.Tag{
|
|
|
|
Name: tag.Name,
|
|
|
|
Type: tag.Type,
|
2021-10-26 10:24:43 +02:00
|
|
|
})
|
|
|
|
}
|
|
|
|
|
|
|
|
bytes, err := json.Marshal(metaFile)
|
|
|
|
if err != nil {
|
|
|
|
return err
|
|
|
|
}
|
|
|
|
|
|
|
|
return os.WriteFile(filename, bytes, 0644)
|
|
|
|
}
|
|
|
|
|
|
|
|
// Helper to metricdata.LoadAverages().
|
2021-12-17 15:49:22 +01:00
|
|
|
func loadAveragesFromArchive(job *schema.Job, metrics []string, data [][]schema.Float) error {
|
2021-12-08 11:50:47 +01:00
|
|
|
filename, err := getPath(job, "meta.json", true)
|
2021-10-26 10:24:43 +02:00
|
|
|
if err != nil {
|
|
|
|
return err
|
|
|
|
}
|
|
|
|
|
|
|
|
bytes, err := os.ReadFile(filename)
|
|
|
|
if err != nil {
|
|
|
|
return err
|
|
|
|
}
|
|
|
|
|
|
|
|
var metaFile schema.JobMeta
|
|
|
|
if err := json.Unmarshal(bytes, &metaFile); err != nil {
|
|
|
|
return err
|
|
|
|
}
|
|
|
|
|
|
|
|
for i, m := range metrics {
|
|
|
|
if stat, ok := metaFile.Statistics[m]; ok {
|
|
|
|
data[i] = append(data[i], schema.Float(stat.Avg))
|
|
|
|
} else {
|
|
|
|
data[i] = append(data[i], schema.NaN)
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
return nil
|
|
|
|
}
|
2021-11-26 10:34:29 +01:00
|
|
|
|
|
|
|
// Writes a running job to the job-archive
|
2021-12-17 15:49:22 +01:00
|
|
|
func ArchiveJob(job *schema.Job, ctx context.Context) (*schema.JobMeta, error) {
|
|
|
|
if job.State != schema.JobStateRunning {
|
2021-12-16 09:35:03 +01:00
|
|
|
return nil, errors.New("cannot archive job that is not running")
|
2021-11-26 10:34:29 +01:00
|
|
|
}
|
|
|
|
|
|
|
|
allMetrics := make([]string, 0)
|
2021-12-16 13:17:48 +01:00
|
|
|
metricConfigs := config.GetClusterConfig(job.Cluster).MetricConfig
|
2021-11-26 10:34:29 +01:00
|
|
|
for _, mc := range metricConfigs {
|
|
|
|
allMetrics = append(allMetrics, mc.Name)
|
|
|
|
}
|
2022-01-07 09:47:41 +01:00
|
|
|
|
|
|
|
// TODO: Use more granular resolution on non-exclusive jobs?
|
|
|
|
scopes := []schema.MetricScope{schema.MetricScopeNode}
|
|
|
|
jobData, err := LoadData(job, allMetrics, scopes, ctx)
|
2021-11-26 10:34:29 +01:00
|
|
|
if err != nil {
|
2021-12-16 09:35:03 +01:00
|
|
|
return nil, err
|
2021-11-26 10:34:29 +01:00
|
|
|
}
|
|
|
|
|
2022-01-12 13:03:01 +01:00
|
|
|
if err := calcStatisticsSeries(job, jobData, 7); err != nil {
|
2022-01-07 09:47:41 +01:00
|
|
|
return nil, err
|
|
|
|
}
|
|
|
|
|
2021-12-17 15:49:22 +01:00
|
|
|
jobMeta := &schema.JobMeta{
|
|
|
|
BaseJob: job.BaseJob,
|
|
|
|
StartTime: job.StartTime.Unix(),
|
|
|
|
Statistics: make(map[string]schema.JobStatistics),
|
2021-11-26 10:34:29 +01:00
|
|
|
}
|
|
|
|
|
|
|
|
for metric, data := range jobData {
|
|
|
|
avg, min, max := 0.0, math.MaxFloat32, -math.MaxFloat32
|
2021-12-17 15:49:22 +01:00
|
|
|
nodeData, ok := data["node"]
|
|
|
|
if !ok {
|
|
|
|
// TODO/FIXME: Calc average for non-node metrics as well!
|
|
|
|
continue
|
|
|
|
}
|
|
|
|
|
|
|
|
for _, series := range nodeData.Series {
|
|
|
|
avg += series.Statistics.Avg
|
|
|
|
min = math.Min(min, series.Statistics.Min)
|
|
|
|
max = math.Max(max, series.Statistics.Max)
|
2021-11-26 10:34:29 +01:00
|
|
|
}
|
|
|
|
|
2021-12-17 15:49:22 +01:00
|
|
|
jobMeta.Statistics[metric] = schema.JobStatistics{
|
2021-12-16 13:17:48 +01:00
|
|
|
Unit: config.GetMetricConfig(job.Cluster, metric).Unit,
|
2021-11-26 10:34:29 +01:00
|
|
|
Avg: avg / float64(job.NumNodes),
|
|
|
|
Min: min,
|
|
|
|
Max: max,
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2021-12-16 09:35:03 +01:00
|
|
|
// If the file based archive is disabled,
|
|
|
|
// only return the JobMeta structure as the
|
|
|
|
// statistics in there are needed.
|
|
|
|
if !useArchive {
|
2021-12-17 15:49:22 +01:00
|
|
|
return jobMeta, nil
|
2021-12-16 09:35:03 +01:00
|
|
|
}
|
|
|
|
|
2021-12-08 11:50:47 +01:00
|
|
|
dirPath, err := getPath(job, "", false)
|
2021-11-26 10:34:29 +01:00
|
|
|
if err != nil {
|
2021-12-16 09:35:03 +01:00
|
|
|
return nil, err
|
2021-11-26 10:34:29 +01:00
|
|
|
}
|
|
|
|
|
|
|
|
if err := os.MkdirAll(dirPath, 0777); err != nil {
|
2021-12-16 09:35:03 +01:00
|
|
|
return nil, err
|
2021-11-26 10:34:29 +01:00
|
|
|
}
|
|
|
|
|
|
|
|
f, err := os.Create(path.Join(dirPath, "meta.json"))
|
|
|
|
if err != nil {
|
2021-12-16 09:35:03 +01:00
|
|
|
return nil, err
|
2021-11-26 10:34:29 +01:00
|
|
|
}
|
|
|
|
defer f.Close()
|
|
|
|
writer := bufio.NewWriter(f)
|
2021-12-17 15:49:22 +01:00
|
|
|
if err := json.NewEncoder(writer).Encode(jobMeta); err != nil {
|
2021-12-16 09:35:03 +01:00
|
|
|
return nil, err
|
2021-11-26 10:34:29 +01:00
|
|
|
}
|
|
|
|
if err := writer.Flush(); err != nil {
|
2021-12-16 09:35:03 +01:00
|
|
|
return nil, err
|
2021-11-26 10:34:29 +01:00
|
|
|
}
|
|
|
|
|
|
|
|
f, err = os.Create(path.Join(dirPath, "data.json"))
|
|
|
|
if err != nil {
|
2021-12-16 09:35:03 +01:00
|
|
|
return nil, err
|
2021-11-26 10:34:29 +01:00
|
|
|
}
|
|
|
|
writer = bufio.NewWriter(f)
|
2021-12-08 11:50:47 +01:00
|
|
|
if err := json.NewEncoder(writer).Encode(jobData); err != nil {
|
2021-12-16 09:35:03 +01:00
|
|
|
return nil, err
|
2021-11-26 10:34:29 +01:00
|
|
|
}
|
|
|
|
if err := writer.Flush(); err != nil {
|
2021-12-16 09:35:03 +01:00
|
|
|
return nil, err
|
2021-11-26 10:34:29 +01:00
|
|
|
}
|
|
|
|
|
2021-12-17 15:49:22 +01:00
|
|
|
return jobMeta, f.Close()
|
2021-11-26 10:34:29 +01:00
|
|
|
}
|
2022-01-07 09:47:41 +01:00
|
|
|
|
|
|
|
// Add statisticsSeries fields
|
2022-01-12 13:03:01 +01:00
|
|
|
func calcStatisticsSeries(job *schema.Job, jobData schema.JobData, maxSeries int) error {
|
2022-01-07 09:47:41 +01:00
|
|
|
for _, scopes := range jobData {
|
|
|
|
for _, jobMetric := range scopes {
|
|
|
|
if jobMetric.StatisticsSeries != nil {
|
|
|
|
continue
|
|
|
|
}
|
|
|
|
|
2022-01-12 13:03:01 +01:00
|
|
|
if len(jobMetric.Series) <= maxSeries {
|
2022-01-07 09:47:41 +01:00
|
|
|
continue
|
|
|
|
}
|
|
|
|
|
|
|
|
n := 0
|
|
|
|
for _, series := range jobMetric.Series {
|
|
|
|
if len(series.Data) > n {
|
|
|
|
n = len(series.Data)
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
mean, min, max := make([]schema.Float, n), make([]schema.Float, n), make([]schema.Float, n)
|
|
|
|
for i := 0; i < n; i++ {
|
|
|
|
sum, smin, smax := schema.Float(0.), math.MaxFloat32, -math.MaxFloat32
|
|
|
|
for _, series := range jobMetric.Series {
|
2022-01-10 16:13:40 +01:00
|
|
|
if i >= len(series.Data) {
|
2022-01-07 09:47:41 +01:00
|
|
|
sum, smin, smax = schema.NaN, math.NaN(), math.NaN()
|
|
|
|
break
|
|
|
|
}
|
|
|
|
x := series.Data[i]
|
|
|
|
sum += x
|
|
|
|
smin = math.Min(smin, float64(x))
|
|
|
|
smax = math.Max(smax, float64(x))
|
|
|
|
}
|
|
|
|
sum /= schema.Float(len(jobMetric.Series))
|
|
|
|
mean[i] = sum
|
|
|
|
min[i] = schema.Float(smin)
|
|
|
|
max[i] = schema.Float(smax)
|
|
|
|
}
|
|
|
|
|
2022-01-10 16:13:40 +01:00
|
|
|
jobMetric.StatisticsSeries = &schema.StatsSeries{
|
|
|
|
Min: min, Mean: mean, Max: max,
|
|
|
|
}
|
2022-01-07 09:47:41 +01:00
|
|
|
jobMetric.Series = nil
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
return nil
|
|
|
|
}
|