Merge branch 'master' into 37-provide-a-s3-compatible-storage-backend-for-the-job-archive

2025-10-23 15:23:32 +02:00
294 changed files with 35907 additions and 19676 deletions

View File

@@ -1,26 +1,20 @@
// Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg.
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
package archive
import (
"bufio"
"bytes"
"compress/gzip"
"encoding/json"
"fmt"
"io"
"path/filepath"
"strconv"
"sync"
"github.com/ClusterCockpit/cc-backend/internal/config"
"github.com/ClusterCockpit/cc-backend/pkg/log"
"github.com/ClusterCockpit/cc-backend/pkg/lrucache"
"github.com/ClusterCockpit/cc-backend/pkg/schema"
)
const Version uint64 = 1
const Version uint64 = 2
type ArchiveBackend interface {
Init(rawConfig json.RawMessage) (uint64, error)
@@ -33,6 +27,8 @@ type ArchiveBackend interface {
LoadJobData(job *schema.Job) (schema.JobData, error)
LoadJobStats(job *schema.Job) (schema.ScopedJobStats, error)
LoadClusterCfg(name string) (*schema.Cluster, error)
StoreJobMeta(jobMeta *schema.JobMeta) error
@@ -60,105 +56,55 @@ type JobContainer struct {
}
var (
initOnce sync.Once
cache *lrucache.Cache = lrucache.New(128 * 1024 * 1024)
ar ArchiveBackend
useArchive bool
)
func getPath(
job *schema.Job,
rootPath string,
file string,
) string {
return filepath.Join(
getDirectory(job, rootPath), file)
}
func getDirectory(
job *schema.Job,
rootPath string,
) string {
lvl1, lvl2 := fmt.Sprintf("%d", job.JobID/1000), fmt.Sprintf("%03d", job.JobID%1000)
return filepath.Join(
rootPath,
job.Cluster,
lvl1, lvl2,
strconv.FormatInt(job.StartTime.Unix(), 10))
}
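For orientation, a minimal self-contained sketch (the archive root, cluster name, job ID, and start time are all made up) of the directory layout getDirectory() produces:

package main

import (
	"fmt"
	"path/filepath"
	"strconv"
)

func main() {
	// Hypothetical job: cluster "emmy", JobID 1337, started at Unix time 1609459200.
	jobID, start := int64(1337), int64(1609459200)
	lvl1, lvl2 := fmt.Sprintf("%d", jobID/1000), fmt.Sprintf("%03d", jobID%1000)
	fmt.Println(filepath.Join("/var/lib/job-archive", "emmy", lvl1, lvl2,
		strconv.FormatInt(start, 10)))
	// Prints: /var/lib/job-archive/emmy/1/337/1609459200
}

The two-level split (job ID divided by and modulo 1000) keeps any single directory from accumulating more than a thousand entries.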
func loadJobMeta(b []byte) (*schema.JobMeta, error) {
if config.Keys.Validate {
if err := schema.Validate(schema.Meta, bytes.NewReader(b)); err != nil {
return &schema.JobMeta{}, fmt.Errorf("validate job meta: %v", err)
}
}
return DecodeJobMeta(bytes.NewReader(b))
}
func loadJobData(f io.Reader, key string, isCompressed bool) (schema.JobData, error) {
if isCompressed {
r, err := gzip.NewReader(f)
if err != nil {
log.Errorf("loadJobData() > gzip.NewReader() error: %v", err)
return nil, err
}
defer r.Close()
if config.Keys.Validate {
if err := schema.Validate(schema.Data, r); err != nil {
return schema.JobData{}, fmt.Errorf("validate job data: %v", err)
}
}
return DecodeJobData(r, key)
} else {
if config.Keys.Validate {
if err := schema.Validate(schema.Data, bufio.NewReader(f)); err != nil {
return schema.JobData{}, fmt.Errorf("validate job data: %v", err)
}
}
return DecodeJobData(bufio.NewReader(f), key)
}
}
func Init(rawConfig json.RawMessage, disableArchive bool) error {
	var err error

	initOnce.Do(func() {
		useArchive = !disableArchive

		var cfg struct {
			Kind string `json:"kind"`
		}

		if err = json.Unmarshal(rawConfig, &cfg); err != nil {
			log.Warn("Error while unmarshaling raw config json")
			return
		}

		switch cfg.Kind {
		case "file":
			ar = &FsArchive{}
		// case "s3":
		// 	ar = &S3Archive{}
		default:
			err = fmt.Errorf("ARCHIVE/ARCHIVE > unknown archive backend '%s'", cfg.Kind)
			return // avoid calling Init on a nil backend
		}

		var version uint64
		version, err = ar.Init(rawConfig)
		if err != nil {
			log.Errorf("Error while initializing archiveBackend: %s", err.Error())
			return
		}
		log.Infof("Load archive version %d", version)

		err = initClusterConfig()
	})

	return err
}
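Because initialization runs inside sync.Once, only the first call does any work. A caller sketch (the config literal and path are made up); note that err is a per-call local captured by the closure, so a second call returns nil even if the first one failed:

package main

import (
	"encoding/json"
	"log"

	"github.com/ClusterCockpit/cc-backend/pkg/archive"
)

func main() {
	// Hypothetical config for the "file" backend.
	rawCfg := json.RawMessage(`{"kind": "file", "path": "./var/job-archive"}`)
	if err := archive.Init(rawCfg, false); err != nil {
		log.Fatalf("archive init failed: %s", err)
	}
	// The Once closure is skipped here, so this returns nil regardless.
	_ = archive.Init(rawCfg, false)
}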
func GetHandle() ArchiveBackend {
return ar
}
// Helper to metricdata.LoadAverages().
// Helper to metricdataloader.LoadAverages().
func LoadAveragesFromArchive(
job *schema.Job,
metrics []string,
@@ -166,7 +112,7 @@ func LoadAveragesFromArchive(
) error {
metaFile, err := ar.LoadJobMeta(job)
if err != nil {
log.Warn("Error while loading job metadata from archiveBackend")
log.Errorf("Error while loading job metadata from archiveBackend: %s", err.Error())
return err
}
@@ -181,16 +127,80 @@ func LoadAveragesFromArchive(
return nil
}
// Helper to metricdataloader.LoadJobStats().
func LoadStatsFromArchive(
job *schema.Job,
metrics []string,
) (map[string]schema.MetricStatistics, error) {
data := make(map[string]schema.MetricStatistics, len(metrics))
metaFile, err := ar.LoadJobMeta(job)
if err != nil {
log.Errorf("Error while loading job metadata from archiveBackend: %s", err.Error())
return data, err
}
for _, m := range metrics {
stat, ok := metaFile.Statistics[m]
if !ok {
data[m] = schema.MetricStatistics{Min: 0.0, Avg: 0.0, Max: 0.0}
continue
}
data[m] = schema.MetricStatistics{
Avg: stat.Avg,
Min: stat.Min,
Max: stat.Max,
}
}
return data, nil
}
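A hedged consumer sketch for LoadStatsFromArchive (the metric names are examples and printAvgBandwidth is a hypothetical helper); metrics missing from meta.json come back zero-valued rather than as errors:

// Sketch, not part of the diff.
func printAvgBandwidth(job *schema.Job) error {
	stats, err := archive.LoadStatsFromArchive(job, []string{"flops_any", "mem_bw"})
	if err != nil {
		return err
	}
	fmt.Printf("mem_bw avg: %f\n", stats["mem_bw"].Avg)
	return nil
}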
// Helper to metricdataloader.LoadScopedJobStats().
func LoadScopedStatsFromArchive(
job *schema.Job,
metrics []string,
scopes []schema.MetricScope,
) (schema.ScopedJobStats, error) {
data, err := ar.LoadJobStats(job)
if err != nil {
log.Errorf("Error while loading job stats from archiveBackend: %s", err.Error())
return nil, err
}
return data, nil
}
func GetStatistics(job *schema.Job) (map[string]schema.JobStatistics, error) {
metaFile, err := ar.LoadJobMeta(job)
if err != nil {
log.Warn("Error while loading job metadata from archiveBackend")
log.Errorf("Error while loading job metadata from archiveBackend: %s", err.Error())
return nil, err
}
return metaFile.Statistics, nil
}
// If the job is archived, find its `meta.json` file and override the Metadata
// in that JSON file. If the job is not archived, nothing is done.
func UpdateMetadata(job *schema.Job, metadata map[string]string) error {
if job.State == schema.JobStateRunning || !useArchive {
return nil
}
jobMeta, err := ar.LoadJobMeta(job)
if err != nil {
log.Errorf("Error while loading job metadata from archiveBackend: %s", err.Error())
return err
}
if jobMeta.MetaData == nil {
jobMeta.MetaData = make(map[string]string, len(metadata))
}
for k, v := range metadata {
jobMeta.MetaData[k] = v
}
return ar.StoreJobMeta(jobMeta)
}
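A short usage sketch (the key and value are made up); running jobs and disabled archives are skipped by the early return above:

// Sketch: attach a hypothetical note to an archived job's metadata.
if err := archive.UpdateMetadata(job, map[string]string{
	"note": "re-checked after migration", // made-up value
}); err != nil {
	log.Errorf("metadata update failed: %s", err.Error())
}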
// If the job is archived, find its `meta.json` file and override the tags list
// in that JSON file. If the job is not archived, nothing is done.
func UpdateTags(job *schema.Job, tags []*schema.Tag) error {
@@ -200,15 +210,16 @@ func UpdateTags(job *schema.Job, tags []*schema.Tag) error {
jobMeta, err := ar.LoadJobMeta(job)
if err != nil {
log.Warn("Error while loading job metadata from archiveBackend")
log.Errorf("Error while loading job metadata from archiveBackend: %s", err.Error())
return err
}
jobMeta.Tags = make([]*schema.Tag, 0)
for _, tag := range tags {
jobMeta.Tags = append(jobMeta.Tags, &schema.Tag{
Name: tag.Name,
Type: tag.Type,
Name: tag.Name,
Type: tag.Type,
Scope: tag.Scope,
})
}
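The commented-out "s3" case in Init() points at where this branch is headed. A hypothetical skeleton (type and field names are assumptions, not part of this diff) only needs to satisfy ArchiveBackend; just Init is sketched here, and the remaining methods would follow the same pattern as FsArchive:

// Hypothetical sketch — not part of this diff.
type S3ArchiveConfig struct {
	Endpoint string `json:"endpoint"`
	Bucket   string `json:"bucket"`
}

type S3Archive struct {
	bucket string
}

func (s3a *S3Archive) Init(rawConfig json.RawMessage) (uint64, error) {
	var cfg S3ArchiveConfig
	if err := json.Unmarshal(rawConfig, &cfg); err != nil {
		return 0, fmt.Errorf("parse S3 archive config: %w", err)
	}
	s3a.bucket = cfg.Bucket
	// A real implementation would create an S3 client here and read the
	// bucket's version object, mirroring FsArchive's version file check.
	return Version, nil
}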

View File

@@ -1,4 +1,4 @@
// Copyright (C) 2023 NHR@FAU, University Erlangen-Nuremberg.
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.

View File

@@ -1,4 +1,4 @@
// Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg.
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
@@ -12,13 +12,16 @@ import (
"github.com/ClusterCockpit/cc-backend/pkg/schema"
)
var Clusters []*schema.Cluster
var nodeLists map[string]map[string]NodeList
var (
Clusters []*schema.Cluster
GlobalMetricList []*schema.GlobalMetricListItem
NodeLists map[string]map[string]NodeList
)
func initClusterConfig() error {
Clusters = []*schema.Cluster{}
nodeLists = map[string]map[string]NodeList{}
NodeLists = map[string]map[string]NodeList{}
metricLookup := make(map[string]schema.GlobalMetricListItem)
for _, c := range ar.GetClusters() {
@@ -49,11 +52,79 @@ func initClusterConfig() error {
if !mc.Scope.Valid() {
return errors.New("cluster.metricConfig.scope must be a valid scope ('node', 'socket', ...)")
}
ml, ok := metricLookup[mc.Name]
if !ok {
metricLookup[mc.Name] = schema.GlobalMetricListItem{
Name: mc.Name, Scope: mc.Scope, Unit: mc.Unit, Footprint: mc.Footprint,
}
ml = metricLookup[mc.Name]
}
availability := schema.ClusterSupport{Cluster: cluster.Name}
scLookup := make(map[string]*schema.SubClusterConfig)
for _, scc := range mc.SubClusters {
scLookup[scc.Name] = scc
}
for _, sc := range cluster.SubClusters {
newMetric := &schema.MetricConfig{
Unit: mc.Unit,
Energy: mc.Energy,
Name: mc.Name,
Scope: mc.Scope,
Aggregation: mc.Aggregation,
Peak: mc.Peak,
Caution: mc.Caution,
Alert: mc.Alert,
Timestep: mc.Timestep,
Normal: mc.Normal,
LowerIsBetter: mc.LowerIsBetter,
}
if mc.Footprint != "" {
newMetric.Footprint = mc.Footprint
}
if cfg, ok := scLookup[sc.Name]; ok {
if !cfg.Remove {
availability.SubClusters = append(availability.SubClusters, sc.Name)
newMetric.Peak = cfg.Peak
newMetric.Normal = cfg.Normal
newMetric.Caution = cfg.Caution
newMetric.Alert = cfg.Alert
newMetric.Footprint = cfg.Footprint
newMetric.Energy = cfg.Energy
newMetric.LowerIsBetter = cfg.LowerIsBetter
sc.MetricConfig = append(sc.MetricConfig, *newMetric)
if newMetric.Footprint != "" {
sc.Footprint = append(sc.Footprint, newMetric.Name)
ml.Footprint = newMetric.Footprint
}
if newMetric.Energy != "" {
sc.EnergyFootprint = append(sc.EnergyFootprint, newMetric.Name)
}
}
} else {
availability.SubClusters = append(availability.SubClusters, sc.Name)
sc.MetricConfig = append(sc.MetricConfig, *newMetric)
if newMetric.Footprint != "" {
sc.Footprint = append(sc.Footprint, newMetric.Name)
}
if newMetric.Energy != "" {
sc.EnergyFootprint = append(sc.EnergyFootprint, newMetric.Name)
}
}
}
ml.Availability = append(metricLookup[mc.Name].Availability, availability)
metricLookup[mc.Name] = ml
}
Clusters = append(Clusters, cluster)
nodeLists[cluster.Name] = make(map[string]NodeList)
NodeLists[cluster.Name] = make(map[string]NodeList)
for _, sc := range cluster.SubClusters {
if sc.Nodes == "*" {
continue
@@ -63,15 +134,18 @@ func initClusterConfig() error {
if err != nil {
return fmt.Errorf("ARCHIVE/CLUSTERCONFIG > in %s/cluster.json: %w", cluster.Name, err)
}
nodeLists[cluster.Name][sc.Name] = nl
NodeLists[cluster.Name][sc.Name] = nl
}
}
for _, ml := range metricLookup {
GlobalMetricList = append(GlobalMetricList, &ml)
}
return nil
}
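With GlobalMetricList and NodeLists now exported, consumers outside the package can enumerate metric availability. A small sketch (the output format is illustrative):

// Sketch: print every global metric and the subclusters it covers.
for _, gm := range archive.GlobalMetricList {
	for _, avail := range gm.Availability {
		fmt.Printf("%s: %s %v\n", gm.Name, avail.Cluster, avail.SubClusters)
	}
}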
func GetCluster(cluster string) *schema.Cluster {
for _, c := range Clusters {
if c.Name == cluster {
return c
@@ -90,11 +164,10 @@ func GetSubCluster(cluster, subcluster string) (*schema.SubCluster, error) {
}
}
}
return nil, fmt.Errorf("Subcluster '%v' not found for cluster '%v', or cluster '%v' not configured!", subcluster, cluster, cluster)
return nil, fmt.Errorf("subcluster '%v' not found for cluster '%v', or cluster '%v' not configured", subcluster, cluster, cluster)
}
func GetMetricConfig(cluster, metric string) *schema.MetricConfig {
for _, c := range Clusters {
if c.Name == cluster {
for _, m := range c.MetricConfig {
@@ -110,7 +183,6 @@ func GetMetricConfig(cluster, metric string) *schema.MetricConfig {
// AssignSubCluster sets the `job.subcluster` property of the job based
// on its cluster and resources.
func AssignSubCluster(job *schema.BaseJob) error {
cluster := GetCluster(job.Cluster)
if cluster == nil {
return fmt.Errorf("ARCHIVE/CLUSTERCONFIG > unknown cluster: %v", job.Cluster)
@@ -130,7 +202,7 @@ func AssignSubCluster(job *schema.BaseJob) error {
}
host0 := job.Resources[0].Hostname
for sc, nl := range nodeLists[job.Cluster] {
for sc, nl := range NodeLists[job.Cluster] {
if nl != nil && nl.Contains(host0) {
job.SubCluster = sc
return nil
@@ -146,8 +218,7 @@ func AssignSubCluster(job *schema.BaseJob) error {
}
func GetSubClusterByNode(cluster, hostname string) (string, error) {
for sc, nl := range nodeLists[cluster] {
for sc, nl := range NodeLists[cluster] {
if nl != nil && nl.Contains(hostname) {
return sc, nil
}
@@ -164,3 +235,13 @@ func GetSubClusterByNode(cluster, hostname string) (string, error) {
return "", fmt.Errorf("ARCHIVE/CLUSTERCONFIG > no subcluster found for cluster %v and host %v", cluster, hostname)
}
func MetricIndex(mc []schema.MetricConfig, name string) (int, error) {
for i, m := range mc {
if m.Name == name {
return i, nil
}
}
return 0, fmt.Errorf("unknown metric name %s", name)
}

View File

@@ -0,0 +1,39 @@
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
package archive_test
import (
"encoding/json"
"testing"
"github.com/ClusterCockpit/cc-backend/pkg/archive"
)
func TestClusterConfig(t *testing.T) {
if err := archive.Init(json.RawMessage(`{"kind": "file", "path": "testdata/archive"}`), false); err != nil {
t.Fatal(err)
}
sc, err := archive.GetSubCluster("fritz", "spr1tb")
if err != nil {
t.Fatal(err)
}
// spew.Dump(sc.MetricConfig)
if len(sc.Footprint) != 3 {
t.Fail()
}
if len(sc.MetricConfig) != 15 {
t.Fail()
}
for _, metric := range sc.MetricConfig {
if metric.LowerIsBetter && metric.Name != "mem_used" {
t.Fail()
}
}
// spew.Dump(archive.GlobalMetricList)
// t.Fail()
}

View File

@@ -1,4 +1,4 @@
// Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg.
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
@@ -40,6 +40,109 @@ type clusterInfo struct {
diskSize float64
}
func getDirectory(
job *schema.Job,
rootPath string,
) string {
lvl1, lvl2 := fmt.Sprintf("%d", job.JobID/1000), fmt.Sprintf("%03d", job.JobID%1000)
return filepath.Join(
rootPath,
job.Cluster,
lvl1, lvl2,
strconv.FormatInt(job.StartTime.Unix(), 10))
}
func getPath(
job *schema.Job,
rootPath string,
file string,
) string {
return filepath.Join(
getDirectory(job, rootPath), file)
}
func loadJobMeta(filename string) (*schema.JobMeta, error) {
b, err := os.ReadFile(filename)
if err != nil {
log.Errorf("loadJobMeta() > open file error: %v", err)
return &schema.JobMeta{}, err
}
if config.Keys.Validate {
if err := schema.Validate(schema.Meta, bytes.NewReader(b)); err != nil {
return &schema.JobMeta{}, fmt.Errorf("validate job meta: %v", err)
}
}
return DecodeJobMeta(bytes.NewReader(b))
}
func loadJobData(filename string, isCompressed bool) (schema.JobData, error) {
f, err := os.Open(filename)
if err != nil {
log.Errorf("fsBackend LoadJobData()- %v", err)
return nil, err
}
defer f.Close()
if isCompressed {
r, err := gzip.NewReader(f)
if err != nil {
log.Errorf("loadJobData() > gzip.NewReader() error: %v", err)
return nil, err
}
defer r.Close()
if config.Keys.Validate {
if err := schema.Validate(schema.Data, r); err != nil {
return schema.JobData{}, fmt.Errorf("validate job data: %v", err)
}
}
return DecodeJobData(r, filename)
} else {
if config.Keys.Validate {
if err := schema.Validate(schema.Data, bufio.NewReader(f)); err != nil {
return schema.JobData{}, fmt.Errorf("validate job data: %v", err)
}
}
return DecodeJobData(bufio.NewReader(f), filename)
}
}
func loadJobStats(filename string, isCompressed bool) (schema.ScopedJobStats, error) {
f, err := os.Open(filename)
if err != nil {
log.Errorf("fsBackend LoadJobStats()- %v", err)
return nil, err
}
defer f.Close()
if isCompressed {
r, err := gzip.NewReader(f)
if err != nil {
log.Errorf("loadJobStats() > gzip.NewReader() error: %v", err)
return nil, err
}
defer r.Close()
if config.Keys.Validate {
if err := schema.Validate(schema.Data, r); err != nil {
return nil, fmt.Errorf("validate job data: %v", err)
}
}
return DecodeJobStats(r, filename)
} else {
if config.Keys.Validate {
if err := schema.Validate(schema.Data, bufio.NewReader(f)); err != nil {
return nil, fmt.Errorf("validate job data: %v", err)
}
}
return DecodeJobStats(bufio.NewReader(f), filename)
}
}
func (fsa *FsArchive) Init(rawConfig json.RawMessage) (uint64, error) {
var config FsArchiveConfig
if err := json.Unmarshal(rawConfig, &config); err != nil {
@@ -317,6 +420,18 @@ func (fsa *FsArchive) LoadJobData(job *schema.Job) (schema.JobData, error) {
return loadJobData(filename, isCompressed)
}
func (fsa *FsArchive) LoadJobStats(job *schema.Job) (schema.ScopedJobStats, error) {
isCompressed := true
filename := getPath(job, fsa.path, "data.json.gz")
if !util.CheckFileExists(filename) {
filename = getPath(job, fsa.path, "data.json")
isCompressed = false
}
return loadJobStats(filename, isCompressed)
}
func (fsa *FsArchive) LoadJobMeta(job *schema.Job) (*schema.JobMeta, error) {
filename := getPath(job, fsa.path, "meta.json")
b, err := os.ReadFile(filename)

View File

@@ -1,4 +1,4 @@
// Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg.
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
@@ -49,10 +49,10 @@ func TestInit(t *testing.T) {
if fsa.path != "testdata/archive" {
t.Fail()
}
if version != 1 {
if version != 2 {
t.Fail()
}
if len(fsa.clusters) != 1 || fsa.clusters[0] != "emmy" {
if len(fsa.clusters) != 3 || fsa.clusters[1] != "emmy" {
t.Fail()
}
}

View File

@@ -1,4 +1,4 @@
// Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg.
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
@@ -9,8 +9,8 @@ import (
"io"
"time"
"github.com/ClusterCockpit/cc-backend/pkg/schema"
"github.com/ClusterCockpit/cc-backend/pkg/log"
"github.com/ClusterCockpit/cc-backend/pkg/schema"
)
func DecodeJobData(r io.Reader, k string) (schema.JobData, error) {
@@ -32,6 +32,43 @@ func DecodeJobData(r io.Reader, k string) (schema.JobData, error) {
return data.(schema.JobData), nil
}
func DecodeJobStats(r io.Reader, k string) (schema.ScopedJobStats, error) {
jobData, err := DecodeJobData(r, k)
// Convert schema.JobData to schema.ScopedJobStats
if jobData != nil {
scopedJobStats := make(schema.ScopedJobStats)
for metric, metricData := range jobData {
if _, ok := scopedJobStats[metric]; !ok {
scopedJobStats[metric] = make(map[schema.MetricScope][]*schema.ScopedStats)
}
for scope, jobMetric := range metricData {
if _, ok := scopedJobStats[metric][scope]; !ok {
scopedJobStats[metric][scope] = make([]*schema.ScopedStats, 0)
}
for _, series := range jobMetric.Series {
scopedJobStats[metric][scope] = append(scopedJobStats[metric][scope], &schema.ScopedStats{
Hostname: series.Hostname,
Id: series.Id,
Data: &series.Statistics,
})
}
// Drop empty scopes (and then empty metrics) so that later len() checks on scopedJobStats are meaningful
if len(scopedJobStats[metric][scope]) == 0 {
delete(scopedJobStats[metric], scope)
if len(scopedJobStats[metric]) == 0 {
delete(scopedJobStats, metric)
}
}
}
}
return scopedJobStats, nil
}
return nil, err
}
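A usage sketch for the new decoder (readStats is a hypothetical helper; the filename handling mirrors the fsBackend loaders):

// Sketch: decode scoped statistics from an uncompressed data.json.
func readStats(filename string) (schema.ScopedJobStats, error) {
	f, err := os.Open(filename)
	if err != nil {
		return nil, err
	}
	defer f.Close()
	return DecodeJobStats(bufio.NewReader(f), filename)
}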
func DecodeJobMeta(r io.Reader) (*schema.JobMeta, error) {
var d schema.JobMeta
if err := json.NewDecoder(r).Decode(&d); err != nil {

View File

@@ -1,4 +1,4 @@
// Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg.
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.

View File

@@ -1,4 +1,4 @@
// Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg.
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.

View File

@@ -1,4 +1,4 @@
// Copyright (C) 2023 NHR@FAU, University Erlangen-Nuremberg.
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.

File diff suppressed because it is too large

File diff suppressed because it is too large

View File

@@ -1 +1 @@
1
2