From a50b832c2ac2e31387100b9b18cc42b62d859ebe Mon Sep 17 00:00:00 2001 From: Jan Eitzinger Date: Fri, 8 Aug 2025 14:24:52 +0200 Subject: [PATCH 01/11] Import metric store packages --- internal/avro/avroCheckpoint.go | 473 +++++++++++++++++ internal/avro/avroHelper.go | 79 +++ internal/avro/avroStruct.go | 163 ++++++ internal/memorystore/archive.go | 190 +++++++ internal/memorystore/buffer.go | 233 +++++++++ internal/memorystore/checkpoint.go | 764 ++++++++++++++++++++++++++++ internal/memorystore/config.go | 26 + internal/memorystore/debug.go | 107 ++++ internal/memorystore/healthcheck.go | 88 ++++ internal/memorystore/level.go | 187 +++++++ internal/memorystore/memorystore.go | 372 ++++++++++++++ internal/memorystore/stats.go | 120 +++++ 12 files changed, 2802 insertions(+) create mode 100644 internal/avro/avroCheckpoint.go create mode 100644 internal/avro/avroHelper.go create mode 100644 internal/avro/avroStruct.go create mode 100644 internal/memorystore/archive.go create mode 100644 internal/memorystore/buffer.go create mode 100644 internal/memorystore/checkpoint.go create mode 100644 internal/memorystore/config.go create mode 100644 internal/memorystore/debug.go create mode 100644 internal/memorystore/healthcheck.go create mode 100644 internal/memorystore/level.go create mode 100644 internal/memorystore/memorystore.go create mode 100644 internal/memorystore/stats.go diff --git a/internal/avro/avroCheckpoint.go b/internal/avro/avroCheckpoint.go new file mode 100644 index 0000000..4a3cf19 --- /dev/null +++ b/internal/avro/avroCheckpoint.go @@ -0,0 +1,473 @@ +// Copyright (C) NHR@FAU, University Erlangen-Nuremberg. +// All rights reserved. This file is part of cc-backend. +// Use of this source code is governed by a MIT-style +// license that can be found in the LICENSE file. 
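+//
+// Package avro stages metric updates per cluster/node/frequency and
+// periodically flushes them to Avro object container files (OCF),
+// merging the on-disk schema with newly appearing metric fields.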
+package avro + +import ( + "bufio" + "encoding/json" + "errors" + "fmt" + "log" + "os" + "path" + "sort" + "strconv" + "strings" + "sync" + "sync/atomic" + "time" + + "github.com/ClusterCockpit/cc-lib/util" + "github.com/linkedin/goavro/v2" +) + +var NumWorkers int = 4 + +var ErrNoNewData error = errors.New("no data in the pool") + +func (as *AvroStore) ToCheckpoint(dir string, dumpAll bool) (int, error) { + levels := make([]*AvroLevel, 0) + selectors := make([][]string, 0) + as.root.lock.RLock() + // Cluster + for sel1, l1 := range as.root.children { + l1.lock.RLock() + // Node + for sel2, l2 := range l1.children { + l2.lock.RLock() + // Frequency + for sel3, l3 := range l2.children { + levels = append(levels, l3) + selectors = append(selectors, []string{sel1, sel2, sel3}) + } + l2.lock.RUnlock() + } + l1.lock.RUnlock() + } + as.root.lock.RUnlock() + + type workItem struct { + level *AvroLevel + dir string + selector []string + } + + n, errs := int32(0), int32(0) + + var wg sync.WaitGroup + wg.Add(NumWorkers) + work := make(chan workItem, NumWorkers*2) + for range NumWorkers { + go func() { + defer wg.Done() + + for workItem := range work { + var from int64 = getTimestamp(workItem.dir) + + if err := workItem.level.toCheckpoint(workItem.dir, from, dumpAll); err != nil { + if err == ErrNoNewData { + continue + } + + log.Printf("error while checkpointing %#v: %s", workItem.selector, err.Error()) + atomic.AddInt32(&errs, 1) + } else { + atomic.AddInt32(&n, 1) + } + } + }() + } + + for i := range len(levels) { + dir := path.Join(dir, path.Join(selectors[i]...)) + work <- workItem{ + level: levels[i], + dir: dir, + selector: selectors[i], + } + } + + close(work) + wg.Wait() + + if errs > 0 { + return int(n), fmt.Errorf("%d errors happend while creating avro checkpoints (%d successes)", errs, n) + } + return int(n), nil +} + +// getTimestamp returns the timestamp from the directory name +func getTimestamp(dir string) int64 { + // Extract the resolution and timestamp from the directory name + // The existing avro file will be in epoch timestamp format + // iterate over all the files in the directory and find the maximum timestamp + // and return it + + resolution := path.Base(dir) + dir = path.Dir(dir) + + files, err := os.ReadDir(dir) + if err != nil { + return 0 + } + var maxTs int64 = 0 + + if len(files) == 0 { + return 0 + } + + for _, file := range files { + if file.IsDir() { + continue + } + name := file.Name() + + if len(name) < 5 || !strings.HasSuffix(name, ".avro") || !strings.HasPrefix(name, resolution+"_") { + continue + } + + ts, err := strconv.ParseInt(name[strings.Index(name, "_")+1:len(name)-5], 10, 64) + if err != nil { + fmt.Printf("error while parsing timestamp: %s\n", err.Error()) + continue + } + + if ts > maxTs { + maxTs = ts + } + } + + interval, _ := time.ParseDuration(Keys.Checkpoints.Interval) + updateTime := time.Unix(maxTs, 0).Add(interval).Add(time.Duration(CheckpointBufferMinutes-1) * time.Minute).Unix() + + if updateTime < time.Now().Unix() { + return 0 + } + + return maxTs +} + +func (l *AvroLevel) toCheckpoint(dir string, from int64, dumpAll bool) error { + l.lock.Lock() + defer l.lock.Unlock() + + // fmt.Printf("Checkpointing directory: %s\n", dir) + // filepath contains the resolution + int_res, _ := strconv.Atoi(path.Base(dir)) + + // find smallest overall timestamp in l.data map and delete it from l.data + var minTs int64 = int64(1<<63 - 1) + for ts, dat := range l.data { + if ts < minTs && len(dat) != 0 { + minTs = ts + } + } + + if from == 0 && minTs != 
int64(1<<63-1) { + from = minTs + } + + if from == 0 { + return ErrNoNewData + } + + var schema string + var codec *goavro.Codec + record_list := make([]map[string]interface{}, 0) + + var f *os.File + + filePath := dir + fmt.Sprintf("_%d.avro", from) + + var err error + + fp_, err_ := os.Stat(filePath) + if errors.Is(err_, os.ErrNotExist) { + err = os.MkdirAll(path.Dir(dir), 0o755) + if err != nil { + return fmt.Errorf("failed to create directory: %v", err) + } + } else if fp_.Size() != 0 { + f, err = os.Open(filePath) + if err != nil { + return fmt.Errorf("failed to open existing avro file: %v", err) + } + + br := bufio.NewReader(f) + + reader, err := goavro.NewOCFReader(br) + if err != nil { + return fmt.Errorf("failed to create OCF reader: %v", err) + } + codec = reader.Codec() + schema = codec.Schema() + + f.Close() + } + + time_ref := time.Now().Add(time.Duration(-CheckpointBufferMinutes+1) * time.Minute).Unix() + + if dumpAll { + time_ref = time.Now().Unix() + } + + // Empty values + if len(l.data) == 0 { + // we checkpoint avro files every 60 seconds + repeat := 60 / int_res + + for range repeat { + record_list = append(record_list, make(map[string]interface{})) + } + } + + readFlag := true + + for ts := range l.data { + flag := false + if ts < time_ref { + data := l.data[ts] + + schema_gen, err := generateSchema(data) + if err != nil { + return err + } + + flag, schema, err = compareSchema(schema, schema_gen) + if err != nil { + return fmt.Errorf("failed to compare read and generated schema: %v", err) + } + if flag && readFlag && !errors.Is(err_, os.ErrNotExist) { + + f.Close() + + f, err = os.Open(filePath) + if err != nil { + return fmt.Errorf("failed to open Avro file: %v", err) + } + + br := bufio.NewReader(f) + + ocfReader, err := goavro.NewOCFReader(br) + if err != nil { + return fmt.Errorf("failed to create OCF reader while changing schema: %v", err) + } + + for ocfReader.Scan() { + record, err := ocfReader.Read() + if err != nil { + return fmt.Errorf("failed to read record: %v", err) + } + + record_list = append(record_list, record.(map[string]interface{})) + } + + f.Close() + + err = os.Remove(filePath) + if err != nil { + return fmt.Errorf("failed to delete file: %v", err) + } + + readFlag = false + } + codec, err = goavro.NewCodec(schema) + if err != nil { + return fmt.Errorf("failed to create codec after merged schema: %v", err) + } + + record_list = append(record_list, generateRecord(data)) + delete(l.data, ts) + } + } + + if len(record_list) == 0 { + return ErrNoNewData + } + + f, err = os.OpenFile(filePath, os.O_CREATE|os.O_APPEND|os.O_RDWR, 0o644) + if err != nil { + return fmt.Errorf("failed to append new avro file: %v", err) + } + + // fmt.Printf("Codec : %#v\n", codec) + + writer, err := goavro.NewOCFWriter(goavro.OCFConfig{ + W: f, + Codec: codec, + CompressionName: goavro.CompressionDeflateLabel, + }) + if err != nil { + return fmt.Errorf("failed to create OCF writer: %v", err) + } + + // Append the new record + if err := writer.Append(record_list); err != nil { + return fmt.Errorf("failed to append record: %v", err) + } + + f.Close() + + return nil +} + +func compareSchema(schemaRead, schemaGen string) (bool, string, error) { + var genSchema, readSchema AvroSchema + + if schemaRead == "" { + return false, schemaGen, nil + } + + // Unmarshal the schema strings into AvroSchema structs + if err := json.Unmarshal([]byte(schemaGen), &genSchema); err != nil { + return false, "", fmt.Errorf("failed to parse generated schema: %v", err) + } + if err := 
json.Unmarshal([]byte(schemaRead), &readSchema); err != nil {
+		return false, "", fmt.Errorf("failed to parse read schema: %v", err)
+	}
+
+	sort.Slice(genSchema.Fields, func(i, j int) bool {
+		return genSchema.Fields[i].Name < genSchema.Fields[j].Name
+	})
+
+	sort.Slice(readSchema.Fields, func(i, j int) bool {
+		return readSchema.Fields[i].Name < readSchema.Fields[j].Name
+	})
+
+	// Check if the generated schema is a prefix of the read schema
+	schemasEqual := true
+	if len(genSchema.Fields) <= len(readSchema.Fields) {
+
+		for i := range genSchema.Fields {
+			if genSchema.Fields[i].Name != readSchema.Fields[i].Name {
+				schemasEqual = false
+				break
+			}
+		}
+
+		// If all generated fields are already present, keep the read schema
+		if schemasEqual {
+			return false, schemaRead, nil
+		}
+	}
+
+	// Create a map to hold unique fields from both schemas
+	fieldMap := make(map[string]AvroField)
+
+	// Add fields from the read schema
+	for _, field := range readSchema.Fields {
+		fieldMap[field.Name] = field
+	}
+
+	// Add or update fields from the generated schema
+	for _, field := range genSchema.Fields {
+		fieldMap[field.Name] = field
+	}
+
+	// Create a union schema by collecting fields from the map
+	var mergedFields []AvroField
+	for _, field := range fieldMap {
+		mergedFields = append(mergedFields, field)
+	}
+
+	// Sort fields by name for consistency
+	sort.Slice(mergedFields, func(i, j int) bool {
+		return mergedFields[i].Name < mergedFields[j].Name
+	})
+
+	// Create the merged schema
+	mergedSchema := AvroSchema{
+		Type:   "record",
+		Name:   genSchema.Name,
+		Fields: mergedFields,
+	}
+
+	// Check if the merged schema is identical to the read schema
+	schemasEqual = len(mergedSchema.Fields) == len(readSchema.Fields)
+	if schemasEqual {
+		for i := range mergedSchema.Fields {
+			if mergedSchema.Fields[i].Name != readSchema.Fields[i].Name {
+				schemasEqual = false
+				break
+			}
+		}
+
+		if schemasEqual {
+			return false, schemaRead, nil
+		}
+	}
+
+	// Marshal the merged schema back to JSON
+	mergedSchemaJson, err := json.Marshal(mergedSchema)
+	if err != nil {
+		return false, "", fmt.Errorf("failed to marshal merged schema: %v", err)
+	}
+
+	return true, string(mergedSchemaJson), nil
+}
+
+func generateSchema(data map[string]util.Float) (string, error) {
+	// Define the Avro schema structure
+	schema := map[string]interface{}{
+		"type":   "record",
+		"name":   "DataRecord",
+		"fields": []map[string]interface{}{},
+	}
+
+	fieldTracker := make(map[string]struct{})
+
+	for key := range data {
+		if _, exists := fieldTracker[key]; !exists {
+			key = correctKey(key)
+
+			field := map[string]interface{}{
+				"name":    key,
+				"type":    "double",
+				"default": -1.0,
+			}
+			schema["fields"] = append(schema["fields"].([]map[string]interface{}), field)
+			fieldTracker[key] = struct{}{}
+		}
+	}
+
+	schemaString, err := json.Marshal(schema)
+	if err != nil {
+		return "", fmt.Errorf("failed to marshal schema: %v", err)
+	}
+
+	return string(schemaString), nil
+}
+
+func generateRecord(data map[string]util.Float) map[string]interface{} {
+	record := make(map[string]interface{})
+
+	// Iterate over each metric in the data map
+	for key, value := range data {
+		key = correctKey(key)
+
+		// Set the value in the record
+		record[key] = value.Double()
+	}
+
+	return record
+}
+
+func correctKey(key string) string {
+	// Avro field names must not contain ':' or '.', so encode them:
+	// ':' becomes "___" and '.' becomes "__".
+	key = strings.ReplaceAll(key, ":", "___")
+	key = strings.ReplaceAll(key, ".", "__")
+
+	return key
+}
+
+func ReplaceKey(key string) string {
+	// Inverse of correctKey: decode the substitutions again,
+	// "___" becomes ':' and "__" becomes '.'.
+	key = strings.ReplaceAll(key, "___", ":")
+	key = strings.ReplaceAll(key, "__", ".")
+
+	return key
+}
diff --git a/internal/avro/avroHelper.go b/internal/avro/avroHelper.go
new file mode 100644
index 0000000..ee09759
--- /dev/null
+++ b/internal/avro/avroHelper.go
@@ -0,0 +1,79 @@
+package avro
+
+import (
+	"context"
+	"fmt"
+	"strconv"
+	"sync"
+)
+
+func DataStaging(wg *sync.WaitGroup, ctx context.Context) {
+	// Stage incoming line-protocol messages into the Avro store.
+	go func() {
+		if Keys.Checkpoints.FileFormat == "json" {
+			wg.Done() // Mark this goroutine as done
+			return    // Exit the goroutine
+		}
+
+		defer wg.Done()
+
+		var avroLevel *AvroLevel
+		oldSelector := make([]string, 0)
+
+		for {
+			select {
+			case <-ctx.Done():
+				return
+			case val := <-LineProtocolMessages:
+				// Fetch the frequency of the metric from the global configuration
+				freq, err := Keys.GetMetricFrequency(val.MetricName)
+				if err != nil {
+					fmt.Printf("Error fetching metric frequency: %s\n", err)
+					continue
+				}
+
+				metricName := ""
+
+				for _, selector_name := range val.Selector {
+					metricName += selector_name + Delimiter
+				}
+
+				metricName += val.MetricName
+
+				// Create a new selector for the Avro level.
+				// The selector is a slice of strings that represents the path to
+				// the Avro level. It is built from the cluster, node, and frequency.
+				var selector []string
+				selector = append(selector, val.Cluster, val.Node, strconv.FormatInt(freq, 10))
+
+				if !testEq(oldSelector, selector) {
+					// Get the Avro level for the metric
+					avroLevel = avroStore.root.findAvroLevelOrCreate(selector)
+
+					// findAvroLevelOrCreate never returns nil; guard anyway
+					if avroLevel == nil {
+						fmt.Printf("Error creating or finding the level with cluster : %s, node : %s, metric : %s\n", val.Cluster, val.Node, val.MetricName)
+						continue
+					}
+					oldSelector = append([]string{}, selector...)
+				}
+
+				avroLevel.addMetric(metricName, val.Value, val.Timestamp, int(freq))
+			}
+		}
+	}()
+}
+
+func testEq(a, b []string) bool {
+	if len(a) != len(b) {
+		return false
+	}
+	for i := range a {
+		if a[i] != b[i] {
+			return false
+		}
+	}
+	return true
+}
diff --git a/internal/avro/avroStruct.go b/internal/avro/avroStruct.go
new file mode 100644
index 0000000..27aac47
--- /dev/null
+++ b/internal/avro/avroStruct.go
@@ -0,0 +1,163 @@
+package avro
+
+import (
+	"sync"
+
+	"github.com/ClusterCockpit/cc-lib/util"
+)
+
+var (
+	LineProtocolMessages = make(chan *AvroStruct)
+	Delimiter            = "ZZZZZ"
+)
+
+// CheckpointBufferMinutes should always be in minutes.
+// It controls how much data is held in memory before it is checkpointed.
+var CheckpointBufferMinutes = 3
+
+type AvroStruct struct {
+	MetricName string
+	Cluster    string
+	Node       string
+	Selector   []string
+	Value      util.Float
+	Timestamp  int64
+}
+
+type AvroStore struct {
+	root AvroLevel
+}
+
+var avroStore AvroStore
+
+type AvroLevel struct {
+	children map[string]*AvroLevel
+	data     map[int64]map[string]util.Float
+	lock     sync.RWMutex
+}
+
+type AvroField struct {
+	Name    string      `json:"name"`
+	Type    interface{} `json:"type"`
+	Default interface{} `json:"default,omitempty"`
+}
+
+type AvroSchema struct {
+	Type   string      `json:"type"`
+	Name   string      `json:"name"`
+	Fields []AvroField `json:"fields"`
+}
+
+func (l *AvroLevel) findAvroLevelOrCreate(selector []string) *AvroLevel {
+	if len(selector) == 0 {
+		return l
+	}
+
+	// Allow concurrent reads:
+	l.lock.RLock()
+	var child *AvroLevel
+	var ok bool
+	if l.children == nil {
+		// Children map needs to be created...
+ l.lock.RUnlock() + } else { + child, ok := l.children[selector[0]] + l.lock.RUnlock() + if ok { + return child.findAvroLevelOrCreate(selector[1:]) + } + } + + // The level does not exist, take write lock for unqiue access: + l.lock.Lock() + // While this thread waited for the write lock, another thread + // could have created the child node. + if l.children != nil { + child, ok = l.children[selector[0]] + if ok { + l.lock.Unlock() + return child.findAvroLevelOrCreate(selector[1:]) + } + } + + child = &AvroLevel{ + data: make(map[int64]map[string]util.Float, 0), + children: nil, + } + + if l.children != nil { + l.children[selector[0]] = child + } else { + l.children = map[string]*AvroLevel{selector[0]: child} + } + l.lock.Unlock() + return child.findAvroLevelOrCreate(selector[1:]) +} + +func (l *AvroLevel) addMetric(metricName string, value util.Float, timestamp int64, Freq int) { + l.lock.Lock() + defer l.lock.Unlock() + + KeyCounter := int(CheckpointBufferMinutes * 60 / Freq) + + // Create keys in advance for the given amount of time + if len(l.data) != KeyCounter { + if len(l.data) == 0 { + for i := range KeyCounter { + l.data[timestamp+int64(i*Freq)] = make(map[string]util.Float, 0) + } + } else { + // Get the last timestamp + var lastTs int64 + for ts := range l.data { + if ts > lastTs { + lastTs = ts + } + } + // Create keys for the next KeyCounter timestamps + l.data[lastTs+int64(Freq)] = make(map[string]util.Float, 0) + } + } + + closestTs := int64(0) + minDiff := int64(Freq) + 1 // Start with diff just outside the valid range + found := false + + // Iterate over timestamps and choose the one which is within range. + // Since its epoch time, we check if the difference is less than 60 seconds. + for ts, dat := range l.data { + // Check if timestamp is within range + diff := timestamp - ts + if diff < -int64(Freq) || diff > int64(Freq) { + continue + } + + // Metric already present at this timestamp — skip + if _, ok := dat[metricName]; ok { + continue + } + + // Check if this is the closest timestamp so far + if Abs(diff) < minDiff { + minDiff = Abs(diff) + closestTs = ts + found = true + } + } + + if found { + l.data[closestTs][metricName] = value + } +} + +func GetAvroStore() *AvroStore { + return &avroStore +} + +// Abs returns the absolute value of x. +func Abs(x int64) int64 { + if x < 0 { + return -x + } + return x +} diff --git a/internal/memorystore/archive.go b/internal/memorystore/archive.go new file mode 100644 index 0000000..6e25aff --- /dev/null +++ b/internal/memorystore/archive.go @@ -0,0 +1,190 @@ +// Copyright (C) NHR@FAU, University Erlangen-Nuremberg. +// All rights reserved. This file is part of cc-backend. +// Use of this source code is governed by a MIT-style +// license that can be found in the LICENSE file. 
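+//
+// Package memorystore keeps metric data in a tree of levels
+// (cluster/host/...); this file implements moving old checkpoint
+// files into ZIP archives.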
+package memorystore + +import ( + "archive/zip" + "bufio" + "context" + "errors" + "fmt" + "io" + "os" + "path/filepath" + "sync" + "sync/atomic" + "time" + + cclog "github.com/ClusterCockpit/cc-lib/ccLogger" +) + +func Archiving(wg *sync.WaitGroup, ctx context.Context) { + go func() { + defer wg.Done() + d, err := time.ParseDuration(Keys.Archive.Interval) + if err != nil { + cclog.Fatalf("error parsing archive interval duration: %v\n", err) + } + if d <= 0 { + return + } + + ticks := func() <-chan time.Time { + if d <= 0 { + return nil + } + return time.NewTicker(d).C + }() + for { + select { + case <-ctx.Done(): + return + case <-ticks: + t := time.Now().Add(-d) + cclog.Infof("start archiving checkpoints (older than %s)...\n", t.Format(time.RFC3339)) + n, err := ArchiveCheckpoints(Keys.Checkpoints.RootDir, + Keys.Archive.RootDir, t.Unix(), Keys.Archive.DeleteInstead) + + if err != nil { + cclog.Warnf("archiving failed: %s\n", err.Error()) + } else { + cclog.Infof("done: %d files zipped and moved to archive\n", n) + } + } + } + }() +} + +var ErrNoNewData error = errors.New("all data already archived") + +// ZIP all checkpoint files older than `from` together and write them to the `archiveDir`, +// deleting them from the `checkpointsDir`. +func ArchiveCheckpoints(checkpointsDir, archiveDir string, from int64, deleteInstead bool) (int, error) { + entries1, err := os.ReadDir(checkpointsDir) + if err != nil { + return 0, err + } + + type workItem struct { + cdir, adir string + cluster, host string + } + + var wg sync.WaitGroup + n, errs := int32(0), int32(0) + work := make(chan workItem, NumWorkers) + + wg.Add(NumWorkers) + for worker := 0; worker < NumWorkers; worker++ { + go func() { + defer wg.Done() + for workItem := range work { + m, err := archiveCheckpoints(workItem.cdir, workItem.adir, from, deleteInstead) + if err != nil { + cclog.Errorf("error while archiving %s/%s: %s", workItem.cluster, workItem.host, err.Error()) + atomic.AddInt32(&errs, 1) + } + atomic.AddInt32(&n, int32(m)) + } + }() + } + + for _, de1 := range entries1 { + entries2, e := os.ReadDir(filepath.Join(checkpointsDir, de1.Name())) + if e != nil { + err = e + } + + for _, de2 := range entries2 { + cdir := filepath.Join(checkpointsDir, de1.Name(), de2.Name()) + adir := filepath.Join(archiveDir, de1.Name(), de2.Name()) + work <- workItem{ + adir: adir, cdir: cdir, + cluster: de1.Name(), host: de2.Name(), + } + } + } + + close(work) + wg.Wait() + + if err != nil { + return int(n), err + } + + if errs > 0 { + return int(n), fmt.Errorf("%d errors happend while archiving (%d successes)", errs, n) + } + return int(n), nil +} + +// Helper function for `ArchiveCheckpoints`. 
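+// It zips all checkpoint files in `dir` older than `from` into
+// `<archiveDir>/<from>.zip` and removes the originals; if `deleteInstead`
+// is set, the files are deleted without being archived.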
+func archiveCheckpoints(dir string, archiveDir string, from int64, deleteInstead bool) (int, error) {
+	entries, err := os.ReadDir(dir)
+	if err != nil {
+		return 0, err
+	}
+
+	extension := Keys.Checkpoints.FileFormat
+	files, err := findFiles(entries, from, extension, false)
+	if err != nil {
+		return 0, err
+	}
+
+	if deleteInstead {
+		n := 0
+		for _, checkpoint := range files {
+			filename := filepath.Join(dir, checkpoint)
+			if err = os.Remove(filename); err != nil {
+				return n, err
+			}
+			n += 1
+		}
+		return n, nil
+	}
+
+	filename := filepath.Join(archiveDir, fmt.Sprintf("%d.zip", from))
+	f, err := os.OpenFile(filename, os.O_CREATE|os.O_WRONLY, 0o644)
+	if err != nil && os.IsNotExist(err) {
+		err = os.MkdirAll(archiveDir, 0o755)
+		if err == nil {
+			f, err = os.OpenFile(filename, os.O_CREATE|os.O_WRONLY, 0o644)
+		}
+	}
+	if err != nil {
+		return 0, err
+	}
+	defer f.Close()
+	bw := bufio.NewWriter(f)
+	defer bw.Flush()
+	zw := zip.NewWriter(bw)
+	defer zw.Close()
+
+	n := 0
+	for _, checkpoint := range files {
+		filename := filepath.Join(dir, checkpoint)
+		r, err := os.Open(filename)
+		if err != nil {
+			return n, err
+		}
+		defer r.Close()
+
+		w, err := zw.Create(checkpoint)
+		if err != nil {
+			return n, err
+		}
+
+		if _, err = io.Copy(w, r); err != nil {
+			return n, err
+		}
+
+		if err = os.Remove(filename); err != nil {
+			return n, err
+		}
+		n += 1
+	}
+
+	return n, nil
+}
diff --git a/internal/memorystore/buffer.go b/internal/memorystore/buffer.go
new file mode 100644
index 0000000..d084c6d
--- /dev/null
+++ b/internal/memorystore/buffer.go
@@ -0,0 +1,233 @@
+package memorystore
+
+import (
+	"errors"
+	"sync"
+
+	"github.com/ClusterCockpit/cc-lib/util"
+)
+
+// Default buffer capacity.
+// `buffer.data` will only ever grow up to its capacity and a new link
+// in the buffer chain will be created if needed so that no copying
+// of data or reallocation needs to happen on writes.
+const (
+	BUFFER_CAP int = 512
+)
+
+// So that we can reuse allocations
+var bufferPool sync.Pool = sync.Pool{
+	New: func() interface{} {
+		return &buffer{
+			data: make([]util.Float, 0, BUFFER_CAP),
+		}
+	},
+}
+
+var (
+	ErrNoData           error = errors.New("no data for this metric/level")
+	ErrDataDoesNotAlign error = errors.New("data from lower granularities does not align")
+)
+
+// Each metric on each level has its own buffer.
+// This is where the actual values go.
+// If `cap(data)` is reached, a new buffer is created and
+// becomes the new head of a buffer list.
+type buffer struct {
+	prev      *buffer
+	next      *buffer
+	data      []util.Float
+	frequency int64
+	start     int64
+	archived  bool
+	closed    bool
+}
+
+func newBuffer(ts, freq int64) *buffer {
+	b := bufferPool.Get().(*buffer)
+	b.frequency = freq
+	b.start = ts - (freq / 2)
+	b.prev = nil
+	b.next = nil
+	b.archived = false
+	b.closed = false
+	b.data = b.data[:0]
+	return b
+}
+
+// If a new buffer was created, the new head is returned.
+// Otherwise, the existing buffer is returned.
+// Normally, only "newer" data should be written, but if the value would
+// end up in the same buffer anyway, it is allowed.
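+//
+// Illustrative example (numbers are made up): with freq = 60,
+// newBuffer(ts=120, freq=60) sets start = 120-30 = 90. A write at ts = 150
+// then lands at idx = (150-90)/60 = 1, the slot with nominal time
+// firstWrite()+60 = 180, which covers timestamps in [150, 210).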
+func (b *buffer) write(ts int64, value util.Float) (*buffer, error) {
+	if ts < b.start {
+		return nil, errors.New("cannot write value to buffer from past")
+	}
+
+	// idx := int((ts - b.start + (b.frequency / 3)) / b.frequency)
+	idx := int((ts - b.start) / b.frequency)
+	if idx >= cap(b.data) {
+		newbuf := newBuffer(ts, b.frequency)
+		newbuf.prev = b
+		b.next = newbuf
+		b.close()
+		b = newbuf
+		idx = 0
+	}
+
+	// Overwriting value or writing value from past
+	if idx < len(b.data) {
+		b.data[idx] = value
+		return b, nil
+	}
+
+	// Fill up unwritten slots with NaN
+	for i := len(b.data); i < idx; i++ {
+		b.data = append(b.data, util.NaN)
+	}
+
+	b.data = append(b.data, value)
+	return b, nil
+}
+
+func (b *buffer) end() int64 {
+	return b.firstWrite() + int64(len(b.data))*b.frequency
+}
+
+func (b *buffer) firstWrite() int64 {
+	return b.start + (b.frequency / 2)
+}
+
+func (b *buffer) close() {}
+
+/*
+func (b *buffer) close() {
+	if b.closed {
+		return
+	}
+
+	b.closed = true
+	n, sum, min, max := 0, 0., math.MaxFloat64, -math.MaxFloat64
+	for _, x := range b.data {
+		if x.IsNaN() {
+			continue
+		}
+
+		n += 1
+		f := float64(x)
+		sum += f
+		min = math.Min(min, f)
+		max = math.Max(max, f)
+	}
+
+	b.statistics.samples = n
+	if n > 0 {
+		b.statistics.avg = Float(sum / float64(n))
+		b.statistics.min = Float(min)
+		b.statistics.max = Float(max)
+	} else {
+		b.statistics.avg = NaN
+		b.statistics.min = NaN
+		b.statistics.max = NaN
+	}
+}
+*/
+
+// func interpolate(idx int, data []Float) Float {
+// 	if idx == 0 || idx+1 == len(data) {
+// 		return NaN
+// 	}
+// 	return (data[idx-1] + data[idx+1]) / 2.0
+// }
+
+// Return all known values from `from` to `to`. Gaps of information are
+// represented as NaN. Linear interpolation between neighboring cells is
+// currently disabled (see the commented-out `interpolate` helper).
+// If values at the start or end are missing, the second and third return
+// values contain the actual `from`/`to` instead of NaN values.
+// This function walks back the buffer chain if `from` is older than the
+// current buffer's start.
+// The loaded values are added to `data` and `data` is returned, possibly with a shorter length.
+// If `data` is not long enough to hold all values, this function will panic!
+func (b *buffer) read(from, to int64, data []util.Float) ([]util.Float, int64, int64, error) {
+	if from < b.firstWrite() {
+		if b.prev != nil {
+			return b.prev.read(from, to, data)
+		}
+		from = b.firstWrite()
+	}
+
+	i := 0
+	t := from
+	for ; t < to; t += b.frequency {
+		idx := int((t - b.start) / b.frequency)
+		if idx >= cap(b.data) {
+			if b.next == nil {
+				break
+			}
+			b = b.next
+			idx = 0
+		}
+
+		if idx >= len(b.data) {
+			if b.next == nil || to <= b.next.start {
+				break
+			}
+			data[i] += util.NaN
+		} else if t < b.start {
+			data[i] += util.NaN
+			// } else if b.data[idx].IsNaN() {
+			// 	data[i] += interpolate(idx, b.data)
+		} else {
+			data[i] += b.data[idx]
+		}
+		i++
+	}
+
+	return data[:i], from, t, nil
+}
+
+// Returns true if this buffer needs to be freed.
+func (b *buffer) free(t int64) (delme bool, n int) {
+	if b.prev != nil {
+		delme, m := b.prev.free(t)
+		n += m
+		if delme {
+			b.prev.next = nil
+			if cap(b.prev.data) == BUFFER_CAP {
+				bufferPool.Put(b.prev)
+			}
+			b.prev = nil
+		}
+	}
+
+	end := b.end()
+	if end < t {
+		return true, n + 1
+	}
+
+	return false, n
+}
+
+// Call `callback` on every buffer that contains data in the range from `from` to `to`.
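+// The chain is walked oldest-first: `b.prev` is visited recursively
+// before `callback` is invoked for `b` itself.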
+func (b *buffer) iterFromTo(from, to int64, callback func(b *buffer) error) error { + if b == nil { + return nil + } + + if err := b.prev.iterFromTo(from, to, callback); err != nil { + return err + } + + if from <= b.end() && b.start <= to { + return callback(b) + } + + return nil +} + +func (b *buffer) count() int64 { + res := int64(len(b.data)) + if b.prev != nil { + res += b.prev.count() + } + return res +} diff --git a/internal/memorystore/checkpoint.go b/internal/memorystore/checkpoint.go new file mode 100644 index 0000000..ecd6fb1 --- /dev/null +++ b/internal/memorystore/checkpoint.go @@ -0,0 +1,764 @@ +package memorystore + +import ( + "bufio" + "context" + "encoding/json" + "errors" + "fmt" + "io/fs" + "log" + "os" + "path" + "path/filepath" + "runtime" + "sort" + "strconv" + "strings" + "sync" + "sync/atomic" + "time" + + "github.com/ClusterCockpit/cc-backend/pkg/avro" + "github.com/ClusterCockpit/cc-lib/util" + "github.com/linkedin/goavro/v2" +) + +// Whenever changed, update MarshalJSON as well! +type CheckpointMetrics struct { + Data []util.Float `json:"data"` + Frequency int64 `json:"frequency"` + Start int64 `json:"start"` +} + +type CheckpointFile struct { + Metrics map[string]*CheckpointMetrics `json:"metrics"` + Children map[string]*CheckpointFile `json:"children"` + From int64 `json:"from"` + To int64 `json:"to"` +} + +var lastCheckpoint time.Time + +func Checkpointing(wg *sync.WaitGroup, ctx context.Context) { + lastCheckpoint = time.Now() + + if Keys.Checkpoints.FileFormat == "json" { + ms := GetMemoryStore() + + go func() { + defer wg.Done() + d, err := time.ParseDuration(Keys.Checkpoints.Interval) + if err != nil { + log.Fatal(err) + } + if d <= 0 { + return + } + + ticks := func() <-chan time.Time { + if d <= 0 { + return nil + } + return time.NewTicker(d).C + }() + for { + select { + case <-ctx.Done(): + return + case <-ticks: + log.Printf("start checkpointing (starting at %s)...\n", lastCheckpoint.Format(time.RFC3339)) + now := time.Now() + n, err := ms.ToCheckpoint(Keys.Checkpoints.RootDir, + lastCheckpoint.Unix(), now.Unix()) + if err != nil { + log.Printf("checkpointing failed: %s\n", err.Error()) + } else { + log.Printf("done: %d checkpoint files created\n", n) + lastCheckpoint = now + } + } + } + }() + } else { + go func() { + defer wg.Done() + d, _ := time.ParseDuration("1m") + + select { + case <-ctx.Done(): + return + case <-time.After(time.Duration(avro.CheckpointBufferMinutes) * time.Minute): + // This is the first tick untill we collect the data for given minutes. + avro.GetAvroStore().ToCheckpoint(Keys.Checkpoints.RootDir, false) + // log.Printf("Checkpointing %d avro files", count) + + } + + ticks := func() <-chan time.Time { + if d <= 0 { + return nil + } + return time.NewTicker(d).C + }() + + for { + select { + case <-ctx.Done(): + return + case <-ticks: + // Regular ticks of 1 minute to write data. + avro.GetAvroStore().ToCheckpoint(Keys.Checkpoints.RootDir, false) + // log.Printf("Checkpointing %d avro files", count) + } + } + }() + } +} + +// As `Float` implements a custom MarshalJSON() function, +// serializing an array of such types has more overhead +// than one would assume (because of extra allocations, interfaces and so on). +func (cm *CheckpointMetrics) MarshalJSON() ([]byte, error) { + buf := make([]byte, 0, 128+len(cm.Data)*8) + buf = append(buf, `{"frequency":`...) + buf = strconv.AppendInt(buf, cm.Frequency, 10) + buf = append(buf, `,"start":`...) + buf = strconv.AppendInt(buf, cm.Start, 10) + buf = append(buf, `,"data":[`...) 
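+	// JSON has no NaN literal, so missing values are encoded as null.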
+ for i, x := range cm.Data { + if i != 0 { + buf = append(buf, ',') + } + if x.IsNaN() { + buf = append(buf, `null`...) + } else { + buf = strconv.AppendFloat(buf, float64(x), 'f', 1, 32) + } + } + buf = append(buf, `]}`...) + return buf, nil +} + +// Metrics stored at the lowest 2 levels are not stored away (root and cluster)! +// On a per-host basis a new JSON file is created. I have no idea if this will scale. +// The good thing: Only a host at a time is locked, so this function can run +// in parallel to writes/reads. +func (m *MemoryStore) ToCheckpoint(dir string, from, to int64) (int, error) { + levels := make([]*Level, 0) + selectors := make([][]string, 0) + m.root.lock.RLock() + for sel1, l1 := range m.root.children { + l1.lock.RLock() + for sel2, l2 := range l1.children { + levels = append(levels, l2) + selectors = append(selectors, []string{sel1, sel2}) + } + l1.lock.RUnlock() + } + m.root.lock.RUnlock() + + type workItem struct { + level *Level + dir string + selector []string + } + + n, errs := int32(0), int32(0) + + var wg sync.WaitGroup + wg.Add(NumWorkers) + work := make(chan workItem, NumWorkers*2) + for worker := 0; worker < NumWorkers; worker++ { + go func() { + defer wg.Done() + + for workItem := range work { + if err := workItem.level.toCheckpoint(workItem.dir, from, to, m); err != nil { + if err == ErrNoNewData { + continue + } + + log.Printf("error while checkpointing %#v: %s", workItem.selector, err.Error()) + atomic.AddInt32(&errs, 1) + } else { + atomic.AddInt32(&n, 1) + } + } + }() + } + + for i := 0; i < len(levels); i++ { + dir := path.Join(dir, path.Join(selectors[i]...)) + work <- workItem{ + level: levels[i], + dir: dir, + selector: selectors[i], + } + } + + close(work) + wg.Wait() + + if errs > 0 { + return int(n), fmt.Errorf("%d errors happend while creating checkpoints (%d successes)", errs, n) + } + return int(n), nil +} + +func (l *Level) toCheckpointFile(from, to int64, m *MemoryStore) (*CheckpointFile, error) { + l.lock.RLock() + defer l.lock.RUnlock() + + retval := &CheckpointFile{ + From: from, + To: to, + Metrics: make(map[string]*CheckpointMetrics), + Children: make(map[string]*CheckpointFile), + } + + for metric, minfo := range m.Metrics { + b := l.metrics[minfo.Offset] + if b == nil { + continue + } + + allArchived := true + b.iterFromTo(from, to, func(b *buffer) error { + if !b.archived { + allArchived = false + } + return nil + }) + + if allArchived { + continue + } + + data := make([]util.Float, (to-from)/b.frequency+1) + data, start, end, err := b.read(from, to, data) + if err != nil { + return nil, err + } + + for i := int((end - start) / b.frequency); i < len(data); i++ { + data[i] = util.NaN + } + + retval.Metrics[metric] = &CheckpointMetrics{ + Frequency: b.frequency, + Start: start, + Data: data, + } + } + + for name, child := range l.children { + val, err := child.toCheckpointFile(from, to, m) + if err != nil { + return nil, err + } + + if val != nil { + retval.Children[name] = val + } + } + + if len(retval.Children) == 0 && len(retval.Metrics) == 0 { + return nil, nil + } + + return retval, nil +} + +func (l *Level) toCheckpoint(dir string, from, to int64, m *MemoryStore) error { + cf, err := l.toCheckpointFile(from, to, m) + if err != nil { + return err + } + + if cf == nil { + return ErrNoNewData + } + + filepath := path.Join(dir, fmt.Sprintf("%d.json", from)) + f, err := os.OpenFile(filepath, os.O_CREATE|os.O_WRONLY, 0o644) + if err != nil && os.IsNotExist(err) { + err = os.MkdirAll(dir, 0o755) + if err == nil { + f, err = 
os.OpenFile(filepath, os.O_CREATE|os.O_WRONLY, 0o644) + } + } + if err != nil { + return err + } + defer f.Close() + + bw := bufio.NewWriter(f) + if err = json.NewEncoder(bw).Encode(cf); err != nil { + return err + } + + return bw.Flush() +} + +func (m *MemoryStore) FromCheckpoint(dir string, from int64, extension string) (int, error) { + var wg sync.WaitGroup + work := make(chan [2]string, NumWorkers) + n, errs := int32(0), int32(0) + + wg.Add(NumWorkers) + for worker := 0; worker < NumWorkers; worker++ { + go func() { + defer wg.Done() + for host := range work { + lvl := m.root.findLevelOrCreate(host[:], len(m.Metrics)) + nn, err := lvl.fromCheckpoint(m, filepath.Join(dir, host[0], host[1]), from, extension) + if err != nil { + log.Fatalf("error while loading checkpoints: %s", err.Error()) + atomic.AddInt32(&errs, 1) + } + atomic.AddInt32(&n, int32(nn)) + } + }() + } + + i := 0 + clustersDir, err := os.ReadDir(dir) + for _, clusterDir := range clustersDir { + if !clusterDir.IsDir() { + err = errors.New("expected only directories at first level of checkpoints/ directory") + goto done + } + + hostsDir, e := os.ReadDir(filepath.Join(dir, clusterDir.Name())) + if e != nil { + err = e + goto done + } + + for _, hostDir := range hostsDir { + if !hostDir.IsDir() { + err = errors.New("expected only directories at second level of checkpoints/ directory") + goto done + } + + i++ + if i%NumWorkers == 0 && i > 100 { + // Forcing garbage collection runs here regulary during the loading of checkpoints + // will decrease the total heap size after loading everything back to memory is done. + // While loading data, the heap will grow fast, so the GC target size will double + // almost always. By forcing GCs here, we can keep it growing more slowly so that + // at the end, less memory is wasted. + runtime.GC() + } + + work <- [2]string{clusterDir.Name(), hostDir.Name()} + } + } +done: + close(work) + wg.Wait() + + if err != nil { + return int(n), err + } + + if errs > 0 { + return int(n), fmt.Errorf("%d errors happend while creating checkpoints (%d successes)", errs, n) + } + return int(n), nil +} + +// Metrics stored at the lowest 2 levels are not loaded (root and cluster)! +// This function can only be called once and before the very first write or read. +// Different host's data is loaded to memory in parallel. 
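+//
+// A typical startup restore could look like this (illustrative sketch only;
+// treating `Restore` as a duration string is an assumption):
+//
+//	ms := GetMemoryStore()
+//	d, _ := time.ParseDuration(Keys.Checkpoints.Restore)
+//	n, err := ms.FromCheckpointFiles(Keys.Checkpoints.RootDir, time.Now().Add(-d).Unix())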
+func (m *MemoryStore) FromCheckpointFiles(dir string, from int64) (int, error) { + if _, err := os.Stat(dir); os.IsNotExist(err) { + // The directory does not exist, so create it using os.MkdirAll() + err := os.MkdirAll(dir, 0755) // 0755 sets the permissions for the directory + if err != nil { + log.Fatalf("Error creating directory: %#v\n", err) + } + fmt.Printf("%#v Directory created successfully.\n", dir) + } + + // Config read (replace with your actual config read) + fileFormat := Keys.Checkpoints.FileFormat + if fileFormat == "" { + fileFormat = "avro" + } + + // Map to easily get the fallback format + oppositeFormat := map[string]string{ + "json": "avro", + "avro": "json", + } + + // First, attempt to load the specified format + if found, err := checkFilesWithExtension(dir, fileFormat); err != nil { + return 0, fmt.Errorf("error checking files with extension: %v", err) + } else if found { + log.Printf("Loading %s files because fileformat is %s\n", fileFormat, fileFormat) + return m.FromCheckpoint(dir, from, fileFormat) + } + + // If not found, attempt the opposite format + altFormat := oppositeFormat[fileFormat] + if found, err := checkFilesWithExtension(dir, altFormat); err != nil { + return 0, fmt.Errorf("error checking files with extension: %v", err) + } else if found { + log.Printf("Loading %s files but fileformat is %s\n", altFormat, fileFormat) + return m.FromCheckpoint(dir, from, altFormat) + } + + log.Println("No valid checkpoint files found in the directory.") + return 0, nil +} + +func checkFilesWithExtension(dir string, extension string) (bool, error) { + found := false + + err := filepath.Walk(dir, func(path string, info os.FileInfo, err error) error { + if err != nil { + return fmt.Errorf("error accessing path %s: %v", path, err) + } + if !info.IsDir() && filepath.Ext(info.Name()) == "."+extension { + found = true + return nil + } + return nil + }) + if err != nil { + return false, fmt.Errorf("error walking through directories: %s", err) + } + + return found, nil +} + +func (l *Level) loadAvroFile(m *MemoryStore, f *os.File, from int64) error { + br := bufio.NewReader(f) + + fileName := f.Name()[strings.LastIndex(f.Name(), "/")+1:] + resolution, err := strconv.ParseInt(fileName[0:strings.Index(fileName, "_")], 10, 64) + if err != nil { + return fmt.Errorf("error while reading avro file (resolution parsing) : %s", err) + } + + from_timestamp, err := strconv.ParseInt(fileName[strings.Index(fileName, "_")+1:len(fileName)-5], 10, 64) + + // Same logic according to lineprotocol + from_timestamp -= (resolution / 2) + + if err != nil { + return fmt.Errorf("error converting timestamp from the avro file : %s", err) + } + + // fmt.Printf("File : %s with resolution : %d\n", fileName, resolution) + + var recordCounter int64 = 0 + + // Create a new OCF reader from the buffered reader + ocfReader, err := goavro.NewOCFReader(br) + if err != nil { + panic(err) + } + + metricsData := make(map[string]util.FloatArray) + + for ocfReader.Scan() { + datum, err := ocfReader.Read() + if err != nil { + return fmt.Errorf("error while reading avro file : %s", err) + } + + record, ok := datum.(map[string]interface{}) + if !ok { + panic("failed to assert datum as map[string]interface{}") + } + + for key, value := range record { + metricsData[key] = append(metricsData[key], util.ConvertToFloat(value.(float64))) + } + + recordCounter += 1 + } + + to := (from_timestamp + (recordCounter / (60 / resolution) * 60)) + if to < from { + return nil + } + + for key, floatArray := range metricsData { + 
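+		// Keys were encoded by correctKey for Avro's field-name rules
+		// (':' -> "___", '.' -> "__"); ReplaceKey reverses that mapping.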
metricName := avro.ReplaceKey(key) + + if strings.Contains(metricName, avro.Delimiter) { + subString := strings.Split(metricName, avro.Delimiter) + + lvl := l + + for i := 0; i < len(subString)-1; i++ { + + sel := subString[i] + + if lvl.children == nil { + lvl.children = make(map[string]*Level) + } + + child, ok := lvl.children[sel] + if !ok { + child = &Level{ + metrics: make([]*buffer, len(m.Metrics)), + children: nil, + } + lvl.children[sel] = child + } + lvl = child + } + + leafMetricName := subString[len(subString)-1] + err = lvl.createBuffer(m, leafMetricName, floatArray, from_timestamp, resolution) + if err != nil { + return fmt.Errorf("error while creating buffers from avroReader : %s", err) + } + } else { + err = l.createBuffer(m, metricName, floatArray, from_timestamp, resolution) + if err != nil { + return fmt.Errorf("error while creating buffers from avroReader : %s", err) + } + } + + } + + return nil +} + +func (l *Level) createBuffer(m *MemoryStore, metricName string, floatArray util.FloatArray, from int64, resolution int64) error { + n := len(floatArray) + b := &buffer{ + frequency: resolution, + start: from, + data: floatArray[0:n:n], + prev: nil, + next: nil, + archived: true, + } + b.close() + + minfo, ok := m.Metrics[metricName] + if !ok { + return nil + // return errors.New("Unkown metric: " + name) + } + + prev := l.metrics[minfo.Offset] + if prev == nil { + l.metrics[minfo.Offset] = b + } else { + if prev.start > b.start { + return errors.New("wooops") + } + + b.prev = prev + prev.next = b + + missingCount := ((int(b.start) - int(prev.start)) - len(prev.data)*int(b.frequency)) + if missingCount > 0 { + missingCount /= int(b.frequency) + + for range missingCount { + prev.data = append(prev.data, util.NaN) + } + + prev.data = prev.data[0:len(prev.data):len(prev.data)] + } + } + l.metrics[minfo.Offset] = b + + return nil +} + +func (l *Level) loadJsonFile(m *MemoryStore, f *os.File, from int64) error { + br := bufio.NewReader(f) + cf := &CheckpointFile{} + if err := json.NewDecoder(br).Decode(cf); err != nil { + return err + } + + if cf.To != 0 && cf.To < from { + return nil + } + + if err := l.loadFile(cf, m); err != nil { + return err + } + + return nil +} + +func (l *Level) loadFile(cf *CheckpointFile, m *MemoryStore) error { + for name, metric := range cf.Metrics { + n := len(metric.Data) + b := &buffer{ + frequency: metric.Frequency, + start: metric.Start, + data: metric.Data[0:n:n], // Space is wasted here :( + prev: nil, + next: nil, + archived: true, + } + b.close() + + minfo, ok := m.Metrics[name] + if !ok { + continue + // return errors.New("Unkown metric: " + name) + } + + prev := l.metrics[minfo.Offset] + if prev == nil { + l.metrics[minfo.Offset] = b + } else { + if prev.start > b.start { + return errors.New("wooops") + } + + b.prev = prev + prev.next = b + } + l.metrics[minfo.Offset] = b + } + + if len(cf.Children) > 0 && l.children == nil { + l.children = make(map[string]*Level) + } + + for sel, childCf := range cf.Children { + child, ok := l.children[sel] + if !ok { + child = &Level{ + metrics: make([]*buffer, len(m.Metrics)), + children: nil, + } + l.children[sel] = child + } + + if err := child.loadFile(childCf, m); err != nil { + return err + } + } + + return nil +} + +func (l *Level) fromCheckpoint(m *MemoryStore, dir string, from int64, extension string) (int, error) { + direntries, err := os.ReadDir(dir) + if err != nil { + if os.IsNotExist(err) { + return 0, nil + } + + return 0, err + } + + allFiles := make([]fs.DirEntry, 0) + filesLoaded := 0 + 
for _, e := range direntries { + if e.IsDir() { + child := &Level{ + metrics: make([]*buffer, len(m.Metrics)), + children: make(map[string]*Level), + } + + files, err := child.fromCheckpoint(m, path.Join(dir, e.Name()), from, extension) + filesLoaded += files + if err != nil { + return filesLoaded, err + } + + l.children[e.Name()] = child + } else if strings.HasSuffix(e.Name(), "."+extension) { + allFiles = append(allFiles, e) + } else { + continue + } + } + + files, err := findFiles(allFiles, from, extension, true) + if err != nil { + return filesLoaded, err + } + + loaders := map[string]func(*MemoryStore, *os.File, int64) error{ + "json": l.loadJsonFile, + "avro": l.loadAvroFile, + } + + loader := loaders[extension] + + for _, filename := range files { + f, err := os.Open(path.Join(dir, filename)) + if err != nil { + return filesLoaded, err + } + defer f.Close() + + if err = loader(m, f, from); err != nil { + return filesLoaded, err + } + + filesLoaded += 1 + } + + return filesLoaded, nil +} + +// This will probably get very slow over time! +// A solution could be some sort of an index file in which all other files +// and the timespan they contain is listed. +func findFiles(direntries []fs.DirEntry, t int64, extension string, findMoreRecentFiles bool) ([]string, error) { + nums := map[string]int64{} + for _, e := range direntries { + if !strings.HasSuffix(e.Name(), "."+extension) { + continue + } + + ts, err := strconv.ParseInt(e.Name()[strings.Index(e.Name(), "_")+1:len(e.Name())-5], 10, 64) + if err != nil { + return nil, err + } + nums[e.Name()] = ts + } + + sort.Slice(direntries, func(i, j int) bool { + a, b := direntries[i], direntries[j] + return nums[a.Name()] < nums[b.Name()] + }) + + filenames := make([]string, 0) + for i := 0; i < len(direntries); i++ { + e := direntries[i] + ts1 := nums[e.Name()] + + if findMoreRecentFiles && t <= ts1 { + filenames = append(filenames, e.Name()) + } + if i == len(direntries)-1 { + continue + } + + enext := direntries[i+1] + ts2 := nums[enext.Name()] + + if findMoreRecentFiles { + if ts1 < t && t < ts2 { + filenames = append(filenames, e.Name()) + } + } else { + if ts2 < t { + filenames = append(filenames, e.Name()) + } + } + } + + return filenames, nil +} diff --git a/internal/memorystore/config.go b/internal/memorystore/config.go new file mode 100644 index 0000000..0d8a8ab --- /dev/null +++ b/internal/memorystore/config.go @@ -0,0 +1,26 @@ +// Copyright (C) NHR@FAU, University Erlangen-Nuremberg. +// All rights reserved. This file is part of cc-backend. +// Use of this source code is governed by a MIT-style +// license that can be found in the LICENSE file. 
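+//
+// The matching JSON configuration could look like this (values are
+// illustrative, not defaults):
+//
+//	{
+//	  "checkpoints": {
+//	    "file-format": "avro",
+//	    "interval": "1h",
+//	    "directory": "./var/checkpoints",
+//	    "restore": "48h"
+//	  },
+//	  "archive": {
+//	    "interval": "24h",
+//	    "directory": "./var/archive",
+//	    "delete-instead": false
+//	  },
+//	  "retention-in-memory": "48h"
+//	}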
+package memorystore + +type MetricStoreConfig struct { + Checkpoints struct { + FileFormat string `json:"file-format"` + Interval string `json:"interval"` + RootDir string `json:"directory"` + Restore string `json:"restore"` + } `json:"checkpoints"` + Debug struct { + DumpToFile string `json:"dump-to-file"` + EnableGops bool `json:"gops"` + } `json:"debug"` + RetentionInMemory string `json:"retention-in-memory"` + Archive struct { + Interval string `json:"interval"` + RootDir string `json:"directory"` + DeleteInstead bool `json:"delete-instead"` + } `json:"archive"` +} + +var Keys MetricStoreConfig diff --git a/internal/memorystore/debug.go b/internal/memorystore/debug.go new file mode 100644 index 0000000..2743a45 --- /dev/null +++ b/internal/memorystore/debug.go @@ -0,0 +1,107 @@ +package memorystore + +import ( + "bufio" + "fmt" + "strconv" +) + +func (b *buffer) debugDump(buf []byte) []byte { + if b.prev != nil { + buf = b.prev.debugDump(buf) + } + + start, len, end := b.start, len(b.data), b.start+b.frequency*int64(len(b.data)) + buf = append(buf, `{"start":`...) + buf = strconv.AppendInt(buf, start, 10) + buf = append(buf, `,"len":`...) + buf = strconv.AppendInt(buf, int64(len), 10) + buf = append(buf, `,"end":`...) + buf = strconv.AppendInt(buf, end, 10) + if b.archived { + buf = append(buf, `,"saved":true`...) + } + if b.next != nil { + buf = append(buf, `},`...) + } else { + buf = append(buf, `}`...) + } + return buf +} + +func (l *Level) debugDump(m *MemoryStore, w *bufio.Writer, lvlname string, buf []byte, depth int) ([]byte, error) { + l.lock.RLock() + defer l.lock.RUnlock() + for i := 0; i < depth; i++ { + buf = append(buf, '\t') + } + buf = append(buf, '"') + buf = append(buf, lvlname...) + buf = append(buf, "\":{\n"...) + depth += 1 + objitems := 0 + for name, mc := range m.Metrics { + if b := l.metrics[mc.Offset]; b != nil { + for i := 0; i < depth; i++ { + buf = append(buf, '\t') + } + + buf = append(buf, '"') + buf = append(buf, name...) + buf = append(buf, `":[`...) + buf = b.debugDump(buf) + buf = append(buf, "],\n"...) + objitems++ + } + } + + for name, lvl := range l.children { + _, err := w.Write(buf) + if err != nil { + return nil, err + } + + buf = buf[0:0] + buf, err = lvl.debugDump(m, w, name, buf, depth) + if err != nil { + return nil, err + } + + buf = append(buf, ',', '\n') + objitems++ + } + + // remove final `,`: + if objitems > 0 { + buf = append(buf[0:len(buf)-1], '\n') + } + + depth -= 1 + for i := 0; i < depth; i++ { + buf = append(buf, '\t') + } + buf = append(buf, '}') + return buf, nil +} + +func (m *MemoryStore) DebugDump(w *bufio.Writer, selector []string) error { + lvl := m.root.findLevel(selector) + if lvl == nil { + return fmt.Errorf("not found: %#v", selector) + } + + buf := make([]byte, 0, 2048) + buf = append(buf, "{"...) + + buf, err := lvl.debugDump(m, w, "data", buf, 0) + if err != nil { + return err + } + + buf = append(buf, "}\n"...) + if _, err = w.Write(buf); err != nil { + return err + } + + return w.Flush() +} diff --git a/internal/memorystore/healthcheck.go b/internal/memorystore/healthcheck.go new file mode 100644 index 0000000..cb22d49 --- /dev/null +++ b/internal/memorystore/healthcheck.go @@ -0,0 +1,88 @@ +package memorystore + +import ( + "bufio" + "fmt" + "time" +) + +// This is a threshold that allows a node to be healthy with certain number of data points missing. +// Suppose a node does not receive last 5 data points, then healthCheck endpoint will still say a +// node is healthy. 
Anything more than MaxMissingDataPoints missing points counts that metric as unhealthy.
+const MaxMissingDataPoints int64 = 5
+
+// This is a threshold which allows up to a certain number of metrics in a node to be unhealthy.
+// Works together with MaxMissingDataPoints: if fewer than MaxUnhealthyMetrics metrics
+// (including submetrics) are missing their last MaxMissingDataPoints data points, the node is
+// still reported healthy; at or above that count it is reported unhealthy.
+const MaxUnhealthyMetrics int64 = 5
+
+func (b *buffer) healthCheck() int64 {
+
+	// Check if the buffer is empty
+	if b.data == nil {
+		return 1
+	}
+
+	buffer_end := b.start + b.frequency*int64(len(b.data))
+	t := time.Now().Unix()
+
+	// Check if the buffer is too old
+	if t-buffer_end > MaxMissingDataPoints*b.frequency {
+		return 1
+	}
+
+	return 0
+}
+
+func (l *Level) healthCheck(m *MemoryStore, count int64) (int64, error) {
+	l.lock.RLock()
+	defer l.lock.RUnlock()
+
+	for _, mc := range m.Metrics {
+		if b := l.metrics[mc.Offset]; b != nil {
+			count += b.healthCheck()
+		}
+	}
+
+	for _, lvl := range l.children {
+		c, err := lvl.healthCheck(m, 0)
+		if err != nil {
+			return 0, err
+		}
+		count += c
+	}
+
+	return count, nil
+}
+
+func (m *MemoryStore) HealthCheck(w *bufio.Writer, selector []string) error {
+	lvl := m.root.findLevel(selector)
+	if lvl == nil {
+		return fmt.Errorf("not found: %#v", selector)
+	}
+
+	buf := make([]byte, 0, 25)
+	// buf = append(buf, "{"...)
+
+	var count int64 = 0
+
+	unhealthyMetricsCount, err := lvl.healthCheck(m, count)
+	if err != nil {
+		return err
+	}
+
+	if unhealthyMetricsCount < MaxUnhealthyMetrics {
+		buf = append(buf, "Healthy"...)
+	} else {
+		buf = append(buf, "Unhealthy"...)
+	}
+
+	// buf = append(buf, "}\n"...)
+
+	if _, err = w.Write(buf); err != nil {
+		return err
+	}
+
+	return w.Flush()
+}
diff --git a/internal/memorystore/level.go b/internal/memorystore/level.go
new file mode 100644
index 0000000..76916e6
--- /dev/null
+++ b/internal/memorystore/level.go
@@ -0,0 +1,187 @@
+package memorystore
+
+import (
+	"sync"
+	"unsafe"
+
+	"github.com/ClusterCockpit/cc-lib/util"
+)
+
+// Could also be called "node" as this forms a node in a tree structure.
+// Called Level because "node" might be confusing here.
+// Can be both a leaf or an inner node. In this tree structure, inner nodes can
+// also hold data (in `metrics`).
+type Level struct {
+	children map[string]*Level
+	metrics  []*buffer
+	lock     sync.RWMutex
+}
+
+// Find the correct level for the given selector, creating it if
+// it does not exist. Example selector in the context of the
+// ClusterCockpit could be: []string{ "emmy", "host123", "cpu0" }.
+// This function would probably benefit a lot from `level.children` being a `sync.Map`.
+func (l *Level) findLevelOrCreate(selector []string, nMetrics int) *Level {
+	if len(selector) == 0 {
+		return l
+	}
+
+	// Allow concurrent reads:
+	l.lock.RLock()
+	var child *Level
+	var ok bool
+	if l.children == nil {
+		// Children map needs to be created...
+		l.lock.RUnlock()
+	} else {
+		child, ok := l.children[selector[0]]
+		l.lock.RUnlock()
+		if ok {
+			return child.findLevelOrCreate(selector[1:], nMetrics)
+		}
+	}
+
+	// The level does not exist, take write lock for unique access:
+	l.lock.Lock()
+	// While this thread waited for the write lock, another thread
+	// could have created the child node.
+ if l.children != nil { + child, ok = l.children[selector[0]] + if ok { + l.lock.Unlock() + return child.findLevelOrCreate(selector[1:], nMetrics) + } + } + + child = &Level{ + metrics: make([]*buffer, nMetrics), + children: nil, + } + + if l.children != nil { + l.children[selector[0]] = child + } else { + l.children = map[string]*Level{selector[0]: child} + } + l.lock.Unlock() + return child.findLevelOrCreate(selector[1:], nMetrics) +} + +func (l *Level) free(t int64) (int, error) { + l.lock.Lock() + defer l.lock.Unlock() + + n := 0 + for i, b := range l.metrics { + if b != nil { + delme, m := b.free(t) + n += m + if delme { + if cap(b.data) == BUFFER_CAP { + bufferPool.Put(b) + } + l.metrics[i] = nil + } + } + } + + for _, l := range l.children { + m, err := l.free(t) + n += m + if err != nil { + return n, err + } + } + + return n, nil +} + +func (l *Level) sizeInBytes() int64 { + l.lock.RLock() + defer l.lock.RUnlock() + size := int64(0) + + for _, b := range l.metrics { + if b != nil { + size += b.count() * int64(unsafe.Sizeof(util.Float(0))) + } + } + + for _, child := range l.children { + size += child.sizeInBytes() + } + + return size +} + +func (l *Level) findLevel(selector []string) *Level { + if len(selector) == 0 { + return l + } + + l.lock.RLock() + defer l.lock.RUnlock() + + lvl := l.children[selector[0]] + if lvl == nil { + return nil + } + + return lvl.findLevel(selector[1:]) +} + +func (l *Level) findBuffers(selector util.Selector, offset int, f func(b *buffer) error) error { + l.lock.RLock() + defer l.lock.RUnlock() + + if len(selector) == 0 { + b := l.metrics[offset] + if b != nil { + return f(b) + } + + for _, lvl := range l.children { + err := lvl.findBuffers(nil, offset, f) + if err != nil { + return err + } + } + return nil + } + + sel := selector[0] + if len(sel.String) != 0 && l.children != nil { + lvl, ok := l.children[sel.String] + if ok { + err := lvl.findBuffers(selector[1:], offset, f) + if err != nil { + return err + } + } + return nil + } + + if sel.Group != nil && l.children != nil { + for _, key := range sel.Group { + lvl, ok := l.children[key] + if ok { + err := lvl.findBuffers(selector[1:], offset, f) + if err != nil { + return err + } + } + } + return nil + } + + if sel.Any && l.children != nil { + for _, lvl := range l.children { + if err := lvl.findBuffers(selector[1:], offset, f); err != nil { + return err + } + } + return nil + } + + return nil +} diff --git a/internal/memorystore/memorystore.go b/internal/memorystore/memorystore.go new file mode 100644 index 0000000..7659a89 --- /dev/null +++ b/internal/memorystore/memorystore.go @@ -0,0 +1,372 @@ +package memorystore + +import ( + "context" + "errors" + "log" + "runtime" + "sync" + "time" + + "github.com/ClusterCockpit/cc-backend/pkg/avro" + "github.com/ClusterCockpit/cc-lib/resampler" + "github.com/ClusterCockpit/cc-lib/util" + "github.com/ClusterCockpit/cc-metric-store/internal/config" +) + +var ( + singleton sync.Once + msInstance *MemoryStore +) + +var NumWorkers int = 4 + +func init() { + maxWorkers := 10 + NumWorkers = runtime.NumCPU()/2 + 1 + if NumWorkers > maxWorkers { + NumWorkers = maxWorkers + } +} + +type Metric struct { + Name string + Value util.Float + MetricConfig config.MetricConfig +} + +type MemoryStore struct { + Metrics map[string]config.MetricConfig + root Level +} + +// Create a new, initialized instance of a MemoryStore. +// Will panic if values in the metric configurations are invalid. 
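+//
+// Illustrative use (the metric name and values are made up):
+//
+//	memorystore.Init(map[string]config.MetricConfig{
+//		"flops_any": {Frequency: 60, Aggregation: config.AvgAggregation},
+//	})
+//	ms := memorystore.GetMemoryStore()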
+func Init(metrics map[string]config.MetricConfig) {
+	singleton.Do(func() {
+		offset := 0
+		for key, cfg := range metrics {
+			if cfg.Frequency == 0 {
+				panic("invalid frequency")
+			}
+
+			metrics[key] = config.MetricConfig{
+				Frequency:   cfg.Frequency,
+				Aggregation: cfg.Aggregation,
+				Offset:      offset,
+			}
+			offset += 1
+		}
+
+		msInstance = &MemoryStore{
+			root: Level{
+				metrics:  make([]*buffer, len(metrics)),
+				children: make(map[string]*Level),
+			},
+			Metrics: metrics,
+		}
+	})
+}
+
+func GetMemoryStore() *MemoryStore {
+	if msInstance == nil {
+		log.Fatalf("MemoryStore not initialized!")
+	}
+
+	return msInstance
+}
+
+func Shutdown() {
+	log.Printf("Writing to '%s'...\n", config.Keys.Checkpoints.RootDir)
+	var files int
+	var err error
+
+	ms := GetMemoryStore()
+
+	if config.Keys.Checkpoints.FileFormat == "json" {
+		files, err = ms.ToCheckpoint(config.Keys.Checkpoints.RootDir, lastCheckpoint.Unix(), time.Now().Unix())
+	} else {
+		files, err = avro.GetAvroStore().ToCheckpoint(config.Keys.Checkpoints.RootDir, true)
+		close(avro.LineProtocolMessages)
+	}
+
+	if err != nil {
+		log.Printf("Writing checkpoint failed: %s\n", err.Error())
+	}
+	log.Printf("Done! (%d files written)\n", files)
+
+	// ms.PrintHierarchy()
+}
+
+// func (m *MemoryStore) PrintHierarchy() {
+// 	m.root.lock.Lock()
+// 	defer m.root.lock.Unlock()
+
+// 	fmt.Printf("Root : \n")
+
+// 	for lvl1, sel1 := range m.root.children {
+// 		fmt.Printf("\t%s\n", lvl1)
+// 		for lvl2, sel2 := range sel1.children {
+// 			fmt.Printf("\t\t%s\n", lvl2)
+// 			if lvl1 == "fritz" && lvl2 == "f0201" {
+
+// 				for name, met := range m.Metrics {
+// 					mt := sel2.metrics[met.Offset]
+
+// 					fmt.Printf("\t\t\t\t%s\n", name)
+// 					fmt.Printf("\t\t\t\t")
+
+// 					for mt != nil {
+// 						// if name == "cpu_load" {
+// 						fmt.Printf("%d(%d) -> %#v", mt.start, len(mt.data), mt.data)
+// 						// }
+// 						mt = mt.prev
+// 					}
+// 					fmt.Printf("\n")
+
+// 				}
+// 			}
+// 			for lvl3, sel3 := range sel2.children {
+// 				if lvl1 == "fritz" && lvl2 == "f0201" && lvl3 == "hwthread70" {
+
+// 					fmt.Printf("\t\t\t\t\t%s\n", lvl3)
+
+// 					for name, met := range m.Metrics {
+// 						mt := sel3.metrics[met.Offset]
+
+// 						fmt.Printf("\t\t\t\t\t\t%s\n", name)
+
+// 						fmt.Printf("\t\t\t\t\t\t")
+
+// 						for mt != nil {
+// 							// if name == "clock" {
+// 							fmt.Printf("%d(%d) -> %#v", mt.start, len(mt.data), mt.data)
+
+// 							mt = mt.prev
+// 						}
+// 						fmt.Printf("\n")
+
+// 					}
+
+// 					// for i, _ := range sel3.metrics {
+// 					// 	fmt.Printf("\t\t\t\t\t%s\n", getName(configmetrics, i))
+// 					// }
+// 				}
+// 			}
+// 		}
+// 	}
+
+// }
+
+func getName(m *MemoryStore, i int) string {
+	for key, val := range m.Metrics {
+		if val.Offset == i {
+			return key
+		}
+	}
+	return ""
+}
+
+func Retention(wg *sync.WaitGroup, ctx context.Context) {
+	ms := GetMemoryStore()
+
+	go func() {
+		defer wg.Done()
+		d, err := time.ParseDuration(config.Keys.RetentionInMemory)
+		if err != nil {
+			log.Fatal(err)
+		}
+		if d <= 0 {
+			return
+		}
+
+		ticks := func() <-chan time.Time {
+			d := d / 2
+			if d <= 0 {
+				return nil
+			}
+			return time.NewTicker(d).C
+		}()
+		for {
+			select {
+			case <-ctx.Done():
+				return
+			case <-ticks:
+				t := time.Now().Add(-d)
+				log.Printf("start freeing buffers (older than %s)...\n", t.Format(time.RFC3339))
+				freed, err := ms.Free(nil, t.Unix())
+				if err != nil {
+					log.Printf("freeing up buffers failed: %s\n", err.Error())
+				} else {
+					log.Printf("done: %d buffers freed\n", freed)
+				}
+			}
+		}
+	}()
+}
+
+// Write all values in `metrics` to the level specified by `selector` for time `ts`.
+// Look at `findLevelOrCreate` for how selectors work.
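+//
+// A usage sketch (cluster and host names are placeholders; the MetricConfig
+// is looked up by metric name when left at its zero value):
+//
+//	ms := GetMemoryStore()
+//	err := ms.Write([]string{"emmy", "host123"}, time.Now().Unix(),
+//		[]Metric{{Name: "flops_any", Value: util.Float(42.0)}})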
+func (m *MemoryStore) Write(selector []string, ts int64, metrics []Metric) error {
+	var ok bool
+	for i, metric := range metrics {
+		if metric.MetricConfig.Frequency == 0 {
+			metric.MetricConfig, ok = m.Metrics[metric.Name]
+			if !ok {
+				metric.MetricConfig.Frequency = 0
+			}
+			metrics[i] = metric
+		}
+	}
+
+	return m.WriteToLevel(&m.root, selector, ts, metrics)
+}
+
+func (m *MemoryStore) GetLevel(selector []string) *Level {
+	return m.root.findLevelOrCreate(selector, len(m.Metrics))
+}
+
+// Assumes that `minfo` in `metrics` is filled in!
+func (m *MemoryStore) WriteToLevel(l *Level, selector []string, ts int64, metrics []Metric) error {
+	l = l.findLevelOrCreate(selector, len(m.Metrics))
+	l.lock.Lock()
+	defer l.lock.Unlock()
+
+	for _, metric := range metrics {
+		if metric.MetricConfig.Frequency == 0 {
+			continue
+		}
+
+		b := l.metrics[metric.MetricConfig.Offset]
+		if b == nil {
+			// First write to this metric and level
+			b = newBuffer(ts, metric.MetricConfig.Frequency)
+			l.metrics[metric.MetricConfig.Offset] = b
+		}
+
+		nb, err := b.write(ts, metric.Value)
+		if err != nil {
+			return err
+		}
+
+		// Last write created a new buffer...
+		if b != nb {
+			l.metrics[metric.MetricConfig.Offset] = nb
+		}
+	}
+	return nil
+}
+
+// Returns all values for metric `metric` from `from` to `to` for the selected level(s).
+// If the level does not hold the metric itself, the data will be aggregated recursively from the children.
+// The second and third return value are the actual from/to for the data. Those can be different from
+// the range asked for if no data was available.
+func (m *MemoryStore) Read(selector util.Selector, metric string, from, to, resolution int64) ([]util.Float, int64, int64, int64, error) {
+	if from > to {
+		return nil, 0, 0, 0, errors.New("invalid time range")
+	}
+
+	minfo, ok := m.Metrics[metric]
+	if !ok {
+		return nil, 0, 0, 0, errors.New("unknown metric: " + metric)
+	}
+
+	n, data := 0, make([]util.Float, (to-from)/minfo.Frequency+1)
+
+	err := m.root.findBuffers(selector, minfo.Offset, func(b *buffer) error {
+		cdata, cfrom, cto, err := b.read(from, to, data)
+		if err != nil {
+			return err
+		}
+
+		if n == 0 {
+			from, to = cfrom, cto
+		} else if from != cfrom || to != cto || len(data) != len(cdata) {
+			missingfront, missingback := int((from-cfrom)/minfo.Frequency), int((to-cto)/minfo.Frequency)
+			if missingfront != 0 {
+				return ErrDataDoesNotAlign
+			}
+
+			newlen := len(cdata) - missingback
+			if newlen < 1 {
+				return ErrDataDoesNotAlign
+			}
+			cdata = cdata[0:newlen]
+			if len(cdata) != len(data) {
+				return ErrDataDoesNotAlign
+			}
+
+			from, to = cfrom, cto
+		}
+
+		data = cdata
+		n += 1
+		return nil
+	})
+
+	if err != nil {
+		return nil, 0, 0, 0, err
+	} else if n == 0 {
+		return nil, 0, 0, 0, errors.New("metric or host not found")
+	} else if n > 1 {
+		if minfo.Aggregation == config.AvgAggregation {
+			normalize := 1. / util.Float(n)
+			for i := 0; i < len(data); i++ {
+				data[i] *= normalize
+			}
+		} else if minfo.Aggregation != config.SumAggregation {
+			return nil, 0, 0, 0, errors.New("invalid aggregation")
+		}
+	}
+
+	data, resolution, err = resampler.LargestTriangleThreeBucket(data, minfo.Frequency, resolution)
+	if err != nil {
+		return nil, 0, 0, 0, err
+	}
+
+	return data, from, to, resolution, nil
+}
+
+// Release all buffers for the selected level and all its children that contain only
+// values older than `t`.
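+//
+// A usage sketch (placeholder selector): free everything below cluster
+// "emmy" that only holds values older than one hour:
+//
+//	n, err := ms.Free([]string{"emmy"}, time.Now().Add(-time.Hour).Unix())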
+func (m *MemoryStore) Free(selector []string, t int64) (int, error) { + return m.GetLevel(selector).free(t) +} + +func (m *MemoryStore) FreeAll() error { + for k := range m.root.children { + delete(m.root.children, k) + } + + return nil +} + +func (m *MemoryStore) SizeInBytes() int64 { + return m.root.sizeInBytes() +} + +// Given a selector, return a list of all children of the level selected. +func (m *MemoryStore) ListChildren(selector []string) []string { + lvl := &m.root + for lvl != nil && len(selector) != 0 { + lvl.lock.RLock() + next := lvl.children[selector[0]] + lvl.lock.RUnlock() + lvl = next + selector = selector[1:] + } + + if lvl == nil { + return nil + } + + lvl.lock.RLock() + defer lvl.lock.RUnlock() + + children := make([]string, 0, len(lvl.children)) + for child := range lvl.children { + children = append(children, child) + } + + return children +} diff --git a/internal/memorystore/stats.go b/internal/memorystore/stats.go new file mode 100644 index 0000000..6682d62 --- /dev/null +++ b/internal/memorystore/stats.go @@ -0,0 +1,120 @@ +package memorystore + +import ( + "errors" + "math" + + "github.com/ClusterCockpit/cc-lib/util" + "github.com/ClusterCockpit/cc-metric-store/internal/config" +) + +type Stats struct { + Samples int + Avg util.Float + Min util.Float + Max util.Float +} + +func (b *buffer) stats(from, to int64) (Stats, int64, int64, error) { + if from < b.start { + if b.prev != nil { + return b.prev.stats(from, to) + } + from = b.start + } + + // TODO: Check if b.closed and if so and the full buffer is queried, + // use b.statistics instead of iterating over the buffer. + + samples := 0 + sum, min, max := 0.0, math.MaxFloat32, -math.MaxFloat32 + + var t int64 + for t = from; t < to; t += b.frequency { + idx := int((t - b.start) / b.frequency) + if idx >= cap(b.data) { + b = b.next + if b == nil { + break + } + idx = 0 + } + + if t < b.start || idx >= len(b.data) { + continue + } + + xf := float64(b.data[idx]) + if math.IsNaN(xf) { + continue + } + + samples += 1 + sum += xf + min = math.Min(min, xf) + max = math.Max(max, xf) + } + + return Stats{ + Samples: samples, + Avg: util.Float(sum) / util.Float(samples), + Min: util.Float(min), + Max: util.Float(max), + }, from, t, nil +} + +// Returns statistics for the requested metric on the selected node/level. +// Data is aggregated to the selected level the same way as in `MemoryStore.Read`. +// If `Stats.Samples` is zero, the statistics should not be considered as valid. 
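+//
+// A usage sketch (selector and metric name are placeholders):
+//
+//	sel := util.Selector{{String: "emmy"}, {String: "host123"}}
+//	stats, from, to, err := ms.Stats(sel, "flops_any", from, to)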
+func (m *MemoryStore) Stats(selector util.Selector, metric string, from, to int64) (*Stats, int64, int64, error) {
+	if from > to {
+		return nil, 0, 0, errors.New("invalid time range")
+	}
+
+	minfo, ok := m.Metrics[metric]
+	if !ok {
+		return nil, 0, 0, errors.New("unknown metric: " + metric)
+	}
+
+	n, samples := 0, 0
+	avg, min, max := util.Float(0), math.MaxFloat32, -math.MaxFloat32
+	err := m.root.findBuffers(selector, minfo.Offset, func(b *buffer) error {
+		stats, cfrom, cto, err := b.stats(from, to)
+		if err != nil {
+			return err
+		}
+
+		if n == 0 {
+			from, to = cfrom, cto
+		} else if from != cfrom || to != cto {
+			return ErrDataDoesNotAlign
+		}
+
+		samples += stats.Samples
+		avg += stats.Avg
+		min = math.Min(min, float64(stats.Min))
+		max = math.Max(max, float64(stats.Max))
+		n += 1
+		return nil
+	})
+	if err != nil {
+		return nil, 0, 0, err
+	}
+
+	if n == 0 {
+		return nil, 0, 0, ErrNoData
+	}
+
+	if minfo.Aggregation == config.AvgAggregation {
+		avg /= util.Float(n)
+	} else if n > 1 && minfo.Aggregation != config.SumAggregation {
+		return nil, 0, 0, errors.New("invalid aggregation")
+	}
+
+	return &Stats{
+		Samples: samples,
+		Avg:     avg,
+		Min:     util.Float(min),
+		Max:     util.Float(max),
+	}, from, to, nil
+}

From bca176170c114b1492a0ea4c900ac3640fd10b39 Mon Sep 17 00:00:00 2001
From: Aditya Ujeniya
Date: Wed, 3 Sep 2025 08:22:15 +0200
Subject: [PATCH 02/11] Migration SQL fix

---
 cmd/cc-backend/main.go                        |  18 ++-
 cmd/cc-backend/server.go                      |   4 +
 go.mod                                        |   2 +
 go.sum                                        |   6 +
 internal/config/config.go                     |  10 ++
 internal/importer/initDB.go                   |   4 +
 internal/memorystore/checkpoint.go            |   2 +-
 internal/memorystore/memorystore.go           | 115 +++++++++++++++---
 internal/memorystore/stats.go                 |   5 +-
 internal/repository/jobCreate.go              |  16 +--
 .../sqlite3/09_add-job-cache.up.sql           |  19 ++-
 internal/taskManager/taskManager.go           |   7 +-
 var/._job-archive                             | Bin 0 -> 163 bytes
 13 files changed, 172 insertions(+), 36 deletions(-)
 create mode 100755 var/._job-archive

diff --git a/cmd/cc-backend/main.go b/cmd/cc-backend/main.go
index 56018c3..0790a0b 100644
--- a/cmd/cc-backend/main.go
+++ b/cmd/cc-backend/main.go
@@ -18,6 +18,7 @@ import (
 	"github.com/ClusterCockpit/cc-backend/internal/auth"
 	"github.com/ClusterCockpit/cc-backend/internal/config"
 	"github.com/ClusterCockpit/cc-backend/internal/importer"
+	"github.com/ClusterCockpit/cc-backend/internal/memorystore"
 	"github.com/ClusterCockpit/cc-backend/internal/metricdata"
 	"github.com/ClusterCockpit/cc-backend/internal/repository"
 	"github.com/ClusterCockpit/cc-backend/internal/tagger"
@@ -96,6 +97,12 @@ func main() {
 		} else {
 			cclog.Abort("Cluster configuration must be present")
 		}
+
+		if mscfg := ccconf.GetPackageConfig("metric-store"); mscfg != nil {
+			config.InitMetricStore(mscfg)
+		} else {
+			cclog.Abort("Metric Store configuration must be present")
+		}
 	} else {
 		cclog.Abort("Main configuration must be present")
 	}
@@ -201,7 +208,7 @@ func main() {
 	if archiveCfg := ccconf.GetPackageConfig("archive"); archiveCfg != nil {
 		err = archive.Init(archiveCfg, config.Keys.DisableArchive)
 	} else {
-		err = archive.Init(json.RawMessage(`{\"kind\":\"file\",\"path\":\"./var/job-archive\"}`), config.Keys.DisableArchive)
+		err = archive.Init(json.RawMessage("{\"kind\":\"file\",\"path\":\"./var/job-archive\"}"), config.Keys.DisableArchive)
 	}
 	if err != nil {
 		cclog.Abortf("Init: Failed to initialize archive.\nError: %s\n", err.Error())
@@ -241,10 +248,15 @@ func main() {
 		cclog.Exit("No errors, server flag not set. Exiting cc-backend.")
 	}
 
+	//Metric Store starts after all flags have been processed
+	memorystore.Init()
+
 	archiver.Start(repository.GetJobRepository())
 
-	taskManager.Start(ccconf.GetPackageConfig("cron"),
-		ccconf.GetPackageConfig("archive"))
+	// // Comment out
+	// taskManager.Start(ccconf.GetPackageConfig("cron"),
+	// 	ccconf.GetPackageConfig("archive"))
+
 	serverInit()
 
 	var wg sync.WaitGroup
diff --git a/cmd/cc-backend/server.go b/cmd/cc-backend/server.go
index 3983268..537270d 100644
--- a/cmd/cc-backend/server.go
+++ b/cmd/cc-backend/server.go
@@ -26,6 +26,7 @@ import (
 	"github.com/ClusterCockpit/cc-backend/internal/config"
 	"github.com/ClusterCockpit/cc-backend/internal/graph"
 	"github.com/ClusterCockpit/cc-backend/internal/graph/generated"
+	"github.com/ClusterCockpit/cc-backend/internal/memorystore"
 	"github.com/ClusterCockpit/cc-backend/internal/routerConfig"
 	"github.com/ClusterCockpit/cc-backend/web"
 	cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
@@ -325,6 +326,9 @@ func serverShutdown() {
 	// First shut down the server gracefully (waiting for all ongoing requests)
 	server.Shutdown(context.Background())
 
+	//Archive all the metric store data
+	memorystore.Shutdown()
+
 	// Then, wait for any async archivings still pending...
 	archiver.WaitForArchiving()
 }
diff --git a/go.mod b/go.mod
index 554ea56..5858cff 100644
--- a/go.mod
+++ b/go.mod
@@ -51,6 +51,7 @@ require (
 	github.com/go-openapi/spec v0.21.0 // indirect
 	github.com/go-openapi/swag v0.23.1 // indirect
 	github.com/go-viper/mapstructure/v2 v2.4.0 // indirect
+	github.com/golang/snappy v0.0.4 // indirect
 	github.com/google/uuid v1.6.0 // indirect
 	github.com/gorilla/securecookie v1.1.2 // indirect
 	github.com/gorilla/websocket v1.5.3 // indirect
@@ -63,6 +64,7 @@ require (
 	github.com/json-iterator/go v1.1.12 // indirect
 	github.com/lann/builder v0.0.0-20180802200727-47ae307949d0 // indirect
 	github.com/lann/ps v0.0.0-20150810152359-62de8c46ede0 // indirect
+	github.com/linkedin/goavro/v2 v2.14.0 // indirect
 	github.com/mailru/easyjson v0.9.0 // indirect
 	github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect
 	github.com/modern-go/reflect2 v1.0.2 // indirect
diff --git a/go.sum b/go.sum
index 6f61908..3c51770 100644
--- a/go.sum
+++ b/go.sum
@@ -91,6 +91,9 @@ github.com/golang-jwt/jwt/v5 v5.2.2 h1:Rl4B7itRWVtYIHFrSNd7vhTiz9UpLdi6gZhZ3wEeD
 github.com/golang-jwt/jwt/v5 v5.2.2/go.mod h1:pqrtFR0X4osieyHYxtmOUWsAWrfe1Q5UVIyoH402zdk=
 github.com/golang-migrate/migrate/v4 v4.18.2 h1:2VSCMz7x7mjyTXx3m2zPokOY82LTRgxK1yQYKo6wWQ8=
 github.com/golang-migrate/migrate/v4 v4.18.2/go.mod h1:2CM6tJvn2kqPXwnXO/d3rAQYiyoIm180VsO8PRX6Rpk=
+github.com/golang/snappy v0.0.1/go.mod h1:/XxbfmMg8lxefKM7IXC3fBNl/7bRcc72aCRzEWrmP2Q=
+github.com/golang/snappy v0.0.4 h1:yAGX7huGHXlcLOEtBnF4w7FQwA26wojNCwOYAEhLjQM=
+github.com/golang/snappy v0.0.4/go.mod h1:/XxbfmMg8lxefKM7IXC3fBNl/7bRcc72aCRzEWrmP2Q=
 github.com/google/go-cmp v0.6.0/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY=
 github.com/google/go-cmp v0.7.0 h1:wk8382ETsv4JYUZwIsn6YpYiWiBsYLSJiTsyBybVuN8=
 github.com/google/go-cmp v0.7.0/go.mod h1:pXiqmnSA92OHEEa9HXL2W4E7lf9JzCmGVUdgjX3N/iU=
@@ -166,6 +169,8 @@ github.com/lann/ps v0.0.0-20150810152359-62de8c46ede0/go.mod h1:vmVJ0l/dxyfGW6Fm
 github.com/lib/pq v1.2.0/go.mod h1:5WUZQaWbwv1U+lTReE5YruASi9Al49XbQIvNi/34Woo=
 github.com/lib/pq v1.10.9 h1:YXG7RB+JIjhP29X+OtkiDnYaXQwpS4JEWq7dtCCRUEw=
 github.com/lib/pq v1.10.9/go.mod h1:AlVN5x4E4T544tWzH6hKfbfQvm3HdbOxrmggDNAPY9o=
+github.com/linkedin/goavro/v2 v2.14.0 
h1:aNO/js65U+Mwq4yB5f1h01c3wiM458qtRad1DN0CMUI= +github.com/linkedin/goavro/v2 v2.14.0/go.mod h1:KXx+erlq+RPlGSPmLF7xGo6SAbh8sCQ53x064+ioxhk= github.com/mailru/easyjson v0.9.0 h1:PrnmzHw7262yW8sTBwxi1PdJA3Iw/EKBa8psRf7d9a4= github.com/mailru/easyjson v0.9.0/go.mod h1:1+xMtQp2MRNVL/V1bOzuP3aP8VNwRW55fQUto+XFtTU= github.com/mattn/go-sqlite3 v1.10.0/go.mod h1:FPy6KqzDD04eiIsT53CuJW3U88zkxoIYsOqkbpncsNc= @@ -233,6 +238,7 @@ github.com/stretchr/testify v1.2.2/go.mod h1:a8OnRcib4nhh0OaRAV+Yts87kKdq0PP7pXf github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI= github.com/stretchr/testify v1.4.0/go.mod h1:j7eGeouHqKxXV5pUuKE4zz7dFj8WfuZ+81PSLYec5m4= github.com/stretchr/testify v1.7.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= +github.com/stretchr/testify v1.7.5/go.mod h1:yNjHg4UonilssWZ8iaSj1OCr/vHnekPRkoO+kdMU+MU= github.com/stretchr/testify v1.8.0/go.mod h1:yNjHg4UonilssWZ8iaSj1OCr/vHnekPRkoO+kdMU+MU= github.com/stretchr/testify v1.8.1/go.mod h1:w2LPCIKwWwSfY2zedu0+kehJoqGctiVI29o6fzry7u4= github.com/stretchr/testify v1.10.0 h1:Xv5erBjTwe/5IxqUQTdXv5kgmIvbHo3QQyRwhJsOfJA= diff --git a/internal/config/config.go b/internal/config/config.go index 7332941..74ee9b0 100644 --- a/internal/config/config.go +++ b/internal/config/config.go @@ -9,6 +9,7 @@ import ( "encoding/json" "time" + "github.com/ClusterCockpit/cc-backend/internal/memorystore" cclog "github.com/ClusterCockpit/cc-lib/ccLogger" ) @@ -166,3 +167,12 @@ func Init(mainConfig json.RawMessage, clusterConfig json.RawMessage) { cclog.Abort("Config Init: At least one cluster required in config. Exited with error.") } } + +func InitMetricStore(msConfig json.RawMessage) { + // Validate(msConfigSchema, msConfig) + dec := json.NewDecoder(bytes.NewReader(msConfig)) + dec.DisallowUnknownFields() + if err := dec.Decode(&memorystore.Keys); err != nil { + cclog.Abortf("Metric Store Config Init: Could not decode config file '%s'.\nError: %s\n", msConfig, err.Error()) + } +} diff --git a/internal/importer/initDB.go b/internal/importer/initDB.go index 179c21c..79879b2 100644 --- a/internal/importer/initDB.go +++ b/internal/importer/initDB.go @@ -142,6 +142,10 @@ func InitDB() error { continue } + if jobMeta.Shared == "" { + jobMeta.Shared = "none" + } + id, err := r.TransactionAddNamed(t, repository.NamedJobInsert, jobMeta) if err != nil { diff --git a/internal/memorystore/checkpoint.go b/internal/memorystore/checkpoint.go index ecd6fb1..80a048b 100644 --- a/internal/memorystore/checkpoint.go +++ b/internal/memorystore/checkpoint.go @@ -19,7 +19,7 @@ import ( "sync/atomic" "time" - "github.com/ClusterCockpit/cc-backend/pkg/avro" + "github.com/ClusterCockpit/cc-backend/internal/avro" "github.com/ClusterCockpit/cc-lib/util" "github.com/linkedin/goavro/v2" ) diff --git a/internal/memorystore/memorystore.go b/internal/memorystore/memorystore.go index 7659a89..76079d4 100644 --- a/internal/memorystore/memorystore.go +++ b/internal/memorystore/memorystore.go @@ -2,16 +2,18 @@ package memorystore import ( "context" + "encoding/json" "errors" + "fmt" "log" "runtime" "sync" "time" - "github.com/ClusterCockpit/cc-backend/pkg/avro" + "github.com/ClusterCockpit/cc-backend/internal/avro" "github.com/ClusterCockpit/cc-lib/resampler" + "github.com/ClusterCockpit/cc-lib/schema" "github.com/ClusterCockpit/cc-lib/util" - "github.com/ClusterCockpit/cc-metric-store/internal/config" ) var ( @@ -29,20 +31,101 @@ func init() { } } +// For aggregation over multiple values at different cpus/sockets/..., not time! 
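+//
+// A decoding sketch (the strings "", "sum" and "avg" are the accepted
+// values; anything else is rejected by UnmarshalJSON below):
+//
+//	var as AggregationStrategy
+//	_ = json.Unmarshal([]byte(`"avg"`), &as) // as == AvgAggregation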
+type AggregationStrategy int
+
+const (
+	NoAggregation AggregationStrategy = iota
+	SumAggregation
+	AvgAggregation
+)
+
+func (as *AggregationStrategy) UnmarshalJSON(data []byte) error {
+	var str string
+	if err := json.Unmarshal(data, &str); err != nil {
+		return err
+	}
+
+	switch str {
+	case "":
+		*as = NoAggregation
+	case "sum":
+		*as = SumAggregation
+	case "avg":
+		*as = AvgAggregation
+	default:
+		return fmt.Errorf("invalid aggregation strategy: %#v", str)
+	}
+	return nil
+}
+
+type MetricConfig struct {
+	// Interval in seconds at which measurements will arrive.
+	Frequency int64 `json:"frequency"`
+
+	// Can be 'sum', 'avg' or null. Describes how to aggregate metrics from the same timestep over the hierarchy.
+	Aggregation AggregationStrategy `json:"aggregation"`
+
+	// Private, used internally...
+	Offset int
+}
+
 type Metric struct {
 	Name  string
 	Value util.Float
-	MetricConfig config.MetricConfig
+	MetricConfig MetricConfig
 }
 
 type MemoryStore struct {
-	Metrics map[string]config.MetricConfig
+	Metrics map[string]MetricConfig
 	root    Level
 }
 
+func Init() {
+	startupTime := time.Now()
+
+	//Pass the keys from cluster config
+	InitMetrics()
+
+	ms := GetMemoryStore()
+
+	d, err := time.ParseDuration(Keys.Checkpoints.Restore)
+	if err != nil {
+		log.Fatal(err)
+	}
+
+	restoreFrom := startupTime.Add(-d)
+	log.Printf("Loading checkpoints newer than %s\n", restoreFrom.Format(time.RFC3339))
+	files, err := ms.FromCheckpointFiles(Keys.Checkpoints.RootDir, restoreFrom.Unix())
+	loadedData := ms.SizeInBytes() / 1024 / 1024 // In MB
+	if err != nil {
+		log.Fatalf("Loading checkpoints failed: %s\n", err.Error())
+	} else {
+		log.Printf("Checkpoints loaded (%d files, %d MB, that took %fs)\n", files, loadedData, time.Since(startupTime).Seconds())
+	}
+
+	// Try to use less memory by forcing a GC run here and then
+	// lowering the target percentage. The default of 100 means
+	// that only once the ratio of new allocations exceeds the
+	// previously active heap, a GC is triggered.
+	// Forcing a GC here will set the "previously active heap"
+	// to a minimum.
+	runtime.GC()
+
+	ctx, _ := context.WithCancel(context.Background())
+
+	var wg sync.WaitGroup
+	wg.Add(4)
+
+	Retention(&wg, ctx)
+	Checkpointing(&wg, ctx)
+	Archiving(&wg, ctx)
+	avro.DataStaging(&wg, ctx)
+}
+
 // Create a new, initialized instance of a MemoryStore.
 // Will panic if values in the metric configurations are invalid.
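+//
+// A usage sketch (metric name made up; Frequency in seconds):
+//
+//	InitMetrics(map[string]MetricConfig{
+//		"mem_used": {Frequency: 60, Aggregation: SumAggregation},
+//	})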
-func Init(metrics map[string]config.MetricConfig) {
+func InitMetrics(metrics map[string]MetricConfig) {
 	singleton.Do(func() {
 		offset := 0
 		for key, cfg := range metrics {
@@ -50,7 +133,7 @@ func Init(metrics map[string]config.MetricConfig) {
 				panic("invalid frequency")
 			}
 
-			metrics[key] = config.MetricConfig{
+			metrics[key] = MetricConfig{
 				Frequency:   cfg.Frequency,
 				Aggregation: cfg.Aggregation,
 				Offset:      offset,
@@ -77,16 +160,16 @@ func GetMemoryStore() *MemoryStore {
 }
 
 func Shutdown() {
-	log.Printf("Writing to '%s'...\n", config.Keys.Checkpoints.RootDir)
+	log.Printf("Writing to '%s'...\n", Keys.Checkpoints.RootDir)
 	var files int
 	var err error
 
 	ms := GetMemoryStore()
 
-	if config.Keys.Checkpoints.FileFormat == "json" {
-		files, err = ms.ToCheckpoint(config.Keys.Checkpoints.RootDir, lastCheckpoint.Unix(), time.Now().Unix())
+	if Keys.Checkpoints.FileFormat == "json" {
+		files, err = ms.ToCheckpoint(Keys.Checkpoints.RootDir, lastCheckpoint.Unix(), time.Now().Unix())
 	} else {
-		files, err = avro.GetAvroStore().ToCheckpoint(config.Keys.Checkpoints.RootDir, true)
+		files, err = avro.GetAvroStore().ToCheckpoint(Keys.Checkpoints.RootDir, true)
 		close(avro.LineProtocolMessages)
 	}
 
@@ -172,7 +255,7 @@ func Retention(wg *sync.WaitGroup, ctx context.Context) {
 
 	go func() {
 		defer wg.Done()
-		d, err := time.ParseDuration(config.Keys.RetentionInMemory)
+		d, err := time.ParseDuration(Keys.RetentionInMemory)
 		if err != nil {
 			log.Fatal(err)
 		}
@@ -261,7 +344,7 @@ func (m *MemoryStore) WriteToLevel(l *Level, selector []string, ts int64, metric
 // If the level does not hold the metric itself, the data will be aggregated recursively from the children.
 // The second and third return value are the actual from/to for the data. Those can be different from
 // the range asked for if no data was available.
-func (m *MemoryStore) Read(selector util.Selector, metric string, from, to, resolution int64) ([]util.Float, int64, int64, int64, error) {
+func (m *MemoryStore) Read(selector util.Selector, metric string, from, to, resolution int64) ([]schema.Float, int64, int64, int64, error) {
 	if from > to {
 		return nil, 0, 0, 0, errors.New("invalid time range")
 	}
@@ -271,7 +354,7 @@ func (m *MemoryStore) Read(selector util.Selector, metric string, from, to, reso
 		return nil, 0, 0, 0, errors.New("unknown metric: " + metric)
 	}
 
-	n, data := 0, make([]util.Float, (to-from)/minfo.Frequency+1)
+	n, data := 0, make([]schema.Float, (to-from)/minfo.Frequency+1)
 
 	err := m.root.findBuffers(selector, minfo.Offset, func(b *buffer) error {
 		cdata, cfrom, cto, err := b.read(from, to, data)
@@ -309,12 +392,12 @@ func (m *MemoryStore) Read(selector util.Selector, metric string, from, to, reso
 	} else if n > 1 {
-		if minfo.Aggregation == config.AvgAggregation {
-			normalize := 1. / util.Float(n)
+		if minfo.Aggregation == AvgAggregation {
+			normalize := 1. 
/ schema.Float(n) for i := 0; i < len(data); i++ { data[i] *= normalize } - } else if minfo.Aggregation != config.SumAggregation { + } else if minfo.Aggregation != SumAggregation { return nil, 0, 0, 0, errors.New("invalid aggregation") } } diff --git a/internal/memorystore/stats.go b/internal/memorystore/stats.go index 6682d62..831e282 100644 --- a/internal/memorystore/stats.go +++ b/internal/memorystore/stats.go @@ -5,7 +5,6 @@ import ( "math" "github.com/ClusterCockpit/cc-lib/util" - "github.com/ClusterCockpit/cc-metric-store/internal/config" ) type Stats struct { @@ -105,9 +104,9 @@ func (m *MemoryStore) Stats(selector util.Selector, metric string, from, to int6 return nil, 0, 0, ErrNoData } - if minfo.Aggregation == config.AvgAggregation { + if minfo.Aggregation == AvgAggregation { avg /= util.Float(n) - } else if n > 1 && minfo.Aggregation != config.SumAggregation { + } else if n > 1 && minfo.Aggregation != SumAggregation { return nil, 0, 0, errors.New("invalid aggregation") } diff --git a/internal/repository/jobCreate.go b/internal/repository/jobCreate.go index aa2ea76..666313f 100644 --- a/internal/repository/jobCreate.go +++ b/internal/repository/jobCreate.go @@ -14,19 +14,19 @@ import ( ) const NamedJobCacheInsert string = `INSERT INTO job_cache ( - job_id, hpc_user, project, cluster, subcluster, cluster_partition, array_job_id, num_nodes, num_hwthreads, num_acc, - exclusive, monitoring_status, smt, job_state, start_time, duration, walltime, footprint, energy, energy_footprint, resources, meta_data + job_id, hpc_user, project, hpc_cluster, subcluster, cluster_partition, array_job_id, num_nodes, num_hwthreads, num_acc, + shared, monitoring_status, smt, job_state, start_time, duration, walltime, footprint, energy, energy_footprint, resources, meta_data ) VALUES ( - :job_id, :hpc_user, :project, :cluster, :subcluster, :cluster_partition, :array_job_id, :num_nodes, :num_hwthreads, :num_acc, - :exclusive, :monitoring_status, :smt, :job_state, :start_time, :duration, :walltime, :footprint, :energy, :energy_footprint, :resources, :meta_data + :job_id, :hpc_user, :project, :hpc_cluster, :subcluster, :cluster_partition, :array_job_id, :num_nodes, :num_hwthreads, :num_acc, + :shared, :monitoring_status, :smt, :job_state, :start_time, :duration, :walltime, :footprint, :energy, :energy_footprint, :resources, :meta_data );` const NamedJobInsert string = `INSERT INTO job ( - job_id, hpc_user, project, cluster, subcluster, cluster_partition, array_job_id, num_nodes, num_hwthreads, num_acc, - exclusive, monitoring_status, smt, job_state, start_time, duration, walltime, footprint, energy, energy_footprint, resources, meta_data + job_id, hpc_user, project, hpc_cluster, subcluster, cluster_partition, array_job_id, num_nodes, num_hwthreads, num_acc, + shared, monitoring_status, smt, job_state, start_time, duration, walltime, footprint, energy, energy_footprint, resources, meta_data ) VALUES ( - :job_id, :hpc_user, :project, :cluster, :subcluster, :cluster_partition, :array_job_id, :num_nodes, :num_hwthreads, :num_acc, - :exclusive, :monitoring_status, :smt, :job_state, :start_time, :duration, :walltime, :footprint, :energy, :energy_footprint, :resources, :meta_data + :job_id, :hpc_user, :project, :hpc_cluster, :subcluster, :cluster_partition, :array_job_id, :num_nodes, :num_hwthreads, :num_acc, + :shared, :monitoring_status, :smt, :job_state, :start_time, :duration, :walltime, :footprint, :energy, :energy_footprint, :resources, :meta_data );` func (r *JobRepository) InsertJob(job *schema.Job) (int64, 
error) { diff --git a/internal/repository/migrations/sqlite3/09_add-job-cache.up.sql b/internal/repository/migrations/sqlite3/09_add-job-cache.up.sql index 003eab0..2c25029 100644 --- a/internal/repository/migrations/sqlite3/09_add-job-cache.up.sql +++ b/internal/repository/migrations/sqlite3/09_add-job-cache.up.sql @@ -3,7 +3,7 @@ CREATE TABLE "job_cache" ( job_id BIGINT NOT NULL, hpc_cluster VARCHAR(255) NOT NULL, subcluster VARCHAR(255) NOT NULL, - submit_time BIGINT NOT NULL, -- Unix timestamp + submit_time BIGINT NOT NULL DEFAULT 0, -- Unix timestamp start_time BIGINT NOT NULL DEFAULT 0, -- Unix timestamp hpc_user VARCHAR(255) NOT NULL, project VARCHAR(255) NOT NULL, @@ -30,7 +30,7 @@ CREATE TABLE "job_cache" ( energy REAL NOT NULL DEFAULT 0.0, energy_footprint TEXT DEFAULT NULL, footprint TEXT DEFAULT NULL, - UNIQUE (job_id, cluster, start_time) + UNIQUE (job_id, hpc_cluster, start_time) ); CREATE TABLE "job_new" ( @@ -65,10 +65,21 @@ CREATE TABLE "job_new" ( energy REAL NOT NULL DEFAULT 0.0, energy_footprint TEXT DEFAULT NULL, footprint TEXT DEFAULT NULL, - UNIQUE (job_id, cluster, start_time) + UNIQUE (job_id, hpc_cluster, start_time) ); ALTER TABLE job RENAME COLUMN cluster TO hpc_cluster; -INSERT INTO job_new SELECT * FROM job; +INSERT INTO job_new ( + id, job_id, hpc_cluster, subcluster, submit_time, start_time, hpc_user, project, + cluster_partition, array_job_id, duration, walltime, job_state, meta_data, resources, + num_nodes, num_hwthreads, num_acc, smt, shared, monitoring_status, energy, + energy_footprint, footprint +) +SELECT + id, job_id, hpc_cluster, subcluster, 0, start_time, hpc_user, project, + cluster_partition, array_job_id, duration, walltime, job_state, meta_data, resources, + num_nodes, num_hwthreads, num_acc, smt, exclusive, monitoring_status, energy, + energy_footprint, footprint +FROM job; DROP TABLE job; ALTER TABLE job_new RENAME TO job; diff --git a/internal/taskManager/taskManager.go b/internal/taskManager/taskManager.go index 7231d12..df6c4d0 100644 --- a/internal/taskManager/taskManager.go +++ b/internal/taskManager/taskManager.go @@ -7,6 +7,7 @@ package taskManager import ( "bytes" "encoding/json" + "fmt" "time" "github.com/ClusterCockpit/cc-backend/internal/auth" @@ -65,10 +66,14 @@ func Start(cronCfg, archiveConfig json.RawMessage) { RegisterStopJobsExceedTime() } + fmt.Printf("Keys : %#v\n", Keys) + fmt.Printf("cronCfg : %#v\n", cronCfg) + fmt.Printf("archiveConfig : %#v\n", archiveConfig) + dec := json.NewDecoder(bytes.NewReader(cronCfg)) dec.DisallowUnknownFields() if err := dec.Decode(&Keys); err != nil { - cclog.Errorf("error while decoding ldap config: %v", err) + cclog.Errorf("error while decoding cron config: %v", err) } var cfg struct { diff --git a/var/._job-archive b/var/._job-archive new file mode 100755 index 0000000000000000000000000000000000000000..9d11b52bb7ed13ffc4799b7e3bcb26eb2c0b9b7a GIT binary patch literal 163 zcmZQz6=P>$Vqox1Ojhs@R)|o50+1L3ClDI}aUl?c_=|y<2;dkJ5(HHS(lG;wxzV&S oBE&_L^K Date: Mon, 8 Sep 2025 11:29:27 +0200 Subject: [PATCH 03/11] Combined metricstore api and functions --- cmd/cc-backend/main.go | 6 +- cmd/cc-backend/server.go | 10 + configs/config-demo.json | 27 +- configs/config.json | 4 + internal/api/rest.go | 15 + internal/auth/auth.go | 36 ++ internal/avro/avroCheckpoint.go | 9 +- internal/avro/avroHelper.go | 5 +- internal/avro/avroStruct.go | 14 +- internal/config/config.go | 12 +- internal/config/memorystore.go | 128 ++++++ internal/config/schema.go | 2 +- internal/memorystore/api.go | 419 
++++++++++++++++++++++++
 internal/memorystore/archive.go             |  17 +-
 internal/memorystore/buffer.go              |  22 +-
 internal/memorystore/checkpoint.go          |  81 ++--
 internal/memorystore/config.go              |  26 --
 internal/memorystore/debug.go               |   2 +-
 internal/memorystore/healthcheck.go         |   2 +-
 internal/memorystore/lineprotocol.go        | 349 ++++++++++++++++
 internal/memorystore/memorystore.go         | 139 +++----
 internal/memorystore/stats.go               |   5 +-
 internal/metricDataDispatcher/dataLoader.go |   6 +-
 internal/metricdata/cc-metric-store.go      | 325 +++++----------
 internal/metricdata/utils.go                |   3 +-
 pkg/archive/clusterConfig.go                |  14 +
 26 files changed, 1248 insertions(+), 430 deletions(-)
 create mode 100644 internal/config/memorystore.go
 create mode 100644 internal/memorystore/api.go
 delete mode 100644 internal/memorystore/config.go
 create mode 100644 internal/memorystore/lineprotocol.go

diff --git a/cmd/cc-backend/main.go b/cmd/cc-backend/main.go
index 0790a0b..9c7ad1f 100644
--- a/cmd/cc-backend/main.go
+++ b/cmd/cc-backend/main.go
@@ -248,8 +248,10 @@ func main() {
 		cclog.Exit("No errors, server flag not set. Exiting cc-backend.")
 	}
 
+	var wg sync.WaitGroup
+
 	//Metric Store starts after all flags have been processed
-	memorystore.Init()
+	memorystore.Init(wg)
 
 	archiver.Start(repository.GetJobRepository())
 
@@ -259,8 +261,6 @@ func main() {
 
 	serverInit()
 
-	var wg sync.WaitGroup
-
 	wg.Add(1)
 	go func() {
 		defer wg.Done()
diff --git a/cmd/cc-backend/server.go b/cmd/cc-backend/server.go
index 537270d..18d7ea5 100644
--- a/cmd/cc-backend/server.go
+++ b/cmd/cc-backend/server.go
@@ -119,6 +119,7 @@ func serverInit() {
 	userapi := router.PathPrefix("/userapi").Subrouter()
 	configapi := router.PathPrefix("/config").Subrouter()
 	frontendapi := router.PathPrefix("/frontend").Subrouter()
+	metricstoreapi := router.PathPrefix("/metricstore").Subrouter()
 
 	if !config.Keys.DisableAuthentication {
 		router.Handle("/login", authHandle.Login(
@@ -199,6 +200,14 @@ func serverInit() {
 				onFailureResponse)
 		})
 
+		metricstoreapi.Use(func(next http.Handler) http.Handler {
+			return authHandle.AuthMetricStoreApi(
+				// On success;
+				next,
+				// On failure: JSON Response
+				onFailureResponse)
+		})
+
 		configapi.Use(func(next http.Handler) http.Handler {
 			return authHandle.AuthConfigApi(
 				// On success;
@@ -232,6 +241,7 @@ func serverInit() {
 	routerConfig.SetupRoutes(secured, buildInfo)
 	apiHandle.MountApiRoutes(securedapi)
 	apiHandle.MountUserApiRoutes(userapi)
+	apiHandle.MountMetricStoreApiRoutes(metricstoreapi)
 	apiHandle.MountConfigApiRoutes(configapi)
 	apiHandle.MountFrontendApiRoutes(frontendapi)
 
diff --git a/configs/config-demo.json b/configs/config-demo.json
index d388d78..a31d65d 100644
--- a/configs/config-demo.json
+++ b/configs/config-demo.json
@@ -9,6 +9,10 @@
     "apiAllowedIPs": ["*"],
     "emission-constant": 317
   },
+  "archive": {
+    "kind": "file",
+    "path": "./var/job-archive"
+  },
   "auth": {
     "jwts": {
       "max-age": "2000h"
@@ -18,9 +22,7 @@
     {
       "name": "fritz",
       "metricDataRepository": {
-        "kind": "cc-metric-store",
-        "url": "http://localhost:8082",
-        "token": ""
+        "kind": "cc-metric-store"
       },
       "filterRanges": {
@@ -40,9 +42,7 @@
     {
      "name": "alex",
      "metricDataRepository": {
-        "kind": "cc-metric-store",
-        "url": "http://localhost:8082",
-        "token": ""
+        "kind": "cc-metric-store"
      },
      "filterRanges": {
@@ -59,5 +59,18 @@
         }
       }
     }
-  ]
+  ],
+  "metric-store": {
+    "checkpoints": {
+      "file-format": "avro",
+      "interval": "2h",
+      "directory": "./var/checkpoints",
+      "restore": "48h"
+    },
+    "archive": {
+      "interval": "48h",
+      "directory": "./var/archive"
+    },
+    
"retention-in-memory": "48h" + } } diff --git a/configs/config.json b/configs/config.json index 27c4ce2..ed7d546 100644 --- a/configs/config.json +++ b/configs/config.json @@ -13,6 +13,10 @@ "resolutions": [600, 300, 120, 60] } }, + "archive": { + "kind": "file", + "path": "./var/job-archive" + }, "clusters": [ { "name": "test", diff --git a/internal/api/rest.go b/internal/api/rest.go index e4411a4..8cefe48 100644 --- a/internal/api/rest.go +++ b/internal/api/rest.go @@ -15,6 +15,7 @@ import ( "github.com/ClusterCockpit/cc-backend/internal/auth" "github.com/ClusterCockpit/cc-backend/internal/config" + "github.com/ClusterCockpit/cc-backend/internal/memorystore" "github.com/ClusterCockpit/cc-backend/internal/repository" cclog "github.com/ClusterCockpit/cc-lib/ccLogger" "github.com/ClusterCockpit/cc-lib/schema" @@ -95,6 +96,20 @@ func (api *RestApi) MountUserApiRoutes(r *mux.Router) { r.HandleFunc("/jobs/metrics/{id}", api.getJobMetrics).Methods(http.MethodGet) } +func (api *RestApi) MountMetricStoreApiRoutes(r *mux.Router) { + r.StrictSlash(true) + // REST API Uses TokenAuth + r.HandleFunc("/api/free", memorystore.HandleFree).Methods(http.MethodPost) + r.HandleFunc("/api/write", memorystore.HandleWrite).Methods(http.MethodPost) + r.HandleFunc("/api/debug", memorystore.HandleDebug).Methods(http.MethodGet) + r.HandleFunc("/api/healthcheck", memorystore.HandleHealthCheck).Methods(http.MethodGet) + // Refactor + r.HandleFunc("/api/free/", memorystore.HandleFree).Methods(http.MethodPost) + r.HandleFunc("/api/write/", memorystore.HandleWrite).Methods(http.MethodPost) + r.HandleFunc("/api/debug/", memorystore.HandleDebug).Methods(http.MethodGet) + r.HandleFunc("/api/healthcheck/", memorystore.HandleHealthCheck).Methods(http.MethodGet) +} + func (api *RestApi) MountConfigApiRoutes(r *mux.Router) { r.StrictSlash(true) // Settings Frontend Uses SessionAuth diff --git a/internal/auth/auth.go b/internal/auth/auth.go index 6564878..5a80f7c 100644 --- a/internal/auth/auth.go +++ b/internal/auth/auth.go @@ -417,6 +417,42 @@ func (auth *Authentication) AuthUserApi( }) } +func (auth *Authentication) AuthMetricStoreApi( + onsuccess http.Handler, + onfailure func(rw http.ResponseWriter, r *http.Request, authErr error), +) http.Handler { + return http.HandlerFunc(func(rw http.ResponseWriter, r *http.Request) { + user, err := auth.JwtAuth.AuthViaJWT(rw, r) + if err != nil { + cclog.Infof("auth metricstore api -> authentication failed: %s", err.Error()) + onfailure(rw, r, err) + return + } + + if user != nil { + switch { + case len(user.Roles) == 1: + if user.HasRole(schema.RoleApi) { + ctx := context.WithValue(r.Context(), repository.ContextUserKey, user) + onsuccess.ServeHTTP(rw, r.WithContext(ctx)) + return + } + case len(user.Roles) >= 2: + if user.HasRole(schema.RoleApi) && user.HasAnyRole([]schema.Role{schema.RoleUser, schema.RoleManager, schema.RoleAdmin}) { + ctx := context.WithValue(r.Context(), repository.ContextUserKey, user) + onsuccess.ServeHTTP(rw, r.WithContext(ctx)) + return + } + default: + cclog.Info("auth metricstore api -> authentication failed: missing role") + onfailure(rw, r, errors.New("unauthorized")) + } + } + cclog.Info("auth metricstore api -> authentication failed: no auth") + onfailure(rw, r, errors.New("unauthorized")) + }) +} + func (auth *Authentication) AuthConfigApi( onsuccess http.Handler, onfailure func(rw http.ResponseWriter, r *http.Request, authErr error), diff --git a/internal/avro/avroCheckpoint.go b/internal/avro/avroCheckpoint.go index 4a3cf19..4d72d36 100644 --- 
a/internal/avro/avroCheckpoint.go +++ b/internal/avro/avroCheckpoint.go @@ -19,7 +19,8 @@ import ( "sync/atomic" "time" - "github.com/ClusterCockpit/cc-lib/util" + "github.com/ClusterCockpit/cc-backend/internal/config" + "github.com/ClusterCockpit/cc-lib/schema" "github.com/linkedin/goavro/v2" ) @@ -139,7 +140,7 @@ func getTimestamp(dir string) int64 { } } - interval, _ := time.ParseDuration(Keys.Checkpoints.Interval) + interval, _ := time.ParseDuration(config.MetricStoreKeys.Checkpoints.Interval) updateTime := time.Unix(maxTs, 0).Add(interval).Add(time.Duration(CheckpointBufferMinutes-1) * time.Minute).Unix() if updateTime < time.Now().Unix() { @@ -408,7 +409,7 @@ func compareSchema(schemaRead, schemaGen string) (bool, string, error) { return true, string(mergedSchemaJson), nil } -func generateSchema(data map[string]util.Float) (string, error) { +func generateSchema(data map[string]schema.Float) (string, error) { // Define the Avro schema structure schema := map[string]interface{}{ "type": "record", @@ -440,7 +441,7 @@ func generateSchema(data map[string]util.Float) (string, error) { return string(schemaString), nil } -func generateRecord(data map[string]util.Float) map[string]interface{} { +func generateRecord(data map[string]schema.Float) map[string]interface{} { record := make(map[string]interface{}) // Iterate through each map in data diff --git a/internal/avro/avroHelper.go b/internal/avro/avroHelper.go index ee09759..ea733cd 100644 --- a/internal/avro/avroHelper.go +++ b/internal/avro/avroHelper.go @@ -6,13 +6,14 @@ import ( "strconv" "sync" + "github.com/ClusterCockpit/cc-backend/internal/config" ) func DataStaging(wg *sync.WaitGroup, ctx context.Context) { // AvroPool is a pool of Avro writers. go func() { - if Keys.Checkpoints.FileFormat == "json" { + if config.MetricStoreKeys.Checkpoints.FileFormat == "json" { wg.Done() // Mark this goroutine as done return // Exit the goroutine } @@ -28,7 +29,7 @@ func DataStaging(wg *sync.WaitGroup, ctx context.Context) { return case val := <-LineProtocolMessages: //Fetch the frequency of the metric from the global configuration - freq, err := Keys.GetMetricFrequency(val.MetricName) + freq, err := config.MetricStoreKeys.GetMetricFrequency(val.MetricName) if err != nil { fmt.Printf("Error fetching metric frequency: %s\n", err) continue diff --git a/internal/avro/avroStruct.go b/internal/avro/avroStruct.go index 27aac47..ee65291 100644 --- a/internal/avro/avroStruct.go +++ b/internal/avro/avroStruct.go @@ -3,7 +3,7 @@ package avro import ( "sync" - "github.com/ClusterCockpit/cc-lib/util" + "github.com/ClusterCockpit/cc-lib/schema" ) var ( @@ -20,7 +20,7 @@ type AvroStruct struct { Cluster string Node string Selector []string - Value util.Float + Value schema.Float Timestamp int64 } @@ -32,7 +32,7 @@ var avroStore AvroStore type AvroLevel struct { children map[string]*AvroLevel - data map[int64]map[string]util.Float + data map[int64]map[string]schema.Float lock sync.RWMutex } @@ -81,7 +81,7 @@ func (l *AvroLevel) findAvroLevelOrCreate(selector []string) *AvroLevel { } child = &AvroLevel{ - data: make(map[int64]map[string]util.Float, 0), + data: make(map[int64]map[string]schema.Float, 0), children: nil, } @@ -94,7 +94,7 @@ func (l *AvroLevel) findAvroLevelOrCreate(selector []string) *AvroLevel { return child.findAvroLevelOrCreate(selector[1:]) } -func (l *AvroLevel) addMetric(metricName string, value util.Float, timestamp int64, Freq int) { +func (l *AvroLevel) addMetric(metricName string, value schema.Float, timestamp int64, Freq int) { 
l.lock.Lock() defer l.lock.Unlock() @@ -104,7 +104,7 @@ func (l *AvroLevel) addMetric(metricName string, value util.Float, timestamp int if len(l.data) != KeyCounter { if len(l.data) == 0 { for i := range KeyCounter { - l.data[timestamp+int64(i*Freq)] = make(map[string]util.Float, 0) + l.data[timestamp+int64(i*Freq)] = make(map[string]schema.Float, 0) } } else { // Get the last timestamp @@ -115,7 +115,7 @@ func (l *AvroLevel) addMetric(metricName string, value util.Float, timestamp int } } // Create keys for the next KeyCounter timestamps - l.data[lastTs+int64(Freq)] = make(map[string]util.Float, 0) + l.data[lastTs+int64(Freq)] = make(map[string]schema.Float, 0) } } diff --git a/internal/config/config.go b/internal/config/config.go index 74ee9b0..183608c 100644 --- a/internal/config/config.go +++ b/internal/config/config.go @@ -9,7 +9,6 @@ import ( "encoding/json" "time" - "github.com/ClusterCockpit/cc-backend/internal/memorystore" cclog "github.com/ClusterCockpit/cc-lib/ccLogger" ) @@ -163,16 +162,7 @@ func Init(mainConfig json.RawMessage, clusterConfig json.RawMessage) { cclog.Abortf("Config Init: Could not decode config file '%s'.\nError: %s\n", mainConfig, err.Error()) } - if Clusters == nil || len(Clusters) < 1 { + if len(Clusters) < 1 { cclog.Abort("Config Init: At least one cluster required in config. Exited with error.") } } - -func InitMetricStore(msConfig json.RawMessage) { - // Validate(msConfigSchema, msConfig) - dec := json.NewDecoder(bytes.NewReader(msConfig)) - dec.DisallowUnknownFields() - if err := dec.Decode(&memorystore.Keys); err != nil { - cclog.Abortf("Metric Store Config Init: Could not decode config file '%s'.\nError: %s\n", msConfig, err.Error()) - } -} diff --git a/internal/config/memorystore.go b/internal/config/memorystore.go new file mode 100644 index 0000000..b9273b4 --- /dev/null +++ b/internal/config/memorystore.go @@ -0,0 +1,128 @@ +package config + +import ( + "bytes" + "encoding/json" + "fmt" + + cclog "github.com/ClusterCockpit/cc-lib/ccLogger" +) + +// -------------------- +// Metric Store config +// -------------------- +type MetricStoreConfig struct { + Checkpoints struct { + FileFormat string `json:"file-format"` + Interval string `json:"interval"` + RootDir string `json:"directory"` + Restore string `json:"restore"` + } `json:"checkpoints"` + Debug struct { + DumpToFile string `json:"dump-to-file"` + EnableGops bool `json:"gops"` + } `json:"debug"` + RetentionInMemory string `json:"retention-in-memory"` + Archive struct { + Interval string `json:"interval"` + RootDir string `json:"directory"` + DeleteInstead bool `json:"delete-instead"` + } `json:"archive"` + Nats []*NatsConfig `json:"nats"` +} + +type NatsConfig struct { + // Address of the nats server + Address string `json:"address"` + + // Username/Password, optional + Username string `json:"username"` + Password string `json:"password"` + + //Creds file path + Credsfilepath string `json:"creds-file-path"` + + Subscriptions []struct { + // Channel name + SubscribeTo string `json:"subscribe-to"` + + // Allow lines without a cluster tag, use this as default, optional + ClusterTag string `json:"cluster-tag"` + } `json:"subscriptions"` +} + +var MetricStoreKeys MetricStoreConfig + +// For aggregation over multiple values at different cpus/sockets/..., not time! 
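+//
+// A mapping sketch (the strings come from the cluster metric config;
+// anything unknown yields an error):
+//
+//	agg, err := AssignAggregationStratergy("avg") // agg == AvgAggregation, err == nil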
+type AggregationStrategy int
+
+const (
+	NoAggregation AggregationStrategy = iota
+	SumAggregation
+	AvgAggregation
+)
+
+func AssignAggregationStratergy(str string) (AggregationStrategy, error) {
+	switch str {
+	case "":
+		return NoAggregation, nil
+	case "sum":
+		return SumAggregation, nil
+	case "avg":
+		return AvgAggregation, nil
+	default:
+		return NoAggregation, fmt.Errorf("[METRICSTORE]> unknown aggregation strategy: %s", str)
+	}
+}
+
+type MetricConfig struct {
+	// Interval in seconds at which measurements will arrive.
+	Frequency int64
+
+	// Can be 'sum', 'avg' or null. Describes how to aggregate metrics from the same timestep over the hierarchy.
+	Aggregation AggregationStrategy
+
+	// Private, used internally...
+	Offset int
+}
+
+var Metrics map[string]MetricConfig
+
+func InitMetricStore(msConfig json.RawMessage) {
+	// Validate(msConfigSchema, msConfig)
+	dec := json.NewDecoder(bytes.NewReader(msConfig))
+	dec.DisallowUnknownFields()
+	if err := dec.Decode(&MetricStoreKeys); err != nil {
+		cclog.Abortf("[METRICSTORE]> Metric Store Config Init: Could not decode config file '%s'.\nError: %s\n", msConfig, err.Error())
+	}
+}
+
+func (c *MetricStoreConfig) GetMetricFrequency(metricName string) (int64, error) {
+	// if metric, ok := c.Metrics[metricName]; ok {
+	// 	return metric.Frequency, nil
+	// }
+	return 0, fmt.Errorf("[METRICSTORE]> metric %s not found", metricName)
+}
+
+// Add logic to add metrics. Redundant metrics should be updated with the max frequency.
+// Use metric.Name to check if the metric already exists;
+// if not, add it to the Metrics map.
+func AddMetric(name string, metric MetricConfig) error {
+
+	if Metrics == nil {
+		Metrics = make(map[string]MetricConfig, 0)
+	}
+
+	if existingMetric, ok := Metrics[name]; ok {
+		if existingMetric.Frequency != metric.Frequency {
+			if existingMetric.Frequency < metric.Frequency {
+				existingMetric.Frequency = metric.Frequency
+				Metrics[name] = existingMetric
+			}
+		}
+	} else {
+		Metrics[name] = metric
+	}
+
+	return nil
+}
diff --git a/internal/config/schema.go b/internal/config/schema.go
index 37d662a..ca0440e 100644
--- a/internal/config/schema.go
+++ b/internal/config/schema.go
@@ -144,7 +144,7 @@ var clustersSchema = `
 				"type": "string"
 			}
 		},
-		"required": ["kind", "url"]
+		"required": ["kind"]
 	},
 	"filterRanges": {
 		"description": "This option controls the slider ranges for the UI controls of numNodes, duration, and startTime.",
diff --git a/internal/memorystore/api.go b/internal/memorystore/api.go
new file mode 100644
index 0000000..367f245
--- /dev/null
+++ b/internal/memorystore/api.go
@@ -0,0 +1,419 @@
+// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
+// All rights reserved.
+// Use of this source code is governed by a MIT-style
+// license that can be found in the LICENSE file.
+package memorystore
+
+import (
+	"bufio"
+	"encoding/json"
+	"errors"
+	"fmt"
+	"io"
+	"log"
+	"math"
+	"net/http"
+	"strconv"
+	"strings"
+
+	"github.com/ClusterCockpit/cc-lib/schema"
+	"github.com/ClusterCockpit/cc-lib/util"
+
+	"github.com/influxdata/line-protocol/v2/lineprotocol"
+)
+
+// @title cc-metric-store REST API
+// @version 1.0.0
+// @description API for cc-metric-store
+
+// @contact.name ClusterCockpit Project
+// @contact.url https://clustercockpit.org
+// @contact.email support@clustercockpit.org
+
+// @license.name MIT License
+// @license.url https://opensource.org/licenses/MIT
+
+// @host localhost:8082
+// @basePath /api/
+
+// @securityDefinitions.apikey ApiKeyAuth
+// @in header
+// @name X-Auth-Token
+
+// ErrorResponse model
+type ErrorResponse struct {
+	// Status text of the error code
+	Status string `json:"status"`
+	Error  string `json:"error"` // Error Message
+}
+
+type ApiMetricData struct {
+	Error      *string           `json:"error,omitempty"`
+	Data       schema.FloatArray `json:"data,omitempty"`
+	From       int64             `json:"from"`
+	To         int64             `json:"to"`
+	Resolution int64             `json:"resolution"`
+	Avg        schema.Float      `json:"avg"`
+	Min        schema.Float      `json:"min"`
+	Max        schema.Float      `json:"max"`
+}
+
+func handleError(err error, statusCode int, rw http.ResponseWriter) {
+	// log.Warnf("REST ERROR : %s", err.Error())
+	rw.Header().Add("Content-Type", "application/json")
+	rw.WriteHeader(statusCode)
+	json.NewEncoder(rw).Encode(ErrorResponse{
+		Status: http.StatusText(statusCode),
+		Error:  err.Error(),
+	})
+}
+
+// TODO: Optimize this, just like the stats endpoint!
+func (data *ApiMetricData) AddStats() {
+	n := 0
+	sum, min, max := 0.0, math.MaxFloat64, -math.MaxFloat64
+	for _, x := range data.Data {
+		if x.IsNaN() {
+			continue
+		}
+
+		n += 1
+		sum += float64(x)
+		min = math.Min(min, float64(x))
+		max = math.Max(max, float64(x))
+	}
+
+	if n > 0 {
+		avg := sum / float64(n)
+		data.Avg = schema.Float(avg)
+		data.Min = schema.Float(min)
+		data.Max = schema.Float(max)
+	} else {
+		data.Avg, data.Min, data.Max = schema.NaN, schema.NaN, schema.NaN
+	}
+}
+
+func (data *ApiMetricData) ScaleBy(f schema.Float) {
+	if f == 0 || f == 1 {
+		return
+	}
+
+	data.Avg *= f
+	data.Min *= f
+	data.Max *= f
+	for i := 0; i < len(data.Data); i++ {
+		data.Data[i] *= f
+	}
+}
+
+func (data *ApiMetricData) PadDataWithNull(ms *MemoryStore, from, to int64, metric string) {
+	minfo, ok := ms.Metrics[metric]
+	if !ok {
+		return
+	}
+
+	if (data.From / minfo.Frequency) > (from / minfo.Frequency) {
+		padfront := int((data.From / minfo.Frequency) - (from / minfo.Frequency))
+		ndata := make([]schema.Float, 0, padfront+len(data.Data))
+		for i := 0; i < padfront; i++ {
+			ndata = append(ndata, schema.NaN)
+		}
+		for j := 0; j < len(data.Data); j++ {
+			ndata = append(ndata, data.Data[j])
+		}
+		data.Data = ndata
+	}
+}
+
+// handleFree godoc
+// @summary Free buffers in the memory store
+// @tags free
+// @description This endpoint allows users to free buffers in the metric
+// store. Buffers can be removed systematically, and the data under a node
+// can be pruned without removing the whole node.
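+//
+// An illustrative request body (cluster/node names are placeholders); each
+// inner list is one selector to free:
+//
+//	[["fritz", "f0201"], ["fritz", "f0202"]]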
+// @produce json +// @param to query string false "up to timestamp" +// @success 200 {string} string "ok" +// @failure 400 {object} api.ErrorResponse "Bad Request" +// @failure 401 {object} api.ErrorResponse "Unauthorized" +// @failure 403 {object} api.ErrorResponse "Forbidden" +// @failure 500 {object} api.ErrorResponse "Internal Server Error" +// @security ApiKeyAuth +// @router /free/ [post] +func HandleFree(rw http.ResponseWriter, r *http.Request) { + rawTo := r.URL.Query().Get("to") + if rawTo == "" { + handleError(errors.New("'to' is a required query parameter"), http.StatusBadRequest, rw) + return + } + + to, err := strconv.ParseInt(rawTo, 10, 64) + if err != nil { + handleError(err, http.StatusInternalServerError, rw) + return + } + + // // TODO: lastCheckpoint might be modified by different go-routines. + // // Load it using the sync/atomic package? + // freeUpTo := lastCheckpoint.Unix() + // if to < freeUpTo { + // freeUpTo = to + // } + + bodyDec := json.NewDecoder(r.Body) + var selectors [][]string + err = bodyDec.Decode(&selectors) + if err != nil { + http.Error(rw, err.Error(), http.StatusBadRequest) + return + } + + ms := GetMemoryStore() + n := 0 + for _, sel := range selectors { + bn, err := ms.Free(sel, to) + if err != nil { + handleError(err, http.StatusInternalServerError, rw) + return + } + + n += bn + } + + rw.WriteHeader(http.StatusOK) + fmt.Fprintf(rw, "buffers freed: %d\n", n) +} + +// handleWrite godoc +// @summary Receive metrics in InfluxDB line-protocol +// @tags write +// @description Write data to the in-memory store in the InfluxDB line-protocol using [this format](https://github.com/ClusterCockpit/cc-specifications/blob/master/metrics/lineprotocol_alternative.md) + +// @accept plain +// @produce json +// @param cluster query string false "If the lines in the body do not have a cluster tag, use this value instead." 
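+//
+// An illustrative line (metric, tags and values are made up; the exact tag
+// set follows the linked format, with the timestamp in seconds):
+//
+//	flops_any,cluster=fritz,hostname=f0201,type=node value=42.0 1725000000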
+// @success 200 {string} string "ok"
+// @failure 400 {object} api.ErrorResponse "Bad Request"
+// @failure 401 {object} api.ErrorResponse "Unauthorized"
+// @failure 403 {object} api.ErrorResponse "Forbidden"
+// @failure 500 {object} api.ErrorResponse "Internal Server Error"
+// @security ApiKeyAuth
+// @router /write/ [post]
+func HandleWrite(rw http.ResponseWriter, r *http.Request) {
+	bytes, err := io.ReadAll(r.Body)
+	rw.Header().Add("Content-Type", "application/json")
+	if err != nil {
+		handleError(err, http.StatusInternalServerError, rw)
+		return
+	}
+
+	ms := GetMemoryStore()
+	dec := lineprotocol.NewDecoderWithBytes(bytes)
+	if err := decodeLine(dec, ms, r.URL.Query().Get("cluster")); err != nil {
+		log.Printf("/api/write error: %s", err.Error())
+		handleError(err, http.StatusBadRequest, rw)
+		return
+	}
+	rw.WriteHeader(http.StatusOK)
+}
+
+type ApiQueryRequest struct {
+	Cluster     string     `json:"cluster"`
+	Queries     []ApiQuery `json:"queries"`
+	ForAllNodes []string   `json:"for-all-nodes"`
+	From        int64      `json:"from"`
+	To          int64      `json:"to"`
+	WithStats   bool       `json:"with-stats"`
+	WithData    bool       `json:"with-data"`
+	WithPadding bool       `json:"with-padding"`
+}
+
+type ApiQueryResponse struct {
+	Queries []ApiQuery        `json:"queries,omitempty"`
+	Results [][]ApiMetricData `json:"results"`
+}
+
+type ApiQuery struct {
+	Type        *string      `json:"type,omitempty"`
+	SubType     *string      `json:"subtype,omitempty"`
+	Metric      string       `json:"metric"`
+	Hostname    string       `json:"host"`
+	Resolution  int64        `json:"resolution"`
+	TypeIds     []string     `json:"type-ids,omitempty"`
+	SubTypeIds  []string     `json:"subtype-ids,omitempty"`
+	ScaleFactor schema.Float `json:"scale-by,omitempty"`
+	Aggregate   bool         `json:"aggreg"`
+}
+
+func FetchData(req ApiQueryRequest) (*ApiQueryResponse, error) {
+
+	req.WithData = true
+
+	ms := GetMemoryStore()
+
+	response := ApiQueryResponse{
+		Results: make([][]ApiMetricData, 0, len(req.Queries)),
+	}
+	if req.ForAllNodes != nil {
+		nodes := ms.ListChildren([]string{req.Cluster})
+		for _, node := range nodes {
+			for _, metric := range req.ForAllNodes {
+				q := ApiQuery{
+					Metric:   metric,
+					Hostname: node,
+				}
+				req.Queries = append(req.Queries, q)
+				response.Queries = append(response.Queries, q)
+			}
+		}
+	}
+
+	for _, query := range req.Queries {
+		sels := make([]util.Selector, 0, 1)
+		if query.Aggregate || query.Type == nil {
+			sel := util.Selector{{String: req.Cluster}, {String: query.Hostname}}
+			if query.Type != nil {
+				if len(query.TypeIds) == 1 {
+					sel = append(sel, util.SelectorElement{String: *query.Type + query.TypeIds[0]})
+				} else {
+					ids := make([]string, len(query.TypeIds))
+					for i, id := range query.TypeIds {
+						ids[i] = *query.Type + id
+					}
+					sel = append(sel, util.SelectorElement{Group: ids})
+				}
+
+				if query.SubType != nil {
+					if len(query.SubTypeIds) == 1 {
+						sel = append(sel, util.SelectorElement{String: *query.SubType + query.SubTypeIds[0]})
+					} else {
+						ids := make([]string, len(query.SubTypeIds))
+						for i, id := range query.SubTypeIds {
+							ids[i] = *query.SubType + id
+						}
+						sel = append(sel, util.SelectorElement{Group: ids})
+					}
+				}
+			}
+			sels = append(sels, sel)
+		} else {
+			for _, typeId := range query.TypeIds {
+				if query.SubType != nil {
+					for _, subTypeId := range query.SubTypeIds {
+						sels = append(sels, util.Selector{
+							{String: req.Cluster},
+							{String: query.Hostname},
+							{String: *query.Type + typeId},
+							{String: *query.SubType + subTypeId},
+						})
+					}
+				} else {
+					sels = append(sels, util.Selector{
+						{String: req.Cluster},
+						{String: query.Hostname},
+						{String: *query.Type + typeId},
+					})
+				}
+			}
+		}
+
+		// log.Printf("query: %#v\n", query)
+		// log.Printf("sels: %#v\n", sels)
+		var err error
+		res := make([]ApiMetricData, 0, len(sels))
+		for _, sel := range sels {
+			data := ApiMetricData{}
+
+			data.Data, data.From, data.To, data.Resolution, err = ms.Read(sel, query.Metric, req.From, req.To, query.Resolution)
+
+			if err != nil {
+				msg := err.Error()
+				data.Error = &msg
+				res = append(res, data)
+				continue
+			}
+
+			if req.WithStats {
+				data.AddStats()
+			}
+			if query.ScaleFactor != 0 {
+				data.ScaleBy(query.ScaleFactor)
+			}
+			if req.WithPadding {
+				data.PadDataWithNull(ms, req.From, req.To, query.Metric)
+			}
+			if !req.WithData {
+				data.Data = nil
+			}
+			res = append(res, data)
+		}
+		response.Results = append(response.Results, res)
+	}
+
+	return &response, nil
+}
+
+// handleDebug godoc
+// @summary Debug endpoint
+// @tags debug
+// @description This endpoint allows users to print the contents of
+// nodes/clusters/metrics in order to review the state of the data.
+// @produce json
+// @param selector query string false "Selector"
+// @success 200 {string} string "Debug dump"
+// @failure 400 {object} api.ErrorResponse "Bad Request"
+// @failure 401 {object} api.ErrorResponse "Unauthorized"
+// @failure 403 {object} api.ErrorResponse "Forbidden"
+// @failure 500 {object} api.ErrorResponse "Internal Server Error"
+// @security ApiKeyAuth
+// @router /debug/ [post]
+func HandleDebug(rw http.ResponseWriter, r *http.Request) {
+	raw := r.URL.Query().Get("selector")
+	rw.Header().Add("Content-Type", "application/json")
+	selector := []string{}
+	if len(raw) != 0 {
+		selector = strings.Split(raw, ":")
+	}
+
+	ms := GetMemoryStore()
+	if err := ms.DebugDump(bufio.NewWriter(rw), selector); err != nil {
+		handleError(err, http.StatusBadRequest, rw)
+		return
+	}
+}
+
+// handleHealthCheck godoc
+// @summary HealthCheck endpoint
+// @tags healthcheck
+// @description This endpoint allows users to check whether a node is healthy
+// @produce json
+// @param cluster query string true "Cluster name"
+// @param node query string true "Node name"
+// @success 200 {string} string "Health check status"
+// @failure 400 {object} api.ErrorResponse "Bad Request"
+// @failure 401 {object} api.ErrorResponse "Unauthorized"
+// @failure 403 {object} api.ErrorResponse "Forbidden"
+// @failure 500 {object} api.ErrorResponse "Internal Server Error"
+// @security ApiKeyAuth
+// @router /healthcheck/ [get]
+func HandleHealthCheck(rw http.ResponseWriter, r *http.Request) {
+	rawCluster := r.URL.Query().Get("cluster")
+	rawNode := r.URL.Query().Get("node")
+
+	if rawCluster == "" || rawNode == "" {
+		handleError(errors.New("'cluster' and 'node' are required query parameters"), http.StatusBadRequest, rw)
+		return
+	}
+
+	rw.Header().Add("Content-Type", "application/json")
+
+	selector := []string{rawCluster, rawNode}
+
+	ms := GetMemoryStore()
+	if err := ms.HealthCheck(bufio.NewWriter(rw), selector); err != nil {
+		handleError(err, http.StatusBadRequest, rw)
+		return
+	}
+}
diff --git a/internal/memorystore/archive.go b/internal/memorystore/archive.go
index 6e25aff..7857d71 100644
--- a/internal/memorystore/archive.go
+++ b/internal/memorystore/archive.go
@@ -17,15 +17,16 @@ import (
 	"sync/atomic"
 	"time"
 
+	"github.com/ClusterCockpit/cc-backend/internal/config"
 	cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
 )
 
 func Archiving(wg *sync.WaitGroup, ctx context.Context) {
 	go func() {
 		defer wg.Done()
-		d, err := time.ParseDuration(Keys.Archive.Interval)
+		d, err := 
time.ParseDuration(config.MetricStoreKeys.Archive.Interval) if err != nil { - cclog.Fatalf("error parsing archive interval duration: %v\n", err) + cclog.Fatalf("[METRICSTORE]> error parsing archive interval duration: %v\n", err) } if d <= 0 { return @@ -43,14 +44,14 @@ func Archiving(wg *sync.WaitGroup, ctx context.Context) { return case <-ticks: t := time.Now().Add(-d) - cclog.Infof("start archiving checkpoints (older than %s)...\n", t.Format(time.RFC3339)) - n, err := ArchiveCheckpoints(Keys.Checkpoints.RootDir, - Keys.Archive.RootDir, t.Unix(), Keys.Archive.DeleteInstead) + cclog.Infof("[METRICSTORE]> start archiving checkpoints (older than %s)...\n", t.Format(time.RFC3339)) + n, err := ArchiveCheckpoints(config.MetricStoreKeys.Checkpoints.RootDir, + config.MetricStoreKeys.Archive.RootDir, t.Unix(), config.MetricStoreKeys.Archive.DeleteInstead) if err != nil { - cclog.Warnf("archiving failed: %s\n", err.Error()) + cclog.Warnf("[METRICSTORE]> archiving failed: %s\n", err.Error()) } else { - cclog.Infof("done: %d files zipped and moved to archive\n", n) + cclog.Infof("[METRICSTORE]> done: %d files zipped and moved to archive\n", n) } } } @@ -127,7 +128,7 @@ func archiveCheckpoints(dir string, archiveDir string, from int64, deleteInstead return 0, err } - extension := Keys.Checkpoints.FileFormat + extension := config.MetricStoreKeys.Checkpoints.FileFormat files, err := findFiles(entries, from, extension, false) if err != nil { return 0, err diff --git a/internal/memorystore/buffer.go b/internal/memorystore/buffer.go index d084c6d..39e9abc 100644 --- a/internal/memorystore/buffer.go +++ b/internal/memorystore/buffer.go @@ -4,7 +4,7 @@ import ( "errors" "sync" - "github.com/ClusterCockpit/cc-lib/util" + "github.com/ClusterCockpit/cc-lib/schema" ) // Default buffer capacity. @@ -19,14 +19,14 @@ const ( var bufferPool sync.Pool = sync.Pool{ New: func() interface{} { return &buffer{ - data: make([]util.Float, 0, BUFFER_CAP), + data: make([]schema.Float, 0, BUFFER_CAP), } }, } var ( - ErrNoData error = errors.New("no data for this metric/level") - ErrDataDoesNotAlign error = errors.New("data from lower granularities does not align") + ErrNoData error = errors.New("[METRICSTORE]> no data for this metric/level") + ErrDataDoesNotAlign error = errors.New("[METRICSTORE]> data from lower granularities does not align") ) // Each metric on each level has it's own buffer. @@ -36,7 +36,7 @@ var ( type buffer struct { prev *buffer next *buffer - data []util.Float + data []schema.Float frequency int64 start int64 archived bool @@ -59,9 +59,9 @@ func newBuffer(ts, freq int64) *buffer { // Otherwise, the existing buffer is returnd. // Normaly, only "newer" data should be written, but if the value would // end up in the same buffer anyways it is allowed. 
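+// Slot arithmetic, as a sketch (numbers hypothetical, ignoring the rounding
+// grace of frequency/3 applied below): with start=1000 and frequency=60,
+// ts=1003 lands in slot 0 and ts=1120 in slot 2, i.e.
+//   idx = (ts - start) / frequency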
-func (b *buffer) write(ts int64, value util.Float) (*buffer, error) { +func (b *buffer) write(ts int64, value schema.Float) (*buffer, error) { if ts < b.start { - return nil, errors.New("cannot write value to buffer from past") + return nil, errors.New("[METRICSTORE]> cannot write value to buffer from past") } // idx := int((ts - b.start + (b.frequency / 3)) / b.frequency) @@ -83,7 +83,7 @@ func (b *buffer) write(ts int64, value util.Float) (*buffer, error) { // Fill up unwritten slots with NaN for i := len(b.data); i < idx; i++ { - b.data = append(b.data, util.NaN) + b.data = append(b.data, schema.NaN) } b.data = append(b.data, value) @@ -147,7 +147,7 @@ func (b *buffer) close() { // This function goes back the buffer chain if `from` is older than the currents buffer start. // The loaded values are added to `data` and `data` is returned, possibly with a shorter length. // If `data` is not long enough to hold all values, this function will panic! -func (b *buffer) read(from, to int64, data []util.Float) ([]util.Float, int64, int64, error) { +func (b *buffer) read(from, to int64, data []schema.Float) ([]schema.Float, int64, int64, error) { if from < b.firstWrite() { if b.prev != nil { return b.prev.read(from, to, data) @@ -171,9 +171,9 @@ func (b *buffer) read(from, to int64, data []util.Float) ([]util.Float, int64, i if b.next == nil || to <= b.next.start { break } - data[i] += util.NaN + data[i] += schema.NaN } else if t < b.start { - data[i] += util.NaN + data[i] += schema.NaN // } else if b.data[idx].IsNaN() { // data[i] += interpolate(idx, b.data) } else { diff --git a/internal/memorystore/checkpoint.go b/internal/memorystore/checkpoint.go index 80a048b..76a5472 100644 --- a/internal/memorystore/checkpoint.go +++ b/internal/memorystore/checkpoint.go @@ -20,15 +20,16 @@ import ( "time" "github.com/ClusterCockpit/cc-backend/internal/avro" - "github.com/ClusterCockpit/cc-lib/util" + "github.com/ClusterCockpit/cc-backend/internal/config" + "github.com/ClusterCockpit/cc-lib/schema" "github.com/linkedin/goavro/v2" ) // Whenever changed, update MarshalJSON as well! 
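+// As a sketch, one CheckpointMetrics entry serializes roughly as
+//   {"data":[0.1,0.2,null],"frequency":60,"start":1690000000}
+// (values hypothetical; NaN entries are assumed to be encoded as null by the
+// custom MarshalJSON mentioned above).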
type CheckpointMetrics struct { - Data []util.Float `json:"data"` - Frequency int64 `json:"frequency"` - Start int64 `json:"start"` + Data []schema.Float `json:"data"` + Frequency int64 `json:"frequency"` + Start int64 `json:"start"` } type CheckpointFile struct { @@ -43,12 +44,12 @@ var lastCheckpoint time.Time func Checkpointing(wg *sync.WaitGroup, ctx context.Context) { lastCheckpoint = time.Now() - if Keys.Checkpoints.FileFormat == "json" { + if config.MetricStoreKeys.Checkpoints.FileFormat == "json" { ms := GetMemoryStore() go func() { defer wg.Done() - d, err := time.ParseDuration(Keys.Checkpoints.Interval) + d, err := time.ParseDuration(config.MetricStoreKeys.Checkpoints.Interval) if err != nil { log.Fatal(err) } @@ -67,14 +68,14 @@ func Checkpointing(wg *sync.WaitGroup, ctx context.Context) { case <-ctx.Done(): return case <-ticks: - log.Printf("start checkpointing (starting at %s)...\n", lastCheckpoint.Format(time.RFC3339)) + log.Printf("[METRICSTORE]> start checkpointing (starting at %s)...\n", lastCheckpoint.Format(time.RFC3339)) now := time.Now() - n, err := ms.ToCheckpoint(Keys.Checkpoints.RootDir, + n, err := ms.ToCheckpoint(config.MetricStoreKeys.Checkpoints.RootDir, lastCheckpoint.Unix(), now.Unix()) if err != nil { - log.Printf("checkpointing failed: %s\n", err.Error()) + log.Printf("[METRICSTORE]> checkpointing failed: %s\n", err.Error()) } else { - log.Printf("done: %d checkpoint files created\n", n) + log.Printf("[METRICSTORE]> done: %d checkpoint files created\n", n) lastCheckpoint = now } } @@ -90,7 +91,7 @@ func Checkpointing(wg *sync.WaitGroup, ctx context.Context) { return case <-time.After(time.Duration(avro.CheckpointBufferMinutes) * time.Minute): // This is the first tick untill we collect the data for given minutes. - avro.GetAvroStore().ToCheckpoint(Keys.Checkpoints.RootDir, false) + avro.GetAvroStore().ToCheckpoint(config.MetricStoreKeys.Checkpoints.RootDir, false) // log.Printf("Checkpointing %d avro files", count) } @@ -108,7 +109,7 @@ func Checkpointing(wg *sync.WaitGroup, ctx context.Context) { return case <-ticks: // Regular ticks of 1 minute to write data. 
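+				// (The first flush above waited CheckpointBufferMinutes so that
+				// enough data is staged; from here on, flush once per minute.)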
-				avro.GetAvroStore().ToCheckpoint(Keys.Checkpoints.RootDir, false)
+				avro.GetAvroStore().ToCheckpoint(config.MetricStoreKeys.Checkpoints.RootDir, false)
 				// log.Printf("Checkpointing %d avro files", count)
 			}
 		}
@@ -179,7 +180,7 @@ func (m *MemoryStore) ToCheckpoint(dir string, from, to int64) (int, error) {
 					continue
 				}
 
-				log.Printf("error while checkpointing %#v: %s", workItem.selector, err.Error())
+				log.Printf("[METRICSTORE]> error while checkpointing %#v: %s", workItem.selector, err.Error())
 				atomic.AddInt32(&errs, 1)
 			} else {
 				atomic.AddInt32(&n, 1)
@@ -201,7 +202,7 @@ func (m *MemoryStore) ToCheckpoint(dir string, from, to int64) (int, error) {
 	wg.Wait()
 
 	if errs > 0 {
-		return int(n), fmt.Errorf("%d errors happend while creating checkpoints (%d successes)", errs, n)
+		return int(n), fmt.Errorf("[METRICSTORE]> %d errors happened while creating checkpoints (%d successes)", errs, n)
 	}
 	return int(n), nil
 }
@@ -235,14 +236,14 @@ func (l *Level) toCheckpointFile(from, to int64, m *MemoryStore) (*CheckpointFil
 			continue
 		}
 
-		data := make([]util.Float, (to-from)/b.frequency+1)
+		data := make([]schema.Float, (to-from)/b.frequency+1)
 		data, start, end, err := b.read(from, to, data)
 		if err != nil {
 			return nil, err
 		}
 
 		for i := int((end - start) / b.frequency); i < len(data); i++ {
-			data[i] = util.NaN
+			data[i] = schema.NaN
 		}
 
 		retval.Metrics[metric] = &CheckpointMetrics{
@@ -314,7 +315,7 @@ func (m *MemoryStore) FromCheckpoint(dir string, from int64, extension string) (
 			lvl := m.root.findLevelOrCreate(host[:], len(m.Metrics))
 			nn, err := lvl.fromCheckpoint(m, filepath.Join(dir, host[0], host[1]), from, extension)
 			if err != nil {
-				log.Fatalf("error while loading checkpoints: %s", err.Error())
+				log.Fatalf("[METRICSTORE]> error while loading checkpoints: %s", err.Error())
 				atomic.AddInt32(&errs, 1)
 			}
 			atomic.AddInt32(&n, int32(nn))
@@ -326,7 +327,7 @@ func (m *MemoryStore) FromCheckpoint(dir string, from int64, extension string) (
 	clustersDir, err := os.ReadDir(dir)
 	for _, clusterDir := range clustersDir {
 		if !clusterDir.IsDir() {
-			err = errors.New("expected only directories at first level of checkpoints/ directory")
+			err = errors.New("[METRICSTORE]> expected only directories at first level of checkpoints/ directory")
 			goto done
 		}
 
@@ -338,7 +339,7 @@ func (m *MemoryStore) FromCheckpoint(dir string, from int64, extension string) (
 
 		for _, hostDir := range hostsDir {
 			if !hostDir.IsDir() {
-				err = errors.New("expected only directories at second level of checkpoints/ directory")
+				err = errors.New("[METRICSTORE]> expected only directories at second level of checkpoints/ directory")
 				goto done
 			}
 
@@ -364,7 +365,7 @@ done:
 	}
 
 	if errs > 0 {
-		return int(n), fmt.Errorf("%d errors happend while creating checkpoints (%d successes)", errs, n)
+		return int(n), fmt.Errorf("[METRICSTORE]> %d errors happened while loading checkpoints (%d successes)", errs, n)
 	}
 	return int(n), nil
}
@@ -377,13 +378,13 @@ func (m *MemoryStore) FromCheckpointFiles(dir string, from int64) (int, error) {
 		// The directory does not exist, so create it using os.MkdirAll()
 		err := os.MkdirAll(dir, 0755) // 0755 sets the permissions for the directory
 		if err != nil {
-			log.Fatalf("Error creating directory: %#v\n", err)
+			log.Fatalf("[METRICSTORE]> Error creating directory: %#v\n", err)
 		}
-		fmt.Printf("%#v Directory created successfully.\n", dir)
+		fmt.Printf("[METRICSTORE]> %#v Directory created successfully.\n", dir)
 	}
 
 	// Config read (replace with your actual config read)
-	fileFormat := Keys.Checkpoints.FileFormat
+	fileFormat := 
config.MetricStoreKeys.Checkpoints.FileFormat if fileFormat == "" { fileFormat = "avro" } @@ -396,22 +397,22 @@ func (m *MemoryStore) FromCheckpointFiles(dir string, from int64) (int, error) { // First, attempt to load the specified format if found, err := checkFilesWithExtension(dir, fileFormat); err != nil { - return 0, fmt.Errorf("error checking files with extension: %v", err) + return 0, fmt.Errorf("[METRICSTORE]> error checking files with extension: %v", err) } else if found { - log.Printf("Loading %s files because fileformat is %s\n", fileFormat, fileFormat) + log.Printf("[METRICSTORE]> Loading %s files because fileformat is %s\n", fileFormat, fileFormat) return m.FromCheckpoint(dir, from, fileFormat) } // If not found, attempt the opposite format altFormat := oppositeFormat[fileFormat] if found, err := checkFilesWithExtension(dir, altFormat); err != nil { - return 0, fmt.Errorf("error checking files with extension: %v", err) + return 0, fmt.Errorf("[METRICSTORE]> error checking files with extension: %v", err) } else if found { - log.Printf("Loading %s files but fileformat is %s\n", altFormat, fileFormat) + log.Printf("[METRICSTORE]> Loading %s files but fileformat is %s\n", altFormat, fileFormat) return m.FromCheckpoint(dir, from, altFormat) } - log.Println("No valid checkpoint files found in the directory.") + log.Println("[METRICSTORE]> No valid checkpoint files found in the directory.") return 0, nil } @@ -420,7 +421,7 @@ func checkFilesWithExtension(dir string, extension string) (bool, error) { err := filepath.Walk(dir, func(path string, info os.FileInfo, err error) error { if err != nil { - return fmt.Errorf("error accessing path %s: %v", path, err) + return fmt.Errorf("[METRICSTORE]> error accessing path %s: %v", path, err) } if !info.IsDir() && filepath.Ext(info.Name()) == "."+extension { found = true @@ -429,7 +430,7 @@ func checkFilesWithExtension(dir string, extension string) (bool, error) { return nil }) if err != nil { - return false, fmt.Errorf("error walking through directories: %s", err) + return false, fmt.Errorf("[METRICSTORE]> error walking through directories: %s", err) } return found, nil @@ -441,7 +442,7 @@ func (l *Level) loadAvroFile(m *MemoryStore, f *os.File, from int64) error { fileName := f.Name()[strings.LastIndex(f.Name(), "/")+1:] resolution, err := strconv.ParseInt(fileName[0:strings.Index(fileName, "_")], 10, 64) if err != nil { - return fmt.Errorf("error while reading avro file (resolution parsing) : %s", err) + return fmt.Errorf("[METRICSTORE]> error while reading avro file (resolution parsing) : %s", err) } from_timestamp, err := strconv.ParseInt(fileName[strings.Index(fileName, "_")+1:len(fileName)-5], 10, 64) @@ -450,7 +451,7 @@ func (l *Level) loadAvroFile(m *MemoryStore, f *os.File, from int64) error { from_timestamp -= (resolution / 2) if err != nil { - return fmt.Errorf("error converting timestamp from the avro file : %s", err) + return fmt.Errorf("[METRICSTORE]> error converting timestamp from the avro file : %s", err) } // fmt.Printf("File : %s with resolution : %d\n", fileName, resolution) @@ -463,21 +464,21 @@ func (l *Level) loadAvroFile(m *MemoryStore, f *os.File, from int64) error { panic(err) } - metricsData := make(map[string]util.FloatArray) + metricsData := make(map[string]schema.FloatArray) for ocfReader.Scan() { datum, err := ocfReader.Read() if err != nil { - return fmt.Errorf("error while reading avro file : %s", err) + return fmt.Errorf("[METRICSTORE]> error while reading avro file : %s", err) } record, ok := 
datum.(map[string]interface{}) if !ok { - panic("failed to assert datum as map[string]interface{}") + panic("[METRICSTORE]> failed to assert datum as map[string]interface{}") } for key, value := range record { - metricsData[key] = append(metricsData[key], util.ConvertToFloat(value.(float64))) + metricsData[key] = append(metricsData[key], schema.ConvertToFloat(value.(float64))) } recordCounter += 1 @@ -518,12 +519,12 @@ func (l *Level) loadAvroFile(m *MemoryStore, f *os.File, from int64) error { leafMetricName := subString[len(subString)-1] err = lvl.createBuffer(m, leafMetricName, floatArray, from_timestamp, resolution) if err != nil { - return fmt.Errorf("error while creating buffers from avroReader : %s", err) + return fmt.Errorf("[METRICSTORE]> error while creating buffers from avroReader : %s", err) } } else { err = l.createBuffer(m, metricName, floatArray, from_timestamp, resolution) if err != nil { - return fmt.Errorf("error while creating buffers from avroReader : %s", err) + return fmt.Errorf("[METRICSTORE]> error while creating buffers from avroReader : %s", err) } } @@ -532,7 +533,7 @@ func (l *Level) loadAvroFile(m *MemoryStore, f *os.File, from int64) error { return nil } -func (l *Level) createBuffer(m *MemoryStore, metricName string, floatArray util.FloatArray, from int64, resolution int64) error { +func (l *Level) createBuffer(m *MemoryStore, metricName string, floatArray schema.FloatArray, from int64, resolution int64) error { n := len(floatArray) b := &buffer{ frequency: resolution, @@ -566,7 +567,7 @@ func (l *Level) createBuffer(m *MemoryStore, metricName string, floatArray util. missingCount /= int(b.frequency) for range missingCount { - prev.data = append(prev.data, util.NaN) + prev.data = append(prev.data, schema.NaN) } prev.data = prev.data[0:len(prev.data):len(prev.data)] diff --git a/internal/memorystore/config.go b/internal/memorystore/config.go deleted file mode 100644 index 0d8a8ab..0000000 --- a/internal/memorystore/config.go +++ /dev/null @@ -1,26 +0,0 @@ -// Copyright (C) NHR@FAU, University Erlangen-Nuremberg. -// All rights reserved. This file is part of cc-backend. -// Use of this source code is governed by a MIT-style -// license that can be found in the LICENSE file. 
-package memorystore
-
-type MetricStoreConfig struct {
-	Checkpoints struct {
-		FileFormat string `json:"file-format"`
-		Interval   string `json:"interval"`
-		RootDir    string `json:"directory"`
-		Restore    string `json:"restore"`
-	} `json:"checkpoints"`
-	Debug struct {
-		DumpToFile string `json:"dump-to-file"`
-		EnableGops bool   `json:"gops"`
-	} `json:"debug"`
-	RetentionInMemory string `json:"retention-in-memory"`
-	Archive struct {
-		Interval      string `json:"interval"`
-		RootDir       string `json:"directory"`
-		DeleteInstead bool   `json:"delete-instead"`
-	} `json:"archive"`
-}
-
-var Keys MetricStoreConfig
diff --git a/internal/memorystore/debug.go b/internal/memorystore/debug.go
index 2743a45..0f85024 100644
--- a/internal/memorystore/debug.go
+++ b/internal/memorystore/debug.go
@@ -87,7 +87,7 @@ func (l *Level) debugDump(m *MemoryStore, w *bufio.Writer, lvlname string, buf [
 func (m *MemoryStore) DebugDump(w *bufio.Writer, selector []string) error {
 	lvl := m.root.findLevel(selector)
 	if lvl == nil {
-		return fmt.Errorf("not found: %#v", selector)
+		return fmt.Errorf("[METRICSTORE]> not found: %#v", selector)
 	}
 
 	buf := make([]byte, 0, 2048)
diff --git a/internal/memorystore/healthcheck.go b/internal/memorystore/healthcheck.go
index cb22d49..d655db3 100644
--- a/internal/memorystore/healthcheck.go
+++ b/internal/memorystore/healthcheck.go
@@ -59,7 +59,7 @@ func (l *Level) healthCheck(m *MemoryStore, count int64) (int64, error) {
 func (m *MemoryStore) HealthCheck(w *bufio.Writer, selector []string) error {
 	lvl := m.root.findLevel(selector)
 	if lvl == nil {
-		return fmt.Errorf("not found: %#v", selector)
+		return fmt.Errorf("[METRICSTORE]> not found: %#v", selector)
 	}
 
 	buf := make([]byte, 0, 25)
diff --git a/internal/memorystore/lineprotocol.go b/internal/memorystore/lineprotocol.go
new file mode 100644
index 0000000..e12b9e2
--- /dev/null
+++ b/internal/memorystore/lineprotocol.go
@@ -0,0 +1,349 @@
+package memorystore
+
+import (
+	"context"
+	"errors"
+	"fmt"
+	"log"
+	"net"
+	"sync"
+	"time"
+
+	"github.com/ClusterCockpit/cc-backend/internal/avro"
+	"github.com/ClusterCockpit/cc-backend/internal/config"
+	"github.com/ClusterCockpit/cc-lib/schema"
+	"github.com/influxdata/line-protocol/v2/lineprotocol"
+	"github.com/nats-io/nats.go"
+)
+
+// Each connection is handled in its own goroutine. This is a blocking function.
+func ReceiveRaw(ctx context.Context,
+	listener net.Listener,
+	handleLine func(*lineprotocol.Decoder, string) error,
+) error {
+	var wg sync.WaitGroup
+
+	wg.Add(1)
+	go func() {
+		defer wg.Done()
+		<-ctx.Done()
+		if err := listener.Close(); err != nil {
+			log.Printf("listener.Close(): %s", err.Error())
+		}
+	}()
+
+	for {
+		conn, err := listener.Accept()
+		if err != nil {
+			if errors.Is(err, net.ErrClosed) {
+				break
+			}
+
+			log.Printf("listener.Accept(): %s", err.Error())
+			// Without this, a failed accept would fall through and
+			// dereference the nil conn below.
+			continue
+		}
+
+		wg.Add(2)
+		go func() {
+			defer wg.Done()
+			defer conn.Close()
+
+			dec := lineprotocol.NewDecoder(conn)
+			connctx, cancel := context.WithCancel(context.Background())
+			defer cancel()
+			go func() {
+				defer wg.Done()
+				select {
+				case <-connctx.Done():
+					conn.Close()
+				case <-ctx.Done():
+					conn.Close()
+				}
+			}()
+
+			if err := handleLine(dec, "default"); err != nil {
+				if errors.Is(err, net.ErrClosed) {
+					return
+				}
+
+				log.Printf("%s: %s", conn.RemoteAddr().String(), err.Error())
+				// Zero length, capacity 128: appending to a 128-byte slice
+				// would send 128 NUL bytes ahead of the message.
+				errmsg := make([]byte, 0, 128)
+				errmsg = append(errmsg, `error: `...)
+				errmsg = append(errmsg, err.Error()...)
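+				// A rejected client gets one reply line back on the raw
+				// socket, e.g. (hypothetical): "error: expected tag key"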
+				errmsg = append(errmsg, '\n')
+				conn.Write(errmsg)
+			}
+		}()
+	}
+
+	wg.Wait()
+	return nil
+}
+
+// Connect to a nats server and subscribe to "updates". This is a blocking
+// function. Every line received via nats is decoded and written to the
+// memory store. Cancel the passed context for graceful termination.
+func ReceiveNats(conf *(config.NatsConfig),
+	ms *MemoryStore,
+	workers int,
+	ctx context.Context,
+) error {
+	var opts []nats.Option
+	if conf.Username != "" && conf.Password != "" {
+		opts = append(opts, nats.UserInfo(conf.Username, conf.Password))
+	}
+
+	if conf.Credsfilepath != "" {
+		opts = append(opts, nats.UserCredentials(conf.Credsfilepath))
+	}
+
+	nc, err := nats.Connect(conf.Address, opts...)
+	if err != nil {
+		return err
+	}
+	defer nc.Close()
+
+	var wg sync.WaitGroup
+	var subs []*nats.Subscription
+
+	msgs := make(chan *nats.Msg, workers*2)
+
+	for _, sc := range conf.Subscriptions {
+		clusterTag := sc.ClusterTag
+		var sub *nats.Subscription
+		if workers > 1 {
+			wg.Add(workers)
+
+			for i := 0; i < workers; i++ {
+				go func() {
+					for m := range msgs {
+						dec := lineprotocol.NewDecoderWithBytes(m.Data)
+						if err := decodeLine(dec, ms, clusterTag); err != nil {
+							log.Printf("error: %s\n", err.Error())
+						}
+					}
+
+					wg.Done()
+				}()
+			}
+
+			sub, err = nc.Subscribe(sc.SubscribeTo, func(m *nats.Msg) {
+				msgs <- m
+			})
+		} else {
+			sub, err = nc.Subscribe(sc.SubscribeTo, func(m *nats.Msg) {
+				dec := lineprotocol.NewDecoderWithBytes(m.Data)
+				if err := decodeLine(dec, ms, clusterTag); err != nil {
+					log.Printf("error: %s\n", err.Error())
+				}
+			})
+		}
+
+		if err != nil {
+			return err
+		}
+		log.Printf("NATS subscription to '%s' on '%s' established\n", sc.SubscribeTo, conf.Address)
+		subs = append(subs, sub)
+	}
+
+	<-ctx.Done()
+	for _, sub := range subs {
+		err = sub.Unsubscribe()
+		if err != nil {
+			log.Printf("NATS unsubscribe failed: %s", err.Error())
+		}
+	}
+	close(msgs)
+	wg.Wait()
+
+	nc.Close()
+	log.Println("NATS connection closed")
+	return nil
+}
+
+// Place `prefix` in front of `buf` but if possible,
+// do that inplace in `buf`.
+func reorder(buf, prefix []byte) []byte {
+	n := len(prefix)
+	m := len(buf)
+	if cap(buf) < m+n {
+		return append(prefix[:n:n], buf...)
+	} else {
+		buf = buf[:n+m]
+		for i := m - 1; i >= 0; i-- {
+			buf[i+n] = buf[i]
+		}
+		for i := 0; i < n; i++ {
+			buf[i] = prefix[i]
+		}
+		return buf
+	}
+}
+
+// Decode lines using dec and make write calls to the MemoryStore.
+// If a line is missing its cluster tag, use clusterDefault as default.
+func decodeLine(dec *lineprotocol.Decoder,
+	ms *MemoryStore,
+	clusterDefault string,
+) error {
+	// Reduce allocations in loop:
+	t := time.Now()
+	metric, metricBuf := Metric{}, make([]byte, 0, 16)
+	selector := make([]string, 0, 4)
+	typeBuf, subTypeBuf := make([]byte, 0, 16), make([]byte, 0)
+
+	// Optimize for the case where all lines in a "batch" are about the same
+	// cluster and host. By using `WriteToLevel` (level = host), we do not need
+	// to take the root- and cluster-level lock as often.
+	var lvl *Level = nil
+	prevCluster, prevHost := "", ""
+
+	var ok bool
+	for dec.Next() {
+		rawmeasurement, err := dec.Measurement()
+		if err != nil {
+			return err
+		}
+
+		// Needs to be copied because another call to dec.* would
+		// invalidate the returned slice.
+		metricBuf = append(metricBuf[:0], rawmeasurement...)
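+		// Worked example (hypothetical): for the input line
+		//   flops_any,cluster=fritz,hostname=f0101,type=hwthread,type-id=3 value=42 1725000000
+		// metricBuf now holds "flops_any"; the tag loop below assembles
+		// typeBuf = "hwthread3" even if type-id arrives before type (see reorder()).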
+
+		// The go compiler optimizes map[string(byteslice)] lookups:
+		metric.MetricConfig, ok = ms.Metrics[string(rawmeasurement)]
+		if !ok {
+			continue
+		}
+
+		typeBuf, subTypeBuf := typeBuf[:0], subTypeBuf[:0]
+		cluster, host := clusterDefault, ""
+		for {
+			key, val, err := dec.NextTag()
+			if err != nil {
+				return err
+			}
+			if key == nil {
+				break
+			}
+
+			// The go compiler optimizes string([]byte{...}) == "...":
+			switch string(key) {
+			case "cluster":
+				if string(val) == prevCluster {
+					cluster = prevCluster
+				} else {
+					cluster = string(val)
+					lvl = nil
+				}
+			case "hostname", "host":
+				if string(val) == prevHost {
+					host = prevHost
+				} else {
+					host = string(val)
+					lvl = nil
+				}
+			case "type":
+				if string(val) == "node" {
+					break
+				}
+
+				// We cannot be sure that the "type" tag comes before the "type-id" tag:
+				if len(typeBuf) == 0 {
+					typeBuf = append(typeBuf, val...)
+				} else {
+					typeBuf = reorder(typeBuf, val)
+				}
+			case "type-id":
+				typeBuf = append(typeBuf, val...)
+			case "subtype":
+				// We cannot be sure that the "subtype" tag comes before the "stype-id" tag:
+				if len(subTypeBuf) == 0 {
+					subTypeBuf = append(subTypeBuf, val...)
+				} else {
+					subTypeBuf = reorder(subTypeBuf, val)
+					// subTypeBuf = reorder(typeBuf, val)
+				}
+			case "stype-id":
+				subTypeBuf = append(subTypeBuf, val...)
+			default:
+				// Ignore unknown tags (cc-metric-collector might, for example, send us a unit that we do not need)
+				// return fmt.Errorf("unknown tag: '%s' (value: '%s')", string(key), string(val))
+			}
+		}
+
+		// If the cluster or host changed, the lvl was set to nil
+		if lvl == nil {
+			selector = selector[:2]
+			selector[0], selector[1] = cluster, host
+			lvl = ms.GetLevel(selector)
+			prevCluster, prevHost = cluster, host
+		}
+
+		// subtypes:
+		selector = selector[:0]
+		if len(typeBuf) > 0 {
+			selector = append(selector, string(typeBuf)) // <- Allocation :(
+			if len(subTypeBuf) > 0 {
+				selector = append(selector, string(subTypeBuf))
+			}
+		}
+
+		for {
+			key, val, err := dec.NextField()
+			if err != nil {
+				return err
+			}
+
+			if key == nil {
+				break
+			}
+
+			if string(key) != "value" {
+				return fmt.Errorf("host %s: unknown field: '%s' (value: %#v)", host, string(key), val)
+			}
+
+			if val.Kind() == lineprotocol.Float {
+				metric.Value = schema.Float(val.FloatV())
+			} else if val.Kind() == lineprotocol.Int {
+				metric.Value = schema.Float(val.IntV())
+			} else if val.Kind() == lineprotocol.Uint {
+				metric.Value = schema.Float(val.UintV())
+			} else {
+				return fmt.Errorf("host %s: unsupported value type in message: %s", host, val.Kind().String())
+			}
+		}
+
+		// Try decreasing timestamp precisions until one of them parses.
+		if t, err = dec.Time(lineprotocol.Second, t); err != nil {
+			t = time.Now()
+			if t, err = dec.Time(lineprotocol.Millisecond, t); err != nil {
+				t = time.Now()
+				if t, err = dec.Time(lineprotocol.Microsecond, t); err != nil {
+					t = time.Now()
+					if t, err = dec.Time(lineprotocol.Nanosecond, t); err != nil {
+						return fmt.Errorf("host %s: timestamp: %#v with error: %#v", host, t, err.Error())
+					}
+				}
+			}
+		}
+
+		if err != nil {
+			return fmt.Errorf("host %s: timestamp: %#v with error: %#v", host, t, err.Error())
+		}
+
+		time := t.Unix()
+
+		if config.MetricStoreKeys.Checkpoints.FileFormat != "json" {
+			avro.LineProtocolMessages <- &avro.AvroStruct{
+				MetricName: string(metricBuf),
+				Cluster:    cluster,
+				Node:       host,
+				Selector:   append([]string{}, selector...),
+				Value:      metric.Value,
+				Timestamp:  time}
+		}
+
+		if err := ms.WriteToLevel(lvl, selector, time, []Metric{metric}); err != nil {
+			return err
+		}
+	}
+	return nil
+}
diff --git a/internal/memorystore/memorystore.go 
b/internal/memorystore/memorystore.go index 76079d4..efa4065 100644 --- a/internal/memorystore/memorystore.go +++ b/internal/memorystore/memorystore.go @@ -2,16 +2,19 @@ package memorystore import ( "context" - "encoding/json" "errors" - "fmt" "log" + "os" + "os/signal" "runtime" "sync" + "syscall" "time" "github.com/ClusterCockpit/cc-backend/internal/avro" + "github.com/ClusterCockpit/cc-backend/internal/config" "github.com/ClusterCockpit/cc-lib/resampler" + "github.com/ClusterCockpit/cc-lib/runtimeEnv" "github.com/ClusterCockpit/cc-lib/schema" "github.com/ClusterCockpit/cc-lib/util" ) @@ -21,6 +24,8 @@ var ( msInstance *MemoryStore ) +var Clusters = make([]string, 0) + var NumWorkers int = 4 func init() { @@ -31,77 +36,38 @@ func init() { } } -// For aggregation over multiple values at different cpus/sockets/..., not time! -type AggregationStrategy int - -const ( - NoAggregation AggregationStrategy = iota - SumAggregation - AvgAggregation -) - -func (as *AggregationStrategy) UnmarshalJSON(data []byte) error { - var str string - if err := json.Unmarshal(data, &str); err != nil { - return err - } - - switch str { - case "": - *as = NoAggregation - case "sum": - *as = SumAggregation - case "avg": - *as = AvgAggregation - default: - return fmt.Errorf("invalid aggregation strategy: %#v", str) - } - return nil -} - -type MetricConfig struct { - // Interval in seconds at which measurements will arive. - Frequency int64 `json:"frequency"` - - // Can be 'sum', 'avg' or null. Describes how to aggregate metrics from the same timestep over the hierarchy. - Aggregation AggregationStrategy `json:"aggregation"` - - // Private, used internally... - Offset int -} - type Metric struct { Name string - Value util.Float - MetricConfig MetricConfig + Value schema.Float + MetricConfig config.MetricConfig } type MemoryStore struct { - Metrics map[string]MetricConfig + Metrics map[string]config.MetricConfig root Level } -func Init() { +func Init(wg sync.WaitGroup) { startupTime := time.Now() - //Pass the keys from cluster config - InitMetrics() + //Pass the config.MetricStoreKeys + InitMetrics(config.Metrics) ms := GetMemoryStore() - d, err := time.ParseDuration(Keys.Checkpoints.Restore) + d, err := time.ParseDuration(config.MetricStoreKeys.Checkpoints.Restore) if err != nil { log.Fatal(err) } restoreFrom := startupTime.Add(-d) - log.Printf("Loading checkpoints newer than %s\n", restoreFrom.Format(time.RFC3339)) - files, err := ms.FromCheckpointFiles(Keys.Checkpoints.RootDir, restoreFrom.Unix()) + log.Printf("[METRICSTORE]> Loading checkpoints newer than %s\n", restoreFrom.Format(time.RFC3339)) + files, err := ms.FromCheckpointFiles(config.MetricStoreKeys.Checkpoints.RootDir, restoreFrom.Unix()) loadedData := ms.SizeInBytes() / 1024 / 1024 // In MB if err != nil { - log.Fatalf("Loading checkpoints failed: %s\n", err.Error()) + log.Fatalf("[METRICSTORE]> Loading checkpoints failed: %s\n", err.Error()) } else { - log.Printf("Checkpoints loaded (%d files, %d MB, that took %fs)\n", files, loadedData, time.Since(startupTime).Seconds()) + log.Printf("[METRICSTORE]> Checkpoints loaded (%d files, %d MB, that took %fs)\n", files, loadedData, time.Since(startupTime).Seconds()) } // Try to use less memory by forcing a GC run here and then @@ -112,28 +78,53 @@ func Init() { // to a minumum. 
runtime.GC() - ctx, _ := context.WithCancel(context.Background()) + ctx, shutdown := context.WithCancel(context.Background()) - var wg sync.WaitGroup wg.Add(4) Retention(&wg, ctx) Checkpointing(&wg, ctx) Archiving(&wg, ctx) avro.DataStaging(&wg, ctx) + + wg.Add(1) + sigs := make(chan os.Signal, 1) + signal.Notify(sigs, syscall.SIGINT, syscall.SIGTERM) + go func() { + defer wg.Done() + <-sigs + runtimeEnv.SystemdNotifiy(false, "[METRICSTORE]> Shutting down ...") + shutdown() + }() + + if config.MetricStoreKeys.Nats != nil { + for _, natsConf := range config.MetricStoreKeys.Nats { + // TODO: When multiple nats configs share a URL, do a single connect. + wg.Add(1) + nc := natsConf + go func() { + // err := ReceiveNats(conf.Nats, decodeLine, runtime.NumCPU()-1, ctx) + err := ReceiveNats(nc, ms, 1, ctx) + if err != nil { + log.Fatal(err) + } + wg.Done() + }() + } + } } // Create a new, initialized instance of a MemoryStore. // Will panic if values in the metric configurations are invalid. -func InitMetrics(metrics map[string]MetricConfig) { +func InitMetrics(metrics map[string]config.MetricConfig) { singleton.Do(func() { offset := 0 for key, cfg := range metrics { if cfg.Frequency == 0 { - panic("invalid frequency") + panic("[METRICSTORE]> invalid frequency") } - metrics[key] = MetricConfig{ + metrics[key] = config.MetricConfig{ Frequency: cfg.Frequency, Aggregation: cfg.Aggregation, Offset: offset, @@ -153,30 +144,30 @@ func InitMetrics(metrics map[string]MetricConfig) { func GetMemoryStore() *MemoryStore { if msInstance == nil { - log.Fatalf("MemoryStore not initialized!") + log.Fatalf("[METRICSTORE]> MemoryStore not initialized!") } return msInstance } func Shutdown() { - log.Printf("Writing to '%s'...\n", Keys.Checkpoints.RootDir) + log.Printf("[METRICSTORE]> Writing to '%s'...\n", config.MetricStoreKeys.Checkpoints.RootDir) var files int var err error ms := GetMemoryStore() - if Keys.Checkpoints.FileFormat == "json" { - files, err = ms.ToCheckpoint(Keys.Checkpoints.RootDir, lastCheckpoint.Unix(), time.Now().Unix()) + if config.MetricStoreKeys.Checkpoints.FileFormat == "json" { + files, err = ms.ToCheckpoint(config.MetricStoreKeys.Checkpoints.RootDir, lastCheckpoint.Unix(), time.Now().Unix()) } else { - files, err = avro.GetAvroStore().ToCheckpoint(Keys.Checkpoints.RootDir, true) + files, err = avro.GetAvroStore().ToCheckpoint(config.MetricStoreKeys.Checkpoints.RootDir, true) close(avro.LineProtocolMessages) } if err != nil { - log.Printf("Writing checkpoint failed: %s\n", err.Error()) + log.Printf("[METRICSTORE]> Writing checkpoint failed: %s\n", err.Error()) } - log.Printf("Done! (%d files written)\n", files) + log.Printf("[METRICSTORE]> Done! 
(%d files written)\n", files)
 
 	// ms.PrintHeirarchy()
 }
@@ -255,7 +246,7 @@ func Retention(wg *sync.WaitGroup, ctx context.Context) {
 	go func() {
 		defer wg.Done()
 
-		d, err := time.ParseDuration(Keys.RetentionInMemory)
+		d, err := time.ParseDuration(config.MetricStoreKeys.RetentionInMemory)
 		if err != nil {
 			log.Fatal(err)
 		}
@@ -276,12 +267,12 @@ func Retention(wg *sync.WaitGroup, ctx context.Context) {
 				return
 			case <-ticks:
 				t := time.Now().Add(-d)
-				log.Printf("start freeing buffers (older than %s)...\n", t.Format(time.RFC3339))
+				log.Printf("[METRICSTORE]> start freeing buffers (older than %s)...\n", t.Format(time.RFC3339))
 				freed, err := ms.Free(nil, t.Unix())
 				if err != nil {
-					log.Printf("freeing up buffers failed: %s\n", err.Error())
+					log.Printf("[METRICSTORE]> freeing up buffers failed: %s\n", err.Error())
 				} else {
-					log.Printf("done: %d buffers freed\n", freed)
+					log.Printf("[METRICSTORE]> done: %d buffers freed\n", freed)
 				}
 			}
 		}
@@ -346,12 +337,12 @@ func (m *MemoryStore) WriteToLevel(l *Level, selector []string, ts int64, metric
 // the range asked for if no data was available.
 func (m *MemoryStore) Read(selector util.Selector, metric string, from, to, resolution int64) ([]schema.Float, int64, int64, int64, error) {
 	if from > to {
-		return nil, 0, 0, 0, errors.New("invalid time range")
+		return nil, 0, 0, 0, errors.New("[METRICSTORE]> invalid time range")
 	}
 
 	minfo, ok := m.Metrics[metric]
 	if !ok {
-		return nil, 0, 0, 0, errors.New("unkown metric: " + metric)
+		return nil, 0, 0, 0, errors.New("[METRICSTORE]> unknown metric: " + metric)
 	}
 
 	n, data := 0, make([]schema.Float, (to-from)/minfo.Frequency+1)
@@ -390,15 +381,15 @@ func (m *MemoryStore) Read(selector util.Selector, metric string, from, to, reso
 	if err != nil {
 		return nil, 0, 0, 0, err
 	} else if n == 0 {
-		return nil, 0, 0, 0, errors.New("metric or host not found")
+		return nil, 0, 0, 0, errors.New("[METRICSTORE]> metric or host not found")
 	} else if n > 1 {
-		if minfo.Aggregation == AvgAggregation {
+		if minfo.Aggregation == config.AvgAggregation {
 			normalize := 1. / schema.Float(n)
 			for i := 0; i < len(data); i++ {
 				data[i] *= normalize
 			}
-		} else if minfo.Aggregation != SumAggregation {
-			return nil, 0, 0, 0, errors.New("invalid aggregation")
+		} else if minfo.Aggregation != config.SumAggregation {
+			return nil, 0, 0, 0, errors.New("[METRICSTORE]> invalid aggregation")
 		}
 	}
 
diff --git a/internal/memorystore/stats.go b/internal/memorystore/stats.go
index 831e282..1066dcb 100644
--- a/internal/memorystore/stats.go
+++ b/internal/memorystore/stats.go
@@ -4,6 +4,7 @@ import (
 	"errors"
 	"math"
 
+	"github.com/ClusterCockpit/cc-backend/internal/config"
 	"github.com/ClusterCockpit/cc-lib/util"
 )
 
@@ -104,9 +105,9 @@ func (m *MemoryStore) Stats(selector util.Selector, metric string, from, to int6
 		return nil, 0, 0, ErrNoData
 	}
 
-	if minfo.Aggregation == AvgAggregation {
+	if minfo.Aggregation == config.AvgAggregation {
 		avg /= util.Float(n)
-	} else if n > 1 && minfo.Aggregation != SumAggregation {
+	} else if n > 1 && minfo.Aggregation != config.SumAggregation {
 		return nil, 0, 0, errors.New("invalid aggregation")
 	}
 
diff --git a/internal/metricDataDispatcher/dataLoader.go b/internal/metricDataDispatcher/dataLoader.go
index 2b73e11..4f8e3b5 100644
--- a/internal/metricDataDispatcher/dataLoader.go
+++ b/internal/metricDataDispatcher/dataLoader.go
@@ -91,14 +91,14 @@ func LoadData(job *schema.Job,
 		// Pass the resolution from frontend here.
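+		// LargestTriangleThreeBucket downsamples each series from its native
+		// timestep to (at most) the requested resolution and reports the
+		// timestep it actually produced, e.g. 60s data requested at 240s
+		// comes back with timestep 240 (numbers hypothetical).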
for _, v := range jd { for _, v_ := range v { - timestep := 0 + timestep := int64(0) for i := 0; i < len(v_.Series); i += 1 { - v_.Series[i].Data, timestep, err = resampler.LargestTriangleThreeBucket(v_.Series[i].Data, v_.Timestep, resolution) + v_.Series[i].Data, timestep, err = resampler.LargestTriangleThreeBucket(v_.Series[i].Data, int64(v_.Timestep), int64(resolution)) if err != nil { return err, 0, 0 } } - v_.Timestep = timestep + v_.Timestep = int(timestep) } } diff --git a/internal/metricdata/cc-metric-store.go b/internal/metricdata/cc-metric-store.go index 36c0dd7..d8cef4d 100644 --- a/internal/metricdata/cc-metric-store.go +++ b/internal/metricdata/cc-metric-store.go @@ -5,23 +5,22 @@ package metricdata import ( - "bufio" - "bytes" "context" "encoding/json" "fmt" - "net/http" "sort" "strconv" "strings" "time" "github.com/ClusterCockpit/cc-backend/internal/graph/model" + "github.com/ClusterCockpit/cc-backend/internal/memorystore" "github.com/ClusterCockpit/cc-backend/pkg/archive" cclog "github.com/ClusterCockpit/cc-lib/ccLogger" "github.com/ClusterCockpit/cc-lib/schema" ) +// Bloat Code type CCMetricStoreConfig struct { Kind string `json:"kind"` Url string `json:"url"` @@ -33,141 +32,16 @@ type CCMetricStoreConfig struct { Renamings map[string]string `json:"metricRenamings"` } +// Bloat Code type CCMetricStore struct { - here2there map[string]string - there2here map[string]string - client http.Client - jwt string - url string - queryEndpoint string -} - -type ApiQueryRequest struct { - Cluster string `json:"cluster"` - Queries []ApiQuery `json:"queries"` - ForAllNodes []string `json:"for-all-nodes"` - From int64 `json:"from"` - To int64 `json:"to"` - WithStats bool `json:"with-stats"` - WithData bool `json:"with-data"` -} - -type ApiQuery struct { - Type *string `json:"type,omitempty"` - SubType *string `json:"subtype,omitempty"` - Metric string `json:"metric"` - Hostname string `json:"host"` - Resolution int `json:"resolution"` - TypeIds []string `json:"type-ids,omitempty"` - SubTypeIds []string `json:"subtype-ids,omitempty"` - Aggregate bool `json:"aggreg"` -} - -type ApiQueryResponse struct { - Queries []ApiQuery `json:"queries,omitempty"` - Results [][]ApiMetricData `json:"results"` -} - -type ApiMetricData struct { - Error *string `json:"error"` - Data []schema.Float `json:"data"` - From int64 `json:"from"` - To int64 `json:"to"` - Resolution int `json:"resolution"` - Avg schema.Float `json:"avg"` - Min schema.Float `json:"min"` - Max schema.Float `json:"max"` } +// Bloat Code func (ccms *CCMetricStore) Init(rawConfig json.RawMessage) error { - var config CCMetricStoreConfig - if err := json.Unmarshal(rawConfig, &config); err != nil { - cclog.Warn("Error while unmarshaling raw json config") - return err - } - - ccms.url = config.Url - ccms.queryEndpoint = fmt.Sprintf("%s/api/query", config.Url) - ccms.jwt = config.Token - ccms.client = http.Client{ - Timeout: 10 * time.Second, - } - - if config.Renamings != nil { - ccms.here2there = config.Renamings - ccms.there2here = make(map[string]string, len(config.Renamings)) - for k, v := range ccms.here2there { - ccms.there2here[v] = k - } - } else { - ccms.here2there = make(map[string]string) - ccms.there2here = make(map[string]string) - } return nil } -func (ccms *CCMetricStore) toRemoteName(metric string) string { - if renamed, ok := ccms.here2there[metric]; ok { - return renamed - } - - return metric -} - -func (ccms *CCMetricStore) toLocalName(metric string) string { - if renamed, ok := ccms.there2here[metric]; ok { - return 
renamed - } - - return metric -} - -func (ccms *CCMetricStore) doRequest( - ctx context.Context, - body *ApiQueryRequest, -) (*ApiQueryResponse, error) { - buf := &bytes.Buffer{} - if err := json.NewEncoder(buf).Encode(body); err != nil { - cclog.Errorf("Error while encoding request body: %s", err.Error()) - return nil, err - } - - req, err := http.NewRequestWithContext(ctx, http.MethodGet, ccms.queryEndpoint, buf) - if err != nil { - cclog.Errorf("Error while building request body: %s", err.Error()) - return nil, err - } - if ccms.jwt != "" { - req.Header.Add("Authorization", fmt.Sprintf("Bearer %s", ccms.jwt)) - } - - // versioning the cc-metric-store query API. - // v2 = data with resampling - // v1 = data without resampling - q := req.URL.Query() - q.Add("version", "v2") - req.URL.RawQuery = q.Encode() - - res, err := ccms.client.Do(req) - if err != nil { - cclog.Errorf("Error while performing request: %s", err.Error()) - return nil, err - } - - if res.StatusCode != http.StatusOK { - return nil, fmt.Errorf("'%s': HTTP Status: %s", ccms.queryEndpoint, res.Status) - } - - var resBody ApiQueryResponse - if err := json.NewDecoder(bufio.NewReader(res.Body)).Decode(&resBody); err != nil { - cclog.Errorf("Error while decoding result body: %s", err.Error()) - return nil, err - } - - return &resBody, nil -} - func (ccms *CCMetricStore) LoadData( job *schema.Job, metrics []string, @@ -175,13 +49,13 @@ func (ccms *CCMetricStore) LoadData( ctx context.Context, resolution int, ) (schema.JobData, error) { - queries, assignedScope, err := ccms.buildQueries(job, metrics, scopes, resolution) + queries, assignedScope, err := ccms.buildQueries(job, metrics, scopes, int64(resolution)) if err != nil { cclog.Errorf("Error while building queries for jobId %d, Metrics %v, Scopes %v: %s", job.JobID, metrics, scopes, err.Error()) return nil, err } - req := ApiQueryRequest{ + req := memorystore.ApiQueryRequest{ Cluster: job.Cluster, From: job.StartTime, To: job.StartTime + int64(job.Duration), @@ -190,9 +64,9 @@ func (ccms *CCMetricStore) LoadData( WithData: true, } - resBody, err := ccms.doRequest(ctx, &req) + resBody, err := memorystore.FetchData(req) if err != nil { - cclog.Errorf("Error while performing request: %s", err.Error()) + cclog.Errorf("Error while fetching data : %s", err.Error()) return nil, err } @@ -200,7 +74,7 @@ func (ccms *CCMetricStore) LoadData( jobData := make(schema.JobData) for i, row := range resBody.Results { query := req.Queries[i] - metric := ccms.toLocalName(query.Metric) + metric := query.Metric scope := assignedScope[i] mc := archive.GetMetricConfig(job.Cluster, metric) if _, ok := jobData[metric]; !ok { @@ -209,7 +83,7 @@ func (ccms *CCMetricStore) LoadData( res := mc.Timestep if len(row) > 0 { - res = row[0].Resolution + res = int(row[0].Resolution) } jobMetric, ok := jobData[metric][scope] @@ -282,9 +156,9 @@ func (ccms *CCMetricStore) buildQueries( job *schema.Job, metrics []string, scopes []schema.MetricScope, - resolution int, -) ([]ApiQuery, []schema.MetricScope, error) { - queries := make([]ApiQuery, 0, len(metrics)*len(scopes)*len(job.Resources)) + resolution int64, +) ([]memorystore.ApiQuery, []schema.MetricScope, error) { + queries := make([]memorystore.ApiQuery, 0, len(metrics)*len(scopes)*len(job.Resources)) assignedScope := []schema.MetricScope{} subcluster, scerr := archive.GetSubCluster(job.Cluster, job.SubCluster) @@ -294,7 +168,6 @@ func (ccms *CCMetricStore) buildQueries( topology := subcluster.Topology for _, metric := range metrics { - remoteName := 
ccms.toRemoteName(metric) mc := archive.GetMetricConfig(job.Cluster, metric) if mc == nil { // return nil, fmt.Errorf("METRICDATA/CCMS > metric '%s' is not specified for cluster '%s'", metric, job.Cluster) @@ -306,7 +179,7 @@ func (ccms *CCMetricStore) buildQueries( if len(mc.SubClusters) != 0 { isRemoved := false for _, scConfig := range mc.SubClusters { - if scConfig.Name == job.SubCluster && scConfig.Remove == true { + if scConfig.Name == job.SubCluster && scConfig.Remove { isRemoved = true break } @@ -347,8 +220,8 @@ func (ccms *CCMetricStore) buildQueries( continue } - queries = append(queries, ApiQuery{ - Metric: remoteName, + queries = append(queries, memorystore.ApiQuery{ + Metric: metric, Hostname: host.Hostname, Aggregate: false, Type: &acceleratorString, @@ -365,8 +238,8 @@ func (ccms *CCMetricStore) buildQueries( continue } - queries = append(queries, ApiQuery{ - Metric: remoteName, + queries = append(queries, memorystore.ApiQuery{ + Metric: metric, Hostname: host.Hostname, Aggregate: true, Type: &acceleratorString, @@ -379,8 +252,8 @@ func (ccms *CCMetricStore) buildQueries( // HWThread -> HWThead if nativeScope == schema.MetricScopeHWThread && scope == schema.MetricScopeHWThread { - queries = append(queries, ApiQuery{ - Metric: remoteName, + queries = append(queries, memorystore.ApiQuery{ + Metric: metric, Hostname: host.Hostname, Aggregate: false, Type: &hwthreadString, @@ -395,8 +268,8 @@ func (ccms *CCMetricStore) buildQueries( if nativeScope == schema.MetricScopeHWThread && scope == schema.MetricScopeCore { cores, _ := topology.GetCoresFromHWThreads(hwthreads) for _, core := range cores { - queries = append(queries, ApiQuery{ - Metric: remoteName, + queries = append(queries, memorystore.ApiQuery{ + Metric: metric, Hostname: host.Hostname, Aggregate: true, Type: &hwthreadString, @@ -412,8 +285,8 @@ func (ccms *CCMetricStore) buildQueries( if nativeScope == schema.MetricScopeHWThread && scope == schema.MetricScopeSocket { sockets, _ := topology.GetSocketsFromHWThreads(hwthreads) for _, socket := range sockets { - queries = append(queries, ApiQuery{ - Metric: remoteName, + queries = append(queries, memorystore.ApiQuery{ + Metric: metric, Hostname: host.Hostname, Aggregate: true, Type: &hwthreadString, @@ -427,8 +300,8 @@ func (ccms *CCMetricStore) buildQueries( // HWThread -> Node if nativeScope == schema.MetricScopeHWThread && scope == schema.MetricScopeNode { - queries = append(queries, ApiQuery{ - Metric: remoteName, + queries = append(queries, memorystore.ApiQuery{ + Metric: metric, Hostname: host.Hostname, Aggregate: true, Type: &hwthreadString, @@ -442,8 +315,8 @@ func (ccms *CCMetricStore) buildQueries( // Core -> Core if nativeScope == schema.MetricScopeCore && scope == schema.MetricScopeCore { cores, _ := topology.GetCoresFromHWThreads(hwthreads) - queries = append(queries, ApiQuery{ - Metric: remoteName, + queries = append(queries, memorystore.ApiQuery{ + Metric: metric, Hostname: host.Hostname, Aggregate: false, Type: &coreString, @@ -458,8 +331,8 @@ func (ccms *CCMetricStore) buildQueries( if nativeScope == schema.MetricScopeCore && scope == schema.MetricScopeSocket { sockets, _ := topology.GetSocketsFromCores(hwthreads) for _, socket := range sockets { - queries = append(queries, ApiQuery{ - Metric: remoteName, + queries = append(queries, memorystore.ApiQuery{ + Metric: metric, Hostname: host.Hostname, Aggregate: true, Type: &coreString, @@ -474,8 +347,8 @@ func (ccms *CCMetricStore) buildQueries( // Core -> Node if nativeScope == schema.MetricScopeCore && scope 
== schema.MetricScopeNode { cores, _ := topology.GetCoresFromHWThreads(hwthreads) - queries = append(queries, ApiQuery{ - Metric: remoteName, + queries = append(queries, memorystore.ApiQuery{ + Metric: metric, Hostname: host.Hostname, Aggregate: true, Type: &coreString, @@ -489,8 +362,8 @@ func (ccms *CCMetricStore) buildQueries( // MemoryDomain -> MemoryDomain if nativeScope == schema.MetricScopeMemoryDomain && scope == schema.MetricScopeMemoryDomain { sockets, _ := topology.GetMemoryDomainsFromHWThreads(hwthreads) - queries = append(queries, ApiQuery{ - Metric: remoteName, + queries = append(queries, memorystore.ApiQuery{ + Metric: metric, Hostname: host.Hostname, Aggregate: false, Type: &memoryDomainString, @@ -504,8 +377,8 @@ func (ccms *CCMetricStore) buildQueries( // MemoryDoman -> Node if nativeScope == schema.MetricScopeMemoryDomain && scope == schema.MetricScopeNode { sockets, _ := topology.GetMemoryDomainsFromHWThreads(hwthreads) - queries = append(queries, ApiQuery{ - Metric: remoteName, + queries = append(queries, memorystore.ApiQuery{ + Metric: metric, Hostname: host.Hostname, Aggregate: true, Type: &memoryDomainString, @@ -519,8 +392,8 @@ func (ccms *CCMetricStore) buildQueries( // Socket -> Socket if nativeScope == schema.MetricScopeSocket && scope == schema.MetricScopeSocket { sockets, _ := topology.GetSocketsFromHWThreads(hwthreads) - queries = append(queries, ApiQuery{ - Metric: remoteName, + queries = append(queries, memorystore.ApiQuery{ + Metric: metric, Hostname: host.Hostname, Aggregate: false, Type: &socketString, @@ -534,8 +407,8 @@ func (ccms *CCMetricStore) buildQueries( // Socket -> Node if nativeScope == schema.MetricScopeSocket && scope == schema.MetricScopeNode { sockets, _ := topology.GetSocketsFromHWThreads(hwthreads) - queries = append(queries, ApiQuery{ - Metric: remoteName, + queries = append(queries, memorystore.ApiQuery{ + Metric: metric, Hostname: host.Hostname, Aggregate: true, Type: &socketString, @@ -548,8 +421,8 @@ func (ccms *CCMetricStore) buildQueries( // Node -> Node if nativeScope == schema.MetricScopeNode && scope == schema.MetricScopeNode { - queries = append(queries, ApiQuery{ - Metric: remoteName, + queries = append(queries, memorystore.ApiQuery{ + Metric: metric, Hostname: host.Hostname, Resolution: resolution, }) @@ -576,7 +449,7 @@ func (ccms *CCMetricStore) LoadStats( return nil, err } - req := ApiQueryRequest{ + req := memorystore.ApiQueryRequest{ Cluster: job.Cluster, From: job.StartTime, To: job.StartTime + int64(job.Duration), @@ -585,16 +458,16 @@ func (ccms *CCMetricStore) LoadStats( WithData: false, } - resBody, err := ccms.doRequest(ctx, &req) + resBody, err := memorystore.FetchData(req) if err != nil { - cclog.Errorf("Error while performing request: %s", err.Error()) + cclog.Errorf("Error while fetching data : %s", err.Error()) return nil, err } stats := make(map[string]map[string]schema.MetricStatistics, len(metrics)) for i, res := range resBody.Results { query := req.Queries[i] - metric := ccms.toLocalName(query.Metric) + metric := query.Metric data := res[0] if data.Error != nil { cclog.Errorf("fetching %s for node %s failed: %s", metric, query.Hostname, *data.Error) @@ -635,7 +508,7 @@ func (ccms *CCMetricStore) LoadScopedStats( return nil, err } - req := ApiQueryRequest{ + req := memorystore.ApiQueryRequest{ Cluster: job.Cluster, From: job.StartTime, To: job.StartTime + int64(job.Duration), @@ -644,9 +517,9 @@ func (ccms *CCMetricStore) LoadScopedStats( WithData: false, } - resBody, err := ccms.doRequest(ctx, &req) + 
resBody, err := memorystore.FetchData(req) if err != nil { - cclog.Errorf("Error while performing request: %s", err.Error()) + cclog.Errorf("Error while fetching data : %s", err.Error()) return nil, err } @@ -655,7 +528,7 @@ func (ccms *CCMetricStore) LoadScopedStats( for i, row := range resBody.Results { query := req.Queries[i] - metric := ccms.toLocalName(query.Metric) + metric := query.Metric scope := assignedScope[i] if _, ok := scopedJobStats[metric]; !ok { @@ -721,7 +594,7 @@ func (ccms *CCMetricStore) LoadNodeData( from, to time.Time, ctx context.Context, ) (map[string]map[string][]*schema.JobMetric, error) { - req := ApiQueryRequest{ + req := memorystore.ApiQueryRequest{ Cluster: cluster, From: from.Unix(), To: to.Unix(), @@ -730,38 +603,36 @@ func (ccms *CCMetricStore) LoadNodeData( } if nodes == nil { - for _, metric := range metrics { - req.ForAllNodes = append(req.ForAllNodes, ccms.toRemoteName(metric)) - } + req.ForAllNodes = append(req.ForAllNodes, metrics...) } else { for _, node := range nodes { for _, metric := range metrics { - req.Queries = append(req.Queries, ApiQuery{ + req.Queries = append(req.Queries, memorystore.ApiQuery{ Hostname: node, - Metric: ccms.toRemoteName(metric), + Metric: metric, Resolution: 0, // Default for Node Queries: Will return metric $Timestep Resolution }) } } } - resBody, err := ccms.doRequest(ctx, &req) + resBody, err := memorystore.FetchData(req) if err != nil { - cclog.Errorf("Error while performing request: %s", err.Error()) + cclog.Errorf("Error while fetching data : %s", err.Error()) return nil, err } var errors []string data := make(map[string]map[string][]*schema.JobMetric) for i, res := range resBody.Results { - var query ApiQuery + var query memorystore.ApiQuery if resBody.Queries != nil { query = resBody.Queries[i] } else { query = req.Queries[i] } - metric := ccms.toLocalName(query.Metric) + metric := query.Metric qdata := res[0] if qdata.Error != nil { /* Build list for "partial errors", if any */ @@ -861,13 +732,13 @@ func (ccms *CCMetricStore) LoadNodeListData( // Note: Order of node data is not guaranteed after this point, but contents match page and filter criteria - queries, assignedScope, err := ccms.buildNodeQueries(cluster, subCluster, nodes, metrics, scopes, resolution) + queries, assignedScope, err := ccms.buildNodeQueries(cluster, subCluster, nodes, metrics, scopes, int64(resolution)) if err != nil { cclog.Errorf("Error while building node queries for Cluster %s, SubCLuster %s, Metrics %v, Scopes %v: %s", cluster, subCluster, metrics, scopes, err.Error()) return nil, totalNodes, hasNextPage, err } - req := ApiQueryRequest{ + req := memorystore.ApiQueryRequest{ Cluster: cluster, Queries: queries, From: from.Unix(), @@ -876,29 +747,29 @@ func (ccms *CCMetricStore) LoadNodeListData( WithData: true, } - resBody, err := ccms.doRequest(ctx, &req) + resBody, err := memorystore.FetchData(req) if err != nil { - cclog.Errorf("Error while performing request: %s", err.Error()) + cclog.Errorf("Error while fetching data : %s", err.Error()) return nil, totalNodes, hasNextPage, err } var errors []string data := make(map[string]schema.JobData) for i, row := range resBody.Results { - var query ApiQuery + var query memorystore.ApiQuery if resBody.Queries != nil { query = resBody.Queries[i] } else { query = req.Queries[i] } // qdata := res[0] - metric := ccms.toLocalName(query.Metric) + metric := query.Metric scope := assignedScope[i] mc := archive.GetMetricConfig(cluster, metric) res := mc.Timestep if len(row) > 0 { - res = 
row[0].Resolution + res = int(row[0].Resolution) } // Init Nested Map Data Structures If Not Found @@ -971,9 +842,9 @@ func (ccms *CCMetricStore) buildNodeQueries( nodes []string, metrics []string, scopes []schema.MetricScope, - resolution int, -) ([]ApiQuery, []schema.MetricScope, error) { - queries := make([]ApiQuery, 0, len(metrics)*len(scopes)*len(nodes)) + resolution int64, +) ([]memorystore.ApiQuery, []schema.MetricScope, error) { + queries := make([]memorystore.ApiQuery, 0, len(metrics)*len(scopes)*len(nodes)) assignedScope := []schema.MetricScope{} // Get Topol before loop if subCluster given @@ -988,7 +859,7 @@ func (ccms *CCMetricStore) buildNodeQueries( } for _, metric := range metrics { - remoteName := ccms.toRemoteName(metric) + metric := metric mc := archive.GetMetricConfig(cluster, metric) if mc == nil { // return nil, fmt.Errorf("METRICDATA/CCMS > metric '%s' is not specified for cluster '%s'", metric, cluster) @@ -1000,7 +871,7 @@ func (ccms *CCMetricStore) buildNodeQueries( if mc.SubClusters != nil { isRemoved := false for _, scConfig := range mc.SubClusters { - if scConfig.Name == subCluster && scConfig.Remove == true { + if scConfig.Name == subCluster && scConfig.Remove { isRemoved = true break } @@ -1056,8 +927,8 @@ func (ccms *CCMetricStore) buildNodeQueries( continue } - queries = append(queries, ApiQuery{ - Metric: remoteName, + queries = append(queries, memorystore.ApiQuery{ + Metric: metric, Hostname: hostname, Aggregate: false, Type: &acceleratorString, @@ -1074,8 +945,8 @@ func (ccms *CCMetricStore) buildNodeQueries( continue } - queries = append(queries, ApiQuery{ - Metric: remoteName, + queries = append(queries, memorystore.ApiQuery{ + Metric: metric, Hostname: hostname, Aggregate: true, Type: &acceleratorString, @@ -1088,8 +959,8 @@ func (ccms *CCMetricStore) buildNodeQueries( // HWThread -> HWThead if nativeScope == schema.MetricScopeHWThread && scope == schema.MetricScopeHWThread { - queries = append(queries, ApiQuery{ - Metric: remoteName, + queries = append(queries, memorystore.ApiQuery{ + Metric: metric, Hostname: hostname, Aggregate: false, Type: &hwthreadString, @@ -1104,8 +975,8 @@ func (ccms *CCMetricStore) buildNodeQueries( if nativeScope == schema.MetricScopeHWThread && scope == schema.MetricScopeCore { cores, _ := topology.GetCoresFromHWThreads(topology.Node) for _, core := range cores { - queries = append(queries, ApiQuery{ - Metric: remoteName, + queries = append(queries, memorystore.ApiQuery{ + Metric: metric, Hostname: hostname, Aggregate: true, Type: &hwthreadString, @@ -1121,8 +992,8 @@ func (ccms *CCMetricStore) buildNodeQueries( if nativeScope == schema.MetricScopeHWThread && scope == schema.MetricScopeSocket { sockets, _ := topology.GetSocketsFromHWThreads(topology.Node) for _, socket := range sockets { - queries = append(queries, ApiQuery{ - Metric: remoteName, + queries = append(queries, memorystore.ApiQuery{ + Metric: metric, Hostname: hostname, Aggregate: true, Type: &hwthreadString, @@ -1136,8 +1007,8 @@ func (ccms *CCMetricStore) buildNodeQueries( // HWThread -> Node if nativeScope == schema.MetricScopeHWThread && scope == schema.MetricScopeNode { - queries = append(queries, ApiQuery{ - Metric: remoteName, + queries = append(queries, memorystore.ApiQuery{ + Metric: metric, Hostname: hostname, Aggregate: true, Type: &hwthreadString, @@ -1151,8 +1022,8 @@ func (ccms *CCMetricStore) buildNodeQueries( // Core -> Core if nativeScope == schema.MetricScopeCore && scope == schema.MetricScopeCore { cores, _ := 
topology.GetCoresFromHWThreads(topology.Node) - queries = append(queries, ApiQuery{ - Metric: remoteName, + queries = append(queries, memorystore.ApiQuery{ + Metric: metric, Hostname: hostname, Aggregate: false, Type: &coreString, @@ -1167,8 +1038,8 @@ func (ccms *CCMetricStore) buildNodeQueries( if nativeScope == schema.MetricScopeCore && scope == schema.MetricScopeSocket { sockets, _ := topology.GetSocketsFromCores(topology.Node) for _, socket := range sockets { - queries = append(queries, ApiQuery{ - Metric: remoteName, + queries = append(queries, memorystore.ApiQuery{ + Metric: metric, Hostname: hostname, Aggregate: true, Type: &coreString, @@ -1183,8 +1054,8 @@ func (ccms *CCMetricStore) buildNodeQueries( // Core -> Node if nativeScope == schema.MetricScopeCore && scope == schema.MetricScopeNode { cores, _ := topology.GetCoresFromHWThreads(topology.Node) - queries = append(queries, ApiQuery{ - Metric: remoteName, + queries = append(queries, memorystore.ApiQuery{ + Metric: metric, Hostname: hostname, Aggregate: true, Type: &coreString, @@ -1198,8 +1069,8 @@ func (ccms *CCMetricStore) buildNodeQueries( // MemoryDomain -> MemoryDomain if nativeScope == schema.MetricScopeMemoryDomain && scope == schema.MetricScopeMemoryDomain { sockets, _ := topology.GetMemoryDomainsFromHWThreads(topology.Node) - queries = append(queries, ApiQuery{ - Metric: remoteName, + queries = append(queries, memorystore.ApiQuery{ + Metric: metric, Hostname: hostname, Aggregate: false, Type: &memoryDomainString, @@ -1213,8 +1084,8 @@ func (ccms *CCMetricStore) buildNodeQueries( // MemoryDoman -> Node if nativeScope == schema.MetricScopeMemoryDomain && scope == schema.MetricScopeNode { sockets, _ := topology.GetMemoryDomainsFromHWThreads(topology.Node) - queries = append(queries, ApiQuery{ - Metric: remoteName, + queries = append(queries, memorystore.ApiQuery{ + Metric: metric, Hostname: hostname, Aggregate: true, Type: &memoryDomainString, @@ -1228,8 +1099,8 @@ func (ccms *CCMetricStore) buildNodeQueries( // Socket -> Socket if nativeScope == schema.MetricScopeSocket && scope == schema.MetricScopeSocket { sockets, _ := topology.GetSocketsFromHWThreads(topology.Node) - queries = append(queries, ApiQuery{ - Metric: remoteName, + queries = append(queries, memorystore.ApiQuery{ + Metric: metric, Hostname: hostname, Aggregate: false, Type: &socketString, @@ -1243,8 +1114,8 @@ func (ccms *CCMetricStore) buildNodeQueries( // Socket -> Node if nativeScope == schema.MetricScopeSocket && scope == schema.MetricScopeNode { sockets, _ := topology.GetSocketsFromHWThreads(topology.Node) - queries = append(queries, ApiQuery{ - Metric: remoteName, + queries = append(queries, memorystore.ApiQuery{ + Metric: metric, Hostname: hostname, Aggregate: true, Type: &socketString, @@ -1257,8 +1128,8 @@ func (ccms *CCMetricStore) buildNodeQueries( // Node -> Node if nativeScope == schema.MetricScopeNode && scope == schema.MetricScopeNode { - queries = append(queries, ApiQuery{ - Metric: remoteName, + queries = append(queries, memorystore.ApiQuery{ + Metric: metric, Hostname: hostname, Resolution: resolution, }) diff --git a/internal/metricdata/utils.go b/internal/metricdata/utils.go index 59e640e..2e0d423 100644 --- a/internal/metricdata/utils.go +++ b/internal/metricdata/utils.go @@ -74,9 +74,8 @@ func (tmdr *TestMetricDataRepository) LoadNodeListData( } func DeepCopy(jd_temp schema.JobData) schema.JobData { - var jd schema.JobData - jd = make(schema.JobData, len(jd_temp)) + jd := make(schema.JobData, len(jd_temp)) for k, v := range jd_temp { 
jd[k] = make(map[schema.MetricScope]*schema.JobMetric, len(jd_temp[k])) for k_, v_ := range v { diff --git a/pkg/archive/clusterConfig.go b/pkg/archive/clusterConfig.go index 51b89b1..3317487 100644 --- a/pkg/archive/clusterConfig.go +++ b/pkg/archive/clusterConfig.go @@ -8,6 +8,8 @@ import ( "errors" "fmt" + "github.com/ClusterCockpit/cc-backend/internal/config" + "github.com/ClusterCockpit/cc-backend/internal/memorystore" cclog "github.com/ClusterCockpit/cc-lib/ccLogger" "github.com/ClusterCockpit/cc-lib/schema" ) @@ -31,6 +33,8 @@ func initClusterConfig() error { return err } + memorystore.Clusters = append(memorystore.Clusters, cluster.Name) + if len(cluster.Name) == 0 || len(cluster.MetricConfig) == 0 || len(cluster.SubClusters) == 0 { @@ -122,6 +126,16 @@ func initClusterConfig() error { } ml.Availability = append(metricLookup[mc.Name].Availability, availability) metricLookup[mc.Name] = ml + + agg, err := config.AssignAggregationStratergy(mc.Aggregation) + if err != nil { + return fmt.Errorf("ARCHIVE/CLUSTERCONFIG > in %s/cluster.json: %w", cluster.Name, err) + } + + config.AddMetric(mc.Name, config.MetricConfig{ + Frequency: int64(mc.Timestep), + Aggregation: agg, + }) } Clusters = append(Clusters, cluster) From af43901ca3ddd54cdbb5377682fb4abb3bcf1bbc Mon Sep 17 00:00:00 2001 From: Aditya Ujeniya Date: Mon, 8 Sep 2025 22:54:13 +0200 Subject: [PATCH 04/11] Trial and Test MetricStore components --- .gitignore | 5 + api/schema.graphqls | 4 +- api/swagger.json | 30 +++-- api/swagger.yaml | 25 ++-- cmd/cc-backend/main.go | 6 +- configs/config-demo.json | 18 ++- configs/config.json | 18 ++- go.mod | 7 +- go.sum | 19 +++ internal/api/api_test.go | 4 +- internal/api/docs.go | 30 +++-- internal/api/rest.go | 1 - internal/avro/avroHelper.go | 2 +- internal/config/memorystore.go | 8 +- internal/graph/generated/generated.go | 97 +++++---------- internal/graph/model/models_gen.go | 2 +- internal/graph/schema.resolvers.go | 5 - internal/memorystore/checkpoint.go | 2 +- internal/memorystore/lineprotocol.go | 110 +++++++++--------- internal/memorystore/memorystore.go | 16 +-- internal/repository/job.go | 14 +-- internal/repository/jobCreate.go | 2 +- internal/repository/jobFind.go | 10 +- internal/repository/jobQuery.go | 6 +- internal/repository/stats.go | 2 +- internal/tagger/jobclasses/highload.json | 2 +- .../tagger/jobclasses/lowUtilization.json | 2 +- internal/tagger/jobclasses/lowload.json | 2 +- internal/taskManager/commitJobService.go | 4 +- internal/taskManager/taskManager.go | 5 - internal/taskManager/updateDurationService.go | 4 +- .../taskManager/updateFootprintService.go | 4 +- startDemo.sh | 37 +++++- test_ccms_write_api.sh.bak | 110 ++++++++++++++++++ 34 files changed, 394 insertions(+), 219 deletions(-) create mode 100755 test_ccms_write_api.sh.bak diff --git a/.gitignore b/.gitignore index 75cc004..963073d 100644 --- a/.gitignore +++ b/.gitignore @@ -9,6 +9,11 @@ /var/*.db /var/*.txt +/var/checkpoints* + +migrateTimestamps.pl +test_ccms_write_api.sh + /web/frontend/public/build /web/frontend/node_modules diff --git a/api/schema.graphqls b/api/schema.graphqls index 794c630..070b5b7 100644 --- a/api/schema.graphqls +++ b/api/schema.graphqls @@ -37,7 +37,7 @@ type Job { numAcc: Int! energy: Float! SMT: Int! - exclusive: Int! + shared: String! partition: String! arrayJobId: Int! monitoringStatus: Int! @@ -419,7 +419,7 @@ input JobFilter { startTime: TimeRange state: [JobState!] metricStats: [MetricStatItem!] 
- exclusive: Int + shared: StringInput node: StringInput } diff --git a/api/swagger.json b/api/swagger.json index 87bf3ed..c60810a 100644 --- a/api/swagger.json +++ b/api/swagger.json @@ -1394,12 +1394,6 @@ "format": "float64" } }, - "exclusive": { - "type": "integer", - "maximum": 2, - "minimum": 0, - "example": 1 - }, "footprint": { "type": "object", "additionalProperties": { @@ -1416,12 +1410,18 @@ }, "jobState": { "enum": [ - "completed", - "failed", + "boot_fail", "cancelled", - "stopped", - "timeout", - "out_of_memory" + "completed", + "deadline", + "failed", + "node_fail", + "out-of-memory", + "pending", + "preempted", + "running", + "suspended", + "timeout" ], "allOf": [ { @@ -1477,6 +1477,14 @@ "$ref": "#/definitions/schema.Resource" } }, + "shared": { + "type": "string", + "enum": [ + "none", + "single_user", + "multi_user" + ] + }, "smt": { "type": "integer", "example": 4 diff --git a/api/swagger.yaml b/api/swagger.yaml index 06caa56..6a4adbd 100644 --- a/api/swagger.yaml +++ b/api/swagger.yaml @@ -207,11 +207,6 @@ definitions: format: float64 type: number type: object - exclusive: - example: 1 - maximum: 2 - minimum: 0 - type: integer footprint: additionalProperties: format: float64 @@ -226,12 +221,18 @@ definitions: allOf: - $ref: '#/definitions/schema.JobState' enum: - - completed - - failed + - boot_fail - cancelled - - stopped + - completed + - deadline + - failed + - node_fail + - out-of-memory + - pending + - preempted + - running + - suspended - timeout - - out_of_memory example: completed metaData: additionalProperties: @@ -269,6 +270,12 @@ definitions: items: $ref: '#/definitions/schema.Resource' type: array + shared: + enum: + - none + - single_user + - multi_user + type: string smt: example: 4 type: integer diff --git a/cmd/cc-backend/main.go b/cmd/cc-backend/main.go index 9c7ad1f..0146118 100644 --- a/cmd/cc-backend/main.go +++ b/cmd/cc-backend/main.go @@ -251,13 +251,13 @@ func main() { var wg sync.WaitGroup //Metric Store starts after all flags have been processes - memorystore.Init(wg) + memorystore.Init(&wg) archiver.Start(repository.GetJobRepository()) // // Comment out - // taskManager.Start(ccconf.GetPackageConfig("cron"), - // ccconf.GetPackageConfig("archive")) + taskManager.Start(ccconf.GetPackageConfig("cron"), + ccconf.GetPackageConfig("archive")) serverInit() diff --git a/configs/config-demo.json b/configs/config-demo.json index a31d65d..3c0d858 100644 --- a/configs/config-demo.json +++ b/configs/config-demo.json @@ -4,11 +4,23 @@ "short-running-jobs-duration": 300, "resampling": { "trigger": 30, - "resolutions": [600, 300, 120, 60] + "resolutions": [ + 600, + 300, + 120, + 60 + ] }, - "apiAllowedIPs": ["*"], + "apiAllowedIPs": [ + "*" + ], "emission-constant": 317 }, + "cron": { + "commit-job-worker": "2m", + "duration-worker": "5m", + "footprint-worker": "10m" + }, "archive": { "kind": "file", "path": "./var/job-archive" @@ -73,4 +85,4 @@ }, "retention-in-memory": "48h" } -} +} \ No newline at end of file diff --git a/configs/config.json b/configs/config.json index ed7d546..505e446 100644 --- a/configs/config.json +++ b/configs/config.json @@ -6,13 +6,25 @@ "user": "clustercockpit", "group": "clustercockpit", "validate": false, - "apiAllowedIPs": ["*"], + "apiAllowedIPs": [ + "*" + ], "short-running-jobs-duration": 300, "resampling": { "trigger": 30, - "resolutions": [600, 300, 120, 60] + "resolutions": [ + 600, + 300, + 120, + 60 + ] } }, + "cron": { + "commit-job-worker": "2m", + "duration-worker": "5m", + "footprint-worker": "10m" + }, "archive": { 
"kind": "file", "path": "./var/job-archive" @@ -41,4 +53,4 @@ } } ] -} +} \ No newline at end of file diff --git a/go.mod b/go.mod index 5858cff..e0add97 100644 --- a/go.mod +++ b/go.mod @@ -19,9 +19,12 @@ require ( github.com/gorilla/handlers v1.5.2 github.com/gorilla/mux v1.8.1 github.com/gorilla/sessions v1.4.0 + github.com/influxdata/line-protocol/v2 v2.2.1 github.com/jmoiron/sqlx v1.4.0 github.com/joho/godotenv v1.5.1 + github.com/linkedin/goavro/v2 v2.14.0 github.com/mattn/go-sqlite3 v1.14.24 + github.com/nats-io/nats.go v1.44.0 github.com/prometheus/client_golang v1.23.0 github.com/prometheus/common v0.65.0 github.com/qustavo/sqlhooks/v2 v2.1.0 @@ -62,14 +65,16 @@ require ( github.com/josharian/intern v1.0.0 // indirect github.com/jpillora/backoff v1.0.0 // indirect github.com/json-iterator/go v1.1.12 // indirect + github.com/klauspost/compress v1.18.0 // indirect github.com/lann/builder v0.0.0-20180802200727-47ae307949d0 // indirect github.com/lann/ps v0.0.0-20150810152359-62de8c46ede0 // indirect - github.com/linkedin/goavro/v2 v2.14.0 // indirect github.com/mailru/easyjson v0.9.0 // indirect github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect github.com/modern-go/reflect2 v1.0.2 // indirect github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect github.com/mwitkow/go-conntrack v0.0.0-20190716064945-2f068394615f // indirect + github.com/nats-io/nkeys v0.4.11 // indirect + github.com/nats-io/nuid v1.0.1 // indirect github.com/prometheus/client_model v0.6.2 // indirect github.com/prometheus/procfs v0.16.1 // indirect github.com/robfig/cron/v3 v3.0.1 // indirect diff --git a/go.sum b/go.sum index 3c51770..792ec1c 100644 --- a/go.sum +++ b/go.sum @@ -38,6 +38,7 @@ github.com/coreos/go-oidc/v3 v3.12.0 h1:sJk+8G2qq94rDI6ehZ71Bol3oUHy63qNYmkiSjrc github.com/coreos/go-oidc/v3 v3.12.0/go.mod h1:gE3LgjOgFoHi9a4ce4/tJczr0Ai2/BoDhf0r5lltWI0= github.com/cpuguy83/go-md2man/v2 v2.0.7 h1:zbFlGlXEAKlwXpmvle3d8Oe3YnkKIK4xSRTd3sHPnBo= github.com/cpuguy83/go-md2man/v2 v2.0.7/go.mod h1:oOW0eioCTA6cOiMLiUPZOpcVxMig6NIQQ7OS05n1F4g= +github.com/creack/pty v1.1.9/go.mod h1:oKZEueFk5CKHvIhNR5MUki03XCEU+Q6VDXinZuGJ33E= github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= @@ -57,6 +58,10 @@ github.com/expr-lang/expr v1.17.5 h1:i1WrMvcdLF249nSNlpQZN1S6NXuW9WaOfF5tPi3aw3k github.com/expr-lang/expr v1.17.5/go.mod h1:8/vRC7+7HBzESEqt5kKpYXxrxkr31SaO8r40VO/1IT4= github.com/felixge/httpsnoop v1.0.4 h1:NFTV2Zj1bL4mc9sqWACXbQFVBBg2W3GPvqp8/ESS2Wg= github.com/felixge/httpsnoop v1.0.4/go.mod h1:m8KPJKqk1gH5J9DgRY2ASl2lWCfGKXixSwevea8zH2U= +github.com/frankban/quicktest v1.11.0/go.mod h1:K+q6oSqb0W0Ininfk863uOk1lMy69l/P6txr3mVT54s= +github.com/frankban/quicktest v1.11.2/go.mod h1:K+q6oSqb0W0Ininfk863uOk1lMy69l/P6txr3mVT54s= +github.com/frankban/quicktest v1.13.0 h1:yNZif1OkDfNoDfb9zZa9aXIpejNR4F23Wely0c+Qdqk= +github.com/frankban/quicktest v1.13.0/go.mod h1:qLE0fzW0VuyUAJgPU19zByoIr0HtCHN/r/VLSOOIySU= github.com/fsnotify/fsnotify v1.9.0 h1:2Ml+OJNzbYCTzsxtv8vKSFD9PbJjmhYF14k/jKC7S9k= github.com/fsnotify/fsnotify v1.9.0/go.mod h1:8jBTzvmWwFyi3Pb8djgCCO5IBqzKJ/Jwo8TRcHyHii0= github.com/go-asn1-ber/asn1-ber v1.5.7 h1:DTX+lbVTWaTw1hQ+PbZPlnDZPEIs0SS/GCZAl535dDk= @@ -94,6 +99,8 @@ github.com/golang-migrate/migrate/v4 v4.18.2/go.mod h1:2CM6tJvn2kqPXwnXO/d3rAQYi 
github.com/golang/snappy v0.0.1/go.mod h1:/XxbfmMg8lxefKM7IXC3fBNl/7bRcc72aCRzEWrmP2Q= github.com/golang/snappy v0.0.4 h1:yAGX7huGHXlcLOEtBnF4w7FQwA26wojNCwOYAEhLjQM= github.com/golang/snappy v0.0.4/go.mod h1:/XxbfmMg8lxefKM7IXC3fBNl/7bRcc72aCRzEWrmP2Q= +github.com/google/go-cmp v0.5.2/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= +github.com/google/go-cmp v0.5.5/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= github.com/google/go-cmp v0.6.0/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY= github.com/google/go-cmp v0.7.0 h1:wk8382ETsv4JYUZwIsn6YpYiWiBsYLSJiTsyBybVuN8= github.com/google/go-cmp v0.7.0/go.mod h1:pXiqmnSA92OHEEa9HXL2W4E7lf9JzCmGVUdgjX3N/iU= @@ -130,6 +137,11 @@ github.com/influxdata/influxdb-client-go/v2 v2.14.0 h1:AjbBfJuq+QoaXNcrova8smSjw github.com/influxdata/influxdb-client-go/v2 v2.14.0/go.mod h1:Ahpm3QXKMJslpXl3IftVLVezreAUtBOTZssDrjZEFHI= github.com/influxdata/line-protocol v0.0.0-20210922203350-b1ad95c89adf h1:7JTmneyiNEwVBOHSjoMxiWAqB992atOeepeFYegn5RU= github.com/influxdata/line-protocol v0.0.0-20210922203350-b1ad95c89adf/go.mod h1:xaLFMmpvUxqXtVkUJfg9QmT88cDaCJ3ZKgdZ78oO8Qo= +github.com/influxdata/line-protocol-corpus v0.0.0-20210519164801-ca6fa5da0184/go.mod h1:03nmhxzZ7Xk2pdG+lmMd7mHDfeVOYFyhOgwO61qWU98= +github.com/influxdata/line-protocol-corpus v0.0.0-20210922080147-aa28ccfb8937 h1:MHJNQ+p99hFATQm6ORoLmpUCF7ovjwEFshs/NHzAbig= +github.com/influxdata/line-protocol-corpus v0.0.0-20210922080147-aa28ccfb8937/go.mod h1:BKR9c0uHSmRgM/se9JhFHtTT7JTO67X23MtKMHtZcpo= +github.com/influxdata/line-protocol/v2 v2.0.0-20210312151457-c52fdecb625a/go.mod h1:6+9Xt5Sq1rWx+glMgxhcg2c0DUaehK+5TDcPZ76GypY= +github.com/influxdata/line-protocol/v2 v2.1.0/go.mod h1:QKw43hdUBg3GTk2iC3iyCxksNj7PX9aUSeYOYE/ceHY= github.com/influxdata/line-protocol/v2 v2.2.1 h1:EAPkqJ9Km4uAxtMRgUubJyqAr6zgWM0dznKMLRauQRE= github.com/influxdata/line-protocol/v2 v2.2.1/go.mod h1:DmB3Cnh+3oxmG6LOBIxce4oaL4CPj3OmMPgvauXh+tM= github.com/jcmturner/aescts/v2 v2.0.0 h1:9YKLH6ey7H4eDBXW8khjYslgyqG2xZikXP0EQFKrle8= @@ -158,8 +170,11 @@ github.com/json-iterator/go v1.1.12 h1:PV8peI4a0ysnczrg+LtxykD8LfKY9ML6u2jnxaEnr github.com/json-iterator/go v1.1.12/go.mod h1:e30LSqwooZae/UwlEbR2852Gd8hjQvJoHmT4TnhNGBo= github.com/klauspost/compress v1.18.0 h1:c/Cqfb0r+Yi+JtIEq73FWXVkRonBlf0CRNYc8Zttxdo= github.com/klauspost/compress v1.18.0/go.mod h1:2Pp+KzxcywXVXMr50+X0Q/Lsb43OQHYWRCY2AiWywWQ= +github.com/kr/pretty v0.2.1/go.mod h1:ipq/a2n7PKx3OHsz4KJII5eveXtPO4qwEXGdVfWzfnI= github.com/kr/pretty v0.3.1 h1:flRD4NNwYAUpkphVc1HcthR4KEIFJ65n8Mw5qdRn3LE= github.com/kr/pretty v0.3.1/go.mod h1:hoEshYVHaxMs3cyo3Yncou5ZscifuDolrwPKZanG3xk= +github.com/kr/pty v1.1.1/go.mod h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ= +github.com/kr/text v0.1.0/go.mod h1:4Jbv+DJW3UT/LiOwJeYQe1efqtUx/iVham/4vfdArNI= github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY= github.com/kr/text v0.2.0/go.mod h1:eLer722TekiGuMkidMxC/pM04lWEeraHUUmBw8l2grE= github.com/lann/builder v0.0.0-20180802200727-47ae307949d0 h1:SOEGU9fKiNWd/HOJuq6+3iTQz8KNCLtVX6idSoTLdUw= @@ -198,6 +213,7 @@ github.com/nats-io/nkeys v0.4.11 h1:q44qGV008kYd9W1b1nEBkNzvnWxtRSQ7A8BoqRrcfa0= github.com/nats-io/nkeys v0.4.11/go.mod h1:szDimtgmfOi9n25JpfIdGw12tZFYXqhGxjhVxsatHVE= github.com/nats-io/nuid v1.0.1 h1:5iA8DT8V7q8WK2EScv2padNa/rTESc1KdnPw4TC2paw= github.com/nats-io/nuid v1.0.1/go.mod h1:19wcPz3Ph3q0Jbyiqsd0kePYG7A95tJPxeL+1OSON2c= +github.com/niemeyer/pretty v0.0.0-20200227124842-a10e7caefd8e/go.mod 
h1:zD1mROLANZcx1PVRCS0qkT7pwLkGfwJo4zjcN/Tysno= github.com/oapi-codegen/runtime v1.1.1 h1:EXLHh0DXIJnWhdRPN2w4MXAzFyE4CskzhNLUmtpMYro= github.com/oapi-codegen/runtime v1.1.1/go.mod h1:SK9X900oXmPWilYR5/WKPzt3Kqxn/uS/+lbpREv+eCg= github.com/opencontainers/go-digest v1.0.0 h1:apOUWs51W5PlhuyGyz9FCeeBIOUDA/6nW8Oi/yOhh5U= @@ -358,15 +374,18 @@ golang.org/x/tools v0.21.1-0.20240508182429-e35e4ccd0d2d/go.mod h1:aiJjzUbINMkxb golang.org/x/tools v0.35.0 h1:mBffYraMEf7aa0sB+NuKnuCy8qI/9Bughn8dC2Gu5r0= golang.org/x/tools v0.35.0/go.mod h1:NKdj5HkL/73byiZSJjqJgKn3ep7KjFkBOkR/Hps3VPw= golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= +golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= google.golang.org/protobuf v1.36.6 h1:z1NpPI8ku2WgiWnf+t9wTPsn6eP1L7ksHUlkfLvd9xY= google.golang.org/protobuf v1.36.6/go.mod h1:jduwjTPXsFjZGTmRluh+L6NjiWu7pchiJ2/5YcXBHnY= gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= +gopkg.in/check.v1 v1.0.0-20200227125254-8fa46927fb4f/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c h1:Hei/4ADfdWqJk1ZMxUNpqntNwaWcugrBjAiHlqqRiVk= gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c/go.mod h1:JHkPIbrfpd72SG/EVd6muEfDQjcINNoR0C8j2r3qZ4Q= gopkg.in/yaml.v2 v2.2.2/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= gopkg.in/yaml.v2 v2.4.0 h1:D8xgwECY7CYvx+Y2n4sBz93Jn9JRvxdiyyo8CTfuKaY= gopkg.in/yaml.v2 v2.4.0/go.mod h1:RDklbk79AGWmwhnvt/jBztapEOGDOx6ZbXqjP6csGnQ= gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= +gopkg.in/yaml.v3 v3.0.0-20200615113413-eeeca48fe776/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= sigs.k8s.io/yaml v1.6.0 h1:G8fkbMSAFqgEFgh4b1wmtzDnioxFCUgTZhlbj5P9QYs= diff --git a/internal/api/api_test.go b/internal/api/api_test.go index 9f47a1f..1c81fc9 100644 --- a/internal/api/api_test.go +++ b/internal/api/api_test.go @@ -241,7 +241,7 @@ func TestRestApi(t *testing.T) { "numNodes": 1, "numHwthreads": 8, "numAcc": 0, - "exclusive": 1, + "shared": "none", "monitoringStatus": 1, "smt": 1, "resources": [ @@ -396,7 +396,7 @@ func TestRestApi(t *testing.T) { "partition": "default", "walltime": 3600, "numNodes": 1, - "exclusive": 1, + "shared": "none", "monitoringStatus": 1, "smt": 1, "resources": [ diff --git a/internal/api/docs.go b/internal/api/docs.go index 50cab92..c10745c 100644 --- a/internal/api/docs.go +++ b/internal/api/docs.go @@ -1401,12 +1401,6 @@ const docTemplate = `{ "format": "float64" } }, - "exclusive": { - "type": "integer", - "maximum": 2, - "minimum": 0, - "example": 1 - }, "footprint": { "type": "object", "additionalProperties": { @@ -1423,12 +1417,18 @@ const docTemplate = `{ }, "jobState": { "enum": [ - "completed", - "failed", + "boot_fail", "cancelled", - "stopped", - "timeout", - "out_of_memory" + "completed", + "deadline", + "failed", + "node_fail", + "out-of-memory", + "pending", + "preempted", + "running", + "suspended", + "timeout" ], "allOf": [ { @@ -1484,6 +1484,14 @@ const docTemplate = `{ "$ref": "#/definitions/schema.Resource" } }, + "shared": { + "type": "string", + "enum": [ + "none", + "single_user", + "multi_user" + ] + }, "smt": { "type": "integer", "example": 4 diff --git 
a/internal/api/rest.go b/internal/api/rest.go index 8cefe48..fcadc90 100644 --- a/internal/api/rest.go +++ b/internal/api/rest.go @@ -97,7 +97,6 @@ func (api *RestApi) MountUserApiRoutes(r *mux.Router) { } func (api *RestApi) MountMetricStoreApiRoutes(r *mux.Router) { - r.StrictSlash(true) // REST API Uses TokenAuth r.HandleFunc("/api/free", memorystore.HandleFree).Methods(http.MethodPost) r.HandleFunc("/api/write", memorystore.HandleWrite).Methods(http.MethodPost) diff --git a/internal/avro/avroHelper.go b/internal/avro/avroHelper.go index ea733cd..21a5617 100644 --- a/internal/avro/avroHelper.go +++ b/internal/avro/avroHelper.go @@ -29,7 +29,7 @@ func DataStaging(wg *sync.WaitGroup, ctx context.Context) { return case val := <-LineProtocolMessages: //Fetch the frequency of the metric from the global configuration - freq, err := config.MetricStoreKeys.GetMetricFrequency(val.MetricName) + freq, err := config.GetMetricFrequency(val.MetricName) if err != nil { fmt.Printf("Error fetching metric frequency: %s\n", err) continue diff --git a/internal/config/memorystore.go b/internal/config/memorystore.go index b9273b4..c277045 100644 --- a/internal/config/memorystore.go +++ b/internal/config/memorystore.go @@ -97,10 +97,10 @@ func InitMetricStore(msConfig json.RawMessage) { } } -func (c *MetricStoreConfig) GetMetricFrequency(metricName string) (int64, error) { - // if metric, ok := c.Metrics[metricName]; ok { - // return metric.Frequency, nil - // } +func GetMetricFrequency(metricName string) (int64, error) { + if metric, ok := Metrics[metricName]; ok { + return metric.Frequency, nil + } return 0, fmt.Errorf("[METRICSTORE]> metric %s not found", metricName) } diff --git a/internal/graph/generated/generated.go b/internal/graph/generated/generated.go index 3a85858..eed946d 100644 --- a/internal/graph/generated/generated.go +++ b/internal/graph/generated/generated.go @@ -118,7 +118,6 @@ type ComplexityRoot struct { Duration func(childComplexity int) int Energy func(childComplexity int) int EnergyFootprint func(childComplexity int) int - Exclusive func(childComplexity int) int Footprint func(childComplexity int) int ID func(childComplexity int) int JobID func(childComplexity int) int @@ -131,6 +130,7 @@ type ComplexityRoot struct { Project func(childComplexity int) int Resources func(childComplexity int) int SMT func(childComplexity int) int + Shared func(childComplexity int) int StartTime func(childComplexity int) int State func(childComplexity int) int SubCluster func(childComplexity int) int @@ -425,8 +425,6 @@ type ClusterResolver interface { type JobResolver interface { StartTime(ctx context.Context, obj *schema.Job) (*time.Time, error) - Exclusive(ctx context.Context, obj *schema.Job) (int, error) - Tags(ctx context.Context, obj *schema.Job) ([]*schema.Tag, error) ConcurrentJobs(ctx context.Context, obj *schema.Job) (*model.JobLinkResultList, error) @@ -726,13 +724,6 @@ func (e *executableSchema) Complexity(ctx context.Context, typeName, field strin return e.complexity.Job.EnergyFootprint(childComplexity), true - case "Job.exclusive": - if e.complexity.Job.Exclusive == nil { - break - } - - return e.complexity.Job.Exclusive(childComplexity), true - case "Job.footprint": if e.complexity.Job.Footprint == nil { break @@ -817,6 +808,13 @@ func (e *executableSchema) Complexity(ctx context.Context, typeName, field strin return e.complexity.Job.SMT(childComplexity), true + case "Job.shared": + if e.complexity.Job.Shared == nil { + break + } + + return e.complexity.Job.Shared(childComplexity), true + 
case "Job.startTime": if e.complexity.Job.StartTime == nil { break @@ -2361,7 +2359,7 @@ type Job { numAcc: Int! energy: Float! SMT: Int! - exclusive: Int! + shared: String! partition: String! arrayJobId: Int! monitoringStatus: Int! @@ -2743,7 +2741,7 @@ input JobFilter { startTime: TimeRange state: [JobState!] metricStats: [MetricStatItem!] - exclusive: Int + shared: StringInput node: StringInput } @@ -5217,8 +5215,8 @@ func (ec *executionContext) fieldContext_Job_SMT(_ context.Context, field graphq return fc, nil } -func (ec *executionContext) _Job_exclusive(ctx context.Context, field graphql.CollectedField, obj *schema.Job) (ret graphql.Marshaler) { - fc, err := ec.fieldContext_Job_exclusive(ctx, field) +func (ec *executionContext) _Job_shared(ctx context.Context, field graphql.CollectedField, obj *schema.Job) (ret graphql.Marshaler) { + fc, err := ec.fieldContext_Job_shared(ctx, field) if err != nil { return graphql.Null } @@ -5231,7 +5229,7 @@ func (ec *executionContext) _Job_exclusive(ctx context.Context, field graphql.Co }() resTmp, err := ec.ResolverMiddleware(ctx, func(rctx context.Context) (any, error) { ctx = rctx // use context from middleware stack in children - return ec.resolvers.Job().Exclusive(rctx, obj) + return obj.Shared, nil }) if err != nil { ec.Error(ctx, err) @@ -5243,19 +5241,19 @@ func (ec *executionContext) _Job_exclusive(ctx context.Context, field graphql.Co } return graphql.Null } - res := resTmp.(int) + res := resTmp.(string) fc.Result = res - return ec.marshalNInt2int(ctx, field.Selections, res) + return ec.marshalNString2string(ctx, field.Selections, res) } -func (ec *executionContext) fieldContext_Job_exclusive(_ context.Context, field graphql.CollectedField) (fc *graphql.FieldContext, err error) { +func (ec *executionContext) fieldContext_Job_shared(_ context.Context, field graphql.CollectedField) (fc *graphql.FieldContext, err error) { fc = &graphql.FieldContext{ Object: "Job", Field: field, - IsMethod: true, - IsResolver: true, + IsMethod: false, + IsResolver: false, Child: func(ctx context.Context, field graphql.CollectedField) (*graphql.FieldContext, error) { - return nil, errors.New("field of type Int does not have child fields") + return nil, errors.New("field of type String does not have child fields") }, } return fc, nil @@ -6404,8 +6402,8 @@ func (ec *executionContext) fieldContext_JobResultList_items(_ context.Context, return ec.fieldContext_Job_energy(ctx, field) case "SMT": return ec.fieldContext_Job_SMT(ctx, field) - case "exclusive": - return ec.fieldContext_Job_exclusive(ctx, field) + case "shared": + return ec.fieldContext_Job_shared(ctx, field) case "partition": return ec.fieldContext_Job_partition(ctx, field) case "arrayJobId": @@ -11042,8 +11040,8 @@ func (ec *executionContext) fieldContext_Query_job(ctx context.Context, field gr return ec.fieldContext_Job_energy(ctx, field) case "SMT": return ec.fieldContext_Job_SMT(ctx, field) - case "exclusive": - return ec.fieldContext_Job_exclusive(ctx, field) + case "shared": + return ec.fieldContext_Job_shared(ctx, field) case "partition": return ec.fieldContext_Job_partition(ctx, field) case "arrayJobId": @@ -16357,7 +16355,7 @@ func (ec *executionContext) unmarshalInputJobFilter(ctx context.Context, obj any asMap[k] = v } - fieldsInOrder := [...]string{"tags", "dbId", "jobId", "arrayJobId", "user", "project", "jobName", "cluster", "partition", "duration", "energy", "minRunningFor", "numNodes", "numAccelerators", "numHWThreads", "startTime", "state", "metricStats", "exclusive", "node"} + 
fieldsInOrder := [...]string{"tags", "dbId", "jobId", "arrayJobId", "user", "project", "jobName", "cluster", "partition", "duration", "energy", "minRunningFor", "numNodes", "numAccelerators", "numHWThreads", "startTime", "state", "metricStats", "shared", "node"} for _, k := range fieldsInOrder { v, ok := asMap[k] if !ok { @@ -16490,13 +16488,13 @@ func (ec *executionContext) unmarshalInputJobFilter(ctx context.Context, obj any return it, err } it.MetricStats = data - case "exclusive": - ctx := graphql.WithPathContext(ctx, graphql.NewPathWithField("exclusive")) - data, err := ec.unmarshalOInt2ᚖint(ctx, v) + case "shared": + ctx := graphql.WithPathContext(ctx, graphql.NewPathWithField("shared")) + data, err := ec.unmarshalOStringInput2ᚖgithubᚗcomᚋClusterCockpitᚋccᚑbackendᚋinternalᚋgraphᚋmodelᚐStringInput(ctx, v) if err != nil { return it, err } - it.Exclusive = data + it.Shared = data case "node": ctx := graphql.WithPathContext(ctx, graphql.NewPathWithField("node")) data, err := ec.unmarshalOStringInput2ᚖgithubᚗcomᚋClusterCockpitᚋccᚑbackendᚋinternalᚋgraphᚋmodelᚐStringInput(ctx, v) @@ -17397,42 +17395,11 @@ func (ec *executionContext) _Job(ctx context.Context, sel ast.SelectionSet, obj if out.Values[i] == graphql.Null { atomic.AddUint32(&out.Invalids, 1) } - case "exclusive": - field := field - - innerFunc := func(ctx context.Context, fs *graphql.FieldSet) (res graphql.Marshaler) { - defer func() { - if r := recover(); r != nil { - ec.Error(ctx, ec.Recover(ctx, r)) - } - }() - res = ec._Job_exclusive(ctx, field, obj) - if res == graphql.Null { - atomic.AddUint32(&fs.Invalids, 1) - } - return res + case "shared": + out.Values[i] = ec._Job_shared(ctx, field, obj) + if out.Values[i] == graphql.Null { + atomic.AddUint32(&out.Invalids, 1) } - - if field.Deferrable != nil { - dfs, ok := deferred[field.Deferrable.Label] - di := 0 - if ok { - dfs.AddField(field) - di = len(dfs.Values) - 1 - } else { - dfs = graphql.NewFieldSet([]graphql.CollectedField{field}) - deferred[field.Deferrable.Label] = dfs - } - dfs.Concurrently(di, func(ctx context.Context) graphql.Marshaler { - return innerFunc(ctx, dfs) - }) - - // don't run the out.Concurrently() call below - out.Values[i] = graphql.Null - continue - } - - out.Concurrently(i, func(ctx context.Context) graphql.Marshaler { return innerFunc(ctx, out) }) case "partition": out.Values[i] = ec._Job_partition(ctx, field, obj) if out.Values[i] == graphql.Null { diff --git a/internal/graph/model/models_gen.go b/internal/graph/model/models_gen.go index e9abf0d..accc344 100644 --- a/internal/graph/model/models_gen.go +++ b/internal/graph/model/models_gen.go @@ -69,7 +69,7 @@ type JobFilter struct { StartTime *config.TimeRange `json:"startTime,omitempty"` State []schema.JobState `json:"state,omitempty"` MetricStats []*MetricStatItem `json:"metricStats,omitempty"` - Exclusive *int `json:"exclusive,omitempty"` + Shared *StringInput `json:"shared,omitempty"` Node *StringInput `json:"node,omitempty"` } diff --git a/internal/graph/schema.resolvers.go b/internal/graph/schema.resolvers.go index 8868497..315f1a3 100644 --- a/internal/graph/schema.resolvers.go +++ b/internal/graph/schema.resolvers.go @@ -35,11 +35,6 @@ func (r *jobResolver) StartTime(ctx context.Context, obj *schema.Job) (*time.Tim return ×tamp, nil } -// Exclusive is the resolver for the exclusive field. -func (r *jobResolver) Exclusive(ctx context.Context, obj *schema.Job) (int, error) { - panic(fmt.Errorf("not implemented: Exclusive - exclusive")) -} - // Tags is the resolver for the tags field. 
func (r *jobResolver) Tags(ctx context.Context, obj *schema.Job) ([]*schema.Tag, error) { return r.Repo.GetTags(repository.GetUserFromContext(ctx), obj.ID) diff --git a/internal/memorystore/checkpoint.go b/internal/memorystore/checkpoint.go index 76a5472..adee443 100644 --- a/internal/memorystore/checkpoint.go +++ b/internal/memorystore/checkpoint.go @@ -380,7 +380,7 @@ func (m *MemoryStore) FromCheckpointFiles(dir string, from int64) (int, error) { if err != nil { log.Fatalf("[METRICSTORE]> Error creating directory: %#v\n", err) } - fmt.Printf("[METRICSTORE]> %#v Directory created successfully.\n", dir) + log.Printf("[METRICSTORE]> %#v Directory created successfully.\n", dir) } // Config read (replace with your actual config read) diff --git a/internal/memorystore/lineprotocol.go b/internal/memorystore/lineprotocol.go index e12b9e2..495197d 100644 --- a/internal/memorystore/lineprotocol.go +++ b/internal/memorystore/lineprotocol.go @@ -2,10 +2,8 @@ package memorystore import ( "context" - "errors" "fmt" "log" - "net" "sync" "time" @@ -17,67 +15,67 @@ import ( ) // Each connection is handled in it's own goroutine. This is a blocking function. -func ReceiveRaw(ctx context.Context, - listener net.Listener, - handleLine func(*lineprotocol.Decoder, string) error, -) error { - var wg sync.WaitGroup +// func ReceiveRaw(ctx context.Context, +// listener net.Listener, +// handleLine func(*lineprotocol.Decoder, string) error, +// ) error { +// var wg sync.WaitGroup - wg.Add(1) - go func() { - defer wg.Done() - <-ctx.Done() - if err := listener.Close(); err != nil { - log.Printf("listener.Close(): %s", err.Error()) - } - }() +// wg.Add(1) +// go func() { +// defer wg.Done() +// <-ctx.Done() +// if err := listener.Close(); err != nil { +// log.Printf("listener.Close(): %s", err.Error()) +// } +// }() - for { - conn, err := listener.Accept() - if err != nil { - if errors.Is(err, net.ErrClosed) { - break - } +// for { +// conn, err := listener.Accept() +// if err != nil { +// if errors.Is(err, net.ErrClosed) { +// break +// } - log.Printf("listener.Accept(): %s", err.Error()) - } +// log.Printf("listener.Accept(): %s", err.Error()) +// } - wg.Add(2) - go func() { - defer wg.Done() - defer conn.Close() +// wg.Add(2) +// go func() { +// defer wg.Done() +// defer conn.Close() - dec := lineprotocol.NewDecoder(conn) - connctx, cancel := context.WithCancel(context.Background()) - defer cancel() - go func() { - defer wg.Done() - select { - case <-connctx.Done(): - conn.Close() - case <-ctx.Done(): - conn.Close() - } - }() +// dec := lineprotocol.NewDecoder(conn) +// connctx, cancel := context.WithCancel(context.Background()) +// defer cancel() +// go func() { +// defer wg.Done() +// select { +// case <-connctx.Done(): +// conn.Close() +// case <-ctx.Done(): +// conn.Close() +// } +// }() - if err := handleLine(dec, "default"); err != nil { - if errors.Is(err, net.ErrClosed) { - return - } +// if err := handleLine(dec, "default"); err != nil { +// if errors.Is(err, net.ErrClosed) { +// return +// } - log.Printf("%s: %s", conn.RemoteAddr().String(), err.Error()) - errmsg := make([]byte, 128) - errmsg = append(errmsg, `error: `...) - errmsg = append(errmsg, err.Error()...) - errmsg = append(errmsg, '\n') - conn.Write(errmsg) - } - }() - } +// log.Printf("%s: %s", conn.RemoteAddr().String(), err.Error()) +// errmsg := make([]byte, 128) +// errmsg = append(errmsg, `error: `...) +// errmsg = append(errmsg, err.Error()...) 
+// errmsg = append(errmsg, '\n')
+// conn.Write(errmsg)
+// }
+// }()
+// }
- wg.Wait()
- return nil
-}
+// wg.Wait()
+// return nil
+// }
// Connect to a nats server and subscribe to "updates". This is a blocking
// function. handleLine will be called for each line received via nats.
@@ -113,7 +111,7 @@ func ReceiveNats(conf *(config.NatsConfig),
 if workers > 1 {
 wg.Add(workers)
- for i := 0; i < workers; i++ {
+ for range workers {
 go func() {
 for m := range msgs {
 dec := lineprotocol.NewDecoderWithBytes(m.Data)
diff --git a/internal/memorystore/memorystore.go b/internal/memorystore/memorystore.go
index efa4065..4a631c2 100644
--- a/internal/memorystore/memorystore.go
+++ b/internal/memorystore/memorystore.go
@@ -47,7 +47,7 @@ type MemoryStore struct {
 root Level
 }
-func Init(wg sync.WaitGroup) {
+func Init(wg *sync.WaitGroup) {
 startupTime := time.Now()
 //Pass the config.MetricStoreKeys
@@ -82,10 +82,10 @@ func Init(wg sync.WaitGroup) {
 wg.Add(4)
- Retention(&wg, ctx)
- Checkpointing(&wg, ctx)
- Archiving(&wg, ctx)
- avro.DataStaging(&wg, ctx)
+ Retention(wg, ctx)
+ Checkpointing(wg, ctx)
+ Archiving(wg, ctx)
+ avro.DataStaging(wg, ctx)
 wg.Add(1)
 sigs := make(chan os.Signal, 1)
@@ -337,12 +337,12 @@ func (m *MemoryStore) WriteToLevel(l *Level, selector []string, ts int64, metric
// the range asked for if no data was available.
func (m *MemoryStore) Read(selector util.Selector, metric string, from, to, resolution int64) ([]schema.Float, int64, int64, int64, error) {
 if from > to {
- return nil, 0, 0, 0, errors.New("[METRICSTORE]> invalid time range")
+ return nil, 0, 0, 0, errors.New("[METRICSTORE]> invalid time range\n")
 }
 minfo, ok := m.Metrics[metric]
 if !ok {
- return nil, 0, 0, 0, errors.New("[METRICSTORE]> unkown metric: " + metric)
+ return nil, 0, 0, 0, errors.New("[METRICSTORE]> unknown metric: " + metric + "\n")
 }
 n, data := 0, make([]schema.Float, (to-from)/minfo.Frequency+1)
@@ -381,7 +381,7 @@ func (m *MemoryStore) Read(selector util.Selector, metric string, from, to, reso
 if err != nil {
 return nil, 0, 0, 0, err
 } else if n == 0 {
- return nil, 0, 0, 0, errors.New("[METRICSTORE]> metric or host not found")
+ return nil, 0, 0, 0, errors.New("[METRICSTORE]> metric or host not found\n")
 } else if n > 1 {
 if minfo.Aggregation == config.AvgAggregation {
 normalize := 1.
/ schema.Float(n) diff --git a/internal/repository/job.go b/internal/repository/job.go index dd40ebc..68778e1 100644 --- a/internal/repository/job.go +++ b/internal/repository/job.go @@ -52,18 +52,18 @@ func GetJobRepository() *JobRepository { } var jobColumns []string = []string{ - "job.id", "job.job_id", "job.hpc_user", "job.project", "job.cluster", "job.subcluster", + "job.id", "job.job_id", "job.hpc_user", "job.project", "job.hpc_cluster", "job.subcluster", "job.start_time", "job.cluster_partition", "job.array_job_id", "job.num_nodes", - "job.num_hwthreads", "job.num_acc", "job.exclusive", "job.monitoring_status", + "job.num_hwthreads", "job.num_acc", "job.shared", "job.monitoring_status", "job.smt", "job.job_state", "job.duration", "job.walltime", "job.resources", "job.footprint", "job.energy", } var jobCacheColumns []string = []string{ - "job_cache.id", "job_cache.job_id", "job_cache.hpc_user", "job_cache.project", "job_cache.cluster", + "job_cache.id", "job_cache.job_id", "job_cache.hpc_user", "job_cache.project", "job_cache.hpc_cluster", "job_cache.subcluster", "job_cache.start_time", "job_cache.cluster_partition", "job_cache.array_job_id", "job_cache.num_nodes", "job_cache.num_hwthreads", - "job_cache.num_acc", "job_cache.exclusive", "job_cache.monitoring_status", "job_cache.smt", + "job_cache.num_acc", "job_cache.shared", "job_cache.monitoring_status", "job_cache.smt", "job_cache.job_state", "job_cache.duration", "job_cache.walltime", "job_cache.resources", "job_cache.footprint", "job_cache.energy", } @@ -390,7 +390,7 @@ func (r *JobRepository) Partitions(cluster string) ([]string, error) { start := time.Now() partitions := r.cache.Get("partitions:"+cluster, func() (any, time.Duration, int) { parts := []string{} - if err = r.DB.Select(&parts, `SELECT DISTINCT job.cluster_partition FROM job WHERE job.cluster = ?;`, cluster); err != nil { + if err = r.DB.Select(&parts, `SELECT DISTINCT job.cluster_partition FROM job WHERE job.hpc_cluster = ?;`, cluster); err != nil { return nil, 0, 1000 } @@ -410,7 +410,7 @@ func (r *JobRepository) AllocatedNodes(cluster string) (map[string]map[string]in subclusters := make(map[string]map[string]int) rows, err := sq.Select("resources", "subcluster").From("job"). Where("job.job_state = 'running'"). - Where("job.cluster = ?", cluster). + Where("job.hpc_cluster = ?", cluster). RunWith(r.stmtCache).Query() if err != nil { cclog.Error("Error while running query") @@ -505,7 +505,7 @@ func (r *JobRepository) FindJobIdsByTag(tagId int64) ([]int64, error) { // FIXME: Reconsider filtering short jobs with harcoded threshold func (r *JobRepository) FindRunningJobs(cluster string) ([]*schema.Job, error) { query := sq.Select(jobColumns...).From("job"). - Where(fmt.Sprintf("job.cluster = '%s'", cluster)). + Where(fmt.Sprintf("job.hpc_cluster = '%s'", cluster)). Where("job.job_state = 'running'"). 
Where("job.duration > 600") diff --git a/internal/repository/jobCreate.go b/internal/repository/jobCreate.go index 666313f..f43be58 100644 --- a/internal/repository/jobCreate.go +++ b/internal/repository/jobCreate.go @@ -70,7 +70,7 @@ func (r *JobRepository) SyncJobs() ([]*schema.Job, error) { } _, err = r.DB.Exec( - "INSERT INTO job (job_id, cluster, subcluster, start_time, hpc_user, project, cluster_partition, array_job_id, num_nodes, num_hwthreads, num_acc, exclusive, monitoring_status, smt, job_state, duration, walltime, footprint, energy, energy_footprint, resources, meta_data) SELECT job_id, cluster, subcluster, start_time, hpc_user, project, cluster_partition, array_job_id, num_nodes, num_hwthreads, num_acc, exclusive, monitoring_status, smt, job_state, duration, walltime, footprint, energy, energy_footprint, resources, meta_data FROM job_cache") + "INSERT INTO job (job_id, hpc_cluster, subcluster, start_time, hpc_user, project, cluster_partition, array_job_id, num_nodes, num_hwthreads, num_acc, shared, monitoring_status, smt, job_state, duration, walltime, footprint, energy, energy_footprint, resources, meta_data) SELECT job_id, hpc_cluster, subcluster, start_time, hpc_user, project, cluster_partition, array_job_id, num_nodes, num_hwthreads, num_acc, shared, monitoring_status, smt, job_state, duration, walltime, footprint, energy, energy_footprint, resources, meta_data FROM job_cache") if err != nil { cclog.Warnf("Error while Job sync: %v", err) return nil, err diff --git a/internal/repository/jobFind.go b/internal/repository/jobFind.go index 39519d5..3abce8c 100644 --- a/internal/repository/jobFind.go +++ b/internal/repository/jobFind.go @@ -31,7 +31,7 @@ func (r *JobRepository) Find( Where("job.job_id = ?", *jobId) if cluster != nil { - q = q.Where("job.cluster = ?", *cluster) + q = q.Where("job.hpc_cluster = ?", *cluster) } if startTime != nil { q = q.Where("job.start_time = ?", *startTime) @@ -52,7 +52,7 @@ func (r *JobRepository) FindCached( Where("job_cache.job_id = ?", *jobId) if cluster != nil { - q = q.Where("job_cache.cluster = ?", *cluster) + q = q.Where("job_cache.hpc_cluster = ?", *cluster) } if startTime != nil { q = q.Where("job_cache.start_time = ?", *startTime) @@ -78,7 +78,7 @@ func (r *JobRepository) FindAll( Where("job.job_id = ?", *jobId) if cluster != nil { - q = q.Where("job.cluster = ?", *cluster) + q = q.Where("job.hpc_cluster = ?", *cluster) } if startTime != nil { q = q.Where("job.start_time = ?", *startTime) @@ -183,7 +183,7 @@ func (r *JobRepository) FindByJobId(ctx context.Context, jobId int64, startTime q := sq.Select(jobColumns...). From("job"). Where("job.job_id = ?", jobId). - Where("job.cluster = ?", cluster). + Where("job.hpc_cluster = ?", cluster). Where("job.start_time = ?", startTime) q, qerr := SecurityCheck(ctx, q) @@ -203,7 +203,7 @@ func (r *JobRepository) IsJobOwner(jobId int64, startTime int64, user string, cl From("job"). Where("job.job_id = ?", jobId). Where("job.hpc_user = ?", user). - Where("job.cluster = ?", cluster). + Where("job.hpc_cluster = ?", cluster). 
Where("job.start_time = ?", startTime) _, err := scanJob(q.RunWith(r.stmtCache).QueryRow()) diff --git a/internal/repository/jobQuery.go b/internal/repository/jobQuery.go index fdcc904..19cdd9a 100644 --- a/internal/repository/jobQuery.go +++ b/internal/repository/jobQuery.go @@ -168,7 +168,7 @@ func BuildWhereClause(filter *model.JobFilter, query sq.SelectBuilder) sq.Select query = buildMetaJsonCondition("jobName", filter.JobName, query) } if filter.Cluster != nil { - query = buildStringCondition("job.cluster", filter.Cluster, query) + query = buildStringCondition("job.hpc_cluster", filter.Cluster, query) } if filter.Partition != nil { query = buildStringCondition("job.cluster_partition", filter.Partition, query) @@ -183,8 +183,8 @@ func BuildWhereClause(filter *model.JobFilter, query sq.SelectBuilder) sq.Select now := time.Now().Unix() // There does not seam to be a portable way to get the current unix timestamp accross different DBs. query = query.Where("(job.job_state != 'running' OR (? - job.start_time) > ?)", now, *filter.MinRunningFor) } - if filter.Exclusive != nil { - query = query.Where("job.exclusive = ?", *filter.Exclusive) + if filter.Shared != nil { + query = query.Where("job.shared = ?", *filter.Shared) } if filter.State != nil { states := make([]string, len(filter.State)) diff --git a/internal/repository/stats.go b/internal/repository/stats.go index 7beb674..25c862f 100644 --- a/internal/repository/stats.go +++ b/internal/repository/stats.go @@ -23,7 +23,7 @@ import ( var groupBy2column = map[model.Aggregate]string{ model.AggregateUser: "job.hpc_user", model.AggregateProject: "job.project", - model.AggregateCluster: "job.cluster", + model.AggregateCluster: "job.hpc_cluster", } var sortBy2column = map[model.SortByAggregate]string{ diff --git a/internal/tagger/jobclasses/highload.json b/internal/tagger/jobclasses/highload.json index 0d16b45..9667011 100644 --- a/internal/tagger/jobclasses/highload.json +++ b/internal/tagger/jobclasses/highload.json @@ -8,7 +8,7 @@ ], "metrics": ["cpu_load"], "requirements": [ - "job.exclusive == 1", + "job.shared == \"none\"", "job.duration > job_min_duration_seconds" ], "variables": [ diff --git a/internal/tagger/jobclasses/lowUtilization.json b/internal/tagger/jobclasses/lowUtilization.json index 9613b06..e84b81d 100644 --- a/internal/tagger/jobclasses/lowUtilization.json +++ b/internal/tagger/jobclasses/lowUtilization.json @@ -4,7 +4,7 @@ "parameters": ["job_min_duration_seconds"], "metrics": ["flops_any", "mem_bw"], "requirements": [ - "job.exclusive == 1", + "job.shared == \"none\"", "job.duration > job_min_duration_seconds" ], "variables": [ diff --git a/internal/tagger/jobclasses/lowload.json b/internal/tagger/jobclasses/lowload.json index 2212bd1..f952da5 100644 --- a/internal/tagger/jobclasses/lowload.json +++ b/internal/tagger/jobclasses/lowload.json @@ -8,7 +8,7 @@ ], "metrics": ["cpu_load"], "requirements": [ - "job.exclusive == 1", + "job.shared == \"none\"", "job.duration > job_min_duration_seconds" ], "variables": [ diff --git a/internal/taskManager/commitJobService.go b/internal/taskManager/commitJobService.go index e7c169a..88c2708 100644 --- a/internal/taskManager/commitJobService.go +++ b/internal/taskManager/commitJobService.go @@ -26,9 +26,9 @@ func RegisterCommitJobService() { gocron.NewTask( func() { start := time.Now() - cclog.Printf("Jobcache sync started at %s", start.Format(time.RFC3339)) + cclog.Printf("Jobcache sync started at %s\n", start.Format(time.RFC3339)) jobs, _ := jobRepo.SyncJobs() 
repository.CallJobStartHooks(jobs) - cclog.Printf("Jobcache sync and job callbacks are done and took %s", time.Since(start)) + cclog.Printf("Jobcache sync and job callbacks are done and took %s\n", time.Since(start)) })) } diff --git a/internal/taskManager/taskManager.go b/internal/taskManager/taskManager.go index df6c4d0..35d6ea5 100644 --- a/internal/taskManager/taskManager.go +++ b/internal/taskManager/taskManager.go @@ -7,7 +7,6 @@ package taskManager import ( "bytes" "encoding/json" - "fmt" "time" "github.com/ClusterCockpit/cc-backend/internal/auth" @@ -66,10 +65,6 @@ func Start(cronCfg, archiveConfig json.RawMessage) { RegisterStopJobsExceedTime() } - fmt.Printf("Keys : %#v\n", Keys) - fmt.Printf("cronCfg : %#v\n", cronCfg) - fmt.Printf("archiveConfig : %#v\n", archiveConfig) - dec := json.NewDecoder(bytes.NewReader(cronCfg)) dec.DisallowUnknownFields() if err := dec.Decode(&Keys); err != nil { diff --git a/internal/taskManager/updateDurationService.go b/internal/taskManager/updateDurationService.go index d650afb..53882f0 100644 --- a/internal/taskManager/updateDurationService.go +++ b/internal/taskManager/updateDurationService.go @@ -25,8 +25,8 @@ func RegisterUpdateDurationWorker() { gocron.NewTask( func() { start := time.Now() - cclog.Printf("Update duration started at %s", start.Format(time.RFC3339)) + cclog.Printf("Update duration started at %s\n", start.Format(time.RFC3339)) jobRepo.UpdateDuration() - cclog.Printf("Update duration is done and took %s", time.Since(start)) + cclog.Printf("Update duration is done and took %s\n", time.Since(start)) })) } diff --git a/internal/taskManager/updateFootprintService.go b/internal/taskManager/updateFootprintService.go index 4025849..2ce9901 100644 --- a/internal/taskManager/updateFootprintService.go +++ b/internal/taskManager/updateFootprintService.go @@ -134,8 +134,8 @@ func RegisterFootprintWorker() { } jobRepo.TransactionEnd(t) } - cclog.Debugf("Finish Cluster %s, took %s", cluster.Name, time.Since(s_cluster)) + cclog.Debugf("Finish Cluster %s, took %s\n", cluster.Name, time.Since(s_cluster)) } - cclog.Printf("Updating %d (of %d; Skipped %d) Footprints is done and took %s", c, cl, ce, time.Since(s)) + cclog.Printf("Updating %d (of %d; Skipped %d) Footprints is done and took %s\n", c, cl, ce, time.Since(s)) })) } diff --git a/startDemo.sh b/startDemo.sh index faf6d35..b027bf5 100755 --- a/startDemo.sh +++ b/startDemo.sh @@ -12,6 +12,41 @@ else cp ./configs/env-template.txt .env cp ./configs/config-demo.json config.json + # mkdir -p ./var/checkpoints + # cp -rf ~/cc-metric-store/var/checkpoints ~/cc-backend/var + ./cc-backend -migrate-db - ./cc-backend -server -dev -init-db -add-user demo:admin:demo + ./cc-backend -dev -init-db -add-user demo:admin,api:demo + + # --- begin: generate JWT for demo and update test_ccms_write_api.sh --- + CC_BIN="./cc-backend" + TEST_FILE="./test_ccms_write_api.sh" + BACKUP_FILE="${TEST_FILE}.bak" + + if [ -x "$CC_BIN" ]; then + echo "Generating JWT for user 'demo'..." + output="$($CC_BIN -jwt demo 2>&1 || true)" + token="$(printf '%s\n' "$output" | grep -oE '[A-Za-z0-9_-]+\.[A-Za-z0-9_-]+\.[A-Za-z0-9_-]+' | head -n1 || true)" + + if [ -z "$token" ]; then + echo "Warning: could not extract JWT from output:" >&2 + printf '%s\n' "$output" >&2 + else + if [ -f "$TEST_FILE" ]; then + cp -a "$TEST_FILE" "$BACKUP_FILE" + # replace first line with JWT="..." 
+ sed -i "1s#.*#JWT=\"$token\"#" "$TEST_FILE" + echo "Updated JWT in $TEST_FILE (backup at $BACKUP_FILE)" + else + echo "Warning: $TEST_FILE not found; JWT not written." + fi + fi + else + echo "Warning: $CC_BIN not found or not executable; skipping JWT generation." + fi + # --- end: generate JWT for demo and update test_ccms_write_api.sh --- + + + ./cc-backend -server -dev + fi diff --git a/test_ccms_write_api.sh.bak b/test_ccms_write_api.sh.bak new file mode 100755 index 0000000..f76322f --- /dev/null +++ b/test_ccms_write_api.sh.bak @@ -0,0 +1,110 @@ +JWT="eyJhbGciOiJFZERTQSIsInR5cCI6IkpXVCJ9.eyJleHAiOjE3NjQ1NjMzOTUsImlhdCI6MTc1NzM2MzM5NSwicm9sZXMiOlsiYWRtaW4iLCJhcGkiXSwic3ViIjoiZGVtbyJ9.uhtEbS-ty4xNc8GWTKjyh1b06j6b3vtEw7lzQy0Eht5LtISZwRfyRBfdKjbm_t25xGrNH9sxINq4qiYKBjAaDQ" + +# curl -X 'POST' 'http://localhost:8080/metricstore/api/write/?cluster=alex' -H "Authorization: Bearer $JWT" -d $'cpu_load,cluster=alex,hostname=a042,type=hwthread,type-id=0 value=35.0 1725827464642231296' + +rm sample_fritz.txt +rm sample_alex.txt + +while [ true ]; do + echo "Alex Metrics for hwthread types and type-ids" + timestamp="$(date '+%s')" + echo "Timestamp : "+$timestamp + for metric in cpu_load cpu_user flops_any cpu_irq cpu_system ipc cpu_idle cpu_iowait core_power clock; do + for hostname in a0603 a0903 a0832 a0329 a0702 a0122 a1624 a0731 a0224 a0704 a0631 a0225 a0222 a0427 a0603 a0429 a0833 a0705 a0901 a0601 a0227 a0804 a0322 a0226 a0126 a0129 a0605 a0801 a0934 a1622 a0902 a0428 a0537 a1623 a1722 a0228 a0701 a0326 a0327 a0123 a0321 a1621 a0323 a0124 a0534 a0931 a0324 a0933 a0424 a0905 a0128 a0532 a0805 a0521 a0535 a0932 a0127 a0325 a0633 a0831 a0803 a0426 a0425 a0229 a1721 a0602 a0632 a0223 a0422 a0423 a0536 a0328 a0703 anvme7 a0125 a0221 a0604 a0802 a0522 a0531 a0533 a0904; do + for id in {0..127}; do + echo "$metric,cluster=alex,hostname=$hostname,type=hwthread,type-id=$id value=$((1 + RANDOM % 100)).0 $timestamp" >>sample_alex.txt + done + done + done + + curl -X 'POST' 'http://localhost:8080/metricstore/api/write/?cluster=alex' -H "Authorization: Bearer $JWT" --data-binary @sample_alex.txt + + echo "Fritz Metrics for hwthread types and type-ids" + for metric in cpu_load cpu_user flops_any cpu_irq cpu_system ipc cpu_idle cpu_iowait core_power clock; do + for hostname in f0201 f0202 f0203 f0204 f0205 f0206 f0207 f0208 f0209 f0210 f0211 f0212 f0213 f0214 f0215 f0217 f0218 f0219 f0220 f0221 f0222 f0223 f0224 f0225 f0226 f0227 f0228 f0229 f0230 f0231 f0232 f0233 f0234 f0235 f0236 f0237 f0238 f0239 f0240 f0241 f0242 f0243 f0244 f0245 f0246 f0247 f0248 f0249 f0250 f0251 f0252 f0253 f0254 f0255 f0256 f0257 f0258 f0259 f0260 f0261 f0262 f0263 f0264 f0378; do + for id in {0..71}; do + echo "$metric,cluster=fritz,hostname=$hostname,type=hwthread,type-id=$id value=$((1 + RANDOM % 100)).0 $timestamp" >>sample_fritz.txt + done + done + done + + curl -X 'POST' 'http://localhost:8080/metricstore/api/write/?cluster=fritz' -H "Authorization: Bearer $JWT" --data-binary @sample_fritz.txt + + rm sample_fritz.txt + rm sample_alex.txt + + echo "Alex Metrics for accelerator types and type-ids" + for metric in cpu_load cpu_user flops_any cpu_irq cpu_system ipc cpu_idle cpu_iowait core_power clock; do + for hostname in a0603 a0903 a0832 a0329 a0702 a0122 a1624 a0731 a0224 a0704 a0631 a0225 a0222 a0427 a0603 a0429 a0833 a0705 a0901 a0601 a0227 a0804 a0322 a0226 a0126 a0129 a0605 a0801 a0934 a1622 a0902 a0428 a0537 a1623 a1722 a0228 a0701 a0326 a0327 a0123 a0321 a1621 a0323 a0124 a0534 a0931 a0324 a0933 a0424 a0905 a0128 
a0532 a0805 a0521 a0535 a0932 a0127 a0325 a0633 a0831 a0803 a0426 a0425 a0229 a1721 a0602 a0632 a0223 a0422 a0423 a0536 a0328 a0703 anvme7 a0125 a0221 a0604 a0802 a0522 a0531 a0533 a0904; do + for id in 00000000:49:00.0 00000000:0E:00.0 00000000:D1:00.0 00000000:90:00.0 00000000:13:00.0 00000000:96:00.0 00000000:CC:00.0 00000000:4F:00.0; do + echo "$metric,cluster=alex,hostname=$hostname,type=accelerator,type-id=$id value=$((1 + RANDOM % 100)).0 $timestamp" >>sample_alex.txt + done + done + done + + curl -X 'POST' 'http://localhost:8080/metricstore/api/write/?cluster=alex' -H "Authorization: Bearer $JWT" --data-binary @sample_alex.txt + + rm sample_alex.txt + + echo "Alex Metrics for memoryDomain types and type-ids" + for metric in cpu_load cpu_user flops_any cpu_irq cpu_system ipc cpu_idle cpu_iowait core_power clock; do + for hostname in a0603 a0903 a0832 a0329 a0702 a0122 a1624 a0731 a0224 a0704 a0631 a0225 a0222 a0427 a0603 a0429 a0833 a0705 a0901 a0601 a0227 a0804 a0322 a0226 a0126 a0129 a0605 a0801 a0934 a1622 a0902 a0428 a0537 a1623 a1722 a0228 a0701 a0326 a0327 a0123 a0321 a1621 a0323 a0124 a0534 a0931 a0324 a0933 a0424 a0905 a0128 a0532 a0805 a0521 a0535 a0932 a0127 a0325 a0633 a0831 a0803 a0426 a0425 a0229 a1721 a0602 a0632 a0223 a0422 a0423 a0536 a0328 a0703 anvme7 a0125 a0221 a0604 a0802 a0522 a0531 a0533 a0904; do + for id in {0..7}; do + echo "$metric,cluster=alex,hostname=$hostname,type=memoryDomain,type-id=$id value=$((1 + RANDOM % 100)).0 $timestamp" >>sample_alex.txt + done + done + done + + curl -X 'POST' 'http://localhost:8080/metricstore/api/write/?cluster=alex' -H "Authorization: Bearer $JWT" --data-binary @sample_alex.txt + + rm sample_alex.txt + + echo "Alex Metrics for socket types and type-ids" + for metric in cpu_load cpu_user flops_any cpu_irq cpu_system ipc cpu_idle cpu_iowait core_power clock; do + for hostname in a0603 a0903 a0832 a0329 a0702 a0122 a1624 a0731 a0224 a0704 a0631 a0225 a0222 a0427 a0603 a0429 a0833 a0705 a0901 a0601 a0227 a0804 a0322 a0226 a0126 a0129 a0605 a0801 a0934 a1622 a0902 a0428 a0537 a1623 a1722 a0228 a0701 a0326 a0327 a0123 a0321 a1621 a0323 a0124 a0534 a0931 a0324 a0933 a0424 a0905 a0128 a0532 a0805 a0521 a0535 a0932 a0127 a0325 a0633 a0831 a0803 a0426 a0425 a0229 a1721 a0602 a0632 a0223 a0422 a0423 a0536 a0328 a0703 anvme7 a0125 a0221 a0604 a0802 a0522 a0531 a0533 a0904; do + for id in {0..1}; do + echo "$metric,cluster=alex,hostname=$hostname,type=socket,type-id=$id value=$((1 + RANDOM % 100)).0 $timestamp" >>sample_alex.txt + done + done + done + + curl -X 'POST' 'http://localhost:8080/metricstore/api/write/?cluster=alex' -H "Authorization: Bearer $JWT" --data-binary @sample_alex.txt + + echo "Fritz Metrics for socket types and type-ids" + for metric in cpu_load cpu_user flops_any cpu_irq cpu_system ipc cpu_idle cpu_iowait core_power clock; do + for hostname in f0201 f0202 f0203 f0204 f0205 f0206 f0207 f0208 f0209 f0210 f0211 f0212 f0213 f0214 f0215 f0217 f0218 f0219 f0220 f0221 f0222 f0223 f0224 f0225 f0226 f0227 f0228 f0229 f0230 f0231 f0232 f0233 f0234 f0235 f0236 f0237 f0238 f0239 f0240 f0241 f0242 f0243 f0244 f0245 f0246 f0247 f0248 f0249 f0250 f0251 f0252 f0253 f0254 f0255 f0256 f0257 f0258 f0259 f0260 f0261 f0262 f0263 f0264 f0378; do + for id in {0..1}; do + echo "$metric,cluster=fritz,hostname=$hostname,type=socket,type-id=$id value=$((1 + RANDOM % 100)).0 $timestamp" >>sample_fritz.txt + done + done + done + + curl -X 'POST' 'http://localhost:8080/metricstore/api/write/?cluster=fritz' -H "Authorization: Bearer $JWT" 
--data-binary @sample_fritz.txt + + rm sample_fritz.txt + rm sample_alex.txt + + echo "Alex Metrics for nodes" + for metric in cpu_irq cpu_load mem_cached net_bytes_in cpu_user cpu_idle nfs4_read mem_used nfs4_write nfs4_total ib_xmit ib_xmit_pkts net_bytes_out cpu_iowait ib_recv cpu_system ib_recv_pkts; do + for hostname in a0603 a0903 a0832 a0329 a0702 a0122 a1624 a0731 a0224 a0704 a0631 a0225 a0222 a0427 a0603 a0429 a0833 a0705 a0901 a0601 a0227 a0804 a0322 a0226 a0126 a0129 a0605 a0801 a0934 a1622 a0902 a0428 a0537 a1623 a1722 a0228 a0701 a0326 a0327 a0123 a0321 a1621 a0323 a0124 a0534 a0931 a0324 a0933 a0424 a0905 a0128 a0532 a0805 a0521 a0535 a0932 a0127 a0325 a0633 a0831 a0803 a0426 a0425 a0229 a1721 a0602 a0632 a0223 a0422 a0423 a0536 a0328 a0703 anvme7 a0125 a0221 a0604 a0802 a0522 a0531 a0533 a0904; do + echo "$metric,cluster=alex,hostname=$hostname,type=node value=$((1 + RANDOM % 100)).0 $timestamp" >>sample_alex.txt + done + done + + curl -X 'POST' 'http://localhost:8080/metricstore/api/write/?cluster=alex' -H "Authorization: Bearer $JWT" --data-binary @sample_alex.txt + + echo "Fritz Metrics for nodes" + for metric in cpu_irq cpu_load mem_cached net_bytes_in cpu_user cpu_idle nfs4_read mem_used nfs4_write nfs4_total ib_xmit ib_xmit_pkts net_bytes_out cpu_iowait ib_recv cpu_system ib_recv_pkts; do + for hostname in f0201 f0202 f0203 f0204 f0205 f0206 f0207 f0208 f0209 f0210 f0211 f0212 f0213 f0214 f0215 f0217 f0218 f0219 f0220 f0221 f0222 f0223 f0224 f0225 f0226 f0227 f0228 f0229 f0230 f0231 f0232 f0233 f0234 f0235 f0236 f0237 f0238 f0239 f0240 f0241 f0242 f0243 f0244 f0245 f0246 f0247 f0248 f0249 f0250 f0251 f0252 f0253 f0254 f0255 f0256 f0257 f0258 f0259 f0260 f0261 f0262 f0263 f0264 f0378; do + echo "$metric,cluster=fritz,hostname=$hostname,type=node value=$((1 + RANDOM % 100)).0 $timestamp" >>sample_fritz.txt + done + done + + curl -X 'POST' 'http://localhost:8080/metricstore/api/write/?cluster=fritz' -H "Authorization: Bearer $JWT" --data-binary @sample_fritz.txt + + rm sample_fritz.txt + rm sample_alex.txt + + sleep 1m +done +# curl -X 'POST' 'http://localhost:8081/api/write/?cluster=alex' -H "Authorization: Bearer $JWT" -d $'cpu_load,cluster=alex,hostname=a042,type=hwthread,type-id=0 value=35.0 1725827464642231296' \ No newline at end of file From 39f21763e4ca13de09d447475de5c16caf3f3b6e Mon Sep 17 00:00:00 2001 From: Jan Eitzinger Date: Tue, 9 Sep 2025 11:30:20 +0200 Subject: [PATCH 05/11] Revert test database --- internal/repository/testdata/job.db | Bin 118784 -> 118784 bytes 1 file changed, 0 insertions(+), 0 deletions(-) diff --git a/internal/repository/testdata/job.db b/internal/repository/testdata/job.db index e9e20cebc366222e58e1ba375f78b4b57a3dd444..43ec9d3c7f36c7ea505a96cc1208c4ce7a148eed 100644 GIT binary patch delta 919 zcmZ`%O=uHA7@f(^?q-usU)Cf>X+z?no{A(|D|+%IUc|E(LyV~g8%?Cp(u0Wy!GfU8 zA_3_k1i`zj_SjxZ3PprakyHgSwHGmg+C!m&&h9pwfaxx?`@Q$QZ+_lkxh5D#*tG>vLl=4E-_dh5G^Y4SYU&sF9gxkZXC4NHi6CbgRS9ppU zld>OdEyet_Ei*#ooeB@5ONnFjl3Wan;ZTH5-p@|YCgwlI)&1x>ADX@fnn=?g`JpurtjgWkA$TB2wZv^m~_-cBC% zSO?t2E;<%BA}YK{RV4nE6y{j)8~o*p@9;}h;7K2qT(+iG@*7&bgN3SAtHy$TCEIA? zJyu}w!bue82f4!MC=B35&R^^Dst}=$0sX+`6(UMEVXWByt+xUtN ztn&@laQ%u?&R4&90Netj=^w#8sT+vhfLTFnQx@!?x#N-rrlecViethrY@&{Lc!@G! 
z;{ht`NqV+YPSW!|quTh4HM~I;Pq=0l#Q+OO?FLHU%jKss)0vyn08Sjhu_G)LwVK8$ s75=AO8dq8E;;5V*xA4|W|12N(at};0$j1mDyw8c*P(w?x75WYT01oNs3;+NC delta 2449 zcmeHJU1%d!6rMXXnVC$R+{Bp1!Is?qY?DB`Hc4xtyP(||+-hmDrlJp;y-jADtdpOe zOxj&RwyTKiJ`~qMim*b_Hy^iZUW8R}Uls%@E3)8&KlmoA)hAh{-g_sFiLKzPumfl2 z-gCb5Gv|DBll3imeM^4xLBGav+(1ht@;rVeo{DMQ(U;D;$PU1}M}Nx5B+ZMO}2**9dLzY)K z_%xWQOMdT_KY~bpP(j~Q(3h2b_KmgV0oiaw{#QPyY3LA z;Cf@ZQ|Dzm3!7HXDwTEnUDt?*s*1IIu2F9}_EJsFte(;H7K?O&Dq3N3HR4>dO4SN_ zW@a&ca%M4_oScj~>9J|z^SQ199YdjEV&-$O_x`oD6wCxevbCMw5qo-?R^^=N6CG#{fZLr>1dGsp>jl|?`Gdt21RPHL++L)>>hAT z$K4}9-9F`hpH~k*Cbk6Wp&lue9@zXP0Up>M?EOJ#iZ$>j)RZRYmNqze+k4CVj^{Pc z3(Bv`b!kJ{AYYUvO}zT@up9~o!Rya6!qZMs@pjXks=68Ou(4XVDwt4$(wMQalf#Gc zn(1gX#utY!&g5Hk>9BS2B|_L&&0|Y1MB*qyCa;^T&bm}EbgC=(KvEPe(=NLlw&N0Th|KT|X)FwCuWZy>$ c*t@RL|2X#%xt~ZskpK}t5nmI13ciJZ0j;Ni(f|Me From d8e85cf75d051c5ce1e878c4814c17635f8a7d0c Mon Sep 17 00:00:00 2001 From: Jan Eitzinger Date: Tue, 9 Sep 2025 11:35:34 +0200 Subject: [PATCH 06/11] Fix migration --- .../sqlite3/09_add-job-cache.up.sql | 28 +++++++++++++------ 1 file changed, 20 insertions(+), 8 deletions(-) diff --git a/internal/repository/migrations/sqlite3/09_add-job-cache.up.sql b/internal/repository/migrations/sqlite3/09_add-job-cache.up.sql index 2c25029..8c54622 100644 --- a/internal/repository/migrations/sqlite3/09_add-job-cache.up.sql +++ b/internal/repository/migrations/sqlite3/09_add-job-cache.up.sql @@ -69,17 +69,29 @@ CREATE TABLE "job_new" ( ); ALTER TABLE job RENAME COLUMN cluster TO hpc_cluster; + +CREATE TABLE IF NOT EXISTS lookup_exclusive ( + id INTEGER PRIMARY KEY, + name TEXT NOT NULL UNIQUE +); + +INSERT INTO lookup_exclusive (id, name) VALUES + (0, 'multi_user'), + (1, 'none'), + (2, 'single_user'); + INSERT INTO job_new ( - id, job_id, hpc_cluster, subcluster, submit_time, start_time, hpc_user, project, - cluster_partition, array_job_id, duration, walltime, job_state, meta_data, resources, - num_nodes, num_hwthreads, num_acc, smt, shared, monitoring_status, energy, + id, job_id, hpc_cluster, subcluster, submit_time, start_time, hpc_user, project, + cluster_partition, array_job_id, duration, walltime, job_state, meta_data, resources, + num_nodes, num_hwthreads, num_acc, smt, shared, monitoring_status, energy, energy_footprint, footprint -) -SELECT - id, job_id, hpc_cluster, subcluster, 0, start_time, hpc_user, project, - cluster_partition, array_job_id, duration, walltime, job_state, meta_data, resources, - num_nodes, num_hwthreads, num_acc, smt, exclusive, monitoring_status, energy, +) SELECT + id, job_id, hpc_cluster, subcluster, 0, start_time, hpc_user, project, + cluster_partition, array_job_id, duration, walltime, job_state, meta_data, resources, + num_nodes, num_hwthreads, num_acc, smt, (SELECT name FROM lookup_exclusive WHERE id=job.exclusive), monitoring_status, energy, energy_footprint, footprint FROM job; + +DROP TABLE lookup_exclusive; DROP TABLE job; ALTER TABLE job_new RENAME TO job; From d00881de2e96e3d5c4537cbc8e4cec8fbd5991d7 Mon Sep 17 00:00:00 2001 From: Jan Eitzinger Date: Tue, 9 Sep 2025 11:36:02 +0200 Subject: [PATCH 07/11] Refactor and update dependencies --- go.mod | 23 +++++++------- go.sum | 54 ++++++++++++++++----------------- internal/avro/avroCheckpoint.go | 23 +++++++------- internal/avro/avroHelper.go | 10 ++++-- internal/avro/avroStruct.go | 10 ++++-- 5 files changed, 63 insertions(+), 57 deletions(-) diff --git a/go.mod b/go.mod index e0add97..0725a30 100644 --- a/go.mod +++ b/go.mod @@ -6,10 +6,10 @@ toolchain go1.24.1 require ( github.com/99designs/gqlgen 
v0.17.78 - github.com/ClusterCockpit/cc-lib v0.7.0 + github.com/ClusterCockpit/cc-lib v0.8.0 github.com/Masterminds/squirrel v1.5.4 github.com/coreos/go-oidc/v3 v3.12.0 - github.com/expr-lang/expr v1.17.5 + github.com/expr-lang/expr v1.17.6 github.com/go-co-op/gocron/v2 v2.16.0 github.com/go-ldap/ldap/v3 v3.4.10 github.com/go-sql-driver/mysql v1.9.0 @@ -24,17 +24,17 @@ require ( github.com/joho/godotenv v1.5.1 github.com/linkedin/goavro/v2 v2.14.0 github.com/mattn/go-sqlite3 v1.14.24 - github.com/nats-io/nats.go v1.44.0 - github.com/prometheus/client_golang v1.23.0 - github.com/prometheus/common v0.65.0 + github.com/nats-io/nats.go v1.45.0 + github.com/prometheus/client_golang v1.23.2 + github.com/prometheus/common v0.66.1 github.com/qustavo/sqlhooks/v2 v2.1.0 github.com/santhosh-tekuri/jsonschema/v5 v5.3.1 github.com/swaggo/http-swagger v1.3.4 github.com/swaggo/swag v1.16.6 github.com/vektah/gqlparser/v2 v2.5.30 - golang.org/x/crypto v0.40.0 + golang.org/x/crypto v0.41.0 golang.org/x/oauth2 v0.30.0 - golang.org/x/time v0.5.0 + golang.org/x/time v0.12.0 ) require ( @@ -87,13 +87,12 @@ require ( go.yaml.in/yaml/v2 v2.4.2 // indirect golang.org/x/exp v0.0.0-20250620022241-b7579e27df2b // indirect golang.org/x/mod v0.26.0 // indirect - golang.org/x/net v0.42.0 // indirect + golang.org/x/net v0.43.0 // indirect golang.org/x/sync v0.16.0 // indirect - golang.org/x/sys v0.34.0 // indirect - golang.org/x/text v0.27.0 // indirect + golang.org/x/sys v0.35.0 // indirect + golang.org/x/text v0.28.0 // indirect golang.org/x/tools v0.35.0 // indirect - google.golang.org/protobuf v1.36.6 // indirect - gopkg.in/yaml.v2 v2.4.0 // indirect + google.golang.org/protobuf v1.36.8 // indirect gopkg.in/yaml.v3 v3.0.1 // indirect sigs.k8s.io/yaml v1.6.0 // indirect ) diff --git a/go.sum b/go.sum index 792ec1c..81ae22b 100644 --- a/go.sum +++ b/go.sum @@ -6,16 +6,16 @@ github.com/Azure/go-ansiterm v0.0.0-20230124172434-306776ec8161 h1:L/gRVlceqvL25 github.com/Azure/go-ansiterm v0.0.0-20230124172434-306776ec8161/go.mod h1:xomTg63KZ2rFqZQzSB4Vz2SUXa1BpHTVz9L5PTmPC4E= github.com/Azure/go-ntlmssp v0.0.0-20221128193559-754e69321358 h1:mFRzDkZVAjdal+s7s0MwaRv9igoPqLRdzOLzw/8Xvq8= github.com/Azure/go-ntlmssp v0.0.0-20221128193559-754e69321358/go.mod h1:chxPXzSsl7ZWRAuOIE23GDNzjWuZquvFlgA8xmpunjU= -github.com/ClusterCockpit/cc-lib v0.7.0 h1:THuSYrMcn9pSbrMditSI1LMOluq9TnM0/aVId4uK1Hc= -github.com/ClusterCockpit/cc-lib v0.7.0/go.mod h1:TD1PS8pL2RDvEWaqs8VNejoTSm5OawI9Dcc0CTY/yWQ= +github.com/ClusterCockpit/cc-lib v0.8.0 h1:kQRMOx30CJCy+Q6TgCK9rarJnJ/CKZPWlIEdIXYlxoA= +github.com/ClusterCockpit/cc-lib v0.8.0/go.mod h1:5xTwONu9pSp15mJ9CjBKGU9I3Jad8NfhrVHJZl50/yI= github.com/KyleBanks/depth v1.2.1 h1:5h8fQADFrWtarTdtDudMmGsC7GPbOAu6RVB3ffsVFHc= github.com/KyleBanks/depth v1.2.1/go.mod h1:jzSb9d0L43HxTQfT+oSA1EEp2q+ne2uh6XgeJcm8brE= github.com/Masterminds/squirrel v1.5.4 h1:uUcX/aBc8O7Fg9kaISIUsHXdKuqehiXAMQTYX8afzqM= github.com/Masterminds/squirrel v1.5.4/go.mod h1:NNaOrjSoIDfDA40n7sr2tPNZRfjzjA400rg+riTZj10= github.com/Microsoft/go-winio v0.6.2 h1:F2VQgta7ecxGYO8k3ZZz3RS8fVIXVxONVUPlNERoyfY= github.com/Microsoft/go-winio v0.6.2/go.mod h1:yd8OoFMLzJbo9gZq8j5qaps8bJ9aShtEA8Ipt1oGCvU= -github.com/NVIDIA/go-nvml v0.12.9-0 h1:e344UK8ZkeMeeLkdQtRhmXRxNf+u532LDZPGMtkdus0= -github.com/NVIDIA/go-nvml v0.12.9-0/go.mod h1:+KNA7c7gIBH7SKSJ1ntlwkfN80zdx8ovl4hrK3LmPt4= +github.com/NVIDIA/go-nvml v0.13.0-1 h1:OLX8Jq3dONuPOQPC7rndB6+iDmDakw0XTYgzMxObkEw= +github.com/NVIDIA/go-nvml v0.13.0-1/go.mod 
h1:+KNA7c7gIBH7SKSJ1ntlwkfN80zdx8ovl4hrK3LmPt4= github.com/PuerkitoBio/goquery v1.10.3 h1:pFYcNSqHxBD06Fpj/KsbStFRsgRATgnf3LeXiUkhzPo= github.com/PuerkitoBio/goquery v1.10.3/go.mod h1:tMUX0zDMHXYlAQk6p35XxQMqMweEKB7iK7iLNd4RH4Y= github.com/agnivade/levenshtein v1.2.1 h1:EHBY3UOn1gwdy/VbFwgo4cxecRznFk7fKWN1KOX7eoM= @@ -54,8 +54,8 @@ github.com/docker/go-connections v0.5.0 h1:USnMq7hx7gwdVZq1L49hLXaFtUdTADjXGp+uj github.com/docker/go-connections v0.5.0/go.mod h1:ov60Kzw0kKElRwhNs9UlUHAE/F9Fe6GLaXnqyDdmEXc= github.com/docker/go-units v0.5.0 h1:69rxXcBk27SvSaaxTtLh/8llcHD8vYHT7WSdRZ/jvr4= github.com/docker/go-units v0.5.0/go.mod h1:fgPhTUdO+D/Jk86RDLlptpiXQzgHJF7gydDDbaIK4Dk= -github.com/expr-lang/expr v1.17.5 h1:i1WrMvcdLF249nSNlpQZN1S6NXuW9WaOfF5tPi3aw3k= -github.com/expr-lang/expr v1.17.5/go.mod h1:8/vRC7+7HBzESEqt5kKpYXxrxkr31SaO8r40VO/1IT4= +github.com/expr-lang/expr v1.17.6 h1:1h6i8ONk9cexhDmowO/A64VPxHScu7qfSl2k8OlINec= +github.com/expr-lang/expr v1.17.6/go.mod h1:8/vRC7+7HBzESEqt5kKpYXxrxkr31SaO8r40VO/1IT4= github.com/felixge/httpsnoop v1.0.4 h1:NFTV2Zj1bL4mc9sqWACXbQFVBBg2W3GPvqp8/ESS2Wg= github.com/felixge/httpsnoop v1.0.4/go.mod h1:m8KPJKqk1gH5J9DgRY2ASl2lWCfGKXixSwevea8zH2U= github.com/frankban/quicktest v1.11.0/go.mod h1:K+q6oSqb0W0Ininfk863uOk1lMy69l/P6txr3mVT54s= @@ -207,8 +207,8 @@ github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 h1:C3w9PqII01/Oq github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822/go.mod h1:+n7T8mK8HuQTcFwEeznm/DIxMOiR9yIdICNftLE1DvQ= github.com/mwitkow/go-conntrack v0.0.0-20190716064945-2f068394615f h1:KUppIJq7/+SVif2QVs3tOP0zanoHgBEVAwHxUSIzRqU= github.com/mwitkow/go-conntrack v0.0.0-20190716064945-2f068394615f/go.mod h1:qRWi+5nqEBWmkhHvq77mSJWrCKwh8bxhgT7d/eI7P4U= -github.com/nats-io/nats.go v1.44.0 h1:ECKVrDLdh/kDPV1g0gAQ+2+m2KprqZK5O/eJAyAnH2M= -github.com/nats-io/nats.go v1.44.0/go.mod h1:iRWIPokVIFbVijxuMQq4y9ttaBTMe0SFdlZfMDd+33g= +github.com/nats-io/nats.go v1.45.0 h1:/wGPbnYXDM0pLKFjZTX+2JOw9TQPoIgTFrUaH97giwA= +github.com/nats-io/nats.go v1.45.0/go.mod h1:iRWIPokVIFbVijxuMQq4y9ttaBTMe0SFdlZfMDd+33g= github.com/nats-io/nkeys v0.4.11 h1:q44qGV008kYd9W1b1nEBkNzvnWxtRSQ7A8BoqRrcfa0= github.com/nats-io/nkeys v0.4.11/go.mod h1:szDimtgmfOi9n25JpfIdGw12tZFYXqhGxjhVxsatHVE= github.com/nats-io/nuid v1.0.1 h1:5iA8DT8V7q8WK2EScv2padNa/rTESc1KdnPw4TC2paw= @@ -225,12 +225,12 @@ github.com/pkg/errors v0.9.1 h1:FEBLx1zS214owpjy7qsBeixbURkuhQAwrK5UwLGTwt4= github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0= github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= -github.com/prometheus/client_golang v1.23.0 h1:ust4zpdl9r4trLY/gSjlm07PuiBq2ynaXXlptpfy8Uc= -github.com/prometheus/client_golang v1.23.0/go.mod h1:i/o0R9ByOnHX0McrTMTyhYvKE4haaf2mW08I+jGAjEE= +github.com/prometheus/client_golang v1.23.2 h1:Je96obch5RDVy3FDMndoUsjAhG5Edi49h0RJWRi/o0o= +github.com/prometheus/client_golang v1.23.2/go.mod h1:Tb1a6LWHB3/SPIzCoaDXI4I8UHKeFTEQ1YCr+0Gyqmg= github.com/prometheus/client_model v0.6.2 h1:oBsgwpGs7iVziMvrGhE53c/GrLUsZdHnqNwqPLxwZyk= github.com/prometheus/client_model v0.6.2/go.mod h1:y3m2F6Gdpfy6Ut/GBsUqTWZqCUvMVzSfMLjcu6wAwpE= -github.com/prometheus/common v0.65.0 h1:QDwzd+G1twt//Kwj/Ww6E9FQq1iVMmODnILtW1t2VzE= -github.com/prometheus/common v0.65.0/go.mod h1:0gZns+BLRQ3V6NdaerOhMbwwRbNh9hkGINtQAsP5GS8= +github.com/prometheus/common v0.66.1 h1:h5E0h5/Y8niHc5DlaLlWLArTQI7tMrsfQjHV+d9ZoGs= 
+github.com/prometheus/common v0.66.1/go.mod h1:gcaUsgf3KfRSwHY4dIMXLPV0K/Wg1oZ8+SbZk/HH/dA= github.com/prometheus/procfs v0.16.1 h1:hZ15bTNuirocR6u0JZ6BAHHmwS1p8B4P6MRqxtzMyRg= github.com/prometheus/procfs v0.16.1/go.mod h1:teAbpZRB1iIAJYREa1LsoWUXykVXA1KlTmWl8x/U+Is= github.com/qustavo/sqlhooks/v2 v2.1.0 h1:54yBemHnGHp/7xgT+pxwmIlMSDNYKx5JW5dfRAiCZi0= @@ -257,8 +257,8 @@ github.com/stretchr/testify v1.7.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/ github.com/stretchr/testify v1.7.5/go.mod h1:yNjHg4UonilssWZ8iaSj1OCr/vHnekPRkoO+kdMU+MU= github.com/stretchr/testify v1.8.0/go.mod h1:yNjHg4UonilssWZ8iaSj1OCr/vHnekPRkoO+kdMU+MU= github.com/stretchr/testify v1.8.1/go.mod h1:w2LPCIKwWwSfY2zedu0+kehJoqGctiVI29o6fzry7u4= -github.com/stretchr/testify v1.10.0 h1:Xv5erBjTwe/5IxqUQTdXv5kgmIvbHo3QQyRwhJsOfJA= -github.com/stretchr/testify v1.10.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY= +github.com/stretchr/testify v1.11.1 h1:7s2iGBzp5EwR7/aIZr8ao5+dra3wiQyKjjFuvgVKu7U= +github.com/stretchr/testify v1.11.1/go.mod h1:wZwfW3scLgRK+23gO65QZefKpKQRnfz6sD981Nm4B6U= github.com/swaggo/files v1.0.1 h1:J1bVJ4XHZNq0I46UU90611i9/YzdrF7x92oX1ig5IdE= github.com/swaggo/files v1.0.1/go.mod h1:0qXmMNH6sXNf+73t65aKeB+ApmgxdnkQzVTAj2uaMUg= github.com/swaggo/http-swagger v1.3.4 h1:q7t/XLx0n15H1Q9/tk3Y9L4n210XzJF5WtnDX64a5ww= @@ -295,8 +295,8 @@ golang.org/x/crypto v0.13.0/go.mod h1:y6Z2r+Rw4iayiXXAIxJIDAJ1zMW4yaTpebo8fPOliY golang.org/x/crypto v0.19.0/go.mod h1:Iy9bg/ha4yyC70EfRS8jz+B6ybOBKMaSxLj6P6oBDfU= golang.org/x/crypto v0.23.0/go.mod h1:CKFgDieR+mRhux2Lsu27y0fO304Db0wZe70UKqHu0v8= golang.org/x/crypto v0.31.0/go.mod h1:kDsLvtWBEx7MV9tJOj9bnXsPbxwJQ6csT/x4KIN4Ssk= -golang.org/x/crypto v0.40.0 h1:r4x+VvoG5Fm+eJcxMaY8CQM7Lb0l1lsmjGBQ6s8BfKM= -golang.org/x/crypto v0.40.0/go.mod h1:Qr1vMER5WyS2dfPHAlsOj01wgLbsyWtFn/aY+5+ZdxY= +golang.org/x/crypto v0.41.0 h1:WKYxWedPGCTVVl5+WHSSrOBT0O8lx32+zxmHxijgXp4= +golang.org/x/crypto v0.41.0/go.mod h1:pO5AFd7FA68rFak7rOAGVuygIISepHftHnr8dr6+sUc= golang.org/x/exp v0.0.0-20250620022241-b7579e27df2b h1:M2rDM6z3Fhozi9O7NWsxAkg/yqS/lQJ6PmkyIV3YP+o= golang.org/x/exp v0.0.0-20250620022241-b7579e27df2b/go.mod h1:3//PLf8L/X+8b4vuAfHzxeRUl04Adcb341+IGKfnqS8= golang.org/x/mod v0.6.0-dev.0.20220419223038-86c51ed26bb4/go.mod h1:jJ57K6gSWd91VN4djpZkiMVwK6gcyfeH4XE8wZrZaV4= @@ -317,8 +317,8 @@ golang.org/x/net v0.15.0/go.mod h1:idbUs1IY1+zTqbi8yxTbhexhEEk5ur9LInksu6HrEpk= golang.org/x/net v0.21.0/go.mod h1:bIjVDfnllIU7BJ2DNgfnXvpSvtn8VRwhlsaeUTyUS44= golang.org/x/net v0.25.0/go.mod h1:JkAGAh7GEvH74S6FOH42FLoXpXbE/aqXSrIQjXgsiwM= golang.org/x/net v0.33.0/go.mod h1:HXLR5J+9DxmrqMwG9qjGCxZ+zKXxBru04zlTvWlWuN4= -golang.org/x/net v0.42.0 h1:jzkYrhi3YQWD6MLBJcsklgQsoAcw89EcZbJw8Z614hs= -golang.org/x/net v0.42.0/go.mod h1:FF1RA5d3u7nAYA4z2TkclSCKh68eSXtiFwcWQpPXdt8= +golang.org/x/net v0.43.0 h1:lat02VYK2j4aLzMzecihNvTlJNQUq316m2Mr9rnM6YE= +golang.org/x/net v0.43.0/go.mod h1:vhO1fvI4dGsIjh73sWfUVjj3N7CA9WkKJNQm2svM6Jg= golang.org/x/oauth2 v0.30.0 h1:dnDm7JmhM45NNpd8FDDeLhK6FwqbOf4MLCM9zb1BOHI= golang.org/x/oauth2 v0.30.0/go.mod h1:B++QgG3ZKulg6sRPGD/mqlHQs5rB3Ml9erfeDY7xKlU= golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= @@ -341,8 +341,8 @@ golang.org/x/sys v0.12.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.17.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= golang.org/x/sys v0.20.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= golang.org/x/sys v0.28.0/go.mod 
h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= -golang.org/x/sys v0.34.0 h1:H5Y5sJ2L2JRdyv7ROF1he/lPdvFsd0mJHFw2ThKHxLA= -golang.org/x/sys v0.34.0/go.mod h1:BJP2sWEmIv4KK5OTEluFJCKSidICx8ciO85XgH3Ak8k= +golang.org/x/sys v0.35.0 h1:vz1N37gP5bs89s7He8XuIYXpyY0+QlsKmzipCbUtyxI= +golang.org/x/sys v0.35.0/go.mod h1:BJP2sWEmIv4KK5OTEluFJCKSidICx8ciO85XgH3Ak8k= golang.org/x/telemetry v0.0.0-20240228155512-f48c80bd79b2/go.mod h1:TeRTkGYfJXctD9OcfyVLyj2J3IxLnKwHJR8f4D8a3YE= golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo= golang.org/x/term v0.0.0-20210927222741-03fcf44c2211/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8= @@ -361,10 +361,10 @@ golang.org/x/text v0.13.0/go.mod h1:TvPlkZtksWOMsz7fbANvkp4WM8x/WCo/om8BMLbz+aE= golang.org/x/text v0.14.0/go.mod h1:18ZOQIKpY8NJVqYksKHtTdi31H5itFRjB5/qKTNYzSU= golang.org/x/text v0.15.0/go.mod h1:18ZOQIKpY8NJVqYksKHtTdi31H5itFRjB5/qKTNYzSU= golang.org/x/text v0.21.0/go.mod h1:4IBbMaMmOPCJ8SecivzSH54+73PCFmPWxNTLm+vZkEQ= -golang.org/x/text v0.27.0 h1:4fGWRpyh641NLlecmyl4LOe6yDdfaYNrGb2zdfo4JV4= -golang.org/x/text v0.27.0/go.mod h1:1D28KMCvyooCX9hBiosv5Tz/+YLxj0j7XhWjpSUF7CU= -golang.org/x/time v0.5.0 h1:o7cqy6amK/52YcAKIPlM3a+Fpj35zvRj2TP+e1xFSfk= -golang.org/x/time v0.5.0/go.mod h1:3BpzKBy/shNhVucY/MWOyx10tF3SFh9QdLuxbVysPQM= +golang.org/x/text v0.28.0 h1:rhazDwis8INMIwQ4tpjLDzUhx6RlXqZNPEM0huQojng= +golang.org/x/text v0.28.0/go.mod h1:U8nCwOR8jO/marOQ0QbDiOngZVEBB7MAiitBuMjXiNU= +golang.org/x/time v0.12.0 h1:ScB/8o8olJvc+CQPWrK3fPZNfh7qgwCrY0zJmoEQLSE= +golang.org/x/time v0.12.0/go.mod h1:CDIdPxbZBQxdj6cxyCIdrNogrJKMJ7pr37NYpMcMDSg= golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= golang.org/x/tools v0.1.12/go.mod h1:hNGJHUnrk76NpqgfD5Aqm5Crs+Hm0VOH/i9J2+nxYbc= @@ -375,15 +375,13 @@ golang.org/x/tools v0.35.0 h1:mBffYraMEf7aa0sB+NuKnuCy8qI/9Bughn8dC2Gu5r0= golang.org/x/tools v0.35.0/go.mod h1:NKdj5HkL/73byiZSJjqJgKn3ep7KjFkBOkR/Hps3VPw= golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= -google.golang.org/protobuf v1.36.6 h1:z1NpPI8ku2WgiWnf+t9wTPsn6eP1L7ksHUlkfLvd9xY= -google.golang.org/protobuf v1.36.6/go.mod h1:jduwjTPXsFjZGTmRluh+L6NjiWu7pchiJ2/5YcXBHnY= +google.golang.org/protobuf v1.36.8 h1:xHScyCOEuuwZEc6UtSOvPbAT4zRh0xcNRYekJwfqyMc= +google.golang.org/protobuf v1.36.8/go.mod h1:fuxRtAxBytpl4zzqUh6/eyUujkJdNiuEkXntxiD/uRU= gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= gopkg.in/check.v1 v1.0.0-20200227125254-8fa46927fb4f/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c h1:Hei/4ADfdWqJk1ZMxUNpqntNwaWcugrBjAiHlqqRiVk= gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c/go.mod h1:JHkPIbrfpd72SG/EVd6muEfDQjcINNoR0C8j2r3qZ4Q= gopkg.in/yaml.v2 v2.2.2/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= -gopkg.in/yaml.v2 v2.4.0 h1:D8xgwECY7CYvx+Y2n4sBz93Jn9JRvxdiyyo8CTfuKaY= -gopkg.in/yaml.v2 v2.4.0/go.mod h1:RDklbk79AGWmwhnvt/jBztapEOGDOx6ZbXqjP6csGnQ= gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= gopkg.in/yaml.v3 v3.0.0-20200615113413-eeeca48fe776/go.mod 
h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= diff --git a/internal/avro/avroCheckpoint.go b/internal/avro/avroCheckpoint.go index 4d72d36..b7c2ea1 100644 --- a/internal/avro/avroCheckpoint.go +++ b/internal/avro/avroCheckpoint.go @@ -65,7 +65,7 @@ func (as *AvroStore) ToCheckpoint(dir string, dumpAll bool) (int, error) { defer wg.Done() for workItem := range work { - var from int64 = getTimestamp(workItem.dir) + from := getTimestamp(workItem.dir) if err := workItem.level.toCheckpoint(workItem.dir, from, dumpAll); err != nil { if err == ErrNoNewData { @@ -159,7 +159,7 @@ func (l *AvroLevel) toCheckpoint(dir string, from int64, dumpAll bool) error { int_res, _ := strconv.Atoi(path.Base(dir)) // find smallest overall timestamp in l.data map and delete it from l.data - var minTs int64 = int64(1<<63 - 1) + minTs := int64(1<<63 - 1) for ts, dat := range l.data { if ts < minTs && len(dat) != 0 { minTs = ts @@ -176,7 +176,7 @@ func (l *AvroLevel) toCheckpoint(dir string, from int64, dumpAll bool) error { var schema string var codec *goavro.Codec - record_list := make([]map[string]interface{}, 0) + record_list := make([]map[string]any, 0) var f *os.File @@ -220,7 +220,7 @@ func (l *AvroLevel) toCheckpoint(dir string, from int64, dumpAll bool) error { repeat := 60 / int_res for range repeat { - record_list = append(record_list, make(map[string]interface{})) + record_list = append(record_list, make(map[string]any)) } } @@ -262,7 +262,7 @@ func (l *AvroLevel) toCheckpoint(dir string, from int64, dumpAll bool) error { return fmt.Errorf("failed to read record: %v", err) } - record_list = append(record_list, record.(map[string]interface{})) + record_list = append(record_list, record.(map[string]any)) } f.Close() @@ -411,10 +411,10 @@ func compareSchema(schemaRead, schemaGen string) (bool, string, error) { func generateSchema(data map[string]schema.Float) (string, error) { // Define the Avro schema structure - schema := map[string]interface{}{ + schema := map[string]any{ "type": "record", "name": "DataRecord", - "fields": []map[string]interface{}{}, + "fields": []map[string]any{}, } fieldTracker := make(map[string]struct{}) @@ -423,12 +423,12 @@ func generateSchema(data map[string]schema.Float) (string, error) { if _, exists := fieldTracker[key]; !exists { key = correctKey(key) - field := map[string]interface{}{ + field := map[string]any{ "name": key, "type": "double", "default": -1.0, } - schema["fields"] = append(schema["fields"].([]map[string]interface{}), field) + schema["fields"] = append(schema["fields"].([]map[string]any), field) fieldTracker[key] = struct{}{} } } @@ -441,14 +441,15 @@ func generateSchema(data map[string]schema.Float) (string, error) { return string(schemaString), nil } -func generateRecord(data map[string]schema.Float) map[string]interface{} { - record := make(map[string]interface{}) +func generateRecord(data map[string]schema.Float) map[string]any { + record := make(map[string]any) // Iterate through each map in data for key, value := range data { key = correctKey(key) // Set the value in the record + // avro only accepts basic types record[key] = value.Double() } diff --git a/internal/avro/avroHelper.go b/internal/avro/avroHelper.go index 21a5617..8ffc770 100644 --- a/internal/avro/avroHelper.go +++ b/internal/avro/avroHelper.go @@ -1,8 +1,13 @@ +// Copyright (C) NHR@FAU, University Erlangen-Nuremberg. +// All rights reserved. This file is part of cc-backend. 
+// Use of this source code is governed by a MIT-style
+// license that can be found in the LICENSE file.
 package avro
 
 import (
 	"context"
 	"fmt"
+	"slices"
 	"strconv"
 	"sync"
@@ -10,7 +15,6 @@ import (
 )
 
 func DataStaging(wg *sync.WaitGroup, ctx context.Context) {
-	// AvroPool is a pool of Avro writers.
 	go func() {
 		if config.MetricStoreKeys.Checkpoints.FileFormat == "json" {
@@ -28,7 +32,7 @@
 			case <-ctx.Done():
 				return
 			case val := <-LineProtocolMessages:
-				//Fetch the frequency of the metric from the global configuration
+				// Fetch the frequency of the metric from the global configuration
 				freq, err := config.GetMetricFrequency(val.MetricName)
 				if err != nil {
 					fmt.Printf("Error fetching metric frequency: %s\n", err)
@@ -58,7 +62,7 @@
 				if avroLevel == nil {
 					fmt.Printf("Error creating or finding the level with cluster : %s, node : %s, metric : %s\n", val.Cluster, val.Node, val.MetricName)
 				}
-				oldSelector = append([]string{}, selector...)
+				oldSelector = slices.Clone(selector)
 			}
 
 			avroLevel.addMetric(metricName, val.Value, val.Timestamp, int(freq))
diff --git a/internal/avro/avroStruct.go b/internal/avro/avroStruct.go
index ee65291..b0ded94 100644
--- a/internal/avro/avroStruct.go
+++ b/internal/avro/avroStruct.go
@@ -1,3 +1,7 @@
+// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
+// All rights reserved. This file is part of cc-backend.
+// Use of this source code is governed by a MIT-style
+// license that can be found in the LICENSE file.
 package avro
 
 import (
@@ -37,9 +41,9 @@
 }
 
 type AvroField struct {
-	Name    string      `json:"name"`
-	Type    interface{} `json:"type"`
-	Default interface{} `json:"default,omitempty"`
+	Name    string `json:"name"`
+	Type    any    `json:"type"`
+	Default any    `json:"default,omitempty"`
 }
 
 type AvroSchema struct {

From 3b9d05cc6d5d24f8a6c7f9a2c40a802a34a25181 Mon Sep 17 00:00:00 2001
From: Aditya Ujeniya
Date: Tue, 9 Sep 2025 14:57:05 +0200
Subject: [PATCH 08/11] Fix exclusive to shared in svelte and graphql

---
 api/schema.graphqls                           |   2 +-
 configs/config-demo.json                      |   2 +-
 internal/avro/avroHelper.go                   |   6 +-
 internal/graph/generated/generated.go         |   4 +-
 internal/graph/model/models_gen.go            |   2 +-
 internal/graph/schema.resolvers.go            |  12 ++
 internal/memorystore/archive.go               |   9 +-
 .../taskManager/updateFootprintService.go     |   2 +-
 test_ccms_write_api.sh.bak                    | 110 ------------------
 var/._job-archive                             | Bin 163 -> 0 bytes
 web/frontend/src/Job.root.svelte              |   4 +-
 web/frontend/src/generic/JobList.svelte       |   2 +-
 .../src/generic/joblist/JobInfo.svelte        |   2 +-
 .../src/generic/joblist/JobListRow.svelte     |   2 +-
 .../src/systems/nodelist/NodeInfo.svelte      |   4 +-
 .../src/systems/nodelist/NodeListRow.svelte   |   4 +-
 16 files changed, 35 insertions(+), 132 deletions(-)
 delete mode 100755 test_ccms_write_api.sh.bak
 delete mode 100755 var/._job-archive

diff --git a/api/schema.graphqls b/api/schema.graphqls
index 070b5b7..c19fc64 100644
--- a/api/schema.graphqls
+++ b/api/schema.graphqls
@@ -419,7 +419,7 @@ input JobFilter {
   startTime: TimeRange
   state: [JobState!]
   metricStats: [MetricStatItem!]
- shared: StringInput + shared: String node: StringInput } diff --git a/configs/config-demo.json b/configs/config-demo.json index 3c0d858..d47f926 100644 --- a/configs/config-demo.json +++ b/configs/config-demo.json @@ -80,7 +80,7 @@ "restore": "48h" }, "archive": { - "interval": "48h", + "interval": "2h", "directory": "./var/archive" }, "retention-in-memory": "48h" diff --git a/internal/avro/avroHelper.go b/internal/avro/avroHelper.go index 8ffc770..7710f0f 100644 --- a/internal/avro/avroHelper.go +++ b/internal/avro/avroHelper.go @@ -6,7 +6,7 @@ package avro import ( "context" - "fmt" + "log" "slices" "strconv" "sync" @@ -35,7 +35,7 @@ func DataStaging(wg *sync.WaitGroup, ctx context.Context) { // Fetch the frequency of the metric from the global configuration freq, err := config.GetMetricFrequency(val.MetricName) if err != nil { - fmt.Printf("Error fetching metric frequency: %s\n", err) + log.Printf("Error fetching metric frequency: %s\n", err) continue } @@ -60,7 +60,7 @@ func DataStaging(wg *sync.WaitGroup, ctx context.Context) { // If the Avro level is nil, create a new one if avroLevel == nil { - fmt.Printf("Error creating or finding the level with cluster : %s, node : %s, metric : %s\n", val.Cluster, val.Node, val.MetricName) + log.Printf("Error creating or finding the level with cluster : %s, node : %s, metric : %s\n", val.Cluster, val.Node, val.MetricName) } oldSelector = slices.Clone(selector) } diff --git a/internal/graph/generated/generated.go b/internal/graph/generated/generated.go index eed946d..778f1d6 100644 --- a/internal/graph/generated/generated.go +++ b/internal/graph/generated/generated.go @@ -2741,7 +2741,7 @@ input JobFilter { startTime: TimeRange state: [JobState!] metricStats: [MetricStatItem!] - shared: StringInput + shared: String node: StringInput } @@ -16490,7 +16490,7 @@ func (ec *executionContext) unmarshalInputJobFilter(ctx context.Context, obj any it.MetricStats = data case "shared": ctx := graphql.WithPathContext(ctx, graphql.NewPathWithField("shared")) - data, err := ec.unmarshalOStringInput2ᚖgithubᚗcomᚋClusterCockpitᚋccᚑbackendᚋinternalᚋgraphᚋmodelᚐStringInput(ctx, v) + data, err := ec.unmarshalOString2ᚖstring(ctx, v) if err != nil { return it, err } diff --git a/internal/graph/model/models_gen.go b/internal/graph/model/models_gen.go index accc344..c4948d0 100644 --- a/internal/graph/model/models_gen.go +++ b/internal/graph/model/models_gen.go @@ -69,7 +69,7 @@ type JobFilter struct { StartTime *config.TimeRange `json:"startTime,omitempty"` State []schema.JobState `json:"state,omitempty"` MetricStats []*MetricStatItem `json:"metricStats,omitempty"` - Shared *StringInput `json:"shared,omitempty"` + Shared *string `json:"shared,omitempty"` Node *StringInput `json:"node,omitempty"` } diff --git a/internal/graph/schema.resolvers.go b/internal/graph/schema.resolvers.go index 315f1a3..b886c34 100644 --- a/internal/graph/schema.resolvers.go +++ b/internal/graph/schema.resolvers.go @@ -831,3 +831,15 @@ type mutationResolver struct{ *Resolver } type nodeResolver struct{ *Resolver } type queryResolver struct{ *Resolver } type subClusterResolver struct{ *Resolver } + +// !!! WARNING !!! +// The code below was going to be deleted when updating resolvers. It has been copied here so you have +// one last chance to move it out of harms way if you want. There are two reasons this happens: +// - When renaming or deleting a resolver the old code will be put in here. You can safely delete +// it when you're done. +// - You have helper methods in this file. 
Move them out to keep these resolver files clean. +/* + func (r *jobResolver) Exclusive(ctx context.Context, obj *schema.Job) (int, error) { + panic(fmt.Errorf("not implemented: Exclusive - exclusive")) +} +*/ diff --git a/internal/memorystore/archive.go b/internal/memorystore/archive.go index 7857d71..9720d20 100644 --- a/internal/memorystore/archive.go +++ b/internal/memorystore/archive.go @@ -11,6 +11,7 @@ import ( "errors" "fmt" "io" + "log" "os" "path/filepath" "sync" @@ -26,7 +27,7 @@ func Archiving(wg *sync.WaitGroup, ctx context.Context) { defer wg.Done() d, err := time.ParseDuration(config.MetricStoreKeys.Archive.Interval) if err != nil { - cclog.Fatalf("[METRICSTORE]> error parsing archive interval duration: %v\n", err) + log.Fatalf("[METRICSTORE]> error parsing archive interval duration: %v\n", err) } if d <= 0 { return @@ -44,14 +45,14 @@ func Archiving(wg *sync.WaitGroup, ctx context.Context) { return case <-ticks: t := time.Now().Add(-d) - cclog.Infof("[METRICSTORE]> start archiving checkpoints (older than %s)...\n", t.Format(time.RFC3339)) + log.Printf("[METRICSTORE]> start archiving checkpoints (older than %s)...\n", t.Format(time.RFC3339)) n, err := ArchiveCheckpoints(config.MetricStoreKeys.Checkpoints.RootDir, config.MetricStoreKeys.Archive.RootDir, t.Unix(), config.MetricStoreKeys.Archive.DeleteInstead) if err != nil { - cclog.Warnf("[METRICSTORE]> archiving failed: %s\n", err.Error()) + log.Printf("[METRICSTORE]> archiving failed: %s\n", err.Error()) } else { - cclog.Infof("[METRICSTORE]> done: %d files zipped and moved to archive\n", n) + log.Printf("[METRICSTORE]> done: %d files zipped and moved to archive\n", n) } } } diff --git a/internal/taskManager/updateFootprintService.go b/internal/taskManager/updateFootprintService.go index 2ce9901..4fb5e45 100644 --- a/internal/taskManager/updateFootprintService.go +++ b/internal/taskManager/updateFootprintService.go @@ -34,7 +34,7 @@ func RegisterFootprintWorker() { c := 0 ce := 0 cl := 0 - cclog.Printf("Update Footprints started at %s", s.Format(time.RFC3339)) + cclog.Printf("Update Footprints started at %s\n", s.Format(time.RFC3339)) for _, cluster := range archive.Clusters { s_cluster := time.Now() diff --git a/test_ccms_write_api.sh.bak b/test_ccms_write_api.sh.bak deleted file mode 100755 index f76322f..0000000 --- a/test_ccms_write_api.sh.bak +++ /dev/null @@ -1,110 +0,0 @@ -JWT="eyJhbGciOiJFZERTQSIsInR5cCI6IkpXVCJ9.eyJleHAiOjE3NjQ1NjMzOTUsImlhdCI6MTc1NzM2MzM5NSwicm9sZXMiOlsiYWRtaW4iLCJhcGkiXSwic3ViIjoiZGVtbyJ9.uhtEbS-ty4xNc8GWTKjyh1b06j6b3vtEw7lzQy0Eht5LtISZwRfyRBfdKjbm_t25xGrNH9sxINq4qiYKBjAaDQ" - -# curl -X 'POST' 'http://localhost:8080/metricstore/api/write/?cluster=alex' -H "Authorization: Bearer $JWT" -d $'cpu_load,cluster=alex,hostname=a042,type=hwthread,type-id=0 value=35.0 1725827464642231296' - -rm sample_fritz.txt -rm sample_alex.txt - -while [ true ]; do - echo "Alex Metrics for hwthread types and type-ids" - timestamp="$(date '+%s')" - echo "Timestamp : "+$timestamp - for metric in cpu_load cpu_user flops_any cpu_irq cpu_system ipc cpu_idle cpu_iowait core_power clock; do - for hostname in a0603 a0903 a0832 a0329 a0702 a0122 a1624 a0731 a0224 a0704 a0631 a0225 a0222 a0427 a0603 a0429 a0833 a0705 a0901 a0601 a0227 a0804 a0322 a0226 a0126 a0129 a0605 a0801 a0934 a1622 a0902 a0428 a0537 a1623 a1722 a0228 a0701 a0326 a0327 a0123 a0321 a1621 a0323 a0124 a0534 a0931 a0324 a0933 a0424 a0905 a0128 a0532 a0805 a0521 a0535 a0932 a0127 a0325 a0633 a0831 a0803 a0426 a0425 a0229 a1721 a0602 a0632 a0223 a0422 a0423 
a0536 a0328 a0703 anvme7 a0125 a0221 a0604 a0802 a0522 a0531 a0533 a0904; do - for id in {0..127}; do - echo "$metric,cluster=alex,hostname=$hostname,type=hwthread,type-id=$id value=$((1 + RANDOM % 100)).0 $timestamp" >>sample_alex.txt - done - done - done - - curl -X 'POST' 'http://localhost:8080/metricstore/api/write/?cluster=alex' -H "Authorization: Bearer $JWT" --data-binary @sample_alex.txt - - echo "Fritz Metrics for hwthread types and type-ids" - for metric in cpu_load cpu_user flops_any cpu_irq cpu_system ipc cpu_idle cpu_iowait core_power clock; do - for hostname in f0201 f0202 f0203 f0204 f0205 f0206 f0207 f0208 f0209 f0210 f0211 f0212 f0213 f0214 f0215 f0217 f0218 f0219 f0220 f0221 f0222 f0223 f0224 f0225 f0226 f0227 f0228 f0229 f0230 f0231 f0232 f0233 f0234 f0235 f0236 f0237 f0238 f0239 f0240 f0241 f0242 f0243 f0244 f0245 f0246 f0247 f0248 f0249 f0250 f0251 f0252 f0253 f0254 f0255 f0256 f0257 f0258 f0259 f0260 f0261 f0262 f0263 f0264 f0378; do - for id in {0..71}; do - echo "$metric,cluster=fritz,hostname=$hostname,type=hwthread,type-id=$id value=$((1 + RANDOM % 100)).0 $timestamp" >>sample_fritz.txt - done - done - done - - curl -X 'POST' 'http://localhost:8080/metricstore/api/write/?cluster=fritz' -H "Authorization: Bearer $JWT" --data-binary @sample_fritz.txt - - rm sample_fritz.txt - rm sample_alex.txt - - echo "Alex Metrics for accelerator types and type-ids" - for metric in cpu_load cpu_user flops_any cpu_irq cpu_system ipc cpu_idle cpu_iowait core_power clock; do - for hostname in a0603 a0903 a0832 a0329 a0702 a0122 a1624 a0731 a0224 a0704 a0631 a0225 a0222 a0427 a0603 a0429 a0833 a0705 a0901 a0601 a0227 a0804 a0322 a0226 a0126 a0129 a0605 a0801 a0934 a1622 a0902 a0428 a0537 a1623 a1722 a0228 a0701 a0326 a0327 a0123 a0321 a1621 a0323 a0124 a0534 a0931 a0324 a0933 a0424 a0905 a0128 a0532 a0805 a0521 a0535 a0932 a0127 a0325 a0633 a0831 a0803 a0426 a0425 a0229 a1721 a0602 a0632 a0223 a0422 a0423 a0536 a0328 a0703 anvme7 a0125 a0221 a0604 a0802 a0522 a0531 a0533 a0904; do - for id in 00000000:49:00.0 00000000:0E:00.0 00000000:D1:00.0 00000000:90:00.0 00000000:13:00.0 00000000:96:00.0 00000000:CC:00.0 00000000:4F:00.0; do - echo "$metric,cluster=alex,hostname=$hostname,type=accelerator,type-id=$id value=$((1 + RANDOM % 100)).0 $timestamp" >>sample_alex.txt - done - done - done - - curl -X 'POST' 'http://localhost:8080/metricstore/api/write/?cluster=alex' -H "Authorization: Bearer $JWT" --data-binary @sample_alex.txt - - rm sample_alex.txt - - echo "Alex Metrics for memoryDomain types and type-ids" - for metric in cpu_load cpu_user flops_any cpu_irq cpu_system ipc cpu_idle cpu_iowait core_power clock; do - for hostname in a0603 a0903 a0832 a0329 a0702 a0122 a1624 a0731 a0224 a0704 a0631 a0225 a0222 a0427 a0603 a0429 a0833 a0705 a0901 a0601 a0227 a0804 a0322 a0226 a0126 a0129 a0605 a0801 a0934 a1622 a0902 a0428 a0537 a1623 a1722 a0228 a0701 a0326 a0327 a0123 a0321 a1621 a0323 a0124 a0534 a0931 a0324 a0933 a0424 a0905 a0128 a0532 a0805 a0521 a0535 a0932 a0127 a0325 a0633 a0831 a0803 a0426 a0425 a0229 a1721 a0602 a0632 a0223 a0422 a0423 a0536 a0328 a0703 anvme7 a0125 a0221 a0604 a0802 a0522 a0531 a0533 a0904; do - for id in {0..7}; do - echo "$metric,cluster=alex,hostname=$hostname,type=memoryDomain,type-id=$id value=$((1 + RANDOM % 100)).0 $timestamp" >>sample_alex.txt - done - done - done - - curl -X 'POST' 'http://localhost:8080/metricstore/api/write/?cluster=alex' -H "Authorization: Bearer $JWT" --data-binary @sample_alex.txt - - rm sample_alex.txt - - echo "Alex Metrics for 
socket types and type-ids" - for metric in cpu_load cpu_user flops_any cpu_irq cpu_system ipc cpu_idle cpu_iowait core_power clock; do - for hostname in a0603 a0903 a0832 a0329 a0702 a0122 a1624 a0731 a0224 a0704 a0631 a0225 a0222 a0427 a0603 a0429 a0833 a0705 a0901 a0601 a0227 a0804 a0322 a0226 a0126 a0129 a0605 a0801 a0934 a1622 a0902 a0428 a0537 a1623 a1722 a0228 a0701 a0326 a0327 a0123 a0321 a1621 a0323 a0124 a0534 a0931 a0324 a0933 a0424 a0905 a0128 a0532 a0805 a0521 a0535 a0932 a0127 a0325 a0633 a0831 a0803 a0426 a0425 a0229 a1721 a0602 a0632 a0223 a0422 a0423 a0536 a0328 a0703 anvme7 a0125 a0221 a0604 a0802 a0522 a0531 a0533 a0904; do - for id in {0..1}; do - echo "$metric,cluster=alex,hostname=$hostname,type=socket,type-id=$id value=$((1 + RANDOM % 100)).0 $timestamp" >>sample_alex.txt - done - done - done - - curl -X 'POST' 'http://localhost:8080/metricstore/api/write/?cluster=alex' -H "Authorization: Bearer $JWT" --data-binary @sample_alex.txt - - echo "Fritz Metrics for socket types and type-ids" - for metric in cpu_load cpu_user flops_any cpu_irq cpu_system ipc cpu_idle cpu_iowait core_power clock; do - for hostname in f0201 f0202 f0203 f0204 f0205 f0206 f0207 f0208 f0209 f0210 f0211 f0212 f0213 f0214 f0215 f0217 f0218 f0219 f0220 f0221 f0222 f0223 f0224 f0225 f0226 f0227 f0228 f0229 f0230 f0231 f0232 f0233 f0234 f0235 f0236 f0237 f0238 f0239 f0240 f0241 f0242 f0243 f0244 f0245 f0246 f0247 f0248 f0249 f0250 f0251 f0252 f0253 f0254 f0255 f0256 f0257 f0258 f0259 f0260 f0261 f0262 f0263 f0264 f0378; do - for id in {0..1}; do - echo "$metric,cluster=fritz,hostname=$hostname,type=socket,type-id=$id value=$((1 + RANDOM % 100)).0 $timestamp" >>sample_fritz.txt - done - done - done - - curl -X 'POST' 'http://localhost:8080/metricstore/api/write/?cluster=fritz' -H "Authorization: Bearer $JWT" --data-binary @sample_fritz.txt - - rm sample_fritz.txt - rm sample_alex.txt - - echo "Alex Metrics for nodes" - for metric in cpu_irq cpu_load mem_cached net_bytes_in cpu_user cpu_idle nfs4_read mem_used nfs4_write nfs4_total ib_xmit ib_xmit_pkts net_bytes_out cpu_iowait ib_recv cpu_system ib_recv_pkts; do - for hostname in a0603 a0903 a0832 a0329 a0702 a0122 a1624 a0731 a0224 a0704 a0631 a0225 a0222 a0427 a0603 a0429 a0833 a0705 a0901 a0601 a0227 a0804 a0322 a0226 a0126 a0129 a0605 a0801 a0934 a1622 a0902 a0428 a0537 a1623 a1722 a0228 a0701 a0326 a0327 a0123 a0321 a1621 a0323 a0124 a0534 a0931 a0324 a0933 a0424 a0905 a0128 a0532 a0805 a0521 a0535 a0932 a0127 a0325 a0633 a0831 a0803 a0426 a0425 a0229 a1721 a0602 a0632 a0223 a0422 a0423 a0536 a0328 a0703 anvme7 a0125 a0221 a0604 a0802 a0522 a0531 a0533 a0904; do - echo "$metric,cluster=alex,hostname=$hostname,type=node value=$((1 + RANDOM % 100)).0 $timestamp" >>sample_alex.txt - done - done - - curl -X 'POST' 'http://localhost:8080/metricstore/api/write/?cluster=alex' -H "Authorization: Bearer $JWT" --data-binary @sample_alex.txt - - echo "Fritz Metrics for nodes" - for metric in cpu_irq cpu_load mem_cached net_bytes_in cpu_user cpu_idle nfs4_read mem_used nfs4_write nfs4_total ib_xmit ib_xmit_pkts net_bytes_out cpu_iowait ib_recv cpu_system ib_recv_pkts; do - for hostname in f0201 f0202 f0203 f0204 f0205 f0206 f0207 f0208 f0209 f0210 f0211 f0212 f0213 f0214 f0215 f0217 f0218 f0219 f0220 f0221 f0222 f0223 f0224 f0225 f0226 f0227 f0228 f0229 f0230 f0231 f0232 f0233 f0234 f0235 f0236 f0237 f0238 f0239 f0240 f0241 f0242 f0243 f0244 f0245 f0246 f0247 f0248 f0249 f0250 f0251 f0252 f0253 f0254 f0255 f0256 f0257 f0258 f0259 f0260 f0261 f0262 f0263 f0264 
f0378; do
-			echo "$metric,cluster=fritz,hostname=$hostname,type=node value=$((1 + RANDOM % 100)).0 $timestamp" >>sample_fritz.txt
-		done
-	done
-
-	curl -X 'POST' 'http://localhost:8080/metricstore/api/write/?cluster=fritz' -H "Authorization: Bearer $JWT" --data-binary @sample_fritz.txt
-
-	rm sample_fritz.txt
-	rm sample_alex.txt
-
-	sleep 1m
-done
-# curl -X 'POST' 'http://localhost:8081/api/write/?cluster=alex' -H "Authorization: Bearer $JWT" -d $'cpu_load,cluster=alex,hostname=a042,type=hwthread,type-id=0 value=35.0 1725827464642231296'
\ No newline at end of file
diff --git a/var/._job-archive b/var/._job-archive
deleted file mode 100755
index 9d11b52bb7ed13ffc4799b7e3bcb26eb2c0b9b7a..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 163
zcmZQz6=P>$Vqox1Ojhs@R)|o50+1L3ClDI}aUl?c_=|y<2;dkJ5(HHS(lG;wxzV&S
oBE&_L^K

diff --git a/web/frontend/src/Job.root.svelte b/web/frontend/src/Job.root.svelte
--- a/web/frontend/src/Job.root.svelte
+++ b/web/frontend/src/Job.root.svelte
           unit={$initq.data.globalMetrics.find((gm) => gm.name == item.metric)?.unit}
           nativeScope={$initq.data.globalMetrics.find((gm) => gm.name == item.metric)?.scope}
           presetScopes={item.data.map((x) => x.scope)}
-          isShared={$initq.data.job.exclusive != 1}
+          isShared={$initq.data.job.shared != "none"}
         />
       {:else if item.disabled == true}
diff --git a/web/frontend/src/generic/JobList.svelte b/web/frontend/src/generic/JobList.svelte
index dc6def2..5ca8981 100644
--- a/web/frontend/src/generic/JobList.svelte
+++ b/web/frontend/src/generic/JobList.svelte
@@ -69,7 +69,7 @@
             hostname
           }
           SMT
-          exclusive
+          shared
           partition
           arrayJobId
           monitoringStatus
diff --git a/web/frontend/src/generic/joblist/JobInfo.svelte b/web/frontend/src/generic/joblist/JobInfo.svelte
index f56d800..794efe9 100644
--- a/web/frontend/src/generic/joblist/JobInfo.svelte
+++ b/web/frontend/src/generic/joblist/JobInfo.svelte
@@ -172,7 +172,7 @@
       {job.numNodes}
     {/if}
-    {#if job.exclusive != 1}
+    {#if job.shared != "none"}
       (shared)
     {/if}
     {#if job.numAcc > 0}
diff --git a/web/frontend/src/generic/joblist/JobListRow.svelte b/web/frontend/src/generic/joblist/JobListRow.svelte
index b17f66d..28574d9 100644
--- a/web/frontend/src/generic/joblist/JobListRow.svelte
+++ b/web/frontend/src/generic/joblist/JobListRow.svelte
@@ -213,7 +213,7 @@
           metric={metric.data.name}
           cluster={cluster.find((c) => c.name == job.cluster)}
           subCluster={job.subCluster}
-          isShared={job.exclusive != 1}
+          isShared={job.shared != "none"}
           numhwthreads={job.numHWThreads}
           numaccs={job.numAcc}
           zoomState={zoomStates[metric.data.name] || null}
diff --git a/web/frontend/src/systems/nodelist/NodeInfo.svelte b/web/frontend/src/systems/nodelist/NodeInfo.svelte
index 363379f..77e7416 100644
--- a/web/frontend/src/systems/nodelist/NodeInfo.svelte
+++ b/web/frontend/src/systems/nodelist/NodeInfo.svelte
@@ -92,7 +92,7 @@
             Missing Metric
-        {:else if nodeJobsData.jobs.count == 1 && nodeJobsData.jobs.items[0].exclusive}
+        {:else if nodeJobsData.jobs.count == 1 && nodeJobsData.jobs.items[0].shared == "none"}
@@ -104,7 +104,7 @@
             Exclusive
-        {:else if nodeJobsData.jobs.count >= 1 && !nodeJobsData.jobs.items[0].exclusive}
+        {:else if nodeJobsData.jobs.count >= 1 && !(nodeJobsData.jobs.items[0].shared == "none")}
diff --git a/web/frontend/src/systems/nodelist/NodeListRow.svelte b/web/frontend/src/systems/nodelist/NodeListRow.svelte
index 5cdf493..a9111f6 100644
--- a/web/frontend/src/systems/nodelist/NodeListRow.svelte
+++ b/web/frontend/src/systems/nodelist/NodeListRow.svelte
@@ -45,7 +45,7 @@
         jobId
         user
         project
-        exclusive
+        shared
         resources {
           hostname
           accelerators
@@ -101,7 +101,7 @@
   function buildExtendedLegend() {
     let pendingExtendedLegendData = null
     // Build Extended for allocated nodes
[Commented: Only Build extended Legend For Shared Nodes] - if ($nodeJobsData.data.jobs.count >= 1) { // "&& !$nodeJobsData.data.jobs.items[0].exclusive)" + if ($nodeJobsData.data.jobs.count >= 1) { const accSet = Array.from(new Set($nodeJobsData.data.jobs.items .map((i) => i.resources .filter((r) => (r.hostname === nodeData.host) && r?.accelerators) From eaca187032698f6c9740b0703e292b131354273e Mon Sep 17 00:00:00 2001 From: Jan Eitzinger Date: Tue, 9 Sep 2025 15:04:25 +0200 Subject: [PATCH 09/11] Fix testdata for new schema --- .../importer/testdata/meta-fritzError.input | 2 +- .../importer/testdata/meta-fritzMinimal.input | 2 +- internal/tagger/apps/vasp.txt | 1 + .../emmy/1403/244/1608923076/meta.json | 382 +++++++++--------- .../emmy/1404/397/1609300556/meta.json | 382 +++++++++--------- 5 files changed, 385 insertions(+), 384 deletions(-) diff --git a/internal/importer/testdata/meta-fritzError.input b/internal/importer/testdata/meta-fritzError.input index 2b8d0e8..90e46cf 100644 --- a/internal/importer/testdata/meta-fritzError.input +++ b/internal/importer/testdata/meta-fritzError.input @@ -1 +1 @@ -{"jobId":398955,"user":"k106eb10","project":"k106eb","cluster":"fritz","subCluster":"main","partition":"singlenode","arrayJobId":0,"numNodes":1,"numHwthreads":72,"numAcc":0,"exclusive":1,"monitoringStatus":1,"smt":0,"jobState":"completed","duration":260,"walltime":86340,"resources":[{"hostname":"f0720"}],"metaData":{"jobName":"ams_pipeline","jobScript":"#!/bin/bash -l\n#SBATCH --job-name=ams_pipeline\n#SBATCH --time=23:59:00\n#SBATCH --partition=singlenode\n#SBATCH --ntasks=72\n#SBATCH --hint=multithread\n#SBATCH --chdir=/home/atuin/k106eb/k106eb10/ACE/Ni-Al/DFT/VASP_PBE_500_0.125_0.1_NM/AlNi/binaries/bulk/base-hcp/occ-shaken/hcp16.occ.4.shake.0/cfg/NiAl3NiAl11\n#SBATCH --export=NONE\nunset SLURM_EXPORT_ENV\nuss=$(whoami)\nfind /dev/shm/ -user $uss -type f -mmin +30 -delete\ncd \"/home/atuin/k106eb/k106eb10/ACE/Ni-Al/DFT/VASP_PBE_500_0.125_0.1_NM/AlNi/binaries/bulk/base-hcp/occ-shaken/hcp16.occ.4.shake.0/cfg/NiAl3NiAl11\"\nams_pipeline pipeline.json \u003e \"/home/atuin/k106eb/k106eb10/ACE/Ni-Al/DFT/VASP_PBE_500_0.125_0.1_NM/AlNi/binaries/bulk/base-hcp/occ-shaken/hcp16.occ.4.shake.0/cfg/NiAl3NiAl11/ams_pipeline_job.sh.out\" 2\u003e \"/home/atuin/k106eb/k106eb10/ACE/Ni-Al/DFT/VASP_PBE_500_0.125_0.1_NM/AlNi/binaries/bulk/base-hcp/occ-shaken/hcp16.occ.4.shake.0/cfg/NiAl3NiAl11/ams_pipeline_job.sh.err\"\n","slurmInfo":"\nJobId=398955 JobName=ams_pipeline\n UserId=k106eb10(210387) GroupId=80111\n Account=k106eb QOS=normal \n Requeue=False Restarts=0 BatchFlag=True \n TimeLimit=1439\n SubmitTime=2023-02-09T14:11:22\n Partition=singlenode \n NodeList=f0720\n NumNodes=1 NumCPUs=72 NumTasks=72 CPUs/Task=1\n NTasksPerNode:Socket:Core=0:None:None\n TRES_req=cpu=72,mem=250000M,node=1,billing=72\n TRES_alloc=cpu=72,node=1,billing=72\n Command=/home/atuin/k106eb/k106eb10/ACE/Ni-Al/DFT/VASP_PBE_500_0.125_0.1_NM/AlNi/binaries/bulk/base-hcp/occ-shaken/hcp16.occ.4.shake.0/cfg/NiAl3NiAl11/ams_pipeline_job.sh\n WorkDir=/home/atuin/k106eb/k106eb10/ACE/Ni-Al/DFT/VASP_PBE_500_0.125_0.1_NM/AlNi/binaries/bulk/base-hcp/occ-shaken/hcp16.occ.4.shake.0/cfg/NiAl3NiAl11\n StdErr=\n 
StdOut=ams_pipeline.o%j\n"},"startTime":1675956725,"statistics":{"clock":{"unit":{"base":"Hz","prefix":"M"},"avg":2335.254,"min":800.418,"max":2734.922},"cpu_load":{"unit":{"base":""},"avg":52.72,"min":34.46,"max":71.91},"cpu_power":{"unit":{"base":"W"},"avg":407.767,"min":93.932,"max":497.636},"cpu_user":{"unit":{"base":""},"avg":63.678,"min":19.872,"max":96.633},"flops_any":{"unit":{"base":"F/s","prefix":"G"},"avg":635.672,"min":0,"max":1332.874},"flops_dp":{"unit":{"base":"F/s","prefix":"G"},"avg":261.006,"min":0,"max":382.294},"flops_sp":{"unit":{"base":"F/s","prefix":"G"},"avg":113.659,"min":0,"max":568.286},"ib_recv":{"unit":{"base":"B/s"},"avg":27981.111,"min":69.4,"max":48084.589},"ib_recv_pkts":{"unit":{"base":"packets/s"},"avg":398.939,"min":0.5,"max":693.817},"ib_xmit":{"unit":{"base":"B/s"},"avg":188.513,"min":39.597,"max":724.568},"ib_xmit_pkts":{"unit":{"base":"packets/s"},"avg":0.867,"min":0.2,"max":2.933},"ipc":{"unit":{"base":"IPC"},"avg":0.944,"min":0.564,"max":1.291},"mem_bw":{"unit":{"base":"B/s","prefix":"G"},"avg":79.565,"min":0.021,"max":116.02},"mem_power":{"unit":{"base":"W"},"avg":24.692,"min":7.883,"max":31.318},"mem_used":{"unit":{"base":"B","prefix":"G"},"avg":22.566,"min":8.225,"max":27.613},"nfs4_read":{"unit":{"base":"B/s","prefix":"M"},"avg":647,"min":0,"max":1946},"nfs4_total":{"unit":{"base":"B/s","prefix":"M"},"avg":6181.6,"min":1270,"max":11411},"nfs4_write":{"unit":{"base":"B/s","prefix":"M"},"avg":22.4,"min":11,"max":29},"vectorization_ratio":{"unit":{"base":"%"},"avg":77.351,"min":0,"max":98.837}}} +{"jobId":398955,"user":"k106eb10","project":"k106eb","cluster":"fritz","subCluster":"main","partition":"singlenode","arrayJobId":0,"numNodes":1,"numHwthreads":72,"numAcc":0,"shared":"none","monitoringStatus":1,"smt":0,"jobState":"completed","duration":260,"walltime":86340,"resources":[{"hostname":"f0720"}],"metaData":{"jobName":"ams_pipeline","jobScript":"#!/bin/bash -l\n#SBATCH --job-name=ams_pipeline\n#SBATCH --time=23:59:00\n#SBATCH --partition=singlenode\n#SBATCH --ntasks=72\n#SBATCH --hint=multithread\n#SBATCH --chdir=/home/atuin/k106eb/k106eb10/ACE/Ni-Al/DFT/VASP_PBE_500_0.125_0.1_NM/AlNi/binaries/bulk/base-hcp/occ-shaken/hcp16.occ.4.shake.0/cfg/NiAl3NiAl11\n#SBATCH --export=NONE\nunset SLURM_EXPORT_ENV\nuss=$(whoami)\nfind /dev/shm/ -user $uss -type f -mmin +30 -delete\ncd \"/home/atuin/k106eb/k106eb10/ACE/Ni-Al/DFT/VASP_PBE_500_0.125_0.1_NM/AlNi/binaries/bulk/base-hcp/occ-shaken/hcp16.occ.4.shake.0/cfg/NiAl3NiAl11\"\nams_pipeline pipeline.json \u003e \"/home/atuin/k106eb/k106eb10/ACE/Ni-Al/DFT/VASP_PBE_500_0.125_0.1_NM/AlNi/binaries/bulk/base-hcp/occ-shaken/hcp16.occ.4.shake.0/cfg/NiAl3NiAl11/ams_pipeline_job.sh.out\" 2\u003e \"/home/atuin/k106eb/k106eb10/ACE/Ni-Al/DFT/VASP_PBE_500_0.125_0.1_NM/AlNi/binaries/bulk/base-hcp/occ-shaken/hcp16.occ.4.shake.0/cfg/NiAl3NiAl11/ams_pipeline_job.sh.err\"\n","slurmInfo":"\nJobId=398955 JobName=ams_pipeline\n UserId=k106eb10(210387) GroupId=80111\n Account=k106eb QOS=normal \n Requeue=False Restarts=0 BatchFlag=True \n TimeLimit=1439\n SubmitTime=2023-02-09T14:11:22\n Partition=singlenode \n NodeList=f0720\n NumNodes=1 NumCPUs=72 NumTasks=72 CPUs/Task=1\n NTasksPerNode:Socket:Core=0:None:None\n TRES_req=cpu=72,mem=250000M,node=1,billing=72\n TRES_alloc=cpu=72,node=1,billing=72\n Command=/home/atuin/k106eb/k106eb10/ACE/Ni-Al/DFT/VASP_PBE_500_0.125_0.1_NM/AlNi/binaries/bulk/base-hcp/occ-shaken/hcp16.occ.4.shake.0/cfg/NiAl3NiAl11/ams_pipeline_job.sh\n 
WorkDir=/home/atuin/k106eb/k106eb10/ACE/Ni-Al/DFT/VASP_PBE_500_0.125_0.1_NM/AlNi/binaries/bulk/base-hcp/occ-shaken/hcp16.occ.4.shake.0/cfg/NiAl3NiAl11\n StdErr=\n StdOut=ams_pipeline.o%j\n"},"startTime":1675956725,"statistics":{"clock":{"unit":{"base":"Hz","prefix":"M"},"avg":2335.254,"min":800.418,"max":2734.922},"cpu_load":{"unit":{"base":""},"avg":52.72,"min":34.46,"max":71.91},"cpu_power":{"unit":{"base":"W"},"avg":407.767,"min":93.932,"max":497.636},"cpu_user":{"unit":{"base":""},"avg":63.678,"min":19.872,"max":96.633},"flops_any":{"unit":{"base":"F/s","prefix":"G"},"avg":635.672,"min":0,"max":1332.874},"flops_dp":{"unit":{"base":"F/s","prefix":"G"},"avg":261.006,"min":0,"max":382.294},"flops_sp":{"unit":{"base":"F/s","prefix":"G"},"avg":113.659,"min":0,"max":568.286},"ib_recv":{"unit":{"base":"B/s"},"avg":27981.111,"min":69.4,"max":48084.589},"ib_recv_pkts":{"unit":{"base":"packets/s"},"avg":398.939,"min":0.5,"max":693.817},"ib_xmit":{"unit":{"base":"B/s"},"avg":188.513,"min":39.597,"max":724.568},"ib_xmit_pkts":{"unit":{"base":"packets/s"},"avg":0.867,"min":0.2,"max":2.933},"ipc":{"unit":{"base":"IPC"},"avg":0.944,"min":0.564,"max":1.291},"mem_bw":{"unit":{"base":"B/s","prefix":"G"},"avg":79.565,"min":0.021,"max":116.02},"mem_power":{"unit":{"base":"W"},"avg":24.692,"min":7.883,"max":31.318},"mem_used":{"unit":{"base":"B","prefix":"G"},"avg":22.566,"min":8.225,"max":27.613},"nfs4_read":{"unit":{"base":"B/s","prefix":"M"},"avg":647,"min":0,"max":1946},"nfs4_total":{"unit":{"base":"B/s","prefix":"M"},"avg":6181.6,"min":1270,"max":11411},"nfs4_write":{"unit":{"base":"B/s","prefix":"M"},"avg":22.4,"min":11,"max":29},"vectorization_ratio":{"unit":{"base":"%"},"avg":77.351,"min":0,"max":98.837}}} diff --git a/internal/importer/testdata/meta-fritzMinimal.input b/internal/importer/testdata/meta-fritzMinimal.input index f2cce79..f0289fb 100644 --- a/internal/importer/testdata/meta-fritzMinimal.input +++ b/internal/importer/testdata/meta-fritzMinimal.input @@ -1 +1 @@ 
-{"jobId":398764,"user":"k106eb10","project":"k106eb","cluster":"fritz","subCluster":"main","numNodes":1,"exclusive":1,"jobState":"completed","duration":177,"resources":[{"hostname":"f0649"}],"startTime":1675954353,"statistics":{"clock":{"unit":{"base":"Hz","prefix":"M"},"avg":1336.519,"min":801.564,"max":2348.215},"cpu_load":{"unit":{"base":""},"avg":31.64,"min":17.36,"max":45.54},"cpu_power":{"unit":{"base":"W"},"avg":150.018,"min":93.672,"max":261.592},"cpu_user":{"unit":{"base":""},"avg":28.518,"min":0.09,"max":57.343},"flops_any":{"unit":{"base":"F/s","prefix":"G"},"avg":45.012,"min":0,"max":135.037},"flops_dp":{"unit":{"base":"F/s","prefix":"G"},"avg":22.496,"min":0,"max":67.488},"flops_sp":{"unit":{"base":"F/s","prefix":"G"},"avg":0.02,"min":0,"max":0.061},"ib_recv":{"unit":{"base":"B/s"},"avg":14442.82,"min":219.998,"max":42581.368},"ib_recv_pkts":{"unit":{"base":"packets/s"},"avg":201.532,"min":1.25,"max":601.345},"ib_xmit":{"unit":{"base":"B/s"},"avg":282.098,"min":56.2,"max":569.363},"ib_xmit_pkts":{"unit":{"base":"packets/s"},"avg":1.228,"min":0.433,"max":2},"ipc":{"unit":{"base":"IPC"},"avg":0.77,"min":0.564,"max":0.906},"mem_bw":{"unit":{"base":"B/s","prefix":"G"},"avg":4.872,"min":0.025,"max":14.552},"mem_power":{"unit":{"base":"W"},"avg":7.725,"min":6.286,"max":10.556},"mem_used":{"unit":{"base":"B","prefix":"G"},"avg":6.162,"min":6.103,"max":6.226},"nfs4_read":{"unit":{"base":"B/s","prefix":"M"},"avg":1045.333,"min":311,"max":1525},"nfs4_total":{"unit":{"base":"B/s","prefix":"M"},"avg":6430,"min":2796,"max":11518},"nfs4_write":{"unit":{"base":"B/s","prefix":"M"},"avg":24.333,"min":0,"max":38},"vectorization_ratio":{"unit":{"base":"%"},"avg":25.528,"min":0,"max":76.585}}} +{"jobId":398764,"user":"k106eb10","project":"k106eb","cluster":"fritz","subCluster":"main","numNodes":1,"shared":"none","jobState":"completed","duration":177,"resources":[{"hostname":"f0649"}],"startTime":1675954353,"statistics":{"clock":{"unit":{"base":"Hz","prefix":"M"},"avg":1336.519,"min":801.564,"max":2348.215},"cpu_load":{"unit":{"base":""},"avg":31.64,"min":17.36,"max":45.54},"cpu_power":{"unit":{"base":"W"},"avg":150.018,"min":93.672,"max":261.592},"cpu_user":{"unit":{"base":""},"avg":28.518,"min":0.09,"max":57.343},"flops_any":{"unit":{"base":"F/s","prefix":"G"},"avg":45.012,"min":0,"max":135.037},"flops_dp":{"unit":{"base":"F/s","prefix":"G"},"avg":22.496,"min":0,"max":67.488},"flops_sp":{"unit":{"base":"F/s","prefix":"G"},"avg":0.02,"min":0,"max":0.061},"ib_recv":{"unit":{"base":"B/s"},"avg":14442.82,"min":219.998,"max":42581.368},"ib_recv_pkts":{"unit":{"base":"packets/s"},"avg":201.532,"min":1.25,"max":601.345},"ib_xmit":{"unit":{"base":"B/s"},"avg":282.098,"min":56.2,"max":569.363},"ib_xmit_pkts":{"unit":{"base":"packets/s"},"avg":1.228,"min":0.433,"max":2},"ipc":{"unit":{"base":"IPC"},"avg":0.77,"min":0.564,"max":0.906},"mem_bw":{"unit":{"base":"B/s","prefix":"G"},"avg":4.872,"min":0.025,"max":14.552},"mem_power":{"unit":{"base":"W"},"avg":7.725,"min":6.286,"max":10.556},"mem_used":{"unit":{"base":"B","prefix":"G"},"avg":6.162,"min":6.103,"max":6.226},"nfs4_read":{"unit":{"base":"B/s","prefix":"M"},"avg":1045.333,"min":311,"max":1525},"nfs4_total":{"unit":{"base":"B/s","prefix":"M"},"avg":6430,"min":2796,"max":11518},"nfs4_write":{"unit":{"base":"B/s","prefix":"M"},"avg":24.333,"min":0,"max":38},"vectorization_ratio":{"unit":{"base":"%"},"avg":25.528,"min":0,"max":76.585}}} diff --git a/internal/tagger/apps/vasp.txt b/internal/tagger/apps/vasp.txt index bd537e4..9f9b9d5 100644 --- 
a/internal/tagger/apps/vasp.txt +++ b/internal/tagger/apps/vasp.txt @@ -1 +1,2 @@ vasp +VASP diff --git a/pkg/archive/testdata/archive/emmy/1403/244/1608923076/meta.json b/pkg/archive/testdata/archive/emmy/1403/244/1608923076/meta.json index 1ce3f87..aadf21c 100644 --- a/pkg/archive/testdata/archive/emmy/1403/244/1608923076/meta.json +++ b/pkg/archive/testdata/archive/emmy/1403/244/1608923076/meta.json @@ -1,194 +1,194 @@ { - "exclusive": 1, - "jobId": 1403244, - "statistics": { - "mem_bw": { - "avg": 63.57, - "min": 0, - "unit": { - "base": "B/s", - "prefix": "G" - }, - "max": 74.5 - }, - "rapl_power": { - "avg": 228.07, - "min": 0, - "unit": { - "base": "W" - }, - "max": 258.56 - }, - "ipc": { - "unit": { - "base": "IPC" - }, - "max": 0.510204081632653, - "avg": 1.53846153846154, - "min": 0.0 - }, - "clock": { - "min": 1380.32, - "avg": 2599.39, - "unit": { - "base": "Hz", - "prefix": "M" - }, - "max": 2634.46 - }, - "cpu_load": { - "avg": 18.4, - "min": 0, - "max": 23.58, - "unit": { - "base": "load" - } - }, - "flops_any": { - "max": 404.62, - "unit": { - "base": "F/s", - "prefix": "G" - }, - "avg": 225.59, - "min": 0 - }, - "flops_dp": { - "max": 0.24, - "unit": { - "base": "F/s", - "prefix": "G" - }, - "min": 0, - "avg": 0 - }, - "mem_used": { - "min": 1.55, - "avg": 27.84, - "unit": { - "base": "B", - "prefix": "G" - }, - "max": 37.5 - }, - "flops_sp": { - "min": 0, - "avg": 225.59, - "max": 404.62, - "unit": { - "base": "F/s", - "prefix": "G" - } - } + "shared": "none", + "jobId": 1403244, + "statistics": { + "mem_bw": { + "avg": 63.57, + "min": 0, + "unit": { + "base": "B/s", + "prefix": "G" + }, + "max": 74.5 }, - "resources": [ - { - "hostname": "e0102" - }, - { - "hostname": "e0103" - }, - { - "hostname": "e0105" - }, - { - "hostname": "e0106" - }, - { - "hostname": "e0107" - }, - { - "hostname": "e0108" - }, - { - "hostname": "e0114" - }, - { - "hostname": "e0320" - }, - { - "hostname": "e0321" - }, - { - "hostname": "e0325" - }, - { - "hostname": "e0404" - }, - { - "hostname": "e0415" - }, - { - "hostname": "e0433" - }, - { - "hostname": "e0437" - }, - { - "hostname": "e0439" - }, - { - "hostname": "e0501" - }, - { - "hostname": "e0503" - }, - { - "hostname": "e0505" - }, - { - "hostname": "e0506" - }, - { - "hostname": "e0512" - }, - { - "hostname": "e0513" - }, - { - "hostname": "e0514" - }, - { - "hostname": "e0653" - }, - { - "hostname": "e0701" - }, - { - "hostname": "e0716" - }, - { - "hostname": "e0727" - }, - { - "hostname": "e0728" - }, - { - "hostname": "e0925" - }, - { - "hostname": "e0926" - }, - { - "hostname": "e0929" - }, - { - "hostname": "e0934" - }, - { - "hostname": "e0951" - } - ], - "walltime": 10, - "jobState": "completed", - "cluster": "emmy", - "subCluster": "haswell", - "stopTime": 1609009562, - "user": "emmyUser6", - "startTime": 1608923076, - "partition": "work", - "tags": [], - "project": "no project", - "numNodes": 32, - "duration": 86486 + "rapl_power": { + "avg": 228.07, + "min": 0, + "unit": { + "base": "W" + }, + "max": 258.56 + }, + "ipc": { + "unit": { + "base": "IPC" + }, + "max": 0.510204081632653, + "avg": 1.53846153846154, + "min": 0.0 + }, + "clock": { + "min": 1380.32, + "avg": 2599.39, + "unit": { + "base": "Hz", + "prefix": "M" + }, + "max": 2634.46 + }, + "cpu_load": { + "avg": 18.4, + "min": 0, + "max": 23.58, + "unit": { + "base": "load" + } + }, + "flops_any": { + "max": 404.62, + "unit": { + "base": "F/s", + "prefix": "G" + }, + "avg": 225.59, + "min": 0 + }, + "flops_dp": { + "max": 0.24, + "unit": { + "base": "F/s", + 
"prefix": "G" + }, + "min": 0, + "avg": 0 + }, + "mem_used": { + "min": 1.55, + "avg": 27.84, + "unit": { + "base": "B", + "prefix": "G" + }, + "max": 37.5 + }, + "flops_sp": { + "min": 0, + "avg": 225.59, + "max": 404.62, + "unit": { + "base": "F/s", + "prefix": "G" + } + } + }, + "resources": [ + { + "hostname": "e0102" + }, + { + "hostname": "e0103" + }, + { + "hostname": "e0105" + }, + { + "hostname": "e0106" + }, + { + "hostname": "e0107" + }, + { + "hostname": "e0108" + }, + { + "hostname": "e0114" + }, + { + "hostname": "e0320" + }, + { + "hostname": "e0321" + }, + { + "hostname": "e0325" + }, + { + "hostname": "e0404" + }, + { + "hostname": "e0415" + }, + { + "hostname": "e0433" + }, + { + "hostname": "e0437" + }, + { + "hostname": "e0439" + }, + { + "hostname": "e0501" + }, + { + "hostname": "e0503" + }, + { + "hostname": "e0505" + }, + { + "hostname": "e0506" + }, + { + "hostname": "e0512" + }, + { + "hostname": "e0513" + }, + { + "hostname": "e0514" + }, + { + "hostname": "e0653" + }, + { + "hostname": "e0701" + }, + { + "hostname": "e0716" + }, + { + "hostname": "e0727" + }, + { + "hostname": "e0728" + }, + { + "hostname": "e0925" + }, + { + "hostname": "e0926" + }, + { + "hostname": "e0929" + }, + { + "hostname": "e0934" + }, + { + "hostname": "e0951" + } + ], + "walltime": 10, + "jobState": "completed", + "cluster": "emmy", + "subCluster": "haswell", + "stopTime": 1609009562, + "user": "emmyUser6", + "startTime": 1608923076, + "partition": "work", + "tags": [], + "project": "no project", + "numNodes": 32, + "duration": 86486 } diff --git a/pkg/archive/testdata/archive/emmy/1404/397/1609300556/meta.json b/pkg/archive/testdata/archive/emmy/1404/397/1609300556/meta.json index e1fff10..c1e603a 100644 --- a/pkg/archive/testdata/archive/emmy/1404/397/1609300556/meta.json +++ b/pkg/archive/testdata/archive/emmy/1404/397/1609300556/meta.json @@ -1,194 +1,194 @@ { - "stopTime": 1609387081, - "resources": [ - { - "hostname": "e0151" - }, - { - "hostname": "e0152" - }, - { - "hostname": "e0153" - }, - { - "hostname": "e0232" - }, - { - "hostname": "e0303" - }, - { - "hostname": "e0314" - }, - { - "hostname": "e0344" - }, - { - "hostname": "e0345" - }, - { - "hostname": "e0348" - }, - { - "hostname": "e0507" - }, - { - "hostname": "e0518" - }, - { - "hostname": "e0520" - }, - { - "hostname": "e0522" - }, - { - "hostname": "e0526" - }, - { - "hostname": "e0527" - }, - { - "hostname": "e0528" - }, - { - "hostname": "e0530" - }, - { - "hostname": "e0551" - }, - { - "hostname": "e0604" - }, - { - "hostname": "e0613" - }, - { - "hostname": "e0634" - }, - { - "hostname": "e0639" - }, - { - "hostname": "e0640" - }, - { - "hostname": "e0651" - }, - { - "hostname": "e0653" - }, - { - "hostname": "e0701" - }, - { - "hostname": "e0704" - }, - { - "hostname": "e0751" - }, - { - "hostname": "e0809" - }, - { - "hostname": "e0814" - }, - { - "hostname": "e0819" - }, - { - "hostname": "e0908" - } - ], - "walltime": 10, - "cluster": "emmy", - "subCluster": "haswell", - "jobState": "completed", - "statistics": { - "clock": { - "max": 2634.9, - "unit": { - "base": "Hz", - "prefix": "M" - }, - "min": 0, - "avg": 2597.8 - }, - "cpu_load": { - "max": 27.41, - "min": 0, - "avg": 18.39, - "unit": { - "base": "load" - } - }, - "mem_bw": { - "min": 0, - "avg": 63.23, - "unit": { - "base": "B/s", - "prefix": "G" - }, - "max": 75.06 - }, - "ipc": { - "min": 0.0, - "avg": 1.53846153846154, - "unit": { - "base": "IPC" - }, - "max": 0.490196078431373 - }, - "rapl_power": { - "min": 0, - "avg": 227.32, - "unit": { - 
"base": "W" - }, - "max": 256.22 - }, - "mem_used": { - "min": 1.5, - "avg": 27.77, - "unit": { - "base": "B", - "prefix": "G" - }, - "max": 37.43 - }, - "flops_sp": { - "unit": { - "base": "F/s", - "prefix": "G" - }, - "max": 413.21, - "min": 0, - "avg": 224.41 - }, - "flops_dp": { - "max": 5.72, - "unit": { - "base": "F/s", - "prefix": "G" - }, - "min": 0, - "avg": 0 - }, - "flops_any": { - "min": 0, - "avg": 224.42, - "max": 413.21, - "unit": { - "base": "F/s", - "prefix": "G" - } - } + "stopTime": 1609387081, + "resources": [ + { + "hostname": "e0151" }, - "exclusive": 1, - "jobId": 1404397, - "tags": [], - "partition": "work", - "project": "no project", - "user": "emmyUser6", - "startTime": 1609300556, - "duration": 86525, - "numNodes": 32 + { + "hostname": "e0152" + }, + { + "hostname": "e0153" + }, + { + "hostname": "e0232" + }, + { + "hostname": "e0303" + }, + { + "hostname": "e0314" + }, + { + "hostname": "e0344" + }, + { + "hostname": "e0345" + }, + { + "hostname": "e0348" + }, + { + "hostname": "e0507" + }, + { + "hostname": "e0518" + }, + { + "hostname": "e0520" + }, + { + "hostname": "e0522" + }, + { + "hostname": "e0526" + }, + { + "hostname": "e0527" + }, + { + "hostname": "e0528" + }, + { + "hostname": "e0530" + }, + { + "hostname": "e0551" + }, + { + "hostname": "e0604" + }, + { + "hostname": "e0613" + }, + { + "hostname": "e0634" + }, + { + "hostname": "e0639" + }, + { + "hostname": "e0640" + }, + { + "hostname": "e0651" + }, + { + "hostname": "e0653" + }, + { + "hostname": "e0701" + }, + { + "hostname": "e0704" + }, + { + "hostname": "e0751" + }, + { + "hostname": "e0809" + }, + { + "hostname": "e0814" + }, + { + "hostname": "e0819" + }, + { + "hostname": "e0908" + } + ], + "walltime": 10, + "cluster": "emmy", + "subCluster": "haswell", + "jobState": "completed", + "statistics": { + "clock": { + "max": 2634.9, + "unit": { + "base": "Hz", + "prefix": "M" + }, + "min": 0, + "avg": 2597.8 + }, + "cpu_load": { + "max": 27.41, + "min": 0, + "avg": 18.39, + "unit": { + "base": "load" + } + }, + "mem_bw": { + "min": 0, + "avg": 63.23, + "unit": { + "base": "B/s", + "prefix": "G" + }, + "max": 75.06 + }, + "ipc": { + "min": 0.0, + "avg": 1.53846153846154, + "unit": { + "base": "IPC" + }, + "max": 0.490196078431373 + }, + "rapl_power": { + "min": 0, + "avg": 227.32, + "unit": { + "base": "W" + }, + "max": 256.22 + }, + "mem_used": { + "min": 1.5, + "avg": 27.77, + "unit": { + "base": "B", + "prefix": "G" + }, + "max": 37.43 + }, + "flops_sp": { + "unit": { + "base": "F/s", + "prefix": "G" + }, + "max": 413.21, + "min": 0, + "avg": 224.41 + }, + "flops_dp": { + "max": 5.72, + "unit": { + "base": "F/s", + "prefix": "G" + }, + "min": 0, + "avg": 0 + }, + "flops_any": { + "min": 0, + "avg": 224.42, + "max": 413.21, + "unit": { + "base": "F/s", + "prefix": "G" + } + } + }, + "shared": "none", + "jobId": 1404397, + "tags": [], + "partition": "work", + "project": "no project", + "user": "emmyUser6", + "startTime": 1609300556, + "duration": 86525, + "numNodes": 32 } From 9b644119ae1e2e45ea375a9954e4e1cb74b6fce6 Mon Sep 17 00:00:00 2001 From: Aditya Ujeniya Date: Tue, 9 Sep 2025 18:34:10 +0200 Subject: [PATCH 10/11] Fix to testdata database --- internal/importer/initDB.go | 4 ---- internal/repository/testdata/job.db | Bin 118784 -> 122880 bytes startDemo.sh | 33 ---------------------------- 3 files changed, 37 deletions(-) diff --git a/internal/importer/initDB.go b/internal/importer/initDB.go index 79879b2..179c21c 100644 --- a/internal/importer/initDB.go +++ b/internal/importer/initDB.go 
@@ -142,10 +142,6 @@ func InitDB() error { continue } - if jobMeta.Shared == "" { - jobMeta.Shared = "none" - } - id, err := r.TransactionAddNamed(t, repository.NamedJobInsert, jobMeta) if err != nil { diff --git a/internal/repository/testdata/job.db b/internal/repository/testdata/job.db index 43ec9d3c7f36c7ea505a96cc1208c4ce7a148eed..c8911a6377687841be6a1f2ffce9a36de1bb38bb 100644 GIT binary patch delta 5331 zcmeHLU2NOd73L+W|2UMZ$%riD9pXjmeJT3Z9~TkUrxw^SV8DPO7%&Vguwm#PvWKP_umVGhW#?Q{ z5+%!SvLbt$VN&Ed=YHqhbM86kUY@<`IQx;~C%xS}7>3yeSN-qk<=9ws2ea?{FW7)Y zgxkp2F5K?={kDgQyd$vg zt{TY`&g zh^>5mQ6K*iVE#Y6n z4N&qYA>aUMV&gda%d)H;-d#5K+7jW#b^hv?p*>xmPZ$q>f)QR5_HHLe%o_1J!JjZ= zHo;F_U59Kb!l0V8?kCU@KXzk$;E`YIC*F`Ar3a^gJQ%j!e2IBA-0|vJ|Zuh z77xuFnVDM@=jP!$K06zuVwrp@nNmtcT`q`~TdshB1B#_(E6r!pa#=m8XLB+lRX%t4 z!^1H#9v6?R+0!D@i@KD{Q&e~=w8{={DvK7Bil+K)s48F3o|98LwWo{()0ku)P_ueg zQ^ltz7pA5s7b2tM<56QGQlTK7Nw!SYV5Cb0iH_Mg`BRdj&Aj`SD30^3vD&>peVyTGI1}sh! z(86>cQPXERS(lP&NteXM#50R@t)Mj`M)l}Z^K(?VAQ!dLY9S>TsZ?|Qv`DF?ToO$$ zf=}Al81gcw^h^OJPWi^fku0TBIKUtjb9&{VbD%6n;EDPc7R^ZA*kVR1$mv!atWh!f z7?QB6p?kpv;0i^i%L`v~B?!0iR!7ZgYF5_@AVwobK@B_|gnJ)eY zlw9;c!>L&d)oDH*H2xu&QG1M^EckR{2lh`HRq5c5Ra3!DOwE_O?$qSPs*n; zF|sG+_V@0V+R)BzDG)%b)YgKfpgk>F+_1@5Bo@!MnryD2X6b5~P=ot*idgxp zi&zJx^>WsMwXv|FkYQ_wf>t&-oh#ZC`|N=MXoOiv`O~_zT1>(%n+9LP?=I8zGjoR% z&xlQ~I6sHp27(xY8?=e&OFFDIzn@$@Mni~6jh71$f|iDZwARq=!_Z-!!A#2KF|QLx z5)19|(~3G>Z`f^Uxwp2OPVL8N0r^rf3FTT#Nl@uF!A_x&uIZApi9k-qI{KA3rkIJe z(s3TEcViG0+R7mGA}u@}+Kh$Q+GiSWz?uf-SuUJ3ihMS$(6X1yo<;(#VHlKfT05np za7le$#Yf7im%xlHDSBqpZf9Y)H@uaSLVF$U%MyJw8(E8jyJ|D_3YvlS(Sr*lR&V=J zchs_bEpOPpdLCpZzt&diiFkYFKuXjzr@t+8%8*!;ZA~2@VCt;I9D#8^SrcduV*HBr z%~jHHBhq?X5IadBG}6V0nH$hh>Om^Q`j5_j(;gV|lbl6Q6Mtt1e1qG>ZNMSg@`H}R z7?|y{z||`g?ty6j2TjznQ9@eHrczDxhSex#Ebr)a)E?OFC#x164eXu{7Pmn*kjipY zi8un^277%LtcJaf`i~?09e_7Vb}+oT1CGvw?SZhL$fmMGmY|}F2R2u+0XG_kZu*#? 
zc2hr{(FoqVVLL$2UqgGUS3e|uAzV&)JoBF41ML(KWDnKbDNuzcz%ggCr!_(vBHkanbV zeGQ~q?kG8D6E{HtoBHq=;;Y@6C&WwKHG-ch9Q+l=eUQHbroQsOHq~P>wfgrr$j%tc zcp0Y?Zt!=hWZ%hf0YA7cIDl^ZSz*76IGujMu^L@k`{(&|7)p#p_z0*uC~6;6e7tdzT+$`!ALF z&DB+*n2dXw1+{>Ce4UcKS$JTi|;D delta 4478 zcmc&&eQX=$8Nc^FeP<_6T*tXOiODrps3WD!=?86E6w*TY(r95rh)Q%9Cv}LKTPLym zNn0l^ARq`Qy)pmOR#mlW+N5nFMuY(x)ugU$g=vZyD12;!7N!MA)~-WH{1Kk_&Ubba z$KoH#UG9AMeEgpGdEWP)=Xo=9P8vERjn;WphGByF%Y(n0##+`ksmx7>1}gB)!=Gp1 zdAPBXRJ`q3=i&JC?hllAm6P&QawO-idV(`#e%NT?J<8K}c_r27BPaG71AM?D1@feT zP0C&4CAZL>NW`1RtLA{X#=F%`zgsf;4;Ihem)pj;bOUl zQ-d32;&S07NiKR!tPYcN(t55Vqlf$CosvtB!>76FebOJ5edvNFoW-x!I;r&6f<1c;2O-Dw0c2CC66rP zU6gqzp#K9rQQ~8$7YT>V3L>(W47I}YCq<>5KZz30@OO+fjjSA5G19=+GMwMX2mL;R z0XIU!eZ)H27^SN{?^(~KsjIekJ(2!Cbe172$o)gvv2o(heQ`M~Y$8>JVL0Zo#@=|M zKdDFd9MmKG9%MQU8EGR6>>R6i>=-+zgCaw;BD_Bmass2i|9|Ze>HyjdEocn>bXOYjRgfi{kUadB?Fabm8m5T+S;AKri$;S4;B5(gmd z{2v7=#CL zI9nlICj=4G4l-Skjuuyj$JSCkXL9E0)eAv%(GIqm1sqIq0ke|3h-7>K)fRURXi@X3V#=l3L1Zr*SRjPiS?4{7sg&A zxrn@vD8q;ABz4s)a(sjFk@7mfHyIr?e&b$~ezaCn*ZatDAhu^$^uVKuZap4LrgX|r z>d91OFoiVVZ9nSV@<69{_qN+RAJR$*Y1_9^x)w53^i*tjv_+#JEv7h9Q_kzE85D^&Ppb-%jNX4xtA{Y32V7qfA{kJpFMHI%N;J- z(1yMoSwj{m9;;uVxO4iLP{nni$y#bYz~fEM&CQak@gZJwv1}(cdTi$lV{6%_%;C(T z%z?~4K44cL+UH?o?X+KF>k!%wZ2bv3(46nD#Dq*s?XQ*`oDU=DMwZ|X~-DPAYDIdG+^bbg_i1UuF&=yPD zJo=cJAJ(9lA66{i9*lO0<%YE(i#wnY$JG5di&GV7w5L~Ga#W52o!dc@{pT7YPfWJr zh;A!8w-qqNc$CvD_RwvW|E_$pa$Utc6~h&NPqlkWIj@|?vqR~i{wU=u(YEIAwHBod zbq{me^{++gLTab^q$sh|Om7v?@3I!9?fUOj8?<{JYf-w;TwJGGgqdPT12zt_S&Z(n z!<@tieo;xI6Ogn^j$MQ8+MC?lZ9mevGv0THLis%8#$UQ?sm6<@Cz^w4#=?I^Fo0Lp MMCcs3KsTk|00&{a4gdfE diff --git a/startDemo.sh b/startDemo.sh index b027bf5..8087b1c 100755 --- a/startDemo.sh +++ b/startDemo.sh @@ -12,41 +12,8 @@ else cp ./configs/env-template.txt .env cp ./configs/config-demo.json config.json - # mkdir -p ./var/checkpoints - # cp -rf ~/cc-metric-store/var/checkpoints ~/cc-backend/var - ./cc-backend -migrate-db ./cc-backend -dev -init-db -add-user demo:admin,api:demo - - # --- begin: generate JWT for demo and update test_ccms_write_api.sh --- - CC_BIN="./cc-backend" - TEST_FILE="./test_ccms_write_api.sh" - BACKUP_FILE="${TEST_FILE}.bak" - - if [ -x "$CC_BIN" ]; then - echo "Generating JWT for user 'demo'..." - output="$($CC_BIN -jwt demo 2>&1 || true)" - token="$(printf '%s\n' "$output" | grep -oE '[A-Za-z0-9_-]+\.[A-Za-z0-9_-]+\.[A-Za-z0-9_-]+' | head -n1 || true)" - - if [ -z "$token" ]; then - echo "Warning: could not extract JWT from output:" >&2 - printf '%s\n' "$output" >&2 - else - if [ -f "$TEST_FILE" ]; then - cp -a "$TEST_FILE" "$BACKUP_FILE" - # replace first line with JWT="..." - sed -i "1s#.*#JWT=\"$token\"#" "$TEST_FILE" - echo "Updated JWT in $TEST_FILE (backup at $BACKUP_FILE)" - else - echo "Warning: $TEST_FILE not found; JWT not written." - fi - fi - else - echo "Warning: $CC_BIN not found or not executable; skipping JWT generation." 
-    fi
-    # --- end: generate JWT for demo and update test_ccms_write_api.sh ---
-
-    ./cc-backend -server -dev
 fi

From 79605c8a9ea0d21d827567589d0bf939eece889a Mon Sep 17 00:00:00 2001
From: Jan Eitzinger
Date: Wed, 10 Sep 2025 09:08:32 +0200
Subject: [PATCH 11/11] Update test pipeline to go 1.25

---
 .github/workflows/test.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index a8a7429..6974301 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -7,7 +7,7 @@ jobs:
       - name: Install Go
         uses: actions/setup-go@v4
         with:
-          go-version: 1.24.x
+          go-version: 1.25.x
       - name: Checkout code
        uses: actions/checkout@v3
       - name: Build, Vet & Test
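
Reviewer note: the recurring change in the testdata updates above is the replacement of the legacy integer `exclusive` flag with the string-valued `shared` field (`"exclusive": 1` becomes `"shared": "none"`), while PATCH 10/11 removes the importer's defaulting of an empty `jobMeta.Shared` to `"none"`. The following is a minimal Go sketch of the mapping those rewrites imply, for reference only; `normalizeShared` is a hypothetical helper, and the `"node"` fallback for non-exclusive legacy records is an assumption not stated by this patch.

```go
package main

import "fmt"

// normalizeShared is a hypothetical helper illustrating the field migration
// visible in the testdata diffs: the legacy "exclusive": 1 flag corresponds
// to "shared": "none" in the new schema. The "node" fallback for
// non-exclusive legacy jobs is an assumption for illustration only.
func normalizeShared(exclusive int, shared string) string {
	if shared != "" {
		// Record already carries the new field; keep it as-is.
		return shared
	}
	if exclusive == 1 {
		// Exclusive job: no resources are shared with other jobs.
		return "none"
	}
	// Assumed default for legacy non-exclusive records.
	return "node"
}

func main() {
	fmt.Println(normalizeShared(1, ""))     // "none" — matches the testdata rewrite
	fmt.Println(normalizeShared(0, ""))     // "node" — assumed mapping
	fmt.Println(normalizeShared(0, "none")) // already-migrated records pass through
}
```

Under these assumptions, already-migrated records (as in the rewritten meta.json files) pass through unchanged, which is consistent with the importer no longer needing the removed defaulting block.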