Migration SQL fix

Aditya Ujeniya
2025-09-03 08:22:15 +02:00
parent a50b832c2a
commit bca176170c
13 changed files with 172 additions and 36 deletions


@@ -18,6 +18,7 @@ import (
"github.com/ClusterCockpit/cc-backend/internal/auth"
"github.com/ClusterCockpit/cc-backend/internal/config"
"github.com/ClusterCockpit/cc-backend/internal/importer"
"github.com/ClusterCockpit/cc-backend/internal/memorystore"
"github.com/ClusterCockpit/cc-backend/internal/metricdata"
"github.com/ClusterCockpit/cc-backend/internal/repository"
"github.com/ClusterCockpit/cc-backend/internal/tagger"
@@ -96,6 +97,12 @@ func main() {
} else {
cclog.Abort("Cluster configuration must be present")
}
if mscfg := ccconf.GetPackageConfig("metric-store"); mscfg != nil {
config.InitMetricStore(mscfg)
} else {
cclog.Abort("Metric Store configuration must be present")
}
} else {
cclog.Abort("Main configuration must be present")
}
@@ -201,7 +208,7 @@ func main() {
if archiveCfg := ccconf.GetPackageConfig("archive"); archiveCfg != nil {
err = archive.Init(archiveCfg, config.Keys.DisableArchive)
} else {
err = archive.Init(json.RawMessage(`{\"kind\":\"file\",\"path\":\"./var/job-archive\"}`), config.Keys.DisableArchive)
err = archive.Init(json.RawMessage("{\"kind\":\"file\",\"path\":\"./var/job-archive\"}"), config.Keys.DisableArchive)
}
if err != nil {
cclog.Abortf("Init: Failed to initialize archive.\nError: %s\n", err.Error())
@@ -241,10 +248,15 @@ func main() {
cclog.Exit("No errors, server flag not set. Exiting cc-backend.")
}
// Metric Store starts after all flags have been processed
memorystore.Init()
archiver.Start(repository.GetJobRepository())
taskManager.Start(ccconf.GetPackageConfig("cron"),
ccconf.GetPackageConfig("archive"))
// // Comment out
// taskManager.Start(ccconf.GetPackageConfig("cron"),
// ccconf.GetPackageConfig("archive"))
serverInit()
var wg sync.WaitGroup


@@ -26,6 +26,7 @@ import (
"github.com/ClusterCockpit/cc-backend/internal/config"
"github.com/ClusterCockpit/cc-backend/internal/graph"
"github.com/ClusterCockpit/cc-backend/internal/graph/generated"
"github.com/ClusterCockpit/cc-backend/internal/memorystore"
"github.com/ClusterCockpit/cc-backend/internal/routerConfig"
"github.com/ClusterCockpit/cc-backend/web"
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
@@ -325,6 +326,9 @@ func serverShutdown() {
// First shut down the server gracefully (waiting for all ongoing requests)
server.Shutdown(context.Background())
// Archive all the metric store data
memorystore.Shutdown()
// Then, wait for any async archivings still pending...
archiver.WaitForArchiving()
}
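Shutdown is now three ordered phases: drain in-flight HTTP requests, checkpoint the in-memory metric data, then wait for pending job archivings. A sketch of how serverShutdown might be hooked to termination signals (the signal plumbing is an assumption for illustration, not part of this diff; it needs os, os/signal and syscall):

sigs := make(chan os.Signal, 1)
signal.Notify(sigs, syscall.SIGINT, syscall.SIGTERM)
go func() {
	<-sigs
	// server.Shutdown -> memorystore.Shutdown -> archiver.WaitForArchiving
	serverShutdown()
}()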

go.mod

@@ -51,6 +51,7 @@ require (
github.com/go-openapi/spec v0.21.0 // indirect
github.com/go-openapi/swag v0.23.1 // indirect
github.com/go-viper/mapstructure/v2 v2.4.0 // indirect
github.com/golang/snappy v0.0.4 // indirect
github.com/google/uuid v1.6.0 // indirect
github.com/gorilla/securecookie v1.1.2 // indirect
github.com/gorilla/websocket v1.5.3 // indirect
@@ -63,6 +64,7 @@ require (
github.com/json-iterator/go v1.1.12 // indirect
github.com/lann/builder v0.0.0-20180802200727-47ae307949d0 // indirect
github.com/lann/ps v0.0.0-20150810152359-62de8c46ede0 // indirect
github.com/linkedin/goavro/v2 v2.14.0 // indirect
github.com/mailru/easyjson v0.9.0 // indirect
github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect
github.com/modern-go/reflect2 v1.0.2 // indirect

go.sum

@@ -91,6 +91,9 @@ github.com/golang-jwt/jwt/v5 v5.2.2 h1:Rl4B7itRWVtYIHFrSNd7vhTiz9UpLdi6gZhZ3wEeD
github.com/golang-jwt/jwt/v5 v5.2.2/go.mod h1:pqrtFR0X4osieyHYxtmOUWsAWrfe1Q5UVIyoH402zdk=
github.com/golang-migrate/migrate/v4 v4.18.2 h1:2VSCMz7x7mjyTXx3m2zPokOY82LTRgxK1yQYKo6wWQ8=
github.com/golang-migrate/migrate/v4 v4.18.2/go.mod h1:2CM6tJvn2kqPXwnXO/d3rAQYiyoIm180VsO8PRX6Rpk=
github.com/golang/snappy v0.0.1/go.mod h1:/XxbfmMg8lxefKM7IXC3fBNl/7bRcc72aCRzEWrmP2Q=
github.com/golang/snappy v0.0.4 h1:yAGX7huGHXlcLOEtBnF4w7FQwA26wojNCwOYAEhLjQM=
github.com/golang/snappy v0.0.4/go.mod h1:/XxbfmMg8lxefKM7IXC3fBNl/7bRcc72aCRzEWrmP2Q=
github.com/google/go-cmp v0.6.0/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY=
github.com/google/go-cmp v0.7.0 h1:wk8382ETsv4JYUZwIsn6YpYiWiBsYLSJiTsyBybVuN8=
github.com/google/go-cmp v0.7.0/go.mod h1:pXiqmnSA92OHEEa9HXL2W4E7lf9JzCmGVUdgjX3N/iU=
@@ -166,6 +169,8 @@ github.com/lann/ps v0.0.0-20150810152359-62de8c46ede0/go.mod h1:vmVJ0l/dxyfGW6Fm
github.com/lib/pq v1.2.0/go.mod h1:5WUZQaWbwv1U+lTReE5YruASi9Al49XbQIvNi/34Woo=
github.com/lib/pq v1.10.9 h1:YXG7RB+JIjhP29X+OtkiDnYaXQwpS4JEWq7dtCCRUEw=
github.com/lib/pq v1.10.9/go.mod h1:AlVN5x4E4T544tWzH6hKfbfQvm3HdbOxrmggDNAPY9o=
github.com/linkedin/goavro/v2 v2.14.0 h1:aNO/js65U+Mwq4yB5f1h01c3wiM458qtRad1DN0CMUI=
github.com/linkedin/goavro/v2 v2.14.0/go.mod h1:KXx+erlq+RPlGSPmLF7xGo6SAbh8sCQ53x064+ioxhk=
github.com/mailru/easyjson v0.9.0 h1:PrnmzHw7262yW8sTBwxi1PdJA3Iw/EKBa8psRf7d9a4=
github.com/mailru/easyjson v0.9.0/go.mod h1:1+xMtQp2MRNVL/V1bOzuP3aP8VNwRW55fQUto+XFtTU=
github.com/mattn/go-sqlite3 v1.10.0/go.mod h1:FPy6KqzDD04eiIsT53CuJW3U88zkxoIYsOqkbpncsNc=
@@ -233,6 +238,7 @@ github.com/stretchr/testify v1.2.2/go.mod h1:a8OnRcib4nhh0OaRAV+Yts87kKdq0PP7pXf
github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI=
github.com/stretchr/testify v1.4.0/go.mod h1:j7eGeouHqKxXV5pUuKE4zz7dFj8WfuZ+81PSLYec5m4=
github.com/stretchr/testify v1.7.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg=
github.com/stretchr/testify v1.7.5/go.mod h1:yNjHg4UonilssWZ8iaSj1OCr/vHnekPRkoO+kdMU+MU=
github.com/stretchr/testify v1.8.0/go.mod h1:yNjHg4UonilssWZ8iaSj1OCr/vHnekPRkoO+kdMU+MU=
github.com/stretchr/testify v1.8.1/go.mod h1:w2LPCIKwWwSfY2zedu0+kehJoqGctiVI29o6fzry7u4=
github.com/stretchr/testify v1.10.0 h1:Xv5erBjTwe/5IxqUQTdXv5kgmIvbHo3QQyRwhJsOfJA=


@@ -9,6 +9,7 @@ import (
"encoding/json"
"time"
"github.com/ClusterCockpit/cc-backend/internal/memorystore"
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
)
@@ -166,3 +167,12 @@ func Init(mainConfig json.RawMessage, clusterConfig json.RawMessage) {
cclog.Abort("Config Init: At least one cluster required in config. Exited with error.")
}
}
func InitMetricStore(msConfig json.RawMessage) {
// Validate(msConfigSchema, msConfig)
dec := json.NewDecoder(bytes.NewReader(msConfig))
dec.DisallowUnknownFields()
if err := dec.Decode(&memorystore.Keys); err != nil {
cclog.Abortf("Metric Store Config Init: Could not decode config file '%s'.\nError: %s\n", msConfig, err.Error())
}
}
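dec.DisallowUnknownFields() turns any unrecognized key in the metric-store block into a hard error instead of silently ignoring it, so configuration typos abort startup. A self-contained sketch with a hypothetical target struct (the real target is memorystore.Keys):

package main

import (
	"bytes"
	"encoding/json"
	"fmt"
)

func main() {
	var keys struct {
		Retention string `json:"retention-in-memory"` // hypothetical field
	}
	dec := json.NewDecoder(bytes.NewReader([]byte(`{"retention-in-memory":"48h","typo":1}`)))
	dec.DisallowUnknownFields()
	fmt.Println(dec.Decode(&keys)) // json: unknown field "typo"
}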


@@ -142,6 +142,10 @@ func InitDB() error {
continue
}
if jobMeta.Shared == "" {
jobMeta.Shared = "none"
}
id, err := r.TransactionAddNamed(t,
repository.NamedJobInsert, jobMeta)
if err != nil {


@@ -19,7 +19,7 @@ import (
"sync/atomic"
"time"
"github.com/ClusterCockpit/cc-backend/pkg/avro"
"github.com/ClusterCockpit/cc-backend/internal/avro"
"github.com/ClusterCockpit/cc-lib/util"
"github.com/linkedin/goavro/v2"
)


@@ -2,16 +2,18 @@ package memorystore
import (
"context"
"encoding/json"
"errors"
"fmt"
"log"
"runtime"
"sync"
"time"
"github.com/ClusterCockpit/cc-backend/pkg/avro"
"github.com/ClusterCockpit/cc-backend/internal/avro"
"github.com/ClusterCockpit/cc-lib/resampler"
"github.com/ClusterCockpit/cc-lib/schema"
"github.com/ClusterCockpit/cc-lib/util"
"github.com/ClusterCockpit/cc-metric-store/internal/config"
)
var (
@@ -29,20 +31,101 @@ func init() {
}
}
// For aggregation over multiple values at different cpus/sockets/..., not time!
type AggregationStrategy int
const (
NoAggregation AggregationStrategy = iota
SumAggregation
AvgAggregation
)
func (as *AggregationStrategy) UnmarshalJSON(data []byte) error {
var str string
if err := json.Unmarshal(data, &str); err != nil {
return err
}
switch str {
case "":
*as = NoAggregation
case "sum":
*as = SumAggregation
case "avg":
*as = AvgAggregation
default:
return fmt.Errorf("invalid aggregation strategy: %#v", str)
}
return nil
}
type MetricConfig struct {
// Interval in seconds at which measurements will arrive.
Frequency int64 `json:"frequency"`
// Can be 'sum', 'avg' or null. Describes how to aggregate metrics from the same timestep over the hierarchy.
Aggregation AggregationStrategy `json:"aggregation"`
// Private, used internally...
Offset int
}
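With this UnmarshalJSON, the aggregation strategy is written as a plain string in the cluster config; an empty string, or omitting the field entirely, leaves the zero value NoAggregation. A usage sketch against the types above:

var mc MetricConfig
if err := json.Unmarshal([]byte(`{"frequency": 60, "aggregation": "avg"}`), &mc); err != nil {
	log.Fatal(err)
}
// mc.Frequency == 60, mc.Aggregation == AvgAggregation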
type Metric struct {
Name string
Value util.Float
MetricConfig config.MetricConfig
MetricConfig MetricConfig
}
type MemoryStore struct {
Metrics map[string]config.MetricConfig
Metrics map[string]MetricConfig
root Level
}
func Init() {
startupTime := time.Now()
// Pass the keys from the cluster config
InitMetrics()
ms := GetMemoryStore()
d, err := time.ParseDuration(Keys.Checkpoints.Restore)
if err != nil {
log.Fatal(err)
}
restoreFrom := startupTime.Add(-d)
log.Printf("Loading checkpoints newer than %s\n", restoreFrom.Format(time.RFC3339))
files, err := ms.FromCheckpointFiles(Keys.Checkpoints.RootDir, restoreFrom.Unix())
loadedData := ms.SizeInBytes() / 1024 / 1024 // In MB
if err != nil {
log.Fatalf("Loading checkpoints failed: %s\n", err.Error())
} else {
log.Printf("Checkpoints loaded (%d files, %d MB, that took %fs)\n", files, loadedData, time.Since(startupTime).Seconds())
}
// Try to use less memory by forcing a GC run here and then
// lowering the target percentage. The default of 100 means
// that a GC is only triggered once new allocations exceed
// the size of the previously active heap. Forcing a GC here
// sets that "previously active heap" to a minimum.
runtime.GC()
ctx, _ := context.WithCancel(context.Background())
var wg sync.WaitGroup
wg.Add(4)
Retention(&wg, ctx)
Checkpointing(&wg, ctx)
Archiving(&wg, ctx)
avro.DataStaging(&wg, ctx)
}
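The restore window is simply now minus the configured duration; only checkpoint files newer than that are loaded back into memory. A worked sketch, assuming "48h" as an example value for Keys.Checkpoints.Restore:

d, _ := time.ParseDuration("48h") // assumed example value
restoreFrom := time.Now().Add(-d)
fmt.Println("loading checkpoints newer than", restoreFrom.Format(time.RFC3339))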
// Create a new, initialized instance of a MemoryStore.
// Will panic if values in the metric configurations are invalid.
func Init(metrics map[string]config.MetricConfig) {
func InitMetrics(metrics map[string]MetricConfig) {
singleton.Do(func() {
offset := 0
for key, cfg := range metrics {
@@ -50,7 +133,7 @@ func Init(metrics map[string]config.MetricConfig) {
panic("invalid frequency")
}
metrics[key] = config.MetricConfig{
metrics[key] = MetricConfig{
Frequency: cfg.Frequency,
Aggregation: cfg.Aggregation,
Offset: offset,
@@ -77,16 +160,16 @@ func GetMemoryStore() *MemoryStore {
}
func Shutdown() {
log.Printf("Writing to '%s'...\n", config.Keys.Checkpoints.RootDir)
log.Printf("Writing to '%s'...\n", Keys.Checkpoints.RootDir)
var files int
var err error
ms := GetMemoryStore()
if config.Keys.Checkpoints.FileFormat == "json" {
files, err = ms.ToCheckpoint(config.Keys.Checkpoints.RootDir, lastCheckpoint.Unix(), time.Now().Unix())
if Keys.Checkpoints.FileFormat == "json" {
files, err = ms.ToCheckpoint(Keys.Checkpoints.RootDir, lastCheckpoint.Unix(), time.Now().Unix())
} else {
files, err = avro.GetAvroStore().ToCheckpoint(config.Keys.Checkpoints.RootDir, true)
files, err = avro.GetAvroStore().ToCheckpoint(Keys.Checkpoints.RootDir, true)
close(avro.LineProtocolMessages)
}
@@ -172,7 +255,7 @@ func Retention(wg *sync.WaitGroup, ctx context.Context) {
go func() {
defer wg.Done()
d, err := time.ParseDuration(config.Keys.RetentionInMemory)
d, err := time.ParseDuration(Keys.RetentionInMemory)
if err != nil {
log.Fatal(err)
}
@@ -261,7 +344,7 @@ func (m *MemoryStore) WriteToLevel(l *Level, selector []string, ts int64, metric
// If the level does not hold the metric itself, the data will be aggregated recursively from the children.
// The second and third return value are the actual from/to for the data. Those can be different from
// the range asked for if no data was available.
func (m *MemoryStore) Read(selector util.Selector, metric string, from, to, resolution int64) ([]util.Float, int64, int64, int64, error) {
func (m *MemoryStore) Read(selector util.Selector, metric string, from, to, resolution int64) ([]schema.Float, int64, int64, int64, error) {
if from > to {
return nil, 0, 0, 0, errors.New("invalid time range")
}
@@ -271,7 +354,7 @@ func (m *MemoryStore) Read(selector util.Selector, metric string, from, to, reso
return nil, 0, 0, 0, errors.New("unkown metric: " + metric)
}
n, data := 0, make([]util.Float, (to-from)/minfo.Frequency+1)
n, data := 0, make([]schema.Float, (to-from)/minfo.Frequency+1)
err := m.root.findBuffers(selector, minfo.Offset, func(b *buffer) error {
cdata, cfrom, cto, err := b.read(from, to, data)
@@ -309,12 +392,12 @@ func (m *MemoryStore) Read(selector util.Selector, metric string, from, to, reso
} else if n == 0 {
return nil, 0, 0, 0, errors.New("metric or host not found")
} else if n > 1 {
if minfo.Aggregation == config.AvgAggregation {
normalize := 1. / util.Float(n)
if minfo.Aggregation == AvgAggregation {
normalize := 1. / schema.Float(n)
for i := 0; i < len(data); i++ {
data[i] *= normalize
}
} else if minfo.Aggregation != config.SumAggregation {
} else if minfo.Aggregation != SumAggregation {
return nil, 0, 0, 0, errors.New("invalid aggregation")
}
}
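The n > 1 branch handles samples that findBuffers summed across several buffers (e.g. the hwthreads of one node): AvgAggregation rescales each sample by 1/n, SumAggregation keeps the sums, and anything else is an error. A worked sketch of the rescaling:

// Two hwthreads reporting 10 and 20 in the same timestep were summed to 30 (n = 2).
data := []schema.Float{30}
normalize := 1. / schema.Float(2)
data[0] *= normalize // AvgAggregation result: 15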


@@ -5,7 +5,6 @@ import (
"math"
"github.com/ClusterCockpit/cc-lib/util"
"github.com/ClusterCockpit/cc-metric-store/internal/config"
)
type Stats struct {
@@ -105,9 +104,9 @@ func (m *MemoryStore) Stats(selector util.Selector, metric string, from, to int6
return nil, 0, 0, ErrNoData
}
if minfo.Aggregation == config.AvgAggregation {
if minfo.Aggregation == AvgAggregation {
avg /= util.Float(n)
} else if n > 1 && minfo.Aggregation != config.SumAggregation {
} else if n > 1 && minfo.Aggregation != SumAggregation {
return nil, 0, 0, errors.New("invalid aggregation")
}


@@ -14,19 +14,19 @@ import (
)
const NamedJobCacheInsert string = `INSERT INTO job_cache (
job_id, hpc_user, project, cluster, subcluster, cluster_partition, array_job_id, num_nodes, num_hwthreads, num_acc,
exclusive, monitoring_status, smt, job_state, start_time, duration, walltime, footprint, energy, energy_footprint, resources, meta_data
job_id, hpc_user, project, hpc_cluster, subcluster, cluster_partition, array_job_id, num_nodes, num_hwthreads, num_acc,
shared, monitoring_status, smt, job_state, start_time, duration, walltime, footprint, energy, energy_footprint, resources, meta_data
) VALUES (
:job_id, :hpc_user, :project, :cluster, :subcluster, :cluster_partition, :array_job_id, :num_nodes, :num_hwthreads, :num_acc,
:exclusive, :monitoring_status, :smt, :job_state, :start_time, :duration, :walltime, :footprint, :energy, :energy_footprint, :resources, :meta_data
:job_id, :hpc_user, :project, :hpc_cluster, :subcluster, :cluster_partition, :array_job_id, :num_nodes, :num_hwthreads, :num_acc,
:shared, :monitoring_status, :smt, :job_state, :start_time, :duration, :walltime, :footprint, :energy, :energy_footprint, :resources, :meta_data
);`
const NamedJobInsert string = `INSERT INTO job (
job_id, hpc_user, project, cluster, subcluster, cluster_partition, array_job_id, num_nodes, num_hwthreads, num_acc,
exclusive, monitoring_status, smt, job_state, start_time, duration, walltime, footprint, energy, energy_footprint, resources, meta_data
job_id, hpc_user, project, hpc_cluster, subcluster, cluster_partition, array_job_id, num_nodes, num_hwthreads, num_acc,
shared, monitoring_status, smt, job_state, start_time, duration, walltime, footprint, energy, energy_footprint, resources, meta_data
) VALUES (
:job_id, :hpc_user, :project, :cluster, :subcluster, :cluster_partition, :array_job_id, :num_nodes, :num_hwthreads, :num_acc,
:exclusive, :monitoring_status, :smt, :job_state, :start_time, :duration, :walltime, :footprint, :energy, :energy_footprint, :resources, :meta_data
:job_id, :hpc_user, :project, :hpc_cluster, :subcluster, :cluster_partition, :array_job_id, :num_nodes, :num_hwthreads, :num_acc,
:shared, :monitoring_status, :smt, :job_state, :start_time, :duration, :walltime, :footprint, :energy, :energy_footprint, :resources, :meta_data
);`
func (r *JobRepository) InsertJob(job *schema.Job) (int64, error) {
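The renamed placeholders (:hpc_cluster, :shared) bind by name, so they must agree with both the column list and the struct tags or map keys of the bound value. A minimal sketch of that binding style over a hypothetical three-column table; sqlx-style NamedExec is assumed here, while the repository itself goes through r.TransactionAddNamed:

type jobRow struct {
	JobID      int64  `db:"job_id"`
	HpcCluster string `db:"hpc_cluster"`
	Shared     string `db:"shared"`
}
_, err := db.NamedExec( // db is an assumed *sqlx.DB
	`INSERT INTO job (job_id, hpc_cluster, shared) VALUES (:job_id, :hpc_cluster, :shared)`,
	jobRow{JobID: 42, HpcCluster: "fritz", Shared: "none"},
)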


@@ -3,7 +3,7 @@ CREATE TABLE "job_cache" (
job_id BIGINT NOT NULL,
hpc_cluster VARCHAR(255) NOT NULL,
subcluster VARCHAR(255) NOT NULL,
submit_time BIGINT NOT NULL, -- Unix timestamp
submit_time BIGINT NOT NULL DEFAULT 0, -- Unix timestamp
start_time BIGINT NOT NULL DEFAULT 0, -- Unix timestamp
hpc_user VARCHAR(255) NOT NULL,
project VARCHAR(255) NOT NULL,
@@ -30,7 +30,7 @@ CREATE TABLE "job_cache" (
energy REAL NOT NULL DEFAULT 0.0,
energy_footprint TEXT DEFAULT NULL,
footprint TEXT DEFAULT NULL,
UNIQUE (job_id, cluster, start_time)
UNIQUE (job_id, hpc_cluster, start_time)
);
CREATE TABLE "job_new" (
@@ -65,10 +65,21 @@ CREATE TABLE "job_new" (
energy REAL NOT NULL DEFAULT 0.0,
energy_footprint TEXT DEFAULT NULL,
footprint TEXT DEFAULT NULL,
UNIQUE (job_id, cluster, start_time)
UNIQUE (job_id, hpc_cluster, start_time)
);
ALTER TABLE job RENAME COLUMN cluster TO hpc_cluster;
INSERT INTO job_new SELECT * FROM job;
INSERT INTO job_new (
id, job_id, hpc_cluster, subcluster, submit_time, start_time, hpc_user, project,
cluster_partition, array_job_id, duration, walltime, job_state, meta_data, resources,
num_nodes, num_hwthreads, num_acc, smt, shared, monitoring_status, energy,
energy_footprint, footprint
)
SELECT
id, job_id, hpc_cluster, subcluster, 0, start_time, hpc_user, project,
cluster_partition, array_job_id, duration, walltime, job_state, meta_data, resources,
num_nodes, num_hwthreads, num_acc, smt, exclusive, monitoring_status, energy,
energy_footprint, footprint
FROM job;
DROP TABLE job;
ALTER TABLE job_new RENAME TO job;
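This is the migration fix the commit is named for: the previous INSERT INTO job_new SELECT * FROM job copies columns positionally, which breaks once job_new gains submit_time and renames exclusive to shared; the explicit column lists pin the mapping and default submit_time to 0. A self-contained sketch of the failure mode, using hypothetical miniature tables and an in-memory SQLite database:

package main

import (
	"database/sql"
	"fmt"

	_ "github.com/mattn/go-sqlite3"
)

func main() {
	db, _ := sql.Open("sqlite3", ":memory:")
	defer db.Close()
	db.Exec(`CREATE TABLE job (id INTEGER, exclusive INTEGER)`)
	db.Exec(`INSERT INTO job VALUES (1, 1)`)
	db.Exec(`CREATE TABLE job_new (id INTEGER, submit_time BIGINT NOT NULL DEFAULT 0, shared TEXT)`)

	// Positional copy: two values for three columns, so this fails.
	_, err := db.Exec(`INSERT INTO job_new SELECT * FROM job`)
	fmt.Println(err) // e.g. "table job_new has 3 columns but 2 values were supplied"

	// Explicit column list: submit_time defaulted, exclusive mapped onto shared.
	_, err = db.Exec(`INSERT INTO job_new (id, submit_time, shared) SELECT id, 0, exclusive FROM job`)
	fmt.Println(err) // <nil>
}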


@@ -7,6 +7,7 @@ package taskManager
import (
"bytes"
"encoding/json"
"fmt"
"time"
"github.com/ClusterCockpit/cc-backend/internal/auth"
@@ -65,10 +66,14 @@ func Start(cronCfg, archiveConfig json.RawMessage) {
RegisterStopJobsExceedTime()
}
fmt.Printf("Keys : %#v\n", Keys)
fmt.Printf("cronCfg : %#v\n", cronCfg)
fmt.Printf("archiveConfig : %#v\n", archiveConfig)
dec := json.NewDecoder(bytes.NewReader(cronCfg))
dec.DisallowUnknownFields()
if err := dec.Decode(&Keys); err != nil {
cclog.Errorf("error while decoding ldap config: %v", err)
cclog.Errorf("error while decoding cron config: %v", err)
}
var cfg struct {

var/._job-archive (new executable binary file, not shown)