diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index a8a7429..6974301 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -7,7 +7,7 @@ jobs: - name: Install Go uses: actions/setup-go@v4 with: - go-version: 1.24.x + go-version: 1.25.x - name: Checkout code uses: actions/checkout@v3 - name: Build, Vet & Test diff --git a/.gitignore b/.gitignore index 75cc004..963073d 100644 --- a/.gitignore +++ b/.gitignore @@ -9,6 +9,11 @@ /var/*.db /var/*.txt +/var/checkpoints* + +migrateTimestamps.pl +test_ccms_write_api.sh + /web/frontend/public/build /web/frontend/node_modules diff --git a/api/schema.graphqls b/api/schema.graphqls index d1c78f3..d05c658 100644 --- a/api/schema.graphqls +++ b/api/schema.graphqls @@ -38,7 +38,7 @@ type Job { numAcc: Int! energy: Float! SMT: Int! - exclusive: Int! + shared: String! partition: String! arrayJobId: Int! monitoringStatus: Int! @@ -425,7 +425,7 @@ input JobFilter { startTime: TimeRange state: [JobState!] metricStats: [MetricStatItem!] - exclusive: Int + shared: String node: StringInput } diff --git a/api/swagger.json b/api/swagger.json index 87bf3ed..c60810a 100644 --- a/api/swagger.json +++ b/api/swagger.json @@ -1394,12 +1394,6 @@ "format": "float64" } }, - "exclusive": { - "type": "integer", - "maximum": 2, - "minimum": 0, - "example": 1 - }, "footprint": { "type": "object", "additionalProperties": { @@ -1416,12 +1410,18 @@ }, "jobState": { "enum": [ - "completed", - "failed", + "boot_fail", "cancelled", - "stopped", - "timeout", - "out_of_memory" + "completed", + "deadline", + "failed", + "node_fail", + "out-of-memory", + "pending", + "preempted", + "running", + "suspended", + "timeout" ], "allOf": [ { @@ -1477,6 +1477,14 @@ "$ref": "#/definitions/schema.Resource" } }, + "shared": { + "type": "string", + "enum": [ + "none", + "single_user", + "multi_user" + ] + }, "smt": { "type": "integer", "example": 4 diff --git a/api/swagger.yaml b/api/swagger.yaml index 06caa56..6a4adbd 100644 --- a/api/swagger.yaml +++ b/api/swagger.yaml @@ -207,11 +207,6 @@ definitions: format: float64 type: number type: object - exclusive: - example: 1 - maximum: 2 - minimum: 0 - type: integer footprint: additionalProperties: format: float64 @@ -226,12 +221,18 @@ definitions: allOf: - $ref: '#/definitions/schema.JobState' enum: - - completed - - failed + - boot_fail - cancelled - - stopped + - completed + - deadline + - failed + - node_fail + - out-of-memory + - pending + - preempted + - running + - suspended - timeout - - out_of_memory example: completed metaData: additionalProperties: @@ -269,6 +270,12 @@ definitions: items: $ref: '#/definitions/schema.Resource' type: array + shared: + enum: + - none + - single_user + - multi_user + type: string smt: example: 4 type: integer diff --git a/cmd/cc-backend/main.go b/cmd/cc-backend/main.go index 56018c3..0146118 100644 --- a/cmd/cc-backend/main.go +++ b/cmd/cc-backend/main.go @@ -18,6 +18,7 @@ import ( "github.com/ClusterCockpit/cc-backend/internal/auth" "github.com/ClusterCockpit/cc-backend/internal/config" "github.com/ClusterCockpit/cc-backend/internal/importer" + "github.com/ClusterCockpit/cc-backend/internal/memorystore" "github.com/ClusterCockpit/cc-backend/internal/metricdata" "github.com/ClusterCockpit/cc-backend/internal/repository" "github.com/ClusterCockpit/cc-backend/internal/tagger" @@ -96,6 +97,12 @@ func main() { } else { cclog.Abort("Cluster configuration must be present") } + + if mscfg := ccconf.GetPackageConfig("metric-store"); mscfg != nil { + 
config.InitMetricStore(mscfg) + } else { + cclog.Abort("Metric Store configuration must be present") + } } else { cclog.Abort("Main configuration must be present") } @@ -201,7 +208,7 @@ func main() { if archiveCfg := ccconf.GetPackageConfig("archive"); archiveCfg != nil { err = archive.Init(archiveCfg, config.Keys.DisableArchive) } else { - err = archive.Init(json.RawMessage(`{\"kind\":\"file\",\"path\":\"./var/job-archive\"}`), config.Keys.DisableArchive) + err = archive.Init(json.RawMessage("{\"kind\":\"file\",\"path\":\"./var/job-archive\"}"), config.Keys.DisableArchive) } if err != nil { cclog.Abortf("Init: Failed to initialize archive.\nError: %s\n", err.Error()) @@ -241,13 +248,18 @@ func main() { cclog.Exit("No errors, server flag not set. Exiting cc-backend.") } + var wg sync.WaitGroup + + //Metric Store starts after all flags have been processes + memorystore.Init(&wg) + archiver.Start(repository.GetJobRepository()) + // // Comment out taskManager.Start(ccconf.GetPackageConfig("cron"), ccconf.GetPackageConfig("archive")) - serverInit() - var wg sync.WaitGroup + serverInit() wg.Add(1) go func() { diff --git a/cmd/cc-backend/server.go b/cmd/cc-backend/server.go index 3983268..18d7ea5 100644 --- a/cmd/cc-backend/server.go +++ b/cmd/cc-backend/server.go @@ -26,6 +26,7 @@ import ( "github.com/ClusterCockpit/cc-backend/internal/config" "github.com/ClusterCockpit/cc-backend/internal/graph" "github.com/ClusterCockpit/cc-backend/internal/graph/generated" + "github.com/ClusterCockpit/cc-backend/internal/memorystore" "github.com/ClusterCockpit/cc-backend/internal/routerConfig" "github.com/ClusterCockpit/cc-backend/web" cclog "github.com/ClusterCockpit/cc-lib/ccLogger" @@ -118,6 +119,7 @@ func serverInit() { userapi := router.PathPrefix("/userapi").Subrouter() configapi := router.PathPrefix("/config").Subrouter() frontendapi := router.PathPrefix("/frontend").Subrouter() + metricstoreapi := router.PathPrefix("/metricstore").Subrouter() if !config.Keys.DisableAuthentication { router.Handle("/login", authHandle.Login( @@ -198,6 +200,14 @@ func serverInit() { onFailureResponse) }) + metricstoreapi.Use(func(next http.Handler) http.Handler { + return authHandle.AuthMetricStoreApi( + // On success; + next, + // On failure: JSON Response + onFailureResponse) + }) + configapi.Use(func(next http.Handler) http.Handler { return authHandle.AuthConfigApi( // On success; @@ -231,6 +241,7 @@ func serverInit() { routerConfig.SetupRoutes(secured, buildInfo) apiHandle.MountApiRoutes(securedapi) apiHandle.MountUserApiRoutes(userapi) + apiHandle.MountMetricStoreApiRoutes(metricstoreapi) apiHandle.MountConfigApiRoutes(configapi) apiHandle.MountFrontendApiRoutes(frontendapi) @@ -325,6 +336,9 @@ func serverShutdown() { // First shut down the server gracefully (waiting for all ongoing requests) server.Shutdown(context.Background()) + //Archive all the metric store data + memorystore.Shutdown() + // Then, wait for any async archivings still pending... 
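	// (Editor's note, not part of the patch) Shutdown order in this hunk: the HTTP
	// server is drained first, memorystore.Shutdown() then writes out the in-memory
	// metric-store data, and finally the pending job archivings are awaited below.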
archiver.WaitForArchiving() } diff --git a/configs/config-demo.json b/configs/config-demo.json index d388d78..d47f926 100644 --- a/configs/config-demo.json +++ b/configs/config-demo.json @@ -4,11 +4,27 @@ "short-running-jobs-duration": 300, "resampling": { "trigger": 30, - "resolutions": [600, 300, 120, 60] + "resolutions": [ + 600, + 300, + 120, + 60 + ] }, - "apiAllowedIPs": ["*"], + "apiAllowedIPs": [ + "*" + ], "emission-constant": 317 }, + "cron": { + "commit-job-worker": "2m", + "duration-worker": "5m", + "footprint-worker": "10m" + }, + "archive": { + "kind": "file", + "path": "./var/job-archive" + }, "auth": { "jwts": { "max-age": "2000h" @@ -18,9 +34,7 @@ { "name": "fritz", "metricDataRepository": { - "kind": "cc-metric-store", - "url": "http://localhost:8082", - "token": "" + "kind": "cc-metric-store" }, "filterRanges": { "numNodes": { @@ -40,9 +54,7 @@ { "name": "alex", "metricDataRepository": { - "kind": "cc-metric-store", - "url": "http://localhost:8082", - "token": "" + "kind": "cc-metric-store" }, "filterRanges": { "numNodes": { @@ -59,5 +71,18 @@ } } } - ] -} + ], + "metric-store": { + "checkpoints": { + "file-format": "avro", + "interval": "2h", + "directory": "./var/checkpoints", + "restore": "48h" + }, + "archive": { + "interval": "2h", + "directory": "./var/archive" + }, + "retention-in-memory": "48h" + } +} \ No newline at end of file diff --git a/configs/config.json b/configs/config.json index 27c4ce2..505e446 100644 --- a/configs/config.json +++ b/configs/config.json @@ -6,13 +6,29 @@ "user": "clustercockpit", "group": "clustercockpit", "validate": false, - "apiAllowedIPs": ["*"], + "apiAllowedIPs": [ + "*" + ], "short-running-jobs-duration": 300, "resampling": { "trigger": 30, - "resolutions": [600, 300, 120, 60] + "resolutions": [ + 600, + 300, + 120, + 60 + ] } }, + "cron": { + "commit-job-worker": "2m", + "duration-worker": "5m", + "footprint-worker": "10m" + }, + "archive": { + "kind": "file", + "path": "./var/job-archive" + }, "clusters": [ { "name": "test", @@ -37,4 +53,4 @@ } } ] -} +} \ No newline at end of file diff --git a/go.mod b/go.mod index 554ea56..0725a30 100644 --- a/go.mod +++ b/go.mod @@ -6,10 +6,10 @@ toolchain go1.24.1 require ( github.com/99designs/gqlgen v0.17.78 - github.com/ClusterCockpit/cc-lib v0.7.0 + github.com/ClusterCockpit/cc-lib v0.8.0 github.com/Masterminds/squirrel v1.5.4 github.com/coreos/go-oidc/v3 v3.12.0 - github.com/expr-lang/expr v1.17.5 + github.com/expr-lang/expr v1.17.6 github.com/go-co-op/gocron/v2 v2.16.0 github.com/go-ldap/ldap/v3 v3.4.10 github.com/go-sql-driver/mysql v1.9.0 @@ -19,19 +19,22 @@ require ( github.com/gorilla/handlers v1.5.2 github.com/gorilla/mux v1.8.1 github.com/gorilla/sessions v1.4.0 + github.com/influxdata/line-protocol/v2 v2.2.1 github.com/jmoiron/sqlx v1.4.0 github.com/joho/godotenv v1.5.1 + github.com/linkedin/goavro/v2 v2.14.0 github.com/mattn/go-sqlite3 v1.14.24 - github.com/prometheus/client_golang v1.23.0 - github.com/prometheus/common v0.65.0 + github.com/nats-io/nats.go v1.45.0 + github.com/prometheus/client_golang v1.23.2 + github.com/prometheus/common v0.66.1 github.com/qustavo/sqlhooks/v2 v2.1.0 github.com/santhosh-tekuri/jsonschema/v5 v5.3.1 github.com/swaggo/http-swagger v1.3.4 github.com/swaggo/swag v1.16.6 github.com/vektah/gqlparser/v2 v2.5.30 - golang.org/x/crypto v0.40.0 + golang.org/x/crypto v0.41.0 golang.org/x/oauth2 v0.30.0 - golang.org/x/time v0.5.0 + golang.org/x/time v0.12.0 ) require ( @@ -51,6 +54,7 @@ require ( github.com/go-openapi/spec v0.21.0 // indirect 
github.com/go-openapi/swag v0.23.1 // indirect github.com/go-viper/mapstructure/v2 v2.4.0 // indirect + github.com/golang/snappy v0.0.4 // indirect github.com/google/uuid v1.6.0 // indirect github.com/gorilla/securecookie v1.1.2 // indirect github.com/gorilla/websocket v1.5.3 // indirect @@ -61,6 +65,7 @@ require ( github.com/josharian/intern v1.0.0 // indirect github.com/jpillora/backoff v1.0.0 // indirect github.com/json-iterator/go v1.1.12 // indirect + github.com/klauspost/compress v1.18.0 // indirect github.com/lann/builder v0.0.0-20180802200727-47ae307949d0 // indirect github.com/lann/ps v0.0.0-20150810152359-62de8c46ede0 // indirect github.com/mailru/easyjson v0.9.0 // indirect @@ -68,6 +73,8 @@ require ( github.com/modern-go/reflect2 v1.0.2 // indirect github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect github.com/mwitkow/go-conntrack v0.0.0-20190716064945-2f068394615f // indirect + github.com/nats-io/nkeys v0.4.11 // indirect + github.com/nats-io/nuid v1.0.1 // indirect github.com/prometheus/client_model v0.6.2 // indirect github.com/prometheus/procfs v0.16.1 // indirect github.com/robfig/cron/v3 v3.0.1 // indirect @@ -80,13 +87,12 @@ require ( go.yaml.in/yaml/v2 v2.4.2 // indirect golang.org/x/exp v0.0.0-20250620022241-b7579e27df2b // indirect golang.org/x/mod v0.26.0 // indirect - golang.org/x/net v0.42.0 // indirect + golang.org/x/net v0.43.0 // indirect golang.org/x/sync v0.16.0 // indirect - golang.org/x/sys v0.34.0 // indirect - golang.org/x/text v0.27.0 // indirect + golang.org/x/sys v0.35.0 // indirect + golang.org/x/text v0.28.0 // indirect golang.org/x/tools v0.35.0 // indirect - google.golang.org/protobuf v1.36.6 // indirect - gopkg.in/yaml.v2 v2.4.0 // indirect + google.golang.org/protobuf v1.36.8 // indirect gopkg.in/yaml.v3 v3.0.1 // indirect sigs.k8s.io/yaml v1.6.0 // indirect ) diff --git a/go.sum b/go.sum index 6f61908..81ae22b 100644 --- a/go.sum +++ b/go.sum @@ -6,16 +6,16 @@ github.com/Azure/go-ansiterm v0.0.0-20230124172434-306776ec8161 h1:L/gRVlceqvL25 github.com/Azure/go-ansiterm v0.0.0-20230124172434-306776ec8161/go.mod h1:xomTg63KZ2rFqZQzSB4Vz2SUXa1BpHTVz9L5PTmPC4E= github.com/Azure/go-ntlmssp v0.0.0-20221128193559-754e69321358 h1:mFRzDkZVAjdal+s7s0MwaRv9igoPqLRdzOLzw/8Xvq8= github.com/Azure/go-ntlmssp v0.0.0-20221128193559-754e69321358/go.mod h1:chxPXzSsl7ZWRAuOIE23GDNzjWuZquvFlgA8xmpunjU= -github.com/ClusterCockpit/cc-lib v0.7.0 h1:THuSYrMcn9pSbrMditSI1LMOluq9TnM0/aVId4uK1Hc= -github.com/ClusterCockpit/cc-lib v0.7.0/go.mod h1:TD1PS8pL2RDvEWaqs8VNejoTSm5OawI9Dcc0CTY/yWQ= +github.com/ClusterCockpit/cc-lib v0.8.0 h1:kQRMOx30CJCy+Q6TgCK9rarJnJ/CKZPWlIEdIXYlxoA= +github.com/ClusterCockpit/cc-lib v0.8.0/go.mod h1:5xTwONu9pSp15mJ9CjBKGU9I3Jad8NfhrVHJZl50/yI= github.com/KyleBanks/depth v1.2.1 h1:5h8fQADFrWtarTdtDudMmGsC7GPbOAu6RVB3ffsVFHc= github.com/KyleBanks/depth v1.2.1/go.mod h1:jzSb9d0L43HxTQfT+oSA1EEp2q+ne2uh6XgeJcm8brE= github.com/Masterminds/squirrel v1.5.4 h1:uUcX/aBc8O7Fg9kaISIUsHXdKuqehiXAMQTYX8afzqM= github.com/Masterminds/squirrel v1.5.4/go.mod h1:NNaOrjSoIDfDA40n7sr2tPNZRfjzjA400rg+riTZj10= github.com/Microsoft/go-winio v0.6.2 h1:F2VQgta7ecxGYO8k3ZZz3RS8fVIXVxONVUPlNERoyfY= github.com/Microsoft/go-winio v0.6.2/go.mod h1:yd8OoFMLzJbo9gZq8j5qaps8bJ9aShtEA8Ipt1oGCvU= -github.com/NVIDIA/go-nvml v0.12.9-0 h1:e344UK8ZkeMeeLkdQtRhmXRxNf+u532LDZPGMtkdus0= -github.com/NVIDIA/go-nvml v0.12.9-0/go.mod h1:+KNA7c7gIBH7SKSJ1ntlwkfN80zdx8ovl4hrK3LmPt4= +github.com/NVIDIA/go-nvml v0.13.0-1 h1:OLX8Jq3dONuPOQPC7rndB6+iDmDakw0XTYgzMxObkEw= 
+github.com/NVIDIA/go-nvml v0.13.0-1/go.mod h1:+KNA7c7gIBH7SKSJ1ntlwkfN80zdx8ovl4hrK3LmPt4= github.com/PuerkitoBio/goquery v1.10.3 h1:pFYcNSqHxBD06Fpj/KsbStFRsgRATgnf3LeXiUkhzPo= github.com/PuerkitoBio/goquery v1.10.3/go.mod h1:tMUX0zDMHXYlAQk6p35XxQMqMweEKB7iK7iLNd4RH4Y= github.com/agnivade/levenshtein v1.2.1 h1:EHBY3UOn1gwdy/VbFwgo4cxecRznFk7fKWN1KOX7eoM= @@ -38,6 +38,7 @@ github.com/coreos/go-oidc/v3 v3.12.0 h1:sJk+8G2qq94rDI6ehZ71Bol3oUHy63qNYmkiSjrc github.com/coreos/go-oidc/v3 v3.12.0/go.mod h1:gE3LgjOgFoHi9a4ce4/tJczr0Ai2/BoDhf0r5lltWI0= github.com/cpuguy83/go-md2man/v2 v2.0.7 h1:zbFlGlXEAKlwXpmvle3d8Oe3YnkKIK4xSRTd3sHPnBo= github.com/cpuguy83/go-md2man/v2 v2.0.7/go.mod h1:oOW0eioCTA6cOiMLiUPZOpcVxMig6NIQQ7OS05n1F4g= +github.com/creack/pty v1.1.9/go.mod h1:oKZEueFk5CKHvIhNR5MUki03XCEU+Q6VDXinZuGJ33E= github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= @@ -53,10 +54,14 @@ github.com/docker/go-connections v0.5.0 h1:USnMq7hx7gwdVZq1L49hLXaFtUdTADjXGp+uj github.com/docker/go-connections v0.5.0/go.mod h1:ov60Kzw0kKElRwhNs9UlUHAE/F9Fe6GLaXnqyDdmEXc= github.com/docker/go-units v0.5.0 h1:69rxXcBk27SvSaaxTtLh/8llcHD8vYHT7WSdRZ/jvr4= github.com/docker/go-units v0.5.0/go.mod h1:fgPhTUdO+D/Jk86RDLlptpiXQzgHJF7gydDDbaIK4Dk= -github.com/expr-lang/expr v1.17.5 h1:i1WrMvcdLF249nSNlpQZN1S6NXuW9WaOfF5tPi3aw3k= -github.com/expr-lang/expr v1.17.5/go.mod h1:8/vRC7+7HBzESEqt5kKpYXxrxkr31SaO8r40VO/1IT4= +github.com/expr-lang/expr v1.17.6 h1:1h6i8ONk9cexhDmowO/A64VPxHScu7qfSl2k8OlINec= +github.com/expr-lang/expr v1.17.6/go.mod h1:8/vRC7+7HBzESEqt5kKpYXxrxkr31SaO8r40VO/1IT4= github.com/felixge/httpsnoop v1.0.4 h1:NFTV2Zj1bL4mc9sqWACXbQFVBBg2W3GPvqp8/ESS2Wg= github.com/felixge/httpsnoop v1.0.4/go.mod h1:m8KPJKqk1gH5J9DgRY2ASl2lWCfGKXixSwevea8zH2U= +github.com/frankban/quicktest v1.11.0/go.mod h1:K+q6oSqb0W0Ininfk863uOk1lMy69l/P6txr3mVT54s= +github.com/frankban/quicktest v1.11.2/go.mod h1:K+q6oSqb0W0Ininfk863uOk1lMy69l/P6txr3mVT54s= +github.com/frankban/quicktest v1.13.0 h1:yNZif1OkDfNoDfb9zZa9aXIpejNR4F23Wely0c+Qdqk= +github.com/frankban/quicktest v1.13.0/go.mod h1:qLE0fzW0VuyUAJgPU19zByoIr0HtCHN/r/VLSOOIySU= github.com/fsnotify/fsnotify v1.9.0 h1:2Ml+OJNzbYCTzsxtv8vKSFD9PbJjmhYF14k/jKC7S9k= github.com/fsnotify/fsnotify v1.9.0/go.mod h1:8jBTzvmWwFyi3Pb8djgCCO5IBqzKJ/Jwo8TRcHyHii0= github.com/go-asn1-ber/asn1-ber v1.5.7 h1:DTX+lbVTWaTw1hQ+PbZPlnDZPEIs0SS/GCZAl535dDk= @@ -91,6 +96,11 @@ github.com/golang-jwt/jwt/v5 v5.2.2 h1:Rl4B7itRWVtYIHFrSNd7vhTiz9UpLdi6gZhZ3wEeD github.com/golang-jwt/jwt/v5 v5.2.2/go.mod h1:pqrtFR0X4osieyHYxtmOUWsAWrfe1Q5UVIyoH402zdk= github.com/golang-migrate/migrate/v4 v4.18.2 h1:2VSCMz7x7mjyTXx3m2zPokOY82LTRgxK1yQYKo6wWQ8= github.com/golang-migrate/migrate/v4 v4.18.2/go.mod h1:2CM6tJvn2kqPXwnXO/d3rAQYiyoIm180VsO8PRX6Rpk= +github.com/golang/snappy v0.0.1/go.mod h1:/XxbfmMg8lxefKM7IXC3fBNl/7bRcc72aCRzEWrmP2Q= +github.com/golang/snappy v0.0.4 h1:yAGX7huGHXlcLOEtBnF4w7FQwA26wojNCwOYAEhLjQM= +github.com/golang/snappy v0.0.4/go.mod h1:/XxbfmMg8lxefKM7IXC3fBNl/7bRcc72aCRzEWrmP2Q= +github.com/google/go-cmp v0.5.2/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= +github.com/google/go-cmp v0.5.5/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= github.com/google/go-cmp v0.6.0/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY= github.com/google/go-cmp v0.7.0 
h1:wk8382ETsv4JYUZwIsn6YpYiWiBsYLSJiTsyBybVuN8= github.com/google/go-cmp v0.7.0/go.mod h1:pXiqmnSA92OHEEa9HXL2W4E7lf9JzCmGVUdgjX3N/iU= @@ -127,6 +137,11 @@ github.com/influxdata/influxdb-client-go/v2 v2.14.0 h1:AjbBfJuq+QoaXNcrova8smSjw github.com/influxdata/influxdb-client-go/v2 v2.14.0/go.mod h1:Ahpm3QXKMJslpXl3IftVLVezreAUtBOTZssDrjZEFHI= github.com/influxdata/line-protocol v0.0.0-20210922203350-b1ad95c89adf h1:7JTmneyiNEwVBOHSjoMxiWAqB992atOeepeFYegn5RU= github.com/influxdata/line-protocol v0.0.0-20210922203350-b1ad95c89adf/go.mod h1:xaLFMmpvUxqXtVkUJfg9QmT88cDaCJ3ZKgdZ78oO8Qo= +github.com/influxdata/line-protocol-corpus v0.0.0-20210519164801-ca6fa5da0184/go.mod h1:03nmhxzZ7Xk2pdG+lmMd7mHDfeVOYFyhOgwO61qWU98= +github.com/influxdata/line-protocol-corpus v0.0.0-20210922080147-aa28ccfb8937 h1:MHJNQ+p99hFATQm6ORoLmpUCF7ovjwEFshs/NHzAbig= +github.com/influxdata/line-protocol-corpus v0.0.0-20210922080147-aa28ccfb8937/go.mod h1:BKR9c0uHSmRgM/se9JhFHtTT7JTO67X23MtKMHtZcpo= +github.com/influxdata/line-protocol/v2 v2.0.0-20210312151457-c52fdecb625a/go.mod h1:6+9Xt5Sq1rWx+glMgxhcg2c0DUaehK+5TDcPZ76GypY= +github.com/influxdata/line-protocol/v2 v2.1.0/go.mod h1:QKw43hdUBg3GTk2iC3iyCxksNj7PX9aUSeYOYE/ceHY= github.com/influxdata/line-protocol/v2 v2.2.1 h1:EAPkqJ9Km4uAxtMRgUubJyqAr6zgWM0dznKMLRauQRE= github.com/influxdata/line-protocol/v2 v2.2.1/go.mod h1:DmB3Cnh+3oxmG6LOBIxce4oaL4CPj3OmMPgvauXh+tM= github.com/jcmturner/aescts/v2 v2.0.0 h1:9YKLH6ey7H4eDBXW8khjYslgyqG2xZikXP0EQFKrle8= @@ -155,8 +170,11 @@ github.com/json-iterator/go v1.1.12 h1:PV8peI4a0ysnczrg+LtxykD8LfKY9ML6u2jnxaEnr github.com/json-iterator/go v1.1.12/go.mod h1:e30LSqwooZae/UwlEbR2852Gd8hjQvJoHmT4TnhNGBo= github.com/klauspost/compress v1.18.0 h1:c/Cqfb0r+Yi+JtIEq73FWXVkRonBlf0CRNYc8Zttxdo= github.com/klauspost/compress v1.18.0/go.mod h1:2Pp+KzxcywXVXMr50+X0Q/Lsb43OQHYWRCY2AiWywWQ= +github.com/kr/pretty v0.2.1/go.mod h1:ipq/a2n7PKx3OHsz4KJII5eveXtPO4qwEXGdVfWzfnI= github.com/kr/pretty v0.3.1 h1:flRD4NNwYAUpkphVc1HcthR4KEIFJ65n8Mw5qdRn3LE= github.com/kr/pretty v0.3.1/go.mod h1:hoEshYVHaxMs3cyo3Yncou5ZscifuDolrwPKZanG3xk= +github.com/kr/pty v1.1.1/go.mod h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ= +github.com/kr/text v0.1.0/go.mod h1:4Jbv+DJW3UT/LiOwJeYQe1efqtUx/iVham/4vfdArNI= github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY= github.com/kr/text v0.2.0/go.mod h1:eLer722TekiGuMkidMxC/pM04lWEeraHUUmBw8l2grE= github.com/lann/builder v0.0.0-20180802200727-47ae307949d0 h1:SOEGU9fKiNWd/HOJuq6+3iTQz8KNCLtVX6idSoTLdUw= @@ -166,6 +184,8 @@ github.com/lann/ps v0.0.0-20150810152359-62de8c46ede0/go.mod h1:vmVJ0l/dxyfGW6Fm github.com/lib/pq v1.2.0/go.mod h1:5WUZQaWbwv1U+lTReE5YruASi9Al49XbQIvNi/34Woo= github.com/lib/pq v1.10.9 h1:YXG7RB+JIjhP29X+OtkiDnYaXQwpS4JEWq7dtCCRUEw= github.com/lib/pq v1.10.9/go.mod h1:AlVN5x4E4T544tWzH6hKfbfQvm3HdbOxrmggDNAPY9o= +github.com/linkedin/goavro/v2 v2.14.0 h1:aNO/js65U+Mwq4yB5f1h01c3wiM458qtRad1DN0CMUI= +github.com/linkedin/goavro/v2 v2.14.0/go.mod h1:KXx+erlq+RPlGSPmLF7xGo6SAbh8sCQ53x064+ioxhk= github.com/mailru/easyjson v0.9.0 h1:PrnmzHw7262yW8sTBwxi1PdJA3Iw/EKBa8psRf7d9a4= github.com/mailru/easyjson v0.9.0/go.mod h1:1+xMtQp2MRNVL/V1bOzuP3aP8VNwRW55fQUto+XFtTU= github.com/mattn/go-sqlite3 v1.10.0/go.mod h1:FPy6KqzDD04eiIsT53CuJW3U88zkxoIYsOqkbpncsNc= @@ -187,12 +207,13 @@ github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 h1:C3w9PqII01/Oq github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822/go.mod h1:+n7T8mK8HuQTcFwEeznm/DIxMOiR9yIdICNftLE1DvQ= 
github.com/mwitkow/go-conntrack v0.0.0-20190716064945-2f068394615f h1:KUppIJq7/+SVif2QVs3tOP0zanoHgBEVAwHxUSIzRqU= github.com/mwitkow/go-conntrack v0.0.0-20190716064945-2f068394615f/go.mod h1:qRWi+5nqEBWmkhHvq77mSJWrCKwh8bxhgT7d/eI7P4U= -github.com/nats-io/nats.go v1.44.0 h1:ECKVrDLdh/kDPV1g0gAQ+2+m2KprqZK5O/eJAyAnH2M= -github.com/nats-io/nats.go v1.44.0/go.mod h1:iRWIPokVIFbVijxuMQq4y9ttaBTMe0SFdlZfMDd+33g= +github.com/nats-io/nats.go v1.45.0 h1:/wGPbnYXDM0pLKFjZTX+2JOw9TQPoIgTFrUaH97giwA= +github.com/nats-io/nats.go v1.45.0/go.mod h1:iRWIPokVIFbVijxuMQq4y9ttaBTMe0SFdlZfMDd+33g= github.com/nats-io/nkeys v0.4.11 h1:q44qGV008kYd9W1b1nEBkNzvnWxtRSQ7A8BoqRrcfa0= github.com/nats-io/nkeys v0.4.11/go.mod h1:szDimtgmfOi9n25JpfIdGw12tZFYXqhGxjhVxsatHVE= github.com/nats-io/nuid v1.0.1 h1:5iA8DT8V7q8WK2EScv2padNa/rTESc1KdnPw4TC2paw= github.com/nats-io/nuid v1.0.1/go.mod h1:19wcPz3Ph3q0Jbyiqsd0kePYG7A95tJPxeL+1OSON2c= +github.com/niemeyer/pretty v0.0.0-20200227124842-a10e7caefd8e/go.mod h1:zD1mROLANZcx1PVRCS0qkT7pwLkGfwJo4zjcN/Tysno= github.com/oapi-codegen/runtime v1.1.1 h1:EXLHh0DXIJnWhdRPN2w4MXAzFyE4CskzhNLUmtpMYro= github.com/oapi-codegen/runtime v1.1.1/go.mod h1:SK9X900oXmPWilYR5/WKPzt3Kqxn/uS/+lbpREv+eCg= github.com/opencontainers/go-digest v1.0.0 h1:apOUWs51W5PlhuyGyz9FCeeBIOUDA/6nW8Oi/yOhh5U= @@ -204,12 +225,12 @@ github.com/pkg/errors v0.9.1 h1:FEBLx1zS214owpjy7qsBeixbURkuhQAwrK5UwLGTwt4= github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0= github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= -github.com/prometheus/client_golang v1.23.0 h1:ust4zpdl9r4trLY/gSjlm07PuiBq2ynaXXlptpfy8Uc= -github.com/prometheus/client_golang v1.23.0/go.mod h1:i/o0R9ByOnHX0McrTMTyhYvKE4haaf2mW08I+jGAjEE= +github.com/prometheus/client_golang v1.23.2 h1:Je96obch5RDVy3FDMndoUsjAhG5Edi49h0RJWRi/o0o= +github.com/prometheus/client_golang v1.23.2/go.mod h1:Tb1a6LWHB3/SPIzCoaDXI4I8UHKeFTEQ1YCr+0Gyqmg= github.com/prometheus/client_model v0.6.2 h1:oBsgwpGs7iVziMvrGhE53c/GrLUsZdHnqNwqPLxwZyk= github.com/prometheus/client_model v0.6.2/go.mod h1:y3m2F6Gdpfy6Ut/GBsUqTWZqCUvMVzSfMLjcu6wAwpE= -github.com/prometheus/common v0.65.0 h1:QDwzd+G1twt//Kwj/Ww6E9FQq1iVMmODnILtW1t2VzE= -github.com/prometheus/common v0.65.0/go.mod h1:0gZns+BLRQ3V6NdaerOhMbwwRbNh9hkGINtQAsP5GS8= +github.com/prometheus/common v0.66.1 h1:h5E0h5/Y8niHc5DlaLlWLArTQI7tMrsfQjHV+d9ZoGs= +github.com/prometheus/common v0.66.1/go.mod h1:gcaUsgf3KfRSwHY4dIMXLPV0K/Wg1oZ8+SbZk/HH/dA= github.com/prometheus/procfs v0.16.1 h1:hZ15bTNuirocR6u0JZ6BAHHmwS1p8B4P6MRqxtzMyRg= github.com/prometheus/procfs v0.16.1/go.mod h1:teAbpZRB1iIAJYREa1LsoWUXykVXA1KlTmWl8x/U+Is= github.com/qustavo/sqlhooks/v2 v2.1.0 h1:54yBemHnGHp/7xgT+pxwmIlMSDNYKx5JW5dfRAiCZi0= @@ -233,10 +254,11 @@ github.com/stretchr/testify v1.2.2/go.mod h1:a8OnRcib4nhh0OaRAV+Yts87kKdq0PP7pXf github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI= github.com/stretchr/testify v1.4.0/go.mod h1:j7eGeouHqKxXV5pUuKE4zz7dFj8WfuZ+81PSLYec5m4= github.com/stretchr/testify v1.7.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= +github.com/stretchr/testify v1.7.5/go.mod h1:yNjHg4UonilssWZ8iaSj1OCr/vHnekPRkoO+kdMU+MU= github.com/stretchr/testify v1.8.0/go.mod h1:yNjHg4UonilssWZ8iaSj1OCr/vHnekPRkoO+kdMU+MU= github.com/stretchr/testify v1.8.1/go.mod h1:w2LPCIKwWwSfY2zedu0+kehJoqGctiVI29o6fzry7u4= -github.com/stretchr/testify v1.10.0 
h1:Xv5erBjTwe/5IxqUQTdXv5kgmIvbHo3QQyRwhJsOfJA= -github.com/stretchr/testify v1.10.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY= +github.com/stretchr/testify v1.11.1 h1:7s2iGBzp5EwR7/aIZr8ao5+dra3wiQyKjjFuvgVKu7U= +github.com/stretchr/testify v1.11.1/go.mod h1:wZwfW3scLgRK+23gO65QZefKpKQRnfz6sD981Nm4B6U= github.com/swaggo/files v1.0.1 h1:J1bVJ4XHZNq0I46UU90611i9/YzdrF7x92oX1ig5IdE= github.com/swaggo/files v1.0.1/go.mod h1:0qXmMNH6sXNf+73t65aKeB+ApmgxdnkQzVTAj2uaMUg= github.com/swaggo/http-swagger v1.3.4 h1:q7t/XLx0n15H1Q9/tk3Y9L4n210XzJF5WtnDX64a5ww= @@ -273,8 +295,8 @@ golang.org/x/crypto v0.13.0/go.mod h1:y6Z2r+Rw4iayiXXAIxJIDAJ1zMW4yaTpebo8fPOliY golang.org/x/crypto v0.19.0/go.mod h1:Iy9bg/ha4yyC70EfRS8jz+B6ybOBKMaSxLj6P6oBDfU= golang.org/x/crypto v0.23.0/go.mod h1:CKFgDieR+mRhux2Lsu27y0fO304Db0wZe70UKqHu0v8= golang.org/x/crypto v0.31.0/go.mod h1:kDsLvtWBEx7MV9tJOj9bnXsPbxwJQ6csT/x4KIN4Ssk= -golang.org/x/crypto v0.40.0 h1:r4x+VvoG5Fm+eJcxMaY8CQM7Lb0l1lsmjGBQ6s8BfKM= -golang.org/x/crypto v0.40.0/go.mod h1:Qr1vMER5WyS2dfPHAlsOj01wgLbsyWtFn/aY+5+ZdxY= +golang.org/x/crypto v0.41.0 h1:WKYxWedPGCTVVl5+WHSSrOBT0O8lx32+zxmHxijgXp4= +golang.org/x/crypto v0.41.0/go.mod h1:pO5AFd7FA68rFak7rOAGVuygIISepHftHnr8dr6+sUc= golang.org/x/exp v0.0.0-20250620022241-b7579e27df2b h1:M2rDM6z3Fhozi9O7NWsxAkg/yqS/lQJ6PmkyIV3YP+o= golang.org/x/exp v0.0.0-20250620022241-b7579e27df2b/go.mod h1:3//PLf8L/X+8b4vuAfHzxeRUl04Adcb341+IGKfnqS8= golang.org/x/mod v0.6.0-dev.0.20220419223038-86c51ed26bb4/go.mod h1:jJ57K6gSWd91VN4djpZkiMVwK6gcyfeH4XE8wZrZaV4= @@ -295,8 +317,8 @@ golang.org/x/net v0.15.0/go.mod h1:idbUs1IY1+zTqbi8yxTbhexhEEk5ur9LInksu6HrEpk= golang.org/x/net v0.21.0/go.mod h1:bIjVDfnllIU7BJ2DNgfnXvpSvtn8VRwhlsaeUTyUS44= golang.org/x/net v0.25.0/go.mod h1:JkAGAh7GEvH74S6FOH42FLoXpXbE/aqXSrIQjXgsiwM= golang.org/x/net v0.33.0/go.mod h1:HXLR5J+9DxmrqMwG9qjGCxZ+zKXxBru04zlTvWlWuN4= -golang.org/x/net v0.42.0 h1:jzkYrhi3YQWD6MLBJcsklgQsoAcw89EcZbJw8Z614hs= -golang.org/x/net v0.42.0/go.mod h1:FF1RA5d3u7nAYA4z2TkclSCKh68eSXtiFwcWQpPXdt8= +golang.org/x/net v0.43.0 h1:lat02VYK2j4aLzMzecihNvTlJNQUq316m2Mr9rnM6YE= +golang.org/x/net v0.43.0/go.mod h1:vhO1fvI4dGsIjh73sWfUVjj3N7CA9WkKJNQm2svM6Jg= golang.org/x/oauth2 v0.30.0 h1:dnDm7JmhM45NNpd8FDDeLhK6FwqbOf4MLCM9zb1BOHI= golang.org/x/oauth2 v0.30.0/go.mod h1:B++QgG3ZKulg6sRPGD/mqlHQs5rB3Ml9erfeDY7xKlU= golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= @@ -319,8 +341,8 @@ golang.org/x/sys v0.12.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.17.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= golang.org/x/sys v0.20.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= golang.org/x/sys v0.28.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= -golang.org/x/sys v0.34.0 h1:H5Y5sJ2L2JRdyv7ROF1he/lPdvFsd0mJHFw2ThKHxLA= -golang.org/x/sys v0.34.0/go.mod h1:BJP2sWEmIv4KK5OTEluFJCKSidICx8ciO85XgH3Ak8k= +golang.org/x/sys v0.35.0 h1:vz1N37gP5bs89s7He8XuIYXpyY0+QlsKmzipCbUtyxI= +golang.org/x/sys v0.35.0/go.mod h1:BJP2sWEmIv4KK5OTEluFJCKSidICx8ciO85XgH3Ak8k= golang.org/x/telemetry v0.0.0-20240228155512-f48c80bd79b2/go.mod h1:TeRTkGYfJXctD9OcfyVLyj2J3IxLnKwHJR8f4D8a3YE= golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo= golang.org/x/term v0.0.0-20210927222741-03fcf44c2211/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8= @@ -339,10 +361,10 @@ golang.org/x/text v0.13.0/go.mod 
h1:TvPlkZtksWOMsz7fbANvkp4WM8x/WCo/om8BMLbz+aE= golang.org/x/text v0.14.0/go.mod h1:18ZOQIKpY8NJVqYksKHtTdi31H5itFRjB5/qKTNYzSU= golang.org/x/text v0.15.0/go.mod h1:18ZOQIKpY8NJVqYksKHtTdi31H5itFRjB5/qKTNYzSU= golang.org/x/text v0.21.0/go.mod h1:4IBbMaMmOPCJ8SecivzSH54+73PCFmPWxNTLm+vZkEQ= -golang.org/x/text v0.27.0 h1:4fGWRpyh641NLlecmyl4LOe6yDdfaYNrGb2zdfo4JV4= -golang.org/x/text v0.27.0/go.mod h1:1D28KMCvyooCX9hBiosv5Tz/+YLxj0j7XhWjpSUF7CU= -golang.org/x/time v0.5.0 h1:o7cqy6amK/52YcAKIPlM3a+Fpj35zvRj2TP+e1xFSfk= -golang.org/x/time v0.5.0/go.mod h1:3BpzKBy/shNhVucY/MWOyx10tF3SFh9QdLuxbVysPQM= +golang.org/x/text v0.28.0 h1:rhazDwis8INMIwQ4tpjLDzUhx6RlXqZNPEM0huQojng= +golang.org/x/text v0.28.0/go.mod h1:U8nCwOR8jO/marOQ0QbDiOngZVEBB7MAiitBuMjXiNU= +golang.org/x/time v0.12.0 h1:ScB/8o8olJvc+CQPWrK3fPZNfh7qgwCrY0zJmoEQLSE= +golang.org/x/time v0.12.0/go.mod h1:CDIdPxbZBQxdj6cxyCIdrNogrJKMJ7pr37NYpMcMDSg= golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= golang.org/x/tools v0.1.12/go.mod h1:hNGJHUnrk76NpqgfD5Aqm5Crs+Hm0VOH/i9J2+nxYbc= @@ -352,15 +374,16 @@ golang.org/x/tools v0.21.1-0.20240508182429-e35e4ccd0d2d/go.mod h1:aiJjzUbINMkxb golang.org/x/tools v0.35.0 h1:mBffYraMEf7aa0sB+NuKnuCy8qI/9Bughn8dC2Gu5r0= golang.org/x/tools v0.35.0/go.mod h1:NKdj5HkL/73byiZSJjqJgKn3ep7KjFkBOkR/Hps3VPw= golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= -google.golang.org/protobuf v1.36.6 h1:z1NpPI8ku2WgiWnf+t9wTPsn6eP1L7ksHUlkfLvd9xY= -google.golang.org/protobuf v1.36.6/go.mod h1:jduwjTPXsFjZGTmRluh+L6NjiWu7pchiJ2/5YcXBHnY= +golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= +google.golang.org/protobuf v1.36.8 h1:xHScyCOEuuwZEc6UtSOvPbAT4zRh0xcNRYekJwfqyMc= +google.golang.org/protobuf v1.36.8/go.mod h1:fuxRtAxBytpl4zzqUh6/eyUujkJdNiuEkXntxiD/uRU= gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= +gopkg.in/check.v1 v1.0.0-20200227125254-8fa46927fb4f/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c h1:Hei/4ADfdWqJk1ZMxUNpqntNwaWcugrBjAiHlqqRiVk= gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c/go.mod h1:JHkPIbrfpd72SG/EVd6muEfDQjcINNoR0C8j2r3qZ4Q= gopkg.in/yaml.v2 v2.2.2/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= -gopkg.in/yaml.v2 v2.4.0 h1:D8xgwECY7CYvx+Y2n4sBz93Jn9JRvxdiyyo8CTfuKaY= -gopkg.in/yaml.v2 v2.4.0/go.mod h1:RDklbk79AGWmwhnvt/jBztapEOGDOx6ZbXqjP6csGnQ= gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= +gopkg.in/yaml.v3 v3.0.0-20200615113413-eeeca48fe776/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= sigs.k8s.io/yaml v1.6.0 h1:G8fkbMSAFqgEFgh4b1wmtzDnioxFCUgTZhlbj5P9QYs= diff --git a/internal/api/api_test.go b/internal/api/api_test.go index 9f47a1f..1c81fc9 100644 --- a/internal/api/api_test.go +++ b/internal/api/api_test.go @@ -241,7 +241,7 @@ func TestRestApi(t *testing.T) { "numNodes": 1, "numHwthreads": 8, "numAcc": 0, - "exclusive": 1, + "shared": "none", "monitoringStatus": 1, "smt": 1, "resources": [ @@ -396,7 +396,7 @@ func TestRestApi(t *testing.T) { "partition": 
"default", "walltime": 3600, "numNodes": 1, - "exclusive": 1, + "shared": "none", "monitoringStatus": 1, "smt": 1, "resources": [ diff --git a/internal/api/docs.go b/internal/api/docs.go index 50cab92..c10745c 100644 --- a/internal/api/docs.go +++ b/internal/api/docs.go @@ -1401,12 +1401,6 @@ const docTemplate = `{ "format": "float64" } }, - "exclusive": { - "type": "integer", - "maximum": 2, - "minimum": 0, - "example": 1 - }, "footprint": { "type": "object", "additionalProperties": { @@ -1423,12 +1417,18 @@ const docTemplate = `{ }, "jobState": { "enum": [ - "completed", - "failed", + "boot_fail", "cancelled", - "stopped", - "timeout", - "out_of_memory" + "completed", + "deadline", + "failed", + "node_fail", + "out-of-memory", + "pending", + "preempted", + "running", + "suspended", + "timeout" ], "allOf": [ { @@ -1484,6 +1484,14 @@ const docTemplate = `{ "$ref": "#/definitions/schema.Resource" } }, + "shared": { + "type": "string", + "enum": [ + "none", + "single_user", + "multi_user" + ] + }, "smt": { "type": "integer", "example": 4 diff --git a/internal/api/rest.go b/internal/api/rest.go index e4411a4..fcadc90 100644 --- a/internal/api/rest.go +++ b/internal/api/rest.go @@ -15,6 +15,7 @@ import ( "github.com/ClusterCockpit/cc-backend/internal/auth" "github.com/ClusterCockpit/cc-backend/internal/config" + "github.com/ClusterCockpit/cc-backend/internal/memorystore" "github.com/ClusterCockpit/cc-backend/internal/repository" cclog "github.com/ClusterCockpit/cc-lib/ccLogger" "github.com/ClusterCockpit/cc-lib/schema" @@ -95,6 +96,19 @@ func (api *RestApi) MountUserApiRoutes(r *mux.Router) { r.HandleFunc("/jobs/metrics/{id}", api.getJobMetrics).Methods(http.MethodGet) } +func (api *RestApi) MountMetricStoreApiRoutes(r *mux.Router) { + // REST API Uses TokenAuth + r.HandleFunc("/api/free", memorystore.HandleFree).Methods(http.MethodPost) + r.HandleFunc("/api/write", memorystore.HandleWrite).Methods(http.MethodPost) + r.HandleFunc("/api/debug", memorystore.HandleDebug).Methods(http.MethodGet) + r.HandleFunc("/api/healthcheck", memorystore.HandleHealthCheck).Methods(http.MethodGet) + // Refactor + r.HandleFunc("/api/free/", memorystore.HandleFree).Methods(http.MethodPost) + r.HandleFunc("/api/write/", memorystore.HandleWrite).Methods(http.MethodPost) + r.HandleFunc("/api/debug/", memorystore.HandleDebug).Methods(http.MethodGet) + r.HandleFunc("/api/healthcheck/", memorystore.HandleHealthCheck).Methods(http.MethodGet) +} + func (api *RestApi) MountConfigApiRoutes(r *mux.Router) { r.StrictSlash(true) // Settings Frontend Uses SessionAuth diff --git a/internal/auth/auth.go b/internal/auth/auth.go index 4275e3b..78c66ae 100644 --- a/internal/auth/auth.go +++ b/internal/auth/auth.go @@ -417,6 +417,42 @@ func (auth *Authentication) AuthUserApi( }) } +func (auth *Authentication) AuthMetricStoreApi( + onsuccess http.Handler, + onfailure func(rw http.ResponseWriter, r *http.Request, authErr error), +) http.Handler { + return http.HandlerFunc(func(rw http.ResponseWriter, r *http.Request) { + user, err := auth.JwtAuth.AuthViaJWT(rw, r) + if err != nil { + cclog.Infof("auth metricstore api -> authentication failed: %s", err.Error()) + onfailure(rw, r, err) + return + } + + if user != nil { + switch { + case len(user.Roles) == 1: + if user.HasRole(schema.RoleApi) { + ctx := context.WithValue(r.Context(), repository.ContextUserKey, user) + onsuccess.ServeHTTP(rw, r.WithContext(ctx)) + return + } + case len(user.Roles) >= 2: + if user.HasRole(schema.RoleApi) && user.HasAnyRole([]schema.Role{schema.RoleUser, 
schema.RoleManager, schema.RoleAdmin}) { + ctx := context.WithValue(r.Context(), repository.ContextUserKey, user) + onsuccess.ServeHTTP(rw, r.WithContext(ctx)) + return + } + default: + cclog.Info("auth metricstore api -> authentication failed: missing role") + onfailure(rw, r, errors.New("unauthorized")) + } + } + cclog.Info("auth metricstore api -> authentication failed: no auth") + onfailure(rw, r, errors.New("unauthorized")) + }) +} + func (auth *Authentication) AuthConfigApi( onsuccess http.Handler, onfailure func(rw http.ResponseWriter, r *http.Request, authErr error), diff --git a/internal/avro/avroCheckpoint.go b/internal/avro/avroCheckpoint.go new file mode 100644 index 0000000..b7c2ea1 --- /dev/null +++ b/internal/avro/avroCheckpoint.go @@ -0,0 +1,475 @@ +// Copyright (C) NHR@FAU, University Erlangen-Nuremberg. +// All rights reserved. This file is part of cc-backend. +// Use of this source code is governed by a MIT-style +// license that can be found in the LICENSE file. +package avro + +import ( + "bufio" + "encoding/json" + "errors" + "fmt" + "log" + "os" + "path" + "sort" + "strconv" + "strings" + "sync" + "sync/atomic" + "time" + + "github.com/ClusterCockpit/cc-backend/internal/config" + "github.com/ClusterCockpit/cc-lib/schema" + "github.com/linkedin/goavro/v2" +) + +var NumWorkers int = 4 + +var ErrNoNewData error = errors.New("no data in the pool") + +func (as *AvroStore) ToCheckpoint(dir string, dumpAll bool) (int, error) { + levels := make([]*AvroLevel, 0) + selectors := make([][]string, 0) + as.root.lock.RLock() + // Cluster + for sel1, l1 := range as.root.children { + l1.lock.RLock() + // Node + for sel2, l2 := range l1.children { + l2.lock.RLock() + // Frequency + for sel3, l3 := range l2.children { + levels = append(levels, l3) + selectors = append(selectors, []string{sel1, sel2, sel3}) + } + l2.lock.RUnlock() + } + l1.lock.RUnlock() + } + as.root.lock.RUnlock() + + type workItem struct { + level *AvroLevel + dir string + selector []string + } + + n, errs := int32(0), int32(0) + + var wg sync.WaitGroup + wg.Add(NumWorkers) + work := make(chan workItem, NumWorkers*2) + for range NumWorkers { + go func() { + defer wg.Done() + + for workItem := range work { + from := getTimestamp(workItem.dir) + + if err := workItem.level.toCheckpoint(workItem.dir, from, dumpAll); err != nil { + if err == ErrNoNewData { + continue + } + + log.Printf("error while checkpointing %#v: %s", workItem.selector, err.Error()) + atomic.AddInt32(&errs, 1) + } else { + atomic.AddInt32(&n, 1) + } + } + }() + } + + for i := range len(levels) { + dir := path.Join(dir, path.Join(selectors[i]...)) + work <- workItem{ + level: levels[i], + dir: dir, + selector: selectors[i], + } + } + + close(work) + wg.Wait() + + if errs > 0 { + return int(n), fmt.Errorf("%d errors happend while creating avro checkpoints (%d successes)", errs, n) + } + return int(n), nil +} + +// getTimestamp returns the timestamp from the directory name +func getTimestamp(dir string) int64 { + // Extract the resolution and timestamp from the directory name + // The existing avro file will be in epoch timestamp format + // iterate over all the files in the directory and find the maximum timestamp + // and return it + + resolution := path.Base(dir) + dir = path.Dir(dir) + + files, err := os.ReadDir(dir) + if err != nil { + return 0 + } + var maxTs int64 = 0 + + if len(files) == 0 { + return 0 + } + + for _, file := range files { + if file.IsDir() { + continue + } + name := file.Name() + + if len(name) < 5 || !strings.HasSuffix(name, 
".avro") || !strings.HasPrefix(name, resolution+"_") { + continue + } + + ts, err := strconv.ParseInt(name[strings.Index(name, "_")+1:len(name)-5], 10, 64) + if err != nil { + fmt.Printf("error while parsing timestamp: %s\n", err.Error()) + continue + } + + if ts > maxTs { + maxTs = ts + } + } + + interval, _ := time.ParseDuration(config.MetricStoreKeys.Checkpoints.Interval) + updateTime := time.Unix(maxTs, 0).Add(interval).Add(time.Duration(CheckpointBufferMinutes-1) * time.Minute).Unix() + + if updateTime < time.Now().Unix() { + return 0 + } + + return maxTs +} + +func (l *AvroLevel) toCheckpoint(dir string, from int64, dumpAll bool) error { + l.lock.Lock() + defer l.lock.Unlock() + + // fmt.Printf("Checkpointing directory: %s\n", dir) + // filepath contains the resolution + int_res, _ := strconv.Atoi(path.Base(dir)) + + // find smallest overall timestamp in l.data map and delete it from l.data + minTs := int64(1<<63 - 1) + for ts, dat := range l.data { + if ts < minTs && len(dat) != 0 { + minTs = ts + } + } + + if from == 0 && minTs != int64(1<<63-1) { + from = minTs + } + + if from == 0 { + return ErrNoNewData + } + + var schema string + var codec *goavro.Codec + record_list := make([]map[string]any, 0) + + var f *os.File + + filePath := dir + fmt.Sprintf("_%d.avro", from) + + var err error + + fp_, err_ := os.Stat(filePath) + if errors.Is(err_, os.ErrNotExist) { + err = os.MkdirAll(path.Dir(dir), 0o755) + if err != nil { + return fmt.Errorf("failed to create directory: %v", err) + } + } else if fp_.Size() != 0 { + f, err = os.Open(filePath) + if err != nil { + return fmt.Errorf("failed to open existing avro file: %v", err) + } + + br := bufio.NewReader(f) + + reader, err := goavro.NewOCFReader(br) + if err != nil { + return fmt.Errorf("failed to create OCF reader: %v", err) + } + codec = reader.Codec() + schema = codec.Schema() + + f.Close() + } + + time_ref := time.Now().Add(time.Duration(-CheckpointBufferMinutes+1) * time.Minute).Unix() + + if dumpAll { + time_ref = time.Now().Unix() + } + + // Empty values + if len(l.data) == 0 { + // we checkpoint avro files every 60 seconds + repeat := 60 / int_res + + for range repeat { + record_list = append(record_list, make(map[string]any)) + } + } + + readFlag := true + + for ts := range l.data { + flag := false + if ts < time_ref { + data := l.data[ts] + + schema_gen, err := generateSchema(data) + if err != nil { + return err + } + + flag, schema, err = compareSchema(schema, schema_gen) + if err != nil { + return fmt.Errorf("failed to compare read and generated schema: %v", err) + } + if flag && readFlag && !errors.Is(err_, os.ErrNotExist) { + + f.Close() + + f, err = os.Open(filePath) + if err != nil { + return fmt.Errorf("failed to open Avro file: %v", err) + } + + br := bufio.NewReader(f) + + ocfReader, err := goavro.NewOCFReader(br) + if err != nil { + return fmt.Errorf("failed to create OCF reader while changing schema: %v", err) + } + + for ocfReader.Scan() { + record, err := ocfReader.Read() + if err != nil { + return fmt.Errorf("failed to read record: %v", err) + } + + record_list = append(record_list, record.(map[string]any)) + } + + f.Close() + + err = os.Remove(filePath) + if err != nil { + return fmt.Errorf("failed to delete file: %v", err) + } + + readFlag = false + } + codec, err = goavro.NewCodec(schema) + if err != nil { + return fmt.Errorf("failed to create codec after merged schema: %v", err) + } + + record_list = append(record_list, generateRecord(data)) + delete(l.data, ts) + } + } + + if len(record_list) == 0 { + return 
ErrNoNewData + } + + f, err = os.OpenFile(filePath, os.O_CREATE|os.O_APPEND|os.O_RDWR, 0o644) + if err != nil { + return fmt.Errorf("failed to append new avro file: %v", err) + } + + // fmt.Printf("Codec : %#v\n", codec) + + writer, err := goavro.NewOCFWriter(goavro.OCFConfig{ + W: f, + Codec: codec, + CompressionName: goavro.CompressionDeflateLabel, + }) + if err != nil { + return fmt.Errorf("failed to create OCF writer: %v", err) + } + + // Append the new record + if err := writer.Append(record_list); err != nil { + return fmt.Errorf("failed to append record: %v", err) + } + + f.Close() + + return nil +} + +func compareSchema(schemaRead, schemaGen string) (bool, string, error) { + var genSchema, readSchema AvroSchema + + if schemaRead == "" { + return false, schemaGen, nil + } + + // Unmarshal the schema strings into AvroSchema structs + if err := json.Unmarshal([]byte(schemaGen), &genSchema); err != nil { + return false, "", fmt.Errorf("failed to parse generated schema: %v", err) + } + if err := json.Unmarshal([]byte(schemaRead), &readSchema); err != nil { + return false, "", fmt.Errorf("failed to parse read schema: %v", err) + } + + sort.Slice(genSchema.Fields, func(i, j int) bool { + return genSchema.Fields[i].Name < genSchema.Fields[j].Name + }) + + sort.Slice(readSchema.Fields, func(i, j int) bool { + return readSchema.Fields[i].Name < readSchema.Fields[j].Name + }) + + // Check if schemas are identical + schemasEqual := true + if len(genSchema.Fields) <= len(readSchema.Fields) { + + for i := range genSchema.Fields { + if genSchema.Fields[i].Name != readSchema.Fields[i].Name { + schemasEqual = false + break + } + } + + // If schemas are identical, return the read schema + if schemasEqual { + return false, schemaRead, nil + } + } + + // Create a map to hold unique fields from both schemas + fieldMap := make(map[string]AvroField) + + // Add fields from the read schema + for _, field := range readSchema.Fields { + fieldMap[field.Name] = field + } + + // Add or update fields from the generated schema + for _, field := range genSchema.Fields { + fieldMap[field.Name] = field + } + + // Create a union schema by collecting fields from the map + var mergedFields []AvroField + for _, field := range fieldMap { + mergedFields = append(mergedFields, field) + } + + // Sort fields by name for consistency + sort.Slice(mergedFields, func(i, j int) bool { + return mergedFields[i].Name < mergedFields[j].Name + }) + + // Create the merged schema + mergedSchema := AvroSchema{ + Type: "record", + Name: genSchema.Name, + Fields: mergedFields, + } + + // Check if schemas are identical + schemasEqual = len(mergedSchema.Fields) == len(readSchema.Fields) + if schemasEqual { + for i := range mergedSchema.Fields { + if mergedSchema.Fields[i].Name != readSchema.Fields[i].Name { + schemasEqual = false + break + } + } + + if schemasEqual { + return false, schemaRead, nil + } + } + + // Marshal the merged schema back to JSON + mergedSchemaJson, err := json.Marshal(mergedSchema) + if err != nil { + return false, "", fmt.Errorf("failed to marshal merged schema: %v", err) + } + + return true, string(mergedSchemaJson), nil +} + +func generateSchema(data map[string]schema.Float) (string, error) { + // Define the Avro schema structure + schema := map[string]any{ + "type": "record", + "name": "DataRecord", + "fields": []map[string]any{}, + } + + fieldTracker := make(map[string]struct{}) + + for key := range data { + if _, exists := fieldTracker[key]; !exists { + key = correctKey(key) + + field := map[string]any{ + "name": 
key, + "type": "double", + "default": -1.0, + } + schema["fields"] = append(schema["fields"].([]map[string]any), field) + fieldTracker[key] = struct{}{} + } + } + + schemaString, err := json.Marshal(schema) + if err != nil { + return "", fmt.Errorf("failed to marshal schema: %v", err) + } + + return string(schemaString), nil +} + +func generateRecord(data map[string]schema.Float) map[string]any { + record := make(map[string]any) + + // Iterate through each map in data + for key, value := range data { + key = correctKey(key) + + // Set the value in the record + // avro only accepts basic types + record[key] = value.Double() + } + + return record +} + +func correctKey(key string) string { + // Replace any invalid characters in the key + // For example, replace spaces with underscores + key = strings.ReplaceAll(key, ":", "___") + key = strings.ReplaceAll(key, ".", "__") + + return key +} + +func ReplaceKey(key string) string { + // Replace any invalid characters in the key + // For example, replace spaces with underscores + key = strings.ReplaceAll(key, "___", ":") + key = strings.ReplaceAll(key, "__", ".") + + return key +} diff --git a/internal/avro/avroHelper.go b/internal/avro/avroHelper.go new file mode 100644 index 0000000..7710f0f --- /dev/null +++ b/internal/avro/avroHelper.go @@ -0,0 +1,84 @@ +// Copyright (C) NHR@FAU, University Erlangen-Nuremberg. +// All rights reserved. This file is part of cc-backend. +// Use of this source code is governed by a MIT-style +// license that can be found in the LICENSE file. +package avro + +import ( + "context" + "log" + "slices" + "strconv" + "sync" + + "github.com/ClusterCockpit/cc-backend/internal/config" +) + +func DataStaging(wg *sync.WaitGroup, ctx context.Context) { + // AvroPool is a pool of Avro writers. + go func() { + if config.MetricStoreKeys.Checkpoints.FileFormat == "json" { + wg.Done() // Mark this goroutine as done + return // Exit the goroutine + } + + defer wg.Done() + + var avroLevel *AvroLevel + oldSelector := make([]string, 0) + + for { + select { + case <-ctx.Done(): + return + case val := <-LineProtocolMessages: + // Fetch the frequency of the metric from the global configuration + freq, err := config.GetMetricFrequency(val.MetricName) + if err != nil { + log.Printf("Error fetching metric frequency: %s\n", err) + continue + } + + metricName := "" + + for _, selector_name := range val.Selector { + metricName += selector_name + Delimiter + } + + metricName += val.MetricName + + // Create a new selector for the Avro level + // The selector is a slice of strings that represents the path to the + // Avro level. It is created by appending the cluster, node, and metric + // name to the selector. 
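				// (Editor's illustration, hypothetical values) For a 60 s metric "cpu_load"
				// arriving from node "f0101" of cluster "fritz", the level selector built
				// below becomes []string{"fritz", "f0101", "60"}, while metricName above is
				// val.MetricName prefixed by each entry of val.Selector followed by the
				// "ZZZZZ" delimiter.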
+ var selector []string + selector = append(selector, val.Cluster, val.Node, strconv.FormatInt(freq, 10)) + + if !testEq(oldSelector, selector) { + // Get the Avro level for the metric + avroLevel = avroStore.root.findAvroLevelOrCreate(selector) + + // If the Avro level is nil, create a new one + if avroLevel == nil { + log.Printf("Error creating or finding the level with cluster : %s, node : %s, metric : %s\n", val.Cluster, val.Node, val.MetricName) + } + oldSelector = slices.Clone(selector) + } + + avroLevel.addMetric(metricName, val.Value, val.Timestamp, int(freq)) + } + } + }() +} + +func testEq(a, b []string) bool { + if len(a) != len(b) { + return false + } + for i := range a { + if a[i] != b[i] { + return false + } + } + return true +} diff --git a/internal/avro/avroStruct.go b/internal/avro/avroStruct.go new file mode 100644 index 0000000..b0ded94 --- /dev/null +++ b/internal/avro/avroStruct.go @@ -0,0 +1,167 @@ +// Copyright (C) NHR@FAU, University Erlangen-Nuremberg. +// All rights reserved. This file is part of cc-backend. +// Use of this source code is governed by a MIT-style +// license that can be found in the LICENSE file. +package avro + +import ( + "sync" + + "github.com/ClusterCockpit/cc-lib/schema" +) + +var ( + LineProtocolMessages = make(chan *AvroStruct) + Delimiter = "ZZZZZ" +) + +// CheckpointBufferMinutes should always be in minutes. +// Its controls the amount of data to hold for given amount of time. +var CheckpointBufferMinutes = 3 + +type AvroStruct struct { + MetricName string + Cluster string + Node string + Selector []string + Value schema.Float + Timestamp int64 +} + +type AvroStore struct { + root AvroLevel +} + +var avroStore AvroStore + +type AvroLevel struct { + children map[string]*AvroLevel + data map[int64]map[string]schema.Float + lock sync.RWMutex +} + +type AvroField struct { + Name string `json:"name"` + Type any `json:"type"` + Default any `json:"default,omitempty"` +} + +type AvroSchema struct { + Type string `json:"type"` + Name string `json:"name"` + Fields []AvroField `json:"fields"` +} + +func (l *AvroLevel) findAvroLevelOrCreate(selector []string) *AvroLevel { + if len(selector) == 0 { + return l + } + + // Allow concurrent reads: + l.lock.RLock() + var child *AvroLevel + var ok bool + if l.children == nil { + // Children map needs to be created... + l.lock.RUnlock() + } else { + child, ok := l.children[selector[0]] + l.lock.RUnlock() + if ok { + return child.findAvroLevelOrCreate(selector[1:]) + } + } + + // The level does not exist, take write lock for unqiue access: + l.lock.Lock() + // While this thread waited for the write lock, another thread + // could have created the child node. 
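	// (Editor's note) Classic double-checked locking: another goroutine may have
	// created the child between the RUnlock above and the Lock here, so the map is
	// re-checked under the write lock before a new level is allocated.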
+ if l.children != nil { + child, ok = l.children[selector[0]] + if ok { + l.lock.Unlock() + return child.findAvroLevelOrCreate(selector[1:]) + } + } + + child = &AvroLevel{ + data: make(map[int64]map[string]schema.Float, 0), + children: nil, + } + + if l.children != nil { + l.children[selector[0]] = child + } else { + l.children = map[string]*AvroLevel{selector[0]: child} + } + l.lock.Unlock() + return child.findAvroLevelOrCreate(selector[1:]) +} + +func (l *AvroLevel) addMetric(metricName string, value schema.Float, timestamp int64, Freq int) { + l.lock.Lock() + defer l.lock.Unlock() + + KeyCounter := int(CheckpointBufferMinutes * 60 / Freq) + + // Create keys in advance for the given amount of time + if len(l.data) != KeyCounter { + if len(l.data) == 0 { + for i := range KeyCounter { + l.data[timestamp+int64(i*Freq)] = make(map[string]schema.Float, 0) + } + } else { + // Get the last timestamp + var lastTs int64 + for ts := range l.data { + if ts > lastTs { + lastTs = ts + } + } + // Create keys for the next KeyCounter timestamps + l.data[lastTs+int64(Freq)] = make(map[string]schema.Float, 0) + } + } + + closestTs := int64(0) + minDiff := int64(Freq) + 1 // Start with diff just outside the valid range + found := false + + // Iterate over timestamps and choose the one which is within range. + // Since its epoch time, we check if the difference is less than 60 seconds. + for ts, dat := range l.data { + // Check if timestamp is within range + diff := timestamp - ts + if diff < -int64(Freq) || diff > int64(Freq) { + continue + } + + // Metric already present at this timestamp — skip + if _, ok := dat[metricName]; ok { + continue + } + + // Check if this is the closest timestamp so far + if Abs(diff) < minDiff { + minDiff = Abs(diff) + closestTs = ts + found = true + } + } + + if found { + l.data[closestTs][metricName] = value + } +} + +func GetAvroStore() *AvroStore { + return &avroStore +} + +// Abs returns the absolute value of x. +func Abs(x int64) int64 { + if x < 0 { + return -x + } + return x +} diff --git a/internal/config/config.go b/internal/config/config.go index 7332941..183608c 100644 --- a/internal/config/config.go +++ b/internal/config/config.go @@ -162,7 +162,7 @@ func Init(mainConfig json.RawMessage, clusterConfig json.RawMessage) { cclog.Abortf("Config Init: Could not decode config file '%s'.\nError: %s\n", mainConfig, err.Error()) } - if Clusters == nil || len(Clusters) < 1 { + if len(Clusters) < 1 { cclog.Abort("Config Init: At least one cluster required in config. 
Exited with error.") } } diff --git a/internal/config/memorystore.go b/internal/config/memorystore.go new file mode 100644 index 0000000..c277045 --- /dev/null +++ b/internal/config/memorystore.go @@ -0,0 +1,128 @@ +package config + +import ( + "bytes" + "encoding/json" + "fmt" + + cclog "github.com/ClusterCockpit/cc-lib/ccLogger" +) + +// -------------------- +// Metric Store config +// -------------------- +type MetricStoreConfig struct { + Checkpoints struct { + FileFormat string `json:"file-format"` + Interval string `json:"interval"` + RootDir string `json:"directory"` + Restore string `json:"restore"` + } `json:"checkpoints"` + Debug struct { + DumpToFile string `json:"dump-to-file"` + EnableGops bool `json:"gops"` + } `json:"debug"` + RetentionInMemory string `json:"retention-in-memory"` + Archive struct { + Interval string `json:"interval"` + RootDir string `json:"directory"` + DeleteInstead bool `json:"delete-instead"` + } `json:"archive"` + Nats []*NatsConfig `json:"nats"` +} + +type NatsConfig struct { + // Address of the nats server + Address string `json:"address"` + + // Username/Password, optional + Username string `json:"username"` + Password string `json:"password"` + + //Creds file path + Credsfilepath string `json:"creds-file-path"` + + Subscriptions []struct { + // Channel name + SubscribeTo string `json:"subscribe-to"` + + // Allow lines without a cluster tag, use this as default, optional + ClusterTag string `json:"cluster-tag"` + } `json:"subscriptions"` +} + +var MetricStoreKeys MetricStoreConfig + +// For aggregation over multiple values at different cpus/sockets/..., not time! +type AggregationStrategy int + +const ( + NoAggregation AggregationStrategy = iota + SumAggregation + AvgAggregation +) + +func AssignAggregationStratergy(str string) (AggregationStrategy, error) { + switch str { + case "": + return NoAggregation, nil + case "sum": + return SumAggregation, nil + case "avg": + return AvgAggregation, nil + default: + return NoAggregation, fmt.Errorf("[METRICSTORE]> unknown aggregation strategy: %s", str) + } +} + +type MetricConfig struct { + // Interval in seconds at which measurements will arive. + Frequency int64 + + // Can be 'sum', 'avg' or null. Describes how to aggregate metrics from the same timestep over the hierarchy. + Aggregation AggregationStrategy + + // Private, used internally... + Offset int +} + +var Metrics map[string]MetricConfig + +func InitMetricStore(msConfig json.RawMessage) { + // Validate(msConfigSchema, msConfig) + dec := json.NewDecoder(bytes.NewReader(msConfig)) + dec.DisallowUnknownFields() + if err := dec.Decode(&MetricStoreKeys); err != nil { + cclog.Abortf("[METRICSTORE]> Metric Store Config Init: Could not decode config file '%s'.\nError: %s\n", msConfig, err.Error()) + } +} + +func GetMetricFrequency(metricName string) (int64, error) { + if metric, ok := Metrics[metricName]; ok { + return metric.Frequency, nil + } + return 0, fmt.Errorf("[METRICSTORE]> metric %s not found", metricName) +} + +// add logic to add metrics. Redundant metrics should be updated with max frequency. +// use metric.Name to check if the metric already exists. +// if not, add it to the Metrics map. 
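// Illustrative usage (editor's sketch, not part of the patch, metric name is
// hypothetical): registering the same metric twice keeps the higher frequency.
//
//	config.AddMetric("cpu_load", config.MetricConfig{Frequency: 60})
//	config.AddMetric("cpu_load", config.MetricConfig{Frequency: 120})
//	freq, _ := config.GetMetricFrequency("cpu_load") // freq == 120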
+func AddMetric(name string, metric MetricConfig) error { + + if Metrics == nil { + Metrics = make(map[string]MetricConfig, 0) + } + + if existingMetric, ok := Metrics[name]; ok { + if existingMetric.Frequency != metric.Frequency { + if existingMetric.Frequency < metric.Frequency { + existingMetric.Frequency = metric.Frequency + Metrics[name] = existingMetric + } + } + } else { + Metrics[name] = metric + } + + return nil +} diff --git a/internal/config/schema.go b/internal/config/schema.go index 37d662a..ca0440e 100644 --- a/internal/config/schema.go +++ b/internal/config/schema.go @@ -144,7 +144,7 @@ var clustersSchema = ` "type": "string" } }, - "required": ["kind", "url"] + "required": ["kind"] }, "filterRanges": { "description": "This option controls the slider ranges for the UI controls of numNodes, duration, and startTime.", diff --git a/internal/graph/generated/generated.go b/internal/graph/generated/generated.go index 011e396..766b748 100644 --- a/internal/graph/generated/generated.go +++ b/internal/graph/generated/generated.go @@ -118,7 +118,6 @@ type ComplexityRoot struct { Duration func(childComplexity int) int Energy func(childComplexity int) int EnergyFootprint func(childComplexity int) int - Exclusive func(childComplexity int) int Footprint func(childComplexity int) int ID func(childComplexity int) int JobID func(childComplexity int) int @@ -131,6 +130,7 @@ type ComplexityRoot struct { Project func(childComplexity int) int Resources func(childComplexity int) int SMT func(childComplexity int) int + Shared func(childComplexity int) int StartTime func(childComplexity int) int State func(childComplexity int) int SubCluster func(childComplexity int) int @@ -427,8 +427,6 @@ type ClusterResolver interface { type JobResolver interface { StartTime(ctx context.Context, obj *schema.Job) (*time.Time, error) - Exclusive(ctx context.Context, obj *schema.Job) (int, error) - Tags(ctx context.Context, obj *schema.Job) ([]*schema.Tag, error) ConcurrentJobs(ctx context.Context, obj *schema.Job) (*model.JobLinkResultList, error) @@ -729,13 +727,6 @@ func (e *executableSchema) Complexity(ctx context.Context, typeName, field strin return e.complexity.Job.EnergyFootprint(childComplexity), true - case "Job.exclusive": - if e.complexity.Job.Exclusive == nil { - break - } - - return e.complexity.Job.Exclusive(childComplexity), true - case "Job.footprint": if e.complexity.Job.Footprint == nil { break @@ -820,6 +811,13 @@ func (e *executableSchema) Complexity(ctx context.Context, typeName, field strin return e.complexity.Job.SMT(childComplexity), true + case "Job.shared": + if e.complexity.Job.Shared == nil { + break + } + + return e.complexity.Job.Shared(childComplexity), true + case "Job.startTime": if e.complexity.Job.StartTime == nil { break @@ -2379,7 +2377,7 @@ type Job { numAcc: Int! energy: Float! SMT: Int! - exclusive: Int! + shared: String! partition: String! arrayJobId: Int! monitoringStatus: Int! @@ -2766,7 +2764,7 @@ input JobFilter { startTime: TimeRange state: [JobState!] metricStats: [MetricStatItem!] 
- exclusive: Int + shared: String node: StringInput } @@ -5241,8 +5239,8 @@ func (ec *executionContext) fieldContext_Job_SMT(_ context.Context, field graphq return fc, nil } -func (ec *executionContext) _Job_exclusive(ctx context.Context, field graphql.CollectedField, obj *schema.Job) (ret graphql.Marshaler) { - fc, err := ec.fieldContext_Job_exclusive(ctx, field) +func (ec *executionContext) _Job_shared(ctx context.Context, field graphql.CollectedField, obj *schema.Job) (ret graphql.Marshaler) { + fc, err := ec.fieldContext_Job_shared(ctx, field) if err != nil { return graphql.Null } @@ -5255,7 +5253,7 @@ func (ec *executionContext) _Job_exclusive(ctx context.Context, field graphql.Co }() resTmp, err := ec.ResolverMiddleware(ctx, func(rctx context.Context) (any, error) { ctx = rctx // use context from middleware stack in children - return ec.resolvers.Job().Exclusive(rctx, obj) + return obj.Shared, nil }) if err != nil { ec.Error(ctx, err) @@ -5267,19 +5265,19 @@ func (ec *executionContext) _Job_exclusive(ctx context.Context, field graphql.Co } return graphql.Null } - res := resTmp.(int) + res := resTmp.(string) fc.Result = res - return ec.marshalNInt2int(ctx, field.Selections, res) + return ec.marshalNString2string(ctx, field.Selections, res) } -func (ec *executionContext) fieldContext_Job_exclusive(_ context.Context, field graphql.CollectedField) (fc *graphql.FieldContext, err error) { +func (ec *executionContext) fieldContext_Job_shared(_ context.Context, field graphql.CollectedField) (fc *graphql.FieldContext, err error) { fc = &graphql.FieldContext{ Object: "Job", Field: field, - IsMethod: true, - IsResolver: true, + IsMethod: false, + IsResolver: false, Child: func(ctx context.Context, field graphql.CollectedField) (*graphql.FieldContext, error) { - return nil, errors.New("field of type Int does not have child fields") + return nil, errors.New("field of type String does not have child fields") }, } return fc, nil @@ -6428,8 +6426,8 @@ func (ec *executionContext) fieldContext_JobResultList_items(_ context.Context, return ec.fieldContext_Job_energy(ctx, field) case "SMT": return ec.fieldContext_Job_SMT(ctx, field) - case "exclusive": - return ec.fieldContext_Job_exclusive(ctx, field) + case "shared": + return ec.fieldContext_Job_shared(ctx, field) case "partition": return ec.fieldContext_Job_partition(ctx, field) case "arrayJobId": @@ -11158,8 +11156,8 @@ func (ec *executionContext) fieldContext_Query_job(ctx context.Context, field gr return ec.fieldContext_Job_energy(ctx, field) case "SMT": return ec.fieldContext_Job_SMT(ctx, field) - case "exclusive": - return ec.fieldContext_Job_exclusive(ctx, field) + case "shared": + return ec.fieldContext_Job_shared(ctx, field) case "partition": return ec.fieldContext_Job_partition(ctx, field) case "arrayJobId": @@ -16475,7 +16473,7 @@ func (ec *executionContext) unmarshalInputJobFilter(ctx context.Context, obj any asMap[k] = v } - fieldsInOrder := [...]string{"tags", "dbId", "jobId", "arrayJobId", "user", "project", "jobName", "cluster", "partition", "duration", "energy", "minRunningFor", "numNodes", "numAccelerators", "numHWThreads", "startTime", "state", "metricStats", "exclusive", "node"} + fieldsInOrder := [...]string{"tags", "dbId", "jobId", "arrayJobId", "user", "project", "jobName", "cluster", "partition", "duration", "energy", "minRunningFor", "numNodes", "numAccelerators", "numHWThreads", "startTime", "state", "metricStats", "shared", "node"} for _, k := range fieldsInOrder { v, ok := asMap[k] if !ok { @@ -16608,13 +16606,13 @@ func 
(ec *executionContext) unmarshalInputJobFilter(ctx context.Context, obj any return it, err } it.MetricStats = data - case "exclusive": - ctx := graphql.WithPathContext(ctx, graphql.NewPathWithField("exclusive")) - data, err := ec.unmarshalOInt2ᚖint(ctx, v) + case "shared": + ctx := graphql.WithPathContext(ctx, graphql.NewPathWithField("shared")) + data, err := ec.unmarshalOString2ᚖstring(ctx, v) if err != nil { return it, err } - it.Exclusive = data + it.Shared = data case "node": ctx := graphql.WithPathContext(ctx, graphql.NewPathWithField("node")) data, err := ec.unmarshalOStringInput2ᚖgithubᚗcomᚋClusterCockpitᚋccᚑbackendᚋinternalᚋgraphᚋmodelᚐStringInput(ctx, v) @@ -17522,42 +17520,11 @@ func (ec *executionContext) _Job(ctx context.Context, sel ast.SelectionSet, obj if out.Values[i] == graphql.Null { atomic.AddUint32(&out.Invalids, 1) } - case "exclusive": - field := field - - innerFunc := func(ctx context.Context, fs *graphql.FieldSet) (res graphql.Marshaler) { - defer func() { - if r := recover(); r != nil { - ec.Error(ctx, ec.Recover(ctx, r)) - } - }() - res = ec._Job_exclusive(ctx, field, obj) - if res == graphql.Null { - atomic.AddUint32(&fs.Invalids, 1) - } - return res + case "shared": + out.Values[i] = ec._Job_shared(ctx, field, obj) + if out.Values[i] == graphql.Null { + atomic.AddUint32(&out.Invalids, 1) } - - if field.Deferrable != nil { - dfs, ok := deferred[field.Deferrable.Label] - di := 0 - if ok { - dfs.AddField(field) - di = len(dfs.Values) - 1 - } else { - dfs = graphql.NewFieldSet([]graphql.CollectedField{field}) - deferred[field.Deferrable.Label] = dfs - } - dfs.Concurrently(di, func(ctx context.Context) graphql.Marshaler { - return innerFunc(ctx, dfs) - }) - - // don't run the out.Concurrently() call below - out.Values[i] = graphql.Null - continue - } - - out.Concurrently(i, func(ctx context.Context) graphql.Marshaler { return innerFunc(ctx, out) }) case "partition": out.Values[i] = ec._Job_partition(ctx, field, obj) if out.Values[i] == graphql.Null { diff --git a/internal/graph/model/models_gen.go b/internal/graph/model/models_gen.go index c1e8be6..9b87864 100644 --- a/internal/graph/model/models_gen.go +++ b/internal/graph/model/models_gen.go @@ -69,7 +69,7 @@ type JobFilter struct { StartTime *config.TimeRange `json:"startTime,omitempty"` State []schema.JobState `json:"state,omitempty"` MetricStats []*MetricStatItem `json:"metricStats,omitempty"` - Exclusive *int `json:"exclusive,omitempty"` + Shared *string `json:"shared,omitempty"` Node *StringInput `json:"node,omitempty"` } diff --git a/internal/graph/schema.resolvers.go b/internal/graph/schema.resolvers.go index a7a69c3..cbe3650 100644 --- a/internal/graph/schema.resolvers.go +++ b/internal/graph/schema.resolvers.go @@ -35,11 +35,6 @@ func (r *jobResolver) StartTime(ctx context.Context, obj *schema.Job) (*time.Tim return ×tamp, nil } -// Exclusive is the resolver for the exclusive field. -func (r *jobResolver) Exclusive(ctx context.Context, obj *schema.Job) (int, error) { - panic(fmt.Errorf("not implemented: Exclusive - exclusive")) -} - // Tags is the resolver for the tags field. func (r *jobResolver) Tags(ctx context.Context, obj *schema.Job) ([]*schema.Tag, error) { return r.Repo.GetTags(repository.GetUserFromContext(ctx), obj.ID) @@ -859,3 +854,15 @@ type mutationResolver struct{ *Resolver } type nodeResolver struct{ *Resolver } type queryResolver struct{ *Resolver } type subClusterResolver struct{ *Resolver } + +// !!! WARNING !!! +// The code below was going to be deleted when updating resolvers. 
It has been copied here so you have +// one last chance to move it out of harms way if you want. There are two reasons this happens: +// - When renaming or deleting a resolver the old code will be put in here. You can safely delete +// it when you're done. +// - You have helper methods in this file. Move them out to keep these resolver files clean. +/* + func (r *jobResolver) Exclusive(ctx context.Context, obj *schema.Job) (int, error) { + panic(fmt.Errorf("not implemented: Exclusive - exclusive")) +} +*/ diff --git a/internal/importer/testdata/meta-fritzError.input b/internal/importer/testdata/meta-fritzError.input index 2b8d0e8..90e46cf 100644 --- a/internal/importer/testdata/meta-fritzError.input +++ b/internal/importer/testdata/meta-fritzError.input @@ -1 +1 @@ -{"jobId":398955,"user":"k106eb10","project":"k106eb","cluster":"fritz","subCluster":"main","partition":"singlenode","arrayJobId":0,"numNodes":1,"numHwthreads":72,"numAcc":0,"exclusive":1,"monitoringStatus":1,"smt":0,"jobState":"completed","duration":260,"walltime":86340,"resources":[{"hostname":"f0720"}],"metaData":{"jobName":"ams_pipeline","jobScript":"#!/bin/bash -l\n#SBATCH --job-name=ams_pipeline\n#SBATCH --time=23:59:00\n#SBATCH --partition=singlenode\n#SBATCH --ntasks=72\n#SBATCH --hint=multithread\n#SBATCH --chdir=/home/atuin/k106eb/k106eb10/ACE/Ni-Al/DFT/VASP_PBE_500_0.125_0.1_NM/AlNi/binaries/bulk/base-hcp/occ-shaken/hcp16.occ.4.shake.0/cfg/NiAl3NiAl11\n#SBATCH --export=NONE\nunset SLURM_EXPORT_ENV\nuss=$(whoami)\nfind /dev/shm/ -user $uss -type f -mmin +30 -delete\ncd \"/home/atuin/k106eb/k106eb10/ACE/Ni-Al/DFT/VASP_PBE_500_0.125_0.1_NM/AlNi/binaries/bulk/base-hcp/occ-shaken/hcp16.occ.4.shake.0/cfg/NiAl3NiAl11\"\nams_pipeline pipeline.json \u003e \"/home/atuin/k106eb/k106eb10/ACE/Ni-Al/DFT/VASP_PBE_500_0.125_0.1_NM/AlNi/binaries/bulk/base-hcp/occ-shaken/hcp16.occ.4.shake.0/cfg/NiAl3NiAl11/ams_pipeline_job.sh.out\" 2\u003e \"/home/atuin/k106eb/k106eb10/ACE/Ni-Al/DFT/VASP_PBE_500_0.125_0.1_NM/AlNi/binaries/bulk/base-hcp/occ-shaken/hcp16.occ.4.shake.0/cfg/NiAl3NiAl11/ams_pipeline_job.sh.err\"\n","slurmInfo":"\nJobId=398955 JobName=ams_pipeline\n UserId=k106eb10(210387) GroupId=80111\n Account=k106eb QOS=normal \n Requeue=False Restarts=0 BatchFlag=True \n TimeLimit=1439\n SubmitTime=2023-02-09T14:11:22\n Partition=singlenode \n NodeList=f0720\n NumNodes=1 NumCPUs=72 NumTasks=72 CPUs/Task=1\n NTasksPerNode:Socket:Core=0:None:None\n TRES_req=cpu=72,mem=250000M,node=1,billing=72\n TRES_alloc=cpu=72,node=1,billing=72\n Command=/home/atuin/k106eb/k106eb10/ACE/Ni-Al/DFT/VASP_PBE_500_0.125_0.1_NM/AlNi/binaries/bulk/base-hcp/occ-shaken/hcp16.occ.4.shake.0/cfg/NiAl3NiAl11/ams_pipeline_job.sh\n WorkDir=/home/atuin/k106eb/k106eb10/ACE/Ni-Al/DFT/VASP_PBE_500_0.125_0.1_NM/AlNi/binaries/bulk/base-hcp/occ-shaken/hcp16.occ.4.shake.0/cfg/NiAl3NiAl11\n StdErr=\n 
StdOut=ams_pipeline.o%j\n"},"startTime":1675956725,"statistics":{"clock":{"unit":{"base":"Hz","prefix":"M"},"avg":2335.254,"min":800.418,"max":2734.922},"cpu_load":{"unit":{"base":""},"avg":52.72,"min":34.46,"max":71.91},"cpu_power":{"unit":{"base":"W"},"avg":407.767,"min":93.932,"max":497.636},"cpu_user":{"unit":{"base":""},"avg":63.678,"min":19.872,"max":96.633},"flops_any":{"unit":{"base":"F/s","prefix":"G"},"avg":635.672,"min":0,"max":1332.874},"flops_dp":{"unit":{"base":"F/s","prefix":"G"},"avg":261.006,"min":0,"max":382.294},"flops_sp":{"unit":{"base":"F/s","prefix":"G"},"avg":113.659,"min":0,"max":568.286},"ib_recv":{"unit":{"base":"B/s"},"avg":27981.111,"min":69.4,"max":48084.589},"ib_recv_pkts":{"unit":{"base":"packets/s"},"avg":398.939,"min":0.5,"max":693.817},"ib_xmit":{"unit":{"base":"B/s"},"avg":188.513,"min":39.597,"max":724.568},"ib_xmit_pkts":{"unit":{"base":"packets/s"},"avg":0.867,"min":0.2,"max":2.933},"ipc":{"unit":{"base":"IPC"},"avg":0.944,"min":0.564,"max":1.291},"mem_bw":{"unit":{"base":"B/s","prefix":"G"},"avg":79.565,"min":0.021,"max":116.02},"mem_power":{"unit":{"base":"W"},"avg":24.692,"min":7.883,"max":31.318},"mem_used":{"unit":{"base":"B","prefix":"G"},"avg":22.566,"min":8.225,"max":27.613},"nfs4_read":{"unit":{"base":"B/s","prefix":"M"},"avg":647,"min":0,"max":1946},"nfs4_total":{"unit":{"base":"B/s","prefix":"M"},"avg":6181.6,"min":1270,"max":11411},"nfs4_write":{"unit":{"base":"B/s","prefix":"M"},"avg":22.4,"min":11,"max":29},"vectorization_ratio":{"unit":{"base":"%"},"avg":77.351,"min":0,"max":98.837}}} +{"jobId":398955,"user":"k106eb10","project":"k106eb","cluster":"fritz","subCluster":"main","partition":"singlenode","arrayJobId":0,"numNodes":1,"numHwthreads":72,"numAcc":0,"shared":"none","monitoringStatus":1,"smt":0,"jobState":"completed","duration":260,"walltime":86340,"resources":[{"hostname":"f0720"}],"metaData":{"jobName":"ams_pipeline","jobScript":"#!/bin/bash -l\n#SBATCH --job-name=ams_pipeline\n#SBATCH --time=23:59:00\n#SBATCH --partition=singlenode\n#SBATCH --ntasks=72\n#SBATCH --hint=multithread\n#SBATCH --chdir=/home/atuin/k106eb/k106eb10/ACE/Ni-Al/DFT/VASP_PBE_500_0.125_0.1_NM/AlNi/binaries/bulk/base-hcp/occ-shaken/hcp16.occ.4.shake.0/cfg/NiAl3NiAl11\n#SBATCH --export=NONE\nunset SLURM_EXPORT_ENV\nuss=$(whoami)\nfind /dev/shm/ -user $uss -type f -mmin +30 -delete\ncd \"/home/atuin/k106eb/k106eb10/ACE/Ni-Al/DFT/VASP_PBE_500_0.125_0.1_NM/AlNi/binaries/bulk/base-hcp/occ-shaken/hcp16.occ.4.shake.0/cfg/NiAl3NiAl11\"\nams_pipeline pipeline.json \u003e \"/home/atuin/k106eb/k106eb10/ACE/Ni-Al/DFT/VASP_PBE_500_0.125_0.1_NM/AlNi/binaries/bulk/base-hcp/occ-shaken/hcp16.occ.4.shake.0/cfg/NiAl3NiAl11/ams_pipeline_job.sh.out\" 2\u003e \"/home/atuin/k106eb/k106eb10/ACE/Ni-Al/DFT/VASP_PBE_500_0.125_0.1_NM/AlNi/binaries/bulk/base-hcp/occ-shaken/hcp16.occ.4.shake.0/cfg/NiAl3NiAl11/ams_pipeline_job.sh.err\"\n","slurmInfo":"\nJobId=398955 JobName=ams_pipeline\n UserId=k106eb10(210387) GroupId=80111\n Account=k106eb QOS=normal \n Requeue=False Restarts=0 BatchFlag=True \n TimeLimit=1439\n SubmitTime=2023-02-09T14:11:22\n Partition=singlenode \n NodeList=f0720\n NumNodes=1 NumCPUs=72 NumTasks=72 CPUs/Task=1\n NTasksPerNode:Socket:Core=0:None:None\n TRES_req=cpu=72,mem=250000M,node=1,billing=72\n TRES_alloc=cpu=72,node=1,billing=72\n Command=/home/atuin/k106eb/k106eb10/ACE/Ni-Al/DFT/VASP_PBE_500_0.125_0.1_NM/AlNi/binaries/bulk/base-hcp/occ-shaken/hcp16.occ.4.shake.0/cfg/NiAl3NiAl11/ams_pipeline_job.sh\n 
WorkDir=/home/atuin/k106eb/k106eb10/ACE/Ni-Al/DFT/VASP_PBE_500_0.125_0.1_NM/AlNi/binaries/bulk/base-hcp/occ-shaken/hcp16.occ.4.shake.0/cfg/NiAl3NiAl11\n StdErr=\n StdOut=ams_pipeline.o%j\n"},"startTime":1675956725,"statistics":{"clock":{"unit":{"base":"Hz","prefix":"M"},"avg":2335.254,"min":800.418,"max":2734.922},"cpu_load":{"unit":{"base":""},"avg":52.72,"min":34.46,"max":71.91},"cpu_power":{"unit":{"base":"W"},"avg":407.767,"min":93.932,"max":497.636},"cpu_user":{"unit":{"base":""},"avg":63.678,"min":19.872,"max":96.633},"flops_any":{"unit":{"base":"F/s","prefix":"G"},"avg":635.672,"min":0,"max":1332.874},"flops_dp":{"unit":{"base":"F/s","prefix":"G"},"avg":261.006,"min":0,"max":382.294},"flops_sp":{"unit":{"base":"F/s","prefix":"G"},"avg":113.659,"min":0,"max":568.286},"ib_recv":{"unit":{"base":"B/s"},"avg":27981.111,"min":69.4,"max":48084.589},"ib_recv_pkts":{"unit":{"base":"packets/s"},"avg":398.939,"min":0.5,"max":693.817},"ib_xmit":{"unit":{"base":"B/s"},"avg":188.513,"min":39.597,"max":724.568},"ib_xmit_pkts":{"unit":{"base":"packets/s"},"avg":0.867,"min":0.2,"max":2.933},"ipc":{"unit":{"base":"IPC"},"avg":0.944,"min":0.564,"max":1.291},"mem_bw":{"unit":{"base":"B/s","prefix":"G"},"avg":79.565,"min":0.021,"max":116.02},"mem_power":{"unit":{"base":"W"},"avg":24.692,"min":7.883,"max":31.318},"mem_used":{"unit":{"base":"B","prefix":"G"},"avg":22.566,"min":8.225,"max":27.613},"nfs4_read":{"unit":{"base":"B/s","prefix":"M"},"avg":647,"min":0,"max":1946},"nfs4_total":{"unit":{"base":"B/s","prefix":"M"},"avg":6181.6,"min":1270,"max":11411},"nfs4_write":{"unit":{"base":"B/s","prefix":"M"},"avg":22.4,"min":11,"max":29},"vectorization_ratio":{"unit":{"base":"%"},"avg":77.351,"min":0,"max":98.837}}} diff --git a/internal/importer/testdata/meta-fritzMinimal.input b/internal/importer/testdata/meta-fritzMinimal.input index f2cce79..f0289fb 100644 --- a/internal/importer/testdata/meta-fritzMinimal.input +++ b/internal/importer/testdata/meta-fritzMinimal.input @@ -1 +1 @@ 
-{"jobId":398764,"user":"k106eb10","project":"k106eb","cluster":"fritz","subCluster":"main","numNodes":1,"exclusive":1,"jobState":"completed","duration":177,"resources":[{"hostname":"f0649"}],"startTime":1675954353,"statistics":{"clock":{"unit":{"base":"Hz","prefix":"M"},"avg":1336.519,"min":801.564,"max":2348.215},"cpu_load":{"unit":{"base":""},"avg":31.64,"min":17.36,"max":45.54},"cpu_power":{"unit":{"base":"W"},"avg":150.018,"min":93.672,"max":261.592},"cpu_user":{"unit":{"base":""},"avg":28.518,"min":0.09,"max":57.343},"flops_any":{"unit":{"base":"F/s","prefix":"G"},"avg":45.012,"min":0,"max":135.037},"flops_dp":{"unit":{"base":"F/s","prefix":"G"},"avg":22.496,"min":0,"max":67.488},"flops_sp":{"unit":{"base":"F/s","prefix":"G"},"avg":0.02,"min":0,"max":0.061},"ib_recv":{"unit":{"base":"B/s"},"avg":14442.82,"min":219.998,"max":42581.368},"ib_recv_pkts":{"unit":{"base":"packets/s"},"avg":201.532,"min":1.25,"max":601.345},"ib_xmit":{"unit":{"base":"B/s"},"avg":282.098,"min":56.2,"max":569.363},"ib_xmit_pkts":{"unit":{"base":"packets/s"},"avg":1.228,"min":0.433,"max":2},"ipc":{"unit":{"base":"IPC"},"avg":0.77,"min":0.564,"max":0.906},"mem_bw":{"unit":{"base":"B/s","prefix":"G"},"avg":4.872,"min":0.025,"max":14.552},"mem_power":{"unit":{"base":"W"},"avg":7.725,"min":6.286,"max":10.556},"mem_used":{"unit":{"base":"B","prefix":"G"},"avg":6.162,"min":6.103,"max":6.226},"nfs4_read":{"unit":{"base":"B/s","prefix":"M"},"avg":1045.333,"min":311,"max":1525},"nfs4_total":{"unit":{"base":"B/s","prefix":"M"},"avg":6430,"min":2796,"max":11518},"nfs4_write":{"unit":{"base":"B/s","prefix":"M"},"avg":24.333,"min":0,"max":38},"vectorization_ratio":{"unit":{"base":"%"},"avg":25.528,"min":0,"max":76.585}}} +{"jobId":398764,"user":"k106eb10","project":"k106eb","cluster":"fritz","subCluster":"main","numNodes":1,"shared":"none","jobState":"completed","duration":177,"resources":[{"hostname":"f0649"}],"startTime":1675954353,"statistics":{"clock":{"unit":{"base":"Hz","prefix":"M"},"avg":1336.519,"min":801.564,"max":2348.215},"cpu_load":{"unit":{"base":""},"avg":31.64,"min":17.36,"max":45.54},"cpu_power":{"unit":{"base":"W"},"avg":150.018,"min":93.672,"max":261.592},"cpu_user":{"unit":{"base":""},"avg":28.518,"min":0.09,"max":57.343},"flops_any":{"unit":{"base":"F/s","prefix":"G"},"avg":45.012,"min":0,"max":135.037},"flops_dp":{"unit":{"base":"F/s","prefix":"G"},"avg":22.496,"min":0,"max":67.488},"flops_sp":{"unit":{"base":"F/s","prefix":"G"},"avg":0.02,"min":0,"max":0.061},"ib_recv":{"unit":{"base":"B/s"},"avg":14442.82,"min":219.998,"max":42581.368},"ib_recv_pkts":{"unit":{"base":"packets/s"},"avg":201.532,"min":1.25,"max":601.345},"ib_xmit":{"unit":{"base":"B/s"},"avg":282.098,"min":56.2,"max":569.363},"ib_xmit_pkts":{"unit":{"base":"packets/s"},"avg":1.228,"min":0.433,"max":2},"ipc":{"unit":{"base":"IPC"},"avg":0.77,"min":0.564,"max":0.906},"mem_bw":{"unit":{"base":"B/s","prefix":"G"},"avg":4.872,"min":0.025,"max":14.552},"mem_power":{"unit":{"base":"W"},"avg":7.725,"min":6.286,"max":10.556},"mem_used":{"unit":{"base":"B","prefix":"G"},"avg":6.162,"min":6.103,"max":6.226},"nfs4_read":{"unit":{"base":"B/s","prefix":"M"},"avg":1045.333,"min":311,"max":1525},"nfs4_total":{"unit":{"base":"B/s","prefix":"M"},"avg":6430,"min":2796,"max":11518},"nfs4_write":{"unit":{"base":"B/s","prefix":"M"},"avg":24.333,"min":0,"max":38},"vectorization_ratio":{"unit":{"base":"%"},"avg":25.528,"min":0,"max":76.585}}} diff --git a/internal/memorystore/api.go b/internal/memorystore/api.go new file mode 100644 index 0000000..367f245 --- 
/dev/null +++ b/internal/memorystore/api.go @@ -0,0 +1,419 @@ +// Copyright (C) NHR@FAU, University Erlangen-Nuremberg. +// All rights reserved. +// Use of this source code is governed by a MIT-style +// license that can be found in the LICENSE file. +package memorystore + +import ( + "bufio" + "encoding/json" + "errors" + "fmt" + "io" + "log" + "math" + "net/http" + "strconv" + "strings" + + "github.com/ClusterCockpit/cc-lib/schema" + "github.com/ClusterCockpit/cc-lib/util" + + "github.com/influxdata/line-protocol/v2/lineprotocol" +) + +// @title cc-metric-store REST API +// @version 1.0.0 +// @description API for cc-metric-store + +// @contact.name ClusterCockpit Project +// @contact.url https://clustercockpit.org +// @contact.email support@clustercockpit.org + +// @license.name MIT License +// @license.url https://opensource.org/licenses/MIT + +// @host localhost:8082 +// @basePath /api/ + +// @securityDefinitions.apikey ApiKeyAuth +// @in header +// @name X-Auth-Token + +// ErrorResponse model +type ErrorResponse struct { + // Statustext of Errorcode + Status string `json:"status"` + Error string `json:"error"` // Error Message +} + +type ApiMetricData struct { + Error *string `json:"error,omitempty"` + Data schema.FloatArray `json:"data,omitempty"` + From int64 `json:"from"` + To int64 `json:"to"` + Resolution int64 `json:"resolution"` + Avg schema.Float `json:"avg"` + Min schema.Float `json:"min"` + Max schema.Float `json:"max"` +} + +func handleError(err error, statusCode int, rw http.ResponseWriter) { + // log.Warnf("REST ERROR : %s", err.Error()) + rw.Header().Add("Content-Type", "application/json") + rw.WriteHeader(statusCode) + json.NewEncoder(rw).Encode(ErrorResponse{ + Status: http.StatusText(statusCode), + Error: err.Error(), + }) +} + +// TODO: Optimize this, just like the stats endpoint! +func (data *ApiMetricData) AddStats() { + n := 0 + sum, min, max := 0.0, math.MaxFloat64, -math.MaxFloat64 + for _, x := range data.Data { + if x.IsNaN() { + continue + } + + n += 1 + sum += float64(x) + min = math.Min(min, float64(x)) + max = math.Max(max, float64(x)) + } + + if n > 0 { + avg := sum / float64(n) + data.Avg = schema.Float(avg) + data.Min = schema.Float(min) + data.Max = schema.Float(max) + } else { + data.Avg, data.Min, data.Max = schema.NaN, schema.NaN, schema.NaN + } +} + +func (data *ApiMetricData) ScaleBy(f schema.Float) { + if f == 0 || f == 1 { + return + } + + data.Avg *= f + data.Min *= f + data.Max *= f + for i := 0; i < len(data.Data); i++ { + data.Data[i] *= f + } +} + +func (data *ApiMetricData) PadDataWithNull(ms *MemoryStore, from, to int64, metric string) { + minfo, ok := ms.Metrics[metric] + if !ok { + return + } + + if (data.From / minfo.Frequency) > (from / minfo.Frequency) { + padfront := int((data.From / minfo.Frequency) - (from / minfo.Frequency)) + ndata := make([]schema.Float, 0, padfront+len(data.Data)) + for i := 0; i < padfront; i++ { + ndata = append(ndata, schema.NaN) + } + for j := 0; j < len(data.Data); j++ { + ndata = append(ndata, data.Data[j]) + } + data.Data = ndata + } +} + +// handleFree godoc +// @summary +// @tags free +// @description This endpoint allows the users to free the Buffers from the +// metric store. This endpoint offers the users to remove then systematically +// and also allows then to prune the data under node, if they do not want to +// remove the whole node. 
+// @produce json +// @param to query string false "up to timestamp" +// @success 200 {string} string "ok" +// @failure 400 {object} api.ErrorResponse "Bad Request" +// @failure 401 {object} api.ErrorResponse "Unauthorized" +// @failure 403 {object} api.ErrorResponse "Forbidden" +// @failure 500 {object} api.ErrorResponse "Internal Server Error" +// @security ApiKeyAuth +// @router /free/ [post] +func HandleFree(rw http.ResponseWriter, r *http.Request) { + rawTo := r.URL.Query().Get("to") + if rawTo == "" { + handleError(errors.New("'to' is a required query parameter"), http.StatusBadRequest, rw) + return + } + + to, err := strconv.ParseInt(rawTo, 10, 64) + if err != nil { + handleError(err, http.StatusInternalServerError, rw) + return + } + + // // TODO: lastCheckpoint might be modified by different go-routines. + // // Load it using the sync/atomic package? + // freeUpTo := lastCheckpoint.Unix() + // if to < freeUpTo { + // freeUpTo = to + // } + + bodyDec := json.NewDecoder(r.Body) + var selectors [][]string + err = bodyDec.Decode(&selectors) + if err != nil { + http.Error(rw, err.Error(), http.StatusBadRequest) + return + } + + ms := GetMemoryStore() + n := 0 + for _, sel := range selectors { + bn, err := ms.Free(sel, to) + if err != nil { + handleError(err, http.StatusInternalServerError, rw) + return + } + + n += bn + } + + rw.WriteHeader(http.StatusOK) + fmt.Fprintf(rw, "buffers freed: %d\n", n) +} + +// handleWrite godoc +// @summary Receive metrics in InfluxDB line-protocol +// @tags write +// @description Write data to the in-memory store in the InfluxDB line-protocol using [this format](https://github.com/ClusterCockpit/cc-specifications/blob/master/metrics/lineprotocol_alternative.md) + +// @accept plain +// @produce json +// @param cluster query string false "If the lines in the body do not have a cluster tag, use this value instead." 
+// @success 200 {string} string "ok" +// @failure 400 {object} api.ErrorResponse "Bad Request" +// @failure 401 {object} api.ErrorResponse "Unauthorized" +// @failure 403 {object} api.ErrorResponse "Forbidden" +// @failure 500 {object} api.ErrorResponse "Internal Server Error" +// @security ApiKeyAuth +// @router /write/ [post] +func HandleWrite(rw http.ResponseWriter, r *http.Request) { + bytes, err := io.ReadAll(r.Body) + rw.Header().Add("Content-Type", "application/json") + if err != nil { + handleError(err, http.StatusInternalServerError, rw) + return + } + + ms := GetMemoryStore() + dec := lineprotocol.NewDecoderWithBytes(bytes) + if err := decodeLine(dec, ms, r.URL.Query().Get("cluster")); err != nil { + log.Printf("/api/write error: %s", err.Error()) + handleError(err, http.StatusBadRequest, rw) + return + } + rw.WriteHeader(http.StatusOK) +} + +type ApiQueryRequest struct { + Cluster string `json:"cluster"` + Queries []ApiQuery `json:"queries"` + ForAllNodes []string `json:"for-all-nodes"` + From int64 `json:"from"` + To int64 `json:"to"` + WithStats bool `json:"with-stats"` + WithData bool `json:"with-data"` + WithPadding bool `json:"with-padding"` +} + +type ApiQueryResponse struct { + Queries []ApiQuery `json:"queries,omitempty"` + Results [][]ApiMetricData `json:"results"` +} + +type ApiQuery struct { + Type *string `json:"type,omitempty"` + SubType *string `json:"subtype,omitempty"` + Metric string `json:"metric"` + Hostname string `json:"host"` + Resolution int64 `json:"resolution"` + TypeIds []string `json:"type-ids,omitempty"` + SubTypeIds []string `json:"subtype-ids,omitempty"` + ScaleFactor schema.Float `json:"scale-by,omitempty"` + Aggregate bool `json:"aggreg"` +} + +func FetchData(req ApiQueryRequest) (*ApiQueryResponse, error) { + + req.WithData = true + req.WithData = true + req.WithData = true + + ms := GetMemoryStore() + + response := ApiQueryResponse{ + Results: make([][]ApiMetricData, 0, len(req.Queries)), + } + if req.ForAllNodes != nil { + nodes := ms.ListChildren([]string{req.Cluster}) + for _, node := range nodes { + for _, metric := range req.ForAllNodes { + q := ApiQuery{ + Metric: metric, + Hostname: node, + } + req.Queries = append(req.Queries, q) + response.Queries = append(response.Queries, q) + } + } + } + + for _, query := range req.Queries { + sels := make([]util.Selector, 0, 1) + if query.Aggregate || query.Type == nil { + sel := util.Selector{{String: req.Cluster}, {String: query.Hostname}} + if query.Type != nil { + if len(query.TypeIds) == 1 { + sel = append(sel, util.SelectorElement{String: *query.Type + query.TypeIds[0]}) + } else { + ids := make([]string, len(query.TypeIds)) + for i, id := range query.TypeIds { + ids[i] = *query.Type + id + } + sel = append(sel, util.SelectorElement{Group: ids}) + } + + if query.SubType != nil { + if len(query.SubTypeIds) == 1 { + sel = append(sel, util.SelectorElement{String: *query.SubType + query.SubTypeIds[0]}) + } else { + ids := make([]string, len(query.SubTypeIds)) + for i, id := range query.SubTypeIds { + ids[i] = *query.SubType + id + } + sel = append(sel, util.SelectorElement{Group: ids}) + } + } + } + sels = append(sels, sel) + } else { + for _, typeId := range query.TypeIds { + if query.SubType != nil { + for _, subTypeId := range query.SubTypeIds { + sels = append(sels, util.Selector{ + {String: req.Cluster}, + {String: query.Hostname}, + {String: *query.Type + typeId}, + {String: *query.SubType + subTypeId}, + }) + } + } else { + sels = append(sels, util.Selector{ + {String: req.Cluster}, + 
{String: query.Hostname}, + {String: *query.Type + typeId}, + }) + } + } + } + + // log.Printf("query: %#v\n", query) + // log.Printf("sels: %#v\n", sels) + var err error + res := make([]ApiMetricData, 0, len(sels)) + for _, sel := range sels { + data := ApiMetricData{} + + data.Data, data.From, data.To, data.Resolution, err = ms.Read(sel, query.Metric, req.From, req.To, query.Resolution) + + if err != nil { + msg := err.Error() + data.Error = &msg + res = append(res, data) + continue + } + + if req.WithStats { + data.AddStats() + } + if query.ScaleFactor != 0 { + data.ScaleBy(query.ScaleFactor) + } + if req.WithPadding { + data.PadDataWithNull(ms, req.From, req.To, query.Metric) + } + if !req.WithData { + data.Data = nil + } + res = append(res, data) + } + response.Results = append(response.Results, res) + } + + return &response, nil +} + +// handleDebug godoc +// @summary Debug endpoint +// @tags debug +// @description This endpoint allows the users to print the content of +// nodes/clusters/metrics to review the state of the data. +// @produce json +// @param selector query string false "Selector" +// @success 200 {string} string "Debug dump" +// @failure 400 {object} api.ErrorResponse "Bad Request" +// @failure 401 {object} api.ErrorResponse "Unauthorized" +// @failure 403 {object} api.ErrorResponse "Forbidden" +// @failure 500 {object} api.ErrorResponse "Internal Server Error" +// @security ApiKeyAuth +// @router /debug/ [post] +func HandleDebug(rw http.ResponseWriter, r *http.Request) { + raw := r.URL.Query().Get("selector") + rw.Header().Add("Content-Type", "application/json") + selector := []string{} + if len(raw) != 0 { + selector = strings.Split(raw, ":") + } + + ms := GetMemoryStore() + if err := ms.DebugDump(bufio.NewWriter(rw), selector); err != nil { + handleError(err, http.StatusBadRequest, rw) + return + } +} + +// handleHealthCheck godoc +// @summary HealthCheck endpoint +// @tags healthcheck +// @description This endpoint allows the users to check if a node is healthy +// @produce json +// @param selector query string false "Selector" +// @success 200 {string} string "Debug dump" +// @failure 400 {object} api.ErrorResponse "Bad Request" +// @failure 401 {object} api.ErrorResponse "Unauthorized" +// @failure 403 {object} api.ErrorResponse "Forbidden" +// @failure 500 {object} api.ErrorResponse "Internal Server Error" +// @security ApiKeyAuth +// @router /healthcheck/ [get] +func HandleHealthCheck(rw http.ResponseWriter, r *http.Request) { + rawCluster := r.URL.Query().Get("cluster") + rawNode := r.URL.Query().Get("node") + + if rawCluster == "" || rawNode == "" { + handleError(errors.New("'cluster' and 'node' are required query parameter"), http.StatusBadRequest, rw) + return + } + + rw.Header().Add("Content-Type", "application/json") + + selector := []string{rawCluster, rawNode} + + ms := GetMemoryStore() + if err := ms.HealthCheck(bufio.NewWriter(rw), selector); err != nil { + handleError(err, http.StatusBadRequest, rw) + return + } +} diff --git a/internal/memorystore/archive.go b/internal/memorystore/archive.go new file mode 100644 index 0000000..9720d20 --- /dev/null +++ b/internal/memorystore/archive.go @@ -0,0 +1,192 @@ +// Copyright (C) NHR@FAU, University Erlangen-Nuremberg. +// All rights reserved. This file is part of cc-backend. +// Use of this source code is governed by a MIT-style +// license that can be found in the LICENSE file. 
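// FetchData above is driven by ApiQueryRequest/ApiQuery, whose struct tags define the
// JSON body accepted by the query endpoint. A self-contained sketch of such a request,
// using local mirrors of those tags; cluster, host and metric names are placeholders
// (taken loosely from the test data), and the "hwthread" type label is only an
// assumption for illustration.
package main

import (
	"encoding/json"
	"fmt"
)

type apiQuery struct {
	Metric     string   `json:"metric"`
	Hostname   string   `json:"host"`
	Resolution int64    `json:"resolution"`
	Type       *string  `json:"type,omitempty"`
	TypeIds    []string `json:"type-ids,omitempty"`
	Aggregate  bool     `json:"aggreg"`
}

type apiQueryRequest struct {
	Cluster   string     `json:"cluster"`
	From      int64      `json:"from"`
	To        int64      `json:"to"`
	WithStats bool       `json:"with-stats"`
	WithData  bool       `json:"with-data"`
	Queries   []apiQuery `json:"queries"`
}

func main() {
	hwthread := "hwthread"
	req := apiQueryRequest{
		Cluster:   "fritz",
		From:      1675954000,
		To:        1675958000,
		WithStats: true,
		WithData:  true,
		Queries: []apiQuery{
			{Metric: "flops_any", Hostname: "f0649", Resolution: 60},
			{Metric: "cpu_load", Hostname: "f0649", Resolution: 60,
				Type: &hwthread, TypeIds: []string{"0", "1"}, Aggregate: true},
		},
	}
	out, _ := json.MarshalIndent(req, "", "  ")
	fmt.Println(string(out))
}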
+package memorystore + +import ( + "archive/zip" + "bufio" + "context" + "errors" + "fmt" + "io" + "log" + "os" + "path/filepath" + "sync" + "sync/atomic" + "time" + + "github.com/ClusterCockpit/cc-backend/internal/config" + cclog "github.com/ClusterCockpit/cc-lib/ccLogger" +) + +func Archiving(wg *sync.WaitGroup, ctx context.Context) { + go func() { + defer wg.Done() + d, err := time.ParseDuration(config.MetricStoreKeys.Archive.Interval) + if err != nil { + log.Fatalf("[METRICSTORE]> error parsing archive interval duration: %v\n", err) + } + if d <= 0 { + return + } + + ticks := func() <-chan time.Time { + if d <= 0 { + return nil + } + return time.NewTicker(d).C + }() + for { + select { + case <-ctx.Done(): + return + case <-ticks: + t := time.Now().Add(-d) + log.Printf("[METRICSTORE]> start archiving checkpoints (older than %s)...\n", t.Format(time.RFC3339)) + n, err := ArchiveCheckpoints(config.MetricStoreKeys.Checkpoints.RootDir, + config.MetricStoreKeys.Archive.RootDir, t.Unix(), config.MetricStoreKeys.Archive.DeleteInstead) + + if err != nil { + log.Printf("[METRICSTORE]> archiving failed: %s\n", err.Error()) + } else { + log.Printf("[METRICSTORE]> done: %d files zipped and moved to archive\n", n) + } + } + } + }() +} + +var ErrNoNewData error = errors.New("all data already archived") + +// ZIP all checkpoint files older than `from` together and write them to the `archiveDir`, +// deleting them from the `checkpointsDir`. +func ArchiveCheckpoints(checkpointsDir, archiveDir string, from int64, deleteInstead bool) (int, error) { + entries1, err := os.ReadDir(checkpointsDir) + if err != nil { + return 0, err + } + + type workItem struct { + cdir, adir string + cluster, host string + } + + var wg sync.WaitGroup + n, errs := int32(0), int32(0) + work := make(chan workItem, NumWorkers) + + wg.Add(NumWorkers) + for worker := 0; worker < NumWorkers; worker++ { + go func() { + defer wg.Done() + for workItem := range work { + m, err := archiveCheckpoints(workItem.cdir, workItem.adir, from, deleteInstead) + if err != nil { + cclog.Errorf("error while archiving %s/%s: %s", workItem.cluster, workItem.host, err.Error()) + atomic.AddInt32(&errs, 1) + } + atomic.AddInt32(&n, int32(m)) + } + }() + } + + for _, de1 := range entries1 { + entries2, e := os.ReadDir(filepath.Join(checkpointsDir, de1.Name())) + if e != nil { + err = e + } + + for _, de2 := range entries2 { + cdir := filepath.Join(checkpointsDir, de1.Name(), de2.Name()) + adir := filepath.Join(archiveDir, de1.Name(), de2.Name()) + work <- workItem{ + adir: adir, cdir: cdir, + cluster: de1.Name(), host: de2.Name(), + } + } + } + + close(work) + wg.Wait() + + if err != nil { + return int(n), err + } + + if errs > 0 { + return int(n), fmt.Errorf("%d errors happend while archiving (%d successes)", errs, n) + } + return int(n), nil +} + +// Helper function for `ArchiveCheckpoints`. 
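// ArchiveCheckpoints above fans the per-host checkpoint directories out to NumWorkers
// goroutines over a channel and aggregates counts with sync/atomic. A generic,
// self-contained sketch of that worker-pool pattern; processItem stands in for the
// real per-host archiving step and the item names are made up.
package main

import (
	"fmt"
	"sync"
	"sync/atomic"
)

// processItem is a stand-in for archiveCheckpoints(cdir, adir, from, deleteInstead).
func processItem(item string) (int, error) {
	return 1, nil
}

func main() {
	const numWorkers = 4
	work := make(chan string, numWorkers)

	var (
		wg        sync.WaitGroup
		processed int32
		errCount  int32
	)

	wg.Add(numWorkers)
	for w := 0; w < numWorkers; w++ {
		go func() {
			defer wg.Done()
			for item := range work {
				n, err := processItem(item)
				if err != nil {
					atomic.AddInt32(&errCount, 1)
					continue
				}
				atomic.AddInt32(&processed, int32(n))
			}
		}()
	}

	for _, host := range []string{"cluster/host0", "cluster/host1", "cluster/host2"} {
		work <- host
	}
	close(work) // workers exit once the channel is drained
	wg.Wait()

	fmt.Printf("processed=%d errors=%d\n", processed, errCount)
}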
+func archiveCheckpoints(dir string, archiveDir string, from int64, deleteInstead bool) (int, error) { + entries, err := os.ReadDir(dir) + if err != nil { + return 0, err + } + + extension := config.MetricStoreKeys.Checkpoints.FileFormat + files, err := findFiles(entries, from, extension, false) + if err != nil { + return 0, err + } + + if deleteInstead { + n := 0 + for _, checkpoint := range files { + filename := filepath.Join(dir, checkpoint) + if err = os.Remove(filename); err != nil { + return n, err + } + n += 1 + } + return n, nil + } + + filename := filepath.Join(archiveDir, fmt.Sprintf("%d.zip", from)) + f, err := os.OpenFile(filename, os.O_CREATE|os.O_WRONLY, 0o644) + if err != nil && os.IsNotExist(err) { + err = os.MkdirAll(archiveDir, 0o755) + if err == nil { + f, err = os.OpenFile(filename, os.O_CREATE|os.O_WRONLY, 0o644) + } + } + if err != nil { + return 0, err + } + defer f.Close() + bw := bufio.NewWriter(f) + defer bw.Flush() + zw := zip.NewWriter(bw) + defer zw.Close() + + n := 0 + for _, checkpoint := range files { + filename := filepath.Join(dir, checkpoint) + r, err := os.Open(filename) + if err != nil { + return n, err + } + defer r.Close() + + w, err := zw.Create(checkpoint) + if err != nil { + return n, err + } + + if _, err = io.Copy(w, r); err != nil { + return n, err + } + + if err = os.Remove(filename); err != nil { + return n, err + } + n += 1 + } + + return n, nil +} diff --git a/internal/memorystore/buffer.go b/internal/memorystore/buffer.go new file mode 100644 index 0000000..39e9abc --- /dev/null +++ b/internal/memorystore/buffer.go @@ -0,0 +1,233 @@ +package memorystore + +import ( + "errors" + "sync" + + "github.com/ClusterCockpit/cc-lib/schema" +) + +// Default buffer capacity. +// `buffer.data` will only ever grow up to it's capacity and a new link +// in the buffer chain will be created if needed so that no copying +// of data or reallocation needs to happen on writes. +const ( + BUFFER_CAP int = 512 +) + +// So that we can reuse allocations +var bufferPool sync.Pool = sync.Pool{ + New: func() interface{} { + return &buffer{ + data: make([]schema.Float, 0, BUFFER_CAP), + } + }, +} + +var ( + ErrNoData error = errors.New("[METRICSTORE]> no data for this metric/level") + ErrDataDoesNotAlign error = errors.New("[METRICSTORE]> data from lower granularities does not align") +) + +// Each metric on each level has it's own buffer. +// This is where the actual values go. +// If `cap(data)` is reached, a new buffer is created and +// becomes the new head of a buffer list. +type buffer struct { + prev *buffer + next *buffer + data []schema.Float + frequency int64 + start int64 + archived bool + closed bool +} + +func newBuffer(ts, freq int64) *buffer { + b := bufferPool.Get().(*buffer) + b.frequency = freq + b.start = ts - (freq / 2) + b.prev = nil + b.next = nil + b.archived = false + b.closed = false + b.data = b.data[:0] + return b +} + +// If a new buffer was created, the new head is returnd. +// Otherwise, the existing buffer is returnd. +// Normaly, only "newer" data should be written, but if the value would +// end up in the same buffer anyways it is allowed. 
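// write() below maps each timestamp to a slot via idx = (ts - start) / frequency, and
// newBuffer above anchors start half a step before the first sample, so readings within
// half a step of a sample point share a slot. A tiny standalone mirror of that slot
// arithmetic; frequency and timestamps are illustrative.
package main

import "fmt"

func main() {
	const freq int64 = 60      // one sample per minute
	first := int64(1675954353) // timestamp of the first write
	start := first - freq/2    // same anchoring as newBuffer()

	for _, ts := range []int64{first, first + 29, first + 60, first + 125} {
		idx := (ts - start) / freq
		fmt.Printf("ts=%d -> slot %d\n", ts, idx)
	}
	// The first two timestamps land in slot 0, first+60 in slot 1 and first+125 in
	// slot 2; once idx reaches the buffer capacity, write() below links a fresh
	// buffer in front of the chain instead of growing this one.
}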
+func (b *buffer) write(ts int64, value schema.Float) (*buffer, error) { + if ts < b.start { + return nil, errors.New("[METRICSTORE]> cannot write value to buffer from past") + } + + // idx := int((ts - b.start + (b.frequency / 3)) / b.frequency) + idx := int((ts - b.start) / b.frequency) + if idx >= cap(b.data) { + newbuf := newBuffer(ts, b.frequency) + newbuf.prev = b + b.next = newbuf + b.close() + b = newbuf + idx = 0 + } + + // Overwriting value or writing value from past + if idx < len(b.data) { + b.data[idx] = value + return b, nil + } + + // Fill up unwritten slots with NaN + for i := len(b.data); i < idx; i++ { + b.data = append(b.data, schema.NaN) + } + + b.data = append(b.data, value) + return b, nil +} + +func (b *buffer) end() int64 { + return b.firstWrite() + int64(len(b.data))*b.frequency +} + +func (b *buffer) firstWrite() int64 { + return b.start + (b.frequency / 2) +} + +func (b *buffer) close() {} + +/* +func (b *buffer) close() { + if b.closed { + return + } + + b.closed = true + n, sum, min, max := 0, 0., math.MaxFloat64, -math.MaxFloat64 + for _, x := range b.data { + if x.IsNaN() { + continue + } + + n += 1 + f := float64(x) + sum += f + min = math.Min(min, f) + max = math.Max(max, f) + } + + b.statisticts.samples = n + if n > 0 { + b.statisticts.avg = Float(sum / float64(n)) + b.statisticts.min = Float(min) + b.statisticts.max = Float(max) + } else { + b.statisticts.avg = NaN + b.statisticts.min = NaN + b.statisticts.max = NaN + } +} +*/ + +// func interpolate(idx int, data []Float) Float { +// if idx == 0 || idx+1 == len(data) { +// return NaN +// } +// return (data[idx-1] + data[idx+1]) / 2.0 +// } + +// Return all known values from `from` to `to`. Gaps of information are represented as NaN. +// Simple linear interpolation is done between the two neighboring cells if possible. +// If values at the start or end are missing, instead of NaN values, the second and thrid +// return values contain the actual `from`/`to`. +// This function goes back the buffer chain if `from` is older than the currents buffer start. +// The loaded values are added to `data` and `data` is returned, possibly with a shorter length. +// If `data` is not long enough to hold all values, this function will panic! +func (b *buffer) read(from, to int64, data []schema.Float) ([]schema.Float, int64, int64, error) { + if from < b.firstWrite() { + if b.prev != nil { + return b.prev.read(from, to, data) + } + from = b.firstWrite() + } + + i := 0 + t := from + for ; t < to; t += b.frequency { + idx := int((t - b.start) / b.frequency) + if idx >= cap(b.data) { + if b.next == nil { + break + } + b = b.next + idx = 0 + } + + if idx >= len(b.data) { + if b.next == nil || to <= b.next.start { + break + } + data[i] += schema.NaN + } else if t < b.start { + data[i] += schema.NaN + // } else if b.data[idx].IsNaN() { + // data[i] += interpolate(idx, b.data) + } else { + data[i] += b.data[idx] + } + i++ + } + + return data[:i], from, t, nil +} + +// Returns true if this buffer needs to be freed. +func (b *buffer) free(t int64) (delme bool, n int) { + if b.prev != nil { + delme, m := b.prev.free(t) + n += m + if delme { + b.prev.next = nil + if cap(b.prev.data) == BUFFER_CAP { + bufferPool.Put(b.prev) + } + b.prev = nil + } + } + + end := b.end() + if end < t { + return true, n + 1 + } + + return false, n +} + +// Call `callback` on every buffer that contains data in the range from `from` to `to`. 
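// free() above releases every buffer whose end() lies before a cutoff timestamp, and
// HandleFree earlier passes such a cutoff to MemoryStore.Free. A hedged sketch of one
// plausible way to derive that cutoff from the "retention-in-memory" duration shown in
// the config section; the retention value is illustrative and the wiring to a periodic
// cleanup task is an assumption, not code from this diff.
package main

import (
	"fmt"
	"log"
	"time"
)

func main() {
	retention := "48h" // illustrative stand-in for MetricStoreKeys.RetentionInMemory
	d, err := time.ParseDuration(retention)
	if err != nil {
		log.Fatalf("invalid retention duration: %v", err)
	}

	cutoff := time.Now().Add(-d).Unix()
	fmt.Printf("buffers ending before %d (%s ago) are candidates for freeing\n", cutoff, d)
}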
+func (b *buffer) iterFromTo(from, to int64, callback func(b *buffer) error) error { + if b == nil { + return nil + } + + if err := b.prev.iterFromTo(from, to, callback); err != nil { + return err + } + + if from <= b.end() && b.start <= to { + return callback(b) + } + + return nil +} + +func (b *buffer) count() int64 { + res := int64(len(b.data)) + if b.prev != nil { + res += b.prev.count() + } + return res +} diff --git a/internal/memorystore/checkpoint.go b/internal/memorystore/checkpoint.go new file mode 100644 index 0000000..adee443 --- /dev/null +++ b/internal/memorystore/checkpoint.go @@ -0,0 +1,765 @@ +package memorystore + +import ( + "bufio" + "context" + "encoding/json" + "errors" + "fmt" + "io/fs" + "log" + "os" + "path" + "path/filepath" + "runtime" + "sort" + "strconv" + "strings" + "sync" + "sync/atomic" + "time" + + "github.com/ClusterCockpit/cc-backend/internal/avro" + "github.com/ClusterCockpit/cc-backend/internal/config" + "github.com/ClusterCockpit/cc-lib/schema" + "github.com/linkedin/goavro/v2" +) + +// Whenever changed, update MarshalJSON as well! +type CheckpointMetrics struct { + Data []schema.Float `json:"data"` + Frequency int64 `json:"frequency"` + Start int64 `json:"start"` +} + +type CheckpointFile struct { + Metrics map[string]*CheckpointMetrics `json:"metrics"` + Children map[string]*CheckpointFile `json:"children"` + From int64 `json:"from"` + To int64 `json:"to"` +} + +var lastCheckpoint time.Time + +func Checkpointing(wg *sync.WaitGroup, ctx context.Context) { + lastCheckpoint = time.Now() + + if config.MetricStoreKeys.Checkpoints.FileFormat == "json" { + ms := GetMemoryStore() + + go func() { + defer wg.Done() + d, err := time.ParseDuration(config.MetricStoreKeys.Checkpoints.Interval) + if err != nil { + log.Fatal(err) + } + if d <= 0 { + return + } + + ticks := func() <-chan time.Time { + if d <= 0 { + return nil + } + return time.NewTicker(d).C + }() + for { + select { + case <-ctx.Done(): + return + case <-ticks: + log.Printf("[METRICSTORE]> start checkpointing (starting at %s)...\n", lastCheckpoint.Format(time.RFC3339)) + now := time.Now() + n, err := ms.ToCheckpoint(config.MetricStoreKeys.Checkpoints.RootDir, + lastCheckpoint.Unix(), now.Unix()) + if err != nil { + log.Printf("[METRICSTORE]> checkpointing failed: %s\n", err.Error()) + } else { + log.Printf("[METRICSTORE]> done: %d checkpoint files created\n", n) + lastCheckpoint = now + } + } + } + }() + } else { + go func() { + defer wg.Done() + d, _ := time.ParseDuration("1m") + + select { + case <-ctx.Done(): + return + case <-time.After(time.Duration(avro.CheckpointBufferMinutes) * time.Minute): + // This is the first tick untill we collect the data for given minutes. + avro.GetAvroStore().ToCheckpoint(config.MetricStoreKeys.Checkpoints.RootDir, false) + // log.Printf("Checkpointing %d avro files", count) + + } + + ticks := func() <-chan time.Time { + if d <= 0 { + return nil + } + return time.NewTicker(d).C + }() + + for { + select { + case <-ctx.Done(): + return + case <-ticks: + // Regular ticks of 1 minute to write data. + avro.GetAvroStore().ToCheckpoint(config.MetricStoreKeys.Checkpoints.RootDir, false) + // log.Printf("Checkpointing %d avro files", count) + } + } + }() + } +} + +// As `Float` implements a custom MarshalJSON() function, +// serializing an array of such types has more overhead +// than one would assume (because of extra allocations, interfaces and so on). 
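// The comment above explains why CheckpointMetrics carries a hand-written MarshalJSON;
// it also lets gaps (NaN) be written as JSON null. A self-contained mirror of that
// append-based encoding for a plain float64 slice; the sample values are illustrative.
package main

import (
	"fmt"
	"math"
	"strconv"
)

func marshalSeries(data []float64) []byte {
	buf := make([]byte, 0, 16+len(data)*8)
	buf = append(buf, '[')
	for i, x := range data {
		if i != 0 {
			buf = append(buf, ',')
		}
		if math.IsNaN(x) {
			buf = append(buf, "null"...)
		} else {
			buf = strconv.AppendFloat(buf, x, 'f', 1, 32)
		}
	}
	return append(buf, ']')
}

func main() {
	fmt.Println(string(marshalSeries([]float64{2335.3, math.NaN(), 2400.0})))
	// prints: [2335.3,null,2400.0]
}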
+func (cm *CheckpointMetrics) MarshalJSON() ([]byte, error) { + buf := make([]byte, 0, 128+len(cm.Data)*8) + buf = append(buf, `{"frequency":`...) + buf = strconv.AppendInt(buf, cm.Frequency, 10) + buf = append(buf, `,"start":`...) + buf = strconv.AppendInt(buf, cm.Start, 10) + buf = append(buf, `,"data":[`...) + for i, x := range cm.Data { + if i != 0 { + buf = append(buf, ',') + } + if x.IsNaN() { + buf = append(buf, `null`...) + } else { + buf = strconv.AppendFloat(buf, float64(x), 'f', 1, 32) + } + } + buf = append(buf, `]}`...) + return buf, nil +} + +// Metrics stored at the lowest 2 levels are not stored away (root and cluster)! +// On a per-host basis a new JSON file is created. I have no idea if this will scale. +// The good thing: Only a host at a time is locked, so this function can run +// in parallel to writes/reads. +func (m *MemoryStore) ToCheckpoint(dir string, from, to int64) (int, error) { + levels := make([]*Level, 0) + selectors := make([][]string, 0) + m.root.lock.RLock() + for sel1, l1 := range m.root.children { + l1.lock.RLock() + for sel2, l2 := range l1.children { + levels = append(levels, l2) + selectors = append(selectors, []string{sel1, sel2}) + } + l1.lock.RUnlock() + } + m.root.lock.RUnlock() + + type workItem struct { + level *Level + dir string + selector []string + } + + n, errs := int32(0), int32(0) + + var wg sync.WaitGroup + wg.Add(NumWorkers) + work := make(chan workItem, NumWorkers*2) + for worker := 0; worker < NumWorkers; worker++ { + go func() { + defer wg.Done() + + for workItem := range work { + if err := workItem.level.toCheckpoint(workItem.dir, from, to, m); err != nil { + if err == ErrNoNewData { + continue + } + + log.Printf("[METRICSTORE]> error while checkpointing %#v: %s", workItem.selector, err.Error()) + atomic.AddInt32(&errs, 1) + } else { + atomic.AddInt32(&n, 1) + } + } + }() + } + + for i := 0; i < len(levels); i++ { + dir := path.Join(dir, path.Join(selectors[i]...)) + work <- workItem{ + level: levels[i], + dir: dir, + selector: selectors[i], + } + } + + close(work) + wg.Wait() + + if errs > 0 { + return int(n), fmt.Errorf("[METRICSTORE]> %d errors happend while creating checkpoints (%d successes)", errs, n) + } + return int(n), nil +} + +func (l *Level) toCheckpointFile(from, to int64, m *MemoryStore) (*CheckpointFile, error) { + l.lock.RLock() + defer l.lock.RUnlock() + + retval := &CheckpointFile{ + From: from, + To: to, + Metrics: make(map[string]*CheckpointMetrics), + Children: make(map[string]*CheckpointFile), + } + + for metric, minfo := range m.Metrics { + b := l.metrics[minfo.Offset] + if b == nil { + continue + } + + allArchived := true + b.iterFromTo(from, to, func(b *buffer) error { + if !b.archived { + allArchived = false + } + return nil + }) + + if allArchived { + continue + } + + data := make([]schema.Float, (to-from)/b.frequency+1) + data, start, end, err := b.read(from, to, data) + if err != nil { + return nil, err + } + + for i := int((end - start) / b.frequency); i < len(data); i++ { + data[i] = schema.NaN + } + + retval.Metrics[metric] = &CheckpointMetrics{ + Frequency: b.frequency, + Start: start, + Data: data, + } + } + + for name, child := range l.children { + val, err := child.toCheckpointFile(from, to, m) + if err != nil { + return nil, err + } + + if val != nil { + retval.Children[name] = val + } + } + + if len(retval.Children) == 0 && len(retval.Metrics) == 0 { + return nil, nil + } + + return retval, nil +} + +func (l *Level) toCheckpoint(dir string, from, to int64, m *MemoryStore) error { + cf, err := 
l.toCheckpointFile(from, to, m) + if err != nil { + return err + } + + if cf == nil { + return ErrNoNewData + } + + filepath := path.Join(dir, fmt.Sprintf("%d.json", from)) + f, err := os.OpenFile(filepath, os.O_CREATE|os.O_WRONLY, 0o644) + if err != nil && os.IsNotExist(err) { + err = os.MkdirAll(dir, 0o755) + if err == nil { + f, err = os.OpenFile(filepath, os.O_CREATE|os.O_WRONLY, 0o644) + } + } + if err != nil { + return err + } + defer f.Close() + + bw := bufio.NewWriter(f) + if err = json.NewEncoder(bw).Encode(cf); err != nil { + return err + } + + return bw.Flush() +} + +func (m *MemoryStore) FromCheckpoint(dir string, from int64, extension string) (int, error) { + var wg sync.WaitGroup + work := make(chan [2]string, NumWorkers) + n, errs := int32(0), int32(0) + + wg.Add(NumWorkers) + for worker := 0; worker < NumWorkers; worker++ { + go func() { + defer wg.Done() + for host := range work { + lvl := m.root.findLevelOrCreate(host[:], len(m.Metrics)) + nn, err := lvl.fromCheckpoint(m, filepath.Join(dir, host[0], host[1]), from, extension) + if err != nil { + log.Fatalf("[METRICSTORE]> error while loading checkpoints: %s", err.Error()) + atomic.AddInt32(&errs, 1) + } + atomic.AddInt32(&n, int32(nn)) + } + }() + } + + i := 0 + clustersDir, err := os.ReadDir(dir) + for _, clusterDir := range clustersDir { + if !clusterDir.IsDir() { + err = errors.New("[METRICSTORE]> expected only directories at first level of checkpoints/ directory") + goto done + } + + hostsDir, e := os.ReadDir(filepath.Join(dir, clusterDir.Name())) + if e != nil { + err = e + goto done + } + + for _, hostDir := range hostsDir { + if !hostDir.IsDir() { + err = errors.New("[METRICSTORE]> expected only directories at second level of checkpoints/ directory") + goto done + } + + i++ + if i%NumWorkers == 0 && i > 100 { + // Forcing garbage collection runs here regulary during the loading of checkpoints + // will decrease the total heap size after loading everything back to memory is done. + // While loading data, the heap will grow fast, so the GC target size will double + // almost always. By forcing GCs here, we can keep it growing more slowly so that + // at the end, less memory is wasted. + runtime.GC() + } + + work <- [2]string{clusterDir.Name(), hostDir.Name()} + } + } +done: + close(work) + wg.Wait() + + if err != nil { + return int(n), err + } + + if errs > 0 { + return int(n), fmt.Errorf("[METRICSTORE]> %d errors happend while creating checkpoints (%d successes)", errs, n) + } + return int(n), nil +} + +// Metrics stored at the lowest 2 levels are not loaded (root and cluster)! +// This function can only be called once and before the very first write or read. +// Different host's data is loaded to memory in parallel. 
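// ToCheckpoint above writes one <from>.json per cluster/host directory, and loadAvroFile
// below parses avro checkpoint names of the form <resolution>_<from>.avro. A small,
// self-contained sketch of recovering the start timestamp from such file names; the
// sample names reuse timestamps from the test data but are otherwise made up.
package main

import (
	"fmt"
	"path/filepath"
	"strconv"
	"strings"
)

func startTimestamp(name string) (int64, error) {
	base := strings.TrimSuffix(name, filepath.Ext(name)) // drop ".json" / ".avro"
	if i := strings.LastIndex(base, "_"); i >= 0 {
		base = base[i+1:] // drop the resolution prefix of avro checkpoints
	}
	return strconv.ParseInt(base, 10, 64)
}

func main() {
	for _, name := range []string{"1675954353.json", "60_1675956725.avro"} {
		ts, err := startTimestamp(name)
		fmt.Println(name, ts, err)
	}
}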
+func (m *MemoryStore) FromCheckpointFiles(dir string, from int64) (int, error) { + if _, err := os.Stat(dir); os.IsNotExist(err) { + // The directory does not exist, so create it using os.MkdirAll() + err := os.MkdirAll(dir, 0755) // 0755 sets the permissions for the directory + if err != nil { + log.Fatalf("[METRICSTORE]> Error creating directory: %#v\n", err) + } + log.Printf("[METRICSTORE]> %#v Directory created successfully.\n", dir) + } + + // Config read (replace with your actual config read) + fileFormat := config.MetricStoreKeys.Checkpoints.FileFormat + if fileFormat == "" { + fileFormat = "avro" + } + + // Map to easily get the fallback format + oppositeFormat := map[string]string{ + "json": "avro", + "avro": "json", + } + + // First, attempt to load the specified format + if found, err := checkFilesWithExtension(dir, fileFormat); err != nil { + return 0, fmt.Errorf("[METRICSTORE]> error checking files with extension: %v", err) + } else if found { + log.Printf("[METRICSTORE]> Loading %s files because fileformat is %s\n", fileFormat, fileFormat) + return m.FromCheckpoint(dir, from, fileFormat) + } + + // If not found, attempt the opposite format + altFormat := oppositeFormat[fileFormat] + if found, err := checkFilesWithExtension(dir, altFormat); err != nil { + return 0, fmt.Errorf("[METRICSTORE]> error checking files with extension: %v", err) + } else if found { + log.Printf("[METRICSTORE]> Loading %s files but fileformat is %s\n", altFormat, fileFormat) + return m.FromCheckpoint(dir, from, altFormat) + } + + log.Println("[METRICSTORE]> No valid checkpoint files found in the directory.") + return 0, nil +} + +func checkFilesWithExtension(dir string, extension string) (bool, error) { + found := false + + err := filepath.Walk(dir, func(path string, info os.FileInfo, err error) error { + if err != nil { + return fmt.Errorf("[METRICSTORE]> error accessing path %s: %v", path, err) + } + if !info.IsDir() && filepath.Ext(info.Name()) == "."+extension { + found = true + return nil + } + return nil + }) + if err != nil { + return false, fmt.Errorf("[METRICSTORE]> error walking through directories: %s", err) + } + + return found, nil +} + +func (l *Level) loadAvroFile(m *MemoryStore, f *os.File, from int64) error { + br := bufio.NewReader(f) + + fileName := f.Name()[strings.LastIndex(f.Name(), "/")+1:] + resolution, err := strconv.ParseInt(fileName[0:strings.Index(fileName, "_")], 10, 64) + if err != nil { + return fmt.Errorf("[METRICSTORE]> error while reading avro file (resolution parsing) : %s", err) + } + + from_timestamp, err := strconv.ParseInt(fileName[strings.Index(fileName, "_")+1:len(fileName)-5], 10, 64) + + // Same logic according to lineprotocol + from_timestamp -= (resolution / 2) + + if err != nil { + return fmt.Errorf("[METRICSTORE]> error converting timestamp from the avro file : %s", err) + } + + // fmt.Printf("File : %s with resolution : %d\n", fileName, resolution) + + var recordCounter int64 = 0 + + // Create a new OCF reader from the buffered reader + ocfReader, err := goavro.NewOCFReader(br) + if err != nil { + panic(err) + } + + metricsData := make(map[string]schema.FloatArray) + + for ocfReader.Scan() { + datum, err := ocfReader.Read() + if err != nil { + return fmt.Errorf("[METRICSTORE]> error while reading avro file : %s", err) + } + + record, ok := datum.(map[string]interface{}) + if !ok { + panic("[METRICSTORE]> failed to assert datum as map[string]interface{}") + } + + for key, value := range record { + metricsData[key] = append(metricsData[key], 
schema.ConvertToFloat(value.(float64))) + } + + recordCounter += 1 + } + + to := (from_timestamp + (recordCounter / (60 / resolution) * 60)) + if to < from { + return nil + } + + for key, floatArray := range metricsData { + metricName := avro.ReplaceKey(key) + + if strings.Contains(metricName, avro.Delimiter) { + subString := strings.Split(metricName, avro.Delimiter) + + lvl := l + + for i := 0; i < len(subString)-1; i++ { + + sel := subString[i] + + if lvl.children == nil { + lvl.children = make(map[string]*Level) + } + + child, ok := lvl.children[sel] + if !ok { + child = &Level{ + metrics: make([]*buffer, len(m.Metrics)), + children: nil, + } + lvl.children[sel] = child + } + lvl = child + } + + leafMetricName := subString[len(subString)-1] + err = lvl.createBuffer(m, leafMetricName, floatArray, from_timestamp, resolution) + if err != nil { + return fmt.Errorf("[METRICSTORE]> error while creating buffers from avroReader : %s", err) + } + } else { + err = l.createBuffer(m, metricName, floatArray, from_timestamp, resolution) + if err != nil { + return fmt.Errorf("[METRICSTORE]> error while creating buffers from avroReader : %s", err) + } + } + + } + + return nil +} + +func (l *Level) createBuffer(m *MemoryStore, metricName string, floatArray schema.FloatArray, from int64, resolution int64) error { + n := len(floatArray) + b := &buffer{ + frequency: resolution, + start: from, + data: floatArray[0:n:n], + prev: nil, + next: nil, + archived: true, + } + b.close() + + minfo, ok := m.Metrics[metricName] + if !ok { + return nil + // return errors.New("Unkown metric: " + name) + } + + prev := l.metrics[minfo.Offset] + if prev == nil { + l.metrics[minfo.Offset] = b + } else { + if prev.start > b.start { + return errors.New("wooops") + } + + b.prev = prev + prev.next = b + + missingCount := ((int(b.start) - int(prev.start)) - len(prev.data)*int(b.frequency)) + if missingCount > 0 { + missingCount /= int(b.frequency) + + for range missingCount { + prev.data = append(prev.data, schema.NaN) + } + + prev.data = prev.data[0:len(prev.data):len(prev.data)] + } + } + l.metrics[minfo.Offset] = b + + return nil +} + +func (l *Level) loadJsonFile(m *MemoryStore, f *os.File, from int64) error { + br := bufio.NewReader(f) + cf := &CheckpointFile{} + if err := json.NewDecoder(br).Decode(cf); err != nil { + return err + } + + if cf.To != 0 && cf.To < from { + return nil + } + + if err := l.loadFile(cf, m); err != nil { + return err + } + + return nil +} + +func (l *Level) loadFile(cf *CheckpointFile, m *MemoryStore) error { + for name, metric := range cf.Metrics { + n := len(metric.Data) + b := &buffer{ + frequency: metric.Frequency, + start: metric.Start, + data: metric.Data[0:n:n], // Space is wasted here :( + prev: nil, + next: nil, + archived: true, + } + b.close() + + minfo, ok := m.Metrics[name] + if !ok { + continue + // return errors.New("Unkown metric: " + name) + } + + prev := l.metrics[minfo.Offset] + if prev == nil { + l.metrics[minfo.Offset] = b + } else { + if prev.start > b.start { + return errors.New("wooops") + } + + b.prev = prev + prev.next = b + } + l.metrics[minfo.Offset] = b + } + + if len(cf.Children) > 0 && l.children == nil { + l.children = make(map[string]*Level) + } + + for sel, childCf := range cf.Children { + child, ok := l.children[sel] + if !ok { + child = &Level{ + metrics: make([]*buffer, len(m.Metrics)), + children: nil, + } + l.children[sel] = child + } + + if err := child.loadFile(childCf, m); err != nil { + return err + } + } + + return nil +} + +func (l *Level) 
fromCheckpoint(m *MemoryStore, dir string, from int64, extension string) (int, error) { + direntries, err := os.ReadDir(dir) + if err != nil { + if os.IsNotExist(err) { + return 0, nil + } + + return 0, err + } + + allFiles := make([]fs.DirEntry, 0) + filesLoaded := 0 + for _, e := range direntries { + if e.IsDir() { + child := &Level{ + metrics: make([]*buffer, len(m.Metrics)), + children: make(map[string]*Level), + } + + files, err := child.fromCheckpoint(m, path.Join(dir, e.Name()), from, extension) + filesLoaded += files + if err != nil { + return filesLoaded, err + } + + l.children[e.Name()] = child + } else if strings.HasSuffix(e.Name(), "."+extension) { + allFiles = append(allFiles, e) + } else { + continue + } + } + + files, err := findFiles(allFiles, from, extension, true) + if err != nil { + return filesLoaded, err + } + + loaders := map[string]func(*MemoryStore, *os.File, int64) error{ + "json": l.loadJsonFile, + "avro": l.loadAvroFile, + } + + loader := loaders[extension] + + for _, filename := range files { + f, err := os.Open(path.Join(dir, filename)) + if err != nil { + return filesLoaded, err + } + defer f.Close() + + if err = loader(m, f, from); err != nil { + return filesLoaded, err + } + + filesLoaded += 1 + } + + return filesLoaded, nil +} + +// This will probably get very slow over time! +// A solution could be some sort of an index file in which all other files +// and the timespan they contain is listed. +func findFiles(direntries []fs.DirEntry, t int64, extension string, findMoreRecentFiles bool) ([]string, error) { + nums := map[string]int64{} + for _, e := range direntries { + if !strings.HasSuffix(e.Name(), "."+extension) { + continue + } + + ts, err := strconv.ParseInt(e.Name()[strings.Index(e.Name(), "_")+1:len(e.Name())-5], 10, 64) + if err != nil { + return nil, err + } + nums[e.Name()] = ts + } + + sort.Slice(direntries, func(i, j int) bool { + a, b := direntries[i], direntries[j] + return nums[a.Name()] < nums[b.Name()] + }) + + filenames := make([]string, 0) + for i := 0; i < len(direntries); i++ { + e := direntries[i] + ts1 := nums[e.Name()] + + if findMoreRecentFiles && t <= ts1 { + filenames = append(filenames, e.Name()) + } + if i == len(direntries)-1 { + continue + } + + enext := direntries[i+1] + ts2 := nums[enext.Name()] + + if findMoreRecentFiles { + if ts1 < t && t < ts2 { + filenames = append(filenames, e.Name()) + } + } else { + if ts2 < t { + filenames = append(filenames, e.Name()) + } + } + } + + return filenames, nil +} diff --git a/internal/memorystore/debug.go b/internal/memorystore/debug.go new file mode 100644 index 0000000..0f85024 --- /dev/null +++ b/internal/memorystore/debug.go @@ -0,0 +1,107 @@ +package memorystore + +import ( + "bufio" + "fmt" + "strconv" +) + +func (b *buffer) debugDump(buf []byte) []byte { + if b.prev != nil { + buf = b.prev.debugDump(buf) + } + + start, len, end := b.start, len(b.data), b.start+b.frequency*int64(len(b.data)) + buf = append(buf, `{"start":`...) + buf = strconv.AppendInt(buf, start, 10) + buf = append(buf, `,"len":`...) + buf = strconv.AppendInt(buf, int64(len), 10) + buf = append(buf, `,"end":`...) + buf = strconv.AppendInt(buf, end, 10) + if b.archived { + buf = append(buf, `,"saved":true`...) + } + if b.next != nil { + buf = append(buf, `},`...) + } else { + buf = append(buf, `}`...) 
+ } + return buf +} + +func (l *Level) debugDump(m *MemoryStore, w *bufio.Writer, lvlname string, buf []byte, depth int) ([]byte, error) { + l.lock.RLock() + defer l.lock.RUnlock() + for i := 0; i < depth; i++ { + buf = append(buf, '\t') + } + buf = append(buf, '"') + buf = append(buf, lvlname...) + buf = append(buf, "\":{\n"...) + depth += 1 + objitems := 0 + for name, mc := range m.Metrics { + if b := l.metrics[mc.Offset]; b != nil { + for i := 0; i < depth; i++ { + buf = append(buf, '\t') + } + + buf = append(buf, '"') + buf = append(buf, name...) + buf = append(buf, `":[`...) + buf = b.debugDump(buf) + buf = append(buf, "],\n"...) + objitems++ + } + } + + for name, lvl := range l.children { + _, err := w.Write(buf) + if err != nil { + return nil, err + } + + buf = buf[0:0] + buf, err = lvl.debugDump(m, w, name, buf, depth) + if err != nil { + return nil, err + } + + buf = append(buf, ',', '\n') + objitems++ + } + + // remove final `,`: + if objitems > 0 { + buf = append(buf[0:len(buf)-1], '\n') + } + + depth -= 1 + for i := 0; i < depth; i++ { + buf = append(buf, '\t') + } + buf = append(buf, '}') + return buf, nil +} + +func (m *MemoryStore) DebugDump(w *bufio.Writer, selector []string) error { + lvl := m.root.findLevel(selector) + if lvl == nil { + return fmt.Errorf("[METRICSTORE]> not found: %#v", selector) + } + + buf := make([]byte, 0, 2048) + buf = append(buf, "{"...) + + buf, err := lvl.debugDump(m, w, "data", buf, 0) + if err != nil { + return err + } + + buf = append(buf, "}\n"...) + if _, err = w.Write(buf); err != nil { + return err + } + + return w.Flush() +} diff --git a/internal/memorystore/healthcheck.go b/internal/memorystore/healthcheck.go new file mode 100644 index 0000000..d655db3 --- /dev/null +++ b/internal/memorystore/healthcheck.go @@ -0,0 +1,88 @@ +package memorystore + +import ( + "bufio" + "fmt" + "time" +) + +// This is a threshold that allows a node to be healthy with certain number of data points missing. +// Suppose a node does not receive last 5 data points, then healthCheck endpoint will still say a +// node is healthy. Anything more than 5 missing points in metrics of the node will deem the node unhealthy. +const MaxMissingDataPoints int64 = 5 + +// This is a threshold which allows upto certain number of metrics in a node to be unhealthly. +// Works with MaxMissingDataPoints. Say 5 metrics (including submetrics) do not receive the last +// MaxMissingDataPoints data points, then the node will be deemed healthy. Any more metrics that does +// not receive data for MaxMissingDataPoints data points will deem the node unhealthy. 
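To make the interaction of these two thresholds concrete, here is a standalone sketch (illustrative only, not part of the patch; the lowercase constants mirror the ones defined in this file and the `stale` helper is made up). It shows the staleness test the health check applies per metric buffer and how the per-node verdict is derived from the number of stale buffers; the constant itself follows right below.

```go
package main

import (
	"fmt"
	"time"
)

const (
	maxMissingDataPoints int64 = 5 // mirrors MaxMissingDataPoints
	maxUnhealthyMetrics  int64 = 5 // mirrors MaxUnhealthyMetrics
)

// stale reports whether a buffer whose newest sample ends at bufferEnd
// (Unix seconds) has missed more than maxMissingDataPoints samples at the
// given sample frequency.
func stale(bufferEnd, frequency, now int64) bool {
	return now-bufferEnd > maxMissingDataPoints*frequency
}

func main() {
	now := time.Now().Unix()
	freq := int64(60) // one sample per minute

	fmt.Println(stale(now-10*freq, freq, now)) // true: roughly 10 samples missing
	fmt.Println(stale(now-3*freq, freq, now))  // false: still within the allowance

	// A node is reported "Unhealthy" only once the number of stale metric
	// buffers reaches maxUnhealthyMetrics.
	staleBuffers := int64(6)
	fmt.Println(staleBuffers < maxUnhealthyMetrics) // false -> node is unhealthy
}
```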
+const MaxUnhealthyMetrics int64 = 5 + +func (b *buffer) healthCheck() int64 { + + // Check if the buffer is empty + if b.data == nil { + return 1 + } + + buffer_end := b.start + b.frequency*int64(len(b.data)) + t := time.Now().Unix() + + // Check if the buffer is too old + if t-buffer_end > MaxMissingDataPoints*b.frequency { + return 1 + } + + return 0 +} + +func (l *Level) healthCheck(m *MemoryStore, count int64) (int64, error) { + l.lock.RLock() + defer l.lock.RUnlock() + + for _, mc := range m.Metrics { + if b := l.metrics[mc.Offset]; b != nil { + count += b.healthCheck() + } + } + + for _, lvl := range l.children { + c, err := lvl.healthCheck(m, 0) + if err != nil { + return 0, err + } + count += c + } + + return count, nil +} + +func (m *MemoryStore) HealthCheck(w *bufio.Writer, selector []string) error { + lvl := m.root.findLevel(selector) + if lvl == nil { + return fmt.Errorf("[METRICSTORE]> not found: %#v", selector) + } + + buf := make([]byte, 0, 25) + // buf = append(buf, "{"...) + + var count int64 = 0 + + unhealthyMetricsCount, err := lvl.healthCheck(m, count) + if err != nil { + return err + } + + if unhealthyMetricsCount < MaxUnhealthyMetrics { + buf = append(buf, "Healthy"...) + } else { + buf = append(buf, "Unhealthy"...) + } + + // buf = append(buf, "}\n"...) + + if _, err = w.Write(buf); err != nil { + return err + } + + return w.Flush() +} diff --git a/internal/memorystore/level.go b/internal/memorystore/level.go new file mode 100644 index 0000000..76916e6 --- /dev/null +++ b/internal/memorystore/level.go @@ -0,0 +1,187 @@ +package memorystore + +import ( + "sync" + "unsafe" + + "github.com/ClusterCockpit/cc-lib/util" +) + +// Could also be called "node" as this forms a node in a tree structure. +// Called Level because "node" might be confusing here. +// Can be both a leaf or a inner node. In this tree structue, inner nodes can +// also hold data (in `metrics`). +type Level struct { + children map[string]*Level + metrics []*buffer + lock sync.RWMutex +} + +// Find the correct level for the given selector, creating it if +// it does not exist. Example selector in the context of the +// ClusterCockpit could be: []string{ "emmy", "host123", "cpu0" }. +// This function would probably benefit a lot from `level.children` beeing a `sync.Map`? +func (l *Level) findLevelOrCreate(selector []string, nMetrics int) *Level { + if len(selector) == 0 { + return l + } + + // Allow concurrent reads: + l.lock.RLock() + var child *Level + var ok bool + if l.children == nil { + // Children map needs to be created... + l.lock.RUnlock() + } else { + child, ok := l.children[selector[0]] + l.lock.RUnlock() + if ok { + return child.findLevelOrCreate(selector[1:], nMetrics) + } + } + + // The level does not exist, take write lock for unqiue access: + l.lock.Lock() + // While this thread waited for the write lock, another thread + // could have created the child node. 
+ if l.children != nil { + child, ok = l.children[selector[0]] + if ok { + l.lock.Unlock() + return child.findLevelOrCreate(selector[1:], nMetrics) + } + } + + child = &Level{ + metrics: make([]*buffer, nMetrics), + children: nil, + } + + if l.children != nil { + l.children[selector[0]] = child + } else { + l.children = map[string]*Level{selector[0]: child} + } + l.lock.Unlock() + return child.findLevelOrCreate(selector[1:], nMetrics) +} + +func (l *Level) free(t int64) (int, error) { + l.lock.Lock() + defer l.lock.Unlock() + + n := 0 + for i, b := range l.metrics { + if b != nil { + delme, m := b.free(t) + n += m + if delme { + if cap(b.data) == BUFFER_CAP { + bufferPool.Put(b) + } + l.metrics[i] = nil + } + } + } + + for _, l := range l.children { + m, err := l.free(t) + n += m + if err != nil { + return n, err + } + } + + return n, nil +} + +func (l *Level) sizeInBytes() int64 { + l.lock.RLock() + defer l.lock.RUnlock() + size := int64(0) + + for _, b := range l.metrics { + if b != nil { + size += b.count() * int64(unsafe.Sizeof(util.Float(0))) + } + } + + for _, child := range l.children { + size += child.sizeInBytes() + } + + return size +} + +func (l *Level) findLevel(selector []string) *Level { + if len(selector) == 0 { + return l + } + + l.lock.RLock() + defer l.lock.RUnlock() + + lvl := l.children[selector[0]] + if lvl == nil { + return nil + } + + return lvl.findLevel(selector[1:]) +} + +func (l *Level) findBuffers(selector util.Selector, offset int, f func(b *buffer) error) error { + l.lock.RLock() + defer l.lock.RUnlock() + + if len(selector) == 0 { + b := l.metrics[offset] + if b != nil { + return f(b) + } + + for _, lvl := range l.children { + err := lvl.findBuffers(nil, offset, f) + if err != nil { + return err + } + } + return nil + } + + sel := selector[0] + if len(sel.String) != 0 && l.children != nil { + lvl, ok := l.children[sel.String] + if ok { + err := lvl.findBuffers(selector[1:], offset, f) + if err != nil { + return err + } + } + return nil + } + + if sel.Group != nil && l.children != nil { + for _, key := range sel.Group { + lvl, ok := l.children[key] + if ok { + err := lvl.findBuffers(selector[1:], offset, f) + if err != nil { + return err + } + } + } + return nil + } + + if sel.Any && l.children != nil { + for _, lvl := range l.children { + if err := lvl.findBuffers(selector[1:], offset, f); err != nil { + return err + } + } + return nil + } + + return nil +} diff --git a/internal/memorystore/lineprotocol.go b/internal/memorystore/lineprotocol.go new file mode 100644 index 0000000..495197d --- /dev/null +++ b/internal/memorystore/lineprotocol.go @@ -0,0 +1,347 @@ +package memorystore + +import ( + "context" + "fmt" + "log" + "sync" + "time" + + "github.com/ClusterCockpit/cc-backend/internal/avro" + "github.com/ClusterCockpit/cc-backend/internal/config" + "github.com/ClusterCockpit/cc-lib/schema" + "github.com/influxdata/line-protocol/v2/lineprotocol" + "github.com/nats-io/nats.go" +) + +// Each connection is handled in it's own goroutine. This is a blocking function. 
+// func ReceiveRaw(ctx context.Context, +// listener net.Listener, +// handleLine func(*lineprotocol.Decoder, string) error, +// ) error { +// var wg sync.WaitGroup + +// wg.Add(1) +// go func() { +// defer wg.Done() +// <-ctx.Done() +// if err := listener.Close(); err != nil { +// log.Printf("listener.Close(): %s", err.Error()) +// } +// }() + +// for { +// conn, err := listener.Accept() +// if err != nil { +// if errors.Is(err, net.ErrClosed) { +// break +// } + +// log.Printf("listener.Accept(): %s", err.Error()) +// } + +// wg.Add(2) +// go func() { +// defer wg.Done() +// defer conn.Close() + +// dec := lineprotocol.NewDecoder(conn) +// connctx, cancel := context.WithCancel(context.Background()) +// defer cancel() +// go func() { +// defer wg.Done() +// select { +// case <-connctx.Done(): +// conn.Close() +// case <-ctx.Done(): +// conn.Close() +// } +// }() + +// if err := handleLine(dec, "default"); err != nil { +// if errors.Is(err, net.ErrClosed) { +// return +// } + +// log.Printf("%s: %s", conn.RemoteAddr().String(), err.Error()) +// errmsg := make([]byte, 128) +// errmsg = append(errmsg, `error: `...) +// errmsg = append(errmsg, err.Error()...) +// errmsg = append(errmsg, '\n') +// conn.Write(errmsg) +// } +// }() +// } + +// wg.Wait() +// return nil +// } + +// Connect to a nats server and subscribe to "updates". This is a blocking +// function. handleLine will be called for each line recieved via nats. +// Send `true` through the done channel for gracefull termination. +func ReceiveNats(conf *(config.NatsConfig), + ms *MemoryStore, + workers int, + ctx context.Context, +) error { + var opts []nats.Option + if conf.Username != "" && conf.Password != "" { + opts = append(opts, nats.UserInfo(conf.Username, conf.Password)) + } + + if conf.Credsfilepath != "" { + opts = append(opts, nats.UserCredentials(conf.Credsfilepath)) + } + + nc, err := nats.Connect(conf.Address, opts...) + if err != nil { + return err + } + defer nc.Close() + + var wg sync.WaitGroup + var subs []*nats.Subscription + + msgs := make(chan *nats.Msg, workers*2) + + for _, sc := range conf.Subscriptions { + clusterTag := sc.ClusterTag + var sub *nats.Subscription + if workers > 1 { + wg.Add(workers) + + for range workers { + go func() { + for m := range msgs { + dec := lineprotocol.NewDecoderWithBytes(m.Data) + if err := decodeLine(dec, ms, clusterTag); err != nil { + log.Printf("error: %s\n", err.Error()) + } + } + + wg.Done() + }() + } + + sub, err = nc.Subscribe(sc.SubscribeTo, func(m *nats.Msg) { + msgs <- m + }) + } else { + sub, err = nc.Subscribe(sc.SubscribeTo, func(m *nats.Msg) { + dec := lineprotocol.NewDecoderWithBytes(m.Data) + if err := decodeLine(dec, ms, clusterTag); err != nil { + log.Printf("error: %s\n", err.Error()) + } + }) + } + + if err != nil { + return err + } + log.Printf("NATS subscription to '%s' on '%s' established\n", sc.SubscribeTo, conf.Address) + subs = append(subs, sub) + } + + <-ctx.Done() + for _, sub := range subs { + err = sub.Unsubscribe() + if err != nil { + log.Printf("NATS unsubscribe failed: %s", err.Error()) + } + } + close(msgs) + wg.Wait() + + nc.Close() + log.Println("NATS connection closed") + return nil +} + +// Place `prefix` in front of `buf` but if possible, +// do that inplace in `buf`. +func reorder(buf, prefix []byte) []byte { + n := len(prefix) + m := len(buf) + if cap(buf) < m+n { + return append(prefix[:n:n], buf...) 
+ } else { + buf = buf[:n+m] + for i := m - 1; i >= 0; i-- { + buf[i+n] = buf[i] + } + for i := 0; i < n; i++ { + buf[i] = prefix[i] + } + return buf + } +} + +// Decode lines using dec and make write calls to the MemoryStore. +// If a line is missing its cluster tag, use clusterDefault as default. +func decodeLine(dec *lineprotocol.Decoder, + ms *MemoryStore, + clusterDefault string, +) error { + // Reduce allocations in loop: + t := time.Now() + metric, metricBuf := Metric{}, make([]byte, 0, 16) + selector := make([]string, 0, 4) + typeBuf, subTypeBuf := make([]byte, 0, 16), make([]byte, 0) + + // Optimize for the case where all lines in a "batch" are about the same + // cluster and host. By using `WriteToLevel` (level = host), we do not need + // to take the root- and cluster-level lock as often. + var lvl *Level = nil + prevCluster, prevHost := "", "" + + var ok bool + for dec.Next() { + rawmeasurement, err := dec.Measurement() + if err != nil { + return err + } + + // Needs to be copied because another call to dec.* would + // invalidate the returned slice. + metricBuf = append(metricBuf[:0], rawmeasurement...) + + // The go compiler optimizes map[string(byteslice)] lookups: + metric.MetricConfig, ok = ms.Metrics[string(rawmeasurement)] + if !ok { + continue + } + + typeBuf, subTypeBuf := typeBuf[:0], subTypeBuf[:0] + cluster, host := clusterDefault, "" + for { + key, val, err := dec.NextTag() + if err != nil { + return err + } + if key == nil { + break + } + + // The go compiler optimizes string([]byte{...}) == "...": + switch string(key) { + case "cluster": + if string(val) == prevCluster { + cluster = prevCluster + } else { + cluster = string(val) + lvl = nil + } + case "hostname", "host": + if string(val) == prevHost { + host = prevHost + } else { + host = string(val) + lvl = nil + } + case "type": + if string(val) == "node" { + break + } + + // We cannot be sure that the "type" tag comes before the "type-id" tag: + if len(typeBuf) == 0 { + typeBuf = append(typeBuf, val...) + } else { + typeBuf = reorder(typeBuf, val) + } + case "type-id": + typeBuf = append(typeBuf, val...) + case "subtype": + // We cannot be sure that the "subtype" tag comes before the "stype-id" tag: + if len(subTypeBuf) == 0 { + subTypeBuf = append(subTypeBuf, val...) + } else { + subTypeBuf = reorder(subTypeBuf, val) + // subTypeBuf = reorder(typeBuf, val) + } + case "stype-id": + subTypeBuf = append(subTypeBuf, val...) 
+ default: + // Ignore unkown tags (cc-metric-collector might send us a unit for example that we do not need) + // return fmt.Errorf("unkown tag: '%s' (value: '%s')", string(key), string(val)) + } + } + + // If the cluster or host changed, the lvl was set to nil + if lvl == nil { + selector = selector[:2] + selector[0], selector[1] = cluster, host + lvl = ms.GetLevel(selector) + prevCluster, prevHost = cluster, host + } + + // subtypes: + selector = selector[:0] + if len(typeBuf) > 0 { + selector = append(selector, string(typeBuf)) // <- Allocation :( + if len(subTypeBuf) > 0 { + selector = append(selector, string(subTypeBuf)) + } + } + + for { + key, val, err := dec.NextField() + if err != nil { + return err + } + + if key == nil { + break + } + + if string(key) != "value" { + return fmt.Errorf("host %s: unknown field: '%s' (value: %#v)", host, string(key), val) + } + + if val.Kind() == lineprotocol.Float { + metric.Value = schema.Float(val.FloatV()) + } else if val.Kind() == lineprotocol.Int { + metric.Value = schema.Float(val.IntV()) + } else if val.Kind() == lineprotocol.Uint { + metric.Value = schema.Float(val.UintV()) + } else { + return fmt.Errorf("host %s: unsupported value type in message: %s", host, val.Kind().String()) + } + } + + if t, err = dec.Time(lineprotocol.Second, t); err != nil { + t = time.Now() + if t, err = dec.Time(lineprotocol.Millisecond, t); err != nil { + t = time.Now() + if t, err = dec.Time(lineprotocol.Microsecond, t); err != nil { + t = time.Now() + if t, err = dec.Time(lineprotocol.Nanosecond, t); err != nil { + return fmt.Errorf("host %s: timestamp : %#v with error : %#v", host, t, err.Error()) + } + } + } + } + + if err != nil { + return fmt.Errorf("host %s: timestamp : %#v with error : %#v", host, t, err.Error()) + } + + time := t.Unix() + + if config.MetricStoreKeys.Checkpoints.FileFormat != "json" { + avro.LineProtocolMessages <- &avro.AvroStruct{ + MetricName: string(metricBuf), + Cluster: cluster, + Node: host, + Selector: append([]string{}, selector...), + Value: metric.Value, + Timestamp: time} + } + + if err := ms.WriteToLevel(lvl, selector, time, []Metric{metric}); err != nil { + return err + } + } + return nil +} diff --git a/internal/memorystore/memorystore.go b/internal/memorystore/memorystore.go new file mode 100644 index 0000000..4a631c2 --- /dev/null +++ b/internal/memorystore/memorystore.go @@ -0,0 +1,446 @@ +package memorystore + +import ( + "context" + "errors" + "log" + "os" + "os/signal" + "runtime" + "sync" + "syscall" + "time" + + "github.com/ClusterCockpit/cc-backend/internal/avro" + "github.com/ClusterCockpit/cc-backend/internal/config" + "github.com/ClusterCockpit/cc-lib/resampler" + "github.com/ClusterCockpit/cc-lib/runtimeEnv" + "github.com/ClusterCockpit/cc-lib/schema" + "github.com/ClusterCockpit/cc-lib/util" +) + +var ( + singleton sync.Once + msInstance *MemoryStore +) + +var Clusters = make([]string, 0) + +var NumWorkers int = 4 + +func init() { + maxWorkers := 10 + NumWorkers = runtime.NumCPU()/2 + 1 + if NumWorkers > maxWorkers { + NumWorkers = maxWorkers + } +} + +type Metric struct { + Name string + Value schema.Float + MetricConfig config.MetricConfig +} + +type MemoryStore struct { + Metrics map[string]config.MetricConfig + root Level +} + +func Init(wg *sync.WaitGroup) { + startupTime := time.Now() + + //Pass the config.MetricStoreKeys + InitMetrics(config.Metrics) + + ms := GetMemoryStore() + + d, err := time.ParseDuration(config.MetricStoreKeys.Checkpoints.Restore) + if err != nil { + log.Fatal(err) + } + + 
restoreFrom := startupTime.Add(-d) + log.Printf("[METRICSTORE]> Loading checkpoints newer than %s\n", restoreFrom.Format(time.RFC3339)) + files, err := ms.FromCheckpointFiles(config.MetricStoreKeys.Checkpoints.RootDir, restoreFrom.Unix()) + loadedData := ms.SizeInBytes() / 1024 / 1024 // In MB + if err != nil { + log.Fatalf("[METRICSTORE]> Loading checkpoints failed: %s\n", err.Error()) + } else { + log.Printf("[METRICSTORE]> Checkpoints loaded (%d files, %d MB, that took %fs)\n", files, loadedData, time.Since(startupTime).Seconds()) + } + + // Try to use less memory by forcing a GC run here and then + // lowering the target percentage. The default of 100 means + // that only once the ratio of new allocations execeds the + // previously active heap, a GC is triggered. + // Forcing a GC here will set the "previously active heap" + // to a minumum. + runtime.GC() + + ctx, shutdown := context.WithCancel(context.Background()) + + wg.Add(4) + + Retention(wg, ctx) + Checkpointing(wg, ctx) + Archiving(wg, ctx) + avro.DataStaging(wg, ctx) + + wg.Add(1) + sigs := make(chan os.Signal, 1) + signal.Notify(sigs, syscall.SIGINT, syscall.SIGTERM) + go func() { + defer wg.Done() + <-sigs + runtimeEnv.SystemdNotifiy(false, "[METRICSTORE]> Shutting down ...") + shutdown() + }() + + if config.MetricStoreKeys.Nats != nil { + for _, natsConf := range config.MetricStoreKeys.Nats { + // TODO: When multiple nats configs share a URL, do a single connect. + wg.Add(1) + nc := natsConf + go func() { + // err := ReceiveNats(conf.Nats, decodeLine, runtime.NumCPU()-1, ctx) + err := ReceiveNats(nc, ms, 1, ctx) + if err != nil { + log.Fatal(err) + } + wg.Done() + }() + } + } +} + +// Create a new, initialized instance of a MemoryStore. +// Will panic if values in the metric configurations are invalid. +func InitMetrics(metrics map[string]config.MetricConfig) { + singleton.Do(func() { + offset := 0 + for key, cfg := range metrics { + if cfg.Frequency == 0 { + panic("[METRICSTORE]> invalid frequency") + } + + metrics[key] = config.MetricConfig{ + Frequency: cfg.Frequency, + Aggregation: cfg.Aggregation, + Offset: offset, + } + offset += 1 + } + + msInstance = &MemoryStore{ + root: Level{ + metrics: make([]*buffer, len(metrics)), + children: make(map[string]*Level), + }, + Metrics: metrics, + } + }) +} + +func GetMemoryStore() *MemoryStore { + if msInstance == nil { + log.Fatalf("[METRICSTORE]> MemoryStore not initialized!") + } + + return msInstance +} + +func Shutdown() { + log.Printf("[METRICSTORE]> Writing to '%s'...\n", config.MetricStoreKeys.Checkpoints.RootDir) + var files int + var err error + + ms := GetMemoryStore() + + if config.MetricStoreKeys.Checkpoints.FileFormat == "json" { + files, err = ms.ToCheckpoint(config.MetricStoreKeys.Checkpoints.RootDir, lastCheckpoint.Unix(), time.Now().Unix()) + } else { + files, err = avro.GetAvroStore().ToCheckpoint(config.MetricStoreKeys.Checkpoints.RootDir, true) + close(avro.LineProtocolMessages) + } + + if err != nil { + log.Printf("[METRICSTORE]> Writing checkpoint failed: %s\n", err.Error()) + } + log.Printf("[METRICSTORE]> Done! 
(%d files written)\n", files) + + // ms.PrintHeirarchy() +} + +// func (m *MemoryStore) PrintHeirarchy() { +// m.root.lock.Lock() +// defer m.root.lock.Unlock() + +// fmt.Printf("Root : \n") + +// for lvl1, sel1 := range m.root.children { +// fmt.Printf("\t%s\n", lvl1) +// for lvl2, sel2 := range sel1.children { +// fmt.Printf("\t\t%s\n", lvl2) +// if lvl1 == "fritz" && lvl2 == "f0201" { + +// for name, met := range m.Metrics { +// mt := sel2.metrics[met.Offset] + +// fmt.Printf("\t\t\t\t%s\n", name) +// fmt.Printf("\t\t\t\t") + +// for mt != nil { +// // if name == "cpu_load" { +// fmt.Printf("%d(%d) -> %#v", mt.start, len(mt.data), mt.data) +// // } +// mt = mt.prev +// } +// fmt.Printf("\n") + +// } +// } +// for lvl3, sel3 := range sel2.children { +// if lvl1 == "fritz" && lvl2 == "f0201" && lvl3 == "hwthread70" { + +// fmt.Printf("\t\t\t\t\t%s\n", lvl3) + +// for name, met := range m.Metrics { +// mt := sel3.metrics[met.Offset] + +// fmt.Printf("\t\t\t\t\t\t%s\n", name) + +// fmt.Printf("\t\t\t\t\t\t") + +// for mt != nil { +// // if name == "clock" { +// fmt.Printf("%d(%d) -> %#v", mt.start, len(mt.data), mt.data) + +// mt = mt.prev +// } +// fmt.Printf("\n") + +// } + +// // for i, _ := range sel3.metrics { +// // fmt.Printf("\t\t\t\t\t%s\n", getName(configmetrics, i)) +// // } +// } +// } +// } +// } + +// } + +func getName(m *MemoryStore, i int) string { + for key, val := range m.Metrics { + if val.Offset == i { + return key + } + } + return "" +} + +func Retention(wg *sync.WaitGroup, ctx context.Context) { + ms := GetMemoryStore() + + go func() { + defer wg.Done() + d, err := time.ParseDuration(config.MetricStoreKeys.RetentionInMemory) + if err != nil { + log.Fatal(err) + } + if d <= 0 { + return + } + + ticks := func() <-chan time.Time { + d := d / 2 + if d <= 0 { + return nil + } + return time.NewTicker(d).C + }() + for { + select { + case <-ctx.Done(): + return + case <-ticks: + t := time.Now().Add(-d) + log.Printf("[METRICSTORE]> start freeing buffers (older than %s)...\n", t.Format(time.RFC3339)) + freed, err := ms.Free(nil, t.Unix()) + if err != nil { + log.Printf("[METRICSTORE]> freeing up buffers failed: %s\n", err.Error()) + } else { + log.Printf("[METRICSTORE]> done: %d buffers freed\n", freed) + } + } + } + }() +} + +// Write all values in `metrics` to the level specified by `selector` for time `ts`. +// Look at `findLevelOrCreate` for how selectors work. +func (m *MemoryStore) Write(selector []string, ts int64, metrics []Metric) error { + var ok bool + for i, metric := range metrics { + if metric.MetricConfig.Frequency == 0 { + metric.MetricConfig, ok = m.Metrics[metric.Name] + if !ok { + metric.MetricConfig.Frequency = 0 + } + metrics[i] = metric + } + } + + return m.WriteToLevel(&m.root, selector, ts, metrics) +} + +func (m *MemoryStore) GetLevel(selector []string) *Level { + return m.root.findLevelOrCreate(selector, len(m.Metrics)) +} + +// Assumes that `minfo` in `metrics` is filled in! 
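A usage sketch for the write path defined below: callers resolve the host level once via GetLevel and then write batches relative to it, so repeated writes only take the per-host lock. This is illustrative only and not part of the patch; it assumes compilation inside the cc-backend module (the packages are internal), and the cluster, host and metric names are made up.

```go
package example

import (
	"time"

	"github.com/ClusterCockpit/cc-backend/internal/memorystore"
	"github.com/ClusterCockpit/cc-lib/schema"
)

func writeSample() error {
	ms := memorystore.GetMemoryStore()

	// Resolve the host level once; subsequent writes for this host avoid
	// the root- and cluster-level locks.
	hostLvl := ms.GetLevel([]string{"fritz", "f0201"})

	// WriteToLevel expects MetricConfig to be filled in already.
	mc, ok := ms.Metrics["cpu_load"]
	if !ok {
		return nil // unknown metrics are skipped, as in decodeLine
	}

	metrics := []memorystore.Metric{
		{Name: "cpu_load", Value: schema.Float(1.5), MetricConfig: mc},
	}

	// An empty selector writes at host (node) scope; something like
	// []string{"hwthread0"} would write one level below the host.
	return ms.WriteToLevel(hostLvl, nil, time.Now().Unix(), metrics)
}
```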
+func (m *MemoryStore) WriteToLevel(l *Level, selector []string, ts int64, metrics []Metric) error { + l = l.findLevelOrCreate(selector, len(m.Metrics)) + l.lock.Lock() + defer l.lock.Unlock() + + for _, metric := range metrics { + if metric.MetricConfig.Frequency == 0 { + continue + } + + b := l.metrics[metric.MetricConfig.Offset] + if b == nil { + // First write to this metric and level + b = newBuffer(ts, metric.MetricConfig.Frequency) + l.metrics[metric.MetricConfig.Offset] = b + } + + nb, err := b.write(ts, metric.Value) + if err != nil { + return err + } + + // Last write created a new buffer... + if b != nb { + l.metrics[metric.MetricConfig.Offset] = nb + } + } + return nil +} + +// Returns all values for metric `metric` from `from` to `to` for the selected level(s). +// If the level does not hold the metric itself, the data will be aggregated recursively from the children. +// The second and third return value are the actual from/to for the data. Those can be different from +// the range asked for if no data was available. +func (m *MemoryStore) Read(selector util.Selector, metric string, from, to, resolution int64) ([]schema.Float, int64, int64, int64, error) { + if from > to { + return nil, 0, 0, 0, errors.New("[METRICSTORE]> invalid time range\n") + } + + minfo, ok := m.Metrics[metric] + if !ok { + return nil, 0, 0, 0, errors.New("[METRICSTORE]> unkown metric: \n" + metric) + } + + n, data := 0, make([]schema.Float, (to-from)/minfo.Frequency+1) + + err := m.root.findBuffers(selector, minfo.Offset, func(b *buffer) error { + cdata, cfrom, cto, err := b.read(from, to, data) + if err != nil { + return err + } + + if n == 0 { + from, to = cfrom, cto + } else if from != cfrom || to != cto || len(data) != len(cdata) { + missingfront, missingback := int((from-cfrom)/minfo.Frequency), int((to-cto)/minfo.Frequency) + if missingfront != 0 { + return ErrDataDoesNotAlign + } + + newlen := len(cdata) - missingback + if newlen < 1 { + return ErrDataDoesNotAlign + } + cdata = cdata[0:newlen] + if len(cdata) != len(data) { + return ErrDataDoesNotAlign + } + + from, to = cfrom, cto + } + + data = cdata + n += 1 + return nil + }) + + if err != nil { + return nil, 0, 0, 0, err + } else if n == 0 { + return nil, 0, 0, 0, errors.New("[METRICSTORE]> metric or host not found\n") + } else if n > 1 { + if minfo.Aggregation == config.AvgAggregation { + normalize := 1. / schema.Float(n) + for i := 0; i < len(data); i++ { + data[i] *= normalize + } + } else if minfo.Aggregation != config.SumAggregation { + return nil, 0, 0, 0, errors.New("[METRICSTORE]> invalid aggregation") + } + } + + data, resolution, err = resampler.LargestTriangleThreeBucket(data, minfo.Frequency, resolution) + if err != nil { + return nil, 0, 0, 0, err + } + + return data, from, to, resolution, nil +} + +// Release all buffers for the selected level and all its children that contain only +// values older than `t`. +func (m *MemoryStore) Free(selector []string, t int64) (int, error) { + return m.GetLevel(selector).free(t) +} + +func (m *MemoryStore) FreeAll() error { + for k := range m.root.children { + delete(m.root.children, k) + } + + return nil +} + +func (m *MemoryStore) SizeInBytes() int64 { + return m.root.sizeInBytes() +} + +// Given a selector, return a list of all children of the level selected. 
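A usage sketch for the traversal helper below (ListChildren) combined with Read (illustrative, not part of the patch; it assumes compilation inside the cc-backend module, that util.Selector elements expose the String field used by findBuffers, and made-up cluster and metric names): list the hosts of a cluster, then read an hour of node-level data for each.

```go
package example

import (
	"fmt"
	"time"

	"github.com/ClusterCockpit/cc-backend/internal/memorystore"
	"github.com/ClusterCockpit/cc-lib/util"
)

func readClusterHosts() error {
	ms := memorystore.GetMemoryStore()

	to := time.Now().Unix()
	from := to - 3600

	// Children of the cluster level are the hosts currently known to the store.
	for _, host := range ms.ListChildren([]string{"fritz"}) {
		sel := util.Selector{{String: "fritz"}, {String: host}}

		// If the host level does not hold "cpu_load" itself, Read aggregates
		// over the children (sum or average, depending on the metric config).
		data, cfrom, cto, res, err := ms.Read(sel, "cpu_load", from, to, 60)
		if err != nil {
			continue // e.g. no data for this host in the requested window
		}
		fmt.Printf("%s: %d points (%d-%d, resolution %ds)\n", host, len(data), cfrom, cto, res)
	}
	return nil
}
```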
+func (m *MemoryStore) ListChildren(selector []string) []string { + lvl := &m.root + for lvl != nil && len(selector) != 0 { + lvl.lock.RLock() + next := lvl.children[selector[0]] + lvl.lock.RUnlock() + lvl = next + selector = selector[1:] + } + + if lvl == nil { + return nil + } + + lvl.lock.RLock() + defer lvl.lock.RUnlock() + + children := make([]string, 0, len(lvl.children)) + for child := range lvl.children { + children = append(children, child) + } + + return children +} diff --git a/internal/memorystore/stats.go b/internal/memorystore/stats.go new file mode 100644 index 0000000..1066dcb --- /dev/null +++ b/internal/memorystore/stats.go @@ -0,0 +1,120 @@ +package memorystore + +import ( + "errors" + "math" + + "github.com/ClusterCockpit/cc-backend/internal/config" + "github.com/ClusterCockpit/cc-lib/util" +) + +type Stats struct { + Samples int + Avg util.Float + Min util.Float + Max util.Float +} + +func (b *buffer) stats(from, to int64) (Stats, int64, int64, error) { + if from < b.start { + if b.prev != nil { + return b.prev.stats(from, to) + } + from = b.start + } + + // TODO: Check if b.closed and if so and the full buffer is queried, + // use b.statistics instead of iterating over the buffer. + + samples := 0 + sum, min, max := 0.0, math.MaxFloat32, -math.MaxFloat32 + + var t int64 + for t = from; t < to; t += b.frequency { + idx := int((t - b.start) / b.frequency) + if idx >= cap(b.data) { + b = b.next + if b == nil { + break + } + idx = 0 + } + + if t < b.start || idx >= len(b.data) { + continue + } + + xf := float64(b.data[idx]) + if math.IsNaN(xf) { + continue + } + + samples += 1 + sum += xf + min = math.Min(min, xf) + max = math.Max(max, xf) + } + + return Stats{ + Samples: samples, + Avg: util.Float(sum) / util.Float(samples), + Min: util.Float(min), + Max: util.Float(max), + }, from, t, nil +} + +// Returns statistics for the requested metric on the selected node/level. +// Data is aggregated to the selected level the same way as in `MemoryStore.Read`. +// If `Stats.Samples` is zero, the statistics should not be considered as valid. 
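A short usage sketch for the Stats implementation that follows (illustrative, not part of the patch; same module and selector assumptions as above, names made up). Note how Samples is checked before the aggregates are trusted.

```go
package example

import (
	"fmt"
	"time"

	"github.com/ClusterCockpit/cc-backend/internal/memorystore"
	"github.com/ClusterCockpit/cc-lib/util"
)

func printNodeStats() error {
	ms := memorystore.GetMemoryStore()
	to := time.Now().Unix()

	sel := util.Selector{{String: "fritz"}, {String: "f0201"}}
	stats, cfrom, cto, err := ms.Stats(sel, "cpu_load", to-3600, to)
	if err != nil {
		return err
	}
	if stats.Samples == 0 {
		// Zero samples means nothing usable was found in the window.
		return fmt.Errorf("no samples for cpu_load between %d and %d", cfrom, cto)
	}

	fmt.Printf("avg=%.2f min=%.2f max=%.2f (%d samples)\n",
		float64(stats.Avg), float64(stats.Min), float64(stats.Max), stats.Samples)
	return nil
}
```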
+func (m *MemoryStore) Stats(selector util.Selector, metric string, from, to int64) (*Stats, int64, int64, error) { + if from > to { + return nil, 0, 0, errors.New("invalid time range") + } + + minfo, ok := m.Metrics[metric] + if !ok { + return nil, 0, 0, errors.New("unkown metric: " + metric) + } + + n, samples := 0, 0 + avg, min, max := util.Float(0), math.MaxFloat32, -math.MaxFloat32 + err := m.root.findBuffers(selector, minfo.Offset, func(b *buffer) error { + stats, cfrom, cto, err := b.stats(from, to) + if err != nil { + return err + } + + if n == 0 { + from, to = cfrom, cto + } else if from != cfrom || to != cto { + return ErrDataDoesNotAlign + } + + samples += stats.Samples + avg += stats.Avg + min = math.Min(min, float64(stats.Min)) + max = math.Max(max, float64(stats.Max)) + n += 1 + return nil + }) + if err != nil { + return nil, 0, 0, err + } + + if n == 0 { + return nil, 0, 0, ErrNoData + } + + if minfo.Aggregation == config.AvgAggregation { + avg /= util.Float(n) + } else if n > 1 && minfo.Aggregation != config.SumAggregation { + return nil, 0, 0, errors.New("invalid aggregation") + } + + return &Stats{ + Samples: samples, + Avg: avg, + Min: util.Float(min), + Max: util.Float(max), + }, from, to, nil +} diff --git a/internal/metricDataDispatcher/dataLoader.go b/internal/metricDataDispatcher/dataLoader.go index 2b73e11..4f8e3b5 100644 --- a/internal/metricDataDispatcher/dataLoader.go +++ b/internal/metricDataDispatcher/dataLoader.go @@ -91,14 +91,14 @@ func LoadData(job *schema.Job, // Pass the resolution from frontend here. for _, v := range jd { for _, v_ := range v { - timestep := 0 + timestep := int64(0) for i := 0; i < len(v_.Series); i += 1 { - v_.Series[i].Data, timestep, err = resampler.LargestTriangleThreeBucket(v_.Series[i].Data, v_.Timestep, resolution) + v_.Series[i].Data, timestep, err = resampler.LargestTriangleThreeBucket(v_.Series[i].Data, int64(v_.Timestep), int64(resolution)) if err != nil { return err, 0, 0 } } - v_.Timestep = timestep + v_.Timestep = int(timestep) } } diff --git a/internal/metricdata/cc-metric-store.go b/internal/metricdata/cc-metric-store.go index 36c0dd7..d8cef4d 100644 --- a/internal/metricdata/cc-metric-store.go +++ b/internal/metricdata/cc-metric-store.go @@ -5,23 +5,22 @@ package metricdata import ( - "bufio" - "bytes" "context" "encoding/json" "fmt" - "net/http" "sort" "strconv" "strings" "time" "github.com/ClusterCockpit/cc-backend/internal/graph/model" + "github.com/ClusterCockpit/cc-backend/internal/memorystore" "github.com/ClusterCockpit/cc-backend/pkg/archive" cclog "github.com/ClusterCockpit/cc-lib/ccLogger" "github.com/ClusterCockpit/cc-lib/schema" ) +// Bloat Code type CCMetricStoreConfig struct { Kind string `json:"kind"` Url string `json:"url"` @@ -33,141 +32,16 @@ type CCMetricStoreConfig struct { Renamings map[string]string `json:"metricRenamings"` } +// Bloat Code type CCMetricStore struct { - here2there map[string]string - there2here map[string]string - client http.Client - jwt string - url string - queryEndpoint string -} - -type ApiQueryRequest struct { - Cluster string `json:"cluster"` - Queries []ApiQuery `json:"queries"` - ForAllNodes []string `json:"for-all-nodes"` - From int64 `json:"from"` - To int64 `json:"to"` - WithStats bool `json:"with-stats"` - WithData bool `json:"with-data"` -} - -type ApiQuery struct { - Type *string `json:"type,omitempty"` - SubType *string `json:"subtype,omitempty"` - Metric string `json:"metric"` - Hostname string `json:"host"` - Resolution int `json:"resolution"` - TypeIds []string 
`json:"type-ids,omitempty"` - SubTypeIds []string `json:"subtype-ids,omitempty"` - Aggregate bool `json:"aggreg"` -} - -type ApiQueryResponse struct { - Queries []ApiQuery `json:"queries,omitempty"` - Results [][]ApiMetricData `json:"results"` -} - -type ApiMetricData struct { - Error *string `json:"error"` - Data []schema.Float `json:"data"` - From int64 `json:"from"` - To int64 `json:"to"` - Resolution int `json:"resolution"` - Avg schema.Float `json:"avg"` - Min schema.Float `json:"min"` - Max schema.Float `json:"max"` } +// Bloat Code func (ccms *CCMetricStore) Init(rawConfig json.RawMessage) error { - var config CCMetricStoreConfig - if err := json.Unmarshal(rawConfig, &config); err != nil { - cclog.Warn("Error while unmarshaling raw json config") - return err - } - - ccms.url = config.Url - ccms.queryEndpoint = fmt.Sprintf("%s/api/query", config.Url) - ccms.jwt = config.Token - ccms.client = http.Client{ - Timeout: 10 * time.Second, - } - - if config.Renamings != nil { - ccms.here2there = config.Renamings - ccms.there2here = make(map[string]string, len(config.Renamings)) - for k, v := range ccms.here2there { - ccms.there2here[v] = k - } - } else { - ccms.here2there = make(map[string]string) - ccms.there2here = make(map[string]string) - } return nil } -func (ccms *CCMetricStore) toRemoteName(metric string) string { - if renamed, ok := ccms.here2there[metric]; ok { - return renamed - } - - return metric -} - -func (ccms *CCMetricStore) toLocalName(metric string) string { - if renamed, ok := ccms.there2here[metric]; ok { - return renamed - } - - return metric -} - -func (ccms *CCMetricStore) doRequest( - ctx context.Context, - body *ApiQueryRequest, -) (*ApiQueryResponse, error) { - buf := &bytes.Buffer{} - if err := json.NewEncoder(buf).Encode(body); err != nil { - cclog.Errorf("Error while encoding request body: %s", err.Error()) - return nil, err - } - - req, err := http.NewRequestWithContext(ctx, http.MethodGet, ccms.queryEndpoint, buf) - if err != nil { - cclog.Errorf("Error while building request body: %s", err.Error()) - return nil, err - } - if ccms.jwt != "" { - req.Header.Add("Authorization", fmt.Sprintf("Bearer %s", ccms.jwt)) - } - - // versioning the cc-metric-store query API. 
- // v2 = data with resampling - // v1 = data without resampling - q := req.URL.Query() - q.Add("version", "v2") - req.URL.RawQuery = q.Encode() - - res, err := ccms.client.Do(req) - if err != nil { - cclog.Errorf("Error while performing request: %s", err.Error()) - return nil, err - } - - if res.StatusCode != http.StatusOK { - return nil, fmt.Errorf("'%s': HTTP Status: %s", ccms.queryEndpoint, res.Status) - } - - var resBody ApiQueryResponse - if err := json.NewDecoder(bufio.NewReader(res.Body)).Decode(&resBody); err != nil { - cclog.Errorf("Error while decoding result body: %s", err.Error()) - return nil, err - } - - return &resBody, nil -} - func (ccms *CCMetricStore) LoadData( job *schema.Job, metrics []string, @@ -175,13 +49,13 @@ func (ccms *CCMetricStore) LoadData( ctx context.Context, resolution int, ) (schema.JobData, error) { - queries, assignedScope, err := ccms.buildQueries(job, metrics, scopes, resolution) + queries, assignedScope, err := ccms.buildQueries(job, metrics, scopes, int64(resolution)) if err != nil { cclog.Errorf("Error while building queries for jobId %d, Metrics %v, Scopes %v: %s", job.JobID, metrics, scopes, err.Error()) return nil, err } - req := ApiQueryRequest{ + req := memorystore.ApiQueryRequest{ Cluster: job.Cluster, From: job.StartTime, To: job.StartTime + int64(job.Duration), @@ -190,9 +64,9 @@ func (ccms *CCMetricStore) LoadData( WithData: true, } - resBody, err := ccms.doRequest(ctx, &req) + resBody, err := memorystore.FetchData(req) if err != nil { - cclog.Errorf("Error while performing request: %s", err.Error()) + cclog.Errorf("Error while fetching data : %s", err.Error()) return nil, err } @@ -200,7 +74,7 @@ func (ccms *CCMetricStore) LoadData( jobData := make(schema.JobData) for i, row := range resBody.Results { query := req.Queries[i] - metric := ccms.toLocalName(query.Metric) + metric := query.Metric scope := assignedScope[i] mc := archive.GetMetricConfig(job.Cluster, metric) if _, ok := jobData[metric]; !ok { @@ -209,7 +83,7 @@ func (ccms *CCMetricStore) LoadData( res := mc.Timestep if len(row) > 0 { - res = row[0].Resolution + res = int(row[0].Resolution) } jobMetric, ok := jobData[metric][scope] @@ -282,9 +156,9 @@ func (ccms *CCMetricStore) buildQueries( job *schema.Job, metrics []string, scopes []schema.MetricScope, - resolution int, -) ([]ApiQuery, []schema.MetricScope, error) { - queries := make([]ApiQuery, 0, len(metrics)*len(scopes)*len(job.Resources)) + resolution int64, +) ([]memorystore.ApiQuery, []schema.MetricScope, error) { + queries := make([]memorystore.ApiQuery, 0, len(metrics)*len(scopes)*len(job.Resources)) assignedScope := []schema.MetricScope{} subcluster, scerr := archive.GetSubCluster(job.Cluster, job.SubCluster) @@ -294,7 +168,6 @@ func (ccms *CCMetricStore) buildQueries( topology := subcluster.Topology for _, metric := range metrics { - remoteName := ccms.toRemoteName(metric) mc := archive.GetMetricConfig(job.Cluster, metric) if mc == nil { // return nil, fmt.Errorf("METRICDATA/CCMS > metric '%s' is not specified for cluster '%s'", metric, job.Cluster) @@ -306,7 +179,7 @@ func (ccms *CCMetricStore) buildQueries( if len(mc.SubClusters) != 0 { isRemoved := false for _, scConfig := range mc.SubClusters { - if scConfig.Name == job.SubCluster && scConfig.Remove == true { + if scConfig.Name == job.SubCluster && scConfig.Remove { isRemoved = true break } @@ -347,8 +220,8 @@ func (ccms *CCMetricStore) buildQueries( continue } - queries = append(queries, ApiQuery{ - Metric: remoteName, + queries = append(queries, 
memorystore.ApiQuery{ + Metric: metric, Hostname: host.Hostname, Aggregate: false, Type: &acceleratorString, @@ -365,8 +238,8 @@ func (ccms *CCMetricStore) buildQueries( continue } - queries = append(queries, ApiQuery{ - Metric: remoteName, + queries = append(queries, memorystore.ApiQuery{ + Metric: metric, Hostname: host.Hostname, Aggregate: true, Type: &acceleratorString, @@ -379,8 +252,8 @@ func (ccms *CCMetricStore) buildQueries( // HWThread -> HWThead if nativeScope == schema.MetricScopeHWThread && scope == schema.MetricScopeHWThread { - queries = append(queries, ApiQuery{ - Metric: remoteName, + queries = append(queries, memorystore.ApiQuery{ + Metric: metric, Hostname: host.Hostname, Aggregate: false, Type: &hwthreadString, @@ -395,8 +268,8 @@ func (ccms *CCMetricStore) buildQueries( if nativeScope == schema.MetricScopeHWThread && scope == schema.MetricScopeCore { cores, _ := topology.GetCoresFromHWThreads(hwthreads) for _, core := range cores { - queries = append(queries, ApiQuery{ - Metric: remoteName, + queries = append(queries, memorystore.ApiQuery{ + Metric: metric, Hostname: host.Hostname, Aggregate: true, Type: &hwthreadString, @@ -412,8 +285,8 @@ func (ccms *CCMetricStore) buildQueries( if nativeScope == schema.MetricScopeHWThread && scope == schema.MetricScopeSocket { sockets, _ := topology.GetSocketsFromHWThreads(hwthreads) for _, socket := range sockets { - queries = append(queries, ApiQuery{ - Metric: remoteName, + queries = append(queries, memorystore.ApiQuery{ + Metric: metric, Hostname: host.Hostname, Aggregate: true, Type: &hwthreadString, @@ -427,8 +300,8 @@ func (ccms *CCMetricStore) buildQueries( // HWThread -> Node if nativeScope == schema.MetricScopeHWThread && scope == schema.MetricScopeNode { - queries = append(queries, ApiQuery{ - Metric: remoteName, + queries = append(queries, memorystore.ApiQuery{ + Metric: metric, Hostname: host.Hostname, Aggregate: true, Type: &hwthreadString, @@ -442,8 +315,8 @@ func (ccms *CCMetricStore) buildQueries( // Core -> Core if nativeScope == schema.MetricScopeCore && scope == schema.MetricScopeCore { cores, _ := topology.GetCoresFromHWThreads(hwthreads) - queries = append(queries, ApiQuery{ - Metric: remoteName, + queries = append(queries, memorystore.ApiQuery{ + Metric: metric, Hostname: host.Hostname, Aggregate: false, Type: &coreString, @@ -458,8 +331,8 @@ func (ccms *CCMetricStore) buildQueries( if nativeScope == schema.MetricScopeCore && scope == schema.MetricScopeSocket { sockets, _ := topology.GetSocketsFromCores(hwthreads) for _, socket := range sockets { - queries = append(queries, ApiQuery{ - Metric: remoteName, + queries = append(queries, memorystore.ApiQuery{ + Metric: metric, Hostname: host.Hostname, Aggregate: true, Type: &coreString, @@ -474,8 +347,8 @@ func (ccms *CCMetricStore) buildQueries( // Core -> Node if nativeScope == schema.MetricScopeCore && scope == schema.MetricScopeNode { cores, _ := topology.GetCoresFromHWThreads(hwthreads) - queries = append(queries, ApiQuery{ - Metric: remoteName, + queries = append(queries, memorystore.ApiQuery{ + Metric: metric, Hostname: host.Hostname, Aggregate: true, Type: &coreString, @@ -489,8 +362,8 @@ func (ccms *CCMetricStore) buildQueries( // MemoryDomain -> MemoryDomain if nativeScope == schema.MetricScopeMemoryDomain && scope == schema.MetricScopeMemoryDomain { sockets, _ := topology.GetMemoryDomainsFromHWThreads(hwthreads) - queries = append(queries, ApiQuery{ - Metric: remoteName, + queries = append(queries, memorystore.ApiQuery{ + Metric: metric, Hostname: 
host.Hostname, Aggregate: false, Type: &memoryDomainString, @@ -504,8 +377,8 @@ func (ccms *CCMetricStore) buildQueries( // MemoryDoman -> Node if nativeScope == schema.MetricScopeMemoryDomain && scope == schema.MetricScopeNode { sockets, _ := topology.GetMemoryDomainsFromHWThreads(hwthreads) - queries = append(queries, ApiQuery{ - Metric: remoteName, + queries = append(queries, memorystore.ApiQuery{ + Metric: metric, Hostname: host.Hostname, Aggregate: true, Type: &memoryDomainString, @@ -519,8 +392,8 @@ func (ccms *CCMetricStore) buildQueries( // Socket -> Socket if nativeScope == schema.MetricScopeSocket && scope == schema.MetricScopeSocket { sockets, _ := topology.GetSocketsFromHWThreads(hwthreads) - queries = append(queries, ApiQuery{ - Metric: remoteName, + queries = append(queries, memorystore.ApiQuery{ + Metric: metric, Hostname: host.Hostname, Aggregate: false, Type: &socketString, @@ -534,8 +407,8 @@ func (ccms *CCMetricStore) buildQueries( // Socket -> Node if nativeScope == schema.MetricScopeSocket && scope == schema.MetricScopeNode { sockets, _ := topology.GetSocketsFromHWThreads(hwthreads) - queries = append(queries, ApiQuery{ - Metric: remoteName, + queries = append(queries, memorystore.ApiQuery{ + Metric: metric, Hostname: host.Hostname, Aggregate: true, Type: &socketString, @@ -548,8 +421,8 @@ func (ccms *CCMetricStore) buildQueries( // Node -> Node if nativeScope == schema.MetricScopeNode && scope == schema.MetricScopeNode { - queries = append(queries, ApiQuery{ - Metric: remoteName, + queries = append(queries, memorystore.ApiQuery{ + Metric: metric, Hostname: host.Hostname, Resolution: resolution, }) @@ -576,7 +449,7 @@ func (ccms *CCMetricStore) LoadStats( return nil, err } - req := ApiQueryRequest{ + req := memorystore.ApiQueryRequest{ Cluster: job.Cluster, From: job.StartTime, To: job.StartTime + int64(job.Duration), @@ -585,16 +458,16 @@ func (ccms *CCMetricStore) LoadStats( WithData: false, } - resBody, err := ccms.doRequest(ctx, &req) + resBody, err := memorystore.FetchData(req) if err != nil { - cclog.Errorf("Error while performing request: %s", err.Error()) + cclog.Errorf("Error while fetching data : %s", err.Error()) return nil, err } stats := make(map[string]map[string]schema.MetricStatistics, len(metrics)) for i, res := range resBody.Results { query := req.Queries[i] - metric := ccms.toLocalName(query.Metric) + metric := query.Metric data := res[0] if data.Error != nil { cclog.Errorf("fetching %s for node %s failed: %s", metric, query.Hostname, *data.Error) @@ -635,7 +508,7 @@ func (ccms *CCMetricStore) LoadScopedStats( return nil, err } - req := ApiQueryRequest{ + req := memorystore.ApiQueryRequest{ Cluster: job.Cluster, From: job.StartTime, To: job.StartTime + int64(job.Duration), @@ -644,9 +517,9 @@ func (ccms *CCMetricStore) LoadScopedStats( WithData: false, } - resBody, err := ccms.doRequest(ctx, &req) + resBody, err := memorystore.FetchData(req) if err != nil { - cclog.Errorf("Error while performing request: %s", err.Error()) + cclog.Errorf("Error while fetching data : %s", err.Error()) return nil, err } @@ -655,7 +528,7 @@ func (ccms *CCMetricStore) LoadScopedStats( for i, row := range resBody.Results { query := req.Queries[i] - metric := ccms.toLocalName(query.Metric) + metric := query.Metric scope := assignedScope[i] if _, ok := scopedJobStats[metric]; !ok { @@ -721,7 +594,7 @@ func (ccms *CCMetricStore) LoadNodeData( from, to time.Time, ctx context.Context, ) (map[string]map[string][]*schema.JobMetric, error) { - req := ApiQueryRequest{ + req := 
memorystore.ApiQueryRequest{ Cluster: cluster, From: from.Unix(), To: to.Unix(), @@ -730,38 +603,36 @@ func (ccms *CCMetricStore) LoadNodeData( } if nodes == nil { - for _, metric := range metrics { - req.ForAllNodes = append(req.ForAllNodes, ccms.toRemoteName(metric)) - } + req.ForAllNodes = append(req.ForAllNodes, metrics...) } else { for _, node := range nodes { for _, metric := range metrics { - req.Queries = append(req.Queries, ApiQuery{ + req.Queries = append(req.Queries, memorystore.ApiQuery{ Hostname: node, - Metric: ccms.toRemoteName(metric), + Metric: metric, Resolution: 0, // Default for Node Queries: Will return metric $Timestep Resolution }) } } } - resBody, err := ccms.doRequest(ctx, &req) + resBody, err := memorystore.FetchData(req) if err != nil { - cclog.Errorf("Error while performing request: %s", err.Error()) + cclog.Errorf("Error while fetching data : %s", err.Error()) return nil, err } var errors []string data := make(map[string]map[string][]*schema.JobMetric) for i, res := range resBody.Results { - var query ApiQuery + var query memorystore.ApiQuery if resBody.Queries != nil { query = resBody.Queries[i] } else { query = req.Queries[i] } - metric := ccms.toLocalName(query.Metric) + metric := query.Metric qdata := res[0] if qdata.Error != nil { /* Build list for "partial errors", if any */ @@ -861,13 +732,13 @@ func (ccms *CCMetricStore) LoadNodeListData( // Note: Order of node data is not guaranteed after this point, but contents match page and filter criteria - queries, assignedScope, err := ccms.buildNodeQueries(cluster, subCluster, nodes, metrics, scopes, resolution) + queries, assignedScope, err := ccms.buildNodeQueries(cluster, subCluster, nodes, metrics, scopes, int64(resolution)) if err != nil { cclog.Errorf("Error while building node queries for Cluster %s, SubCLuster %s, Metrics %v, Scopes %v: %s", cluster, subCluster, metrics, scopes, err.Error()) return nil, totalNodes, hasNextPage, err } - req := ApiQueryRequest{ + req := memorystore.ApiQueryRequest{ Cluster: cluster, Queries: queries, From: from.Unix(), @@ -876,29 +747,29 @@ func (ccms *CCMetricStore) LoadNodeListData( WithData: true, } - resBody, err := ccms.doRequest(ctx, &req) + resBody, err := memorystore.FetchData(req) if err != nil { - cclog.Errorf("Error while performing request: %s", err.Error()) + cclog.Errorf("Error while fetching data : %s", err.Error()) return nil, totalNodes, hasNextPage, err } var errors []string data := make(map[string]schema.JobData) for i, row := range resBody.Results { - var query ApiQuery + var query memorystore.ApiQuery if resBody.Queries != nil { query = resBody.Queries[i] } else { query = req.Queries[i] } // qdata := res[0] - metric := ccms.toLocalName(query.Metric) + metric := query.Metric scope := assignedScope[i] mc := archive.GetMetricConfig(cluster, metric) res := mc.Timestep if len(row) > 0 { - res = row[0].Resolution + res = int(row[0].Resolution) } // Init Nested Map Data Structures If Not Found @@ -971,9 +842,9 @@ func (ccms *CCMetricStore) buildNodeQueries( nodes []string, metrics []string, scopes []schema.MetricScope, - resolution int, -) ([]ApiQuery, []schema.MetricScope, error) { - queries := make([]ApiQuery, 0, len(metrics)*len(scopes)*len(nodes)) + resolution int64, +) ([]memorystore.ApiQuery, []schema.MetricScope, error) { + queries := make([]memorystore.ApiQuery, 0, len(metrics)*len(scopes)*len(nodes)) assignedScope := []schema.MetricScope{} // Get Topol before loop if subCluster given @@ -988,7 +859,7 @@ func (ccms *CCMetricStore) buildNodeQueries( } 
for _, metric := range metrics { - remoteName := ccms.toRemoteName(metric) + metric := metric mc := archive.GetMetricConfig(cluster, metric) if mc == nil { // return nil, fmt.Errorf("METRICDATA/CCMS > metric '%s' is not specified for cluster '%s'", metric, cluster) @@ -1000,7 +871,7 @@ func (ccms *CCMetricStore) buildNodeQueries( if mc.SubClusters != nil { isRemoved := false for _, scConfig := range mc.SubClusters { - if scConfig.Name == subCluster && scConfig.Remove == true { + if scConfig.Name == subCluster && scConfig.Remove { isRemoved = true break } @@ -1056,8 +927,8 @@ func (ccms *CCMetricStore) buildNodeQueries( continue } - queries = append(queries, ApiQuery{ - Metric: remoteName, + queries = append(queries, memorystore.ApiQuery{ + Metric: metric, Hostname: hostname, Aggregate: false, Type: &acceleratorString, @@ -1074,8 +945,8 @@ func (ccms *CCMetricStore) buildNodeQueries( continue } - queries = append(queries, ApiQuery{ - Metric: remoteName, + queries = append(queries, memorystore.ApiQuery{ + Metric: metric, Hostname: hostname, Aggregate: true, Type: &acceleratorString, @@ -1088,8 +959,8 @@ func (ccms *CCMetricStore) buildNodeQueries( // HWThread -> HWThead if nativeScope == schema.MetricScopeHWThread && scope == schema.MetricScopeHWThread { - queries = append(queries, ApiQuery{ - Metric: remoteName, + queries = append(queries, memorystore.ApiQuery{ + Metric: metric, Hostname: hostname, Aggregate: false, Type: &hwthreadString, @@ -1104,8 +975,8 @@ func (ccms *CCMetricStore) buildNodeQueries( if nativeScope == schema.MetricScopeHWThread && scope == schema.MetricScopeCore { cores, _ := topology.GetCoresFromHWThreads(topology.Node) for _, core := range cores { - queries = append(queries, ApiQuery{ - Metric: remoteName, + queries = append(queries, memorystore.ApiQuery{ + Metric: metric, Hostname: hostname, Aggregate: true, Type: &hwthreadString, @@ -1121,8 +992,8 @@ func (ccms *CCMetricStore) buildNodeQueries( if nativeScope == schema.MetricScopeHWThread && scope == schema.MetricScopeSocket { sockets, _ := topology.GetSocketsFromHWThreads(topology.Node) for _, socket := range sockets { - queries = append(queries, ApiQuery{ - Metric: remoteName, + queries = append(queries, memorystore.ApiQuery{ + Metric: metric, Hostname: hostname, Aggregate: true, Type: &hwthreadString, @@ -1136,8 +1007,8 @@ func (ccms *CCMetricStore) buildNodeQueries( // HWThread -> Node if nativeScope == schema.MetricScopeHWThread && scope == schema.MetricScopeNode { - queries = append(queries, ApiQuery{ - Metric: remoteName, + queries = append(queries, memorystore.ApiQuery{ + Metric: metric, Hostname: hostname, Aggregate: true, Type: &hwthreadString, @@ -1151,8 +1022,8 @@ func (ccms *CCMetricStore) buildNodeQueries( // Core -> Core if nativeScope == schema.MetricScopeCore && scope == schema.MetricScopeCore { cores, _ := topology.GetCoresFromHWThreads(topology.Node) - queries = append(queries, ApiQuery{ - Metric: remoteName, + queries = append(queries, memorystore.ApiQuery{ + Metric: metric, Hostname: hostname, Aggregate: false, Type: &coreString, @@ -1167,8 +1038,8 @@ func (ccms *CCMetricStore) buildNodeQueries( if nativeScope == schema.MetricScopeCore && scope == schema.MetricScopeSocket { sockets, _ := topology.GetSocketsFromCores(topology.Node) for _, socket := range sockets { - queries = append(queries, ApiQuery{ - Metric: remoteName, + queries = append(queries, memorystore.ApiQuery{ + Metric: metric, Hostname: hostname, Aggregate: true, Type: &coreString, @@ -1183,8 +1054,8 @@ func (ccms *CCMetricStore) 
buildNodeQueries( // Core -> Node if nativeScope == schema.MetricScopeCore && scope == schema.MetricScopeNode { cores, _ := topology.GetCoresFromHWThreads(topology.Node) - queries = append(queries, ApiQuery{ - Metric: remoteName, + queries = append(queries, memorystore.ApiQuery{ + Metric: metric, Hostname: hostname, Aggregate: true, Type: &coreString, @@ -1198,8 +1069,8 @@ func (ccms *CCMetricStore) buildNodeQueries( // MemoryDomain -> MemoryDomain if nativeScope == schema.MetricScopeMemoryDomain && scope == schema.MetricScopeMemoryDomain { sockets, _ := topology.GetMemoryDomainsFromHWThreads(topology.Node) - queries = append(queries, ApiQuery{ - Metric: remoteName, + queries = append(queries, memorystore.ApiQuery{ + Metric: metric, Hostname: hostname, Aggregate: false, Type: &memoryDomainString, @@ -1213,8 +1084,8 @@ func (ccms *CCMetricStore) buildNodeQueries( // MemoryDoman -> Node if nativeScope == schema.MetricScopeMemoryDomain && scope == schema.MetricScopeNode { sockets, _ := topology.GetMemoryDomainsFromHWThreads(topology.Node) - queries = append(queries, ApiQuery{ - Metric: remoteName, + queries = append(queries, memorystore.ApiQuery{ + Metric: metric, Hostname: hostname, Aggregate: true, Type: &memoryDomainString, @@ -1228,8 +1099,8 @@ func (ccms *CCMetricStore) buildNodeQueries( // Socket -> Socket if nativeScope == schema.MetricScopeSocket && scope == schema.MetricScopeSocket { sockets, _ := topology.GetSocketsFromHWThreads(topology.Node) - queries = append(queries, ApiQuery{ - Metric: remoteName, + queries = append(queries, memorystore.ApiQuery{ + Metric: metric, Hostname: hostname, Aggregate: false, Type: &socketString, @@ -1243,8 +1114,8 @@ func (ccms *CCMetricStore) buildNodeQueries( // Socket -> Node if nativeScope == schema.MetricScopeSocket && scope == schema.MetricScopeNode { sockets, _ := topology.GetSocketsFromHWThreads(topology.Node) - queries = append(queries, ApiQuery{ - Metric: remoteName, + queries = append(queries, memorystore.ApiQuery{ + Metric: metric, Hostname: hostname, Aggregate: true, Type: &socketString, @@ -1257,8 +1128,8 @@ func (ccms *CCMetricStore) buildNodeQueries( // Node -> Node if nativeScope == schema.MetricScopeNode && scope == schema.MetricScopeNode { - queries = append(queries, ApiQuery{ - Metric: remoteName, + queries = append(queries, memorystore.ApiQuery{ + Metric: metric, Hostname: hostname, Resolution: resolution, }) diff --git a/internal/metricdata/utils.go b/internal/metricdata/utils.go index 59e640e..2e0d423 100644 --- a/internal/metricdata/utils.go +++ b/internal/metricdata/utils.go @@ -74,9 +74,8 @@ func (tmdr *TestMetricDataRepository) LoadNodeListData( } func DeepCopy(jd_temp schema.JobData) schema.JobData { - var jd schema.JobData - jd = make(schema.JobData, len(jd_temp)) + jd := make(schema.JobData, len(jd_temp)) for k, v := range jd_temp { jd[k] = make(map[schema.MetricScope]*schema.JobMetric, len(jd_temp[k])) for k_, v_ := range v { diff --git a/internal/repository/job.go b/internal/repository/job.go index 957c3b7..8a8197c 100644 --- a/internal/repository/job.go +++ b/internal/repository/job.go @@ -52,18 +52,18 @@ func GetJobRepository() *JobRepository { } var jobColumns []string = []string{ - "job.id", "job.job_id", "job.hpc_user", "job.project", "job.cluster", "job.subcluster", + "job.id", "job.job_id", "job.hpc_user", "job.project", "job.hpc_cluster", "job.subcluster", "job.start_time", "job.cluster_partition", "job.array_job_id", "job.num_nodes", - "job.num_hwthreads", "job.num_acc", "job.exclusive", 
"job.monitoring_status", + "job.num_hwthreads", "job.num_acc", "job.shared", "job.monitoring_status", "job.smt", "job.job_state", "job.duration", "job.walltime", "job.resources", "job.footprint", "job.energy", } var jobCacheColumns []string = []string{ - "job_cache.id", "job_cache.job_id", "job_cache.hpc_user", "job_cache.project", "job_cache.cluster", + "job_cache.id", "job_cache.job_id", "job_cache.hpc_user", "job_cache.project", "job_cache.hpc_cluster", "job_cache.subcluster", "job_cache.start_time", "job_cache.cluster_partition", "job_cache.array_job_id", "job_cache.num_nodes", "job_cache.num_hwthreads", - "job_cache.num_acc", "job_cache.exclusive", "job_cache.monitoring_status", "job_cache.smt", + "job_cache.num_acc", "job_cache.shared", "job_cache.monitoring_status", "job_cache.smt", "job_cache.job_state", "job_cache.duration", "job_cache.walltime", "job_cache.resources", "job_cache.footprint", "job_cache.energy", } @@ -390,7 +390,7 @@ func (r *JobRepository) Partitions(cluster string) ([]string, error) { start := time.Now() partitions := r.cache.Get("partitions:"+cluster, func() (any, time.Duration, int) { parts := []string{} - if err = r.DB.Select(&parts, `SELECT DISTINCT job.cluster_partition FROM job WHERE job.cluster = ?;`, cluster); err != nil { + if err = r.DB.Select(&parts, `SELECT DISTINCT job.cluster_partition FROM job WHERE job.hpc_cluster = ?;`, cluster); err != nil { return nil, 0, 1000 } @@ -410,7 +410,7 @@ func (r *JobRepository) AllocatedNodes(cluster string) (map[string]map[string]in subclusters := make(map[string]map[string]int) rows, err := sq.Select("resources", "subcluster").From("job"). Where("job.job_state = 'running'"). - Where("job.cluster = ?", cluster). + Where("job.hpc_cluster = ?", cluster). RunWith(r.stmtCache).Query() if err != nil { cclog.Error("Error while running query") @@ -505,7 +505,7 @@ func (r *JobRepository) FindJobIdsByTag(tagId int64) ([]int64, error) { // FIXME: Reconsider filtering short jobs with harcoded threshold func (r *JobRepository) FindRunningJobs(cluster string) ([]*schema.Job, error) { query := sq.Select(jobColumns...).From("job"). - Where(fmt.Sprintf("job.cluster = '%s'", cluster)). + Where(fmt.Sprintf("job.hpc_cluster = '%s'", cluster)). Where("job.job_state = 'running'"). 
Where("job.duration > 600") diff --git a/internal/repository/jobCreate.go b/internal/repository/jobCreate.go index aa2ea76..f43be58 100644 --- a/internal/repository/jobCreate.go +++ b/internal/repository/jobCreate.go @@ -14,19 +14,19 @@ import ( ) const NamedJobCacheInsert string = `INSERT INTO job_cache ( - job_id, hpc_user, project, cluster, subcluster, cluster_partition, array_job_id, num_nodes, num_hwthreads, num_acc, - exclusive, monitoring_status, smt, job_state, start_time, duration, walltime, footprint, energy, energy_footprint, resources, meta_data + job_id, hpc_user, project, hpc_cluster, subcluster, cluster_partition, array_job_id, num_nodes, num_hwthreads, num_acc, + shared, monitoring_status, smt, job_state, start_time, duration, walltime, footprint, energy, energy_footprint, resources, meta_data ) VALUES ( - :job_id, :hpc_user, :project, :cluster, :subcluster, :cluster_partition, :array_job_id, :num_nodes, :num_hwthreads, :num_acc, - :exclusive, :monitoring_status, :smt, :job_state, :start_time, :duration, :walltime, :footprint, :energy, :energy_footprint, :resources, :meta_data + :job_id, :hpc_user, :project, :hpc_cluster, :subcluster, :cluster_partition, :array_job_id, :num_nodes, :num_hwthreads, :num_acc, + :shared, :monitoring_status, :smt, :job_state, :start_time, :duration, :walltime, :footprint, :energy, :energy_footprint, :resources, :meta_data );` const NamedJobInsert string = `INSERT INTO job ( - job_id, hpc_user, project, cluster, subcluster, cluster_partition, array_job_id, num_nodes, num_hwthreads, num_acc, - exclusive, monitoring_status, smt, job_state, start_time, duration, walltime, footprint, energy, energy_footprint, resources, meta_data + job_id, hpc_user, project, hpc_cluster, subcluster, cluster_partition, array_job_id, num_nodes, num_hwthreads, num_acc, + shared, monitoring_status, smt, job_state, start_time, duration, walltime, footprint, energy, energy_footprint, resources, meta_data ) VALUES ( - :job_id, :hpc_user, :project, :cluster, :subcluster, :cluster_partition, :array_job_id, :num_nodes, :num_hwthreads, :num_acc, - :exclusive, :monitoring_status, :smt, :job_state, :start_time, :duration, :walltime, :footprint, :energy, :energy_footprint, :resources, :meta_data + :job_id, :hpc_user, :project, :hpc_cluster, :subcluster, :cluster_partition, :array_job_id, :num_nodes, :num_hwthreads, :num_acc, + :shared, :monitoring_status, :smt, :job_state, :start_time, :duration, :walltime, :footprint, :energy, :energy_footprint, :resources, :meta_data );` func (r *JobRepository) InsertJob(job *schema.Job) (int64, error) { @@ -70,7 +70,7 @@ func (r *JobRepository) SyncJobs() ([]*schema.Job, error) { } _, err = r.DB.Exec( - "INSERT INTO job (job_id, cluster, subcluster, start_time, hpc_user, project, cluster_partition, array_job_id, num_nodes, num_hwthreads, num_acc, exclusive, monitoring_status, smt, job_state, duration, walltime, footprint, energy, energy_footprint, resources, meta_data) SELECT job_id, cluster, subcluster, start_time, hpc_user, project, cluster_partition, array_job_id, num_nodes, num_hwthreads, num_acc, exclusive, monitoring_status, smt, job_state, duration, walltime, footprint, energy, energy_footprint, resources, meta_data FROM job_cache") + "INSERT INTO job (job_id, hpc_cluster, subcluster, start_time, hpc_user, project, cluster_partition, array_job_id, num_nodes, num_hwthreads, num_acc, shared, monitoring_status, smt, job_state, duration, walltime, footprint, energy, energy_footprint, resources, meta_data) SELECT job_id, hpc_cluster, 
subcluster, start_time, hpc_user, project, cluster_partition, array_job_id, num_nodes, num_hwthreads, num_acc, shared, monitoring_status, smt, job_state, duration, walltime, footprint, energy, energy_footprint, resources, meta_data FROM job_cache") if err != nil { cclog.Warnf("Error while Job sync: %v", err) return nil, err diff --git a/internal/repository/jobFind.go b/internal/repository/jobFind.go index 39519d5..3abce8c 100644 --- a/internal/repository/jobFind.go +++ b/internal/repository/jobFind.go @@ -31,7 +31,7 @@ func (r *JobRepository) Find( Where("job.job_id = ?", *jobId) if cluster != nil { - q = q.Where("job.cluster = ?", *cluster) + q = q.Where("job.hpc_cluster = ?", *cluster) } if startTime != nil { q = q.Where("job.start_time = ?", *startTime) @@ -52,7 +52,7 @@ func (r *JobRepository) FindCached( Where("job_cache.job_id = ?", *jobId) if cluster != nil { - q = q.Where("job_cache.cluster = ?", *cluster) + q = q.Where("job_cache.hpc_cluster = ?", *cluster) } if startTime != nil { q = q.Where("job_cache.start_time = ?", *startTime) @@ -78,7 +78,7 @@ func (r *JobRepository) FindAll( Where("job.job_id = ?", *jobId) if cluster != nil { - q = q.Where("job.cluster = ?", *cluster) + q = q.Where("job.hpc_cluster = ?", *cluster) } if startTime != nil { q = q.Where("job.start_time = ?", *startTime) @@ -183,7 +183,7 @@ func (r *JobRepository) FindByJobId(ctx context.Context, jobId int64, startTime q := sq.Select(jobColumns...). From("job"). Where("job.job_id = ?", jobId). - Where("job.cluster = ?", cluster). + Where("job.hpc_cluster = ?", cluster). Where("job.start_time = ?", startTime) q, qerr := SecurityCheck(ctx, q) @@ -203,7 +203,7 @@ func (r *JobRepository) IsJobOwner(jobId int64, startTime int64, user string, cl From("job"). Where("job.job_id = ?", jobId). Where("job.hpc_user = ?", user). - Where("job.cluster = ?", cluster). + Where("job.hpc_cluster = ?", cluster). Where("job.start_time = ?", startTime) _, err := scanJob(q.RunWith(r.stmtCache).QueryRow()) diff --git a/internal/repository/jobQuery.go b/internal/repository/jobQuery.go index fdcc904..19cdd9a 100644 --- a/internal/repository/jobQuery.go +++ b/internal/repository/jobQuery.go @@ -168,7 +168,7 @@ func BuildWhereClause(filter *model.JobFilter, query sq.SelectBuilder) sq.Select query = buildMetaJsonCondition("jobName", filter.JobName, query) } if filter.Cluster != nil { - query = buildStringCondition("job.cluster", filter.Cluster, query) + query = buildStringCondition("job.hpc_cluster", filter.Cluster, query) } if filter.Partition != nil { query = buildStringCondition("job.cluster_partition", filter.Partition, query) @@ -183,8 +183,8 @@ func BuildWhereClause(filter *model.JobFilter, query sq.SelectBuilder) sq.Select now := time.Now().Unix() // There does not seam to be a portable way to get the current unix timestamp accross different DBs. query = query.Where("(job.job_state != 'running' OR (? 
- job.start_time) > ?)", now, *filter.MinRunningFor) } - if filter.Exclusive != nil { - query = query.Where("job.exclusive = ?", *filter.Exclusive) + if filter.Shared != nil { + query = query.Where("job.shared = ?", *filter.Shared) } if filter.State != nil { states := make([]string, len(filter.State)) diff --git a/internal/repository/migrations/sqlite3/09_add-job-cache.up.sql b/internal/repository/migrations/sqlite3/09_add-job-cache.up.sql index 003eab0..8c54622 100644 --- a/internal/repository/migrations/sqlite3/09_add-job-cache.up.sql +++ b/internal/repository/migrations/sqlite3/09_add-job-cache.up.sql @@ -3,7 +3,7 @@ CREATE TABLE "job_cache" ( job_id BIGINT NOT NULL, hpc_cluster VARCHAR(255) NOT NULL, subcluster VARCHAR(255) NOT NULL, - submit_time BIGINT NOT NULL, -- Unix timestamp + submit_time BIGINT NOT NULL DEFAULT 0, -- Unix timestamp start_time BIGINT NOT NULL DEFAULT 0, -- Unix timestamp hpc_user VARCHAR(255) NOT NULL, project VARCHAR(255) NOT NULL, @@ -30,7 +30,7 @@ CREATE TABLE "job_cache" ( energy REAL NOT NULL DEFAULT 0.0, energy_footprint TEXT DEFAULT NULL, footprint TEXT DEFAULT NULL, - UNIQUE (job_id, cluster, start_time) + UNIQUE (job_id, hpc_cluster, start_time) ); CREATE TABLE "job_new" ( @@ -65,10 +65,33 @@ CREATE TABLE "job_new" ( energy REAL NOT NULL DEFAULT 0.0, energy_footprint TEXT DEFAULT NULL, footprint TEXT DEFAULT NULL, - UNIQUE (job_id, cluster, start_time) + UNIQUE (job_id, hpc_cluster, start_time) ); ALTER TABLE job RENAME COLUMN cluster TO hpc_cluster; -INSERT INTO job_new SELECT * FROM job; + +CREATE TABLE IF NOT EXISTS lookup_exclusive ( + id INTEGER PRIMARY KEY, + name TEXT NOT NULL UNIQUE +); + +INSERT INTO lookup_exclusive (id, name) VALUES + (0, 'multi_user'), + (1, 'none'), + (2, 'single_user'); + +INSERT INTO job_new ( + id, job_id, hpc_cluster, subcluster, submit_time, start_time, hpc_user, project, + cluster_partition, array_job_id, duration, walltime, job_state, meta_data, resources, + num_nodes, num_hwthreads, num_acc, smt, shared, monitoring_status, energy, + energy_footprint, footprint +) SELECT + id, job_id, hpc_cluster, subcluster, 0, start_time, hpc_user, project, + cluster_partition, array_job_id, duration, walltime, job_state, meta_data, resources, + num_nodes, num_hwthreads, num_acc, smt, (SELECT name FROM lookup_exclusive WHERE id=job.exclusive), monitoring_status, energy, + energy_footprint, footprint +FROM job; + +DROP TABLE lookup_exclusive; DROP TABLE job; ALTER TABLE job_new RENAME TO job; diff --git a/internal/repository/stats.go b/internal/repository/stats.go index 1aa3c55..19d17bd 100644 --- a/internal/repository/stats.go +++ b/internal/repository/stats.go @@ -21,10 +21,9 @@ import ( // GraphQL validation should make sure that no unkown values can be specified. 
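For reference, the 09_add-job-cache migration above converts the old integer exclusive column into the new shared string column via the temporary lookup_exclusive table (0 -> multi_user, 1 -> none, 2 -> single_user), matching the enum now exposed in swagger and the tagger rules. The following is a minimal, self-contained Go sketch of that same mapping; the helper name sharedFromExclusive is hypothetical and is not part of this patch, it only restates the conversion the migration performs.

package main

import "fmt"

// sharedFromExclusive mirrors the lookup_exclusive table from the migration:
// exclusive=0 -> "multi_user", exclusive=1 -> "none" (the old "exclusive job"
// case, see job.exclusive == 1 in the former tagger rules), exclusive=2 -> "single_user".
func sharedFromExclusive(exclusive int) (string, error) {
	switch exclusive {
	case 0:
		return "multi_user", nil
	case 1:
		return "none", nil
	case 2:
		return "single_user", nil
	default:
		return "", fmt.Errorf("unknown exclusive value %d", exclusive)
	}
}

func main() {
	for _, e := range []int{0, 1, 2} {
		s, _ := sharedFromExclusive(e)
		fmt.Printf("exclusive=%d -> shared=%q\n", e, s)
	}
}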
var groupBy2column = map[model.Aggregate]string{ - model.AggregateUser: "job.hpc_user", - model.AggregateProject: "job.project", - model.AggregateCluster: "job.cluster", - model.AggregateSubcluster: "job.subcluster", + model.AggregateUser: "job.hpc_user", + model.AggregateProject: "job.project", + model.AggregateCluster: "job.hpc_cluster", } var sortBy2column = map[model.SortByAggregate]string{ diff --git a/internal/repository/testdata/job.db b/internal/repository/testdata/job.db index e9e20ce..c8911a6 100644 Binary files a/internal/repository/testdata/job.db and b/internal/repository/testdata/job.db differ diff --git a/internal/tagger/apps/vasp.txt b/internal/tagger/apps/vasp.txt index bd537e4..9f9b9d5 100644 --- a/internal/tagger/apps/vasp.txt +++ b/internal/tagger/apps/vasp.txt @@ -1 +1,2 @@ vasp +VASP diff --git a/internal/tagger/jobclasses/highload.json b/internal/tagger/jobclasses/highload.json index 0d16b45..9667011 100644 --- a/internal/tagger/jobclasses/highload.json +++ b/internal/tagger/jobclasses/highload.json @@ -8,7 +8,7 @@ ], "metrics": ["cpu_load"], "requirements": [ - "job.exclusive == 1", + "job.shared == \"none\"", "job.duration > job_min_duration_seconds" ], "variables": [ diff --git a/internal/tagger/jobclasses/lowUtilization.json b/internal/tagger/jobclasses/lowUtilization.json index 9613b06..e84b81d 100644 --- a/internal/tagger/jobclasses/lowUtilization.json +++ b/internal/tagger/jobclasses/lowUtilization.json @@ -4,7 +4,7 @@ "parameters": ["job_min_duration_seconds"], "metrics": ["flops_any", "mem_bw"], "requirements": [ - "job.exclusive == 1", + "job.shared == \"none\"", "job.duration > job_min_duration_seconds" ], "variables": [ diff --git a/internal/tagger/jobclasses/lowload.json b/internal/tagger/jobclasses/lowload.json index 2212bd1..f952da5 100644 --- a/internal/tagger/jobclasses/lowload.json +++ b/internal/tagger/jobclasses/lowload.json @@ -8,7 +8,7 @@ ], "metrics": ["cpu_load"], "requirements": [ - "job.exclusive == 1", + "job.shared == \"none\"", "job.duration > job_min_duration_seconds" ], "variables": [ diff --git a/internal/taskManager/commitJobService.go b/internal/taskManager/commitJobService.go index e7c169a..88c2708 100644 --- a/internal/taskManager/commitJobService.go +++ b/internal/taskManager/commitJobService.go @@ -26,9 +26,9 @@ func RegisterCommitJobService() { gocron.NewTask( func() { start := time.Now() - cclog.Printf("Jobcache sync started at %s", start.Format(time.RFC3339)) + cclog.Printf("Jobcache sync started at %s\n", start.Format(time.RFC3339)) jobs, _ := jobRepo.SyncJobs() repository.CallJobStartHooks(jobs) - cclog.Printf("Jobcache sync and job callbacks are done and took %s", time.Since(start)) + cclog.Printf("Jobcache sync and job callbacks are done and took %s\n", time.Since(start)) })) } diff --git a/internal/taskManager/taskManager.go b/internal/taskManager/taskManager.go index 7231d12..35d6ea5 100644 --- a/internal/taskManager/taskManager.go +++ b/internal/taskManager/taskManager.go @@ -68,7 +68,7 @@ func Start(cronCfg, archiveConfig json.RawMessage) { dec := json.NewDecoder(bytes.NewReader(cronCfg)) dec.DisallowUnknownFields() if err := dec.Decode(&Keys); err != nil { - cclog.Errorf("error while decoding ldap config: %v", err) + cclog.Errorf("error while decoding cron config: %v", err) } var cfg struct { diff --git a/internal/taskManager/updateDurationService.go b/internal/taskManager/updateDurationService.go index d650afb..53882f0 100644 --- a/internal/taskManager/updateDurationService.go +++ 
b/internal/taskManager/updateDurationService.go @@ -25,8 +25,8 @@ func RegisterUpdateDurationWorker() { gocron.NewTask( func() { start := time.Now() - cclog.Printf("Update duration started at %s", start.Format(time.RFC3339)) + cclog.Printf("Update duration started at %s\n", start.Format(time.RFC3339)) jobRepo.UpdateDuration() - cclog.Printf("Update duration is done and took %s", time.Since(start)) + cclog.Printf("Update duration is done and took %s\n", time.Since(start)) })) } diff --git a/internal/taskManager/updateFootprintService.go b/internal/taskManager/updateFootprintService.go index 4025849..4fb5e45 100644 --- a/internal/taskManager/updateFootprintService.go +++ b/internal/taskManager/updateFootprintService.go @@ -34,7 +34,7 @@ func RegisterFootprintWorker() { c := 0 ce := 0 cl := 0 - cclog.Printf("Update Footprints started at %s", s.Format(time.RFC3339)) + cclog.Printf("Update Footprints started at %s\n", s.Format(time.RFC3339)) for _, cluster := range archive.Clusters { s_cluster := time.Now() @@ -134,8 +134,8 @@ func RegisterFootprintWorker() { } jobRepo.TransactionEnd(t) } - cclog.Debugf("Finish Cluster %s, took %s", cluster.Name, time.Since(s_cluster)) + cclog.Debugf("Finish Cluster %s, took %s\n", cluster.Name, time.Since(s_cluster)) } - cclog.Printf("Updating %d (of %d; Skipped %d) Footprints is done and took %s", c, cl, ce, time.Since(s)) + cclog.Printf("Updating %d (of %d; Skipped %d) Footprints is done and took %s\n", c, cl, ce, time.Since(s)) })) } diff --git a/pkg/archive/clusterConfig.go b/pkg/archive/clusterConfig.go index 51b89b1..3317487 100644 --- a/pkg/archive/clusterConfig.go +++ b/pkg/archive/clusterConfig.go @@ -8,6 +8,8 @@ import ( "errors" "fmt" + "github.com/ClusterCockpit/cc-backend/internal/config" + "github.com/ClusterCockpit/cc-backend/internal/memorystore" cclog "github.com/ClusterCockpit/cc-lib/ccLogger" "github.com/ClusterCockpit/cc-lib/schema" ) @@ -31,6 +33,8 @@ func initClusterConfig() error { return err } + memorystore.Clusters = append(memorystore.Clusters, cluster.Name) + if len(cluster.Name) == 0 || len(cluster.MetricConfig) == 0 || len(cluster.SubClusters) == 0 { @@ -122,6 +126,16 @@ func initClusterConfig() error { } ml.Availability = append(metricLookup[mc.Name].Availability, availability) metricLookup[mc.Name] = ml + + agg, err := config.AssignAggregationStratergy(mc.Aggregation) + if err != nil { + return fmt.Errorf("ARCHIVE/CLUSTERCONFIG > in %s/cluster.json: %w", cluster.Name, err) + } + + config.AddMetric(mc.Name, config.MetricConfig{ + Frequency: int64(mc.Timestep), + Aggregation: agg, + }) } Clusters = append(Clusters, cluster) diff --git a/pkg/archive/testdata/archive/emmy/1403/244/1608923076/meta.json b/pkg/archive/testdata/archive/emmy/1403/244/1608923076/meta.json index 1ce3f87..aadf21c 100644 --- a/pkg/archive/testdata/archive/emmy/1403/244/1608923076/meta.json +++ b/pkg/archive/testdata/archive/emmy/1403/244/1608923076/meta.json @@ -1,194 +1,194 @@ { - "exclusive": 1, - "jobId": 1403244, - "statistics": { - "mem_bw": { - "avg": 63.57, - "min": 0, - "unit": { - "base": "B/s", - "prefix": "G" - }, - "max": 74.5 - }, - "rapl_power": { - "avg": 228.07, - "min": 0, - "unit": { - "base": "W" - }, - "max": 258.56 - }, - "ipc": { - "unit": { - "base": "IPC" - }, - "max": 0.510204081632653, - "avg": 1.53846153846154, - "min": 0.0 - }, - "clock": { - "min": 1380.32, - "avg": 2599.39, - "unit": { - "base": "Hz", - "prefix": "M" - }, - "max": 2634.46 - }, - "cpu_load": { - "avg": 18.4, - "min": 0, - "max": 23.58, - "unit": { - "base": "load" 
- } - }, - "flops_any": { - "max": 404.62, - "unit": { - "base": "F/s", - "prefix": "G" - }, - "avg": 225.59, - "min": 0 - }, - "flops_dp": { - "max": 0.24, - "unit": { - "base": "F/s", - "prefix": "G" - }, - "min": 0, - "avg": 0 - }, - "mem_used": { - "min": 1.55, - "avg": 27.84, - "unit": { - "base": "B", - "prefix": "G" - }, - "max": 37.5 - }, - "flops_sp": { - "min": 0, - "avg": 225.59, - "max": 404.62, - "unit": { - "base": "F/s", - "prefix": "G" - } - } + "shared": "none", + "jobId": 1403244, + "statistics": { + "mem_bw": { + "avg": 63.57, + "min": 0, + "unit": { + "base": "B/s", + "prefix": "G" + }, + "max": 74.5 }, - "resources": [ - { - "hostname": "e0102" - }, - { - "hostname": "e0103" - }, - { - "hostname": "e0105" - }, - { - "hostname": "e0106" - }, - { - "hostname": "e0107" - }, - { - "hostname": "e0108" - }, - { - "hostname": "e0114" - }, - { - "hostname": "e0320" - }, - { - "hostname": "e0321" - }, - { - "hostname": "e0325" - }, - { - "hostname": "e0404" - }, - { - "hostname": "e0415" - }, - { - "hostname": "e0433" - }, - { - "hostname": "e0437" - }, - { - "hostname": "e0439" - }, - { - "hostname": "e0501" - }, - { - "hostname": "e0503" - }, - { - "hostname": "e0505" - }, - { - "hostname": "e0506" - }, - { - "hostname": "e0512" - }, - { - "hostname": "e0513" - }, - { - "hostname": "e0514" - }, - { - "hostname": "e0653" - }, - { - "hostname": "e0701" - }, - { - "hostname": "e0716" - }, - { - "hostname": "e0727" - }, - { - "hostname": "e0728" - }, - { - "hostname": "e0925" - }, - { - "hostname": "e0926" - }, - { - "hostname": "e0929" - }, - { - "hostname": "e0934" - }, - { - "hostname": "e0951" - } - ], - "walltime": 10, - "jobState": "completed", - "cluster": "emmy", - "subCluster": "haswell", - "stopTime": 1609009562, - "user": "emmyUser6", - "startTime": 1608923076, - "partition": "work", - "tags": [], - "project": "no project", - "numNodes": 32, - "duration": 86486 + "rapl_power": { + "avg": 228.07, + "min": 0, + "unit": { + "base": "W" + }, + "max": 258.56 + }, + "ipc": { + "unit": { + "base": "IPC" + }, + "max": 0.510204081632653, + "avg": 1.53846153846154, + "min": 0.0 + }, + "clock": { + "min": 1380.32, + "avg": 2599.39, + "unit": { + "base": "Hz", + "prefix": "M" + }, + "max": 2634.46 + }, + "cpu_load": { + "avg": 18.4, + "min": 0, + "max": 23.58, + "unit": { + "base": "load" + } + }, + "flops_any": { + "max": 404.62, + "unit": { + "base": "F/s", + "prefix": "G" + }, + "avg": 225.59, + "min": 0 + }, + "flops_dp": { + "max": 0.24, + "unit": { + "base": "F/s", + "prefix": "G" + }, + "min": 0, + "avg": 0 + }, + "mem_used": { + "min": 1.55, + "avg": 27.84, + "unit": { + "base": "B", + "prefix": "G" + }, + "max": 37.5 + }, + "flops_sp": { + "min": 0, + "avg": 225.59, + "max": 404.62, + "unit": { + "base": "F/s", + "prefix": "G" + } + } + }, + "resources": [ + { + "hostname": "e0102" + }, + { + "hostname": "e0103" + }, + { + "hostname": "e0105" + }, + { + "hostname": "e0106" + }, + { + "hostname": "e0107" + }, + { + "hostname": "e0108" + }, + { + "hostname": "e0114" + }, + { + "hostname": "e0320" + }, + { + "hostname": "e0321" + }, + { + "hostname": "e0325" + }, + { + "hostname": "e0404" + }, + { + "hostname": "e0415" + }, + { + "hostname": "e0433" + }, + { + "hostname": "e0437" + }, + { + "hostname": "e0439" + }, + { + "hostname": "e0501" + }, + { + "hostname": "e0503" + }, + { + "hostname": "e0505" + }, + { + "hostname": "e0506" + }, + { + "hostname": "e0512" + }, + { + "hostname": "e0513" + }, + { + "hostname": "e0514" + }, + { + "hostname": "e0653" + }, + { + 
"hostname": "e0701" + }, + { + "hostname": "e0716" + }, + { + "hostname": "e0727" + }, + { + "hostname": "e0728" + }, + { + "hostname": "e0925" + }, + { + "hostname": "e0926" + }, + { + "hostname": "e0929" + }, + { + "hostname": "e0934" + }, + { + "hostname": "e0951" + } + ], + "walltime": 10, + "jobState": "completed", + "cluster": "emmy", + "subCluster": "haswell", + "stopTime": 1609009562, + "user": "emmyUser6", + "startTime": 1608923076, + "partition": "work", + "tags": [], + "project": "no project", + "numNodes": 32, + "duration": 86486 } diff --git a/pkg/archive/testdata/archive/emmy/1404/397/1609300556/meta.json b/pkg/archive/testdata/archive/emmy/1404/397/1609300556/meta.json index e1fff10..c1e603a 100644 --- a/pkg/archive/testdata/archive/emmy/1404/397/1609300556/meta.json +++ b/pkg/archive/testdata/archive/emmy/1404/397/1609300556/meta.json @@ -1,194 +1,194 @@ { - "stopTime": 1609387081, - "resources": [ - { - "hostname": "e0151" - }, - { - "hostname": "e0152" - }, - { - "hostname": "e0153" - }, - { - "hostname": "e0232" - }, - { - "hostname": "e0303" - }, - { - "hostname": "e0314" - }, - { - "hostname": "e0344" - }, - { - "hostname": "e0345" - }, - { - "hostname": "e0348" - }, - { - "hostname": "e0507" - }, - { - "hostname": "e0518" - }, - { - "hostname": "e0520" - }, - { - "hostname": "e0522" - }, - { - "hostname": "e0526" - }, - { - "hostname": "e0527" - }, - { - "hostname": "e0528" - }, - { - "hostname": "e0530" - }, - { - "hostname": "e0551" - }, - { - "hostname": "e0604" - }, - { - "hostname": "e0613" - }, - { - "hostname": "e0634" - }, - { - "hostname": "e0639" - }, - { - "hostname": "e0640" - }, - { - "hostname": "e0651" - }, - { - "hostname": "e0653" - }, - { - "hostname": "e0701" - }, - { - "hostname": "e0704" - }, - { - "hostname": "e0751" - }, - { - "hostname": "e0809" - }, - { - "hostname": "e0814" - }, - { - "hostname": "e0819" - }, - { - "hostname": "e0908" - } - ], - "walltime": 10, - "cluster": "emmy", - "subCluster": "haswell", - "jobState": "completed", - "statistics": { - "clock": { - "max": 2634.9, - "unit": { - "base": "Hz", - "prefix": "M" - }, - "min": 0, - "avg": 2597.8 - }, - "cpu_load": { - "max": 27.41, - "min": 0, - "avg": 18.39, - "unit": { - "base": "load" - } - }, - "mem_bw": { - "min": 0, - "avg": 63.23, - "unit": { - "base": "B/s", - "prefix": "G" - }, - "max": 75.06 - }, - "ipc": { - "min": 0.0, - "avg": 1.53846153846154, - "unit": { - "base": "IPC" - }, - "max": 0.490196078431373 - }, - "rapl_power": { - "min": 0, - "avg": 227.32, - "unit": { - "base": "W" - }, - "max": 256.22 - }, - "mem_used": { - "min": 1.5, - "avg": 27.77, - "unit": { - "base": "B", - "prefix": "G" - }, - "max": 37.43 - }, - "flops_sp": { - "unit": { - "base": "F/s", - "prefix": "G" - }, - "max": 413.21, - "min": 0, - "avg": 224.41 - }, - "flops_dp": { - "max": 5.72, - "unit": { - "base": "F/s", - "prefix": "G" - }, - "min": 0, - "avg": 0 - }, - "flops_any": { - "min": 0, - "avg": 224.42, - "max": 413.21, - "unit": { - "base": "F/s", - "prefix": "G" - } - } + "stopTime": 1609387081, + "resources": [ + { + "hostname": "e0151" }, - "exclusive": 1, - "jobId": 1404397, - "tags": [], - "partition": "work", - "project": "no project", - "user": "emmyUser6", - "startTime": 1609300556, - "duration": 86525, - "numNodes": 32 + { + "hostname": "e0152" + }, + { + "hostname": "e0153" + }, + { + "hostname": "e0232" + }, + { + "hostname": "e0303" + }, + { + "hostname": "e0314" + }, + { + "hostname": "e0344" + }, + { + "hostname": "e0345" + }, + { + "hostname": "e0348" + }, + { + 
"hostname": "e0507" + }, + { + "hostname": "e0518" + }, + { + "hostname": "e0520" + }, + { + "hostname": "e0522" + }, + { + "hostname": "e0526" + }, + { + "hostname": "e0527" + }, + { + "hostname": "e0528" + }, + { + "hostname": "e0530" + }, + { + "hostname": "e0551" + }, + { + "hostname": "e0604" + }, + { + "hostname": "e0613" + }, + { + "hostname": "e0634" + }, + { + "hostname": "e0639" + }, + { + "hostname": "e0640" + }, + { + "hostname": "e0651" + }, + { + "hostname": "e0653" + }, + { + "hostname": "e0701" + }, + { + "hostname": "e0704" + }, + { + "hostname": "e0751" + }, + { + "hostname": "e0809" + }, + { + "hostname": "e0814" + }, + { + "hostname": "e0819" + }, + { + "hostname": "e0908" + } + ], + "walltime": 10, + "cluster": "emmy", + "subCluster": "haswell", + "jobState": "completed", + "statistics": { + "clock": { + "max": 2634.9, + "unit": { + "base": "Hz", + "prefix": "M" + }, + "min": 0, + "avg": 2597.8 + }, + "cpu_load": { + "max": 27.41, + "min": 0, + "avg": 18.39, + "unit": { + "base": "load" + } + }, + "mem_bw": { + "min": 0, + "avg": 63.23, + "unit": { + "base": "B/s", + "prefix": "G" + }, + "max": 75.06 + }, + "ipc": { + "min": 0.0, + "avg": 1.53846153846154, + "unit": { + "base": "IPC" + }, + "max": 0.490196078431373 + }, + "rapl_power": { + "min": 0, + "avg": 227.32, + "unit": { + "base": "W" + }, + "max": 256.22 + }, + "mem_used": { + "min": 1.5, + "avg": 27.77, + "unit": { + "base": "B", + "prefix": "G" + }, + "max": 37.43 + }, + "flops_sp": { + "unit": { + "base": "F/s", + "prefix": "G" + }, + "max": 413.21, + "min": 0, + "avg": 224.41 + }, + "flops_dp": { + "max": 5.72, + "unit": { + "base": "F/s", + "prefix": "G" + }, + "min": 0, + "avg": 0 + }, + "flops_any": { + "min": 0, + "avg": 224.42, + "max": 413.21, + "unit": { + "base": "F/s", + "prefix": "G" + } + } + }, + "shared": "none", + "jobId": 1404397, + "tags": [], + "partition": "work", + "project": "no project", + "user": "emmyUser6", + "startTime": 1609300556, + "duration": 86525, + "numNodes": 32 } diff --git a/startDemo.sh b/startDemo.sh index faf6d35..8087b1c 100755 --- a/startDemo.sh +++ b/startDemo.sh @@ -13,5 +13,7 @@ else cp ./configs/config-demo.json config.json ./cc-backend -migrate-db - ./cc-backend -server -dev -init-db -add-user demo:admin:demo + ./cc-backend -dev -init-db -add-user demo:admin,api:demo + ./cc-backend -server -dev + fi diff --git a/web/frontend/src/Job.root.svelte b/web/frontend/src/Job.root.svelte index d481436..b003434 100644 --- a/web/frontend/src/Job.root.svelte +++ b/web/frontend/src/Job.root.svelte @@ -56,7 +56,7 @@ job(id: "${dbid}") { id, jobId, user, project, cluster, startTime, duration, numNodes, numHWThreads, numAcc, energy, - SMT, exclusive, partition, subCluster, arrayJobId, + SMT, shared, partition, subCluster, arrayJobId, monitoringStatus, state, walltime, tags { id, type, scope, name }, resources { hostname, hwthreads, accelerators }, @@ -325,7 +325,7 @@ metricUnit={$initq.data.globalMetrics.find((gm) => gm.name == item.metric)?.unit} nativeScope={$initq.data.globalMetrics.find((gm) => gm.name == item.metric)?.scope} presetScopes={item.data.map((x) => x.scope)} - isShared={$initq.data.job.exclusive != 1} + isShared={$initq.data.job.shared != "none"} /> {:else if item.disabled == true} diff --git a/web/frontend/src/generic/JobList.svelte b/web/frontend/src/generic/JobList.svelte index dc6def2..5ca8981 100644 --- a/web/frontend/src/generic/JobList.svelte +++ b/web/frontend/src/generic/JobList.svelte @@ -69,7 +69,7 @@ hostname } SMT - exclusive + shared partition 
arrayJobId monitoringStatus diff --git a/web/frontend/src/generic/joblist/JobInfo.svelte b/web/frontend/src/generic/joblist/JobInfo.svelte index f56d800..794efe9 100644 --- a/web/frontend/src/generic/joblist/JobInfo.svelte +++ b/web/frontend/src/generic/joblist/JobInfo.svelte @@ -172,7 +172,7 @@ {job.numNodes} {/if} - {#if job.exclusive != 1} + {#if job.shared != "none"} (shared) {/if} {#if job.numAcc > 0} diff --git a/web/frontend/src/generic/joblist/JobListRow.svelte b/web/frontend/src/generic/joblist/JobListRow.svelte index b17f66d..28574d9 100644 --- a/web/frontend/src/generic/joblist/JobListRow.svelte +++ b/web/frontend/src/generic/joblist/JobListRow.svelte @@ -213,7 +213,7 @@ metric={metric.data.name} cluster={cluster.find((c) => c.name == job.cluster)} subCluster={job.subCluster} - isShared={job.exclusive != 1} + isShared={job.shared != "none"} numhwthreads={job.numHWThreads} numaccs={job.numAcc} zoomState={zoomStates[metric.data.name] || null} diff --git a/web/frontend/src/systems/nodelist/NodeInfo.svelte b/web/frontend/src/systems/nodelist/NodeInfo.svelte index 363379f..77e7416 100644 --- a/web/frontend/src/systems/nodelist/NodeInfo.svelte +++ b/web/frontend/src/systems/nodelist/NodeInfo.svelte @@ -92,7 +92,7 @@ Missing Metric - {:else if nodeJobsData.jobs.count == 1 && nodeJobsData.jobs.items[0].exclusive} + {:else if nodeJobsData.jobs.count == 1 && nodeJobsData.jobs.items[0].shared == "none"} @@ -104,7 +104,7 @@ Exclusive - {:else if nodeJobsData.jobs.count >= 1 && !nodeJobsData.jobs.items[0].exclusive} + {:else if nodeJobsData.jobs.count >= 1 && !(nodeJobsData.jobs.items[0].shared == "none")} diff --git a/web/frontend/src/systems/nodelist/NodeListRow.svelte b/web/frontend/src/systems/nodelist/NodeListRow.svelte index 5cdf493..a9111f6 100644 --- a/web/frontend/src/systems/nodelist/NodeListRow.svelte +++ b/web/frontend/src/systems/nodelist/NodeListRow.svelte @@ -45,7 +45,7 @@ jobId user project - exclusive + shared resources { hostname accelerators @@ -101,7 +101,7 @@ function buildExtendedLegend() { let pendingExtendedLegendData = null // Build Extended for allocated nodes [Commented: Only Build extended Legend For Shared Nodes] - if ($nodeJobsData.data.jobs.count >= 1) { // "&& !$nodeJobsData.data.jobs.items[0].exclusive)" + if ($nodeJobsData.data.jobs.count >= 1) { const accSet = Array.from(new Set($nodeJobsData.data.jobs.items .map((i) => i.resources .filter((r) => (r.hostname === nodeData.host) && r?.accelerators)
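Taken together, the repository changes in this patch (jobFind.go, jobQuery.go, stats.go) route the cluster and exclusivity filters to the renamed columns job.hpc_cluster and job.shared. Below is a minimal standalone sketch of the resulting predicate using the squirrel query builder (imported as sq in the repository code above); the values "emmy" and "none" are placeholder examples taken from the test data and the new enum, not part of the patch itself.

package main

import (
	"fmt"

	sq "github.com/Masterminds/squirrel"
)

func main() {
	// Previously: Where("job.cluster = ?", cluster) and Where("job.exclusive = ?", 1).
	q := sq.Select("job.id", "job.job_id", "job.hpc_user").
		From("job").
		Where("job.hpc_cluster = ?", "emmy").
		Where("job.shared = ?", "none")

	query, args, err := q.ToSql()
	if err != nil {
		panic(err)
	}
	// Prints: SELECT job.id, job.job_id, job.hpc_user FROM job
	//         WHERE job.hpc_cluster = ? AND job.shared = ? [emmy none]
	fmt.Println(query, args)
}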