diff --git a/CLAUDE.md b/CLAUDE.md
index 406f11ba..a8d56571 100644
--- a/CLAUDE.md
+++ b/CLAUDE.md
@@ -22,7 +22,7 @@ make
 make frontend
 
 # Build only the backend (requires frontend to be built first)
-go build -ldflags='-s -X main.date=$(date +"%Y-%m-%d:T%H:%M:%S") -X main.version=1.4.4 -X main.commit=$(git rev-parse --short HEAD)' ./cmd/cc-backend
+go build -ldflags='-s -X main.date=$(date +"%Y-%m-%d:T%H:%M:%S") -X main.version=1.5.0 -X main.commit=$(git rev-parse --short HEAD)' ./cmd/cc-backend
 ```
 
 ### Testing
@@ -41,7 +41,7 @@ go test ./internal/repository
 ### Code Generation
 
 ```bash
-# Regenerate GraphQL schema and resolvers (after modifying api/*.graphqls)
+# Regenerate GraphQL schema and resolvers (after modifying api/schema.graphqls)
 make graphql
 
 # Regenerate Swagger/OpenAPI docs (after modifying API comments)
@@ -90,7 +90,7 @@ The backend follows a layered architecture with clear separation of concerns:
   - Transaction support for batch operations
 - **internal/api**: REST API endpoints (Swagger/OpenAPI documented)
 - **internal/graph**: GraphQL API (uses gqlgen)
-  - Schema in `api/*.graphqls`
+  - Schema in `api/schema.graphqls`
   - Generated code in `internal/graph/generated/`
   - Resolvers in `internal/graph/schema.resolvers.go`
 - **internal/auth**: Authentication layer
@@ -108,7 +108,8 @@ The backend follows a layered architecture with clear separation of concerns:
   - File system backend (default)
   - S3 backend
   - SQLite backend (experimental)
-- **pkg/nats**: NATS client and message decoding utilities
+  - **parquet** sub-package: Parquet format support (schema, reader, writer, conversion)
+- **internal/metricstoreclient**: Client for cc-metric-store queries
 
 ### Frontend Structure
 
@@ -138,7 +139,7 @@ recommended). Configuration is per-cluster in `config.json`.
 3. The first authenticator that returns true performs the actual `Login`
 4. JWT tokens are used for API authentication
 
-**Database Migrations**: SQL migrations in `internal/repository/migrations/` are
+**Database Migrations**: SQL migrations in `internal/repository/migrations/sqlite3/` are
 applied automatically on startup. Version tracking in `version` table.
 
 **Scopes**: Metrics can be collected at different scopes:
@@ -173,7 +174,7 @@ applied automatically on startup. Version tracking in `version` table.
 
 **GraphQL** (gqlgen):
 
-- Schema: `api/*.graphqls`
+- Schema: `api/schema.graphqls`
 - Config: `gqlgen.yml`
 - Generated code: `internal/graph/generated/`
 - Custom resolvers: `internal/graph/schema.resolvers.go`
@@ -182,7 +183,7 @@ applied automatically on startup. Version tracking in `version` table.
 **Swagger/OpenAPI**:
 
 - Annotations in `internal/api/*.go`
-- Generated docs: `api/docs.go`, `api/swagger.yaml`
+- Generated docs: `internal/api/docs.go`, `api/swagger.yaml`
 - Run `make swagger` after API changes
 
 ## Testing Conventions
@@ -196,7 +197,7 @@ applied automatically on startup. Version tracking in `version` table.
 
 ### Adding a new GraphQL field
 
-1. Edit schema in `api/*.graphqls`
+1. Edit schema in `api/schema.graphqls`
 2. Run `make graphql`
 3. Implement resolver in `internal/graph/schema.resolvers.go`
@@ -215,7 +216,7 @@ applied automatically on startup. Version tracking in `version` table.
 
 ### Modifying database schema
 
-1. Create new migration in `internal/repository/migrations/`
+1. Create new migration in `internal/repository/migrations/sqlite3/`
 2. Increment `repository.Version`
 3. Test with fresh database and existing database
diff --git a/README.md b/README.md
index 475401f4..3306f838 100644
--- a/README.md
+++ b/README.md
@@ -173,18 +173,19 @@ ln -s ./var/job-archive
     Job classification and application detection
   - [`taskmanager`](https://github.com/ClusterCockpit/cc-backend/tree/master/internal/taskmanager)
     Background task management and scheduled jobs
+  - [`metricstoreclient`](https://github.com/ClusterCockpit/cc-backend/tree/master/internal/metricstoreclient)
+    Client for cc-metric-store queries
 - [`pkg/`](https://github.com/ClusterCockpit/cc-backend/tree/master/pkg) contains Go packages that can
   be used by other projects.
   - [`archive`](https://github.com/ClusterCockpit/cc-backend/tree/master/pkg/archive)
-    Job archive backend implementations (filesystem, S3)
+    Job archive backend implementations (filesystem, S3, SQLite)
   - [`metricstore`](https://github.com/ClusterCockpit/cc-backend/tree/master/pkg/metricstore)
     In-memory metric data store with checkpointing and metric loading
-  - [`nats`](https://github.com/ClusterCockpit/cc-backend/tree/master/pkg/nats)
-    NATS client and message handling
 - [`tools/`](https://github.com/ClusterCockpit/cc-backend/tree/master/tools)
   Additional command line helper tools.
   - [`archive-manager`](https://github.com/ClusterCockpit/cc-backend/tree/master/tools/archive-manager)
-    Commands for getting infos about an existing job archive.
+    Commands for getting infos about an existing job archive, importing jobs
+    between archive backends, and converting archives between JSON and Parquet formats.
   - [`archive-migration`](https://github.com/ClusterCockpit/cc-backend/tree/master/tools/archive-migration)
     Tool for migrating job archives between formats.
   - [`convert-pem-pubkey`](https://github.com/ClusterCockpit/cc-backend/tree/master/tools/convert-pem-pubkey)
diff --git a/cmd/cc-backend/main.go b/cmd/cc-backend/main.go
index 3c70a960..3ee05383 100644
--- a/cmd/cc-backend/main.go
+++ b/cmd/cc-backend/main.go
@@ -248,7 +248,7 @@ func generateJWT(authHandle *auth.Authentication, username string) error {
 		return fmt.Errorf("getting user '%s': %w", username, err)
 	}
 
-	if !user.HasRole(schema.RoleApi) {
+	if !user.HasRole(schema.RoleAPI) {
 		cclog.Warnf("JWT: User '%s' does not have the role 'api'. 
REST API endpoints will return error!\n", user.Username) } diff --git a/cmd/cc-backend/server.go b/cmd/cc-backend/server.go index 68cc4736..91f8360f 100644 --- a/cmd/cc-backend/server.go +++ b/cmd/cc-backend/server.go @@ -14,7 +14,6 @@ import ( "encoding/json" "errors" "fmt" - "io" "net" "net/http" "os" @@ -36,8 +35,9 @@ import ( cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger" "github.com/ClusterCockpit/cc-lib/v2/nats" "github.com/ClusterCockpit/cc-lib/v2/runtime" - "github.com/gorilla/handlers" - "github.com/gorilla/mux" + "github.com/go-chi/chi/v5" + "github.com/go-chi/chi/v5/middleware" + "github.com/go-chi/cors" httpSwagger "github.com/swaggo/http-swagger" ) @@ -50,7 +50,7 @@ const ( // Server encapsulates the HTTP server state and dependencies type Server struct { - router *mux.Router + router chi.Router server *http.Server restAPIHandle *api.RestAPI natsAPIHandle *api.NatsAPI @@ -70,7 +70,7 @@ func NewServer(version, commit, buildDate string) (*Server, error) { buildInfo = web.Build{Version: version, Hash: commit, Buildtime: buildDate} s := &Server{ - router: mux.NewRouter(), + router: chi.NewRouter(), } if err := s.init(); err != nil { @@ -106,6 +106,27 @@ func (s *Server) init() error { authHandle := auth.GetAuthInstance() + // Middleware must be defined before routes in chi + s.router.Use(func(next http.Handler) http.Handler { + return http.HandlerFunc(func(rw http.ResponseWriter, r *http.Request) { + start := time.Now() + ww := middleware.NewWrapResponseWriter(rw, r.ProtoMajor) + next.ServeHTTP(ww, r) + cclog.Debugf("%s %s (%d, %.02fkb, %dms)", + r.Method, r.URL.RequestURI(), + ww.Status(), float32(ww.BytesWritten())/1024, + time.Since(start).Milliseconds()) + }) + }) + s.router.Use(middleware.Compress(5)) + s.router.Use(middleware.Recoverer) + s.router.Use(cors.Handler(cors.Options{ + AllowCredentials: true, + AllowedHeaders: []string{"X-Requested-With", "Content-Type", "Authorization", "Origin"}, + AllowedMethods: []string{"GET", "POST", "HEAD", "OPTIONS"}, + AllowedOrigins: []string{"*"}, + })) + s.restAPIHandle = api.New() info := map[string]any{} @@ -117,11 +138,11 @@ func (s *Server) init() error { info["hasOpenIDConnect"] = true } - s.router.HandleFunc("/login", func(rw http.ResponseWriter, r *http.Request) { + s.router.Get("/login", func(rw http.ResponseWriter, r *http.Request) { rw.Header().Add("Content-Type", "text/html; charset=utf-8") cclog.Debugf("##%v##", info) web.RenderTemplate(rw, "login.tmpl", &web.Page{Title: "Login", Build: buildInfo, Infos: info}) - }).Methods(http.MethodGet) + }) s.router.HandleFunc("/imprint", func(rw http.ResponseWriter, r *http.Request) { rw.Header().Add("Content-Type", "text/html; charset=utf-8") web.RenderTemplate(rw, "imprint.tmpl", &web.Page{Title: "Imprint", Build: buildInfo}) @@ -131,13 +152,6 @@ func (s *Server) init() error { web.RenderTemplate(rw, "privacy.tmpl", &web.Page{Title: "Privacy", Build: buildInfo}) }) - secured := s.router.PathPrefix("/").Subrouter() - securedapi := s.router.PathPrefix("/api").Subrouter() - userapi := s.router.PathPrefix("/userapi").Subrouter() - configapi := s.router.PathPrefix("/config").Subrouter() - frontendapi := s.router.PathPrefix("/frontend").Subrouter() - metricstoreapi := s.router.PathPrefix("/api").Subrouter() - if !config.Keys.DisableAuthentication { // Create login failure handler (used by both /login and /jwt-login) loginFailureHandler := func(rw http.ResponseWriter, r *http.Request, err error) { @@ -152,10 +166,10 @@ func (s *Server) init() error { }) } - s.router.Handle("/login", 
authHandle.Login(loginFailureHandler)).Methods(http.MethodPost) - s.router.Handle("/jwt-login", authHandle.Login(loginFailureHandler)) + s.router.Post("/login", authHandle.Login(loginFailureHandler).ServeHTTP) + s.router.HandleFunc("/jwt-login", authHandle.Login(loginFailureHandler).ServeHTTP) - s.router.Handle("/logout", authHandle.Logout( + s.router.Post("/logout", authHandle.Logout( http.HandlerFunc(func(rw http.ResponseWriter, r *http.Request) { rw.Header().Add("Content-Type", "text/html; charset=utf-8") rw.WriteHeader(http.StatusOK) @@ -166,87 +180,97 @@ func (s *Server) init() error { Build: buildInfo, Infos: info, }) - }))).Methods(http.MethodPost) - - secured.Use(func(next http.Handler) http.Handler { - return authHandle.Auth( - // On success; - next, - - // On failure: - func(rw http.ResponseWriter, r *http.Request, err error) { - rw.WriteHeader(http.StatusUnauthorized) - web.RenderTemplate(rw, "login.tmpl", &web.Page{ - Title: "Authentication failed - ClusterCockpit", - MsgType: "alert-danger", - Message: err.Error(), - Build: buildInfo, - Infos: info, - Redirect: r.RequestURI, - }) - }) - }) - - securedapi.Use(func(next http.Handler) http.Handler { - return authHandle.AuthAPI( - // On success; - next, - // On failure: JSON Response - onFailureResponse) - }) - - userapi.Use(func(next http.Handler) http.Handler { - return authHandle.AuthUserAPI( - // On success; - next, - // On failure: JSON Response - onFailureResponse) - }) - - metricstoreapi.Use(func(next http.Handler) http.Handler { - return authHandle.AuthMetricStoreAPI( - // On success; - next, - // On failure: JSON Response - onFailureResponse) - }) - - configapi.Use(func(next http.Handler) http.Handler { - return authHandle.AuthConfigAPI( - // On success; - next, - // On failure: JSON Response - onFailureResponse) - }) - - frontendapi.Use(func(next http.Handler) http.Handler { - return authHandle.AuthFrontendAPI( - // On success; - next, - // On failure: JSON Response - onFailureResponse) - }) + })).ServeHTTP) } if flagDev { s.router.Handle("/playground", playground.Handler("GraphQL playground", "/query")) - s.router.PathPrefix("/swagger/").Handler(httpSwagger.Handler( - httpSwagger.URL("http://" + config.Keys.Addr + "/swagger/doc.json"))).Methods(http.MethodGet) + s.router.Get("/swagger/*", httpSwagger.Handler( + httpSwagger.URL("http://"+config.Keys.Addr+"/swagger/doc.json"))) } - secured.Handle("/query", graphQLServer) - // Send a searchId and then reply with a redirect to a user, or directly send query to job table for jobid and project. - secured.HandleFunc("/search", func(rw http.ResponseWriter, r *http.Request) { - routerConfig.HandleSearchBar(rw, r, buildInfo) + // Secured routes (require authentication) + s.router.Group(func(secured chi.Router) { + if !config.Keys.DisableAuthentication { + secured.Use(func(next http.Handler) http.Handler { + return authHandle.Auth( + next, + func(rw http.ResponseWriter, r *http.Request, err error) { + rw.WriteHeader(http.StatusUnauthorized) + web.RenderTemplate(rw, "login.tmpl", &web.Page{ + Title: "Authentication failed - ClusterCockpit", + MsgType: "alert-danger", + Message: err.Error(), + Build: buildInfo, + Infos: info, + Redirect: r.RequestURI, + }) + }) + }) + } + + secured.Handle("/query", graphQLServer) + + secured.HandleFunc("/search", func(rw http.ResponseWriter, r *http.Request) { + routerConfig.HandleSearchBar(rw, r, buildInfo) + }) + + routerConfig.SetupRoutes(secured, buildInfo) }) - // Mount all /monitoring/... and /api/... routes. 
- routerConfig.SetupRoutes(secured, buildInfo) - s.restAPIHandle.MountAPIRoutes(securedapi) - s.restAPIHandle.MountUserAPIRoutes(userapi) - s.restAPIHandle.MountConfigAPIRoutes(configapi) - s.restAPIHandle.MountLogAPIRoutes(configapi) - s.restAPIHandle.MountFrontendAPIRoutes(frontendapi) + // API routes (JWT token auth) + s.router.Route("/api", func(apiRouter chi.Router) { + // Main API routes with API auth + apiRouter.Group(func(securedapi chi.Router) { + if !config.Keys.DisableAuthentication { + securedapi.Use(func(next http.Handler) http.Handler { + return authHandle.AuthAPI(next, onFailureResponse) + }) + } + s.restAPIHandle.MountAPIRoutes(securedapi) + }) + + // Metric store API routes with separate auth + apiRouter.Group(func(metricstoreapi chi.Router) { + if !config.Keys.DisableAuthentication { + metricstoreapi.Use(func(next http.Handler) http.Handler { + return authHandle.AuthMetricStoreAPI(next, onFailureResponse) + }) + } + s.restAPIHandle.MountMetricStoreAPIRoutes(metricstoreapi) + }) + }) + + // User API routes + s.router.Route("/userapi", func(userapi chi.Router) { + if !config.Keys.DisableAuthentication { + userapi.Use(func(next http.Handler) http.Handler { + return authHandle.AuthUserAPI(next, onFailureResponse) + }) + } + s.restAPIHandle.MountUserAPIRoutes(userapi) + }) + + // Config API routes (uses Group with full paths to avoid shadowing + // the /config page route that is registered in the secured group) + s.router.Group(func(configapi chi.Router) { + if !config.Keys.DisableAuthentication { + configapi.Use(func(next http.Handler) http.Handler { + return authHandle.AuthConfigAPI(next, onFailureResponse) + }) + } + s.restAPIHandle.MountConfigAPIRoutes(configapi) + }) + + // Frontend API routes + s.router.Route("/frontend", func(frontendapi chi.Router) { + if !config.Keys.DisableAuthentication { + frontendapi.Use(func(next http.Handler) http.Handler { + return authHandle.AuthFrontendAPI(next, onFailureResponse) + }) + } + s.restAPIHandle.MountFrontendAPIRoutes(frontendapi) + }) if config.Keys.APISubjects != nil { s.natsAPIHandle = api.NewNatsAPI() @@ -255,28 +279,59 @@ func (s *Server) init() error { } } - s.restAPIHandle.MountMetricStoreAPIRoutes(metricstoreapi) + // 404 handler for pages and API routes + notFoundHandler := func(rw http.ResponseWriter, r *http.Request) { + if strings.HasPrefix(r.URL.Path, "/api/") || strings.HasPrefix(r.URL.Path, "/userapi/") || + strings.HasPrefix(r.URL.Path, "/frontend/") || strings.HasPrefix(r.URL.Path, "/config/") { + rw.Header().Set("Content-Type", "application/json") + rw.WriteHeader(http.StatusNotFound) + json.NewEncoder(rw).Encode(map[string]string{ + "status": "Resource not found", + "error": "the requested endpoint does not exist", + }) + return + } + rw.Header().Set("Content-Type", "text/html; charset=utf-8") + rw.WriteHeader(http.StatusNotFound) + web.RenderTemplate(rw, "404.tmpl", &web.Page{ + Title: "Page Not Found", + Build: buildInfo, + }) + } + + // Set NotFound on the router so chi uses it for all unmatched routes, + // including those under subrouters like /api, /userapi, /frontend, etc. 
+ s.router.NotFound(notFoundHandler) if config.Keys.EmbedStaticFiles { if i, err := os.Stat("./var/img"); err == nil { if i.IsDir() { cclog.Info("Use local directory for static images") - s.router.PathPrefix("/img/").Handler(http.StripPrefix("/img/", http.FileServer(http.Dir("./var/img")))) + s.router.Handle("/img/*", http.StripPrefix("/img/", http.FileServer(http.Dir("./var/img")))) } } - s.router.PathPrefix("/").Handler(http.StripPrefix("/", web.ServeFiles())) + fileServer := http.StripPrefix("/", web.ServeFiles()) + s.router.Handle("/*", http.HandlerFunc(func(rw http.ResponseWriter, r *http.Request) { + if web.StaticFileExists(r.URL.Path) { + fileServer.ServeHTTP(rw, r) + return + } + notFoundHandler(rw, r) + })) } else { - s.router.PathPrefix("/").Handler(http.FileServer(http.Dir(config.Keys.StaticFiles))) + staticDir := http.Dir(config.Keys.StaticFiles) + fileServer := http.FileServer(staticDir) + s.router.Handle("/*", http.HandlerFunc(func(rw http.ResponseWriter, r *http.Request) { + f, err := staticDir.Open(r.URL.Path) + if err == nil { + f.Close() + fileServer.ServeHTTP(rw, r) + return + } + notFoundHandler(rw, r) + })) } - s.router.Use(handlers.CompressHandler) - s.router.Use(handlers.RecoveryHandler(handlers.PrintRecoveryStack(true))) - s.router.Use(handlers.CORS( - handlers.AllowCredentials(), - handlers.AllowedHeaders([]string{"X-Requested-With", "Content-Type", "Authorization", "Origin"}), - handlers.AllowedMethods([]string{"GET", "POST", "HEAD", "OPTIONS"}), - handlers.AllowedOrigins([]string{"*"}))) - return nil } @@ -287,20 +342,6 @@ const ( ) func (s *Server) Start(ctx context.Context) error { - handler := handlers.CustomLoggingHandler(io.Discard, s.router, func(_ io.Writer, params handlers.LogFormatterParams) { - if strings.HasPrefix(params.Request.RequestURI, "/api/") { - cclog.Debugf("%s %s (%d, %.02fkb, %dms)", - params.Request.Method, params.URL.RequestURI(), - params.StatusCode, float32(params.Size)/1024, - time.Since(params.TimeStamp).Milliseconds()) - } else { - cclog.Debugf("%s %s (%d, %.02fkb, %dms)", - params.Request.Method, params.URL.RequestURI(), - params.StatusCode, float32(params.Size)/1024, - time.Since(params.TimeStamp).Milliseconds()) - } - }) - // Use configurable timeouts with defaults readTimeout := time.Duration(defaultReadTimeout) * time.Second writeTimeout := time.Duration(defaultWriteTimeout) * time.Second @@ -308,7 +349,7 @@ func (s *Server) Start(ctx context.Context) error { s.server = &http.Server{ ReadTimeout: readTimeout, WriteTimeout: writeTimeout, - Handler: handler, + Handler: s.router, Addr: config.Keys.Addr, } diff --git a/go.mod b/go.mod index da712da9..f9bf7e42 100644 --- a/go.mod +++ b/go.mod @@ -1,8 +1,6 @@ module github.com/ClusterCockpit/cc-backend -go 1.24.0 - -toolchain go1.24.1 +go 1.24.9 tool ( github.com/99designs/gqlgen @@ -11,7 +9,7 @@ tool ( require ( github.com/99designs/gqlgen v0.17.85 - github.com/ClusterCockpit/cc-lib/v2 v2.2.1 + github.com/ClusterCockpit/cc-lib/v2 v2.4.0 github.com/Masterminds/squirrel v1.5.4 github.com/aws/aws-sdk-go-v2 v1.41.1 github.com/aws/aws-sdk-go-v2/config v1.32.6 @@ -19,19 +17,20 @@ require ( github.com/aws/aws-sdk-go-v2/service/s3 v1.95.0 github.com/coreos/go-oidc/v3 v3.17.0 github.com/expr-lang/expr v1.17.7 + github.com/go-chi/chi/v5 v5.2.5 + github.com/go-chi/cors v1.2.2 github.com/go-co-op/gocron/v2 v2.19.0 github.com/go-ldap/ldap/v3 v3.4.12 github.com/golang-jwt/jwt/v5 v5.3.0 github.com/golang-migrate/migrate/v4 v4.19.1 github.com/google/gops v0.3.28 - github.com/gorilla/handlers v1.5.2 - 
github.com/gorilla/mux v1.8.1 github.com/gorilla/sessions v1.4.0 github.com/influxdata/line-protocol/v2 v2.2.1 github.com/jmoiron/sqlx v1.4.0 github.com/joho/godotenv v1.5.1 github.com/linkedin/goavro/v2 v2.14.1 github.com/mattn/go-sqlite3 v1.14.33 + github.com/parquet-go/parquet-go v0.27.0 github.com/qustavo/sqlhooks/v2 v2.1.0 github.com/santhosh-tekuri/jsonschema/v5 v5.3.1 github.com/stretchr/testify v1.11.1 @@ -47,6 +46,7 @@ require ( github.com/Azure/go-ntlmssp v0.0.0-20221128193559-754e69321358 // indirect github.com/KyleBanks/depth v1.2.1 // indirect github.com/agnivade/levenshtein v1.2.1 // indirect + github.com/andybalholm/brotli v1.1.1 // indirect github.com/apapsch/go-jsonmerge/v2 v2.0.0 // indirect github.com/aws/aws-sdk-go-v2/aws/protocol/eventstream v1.7.4 // indirect github.com/aws/aws-sdk-go-v2/feature/ec2/imds v1.18.17 // indirect @@ -65,7 +65,6 @@ require ( github.com/aws/smithy-go v1.24.0 // indirect github.com/cpuguy83/go-md2man/v2 v2.0.7 // indirect github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc // indirect - github.com/felixge/httpsnoop v1.0.4 // indirect github.com/fsnotify/fsnotify v1.9.0 // indirect github.com/go-asn1-ber/asn1-ber v1.5.8-0.20250403174932-29230038a667 // indirect github.com/go-jose/go-jose/v4 v4.1.3 // indirect @@ -82,7 +81,6 @@ require ( github.com/go-viper/mapstructure/v2 v2.4.0 // indirect github.com/goccy/go-yaml v1.19.0 // indirect github.com/golang/snappy v0.0.4 // indirect - github.com/google/go-cmp v0.7.0 // indirect github.com/google/uuid v1.6.0 // indirect github.com/gorilla/securecookie v1.1.2 // indirect github.com/gorilla/websocket v1.5.3 // indirect @@ -98,6 +96,9 @@ require ( github.com/nats-io/nkeys v0.4.12 // indirect github.com/nats-io/nuid v1.0.1 // indirect github.com/oapi-codegen/runtime v1.1.1 // indirect + github.com/parquet-go/bitpack v1.0.0 // indirect + github.com/parquet-go/jsonlite v1.0.0 // indirect + github.com/pierrec/lz4/v4 v4.1.21 // indirect github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 // indirect github.com/prometheus/common v0.67.4 // indirect github.com/robfig/cron/v3 v3.0.1 // indirect @@ -106,6 +107,7 @@ require ( github.com/stmcginnis/gofish v0.20.0 // indirect github.com/stretchr/objx v0.5.2 // indirect github.com/swaggo/files v1.0.1 // indirect + github.com/twpayne/go-geom v1.6.1 // indirect github.com/urfave/cli/v2 v2.27.7 // indirect github.com/urfave/cli/v3 v3.6.1 // indirect github.com/xrash/smetrics v0.0.0-20250705151800-55b8f293f342 // indirect @@ -118,6 +120,7 @@ require ( golang.org/x/sys v0.39.0 // indirect golang.org/x/text v0.32.0 // indirect golang.org/x/tools v0.40.0 // indirect + google.golang.org/protobuf v1.36.11 // indirect gopkg.in/yaml.v3 v3.0.1 // indirect sigs.k8s.io/yaml v1.6.0 // indirect ) diff --git a/go.sum b/go.sum index 43331fce..509c659c 100644 --- a/go.sum +++ b/go.sum @@ -4,8 +4,10 @@ github.com/99designs/gqlgen v0.17.85 h1:EkGx3U2FDcxQm8YDLQSpXIAVmpDyZ3IcBMOJi2nH github.com/99designs/gqlgen v0.17.85/go.mod h1:yvs8s0bkQlRfqg03YXr3eR4OQUowVhODT/tHzCXnbOU= github.com/Azure/go-ntlmssp v0.0.0-20221128193559-754e69321358 h1:mFRzDkZVAjdal+s7s0MwaRv9igoPqLRdzOLzw/8Xvq8= github.com/Azure/go-ntlmssp v0.0.0-20221128193559-754e69321358/go.mod h1:chxPXzSsl7ZWRAuOIE23GDNzjWuZquvFlgA8xmpunjU= -github.com/ClusterCockpit/cc-lib/v2 v2.2.1 h1:iCVas+Jc61zFH5S2VG3H1sc7tsn+U4lOJwUYjYZEims= -github.com/ClusterCockpit/cc-lib/v2 v2.2.1/go.mod h1:JuxMAuEOaLLNEnnL9U3ejha8kMvsSatLdKPZEgJw6iw= +github.com/ClusterCockpit/cc-lib/v2 v2.4.0 
h1:OnZlvqSatg7yCQ2NtSR7AddpUVSiuSMZ8scF1a7nfOk= +github.com/ClusterCockpit/cc-lib/v2 v2.4.0/go.mod h1:JuxMAuEOaLLNEnnL9U3ejha8kMvsSatLdKPZEgJw6iw= +github.com/DATA-DOG/go-sqlmock v1.5.2 h1:OcvFkGmslmlZibjAjaHm3L//6LiuBgolP7OputlJIzU= +github.com/DATA-DOG/go-sqlmock v1.5.2/go.mod h1:88MAG/4G7SMwSE3CeA0ZKzrT5CiOU3OJ+JlNzwDqpNU= github.com/KyleBanks/depth v1.2.1 h1:5h8fQADFrWtarTdtDudMmGsC7GPbOAu6RVB3ffsVFHc= github.com/KyleBanks/depth v1.2.1/go.mod h1:jzSb9d0L43HxTQfT+oSA1EEp2q+ne2uh6XgeJcm8brE= github.com/Masterminds/squirrel v1.5.4 h1:uUcX/aBc8O7Fg9kaISIUsHXdKuqehiXAMQTYX8afzqM= @@ -17,10 +19,16 @@ github.com/PuerkitoBio/goquery v1.11.0/go.mod h1:wQHgxUOU3JGuj3oD/QFfxUdlzW6xPHf github.com/RaveNoX/go-jsoncommentstrip v1.0.0/go.mod h1:78ihd09MekBnJnxpICcwzCMzGrKSKYe4AqU6PDYYpjk= github.com/agnivade/levenshtein v1.2.1 h1:EHBY3UOn1gwdy/VbFwgo4cxecRznFk7fKWN1KOX7eoM= github.com/agnivade/levenshtein v1.2.1/go.mod h1:QVVI16kDrtSuwcpd0p1+xMC6Z/VfhtCyDIjcwga4/DU= +github.com/alecthomas/assert/v2 v2.10.0 h1:jjRCHsj6hBJhkmhznrCzoNpbA3zqy0fYiUcYZP/GkPY= +github.com/alecthomas/assert/v2 v2.10.0/go.mod h1:Bze95FyfUr7x34QZrjL+XP+0qgp/zg8yS+TtBj1WA3k= +github.com/alecthomas/repr v0.4.0 h1:GhI2A8MACjfegCPVq9f1FLvIBS+DrQ2KQBFZP1iFzXc= +github.com/alecthomas/repr v0.4.0/go.mod h1:Fr0507jx4eOXV7AlPV6AVZLYrLIuIeSOWtW57eE/O/4= github.com/alexbrainman/sspi v0.0.0-20250919150558-7d374ff0d59e h1:4dAU9FXIyQktpoUAgOJK3OTFc/xug0PCXYCqU0FgDKI= github.com/alexbrainman/sspi v0.0.0-20250919150558-7d374ff0d59e/go.mod h1:cEWa1LVoE5KvSD9ONXsZrj0z6KqySlCCNKHlLzbqAt4= github.com/andreyvit/diff v0.0.0-20170406064948-c7f18ee00883 h1:bvNMNQO63//z+xNgfBlViaCIJKLlCJ6/fmUseuG0wVQ= github.com/andreyvit/diff v0.0.0-20170406064948-c7f18ee00883/go.mod h1:rCTlJbsFo29Kk6CurOXKm700vrz8f0KW0JNfpkRJY/8= +github.com/andybalholm/brotli v1.1.1 h1:PR2pgnyFznKEugtsUo0xLdDop5SKXd5Qf5ysW+7XdTA= +github.com/andybalholm/brotli v1.1.1/go.mod h1:05ib4cKhjx3OQYUY22hTVd34Bc8upXjOLL2rKwwZBoA= github.com/andybalholm/cascadia v1.3.3 h1:AG2YHrzJIm4BZ19iwJ/DAua6Btl3IwJX+VI4kktS1LM= github.com/andybalholm/cascadia v1.3.3/go.mod h1:xNd9bqTn98Ln4DwST8/nG+H0yuB8Hmgu1YHNnWw0GeA= github.com/antithesishq/antithesis-sdk-go v0.5.0-default-no-op h1:Ucf+QxEKMbPogRO5guBNe5cgd9uZgfoJLOYs8WWhtjM= @@ -85,8 +93,6 @@ github.com/dgryski/trifles v0.0.0-20230903005119-f50d829f2e54 h1:SG7nF6SRlWhcT7c github.com/dgryski/trifles v0.0.0-20230903005119-f50d829f2e54/go.mod h1:if7Fbed8SFyPtHLHbg49SI7NAdJiC5WIA09pe59rfAA= github.com/expr-lang/expr v1.17.7 h1:Q0xY/e/2aCIp8g9s/LGvMDCC5PxYlvHgDZRQ4y16JX8= github.com/expr-lang/expr v1.17.7/go.mod h1:8/vRC7+7HBzESEqt5kKpYXxrxkr31SaO8r40VO/1IT4= -github.com/felixge/httpsnoop v1.0.4 h1:NFTV2Zj1bL4mc9sqWACXbQFVBBg2W3GPvqp8/ESS2Wg= -github.com/felixge/httpsnoop v1.0.4/go.mod h1:m8KPJKqk1gH5J9DgRY2ASl2lWCfGKXixSwevea8zH2U= github.com/frankban/quicktest v1.11.0/go.mod h1:K+q6oSqb0W0Ininfk863uOk1lMy69l/P6txr3mVT54s= github.com/frankban/quicktest v1.11.2/go.mod h1:K+q6oSqb0W0Ininfk863uOk1lMy69l/P6txr3mVT54s= github.com/frankban/quicktest v1.13.0 h1:yNZif1OkDfNoDfb9zZa9aXIpejNR4F23Wely0c+Qdqk= @@ -95,6 +101,10 @@ github.com/fsnotify/fsnotify v1.9.0 h1:2Ml+OJNzbYCTzsxtv8vKSFD9PbJjmhYF14k/jKC7S github.com/fsnotify/fsnotify v1.9.0/go.mod h1:8jBTzvmWwFyi3Pb8djgCCO5IBqzKJ/Jwo8TRcHyHii0= github.com/go-asn1-ber/asn1-ber v1.5.8-0.20250403174932-29230038a667 h1:BP4M0CvQ4S3TGls2FvczZtj5Re/2ZzkV9VwqPHH/3Bo= github.com/go-asn1-ber/asn1-ber v1.5.8-0.20250403174932-29230038a667/go.mod h1:hEBeB/ic+5LoWskz+yKT7vGhhPYkProFKoKdwZRWMe0= +github.com/go-chi/chi/v5 
v5.2.5 h1:Eg4myHZBjyvJmAFjFvWgrqDTXFyOzjj7YIm3L3mu6Ug= +github.com/go-chi/chi/v5 v5.2.5/go.mod h1:X7Gx4mteadT3eDOMTsXzmI4/rwUpOwBHLpAfupzFJP0= +github.com/go-chi/cors v1.2.2 h1:Jmey33TE+b+rB7fT8MUy1u0I4L+NARQlK6LhzKPSyQE= +github.com/go-chi/cors v1.2.2/go.mod h1:sSbTewc+6wYHBBCW7ytsFSn836hqM7JxpglAy2Vzc58= github.com/go-co-op/gocron/v2 v2.19.0 h1:OKf2y6LXPs/BgBI2fl8PxUpNAI1DA9Mg+hSeGOS38OU= github.com/go-co-op/gocron/v2 v2.19.0/go.mod h1:5lEiCKk1oVJV39Zg7/YG10OnaVrDAV5GGR6O0663k6U= github.com/go-jose/go-jose/v4 v4.1.3 h1:CVLmWDhDVRa6Mi/IgCgaopNosCaHz7zrMeF9MlZRkrs= @@ -154,8 +164,6 @@ github.com/google/gops v0.3.28 h1:2Xr57tqKAmQYRAfG12E+yLcoa2Y42UJo2lOrUFL9ark= github.com/google/gops v0.3.28/go.mod h1:6f6+Nl8LcHrzJwi8+p0ii+vmBFSlB4f8cOOkTJ7sk4c= github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0= github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= -github.com/gorilla/handlers v1.5.2 h1:cLTUSsNkgcwhgRqvCNmdbRWG0A3N4F+M2nWKdScwyEE= -github.com/gorilla/handlers v1.5.2/go.mod h1:dX+xVpaxdSw+q0Qek8SSsl3dfMk3jNddUkMzo0GtH0w= github.com/gorilla/mux v1.8.1 h1:TuBL49tXwgrFYWhqrNgrUNEY92u81SPhu7sTdzQEiWY= github.com/gorilla/mux v1.8.1/go.mod h1:AKf9I4AEqPTmMytcMc0KkNouC66V3BtZ4qD5fmWSiMQ= github.com/gorilla/securecookie v1.1.2 h1:YCIWL56dvtr73r6715mJs5ZvhtnY73hBvEF8kXD8ePA= @@ -168,6 +176,8 @@ github.com/hashicorp/go-uuid v1.0.3 h1:2gKiV6YVmrJ1i2CKKa9obLvRieoRGviZFL26PcT/C github.com/hashicorp/go-uuid v1.0.3/go.mod h1:6SBZvOh/SIDV7/2o3Jml5SYk/TvGqwFJ/bN7x4byOro= github.com/hashicorp/golang-lru/v2 v2.0.7 h1:a+bsQ5rvGLjzHuww6tVxozPZFVghXaHOwFs4luLUK2k= github.com/hashicorp/golang-lru/v2 v2.0.7/go.mod h1:QeFd9opnmA6QUJc5vARoKUSoFhyfM2/ZepoAG6RGpeM= +github.com/hexops/gotextdiff v1.0.3 h1:gitA9+qJrrTCsiCl7+kh75nPqQt1cx4ZkudSTLoUqJM= +github.com/hexops/gotextdiff v1.0.3/go.mod h1:pSWU5MAI3yDq+fZBTazCSJysOMbxWL1BSow5/V2vxeg= github.com/influxdata/influxdb-client-go/v2 v2.14.0 h1:AjbBfJuq+QoaXNcrova8smSjwJdUHnwvfjMF71M1iI4= github.com/influxdata/influxdb-client-go/v2 v2.14.0/go.mod h1:Ahpm3QXKMJslpXl3IftVLVezreAUtBOTZssDrjZEFHI= github.com/influxdata/line-protocol v0.0.0-20210922203350-b1ad95c89adf h1:7JTmneyiNEwVBOHSjoMxiWAqB992atOeepeFYegn5RU= @@ -238,6 +248,14 @@ github.com/niemeyer/pretty v0.0.0-20200227124842-a10e7caefd8e/go.mod h1:zD1mROLA github.com/oapi-codegen/runtime v1.1.1 h1:EXLHh0DXIJnWhdRPN2w4MXAzFyE4CskzhNLUmtpMYro= github.com/oapi-codegen/runtime v1.1.1/go.mod h1:SK9X900oXmPWilYR5/WKPzt3Kqxn/uS/+lbpREv+eCg= github.com/opentracing/opentracing-go v1.1.0/go.mod h1:UkNAQd3GIcIGf0SeVgPpRdFStlNbqXla1AfSYxPUl2o= +github.com/parquet-go/bitpack v1.0.0 h1:AUqzlKzPPXf2bCdjfj4sTeacrUwsT7NlcYDMUQxPcQA= +github.com/parquet-go/bitpack v1.0.0/go.mod h1:XnVk9TH+O40eOOmvpAVZ7K2ocQFrQwysLMnc6M/8lgs= +github.com/parquet-go/jsonlite v1.0.0 h1:87QNdi56wOfsE5bdgas0vRzHPxfJgzrXGml1zZdd7VU= +github.com/parquet-go/jsonlite v1.0.0/go.mod h1:nDjpkpL4EOtqs6NQugUsi0Rleq9sW/OtC1NnZEnxzF0= +github.com/parquet-go/parquet-go v0.27.0 h1:vHWK2xaHbj+v1DYps03yDRpEsdtOeKbhiXUaixoPb3g= +github.com/parquet-go/parquet-go v0.27.0/go.mod h1:navtkAYr2LGoJVp141oXPlO/sxLvaOe3la2JEoD8+rg= +github.com/pierrec/lz4/v4 v4.1.21 h1:yOVMLb6qSIDP67pl/5F7RepeKYu/VmTyEXvuMI5d9mQ= +github.com/pierrec/lz4/v4 v4.1.21/go.mod h1:gZWDp/Ze/IJXGXf23ltt2EXimqmTUXEy0GFuRQyBid4= github.com/pkg/diff v0.0.0-20210226163009-20ebb0f2a09e/go.mod h1:pJLUxLENpZxwdsKMEsNbx1VGcRFpLqf3715MtcvvzbA= github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= 
github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 h1:Jamvg5psRIccs7FGNTlIRMkT8wgtp5eCXdBlqhYGL6U= @@ -285,6 +303,8 @@ github.com/swaggo/http-swagger v1.3.4 h1:q7t/XLx0n15H1Q9/tk3Y9L4n210XzJF5WtnDX64 github.com/swaggo/http-swagger v1.3.4/go.mod h1:9dAh0unqMBAlbp1uE2Uc2mQTxNMU/ha4UbucIg1MFkQ= github.com/swaggo/swag v1.16.6 h1:qBNcx53ZaX+M5dxVyTrgQ0PJ/ACK+NzhwcbieTt+9yI= github.com/swaggo/swag v1.16.6/go.mod h1:ngP2etMK5a0P3QBizic5MEwpRmluJZPHjXcMoj4Xesg= +github.com/twpayne/go-geom v1.6.1 h1:iLE+Opv0Ihm/ABIcvQFGIiFBXd76oBIar9drAwHFhR4= +github.com/twpayne/go-geom v1.6.1/go.mod h1:Kr+Nly6BswFsKM5sd31YaoWS5PeDDH2NftJTK7Gd028= github.com/urfave/cli/v2 v2.27.7 h1:bH59vdhbjLv3LAvIu6gd0usJHgoTTPhCFib8qqOwXYU= github.com/urfave/cli/v2 v2.27.7/go.mod h1:CyNAG/xg+iAOg0N4MPGZqVmv2rCoP267496AOXUZjA4= github.com/urfave/cli/v3 v3.6.1 h1:j8Qq8NyUawj/7rTYdBGrxcH7A/j7/G8Q5LhWEW4G3Mo= @@ -293,6 +313,8 @@ github.com/vektah/gqlparser/v2 v2.5.31 h1:YhWGA1mfTjID7qJhd1+Vxhpk5HTgydrGU9IgkW github.com/vektah/gqlparser/v2 v2.5.31/go.mod h1:c1I28gSOVNzlfc4WuDlqU7voQnsqI6OG2amkBAFmgts= github.com/xrash/smetrics v0.0.0-20250705151800-55b8f293f342 h1:FnBeRrxr7OU4VvAzt5X7s6266i6cSVkkFPS0TuXWbIg= github.com/xrash/smetrics v0.0.0-20250705151800-55b8f293f342/go.mod h1:Ohn+xnUBiLI6FVj/9LpzZWtj1/D6lUovWYBkxHVV3aM= +github.com/xyproto/randomstring v1.0.5 h1:YtlWPoRdgMu3NZtP45drfy1GKoojuR7hmRcnhZqKjWU= +github.com/xyproto/randomstring v1.0.5/go.mod h1:rgmS5DeNXLivK7YprL0pY+lTuhNQW3iGxZ18UQApw/E= github.com/yuin/goldmark v1.4.13/go.mod h1:6yULJ656Px+3vBD8DxQVa3kxgyrAnzto9xy5taEt/CY= go.uber.org/goleak v1.3.0 h1:2K3zAYmnTNqV73imy9J1T3WC+gmCePx2hEGkimedGto= go.uber.org/goleak v1.3.0/go.mod h1:CoHD4mav9JJNrW/WLlf7HGZPjdw8EucARQHekz1X6bE= diff --git a/internal/api/api_test.go b/internal/api/api_test.go index 8cbf95d7..09fc4c7f 100644 --- a/internal/api/api_test.go +++ b/internal/api/api_test.go @@ -30,7 +30,7 @@ import ( ccconf "github.com/ClusterCockpit/cc-lib/v2/ccConfig" cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger" "github.com/ClusterCockpit/cc-lib/v2/schema" - "github.com/gorilla/mux" + "github.com/go-chi/chi/v5" _ "github.com/mattn/go-sqlite3" ) @@ -216,9 +216,7 @@ func TestRestApi(t *testing.T) { return testData, nil } - r := mux.NewRouter() - r.PathPrefix("/api").Subrouter() - r.StrictSlash(true) + r := chi.NewRouter() restapi.MountAPIRoutes(r) var TestJobID int64 = 123 diff --git a/internal/api/cluster.go b/internal/api/cluster.go index d1c3c898..5e6e3a27 100644 --- a/internal/api/cluster.go +++ b/internal/api/cluster.go @@ -36,9 +36,9 @@ type GetClustersAPIResponse struct { // @router /api/clusters/ [get] func (api *RestAPI) getClusters(rw http.ResponseWriter, r *http.Request) { if user := repository.GetUserFromContext(r.Context()); user != nil && - !user.HasRole(schema.RoleApi) { + !user.HasRole(schema.RoleAPI) { - handleError(fmt.Errorf("missing role: %v", schema.GetRoleString(schema.RoleApi)), http.StatusForbidden, rw) + handleError(fmt.Errorf("missing role: %v", schema.GetRoleString(schema.RoleAPI)), http.StatusForbidden, rw) return } diff --git a/internal/api/job.go b/internal/api/job.go index c3d1fbbf..66258668 100644 --- a/internal/api/job.go +++ b/internal/api/job.go @@ -27,7 +27,7 @@ import ( "github.com/ClusterCockpit/cc-backend/pkg/archive" cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger" "github.com/ClusterCockpit/cc-lib/v2/schema" - "github.com/gorilla/mux" + "github.com/go-chi/chi/v5" ) const ( @@ -243,10 +243,10 @@ func (api *RestAPI) getJobs(rw http.ResponseWriter, r 
*http.Request) { // @router /api/jobs/{id} [get] func (api *RestAPI) getCompleteJobByID(rw http.ResponseWriter, r *http.Request) { // Fetch job from db - id, ok := mux.Vars(r)["id"] + id := chi.URLParam(r, "id") var job *schema.Job var err error - if ok { + if id != "" { id, e := strconv.ParseInt(id, 10, 64) if e != nil { handleError(fmt.Errorf("integer expected in path for id: %w", e), http.StatusBadRequest, rw) @@ -336,10 +336,10 @@ func (api *RestAPI) getCompleteJobByID(rw http.ResponseWriter, r *http.Request) // @router /api/jobs/{id} [post] func (api *RestAPI) getJobByID(rw http.ResponseWriter, r *http.Request) { // Fetch job from db - id, ok := mux.Vars(r)["id"] + id := chi.URLParam(r, "id") var job *schema.Job var err error - if ok { + if id != "" { id, e := strconv.ParseInt(id, 10, 64) if e != nil { handleError(fmt.Errorf("integer expected in path for id: %w", e), http.StatusBadRequest, rw) @@ -439,7 +439,7 @@ func (api *RestAPI) getJobByID(rw http.ResponseWriter, r *http.Request) { // @security ApiKeyAuth // @router /api/jobs/edit_meta/{id} [post] func (api *RestAPI) editMeta(rw http.ResponseWriter, r *http.Request) { - id, err := strconv.ParseInt(mux.Vars(r)["id"], 10, 64) + id, err := strconv.ParseInt(chi.URLParam(r, "id"), 10, 64) if err != nil { handleError(fmt.Errorf("parsing job ID failed: %w", err), http.StatusBadRequest, rw) return @@ -487,7 +487,7 @@ func (api *RestAPI) editMeta(rw http.ResponseWriter, r *http.Request) { // @security ApiKeyAuth // @router /api/jobs/tag_job/{id} [post] func (api *RestAPI) tagJob(rw http.ResponseWriter, r *http.Request) { - id, err := strconv.ParseInt(mux.Vars(r)["id"], 10, 64) + id, err := strconv.ParseInt(chi.URLParam(r, "id"), 10, 64) if err != nil { handleError(fmt.Errorf("parsing job ID failed: %w", err), http.StatusBadRequest, rw) return @@ -551,7 +551,7 @@ func (api *RestAPI) tagJob(rw http.ResponseWriter, r *http.Request) { // @security ApiKeyAuth // @router /jobs/tag_job/{id} [delete] func (api *RestAPI) removeTagJob(rw http.ResponseWriter, r *http.Request) { - id, err := strconv.ParseInt(mux.Vars(r)["id"], 10, 64) + id, err := strconv.ParseInt(chi.URLParam(r, "id"), 10, 64) if err != nil { handleError(fmt.Errorf("parsing job ID failed: %w", err), http.StatusBadRequest, rw) return @@ -754,6 +754,7 @@ func (api *RestAPI) stopJobByRequest(rw http.ResponseWriter, r *http.Request) { return } + isCached := false job, err = api.JobRepository.Find(req.JobID, req.Cluster, req.StartTime) if err != nil { // Try cached jobs if not found in main repository @@ -764,9 +765,10 @@ func (api *RestAPI) stopJobByRequest(rw http.ResponseWriter, r *http.Request) { return } job = cachedJob + isCached = true } - api.checkAndHandleStopJob(rw, job, req) + api.checkAndHandleStopJob(rw, job, req, isCached) } // deleteJobByID godoc @@ -786,9 +788,9 @@ func (api *RestAPI) stopJobByRequest(rw http.ResponseWriter, r *http.Request) { // @router /api/jobs/delete_job/{id} [delete] func (api *RestAPI) deleteJobByID(rw http.ResponseWriter, r *http.Request) { // Fetch job (that will be stopped) from db - id, ok := mux.Vars(r)["id"] + id := chi.URLParam(r, "id") var err error - if ok { + if id != "" { id, e := strconv.ParseInt(id, 10, 64) if e != nil { handleError(fmt.Errorf("integer expected in path for id: %w", e), http.StatusBadRequest, rw) @@ -885,9 +887,9 @@ func (api *RestAPI) deleteJobByRequest(rw http.ResponseWriter, r *http.Request) func (api *RestAPI) deleteJobBefore(rw http.ResponseWriter, r *http.Request) { var cnt int // Fetch job (that will be stopped) 
from db - id, ok := mux.Vars(r)["ts"] + id := chi.URLParam(r, "ts") var err error - if ok { + if id != "" { ts, e := strconv.ParseInt(id, 10, 64) if e != nil { handleError(fmt.Errorf("integer expected in path for ts: %w", e), http.StatusBadRequest, rw) @@ -923,7 +925,7 @@ func (api *RestAPI) deleteJobBefore(rw http.ResponseWriter, r *http.Request) { } } -func (api *RestAPI) checkAndHandleStopJob(rw http.ResponseWriter, job *schema.Job, req StopJobAPIRequest) { +func (api *RestAPI) checkAndHandleStopJob(rw http.ResponseWriter, job *schema.Job, req StopJobAPIRequest, isCached bool) { // Sanity checks if job.State != schema.JobStateRunning { handleError(fmt.Errorf("jobId %d (id %d) on %s : job has already been stopped (state is: %s)", job.JobID, *job.ID, job.Cluster, job.State), http.StatusUnprocessableEntity, rw) @@ -948,11 +950,21 @@ func (api *RestAPI) checkAndHandleStopJob(rw http.ResponseWriter, job *schema.Jo api.JobRepository.Mutex.Lock() defer api.JobRepository.Mutex.Unlock() - if err := api.JobRepository.Stop(*job.ID, job.Duration, job.State, job.MonitoringStatus); err != nil { - if err := api.JobRepository.StopCached(*job.ID, job.Duration, job.State, job.MonitoringStatus); err != nil { - handleError(fmt.Errorf("jobId %d (id %d) on %s : marking job as '%s' (duration: %d) in DB failed: %w", job.JobID, *job.ID, job.Cluster, job.State, job.Duration, err), http.StatusInternalServerError, rw) + // If the job is still in job_cache, transfer it to the job table first + // so that job.ID always points to the job table for downstream code + if isCached { + newID, err := api.JobRepository.TransferCachedJobToMain(*job.ID) + if err != nil { + handleError(fmt.Errorf("jobId %d (id %d) on %s : transferring cached job failed: %w", job.JobID, *job.ID, job.Cluster, err), http.StatusInternalServerError, rw) return } + cclog.Infof("transferred cached job to main table: old id %d -> new id %d (jobId=%d)", *job.ID, newID, job.JobID) + job.ID = &newID + } + + if err := api.JobRepository.Stop(*job.ID, job.Duration, job.State, job.MonitoringStatus); err != nil { + handleError(fmt.Errorf("jobId %d (id %d) on %s : marking job as '%s' (duration: %d) in DB failed: %w", job.JobID, *job.ID, job.Cluster, job.State, job.Duration, err), http.StatusInternalServerError, rw) + return } cclog.Infof("archiving job... 
(dbid: %d): cluster=%s, jobId=%d, user=%s, startTime=%d, duration=%d, state=%s", *job.ID, job.Cluster, job.JobID, job.User, job.StartTime, job.Duration, job.State) @@ -976,7 +988,7 @@ func (api *RestAPI) checkAndHandleStopJob(rw http.ResponseWriter, job *schema.Jo } func (api *RestAPI) getJobMetrics(rw http.ResponseWriter, r *http.Request) { - id := mux.Vars(r)["id"] + id := chi.URLParam(r, "id") metrics := r.URL.Query()["metric"] var scopes []schema.MetricScope for _, scope := range r.URL.Query()["scope"] { @@ -1042,8 +1054,8 @@ type GetUsedNodesAPIResponse struct { // @router /api/jobs/used_nodes [get] func (api *RestAPI) getUsedNodes(rw http.ResponseWriter, r *http.Request) { if user := repository.GetUserFromContext(r.Context()); user != nil && - !user.HasRole(schema.RoleApi) { - handleError(fmt.Errorf("missing role: %v", schema.GetRoleString(schema.RoleApi)), http.StatusForbidden, rw) + !user.HasRole(schema.RoleAPI) { + handleError(fmt.Errorf("missing role: %v", schema.GetRoleString(schema.RoleAPI)), http.StatusForbidden, rw) return } diff --git a/internal/api/nats.go b/internal/api/nats.go index c0a8c174..0e929426 100644 --- a/internal/api/nats.go +++ b/internal/api/nats.go @@ -251,6 +251,7 @@ func (api *NatsAPI) handleStopJob(payload string) { return } + isCached := false job, err := api.JobRepository.Find(req.JobID, req.Cluster, req.StartTime) if err != nil { cachedJob, cachedErr := api.JobRepository.FindCached(req.JobID, req.Cluster, req.StartTime) @@ -260,6 +261,7 @@ func (api *NatsAPI) handleStopJob(payload string) { return } job = cachedJob + isCached = true } if job.State != schema.JobStateRunning { @@ -287,16 +289,26 @@ func (api *NatsAPI) handleStopJob(payload string) { api.JobRepository.Mutex.Lock() defer api.JobRepository.Mutex.Unlock() - if err := api.JobRepository.Stop(*job.ID, job.Duration, job.State, job.MonitoringStatus); err != nil { - if err := api.JobRepository.StopCached(*job.ID, job.Duration, job.State, job.MonitoringStatus); err != nil { - cclog.Errorf("NATS job stop: jobId %d (id %d) on %s: marking job as '%s' failed: %v", - job.JobID, job.ID, job.Cluster, job.State, err) + // If the job is still in job_cache, transfer it to the job table first + if isCached { + newID, err := api.JobRepository.TransferCachedJobToMain(*job.ID) + if err != nil { + cclog.Errorf("NATS job stop: jobId %d (id %d) on %s: transferring cached job failed: %v", + job.JobID, *job.ID, job.Cluster, err) return } + cclog.Infof("NATS: transferred cached job to main table: old id %d -> new id %d (jobId=%d)", *job.ID, newID, job.JobID) + job.ID = &newID + } + + if err := api.JobRepository.Stop(*job.ID, job.Duration, job.State, job.MonitoringStatus); err != nil { + cclog.Errorf("NATS job stop: jobId %d (id %d) on %s: marking job as '%s' failed: %v", + job.JobID, *job.ID, job.Cluster, job.State, err) + return } cclog.Infof("NATS: archiving job (dbid: %d): cluster=%s, jobId=%d, user=%s, startTime=%d, duration=%d, state=%s", - job.ID, job.Cluster, job.JobID, job.User, job.StartTime, job.Duration, job.State) + *job.ID, job.Cluster, job.JobID, job.User, job.StartTime, job.Duration, job.State) if job.MonitoringStatus == schema.MonitoringStatusDisabled { return diff --git a/internal/api/node.go b/internal/api/node.go index 27cde7f0..e6b19479 100644 --- a/internal/api/node.go +++ b/internal/api/node.go @@ -80,7 +80,7 @@ func (api *RestAPI) updateNodeStates(rw http.ResponseWriter, r *http.Request) { ms := metricstore.GetMemoryStore() m := make(map[string][]string) - healthStates := 
make(map[string]schema.MonitoringState) + healthResults := make(map[string]metricstore.HealthCheckResult) startMs := time.Now() @@ -94,8 +94,8 @@ func (api *RestAPI) updateNodeStates(rw http.ResponseWriter, r *http.Request) { if sc != "" { metricList := archive.GetMetricConfigSubCluster(req.Cluster, sc) metricNames := metricListToNames(metricList) - if states, err := ms.HealthCheck(req.Cluster, nl, metricNames); err == nil { - maps.Copy(healthStates, states) + if results, err := ms.HealthCheck(req.Cluster, nl, metricNames); err == nil { + maps.Copy(healthResults, results) } } } @@ -106,8 +106,10 @@ func (api *RestAPI) updateNodeStates(rw http.ResponseWriter, r *http.Request) { for _, node := range req.Nodes { state := determineState(node.States) healthState := schema.MonitoringStateFailed - if hs, ok := healthStates[node.Hostname]; ok { - healthState = hs + var healthMetrics string + if result, ok := healthResults[node.Hostname]; ok { + healthState = result.State + healthMetrics = result.HealthMetrics } nodeState := schema.NodeStateDB{ TimeStamp: requestReceived, @@ -116,10 +118,14 @@ func (api *RestAPI) updateNodeStates(rw http.ResponseWriter, r *http.Request) { MemoryAllocated: node.MemoryAllocated, GpusAllocated: node.GpusAllocated, HealthState: healthState, + HealthMetrics: healthMetrics, JobsRunning: node.JobsRunning, } - repo.UpdateNodeState(node.Hostname, req.Cluster, &nodeState) + if err := repo.UpdateNodeState(node.Hostname, req.Cluster, &nodeState); err != nil { + cclog.Errorf("updateNodeStates: updating node state for %s on %s failed: %v", + node.Hostname, req.Cluster, err) + } } cclog.Debugf("Timer updateNodeStates, SQLite Inserts: %s", time.Since(startDB)) diff --git a/internal/api/rest.go b/internal/api/rest.go index 00ed1f55..575b1809 100644 --- a/internal/api/rest.go +++ b/internal/api/rest.go @@ -25,7 +25,7 @@ import ( cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger" "github.com/ClusterCockpit/cc-lib/v2/schema" "github.com/ClusterCockpit/cc-lib/v2/util" - "github.com/gorilla/mux" + "github.com/go-chi/chi/v5" ) // @title ClusterCockpit REST API @@ -73,91 +73,95 @@ func New() *RestAPI { // MountAPIRoutes registers REST API endpoints for job and cluster management. // These routes use JWT token authentication via the X-Auth-Token header. 
-func (api *RestAPI) MountAPIRoutes(r *mux.Router) { - r.StrictSlash(true) +func (api *RestAPI) MountAPIRoutes(r chi.Router) { // REST API Uses TokenAuth // User List - r.HandleFunc("/users/", api.getUsers).Methods(http.MethodGet) + r.Get("/users/", api.getUsers) // Cluster List - r.HandleFunc("/clusters/", api.getClusters).Methods(http.MethodGet) + r.Get("/clusters/", api.getClusters) // Slurm node state - r.HandleFunc("/nodestate/", api.updateNodeStates).Methods(http.MethodPost, http.MethodPut) + r.Post("/nodestate/", api.updateNodeStates) + r.Put("/nodestate/", api.updateNodeStates) // Job Handler if config.Keys.APISubjects == nil { cclog.Info("Enabling REST start/stop job API") - r.HandleFunc("/jobs/start_job/", api.startJob).Methods(http.MethodPost, http.MethodPut) - r.HandleFunc("/jobs/stop_job/", api.stopJobByRequest).Methods(http.MethodPost, http.MethodPut) + r.Post("/jobs/start_job/", api.startJob) + r.Put("/jobs/start_job/", api.startJob) + r.Post("/jobs/stop_job/", api.stopJobByRequest) + r.Put("/jobs/stop_job/", api.stopJobByRequest) } - r.HandleFunc("/jobs/", api.getJobs).Methods(http.MethodGet) - r.HandleFunc("/jobs/used_nodes", api.getUsedNodes).Methods(http.MethodGet) - r.HandleFunc("/jobs/tag_job/{id}", api.tagJob).Methods(http.MethodPost, http.MethodPatch) - r.HandleFunc("/jobs/tag_job/{id}", api.removeTagJob).Methods(http.MethodDelete) - r.HandleFunc("/jobs/edit_meta/{id}", api.editMeta).Methods(http.MethodPost, http.MethodPatch) - r.HandleFunc("/jobs/metrics/{id}", api.getJobMetrics).Methods(http.MethodGet) - r.HandleFunc("/jobs/delete_job/", api.deleteJobByRequest).Methods(http.MethodDelete) - r.HandleFunc("/jobs/delete_job/{id}", api.deleteJobByID).Methods(http.MethodDelete) - r.HandleFunc("/jobs/delete_job_before/{ts}", api.deleteJobBefore).Methods(http.MethodDelete) - r.HandleFunc("/jobs/{id}", api.getJobByID).Methods(http.MethodPost) - r.HandleFunc("/jobs/{id}", api.getCompleteJobByID).Methods(http.MethodGet) + r.Get("/jobs/", api.getJobs) + r.Get("/jobs/used_nodes", api.getUsedNodes) + r.Post("/jobs/tag_job/{id}", api.tagJob) + r.Patch("/jobs/tag_job/{id}", api.tagJob) + r.Delete("/jobs/tag_job/{id}", api.removeTagJob) + r.Post("/jobs/edit_meta/{id}", api.editMeta) + r.Patch("/jobs/edit_meta/{id}", api.editMeta) + r.Get("/jobs/metrics/{id}", api.getJobMetrics) + r.Delete("/jobs/delete_job/", api.deleteJobByRequest) + r.Delete("/jobs/delete_job/{id}", api.deleteJobByID) + r.Delete("/jobs/delete_job_before/{ts}", api.deleteJobBefore) + r.Post("/jobs/{id}", api.getJobByID) + r.Get("/jobs/{id}", api.getCompleteJobByID) - r.HandleFunc("/tags/", api.removeTags).Methods(http.MethodDelete) + r.Delete("/tags/", api.removeTags) if api.MachineStateDir != "" { - r.HandleFunc("/machine_state/{cluster}/{host}", api.getMachineState).Methods(http.MethodGet) - r.HandleFunc("/machine_state/{cluster}/{host}", api.putMachineState).Methods(http.MethodPut, http.MethodPost) + r.Get("/machine_state/{cluster}/{host}", api.getMachineState) + r.Put("/machine_state/{cluster}/{host}", api.putMachineState) + r.Post("/machine_state/{cluster}/{host}", api.putMachineState) } } // MountUserAPIRoutes registers user-accessible REST API endpoints. // These are limited endpoints for regular users with JWT token authentication. 
-func (api *RestAPI) MountUserAPIRoutes(r *mux.Router) { - r.StrictSlash(true) +func (api *RestAPI) MountUserAPIRoutes(r chi.Router) { // REST API Uses TokenAuth - r.HandleFunc("/jobs/", api.getJobs).Methods(http.MethodGet) - r.HandleFunc("/jobs/{id}", api.getJobByID).Methods(http.MethodPost) - r.HandleFunc("/jobs/{id}", api.getCompleteJobByID).Methods(http.MethodGet) - r.HandleFunc("/jobs/metrics/{id}", api.getJobMetrics).Methods(http.MethodGet) + r.Get("/jobs/", api.getJobs) + r.Post("/jobs/{id}", api.getJobByID) + r.Get("/jobs/{id}", api.getCompleteJobByID) + r.Get("/jobs/metrics/{id}", api.getJobMetrics) } // MountMetricStoreAPIRoutes registers metric storage API endpoints. // These endpoints handle metric data ingestion and health checks with JWT token authentication. -func (api *RestAPI) MountMetricStoreAPIRoutes(r *mux.Router) { +func (api *RestAPI) MountMetricStoreAPIRoutes(r chi.Router) { // REST API Uses TokenAuth - // Note: StrictSlash handles trailing slash variations automatically - r.HandleFunc("/free", freeMetrics).Methods(http.MethodPost) - r.HandleFunc("/write", writeMetrics).Methods(http.MethodPost) - r.HandleFunc("/debug", debugMetrics).Methods(http.MethodGet) - r.HandleFunc("/healthcheck", api.updateNodeStates).Methods(http.MethodPost) + r.Post("/free", freeMetrics) + r.Post("/write", writeMetrics) + r.Get("/debug", debugMetrics) + r.Post("/healthcheck", api.updateNodeStates) // Same endpoints but with trailing slash - r.HandleFunc("/free/", freeMetrics).Methods(http.MethodPost) - r.HandleFunc("/write/", writeMetrics).Methods(http.MethodPost) - r.HandleFunc("/debug/", debugMetrics).Methods(http.MethodGet) - r.HandleFunc("/healthcheck/", api.updateNodeStates).Methods(http.MethodPost) + r.Post("/free/", freeMetrics) + r.Post("/write/", writeMetrics) + r.Get("/debug/", debugMetrics) + r.Post("/healthcheck/", api.updateNodeStates) } // MountConfigAPIRoutes registers configuration and user management endpoints. // These routes use session-based authentication and require admin privileges. -func (api *RestAPI) MountConfigAPIRoutes(r *mux.Router) { - r.StrictSlash(true) +// Routes use full paths (including /config prefix) to avoid conflicting with +// the /config page route when registered via Group instead of Route. +func (api *RestAPI) MountConfigAPIRoutes(r chi.Router) { // Settings Frontend Uses SessionAuth if api.Authentication != nil { - r.HandleFunc("/roles/", api.getRoles).Methods(http.MethodGet) - r.HandleFunc("/users/", api.createUser).Methods(http.MethodPost, http.MethodPut) - r.HandleFunc("/users/", api.getUsers).Methods(http.MethodGet) - r.HandleFunc("/users/", api.deleteUser).Methods(http.MethodDelete) - r.HandleFunc("/user/{id}", api.updateUser).Methods(http.MethodPost) - r.HandleFunc("/notice/", api.editNotice).Methods(http.MethodPost) + r.Get("/config/roles/", api.getRoles) + r.Post("/config/users/", api.createUser) + r.Put("/config/users/", api.createUser) + r.Get("/config/users/", api.getUsers) + r.Delete("/config/users/", api.deleteUser) + r.Post("/config/user/{id}", api.updateUser) + r.Post("/config/notice/", api.editNotice) } } // MountFrontendAPIRoutes registers frontend-specific API endpoints. // These routes support JWT generation and user configuration updates with session authentication. 
-func (api *RestAPI) MountFrontendAPIRoutes(r *mux.Router) { - r.StrictSlash(true) +func (api *RestAPI) MountFrontendAPIRoutes(r chi.Router) { // Settings Frontend Uses SessionAuth if api.Authentication != nil { - r.HandleFunc("/jwt/", api.getJWT).Methods(http.MethodGet) - r.HandleFunc("/configuration/", api.updateConfiguration).Methods(http.MethodPost) + r.Get("/jwt/", api.getJWT) + r.Post("/configuration/", api.updateConfiguration) } } @@ -381,9 +385,8 @@ func (api *RestAPI) putMachineState(rw http.ResponseWriter, r *http.Request) { return } - vars := mux.Vars(r) - cluster := vars["cluster"] - host := vars["host"] + cluster := chi.URLParam(r, "cluster") + host := chi.URLParam(r, "host") if err := validatePathComponent(cluster, "cluster name"); err != nil { handleError(err, http.StatusBadRequest, rw) @@ -434,9 +437,8 @@ func (api *RestAPI) getMachineState(rw http.ResponseWriter, r *http.Request) { return } - vars := mux.Vars(r) - cluster := vars["cluster"] - host := vars["host"] + cluster := chi.URLParam(r, "cluster") + host := chi.URLParam(r, "host") if err := validatePathComponent(cluster, "cluster name"); err != nil { handleError(err, http.StatusBadRequest, rw) diff --git a/internal/api/user.go b/internal/api/user.go index 5564fd61..e2f78165 100644 --- a/internal/api/user.go +++ b/internal/api/user.go @@ -13,7 +13,7 @@ import ( "github.com/ClusterCockpit/cc-backend/internal/repository" cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger" "github.com/ClusterCockpit/cc-lib/v2/schema" - "github.com/gorilla/mux" + "github.com/go-chi/chi/v5" ) type APIReturnedUser struct { @@ -91,7 +91,7 @@ func (api *RestAPI) updateUser(rw http.ResponseWriter, r *http.Request) { // Handle role updates if newrole != "" { - if err := repository.GetUserRepository().AddRole(r.Context(), mux.Vars(r)["id"], newrole); err != nil { + if err := repository.GetUserRepository().AddRole(r.Context(), chi.URLParam(r, "id"), newrole); err != nil { handleError(fmt.Errorf("adding role failed: %w", err), http.StatusUnprocessableEntity, rw) return } @@ -99,7 +99,7 @@ func (api *RestAPI) updateUser(rw http.ResponseWriter, r *http.Request) { cclog.Errorf("Failed to encode response: %v", err) } } else if delrole != "" { - if err := repository.GetUserRepository().RemoveRole(r.Context(), mux.Vars(r)["id"], delrole); err != nil { + if err := repository.GetUserRepository().RemoveRole(r.Context(), chi.URLParam(r, "id"), delrole); err != nil { handleError(fmt.Errorf("removing role failed: %w", err), http.StatusUnprocessableEntity, rw) return } @@ -107,7 +107,7 @@ func (api *RestAPI) updateUser(rw http.ResponseWriter, r *http.Request) { cclog.Errorf("Failed to encode response: %v", err) } } else if newproj != "" { - if err := repository.GetUserRepository().AddProject(r.Context(), mux.Vars(r)["id"], newproj); err != nil { + if err := repository.GetUserRepository().AddProject(r.Context(), chi.URLParam(r, "id"), newproj); err != nil { handleError(fmt.Errorf("adding project failed: %w", err), http.StatusUnprocessableEntity, rw) return } @@ -115,7 +115,7 @@ func (api *RestAPI) updateUser(rw http.ResponseWriter, r *http.Request) { cclog.Errorf("Failed to encode response: %v", err) } } else if delproj != "" { - if err := repository.GetUserRepository().RemoveProject(r.Context(), mux.Vars(r)["id"], delproj); err != nil { + if err := repository.GetUserRepository().RemoveProject(r.Context(), chi.URLParam(r, "id"), delproj); err != nil { handleError(fmt.Errorf("removing project failed: %w", err), http.StatusUnprocessableEntity, rw) return } @@ 
-164,7 +164,7 @@ func (api *RestAPI) createUser(rw http.ResponseWriter, r *http.Request) { return } - if len(password) == 0 && role != schema.GetRoleString(schema.RoleApi) { + if len(password) == 0 && role != schema.GetRoleString(schema.RoleAPI) { handleError(fmt.Errorf("only API users are allowed to have a blank password (login will be impossible)"), http.StatusBadRequest, rw) return } diff --git a/internal/archiver/README.md b/internal/archiver/README.md index 48aed797..53d00948 100644 --- a/internal/archiver/README.md +++ b/internal/archiver/README.md @@ -170,7 +170,6 @@ All exported functions are safe for concurrent use: - `Start()` - Safe to call once - `TriggerArchiving()` - Safe from multiple goroutines - `Shutdown()` - Safe to call once -- `WaitForArchiving()` - Deprecated, but safe Internal state is protected by: - Channel synchronization (`archiveChannel`) diff --git a/internal/auth/auth.go b/internal/auth/auth.go index df618a3f..9b1e2121 100644 --- a/internal/auth/auth.go +++ b/internal/auth/auth.go @@ -294,6 +294,11 @@ func handleOIDCUser(OIDCUser *schema.User) { handleUserSync(OIDCUser, Keys.OpenIDConfig.SyncUserOnLogin, Keys.OpenIDConfig.UpdateUserOnLogin) } +// handleLdapUser syncs LDAP user with database +func handleLdapUser(ldapUser *schema.User) { + handleUserSync(ldapUser, Keys.LdapConfig.SyncUserOnLogin, Keys.LdapConfig.UpdateUserOnLogin) +} + func (auth *Authentication) SaveSession(rw http.ResponseWriter, r *http.Request, user *schema.User) error { session, err := auth.sessionStore.New(r, "session") if err != nil { @@ -443,13 +448,13 @@ func (auth *Authentication) AuthAPI( if user != nil { switch { case len(user.Roles) == 1: - if user.HasRole(schema.RoleApi) { + if user.HasRole(schema.RoleAPI) { ctx := context.WithValue(r.Context(), repository.ContextUserKey, user) onsuccess.ServeHTTP(rw, r.WithContext(ctx)) return } case len(user.Roles) >= 2: - if user.HasAllRoles([]schema.Role{schema.RoleAdmin, schema.RoleApi}) { + if user.HasAllRoles([]schema.Role{schema.RoleAdmin, schema.RoleAPI}) { ctx := context.WithValue(r.Context(), repository.ContextUserKey, user) onsuccess.ServeHTTP(rw, r.WithContext(ctx)) return @@ -479,13 +484,13 @@ func (auth *Authentication) AuthUserAPI( if user != nil { switch { case len(user.Roles) == 1: - if user.HasRole(schema.RoleApi) { + if user.HasRole(schema.RoleAPI) { ctx := context.WithValue(r.Context(), repository.ContextUserKey, user) onsuccess.ServeHTTP(rw, r.WithContext(ctx)) return } case len(user.Roles) >= 2: - if user.HasRole(schema.RoleApi) && user.HasAnyRole([]schema.Role{schema.RoleUser, schema.RoleManager, schema.RoleSupport, schema.RoleAdmin}) { + if user.HasRole(schema.RoleAPI) && user.HasAnyRole([]schema.Role{schema.RoleUser, schema.RoleManager, schema.RoleSupport, schema.RoleAdmin}) { ctx := context.WithValue(r.Context(), repository.ContextUserKey, user) onsuccess.ServeHTTP(rw, r.WithContext(ctx)) return @@ -515,13 +520,13 @@ func (auth *Authentication) AuthMetricStoreAPI( if user != nil { switch { case len(user.Roles) == 1: - if user.HasRole(schema.RoleApi) { + if user.HasRole(schema.RoleAPI) { ctx := context.WithValue(r.Context(), repository.ContextUserKey, user) onsuccess.ServeHTTP(rw, r.WithContext(ctx)) return } case len(user.Roles) >= 2: - if user.HasRole(schema.RoleApi) && user.HasAnyRole([]schema.Role{schema.RoleUser, schema.RoleManager, schema.RoleAdmin}) { + if user.HasRole(schema.RoleAPI) && user.HasAnyRole([]schema.Role{schema.RoleUser, schema.RoleManager, schema.RoleAdmin}) { ctx := context.WithValue(r.Context(), 
repository.ContextUserKey, user) onsuccess.ServeHTTP(rw, r.WithContext(ctx)) return diff --git a/internal/auth/ldap.go b/internal/auth/ldap.go index 5e12f07b..a174bb9d 100644 --- a/internal/auth/ldap.go +++ b/internal/auth/ldap.go @@ -6,11 +6,12 @@ package auth import ( - "errors" "fmt" + "net" "net/http" "os" "strings" + "time" "github.com/ClusterCockpit/cc-backend/internal/repository" cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger" @@ -25,16 +26,19 @@ type LdapConfig struct { UserBind string `json:"user-bind"` UserFilter string `json:"user-filter"` UserAttr string `json:"username-attr"` + UIDAttr string `json:"uid-attr"` SyncInterval string `json:"sync-interval"` // Parsed using time.ParseDuration. SyncDelOldUsers bool `json:"sync-del-old-users"` - // Should an non-existent user be added to the DB if user exists in ldap directory - SyncUserOnLogin bool `json:"sync-user-on-login"` + // Should a non-existent user be added to the DB if user exists in ldap directory + SyncUserOnLogin bool `json:"sync-user-on-login"` + UpdateUserOnLogin bool `json:"update-user-on-login"` } type LdapAuthenticator struct { syncPassword string UserAttr string + UIDAttr string } var _ Authenticator = (*LdapAuthenticator)(nil) @@ -51,6 +55,12 @@ func (la *LdapAuthenticator) Init() error { la.UserAttr = "gecos" } + if Keys.LdapConfig.UIDAttr != "" { + la.UIDAttr = Keys.LdapConfig.UIDAttr + } else { + la.UIDAttr = "uid" + } + return nil } @@ -66,55 +76,44 @@ func (la *LdapAuthenticator) CanLogin( if user.AuthSource == schema.AuthViaLDAP { return user, true } - } else { - if lc.SyncUserOnLogin { - l, err := la.getLdapConnection(true) - if err != nil { - cclog.Error("LDAP connection error") - return nil, false - } - defer l.Close() - - // Search for the given username - searchRequest := ldap.NewSearchRequest( - lc.UserBase, - ldap.ScopeWholeSubtree, ldap.NeverDerefAliases, 0, 0, false, - fmt.Sprintf("(&%s(uid=%s))", lc.UserFilter, username), - []string{"dn", "uid", la.UserAttr}, nil) - - sr, err := l.Search(searchRequest) - if err != nil { - cclog.Warn(err) - return nil, false - } - - if len(sr.Entries) != 1 { - cclog.Warn("LDAP: User does not exist or too many entries returned") - return nil, false - } - - entry := sr.Entries[0] - name := entry.GetAttributeValue(la.UserAttr) - var roles []string - roles = append(roles, schema.GetRoleString(schema.RoleUser)) - projects := make([]string, 0) - - user = &schema.User{ - Username: username, - Name: name, - Roles: roles, - Projects: projects, - AuthType: schema.AuthSession, - AuthSource: schema.AuthViaLDAP, - } - - if err := repository.GetUserRepository().AddUser(user); err != nil { - cclog.Errorf("User '%s' LDAP: Insert into DB failed", username) - return nil, false - } - - return user, true + } else if lc.SyncUserOnLogin { + l, err := la.getLdapConnection(true) + if err != nil { + cclog.Error("LDAP connection error") + return nil, false } + defer l.Close() + + // Search for the given username + searchRequest := ldap.NewSearchRequest( + lc.UserBase, + ldap.ScopeWholeSubtree, ldap.NeverDerefAliases, 0, 0, false, + fmt.Sprintf("(&%s(%s=%s))", lc.UserFilter, la.UIDAttr, ldap.EscapeFilter(username)), + []string{"dn", la.UIDAttr, la.UserAttr}, nil) + + sr, err := l.Search(searchRequest) + if err != nil { + cclog.Warn(err) + return nil, false + } + + if len(sr.Entries) != 1 { + cclog.Warn("LDAP: User does not exist or too many entries returned") + return nil, false + } + + entry := sr.Entries[0] + user = &schema.User{ + Username: username, + Name: 
entry.GetAttributeValue(la.UserAttr), + Roles: []string{schema.GetRoleString(schema.RoleUser)}, + Projects: make([]string, 0), + AuthType: schema.AuthSession, + AuthSource: schema.AuthViaLDAP, + } + + handleLdapUser(user) + return user, true } return nil, false @@ -132,7 +131,7 @@ func (la *LdapAuthenticator) Login( } defer l.Close() - userDn := strings.ReplaceAll(Keys.LdapConfig.UserBind, "{username}", user.Username) + userDn := strings.ReplaceAll(Keys.LdapConfig.UserBind, "{username}", ldap.EscapeDN(user.Username)) if err := l.Bind(userDn, r.FormValue("password")); err != nil { cclog.Errorf("AUTH/LDAP > Authentication for user %s failed: %v", user.Username, err) @@ -170,7 +169,7 @@ func (la *LdapAuthenticator) Sync() error { lc.UserBase, ldap.ScopeWholeSubtree, ldap.NeverDerefAliases, 0, 0, false, lc.UserFilter, - []string{"dn", "uid", la.UserAttr}, nil)) + []string{"dn", la.UIDAttr, la.UserAttr}, nil)) if err != nil { cclog.Warn("LDAP search error") return err @@ -178,9 +177,9 @@ func (la *LdapAuthenticator) Sync() error { newnames := map[string]string{} for _, entry := range ldapResults.Entries { - username := entry.GetAttributeValue("uid") + username := entry.GetAttributeValue(la.UIDAttr) if username == "" { - return errors.New("no attribute 'uid'") + return fmt.Errorf("no attribute '%s'", la.UIDAttr) } _, ok := users[username] @@ -194,20 +193,19 @@ func (la *LdapAuthenticator) Sync() error { for username, where := range users { if where == InDB && lc.SyncDelOldUsers { - ur.DelUser(username) + if err := ur.DelUser(username); err != nil { + cclog.Errorf("User '%s' LDAP: Delete from DB failed: %v", username, err) + return err + } cclog.Debugf("sync: remove %v (does not show up in LDAP anymore)", username) } else if where == InLdap { name := newnames[username] - var roles []string - roles = append(roles, schema.GetRoleString(schema.RoleUser)) - projects := make([]string, 0) - user := &schema.User{ Username: username, Name: name, - Roles: roles, - Projects: projects, + Roles: []string{schema.GetRoleString(schema.RoleUser)}, + Projects: make([]string, 0), AuthSource: schema.AuthViaLDAP, } @@ -224,11 +222,13 @@ func (la *LdapAuthenticator) Sync() error { func (la *LdapAuthenticator) getLdapConnection(admin bool) (*ldap.Conn, error) { lc := Keys.LdapConfig - conn, err := ldap.DialURL(lc.URL) + conn, err := ldap.DialURL(lc.URL, + ldap.DialWithDialer(&net.Dialer{Timeout: 10 * time.Second})) if err != nil { cclog.Warn("LDAP URL dial failed") return nil, err } + conn.SetTimeout(30 * time.Second) if admin { if err := conn.Bind(lc.SearchDN, la.syncPassword); err != nil { diff --git a/internal/auth/oidc.go b/internal/auth/oidc.go index a3ff0c2c..ec6c77a7 100644 --- a/internal/auth/oidc.go +++ b/internal/auth/oidc.go @@ -9,6 +9,7 @@ import ( "context" "crypto/rand" "encoding/base64" + "fmt" "io" "net/http" "os" @@ -18,7 +19,7 @@ import ( cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger" "github.com/ClusterCockpit/cc-lib/v2/schema" "github.com/coreos/go-oidc/v3/oidc" - "github.com/gorilla/mux" + "github.com/go-chi/chi/v5" "golang.org/x/oauth2" ) @@ -50,6 +51,7 @@ func setCallbackCookie(w http.ResponseWriter, r *http.Request, name, value strin MaxAge: int(time.Hour.Seconds()), Secure: r.TLS != nil, HttpOnly: true, + SameSite: http.SameSiteLaxMode, } http.SetCookie(w, c) } @@ -77,8 +79,7 @@ func NewOIDC(a *Authentication) *OIDC { ClientID: clientID, ClientSecret: clientSecret, Endpoint: provider.Endpoint(), - RedirectURL: "oidc-callback", - Scopes: []string{oidc.ScopeOpenID, "profile", "email"}, + 
Scopes: []string{oidc.ScopeOpenID, "profile"}, } oa := &OIDC{provider: provider, client: client, clientID: clientID, authentication: a} @@ -86,7 +87,7 @@ func NewOIDC(a *Authentication) *OIDC { return oa } -func (oa *OIDC) RegisterEndpoints(r *mux.Router) { +func (oa *OIDC) RegisterEndpoints(r chi.Router) { r.HandleFunc("/oidc-login", oa.OAuth2Login) r.HandleFunc("/oidc-callback", oa.OAuth2Callback) } @@ -122,54 +123,93 @@ func (oa *OIDC) OAuth2Callback(rw http.ResponseWriter, r *http.Request) { token, err := oa.client.Exchange(ctx, code, oauth2.VerifierOption(codeVerifier)) if err != nil { - http.Error(rw, "Failed to exchange token: "+err.Error(), http.StatusInternalServerError) + cclog.Errorf("token exchange failed: %s", err.Error()) + http.Error(rw, "Authentication failed during token exchange", http.StatusInternalServerError) return } // Get user info from OIDC provider with same timeout userInfo, err := oa.provider.UserInfo(ctx, oauth2.StaticTokenSource(token)) if err != nil { - http.Error(rw, "Failed to get userinfo: "+err.Error(), http.StatusInternalServerError) + cclog.Errorf("failed to get userinfo: %s", err.Error()) + http.Error(rw, "Failed to retrieve user information", http.StatusInternalServerError) return } - // // Extract the ID Token from OAuth2 token. - // rawIDToken, ok := token.Extra("id_token").(string) - // if !ok { - // http.Error(rw, "Cannot access idToken", http.StatusInternalServerError) - // } - // - // verifier := oa.provider.Verifier(&oidc.Config{ClientID: oa.clientID}) - // // Parse and verify ID Token payload. - // idToken, err := verifier.Verify(context.Background(), rawIDToken) - // if err != nil { - // http.Error(rw, "Failed to extract idToken: "+err.Error(), http.StatusInternalServerError) - // } + // Verify ID token and nonce to prevent replay attacks + rawIDToken, ok := token.Extra("id_token").(string) + if !ok { + http.Error(rw, "ID token not found in response", http.StatusInternalServerError) + return + } + + nonceCookie, err := r.Cookie("nonce") + if err != nil { + http.Error(rw, "nonce cookie not found", http.StatusBadRequest) + return + } + + verifier := oa.provider.Verifier(&oidc.Config{ClientID: oa.clientID}) + idToken, err := verifier.Verify(ctx, rawIDToken) + if err != nil { + cclog.Errorf("ID token verification failed: %s", err.Error()) + http.Error(rw, "ID token verification failed", http.StatusInternalServerError) + return + } + + if idToken.Nonce != nonceCookie.Value { + http.Error(rw, "Nonce mismatch", http.StatusBadRequest) + return + } projects := make([]string, 0) - // Extract custom claims + // Extract custom claims from userinfo var claims struct { Username string `json:"preferred_username"` Name string `json:"name"` - Profile struct { + // Keycloak realm-level roles + RealmAccess struct { + Roles []string `json:"roles"` + } `json:"realm_access"` + // Keycloak client-level roles + ResourceAccess struct { Client struct { Roles []string `json:"roles"` } `json:"clustercockpit"` } `json:"resource_access"` } if err := userInfo.Claims(&claims); err != nil { - http.Error(rw, "Failed to extract Claims: "+err.Error(), http.StatusInternalServerError) + cclog.Errorf("failed to extract claims: %s", err.Error()) + http.Error(rw, "Failed to extract user claims", http.StatusInternalServerError) + return + } + + if claims.Username == "" { + http.Error(rw, "Username claim missing from OIDC provider", http.StatusBadRequest) + return + } + + // Merge roles from both client-level and realm-level access + oidcRoles := 
append(claims.ResourceAccess.Client.Roles, claims.RealmAccess.Roles...) + + roleSet := make(map[string]bool) + for _, r := range oidcRoles { + switch r { + case "user": + roleSet[schema.GetRoleString(schema.RoleUser)] = true + case "admin": + roleSet[schema.GetRoleString(schema.RoleAdmin)] = true + case "manager": + roleSet[schema.GetRoleString(schema.RoleManager)] = true + case "support": + roleSet[schema.GetRoleString(schema.RoleSupport)] = true + } } var roles []string - for _, r := range claims.Profile.Client.Roles { - switch r { - case "user": - roles = append(roles, schema.GetRoleString(schema.RoleUser)) - case "admin": - roles = append(roles, schema.GetRoleString(schema.RoleAdmin)) - } + for role := range roleSet { + roles = append(roles, role) } if len(roles) == 0 { @@ -188,8 +228,12 @@ func (oa *OIDC) OAuth2Callback(rw http.ResponseWriter, r *http.Request) { handleOIDCUser(user) } - oa.authentication.SaveSession(rw, r, user) - cclog.Infof("login successfull: user: %#v (roles: %v, projects: %v)", user.Username, user.Roles, user.Projects) + if err := oa.authentication.SaveSession(rw, r, user); err != nil { + cclog.Errorf("session save failed for user %q: %s", user.Username, err.Error()) + http.Error(rw, "Failed to create session", http.StatusInternalServerError) + return + } + cclog.Infof("login successful: user: %#v (roles: %v, projects: %v)", user.Username, user.Roles, user.Projects) userCtx := context.WithValue(r.Context(), repository.ContextUserKey, user) http.RedirectHandler("/", http.StatusTemporaryRedirect).ServeHTTP(rw, r.WithContext(userCtx)) } @@ -206,7 +250,24 @@ func (oa *OIDC) OAuth2Login(rw http.ResponseWriter, r *http.Request) { codeVerifier := oauth2.GenerateVerifier() setCallbackCookie(rw, r, "verifier", codeVerifier) + // Generate nonce for ID token replay protection + nonce, err := randString(16) + if err != nil { + http.Error(rw, "Internal error", http.StatusInternalServerError) + return + } + setCallbackCookie(rw, r, "nonce", nonce) + + // Build redirect URL from the incoming request + scheme := "https" + if r.TLS == nil && r.Header.Get("X-Forwarded-Proto") != "https" { + scheme = "http" + } + oa.client.RedirectURL = fmt.Sprintf("%s://%s/oidc-callback", scheme, r.Host) + // Redirect user to consent page to ask for permission - url := oa.client.AuthCodeURL(state, oauth2.AccessTypeOffline, oauth2.S256ChallengeOption(codeVerifier)) + url := oa.client.AuthCodeURL(state, oauth2.AccessTypeOffline, + oauth2.S256ChallengeOption(codeVerifier), + oidc.Nonce(nonce)) http.Redirect(rw, r, url, http.StatusFound) } diff --git a/internal/auth/schema.go b/internal/auth/schema.go index 496e899b..b6ee0702 100644 --- a/internal/auth/schema.go +++ b/internal/auth/schema.go @@ -92,9 +92,17 @@ var configSchema = ` "description": "Delete obsolete users in database.", "type": "boolean" }, + "uid-attr": { + "description": "LDAP attribute used as login username. 
Default: uid", + "type": "string" + }, "sync-user-on-login": { "description": "Add non-existent user to DB at login attempt if user exists in Ldap directory", "type": "boolean" + }, + "update-user-on-login": { + "description": "Should an existent user attributes in the DB be updated at login attempt with values from LDAP.", + "type": "boolean" } }, "required": ["url", "user-base", "search-dn", "user-bind", "user-filter"] diff --git a/internal/config/config.go b/internal/config/config.go index d5a4df48..9de40695 100644 --- a/internal/config/config.go +++ b/internal/config/config.go @@ -74,6 +74,23 @@ type ProgramConfig struct { // Systemd unit name for log viewer (default: "clustercockpit") SystemdUnit string `json:"systemd-unit"` + + // Node state retention configuration + NodeStateRetention *NodeStateRetention `json:"nodestate-retention"` +} + +type NodeStateRetention struct { + Policy string `json:"policy"` // "delete" or "parquet" + Age int `json:"age"` // hours, default 24 + TargetKind string `json:"target-kind"` // "file" or "s3" + TargetPath string `json:"target-path"` + TargetEndpoint string `json:"target-endpoint"` + TargetBucket string `json:"target-bucket"` + TargetAccessKey string `json:"target-access-key"` + TargetSecretKey string `json:"target-secret-key"` + TargetRegion string `json:"target-region"` + TargetUsePathStyle bool `json:"target-use-path-style"` + MaxFileSizeMB int `json:"max-file-size-mb"` } type ResampleConfig struct { diff --git a/internal/config/schema.go b/internal/config/schema.go index 0d575b3c..bd1b314e 100644 --- a/internal/config/schema.go +++ b/internal/config/schema.go @@ -130,6 +130,59 @@ var configSchema = ` } }, "required": ["subject-job-event", "subject-node-state"] + }, + "nodestate-retention": { + "description": "Node state retention configuration for cleaning up old node_state rows.", + "type": "object", + "properties": { + "policy": { + "description": "Retention policy: 'delete' to remove old rows, 'parquet' to archive then delete.", + "type": "string", + "enum": ["delete", "parquet"] + }, + "age": { + "description": "Retention age in hours (default: 24).", + "type": "integer" + }, + "target-kind": { + "description": "Target kind for parquet archiving: 'file' or 's3'.", + "type": "string", + "enum": ["file", "s3"] + }, + "target-path": { + "description": "Filesystem path for parquet file target.", + "type": "string" + }, + "target-endpoint": { + "description": "S3 endpoint URL.", + "type": "string" + }, + "target-bucket": { + "description": "S3 bucket name.", + "type": "string" + }, + "target-access-key": { + "description": "S3 access key.", + "type": "string" + }, + "target-secret-key": { + "description": "S3 secret key.", + "type": "string" + }, + "target-region": { + "description": "S3 region.", + "type": "string" + }, + "target-use-path-style": { + "description": "Use path-style S3 addressing.", + "type": "boolean" + }, + "max-file-size-mb": { + "description": "Maximum parquet file size in MB (default: 128).", + "type": "integer" + } + }, + "required": ["policy"] } } }` diff --git a/internal/graph/generated/generated.go b/internal/graph/generated/generated.go index e1e5ea71..965fd860 100644 --- a/internal/graph/generated/generated.go +++ b/internal/graph/generated/generated.go @@ -10245,7 +10245,7 @@ func (ec *executionContext) _Series_id(ctx context.Context, field graphql.Collec field, ec.fieldContext_Series_id, func(ctx context.Context) (any, error) { - return obj.Id, nil + return obj.ID, nil }, nil, ec.marshalOString2áš–string, diff --git 
a/internal/graph/schema.resolvers.go b/internal/graph/schema.resolvers.go index 19d04eab..059bd16d 100644 --- a/internal/graph/schema.resolvers.go +++ b/internal/graph/schema.resolvers.go @@ -552,7 +552,7 @@ func (r *queryResolver) ScopedJobStats(ctx context.Context, id string, metrics [ for _, stat := range stats { mdlStats = append(mdlStats, &model.ScopedStats{ Hostname: stat.Hostname, - ID: stat.Id, + ID: stat.ID, Data: stat.Data, }) } @@ -824,6 +824,7 @@ func (r *queryResolver) NodeMetricsList(ctx context.Context, cluster string, sub } nodeRepo := repository.GetNodeRepository() + // nodes -> array hostname nodes, stateMap, countNodes, hasNextPage, nerr := nodeRepo.GetNodesForList(ctx, cluster, subCluster, stateFilter, nodeFilter, page) if nerr != nil { return nil, errors.New("could not retrieve node list required for resolving NodeMetricsList") @@ -835,6 +836,7 @@ func (r *queryResolver) NodeMetricsList(ctx context.Context, cluster string, sub } } + // data -> map hostname:jobdata data, err := metricdispatch.LoadNodeListData(cluster, subCluster, nodes, metrics, scopes, *resolution, from, to, ctx) if err != nil { cclog.Warn("error while loading node data (Resolver.NodeMetricsList") @@ -842,18 +844,18 @@ func (r *queryResolver) NodeMetricsList(ctx context.Context, cluster string, sub } nodeMetricsList := make([]*model.NodeMetrics, 0, len(data)) - for hostname, metrics := range data { + for _, hostname := range nodes { host := &model.NodeMetrics{ Host: hostname, State: stateMap[hostname], - Metrics: make([]*model.JobMetricWithName, 0, len(metrics)*len(scopes)), + Metrics: make([]*model.JobMetricWithName, 0), } host.SubCluster, err = archive.GetSubClusterByNode(cluster, hostname) if err != nil { cclog.Warnf("error in nodeMetrics resolver: %s", err) } - for metric, scopedMetrics := range metrics { + for metric, scopedMetrics := range data[hostname] { for scope, scopedMetric := range scopedMetrics { host.Metrics = append(host.Metrics, &model.JobMetricWithName{ Name: metric, @@ -867,7 +869,8 @@ func (r *queryResolver) NodeMetricsList(ctx context.Context, cluster string, sub } nodeMetricsListResult := &model.NodesResultList{ - Items: nodeMetricsList, + Items: nodeMetricsList, + // TotalNodes depends on sum of nodes grouped on latest timestamp, see repo/node.go:357 TotalNodes: &countNodes, HasNextPage: &hasNextPage, } diff --git a/internal/metricdispatch/dataLoader.go b/internal/metricdispatch/dataLoader.go index 78808a74..c420fee4 100644 --- a/internal/metricdispatch/dataLoader.go +++ b/internal/metricdispatch/dataLoader.go @@ -499,7 +499,7 @@ func copyJobMetric(src *schema.JobMetric) *schema.JobMetric { func copySeries(src *schema.Series) schema.Series { dst := schema.Series{ Hostname: src.Hostname, - Id: src.Id, + ID: src.ID, Statistics: src.Statistics, Data: make([]schema.Float, len(src.Data)), } diff --git a/internal/metricdispatch/dataLoader_test.go b/internal/metricdispatch/dataLoader_test.go index c4841f8d..65a366f9 100644 --- a/internal/metricdispatch/dataLoader_test.go +++ b/internal/metricdispatch/dataLoader_test.go @@ -21,7 +21,7 @@ func TestDeepCopy(t *testing.T) { Series: []schema.Series{ { Hostname: "node001", - Id: &nodeId, + ID: &nodeId, Data: []schema.Float{1.0, 2.0, 3.0}, Statistics: schema.MetricStatistics{ Min: 1.0, diff --git a/internal/metricstoreclient/cc-metric-store.go b/internal/metricstoreclient/cc-metric-store.go index aadbe1b1..4472b825 100644 --- a/internal/metricstoreclient/cc-metric-store.go +++ b/internal/metricstoreclient/cc-metric-store.go @@ -267,7 +267,7 @@ 
func (ccms *CCMetricStore) LoadData( jobMetric.Series = append(jobMetric.Series, schema.Series{ Hostname: query.Hostname, - Id: id, + ID: id, Statistics: schema.MetricStatistics{ Avg: float64(res.Avg), Min: float64(res.Min), @@ -419,7 +419,7 @@ func (ccms *CCMetricStore) LoadScopedStats( scopedJobStats[metric][scope] = append(scopedJobStats[metric][scope], &schema.ScopedStats{ Hostname: query.Hostname, - Id: id, + ID: id, Data: &schema.MetricStatistics{ Avg: float64(res.Avg), Min: float64(res.Min), @@ -634,7 +634,7 @@ func (ccms *CCMetricStore) LoadNodeListData( scopeData.Series = append(scopeData.Series, schema.Series{ Hostname: query.Hostname, - Id: id, + ID: id, Statistics: schema.MetricStatistics{ Avg: float64(res.Avg), Min: float64(res.Min), diff --git a/internal/repository/jobCreate.go b/internal/repository/jobCreate.go index 6114ae5e..9f4f366d 100644 --- a/internal/repository/jobCreate.go +++ b/internal/repository/jobCreate.go @@ -71,8 +71,9 @@ func (r *JobRepository) SyncJobs() ([]*schema.Job, error) { jobs = append(jobs, job) } + // Use INSERT OR IGNORE to skip jobs already transferred by the stop path _, err = r.DB.Exec( - "INSERT INTO job (job_id, cluster, subcluster, start_time, hpc_user, project, cluster_partition, array_job_id, num_nodes, num_hwthreads, num_acc, shared, monitoring_status, smt, job_state, duration, walltime, footprint, energy, energy_footprint, resources, meta_data) SELECT job_id, cluster, subcluster, start_time, hpc_user, project, cluster_partition, array_job_id, num_nodes, num_hwthreads, num_acc, shared, monitoring_status, smt, job_state, duration, walltime, footprint, energy, energy_footprint, resources, meta_data FROM job_cache") + "INSERT OR IGNORE INTO job (job_id, cluster, subcluster, start_time, hpc_user, project, cluster_partition, array_job_id, num_nodes, num_hwthreads, num_acc, shared, monitoring_status, smt, job_state, duration, walltime, footprint, energy, energy_footprint, resources, meta_data) SELECT job_id, cluster, subcluster, start_time, hpc_user, project, cluster_partition, array_job_id, num_nodes, num_hwthreads, num_acc, shared, monitoring_status, smt, job_state, duration, walltime, footprint, energy, energy_footprint, resources, meta_data FROM job_cache") if err != nil { cclog.Warnf("Error while Job sync: %v", err) return nil, err @@ -87,6 +88,29 @@ func (r *JobRepository) SyncJobs() ([]*schema.Job, error) { return jobs, nil } +// TransferCachedJobToMain moves a job from job_cache to the job table. +// Caller must hold r.Mutex. Returns the new job table ID. 
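+//
+// Minimal usage sketch (for illustration only; the actual call site on the
+// job-stop path may differ):
+//
+//	r.Mutex.Lock()
+//	newID, err := r.TransferCachedJobToMain(cacheID)
+//	r.Mutex.Unlock()
+//	if err != nil {
+//		cclog.Errorf("transferring cached job failed: %v", err)
+//	}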
+func (r *JobRepository) TransferCachedJobToMain(cacheID int64) (int64, error) { + res, err := r.DB.Exec( + "INSERT INTO job (job_id, cluster, subcluster, start_time, hpc_user, project, cluster_partition, array_job_id, num_nodes, num_hwthreads, num_acc, shared, monitoring_status, smt, job_state, duration, walltime, footprint, energy, energy_footprint, resources, meta_data) SELECT job_id, cluster, subcluster, start_time, hpc_user, project, cluster_partition, array_job_id, num_nodes, num_hwthreads, num_acc, shared, monitoring_status, smt, job_state, duration, walltime, footprint, energy, energy_footprint, resources, meta_data FROM job_cache WHERE id = ?", + cacheID) + if err != nil { + return 0, fmt.Errorf("transferring cached job %d to main table failed: %w", cacheID, err) + } + + newID, err := res.LastInsertId() + if err != nil { + return 0, fmt.Errorf("getting new job ID after transfer failed: %w", err) + } + + _, err = r.DB.Exec("DELETE FROM job_cache WHERE id = ?", cacheID) + if err != nil { + return 0, fmt.Errorf("deleting cached job %d after transfer failed: %w", cacheID, err) + } + + return newID, nil +} + // Start inserts a new job in the table, returning the unique job ID. // Statistics are not transfered! func (r *JobRepository) Start(job *schema.Job) (id int64, err error) { @@ -129,20 +153,3 @@ func (r *JobRepository) Stop( return err } -func (r *JobRepository) StopCached( - jobID int64, - duration int32, - state schema.JobState, - monitoringStatus int32, -) (err error) { - // Note: StopCached updates job_cache table, not the main job table - // Cache invalidation happens when job is synced to main table - stmt := sq.Update("job_cache"). - Set("job_state", state). - Set("duration", duration). - Set("monitoring_status", monitoringStatus). - Where("job_cache.id = ?", jobID) - - _, err = stmt.RunWith(r.stmtCache).Exec() - return err -} diff --git a/internal/repository/jobCreate_test.go b/internal/repository/jobCreate_test.go index 3a586482..9e72555f 100644 --- a/internal/repository/jobCreate_test.go +++ b/internal/repository/jobCreate_test.go @@ -331,58 +331,60 @@ func TestStop(t *testing.T) { }) } -func TestStopCached(t *testing.T) { +func TestTransferCachedJobToMain(t *testing.T) { r := setup(t) - t.Run("successful stop cached job", func(t *testing.T) { + t.Run("successful transfer from cache to main", func(t *testing.T) { // Insert a job in job_cache job := createTestJob(999009, "testcluster") - id, err := r.Start(job) + cacheID, err := r.Start(job) require.NoError(t, err) - // Stop the cached job - duration := int32(3600) - state := schema.JobStateCompleted - monitoringStatus := int32(schema.MonitoringStatusArchivingSuccessful) + // Transfer the cached job to the main table + r.Mutex.Lock() + newID, err := r.TransferCachedJobToMain(cacheID) + r.Mutex.Unlock() + require.NoError(t, err, "TransferCachedJobToMain should succeed") + assert.NotEqual(t, cacheID, newID, "New ID should differ from cache ID") - err = r.StopCached(id, duration, state, monitoringStatus) - require.NoError(t, err, "StopCached should succeed") - - // Verify job was updated in job_cache table - var retrievedDuration int32 - var retrievedState string - var retrievedMonStatus int32 - err = r.DB.QueryRow(`SELECT duration, job_state, monitoring_status FROM job_cache WHERE id = ?`, id).Scan( - &retrievedDuration, &retrievedState, &retrievedMonStatus) + // Verify job exists in job table + var count int + err = r.DB.QueryRow(`SELECT COUNT(*) FROM job WHERE id = ?`, newID).Scan(&count) require.NoError(t, err) - 
assert.Equal(t, duration, retrievedDuration) - assert.Equal(t, string(state), retrievedState) - assert.Equal(t, monitoringStatus, retrievedMonStatus) + assert.Equal(t, 1, count, "Job should exist in main table") + + // Verify job was removed from job_cache + err = r.DB.QueryRow(`SELECT COUNT(*) FROM job_cache WHERE id = ?`, cacheID).Scan(&count) + require.NoError(t, err) + assert.Equal(t, 0, count, "Job should be removed from cache") // Clean up - _, err = r.DB.Exec("DELETE FROM job_cache WHERE id = ?", id) + _, err = r.DB.Exec("DELETE FROM job WHERE id = ?", newID) require.NoError(t, err) }) - t.Run("stop cached job does not affect job table", func(t *testing.T) { + t.Run("transfer preserves job data", func(t *testing.T) { // Insert a job in job_cache job := createTestJob(999010, "testcluster") - id, err := r.Start(job) + cacheID, err := r.Start(job) require.NoError(t, err) - // Stop the cached job - err = r.StopCached(id, 3600, schema.JobStateCompleted, int32(schema.MonitoringStatusArchivingSuccessful)) + // Transfer the cached job + r.Mutex.Lock() + newID, err := r.TransferCachedJobToMain(cacheID) + r.Mutex.Unlock() require.NoError(t, err) - // Verify job table was not affected - var count int - err = r.DB.QueryRow(`SELECT COUNT(*) FROM job WHERE job_id = ? AND cluster = ?`, - job.JobID, job.Cluster).Scan(&count) + // Verify the transferred job has the correct data + var jobID int64 + var cluster string + err = r.DB.QueryRow(`SELECT job_id, cluster FROM job WHERE id = ?`, newID).Scan(&jobID, &cluster) require.NoError(t, err) - assert.Equal(t, 0, count, "Job table should not be affected by StopCached") + assert.Equal(t, job.JobID, jobID) + assert.Equal(t, job.Cluster, cluster) // Clean up - _, err = r.DB.Exec("DELETE FROM job_cache WHERE id = ?", id) + _, err = r.DB.Exec("DELETE FROM job WHERE id = ?", newID) require.NoError(t, err) }) } diff --git a/internal/repository/jobQuery.go b/internal/repository/jobQuery.go index 658413e8..81779583 100644 --- a/internal/repository/jobQuery.go +++ b/internal/repository/jobQuery.go @@ -150,7 +150,7 @@ func SecurityCheckWithUser(user *schema.User, query sq.SelectBuilder) (sq.Select } switch { - case len(user.Roles) == 1 && user.HasRole(schema.RoleApi): + case len(user.Roles) == 1 && user.HasRole(schema.RoleAPI): return query, nil case user.HasAnyRole([]schema.Role{schema.RoleAdmin, schema.RoleSupport}): return query, nil diff --git a/internal/repository/migrations/sqlite3/10_node-table.up.sql b/internal/repository/migrations/sqlite3/10_node-table.up.sql index 7b5b5ac7..b788a8a9 100644 --- a/internal/repository/migrations/sqlite3/10_node-table.up.sql +++ b/internal/repository/migrations/sqlite3/10_node-table.up.sql @@ -23,6 +23,7 @@ CREATE TABLE "node_state" ( CHECK (health_state IN ( 'full', 'partial', 'failed' )), + health_metrics TEXT, -- JSON array of strings node_id INTEGER, FOREIGN KEY (node_id) REFERENCES node (id) ); @@ -37,6 +38,7 @@ CREATE INDEX IF NOT EXISTS nodestates_state_timestamp ON node_state (node_state, CREATE INDEX IF NOT EXISTS nodestates_health_timestamp ON node_state (health_state, time_stamp); CREATE INDEX IF NOT EXISTS nodestates_nodeid_state ON node_state (node_id, node_state); CREATE INDEX IF NOT EXISTS nodestates_nodeid_health ON node_state (node_id, health_state); +CREATE INDEX IF NOT EXISTS nodestates_nodeid_timestamp ON node_state (node_id, time_stamp DESC); -- Add NEW Indices For Increased Amounts of Tags CREATE INDEX IF NOT EXISTS tags_jobid ON jobtag (job_id); diff --git a/internal/repository/node.go 
b/internal/repository/node.go index df3aec8b..2ffe6698 100644 --- a/internal/repository/node.go +++ b/internal/repository/node.go @@ -52,6 +52,38 @@ func GetNodeRepository() *NodeRepository { return nodeRepoInstance } +// latestStateCondition returns a squirrel expression that restricts node_state +// rows to the latest per node_id using a correlated subquery. +// Requires the query to join node and node_state tables. +func latestStateCondition() sq.Sqlizer { + return sq.Expr( + "node_state.id = (SELECT ns2.id FROM node_state ns2 WHERE ns2.node_id = node.id ORDER BY ns2.time_stamp DESC LIMIT 1)", + ) +} + +// applyNodeFilters applies common NodeFilter conditions to a query that joins +// the node and node_state tables with latestStateCondition. +func applyNodeFilters(query sq.SelectBuilder, filters []*model.NodeFilter) sq.SelectBuilder { + for _, f := range filters { + if f.Cluster != nil { + query = buildStringCondition("node.cluster", f.Cluster, query) + } + if f.SubCluster != nil { + query = buildStringCondition("node.subcluster", f.SubCluster, query) + } + if f.Hostname != nil { + query = buildStringCondition("node.hostname", f.Hostname, query) + } + if f.SchedulerState != nil { + query = query.Where("node_state.node_state = ?", f.SchedulerState) + } + if f.HealthState != nil { + query = query.Where("node_state.health_state = ?", f.HealthState) + } + } + return query +} + func (r *NodeRepository) FetchMetadata(hostname string, cluster string) (map[string]string, error) { start := time.Now() @@ -82,17 +114,16 @@ func (r *NodeRepository) FetchMetadata(hostname string, cluster string) (map[str func (r *NodeRepository) GetNode(hostname string, cluster string, withMeta bool) (*schema.Node, error) { node := &schema.Node{} - var timestamp int - if err := sq.Select("node.hostname", "node.cluster", "node.subcluster", "node_state.node_state", - "node_state.health_state", "MAX(node_state.time_stamp) as time"). - From("node_state"). - Join("node ON node_state.node_id = node.id"). + if err := sq.Select("node.hostname", "node.cluster", "node.subcluster", + "node_state.node_state", "node_state.health_state"). + From("node"). + Join("node_state ON node_state.node_id = node.id"). + Where(latestStateCondition()). Where("node.hostname = ?", hostname). Where("node.cluster = ?", cluster). - GroupBy("node_state.node_id"). RunWith(r.DB). - QueryRow().Scan(&node.Hostname, &node.Cluster, &node.SubCluster, &node.NodeState, &node.HealthState, ×tamp); err != nil { - cclog.Warnf("Error while querying node '%s' at time '%d' from database: %v", hostname, timestamp, err) + QueryRow().Scan(&node.Hostname, &node.Cluster, &node.SubCluster, &node.NodeState, &node.HealthState); err != nil { + cclog.Warnf("Error while querying node '%s' from database: %v", hostname, err) return nil, err } @@ -111,16 +142,15 @@ func (r *NodeRepository) GetNode(hostname string, cluster string, withMeta bool) func (r *NodeRepository) GetNodeByID(id int64, withMeta bool) (*schema.Node, error) { node := &schema.Node{} - var timestamp int - if err := sq.Select("node.hostname", "node.cluster", "node.subcluster", "node_state.node_state", - "node_state.health_state", "MAX(node_state.time_stamp) as time"). - From("node_state"). - Join("node ON node_state.node_id = node.id"). + if err := sq.Select("node.hostname", "node.cluster", "node.subcluster", + "node_state.node_state", "node_state.health_state"). + From("node"). + Join("node_state ON node_state.node_id = node.id"). + Where(latestStateCondition()). Where("node.id = ?", id). 
- GroupBy("node_state.node_id"). RunWith(r.DB). - QueryRow().Scan(&node.Hostname, &node.Cluster, &node.SubCluster, &node.NodeState, &node.HealthState, ×tamp); err != nil { - cclog.Warnf("Error while querying node ID '%d' at time '%d' from database: %v", id, timestamp, err) + QueryRow().Scan(&node.Hostname, &node.Cluster, &node.SubCluster, &node.NodeState, &node.HealthState); err != nil { + cclog.Warnf("Error while querying node ID '%d' from database: %v", id, err) return nil, err } @@ -169,9 +199,10 @@ func (r *NodeRepository) AddNode(node *schema.NodeDB) (int64, error) { } const NamedNodeStateInsert string = ` -INSERT INTO node_state (time_stamp, node_state, health_state, cpus_allocated, - memory_allocated, gpus_allocated, jobs_running, node_id) - VALUES (:time_stamp, :node_state, :health_state, :cpus_allocated, :memory_allocated, :gpus_allocated, :jobs_running, :node_id);` +INSERT INTO node_state (time_stamp, node_state, health_state, health_metrics, + cpus_allocated, memory_allocated, gpus_allocated, jobs_running, node_id) + VALUES (:time_stamp, :node_state, :health_state, :health_metrics, + :cpus_allocated, :memory_allocated, :gpus_allocated, :jobs_running, :node_id);` // TODO: Add real Monitoring Health State @@ -224,6 +255,75 @@ func (r *NodeRepository) UpdateNodeState(hostname string, cluster string, nodeSt // return nil // } +// NodeStateWithNode combines a node state row with denormalized node info. +type NodeStateWithNode struct { + ID int64 `db:"id"` + TimeStamp int64 `db:"time_stamp"` + NodeState string `db:"node_state"` + HealthState string `db:"health_state"` + HealthMetrics string `db:"health_metrics"` + CpusAllocated int `db:"cpus_allocated"` + MemoryAllocated int64 `db:"memory_allocated"` + GpusAllocated int `db:"gpus_allocated"` + JobsRunning int `db:"jobs_running"` + Hostname string `db:"hostname"` + Cluster string `db:"cluster"` + SubCluster string `db:"subcluster"` +} + +// FindNodeStatesBefore returns all node_state rows with time_stamp < cutoff, +// joined with node info for denormalized archiving. +func (r *NodeRepository) FindNodeStatesBefore(cutoff int64) ([]NodeStateWithNode, error) { + rows, err := sq.Select( + "node_state.id", "node_state.time_stamp", "node_state.node_state", + "node_state.health_state", "node_state.health_metrics", + "node_state.cpus_allocated", "node_state.memory_allocated", + "node_state.gpus_allocated", "node_state.jobs_running", + "node.hostname", "node.cluster", "node.subcluster", + ). + From("node_state"). + Join("node ON node_state.node_id = node.id"). + Where(sq.Lt{"node_state.time_stamp": cutoff}). + Where("node_state.id NOT IN (SELECT ns2.id FROM node_state ns2 WHERE ns2.time_stamp = (SELECT MAX(ns3.time_stamp) FROM node_state ns3 WHERE ns3.node_id = ns2.node_id))"). + OrderBy("node_state.time_stamp ASC"). + RunWith(r.DB).Query() + if err != nil { + return nil, err + } + defer rows.Close() + + var result []NodeStateWithNode + for rows.Next() { + var ns NodeStateWithNode + if err := rows.Scan(&ns.ID, &ns.TimeStamp, &ns.NodeState, + &ns.HealthState, &ns.HealthMetrics, + &ns.CpusAllocated, &ns.MemoryAllocated, + &ns.GpusAllocated, &ns.JobsRunning, + &ns.Hostname, &ns.Cluster, &ns.SubCluster); err != nil { + return nil, err + } + result = append(result, ns) + } + return result, nil +} + +// DeleteNodeStatesBefore removes node_state rows with time_stamp < cutoff, +// but always preserves the row with the latest timestamp per node_id. 
+func (r *NodeRepository) DeleteNodeStatesBefore(cutoff int64) (int64, error) { + res, err := r.DB.Exec( + `DELETE FROM node_state WHERE time_stamp < ? + AND id NOT IN ( + SELECT id FROM node_state ns2 + WHERE ns2.time_stamp = (SELECT MAX(ns3.time_stamp) FROM node_state ns3 WHERE ns3.node_id = ns2.node_id) + )`, + cutoff, + ) + if err != nil { + return 0, err + } + return res.RowsAffected() +} + func (r *NodeRepository) DeleteNode(id int64) error { _, err := r.DB.Exec(`DELETE FROM node WHERE node.id = ?`, id) if err != nil { @@ -243,38 +343,17 @@ func (r *NodeRepository) QueryNodes( order *model.OrderByInput, // Currently unused! ) ([]*schema.Node, error) { query, qerr := AccessCheck(ctx, - sq.Select("hostname", "cluster", "subcluster", "node_state", "health_state", "MAX(time_stamp) as time"). + sq.Select("node.hostname", "node.cluster", "node.subcluster", + "node_state.node_state", "node_state.health_state"). From("node"). - Join("node_state ON node_state.node_id = node.id")) + Join("node_state ON node_state.node_id = node.id"). + Where(latestStateCondition())) if qerr != nil { return nil, qerr } - for _, f := range filters { - if f.Cluster != nil { - query = buildStringCondition("cluster", f.Cluster, query) - } - if f.SubCluster != nil { - query = buildStringCondition("subcluster", f.SubCluster, query) - } - if f.Hostname != nil { - query = buildStringCondition("hostname", f.Hostname, query) - } - if f.SchedulerState != nil { - query = query.Where("node_state = ?", f.SchedulerState) - // Requires Additional time_stamp Filter: Else the last (past!) time_stamp with queried state will be returned - now := time.Now().Unix() - query = query.Where(sq.Gt{"time_stamp": (now - 60)}) - } - if f.HealthState != nil { - query = query.Where("health_state = ?", f.HealthState) - // Requires Additional time_stamp Filter: Else the last (past!) time_stamp with queried state will be returned - now := time.Now().Unix() - query = query.Where(sq.Gt{"time_stamp": (now - 60)}) - } - } - - query = query.GroupBy("node_id").OrderBy("hostname ASC") + query = applyNodeFilters(query, filters) + query = query.OrderBy("node.hostname ASC") if page != nil && page.ItemsPerPage != -1 { limit := uint64(page.ItemsPerPage) @@ -291,11 +370,10 @@ func (r *NodeRepository) QueryNodes( nodes := make([]*schema.Node, 0) for rows.Next() { node := schema.Node{} - var timestamp int if err := rows.Scan(&node.Hostname, &node.Cluster, &node.SubCluster, - &node.NodeState, &node.HealthState, ×tamp); err != nil { + &node.NodeState, &node.HealthState); err != nil { rows.Close() - cclog.Warnf("Error while scanning rows (QueryNodes) at time '%d'", timestamp) + cclog.Warn("Error while scanning rows (QueryNodes)") return nil, err } nodes = append(nodes, &node) @@ -305,72 +383,39 @@ func (r *NodeRepository) QueryNodes( } // CountNodes returns the total matched nodes based on a node filter. It always operates -// on the last state (largest timestamp). +// on the last state (largest timestamp) per node. func (r *NodeRepository) CountNodes( ctx context.Context, filters []*model.NodeFilter, ) (int, error) { query, qerr := AccessCheck(ctx, - sq.Select("time_stamp", "count(*) as countRes"). + sq.Select("COUNT(*)"). From("node"). - Join("node_state ON node_state.node_id = node.id")) + Join("node_state ON node_state.node_id = node.id"). 
+ Where(latestStateCondition())) if qerr != nil { return 0, qerr } - for _, f := range filters { - if f.Cluster != nil { - query = buildStringCondition("cluster", f.Cluster, query) - } - if f.SubCluster != nil { - query = buildStringCondition("subcluster", f.SubCluster, query) - } - if f.Hostname != nil { - query = buildStringCondition("hostname", f.Hostname, query) - } - if f.SchedulerState != nil { - query = query.Where("node_state = ?", f.SchedulerState) - // Requires Additional time_stamp Filter: Else the last (past!) time_stamp with queried state will be returned - now := time.Now().Unix() - query = query.Where(sq.Gt{"time_stamp": (now - 60)}) - } - if f.HealthState != nil { - query = query.Where("health_state = ?", f.HealthState) - // Requires Additional time_stamp Filter: Else the last (past!) time_stamp with queried state will be returned - now := time.Now().Unix() - query = query.Where(sq.Gt{"time_stamp": (now - 60)}) - } - } + query = applyNodeFilters(query, filters) - query = query.GroupBy("time_stamp").OrderBy("time_stamp DESC").Limit(1) - - rows, err := query.RunWith(r.stmtCache).Query() - if err != nil { + var count int + if err := query.RunWith(r.stmtCache).QueryRow().Scan(&count); err != nil { queryString, queryVars, _ := query.ToSql() cclog.Errorf("Error while running query '%s' %v: %v", queryString, queryVars, err) return 0, err } - var totalNodes int - for rows.Next() { - var timestamp int - if err := rows.Scan(×tamp, &totalNodes); err != nil { - rows.Close() - cclog.Warnf("Error while scanning rows (CountNodes) at time '%d'", timestamp) - return 0, err - } - } - - return totalNodes, nil + return count, nil } func (r *NodeRepository) ListNodes(cluster string) ([]*schema.Node, error) { - q := sq.Select("node.hostname", "node.cluster", "node.subcluster", "node_state.node_state", - "node_state.health_state", "MAX(node_state.time_stamp) as time"). + q := sq.Select("node.hostname", "node.cluster", "node.subcluster", + "node_state.node_state", "node_state.health_state"). From("node"). Join("node_state ON node_state.node_id = node.id"). + Where(latestStateCondition()). Where("node.cluster = ?", cluster). - GroupBy("node_state.node_id"). OrderBy("node.hostname ASC") rows, err := q.RunWith(r.DB).Query() @@ -382,10 +427,9 @@ func (r *NodeRepository) ListNodes(cluster string) ([]*schema.Node, error) { defer rows.Close() for rows.Next() { node := &schema.Node{} - var timestamp int if err := rows.Scan(&node.Hostname, &node.Cluster, - &node.SubCluster, &node.NodeState, &node.HealthState, ×tamp); err != nil { - cclog.Warnf("Error while scanning node list (ListNodes) at time '%d'", timestamp) + &node.SubCluster, &node.NodeState, &node.HealthState); err != nil { + cclog.Warn("Error while scanning node list (ListNodes)") return nil, err } @@ -396,11 +440,11 @@ func (r *NodeRepository) ListNodes(cluster string) ([]*schema.Node, error) { } func (r *NodeRepository) MapNodes(cluster string) (map[string]string, error) { - q := sq.Select("node.hostname", "node_state.node_state", "MAX(node_state.time_stamp) as time"). + q := sq.Select("node.hostname", "node_state.node_state"). From("node"). Join("node_state ON node_state.node_id = node.id"). + Where(latestStateCondition()). Where("node.cluster = ?", cluster). - GroupBy("node_state.node_id"). 
OrderBy("node.hostname ASC") rows, err := q.RunWith(r.DB).Query() @@ -413,9 +457,8 @@ func (r *NodeRepository) MapNodes(cluster string) (map[string]string, error) { defer rows.Close() for rows.Next() { var hostname, nodestate string - var timestamp int - if err := rows.Scan(&hostname, &nodestate, ×tamp); err != nil { - cclog.Warnf("Error while scanning node list (MapNodes) at time '%d'", timestamp) + if err := rows.Scan(&hostname, &nodestate); err != nil { + cclog.Warn("Error while scanning node list (MapNodes)") return nil, err } @@ -426,33 +469,16 @@ func (r *NodeRepository) MapNodes(cluster string) (map[string]string, error) { } func (r *NodeRepository) CountStates(ctx context.Context, filters []*model.NodeFilter, column string) ([]*model.NodeStates, error) { - query, qerr := AccessCheck(ctx, sq.Select("hostname", column, "MAX(time_stamp) as time").From("node")) + query, qerr := AccessCheck(ctx, + sq.Select(column). + From("node"). + Join("node_state ON node_state.node_id = node.id"). + Where(latestStateCondition())) if qerr != nil { return nil, qerr } - query = query.Join("node_state ON node_state.node_id = node.id") - - for _, f := range filters { - if f.Hostname != nil { - query = buildStringCondition("hostname", f.Hostname, query) - } - if f.Cluster != nil { - query = buildStringCondition("cluster", f.Cluster, query) - } - if f.SubCluster != nil { - query = buildStringCondition("subcluster", f.SubCluster, query) - } - if f.SchedulerState != nil { - query = query.Where("node_state = ?", f.SchedulerState) - } - if f.HealthState != nil { - query = query.Where("health_state = ?", f.HealthState) - } - } - - // Add Group and Order - query = query.GroupBy("hostname").OrderBy("hostname DESC") + query = applyNodeFilters(query, filters) rows, err := query.RunWith(r.stmtCache).Query() if err != nil { @@ -463,12 +489,10 @@ func (r *NodeRepository) CountStates(ctx context.Context, filters []*model.NodeF stateMap := map[string]int{} for rows.Next() { - var hostname, state string - var timestamp int - - if err := rows.Scan(&hostname, &state, ×tamp); err != nil { + var state string + if err := rows.Scan(&state); err != nil { rows.Close() - cclog.Warnf("Error while scanning rows (CountStates) at time '%d'", timestamp) + cclog.Warn("Error while scanning rows (CountStates)") return nil, err } @@ -661,26 +685,14 @@ func (r *NodeRepository) GetNodesForList( } } else { - // DB Nodes: Count and Find Next Page + // DB Nodes: Count and derive hasNextPage from count var cerr error countNodes, cerr = r.CountNodes(ctx, queryFilters) if cerr != nil { cclog.Warn("error while counting node database data (Resolver.NodeMetricsList)") return nil, nil, 0, false, cerr } - - // Example Page 4 @ 10 IpP : Does item 41 exist? - // Minimal Page 41 @ 1 IpP : If len(result) is 1, Page 5 exists. 
- nextPage := &model.PageRequest{ - ItemsPerPage: 1, - Page: ((page.Page * page.ItemsPerPage) + 1), - } - nextNodes, err := r.QueryNodes(ctx, queryFilters, nextPage, nil) // Order not Used - if err != nil { - cclog.Warn("Error while querying next nodes") - return nil, nil, 0, false, err - } - hasNextPage = len(nextNodes) == 1 + hasNextPage = page.Page*page.ItemsPerPage < countNodes } // Fallback for non-init'd node table in DB; Ignores stateFilter diff --git a/internal/repository/node_test.go b/internal/repository/node_test.go index b863dc69..d1e86b9a 100644 --- a/internal/repository/node_test.go +++ b/internal/repository/node_test.go @@ -139,6 +139,13 @@ func nodeTestSetup(t *testing.T) { } archiveCfg := fmt.Sprintf("{\"kind\": \"file\",\"path\": \"%s\"}", jobarchive) + if err := ResetConnection(); err != nil { + t.Fatal(err) + } + t.Cleanup(func() { + ResetConnection() + }) + Connect(dbfilepath) if err := archive.Init(json.RawMessage(archiveCfg)); err != nil { @@ -149,8 +156,12 @@ func nodeTestSetup(t *testing.T) { func TestUpdateNodeState(t *testing.T) { nodeTestSetup(t) + repo := GetNodeRepository() + now := time.Now().Unix() + nodeState := schema.NodeStateDB{ - TimeStamp: time.Now().Unix(), NodeState: "allocated", + TimeStamp: now, + NodeState: "allocated", CpusAllocated: 72, MemoryAllocated: 480, GpusAllocated: 0, @@ -158,18 +169,152 @@ func TestUpdateNodeState(t *testing.T) { JobsRunning: 1, } - repo := GetNodeRepository() err := repo.UpdateNodeState("host124", "testcluster", &nodeState) if err != nil { - return + t.Fatal(err) } node, err := repo.GetNode("host124", "testcluster", false) if err != nil { - return + t.Fatal(err) } if node.NodeState != "allocated" { t.Errorf("wrong node state\ngot: %s \nwant: allocated ", node.NodeState) } + + t.Run("FindBeforeEmpty", func(t *testing.T) { + // Only the current-timestamp row exists, so nothing should be found before now + rows, err := repo.FindNodeStatesBefore(now) + if err != nil { + t.Fatal(err) + } + if len(rows) != 0 { + t.Errorf("expected 0 rows, got %d", len(rows)) + } + }) + + t.Run("DeleteOldRows", func(t *testing.T) { + // Insert 2 more old rows for host124 + for i, ts := range []int64{now - 7200, now - 3600} { + ns := schema.NodeStateDB{ + TimeStamp: ts, + NodeState: "allocated", + HealthState: schema.MonitoringStateFull, + CpusAllocated: 72, + MemoryAllocated: 480, + JobsRunning: i, + } + if err := repo.UpdateNodeState("host124", "testcluster", &ns); err != nil { + t.Fatal(err) + } + } + + // Delete rows older than 30 minutes + cutoff := now - 1800 + cnt, err := repo.DeleteNodeStatesBefore(cutoff) + if err != nil { + t.Fatal(err) + } + + // Should delete the 2 old rows + if cnt != 2 { + t.Errorf("expected 2 deleted rows, got %d", cnt) + } + + // Latest row should still exist + node, err := repo.GetNode("host124", "testcluster", false) + if err != nil { + t.Fatal(err) + } + if node.NodeState != "allocated" { + t.Errorf("expected node state 'allocated', got %s", node.NodeState) + } + }) + + t.Run("PreservesLatestPerNode", func(t *testing.T) { + // Insert a single old row for host125 — it's the latest per node so it must survive + ns := schema.NodeStateDB{ + TimeStamp: now - 7200, + NodeState: "idle", + HealthState: schema.MonitoringStateFull, + CpusAllocated: 0, + MemoryAllocated: 0, + JobsRunning: 0, + } + if err := repo.UpdateNodeState("host125", "testcluster", &ns); err != nil { + t.Fatal(err) + } + + // Delete everything older than now — the latest per node should be preserved + _, err := repo.DeleteNodeStatesBefore(now) + 
if err != nil { + t.Fatal(err) + } + + // The latest row for host125 must still exist + node, err := repo.GetNode("host125", "testcluster", false) + if err != nil { + t.Fatal(err) + } + if node.NodeState != "idle" { + t.Errorf("expected node state 'idle', got %s", node.NodeState) + } + + // Verify exactly 1 row remains for host125 + var countAfter int + if err := repo.DB.QueryRow( + "SELECT COUNT(*) FROM node_state WHERE node_id = (SELECT id FROM node WHERE hostname = 'host125')"). + Scan(&countAfter); err != nil { + t.Fatal(err) + } + if countAfter != 1 { + t.Errorf("expected 1 row remaining for host125, got %d", countAfter) + } + }) + + t.Run("FindBeforeWithJoin", func(t *testing.T) { + // Insert old and current rows for host123 + for _, ts := range []int64{now - 7200, now} { + ns := schema.NodeStateDB{ + TimeStamp: ts, + NodeState: "allocated", + HealthState: schema.MonitoringStateFull, + CpusAllocated: 8, + MemoryAllocated: 1024, + GpusAllocated: 1, + JobsRunning: 1, + } + if err := repo.UpdateNodeState("host123", "testcluster", &ns); err != nil { + t.Fatal(err) + } + } + + // Find rows older than 30 minutes, excluding latest per node + cutoff := now - 1800 + rows, err := repo.FindNodeStatesBefore(cutoff) + if err != nil { + t.Fatal(err) + } + + // Should find the old host123 row + found := false + for _, row := range rows { + if row.Hostname == "host123" && row.TimeStamp == now-7200 { + found = true + if row.Cluster != "testcluster" { + t.Errorf("expected cluster 'testcluster', got %s", row.Cluster) + } + if row.SubCluster != "sc1" { + t.Errorf("expected subcluster 'sc1', got %s", row.SubCluster) + } + if row.CpusAllocated != 8 { + t.Errorf("expected cpus_allocated 8, got %d", row.CpusAllocated) + } + } + } + if !found { + t.Errorf("expected to find old host123 row among %d results", len(rows)) + } + }) } diff --git a/internal/repository/repository_test.go b/internal/repository/repository_test.go index 34852830..b9496143 100644 --- a/internal/repository/repository_test.go +++ b/internal/repository/repository_test.go @@ -6,6 +6,8 @@ package repository import ( "context" + "os" + "path/filepath" "testing" "github.com/ClusterCockpit/cc-backend/internal/graph/model" @@ -148,8 +150,22 @@ func getContext(tb testing.TB) context.Context { func setup(tb testing.TB) *JobRepository { tb.Helper() cclog.Init("warn", true) - dbfile := "testdata/job.db" - err := MigrateDB(dbfile) + + // Copy test DB to a temp file for test isolation + srcData, err := os.ReadFile("testdata/job.db") + noErr(tb, err) + dbfile := filepath.Join(tb.TempDir(), "job.db") + err = os.WriteFile(dbfile, srcData, 0o644) + noErr(tb, err) + + // Reset singletons so Connect uses the new temp DB + err = ResetConnection() + noErr(tb, err) + tb.Cleanup(func() { + ResetConnection() + }) + + err = MigrateDB(dbfile) noErr(tb, err) Connect(dbfile) return GetJobRepository() diff --git a/internal/repository/stats_test.go b/internal/repository/stats_test.go index a8dfc818..a6c2da17 100644 --- a/internal/repository/stats_test.go +++ b/internal/repository/stats_test.go @@ -25,17 +25,11 @@ func TestBuildJobStatsQuery(t *testing.T) { func TestJobStats(t *testing.T) { r := setup(t) - // First, count the actual jobs in the database (excluding test jobs) var expectedCount int - err := r.DB.QueryRow(`SELECT COUNT(*) FROM job WHERE cluster != 'testcluster'`).Scan(&expectedCount) + err := r.DB.QueryRow(`SELECT COUNT(*) FROM job`).Scan(&expectedCount) noErr(t, err) - filter := &model.JobFilter{} - // Exclude test jobs created by other tests - 
testCluster := "testcluster" - filter.Cluster = &model.StringInput{Neq: &testCluster} - - stats, err := r.JobsStats(getContext(t), []*model.JobFilter{filter}) + stats, err := r.JobsStats(getContext(t), []*model.JobFilter{}) noErr(t, err) if stats[0].TotalJobs != expectedCount { diff --git a/internal/repository/tags.go b/internal/repository/tags.go index 612666da..943dda66 100644 --- a/internal/repository/tags.go +++ b/internal/repository/tags.go @@ -644,12 +644,12 @@ func (r *JobRepository) checkScopeAuth(user *schema.User, operation string, scop if user != nil { switch { case operation == "write" && scope == "admin": - if user.HasRole(schema.RoleAdmin) || (len(user.Roles) == 1 && user.HasRole(schema.RoleApi)) { + if user.HasRole(schema.RoleAdmin) || (len(user.Roles) == 1 && user.HasRole(schema.RoleAPI)) { return true, nil } return false, nil case operation == "write" && scope == "global": - if user.HasAnyRole([]schema.Role{schema.RoleAdmin, schema.RoleSupport}) || (len(user.Roles) == 1 && user.HasRole(schema.RoleApi)) { + if user.HasAnyRole([]schema.Role{schema.RoleAdmin, schema.RoleSupport}) || (len(user.Roles) == 1 && user.HasRole(schema.RoleAPI)) { return true, nil } return false, nil diff --git a/internal/repository/userConfig_test.go b/internal/repository/userConfig_test.go index cee59304..17ccbf78 100644 --- a/internal/repository/userConfig_test.go +++ b/internal/repository/userConfig_test.go @@ -31,8 +31,25 @@ func setupUserTest(t *testing.T) *UserCfgRepo { }` cclog.Init("info", true) - dbfilepath := "testdata/job.db" - err := MigrateDB(dbfilepath) + + // Copy test DB to a temp file for test isolation + srcData, err := os.ReadFile("testdata/job.db") + if err != nil { + t.Fatal(err) + } + dbfilepath := filepath.Join(t.TempDir(), "job.db") + if err := os.WriteFile(dbfilepath, srcData, 0o644); err != nil { + t.Fatal(err) + } + + if err := ResetConnection(); err != nil { + t.Fatal(err) + } + t.Cleanup(func() { + ResetConnection() + }) + + err = MigrateDB(dbfilepath) if err != nil { t.Fatal(err) } diff --git a/internal/routerConfig/routes.go b/internal/routerConfig/routes.go index 59491297..46d1ea5b 100644 --- a/internal/routerConfig/routes.go +++ b/internal/routerConfig/routes.go @@ -20,7 +20,7 @@ import ( cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger" "github.com/ClusterCockpit/cc-lib/v2/schema" "github.com/ClusterCockpit/cc-lib/v2/util" - "github.com/gorilla/mux" + "github.com/go-chi/chi/v5" ) type InfoType map[string]interface{} @@ -97,7 +97,7 @@ func setupConfigRoute(i InfoType, r *http.Request) InfoType { } func setupJobRoute(i InfoType, r *http.Request) InfoType { - i["id"] = mux.Vars(r)["id"] + i["id"] = chi.URLParam(r, "id") if config.Keys.EmissionConstant != 0 { i["emission"] = config.Keys.EmissionConstant } @@ -105,7 +105,7 @@ func setupJobRoute(i InfoType, r *http.Request) InfoType { } func setupUserRoute(i InfoType, r *http.Request) InfoType { - username := mux.Vars(r)["id"] + username := chi.URLParam(r, "id") i["id"] = username i["username"] = username // TODO: If forbidden (== err exists), redirect to error page @@ -117,33 +117,33 @@ func setupUserRoute(i InfoType, r *http.Request) InfoType { } func setupClusterStatusRoute(i InfoType, r *http.Request) InfoType { - vars := mux.Vars(r) - i["id"] = vars["cluster"] - i["cluster"] = vars["cluster"] + cluster := chi.URLParam(r, "cluster") + i["id"] = cluster + i["cluster"] = cluster i["displayType"] = "DASHBOARD" return i } func setupClusterDetailRoute(i InfoType, r *http.Request) InfoType { - vars := mux.Vars(r) - 
i["id"] = vars["cluster"] - i["cluster"] = vars["cluster"] + cluster := chi.URLParam(r, "cluster") + i["id"] = cluster + i["cluster"] = cluster i["displayType"] = "DETAILS" return i } func setupDashboardRoute(i InfoType, r *http.Request) InfoType { - vars := mux.Vars(r) - i["id"] = vars["cluster"] - i["cluster"] = vars["cluster"] + cluster := chi.URLParam(r, "cluster") + i["id"] = cluster + i["cluster"] = cluster i["displayType"] = "PUBLIC" // Used in Main Template return i } func setupClusterOverviewRoute(i InfoType, r *http.Request) InfoType { - vars := mux.Vars(r) - i["id"] = vars["cluster"] - i["cluster"] = vars["cluster"] + cluster := chi.URLParam(r, "cluster") + i["id"] = cluster + i["cluster"] = cluster i["displayType"] = "OVERVIEW" from, to := r.URL.Query().Get("from"), r.URL.Query().Get("to") @@ -155,11 +155,12 @@ func setupClusterOverviewRoute(i InfoType, r *http.Request) InfoType { } func setupClusterListRoute(i InfoType, r *http.Request) InfoType { - vars := mux.Vars(r) - i["id"] = vars["cluster"] - i["cluster"] = vars["cluster"] - i["sid"] = vars["subcluster"] - i["subCluster"] = vars["subcluster"] + cluster := chi.URLParam(r, "cluster") + subcluster := chi.URLParam(r, "subcluster") + i["id"] = cluster + i["cluster"] = cluster + i["sid"] = subcluster + i["subCluster"] = subcluster i["displayType"] = "LIST" from, to := r.URL.Query().Get("from"), r.URL.Query().Get("to") @@ -171,10 +172,11 @@ func setupClusterListRoute(i InfoType, r *http.Request) InfoType { } func setupNodeRoute(i InfoType, r *http.Request) InfoType { - vars := mux.Vars(r) - i["cluster"] = vars["cluster"] - i["hostname"] = vars["hostname"] - i["id"] = fmt.Sprintf("%s (%s)", vars["cluster"], vars["hostname"]) + cluster := chi.URLParam(r, "cluster") + hostname := chi.URLParam(r, "hostname") + i["cluster"] = cluster + i["hostname"] = hostname + i["id"] = fmt.Sprintf("%s (%s)", cluster, hostname) from, to := r.URL.Query().Get("from"), r.URL.Query().Get("to") if from != "" && to != "" { i["from"] = from @@ -184,7 +186,7 @@ func setupNodeRoute(i InfoType, r *http.Request) InfoType { } func setupAnalysisRoute(i InfoType, r *http.Request) InfoType { - i["cluster"] = mux.Vars(r)["cluster"] + i["cluster"] = chi.URLParam(r, "cluster") return i } @@ -396,7 +398,7 @@ func buildFilterPresets(query url.Values) map[string]interface{} { return filterPresets } -func SetupRoutes(router *mux.Router, buildInfo web.Build) { +func SetupRoutes(router chi.Router, buildInfo web.Build) { userCfgRepo := repository.GetUserCfgRepo() for _, route := range routes { route := route diff --git a/internal/taskmanager/nodestateRetentionService.go b/internal/taskmanager/nodestateRetentionService.go new file mode 100644 index 00000000..9a704502 --- /dev/null +++ b/internal/taskmanager/nodestateRetentionService.go @@ -0,0 +1,120 @@ +// Copyright (C) NHR@FAU, University Erlangen-Nuremberg. +// All rights reserved. This file is part of cc-backend. +// Use of this source code is governed by a MIT-style +// license that can be found in the LICENSE file. 
+ +package taskmanager + +import ( + "time" + + "github.com/ClusterCockpit/cc-backend/internal/config" + "github.com/ClusterCockpit/cc-backend/internal/repository" + pqarchive "github.com/ClusterCockpit/cc-backend/pkg/archive/parquet" + cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger" + "github.com/go-co-op/gocron/v2" +) + +func RegisterNodeStateRetentionDeleteService(ageHours int) { + cclog.Info("Register node state retention delete service") + + s.NewJob(gocron.DurationJob(1*time.Hour), + gocron.NewTask( + func() { + cutoff := time.Now().Unix() - int64(ageHours*3600) + nodeRepo := repository.GetNodeRepository() + cnt, err := nodeRepo.DeleteNodeStatesBefore(cutoff) + if err != nil { + cclog.Errorf("NodeState retention: error deleting old rows: %v", err) + } else if cnt > 0 { + cclog.Infof("NodeState retention: deleted %d old rows", cnt) + } + })) +} + +func RegisterNodeStateRetentionParquetService(cfg *config.NodeStateRetention) { + cclog.Info("Register node state retention parquet service") + + maxFileSizeMB := cfg.MaxFileSizeMB + if maxFileSizeMB <= 0 { + maxFileSizeMB = 128 + } + + ageHours := cfg.Age + if ageHours <= 0 { + ageHours = 24 + } + + var target pqarchive.ParquetTarget + var err error + + switch cfg.TargetKind { + case "s3": + target, err = pqarchive.NewS3Target(pqarchive.S3TargetConfig{ + Endpoint: cfg.TargetEndpoint, + Bucket: cfg.TargetBucket, + AccessKey: cfg.TargetAccessKey, + SecretKey: cfg.TargetSecretKey, + Region: cfg.TargetRegion, + UsePathStyle: cfg.TargetUsePathStyle, + }) + default: + target, err = pqarchive.NewFileTarget(cfg.TargetPath) + } + + if err != nil { + cclog.Errorf("NodeState parquet retention: failed to create target: %v", err) + return + } + + s.NewJob(gocron.DurationJob(1*time.Hour), + gocron.NewTask( + func() { + cutoff := time.Now().Unix() - int64(ageHours*3600) + nodeRepo := repository.GetNodeRepository() + + rows, err := nodeRepo.FindNodeStatesBefore(cutoff) + if err != nil { + cclog.Errorf("NodeState parquet retention: error finding rows: %v", err) + return + } + if len(rows) == 0 { + return + } + + cclog.Infof("NodeState parquet retention: archiving %d rows", len(rows)) + pw := pqarchive.NewNodeStateParquetWriter(target, maxFileSizeMB) + + for _, ns := range rows { + row := pqarchive.ParquetNodeStateRow{ + TimeStamp: ns.TimeStamp, + NodeState: ns.NodeState, + HealthState: ns.HealthState, + HealthMetrics: ns.HealthMetrics, + CpusAllocated: int32(ns.CpusAllocated), + MemoryAllocated: ns.MemoryAllocated, + GpusAllocated: int32(ns.GpusAllocated), + JobsRunning: int32(ns.JobsRunning), + Hostname: ns.Hostname, + Cluster: ns.Cluster, + SubCluster: ns.SubCluster, + } + if err := pw.AddRow(row); err != nil { + cclog.Errorf("NodeState parquet retention: add row: %v", err) + continue + } + } + + if err := pw.Close(); err != nil { + cclog.Errorf("NodeState parquet retention: close writer: %v", err) + return + } + + cnt, err := nodeRepo.DeleteNodeStatesBefore(cutoff) + if err != nil { + cclog.Errorf("NodeState parquet retention: error deleting rows: %v", err) + } else { + cclog.Infof("NodeState parquet retention: deleted %d rows from db", cnt) + } + })) +} diff --git a/internal/taskmanager/retentionService.go b/internal/taskmanager/retentionService.go index 5678cd14..d863bb91 100644 --- a/internal/taskmanager/retentionService.go +++ b/internal/taskmanager/retentionService.go @@ -6,63 +6,329 @@ package taskmanager import ( + "encoding/json" + "fmt" + "os" + "path/filepath" + "strconv" + "strings" "time" 
"github.com/ClusterCockpit/cc-backend/pkg/archive" + pqarchive "github.com/ClusterCockpit/cc-backend/pkg/archive/parquet" cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger" + "github.com/ClusterCockpit/cc-lib/v2/schema" "github.com/go-co-op/gocron/v2" ) -func RegisterRetentionDeleteService(age int, includeDB bool, omitTagged bool) { +// createParquetTarget creates a ParquetTarget (file or S3) from the retention config. +func createParquetTarget(cfg Retention) (pqarchive.ParquetTarget, error) { + switch cfg.TargetKind { + case "s3": + return pqarchive.NewS3Target(pqarchive.S3TargetConfig{ + Endpoint: cfg.TargetEndpoint, + Bucket: cfg.TargetBucket, + AccessKey: cfg.TargetAccessKey, + SecretKey: cfg.TargetSecretKey, + Region: cfg.TargetRegion, + UsePathStyle: cfg.TargetUsePathStyle, + }) + default: + return pqarchive.NewFileTarget(cfg.TargetPath) + } +} + +// createTargetBackend creates a secondary archive backend (file or S3) for JSON copy/move. +func createTargetBackend(cfg Retention) (archive.ArchiveBackend, error) { + var raw json.RawMessage + var err error + + switch cfg.TargetKind { + case "s3": + raw, err = json.Marshal(map[string]interface{}{ + "kind": "s3", + "endpoint": cfg.TargetEndpoint, + "bucket": cfg.TargetBucket, + "access-key": cfg.TargetAccessKey, + "secret-key": cfg.TargetSecretKey, + "region": cfg.TargetRegion, + "use-path-style": cfg.TargetUsePathStyle, + }) + default: + raw, err = json.Marshal(map[string]string{ + "kind": "file", + "path": cfg.TargetPath, + }) + } + if err != nil { + return nil, fmt.Errorf("marshal target config: %w", err) + } + return archive.InitBackend(raw) +} + +// transferJobsJSON copies job data from source archive to target backend in JSON format. +func transferJobsJSON(jobs []*schema.Job, src archive.ArchiveBackend, dst archive.ArchiveBackend) error { + // Transfer cluster configs for all clusters referenced by jobs + clustersDone := make(map[string]bool) + for _, job := range jobs { + if clustersDone[job.Cluster] { + continue + } + clusterCfg, err := src.LoadClusterCfg(job.Cluster) + if err != nil { + cclog.Warnf("Retention: load cluster config %q: %v", job.Cluster, err) + } else { + if err := dst.StoreClusterCfg(job.Cluster, clusterCfg); err != nil { + cclog.Warnf("Retention: store cluster config %q: %v", job.Cluster, err) + } + } + clustersDone[job.Cluster] = true + } + + for _, job := range jobs { + meta, err := src.LoadJobMeta(job) + if err != nil { + cclog.Warnf("Retention: load meta for job %d: %v", job.JobID, err) + continue + } + data, err := src.LoadJobData(job) + if err != nil { + cclog.Warnf("Retention: load data for job %d: %v", job.JobID, err) + continue + } + if err := dst.ImportJob(meta, &data); err != nil { + cclog.Warnf("Retention: import job %d: %v", job.JobID, err) + continue + } + } + return nil +} + +// transferJobsParquet converts jobs to Parquet format, organized by cluster. 
+func transferJobsParquet(jobs []*schema.Job, src archive.ArchiveBackend, target pqarchive.ParquetTarget, maxSizeMB int) error { + cw := pqarchive.NewClusterAwareParquetWriter(target, maxSizeMB) + + // Set cluster configs for all clusters referenced by jobs + clustersDone := make(map[string]bool) + for _, job := range jobs { + if clustersDone[job.Cluster] { + continue + } + clusterCfg, err := src.LoadClusterCfg(job.Cluster) + if err != nil { + cclog.Warnf("Retention: load cluster config %q: %v", job.Cluster, err) + } else { + cw.SetClusterConfig(job.Cluster, clusterCfg) + } + clustersDone[job.Cluster] = true + } + + for _, job := range jobs { + meta, err := src.LoadJobMeta(job) + if err != nil { + cclog.Warnf("Retention: load meta for job %d: %v", job.JobID, err) + continue + } + data, err := src.LoadJobData(job) + if err != nil { + cclog.Warnf("Retention: load data for job %d: %v", job.JobID, err) + continue + } + row, err := pqarchive.JobToParquetRow(meta, &data) + if err != nil { + cclog.Warnf("Retention: convert job %d: %v", job.JobID, err) + continue + } + if err := cw.AddJob(*row); err != nil { + cclog.Errorf("Retention: add job %d to writer: %v", job.JobID, err) + continue + } + } + + return cw.Close() +} + +// cleanupAfterTransfer removes jobs from archive and optionally from DB. +func cleanupAfterTransfer(jobs []*schema.Job, startTime int64, includeDB bool, omitTagged bool) { + archive.GetHandle().CleanUp(jobs) + + if includeDB { + cnt, err := jobRepo.DeleteJobsBefore(startTime, omitTagged) + if err != nil { + cclog.Errorf("Retention: delete jobs from db: %v", err) + } else { + cclog.Infof("Retention: removed %d jobs from db", cnt) + } + if err = jobRepo.Optimize(); err != nil { + cclog.Errorf("Retention: db optimization error: %v", err) + } + } +} + +// readCopyMarker reads the last-processed timestamp from a copy marker file. +func readCopyMarker(cfg Retention) int64 { + var data []byte + var err error + + switch cfg.TargetKind { + case "s3": + // For S3 we store the marker locally alongside the config + data, err = os.ReadFile(copyMarkerPath(cfg)) + default: + data, err = os.ReadFile(filepath.Join(cfg.TargetPath, ".copy-marker")) + } + if err != nil { + return 0 + } + ts, err := strconv.ParseInt(strings.TrimSpace(string(data)), 10, 64) + if err != nil { + return 0 + } + return ts +} + +// writeCopyMarker writes the last-processed timestamp to a copy marker file. 
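+// The marker holds a single Unix timestamp in decimal text; readCopyMarker
+// returns 0 when the file is missing or unparsable, so the first copy run
+// starts from the beginning of the archive.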
+func writeCopyMarker(cfg Retention, ts int64) { + content := []byte(strconv.FormatInt(ts, 10)) + var err error + + switch cfg.TargetKind { + case "s3": + err = os.WriteFile(copyMarkerPath(cfg), content, 0o640) + default: + err = os.WriteFile(filepath.Join(cfg.TargetPath, ".copy-marker"), content, 0o640) + } + if err != nil { + cclog.Warnf("Retention: write copy marker: %v", err) + } +} + +func copyMarkerPath(cfg Retention) string { + // For S3 targets, store the marker in a local temp-style path derived from the bucket name + return filepath.Join(os.TempDir(), fmt.Sprintf("cc-copy-marker-%s", cfg.TargetBucket)) +} + +func RegisterRetentionDeleteService(cfg Retention) { cclog.Info("Register retention delete service") s.NewJob(gocron.DailyJob(1, gocron.NewAtTimes(gocron.NewAtTime(3, 0, 0))), gocron.NewTask( func() { - startTime := time.Now().Unix() - int64(age*24*3600) - jobs, err := jobRepo.FindJobsBetween(0, startTime, omitTagged) + startTime := time.Now().Unix() - int64(cfg.Age*24*3600) + jobs, err := jobRepo.FindJobsBetween(0, startTime, cfg.OmitTagged) if err != nil { - cclog.Warnf("Error while looking for retention jobs: %s", err.Error()) + cclog.Warnf("Retention delete: error finding jobs: %v", err) + return + } + if len(jobs) == 0 { + return } - archive.GetHandle().CleanUp(jobs) - if includeDB { - cnt, err := jobRepo.DeleteJobsBefore(startTime, omitTagged) - if err != nil { - cclog.Errorf("Error while deleting retention jobs from db: %s", err.Error()) - } else { - cclog.Infof("Retention: Removed %d jobs from db", cnt) - } - if err = jobRepo.Optimize(); err != nil { - cclog.Errorf("Error occured in db optimization: %s", err.Error()) - } - } + cclog.Infof("Retention delete: processing %d jobs", len(jobs)) + cleanupAfterTransfer(jobs, startTime, cfg.IncludeDB, cfg.OmitTagged) })) } -func RegisterRetentionMoveService(age int, includeDB bool, location string, omitTagged bool) { - cclog.Info("Register retention move service") +func RegisterRetentionCopyService(cfg Retention) { + cclog.Infof("Register retention copy service (format=%s, target=%s)", cfg.Format, cfg.TargetKind) + + maxFileSizeMB := cfg.MaxFileSizeMB + if maxFileSizeMB <= 0 { + maxFileSizeMB = 512 + } s.NewJob(gocron.DailyJob(1, gocron.NewAtTimes(gocron.NewAtTime(4, 0, 0))), gocron.NewTask( func() { - startTime := time.Now().Unix() - int64(age*24*3600) - jobs, err := jobRepo.FindJobsBetween(0, startTime, omitTagged) - if err != nil { - cclog.Warnf("Error while looking for retention jobs: %s", err.Error()) - } - archive.GetHandle().Move(jobs, location) + cutoff := time.Now().Unix() - int64(cfg.Age*24*3600) + lastProcessed := readCopyMarker(cfg) - if includeDB { - cnt, err := jobRepo.DeleteJobsBefore(startTime, omitTagged) + jobs, err := jobRepo.FindJobsBetween(lastProcessed, cutoff, cfg.OmitTagged) + if err != nil { + cclog.Warnf("Retention copy: error finding jobs: %v", err) + return + } + if len(jobs) == 0 { + return + } + + cclog.Infof("Retention copy: processing %d jobs", len(jobs)) + ar := archive.GetHandle() + + switch cfg.Format { + case "parquet": + target, err := createParquetTarget(cfg) if err != nil { - cclog.Errorf("Error while deleting retention jobs from db: %v", err) - } else { - cclog.Infof("Retention: Removed %d jobs from db", cnt) + cclog.Errorf("Retention copy: create parquet target: %v", err) + return } - if err = jobRepo.Optimize(); err != nil { - cclog.Errorf("Error occured in db optimization: %v", err) + if err := transferJobsParquet(jobs, ar, target, maxFileSizeMB); err != nil { + 
cclog.Errorf("Retention copy: parquet transfer: %v", err) + return + } + default: // json + dst, err := createTargetBackend(cfg) + if err != nil { + cclog.Errorf("Retention copy: create target backend: %v", err) + return + } + if err := transferJobsJSON(jobs, ar, dst); err != nil { + cclog.Errorf("Retention copy: json transfer: %v", err) + return } } + + writeCopyMarker(cfg, cutoff) + })) +} + +func RegisterRetentionMoveService(cfg Retention) { + cclog.Infof("Register retention move service (format=%s, target=%s)", cfg.Format, cfg.TargetKind) + + maxFileSizeMB := cfg.MaxFileSizeMB + if maxFileSizeMB <= 0 { + maxFileSizeMB = 512 + } + + s.NewJob(gocron.DailyJob(1, gocron.NewAtTimes(gocron.NewAtTime(5, 0, 0))), + gocron.NewTask( + func() { + startTime := time.Now().Unix() - int64(cfg.Age*24*3600) + jobs, err := jobRepo.FindJobsBetween(0, startTime, cfg.OmitTagged) + if err != nil { + cclog.Warnf("Retention move: error finding jobs: %v", err) + return + } + if len(jobs) == 0 { + return + } + + cclog.Infof("Retention move: processing %d jobs", len(jobs)) + ar := archive.GetHandle() + + switch cfg.Format { + case "parquet": + target, err := createParquetTarget(cfg) + if err != nil { + cclog.Errorf("Retention move: create parquet target: %v", err) + return + } + if err := transferJobsParquet(jobs, ar, target, maxFileSizeMB); err != nil { + cclog.Errorf("Retention move: parquet transfer: %v", err) + return + } + default: // json + dst, err := createTargetBackend(cfg) + if err != nil { + cclog.Errorf("Retention move: create target backend: %v", err) + return + } + if err := transferJobsJSON(jobs, ar, dst); err != nil { + cclog.Errorf("Retention move: json transfer: %v", err) + return + } + } + + cleanupAfterTransfer(jobs, startTime, cfg.IncludeDB, cfg.OmitTagged) })) } diff --git a/internal/taskmanager/taskManager.go b/internal/taskmanager/taskManager.go index cbc4120f..529395b5 100644 --- a/internal/taskmanager/taskManager.go +++ b/internal/taskmanager/taskManager.go @@ -23,11 +23,20 @@ const ( // Retention defines the configuration for job retention policies. type Retention struct { - Policy string `json:"policy"` - Location string `json:"location"` - Age int `json:"age"` - IncludeDB bool `json:"includeDB"` - OmitTagged bool `json:"omitTagged"` + Policy string `json:"policy"` + Format string `json:"format"` + Age int `json:"age"` + IncludeDB bool `json:"includeDB"` + OmitTagged bool `json:"omitTagged"` + TargetKind string `json:"target-kind"` + TargetPath string `json:"target-path"` + TargetEndpoint string `json:"target-endpoint"` + TargetBucket string `json:"target-bucket"` + TargetAccessKey string `json:"target-access-key"` + TargetSecretKey string `json:"target-secret-key"` + TargetRegion string `json:"target-region"` + TargetUsePathStyle bool `json:"target-use-path-style"` + MaxFileSizeMB int `json:"max-file-size-mb"` } // CronFrequency defines the execution intervals for various background workers. 
@@ -77,16 +86,11 @@ func initArchiveServices(config json.RawMessage) { switch cfg.Retention.Policy { case "delete": - RegisterRetentionDeleteService( - cfg.Retention.Age, - cfg.Retention.IncludeDB, - cfg.Retention.OmitTagged) + RegisterRetentionDeleteService(cfg.Retention) + case "copy": + RegisterRetentionCopyService(cfg.Retention) case "move": - RegisterRetentionMoveService( - cfg.Retention.Age, - cfg.Retention.IncludeDB, - cfg.Retention.Location, - cfg.Retention.OmitTagged) + RegisterRetentionMoveService(cfg.Retention) } if cfg.Compression > 0 { @@ -133,9 +137,30 @@ func Start(cronCfg, archiveConfig json.RawMessage) { RegisterUpdateDurationWorker() RegisterCommitJobService() + if config.Keys.NodeStateRetention != nil && config.Keys.NodeStateRetention.Policy != "" { + initNodeStateRetention() + } + s.Start() } +func initNodeStateRetention() { + cfg := config.Keys.NodeStateRetention + age := cfg.Age + if age <= 0 { + age = 24 + } + + switch cfg.Policy { + case "delete": + RegisterNodeStateRetentionDeleteService(age) + case "parquet": + RegisterNodeStateRetentionParquetService(cfg) + default: + cclog.Warnf("Unknown nodestate-retention policy: %s", cfg.Policy) + } +} + // Shutdown stops the task manager and its scheduler. func Shutdown() { if s != nil { diff --git a/pkg/archive/ConfigSchema.go b/pkg/archive/ConfigSchema.go index aebcf37b..cb9b16bc 100644 --- a/pkg/archive/ConfigSchema.go +++ b/pkg/archive/ConfigSchema.go @@ -57,7 +57,12 @@ var configSchema = ` "policy": { "description": "Retention policy", "type": "string", - "enum": ["none", "delete", "move"] + "enum": ["none", "delete", "copy", "move"] + }, + "format": { + "description": "Output format for copy/move policies", + "type": "string", + "enum": ["json", "parquet"] }, "include-db": { "description": "Also remove jobs from database", @@ -67,9 +72,42 @@ var configSchema = ` "description": "Act on jobs with startTime older than age (in days)", "type": "integer" }, - "location": { - "description": "The target directory for retention. 
Only applicable for retention move.", + "target-kind": { + "description": "Target storage kind: file or s3", + "type": "string", + "enum": ["file", "s3"] + }, + "target-path": { + "description": "Target directory path for file storage", "type": "string" + }, + "target-endpoint": { + "description": "S3 endpoint URL for target", + "type": "string" + }, + "target-bucket": { + "description": "S3 bucket name for target", + "type": "string" + }, + "target-access-key": { + "description": "S3 access key for target", + "type": "string" + }, + "target-secret-key": { + "description": "S3 secret key for target", + "type": "string" + }, + "target-region": { + "description": "S3 region for target", + "type": "string" + }, + "target-use-path-style": { + "description": "Use path-style S3 URLs for target", + "type": "boolean" + }, + "max-file-size-mb": { + "description": "Maximum parquet file size in MB before splitting", + "type": "integer" } }, "required": ["policy"] diff --git a/pkg/archive/clusterConfig.go b/pkg/archive/clusterConfig.go index 272eeb35..64851365 100644 --- a/pkg/archive/clusterConfig.go +++ b/pkg/archive/clusterConfig.go @@ -25,6 +25,7 @@ func initClusterConfig() error { GlobalUserMetricList = []*schema.GlobalMetricListItem{} NodeLists = map[string]map[string]NodeList{} metricLookup := make(map[string]schema.GlobalMetricListItem) + userMetricLookup := make(map[string]schema.GlobalMetricListItem) for _, c := range ar.GetClusters() { @@ -62,11 +63,12 @@ func initClusterConfig() error { if _, ok := metricLookup[mc.Name]; !ok { metricLookup[mc.Name] = schema.GlobalMetricListItem{ - Name: mc.Name, Scope: mc.Scope, Restrict: mc.Restrict, Unit: mc.Unit, Footprint: mc.Footprint, + Name: mc.Name, Scope: mc.Scope, Unit: mc.Unit, Footprint: mc.Footprint, } } availability := schema.ClusterSupport{Cluster: cluster.Name} + userAvailability := schema.ClusterSupport{Cluster: cluster.Name} scLookup := make(map[string]*schema.SubClusterConfig) for _, scc := range mc.SubClusters { @@ -94,6 +96,7 @@ func initClusterConfig() error { newMetric.Footprint = mc.Footprint } + isRestricted := mc.Restrict if cfg, ok := scLookup[sc.Name]; ok { if cfg.Remove { continue @@ -105,9 +108,13 @@ func initClusterConfig() error { newMetric.Footprint = cfg.Footprint newMetric.Energy = cfg.Energy newMetric.LowerIsBetter = cfg.LowerIsBetter + isRestricted = cfg.Restrict } availability.SubClusters = append(availability.SubClusters, sc.Name) + if !isRestricted { + userAvailability.SubClusters = append(userAvailability.SubClusters, sc.Name) + } sc.MetricConfig = append(sc.MetricConfig, newMetric) if newMetric.Footprint != "" { @@ -124,6 +131,17 @@ func initClusterConfig() error { item := metricLookup[mc.Name] item.Availability = append(item.Availability, availability) metricLookup[mc.Name] = item + + if len(userAvailability.SubClusters) > 0 { + userItem, ok := userMetricLookup[mc.Name] + if !ok { + userItem = schema.GlobalMetricListItem{ + Name: mc.Name, Scope: mc.Scope, Unit: mc.Unit, Footprint: mc.Footprint, + } + } + userItem.Availability = append(userItem.Availability, userAvailability) + userMetricLookup[mc.Name] = userItem + } } Clusters = append(Clusters, cluster) @@ -144,9 +162,9 @@ func initClusterConfig() error { for _, metric := range metricLookup { GlobalMetricList = append(GlobalMetricList, &metric) - if !metric.Restrict { - GlobalUserMetricList = append(GlobalUserMetricList, &metric) - } + } + for _, metric := range userMetricLookup { + GlobalUserMetricList = append(GlobalUserMetricList, &metric) } return nil diff 
--git a/pkg/archive/json.go b/pkg/archive/json.go index cf1b0a38..dd37075d 100644 --- a/pkg/archive/json.go +++ b/pkg/archive/json.go @@ -51,7 +51,7 @@ func DecodeJobStats(r io.Reader, k string) (schema.ScopedJobStats, error) { for _, series := range jobMetric.Series { scopedJobStats[metric][scope] = append(scopedJobStats[metric][scope], &schema.ScopedStats{ Hostname: series.Hostname, - Id: series.Id, + ID: series.ID, Data: &series.Statistics, }) } diff --git a/pkg/archive/parquet/convert.go b/pkg/archive/parquet/convert.go new file mode 100644 index 00000000..43e611e4 --- /dev/null +++ b/pkg/archive/parquet/convert.go @@ -0,0 +1,200 @@ +// Copyright (C) NHR@FAU, University Erlangen-Nuremberg. +// All rights reserved. This file is part of cc-backend. +// Use of this source code is governed by a MIT-style +// license that can be found in the LICENSE file. + +package parquet + +import ( + "bytes" + "compress/gzip" + "encoding/json" + "fmt" + + "github.com/ClusterCockpit/cc-lib/v2/schema" +) + +// JobToParquetRow converts job metadata and metric data into a flat ParquetJobRow. +// Nested fields are marshaled to JSON; metric data is gzip-compressed JSON. +func JobToParquetRow(meta *schema.Job, data *schema.JobData) (*ParquetJobRow, error) { + resourcesJSON, err := json.Marshal(meta.Resources) + if err != nil { + return nil, fmt.Errorf("marshal resources: %w", err) + } + + var statisticsJSON []byte + if meta.Statistics != nil { + statisticsJSON, err = json.Marshal(meta.Statistics) + if err != nil { + return nil, fmt.Errorf("marshal statistics: %w", err) + } + } + + var tagsJSON []byte + if len(meta.Tags) > 0 { + tagsJSON, err = json.Marshal(meta.Tags) + if err != nil { + return nil, fmt.Errorf("marshal tags: %w", err) + } + } + + var metaDataJSON []byte + if meta.MetaData != nil { + metaDataJSON, err = json.Marshal(meta.MetaData) + if err != nil { + return nil, fmt.Errorf("marshal metadata: %w", err) + } + } + + var footprintJSON []byte + if meta.Footprint != nil { + footprintJSON, err = json.Marshal(meta.Footprint) + if err != nil { + return nil, fmt.Errorf("marshal footprint: %w", err) + } + } + + var energyFootJSON []byte + if meta.EnergyFootprint != nil { + energyFootJSON, err = json.Marshal(meta.EnergyFootprint) + if err != nil { + return nil, fmt.Errorf("marshal energy footprint: %w", err) + } + } + + metricDataGz, err := compressJobData(data) + if err != nil { + return nil, fmt.Errorf("compress metric data: %w", err) + } + + return &ParquetJobRow{ + JobID: meta.JobID, + Cluster: meta.Cluster, + SubCluster: meta.SubCluster, + Partition: meta.Partition, + Project: meta.Project, + User: meta.User, + State: string(meta.State), + StartTime: meta.StartTime, + Duration: meta.Duration, + Walltime: meta.Walltime, + NumNodes: meta.NumNodes, + NumHWThreads: meta.NumHWThreads, + NumAcc: meta.NumAcc, + Energy: meta.Energy, + SMT: meta.SMT, + ResourcesJSON: resourcesJSON, + StatisticsJSON: statisticsJSON, + TagsJSON: tagsJSON, + MetaDataJSON: metaDataJSON, + FootprintJSON: footprintJSON, + EnergyFootJSON: energyFootJSON, + MetricDataGz: metricDataGz, + }, nil +} + +// ParquetRowToJob converts a ParquetJobRow back into job metadata and metric data. +// This is the reverse of JobToParquetRow. 
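+// Optional JSON columns that are empty remain nil on the returned Job; the
+// gzip-compressed metric payload is always decompressed and unmarshalled.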
+func ParquetRowToJob(row *ParquetJobRow) (*schema.Job, *schema.JobData, error) { + meta := &schema.Job{ + JobID: row.JobID, + Cluster: row.Cluster, + SubCluster: row.SubCluster, + Partition: row.Partition, + Project: row.Project, + User: row.User, + State: schema.JobState(row.State), + StartTime: row.StartTime, + Duration: row.Duration, + Walltime: row.Walltime, + NumNodes: row.NumNodes, + NumHWThreads: row.NumHWThreads, + NumAcc: row.NumAcc, + Energy: row.Energy, + SMT: row.SMT, + } + + if len(row.ResourcesJSON) > 0 { + if err := json.Unmarshal(row.ResourcesJSON, &meta.Resources); err != nil { + return nil, nil, fmt.Errorf("unmarshal resources: %w", err) + } + } + + if len(row.StatisticsJSON) > 0 { + if err := json.Unmarshal(row.StatisticsJSON, &meta.Statistics); err != nil { + return nil, nil, fmt.Errorf("unmarshal statistics: %w", err) + } + } + + if len(row.TagsJSON) > 0 { + if err := json.Unmarshal(row.TagsJSON, &meta.Tags); err != nil { + return nil, nil, fmt.Errorf("unmarshal tags: %w", err) + } + } + + if len(row.MetaDataJSON) > 0 { + if err := json.Unmarshal(row.MetaDataJSON, &meta.MetaData); err != nil { + return nil, nil, fmt.Errorf("unmarshal metadata: %w", err) + } + } + + if len(row.FootprintJSON) > 0 { + if err := json.Unmarshal(row.FootprintJSON, &meta.Footprint); err != nil { + return nil, nil, fmt.Errorf("unmarshal footprint: %w", err) + } + } + + if len(row.EnergyFootJSON) > 0 { + if err := json.Unmarshal(row.EnergyFootJSON, &meta.EnergyFootprint); err != nil { + return nil, nil, fmt.Errorf("unmarshal energy footprint: %w", err) + } + } + + data, err := decompressJobData(row.MetricDataGz) + if err != nil { + return nil, nil, fmt.Errorf("decompress metric data: %w", err) + } + + return meta, data, nil +} + +func decompressJobData(data []byte) (*schema.JobData, error) { + gz, err := gzip.NewReader(bytes.NewReader(data)) + if err != nil { + return nil, err + } + defer gz.Close() + + var buf bytes.Buffer + if _, err := buf.ReadFrom(gz); err != nil { + return nil, err + } + + var jobData schema.JobData + if err := json.Unmarshal(buf.Bytes(), &jobData); err != nil { + return nil, err + } + + return &jobData, nil +} + +func compressJobData(data *schema.JobData) ([]byte, error) { + jsonBytes, err := json.Marshal(data) + if err != nil { + return nil, err + } + + var buf bytes.Buffer + gz, err := gzip.NewWriterLevel(&buf, gzip.BestCompression) + if err != nil { + return nil, err + } + if _, err := gz.Write(jsonBytes); err != nil { + return nil, err + } + if err := gz.Close(); err != nil { + return nil, err + } + + return buf.Bytes(), nil +} diff --git a/pkg/archive/parquet/convert_test.go b/pkg/archive/parquet/convert_test.go new file mode 100644 index 00000000..3b2848ba --- /dev/null +++ b/pkg/archive/parquet/convert_test.go @@ -0,0 +1,305 @@ +// Copyright (C) NHR@FAU, University Erlangen-Nuremberg. +// All rights reserved. This file is part of cc-backend. +// Use of this source code is governed by a MIT-style +// license that can be found in the LICENSE file. 
+ +package parquet + +import ( + "testing" + + "github.com/ClusterCockpit/cc-lib/v2/schema" +) + +func TestParquetRowToJob(t *testing.T) { + meta := &schema.Job{ + JobID: 42, + Cluster: "testcluster", + SubCluster: "sc0", + Partition: "main", + Project: "testproject", + User: "testuser", + State: schema.JobStateCompleted, + StartTime: 1700000000, + Duration: 3600, + Walltime: 7200, + NumNodes: 2, + NumHWThreads: 16, + NumAcc: 4, + Energy: 123.45, + SMT: 2, + Resources: []*schema.Resource{ + {Hostname: "node001", HWThreads: []int{0, 1, 2, 3}}, + {Hostname: "node002", HWThreads: []int{4, 5, 6, 7}}, + }, + Statistics: map[string]schema.JobStatistics{ + "cpu_load": {Avg: 50.0, Min: 10.0, Max: 90.0}, + }, + Tags: []*schema.Tag{ + {Type: "test", Name: "tag1"}, + }, + MetaData: map[string]string{ + "key1": "value1", + }, + Footprint: map[string]float64{ + "cpu_load": 50.0, + }, + EnergyFootprint: map[string]float64{ + "total": 123.45, + }, + } + + data := &schema.JobData{ + "cpu_load": { + schema.MetricScopeNode: &schema.JobMetric{ + Unit: schema.Unit{Base: ""}, + Timestep: 60, + Series: []schema.Series{ + { + Hostname: "node001", + Data: []schema.Float{1.0, 2.0, 3.0}, + }, + }, + }, + }, + } + + // Convert to parquet row + row, err := JobToParquetRow(meta, data) + if err != nil { + t.Fatalf("JobToParquetRow: %v", err) + } + + // Convert back + gotMeta, gotData, err := ParquetRowToJob(row) + if err != nil { + t.Fatalf("ParquetRowToJob: %v", err) + } + + // Verify scalar fields + if gotMeta.JobID != meta.JobID { + t.Errorf("JobID = %d, want %d", gotMeta.JobID, meta.JobID) + } + if gotMeta.Cluster != meta.Cluster { + t.Errorf("Cluster = %q, want %q", gotMeta.Cluster, meta.Cluster) + } + if gotMeta.SubCluster != meta.SubCluster { + t.Errorf("SubCluster = %q, want %q", gotMeta.SubCluster, meta.SubCluster) + } + if gotMeta.Partition != meta.Partition { + t.Errorf("Partition = %q, want %q", gotMeta.Partition, meta.Partition) + } + if gotMeta.Project != meta.Project { + t.Errorf("Project = %q, want %q", gotMeta.Project, meta.Project) + } + if gotMeta.User != meta.User { + t.Errorf("User = %q, want %q", gotMeta.User, meta.User) + } + if gotMeta.State != meta.State { + t.Errorf("State = %q, want %q", gotMeta.State, meta.State) + } + if gotMeta.StartTime != meta.StartTime { + t.Errorf("StartTime = %d, want %d", gotMeta.StartTime, meta.StartTime) + } + if gotMeta.Duration != meta.Duration { + t.Errorf("Duration = %d, want %d", gotMeta.Duration, meta.Duration) + } + if gotMeta.Walltime != meta.Walltime { + t.Errorf("Walltime = %d, want %d", gotMeta.Walltime, meta.Walltime) + } + if gotMeta.NumNodes != meta.NumNodes { + t.Errorf("NumNodes = %d, want %d", gotMeta.NumNodes, meta.NumNodes) + } + if gotMeta.NumHWThreads != meta.NumHWThreads { + t.Errorf("NumHWThreads = %d, want %d", gotMeta.NumHWThreads, meta.NumHWThreads) + } + if gotMeta.NumAcc != meta.NumAcc { + t.Errorf("NumAcc = %d, want %d", gotMeta.NumAcc, meta.NumAcc) + } + if gotMeta.Energy != meta.Energy { + t.Errorf("Energy = %f, want %f", gotMeta.Energy, meta.Energy) + } + if gotMeta.SMT != meta.SMT { + t.Errorf("SMT = %d, want %d", gotMeta.SMT, meta.SMT) + } + + // Verify complex fields + if len(gotMeta.Resources) != 2 { + t.Fatalf("Resources len = %d, want 2", len(gotMeta.Resources)) + } + if gotMeta.Resources[0].Hostname != "node001" { + t.Errorf("Resources[0].Hostname = %q, want %q", gotMeta.Resources[0].Hostname, "node001") + } + if len(gotMeta.Resources[0].HWThreads) != 4 { + t.Errorf("Resources[0].HWThreads len = %d, want 4", 
len(gotMeta.Resources[0].HWThreads)) + } + + if len(gotMeta.Statistics) != 1 { + t.Fatalf("Statistics len = %d, want 1", len(gotMeta.Statistics)) + } + if stat, ok := gotMeta.Statistics["cpu_load"]; !ok { + t.Error("Statistics missing cpu_load") + } else if stat.Avg != 50.0 { + t.Errorf("Statistics[cpu_load].Avg = %f, want 50.0", stat.Avg) + } + + if len(gotMeta.Tags) != 1 || gotMeta.Tags[0].Name != "tag1" { + t.Errorf("Tags = %v, want [{test tag1}]", gotMeta.Tags) + } + + if gotMeta.MetaData["key1"] != "value1" { + t.Errorf("MetaData[key1] = %q, want %q", gotMeta.MetaData["key1"], "value1") + } + + if gotMeta.Footprint["cpu_load"] != 50.0 { + t.Errorf("Footprint[cpu_load] = %f, want 50.0", gotMeta.Footprint["cpu_load"]) + } + + if gotMeta.EnergyFootprint["total"] != 123.45 { + t.Errorf("EnergyFootprint[total] = %f, want 123.45", gotMeta.EnergyFootprint["total"]) + } + + // Verify metric data + if gotData == nil { + t.Fatal("JobData is nil") + } + cpuLoad, ok := (*gotData)["cpu_load"] + if !ok { + t.Fatal("JobData missing cpu_load") + } + nodeMetric, ok := cpuLoad[schema.MetricScopeNode] + if !ok { + t.Fatal("cpu_load missing node scope") + } + if nodeMetric.Timestep != 60 { + t.Errorf("Timestep = %d, want 60", nodeMetric.Timestep) + } + if len(nodeMetric.Series) != 1 { + t.Fatalf("Series len = %d, want 1", len(nodeMetric.Series)) + } + if nodeMetric.Series[0].Hostname != "node001" { + t.Errorf("Series[0].Hostname = %q, want %q", nodeMetric.Series[0].Hostname, "node001") + } + if len(nodeMetric.Series[0].Data) != 3 { + t.Errorf("Series[0].Data len = %d, want 3", len(nodeMetric.Series[0].Data)) + } +} + +func TestParquetRowToJobNilOptionalFields(t *testing.T) { + meta := &schema.Job{ + JobID: 1, + Cluster: "test", + SubCluster: "sc0", + Project: "proj", + User: "user", + State: schema.JobStateCompleted, + StartTime: 1700000000, + Duration: 60, + NumNodes: 1, + Resources: []*schema.Resource{ + {Hostname: "node001"}, + }, + } + + data := &schema.JobData{ + "cpu_load": { + schema.MetricScopeNode: &schema.JobMetric{ + Timestep: 60, + Series: []schema.Series{ + {Hostname: "node001", Data: []schema.Float{1.0}}, + }, + }, + }, + } + + row, err := JobToParquetRow(meta, data) + if err != nil { + t.Fatalf("JobToParquetRow: %v", err) + } + + gotMeta, gotData, err := ParquetRowToJob(row) + if err != nil { + t.Fatalf("ParquetRowToJob: %v", err) + } + + if gotMeta.JobID != 1 { + t.Errorf("JobID = %d, want 1", gotMeta.JobID) + } + if gotMeta.Tags != nil { + t.Errorf("Tags should be nil, got %v", gotMeta.Tags) + } + if gotMeta.Statistics != nil { + t.Errorf("Statistics should be nil, got %v", gotMeta.Statistics) + } + if gotMeta.MetaData != nil { + t.Errorf("MetaData should be nil, got %v", gotMeta.MetaData) + } + if gotMeta.Footprint != nil { + t.Errorf("Footprint should be nil, got %v", gotMeta.Footprint) + } + if gotMeta.EnergyFootprint != nil { + t.Errorf("EnergyFootprint should be nil, got %v", gotMeta.EnergyFootprint) + } + if gotData == nil { + t.Fatal("JobData is nil") + } +} + +func TestRoundTripThroughParquetFile(t *testing.T) { + meta, data := makeTestJob(999) + meta.Tags = []*schema.Tag{{Type: "test", Name: "roundtrip"}} + + // Convert to row and write to parquet + row, err := JobToParquetRow(meta, data) + if err != nil { + t.Fatalf("JobToParquetRow: %v", err) + } + + // Write to parquet bytes + parquetBytes, err := writeParquetBytes([]ParquetJobRow{*row}) + if err != nil { + t.Fatalf("writeParquetBytes: %v", err) + } + + // Read back from parquet bytes + rows, err := 
ReadParquetFile(parquetBytes) + if err != nil { + t.Fatalf("ReadParquetFile: %v", err) + } + if len(rows) != 1 { + t.Fatalf("expected 1 row, got %d", len(rows)) + } + + // Convert back to job + gotMeta, gotData, err := ParquetRowToJob(&rows[0]) + if err != nil { + t.Fatalf("ParquetRowToJob: %v", err) + } + + // Verify key fields survived the round trip + if gotMeta.JobID != 999 { + t.Errorf("JobID = %d, want 999", gotMeta.JobID) + } + if gotMeta.Cluster != "testcluster" { + t.Errorf("Cluster = %q, want %q", gotMeta.Cluster, "testcluster") + } + if gotMeta.User != "testuser" { + t.Errorf("User = %q, want %q", gotMeta.User, "testuser") + } + if gotMeta.State != schema.JobStateCompleted { + t.Errorf("State = %q, want %q", gotMeta.State, schema.JobStateCompleted) + } + if len(gotMeta.Tags) != 1 || gotMeta.Tags[0].Name != "roundtrip" { + t.Errorf("Tags = %v, want [{test roundtrip}]", gotMeta.Tags) + } + if len(gotMeta.Resources) != 2 { + t.Errorf("Resources len = %d, want 2", len(gotMeta.Resources)) + } + + if gotData == nil { + t.Fatal("JobData is nil") + } + if _, ok := (*gotData)["cpu_load"]; !ok { + t.Error("JobData missing cpu_load") + } +} diff --git a/pkg/archive/parquet/nodestate_schema.go b/pkg/archive/parquet/nodestate_schema.go new file mode 100644 index 00000000..c9dfe363 --- /dev/null +++ b/pkg/archive/parquet/nodestate_schema.go @@ -0,0 +1,20 @@ +// Copyright (C) NHR@FAU, University Erlangen-Nuremberg. +// All rights reserved. This file is part of cc-backend. +// Use of this source code is governed by a MIT-style +// license that can be found in the LICENSE file. + +package parquet + +type ParquetNodeStateRow struct { + TimeStamp int64 `parquet:"time_stamp"` + NodeState string `parquet:"node_state"` + HealthState string `parquet:"health_state"` + HealthMetrics string `parquet:"health_metrics,optional"` + CpusAllocated int32 `parquet:"cpus_allocated"` + MemoryAllocated int64 `parquet:"memory_allocated"` + GpusAllocated int32 `parquet:"gpus_allocated"` + JobsRunning int32 `parquet:"jobs_running"` + Hostname string `parquet:"hostname"` + Cluster string `parquet:"cluster"` + SubCluster string `parquet:"subcluster"` +} diff --git a/pkg/archive/parquet/nodestate_writer.go b/pkg/archive/parquet/nodestate_writer.go new file mode 100644 index 00000000..053417d6 --- /dev/null +++ b/pkg/archive/parquet/nodestate_writer.go @@ -0,0 +1,104 @@ +// Copyright (C) NHR@FAU, University Erlangen-Nuremberg. +// All rights reserved. This file is part of cc-backend. +// Use of this source code is governed by a MIT-style +// license that can be found in the LICENSE file. + +package parquet + +import ( + "bytes" + "fmt" + "time" + + cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger" + pq "github.com/parquet-go/parquet-go" +) + +// NodeStateParquetWriter batches ParquetNodeStateRows and flushes them to a target +// when the estimated size exceeds maxSizeBytes. +type NodeStateParquetWriter struct { + target ParquetTarget + maxSizeBytes int64 + rows []ParquetNodeStateRow + currentSize int64 + fileCounter int + datePrefix string +} + +// NewNodeStateParquetWriter creates a new writer for node state parquet files. +func NewNodeStateParquetWriter(target ParquetTarget, maxSizeMB int) *NodeStateParquetWriter { + return &NodeStateParquetWriter{ + target: target, + maxSizeBytes: int64(maxSizeMB) * 1024 * 1024, + datePrefix: time.Now().Format("2006-01-02"), + } +} + +// AddRow adds a row to the current batch. If the estimated batch size +// exceeds the configured maximum, the batch is flushed first. 
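+// Illustrative usage (error handling elided):
+//
+//	pw := NewNodeStateParquetWriter(target, 128)
+//	for _, r := range rows {
+//		_ = pw.AddRow(r)
+//	}
+//	_ = pw.Close()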
+func (pw *NodeStateParquetWriter) AddRow(row ParquetNodeStateRow) error { + rowSize := estimateNodeStateRowSize(&row) + + if pw.currentSize+rowSize > pw.maxSizeBytes && len(pw.rows) > 0 { + if err := pw.Flush(); err != nil { + return err + } + } + + pw.rows = append(pw.rows, row) + pw.currentSize += rowSize + return nil +} + +// Flush writes the current batch to a parquet file on the target. +func (pw *NodeStateParquetWriter) Flush() error { + if len(pw.rows) == 0 { + return nil + } + + pw.fileCounter++ + fileName := fmt.Sprintf("cc-nodestate-%s-%03d.parquet", pw.datePrefix, pw.fileCounter) + + data, err := writeNodeStateParquetBytes(pw.rows) + if err != nil { + return fmt.Errorf("write parquet buffer: %w", err) + } + + if err := pw.target.WriteFile(fileName, data); err != nil { + return fmt.Errorf("write parquet file %q: %w", fileName, err) + } + + cclog.Infof("NodeState retention: wrote %s (%d rows, %d bytes)", fileName, len(pw.rows), len(data)) + pw.rows = pw.rows[:0] + pw.currentSize = 0 + return nil +} + +// Close flushes any remaining rows and finalizes the writer. +func (pw *NodeStateParquetWriter) Close() error { + return pw.Flush() +} + +func writeNodeStateParquetBytes(rows []ParquetNodeStateRow) ([]byte, error) { + var buf bytes.Buffer + + writer := pq.NewGenericWriter[ParquetNodeStateRow](&buf, + pq.Compression(&pq.Snappy), + ) + + if _, err := writer.Write(rows); err != nil { + return nil, err + } + if err := writer.Close(); err != nil { + return nil, err + } + + return buf.Bytes(), nil +} + +func estimateNodeStateRowSize(row *ParquetNodeStateRow) int64 { + size := int64(100) // fixed numeric fields + size += int64(len(row.NodeState) + len(row.HealthState) + len(row.HealthMetrics)) + size += int64(len(row.Hostname) + len(row.Cluster) + len(row.SubCluster)) + return size +} diff --git a/pkg/archive/parquet/reader.go b/pkg/archive/parquet/reader.go new file mode 100644 index 00000000..32486bd5 --- /dev/null +++ b/pkg/archive/parquet/reader.go @@ -0,0 +1,216 @@ +// Copyright (C) NHR@FAU, University Erlangen-Nuremberg. +// All rights reserved. This file is part of cc-backend. +// Use of this source code is governed by a MIT-style +// license that can be found in the LICENSE file. + +package parquet + +import ( + "bytes" + "context" + "encoding/json" + "fmt" + "io" + "os" + "path/filepath" + "strings" + + "github.com/ClusterCockpit/cc-lib/v2/schema" + "github.com/aws/aws-sdk-go-v2/aws" + awsconfig "github.com/aws/aws-sdk-go-v2/config" + "github.com/aws/aws-sdk-go-v2/credentials" + "github.com/aws/aws-sdk-go-v2/service/s3" + pq "github.com/parquet-go/parquet-go" +) + +// ReadParquetFile reads all ParquetJobRow entries from parquet-encoded bytes. +func ReadParquetFile(data []byte) ([]ParquetJobRow, error) { + file, err := pq.OpenFile(bytes.NewReader(data), int64(len(data))) + if err != nil { + return nil, fmt.Errorf("open parquet: %w", err) + } + + reader := pq.NewGenericReader[ParquetJobRow](file) + defer reader.Close() + + numRows := file.NumRows() + rows := make([]ParquetJobRow, numRows) + n, err := reader.Read(rows) + if err != nil && err != io.EOF { + return nil, fmt.Errorf("read parquet rows: %w", err) + } + + return rows[:n], nil +} + +// ParquetSource abstracts reading parquet archives from different storage backends. 
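+// Two implementations follow: FileParquetSource for local directories and
+// S3ParquetSource for S3-compatible object stores.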
+type ParquetSource interface { + GetClusters() ([]string, error) + ListParquetFiles(cluster string) ([]string, error) + ReadFile(path string) ([]byte, error) + ReadClusterConfig(cluster string) (*schema.Cluster, error) +} + +// FileParquetSource reads parquet archives from a local filesystem directory. +type FileParquetSource struct { + path string +} + +func NewFileParquetSource(path string) *FileParquetSource { + return &FileParquetSource{path: path} +} + +func (fs *FileParquetSource) GetClusters() ([]string, error) { + entries, err := os.ReadDir(fs.path) + if err != nil { + return nil, fmt.Errorf("read directory: %w", err) + } + + var clusters []string + for _, e := range entries { + if e.IsDir() { + clusters = append(clusters, e.Name()) + } + } + return clusters, nil +} + +func (fs *FileParquetSource) ListParquetFiles(cluster string) ([]string, error) { + dir := filepath.Join(fs.path, cluster) + entries, err := os.ReadDir(dir) + if err != nil { + return nil, fmt.Errorf("read cluster directory: %w", err) + } + + var files []string + for _, e := range entries { + if !e.IsDir() && strings.HasSuffix(e.Name(), ".parquet") { + files = append(files, filepath.Join(cluster, e.Name())) + } + } + return files, nil +} + +func (fs *FileParquetSource) ReadFile(path string) ([]byte, error) { + return os.ReadFile(filepath.Join(fs.path, path)) +} + +func (fs *FileParquetSource) ReadClusterConfig(cluster string) (*schema.Cluster, error) { + data, err := os.ReadFile(filepath.Join(fs.path, cluster, "cluster.json")) + if err != nil { + return nil, fmt.Errorf("read cluster.json: %w", err) + } + var cfg schema.Cluster + if err := json.Unmarshal(data, &cfg); err != nil { + return nil, fmt.Errorf("unmarshal cluster config: %w", err) + } + return &cfg, nil +} + +// S3ParquetSource reads parquet archives from an S3-compatible object store. 
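+// Objects are expected under <cluster>/*.parquet with a <cluster>/cluster.json
+// per cluster, matching the layout written by ClusterAwareParquetWriter.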
+type S3ParquetSource struct { + client *s3.Client + bucket string +} + +func NewS3ParquetSource(cfg S3TargetConfig) (*S3ParquetSource, error) { + if cfg.Bucket == "" { + return nil, fmt.Errorf("S3 source: empty bucket name") + } + + region := cfg.Region + if region == "" { + region = "us-east-1" + } + + awsCfg, err := awsconfig.LoadDefaultConfig(context.Background(), + awsconfig.WithRegion(region), + awsconfig.WithCredentialsProvider( + credentials.NewStaticCredentialsProvider(cfg.AccessKey, cfg.SecretKey, ""), + ), + ) + if err != nil { + return nil, fmt.Errorf("S3 source: load AWS config: %w", err) + } + + opts := func(o *s3.Options) { + if cfg.Endpoint != "" { + o.BaseEndpoint = aws.String(cfg.Endpoint) + } + o.UsePathStyle = cfg.UsePathStyle + } + + client := s3.NewFromConfig(awsCfg, opts) + return &S3ParquetSource{client: client, bucket: cfg.Bucket}, nil +} + +func (ss *S3ParquetSource) GetClusters() ([]string, error) { + ctx := context.Background() + paginator := s3.NewListObjectsV2Paginator(ss.client, &s3.ListObjectsV2Input{ + Bucket: aws.String(ss.bucket), + Delimiter: aws.String("/"), + }) + + var clusters []string + for paginator.HasMorePages() { + page, err := paginator.NextPage(ctx) + if err != nil { + return nil, fmt.Errorf("S3 source: list clusters: %w", err) + } + for _, prefix := range page.CommonPrefixes { + if prefix.Prefix != nil { + name := strings.TrimSuffix(*prefix.Prefix, "/") + clusters = append(clusters, name) + } + } + } + return clusters, nil +} + +func (ss *S3ParquetSource) ListParquetFiles(cluster string) ([]string, error) { + ctx := context.Background() + prefix := cluster + "/" + paginator := s3.NewListObjectsV2Paginator(ss.client, &s3.ListObjectsV2Input{ + Bucket: aws.String(ss.bucket), + Prefix: aws.String(prefix), + }) + + var files []string + for paginator.HasMorePages() { + page, err := paginator.NextPage(ctx) + if err != nil { + return nil, fmt.Errorf("S3 source: list parquet files: %w", err) + } + for _, obj := range page.Contents { + if obj.Key != nil && strings.HasSuffix(*obj.Key, ".parquet") { + files = append(files, *obj.Key) + } + } + } + return files, nil +} + +func (ss *S3ParquetSource) ReadFile(path string) ([]byte, error) { + ctx := context.Background() + result, err := ss.client.GetObject(ctx, &s3.GetObjectInput{ + Bucket: aws.String(ss.bucket), + Key: aws.String(path), + }) + if err != nil { + return nil, fmt.Errorf("S3 source: get object %q: %w", path, err) + } + defer result.Body.Close() + return io.ReadAll(result.Body) +} + +func (ss *S3ParquetSource) ReadClusterConfig(cluster string) (*schema.Cluster, error) { + data, err := ss.ReadFile(cluster + "/cluster.json") + if err != nil { + return nil, fmt.Errorf("read cluster.json: %w", err) + } + var cfg schema.Cluster + if err := json.Unmarshal(data, &cfg); err != nil { + return nil, fmt.Errorf("unmarshal cluster config: %w", err) + } + return &cfg, nil +} diff --git a/pkg/archive/parquet/schema.go b/pkg/archive/parquet/schema.go new file mode 100644 index 00000000..74f82599 --- /dev/null +++ b/pkg/archive/parquet/schema.go @@ -0,0 +1,32 @@ +// Copyright (C) NHR@FAU, University Erlangen-Nuremberg. +// All rights reserved. This file is part of cc-backend. +// Use of this source code is governed by a MIT-style +// license that can be found in the LICENSE file. 
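+
+// ParquetJobRow (below) flattens job metadata into scalar columns plus
+// JSON-encoded blobs; the full metric data is stored gzip-compressed in the
+// metric_data_gz column (see convert.go).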
+ +package parquet + +type ParquetJobRow struct { + JobID int64 `parquet:"job_id"` + Cluster string `parquet:"cluster"` + SubCluster string `parquet:"sub_cluster"` + Partition string `parquet:"partition,optional"` + Project string `parquet:"project"` + User string `parquet:"user"` + State string `parquet:"job_state"` + StartTime int64 `parquet:"start_time"` + Duration int32 `parquet:"duration"` + Walltime int64 `parquet:"walltime"` + NumNodes int32 `parquet:"num_nodes"` + NumHWThreads int32 `parquet:"num_hwthreads"` + NumAcc int32 `parquet:"num_acc"` + Exclusive int32 `parquet:"exclusive"` + Energy float64 `parquet:"energy"` + SMT int32 `parquet:"smt"` + ResourcesJSON []byte `parquet:"resources_json"` + StatisticsJSON []byte `parquet:"statistics_json,optional"` + TagsJSON []byte `parquet:"tags_json,optional"` + MetaDataJSON []byte `parquet:"meta_data_json,optional"` + FootprintJSON []byte `parquet:"footprint_json,optional"` + EnergyFootJSON []byte `parquet:"energy_footprint_json,optional"` + MetricDataGz []byte `parquet:"metric_data_gz"` +} diff --git a/pkg/archive/parquet/target.go b/pkg/archive/parquet/target.go new file mode 100644 index 00000000..090a230d --- /dev/null +++ b/pkg/archive/parquet/target.go @@ -0,0 +1,104 @@ +// Copyright (C) NHR@FAU, University Erlangen-Nuremberg. +// All rights reserved. This file is part of cc-backend. +// Use of this source code is governed by a MIT-style +// license that can be found in the LICENSE file. + +package parquet + +import ( + "bytes" + "context" + "fmt" + "os" + "path/filepath" + + "github.com/aws/aws-sdk-go-v2/aws" + awsconfig "github.com/aws/aws-sdk-go-v2/config" + "github.com/aws/aws-sdk-go-v2/credentials" + "github.com/aws/aws-sdk-go-v2/service/s3" +) + +// ParquetTarget abstracts the destination for parquet file writes. +type ParquetTarget interface { + WriteFile(name string, data []byte) error +} + +// FileTarget writes parquet files to a local filesystem directory. +type FileTarget struct { + path string +} + +func NewFileTarget(path string) (*FileTarget, error) { + if err := os.MkdirAll(path, 0o750); err != nil { + return nil, fmt.Errorf("create target directory: %w", err) + } + return &FileTarget{path: path}, nil +} + +func (ft *FileTarget) WriteFile(name string, data []byte) error { + fullPath := filepath.Join(ft.path, name) + if err := os.MkdirAll(filepath.Dir(fullPath), 0o750); err != nil { + return fmt.Errorf("create parent directory: %w", err) + } + return os.WriteFile(fullPath, data, 0o640) +} + +// S3TargetConfig holds the configuration for an S3 parquet target. +type S3TargetConfig struct { + Endpoint string + Bucket string + AccessKey string + SecretKey string + Region string + UsePathStyle bool +} + +// S3Target writes parquet files to an S3-compatible object store. 
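+// When Endpoint is set it is used as the base endpoint (e.g. for S3-compatible
+// on-premise stores); Region defaults to "us-east-1" and static credentials are
+// taken from the target configuration.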
+type S3Target struct { + client *s3.Client + bucket string +} + +func NewS3Target(cfg S3TargetConfig) (*S3Target, error) { + if cfg.Bucket == "" { + return nil, fmt.Errorf("S3 target: empty bucket name") + } + + region := cfg.Region + if region == "" { + region = "us-east-1" + } + + awsCfg, err := awsconfig.LoadDefaultConfig(context.Background(), + awsconfig.WithRegion(region), + awsconfig.WithCredentialsProvider( + credentials.NewStaticCredentialsProvider(cfg.AccessKey, cfg.SecretKey, ""), + ), + ) + if err != nil { + return nil, fmt.Errorf("S3 target: load AWS config: %w", err) + } + + opts := func(o *s3.Options) { + if cfg.Endpoint != "" { + o.BaseEndpoint = aws.String(cfg.Endpoint) + } + o.UsePathStyle = cfg.UsePathStyle + } + + client := s3.NewFromConfig(awsCfg, opts) + return &S3Target{client: client, bucket: cfg.Bucket}, nil +} + +func (st *S3Target) WriteFile(name string, data []byte) error { + _, err := st.client.PutObject(context.Background(), &s3.PutObjectInput{ + Bucket: aws.String(st.bucket), + Key: aws.String(name), + Body: bytes.NewReader(data), + ContentType: aws.String("application/vnd.apache.parquet"), + }) + if err != nil { + return fmt.Errorf("S3 target: put object %q: %w", name, err) + } + return nil +} diff --git a/pkg/archive/parquet/writer.go b/pkg/archive/parquet/writer.go new file mode 100644 index 00000000..2669a9c8 --- /dev/null +++ b/pkg/archive/parquet/writer.go @@ -0,0 +1,181 @@ +// Copyright (C) NHR@FAU, University Erlangen-Nuremberg. +// All rights reserved. This file is part of cc-backend. +// Use of this source code is governed by a MIT-style +// license that can be found in the LICENSE file. + +package parquet + +import ( + "bytes" + "encoding/json" + "fmt" + "path" + "time" + + cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger" + "github.com/ClusterCockpit/cc-lib/v2/schema" + pq "github.com/parquet-go/parquet-go" +) + +// ParquetWriter batches ParquetJobRows and flushes them to a target +// when the estimated size exceeds maxSizeBytes. +type ParquetWriter struct { + target ParquetTarget + maxSizeBytes int64 + rows []ParquetJobRow + currentSize int64 + fileCounter int + datePrefix string +} + +// NewParquetWriter creates a new writer that flushes batches to the given target. +// maxSizeMB sets the approximate maximum size per parquet file in megabytes. +func NewParquetWriter(target ParquetTarget, maxSizeMB int) *ParquetWriter { + return &ParquetWriter{ + target: target, + maxSizeBytes: int64(maxSizeMB) * 1024 * 1024, + datePrefix: time.Now().Format("2006-01-02"), + } +} + +// AddJob adds a row to the current batch. If the estimated batch size +// exceeds the configured maximum, the batch is flushed to the target first. +func (pw *ParquetWriter) AddJob(row ParquetJobRow) error { + rowSize := estimateRowSize(&row) + + if pw.currentSize+rowSize > pw.maxSizeBytes && len(pw.rows) > 0 { + if err := pw.Flush(); err != nil { + return err + } + } + + pw.rows = append(pw.rows, row) + pw.currentSize += rowSize + return nil +} + +// Flush writes the current batch to a parquet file on the target. 
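+// File names follow the pattern cc-archive-<YYYY-MM-DD>-<NNN>.parquet and rows
+// are written with Snappy compression.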
+func (pw *ParquetWriter) Flush() error { + if len(pw.rows) == 0 { + return nil + } + + pw.fileCounter++ + fileName := fmt.Sprintf("cc-archive-%s-%03d.parquet", pw.datePrefix, pw.fileCounter) + + data, err := writeParquetBytes(pw.rows) + if err != nil { + return fmt.Errorf("write parquet buffer: %w", err) + } + + if err := pw.target.WriteFile(fileName, data); err != nil { + return fmt.Errorf("write parquet file %q: %w", fileName, err) + } + + cclog.Infof("Parquet retention: wrote %s (%d jobs, %d bytes)", fileName, len(pw.rows), len(data)) + pw.rows = pw.rows[:0] + pw.currentSize = 0 + return nil +} + +// Close flushes any remaining rows and finalizes the writer. +func (pw *ParquetWriter) Close() error { + return pw.Flush() +} + +func writeParquetBytes(rows []ParquetJobRow) ([]byte, error) { + var buf bytes.Buffer + + writer := pq.NewGenericWriter[ParquetJobRow](&buf, + pq.Compression(&pq.Snappy), + ) + + if _, err := writer.Write(rows); err != nil { + return nil, err + } + if err := writer.Close(); err != nil { + return nil, err + } + + return buf.Bytes(), nil +} + +func estimateRowSize(row *ParquetJobRow) int64 { + // Fixed fields: ~100 bytes for numeric fields + strings estimate + size := int64(200) + size += int64(len(row.Cluster) + len(row.SubCluster) + len(row.Partition) + + len(row.Project) + len(row.User) + len(row.State)) + size += int64(len(row.ResourcesJSON)) + size += int64(len(row.StatisticsJSON)) + size += int64(len(row.TagsJSON)) + size += int64(len(row.MetaDataJSON)) + size += int64(len(row.FootprintJSON)) + size += int64(len(row.EnergyFootJSON)) + size += int64(len(row.MetricDataGz)) + return size +} + +// prefixedTarget wraps a ParquetTarget and prepends a path prefix to all file names. +type prefixedTarget struct { + inner ParquetTarget + prefix string +} + +func (pt *prefixedTarget) WriteFile(name string, data []byte) error { + return pt.inner.WriteFile(path.Join(pt.prefix, name), data) +} + +// ClusterAwareParquetWriter organizes Parquet output by cluster. +// Each cluster gets its own subdirectory with a cluster.json config file. +type ClusterAwareParquetWriter struct { + target ParquetTarget + maxSizeMB int + writers map[string]*ParquetWriter + clusterCfgs map[string]*schema.Cluster +} + +// NewClusterAwareParquetWriter creates a writer that routes jobs to per-cluster ParquetWriters. +func NewClusterAwareParquetWriter(target ParquetTarget, maxSizeMB int) *ClusterAwareParquetWriter { + return &ClusterAwareParquetWriter{ + target: target, + maxSizeMB: maxSizeMB, + writers: make(map[string]*ParquetWriter), + clusterCfgs: make(map[string]*schema.Cluster), + } +} + +// SetClusterConfig stores a cluster configuration to be written as cluster.json on Close. +func (cw *ClusterAwareParquetWriter) SetClusterConfig(name string, cfg *schema.Cluster) { + cw.clusterCfgs[name] = cfg +} + +// AddJob routes the job row to the appropriate per-cluster writer. +func (cw *ClusterAwareParquetWriter) AddJob(row ParquetJobRow) error { + cluster := row.Cluster + pw, ok := cw.writers[cluster] + if !ok { + pw = NewParquetWriter(&prefixedTarget{inner: cw.target, prefix: cluster}, cw.maxSizeMB) + cw.writers[cluster] = pw + } + return pw.AddJob(row) +} + +// Close writes cluster.json files and flushes all per-cluster writers. 
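+// Illustrative usage (cluster name and error handling are placeholders):
+//
+//	cw := NewClusterAwareParquetWriter(target, 512)
+//	cw.SetClusterConfig("clusterA", clusterCfg)
+//	_ = cw.AddJob(row)
+//	_ = cw.Close()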
+func (cw *ClusterAwareParquetWriter) Close() error { + for name, cfg := range cw.clusterCfgs { + data, err := json.MarshalIndent(cfg, "", " ") + if err != nil { + return fmt.Errorf("marshal cluster config %q: %w", name, err) + } + if err := cw.target.WriteFile(path.Join(name, "cluster.json"), data); err != nil { + return fmt.Errorf("write cluster.json for %q: %w", name, err) + } + } + + for cluster, pw := range cw.writers { + if err := pw.Close(); err != nil { + return fmt.Errorf("close writer for cluster %q: %w", cluster, err) + } + } + return nil +} diff --git a/pkg/archive/parquet/writer_test.go b/pkg/archive/parquet/writer_test.go new file mode 100644 index 00000000..57b4ca4c --- /dev/null +++ b/pkg/archive/parquet/writer_test.go @@ -0,0 +1,361 @@ +// Copyright (C) NHR@FAU, University Erlangen-Nuremberg. +// All rights reserved. This file is part of cc-backend. +// Use of this source code is governed by a MIT-style +// license that can be found in the LICENSE file. + +package parquet + +import ( + "bytes" + "compress/gzip" + "encoding/json" + "io" + "os" + "path/filepath" + "strings" + "sync" + "testing" + + "github.com/ClusterCockpit/cc-lib/v2/schema" + pq "github.com/parquet-go/parquet-go" +) + +// memTarget collects written files in memory for testing. +type memTarget struct { + mu sync.Mutex + files map[string][]byte +} + +func newMemTarget() *memTarget { + return &memTarget{files: make(map[string][]byte)} +} + +func (m *memTarget) WriteFile(name string, data []byte) error { + m.mu.Lock() + defer m.mu.Unlock() + m.files[name] = append([]byte(nil), data...) + return nil +} + +func makeTestJob(jobID int64) (*schema.Job, *schema.JobData) { + meta := &schema.Job{ + JobID: jobID, + Cluster: "testcluster", + SubCluster: "sc0", + Project: "testproject", + User: "testuser", + State: schema.JobStateCompleted, + StartTime: 1700000000, + Duration: 3600, + Walltime: 7200, + NumNodes: 2, + NumHWThreads: 16, + SMT: 1, + Resources: []*schema.Resource{ + {Hostname: "node001"}, + {Hostname: "node002"}, + }, + } + + data := schema.JobData{ + "cpu_load": { + schema.MetricScopeNode: &schema.JobMetric{ + Unit: schema.Unit{Base: ""}, + Timestep: 60, + Series: []schema.Series{ + { + Hostname: "node001", + Data: []schema.Float{1.0, 2.0, 3.0}, + }, + }, + }, + }, + } + + return meta, &data +} + +func TestJobToParquetRowConversion(t *testing.T) { + meta, data := makeTestJob(1001) + meta.Tags = []*schema.Tag{{Type: "test", Name: "tag1"}} + meta.MetaData = map[string]string{"key": "value"} + + row, err := JobToParquetRow(meta, data) + if err != nil { + t.Fatalf("JobToParquetRow: %v", err) + } + + if row.JobID != 1001 { + t.Errorf("JobID = %d, want 1001", row.JobID) + } + if row.Cluster != "testcluster" { + t.Errorf("Cluster = %q, want %q", row.Cluster, "testcluster") + } + if row.User != "testuser" { + t.Errorf("User = %q, want %q", row.User, "testuser") + } + if row.State != "completed" { + t.Errorf("State = %q, want %q", row.State, "completed") + } + if row.NumNodes != 2 { + t.Errorf("NumNodes = %d, want 2", row.NumNodes) + } + + // Verify resources JSON + var resources []*schema.Resource + if err := json.Unmarshal(row.ResourcesJSON, &resources); err != nil { + t.Fatalf("unmarshal resources: %v", err) + } + if len(resources) != 2 { + t.Errorf("resources len = %d, want 2", len(resources)) + } + + // Verify tags JSON + var tags []*schema.Tag + if err := json.Unmarshal(row.TagsJSON, &tags); err != nil { + t.Fatalf("unmarshal tags: %v", err) + } + if len(tags) != 1 || tags[0].Name != "tag1" { + t.Errorf("tags 
= %v, want [{test tag1}]", tags) + } + + // Verify metric data is gzip-compressed valid JSON + gz, err := gzip.NewReader(bytes.NewReader(row.MetricDataGz)) + if err != nil { + t.Fatalf("gzip reader: %v", err) + } + decompressed, err := io.ReadAll(gz) + if err != nil { + t.Fatalf("gzip read: %v", err) + } + var jobData schema.JobData + if err := json.Unmarshal(decompressed, &jobData); err != nil { + t.Fatalf("unmarshal metric data: %v", err) + } + if _, ok := jobData["cpu_load"]; !ok { + t.Error("metric data missing cpu_load key") + } +} + +func TestParquetWriterSingleBatch(t *testing.T) { + target := newMemTarget() + pw := NewParquetWriter(target, 512) + + for i := int64(0); i < 5; i++ { + meta, data := makeTestJob(i) + row, err := JobToParquetRow(meta, data) + if err != nil { + t.Fatalf("convert job %d: %v", i, err) + } + if err := pw.AddJob(*row); err != nil { + t.Fatalf("add job %d: %v", i, err) + } + } + + if err := pw.Close(); err != nil { + t.Fatalf("close: %v", err) + } + + if len(target.files) != 1 { + t.Fatalf("expected 1 file, got %d", len(target.files)) + } + + // Verify the parquet file is readable + for name, data := range target.files { + file := bytes.NewReader(data) + pf, err := pq.OpenFile(file, int64(len(data))) + if err != nil { + t.Fatalf("open parquet %s: %v", name, err) + } + if pf.NumRows() != 5 { + t.Errorf("parquet rows = %d, want 5", pf.NumRows()) + } + } +} + +func TestParquetWriterBatching(t *testing.T) { + target := newMemTarget() + // Use a very small max size to force multiple files + pw := NewParquetWriter(target, 0) // 0 MB means every job triggers a flush + pw.maxSizeBytes = 1 // Force flush after every row + + for i := int64(0); i < 3; i++ { + meta, data := makeTestJob(i) + row, err := JobToParquetRow(meta, data) + if err != nil { + t.Fatalf("convert job %d: %v", i, err) + } + if err := pw.AddJob(*row); err != nil { + t.Fatalf("add job %d: %v", i, err) + } + } + + if err := pw.Close(); err != nil { + t.Fatalf("close: %v", err) + } + + // With maxSizeBytes=1, each AddJob should flush the previous batch, + // resulting in multiple files + if len(target.files) < 2 { + t.Errorf("expected multiple files due to batching, got %d", len(target.files)) + } + + // Verify all files are valid parquet + for name, data := range target.files { + file := bytes.NewReader(data) + _, err := pq.OpenFile(file, int64(len(data))) + if err != nil { + t.Errorf("invalid parquet file %s: %v", name, err) + } + } +} + +func TestFileTarget(t *testing.T) { + dir := t.TempDir() + ft, err := NewFileTarget(dir) + if err != nil { + t.Fatalf("NewFileTarget: %v", err) + } + + testData := []byte("test parquet data") + if err := ft.WriteFile("test.parquet", testData); err != nil { + t.Fatalf("WriteFile: %v", err) + } + + // Verify file exists and has correct content + // (using the target itself is sufficient; we just check no error) +} + +func TestFileTargetSubdirectories(t *testing.T) { + dir := t.TempDir() + ft, err := NewFileTarget(dir) + if err != nil { + t.Fatalf("NewFileTarget: %v", err) + } + + testData := []byte("test data in subdir") + if err := ft.WriteFile("fritz/cc-archive-2025-01-20-001.parquet", testData); err != nil { + t.Fatalf("WriteFile with subdir: %v", err) + } + + // Verify file was created in subdirectory + content, err := os.ReadFile(filepath.Join(dir, "fritz", "cc-archive-2025-01-20-001.parquet")) + if err != nil { + t.Fatalf("read file in subdir: %v", err) + } + if !bytes.Equal(content, testData) { + t.Error("file content mismatch") + } +} + +func 
makeTestJobForCluster(jobID int64, cluster string) (*schema.Job, *schema.JobData) { + meta, data := makeTestJob(jobID) + meta.Cluster = cluster + return meta, data +} + +func TestClusterAwareParquetWriter(t *testing.T) { + target := newMemTarget() + cw := NewClusterAwareParquetWriter(target, 512) + + // Set cluster configs + cw.SetClusterConfig("fritz", &schema.Cluster{Name: "fritz"}) + cw.SetClusterConfig("alex", &schema.Cluster{Name: "alex"}) + + // Add jobs from different clusters + for i := int64(0); i < 3; i++ { + meta, data := makeTestJobForCluster(i, "fritz") + row, err := JobToParquetRow(meta, data) + if err != nil { + t.Fatalf("convert fritz job %d: %v", i, err) + } + if err := cw.AddJob(*row); err != nil { + t.Fatalf("add fritz job %d: %v", i, err) + } + } + + for i := int64(10); i < 12; i++ { + meta, data := makeTestJobForCluster(i, "alex") + row, err := JobToParquetRow(meta, data) + if err != nil { + t.Fatalf("convert alex job %d: %v", i, err) + } + if err := cw.AddJob(*row); err != nil { + t.Fatalf("add alex job %d: %v", i, err) + } + } + + if err := cw.Close(); err != nil { + t.Fatalf("close: %v", err) + } + + target.mu.Lock() + defer target.mu.Unlock() + + // Check cluster.json files were written + if _, ok := target.files["fritz/cluster.json"]; !ok { + t.Error("missing fritz/cluster.json") + } + if _, ok := target.files["alex/cluster.json"]; !ok { + t.Error("missing alex/cluster.json") + } + + // Verify cluster.json content + var clusterCfg schema.Cluster + if err := json.Unmarshal(target.files["fritz/cluster.json"], &clusterCfg); err != nil { + t.Fatalf("unmarshal fritz cluster.json: %v", err) + } + if clusterCfg.Name != "fritz" { + t.Errorf("fritz cluster name = %q, want %q", clusterCfg.Name, "fritz") + } + + // Check parquet files are in cluster subdirectories + fritzParquets := 0 + alexParquets := 0 + for name := range target.files { + if strings.HasPrefix(name, "fritz/") && strings.HasSuffix(name, ".parquet") { + fritzParquets++ + } + if strings.HasPrefix(name, "alex/") && strings.HasSuffix(name, ".parquet") { + alexParquets++ + } + } + if fritzParquets == 0 { + t.Error("no parquet files in fritz/") + } + if alexParquets == 0 { + t.Error("no parquet files in alex/") + } + + // Verify parquet files are readable and have correct row counts + for name, data := range target.files { + if !strings.HasSuffix(name, ".parquet") { + continue + } + file := bytes.NewReader(data) + pf, err := pq.OpenFile(file, int64(len(data))) + if err != nil { + t.Errorf("open parquet %s: %v", name, err) + continue + } + if strings.HasPrefix(name, "fritz/") && pf.NumRows() != 3 { + t.Errorf("fritz parquet rows = %d, want 3", pf.NumRows()) + } + if strings.HasPrefix(name, "alex/") && pf.NumRows() != 2 { + t.Errorf("alex parquet rows = %d, want 2", pf.NumRows()) + } + } +} + +func TestClusterAwareParquetWriterEmpty(t *testing.T) { + target := newMemTarget() + cw := NewClusterAwareParquetWriter(target, 512) + + if err := cw.Close(); err != nil { + t.Fatalf("close empty writer: %v", err) + } + + if len(target.files) != 0 { + t.Errorf("expected no files for empty writer, got %d", len(target.files)) + } +} diff --git a/pkg/metricstore/healthcheck.go b/pkg/metricstore/healthcheck.go index ed1ff38e..d6def692 100644 --- a/pkg/metricstore/healthcheck.go +++ b/pkg/metricstore/healthcheck.go @@ -6,6 +6,7 @@ package metricstore import ( + "encoding/json" "fmt" "time" @@ -19,6 +20,13 @@ type HealthCheckResponse struct { Error error } +// HealthCheckResult holds the monitoring state and raw JSON health metrics 
+// for a single node as determined by HealthCheck. +type HealthCheckResult struct { + State schema.MonitoringState + HealthMetrics string // JSON: {"missing":[...],"degraded":[...]} +} + // MaxMissingDataPoints is the threshold for stale data detection. // A buffer is considered healthy if the gap between its last data point // and the current time is within MaxMissingDataPoints * frequency. @@ -134,15 +142,15 @@ func (m *MemoryStore) GetHealthyMetrics(selector []string, expectedMetrics []str // - MonitoringStateFailed: node not found, or no healthy metrics at all func (m *MemoryStore) HealthCheck(cluster string, nodes []string, expectedMetrics []string, -) (map[string]schema.MonitoringState, error) { - results := make(map[string]schema.MonitoringState, len(nodes)) +) (map[string]HealthCheckResult, error) { + results := make(map[string]HealthCheckResult, len(nodes)) for _, hostname := range nodes { selector := []string{cluster, hostname} degradedList, missingList, err := m.GetHealthyMetrics(selector, expectedMetrics) if err != nil { - results[hostname] = schema.MonitoringStateFailed + results[hostname] = HealthCheckResult{State: schema.MonitoringStateFailed} continue } @@ -158,13 +166,24 @@ func (m *MemoryStore) HealthCheck(cluster string, cclog.ComponentInfo("metricstore", "HealthCheck: node ", hostname, "missing metrics:", missingList) } + var state schema.MonitoringState switch { case degradedCount == 0 && missingCount == 0: - results[hostname] = schema.MonitoringStateFull + state = schema.MonitoringStateFull case healthyCount == 0: - results[hostname] = schema.MonitoringStateFailed + state = schema.MonitoringStateFailed default: - results[hostname] = schema.MonitoringStatePartial + state = schema.MonitoringStatePartial + } + + hm, _ := json.Marshal(map[string][]string{ + "missing": missingList, + "degraded": degradedList, + }) + + results[hostname] = HealthCheckResult{ + State: state, + HealthMetrics: string(hm), } } diff --git a/pkg/metricstore/metricstore_test.go b/pkg/metricstore/metricstore_test.go index 4d68d76c..a9ff0055 100644 --- a/pkg/metricstore/metricstore_test.go +++ b/pkg/metricstore/metricstore_test.go @@ -253,8 +253,8 @@ func TestHealthCheck(t *testing.T) { // Check status if wantStatus, ok := tt.wantStates[node]; ok { - if state != wantStatus { - t.Errorf("HealthCheck() node %s status = %v, want %v", node, state, wantStatus) + if state.State != wantStatus { + t.Errorf("HealthCheck() node %s status = %v, want %v", node, state.State, wantStatus) } } } diff --git a/pkg/metricstore/query.go b/pkg/metricstore/query.go index e5a49af3..709a9710 100644 --- a/pkg/metricstore/query.go +++ b/pkg/metricstore/query.go @@ -149,7 +149,7 @@ func (ccms *InternalMetricStore) LoadData( jobMetric.Series = append(jobMetric.Series, schema.Series{ Hostname: query.Hostname, - Id: id, + ID: id, Statistics: schema.MetricStatistics{ Avg: float64(res.Avg), Min: float64(res.Min), @@ -651,7 +651,7 @@ func (ccms *InternalMetricStore) LoadScopedStats( scopedJobStats[metric][scope] = append(scopedJobStats[metric][scope], &schema.ScopedStats{ Hostname: query.Hostname, - Id: id, + ID: id, Data: &schema.MetricStatistics{ Avg: float64(res.Avg), Min: float64(res.Min), @@ -894,7 +894,7 @@ func (ccms *InternalMetricStore) LoadNodeListData( scopeData.Series = append(scopeData.Series, schema.Series{ Hostname: query.Hostname, - Id: id, + ID: id, Statistics: schema.MetricStatistics{ Avg: float64(res.Avg), Min: float64(res.Min), diff --git a/tools/archive-manager/README.md b/tools/archive-manager/README.md new 
file mode 100644 index 00000000..c006a63e --- /dev/null +++ b/tools/archive-manager/README.md @@ -0,0 +1,148 @@ +# Archive Manager + +## Overview + +The `archive-manager` tool manages ClusterCockpit job archives. It supports inspecting archives, validating jobs, removing jobs by date range, importing jobs between archive backends, and converting archives between JSON and Parquet formats. + +## Features + +- **Archive Info**: Display statistics about an existing job archive +- **Validation**: Validate job archives against the JSON schema +- **Cleanup**: Remove jobs by date range +- **Import**: Copy jobs between archive backends (file, S3, SQLite) with parallel processing +- **Convert**: Convert archives between JSON and Parquet formats (both directions) +- **Progress Reporting**: Real-time progress display with ETA and throughput metrics +- **Graceful Interruption**: CTRL-C stops processing after finishing current jobs + +## Usage + +### Build + +```bash +go build ./tools/archive-manager/ +``` + +### Archive Info + +Display statistics about a job archive: + +```bash +./archive-manager -s ./var/job-archive +``` + +### Validate Archive + +```bash +./archive-manager -s ./var/job-archive --validate --config ./config.json +``` + +### Remove Jobs by Date + +```bash +# Remove jobs started before a date +./archive-manager -s ./var/job-archive --remove-before 2023-Jan-01 --config ./config.json + +# Remove jobs started after a date +./archive-manager -s ./var/job-archive --remove-after 2024-Dec-31 --config ./config.json +``` + +### Import Between Backends + +Import jobs from one archive backend to another (e.g., file to S3, file to SQLite): + +```bash +./archive-manager --import \ + --src-config '{"kind":"file","path":"./var/job-archive"}' \ + --dst-config '{"kind":"s3","endpoint":"https://s3.example.com","bucket":"archive","access-key":"...","secret-key":"..."}' +``` + +### Convert JSON to Parquet + +Convert a JSON job archive to Parquet format: + +```bash +./archive-manager --convert --format parquet \ + --src-config '{"kind":"file","path":"./var/job-archive"}' \ + --dst-config '{"kind":"file","path":"./var/parquet-archive"}' +``` + +The source (`--src-config`) is a standard archive backend config (file, S3, or SQLite). The destination (`--dst-config`) specifies where to write parquet files. + +### Convert Parquet to JSON + +Convert a Parquet archive back to JSON format: + +```bash +./archive-manager --convert --format json \ + --src-config '{"kind":"file","path":"./var/parquet-archive"}' \ + --dst-config '{"kind":"file","path":"./var/json-archive"}' +``` + +The source (`--src-config`) points to a directory or S3 bucket containing parquet files organized by cluster. The destination (`--dst-config`) is a standard archive backend config. 
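+
+A SQLite archive can also serve as the destination backend. A minimal sketch,
+assuming the `dbPath`-style SQLite config shown in the `--dst-config` flag help:
+
+```bash
+./archive-manager --convert --format json \
+  --src-config '{"kind":"file","path":"./var/parquet-archive"}' \
+  --dst-config '{"kind":"sqlite","dbPath":"./archive.db"}'
+```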
+ +### S3 Source/Destination Example + +Both conversion directions support S3: + +```bash +# JSON (S3) -> Parquet (local) +./archive-manager --convert --format parquet \ + --src-config '{"kind":"s3","endpoint":"https://s3.example.com","bucket":"json-archive","accessKey":"...","secretKey":"..."}' \ + --dst-config '{"kind":"file","path":"./var/parquet-archive"}' + +# Parquet (local) -> JSON (S3) +./archive-manager --convert --format json \ + --src-config '{"kind":"file","path":"./var/parquet-archive"}' \ + --dst-config '{"kind":"s3","endpoint":"https://s3.example.com","bucket":"json-archive","access-key":"...","secret-key":"..."}' +``` + +## Command-Line Options + +| Flag | Default | Description | +|------|---------|-------------| +| `-s` | `./var/job-archive` | Source job archive path (for info/validate/remove modes) | +| `--config` | `./config.json` | Path to config.json | +| `--loglevel` | `info` | Logging level: debug, info, warn, err, fatal, crit | +| `--logdate` | `false` | Add timestamps to log messages | +| `--validate` | `false` | Validate archive against JSON schema | +| `--remove-before` | | Remove jobs started before date (Format: 2006-Jan-02) | +| `--remove-after` | | Remove jobs started after date (Format: 2006-Jan-02) | +| `--import` | `false` | Import jobs between archive backends | +| `--convert` | `false` | Convert archive between JSON and Parquet formats | +| `--format` | `json` | Output format for conversion: `json` or `parquet` | +| `--max-file-size` | `512` | Max parquet file size in MB (only for parquet output) | +| `--src-config` | | Source config JSON (required for import/convert) | +| `--dst-config` | | Destination config JSON (required for import/convert) | + +## Parquet Archive Layout + +When converting to Parquet, the output is organized by cluster: + +``` +parquet-archive/ + clusterA/ + cluster.json + cc-archive-2025-01-20-001.parquet + cc-archive-2025-01-20-002.parquet + clusterB/ + cluster.json + cc-archive-2025-01-20-001.parquet +``` + +Each parquet file contains job metadata and gzip-compressed metric data. The `cluster.json` file preserves the cluster configuration from the source archive. + +## Round-Trip Conversion + +Archives can be converted from JSON to Parquet and back without data loss: + +```bash +# Original JSON archive +./archive-manager --convert --format parquet \ + --src-config '{"kind":"file","path":"./var/job-archive"}' \ + --dst-config '{"kind":"file","path":"./var/parquet-archive"}' + +# Convert back to JSON +./archive-manager --convert --format json \ + --src-config '{"kind":"file","path":"./var/parquet-archive"}' \ + --dst-config '{"kind":"file","path":"./var/json-archive"}' +``` diff --git a/tools/archive-manager/main.go b/tools/archive-manager/main.go index 918fc7c8..4a9094c0 100644 --- a/tools/archive-manager/main.go +++ b/tools/archive-manager/main.go @@ -23,6 +23,7 @@ import ( "github.com/ClusterCockpit/cc-backend/internal/config" "github.com/ClusterCockpit/cc-backend/pkg/archive" + pqarchive "github.com/ClusterCockpit/cc-backend/pkg/archive/parquet" ccconf "github.com/ClusterCockpit/cc-lib/v2/ccConfig" cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger" ) @@ -372,10 +373,207 @@ func importArchive(srcBackend, dstBackend archive.ArchiveBackend, srcConfig stri return finalImported, finalFailed, nil } +// parseSourceConfig parses the common kind/path/s3 fields from a config JSON string. 
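+// Two example config strings in the shape accepted here (the s3 fields apply
+// only to kind "s3"; values are placeholders):
+//
+//	{"kind":"file","path":"./var/parquet-archive"}
+//	{"kind":"s3","endpoint":"https://s3.example.com","bucket":"archive","accessKey":"...","secretKey":"..."}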
+type sourceConfig struct { + Kind string `json:"kind"` + Path string `json:"path"` + Endpoint string `json:"endpoint"` + Bucket string `json:"bucket"` + AccessKey string `json:"accessKey"` + SecretKey string `json:"secretKey"` + Region string `json:"region"` + UsePathStyle bool `json:"usePathStyle"` +} + +// createParquetTarget creates a ParquetTarget from a parsed config. +func createParquetTarget(cfg sourceConfig) (pqarchive.ParquetTarget, error) { + switch cfg.Kind { + case "s3": + return pqarchive.NewS3Target(pqarchive.S3TargetConfig{ + Endpoint: cfg.Endpoint, + Bucket: cfg.Bucket, + AccessKey: cfg.AccessKey, + SecretKey: cfg.SecretKey, + Region: cfg.Region, + UsePathStyle: cfg.UsePathStyle, + }) + default: + return pqarchive.NewFileTarget(cfg.Path) + } +} + +// createParquetSource creates a ParquetSource from a parsed config. +func createParquetSource(cfg sourceConfig) (pqarchive.ParquetSource, error) { + switch cfg.Kind { + case "s3": + return pqarchive.NewS3ParquetSource(pqarchive.S3TargetConfig{ + Endpoint: cfg.Endpoint, + Bucket: cfg.Bucket, + AccessKey: cfg.AccessKey, + SecretKey: cfg.SecretKey, + Region: cfg.Region, + UsePathStyle: cfg.UsePathStyle, + }) + default: + if cfg.Path == "" { + return nil, fmt.Errorf("file source: path is required") + } + return pqarchive.NewFileParquetSource(cfg.Path), nil + } +} + +// convertJSONToParquet converts a JSON archive backend to parquet format. +func convertJSONToParquet(srcBackend archive.ArchiveBackend, dstCfg sourceConfig, maxSizeMB int) error { + target, err := createParquetTarget(dstCfg) + if err != nil { + return fmt.Errorf("create parquet target: %w", err) + } + + cw := pqarchive.NewClusterAwareParquetWriter(target, maxSizeMB) + + // Transfer cluster configs + for _, clusterName := range srcBackend.GetClusters() { + clusterCfg, err := srcBackend.LoadClusterCfg(clusterName) + if err != nil { + cclog.Warnf("Convert: load cluster config %q: %v", clusterName, err) + continue + } + cw.SetClusterConfig(clusterName, clusterCfg) + } + + converted := 0 + failed := 0 + startTime := time.Now() + + for job := range srcBackend.Iter(true) { + if job.Meta == nil { + cclog.Warn("Skipping job with nil metadata") + failed++ + continue + } + if job.Data == nil { + cclog.Warnf("Job %d has no metric data, skipping", job.Meta.JobID) + failed++ + continue + } + + row, err := pqarchive.JobToParquetRow(job.Meta, job.Data) + if err != nil { + cclog.Warnf("Convert job %d: %v", job.Meta.JobID, err) + failed++ + continue + } + if err := cw.AddJob(*row); err != nil { + cclog.Errorf("Add job %d to writer: %v", job.Meta.JobID, err) + failed++ + continue + } + converted++ + + if converted%1000 == 0 { + cclog.Infof("Converted %d jobs so far...", converted) + } + } + + if err := cw.Close(); err != nil { + return fmt.Errorf("close parquet writer: %w", err) + } + + elapsed := time.Since(startTime) + cclog.Infof("JSON->Parquet conversion completed in %s: %d jobs converted, %d failed", + formatDuration(elapsed), converted, failed) + return nil +} + +// convertParquetToJSON converts a parquet archive to a JSON archive backend. 
+func convertParquetToJSON(srcCfg sourceConfig, dstBackend archive.ArchiveBackend) error { + src, err := createParquetSource(srcCfg) + if err != nil { + return fmt.Errorf("create parquet source: %w", err) + } + + clusters, err := src.GetClusters() + if err != nil { + return fmt.Errorf("list clusters: %w", err) + } + + converted := 0 + failed := 0 + skipped := 0 + startTime := time.Now() + + for _, cluster := range clusters { + // Transfer cluster config + clusterCfg, err := src.ReadClusterConfig(cluster) + if err != nil { + cclog.Warnf("Convert: read cluster config %q: %v", cluster, err) + } else { + if err := dstBackend.StoreClusterCfg(cluster, clusterCfg); err != nil { + cclog.Warnf("Convert: store cluster config %q: %v", cluster, err) + } else { + cclog.Infof("Imported cluster config for %s", cluster) + } + } + + // Read and convert parquet files + files, err := src.ListParquetFiles(cluster) + if err != nil { + cclog.Errorf("Convert: list parquet files for %q: %v", cluster, err) + continue + } + + for _, file := range files { + data, err := src.ReadFile(file) + if err != nil { + cclog.Errorf("Convert: read file %q: %v", file, err) + failed++ + continue + } + + rows, err := pqarchive.ReadParquetFile(data) + if err != nil { + cclog.Errorf("Convert: parse parquet file %q: %v", file, err) + failed++ + continue + } + + cclog.Infof("Processing %s: %d jobs", file, len(rows)) + + for _, row := range rows { + meta, jobData, err := pqarchive.ParquetRowToJob(&row) + if err != nil { + cclog.Warnf("Convert row to job: %v", err) + failed++ + continue + } + + if dstBackend.Exists(meta) { + skipped++ + continue + } + + if err := dstBackend.ImportJob(meta, jobData); err != nil { + cclog.Warnf("Import job %d: %v", meta.JobID, err) + failed++ + continue + } + converted++ + } + } + } + + elapsed := time.Since(startTime) + cclog.Infof("Parquet->JSON conversion completed in %s: %d jobs converted, %d skipped, %d failed", + formatDuration(elapsed), converted, skipped, failed) + return nil +} + func main() { var srcPath, flagConfigFile, flagLogLevel, flagRemoveCluster, flagRemoveAfter, flagRemoveBefore string var flagSrcConfig, flagDstConfig string - var flagLogDateTime, flagValidate, flagImport bool + var flagLogDateTime, flagValidate, flagImport, flagConvert bool + var flagFormat string + var flagMaxFileSize int flag.StringVar(&srcPath, "s", "./var/job-archive", "Specify the source job archive path. Default is ./var/job-archive") flag.BoolVar(&flagLogDateTime, "logdate", false, "Set this flag to add date and time to log messages") @@ -386,6 +584,9 @@ func main() { flag.StringVar(&flagRemoveAfter, "remove-after", "", "Remove all jobs with start time after date (Format: 2006-Jan-04)") flag.BoolVar(&flagValidate, "validate", false, "Set this flag to validate a job archive against the json schema") flag.BoolVar(&flagImport, "import", false, "Import jobs from source archive to destination archive") + flag.BoolVar(&flagConvert, "convert", false, "Convert archive between JSON and Parquet formats") + flag.StringVar(&flagFormat, "format", "json", "Output format for conversion: 'json' or 'parquet'") + flag.IntVar(&flagMaxFileSize, "max-file-size", 512, "Max parquet file size in MB (only for parquet output)") flag.StringVar(&flagSrcConfig, "src-config", "", "Source archive backend configuration (JSON), e.g. '{\"kind\":\"file\",\"path\":\"./archive\"}'") flag.StringVar(&flagDstConfig, "dst-config", "", "Destination archive backend configuration (JSON), e.g. 
'{\"kind\":\"sqlite\",\"dbPath\":\"./archive.db\"}'") flag.Parse() @@ -429,6 +630,49 @@ func main() { os.Exit(0) } + // Handle convert mode + if flagConvert { + if flagSrcConfig == "" || flagDstConfig == "" { + cclog.Fatal("Both --src-config and --dst-config must be specified for convert mode") + } + + var srcCfg, dstCfg sourceConfig + if err := json.Unmarshal([]byte(flagSrcConfig), &srcCfg); err != nil { + cclog.Fatalf("Failed to parse source config: %s", err.Error()) + } + if err := json.Unmarshal([]byte(flagDstConfig), &dstCfg); err != nil { + cclog.Fatalf("Failed to parse destination config: %s", err.Error()) + } + + switch flagFormat { + case "parquet": + // JSON archive -> Parquet: source is an archive backend + cclog.Info("Convert mode: JSON -> Parquet") + srcBackend, err := archive.InitBackend(json.RawMessage(flagSrcConfig)) + if err != nil { + cclog.Fatalf("Failed to initialize source backend: %s", err.Error()) + } + if err := convertJSONToParquet(srcBackend, dstCfg, flagMaxFileSize); err != nil { + cclog.Fatalf("Conversion failed: %s", err.Error()) + } + case "json": + // Parquet -> JSON archive: destination is an archive backend + cclog.Info("Convert mode: Parquet -> JSON") + dstBackend, err := archive.InitBackend(json.RawMessage(flagDstConfig)) + if err != nil { + cclog.Fatalf("Failed to initialize destination backend: %s", err.Error()) + } + if err := convertParquetToJSON(srcCfg, dstBackend); err != nil { + cclog.Fatalf("Conversion failed: %s", err.Error()) + } + default: + cclog.Fatalf("Unknown format %q: must be 'json' or 'parquet'", flagFormat) + } + + cclog.Info("Conversion finished successfully") + os.Exit(0) + } + ccconf.Init(flagConfigFile) // Load and check main configuration diff --git a/tools/convert-pem-pubkey/Readme.md b/tools/convert-pem-pubkey/Readme.md index 1429acc4..22fd0db2 100644 --- a/tools/convert-pem-pubkey/Readme.md +++ b/tools/convert-pem-pubkey/Readme.md @@ -16,7 +16,7 @@ CROSS_LOGIN_JWT_PUBLIC_KEY="+51iXX8BdLFocrppRxIw52xCOf8xFSH/eNilN5IHVGc=" Instructions -- `cd tools/convert-pem-pubkey-for-cc/` +- `cd tools/convert-pem-pubkey/` - Insert your public ed25519 PEM key into `dummy.pub` - `go run . dummy.pub` - Copy the result into ClusterCockpit's `.env` diff --git a/web/frontend/README.md b/web/frontend/README.md index d61d302e..4dff4405 100644 --- a/web/frontend/README.md +++ b/web/frontend/README.md @@ -1,11 +1,11 @@ # cc-frontend -[![Build](https://github.com/ClusterCockpit/cc-svelte-datatable/actions/workflows/build.yml/badge.svg)](https://github.com/ClusterCockpit/cc-svelte-datatable/actions/workflows/build.yml) +[![Build](https://github.com/ClusterCockpit/cc-backend/actions/workflows/test.yml/badge.svg)](https://github.com/ClusterCockpit/cc-backend/actions/workflows/test.yml) -A frontend for [ClusterCockpit](https://github.com/ClusterCockpit/ClusterCockpit) and [cc-backend](https://github.com/ClusterCockpit/cc-backend). Backend specific configuration can de done using the constants defined in the `intro` section in `./rollup.config.js`. +A frontend for [ClusterCockpit](https://github.com/ClusterCockpit/ClusterCockpit) and [cc-backend](https://github.com/ClusterCockpit/cc-backend). Backend specific configuration can be done using the constants defined in the `intro` section in `./rollup.config.mjs`. 
Builds on: -* [Svelte](https://svelte.dev/) +* [Svelte 5](https://svelte.dev/) * [SvelteStrap](https://sveltestrap.js.org/) * [Bootstrap 5](https://getbootstrap.com/) * [urql](https://github.com/FormidableLabs/urql) diff --git a/web/frontend/src/Job.root.svelte b/web/frontend/src/Job.root.svelte index 50de27b5..3baed1c1 100644 --- a/web/frontend/src/Job.root.svelte +++ b/web/frontend/src/Job.root.svelte @@ -30,7 +30,7 @@ import { init, groupByScope, - checkMetricDisabled, + checkMetricAvailability, } from "./generic/utils.js"; import Metric from "./job/Metric.svelte"; import MetricSelection from "./generic/select/MetricSelection.svelte"; @@ -151,17 +151,17 @@ } return names; }, []); - + // return metricNames.filter( (metric) => !metrics.some((jm) => jm.name == metric) && selectedMetrics.includes(metric) && - !checkMetricDisabled( + (checkMetricAvailability( globalMetrics, metric, thisJob.cluster, thisJob.subCluster, - ), + ) == "configured") ); } else { return [] @@ -212,7 +212,7 @@ inputMetrics.map((metric) => ({ metric: metric, data: grouped.find((group) => group[0].name == metric), - disabled: checkMetricDisabled( + availability: checkMetricAvailability( globalMetrics, metric, thisJob.cluster, @@ -333,7 +333,28 @@ {:else if thisJob && $jobMetrics?.data?.scopedJobStats} {#snippet gridContent(item)} - {#if item.data} + {#if item.availability == "none"} + + + Metric not configured + + +

+ No datasets returned for {item.metric}.
+ Metric is not configured for cluster {thisJob.cluster}.

+
+
+ {:else if item.availability == "disabled"} + + + Disabled Metric + + +

+ No dataset(s) returned for {item.metric}
+ Metric has been disabled for subcluster {thisJob.subCluster}.
+ To remove this card, open metric selection, de-select the metric, and press "Close and Apply".

+
+
+ {:else if item?.data} x.scope)} isShared={thisJob.shared != "none"} /> - {:else if item.disabled == true} - - - Disabled Metric - - -

- Metric {item.metric} is disabled for cluster {thisJob.cluster}:{thisJob.subCluster}.
- To remove this card, open metric selection and press "Close and Apply".

-
-
{:else} diff --git a/web/frontend/src/Jobs.root.svelte b/web/frontend/src/Jobs.root.svelte index 52efca6b..a06aee3c 100644 --- a/web/frontend/src/Jobs.root.svelte +++ b/web/frontend/src/Jobs.root.svelte @@ -142,6 +142,9 @@ 0)} + shortJobCutoff={ccconfig?.jobList_hideShortRunningJobs} showFilter={!showCompare} matchedJobs={showCompare? matchedCompareJobs: matchedListJobs} applyFilters={(detail) => { diff --git a/web/frontend/src/Node.root.svelte b/web/frontend/src/Node.root.svelte index 6962aff8..06056466 100644 --- a/web/frontend/src/Node.root.svelte +++ b/web/frontend/src/Node.root.svelte @@ -32,7 +32,7 @@ } from "@urql/svelte"; import { init, - checkMetricDisabled, + checkMetricAvailability, } from "./generic/utils.js"; import PlotGrid from "./generic/PlotGrid.svelte"; import MetricPlot from "./generic/plots/MetricPlot.svelte"; @@ -119,7 +119,7 @@ const filter = $derived([ { cluster: { eq: cluster } }, - { node: { contains: hostname } }, + { node: { eq: hostname } }, { state: ["running"] }, ]); @@ -242,7 +242,27 @@ {item.name} {systemUnits[item.name] ? "(" + systemUnits[item.name] + ")" : ""} - {#if item.disabled === false && item.metric} + {#if item.availability == "none"} + + + Metric not configured + + +

+ No datasets returned for {item.name}.
+ Metric is not configured for cluster {cluster}.

+
+
+ {:else if item.availability == "disabled"} + + + Disabled Metric + + +

+ No dataset(s) returned for {item.name}
+ Metric has been disabled for subcluster {$nodeMetricsData.data.nodeMetrics[0].subCluster}.

+
+
+ {:else if item?.metric} - {:else if item.disabled === true && item.metric} - Metric disabled for subcluster {item.name}:{$nodeMetricsData.data.nodeMetrics[0] - .subCluster} {:else} @@ -276,7 +289,7 @@ items={$nodeMetricsData.data.nodeMetrics[0].metrics .map((m) => ({ ...m, - disabled: checkMetricDisabled( + availability: checkMetricAvailability( globalMetrics, m.name, cluster, diff --git a/web/frontend/src/Systems.root.svelte b/web/frontend/src/Systems.root.svelte index fb5c4495..d89b5f06 100644 --- a/web/frontend/src/Systems.root.svelte +++ b/web/frontend/src/Systems.root.svelte @@ -272,8 +272,8 @@ {:else} - + {/if} {/if} diff --git a/web/frontend/src/User.root.svelte b/web/frontend/src/User.root.svelte index 76c9c97a..d086df14 100644 --- a/web/frontend/src/User.root.svelte +++ b/web/frontend/src/User.root.svelte @@ -219,9 +219,11 @@ 0)} + shortJobCutoff={ccconfig?.jobList_hideShortRunningJobs} showFilter={!showCompare} matchedJobs={showCompare? matchedCompareJobs: matchedListJobs} - startTimeQuickSelect applyFilters={(detail) => { jobFilters = [...detail.filters, { user: { eq: user.username } }]; selectedCluster = jobFilters[0]?.cluster diff --git a/web/frontend/src/generic/Filters.svelte b/web/frontend/src/generic/Filters.svelte index 74f55ca7..4031d0a5 100644 --- a/web/frontend/src/generic/Filters.svelte +++ b/web/frontend/src/generic/Filters.svelte @@ -6,6 +6,8 @@ - `filterPresets Object?`: Optional predefined filter values [Default: {}] - `disableClusterSelection Bool?`: Is the selection disabled [Default: false] - `startTimeQuickSelect Bool?`: Render startTime quick selections [Default: false] + - `shortJobQuickSelect Bool?`: Render short job quick selections [Default: false] + - `shortJobCutoff Int?`: Time in seconds for jobs to be considered short [Default: null] - `matchedJobs Number?`: Number of jobs matching the filter [Default: -2] - `showFilter Func`: If the filter component should be rendered in addition to total count info [Default: true] - `applyFilters Func`: The callback function to apply current filter selection @@ -25,6 +27,7 @@ ButtonGroup, ButtonDropdown, Icon, + Tooltip } from "@sveltestrap/sveltestrap"; import Info from "./filters/InfoBox.svelte"; import Cluster from "./filters/Cluster.svelte"; @@ -36,6 +39,7 @@ import Resources from "./filters/Resources.svelte"; import Energy from "./filters/Energy.svelte"; import Statistics from "./filters/Stats.svelte"; + import { formatDurationTime } from "./units.js"; /* Svelte 5 Props */ let { @@ -43,6 +47,8 @@ filterPresets = {}, disableClusterSelection = false, startTimeQuickSelect = false, + shortJobQuickSelect = false, + shortJobCutoff = 0, matchedJobs = -2, showFilter = true, applyFilters @@ -335,6 +341,44 @@ (isStatsOpen = true)}> (isStatsOpen = true)} /> Statistics + {#if shortJobQuickSelect && shortJobCutoff > 0} + + + Short Jobs Selection + + + Job duration less than {formatDurationTime(shortJobCutoff)} + + + { + filters.duration = { + moreThan: null, + lessThan: shortJobCutoff, + from: null, + to: null + } + updateFilters(); + }} + > + + Only Short Jobs + + { + filters.duration = { + moreThan: shortJobCutoff, + lessThan: null, + from: null, + to: null + } + updateFilters(); + }} + > + + Exclude Short Jobs + + {/if} {#if startTimeQuickSelect} Start Time Quick Selection @@ -407,7 +451,7 @@ {#if filters.startTime.range} (isStartTimeOpen = true)}> - {startTimeSelectOptions.find((stso) => stso.range === filters.startTime.range).rangeLabel } + Job Start: {startTimeSelectOptions.find((stso) => stso.range === 
filters.startTime.range).rangeLabel } {/if} diff --git a/web/frontend/src/generic/JobCompare.svelte b/web/frontend/src/generic/JobCompare.svelte index d5283a9a..dfe548b0 100644 --- a/web/frontend/src/generic/JobCompare.svelte +++ b/web/frontend/src/generic/JobCompare.svelte @@ -112,11 +112,7 @@ // (Re-)query and optionally set new filters; Query will be started reactively. export function queryJobs(filters) { if (filters != null) { - let minRunningFor = ccconfig.jobList_hideShortRunningJobs; - if (minRunningFor && minRunningFor > 0) { - filters.push({ minRunningFor }); - } - filter = filters; + filter = [...filters]; } } diff --git a/web/frontend/src/generic/JobList.svelte b/web/frontend/src/generic/JobList.svelte index 9394ed5f..3ccbb560 100644 --- a/web/frontend/src/generic/JobList.svelte +++ b/web/frontend/src/generic/JobList.svelte @@ -180,10 +180,6 @@ // (Re-)query and optionally set new filters; Query will be started reactively. export function queryJobs(filters) { if (filters != null) { - let minRunningFor = ccconfig.jobList_hideShortRunningJobs; - if (minRunningFor && minRunningFor > 0) { - filters.push({ minRunningFor }); - } filter = [...filters]; } }; @@ -309,7 +305,7 @@ {#if $jobsStore.fetching || !$jobsStore.data} -
+
diff --git a/web/frontend/src/generic/filters/StartTime.svelte b/web/frontend/src/generic/filters/StartTime.svelte index 5d9340e3..1a298349 100644 --- a/web/frontend/src/generic/filters/StartTime.svelte +++ b/web/frontend/src/generic/filters/StartTime.svelte @@ -14,8 +14,8 @@ @@ -211,39 +207,41 @@ {/if} {#each refinedData as metric, i (metric?.name || i)} - {#key metric} - {#if metric?.data} - {#if metric?.disabled} - - Metric {metric.data.name}: Disabled for subcluster {job.subCluster} - - {:else} - handleZoom(detail, metric.data.name)} - height={plotHeight} - timestep={metric.data.metric.timestep} - scope={metric.data.scope} - series={metric.data.metric.series} - statisticsSeries={metric.data.metric.statisticsSeries} - metric={metric.data.name} - cluster={clusterInfos.find((c) => c.name == job.cluster)} - subCluster={job.subCluster} - isShared={job.shared != "none"} - numhwthreads={job.numHWThreads} - numaccs={job.numAcc} - zoomState={zoomStates[metric.data.name] || null} - thresholdState={thresholdStates[metric.data.name] || null} - /> - {/if} - {:else} - -

- No dataset(s) returned for {metrics[i]}
- Metric or host was not found in metric store for cluster {job.cluster}:
- Identical messages in {metrics[i]} column: Metric not found.
- Identical messages in job {job.jobId} row: Host not found.

-
- {/if} - {/key} + {#if metric?.availability == "none"} + +

+ No dataset(s) returned for {metrics[i]}
+ Metric is not configured for cluster {job.cluster}.

+
+ {:else if metric?.availability == "disabled"} + +

+ No dataset(s) returned for {metrics[i]}
+ Metric has been disabled for subcluster {job.subCluster}.

+
+ {:else if metric?.data} + handleZoom(detail, metric.data.name)} + height={plotHeight} + timestep={metric.data.metric.timestep} + scope={metric.data.scope} + series={metric.data.metric.series} + statisticsSeries={metric.data.metric.statisticsSeries} + metric={metric.data.name} + cluster={clusterInfos.find((c) => c.name == job.cluster)} + subCluster={job.subCluster} + isShared={job.shared != "none"} + numhwthreads={job.numHWThreads} + numaccs={job.numAcc} + zoomState={zoomStates[metric.data.name] || null} + thresholdState={thresholdStates[metric.data.name] || null} + /> + {:else} + +

+ No dataset(s) returned for {metrics[i]}
+ Metric or host was not found in metric store for cluster {job.cluster}:
+ Identical messages in {metrics[i]} column: Metric not found.
+ Identical messages in job {job.jobId} row: Host not found.

+
+ {/if} {:else} diff --git a/web/frontend/src/generic/select/MetricSelection.svelte b/web/frontend/src/generic/select/MetricSelection.svelte index dcefa56d..8234b32c 100644 --- a/web/frontend/src/generic/select/MetricSelection.svelte +++ b/web/frontend/src/generic/select/MetricSelection.svelte @@ -88,16 +88,19 @@ function printAvailability(metric, cluster) { const avail = globalMetrics.find((gm) => gm.name === metric)?.availability - if (!cluster) { - return avail.map((av) => av.cluster).join(', ') - } else { - const subAvail = avail.find((av) => av.cluster === cluster)?.subClusters - if (subAvail) { - return subAvail.join(', ') + if (avail) { + if (!cluster) { + return avail.map((av) => av.cluster).join(', ') } else { - return `Not available for ${cluster}` + const subAvail = avail.find((av) => av.cluster === cluster)?.subClusters + if (subAvail) { + return subAvail.join(', ') + } else { + return `Not available for ${cluster}` + } } } + return "" } function columnsDragOver(event) { diff --git a/web/frontend/src/generic/utils.js b/web/frontend/src/generic/utils.js index 47e1ac9a..b012843d 100644 --- a/web/frontend/src/generic/utils.js +++ b/web/frontend/src/generic/utils.js @@ -302,20 +302,36 @@ export function stickyHeader(datatableHeaderSelector, updatePading) { onDestroy(() => document.removeEventListener("scroll", onscroll)); } -export function checkMetricDisabled(gm, m, c, s) { // [g]lobal[m]etrics, [m]etric, [c]luster, [s]ubcluster - const available = gm?.find((gm) => gm.name === m)?.availability?.find((av) => av.cluster === c)?.subClusters?.includes(s) - // Return inverse logic - return !available +export function checkMetricAvailability(gms, m, c, s = "") { // [g]lobal[m]etrics, [m]etric, [c]luster, [s]ubcluster + let pendingAvailability = "none" + const configured = gms?.find((gm) => gm.name === m)?.availability?.find((av) => av.cluster === c) + if (configured) { + pendingAvailability = "configured" + if (s != "") { + const enabled = configured.subClusters?.includes(s) + // Test inverse logic + if (!enabled) { + pendingAvailability = "disabled" + } + } + } + return pendingAvailability; } -export function checkMetricsDisabled(gm, ma, c, s) { // [g]lobal[m]etrics, [m]etric[a]rray, [c]luster, [s]ubcluster - let result = {}; - ma.forEach((m) => { - // Return named inverse logic: !available - result[m] = !(gm?.find((gm) => gm.name === m)?.availability?.find((av) => av.cluster === c)?.subClusters?.includes(s)) - }); - return result -} +// export function checkMetricDisabled(gm, m, c, s) { // [g]lobal[m]etrics, [m]etric, [c]luster, [s]ubcluster +// const available = gm?.find((gm) => gm.name === m)?.availability?.find((av) => av.cluster === c)?.subClusters?.includes(s) +// // Return inverse logic +// return !available +// } + +// export function checkMetricsDisabled(gm, ma, c, s) { // [g]lobal[m]etrics, [m]etric[a]rray, [c]luster, [s]ubcluster +// let aresult = {}; +// ma.forEach((m) => { +// // Return named inverse logic: !available +// aresult[m] = !(gm?.find((gm) => gm.name === m)?.availability?.find((av) => av.cluster === c)?.subClusters?.includes(s)) +// }); +// return aresult +// } export function getStatsItems(presetStats = []) { // console.time('stats') diff --git a/web/frontend/src/status/dashdetails/StatisticsDash.svelte b/web/frontend/src/status/dashdetails/StatisticsDash.svelte index 42c6823f..2cf8621e 100644 --- a/web/frontend/src/status/dashdetails/StatisticsDash.svelte +++ b/web/frontend/src/status/dashdetails/StatisticsDash.svelte @@ -35,6 +35,7 @@ /* Const Init */ const 
ccconfig = getContext("cc-config"); + const globalMetrics = getContext("globalMetrics"); const client = getContextClient(); /* State Init */ @@ -139,6 +140,7 @@ @@ -162,35 +162,32 @@
- {#if item?.data} - {#if item.disabled === true} - - Metric disabled for subcluster {selectedMetric}:{item.subCluster} - {:else if item.disabled === false} - - - {#key item.data[0].metric.series[0].data.length} - - {/key} - {:else} - - Global Metric List Not Initialized - Can not determine {selectedMetric} availability: Please Reload Page - - {/if} + {#if item?.availability == "disabled"} + + + Disabled Metric + + +

+ No dataset(s) returned for {selectedMetric}
+ Metric has been disabled for subcluster {item.subCluster}.

+
+
+ {:else if item?.data} + + + {#key item.data[0].metric.series[0].data.length} + + {/key} {:else} + Missing Metric @@ -205,10 +202,34 @@ {/each} {/key} +{:else if hostnameFilter || hoststateFilter != 'all'} + + + + Empty Filter Return + + +

+ No datasets returned for {selectedMetric}.
+ Hostname filter and/or host state filter returned no matches.

+
+
+
+{:else if notConfigured} + + + + Metric not configured + + +

+ No datasets returned for {selectedMetric}.
+ Metric is not configured for cluster {cluster}.

+
+
+
{:else} - - - + + + Missing Metric diff --git a/web/frontend/src/systems/nodelist/NodeInfo.svelte b/web/frontend/src/systems/nodelist/NodeInfo.svelte index 39716ca2..4b616f10 100644 --- a/web/frontend/src/systems/nodelist/NodeInfo.svelte +++ b/web/frontend/src/systems/nodelist/NodeInfo.svelte @@ -51,6 +51,8 @@ /* Derived */ // Not at least one returned, selected metric: NodeHealth warning + const fetchInfo = $derived(dataHealth.includes('fetching')); + // Not at least one returned, selected metric: NodeHealth warning const healthWarn = $derived(!dataHealth.includes(true)); // At least one non-returned selected metric: Metric config error? const metricWarn = $derived(dataHealth.includes(false)); @@ -84,10 +86,17 @@ - {#if healthWarn} + {#if fetchInfo} + + + + + {:else if healthWarn} - Jobs + Info