Merge pull request #488 from ClusterCockpit/log-aggregator

Log aggregator
Jan Eitzinger, 2026-02-16 09:38:31 +01:00 (committed by GitHub)
30 changed files with 2128 additions and 356 deletions

View File

@@ -108,6 +108,7 @@ The backend follows a layered architecture with clear separation of concerns:
   - File system backend (default)
   - S3 backend
   - SQLite backend (experimental)
+  - **parquet** sub-package: Parquet format support (schema, reader, writer, conversion)
 - **internal/metricstoreclient**: Client for cc-metric-store queries

 ### Frontend Structure

View File

@@ -184,7 +184,8 @@ ln -s <your-existing-job-archive> ./var/job-archive
 - [`tools/`](https://github.com/ClusterCockpit/cc-backend/tree/master/tools)
   Additional command line helper tools.
   - [`archive-manager`](https://github.com/ClusterCockpit/cc-backend/tree/master/tools/archive-manager)
-    Commands for getting infos about an existing job archive.
+    Commands for getting infos about an existing job archive, importing jobs
+    between archive backends, and converting archives between JSON and Parquet formats.
   - [`archive-migration`](https://github.com/ClusterCockpit/cc-backend/tree/master/tools/archive-migration)
     Tool for migrating job archives between formats.
   - [`convert-pem-pubkey`](https://github.com/ClusterCockpit/cc-backend/tree/master/tools/convert-pem-pubkey)

View File

@@ -12,7 +12,7 @@ NotifyAccess=all
 Restart=on-failure
 RestartSec=30
 TimeoutStopSec=100
-ExecStart=/opt/monitoring/cc-backend/cc-backend --config ./config.json
+ExecStart=/opt/monitoring/cc-backend/cc-backend --config ./config.json --server

 [Install]
 WantedBy=multi-user.target

internal/api/log.go (new file, 165 lines)
View File

@@ -0,0 +1,165 @@
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved. This file is part of cc-backend.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
package api
import (
"bufio"
"encoding/json"
"fmt"
"net/http"
"os/exec"
"regexp"
"strconv"
"strings"
"github.com/ClusterCockpit/cc-backend/internal/config"
"github.com/ClusterCockpit/cc-backend/internal/repository"
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
"github.com/ClusterCockpit/cc-lib/v2/schema"
)
type LogEntry struct {
Timestamp string `json:"timestamp"`
Priority int `json:"priority"`
Message string `json:"message"`
Unit string `json:"unit"`
}
var safePattern = regexp.MustCompile(`^[a-zA-Z0-9 :\-\.]+$`)
func (api *RestAPI) getJournalLog(rw http.ResponseWriter, r *http.Request) {
user := repository.GetUserFromContext(r.Context())
if !user.HasRole(schema.RoleAdmin) {
handleError(fmt.Errorf("only admins are allowed to view logs"), http.StatusForbidden, rw)
return
}
since := r.URL.Query().Get("since")
if since == "" {
since = "1 hour ago"
}
if !safePattern.MatchString(since) {
handleError(fmt.Errorf("invalid 'since' parameter"), http.StatusBadRequest, rw)
return
}
lines := 200
if l := r.URL.Query().Get("lines"); l != "" {
n, err := strconv.Atoi(l)
if err != nil || n < 1 {
handleError(fmt.Errorf("invalid 'lines' parameter"), http.StatusBadRequest, rw)
return
}
if n > 1000 {
n = 1000
}
lines = n
}
unit := config.Keys.SystemdUnit
if unit == "" {
unit = "clustercockpit.service"
}
args := []string{
"--output=json",
"--no-pager",
"-n", fmt.Sprintf("%d", lines),
"--since", since,
"-u", unit,
}
if level := r.URL.Query().Get("level"); level != "" {
n, err := strconv.Atoi(level)
if err != nil || n < 0 || n > 7 {
handleError(fmt.Errorf("invalid 'level' parameter (must be 0-7)"), http.StatusBadRequest, rw)
return
}
args = append(args, "--priority", fmt.Sprintf("%d", n))
}
if search := r.URL.Query().Get("search"); search != "" {
if !safePattern.MatchString(search) {
handleError(fmt.Errorf("invalid 'search' parameter"), http.StatusBadRequest, rw)
return
}
args = append(args, "--grep", search)
}
cclog.Debugf("calling journalctl with %s", strings.Join(args, " "))
cmd := exec.CommandContext(r.Context(), "journalctl", args...)
stdout, err := cmd.StdoutPipe()
if err != nil {
handleError(fmt.Errorf("failed to create pipe: %w", err), http.StatusInternalServerError, rw)
return
}
if err := cmd.Start(); err != nil {
handleError(fmt.Errorf("failed to start journalctl: %w", err), http.StatusInternalServerError, rw)
return
}
entries := make([]LogEntry, 0, lines)
scanner := bufio.NewScanner(stdout)
for scanner.Scan() {
var raw map[string]any
if err := json.Unmarshal(scanner.Bytes(), &raw); err != nil {
cclog.Debugf("error unmarshal log output: %v", err)
continue
}
priority := 6 // default info
if p, ok := raw["PRIORITY"]; ok {
switch v := p.(type) {
case string:
if n, err := strconv.Atoi(v); err == nil {
priority = n
}
case float64:
priority = int(v)
}
}
msg := ""
if m, ok := raw["MESSAGE"]; ok {
if s, ok := m.(string); ok {
msg = s
}
}
ts := ""
if t, ok := raw["__REALTIME_TIMESTAMP"]; ok {
if s, ok := t.(string); ok {
ts = s
}
}
unitName := ""
if u, ok := raw["_SYSTEMD_UNIT"]; ok {
if s, ok := u.(string); ok {
unitName = s
}
}
entries = append(entries, LogEntry{
Timestamp: ts,
Priority: priority,
Message: msg,
Unit: unitName,
})
}
if err := cmd.Wait(); err != nil {
// journalctl returns exit code 1 when --grep matches nothing
if len(entries) == 0 {
cclog.Debugf("journalctl exited with: %v", err)
}
}
rw.Header().Set("Content-Type", "application/json")
if err := json.NewEncoder(rw).Encode(entries); err != nil {
cclog.Errorf("Failed to encode log entries: %v", err)
}
}
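
For orientation, a minimal client-side sketch of the endpoint's query surface. The base URL and mount prefix are hypothetical (the route is registered in MountFrontendAPIRoutes below, behind session auth, and the handler rejects non-admin users); a real client would also carry the session cookie.

package main

import (
	"encoding/json"
	"fmt"
	"net/http"
	"net/url"
)

// Mirrors the LogEntry shape returned by getJournalLog.
type LogEntry struct {
	Timestamp string `json:"timestamp"`
	Priority  int    `json:"priority"`
	Message   string `json:"message"`
	Unit      string `json:"unit"`
}

func main() {
	q := url.Values{}
	q.Set("since", "2 hours ago") // must match safePattern (letters, digits, space, colon, dash, dot)
	q.Set("lines", "50")          // 1..1000, default 200
	q.Set("level", "3")           // journald priority 0-7; 3 = err and more severe
	q.Set("search", "sqlite")     // forwarded to journalctl --grep

	// Hypothetical base URL and mount prefix.
	resp, err := http.Get("http://localhost:8080/frontend/logs/?" + q.Encode())
	if err != nil {
		panic(err)
	}
	defer resp.Body.Close()

	var entries []LogEntry
	if err := json.NewDecoder(resp.Body).Decode(&entries); err != nil {
		panic(err)
	}
	for _, e := range entries {
		fmt.Printf("[p%d] %s %s: %s\n", e.Priority, e.Timestamp, e.Unit, e.Message)
	}
}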

View File

@@ -158,6 +158,7 @@ func (api *RestAPI) MountConfigAPIRoutes(r chi.Router) {
 // MountFrontendAPIRoutes registers frontend-specific API endpoints.
 // These routes support JWT generation and user configuration updates with session authentication.
 func (api *RestAPI) MountFrontendAPIRoutes(r chi.Router) {
+	r.Get("/logs/", api.getJournalLog)
 	// Settings Frontend Uses SessionAuth
 	if api.Authentication != nil {
 		r.Get("/jwt/", api.getJWT)

View File

@@ -72,14 +72,17 @@ type ProgramConfig struct {
 	// If exists, will enable dynamic zoom in frontend metric plots using the configured values
 	EnableResampling *ResampleConfig `json:"resampling"`
+	// Systemd unit name for log viewer (default: "clustercockpit.service")
+	SystemdUnit string `json:"systemd-unit"`
 	// Node state retention configuration
 	NodeStateRetention *NodeStateRetention `json:"nodestate-retention"`
 }

 type NodeStateRetention struct {
 	Policy         string `json:"policy"`      // "delete" or "parquet"
 	Age            int    `json:"age"`         // hours, default 24
 	TargetKind     string `json:"target-kind"` // "file" or "s3"
 	TargetPath     string `json:"target-path"`
 	TargetEndpoint string `json:"target-endpoint"`
 	TargetBucket   string `json:"target-bucket"`
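
A hypothetical config.json fragment for the new key; when it is unset, the log handler above falls back to "clustercockpit.service":

{
  "systemd-unit": "cc-backend.service"
}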

View File

@@ -38,6 +38,7 @@ CREATE INDEX IF NOT EXISTS nodestates_state_timestamp ON node_state (node_state,
 CREATE INDEX IF NOT EXISTS nodestates_health_timestamp ON node_state (health_state, time_stamp);
 CREATE INDEX IF NOT EXISTS nodestates_nodeid_state ON node_state (node_id, node_state);
 CREATE INDEX IF NOT EXISTS nodestates_nodeid_health ON node_state (node_id, health_state);
+CREATE INDEX IF NOT EXISTS nodestates_nodeid_timestamp ON node_state (node_id, time_stamp DESC);

 -- Add NEW Indices For Increased Amounts of Tags
 CREATE INDEX IF NOT EXISTS tags_jobid ON jobtag (job_id);
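
The new (node_id, time_stamp DESC) index pairs with the per-node latest-state lookup that the node repository switches to below; this is the correlated subquery it serves:

-- From latestStateCondition() in the node repository below:
-- resolves the newest node_state row per node via an index seek instead of a scan.
SELECT ns2.id
FROM node_state ns2
WHERE ns2.node_id = node.id
ORDER BY ns2.time_stamp DESC
LIMIT 1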

View File

@@ -52,6 +52,38 @@ func GetNodeRepository() *NodeRepository {
 	return nodeRepoInstance
 }

+// latestStateCondition returns a squirrel expression that restricts node_state
+// rows to the latest per node_id using a correlated subquery.
+// Requires the query to join node and node_state tables.
+func latestStateCondition() sq.Sqlizer {
+	return sq.Expr(
+		"node_state.id = (SELECT ns2.id FROM node_state ns2 WHERE ns2.node_id = node.id ORDER BY ns2.time_stamp DESC LIMIT 1)",
+	)
+}
+
+// applyNodeFilters applies common NodeFilter conditions to a query that joins
+// the node and node_state tables with latestStateCondition.
+func applyNodeFilters(query sq.SelectBuilder, filters []*model.NodeFilter) sq.SelectBuilder {
+	for _, f := range filters {
+		if f.Cluster != nil {
+			query = buildStringCondition("node.cluster", f.Cluster, query)
+		}
+		if f.SubCluster != nil {
+			query = buildStringCondition("node.subcluster", f.SubCluster, query)
+		}
+		if f.Hostname != nil {
+			query = buildStringCondition("node.hostname", f.Hostname, query)
+		}
+		if f.SchedulerState != nil {
+			query = query.Where("node_state.node_state = ?", f.SchedulerState)
+		}
+		if f.HealthState != nil {
+			query = query.Where("node_state.health_state = ?", f.HealthState)
+		}
+	}
+	return query
+}
+
 func (r *NodeRepository) FetchMetadata(hostname string, cluster string) (map[string]string, error) {
 	start := time.Now()
@@ -82,17 +114,16 @@ func (r *NodeRepository) FetchMetadata(hostname string, cluster string) (map[str
 func (r *NodeRepository) GetNode(hostname string, cluster string, withMeta bool) (*schema.Node, error) {
 	node := &schema.Node{}
-	var timestamp int
-	if err := sq.Select("node.hostname", "node.cluster", "node.subcluster", "node_state.node_state",
-		"node_state.health_state", "MAX(node_state.time_stamp) as time").
-		From("node_state").
-		Join("node ON node_state.node_id = node.id").
+	if err := sq.Select("node.hostname", "node.cluster", "node.subcluster",
+		"node_state.node_state", "node_state.health_state").
+		From("node").
+		Join("node_state ON node_state.node_id = node.id").
+		Where(latestStateCondition()).
 		Where("node.hostname = ?", hostname).
 		Where("node.cluster = ?", cluster).
-		GroupBy("node_state.node_id").
 		RunWith(r.DB).
-		QueryRow().Scan(&node.Hostname, &node.Cluster, &node.SubCluster, &node.NodeState, &node.HealthState, &timestamp); err != nil {
-		cclog.Warnf("Error while querying node '%s' at time '%d' from database: %v", hostname, timestamp, err)
+		QueryRow().Scan(&node.Hostname, &node.Cluster, &node.SubCluster, &node.NodeState, &node.HealthState); err != nil {
+		cclog.Warnf("Error while querying node '%s' from database: %v", hostname, err)
 		return nil, err
 	}
@@ -111,16 +142,15 @@ func (r *NodeRepository) GetNode(hostname string, cluster string, withMeta bool)
 func (r *NodeRepository) GetNodeByID(id int64, withMeta bool) (*schema.Node, error) {
 	node := &schema.Node{}
-	var timestamp int
-	if err := sq.Select("node.hostname", "node.cluster", "node.subcluster", "node_state.node_state",
-		"node_state.health_state", "MAX(node_state.time_stamp) as time").
-		From("node_state").
-		Join("node ON node_state.node_id = node.id").
+	if err := sq.Select("node.hostname", "node.cluster", "node.subcluster",
+		"node_state.node_state", "node_state.health_state").
+		From("node").
+		Join("node_state ON node_state.node_id = node.id").
+		Where(latestStateCondition()).
 		Where("node.id = ?", id).
-		GroupBy("node_state.node_id").
 		RunWith(r.DB).
-		QueryRow().Scan(&node.Hostname, &node.Cluster, &node.SubCluster, &node.NodeState, &node.HealthState, &timestamp); err != nil {
-		cclog.Warnf("Error while querying node ID '%d' at time '%d' from database: %v", id, timestamp, err)
+		QueryRow().Scan(&node.Hostname, &node.Cluster, &node.SubCluster, &node.NodeState, &node.HealthState); err != nil {
+		cclog.Warnf("Error while querying node ID '%d' from database: %v", id, err)
 		return nil, err
 	}
@@ -313,40 +343,17 @@ func (r *NodeRepository) QueryNodes(
 	order *model.OrderByInput, // Currently unused!
 ) ([]*schema.Node, error) {
 	query, qerr := AccessCheck(ctx,
-		sq.Select("hostname", "cluster", "subcluster", "node_state", "health_state", "MAX(time_stamp) as time").
+		sq.Select("node.hostname", "node.cluster", "node.subcluster",
+			"node_state.node_state", "node_state.health_state").
 			From("node").
-			Join("node_state ON node_state.node_id = node.id"))
+			Join("node_state ON node_state.node_id = node.id").
+			Where(latestStateCondition()))
 	if qerr != nil {
 		return nil, qerr
 	}

-	for _, f := range filters {
-		if f.Cluster != nil {
-			query = buildStringCondition("cluster", f.Cluster, query)
-		}
-		if f.SubCluster != nil {
-			query = buildStringCondition("subcluster", f.SubCluster, query)
-		}
-		if f.Hostname != nil {
-			query = buildStringCondition("hostname", f.Hostname, query)
-		}
-		if f.SchedulerState != nil {
-			query = query.Where("node_state = ?", f.SchedulerState)
-			// Requires Additional time_stamp Filter: Else the last (past!) time_stamp with queried state will be returned
-			// TODO: Hardcoded TimeDiff Suboptimal - Use Config Option?
-			now := time.Now().Unix()
-			query = query.Where(sq.Gt{"time_stamp": (now - 300)})
-		}
-		if f.HealthState != nil {
-			query = query.Where("health_state = ?", f.HealthState)
-			// Requires Additional time_stamp Filter: Else the last (past!) time_stamp with queried state will be returned
-			// TODO: Hardcoded TimeDiff Suboptimal - Use Config Option?
-			now := time.Now().Unix()
-			query = query.Where(sq.Gt{"time_stamp": (now - 300)})
-		}
-	}
-	query = query.GroupBy("node_id").OrderBy("hostname ASC")
+	query = applyNodeFilters(query, filters)
+	query = query.OrderBy("node.hostname ASC")

 	if page != nil && page.ItemsPerPage != -1 {
 		limit := uint64(page.ItemsPerPage)
@@ -363,11 +370,10 @@ func (r *NodeRepository) QueryNodes(
 	nodes := make([]*schema.Node, 0)
 	for rows.Next() {
 		node := schema.Node{}
-		var timestamp int
 		if err := rows.Scan(&node.Hostname, &node.Cluster, &node.SubCluster,
-			&node.NodeState, &node.HealthState, &timestamp); err != nil {
+			&node.NodeState, &node.HealthState); err != nil {
 			rows.Close()
-			cclog.Warnf("Error while scanning rows (QueryNodes) at time '%d'", timestamp)
+			cclog.Warn("Error while scanning rows (QueryNodes)")
 			return nil, err
 		}
 		nodes = append(nodes, &node)
@@ -377,74 +383,39 @@ func (r *NodeRepository) QueryNodes(
 }

 // CountNodes returns the total matched nodes based on a node filter. It always operates
-// on the last state (largest timestamp).
+// on the last state (largest timestamp) per node.
 func (r *NodeRepository) CountNodes(
 	ctx context.Context,
 	filters []*model.NodeFilter,
 ) (int, error) {
 	query, qerr := AccessCheck(ctx,
-		sq.Select("time_stamp", "count(*) as countRes").
+		sq.Select("COUNT(*)").
 			From("node").
-			Join("node_state ON node_state.node_id = node.id"))
+			Join("node_state ON node_state.node_id = node.id").
+			Where(latestStateCondition()))
 	if qerr != nil {
 		return 0, qerr
 	}

-	for _, f := range filters {
-		if f.Cluster != nil {
-			query = buildStringCondition("cluster", f.Cluster, query)
-		}
-		if f.SubCluster != nil {
-			query = buildStringCondition("subcluster", f.SubCluster, query)
-		}
-		if f.Hostname != nil {
-			query = buildStringCondition("hostname", f.Hostname, query)
-		}
-		if f.SchedulerState != nil {
-			query = query.Where("node_state = ?", f.SchedulerState)
-			// Requires Additional time_stamp Filter: Else the last (past!) time_stamp with queried state will be returned
-			// TODO: Hardcoded TimeDiff Suboptimal - Use Config Option?
-			now := time.Now().Unix()
-			query = query.Where(sq.Gt{"time_stamp": (now - 300)})
-		}
-		if f.HealthState != nil {
-			query = query.Where("health_state = ?", f.HealthState)
-			// Requires Additional time_stamp Filter: Else the last (past!) time_stamp with queried state will be returned
-			// TODO: Hardcoded TimeDiff Suboptimal - Use Config Option?
-			now := time.Now().Unix()
-			query = query.Where(sq.Gt{"time_stamp": (now - 300)})
-		}
-	}
-	query = query.GroupBy("time_stamp").OrderBy("time_stamp DESC").Limit(1)
+	query = applyNodeFilters(query, filters)

-	rows, err := query.RunWith(r.stmtCache).Query()
-	if err != nil {
+	var count int
+	if err := query.RunWith(r.stmtCache).QueryRow().Scan(&count); err != nil {
 		queryString, queryVars, _ := query.ToSql()
 		cclog.Errorf("Error while running query '%s' %v: %v", queryString, queryVars, err)
 		return 0, err
 	}
-
-	var totalNodes int
-	for rows.Next() {
-		var timestamp int
-		if err := rows.Scan(&timestamp, &totalNodes); err != nil {
-			rows.Close()
-			cclog.Warnf("Error while scanning rows (CountNodes) at time '%d'", timestamp)
-			return 0, err
-		}
-	}
-
-	return totalNodes, nil
+	return count, nil
 }

 func (r *NodeRepository) ListNodes(cluster string) ([]*schema.Node, error) {
-	q := sq.Select("node.hostname", "node.cluster", "node.subcluster", "node_state.node_state",
-		"node_state.health_state", "MAX(node_state.time_stamp) as time").
+	q := sq.Select("node.hostname", "node.cluster", "node.subcluster",
+		"node_state.node_state", "node_state.health_state").
 		From("node").
 		Join("node_state ON node_state.node_id = node.id").
+		Where(latestStateCondition()).
 		Where("node.cluster = ?", cluster).
-		GroupBy("node_state.node_id").
 		OrderBy("node.hostname ASC")

 	rows, err := q.RunWith(r.DB).Query()
@@ -456,10 +427,9 @@ func (r *NodeRepository) ListNodes(cluster string) ([]*schema.Node, error) {
 	defer rows.Close()
 	for rows.Next() {
 		node := &schema.Node{}
-		var timestamp int
 		if err := rows.Scan(&node.Hostname, &node.Cluster,
-			&node.SubCluster, &node.NodeState, &node.HealthState, &timestamp); err != nil {
-			cclog.Warnf("Error while scanning node list (ListNodes) at time '%d'", timestamp)
+			&node.SubCluster, &node.NodeState, &node.HealthState); err != nil {
+			cclog.Warn("Error while scanning node list (ListNodes)")
 			return nil, err
 		}
@@ -470,11 +440,11 @@ func (r *NodeRepository) ListNodes(cluster string) ([]*schema.Node, error) {
 }

 func (r *NodeRepository) MapNodes(cluster string) (map[string]string, error) {
-	q := sq.Select("node.hostname", "node_state.node_state", "MAX(node_state.time_stamp) as time").
+	q := sq.Select("node.hostname", "node_state.node_state").
 		From("node").
 		Join("node_state ON node_state.node_id = node.id").
+		Where(latestStateCondition()).
 		Where("node.cluster = ?", cluster).
-		GroupBy("node_state.node_id").
 		OrderBy("node.hostname ASC")

 	rows, err := q.RunWith(r.DB).Query()
@@ -487,9 +457,8 @@ func (r *NodeRepository) MapNodes(cluster string) (map[string]string, error) {
 	defer rows.Close()
 	for rows.Next() {
 		var hostname, nodestate string
-		var timestamp int
-		if err := rows.Scan(&hostname, &nodestate, &timestamp); err != nil {
-			cclog.Warnf("Error while scanning node list (MapNodes) at time '%d'", timestamp)
+		if err := rows.Scan(&hostname, &nodestate); err != nil {
+			cclog.Warn("Error while scanning node list (MapNodes)")
 			return nil, err
 		}
@@ -500,33 +469,16 @@ func (r *NodeRepository) MapNodes(cluster string) (map[string]string, error) {
 }

 func (r *NodeRepository) CountStates(ctx context.Context, filters []*model.NodeFilter, column string) ([]*model.NodeStates, error) {
-	query, qerr := AccessCheck(ctx, sq.Select("hostname", column, "MAX(time_stamp) as time").From("node"))
+	query, qerr := AccessCheck(ctx,
+		sq.Select(column).
+			From("node").
+			Join("node_state ON node_state.node_id = node.id").
+			Where(latestStateCondition()))
 	if qerr != nil {
 		return nil, qerr
 	}

-	query = query.Join("node_state ON node_state.node_id = node.id")
-	for _, f := range filters {
-		if f.Hostname != nil {
-			query = buildStringCondition("hostname", f.Hostname, query)
-		}
-		if f.Cluster != nil {
-			query = buildStringCondition("cluster", f.Cluster, query)
-		}
-		if f.SubCluster != nil {
-			query = buildStringCondition("subcluster", f.SubCluster, query)
-		}
-		if f.SchedulerState != nil {
-			query = query.Where("node_state = ?", f.SchedulerState)
-		}
-		if f.HealthState != nil {
-			query = query.Where("health_state = ?", f.HealthState)
-		}
-	}
-	// Add Group and Order
-	query = query.GroupBy("hostname").OrderBy("hostname DESC")
+	query = applyNodeFilters(query, filters)

 	rows, err := query.RunWith(r.stmtCache).Query()
 	if err != nil {
@@ -537,12 +489,10 @@ func (r *NodeRepository) CountStates(ctx context.Context, filters []*model.NodeF
 	stateMap := map[string]int{}
 	for rows.Next() {
-		var hostname, state string
-		var timestamp int
-		if err := rows.Scan(&hostname, &state, &timestamp); err != nil {
+		var state string
+		if err := rows.Scan(&state); err != nil {
 			rows.Close()
-			cclog.Warnf("Error while scanning rows (CountStates) at time '%d'", timestamp)
+			cclog.Warn("Error while scanning rows (CountStates)")
 			return nil, err
 		}
@@ -735,26 +685,14 @@ func (r *NodeRepository) GetNodesForList(
 		}
 	} else {
-		// DB Nodes: Count and Find Next Page
+		// DB Nodes: Count and derive hasNextPage from count
 		var cerr error
 		countNodes, cerr = r.CountNodes(ctx, queryFilters)
 		if cerr != nil {
 			cclog.Warn("error while counting node database data (Resolver.NodeMetricsList)")
 			return nil, nil, 0, false, cerr
 		}
-
-		// Example Page 4 @ 10 IpP : Does item 41 exist?
-		// Minimal Page 41 @ 1 IpP : If len(result) is 1, Page 5 exists.
-		nextPage := &model.PageRequest{
-			ItemsPerPage: 1,
-			Page:         ((page.Page * page.ItemsPerPage) + 1),
-		}
-		nextNodes, err := r.QueryNodes(ctx, queryFilters, nextPage, nil) // Order not Used
-		if err != nil {
-			cclog.Warn("Error while querying next nodes")
-			return nil, nil, 0, false, err
-		}
-		hasNextPage = len(nextNodes) == 1
+		hasNextPage = page.Page*page.ItemsPerPage < countNodes
 	}

 	// Fallback for non-init'd node table in DB; Ignores stateFilter
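
As an illustration (not part of the diff), the SQL that squirrel renders once latestStateCondition and a filter are applied; the cluster name is hypothetical:

q := sq.Select("node.hostname", "node_state.node_state").
	From("node").
	Join("node_state ON node_state.node_id = node.id").
	Where(latestStateCondition()).
	Where("node.cluster = ?", "cluster0")
sql, args, _ := q.ToSql()
// sql:  SELECT node.hostname, node_state.node_state FROM node
//       JOIN node_state ON node_state.node_id = node.id
//       WHERE node_state.id = (SELECT ns2.id FROM node_state ns2
//             WHERE ns2.node_id = node.id ORDER BY ns2.time_stamp DESC LIMIT 1)
//       AND node.cluster = ?
// args: ["cluster0"]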

View File

@@ -50,6 +50,7 @@ var routes []Route = []Route{
{"/monitoring/status/{cluster}", "monitoring/status.tmpl", "<ID> Dashboard - ClusterCockpit", false, setupClusterStatusRoute}, {"/monitoring/status/{cluster}", "monitoring/status.tmpl", "<ID> Dashboard - ClusterCockpit", false, setupClusterStatusRoute},
{"/monitoring/status/detail/{cluster}", "monitoring/status.tmpl", "Status of <ID> - ClusterCockpit", false, setupClusterDetailRoute}, {"/monitoring/status/detail/{cluster}", "monitoring/status.tmpl", "Status of <ID> - ClusterCockpit", false, setupClusterDetailRoute},
{"/monitoring/dashboard/{cluster}", "monitoring/dashboard.tmpl", "<ID> Dashboard - ClusterCockpit", false, setupDashboardRoute}, {"/monitoring/dashboard/{cluster}", "monitoring/dashboard.tmpl", "<ID> Dashboard - ClusterCockpit", false, setupDashboardRoute},
{"/monitoring/logs", "monitoring/logs.tmpl", "Logs - ClusterCockpit", false, func(i InfoType, r *http.Request) InfoType { return i }},
} }
func setupHomeRoute(i InfoType, r *http.Request) InfoType { func setupHomeRoute(i InfoType, r *http.Request) InfoType {

View File

@@ -6,157 +6,329 @@
 package taskmanager

 import (
+	"encoding/json"
+	"fmt"
+	"os"
+	"path/filepath"
+	"strconv"
+	"strings"
 	"time"

 	"github.com/ClusterCockpit/cc-backend/pkg/archive"
 	pqarchive "github.com/ClusterCockpit/cc-backend/pkg/archive/parquet"
 	cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
+	"github.com/ClusterCockpit/cc-lib/v2/schema"
 	"github.com/go-co-op/gocron/v2"
 )

+// createParquetTarget creates a ParquetTarget (file or S3) from the retention config.
+func createParquetTarget(cfg Retention) (pqarchive.ParquetTarget, error) {
+	switch cfg.TargetKind {
+	case "s3":
+		return pqarchive.NewS3Target(pqarchive.S3TargetConfig{
+			Endpoint:     cfg.TargetEndpoint,
+			Bucket:       cfg.TargetBucket,
+			AccessKey:    cfg.TargetAccessKey,
+			SecretKey:    cfg.TargetSecretKey,
+			Region:       cfg.TargetRegion,
+			UsePathStyle: cfg.TargetUsePathStyle,
+		})
+	default:
+		return pqarchive.NewFileTarget(cfg.TargetPath)
+	}
+}
+
+// createTargetBackend creates a secondary archive backend (file or S3) for JSON copy/move.
+func createTargetBackend(cfg Retention) (archive.ArchiveBackend, error) {
+	var raw json.RawMessage
+	var err error
+	switch cfg.TargetKind {
+	case "s3":
+		raw, err = json.Marshal(map[string]interface{}{
+			"kind":           "s3",
+			"endpoint":       cfg.TargetEndpoint,
+			"bucket":         cfg.TargetBucket,
+			"access-key":     cfg.TargetAccessKey,
+			"secret-key":     cfg.TargetSecretKey,
+			"region":         cfg.TargetRegion,
+			"use-path-style": cfg.TargetUsePathStyle,
+		})
+	default:
+		raw, err = json.Marshal(map[string]string{
+			"kind": "file",
+			"path": cfg.TargetPath,
+		})
+	}
+	if err != nil {
+		return nil, fmt.Errorf("marshal target config: %w", err)
+	}
+	return archive.InitBackend(raw)
+}
+
+// transferJobsJSON copies job data from source archive to target backend in JSON format.
+func transferJobsJSON(jobs []*schema.Job, src archive.ArchiveBackend, dst archive.ArchiveBackend) error {
+	// Transfer cluster configs for all clusters referenced by jobs
+	clustersDone := make(map[string]bool)
+	for _, job := range jobs {
+		if clustersDone[job.Cluster] {
+			continue
+		}
+		clusterCfg, err := src.LoadClusterCfg(job.Cluster)
+		if err != nil {
+			cclog.Warnf("Retention: load cluster config %q: %v", job.Cluster, err)
+		} else {
+			if err := dst.StoreClusterCfg(job.Cluster, clusterCfg); err != nil {
+				cclog.Warnf("Retention: store cluster config %q: %v", job.Cluster, err)
+			}
+		}
+		clustersDone[job.Cluster] = true
+	}
+
+	for _, job := range jobs {
+		meta, err := src.LoadJobMeta(job)
+		if err != nil {
+			cclog.Warnf("Retention: load meta for job %d: %v", job.JobID, err)
+			continue
+		}
+		data, err := src.LoadJobData(job)
+		if err != nil {
+			cclog.Warnf("Retention: load data for job %d: %v", job.JobID, err)
+			continue
+		}
+		if err := dst.ImportJob(meta, &data); err != nil {
+			cclog.Warnf("Retention: import job %d: %v", job.JobID, err)
+			continue
+		}
+	}
+	return nil
+}
+
+// transferJobsParquet converts jobs to Parquet format, organized by cluster.
+func transferJobsParquet(jobs []*schema.Job, src archive.ArchiveBackend, target pqarchive.ParquetTarget, maxSizeMB int) error {
+	cw := pqarchive.NewClusterAwareParquetWriter(target, maxSizeMB)
+
+	// Set cluster configs for all clusters referenced by jobs
+	clustersDone := make(map[string]bool)
+	for _, job := range jobs {
+		if clustersDone[job.Cluster] {
+			continue
+		}
+		clusterCfg, err := src.LoadClusterCfg(job.Cluster)
+		if err != nil {
+			cclog.Warnf("Retention: load cluster config %q: %v", job.Cluster, err)
+		} else {
+			cw.SetClusterConfig(job.Cluster, clusterCfg)
+		}
+		clustersDone[job.Cluster] = true
+	}
+
+	for _, job := range jobs {
+		meta, err := src.LoadJobMeta(job)
+		if err != nil {
+			cclog.Warnf("Retention: load meta for job %d: %v", job.JobID, err)
+			continue
+		}
+		data, err := src.LoadJobData(job)
+		if err != nil {
+			cclog.Warnf("Retention: load data for job %d: %v", job.JobID, err)
+			continue
+		}
+		row, err := pqarchive.JobToParquetRow(meta, &data)
+		if err != nil {
+			cclog.Warnf("Retention: convert job %d: %v", job.JobID, err)
+			continue
+		}
+		if err := cw.AddJob(*row); err != nil {
+			cclog.Errorf("Retention: add job %d to writer: %v", job.JobID, err)
+			continue
+		}
+	}
+	return cw.Close()
+}
+
+// cleanupAfterTransfer removes jobs from archive and optionally from DB.
+func cleanupAfterTransfer(jobs []*schema.Job, startTime int64, includeDB bool, omitTagged bool) {
+	archive.GetHandle().CleanUp(jobs)
+
+	if includeDB {
+		cnt, err := jobRepo.DeleteJobsBefore(startTime, omitTagged)
+		if err != nil {
+			cclog.Errorf("Retention: delete jobs from db: %v", err)
+		} else {
+			cclog.Infof("Retention: removed %d jobs from db", cnt)
+		}
+		if err = jobRepo.Optimize(); err != nil {
+			cclog.Errorf("Retention: db optimization error: %v", err)
+		}
+	}
+}
+
+// readCopyMarker reads the last-processed timestamp from a copy marker file.
+func readCopyMarker(cfg Retention) int64 {
+	var data []byte
+	var err error
+	switch cfg.TargetKind {
+	case "s3":
+		// For S3 we store the marker locally alongside the config
+		data, err = os.ReadFile(copyMarkerPath(cfg))
+	default:
+		data, err = os.ReadFile(filepath.Join(cfg.TargetPath, ".copy-marker"))
+	}
+	if err != nil {
+		return 0
+	}
+	ts, err := strconv.ParseInt(strings.TrimSpace(string(data)), 10, 64)
+	if err != nil {
+		return 0
+	}
+	return ts
+}
+
+// writeCopyMarker writes the last-processed timestamp to a copy marker file.
+func writeCopyMarker(cfg Retention, ts int64) {
+	content := []byte(strconv.FormatInt(ts, 10))
+	var err error
+	switch cfg.TargetKind {
+	case "s3":
+		err = os.WriteFile(copyMarkerPath(cfg), content, 0o640)
+	default:
+		err = os.WriteFile(filepath.Join(cfg.TargetPath, ".copy-marker"), content, 0o640)
+	}
+	if err != nil {
+		cclog.Warnf("Retention: write copy marker: %v", err)
+	}
+}
+
+func copyMarkerPath(cfg Retention) string {
+	// For S3 targets, store the marker in a local temp-style path derived from the bucket name
+	return filepath.Join(os.TempDir(), fmt.Sprintf("cc-copy-marker-%s", cfg.TargetBucket))
+}
+
-func RegisterRetentionDeleteService(age int, includeDB bool, omitTagged bool) {
+func RegisterRetentionDeleteService(cfg Retention) {
 	cclog.Info("Register retention delete service")

 	s.NewJob(gocron.DailyJob(1, gocron.NewAtTimes(gocron.NewAtTime(3, 0, 0))),
 		gocron.NewTask(
 			func() {
-				startTime := time.Now().Unix() - int64(age*24*3600)
-				jobs, err := jobRepo.FindJobsBetween(0, startTime, omitTagged)
+				startTime := time.Now().Unix() - int64(cfg.Age*24*3600)
+				jobs, err := jobRepo.FindJobsBetween(0, startTime, cfg.OmitTagged)
 				if err != nil {
-					cclog.Warnf("Error while looking for retention jobs: %s", err.Error())
+					cclog.Warnf("Retention delete: error finding jobs: %v", err)
+					return
 				}
-				archive.GetHandle().CleanUp(jobs)
-
-				if includeDB {
-					cnt, err := jobRepo.DeleteJobsBefore(startTime, omitTagged)
-					if err != nil {
-						cclog.Errorf("Error while deleting retention jobs from db: %s", err.Error())
-					} else {
-						cclog.Infof("Retention: Removed %d jobs from db", cnt)
-					}
-					if err = jobRepo.Optimize(); err != nil {
-						cclog.Errorf("Error occured in db optimization: %s", err.Error())
-					}
-				}
+				if len(jobs) == 0 {
+					return
+				}
+				cclog.Infof("Retention delete: processing %d jobs", len(jobs))
+				cleanupAfterTransfer(jobs, startTime, cfg.IncludeDB, cfg.OmitTagged)
 			}))
 }

-func RegisterRetentionMoveService(age int, includeDB bool, location string, omitTagged bool) {
-	cclog.Info("Register retention move service")
-
-	s.NewJob(gocron.DailyJob(1, gocron.NewAtTimes(gocron.NewAtTime(4, 0, 0))),
-		gocron.NewTask(
-			func() {
-				startTime := time.Now().Unix() - int64(age*24*3600)
-				jobs, err := jobRepo.FindJobsBetween(0, startTime, omitTagged)
-				if err != nil {
-					cclog.Warnf("Error while looking for retention jobs: %s", err.Error())
-				}
-				archive.GetHandle().Move(jobs, location)
-
-				if includeDB {
-					cnt, err := jobRepo.DeleteJobsBefore(startTime, omitTagged)
-					if err != nil {
-						cclog.Errorf("Error while deleting retention jobs from db: %v", err)
-					} else {
-						cclog.Infof("Retention: Removed %d jobs from db", cnt)
-					}
-					if err = jobRepo.Optimize(); err != nil {
-						cclog.Errorf("Error occured in db optimization: %v", err)
-					}
-				}
-			}))
-}
-
-func RegisterRetentionParquetService(retention Retention) {
-	cclog.Info("Register retention parquet service")
-
-	maxFileSizeMB := retention.MaxFileSizeMB
-	if maxFileSizeMB <= 0 {
-		maxFileSizeMB = 512
-	}
-
-	var target pqarchive.ParquetTarget
-	var err error
-	switch retention.TargetKind {
-	case "s3":
-		target, err = pqarchive.NewS3Target(pqarchive.S3TargetConfig{
-			Endpoint:     retention.TargetEndpoint,
-			Bucket:       retention.TargetBucket,
-			AccessKey:    retention.TargetAccessKey,
-			SecretKey:    retention.TargetSecretKey,
-			Region:       retention.TargetRegion,
-			UsePathStyle: retention.TargetUsePathStyle,
-		})
-	default:
-		target, err = pqarchive.NewFileTarget(retention.TargetPath)
-	}
-	if err != nil {
-		cclog.Errorf("Parquet retention: failed to create target: %v", err)
-		return
-	}
-
-	s.NewJob(gocron.DailyJob(1, gocron.NewAtTimes(gocron.NewAtTime(5, 0, 0))),
-		gocron.NewTask(
-			func() {
-				startTime := time.Now().Unix() - int64(retention.Age*24*3600)
-				jobs, err := jobRepo.FindJobsBetween(0, startTime, retention.OmitTagged)
-				if err != nil {
-					cclog.Warnf("Parquet retention: error finding jobs: %v", err)
-					return
-				}
-				if len(jobs) == 0 {
-					return
-				}
-				cclog.Infof("Parquet retention: processing %d jobs", len(jobs))
-
-				ar := archive.GetHandle()
-				pw := pqarchive.NewParquetWriter(target, maxFileSizeMB)
-
-				for _, job := range jobs {
-					meta, err := ar.LoadJobMeta(job)
-					if err != nil {
-						cclog.Warnf("Parquet retention: load meta for job %d: %v", job.JobID, err)
-						continue
-					}
-					data, err := ar.LoadJobData(job)
-					if err != nil {
-						cclog.Warnf("Parquet retention: load data for job %d: %v", job.JobID, err)
-						continue
-					}
-					row, err := pqarchive.JobToParquetRow(meta, &data)
-					if err != nil {
-						cclog.Warnf("Parquet retention: convert job %d: %v", job.JobID, err)
-						continue
-					}
-					if err := pw.AddJob(*row); err != nil {
-						cclog.Errorf("Parquet retention: add job %d to writer: %v", job.JobID, err)
-						continue
-					}
-				}
-
-				if err := pw.Close(); err != nil {
-					cclog.Errorf("Parquet retention: close writer: %v", err)
-					return
-				}
-				ar.CleanUp(jobs)
-
-				if retention.IncludeDB {
-					cnt, err := jobRepo.DeleteJobsBefore(startTime, retention.OmitTagged)
-					if err != nil {
-						cclog.Errorf("Parquet retention: delete jobs from db: %v", err)
-					} else {
-						cclog.Infof("Parquet retention: removed %d jobs from db", cnt)
-					}
-					if err = jobRepo.Optimize(); err != nil {
-						cclog.Errorf("Parquet retention: db optimization error: %v", err)
-					}
-				}
-			}))
-}
+func RegisterRetentionCopyService(cfg Retention) {
+	cclog.Infof("Register retention copy service (format=%s, target=%s)", cfg.Format, cfg.TargetKind)
+
+	maxFileSizeMB := cfg.MaxFileSizeMB
+	if maxFileSizeMB <= 0 {
+		maxFileSizeMB = 512
+	}
+
+	s.NewJob(gocron.DailyJob(1, gocron.NewAtTimes(gocron.NewAtTime(4, 0, 0))),
+		gocron.NewTask(
+			func() {
+				cutoff := time.Now().Unix() - int64(cfg.Age*24*3600)
+				lastProcessed := readCopyMarker(cfg)
+				jobs, err := jobRepo.FindJobsBetween(lastProcessed, cutoff, cfg.OmitTagged)
+				if err != nil {
+					cclog.Warnf("Retention copy: error finding jobs: %v", err)
+					return
+				}
+				if len(jobs) == 0 {
+					return
+				}
+				cclog.Infof("Retention copy: processing %d jobs", len(jobs))
+
+				ar := archive.GetHandle()
+				switch cfg.Format {
+				case "parquet":
+					target, err := createParquetTarget(cfg)
+					if err != nil {
+						cclog.Errorf("Retention copy: create parquet target: %v", err)
+						return
+					}
+					if err := transferJobsParquet(jobs, ar, target, maxFileSizeMB); err != nil {
+						cclog.Errorf("Retention copy: parquet transfer: %v", err)
+						return
+					}
+				default: // json
+					dst, err := createTargetBackend(cfg)
+					if err != nil {
+						cclog.Errorf("Retention copy: create target backend: %v", err)
+						return
+					}
+					if err := transferJobsJSON(jobs, ar, dst); err != nil {
+						cclog.Errorf("Retention copy: json transfer: %v", err)
+						return
+					}
+				}
+				writeCopyMarker(cfg, cutoff)
+			}))
+}
+
+func RegisterRetentionMoveService(cfg Retention) {
+	cclog.Infof("Register retention move service (format=%s, target=%s)", cfg.Format, cfg.TargetKind)
+
+	maxFileSizeMB := cfg.MaxFileSizeMB
+	if maxFileSizeMB <= 0 {
+		maxFileSizeMB = 512
+	}
+
+	s.NewJob(gocron.DailyJob(1, gocron.NewAtTimes(gocron.NewAtTime(5, 0, 0))),
+		gocron.NewTask(
+			func() {
+				startTime := time.Now().Unix() - int64(cfg.Age*24*3600)
+				jobs, err := jobRepo.FindJobsBetween(0, startTime, cfg.OmitTagged)
+				if err != nil {
+					cclog.Warnf("Retention move: error finding jobs: %v", err)
+					return
+				}
+				if len(jobs) == 0 {
+					return
+				}
+				cclog.Infof("Retention move: processing %d jobs", len(jobs))
+
+				ar := archive.GetHandle()
+				switch cfg.Format {
+				case "parquet":
+					target, err := createParquetTarget(cfg)
+					if err != nil {
+						cclog.Errorf("Retention move: create parquet target: %v", err)
+						return
+					}
+					if err := transferJobsParquet(jobs, ar, target, maxFileSizeMB); err != nil {
+						cclog.Errorf("Retention move: parquet transfer: %v", err)
+						return
+					}
+				default: // json
+					dst, err := createTargetBackend(cfg)
+					if err != nil {
+						cclog.Errorf("Retention move: create target backend: %v", err)
+						return
+					}
+					if err := transferJobsJSON(jobs, ar, dst); err != nil {
+						cclog.Errorf("Retention move: json transfer: %v", err)
+						return
+					}
+				}
+				cleanupAfterTransfer(jobs, startTime, cfg.IncludeDB, cfg.OmitTagged)
+			}))
+}
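
Worth noting: the copy service is incremental. Because FindJobsBetween is called with (lastProcessed, cutoff) and the marker is then advanced to the cutoff, each run transfers only jobs that aged past the threshold since the previous run. A sketch with assumed timestamps:

// Run 1 (no marker yet): readCopyMarker returns 0
//   FindJobsBetween(0, t1-age)        -> copies every job older than the cutoff
//   writeCopyMarker(cfg, t1-age)
// Run 2 (next day): marker = t1-age
//   FindJobsBetween(t1-age, t2-age)   -> copies only jobs that crossed the
//   writeCopyMarker(cfg, t2-age)         cutoff between the two runs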

View File

@@ -23,20 +23,20 @@ const (
 // Retention defines the configuration for job retention policies.
 type Retention struct {
 	Policy             string `json:"policy"`
-	Location           string `json:"location"`
+	Format             string `json:"format"`
 	Age                int    `json:"age"`
 	IncludeDB          bool   `json:"includeDB"`
 	OmitTagged         bool   `json:"omitTagged"`
 	TargetKind         string `json:"target-kind"`
 	TargetPath         string `json:"target-path"`
 	TargetEndpoint     string `json:"target-endpoint"`
 	TargetBucket       string `json:"target-bucket"`
 	TargetAccessKey    string `json:"target-access-key"`
 	TargetSecretKey    string `json:"target-secret-key"`
 	TargetRegion       string `json:"target-region"`
 	TargetUsePathStyle bool   `json:"target-use-path-style"`
 	MaxFileSizeMB      int    `json:"max-file-size-mb"`
 }

 // CronFrequency defines the execution intervals for various background workers.
@@ -86,18 +86,11 @@ func initArchiveServices(config json.RawMessage) {
 	switch cfg.Retention.Policy {
 	case "delete":
-		RegisterRetentionDeleteService(
-			cfg.Retention.Age,
-			cfg.Retention.IncludeDB,
-			cfg.Retention.OmitTagged)
+		RegisterRetentionDeleteService(cfg.Retention)
+	case "copy":
+		RegisterRetentionCopyService(cfg.Retention)
 	case "move":
-		RegisterRetentionMoveService(
-			cfg.Retention.Age,
-			cfg.Retention.IncludeDB,
-			cfg.Retention.Location,
-			cfg.Retention.OmitTagged)
-	case "parquet":
-		RegisterRetentionParquetService(cfg.Retention)
+		RegisterRetentionMoveService(cfg.Retention)
 	}

 	if cfg.Compression > 0 {
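
For reference, a hypothetical retention block exercising the new copy policy (key names follow the Retention struct above and the schema below; the exact nesting inside the archive configuration is an assumption):

"retention": {
  "policy": "copy",
  "format": "parquet",
  "age": 180,
  "omitTagged": true,
  "target-kind": "s3",
  "target-endpoint": "https://s3.example.com",
  "target-bucket": "cc-archive",
  "target-access-key": "…",
  "target-secret-key": "…",
  "target-region": "us-east-1",
  "target-use-path-style": true,
  "max-file-size-mb": 512
}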

View File

@@ -57,7 +57,12 @@ var configSchema = `
"policy": { "policy": {
"description": "Retention policy", "description": "Retention policy",
"type": "string", "type": "string",
"enum": ["none", "delete", "move", "parquet"] "enum": ["none", "delete", "copy", "move"]
},
"format": {
"description": "Output format for copy/move policies",
"type": "string",
"enum": ["json", "parquet"]
}, },
"include-db": { "include-db": {
"description": "Also remove jobs from database", "description": "Also remove jobs from database",
@@ -67,41 +72,37 @@ var configSchema = `
"description": "Act on jobs with startTime older than age (in days)", "description": "Act on jobs with startTime older than age (in days)",
"type": "integer" "type": "integer"
}, },
"location": {
"description": "The target directory for retention. Only applicable for retention move.",
"type": "string"
},
"target-kind": { "target-kind": {
"description": "Target storage kind for parquet retention: file or s3", "description": "Target storage kind: file or s3",
"type": "string", "type": "string",
"enum": ["file", "s3"] "enum": ["file", "s3"]
}, },
"target-path": { "target-path": {
"description": "Target directory path for parquet file storage", "description": "Target directory path for file storage",
"type": "string" "type": "string"
}, },
"target-endpoint": { "target-endpoint": {
"description": "S3 endpoint URL for parquet target", "description": "S3 endpoint URL for target",
"type": "string" "type": "string"
}, },
"target-bucket": { "target-bucket": {
"description": "S3 bucket name for parquet target", "description": "S3 bucket name for target",
"type": "string" "type": "string"
}, },
"target-access-key": { "target-access-key": {
"description": "S3 access key for parquet target", "description": "S3 access key for target",
"type": "string" "type": "string"
}, },
"target-secret-key": { "target-secret-key": {
"description": "S3 secret key for parquet target", "description": "S3 secret key for target",
"type": "string" "type": "string"
}, },
"target-region": { "target-region": {
"description": "S3 region for parquet target", "description": "S3 region for target",
"type": "string" "type": "string"
}, },
"target-use-path-style": { "target-use-path-style": {
"description": "Use path-style S3 URLs for parquet target", "description": "Use path-style S3 URLs for target",
"type": "boolean" "type": "boolean"
}, },
"max-file-size-mb": { "max-file-size-mb": {

View File

@@ -93,6 +93,91 @@ func JobToParquetRow(meta *schema.Job, data *schema.JobData) (*ParquetJobRow, er
 	}, nil
 }

+// ParquetRowToJob converts a ParquetJobRow back into job metadata and metric data.
+// This is the reverse of JobToParquetRow.
+func ParquetRowToJob(row *ParquetJobRow) (*schema.Job, *schema.JobData, error) {
+	meta := &schema.Job{
+		JobID:        row.JobID,
+		Cluster:      row.Cluster,
+		SubCluster:   row.SubCluster,
+		Partition:    row.Partition,
+		Project:      row.Project,
+		User:         row.User,
+		State:        schema.JobState(row.State),
+		StartTime:    row.StartTime,
+		Duration:     row.Duration,
+		Walltime:     row.Walltime,
+		NumNodes:     row.NumNodes,
+		NumHWThreads: row.NumHWThreads,
+		NumAcc:       row.NumAcc,
+		Energy:       row.Energy,
+		SMT:          row.SMT,
+	}
+
+	if len(row.ResourcesJSON) > 0 {
+		if err := json.Unmarshal(row.ResourcesJSON, &meta.Resources); err != nil {
+			return nil, nil, fmt.Errorf("unmarshal resources: %w", err)
+		}
+	}
+	if len(row.StatisticsJSON) > 0 {
+		if err := json.Unmarshal(row.StatisticsJSON, &meta.Statistics); err != nil {
+			return nil, nil, fmt.Errorf("unmarshal statistics: %w", err)
+		}
+	}
+	if len(row.TagsJSON) > 0 {
+		if err := json.Unmarshal(row.TagsJSON, &meta.Tags); err != nil {
+			return nil, nil, fmt.Errorf("unmarshal tags: %w", err)
+		}
+	}
+	if len(row.MetaDataJSON) > 0 {
+		if err := json.Unmarshal(row.MetaDataJSON, &meta.MetaData); err != nil {
+			return nil, nil, fmt.Errorf("unmarshal metadata: %w", err)
+		}
+	}
+	if len(row.FootprintJSON) > 0 {
+		if err := json.Unmarshal(row.FootprintJSON, &meta.Footprint); err != nil {
+			return nil, nil, fmt.Errorf("unmarshal footprint: %w", err)
+		}
+	}
+	if len(row.EnergyFootJSON) > 0 {
+		if err := json.Unmarshal(row.EnergyFootJSON, &meta.EnergyFootprint); err != nil {
+			return nil, nil, fmt.Errorf("unmarshal energy footprint: %w", err)
+		}
+	}
+
+	data, err := decompressJobData(row.MetricDataGz)
+	if err != nil {
+		return nil, nil, fmt.Errorf("decompress metric data: %w", err)
+	}
+	return meta, data, nil
+}
+
+func decompressJobData(data []byte) (*schema.JobData, error) {
+	gz, err := gzip.NewReader(bytes.NewReader(data))
+	if err != nil {
+		return nil, err
+	}
+	defer gz.Close()
+
+	var buf bytes.Buffer
+	if _, err := buf.ReadFrom(gz); err != nil {
+		return nil, err
+	}
+	var jobData schema.JobData
+	if err := json.Unmarshal(buf.Bytes(), &jobData); err != nil {
+		return nil, err
+	}
+	return &jobData, nil
+}
+
 func compressJobData(data *schema.JobData) ([]byte, error) {
 	jsonBytes, err := json.Marshal(data)
 	if err != nil {

View File

@@ -0,0 +1,305 @@
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved. This file is part of cc-backend.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
package parquet
import (
"testing"
"github.com/ClusterCockpit/cc-lib/v2/schema"
)
func TestParquetRowToJob(t *testing.T) {
meta := &schema.Job{
JobID: 42,
Cluster: "testcluster",
SubCluster: "sc0",
Partition: "main",
Project: "testproject",
User: "testuser",
State: schema.JobStateCompleted,
StartTime: 1700000000,
Duration: 3600,
Walltime: 7200,
NumNodes: 2,
NumHWThreads: 16,
NumAcc: 4,
Energy: 123.45,
SMT: 2,
Resources: []*schema.Resource{
{Hostname: "node001", HWThreads: []int{0, 1, 2, 3}},
{Hostname: "node002", HWThreads: []int{4, 5, 6, 7}},
},
Statistics: map[string]schema.JobStatistics{
"cpu_load": {Avg: 50.0, Min: 10.0, Max: 90.0},
},
Tags: []*schema.Tag{
{Type: "test", Name: "tag1"},
},
MetaData: map[string]string{
"key1": "value1",
},
Footprint: map[string]float64{
"cpu_load": 50.0,
},
EnergyFootprint: map[string]float64{
"total": 123.45,
},
}
data := &schema.JobData{
"cpu_load": {
schema.MetricScopeNode: &schema.JobMetric{
Unit: schema.Unit{Base: ""},
Timestep: 60,
Series: []schema.Series{
{
Hostname: "node001",
Data: []schema.Float{1.0, 2.0, 3.0},
},
},
},
},
}
// Convert to parquet row
row, err := JobToParquetRow(meta, data)
if err != nil {
t.Fatalf("JobToParquetRow: %v", err)
}
// Convert back
gotMeta, gotData, err := ParquetRowToJob(row)
if err != nil {
t.Fatalf("ParquetRowToJob: %v", err)
}
// Verify scalar fields
if gotMeta.JobID != meta.JobID {
t.Errorf("JobID = %d, want %d", gotMeta.JobID, meta.JobID)
}
if gotMeta.Cluster != meta.Cluster {
t.Errorf("Cluster = %q, want %q", gotMeta.Cluster, meta.Cluster)
}
if gotMeta.SubCluster != meta.SubCluster {
t.Errorf("SubCluster = %q, want %q", gotMeta.SubCluster, meta.SubCluster)
}
if gotMeta.Partition != meta.Partition {
t.Errorf("Partition = %q, want %q", gotMeta.Partition, meta.Partition)
}
if gotMeta.Project != meta.Project {
t.Errorf("Project = %q, want %q", gotMeta.Project, meta.Project)
}
if gotMeta.User != meta.User {
t.Errorf("User = %q, want %q", gotMeta.User, meta.User)
}
if gotMeta.State != meta.State {
t.Errorf("State = %q, want %q", gotMeta.State, meta.State)
}
if gotMeta.StartTime != meta.StartTime {
t.Errorf("StartTime = %d, want %d", gotMeta.StartTime, meta.StartTime)
}
if gotMeta.Duration != meta.Duration {
t.Errorf("Duration = %d, want %d", gotMeta.Duration, meta.Duration)
}
if gotMeta.Walltime != meta.Walltime {
t.Errorf("Walltime = %d, want %d", gotMeta.Walltime, meta.Walltime)
}
if gotMeta.NumNodes != meta.NumNodes {
t.Errorf("NumNodes = %d, want %d", gotMeta.NumNodes, meta.NumNodes)
}
if gotMeta.NumHWThreads != meta.NumHWThreads {
t.Errorf("NumHWThreads = %d, want %d", gotMeta.NumHWThreads, meta.NumHWThreads)
}
if gotMeta.NumAcc != meta.NumAcc {
t.Errorf("NumAcc = %d, want %d", gotMeta.NumAcc, meta.NumAcc)
}
if gotMeta.Energy != meta.Energy {
t.Errorf("Energy = %f, want %f", gotMeta.Energy, meta.Energy)
}
if gotMeta.SMT != meta.SMT {
t.Errorf("SMT = %d, want %d", gotMeta.SMT, meta.SMT)
}
// Verify complex fields
if len(gotMeta.Resources) != 2 {
t.Fatalf("Resources len = %d, want 2", len(gotMeta.Resources))
}
if gotMeta.Resources[0].Hostname != "node001" {
t.Errorf("Resources[0].Hostname = %q, want %q", gotMeta.Resources[0].Hostname, "node001")
}
if len(gotMeta.Resources[0].HWThreads) != 4 {
t.Errorf("Resources[0].HWThreads len = %d, want 4", len(gotMeta.Resources[0].HWThreads))
}
if len(gotMeta.Statistics) != 1 {
t.Fatalf("Statistics len = %d, want 1", len(gotMeta.Statistics))
}
if stat, ok := gotMeta.Statistics["cpu_load"]; !ok {
t.Error("Statistics missing cpu_load")
} else if stat.Avg != 50.0 {
t.Errorf("Statistics[cpu_load].Avg = %f, want 50.0", stat.Avg)
}
if len(gotMeta.Tags) != 1 || gotMeta.Tags[0].Name != "tag1" {
t.Errorf("Tags = %v, want [{test tag1}]", gotMeta.Tags)
}
if gotMeta.MetaData["key1"] != "value1" {
t.Errorf("MetaData[key1] = %q, want %q", gotMeta.MetaData["key1"], "value1")
}
if gotMeta.Footprint["cpu_load"] != 50.0 {
t.Errorf("Footprint[cpu_load] = %f, want 50.0", gotMeta.Footprint["cpu_load"])
}
if gotMeta.EnergyFootprint["total"] != 123.45 {
t.Errorf("EnergyFootprint[total] = %f, want 123.45", gotMeta.EnergyFootprint["total"])
}
// Verify metric data
if gotData == nil {
t.Fatal("JobData is nil")
}
cpuLoad, ok := (*gotData)["cpu_load"]
if !ok {
t.Fatal("JobData missing cpu_load")
}
nodeMetric, ok := cpuLoad[schema.MetricScopeNode]
if !ok {
t.Fatal("cpu_load missing node scope")
}
if nodeMetric.Timestep != 60 {
t.Errorf("Timestep = %d, want 60", nodeMetric.Timestep)
}
if len(nodeMetric.Series) != 1 {
t.Fatalf("Series len = %d, want 1", len(nodeMetric.Series))
}
if nodeMetric.Series[0].Hostname != "node001" {
t.Errorf("Series[0].Hostname = %q, want %q", nodeMetric.Series[0].Hostname, "node001")
}
if len(nodeMetric.Series[0].Data) != 3 {
t.Errorf("Series[0].Data len = %d, want 3", len(nodeMetric.Series[0].Data))
}
}
func TestParquetRowToJobNilOptionalFields(t *testing.T) {
meta := &schema.Job{
JobID: 1,
Cluster: "test",
SubCluster: "sc0",
Project: "proj",
User: "user",
State: schema.JobStateCompleted,
StartTime: 1700000000,
Duration: 60,
NumNodes: 1,
Resources: []*schema.Resource{
{Hostname: "node001"},
},
}
data := &schema.JobData{
"cpu_load": {
schema.MetricScopeNode: &schema.JobMetric{
Timestep: 60,
Series: []schema.Series{
{Hostname: "node001", Data: []schema.Float{1.0}},
},
},
},
}
row, err := JobToParquetRow(meta, data)
if err != nil {
t.Fatalf("JobToParquetRow: %v", err)
}
gotMeta, gotData, err := ParquetRowToJob(row)
if err != nil {
t.Fatalf("ParquetRowToJob: %v", err)
}
if gotMeta.JobID != 1 {
t.Errorf("JobID = %d, want 1", gotMeta.JobID)
}
if gotMeta.Tags != nil {
t.Errorf("Tags should be nil, got %v", gotMeta.Tags)
}
if gotMeta.Statistics != nil {
t.Errorf("Statistics should be nil, got %v", gotMeta.Statistics)
}
if gotMeta.MetaData != nil {
t.Errorf("MetaData should be nil, got %v", gotMeta.MetaData)
}
if gotMeta.Footprint != nil {
t.Errorf("Footprint should be nil, got %v", gotMeta.Footprint)
}
if gotMeta.EnergyFootprint != nil {
t.Errorf("EnergyFootprint should be nil, got %v", gotMeta.EnergyFootprint)
}
if gotData == nil {
t.Fatal("JobData is nil")
}
}
func TestRoundTripThroughParquetFile(t *testing.T) {
meta, data := makeTestJob(999)
meta.Tags = []*schema.Tag{{Type: "test", Name: "roundtrip"}}
// Convert to row and write to parquet
row, err := JobToParquetRow(meta, data)
if err != nil {
t.Fatalf("JobToParquetRow: %v", err)
}
// Write to parquet bytes
parquetBytes, err := writeParquetBytes([]ParquetJobRow{*row})
if err != nil {
t.Fatalf("writeParquetBytes: %v", err)
}
// Read back from parquet bytes
rows, err := ReadParquetFile(parquetBytes)
if err != nil {
t.Fatalf("ReadParquetFile: %v", err)
}
if len(rows) != 1 {
t.Fatalf("expected 1 row, got %d", len(rows))
}
// Convert back to job
gotMeta, gotData, err := ParquetRowToJob(&rows[0])
if err != nil {
t.Fatalf("ParquetRowToJob: %v", err)
}
// Verify key fields survived the round trip
if gotMeta.JobID != 999 {
t.Errorf("JobID = %d, want 999", gotMeta.JobID)
}
if gotMeta.Cluster != "testcluster" {
t.Errorf("Cluster = %q, want %q", gotMeta.Cluster, "testcluster")
}
if gotMeta.User != "testuser" {
t.Errorf("User = %q, want %q", gotMeta.User, "testuser")
}
if gotMeta.State != schema.JobStateCompleted {
t.Errorf("State = %q, want %q", gotMeta.State, schema.JobStateCompleted)
}
if len(gotMeta.Tags) != 1 || gotMeta.Tags[0].Name != "roundtrip" {
t.Errorf("Tags = %v, want [{test roundtrip}]", gotMeta.Tags)
}
if len(gotMeta.Resources) != 2 {
t.Errorf("Resources len = %d, want 2", len(gotMeta.Resources))
}
if gotData == nil {
t.Fatal("JobData is nil")
}
if _, ok := (*gotData)["cpu_load"]; !ok {
t.Error("JobData missing cpu_load")
}
}

View File

@@ -0,0 +1,216 @@
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved. This file is part of cc-backend.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
package parquet
import (
"bytes"
"context"
"encoding/json"
"fmt"
"io"
"os"
"path/filepath"
"strings"
"github.com/ClusterCockpit/cc-lib/v2/schema"
"github.com/aws/aws-sdk-go-v2/aws"
awsconfig "github.com/aws/aws-sdk-go-v2/config"
"github.com/aws/aws-sdk-go-v2/credentials"
"github.com/aws/aws-sdk-go-v2/service/s3"
pq "github.com/parquet-go/parquet-go"
)
// ReadParquetFile reads all ParquetJobRow entries from parquet-encoded bytes.
func ReadParquetFile(data []byte) ([]ParquetJobRow, error) {
file, err := pq.OpenFile(bytes.NewReader(data), int64(len(data)))
if err != nil {
return nil, fmt.Errorf("open parquet: %w", err)
}
reader := pq.NewGenericReader[ParquetJobRow](file)
defer reader.Close()
numRows := file.NumRows()
rows := make([]ParquetJobRow, numRows)
n, err := reader.Read(rows)
if err != nil && err != io.EOF {
return nil, fmt.Errorf("read parquet rows: %w", err)
}
return rows[:n], nil
}
// ParquetSource abstracts reading parquet archives from different storage backends.
type ParquetSource interface {
GetClusters() ([]string, error)
ListParquetFiles(cluster string) ([]string, error)
ReadFile(path string) ([]byte, error)
ReadClusterConfig(cluster string) (*schema.Cluster, error)
}
// FileParquetSource reads parquet archives from a local filesystem directory.
type FileParquetSource struct {
path string
}
func NewFileParquetSource(path string) *FileParquetSource {
return &FileParquetSource{path: path}
}
func (fs *FileParquetSource) GetClusters() ([]string, error) {
entries, err := os.ReadDir(fs.path)
if err != nil {
return nil, fmt.Errorf("read directory: %w", err)
}
var clusters []string
for _, e := range entries {
if e.IsDir() {
clusters = append(clusters, e.Name())
}
}
return clusters, nil
}
func (fs *FileParquetSource) ListParquetFiles(cluster string) ([]string, error) {
dir := filepath.Join(fs.path, cluster)
entries, err := os.ReadDir(dir)
if err != nil {
return nil, fmt.Errorf("read cluster directory: %w", err)
}
var files []string
for _, e := range entries {
if !e.IsDir() && strings.HasSuffix(e.Name(), ".parquet") {
files = append(files, filepath.Join(cluster, e.Name()))
}
}
return files, nil
}
func (fs *FileParquetSource) ReadFile(path string) ([]byte, error) {
return os.ReadFile(filepath.Join(fs.path, path))
}
func (fs *FileParquetSource) ReadClusterConfig(cluster string) (*schema.Cluster, error) {
data, err := os.ReadFile(filepath.Join(fs.path, cluster, "cluster.json"))
if err != nil {
return nil, fmt.Errorf("read cluster.json: %w", err)
}
var cfg schema.Cluster
if err := json.Unmarshal(data, &cfg); err != nil {
return nil, fmt.Errorf("unmarshal cluster config: %w", err)
}
return &cfg, nil
}
// S3ParquetSource reads parquet archives from an S3-compatible object store.
type S3ParquetSource struct {
client *s3.Client
bucket string
}
func NewS3ParquetSource(cfg S3TargetConfig) (*S3ParquetSource, error) {
if cfg.Bucket == "" {
return nil, fmt.Errorf("S3 source: empty bucket name")
}
region := cfg.Region
if region == "" {
region = "us-east-1"
}
awsCfg, err := awsconfig.LoadDefaultConfig(context.Background(),
awsconfig.WithRegion(region),
awsconfig.WithCredentialsProvider(
credentials.NewStaticCredentialsProvider(cfg.AccessKey, cfg.SecretKey, ""),
),
)
if err != nil {
return nil, fmt.Errorf("S3 source: load AWS config: %w", err)
}
opts := func(o *s3.Options) {
if cfg.Endpoint != "" {
o.BaseEndpoint = aws.String(cfg.Endpoint)
}
o.UsePathStyle = cfg.UsePathStyle
}
client := s3.NewFromConfig(awsCfg, opts)
return &S3ParquetSource{client: client, bucket: cfg.Bucket}, nil
}
func (ss *S3ParquetSource) GetClusters() ([]string, error) {
ctx := context.Background()
paginator := s3.NewListObjectsV2Paginator(ss.client, &s3.ListObjectsV2Input{
Bucket: aws.String(ss.bucket),
Delimiter: aws.String("/"),
})
var clusters []string
for paginator.HasMorePages() {
page, err := paginator.NextPage(ctx)
if err != nil {
return nil, fmt.Errorf("S3 source: list clusters: %w", err)
}
for _, prefix := range page.CommonPrefixes {
if prefix.Prefix != nil {
name := strings.TrimSuffix(*prefix.Prefix, "/")
clusters = append(clusters, name)
}
}
}
return clusters, nil
}
func (ss *S3ParquetSource) ListParquetFiles(cluster string) ([]string, error) {
ctx := context.Background()
prefix := cluster + "/"
paginator := s3.NewListObjectsV2Paginator(ss.client, &s3.ListObjectsV2Input{
Bucket: aws.String(ss.bucket),
Prefix: aws.String(prefix),
})
var files []string
for paginator.HasMorePages() {
page, err := paginator.NextPage(ctx)
if err != nil {
return nil, fmt.Errorf("S3 source: list parquet files: %w", err)
}
for _, obj := range page.Contents {
if obj.Key != nil && strings.HasSuffix(*obj.Key, ".parquet") {
files = append(files, *obj.Key)
}
}
}
return files, nil
}
func (ss *S3ParquetSource) ReadFile(path string) ([]byte, error) {
ctx := context.Background()
result, err := ss.client.GetObject(ctx, &s3.GetObjectInput{
Bucket: aws.String(ss.bucket),
Key: aws.String(path),
})
if err != nil {
return nil, fmt.Errorf("S3 source: get object %q: %w", path, err)
}
defer result.Body.Close()
return io.ReadAll(result.Body)
}
func (ss *S3ParquetSource) ReadClusterConfig(cluster string) (*schema.Cluster, error) {
data, err := ss.ReadFile(cluster + "/cluster.json")
if err != nil {
return nil, fmt.Errorf("read cluster.json: %w", err)
}
var cfg schema.Cluster
if err := json.Unmarshal(data, &cfg); err != nil {
return nil, fmt.Errorf("unmarshal cluster config: %w", err)
}
return &cfg, nil
}

View File

@@ -36,7 +36,11 @@ func NewFileTarget(path string) (*FileTarget, error) {
 }

 func (ft *FileTarget) WriteFile(name string, data []byte) error {
-	return os.WriteFile(filepath.Join(ft.path, name), data, 0o640)
+	fullPath := filepath.Join(ft.path, name)
+	if err := os.MkdirAll(filepath.Dir(fullPath), 0o750); err != nil {
+		return fmt.Errorf("create parent directory: %w", err)
+	}
+	return os.WriteFile(fullPath, data, 0o640)
 }

 // S3TargetConfig holds the configuration for an S3 parquet target.

View File

@@ -7,10 +7,13 @@ package parquet
 import (
 	"bytes"
+	"encoding/json"
 	"fmt"
+	"path"
 	"time"

 	cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
+	"github.com/ClusterCockpit/cc-lib/v2/schema"
 	pq "github.com/parquet-go/parquet-go"
 )
@@ -111,3 +114,68 @@ func estimateRowSize(row *ParquetJobRow) int64 {
 	size += int64(len(row.MetricDataGz))
 	return size
 }
// prefixedTarget wraps a ParquetTarget and prepends a path prefix to all file names.
type prefixedTarget struct {
inner ParquetTarget
prefix string
}
func (pt *prefixedTarget) WriteFile(name string, data []byte) error {
return pt.inner.WriteFile(path.Join(pt.prefix, name), data)
}
// ClusterAwareParquetWriter organizes Parquet output by cluster.
// Each cluster gets its own subdirectory with a cluster.json config file.
type ClusterAwareParquetWriter struct {
target ParquetTarget
maxSizeMB int
writers map[string]*ParquetWriter
clusterCfgs map[string]*schema.Cluster
}
// NewClusterAwareParquetWriter creates a writer that routes jobs to per-cluster ParquetWriters.
func NewClusterAwareParquetWriter(target ParquetTarget, maxSizeMB int) *ClusterAwareParquetWriter {
return &ClusterAwareParquetWriter{
target: target,
maxSizeMB: maxSizeMB,
writers: make(map[string]*ParquetWriter),
clusterCfgs: make(map[string]*schema.Cluster),
}
}
// SetClusterConfig stores a cluster configuration to be written as cluster.json on Close.
func (cw *ClusterAwareParquetWriter) SetClusterConfig(name string, cfg *schema.Cluster) {
cw.clusterCfgs[name] = cfg
}
// AddJob routes the job row to the appropriate per-cluster writer.
func (cw *ClusterAwareParquetWriter) AddJob(row ParquetJobRow) error {
cluster := row.Cluster
pw, ok := cw.writers[cluster]
if !ok {
pw = NewParquetWriter(&prefixedTarget{inner: cw.target, prefix: cluster}, cw.maxSizeMB)
cw.writers[cluster] = pw
}
return pw.AddJob(row)
}
// Close writes cluster.json files and flushes all per-cluster writers.
func (cw *ClusterAwareParquetWriter) Close() error {
for name, cfg := range cw.clusterCfgs {
data, err := json.MarshalIndent(cfg, "", " ")
if err != nil {
return fmt.Errorf("marshal cluster config %q: %w", name, err)
}
if err := cw.target.WriteFile(path.Join(name, "cluster.json"), data); err != nil {
return fmt.Errorf("write cluster.json for %q: %w", name, err)
}
}
for cluster, pw := range cw.writers {
if err := pw.Close(); err != nil {
return fmt.Errorf("close writer for cluster %q: %w", cluster, err)
}
}
return nil
}
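// A minimal usage sketch (hypothetical driver code, not part of this commit):
// convert jobs from a source archive and route them through the writer,
// then flush everything on Close.
//
//	cw := NewClusterAwareParquetWriter(target, 512)
//	cw.SetClusterConfig("fritz", fritzCfg)       // cluster.json written on Close
//	if row, err := JobToParquetRow(meta, data); err == nil {
//		_ = cw.AddJob(*row)                       // routed by row.Cluster
//	}
//	_ = cw.Close()                                // writes cluster.json files and flushes all writers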

View File

@@ -10,6 +10,9 @@ import (
"compress/gzip" "compress/gzip"
"encoding/json" "encoding/json"
"io" "io"
"os"
"path/filepath"
"strings"
"sync" "sync"
"testing" "testing"
@@ -222,3 +225,137 @@ func TestFileTarget(t *testing.T) {
// Verify file exists and has correct content // Verify file exists and has correct content
// (using the target itself is sufficient; we just check no error) // (using the target itself is sufficient; we just check no error)
} }
func TestFileTargetSubdirectories(t *testing.T) {
dir := t.TempDir()
ft, err := NewFileTarget(dir)
if err != nil {
t.Fatalf("NewFileTarget: %v", err)
}
testData := []byte("test data in subdir")
if err := ft.WriteFile("fritz/cc-archive-2025-01-20-001.parquet", testData); err != nil {
t.Fatalf("WriteFile with subdir: %v", err)
}
// Verify file was created in subdirectory
content, err := os.ReadFile(filepath.Join(dir, "fritz", "cc-archive-2025-01-20-001.parquet"))
if err != nil {
t.Fatalf("read file in subdir: %v", err)
}
if !bytes.Equal(content, testData) {
t.Error("file content mismatch")
}
}
func makeTestJobForCluster(jobID int64, cluster string) (*schema.Job, *schema.JobData) {
meta, data := makeTestJob(jobID)
meta.Cluster = cluster
return meta, data
}
func TestClusterAwareParquetWriter(t *testing.T) {
target := newMemTarget()
cw := NewClusterAwareParquetWriter(target, 512)
// Set cluster configs
cw.SetClusterConfig("fritz", &schema.Cluster{Name: "fritz"})
cw.SetClusterConfig("alex", &schema.Cluster{Name: "alex"})
// Add jobs from different clusters
for i := int64(0); i < 3; i++ {
meta, data := makeTestJobForCluster(i, "fritz")
row, err := JobToParquetRow(meta, data)
if err != nil {
t.Fatalf("convert fritz job %d: %v", i, err)
}
if err := cw.AddJob(*row); err != nil {
t.Fatalf("add fritz job %d: %v", i, err)
}
}
for i := int64(10); i < 12; i++ {
meta, data := makeTestJobForCluster(i, "alex")
row, err := JobToParquetRow(meta, data)
if err != nil {
t.Fatalf("convert alex job %d: %v", i, err)
}
if err := cw.AddJob(*row); err != nil {
t.Fatalf("add alex job %d: %v", i, err)
}
}
if err := cw.Close(); err != nil {
t.Fatalf("close: %v", err)
}
target.mu.Lock()
defer target.mu.Unlock()
// Check cluster.json files were written
if _, ok := target.files["fritz/cluster.json"]; !ok {
t.Error("missing fritz/cluster.json")
}
if _, ok := target.files["alex/cluster.json"]; !ok {
t.Error("missing alex/cluster.json")
}
// Verify cluster.json content
var clusterCfg schema.Cluster
if err := json.Unmarshal(target.files["fritz/cluster.json"], &clusterCfg); err != nil {
t.Fatalf("unmarshal fritz cluster.json: %v", err)
}
if clusterCfg.Name != "fritz" {
t.Errorf("fritz cluster name = %q, want %q", clusterCfg.Name, "fritz")
}
// Check parquet files are in cluster subdirectories
fritzParquets := 0
alexParquets := 0
for name := range target.files {
if strings.HasPrefix(name, "fritz/") && strings.HasSuffix(name, ".parquet") {
fritzParquets++
}
if strings.HasPrefix(name, "alex/") && strings.HasSuffix(name, ".parquet") {
alexParquets++
}
}
if fritzParquets == 0 {
t.Error("no parquet files in fritz/")
}
if alexParquets == 0 {
t.Error("no parquet files in alex/")
}
// Verify parquet files are readable and have correct row counts
for name, data := range target.files {
if !strings.HasSuffix(name, ".parquet") {
continue
}
file := bytes.NewReader(data)
pf, err := pq.OpenFile(file, int64(len(data)))
if err != nil {
t.Errorf("open parquet %s: %v", name, err)
continue
}
if strings.HasPrefix(name, "fritz/") && pf.NumRows() != 3 {
t.Errorf("fritz parquet rows = %d, want 3", pf.NumRows())
}
if strings.HasPrefix(name, "alex/") && pf.NumRows() != 2 {
t.Errorf("alex parquet rows = %d, want 2", pf.NumRows())
}
}
}
func TestClusterAwareParquetWriterEmpty(t *testing.T) {
target := newMemTarget()
cw := NewClusterAwareParquetWriter(target, 512)
if err := cw.Close(); err != nil {
t.Fatalf("close empty writer: %v", err)
}
if len(target.files) != 0 {
t.Errorf("expected no files for empty writer, got %d", len(target.files))
}
}

View File

@@ -0,0 +1,148 @@
# Archive Manager
## Overview
The `archive-manager` tool manages ClusterCockpit job archives. It supports inspecting archives, validating jobs, removing jobs by date range, importing jobs between archive backends, and converting archives between JSON and Parquet formats.
## Features
- **Archive Info**: Display statistics about an existing job archive
- **Validation**: Validate job archives against the JSON schema
- **Cleanup**: Remove jobs by date range
- **Import**: Copy jobs between archive backends (file, S3, SQLite) with parallel processing
- **Convert**: Convert archives between JSON and Parquet formats (both directions)
- **Progress Reporting**: Real-time progress display with ETA and throughput metrics
- **Graceful Interruption**: CTRL-C stops processing after finishing current jobs
## Usage
### Build
```bash
go build ./tools/archive-manager/
```
### Archive Info
Display statistics about a job archive:
```bash
./archive-manager -s ./var/job-archive
```
### Validate Archive
```bash
./archive-manager -s ./var/job-archive --validate --config ./config.json
```
### Remove Jobs by Date
```bash
# Remove jobs started before a date
./archive-manager -s ./var/job-archive --remove-before 2023-Jan-01 --config ./config.json
# Remove jobs started after a date
./archive-manager -s ./var/job-archive --remove-after 2024-Dec-31 --config ./config.json
```
### Import Between Backends
Import jobs from one archive backend to another (e.g., file to S3, file to SQLite):
```bash
./archive-manager --import \
--src-config '{"kind":"file","path":"./var/job-archive"}' \
--dst-config '{"kind":"s3","endpoint":"https://s3.example.com","bucket":"archive","access-key":"...","secret-key":"..."}'
```
### Convert JSON to Parquet
Convert a JSON job archive to Parquet format:
```bash
./archive-manager --convert --format parquet \
--src-config '{"kind":"file","path":"./var/job-archive"}' \
--dst-config '{"kind":"file","path":"./var/parquet-archive"}'
```
The source (`--src-config`) is a standard archive backend config (file, S3, or SQLite). The destination (`--dst-config`) specifies where to write parquet files.
### Convert Parquet to JSON
Convert a Parquet archive back to JSON format:
```bash
./archive-manager --convert --format json \
--src-config '{"kind":"file","path":"./var/parquet-archive"}' \
--dst-config '{"kind":"file","path":"./var/json-archive"}'
```
The source (`--src-config`) points to a directory or S3 bucket containing parquet files organized by cluster. The destination (`--dst-config`) is a standard archive backend config.
### S3 Source/Destination Example
Both conversion directions support S3:
```bash
# JSON (S3) -> Parquet (local)
./archive-manager --convert --format parquet \
--src-config '{"kind":"s3","endpoint":"https://s3.example.com","bucket":"json-archive","accessKey":"...","secretKey":"..."}' \
--dst-config '{"kind":"file","path":"./var/parquet-archive"}'
# Parquet (local) -> JSON (S3)
./archive-manager --convert --format json \
--src-config '{"kind":"file","path":"./var/parquet-archive"}' \
--dst-config '{"kind":"s3","endpoint":"https://s3.example.com","bucket":"json-archive","access-key":"...","secret-key":"..."}'
```
## Command-Line Options
| Flag | Default | Description |
|------|---------|-------------|
| `-s` | `./var/job-archive` | Source job archive path (for info/validate/remove modes) |
| `--config` | `./config.json` | Path to config.json |
| `--loglevel` | `info` | Logging level: debug, info, warn, err, fatal, crit |
| `--logdate` | `false` | Add timestamps to log messages |
| `--validate` | `false` | Validate archive against JSON schema |
| `--remove-before` | | Remove jobs started before date (Format: 2006-Jan-02) |
| `--remove-after` | | Remove jobs started after date (Format: 2006-Jan-02) |
| `--import` | `false` | Import jobs between archive backends |
| `--convert` | `false` | Convert archive between JSON and Parquet formats |
| `--format` | `json` | Output format for conversion: `json` or `parquet` |
| `--max-file-size` | `512` | Max parquet file size in MB (only for parquet output) |
| `--src-config` | | Source config JSON (required for import/convert) |
| `--dst-config` | | Destination config JSON (required for import/convert) |
## Parquet Archive Layout
When converting to Parquet, the output is organized by cluster:
```
parquet-archive/
clusterA/
cluster.json
cc-archive-2025-01-20-001.parquet
cc-archive-2025-01-20-002.parquet
clusterB/
cluster.json
cc-archive-2025-01-20-001.parquet
```
Each parquet file contains job metadata and gzip-compressed metric data. The `cluster.json` file preserves the cluster configuration from the source archive.
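For programmatic access, the `pkg/archive/parquet` package introduced here can read these files directly. Below is a minimal sketch; the file path is hypothetical, and error handling is trimmed for brevity:
```go
package main

import (
	"fmt"
	"os"

	pqarchive "github.com/ClusterCockpit/cc-backend/pkg/archive/parquet"
)

func main() {
	// Read one per-cluster archive file produced by the converter.
	data, err := os.ReadFile("parquet-archive/clusterA/cc-archive-2025-01-20-001.parquet")
	if err != nil {
		panic(err)
	}
	// Decode all job rows contained in the file.
	rows, err := pqarchive.ReadParquetFile(data)
	if err != nil {
		panic(err)
	}
	for _, row := range rows {
		// Convert a row back into job metadata plus decompressed metric data.
		meta, jobData, err := pqarchive.ParquetRowToJob(&row)
		if err != nil {
			continue
		}
		fmt.Printf("job %d on %s: %d metrics\n", meta.JobID, meta.Cluster, len(*jobData))
	}
}
```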
## Round-Trip Conversion
Archives can be converted from JSON to Parquet and back without data loss:
```bash
# Original JSON archive
./archive-manager --convert --format parquet \
--src-config '{"kind":"file","path":"./var/job-archive"}' \
--dst-config '{"kind":"file","path":"./var/parquet-archive"}'
# Convert back to JSON
./archive-manager --convert --format json \
--src-config '{"kind":"file","path":"./var/parquet-archive"}' \
--dst-config '{"kind":"file","path":"./var/json-archive"}'
```
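As a quick sanity check after a round trip, the info mode can be run on both the original and the reconstructed archive and the reported statistics compared (the exact output depends on the info mode):
```bash
./archive-manager -s ./var/job-archive
./archive-manager -s ./var/json-archive
```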

View File

@@ -23,6 +23,7 @@ import (
"github.com/ClusterCockpit/cc-backend/internal/config" "github.com/ClusterCockpit/cc-backend/internal/config"
"github.com/ClusterCockpit/cc-backend/pkg/archive" "github.com/ClusterCockpit/cc-backend/pkg/archive"
pqarchive "github.com/ClusterCockpit/cc-backend/pkg/archive/parquet"
ccconf "github.com/ClusterCockpit/cc-lib/v2/ccConfig" ccconf "github.com/ClusterCockpit/cc-lib/v2/ccConfig"
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger" cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
) )
@@ -372,10 +373,207 @@ func importArchive(srcBackend, dstBackend archive.ArchiveBackend, srcConfig stri
return finalImported, finalFailed, nil return finalImported, finalFailed, nil
} }
// sourceConfig holds the common kind/path/S3 fields parsed from a --src-config or --dst-config JSON string.
type sourceConfig struct {
Kind string `json:"kind"`
Path string `json:"path"`
Endpoint string `json:"endpoint"`
Bucket string `json:"bucket"`
AccessKey string `json:"accessKey"`
SecretKey string `json:"secretKey"`
Region string `json:"region"`
UsePathStyle bool `json:"usePathStyle"`
}
// createParquetTarget creates a ParquetTarget from a parsed config.
func createParquetTarget(cfg sourceConfig) (pqarchive.ParquetTarget, error) {
switch cfg.Kind {
case "s3":
return pqarchive.NewS3Target(pqarchive.S3TargetConfig{
Endpoint: cfg.Endpoint,
Bucket: cfg.Bucket,
AccessKey: cfg.AccessKey,
SecretKey: cfg.SecretKey,
Region: cfg.Region,
UsePathStyle: cfg.UsePathStyle,
})
default:
return pqarchive.NewFileTarget(cfg.Path)
}
}
// createParquetSource creates a ParquetSource from a parsed config.
func createParquetSource(cfg sourceConfig) (pqarchive.ParquetSource, error) {
switch cfg.Kind {
case "s3":
return pqarchive.NewS3ParquetSource(pqarchive.S3TargetConfig{
Endpoint: cfg.Endpoint,
Bucket: cfg.Bucket,
AccessKey: cfg.AccessKey,
SecretKey: cfg.SecretKey,
Region: cfg.Region,
UsePathStyle: cfg.UsePathStyle,
})
default:
if cfg.Path == "" {
return nil, fmt.Errorf("file source: path is required")
}
return pqarchive.NewFileParquetSource(cfg.Path), nil
}
}
// convertJSONToParquet converts a JSON archive backend to parquet format.
func convertJSONToParquet(srcBackend archive.ArchiveBackend, dstCfg sourceConfig, maxSizeMB int) error {
target, err := createParquetTarget(dstCfg)
if err != nil {
return fmt.Errorf("create parquet target: %w", err)
}
cw := pqarchive.NewClusterAwareParquetWriter(target, maxSizeMB)
// Transfer cluster configs
for _, clusterName := range srcBackend.GetClusters() {
clusterCfg, err := srcBackend.LoadClusterCfg(clusterName)
if err != nil {
cclog.Warnf("Convert: load cluster config %q: %v", clusterName, err)
continue
}
cw.SetClusterConfig(clusterName, clusterCfg)
}
converted := 0
failed := 0
startTime := time.Now()
for job := range srcBackend.Iter(true) {
if job.Meta == nil {
cclog.Warn("Skipping job with nil metadata")
failed++
continue
}
if job.Data == nil {
cclog.Warnf("Job %d has no metric data, skipping", job.Meta.JobID)
failed++
continue
}
row, err := pqarchive.JobToParquetRow(job.Meta, job.Data)
if err != nil {
cclog.Warnf("Convert job %d: %v", job.Meta.JobID, err)
failed++
continue
}
if err := cw.AddJob(*row); err != nil {
cclog.Errorf("Add job %d to writer: %v", job.Meta.JobID, err)
failed++
continue
}
converted++
if converted%1000 == 0 {
cclog.Infof("Converted %d jobs so far...", converted)
}
}
if err := cw.Close(); err != nil {
return fmt.Errorf("close parquet writer: %w", err)
}
elapsed := time.Since(startTime)
cclog.Infof("JSON->Parquet conversion completed in %s: %d jobs converted, %d failed",
formatDuration(elapsed), converted, failed)
return nil
}
// convertParquetToJSON converts a parquet archive to a JSON archive backend.
func convertParquetToJSON(srcCfg sourceConfig, dstBackend archive.ArchiveBackend) error {
src, err := createParquetSource(srcCfg)
if err != nil {
return fmt.Errorf("create parquet source: %w", err)
}
clusters, err := src.GetClusters()
if err != nil {
return fmt.Errorf("list clusters: %w", err)
}
converted := 0
failed := 0
skipped := 0
startTime := time.Now()
for _, cluster := range clusters {
// Transfer cluster config
clusterCfg, err := src.ReadClusterConfig(cluster)
if err != nil {
cclog.Warnf("Convert: read cluster config %q: %v", cluster, err)
} else {
if err := dstBackend.StoreClusterCfg(cluster, clusterCfg); err != nil {
cclog.Warnf("Convert: store cluster config %q: %v", cluster, err)
} else {
cclog.Infof("Imported cluster config for %s", cluster)
}
}
// Read and convert parquet files
files, err := src.ListParquetFiles(cluster)
if err != nil {
cclog.Errorf("Convert: list parquet files for %q: %v", cluster, err)
continue
}
for _, file := range files {
data, err := src.ReadFile(file)
if err != nil {
cclog.Errorf("Convert: read file %q: %v", file, err)
failed++
continue
}
rows, err := pqarchive.ReadParquetFile(data)
if err != nil {
cclog.Errorf("Convert: parse parquet file %q: %v", file, err)
failed++
continue
}
cclog.Infof("Processing %s: %d jobs", file, len(rows))
for _, row := range rows {
meta, jobData, err := pqarchive.ParquetRowToJob(&row)
if err != nil {
cclog.Warnf("Convert row to job: %v", err)
failed++
continue
}
if dstBackend.Exists(meta) {
skipped++
continue
}
if err := dstBackend.ImportJob(meta, jobData); err != nil {
cclog.Warnf("Import job %d: %v", meta.JobID, err)
failed++
continue
}
converted++
}
}
}
elapsed := time.Since(startTime)
cclog.Infof("Parquet->JSON conversion completed in %s: %d jobs converted, %d skipped, %d failed",
formatDuration(elapsed), converted, skipped, failed)
return nil
}
 func main() {
 	var srcPath, flagConfigFile, flagLogLevel, flagRemoveCluster, flagRemoveAfter, flagRemoveBefore string
 	var flagSrcConfig, flagDstConfig string
-	var flagLogDateTime, flagValidate, flagImport bool
+	var flagLogDateTime, flagValidate, flagImport, flagConvert bool
+	var flagFormat string
+	var flagMaxFileSize int

 	flag.StringVar(&srcPath, "s", "./var/job-archive", "Specify the source job archive path. Default is ./var/job-archive")
 	flag.BoolVar(&flagLogDateTime, "logdate", false, "Set this flag to add date and time to log messages")
@@ -386,6 +584,9 @@ func main() {
 	flag.StringVar(&flagRemoveAfter, "remove-after", "", "Remove all jobs with start time after date (Format: 2006-Jan-02)")
 	flag.BoolVar(&flagValidate, "validate", false, "Set this flag to validate a job archive against the json schema")
 	flag.BoolVar(&flagImport, "import", false, "Import jobs from source archive to destination archive")
+	flag.BoolVar(&flagConvert, "convert", false, "Convert archive between JSON and Parquet formats")
+	flag.StringVar(&flagFormat, "format", "json", "Output format for conversion: 'json' or 'parquet'")
+	flag.IntVar(&flagMaxFileSize, "max-file-size", 512, "Max parquet file size in MB (only for parquet output)")
 	flag.StringVar(&flagSrcConfig, "src-config", "", "Source archive backend configuration (JSON), e.g. '{\"kind\":\"file\",\"path\":\"./archive\"}'")
 	flag.StringVar(&flagDstConfig, "dst-config", "", "Destination archive backend configuration (JSON), e.g. '{\"kind\":\"sqlite\",\"dbPath\":\"./archive.db\"}'")
 	flag.Parse()
@@ -429,6 +630,49 @@ func main() {
 	os.Exit(0)
 }
// Handle convert mode
if flagConvert {
if flagSrcConfig == "" || flagDstConfig == "" {
cclog.Fatal("Both --src-config and --dst-config must be specified for convert mode")
}
var srcCfg, dstCfg sourceConfig
if err := json.Unmarshal([]byte(flagSrcConfig), &srcCfg); err != nil {
cclog.Fatalf("Failed to parse source config: %s", err.Error())
}
if err := json.Unmarshal([]byte(flagDstConfig), &dstCfg); err != nil {
cclog.Fatalf("Failed to parse destination config: %s", err.Error())
}
switch flagFormat {
case "parquet":
// JSON archive -> Parquet: source is an archive backend
cclog.Info("Convert mode: JSON -> Parquet")
srcBackend, err := archive.InitBackend(json.RawMessage(flagSrcConfig))
if err != nil {
cclog.Fatalf("Failed to initialize source backend: %s", err.Error())
}
if err := convertJSONToParquet(srcBackend, dstCfg, flagMaxFileSize); err != nil {
cclog.Fatalf("Conversion failed: %s", err.Error())
}
case "json":
// Parquet -> JSON archive: destination is an archive backend
cclog.Info("Convert mode: Parquet -> JSON")
dstBackend, err := archive.InitBackend(json.RawMessage(flagDstConfig))
if err != nil {
cclog.Fatalf("Failed to initialize destination backend: %s", err.Error())
}
if err := convertParquetToJSON(srcCfg, dstBackend); err != nil {
cclog.Fatalf("Conversion failed: %s", err.Error())
}
default:
cclog.Fatalf("Unknown format %q: must be 'json' or 'parquet'", flagFormat)
}
cclog.Info("Conversion finished successfully")
os.Exit(0)
}
ccconf.Init(flagConfigFile) ccconf.Init(flagConfigFile)
// Load and check main configuration // Load and check main configuration

View File

@@ -250,7 +250,6 @@
     "cpu": [
       "arm"
     ],
-    "dev": true,
     "license": "MIT",
     "optional": true,
     "os": [

(The same one-line removal of `"dev": true,` repeats for every platform-specific optional dependency entry in package-lock.json — the arm, arm64, x64, ia32, loong64, ppc64, riscv64, and s390x variants — and for the fsevents 2.3.3 entry.)

View File

@@ -75,5 +75,6 @@ export default [
 	entrypoint('analysis', 'src/analysis.entrypoint.js'),
 	entrypoint('status', 'src/status.entrypoint.js'),
 	entrypoint('dashpublic', 'src/dashpublic.entrypoint.js'),
-	entrypoint('config', 'src/config.entrypoint.js')
+	entrypoint('config', 'src/config.entrypoint.js'),
+	entrypoint('logs', 'src/logs.entrypoint.js')
 ];

View File

@@ -135,6 +135,16 @@
       listOptions: true,
       menu: "Info",
     },
+    {
+      title: "Logs",
+      // svelte-ignore state_referenced_locally
+      requiredRole: roles.admin,
+      href: "/monitoring/logs",
+      icon: "journal-text",
+      perCluster: false,
+      listOptions: false,
+      menu: "Info",
+    },
   ];

   /* State Init */

View File

@@ -0,0 +1,254 @@
<!--
@component Systemd Journal Log Viewer (Admin only)
Properties:
 - `isAdmin Bool!`: Whether the currently logged-in user has admin authority
-->
<script>
import {
Card,
CardHeader,
CardBody,
Table,
Input,
Button,
Badge,
Spinner,
InputGroup,
InputGroupText,
Icon,
} from "@sveltestrap/sveltestrap";
let { isAdmin } = $props();
const timeRanges = [
{ label: "Last 15 minutes", value: "15 min ago" },
{ label: "Last 1 hour", value: "1 hour ago" },
{ label: "Last 6 hours", value: "6 hours ago" },
{ label: "Last 24 hours", value: "24 hours ago" },
{ label: "Last 7 days", value: "7 days ago" },
];
const levels = [
{ label: "All levels", value: "" },
{ label: "Emergency (0)", value: "0" },
{ label: "Alert (1)", value: "1" },
{ label: "Critical (2)", value: "2" },
{ label: "Error (3)", value: "3" },
{ label: "Warning (4)", value: "4" },
{ label: "Notice (5)", value: "5" },
{ label: "Info (6)", value: "6" },
{ label: "Debug (7)", value: "7" },
];
const refreshIntervals = [
{ label: "Off", value: 0 },
{ label: "5s", value: 5000 },
{ label: "10s", value: 10000 },
{ label: "30s", value: 30000 },
];
let since = $state("1 hour ago");
let level = $state("");
let search = $state("");
let linesParam = $state("200");
let refreshInterval = $state(0);
let entries = $state([]);
let loading = $state(false);
let error = $state(null);
let timer = $state(null);
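  // Map syslog priorities (0 = emergency ... 7 = debug) to badge colors;
  // lower numbers are more severe.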
function levelColor(priority) {
if (priority <= 2) return "danger";
if (priority === 3) return "warning";
if (priority === 4) return "info";
if (priority <= 6) return "secondary";
return "light";
}
function levelName(priority) {
const names = [
"EMERG",
"ALERT",
"CRIT",
"ERR",
"WARN",
"NOTICE",
"INFO",
"DEBUG",
];
return names[priority] || "UNKNOWN";
}
function formatTimestamp(usec) {
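    // usec is expected to be a journald microsecond timestamp;
    // Date wants milliseconds, hence the division by 1000.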
if (!usec) return "";
const ms = parseInt(usec) / 1000;
const d = new Date(ms);
return d.toLocaleString();
}
async function fetchLogs() {
loading = true;
error = null;
try {
const params = new URLSearchParams();
params.set("since", since);
params.set("lines", linesParam);
if (level) params.set("level", level);
if (search.trim()) params.set("search", search.trim());
const resp = await fetch(`/frontend/logs/?${params.toString()}`);
if (!resp.ok) {
const body = await resp.json();
throw new Error(body.error || `HTTP ${resp.status}`);
}
entries = await resp.json();
} catch (e) {
error = e.message;
entries = [];
} finally {
loading = false;
}
}
function setupAutoRefresh(interval) {
if (timer) {
clearInterval(timer);
timer = null;
}
if (interval > 0) {
timer = setInterval(fetchLogs, interval);
}
}
$effect(() => {
setupAutoRefresh(refreshInterval);
return () => {
if (timer) clearInterval(timer);
};
});
// Fetch on mount
$effect(() => {
fetchLogs();
});
</script>
{#if !isAdmin}
<Card>
<CardBody>
<p>Access denied. Admin privileges required.</p>
</CardBody>
</Card>
{:else}
<Card class="mb-3">
<CardHeader>
<div class="d-flex flex-wrap align-items-center gap-2">
<InputGroup size="sm" style="max-width: 200px;">
<Input type="select" bind:value={since}>
{#each timeRanges as tr}
<option value={tr.value}>{tr.label}</option>
{/each}
</Input>
</InputGroup>
<InputGroup size="sm" style="max-width: 180px;">
<Input type="select" bind:value={level}>
{#each levels as lv}
<option value={lv.value}>{lv.label}</option>
{/each}
</Input>
</InputGroup>
<InputGroup size="sm" style="max-width: 150px;">
<InputGroupText>Lines</InputGroupText>
<Input type="select" bind:value={linesParam}>
<option value="100">100</option>
<option value="200">200</option>
<option value="500">500</option>
<option value="1000">1000</option>
</Input>
</InputGroup>
<InputGroup size="sm" style="max-width: 250px;">
<Input
type="text"
placeholder="Search..."
bind:value={search}
onkeydown={(e) => {
if (e.key === "Enter") fetchLogs();
}}
/>
</InputGroup>
<Button
size="sm"
color="primary"
onclick={fetchLogs}
disabled={loading}
>
{#if loading}
<Spinner size="sm" />
{:else}
<Icon name="arrow-clockwise" />
{/if}
Refresh
</Button>
<InputGroup size="sm" style="max-width: 140px;">
<InputGroupText>Auto</InputGroupText>
<Input type="select" bind:value={refreshInterval}>
{#each refreshIntervals as ri}
<option value={ri.value}>{ri.label}</option>
{/each}
</Input>
</InputGroup>
{#if entries.length > 0}
<small class="text-muted ms-auto">{entries.length} entries</small>
{/if}
</div>
</CardHeader>
<CardBody style="padding: 0;">
{#if error}
<div class="alert alert-danger m-3">{error}</div>
{/if}
<div style="max-height: 75vh; overflow-y: auto;">
<Table size="sm" striped hover responsive class="mb-0">
<thead class="sticky-top bg-white">
<tr>
<th style="width: 170px;">Timestamp</th>
<th style="width: 80px;">Level</th>
<th>Message</th>
</tr>
</thead>
<tbody style="font-family: monospace; font-size: 0.85rem;">
{#each entries as entry}
<tr>
<td class="text-nowrap">{formatTimestamp(entry.timestamp)}</td>
<td
><Badge color={levelColor(entry.priority)}
>{levelName(entry.priority)}</Badge
></td
>
<td style="white-space: pre-wrap; word-break: break-all;"
>{entry.message}</td
>
</tr>
{:else}
{#if !loading && !error}
<tr
><td colspan="3" class="text-center text-muted py-3"
>No log entries found</td
></tr
>
{/if}
{/each}
</tbody>
</Table>
</div>
</CardBody>
</Card>
{/if}

View File

@@ -305,7 +305,7 @@
 {#if $jobsStore.fetching || !$jobsStore.data}
   <tr>
     <td colspan={metrics.length + 1}>
-      <div style="text-align:center;">
+      <div style="text-align:center; margin-top: 1rem;">
         <Spinner secondary />
       </div>
     </td>

View File

@@ -0,0 +1,10 @@
import { mount } from 'svelte';
import {} from './header.entrypoint.js'
import Logs from './Logs.root.svelte'
mount(Logs, {
target: document.getElementById('svelte-app'),
props: {
isAdmin: isAdmin,
}
})

View File

@@ -104,7 +104,7 @@
 	let itemsPerPage = $derived(usePaging ? (ccconfig?.nodeList_nodesPerPage || 10) : 10);
 	let paging = $derived({ itemsPerPage, page });
-	const nodesQuery = $derived(queryStore({
+	const nodesStore = $derived(queryStore({
 		client: client,
 		query: nodeListQuery,
 		variables: {
@@ -122,7 +122,7 @@
 		requestPolicy: "network-only", // Resolution queries are cached, but how to access them? For now: reload on every change
 	}));
-	const matchedNodes = $derived($nodesQuery?.data?.nodeMetricsList?.totalNodes || 0);
+	const matchedNodes = $derived($nodesStore?.data?.nodeMetricsList?.totalNodes || 0);

 	/* Effects */
 	$effect(() => {
@@ -135,7 +135,7 @@
 		} = document.documentElement;
 		// Add 100 px offset to trigger load earlier
-		if (scrollTop + clientHeight >= scrollHeight - 100 && $nodesQuery?.data?.nodeMetricsList?.hasNextPage) {
+		if (scrollTop + clientHeight >= scrollHeight - 100 && $nodesStore?.data?.nodeMetricsList?.hasNextPage) {
 			page += 1
 		};
 	});
@@ -143,21 +143,30 @@
 	});
 	$effect(() => {
-		if ($nodesQuery?.data) {
+		if ($nodesStore?.data) {
 			untrack(() => {
-				handleNodes($nodesQuery?.data?.nodeMetricsList?.items);
+				handleNodes($nodesStore?.data?.nodeMetricsList?.items);
 			});
 			selectedMetrics = [...pendingSelectedMetrics]; // Trigger Rerender in NodeListRow Only After Data is Fetched
 		};
 	});
 	$effect(() => {
-		// Triggers (Except Paging)
+		// Update NodeListRows metrics only: Keep ordered nodes on page 1
 		from, to
 		pendingSelectedMetrics, selectedResolution
+		// Continous Scroll: Paging if parameters change: Existing entries will not match new selections
+		if (!usePaging) {
+			nodes = [];
+			page = 1;
+		}
+	});
+	$effect(() => {
+		// Update NodeListRows metrics only: Keep ordered nodes on page 1
 		hostnameFilter, hoststateFilter
 		// Continous Scroll: Paging if parameters change: Existing entries will not match new selections
-		// Nodes Array Reset in HandleNodes func
+		nodes = [];
 		if (!usePaging) {
 			page = 1;
 		}
@@ -228,7 +237,7 @@
 			style="padding-top: {headerPaddingTop}px;"
 		>
 			{cluster} Node Info
-			{#if $nodesQuery.fetching}
+			{#if $nodesStore.fetching}
 				<Spinner size="sm" style="margin-left:10px;" secondary />
 			{/if}
 		</th>
@@ -245,22 +254,24 @@
 		</tr>
 	</thead>
 	<tbody>
-		{#if $nodesQuery.error}
+		{#if $nodesStore.error}
 			<Row>
 				<Col>
-					<Card body color="danger">{$nodesQuery.error.message}</Card>
+					<Card body color="danger">{$nodesStore.error.message}</Card>
 				</Col>
 			</Row>
 		{:else}
 			{#each nodes as nodeData (nodeData.host)}
-				<NodeListRow {nodeData} {cluster} {selectedMetrics} {globalMetrics}/>
+				<NodeListRow {nodeData} {cluster} {selectedMetrics} {globalMetrics} nodeDataFetching={$nodesStore.fetching}/>
 			{:else}
-				<tr>
-					<td colspan={selectedMetrics.length + 1}> No nodes found </td>
-				</tr>
+				{#if !$nodesStore.fetching}
+					<tr>
+						<td colspan={selectedMetrics.length + 1}> No nodes found </td>
+					</tr>
+				{/if}
 			{/each}
 		{/if}
-		{#if $nodesQuery.fetching || !$nodesQuery.data}
+		{#if $nodesStore.fetching || !$nodesStore.data}
 			<tr>
 				<td colspan={pendingSelectedMetrics.length + 1}>
 					<div style="text-align:center;">

View File

@@ -51,6 +51,8 @@
 	/* Derived */
 	// Not at least one returned, selected metric: NodeHealth warning
+	const fetchInfo = $derived(dataHealth.includes('fetching'));
+	// Not at least one returned, selected metric: NodeHealth warning
 	const healthWarn = $derived(!dataHealth.includes(true));
 	// At least one non-returned selected metric: Metric config error?
 	const metricWarn = $derived(dataHealth.includes(false));
@@ -84,10 +86,17 @@
 	<Row cols={{xs: 1, lg: 2}}>
 		<Col class="mb-2 mb-lg-0">
 			<InputGroup size="sm">
-				{#if healthWarn}
+				{#if fetchInfo}
+					<InputGroupText class="flex-grow-1 flex-lg-grow-0">
+						<Icon name="arrow-clockwise" style="padding-right: 0.5rem;"/>
+					</InputGroupText>
+					<Button class="flex-grow-1" color="dark" outline disabled>
+						Fetching
+					</Button>
+				{:else if healthWarn}
 					<InputGroupText class="flex-grow-1 flex-lg-grow-0">
 						<Icon name="exclamation-circle" style="padding-right: 0.5rem;"/>
-						<span>Jobs</span>
+						<span>Info</span>
 					</InputGroupText>
 					<Button class="flex-grow-1" color="danger" disabled>
 						No Metrics
@@ -95,7 +104,7 @@
 				{:else if metricWarn}
 					<InputGroupText class="flex-grow-1 flex-lg-grow-0">
 						<Icon name="info-circle" style="padding-right: 0.5rem;"/>
-						<span>Jobs</span>
+						<span>Info</span>
 					</InputGroupText>
 					<Button class="flex-grow-1" color="warning" disabled>
 						Missing Metric

View File

@@ -4,6 +4,7 @@
 Properties:
 - `cluster String`: The nodes' cluster
 - `nodeData Object`: The node data object including metric data
+- `nodeDataFetching Bool`: Whether the metric query still runs
 - `selectedMetrics [String]`: The array of selected metrics
 - `globalMetrics [Obj]`: Includes the backend supplied availabilities for cluster and subCluster
 -->
@@ -24,6 +25,7 @@
 	let {
 		cluster,
 		nodeData,
+		nodeDataFetching,
 		selectedMetrics,
 		globalMetrics
 	} = $props();
@@ -73,7 +75,7 @@
 	const extendedLegendData = $derived($nodeJobsData?.data ? buildExtendedLegend() : null);
 	const refinedData = $derived(nodeData?.metrics ? sortAndSelectScope(selectedMetrics, nodeData.metrics) : []);
-	const dataHealth = $derived(refinedData.filter((rd) => rd.availability == "configured").map((enabled) => (enabled?.data?.metric?.series?.length > 0)));
+	const dataHealth = $derived(refinedData.filter((rd) => rd.availability == "configured").map((enabled) => (nodeDataFetching ? 'fetching' : enabled?.data?.metric?.series?.length > 0)));

 	/* Functions */
 	function sortAndSelectScope(metricList = [], nodeMetrics = []) {
@@ -153,7 +155,11 @@
 	{#each refinedData as metricData, i (metricData?.data?.name || i)}
 		{#key metricData}
 			<td>
-				{#if metricData?.availability == "none"}
+				{#if !metricData?.data && nodeDataFetching}
+					<div style="text-align:center; margin-top: 1rem;">
+						<Spinner secondary />
+					</div>
+				{:else if metricData?.availability == "none"}
 					<Card body class="mx-2" color="light">
 						<p>No dataset(s) returned for <b>{selectedMetrics[i]}</b></p>
 						<p class="mb-1">Metric is not configured for cluster <b>{cluster}</b>.</p>

View File

@@ -0,0 +1,13 @@
{{define "content"}}
<div id="svelte-app"></div>
{{end}}
{{define "stylesheets"}}
<link rel='stylesheet' href='/build/logs.css'>
{{end}}
{{define "javascript"}}
<script>
const isAdmin = {{ .User.HasRole .Roles.admin }};
</script>
<script src='/build/logs.js'></script>
{{end}}