From 10b4fa5a0673f2cf3f30ad9b660062527c322a54 Mon Sep 17 00:00:00 2001 From: Christoph Kluge Date: Thu, 19 Mar 2026 15:55:58 +0100 Subject: [PATCH] change: remove heuristic metricHealth, replace with DB metricHealth - add metricHealth to single Node view --- api/schema.graphqls | 3 +- internal/graph/generated/generated.go | 82 +++++++++++++++---- internal/graph/model/models_gen.go | 9 +- internal/graph/schema.resolvers.go | 18 ++-- internal/repository/node.go | 35 ++++---- web/frontend/src/DashPublic.root.svelte | 2 +- web/frontend/src/Node.root.svelte | 41 +++++++--- web/frontend/src/Systems.root.svelte | 8 +- web/frontend/src/systems/NodeList.svelte | 13 +-- web/frontend/src/systems/NodeOverview.svelte | 24 +++--- .../src/systems/nodelist/NodeInfo.svelte | 34 +++----- .../src/systems/nodelist/NodeListRow.svelte | 6 +- 12 files changed, 171 insertions(+), 104 deletions(-) diff --git a/api/schema.graphqls b/api/schema.graphqls index e6830956..cf8f5273 100644 --- a/api/schema.graphqls +++ b/api/schema.graphqls @@ -270,7 +270,8 @@ enum SortByAggregate { type NodeMetrics { host: String! - state: String! + nodeState: String! + metricHealth: String! subCluster: String! metrics: [JobMetricWithName!]! } diff --git a/internal/graph/generated/generated.go b/internal/graph/generated/generated.go index f003c04a..a5319fc7 100644 --- a/internal/graph/generated/generated.go +++ b/internal/graph/generated/generated.go @@ -288,10 +288,11 @@ type ComplexityRoot struct { } NodeMetrics struct { - Host func(childComplexity int) int - Metrics func(childComplexity int) int - State func(childComplexity int) int - SubCluster func(childComplexity int) int + Host func(childComplexity int) int + MetricHealth func(childComplexity int) int + Metrics func(childComplexity int) int + NodeState func(childComplexity int) int + SubCluster func(childComplexity int) int } NodeStateResultList struct { @@ -1501,18 +1502,24 @@ func (e *executableSchema) Complexity(ctx context.Context, typeName, field strin } return e.ComplexityRoot.NodeMetrics.Host(childComplexity), true + case "NodeMetrics.metricHealth": + if e.ComplexityRoot.NodeMetrics.MetricHealth == nil { + break + } + + return e.ComplexityRoot.NodeMetrics.MetricHealth(childComplexity), true case "NodeMetrics.metrics": if e.ComplexityRoot.NodeMetrics.Metrics == nil { break } return e.ComplexityRoot.NodeMetrics.Metrics(childComplexity), true - case "NodeMetrics.state": - if e.ComplexityRoot.NodeMetrics.State == nil { + case "NodeMetrics.nodeState": + if e.ComplexityRoot.NodeMetrics.NodeState == nil { break } - return e.ComplexityRoot.NodeMetrics.State(childComplexity), true + return e.ComplexityRoot.NodeMetrics.NodeState(childComplexity), true case "NodeMetrics.subCluster": if e.ComplexityRoot.NodeMetrics.SubCluster == nil { break @@ -2537,7 +2544,8 @@ enum SortByAggregate { type NodeMetrics { host: String! - state: String! + nodeState: String! + metricHealth: String! subCluster: String! metrics: [JobMetricWithName!]! } @@ -8316,14 +8324,14 @@ func (ec *executionContext) fieldContext_NodeMetrics_host(_ context.Context, fie return fc, nil } -func (ec *executionContext) _NodeMetrics_state(ctx context.Context, field graphql.CollectedField, obj *model.NodeMetrics) (ret graphql.Marshaler) { +func (ec *executionContext) _NodeMetrics_nodeState(ctx context.Context, field graphql.CollectedField, obj *model.NodeMetrics) (ret graphql.Marshaler) { return graphql.ResolveField( ctx, ec.OperationContext, field, - ec.fieldContext_NodeMetrics_state, + ec.fieldContext_NodeMetrics_nodeState, func(ctx context.Context) (any, error) { - return obj.State, nil + return obj.NodeState, nil }, nil, ec.marshalNString2string, @@ -8332,7 +8340,36 @@ func (ec *executionContext) _NodeMetrics_state(ctx context.Context, field graphq ) } -func (ec *executionContext) fieldContext_NodeMetrics_state(_ context.Context, field graphql.CollectedField) (fc *graphql.FieldContext, err error) { +func (ec *executionContext) fieldContext_NodeMetrics_nodeState(_ context.Context, field graphql.CollectedField) (fc *graphql.FieldContext, err error) { + fc = &graphql.FieldContext{ + Object: "NodeMetrics", + Field: field, + IsMethod: false, + IsResolver: false, + Child: func(ctx context.Context, field graphql.CollectedField) (*graphql.FieldContext, error) { + return nil, errors.New("field of type String does not have child fields") + }, + } + return fc, nil +} + +func (ec *executionContext) _NodeMetrics_metricHealth(ctx context.Context, field graphql.CollectedField, obj *model.NodeMetrics) (ret graphql.Marshaler) { + return graphql.ResolveField( + ctx, + ec.OperationContext, + field, + ec.fieldContext_NodeMetrics_metricHealth, + func(ctx context.Context) (any, error) { + return obj.MetricHealth, nil + }, + nil, + ec.marshalNString2string, + true, + true, + ) +} + +func (ec *executionContext) fieldContext_NodeMetrics_metricHealth(_ context.Context, field graphql.CollectedField) (fc *graphql.FieldContext, err error) { fc = &graphql.FieldContext{ Object: "NodeMetrics", Field: field, @@ -8666,8 +8703,10 @@ func (ec *executionContext) fieldContext_NodesResultList_items(_ context.Context switch field.Name { case "host": return ec.fieldContext_NodeMetrics_host(ctx, field) - case "state": - return ec.fieldContext_NodeMetrics_state(ctx, field) + case "nodeState": + return ec.fieldContext_NodeMetrics_nodeState(ctx, field) + case "metricHealth": + return ec.fieldContext_NodeMetrics_metricHealth(ctx, field) case "subCluster": return ec.fieldContext_NodeMetrics_subCluster(ctx, field) case "metrics": @@ -9844,8 +9883,10 @@ func (ec *executionContext) fieldContext_Query_nodeMetrics(ctx context.Context, switch field.Name { case "host": return ec.fieldContext_NodeMetrics_host(ctx, field) - case "state": - return ec.fieldContext_NodeMetrics_state(ctx, field) + case "nodeState": + return ec.fieldContext_NodeMetrics_nodeState(ctx, field) + case "metricHealth": + return ec.fieldContext_NodeMetrics_metricHealth(ctx, field) case "subCluster": return ec.fieldContext_NodeMetrics_subCluster(ctx, field) case "metrics": @@ -15917,8 +15958,13 @@ func (ec *executionContext) _NodeMetrics(ctx context.Context, sel ast.SelectionS if out.Values[i] == graphql.Null { out.Invalids++ } - case "state": - out.Values[i] = ec._NodeMetrics_state(ctx, field, obj) + case "nodeState": + out.Values[i] = ec._NodeMetrics_nodeState(ctx, field, obj) + if out.Values[i] == graphql.Null { + out.Invalids++ + } + case "metricHealth": + out.Values[i] = ec._NodeMetrics_metricHealth(ctx, field, obj) if out.Values[i] == graphql.Null { out.Invalids++ } diff --git a/internal/graph/model/models_gen.go b/internal/graph/model/models_gen.go index bdf63560..7611bf22 100644 --- a/internal/graph/model/models_gen.go +++ b/internal/graph/model/models_gen.go @@ -193,10 +193,11 @@ type NodeFilter struct { } type NodeMetrics struct { - Host string `json:"host"` - State string `json:"state"` - SubCluster string `json:"subCluster"` - Metrics []*JobMetricWithName `json:"metrics"` + Host string `json:"host"` + NodeState string `json:"nodeState"` + MetricHealth string `json:"metricHealth"` + SubCluster string `json:"subCluster"` + Metrics []*JobMetricWithName `json:"metrics"` } type NodeStateResultList struct { diff --git a/internal/graph/schema.resolvers.go b/internal/graph/schema.resolvers.go index adbc5f80..c84cb713 100644 --- a/internal/graph/schema.resolvers.go +++ b/internal/graph/schema.resolvers.go @@ -840,14 +840,15 @@ func (r *queryResolver) NodeMetrics(ctx context.Context, cluster string, nodes [ } nodeRepo := repository.GetNodeRepository() - stateMap, _ := nodeRepo.MapNodes(cluster) + nodeStateMap, metricHealthMap, _ := nodeRepo.MapNodes(cluster) nodeMetrics := make([]*model.NodeMetrics, 0, len(data)) for hostname, metrics := range data { host := &model.NodeMetrics{ - Host: hostname, - State: stateMap[hostname], - Metrics: make([]*model.JobMetricWithName, 0, len(metrics)*len(scopes)), + Host: hostname, + NodeState: nodeStateMap[hostname], + MetricHealth: metricHealthMap[hostname], + Metrics: make([]*model.JobMetricWithName, 0, len(metrics)*len(scopes)), } host.SubCluster, err = archive.GetSubClusterByNode(cluster, hostname) if err != nil { @@ -889,7 +890,7 @@ func (r *queryResolver) NodeMetricsList(ctx context.Context, cluster string, sub nodeRepo := repository.GetNodeRepository() // nodes -> array hostname - nodes, stateMap, countNodes, hasNextPage, nerr := nodeRepo.GetNodesForList(ctx, cluster, subCluster, stateFilter, nodeFilter, page) + nodes, nodeStateMap, metricHealthMap, countNodes, hasNextPage, nerr := nodeRepo.GetNodesForList(ctx, cluster, subCluster, stateFilter, nodeFilter, page) if nerr != nil { return nil, errors.New("could not retrieve node list required for resolving NodeMetricsList") } @@ -910,9 +911,10 @@ func (r *queryResolver) NodeMetricsList(ctx context.Context, cluster string, sub nodeMetricsList := make([]*model.NodeMetrics, 0, len(data)) for _, hostname := range nodes { host := &model.NodeMetrics{ - Host: hostname, - State: stateMap[hostname], - Metrics: make([]*model.JobMetricWithName, 0), + Host: hostname, + NodeState: nodeStateMap[hostname], + MetricHealth: metricHealthMap[hostname], + Metrics: make([]*model.JobMetricWithName, 0), } host.SubCluster, err = archive.GetSubClusterByNode(cluster, hostname) if err != nil { diff --git a/internal/repository/node.go b/internal/repository/node.go index 2e174e95..eaa47079 100644 --- a/internal/repository/node.go +++ b/internal/repository/node.go @@ -593,8 +593,8 @@ func (r *NodeRepository) ListNodes(cluster string) ([]*schema.Node, error) { return nodeList, nil } -func (r *NodeRepository) MapNodes(cluster string) (map[string]string, error) { - q := sq.Select("node.hostname", "node_state.node_state"). +func (r *NodeRepository) MapNodes(cluster string) (map[string]string, map[string]string, error) { + q := sq.Select("node.hostname", "node_state.node_state", "node_state.health_state"). From("node"). Join("node_state ON node_state.node_id = node.id"). Where(latestStateCondition()). @@ -604,22 +604,25 @@ func (r *NodeRepository) MapNodes(cluster string) (map[string]string, error) { rows, err := q.RunWith(r.DB).Query() if err != nil { cclog.Warn("Error while querying node list") - return nil, err + return nil, nil, err } - stateMap := make(map[string]string) + nodeStateMap := make(map[string]string) + metricHealthMap := make(map[string]string) + defer rows.Close() for rows.Next() { - var hostname, nodestate string - if err := rows.Scan(&hostname, &nodestate); err != nil { + var hostname, nodeState, metricHealth string + if err := rows.Scan(&hostname, &nodeState, &metricHealth); err != nil { cclog.Warn("Error while scanning node list (MapNodes)") - return nil, err + return nil, nil, err } - stateMap[hostname] = nodestate + nodeStateMap[hostname] = nodeState + metricHealthMap[hostname] = metricHealth } - return stateMap, nil + return nodeStateMap, metricHealthMap, nil } func (r *NodeRepository) CountStates(ctx context.Context, filters []*model.NodeFilter, column string) ([]*model.NodeStates, error) { @@ -741,10 +744,11 @@ func (r *NodeRepository) GetNodesForList( stateFilter string, nodeFilter string, page *model.PageRequest, -) ([]string, map[string]string, int, bool, error) { +) ([]string, map[string]string, map[string]string, int, bool, error) { // Init Return Vars nodes := make([]string, 0) - stateMap := make(map[string]string) + nodeStateMap := make(map[string]string) + metricHealthMap := make(map[string]string) countNodes := 0 hasNextPage := false @@ -778,7 +782,7 @@ func (r *NodeRepository) GetNodesForList( rawNodes, serr := r.QueryNodes(ctx, queryFilters, page, nil) // Order not Used if serr != nil { cclog.Warn("error while loading node database data (Resolver.NodeMetricsList)") - return nil, nil, 0, false, serr + return nil, nil, nil, 0, false, serr } // Intermediate Node Result Info @@ -787,7 +791,8 @@ func (r *NodeRepository) GetNodesForList( continue } nodes = append(nodes, node.Hostname) - stateMap[node.Hostname] = string(node.NodeState) + nodeStateMap[node.Hostname] = string(node.NodeState) + metricHealthMap[node.Hostname] = string(node.HealthState) } // Special Case: Find Nodes not in DB node table but in metricStore only @@ -847,7 +852,7 @@ func (r *NodeRepository) GetNodesForList( countNodes, cerr = r.CountNodes(ctx, queryFilters) if cerr != nil { cclog.Warn("error while counting node database data (Resolver.NodeMetricsList)") - return nil, nil, 0, false, cerr + return nil, nil, nil, 0, false, cerr } hasNextPage = page.Page*page.ItemsPerPage < countNodes } @@ -857,7 +862,7 @@ func (r *NodeRepository) GetNodesForList( nodes, countNodes, hasNextPage = getNodesFromTopol(cluster, subCluster, nodeFilter, page) } - return nodes, stateMap, countNodes, hasNextPage, nil + return nodes, nodeStateMap, metricHealthMap, countNodes, hasNextPage, nil } func AccessCheck(ctx context.Context, query sq.SelectBuilder) (sq.SelectBuilder, error) { diff --git a/web/frontend/src/DashPublic.root.svelte b/web/frontend/src/DashPublic.root.svelte index e824f881..6301fba4 100644 --- a/web/frontend/src/DashPublic.root.svelte +++ b/web/frontend/src/DashPublic.root.svelte @@ -130,7 +130,7 @@ name count } - # Get Current States fir Pie Charts + # Get Current States for Pie Charts nodeStates(filter: $nodeFilter) { state count diff --git a/web/frontend/src/Node.root.svelte b/web/frontend/src/Node.root.svelte index a9ce8a74..35cfcca9 100644 --- a/web/frontend/src/Node.root.svelte +++ b/web/frontend/src/Node.root.svelte @@ -57,7 +57,8 @@ query ($cluster: String!, $nodes: [String!], $from: Time!, $to: Time!) { nodeMetrics(cluster: $cluster, nodes: $nodes, from: $from, to: $to) { host - state + nodeState + metricHealth subCluster metrics { name @@ -92,7 +93,7 @@ } } `; - // Node State Colors + // Node/Metric State Colors const stateColors = { allocated: 'success', reserved: 'info', @@ -100,7 +101,10 @@ mixed: 'warning', down: 'danger', unknown: 'dark', - notindb: 'secondary' + notindb: 'secondary', + full: 'success', + partial: 'warning', + failed: 'danger' } /* State Init */ @@ -153,31 +157,46 @@ }) ); - const thisNodeState = $derived($nodeMetricsData?.data?.nodeMetrics[0]?.state ? $nodeMetricsData.data.nodeMetrics[0].state : 'notindb'); + const thisNodeState = $derived($nodeMetricsData?.data?.nodeMetrics[0]?.nodeState || 'notindb'); + const thisMetricHealth = $derived($nodeMetricsData?.data?.nodeMetrics[0]?.metricHealth || 'unknown'); - + {#if $initq.error} {$initq.error.message} {:else if $initq.fetching} {:else} - + Selected Node - - + + Node State + + + + + + + Metric Health + - {:else if healthWarn} + {#if metricHealth == "failed"} Info @@ -101,13 +89,17 @@ - {:else if metricWarn} + {:else if metricHealth == "partial" || metricHealth == "unknown"} Info {:else if nodeJobsData.jobs.count == 1 && nodeJobsData?.jobs?.items[0]?.shared == "none"} @@ -150,8 +142,8 @@ State - diff --git a/web/frontend/src/systems/nodelist/NodeListRow.svelte b/web/frontend/src/systems/nodelist/NodeListRow.svelte index dc8ea09e..2c99f604 100644 --- a/web/frontend/src/systems/nodelist/NodeListRow.svelte +++ b/web/frontend/src/systems/nodelist/NodeListRow.svelte @@ -75,7 +75,6 @@ const extendedLegendData = $derived($nodeJobsData?.data ? buildExtendedLegend() : null); const refinedData = $derived(nodeData?.metrics ? sortAndSelectScope(selectedMetrics, nodeData.metrics) : []); - const dataHealth = $derived(refinedData.filter((rd) => rd.availability == "configured").map((enabled) => (nodeDataFetching ? 'fetching' : enabled?.data?.metric?.series?.length > 0))); /* Functions */ function sortAndSelectScope(metricList = [], nodeMetrics = []) { @@ -145,11 +144,12 @@ {:else} + nodeState={nodeData?.nodeState || 'notindb'} + metricHealth={nodeData?.metricHealth || 'unknown'} + /> {/if} {#each refinedData as metricData, i (metricData?.data?.name || i)}