Merge pull request #454 from ClusterCockpit/dev

Dev
Author: Jan Eitzinger
Date: 2025-12-18 15:52:06 +01:00
Committed by: GitHub
7 changed files with 97 additions and 86 deletions

go.mod (2 changed lines)

@@ -11,7 +11,7 @@ tool (
require (
github.com/99designs/gqlgen v0.17.84
github.com/ClusterCockpit/cc-lib v1.0.0
github.com/ClusterCockpit/cc-lib v1.0.2
github.com/Masterminds/squirrel v1.5.4
github.com/aws/aws-sdk-go-v2 v1.41.0
github.com/aws/aws-sdk-go-v2/config v1.31.20

go.sum (4 changed lines)

@@ -6,8 +6,8 @@ github.com/Azure/go-ansiterm v0.0.0-20230124172434-306776ec8161 h1:L/gRVlceqvL25
github.com/Azure/go-ansiterm v0.0.0-20230124172434-306776ec8161/go.mod h1:xomTg63KZ2rFqZQzSB4Vz2SUXa1BpHTVz9L5PTmPC4E=
github.com/Azure/go-ntlmssp v0.0.0-20221128193559-754e69321358 h1:mFRzDkZVAjdal+s7s0MwaRv9igoPqLRdzOLzw/8Xvq8=
github.com/Azure/go-ntlmssp v0.0.0-20221128193559-754e69321358/go.mod h1:chxPXzSsl7ZWRAuOIE23GDNzjWuZquvFlgA8xmpunjU=
github.com/ClusterCockpit/cc-lib v1.0.0 h1:/8DFRomt4BpVWKWrsEZ/ru4K8x76QTVnEgdwHc5eSps=
github.com/ClusterCockpit/cc-lib v1.0.0/go.mod h1:UGdOvXEnjFqlnPSxtvtFwO6BtXYW6NnXFoud9FtN93k=
github.com/ClusterCockpit/cc-lib v1.0.2 h1:ZWn3oZkXgxrr3zSigBdlOOfayZ4Om4xL20DhmritPPg=
github.com/ClusterCockpit/cc-lib v1.0.2/go.mod h1:UGdOvXEnjFqlnPSxtvtFwO6BtXYW6NnXFoud9FtN93k=
github.com/KyleBanks/depth v1.2.1 h1:5h8fQADFrWtarTdtDudMmGsC7GPbOAu6RVB3ffsVFHc=
github.com/KyleBanks/depth v1.2.1/go.mod h1:jzSb9d0L43HxTQfT+oSA1EEp2q+ne2uh6XgeJcm8brE=
github.com/Masterminds/squirrel v1.5.4 h1:uUcX/aBc8O7Fg9kaISIUsHXdKuqehiXAMQTYX8afzqM=

View File

@@ -88,14 +88,14 @@ func (r *jobResolver) EnergyFootprint(ctx context.Context, obj *schema.Job) ([]*
res := []*model.EnergyFootprintValue{}
for name, value := range rawEnergyFootprint {
// Suboptimal: Nearly hardcoded metric name expectations
matchCpu := regexp.MustCompile(`cpu|Cpu|CPU`)
matchCPU := regexp.MustCompile(`cpu|Cpu|CPU`)
matchAcc := regexp.MustCompile(`acc|Acc|ACC`)
matchMem := regexp.MustCompile(`mem|Mem|MEM`)
matchCore := regexp.MustCompile(`core|Core|CORE`)
hwType := ""
switch test := name; { // Notice ';' for var declaration
case matchCpu.MatchString(test):
case matchCPU.MatchString(test):
hwType = "CPU"
case matchAcc.MatchString(test):
hwType = "Accelerator"
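Note on the hunk above: the resolver's own comment flags the near-hardcoded metric-name matching as suboptimal, and this change only renames matchCpu to matchCPU. Purely as an illustration and not part of this PR, the same classification could use case-insensitive patterns compiled once; the helper name is hypothetical, and the Memory/Core mappings are assumed since only the CPU and Accelerator cases are visible in the hunk.

package main

import (
	"fmt"
	"regexp"
)

// Compiled once instead of on every loop iteration; (?i) replaces the
// cpu|Cpu|CPU style alternations used in the resolver.
var (
	matchCPU  = regexp.MustCompile(`(?i)cpu`)
	matchAcc  = regexp.MustCompile(`(?i)acc`)
	matchMem  = regexp.MustCompile(`(?i)mem`)
	matchCore = regexp.MustCompile(`(?i)core`)
)

// hwTypeForMetric maps an energy footprint metric name to a coarse
// hardware type, mirroring the switch in the resolver. The Memory and
// Core branches are assumptions; they are not shown in the hunk above.
func hwTypeForMetric(name string) string {
	switch {
	case matchCPU.MatchString(name):
		return "CPU"
	case matchAcc.MatchString(name):
		return "Accelerator"
	case matchMem.MatchString(name):
		return "Memory"
	case matchCore.MatchString(name):
		return "Core"
	default:
		return ""
	}
}

func main() {
	fmt.Println(hwTypeForMetric("acc_power")) // Accelerator
}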
@@ -175,9 +175,9 @@ func (r *mutationResolver) AddTagsToJob(ctx context.Context, job string, tagIds
}
tags := []*schema.Tag{}
for _, tagId := range tagIds {
for _, tagID := range tagIds {
// Get ID
tid, err := strconv.ParseInt(tagId, 10, 64)
tid, err := strconv.ParseInt(tagID, 10, 64)
if err != nil {
cclog.Warn("Error while parsing tag id")
return nil, err
@@ -222,9 +222,9 @@ func (r *mutationResolver) RemoveTagsFromJob(ctx context.Context, job string, ta
}
tags := []*schema.Tag{}
for _, tagId := range tagIds {
for _, tagID := range tagIds {
// Get ID
tid, err := strconv.ParseInt(tagId, 10, 64)
tid, err := strconv.ParseInt(tagID, 10, 64)
if err != nil {
cclog.Warn("Error while parsing tag id")
return nil, err
@@ -265,9 +265,9 @@ func (r *mutationResolver) RemoveTagFromList(ctx context.Context, tagIds []strin
}
tags := []int{}
for _, tagId := range tagIds {
for _, tagID := range tagIds {
// Get ID
tid, err := strconv.ParseInt(tagId, 10, 64)
tid, err := strconv.ParseInt(tagID, 10, 64)
if err != nil {
cclog.Warn("Error while parsing tag id for removal")
return nil, err
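The three hunks above apply the same tagId → tagID rename to three nearly identical parse loops. As a hedged sketch only, the repeated loop could be factored into one helper; parseIDs is a hypothetical name and is not part of the codebase or this change.

package main

import (
	"fmt"
	"strconv"
)

// parseIDs converts decimal string IDs (e.g. GraphQL ID arguments) into
// int64 values, failing on the first malformed entry like the resolver
// loops above do.
func parseIDs(ids []string) ([]int64, error) {
	out := make([]int64, 0, len(ids))
	for _, id := range ids {
		v, err := strconv.ParseInt(id, 10, 64)
		if err != nil {
			return nil, fmt.Errorf("parsing id %q: %w", id, err)
		}
		out = append(out, v)
	}
	return out, nil
}

func main() {
	ids, err := parseIDs([]string{"3", "17"})
	fmt.Println(ids, err) // [3 17] <nil>
}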
@@ -317,7 +317,7 @@ func (r *nodeResolver) SchedulerState(ctx context.Context, obj *schema.Node) (sc
if obj.NodeState != "" {
return obj.NodeState, nil
} else {
return "", fmt.Errorf("No SchedulerState (NodeState) on Object")
return "", fmt.Errorf("no SchedulerState (NodeState) on Object")
}
}
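This hunk, and several below, lowercases error strings, following the Go convention that error messages start lowercase and carry no trailing punctuation because they are usually wrapped into longer messages. A minimal standalone illustration, reusing the message from this hunk:

package main

import (
	"errors"
	"fmt"
)

func lookupState() error {
	return errors.New("no SchedulerState (NodeState) on Object")
}

func main() {
	if err := lookupState(); err != nil {
		// Wrapping yields "resolving node: no SchedulerState ..." rather
		// than the awkward "resolving node: No SchedulerState ...".
		fmt.Println(fmt.Errorf("resolving node: %w", err))
	}
}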
@@ -343,6 +343,14 @@ func (r *queryResolver) Tags(ctx context.Context) ([]*schema.Tag, error) {
// GlobalMetrics is the resolver for the globalMetrics field.
func (r *queryResolver) GlobalMetrics(ctx context.Context) ([]*schema.GlobalMetricListItem, error) {
user := repository.GetUserFromContext(ctx)
if user != nil {
if user.HasRole(schema.RoleUser) || user.HasRole(schema.RoleManager) {
return archive.GlobalUserMetricList, nil
}
}
return archive.GlobalMetricList, nil
}
@@ -373,12 +381,12 @@ func (r *queryResolver) AllocatedNodes(ctx context.Context, cluster string) ([]*
// Node is the resolver for the node field.
func (r *queryResolver) Node(ctx context.Context, id string) (*schema.Node, error) {
repo := repository.GetNodeRepository()
numericId, err := strconv.ParseInt(id, 10, 64)
numericID, err := strconv.ParseInt(id, 10, 64)
if err != nil {
cclog.Warn("Error while parsing job id")
return nil, err
}
return repo.GetNodeByID(numericId, false)
return repo.GetNodeByID(numericID, false)
}
// Nodes is the resolver for the nodes field.
@@ -405,8 +413,7 @@ func (r *queryResolver) NodeStates(ctx context.Context, filter []*model.NodeFilt
return nil, herr
}
allCounts := make([]*model.NodeStates, 0)
allCounts = append(stateCounts, healthCounts...)
allCounts := append(stateCounts, healthCounts...)
return allCounts, nil
}
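The fix above drops the discarded make allocation and declares allCounts directly from append(stateCounts, healthCounts...). One caveat worth noting as an observation, not something this PR needs to address: append may write into the first slice's spare capacity, so when the first input must stay untouched, a copying helper such as the hypothetical one below is the safer pattern.

package main

import "fmt"

// combine builds a fresh slice so neither input's backing array is
// written to; append(a, b...) can reuse a's spare capacity instead.
func combine[T any](a, b []T) []T {
	out := make([]T, 0, len(a)+len(b))
	out = append(out, a...)
	return append(out, b...)
}

func main() {
	fmt.Println(combine([]int{1, 2}, []int{3, 4})) // [1 2 3 4]
}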
@@ -433,18 +440,18 @@ func (r *queryResolver) NodeStatesTimed(ctx context.Context, filter []*model.Nod
return healthCounts, nil
}
return nil, errors.New("Unknown Node State Query Type")
return nil, errors.New("unknown Node State Query Type")
}
// Job is the resolver for the job field.
func (r *queryResolver) Job(ctx context.Context, id string) (*schema.Job, error) {
numericId, err := strconv.ParseInt(id, 10, 64)
numericID, err := strconv.ParseInt(id, 10, 64)
if err != nil {
cclog.Warn("Error while parsing job id")
return nil, err
}
job, err := r.Repo.FindByID(ctx, numericId)
job, err := r.Repo.FindByID(ctx, numericID)
if err != nil {
cclog.Warn("Error while finding job by id")
return nil, err
@@ -809,7 +816,7 @@ func (r *queryResolver) NodeMetricsList(ctx context.Context, cluster string, sub
nodeRepo := repository.GetNodeRepository()
nodes, stateMap, countNodes, hasNextPage, nerr := nodeRepo.GetNodesForList(ctx, cluster, subCluster, stateFilter, nodeFilter, page)
if nerr != nil {
return nil, errors.New("Could not retrieve node list required for resolving NodeMetricsList")
return nil, errors.New("could not retrieve node list required for resolving NodeMetricsList")
}
if metrics == nil {
@@ -898,9 +905,7 @@ func (r *queryResolver) ClusterMetrics(ctx context.Context, cluster string, metr
collectorUnit[metric] = scopedMetric.Unit
// Collect Initial Data
for _, ser := range scopedMetric.Series {
for _, val := range ser.Data {
collectorData[metric] = append(collectorData[metric], val)
}
collectorData[metric] = append(collectorData[metric], ser.Data...)
}
}
} else {
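The last hunk above replaces a per-element copy loop with a single variadic append. A tiny sketch of the equivalence; the float64 element type and the "flops_any" key are placeholders for the schema types and metric names used in the resolver.

package main

import "fmt"

func main() {
	collector := map[string][]float64{}
	series := []float64{1.5, 2.0, 2.5}

	// Before: element-by-element
	// for _, v := range series {
	//     collector["flops_any"] = append(collector["flops_any"], v)
	// }

	// After: one variadic append, letting append grow the slice in one step
	collector["flops_any"] = append(collector["flops_any"], series...)

	fmt.Println(collector["flops_any"]) // [1.5 2 2.5]
}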

View File

@@ -6,7 +6,6 @@
package archive
import (
"errors"
"fmt"
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
@@ -14,13 +13,16 @@ import (
)
var (
Clusters []*schema.Cluster
GlobalMetricList []*schema.GlobalMetricListItem
NodeLists map[string]map[string]NodeList
Clusters []*schema.Cluster
GlobalMetricList []*schema.GlobalMetricListItem
GlobalUserMetricList []*schema.GlobalMetricListItem
NodeLists map[string]map[string]NodeList
)
func initClusterConfig() error {
Clusters = []*schema.Cluster{}
GlobalMetricList = []*schema.GlobalMetricListItem{}
GlobalUserMetricList = []*schema.GlobalMetricListItem{}
NodeLists = map[string]map[string]NodeList{}
metricLookup := make(map[string]schema.GlobalMetricListItem)
@@ -29,38 +31,41 @@ func initClusterConfig() error {
cluster, err := ar.LoadClusterCfg(c)
if err != nil {
cclog.Warnf("Error while loading cluster config for cluster '%v'", c)
return err
return fmt.Errorf("failed to load cluster config for '%s': %w", c, err)
}
if len(cluster.Name) == 0 ||
len(cluster.MetricConfig) == 0 ||
len(cluster.SubClusters) == 0 {
return errors.New("cluster.name, cluster.metricConfig and cluster.SubClusters should not be empty")
if len(cluster.Name) == 0 {
return fmt.Errorf("cluster name is empty in config for '%s'", c)
}
if len(cluster.MetricConfig) == 0 {
return fmt.Errorf("cluster '%s' has no metric configurations", cluster.Name)
}
if len(cluster.SubClusters) == 0 {
return fmt.Errorf("cluster '%s' has no subclusters defined", cluster.Name)
}
for _, mc := range cluster.MetricConfig {
if len(mc.Name) == 0 {
return errors.New("cluster.metricConfig.name should not be empty")
return fmt.Errorf("cluster '%s' has a metric config with empty name", cluster.Name)
}
if mc.Timestep < 1 {
return errors.New("cluster.metricConfig.timestep should not be smaller than one")
return fmt.Errorf("metric '%s' in cluster '%s' has invalid timestep %d (must be >= 1)", mc.Name, cluster.Name, mc.Timestep)
}
// For backwards compability...
// For backwards compatibility...
if mc.Scope == "" {
mc.Scope = schema.MetricScopeNode
}
if !mc.Scope.Valid() {
return errors.New("cluster.metricConfig.scope must be a valid scope ('node', 'scocket', ...)")
return fmt.Errorf("metric '%s' in cluster '%s' has invalid scope '%s' (must be 'node', 'socket', 'core', etc.)", mc.Name, cluster.Name, mc.Scope)
}
ml, ok := metricLookup[mc.Name]
if !ok {
if _, ok := metricLookup[mc.Name]; !ok {
metricLookup[mc.Name] = schema.GlobalMetricListItem{
Name: mc.Name, Scope: mc.Scope, Unit: mc.Unit, Footprint: mc.Footprint,
Name: mc.Name, Scope: mc.Scope, Restrict: mc.Restrict, Unit: mc.Unit, Footprint: mc.Footprint,
}
ml = metricLookup[mc.Name]
}
availability := schema.ClusterSupport{Cluster: cluster.Name}
scLookup := make(map[string]*schema.SubClusterConfig)
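The validation hunk above splits one combined check into per-condition errors and wraps the underlying config-load error with %w instead of returning it bare, so callers can still inspect the cause. A minimal sketch of what that buys, with a stand-in loader and a placeholder cluster name; neither is part of the real code.

package main

import (
	"errors"
	"fmt"
	"io/fs"
)

// loadClusterCfg stands in for ar.LoadClusterCfg failing on a missing file.
func loadClusterCfg(name string) error {
	return fs.ErrNotExist
}

func initOne(name string) error {
	if err := loadClusterCfg(name); err != nil {
		return fmt.Errorf("failed to load cluster config for '%s': %w", name, err)
	}
	return nil
}

func main() {
	err := initOne("example")
	fmt.Println(err)                            // failed to load cluster config for 'example': file does not exist
	fmt.Println(errors.Is(err, fs.ErrNotExist)) // true, because the cause is wrapped
}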
@@ -90,39 +95,35 @@ func initClusterConfig() error {
}
if cfg, ok := scLookup[sc.Name]; ok {
if !cfg.Remove {
availability.SubClusters = append(availability.SubClusters, sc.Name)
newMetric.Peak = cfg.Peak
newMetric.Normal = cfg.Normal
newMetric.Caution = cfg.Caution
newMetric.Alert = cfg.Alert
newMetric.Footprint = cfg.Footprint
newMetric.Energy = cfg.Energy
newMetric.LowerIsBetter = cfg.LowerIsBetter
sc.MetricConfig = append(sc.MetricConfig, *newMetric)
if cfg.Remove {
continue
}
newMetric.Peak = cfg.Peak
newMetric.Normal = cfg.Normal
newMetric.Caution = cfg.Caution
newMetric.Alert = cfg.Alert
newMetric.Footprint = cfg.Footprint
newMetric.Energy = cfg.Energy
newMetric.LowerIsBetter = cfg.LowerIsBetter
}
if newMetric.Footprint != "" {
sc.Footprint = append(sc.Footprint, newMetric.Name)
ml.Footprint = newMetric.Footprint
}
if newMetric.Energy != "" {
sc.EnergyFootprint = append(sc.EnergyFootprint, newMetric.Name)
}
}
} else {
availability.SubClusters = append(availability.SubClusters, sc.Name)
sc.MetricConfig = append(sc.MetricConfig, *newMetric)
availability.SubClusters = append(availability.SubClusters, sc.Name)
sc.MetricConfig = append(sc.MetricConfig, *newMetric)
if newMetric.Footprint != "" {
sc.Footprint = append(sc.Footprint, newMetric.Name)
}
if newMetric.Energy != "" {
sc.EnergyFootprint = append(sc.EnergyFootprint, newMetric.Name)
}
if newMetric.Footprint != "" {
sc.Footprint = append(sc.Footprint, newMetric.Name)
item := metricLookup[mc.Name]
item.Footprint = newMetric.Footprint
metricLookup[mc.Name] = item
}
if newMetric.Energy != "" {
sc.EnergyFootprint = append(sc.EnergyFootprint, newMetric.Name)
}
}
ml.Availability = append(metricLookup[mc.Name].Availability, availability)
metricLookup[mc.Name] = ml
item := metricLookup[mc.Name]
item.Availability = append(item.Availability, availability)
metricLookup[mc.Name] = item
}
Clusters = append(Clusters, cluster)
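The hunk above also switches from mutating a cached copy (ml) to the read-modify-write pattern item := metricLookup[mc.Name]; item.Footprint = ...; metricLookup[mc.Name] = item. That pattern is needed because Go map elements holding structs by value are not addressable. A small self-contained sketch; the type and metric name are illustrative only.

package main

import "fmt"

type item struct {
	Name      string
	Footprint string
}

func main() {
	lookup := map[string]item{"flops_any": {Name: "flops_any"}}

	// lookup["flops_any"].Footprint = "avg" // compile error: cannot assign to a map element's field
	it := lookup["flops_any"] // copy the value out
	it.Footprint = "avg"      // modify the copy
	lookup["flops_any"] = it  // write it back

	fmt.Println(lookup["flops_any"].Footprint) // avg
}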
@@ -141,8 +142,11 @@ func initClusterConfig() error {
}
}
for _, ml := range metricLookup {
GlobalMetricList = append(GlobalMetricList, &ml)
for _, metric := range metricLookup {
GlobalMetricList = append(GlobalMetricList, &metric)
if !metric.Restrict {
GlobalUserMetricList = append(GlobalUserMetricList, &metric)
}
}
return nil
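The final hunk builds a second, user-visible list by skipping metrics whose new Restrict flag is set; the GlobalMetrics resolver earlier in this commit returns that list for RoleUser and RoleManager. A hedged sketch of the split, where metricItem and buildLists are hypothetical names standing in for schema.GlobalMetricListItem and the init loop:

package main

import "fmt"

type metricItem struct {
	Name     string
	Restrict bool
}

func buildLists(lookup map[string]metricItem) (all, user []*metricItem) {
	for _, m := range lookup {
		m := m // per-iteration copy; required before Go 1.22 when taking &m
		all = append(all, &m)
		if !m.Restrict {
			user = append(user, &m)
		}
	}
	return all, user
}

func main() {
	all, user := buildLists(map[string]metricItem{
		"flops_any": {Name: "flops_any"},
		"ipc":       {Name: "ipc", Restrict: true},
	})
	fmt.Println(len(all), len(user)) // 2 1
}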

View File

@@ -83,7 +83,7 @@ func Connect() {
client, err := NewClient(nil)
if err != nil {
cclog.Errorf("NATS connection failed: %v", err)
cclog.Warnf("NATS connection failed: %v", err)
return
}

View File

@@ -1,22 +1,18 @@
#!/bin/sh
# rm -rf var
if [ -d './var' ]; then
echo 'Directory ./var already exists! Skipping initialization.'
./cc-backend -server -dev
./cc-backend -server -dev -loglevel info
else
make
wget https://hpc-mover.rrze.uni-erlangen.de/HPC-Data/0x7b58aefb/eig7ahyo6fo2bais0ephuf2aitohv1ai/job-archive-dev.tar
tar xf job-archive-dev.tar
rm ./job-archive-dev.tar
cp ./configs/env-template.txt .env
./cc-backend --init
cp ./configs/config-demo.json config.json
./cc-backend -migrate-db
wget https://hpc-mover.rrze.uni-erlangen.de/HPC-Data/0x7b58aefb/eig7ahyo6fo2bais0ephuf2aitohv1ai/job-archive-demo.tar
tar xf job-archive-demo.tar
rm ./job-archive-demo.tar
./cc-backend -dev -init-db -add-user demo:admin,api:demo
./cc-backend -server -dev -loglevel info
fi
./cc-backend -server -dev
fi

View File

@@ -148,13 +148,19 @@
hoststate={nodeData?.state? nodeData.state: 'notindb'}/>
{/if}
</td>
{#each refinedData as metricData (metricData.data.name)}
{#each refinedData as metricData, i (metricData?.data?.name || i)}
{#key metricData}
<td>
{#if metricData?.disabled}
<Card body class="mx-3" color="info"
>Metric disabled for subcluster <code
>{metricData.data.name}:{nodeData.subCluster}</code
>{metricData?.data?.name ? metricData.data.name : `Metric Index ${i}`}:{nodeData.subCluster}</code
></Card
>
{:else if !metricData?.data?.name}
<Card body class="mx-3" color="warning"
>Metric without name for subcluster <code
>{`Metric Index ${i}`}:{nodeData.subCluster}</code
></Card
>
{:else if !!metricData.data?.metric.statisticsSeries}