Merge pull request #210 from ClusterCockpit/hotfix

Hotfix
Jan Eitzinger, 2023-08-31 16:20:40 +02:00 (committed by GitHub)
commit 07acbf673a
7 changed files with 57 additions and 46 deletions

View File

@@ -2,7 +2,7 @@ TARGET = ./cc-backend
 VAR = ./var
 CFG = config.json .env
 FRONTEND = ./web/frontend
-VERSION = 1.1.0
+VERSION = 1.2.0
 GIT_HASH := $(shell git rev-parse --short HEAD || echo 'development')
 CURRENT_TIME = $(shell date +"%Y-%m-%d:T%H:%M:%S")
 LD_FLAGS = '-s -X main.date=${CURRENT_TIME} -X main.version=${VERSION} -X main.commit=${GIT_HASH}'
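For reference, the `-X` linker flags above only take effect on package-level string variables in the named package. A minimal sketch of the receiving side — the variable names match the flags, but the default values here are placeholders, not the project's actual code:

```go
// Sketch: how `-ldflags '-X main.version=...'` lands in the binary.
package main

import "fmt"

// Overwritten at link time by LD_FLAGS; the defaults only apply to
// builds made without the Makefile.
var (
	version = "development"
	commit  = "none"
	date    = "unknown"
)

func main() {
	fmt.Printf("cc-backend %s (commit %s, built %s)\n", version, commit, date)
}
```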

View File

@@ -7,12 +7,12 @@ implementation of ClusterCockpit.
 ** Breaking changes **
-The LDAP configuration option user_filter was changed and now should not include
-the wildcard. Example:
-* Old: `"user_filter": "(&(objectclass=posixAccount)(uid=*))"`
-* New: `"user_filter": "&(objectclass=posixAccount)"`
-The aggregate job statistic core hours is now computed using the job table
+* The LDAP configuration option user_filter was changed and now should not include
+the uid wildcard. Example:
+- Old: `"user_filter": "(&(objectclass=posixAccount)(uid=*))"`
+- New: `"user_filter": "(&(objectclass=posixAccount))"`
+* The aggregate job statistic core hours is now computed using the job table
 column `num_hwthreads`. In a future release this column will be renamed to
 `num_cores`. For correct display of core hours `num_hwthreads` must be correctly
 filled on job start. If your existing jobs do not provide the correct value in
@@ -21,6 +21,10 @@ if you have exclusive jobs, only. Please be aware that we treat this column as
 it is the number of cores. In case you have SMT enabled and `num_hwthreads`
 is not the number of cores the core hours will be too high by a factor!
+* The jwts key is now mandatory in config.json. It has to set max-age for
+validity. Some key names have changed, please refer to
+[config documentation](./configs/README.md) for details.
 ** NOTE **
 If you are using the sqlite3 backend the `PRAGMA` option `foreign_keys` must be
 explicitly set to ON. If using the sqlite3 console it is per default set to
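The user_filter change reads as: the uid term is no longer part of the configured filter, presumably because cc-backend now appends it itself. A hedged sketch of that composition — the helper name and exact wrapping are assumptions, not the project's verbatim code:

```go
package main

import "fmt"

// buildSyncFilter is a hypothetical helper showing why the configured
// user_filter must no longer contain (uid=*): the application appends
// its own uid term when searching.
func buildSyncFilter(userFilter string) string {
	return fmt.Sprintf("(&%s(uid=*))", userFilter)
}

func main() {
	// New-style filter from the changelog example above.
	fmt.Println(buildSyncFilter("(&(objectclass=posixAccount))"))
	// Prints: (&(&(objectclass=posixAccount))(uid=*))
}
```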

View File

@@ -16,26 +16,41 @@ It is supported to set these by means of a `.env` file in the project root.
 * `static-files`: Type string. Folder where static assets can be found, if `embed-static-files` is `false`. No default.
 * `db-driver`: Type string. 'sqlite3' or 'mysql' (mysql will work for mariadb as well). Default `sqlite3`.
 * `db`: Type string. For sqlite3 a filename, for mysql a DSN in this format: https://github.com/go-sql-driver/mysql#dsn-data-source-name (Without query parameters!). Default: `./var/job.db`.
-* `job-archive`: Type string. Path to the job-archive. Default: `./var/job-archive`.
+* `job-archive`: Type object.
+  - `kind`: Type string. At the moment only `file` is supported as value.
+  - `path`: Type string. Path to the job-archive. Default: `./var/job-archive`.
+  - `compression`: Type integer. Setup automatic compression for jobs older than number of days.
+  - `retention`: Type object.
+    - `policy`: Type string (required). Retention policy. Possible values: none, delete, move.
+    - `includeDB`: Type boolean. Also remove jobs from database.
+    - `age`: Type integer. Act on jobs with startTime older than age (in days).
+    - `location`: Type string. The target directory for retention. Only applicable for retention policy move.
 * `disable-archive`: Type bool. Keep all metric data in the metric data repositories, do not write to the job-archive. Default `false`.
 * `validate`: Type bool. Validate all input json documents against json schema.
 * `session-max-age`: Type string. Specifies for how long a session shall be valid as a string parsable by time.ParseDuration(). If 0 or empty, the session/token does not expire! Default `168h`.
-* `jwt-max-age`: Type string. Specifies for how long a JWT token shall be valid as a string parsable by time.ParseDuration(). If 0 or empty, the session/token does not expire! Default `0`.
 * `https-cert-file` and `https-key-file`: Type string. If both those options are not empty, use HTTPS using those certificates.
 * `redirect-http-to`: Type string. If not the empty string and `addr` does not end in ":80", redirect every request incoming at port 80 to that url.
 * `machine-state-dir`: Type string. Where to store MachineState files. TODO: Explain in more detail!
 * `stop-jobs-exceeding-walltime`: Type int. If not zero, automatically mark jobs as stopped running X seconds longer than their walltime. Only applies if walltime is set for job. Default `0`.
 * `short-running-jobs-duration`: Type int. Do not show running jobs shorter than X seconds. Default `300`.
+* `jwts`: Type object (required). For JWT Authentication.
+  - `max-age`: Type string (required). Configure how long a token is valid. As string parsable by time.ParseDuration().
+  - `cookieName`: Type string. Cookie that should be checked for a JWT token.
+  - `validateUser`: Type boolean. Deny login for users not in database (but defined in JWT). Overwrite roles in JWT with database roles.
+  - `trustedIssuer`: Type string. Issuer that should be accepted when validating external JWTs.
+  - `syncUserOnLogin`: Type boolean. Add non-existent user to DB at login attempt with values provided in JWT.
 * `ldap`: Type object. For LDAP Authentication and user synchronisation. Default `nil`.
-  - `url`: Type string. URL of LDAP directory server.
-  - `user_base`: Type string. Base DN of user tree root.
-  - `search_dn`: Type string. DN for authenticating LDAP admin account with general read rights.
-  - `user_bind`: Type string. Expression used to authenticate users via LDAP bind. Must contain `uid={username}`.
-  - `user_filter`: Type string. Filter to extract users for syncing.
+  - `url`: Type string (required). URL of LDAP directory server.
+  - `user_base`: Type string (required). Base DN of user tree root.
+  - `search_dn`: Type string (required). DN for authenticating LDAP admin account with general read rights.
+  - `user_bind`: Type string (required). Expression used to authenticate users via LDAP bind. Must contain `uid={username}`.
+  - `user_filter`: Type string (required). Filter to extract users for syncing.
   - `username_attr`: Type string. Attribute with full user name. Defaults to `gecos` if not provided.
   - `sync_interval`: Type string. Interval used for syncing local user table with LDAP directory. Parsed using time.ParseDuration.
-  - `sync_del_old_users`: Type bool. Delete obsolete users in database.
-* `clusters`: Type array of objects
+  - `sync_del_old_users`: Type boolean. Delete obsolete users in database.
+  - `syncUserOnLogin`: Type boolean. Add non-existent user to DB at login attempt if user exists in Ldap directory.
+* `clusters`: Type array of objects (required)
   - `name`: Type string. The name of the cluster.
   - `metricDataRepository`: Type object with properties: `kind` (Type string, can be one of `cc-metric-store`, `influxdb`), `url` (Type string), `token` (Type string)
   - `filterRanges`: Type object. This option controls the slider ranges for the UI controls of numNodes, duration, and startTime. Example:
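To make the new mandatory `jwts` block concrete, here is a small sketch that decodes it and enforces the required `max-age`. The struct is modeled on the key names documented above, not on cc-backend's actual internal config types:

```go
package main

import (
	"encoding/json"
	"fmt"
	"log"
	"time"
)

// JWTConfig mirrors the documented `jwts` keys (illustrative only).
type JWTConfig struct {
	MaxAge          string `json:"max-age"`
	CookieName      string `json:"cookieName"`
	ValidateUser    bool   `json:"validateUser"`
	TrustedIssuer   string `json:"trustedIssuer"`
	SyncUserOnLogin bool   `json:"syncUserOnLogin"`
}

func main() {
	raw := []byte(`{"jwts": {"max-age": "2000h"}}`)

	var cfg struct {
		JWTs *JWTConfig `json:"jwts"`
	}
	if err := json.Unmarshal(raw, &cfg); err != nil {
		log.Fatal(err)
	}
	if cfg.JWTs == nil {
		log.Fatal("config: `jwts` is required")
	}
	d, err := time.ParseDuration(cfg.JWTs.MaxAge)
	if err != nil {
		log.Fatalf("config: jwts.max-age: %v", err)
	}
	fmt.Println("JWTs valid for", d) // JWTs valid for 2000h0m0s
}
```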

View File

@@ -5,7 +5,7 @@
         "user_base": "ou=people,ou=hpc,dc=test,dc=de",
         "search_dn": "cn=hpcmonitoring,ou=roadm,ou=profile,ou=hpc,dc=test,dc=de",
         "user_bind": "uid={username},ou=people,ou=hpc,dc=test,dc=de",
-        "user_filter": "(&(objectclass=posixAccount)(uid=*))"
+        "user_filter": "(&(objectclass=posixAccount))"
     },
     "https-cert-file": "/etc/letsencrypt/live/url/fullchain.pem",
     "https-key-file": "/etc/letsencrypt/live/url/privkey.pem",

View File

@@ -6,7 +6,6 @@ package graph
 import (
 	"context"
-	"errors"
 	"fmt"
 	"math"
@@ -33,7 +32,7 @@ func (r *queryResolver) rooflineHeatmap(
 		return nil, err
 	}
 	if len(jobs) > MAX_JOBS_FOR_ANALYSIS {
-		return nil, fmt.Errorf("GRAPH/STATS > too many jobs matched (max: %d)", MAX_JOBS_FOR_ANALYSIS)
+		return nil, fmt.Errorf("GRAPH/UTIL > too many jobs matched (max: %d)", MAX_JOBS_FOR_ANALYSIS)
 	}
 	fcols, frows := float64(cols), float64(rows)
@@ -50,20 +49,24 @@ func (r *queryResolver) rooflineHeatmap(
 		jobdata, err := metricdata.LoadData(job, []string{"flops_any", "mem_bw"}, []schema.MetricScope{schema.MetricScopeNode}, ctx)
 		if err != nil {
-			log.Error("Error while loading metrics for roofline")
+			log.Errorf("Error while loading roofline metrics for job %d", job.ID)
 			return nil, err
 		}
 		flops_, membw_ := jobdata["flops_any"], jobdata["mem_bw"]
 		if flops_ == nil && membw_ == nil {
-			return nil, fmt.Errorf("GRAPH/STATS > 'flops_any' or 'mem_bw' missing for job %d", job.ID)
+			log.Infof("rooflineHeatmap(): 'flops_any' or 'mem_bw' missing for job %d", job.ID)
+			continue
+			// return nil, fmt.Errorf("GRAPH/UTIL > 'flops_any' or 'mem_bw' missing for job %d", job.ID)
 		}
 		flops, ok1 := flops_["node"]
 		membw, ok2 := membw_["node"]
 		if !ok1 || !ok2 {
+			log.Info("rooflineHeatmap() query not implemented for where flops_any or mem_bw not available at 'node' level")
+			continue
 			// TODO/FIXME:
-			return nil, errors.New("GRAPH/STATS > todo: rooflineHeatmap() query not implemented for where flops_any or mem_bw not available at 'node' level")
+			// return nil, errors.New("GRAPH/UTIL > todo: rooflineHeatmap() query not implemented for where flops_any or mem_bw not available at 'node' level")
 		}
 		for n := 0; n < len(flops.Series); n++ {
@@ -99,7 +102,7 @@ func (r *queryResolver) jobsFootprints(ctx context.Context, filter []*model.JobF
 		return nil, err
 	}
 	if len(jobs) > MAX_JOBS_FOR_ANALYSIS {
-		return nil, fmt.Errorf("GRAPH/STATS > too many jobs matched (max: %d)", MAX_JOBS_FOR_ANALYSIS)
+		return nil, fmt.Errorf("GRAPH/UTIL > too many jobs matched (max: %d)", MAX_JOBS_FOR_ANALYSIS)
 	}
 	avgs := make([][]schema.Float, len(metrics))
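The common thread of these hunks: a job with missing metric data is now logged and skipped instead of aborting the whole resolver. A generic sketch of that pattern under simplified types — this is not the resolver itself:

```go
package main

import "log"

type jobData map[string][]float64

// aggregate collects flops samples, tolerating jobs without data.
func aggregate(jobs []jobData) []float64 {
	var all []float64
	for i, jd := range jobs {
		flops, ok := jd["flops_any"]
		if !ok {
			// Pre-hotfix behavior was a hard `return nil, err` here,
			// so one job without metrics failed the entire query.
			log.Printf("job %d: 'flops_any' missing, skipping", i)
			continue
		}
		all = append(all, flops...)
	}
	return all
}

func main() {
	jobs := []jobData{
		{"flops_any": {1.0, 2.0}},
		{}, // job with no data — skipped, not fatal
	}
	log.Println(aggregate(jobs))
}
```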

View File

@@ -533,7 +533,9 @@ func (ccms *CCMetricStore) LoadStats(
 		metric := ccms.toLocalName(query.Metric)
 		data := res[0]
 		if data.Error != nil {
-			return nil, fmt.Errorf("METRICDATA/CCMS > fetching %s for node %s failed: %s", metric, query.Hostname, *data.Error)
+			log.Infof("fetching %s for node %s failed: %s", metric, query.Hostname, *data.Error)
+			continue
+			// return nil, fmt.Errorf("METRICDATA/CCMS > fetching %s for node %s failed: %s", metric, query.Hostname, *data.Error)
 		}
 		metricdata, ok := stats[metric]
@@ -543,7 +545,9 @@ func (ccms *CCMetricStore) LoadStats(
 		}
 		if data.Avg.IsNaN() || data.Min.IsNaN() || data.Max.IsNaN() {
-			return nil, fmt.Errorf("METRICDATA/CCMS > fetching %s for node %s failed: %s", metric, query.Hostname, "avg/min/max is NaN")
+			log.Infof("fetching %s for node %s failed: one of avg/min/max is NaN", metric, query.Hostname)
+			continue
+			// return nil, fmt.Errorf("METRICDATA/CCMS > fetching %s for node %s failed: %s", metric, query.Hostname, "avg/min/max is NaN")
 		}
 		metricdata[query.Hostname] = schema.MetricStatistics{
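Same pattern in LoadStats: hosts whose statistics come back as NaN are skipped rather than failing the call. A standalone sketch of the guard, with hypothetical data but the real math.IsNaN semantics:

```go
package main

import (
	"fmt"
	"math"
)

func main() {
	// host -> {avg, min, max}; node002 simulates incomplete store data.
	stats := map[string][3]float64{
		"node001": {2.5, 1.0, 4.0},
		"node002": {math.NaN(), 0, 0},
	}
	for host, s := range stats {
		avg, min, max := s[0], s[1], s[2]
		if math.IsNaN(avg) || math.IsNaN(min) || math.IsNaN(max) {
			fmt.Printf("stats for node %s: avg/min/max is NaN, skipping\n", host)
			continue
		}
		fmt.Printf("%s: avg=%.1f min=%.1f max=%.1f\n", host, avg, min, max)
	}
}
```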

View File

@@ -350,31 +350,16 @@ export function binsFromFootprint(weights, scope, values, numBins) {
         scopeWeights = weights.nodeHours
     }
-    const bins = new Array(numBins).fill(0)
+    const rawBins = new Array(numBins).fill(0)
     for (let i = 0; i < values.length; i++)
-        bins[Math.floor(((values[i] - min) / (max - min)) * numBins)] += scopeWeights ? scopeWeights[i] : 1
+        rawBins[Math.floor(((values[i] - min) / (max - min)) * numBins)] += scopeWeights ? scopeWeights[i] : 1
-    // Manual Canvas Original
-    // return {
-    //     label: idx => {
-    //         let start = min + (idx / numBins) * (max - min)
-    //         let stop = min + ((idx + 1) / numBins) * (max - min)
-    //         return `${formatNumber(start)} - ${formatNumber(stop)}`
-    //     },
-    //     bins: bins.map((count, idx) => ({ value: idx, count: count })),
-    //     min: min,
-    //     max: max
-    // }
+    const bins = rawBins.map((count, idx) => ({
+        value: Math.floor(min + ((idx + 1) / numBins) * (max - min)),
+        count: count
+    }))
     return {
-        bins: bins.map((count, idx) => ({
-            value: idx => {
-                // let start = min + (idx / numBins) * (max - min)
-                let stop = min + ((idx + 1) / numBins) * (max - min)
-                // return `${formatNumber(Math.floor((start+stop)/2))}`
-                return Math.floor(stop)
-            },
-            count: count
-        }))
+        bins: bins
+        // Use bins' max value instead of mean
     }
 }
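This refactor precomputes each bin's label as the bin's upper edge, min + ((idx+1)/numBins)·(max−min), instead of shipping a closure per bin. The same binning logic, sketched in Go with illustrative data and an explicit clamp added for v == max:

```go
package main

import (
	"fmt"
	"math"
)

func main() {
	values := []float64{1, 2, 2, 8, 9}
	min, max := 1.0, 9.0
	numBins := 4

	counts := make([]int, numBins)
	for _, v := range values {
		idx := int(((v - min) / (max - min)) * float64(numBins))
		if idx >= numBins {
			idx = numBins - 1 // clamp v == max into the last bin
		}
		counts[idx]++
	}
	// Each bin is labeled by its upper edge, as in the refactor above.
	for idx, count := range counts {
		upper := math.Floor(min + (float64(idx+1)/float64(numBins))*(max-min))
		fmt.Printf("value<=%v count=%d\n", upper, count)
	}
}
```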