Refactor directory structure

Jan Eitzinger
2022-06-21 17:52:36 +02:00
parent 45359cca9d
commit 81819db436
54 changed files with 767 additions and 454 deletions

pkg/log/log.go (new file, 125 lines added)

@@ -0,0 +1,125 @@
// Provides a simple way of logging with different levels.
// Time/date are not logged on purpose because systemd adds
// them for us.
//
// Uses these prefixes: https://www.freedesktop.org/software/systemd/man/sd-daemon.html
package log
import (
"fmt"
"io"
"log"
"os"
)
var (
DebugWriter io.Writer = os.Stderr
InfoWriter io.Writer = os.Stderr
WarnWriter io.Writer = os.Stderr
ErrWriter io.Writer = os.Stderr
)
var (
DebugPrefix string = "<7>[DEBUG] "
InfoPrefix string = "<6>[INFO] "
WarnPrefix string = "<4>[WARNING] "
ErrPrefix string = "<3>[ERROR] "
)
var (
DebugLog *log.Logger = log.New(DebugWriter, DebugPrefix, 0)
InfoLog *log.Logger = log.New(InfoWriter, InfoPrefix, 0)
WarnLog *log.Logger = log.New(WarnWriter, WarnPrefix, 0)
ErrLog *log.Logger = log.New(ErrWriter, ErrPrefix, 0)
)
func init() {
if lvl, ok := os.LookupEnv("LOGLEVEL"); ok {
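// Each case discards one more writer and falls through, so e.g.
// LOGLEVEL=warn silences info and debug output but keeps warnings and errors.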
switch lvl {
case "err", "fatal":
WarnWriter = io.Discard
fallthrough
case "warn":
InfoWriter = io.Discard
fallthrough
case "info":
DebugWriter = io.Discard
case "debug":
// Nothing to do...
default:
Warnf("environment variable LOGLEVEL has invalid value %#v", lvl)
}
}
}
func Debug(v ...interface{}) {
if DebugWriter != io.Discard {
DebugLog.Print(v...)
}
}
func Info(v ...interface{}) {
if InfoWriter != io.Discard {
InfoLog.Print(v...)
}
}
func Print(v ...interface{}) {
Info(v...)
}
func Warn(v ...interface{}) {
if WarnWriter != io.Discard {
WarnLog.Print(v...)
}
}
func Error(v ...interface{}) {
if ErrWriter != io.Discard {
ErrLog.Print(v...)
}
}
func Fatal(v ...interface{}) {
Error(v...)
os.Exit(1)
}
func Debugf(format string, v ...interface{}) {
if DebugWriter != io.Discard {
DebugLog.Printf(format, v...)
}
}
func Infof(format string, v ...interface{}) {
if InfoWriter != io.Discard {
InfoLog.Printf(format, v...)
}
}
func Printf(format string, v ...interface{}) {
Infof(format, v...)
}
func Finfof(w io.Writer, format string, v ...interface{}) {
if w != io.Discard {
// Write to the writer passed in, not to InfoWriter.
fmt.Fprintf(w, InfoPrefix+format+"\n", v...)
}
}
func Warnf(format string, v ...interface{}) {
if WarnWriter != io.Discard {
WarnLog.Printf(format, v...)
}
}
func Errorf(format string, v ...interface{}) {
if ErrWriter != io.Discard {
ErrLog.Printf(format, v...)
}
}
func Fatalf(format string, v ...interface{}) {
Errorf(format, v...)
os.Exit(1)
}
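For context, a minimal usage sketch (hypothetical caller; the import path assumes the cc-backend module layout after this refactor):

package main

import "github.com/ClusterCockpit/cc-backend/pkg/log"

func main() {
    // With LOGLEVEL=warn set in the environment, this call is discarded.
    log.Info("connected to database")

    // Written to stderr as: <4>[WARNING] job 42 has no resources
    log.Warnf("job %d has no resources", 42)
}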

pkg/schema/float.go (new file, 105 lines added)

@@ -0,0 +1,105 @@
package schema
import (
"errors"
"io"
"math"
"strconv"
)
// A custom float type is used so that (Un)MarshalJSON and
// (Un)MarshalGQL can be overloaded and NaN/null can be used.
// The default approach of putting every nullable value behind
// a pointer would have a bigger overhead.
type Float float64
var NaN Float = Float(math.NaN())
var nullAsBytes []byte = []byte("null")
func (f Float) IsNaN() bool {
return math.IsNaN(float64(f))
}
// NaN will be serialized to `null`.
func (f Float) MarshalJSON() ([]byte, error) {
if f.IsNaN() {
return nullAsBytes, nil
}
return strconv.AppendFloat(make([]byte, 0, 10), float64(f), 'f', 2, 64), nil
}
// `null` will be deserialized to NaN.
func (f *Float) UnmarshalJSON(input []byte) error {
s := string(input)
if s == "null" {
*f = NaN
return nil
}
val, err := strconv.ParseFloat(s, 64)
if err != nil {
return err
}
*f = Float(val)
return nil
}
// UnmarshalGQL implements the graphql.Unmarshaler interface.
func (f *Float) UnmarshalGQL(v interface{}) error {
f64, ok := v.(float64)
if !ok {
return errors.New("invalid Float scalar")
}
*f = Float(f64)
return nil
}
// MarshalGQL implements the graphql.Marshaler interface.
// NaN will be serialized to `null`.
func (f Float) MarshalGQL(w io.Writer) {
if f.IsNaN() {
w.Write(nullAsBytes)
} else {
w.Write(strconv.AppendFloat(make([]byte, 0, 10), float64(f), 'f', 2, 64))
}
}
// Only used via the REST API, not via GraphQL.
// This needs far fewer allocations per series,
// but it turns out that the performance gain
// from using it is not that big.
func (s *Series) MarshalJSON() ([]byte, error) {
buf := make([]byte, 0, 512+len(s.Data)*8)
buf = append(buf, `{"hostname":"`...)
buf = append(buf, s.Hostname...)
buf = append(buf, '"')
if s.Id != nil {
buf = append(buf, `,"id":`...)
buf = strconv.AppendInt(buf, int64(*s.Id), 10)
}
if s.Statistics != nil {
buf = append(buf, `,"statistics":{"min":`...)
buf = strconv.AppendFloat(buf, s.Statistics.Min, 'f', 2, 64)
buf = append(buf, `,"avg":`...)
buf = strconv.AppendFloat(buf, s.Statistics.Avg, 'f', 2, 64)
buf = append(buf, `,"max":`...)
buf = strconv.AppendFloat(buf, s.Statistics.Max, 'f', 2, 64)
buf = append(buf, '}')
}
buf = append(buf, `,"data":[`...)
for i := 0; i < len(s.Data); i++ {
if i != 0 {
buf = append(buf, ',')
}
if s.Data[i].IsNaN() {
buf = append(buf, `null`...)
} else {
buf = strconv.AppendFloat(buf, float64(s.Data[i]), 'f', 2, 32)
}
}
buf = append(buf, ']', '}')
return buf, nil
}
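A short sketch of the NaN/null round-trip this type enables (test-style snippet, assumed to live in the schema package):

package schema

import (
    "encoding/json"
    "fmt"
)

func ExampleFloat() {
    // NaN marshals to null; regular values are printed with two decimals.
    out, _ := json.Marshal([]Float{NaN, Float(3.14159)})
    fmt.Println(string(out)) // [null,3.14]

    // null unmarshals back to NaN.
    var in []Float
    _ = json.Unmarshal([]byte(`[null,1.5]`), &in)
    fmt.Println(in[0].IsNaN(), float64(in[1])) // true 1.5
}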

pkg/schema/job.go (new file, 135 lines added)

@@ -0,0 +1,135 @@
package schema
import (
"errors"
"fmt"
"io"
"time"
)
// Common subset of Job and JobMeta. Use one of those, not
// this type directly.
type BaseJob struct {
JobID int64 `json:"jobId" db:"job_id"`
User string `json:"user" db:"user"`
Project string `json:"project" db:"project"`
Cluster string `json:"cluster" db:"cluster"`
SubCluster string `json:"subCluster" db:"subcluster"`
Partition string `json:"partition" db:"partition"`
ArrayJobId int32 `json:"arrayJobId" db:"array_job_id"`
NumNodes int32 `json:"numNodes" db:"num_nodes"`
NumHWThreads int32 `json:"numHwthreads" db:"num_hwthreads"`
NumAcc int32 `json:"numAcc" db:"num_acc"`
Exclusive int32 `json:"exclusive" db:"exclusive"`
MonitoringStatus int32 `json:"monitoringStatus" db:"monitoring_status"`
SMT int32 `json:"smt" db:"smt"`
State JobState `json:"jobState" db:"job_state"`
Duration int32 `json:"duration" db:"duration"`
Walltime int64 `json:"walltime" db:"walltime"`
Tags []*Tag `json:"tags"`
RawResources []byte `json:"-" db:"resources"`
Resources []*Resource `json:"resources"`
RawMetaData []byte `json:"-" db:"meta_data"`
MetaData map[string]string `json:"metaData"`
}
// This type is used as the GraphQL interface and, via sqlx, as a table row.
type Job struct {
ID int64 `json:"id" db:"id"`
BaseJob
StartTimeUnix int64 `json:"-" db:"start_time"`
StartTime time.Time `json:"startTime"`
MemUsedMax float64 `json:"-" db:"mem_used_max"`
FlopsAnyAvg float64 `json:"-" db:"flops_any_avg"`
MemBwAvg float64 `json:"-" db:"mem_bw_avg"`
LoadAvg float64 `json:"-" db:"load_avg"`
NetBwAvg float64 `json:"-" db:"net_bw_avg"`
NetDataVolTotal float64 `json:"-" db:"net_data_vol_total"`
FileBwAvg float64 `json:"-" db:"file_bw_avg"`
FileDataVolTotal float64 `json:"-" db:"file_data_vol_total"`
}
// When reading from the database or sending data via GraphQL, the start time can be in the much more
// convenient time.Time type. In the `meta.json` files, the start time is encoded as a Unix epoch timestamp.
// This is why this struct exists: it contains all fields of the regular Job struct, but "overwrites"
// the StartTime field with one of type int64.
type JobMeta struct {
ID *int64 `json:"id,omitempty"` // never used in the job-archive, only available via REST-API
BaseJob
StartTime int64 `json:"startTime" db:"start_time"`
Statistics map[string]JobStatistics `json:"statistics,omitempty"`
}
const (
MonitoringStatusDisabled int32 = 0
MonitoringStatusRunningOrArchiving int32 = 1
MonitoringStatusArchivingFailed int32 = 2
MonitoringStatusArchivingSuccessful int32 = 3
)
var JobDefaults BaseJob = BaseJob{
Exclusive: 1,
MonitoringStatus: MonitoringStatusRunningOrArchiving,
}
type JobStatistics struct {
Unit string `json:"unit"`
Avg float64 `json:"avg"`
Min float64 `json:"min"`
Max float64 `json:"max"`
}
type Tag struct {
ID int64 `json:"id" db:"id"`
Type string `json:"type" db:"tag_type"`
Name string `json:"name" db:"tag_name"`
}
type Resource struct {
Hostname string `json:"hostname"`
HWThreads []int `json:"hwthreads,omitempty"`
Accelerators []string `json:"accelerators,omitempty"`
Configuration string `json:"configuration,omitempty"`
}
type JobState string
const (
JobStateRunning JobState = "running"
JobStateCompleted JobState = "completed"
JobStateFailed JobState = "failed"
JobStateCancelled JobState = "cancelled"
JobStateStopped JobState = "stopped"
JobStateTimeout JobState = "timeout"
JobStatePreempted JobState = "preempted"
JobStateOutOfMemory JobState = "out_of_memory"
)
func (e *JobState) UnmarshalGQL(v interface{}) error {
str, ok := v.(string)
if !ok {
return fmt.Errorf("enums must be strings")
}
*e = JobState(str)
if !e.Valid() {
return errors.New("invalid job state")
}
return nil
}
func (e JobState) MarshalGQL(w io.Writer) {
fmt.Fprintf(w, "\"%s\"", e)
}
func (e JobState) Valid() bool {
return e == JobStateRunning ||
e == JobStateCompleted ||
e == JobStateFailed ||
e == JobStateCancelled ||
e == JobStateStopped ||
e == JobStateTimeout ||
e == JobStatePreempted ||
e == JobStateOutOfMemory
}
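A sketch of how a `meta.json` record maps onto JobMeta (made-up field values, only a subset of fields shown):

package schema

import (
    "encoding/json"
    "fmt"
)

func ExampleJobMeta() {
    raw := []byte(`{
        "jobId": 1337,
        "user": "abc123",
        "cluster": "emmy",
        "numNodes": 4,
        "jobState": "completed",
        "startTime": 1655826756,
        "duration": 3600
    }`)

    var meta JobMeta
    if err := json.Unmarshal(raw, &meta); err != nil {
        panic(err)
    }

    // StartTime stays a plain Unix timestamp here, unlike Job.StartTime.
    fmt.Println(meta.JobID, meta.State, meta.StartTime)
    // Output: 1337 completed 1655826756
}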

pkg/schema/metrics.go (new file, 319 lines added)

@@ -0,0 +1,319 @@
package schema
import (
"fmt"
"io"
"math"
"sort"
"unsafe"
)
type JobData map[string]map[MetricScope]*JobMetric
type JobMetric struct {
Unit string `json:"unit"`
Scope MetricScope `json:"scope"`
Timestep int `json:"timestep"`
Series []Series `json:"series"`
StatisticsSeries *StatsSeries `json:"statisticsSeries"`
}
type Series struct {
Hostname string `json:"hostname"`
Id *int `json:"id,omitempty"`
Statistics *MetricStatistics `json:"statistics"`
Data []Float `json:"data"`
}
type MetricStatistics struct {
Avg float64 `json:"avg"`
Min float64 `json:"min"`
Max float64 `json:"max"`
}
type StatsSeries struct {
Mean []Float `json:"mean"`
Min []Float `json:"min"`
Max []Float `json:"max"`
Percentiles map[int][]Float `json:"percentiles,omitempty"`
}
type MetricScope string
const (
MetricScopeInvalid MetricScope = "invalid_scope"
MetricScopeNode MetricScope = "node"
MetricScopeSocket MetricScope = "socket"
MetricScopeMemoryDomain MetricScope = "memoryDomain"
MetricScopeCore MetricScope = "core"
MetricScopeHWThread MetricScope = "hwthread"
MetricScopeAccelerator MetricScope = "accelerator"
)
var metricScopeGranularity map[MetricScope]int = map[MetricScope]int{
MetricScopeNode: 10,
MetricScopeSocket: 5,
MetricScopeMemoryDomain: 3,
MetricScopeCore: 2,
MetricScopeHWThread: 1,
MetricScopeAccelerator: 5, // Special case; value chosen arbitrarily
MetricScopeInvalid: -1,
}
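// LT/LTE/Max compare scopes by granularity: a scope is "smaller" than
// another if it is finer-grained (hwthread < core < ... < node).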
func (e *MetricScope) LT(other MetricScope) bool {
a := metricScopeGranularity[*e]
b := metricScopeGranularity[other]
return a < b
}
func (e *MetricScope) LTE(other MetricScope) bool {
a := metricScopeGranularity[*e]
b := metricScopeGranularity[other]
return a <= b
}
func (e *MetricScope) Max(other MetricScope) MetricScope {
a := metricScopeGranularity[*e]
b := metricScopeGranularity[other]
if a > b {
return *e
}
return other
}
func (e *MetricScope) UnmarshalGQL(v interface{}) error {
str, ok := v.(string)
if !ok {
return fmt.Errorf("enums must be strings")
}
*e = MetricScope(str)
if !e.Valid() {
return fmt.Errorf("%s is not a valid MetricScope", str)
}
return nil
}
func (e MetricScope) MarshalGQL(w io.Writer) {
fmt.Fprintf(w, "\"%s\"", e)
}
func (e MetricScope) Valid() bool {
gran, ok := metricScopeGranularity[e]
return ok && gran > 0
}
func (jd *JobData) Size() int {
n := 128
for _, scopes := range *jd {
for _, metric := range scopes {
if metric.StatisticsSeries != nil {
n += len(metric.StatisticsSeries.Max)
n += len(metric.StatisticsSeries.Mean)
n += len(metric.StatisticsSeries.Min)
}
for _, series := range metric.Series {
n += len(series.Data)
}
}
}
return n * int(unsafe.Sizeof(Float(0)))
}
const smooth bool = false
func (jm *JobMetric) AddStatisticsSeries() {
if jm.StatisticsSeries != nil || len(jm.Series) < 4 {
return
}
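// Find the longest (n) and shortest (m) series; indices past the
// shortest series will be padded with NaN below.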
n, m := 0, len(jm.Series[0].Data)
for _, series := range jm.Series {
if len(series.Data) > n {
n = len(series.Data)
}
if len(series.Data) < m {
m = len(series.Data)
}
}
min, mean, max := make([]Float, n), make([]Float, n), make([]Float, n)
i := 0
for ; i < m; i++ {
smin, ssum, smax := math.MaxFloat32, 0.0, -math.MaxFloat32
notnan := 0
for j := 0; j < len(jm.Series); j++ {
x := float64(jm.Series[j].Data[i])
if math.IsNaN(x) {
continue
}
notnan += 1
ssum += x
smin = math.Min(smin, x)
smax = math.Max(smax, x)
}
if notnan < 3 {
min[i] = NaN
mean[i] = NaN
max[i] = NaN
} else {
min[i] = Float(smin)
mean[i] = Float(ssum / float64(notnan))
max[i] = Float(smax)
}
}
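// Beyond the shortest series, no aggregate can be computed.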
for ; i < n; i++ {
min[i] = NaN
mean[i] = NaN
max[i] = NaN
}
if smooth {
for i := 2; i < len(mean)-2; i++ {
if min[i].IsNaN() {
continue
}
min[i] = (min[i-2] + min[i-1] + min[i] + min[i+1] + min[i+2]) / 5
max[i] = (max[i-2] + max[i-1] + max[i] + max[i+1] + max[i+2]) / 5
mean[i] = (mean[i-2] + mean[i-1] + mean[i] + mean[i+1] + mean[i+2]) / 5
}
}
jm.StatisticsSeries = &StatsSeries{Mean: mean, Min: min, Max: max}
}
func (jd *JobData) AddNodeScope(metric string) bool {
scopes, ok := (*jd)[metric]
if !ok {
return false
}
var maxScope MetricScope = MetricScopeInvalid
for scope := range scopes {
maxScope = maxScope.Max(scope)
}
if maxScope == MetricScopeInvalid || maxScope == MetricScopeNode {
return false
}
jm := scopes[maxScope]
hosts := make(map[string][]Series, 32)
for _, series := range jm.Series {
hosts[series.Hostname] = append(hosts[series.Hostname], series)
}
nodeJm := &JobMetric{
Unit: jm.Unit,
Scope: MetricScopeNode,
Timestep: jm.Timestep,
Series: make([]Series, 0, len(hosts)),
}
for hostname, series := range hosts {
min, sum, max := math.MaxFloat32, 0.0, -math.MaxFloat32
for _, series := range series {
if series.Statistics == nil {
min, sum, max = math.NaN(), math.NaN(), math.NaN()
break
}
sum += series.Statistics.Avg
min = math.Min(min, series.Statistics.Min)
max = math.Max(max, series.Statistics.Max)
}
// Length bookkeeping and the per-timestep sum must only cover this
// host's series; jm.Series would mix in every other host as well.
n, m := 0, len(series[0].Data)
for _, s := range series {
if len(s.Data) > n {
n = len(s.Data)
}
if len(s.Data) < m {
m = len(s.Data)
}
}
i, data := 0, make([]Float, n)
for ; i < m; i++ {
x := Float(0.0)
for _, s := range series {
x += s.Data[i]
}
data[i] = x
}
for ; i < n; i++ {
data[i] = NaN
}
nodeJm.Series = append(nodeJm.Series, Series{
Hostname: hostname,
Statistics: &MetricStatistics{Min: min, Avg: sum / float64(len(series)), Max: max},
Data: data,
})
}
scopes[MetricScopeNode] = nodeJm
return true
}
func (jm *JobMetric) AddPercentiles(ps []int) bool {
if jm.StatisticsSeries == nil {
jm.AddStatisticsSeries()
}
// AddStatisticsSeries is a no-op for fewer than four series, so the
// statistics series may still be missing here.
if jm.StatisticsSeries == nil || len(jm.Series) < 3 {
return false
}
if jm.StatisticsSeries.Percentiles == nil {
jm.StatisticsSeries.Percentiles = make(map[int][]Float, len(ps))
}
n := 0
for _, series := range jm.Series {
if len(series.Data) > n {
n = len(series.Data)
}
}
data := make([][]float64, n)
for i := 0; i < n; i++ {
vals := make([]float64, 0, len(jm.Series))
for _, series := range jm.Series {
if i < len(series.Data) {
vals = append(vals, float64(series.Data[i]))
}
}
sort.Float64s(vals)
data[i] = vals
}
for _, p := range ps {
if p < 1 || p > 99 {
panic("invalid percentile")
}
if _, ok := jm.StatisticsSeries.Percentiles[p]; ok {
continue
}
percentiles := make([]Float, n)
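// Nearest-rank percentile over the values sorted per timestep.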
for i := 0; i < n; i++ {
sorted := data[i]
percentiles[i] = Float(sorted[(len(sorted)*p)/100])
}
jm.StatisticsSeries.Percentiles[p] = percentiles
}
return true
}
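Finally, a sketch tying the statistics helpers together (made-up data; at least four series, since AddStatisticsSeries is a no-op below that):

package schema

import "fmt"

func ExampleJobMetric_AddStatisticsSeries() {
    jm := &JobMetric{
        Unit:     "GF/s",
        Scope:    MetricScopeNode,
        Timestep: 30,
        Series: []Series{
            {Hostname: "n1", Data: []Float{1, 2, 3}},
            {Hostname: "n2", Data: []Float{2, 3, 4}},
            {Hostname: "n3", Data: []Float{3, 4, 5}},
            {Hostname: "n4", Data: []Float{4, 5, 6}},
        },
    }

    jm.AddStatisticsSeries()
    jm.AddPercentiles([]int{50})

    fmt.Println(jm.StatisticsSeries.Mean)            // [2.5 3.5 4.5]
    fmt.Println(jm.StatisticsSeries.Percentiles[50]) // [3 4 5] (nearest rank)
}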