mirror of
https://github.com/ClusterCockpit/cc-backend
synced 2026-01-15 17:21:46 +01:00
Remove embedded tagger rules
This commit is contained in:
@@ -1 +0,0 @@
|
||||
alf
|
||||
@@ -1,7 +0,0 @@
|
||||
calc_rate
|
||||
qmdffgen
|
||||
dynamic
|
||||
evbopt
|
||||
explore
|
||||
black_box
|
||||
poly_qmdff
|
||||
@@ -1,3 +0,0 @@
|
||||
chroma
|
||||
qdp
|
||||
qmp
|
||||
@@ -1 +0,0 @@
|
||||
cp2k
|
||||
@@ -1 +0,0 @@
|
||||
cpmd
|
||||
@@ -1 +0,0 @@
|
||||
flame
|
||||
@@ -1,3 +0,0 @@
|
||||
gromacs
|
||||
gmx
|
||||
mdrun
|
||||
@@ -1 +0,0 @@
|
||||
julia
|
||||
@@ -1 +0,0 @@
|
||||
lmp
|
||||
@@ -1 +0,0 @@
|
||||
matlab
|
||||
@@ -1 +0,0 @@
|
||||
openfoam
|
||||
@@ -1 +0,0 @@
|
||||
orca
|
||||
@@ -1,4 +0,0 @@
|
||||
python
|
||||
pip
|
||||
anaconda
|
||||
conda
|
||||
@@ -1,2 +0,0 @@
|
||||
starccm+
|
||||
-podkey
|
||||
@@ -1,10 +0,0 @@
|
||||
dscf
|
||||
grad
|
||||
ridft
|
||||
rdgrad
|
||||
ricc2
|
||||
statpt
|
||||
aoforce
|
||||
escf
|
||||
egrad
|
||||
odft
|
||||
@@ -1,2 +0,0 @@
|
||||
vasp
|
||||
VASP
|
||||
@@ -2,15 +2,16 @@
|
||||
// All rights reserved. This file is part of cc-backend.
|
||||
// Use of this source code is governed by a MIT-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
package tagger
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"embed"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"maps"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
"text/template"
|
||||
|
||||
@@ -23,8 +24,16 @@ import (
|
||||
"github.com/expr-lang/expr/vm"
|
||||
)
|
||||
|
||||
//go:embed jobclasses/*
|
||||
var jobClassFiles embed.FS
|
||||
const (
|
||||
// defaultJobClassConfigPath is the default path for job classification configuration
|
||||
defaultJobClassConfigPath = "./var/tagger/jobclasses"
|
||||
// tagTypeJobClass is the tag type identifier for job classification tags
|
||||
tagTypeJobClass = "jobClass"
|
||||
// jobClassConfigDirMatch is the directory name used for matching filesystem events
|
||||
jobClassConfigDirMatch = "jobclasses"
|
||||
// parametersFileName is the name of the parameters configuration file
|
||||
parametersFileName = "parameters.json"
|
||||
)
|
||||
|
||||
// Variable defines a named expression that can be computed and reused in rules.
|
||||
// Variables are evaluated before the main rule and their results are added to the environment.
|
||||
@@ -45,21 +54,21 @@ type ruleVariable struct {
|
||||
// and the final rule expression that determines if the job matches the classification.
|
||||
type RuleFormat struct {
|
||||
// Name is a human-readable description of the rule
|
||||
Name string `json:"name"`
|
||||
Name string `json:"name"`
|
||||
// Tag is the classification tag to apply if the rule matches
|
||||
Tag string `json:"tag"`
|
||||
Tag string `json:"tag"`
|
||||
// Parameters are shared values referenced in the rule (e.g., thresholds)
|
||||
Parameters []string `json:"parameters"`
|
||||
Parameters []string `json:"parameters"`
|
||||
// Metrics are the job metrics required for this rule (e.g., "cpu_load", "mem_used")
|
||||
Metrics []string `json:"metrics"`
|
||||
Metrics []string `json:"metrics"`
|
||||
// Requirements are boolean expressions that must be true for the rule to apply
|
||||
Requirements []string `json:"requirements"`
|
||||
Requirements []string `json:"requirements"`
|
||||
// Variables are computed values used in the rule expression
|
||||
Variables []Variable `json:"variables"`
|
||||
Variables []Variable `json:"variables"`
|
||||
// Rule is the boolean expression that determines if the job matches
|
||||
Rule string `json:"rule"`
|
||||
Rule string `json:"rule"`
|
||||
// Hint is a template string that generates a message when the rule matches
|
||||
Hint string `json:"hint"`
|
||||
Hint string `json:"hint"`
|
||||
}
|
||||
|
||||
type ruleInfo struct {
|
||||
@@ -75,29 +84,29 @@ type ruleInfo struct {
|
||||
// This interface allows for easier testing and decoupling from the concrete repository implementation.
|
||||
type JobRepository interface {
|
||||
// HasTag checks if a job already has a specific tag
|
||||
HasTag(jobId int64, tagType string, tagName string) bool
|
||||
HasTag(jobID int64, tagType string, tagName string) bool
|
||||
// AddTagOrCreateDirect adds a tag to a job or creates it if it doesn't exist
|
||||
AddTagOrCreateDirect(jobId int64, tagType string, tagName string) (tagId int64, err error)
|
||||
AddTagOrCreateDirect(jobID int64, tagType string, tagName string) (tagID int64, err error)
|
||||
// UpdateMetadata updates job metadata with a key-value pair
|
||||
UpdateMetadata(job *schema.Job, key, val string) (err error)
|
||||
}
|
||||
|
||||
// JobClassTagger classifies jobs based on configurable rules that evaluate job metrics and properties.
|
||||
// Rules are loaded from embedded JSON files and can be dynamically reloaded from a watched directory.
|
||||
// Rules are loaded from an external configuration directory and can be dynamically reloaded when files change.
|
||||
// When a job matches a rule, it is tagged with the corresponding classification and an optional hint message.
|
||||
type JobClassTagger struct {
|
||||
// rules maps classification tags to their compiled rule information
|
||||
rules map[string]ruleInfo
|
||||
rules map[string]ruleInfo
|
||||
// parameters are shared values (e.g., thresholds) used across multiple rules
|
||||
parameters map[string]any
|
||||
parameters map[string]any
|
||||
// tagType is the type of tag ("jobClass")
|
||||
tagType string
|
||||
tagType string
|
||||
// cfgPath is the path to watch for configuration changes
|
||||
cfgPath string
|
||||
cfgPath string
|
||||
// repo provides access to job database operations
|
||||
repo JobRepository
|
||||
repo JobRepository
|
||||
// getStatistics retrieves job statistics for analysis
|
||||
getStatistics func(job *schema.Job) (map[string]schema.JobStatistics, error)
|
||||
getStatistics func(job *schema.Job) (map[string]schema.JobStatistics, error)
|
||||
// getMetricConfig retrieves metric configuration (limits) for a cluster
|
||||
getMetricConfig func(cluster, subCluster string) map[string]*schema.Metric
|
||||
}
|
||||
@@ -169,7 +178,7 @@ func (t *JobClassTagger) prepareRule(b []byte, fns string) {
|
||||
// EventMatch checks if a filesystem event should trigger configuration reload.
|
||||
// It returns true if the event path contains "jobclasses".
|
||||
func (t *JobClassTagger) EventMatch(s string) bool {
|
||||
return strings.Contains(s, "jobclasses")
|
||||
return strings.Contains(s, jobClassConfigDirMatch)
|
||||
}
|
||||
|
||||
// EventCallback is triggered when the configuration directory changes.
|
||||
@@ -181,9 +190,10 @@ func (t *JobClassTagger) EventCallback() {
|
||||
cclog.Fatal(err)
|
||||
}
|
||||
|
||||
if util.CheckFileExists(t.cfgPath + "/parameters.json") {
|
||||
parametersFile := filepath.Join(t.cfgPath, parametersFileName)
|
||||
if util.CheckFileExists(parametersFile) {
|
||||
cclog.Info("Merge parameters")
|
||||
b, err := os.ReadFile(t.cfgPath + "/parameters.json")
|
||||
b, err := os.ReadFile(parametersFile)
|
||||
if err != nil {
|
||||
cclog.Warnf("prepareRule() > open file error: %v", err)
|
||||
}
|
||||
@@ -198,13 +208,13 @@ func (t *JobClassTagger) EventCallback() {
|
||||
|
||||
for _, fn := range files {
|
||||
fns := fn.Name()
|
||||
if fns != "parameters.json" {
|
||||
if fns != parametersFileName {
|
||||
cclog.Debugf("Process: %s", fns)
|
||||
filename := fmt.Sprintf("%s/%s", t.cfgPath, fns)
|
||||
filename := filepath.Join(t.cfgPath, fns)
|
||||
b, err := os.ReadFile(filename)
|
||||
if err != nil {
|
||||
cclog.Warnf("prepareRule() > open file error: %v", err)
|
||||
return
|
||||
continue
|
||||
}
|
||||
t.prepareRule(b, fns)
|
||||
}
|
||||
@@ -213,7 +223,8 @@ func (t *JobClassTagger) EventCallback() {
|
||||
|
||||
func (t *JobClassTagger) initParameters() error {
|
||||
cclog.Info("Initialize parameters")
|
||||
b, err := jobClassFiles.ReadFile("jobclasses/parameters.json")
|
||||
parametersFile := filepath.Join(t.cfgPath, parametersFileName)
|
||||
b, err := os.ReadFile(parametersFile)
|
||||
if err != nil {
|
||||
cclog.Warnf("prepareRule() > open file error: %v", err)
|
||||
return err
|
||||
@@ -227,13 +238,20 @@ func (t *JobClassTagger) initParameters() error {
|
||||
return nil
|
||||
}
|
||||
|
||||
// Register initializes the JobClassTagger by loading parameters and classification rules.
|
||||
// It loads embedded configuration files and sets up a file watch on ./var/tagger/jobclasses
|
||||
// if it exists, allowing for dynamic configuration updates without restarting the application.
|
||||
// Returns an error if the embedded configuration files cannot be read or parsed.
|
||||
// Register initializes the JobClassTagger by loading parameters and classification rules from external folder.
|
||||
// It sets up a file watch on ./var/tagger/jobclasses if it exists, allowing for
|
||||
// dynamic configuration updates without restarting the application.
|
||||
// Returns an error if the configuration path does not exist or cannot be read.
|
||||
func (t *JobClassTagger) Register() error {
|
||||
t.cfgPath = "./var/tagger/jobclasses"
|
||||
t.tagType = "jobClass"
|
||||
if t.cfgPath == "" {
|
||||
t.cfgPath = defaultJobClassConfigPath
|
||||
}
|
||||
t.tagType = tagTypeJobClass
|
||||
t.rules = make(map[string]ruleInfo)
|
||||
|
||||
if !util.CheckFileExists(t.cfgPath) {
|
||||
return fmt.Errorf("configuration path does not exist: %s", t.cfgPath)
|
||||
}
|
||||
|
||||
err := t.initParameters()
|
||||
if err != nil {
|
||||
@@ -241,31 +259,28 @@ func (t *JobClassTagger) Register() error {
|
||||
return err
|
||||
}
|
||||
|
||||
files, err := jobClassFiles.ReadDir("jobclasses")
|
||||
files, err := os.ReadDir(t.cfgPath)
|
||||
if err != nil {
|
||||
return fmt.Errorf("error reading app folder: %#v", err)
|
||||
return fmt.Errorf("error reading jobclasses folder: %#v", err)
|
||||
}
|
||||
t.rules = make(map[string]ruleInfo)
|
||||
|
||||
for _, fn := range files {
|
||||
fns := fn.Name()
|
||||
if fns != "parameters.json" {
|
||||
filename := fmt.Sprintf("jobclasses/%s", fns)
|
||||
if fns != parametersFileName {
|
||||
cclog.Infof("Process: %s", fns)
|
||||
filename := filepath.Join(t.cfgPath, fns)
|
||||
|
||||
b, err := jobClassFiles.ReadFile(filename)
|
||||
b, err := os.ReadFile(filename)
|
||||
if err != nil {
|
||||
cclog.Warnf("prepareRule() > open file error: %v", err)
|
||||
return err
|
||||
continue
|
||||
}
|
||||
t.prepareRule(b, fns)
|
||||
}
|
||||
}
|
||||
|
||||
if util.CheckFileExists(t.cfgPath) {
|
||||
t.EventCallback()
|
||||
cclog.Infof("Setup file watch for %s", t.cfgPath)
|
||||
util.AddListener(t.cfgPath, t)
|
||||
}
|
||||
cclog.Infof("Setup file watch for %s", t.cfgPath)
|
||||
util.AddListener(t.cfgPath, t)
|
||||
|
||||
t.repo = repository.GetJobRepository()
|
||||
t.getStatistics = archive.GetStatistics
|
||||
|
||||
@@ -13,13 +13,13 @@ type MockJobRepository struct {
|
||||
mock.Mock
|
||||
}
|
||||
|
||||
func (m *MockJobRepository) HasTag(jobId int64, tagType string, tagName string) bool {
|
||||
args := m.Called(jobId, tagType, tagName)
|
||||
func (m *MockJobRepository) HasTag(jobID int64, tagType string, tagName string) bool {
|
||||
args := m.Called(jobID, tagType, tagName)
|
||||
return args.Bool(0)
|
||||
}
|
||||
|
||||
func (m *MockJobRepository) AddTagOrCreateDirect(jobId int64, tagType string, tagName string) (tagId int64, err error) {
|
||||
args := m.Called(jobId, tagType, tagName)
|
||||
func (m *MockJobRepository) AddTagOrCreateDirect(jobID int64, tagType string, tagName string) (tagID int64, err error) {
|
||||
args := m.Called(jobID, tagType, tagName)
|
||||
return args.Get(0).(int64), args.Error(1)
|
||||
}
|
||||
|
||||
|
||||
@@ -7,9 +7,7 @@ package tagger
|
||||
|
||||
import (
|
||||
"bufio"
|
||||
"embed"
|
||||
"fmt"
|
||||
"io/fs"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"regexp"
|
||||
@@ -21,8 +19,14 @@ import (
|
||||
"github.com/ClusterCockpit/cc-lib/v2/util"
|
||||
)
|
||||
|
||||
//go:embed apps/*
|
||||
var appFiles embed.FS
|
||||
const (
|
||||
// defaultConfigPath is the default path for application tagging configuration
|
||||
defaultConfigPath = "./var/tagger/apps"
|
||||
// tagTypeApp is the tag type identifier for application tags
|
||||
tagTypeApp = "app"
|
||||
// configDirMatch is the directory name used for matching filesystem events
|
||||
configDirMatch = "apps"
|
||||
)
|
||||
|
||||
type appInfo struct {
|
||||
tag string
|
||||
@@ -30,19 +34,19 @@ type appInfo struct {
|
||||
}
|
||||
|
||||
// AppTagger detects applications by matching patterns in job scripts.
|
||||
// It loads application patterns from embedded files and can dynamically reload
|
||||
// configuration from a watched directory. When a job script matches a pattern,
|
||||
// It loads application patterns from an external configuration directory and can dynamically reload
|
||||
// configuration when files change. When a job script matches a pattern,
|
||||
// the corresponding application tag is automatically applied.
|
||||
type AppTagger struct {
|
||||
// apps maps application tags to their matching patterns
|
||||
apps map[string]appInfo
|
||||
apps map[string]appInfo
|
||||
// tagType is the type of tag ("app")
|
||||
tagType string
|
||||
// cfgPath is the path to watch for configuration changes
|
||||
cfgPath string
|
||||
}
|
||||
|
||||
func (t *AppTagger) scanApp(f fs.File, fns string) {
|
||||
func (t *AppTagger) scanApp(f *os.File, fns string) {
|
||||
scanner := bufio.NewScanner(f)
|
||||
ai := appInfo{tag: strings.TrimSuffix(fns, filepath.Ext(fns)), strings: make([]string, 0)}
|
||||
|
||||
@@ -56,7 +60,7 @@ func (t *AppTagger) scanApp(f fs.File, fns string) {
|
||||
// EventMatch checks if a filesystem event should trigger configuration reload.
|
||||
// It returns true if the event path contains "apps".
|
||||
func (t *AppTagger) EventMatch(s string) bool {
|
||||
return strings.Contains(s, "apps")
|
||||
return strings.Contains(s, configDirMatch)
|
||||
}
|
||||
|
||||
// EventCallback is triggered when the configuration directory changes.
|
||||
@@ -71,43 +75,50 @@ func (t *AppTagger) EventCallback() {
|
||||
for _, fn := range files {
|
||||
fns := fn.Name()
|
||||
cclog.Debugf("Process: %s", fns)
|
||||
f, err := os.Open(fmt.Sprintf("%s/%s", t.cfgPath, fns))
|
||||
f, err := os.Open(filepath.Join(t.cfgPath, fns))
|
||||
if err != nil {
|
||||
cclog.Errorf("error opening app file %s: %#v", fns, err)
|
||||
continue
|
||||
}
|
||||
t.scanApp(f, fns)
|
||||
f.Close()
|
||||
}
|
||||
}
|
||||
|
||||
// Register initializes the AppTagger by loading application patterns from embedded files.
|
||||
// It also sets up a file watch on ./var/tagger/apps if it exists, allowing for
|
||||
// Register initializes the AppTagger by loading application patterns from external folder.
|
||||
// It sets up a file watch on ./var/tagger/apps if it exists, allowing for
|
||||
// dynamic configuration updates without restarting the application.
|
||||
// Returns an error if the embedded application files cannot be read.
|
||||
// Returns an error if the configuration path does not exist or cannot be read.
|
||||
func (t *AppTagger) Register() error {
|
||||
t.cfgPath = "./var/tagger/apps"
|
||||
t.tagType = "app"
|
||||
if t.cfgPath == "" {
|
||||
t.cfgPath = defaultConfigPath
|
||||
}
|
||||
t.tagType = tagTypeApp
|
||||
t.apps = make(map[string]appInfo, 0)
|
||||
|
||||
files, err := appFiles.ReadDir("apps")
|
||||
if !util.CheckFileExists(t.cfgPath) {
|
||||
return fmt.Errorf("configuration path does not exist: %s", t.cfgPath)
|
||||
}
|
||||
|
||||
files, err := os.ReadDir(t.cfgPath)
|
||||
if err != nil {
|
||||
return fmt.Errorf("error reading app folder: %#v", err)
|
||||
}
|
||||
t.apps = make(map[string]appInfo, 0)
|
||||
|
||||
for _, fn := range files {
|
||||
fns := fn.Name()
|
||||
cclog.Debugf("Process: %s", fns)
|
||||
f, err := appFiles.Open(fmt.Sprintf("apps/%s", fns))
|
||||
f, err := os.Open(filepath.Join(t.cfgPath, fns))
|
||||
if err != nil {
|
||||
return fmt.Errorf("error opening app file %s: %#v", fns, err)
|
||||
cclog.Errorf("error opening app file %s: %#v", fns, err)
|
||||
continue
|
||||
}
|
||||
defer f.Close()
|
||||
t.scanApp(f, fns)
|
||||
f.Close()
|
||||
}
|
||||
|
||||
if util.CheckFileExists(t.cfgPath) {
|
||||
t.EventCallback()
|
||||
cclog.Infof("Setup file watch for %s", t.cfgPath)
|
||||
util.AddListener(t.cfgPath, t)
|
||||
}
|
||||
cclog.Infof("Setup file watch for %s", t.cfgPath)
|
||||
util.AddListener(t.cfgPath, t)
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
@@ -5,6 +5,8 @@
|
||||
package tagger
|
||||
|
||||
import (
|
||||
"os"
|
||||
"path/filepath"
|
||||
"testing"
|
||||
|
||||
"github.com/ClusterCockpit/cc-backend/internal/repository"
|
||||
@@ -29,28 +31,88 @@ func noErr(tb testing.TB, err error) {
|
||||
}
|
||||
}
|
||||
|
||||
func TestRegister(t *testing.T) {
|
||||
var tagger AppTagger
|
||||
func setupAppTaggerTestDir(t *testing.T) string {
|
||||
t.Helper()
|
||||
|
||||
err := tagger.Register()
|
||||
testDir := t.TempDir()
|
||||
appsDir := filepath.Join(testDir, "apps")
|
||||
err := os.MkdirAll(appsDir, 0o755)
|
||||
noErr(t, err)
|
||||
|
||||
srcDir := "../../configs/tagger/apps"
|
||||
files, err := os.ReadDir(srcDir)
|
||||
noErr(t, err)
|
||||
|
||||
for _, file := range files {
|
||||
if file.IsDir() {
|
||||
continue
|
||||
}
|
||||
srcPath := filepath.Join(srcDir, file.Name())
|
||||
dstPath := filepath.Join(appsDir, file.Name())
|
||||
|
||||
data, err := os.ReadFile(srcPath)
|
||||
noErr(t, err)
|
||||
|
||||
err = os.WriteFile(dstPath, data, 0o644)
|
||||
noErr(t, err)
|
||||
}
|
||||
|
||||
return appsDir
|
||||
}
|
||||
|
||||
func TestRegister(t *testing.T) {
|
||||
appsDir := setupAppTaggerTestDir(t)
|
||||
|
||||
var tagger AppTagger
|
||||
tagger.cfgPath = appsDir
|
||||
tagger.tagType = tagTypeApp
|
||||
tagger.apps = make(map[string]appInfo, 0)
|
||||
|
||||
files, err := os.ReadDir(appsDir)
|
||||
noErr(t, err)
|
||||
|
||||
for _, fn := range files {
|
||||
if fn.IsDir() {
|
||||
continue
|
||||
}
|
||||
fns := fn.Name()
|
||||
f, err := os.Open(filepath.Join(appsDir, fns))
|
||||
noErr(t, err)
|
||||
tagger.scanApp(f, fns)
|
||||
f.Close()
|
||||
}
|
||||
|
||||
if len(tagger.apps) != 16 {
|
||||
t.Errorf("wrong summary for diagnostic \ngot: %d \nwant: 16", len(tagger.apps))
|
||||
}
|
||||
}
|
||||
|
||||
func TestMatch(t *testing.T) {
|
||||
appsDir := setupAppTaggerTestDir(t)
|
||||
r := setup(t)
|
||||
|
||||
job, err := r.FindByIDDirect(317)
|
||||
noErr(t, err)
|
||||
|
||||
var tagger AppTagger
|
||||
tagger.cfgPath = appsDir
|
||||
tagger.tagType = tagTypeApp
|
||||
tagger.apps = make(map[string]appInfo, 0)
|
||||
|
||||
err = tagger.Register()
|
||||
files, err := os.ReadDir(appsDir)
|
||||
noErr(t, err)
|
||||
|
||||
for _, fn := range files {
|
||||
if fn.IsDir() {
|
||||
continue
|
||||
}
|
||||
fns := fn.Name()
|
||||
f, err := os.Open(filepath.Join(appsDir, fns))
|
||||
noErr(t, err)
|
||||
tagger.scanApp(f, fns)
|
||||
f.Close()
|
||||
}
|
||||
|
||||
tagger.Match(job)
|
||||
|
||||
if !r.HasTag(317, "app", "vasp") {
|
||||
|
||||
@@ -1,26 +0,0 @@
|
||||
{
|
||||
"name": "Excessive CPU load",
|
||||
"tag": "excessiveload",
|
||||
"parameters": [
|
||||
"excessivecpuload_threshold_factor",
|
||||
"job_min_duration_seconds",
|
||||
"sampling_interval_seconds"
|
||||
],
|
||||
"metrics": ["cpu_load"],
|
||||
"requirements": [
|
||||
"job.shared == \"none\"",
|
||||
"job.duration > job_min_duration_seconds"
|
||||
],
|
||||
"variables": [
|
||||
{
|
||||
"name": "load_threshold",
|
||||
"expr": "cpu_load.limits.peak * excessivecpuload_threshold_factor"
|
||||
},
|
||||
{
|
||||
"name": "load_perc",
|
||||
"expr": "1.0 - (cpu_load.avg / cpu_load.limits.peak)"
|
||||
}
|
||||
],
|
||||
"rule": "cpu_load.avg > load_threshold",
|
||||
"hint": "This job was detected as excessiveload because the average cpu load {{.cpu_load.avg}} falls above the threshold {{.load_threshold}}."
|
||||
}
|
||||
@@ -1,22 +0,0 @@
|
||||
{
|
||||
"name": "Low ressource utilization",
|
||||
"tag": "lowutilization",
|
||||
"parameters": ["job_min_duration_seconds"],
|
||||
"metrics": ["flops_any", "mem_bw"],
|
||||
"requirements": [
|
||||
"job.shared == \"none\"",
|
||||
"job.duration > job_min_duration_seconds"
|
||||
],
|
||||
"variables": [
|
||||
{
|
||||
"name": "mem_bw_perc",
|
||||
"expr": "1.0 - (mem_bw.avg / mem_bw.limits.peak)"
|
||||
},
|
||||
{
|
||||
"name": "flops_any_perc",
|
||||
"expr": "1.0 - (flops_any.avg / flops_any.limits.peak)"
|
||||
}
|
||||
],
|
||||
"rule": "flops_any.avg < flops_any.limits.alert && mem_bw.avg < mem_bw.limits.alert",
|
||||
"hint": "This job was detected as low utilization because the average flop rate {{.flops_any.avg}} falls below the threshold {{.flops_any.limits.alert}}."
|
||||
}
|
||||
@@ -1,26 +0,0 @@
|
||||
{
|
||||
"name": "Low CPU load",
|
||||
"tag": "lowload",
|
||||
"parameters": [
|
||||
"lowcpuload_threshold_factor",
|
||||
"job_min_duration_seconds",
|
||||
"sampling_interval_seconds"
|
||||
],
|
||||
"metrics": ["cpu_load"],
|
||||
"requirements": [
|
||||
"job.shared == \"none\"",
|
||||
"job.duration > job_min_duration_seconds"
|
||||
],
|
||||
"variables": [
|
||||
{
|
||||
"name": "load_threshold",
|
||||
"expr": "job.numCores * lowcpuload_threshold_factor"
|
||||
},
|
||||
{
|
||||
"name": "load_perc",
|
||||
"expr": "1.0 - (cpu_load.avg / cpu_load.limits.peak)"
|
||||
}
|
||||
],
|
||||
"rule": "cpu_load.avg < cpu_load.limits.caution",
|
||||
"hint": "This job was detected as lowload because the average cpu load {{.cpu_load}} falls below the threshold {{.cpu_load.limits.caution}}."
|
||||
}
|
||||
@@ -1,14 +0,0 @@
|
||||
{
|
||||
"lowcpuload_threshold_factor": 0.9,
|
||||
"excessivecpuload_threshold_factor": 1.1,
|
||||
"highmemoryusage_threshold_factor": 0.9,
|
||||
"node_load_imbalance_threshold_factor": 0.1,
|
||||
"core_load_imbalance_threshold_factor": 0.1,
|
||||
"high_memory_load_threshold_factor": 0.9,
|
||||
"lowgpuload_threshold_factor": 0.7,
|
||||
"memory_leak_slope_threshold": 0.1,
|
||||
"job_min_duration_seconds": 600.0,
|
||||
"sampling_interval_seconds": 30.0,
|
||||
"cpu_load_pre_cutoff_samples": 11.0,
|
||||
"cpu_load_core_pre_cutoff_samples": 6.0
|
||||
}
|
||||
Reference in New Issue
Block a user