mirror of
https://github.com/ClusterCockpit/cc-metric-collector.git
synced 2026-06-30 15:20:38 +02:00
fix: support SLUID-based Slurm cgroup directories
This commit is contained in:
committed by
Thomas Gruber
parent
14d624fd22
commit
aaabcc639f
@@ -7,6 +7,7 @@ import (
|
|||||||
"os/exec"
|
"os/exec"
|
||||||
"os/user"
|
"os/user"
|
||||||
"path/filepath"
|
"path/filepath"
|
||||||
|
"regexp"
|
||||||
"strconv"
|
"strconv"
|
||||||
"strings"
|
"strings"
|
||||||
"time"
|
"time"
|
||||||
@@ -45,6 +46,37 @@ type SlurmCgroupCollector struct {
|
|||||||
|
|
||||||
const defaultCgroupBase = "/sys/fs/cgroup/system.slice/slurmstepd.scope"
|
const defaultCgroupBase = "/sys/fs/cgroup/system.slice/slurmstepd.scope"
|
||||||
|
|
||||||
|
var (
	// Slurm cgroup v2 directory layout:
	//   - Slurm <= 25.11: job_<numeric job id>
	//   - Slurm >= 26.05: SLUID, encoded as "s" + 13 Crockford Base32 characters
	//
	// jobIDDirRE matches the legacy form: the literal prefix "job_" followed
	// by one or more ASCII digits and nothing else.
	jobIDDirRE = regexp.MustCompile(`^job_[0-9]+$`)

	// sluidDirRE matches the SLUID form in either letter case. The character
	// class is the Crockford Base32 alphabet, i.e. digits plus letters without
	// I, L, O and U.
	//
	// Both cases are listed explicitly instead of using the (?i) flag: Go
	// applies Unicode simple case folding under (?i), which would also accept
	// non-ASCII fold equivalents such as U+017F (long s) for "s" and U+212A
	// (Kelvin sign) for "K".
	sluidDirRE = regexp.MustCompile(`^[sS][0-9A-HJKMNP-TV-Za-hjkmnp-tv-z]{13}$`)
)
|
||||||
|
|
||||||
|
func (m *SlurmCgroupCollector) findSlurmJobDirs() ([]string, error) {
|
||||||
|
entries, err := os.ReadDir(m.cgroupBase)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
|
||||||
|
jobDirs := make([]string, 0)
|
||||||
|
|
||||||
|
for _, entry := range entries {
|
||||||
|
if !entry.IsDir() {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
|
||||||
|
name := entry.Name()
|
||||||
|
|
||||||
|
if jobIDDirRE.MatchString(name) || sluidDirRE.MatchString(name) {
|
||||||
|
jobDirs = append(jobDirs, filepath.Join(m.cgroupBase, name))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return jobDirs, nil
|
||||||
|
}
|
||||||
|
|
||||||
func ParseCPUs(cpuset string) ([]int, error) {
|
func ParseCPUs(cpuset string) ([]int, error) {
|
||||||
var result []int
|
var result []int
|
||||||
if cpuset == "" {
|
if cpuset == "" {
|
||||||
@@ -237,10 +269,9 @@ func (m *SlurmCgroupCollector) Read(interval time.Duration, output chan lp.CCMes
|
|||||||
delete(m.cpuUsed, k)
|
delete(m.cpuUsed, k)
|
||||||
}
|
}
|
||||||
|
|
||||||
globPattern := filepath.Join(m.cgroupBase, "job_*")
|
jobDirs, err := m.findSlurmJobDirs()
|
||||||
jobDirs, err := filepath.Glob(globPattern)
|
|
||||||
if err != nil {
|
if err != nil {
|
||||||
cclog.ComponentError(m.name, "Error globbing job directories:", err.Error())
|
cclog.ComponentError(m.name, "Error reading job directories:", err.Error())
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user