* Remove unused code
* Use cclog for logging
* Wrap errors so that they can be unwrapped (see the error-wrapping sketch after this list)
* Revert wrong use of slices.Delete()
* Fix derivative values: they should be float
* Apply suggestions from the gocritic linter
* Fix inamedparam finding: interface method AddChannel must have all named params (see the interface sketch after this list)
* Enable linter: errorlint
* Replace fmt.Sprintf("%d", i) with strconv.Itoa(i) for improved performance
* Correct misspelled words
* Break up very long lines into multiple lines
* Rename lp.NewMessage to lp.NewMetric
* Preallocate slices of known length (see the preallocation sketch after this list)
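
The error-wrapping and errorlint items boil down to the same discipline: build error chains with fmt.Errorf and the %w verb, then inspect them with errors.Is/errors.As instead of string matching or type assertions. A minimal, self-contained sketch (the path is only an example):

    package main

    import (
        "errors"
        "fmt"
        "io/fs"
        "os"
    )

    // readOnline wraps the underlying error with %w so callers can unwrap it.
    func readOnline(path string) ([]byte, error) {
        data, err := os.ReadFile(path)
        if err != nil {
            // %v would flatten err to text; %w keeps it in the chain.
            return nil, fmt.Errorf("failed to read file %q: %w", path, err)
        }
        return data, nil
    }

    func main() {
        _, err := readOnline("/no/such/file")
        // errors.Is walks the wrapped chain, so the sentinel stays visible.
        if errors.Is(err, fs.ErrNotExist) {
            fmt.Println("missing file:", err)
        }
        // errorlint likewise prefers errors.As over direct type assertions.
        var pathErr *fs.PathError
        if errors.As(err, &pathErr) {
            fmt.Println("failing path:", pathErr.Path)
        }
    }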
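
For the inamedparam finding: the linter requires every parameter of an interface method to carry a name, since names document intent at the call site. The AddChannel signature below is hypothetical; only the before/after shape matters:

    // Before (flagged by inamedparam): parameter types only.
    type SinkBefore interface {
        AddChannel(string, chan int) error
    }

    // After: every parameter is named (the names here are illustrative).
    type SinkAfter interface {
        AddChannel(name string, messages chan int) error
    }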
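
For the preallocation item: when the final length is known, sizing the slice up front avoids the repeated grow-and-copy that append otherwise performs. A generic sketch, not code from this repository:

    package main

    import "fmt"

    func main() {
        // Growing from zero: append may reallocate and copy several times.
        squares := []int{}
        for i := 0; i < 1000; i++ {
            squares = append(squares, i*i)
        }

        // Length known up front: a single allocation, direct indexing.
        pre := make([]int, 1000)
        for i := range pre {
            pre[i] = i * i
        }
        fmt.Println(squares[999] == pre[999]) // true
    }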
Author: Holger Obermaier
Date: 2026-02-13 09:36:14 +01:00
Committed by: GitHub
Parent: b69281dae6
Commit: 9908d76aac
29 changed files with 311 additions and 395 deletions


@@ -79,9 +79,10 @@ func ParseCPUs(cpuset string) ([]int, error) {
 }
 
 func GetAllCPUs() ([]int, error) {
-    data, err := os.ReadFile("/sys/devices/system/cpu/online")
+    cpuOnline := "/sys/devices/system/cpu/online"
+    data, err := os.ReadFile(cpuOnline)
     if err != nil {
-        return nil, fmt.Errorf("failed to read /sys/devices/system/cpu/online: %v", err)
+        return nil, fmt.Errorf("failed to read file \"%s\": %w", cpuOnline, err)
     }
     return ParseCPUs(strings.TrimSpace(string(data)))
 }
@@ -106,16 +107,18 @@ func (m *SlurmCgroupCollector) Init(config json.RawMessage) error {
return fmt.Errorf("%s Init(): setup() call failed: %w", m.name, err)
}
m.parallel = true
m.meta = map[string]string{"source": m.name, "group": "SLURM"}
m.tags = map[string]string{"type": "hwthread"}
m.meta = map[string]string{
"source": m.name,
"group": "SLURM"}
m.tags = map[string]string{
"type": "hwthread"}
m.cpuUsed = make(map[int]bool)
m.cgroupBase = defaultCgroupBase
if len(config) > 0 {
err = json.Unmarshal(config, &m.config)
if err != nil {
cclog.ComponentError(m.name, "Error reading config:", err.Error())
return err
return fmt.Errorf("%s Init(): Error reading JSON config: %w", m.name, err)
}
m.excludeMetrics = make(map[string]struct{})
for _, metric := range m.config.ExcludeMetrics {
@@ -130,19 +133,16 @@ func (m *SlurmCgroupCollector) Init(config json.RawMessage) error {
     if !m.useSudo {
         user, err := user.Current()
         if err != nil {
-            cclog.ComponentError(m.name, "Failed to get current user:", err.Error())
-            return err
+            return fmt.Errorf("%s Init(): Failed to get current user: %w", m.name, err)
         }
         if user.Uid != "0" {
-            cclog.ComponentError(m.name, "Reading cgroup files requires root privileges (or enable use_sudo in config)")
-            return fmt.Errorf("not root")
+            return fmt.Errorf("%s Init(): Reading cgroup files requires root privileges (or enable use_sudo in config)", m.name)
         }
     }
     m.allCPUs, err = GetAllCPUs()
     if err != nil {
-        cclog.ComponentError(m.name, "Error reading online CPUs:", err.Error())
-        return err
+        return fmt.Errorf("%s Init(): Error reading online CPUs: %w", m.name, err)
     }
 
     m.init = true
@@ -159,7 +159,9 @@ func (m *SlurmCgroupCollector) ReadJobData(jobdir string) (SlurmJobData, error)
         CpuSet: []int{},
     }
-    cg := func(f string) string { return filepath.Join(m.cgroupBase, jobdir, f) }
+    cg := func(f string) string {
+        return filepath.Join(m.cgroupBase, jobdir, f)
+    }
 
     memUsage, err := m.readFile(cg("memory.current"))
     if err == nil {
@@ -208,8 +210,8 @@ func (m *SlurmCgroupCollector) ReadJobData(jobdir string) (SlurmJobData, error)
         }
     }
 
     if usageUsec > 0 {
-        jobdata.CpuUsageUser = (userUsec * 100 / usageUsec)
-        jobdata.CpuUsageSys = (systemUsec * 100 / usageUsec)
+        jobdata.CpuUsageUser = (userUsec * 100.0 / usageUsec)
+        jobdata.CpuUsageSys = (systemUsec * 100.0 / usageUsec)
     }
 }
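
An aside on the 100 -> 100.0 change in the hunk above: the constant only changes the result together with floating-point operands. The declarations of userUsec and usageUsec are not shown here, so assuming they are float64, 100.0 simply spells out the floating-point intent; with integer operands both spellings would still truncate, as this sketch shows:

    package main

    import "fmt"

    func main() {
        // Integer operands: the untyped constant 100.0 is converted to
        // int64, so the division still truncates.
        var userUsec, usageUsec int64 = 1, 3
        fmt.Println(userUsec * 100.0 / usageUsec) // 33

        // float64 operands keep the fractional part.
        var u, t float64 = 1, 3
        fmt.Println(u * 100.0 / t) // 33.333333333333336
    }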
@@ -252,12 +254,18 @@ func (m *SlurmCgroupCollector) Read(interval time.Duration, output chan lp.CCMes
         for _, cpu := range jobdata.CpuSet {
             coreTags := map[string]string{
                 "type": "hwthread",
-                "type-id": fmt.Sprintf("%d", cpu),
+                "type-id": strconv.Itoa(cpu),
             }
             if coreCount > 0 && !m.isExcluded("job_mem_used") {
                 memPerCore := jobdata.MemoryUsage / coreCount
-                if y, err := lp.NewMessage("job_mem_used", coreTags, m.meta, map[string]any{"value": memPerCore}, timestamp); err == nil {
+                if y, err := lp.NewMessage(
+                    "job_mem_used",
+                    coreTags,
+                    m.meta,
+                    map[string]any{
+                        "value": memPerCore},
+                    timestamp); err == nil {
                     y.AddMeta("unit", "Bytes")
                     output <- y
                 }
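
On the fmt.Sprintf -> strconv.Itoa swap in the hunk above: both calls return the same string, but Itoa skips format-string parsing and the boxing of the argument into an interface value, which is why it benchmarks faster on hot paths. A quick sketch:

    package main

    import (
        "fmt"
        "strconv"
    )

    func main() {
        cpu := 42
        a := fmt.Sprintf("%d", cpu) // parses "%d", boxes cpu into any
        b := strconv.Itoa(cpu)      // direct int-to-string conversion
        fmt.Println(a == b, b)      // true 42
    }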
@@ -265,7 +273,13 @@ func (m *SlurmCgroupCollector) Read(interval time.Duration, output chan lp.CCMes
if coreCount > 0 && !m.isExcluded("job_max_mem_used") {
maxMemPerCore := jobdata.MaxMemoryUsage / coreCount
if y, err := lp.NewMessage("job_max_mem_used", coreTags, m.meta, map[string]any{"value": maxMemPerCore}, timestamp); err == nil {
if y, err := lp.NewMessage(
"job_max_mem_used",
coreTags,
m.meta,
map[string]any{
"value": maxMemPerCore},
timestamp); err == nil {
y.AddMeta("unit", "Bytes")
output <- y
}
@@ -273,7 +287,13 @@ func (m *SlurmCgroupCollector) Read(interval time.Duration, output chan lp.CCMes
if coreCount > 0 && !m.isExcluded("job_mem_limit") {
limitPerCore := jobdata.LimitMemoryUsage / coreCount
if y, err := lp.NewMessage("job_mem_limit", coreTags, m.meta, map[string]any{"value": limitPerCore}, timestamp); err == nil {
if y, err := lp.NewMessage(
"job_mem_limit",
coreTags,
m.meta,
map[string]any{
"value": limitPerCore},
timestamp); err == nil {
y.AddMeta("unit", "Bytes")
output <- y
}
@@ -281,7 +301,13 @@ func (m *SlurmCgroupCollector) Read(interval time.Duration, output chan lp.CCMes
if coreCount > 0 && !m.isExcluded("job_user_cpu") {
cpuUserPerCore := jobdata.CpuUsageUser / coreCount
if y, err := lp.NewMessage("job_user_cpu", coreTags, m.meta, map[string]any{"value": cpuUserPerCore}, timestamp); err == nil {
if y, err := lp.NewMessage(
"job_user_cpu",
coreTags,
m.meta,
map[string]any{
"value": cpuUserPerCore},
timestamp); err == nil {
y.AddMeta("unit", "%")
output <- y
}
@@ -289,7 +315,13 @@ func (m *SlurmCgroupCollector) Read(interval time.Duration, output chan lp.CCMes
if coreCount > 0 && !m.isExcluded("job_sys_cpu") {
cpuSysPerCore := jobdata.CpuUsageSys / coreCount
if y, err := lp.NewMessage("job_sys_cpu", coreTags, m.meta, map[string]any{"value": cpuSysPerCore}, timestamp); err == nil {
if y, err := lp.NewMessage(
"job_sys_cpu",
coreTags,
m.meta,
map[string]any{
"value": cpuSysPerCore},
timestamp); err == nil {
y.AddMeta("unit", "%")
output <- y
}
@@ -304,25 +336,43 @@ func (m *SlurmCgroupCollector) Read(interval time.Duration, output chan lp.CCMes
         if !m.cpuUsed[cpu] {
             coreTags := map[string]string{
                 "type": "hwthread",
-                "type-id": fmt.Sprintf("%d", cpu),
+                "type-id": strconv.Itoa(cpu),
             }
             if !m.isExcluded("job_mem_used") {
-                if y, err := lp.NewMessage("job_mem_used", coreTags, m.meta, map[string]any{"value": 0}, timestamp); err == nil {
+                if y, err := lp.NewMessage(
+                    "job_mem_used",
+                    coreTags,
+                    m.meta,
+                    map[string]any{
+                        "value": 0},
+                    timestamp); err == nil {
                     y.AddMeta("unit", "Bytes")
                     output <- y
                 }
             }
             if !m.isExcluded("job_max_mem_used") {
-                if y, err := lp.NewMessage("job_max_mem_used", coreTags, m.meta, map[string]any{"value": 0}, timestamp); err == nil {
+                if y, err := lp.NewMessage(
+                    "job_max_mem_used",
+                    coreTags,
+                    m.meta,
+                    map[string]any{
+                        "value": 0},
+                    timestamp); err == nil {
                     y.AddMeta("unit", "Bytes")
                     output <- y
                 }
             }
             if !m.isExcluded("job_mem_limit") {
-                if y, err := lp.NewMessage("job_mem_limit", coreTags, m.meta, map[string]any{"value": 0}, timestamp); err == nil {
+                if y, err := lp.NewMessage(
+                    "job_mem_limit",
+                    coreTags,
+                    m.meta,
+                    map[string]any{
+                        "value": 0},
+                    timestamp); err == nil {
                     y.AddMeta("unit", "Bytes")
                     output <- y
                 }