Mirror of https://github.com/ClusterCockpit/cc-metric-collector.git (synced 2025-10-07 23:04:32 +02:00)

Commit 69d4567ecf (parent c5183feafc): add support for passwordless sudo
Committed by Thomas Gruber
@@ -4,6 +4,8 @@ import (
 	"encoding/json"
 	"fmt"
 	"os"
+	"os/exec"
+	"os/user"
 	"path/filepath"
 	"strconv"
 	"strings"
@@ -25,6 +27,7 @@ type SlurmJobData struct {
 type SlurmCgroupsConfig struct {
 	CgroupBase     string   `json:"cgroup_base"`
 	ExcludeMetrics []string `json:"exclude_metrics,omitempty"`
+	UseSudo        bool     `json:"use_sudo,omitempty"`
 }

 type SlurmCgroupCollector struct {
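
The new `UseSudo` field maps to a `use_sudo` key in the collector's JSON configuration. A minimal sketch of such a config, assuming the collector is registered under the `slurm_cgroup` key (values are illustrative, not taken from this commit):

```json
{
  "slurm_cgroup": {
    "cgroup_base": "/sys/fs/cgroup/system.slice/slurmstepd.scope",
    "exclude_metrics": ["job_sys_cpu", "job_mem_limit"],
    "use_sudo": true
  }
}
```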
@@ -36,6 +39,7 @@ type SlurmCgroupCollector struct {
 	cpuUsed        map[int]bool
 	cgroupBase     string
 	excludeMetrics map[string]struct{}
+	useSudo        bool
 }

 const defaultCgroupBase = "/sys/fs/cgroup/system.slice/slurmstepd.scope"
@@ -88,6 +92,14 @@ func (m *SlurmCgroupCollector) isExcluded(metric string) bool {
 	return found
 }

+func (m *SlurmCgroupCollector) readFile(path string) ([]byte, error) {
+	if m.useSudo {
+		cmd := exec.Command("sudo", "cat", path)
+		return cmd.Output()
+	}
+	return os.ReadFile(path)
+}
+
 func (m *SlurmCgroupCollector) Init(config json.RawMessage) error {
 	var err error
 	m.name = "SlurmCgroupCollector"
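
The new `readFile` helper shells out to `sudo cat <path>` when `use_sudo` is enabled and otherwise falls back to a plain `os.ReadFile`. Note that this only works if the account running cc-metric-collector may run `cat` via sudo without a password prompt, e.g. through a sudoers rule along the lines of `ccuser ALL=(root) NOPASSWD: /usr/bin/cat` (the user name and `cat` path are assumptions for illustration, not part of this commit).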
@@ -96,24 +108,36 @@ func (m *SlurmCgroupCollector) Init(config json.RawMessage) error {
 	m.meta = map[string]string{"source": m.name, "group": "SLURM"}
 	m.tags = map[string]string{"type": "hwthread"}
 	m.cpuUsed = make(map[int]bool)

 	m.cgroupBase = defaultCgroupBase

 	if len(config) > 0 {
 		err = json.Unmarshal(config, &m.config)
-		m.excludeMetrics = make(map[string]struct{})
-		for _, metric := range m.config.ExcludeMetrics {
-			m.excludeMetrics[metric] = struct{}{}
-		}
 		if err != nil {
 			cclog.ComponentError(m.name, "Error reading config:", err.Error())
 			return err
 		}
+		m.excludeMetrics = make(map[string]struct{})
+		for _, metric := range m.config.ExcludeMetrics {
+			m.excludeMetrics[metric] = struct{}{}
+		}
 		if m.config.CgroupBase != "" {
 			m.cgroupBase = m.config.CgroupBase
 		}
 	}

+	m.useSudo = m.config.UseSudo
+	if !m.useSudo {
+		user, err := user.Current()
+		if err != nil {
+			cclog.ComponentError(m.name, "Failed to get current user:", err.Error())
+			return err
+		}
+		if user.Uid != "0" {
+			cclog.ComponentError(m.name, "Reading cgroup files requires root privileges (or enable use_sudo in config)")
+			return fmt.Errorf("not root")
+		}
+	}
+
 	m.allCPUs, err = GetAllCPUs()
 	if err != nil {
 		cclog.ComponentError(m.name, "Error reading online CPUs:", err.Error())
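
Init now enforces the root requirement only when `use_sudo` is disabled. A minimal, standalone sketch of that privilege check (a plain `main` program for illustration, not the collector itself):

```go
package main

import (
	"fmt"
	"os/user"
)

func main() {
	// Same idea as the check added in Init: without use_sudo,
	// direct cgroup reads only work for UID 0.
	u, err := user.Current()
	if err != nil {
		fmt.Println("failed to determine current user:", err)
		return
	}
	if u.Uid != "0" {
		fmt.Println("not root: enable use_sudo or run the collector as root")
		return
	}
	fmt.Println("running as root: cgroup files can be read directly")
}
```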
@@ -136,7 +160,7 @@ func (m *SlurmCgroupCollector) ReadJobData(jobdir string) (SlurmJobData, error)

 	cg := func(f string) string { return filepath.Join(m.cgroupBase, jobdir, f) }

-	memUsage, err := os.ReadFile(cg("memory.current"))
+	memUsage, err := m.readFile(cg("memory.current"))
 	if err == nil {
 		x, err := strconv.ParseFloat(strings.TrimSpace(string(memUsage)), 64)
 		if err == nil {
@@ -144,7 +168,7 @@ func (m *SlurmCgroupCollector) ReadJobData(jobdir string) (SlurmJobData, error)
 		}
 	}

-	maxMem, err := os.ReadFile(cg("memory.peak"))
+	maxMem, err := m.readFile(cg("memory.peak"))
 	if err == nil {
 		x, err := strconv.ParseFloat(strings.TrimSpace(string(maxMem)), 64)
 		if err == nil {
@@ -152,7 +176,7 @@ func (m *SlurmCgroupCollector) ReadJobData(jobdir string) (SlurmJobData, error)
 		}
 	}

-	limitMem, err := os.ReadFile(cg("memory.max"))
+	limitMem, err := m.readFile(cg("memory.max"))
 	if err == nil {
 		x, err := strconv.ParseFloat(strings.TrimSpace(string(limitMem)), 64)
 		if err == nil {
@@ -160,7 +184,7 @@ func (m *SlurmCgroupCollector) ReadJobData(jobdir string) (SlurmJobData, error)
 		}
 	}

-	cpuStat, err := os.ReadFile(cg("cpu.stat"))
+	cpuStat, err := m.readFile(cg("cpu.stat"))
 	if err == nil {
 		lines := strings.Split(strings.TrimSpace(string(cpuStat)), "\n")
 		var usageUsec, userUsec, systemUsec float64
@@ -188,7 +212,7 @@ func (m *SlurmCgroupCollector) ReadJobData(jobdir string) (SlurmJobData, error)
 		}
 	}

-	cpuSet, err := os.ReadFile(cg("cpuset.cpus"))
+	cpuSet, err := m.readFile(cg("cpuset.cpus"))
 	if err == nil {
 		cpus, err := ParseCPUs(strings.TrimSpace(string(cpuSet)))
 		if err == nil {
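
All cgroup reads in ReadJobData now go through the sudo-aware helper. A self-contained sketch of that read-and-parse pattern (the helper is re-implemented here for illustration; the job directory `job_12345` is hypothetical):

```go
package main

import (
	"fmt"
	"os"
	"os/exec"
	"strconv"
	"strings"
)

// readCgroupFile mirrors the collector's readFile helper: use "sudo cat"
// when enabled, otherwise read the file directly.
func readCgroupFile(path string, useSudo bool) ([]byte, error) {
	if useSudo {
		return exec.Command("sudo", "cat", path).Output()
	}
	return os.ReadFile(path)
}

func main() {
	// Hypothetical job cgroup path, following <cgroup_base>/<jobdir>/<file>.
	path := "/sys/fs/cgroup/system.slice/slurmstepd.scope/job_12345/memory.current"
	raw, err := readCgroupFile(path, true)
	if err != nil {
		fmt.Println("read failed:", err)
		return
	}
	memUsage, err := strconv.ParseFloat(strings.TrimSpace(string(raw)), 64)
	if err == nil {
		fmt.Printf("job memory usage: %.0f bytes\n", memUsage)
	}
}
```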
@@ -288,24 +312,28 @@ func (m *SlurmCgroupCollector) Read(interval time.Duration, output chan lp.CCMes
 				output <- y
 			}
 		}

 		if !m.isExcluded("job_max_mem_used") {
 			if y, err := lp.NewMessage("job_max_mem_used", coreTags, m.meta, map[string]interface{}{"value": 0}, timestamp); err == nil {
 				y.AddMeta("unit", "Bytes")
 				output <- y
 			}
 		}

 		if !m.isExcluded("job_mem_limit") {
 			if y, err := lp.NewMessage("job_mem_limit", coreTags, m.meta, map[string]interface{}{"value": 0}, timestamp); err == nil {
 				y.AddMeta("unit", "Bytes")
 				output <- y
 			}
 		}

 		if !m.isExcluded("job_user_cpu") {
 			if y, err := lp.NewMessage("job_user_cpu", coreTags, m.meta, map[string]interface{}{"value": 0}, timestamp); err == nil {
 				y.AddMeta("unit", "%")
 				output <- y
 			}
 		}

 		if !m.isExcluded("job_sys_cpu") {
 			if y, err := lp.NewMessage("job_sys_cpu", coreTags, m.meta, map[string]interface{}{"value": 0}, timestamp); err == nil {
 				y.AddMeta("unit", "%")
@@ -314,8 +342,8 @@ func (m *SlurmCgroupCollector) Read(interval time.Duration, output chan lp.CCMes
 				output <- y
 			}
 		}
 	}

 }

 func (m *SlurmCgroupCollector) Close() {
 	m.init = false
 }

@@ -21,12 +21,14 @@ The `slurm_cgroup` collector reads job-specific resource metrics from the cgroup
   "exclude_metrics": [
     "job_sys_cpu",
     "job_mem_limit"
-  ]
+  ],
+  "use_sudo": false
 }
 ```

 * The `cgroup_base` parameter (optional) can be set to specify the root path to SLURM job cgroups. The default is `/sys/fs/cgroup/system.slice/slurmstepd.scope`.
 * The `exclude_metrics` array can be used to suppress individual metrics from being sent to the sink.
+* The cgroups metrics are only available for root users. If password-less sudo is configured, you can enable sudo in the configuration.

 ### Reported metrics

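
A quick way to verify the setup (not part of this commit): from the account that runs the collector, try a non-interactive read such as `sudo -n cat <cgroup_base>/<some job dir>/memory.current`; if that prompts for a password or fails, enabling `use_sudo` will not help until the sudoers configuration is fixed.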