mirror of
https://github.com/ClusterCockpit/cc-metric-collector.git
synced 2026-05-14 01:07:29 +02:00
Compare commits
1 Commits
main
...
nvidiaColl
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
0a1c7a9911 |
@@ -27,7 +27,6 @@ const CPUSTATFILE = `/proc/stat`
|
||||
|
||||
type CpustatCollectorConfig struct {
|
||||
ExcludeMetrics []string `json:"exclude_metrics,omitempty"`
|
||||
excludeNumCPUs bool
|
||||
}
|
||||
|
||||
type CpustatCollector struct {
|
||||
@@ -80,7 +79,6 @@ func (m *CpustatCollector) Init(config json.RawMessage) error {
|
||||
m.matches[match] = index
|
||||
}
|
||||
}
|
||||
m.config.excludeNumCPUs = slices.Contains(m.config.ExcludeMetrics, "num_cpus")
|
||||
|
||||
// Check input file
|
||||
file, err := os.Open(CPUSTATFILE)
|
||||
@@ -97,13 +95,11 @@ func (m *CpustatCollector) Init(config json.RawMessage) error {
|
||||
line := scanner.Text()
|
||||
linefields := strings.Fields(line)
|
||||
if strings.Compare(linefields[0], "cpu") == 0 {
|
||||
// Kernel system statistics for all CPUs
|
||||
m.olddata["cpu"] = make(map[string]int64)
|
||||
for k, v := range m.matches {
|
||||
m.olddata["cpu"][k], _ = strconv.ParseInt(linefields[v], 0, 64)
|
||||
}
|
||||
} else if strings.HasPrefix(linefields[0], "cpu") && strings.Compare(linefields[0], "cpu") != 0 {
|
||||
// Kernel system statistics per CPU
|
||||
cpustr := strings.TrimLeft(linefields[0], "cpu")
|
||||
cpu, _ := strconv.Atoi(cpustr)
|
||||
m.cputags[linefields[0]] = map[string]string{
|
||||
@@ -195,11 +191,10 @@ func (m *CpustatCollector) Read(interval time.Duration, output chan lp.CCMessage
|
||||
}
|
||||
}
|
||||
|
||||
if !m.config.excludeNumCPUs {
|
||||
if num_cpus_metric, err := lp.NewMetric("num_cpus", m.nodetags, m.meta, num_cpus, now); err == nil {
|
||||
num_cpus_metric, err := lp.NewMetric("num_cpus", m.nodetags, m.meta, num_cpus, now)
|
||||
if err == nil {
|
||||
output <- num_cpus_metric
|
||||
}
|
||||
}
|
||||
|
||||
m.lastTimestamp = now
|
||||
}
|
||||
|
||||
@@ -72,8 +72,7 @@ func getStats(filename string) map[string]MemstatStats {
|
||||
for scanner.Scan() {
|
||||
line := scanner.Text()
|
||||
linefields := strings.Fields(line)
|
||||
switch len(linefields) {
|
||||
case 3:
|
||||
if len(linefields) == 3 {
|
||||
v, err := strconv.ParseFloat(linefields[1], 64)
|
||||
if err == nil {
|
||||
stats[strings.Trim(linefields[0], ":")] = MemstatStats{
|
||||
@@ -81,7 +80,7 @@ func getStats(filename string) map[string]MemstatStats {
|
||||
unit: linefields[2],
|
||||
}
|
||||
}
|
||||
case 5:
|
||||
} else if len(linefields) == 5 {
|
||||
v, err := strconv.ParseFloat(linefields[3], 64)
|
||||
if err == nil {
|
||||
cclog.ComponentDebug("getStats", strings.Trim(linefields[2], ":"), v, linefields[4])
|
||||
@@ -107,10 +106,7 @@ func (m *MemstatCollector) Init(config json.RawMessage) error {
|
||||
return fmt.Errorf("%s Init(): Error decoding JSON config: %w", m.name, err)
|
||||
}
|
||||
}
|
||||
m.meta = map[string]string{
|
||||
"source": m.name,
|
||||
"group": "Memory",
|
||||
}
|
||||
m.meta = map[string]string{"source": m.name, "group": "Memory"}
|
||||
m.stats = make(map[string]int64)
|
||||
m.matches = make(map[string]string)
|
||||
m.tags = map[string]string{"type": "node"}
|
||||
@@ -149,7 +145,7 @@ func (m *MemstatCollector) Init(config json.RawMessage) error {
|
||||
"KernelStack": "mem_kernelstack",
|
||||
}
|
||||
for k, v := range matches {
|
||||
if !slices.Contains(m.config.ExcludeMetrics, v) {
|
||||
if !slices.Contains(m.config.ExcludeMetrics, k) {
|
||||
m.matches[k] = v
|
||||
}
|
||||
}
|
||||
@@ -157,7 +153,7 @@ func (m *MemstatCollector) Init(config json.RawMessage) error {
|
||||
if !slices.Contains(m.config.ExcludeMetrics, "mem_used") {
|
||||
m.sendMemUsed = true
|
||||
}
|
||||
if len(m.matches) == 0 && !m.sendMemUsed {
|
||||
if len(m.matches) == 0 {
|
||||
return fmt.Errorf("%s Init(): no metrics to collect", m.name)
|
||||
}
|
||||
if err := m.setup(); err != nil {
|
||||
|
||||
@@ -1115,6 +1115,31 @@ func readNVLinkStats(device *NvidiaCollectorDevice, output chan lp.CCMessage) er
|
||||
return nil
|
||||
}
|
||||
|
||||
func readEfficiency(device *NvidiaCollectorDevice, output chan lp.CCMessage) error {
|
||||
if !device.excludeMetrics["nv_util_eff"] {
|
||||
maxPower, ret := nvml.DeviceGetEnforcedPowerLimit(device.device)
|
||||
if ret == nvml.SUCCESS {
|
||||
curPower, ret := nvml.DeviceGetPowerUsage(device.device)
|
||||
if ret == nvml.SUCCESS {
|
||||
util, ret := nvml.DeviceGetUtilizationRates(device.device)
|
||||
if ret == nvml.SUCCESS {
|
||||
factor := float64(curPower) / float64(maxPower)
|
||||
eff := uint32(float64(util.Gpu) * factor)
|
||||
if eff > 100 {
|
||||
eff = 100
|
||||
}
|
||||
y, err := lp.NewMetric("nv_util_eff", device.tags, device.meta, eff, time.Now())
|
||||
if err == nil {
|
||||
y.AddTag("unit", "percent")
|
||||
output <- y
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func (m *NvidiaCollector) Read(interval time.Duration, output chan lp.CCMessage) {
|
||||
var err error
|
||||
if !m.init {
|
||||
@@ -1220,6 +1245,11 @@ func (m *NvidiaCollector) Read(interval time.Duration, output chan lp.CCMessage)
|
||||
if err != nil {
|
||||
cclog.ComponentDebug(m.name, "readNVLinkStats for device", name, "failed")
|
||||
}
|
||||
|
||||
err = readEfficiency(device, output)
|
||||
if err != nil {
|
||||
cclog.ComponentDebug(m.name, "readEfficiency for device", name, "failed")
|
||||
}
|
||||
}
|
||||
|
||||
// Actual read loop over all attached Nvidia GPUs
|
||||
|
||||
@@ -85,5 +85,6 @@ Metrics:
|
||||
* `nv_energy`
|
||||
* `nv_energy_abs`
|
||||
* `nv_average_power`
|
||||
* `nv_util_eff` (`nv_util` * (`nv_power_usage` / `nv_power_max_limit`))
|
||||
|
||||
Some metrics add an additional sub-type tag (`stype`); for example, the `nv_nvlink_*` metrics set `stype=nvlink,stype-id=<link_number>`.
|
||||
|
||||
Reference in New Issue
Block a user