mirror of
https://github.com/ClusterCockpit/cc-metric-collector.git
synced 2026-05-15 09:47:29 +02:00
Compare commits
1 Commits
main
...
nvidiaColl
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
0a1c7a9911 |
@@ -27,7 +27,6 @@ const CPUSTATFILE = `/proc/stat`
|
|||||||
|
|
||||||
type CpustatCollectorConfig struct {
|
type CpustatCollectorConfig struct {
|
||||||
ExcludeMetrics []string `json:"exclude_metrics,omitempty"`
|
ExcludeMetrics []string `json:"exclude_metrics,omitempty"`
|
||||||
excludeNumCPUs bool
|
|
||||||
}
|
}
|
||||||
|
|
||||||
type CpustatCollector struct {
|
type CpustatCollector struct {
|
||||||
@@ -80,7 +79,6 @@ func (m *CpustatCollector) Init(config json.RawMessage) error {
|
|||||||
m.matches[match] = index
|
m.matches[match] = index
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
m.config.excludeNumCPUs = slices.Contains(m.config.ExcludeMetrics, "num_cpus")
|
|
||||||
|
|
||||||
// Check input file
|
// Check input file
|
||||||
file, err := os.Open(CPUSTATFILE)
|
file, err := os.Open(CPUSTATFILE)
|
||||||
@@ -97,13 +95,11 @@ func (m *CpustatCollector) Init(config json.RawMessage) error {
|
|||||||
line := scanner.Text()
|
line := scanner.Text()
|
||||||
linefields := strings.Fields(line)
|
linefields := strings.Fields(line)
|
||||||
if strings.Compare(linefields[0], "cpu") == 0 {
|
if strings.Compare(linefields[0], "cpu") == 0 {
|
||||||
// Kernel system statistics for all CPUs
|
|
||||||
m.olddata["cpu"] = make(map[string]int64)
|
m.olddata["cpu"] = make(map[string]int64)
|
||||||
for k, v := range m.matches {
|
for k, v := range m.matches {
|
||||||
m.olddata["cpu"][k], _ = strconv.ParseInt(linefields[v], 0, 64)
|
m.olddata["cpu"][k], _ = strconv.ParseInt(linefields[v], 0, 64)
|
||||||
}
|
}
|
||||||
} else if strings.HasPrefix(linefields[0], "cpu") && strings.Compare(linefields[0], "cpu") != 0 {
|
} else if strings.HasPrefix(linefields[0], "cpu") && strings.Compare(linefields[0], "cpu") != 0 {
|
||||||
// Kernel system statistics per CPU
|
|
||||||
cpustr := strings.TrimLeft(linefields[0], "cpu")
|
cpustr := strings.TrimLeft(linefields[0], "cpu")
|
||||||
cpu, _ := strconv.Atoi(cpustr)
|
cpu, _ := strconv.Atoi(cpustr)
|
||||||
m.cputags[linefields[0]] = map[string]string{
|
m.cputags[linefields[0]] = map[string]string{
|
||||||
@@ -195,11 +191,10 @@ func (m *CpustatCollector) Read(interval time.Duration, output chan lp.CCMessage
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if !m.config.excludeNumCPUs {
|
num_cpus_metric, err := lp.NewMetric("num_cpus", m.nodetags, m.meta, num_cpus, now)
|
||||||
if num_cpus_metric, err := lp.NewMetric("num_cpus", m.nodetags, m.meta, num_cpus, now); err == nil {
|
if err == nil {
|
||||||
output <- num_cpus_metric
|
output <- num_cpus_metric
|
||||||
}
|
}
|
||||||
}
|
|
||||||
|
|
||||||
m.lastTimestamp = now
|
m.lastTimestamp = now
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -72,8 +72,7 @@ func getStats(filename string) map[string]MemstatStats {
|
|||||||
for scanner.Scan() {
|
for scanner.Scan() {
|
||||||
line := scanner.Text()
|
line := scanner.Text()
|
||||||
linefields := strings.Fields(line)
|
linefields := strings.Fields(line)
|
||||||
switch len(linefields) {
|
if len(linefields) == 3 {
|
||||||
case 3:
|
|
||||||
v, err := strconv.ParseFloat(linefields[1], 64)
|
v, err := strconv.ParseFloat(linefields[1], 64)
|
||||||
if err == nil {
|
if err == nil {
|
||||||
stats[strings.Trim(linefields[0], ":")] = MemstatStats{
|
stats[strings.Trim(linefields[0], ":")] = MemstatStats{
|
||||||
@@ -81,7 +80,7 @@ func getStats(filename string) map[string]MemstatStats {
|
|||||||
unit: linefields[2],
|
unit: linefields[2],
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
case 5:
|
} else if len(linefields) == 5 {
|
||||||
v, err := strconv.ParseFloat(linefields[3], 64)
|
v, err := strconv.ParseFloat(linefields[3], 64)
|
||||||
if err == nil {
|
if err == nil {
|
||||||
cclog.ComponentDebug("getStats", strings.Trim(linefields[2], ":"), v, linefields[4])
|
cclog.ComponentDebug("getStats", strings.Trim(linefields[2], ":"), v, linefields[4])
|
||||||
@@ -107,10 +106,7 @@ func (m *MemstatCollector) Init(config json.RawMessage) error {
|
|||||||
return fmt.Errorf("%s Init(): Error decoding JSON config: %w", m.name, err)
|
return fmt.Errorf("%s Init(): Error decoding JSON config: %w", m.name, err)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
m.meta = map[string]string{
|
m.meta = map[string]string{"source": m.name, "group": "Memory"}
|
||||||
"source": m.name,
|
|
||||||
"group": "Memory",
|
|
||||||
}
|
|
||||||
m.stats = make(map[string]int64)
|
m.stats = make(map[string]int64)
|
||||||
m.matches = make(map[string]string)
|
m.matches = make(map[string]string)
|
||||||
m.tags = map[string]string{"type": "node"}
|
m.tags = map[string]string{"type": "node"}
|
||||||
@@ -149,7 +145,7 @@ func (m *MemstatCollector) Init(config json.RawMessage) error {
|
|||||||
"KernelStack": "mem_kernelstack",
|
"KernelStack": "mem_kernelstack",
|
||||||
}
|
}
|
||||||
for k, v := range matches {
|
for k, v := range matches {
|
||||||
if !slices.Contains(m.config.ExcludeMetrics, v) {
|
if !slices.Contains(m.config.ExcludeMetrics, k) {
|
||||||
m.matches[k] = v
|
m.matches[k] = v
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -157,7 +153,7 @@ func (m *MemstatCollector) Init(config json.RawMessage) error {
|
|||||||
if !slices.Contains(m.config.ExcludeMetrics, "mem_used") {
|
if !slices.Contains(m.config.ExcludeMetrics, "mem_used") {
|
||||||
m.sendMemUsed = true
|
m.sendMemUsed = true
|
||||||
}
|
}
|
||||||
if len(m.matches) == 0 && !m.sendMemUsed {
|
if len(m.matches) == 0 {
|
||||||
return fmt.Errorf("%s Init(): no metrics to collect", m.name)
|
return fmt.Errorf("%s Init(): no metrics to collect", m.name)
|
||||||
}
|
}
|
||||||
if err := m.setup(); err != nil {
|
if err := m.setup(); err != nil {
|
||||||
|
|||||||
@@ -1115,6 +1115,31 @@ func readNVLinkStats(device *NvidiaCollectorDevice, output chan lp.CCMessage) er
|
|||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func readEfficiency(device *NvidiaCollectorDevice, output chan lp.CCMessage) error {
|
||||||
|
if !device.excludeMetrics["nv_util_eff"] {
|
||||||
|
maxPower, ret := nvml.DeviceGetEnforcedPowerLimit(device.device)
|
||||||
|
if ret == nvml.SUCCESS {
|
||||||
|
curPower, ret := nvml.DeviceGetPowerUsage(device.device)
|
||||||
|
if ret == nvml.SUCCESS {
|
||||||
|
util, ret := nvml.DeviceGetUtilizationRates(device.device)
|
||||||
|
if ret == nvml.SUCCESS {
|
||||||
|
factor := float64(curPower) / float64(maxPower)
|
||||||
|
eff := uint32(float64(util.Gpu) * factor)
|
||||||
|
if eff > 100 {
|
||||||
|
eff = 100
|
||||||
|
}
|
||||||
|
y, err := lp.NewMetric("nv_util_eff", device.tags, device.meta, eff, time.Now())
|
||||||
|
if err == nil {
|
||||||
|
y.AddTag("unit", "percent")
|
||||||
|
output <- y
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
func (m *NvidiaCollector) Read(interval time.Duration, output chan lp.CCMessage) {
|
func (m *NvidiaCollector) Read(interval time.Duration, output chan lp.CCMessage) {
|
||||||
var err error
|
var err error
|
||||||
if !m.init {
|
if !m.init {
|
||||||
@@ -1220,6 +1245,11 @@ func (m *NvidiaCollector) Read(interval time.Duration, output chan lp.CCMessage)
|
|||||||
if err != nil {
|
if err != nil {
|
||||||
cclog.ComponentDebug(m.name, "readNVLinkStats for device", name, "failed")
|
cclog.ComponentDebug(m.name, "readNVLinkStats for device", name, "failed")
|
||||||
}
|
}
|
||||||
|
|
||||||
|
err = readEfficiency(device, output)
|
||||||
|
if err != nil {
|
||||||
|
cclog.ComponentDebug(m.name, "readEfficiency for device", name, "failed")
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Actual read loop over all attached Nvidia GPUs
|
// Actual read loop over all attached Nvidia GPUs
|
||||||
|
|||||||
@@ -85,5 +85,6 @@ Metrics:
|
|||||||
* `nv_energy`
|
* `nv_energy`
|
||||||
* `nv_energy_abs`
|
* `nv_energy_abs`
|
||||||
* `nv_average_power`
|
* `nv_average_power`
|
||||||
|
* `nv_util_eff` (`nv_util` * (`nv_power_usage` / `nv_power_max_limit`))
|
||||||
|
|
||||||
Some metrics add the additional sub type tag (`stype`) like the `nv_nvlink_*` metrics set `stype=nvlink,stype-id=<link_number>`.
|
Some metrics add the additional sub type tag (`stype`) like the `nv_nvlink_*` metrics set `stype=nvlink,stype-id=<link_number>`.
|
||||||
|
|||||||
Reference in New Issue
Block a user