mirror of
https://github.com/ClusterCockpit/cc-metric-collector.git
synced 2026-05-14 17:27:30 +02:00
Compare commits
2 Commits
nvidiaColl
...
main
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
656ea73d12 | ||
|
|
330f923596 |
@@ -27,6 +27,7 @@ const CPUSTATFILE = `/proc/stat`
|
|||||||
|
|
||||||
type CpustatCollectorConfig struct {
|
type CpustatCollectorConfig struct {
|
||||||
ExcludeMetrics []string `json:"exclude_metrics,omitempty"`
|
ExcludeMetrics []string `json:"exclude_metrics,omitempty"`
|
||||||
|
excludeNumCPUs bool
|
||||||
}
|
}
|
||||||
|
|
||||||
type CpustatCollector struct {
|
type CpustatCollector struct {
|
||||||
@@ -79,6 +80,7 @@ func (m *CpustatCollector) Init(config json.RawMessage) error {
|
|||||||
m.matches[match] = index
|
m.matches[match] = index
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
m.config.excludeNumCPUs = slices.Contains(m.config.ExcludeMetrics, "num_cpus")
|
||||||
|
|
||||||
// Check input file
|
// Check input file
|
||||||
file, err := os.Open(CPUSTATFILE)
|
file, err := os.Open(CPUSTATFILE)
|
||||||
@@ -95,11 +97,13 @@ func (m *CpustatCollector) Init(config json.RawMessage) error {
|
|||||||
line := scanner.Text()
|
line := scanner.Text()
|
||||||
linefields := strings.Fields(line)
|
linefields := strings.Fields(line)
|
||||||
if strings.Compare(linefields[0], "cpu") == 0 {
|
if strings.Compare(linefields[0], "cpu") == 0 {
|
||||||
|
// Kernel system statistics for all CPUs
|
||||||
m.olddata["cpu"] = make(map[string]int64)
|
m.olddata["cpu"] = make(map[string]int64)
|
||||||
for k, v := range m.matches {
|
for k, v := range m.matches {
|
||||||
m.olddata["cpu"][k], _ = strconv.ParseInt(linefields[v], 0, 64)
|
m.olddata["cpu"][k], _ = strconv.ParseInt(linefields[v], 0, 64)
|
||||||
}
|
}
|
||||||
} else if strings.HasPrefix(linefields[0], "cpu") && strings.Compare(linefields[0], "cpu") != 0 {
|
} else if strings.HasPrefix(linefields[0], "cpu") && strings.Compare(linefields[0], "cpu") != 0 {
|
||||||
|
// Kernel system statistics per CPU
|
||||||
cpustr := strings.TrimLeft(linefields[0], "cpu")
|
cpustr := strings.TrimLeft(linefields[0], "cpu")
|
||||||
cpu, _ := strconv.Atoi(cpustr)
|
cpu, _ := strconv.Atoi(cpustr)
|
||||||
m.cputags[linefields[0]] = map[string]string{
|
m.cputags[linefields[0]] = map[string]string{
|
||||||
@@ -191,10 +195,11 @@ func (m *CpustatCollector) Read(interval time.Duration, output chan lp.CCMessage
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
num_cpus_metric, err := lp.NewMetric("num_cpus", m.nodetags, m.meta, num_cpus, now)
|
if !m.config.excludeNumCPUs {
|
||||||
if err == nil {
|
if num_cpus_metric, err := lp.NewMetric("num_cpus", m.nodetags, m.meta, num_cpus, now); err == nil {
|
||||||
output <- num_cpus_metric
|
output <- num_cpus_metric
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
m.lastTimestamp = now
|
m.lastTimestamp = now
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -72,7 +72,8 @@ func getStats(filename string) map[string]MemstatStats {
|
|||||||
for scanner.Scan() {
|
for scanner.Scan() {
|
||||||
line := scanner.Text()
|
line := scanner.Text()
|
||||||
linefields := strings.Fields(line)
|
linefields := strings.Fields(line)
|
||||||
if len(linefields) == 3 {
|
switch len(linefields) {
|
||||||
|
case 3:
|
||||||
v, err := strconv.ParseFloat(linefields[1], 64)
|
v, err := strconv.ParseFloat(linefields[1], 64)
|
||||||
if err == nil {
|
if err == nil {
|
||||||
stats[strings.Trim(linefields[0], ":")] = MemstatStats{
|
stats[strings.Trim(linefields[0], ":")] = MemstatStats{
|
||||||
@@ -80,7 +81,7 @@ func getStats(filename string) map[string]MemstatStats {
|
|||||||
unit: linefields[2],
|
unit: linefields[2],
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
} else if len(linefields) == 5 {
|
case 5:
|
||||||
v, err := strconv.ParseFloat(linefields[3], 64)
|
v, err := strconv.ParseFloat(linefields[3], 64)
|
||||||
if err == nil {
|
if err == nil {
|
||||||
cclog.ComponentDebug("getStats", strings.Trim(linefields[2], ":"), v, linefields[4])
|
cclog.ComponentDebug("getStats", strings.Trim(linefields[2], ":"), v, linefields[4])
|
||||||
@@ -106,7 +107,10 @@ func (m *MemstatCollector) Init(config json.RawMessage) error {
|
|||||||
return fmt.Errorf("%s Init(): Error decoding JSON config: %w", m.name, err)
|
return fmt.Errorf("%s Init(): Error decoding JSON config: %w", m.name, err)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
m.meta = map[string]string{"source": m.name, "group": "Memory"}
|
m.meta = map[string]string{
|
||||||
|
"source": m.name,
|
||||||
|
"group": "Memory",
|
||||||
|
}
|
||||||
m.stats = make(map[string]int64)
|
m.stats = make(map[string]int64)
|
||||||
m.matches = make(map[string]string)
|
m.matches = make(map[string]string)
|
||||||
m.tags = map[string]string{"type": "node"}
|
m.tags = map[string]string{"type": "node"}
|
||||||
@@ -145,7 +149,7 @@ func (m *MemstatCollector) Init(config json.RawMessage) error {
|
|||||||
"KernelStack": "mem_kernelstack",
|
"KernelStack": "mem_kernelstack",
|
||||||
}
|
}
|
||||||
for k, v := range matches {
|
for k, v := range matches {
|
||||||
if !slices.Contains(m.config.ExcludeMetrics, k) {
|
if !slices.Contains(m.config.ExcludeMetrics, v) {
|
||||||
m.matches[k] = v
|
m.matches[k] = v
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -153,7 +157,7 @@ func (m *MemstatCollector) Init(config json.RawMessage) error {
|
|||||||
if !slices.Contains(m.config.ExcludeMetrics, "mem_used") {
|
if !slices.Contains(m.config.ExcludeMetrics, "mem_used") {
|
||||||
m.sendMemUsed = true
|
m.sendMemUsed = true
|
||||||
}
|
}
|
||||||
if len(m.matches) == 0 {
|
if len(m.matches) == 0 && !m.sendMemUsed {
|
||||||
return fmt.Errorf("%s Init(): no metrics to collect", m.name)
|
return fmt.Errorf("%s Init(): no metrics to collect", m.name)
|
||||||
}
|
}
|
||||||
if err := m.setup(); err != nil {
|
if err := m.setup(); err != nil {
|
||||||
|
|||||||
@@ -1115,31 +1115,6 @@ func readNVLinkStats(device *NvidiaCollectorDevice, output chan lp.CCMessage) er
|
|||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func readEfficiency(device *NvidiaCollectorDevice, output chan lp.CCMessage) error {
|
|
||||||
if !device.excludeMetrics["nv_util_eff"] {
|
|
||||||
maxPower, ret := nvml.DeviceGetEnforcedPowerLimit(device.device)
|
|
||||||
if ret == nvml.SUCCESS {
|
|
||||||
curPower, ret := nvml.DeviceGetPowerUsage(device.device)
|
|
||||||
if ret == nvml.SUCCESS {
|
|
||||||
util, ret := nvml.DeviceGetUtilizationRates(device.device)
|
|
||||||
if ret == nvml.SUCCESS {
|
|
||||||
factor := float64(curPower) / float64(maxPower)
|
|
||||||
eff := uint32(float64(util.Gpu) * factor)
|
|
||||||
if eff > 100 {
|
|
||||||
eff = 100
|
|
||||||
}
|
|
||||||
y, err := lp.NewMetric("nv_util_eff", device.tags, device.meta, eff, time.Now())
|
|
||||||
if err == nil {
|
|
||||||
y.AddTag("unit", "percent")
|
|
||||||
output <- y
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return nil
|
|
||||||
}
|
|
||||||
|
|
||||||
func (m *NvidiaCollector) Read(interval time.Duration, output chan lp.CCMessage) {
|
func (m *NvidiaCollector) Read(interval time.Duration, output chan lp.CCMessage) {
|
||||||
var err error
|
var err error
|
||||||
if !m.init {
|
if !m.init {
|
||||||
@@ -1245,11 +1220,6 @@ func (m *NvidiaCollector) Read(interval time.Duration, output chan lp.CCMessage)
|
|||||||
if err != nil {
|
if err != nil {
|
||||||
cclog.ComponentDebug(m.name, "readNVLinkStats for device", name, "failed")
|
cclog.ComponentDebug(m.name, "readNVLinkStats for device", name, "failed")
|
||||||
}
|
}
|
||||||
|
|
||||||
err = readEfficiency(device, output)
|
|
||||||
if err != nil {
|
|
||||||
cclog.ComponentDebug(m.name, "readEfficiency for device", name, "failed")
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// Actual read loop over all attached Nvidia GPUs
|
// Actual read loop over all attached Nvidia GPUs
|
||||||
|
|||||||
@@ -85,6 +85,5 @@ Metrics:
|
|||||||
* `nv_energy`
|
* `nv_energy`
|
||||||
* `nv_energy_abs`
|
* `nv_energy_abs`
|
||||||
* `nv_average_power`
|
* `nv_average_power`
|
||||||
* `nv_util_eff` (`nv_util` * (`nv_power_usage` / `nv_power_max_limit`))
|
|
||||||
|
|
||||||
Some metrics add an additional sub-type tag (`stype`); for example, the `nv_nvlink_*` metrics set `stype=nvlink,stype-id=<link_number>`.
|
Some metrics add an additional sub-type tag (`stype`); for example, the `nv_nvlink_*` metrics set `stype=nvlink,stype-id=<link_number>`.
|
||||||
|
|||||||
Reference in New Issue
Block a user