mirror of
https://github.com/ClusterCockpit/cc-metric-collector.git
synced 2025-07-20 11:51:40 +02:00
Compare commits
1 Commits
nvidia_run
...
v0.6.7
Author | SHA1 | Date | |
---|---|---|---|
|
f496db4905 |
2
.github/workflows/Release.yml
vendored
2
.github/workflows/Release.yml
vendored
@@ -195,7 +195,7 @@ jobs:
|
|||||||
Release:
|
Release:
|
||||||
runs-on: ubuntu-latest
|
runs-on: ubuntu-latest
|
||||||
# We need the RPMs, so add dependency
|
# We need the RPMs, so add dependency
|
||||||
needs: [AlmaLinux-RPM-build, UBI-8-RPM-build, Ubuntu-focal-build]
|
needs: [AlmaLinux-RPM-build, UBI-8-RPM-build, Ubuntu-jammy-build]
|
||||||
|
|
||||||
steps:
|
steps:
|
||||||
# See: https://github.com/actions/download-artifact
|
# See: https://github.com/actions/download-artifact
|
||||||
|
@@ -6,7 +6,6 @@ import (
|
|||||||
"fmt"
|
"fmt"
|
||||||
"log"
|
"log"
|
||||||
"strings"
|
"strings"
|
||||||
"sync"
|
|
||||||
"time"
|
"time"
|
||||||
|
|
||||||
cclog "github.com/ClusterCockpit/cc-metric-collector/pkg/ccLogger"
|
cclog "github.com/ClusterCockpit/cc-metric-collector/pkg/ccLogger"
|
||||||
@@ -25,81 +24,6 @@ type NvidiaCollectorConfig struct {
|
|||||||
ProcessMigDevices bool `json:"process_mig_devices,omitempty"`
|
ProcessMigDevices bool `json:"process_mig_devices,omitempty"`
|
||||||
UseUuidForMigDevices bool `json:"use_uuid_for_mig_device,omitempty"`
|
UseUuidForMigDevices bool `json:"use_uuid_for_mig_device,omitempty"`
|
||||||
UseSliceForMigDevices bool `json:"use_slice_for_mig_device,omitempty"`
|
UseSliceForMigDevices bool `json:"use_slice_for_mig_device,omitempty"`
|
||||||
AveragePowerInterval string `json:"average_power_interval,omitempty"`
|
|
||||||
}
|
|
||||||
|
|
||||||
type powerAverager struct {
|
|
||||||
device nvml.Device
|
|
||||||
interval time.Duration
|
|
||||||
done chan bool
|
|
||||||
wg sync.WaitGroup
|
|
||||||
powerSum float64
|
|
||||||
powerSamples int
|
|
||||||
ticker *time.Ticker
|
|
||||||
running bool
|
|
||||||
}
|
|
||||||
|
|
||||||
type PowerAverager interface {
|
|
||||||
Start()
|
|
||||||
IsRunning() bool
|
|
||||||
Get() float64
|
|
||||||
Close()
|
|
||||||
}
|
|
||||||
|
|
||||||
func (pa *powerAverager) IsRunning() bool {
|
|
||||||
return pa.running
|
|
||||||
}
|
|
||||||
|
|
||||||
func (pa *powerAverager) Start() {
|
|
||||||
pa.wg.Add(1)
|
|
||||||
|
|
||||||
go func(avger *powerAverager) {
|
|
||||||
avger.running = true
|
|
||||||
avger.ticker = time.NewTicker(avger.interval)
|
|
||||||
for {
|
|
||||||
select {
|
|
||||||
case <-avger.done:
|
|
||||||
avger.wg.Done()
|
|
||||||
avger.running = false
|
|
||||||
return
|
|
||||||
case <-avger.ticker.C:
|
|
||||||
power, ret := nvml.DeviceGetPowerUsage(avger.device)
|
|
||||||
if ret == nvml.SUCCESS {
|
|
||||||
avger.powerSum += float64(power) / 1000
|
|
||||||
avger.powerSamples += 1
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}(pa)
|
|
||||||
}
|
|
||||||
|
|
||||||
func (pa *powerAverager) Get() float64 {
|
|
||||||
avg := float64(0)
|
|
||||||
if pa.powerSamples > 0 {
|
|
||||||
pa.ticker.Stop()
|
|
||||||
avg = pa.powerSum / float64(pa.powerSamples)
|
|
||||||
pa.powerSum = 0
|
|
||||||
pa.powerSamples = 0
|
|
||||||
pa.ticker.Reset(pa.interval)
|
|
||||||
}
|
|
||||||
return avg
|
|
||||||
}
|
|
||||||
|
|
||||||
func (pa *powerAverager) Close() {
|
|
||||||
pa.done <- true
|
|
||||||
pa.wg.Wait()
|
|
||||||
pa.running = false
|
|
||||||
}
|
|
||||||
|
|
||||||
func NewPowerAverager(device nvml.Device, interval time.Duration) (PowerAverager, error) {
|
|
||||||
pa := new(powerAverager)
|
|
||||||
pa.device = device
|
|
||||||
pa.interval = interval
|
|
||||||
pa.done = make(chan bool)
|
|
||||||
pa.powerSamples = 0
|
|
||||||
pa.powerSum = 0
|
|
||||||
pa.running = false
|
|
||||||
return pa, nil
|
|
||||||
}
|
}
|
||||||
|
|
||||||
type NvidiaCollectorDevice struct {
|
type NvidiaCollectorDevice struct {
|
||||||
@@ -107,8 +31,6 @@ type NvidiaCollectorDevice struct {
|
|||||||
excludeMetrics map[string]bool
|
excludeMetrics map[string]bool
|
||||||
tags map[string]string
|
tags map[string]string
|
||||||
meta map[string]string
|
meta map[string]string
|
||||||
powerInterval time.Duration
|
|
||||||
averager PowerAverager
|
|
||||||
}
|
}
|
||||||
|
|
||||||
type NvidiaCollector struct {
|
type NvidiaCollector struct {
|
||||||
@@ -133,7 +55,6 @@ func (m *NvidiaCollector) Init(config json.RawMessage) error {
|
|||||||
m.config.ProcessMigDevices = false
|
m.config.ProcessMigDevices = false
|
||||||
m.config.UseUuidForMigDevices = false
|
m.config.UseUuidForMigDevices = false
|
||||||
m.config.UseSliceForMigDevices = false
|
m.config.UseSliceForMigDevices = false
|
||||||
m.config.AveragePowerInterval = ""
|
|
||||||
m.setup()
|
m.setup()
|
||||||
if len(config) > 0 {
|
if len(config) > 0 {
|
||||||
err = json.Unmarshal(config, &m.config)
|
err = json.Unmarshal(config, &m.config)
|
||||||
@@ -172,16 +93,6 @@ func (m *NvidiaCollector) Init(config json.RawMessage) error {
|
|||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
|
|
||||||
powerDur := time.Duration(0)
|
|
||||||
if len(m.config.AveragePowerInterval) > 0 {
|
|
||||||
d, err := time.ParseDuration(m.config.AveragePowerInterval)
|
|
||||||
if err != nil {
|
|
||||||
cclog.ComponentError(m.name, "Unable to parse average_power_interval ", m.config.AveragePowerInterval, ":", err.Error())
|
|
||||||
return err
|
|
||||||
}
|
|
||||||
powerDur = d
|
|
||||||
}
|
|
||||||
|
|
||||||
// For all GPUs
|
// For all GPUs
|
||||||
idx := 0
|
idx := 0
|
||||||
m.gpus = make([]NvidiaCollectorDevice, num_gpus)
|
m.gpus = make([]NvidiaCollectorDevice, num_gpus)
|
||||||
@@ -286,15 +197,6 @@ func (m *NvidiaCollector) Init(config json.RawMessage) error {
|
|||||||
g.excludeMetrics[e] = true
|
g.excludeMetrics[e] = true
|
||||||
}
|
}
|
||||||
|
|
||||||
if powerDur > 0 {
|
|
||||||
a, err := NewPowerAverager(g.device, powerDur)
|
|
||||||
if err != nil {
|
|
||||||
cclog.ComponentError(m.name, "Failed to initialize power averager for device at index", i, ":", err.Error())
|
|
||||||
} else {
|
|
||||||
g.averager = a
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Increment the index for the next device
|
// Increment the index for the next device
|
||||||
idx++
|
idx++
|
||||||
}
|
}
|
||||||
@@ -534,21 +436,6 @@ func readPerfState(device NvidiaCollectorDevice, output chan lp.CCMetric) error
|
|||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func readPowerUsageAverage(device NvidiaCollectorDevice, output chan lp.CCMetric) error {
|
|
||||||
if !device.excludeMetrics["nv_power_usage_avg"] && device.averager != nil {
|
|
||||||
if !device.averager.IsRunning() {
|
|
||||||
device.averager.Start()
|
|
||||||
} else {
|
|
||||||
y, err := lp.New("nv_power_usage_avg", device.tags, device.meta, map[string]interface{}{"value": device.averager.Get()}, time.Now())
|
|
||||||
if err == nil {
|
|
||||||
y.AddMeta("unit", "watts")
|
|
||||||
output <- y
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return nil
|
|
||||||
}
|
|
||||||
|
|
||||||
func readPowerUsage(device NvidiaCollectorDevice, output chan lp.CCMetric) error {
|
func readPowerUsage(device NvidiaCollectorDevice, output chan lp.CCMetric) error {
|
||||||
if !device.excludeMetrics["nv_power_usage"] {
|
if !device.excludeMetrics["nv_power_usage"] {
|
||||||
// Retrieves power usage for this GPU in milliwatts and its associated circuitry (e.g. memory)
|
// Retrieves power usage for this GPU in milliwatts and its associated circuitry (e.g. memory)
|
||||||
@@ -1135,100 +1022,95 @@ func (m *NvidiaCollector) Read(interval time.Duration, output chan lp.CCMetric)
|
|||||||
if ret != nvml.SUCCESS {
|
if ret != nvml.SUCCESS {
|
||||||
name = "NoName"
|
name = "NoName"
|
||||||
}
|
}
|
||||||
// err = readMemoryInfo(device, output)
|
err = readMemoryInfo(device, output)
|
||||||
// if err != nil {
|
if err != nil {
|
||||||
// cclog.ComponentDebug(m.name, "readMemoryInfo for device", name, "failed")
|
cclog.ComponentDebug(m.name, "readMemoryInfo for device", name, "failed")
|
||||||
// }
|
}
|
||||||
|
|
||||||
// err = readUtilization(device, output)
|
err = readUtilization(device, output)
|
||||||
// if err != nil {
|
if err != nil {
|
||||||
// cclog.ComponentDebug(m.name, "readUtilization for device", name, "failed")
|
cclog.ComponentDebug(m.name, "readUtilization for device", name, "failed")
|
||||||
// }
|
}
|
||||||
|
|
||||||
// err = readTemp(device, output)
|
err = readTemp(device, output)
|
||||||
// if err != nil {
|
if err != nil {
|
||||||
// cclog.ComponentDebug(m.name, "readTemp for device", name, "failed")
|
cclog.ComponentDebug(m.name, "readTemp for device", name, "failed")
|
||||||
// }
|
}
|
||||||
|
|
||||||
// err = readFan(device, output)
|
err = readFan(device, output)
|
||||||
// if err != nil {
|
if err != nil {
|
||||||
// cclog.ComponentDebug(m.name, "readFan for device", name, "failed")
|
cclog.ComponentDebug(m.name, "readFan for device", name, "failed")
|
||||||
// }
|
}
|
||||||
|
|
||||||
// err = readEccMode(device, output)
|
err = readEccMode(device, output)
|
||||||
// if err != nil {
|
if err != nil {
|
||||||
// cclog.ComponentDebug(m.name, "readEccMode for device", name, "failed")
|
cclog.ComponentDebug(m.name, "readEccMode for device", name, "failed")
|
||||||
// }
|
}
|
||||||
|
|
||||||
// err = readPerfState(device, output)
|
err = readPerfState(device, output)
|
||||||
// if err != nil {
|
if err != nil {
|
||||||
// cclog.ComponentDebug(m.name, "readPerfState for device", name, "failed")
|
cclog.ComponentDebug(m.name, "readPerfState for device", name, "failed")
|
||||||
// }
|
}
|
||||||
|
|
||||||
err = readPowerUsage(device, output)
|
err = readPowerUsage(device, output)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
cclog.ComponentDebug(m.name, "readPowerUsage for device", name, "failed")
|
cclog.ComponentDebug(m.name, "readPowerUsage for device", name, "failed")
|
||||||
}
|
}
|
||||||
|
|
||||||
err = readPowerUsageAverage(device, output)
|
err = readClocks(device, output)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
cclog.ComponentDebug(m.name, "readPowerUsageAverage for device", name, "failed")
|
cclog.ComponentDebug(m.name, "readClocks for device", name, "failed")
|
||||||
}
|
}
|
||||||
|
|
||||||
// err = readClocks(device, output)
|
err = readMaxClocks(device, output)
|
||||||
// if err != nil {
|
if err != nil {
|
||||||
// cclog.ComponentDebug(m.name, "readClocks for device", name, "failed")
|
cclog.ComponentDebug(m.name, "readMaxClocks for device", name, "failed")
|
||||||
// }
|
}
|
||||||
|
|
||||||
// err = readMaxClocks(device, output)
|
err = readEccErrors(device, output)
|
||||||
// if err != nil {
|
if err != nil {
|
||||||
// cclog.ComponentDebug(m.name, "readMaxClocks for device", name, "failed")
|
cclog.ComponentDebug(m.name, "readEccErrors for device", name, "failed")
|
||||||
// }
|
}
|
||||||
|
|
||||||
// err = readEccErrors(device, output)
|
err = readPowerLimit(device, output)
|
||||||
// if err != nil {
|
if err != nil {
|
||||||
// cclog.ComponentDebug(m.name, "readEccErrors for device", name, "failed")
|
cclog.ComponentDebug(m.name, "readPowerLimit for device", name, "failed")
|
||||||
// }
|
}
|
||||||
|
|
||||||
// err = readPowerLimit(device, output)
|
err = readEncUtilization(device, output)
|
||||||
// if err != nil {
|
if err != nil {
|
||||||
// cclog.ComponentDebug(m.name, "readPowerLimit for device", name, "failed")
|
cclog.ComponentDebug(m.name, "readEncUtilization for device", name, "failed")
|
||||||
// }
|
}
|
||||||
|
|
||||||
// err = readEncUtilization(device, output)
|
err = readDecUtilization(device, output)
|
||||||
// if err != nil {
|
if err != nil {
|
||||||
// cclog.ComponentDebug(m.name, "readEncUtilization for device", name, "failed")
|
cclog.ComponentDebug(m.name, "readDecUtilization for device", name, "failed")
|
||||||
// }
|
}
|
||||||
|
|
||||||
// err = readDecUtilization(device, output)
|
err = readRemappedRows(device, output)
|
||||||
// if err != nil {
|
if err != nil {
|
||||||
// cclog.ComponentDebug(m.name, "readDecUtilization for device", name, "failed")
|
cclog.ComponentDebug(m.name, "readRemappedRows for device", name, "failed")
|
||||||
// }
|
}
|
||||||
|
|
||||||
// err = readRemappedRows(device, output)
|
err = readBarMemoryInfo(device, output)
|
||||||
// if err != nil {
|
if err != nil {
|
||||||
// cclog.ComponentDebug(m.name, "readRemappedRows for device", name, "failed")
|
cclog.ComponentDebug(m.name, "readBarMemoryInfo for device", name, "failed")
|
||||||
// }
|
}
|
||||||
|
|
||||||
// err = readBarMemoryInfo(device, output)
|
err = readProcessCounts(device, output)
|
||||||
// if err != nil {
|
if err != nil {
|
||||||
// cclog.ComponentDebug(m.name, "readBarMemoryInfo for device", name, "failed")
|
cclog.ComponentDebug(m.name, "readProcessCounts for device", name, "failed")
|
||||||
// }
|
}
|
||||||
|
|
||||||
// err = readProcessCounts(device, output)
|
err = readViolationStats(device, output)
|
||||||
// if err != nil {
|
if err != nil {
|
||||||
// cclog.ComponentDebug(m.name, "readProcessCounts for device", name, "failed")
|
cclog.ComponentDebug(m.name, "readViolationStats for device", name, "failed")
|
||||||
// }
|
}
|
||||||
|
|
||||||
// err = readViolationStats(device, output)
|
err = readNVLinkStats(device, output)
|
||||||
// if err != nil {
|
if err != nil {
|
||||||
// cclog.ComponentDebug(m.name, "readViolationStats for device", name, "failed")
|
cclog.ComponentDebug(m.name, "readNVLinkStats for device", name, "failed")
|
||||||
// }
|
}
|
||||||
|
|
||||||
// err = readNVLinkStats(device, output)
|
|
||||||
// if err != nil {
|
|
||||||
// cclog.ComponentDebug(m.name, "readNVLinkStats for device", name, "failed")
|
|
||||||
// }
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// Actual read loop over all attached Nvidia GPUs
|
// Actual read loop over all attached Nvidia GPUs
|
||||||
@@ -1316,9 +1198,6 @@ func (m *NvidiaCollector) Read(interval time.Duration, output chan lp.CCMetric)
|
|||||||
|
|
||||||
func (m *NvidiaCollector) Close() {
|
func (m *NvidiaCollector) Close() {
|
||||||
if m.init {
|
if m.init {
|
||||||
for i := 0; i < m.num_gpus; i++ {
|
|
||||||
m.gpus[i].averager.Close()
|
|
||||||
}
|
|
||||||
nvml.Shutdown()
|
nvml.Shutdown()
|
||||||
m.init = false
|
m.init = false
|
||||||
}
|
}
|
||||||
|
@@ -25,7 +25,7 @@ CC_USER=clustercockpit
|
|||||||
CC_GROUP=clustercockpit
|
CC_GROUP=clustercockpit
|
||||||
CONF_DIR=/etc/cc-metric-collector
|
CONF_DIR=/etc/cc-metric-collector
|
||||||
PID_FILE=/var/run/$NAME.pid
|
PID_FILE=/var/run/$NAME.pid
|
||||||
DAEMON=/usr/bin/$NAME
|
DAEMON=/usr/sbin/$NAME
|
||||||
CONF_FILE=${CONF_DIR}/cc-metric-collector.json
|
CONF_FILE=${CONF_DIR}/cc-metric-collector.json
|
||||||
|
|
||||||
umask 0027
|
umask 0027
|
||||||
|
@@ -45,9 +45,6 @@ type HttpSinkConfig struct {
|
|||||||
|
|
||||||
// Maximum number of retries to connect to the http server (default: 3)
|
// Maximum number of retries to connect to the http server (default: 3)
|
||||||
MaxRetries int `json:"max_retries,omitempty"`
|
MaxRetries int `json:"max_retries,omitempty"`
|
||||||
|
|
||||||
// Timestamp precision
|
|
||||||
Precision string `json:"precision,omitempty"`
|
|
||||||
}
|
}
|
||||||
|
|
||||||
type key_value_pair struct {
|
type key_value_pair struct {
|
||||||
@@ -144,7 +141,7 @@ func (s *HttpSink) Write(m lp.CCMetric) error {
|
|||||||
|
|
||||||
// Check that encoding worked
|
// Check that encoding worked
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return fmt.Errorf("encoding failed: %v", err)
|
return fmt.Errorf("Encoding failed: %v", err)
|
||||||
}
|
}
|
||||||
|
|
||||||
if s.config.flushDelay == 0 {
|
if s.config.flushDelay == 0 {
|
||||||
@@ -271,7 +268,6 @@ func NewHttpSink(name string, config json.RawMessage) (Sink, error) {
|
|||||||
s.config.Timeout = "5s"
|
s.config.Timeout = "5s"
|
||||||
s.config.FlushDelay = "5s"
|
s.config.FlushDelay = "5s"
|
||||||
s.config.MaxRetries = 3
|
s.config.MaxRetries = 3
|
||||||
s.config.Precision = "ns"
|
|
||||||
cclog.ComponentDebug(s.name, "Init()")
|
cclog.ComponentDebug(s.name, "Init()")
|
||||||
|
|
||||||
// Read config
|
// Read config
|
||||||
@@ -319,19 +315,6 @@ func NewHttpSink(name string, config json.RawMessage) (Sink, error) {
|
|||||||
cclog.ComponentDebug(s.name, "Init(): flushDelay", t)
|
cclog.ComponentDebug(s.name, "Init(): flushDelay", t)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
precision := influx.Nanosecond
|
|
||||||
if len(s.config.Precision) > 0 {
|
|
||||||
switch s.config.Precision {
|
|
||||||
case "s":
|
|
||||||
precision = influx.Second
|
|
||||||
case "ms":
|
|
||||||
precision = influx.Millisecond
|
|
||||||
case "us":
|
|
||||||
precision = influx.Microsecond
|
|
||||||
case "ns":
|
|
||||||
precision = influx.Nanosecond
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Create http client
|
// Create http client
|
||||||
s.client = &http.Client{
|
s.client = &http.Client{
|
||||||
@@ -343,7 +326,7 @@ func NewHttpSink(name string, config json.RawMessage) (Sink, error) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Configure influx line protocol encoder
|
// Configure influx line protocol encoder
|
||||||
s.encoder.SetPrecision(precision)
|
s.encoder.SetPrecision(influx.Nanosecond)
|
||||||
s.extended_tag_list = make([]key_value_pair, 0)
|
s.extended_tag_list = make([]key_value_pair, 0)
|
||||||
|
|
||||||
return s, nil
|
return s, nil
|
||||||
|
@@ -18,8 +18,7 @@ The `http` sink uses POST requests to a HTTP server to submit the metrics in the
|
|||||||
"timeout": "5s",
|
"timeout": "5s",
|
||||||
"idle_connection_timeout" : "5s",
|
"idle_connection_timeout" : "5s",
|
||||||
"flush_delay": "2s",
|
"flush_delay": "2s",
|
||||||
"batch_size": 1000,
|
"batch_size": 1000
|
||||||
"precision": "s"
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
```
|
```
|
||||||
@@ -35,8 +34,3 @@ The `http` sink uses POST requests to a HTTP server to submit the metrics in the
|
|||||||
- `idle_connection_timeout`: Timeout for idle connections (default '120s'). Should be larger than the measurement interval to keep the connection open
|
- `idle_connection_timeout`: Timeout for idle connections (default '120s'). Should be larger than the measurement interval to keep the connection open
|
||||||
- `flush_delay`: Batch all writes arriving in during this duration (default '1s', batching can be disabled by setting it to 0)
|
- `flush_delay`: Batch all writes arriving in during this duration (default '1s', batching can be disabled by setting it to 0)
|
||||||
- `batch_size`: Maximal batch size. If `batch_size` is reached before the end of `flush_delay`, the metrics are sent without further delay
|
- `batch_size`: Maximal batch size. If `batch_size` is reached before the end of `flush_delay`, the metrics are sent without further delay
|
||||||
- `precision`: Precision of the timestamp. Valid values are 's', 'ms', 'us' and 'ns'. (default is 'ns')
|
|
||||||
|
|
||||||
### Using HttpSink for communication with cc-metric-store
|
|
||||||
|
|
||||||
The cc-metric-store only accepts metrics with a timestamp precision in seconds, so it is required to set `"precision": "s"`.
|
|
||||||
|
@@ -25,4 +25,3 @@ The `nats` sink publishes all metrics into a NATS network. The publishing key is
|
|||||||
- `user`: Username for basic authentication
|
- `user`: Username for basic authentication
|
||||||
- `password`: Password for basic authentication
|
- `password`: Password for basic authentication
|
||||||
- `meta_as_tags`: print all meta information as tags in the output (optional)
|
- `meta_as_tags`: print all meta information as tags in the output (optional)
|
||||||
|
|
||||||
|
Reference in New Issue
Block a user