Mirror of https://github.com/ClusterCockpit/cc-metric-collector.git (synced 2025-07-19 19:31:41 +02:00)

Compare commits: nvidia_ene...nvidia_run (124 commits)
@@ -6,6 +6,7 @@ import (
 	"fmt"
 	"log"
 	"strings"
+	"sync"
 	"time"

 	cclog "github.com/ClusterCockpit/cc-metric-collector/pkg/ccLogger"
@@ -24,6 +25,81 @@ type NvidiaCollectorConfig struct {
 	ProcessMigDevices     bool   `json:"process_mig_devices,omitempty"`
 	UseUuidForMigDevices  bool   `json:"use_uuid_for_mig_device,omitempty"`
 	UseSliceForMigDevices bool   `json:"use_slice_for_mig_device,omitempty"`
+	AveragePowerInterval  string `json:"average_power_interval,omitempty"`
 }

+type powerAverager struct {
+	device       nvml.Device
+	interval     time.Duration
+	done         chan bool
+	wg           sync.WaitGroup
+	powerSum     float64
+	powerSamples int
+	ticker       *time.Ticker
+	running      bool
+}
+
+type PowerAverager interface {
+	Start()
+	IsRunning() bool
+	Get() float64
+	Close()
+}
+
+func (pa *powerAverager) IsRunning() bool {
+	return pa.running
+}
+
+func (pa *powerAverager) Start() {
+	pa.wg.Add(1)
+
+	go func(avger *powerAverager) {
+		avger.running = true
+		avger.ticker = time.NewTicker(avger.interval)
+		for {
+			select {
+			case <-avger.done:
+				avger.wg.Done()
+				avger.running = false
+				return
+			case <-avger.ticker.C:
+				power, ret := nvml.DeviceGetPowerUsage(avger.device)
+				if ret == nvml.SUCCESS {
+					avger.powerSum += float64(power) / 1000
+					avger.powerSamples += 1
+				}
+			}
+		}
+	}(pa)
+}
+
+func (pa *powerAverager) Get() float64 {
+	avg := float64(0)
+	if pa.powerSamples > 0 {
+		pa.ticker.Stop()
+		avg = pa.powerSum / float64(pa.powerSamples)
+		pa.powerSum = 0
+		pa.powerSamples = 0
+		pa.ticker.Reset(pa.interval)
+	}
+	return avg
+}
+
+func (pa *powerAverager) Close() {
+	pa.done <- true
+	pa.wg.Wait()
+	pa.running = false
+}
+
+func NewPowerAverager(device nvml.Device, interval time.Duration) (PowerAverager, error) {
+	pa := new(powerAverager)
+	pa.device = device
+	pa.interval = interval
+	pa.done = make(chan bool)
+	pa.powerSamples = 0
+	pa.powerSum = 0
+	pa.running = false
+	return pa, nil
+}
+
 type NvidiaCollectorDevice struct {
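For orientation, here is a minimal sketch of how this averager is driven (assuming it compiles alongside the `powerAverager` code above, in the same package, and uses the go-nvml bindings `github.com/NVIDIA/go-nvml/pkg/nvml` that the collector already imports; the device index and intervals are illustrative):

```go
package collectors

import (
	"fmt"
	"time"

	"github.com/NVIDIA/go-nvml/pkg/nvml"
)

// examplePowerAverager samples GPU 0 once per second and prints the mean
// power over a ten-second window. Get() returns the average (in watts) of
// the samples accumulated since the last call and resets the accumulator.
func examplePowerAverager() {
	if ret := nvml.Init(); ret != nvml.SUCCESS {
		return // no NVML available on this host
	}
	defer nvml.Shutdown()

	device, ret := nvml.DeviceGetHandleByIndex(0)
	if ret != nvml.SUCCESS {
		return
	}

	avger, _ := NewPowerAverager(device, time.Second)
	avger.Start() // background goroutine samples on each ticker tick

	time.Sleep(10 * time.Second)
	fmt.Printf("average power: %.1f W\n", avger.Get())

	avger.Close() // stops the goroutine and waits for it to exit
}
```

In the collector itself this lifecycle is split across calls: the first `Read()` merely starts the sampler (see `readPowerUsageAverage` below), so `nv_power_usage_avg` values only appear from the second read interval onward.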
@@ -31,6 +107,8 @@ type NvidiaCollectorDevice struct {
 	excludeMetrics map[string]bool
 	tags           map[string]string
 	meta           map[string]string
+	powerInterval  time.Duration
+	averager       PowerAverager
 }

 type NvidiaCollector struct {
@@ -55,6 +133,7 @@ func (m *NvidiaCollector) Init(config json.RawMessage) error {
 	m.config.ProcessMigDevices = false
 	m.config.UseUuidForMigDevices = false
 	m.config.UseSliceForMigDevices = false
+	m.config.AveragePowerInterval = ""
 	m.setup()
 	if len(config) > 0 {
 		err = json.Unmarshal(config, &m.config)
@@ -93,6 +172,16 @@ func (m *NvidiaCollector) Init(config json.RawMessage) error {
 		return err
 	}

+	powerDur := time.Duration(0)
+	if len(m.config.AveragePowerInterval) > 0 {
+		d, err := time.ParseDuration(m.config.AveragePowerInterval)
+		if err != nil {
+			cclog.ComponentError(m.name, "Unable to parse average_power_interval", m.config.AveragePowerInterval, ":", err.Error())
+			return err
+		}
+		powerDur = d
+	}
+
 	// For all GPUs
 	idx := 0
 	m.gpus = make([]NvidiaCollectorDevice, num_gpus)
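A configuration sketch enabling this sampler (assuming the collector is configured under its usual `nvidia` key; the interval value is illustrative):

```json
{
  "nvidia": {
    "average_power_interval": "10s"
  }
}
```

Any string accepted by Go's `time.ParseDuration` works. Leaving the option unset (the default) disables the sampler entirely, since `powerDur` stays zero and no averager is created below.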
@@ -197,6 +286,15 @@ func (m *NvidiaCollector) Init(config json.RawMessage) error {
 			g.excludeMetrics[e] = true
 		}

+		if powerDur > 0 {
+			a, err := NewPowerAverager(g.device, powerDur)
+			if err != nil {
+				cclog.ComponentError(m.name, "Failed to initialize power averager for device at index", i, ":", err.Error())
+			} else {
+				g.averager = a
+			}
+		}
+
 		// Increment the index for the next device
 		idx++
 	}
@@ -436,6 +534,21 @@ func readPerfState(device NvidiaCollectorDevice, output chan lp.CCMetric) error
 	return nil
 }

+func readPowerUsageAverage(device NvidiaCollectorDevice, output chan lp.CCMetric) error {
+	if !device.excludeMetrics["nv_power_usage_avg"] && device.averager != nil {
+		if !device.averager.IsRunning() {
+			device.averager.Start()
+		} else {
+			y, err := lp.New("nv_power_usage_avg", device.tags, device.meta, map[string]interface{}{"value": device.averager.Get()}, time.Now())
+			if err == nil {
+				y.AddMeta("unit", "watts")
+				output <- y
+			}
+		}
+	}
+	return nil
+}
+
 func readPowerUsage(device NvidiaCollectorDevice, output chan lp.CCMetric) error {
 	if !device.excludeMetrics["nv_power_usage"] {
 		// Retrieves power usage for this GPU in milliwatts and its associated circuitry (e.g. memory)
@@ -1022,95 +1135,100 @@ func (m *NvidiaCollector) Read(interval time.Duration, output chan lp.CCMetric)
 		if ret != nvml.SUCCESS {
 			name = "NoName"
 		}
-		err = readMemoryInfo(device, output)
-		if err != nil {
-			cclog.ComponentDebug(m.name, "readMemoryInfo for device", name, "failed")
-		}
+		// err = readMemoryInfo(device, output)
+		// if err != nil {
+		// 	cclog.ComponentDebug(m.name, "readMemoryInfo for device", name, "failed")
+		// }

-		err = readUtilization(device, output)
-		if err != nil {
-			cclog.ComponentDebug(m.name, "readUtilization for device", name, "failed")
-		}
+		// err = readUtilization(device, output)
+		// if err != nil {
+		// 	cclog.ComponentDebug(m.name, "readUtilization for device", name, "failed")
+		// }

-		err = readTemp(device, output)
-		if err != nil {
-			cclog.ComponentDebug(m.name, "readTemp for device", name, "failed")
-		}
+		// err = readTemp(device, output)
+		// if err != nil {
+		// 	cclog.ComponentDebug(m.name, "readTemp for device", name, "failed")
+		// }

-		err = readFan(device, output)
-		if err != nil {
-			cclog.ComponentDebug(m.name, "readFan for device", name, "failed")
-		}
+		// err = readFan(device, output)
+		// if err != nil {
+		// 	cclog.ComponentDebug(m.name, "readFan for device", name, "failed")
+		// }

-		err = readEccMode(device, output)
-		if err != nil {
-			cclog.ComponentDebug(m.name, "readEccMode for device", name, "failed")
-		}
+		// err = readEccMode(device, output)
+		// if err != nil {
+		// 	cclog.ComponentDebug(m.name, "readEccMode for device", name, "failed")
+		// }

-		err = readPerfState(device, output)
-		if err != nil {
-			cclog.ComponentDebug(m.name, "readPerfState for device", name, "failed")
-		}
+		// err = readPerfState(device, output)
+		// if err != nil {
+		// 	cclog.ComponentDebug(m.name, "readPerfState for device", name, "failed")
+		// }

 		err = readPowerUsage(device, output)
 		if err != nil {
 			cclog.ComponentDebug(m.name, "readPowerUsage for device", name, "failed")
 		}

-		err = readClocks(device, output)
+		err = readPowerUsageAverage(device, output)
 		if err != nil {
-			cclog.ComponentDebug(m.name, "readClocks for device", name, "failed")
+			cclog.ComponentDebug(m.name, "readPowerUsageAverage for device", name, "failed")
 		}

-		err = readMaxClocks(device, output)
-		if err != nil {
-			cclog.ComponentDebug(m.name, "readMaxClocks for device", name, "failed")
-		}
+		// err = readClocks(device, output)
+		// if err != nil {
+		// 	cclog.ComponentDebug(m.name, "readClocks for device", name, "failed")
+		// }

-		err = readEccErrors(device, output)
-		if err != nil {
-			cclog.ComponentDebug(m.name, "readEccErrors for device", name, "failed")
-		}
+		// err = readMaxClocks(device, output)
+		// if err != nil {
+		// 	cclog.ComponentDebug(m.name, "readMaxClocks for device", name, "failed")
+		// }

-		err = readPowerLimit(device, output)
-		if err != nil {
-			cclog.ComponentDebug(m.name, "readPowerLimit for device", name, "failed")
-		}
+		// err = readEccErrors(device, output)
+		// if err != nil {
+		// 	cclog.ComponentDebug(m.name, "readEccErrors for device", name, "failed")
+		// }

-		err = readEncUtilization(device, output)
-		if err != nil {
-			cclog.ComponentDebug(m.name, "readEncUtilization for device", name, "failed")
-		}
+		// err = readPowerLimit(device, output)
+		// if err != nil {
+		// 	cclog.ComponentDebug(m.name, "readPowerLimit for device", name, "failed")
+		// }

-		err = readDecUtilization(device, output)
-		if err != nil {
-			cclog.ComponentDebug(m.name, "readDecUtilization for device", name, "failed")
-		}
+		// err = readEncUtilization(device, output)
+		// if err != nil {
+		// 	cclog.ComponentDebug(m.name, "readEncUtilization for device", name, "failed")
+		// }

-		err = readRemappedRows(device, output)
-		if err != nil {
-			cclog.ComponentDebug(m.name, "readRemappedRows for device", name, "failed")
-		}
+		// err = readDecUtilization(device, output)
+		// if err != nil {
+		// 	cclog.ComponentDebug(m.name, "readDecUtilization for device", name, "failed")
+		// }

-		err = readBarMemoryInfo(device, output)
-		if err != nil {
-			cclog.ComponentDebug(m.name, "readBarMemoryInfo for device", name, "failed")
-		}
+		// err = readRemappedRows(device, output)
+		// if err != nil {
+		// 	cclog.ComponentDebug(m.name, "readRemappedRows for device", name, "failed")
+		// }

-		err = readProcessCounts(device, output)
-		if err != nil {
-			cclog.ComponentDebug(m.name, "readProcessCounts for device", name, "failed")
-		}
+		// err = readBarMemoryInfo(device, output)
+		// if err != nil {
+		// 	cclog.ComponentDebug(m.name, "readBarMemoryInfo for device", name, "failed")
+		// }

-		err = readViolationStats(device, output)
-		if err != nil {
-			cclog.ComponentDebug(m.name, "readViolationStats for device", name, "failed")
-		}
+		// err = readProcessCounts(device, output)
+		// if err != nil {
+		// 	cclog.ComponentDebug(m.name, "readProcessCounts for device", name, "failed")
+		// }

-		err = readNVLinkStats(device, output)
-		if err != nil {
-			cclog.ComponentDebug(m.name, "readNVLinkStats for device", name, "failed")
-		}
+		// err = readViolationStats(device, output)
+		// if err != nil {
+		// 	cclog.ComponentDebug(m.name, "readViolationStats for device", name, "failed")
+		// }
+
+		// err = readNVLinkStats(device, output)
+		// if err != nil {
+		// 	cclog.ComponentDebug(m.name, "readNVLinkStats for device", name, "failed")
+		// }
 	}

 	// Actual read loop over all attached Nvidia GPUs
@@ -1198,6 +1316,9 @@ func (m *NvidiaCollector) Read(interval time.Duration, output chan lp.CCMetric)

 func (m *NvidiaCollector) Close() {
 	if m.init {
+		for i := 0; i < m.num_gpus; i++ {
+			m.gpus[i].averager.Close()
+		}
 		nvml.Shutdown()
 		m.init = false
 	}
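Two caveats follow from the hunks above: `averager` is only assigned when `average_power_interval` is configured, and the sampler goroutine only exists once `Start()` has run, so `Close()` would block on the `done` channel otherwise. A guarded teardown along these lines (a sketch, not the committed code) avoids calling `Close()` on a nil interface:

```go
func (m *NvidiaCollector) Close() {
	if m.init {
		for i := 0; i < m.num_gpus; i++ {
			// averager stays nil unless average_power_interval was set,
			// and its goroutine only runs after the first Read().
			if a := m.gpus[i].averager; a != nil && a.IsRunning() {
				a.Close()
			}
		}
		nvml.Shutdown()
		m.init = false
	}
}
```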
@@ -25,7 +25,7 @@ CC_USER=clustercockpit
 CC_GROUP=clustercockpit
 CONF_DIR=/etc/cc-metric-collector
 PID_FILE=/var/run/$NAME.pid
-DAEMON=/usr/sbin/$NAME
+DAEMON=/usr/bin/$NAME
 CONF_FILE=${CONF_DIR}/cc-metric-collector.json

 umask 0027
@@ -45,6 +45,9 @@ type HttpSinkConfig struct {

 	// Maximum number of retries to connect to the http server (default: 3)
 	MaxRetries int `json:"max_retries,omitempty"`
+
+	// Timestamp precision
+	Precision string `json:"precision,omitempty"`
 }

 type key_value_pair struct {
@@ -141,7 +144,7 @@ func (s *HttpSink) Write(m lp.CCMetric) error {

 	// Check that encoding worked
 	if err != nil {
-		return fmt.Errorf("Encoding failed: %v", err)
+		return fmt.Errorf("encoding failed: %v", err)
 	}

 	if s.config.flushDelay == 0 {
@@ -268,6 +271,7 @@ func NewHttpSink(name string, config json.RawMessage) (Sink, error) {
 	s.config.Timeout = "5s"
 	s.config.FlushDelay = "5s"
 	s.config.MaxRetries = 3
+	s.config.Precision = "ns"
 	cclog.ComponentDebug(s.name, "Init()")

 	// Read config
@@ -315,6 +319,19 @@ func NewHttpSink(name string, config json.RawMessage) (Sink, error) {
 			cclog.ComponentDebug(s.name, "Init(): flushDelay", t)
 		}
 	}
+	precision := influx.Nanosecond
+	if len(s.config.Precision) > 0 {
+		switch s.config.Precision {
+		case "s":
+			precision = influx.Second
+		case "ms":
+			precision = influx.Millisecond
+		case "us":
+			precision = influx.Microsecond
+		case "ns":
+			precision = influx.Nanosecond
+		}
+	}

 	// Create http client
 	s.client = &http.Client{
@@ -326,7 +343,7 @@ func NewHttpSink(name string, config json.RawMessage) (Sink, error) {
 	}

 	// Configure influx line protocol encoder
-	s.encoder.SetPrecision(influx.Nanosecond)
+	s.encoder.SetPrecision(precision)
 	s.extended_tag_list = make([]key_value_pair, 0)

 	return s, nil
@@ -18,7 +18,8 @@ The `http` sink uses POST requests to a HTTP server to submit the metrics in the
     "timeout": "5s",
     "idle_connection_timeout": "5s",
     "flush_delay": "2s",
-    "batch_size": 1000
+    "batch_size": 1000,
+    "precision": "s"
   }
 }
 ```
@@ -34,3 +35,8 @@ The `http` sink uses POST requests to a HTTP server to submit the metrics in the
 - `idle_connection_timeout`: Timeout for idle connections (default '120s'). Should be larger than the measurement interval to keep the connection open
 - `flush_delay`: Batch all writes arriving during this duration (default '1s'; batching can be disabled by setting it to 0)
 - `batch_size`: Maximal batch size. If `batch_size` is reached before the end of `flush_delay`, the metrics are sent without further delay
+- `precision`: Precision of the timestamp. Valid values are 's', 'ms', 'us' and 'ns' (default is 'ns')
+
+### Using HttpSink for communication with cc-metric-store
+
+The cc-metric-store only accepts metrics with a timestamp precision in seconds, so it is required to set `"precision": "s"`.
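For instance, a sink entry pointed at cc-metric-store might look like the following sketch (the sink name, URL, and port are illustrative; `type` and `url` are assumed to be the sink's usual keys, which are not shown in this excerpt):

```json
{
  "metric-store": {
    "type": "http",
    "url": "http://localhost:8081/api/write",
    "precision": "s"
  }
}
```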
|
@@ -25,3 +25,4 @@ The `nats` sink publishes all metrics into a NATS network. The publishing key is
|
||||
- `user`: Username for basic authentication
|
||||
- `password`: Password for basic authentication
|
||||
- `meta_as_tags`: print all meta information as tags in the output (optional)
|
||||
|
||||
|
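A sketch of a NATS sink entry using these options (the `host`, `port`, and `subject` keys are assumptions about the sink's other fields, which this diff does not show; values are illustrative):

```json
{
  "natssink": {
    "type": "nats",
    "host": "localhost",
    "port": "4222",
    "subject": "ccmetrics",
    "user": "natsuser",
    "password": "secret"
  }
}
```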