mirror of
https://github.com/ClusterCockpit/cc-metric-collector.git
synced 2025-07-30 00:16:07 +02:00
Merge latest development changes to main branch (#79)
* Cleanup: Remove unused code * Use Golang duration parser for 'interval' and 'duration' in main config * Update handling of LIKWID headers. Download only if not already present in the system. Fixes #73 * Units with cc-units (#64) * Add option to normalize units with cc-unit * Add unit conversion to router * Add option to change unit prefix in the router * Add to MetricRouter README * Add order of operations in router to README * Use second add_tags/del_tags only if metric gets renamed * Skip disks in DiskstatCollector that have size=0 * Check readability of sensor files in TempCollector * Fix for --once option * Rename `cpu` type to `hwthread` (#69) * Rename 'cpu' type to 'hwthread' to avoid naming clashes with MetricStore and CC-Webfrontend * Collectors in parallel (#74) * Provide info to CollectorManager whether the collector can be executed in parallel with others * Split serial and parallel collectors. Read in parallel first * Update NvidiaCollector with new metrics, MIG and NvLink support (#75) * CC topology module update (#76) * Rename CPU to hardware thread, write some comments * Do renaming in other parts * Remove CpuList and SocketList function from metricCollector. 
Available in ccTopology * Option to use MIG UUID as subtype-id in NvidiaCollector * Option to use MIG slice name as subtype-id in NvidiaCollector * MetricRouter: Fix JSON in README * Fix for Github Action to really use the selected version * Remove Ganglia installation in runonce Action and add Go 1.18 * Fix daemon options in init script * Add separate go.mod files to use it with deprecated 1.16 * Minor updates for Makefiles * fix string comparison * AMD ROCm SMI collector (#77) * Add collector for AMD ROCm SMI metrics * Fix import path * Fix imports * Remove Board Number * store GPU index explicitly * Remove board number from description * Use http instead of ftp to download likwid * Fix serial number in rocmCollector * Improved http sink (#78) * automatic flush in NatsSink * tweak default options of HttpSink * shorter cirt. section and retries for HttpSink * fix error handling * Remove file added by mistake. * Use http instead of ftp to download likwid * Fix serial number in rocmCollector Co-authored-by: Thomas Roehl <thomas.roehl@fau.de> Co-authored-by: Holger Obermaier <40787752+ho-ob@users.noreply.github.com> Co-authored-by: Lou <lou.knauer@gmx.de>
This commit is contained in:
@@ -1,22 +1,28 @@
|
||||
|
||||
all: likwid
|
||||
|
||||
|
||||
# LIKWID version
|
||||
LIKWID_VERSION = 5.2.1
|
||||
LIKWID_INSTALLED_FOLDER=$(shell dirname $(shell which likwid-topology 2>/dev/null) 2>/dev/null)
|
||||
|
||||
LIKWID_FOLDER="$(shell pwd)/likwid"
|
||||
|
||||
all: $(LIKWID_FOLDER)/likwid.h
|
||||
|
||||
.ONESHELL:
|
||||
.PHONY: likwid
|
||||
likwid:
|
||||
INSTALL_FOLDER="$${PWD}/likwid"
|
||||
BUILD_FOLDER="$${PWD}/likwidbuild"
|
||||
if [ -d $${INSTALL_FOLDER} ]; then rm -r $${INSTALL_FOLDER}; fi
|
||||
mkdir --parents --verbose $${INSTALL_FOLDER} $${BUILD_FOLDER}
|
||||
wget -P "$${BUILD_FOLDER}" ftp://ftp.rrze.uni-erlangen.de/mirrors/likwid/likwid-$(LIKWID_VERSION).tar.gz
|
||||
tar -C $${BUILD_FOLDER} -xf $${BUILD_FOLDER}/likwid-$(LIKWID_VERSION).tar.gz
|
||||
install -Dpm 0644 $${BUILD_FOLDER}/likwid-$(LIKWID_VERSION)/src/includes/likwid*.h $${INSTALL_FOLDER}/
|
||||
install -Dpm 0644 $${BUILD_FOLDER}/likwid-$(LIKWID_VERSION)/src/includes/bstrlib.h $${INSTALL_FOLDER}/
|
||||
rm -r $${BUILD_FOLDER}
|
||||
.PHONY: $(LIKWID_FOLDER)/likwid.h
|
||||
$(LIKWID_FOLDER)/likwid.h:
|
||||
if [ "$(LIKWID_INSTALLED_FOLDER)" != "" ]; then \
|
||||
BASE="$(LIKWID_INSTALLED_FOLDER)/../include"; \
|
||||
mkdir -p $(LIKWID_FOLDER); \
|
||||
cp $$BASE/*.h $(LIKWID_FOLDER); \
|
||||
else \
|
||||
BUILD_FOLDER="$${PWD}/likwidbuild"; \
|
||||
if [ -d $(LIKWID_FOLDER) ]; then rm -r $(LIKWID_FOLDER); fi; \
|
||||
mkdir --parents --verbose $(LIKWID_FOLDER) $${BUILD_FOLDER}; \
|
||||
wget -P "$${BUILD_FOLDER}" http://ftp.rrze.uni-erlangen.de/mirrors/likwid/likwid-$(LIKWID_VERSION).tar.gz; \
|
||||
tar -C $${BUILD_FOLDER} -xf $${BUILD_FOLDER}/likwid-$(LIKWID_VERSION).tar.gz; \
|
||||
install -Dpm 0644 $${BUILD_FOLDER}/likwid-$(LIKWID_VERSION)/src/includes/likwid*.h $(LIKWID_FOLDER)/; \
|
||||
install -Dpm 0644 $${BUILD_FOLDER}/likwid-$(LIKWID_VERSION)/src/includes/bstrlib.h $(LIKWID_FOLDER)/; \
|
||||
rm -r $${BUILD_FOLDER}; \
|
||||
fi
|
||||
|
||||
|
||||
clean:
|
||||
|
@@ -39,6 +39,7 @@ In contrast to the configuration files for sinks and receivers, the collectors c
|
||||
* [`gpfs`](./gpfsMetric.md)
|
||||
* [`beegfs_meta`](./beegfsmetaMetric.md)
|
||||
* [`beegfs_storage`](./beegfsstorageMetric.md)
|
||||
* [`rocm_smi`](./rocmsmiMetric.md)
|
||||
|
||||
## Todos
|
||||
|
||||
|
@@ -55,6 +55,7 @@ func (m *BeegfsMetaCollector) Init(config json.RawMessage) error {
|
||||
|
||||
m.name = "BeegfsMetaCollector"
|
||||
m.setup()
|
||||
m.parallel = true
|
||||
// Set default beegfs-ctl binary
|
||||
|
||||
m.config.Beegfs = DEFAULT_BEEGFS_CMD
|
||||
|
@@ -48,6 +48,7 @@ func (m *BeegfsStorageCollector) Init(config json.RawMessage) error {
|
||||
|
||||
m.name = "BeegfsStorageCollector"
|
||||
m.setup()
|
||||
m.parallel = true
|
||||
// Set default beegfs-ctl binary
|
||||
|
||||
m.config.Beegfs = DEFAULT_BEEGFS_CMD
|
||||
|
@@ -14,39 +14,43 @@ import (
|
||||
// Map of all available metric collectors
|
||||
var AvailableCollectors = map[string]MetricCollector{
|
||||
|
||||
"likwid": new(LikwidCollector),
|
||||
"loadavg": new(LoadavgCollector),
|
||||
"memstat": new(MemstatCollector),
|
||||
"netstat": new(NetstatCollector),
|
||||
"ibstat": new(InfinibandCollector),
|
||||
"lustrestat": new(LustreCollector),
|
||||
"cpustat": new(CpustatCollector),
|
||||
"topprocs": new(TopProcsCollector),
|
||||
"nvidia": new(NvidiaCollector),
|
||||
"customcmd": new(CustomCmdCollector),
|
||||
"iostat": new(IOstatCollector),
|
||||
"diskstat": new(DiskstatCollector),
|
||||
"tempstat": new(TempCollector),
|
||||
"ipmistat": new(IpmiCollector),
|
||||
"gpfs": new(GpfsCollector),
|
||||
"cpufreq": new(CPUFreqCollector),
|
||||
"cpufreq_cpuinfo": new(CPUFreqCpuInfoCollector),
|
||||
"nfs3stat": new(Nfs3Collector),
|
||||
"nfs4stat": new(Nfs4Collector),
|
||||
"numastats": new(NUMAStatsCollector),
|
||||
"beegfs_meta": new(BeegfsMetaCollector),
|
||||
"beegfs_storage": new(BeegfsStorageCollector),
|
||||
"likwid": new(LikwidCollector),
|
||||
"loadavg": new(LoadavgCollector),
|
||||
"memstat": new(MemstatCollector),
|
||||
"netstat": new(NetstatCollector),
|
||||
"ibstat": new(InfinibandCollector),
|
||||
"lustrestat": new(LustreCollector),
|
||||
"cpustat": new(CpustatCollector),
|
||||
"topprocs": new(TopProcsCollector),
|
||||
"nvidia": new(NvidiaCollector),
|
||||
"customcmd": new(CustomCmdCollector),
|
||||
"iostat": new(IOstatCollector),
|
||||
"diskstat": new(DiskstatCollector),
|
||||
"tempstat": new(TempCollector),
|
||||
"ipmistat": new(IpmiCollector),
|
||||
"gpfs": new(GpfsCollector),
|
||||
"cpufreq": new(CPUFreqCollector),
|
||||
"cpufreq_cpuinfo": new(CPUFreqCpuInfoCollector),
|
||||
"nfs3stat": new(Nfs3Collector),
|
||||
"nfs4stat": new(Nfs4Collector),
|
||||
"numastats": new(NUMAStatsCollector),
|
||||
"beegfs_meta": new(BeegfsMetaCollector),
|
||||
"beegfs_storage": new(BeegfsStorageCollector),
|
||||
"rocm_smi": new(RocmSmiCollector),
|
||||
}
|
||||
|
||||
// Metric collector manager data structure
|
||||
type collectorManager struct {
|
||||
collectors []MetricCollector // List of metric collectors to use
|
||||
output chan lp.CCMetric // Output channels
|
||||
done chan bool // channel to finish / stop metric collector manager
|
||||
ticker mct.MultiChanTicker // periodically ticking once each interval
|
||||
duration time.Duration // duration (for metrics that measure over a given duration)
|
||||
wg *sync.WaitGroup // wait group for all goroutines in cc-metric-collector
|
||||
config map[string]json.RawMessage // json encoded config for collector manager
|
||||
collectors []MetricCollector // List of metric collectors to read in parallel
|
||||
serial []MetricCollector // List of metric collectors to read serially
|
||||
output chan lp.CCMetric // Output channels
|
||||
done chan bool // channel to finish / stop metric collector manager
|
||||
ticker mct.MultiChanTicker // periodically ticking once each interval
|
||||
duration time.Duration // duration (for metrics that measure over a given duration)
|
||||
wg *sync.WaitGroup // wait group for all goroutines in cc-metric-collector
|
||||
config map[string]json.RawMessage // json encoded config for collector manager
|
||||
collector_wg sync.WaitGroup // internally used wait group for the parallel reading of collector
|
||||
parallel_run bool // Flag whether the collectors are currently read in parallel
|
||||
}
|
||||
|
||||
// Metric collector manager access functions
|
||||
@@ -66,6 +70,7 @@ type CollectorManager interface {
|
||||
// Initialization is done for all configured collectors
|
||||
func (cm *collectorManager) Init(ticker mct.MultiChanTicker, duration time.Duration, wg *sync.WaitGroup, collectConfigFile string) error {
|
||||
cm.collectors = make([]MetricCollector, 0)
|
||||
cm.serial = make([]MetricCollector, 0)
|
||||
cm.output = nil
|
||||
cm.done = make(chan bool)
|
||||
cm.wg = wg
|
||||
@@ -100,7 +105,11 @@ func (cm *collectorManager) Init(ticker mct.MultiChanTicker, duration time.Durat
|
||||
continue
|
||||
}
|
||||
cclog.ComponentDebug("CollectorManager", "ADD COLLECTOR", collector.Name())
|
||||
cm.collectors = append(cm.collectors, collector)
|
||||
if collector.Parallel() {
|
||||
cm.collectors = append(cm.collectors, collector)
|
||||
} else {
|
||||
cm.serial = append(cm.serial, collector)
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
@@ -116,6 +125,10 @@ func (cm *collectorManager) Start() {
|
||||
// Collector manager is done
|
||||
done := func() {
|
||||
// close all metric collectors
|
||||
if cm.parallel_run {
|
||||
cm.collector_wg.Wait()
|
||||
cm.parallel_run = false
|
||||
}
|
||||
for _, c := range cm.collectors {
|
||||
c.Close()
|
||||
}
|
||||
@@ -130,7 +143,26 @@ func (cm *collectorManager) Start() {
|
||||
done()
|
||||
return
|
||||
case t := <-tick:
|
||||
cm.parallel_run = true
|
||||
for _, c := range cm.collectors {
|
||||
// Wait for done signal or execute the collector
|
||||
select {
|
||||
case <-cm.done:
|
||||
done()
|
||||
return
|
||||
default:
|
||||
// Read metrics from collector c via goroutine
|
||||
cclog.ComponentDebug("CollectorManager", c.Name(), t)
|
||||
cm.collector_wg.Add(1)
|
||||
go func(myc MetricCollector) {
|
||||
myc.Read(cm.duration, cm.output)
|
||||
cm.collector_wg.Done()
|
||||
}(c)
|
||||
}
|
||||
}
|
||||
cm.collector_wg.Wait()
|
||||
cm.parallel_run = false
|
||||
for _, c := range cm.serial {
|
||||
// Wait for done signal or execute the collector
|
||||
select {
|
||||
case <-cm.done:
|
||||
|
@@ -48,6 +48,7 @@ func (m *CPUFreqCpuInfoCollector) Init(config json.RawMessage) error {
|
||||
m.setup()
|
||||
|
||||
m.name = "CPUFreqCpuInfoCollector"
|
||||
m.parallel = true
|
||||
m.meta = map[string]string{
|
||||
"source": m.name,
|
||||
"group": "CPU",
|
||||
@@ -150,7 +151,7 @@ func (m *CPUFreqCpuInfoCollector) Init(config json.RawMessage) error {
|
||||
t.numNonHT = numNonHT
|
||||
t.numNonHT_int = numNonHT_int
|
||||
t.tagSet = map[string]string{
|
||||
"type": "cpu",
|
||||
"type": "hwthread",
|
||||
"type-id": t.processor,
|
||||
"package_id": t.physicalPackageID,
|
||||
}
|
||||
|
@@ -4,7 +4,7 @@
|
||||
"cpufreq_cpuinfo": {}
|
||||
```
|
||||
|
||||
The `cpufreq_cpuinfo` collector reads the clock frequency from `/proc/cpuinfo` and outputs a handful **cpu** metrics.
|
||||
The `cpufreq_cpuinfo` collector reads the clock frequency from `/proc/cpuinfo` and outputs a handful of **hwthread** metrics.
|
||||
|
||||
Metrics:
|
||||
* `cpufreq`
|
||||
|
@@ -53,6 +53,7 @@ func (m *CPUFreqCollector) Init(config json.RawMessage) error {
|
||||
|
||||
m.name = "CPUFreqCollector"
|
||||
m.setup()
|
||||
m.parallel = true
|
||||
if len(config) > 0 {
|
||||
err := json.Unmarshal(config, &m.config)
|
||||
if err != nil {
|
||||
@@ -161,7 +162,7 @@ func (m *CPUFreqCollector) Init(config json.RawMessage) error {
|
||||
t.numNonHT = numNonHT
|
||||
t.numNonHT_int = numNonHT_int
|
||||
t.tagSet = map[string]string{
|
||||
"type": "cpu",
|
||||
"type": "hwthread",
|
||||
"type-id": t.processor,
|
||||
"package_id": t.physicalPackageID,
|
||||
}
|
||||
|
@@ -5,7 +5,7 @@
|
||||
}
|
||||
```
|
||||
|
||||
The `cpufreq` collector reads the clock frequency from `/sys/devices/system/cpu/cpu*/cpufreq` and outputs a handful **cpu** metrics.
|
||||
The `cpufreq` collector reads the clock frequency from `/sys/devices/system/cpu/cpu*/cpufreq` and outputs a handful of **hwthread** metrics.
|
||||
|
||||
Metrics:
|
||||
* `cpufreq`
|
@@ -30,6 +30,7 @@ type CpustatCollector struct {
|
||||
func (m *CpustatCollector) Init(config json.RawMessage) error {
|
||||
m.name = "CpustatCollector"
|
||||
m.setup()
|
||||
m.parallel = true
|
||||
m.meta = map[string]string{"source": m.name, "group": "CPU", "unit": "Percent"}
|
||||
m.nodetags = map[string]string{"type": "node"}
|
||||
if len(config) > 0 {
|
||||
@@ -82,7 +83,7 @@ func (m *CpustatCollector) Init(config json.RawMessage) error {
|
||||
if strings.HasPrefix(linefields[0], "cpu") && strings.Compare(linefields[0], "cpu") != 0 {
|
||||
cpustr := strings.TrimLeft(linefields[0], "cpu")
|
||||
cpu, _ := strconv.Atoi(cpustr)
|
||||
m.cputags[linefields[0]] = map[string]string{"type": "cpu", "type-id": fmt.Sprintf("%d", cpu)}
|
||||
m.cputags[linefields[0]] = map[string]string{"type": "hwthread", "type-id": fmt.Sprintf("%d", cpu)}
|
||||
num_cpus++
|
||||
}
|
||||
}
|
||||
|
@@ -33,6 +33,7 @@ type CustomCmdCollector struct {
|
||||
func (m *CustomCmdCollector) Init(config json.RawMessage) error {
|
||||
var err error
|
||||
m.name = "CustomCmdCollector"
|
||||
m.parallel = true
|
||||
m.meta = map[string]string{"source": m.name, "group": "Custom"}
|
||||
if len(config) > 0 {
|
||||
err = json.Unmarshal(config, &m.config)
|
||||
|
@@ -29,6 +29,7 @@ type DiskstatCollector struct {
|
||||
|
||||
func (m *DiskstatCollector) Init(config json.RawMessage) error {
|
||||
m.name = "DiskstatCollector"
|
||||
m.parallel = true
|
||||
m.meta = map[string]string{"source": m.name, "group": "Disk"}
|
||||
m.setup()
|
||||
if len(config) > 0 {
|
||||
@@ -77,11 +78,18 @@ func (m *DiskstatCollector) Read(interval time.Duration, output chan lp.CCMetric
|
||||
continue
|
||||
}
|
||||
path := strings.Replace(linefields[1], `\040`, " ", -1)
|
||||
stat := syscall.Statfs_t{}
|
||||
stat := syscall.Statfs_t{
|
||||
Blocks: 0,
|
||||
Bsize: 0,
|
||||
Bfree: 0,
|
||||
}
|
||||
err := syscall.Statfs(path, &stat)
|
||||
if err != nil {
|
||||
continue
|
||||
}
|
||||
if stat.Blocks == 0 || stat.Bsize == 0 {
|
||||
continue
|
||||
}
|
||||
tags := map[string]string{"type": "node", "device": linefields[0]}
|
||||
total := (stat.Blocks * uint64(stat.Bsize)) / uint64(1000000000)
|
||||
y, err := lp.New("disk_total", tags, m.meta, map[string]interface{}{"value": total}, time.Now())
|
||||
@@ -95,9 +103,11 @@ func (m *DiskstatCollector) Read(interval time.Duration, output chan lp.CCMetric
|
||||
y.AddMeta("unit", "GBytes")
|
||||
output <- y
|
||||
}
|
||||
perc := (100 * (total - free)) / total
|
||||
if perc > part_max_used {
|
||||
part_max_used = perc
|
||||
if total > 0 {
|
||||
perc := (100 * (total - free)) / total
|
||||
if perc > part_max_used {
|
||||
part_max_used = perc
|
||||
}
|
||||
}
|
||||
}
|
||||
y, err := lp.New("part_max_used", map[string]string{"type": "node"}, m.meta, map[string]interface{}{"value": int(part_max_used)}, time.Now())
|
||||
|
@@ -46,6 +46,7 @@ func (m *GpfsCollector) Init(config json.RawMessage) error {
|
||||
var err error
|
||||
m.name = "GpfsCollector"
|
||||
m.setup()
|
||||
m.parallel = true
|
||||
|
||||
// Set default mmpmon binary
|
||||
m.config.Mmpmon = DEFAULT_GPFS_CMD
|
||||
|
@@ -54,6 +54,7 @@ func (m *InfinibandCollector) Init(config json.RawMessage) error {
|
||||
var err error
|
||||
m.name = "InfinibandCollector"
|
||||
m.setup()
|
||||
m.parallel = true
|
||||
m.meta = map[string]string{
|
||||
"source": m.name,
|
||||
"group": "Network",
|
||||
|
@@ -37,6 +37,7 @@ type IOstatCollector struct {
|
||||
func (m *IOstatCollector) Init(config json.RawMessage) error {
|
||||
var err error
|
||||
m.name = "IOstatCollector"
|
||||
m.parallel = true
|
||||
m.meta = map[string]string{"source": m.name, "group": "Disk"}
|
||||
m.setup()
|
||||
if len(config) > 0 {
|
||||
|
@@ -34,6 +34,7 @@ type IpmiCollector struct {
|
||||
func (m *IpmiCollector) Init(config json.RawMessage) error {
|
||||
m.name = "IpmiCollector"
|
||||
m.setup()
|
||||
m.parallel = true
|
||||
m.meta = map[string]string{"source": m.name, "group": "IPMI"}
|
||||
m.config.IpmitoolPath = string(IPMITOOL_PATH)
|
||||
m.config.IpmisensorsPath = string(IPMISENSORS_PATH)
|
||||
|
@@ -177,6 +177,7 @@ func getBaseFreq() float64 {
|
||||
|
||||
func (m *LikwidCollector) Init(config json.RawMessage) error {
|
||||
m.name = "LikwidCollector"
|
||||
m.parallel = false
|
||||
m.initialized = false
|
||||
m.running = false
|
||||
m.config.AccessMode = LIKWID_DEF_ACCESSMODE
|
||||
@@ -204,7 +205,7 @@ func (m *LikwidCollector) Init(config json.RawMessage) error {
|
||||
|
||||
m.meta = map[string]string{"group": "PerfCounter"}
|
||||
cclog.ComponentDebug(m.name, "Get cpulist and init maps and lists")
|
||||
cpulist := topo.CpuList()
|
||||
cpulist := topo.HwthreadList()
|
||||
m.cpulist = make([]C.int, len(cpulist))
|
||||
m.cpu2tid = make(map[int]int)
|
||||
for i, c := range cpulist {
|
||||
|
@@ -19,7 +19,7 @@ The `likwid` collector is probably the most complicated collector. The LIKWID li
|
||||
"calc": "COUNTER0 + COUNTER1",
|
||||
"publish": false,
|
||||
"unit": "myunit",
|
||||
"type": "cpu"
|
||||
"type": "hwthread"
|
||||
}
|
||||
]
|
||||
}
|
||||
@@ -30,7 +30,7 @@ The `likwid` collector is probably the most complicated collector. The LIKWID li
|
||||
"calc": "sum_01",
|
||||
"publish": true,
|
||||
"unit": "myunit",
|
||||
"type": "cpu"
|
||||
"type": "hwthread"
|
||||
}
|
||||
]
|
||||
}
|
||||
@@ -51,15 +51,15 @@ Additional options:
|
||||
|
||||
Hardware performance counters are scattered all over the system nowadays. A counter covers a specific part of the system. While there are hardware thread specific counters for CPU cycles, instructions and so on, some others are specific to a whole CPU socket/package. To address that, the LikwidCollector provides the specification of a `type` for each metric.
|
||||
|
||||
- `cpu` : One metric per CPU hardware thread with the tags `"type" : "cpu"` and `"type-id" : "$cpu_id"`
|
||||
- `hwthread` : One metric per CPU hardware thread with the tags `"type" : "hwthread"` and `"type-id" : "$hwthread_id"`
|
||||
- `socket` : One metric per CPU socket/package with the tags `"type" : "socket"` and `"type-id" : "$socket_id"`
|
||||
|
||||
**Note:** You should not specify the `socket` type for a metric that is measured at `cpu` scope and vice versa, so some kind of expert knowledge or lookup work in the [Likwid Wiki](https://github.com/RRZE-HPC/likwid/wiki) is required. Get the scope of each counter from the *Architecture* pages and as soon as one counter in a metric is socket-specific, the whole metric is socket-specific.
|
||||
**Note:** You cannot specify `socket` scope for a metric that is measured at `hwthread` scope, so some kind of expert knowledge or lookup work in the [Likwid Wiki](https://github.com/RRZE-HPC/likwid/wiki) is required. Get the scope of each counter from the *Architecture* pages and as soon as one counter in a metric is socket-specific, the whole metric is socket-specific.
|
||||
|
||||
As a guideline:
|
||||
- All counters `FIXCx`, `PMCy` and `TMAz` have the scope `cpu`
|
||||
- All counters `FIXCx`, `PMCy` and `TMAz` have the scope `hwthread`
|
||||
- All counters names containing `BOX` have the scope `socket`
|
||||
- All `PWRx` counters have scope `socket`, except `"PWR1" : "RAPL_CORE_ENERGY"` has `cpu` scope (AMD Zen)
|
||||
- All `PWRx` counters have scope `socket`, except `"PWR1" : "RAPL_CORE_ENERGY"` has `hwthread` scope
|
||||
- All `DFCx` counters have scope `socket`
|
||||
|
||||
### Help with the configuration
|
||||
@@ -90,7 +90,7 @@ $ scripts/likwid_perfgroup_to_cc_config.py ICX MEM_DP
|
||||
"name": "Runtime (RDTSC) [s]",
|
||||
"publish": true,
|
||||
"unit": "seconds"
|
||||
"scope": "cpu"
|
||||
"scope": "hwthread"
|
||||
},
|
||||
{
|
||||
"..." : "..."
|
||||
@@ -147,20 +147,20 @@ One might think this does not happen often but often used metrics in the world o
|
||||
{
|
||||
"name": "ipc",
|
||||
"calc": "PMC0/PMC1",
|
||||
"type": "cpu",
|
||||
"type": "hwthread",
|
||||
"publish": true
|
||||
},
|
||||
{
|
||||
"name": "flops_any",
|
||||
"calc": "0.000001*PMC2/time",
|
||||
"unit": "MFlops/s",
|
||||
"type": "cpu",
|
||||
"type": "hwthread",
|
||||
"publish": true
|
||||
},
|
||||
{
|
||||
"name": "clock",
|
||||
"calc": "0.000001*(FIXC1/FIXC2)/inverseClock",
|
||||
"type": "cpu",
|
||||
"type": "hwthread",
|
||||
"unit": "MHz",
|
||||
"publish": true
|
||||
},
|
||||
@@ -219,3 +219,33 @@ One might think this does not happen often but often used metrics in the world o
|
||||
}
|
||||
```
|
||||
|
||||
### How to get the eventsets and metrics from LIKWID
|
||||
|
||||
The `likwid` collector reads hardware performance counters at a **hwthread** and **socket** level. The configuration looks quite complicated but it is basically copy&paste from [LIKWID's performance groups](https://github.com/RRZE-HPC/likwid/tree/master/groups). The collector went through multiple iterations, some of which tried to use the performance groups directly, but that approach lacked flexibility. The current way of configuration provides the most flexibility.
|
||||
|
||||
The logic is as follows: There are multiple eventsets, each consisting of a list of counters+events and a list of metrics. If you compare a common performance group with the example setting above, there is not much difference:
|
||||
```
|
||||
EVENTSET -> "events": {
|
||||
FIXC1 ACTUAL_CPU_CLOCK -> "FIXC1": "ACTUAL_CPU_CLOCK",
|
||||
FIXC2 MAX_CPU_CLOCK -> "FIXC2": "MAX_CPU_CLOCK",
|
||||
PMC0 RETIRED_INSTRUCTIONS -> "PMC0" : "RETIRED_INSTRUCTIONS",
|
||||
PMC1 CPU_CLOCKS_UNHALTED -> "PMC1" : "CPU_CLOCKS_UNHALTED",
|
||||
PMC2 RETIRED_SSE_AVX_FLOPS_ALL -> "PMC2": "RETIRED_SSE_AVX_FLOPS_ALL",
|
||||
PMC3 MERGE -> "PMC3": "MERGE",
|
||||
-> }
|
||||
```
|
||||
|
||||
The metrics follow the same procedure:
|
||||
|
||||
```
|
||||
METRICS -> "metrics": [
|
||||
IPC PMC0/PMC1 -> {
|
||||
-> "name" : "IPC",
|
||||
-> "calc" : "PMC0/PMC1",
|
||||
-> "scope": "hwthread",
|
||||
-> "publish": true
|
||||
-> }
|
||||
-> ]
|
||||
```
|
||||
|
||||
The script `scripts/likwid_perfgroup_to_cc_config.py` might help you.
|
||||
|
@@ -36,6 +36,7 @@ type LoadavgCollector struct {
|
||||
|
||||
func (m *LoadavgCollector) Init(config json.RawMessage) error {
|
||||
m.name = "LoadavgCollector"
|
||||
m.parallel = true
|
||||
m.setup()
|
||||
if len(config) > 0 {
|
||||
err := json.Unmarshal(config, &m.config)
|
||||
|
@@ -288,6 +288,7 @@ var LustreDeriveMetrics = []LustreMetricDefinition{
|
||||
func (m *LustreCollector) Init(config json.RawMessage) error {
|
||||
var err error
|
||||
m.name = "LustreCollector"
|
||||
m.parallel = true
|
||||
if len(config) > 0 {
|
||||
err = json.Unmarshal(config, &m.config)
|
||||
if err != nil {
|
||||
|
@@ -81,6 +81,7 @@ func getStats(filename string) map[string]MemstatStats {
|
||||
func (m *MemstatCollector) Init(config json.RawMessage) error {
|
||||
var err error
|
||||
m.name = "MemstatCollector"
|
||||
m.parallel = true
|
||||
m.config.NodeStats = true
|
||||
m.config.NumaStats = false
|
||||
if len(config) > 0 {
|
||||
@@ -159,6 +160,7 @@ func (m *MemstatCollector) Init(config json.RawMessage) error {
|
||||
|
||||
func (m *MemstatCollector) Read(interval time.Duration, output chan lp.CCMetric) {
|
||||
if !m.init {
|
||||
cclog.ComponentPrint(m.name, "Here")
|
||||
return
|
||||
}
|
||||
|
||||
|
@@ -3,27 +3,25 @@ package collectors
|
||||
import (
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"io/ioutil"
|
||||
"log"
|
||||
"strconv"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
lp "github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric"
|
||||
)
|
||||
|
||||
type MetricCollector interface {
|
||||
Name() string // Name of the metric collector
|
||||
Init(config json.RawMessage) error // Initialize metric collector
|
||||
Initialized() bool // Is metric collector initialized?
|
||||
Name() string // Name of the metric collector
|
||||
Init(config json.RawMessage) error // Initialize metric collector
|
||||
Initialized() bool // Is metric collector initialized?
|
||||
Parallel() bool
|
||||
Read(duration time.Duration, output chan lp.CCMetric) // Read metrics from metric collector
|
||||
Close() // Close / finish metric collector
|
||||
}
|
||||
|
||||
type metricCollector struct {
|
||||
name string // name of the metric
|
||||
init bool // is metric collector initialized?
|
||||
meta map[string]string // static meta data tags
|
||||
name string // name of the metric
|
||||
init bool // is metric collector initialized?
|
||||
parallel bool // can the metric collector be executed in parallel with others
|
||||
meta map[string]string // static meta data tags
|
||||
}
|
||||
|
||||
// Name returns the name of the metric collector
|
||||
@@ -31,6 +29,11 @@ func (c *metricCollector) Name() string {
|
||||
return c.name
|
||||
}
|
||||
|
||||
// Parallel returns whether the metric collector can be executed in parallel with others
|
||||
func (c *metricCollector) Parallel() bool {
|
||||
return c.parallel
|
||||
}
|
||||
|
||||
// Setup is for future use
|
||||
func (c *metricCollector) setup() error {
|
||||
return nil
|
||||
@@ -65,58 +68,6 @@ func stringArrayContains(array []string, str string) (int, bool) {
|
||||
return -1, false
|
||||
}
|
||||
|
||||
// SocketList returns the list of physical sockets as read from /proc/cpuinfo
|
||||
func SocketList() []int {
|
||||
buffer, err := ioutil.ReadFile("/proc/cpuinfo")
|
||||
if err != nil {
|
||||
log.Print(err)
|
||||
return nil
|
||||
}
|
||||
ll := strings.Split(string(buffer), "\n")
|
||||
var packs []int
|
||||
for _, line := range ll {
|
||||
if strings.HasPrefix(line, "physical id") {
|
||||
lv := strings.Fields(line)
|
||||
id, err := strconv.ParseInt(lv[3], 10, 32)
|
||||
if err != nil {
|
||||
log.Print(err)
|
||||
return packs
|
||||
}
|
||||
_, found := intArrayContains(packs, int(id))
|
||||
if !found {
|
||||
packs = append(packs, int(id))
|
||||
}
|
||||
}
|
||||
}
|
||||
return packs
|
||||
}
|
||||
|
||||
// CpuList returns the list of physical CPUs (in contrast to logical CPUs) as read from /proc/cpuinfo
|
||||
func CpuList() []int {
|
||||
buffer, err := ioutil.ReadFile("/proc/cpuinfo")
|
||||
if err != nil {
|
||||
log.Print(err)
|
||||
return nil
|
||||
}
|
||||
ll := strings.Split(string(buffer), "\n")
|
||||
var cpulist []int
|
||||
for _, line := range ll {
|
||||
if strings.HasPrefix(line, "processor") {
|
||||
lv := strings.Fields(line)
|
||||
id, err := strconv.ParseInt(lv[2], 10, 32)
|
||||
if err != nil {
|
||||
log.Print(err)
|
||||
return cpulist
|
||||
}
|
||||
_, found := intArrayContains(cpulist, int(id))
|
||||
if !found {
|
||||
cpulist = append(cpulist, int(id))
|
||||
}
|
||||
}
|
||||
}
|
||||
return cpulist
|
||||
}
|
||||
|
||||
// RemoveFromStringList removes the string r from the array of strings s
|
||||
// If r is not contained in the array an error is returned
|
||||
func RemoveFromStringList(s []string, r string) ([]string, error) {
|
||||
|
@@ -39,6 +39,7 @@ type NetstatCollector struct {
|
||||
|
||||
func (m *NetstatCollector) Init(config json.RawMessage) error {
|
||||
m.name = "NetstatCollector"
|
||||
m.parallel = true
|
||||
m.setup()
|
||||
m.lastTimestamp = time.Now()
|
||||
|
||||
|
@@ -114,6 +114,7 @@ func (m *nfsCollector) MainInit(config json.RawMessage) error {
|
||||
m.data = make(map[string]NfsCollectorData)
|
||||
m.initStats()
|
||||
m.init = true
|
||||
m.parallel = true
|
||||
return nil
|
||||
}
|
||||
|
||||
|
@@ -54,6 +54,7 @@ func (m *NUMAStatsCollector) Init(config json.RawMessage) error {
|
||||
}
|
||||
|
||||
m.name = "NUMAStatsCollector"
|
||||
m.parallel = true
|
||||
m.setup()
|
||||
m.meta = map[string]string{
|
||||
"source": m.name,
|
||||
|
File diff suppressed because it is too large
Load Diff
@@ -3,38 +3,74 @@
|
||||
|
||||
```json
|
||||
"nvidia": {
|
||||
"exclude_devices" : [
|
||||
"0","1"
|
||||
"exclude_devices": [
|
||||
"0","1", "0000000:ff:01.0"
|
||||
],
|
||||
"exclude_metrics": [
|
||||
"nv_fb_memory",
|
||||
"nv_fb_mem_used",
|
||||
"nv_fan"
|
||||
]
|
||||
],
|
||||
"process_mig_devices": false,
|
||||
"use_pci_info_as_type_id": true,
|
||||
"add_pci_info_tag": false,
|
||||
"add_uuid_meta": false,
|
||||
"add_board_number_meta": false,
|
||||
"add_serial_meta": false,
|
||||
"use_uuid_for_mig_device": false,
|
||||
"use_slice_for_mig_device": false
|
||||
}
|
||||
```
|
||||
|
||||
The `nvidia` collector can be configured to leave out specific devices with the `exclude_devices` option. It takes IDs as supplied to the NVML with `nvmlDeviceGetHandleByIndex()` or the PCI address in NVML format (`%08X:%02X:%02X.0`). Metrics (listed below) that should not be sent to the MetricRouter can be excluded with the `exclude_metrics` option. Commonly only the physical GPUs are monitored. If MIG devices should be analyzed as well, set `process_mig_devices` (adds `stype=mig,stype-id=<mig_index>`). With the options `use_uuid_for_mig_device` and `use_slice_for_mig_device`, the `<mig_index>` can be replaced with the UUID (e.g. `MIG-6a9f7cc8-6d5b-5ce0-92de-750edc4d8849`) or the MIG slice name (e.g. `1g.5gb`).
|
||||
|
||||
The metrics sent by the `nvidia` collector use `accelerator` as `type` tag. For the `type-id`, it uses the device handle index by default. With the `use_pci_info_as_type_id` option, the PCI ID is used instead. If both values should be added as tags, activate the `add_pci_info_tag` option. It uses the device handle index as `type-id` and adds the PCI ID as separate `pci_identifier` tag.
|
||||
|
||||
Optionally, it is possible to add the UUID, the board part number and the serial to the meta information. They are not sent to the sinks (unless configured otherwise).
|
||||
|
||||
|
||||
Metrics:
|
||||
* `nv_util`
|
||||
* `nv_mem_util`
|
||||
* `nv_mem_total`
|
||||
* `nv_fb_memory`
|
||||
* `nv_fb_mem_total`
|
||||
* `nv_fb_mem_used`
|
||||
* `nv_bar1_mem_total`
|
||||
* `nv_bar1_mem_used`
|
||||
* `nv_temp`
|
||||
* `nv_fan`
|
||||
* `nv_ecc_mode`
|
||||
* `nv_perf_state`
|
||||
* `nv_power_usage_report`
|
||||
* `nv_graphics_clock_report`
|
||||
* `nv_sm_clock_report`
|
||||
* `nv_mem_clock_report`
|
||||
* `nv_power_usage`
|
||||
* `nv_graphics_clock`
|
||||
* `nv_sm_clock`
|
||||
* `nv_mem_clock`
|
||||
* `nv_video_clock`
|
||||
* `nv_max_graphics_clock`
|
||||
* `nv_max_sm_clock`
|
||||
* `nv_max_mem_clock`
|
||||
* `nv_ecc_db_error`
|
||||
* `nv_ecc_sb_error`
|
||||
* `nv_power_man_limit`
|
||||
* `nv_max_video_clock`
|
||||
* `nv_ecc_uncorrected_error`
|
||||
* `nv_ecc_corrected_error`
|
||||
* `nv_power_max_limit`
|
||||
* `nv_encoder_util`
|
||||
* `nv_decoder_util`
|
||||
* `nv_remapped_rows_corrected`
|
||||
* `nv_remapped_rows_uncorrected`
|
||||
* `nv_remapped_rows_pending`
|
||||
* `nv_remapped_rows_failure`
|
||||
* `nv_compute_processes`
|
||||
* `nv_graphics_processes`
|
||||
* `nv_violation_power`
|
||||
* `nv_violation_thermal`
|
||||
* `nv_violation_sync_boost`
|
||||
* `nv_violation_board_limit`
|
||||
* `nv_violation_low_util`
|
||||
* `nv_violation_reliability`
|
||||
* `nv_violation_below_app_clock`
|
||||
* `nv_violation_below_base_clock`
|
||||
* `nv_nvlink_crc_flit_errors`
|
||||
* `nv_nvlink_crc_errors`
|
||||
* `nv_nvlink_ecc_errors`
|
||||
* `nv_nvlink_replay_errors`
|
||||
* `nv_nvlink_recovery_errors`
|
||||
|
||||
It uses a separate `type` in the metrics. The output metric looks like this:
|
||||
`<name>,type=accelerator,type-id=<nvidia-gpu-id> value=<metric value> <timestamp>`
|
||||
|
||||
Some metrics add an additional sub type tag (`stype`); for example, the `nv_nvlink_*` metrics set `stype=nvlink,stype-id=<link_number>`.
|
319
collectors/rocmsmiMetric.go
Normal file
319
collectors/rocmsmiMetric.go
Normal file
@@ -0,0 +1,319 @@
|
||||
package collectors
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"errors"
|
||||
"fmt"
|
||||
"time"
|
||||
|
||||
cclog "github.com/ClusterCockpit/cc-metric-collector/internal/ccLogger"
|
||||
lp "github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric"
|
||||
"github.com/ClusterCockpit/go-rocm-smi/pkg/rocm_smi"
|
||||
)
|
||||
|
||||
// RocmSmiCollectorConfig holds the JSON configuration options of the
// RocmSmiCollector. All options are optional.
type RocmSmiCollectorConfig struct {
	// ExcludeMetrics lists the names of metrics that should not be sent.
	ExcludeMetrics []string `json:"exclude_metrics,omitempty"`
	// ExcludeDevices lists devices to skip, given either as device index
	// or as PCI identifier string.
	ExcludeDevices []string `json:"exclude_devices,omitempty"`
	// AddPciInfoTag adds the PCI identifier as 'pci_identifier' tag
	// (only evaluated when UsePciInfoAsTypeId is not set).
	AddPciInfoTag bool `json:"add_pci_info_tag,omitempty"`
	// UsePciInfoAsTypeId uses the PCI identifier instead of the device
	// index as 'type-id' tag.
	UsePciInfoAsTypeId bool `json:"use_pci_info_as_type_id,omitempty"`
	// AddSerialMeta adds the device serial number as 'serial' meta information.
	AddSerialMeta bool `json:"add_serial_meta,omitempty"`
}
|
||||
|
||||
type RocmSmiCollectorDevice struct {
|
||||
device rocm_smi.DeviceHandle
|
||||
index int
|
||||
tags map[string]string // default tags
|
||||
meta map[string]string // default meta information
|
||||
excludeMetrics map[string]bool // copy of exclude metrics from config
|
||||
}
|
||||
|
||||
type RocmSmiCollector struct {
|
||||
metricCollector
|
||||
config RocmSmiCollectorConfig // the configuration structure
|
||||
devices []RocmSmiCollectorDevice
|
||||
}
|
||||
|
||||
// Functions to implement MetricCollector interface
|
||||
// Init(...), Read(...), Close()
|
||||
// See: metricCollector.go
|
||||
|
||||
// Init initializes the sample collector
|
||||
// Called once by the collector manager
|
||||
// All tags, meta data tags and metrics that do not change over the runtime should be set here
|
||||
func (m *RocmSmiCollector) Init(config json.RawMessage) error {
|
||||
var err error = nil
|
||||
// Always set the name early in Init() to use it in cclog.Component* functions
|
||||
m.name = "RocmSmiCollector"
|
||||
// This is for later use, also call it early
|
||||
m.setup()
|
||||
// Define meta information sent with each metric
|
||||
// (Can also be dynamic or this is the basic set with extension through AddMeta())
|
||||
//m.meta = map[string]string{"source": m.name, "group": "AMD"}
|
||||
// Define tags sent with each metric
|
||||
// The 'type' tag is always needed, it defines the granulatity of the metric
|
||||
// node -> whole system
|
||||
// socket -> CPU socket (requires socket ID as 'type-id' tag)
|
||||
// cpu -> single CPU hardware thread (requires cpu ID as 'type-id' tag)
|
||||
//m.tags = map[string]string{"type": "node"}
|
||||
// Read in the JSON configuration
|
||||
if len(config) > 0 {
|
||||
err = json.Unmarshal(config, &m.config)
|
||||
if err != nil {
|
||||
cclog.ComponentError(m.name, "Error reading config:", err.Error())
|
||||
return err
|
||||
}
|
||||
}
|
||||
|
||||
ret := rocm_smi.Init()
|
||||
if ret != rocm_smi.STATUS_SUCCESS {
|
||||
err = errors.New("Failed to initialize ROCm SMI library")
|
||||
cclog.ComponentError(m.name, err.Error())
|
||||
return err
|
||||
}
|
||||
|
||||
numDevs, ret := rocm_smi.NumMonitorDevices()
|
||||
if ret != rocm_smi.STATUS_SUCCESS {
|
||||
err = errors.New("Failed to get number of GPUs from ROCm SMI library")
|
||||
cclog.ComponentError(m.name, err.Error())
|
||||
return err
|
||||
}
|
||||
|
||||
exclDev := func(s string) bool {
|
||||
skip_device := false
|
||||
for _, excl := range m.config.ExcludeDevices {
|
||||
if excl == s {
|
||||
skip_device = true
|
||||
break
|
||||
}
|
||||
}
|
||||
return skip_device
|
||||
}
|
||||
|
||||
m.devices = make([]RocmSmiCollectorDevice, 0)
|
||||
|
||||
for i := 0; i < numDevs; i++ {
|
||||
str_i := fmt.Sprintf("%d", i)
|
||||
if exclDev(str_i) {
|
||||
continue
|
||||
}
|
||||
device, ret := rocm_smi.DeviceGetHandleByIndex(i)
|
||||
if ret != rocm_smi.STATUS_SUCCESS {
|
||||
err = fmt.Errorf("Failed to get handle for GPU %d", i)
|
||||
cclog.ComponentError(m.name, err.Error())
|
||||
return err
|
||||
}
|
||||
|
||||
pciInfo, ret := rocm_smi.DeviceGetPciInfo(device)
|
||||
if ret != rocm_smi.STATUS_SUCCESS {
|
||||
err = fmt.Errorf("Failed to get PCI information for GPU %d", i)
|
||||
cclog.ComponentError(m.name, err.Error())
|
||||
return err
|
||||
}
|
||||
|
||||
pciId := fmt.Sprintf(
|
||||
"%08X:%02X:%02X.%X",
|
||||
pciInfo.Domain,
|
||||
pciInfo.Bus,
|
||||
pciInfo.Device,
|
||||
pciInfo.Function)
|
||||
|
||||
if exclDev(pciId) {
|
||||
continue
|
||||
}
|
||||
|
||||
dev := RocmSmiCollectorDevice{
|
||||
device: device,
|
||||
tags: map[string]string{
|
||||
"type": "accelerator",
|
||||
"type-id": str_i,
|
||||
},
|
||||
meta: map[string]string{
|
||||
"source": m.name,
|
||||
"group": "AMD",
|
||||
},
|
||||
}
|
||||
if m.config.UsePciInfoAsTypeId {
|
||||
dev.tags["type-id"] = pciId
|
||||
} else if m.config.AddPciInfoTag {
|
||||
dev.tags["pci_identifier"] = pciId
|
||||
}
|
||||
|
||||
if m.config.AddSerialMeta {
|
||||
serial, ret := rocm_smi.DeviceGetSerialNumber(device)
|
||||
if ret != rocm_smi.STATUS_SUCCESS {
|
||||
cclog.ComponentError(m.name, "Unable to get serial number for device at index", i, ":", rocm_smi.StatusStringNoError(ret))
|
||||
} else {
|
||||
dev.meta["serial"] = serial
|
||||
}
|
||||
}
|
||||
// Add excluded metrics
|
||||
dev.excludeMetrics = map[string]bool{}
|
||||
for _, e := range m.config.ExcludeMetrics {
|
||||
dev.excludeMetrics[e] = true
|
||||
}
|
||||
dev.index = i
|
||||
m.devices = append(m.devices, dev)
|
||||
}
|
||||
|
||||
// Set this flag only if everything is initialized properly, all required files exist, ...
|
||||
m.init = true
|
||||
return err
|
||||
}
|
||||
|
||||
// Read collects all metrics belonging to the sample collector
|
||||
// and sends them through the output channel to the collector manager
|
||||
func (m *RocmSmiCollector) Read(interval time.Duration, output chan lp.CCMetric) {
|
||||
// Create a sample metric
|
||||
timestamp := time.Now()
|
||||
|
||||
for _, dev := range m.devices {
|
||||
metrics, ret := rocm_smi.DeviceGetMetrics(dev.device)
|
||||
if ret != rocm_smi.STATUS_SUCCESS {
|
||||
cclog.ComponentError(m.name, "Unable to get metrics for device at index", dev.index, ":", rocm_smi.StatusStringNoError(ret))
|
||||
continue
|
||||
}
|
||||
|
||||
if !dev.excludeMetrics["rocm_gfx_util"] {
|
||||
value := metrics.Average_gfx_activity
|
||||
y, err := lp.New("rocm_gfx_util", dev.tags, dev.meta, map[string]interface{}{"value": value}, timestamp)
|
||||
if err == nil {
|
||||
output <- y
|
||||
}
|
||||
}
|
||||
if !dev.excludeMetrics["rocm_umc_util"] {
|
||||
value := metrics.Average_umc_activity
|
||||
y, err := lp.New("rocm_umc_util", dev.tags, dev.meta, map[string]interface{}{"value": value}, timestamp)
|
||||
if err == nil {
|
||||
output <- y
|
||||
}
|
||||
}
|
||||
if !dev.excludeMetrics["rocm_mm_util"] {
|
||||
value := metrics.Average_mm_activity
|
||||
y, err := lp.New("rocm_mm_util", dev.tags, dev.meta, map[string]interface{}{"value": value}, timestamp)
|
||||
if err == nil {
|
||||
output <- y
|
||||
}
|
||||
}
|
||||
if !dev.excludeMetrics["rocm_avg_power"] {
|
||||
value := metrics.Average_socket_power
|
||||
y, err := lp.New("rocm_avg_power", dev.tags, dev.meta, map[string]interface{}{"value": value}, timestamp)
|
||||
if err == nil {
|
||||
output <- y
|
||||
}
|
||||
}
|
||||
if !dev.excludeMetrics["rocm_temp_mem"] {
|
||||
value := metrics.Temperature_mem
|
||||
y, err := lp.New("rocm_temp_mem", dev.tags, dev.meta, map[string]interface{}{"value": value}, timestamp)
|
||||
if err == nil {
|
||||
output <- y
|
||||
}
|
||||
}
|
||||
if !dev.excludeMetrics["rocm_temp_hotspot"] {
|
||||
value := metrics.Temperature_hotspot
|
||||
y, err := lp.New("rocm_temp_hotspot", dev.tags, dev.meta, map[string]interface{}{"value": value}, timestamp)
|
||||
if err == nil {
|
||||
output <- y
|
||||
}
|
||||
}
|
||||
if !dev.excludeMetrics["rocm_temp_edge"] {
|
||||
value := metrics.Temperature_edge
|
||||
y, err := lp.New("rocm_temp_edge", dev.tags, dev.meta, map[string]interface{}{"value": value}, timestamp)
|
||||
if err == nil {
|
||||
output <- y
|
||||
}
|
||||
}
|
||||
if !dev.excludeMetrics["rocm_temp_vrgfx"] {
|
||||
value := metrics.Temperature_vrgfx
|
||||
y, err := lp.New("rocm_temp_vrgfx", dev.tags, dev.meta, map[string]interface{}{"value": value}, timestamp)
|
||||
if err == nil {
|
||||
output <- y
|
||||
}
|
||||
}
|
||||
if !dev.excludeMetrics["rocm_temp_vrsoc"] {
|
||||
value := metrics.Temperature_vrsoc
|
||||
y, err := lp.New("rocm_temp_vrsoc", dev.tags, dev.meta, map[string]interface{}{"value": value}, timestamp)
|
||||
if err == nil {
|
||||
output <- y
|
||||
}
|
||||
}
|
||||
if !dev.excludeMetrics["rocm_temp_vrmem"] {
|
||||
value := metrics.Temperature_vrmem
|
||||
y, err := lp.New("rocm_temp_vrmem", dev.tags, dev.meta, map[string]interface{}{"value": value}, timestamp)
|
||||
if err == nil {
|
||||
output <- y
|
||||
}
|
||||
}
|
||||
if !dev.excludeMetrics["rocm_gfx_clock"] {
|
||||
value := metrics.Average_gfxclk_frequency
|
||||
y, err := lp.New("rocm_gfx_clock", dev.tags, dev.meta, map[string]interface{}{"value": value}, timestamp)
|
||||
if err == nil {
|
||||
output <- y
|
||||
}
|
||||
}
|
||||
if !dev.excludeMetrics["rocm_soc_clock"] {
|
||||
value := metrics.Average_socclk_frequency
|
||||
y, err := lp.New("rocm_soc_clock", dev.tags, dev.meta, map[string]interface{}{"value": value}, timestamp)
|
||||
if err == nil {
|
||||
output <- y
|
||||
}
|
||||
}
|
||||
if !dev.excludeMetrics["rocm_u_clock"] {
|
||||
value := metrics.Average_uclk_frequency
|
||||
y, err := lp.New("rocm_u_clock", dev.tags, dev.meta, map[string]interface{}{"value": value}, timestamp)
|
||||
if err == nil {
|
||||
output <- y
|
||||
}
|
||||
}
|
||||
if !dev.excludeMetrics["rocm_v0_clock"] {
|
||||
value := metrics.Average_vclk0_frequency
|
||||
y, err := lp.New("rocm_v0_clock", dev.tags, dev.meta, map[string]interface{}{"value": value}, timestamp)
|
||||
if err == nil {
|
||||
output <- y
|
||||
}
|
||||
}
|
||||
if !dev.excludeMetrics["rocm_v1_clock"] {
|
||||
value := metrics.Average_vclk1_frequency
|
||||
y, err := lp.New("rocm_v1_clock", dev.tags, dev.meta, map[string]interface{}{"value": value}, timestamp)
|
||||
if err == nil {
|
||||
output <- y
|
||||
}
|
||||
}
|
||||
if !dev.excludeMetrics["rocm_d0_clock"] {
|
||||
value := metrics.Average_dclk0_frequency
|
||||
y, err := lp.New("rocm_d0_clock", dev.tags, dev.meta, map[string]interface{}{"value": value}, timestamp)
|
||||
if err == nil {
|
||||
output <- y
|
||||
}
|
||||
}
|
||||
if !dev.excludeMetrics["rocm_d1_clock"] {
|
||||
value := metrics.Average_dclk1_frequency
|
||||
y, err := lp.New("rocm_d1_clock", dev.tags, dev.meta, map[string]interface{}{"value": value}, timestamp)
|
||||
if err == nil {
|
||||
output <- y
|
||||
}
|
||||
}
|
||||
if !dev.excludeMetrics["rocm_temp_hbm"] {
|
||||
for i := 0; i < rocm_smi.NUM_HBM_INSTANCES; i++ {
|
||||
value := metrics.Temperature_hbm[i]
|
||||
y, err := lp.New("rocm_temp_hbm", dev.tags, dev.meta, map[string]interface{}{"value": value}, timestamp)
|
||||
if err == nil {
|
||||
y.AddTag("stype", "device")
|
||||
y.AddTag("stype-id", fmt.Sprintf("%d", i))
|
||||
output <- y
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
// Close metric collector: close network connection, close files, close libraries, ...
|
||||
// Called once by the collector manager
|
||||
func (m *RocmSmiCollector) Close() {
|
||||
// Unset flag
|
||||
ret := rocm_smi.Shutdown()
|
||||
if ret != rocm_smi.STATUS_SUCCESS {
|
||||
cclog.ComponentError(m.name, "Failed to shutdown ROCm SMI library")
|
||||
}
|
||||
m.init = false
|
||||
}
|
47
collectors/rocmsmiMetric.md
Normal file
47
collectors/rocmsmiMetric.md
Normal file
@@ -0,0 +1,47 @@
|
||||
|
||||
## `rocm_smi` collector
|
||||
|
||||
```json
|
||||
"rocm_smi": {
|
||||
"exclude_devices": [
|
||||
"0", "1", "00000000:FF:01.0"
|
||||
],
|
||||
"exclude_metrics": [
|
||||
"rocm_mm_util",
|
||||
"rocm_temp_vrsoc"
|
||||
],
|
||||
"use_pci_info_as_type_id": true,
|
||||
"add_pci_info_tag": false,
|
||||
"add_serial_meta": false
|
||||
}
|
||||
```
|
||||
|
||||
The `rocm_smi` collector can be configured to leave out specific devices with the `exclude_devices` option. It takes logical IDs in the list of available devices or the PCI address in a format similar to NVML (`%08X:%02X:%02X.%X`, i.e. `<domain>:<bus>:<device>.<function>` in upper-case hexadecimal). The entries are compared as exact strings. Metrics (listed below) that should not be sent to the MetricRouter can be excluded with the `exclude_metrics` option.
|
||||
|
||||
The metrics sent by the `rocm_smi` collector use `accelerator` as `type` tag. For the `type-id`, it uses the device handle index by default. With the `use_pci_info_as_type_id` option, the PCI ID is used instead. If both values should be added as tags, activate the `add_pci_info_tag` option. It uses the device handle index as `type-id` and adds the PCI ID as separate `pci_identifier` tag. The `add_pci_info_tag` option only takes effect if `use_pci_info_as_type_id` is not enabled.
|
||||
|
||||
Optionally, it is possible to add the serial number of the device to the meta information with the `add_serial_meta` option. Meta information is not sent to the sinks (unless configured otherwise).
|
||||
|
||||
|
||||
Metrics:
|
||||
* `rocm_gfx_util`
|
||||
* `rocm_umc_util`
|
||||
* `rocm_mm_util`
|
||||
* `rocm_avg_power`
|
||||
* `rocm_temp_mem`
|
||||
* `rocm_temp_hotspot`
|
||||
* `rocm_temp_edge`
|
||||
* `rocm_temp_vrgfx`
|
||||
* `rocm_temp_vrsoc`
|
||||
* `rocm_temp_vrmem`
|
||||
* `rocm_gfx_clock`
|
||||
* `rocm_soc_clock`
|
||||
* `rocm_u_clock`
|
||||
* `rocm_v0_clock`
|
||||
* `rocm_v1_clock`
|
||||
* `rocm_d0_clock`
|
||||
* `rocm_d1_clock`
|
||||
* `rocm_temp_hbm`
|
||||
|
||||
|
||||
Some metrics add an additional sub type tag (`stype`). For example, the `rocm_temp_hbm` metrics set `stype=device,stype-id=<HBM_slice_number>`.
|
@@ -35,6 +35,10 @@ func (m *SampleCollector) Init(config json.RawMessage) error {
|
||||
m.name = "InternalCollector"
|
||||
// This is for later use, also call it early
|
||||
m.setup()
|
||||
// Tell whether the collector should be run in parallel with others (reading files, ...)
|
||||
// or it should be run serially, mostly for collectors actually doing measurements
|
||||
// because they should not measure the execution of the other collectors
|
||||
m.parallel = true
|
||||
// Define meta information sent with each metric
|
||||
// (Can also be dynamic or this is the basic set with extension through AddMeta())
|
||||
m.meta = map[string]string{"source": m.name, "group": "SAMPLE"}
|
||||
@@ -42,7 +46,12 @@ func (m *SampleCollector) Init(config json.RawMessage) error {
|
||||
// The 'type' tag is always needed, it defines the granularity of the metric
|
||||
// node -> whole system
|
||||
// socket -> CPU socket (requires socket ID as 'type-id' tag)
|
||||
// cpu -> single CPU hardware thread (requires cpu ID as 'type-id' tag)
|
||||
// die -> CPU die (requires CPU die ID as 'type-id' tag)
|
||||
// memoryDomain -> NUMA domain (requires NUMA domain ID as 'type-id' tag)
|
||||
// llc -> Last level cache (requires last level cache ID as 'type-id' tag)
|
||||
// core -> single CPU core that may consist of multiple hardware threads (SMT) (requires core ID as 'type-id' tag)
|
||||
// hwthread -> single CPU hardware thread (requires hardware thread ID as 'type-id' tag)
|
||||
// accelerator -> An accelerator device like GPU or FPGA (requires an accelerator ID as 'type-id' tag)
|
||||
m.tags = map[string]string{"type": "node"}
|
||||
// Read in the JSON configuration
|
||||
if len(config) > 0 {
|
||||
|
@@ -50,6 +50,7 @@ func (m *TempCollector) Init(config json.RawMessage) error {
|
||||
}
|
||||
|
||||
m.name = "TempCollector"
|
||||
m.parallel = true
|
||||
m.setup()
|
||||
if len(config) > 0 {
|
||||
err := json.Unmarshal(config, &m.config)
|
||||
@@ -116,6 +117,10 @@ func (m *TempCollector) Init(config json.RawMessage) error {
|
||||
}
|
||||
|
||||
// Sensor file
|
||||
_, err = ioutil.ReadFile(file)
|
||||
if err != nil {
|
||||
continue
|
||||
}
|
||||
sensor.file = file
|
||||
|
||||
// Sensor tags
|
||||
|
@@ -28,6 +28,7 @@ type TopProcsCollector struct {
|
||||
func (m *TopProcsCollector) Init(config json.RawMessage) error {
|
||||
var err error
|
||||
m.name = "TopProcsCollector"
|
||||
m.parallel = true
|
||||
m.tags = map[string]string{"type": "node"}
|
||||
m.meta = map[string]string{"source": m.name, "group": "TopProcs"}
|
||||
if len(config) > 0 {
|
||||
|
Reference in New Issue
Block a user