mirror of
https://github.com/ClusterCockpit/cc-metric-collector.git
synced 2025-07-31 00:46:08 +02:00
Merge branch 'develop' into slurm_cgroup_collector
This commit is contained in:
@@ -1,31 +1,33 @@
|
||||
# LIKWID version
|
||||
LIKWID_VERSION = 5.2.2
|
||||
LIKWID_INSTALLED_FOLDER=$(shell dirname $(shell which likwid-topology 2>/dev/null) 2>/dev/null)
|
||||
LIKWID_VERSION := 5.2.2
|
||||
LIKWID_INSTALLED_FOLDER := $(shell dirname $$(which likwid-topology 2>/dev/null) 2>/dev/null)
|
||||
|
||||
LIKWID_FOLDER="$(shell pwd)/likwid"
|
||||
LIKWID_FOLDER := $(CURDIR)/likwid
|
||||
|
||||
all: $(LIKWID_FOLDER)/likwid.h
|
||||
all: likwid
|
||||
|
||||
.ONESHELL:
|
||||
.PHONY: $(LIKWID_FOLDER)/likwid.h
|
||||
$(LIKWID_FOLDER)/likwid.h:
|
||||
if [ "$(LIKWID_INSTALLED_FOLDER)" != "" ]; then \
|
||||
BASE="$(LIKWID_INSTALLED_FOLDER)/../include"; \
|
||||
mkdir -p $(LIKWID_FOLDER); \
|
||||
cp $$BASE/*.h $(LIKWID_FOLDER); \
|
||||
else \
|
||||
BUILD_FOLDER="$${PWD}/likwidbuild"; \
|
||||
if [ -d $(LIKWID_FOLDER) ]; then rm -r $(LIKWID_FOLDER); fi; \
|
||||
mkdir --parents --verbose $(LIKWID_FOLDER) $${BUILD_FOLDER}; \
|
||||
wget -P "$${BUILD_FOLDER}" http://ftp.rrze.uni-erlangen.de/mirrors/likwid/likwid-$(LIKWID_VERSION).tar.gz; \
|
||||
tar -C $${BUILD_FOLDER} -xf $${BUILD_FOLDER}/likwid-$(LIKWID_VERSION).tar.gz; \
|
||||
install -Dpm 0644 $${BUILD_FOLDER}/likwid-$(LIKWID_VERSION)/src/includes/likwid*.h $(LIKWID_FOLDER)/; \
|
||||
install -Dpm 0644 $${BUILD_FOLDER}/likwid-$(LIKWID_VERSION)/src/includes/bstrlib.h $(LIKWID_FOLDER)/; \
|
||||
rm -r $${BUILD_FOLDER}; \
|
||||
.PHONY: likwid
|
||||
likwid:
|
||||
if [ -n "$(LIKWID_INSTALLED_FOLDER)" ]; then
|
||||
# Using likwid include files from system installation
|
||||
INCLUDE_DIR="$(LIKWID_INSTALLED_FOLDER)/../include"
|
||||
mkdir --parents --verbose "$(LIKWID_FOLDER)"
|
||||
cp "$${INCLUDE_DIR}"/*.h "$(LIKWID_FOLDER)"
|
||||
else
|
||||
# Using likwid include files from downloaded tar archive
|
||||
if [ -d "$(LIKWID_FOLDER)" ]; then
|
||||
rm --recursive "$(LIKWID_FOLDER)"
|
||||
fi
|
||||
BUILD_FOLDER="$${PWD}/likwidbuild"
|
||||
mkdir --parents --verbose "$${BUILD_FOLDER}"
|
||||
wget --output-document=- http://ftp.rrze.uni-erlangen.de/mirrors/likwid/likwid-$(LIKWID_VERSION).tar.gz |
|
||||
tar --directory="$${BUILD_FOLDER}" --extract --gz
|
||||
install -D --verbose --preserve-timestamps --mode=0644 --target-directory="$(LIKWID_FOLDER)" "$${BUILD_FOLDER}/likwid-$(LIKWID_VERSION)/src/includes"/likwid*.h "$${BUILD_FOLDER}/likwid-$(LIKWID_VERSION)/src/includes"/bstrlib.h
|
||||
rm --recursive "$${BUILD_FOLDER}"
|
||||
fi
|
||||
|
||||
|
||||
.PHONY: clean
|
||||
clean:
|
||||
rm -rf likwid
|
||||
|
||||
.PHONY: clean
|
||||
|
@@ -41,6 +41,7 @@ var AvailableCollectors = map[string]MetricCollector{
|
||||
"self": new(SelfCollector),
|
||||
"schedstat": new(SchedstatCollector),
|
||||
"slurm": new(SlurmJobDetector),
|
||||
"nfsiostat": new(NfsIOStatCollector),
|
||||
}
|
||||
|
||||
// Metric collector manager data structure
|
||||
|
@@ -14,29 +14,18 @@ import (
|
||||
lp "github.com/ClusterCockpit/cc-metric-collector/pkg/ccMetric"
|
||||
)
|
||||
|
||||
//
|
||||
// CPUFreqCollector
|
||||
// a metric collector to measure the current frequency of the CPUs
|
||||
// as obtained from /proc/cpuinfo
|
||||
// Only measure on the first hyperthread
|
||||
//
|
||||
type CPUFreqCpuInfoCollectorTopology struct {
|
||||
processor string // logical processor number (continuous, starting at 0)
|
||||
coreID string // socket local core ID
|
||||
coreID_int int64
|
||||
physicalPackageID string // socket / package ID
|
||||
physicalPackageID_int int64
|
||||
numPhysicalPackages string // number of sockets / packages
|
||||
numPhysicalPackages_int int64
|
||||
isHT bool
|
||||
numNonHT string // number of non hyperthreading processors
|
||||
numNonHT_int int64
|
||||
tagSet map[string]string
|
||||
isHT bool
|
||||
tagSet map[string]string
|
||||
}
|
||||
|
||||
type CPUFreqCpuInfoCollector struct {
|
||||
metricCollector
|
||||
topology []*CPUFreqCpuInfoCollectorTopology
|
||||
topology []CPUFreqCpuInfoCollectorTopology
|
||||
}
|
||||
|
||||
func (m *CPUFreqCpuInfoCollector) Init(config json.RawMessage) error {
|
||||
@@ -65,11 +54,9 @@ func (m *CPUFreqCpuInfoCollector) Init(config json.RawMessage) error {
|
||||
// Collect topology information from file cpuinfo
|
||||
foundFreq := false
|
||||
processor := ""
|
||||
var numNonHT_int int64 = 0
|
||||
coreID := ""
|
||||
physicalPackageID := ""
|
||||
var maxPhysicalPackageID int64 = 0
|
||||
m.topology = make([]*CPUFreqCpuInfoCollectorTopology, 0)
|
||||
m.topology = make([]CPUFreqCpuInfoCollectorTopology, 0)
|
||||
coreSeenBefore := make(map[string]bool)
|
||||
|
||||
// Read cpuinfo file, line by line
|
||||
@@ -98,41 +85,22 @@ func (m *CPUFreqCpuInfoCollector) Init(config json.RawMessage) error {
|
||||
len(coreID) > 0 &&
|
||||
len(physicalPackageID) > 0 {
|
||||
|
||||
topology := new(CPUFreqCpuInfoCollectorTopology)
|
||||
|
||||
// Processor
|
||||
topology.processor = processor
|
||||
|
||||
// Core ID
|
||||
topology.coreID = coreID
|
||||
topology.coreID_int, err = strconv.ParseInt(coreID, 10, 64)
|
||||
if err != nil {
|
||||
return fmt.Errorf("unable to convert coreID '%s' to int64: %v", coreID, err)
|
||||
}
|
||||
|
||||
// Physical package ID
|
||||
topology.physicalPackageID = physicalPackageID
|
||||
topology.physicalPackageID_int, err = strconv.ParseInt(physicalPackageID, 10, 64)
|
||||
if err != nil {
|
||||
return fmt.Errorf("unable to convert physicalPackageID '%s' to int64: %v", physicalPackageID, err)
|
||||
}
|
||||
|
||||
// increase maximum socket / package ID, when required
|
||||
if topology.physicalPackageID_int > maxPhysicalPackageID {
|
||||
maxPhysicalPackageID = topology.physicalPackageID_int
|
||||
}
|
||||
|
||||
// is hyperthread?
|
||||
globalID := physicalPackageID + ":" + coreID
|
||||
topology.isHT = coreSeenBefore[globalID]
|
||||
coreSeenBefore[globalID] = true
|
||||
if !topology.isHT {
|
||||
// increase number of non hyper thread cores
|
||||
numNonHT_int++
|
||||
}
|
||||
|
||||
// store collected topology information
|
||||
m.topology = append(m.topology, topology)
|
||||
m.topology = append(m.topology,
|
||||
CPUFreqCpuInfoCollectorTopology{
|
||||
isHT: coreSeenBefore[globalID],
|
||||
tagSet: map[string]string{
|
||||
"type": "hwthread",
|
||||
"type-id": processor,
|
||||
"package_id": physicalPackageID,
|
||||
},
|
||||
},
|
||||
)
|
||||
|
||||
// mark core as seen before
|
||||
coreSeenBefore[globalID] = true
|
||||
|
||||
// reset topology information
|
||||
foundFreq = false
|
||||
@@ -142,24 +110,9 @@ func (m *CPUFreqCpuInfoCollector) Init(config json.RawMessage) error {
|
||||
}
|
||||
}
|
||||
|
||||
// Check if at least one CPU with frequency information was detected
|
||||
if len(m.topology) == 0 {
|
||||
return fmt.Errorf("No CPU frequency info found in %s", cpuInfoFile)
|
||||
}
|
||||
|
||||
numPhysicalPackageID_int := maxPhysicalPackageID + 1
|
||||
numPhysicalPackageID := fmt.Sprint(numPhysicalPackageID_int)
|
||||
numNonHT := fmt.Sprint(numNonHT_int)
|
||||
for _, t := range m.topology {
|
||||
t.numPhysicalPackages = numPhysicalPackageID
|
||||
t.numPhysicalPackages_int = numPhysicalPackageID_int
|
||||
t.numNonHT = numNonHT
|
||||
t.numNonHT_int = numNonHT_int
|
||||
t.tagSet = map[string]string{
|
||||
"type": "hwthread",
|
||||
"type-id": t.processor,
|
||||
"package_id": t.physicalPackageID,
|
||||
}
|
||||
// Check if at least one CPU with frequency information was detected
|
||||
if len(m.topology) == 0 {
|
||||
return fmt.Errorf("No CPU frequency info found in %s", cpuInfoFile)
|
||||
}
|
||||
|
||||
m.init = true
|
||||
|
@@ -1,5 +1,5 @@
|
||||
|
||||
## `cpufreq_cpuinfo` collector
|
||||
|
||||
```json
|
||||
"cpufreq_cpuinfo": {}
|
||||
```
|
||||
@@ -7,4 +7,5 @@
|
||||
The `cpufreq_cpuinfo` collector reads the clock frequency from `/proc/cpuinfo` and outputs a handful of **hwthread** metrics.
|
||||
|
||||
Metrics:
|
||||
|
||||
* `cpufreq`
|
||||
|
@@ -11,22 +11,13 @@ import (
|
||||
|
||||
cclog "github.com/ClusterCockpit/cc-metric-collector/pkg/ccLogger"
|
||||
lp "github.com/ClusterCockpit/cc-metric-collector/pkg/ccMetric"
|
||||
"github.com/ClusterCockpit/cc-metric-collector/pkg/ccTopology"
|
||||
"golang.org/x/sys/unix"
|
||||
)
|
||||
|
||||
type CPUFreqCollectorTopology struct {
|
||||
processor string // logical processor number (continuous, starting at 0)
|
||||
coreID string // socket local core ID
|
||||
coreID_int int64
|
||||
physicalPackageID string // socket / package ID
|
||||
physicalPackageID_int int64
|
||||
numPhysicalPackages string // number of sockets / packages
|
||||
numPhysicalPackages_int int64
|
||||
isHT bool
|
||||
numNonHT string // number of non hyper-threading processors
|
||||
numNonHT_int int64
|
||||
scalingCurFreqFile string
|
||||
tagSet map[string]string
|
||||
scalingCurFreqFile string
|
||||
tagSet map[string]string
|
||||
}
|
||||
|
||||
// CPUFreqCollector
|
||||
@@ -64,112 +55,38 @@ func (m *CPUFreqCollector) Init(config json.RawMessage) error {
|
||||
"unit": "Hz",
|
||||
}
|
||||
|
||||
// Loop for all CPU directories
|
||||
baseDir := "/sys/devices/system/cpu"
|
||||
globPattern := filepath.Join(baseDir, "cpu[0-9]*")
|
||||
cpuDirs, err := filepath.Glob(globPattern)
|
||||
if err != nil {
|
||||
return fmt.Errorf("unable to glob files with pattern '%s': %v", globPattern, err)
|
||||
}
|
||||
if cpuDirs == nil {
|
||||
return fmt.Errorf("unable to find any files with pattern '%s'", globPattern)
|
||||
}
|
||||
m.topology = make([]CPUFreqCollectorTopology, 0)
|
||||
for _, c := range ccTopology.CpuData() {
|
||||
|
||||
// Initialize CPU topology
|
||||
m.topology = make([]CPUFreqCollectorTopology, len(cpuDirs))
|
||||
for _, cpuDir := range cpuDirs {
|
||||
processor := strings.TrimPrefix(cpuDir, "/sys/devices/system/cpu/cpu")
|
||||
processor_int, err := strconv.ParseInt(processor, 10, 64)
|
||||
if err != nil {
|
||||
return fmt.Errorf("unable to convert cpuID '%s' to int64: %v", processor, err)
|
||||
}
|
||||
|
||||
// Read package ID
|
||||
physicalPackageIDFile := filepath.Join(cpuDir, "topology", "physical_package_id")
|
||||
line, err := os.ReadFile(physicalPackageIDFile)
|
||||
if err != nil {
|
||||
return fmt.Errorf("unable to read physical package ID from file '%s': %v", physicalPackageIDFile, err)
|
||||
}
|
||||
physicalPackageID := strings.TrimSpace(string(line))
|
||||
physicalPackageID_int, err := strconv.ParseInt(physicalPackageID, 10, 64)
|
||||
if err != nil {
|
||||
return fmt.Errorf("unable to convert packageID '%s' to int64: %v", physicalPackageID, err)
|
||||
}
|
||||
|
||||
// Read core ID
|
||||
coreIDFile := filepath.Join(cpuDir, "topology", "core_id")
|
||||
line, err = os.ReadFile(coreIDFile)
|
||||
if err != nil {
|
||||
return fmt.Errorf("unable to read core ID from file '%s': %v", coreIDFile, err)
|
||||
}
|
||||
coreID := strings.TrimSpace(string(line))
|
||||
coreID_int, err := strconv.ParseInt(coreID, 10, 64)
|
||||
if err != nil {
|
||||
return fmt.Errorf("unable to convert coreID '%s' to int64: %v", coreID, err)
|
||||
// Skip hyper threading CPUs
|
||||
if c.CpuID != c.CoreCPUsList[0] {
|
||||
continue
|
||||
}
|
||||
|
||||
// Check access to current frequency file
|
||||
scalingCurFreqFile := filepath.Join(cpuDir, "cpufreq", "scaling_cur_freq")
|
||||
err = unix.Access(scalingCurFreqFile, unix.R_OK)
|
||||
scalingCurFreqFile := filepath.Join("/sys/devices/system/cpu", fmt.Sprintf("cpu%d", c.CpuID), "cpufreq/scaling_cur_freq")
|
||||
err := unix.Access(scalingCurFreqFile, unix.R_OK)
|
||||
if err != nil {
|
||||
return fmt.Errorf("unable to access file '%s': %v", scalingCurFreqFile, err)
|
||||
}
|
||||
|
||||
t := &m.topology[processor_int]
|
||||
t.processor = processor
|
||||
t.physicalPackageID = physicalPackageID
|
||||
t.physicalPackageID_int = physicalPackageID_int
|
||||
t.coreID = coreID
|
||||
t.coreID_int = coreID_int
|
||||
t.scalingCurFreqFile = scalingCurFreqFile
|
||||
}
|
||||
|
||||
// is processor a hyper-thread?
|
||||
coreSeenBefore := make(map[string]bool)
|
||||
for i := range m.topology {
|
||||
t := &m.topology[i]
|
||||
|
||||
globalID := t.physicalPackageID + ":" + t.coreID
|
||||
t.isHT = coreSeenBefore[globalID]
|
||||
coreSeenBefore[globalID] = true
|
||||
}
|
||||
|
||||
// number of non hyper-thread cores and packages / sockets
|
||||
var numNonHT_int int64 = 0
|
||||
PhysicalPackageIDs := make(map[int64]struct{})
|
||||
for i := range m.topology {
|
||||
t := &m.topology[i]
|
||||
|
||||
if !t.isHT {
|
||||
numNonHT_int++
|
||||
}
|
||||
|
||||
PhysicalPackageIDs[t.physicalPackageID_int] = struct{}{}
|
||||
}
|
||||
|
||||
numPhysicalPackageID_int := int64(len(PhysicalPackageIDs))
|
||||
numPhysicalPackageID := fmt.Sprint(numPhysicalPackageID_int)
|
||||
numNonHT := fmt.Sprint(numNonHT_int)
|
||||
for i := range m.topology {
|
||||
t := &m.topology[i]
|
||||
t.numPhysicalPackages = numPhysicalPackageID
|
||||
t.numPhysicalPackages_int = numPhysicalPackageID_int
|
||||
t.numNonHT = numNonHT
|
||||
t.numNonHT_int = numNonHT_int
|
||||
t.tagSet = map[string]string{
|
||||
"type": "hwthread",
|
||||
"type-id": t.processor,
|
||||
"package_id": t.physicalPackageID,
|
||||
}
|
||||
m.topology = append(m.topology,
|
||||
CPUFreqCollectorTopology{
|
||||
tagSet: map[string]string{
|
||||
"type": "hwthread",
|
||||
"type-id": fmt.Sprint(c.CpuID),
|
||||
"package_id": fmt.Sprint(c.Socket),
|
||||
},
|
||||
scalingCurFreqFile: scalingCurFreqFile,
|
||||
},
|
||||
)
|
||||
}
|
||||
|
||||
// Initialized
|
||||
cclog.ComponentDebug(
|
||||
m.name,
|
||||
"initialized",
|
||||
numPhysicalPackageID_int, "physical packages,",
|
||||
len(cpuDirs), "CPUs,",
|
||||
numNonHT, "non-hyper-threading CPUs")
|
||||
len(m.topology), "non-hyper-threading CPUs")
|
||||
m.init = true
|
||||
return nil
|
||||
}
|
||||
@@ -184,11 +101,6 @@ func (m *CPUFreqCollector) Read(interval time.Duration, output chan lp.CCMetric)
|
||||
for i := range m.topology {
|
||||
t := &m.topology[i]
|
||||
|
||||
// skip hyper-threads
|
||||
if t.isHT {
|
||||
continue
|
||||
}
|
||||
|
||||
// Read current frequency
|
||||
line, err := os.ReadFile(t.scalingCurFreqFile)
|
||||
if err != nil {
|
||||
|
@@ -31,6 +31,7 @@ type GpfsCollector struct {
|
||||
Mmpmon string `json:"mmpmon_path,omitempty"`
|
||||
ExcludeFilesystem []string `json:"exclude_filesystem,omitempty"`
|
||||
SendBandwidths bool `json:"send_bandwidths"`
|
||||
SendTotalValues bool `json:"send_total_values"`
|
||||
}
|
||||
skipFS map[string]struct{}
|
||||
lastTimestamp time.Time // Store time stamp of last tick to derive bandwidths
|
||||
@@ -216,13 +217,33 @@ func (m *GpfsCollector) Read(interval time.Duration, output chan lp.CCMetric) {
|
||||
fmt.Sprintf("Read(): Failed to convert bytes read '%s' to int64: %v", key_value["_br_"], err))
|
||||
continue
|
||||
}
|
||||
if y, err := lp.New("gpfs_bytes_read", m.tags, m.meta, map[string]interface{}{"value": bytesRead}, timestamp); err == nil {
|
||||
if y, err :=
|
||||
lp.New(
|
||||
"gpfs_bytes_read",
|
||||
m.tags,
|
||||
m.meta,
|
||||
map[string]interface{}{
|
||||
"value": bytesRead,
|
||||
},
|
||||
timestamp,
|
||||
); err == nil {
|
||||
y.AddMeta("unit", "bytes")
|
||||
output <- y
|
||||
}
|
||||
if m.config.SendBandwidths {
|
||||
if lastBytesRead := m.lastState[filesystem].bytesRead; lastBytesRead >= 0 {
|
||||
bwRead := float64(bytesRead-lastBytesRead) / timeDiff
|
||||
if y, err := lp.New("gpfs_bw_read", m.tags, m.meta, map[string]interface{}{"value": bwRead}, timestamp); err == nil {
|
||||
if y, err :=
|
||||
lp.New(
|
||||
"gpfs_bw_read",
|
||||
m.tags,
|
||||
m.meta,
|
||||
map[string]interface{}{
|
||||
"value": bwRead,
|
||||
},
|
||||
timestamp,
|
||||
); err == nil {
|
||||
y.AddMeta("unit", "bytes/sec")
|
||||
output <- y
|
||||
}
|
||||
}
|
||||
@@ -236,13 +257,33 @@ func (m *GpfsCollector) Read(interval time.Duration, output chan lp.CCMetric) {
|
||||
fmt.Sprintf("Read(): Failed to convert bytes written '%s' to int64: %v", key_value["_bw_"], err))
|
||||
continue
|
||||
}
|
||||
if y, err := lp.New("gpfs_bytes_written", m.tags, m.meta, map[string]interface{}{"value": bytesWritten}, timestamp); err == nil {
|
||||
if y, err :=
|
||||
lp.New(
|
||||
"gpfs_bytes_written",
|
||||
m.tags,
|
||||
m.meta,
|
||||
map[string]interface{}{
|
||||
"value": bytesWritten,
|
||||
},
|
||||
timestamp,
|
||||
); err == nil {
|
||||
y.AddMeta("unit", "bytes")
|
||||
output <- y
|
||||
}
|
||||
if m.config.SendBandwidths {
|
||||
if lastBytesWritten := m.lastState[filesystem].bytesRead; lastBytesWritten >= 0 {
|
||||
bwWrite := float64(bytesWritten-lastBytesWritten) / timeDiff
|
||||
if y, err := lp.New("gpfs_bw_write", m.tags, m.meta, map[string]interface{}{"value": bwWrite}, timestamp); err == nil {
|
||||
if y, err :=
|
||||
lp.New(
|
||||
"gpfs_bw_write",
|
||||
m.tags,
|
||||
m.meta,
|
||||
map[string]interface{}{
|
||||
"value": bwWrite,
|
||||
},
|
||||
timestamp,
|
||||
); err == nil {
|
||||
y.AddMeta("unit", "bytes/sec")
|
||||
output <- y
|
||||
}
|
||||
}
|
||||
@@ -326,6 +367,47 @@ func (m *GpfsCollector) Read(interval time.Duration, output chan lp.CCMetric) {
|
||||
if y, err := lp.New("gpfs_num_inode_updates", m.tags, m.meta, map[string]interface{}{"value": numInodeUpdates}, timestamp); err == nil {
|
||||
output <- y
|
||||
}
|
||||
|
||||
// Total values
|
||||
if m.config.SendTotalValues {
|
||||
bytesTotal := bytesRead + bytesWritten
|
||||
if y, err :=
|
||||
lp.New("gpfs_bytes_total",
|
||||
m.tags,
|
||||
m.meta,
|
||||
map[string]interface{}{
|
||||
"value": bytesTotal,
|
||||
},
|
||||
timestamp,
|
||||
); err == nil {
|
||||
y.AddMeta("unit", "bytes")
|
||||
output <- y
|
||||
}
|
||||
iops := numReads + numWrites
|
||||
if y, err :=
|
||||
lp.New("gpfs_iops",
|
||||
m.tags,
|
||||
m.meta,
|
||||
map[string]interface{}{
|
||||
"value": iops,
|
||||
},
|
||||
timestamp,
|
||||
); err == nil {
|
||||
output <- y
|
||||
}
|
||||
metaops := numInodeUpdates + numCloses + numOpens + numReaddirs
|
||||
if y, err :=
|
||||
lp.New("gpfs_metaops",
|
||||
m.tags,
|
||||
m.meta,
|
||||
map[string]interface{}{
|
||||
"value": metaops,
|
||||
},
|
||||
timestamp,
|
||||
); err == nil {
|
||||
output <- y
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
@@ -6,7 +6,8 @@
|
||||
"exclude_filesystem": [
|
||||
"fs1"
|
||||
],
|
||||
"send_bandwidths" : true
|
||||
"send_bandwidths": true,
|
||||
"send_total_values": true
|
||||
}
|
||||
```
|
||||
|
||||
@@ -26,8 +27,12 @@ Metrics:
|
||||
* `gpfs_num_opens`
|
||||
* `gpfs_num_closes`
|
||||
* `gpfs_num_reads`
|
||||
* `gpfs_num_writes`
|
||||
* `gpfs_num_readdirs`
|
||||
* `gpfs_num_inode_updates`
|
||||
* `gpfs_bytes_total = gpfs_bytes_read + gpfs_bytes_written` (if `send_total_values == true`)
|
||||
* `gpfs_iops = gpfs_num_reads + gpfs_num_writes` (if `send_total_values == true`)
|
||||
* `gpfs_metaops = gpfs_num_inode_updates + gpfs_num_closes + gpfs_num_opens + gpfs_num_readdirs` (if `send_total_values == true`)
|
||||
* `gpfs_bw_read` (if `send_bandwidths == true`)
|
||||
* `gpfs_bw_write` (if `send_bandwidths == true`)
|
||||
|
||||
|
@@ -18,18 +18,22 @@ import (
|
||||
const IB_BASEPATH = "/sys/class/infiniband/"
|
||||
|
||||
type InfinibandCollectorMetric struct {
|
||||
path string
|
||||
unit string
|
||||
scale int64
|
||||
name string
|
||||
path string
|
||||
unit string
|
||||
scale int64
|
||||
addToIBTotal bool
|
||||
addToIBTotalPkgs bool
|
||||
currentState int64
|
||||
lastState int64
|
||||
}
|
||||
|
||||
type InfinibandCollectorInfo struct {
|
||||
LID string // IB local Identifier (LID)
|
||||
device string // IB device
|
||||
port string // IB device port
|
||||
portCounterFiles map[string]InfinibandCollectorMetric // mapping counter name -> InfinibandCollectorMetric
|
||||
tagSet map[string]string // corresponding tag list
|
||||
lastState map[string]int64 // State from last measurement
|
||||
LID string // IB local Identifier (LID)
|
||||
device string // IB device
|
||||
port string // IB device port
|
||||
portCounterFiles []InfinibandCollectorMetric // mapping counter name -> InfinibandCollectorMetric
|
||||
tagSet map[string]string // corresponding tag list
|
||||
}
|
||||
|
||||
type InfinibandCollector struct {
|
||||
@@ -37,9 +41,10 @@ type InfinibandCollector struct {
|
||||
config struct {
|
||||
ExcludeDevices []string `json:"exclude_devices,omitempty"` // IB device to exclude e.g. mlx5_0
|
||||
SendAbsoluteValues bool `json:"send_abs_values"` // Send absolut values as read from sys filesystem
|
||||
SendTotalValues bool `json:"send_total_values"` // Send computed total values
|
||||
SendDerivedValues bool `json:"send_derived_values"` // Send derived values e.g. rates
|
||||
}
|
||||
info []*InfinibandCollectorInfo
|
||||
info []InfinibandCollectorInfo
|
||||
lastTimestamp time.Time // Store time stamp of last tick to derive bandwidths
|
||||
}
|
||||
|
||||
@@ -112,11 +117,39 @@ func (m *InfinibandCollector) Init(config json.RawMessage) error {
|
||||
|
||||
// Check access to counter files
|
||||
countersDir := filepath.Join(path, "counters")
|
||||
portCounterFiles := map[string]InfinibandCollectorMetric{
|
||||
"ib_recv": {path: filepath.Join(countersDir, "port_rcv_data"), unit: "bytes", scale: 4},
|
||||
"ib_xmit": {path: filepath.Join(countersDir, "port_xmit_data"), unit: "bytes", scale: 4},
|
||||
"ib_recv_pkts": {path: filepath.Join(countersDir, "port_rcv_packets"), unit: "packets", scale: 1},
|
||||
"ib_xmit_pkts": {path: filepath.Join(countersDir, "port_xmit_packets"), unit: "packets", scale: 1},
|
||||
portCounterFiles := []InfinibandCollectorMetric{
|
||||
{
|
||||
name: "ib_recv",
|
||||
path: filepath.Join(countersDir, "port_rcv_data"),
|
||||
unit: "bytes",
|
||||
scale: 4,
|
||||
addToIBTotal: true,
|
||||
lastState: -1,
|
||||
},
|
||||
{
|
||||
name: "ib_xmit",
|
||||
path: filepath.Join(countersDir, "port_xmit_data"),
|
||||
unit: "bytes",
|
||||
scale: 4,
|
||||
addToIBTotal: true,
|
||||
lastState: -1,
|
||||
},
|
||||
{
|
||||
name: "ib_recv_pkts",
|
||||
path: filepath.Join(countersDir, "port_rcv_packets"),
|
||||
unit: "packets",
|
||||
scale: 1,
|
||||
addToIBTotalPkgs: true,
|
||||
lastState: -1,
|
||||
},
|
||||
{
|
||||
name: "ib_xmit_pkts",
|
||||
path: filepath.Join(countersDir, "port_xmit_packets"),
|
||||
unit: "packets",
|
||||
scale: 1,
|
||||
addToIBTotalPkgs: true,
|
||||
lastState: -1,
|
||||
},
|
||||
}
|
||||
for _, counter := range portCounterFiles {
|
||||
err := unix.Access(counter.path, unix.R_OK)
|
||||
@@ -125,14 +158,8 @@ func (m *InfinibandCollector) Init(config json.RawMessage) error {
|
||||
}
|
||||
}
|
||||
|
||||
// Initialize last state
|
||||
lastState := make(map[string]int64)
|
||||
for counter := range portCounterFiles {
|
||||
lastState[counter] = -1
|
||||
}
|
||||
|
||||
m.info = append(m.info,
|
||||
&InfinibandCollectorInfo{
|
||||
InfinibandCollectorInfo{
|
||||
LID: LID,
|
||||
device: device,
|
||||
port: port,
|
||||
@@ -143,7 +170,6 @@ func (m *InfinibandCollector) Init(config json.RawMessage) error {
|
||||
"port": port,
|
||||
"lid": LID,
|
||||
},
|
||||
lastState: lastState,
|
||||
})
|
||||
}
|
||||
|
||||
@@ -170,8 +196,12 @@ func (m *InfinibandCollector) Read(interval time.Duration, output chan lp.CCMetr
|
||||
// Save current timestamp
|
||||
m.lastTimestamp = now
|
||||
|
||||
for _, info := range m.info {
|
||||
for counterName, counterDef := range info.portCounterFiles {
|
||||
for i := range m.info {
|
||||
info := &m.info[i]
|
||||
|
||||
var ib_total, ib_total_pkts int64
|
||||
for i := range info.portCounterFiles {
|
||||
counterDef := &info.portCounterFiles[i]
|
||||
|
||||
// Read counter file
|
||||
line, err := os.ReadFile(counterDef.path)
|
||||
@@ -188,15 +218,26 @@ func (m *InfinibandCollector) Read(interval time.Duration, output chan lp.CCMetr
|
||||
if err != nil {
|
||||
cclog.ComponentError(
|
||||
m.name,
|
||||
fmt.Sprintf("Read(): Failed to convert Infininiband metrice %s='%s' to int64: %v", counterName, data, err))
|
||||
fmt.Sprintf("Read(): Failed to convert Infininiband metrice %s='%s' to int64: %v", counterDef.name, data, err))
|
||||
continue
|
||||
}
|
||||
// Scale raw value
|
||||
v *= counterDef.scale
|
||||
|
||||
// Save current state
|
||||
counterDef.currentState = v
|
||||
|
||||
// Send absolute values
|
||||
if m.config.SendAbsoluteValues {
|
||||
if y, err := lp.New(counterName, info.tagSet, m.meta, map[string]interface{}{"value": v}, now); err == nil {
|
||||
if y, err :=
|
||||
lp.New(
|
||||
counterDef.name,
|
||||
info.tagSet,
|
||||
m.meta,
|
||||
map[string]interface{}{
|
||||
"value": counterDef.currentState,
|
||||
},
|
||||
now); err == nil {
|
||||
y.AddMeta("unit", counterDef.unit)
|
||||
output <- y
|
||||
}
|
||||
@@ -204,18 +245,64 @@ func (m *InfinibandCollector) Read(interval time.Duration, output chan lp.CCMetr
|
||||
|
||||
// Send derived values
|
||||
if m.config.SendDerivedValues {
|
||||
if info.lastState[counterName] >= 0 {
|
||||
rate := float64((v - info.lastState[counterName])) / timeDiff
|
||||
if y, err := lp.New(counterName+"_bw", info.tagSet, m.meta, map[string]interface{}{"value": rate}, now); err == nil {
|
||||
if counterDef.lastState >= 0 {
|
||||
rate := float64((counterDef.currentState - counterDef.lastState)) / timeDiff
|
||||
if y, err :=
|
||||
lp.New(
|
||||
counterDef.name+"_bw",
|
||||
info.tagSet,
|
||||
m.meta,
|
||||
map[string]interface{}{
|
||||
"value": rate,
|
||||
},
|
||||
now); err == nil {
|
||||
y.AddMeta("unit", counterDef.unit+"/sec")
|
||||
output <- y
|
||||
|
||||
}
|
||||
}
|
||||
// Save current state
|
||||
info.lastState[counterName] = v
|
||||
counterDef.lastState = counterDef.currentState
|
||||
}
|
||||
|
||||
// Sum up total values
|
||||
if m.config.SendTotalValues {
|
||||
switch {
|
||||
case counterDef.addToIBTotal:
|
||||
ib_total += counterDef.currentState
|
||||
case counterDef.addToIBTotalPkgs:
|
||||
ib_total_pkts += counterDef.currentState
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Send total values
|
||||
if m.config.SendTotalValues {
|
||||
if y, err :=
|
||||
lp.New(
|
||||
"ib_total",
|
||||
info.tagSet,
|
||||
m.meta,
|
||||
map[string]interface{}{
|
||||
"value": ib_total,
|
||||
},
|
||||
now); err == nil {
|
||||
y.AddMeta("unit", "bytes")
|
||||
output <- y
|
||||
}
|
||||
|
||||
if y, err :=
|
||||
lp.New(
|
||||
"ib_total_pkts",
|
||||
info.tagSet,
|
||||
m.meta,
|
||||
map[string]interface{}{
|
||||
"value": ib_total_pkts,
|
||||
},
|
||||
now); err == nil {
|
||||
y.AddMeta("unit", "packets")
|
||||
output <- y
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
@@ -17,13 +17,16 @@ LID file (`/sys/class/infiniband/<dev>/ports/<port>/lid`)
|
||||
|
||||
The devices can be filtered with the `exclude_devices` option in the configuration.
|
||||
|
||||
For each found LID the collector reads data through the sysfs files below `/sys/class/infiniband/<device>`.
|
||||
For each found LID the collector reads data through the sysfs files below `/sys/class/infiniband/<device>`. (See: <https://www.kernel.org/doc/Documentation/ABI/stable/sysfs-class-infiniband>)
|
||||
|
||||
Metrics:
|
||||
|
||||
* `ib_recv`
|
||||
* `ib_xmit`
|
||||
* `ib_recv_pkts`
|
||||
* `ib_xmit_pkts`
|
||||
* `ib_total = ib_recv + ib_xmit` (if `send_total_values == true`)
|
||||
* `ib_total_pkts = ib_recv_pkts + ib_xmit_pkts` (if `send_total_values == true`)
|
||||
* `ib_recv_bw` (if `send_derived_values == true`)
|
||||
* `ib_xmit_bw` (if `send_derived_values == true`)
|
||||
* `ib_recv_pkts_bw` (if `send_derived_values == true`)
|
||||
|
@@ -29,8 +29,8 @@ import (
|
||||
lp "github.com/ClusterCockpit/cc-metric-collector/pkg/ccMetric"
|
||||
topo "github.com/ClusterCockpit/cc-metric-collector/pkg/ccTopology"
|
||||
"github.com/NVIDIA/go-nvml/pkg/dl"
|
||||
"github.com/fsnotify/fsnotify"
|
||||
"golang.design/x/thread"
|
||||
fsnotify "gopkg.in/fsnotify.v0"
|
||||
)
|
||||
|
||||
const (
|
||||
@@ -41,11 +41,14 @@ const (
|
||||
)
|
||||
|
||||
type LikwidCollectorMetricConfig struct {
|
||||
Name string `json:"name"` // Name of the metric
|
||||
Calc string `json:"calc"` // Calculation for the metric using
|
||||
Type string `json:"type"` // Metric type (aka node, socket, cpu, ...)
|
||||
Publish bool `json:"publish"`
|
||||
Unit string `json:"unit"` // Unit of metric if any
|
||||
Name string `json:"name"` // Name of the metric
|
||||
Calc string `json:"calc"` // Calculation for the metric using
|
||||
Type string `json:"type"` // Metric type (aka node, socket, cpu, ...)
|
||||
Publish bool `json:"publish"`
|
||||
SendCoreTotalVal bool `json:"send_core_total_values,omitempty"`
|
||||
SendSocketTotalVal bool `json:"send_socket_total_values,omitempty"`
|
||||
SendNodeTotalVal bool `json:"send_node_total_values,omitempty"`
|
||||
Unit string `json:"unit"` // Unit of metric if any
|
||||
}
|
||||
|
||||
type LikwidCollectorEventsetConfig struct {
|
||||
@@ -59,7 +62,7 @@ type LikwidEventsetConfig struct {
|
||||
eorder []*C.char
|
||||
estr *C.char
|
||||
go_estr string
|
||||
results map[int]map[string]interface{}
|
||||
results map[int]map[string]float64
|
||||
metrics map[int]map[string]float64
|
||||
}
|
||||
|
||||
@@ -79,10 +82,11 @@ type LikwidCollector struct {
|
||||
cpulist []C.int
|
||||
cpu2tid map[int]int
|
||||
sock2tid map[int]int
|
||||
tid2core map[int]int
|
||||
tid2socket map[int]int
|
||||
metrics map[C.int]map[string]int
|
||||
groups []C.int
|
||||
config LikwidCollectorConfig
|
||||
gmresults map[int]map[string]float64
|
||||
basefreq float64
|
||||
running bool
|
||||
initialized bool
|
||||
@@ -134,10 +138,10 @@ func genLikwidEventSet(input LikwidCollectorEventsetConfig) LikwidEventsetConfig
|
||||
elist = append(elist, c_counter)
|
||||
}
|
||||
estr := strings.Join(tmplist, ",")
|
||||
res := make(map[int]map[string]interface{})
|
||||
res := make(map[int]map[string]float64)
|
||||
met := make(map[int]map[string]float64)
|
||||
for _, i := range topo.CpuList() {
|
||||
res[i] = make(map[string]interface{})
|
||||
res[i] = make(map[string]float64)
|
||||
for k := range input.Events {
|
||||
res[i][k] = 0.0
|
||||
}
|
||||
@@ -157,7 +161,7 @@ func genLikwidEventSet(input LikwidCollectorEventsetConfig) LikwidEventsetConfig
|
||||
}
|
||||
|
||||
func testLikwidMetricFormula(formula string, params []string) bool {
|
||||
myparams := make(map[string]interface{})
|
||||
myparams := make(map[string]float64)
|
||||
for _, p := range params {
|
||||
myparams[p] = float64(1.0)
|
||||
}
|
||||
@@ -236,13 +240,6 @@ func (m *LikwidCollector) Init(config json.RawMessage) error {
|
||||
|
||||
m.likwidGroups = make(map[C.int]LikwidEventsetConfig)
|
||||
|
||||
// m.results = make(map[int]map[int]map[string]interface{})
|
||||
// m.mresults = make(map[int]map[int]map[string]float64)
|
||||
m.gmresults = make(map[int]map[string]float64)
|
||||
for _, tid := range m.cpu2tid {
|
||||
m.gmresults[tid] = make(map[string]float64)
|
||||
}
|
||||
|
||||
// This is for the global metrics computation test
|
||||
totalMetrics := 0
|
||||
// Generate parameter list for the metric computing test
|
||||
@@ -306,6 +303,7 @@ func (m *LikwidCollector) Init(config json.RawMessage) error {
|
||||
cclog.ComponentError(m.name, err.Error())
|
||||
return err
|
||||
}
|
||||
m.measureThread = thread.New()
|
||||
switch m.config.AccessMode {
|
||||
case "direct":
|
||||
C.HPMmode(0)
|
||||
@@ -315,8 +313,20 @@ func (m *LikwidCollector) Init(config json.RawMessage) error {
|
||||
os.Setenv("PATH", m.config.DaemonPath+":"+p)
|
||||
}
|
||||
C.HPMmode(1)
|
||||
retCode := C.HPMinit()
|
||||
if retCode != 0 {
|
||||
err := fmt.Errorf("C.HPMinit() failed with return code %v", retCode)
|
||||
cclog.ComponentError(m.name, err.Error())
|
||||
}
|
||||
for _, c := range m.cpulist {
|
||||
C.HPMaddThread(c)
|
||||
m.measureThread.Call(
|
||||
func() {
|
||||
retCode := C.HPMaddThread(c)
|
||||
if retCode != 0 {
|
||||
err := fmt.Errorf("C.HPMaddThread(%v) failed with return code %v", c, retCode)
|
||||
cclog.ComponentError(m.name, err.Error())
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
m.sock2tid = make(map[int]int)
|
||||
@@ -330,8 +340,22 @@ func (m *LikwidCollector) Init(config json.RawMessage) error {
|
||||
C.free(unsafe.Pointer(cstr))
|
||||
}
|
||||
|
||||
cpuData := topo.CpuData()
|
||||
m.tid2core = make(map[int]int, len(cpuData))
|
||||
m.tid2socket = make(map[int]int, len(cpuData))
|
||||
for i := range cpuData {
|
||||
c := &cpuData[i]
|
||||
// Hardware thread ID to core ID mapping
|
||||
if len(c.CoreCPUsList) > 0 {
|
||||
m.tid2core[c.CpuID] = c.CoreCPUsList[0]
|
||||
} else {
|
||||
m.tid2core[c.CpuID] = c.CpuID
|
||||
}
|
||||
// Hardware thread ID to socket ID mapping
|
||||
m.tid2socket[c.CpuID] = c.Socket
|
||||
}
|
||||
|
||||
m.basefreq = getBaseFreq()
|
||||
m.measureThread = thread.New()
|
||||
m.init = true
|
||||
return nil
|
||||
}
|
||||
@@ -341,9 +365,12 @@ func (m *LikwidCollector) takeMeasurement(evidx int, evset LikwidEventsetConfig,
|
||||
var ret C.int
|
||||
var gid C.int = -1
|
||||
sigchan := make(chan os.Signal, 1)
|
||||
|
||||
// Watch changes for the lock file ()
|
||||
watcher, err := fsnotify.NewWatcher()
|
||||
if err != nil {
|
||||
cclog.ComponentError(m.name, err.Error())
|
||||
return true, err
|
||||
}
|
||||
defer watcher.Close()
|
||||
if len(m.config.LockfilePath) > 0 {
|
||||
@@ -351,26 +378,28 @@ func (m *LikwidCollector) takeMeasurement(evidx int, evset LikwidEventsetConfig,
|
||||
if err != nil {
|
||||
return true, err
|
||||
}
|
||||
stat := info.Sys().(*syscall.Stat_t)
|
||||
if stat.Uid != uint32(os.Getuid()) {
|
||||
usr, err := user.LookupId(strconv.FormatUint(uint64(stat.Uid), 10))
|
||||
uid := info.Sys().(*syscall.Stat_t).Uid
|
||||
if uid != uint32(os.Getuid()) {
|
||||
usr, err := user.LookupId(fmt.Sprint(uid))
|
||||
if err == nil {
|
||||
return true, fmt.Errorf("Access to performance counters locked by %s", usr.Username)
|
||||
} else {
|
||||
return true, fmt.Errorf("Access to performance counters locked by %d", stat.Uid)
|
||||
return true, fmt.Errorf("Access to performance counters locked by %d", uid)
|
||||
}
|
||||
}
|
||||
err = watcher.Watch(m.config.LockfilePath)
|
||||
err = watcher.Add(m.config.LockfilePath)
|
||||
if err != nil {
|
||||
cclog.ComponentError(m.name, err.Error())
|
||||
}
|
||||
}
|
||||
m.lock.Lock()
|
||||
defer m.lock.Unlock()
|
||||
|
||||
// Initialize the performance monitoring feature by creating basic data structures
|
||||
select {
|
||||
case e := <-watcher.Event:
|
||||
case e := <-watcher.Events:
|
||||
ret = -1
|
||||
if !e.IsAttrib() {
|
||||
if e.Op != fsnotify.Chmod {
|
||||
ret = C.perfmon_init(C.int(len(m.cpulist)), &m.cpulist[0])
|
||||
}
|
||||
default:
|
||||
@@ -381,12 +410,14 @@ func (m *LikwidCollector) takeMeasurement(evidx int, evset LikwidEventsetConfig,
|
||||
}
|
||||
signal.Notify(sigchan, os.Interrupt)
|
||||
signal.Notify(sigchan, syscall.SIGCHLD)
|
||||
|
||||
// Add an event string to LIKWID
|
||||
select {
|
||||
case <-sigchan:
|
||||
gid = -1
|
||||
case e := <-watcher.Event:
|
||||
case e := <-watcher.Events:
|
||||
gid = -1
|
||||
if !e.IsAttrib() {
|
||||
if e.Op != fsnotify.Chmod {
|
||||
gid = C.perfmon_addEventSet(evset.estr)
|
||||
}
|
||||
default:
|
||||
@@ -396,13 +427,14 @@ func (m *LikwidCollector) takeMeasurement(evidx int, evset LikwidEventsetConfig,
|
||||
return true, fmt.Errorf("failed to add events %s, error %d", evset.go_estr, gid)
|
||||
} else {
|
||||
evset.gid = gid
|
||||
//m.likwidGroups[gid] = evset
|
||||
}
|
||||
|
||||
// Setup all performance monitoring counters of an eventSet
|
||||
select {
|
||||
case <-sigchan:
|
||||
ret = -1
|
||||
case e := <-watcher.Event:
|
||||
if !e.IsAttrib() {
|
||||
case e := <-watcher.Events:
|
||||
if e.Op != fsnotify.Chmod {
|
||||
ret = C.perfmon_setupCounters(gid)
|
||||
}
|
||||
default:
|
||||
@@ -411,11 +443,13 @@ func (m *LikwidCollector) takeMeasurement(evidx int, evset LikwidEventsetConfig,
|
||||
if ret != 0 {
|
||||
return true, fmt.Errorf("failed to setup events '%s', error %d", evset.go_estr, ret)
|
||||
}
|
||||
|
||||
// Start counters
|
||||
select {
|
||||
case <-sigchan:
|
||||
ret = -1
|
||||
case e := <-watcher.Event:
|
||||
if !e.IsAttrib() {
|
||||
case e := <-watcher.Events:
|
||||
if e.Op != fsnotify.Chmod {
|
||||
ret = C.perfmon_startCounters()
|
||||
}
|
||||
default:
|
||||
@@ -427,8 +461,8 @@ func (m *LikwidCollector) takeMeasurement(evidx int, evset LikwidEventsetConfig,
|
||||
select {
|
||||
case <-sigchan:
|
||||
ret = -1
|
||||
case e := <-watcher.Event:
|
||||
if !e.IsAttrib() {
|
||||
case e := <-watcher.Events:
|
||||
if e.Op != fsnotify.Chmod {
|
||||
ret = C.perfmon_readCounters()
|
||||
}
|
||||
default:
|
||||
@@ -437,12 +471,16 @@ func (m *LikwidCollector) takeMeasurement(evidx int, evset LikwidEventsetConfig,
|
||||
if ret != 0 {
|
||||
return true, fmt.Errorf("failed to read events '%s', error %d", evset.go_estr, ret)
|
||||
}
|
||||
|
||||
// Wait
|
||||
time.Sleep(interval)
|
||||
|
||||
// Read counters
|
||||
select {
|
||||
case <-sigchan:
|
||||
ret = -1
|
||||
case e := <-watcher.Event:
|
||||
if !e.IsAttrib() {
|
||||
case e := <-watcher.Events:
|
||||
if e.Op != fsnotify.Chmod {
|
||||
ret = C.perfmon_readCounters()
|
||||
}
|
||||
default:
|
||||
@@ -451,6 +489,8 @@ func (m *LikwidCollector) takeMeasurement(evidx int, evset LikwidEventsetConfig,
|
||||
if ret != 0 {
|
||||
return true, fmt.Errorf("failed to read events '%s', error %d", evset.go_estr, ret)
|
||||
}
|
||||
|
||||
// Store counters
|
||||
for eidx, counter := range evset.eorder {
|
||||
gctr := C.GoString(counter)
|
||||
for _, tid := range m.cpu2tid {
|
||||
@@ -462,14 +502,18 @@ func (m *LikwidCollector) takeMeasurement(evidx int, evset LikwidEventsetConfig,
|
||||
evset.results[tid][gctr] = fres
|
||||
}
|
||||
}
|
||||
|
||||
// Store time in seconds the event group was measured the last time
|
||||
for _, tid := range m.cpu2tid {
|
||||
evset.results[tid]["time"] = float64(C.perfmon_getLastTimeOfGroup(gid))
|
||||
}
|
||||
|
||||
// Stop counters
|
||||
select {
|
||||
case <-sigchan:
|
||||
ret = -1
|
||||
case e := <-watcher.Event:
|
||||
if !e.IsAttrib() {
|
||||
case e := <-watcher.Events:
|
||||
if e.Op != fsnotify.Chmod {
|
||||
ret = C.perfmon_stopCounters()
|
||||
}
|
||||
default:
|
||||
@@ -478,10 +522,12 @@ func (m *LikwidCollector) takeMeasurement(evidx int, evset LikwidEventsetConfig,
|
||||
if ret != 0 {
|
||||
return true, fmt.Errorf("failed to stop events '%s', error %d", evset.go_estr, ret)
|
||||
}
|
||||
|
||||
// Deallocates all internal data that is used during performance monitoring
|
||||
signal.Stop(sigchan)
|
||||
select {
|
||||
case e := <-watcher.Event:
|
||||
if !e.IsAttrib() {
|
||||
case e := <-watcher.Events:
|
||||
if e.Op != fsnotify.Chmod {
|
||||
C.perfmon_finalize()
|
||||
}
|
||||
default:
|
||||
@@ -506,6 +552,9 @@ func (m *LikwidCollector) calcEventsetMetrics(evset LikwidEventsetConfig, interv
|
||||
if metric.Type == "socket" {
|
||||
scopemap = m.sock2tid
|
||||
}
|
||||
// Send all metrics with same time stamp
|
||||
// This function only does the computation; the counter measurement is done beforehand
|
||||
now := time.Now()
|
||||
for domain, tid := range scopemap {
|
||||
if tid >= 0 && len(metric.Calc) > 0 {
|
||||
value, err := agg.EvalFloat64Condition(metric.Calc, evset.results[tid])
|
||||
@@ -518,23 +567,137 @@ func (m *LikwidCollector) calcEventsetMetrics(evset LikwidEventsetConfig, interv
|
||||
}
|
||||
evset.metrics[tid][metric.Name] = value
|
||||
// Now we have the result, send it with the proper tags
|
||||
if !math.IsNaN(value) {
|
||||
if metric.Publish {
|
||||
fields := map[string]interface{}{"value": value}
|
||||
y, err := lp.New(metric.Name, map[string]string{"type": metric.Type}, m.meta, fields, time.Now())
|
||||
if err == nil {
|
||||
if metric.Type != "node" {
|
||||
y.AddTag("type-id", fmt.Sprintf("%d", domain))
|
||||
}
|
||||
if len(metric.Unit) > 0 {
|
||||
y.AddMeta("unit", metric.Unit)
|
||||
}
|
||||
output <- y
|
||||
if !math.IsNaN(value) && metric.Publish {
|
||||
fields := map[string]interface{}{"value": value}
|
||||
y, err :=
|
||||
lp.New(
|
||||
metric.Name,
|
||||
map[string]string{
|
||||
"type": metric.Type,
|
||||
},
|
||||
m.meta,
|
||||
fields,
|
||||
now,
|
||||
)
|
||||
if err == nil {
|
||||
if metric.Type != "node" {
|
||||
y.AddTag("type-id", fmt.Sprintf("%d", domain))
|
||||
}
|
||||
if len(metric.Unit) > 0 {
|
||||
y.AddMeta("unit", metric.Unit)
|
||||
}
|
||||
output <- y
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Send per core aggregated values
|
||||
if metric.SendCoreTotalVal {
|
||||
totalCoreValues := make(map[int]float64)
|
||||
for _, tid := range scopemap {
|
||||
if tid >= 0 && len(metric.Calc) > 0 {
|
||||
coreID := m.tid2core[tid]
|
||||
value := evset.metrics[tid][metric.Name]
|
||||
if !math.IsNaN(value) && metric.Publish {
|
||||
totalCoreValues[coreID] += value
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
for coreID, value := range totalCoreValues {
|
||||
y, err :=
|
||||
lp.New(
|
||||
metric.Name,
|
||||
map[string]string{
|
||||
"type": "core",
|
||||
"type-id": fmt.Sprintf("%d", coreID),
|
||||
},
|
||||
m.meta,
|
||||
map[string]interface{}{
|
||||
"value": value,
|
||||
},
|
||||
now,
|
||||
)
|
||||
if err != nil {
|
||||
continue
|
||||
}
|
||||
if len(metric.Unit) > 0 {
|
||||
y.AddMeta("unit", metric.Unit)
|
||||
}
|
||||
output <- y
|
||||
}
|
||||
}
|
||||
|
||||
// Send per socket aggregated values
|
||||
if metric.SendSocketTotalVal {
|
||||
totalSocketValues := make(map[int]float64)
|
||||
for _, tid := range scopemap {
|
||||
if tid >= 0 && len(metric.Calc) > 0 {
|
||||
socketID := m.tid2socket[tid]
|
||||
value := evset.metrics[tid][metric.Name]
|
||||
if !math.IsNaN(value) && metric.Publish {
|
||||
totalSocketValues[socketID] += value
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
for socketID, value := range totalSocketValues {
|
||||
y, err :=
|
||||
lp.New(
|
||||
metric.Name,
|
||||
map[string]string{
|
||||
"type": "socket",
|
||||
"type-id": fmt.Sprintf("%d", socketID),
|
||||
},
|
||||
m.meta,
|
||||
map[string]interface{}{
|
||||
"value": value,
|
||||
},
|
||||
now,
|
||||
)
|
||||
if err != nil {
|
||||
continue
|
||||
}
|
||||
if len(metric.Unit) > 0 {
|
||||
y.AddMeta("unit", metric.Unit)
|
||||
}
|
||||
output <- y
|
||||
}
|
||||
}
|
||||
|
||||
// Send per node aggregated value
|
||||
if metric.SendNodeTotalVal {
|
||||
var totalNodeValue float64 = 0.0
|
||||
for _, tid := range scopemap {
|
||||
if tid >= 0 && len(metric.Calc) > 0 {
|
||||
value := evset.metrics[tid][metric.Name]
|
||||
if !math.IsNaN(value) && metric.Publish {
|
||||
totalNodeValue += value
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
y, err :=
|
||||
lp.New(
|
||||
metric.Name,
|
||||
map[string]string{
|
||||
"type": "node",
|
||||
},
|
||||
m.meta,
|
||||
map[string]interface{}{
|
||||
"value": totalNodeValue,
|
||||
},
|
||||
now,
|
||||
)
|
||||
if err != nil {
|
||||
continue
|
||||
}
|
||||
if len(metric.Unit) > 0 {
|
||||
y.AddMeta("unit", metric.Unit)
|
||||
}
|
||||
output <- y
|
||||
}
|
||||
}
|
||||
|
||||
return nil
|
||||
@@ -542,7 +705,13 @@ func (m *LikwidCollector) calcEventsetMetrics(evset LikwidEventsetConfig, interv
|
||||
|
||||
// Go over the global metrics, derive the value out of the event sets' metric values and send it
|
||||
func (m *LikwidCollector) calcGlobalMetrics(groups []LikwidEventsetConfig, interval time.Duration, output chan lp.CCMetric) error {
|
||||
// Send all metrics with same time stamp
|
||||
// This function only does the computation; the counter measurement is done beforehand
|
||||
now := time.Now()
|
||||
|
||||
for _, metric := range m.config.Metrics {
|
||||
// The metric scope is determined in the Init() function
|
||||
// Get the map scope-id -> tids
|
||||
scopemap := m.cpu2tid
|
||||
if metric.Type == "socket" {
|
||||
scopemap = m.sock2tid
|
||||
@@ -550,7 +719,7 @@ func (m *LikwidCollector) calcGlobalMetrics(groups []LikwidEventsetConfig, inter
|
||||
for domain, tid := range scopemap {
|
||||
if tid >= 0 {
|
||||
// Here we generate parameter list
|
||||
params := make(map[string]interface{})
|
||||
params := make(map[string]float64)
|
||||
for _, evset := range groups {
|
||||
for mname, mres := range evset.metrics[tid] {
|
||||
params[mname] = mres
|
||||
@@ -565,13 +734,21 @@ func (m *LikwidCollector) calcGlobalMetrics(groups []LikwidEventsetConfig, inter
|
||||
if m.config.InvalidToZero && (math.IsNaN(value) || math.IsInf(value, 0)) {
|
||||
value = 0.0
|
||||
}
|
||||
//m.gmresults[tid][metric.Name] = value
|
||||
// Now we have the result, send it with the proper tags
|
||||
if !math.IsNaN(value) {
|
||||
if metric.Publish {
|
||||
tags := map[string]string{"type": metric.Type}
|
||||
fields := map[string]interface{}{"value": value}
|
||||
y, err := lp.New(metric.Name, tags, m.meta, fields, time.Now())
|
||||
y, err :=
|
||||
lp.New(
|
||||
metric.Name,
|
||||
map[string]string{
|
||||
"type": metric.Type,
|
||||
},
|
||||
m.meta,
|
||||
map[string]interface{}{
|
||||
"value": value,
|
||||
},
|
||||
now,
|
||||
)
|
||||
if err == nil {
|
||||
if metric.Type != "node" {
|
||||
y.AddTag("type-id", fmt.Sprintf("%d", domain))
|
||||
@@ -589,7 +766,6 @@ func (m *LikwidCollector) calcGlobalMetrics(groups []LikwidEventsetConfig, inter
|
||||
return nil
|
||||
}
|
||||
|
||||
|
||||
func (m *LikwidCollector) ReadThread(interval time.Duration, output chan lp.CCMetric) {
|
||||
var err error = nil
|
||||
groups := make([]LikwidEventsetConfig, 0)
|
||||
@@ -619,8 +795,6 @@ func (m *LikwidCollector) ReadThread(interval time.Duration, output chan lp.CCMe
|
||||
|
||||
// main read function taking multiple measurement rounds, each 'interval' seconds long
|
||||
func (m *LikwidCollector) Read(interval time.Duration, output chan lp.CCMetric) {
|
||||
//var skip bool = false
|
||||
//var err error
|
||||
if !m.init {
|
||||
return
|
||||
}
|
||||
|
@@ -41,10 +41,12 @@ The `likwid` collector is probably the most complicated collector. The LIKWID li
|
||||
```
|
||||
|
||||
The `likwid` configuration consists of two parts, the `eventsets` and `globalmetrics`:
|
||||
|
||||
- An event set list itself has two parts, the `events` and a set of derivable `metrics`. Each of the `events` is a `counter:event` pair in LIKWID's syntax. The `metrics` are a list of formulas to derive the metric value from the measurements of the `events`' values. Each metric has a name, the formula, a type and a publish flag. There is an optional `unit` field. Counter names can be used like variables in the formulas, so `PMC0+PMC1` sums the measurements for both events configured in the counters `PMC0` and `PMC1`. You can optionally use `time` for the measurement time and `inverseClock` for `1.0/baseCpuFrequency`. The type tells the LikwidCollector whether it is a metric for each hardware thread (`cpu`) or each CPU socket (`socket`). You may specify a unit for the metric with `unit`. The last one is the publishing flag. It tells the LikwidCollector whether a metric should be sent to the router or is only used internally to compute a global metric.
|
||||
- The `globalmetrics` are metrics which require data from multiple event set measurements to be derived. The inputs are the metrics in the event sets. Similar to the metrics in the event sets, the global metrics are defined by a name, a formula, a type and a publish flag. See event set metrics for details. The only difference is that there is no access to the raw event measurements anymore but only to the metrics. Also `time` and `inverseClock` cannot be used anymore. So, the idea is to derive a metric in the `eventsets` section and reuse it in the `globalmetrics` part. If you need a metric only for deriving the global metrics, disable forwarding of the event set metrics (`"publish": false`). **Be aware** that the combination might be misleading because the "behavior" of a metric changes over time and the multiple measurements might count different computing phases. Similar to the metrics in the eventset, you can specify a metric unit with the `unit` field.
|
||||
|
||||
Additional options:
|
||||
|
||||
- `force_overwrite`: Same as setting `LIKWID_FORCE=1`. In case counters are already in-use, LIKWID overwrites their configuration to do its measurements
|
||||
- `invalid_to_zero`: In some cases, the calculations result in `NaN` or `Inf`. With this option, all `NaN` and `Inf` values are replaced with `0.0`. See below in the [separate section](./likwidMetric.md#invalid_to_zero-option)
|
||||
- `access_mode`: Specify LIKWID access mode: `direct` for direct register access as root user or `accessdaemon`. The access mode `perf_event` is currently untested.
|
||||
@@ -62,6 +64,7 @@ Hardware performance counters are scattered all over the system nowadays. A coun
|
||||
**Note:** You cannot specify `socket` type for a metric that is measured at `hwthread` type, so some kind of expert knowledge or lookup work in the [Likwid Wiki](https://github.com/RRZE-HPC/likwid/wiki) is required. Get the type of each counter from the *Architecture* pages and as soon as one counter in a metric is socket-specific, the whole metric is socket-specific.
|
||||
|
||||
As a guideline:
|
||||
|
||||
- All counters `FIXCx`, `PMCy` and `TMAz` have the type `hwthread`
|
||||
- All counters names containing `BOX` have the type `socket`
|
||||
- All `PWRx` counters have type `socket`, except `"PWR1" : "RAPL_CORE_ENERGY"` has `hwthread` type
|
||||
@@ -70,6 +73,7 @@ As a guideline:
|
||||
### Help with the configuration
|
||||
|
||||
The configuration for the `likwid` collector is quite complicated. Most users don't use LIKWID with the event:counter notation but rely on the performance groups defined by the LIKWID team for each architecture. In order to help with the `likwid` collector configuration, we included a script `scripts/likwid_perfgroup_to_cc_config.py` that creates the configuration of an `eventset` from a performance group (using a LIKWID installation in `$PATH`):
|
||||
|
||||
```
|
||||
$ likwid-perfctr -i
|
||||
[...]
|
||||
@@ -111,20 +115,28 @@ You can copy this JSON and add it to the `eventsets` list. If you specify multip
|
||||
LIKWID checks the file `/var/run/likwid.lock` before performing any interfering operations. Who is allowed to access the counters is determined by the owner of the file. If it does not exist, it is created for the current user. So, if you want to temporarily allow counter access to a user (e.g. in a job):
|
||||
|
||||
Before (SLURM prolog, ...)
|
||||
```
|
||||
$ chown $JOBUSER /var/run/likwid.lock
|
||||
|
||||
```bash
|
||||
chown $JOBUSER /var/run/likwid.lock
|
||||
```
|
||||
|
||||
After (SLURM epilog, ...)
|
||||
```
|
||||
$ chown $CCUSER /var/run/likwid.lock
|
||||
|
||||
```bash
|
||||
chown $CCUSER /var/run/likwid.lock
|
||||
```
|
||||
|
||||
### `invalid_to_zero` option
|
||||
|
||||
In some cases LIKWID returns `0.0` for some events that are further used in processing and maybe used as divisor in a calculation. After evaluation of a metric, the result might be `NaN` or `+-Inf`. These resulting metrics are commonly not created and forwarded to the router because the [InfluxDB line protocol](https://docs.influxdata.com/influxdb/cloud/reference/syntax/line-protocol/#float) does not support these special floating-point values. If you want to have them sent, this option forces these metric values to be `0.0` instead.
|
||||
|
||||
One might think this does not happen often, but frequently used metrics in the world of performance engineering like Instructions-per-Cycle (IPC) or, more frequently, the actual CPU clock are derived with events like `CPU_CLK_UNHALTED_CORE` (Intel) which do not increment in halted state (as the name implies). There are different power management systems in a chip which can cause a hardware thread to go into such a state. Moreover, if no cycles are executed by the core, many other events are not incremented either (like `INSTR_RETIRED_ANY` for retired instructions and part of IPC).
|
||||
|
||||
### `send_*_total values` option
|
||||
|
||||
- `send_core_total_values`: Metrics, which are usually collected on a per hardware thread basis, are additionally summed up per CPU core.
|
||||
- `send_socket_total_values`: Metrics, which are usually collected on a per hardware thread basis, are additionally summed up per CPU socket.
|
||||
- `send_node_total_values`: Metrics, which are usually collected on a per hardware thread basis, are additionally summed up per node.
|
||||
|
||||
### Example configuration
|
||||
|
||||
@@ -229,6 +241,7 @@ One might think this does not happen often but often used metrics in the world o
|
||||
The `likwid` collector reads hardware performance counters at a **hwthread** and **socket** level. The configuration looks quite complicated but it is basically copy&paste from [LIKWID's performance groups](https://github.com/RRZE-HPC/likwid/tree/master/groups). The collector made multiple iterations and tried to use the performance groups but it lacked flexibility. The current way of configuration provides most flexibility.
|
||||
|
||||
The logic is as follows: There are multiple eventsets, each consisting of a list of counters+events and a list of metrics. If you compare a common performance group with the example setting above, there is not much difference:
|
||||
|
||||
```
|
||||
EVENTSET -> "events": {
|
||||
FIXC1 ACTUAL_CPU_CLOCK -> "FIXC1": "ACTUAL_CPU_CLOCK",
|
||||
|
@@ -101,7 +101,7 @@ func getMetricData(lines []string, prefix string, offset int) (int64, error) {
|
||||
// llitedir := filepath.Join(LUSTRE_SYSFS, "llite")
|
||||
// devdir := filepath.Join(llitedir, device)
|
||||
// statsfile := filepath.Join(devdir, "stats")
|
||||
// buffer, err := ioutil.ReadFile(statsfile)
|
||||
// buffer, err := os.ReadFile(statsfile)
|
||||
// if err != nil {
|
||||
// return make([]string, 0)
|
||||
// }
|
||||
|
@@ -102,7 +102,7 @@ func (m *NetstatCollector) Init(config json.RawMessage) error {
|
||||
|
||||
// Check if device is a included device
|
||||
if _, ok := stringArrayContains(m.config.IncludeDevices, dev); ok {
|
||||
tags := map[string]string{"device": dev, "type": "node"}
|
||||
tags := map[string]string{"stype": "network", "stype-id": dev, "type": "node"}
|
||||
meta_unit_byte := map[string]string{"source": m.name, "group": "Network", "unit": "bytes"}
|
||||
meta_unit_byte_per_sec := map[string]string{"source": m.name, "group": "Network", "unit": "bytes/sec"}
|
||||
meta_unit_pkts := map[string]string{"source": m.name, "group": "Network", "unit": "packets"}
|
||||
|
@@ -23,5 +23,5 @@ Metrics:
|
||||
* `net_pkts_in_bw` (`unit=packets/sec` if `send_derived_values == true`)
|
||||
* `net_pkts_out_bw` (`unit=packets/sec` if `send_derived_values == true`)
|
||||
|
||||
The device name is added as tag `device`.
|
||||
The device name is added as tag `stype=network,stype-id=<device>`.
|
||||
|
||||
|
@@ -71,6 +71,14 @@ func (m *NvidiaCollector) Init(config json.RawMessage) error {
|
||||
|
||||
// Initialize NVIDIA Management Library (NVML)
|
||||
ret := nvml.Init()
|
||||
|
||||
// Error: NVML library not found
|
||||
// (nvml.ErrorString can not be used in this case)
|
||||
if ret == nvml.ERROR_LIBRARY_NOT_FOUND {
|
||||
err = fmt.Errorf("NVML library not found")
|
||||
cclog.ComponentError(m.name, err.Error())
|
||||
return err
|
||||
}
|
||||
if ret != nvml.SUCCESS {
|
||||
err = errors.New(nvml.ErrorString(ret))
|
||||
cclog.ComponentError(m.name, "Unable to initialize NVML", err.Error())
|
||||
|
Reference in New Issue
Block a user