Merge branch 'develop' of github.com:ClusterCockpit/cc-metric-collector into develop

This commit is contained in:
Thomas Roehl 2022-02-07 13:22:34 +01:00
commit c313055570
10 changed files with 195 additions and 131 deletions

View File

@ -103,9 +103,8 @@ func (m *SampleCollector) Read(interval time.Duration, output chan lp.CCMetric)
} }
// Each metric has exactly one field: value ! // Each metric has exactly one field: value !
value := map[string]interface{}{"value": int(x)} value := map[string]interface{}{"value": int64(x)}
y, err := lp.New("sample_metric", tags, m.meta, value, time.Now()) if y, err := lp.New("sample_metric", tags, m.meta, value, time.Now()); err == nil {
if err == nil {
output <- y output <- y
} }
} }

View File

@ -23,14 +23,14 @@ import (
type CPUFreqCpuInfoCollectorTopology struct { type CPUFreqCpuInfoCollectorTopology struct {
processor string // logical processor number (continuous, starting at 0) processor string // logical processor number (continuous, starting at 0)
coreID string // socket local core ID coreID string // socket local core ID
coreID_int int coreID_int int64
physicalPackageID string // socket / package ID physicalPackageID string // socket / package ID
physicalPackageID_int int physicalPackageID_int int64
numPhysicalPackages string // number of sockets / packages numPhysicalPackages string // number of sockets / packages
numPhysicalPackages_int int numPhysicalPackages_int int64
isHT bool isHT bool
numNonHT string // number of non hyperthreading processors numNonHT string // number of non hyperthreading processors
numNonHT_int int numNonHT_int int64
tagSet map[string]string tagSet map[string]string
} }
@ -40,26 +40,32 @@ type CPUFreqCpuInfoCollector struct {
} }
func (m *CPUFreqCpuInfoCollector) Init(config json.RawMessage) error { func (m *CPUFreqCpuInfoCollector) Init(config json.RawMessage) error {
// Check if already initialized
if m.init {
return nil
}
m.name = "CPUFreqCpuInfoCollector" m.name = "CPUFreqCpuInfoCollector"
m.meta = map[string]string{ m.meta = map[string]string{
"source": m.name, "source": m.name,
"group": "cpufreq", "group": "CPU",
"unit": "MHz",
} }
const cpuInfoFile = "/proc/cpuinfo" const cpuInfoFile = "/proc/cpuinfo"
file, err := os.Open(cpuInfoFile) file, err := os.Open(cpuInfoFile)
if err != nil { if err != nil {
return fmt.Errorf("Failed to open '%s': %v", cpuInfoFile, err) return fmt.Errorf("Failed to open file '%s': %v", cpuInfoFile, err)
} }
defer file.Close() defer file.Close()
// Collect topology information from file cpuinfo // Collect topology information from file cpuinfo
foundFreq := false foundFreq := false
processor := "" processor := ""
numNonHT_int := 0 var numNonHT_int int64 = 0
coreID := "" coreID := ""
physicalPackageID := "" physicalPackageID := ""
maxPhysicalPackageID := 0 var maxPhysicalPackageID int64 = 0
m.topology = make([]CPUFreqCpuInfoCollectorTopology, 0) m.topology = make([]CPUFreqCpuInfoCollectorTopology, 0)
coreSeenBefore := make(map[string]bool) coreSeenBefore := make(map[string]bool)
scanner := bufio.NewScanner(file) scanner := bufio.NewScanner(file)
@ -87,13 +93,13 @@ func (m *CPUFreqCpuInfoCollector) Init(config json.RawMessage) error {
len(coreID) > 0 && len(coreID) > 0 &&
len(physicalPackageID) > 0 { len(physicalPackageID) > 0 {
coreID_int, err := strconv.Atoi(coreID) coreID_int, err := strconv.ParseInt(coreID, 10, 64)
if err != nil { if err != nil {
return fmt.Errorf("Unable to convert coreID to int: %v", err) return fmt.Errorf("Unable to convert coreID '%s' to int64: %v", coreID, err)
} }
physicalPackageID_int, err := strconv.Atoi(physicalPackageID) physicalPackageID_int, err := strconv.ParseInt(physicalPackageID, 10, 64)
if err != nil { if err != nil {
return fmt.Errorf("Unable to convert physicalPackageID to int: %v", err) return fmt.Errorf("Unable to convert physicalPackageID '%s' to int64: %v", physicalPackageID, err)
} }
// increase maximun socket / package ID, when required // increase maximun socket / package ID, when required
@ -152,15 +158,17 @@ func (m *CPUFreqCpuInfoCollector) Init(config json.RawMessage) error {
} }
func (m *CPUFreqCpuInfoCollector) Read(interval time.Duration, output chan lp.CCMetric) { func (m *CPUFreqCpuInfoCollector) Read(interval time.Duration, output chan lp.CCMetric) {
// Check if already initialized
if !m.init { if !m.init {
return return
} }
const cpuInfoFile = "/proc/cpuinfo" const cpuInfoFile = "/proc/cpuinfo"
file, err := os.Open(cpuInfoFile) file, err := os.Open(cpuInfoFile)
if err != nil { if err != nil {
cclog.ComponentError( cclog.ComponentError(
m.name, m.name,
fmt.Sprintf("Read(): Failed to open '%s': %v", cpuInfoFile, err)) fmt.Sprintf("Read(): Failed to open file '%s': %v", cpuInfoFile, err))
return return
} }
defer file.Close() defer file.Close()
@ -181,11 +189,10 @@ func (m *CPUFreqCpuInfoCollector) Read(interval time.Duration, output chan lp.CC
if err != nil { if err != nil {
cclog.ComponentError( cclog.ComponentError(
m.name, m.name,
fmt.Sprintf("Read(): Failed to convert cpu MHz to float: %v", err)) fmt.Sprintf("Read(): Failed to convert cpu MHz '%s' to float64: %v", lineSplit[1], err))
return return
} }
y, err := lp.New("cpufreq", t.tagSet, m.meta, map[string]interface{}{"value": value}, now) if y, err := lp.New("cpufreq", t.tagSet, m.meta, map[string]interface{}{"value": value}, now); err == nil {
if err == nil {
output <- y output <- y
} }
} }

View File

@ -35,14 +35,14 @@ func readOneLine(filename string) (text string, ok bool) {
type CPUFreqCollectorTopology struct { type CPUFreqCollectorTopology struct {
processor string // logical processor number (continuous, starting at 0) processor string // logical processor number (continuous, starting at 0)
coreID string // socket local core ID coreID string // socket local core ID
coreID_int int coreID_int int64
physicalPackageID string // socket / package ID physicalPackageID string // socket / package ID
physicalPackageID_int int physicalPackageID_int int64
numPhysicalPackages string // number of sockets / packages numPhysicalPackages string // number of sockets / packages
numPhysicalPackages_int int numPhysicalPackages_int int64
isHT bool isHT bool
numNonHT string // number of non hyperthreading processors numNonHT string // number of non hyperthreading processors
numNonHT_int int numNonHT_int int64
scalingCurFreqFile string scalingCurFreqFile string
tagSet map[string]string tagSet map[string]string
} }
@ -64,6 +64,11 @@ type CPUFreqCollector struct {
} }
func (m *CPUFreqCollector) Init(config json.RawMessage) error { func (m *CPUFreqCollector) Init(config json.RawMessage) error {
// Check if already initialized
if m.init {
return nil
}
m.name = "CPUFreqCollector" m.name = "CPUFreqCollector"
m.setup() m.setup()
if len(config) > 0 { if len(config) > 0 {
@ -74,7 +79,8 @@ func (m *CPUFreqCollector) Init(config json.RawMessage) error {
} }
m.meta = map[string]string{ m.meta = map[string]string{
"source": m.name, "source": m.name,
"group": "CPU Frequency", "group": "CPU",
"unit": "MHz",
} }
// Loop for all CPU directories // Loop for all CPU directories
@ -82,48 +88,48 @@ func (m *CPUFreqCollector) Init(config json.RawMessage) error {
globPattern := filepath.Join(baseDir, "cpu[0-9]*") globPattern := filepath.Join(baseDir, "cpu[0-9]*")
cpuDirs, err := filepath.Glob(globPattern) cpuDirs, err := filepath.Glob(globPattern)
if err != nil { if err != nil {
return fmt.Errorf("CPUFreqCollector.Init() unable to glob files with pattern %s: %v", globPattern, err) return fmt.Errorf("Unable to glob files with pattern '%s': %v", globPattern, err)
} }
if cpuDirs == nil { if cpuDirs == nil {
return fmt.Errorf("CPUFreqCollector.Init() unable to find any files with pattern %s", globPattern) return fmt.Errorf("Unable to find any files with pattern '%s'", globPattern)
} }
// Initialize CPU topology // Initialize CPU topology
m.topology = make([]CPUFreqCollectorTopology, len(cpuDirs)) m.topology = make([]CPUFreqCollectorTopology, len(cpuDirs))
for _, cpuDir := range cpuDirs { for _, cpuDir := range cpuDirs {
processor := strings.TrimPrefix(cpuDir, "/sys/devices/system/cpu/cpu") processor := strings.TrimPrefix(cpuDir, "/sys/devices/system/cpu/cpu")
processor_int, err := strconv.Atoi(processor) processor_int, err := strconv.ParseInt(processor, 10, 64)
if err != nil { if err != nil {
return fmt.Errorf("CPUFreqCollector.Init() unable to convert cpuID to int: %v", err) return fmt.Errorf("Unable to convert cpuID '%s' to int64: %v", processor, err)
} }
// Read package ID // Read package ID
physicalPackageIDFile := filepath.Join(cpuDir, "topology", "physical_package_id") physicalPackageIDFile := filepath.Join(cpuDir, "topology", "physical_package_id")
physicalPackageID, ok := readOneLine(physicalPackageIDFile) physicalPackageID, ok := readOneLine(physicalPackageIDFile)
if !ok { if !ok {
return fmt.Errorf("CPUFreqCollector.Init() unable to read physical package ID from %s", physicalPackageIDFile) return fmt.Errorf("Unable to read physical package ID from file '%s'", physicalPackageIDFile)
} }
physicalPackageID_int, err := strconv.Atoi(physicalPackageID) physicalPackageID_int, err := strconv.ParseInt(physicalPackageID, 10, 64)
if err != nil { if err != nil {
return fmt.Errorf("CPUFreqCollector.Init() unable to convert packageID to int: %v", err) return fmt.Errorf("Unable to convert packageID '%s' to int64: %v", physicalPackageID, err)
} }
// Read core ID // Read core ID
coreIDFile := filepath.Join(cpuDir, "topology", "core_id") coreIDFile := filepath.Join(cpuDir, "topology", "core_id")
coreID, ok := readOneLine(coreIDFile) coreID, ok := readOneLine(coreIDFile)
if !ok { if !ok {
return fmt.Errorf("CPUFreqCollector.Init() unable to read core ID from %s", coreIDFile) return fmt.Errorf("Unable to read core ID from file '%s'", coreIDFile)
} }
coreID_int, err := strconv.Atoi(coreID) coreID_int, err := strconv.ParseInt(coreID, 10, 64)
if err != nil { if err != nil {
return fmt.Errorf("CPUFreqCollector.Init() unable to convert coreID to int: %v", err) return fmt.Errorf("Unable to convert coreID '%s' to int64: %v", coreID, err)
} }
// Check access to current frequency file // Check access to current frequency file
scalingCurFreqFile := filepath.Join(cpuDir, "cpufreq", "scaling_cur_freq") scalingCurFreqFile := filepath.Join(cpuDir, "cpufreq", "scaling_cur_freq")
err = unix.Access(scalingCurFreqFile, unix.R_OK) err = unix.Access(scalingCurFreqFile, unix.R_OK)
if err != nil { if err != nil {
return fmt.Errorf("CPUFreqCollector.Init() unable to access %s: %v", scalingCurFreqFile, err) return fmt.Errorf("Unable to access file '%s': %v", scalingCurFreqFile, err)
} }
t := &m.topology[processor_int] t := &m.topology[processor_int]
@ -146,8 +152,8 @@ func (m *CPUFreqCollector) Init(config json.RawMessage) error {
} }
// number of non hyper thread cores and packages / sockets // number of non hyper thread cores and packages / sockets
numNonHT_int := 0 var numNonHT_int int64 = 0
maxPhysicalPackageID := 0 var maxPhysicalPackageID int64 = 0
for i := range m.topology { for i := range m.topology {
t := &m.topology[i] t := &m.topology[i]
@ -184,6 +190,7 @@ func (m *CPUFreqCollector) Init(config json.RawMessage) error {
} }
func (m *CPUFreqCollector) Read(interval time.Duration, output chan lp.CCMetric) { func (m *CPUFreqCollector) Read(interval time.Duration, output chan lp.CCMetric) {
// Check if already initialized
if !m.init { if !m.init {
return return
} }
@ -205,16 +212,15 @@ func (m *CPUFreqCollector) Read(interval time.Duration, output chan lp.CCMetric)
fmt.Sprintf("Read(): Failed to read one line from file '%s'", t.scalingCurFreqFile)) fmt.Sprintf("Read(): Failed to read one line from file '%s'", t.scalingCurFreqFile))
continue continue
} }
cpuFreq, err := strconv.Atoi(line) cpuFreq, err := strconv.ParseInt(line, 10, 64)
if err != nil { if err != nil {
cclog.ComponentError( cclog.ComponentError(
m.name, m.name,
fmt.Sprintf("Read(): Failed to convert CPU frequency '%s': %v", line, err)) fmt.Sprintf("Read(): Failed to convert CPU frequency '%s' to int64: %v", line, err))
continue continue
} }
y, err := lp.New("cpufreq", t.tagSet, m.meta, map[string]interface{}{"value": cpuFreq}, now) if y, err := lp.New("cpufreq", t.tagSet, m.meta, map[string]interface{}{"value": cpuFreq}, now); err == nil {
if err == nil {
output <- y output <- y
} }
} }

View File

@ -26,6 +26,11 @@ type GpfsCollector struct {
} }
func (m *GpfsCollector) Init(config json.RawMessage) error { func (m *GpfsCollector) Init(config json.RawMessage) error {
// Check if already initialized
if m.init {
return nil
}
var err error var err error
m.name = "GpfsCollector" m.name = "GpfsCollector"
m.setup() m.setup()
@ -53,16 +58,16 @@ func (m *GpfsCollector) Init(config json.RawMessage) error {
// GPFS / IBM Spectrum Scale file system statistics can only be queried by user root // GPFS / IBM Spectrum Scale file system statistics can only be queried by user root
user, err := user.Current() user, err := user.Current()
if err != nil { if err != nil {
return fmt.Errorf("GpfsCollector.Init(): Failed to get current user: %v", err) return fmt.Errorf("Failed to get current user: %v", err)
} }
if user.Uid != "0" { if user.Uid != "0" {
return fmt.Errorf("GpfsCollector.Init(): GPFS file system statistics can only be queried by user root") return fmt.Errorf("GPFS file system statistics can only be queried by user root")
} }
// Check if mmpmon is in executable search path // Check if mmpmon is in executable search path
_, err = exec.LookPath(m.config.Mmpmon) _, err = exec.LookPath(m.config.Mmpmon)
if err != nil { if err != nil {
return fmt.Errorf("GpfsCollector.Init(): Failed to find mmpmon binary '%s': %v", m.config.Mmpmon, err) return fmt.Errorf("Failed to find mmpmon binary '%s': %v", m.config.Mmpmon, err)
} }
m.init = true m.init = true
@ -70,6 +75,7 @@ func (m *GpfsCollector) Init(config json.RawMessage) error {
} }
func (m *GpfsCollector) Read(interval time.Duration, output chan lp.CCMetric) { func (m *GpfsCollector) Read(interval time.Duration, output chan lp.CCMetric) {
// Check if already initialized
if !m.init { if !m.init {
return return
} }
@ -135,7 +141,7 @@ func (m *GpfsCollector) Read(interval time.Duration, output chan lp.CCMetric) {
if rc != 0 { if rc != 0 {
cclog.ComponentError( cclog.ComponentError(
m.name, m.name,
fmt.Sprintf("Read(): Filesystem %s not ok.", filesystem)) fmt.Sprintf("Read(): Filesystem '%s' is not ok.", filesystem))
continue continue
} }
@ -143,14 +149,14 @@ func (m *GpfsCollector) Read(interval time.Duration, output chan lp.CCMetric) {
if err != nil { if err != nil {
cclog.ComponentError( cclog.ComponentError(
m.name, m.name,
fmt.Sprintf("Read(): Failed to convert seconds '%s' to int: %v", key_value["_t_"], err)) fmt.Sprintf("Read(): Failed to convert seconds '%s' to int64: %v", key_value["_t_"], err))
continue continue
} }
msec, err := strconv.ParseInt(key_value["_tu_"], 10, 64) msec, err := strconv.ParseInt(key_value["_tu_"], 10, 64)
if err != nil { if err != nil {
cclog.ComponentError( cclog.ComponentError(
m.name, m.name,
fmt.Sprintf("Read(): Failed to convert micro seconds '%s' to int: %v", key_value["_tu_"], err)) fmt.Sprintf("Read(): Failed to convert micro seconds '%s' to int64: %v", key_value["_tu_"], err))
continue continue
} }
timestamp := time.Unix(sec, msec*1000) timestamp := time.Unix(sec, msec*1000)
@ -160,12 +166,10 @@ func (m *GpfsCollector) Read(interval time.Duration, output chan lp.CCMetric) {
if err != nil { if err != nil {
cclog.ComponentError( cclog.ComponentError(
m.name, m.name,
fmt.Sprintf("Read(): Failed to convert bytes read '%s' to int: %v", key_value["_br_"], err)) fmt.Sprintf("Read(): Failed to convert bytes read '%s' to int64: %v", key_value["_br_"], err))
continue continue
} }
if y, err := lp.New("gpfs_bytes_read", m.tags, m.meta, map[string]interface{}{"value": bytesRead}, timestamp); err == nil {
y, err := lp.New("gpfs_bytes_read", m.tags, m.meta, map[string]interface{}{"value": bytesRead}, timestamp)
if err == nil {
output <- y output <- y
} }
@ -174,12 +178,10 @@ func (m *GpfsCollector) Read(interval time.Duration, output chan lp.CCMetric) {
if err != nil { if err != nil {
cclog.ComponentError( cclog.ComponentError(
m.name, m.name,
fmt.Sprintf("Read(): Failed to convert bytes written '%s' to int: %v", key_value["_bw_"], err)) fmt.Sprintf("Read(): Failed to convert bytes written '%s' to int64: %v", key_value["_bw_"], err))
continue continue
} }
if y, err := lp.New("gpfs_bytes_written", m.tags, m.meta, map[string]interface{}{"value": bytesWritten}, timestamp); err == nil {
y, err = lp.New("gpfs_bytes_written", m.tags, m.meta, map[string]interface{}{"value": bytesWritten}, timestamp)
if err == nil {
output <- y output <- y
} }
@ -188,11 +190,10 @@ func (m *GpfsCollector) Read(interval time.Duration, output chan lp.CCMetric) {
if err != nil { if err != nil {
cclog.ComponentError( cclog.ComponentError(
m.name, m.name,
fmt.Sprintf("Read(): Failed to convert number of opens '%s' to int: %v", key_value["_oc_"], err)) fmt.Sprintf("Read(): Failed to convert number of opens '%s' to int64: %v", key_value["_oc_"], err))
continue continue
} }
y, err = lp.New("gpfs_num_opens", m.tags, m.meta, map[string]interface{}{"value": numOpens}, timestamp) if y, err := lp.New("gpfs_num_opens", m.tags, m.meta, map[string]interface{}{"value": numOpens}, timestamp); err == nil {
if err == nil {
output <- y output <- y
} }
@ -201,11 +202,10 @@ func (m *GpfsCollector) Read(interval time.Duration, output chan lp.CCMetric) {
if err != nil { if err != nil {
cclog.ComponentError( cclog.ComponentError(
m.name, m.name,
fmt.Sprintf("Read(): Failed to convert number of closes: '%s' to int: %v", key_value["_cc_"], err)) fmt.Sprintf("Read(): Failed to convert number of closes: '%s' to int64: %v", key_value["_cc_"], err))
continue continue
} }
y, err = lp.New("gpfs_num_closes", m.tags, m.meta, map[string]interface{}{"value": numCloses}, timestamp) if y, err := lp.New("gpfs_num_closes", m.tags, m.meta, map[string]interface{}{"value": numCloses}, timestamp); err == nil {
if err == nil {
output <- y output <- y
} }
@ -214,11 +214,10 @@ func (m *GpfsCollector) Read(interval time.Duration, output chan lp.CCMetric) {
if err != nil { if err != nil {
cclog.ComponentError( cclog.ComponentError(
m.name, m.name,
fmt.Sprintf("Read(): Failed to convert number of reads: '%s' to int: %v", key_value["_rdc_"], err)) fmt.Sprintf("Read(): Failed to convert number of reads: '%s' to int64: %v", key_value["_rdc_"], err))
continue continue
} }
y, err = lp.New("gpfs_num_reads", m.tags, m.meta, map[string]interface{}{"value": numReads}, timestamp) if y, err := lp.New("gpfs_num_reads", m.tags, m.meta, map[string]interface{}{"value": numReads}, timestamp); err == nil {
if err == nil {
output <- y output <- y
} }
@ -227,11 +226,10 @@ func (m *GpfsCollector) Read(interval time.Duration, output chan lp.CCMetric) {
if err != nil { if err != nil {
cclog.ComponentError( cclog.ComponentError(
m.name, m.name,
fmt.Sprintf("Read(): Failed to convert number of writes: '%s' to int: %v", key_value["_wc_"], err)) fmt.Sprintf("Read(): Failed to convert number of writes: '%s' to int64: %v", key_value["_wc_"], err))
continue continue
} }
y, err = lp.New("gpfs_num_writes", m.tags, m.meta, map[string]interface{}{"value": numWrites}, timestamp) if y, err := lp.New("gpfs_num_writes", m.tags, m.meta, map[string]interface{}{"value": numWrites}, timestamp); err == nil {
if err == nil {
output <- y output <- y
} }
@ -240,11 +238,10 @@ func (m *GpfsCollector) Read(interval time.Duration, output chan lp.CCMetric) {
if err != nil { if err != nil {
cclog.ComponentError( cclog.ComponentError(
m.name, m.name,
fmt.Sprintf("Read(): Failed to convert number of read directories: '%s' to int: %v", key_value["_dir_"], err)) fmt.Sprintf("Read(): Failed to convert number of read directories: '%s' to int64: %v", key_value["_dir_"], err))
continue continue
} }
y, err = lp.New("gpfs_num_readdirs", m.tags, m.meta, map[string]interface{}{"value": numReaddirs}, timestamp) if y, err := lp.New("gpfs_num_readdirs", m.tags, m.meta, map[string]interface{}{"value": numReaddirs}, timestamp); err == nil {
if err == nil {
output <- y output <- y
} }
@ -256,8 +253,7 @@ func (m *GpfsCollector) Read(interval time.Duration, output chan lp.CCMetric) {
fmt.Sprintf("Read(): Failed to convert number of inode updates: '%s' to int: %v", key_value["_iu_"], err)) fmt.Sprintf("Read(): Failed to convert number of inode updates: '%s' to int: %v", key_value["_iu_"], err))
continue continue
} }
y, err = lp.New("gpfs_num_inode_updates", m.tags, m.meta, map[string]interface{}{"value": numInodeUpdates}, timestamp) if y, err := lp.New("gpfs_num_inode_updates", m.tags, m.meta, map[string]interface{}{"value": numInodeUpdates}, timestamp); err == nil {
if err == nil {
output <- y output <- y
} }
} }

30
collectors/gpfsMetric.md Normal file
View File

@ -0,0 +1,30 @@
## `gpfs` collector
```json
"ibstat": {
"mmpmon_path": "/path/to/mmpmon",
"exclude_filesystem": [
"fs1"
]
}
```
The `gpfs` collector uses the `mmpmon` command to read performance metrics for
GPFS / IBM Spectrum Scale filesystems.
The reported filesystems can be filtered with the `exclude_filesystem` option
in the configuration.
The path to the `mmpmon` command can be configured with the `mmpmon_path` option
in the configuration.
Metrics:
* `bytes_read`
* `gpfs_bytes_written`
* `gpfs_num_opens`
* `gpfs_num_closes`
* `gpfs_num_reads`
* `gpfs_num_readdirs`
* `gpfs_num_inode_updates`
The collector adds a `filesystem` tag to all metrics

View File

@ -4,6 +4,7 @@ import (
"fmt" "fmt"
"os" "os"
cclog "github.com/ClusterCockpit/cc-metric-collector/internal/ccLogger"
lp "github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric" lp "github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric"
"golang.org/x/sys/unix" "golang.org/x/sys/unix"
@ -20,7 +21,7 @@ type InfinibandCollectorInfo struct {
LID string // IB local Identifier (LID) LID string // IB local Identifier (LID)
device string // IB device device string // IB device
port string // IB device port port string // IB device port
portCounterFiles map[string]string // mapping counter name -> file portCounterFiles map[string]string // mapping counter name -> sysfs file
tagSet map[string]string // corresponding tag list tagSet map[string]string // corresponding tag list
} }
@ -32,26 +33,14 @@ type InfinibandCollector struct {
info []InfinibandCollectorInfo info []InfinibandCollectorInfo
} }
func (m *InfinibandCollector) Help() {
fmt.Println("This collector includes all devices that can be found below ", IB_BASEPATH)
fmt.Println("and where any of the ports provides a 'lid' file (glob ", IB_BASEPATH, "/<dev>/ports/<port>/lid).")
fmt.Println("The devices can be filtered with the 'exclude_devices' option in the configuration.")
fmt.Println("For each found LIDs the collector calls the 'perfquery' command")
fmt.Println("")
fmt.Println("Full configuration object:")
fmt.Println("\"ibstat\" : {")
fmt.Println(" \"exclude_devices\" : [\"dev1\"]")
fmt.Println("}")
fmt.Println("")
fmt.Println("Metrics:")
fmt.Println("- ib_recv")
fmt.Println("- ib_xmit")
fmt.Println("- ib_recv_pkts")
fmt.Println("- ib_xmit_pkts")
}
// Init initializes the Infiniband collector by walking through files below IB_BASEPATH // Init initializes the Infiniband collector by walking through files below IB_BASEPATH
func (m *InfinibandCollector) Init(config json.RawMessage) error { func (m *InfinibandCollector) Init(config json.RawMessage) error {
// Check if already initialized
if !m.init {
return nil
}
var err error var err error
m.name = "InfinibandCollector" m.name = "InfinibandCollector"
m.setup() m.setup()
@ -153,14 +142,25 @@ func (m *InfinibandCollector) Read(interval time.Duration, output chan lp.CCMetr
// device info // device info
info := &m.info[i] info := &m.info[i]
for counterName, counterFile := range info.portCounterFiles { for counterName, counterFile := range info.portCounterFiles {
if data, ok := readOneLine(counterFile); ok { data, ok := readOneLine(counterFile)
if v, err := strconv.ParseInt(data, 10, 64); err == nil { if !ok {
cclog.ComponentError(
m.name,
fmt.Sprintf("Read(): Failed to read one line from file '%s'", counterFile))
continue
}
v, err := strconv.ParseInt(data, 10, 64)
if err != nil {
cclog.ComponentError(
m.name,
fmt.Sprintf("Read(): Failed to convert Infininiband metrice %s='%s' to int64: %v", counterName, data, err))
continue
}
if y, err := lp.New(counterName, info.tagSet, m.meta, map[string]interface{}{"value": v}, now); err == nil { if y, err := lp.New(counterName, info.tagSet, m.meta, map[string]interface{}{"value": v}, now); err == nil {
output <- y output <- y
} }
} }
}
}
} }
} }

View File

@ -3,17 +3,24 @@
```json ```json
"ibstat": { "ibstat": {
"perfquery_path" : "<path to perfquery command>",
"exclude_devices": [ "exclude_devices": [
"mlx4" "mlx4"
] ]
} }
``` ```
The `ibstat` collector reads either data through the `perfquery` command or the sysfs files below `/sys/class/infiniband/<device>`. The `ibstat` collector includes all Infiniband devices that can be
found below `/sys/class/infiniband/` and where any of the ports provides a
LID file (`/sys/class/infiniband/<dev>/ports/<port>/lid`)
The devices can be filtered with the `exclude_devices` option in the configuration.
For each found LID the collector reads data through the sysfs files below `/sys/class/infiniband/<device>`.
Metrics: Metrics:
* `ib_recv` * `ib_recv`
* `ib_xmit` * `ib_xmit`
* `ib_recv_pkts`
* `ib_xmit_pkts`
The collector adds a `device` tag to all metrics The collector adds a `device` tag to all metrics

View File

@ -29,27 +29,6 @@ type InfinibandPerfQueryCollector struct {
} }
} }
func (m *InfinibandPerfQueryCollector) Help() {
fmt.Println("This collector includes all devices that can be found below ", IB_BASEPATH)
fmt.Println("and where any of the ports provides a 'lid' file (glob ", IB_BASEPATH, "/<dev>/ports/<port>/lid).")
fmt.Println("The devices can be filtered with the 'exclude_devices' option in the configuration.")
fmt.Println("For each found LIDs the collector calls the 'perfquery' command")
fmt.Println("The path to the 'perfquery' command can be configured with the 'perfquery_path' option")
fmt.Println("in the configuration")
fmt.Println("")
fmt.Println("Full configuration object:")
fmt.Println("\"ibstat\" : {")
fmt.Println(" \"perfquery_path\" : \"path/to/perfquery\" # if omitted, it searches in $PATH")
fmt.Println(" \"exclude_devices\" : [\"dev1\"]")
fmt.Println("}")
fmt.Println("")
fmt.Println("Metrics:")
fmt.Println("- ib_recv")
fmt.Println("- ib_xmit")
fmt.Println("- ib_recv_pkts")
fmt.Println("- ib_xmit_pkts")
}
func (m *InfinibandPerfQueryCollector) Init(config json.RawMessage) error { func (m *InfinibandPerfQueryCollector) Init(config json.RawMessage) error {
var err error var err error
m.name = "InfinibandCollectorPerfQuery" m.name = "InfinibandCollectorPerfQuery"

View File

@ -0,0 +1,28 @@
## `ibstat_perfquery` collector
```json
"ibstat_perfquery": {
"perfquery_path": "/path/to/perfquery",
"exclude_devices": [
"mlx4"
]
}
```
The `ibstat_perfquery` collector includes all Infiniband devices that can be
found below `/sys/class/infiniband/` and where any of the ports provides a
LID file (`/sys/class/infiniband/<dev>/ports/<port>/lid`)
The devices can be filtered with the `exclude_devices` option in the configuration.
For each found LID the collector calls the `perfquery` command. The path to the
`perfquery` command can be configured with the `perfquery_path` option in the configuration
Metrics:
* `ib_recv`
* `ib_xmit`
* `ib_recv_pkts`
* `ib_xmit_pkts`
The collector adds a `device` tag to all metrics

View File

@ -22,16 +22,16 @@ import (
// //
const LOADAVGFILE = "/proc/loadavg" const LOADAVGFILE = "/proc/loadavg"
type LoadavgCollectorConfig struct {
ExcludeMetrics []string `json:"exclude_metrics,omitempty"`
}
type LoadavgCollector struct { type LoadavgCollector struct {
metricCollector metricCollector
tags map[string]string tags map[string]string
load_matches []string load_matches []string
load_skips []bool
proc_matches []string proc_matches []string
config LoadavgCollectorConfig proc_skips []bool
config struct {
ExcludeMetrics []string `json:"exclude_metrics,omitempty"`
}
} }
func (m *LoadavgCollector) Init(config json.RawMessage) error { func (m *LoadavgCollector) Init(config json.RawMessage) error {
@ -51,15 +51,23 @@ func (m *LoadavgCollector) Init(config json.RawMessage) error {
"load_one", "load_one",
"load_five", "load_five",
"load_fifteen"} "load_fifteen"}
m.load_skips = make([]bool, len(m.load_matches))
m.proc_matches = []string{ m.proc_matches = []string{
"proc_run", "proc_run",
"proc_total"} "proc_total"}
m.proc_skips = make([]bool, len(m.proc_matches))
for i, name := range m.load_matches {
_, m.load_skips[i] = stringArrayContains(m.config.ExcludeMetrics, name)
}
for i, name := range m.proc_matches {
_, m.proc_skips[i] = stringArrayContains(m.config.ExcludeMetrics, name)
}
m.init = true m.init = true
return nil return nil
} }
func (m *LoadavgCollector) Read(interval time.Duration, output chan lp.CCMetric) { func (m *LoadavgCollector) Read(interval time.Duration, output chan lp.CCMetric) {
var skip bool
if !m.init { if !m.init {
return return
} }
@ -84,9 +92,11 @@ func (m *LoadavgCollector) Read(interval time.Duration, output chan lp.CCMetric)
fmt.Sprintf("Read(): Failed to convert '%s' to float64: %v", ls[i], err)) fmt.Sprintf("Read(): Failed to convert '%s' to float64: %v", ls[i], err))
continue continue
} }
_, skip = stringArrayContains(m.config.ExcludeMetrics, name) if m.load_skips[i] {
continue
}
y, err := lp.New(name, m.tags, m.meta, map[string]interface{}{"value": x}, now) y, err := lp.New(name, m.tags, m.meta, map[string]interface{}{"value": x}, now)
if err == nil && !skip { if err == nil {
output <- y output <- y
} }
} }
@ -94,16 +104,18 @@ func (m *LoadavgCollector) Read(interval time.Duration, output chan lp.CCMetric)
// Process metrics // Process metrics
lv := strings.Split(ls[3], `/`) lv := strings.Split(ls[3], `/`)
for i, name := range m.proc_matches { for i, name := range m.proc_matches {
x, err := strconv.ParseFloat(lv[i], 64) x, err := strconv.ParseInt(lv[i], 10, 64)
if err != nil { if err != nil {
cclog.ComponentError( cclog.ComponentError(
m.name, m.name,
fmt.Sprintf("Read(): Failed to convert '%s' to float64: %v", lv[i], err)) fmt.Sprintf("Read(): Failed to convert '%s' to float64: %v", lv[i], err))
continue continue
} }
_, skip = stringArrayContains(m.config.ExcludeMetrics, name) if m.proc_skips[i] {
continue
}
y, err := lp.New(name, m.tags, m.meta, map[string]interface{}{"value": x}, now) y, err := lp.New(name, m.tags, m.meta, map[string]interface{}{"value": x}, now)
if err == nil && !skip { if err == nil {
output <- y output <- y
} }