mirror of
https://github.com/ClusterCockpit/cc-metric-collector.git
synced 2025-01-15 00:29:09 +01:00
7840de7b82
* Add cpu_used (all-cpu_idle) to CpustatCollector * Update cc-metric-collector.init * Allow selection of timestamp precision in HttpSink * Add comment about precision requirement for cc-metric-store * Fix for API changes in gofish@v0.15.0 * Update requirements to latest version * Read sensors through redfish * Update golang toolchain to 1.21 * Remove stray error check * Update main config in configuration.md * Update Release action to use golang 1.22 stable release, no golang RPMs anymore * Update runonce action to use golang 1.22 stable release, no golang RPMs anymore * Update README.md Use right JSON type in configuration * Update sink's README * Test whether ipmitool or ipmi-sensors can be executed without errors * Little fixes to the prometheus sink (#115) * Add uint64 to float64 cast option * Add prometheus sink to the list of available sinks * Add aggregated counters by gpu for nvlink errors --------- Co-authored-by: Michael Schwarz <schwarz@uni-paderborn.de> * Ccmessage migration (#119) * Add cpu_used (all-cpu_idle) to CpustatCollector * Update cc-metric-collector.init * Allow selection of timestamp precision in HttpSink * Add comment about precision requirement for cc-metric-store * Fix for API changes in gofish@v0.15.0 * Update requirements to latest version * Read sensors through redfish * Update golang toolchain to 1.21 * Remove stray error check * Update main config in configuration.md * Update Release action to use golang 1.22 stable release, no golang RPMs anymore * Update runonce action to use golang 1.22 stable release, no golang RPMs anymore * Switch to CCMessage for all files. --------- Co-authored-by: Holger Obermaier <Holger.Obermaier@kit.edu> Co-authored-by: Holger Obermaier <40787752+ho-ob@users.noreply.github.com> * Switch to ccmessage also for latest additions in nvidiaMetric * New Message processor (#118) * Add cpu_used (all-cpu_idle) to CpustatCollector * Update cc-metric-collector.init * Allow selection of timestamp precision in HttpSink * Add comment about precision requirement for cc-metric-store * Fix for API changes in gofish@v0.15.0 * Update requirements to latest version * Read sensors through redfish * Update golang toolchain to 1.21 * Remove stray error check * Update main config in configuration.md * Update Release action to use golang 1.22 stable release, no golang RPMs anymore * Update runonce action to use golang 1.22 stable release, no golang RPMs anymore * New message processor to check whether a message should be dropped or manipulate it in flight * Create a copy of message before manipulation --------- Co-authored-by: Holger Obermaier <Holger.Obermaier@kit.edu> Co-authored-by: Holger Obermaier <40787752+ho-ob@users.noreply.github.com> * Update collector's Makefile and go.mod/sum files * Use message processor in router, all sinks and all receivers * Add support for credential file (NKEY) to NATS sink and receiver * Fix JSON keys in message processor configuration * Update docs for message processor, router and the default router config file * Add link to expr syntax and fix regex matching docs * Update sample collectors * Minor style change in collector manager * Some helpers for ccTopology * LIKWID collector: write log owner change only once * Fix for metrics without units and reduce debugging messages for messageProcessor * Use shorted hostname for hostname added by router * Define default port for NATS * CPUstat collector: only add unit for applicable metrics * Add precision option to all sinks using Influx's encoder * Add message processor to all sink documentation * Add units to documentation of cpustat collector --------- Co-authored-by: Holger Obermaier <Holger.Obermaier@kit.edu> Co-authored-by: Holger Obermaier <40787752+ho-ob@users.noreply.github.com> Co-authored-by: oscarminus <me@oscarminus.de> Co-authored-by: Michael Schwarz <schwarz@uni-paderborn.de>
417 lines
10 KiB
Go
417 lines
10 KiB
Go
package collectors
|
|
|
|
import (
|
|
"bufio"
|
|
"bytes"
|
|
"encoding/json"
|
|
"fmt"
|
|
"io"
|
|
"log"
|
|
"os/exec"
|
|
"os/user"
|
|
"strconv"
|
|
"strings"
|
|
"time"
|
|
|
|
cclog "github.com/ClusterCockpit/cc-metric-collector/pkg/ccLogger"
|
|
lp "github.com/ClusterCockpit/cc-energy-manager/pkg/cc-message"
|
|
)
|
|
|
|
const DEFAULT_GPFS_CMD = "mmpmon"
|
|
|
|
type GpfsCollectorLastState struct {
|
|
bytesRead int64
|
|
bytesWritten int64
|
|
}
|
|
|
|
type GpfsCollector struct {
|
|
metricCollector
|
|
tags map[string]string
|
|
config struct {
|
|
Mmpmon string `json:"mmpmon_path,omitempty"`
|
|
ExcludeFilesystem []string `json:"exclude_filesystem,omitempty"`
|
|
SendBandwidths bool `json:"send_bandwidths"`
|
|
SendTotalValues bool `json:"send_total_values"`
|
|
}
|
|
skipFS map[string]struct{}
|
|
lastTimestamp time.Time // Store time stamp of last tick to derive bandwidths
|
|
lastState map[string]GpfsCollectorLastState
|
|
}
|
|
|
|
func (m *GpfsCollector) Init(config json.RawMessage) error {
|
|
// Check if already initialized
|
|
if m.init {
|
|
return nil
|
|
}
|
|
|
|
var err error
|
|
m.name = "GpfsCollector"
|
|
m.setup()
|
|
m.parallel = true
|
|
|
|
// Set default mmpmon binary
|
|
m.config.Mmpmon = DEFAULT_GPFS_CMD
|
|
|
|
// Read JSON configuration
|
|
if len(config) > 0 {
|
|
err = json.Unmarshal(config, &m.config)
|
|
if err != nil {
|
|
log.Print(err.Error())
|
|
return err
|
|
}
|
|
}
|
|
m.meta = map[string]string{
|
|
"source": m.name,
|
|
"group": "GPFS",
|
|
}
|
|
m.tags = map[string]string{
|
|
"type": "node",
|
|
"filesystem": "",
|
|
}
|
|
m.skipFS = make(map[string]struct{})
|
|
for _, fs := range m.config.ExcludeFilesystem {
|
|
m.skipFS[fs] = struct{}{}
|
|
}
|
|
m.lastState = make(map[string]GpfsCollectorLastState)
|
|
|
|
// GPFS / IBM Spectrum Scale file system statistics can only be queried by user root
|
|
user, err := user.Current()
|
|
if err != nil {
|
|
return fmt.Errorf("failed to get current user: %v", err)
|
|
}
|
|
if user.Uid != "0" {
|
|
return fmt.Errorf("GPFS file system statistics can only be queried by user root")
|
|
}
|
|
|
|
// Check if mmpmon is in executable search path
|
|
p, err := exec.LookPath(m.config.Mmpmon)
|
|
if err != nil {
|
|
return fmt.Errorf("failed to find mmpmon binary '%s': %v", m.config.Mmpmon, err)
|
|
}
|
|
m.config.Mmpmon = p
|
|
|
|
m.init = true
|
|
return nil
|
|
}
|
|
|
|
func (m *GpfsCollector) Read(interval time.Duration, output chan lp.CCMessage) {
|
|
// Check if already initialized
|
|
if !m.init {
|
|
return
|
|
}
|
|
|
|
// Current time stamp
|
|
now := time.Now()
|
|
// time difference to last time stamp
|
|
timeDiff := now.Sub(m.lastTimestamp).Seconds()
|
|
// Save current timestamp
|
|
m.lastTimestamp = now
|
|
|
|
// mmpmon:
|
|
// -p: generate output that can be parsed
|
|
// -s: suppress the prompt on input
|
|
// fs_io_s: Displays I/O statistics per mounted file system
|
|
cmd := exec.Command(m.config.Mmpmon, "-p", "-s")
|
|
cmd.Stdin = strings.NewReader("once fs_io_s\n")
|
|
cmdStdout := new(bytes.Buffer)
|
|
cmdStderr := new(bytes.Buffer)
|
|
cmd.Stdout = cmdStdout
|
|
cmd.Stderr = cmdStderr
|
|
err := cmd.Run()
|
|
if err != nil {
|
|
dataStdErr, _ := io.ReadAll(cmdStderr)
|
|
dataStdOut, _ := io.ReadAll(cmdStdout)
|
|
cclog.ComponentError(
|
|
m.name,
|
|
fmt.Sprintf("Read(): Failed to execute command \"%s\": %v\n", cmd.String(), err),
|
|
fmt.Sprintf("Read(): command exit code: \"%d\"\n", cmd.ProcessState.ExitCode()),
|
|
fmt.Sprintf("Read(): command stderr: \"%s\"\n", string(dataStdErr)),
|
|
fmt.Sprintf("Read(): command stdout: \"%s\"\n", string(dataStdOut)),
|
|
)
|
|
return
|
|
}
|
|
|
|
// Read I/O statistics
|
|
scanner := bufio.NewScanner(cmdStdout)
|
|
for scanner.Scan() {
|
|
lineSplit := strings.Fields(scanner.Text())
|
|
|
|
// Only process lines starting with _fs_io_s_
|
|
if lineSplit[0] != "_fs_io_s_" {
|
|
continue
|
|
}
|
|
|
|
key_value := make(map[string]string)
|
|
for i := 1; i < len(lineSplit); i += 2 {
|
|
key_value[lineSplit[i]] = lineSplit[i+1]
|
|
}
|
|
|
|
// Ignore keys:
|
|
// _n_: node IP address,
|
|
// _nn_: node name,
|
|
// _cl_: cluster name,
|
|
// _d_: number of disks
|
|
|
|
filesystem, ok := key_value["_fs_"]
|
|
if !ok {
|
|
cclog.ComponentError(
|
|
m.name,
|
|
"Read(): Failed to get filesystem name.")
|
|
continue
|
|
}
|
|
|
|
// Skip excluded filesystems
|
|
if _, skip := m.skipFS[filesystem]; skip {
|
|
continue
|
|
}
|
|
|
|
// Add filesystem tag
|
|
m.tags["filesystem"] = filesystem
|
|
|
|
// Create initial last state
|
|
if m.config.SendBandwidths {
|
|
if _, ok := m.lastState[filesystem]; !ok {
|
|
m.lastState[filesystem] = GpfsCollectorLastState{
|
|
bytesRead: -1,
|
|
bytesWritten: -1,
|
|
}
|
|
}
|
|
}
|
|
|
|
// return code
|
|
rc, err := strconv.Atoi(key_value["_rc_"])
|
|
if err != nil {
|
|
cclog.ComponentError(
|
|
m.name,
|
|
fmt.Sprintf("Read(): Failed to convert return code '%s' to int: %v", key_value["_rc_"], err))
|
|
continue
|
|
}
|
|
if rc != 0 {
|
|
cclog.ComponentError(
|
|
m.name,
|
|
fmt.Sprintf("Read(): Filesystem '%s' is not ok.", filesystem))
|
|
continue
|
|
}
|
|
|
|
sec, err := strconv.ParseInt(key_value["_t_"], 10, 64)
|
|
if err != nil {
|
|
cclog.ComponentError(
|
|
m.name,
|
|
fmt.Sprintf("Read(): Failed to convert seconds '%s' to int64: %v", key_value["_t_"], err))
|
|
continue
|
|
}
|
|
msec, err := strconv.ParseInt(key_value["_tu_"], 10, 64)
|
|
if err != nil {
|
|
cclog.ComponentError(
|
|
m.name,
|
|
fmt.Sprintf("Read(): Failed to convert micro seconds '%s' to int64: %v", key_value["_tu_"], err))
|
|
continue
|
|
}
|
|
timestamp := time.Unix(sec, msec*1000)
|
|
|
|
// bytes read
|
|
bytesRead, err := strconv.ParseInt(key_value["_br_"], 10, 64)
|
|
if err != nil {
|
|
cclog.ComponentError(
|
|
m.name,
|
|
fmt.Sprintf("Read(): Failed to convert bytes read '%s' to int64: %v", key_value["_br_"], err))
|
|
continue
|
|
}
|
|
if y, err :=
|
|
lp.NewMessage(
|
|
"gpfs_bytes_read",
|
|
m.tags,
|
|
m.meta,
|
|
map[string]interface{}{
|
|
"value": bytesRead,
|
|
},
|
|
timestamp,
|
|
); err == nil {
|
|
y.AddMeta("unit", "bytes")
|
|
output <- y
|
|
}
|
|
if m.config.SendBandwidths {
|
|
if lastBytesRead := m.lastState[filesystem].bytesRead; lastBytesRead >= 0 {
|
|
bwRead := float64(bytesRead-lastBytesRead) / timeDiff
|
|
if y, err :=
|
|
lp.NewMessage(
|
|
"gpfs_bw_read",
|
|
m.tags,
|
|
m.meta,
|
|
map[string]interface{}{
|
|
"value": bwRead,
|
|
},
|
|
timestamp,
|
|
); err == nil {
|
|
y.AddMeta("unit", "bytes/sec")
|
|
output <- y
|
|
}
|
|
}
|
|
}
|
|
|
|
// bytes written
|
|
bytesWritten, err := strconv.ParseInt(key_value["_bw_"], 10, 64)
|
|
if err != nil {
|
|
cclog.ComponentError(
|
|
m.name,
|
|
fmt.Sprintf("Read(): Failed to convert bytes written '%s' to int64: %v", key_value["_bw_"], err))
|
|
continue
|
|
}
|
|
if y, err :=
|
|
lp.NewMessage(
|
|
"gpfs_bytes_written",
|
|
m.tags,
|
|
m.meta,
|
|
map[string]interface{}{
|
|
"value": bytesWritten,
|
|
},
|
|
timestamp,
|
|
); err == nil {
|
|
y.AddMeta("unit", "bytes")
|
|
output <- y
|
|
}
|
|
if m.config.SendBandwidths {
|
|
if lastBytesWritten := m.lastState[filesystem].bytesRead; lastBytesWritten >= 0 {
|
|
bwWrite := float64(bytesWritten-lastBytesWritten) / timeDiff
|
|
if y, err :=
|
|
lp.NewMessage(
|
|
"gpfs_bw_write",
|
|
m.tags,
|
|
m.meta,
|
|
map[string]interface{}{
|
|
"value": bwWrite,
|
|
},
|
|
timestamp,
|
|
); err == nil {
|
|
y.AddMeta("unit", "bytes/sec")
|
|
output <- y
|
|
}
|
|
}
|
|
}
|
|
|
|
if m.config.SendBandwidths {
|
|
m.lastState[filesystem] = GpfsCollectorLastState{
|
|
bytesRead: bytesRead,
|
|
bytesWritten: bytesWritten,
|
|
}
|
|
}
|
|
|
|
// number of opens
|
|
numOpens, err := strconv.ParseInt(key_value["_oc_"], 10, 64)
|
|
if err != nil {
|
|
cclog.ComponentError(
|
|
m.name,
|
|
fmt.Sprintf("Read(): Failed to convert number of opens '%s' to int64: %v", key_value["_oc_"], err))
|
|
continue
|
|
}
|
|
if y, err := lp.NewMessage("gpfs_num_opens", m.tags, m.meta, map[string]interface{}{"value": numOpens}, timestamp); err == nil {
|
|
output <- y
|
|
}
|
|
|
|
// number of closes
|
|
numCloses, err := strconv.ParseInt(key_value["_cc_"], 10, 64)
|
|
if err != nil {
|
|
cclog.ComponentError(
|
|
m.name,
|
|
fmt.Sprintf("Read(): Failed to convert number of closes: '%s' to int64: %v", key_value["_cc_"], err))
|
|
continue
|
|
}
|
|
if y, err := lp.NewMessage("gpfs_num_closes", m.tags, m.meta, map[string]interface{}{"value": numCloses}, timestamp); err == nil {
|
|
output <- y
|
|
}
|
|
|
|
// number of reads
|
|
numReads, err := strconv.ParseInt(key_value["_rdc_"], 10, 64)
|
|
if err != nil {
|
|
cclog.ComponentError(
|
|
m.name,
|
|
fmt.Sprintf("Read(): Failed to convert number of reads: '%s' to int64: %v", key_value["_rdc_"], err))
|
|
continue
|
|
}
|
|
if y, err := lp.NewMessage("gpfs_num_reads", m.tags, m.meta, map[string]interface{}{"value": numReads}, timestamp); err == nil {
|
|
output <- y
|
|
}
|
|
|
|
// number of writes
|
|
numWrites, err := strconv.ParseInt(key_value["_wc_"], 10, 64)
|
|
if err != nil {
|
|
cclog.ComponentError(
|
|
m.name,
|
|
fmt.Sprintf("Read(): Failed to convert number of writes: '%s' to int64: %v", key_value["_wc_"], err))
|
|
continue
|
|
}
|
|
if y, err := lp.NewMessage("gpfs_num_writes", m.tags, m.meta, map[string]interface{}{"value": numWrites}, timestamp); err == nil {
|
|
output <- y
|
|
}
|
|
|
|
// number of read directories
|
|
numReaddirs, err := strconv.ParseInt(key_value["_dir_"], 10, 64)
|
|
if err != nil {
|
|
cclog.ComponentError(
|
|
m.name,
|
|
fmt.Sprintf("Read(): Failed to convert number of read directories: '%s' to int64: %v", key_value["_dir_"], err))
|
|
continue
|
|
}
|
|
if y, err := lp.NewMessage("gpfs_num_readdirs", m.tags, m.meta, map[string]interface{}{"value": numReaddirs}, timestamp); err == nil {
|
|
output <- y
|
|
}
|
|
|
|
// Number of inode updates
|
|
numInodeUpdates, err := strconv.ParseInt(key_value["_iu_"], 10, 64)
|
|
if err != nil {
|
|
cclog.ComponentError(
|
|
m.name,
|
|
fmt.Sprintf("Read(): Failed to convert number of inode updates: '%s' to int: %v", key_value["_iu_"], err))
|
|
continue
|
|
}
|
|
if y, err := lp.NewMessage("gpfs_num_inode_updates", m.tags, m.meta, map[string]interface{}{"value": numInodeUpdates}, timestamp); err == nil {
|
|
output <- y
|
|
}
|
|
|
|
// Total values
|
|
if m.config.SendTotalValues {
|
|
bytesTotal := bytesRead + bytesWritten
|
|
if y, err :=
|
|
lp.NewMessage("gpfs_bytes_total",
|
|
m.tags,
|
|
m.meta,
|
|
map[string]interface{}{
|
|
"value": bytesTotal,
|
|
},
|
|
timestamp,
|
|
); err == nil {
|
|
y.AddMeta("unit", "bytes")
|
|
output <- y
|
|
}
|
|
iops := numReads + numWrites
|
|
if y, err :=
|
|
lp.NewMessage("gpfs_iops",
|
|
m.tags,
|
|
m.meta,
|
|
map[string]interface{}{
|
|
"value": iops,
|
|
},
|
|
timestamp,
|
|
); err == nil {
|
|
output <- y
|
|
}
|
|
metaops := numInodeUpdates + numCloses + numOpens + numReaddirs
|
|
if y, err :=
|
|
lp.NewMessage("gpfs_metaops",
|
|
m.tags,
|
|
m.meta,
|
|
map[string]interface{}{
|
|
"value": metaops,
|
|
},
|
|
timestamp,
|
|
); err == nil {
|
|
output <- y
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
func (m *GpfsCollector) Close() {
|
|
m.init = false
|
|
}
|