mirror of
https://github.com/ClusterCockpit/cc-metric-collector.git
synced 2024-12-26 23:29:06 +01:00
6ab45dd3ec
* Add cpu_used (all-cpu_idle) to CpustatCollector * Update to line-protocol/v2 * Update runonce.yml with Golang 1.20 * Update fsnotify in LIKWID Collector * Use not a pointer to line-protocol.Encoder * Simplify Makefile * Use only as many arguments as required * Allow sum function to handle non float types * Allow values to be a slice of type float64, float32, int, int64, int32, bool * Use generic function to simplify code * Add missing case for type []int32 * Use generic function to compute minimum * Use generic function to compute maximum * Use generic function to compute average * Add error value to sumAnyType * Use generic function to compute median * For older versions of go slices is not part of the installation * Remove old entries from go.sum * Use simpler sort function * Compute metrics ib_total and ib_total_pkts * Add aggregated metrics. Add missing units * Update likwidMetric.go Fixes a potential bug when `fsnotify.NewWatcher()` fails with an error * Completly avoid memory allocations in infinibandMetric read() * Fixed initialization: Initalization and measurements should run in the same thread * Add safe.directory to Release action * Fix path after installation to /usr/bin after installation * ioutil.ReadFile is deprecated: As of Go 1.16, this function simply calls os.ReadFile * Switch to package slices from the golang 1.21 default library * Read file line by line * Read file line by line * Read file line by line * Use CamelCase * Use CamelCase * Fix function getNumaDomain, it always returned 0 * Avoid type conversion by using Atoi Avoid copying structs by using pointer access Increase readability with CamelCase variable names * Add caching * Cache CpuData * Cleanup * Use init function to initalize cache structure to avoid multi threading problems * Reuse information from /proc/cpuinfo * Avoid slice cloning. 
Directly use the cache * Add DieList * Add NumaDomainList and SMTList * Cleanup * Add comment * Lookup core ID from /sys/devices/system/cpu, /proc/cpuinfo is not portable * Lookup all information from /sys/devices/system/cpu, /proc/cpuinfo is not portable * Correctly handle lists from /sys * Add Simultaneous Multithreading siblings * Replace deprecated thread_siblings_list by core_cpus_list * Reduce number of required slices * Allow to send total values per core, socket and node * Send all metrics with same time stamp calcEventsetMetrics does only computiation, counter measurement is done before * Input parameters should be float64 when evaluating to float64 * Send all metrics with same time stamp calcGlobalMetrics does only computiation, counter measurement is done before * Remove unused variable gmresults * Add comments * Updated go packages * Add build with golang 1.21 * Switch to checkout action version 4 * Switch to setup-go action version 4 * Add workflow_dispatch to allow manual run of workflow * Add workflow_dispatch to allow manual run of workflow * Add release build jobs to runonce.yml * Switch to golang 1.20 for RHEL based distributions * Use dnf to download golang * Remove golang versions before 1.20 * Upgrade Ubuntu focal -> jammy * Pipe golang tar package directly to tar * Update golang version * Fix Ubuntu version number * Add links to ipmi and redfish receivers * Fix http server addr format * github.com/influxdata/line-protocol -> github.com/influxdata/line-protocol/v2/lineprotocol * Corrected spelling * Add some comments * github.com/influxdata/line-protocol -> github.com/influxdata/line-protocol/v2/lineprotocol * Allow other fields not only field "value" * Add some basic debugging documentation * Add some basic debugging documentation * Use a lock for the flush timer * Add tags in lexical order as required by AddTag() * Only access meta data, when it gets used as tag * Use slice to store lexialicly orderd key value pairs * Increase golang version 
requirement to 1.20. * Avoid package cmp to allow builds with golang v1.20 * Fix: Error NVML library not found did crash cc-metric-collector with "SIGSEGV: segmentation violation" * Add config option idle_timeout * Add basic authentication support * Add basic authentication support * Avoid unneccessary memory allocations * Add documentation for send_*_total values * Use generic package maps to clone maps * Reuse flush timer * Add Influx client options * Reuse ccTopology functionality * Do not store unused topology information * Add batch_size config * Cleanup * Use stype and stype-id for the NIC in NetstatCollector * Wait for concurrent flush operations to finish * Be more verbose in error messages * Reverted previous changes. Made the code to complex without much advantages * Use line protocol encoder * Go pkg update * Stop flush timer, when immediatelly flushing * Fix: Corrected unlock access to batch slice * Add config option to specify whether to use GZip compression in influx write requests * Add asynchron send of encoder metrics * Use DefaultServeMux instead of github.com/gorilla/mux * Add config option for HTTP keep-alives * Be more strict, when parsing json * Add config option for HTTP request timeout and Retry interval * Allow more then one background send operation * Fix %sysusers_create_package args (#108) %sysusers_create_package requires two arguments. See: https://github.com/systemd/systemd/blob/main/src/rpm/macros.systemd.in#L165 * Add nfsiostat to list of collectors --------- Co-authored-by: Holger Obermaier <40787752+ho-ob@users.noreply.github.com> Co-authored-by: Holger Obermaier <holgerob@gmx.de> Co-authored-by: Obihörnchen <obihoernchende@gmail.com>
432 lines
10 KiB
Go
432 lines
10 KiB
Go
package collectors
|
|
|
|
import (
|
|
"encoding/json"
|
|
"errors"
|
|
"fmt"
|
|
"os/exec"
|
|
"os/user"
|
|
"strconv"
|
|
"strings"
|
|
"time"
|
|
|
|
cclog "github.com/ClusterCockpit/cc-metric-collector/pkg/ccLogger"
|
|
lp "github.com/ClusterCockpit/cc-metric-collector/pkg/ccMetric"
|
|
)
|
|
|
|
// Defaults used to locate and query the Lustre client statistics.
const (
	// LUSTRE_SYSFS is the sysfs root of the Lustre file system. It is only
	// referenced by the (currently disabled) sysfs based readers.
	LUSTRE_SYSFS = "/sys/fs/lustre"

	// LCTL_CMD is the executable name looked up in PATH when the configured
	// lctl command cannot be resolved.
	LCTL_CMD = "lctl"

	// LCTL_OPTION is the lctl sub-command used to read a parameter.
	LCTL_OPTION = "get_param"
)
|
|
|
|
// LustreCollectorConfig is the JSON configuration accepted by
// LustreCollector.Init.
type LustreCollectorConfig struct {
	// LCtlCommand is the lctl executable to use. When it cannot be found,
	// Init falls back to looking up LCTL_CMD.
	LCtlCommand string `json:"lctl_command,omitempty"`

	// ExcludeMetrics lists metric names that must not be collected.
	ExcludeMetrics []string `json:"exclude_metrics,omitempty"`

	// Sudo runs lctl through sudo instead of requiring the collector to run
	// as user root.
	Sudo bool `json:"use_sudo,omitempty"`

	// SendAbsoluteValues enables the metrics defined in LustreAbsMetrics.
	SendAbsoluteValues bool `json:"send_abs_values,omitempty"`

	// SendDerivedValues enables the metrics defined in LustreDeriveMetrics.
	SendDerivedValues bool `json:"send_derived_values,omitempty"`

	// SendDiffValues enables the metrics defined in LustreDiffMetrics.
	SendDiffValues bool `json:"send_diff_values,omitempty"`
}
|
|
|
|
// LustreMetricDefinition describes how one metric is extracted from the
// lines returned by lctl and how its value is post-processed.
type LustreMetricDefinition struct {
	// name is the metric name the value is sent under.
	name string

	// lineprefix selects the stats line: the first line starting with this
	// string is used.
	lineprefix string

	// lineoffset is the index of the whitespace-separated field to read
	// from the selected line.
	lineoffset int

	// unit is attached as "unit" meta information when it is not empty.
	unit string

	// calc selects the post-processing done in Read: "none", "difference"
	// or "derivative".
	calc string
}
|
|
|
|
type LustreCollector struct {
|
|
metricCollector
|
|
tags map[string]string
|
|
config LustreCollectorConfig
|
|
lctl string
|
|
sudoCmd string
|
|
lastTimestamp time.Time // Store time stamp of last tick to derive bandwidths
|
|
definitions []LustreMetricDefinition // Combined list without excluded metrics
|
|
stats map[string]map[string]int64 // Data for last value per device and metric
|
|
}
|
|
|
|
func (m *LustreCollector) getDeviceDataCommand(device string) []string {
|
|
var command *exec.Cmd
|
|
statsfile := fmt.Sprintf("llite.%s.stats", device)
|
|
if m.config.Sudo {
|
|
command = exec.Command(m.sudoCmd, m.lctl, LCTL_OPTION, statsfile)
|
|
} else {
|
|
command = exec.Command(m.lctl, LCTL_OPTION, statsfile)
|
|
}
|
|
command.Wait()
|
|
stdout, _ := command.Output()
|
|
return strings.Split(string(stdout), "\n")
|
|
}
|
|
|
|
func (m *LustreCollector) getDevices() []string {
|
|
devices := make([]string, 0)
|
|
|
|
// //Version reading devices from sysfs
|
|
// globPattern := filepath.Join(LUSTRE_SYSFS, "llite/*/stats")
|
|
// files, err := filepath.Glob(globPattern)
|
|
// if err != nil {
|
|
// return devices
|
|
// }
|
|
// for _, f := range files {
|
|
// pathlist := strings.Split(f, "/")
|
|
// devices = append(devices, pathlist[4])
|
|
// }
|
|
|
|
data := m.getDeviceDataCommand("*")
|
|
|
|
for _, line := range data {
|
|
if strings.HasPrefix(line, "llite") {
|
|
linefields := strings.Split(line, ".")
|
|
if len(linefields) > 2 {
|
|
devices = append(devices, linefields[1])
|
|
}
|
|
}
|
|
}
|
|
return devices
|
|
}
|
|
|
|
// getMetricData looks for the first line starting with prefix and returns
// the whitespace-separated field at index offset, parsed as a 64-bit integer.
//
// The field is parsed with base 0, so the base prefixes understood by
// strconv.ParseInt (e.g. "0x") are honoured.
//
// An error is returned when no line starts with prefix, when the matching
// line has no field at index offset, or when the field is not a valid
// integer.
func getMetricData(lines []string, prefix string, offset int) (int64, error) {
	for _, line := range lines {
		if !strings.HasPrefix(line, prefix) {
			continue
		}
		fields := strings.Fields(line)
		// A truncated or differently formatted line must produce an error,
		// not an index-out-of-range panic.
		if offset < 0 || offset >= len(fields) {
			return 0, fmt.Errorf("line %q has no field at offset %d", line, offset)
		}
		return strconv.ParseInt(fields[offset], 0, 64)
	}
	return 0, errors.New("no such line in data")
}
|
|
|
|
// //Version reading the stats data of a device from sysfs
|
|
// func (m *LustreCollector) getDeviceDataSysfs(device string) []string {
|
|
// llitedir := filepath.Join(LUSTRE_SYSFS, "llite")
|
|
// devdir := filepath.Join(llitedir, device)
|
|
// statsfile := filepath.Join(devdir, "stats")
|
|
// buffer, err := os.ReadFile(statsfile)
|
|
// if err != nil {
|
|
// return make([]string, 0)
|
|
// }
|
|
// return strings.Split(string(buffer), "\n")
|
|
// }
|
|
|
|
var LustreAbsMetrics = []LustreMetricDefinition{
|
|
{
|
|
name: "lustre_read_requests",
|
|
lineprefix: "read_bytes",
|
|
lineoffset: 1,
|
|
unit: "requests",
|
|
calc: "none",
|
|
},
|
|
{
|
|
name: "lustre_write_requests",
|
|
lineprefix: "write_bytes",
|
|
lineoffset: 1,
|
|
unit: "requests",
|
|
calc: "none",
|
|
},
|
|
{
|
|
name: "lustre_read_bytes",
|
|
lineprefix: "read_bytes",
|
|
lineoffset: 6,
|
|
unit: "bytes",
|
|
calc: "none",
|
|
},
|
|
{
|
|
name: "lustre_write_bytes",
|
|
lineprefix: "write_bytes",
|
|
lineoffset: 6,
|
|
unit: "bytes",
|
|
calc: "none",
|
|
},
|
|
{
|
|
name: "lustre_open",
|
|
lineprefix: "open",
|
|
lineoffset: 1,
|
|
unit: "",
|
|
calc: "none",
|
|
},
|
|
{
|
|
name: "lustre_close",
|
|
lineprefix: "close",
|
|
lineoffset: 1,
|
|
unit: "",
|
|
calc: "none",
|
|
},
|
|
{
|
|
name: "lustre_setattr",
|
|
lineprefix: "setattr",
|
|
lineoffset: 1,
|
|
unit: "",
|
|
calc: "none",
|
|
},
|
|
{
|
|
name: "lustre_getattr",
|
|
lineprefix: "getattr",
|
|
lineoffset: 1,
|
|
unit: "",
|
|
calc: "none",
|
|
},
|
|
{
|
|
name: "lustre_statfs",
|
|
lineprefix: "statfs",
|
|
lineoffset: 1,
|
|
unit: "",
|
|
calc: "none",
|
|
},
|
|
{
|
|
name: "lustre_inode_permission",
|
|
lineprefix: "inode_permission",
|
|
lineoffset: 1,
|
|
unit: "",
|
|
calc: "none",
|
|
},
|
|
}
|
|
|
|
var LustreDiffMetrics = []LustreMetricDefinition{
|
|
{
|
|
name: "lustre_read_requests_diff",
|
|
lineprefix: "read_bytes",
|
|
lineoffset: 1,
|
|
unit: "requests",
|
|
calc: "difference",
|
|
},
|
|
{
|
|
name: "lustre_write_requests_diff",
|
|
lineprefix: "write_bytes",
|
|
lineoffset: 1,
|
|
unit: "requests",
|
|
calc: "difference",
|
|
},
|
|
{
|
|
name: "lustre_read_bytes_diff",
|
|
lineprefix: "read_bytes",
|
|
lineoffset: 6,
|
|
unit: "bytes",
|
|
calc: "difference",
|
|
},
|
|
{
|
|
name: "lustre_write_bytes_diff",
|
|
lineprefix: "write_bytes",
|
|
lineoffset: 6,
|
|
unit: "bytes",
|
|
calc: "difference",
|
|
},
|
|
{
|
|
name: "lustre_open_diff",
|
|
lineprefix: "open",
|
|
lineoffset: 1,
|
|
unit: "",
|
|
calc: "difference",
|
|
},
|
|
{
|
|
name: "lustre_close_diff",
|
|
lineprefix: "close",
|
|
lineoffset: 1,
|
|
unit: "",
|
|
calc: "difference",
|
|
},
|
|
{
|
|
name: "lustre_setattr_diff",
|
|
lineprefix: "setattr",
|
|
lineoffset: 1,
|
|
unit: "",
|
|
calc: "difference",
|
|
},
|
|
{
|
|
name: "lustre_getattr_diff",
|
|
lineprefix: "getattr",
|
|
lineoffset: 1,
|
|
unit: "",
|
|
calc: "difference",
|
|
},
|
|
{
|
|
name: "lustre_statfs_diff",
|
|
lineprefix: "statfs",
|
|
lineoffset: 1,
|
|
unit: "",
|
|
calc: "difference",
|
|
},
|
|
{
|
|
name: "lustre_inode_permission_diff",
|
|
lineprefix: "inode_permission",
|
|
lineoffset: 1,
|
|
unit: "",
|
|
calc: "difference",
|
|
},
|
|
}
|
|
|
|
var LustreDeriveMetrics = []LustreMetricDefinition{
|
|
{
|
|
name: "lustre_read_requests_rate",
|
|
lineprefix: "read_bytes",
|
|
lineoffset: 1,
|
|
unit: "requests/sec",
|
|
calc: "derivative",
|
|
},
|
|
{
|
|
name: "lustre_write_requests_rate",
|
|
lineprefix: "write_bytes",
|
|
lineoffset: 1,
|
|
unit: "requests/sec",
|
|
calc: "derivative",
|
|
},
|
|
{
|
|
name: "lustre_read_bw",
|
|
lineprefix: "read_bytes",
|
|
lineoffset: 6,
|
|
unit: "bytes/sec",
|
|
calc: "derivative",
|
|
},
|
|
{
|
|
name: "lustre_write_bw",
|
|
lineprefix: "write_bytes",
|
|
lineoffset: 6,
|
|
unit: "bytes/sec",
|
|
calc: "derivative",
|
|
},
|
|
}
|
|
|
|
func (m *LustreCollector) Init(config json.RawMessage) error {
|
|
var err error
|
|
m.name = "LustreCollector"
|
|
m.parallel = true
|
|
if len(config) > 0 {
|
|
err = json.Unmarshal(config, &m.config)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
}
|
|
m.setup()
|
|
m.tags = map[string]string{"type": "node"}
|
|
m.meta = map[string]string{"source": m.name, "group": "Lustre"}
|
|
|
|
// Lustre file system statistics can only be queried by user root
|
|
// or with password-less sudo
|
|
if !m.config.Sudo {
|
|
user, err := user.Current()
|
|
if err != nil {
|
|
cclog.ComponentError(m.name, "Failed to get current user:", err.Error())
|
|
return err
|
|
}
|
|
if user.Uid != "0" {
|
|
cclog.ComponentError(m.name, "Lustre file system statistics can only be queried by user root")
|
|
return err
|
|
}
|
|
} else {
|
|
p, err := exec.LookPath("sudo")
|
|
if err != nil {
|
|
cclog.ComponentError(m.name, "Cannot find 'sudo'")
|
|
return err
|
|
}
|
|
m.sudoCmd = p
|
|
}
|
|
|
|
p, err := exec.LookPath(m.config.LCtlCommand)
|
|
if err != nil {
|
|
p, err = exec.LookPath(LCTL_CMD)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
}
|
|
m.lctl = p
|
|
|
|
m.definitions = []LustreMetricDefinition{}
|
|
if m.config.SendAbsoluteValues {
|
|
for _, def := range LustreAbsMetrics {
|
|
if _, skip := stringArrayContains(m.config.ExcludeMetrics, def.name); !skip {
|
|
m.definitions = append(m.definitions, def)
|
|
}
|
|
}
|
|
}
|
|
if m.config.SendDiffValues {
|
|
for _, def := range LustreDiffMetrics {
|
|
if _, skip := stringArrayContains(m.config.ExcludeMetrics, def.name); !skip {
|
|
m.definitions = append(m.definitions, def)
|
|
}
|
|
}
|
|
}
|
|
if m.config.SendDerivedValues {
|
|
for _, def := range LustreDeriveMetrics {
|
|
if _, skip := stringArrayContains(m.config.ExcludeMetrics, def.name); !skip {
|
|
m.definitions = append(m.definitions, def)
|
|
}
|
|
}
|
|
}
|
|
if len(m.definitions) == 0 {
|
|
return errors.New("no metrics to collect")
|
|
}
|
|
|
|
devices := m.getDevices()
|
|
if len(devices) == 0 {
|
|
return errors.New("no Lustre devices found")
|
|
}
|
|
m.stats = make(map[string]map[string]int64)
|
|
for _, d := range devices {
|
|
m.stats[d] = make(map[string]int64)
|
|
data := m.getDeviceDataCommand(d)
|
|
for _, def := range m.definitions {
|
|
x, err := getMetricData(data, def.lineprefix, def.lineoffset)
|
|
if err == nil {
|
|
m.stats[d][def.name] = x
|
|
} else {
|
|
m.stats[d][def.name] = 0
|
|
}
|
|
}
|
|
}
|
|
m.lastTimestamp = time.Now()
|
|
m.init = true
|
|
return nil
|
|
}
|
|
|
|
func (m *LustreCollector) Read(interval time.Duration, output chan lp.CCMetric) {
|
|
if !m.init {
|
|
return
|
|
}
|
|
now := time.Now()
|
|
tdiff := now.Sub(m.lastTimestamp)
|
|
for device, devData := range m.stats {
|
|
data := m.getDeviceDataCommand(device)
|
|
for _, def := range m.definitions {
|
|
var use_x int64
|
|
var err error
|
|
var y lp.CCMetric
|
|
x, err := getMetricData(data, def.lineprefix, def.lineoffset)
|
|
if err == nil {
|
|
use_x = x
|
|
} else {
|
|
use_x = devData[def.name]
|
|
}
|
|
var value interface{}
|
|
switch def.calc {
|
|
case "none":
|
|
value = use_x
|
|
y, err = lp.New(def.name, m.tags, m.meta, map[string]interface{}{"value": value}, time.Now())
|
|
case "difference":
|
|
value = use_x - devData[def.name]
|
|
if value.(int64) < 0 {
|
|
value = 0
|
|
}
|
|
y, err = lp.New(def.name, m.tags, m.meta, map[string]interface{}{"value": value}, time.Now())
|
|
case "derivative":
|
|
value = float64(use_x-devData[def.name]) / tdiff.Seconds()
|
|
if value.(float64) < 0 {
|
|
value = 0
|
|
}
|
|
y, err = lp.New(def.name, m.tags, m.meta, map[string]interface{}{"value": value}, time.Now())
|
|
}
|
|
if err == nil {
|
|
y.AddTag("device", device)
|
|
if len(def.unit) > 0 {
|
|
y.AddMeta("unit", def.unit)
|
|
}
|
|
output <- y
|
|
}
|
|
devData[def.name] = use_x
|
|
}
|
|
}
|
|
m.lastTimestamp = now
|
|
}
|
|
|
|
func (m *LustreCollector) Close() {
|
|
m.init = false
|
|
}
|