Derived metrics (#65)

* Add time-based derivatives (e.g. bandwidth) to some collectors

* Add documentation

* Add comments

* Fix: Only compute rates with a valid previous state

* Only compute rates with a valid previous state

* Define const values for net/dev fields

* Set default config values

* Add comments

* Refactor: Consolidate data structures

* Refactor: Consolidate data structures

* Refactor: Avoid struct deep copy

* Refactor: Avoid redundant tag maps

* Refactor: Use int64 type for absolute values

* Update LustreCollector

Co-authored-by: Holger Obermaier <40787752+ho-ob@users.noreply.github.com>
This commit is contained in:
Thomas Gruber 2022-03-15 16:09:47 +01:00 committed by GitHub
parent 992b19d354
commit aa1afd745e
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 301 additions and 106 deletions

View File

@ -19,23 +19,31 @@ const LCTL_CMD = `lctl`
const LCTL_OPTION = `get_param` const LCTL_OPTION = `get_param`
type LustreCollectorConfig struct { type LustreCollectorConfig struct {
LCtlCommand string `json:"lctl_command"` LCtlCommand string `json:"lctl_command,omitempty"`
ExcludeMetrics []string `json:"exclude_metrics"` ExcludeMetrics []string `json:"exclude_metrics,omitempty"`
SendAllMetrics bool `json:"send_all_metrics"` Sudo bool `json:"use_sudo,omitempty"`
Sudo bool `json:"use_sudo"` SendAbsoluteValues bool `json:"send_abs_values,omitempty"`
SendAbsoluteValues bool `json:"send_abs_values"` SendDerivedValues bool `json:"send_derived_values,omitempty"`
SendDerivedValues bool `json:"send_derived_values"` SendDiffValues bool `json:"send_diff_values,omitempty"`
}
type LustreMetricDefinition struct {
name string
lineprefix string
lineoffset int
unit string
calc string
} }
type LustreCollector struct { type LustreCollector struct {
metricCollector metricCollector
tags map[string]string tags map[string]string
matches map[string]map[string]int
stats map[string]map[string]int64
config LustreCollectorConfig config LustreCollectorConfig
lctl string lctl string
sudoCmd string sudoCmd string
lastTimestamp time.Time // Store time stamp of last tick to derive bandwidths lastTimestamp time.Time // Store time stamp of last tick to derive bandwidths
definitions []LustreMetricDefinition // Combined list without excluded metrics
stats map[string]map[string]int64 // Data for last value per device and metric
} }
func (m *LustreCollector) getDeviceDataCommand(device string) []string { func (m *LustreCollector) getDeviceDataCommand(device string) []string {
@ -78,6 +86,16 @@ func (m *LustreCollector) getDevices() []string {
return devices return devices
} }
// getMetricData scans lines for the first entry that starts with prefix,
// splits that entry on whitespace and parses the field at index offset as an
// integer (base auto-detected by strconv.ParseInt, 64 bit).
//
// It returns an error if no line starts with prefix, if the matching line has
// no field at index offset, or if that field cannot be parsed as an integer.
//
// NOTE(review): matching is by string prefix and the first match wins, so a
// prefix such as "open" also selects a line that merely begins with "open"
// (e.g. "opencount ...") if it appears earlier in lines.
func getMetricData(lines []string, prefix string, offset int) (int64, error) {
	for _, line := range lines {
		if !strings.HasPrefix(line, prefix) {
			continue
		}
		fields := strings.Fields(line)
		// Guard the index: a truncated or unexpectedly formatted line must
		// yield an error the caller can handle, not an out-of-range panic.
		if offset < 0 || offset >= len(fields) {
			return 0, errors.New("field offset out of range for matching line")
		}
		return strconv.ParseInt(fields[offset], 0, 64)
	}
	return 0, errors.New("no such line in data")
}
// //Version reading the stats data of a device from sysfs // //Version reading the stats data of a device from sysfs
// func (m *LustreCollector) getDeviceDataSysfs(device string) []string { // func (m *LustreCollector) getDeviceDataSysfs(device string) []string {
// llitedir := filepath.Join(LUSTRE_SYSFS, "llite") // llitedir := filepath.Join(LUSTRE_SYSFS, "llite")
@ -90,6 +108,183 @@ func (m *LustreCollector) getDevices() []string {
// return strings.Split(string(buffer), "\n") // return strings.Split(string(buffer), "\n")
// } // }
// LustreAbsMetrics lists the metrics that are reported with calc "none",
// i.e. the value parsed from the stats line is sent unchanged.
//
// For every entry, lineprefix selects the stats line and lineoffset is the
// index of the whitespace-separated field on that line that holds the value
// (see getMetricData). An empty unit means no unit is attached.
var LustreAbsMetrics = []LustreMetricDefinition{
	// Request counters and byte counters share the same stats line and
	// differ only in the field that is read.
	{name: "lustre_read_requests", lineprefix: "read_bytes", lineoffset: 1, unit: "requests", calc: "none"},
	{name: "lustre_write_requests", lineprefix: "write_bytes", lineoffset: 1, unit: "requests", calc: "none"},
	{name: "lustre_read_bytes", lineprefix: "read_bytes", lineoffset: 6, unit: "bytes", calc: "none"},
	{name: "lustre_write_bytes", lineprefix: "write_bytes", lineoffset: 6, unit: "bytes", calc: "none"},
	// Unit-less operation counters.
	{name: "lustre_open", lineprefix: "open", lineoffset: 1, unit: "", calc: "none"},
	{name: "lustre_close", lineprefix: "close", lineoffset: 1, unit: "", calc: "none"},
	{name: "lustre_setattr", lineprefix: "setattr", lineoffset: 1, unit: "", calc: "none"},
	{name: "lustre_getattr", lineprefix: "getattr", lineoffset: 1, unit: "", calc: "none"},
	{name: "lustre_statfs", lineprefix: "statfs", lineoffset: 1, unit: "", calc: "none"},
	{name: "lustre_inode_permission", lineprefix: "inode_permission", lineoffset: 1, unit: "", calc: "none"},
}
// LustreDiffMetrics lists the metrics that are reported with calc
// "difference", i.e. as the change of the parsed value relative to the value
// stored for the same device and metric on the previous read.
//
// For every entry, lineprefix selects the stats line and lineoffset is the
// index of the whitespace-separated field on that line that holds the value
// (see getMetricData). An empty unit means no unit is attached.
var LustreDiffMetrics = []LustreMetricDefinition{
	// Request counters and byte counters share the same stats line and
	// differ only in the field that is read.
	{name: "lustre_read_requests_diff", lineprefix: "read_bytes", lineoffset: 1, unit: "requests", calc: "difference"},
	{name: "lustre_write_requests_diff", lineprefix: "write_bytes", lineoffset: 1, unit: "requests", calc: "difference"},
	{name: "lustre_read_bytes_diff", lineprefix: "read_bytes", lineoffset: 6, unit: "bytes", calc: "difference"},
	{name: "lustre_write_bytes_diff", lineprefix: "write_bytes", lineoffset: 6, unit: "bytes", calc: "difference"},
	// Unit-less operation counters.
	{name: "lustre_open_diff", lineprefix: "open", lineoffset: 1, unit: "", calc: "difference"},
	{name: "lustre_close_diff", lineprefix: "close", lineoffset: 1, unit: "", calc: "difference"},
	{name: "lustre_setattr_diff", lineprefix: "setattr", lineoffset: 1, unit: "", calc: "difference"},
	{name: "lustre_getattr_diff", lineprefix: "getattr", lineoffset: 1, unit: "", calc: "difference"},
	{name: "lustre_statfs_diff", lineprefix: "statfs", lineoffset: 1, unit: "", calc: "difference"},
	{name: "lustre_inode_permission_diff", lineprefix: "inode_permission", lineoffset: 1, unit: "", calc: "difference"},
}
// LustreDeriveMetrics lists the metrics that are reported with calc
// "derivative", i.e. as the change of the parsed value since the previous
// read divided by the elapsed time in seconds.
//
// For every entry, lineprefix selects the stats line and lineoffset is the
// index of the whitespace-separated field on that line that holds the value
// (see getMetricData).
var LustreDeriveMetrics = []LustreMetricDefinition{
	{name: "lustre_read_requests_rate", lineprefix: "read_bytes", lineoffset: 1, unit: "requests/sec", calc: "derivative"},
	{name: "lustre_write_requests_rate", lineprefix: "write_bytes", lineoffset: 1, unit: "requests/sec", calc: "derivative"},
	{name: "lustre_read_bw", lineprefix: "read_bytes", lineoffset: 6, unit: "bytes/sec", calc: "derivative"},
	{name: "lustre_write_bw", lineprefix: "write_bytes", lineoffset: 6, unit: "bytes/sec", calc: "derivative"},
}
func (m *LustreCollector) Init(config json.RawMessage) error { func (m *LustreCollector) Init(config json.RawMessage) error {
var err error var err error
m.name = "LustreCollector" m.name = "LustreCollector"
@ -102,17 +297,9 @@ func (m *LustreCollector) Init(config json.RawMessage) error {
m.setup() m.setup()
m.tags = map[string]string{"type": "node"} m.tags = map[string]string{"type": "node"}
m.meta = map[string]string{"source": m.name, "group": "Lustre"} m.meta = map[string]string{"source": m.name, "group": "Lustre"}
defmatches := map[string]map[string]int{
"read_bytes": {"lustre_read_bytes": 6, "lustre_read_requests": 1},
"write_bytes": {"lustre_write_bytes": 6, "lustre_write_requests": 1},
"open": {"lustre_open": 1},
"close": {"lustre_close": 1},
"setattr": {"lustre_setattr": 1},
"getattr": {"lustre_getattr": 1},
"statfs": {"lustre_statfs": 1},
"inode_permission": {"lustre_inode_permission": 1}}
// Lustre file system statistics can only be queried by user root // Lustre file system statistics can only be queried by user root
// or with password-less sudo
if !m.config.Sudo { if !m.config.Sudo {
user, err := user.Current() user, err := user.Current()
if err != nil { if err != nil {
@ -123,23 +310,15 @@ func (m *LustreCollector) Init(config json.RawMessage) error {
cclog.ComponentError(m.name, "Lustre file system statistics can only be queried by user root") cclog.ComponentError(m.name, "Lustre file system statistics can only be queried by user root")
return err return err
} }
} else {
p, err := exec.LookPath("sudo")
if err != nil {
cclog.ComponentError(m.name, "Cannot find 'sudo'")
return err
}
m.sudoCmd = p
} }
m.matches = make(map[string]map[string]int)
for lineprefix, names := range defmatches {
for metricname, offset := range names {
_, skip := stringArrayContains(m.config.ExcludeMetrics, metricname)
if skip {
continue
}
if _, prefixExist := m.matches[lineprefix]; !prefixExist {
m.matches[lineprefix] = make(map[string]int)
}
if _, metricExist := m.matches[lineprefix][metricname]; !metricExist {
m.matches[lineprefix][metricname] = offset
}
}
}
p, err := exec.LookPath(m.config.LCtlCommand) p, err := exec.LookPath(m.config.LCtlCommand)
if err != nil { if err != nil {
p, err = exec.LookPath(LCTL_CMD) p, err = exec.LookPath(LCTL_CMD)
@ -148,23 +327,47 @@ func (m *LustreCollector) Init(config json.RawMessage) error {
} }
} }
m.lctl = p m.lctl = p
if m.config.Sudo {
p, err := exec.LookPath("sudo") m.definitions = []LustreMetricDefinition{}
if err != nil { if m.config.SendAbsoluteValues {
m.sudoCmd = p for _, def := range LustreAbsMetrics {
if _, skip := stringArrayContains(m.config.ExcludeMetrics, def.name); !skip {
m.definitions = append(m.definitions, def)
} }
} }
}
if m.config.SendDiffValues {
for _, def := range LustreDiffMetrics {
if _, skip := stringArrayContains(m.config.ExcludeMetrics, def.name); !skip {
m.definitions = append(m.definitions, def)
}
}
}
if m.config.SendDerivedValues {
for _, def := range LustreDeriveMetrics {
if _, skip := stringArrayContains(m.config.ExcludeMetrics, def.name); !skip {
m.definitions = append(m.definitions, def)
}
}
}
if len(m.definitions) == 0 {
return errors.New("no metrics to collect")
}
devices := m.getDevices() devices := m.getDevices()
if len(devices) == 0 { if len(devices) == 0 {
return errors.New("no metrics to collect") return errors.New("no Lustre devices found")
} }
m.stats = make(map[string]map[string]int64) m.stats = make(map[string]map[string]int64)
for _, d := range devices { for _, d := range devices {
m.stats[d] = make(map[string]int64) m.stats[d] = make(map[string]int64)
for _, names := range m.matches { data := m.getDeviceDataCommand(d)
for metricname := range names { for _, def := range m.definitions {
m.stats[d][metricname] = 0 x, err := getMetricData(data, def.lineprefix, def.lineoffset)
if err == nil {
m.stats[d][def.name] = x
} else {
m.stats[d][def.name] = 0
} }
} }
} }
@ -180,63 +383,43 @@ func (m *LustreCollector) Read(interval time.Duration, output chan lp.CCMetric)
now := time.Now() now := time.Now()
tdiff := now.Sub(m.lastTimestamp) tdiff := now.Sub(m.lastTimestamp)
for device, devData := range m.stats { for device, devData := range m.stats {
stats := m.getDeviceDataCommand(device) data := m.getDeviceDataCommand(device)
processed := []string{} for _, def := range m.definitions {
var use_x int64
for _, line := range stats { var err error
lf := strings.Fields(line) var y lp.CCMetric
if len(lf) > 1 { x, err := getMetricData(data, def.lineprefix, def.lineoffset)
if fields, ok := m.matches[lf[0]]; ok {
for name, idx := range fields {
x, err := strconv.ParseInt(lf[idx], 0, 64)
if err == nil { if err == nil {
value := x - devData[name] use_x = x
devData[name] = x } else {
if value < 0 { use_x = devData[def.name]
}
var value interface{}
switch def.calc {
case "none":
value = use_x
y, err = lp.New(def.name, m.tags, m.meta, map[string]interface{}{"value": value}, time.Now())
case "difference":
value = use_x - devData[def.name]
if value.(int64) < 0 {
value = 0 value = 0
} }
if m.config.SendAbsoluteValues { y, err = lp.New(def.name, m.tags, m.meta, map[string]interface{}{"value": value}, time.Now())
y, err := lp.New(name, m.tags, m.meta, map[string]interface{}{"value": value}, time.Now()) case "derivative":
value = float64(use_x-devData[def.name]) / tdiff.Seconds()
if value.(float64) < 0 {
value = 0
}
y, err = lp.New(def.name, m.tags, m.meta, map[string]interface{}{"value": value}, time.Now())
}
if err == nil { if err == nil {
y.AddTag("device", device) y.AddTag("device", device)
if strings.Contains(name, "byte") { if len(def.unit) > 0 {
y.AddMeta("unit", "Byte") y.AddMeta("unit", def.unit)
}
output <- y
if m.config.SendAllMetrics {
processed = append(processed, name)
}
}
}
if m.config.SendDerivedValues && strings.Contains(name, "bytes") {
y, err := lp.New(name+"_bw", m.tags, m.meta, map[string]interface{}{"value": float64(value) / tdiff.Seconds()}, time.Now())
if err == nil {
y.AddTag("device", device)
y.AddMeta("unit", "Bytes/sec")
output <- y
if m.config.SendAllMetrics {
processed = append(processed, name)
}
}
}
}
}
}
}
}
if m.config.SendAllMetrics {
for name := range devData {
if _, done := stringArrayContains(processed, name); !done {
y, err := lp.New(name, m.tags, m.meta, map[string]interface{}{"value": 0}, time.Now())
if err == nil {
y.AddTag("device", device)
if strings.Contains(name, "byte") {
y.AddMeta("unit", "Byte")
} }
output <- y output <- y
} }
} devData[def.name] = use_x
}
} }
} }
m.lastTimestamp = now m.lastTimestamp = now

View File

@ -3,32 +3,44 @@
```json ```json
"lustrestat": { "lustrestat": {
"procfiles" : [ "lctl_command": "/path/to/lctl",
"/proc/fs/lustre/llite/lnec-XXXXXX/stats"
],
"exclude_metrics": [ "exclude_metrics": [
"setattr", "setattr",
"getattr" "getattr"
], ],
"send_abs_values" : true, "send_abs_values" : true,
"send_derived_values" : true "send_derived_values" : true,
"send_diff_values": true,
"use_sudo": false
} }
``` ```
The `lustrestat` collector reads from the procfs stat files for Lustre like `/proc/fs/lustre/llite/lnec-XXXXXX/stats`. The `lustrestat` collector uses the `lctl` application with the `get_param` option to get all `llite` metrics (Lustre client). The `llite` metrics are only available for root users. If password-less sudo is configured, you can enable `sudo` in the configuration.
Metrics: Metrics:
* `lustre_read_bytes` * `lustre_read_bytes` (unit `bytes`)
* `lustre_read_requests` * `lustre_read_requests` (unit `requests`)
* `lustre_write_bytes` * `lustre_write_bytes` (unit `bytes`)
* `lustre_write_requests` * `lustre_write_requests` (unit `requests`)
* `lustre_open` * `lustre_open`
* `lustre_close` * `lustre_close`
* `lustre_getattr` * `lustre_getattr`
* `lustre_setattr` * `lustre_setattr`
* `lustre_statfs` * `lustre_statfs`
* `lustre_inode_permission` * `lustre_inode_permission`
* `lustre_read_bytes_bw` (if `send_derived_values == true`) * `lustre_read_bw` (if `send_derived_values == true`, unit `bytes/sec`)
* `lustre_write_bytes_bw` (if `send_derived_values == true`) * `lustre_write_bw` (if `send_derived_values == true`, unit `bytes/sec`)
* `lustre_read_requests_rate` (if `send_derived_values == true`, unit `requests/sec`)
* `lustre_write_requests_rate` (if `send_derived_values == true`, unit `requests/sec`)
* `lustre_read_bytes_diff` (if `send_diff_values == true`, unit `bytes`)
* `lustre_read_requests_diff` (if `send_diff_values == true`, unit `requests`)
* `lustre_write_bytes_diff` (if `send_diff_values == true`, unit `bytes`)
* `lustre_write_requests_diff` (if `send_diff_values == true`, unit `requests`)
* `lustre_open_diff` (if `send_diff_values == true`)
* `lustre_close_diff` (if `send_diff_values == true`)
* `lustre_getattr_diff` (if `send_diff_values == true`)
* `lustre_setattr_diff` (if `send_diff_values == true`)
* `lustre_statfs_diff` (if `send_diff_values == true`)
* `lustre_inode_permission_diff` (if `send_diff_values == true`)
This collector adds a `device` tag. This collector adds a `device` tag.