From aa1afd745e2d4c5da050a2444dcd579a0e9ba661 Mon Sep 17 00:00:00 2001 From: Thomas Gruber Date: Tue, 15 Mar 2022 16:09:47 +0100 Subject: [PATCH] Derived metrics (#65) * Add time-based derivatived (e.g. bandwidth) to some collectors * Add documentation * Add comments * Fix: Only compute rates with a valid previous state * Only compute rates with a valid previous state * Define const values for net/dev fields * Set default config values * Add comments * Refactor: Consolidate data structures * Refactor: Consolidate data structures * Refactor: Avoid struct deep copy * Refactor: Avoid redundant tag maps * Refactor: Use int64 type for absolut values * Update LustreCollector Co-authored-by: Holger Obermaier <40787752+ho-ob@users.noreply.github.com> --- collectors/lustreMetric.go | 373 +++++++++++++++++++++++++++---------- collectors/lustreMetric.md | 34 ++-- 2 files changed, 301 insertions(+), 106 deletions(-) diff --git a/collectors/lustreMetric.go b/collectors/lustreMetric.go index 67efd7a..d5a96e4 100644 --- a/collectors/lustreMetric.go +++ b/collectors/lustreMetric.go @@ -19,23 +19,31 @@ const LCTL_CMD = `lctl` const LCTL_OPTION = `get_param` type LustreCollectorConfig struct { - LCtlCommand string `json:"lctl_command"` - ExcludeMetrics []string `json:"exclude_metrics"` - SendAllMetrics bool `json:"send_all_metrics"` - Sudo bool `json:"use_sudo"` - SendAbsoluteValues bool `json:"send_abs_values"` - SendDerivedValues bool `json:"send_derived_values"` + LCtlCommand string `json:"lctl_command,omitempty"` + ExcludeMetrics []string `json:"exclude_metrics,omitempty"` + Sudo bool `json:"use_sudo,omitempty"` + SendAbsoluteValues bool `json:"send_abs_values,omitempty"` + SendDerivedValues bool `json:"send_derived_values,omitempty"` + SendDiffValues bool `json:"send_diff_values,omitempty"` +} + +type LustreMetricDefinition struct { + name string + lineprefix string + lineoffset int + unit string + calc string } type LustreCollector struct { metricCollector tags map[string]string - matches map[string]map[string]int - stats map[string]map[string]int64 config LustreCollectorConfig lctl string sudoCmd string - lastTimestamp time.Time // Store time stamp of last tick to derive bandwidths + lastTimestamp time.Time // Store time stamp of last tick to derive bandwidths + definitions []LustreMetricDefinition // Combined list without excluded metrics + stats map[string]map[string]int64 // Data for last value per device and metric } func (m *LustreCollector) getDeviceDataCommand(device string) []string { @@ -78,6 +86,16 @@ func (m *LustreCollector) getDevices() []string { return devices } +func getMetricData(lines []string, prefix string, offset int) (int64, error) { + for _, line := range lines { + if strings.HasPrefix(line, prefix) { + lf := strings.Fields(line) + return strconv.ParseInt(lf[offset], 0, 64) + } + } + return 0, errors.New("no such line in data") +} + // //Version reading the stats data of a device from sysfs // func (m *LustreCollector) getDeviceDataSysfs(device string) []string { // llitedir := filepath.Join(LUSTRE_SYSFS, "llite") @@ -90,6 +108,183 @@ func (m *LustreCollector) getDevices() []string { // return strings.Split(string(buffer), "\n") // } +var LustreAbsMetrics = []LustreMetricDefinition{ + { + name: "lustre_read_requests", + lineprefix: "read_bytes", + lineoffset: 1, + unit: "requests", + calc: "none", + }, + { + name: "lustre_write_requests", + lineprefix: "write_bytes", + lineoffset: 1, + unit: "requests", + calc: "none", + }, + { + name: "lustre_read_bytes", + lineprefix: "read_bytes", + lineoffset: 6, + unit: "bytes", + calc: "none", + }, + { + name: "lustre_write_bytes", + lineprefix: "write_bytes", + lineoffset: 6, + unit: "bytes", + calc: "none", + }, + { + name: "lustre_open", + lineprefix: "open", + lineoffset: 1, + unit: "", + calc: "none", + }, + { + name: "lustre_close", + lineprefix: "close", + lineoffset: 1, + unit: "", + calc: "none", + }, + { + name: "lustre_setattr", + lineprefix: "setattr", + lineoffset: 1, + unit: "", + calc: "none", + }, + { + name: "lustre_getattr", + lineprefix: "getattr", + lineoffset: 1, + unit: "", + calc: "none", + }, + { + name: "lustre_statfs", + lineprefix: "statfs", + lineoffset: 1, + unit: "", + calc: "none", + }, + { + name: "lustre_inode_permission", + lineprefix: "inode_permission", + lineoffset: 1, + unit: "", + calc: "none", + }, +} + +var LustreDiffMetrics = []LustreMetricDefinition{ + { + name: "lustre_read_requests_diff", + lineprefix: "read_bytes", + lineoffset: 1, + unit: "requests", + calc: "difference", + }, + { + name: "lustre_write_requests_diff", + lineprefix: "write_bytes", + lineoffset: 1, + unit: "requests", + calc: "difference", + }, + { + name: "lustre_read_bytes_diff", + lineprefix: "read_bytes", + lineoffset: 6, + unit: "bytes", + calc: "difference", + }, + { + name: "lustre_write_bytes_diff", + lineprefix: "write_bytes", + lineoffset: 6, + unit: "bytes", + calc: "difference", + }, + { + name: "lustre_open_diff", + lineprefix: "open", + lineoffset: 1, + unit: "", + calc: "difference", + }, + { + name: "lustre_close_diff", + lineprefix: "close", + lineoffset: 1, + unit: "", + calc: "difference", + }, + { + name: "lustre_setattr_diff", + lineprefix: "setattr", + lineoffset: 1, + unit: "", + calc: "difference", + }, + { + name: "lustre_getattr_diff", + lineprefix: "getattr", + lineoffset: 1, + unit: "", + calc: "difference", + }, + { + name: "lustre_statfs_diff", + lineprefix: "statfs", + lineoffset: 1, + unit: "", + calc: "difference", + }, + { + name: "lustre_inode_permission_diff", + lineprefix: "inode_permission", + lineoffset: 1, + unit: "", + calc: "difference", + }, +} + +var LustreDeriveMetrics = []LustreMetricDefinition{ + { + name: "lustre_read_requests_rate", + lineprefix: "read_bytes", + lineoffset: 1, + unit: "requests/sec", + calc: "derivative", + }, + { + name: "lustre_write_requests_rate", + lineprefix: "write_bytes", + lineoffset: 1, + unit: "requests/sec", + calc: "derivative", + }, + { + name: "lustre_read_bw", + lineprefix: "read_bytes", + lineoffset: 6, + unit: "bytes/sec", + calc: "derivative", + }, + { + name: "lustre_write_bw", + lineprefix: "write_bytes", + lineoffset: 6, + unit: "bytes/sec", + calc: "derivative", + }, +} + func (m *LustreCollector) Init(config json.RawMessage) error { var err error m.name = "LustreCollector" @@ -102,17 +297,9 @@ func (m *LustreCollector) Init(config json.RawMessage) error { m.setup() m.tags = map[string]string{"type": "node"} m.meta = map[string]string{"source": m.name, "group": "Lustre"} - defmatches := map[string]map[string]int{ - "read_bytes": {"lustre_read_bytes": 6, "lustre_read_requests": 1}, - "write_bytes": {"lustre_write_bytes": 6, "lustre_write_requests": 1}, - "open": {"lustre_open": 1}, - "close": {"lustre_close": 1}, - "setattr": {"lustre_setattr": 1}, - "getattr": {"lustre_getattr": 1}, - "statfs": {"lustre_statfs": 1}, - "inode_permission": {"lustre_inode_permission": 1}} // Lustre file system statistics can only be queried by user root + // or with password-less sudo if !m.config.Sudo { user, err := user.Current() if err != nil { @@ -123,23 +310,15 @@ func (m *LustreCollector) Init(config json.RawMessage) error { cclog.ComponentError(m.name, "Lustre file system statistics can only be queried by user root") return err } + } else { + p, err := exec.LookPath("sudo") + if err != nil { + cclog.ComponentError(m.name, "Cannot find 'sudo'") + return err + } + m.sudoCmd = p } - m.matches = make(map[string]map[string]int) - for lineprefix, names := range defmatches { - for metricname, offset := range names { - _, skip := stringArrayContains(m.config.ExcludeMetrics, metricname) - if skip { - continue - } - if _, prefixExist := m.matches[lineprefix]; !prefixExist { - m.matches[lineprefix] = make(map[string]int) - } - if _, metricExist := m.matches[lineprefix][metricname]; !metricExist { - m.matches[lineprefix][metricname] = offset - } - } - } p, err := exec.LookPath(m.config.LCtlCommand) if err != nil { p, err = exec.LookPath(LCTL_CMD) @@ -148,23 +327,47 @@ func (m *LustreCollector) Init(config json.RawMessage) error { } } m.lctl = p - if m.config.Sudo { - p, err := exec.LookPath("sudo") - if err != nil { - m.sudoCmd = p + + m.definitions = []LustreMetricDefinition{} + if m.config.SendAbsoluteValues { + for _, def := range LustreAbsMetrics { + if _, skip := stringArrayContains(m.config.ExcludeMetrics, def.name); !skip { + m.definitions = append(m.definitions, def) + } } } + if m.config.SendDiffValues { + for _, def := range LustreDiffMetrics { + if _, skip := stringArrayContains(m.config.ExcludeMetrics, def.name); !skip { + m.definitions = append(m.definitions, def) + } + } + } + if m.config.SendDerivedValues { + for _, def := range LustreDeriveMetrics { + if _, skip := stringArrayContains(m.config.ExcludeMetrics, def.name); !skip { + m.definitions = append(m.definitions, def) + } + } + } + if len(m.definitions) == 0 { + return errors.New("no metrics to collect") + } devices := m.getDevices() if len(devices) == 0 { - return errors.New("no metrics to collect") + return errors.New("no Lustre devices found") } m.stats = make(map[string]map[string]int64) for _, d := range devices { m.stats[d] = make(map[string]int64) - for _, names := range m.matches { - for metricname := range names { - m.stats[d][metricname] = 0 + data := m.getDeviceDataCommand(d) + for _, def := range m.definitions { + x, err := getMetricData(data, def.lineprefix, def.lineoffset) + if err == nil { + m.stats[d][def.name] = x + } else { + m.stats[d][def.name] = 0 } } } @@ -180,63 +383,43 @@ func (m *LustreCollector) Read(interval time.Duration, output chan lp.CCMetric) now := time.Now() tdiff := now.Sub(m.lastTimestamp) for device, devData := range m.stats { - stats := m.getDeviceDataCommand(device) - processed := []string{} - - for _, line := range stats { - lf := strings.Fields(line) - if len(lf) > 1 { - if fields, ok := m.matches[lf[0]]; ok { - for name, idx := range fields { - x, err := strconv.ParseInt(lf[idx], 0, 64) - if err == nil { - value := x - devData[name] - devData[name] = x - if value < 0 { - value = 0 - } - if m.config.SendAbsoluteValues { - y, err := lp.New(name, m.tags, m.meta, map[string]interface{}{"value": value}, time.Now()) - if err == nil { - y.AddTag("device", device) - if strings.Contains(name, "byte") { - y.AddMeta("unit", "Byte") - } - output <- y - if m.config.SendAllMetrics { - processed = append(processed, name) - } - } - } - if m.config.SendDerivedValues && strings.Contains(name, "bytes") { - y, err := lp.New(name+"_bw", m.tags, m.meta, map[string]interface{}{"value": float64(value) / tdiff.Seconds()}, time.Now()) - if err == nil { - y.AddTag("device", device) - y.AddMeta("unit", "Bytes/sec") - output <- y - if m.config.SendAllMetrics { - processed = append(processed, name) - } - } - } - } - } - } + data := m.getDeviceDataCommand(device) + for _, def := range m.definitions { + var use_x int64 + var err error + var y lp.CCMetric + x, err := getMetricData(data, def.lineprefix, def.lineoffset) + if err == nil { + use_x = x + } else { + use_x = devData[def.name] } - } - if m.config.SendAllMetrics { - for name := range devData { - if _, done := stringArrayContains(processed, name); !done { - y, err := lp.New(name, m.tags, m.meta, map[string]interface{}{"value": 0}, time.Now()) - if err == nil { - y.AddTag("device", device) - if strings.Contains(name, "byte") { - y.AddMeta("unit", "Byte") - } - output <- y - } + var value interface{} + switch def.calc { + case "none": + value = use_x + y, err = lp.New(def.name, m.tags, m.meta, map[string]interface{}{"value": value}, time.Now()) + case "difference": + value = use_x - devData[def.name] + if value.(int64) < 0 { + value = 0 } + y, err = lp.New(def.name, m.tags, m.meta, map[string]interface{}{"value": value}, time.Now()) + case "derivative": + value = float64(use_x-devData[def.name]) / tdiff.Seconds() + if value.(float64) < 0 { + value = 0 + } + y, err = lp.New(def.name, m.tags, m.meta, map[string]interface{}{"value": value}, time.Now()) } + if err == nil { + y.AddTag("device", device) + if len(def.unit) > 0 { + y.AddMeta("unit", def.unit) + } + output <- y + } + devData[def.name] = use_x } } m.lastTimestamp = now diff --git a/collectors/lustreMetric.md b/collectors/lustreMetric.md index de4ed60..f11b85f 100644 --- a/collectors/lustreMetric.md +++ b/collectors/lustreMetric.md @@ -3,32 +3,44 @@ ```json "lustrestat": { - "procfiles" : [ - "/proc/fs/lustre/llite/lnec-XXXXXX/stats" - ], + "lctl_command": "/path/to/lctl", "exclude_metrics": [ "setattr", "getattr" ], "send_abs_values" : true, - "send_derived_values" : true + "send_derived_values" : true, + "send_diff_values": true, + "use_sudo": false } ``` -The `lustrestat` collector reads from the procfs stat files for Lustre like `/proc/fs/lustre/llite/lnec-XXXXXX/stats`. +The `lustrestat` collector uses the `lctl` application with the `get_param` option to get all `llite` metrics (Lustre client). The `llite` metrics are only available for root users. If password-less sudo is configured, you can enable `sudo` in the configuration. Metrics: -* `lustre_read_bytes` -* `lustre_read_requests` -* `lustre_write_bytes` -* `lustre_write_requests` +* `lustre_read_bytes` (unit `bytes`) +* `lustre_read_requests` (unit `requests`) +* `lustre_write_bytes` (unit `bytes`) +* `lustre_write_requests` (unit `requests`) * `lustre_open` * `lustre_close` * `lustre_getattr` * `lustre_setattr` * `lustre_statfs` * `lustre_inode_permission` -* `lustre_read_bytes_bw` (if `send_derived_values == true`) -* `lustre_write_bytes_bw` (if `send_derived_values == true`) +* `lustre_read_bw` (if `send_derived_values == true`, unit `bytes/sec`) +* `lustre_write_bw` (if `send_derived_values == true`, unit `bytes/sec`) +* `lustre_read_requests_rate` (if `send_derived_values == true`, unit `requests/sec`) +* `lustre_write_requests_rate` (if `send_derived_values == true`, unit `requests/sec`) +* `lustre_read_bytes_diff` (if `send_diff_values == true`, unit `bytes`) +* `lustre_read_requests_diff` (if `send_diff_values == true`, unit `requests`) +* `lustre_write_bytes_diff` (if `send_diff_values == true`, unit `bytes`) +* `lustre_write_requests_diff` (if `send_diff_values == true`, unit `requests`) +* `lustre_open_diff` (if `send_diff_values == true`) +* `lustre_close_diff` (if `send_diff_values == true`) +* `lustre_getattr_diff` (if `send_diff_values == true`) +* `lustre_setattr_diff` (if `send_diff_values == true`) +* `lustre_statfs_diff` (if `send_diff_values == true`) +* `lustre_inode_permission_diff` (if `send_diff_values == true`) This collector adds an `device` tag. \ No newline at end of file