add only_metrics. docs: clarify usage of filtering, consistency for metric list and units

This commit is contained in:
brinkcoder 2025-03-05 01:11:59 +01:00
parent 2d792684ff
commit 04fa267d9d
2 changed files with 84 additions and 67 deletions

View File

@ -10,8 +10,8 @@ import (
"strings"
"time"
lp "github.com/ClusterCockpit/cc-lib/ccMessage"
cclog "github.com/ClusterCockpit/cc-metric-collector/pkg/ccLogger"
lp "github.com/ClusterCockpit/cc-energy-manager/pkg/cc-message"
)
const LUSTRE_SYSFS = `/sys/fs/lustre`
@ -21,6 +21,7 @@ const LCTL_OPTION = `get_param`
type LustreCollectorConfig struct {
LCtlCommand string `json:"lctl_command,omitempty"`
ExcludeMetrics []string `json:"exclude_metrics,omitempty"`
OnlyMetrics []string `json:"only_metrics,omitempty"`
Sudo bool `json:"use_sudo,omitempty"`
SendAbsoluteValues bool `json:"send_abs_values,omitempty"`
SendDerivedValues bool `json:"send_derived_values,omitempty"`
@ -41,9 +42,26 @@ type LustreCollector struct {
config LustreCollectorConfig
lctl string
sudoCmd string
lastTimestamp time.Time // Store time stamp of last tick to derive bandwidths
lastTimestamp time.Time // Timestamp of last tick for diff/derivative calculations
definitions []LustreMetricDefinition // Combined list without excluded metrics
stats map[string]map[string]int64 // Data for last value per device and metric
stats map[string]map[string]int64 // Last measurement per device and metric
}
// shouldOutput reports whether the metric called metricName passes the
// configured name filters.
//
// A non-empty OnlyMetrics list acts as an allow-list and is the only filter
// consulted: the metric passes exactly when it is listed there. When
// OnlyMetrics is empty, every metric passes unless it appears in
// ExcludeMetrics.
func (m *LustreCollector) shouldOutput(metricName string) bool {
	// listed reports whether metricName occurs in names (exact match).
	listed := func(names []string) bool {
		for i := range names {
			if names[i] == metricName {
				return true
			}
		}
		return false
	}

	if allow := m.config.OnlyMetrics; len(allow) > 0 {
		return listed(allow)
	}
	return !listed(m.config.ExcludeMetrics)
}
func (m *LustreCollector) getDeviceDataCommand(device string) []string {
@ -61,20 +79,7 @@ func (m *LustreCollector) getDeviceDataCommand(device string) []string {
func (m *LustreCollector) getDevices() []string {
devices := make([]string, 0)
// //Version reading devices from sysfs
// globPattern := filepath.Join(LUSTRE_SYSFS, "llite/*/stats")
// files, err := filepath.Glob(globPattern)
// if err != nil {
// return devices
// }
// for _, f := range files {
// pathlist := strings.Split(f, "/")
// devices = append(devices, pathlist[4])
// }
data := m.getDeviceDataCommand("*")
for _, line := range data {
if strings.HasPrefix(line, "llite") {
linefields := strings.Split(line, ".")
@ -96,18 +101,6 @@ func getMetricData(lines []string, prefix string, offset int) (int64, error) {
return 0, errors.New("no such line in data")
}
// //Version reading the stats data of a device from sysfs
// func (m *LustreCollector) getDeviceDataSysfs(device string) []string {
// llitedir := filepath.Join(LUSTRE_SYSFS, "llite")
// devdir := filepath.Join(llitedir, device)
// statsfile := filepath.Join(devdir, "stats")
// buffer, err := os.ReadFile(statsfile)
// if err != nil {
// return make([]string, 0)
// }
// return strings.Split(string(buffer), "\n")
// }
var LustreAbsMetrics = []LustreMetricDefinition{
{
name: "lustre_read_requests",
@ -308,7 +301,7 @@ func (m *LustreCollector) Init(config json.RawMessage) error {
return err
}
if user.Uid != "0" {
cclog.ComponentError(m.name, "Lustre file system statistics can only be queried by user root")
cclog.ComponentError(m.name, "Lustre statistics can only be queried by root")
return err
}
} else {
@ -332,23 +325,26 @@ func (m *LustreCollector) Init(config json.RawMessage) error {
m.definitions = []LustreMetricDefinition{}
if m.config.SendAbsoluteValues {
for _, def := range LustreAbsMetrics {
if _, skip := stringArrayContains(m.config.ExcludeMetrics, def.name); !skip {
m.definitions = append(m.definitions, def)
if !m.shouldOutput(def.name) {
continue
}
m.definitions = append(m.definitions, def)
}
}
if m.config.SendDiffValues {
for _, def := range LustreDiffMetrics {
if _, skip := stringArrayContains(m.config.ExcludeMetrics, def.name); !skip {
m.definitions = append(m.definitions, def)
if !m.shouldOutput(def.name) {
continue
}
m.definitions = append(m.definitions, def)
}
}
if m.config.SendDerivedValues {
for _, def := range LustreDeriveMetrics {
if _, skip := stringArrayContains(m.config.ExcludeMetrics, def.name); !skip {
m.definitions = append(m.definitions, def)
if !m.shouldOutput(def.name) {
continue
}
m.definitions = append(m.definitions, def)
}
}
if len(m.definitions) == 0 {
@ -418,7 +414,9 @@ func (m *LustreCollector) Read(interval time.Duration, output chan lp.CCMessage)
if len(def.unit) > 0 {
y.AddMeta("unit", def.unit)
}
output <- y
if m.shouldOutput(y.Name()) {
output <- y
}
}
devData[def.name] = use_x
}

View File

@ -1,46 +1,65 @@
## `lustrestat` collector
```json
"lustrestat": {
"lctl_command": "/path/to/lctl",
"exclude_metrics": [
"setattr",
"getattr"
"lustre_setattr",
"lustre_getattr"
],
"send_abs_values" : true,
"send_derived_values" : true,
"only_metrics": [
"lustre_read_bytes",
"lustre_read_bytes_diff",
"lustre_read_bw",
"lustre_open",
"lustre_open_diff"
],
"send_abs_values": true,
"send_diff_values": true,
"send_derived_values": true,
"use_sudo": false
}
```
The `lustrestat` collector uses the `lctl` application with the `get_param` option to get all `llite` metrics (Lustre client). The `llite` metrics are only available to the root user. If password-less sudo is configured, you can set `use_sudo` to `true` in the configuration to run `lctl` through `sudo`.
Metrics:
* `lustre_read_bytes` (unit `bytes`)
* `lustre_read_requests` (unit `requests`)
* `lustre_write_bytes` (unit `bytes`)
* `lustre_write_requests` (unit `requests`)
* `lustre_open`
* `lustre_close`
* `lustre_getattr`
* `lustre_setattr`
* `lustre_statfs`
* `lustre_inode_permission`
* `lustre_read_bw` (if `send_derived_values == true`, unit `bytes/sec`)
* `lustre_write_bw` (if `send_derived_values == true`, unit `bytes/sec`)
* `lustre_read_requests_rate` (if `send_derived_values == true`, unit `requests/sec`)
* `lustre_write_requests_rate` (if `send_derived_values == true`, unit `requests/sec`)
* `lustre_read_bytes_diff` (if `send_diff_values == true`, unit `bytes`)
* `lustre_read_requests_diff` (if `send_diff_values == true`, unit `requests`)
* `lustre_write_bytes_diff` (if `send_diff_values == true`, unit `bytes`)
* `lustre_write_requests_diff` (if `send_diff_values == true`, unit `requests`)
* `lustre_open_diff` (if `send_diff_values == true`)
* `lustre_close_diff` (if `send_diff_values == true`)
* `lustre_getattr_diff` (if `send_diff_values == true`)
* `lustre_setattr_diff` (if `send_diff_values == true`)
* `lustre_statfs_diff` (if `send_diff_values == true`)
* `lustre_inode_permission_diff` (if `send_diff_values == true`)
At least one of the settings for absolute, diff, and derived values must be set to true.
This collector adds an `device` tag.
Two filtering mechanisms are supported:
- `exclude_metrics`: Excludes the specified metrics.
- `only_metrics`: If provided and non-empty, only the listed metrics are collected. This takes precedence over `exclude_metrics`, which is then ignored.
Metrics are categorized as follows:
**Absolute Metrics:**
- `lustre_read_bytes` (unit: `bytes`)
- `lustre_read_requests` (unit: `requests`)
- `lustre_write_bytes` (unit: `bytes`)
- `lustre_write_requests` (unit: `requests`)
- `lustre_open`
- `lustre_close`
- `lustre_getattr`
- `lustre_setattr`
- `lustre_statfs`
- `lustre_inode_permission`
**Diff Metrics:**
- `lustre_read_bytes_diff` (unit: `bytes`)
- `lustre_read_requests_diff` (unit: `requests`)
- `lustre_write_bytes_diff` (unit: `bytes`)
- `lustre_write_requests_diff` (unit: `requests`)
- `lustre_open_diff`
- `lustre_close_diff`
- `lustre_getattr_diff`
- `lustre_setattr_diff`
- `lustre_statfs_diff`
- `lustre_inode_permission_diff`
**Derived Metrics:**
- `lustre_read_bw` (unit: `bytes/sec`)
- `lustre_write_bw` (unit: `bytes/sec`)
- `lustre_read_requests_rate` (unit: `requests/sec`)
- `lustre_write_requests_rate` (unit: `requests/sec`)
This collector adds a `device` tag.