From 032d0c61fedf5d08df57e888793f976da9186078 Mon Sep 17 00:00:00 2001
From: Thomas Roehl
Date: Tue, 16 May 2023 13:34:13 +0200
Subject: [PATCH] Add an collector to read SNMP endpoints

---
 collectors/collectorManager.go |   1 +
 collectors/snmpMetric.go       | 238 +++++++++++++++++++++++++++++++++
 collectors/snmpMetric.md       |  43 ++++++
 3 files changed, 282 insertions(+)
 create mode 100644 collectors/snmpMetric.go
 create mode 100644 collectors/snmpMetric.md

diff --git a/collectors/collectorManager.go b/collectors/collectorManager.go
index 11b501a..7d4fae0 100644
--- a/collectors/collectorManager.go
+++ b/collectors/collectorManager.go
@@ -40,6 +40,7 @@ var AvailableCollectors = map[string]MetricCollector{
 	"rocm_smi": new(RocmSmiCollector),
 	"self": new(SelfCollector),
 	"schedstat": new(SchedstatCollector),
+	"snmpstat": new(SNMPCollector),
 }
 
 // Metric collector manager data structure
diff --git a/collectors/snmpMetric.go b/collectors/snmpMetric.go
new file mode 100644
index 0000000..2496158
--- /dev/null
+++ b/collectors/snmpMetric.go
@@ -0,0 +1,238 @@
+package collectors
+
+import (
+	"encoding/json"
+	"fmt"
+	"regexp"
+	"strings"
+	"time"
+
+	cclog "github.com/ClusterCockpit/cc-metric-collector/pkg/ccLogger"
+	lp "github.com/ClusterCockpit/cc-metric-collector/pkg/ccMetric"
+	"github.com/gosnmp/gosnmp"
+)
+
+// SNMPCollectorTargetConfig describes one SNMP endpoint that is queried
+type SNMPCollectorTargetConfig struct {
+	Hostname  string `json:"hostname"`
+	Port      int    `json:"port,omitempty"`
+	Community string `json:"community"`
+	Timeout   int    `json:"timeout"` // timeout in seconds
+}
+
+// SNMPCollectorMetricConfig describes one metric that is read from each target.
+// Value has to be an OID, Name and Unit are either an OID or a fixed string.
+type SNMPCollectorMetricConfig struct {
+	Name  string `json:"name"`
+	Value string `json:"value"`
+	Unit  string `json:"unit,omitempty"`
+}
+
+// These are the fields we read from the JSON configuration
+type SNMPCollectorConfig struct {
+	Targets []SNMPCollectorTargetConfig `json:"targets"`
+	Metrics []SNMPCollectorMetricConfig `json:"metrics"`
+}
+
+// This contains all variables we need during execution and the variables
+// defined by metricCollector (name, init, ...)
+type SNMPCollector struct {
+	metricCollector
+	config SNMPCollectorConfig // the configuration structure
+	meta   map[string]string   // default meta information
+	tags   map[string]string   // default tags
+}
+
+// oidRegex matches numeric OIDs in dotted notation. It is compiled only once
+// because validOid is called for every metric in every Read() call.
+var oidRegex = regexp.MustCompile(`^[012]\.(?:[0-9]|[1-3][0-9])(\.\d+)*$`)
+
+// validOid reports whether oid is a numeric OID in dotted notation
+func validOid(oid string) bool {
+	return oidRegex.MatchString(oid)
+}
+
+// pduString returns the value of pdu as string. gosnmp delivers OctetString
+// values as []byte, so a plain type assertion to string would panic for them.
+// The second return value is false if the PDU carries no textual value.
+func pduString(pdu gosnmp.SnmpPDU) (string, bool) {
+	switch v := pdu.Value.(type) {
+	case string:
+		return v, true
+	case []byte:
+		return string(v), true
+	default:
+		return "", false
+	}
+}
+
+// Init initializes the snmp collector
+// Called once by the collector manager
+// All tags, meta data tags and metrics that do not change over the runtime should be set here
+func (m *SNMPCollector) Init(config json.RawMessage) error {
+	// Always set the name early in Init() to use it in cclog.Component* functions
+	m.name = "SNMPCollector"
+	// This is for later use, also call it early
+	m.setup()
+	// Tell whether the collector should be run in parallel with others (reading files, ...)
+	// or it should be run serially, mostly for collectors actually doing measurements
+	// because they should not measure the execution of the other collectors
+	m.parallel = true
+	// Define meta information sent with each metric
+	m.meta = map[string]string{"source": m.name, "group": "SNMP"}
+	// Define tags sent with each metric
+	// The 'type' tag is always needed, it defines the granularity of the metric
+	m.tags = map[string]string{"type": "node"}
+	// Read in the JSON configuration
+	if len(config) > 0 {
+		if err := json.Unmarshal(config, &m.config); err != nil {
+			cclog.ComponentError(m.name, "Error reading config:", err.Error())
+			return err
+		}
+	}
+
+	if len(m.config.Targets) == 0 {
+		err := fmt.Errorf("no targets configured, exiting")
+		cclog.ComponentError(m.name, err.Error())
+		return err
+	}
+
+	// Reject targets that can never be queried instead of failing in each Read()
+	for _, target := range m.config.Targets {
+		if len(target.Hostname) == 0 {
+			err := fmt.Errorf("target without hostname configured, exiting")
+			cclog.ComponentError(m.name, err.Error())
+			return err
+		}
+		// The port is converted to uint16 in Read(), larger values would wrap around
+		if target.Port < 0 || target.Port > 65535 {
+			err := fmt.Errorf("invalid port %d for target %s, exiting", target.Port, target.Hostname)
+			cclog.ComponentError(m.name, err.Error())
+			return err
+		}
+	}
+
+	if len(m.config.Metrics) == 0 {
+		err := fmt.Errorf("no metrics configured, exiting")
+		cclog.ComponentError(m.name, err.Error())
+		return err
+	}
+
+	// Set this flag only if everything is initialized properly, all required files exist, ...
+	m.init = true
+	return nil
+}
+
+// Read collects all metrics belonging to the snmp collector
+// and sends them through the output channel to the collector manager
+func (m *SNMPCollector) Read(interval time.Duration, output chan lp.CCMetric) {
+	if !m.init {
+		return
+	}
+	// Use the same timestamp for all metrics of this read cycle
+	timestamp := time.Now()
+
+	for _, target := range m.config.Targets {
+		m.readTarget(target, timestamp, output)
+	}
+}
+
+// readTarget queries all configured metrics from a single target and sends
+// them through the output channel
+func (m *SNMPCollector) readTarget(target SNMPCollectorTargetConfig, timestamp time.Time, output chan lp.CCMetric) {
+	port := uint16(161)
+	if target.Port > 0 {
+		port = uint16(target.Port)
+	}
+	comm := "public"
+	if len(target.Community) > 0 {
+		comm = target.Community
+	}
+	timeout := 1
+	if target.Timeout > 0 {
+		timeout = target.Timeout
+	}
+	params := &gosnmp.GoSNMP{
+		Target:    target.Hostname,
+		Port:      port,
+		Community: comm,
+		Version:   gosnmp.Version2c,
+		Timeout:   time.Duration(timeout) * time.Second,
+	}
+	if err := params.Connect(); err != nil {
+		cclog.ComponentError(m.name, "failed to connect to", target.Hostname, ":", err.Error())
+		return
+	}
+	defer params.Conn.Close()
+
+	for _, metric := range m.config.Metrics {
+		if !validOid(metric.Value) {
+			continue
+		}
+		// Collect the OIDs to query and remember the position of each one in the
+		// request. The position depends on which of name and unit are OIDs.
+		oids := make([]string, 0, 3)
+		nameidx, unitidx := -1, -1
+		if validOid(metric.Name) {
+			nameidx = len(oids)
+			oids = append(oids, metric.Name)
+		}
+		valueidx := len(oids)
+		oids = append(oids, metric.Value)
+		if validOid(metric.Unit) {
+			unitidx = len(oids)
+			oids = append(oids, metric.Unit)
+		}
+		// Query the connection of this target, not the unconnected gosnmp.Default
+		result, err := params.Get(oids)
+		if err != nil {
+			cclog.ComponentError(m.name, "failed to get data for OIDs", strings.Join(oids, ","), ":", err.Error())
+			continue
+		}
+		if len(result.Variables) < len(oids) {
+			cclog.ComponentError(m.name, "incomplete response for OIDs", strings.Join(oids, ","))
+			continue
+		}
+
+		name := metric.Name
+		if nameidx >= 0 {
+			s, ok := pduString(result.Variables[nameidx])
+			if !ok || len(s) == 0 {
+				cclog.ComponentError(m.name, "no metric name at OID", metric.Name)
+				continue
+			}
+			name = s
+		}
+		value := result.Variables[valueidx].Value
+		if value == nil {
+			// The response carries no data for this OID, nothing to send
+			continue
+		}
+		if b, ok := value.([]byte); ok {
+			value = string(b)
+		}
+		unit := metric.Unit
+		if unitidx >= 0 {
+			// Send the metric without unit if the target does not provide a textual one
+			unit, _ = pduString(result.Variables[unitidx])
+		}
+
+		y, err := lp.New(name, m.tags, m.meta, map[string]interface{}{"value": value}, timestamp)
+		if err != nil {
+			cclog.ComponentError(m.name, "failed to create metric", name, ":", err.Error())
+			continue
+		}
+		if len(unit) > 0 {
+			y.AddMeta("unit", unit)
+		}
+		// Send it to output channel
+		output <- y
+	}
+}
+
+// Close metric collector: close network connection, close files, close libraries, ...
+// Called once by the collector manager
+func (m *SNMPCollector) Close() {
+	// Unset flag
+	m.init = false
+}
diff --git a/collectors/snmpMetric.md b/collectors/snmpMetric.md
new file mode 100644
index 0000000..2162dc4
--- /dev/null
+++ b/collectors/snmpMetric.md
@@ -0,0 +1,43 @@
+
+## `snmpstat` collector
+
+```json
+  "snmpstat": {
+    "targets" : [{
+      "hostname" : "host1.example.com",
+      "port" : 161,
+      "community": "public",
+      "timeout" : 1
+    }],
+    "metrics" : [
+      {
+        "name": "sensor1",
+        "value": "1.3.6.1.2.1.1.4.0",
+        "unit": "1.3.6.1.2.1.1.7.0"
+      },
+      {
+        "name": "1.3.6.1.2.1.1.2.0",
+        "value": "1.3.6.1.2.1.1.4.0",
+        "unit": "mb/s"
+      }
+    ]
+  }
+```
+
+The `snmpstat` collector uses [gosnmp](https://github.com/gosnmp/gosnmp) to read metrics from network-attached devices.
+
+The configuration of SNMP is quite extensive due to its flexibility. For the collector, the configuration is split into two parts:
+
+### Target configuration
+
+One entry for each network-attached device that should be queried. A target consists of
+- `hostname`
+- `port` (default 161)
+- `community` (default 'public')
+- `timeout` in seconds (default 1 for 1 second)
+
+### Metric configuration
+- `name` can be an OID or a user-given string
+- `value` has to be an OID
+- `unit` can be empty, an OID or a user-given string
+