From db04c8fbae6792dd487cd5e22c01a06148a9428a Mon Sep 17 00:00:00 2001 From: Holger Obermaier <40787752+ho-ob@users.noreply.github.com> Date: Thu, 3 Mar 2022 15:52:50 +0100 Subject: [PATCH 1/7] Removed infinibandPerfQueryMetric.go. infinibandMetric.go offers the same functionality without requiring root privileges. --- collectors/collectorManager.go | 1 - collectors/infinibandPerfQueryMetric.go | 232 ------------------------ collectors/infinibandPerfQueryMetric.md | 28 --- 3 files changed, 261 deletions(-) delete mode 100644 collectors/infinibandPerfQueryMetric.go delete mode 100644 collectors/infinibandPerfQueryMetric.md diff --git a/collectors/collectorManager.go b/collectors/collectorManager.go index 86b423e..46d791a 100644 --- a/collectors/collectorManager.go +++ b/collectors/collectorManager.go @@ -19,7 +19,6 @@ var AvailableCollectors = map[string]MetricCollector{ "memstat": new(MemstatCollector), "netstat": new(NetstatCollector), "ibstat": new(InfinibandCollector), - "ibstat_perfquery": new(InfinibandPerfQueryCollector), "lustrestat": new(LustreCollector), "cpustat": new(CpustatCollector), "topprocs": new(TopProcsCollector), diff --git a/collectors/infinibandPerfQueryMetric.go b/collectors/infinibandPerfQueryMetric.go deleted file mode 100644 index 72f701f..0000000 --- a/collectors/infinibandPerfQueryMetric.go +++ /dev/null @@ -1,232 +0,0 @@ -package collectors - -import ( - "fmt" - "io/ioutil" - "log" - "os/exec" - - lp "github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric" - - // "os" - "encoding/json" - "errors" - "path/filepath" - "strconv" - "strings" - "time" -) - -const PERFQUERY = `/usr/sbin/perfquery` - -type InfinibandPerfQueryCollector struct { - metricCollector - tags map[string]string - lids map[string]map[string]string - config struct { - ExcludeDevices []string `json:"exclude_devices,omitempty"` - PerfQueryPath string `json:"perfquery_path"` - } -} - -func (m *InfinibandPerfQueryCollector) Init(config json.RawMessage) error { - var err error - m.name = "InfinibandCollectorPerfQuery" - m.setup() - m.meta = map[string]string{"source": m.name, "group": "Network"} - m.tags = map[string]string{"type": "node"} - if len(config) > 0 { - err = json.Unmarshal(config, &m.config) - if err != nil { - return err - } - } - if len(m.config.PerfQueryPath) == 0 { - path, err := exec.LookPath("perfquery") - if err == nil { - m.config.PerfQueryPath = path - } - } - m.lids = make(map[string]map[string]string) - p := fmt.Sprintf("%s/*/ports/*/lid", string(IB_BASEPATH)) - files, err := filepath.Glob(p) - if err != nil { - return err - } - for _, f := range files { - lid, err := ioutil.ReadFile(f) - if err == nil { - plist := strings.Split(strings.Replace(f, string(IB_BASEPATH), "", -1), "/") - skip := false - for _, d := range m.config.ExcludeDevices { - if d == plist[0] { - skip = true - } - } - if !skip { - m.lids[plist[0]] = make(map[string]string) - m.lids[plist[0]][plist[2]] = string(lid) - } - } - } - - for _, ports := range m.lids { - for port, lid := range ports { - args := fmt.Sprintf("-r %s %s 0xf000", lid, port) - command := exec.Command(m.config.PerfQueryPath, args) - command.Wait() - _, err := command.Output() - if err != nil { - return fmt.Errorf("Failed to execute %s: %v", m.config.PerfQueryPath, err) - } - } - } - - if len(m.lids) == 0 { - return errors.New("No usable IB devices") - } - - m.init = true - return nil -} - -func (m *InfinibandPerfQueryCollector) doPerfQuery(cmd string, dev string, lid string, port string, tags map[string]string, output chan 
lp.CCMetric) error { - - args := fmt.Sprintf("-r %s %s 0xf000", lid, port) - command := exec.Command(cmd, args) - command.Wait() - stdout, err := command.Output() - if err != nil { - log.Print(err) - return err - } - ll := strings.Split(string(stdout), "\n") - - for _, line := range ll { - if strings.HasPrefix(line, "PortRcvData") || strings.HasPrefix(line, "RcvData") { - lv := strings.Fields(line) - v, err := strconv.ParseFloat(lv[1], 64) - if err == nil { - y, err := lp.New("ib_recv", tags, m.meta, map[string]interface{}{"value": float64(v)}, time.Now()) - if err == nil { - output <- y - } - } - } - if strings.HasPrefix(line, "PortXmitData") || strings.HasPrefix(line, "XmtData") { - lv := strings.Fields(line) - v, err := strconv.ParseFloat(lv[1], 64) - if err == nil { - y, err := lp.New("ib_xmit", tags, m.meta, map[string]interface{}{"value": float64(v)}, time.Now()) - if err == nil { - output <- y - } - } - } - if strings.HasPrefix(line, "PortRcvPkts") || strings.HasPrefix(line, "RcvPkts") { - lv := strings.Fields(line) - v, err := strconv.ParseFloat(lv[1], 64) - if err == nil { - y, err := lp.New("ib_recv_pkts", tags, m.meta, map[string]interface{}{"value": float64(v)}, time.Now()) - if err == nil { - output <- y - } - } - } - if strings.HasPrefix(line, "PortXmitPkts") || strings.HasPrefix(line, "XmtPkts") { - lv := strings.Fields(line) - v, err := strconv.ParseFloat(lv[1], 64) - if err == nil { - y, err := lp.New("ib_xmit_pkts", tags, m.meta, map[string]interface{}{"value": float64(v)}, time.Now()) - if err == nil { - output <- y - } - } - } - if strings.HasPrefix(line, "PortRcvPkts") || strings.HasPrefix(line, "RcvPkts") { - lv := strings.Fields(line) - v, err := strconv.ParseFloat(lv[1], 64) - if err == nil { - y, err := lp.New("ib_recv_pkts", tags, m.meta, map[string]interface{}{"value": float64(v)}, time.Now()) - if err == nil { - output <- y - } - } - } - if strings.HasPrefix(line, "PortXmitPkts") || strings.HasPrefix(line, "XmtPkts") { - lv := strings.Fields(line) - v, err := strconv.ParseFloat(lv[1], 64) - if err == nil { - y, err := lp.New("ib_xmit_pkts", tags, m.meta, map[string]interface{}{"value": float64(v)}, time.Now()) - if err == nil { - output <- y - } - } - } - } - return nil -} - -func (m *InfinibandPerfQueryCollector) Read(interval time.Duration, output chan lp.CCMetric) { - - if m.init { - for dev, ports := range m.lids { - for port, lid := range ports { - tags := map[string]string{ - "type": "node", - "device": dev, - "port": port, - "lid": lid} - path := fmt.Sprintf("%s/%s/ports/%s/counters/", string(IB_BASEPATH), dev, port) - buffer, err := ioutil.ReadFile(fmt.Sprintf("%s/port_rcv_data", path)) - if err == nil { - data := strings.Replace(string(buffer), "\n", "", -1) - v, err := strconv.ParseFloat(data, 64) - if err == nil { - y, err := lp.New("ib_recv", tags, m.meta, map[string]interface{}{"value": float64(v)}, time.Now()) - if err == nil { - output <- y - } - } - } - buffer, err = ioutil.ReadFile(fmt.Sprintf("%s/port_xmit_data", path)) - if err == nil { - data := strings.Replace(string(buffer), "\n", "", -1) - v, err := strconv.ParseFloat(data, 64) - if err == nil { - y, err := lp.New("ib_xmit", tags, m.meta, map[string]interface{}{"value": float64(v)}, time.Now()) - if err == nil { - output <- y - } - } - } - buffer, err = ioutil.ReadFile(fmt.Sprintf("%s/port_rcv_packets", path)) - if err == nil { - data := strings.Replace(string(buffer), "\n", "", -1) - v, err := strconv.ParseFloat(data, 64) - if err == nil { - y, err := lp.New("ib_recv_pkts", tags, m.meta, 
map[string]interface{}{"value": float64(v)}, time.Now()) - if err == nil { - output <- y - } - } - } - buffer, err = ioutil.ReadFile(fmt.Sprintf("%s/port_xmit_packets", path)) - if err == nil { - data := strings.Replace(string(buffer), "\n", "", -1) - v, err := strconv.ParseFloat(data, 64) - if err == nil { - y, err := lp.New("ib_xmit_pkts", tags, m.meta, map[string]interface{}{"value": float64(v)}, time.Now()) - if err == nil { - output <- y - } - } - } - } - } - } -} - -func (m *InfinibandPerfQueryCollector) Close() { - m.init = false -} diff --git a/collectors/infinibandPerfQueryMetric.md b/collectors/infinibandPerfQueryMetric.md deleted file mode 100644 index 2147963..0000000 --- a/collectors/infinibandPerfQueryMetric.md +++ /dev/null @@ -1,28 +0,0 @@ - -## `ibstat_perfquery` collector - -```json - "ibstat_perfquery": { - "perfquery_path": "/path/to/perfquery", - "exclude_devices": [ - "mlx4" - ] - } -``` - -The `ibstat_perfquery` collector includes all Infiniband devices that can be -found below `/sys/class/infiniband/` and where any of the ports provides a -LID file (`/sys/class/infiniband//ports//lid`) - -The devices can be filtered with the `exclude_devices` option in the configuration. - -For each found LID the collector calls the `perfquery` command. The path to the -`perfquery` command can be configured with the `perfquery_path` option in the configuration - -Metrics: -* `ib_recv` -* `ib_xmit` -* `ib_recv_pkts` -* `ib_xmit_pkts` - -The collector adds a `device` tag to all metrics From b3030a8d44f232906d3903b016c228691b1527df Mon Sep 17 00:00:00 2001 From: Thomas Roehl Date: Thu, 3 Mar 2022 17:24:32 +0100 Subject: [PATCH 2/7] Use right systemd macro to create the user --- scripts/cc-metric-collector.spec | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/cc-metric-collector.spec b/scripts/cc-metric-collector.spec index fef1bb8..00a4aa4 100644 --- a/scripts/cc-metric-collector.spec +++ b/scripts/cc-metric-collector.spec @@ -42,7 +42,7 @@ install -Dpm 0644 scripts/%{name}.sysusers %{buildroot}%{_sysusersdir}/%{name}.c # go test should be here... 
:) %pre -%sysusers_create_compat scripts/%{name}.sysusers +%sysusers_create_package scripts/%{name}.sysusers %post %systemd_post %{name}.service From f1d2828e1dce6145f6f19e57c0b04a72a158dcee Mon Sep 17 00:00:00 2001 From: Thomas Roehl Date: Fri, 4 Mar 2022 11:32:10 +0100 Subject: [PATCH 3/7] Fix error print in LustreCollector --- collectors/lustreMetric.go | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/collectors/lustreMetric.go b/collectors/lustreMetric.go index f98d746..66fd3fd 100644 --- a/collectors/lustreMetric.go +++ b/collectors/lustreMetric.go @@ -110,14 +110,16 @@ func (m *LustreCollector) Init(config json.RawMessage) error { "inode_permission": {"lustre_inode_permission": 1}} // Lustre file system statistics can only be queried by user root - user, err := user.Current() - if err != nil { - cclog.ComponentError(m.name, "Failed to get current user:", err.Error()) - return err - } - if user.Uid != "0" { - cclog.ComponentError(m.name, "Lustre file system statistics can only be queried by user root:", err.Error()) - return err + if !m.config.Sudo { + user, err := user.Current() + if err != nil { + cclog.ComponentError(m.name, "Failed to get current user:", err.Error()) + return err + } + if user.Uid != "0" { + cclog.ComponentError(m.name, "Lustre file system statistics can only be queried by user root") + return err + } } m.matches = make(map[string]map[string]int) From 5e64c01c9d3c1cb6895832ac243b66fbc8086cc6 Mon Sep 17 00:00:00 2001 From: Thomas Roehl Date: Fri, 4 Mar 2022 11:51:21 +0100 Subject: [PATCH 4/7] Fix name for ClusterCockpit user --- scripts/cc-metric-collector.sysusers | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/cc-metric-collector.sysusers b/scripts/cc-metric-collector.sysusers index 5c17ec7..6ce3700 100644 --- a/scripts/cc-metric-collector.sysusers +++ b/scripts/cc-metric-collector.sysusers @@ -1,2 +1,2 @@ -#Type Name ID GECOS Home directory Shell -u hpcop - "User for ClusterCockpit" /run/cc-metric-collector /sbin/nologin +#Type Name ID GECOS Home directory Shell +u clustercockpit - "User for ClusterCockpit" /run/cc-metric-collector /sbin/nologin From 7f62975a681607f770fb265867be116fe41d49e3 Mon Sep 17 00:00:00 2001 From: Thomas Roehl Date: Fri, 4 Mar 2022 11:52:48 +0100 Subject: [PATCH 5/7] Set proper user for files --- scripts/cc-metric-collector.spec | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/scripts/cc-metric-collector.spec b/scripts/cc-metric-collector.spec index 00a4aa4..8b23f20 100644 --- a/scripts/cc-metric-collector.spec +++ b/scripts/cc-metric-collector.spec @@ -51,15 +51,18 @@ install -Dpm 0644 scripts/%{name}.sysusers %{buildroot}%{_sysusersdir}/%{name}.c %systemd_preun %{name}.service %files +# Binary +%attr(-,clustercockpit,clustercockpit) %{_sbindir}/%{name} +# Config %dir %{_sysconfdir}/%{name} -%{_sbindir}/%{name} +%attr(0600,clustercockpit,clustercockpit) %config(noreplace) %{_sysconfdir}/%{name}/%{name}.json +%attr(0600,clustercockpit,clustercockpit) %config(noreplace) %{_sysconfdir}/%{name}/collectors.json +%attr(0600,clustercockpit,clustercockpit) %config(noreplace) %{_sysconfdir}/%{name}/sinks.json +%attr(0600,clustercockpit,clustercockpit) %config(noreplace) %{_sysconfdir}/%{name}/receivers.json +%attr(0600,clustercockpit,clustercockpit) %config(noreplace) %{_sysconfdir}/%{name}/router.json +# Systemd %{_unitdir}/%{name}.service %{_sysconfdir}/default/%{name} -%attr(0600,root,root) %config(noreplace) %{_sysconfdir}/%{name}/%{name}.json 
-%attr(0600,root,root) %config(noreplace) %{_sysconfdir}/%{name}/collectors.json -%attr(0600,root,root) %config(noreplace) %{_sysconfdir}/%{name}/sinks.json -%attr(0600,root,root) %config(noreplace) %{_sysconfdir}/%{name}/receivers.json -%attr(0600,root,root) %config(noreplace) %{_sysconfdir}/%{name}/router.json %{_sysusersdir}/%{name}.conf %changelog From 547bc0461f77d02002b1b8d46828b51536329dad Mon Sep 17 00:00:00 2001 From: Mehmet Soysal Date: Fri, 4 Mar 2022 14:35:47 +0100 Subject: [PATCH 6/7] Beegfs collector (#50) * added beegfs collectors to collectors/README.md * added beegfs collectors and docs * added new beegfs collectors to AvailableCollectors list * Feedback implemented * changed error type * changed error to only return * changed beegfs lookup path * fixed typo in md files Co-authored-by: Mehmet Soysal --- collectors/README.md | 2 + collectors/beegfsmetaMetric.go | 229 ++++++++++++++++++++++++++++++ collectors/beegfsmetaMetric.md | 75 ++++++++++ collectors/beegfsstorageMetric.go | 221 ++++++++++++++++++++++++++++ collectors/beegfsstorageMetric.md | 55 +++++++ collectors/collectorManager.go | 2 + 6 files changed, 584 insertions(+) create mode 100644 collectors/beegfsmetaMetric.go create mode 100644 collectors/beegfsmetaMetric.md create mode 100644 collectors/beegfsstorageMetric.go create mode 100644 collectors/beegfsstorageMetric.md diff --git a/collectors/README.md b/collectors/README.md index 00e0da7..3fcdd49 100644 --- a/collectors/README.md +++ b/collectors/README.md @@ -37,6 +37,8 @@ In contrast to the configuration files for sinks and receivers, the collectors c * [`cpufreq_cpuinfo`](./cpufreqCpuinfoMetric.md) * [`numastat`](./numastatMetric.md) * [`gpfs`](./gpfsMetric.md) +* [`beegfs_meta`](./beegfsmetaMetric.md) +* [`beegfs_storage`](./beegfsstorageMetric.md) ## Todos diff --git a/collectors/beegfsmetaMetric.go b/collectors/beegfsmetaMetric.go new file mode 100644 index 0000000..57b1e39 --- /dev/null +++ b/collectors/beegfsmetaMetric.go @@ -0,0 +1,229 @@ +package collectors + +import ( + "bufio" + "bytes" + "encoding/json" + "fmt" + "io/ioutil" + "os" + "os/exec" + "os/user" + "regexp" + "strconv" + "strings" + "time" + + cclog "github.com/ClusterCockpit/cc-metric-collector/internal/ccLogger" + lp "github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric" +) + +const DEFAULT_BEEGFS_CMD = "beegfs-ctl" + +// Struct for the collector-specific JSON config +type BeegfsMetaCollectorConfig struct { + Beegfs string `json:"beegfs_path"` + ExcludeMetrics []string `json:"exclude_metrics,omitempty"` + ExcludeFilesystem []string `json:"exclude_filesystem"` +} + +type BeegfsMetaCollector struct { + metricCollector + tags map[string]string + matches map[string]string + config BeegfsMetaCollectorConfig + skipFS map[string]struct{} +} + +func (m *BeegfsMetaCollector) Init(config json.RawMessage) error { + // Check if already initialized + if m.init { + return nil + } + // Metrics + var nodeMdstat_array = [39]string{ + "sum", "ack", "close", "entInf", + "fndOwn", "mkdir", "create", "rddir", + "refrEn", "mdsInf", "rmdir", "rmLnk", + "mvDirIns", "mvFiIns", "open", "ren", + "sChDrct", "sAttr", "sDirPat", "stat", + "statfs", "trunc", "symlnk", "unlnk", + "lookLI", "statLI", "revalLI", "openLI", + "createLI", "hardlnk", "flckAp", "flckEn", + "flckRg", "dirparent", "listXA", "getXA", + "rmXA", "setXA", "mirror"} + + m.name = "BeegfsMetaCollector" + m.setup() + // Set default beegfs-ctl binary + + m.config.Beegfs = DEFAULT_BEEGFS_CMD + + // Read JSON configuration + if len(config) > 0 { 
+ err := json.Unmarshal(config, &m.config) + if err != nil { + return err + } + } + + //create map with possible variables + m.matches = make(map[string]string) + for _, value := range nodeMdstat_array { + _, skip := stringArrayContains(m.config.ExcludeMetrics, value) + if skip { + m.matches["other"] = "0" + } else { + m.matches["beegfs_cmeta_"+value] = "0" + } + } + + m.meta = map[string]string{ + "source": m.name, + "group": "BeegfsMeta", + } + m.tags = map[string]string{ + "type": "node", + "filesystem": "", + } + m.skipFS = make(map[string]struct{}) + for _, fs := range m.config.ExcludeFilesystem { + m.skipFS[fs] = struct{}{} + } + + // Beegfs file system statistics can only be queried by user root + user, err := user.Current() + if err != nil { + return fmt.Errorf("BeegfsMetaCollector.Init(): Failed to get current user: %v", err) + } + if user.Uid != "0" { + return fmt.Errorf("BeegfsMetaCollector.Init(): BeeGFS file system statistics can only be queried by user root") + } + + // Check if beegfs-ctl is in executable search path + _, err = exec.LookPath(m.config.Beegfs) + if err != nil { + return fmt.Errorf("BeegfsMetaCollector.Init(): Failed to find beegfs-ctl binary '%s': %v", m.config.Beegfs, err) + } + m.init = true + return nil +} + +func (m *BeegfsMetaCollector) Read(interval time.Duration, output chan lp.CCMetric) { + if !m.init { + return + } + //get mounpoint + buffer, _ := ioutil.ReadFile(string("/proc/mounts")) + mounts := strings.Split(string(buffer), "\n") + var mountpoints []string + for _, line := range mounts { + if len(line) == 0 { + continue + } + f := strings.Fields(line) + if strings.Contains(f[0], "beegfs_ondemand") { + // Skip excluded filesystems + if _, skip := m.skipFS[f[1]]; skip { + continue + } + mountpoints = append(mountpoints, f[1]) + } + } + + if len(mountpoints) == 0 { + return + } + + for _, mountpoint := range mountpoints { + m.tags["filesystem"] = mountpoint + + // bwwgfs-ctl: + // --clientstats: Show client IO statistics. + // --nodetype=meta: The node type to query (meta, storage). 
+ // --interval: + // --mount=/mnt/beeond/: Which mount point + //cmd := exec.Command(m.config.Beegfs, "/root/mc/test.txt") + mountoption := "--mount=" + mountpoint + cmd := exec.Command(m.config.Beegfs, "--clientstats", + "--nodetype=meta", mountoption, "--allstats") + cmd.Stdin = strings.NewReader("\n") + cmdStdout := new(bytes.Buffer) + cmdStderr := new(bytes.Buffer) + cmd.Stdout = cmdStdout + cmd.Stderr = cmdStderr + err := cmd.Run() + if err != nil { + fmt.Fprintf(os.Stderr, "BeegfsMetaCollector.Read(): Failed to execute command \"%s\": %s\n", cmd.String(), err.Error()) + fmt.Fprintf(os.Stderr, "BeegfsMetaCollector.Read(): command exit code: \"%d\"\n", cmd.ProcessState.ExitCode()) + data, _ := ioutil.ReadAll(cmdStderr) + fmt.Fprintf(os.Stderr, "BeegfsMetaCollector.Read(): command stderr: \"%s\"\n", string(data)) + data, _ = ioutil.ReadAll(cmdStdout) + fmt.Fprintf(os.Stderr, "BeegfsMetaCollector.Read(): command stdout: \"%s\"\n", string(data)) + return + } + // Read I/O statistics + scanner := bufio.NewScanner(cmdStdout) + + sumLine := regexp.MustCompile(`^Sum:\s+\d+\s+\[[a-zA-Z]+\]+`) + //Line := regexp.MustCompile(`^(.*)\s+(\d)+\s+\[([a-zA-Z]+)\]+`) + statsLine := regexp.MustCompile(`^(.*?)\s+?(\d.*?)$`) + singleSpacePattern := regexp.MustCompile(`\s+`) + removePattern := regexp.MustCompile(`[\[|\]]`) + + for scanner.Scan() { + readLine := scanner.Text() + //fmt.Println(readLine) + // Jump few lines, we only want the I/O stats from nodes + if !sumLine.MatchString(readLine) { + continue + } + + match := statsLine.FindStringSubmatch(readLine) + // nodeName = "Sum:" or would be nodes + // nodeName := match[1] + //Remove multiple whitespaces + dummy := removePattern.ReplaceAllString(match[2], " ") + metaStats := strings.TrimSpace(singleSpacePattern.ReplaceAllString(dummy, " ")) + split := strings.Split(metaStats, " ") + + // fill map with values + // split[i+1] = mdname + // split[i] = amount of md operations + for i := 0; i <= len(split)-1; i += 2 { + if _, ok := m.matches[split[i+1]]; ok { + m.matches["beegfs_cmeta_"+split[i+1]] = split[i] + } else { + f1, err := strconv.ParseFloat(m.matches["other"], 32) + if err != nil { + cclog.ComponentError( + m.name, + fmt.Sprintf("Metric (other): Failed to convert str written '%s' to float: %v", m.matches["other"], err)) + continue + } + f2, err := strconv.ParseFloat(split[i], 32) + if err != nil { + cclog.ComponentError( + m.name, + fmt.Sprintf("Metric (other): Failed to convert str written '%s' to float: %v", m.matches["other"], err)) + continue + } + //mdStat["other"] = fmt.Sprintf("%f", f1+f2) + m.matches["beegfs_cstorage_other"] = fmt.Sprintf("%f", f1+f2) + } + } + + for key, data := range m.matches { + value, _ := strconv.ParseFloat(data, 32) + y, err := lp.New(key, m.tags, m.meta, map[string]interface{}{"value": value}, time.Now()) + if err == nil { + output <- y + } + } + } + } +} + +func (m *BeegfsMetaCollector) Close() { + m.init = false +} diff --git a/collectors/beegfsmetaMetric.md b/collectors/beegfsmetaMetric.md new file mode 100644 index 0000000..932e72f --- /dev/null +++ b/collectors/beegfsmetaMetric.md @@ -0,0 +1,75 @@ +## `BeeGFS on Demand` collector +This Collector is to collect BeeGFS on Demand (BeeOND) metadata clientstats. 
+ +```json + "beegfs_meta": { + "beegfs_path": "/usr/bin/beegfs-ctl", + "exclude_filesystem": [ + "/mnt/ignore_me" + ], + "exclude_metrics": [ + "ack", + "entInf", + "fndOwn" + ] + } +``` + +The `BeeGFS On Demand (BeeOND)` collector uses the `beegfs-ctl` command to read performance metrics for +BeeGFS filesystems. + +The reported filesystems can be filtered with the `exclude_filesystem` option +in the configuration. + +The path to the `beegfs-ctl` command can be configured with the `beegfs_path` option +in the configuration. + +When using the `exclude_metrics` option, the excluded metrics are summed as `other`. + +Important: The metrics listed below, are similar to the naming of BeeGFS. The Collector prefixes these with `beegfs_cstorage`(beegfs client storage). + +For example beegfs metric `open`-> `beegfs_cstorage_open` + +Available Metrics: + +* sum +* ack +* close +* entInf +* fndOwn +* mkdir +* create +* rddir +* refrEnt +* mdsInf +* rmdir +* rmLnk +* mvDirIns +* mvFiIns +* open +* ren +* sChDrct +* sAttr +* sDirPat +* stat +* statfs +* trunc +* symlnk +* unlnk +* lookLI +* statLI +* revalLI +* openLI +* createLI +* hardlnk +* flckAp +* flckEn +* flckRg +* dirparent +* listXA +* getXA +* rmXA +* setXA +* mirror + +The collector adds a `filesystem` tag to all metrics \ No newline at end of file diff --git a/collectors/beegfsstorageMetric.go b/collectors/beegfsstorageMetric.go new file mode 100644 index 0000000..cbc8314 --- /dev/null +++ b/collectors/beegfsstorageMetric.go @@ -0,0 +1,221 @@ +package collectors + +import ( + "bufio" + "bytes" + "encoding/json" + "fmt" + "io/ioutil" + "os" + "os/exec" + "os/user" + "regexp" + "strconv" + "strings" + "time" + + cclog "github.com/ClusterCockpit/cc-metric-collector/internal/ccLogger" + lp "github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric" +) + +// Struct for the collector-specific JSON config +type BeegfsStorageCollectorConfig struct { + Beegfs string `json:"beegfs_path"` + ExcludeMetrics []string `json:"exclude_metrics,omitempty"` + ExcludeFilesystem []string `json:"exclude_filesystem"` +} + +type BeegfsStorageCollector struct { + metricCollector + tags map[string]string + matches map[string]string + config BeegfsStorageCollectorConfig + skipFS map[string]struct{} +} + +func (m *BeegfsStorageCollector) Init(config json.RawMessage) error { + // Check if already initialized + if m.init { + return nil + } + // Metrics + var storageStat_array = [18]string{ + "sum", "ack", "sChDrct", "getFSize", + "sAttr", "statfs", "trunc", "close", + "fsync", "ops-rd", "MiB-rd/s", "ops-wr", + "MiB-wr/s", "gendbg", "hrtbeat", "remNode", + "storInf", "unlnk"} + + m.name = "BeegfsStorageCollector" + m.setup() + // Set default beegfs-ctl binary + + m.config.Beegfs = DEFAULT_BEEGFS_CMD + + // Read JSON configuration + if len(config) > 0 { + err := json.Unmarshal(config, &m.config) + if err != nil { + return err + } + } + println(m.config.Beegfs) + //create map with possible variables + m.matches = make(map[string]string) + for _, value := range storageStat_array { + _, skip := stringArrayContains(m.config.ExcludeMetrics, value) + if skip { + m.matches["other"] = "0" + } else { + m.matches["beegfs_cstorage_"+value] = "0" + } + } + + m.meta = map[string]string{ + "source": m.name, + "group": "BeegfsStorage", + } + m.tags = map[string]string{ + "type": "node", + "filesystem": "", + } + m.skipFS = make(map[string]struct{}) + for _, fs := range m.config.ExcludeFilesystem { + m.skipFS[fs] = struct{}{} + } + + // Beegfs file system statistics can only be 
queried by user root + user, err := user.Current() + if err != nil { + return fmt.Errorf("BeegfsStorageCollector.Init(): Failed to get current user: %v", err) + } + if user.Uid != "0" { + return fmt.Errorf("BeegfsStorageCollector.Init(): BeeGFS file system statistics can only be queried by user root") + } + + // Check if beegfs-ctl is in executable search path + _, err = exec.LookPath(m.config.Beegfs) + if err != nil { + return fmt.Errorf("BeegfsStorageCollector.Init(): Failed to find beegfs-ctl binary '%s': %v", m.config.Beegfs, err) + } + m.init = true + return nil +} + +func (m *BeegfsStorageCollector) Read(interval time.Duration, output chan lp.CCMetric) { + if !m.init { + return + } + //get mounpoint + buffer, _ := ioutil.ReadFile(string("/proc/mounts")) + mounts := strings.Split(string(buffer), "\n") + var mountpoints []string + for _, line := range mounts { + if len(line) == 0 { + continue + } + f := strings.Fields(line) + if strings.Contains(f[0], "beegfs_ondemand") { + // Skip excluded filesystems + if _, skip := m.skipFS[f[1]]; skip { + continue + } + mountpoints = append(mountpoints, f[1]) + } + } + if len(mountpoints) == 0 { + return + } + // collects stats for each BeeGFS on Demand FS + for _, mountpoint := range mountpoints { + m.tags["filesystem"] = mountpoint + + // bwwgfs-ctl: + // --clientstats: Show client IO statistics. + // --nodetype=meta: The node type to query (meta, storage). + // --interval: + // --mount=/mnt/beeond/: Which mount point + //cmd := exec.Command(m.config.Beegfs, "/root/mc/test.txt") + mountoption := "--mount=" + mountpoint + cmd := exec.Command(m.config.Beegfs, "--clientstats", + "--nodetype=storage", mountoption, "--allstats") + cmd.Stdin = strings.NewReader("\n") + cmdStdout := new(bytes.Buffer) + cmdStderr := new(bytes.Buffer) + cmd.Stdout = cmdStdout + cmd.Stderr = cmdStderr + err := cmd.Run() + if err != nil { + fmt.Fprintf(os.Stderr, "BeegfsStorageCollector.Read(): Failed to execute command \"%s\": %s\n", cmd.String(), err.Error()) + fmt.Fprintf(os.Stderr, "BeegfsStorageCollector.Read(): command exit code: \"%d\"\n", cmd.ProcessState.ExitCode()) + data, _ := ioutil.ReadAll(cmdStderr) + fmt.Fprintf(os.Stderr, "BeegfsStorageCollector.Read(): command stderr: \"%s\"\n", string(data)) + data, _ = ioutil.ReadAll(cmdStdout) + fmt.Fprintf(os.Stderr, "BeegfsStorageCollector.Read(): command stdout: \"%s\"\n", string(data)) + return + } + // Read I/O statistics + scanner := bufio.NewScanner(cmdStdout) + + sumLine := regexp.MustCompile(`^Sum:\s+\d+\s+\[[a-zA-Z]+\]+`) + //Line := regexp.MustCompile(`^(.*)\s+(\d)+\s+\[([a-zA-Z]+)\]+`) + statsLine := regexp.MustCompile(`^(.*?)\s+?(\d.*?)$`) + singleSpacePattern := regexp.MustCompile(`\s+`) + removePattern := regexp.MustCompile(`[\[|\]]`) + + for scanner.Scan() { + readLine := scanner.Text() + //fmt.Println(readLine) + // Jump few lines, we only want the I/O stats from nodes + if !sumLine.MatchString(readLine) { + continue + } + + match := statsLine.FindStringSubmatch(readLine) + // nodeName = "Sum:" or would be nodes + // nodeName := match[1] + //Remove multiple whitespaces + dummy := removePattern.ReplaceAllString(match[2], " ") + metaStats := strings.TrimSpace(singleSpacePattern.ReplaceAllString(dummy, " ")) + split := strings.Split(metaStats, " ") + + // fill map with values + // split[i+1] = mdname + // split[i] = amount of operations + for i := 0; i <= len(split)-1; i += 2 { + if _, ok := m.matches[split[i+1]]; ok { + m.matches["beegfs_cstorage_"+split[i+1]] = split[i] + //m.matches[split[i+1]] = 
split[i]
+				} else {
+					f1, err := strconv.ParseFloat(m.matches["other"], 32)
+					if err != nil {
+						cclog.ComponentError(
+							m.name,
+							fmt.Sprintf("Metric (other): Failed to convert str written '%s' to float: %v", m.matches["other"], err))
+						continue
+					}
+					f2, err := strconv.ParseFloat(split[i], 32)
+					if err != nil {
+						cclog.ComponentError(
+							m.name,
+							fmt.Sprintf("Metric (other): Failed to convert str written '%s' to float: %v", split[i], err))
+						continue
+					}
+					m.matches["beegfs_cstorage_other"] = fmt.Sprintf("%f", f1+f2)
+				}
+			}
+
+			for key, data := range m.matches {
+				value, _ := strconv.ParseFloat(data, 32)
+				y, err := lp.New(key, m.tags, m.meta, map[string]interface{}{"value": value}, time.Now())
+				if err == nil {
+					output <- y
+				}
+			}
+		}
+	}
+}
+
+func (m *BeegfsStorageCollector) Close() {
+	m.init = false
+}
diff --git a/collectors/beegfsstorageMetric.md b/collectors/beegfsstorageMetric.md
new file mode 100644
index 0000000..519b5bf
--- /dev/null
+++ b/collectors/beegfsstorageMetric.md
@@ -0,0 +1,55 @@
+## `BeeGFS on Demand` collector
+This collector collects BeeGFS on Demand (BeeOND) storage statistics.
+
+```json
+  "beegfs_storage": {
+    "beegfs_path": "/usr/bin/beegfs-ctl",
+    "exclude_filesystem": [
+      "/mnt/ignore_me"
+    ],
+    "exclude_metrics": [
+      "ack",
+      "storInf",
+      "unlnk"
+    ]
+  }
+```
+
+The `BeeGFS On Demand (BeeOND)` collector uses the `beegfs-ctl` command to read performance metrics for BeeGFS filesystems.
+
+The reported filesystems can be filtered with the `exclude_filesystem` option
+in the configuration.
+
+The path to the `beegfs-ctl` command can be configured with the `beegfs_path` option
+in the configuration.
+
+When using the `exclude_metrics` option, the excluded metrics are summed as `other`.
+
+Important: The metrics listed below are named after the BeeGFS counters. The collector prefixes them with `beegfs_cstorage_` (BeeGFS client storage).
+For example, the BeeGFS metric `open` becomes `beegfs_cstorage_open`.
+
+Note: BeeGFS provides a large number of statistics. It probably makes sense to exclude most of them. Nevertheless, the excluded metrics are summed as `beegfs_cstorage_other`.
+
+Available Metrics:
+
+* "sum"
+* "ack"
+* "sChDrct"
+* "getFSize"
+* "sAttr"
+* "statfs"
+* "trunc"
+* "close"
+* "fsync"
+* "ops-rd"
+* "MiB-rd/s"
+* "ops-wr"
+* "MiB-wr/s"
+* "gendbg"
+* "hrtbeat"
+* "remNode"
+* "storInf"
+* "unlnk"
+
+
+The collector adds a `filesystem` tag to all metrics
\ No newline at end of file
diff --git a/collectors/collectorManager.go b/collectors/collectorManager.go
index 46d791a..e9ccfe7 100644
--- a/collectors/collectorManager.go
+++ b/collectors/collectorManager.go
@@ -34,6 +34,8 @@ var AvailableCollectors = map[string]MetricCollector{
 	"nfs3stat":       new(Nfs3Collector),
 	"nfs4stat":       new(Nfs4Collector),
 	"numastats":      new(NUMAStatsCollector),
+	"beegfs_meta":    new(BeegfsMetaCollector),
+	"beegfs_storage": new(BeegfsStorageCollector),
 }
 
 // Metric collector manager data structure
From 1961edc65968ed9eaae8f98541f68b68f6bb286b Mon Sep 17 00:00:00 2001
From: Thomas Roehl
Date: Fri, 4 Mar 2022 15:42:25 +0100
Subject: [PATCH 7/7] Add documentation to help configuring the CC metric collector

---
 docs/configuration.md | 187 ++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 187 insertions(+)
 create mode 100644 docs/configuration.md

diff --git a/docs/configuration.md b/docs/configuration.md
new file mode 100644
index 0000000..75c7aa5
--- /dev/null
+++ b/docs/configuration.md
@@ -0,0 +1,187 @@
+# Configuring the CC metric collector
+
+The configuration of the CC metric collector consists of five configuration files: one global file and four component-related files.
+
+## Global configuration
+
+The global file contains the paths to the other four files and some global options.
+
+```json
+{
+  "sinks": "sinks.json",
+  "collectors" : "collectors.json",
+  "receivers" : "receivers.json",
+  "router" : "router.json",
+  "interval": 10,
+  "duration": 1
+}
+```
+
+Be aware that the paths are relative to the execution folder of the cc-metric-collector binary, so it is recommended to use absolute paths.
+
+## Component configuration
+
+The other files are mainly lists of subcomponents: the collectors, the receivers, the router and the sinks. Their role is best shown in a picture:
+
+```mermaid
+flowchart LR
+
+  subgraph col ["Collectors"]
+  direction TB
+  cpustat["cpustat"]
+  memstat["memstat"]
+  tempstat["tempstat"]
+  misc["..."]
+  end
+
+  subgraph Receivers ["Receivers"]
+  direction TB
+  nats["NATS"]
+  miscrecv[...]
+  end
+
+  subgraph calc["Aggregator"]
+  direction TB
+  cache["Cache"]
+  agg["Calculator"]
+  end
+
+  subgraph sinks ["Sinks"]
+  direction RL
+  influx["InfluxDB"]
+  ganglia["Ganglia"]
+  logger["Logfile"]
+  miscsink["..."]
+  end
+
+  cpustat --> CollectorManager["CollectorManager"]
+  memstat --> CollectorManager
+  tempstat --> CollectorManager
+  misc --> CollectorManager
+
+  nats --> ReceiverManager["ReceiverManager"]
+  miscrecv --> ReceiverManager
+
+  CollectorManager --> newrouter["Router"]
+  ReceiverManager -.-> newrouter
+  calc -.-> newrouter
+  newrouter --> SinkManager["SinkManager"]
+  newrouter -.-> calc
+
+  SinkManager --> influx
+  SinkManager --> ganglia
+  SinkManager --> logger
+  SinkManager --> miscsink
+
+
+```
+
+There are four parts:
+- The collectors read data from files, execute commands and call dynamically loaded library functions, and send the metrics to the router
+- The router can process metrics by caching them and by evaluating functions and conditions on them
+- The sinks send the metrics to storage backends
+- The receivers can be used to receive metrics from other collectors and forward them to the router. They can be used to create a tree-like structure of collectors.
+
+(A perhaps better differentiation between collectors and receivers is that collectors are called periodically, while receivers have their own logic and can submit metrics at any time.)
+
+
+### Collectors configuration file
+
+The collectors configuration file specifies which metrics should be queried from the system. The metric gathering is logically grouped in so-called 'Collectors'. So there are Collectors to read CPU, memory or filesystem statistics. The collectors configuration file is a list of these collectors with collector-specific configurations:
+
+```json
+{
+  "cpustat" : {},
+  "diskstat": {
+    "exclude_metrics": [
+      "disk_total"
+    ]
+  }
+}
+```
+
+The first one is the CPU statistics collector without any collector-specific setting. The second one enables disk mount statistics but excludes the metric `disk_total`.
+
+All names and possible collector-specific configuration options can be found [here](../collectors/README.md).
+
+Some collectors might dynamically load shared libraries. In order to enable these collectors, make sure that the shared library path is part of the `LD_LIBRARY_PATH` environment variable.
+
+### Sinks configuration file
+
+The sinks define the output/sending of metrics. The metrics can be forwarded to multiple sinks, even to sinks of the same type. The sinks configuration file is a list of these sinks, each with an individual name.
+
+```json
+{
+  "myinflux" : {
+    "type" : "influxasync",
+    "host": "localhost",
+    "port": "8086",
+    "organization" : "testorga",
+    "database" : "testbucket",
+    "password" : ""
+  },
+  "companyinflux" : {
+    "type" : "influxasync",
+    "host": "companyhost",
+    "port": "8086",
+    "organization" : "company",
+    "database" : "main",
+    "password" : ""
+  }
+}
+```
+
+The above example configuration file defines two sinks, both of type `influxasync`. They are differentiated internally by their names: `myinflux` and `companyinflux`.
+
+All types and possible sink-specific configuration options can be found [here](../sinks/README.md).
+
+Some sinks might dynamically load shared libraries. In order to enable these sinks, make sure that the shared library path is part of the `LD_LIBRARY_PATH` environment variable.
+
+### Router configuration file
+
+The collectors and the sinks are connected through the router. The router forwards the metrics to the sinks but also enables some data processing. A common example is to tag all passing metrics, e.g. by adding `cluster=mycluster`. The router can also perform aggregations like "take the average of all 'ipc' metrics" (ipc -> Instructions Per Cycle). Since the configuration of these aggregations can be quite complicated, we refer to the router's [README](../internal/metricRouter/README.md).
+
+A simple router configuration file to start with looks like this:
+
+```json
+{
+  "add_tags" : [
+    {
+      "key" : "cluster",
+      "value" : "mycluster",
+      "if" : "*"
+    }
+  ],
+  "interval_timestamp" : false,
+  "num_cache_intervals" : 0
+}
+```
+
+With the `add_tags` section, we attach the `cluster=mycluster` tag to every metric (`*` matches all metrics). The `interval_timestamp` option tells the router not to touch the timestamps of the metrics; it is possible to send all metrics within an interval with a common timestamp to avoid later alignment issues. Setting `num_cache_intervals` to 0 disables the cache completely. The cache is only required if you want to do complex metric aggregations.
+
+All configuration options can be found [here](../internal/metricRouter/README.md).
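As an illustration only (not part of the patch), a router configuration that keeps the `add_tags` rule from above but switches on common interval timestamps and the cache could look like the following sketch; the aggregation rules themselves are described in the router README and are not shown here:

```json
{
  "add_tags" : [
    {
      "key" : "cluster",
      "value" : "mycluster",
      "if" : "*"
    }
  ],
  "interval_timestamp" : true,
  "num_cache_intervals" : 1
}
```

With `interval_timestamp` set to `true`, all metrics of an interval are sent with a common timestamp, and with `num_cache_intervals` greater than 0 the cache required for metric aggregations is enabled.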
+
+### Receivers configuration file
+
+The receivers are a special feature of the CC Metric Collector to enable simpler integration into existing setups. While collectors query data from the local system, the receivers commonly get data from other systems through some network technology like HTTP or NATS. The idea is to keep the current setup but send the data to a CC Metric Collector, which forwards it to the destination system (if a sink exists for it). For most setups, the receivers are not required and the receiver config file should contain only an empty JSON map (`{}`).
+
+```json
+{
+  "nats_rack0": {
+    "type": "nats",
+    "address" : "nats-server.example.org",
+    "port" : "4222",
+    "subject" : "rack0"
+  },
+  "nats_rack1": {
+    "type": "nats",
+    "address" : "nats-server.example.org",
+    "port" : "4222",
+    "subject" : "rack1"
+  }
+}
+```
+
+This example configuration creates two receivers with the names `nats_rack0` and `nats_rack1`. While one subscribes to metrics published with the `rack0` subject, the other one subscribes to the `rack1` subject. The NATS server is the same, as it manages all subjects in a subnet. (As an example, the router could add the tags `rack=0` and `rack=1` respectively to the received metrics.)
+
+All types and possible receiver-specific configuration options can be found [here](../receivers/README.md).
\ No newline at end of file
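The BeeGFS collectors added in PATCH 6/7 follow the project's common collector pattern: implement `Init`, `Read` and `Close`, and register the new type in the `AvailableCollectors` map in `collectors/collectorManager.go`. The following rough, untested sketch summarizes that skeleton, assembled from the code shown in these patches; the `ExampleCollector` type and the `example_metric` name are purely illustrative.

```go
package collectors

import (
	"encoding/json"
	"time"

	lp "github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric"
)

// Hypothetical collector-specific configuration, mirroring the BeeGFS collectors above.
type ExampleCollectorConfig struct {
	ExcludeMetrics []string `json:"exclude_metrics,omitempty"`
}

type ExampleCollector struct {
	metricCollector // embedded base type provides m.name, m.init, m.meta and m.setup()
	tags   map[string]string
	config ExampleCollectorConfig
}

func (m *ExampleCollector) Init(config json.RawMessage) error {
	// Check if already initialized
	if m.init {
		return nil
	}
	m.name = "ExampleCollector"
	m.setup()
	m.meta = map[string]string{"source": m.name, "group": "Example"}
	m.tags = map[string]string{"type": "node"}
	// Read collector-specific JSON configuration, if any
	if len(config) > 0 {
		if err := json.Unmarshal(config, &m.config); err != nil {
			return err
		}
	}
	m.init = true
	return nil
}

func (m *ExampleCollector) Read(interval time.Duration, output chan lp.CCMetric) {
	if !m.init {
		return
	}
	// Gather a value from the system here; the constant is only a placeholder.
	y, err := lp.New("example_metric", m.tags, m.meta, map[string]interface{}{"value": 42.0}, time.Now())
	if err == nil {
		output <- y
	}
}

func (m *ExampleCollector) Close() {
	m.init = false
}
```

Registering such a collector would mirror the `beegfs_meta` entry from PATCH 6/7, e.g. an `"example": new(ExampleCollector)` line in `AvailableCollectors`.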