From 43a8ea683dd398f6d3df9508c93a48c8f08debba Mon Sep 17 00:00:00 2001 From: Thomas Roehl Date: Thu, 6 Jan 2022 15:25:51 +0100 Subject: [PATCH 001/174] Cast collector measurement duration to seconds. Thanks to KIT --- collectors/likwidMetric.go | 2 +- metric-collector.go | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/collectors/likwidMetric.go b/collectors/likwidMetric.go index 2fd1129..34e2364 100644 --- a/collectors/likwidMetric.go +++ b/collectors/likwidMetric.go @@ -200,7 +200,7 @@ func (m *LikwidCollector) Read(interval time.Duration, out *[]lp.MutableMetric) res := C.perfmon_getLastResult(gid, eidx, C.int(tid)) m.results[i][tid][gctr] = float64(res) } - m.results[i][tid]["time"] = float64(interval) + m.results[i][tid]["time"] = interval.Seconds() m.results[i][tid]["inverseClock"] = float64(1.0 / m.basefreq) for _, metric := range evset.Metrics { expression, err := govaluate.NewEvaluableExpression(metric.Calc) diff --git a/metric-collector.go b/metric-collector.go index f6c8f5c..fd3b556 100644 --- a/metric-collector.go +++ b/metric-collector.go @@ -281,7 +281,7 @@ func main() { // storage locations for _, c := range config.Collectors { col := Collectors[c] - col.Read(time.Duration(config.Duration), &tmpPoints) + col.Read(time.Duration(config.Duration)*time.Second, &tmpPoints) for { if len(tmpPoints) == 0 { From 11e40c6ee3a11a5d1add7964e37813ec23c512c1 Mon Sep 17 00:00:00 2001 From: Holger Obermaier <40787752+ho-ob@users.noreply.github.com> Date: Wed, 19 Jan 2022 10:15:41 +0100 Subject: [PATCH 002/174] Add IB metrics ib_recv_pkts and ib_xmit_pkts --- collectors/infinibandMetric.go | 44 ++++++++++++++++++++++++++++++++++ 1 file changed, 44 insertions(+) diff --git a/collectors/infinibandMetric.go b/collectors/infinibandMetric.go index 93725d1..54c974e 100644 --- a/collectors/infinibandMetric.go +++ b/collectors/infinibandMetric.go @@ -47,6 +47,8 @@ func (m *InfinibandCollector) Help() { fmt.Println("Metrics:") fmt.Println("- ib_recv") 
fmt.Println("- ib_xmit") + fmt.Println("- ib_recv_pkts") + fmt.Println("- ib_xmit_pkts") } func (m *InfinibandCollector) Init(config []byte) error { @@ -143,6 +145,26 @@ func DoPerfQuery(cmd string, dev string, lid string, port string, tags map[strin } } } + if strings.HasPrefix(line, "PortRcvPkts") || strings.HasPrefix(line, "RcvPkts") { + lv := strings.Fields(line) + v, err := strconv.ParseFloat(lv[1], 64) + if err == nil { + y, err := lp.New("ib_recv_pkts", tags, map[string]interface{}{"value": float64(v)}, time.Now()) + if err == nil { + *out = append(*out, y) + } + } + } + if strings.HasPrefix(line, "PortXmitPkts") || strings.HasPrefix(line, "XmtPkts") { + lv := strings.Fields(line) + v, err := strconv.ParseFloat(lv[1], 64) + if err == nil { + y, err := lp.New("ib_xmit_pkts", tags, map[string]interface{}{"value": float64(v)}, time.Now()) + if err == nil { + *out = append(*out, y) + } + } + } } return nil } @@ -171,6 +193,28 @@ func DoSysfsRead(dev string, lid string, port string, tags map[string]string, ou } } } + buffer, err = ioutil.ReadFile(fmt.Sprintf("%s/port_rcv_packets", path)) + if err == nil { + data := strings.Replace(string(buffer), "\n", "", -1) + v, err := strconv.ParseFloat(data, 64) + if err == nil { + y, err := lp.New("ib_recv_pkts", tags, map[string]interface{}{"value": float64(v)}, time.Now()) + if err == nil { + *out = append(*out, y) + } + } + } + buffer, err = ioutil.ReadFile(fmt.Sprintf("%s/port_xmit_packets", path)) + if err == nil { + data := strings.Replace(string(buffer), "\n", "", -1) + v, err := strconv.ParseFloat(data, 64) + if err == nil { + y, err := lp.New("ib_xmit_pkts", tags, map[string]interface{}{"value": float64(v)}, time.Now()) + if err == nil { + *out = append(*out, y) + } + } + } return nil } From b97c5886600c1be0e130481ae9da65f56c268c59 Mon Sep 17 00:00:00 2001 From: Holger Obermaier <40787752+ho-ob@users.noreply.github.com> Date: Wed, 19 Jan 2022 14:25:24 +0100 Subject: [PATCH 003/174] Add GPFS / IBM Spectrum Scale 
collector --- collectors/gpfs.go | 298 ++++++++++++++++++++++++++++++++++++++++++++ metric-collector.go | 1 + 2 files changed, 299 insertions(+) create mode 100644 collectors/gpfs.go diff --git a/collectors/gpfs.go b/collectors/gpfs.go new file mode 100644 index 0000000..14398b4 --- /dev/null +++ b/collectors/gpfs.go @@ -0,0 +1,298 @@ +package collectors + +import ( + "bufio" + "bytes" + "encoding/json" + "fmt" + "io/ioutil" + "log" + "os" + "os/exec" + "os/user" + "strconv" + "strings" + "time" + + lp "github.com/influxdata/line-protocol" +) + +type GpfsCollectorConfig struct { + Mmpmon string `json:"mmpmon"` +} + +type GpfsCollector struct { + MetricCollector + config GpfsCollectorConfig +} + +func (m *GpfsCollector) Init(config []byte) error { + var err error + m.name = "GpfsCollector" + m.setup() + + // Set default mmpmon binary + m.config.Mmpmon = "/usr/lpp/mmfs/bin/mmpmon" + + // Read JSON configuration + if len(config) > 0 { + err = json.Unmarshal(config, &m.config) + if err != nil { + log.Print(err.Error()) + return err + } + } + + // GPFS / IBM Spectrum Scale file system statistics can only be queried by user root + user, err := user.Current() + if err != nil { + return fmt.Errorf("GpfsCollector.Init(): Failed to get current user: %v", err) + } + if user.Uid != "0" { + return fmt.Errorf("GpfsCollector.Init(): GPFS file system statistics can only be queried by user root") + } + + // Check if mmpmon is in executable search path + _, err = exec.LookPath(m.config.Mmpmon) + if err != nil { + return fmt.Errorf("GpfsCollector.Init(): Failed to find mmpmon binary '%s': %v", m.config.Mmpmon, err) + } + + m.init = true + return nil +} + +func (m *GpfsCollector) Read(interval time.Duration, out *[]lp.MutableMetric) { + if !m.init { + return + } + + // mmpmon: + // -p: generate output that can be parsed + // -s: suppress the prompt on input + // fs_io_s: Displays I/O statistics per mounted file system + cmd := exec.Command(m.config.Mmpmon, "-p", "-s") + cmd.Stdin = 
strings.NewReader("once fs_io_s\n") + cmdStdout := new(bytes.Buffer) + cmdStderr := new(bytes.Buffer) + cmd.Stdout = cmdStdout + cmd.Stderr = cmdStderr + err := cmd.Run() + if err != nil { + fmt.Fprintf(os.Stderr, "GpfsCollector.Read(): Failed to execute command \"%s\": %s\n", cmd.String(), err.Error()) + fmt.Fprintf(os.Stderr, "GpfsCollector.Read(): command exit code: \"%d\"\n", cmd.ProcessState.ExitCode()) + data, _ := ioutil.ReadAll(cmdStderr) + fmt.Fprintf(os.Stderr, "GpfsCollector.Read(): command stderr: \"%s\"\n", string(data)) + data, _ = ioutil.ReadAll(cmdStdout) + fmt.Fprintf(os.Stderr, "GpfsCollector.Read(): command stdout: \"%s\"\n", string(data)) + return + } + + // Read I/O statistics + scanner := bufio.NewScanner(cmdStdout) + for scanner.Scan() { + lineSplit := strings.Fields(scanner.Text()) + if lineSplit[0] == "_fs_io_s_" { + key_value := make(map[string]string) + for i := 1; i < len(lineSplit); i += 2 { + key_value[lineSplit[i]] = lineSplit[i+1] + } + + // Ignore keys: + // _n_: node IP address, + // _nn_: node name, + // _cl_: cluster name, + // _d_: number of disks + + filesystem, ok := key_value["_fs_"] + if !ok { + fmt.Fprintf(os.Stderr, "GpfsCollector.Read(): Failed to get filesystem name.\n") + continue + } + + // return code + rc, err := strconv.Atoi(key_value["_rc_"]) + if err != nil { + fmt.Fprintf(os.Stderr, "GpfsCollector.Read(): Failed to convert return code: %s\n", err.Error()) + continue + } + if rc != 0 { + fmt.Fprintf(os.Stderr, "GpfsCollector.Read(): Filesystem %s not ok.", filesystem) + continue + } + + // unix epoch in microseconds + timestampInt, err := strconv.ParseInt(key_value["_t_"]+key_value["_tu_"], 10, 64) + timestamp := time.UnixMicro(timestampInt) + if err != nil { + fmt.Fprintf(os.Stderr, + "GpfsCollector.Read(): Failed to convert time stamp '%s': %s\n", + key_value["_t_"]+key_value["_tu_"], err.Error()) + continue + } + + // bytes read + bytesRead, err := strconv.ParseInt(key_value["_br_"], 10, 64) + if err != nil { + 
fmt.Fprintf(os.Stderr, + "GpfsCollector.Read(): Failed to convert bytes read '%s': %s\n", + key_value["_br_"], err.Error()) + continue + } + y, err := lp.New( + "gpfs_bytes_read", + map[string]string{ + "filesystem": filesystem, + }, + map[string]interface{}{ + "value": bytesRead, + }, + timestamp) + if err == nil { + *out = append(*out, y) + } + + // bytes written + bytesWritten, err := strconv.ParseInt(key_value["_bw_"], 10, 64) + if err != nil { + fmt.Fprintf(os.Stderr, + "GpfsCollector.Read(): Failed to convert bytes written '%s': %s\n", + key_value["_bw_"], err.Error()) + continue + } + y, err = lp.New( + "gpfs_bytes_written", + map[string]string{ + "filesystem": filesystem, + }, + map[string]interface{}{ + "value": bytesWritten, + }, + timestamp) + if err == nil { + *out = append(*out, y) + } + + // number of opens + numOpens, err := strconv.ParseInt(key_value["_oc_"], 10, 64) + if err != nil { + fmt.Fprintf(os.Stderr, + "GpfsCollector.Read(): Failed to convert number of opens '%s': %s\n", + key_value["_oc_"], err.Error()) + continue + } + y, err = lp.New( + "gpfs_num_opens", + map[string]string{ + "filesystem": filesystem, + }, + map[string]interface{}{ + "value": numOpens, + }, + timestamp) + if err == nil { + *out = append(*out, y) + } + + // number of closes + numCloses, err := strconv.ParseInt(key_value["_cc_"], 10, 64) + if err != nil { + fmt.Fprintf(os.Stderr, "GpfsCollector.Read(): Failed to convert number of closes: %s\n", err.Error()) + continue + } + y, err = lp.New( + "gpfs_num_closes", + map[string]string{ + "filesystem": filesystem, + }, + map[string]interface{}{ + "value": numCloses, + }, + timestamp) + if err == nil { + *out = append(*out, y) + } + + // number of reads + numReads, err := strconv.ParseInt(key_value["_rdc_"], 10, 64) + if err != nil { + fmt.Fprintf(os.Stderr, "GpfsCollector.Read(): Failed to convert number of reads: %s\n", err.Error()) + continue + } + y, err = lp.New( + "gpfs_num_reads", + map[string]string{ + "filesystem": 
filesystem, + }, + map[string]interface{}{ + "value": numReads, + }, + timestamp) + if err == nil { + *out = append(*out, y) + } + + // number of writes + numWrites, err := strconv.ParseInt(key_value["_wc_"], 10, 64) + if err != nil { + fmt.Fprintf(os.Stderr, "GpfsCollector.Read(): Failed to convert number of writes: %s\n", err.Error()) + continue + } + y, err = lp.New( + "gpfs_num_writes", + map[string]string{ + "filesystem": filesystem, + }, + map[string]interface{}{ + "value": numWrites, + }, + timestamp) + if err == nil { + *out = append(*out, y) + } + + // number of read directories + numReaddirs, err := strconv.ParseInt(key_value["_dir_"], 10, 64) + if err != nil { + fmt.Fprintf(os.Stderr, "GpfsCollector.Read(): Failed to convert number of read directories: %s\n", err.Error()) + continue + } + y, err = lp.New( + "gpfs_num_readdirs", + map[string]string{ + "filesystem": filesystem, + }, + map[string]interface{}{ + "value": numReaddirs, + }, + timestamp) + if err == nil { + *out = append(*out, y) + } + + // Number of inode updates + numInodeUpdates, err := strconv.ParseInt(key_value["_iu_"], 10, 64) + if err != nil { + fmt.Fprintf(os.Stderr, "GpfsCollector.Read(): Failed to convert Number of inode updates: %s\n", err.Error()) + continue + } + y, err = lp.New( + "gpfs_num_inode_updates", + map[string]string{ + "filesystem": filesystem, + }, + map[string]interface{}{ + "value": numInodeUpdates, + }, + timestamp) + if err == nil { + *out = append(*out, y) + } + } + } +} + +func (m *GpfsCollector) Close() { + m.init = false + return +} diff --git a/metric-collector.go b/metric-collector.go index fd3b556..0b75675 100644 --- a/metric-collector.go +++ b/metric-collector.go @@ -32,6 +32,7 @@ var Collectors = map[string]collectors.MetricGetter{ "diskstat": &collectors.DiskstatCollector{}, "tempstat": &collectors.TempCollector{}, "ipmistat": &collectors.IpmiCollector{}, + "gpfs": &collectors.GpfsCollector{}, } var Sinks = map[string]sinks.SinkFuncs{ From 
82b10b365e5459f07889a5a2de6a2c2b68e7458c Mon Sep 17 00:00:00 2001 From: Holger Obermaier <40787752+ho-ob@users.noreply.github.com> Date: Wed, 19 Jan 2022 14:47:59 +0100 Subject: [PATCH 004/174] Fix to work with golang 1.16 --- collectors/gpfs.go | 3 +++ 1 file changed, 3 insertions(+) diff --git a/collectors/gpfs.go b/collectors/gpfs.go index 14398b4..db8a0d0 100644 --- a/collectors/gpfs.go +++ b/collectors/gpfs.go @@ -121,6 +121,7 @@ func (m *GpfsCollector) Read(interval time.Duration, out *[]lp.MutableMetric) { continue } + /* requires go 1.17 // unix epoch in microseconds timestampInt, err := strconv.ParseInt(key_value["_t_"]+key_value["_tu_"], 10, 64) timestamp := time.UnixMicro(timestampInt) @@ -130,6 +131,8 @@ func (m *GpfsCollector) Read(interval time.Duration, out *[]lp.MutableMetric) { key_value["_t_"]+key_value["_tu_"], err.Error()) continue } + */ + timestamp := time.Now() // bytes read bytesRead, err := strconv.ParseInt(key_value["_br_"], 10, 64) From 7b29a14e1a89ba515935ede4ba308681007a9321 Mon Sep 17 00:00:00 2001 From: Holger Obermaier <40787752+ho-ob@users.noreply.github.com> Date: Wed, 19 Jan 2022 15:35:12 +0100 Subject: [PATCH 005/174] Drop domain part of host name --- metric-collector.go | 2 ++ 1 file changed, 2 insertions(+) diff --git a/metric-collector.go b/metric-collector.go index 0b75675..04c221f 100644 --- a/metric-collector.go +++ b/metric-collector.go @@ -166,6 +166,8 @@ func main() { log.Print(err) return } + // Drop domain part of host name + host = strings.SplitN(host, `.`, 2)[0] clicfg := ReadCli() err = CreatePidfile(clicfg["pidfile"]) err = SetLogging(clicfg["logfile"]) From f17719113d3fe84d178b0a13b01e8066308b0df8 Mon Sep 17 00:00:00 2001 From: Holger Obermaier <40787752+ho-ob@users.noreply.github.com> Date: Wed, 19 Jan 2022 15:55:15 +0100 Subject: [PATCH 006/174] Updated to latest stable version of likwid --- collectors/Makefile | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/collectors/Makefile 
b/collectors/Makefile index ab47caa..0c637b5 100644 --- a/collectors/Makefile +++ b/collectors/Makefile @@ -8,9 +8,9 @@ ACCESSMODE = direct # if CENTRAL_INSTALL == true ####################################################################### # Path to central installation (if CENTRAL_INSTALL=true) -LIKWID_BASE=/apps/likwid/5.2.0 -# LIKWID version (should be same major version as central installation, 5.1.x) -LIKWID_VERSION = 5.2.0 +LIKWID_BASE=/apps/likwid/5.2.1 +# LIKWID version (should be same major version as central installation, 5.2.x) +LIKWID_VERSION = 5.2.1 ####################################################################### # if CENTRAL_INSTALL == false and ACCESSMODE == accessdaemon From dcb5b4add5a003b9bc4e70696995f46ad373a3d6 Mon Sep 17 00:00:00 2001 From: Holger Obermaier <40787752+ho-ob@users.noreply.github.com> Date: Wed, 19 Jan 2022 16:41:32 +0100 Subject: [PATCH 007/174] Define source code dependencies in Makefile --- Makefile | 21 ++++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) diff --git a/Makefile b/Makefile index e82685e..f49162e 100644 --- a/Makefile +++ b/Makefile @@ -1,20 +1,27 @@ APP = cc-metric-collector +GOSRC_APP := metric-collector.go +GOSRC_COLLECTORS := $(wildcard collectors/*.go) +GOSRC_SINKS := $(wildcard sinks/*.go) +GOSRC_RECEIVERS := $(wildcard receivers/*.go) +GOSRC := $(GOSRC_APP) $(GOSRC_COLLECTORS) $(GOSRC_SINKS) $(GOSRC_RECEIVERS) +.PHONY: all all: $(APP) -$(APP): metric-collector.go +$(APP): $(GOSRC) make -C collectors go get - go build -o $(APP) metric-collector.go + go build -o $(APP) $(GOSRC_APP) +.PHONY: clean clean: make -C collectors clean rm -f $(APP) +.PHONY: fmt fmt: - go fmt collectors/*.go - go fmt sinks/*.go - go fmt receivers/*.go - go fmt metric-collector.go + go fmt $(GOSRC_COLLECTORS) + go fmt $(GOSRC_SINKS) + go fmt $(GOSRC_RECEIVERS) + go fmt $(GOSRC_APP) -.PHONY: clean From f91150f4ba9db90b038d3414df37cff7d0e488db Mon Sep 17 00:00:00 2001 From: Holger Obermaier 
<40787752+ho-ob@users.noreply.github.com> Date: Thu, 20 Jan 2022 12:09:22 +0100 Subject: [PATCH 008/174] Add vet and staticcheck make targets --- Makefile | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/Makefile b/Makefile index f49162e..91a1200 100644 --- a/Makefile +++ b/Makefile @@ -25,3 +25,14 @@ fmt: go fmt $(GOSRC_RECEIVERS) go fmt $(GOSRC_APP) +# Examine Go source code and reports suspicious constructs +.PHONY: vet + go vet ./... + + +# Run linter for the Go programming language. +# Using static analysis, it finds bugs and performance issues, offers simplifications, and enforces style rules +.PHONY: staticcheck +staticcheck: + go install honnef.co/go/tools/cmd/staticcheck@latest + $$(go env GOPATH)/bin/staticcheck ./... From 8860b8d0f76408fc078414f38be0d4311d726d78 Mon Sep 17 00:00:00 2001 From: Holger Obermaier <40787752+ho-ob@users.noreply.github.com> Date: Thu, 20 Jan 2022 12:13:50 +0100 Subject: [PATCH 009/174] Add vet and staticcheck make targets --- Makefile | 1 + 1 file changed, 1 insertion(+) diff --git a/Makefile b/Makefile index 91a1200..892bbcc 100644 --- a/Makefile +++ b/Makefile @@ -27,6 +27,7 @@ fmt: # Examine Go source code and reports suspicious constructs .PHONY: vet +vet: go vet ./... 
From 5d263adddec4b23fc6299fa99fc017615ba3b163 Mon Sep 17 00:00:00 2001 From: Holger Obermaier <40787752+ho-ob@users.noreply.github.com> Date: Thu, 20 Jan 2022 12:38:52 +0100 Subject: [PATCH 010/174] Avoid go vet warning: struct field tag `json:"..., omitempty"` not compatible with reflect.StructTag.Get: suspicious space in struct tag value --- collectors/cpustatMetric.go | 2 +- collectors/diskstatMetric.go | 2 +- collectors/infinibandMetric.go | 2 +- collectors/loadavgMetric.go | 2 +- collectors/nvidiaMetric.go | 4 ++-- 5 files changed, 6 insertions(+), 6 deletions(-) diff --git a/collectors/cpustatMetric.go b/collectors/cpustatMetric.go index fe31c3c..9e44fa7 100644 --- a/collectors/cpustatMetric.go +++ b/collectors/cpustatMetric.go @@ -13,7 +13,7 @@ import ( const CPUSTATFILE = `/proc/stat` type CpustatCollectorConfig struct { - ExcludeMetrics []string `json:"exclude_metrics, omitempty"` + ExcludeMetrics []string `json:"exclude_metrics,omitempty"` } type CpustatCollector struct { diff --git a/collectors/diskstatMetric.go b/collectors/diskstatMetric.go index e2d2f25..5080ca2 100644 --- a/collectors/diskstatMetric.go +++ b/collectors/diskstatMetric.go @@ -15,7 +15,7 @@ const DISKSTATFILE = `/proc/diskstats` const DISKSTAT_SYSFSPATH = `/sys/block` type DiskstatCollectorConfig struct { - ExcludeMetrics []string `json:"exclude_metrics, omitempty"` + ExcludeMetrics []string `json:"exclude_metrics,omitempty"` } type DiskstatCollector struct { diff --git a/collectors/infinibandMetric.go b/collectors/infinibandMetric.go index 54c974e..a9552f7 100644 --- a/collectors/infinibandMetric.go +++ b/collectors/infinibandMetric.go @@ -20,7 +20,7 @@ const LIDFILE = `/sys/class/infiniband/mlx4_0/ports/1/lid` const PERFQUERY = `/usr/sbin/perfquery` type InfinibandCollectorConfig struct { - ExcludeDevices []string `json:"exclude_devices, omitempty"` + ExcludeDevices []string `json:"exclude_devices,omitempty"` PerfQueryPath string `json:"perfquery_path"` } diff --git 
a/collectors/loadavgMetric.go b/collectors/loadavgMetric.go index dbccf22..21cf350 100644 --- a/collectors/loadavgMetric.go +++ b/collectors/loadavgMetric.go @@ -12,7 +12,7 @@ import ( const LOADAVGFILE = `/proc/loadavg` type LoadavgCollectorConfig struct { - ExcludeMetrics []string `json:"exclude_metrics, omitempty"` + ExcludeMetrics []string `json:"exclude_metrics,omitempty"` } type LoadavgCollector struct { diff --git a/collectors/nvidiaMetric.go b/collectors/nvidiaMetric.go index 4597610..bd63e2c 100644 --- a/collectors/nvidiaMetric.go +++ b/collectors/nvidiaMetric.go @@ -11,8 +11,8 @@ import ( ) type NvidiaCollectorConfig struct { - ExcludeMetrics []string `json:"exclude_metrics, omitempty"` - ExcludeDevices []string `json:"exclude_devices, omitempty"` + ExcludeMetrics []string `json:"exclude_metrics,omitempty"` + ExcludeDevices []string `json:"exclude_devices,omitempty"` } type NvidiaCollector struct { From 0feb880c3b65cd01547de519109d535755f65a47 Mon Sep 17 00:00:00 2001 From: Holger Obermaier <40787752+ho-ob@users.noreply.github.com> Date: Thu, 20 Jan 2022 16:32:10 +0100 Subject: [PATCH 011/174] Correct go syntax in README.md --- collectors/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/collectors/README.md b/collectors/README.md index b5ae4e1..df02dd6 100644 --- a/collectors/README.md +++ b/collectors/README.md @@ -339,7 +339,7 @@ func (m *SampleCollector) Read(interval time.Duration, out *[]lp.MutableMetric) return } // tags for the metric, if type != node use proper type and type-id - tags := map[string][string]{"type" : "node"} + tags := map[string]string{"type" : "node"} // Each metric has exactly one field: value ! 
value := map[string]interface{}{"value": int(x)} y, err := lp.New("sample_metric", tags, value, time.Now()) From 83b784e6f0315f2835c18c1fb494c150e28418c9 Mon Sep 17 00:00:00 2001 From: Holger Obermaier <40787752+ho-ob@users.noreply.github.com> Date: Fri, 21 Jan 2022 09:59:57 +0100 Subject: [PATCH 012/174] Add CPU frequency collector --- collectors/cpufreqMetric.go | 189 ++++++++++++++++++++++++++++++++++++ metric-collector.go | 12 ++- 2 files changed, 196 insertions(+), 5 deletions(-) create mode 100644 collectors/cpufreqMetric.go diff --git a/collectors/cpufreqMetric.go b/collectors/cpufreqMetric.go new file mode 100644 index 0000000..94f8f4a --- /dev/null +++ b/collectors/cpufreqMetric.go @@ -0,0 +1,189 @@ +package collectors + +import ( + "bufio" + "encoding/json" + "fmt" + "log" + "os" + "path/filepath" + "strconv" + "strings" + "time" + + lp "github.com/influxdata/line-protocol" + "golang.org/x/sys/unix" +) + +var warnLog *log.Logger = log.New(os.Stderr, "Warning: ", log.LstdFlags) + +// +// readOneLine reads one line from a file. +// It returns ok when file was successfully read. +// In this case text contains the first line of the files contents. 
+// +func readOneLine(filename string) (text string, ok bool) { + file, err := os.Open(filename) + if err != nil { + return + } + defer file.Close() + scanner := bufio.NewScanner(file) + ok = scanner.Scan() + text = scanner.Text() + return +} + +type CPUFreqCollectorCPU struct { + // coreID, packageID, num_cores, num_package + tagSet map[string]string + scalingCurFreqFile string +} + +// +// CPUFreqCollector +// a metric collector to measure the current frequency of the CPUs +// as obtained from the hardware (in KHz) +// Only measure on the first hyper thread +// +// See: https://www.kernel.org/doc/html/latest/admin-guide/pm/cpufreq.html +// +type CPUFreqCollector struct { + MetricCollector + config struct { + ExcludeMetrics []string `json:"exclude_metrics,omitempty"` + } + cpus []CPUFreqCollectorCPU +} + +func (m *CPUFreqCollector) Init(config []byte) error { + m.name = "CPUFreqCollector" + m.setup() + if len(config) > 0 { + err := json.Unmarshal(config, &m.config) + if err != nil { + return err + } + } + + // Initialize CPU list + m.cpus = make([]CPUFreqCollectorCPU, 0) + + // Loop for all CPU directories + baseDir := "/sys/devices/system/cpu" + globPattern := filepath.Join(baseDir, "cpu[0-9]*") + cpuDirs, err := filepath.Glob(globPattern) + if err != nil { + return fmt.Errorf("CPUFreqCollector.Init() unable to glob files with pattern %s: %v", globPattern, err) + } + if cpuDirs == nil { + return fmt.Errorf("CPUFreqCollector.Init() unable to find any files with pattern %s", globPattern) + } + + maxPackageID := 0 + maxCoreID := 0 + for _, cpuDir := range cpuDirs { + cpuID := strings.TrimPrefix(cpuDir, "/sys/devices/system/cpu/cpu") + + // Read thread sibling list + threadSiblingListFile := filepath.Join(cpuDir, "topology", "thread_siblings_list") + threadSiblingList, ok := readOneLine(threadSiblingListFile) + if !ok { + return fmt.Errorf("CPUFreqCollector.Init() unable to read thread siblings list from %s", threadSiblingListFile) + } + + // Read frequency only from 
first hardware thread + // Ignore Simultaneous Multithreading (SMT) / Hyper-Threading + if strings.Split(threadSiblingList, ",")[0] == cpuID { + // Read package ID + packageIDFile := filepath.Join(cpuDir, "topology", "physical_package_id") + packageID, ok := readOneLine(packageIDFile) + if !ok { + return fmt.Errorf("CPUFreqCollector.Init() unable to read physical package ID from %s", packageIDFile) + } + packageID_int, err := strconv.Atoi(packageID) + if err != nil { + return fmt.Errorf("CPUFreqCollector.Init() unable to convert packageID to int: %v", err) + } + + // Update maxPackageID + if packageID_int > maxPackageID { + maxPackageID = packageID_int + } + + // Read core ID + coreIDFile := filepath.Join(cpuDir, "topology", "core_id") + coreID, ok := readOneLine(coreIDFile) + if !ok { + return fmt.Errorf("CPUFreqCollector.Init() unable to read core ID from %s", coreIDFile) + } + coreID_int, err := strconv.Atoi(coreID) + if err != nil { + return fmt.Errorf("CPUFreqCollector.Init() unable to convert coreID to int: %v", err) + } + + // Update maxCoreID + if coreID_int > maxCoreID { + maxCoreID = coreID_int + } + + // Check access to current frequency file + scalingCurFreqFile := filepath.Join(cpuDir, "cpufreq", "scaling_cur_freq") + err = unix.Access(scalingCurFreqFile, unix.R_OK) + if err != nil { + return fmt.Errorf("CPUFreqCollector.Init() unable to access %s: %v", scalingCurFreqFile, err) + } + + m.cpus = append( + m.cpus, + CPUFreqCollectorCPU{ + tagSet: map[string]string{ + "coreID": strings.TrimSpace(coreID), + "packageID": strings.TrimSpace(packageID), + }, + scalingCurFreqFile: scalingCurFreqFile, + }) + } + } + + // Add num packages and num cores as tags + numPackages := strconv.Itoa(maxPackageID + 1) + numCores := strconv.Itoa(maxCoreID + 1) + for i := range m.cpus { + m.cpus[i].tagSet["num_core"] = numCores + m.cpus[i].tagSet["num_package"] = numPackages + } + + m.init = true + return nil +} + +func (m *CPUFreqCollector) Read(interval time.Duration, out 
*[]lp.MutableMetric) { + if !m.init { + return + } + + for _, cpu := range m.cpus { + // Read current frequency + line, ok := readOneLine(cpu.scalingCurFreqFile) + if !ok { + warnLog.Printf("CPUFreqCollector.Read(): Failed to read one line from file '%s'", cpu.scalingCurFreqFile) + continue + } + cpuFreq, err := strconv.Atoi(line) + if err != nil { + warnLog.Printf("CPUFreqCollector.Read(): Failed to convert CPU frequency '%s': %v", line, err) + continue + } + + value := map[string]interface{}{"value": cpuFreq} + y, err := lp.New("cpufreq", cpu.tagSet, value, time.Now()) + if err == nil { + *out = append(*out, y) + } + } +} + +func (m *CPUFreqCollector) Close() { + m.init = false +} diff --git a/metric-collector.go b/metric-collector.go index 04c221f..90f50c4 100644 --- a/metric-collector.go +++ b/metric-collector.go @@ -4,16 +4,17 @@ import ( "encoding/json" "flag" "fmt" - "github.com/ClusterCockpit/cc-metric-collector/collectors" - "github.com/ClusterCockpit/cc-metric-collector/receivers" - "github.com/ClusterCockpit/cc-metric-collector/sinks" - lp "github.com/influxdata/line-protocol" "log" "os" "os/signal" "strings" "sync" "time" + + "github.com/ClusterCockpit/cc-metric-collector/collectors" + "github.com/ClusterCockpit/cc-metric-collector/receivers" + "github.com/ClusterCockpit/cc-metric-collector/sinks" + lp "github.com/influxdata/line-protocol" ) // List of provided collectors. 
Which collector should be run can be @@ -32,7 +33,8 @@ var Collectors = map[string]collectors.MetricGetter{ "diskstat": &collectors.DiskstatCollector{}, "tempstat": &collectors.TempCollector{}, "ipmistat": &collectors.IpmiCollector{}, - "gpfs": &collectors.GpfsCollector{}, + "gpfs": new(collectors.GpfsCollector), + "cpufreq": new(collectors.CPUFreqCollector), } var Sinks = map[string]sinks.SinkFuncs{ From 5dd2af4e8fcce87f80a2fde5751de7919e2689c0 Mon Sep 17 00:00:00 2001 From: Holger Obermaier <40787752+ho-ob@users.noreply.github.com> Date: Fri, 21 Jan 2022 14:35:52 +0100 Subject: [PATCH 013/174] Avoid staticcheck warning: redundant return statement --- collectors/cpustatMetric.go | 4 ++-- collectors/customCmdMetric.go | 4 ++-- collectors/diskstatMetric.go | 6 +++--- collectors/{gpfs.go => gpfsMetric.go} | 9 +++------ collectors/infinibandMetric.go | 5 +++-- collectors/ipmiMetric.go | 4 ++-- collectors/likwidMetric.go | 6 +++--- collectors/loadavgMetric.go | 4 ++-- collectors/lustreMetric.go | 4 ++-- collectors/memstatMetric.go | 4 ++-- collectors/netstatMetric.go | 4 ++-- collectors/nvidiaMetric.go | 6 +++--- collectors/tempMetric.go | 4 ++-- collectors/topprocsMetric.go | 4 ++-- sinks/stdoutSink.go | 4 +--- 15 files changed, 34 insertions(+), 38 deletions(-) rename collectors/{gpfs.go => gpfsMetric.go} (98%) diff --git a/collectors/cpustatMetric.go b/collectors/cpustatMetric.go index 9e44fa7..64b5842 100644 --- a/collectors/cpustatMetric.go +++ b/collectors/cpustatMetric.go @@ -3,11 +3,12 @@ package collectors import ( "encoding/json" "fmt" - lp "github.com/influxdata/line-protocol" "io/ioutil" "strconv" "strings" "time" + + lp "github.com/influxdata/line-protocol" ) const CPUSTATFILE = `/proc/stat` @@ -88,5 +89,4 @@ func (m *CpustatCollector) Read(interval time.Duration, out *[]lp.MutableMetric) func (m *CpustatCollector) Close() { m.init = false - return } diff --git a/collectors/customCmdMetric.go b/collectors/customCmdMetric.go index 547bb87..bbafc2d 100644 
--- a/collectors/customCmdMetric.go +++ b/collectors/customCmdMetric.go @@ -3,12 +3,13 @@ package collectors import ( "encoding/json" "errors" - lp "github.com/influxdata/line-protocol" "io/ioutil" "log" "os/exec" "strings" "time" + + lp "github.com/influxdata/line-protocol" ) const CUSTOMCMDPATH = `/home/unrz139/Work/cc-metric-collector/collectors/custom` @@ -126,5 +127,4 @@ func (m *CustomCmdCollector) Read(interval time.Duration, out *[]lp.MutableMetri func (m *CustomCmdCollector) Close() { m.init = false - return } diff --git a/collectors/diskstatMetric.go b/collectors/diskstatMetric.go index 5080ca2..4cbd3c6 100644 --- a/collectors/diskstatMetric.go +++ b/collectors/diskstatMetric.go @@ -1,8 +1,10 @@ package collectors import ( - lp "github.com/influxdata/line-protocol" "io/ioutil" + + lp "github.com/influxdata/line-protocol" + // "log" "encoding/json" "errors" @@ -107,10 +109,8 @@ func (m *DiskstatCollector) Read(interval time.Duration, out *[]lp.MutableMetric } } } - return } func (m *DiskstatCollector) Close() { m.init = false - return } diff --git a/collectors/gpfs.go b/collectors/gpfsMetric.go similarity index 98% rename from collectors/gpfs.go rename to collectors/gpfsMetric.go index db8a0d0..fbf3a63 100644 --- a/collectors/gpfs.go +++ b/collectors/gpfsMetric.go @@ -17,13 +17,11 @@ import ( lp "github.com/influxdata/line-protocol" ) -type GpfsCollectorConfig struct { - Mmpmon string `json:"mmpmon"` -} - type GpfsCollector struct { MetricCollector - config GpfsCollectorConfig + config struct { + Mmpmon string `json:"mmpmon"` + } } func (m *GpfsCollector) Init(config []byte) error { @@ -297,5 +295,4 @@ func (m *GpfsCollector) Read(interval time.Duration, out *[]lp.MutableMetric) { func (m *GpfsCollector) Close() { m.init = false - return } diff --git a/collectors/infinibandMetric.go b/collectors/infinibandMetric.go index a9552f7..6e14251 100644 --- a/collectors/infinibandMetric.go +++ b/collectors/infinibandMetric.go @@ -2,10 +2,12 @@ package collectors 
import ( "fmt" - lp "github.com/influxdata/line-protocol" "io/ioutil" "log" "os/exec" + + lp "github.com/influxdata/line-protocol" + // "os" "encoding/json" "errors" @@ -278,5 +280,4 @@ func (m *InfinibandCollector) Read(interval time.Duration, out *[]lp.MutableMetr func (m *InfinibandCollector) Close() { m.init = false - return } diff --git a/collectors/ipmiMetric.go b/collectors/ipmiMetric.go index d28a134..3179148 100644 --- a/collectors/ipmiMetric.go +++ b/collectors/ipmiMetric.go @@ -3,13 +3,14 @@ package collectors import ( "encoding/json" "errors" - lp "github.com/influxdata/line-protocol" "log" "os" "os/exec" "strconv" "strings" "time" + + lp "github.com/influxdata/line-protocol" ) const IPMITOOL_PATH = `/usr/bin/ipmitool` @@ -133,5 +134,4 @@ func (m *IpmiCollector) Read(interval time.Duration, out *[]lp.MutableMetric) { func (m *IpmiCollector) Close() { m.init = false - return } diff --git a/collectors/likwidMetric.go b/collectors/likwidMetric.go index 34e2364..454a593 100644 --- a/collectors/likwidMetric.go +++ b/collectors/likwidMetric.go @@ -12,8 +12,6 @@ import ( "encoding/json" "errors" "fmt" - lp "github.com/influxdata/line-protocol" - "gopkg.in/Knetic/govaluate.v2" "io/ioutil" "log" "math" @@ -22,6 +20,9 @@ import ( "strings" "time" "unsafe" + + lp "github.com/influxdata/line-protocol" + "gopkg.in/Knetic/govaluate.v2" ) type LikwidCollectorMetricConfig struct { @@ -303,5 +304,4 @@ func (m *LikwidCollector) Close() { C.perfmon_finalize() C.topology_finalize() } - return } diff --git a/collectors/loadavgMetric.go b/collectors/loadavgMetric.go index 21cf350..1ecaea5 100644 --- a/collectors/loadavgMetric.go +++ b/collectors/loadavgMetric.go @@ -2,11 +2,12 @@ package collectors import ( "encoding/json" - lp "github.com/influxdata/line-protocol" "io/ioutil" "strconv" "strings" "time" + + lp "github.com/influxdata/line-protocol" ) const LOADAVGFILE = `/proc/loadavg` @@ -76,5 +77,4 @@ func (m *LoadavgCollector) Read(interval time.Duration, out 
*[]lp.MutableMetric) func (m *LoadavgCollector) Close() { m.init = false - return } diff --git a/collectors/lustreMetric.go b/collectors/lustreMetric.go index e7bb7a6..d77ac09 100644 --- a/collectors/lustreMetric.go +++ b/collectors/lustreMetric.go @@ -3,12 +3,13 @@ package collectors import ( "encoding/json" "errors" - lp "github.com/influxdata/line-protocol" "io/ioutil" "log" "strconv" "strings" "time" + + lp "github.com/influxdata/line-protocol" ) const LUSTREFILE = `/proc/fs/lustre/llite/lnec-XXXXXX/stats` @@ -102,5 +103,4 @@ func (m *LustreCollector) Read(interval time.Duration, out *[]lp.MutableMetric) func (m *LustreCollector) Close() { m.init = false - return } diff --git a/collectors/memstatMetric.go b/collectors/memstatMetric.go index 91987bb..17db13e 100644 --- a/collectors/memstatMetric.go +++ b/collectors/memstatMetric.go @@ -4,12 +4,13 @@ import ( "encoding/json" "errors" "fmt" - lp "github.com/influxdata/line-protocol" "io/ioutil" "log" "strconv" "strings" "time" + + lp "github.com/influxdata/line-protocol" ) const MEMSTATFILE = `/proc/meminfo` @@ -125,5 +126,4 @@ func (m *MemstatCollector) Read(interval time.Duration, out *[]lp.MutableMetric) func (m *MemstatCollector) Close() { m.init = false - return } diff --git a/collectors/netstatMetric.go b/collectors/netstatMetric.go index 659b89f..a273de1 100644 --- a/collectors/netstatMetric.go +++ b/collectors/netstatMetric.go @@ -2,12 +2,13 @@ package collectors import ( "encoding/json" - lp "github.com/influxdata/line-protocol" "io/ioutil" "log" "strconv" "strings" "time" + + lp "github.com/influxdata/line-protocol" ) const NETSTATFILE = `/proc/net/dev` @@ -84,5 +85,4 @@ func (m *NetstatCollector) Read(interval time.Duration, out *[]lp.MutableMetric) func (m *NetstatCollector) Close() { m.init = false - return } diff --git a/collectors/nvidiaMetric.go b/collectors/nvidiaMetric.go index bd63e2c..31118c2 100644 --- a/collectors/nvidiaMetric.go +++ b/collectors/nvidiaMetric.go @@ -4,10 +4,11 @@ import ( 
"encoding/json" "errors" "fmt" - "github.com/NVIDIA/go-nvml/pkg/nvml" - lp "github.com/influxdata/line-protocol" "log" "time" + + "github.com/NVIDIA/go-nvml/pkg/nvml" + lp "github.com/influxdata/line-protocol" ) type NvidiaCollectorConfig struct { @@ -267,5 +268,4 @@ func (m *NvidiaCollector) Close() { nvml.Shutdown() m.init = false } - return } diff --git a/collectors/tempMetric.go b/collectors/tempMetric.go index 3665025..b074d78 100644 --- a/collectors/tempMetric.go +++ b/collectors/tempMetric.go @@ -3,13 +3,14 @@ package collectors import ( "encoding/json" "fmt" - lp "github.com/influxdata/line-protocol" "io/ioutil" "os" "path/filepath" "strconv" "strings" "time" + + lp "github.com/influxdata/line-protocol" ) const HWMON_PATH = `/sys/class/hwmon` @@ -105,5 +106,4 @@ func (m *TempCollector) Read(interval time.Duration, out *[]lp.MutableMetric) { func (m *TempCollector) Close() { m.init = false - return } diff --git a/collectors/topprocsMetric.go b/collectors/topprocsMetric.go index a1bf989..e1b31ee 100644 --- a/collectors/topprocsMetric.go +++ b/collectors/topprocsMetric.go @@ -4,11 +4,12 @@ import ( "encoding/json" "errors" "fmt" - lp "github.com/influxdata/line-protocol" "log" "os/exec" "strings" "time" + + lp "github.com/influxdata/line-protocol" ) const MAX_NUM_PROCS = 10 @@ -74,5 +75,4 @@ func (m *TopProcsCollector) Read(interval time.Duration, out *[]lp.MutableMetric func (m *TopProcsCollector) Close() { m.init = false - return } diff --git a/sinks/stdoutSink.go b/sinks/stdoutSink.go index 34561e0..8016fcb 100644 --- a/sinks/stdoutSink.go +++ b/sinks/stdoutSink.go @@ -59,6 +59,4 @@ func (s *StdoutSink) Flush() error { return nil } -func (s *StdoutSink) Close() { - return -} +func (s *StdoutSink) Close() {} From 25b9268b24f3458dab0128a8af36df217039528e Mon Sep 17 00:00:00 2001 From: Holger Obermaier <40787752+ho-ob@users.noreply.github.com> Date: Fri, 21 Jan 2022 15:20:53 +0100 Subject: [PATCH 014/174] Avoid staticcheck warning: unnecessary assignment to 
the blank identifier --- collectors/likwidMetric.go | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/collectors/likwidMetric.go b/collectors/likwidMetric.go index 454a593..45fe68c 100644 --- a/collectors/likwidMetric.go +++ b/collectors/likwidMetric.go @@ -152,7 +152,7 @@ func (m *LikwidCollector) Init(config []byte) error { C.free(unsafe.Pointer(cstr)) m.results[i] = make(map[int]map[string]interface{}) m.mresults[i] = make(map[int]map[string]float64) - for tid, _ := range m.cpulist { + for tid := range m.cpulist { m.results[i][tid] = make(map[string]interface{}) m.mresults[i][tid] = make(map[string]float64) m.gmresults[tid] = make(map[string]float64) @@ -194,7 +194,7 @@ func (m *LikwidCollector) Read(interval time.Duration, out *[]lp.MutableMetric) continue } var eidx C.int - for tid, _ := range m.cpulist { + for tid := range m.cpulist { for eidx = 0; int(eidx) < len(evset.Events); eidx++ { ctr := C.perfmon_getCounterName(gid, eidx) gctr := C.GoString(ctr) @@ -220,7 +220,7 @@ func (m *LikwidCollector) Read(interval time.Duration, out *[]lp.MutableMetric) } for _, metric := range m.config.Metrics { - for tid, _ := range m.cpulist { + for tid := range m.cpulist { var params map[string]interface{} expression, err := govaluate.NewEvaluableExpression(metric.Calc) if err != nil { @@ -228,7 +228,7 @@ func (m *LikwidCollector) Read(interval time.Duration, out *[]lp.MutableMetric) continue } params = make(map[string]interface{}) - for j, _ := range m.groups { + for j := range m.groups { for mname, mres := range m.mresults[j][tid] { params[mname] = mres } @@ -241,7 +241,7 @@ func (m *LikwidCollector) Read(interval time.Duration, out *[]lp.MutableMetric) m.gmresults[tid][metric.Name] = float64(result.(float64)) } } - for i, _ := range m.groups { + for i := range m.groups { evset := m.config.Eventsets[i] for _, metric := range evset.Metrics { _, skip := stringArrayContains(m.config.ExcludeMetrics, metric.Name) From 
daa7c6bf99135c66cbe86df46595d2b98d9e1c20 Mon Sep 17 00:00:00 2001 From: Holger Obermaier <40787752+ho-ob@users.noreply.github.com> Date: Mon, 24 Jan 2022 11:31:45 +0100 Subject: [PATCH 015/174] Simplified code --- collectors/cpufreqMetric.go | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/collectors/cpufreqMetric.go b/collectors/cpufreqMetric.go index 94f8f4a..f5a10bc 100644 --- a/collectors/cpufreqMetric.go +++ b/collectors/cpufreqMetric.go @@ -15,8 +15,6 @@ import ( "golang.org/x/sys/unix" ) -var warnLog *log.Logger = log.New(os.Stderr, "Warning: ", log.LstdFlags) - // // readOneLine reads one line from a file. // It returns ok when file was successfully read. @@ -138,7 +136,8 @@ func (m *CPUFreqCollector) Init(config []byte) error { m.cpus, CPUFreqCollectorCPU{ tagSet: map[string]string{ - "coreID": strings.TrimSpace(coreID), + "type": "cpu", + "type-id": strings.TrimSpace(coreID), "packageID": strings.TrimSpace(packageID), }, scalingCurFreqFile: scalingCurFreqFile, @@ -150,8 +149,9 @@ func (m *CPUFreqCollector) Init(config []byte) error { numPackages := strconv.Itoa(maxPackageID + 1) numCores := strconv.Itoa(maxCoreID + 1) for i := range m.cpus { - m.cpus[i].tagSet["num_core"] = numCores - m.cpus[i].tagSet["num_package"] = numPackages + c := &m.cpus[i] + c.tagSet["num_core"] = numCores + c.tagSet["num_package"] = numPackages } m.init = true @@ -163,21 +163,23 @@ func (m *CPUFreqCollector) Read(interval time.Duration, out *[]lp.MutableMetric) return } - for _, cpu := range m.cpus { + now := time.Now() + for i := range m.cpus { + cpu := &m.cpus[i] + // Read current frequency line, ok := readOneLine(cpu.scalingCurFreqFile) if !ok { - warnLog.Printf("CPUFreqCollector.Read(): Failed to read one line from file '%s'", cpu.scalingCurFreqFile) + log.Printf("CPUFreqCollector.Read(): Failed to read one line from file '%s'", cpu.scalingCurFreqFile) continue } cpuFreq, err := strconv.Atoi(line) if err != nil { - 
warnLog.Printf("CPUFreqCollector.Read(): Failed to convert CPU frequency '%s': %v", line, err) + log.Printf("CPUFreqCollector.Read(): Failed to convert CPU frequency '%s': %v", line, err) continue } - value := map[string]interface{}{"value": cpuFreq} - y, err := lp.New("cpufreq", cpu.tagSet, value, time.Now()) + y, err := lp.New("cpufreq", cpu.tagSet, map[string]interface{}{"value": cpuFreq}, now) if err == nil { *out = append(*out, y) } From 8d314ecb19c18353f417e8da38de95a7a6ad9c86 Mon Sep 17 00:00:00 2001 From: Holger Obermaier <40787752+ho-ob@users.noreply.github.com> Date: Mon, 24 Jan 2022 13:10:33 +0100 Subject: [PATCH 016/174] Add CPUFreqCollectorCpuinfo a metric collector to measure the current frequency of the CPUs as obtained from /proc/cpuinfo Only measure on the first hyperthread --- collectors/cpufreqCpuinfoMetric.go | 176 +++++++++++++++++++++++++++++ metric-collector.go | 31 ++--- 2 files changed, 192 insertions(+), 15 deletions(-) create mode 100644 collectors/cpufreqCpuinfoMetric.go diff --git a/collectors/cpufreqCpuinfoMetric.go b/collectors/cpufreqCpuinfoMetric.go new file mode 100644 index 0000000..1658878 --- /dev/null +++ b/collectors/cpufreqCpuinfoMetric.go @@ -0,0 +1,176 @@ +package collectors + +import ( + "bufio" + "fmt" + "log" + "os" + "strconv" + "strings" + "time" + + lp "github.com/influxdata/line-protocol" +) + +// +// CPUFreqCollector +// a metric collector to measure the current frequency of the CPUs +// as obtained from /proc/cpuinfo +// Only measure on the first hyperthread +// +type CPUFreqCpuInfoCollectorTopology struct { + processor string // logical processor number (continuous, starting at 0) + coreID string // socket local core ID + physicalID string // socket / package ID + numPhysicalID string // number of sockets / packages + isHT bool + numNonHT string // number of non hyperthreading processors + tagSet map[string]string +} + +type CPUFreqCpuInfoCollector struct { + MetricCollector + topology 
[]CPUFreqCpuInfoCollectorTopology +} + +func (m *CPUFreqCpuInfoCollector) Init(config []byte) error { + m.name = "CPUFreqCpuInfoCollector" + + const cpuInfoFile = "/proc/cpuinfo" + file, err := os.Open(cpuInfoFile) + if err != nil { + return fmt.Errorf("Failed to open '%s': %v", cpuInfoFile, err) + } + defer file.Close() + + // Collect topology information from file cpuinfo + foundFreq := false + processor := "" + numNonHT := 0 + coreID := "" + physicalID := "" + maxPhysicalID := 0 + m.topology = make([]CPUFreqCpuInfoCollectorTopology, 0) + coreSeenBefore := make(map[string]bool) + scanner := bufio.NewScanner(file) + for scanner.Scan() { + lineSplit := strings.Split(scanner.Text(), ":") + if len(lineSplit) == 2 { + key := strings.TrimSpace(lineSplit[0]) + value := strings.TrimSpace(lineSplit[1]) + switch key { + case "cpu MHz": + // frequency + foundFreq = true + case "processor": + processor = value + case "core id": + coreID = value + case "physical id": + physicalID = value + } + } + + // were all topology information collected? 
+ if foundFreq && + len(processor) > 0 && + len(coreID) > 0 && + len(physicalID) > 0 { + + globalID := physicalID + ":" + coreID + isHT := coreSeenBefore[globalID] + coreSeenBefore[globalID] = true + if !isHT { + // increase number on non hyper thread cores + numNonHT++ + + // increase maximun socket / package ID, when required + physicalIDInt, err := strconv.Atoi(physicalID) + if err != nil { + return fmt.Errorf("Failed to convert physical id to int: %v", err) + } + if physicalIDInt > maxPhysicalID { + maxPhysicalID = physicalIDInt + } + } + + // store collected topology information + m.topology = append( + m.topology, + CPUFreqCpuInfoCollectorTopology{ + processor: processor, + coreID: coreID, + physicalID: physicalID, + isHT: isHT, + }) + + // reset topology information + foundFreq = false + processor = "" + coreID = "" + physicalID = "" + } + } + + numPhysicalID := fmt.Sprint(maxPhysicalID + 1) + numNonHTString := fmt.Sprint(numNonHT) + for i := range m.topology { + t := &m.topology[i] + t.numPhysicalID = numPhysicalID + t.numNonHT = numNonHTString + t.tagSet = map[string]string{ + "type": "cpu", + "type-id": t.processor, + "num_core": t.numNonHT, + "package_id": t.physicalID, + "num_package": t.numPhysicalID, + } + } + + m.init = true + return nil +} + +func (m *CPUFreqCpuInfoCollector) Read(interval time.Duration, out *[]lp.MutableMetric) { + if !m.init { + return + } + const cpuInfoFile = "/proc/cpuinfo" + file, err := os.Open(cpuInfoFile) + if err != nil { + log.Printf("Failed to open '%s': %v", cpuInfoFile, err) + return + } + defer file.Close() + + processorCounter := 0 + now := time.Now() + scanner := bufio.NewScanner(file) + for scanner.Scan() { + lineSplit := strings.Split(scanner.Text(), ":") + if len(lineSplit) == 2 { + key := strings.TrimSpace(lineSplit[0]) + + // frequency + if key == "cpu MHz" { + t := &m.topology[processorCounter] + if !t.isHT { + value, err := strconv.ParseFloat(strings.TrimSpace(lineSplit[1]), 64) + if err != nil { + 
log.Printf("Failed to convert cpu MHz to float: %v", err) + return + } + y, err := lp.New("cpufreq", t.tagSet, map[string]interface{}{"value": value}, now) + if err == nil { + *out = append(*out, y) + } + } + processorCounter++ + } + } + } +} + +func (m *CPUFreqCpuInfoCollector) Close() { + m.init = false +} diff --git a/metric-collector.go b/metric-collector.go index 90f50c4..02a2b21 100644 --- a/metric-collector.go +++ b/metric-collector.go @@ -20,21 +20,22 @@ import ( // List of provided collectors. Which collector should be run can be // configured at 'collectors' list in 'config.json'. var Collectors = map[string]collectors.MetricGetter{ - "likwid": &collectors.LikwidCollector{}, - "loadavg": &collectors.LoadavgCollector{}, - "memstat": &collectors.MemstatCollector{}, - "netstat": &collectors.NetstatCollector{}, - "ibstat": &collectors.InfinibandCollector{}, - "lustrestat": &collectors.LustreCollector{}, - "cpustat": &collectors.CpustatCollector{}, - "topprocs": &collectors.TopProcsCollector{}, - "nvidia": &collectors.NvidiaCollector{}, - "customcmd": &collectors.CustomCmdCollector{}, - "diskstat": &collectors.DiskstatCollector{}, - "tempstat": &collectors.TempCollector{}, - "ipmistat": &collectors.IpmiCollector{}, - "gpfs": new(collectors.GpfsCollector), - "cpufreq": new(collectors.CPUFreqCollector), + "likwid": &collectors.LikwidCollector{}, + "loadavg": &collectors.LoadavgCollector{}, + "memstat": &collectors.MemstatCollector{}, + "netstat": &collectors.NetstatCollector{}, + "ibstat": &collectors.InfinibandCollector{}, + "lustrestat": &collectors.LustreCollector{}, + "cpustat": &collectors.CpustatCollector{}, + "topprocs": &collectors.TopProcsCollector{}, + "nvidia": &collectors.NvidiaCollector{}, + "customcmd": &collectors.CustomCmdCollector{}, + "diskstat": &collectors.DiskstatCollector{}, + "tempstat": &collectors.TempCollector{}, + "ipmistat": &collectors.IpmiCollector{}, + "gpfs": new(collectors.GpfsCollector), + "cpufreq": 
new(collectors.CPUFreqCollector), + "cpufreq_cpuinfo": new(collectors.CPUFreqCpuInfoCollector), } var Sinks = map[string]sinks.SinkFuncs{ From 7953629940413ade2d9d708f9afb1d7e1f910720 Mon Sep 17 00:00:00 2001 From: Holger Obermaier <40787752+ho-ob@users.noreply.github.com> Date: Mon, 24 Jan 2022 15:55:15 +0100 Subject: [PATCH 017/174] Update GitHub actions --- .github/ci-config.json | 23 +++++++++++++---------- .github/workflows/runonce.yml | 5 ++++- 2 files changed, 17 insertions(+), 11 deletions(-) diff --git a/.github/ci-config.json b/.github/ci-config.json index b3fbff1..402388d 100644 --- a/.github/ci-config.json +++ b/.github/ci-config.json @@ -21,7 +21,10 @@ "topprocs", "nvidia", "diskstat", - "ipmistat" + "ipmistat", + "gpfs", + "cpufreq", + "cpufreq_cpuinfo" ], "default_tags": { "cluster": "testcluster" @@ -30,20 +33,20 @@ "type": "none" }, "collect_config": { - "topprocs" : { + "topprocs": { "num_procs": 2 - }, + }, "tempstat": { "tag_override": { - "hwmon0" : { - "type" : "socket", - "type-id" : "0" + "hwmon0": { + "type": "socket", + "type-id": "0" }, - "hwmon1" : { - "type" : "socket", - "type-id" : "1" + "hwmon1": { + "type": "socket", + "type-id": "1" } } } } -} +} \ No newline at end of file diff --git a/.github/workflows/runonce.yml b/.github/workflows/runonce.yml index 8efc70a..194710f 100644 --- a/.github/workflows/runonce.yml +++ b/.github/workflows/runonce.yml @@ -7,8 +7,11 @@ jobs: steps: - uses: actions/checkout@v2 + # See: https://github.com/marketplace/actions/setup-go-environment - name: Setup Golang - uses: actions/setup-go@v2.1.4 + uses: actions/setup-go@v2.1.5 + with: + go-version: '^1.17.6' - name: Build MetricCollector run: make From 2026c3acd9f050e2958ca719aa9127490b7228a7 Mon Sep 17 00:00:00 2001 From: Holger Obermaier <40787752+ho-ob@users.noreply.github.com> Date: Mon, 24 Jan 2022 20:22:08 +0100 Subject: [PATCH 018/174] Fixed topology detection --- collectors/cpufreqMetric.go | 175 +++++++++++++++++++++--------------- 1 file 
changed, 101 insertions(+), 74 deletions(-) diff --git a/collectors/cpufreqMetric.go b/collectors/cpufreqMetric.go index f5a10bc..ec42445 100644 --- a/collectors/cpufreqMetric.go +++ b/collectors/cpufreqMetric.go @@ -32,10 +32,19 @@ func readOneLine(filename string) (text string, ok bool) { return } -type CPUFreqCollectorCPU struct { - // coreID, packageID, num_cores, num_package - tagSet map[string]string +type CPUFreqCollectorTopology struct { + processor string // logical processor number (continuous, starting at 0) + coreID string // socket local core ID + coreID_int int + physicalID string // socket / package ID + physicalID_int int + numPhysicalID string // number of sockets / packages + numPhysicalID_int int + isHT bool + numNonHT string // number of non hyperthreading processors + numNonHT_int int scalingCurFreqFile string + tagSet map[string]string } // @@ -48,10 +57,10 @@ type CPUFreqCollectorCPU struct { // type CPUFreqCollector struct { MetricCollector - config struct { + topology []CPUFreqCollectorTopology + config struct { ExcludeMetrics []string `json:"exclude_metrics,omitempty"` } - cpus []CPUFreqCollectorCPU } func (m *CPUFreqCollector) Init(config []byte) error { @@ -64,9 +73,6 @@ func (m *CPUFreqCollector) Init(config []byte) error { } } - // Initialize CPU list - m.cpus = make([]CPUFreqCollectorCPU, 0) - // Loop for all CPU directories baseDir := "/sys/devices/system/cpu" globPattern := filepath.Join(baseDir, "cpu[0-9]*") @@ -78,82 +84,98 @@ func (m *CPUFreqCollector) Init(config []byte) error { return fmt.Errorf("CPUFreqCollector.Init() unable to find any files with pattern %s", globPattern) } - maxPackageID := 0 - maxCoreID := 0 + // Initialize CPU topology + m.topology = make([]CPUFreqCollectorTopology, len(cpuDirs)) for _, cpuDir := range cpuDirs { - cpuID := strings.TrimPrefix(cpuDir, "/sys/devices/system/cpu/cpu") + processor := strings.TrimPrefix(cpuDir, "/sys/devices/system/cpu/cpu") + processor_int, err := strconv.Atoi(processor) + if 
err != nil { + return fmt.Errorf("CPUFreqCollector.Init() unable to convert cpuID to int: %v", err) + } - // Read thread sibling list - threadSiblingListFile := filepath.Join(cpuDir, "topology", "thread_siblings_list") - threadSiblingList, ok := readOneLine(threadSiblingListFile) + // Read package ID + packageIDFile := filepath.Join(cpuDir, "topology", "physical_package_id") + packageID, ok := readOneLine(packageIDFile) if !ok { - return fmt.Errorf("CPUFreqCollector.Init() unable to read thread siblings list from %s", threadSiblingListFile) + return fmt.Errorf("CPUFreqCollector.Init() unable to read physical package ID from %s", packageIDFile) + } + packageID_int, err := strconv.Atoi(packageID) + if err != nil { + return fmt.Errorf("CPUFreqCollector.Init() unable to convert packageID to int: %v", err) } - // Read frequency only from first hardware thread - // Ignore Simultaneous Multithreading (SMT) / Hyper-Threading - if strings.Split(threadSiblingList, ",")[0] == cpuID { - // Read package ID - packageIDFile := filepath.Join(cpuDir, "topology", "physical_package_id") - packageID, ok := readOneLine(packageIDFile) - if !ok { - return fmt.Errorf("CPUFreqCollector.Init() unable to read physical package ID from %s", packageIDFile) - } - packageID_int, err := strconv.Atoi(packageID) - if err != nil { - return fmt.Errorf("CPUFreqCollector.Init() unable to convert packageID to int: %v", err) - } + // Read core ID + coreIDFile := filepath.Join(cpuDir, "topology", "core_id") + coreID, ok := readOneLine(coreIDFile) + if !ok { + return fmt.Errorf("CPUFreqCollector.Init() unable to read core ID from %s", coreIDFile) + } + coreID_int, err := strconv.Atoi(coreID) + if err != nil { + return fmt.Errorf("CPUFreqCollector.Init() unable to convert coreID to int: %v", err) + } - // Update maxPackageID - if packageID_int > maxPackageID { - maxPackageID = packageID_int - } + // Check access to current frequency file + scalingCurFreqFile := filepath.Join(cpuDir, "cpufreq", 
"scaling_cur_freq") + err = unix.Access(scalingCurFreqFile, unix.R_OK) + if err != nil { + return fmt.Errorf("CPUFreqCollector.Init() unable to access %s: %v", scalingCurFreqFile, err) + } - // Read core ID - coreIDFile := filepath.Join(cpuDir, "topology", "core_id") - coreID, ok := readOneLine(coreIDFile) - if !ok { - return fmt.Errorf("CPUFreqCollector.Init() unable to read core ID from %s", coreIDFile) - } - coreID_int, err := strconv.Atoi(coreID) - if err != nil { - return fmt.Errorf("CPUFreqCollector.Init() unable to convert coreID to int: %v", err) - } + t := &m.topology[processor_int] + t.processor = processor + t.physicalID = packageID + t.physicalID_int = packageID_int + t.coreID = coreID + t.coreID_int = coreID_int + t.scalingCurFreqFile = scalingCurFreqFile + } - // Update maxCoreID - if coreID_int > maxCoreID { - maxCoreID = coreID_int - } + // is processor a hyperthread? + coreSeenBefore := make(map[string]bool) + for i := range m.topology { + t := &m.topology[i] - // Check access to current frequency file - scalingCurFreqFile := filepath.Join(cpuDir, "cpufreq", "scaling_cur_freq") - err = unix.Access(scalingCurFreqFile, unix.R_OK) - if err != nil { - return fmt.Errorf("CPUFreqCollector.Init() unable to access %s: %v", scalingCurFreqFile, err) - } + globalID := t.physicalID + ":" + t.coreID + t.isHT = coreSeenBefore[globalID] + coreSeenBefore[globalID] = true + } - m.cpus = append( - m.cpus, - CPUFreqCollectorCPU{ - tagSet: map[string]string{ - "type": "cpu", - "type-id": strings.TrimSpace(coreID), - "packageID": strings.TrimSpace(packageID), - }, - scalingCurFreqFile: scalingCurFreqFile, - }) + // number of non hyper thread cores and packages / sockets + numNonHT_int := 0 + maxPhysicalID := 0 + for i := range m.topology { + t := &m.topology[i] + + // Update maxPackageID + if t.physicalID_int > maxPhysicalID { + maxPhysicalID = t.physicalID_int + } + + if !t.isHT { + numNonHT_int++ } } - // Add num packages and num cores as tags - numPackages := 
strconv.Itoa(maxPackageID + 1) - numCores := strconv.Itoa(maxCoreID + 1) - for i := range m.cpus { - c := &m.cpus[i] - c.tagSet["num_core"] = numCores - c.tagSet["num_package"] = numPackages + numPhysicalID_int := maxPhysicalID + 1 + numPhysicalID := fmt.Sprint(numPhysicalID_int) + numNonHT := fmt.Sprint(numNonHT_int) + for i := range m.topology { + t := &m.topology[i] + t.numPhysicalID = numPhysicalID + t.numPhysicalID_int = numPhysicalID_int + t.numNonHT = numNonHT + t.numNonHT_int = numNonHT_int + t.tagSet = map[string]string{ + "type": "cpu", + "type-id": t.processor, + "num_core": t.numNonHT, + "package_id": t.physicalID, + "num_package": t.numPhysicalID, + } } + fmt.Printf("%+v\n", m.topology) m.init = true return nil } @@ -164,13 +186,18 @@ func (m *CPUFreqCollector) Read(interval time.Duration, out *[]lp.MutableMetric) } now := time.Now() - for i := range m.cpus { - cpu := &m.cpus[i] + for i := range m.topology { + t := &m.topology[i] + + // skip hyperthreads + if t.isHT { + continue + } // Read current frequency - line, ok := readOneLine(cpu.scalingCurFreqFile) + line, ok := readOneLine(t.scalingCurFreqFile) if !ok { - log.Printf("CPUFreqCollector.Read(): Failed to read one line from file '%s'", cpu.scalingCurFreqFile) + log.Printf("CPUFreqCollector.Read(): Failed to read one line from file '%s'", t.scalingCurFreqFile) continue } cpuFreq, err := strconv.Atoi(line) @@ -179,7 +206,7 @@ func (m *CPUFreqCollector) Read(interval time.Duration, out *[]lp.MutableMetric) continue } - y, err := lp.New("cpufreq", cpu.tagSet, map[string]interface{}{"value": cpuFreq}, now) + y, err := lp.New("cpufreq", t.tagSet, map[string]interface{}{"value": cpuFreq}, now) if err == nil { *out = append(*out, y) } From be8c92676a4d2532eb848a019a821b106e6e4951 Mon Sep 17 00:00:00 2001 From: Holger Obermaier <40787752+ho-ob@users.noreply.github.com> Date: Mon, 24 Jan 2022 22:03:13 +0100 Subject: [PATCH 019/174] Refactoring --- collectors/cpufreqMetric.go | 51 
++++++++++++++++++------------------- 1 file changed, 25 insertions(+), 26 deletions(-) diff --git a/collectors/cpufreqMetric.go b/collectors/cpufreqMetric.go index ec42445..35e64ac 100644 --- a/collectors/cpufreqMetric.go +++ b/collectors/cpufreqMetric.go @@ -33,18 +33,18 @@ func readOneLine(filename string) (text string, ok bool) { } type CPUFreqCollectorTopology struct { - processor string // logical processor number (continuous, starting at 0) - coreID string // socket local core ID - coreID_int int - physicalID string // socket / package ID - physicalID_int int - numPhysicalID string // number of sockets / packages - numPhysicalID_int int - isHT bool - numNonHT string // number of non hyperthreading processors - numNonHT_int int - scalingCurFreqFile string - tagSet map[string]string + processor string // logical processor number (continuous, starting at 0) + coreID string // socket local core ID + coreID_int int + physicalPackageID string // socket / package ID + physicalPackageID_int int + numPhysicalPackages string // number of sockets / packages + numPhysicalPackages_int int + isHT bool + numNonHT string // number of non hyperthreading processors + numNonHT_int int + scalingCurFreqFile string + tagSet map[string]string } // @@ -94,12 +94,12 @@ func (m *CPUFreqCollector) Init(config []byte) error { } // Read package ID - packageIDFile := filepath.Join(cpuDir, "topology", "physical_package_id") - packageID, ok := readOneLine(packageIDFile) + physicalPackageIDFile := filepath.Join(cpuDir, "topology", "physical_package_id") + physicalPackageID, ok := readOneLine(physicalPackageIDFile) if !ok { - return fmt.Errorf("CPUFreqCollector.Init() unable to read physical package ID from %s", packageIDFile) + return fmt.Errorf("CPUFreqCollector.Init() unable to read physical package ID from %s", physicalPackageIDFile) } - packageID_int, err := strconv.Atoi(packageID) + physicalPackageID_int, err := strconv.Atoi(physicalPackageID) if err != nil { return 
fmt.Errorf("CPUFreqCollector.Init() unable to convert packageID to int: %v", err) } @@ -124,8 +124,8 @@ func (m *CPUFreqCollector) Init(config []byte) error { t := &m.topology[processor_int] t.processor = processor - t.physicalID = packageID - t.physicalID_int = packageID_int + t.physicalPackageID = physicalPackageID + t.physicalPackageID_int = physicalPackageID_int t.coreID = coreID t.coreID_int = coreID_int t.scalingCurFreqFile = scalingCurFreqFile @@ -136,7 +136,7 @@ func (m *CPUFreqCollector) Init(config []byte) error { for i := range m.topology { t := &m.topology[i] - globalID := t.physicalID + ":" + t.coreID + globalID := t.physicalPackageID + ":" + t.coreID t.isHT = coreSeenBefore[globalID] coreSeenBefore[globalID] = true } @@ -148,8 +148,8 @@ func (m *CPUFreqCollector) Init(config []byte) error { t := &m.topology[i] // Update maxPackageID - if t.physicalID_int > maxPhysicalID { - maxPhysicalID = t.physicalID_int + if t.physicalPackageID_int > maxPhysicalID { + maxPhysicalID = t.physicalPackageID_int } if !t.isHT { @@ -162,20 +162,19 @@ func (m *CPUFreqCollector) Init(config []byte) error { numNonHT := fmt.Sprint(numNonHT_int) for i := range m.topology { t := &m.topology[i] - t.numPhysicalID = numPhysicalID - t.numPhysicalID_int = numPhysicalID_int + t.numPhysicalPackages = numPhysicalID + t.numPhysicalPackages_int = numPhysicalID_int t.numNonHT = numNonHT t.numNonHT_int = numNonHT_int t.tagSet = map[string]string{ "type": "cpu", "type-id": t.processor, "num_core": t.numNonHT, - "package_id": t.physicalID, - "num_package": t.numPhysicalID, + "package_id": t.physicalPackageID, + "num_package": t.numPhysicalPackages, } } - fmt.Printf("%+v\n", m.topology) m.init = true return nil } From e095e4f202e2335aba925de1d45dc69a7e2a017e Mon Sep 17 00:00:00 2001 From: Holger Obermaier <40787752+ho-ob@users.noreply.github.com> Date: Tue, 25 Jan 2022 09:47:24 +0100 Subject: [PATCH 020/174] Refactoring --- collectors/cpufreqCpuinfoMetric.go | 82 
+++++++++++++++++------------- collectors/cpufreqMetric.go | 14 ++--- 2 files changed, 55 insertions(+), 41 deletions(-) diff --git a/collectors/cpufreqCpuinfoMetric.go b/collectors/cpufreqCpuinfoMetric.go index 1658878..e8cd0fc 100644 --- a/collectors/cpufreqCpuinfoMetric.go +++ b/collectors/cpufreqCpuinfoMetric.go @@ -19,13 +19,17 @@ import ( // Only measure on the first hyperthread // type CPUFreqCpuInfoCollectorTopology struct { - processor string // logical processor number (continuous, starting at 0) - coreID string // socket local core ID - physicalID string // socket / package ID - numPhysicalID string // number of sockets / packages - isHT bool - numNonHT string // number of non hyperthreading processors - tagSet map[string]string + processor string // logical processor number (continuous, starting at 0) + coreID string // socket local core ID + coreID_int int + physicalPackageID string // socket / package ID + physicalPackageID_int int + numPhysicalPackages string // number of sockets / packages + numPhysicalPackages_int int + isHT bool + numNonHT string // number of non hyperthreading processors + numNonHT_int int + tagSet map[string]string } type CPUFreqCpuInfoCollector struct { @@ -46,10 +50,10 @@ func (m *CPUFreqCpuInfoCollector) Init(config []byte) error { // Collect topology information from file cpuinfo foundFreq := false processor := "" - numNonHT := 0 + numNonHT_int := 0 coreID := "" - physicalID := "" - maxPhysicalID := 0 + physicalPackageID := "" + maxPhysicalPackageID := 0 m.topology = make([]CPUFreqCpuInfoCollectorTopology, 0) coreSeenBefore := make(map[string]bool) scanner := bufio.NewScanner(file) @@ -67,7 +71,7 @@ func (m *CPUFreqCpuInfoCollector) Init(config []byte) error { case "core id": coreID = value case "physical id": - physicalID = value + physicalPackageID = value } } @@ -75,55 +79,65 @@ func (m *CPUFreqCpuInfoCollector) Init(config []byte) error { if foundFreq && len(processor) > 0 && len(coreID) > 0 && - len(physicalID) > 0 { + 
len(physicalPackageID) > 0 { - globalID := physicalID + ":" + coreID + coreID_int, err := strconv.Atoi(coreID) + if err != nil { + return fmt.Errorf("Unable to convert coreID to int: %v", err) + } + physicalPackageID_int, err := strconv.Atoi(physicalPackageID) + if err != nil { + return fmt.Errorf("Unable to convert physicalPackageID to int: %v", err) + } + + // increase maximun socket / package ID, when required + if physicalPackageID_int > maxPhysicalPackageID { + maxPhysicalPackageID = physicalPackageID_int + } + + globalID := physicalPackageID + ":" + coreID isHT := coreSeenBefore[globalID] coreSeenBefore[globalID] = true if !isHT { // increase number on non hyper thread cores - numNonHT++ - - // increase maximun socket / package ID, when required - physicalIDInt, err := strconv.Atoi(physicalID) - if err != nil { - return fmt.Errorf("Failed to convert physical id to int: %v", err) - } - if physicalIDInt > maxPhysicalID { - maxPhysicalID = physicalIDInt - } + numNonHT_int++ } // store collected topology information m.topology = append( m.topology, CPUFreqCpuInfoCollectorTopology{ - processor: processor, - coreID: coreID, - physicalID: physicalID, - isHT: isHT, + processor: processor, + coreID: coreID, + coreID_int: coreID_int, + physicalPackageID: physicalPackageID, + physicalPackageID_int: physicalPackageID_int, + isHT: isHT, }) // reset topology information foundFreq = false processor = "" coreID = "" - physicalID = "" + physicalPackageID = "" } } - numPhysicalID := fmt.Sprint(maxPhysicalID + 1) - numNonHTString := fmt.Sprint(numNonHT) + numPhysicalPackageID_int := maxPhysicalPackageID + 1 + numPhysicalPackageID := fmt.Sprint(numPhysicalPackageID_int) + numNonHT := fmt.Sprint(numNonHT_int) for i := range m.topology { t := &m.topology[i] - t.numPhysicalID = numPhysicalID - t.numNonHT = numNonHTString + t.numPhysicalPackages = numPhysicalPackageID + t.numPhysicalPackages_int = numPhysicalPackageID_int + t.numNonHT = numNonHT + t.numNonHT_int = numNonHT_int 
t.tagSet = map[string]string{ "type": "cpu", "type-id": t.processor, "num_core": t.numNonHT, - "package_id": t.physicalID, - "num_package": t.numPhysicalID, + "package_id": t.physicalPackageID, + "num_package": t.numPhysicalPackages, } } diff --git a/collectors/cpufreqMetric.go b/collectors/cpufreqMetric.go index 35e64ac..fcab782 100644 --- a/collectors/cpufreqMetric.go +++ b/collectors/cpufreqMetric.go @@ -143,13 +143,13 @@ func (m *CPUFreqCollector) Init(config []byte) error { // number of non hyper thread cores and packages / sockets numNonHT_int := 0 - maxPhysicalID := 0 + maxPhysicalPackageID := 0 for i := range m.topology { t := &m.topology[i] // Update maxPackageID - if t.physicalPackageID_int > maxPhysicalID { - maxPhysicalID = t.physicalPackageID_int + if t.physicalPackageID_int > maxPhysicalPackageID { + maxPhysicalPackageID = t.physicalPackageID_int } if !t.isHT { @@ -157,13 +157,13 @@ func (m *CPUFreqCollector) Init(config []byte) error { } } - numPhysicalID_int := maxPhysicalID + 1 - numPhysicalID := fmt.Sprint(numPhysicalID_int) + numPhysicalPackageID_int := maxPhysicalPackageID + 1 + numPhysicalPackageID := fmt.Sprint(numPhysicalPackageID_int) numNonHT := fmt.Sprint(numNonHT_int) for i := range m.topology { t := &m.topology[i] - t.numPhysicalPackages = numPhysicalID - t.numPhysicalPackages_int = numPhysicalID_int + t.numPhysicalPackages = numPhysicalPackageID + t.numPhysicalPackages_int = numPhysicalPackageID_int t.numNonHT = numNonHT t.numNonHT_int = numNonHT_int t.tagSet = map[string]string{ From df77c3fd60688266466f7bb07aae84e2373ebb86 Mon Sep 17 00:00:00 2001 From: Holger Obermaier <40787752+ho-ob@users.noreply.github.com> Date: Tue, 25 Jan 2022 10:32:08 +0100 Subject: [PATCH 021/174] Avoid vet warning: Println arg list ends with redundant newline --- collectors/infinibandMetric.go | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/collectors/infinibandMetric.go b/collectors/infinibandMetric.go index 6e14251..db7c129 
100644 --- a/collectors/infinibandMetric.go +++ b/collectors/infinibandMetric.go @@ -17,9 +17,10 @@ import ( "time" ) -const IBBASEPATH = `/sys/class/infiniband/` -const LIDFILE = `/sys/class/infiniband/mlx4_0/ports/1/lid` -const PERFQUERY = `/usr/sbin/perfquery` +const ( + IBBASEPATH = `/sys/class/infiniband/` + PERFQUERY = `/usr/sbin/perfquery` +) type InfinibandCollectorConfig struct { ExcludeDevices []string `json:"exclude_devices,omitempty"` @@ -40,12 +41,14 @@ func (m *InfinibandCollector) Help() { fmt.Println("The devices can be filtered with the 'exclude_devices' option in the configuration.") fmt.Println("For each found LIDs the collector calls the 'perfquery' command") fmt.Println("The path to the 'perfquery' command can be configured with the 'perfquery_path' option") - fmt.Println("in the configuration\n") + fmt.Println("in the configuration") + fmt.Println("") fmt.Println("Full configuration object:") fmt.Println("\"ibstat\" : {") fmt.Println(" \"perfquery_path\" : \"path/to/perfquery\" # if omitted, it searches in $PATH") fmt.Println(" \"exclude_devices\" : [\"dev1\"]") - fmt.Println("}\n") + fmt.Println("}") + fmt.Println("") fmt.Println("Metrics:") fmt.Println("- ib_recv") fmt.Println("- ib_xmit") From 222862af322710872a5b88d21d5723a84cce79d8 Mon Sep 17 00:00:00 2001 From: Holger Obermaier <40787752+ho-ob@users.noreply.github.com> Date: Tue, 25 Jan 2022 11:12:06 +0100 Subject: [PATCH 022/174] Avoid vet warning struct field commands has json tag but is not exported --- collectors/customCmdMetric.go | 8 ++++---- collectors/lustreMetric.go | 4 ++-- collectors/topprocsMetric.go | 8 ++++---- 3 files changed, 10 insertions(+), 10 deletions(-) diff --git a/collectors/customCmdMetric.go b/collectors/customCmdMetric.go index bbafc2d..e11f4c7 100644 --- a/collectors/customCmdMetric.go +++ b/collectors/customCmdMetric.go @@ -15,8 +15,8 @@ import ( const CUSTOMCMDPATH = `/home/unrz139/Work/cc-metric-collector/collectors/custom` type CustomCmdCollectorConfig 
struct { - commands []string `json:"commands"` - files []string `json:"files"` + Commands []string `json:"commands"` + Files []string `json:"files"` ExcludeMetrics []string `json:"exclude_metrics"` } @@ -40,7 +40,7 @@ func (m *CustomCmdCollector) Init(config []byte) error { } } m.setup() - for _, c := range m.config.commands { + for _, c := range m.config.Commands { cmdfields := strings.Fields(c) command := exec.Command(cmdfields[0], strings.Join(cmdfields[1:], " ")) command.Wait() @@ -49,7 +49,7 @@ func (m *CustomCmdCollector) Init(config []byte) error { m.commands = append(m.commands, c) } } - for _, f := range m.config.files { + for _, f := range m.config.Files { _, err = ioutil.ReadFile(f) if err == nil { m.files = append(m.files, f) diff --git a/collectors/lustreMetric.go b/collectors/lustreMetric.go index d77ac09..8931f84 100644 --- a/collectors/lustreMetric.go +++ b/collectors/lustreMetric.go @@ -15,7 +15,7 @@ import ( const LUSTREFILE = `/proc/fs/lustre/llite/lnec-XXXXXX/stats` type LustreCollectorConfig struct { - procfiles []string `json:"procfiles"` + Procfiles []string `json:"procfiles"` ExcludeMetrics []string `json:"exclude_metrics"` } @@ -47,7 +47,7 @@ func (m *LustreCollector) Init(config []byte) error { "statfs": {"statfs": 1}, "inode_permission": {"inode_permission": 1}} m.devices = make([]string, 0) - for _, p := range m.config.procfiles { + for _, p := range m.config.Procfiles { _, err := ioutil.ReadFile(p) if err == nil { m.devices = append(m.devices, p) diff --git a/collectors/topprocsMetric.go b/collectors/topprocsMetric.go index e1b31ee..715b8c3 100644 --- a/collectors/topprocsMetric.go +++ b/collectors/topprocsMetric.go @@ -16,7 +16,7 @@ const MAX_NUM_PROCS = 10 const DEFAULT_NUM_PROCS = 2 type TopProcsCollectorConfig struct { - num_procs int `json:"num_procs"` + Num_procs int `json:"num_procs"` } type TopProcsCollector struct { @@ -35,9 +35,9 @@ func (m *TopProcsCollector) Init(config []byte) error { return err } } else { - 
m.config.num_procs = int(DEFAULT_NUM_PROCS) + m.config.Num_procs = int(DEFAULT_NUM_PROCS) } - if m.config.num_procs <= 0 || m.config.num_procs > MAX_NUM_PROCS { + if m.config.Num_procs <= 0 || m.config.Num_procs > MAX_NUM_PROCS { return errors.New(fmt.Sprintf("num_procs option must be set in 'topprocs' config (range: 1-%d)", MAX_NUM_PROCS)) } m.setup() @@ -64,7 +64,7 @@ func (m *TopProcsCollector) Read(interval time.Duration, out *[]lp.MutableMetric } lines := strings.Split(string(stdout), "\n") - for i := 1; i < m.config.num_procs+1; i++ { + for i := 1; i < m.config.Num_procs+1; i++ { name := fmt.Sprintf("topproc%d", i) y, err := lp.New(name, m.tags, map[string]interface{}{"value": string(lines[i])}, time.Now()) if err == nil { From 200af84c546ae71358ef55b5dd8dfc1a13b006bc Mon Sep 17 00:00:00 2001 From: Thomas Gruber Date: Tue, 25 Jan 2022 15:37:43 +0100 Subject: [PATCH 023/174] Modularize the whole thing (#16) * Use channels, add a metric router, split up configuration and use extended version of Influx line protocol internally * Use central timer for collectors and router. 
Add expressions to router * Add expression to router config * Update entry points * Start with README * Update README for CCMetric * Formatting * Update README.md * Add README for MultiChanTicker * Add README for MultiChanTicker * Update README.md * Add README to metric router * Update main README * Remove SinkEntity type * Update README for sinks * Update go files * Update README for receivers * Update collectors README * Update collectors README * Use seperate page per collector * Fix for tempstat page * Add docs for customcmd collector * Add docs for ipmistat collector * Add docs for topprocs collector * Update customCmdMetric.md * Use seconds when calculating LIKWID metrics * Add IB metrics ib_recv_pkts and ib_xmit_pkts * Drop domain part of host name * Updated to latest stable version of likwid * Define source code dependencies in Makefile * Add GPFS / IBM Spectrum Scale collector * Add vet and staticcheck make targets * Add vet and staticcheck make targets * Avoid go vet warning: struct field tag `json:"..., omitempty"` not compatible with reflect.StructTag.Get: suspicious space in struct tag value struct field tag `json:"...", omitempty` not compatible with reflect.StructTag.Get: key:"value" pairs not separated by spaces * Add sample collector to README.md * Add CPU frequency collector * Avoid staticcheck warning: redundant return statement * Avoid staticcheck warning: unnecessary assignment to the blank identifier * Simplified code * Add CPUFreqCollectorCpuinfo a metric collector to measure the current frequency of the CPUs as obtained from /proc/cpuinfo Only measure on the first hyperthread * Add collector for NFS clients * Move publication of metrics into Flush() for NatsSink * Update GitHub actions * Refactoring * Avoid vet warning: Println arg list ends with redundant newline * Avoid vet warning struct field commands has json tag but is not exported * Avoid vet warning: return copies lock value. 
* Corrected typo * Refactoring * Add go sources in internal/... * Bad separator in Makefile * Fix Infiniband collector Co-authored-by: Holger Obermaier <40787752+ho-ob@users.noreply.github.com> --- Makefile | 6 +- README.md | 87 +--- collectors.json | 15 + collectors/README.md | 321 ++------------- collectors/collectorManager.go | 143 +++++++ collectors/cpufreqCpuinfoMetric.go | 19 +- collectors/cpufreqMetric.go | 17 +- collectors/cpustatMetric.go | 20 +- collectors/cpustatMetric.md | 23 ++ collectors/customCmdMetric.go | 26 +- collectors/customCmdMetric.md | 20 + collectors/diskstatMetric.go | 15 +- collectors/diskstatMetric.md | 34 ++ collectors/gpfsMetric.go | 121 ++---- collectors/infinibandMetric.go | 113 +++--- collectors/infinibandMetric.md | 19 + collectors/ipmiMetric.go | 38 +- collectors/ipmiMetric.md | 16 + collectors/likwidMetric.go | 62 ++- collectors/likwidMetric.md | 119 ++++++ collectors/loadavgMetric.go | 18 +- collectors/loadavgMetric.md | 19 + collectors/lustreMetric.go | 17 +- collectors/lustreMetric.md | 29 ++ collectors/memstatMetric.go | 22 +- collectors/memstatMetric.md | 27 ++ collectors/metricCollector.go | 28 +- collectors/netstatMetric.go | 20 +- collectors/netstatMetric.md | 21 + collectors/nfsMetric.go | 147 +++++++ collectors/nvidiaMetric.go | 103 ++--- collectors/nvidiaMetric.md | 40 ++ collectors/tempMetric.go | 22 +- collectors/tempMetric.md | 22 ++ collectors/topprocsMetric.go | 14 +- collectors/topprocsMetric.md | 15 + config.json | 40 +- go.mod | 3 +- go.sum | 2 + internal/ccMetric/README.md | 32 ++ internal/ccMetric/ccMetric.go | 374 ++++++++++++++++++ internal/metricRouter/README.md | 50 +++ internal/metricRouter/metricRouter.go | 208 ++++++++++ internal/multiChanTicker/README.md | 37 ++ internal/multiChanTicker/multiChanTicker.go | 39 ++ metric-collector.go | 416 +++++++++----------- receivers.json | 8 + receivers/README.md | 39 +- receivers/metricReceiver.go | 29 +- receivers/natsReceiver.go | 62 +-- 
receivers/receiveManager.go | 153 +++++++ router.json | 22 ++ sinks.json | 6 + sinks/README.md | 126 +++--- sinks/httpSink.go | 16 +- sinks/influxSink.go | 16 +- sinks/metricSink.go | 34 +- sinks/natsSink.go | 35 +- sinks/sinkManager.go | 141 +++++++ sinks/stdoutSink.go | 15 +- 60 files changed, 2596 insertions(+), 1105 deletions(-) create mode 100644 collectors.json create mode 100644 collectors/collectorManager.go create mode 100644 collectors/cpustatMetric.md create mode 100644 collectors/customCmdMetric.md create mode 100644 collectors/diskstatMetric.md create mode 100644 collectors/infinibandMetric.md create mode 100644 collectors/ipmiMetric.md create mode 100644 collectors/likwidMetric.md create mode 100644 collectors/loadavgMetric.md create mode 100644 collectors/lustreMetric.md create mode 100644 collectors/memstatMetric.md create mode 100644 collectors/netstatMetric.md create mode 100644 collectors/nfsMetric.go create mode 100644 collectors/nvidiaMetric.md create mode 100644 collectors/tempMetric.md create mode 100644 collectors/topprocsMetric.md create mode 100644 internal/ccMetric/README.md create mode 100644 internal/ccMetric/ccMetric.go create mode 100644 internal/metricRouter/README.md create mode 100644 internal/metricRouter/metricRouter.go create mode 100644 internal/multiChanTicker/README.md create mode 100644 internal/multiChanTicker/multiChanTicker.go create mode 100644 receivers.json create mode 100644 receivers/receiveManager.go create mode 100644 router.json create mode 100644 sinks.json create mode 100644 sinks/sinkManager.go diff --git a/Makefile b/Makefile index 892bbcc..c9805eb 100644 --- a/Makefile +++ b/Makefile @@ -3,7 +3,9 @@ GOSRC_APP := metric-collector.go GOSRC_COLLECTORS := $(wildcard collectors/*.go) GOSRC_SINKS := $(wildcard sinks/*.go) GOSRC_RECEIVERS := $(wildcard receivers/*.go) -GOSRC := $(GOSRC_APP) $(GOSRC_COLLECTORS) $(GOSRC_SINKS) $(GOSRC_RECEIVERS) +GOSRC_INTERNAL := $(wildcard internal/*/*.go) +GOSRC := $(GOSRC_APP) 
$(GOSRC_COLLECTORS) $(GOSRC_SINKS) $(GOSRC_RECEIVERS) $(GOSRC_INTERNAL) + .PHONY: all all: $(APP) @@ -24,6 +26,8 @@ fmt: go fmt $(GOSRC_SINKS) go fmt $(GOSRC_RECEIVERS) go fmt $(GOSRC_APP) + @for F in $(GOSRC_INTERNAL); do go fmt $$F; done + # Examine Go source code and reports suspicious constructs .PHONY: vet diff --git a/README.md b/README.md index fcabb82..158cc0c 100644 --- a/README.md +++ b/README.md @@ -12,79 +12,33 @@ The receiver runs as a go routine side-by-side with the timer loop and asynchron Configuration is implemented using a single json document that is distributed over network and may be persisted as file. Supported metrics are documented [here](https://github.com/ClusterCockpit/cc-specifications/blob/master/metrics/lineprotocol_alternative.md). +There is a main configuration file with basic settings that point to the other configuration files for the different components. + ``` json { - "interval": 3, - "duration": 1, - "collectors": [ - "memstat", - "likwid", - "loadavg", - "netstat", - "ibstat", - "lustrestat", - "topprocs", - "cpustat", - "nvidia" - ], - "sink": { - "user": "admin", - "password": "12345", - "host": "localhost", - "port": "8080", - "database": "testdb", - "organisation": "testorg", - "type": "stdout" - }, - "default_tags": { - "cluster": "testcluster" - }, - "receiver": { - "type": "none", - "address": "127.0.0.1", - "port": "4222", - "database": "testdb" - }, - "collect_config": { - "tempstat": { - "tag_override": { - "hwmon0": { - "type": "socket", - "type-id": "0" - }, - "hwmon1": { - "type": "socket", - "type-id": "1" - } - } - }, - "diskstat": { - "exclude_metrics": [ - "read_ms" - ] - } - } + "sinks": "sinks.json", + "collectors" : "collectors.json", + "receivers" : "receivers.json", + "router" : "router.json", + "interval": 10, + "duration": 1 } ``` -The `interval` defines how often the metrics should be read and send to the sink. The `duration` tells collectors how long one measurement has to take. 
An example for this is the `likwid` collector which starts the hardware performance counter, waits for `duration` seconds and stops the counters again. If you configure a collector to do two measurments, the `duration` must be at least half the `interval`. +The `interval` defines how often the metrics should be read and send to the sink. The `duration` tells collectors how long one measurement has to take. This is important for some collectors, like the `likwid` collector. -The `collectors` contains all collectors executed collectors. Each collector can be configured in the `collect_config` section. A more detailed list of all collectors and their configuration options can be found in the [README for collectors](./collectors/README.md). +See the component READMEs for their configuration: +* [`collectors`](./collectors/README.md) +* [`sinks`](./sinks/README.md) +* [`receivers`](./receivers/README.md) +* [`router`](./internal/metricRouter/README.md) -The `sink` section contains the configuration where the data should be transmitted to. There are currently four sinks supported `influxdb`, `nats`, `http` and `stdout`. See [README for sinks](./sinks/README.md) for more information about the individual sinks and which configuration field they are using. - -In the `default_tags` section, one can define key-value-pairs (only strings) that are added to each sent out metric. This can be useful for cluster names like in the example JSON or information like rank or island for orientation. - -With `receiver`, the collector can be used as a router by receiving metrics and forwarding them to the configured sink. There are currently only types `none` (for no receiver) and `nats`. For more information see the [README in receivers](./receivers/README.md). 
# Installation ``` $ git clone git@github.com:ClusterCockpit/cc-metric-collector.git -$ cd cc-metric-collector/collectors -$ edit Makefile (for LIKWID collector) -$ make (downloads LIKWID, builds it as static library and copies all required files for the collector. Uses sudo in case of own accessdaemon) -$ cd .. +$ make (downloads LIKWID, builds it as static library with 'direct' accessmode and copies all required files for the collector) $ go get (requires at least golang 1.13) $ go build metric-collector ``` @@ -104,13 +58,6 @@ Usage of metric-collector: Path for PID file (default "/var/run/cc-metric-collector.pid") ``` -# Todos - -- [ ] Use only non-blocking APIs for the sinks -- [x] Collector specific configuration in global JSON file? Changing the configuration inside the Go code is not user-friendly. -- [ ] Mark collectors as 'can-run-in-parallel' and use goroutines for them. There are only a few collectors that should run serially (e.g. LIKWID) -- [ ] Configuration option for receivers to add other tags. Additonal flag to tell whether default tags should be added as well. -- [ ] CLI option to get help output for collectors, sinks and receivers about their configuration options and metrics # Contributing The ClusterCockpit ecosystem is designed to be used by different HPC computing centers. Since configurations and setups differ between the centers, the centers likely have to put some work into the cc-metric-collector to gather all desired metrics. 
@@ -119,5 +66,5 @@ You are free to open an issue to request a collector but we would also be happy # Contact -[Matrix.org ClusterCockpit General chat](https://matrix.to/#/#clustercockpit-dev:matrix.org) -[Matrix.org ClusterCockpit Development chat](https://matrix.to/#/#clustercockpit:matrix.org) +* [Matrix.org ClusterCockpit General chat](https://matrix.to/#/#clustercockpit-dev:matrix.org) +* [Matrix.org ClusterCockpit Development chat](https://matrix.to/#/#clustercockpit:matrix.org) diff --git a/collectors.json b/collectors.json new file mode 100644 index 0000000..df2fce3 --- /dev/null +++ b/collectors.json @@ -0,0 +1,15 @@ +{ + "tempstat": { + "tag_override": { + "hwmon0" : { + "type" : "socket", + "type-id" : "0" + }, + "hwmon1" : { + "type" : "socket", + "type-id" : "1" + } + } + } + +} diff --git a/collectors/README.md b/collectors/README.md index df02dd6..1c3784e 100644 --- a/collectors/README.md +++ b/collectors/README.md @@ -1,288 +1,34 @@ +# CCMetric collectors + This folder contains the collectors for the cc-metric-collector. -# `metricCollector.go` -The base class/configuration is located in `metricCollector.go`. - -# Collectors - -* `memstatMetric.go`: Reads `/proc/meminfo` to calculate **node** metrics. It also combines values to the metric `mem_used` -* `loadavgMetric.go`: Reads `/proc/loadavg` and submits **node** metrics: -* `netstatMetric.go`: Reads `/proc/net/dev` and submits for all network devices as the **node** metrics. -* `lustreMetric.go`: Reads Lustre's stats files and submits **node** metrics: -* `infinibandMetric.go`: Reads InfiniBand metrics. It uses the `perfquery` command to read the **node** metrics but can fallback to sysfs counters in case `perfquery` does not work. -* `likwidMetric.go`: Reads hardware performance events using LIKWID. It submits **socket** and **cpu** metrics -* `cpustatMetric.go`: Read CPU specific values from `/proc/stat` -* `topprocsMetric.go`: Reads the TopX processes by their CPU usage. 
X is configurable -* `nvidiaMetric.go`: Read data about Nvidia GPUs using the NVML library -* `tempMetric.go`: Read temperature data from `/sys/class/hwmon/hwmon*` -* `ipmiMetric.go`: Collect data from `ipmitool` or as fallback `ipmi-sensors` -* `customCmdMetric.go`: Run commands or read files and submit the output (output has to be in InfluxDB line protocol!) - -If any of the collectors cannot be initialized, it is excluded from all further reads. Like if the Lustre stat file is not a valid path, no Lustre specific metrics will be recorded. - -# Collector configuration +# Configuration ```json - "collectors": [ - "tempstat" - ], - "collect_config": { - "tempstat": { - "tag_override": { - "hwmon0" : { - "type" : "socket", - "type-id" : "0" - }, - "hwmon1" : { - "type" : "socket", - "type-id" : "1" - } - } +{ + "collector_type" : { + } - } +} ``` -The configuration of the collectors in the main config files consists of two parts: active collectors (`collectors`) and collector configuration (`collect_config`). At startup, all collectors in the `collectors` list is initialized and, if successfully initialized, added to the active collectors for metric retrieval. At initialization the collector-specific configuration from the `collect_config` section is handed over. Each collector has own configuration options, check at the collector-specific section. +In contrast to the configuration files for sinks and receivers, the collectors configuration is not a list but a set of dicts. This is required because we didn't manage to partially read the type before loading the remaining configuration. We are eager to change this to the same format. -## `memstat` +# Available collectors -```json - "memstat": { - "exclude_metrics": [ - "mem_used" - ] - } -``` - -The `memstat` collector reads data from `/proc/meminfo` and outputs a handful **node** metrics. If a metric is not required, it can be excluded from forwarding it to the sink. 
- - -Metrics: -* `mem_total` -* `mem_sreclaimable` -* `mem_slab` -* `mem_free` -* `mem_buffers` -* `mem_cached` -* `mem_available` -* `mem_shared` -* `swap_total` -* `swap_free` -* `mem_used` = `mem_total` - (`mem_free` + `mem_buffers` + `mem_cached`) - -## `loadavg` -```json - "loadavg": { - "exclude_metrics": [ - "proc_run" - ] - } -``` - -The `loadavg` collector reads data from `/proc/loadavg` and outputs a handful **node** metrics. If a metric is not required, it can be excluded from forwarding it to the sink. - -Metrics: -* `load_one` -* `load_five` -* `load_fifteen` -* `proc_run` -* `proc_total` - -## `netstat` -```json - "netstat": { - "exclude_devices": [ - "lo" - ] - } -``` - -The `netstat` collector reads data from `/proc/net/dev` and outputs a handful **node** metrics. If a device is not required, it can be excluded from forwarding it to the sink. Commonly the `lo` device should be excluded. - -Metrics: -* `bytes_in` -* `bytes_out` -* `pkts_in` -* `pkts_out` - -The device name is added as tag `device`. - - -## `diskstat` - -```json - "diskstat": { - "exclude_metrics": [ - "read_ms" - ], - } -``` - -The `netstat` collector reads data from `/proc/net/dev` and outputs a handful **node** metrics. If a metric is not required, it can be excluded from forwarding it to the sink. - -Metrics: -* `reads` -* `reads_merged` -* `read_sectors` -* `read_ms` -* `writes` -* `writes_merged` -* `writes_sectors` -* `writes_ms` -* `ioops` -* `ioops_ms` -* `ioops_weighted_ms` -* `discards` -* `discards_merged` -* `discards_sectors` -* `discards_ms` -* `flushes` -* `flushes_ms` - - -The device name is added as tag `device`. - -## `cpustat` -```json - "netstat": { - "exclude_metrics": [ - "cpu_idle" - ] - } -``` - -The `cpustat` collector reads data from `/proc/stats` and outputs a handful **node** and **hwthread** metrics. If a metric is not required, it can be excluded from forwarding it to the sink. 
- -Metrics: -* `cpu_user` -* `cpu_nice` -* `cpu_system` -* `cpu_idle` -* `cpu_iowait` -* `cpu_irq` -* `cpu_softirq` -* `cpu_steal` -* `cpu_guest` -* `cpu_guest_nice` - -## `likwid` -```json - "likwid": { - "eventsets": [ - { - "events": { - "FIXC1": "ACTUAL_CPU_CLOCK", - "FIXC2": "MAX_CPU_CLOCK", - "PMC0": "RETIRED_INSTRUCTIONS", - "PMC1": "CPU_CLOCKS_UNHALTED", - "PMC2": "RETIRED_SSE_AVX_FLOPS_ALL", - "PMC3": "MERGE", - "DFC0": "DRAM_CHANNEL_0", - "DFC1": "DRAM_CHANNEL_1", - "DFC2": "DRAM_CHANNEL_2", - "DFC3": "DRAM_CHANNEL_3" - }, - "metrics": [ - { - "name": "ipc", - "calc": "PMC0/PMC1", - "socket_scope": false, - "publish": true - }, - { - "name": "flops_any", - "calc": "0.000001*PMC2/time", - "socket_scope": false, - "publish": true - }, - { - "name": "clock_mhz", - "calc": "0.000001*(FIXC1/FIXC2)/inverseClock", - "socket_scope": false, - "publish": true - }, - { - "name": "mem1", - "calc": "0.000001*(DFC0+DFC1+DFC2+DFC3)*64.0/time", - "socket_scope": true, - "publish": false - } - ] - }, - { - "events": { - "DFC0": "DRAM_CHANNEL_4", - "DFC1": "DRAM_CHANNEL_5", - "DFC2": "DRAM_CHANNEL_6", - "DFC3": "DRAM_CHANNEL_7", - "PWR0": "RAPL_CORE_ENERGY", - "PWR1": "RAPL_PKG_ENERGY" - }, - "metrics": [ - { - "name": "pwr_core", - "calc": "PWR0/time", - "socket_scope": false, - "publish": true - }, - { - "name": "pwr_pkg", - "calc": "PWR1/time", - "socket_scope": true, - "publish": true - }, - { - "name": "mem2", - "calc": "0.000001*(DFC0+DFC1+DFC2+DFC3)*64.0/time", - "socket_scope": true, - "publish": false - } - ] - } - ], - "globalmetrics": [ - { - "name": "mem_bw", - "calc": "mem1+mem2", - "socket_scope": true, - "publish": true - } - ] - } -``` - -_Example config suitable for AMD Zen3_ - -The `likwid` collector reads hardware performance counters at a **hwthread** and **socket** level. The configuration looks quite complicated but it is basically copy&paste from [LIKWID's performance groups](https://github.com/RRZE-HPC/likwid/tree/master/groups). 
The collector made multiple iterations and tried to use the performance groups but it lacked flexibility. The current way of configuration provides most flexibility. - -The logic is as following: There are multiple eventsets, each consisting of a list of counters+events and a list of metrics. If you compare a common performance group with the example setting above, there is not much difference: -``` -EVENTSET -> "events": { -FIXC1 ACTUAL_CPU_CLOCK -> "FIXC1": "ACTUAL_CPU_CLOCK", -FIXC2 MAX_CPU_CLOCK -> "FIXC2": "MAX_CPU_CLOCK", -PMC0 RETIRED_INSTRUCTIONS -> "PMC0" : "RETIRED_INSTRUCTIONS", -PMC1 CPU_CLOCKS_UNHALTED -> "PMC1" : "CPU_CLOCKS_UNHALTED", -PMC2 RETIRED_SSE_AVX_FLOPS_ALL -> "PMC2": "RETIRED_SSE_AVX_FLOPS_ALL", -PMC3 MERGE -> "PMC3": "MERGE", - -> } -``` - -The metrics are following the same procedure: - -``` -METRICS -> "metrics": [ -IPC PMC0/PMC1 -> { - -> "name" : "IPC", - -> "calc" : "PMC0/PMC1", - -> "socket_scope": false, - -> "publish": true - -> } - -> ] -``` - -The `socket_scope` option tells whether it is submitted per socket or per hwthread. If a metric is only used for internal calculations, you can set `publish = false`. - -Since some metrics can only be gathered in multiple measurements (like the memory bandwidth on AMD Zen3 chips), configure multiple eventsets like in the example config and use the `globalmetrics` section to combine them. **Be aware** that the combination might be misleading because the "behavior" of a metric changes over time and the multiple measurements might count different computing phases. 
+* [`cpustat`](./cpustatMetric.md) +* [`memstat`](./memstatMetric.md) +* [`diskstat`](./diskstatMetric.md) +* [`loadavg`](./loadavgMetric.md) +* [`netstat`](./netstatMetric.md) +* [`ibstat`](./infinibandMetric.md) +* [`tempstat`](./tempMetric.md) +* [`lustre`](./lustreMetric.md) +* [`likwid`](./likwidMetric.md) +* [`nvidia`](./nvidiaMetric.md) +* [`customcmd`](./customCmdMetric.md) +* [`ipmistat`](./ipmiMetric.md) +* [`topprocs`](./topprocsMetric.md) ## Todos @@ -292,13 +38,15 @@ Since some metrics can only be gathered in multiple measurements (like the memor # Contributing own collectors A collector reads data from any source, parses it to metrics and submits these metrics to the `metric-collector`. A collector provides three function: -* `Init(config []byte) error`: Initializes the collector using the given collector-specific config in JSON. -* `Read(duration time.Duration, out *[]lp.MutableMetric) error`: Read, parse and submit data to the `out` list. If the collector has to measure anything for some duration, use the provided function argument `duration`. +* `Name() string`: Return the name of the collector +* `Init(config json.RawMessage) error`: Initializes the collector using the given collector-specific config in JSON. Check if needed files/commands exists, ... +* `Initialized() bool`: Check if a collector is successfully initialized +* `Read(duration time.Duration, output chan ccMetric.CCMetric)`: Read, parse and submit data to the `output` channel as [`CCMetric`](../internal/ccMetric/README.md). If the collector has to measure anything for some duration, use the provided function argument `duration`. * `Close()`: Closes down the collector. It is recommanded to call `setup()` in the `Init()` function. -Finally, the collector needs to be registered in the `metric-collector.go`. There is a list of collectors called `Collectors` which is a map (string -> pointer to collector). Add a new entry with a descriptive name and the new collector. 
+Finally, the collector needs to be registered in the `collectorManager.go`. There is a list of collectors called `AvailableCollectors` which is a map (`collector_type_string` -> `pointer to MetricCollector interface`). Add a new entry with a descriptive name and the new collector. ## Sample collector @@ -307,8 +55,9 @@ package collectors import ( "encoding/json" - lp "github.com/influxdata/line-protocol" "time" + + lp "github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric" ) // Struct for the collector-specific JSON config @@ -317,11 +66,11 @@ type SampleCollectorConfig struct { } type SampleCollector struct { - MetricCollector + metricCollector config SampleCollectorConfig } -func (m *SampleCollector) Init(config []byte) error { +func (m *SampleCollector) Init(config json.RawMessage) error { m.name = "SampleCollector" m.setup() if len(config) > 0 { @@ -330,11 +79,13 @@ func (m *SampleCollector) Init(config []byte) error { return err } } + m.meta = map[string]string{"source": m.name, "group": "Sample"} + m.init = true return nil } -func (m *SampleCollector) Read(interval time.Duration, out *[]lp.MutableMetric) { +func (m *SampleCollector) Read(interval time.Duration, output chan lp.CCMetric) { if !m.init { return } @@ -342,9 +93,9 @@ func (m *SampleCollector) Read(interval time.Duration, out *[]lp.MutableMetric) tags := map[string]string{"type" : "node"} // Each metric has exactly one field: value ! 
value := map[string]interface{}{"value": int(x)} - y, err := lp.New("sample_metric", tags, value, time.Now()) + y, err := lp.New("sample_metric", tags, m.meta, value, time.Now()) if err == nil { - *out = append(*out, y) + output <- y } } diff --git a/collectors/collectorManager.go b/collectors/collectorManager.go new file mode 100644 index 0000000..9543431 --- /dev/null +++ b/collectors/collectorManager.go @@ -0,0 +1,143 @@ +package collectors + +import ( + "encoding/json" + "log" + "os" + "sync" + "time" + + lp "github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric" + mct "github.com/ClusterCockpit/cc-metric-collector/internal/multiChanTicker" +) + +var AvailableCollectors = map[string]MetricCollector{ + + "likwid": &LikwidCollector{}, + "loadavg": &LoadavgCollector{}, + "memstat": &MemstatCollector{}, + "netstat": &NetstatCollector{}, + "ibstat": &InfinibandCollector{}, + "lustrestat": &LustreCollector{}, + "cpustat": &CpustatCollector{}, + "topprocs": &TopProcsCollector{}, + "nvidia": &NvidiaCollector{}, + "customcmd": &CustomCmdCollector{}, + "diskstat": &DiskstatCollector{}, + "tempstat": &TempCollector{}, + "ipmistat": &IpmiCollector{}, + "gpfs": new(GpfsCollector), + "cpufreq": new(CPUFreqCollector), + "cpufreq_cpuinfo": new(CPUFreqCpuInfoCollector), + "nfsstat": new(NfsCollector), +} + +type collectorManager struct { + collectors []MetricCollector + output chan lp.CCMetric + done chan bool + ticker mct.MultiChanTicker + duration time.Duration + wg *sync.WaitGroup + config map[string]json.RawMessage +} + +type CollectorManager interface { + Init(ticker mct.MultiChanTicker, duration time.Duration, wg *sync.WaitGroup, collectConfigFile string) error + AddOutput(output chan lp.CCMetric) + Start() + Close() +} + +func (cm *collectorManager) Init(ticker mct.MultiChanTicker, duration time.Duration, wg *sync.WaitGroup, collectConfigFile string) error { + cm.collectors = make([]MetricCollector, 0) + cm.output = nil + cm.done = make(chan bool) + cm.wg = wg 
+ cm.ticker = ticker + cm.duration = duration + configFile, err := os.Open(collectConfigFile) + if err != nil { + log.Print(err.Error()) + return err + } + defer configFile.Close() + jsonParser := json.NewDecoder(configFile) + err = jsonParser.Decode(&cm.config) + if err != nil { + log.Print(err.Error()) + return err + } + for k, cfg := range cm.config { + log.Print(k, " ", cfg) + if _, found := AvailableCollectors[k]; !found { + log.Print("[CollectorManager] SKIP unknown collector ", k) + continue + } + c := AvailableCollectors[k] + + err = c.Init(cfg) + if err != nil { + log.Print("[CollectorManager] Collector ", k, "initialization failed: ", err.Error()) + continue + } + cm.collectors = append(cm.collectors, c) + } + return nil +} + +func (cm *collectorManager) Start() { + cm.wg.Add(1) + tick := make(chan time.Time) + cm.ticker.AddChannel(tick) + go func() { + for { + CollectorManagerLoop: + select { + case <-cm.done: + for _, c := range cm.collectors { + c.Close() + } + cm.wg.Done() + log.Print("[CollectorManager] DONE\n") + break CollectorManagerLoop + case t := <-tick: + for _, c := range cm.collectors { + CollectorManagerInputLoop: + select { + case <-cm.done: + for _, c := range cm.collectors { + c.Close() + } + cm.wg.Done() + log.Print("[CollectorManager] DONE\n") + break CollectorManagerInputLoop + default: + log.Print("[CollectorManager] ", c.Name(), " ", t) + c.Read(cm.duration, cm.output) + } + } + } + } + log.Print("[CollectorManager] EXIT\n") + }() + log.Print("[CollectorManager] STARTED\n") +} + +func (cm *collectorManager) AddOutput(output chan lp.CCMetric) { + cm.output = output +} + +func (cm *collectorManager) Close() { + cm.done <- true + log.Print("[CollectorManager] CLOSE") +} + +func New(ticker mct.MultiChanTicker, duration time.Duration, wg *sync.WaitGroup, collectConfigFile string) (CollectorManager, error) { + cm := &collectorManager{} + err := cm.Init(ticker, duration, wg, collectConfigFile) + if err != nil { + return nil, err + } + 
return cm, err +} diff --git a/collectors/cpufreqCpuinfoMetric.go b/collectors/cpufreqCpuinfoMetric.go index e8cd0fc..9c91a50 100644 --- a/collectors/cpufreqCpuinfoMetric.go +++ b/collectors/cpufreqCpuinfoMetric.go @@ -2,14 +2,16 @@ package collectors import ( "bufio" + "encoding/json" + "fmt" "log" "os" "strconv" "strings" "time" + lp "github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric" - lp "github.com/influxdata/line-protocol" ) // @@ -33,12 +35,16 @@ type CPUFreqCpuInfoCollectorTopology struct { } type CPUFreqCpuInfoCollector struct { - MetricCollector + metricCollector topology []CPUFreqCpuInfoCollectorTopology } -func (m *CPUFreqCpuInfoCollector) Init(config []byte) error { +func (m *CPUFreqCpuInfoCollector) Init(config json.RawMessage) error { m.name = "CPUFreqCpuInfoCollector" + m.meta = map[string]string{ + "source": m.name, + "group": "cpufreq", + } const cpuInfoFile = "/proc/cpuinfo" file, err := os.Open(cpuInfoFile) @@ -145,7 +151,8 @@ func (m *CPUFreqCpuInfoCollector) Init(config []byte) error { return nil } -func (m *CPUFreqCpuInfoCollector) Read(interval time.Duration, out *[]lp.MutableMetric) { + +func (m *CPUFreqCpuInfoCollector) Read(interval time.Duration, output chan lp.CCMetric) { if !m.init { return } @@ -174,9 +181,9 @@ func (m *CPUFreqCpuInfoCollector) Read(interval time.Duration, out *[]lp.Mutable log.Printf("Failed to convert cpu MHz to float: %v", err) return } - y, err := lp.New("cpufreq", t.tagSet, map[string]interface{}{"value": value}, now) + y, err := lp.New("cpufreq", t.tagSet, m.meta, map[string]interface{}{"value": value}, now) if err == nil { - *out = append(*out, y) + output <- y } } processorCounter++ diff --git a/collectors/cpufreqMetric.go b/collectors/cpufreqMetric.go index fcab782..5febed9 100644 --- a/collectors/cpufreqMetric.go +++ b/collectors/cpufreqMetric.go @@ -10,8 +10,7 @@ import ( "strconv" "strings" "time" - - lp "github.com/influxdata/line-protocol" + lp 
"github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric" "golang.org/x/sys/unix" ) @@ -56,14 +55,14 @@ type CPUFreqCollectorTopology struct { // See: https://www.kernel.org/doc/html/latest/admin-guide/pm/cpufreq.html // type CPUFreqCollector struct { - MetricCollector + metricCollector topology []CPUFreqCollectorTopology config struct { ExcludeMetrics []string `json:"exclude_metrics,omitempty"` } } -func (m *CPUFreqCollector) Init(config []byte) error { +func (m *CPUFreqCollector) Init(config json.RawMessage) error { m.name = "CPUFreqCollector" m.setup() if len(config) > 0 { @@ -72,6 +71,10 @@ func (m *CPUFreqCollector) Init(config []byte) error { return err } } + m.meta = map[string]string{ + "source": m.name, + "group": "CPU Frequency", + } // Loop for all CPU directories baseDir := "/sys/devices/system/cpu" @@ -179,7 +182,7 @@ func (m *CPUFreqCollector) Init(config []byte) error { return nil } -func (m *CPUFreqCollector) Read(interval time.Duration, out *[]lp.MutableMetric) { +func (m *CPUFreqCollector) Read(interval time.Duration, output chan lp.CCMetric) { if !m.init { return } @@ -205,9 +208,9 @@ func (m *CPUFreqCollector) Read(interval time.Duration, out *[]lp.MutableMetric) continue } - y, err := lp.New("cpufreq", t.tagSet, map[string]interface{}{"value": cpuFreq}, now) + y, err := lp.New("cpufreq", t.tagSet, m.meta, map[string]interface{}{"value": cpuFreq}, now) if err == nil { - *out = append(*out, y) + output <- y } } } diff --git a/collectors/cpustatMetric.go b/collectors/cpustatMetric.go index 64b5842..f517300 100644 --- a/collectors/cpustatMetric.go +++ b/collectors/cpustatMetric.go @@ -7,8 +7,7 @@ import ( "strconv" "strings" "time" - - lp "github.com/influxdata/line-protocol" + lp "github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric" ) const CPUSTATFILE = `/proc/stat` @@ -18,13 +17,14 @@ type CpustatCollectorConfig struct { } type CpustatCollector struct { - MetricCollector + metricCollector config CpustatCollectorConfig } -func 
(m *CpustatCollector) Init(config []byte) error { +func (m *CpustatCollector) Init(config json.RawMessage) error { m.name = "CpustatCollector" m.setup() + m.meta = map[string]string{"source": m.name, "group": "CPU"} if len(config) > 0 { err := json.Unmarshal(config, &m.config) if err != nil { @@ -35,7 +35,7 @@ func (m *CpustatCollector) Init(config []byte) error { return nil } -func ParseStatLine(line string, cpu int, exclude []string, out *[]lp.MutableMetric) { +func (c *CpustatCollector) parseStatLine(line string, cpu int, exclude []string, output chan lp.CCMetric) { ls := strings.Fields(line) matches := []string{"", "cpu_user", "cpu_nice", "cpu_system", "cpu_idle", "cpu_iowait", "cpu_irq", "cpu_softirq", "cpu_steal", "cpu_guest", "cpu_guest_nice"} for _, ex := range exclude { @@ -52,16 +52,16 @@ func ParseStatLine(line string, cpu int, exclude []string, out *[]lp.MutableMetr if len(m) > 0 { x, err := strconv.ParseInt(ls[i], 0, 64) if err == nil { - y, err := lp.New(m, tags, map[string]interface{}{"value": int(x)}, time.Now()) + y, err := lp.New(m, tags, c.meta, map[string]interface{}{"value": int(x)}, time.Now()) if err == nil { - *out = append(*out, y) + output <- y } } } } } -func (m *CpustatCollector) Read(interval time.Duration, out *[]lp.MutableMetric) { +func (m *CpustatCollector) Read(interval time.Duration, output chan lp.CCMetric) { if !m.init { return } @@ -78,11 +78,11 @@ func (m *CpustatCollector) Read(interval time.Duration, out *[]lp.MutableMetric) } ls := strings.Fields(line) if strings.Compare(ls[0], "cpu") == 0 { - ParseStatLine(line, -1, m.config.ExcludeMetrics, out) + m.parseStatLine(line, -1, m.config.ExcludeMetrics, output) } else if strings.HasPrefix(ls[0], "cpu") { cpustr := strings.TrimLeft(ls[0], "cpu") cpu, _ := strconv.Atoi(cpustr) - ParseStatLine(line, cpu, m.config.ExcludeMetrics, out) + m.parseStatLine(line, cpu, m.config.ExcludeMetrics, output) } } } diff --git a/collectors/cpustatMetric.md b/collectors/cpustatMetric.md new file 
mode 100644 index 0000000..604445a --- /dev/null +++ b/collectors/cpustatMetric.md @@ -0,0 +1,23 @@ + +## `cpustat` collector +```json + "cpustat": { + "exclude_metrics": [ + "cpu_idle" + ] + } +``` + +The `cpustat` collector reads data from `/proc/stat` and outputs a handful **node** and **hwthread** metrics. If a metric is not required, it can be excluded from forwarding it to the sink. + +Metrics: +* `cpu_user` +* `cpu_nice` +* `cpu_system` +* `cpu_idle` +* `cpu_iowait` +* `cpu_irq` +* `cpu_softirq` +* `cpu_steal` +* `cpu_guest` +* `cpu_guest_nice` diff --git a/collectors/customCmdMetric.go b/collectors/customCmdMetric.go index e11f4c7..ffe8b73 100644 --- a/collectors/customCmdMetric.go +++ b/collectors/customCmdMetric.go @@ -9,7 +9,8 @@ import ( "strings" "time" - lp "github.com/influxdata/line-protocol" + lp "github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric" + influx "github.com/influxdata/line-protocol" ) const CUSTOMCMDPATH = `/home/unrz139/Work/cc-metric-collector/collectors/custom` @@ -21,17 +22,18 @@ type CustomCmdCollectorConfig struct { } type CustomCmdCollector struct { - MetricCollector - handler *lp.MetricHandler - parser *lp.Parser + metricCollector + handler *influx.MetricHandler + parser *influx.Parser config CustomCmdCollectorConfig commands []string files []string } -func (m *CustomCmdCollector) Init(config []byte) error { +func (m *CustomCmdCollector) Init(config json.RawMessage) error { var err error m.name = "CustomCmdCollector" + m.meta = map[string]string{"source": m.name, "group": "Custom"} if len(config) > 0 { err = json.Unmarshal(config, &m.config) if err != nil { @@ -61,8 +63,8 @@ if len(m.files) == 0 && len(m.commands) == 0 { return errors.New("No metrics to collect") } - m.handler = lp.NewMetricHandler() - m.parser = lp.NewParser(m.handler) + m.handler = influx.NewMetricHandler() + m.parser = influx.NewParser(m.handler) m.parser.SetTimeFunc(DefaultTime) m.init = 
true return nil @@ -72,7 +74,7 @@ var DefaultTime = func() time.Time { return time.Unix(42, 0) } -func (m *CustomCmdCollector) Read(interval time.Duration, out *[]lp.MutableMetric) { +func (m *CustomCmdCollector) Read(interval time.Duration, output chan lp.CCMetric) { if !m.init { return } @@ -95,9 +97,9 @@ if skip { continue } - y, err := lp.New(c.Name(), Tags2Map(c), Fields2Map(c), c.Time()) + y, err := lp.New(c.Name(), Tags2Map(c), m.meta, Fields2Map(c), c.Time()) if err == nil { - *out = append(*out, y) + output <- y } } } @@ -117,9 +119,9 @@ if skip { continue } - y, err := lp.New(f.Name(), Tags2Map(f), Fields2Map(f), f.Time()) + y, err := lp.New(f.Name(), Tags2Map(f), m.meta, Fields2Map(f), f.Time()) if err == nil { - *out = append(*out, y) + output <- y } } } diff --git a/collectors/customCmdMetric.md b/collectors/customCmdMetric.md new file mode 100644 index 0000000..011135d --- /dev/null +++ b/collectors/customCmdMetric.md @@ -0,0 +1,20 @@ + +## `customcmd` collector + +```json + "customcmd": { + "exclude_metrics": [ + "mymetric" + ], + "files" : [ + "/var/run/myapp.metrics" + ], + "commands" : [ + "/usr/local/bin/getmetrics.pl" + ] + } +``` + +The `customcmd` collector reads data from files and the output of executed commands. The files and commands can output multiple metrics (separated by newline) but they have to be in the [InfluxDB line protocol](https://docs.influxdata.com/influxdb/cloud/reference/syntax/line-protocol/). If a metric is not parsable, it is skipped. If a metric is not required, it can be excluded from forwarding it to the sink. 
+ + diff --git a/collectors/diskstatMetric.go b/collectors/diskstatMetric.go index 4cbd3c6..50c41cd 100644 --- a/collectors/diskstatMetric.go +++ b/collectors/diskstatMetric.go @@ -2,9 +2,7 @@ package collectors import ( "io/ioutil" - - lp "github.com/influxdata/line-protocol" - + lp "github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric" // "log" "encoding/json" "errors" @@ -21,14 +19,15 @@ type DiskstatCollectorConfig struct { } type DiskstatCollector struct { - MetricCollector + metricCollector matches map[int]string config DiskstatCollectorConfig } -func (m *DiskstatCollector) Init(config []byte) error { +func (m *DiskstatCollector) Init(config json.RawMessage) error { var err error m.name = "DiskstatCollector" + m.meta = map[string]string{"source": m.name, "group": "Disk"} m.setup() if len(config) > 0 { err = json.Unmarshal(config, &m.config) @@ -73,7 +72,7 @@ return err } -func (m *DiskstatCollector) Read(interval time.Duration, out *[]lp.MutableMetric) { +func (m *DiskstatCollector) Read(interval time.Duration, output chan lp.CCMetric) { var lines []string if !m.init { return @@ -101,9 +100,9 @@ if idx < len(f) { x, err := strconv.ParseInt(f[idx], 0, 64) if err == nil { - y, err := lp.New(name, tags, map[string]interface{}{"value": int(x)}, time.Now()) + y, err := lp.New(name, tags, m.meta, map[string]interface{}{"value": int(x)}, time.Now()) if err == nil { - *out = append(*out, y) + output <- y } } } diff --git a/collectors/diskstatMetric.md b/collectors/diskstatMetric.md new file mode 100644 index 0000000..1ac341d --- /dev/null +++ b/collectors/diskstatMetric.md @@ -0,0 +1,34 @@ + +## `diskstat` collector + +```json + "diskstat": { + "exclude_metrics": [ + "read_ms" + ], + } +``` + +The `diskstat` collector reads data from `/proc/diskstats` and outputs a handful **node** metrics. 
If a metric is not required, it can be excluded from forwarding it to the sink. + +Metrics: +* `reads` +* `reads_merged` +* `read_sectors` +* `read_ms` +* `writes` +* `writes_merged` +* `writes_sectors` +* `writes_ms` +* `ioops` +* `ioops_ms` +* `ioops_weighted_ms` +* `discards` +* `discards_merged` +* `discards_sectors` +* `discards_ms` +* `flushes` +* `flushes_ms` + +The device name is added as tag `device`. + diff --git a/collectors/gpfsMetric.go b/collectors/gpfsMetric.go index fbf3a63..f1d3d75 100644 --- a/collectors/gpfsMetric.go +++ b/collectors/gpfsMetric.go @@ -13,18 +13,20 @@ import ( "strconv" "strings" "time" - - lp "github.com/influxdata/line-protocol" + lp "github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric" ) type GpfsCollector struct { - MetricCollector + metricCollector + tags map[string]string + config struct { Mmpmon string `json:"mmpmon"` } } -func (m *GpfsCollector) Init(config []byte) error { + +func (m *GpfsCollector) Init(config json.RawMessage) error { var err error m.name = "GpfsCollector" m.setup() @@ -40,6 +42,14 @@ func (m *GpfsCollector) Init(config []byte) error { return err } } + m.meta = map[string]string{ + "source": m.name, + "group": "GPFS", + } + m.tags = map[string]string{ + "type": "node", + "filesystem": "", + } // GPFS / IBM Spectrum Scale file system statistics can only be queried by user root user, err := user.Current() @@ -60,7 +70,7 @@ func (m *GpfsCollector) Init(config []byte) error { return nil } -func (m *GpfsCollector) Read(interval time.Duration, out *[]lp.MutableMetric) { +func (m *GpfsCollector) Read(interval time.Duration, output chan lp.CCMetric) { if !m.init { return } @@ -108,6 +118,9 @@ func (m *GpfsCollector) Read(interval time.Duration, out *[]lp.MutableMetric) { continue } + m.tags["filesystem"] = filesystem + + // return code rc, err := strconv.Atoi(key_value["_rc_"]) if err != nil { @@ -140,17 +153,10 @@ func (m *GpfsCollector) Read(interval time.Duration, out *[]lp.MutableMetric) { 
key_value["_br_"], err.Error()) continue } - y, err := lp.New( - "gpfs_bytes_read", - map[string]string{ - "filesystem": filesystem, - }, - map[string]interface{}{ - "value": bytesRead, - }, - timestamp) + + y, err := lp.New("gpfs_bytes_read", m.tags, m.meta, map[string]interface{}{"value": bytesRead}, timestamp) if err == nil { - *out = append(*out, y) + output <- y } // bytes written @@ -161,17 +167,10 @@ func (m *GpfsCollector) Read(interval time.Duration, out *[]lp.MutableMetric) { key_value["_bw_"], err.Error()) continue } - y, err = lp.New( - "gpfs_bytes_written", - map[string]string{ - "filesystem": filesystem, - }, - map[string]interface{}{ - "value": bytesWritten, - }, - timestamp) + + y, err = lp.New("gpfs_bytes_written", m.tags, m.meta, map[string]interface{}{"value": bytesWritten}, timestamp) if err == nil { - *out = append(*out, y) + output <- y } // number of opens @@ -182,17 +181,9 @@ func (m *GpfsCollector) Read(interval time.Duration, out *[]lp.MutableMetric) { key_value["_oc_"], err.Error()) continue } - y, err = lp.New( - "gpfs_num_opens", - map[string]string{ - "filesystem": filesystem, - }, - map[string]interface{}{ - "value": numOpens, - }, - timestamp) + y, err = lp.New("gpfs_num_opens", m.tags, m.meta, map[string]interface{}{"value": numOpens}, timestamp) if err == nil { - *out = append(*out, y) + output <- y } // number of closes @@ -201,17 +192,9 @@ func (m *GpfsCollector) Read(interval time.Duration, out *[]lp.MutableMetric) { fmt.Fprintf(os.Stderr, "GpfsCollector.Read(): Failed to convert number of closes: %s\n", err.Error()) continue } - y, err = lp.New( - "gpfs_num_closes", - map[string]string{ - "filesystem": filesystem, - }, - map[string]interface{}{ - "value": numCloses, - }, - timestamp) + y, err = lp.New("gpfs_num_closes", m.tags, m.meta, map[string]interface{}{"value": numCloses}, timestamp) if err == nil { - *out = append(*out, y) + output <- y } // number of reads @@ -220,17 +203,9 @@ func (m *GpfsCollector) Read(interval 
time.Duration, out *[]lp.MutableMetric) { fmt.Fprintf(os.Stderr, "GpfsCollector.Read(): Failed to convert number of reads: %s\n", err.Error()) continue } - y, err = lp.New( - "gpfs_num_reads", - map[string]string{ - "filesystem": filesystem, - }, - map[string]interface{}{ - "value": numReads, - }, - timestamp) + y, err = lp.New("gpfs_num_reads", m.tags, m.meta, map[string]interface{}{"value": numReads}, timestamp) if err == nil { - *out = append(*out, y) + output <- y } // number of writes @@ -239,17 +214,9 @@ func (m *GpfsCollector) Read(interval time.Duration, out *[]lp.MutableMetric) { fmt.Fprintf(os.Stderr, "GpfsCollector.Read(): Failed to convert number of writes: %s\n", err.Error()) continue } - y, err = lp.New( - "gpfs_num_writes", - map[string]string{ - "filesystem": filesystem, - }, - map[string]interface{}{ - "value": numWrites, - }, - timestamp) + y, err = lp.New("gpfs_num_writes", m.tags, m.meta, map[string]interface{}{"value": numWrites}, timestamp) if err == nil { - *out = append(*out, y) + output <- y } // number of read directories @@ -258,17 +225,9 @@ func (m *GpfsCollector) Read(interval time.Duration, out *[]lp.MutableMetric) { fmt.Fprintf(os.Stderr, "GpfsCollector.Read(): Failed to convert number of read directories: %s\n", err.Error()) continue } - y, err = lp.New( - "gpfs_num_readdirs", - map[string]string{ - "filesystem": filesystem, - }, - map[string]interface{}{ - "value": numReaddirs, - }, - timestamp) + y, err = lp.New("gpfs_num_readdirs", m.tags, m.meta, map[string]interface{}{"value": numReaddirs}, timestamp) if err == nil { - *out = append(*out, y) + output <- y } // Number of inode updates @@ -277,17 +236,9 @@ func (m *GpfsCollector) Read(interval time.Duration, out *[]lp.MutableMetric) { fmt.Fprintf(os.Stderr, "GpfsCollector.Read(): Failed to convert Number of inode updates: %s\n", err.Error()) continue } - y, err = lp.New( - "gpfs_num_inode_updates", - map[string]string{ - "filesystem": filesystem, - }, - map[string]interface{}{ - 
"value": numInodeUpdates, - }, - timestamp) + y, err = lp.New("gpfs_num_inode_updates", m.tags, m.meta, map[string]interface{}{"value": numInodeUpdates}, timestamp) if err == nil { - *out = append(*out, y) + output <- y } } } diff --git a/collectors/infinibandMetric.go b/collectors/infinibandMetric.go index db7c129..af4e579 100644 --- a/collectors/infinibandMetric.go +++ b/collectors/infinibandMetric.go @@ -5,9 +5,7 @@ import ( "io/ioutil" "log" "os/exec" - - lp "github.com/influxdata/line-protocol" - + lp "github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric" // "os" "encoding/json" "errors" @@ -28,7 +26,7 @@ type InfinibandCollectorConfig struct { } type InfinibandCollector struct { - MetricCollector + metricCollector tags map[string]string lids map[string]map[string]string config InfinibandCollectorConfig @@ -56,11 +54,12 @@ func (m *InfinibandCollector) Help() { fmt.Println("- ib_xmit_pkts") } -func (m *InfinibandCollector) Init(config []byte) error { +func (m *InfinibandCollector) Init(config json.RawMessage) error { var err error m.name = "InfinibandCollector" m.use_perfquery = false m.setup() + m.meta = map[string]string{"source": m.name, "group": "Network"} m.tags = map[string]string{"type": "node"} if len(config) > 0 { err = json.Unmarshal(config, &m.config) @@ -117,7 +116,7 @@ func (m *InfinibandCollector) Init(config []byte) error { return err } -func DoPerfQuery(cmd string, dev string, lid string, port string, tags map[string]string, out *[]lp.MutableMetric) error { +func (m *InfinibandCollector) doPerfQuery(cmd string, dev string, lid string, port string, tags map[string]string, output chan lp.CCMetric) error { args := fmt.Sprintf("-r %s %s 0xf000", lid, port) command := exec.Command(cmd, args) @@ -134,9 +133,9 @@ func DoPerfQuery(cmd string, dev string, lid string, port string, tags map[strin lv := strings.Fields(line) v, err := strconv.ParseFloat(lv[1], 64) if err == nil { - y, err := lp.New("ib_recv", tags, map[string]interface{}{"value": 
float64(v)}, time.Now()) + y, err := lp.New("ib_recv", tags, m.meta, map[string]interface{}{"value": float64(v)}, time.Now()) if err == nil { - *out = append(*out, y) + output <- y } } } @@ -144,9 +143,9 @@ func DoPerfQuery(cmd string, dev string, lid string, port string, tags map[strin lv := strings.Fields(line) v, err := strconv.ParseFloat(lv[1], 64) if err == nil { - y, err := lp.New("ib_xmit", tags, map[string]interface{}{"value": float64(v)}, time.Now()) + y, err := lp.New("ib_xmit", tags, m.meta, map[string]interface{}{"value": float64(v)}, time.Now()) if err == nil { - *out = append(*out, y) + output <- y } } } @@ -154,9 +153,9 @@ func DoPerfQuery(cmd string, dev string, lid string, port string, tags map[strin lv := strings.Fields(line) v, err := strconv.ParseFloat(lv[1], 64) if err == nil { - y, err := lp.New("ib_recv_pkts", tags, map[string]interface{}{"value": float64(v)}, time.Now()) + y, err := lp.New("ib_recv_pkts", tags, m.meta, map[string]interface{}{"value": float64(v)}, time.Now()) if err == nil { - *out = append(*out, y) + output <- y } } } @@ -164,9 +163,29 @@ func DoPerfQuery(cmd string, dev string, lid string, port string, tags map[strin lv := strings.Fields(line) v, err := strconv.ParseFloat(lv[1], 64) if err == nil { - y, err := lp.New("ib_xmit_pkts", tags, map[string]interface{}{"value": float64(v)}, time.Now()) + y, err := lp.New("ib_xmit_pkts", tags, m.meta, map[string]interface{}{"value": float64(v)}, time.Now()) if err == nil { - *out = append(*out, y) + output <- y + } + } + } + if strings.HasPrefix(line, "PortRcvPkts") || strings.HasPrefix(line, "RcvPkts") { + lv := strings.Fields(line) + v, err := strconv.ParseFloat(lv[1], 64) + if err == nil { + y, err := lp.New("ib_recv_pkts", tags, m.meta, map[string]interface{}{"value": float64(v)}, time.Now()) + if err == nil { + output <- y + } + } + } + if strings.HasPrefix(line, "PortXmitPkts") || strings.HasPrefix(line, "XmtPkts") { + lv := strings.Fields(line) + v, err := 
strconv.ParseFloat(lv[1], 64) + if err == nil { + y, err := lp.New("ib_xmit_pkts", tags, m.meta, map[string]interface{}{"value": float64(v)}, time.Now()) + if err == nil { + output <- y } } } @@ -174,16 +193,16 @@ func DoPerfQuery(cmd string, dev string, lid string, port string, tags map[strin return nil } -func DoSysfsRead(dev string, lid string, port string, tags map[string]string, out *[]lp.MutableMetric) error { +func (m *InfinibandCollector) doSysfsRead(dev string, lid string, port string, tags map[string]string, output chan lp.CCMetric) error { path := fmt.Sprintf("%s/%s/ports/%s/counters/", string(IBBASEPATH), dev, port) buffer, err := ioutil.ReadFile(fmt.Sprintf("%s/port_rcv_data", path)) if err == nil { data := strings.Replace(string(buffer), "\n", "", -1) v, err := strconv.ParseFloat(data, 64) if err == nil { - y, err := lp.New("ib_recv", tags, map[string]interface{}{"value": float64(v)}, time.Now()) + y, err := lp.New("ib_recv", tags, m.meta, map[string]interface{}{"value": float64(v)}, time.Now()) if err == nil { - *out = append(*out, y) + output <- y } } } @@ -192,9 +211,9 @@ func DoSysfsRead(dev string, lid string, port string, tags map[string]string, ou data := strings.Replace(string(buffer), "\n", "", -1) v, err := strconv.ParseFloat(data, 64) if err == nil { - y, err := lp.New("ib_xmit", tags, map[string]interface{}{"value": float64(v)}, time.Now()) + y, err := lp.New("ib_xmit", tags, m.meta, map[string]interface{}{"value": float64(v)}, time.Now()) if err == nil { - *out = append(*out, y) + output <- y } } } @@ -203,9 +222,9 @@ func DoSysfsRead(dev string, lid string, port string, tags map[string]string, ou data := strings.Replace(string(buffer), "\n", "", -1) v, err := strconv.ParseFloat(data, 64) if err == nil { - y, err := lp.New("ib_recv_pkts", tags, map[string]interface{}{"value": float64(v)}, time.Now()) + y, err := lp.New("ib_recv_pkts", tags, m.meta, map[string]interface{}{"value": float64(v)}, time.Now()) if err == nil { - *out = 
append(*out, y) + output <- y } } } @@ -214,71 +233,29 @@ func DoSysfsRead(dev string, lid string, port string, tags map[string]string, ou data := strings.Replace(string(buffer), "\n", "", -1) v, err := strconv.ParseFloat(data, 64) if err == nil { - y, err := lp.New("ib_xmit_pkts", tags, map[string]interface{}{"value": float64(v)}, time.Now()) + y, err := lp.New("ib_xmit_pkts", tags, m.meta, map[string]interface{}{"value": float64(v)}, time.Now()) if err == nil { - *out = append(*out, y) + output <- y } } } return nil } -func (m *InfinibandCollector) Read(interval time.Duration, out *[]lp.MutableMetric) { +func (m *InfinibandCollector) Read(interval time.Duration, output chan lp.CCMetric) { if m.init { for dev, ports := range m.lids { for port, lid := range ports { tags := map[string]string{"type": "node", "device": dev, "port": port} if m.use_perfquery { - DoPerfQuery(m.config.PerfQueryPath, dev, lid, port, tags, out) + m.doPerfQuery(m.config.PerfQueryPath, dev, lid, port, tags, output) } else { - DoSysfsRead(dev, lid, port, tags, out) + m.doSysfsRead(dev, lid, port, tags, output) } } } } - - // buffer, err := ioutil.ReadFile(string(LIDFILE)) - - // if err != nil { - // log.Print(err) - // return - // } - - // args := fmt.Sprintf("-r %s 1 0xf000", string(buffer)) - - // command := exec.Command(PERFQUERY, args) - // command.Wait() - // stdout, err := command.Output() - // if err != nil { - // log.Print(err) - // return - // } - - // ll := strings.Split(string(stdout), "\n") - - // for _, line := range ll { - // if strings.HasPrefix(line, "PortRcvData") || strings.HasPrefix(line, "RcvData") { - // lv := strings.Fields(line) - // v, err := strconv.ParseFloat(lv[1], 64) - // if err == nil { - // y, err := lp.New("ib_recv", m.tags, map[string]interface{}{"value": float64(v)}, time.Now()) - // if err == nil { - // *out = append(*out, y) - // } - // } - // } - // if strings.HasPrefix(line, "PortXmitData") || strings.HasPrefix(line, "XmtData") { - // lv := 
strings.Fields(line) - // v, err := strconv.ParseFloat(lv[1], 64) - // if err == nil { - // y, err := lp.New("ib_xmit", m.tags, map[string]interface{}{"value": float64(v)}, time.Now()) - // if err == nil { - // *out = append(*out, y) - // } - // } - // } - // } } func (m *InfinibandCollector) Close() { diff --git a/collectors/infinibandMetric.md b/collectors/infinibandMetric.md new file mode 100644 index 0000000..e9ba043 --- /dev/null +++ b/collectors/infinibandMetric.md @@ -0,0 +1,19 @@ + +## `ibstat` collector + +```json + "ibstat": { + "perfquery_path" : "", + "exclude_devices": [ + "mlx4" + ] + } +``` + +The `ibstat` collector reads either data through the `perfquery` command or the sysfs files below `/sys/class/infiniband/`. + +Metrics: +* `ib_recv` +* `ib_xmit` + +The collector adds a `device` tag to all metrics diff --git a/collectors/ipmiMetric.go b/collectors/ipmiMetric.go index 3179148..f4c5167 100644 --- a/collectors/ipmiMetric.go +++ b/collectors/ipmiMetric.go @@ -9,8 +9,7 @@ import ( "strconv" "strings" "time" - - lp "github.com/influxdata/line-protocol" + lp "github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric" ) const IPMITOOL_PATH = `/usr/bin/ipmitool` @@ -23,15 +22,16 @@ type IpmiCollectorConfig struct { } type IpmiCollector struct { - MetricCollector + metricCollector tags map[string]string matches map[string]string config IpmiCollectorConfig } -func (m *IpmiCollector) Init(config []byte) error { +func (m *IpmiCollector) Init(config json.RawMessage) error { m.name = "IpmiCollector" m.setup() + m.meta = map[string]string{"source": m.name, "group": "IPMI"} if len(config) > 0 { err := json.Unmarshal(config, &m.config) if err != nil { @@ -53,7 +53,7 @@ func (m *IpmiCollector) Init(config []byte) error { return nil } -func ReadIpmiTool(cmd string, out *[]lp.MutableMetric) { +func (m *IpmiCollector) readIpmiTool(cmd string, output chan lp.CCMetric) { command := exec.Command(cmd, "sensor") command.Wait() stdout, err := command.Output() @@ 
-74,24 +74,25 @@ func ReadIpmiTool(cmd string, out *[]lp.MutableMetric) { name := strings.ToLower(strings.Replace(strings.Trim(lv[0], " "), " ", "_", -1)) unit := strings.Trim(lv[2], " ") if unit == "Volts" { - unit = "V" + unit = "Volts" } else if unit == "degrees C" { - unit = "C" + unit = "degC" } else if unit == "degrees F" { - unit = "F" + unit = "degF" } else if unit == "Watts" { - unit = "W" + unit = "Watts" } - y, err := lp.New(name, map[string]string{"unit": unit, "type": "node"}, map[string]interface{}{"value": v}, time.Now()) + y, err := lp.New(name, map[string]string{"type": "node"}, m.meta, map[string]interface{}{"value": v}, time.Now()) if err == nil { - *out = append(*out, y) + y.AddMeta("unit", unit) + output <- y } } } } -func ReadIpmiSensors(cmd string, out *[]lp.MutableMetric) { +func (m *IpmiCollector) readIpmiSensors(cmd string, output chan lp.CCMetric) { command := exec.Command(cmd, "--comma-separated-output", "--sdr-cache-recreate") command.Wait() @@ -109,25 +110,28 @@ func ReadIpmiSensors(cmd string, out *[]lp.MutableMetric) { v, err := strconv.ParseFloat(lv[3], 64) if err == nil { name := strings.ToLower(strings.Replace(lv[1], " ", "_", -1)) - y, err := lp.New(name, map[string]string{"unit": lv[4], "type": "node"}, map[string]interface{}{"value": v}, time.Now()) + y, err := lp.New(name, map[string]string{"type": "node"}, m.meta, map[string]interface{}{"value": v}, time.Now()) if err == nil { - *out = append(*out, y) + if len(lv) > 4 { + y.AddMeta("unit", lv[4]) + } + output <- y } } } } } -func (m *IpmiCollector) Read(interval time.Duration, out *[]lp.MutableMetric) { +func (m *IpmiCollector) Read(interval time.Duration, output chan lp.CCMetric) { if len(m.config.IpmitoolPath) > 0 { _, err := os.Stat(m.config.IpmitoolPath) if err == nil { - ReadIpmiTool(m.config.IpmitoolPath, out) + m.readIpmiTool(m.config.IpmitoolPath, output) } } else if len(m.config.IpmisensorsPath) > 0 { _, err := os.Stat(m.config.IpmisensorsPath) if err == nil { - 
ReadIpmiSensors(m.config.IpmisensorsPath, out) + m.readIpmiSensors(m.config.IpmisensorsPath, output) } } } diff --git a/collectors/ipmiMetric.md b/collectors/ipmiMetric.md new file mode 100644 index 0000000..fe83759 --- /dev/null +++ b/collectors/ipmiMetric.md @@ -0,0 +1,16 @@ + +## `ipmistat` collector + +```json + "ipmistat": { + "ipmitool_path": "/path/to/ipmitool", + "ipmisensors_path": "/path/to/ipmi-sensors", + } +``` + +The `ipmistat` collector reads data from `ipmitool` (`ipmitool sensor`) or `ipmi-sensors` (`ipmi-sensors --sdr-cache-recreate --comma-separated-output`). + +The metrics depend on the output of the underlying tools but contain temperature, power and energy metrics. + + + diff --git a/collectors/likwidMetric.go b/collectors/likwidMetric.go index 45fe68c..430a09b 100644 --- a/collectors/likwidMetric.go +++ b/collectors/likwidMetric.go @@ -20,16 +20,28 @@ import ( "strings" "time" "unsafe" - - lp "github.com/influxdata/line-protocol" + lp "github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric" "gopkg.in/Knetic/govaluate.v2" ) +type MetricScope int + +const ( + METRIC_SCOPE_HWTHREAD = iota + METRIC_SCOPE_SOCKET + METRIC_SCOPE_NUMA + METRIC_SCOPE_NODE +) + +func (ms MetricScope) String() string { + return []string{"Head", "Shoulder", "Knee", "Toe"}[ms] +} + type LikwidCollectorMetricConfig struct { - Name string `json:"name"` - Calc string `json:"calc"` - Socket_scope bool `json:"socket_scope"` - Publish bool `json:"publish"` + Name string `json:"name"` + Calc string `json:"calc"` + Scope MetricScope `json:"socket_scope"` + Publish bool `json:"publish"` } type LikwidCollectorEventsetConfig struct { @@ -45,7 +57,7 @@ type LikwidCollectorConfig struct { } type LikwidCollector struct { - MetricCollector + metricCollector cpulist []C.int sock2tid map[int]int metrics map[C.int]map[string]int @@ -105,7 +117,7 @@ func getSocketCpus() map[C.int]int { return outmap } -func (m *LikwidCollector) Init(config []byte) error { +func (m *LikwidCollector) 
Init(config json.RawMessage) error { var ret C.int m.name = "LikwidCollector" if len(config) > 0 { @@ -115,11 +127,13 @@ func (m *LikwidCollector) Init(config []byte) error { } } m.setup() + m.meta = map[string]string{"source": m.name, "group": "PerfCounter"} cpulist := CpuList() m.cpulist = make([]C.int, len(cpulist)) slist := getSocketCpus() m.sock2tid = make(map[int]int) + // m.numa2tid = make(map[int]int) for i, c := range cpulist { m.cpulist[i] = C.int(c) if sid, found := slist[m.cpulist[i]]; found { @@ -169,7 +183,7 @@ func (m *LikwidCollector) Init(config []byte) error { return nil } -func (m *LikwidCollector) Read(interval time.Duration, out *[]lp.MutableMetric) { +func (m *LikwidCollector) Read(interval time.Duration, output chan lp.CCMetric) { if !m.init { return } @@ -246,24 +260,28 @@ func (m *LikwidCollector) Read(interval time.Duration, out *[]lp.MutableMetric) for _, metric := range evset.Metrics { _, skip := stringArrayContains(m.config.ExcludeMetrics, metric.Name) if metric.Publish && !skip { - if metric.Socket_scope { + if metric.Scope.String() == "socket" { for sid, tid := range m.sock2tid { y, err := lp.New(metric.Name, - map[string]string{"type": "socket", "type-id": fmt.Sprintf("%d", int(sid))}, + map[string]string{"type": "socket", + "type-id": fmt.Sprintf("%d", int(sid))}, + m.meta, map[string]interface{}{"value": m.mresults[i][tid][metric.Name]}, time.Now()) if err == nil { - *out = append(*out, y) + output <- y } } - } else { + } else if metric.Scope.String() == "hwthread" { for tid, cpu := range m.cpulist { y, err := lp.New(metric.Name, - map[string]string{"type": "cpu", "type-id": fmt.Sprintf("%d", int(cpu))}, + map[string]string{"type": "cpu", + "type-id": fmt.Sprintf("%d", int(cpu))}, + m.meta, map[string]interface{}{"value": m.mresults[i][tid][metric.Name]}, time.Now()) if err == nil { - *out = append(*out, y) + output <- y } } } @@ -273,24 +291,28 @@ func (m *LikwidCollector) Read(interval time.Duration, out *[]lp.MutableMetric) for 
_, metric := range m.config.Metrics { _, skip := stringArrayContains(m.config.ExcludeMetrics, metric.Name) if metric.Publish && !skip { - if metric.Socket_scope { + if metric.Scope.String() == "socket" { for sid, tid := range m.sock2tid { y, err := lp.New(metric.Name, - map[string]string{"type": "socket", "type-id": fmt.Sprintf("%d", int(sid))}, + map[string]string{"type": "socket", + "type-id": fmt.Sprintf("%d", int(sid))}, + m.meta, map[string]interface{}{"value": m.gmresults[tid][metric.Name]}, time.Now()) if err == nil { - *out = append(*out, y) + output <- y } } } else { for tid, cpu := range m.cpulist { y, err := lp.New(metric.Name, - map[string]string{"type": "cpu", "type-id": fmt.Sprintf("%d", int(cpu))}, + map[string]string{"type": "cpu", + "type-id": fmt.Sprintf("%d", int(cpu))}, + m.meta, map[string]interface{}{"value": m.gmresults[tid][metric.Name]}, time.Now()) if err == nil { - *out = append(*out, y) + output <- y } } } diff --git a/collectors/likwidMetric.md b/collectors/likwidMetric.md new file mode 100644 index 0000000..08b917f --- /dev/null +++ b/collectors/likwidMetric.md @@ -0,0 +1,119 @@ + +## `likwid` collector +```json + "likwid": { + "eventsets": [ + { + "events": { + "FIXC1": "ACTUAL_CPU_CLOCK", + "FIXC2": "MAX_CPU_CLOCK", + "PMC0": "RETIRED_INSTRUCTIONS", + "PMC1": "CPU_CLOCKS_UNHALTED", + "PMC2": "RETIRED_SSE_AVX_FLOPS_ALL", + "PMC3": "MERGE", + "DFC0": "DRAM_CHANNEL_0", + "DFC1": "DRAM_CHANNEL_1", + "DFC2": "DRAM_CHANNEL_2", + "DFC3": "DRAM_CHANNEL_3" + }, + "metrics": [ + { + "name": "ipc", + "calc": "PMC0/PMC1", + "socket_scope": false, + "publish": true + }, + { + "name": "flops_any", + "calc": "0.000001*PMC2/time", + "socket_scope": false, + "publish": true + }, + { + "name": "clock_mhz", + "calc": "0.000001*(FIXC1/FIXC2)/inverseClock", + "socket_scope": false, + "publish": true + }, + { + "name": "mem1", + "calc": "0.000001*(DFC0+DFC1+DFC2+DFC3)*64.0/time", + "socket_scope": true, + "publish": false + } + ] + }, + { + "events": { + 
"DFC0": "DRAM_CHANNEL_4", + "DFC1": "DRAM_CHANNEL_5", + "DFC2": "DRAM_CHANNEL_6", + "DFC3": "DRAM_CHANNEL_7", + "PWR0": "RAPL_CORE_ENERGY", + "PWR1": "RAPL_PKG_ENERGY" + }, + "metrics": [ + { + "name": "pwr_core", + "calc": "PWR0/time", + "socket_scope": false, + "publish": true + }, + { + "name": "pwr_pkg", + "calc": "PWR1/time", + "socket_scope": true, + "publish": true + }, + { + "name": "mem2", + "calc": "0.000001*(DFC0+DFC1+DFC2+DFC3)*64.0/time", + "socket_scope": true, + "publish": false + } + ] + } + ], + "globalmetrics": [ + { + "name": "mem_bw", + "calc": "mem1+mem2", + "socket_scope": true, + "publish": true + } + ] + } +``` + +_Example config suitable for AMD Zen3_ + +The `likwid` collector reads hardware performance counters at a **hwthread** and **socket** level. The configuration looks quite complicated but it is basically copy&paste from [LIKWID's performance groups](https://github.com/RRZE-HPC/likwid/tree/master/groups). The collector made multiple iterations and tried to use the performance groups but it lacked flexibility. The current way of configuration provides most flexibility. + +The logic is as following: There are multiple eventsets, each consisting of a list of counters+events and a list of metrics. 
If you compare a common performance group with the example setting above, there is not much difference: +``` +EVENTSET -> "events": { +FIXC1 ACTUAL_CPU_CLOCK -> "FIXC1": "ACTUAL_CPU_CLOCK", +FIXC2 MAX_CPU_CLOCK -> "FIXC2": "MAX_CPU_CLOCK", +PMC0 RETIRED_INSTRUCTIONS -> "PMC0" : "RETIRED_INSTRUCTIONS", +PMC1 CPU_CLOCKS_UNHALTED -> "PMC1" : "CPU_CLOCKS_UNHALTED", +PMC2 RETIRED_SSE_AVX_FLOPS_ALL -> "PMC2": "RETIRED_SSE_AVX_FLOPS_ALL", +PMC3 MERGE -> "PMC3": "MERGE", + -> } +``` + +The metrics are following the same procedure: + +``` +METRICS -> "metrics": [ +IPC PMC0/PMC1 -> { + -> "name" : "IPC", + -> "calc" : "PMC0/PMC1", + -> "socket_scope": false, + -> "publish": true + -> } + -> ] +``` + +The `socket_scope` option tells whether it is submitted per socket or per hwthread. If a metric is only used for internal calculations, you can set `publish = false`. + +Since some metrics can only be gathered in multiple measurements (like the memory bandwidth on AMD Zen3 chips), configure multiple eventsets like in the example config and use the `globalmetrics` section to combine them. **Be aware** that the combination might be misleading because the "behavior" of a metric changes over time and the multiple measurements might count different computing phases. 
diff --git a/collectors/loadavgMetric.go b/collectors/loadavgMetric.go index 1ecaea5..11c0e5e 100644 --- a/collectors/loadavgMetric.go +++ b/collectors/loadavgMetric.go @@ -6,8 +6,7 @@ import ( "strconv" "strings" "time" - - lp "github.com/influxdata/line-protocol" + lp "github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric" ) const LOADAVGFILE = `/proc/loadavg` @@ -17,14 +16,14 @@ type LoadavgCollectorConfig struct { } type LoadavgCollector struct { - MetricCollector + metricCollector tags map[string]string load_matches []string proc_matches []string config LoadavgCollectorConfig } -func (m *LoadavgCollector) Init(config []byte) error { +func (m *LoadavgCollector) Init(config json.RawMessage) error { m.name = "LoadavgCollector" m.setup() if len(config) > 0 { @@ -33,6 +32,7 @@ func (m *LoadavgCollector) Init(config []byte) error { return err } } + m.meta = map[string]string{"source": m.name, "group": "LOAD"} m.tags = map[string]string{"type": "node"} m.load_matches = []string{"load_one", "load_five", "load_fifteen"} m.proc_matches = []string{"proc_run", "proc_total"} @@ -40,7 +40,7 @@ func (m *LoadavgCollector) Init(config []byte) error { return nil } -func (m *LoadavgCollector) Read(interval time.Duration, out *[]lp.MutableMetric) { +func (m *LoadavgCollector) Read(interval time.Duration, output chan lp.CCMetric) { var skip bool if !m.init { return @@ -56,9 +56,9 @@ func (m *LoadavgCollector) Read(interval time.Duration, out *[]lp.MutableMetric) x, err := strconv.ParseFloat(ls[i], 64) if err == nil { _, skip = stringArrayContains(m.config.ExcludeMetrics, name) - y, err := lp.New(name, m.tags, map[string]interface{}{"value": float64(x)}, time.Now()) + y, err := lp.New(name, m.tags, m.meta, map[string]interface{}{"value": float64(x)}, time.Now()) if err == nil && !skip { - *out = append(*out, y) + output <- y } } } @@ -67,9 +67,9 @@ func (m *LoadavgCollector) Read(interval time.Duration, out *[]lp.MutableMetric) x, err := strconv.ParseFloat(lv[i], 64) if 
err == nil { _, skip = stringArrayContains(m.config.ExcludeMetrics, name) - y, err := lp.New(name, m.tags, map[string]interface{}{"value": float64(x)}, time.Now()) + y, err := lp.New(name, m.tags, m.meta, map[string]interface{}{"value": float64(x)}, time.Now()) if err == nil && !skip { - *out = append(*out, y) + output <- y } } } diff --git a/collectors/loadavgMetric.md b/collectors/loadavgMetric.md new file mode 100644 index 0000000..d2b3f50 --- /dev/null +++ b/collectors/loadavgMetric.md @@ -0,0 +1,19 @@ + +## `loadavg` collector + +```json + "loadavg": { + "exclude_metrics": [ + "proc_run" + ] + } +``` + +The `loadavg` collector reads data from `/proc/loadavg` and outputs a handful **node** metrics. If a metric is not required, it can be excluded from forwarding it to the sink. + +Metrics: +* `load_one` +* `load_five` +* `load_fifteen` +* `proc_run` +* `proc_total` diff --git a/collectors/lustreMetric.go b/collectors/lustreMetric.go index 8931f84..3e248fa 100644 --- a/collectors/lustreMetric.go +++ b/collectors/lustreMetric.go @@ -8,8 +8,7 @@ import ( "strconv" "strings" "time" - - lp "github.com/influxdata/line-protocol" + lp "github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric" ) const LUSTREFILE = `/proc/fs/lustre/llite/lnec-XXXXXX/stats` @@ -20,14 +19,14 @@ type LustreCollectorConfig struct { } type LustreCollector struct { - MetricCollector + metricCollector tags map[string]string matches map[string]map[string]int devices []string config LustreCollectorConfig } -func (m *LustreCollector) Init(config []byte) error { +func (m *LustreCollector) Init(config json.RawMessage) error { var err error m.name = "LustreCollector" if len(config) > 0 { @@ -38,6 +37,7 @@ func (m *LustreCollector) Init(config []byte) error { } m.setup() m.tags = map[string]string{"type": "node"} + m.meta = map[string]string{"source": m.name, "group": "Lustre"} m.matches = map[string]map[string]int{"read_bytes": {"read_bytes": 6, "read_requests": 1}, "write_bytes": 
{"write_bytes": 6, "write_requests": 1}, "open": {"open": 1}, @@ -64,7 +64,7 @@ func (m *LustreCollector) Init(config []byte) error { return nil } -func (m *LustreCollector) Read(interval time.Duration, out *[]lp.MutableMetric) { +func (m *LustreCollector) Read(interval time.Duration, output chan lp.CCMetric) { if !m.init { return } @@ -88,9 +88,12 @@ func (m *LustreCollector) Read(interval time.Duration, out *[]lp.MutableMetric) } x, err := strconv.ParseInt(lf[idx], 0, 64) if err == nil { - y, err := lp.New(name, m.tags, map[string]interface{}{"value": x}, time.Now()) + y, err := lp.New(name, m.tags, m.meta, map[string]interface{}{"value": x}, time.Now()) if err == nil { - *out = append(*out, y) + if strings.Contains(name, "byte") { + y.AddMeta("unit", "Byte") + } + output <- y } } } diff --git a/collectors/lustreMetric.md b/collectors/lustreMetric.md new file mode 100644 index 0000000..0cb9fc8 --- /dev/null +++ b/collectors/lustreMetric.md @@ -0,0 +1,29 @@ + +## `lustrestat` collector + +```json + "lustrestat": { + "procfiles" : [ + "/proc/fs/lustre/llite/lnec-XXXXXX/stats" + ], + "exclude_metrics": [ + "setattr", + "getattr" + ] + } +``` + +The `lustrestat` collector reads from the procfs stat files for Lustre like `/proc/fs/lustre/llite/lnec-XXXXXX/stats`. 
+ +Metrics: +* `read_bytes` +* `read_requests` +* `write_bytes` +* `write_requests` +* `open` +* `close` +* `getattr` +* `setattr` +* `statfs` +* `inode_permission` + diff --git a/collectors/memstatMetric.go b/collectors/memstatMetric.go index 17db13e..c83402c 100644 --- a/collectors/memstatMetric.go +++ b/collectors/memstatMetric.go @@ -9,8 +9,7 @@ import ( "strconv" "strings" "time" - - lp "github.com/influxdata/line-protocol" + lp "github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric" ) const MEMSTATFILE = `/proc/meminfo` @@ -20,14 +19,14 @@ type MemstatCollectorConfig struct { } type MemstatCollector struct { - MetricCollector + metricCollector stats map[string]int64 tags map[string]string matches map[string]string config MemstatCollectorConfig } -func (m *MemstatCollector) Init(config []byte) error { +func (m *MemstatCollector) Init(config json.RawMessage) error { var err error m.name = "MemstatCollector" if len(config) > 0 { @@ -36,6 +35,7 @@ func (m *MemstatCollector) Init(config []byte) error { return err } } + m.meta = map[string]string{"source": m.name, "group": "Memory", "unit": "kByte"} m.stats = make(map[string]int64) m.matches = make(map[string]string) m.tags = map[string]string{"type": "node"} @@ -65,7 +65,7 @@ func (m *MemstatCollector) Init(config []byte) error { return err } -func (m *MemstatCollector) Read(interval time.Duration, out *[]lp.MutableMetric) { +func (m *MemstatCollector) Read(interval time.Duration, output chan lp.CCMetric) { if !m.init { return } @@ -97,9 +97,9 @@ func (m *MemstatCollector) Read(interval time.Duration, out *[]lp.MutableMetric) log.Print(err) continue } - y, err := lp.New(name, m.tags, map[string]interface{}{"value": int(float64(m.stats[match]) * 1.0e-3)}, time.Now()) + y, err := lp.New(name, m.tags, m.meta, map[string]interface{}{"value": int(float64(m.stats[match]) * 1.0e-3)}, time.Now()) if err == nil { - *out = append(*out, y) + output <- y } } @@ -108,18 +108,18 @@ func (m *MemstatCollector) 
Read(interval time.Duration, out *[]lp.MutableMetric) if _, cached := m.stats[`Cached`]; cached { memUsed := m.stats[`MemTotal`] - (m.stats[`MemFree`] + m.stats[`Buffers`] + m.stats[`Cached`]) _, skip := stringArrayContains(m.config.ExcludeMetrics, "mem_used") - y, err := lp.New("mem_used", m.tags, map[string]interface{}{"value": int(float64(memUsed) * 1.0e-3)}, time.Now()) + y, err := lp.New("mem_used", m.tags, m.meta, map[string]interface{}{"value": int(float64(memUsed) * 1.0e-3)}, time.Now()) if err == nil && !skip { - *out = append(*out, y) + output <- y } } } } if _, found := m.stats[`MemShared`]; found { _, skip := stringArrayContains(m.config.ExcludeMetrics, "mem_shared") - y, err := lp.New("mem_shared", m.tags, map[string]interface{}{"value": int(float64(m.stats[`MemShared`]) * 1.0e-3)}, time.Now()) + y, err := lp.New("mem_shared", m.tags, m.meta, map[string]interface{}{"value": int(float64(m.stats[`MemShared`]) * 1.0e-3)}, time.Now()) if err == nil && !skip { - *out = append(*out, y) + output <- y } } } diff --git a/collectors/memstatMetric.md b/collectors/memstatMetric.md new file mode 100644 index 0000000..4b7b8c7 --- /dev/null +++ b/collectors/memstatMetric.md @@ -0,0 +1,27 @@ + +## `memstat` collector + +```json + "memstat": { + "exclude_metrics": [ + "mem_used" + ] + } +``` + +The `memstat` collector reads data from `/proc/meminfo` and outputs a handful **node** metrics. If a metric is not required, it can be excluded from forwarding it to the sink. 
+ + +Metrics: +* `mem_total` +* `mem_sreclaimable` +* `mem_slab` +* `mem_free` +* `mem_buffers` +* `mem_cached` +* `mem_available` +* `mem_shared` +* `swap_total` +* `swap_free` +* `mem_used` = `mem_total` - (`mem_free` + `mem_buffers` + `mem_cached`) + diff --git a/collectors/metricCollector.go b/collectors/metricCollector.go index 0228530..6bc9047 100644 --- a/collectors/metricCollector.go +++ b/collectors/metricCollector.go @@ -1,8 +1,10 @@ package collectors import ( + "encoding/json" "errors" - lp "github.com/influxdata/line-protocol" + lp "github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric" + influx "github.com/influxdata/line-protocol" "io/ioutil" "log" "strconv" @@ -10,28 +12,30 @@ import ( "time" ) -type MetricGetter interface { +type MetricCollector interface { Name() string - Init(config []byte) error + Init(config json.RawMessage) error Initialized() bool - Read(time.Duration, *[]lp.MutableMetric) + Read(duration time.Duration, output chan lp.CCMetric) Close() } -type MetricCollector struct { - name string - init bool +type metricCollector struct { + output chan lp.CCMetric + name string + init bool + meta map[string]string } -func (c *MetricCollector) Name() string { +func (c *metricCollector) Name() string { return c.name } -func (c *MetricCollector) setup() error { +func (c *metricCollector) setup() error { return nil } -func (c *MetricCollector) Initialized() bool { +func (c *metricCollector) Initialized() bool { return c.init == true } @@ -103,7 +107,7 @@ func CpuList() []int { return cpulist } -func Tags2Map(metric lp.Metric) map[string]string { +func Tags2Map(metric influx.Metric) map[string]string { tags := make(map[string]string) for _, t := range metric.TagList() { tags[t.Key] = t.Value @@ -111,7 +115,7 @@ func Tags2Map(metric lp.Metric) map[string]string { return tags } -func Fields2Map(metric lp.Metric) map[string]interface{} { +func Fields2Map(metric influx.Metric) map[string]interface{} { fields := make(map[string]interface{}) 
for _, f := range metric.FieldList() { fields[f.Key] = f.Value diff --git a/collectors/netstatMetric.go b/collectors/netstatMetric.go index a273de1..86437ea 100644 --- a/collectors/netstatMetric.go +++ b/collectors/netstatMetric.go @@ -7,8 +7,7 @@ import ( "strconv" "strings" "time" - - lp "github.com/influxdata/line-protocol" + lp "github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric" ) const NETSTATFILE = `/proc/net/dev` @@ -18,14 +17,15 @@ type NetstatCollectorConfig struct { } type NetstatCollector struct { - MetricCollector + metricCollector config NetstatCollectorConfig matches map[int]string } -func (m *NetstatCollector) Init(config []byte) error { +func (m *NetstatCollector) Init(config json.RawMessage) error { m.name = "NetstatCollector" m.setup() + m.meta = map[string]string{"source": m.name, "group": "Memory"} m.matches = map[int]string{ 1: "bytes_in", 9: "bytes_out", @@ -46,7 +46,7 @@ func (m *NetstatCollector) Init(config []byte) error { return nil } -func (m *NetstatCollector) Read(interval time.Duration, out *[]lp.MutableMetric) { +func (m *NetstatCollector) Read(interval time.Duration, output chan lp.CCMetric) { data, err := ioutil.ReadFile(string(NETSTATFILE)) if err != nil { log.Print(err.Error()) @@ -73,9 +73,15 @@ func (m *NetstatCollector) Read(interval time.Duration, out *[]lp.MutableMetric) for i, name := range m.matches { v, err := strconv.ParseInt(f[i], 10, 0) if err == nil { - y, err := lp.New(name, tags, map[string]interface{}{"value": int(float64(v) * 1.0e-3)}, time.Now()) + y, err := lp.New(name, tags, m.meta, map[string]interface{}{"value": int(float64(v) * 1.0e-3)}, time.Now()) if err == nil { - *out = append(*out, y) + switch { + case strings.Contains(name, "byte"): + y.AddMeta("unit", "Byte") + case strings.Contains(name, "pkt"): + y.AddMeta("unit", "Packets") + } + output <- y } } } diff --git a/collectors/netstatMetric.md b/collectors/netstatMetric.md new file mode 100644 index 0000000..34a48fd --- /dev/null +++ 
b/collectors/netstatMetric.md @@ -0,0 +1,21 @@ + +## `netstat` collector + +```json + "netstat": { + "exclude_devices": [ + "lo" + ] + } +``` + +The `netstat` collector reads data from `/proc/net/dev` and outputs a handful **node** metrics. If a device is not required, it can be excluded from forwarding it to the sink. Commonly the `lo` device should be excluded. + +Metrics: +* `bytes_in` +* `bytes_out` +* `pkts_in` +* `pkts_out` + +The device name is added as tag `device`. + diff --git a/collectors/nfsMetric.go b/collectors/nfsMetric.go new file mode 100644 index 0000000..16a6d23 --- /dev/null +++ b/collectors/nfsMetric.go @@ -0,0 +1,147 @@ +package collectors + +import ( + "encoding/json" + "fmt" + "log" + + // "os" + "os/exec" + "strconv" + "strings" + "time" + + lp "github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric" +) + +type NfsCollectorData struct { + current int64 + last int64 +} + +type NfsCollector struct { + metricCollector + tags map[string]string + config struct { + Nfsutils string `json:"nfsutils"` + ExcludeMetrics []string `json:"exclude_metrics,omitempty"` + } + data map[string]map[string]NfsCollectorData +} + +func (m *NfsCollector) initStats() error { + cmd := exec.Command(m.config.Nfsutils, "-l") + cmd.Wait() + buffer, err := cmd.Output() + if err == nil { + for _, line := range strings.Split(string(buffer), "\n") { + lf := strings.Fields(line) + if len(lf) != 5 { + continue + } + if _, exist := m.data[lf[1]]; !exist { + m.data[lf[1]] = make(map[string]NfsCollectorData) + } + name := strings.Trim(lf[3], ":") + if _, exist := m.data[lf[1]][name]; !exist { + value, err := strconv.ParseInt(lf[4], 0, 64) + if err == nil { + x := m.data[lf[1]][name] + x.current = value + x.last = 0 + m.data[lf[1]][name] = x + } + } + } + } + return err +} + +func (m *NfsCollector) updateStats() error { + cmd := exec.Command(m.config.Nfsutils, "-l") + cmd.Wait() + buffer, err := cmd.Output() + if err == nil { + for _, line := range 
strings.Split(string(buffer), "\n") { + lf := strings.Fields(line) + if len(lf) != 5 { + continue + } + if _, exist := m.data[lf[1]]; !exist { + m.data[lf[1]] = make(map[string]NfsCollectorData) + } + name := strings.Trim(lf[3], ":") + if _, exist := m.data[lf[1]][name]; exist { + value, err := strconv.ParseInt(lf[4], 0, 64) + if err == nil { + x := m.data[lf[1]][name] + x.last = x.current + x.current = value + m.data[lf[1]][name] = x + } + } + } + } + return err +} + +func (m *NfsCollector) Init(config json.RawMessage) error { + var err error + m.name = "NfsCollector" + m.setup() + + // Set default mmpmon binary + m.config.Nfsutils = "/usr/sbin/nfsstat" + + // Read JSON configuration + if len(config) > 0 { + err = json.Unmarshal(config, &m.config) + if err != nil { + log.Print(err.Error()) + return err + } + } + m.meta = map[string]string{ + "source": m.name, + "group": "NFS", + } + m.tags = map[string]string{ + "type": "node", + } + // Check if mmpmon is in executable search path + _, err = exec.LookPath(m.config.Nfsutils) + if err != nil { + return fmt.Errorf("NfsCollector.Init(): Failed to find nfsstat binary '%s': %v", m.config.Nfsutils, err) + } + m.data = make(map[string]map[string]NfsCollectorData) + m.initStats() + m.init = true + return nil +} + +func (m *NfsCollector) Read(interval time.Duration, output chan lp.CCMetric) { + if !m.init { + return + } + timestamp := time.Now() + + m.updateStats() + + for version, metrics := range m.data { + for name, data := range metrics { + if _, skip := stringArrayContains(m.config.ExcludeMetrics, name); skip { + continue + } + value := data.current - data.last + y, err := lp.New(fmt.Sprintf("nfs_%s", name), m.tags, m.meta, map[string]interface{}{"value": value}, timestamp) + if err == nil { + y.AddMeta("version", version) + output <- y + } + } + } +} + +func (m *NfsCollector) Close() { + m.init = false +} diff --git a/collectors/nvidiaMetric.go b/collectors/nvidiaMetric.go index 31118c2..6f5141a 100644 --- 
a/collectors/nvidiaMetric.go +++ b/collectors/nvidiaMetric.go @@ -6,9 +6,8 @@ import ( "fmt" "log" "time" - + lp "github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric" "github.com/NVIDIA/go-nvml/pkg/nvml" - lp "github.com/influxdata/line-protocol" ) type NvidiaCollectorConfig struct { @@ -17,7 +16,7 @@ type NvidiaCollectorConfig struct { } type NvidiaCollector struct { - MetricCollector + metricCollector num_gpus int config NvidiaCollectorConfig } @@ -29,10 +28,11 @@ func (m *NvidiaCollector) CatchPanic() { } } -func (m *NvidiaCollector) Init(config []byte) error { +func (m *NvidiaCollector) Init(config json.RawMessage) error { var err error m.name = "NvidiaCollector" m.setup() + m.meta = map[string]string{"source": m.name, "group": "Nvidia"} if len(config) > 0 { err = json.Unmarshal(config, &m.config) if err != nil { @@ -55,7 +55,7 @@ func (m *NvidiaCollector) Init(config []byte) error { return nil } -func (m *NvidiaCollector) Read(interval time.Duration, out *[]lp.MutableMetric) { +func (m *NvidiaCollector) Read(interval time.Duration, output chan lp.CCMetric) { if !m.init { return } @@ -74,14 +74,14 @@ func (m *NvidiaCollector) Read(interval time.Duration, out *[]lp.MutableMetric) util, ret := nvml.DeviceGetUtilizationRates(device) if ret == nvml.SUCCESS { _, skip = stringArrayContains(m.config.ExcludeMetrics, "util") - y, err := lp.New("util", tags, map[string]interface{}{"value": float64(util.Gpu)}, time.Now()) + y, err := lp.New("util", tags, m.meta, map[string]interface{}{"value": float64(util.Gpu)}, time.Now()) if err == nil && !skip { - *out = append(*out, y) + output <- y } _, skip = stringArrayContains(m.config.ExcludeMetrics, "mem_util") - y, err = lp.New("mem_util", tags, map[string]interface{}{"value": float64(util.Memory)}, time.Now()) + y, err = lp.New("mem_util", tags, m.meta, map[string]interface{}{"value": float64(util.Memory)}, time.Now()) if err == nil && !skip { - *out = append(*out, y) + output <- y } } @@ -89,174 +89,177 @@ func 
(m *NvidiaCollector) Read(interval time.Duration, out *[]lp.MutableMetric) if ret == nvml.SUCCESS { t := float64(meminfo.Total) / (1024 * 1024) _, skip = stringArrayContains(m.config.ExcludeMetrics, "mem_total") - y, err := lp.New("mem_total", tags, map[string]interface{}{"value": t}, time.Now()) + y, err := lp.New("mem_total", tags, m.meta, map[string]interface{}{"value": t}, time.Now()) if err == nil && !skip { - *out = append(*out, y) + y.AddMeta("unit", "MByte") + output <- y } f := float64(meminfo.Used) / (1024 * 1024) _, skip = stringArrayContains(m.config.ExcludeMetrics, "fb_memory") - y, err = lp.New("fb_memory", tags, map[string]interface{}{"value": f}, time.Now()) + y, err = lp.New("fb_memory", tags, m.meta, map[string]interface{}{"value": f}, time.Now()) if err == nil && !skip { - *out = append(*out, y) + y.AddMeta("unit", "MByte") + output <- y } } temp, ret := nvml.DeviceGetTemperature(device, nvml.TEMPERATURE_GPU) if ret == nvml.SUCCESS { _, skip = stringArrayContains(m.config.ExcludeMetrics, "temp") - y, err := lp.New("temp", tags, map[string]interface{}{"value": float64(temp)}, time.Now()) + y, err := lp.New("temp", tags, m.meta, map[string]interface{}{"value": float64(temp)}, time.Now()) if err == nil && !skip { - *out = append(*out, y) + y.AddMeta("unit", "degC") + output <- y } } fan, ret := nvml.DeviceGetFanSpeed(device) if ret == nvml.SUCCESS { _, skip = stringArrayContains(m.config.ExcludeMetrics, "fan") - y, err := lp.New("fan", tags, map[string]interface{}{"value": float64(fan)}, time.Now()) + y, err := lp.New("fan", tags, m.meta, map[string]interface{}{"value": float64(fan)}, time.Now()) if err == nil && !skip { - *out = append(*out, y) + output <- y } } _, ecc_pend, ret := nvml.DeviceGetEccMode(device) if ret == nvml.SUCCESS { - var y lp.MutableMetric + var y lp.CCMetric var err error switch ecc_pend { case nvml.FEATURE_DISABLED: - y, err = lp.New("ecc_mode", tags, map[string]interface{}{"value": string("OFF")}, time.Now()) + y, err = 
lp.New("ecc_mode", tags, m.meta, map[string]interface{}{"value": string("OFF")}, time.Now()) case nvml.FEATURE_ENABLED: - y, err = lp.New("ecc_mode", tags, map[string]interface{}{"value": string("ON")}, time.Now()) + y, err = lp.New("ecc_mode", tags, m.meta, map[string]interface{}{"value": string("ON")}, time.Now()) default: - y, err = lp.New("ecc_mode", tags, map[string]interface{}{"value": string("UNKNOWN")}, time.Now()) + y, err = lp.New("ecc_mode", tags, m.meta, map[string]interface{}{"value": string("UNKNOWN")}, time.Now()) } _, skip = stringArrayContains(m.config.ExcludeMetrics, "ecc_mode") if err == nil && !skip { - *out = append(*out, y) + output <- y } } else if ret == nvml.ERROR_NOT_SUPPORTED { _, skip = stringArrayContains(m.config.ExcludeMetrics, "ecc_mode") - y, err := lp.New("ecc_mode", tags, map[string]interface{}{"value": string("N/A")}, time.Now()) + y, err := lp.New("ecc_mode", tags, m.meta, map[string]interface{}{"value": string("N/A")}, time.Now()) if err == nil && !skip { - *out = append(*out, y) + output <- y } } pstate, ret := nvml.DeviceGetPerformanceState(device) if ret == nvml.SUCCESS { _, skip = stringArrayContains(m.config.ExcludeMetrics, "perf_state") - y, err := lp.New("perf_state", tags, map[string]interface{}{"value": fmt.Sprintf("P%d", int(pstate))}, time.Now()) + y, err := lp.New("perf_state", tags, m.meta, map[string]interface{}{"value": fmt.Sprintf("P%d", int(pstate))}, time.Now()) if err == nil && !skip { - *out = append(*out, y) + output <- y } } power, ret := nvml.DeviceGetPowerUsage(device) if ret == nvml.SUCCESS { _, skip = stringArrayContains(m.config.ExcludeMetrics, "power_usage_report") - y, err := lp.New("power_usage_report", tags, map[string]interface{}{"value": float64(power) / 1000}, time.Now()) + y, err := lp.New("power_usage_report", tags, m.meta, map[string]interface{}{"value": float64(power) / 1000}, time.Now()) if err == nil && !skip { - *out = append(*out, y) + output <- y } } gclk, ret := 
nvml.DeviceGetClockInfo(device, nvml.CLOCK_GRAPHICS) if ret == nvml.SUCCESS { _, skip = stringArrayContains(m.config.ExcludeMetrics, "graphics_clock_report") - y, err := lp.New("graphics_clock_report", tags, map[string]interface{}{"value": float64(gclk)}, time.Now()) + y, err := lp.New("graphics_clock_report", tags, m.meta, map[string]interface{}{"value": float64(gclk)}, time.Now()) if err == nil && !skip { - *out = append(*out, y) + output <- y } } smclk, ret := nvml.DeviceGetClockInfo(device, nvml.CLOCK_SM) if ret == nvml.SUCCESS { _, skip = stringArrayContains(m.config.ExcludeMetrics, "sm_clock_report") - y, err := lp.New("sm_clock_report", tags, map[string]interface{}{"value": float64(smclk)}, time.Now()) + y, err := lp.New("sm_clock_report", tags, m.meta, map[string]interface{}{"value": float64(smclk)}, time.Now()) if err == nil && !skip { - *out = append(*out, y) + output <- y } } memclk, ret := nvml.DeviceGetClockInfo(device, nvml.CLOCK_MEM) if ret == nvml.SUCCESS { _, skip = stringArrayContains(m.config.ExcludeMetrics, "mem_clock_report") - y, err := lp.New("mem_clock_report", tags, map[string]interface{}{"value": float64(memclk)}, time.Now()) + y, err := lp.New("mem_clock_report", tags, m.meta, map[string]interface{}{"value": float64(memclk)}, time.Now()) if err == nil && !skip { - *out = append(*out, y) + output <- y } } max_gclk, ret := nvml.DeviceGetMaxClockInfo(device, nvml.CLOCK_GRAPHICS) if ret == nvml.SUCCESS { _, skip = stringArrayContains(m.config.ExcludeMetrics, "max_graphics_clock") - y, err := lp.New("max_graphics_clock", tags, map[string]interface{}{"value": float64(max_gclk)}, time.Now()) + y, err := lp.New("max_graphics_clock", tags, m.meta, map[string]interface{}{"value": float64(max_gclk)}, time.Now()) if err == nil && !skip { - *out = append(*out, y) + output <- y } } max_smclk, ret := nvml.DeviceGetClockInfo(device, nvml.CLOCK_SM) if ret == nvml.SUCCESS { _, skip = stringArrayContains(m.config.ExcludeMetrics, "max_sm_clock") - y, err := 
lp.New("max_sm_clock", tags, map[string]interface{}{"value": float64(max_smclk)}, time.Now()) + y, err := lp.New("max_sm_clock", tags, m.meta, map[string]interface{}{"value": float64(max_smclk)}, time.Now()) if err == nil && !skip { - *out = append(*out, y) + output <- y } } max_memclk, ret := nvml.DeviceGetClockInfo(device, nvml.CLOCK_MEM) if ret == nvml.SUCCESS { _, skip = stringArrayContains(m.config.ExcludeMetrics, "max_mem_clock") - y, err := lp.New("max_mem_clock", tags, map[string]interface{}{"value": float64(max_memclk)}, time.Now()) + y, err := lp.New("max_mem_clock", tags, m.meta, map[string]interface{}{"value": float64(max_memclk)}, time.Now()) if err == nil && !skip { - *out = append(*out, y) + output <- y } } ecc_db, ret := nvml.DeviceGetTotalEccErrors(device, 1, 1) if ret == nvml.SUCCESS { _, skip = stringArrayContains(m.config.ExcludeMetrics, "ecc_db_error") - y, err := lp.New("ecc_db_error", tags, map[string]interface{}{"value": float64(ecc_db)}, time.Now()) + y, err := lp.New("ecc_db_error", tags, m.meta, map[string]interface{}{"value": float64(ecc_db)}, time.Now()) if err == nil && !skip { - *out = append(*out, y) + output <- y } } ecc_sb, ret := nvml.DeviceGetTotalEccErrors(device, 0, 1) if ret == nvml.SUCCESS { _, skip = stringArrayContains(m.config.ExcludeMetrics, "ecc_sb_error") - y, err := lp.New("ecc_sb_error", tags, map[string]interface{}{"value": float64(ecc_sb)}, time.Now()) + y, err := lp.New("ecc_sb_error", tags, m.meta, map[string]interface{}{"value": float64(ecc_sb)}, time.Now()) if err == nil && !skip { - *out = append(*out, y) + output <- y } } pwr_limit, ret := nvml.DeviceGetPowerManagementLimit(device) if ret == nvml.SUCCESS { _, skip = stringArrayContains(m.config.ExcludeMetrics, "power_man_limit") - y, err := lp.New("power_man_limit", tags, map[string]interface{}{"value": float64(pwr_limit)}, time.Now()) + y, err := lp.New("power_man_limit", tags, m.meta, map[string]interface{}{"value": float64(pwr_limit)}, time.Now()) if err == 
nil && !skip { - *out = append(*out, y) + output <- y } } enc_util, _, ret := nvml.DeviceGetEncoderUtilization(device) if ret == nvml.SUCCESS { _, skip = stringArrayContains(m.config.ExcludeMetrics, "encoder_util") - y, err := lp.New("encoder_util", tags, map[string]interface{}{"value": float64(enc_util)}, time.Now()) + y, err := lp.New("encoder_util", tags, m.meta, map[string]interface{}{"value": float64(enc_util)}, time.Now()) if err == nil && !skip { - *out = append(*out, y) + output <- y } } dec_util, _, ret := nvml.DeviceGetDecoderUtilization(device) if ret == nvml.SUCCESS { _, skip = stringArrayContains(m.config.ExcludeMetrics, "decoder_util") - y, err := lp.New("decoder_util", tags, map[string]interface{}{"value": float64(dec_util)}, time.Now()) + y, err := lp.New("decoder_util", tags, m.meta, map[string]interface{}{"value": float64(dec_util)}, time.Now()) if err == nil && !skip { - *out = append(*out, y) + output <- y } } } diff --git a/collectors/nvidiaMetric.md b/collectors/nvidiaMetric.md new file mode 100644 index 0000000..c774139 --- /dev/null +++ b/collectors/nvidiaMetric.md @@ -0,0 +1,40 @@ + +## `nvidia` collector + +```json + "lustrestat": { + "exclude_devices" : [ + "0","1" + ], + "exclude_metrics": [ + "fb_memory", + "fan" + ] + } +``` + +Metrics: +* `util` +* `mem_util` +* `mem_total` +* `fb_memory` +* `temp` +* `fan` +* `ecc_mode` +* `perf_state` +* `power_usage_report` +* `graphics_clock_report` +* `sm_clock_report` +* `mem_clock_report` +* `max_graphics_clock` +* `max_sm_clock` +* `max_mem_clock` +* `ecc_db_error` +* `ecc_sb_error` +* `power_man_limit` +* `encoder_util` +* `decoder_util` + +It uses a separate `type` in the metrics. 
The output metric looks like this: +`,type=accelerator,type-id= value= ` + diff --git a/collectors/tempMetric.go b/collectors/tempMetric.go index b074d78..b73d582 100644 --- a/collectors/tempMetric.go +++ b/collectors/tempMetric.go @@ -4,13 +4,13 @@ import ( "encoding/json" "fmt" "io/ioutil" + "log" "os" "path/filepath" "strconv" "strings" "time" - - lp "github.com/influxdata/line-protocol" + lp "github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric" ) const HWMON_PATH = `/sys/class/hwmon` @@ -21,20 +21,21 @@ type TempCollectorConfig struct { } type TempCollector struct { - MetricCollector + metricCollector config TempCollectorConfig } -func (m *TempCollector) Init(config []byte) error { +func (m *TempCollector) Init(config json.RawMessage) error { m.name = "TempCollector" m.setup() - m.init = true + m.meta = map[string]string{"source": m.name, "group": "IPMI", "unit": "degC"} if len(config) > 0 { err := json.Unmarshal(config, &m.config) if err != nil { return err } } + m.init = true return nil } @@ -74,7 +75,7 @@ func get_hwmon_sensors() (map[string]map[string]string, error) { return sensors, nil } -func (m *TempCollector) Read(interval time.Duration, out *[]lp.MutableMetric) { +func (m *TempCollector) Read(interval time.Duration, output chan lp.CCMetric) { sensors, err := get_hwmon_sensors() if err != nil { @@ -89,15 +90,20 @@ func (m *TempCollector) Read(interval time.Duration, out *[]lp.MutableMetric) { break } } + mname := strings.Replace(name, " ", "_", -1) + if !strings.Contains(mname, "temp") { + mname = fmt.Sprintf("temp_%s", mname) + } buffer, err := ioutil.ReadFile(string(file)) if err != nil { continue } x, err := strconv.ParseInt(strings.Replace(string(buffer), "\n", "", -1), 0, 64) if err == nil { - y, err := lp.New(strings.ToLower(name), tags, map[string]interface{}{"value": float64(x) / 1000}, time.Now()) + y, err := lp.New(strings.ToLower(mname), tags, m.meta, map[string]interface{}{"value": int(float64(x) / 1000)}, time.Now()) if err == 
nil { - *out = append(*out, y) + log.Print("[", m.name, "] ", y) + output <- y } } } diff --git a/collectors/tempMetric.md b/collectors/tempMetric.md new file mode 100644 index 0000000..1e3d979 --- /dev/null +++ b/collectors/tempMetric.md @@ -0,0 +1,22 @@ + +## `tempstat` collector + +```json + "tempstat": { + "tag_override" : { + "" : { + "type" : "socket", + "type-id" : "0" + } + }, + "exclude_metrics": [ + "metric1", + "metric2" + ] + } +``` + +The `tempstat` collector reads the data from `/sys/class/hwmon//tempX_{input,label}` + +Metrics: +* `temp_*`: The metric name is taken from the `label` files. diff --git a/collectors/topprocsMetric.go b/collectors/topprocsMetric.go index 715b8c3..d2691dc 100644 --- a/collectors/topprocsMetric.go +++ b/collectors/topprocsMetric.go @@ -8,8 +8,7 @@ import ( "os/exec" "strings" "time" - - lp "github.com/influxdata/line-protocol" + lp "github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric" ) const MAX_NUM_PROCS = 10 @@ -20,15 +19,16 @@ type TopProcsCollectorConfig struct { } type TopProcsCollector struct { - MetricCollector + metricCollector tags map[string]string config TopProcsCollectorConfig } -func (m *TopProcsCollector) Init(config []byte) error { +func (m *TopProcsCollector) Init(config json.RawMessage) error { var err error m.name = "TopProcsCollector" m.tags = map[string]string{"type": "node"} + m.meta = map[string]string{"source": m.name, "group": "TopProcs"} if len(config) > 0 { err = json.Unmarshal(config, &m.config) if err != nil { @@ -51,7 +51,7 @@ func (m *TopProcsCollector) Init(config []byte) error { return nil } -func (m *TopProcsCollector) Read(interval time.Duration, out *[]lp.MutableMetric) { +func (m *TopProcsCollector) Read(interval time.Duration, output chan lp.CCMetric) { if !m.init { return } @@ -66,9 +66,9 @@ func (m *TopProcsCollector) Read(interval time.Duration, out *[]lp.MutableMetric lines := strings.Split(string(stdout), "\n") for i := 1; i < m.config.Num_procs+1; i++ { name := 
fmt.Sprintf("topproc%d", i) - y, err := lp.New(name, m.tags, map[string]interface{}{"value": string(lines[i])}, time.Now()) + y, err := lp.New(name, m.tags, m.meta, map[string]interface{}{"value": string(lines[i])}, time.Now()) if err == nil { - *out = append(*out, y) + output <- y } } } diff --git a/collectors/topprocsMetric.md b/collectors/topprocsMetric.md new file mode 100644 index 0000000..ca47582 --- /dev/null +++ b/collectors/topprocsMetric.md @@ -0,0 +1,15 @@ + +## `topprocs` collector + +```json + "topprocs": { + "num_procs": 5 + } +``` + +The `topprocs` collector reads the TopX processes (sorted by CPU utilization, `ps -Ao comm --sort=-pcpu`). + +In contrast to most other collectors, the metric value is a `string`. + + + diff --git a/config.json b/config.json index 4a7fd89..52f9df1 100644 --- a/config.json +++ b/config.json @@ -1,36 +1,8 @@ { - "sink": { - "user": "testuser", - "password": "testpass", - "host": "127.0.0.1", - "port": "9090", - "database": "testdb", - "organization": "testorg", - "type": "stdout" - }, - "interval": 3, - "duration": 1, - "collectors": [ - "tempstat" - ], - "default_tags": { - "cluster": "testcluster" - }, - "receiver": { - "type": "none" - }, - "collect_config": { - "tempstat": { - "tag_override": { - "hwmon0" : { - "type" : "socket", - "type-id" : "0" - }, - "hwmon1" : { - "type" : "socket", - "type-id" : "1" - } - } - } - } + "sinks": "sinks.json", + "collectors" : "collectors.json", + "receivers" : "receivers.json", + "router" : "router.json", + "interval": 10, + "duration": 1 } diff --git a/go.mod b/go.mod index 903ea80..be384b6 100644 --- a/go.mod +++ b/go.mod @@ -3,10 +3,11 @@ module github.com/ClusterCockpit/cc-metric-collector go 1.16 require ( - github.com/NVIDIA/go-nvml v0.11.1-0 // indirect + github.com/NVIDIA/go-nvml v0.11.1-0 github.com/influxdata/influxdb-client-go/v2 v2.2.2 github.com/influxdata/line-protocol v0.0.0-20210311194329-9aa0e372d097 github.com/nats-io/nats.go v1.10.0 github.com/nats-io/nkeys v0.1.4 
// indirect github.com/prometheus/client_golang v1.10.0 // indirect + gopkg.in/Knetic/govaluate.v2 v2.3.0 ) diff --git a/go.sum b/go.sum index 4bd7c8a..a6f98d7 100644 --- a/go.sum +++ b/go.sum @@ -421,6 +421,8 @@ google.golang.org/protobuf v1.20.1-0.20200309200217-e05f789c0967/go.mod h1:A+miE google.golang.org/protobuf v1.21.0/go.mod h1:47Nbq4nVaFHyn7ilMalzfO3qCViNmqZ2kzikPIcrTAo= google.golang.org/protobuf v1.23.0 h1:4MY060fB1DLGMB/7MBTLnwQUY6+F09GEiz6SsrNqyzM= google.golang.org/protobuf v1.23.0/go.mod h1:EGpADcykh3NcUnDUJcl1+ZksZNG86OlYog2l/sGQquU= +gopkg.in/Knetic/govaluate.v2 v2.3.0 h1:naJVc9CZlWA8rC8f5mvECJD7jreTrn7FvGXjBthkHJQ= +gopkg.in/Knetic/govaluate.v2 v2.3.0/go.mod h1:NW0gr10J8s7aNghEg6uhdxiEaBvc0+8VgJjVViHUKp4= gopkg.in/alecthomas/kingpin.v2 v2.2.6/go.mod h1:FMv+mEhP44yOT+4EoQTLFTRgOQ1FBLkstjWtayDeSgw= gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= gopkg.in/check.v1 v1.0.0-20180628173108-788fd7840127/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= diff --git a/internal/ccMetric/README.md b/internal/ccMetric/README.md new file mode 100644 index 0000000..1787ff0 --- /dev/null +++ b/internal/ccMetric/README.md @@ -0,0 +1,32 @@ +# ClusterCockpit metrics + +As described in the [ClusterCockpit specifications](https://github.com/ClusterCockpit/cc-specifications), the whole ClusterCockpit stack uses metrics in the InfluxDB line protocol format. This is also the input and output format for the ClusterCockpit Metric Collector but internally it uses an extended format while processing, named CCMetric. + +It is basically a copy of the [InfluxDB line protocol](https://github.com/influxdata/line-protocol) `MutableMetric` interface with one extension. 
Besides the tags and fields, it contains a list of meta information (re-using the `Tag` structure of the original protocol): + +```golang +type ccMetric struct { + name string // same as + tags []*influx.Tag // original + fields []*influx.Field // Influx + tm time.Time // line-protocol + meta []*influx.Tag +} + +type CCMetric interface { + influx.MutableMetric // the same functions as defined by influx.MutableMetric + RemoveTag(key string) // this is not published by the original influx.MutableMetric + Meta() map[string]string + MetaList() []*inlux.Tag + AddMeta(key, value string) + HasMeta(key string) bool + GetMeta(key string) (string, bool) + RemoveMeta(key string) +} +``` + +The `CCMetric` interface provides the same functions as the `MutableMetric` like `{Add, Remove, Has}{Tag, Field}` and additionally provides `{Add, Remove, Has}Meta`. + +The InfluxDB protocol creates a new metric with `influx.New(name, tags, fields, time)` while CCMetric uses `ccMetric.New(name, tags, meta, fields, time)` where `tags` and `meta` are both of type `map[string]string`. + +You can copy a CCMetric with `FromMetric(other CCMetric) CCMetric`. If you get an `influx.Metric` from a function, like the line protocol parser, you can use `FromInfluxMetric(other influx.Metric) CCMetric` to get a CCMetric out of it (see `NatsReceiver` for an example). diff --git a/internal/ccMetric/ccMetric.go b/internal/ccMetric/ccMetric.go new file mode 100644 index 0000000..6b6bda9 --- /dev/null +++ b/internal/ccMetric/ccMetric.go @@ -0,0 +1,374 @@ +package ccmetric + +import ( + "fmt" + lp "github.com/influxdata/line-protocol" // MIT license + "sort" + "time" +) + +// Most functions are derived from github.com/influxdata/line-protocol/metric.go +// The metric type is extended with an extra meta information list re-using the Tag +// type. 
+ +type ccMetric struct { + name string + tags []*lp.Tag + fields []*lp.Field + tm time.Time + meta []*lp.Tag +} + +type CCMetric interface { + lp.MutableMetric + AddMeta(key, value string) + MetaList() []*lp.Tag + RemoveTag(key string) +} + +func (m *ccMetric) Meta() map[string]string { + meta := make(map[string]string, len(m.meta)) + for _, m := range m.meta { + meta[m.Key] = m.Value + } + return meta +} + +func (m *ccMetric) MetaList() []*lp.Tag { + return m.meta +} + +func (m *ccMetric) String() string { + return fmt.Sprintf("%s %v %v %v %d", m.name, m.Tags(), m.Meta(), m.Fields(), m.tm.UnixNano()) +} + +func (m *ccMetric) Name() string { + return m.name +} + +func (m *ccMetric) Tags() map[string]string { + tags := make(map[string]string, len(m.tags)) + for _, tag := range m.tags { + tags[tag.Key] = tag.Value + } + return tags +} + +func (m *ccMetric) TagList() []*lp.Tag { + return m.tags +} + +func (m *ccMetric) Fields() map[string]interface{} { + fields := make(map[string]interface{}, len(m.fields)) + for _, field := range m.fields { + fields[field.Key] = field.Value + } + + return fields +} + +func (m *ccMetric) FieldList() []*lp.Field { + return m.fields +} + +func (m *ccMetric) Time() time.Time { + return m.tm +} + +func (m *ccMetric) SetTime(t time.Time) { + m.tm = t +} + +func (m *ccMetric) HasTag(key string) bool { + for _, tag := range m.tags { + if tag.Key == key { + return true + } + } + return false +} + +func (m *ccMetric) GetTag(key string) (string, bool) { + for _, tag := range m.tags { + if tag.Key == key { + return tag.Value, true + } + } + return "", false +} + +func (m *ccMetric) RemoveTag(key string) { + for i, tag := range m.tags { + if tag.Key == key { + copy(m.tags[i:], m.tags[i+1:]) + m.tags[len(m.tags)-1] = nil + m.tags = m.tags[:len(m.tags)-1] + return + } + } +} + +func (m *ccMetric) AddTag(key, value string) { + for i, tag := range m.tags { + if key > tag.Key { + continue + } + + if key == tag.Key { + tag.Value = value + return + } + 
+ m.tags = append(m.tags, nil) + copy(m.tags[i+1:], m.tags[i:]) + m.tags[i] = &lp.Tag{Key: key, Value: value} + return + } + + m.tags = append(m.tags, &lp.Tag{Key: key, Value: value}) +} + +func (m *ccMetric) HasMeta(key string) bool { + for _, tag := range m.meta { + if tag.Key == key { + return true + } + } + return false +} + +func (m *ccMetric) GetMeta(key string) (string, bool) { + for _, tag := range m.meta { + if tag.Key == key { + return tag.Value, true + } + } + return "", false +} + +func (m *ccMetric) RemoveMeta(key string) { + for i, tag := range m.meta { + if tag.Key == key { + copy(m.meta[i:], m.meta[i+1:]) + m.meta[len(m.meta)-1] = nil + m.meta = m.meta[:len(m.meta)-1] + return + } + } +} + +func (m *ccMetric) AddMeta(key, value string) { + for i, tag := range m.meta { + if key > tag.Key { + continue + } + + if key == tag.Key { + tag.Value = value + return + } + + m.meta = append(m.meta, nil) + copy(m.meta[i+1:], m.meta[i:]) + m.meta[i] = &lp.Tag{Key: key, Value: value} + return + } + + m.meta = append(m.meta, &lp.Tag{Key: key, Value: value}) +} + +func (m *ccMetric) AddField(key string, value interface{}) { + for i, field := range m.fields { + if key == field.Key { + m.fields[i] = &lp.Field{Key: key, Value: convertField(value)} + return + } + } + m.fields = append(m.fields, &lp.Field{Key: key, Value: convertField(value)}) +} + +func New( + name string, + tags map[string]string, + meta map[string]string, + fields map[string]interface{}, + tm time.Time, +) (CCMetric, error) { + m := &ccMetric{ + name: name, + tags: nil, + fields: nil, + tm: tm, + meta: nil, + } + + if len(tags) > 0 { + m.tags = make([]*lp.Tag, 0, len(tags)) + for k, v := range tags { + m.tags = append(m.tags, + &lp.Tag{Key: k, Value: v}) + } + sort.Slice(m.tags, func(i, j int) bool { return m.tags[i].Key < m.tags[j].Key }) + } + + if len(meta) > 0 { + m.meta = make([]*lp.Tag, 0, len(meta)) + for k, v := range meta { + m.meta = append(m.meta, + &lp.Tag{Key: k, Value: v}) + } + 
sort.Slice(m.meta, func(i, j int) bool { return m.meta[i].Key < m.meta[j].Key }) + } + + if len(fields) > 0 { + m.fields = make([]*lp.Field, 0, len(fields)) + for k, v := range fields { + v := convertField(v) + if v == nil { + continue + } + m.AddField(k, v) + } + } + + return m, nil +} + +func FromMetric(other CCMetric) CCMetric { + m := &ccMetric{ + name: other.Name(), + tags: make([]*lp.Tag, len(other.TagList())), + fields: make([]*lp.Field, len(other.FieldList())), + meta: make([]*lp.Tag, len(other.MetaList())), + tm: other.Time(), + } + + for i, tag := range other.TagList() { + m.tags[i] = &lp.Tag{Key: tag.Key, Value: tag.Value} + } + for i, s := range other.MetaList() { + m.meta[i] = &lp.Tag{Key: s.Key, Value: s.Value} + } + + for i, field := range other.FieldList() { + m.fields[i] = &lp.Field{Key: field.Key, Value: field.Value} + } + return m +} + +func FromInfluxMetric(other lp.Metric) CCMetric { + m := &ccMetric{ + name: other.Name(), + tags: make([]*lp.Tag, len(other.TagList())), + fields: make([]*lp.Field, len(other.FieldList())), + meta: make([]*lp.Tag, 0), + tm: other.Time(), + } + + for i, tag := range other.TagList() { + m.tags[i] = &lp.Tag{Key: tag.Key, Value: tag.Value} + } + + for i, field := range other.FieldList() { + m.fields[i] = &lp.Field{Key: field.Key, Value: field.Value} + } + return m +} + +func convertField(v interface{}) interface{} { + switch v := v.(type) { + case float64: + return v + case int64: + return v + case string: + return v + case bool: + return v + case int: + return int64(v) + case uint: + return uint64(v) + case uint64: + return uint64(v) + case []byte: + return string(v) + case int32: + return int64(v) + case int16: + return int64(v) + case int8: + return int64(v) + case uint32: + return uint64(v) + case uint16: + return uint64(v) + case uint8: + return uint64(v) + case float32: + return float64(v) + case *float64: + if v != nil { + return *v + } + case *int64: + if v != nil { + return *v + } + case *string: + if v != 
nil { + return *v + } + case *bool: + if v != nil { + return *v + } + case *int: + if v != nil { + return int64(*v) + } + case *uint: + if v != nil { + return uint64(*v) + } + case *uint64: + if v != nil { + return uint64(*v) + } + case *[]byte: + if v != nil { + return string(*v) + } + case *int32: + if v != nil { + return int64(*v) + } + case *int16: + if v != nil { + return int64(*v) + } + case *int8: + if v != nil { + return int64(*v) + } + case *uint32: + if v != nil { + return uint64(*v) + } + case *uint16: + if v != nil { + return uint64(*v) + } + case *uint8: + if v != nil { + return uint64(*v) + } + case *float32: + if v != nil { + return float64(*v) + } + default: + return nil + } + return nil +} diff --git a/internal/metricRouter/README.md b/internal/metricRouter/README.md new file mode 100644 index 0000000..a3aef16 --- /dev/null +++ b/internal/metricRouter/README.md @@ -0,0 +1,50 @@ +# CC Metric Router + +The CCMetric router sits in between the collectors and the sinks and can be used to add and remove tags to/from traversing [CCMetrics](../ccMetric/README.md). + +# Configuration + +```json +{ + "add_tags" : [ + { + "key" : "cluster", + "value" : "testcluster", + "if" : "*" + }, + { + "key" : "test", + "value" : "testing", + "if" : "name == 'temp_package_id_0'" + } + ], + "delete_tags" : [ + { + "key" : "unit", + "value" : "*", + "if" : "*" + } + ], + "interval_timestamp" : true +} +``` + +There are three main options `add_tags`, `delete_tags` and `interval_timestamp`. `add_tags` and `delete_tags` are lists consisting of dicts with `key`, `value` and `if`. The `value` can be omitted in the `delete_tags` part as it only uses the `key` for removal. The `interval_timestamp` setting means that a unique timestamp is applied to all metrics traversing the router during an interval. 
+ +# Conditional manipulation of tags + +The `if` setting allows conditional testing of a single metric like in the example: + +```json +{ + "key" : "test", + "value" : "testing", + "if" : "name == 'temp_package_id_0'" +} +``` + +If the CCMetric name is equal to 'temp_package_id_0', it adds an additional tag `test=testing` to the metric. + +In order to match all metrics, you can use `*`, so in order to add a flag per default, like the `cluster=testcluster` tag in the example. + + diff --git a/internal/metricRouter/metricRouter.go b/internal/metricRouter/metricRouter.go new file mode 100644 index 0000000..25b0dc2 --- /dev/null +++ b/internal/metricRouter/metricRouter.go @@ -0,0 +1,208 @@ +package metricRouter + +import ( + "encoding/json" + "log" + "os" + "sync" + "time" + + lp "github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric" + mct "github.com/ClusterCockpit/cc-metric-collector/internal/multiChanTicker" + "gopkg.in/Knetic/govaluate.v2" +) + +type metricRouterTagConfig struct { + Key string `json:"key"` + Value string `json:"value"` + Condition string `json:"if"` +} + +type metricRouterConfig struct { + AddTags []metricRouterTagConfig `json:"add_tags"` + DelTags []metricRouterTagConfig `json:"delete_tags"` + IntervalStamp bool `json:"interval_timestamp"` +} + +type metricRouter struct { + inputs []chan lp.CCMetric + outputs []chan lp.CCMetric + done chan bool + wg *sync.WaitGroup + timestamp time.Time + ticker mct.MultiChanTicker + config metricRouterConfig +} + +type MetricRouter interface { + Init(ticker mct.MultiChanTicker, wg *sync.WaitGroup, routerConfigFile string) error + AddInput(input chan lp.CCMetric) + AddOutput(output chan lp.CCMetric) + Start() + Close() +} + +func (r *metricRouter) Init(ticker mct.MultiChanTicker, wg *sync.WaitGroup, routerConfigFile string) error { + r.inputs = make([]chan lp.CCMetric, 0) + r.outputs = make([]chan lp.CCMetric, 0) + r.done = make(chan bool) + r.wg = wg + r.ticker = ticker + configFile, err := 
os.Open(routerConfigFile) + if err != nil { + log.Print(err.Error()) + return err + } + defer configFile.Close() + jsonParser := json.NewDecoder(configFile) + err = jsonParser.Decode(&r.config) + if err != nil { + log.Print(err.Error()) + return err + } + return nil +} + +func (r *metricRouter) StartTimer() { + m := make(chan time.Time) + r.ticker.AddChannel(m) + go func() { + for { + select { + case t := <-m: + r.timestamp = t + } + } + }() +} + +func (r *metricRouter) EvalCondition(Cond string, point lp.CCMetric) (bool, error) { + expression, err := govaluate.NewEvaluableExpression(Cond) + if err != nil { + log.Print(Cond, " = ", err.Error()) + return false, err + } + params := make(map[string]interface{}) + params["name"] = point.Name() + for _, t := range point.TagList() { + params[t.Key] = t.Value + } + for _, m := range point.MetaList() { + params[m.Key] = m.Value + } + for _, f := range point.FieldList() { + params[f.Key] = f.Value + } + params["timestamp"] = point.Time() + + result, err := expression.Evaluate(params) + if err != nil { + log.Print(Cond, " = ", err.Error()) + return false, err + } + return bool(result.(bool)), err +} + +func (r *metricRouter) DoAddTags(point lp.CCMetric) { + for _, m := range r.config.AddTags { + var conditionMatches bool + + if m.Condition == "*" { + conditionMatches = true + } else { + var err error + conditionMatches, err = r.EvalCondition(m.Condition, point) + if err != nil { + log.Print(err.Error()) + conditionMatches = false + } + } + if conditionMatches { + point.AddTag(m.Key, m.Value) + } + } +} + +func (r *metricRouter) DoDelTags(point lp.CCMetric) { + for _, m := range r.config.DelTags { + var conditionMatches bool + + if m.Condition == "*" { + conditionMatches = true + } else { + var err error + conditionMatches, err = r.EvalCondition(m.Condition, point) + if err != nil { + log.Print(err.Error()) + conditionMatches = false + } + } + if conditionMatches { + point.RemoveTag(m.Key) + } + } +} + +func (r *metricRouter) 
Start() { + r.wg.Add(1) + r.timestamp = time.Now() + if r.config.IntervalStamp { + r.StartTimer() + } + go func() { + for { + RouterLoop: + select { + case <-r.done: + log.Print("[MetricRouter] DONE\n") + r.wg.Done() + break RouterLoop + default: + for _, c := range r.inputs { + RouterInputLoop: + select { + case <-r.done: + log.Print("[MetricRouter] DONE\n") + r.wg.Done() + break RouterInputLoop + case p := <-c: + log.Print("[MetricRouter] FORWARD ", p) + r.DoAddTags(p) + r.DoDelTags(p) + if r.config.IntervalStamp { + p.SetTime(r.timestamp) + } + for _, o := range r.outputs { + o <- p + } + default: + } + } + } + } + log.Print("[MetricRouter] EXIT\n") + }() + log.Print("[MetricRouter] STARTED\n") +} + +func (r *metricRouter) AddInput(input chan lp.CCMetric) { + r.inputs = append(r.inputs, input) +} + +func (r *metricRouter) AddOutput(output chan lp.CCMetric) { + r.outputs = append(r.outputs, output) +} + +func (r *metricRouter) Close() { + r.done <- true + log.Print("[MetricRouter] CLOSE\n") +} + +func New(ticker mct.MultiChanTicker, wg *sync.WaitGroup, routerConfigFile string) (MetricRouter, error) { + r := new(metricRouter) + err := r.Init(ticker, wg, routerConfigFile) + if err != nil { + return nil, err + } + return r, err +} diff --git a/internal/multiChanTicker/README.md b/internal/multiChanTicker/README.md new file mode 100644 index 0000000..30deb4f --- /dev/null +++ b/internal/multiChanTicker/README.md @@ -0,0 +1,37 @@ +# MultiChanTicker + +The idea of this ticker is to multiply the output channels. The original Golang `time.Ticker` provides only a single output channel, so the signal can only be received by a single other class. This ticker allows to add multiple channels which get all notified about the time tick. 
+ +```golang +type MultiChanTicker interface { + Init(duration time.Duration) + AddChannel(chan time.Time) +} +``` + +The MultiChanTicker is created similarly to the common `time.Ticker`: + +```golang +NewTicker(duration time.Duration) MultiChanTicker +``` + +Afterwards, you can add channels: + +```golang +t := MultiChanTicker(duration) +c1 := make(chan time.Time) +c2 := make(chan time.Time) +t.AddChannel(c1) +t.AddChannel(c2) + +for { + select { + case t1 := <- c1: + log.Print(t1) + case t2 := <- c2: + log.Print(t2) + } +} +``` + +The result should be the same `time.Time` output in both channels, notified "simultaneously". diff --git a/internal/multiChanTicker/multiChanTicker.go b/internal/multiChanTicker/multiChanTicker.go new file mode 100644 index 0000000..f8139fa --- /dev/null +++ b/internal/multiChanTicker/multiChanTicker.go @@ -0,0 +1,39 @@ +package multiChanTicker + +import ( + "time" +) + +type multiChanTicker struct { + ticker *time.Ticker + channels []chan time.Time +} + +type MultiChanTicker interface { + Init(duration time.Duration) + AddChannel(chan time.Time) +} + +func (t *multiChanTicker) Init(duration time.Duration) { + t.ticker = time.NewTicker(duration) + go func() { + for { + select { + case ts := <-t.ticker.C: + for _, c := range t.channels { + c <- ts + } + } + } + }() +} + +func (t *multiChanTicker) AddChannel(channel chan time.Time) { + t.channels = append(t.channels, channel) +} + +func NewTicker(duration time.Duration) MultiChanTicker { + t := &multiChanTicker{} + t.Init(duration) + return t +} diff --git a/metric-collector.go b/metric-collector.go index 02a2b21..c071933 100644 --- a/metric-collector.go +++ b/metric-collector.go @@ -8,60 +8,32 @@ import ( "os" "os/signal" "strings" - "sync" - "time" "github.com/ClusterCockpit/cc-metric-collector/collectors" "github.com/ClusterCockpit/cc-metric-collector/receivers" "github.com/ClusterCockpit/cc-metric-collector/sinks" - lp "github.com/influxdata/line-protocol" + + // "strings" + "sync" + 
"time" + + + lp "github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric" + mr "github.com/ClusterCockpit/cc-metric-collector/internal/metricRouter" + mct "github.com/ClusterCockpit/cc-metric-collector/internal/multiChanTicker" ) -// List of provided collectors. Which collector should be run can be -// configured at 'collectors' list in 'config.json'. -var Collectors = map[string]collectors.MetricGetter{ - "likwid": &collectors.LikwidCollector{}, - "loadavg": &collectors.LoadavgCollector{}, - "memstat": &collectors.MemstatCollector{}, - "netstat": &collectors.NetstatCollector{}, - "ibstat": &collectors.InfinibandCollector{}, - "lustrestat": &collectors.LustreCollector{}, - "cpustat": &collectors.CpustatCollector{}, - "topprocs": &collectors.TopProcsCollector{}, - "nvidia": &collectors.NvidiaCollector{}, - "customcmd": &collectors.CustomCmdCollector{}, - "diskstat": &collectors.DiskstatCollector{}, - "tempstat": &collectors.TempCollector{}, - "ipmistat": &collectors.IpmiCollector{}, - "gpfs": new(collectors.GpfsCollector), - "cpufreq": new(collectors.CPUFreqCollector), - "cpufreq_cpuinfo": new(collectors.CPUFreqCpuInfoCollector), +type CentralConfigFile struct { + Interval int `json:"interval"` + Duration int `json:"duration"` + Pidfile string `json:"pidfile,omitempty"` + CollectorConfigFile string `json:"collectors"` + RouterConfigFile string `json:"router"` + SinkConfigFile string `json:"sinks"` + ReceiverConfigFile string `json:"receivers,omitempty"` } -var Sinks = map[string]sinks.SinkFuncs{ - "influxdb": &sinks.InfluxSink{}, - "stdout": &sinks.StdoutSink{}, - "nats": &sinks.NatsSink{}, - "http": &sinks.HttpSink{}, -} - -var Receivers = map[string]receivers.ReceiverFuncs{ - "nats": &receivers.NatsReceiver{}, -} - -// Structure of the configuration file -type GlobalConfig struct { - Sink sinks.SinkConfig `json:"sink"` - Interval int `json:"interval"` - Duration int `json:"duration"` - Collectors []string `json:"collectors"` - Receiver 
receivers.ReceiverConfig `json:"receiver"` - DefTags map[string]string `json:"default_tags"` - CollectConfigs map[string]json.RawMessage `json:"collect_config"` -} - -// Load JSON configuration file -func LoadConfiguration(file string, config *GlobalConfig) error { +func LoadCentralConfiguration(file string, config *CentralConfigFile) error { configFile, err := os.Open(file) defer configFile.Close() if err != nil { @@ -73,6 +45,56 @@ func LoadConfiguration(file string, config *GlobalConfig) error { return err } +type RuntimeConfig struct { + Hostname string + Interval time.Duration + Duration time.Duration + CliArgs map[string]string + ConfigFile CentralConfigFile + + Router mr.MetricRouter + CollectManager collectors.CollectorManager + SinkManager sinks.SinkManager + ReceiveManager receivers.ReceiveManager + Ticker mct.MultiChanTicker + + Channels []chan lp.CCMetric + Sync sync.WaitGroup +} + +func prepare_runcfg() RuntimeConfig { + return RuntimeConfig{ + Router: nil, + CollectManager: nil, + SinkManager: nil, + ReceiveManager: nil, + } +} + +//// Structure of the configuration file +//type GlobalConfig struct { +// Sink sinks.SinkConfig `json:"sink"` +// Interval int `json:"interval"` +// Duration int `json:"duration"` +// Collectors []string `json:"collectors"` +// Receiver receivers.ReceiverConfig `json:"receiver"` +// DefTags map[string]string `json:"default_tags"` +// CollectConfigs map[string]json.RawMessage `json:"collect_config"` +//} + +//// Load JSON configuration file +//func LoadConfiguration(file string, config *GlobalConfig) error { +// configFile, err := os.Open(file) +// defer configFile.Close() +// if err != nil { +// fmt.Println(err.Error()) +// return err +// } +// jsonParser := json.NewDecoder(configFile) +// err = jsonParser.Decode(config) +// return err +//} + func ReadCli() map[string]string { var m map[string]string cfg := flag.String("config", "./config.json", "Path to configuration file") @@ -92,228 +114,168 @@ func ReadCli() 
map[string]string { return m } -func SetLogging(logfile string) error { - var file *os.File - var err error - if logfile != "stderr" { - file, err = os.OpenFile(logfile, os.O_APPEND|os.O_CREATE|os.O_WRONLY, 0600) - if err != nil { - log.Fatal(err) - return err - } - } else { - file = os.Stderr - } - log.SetOutput(file) - return nil -} +//func SetLogging(logfile string) error { +// var file *os.File +// var err error +// if logfile != "stderr" { +// file, err = os.OpenFile(logfile, os.O_APPEND|os.O_CREATE|os.O_WRONLY, 0600) +// if err != nil { +// log.Fatal(err) +// return err +// } +// } else { +// file = os.Stderr +// } +// log.SetOutput(file) +// return nil +//} -func CreatePidfile(pidfile string) error { - file, err := os.OpenFile(pidfile, os.O_CREATE|os.O_RDWR, 0600) - if err != nil { - log.Print(err) - return err - } - file.Write([]byte(fmt.Sprintf("%d", os.Getpid()))) - file.Close() - return nil -} +//func CreatePidfile(pidfile string) error { +// file, err := os.OpenFile(pidfile, os.O_CREATE|os.O_RDWR, 0600) +// if err != nil { +// log.Print(err) +// return err +// } +// file.Write([]byte(fmt.Sprintf("%d", os.Getpid()))) +// file.Close() +// return nil +//} -func RemovePidfile(pidfile string) error { - info, err := os.Stat(pidfile) - if !os.IsNotExist(err) && !info.IsDir() { - os.Remove(pidfile) - } - return nil -} +//func RemovePidfile(pidfile string) error { +// info, err := os.Stat(pidfile) +// if !os.IsNotExist(err) && !info.IsDir() { +// os.Remove(pidfile) +// } +// return nil +//} // General shutdown function that gets executed in case of interrupt or graceful shutdown -func shutdown(wg *sync.WaitGroup, collectors []string, sink sinks.SinkFuncs, recv receivers.ReceiverFuncs, pidfile string) { +func shutdown(config *RuntimeConfig) { log.Print("Shutdown...") - for _, c := range collectors { - col := Collectors[c] - log.Print("Stop ", col.Name()) - col.Close() + if config.CollectManager != nil { + log.Print("Shutdown CollectManager...") + 
config.CollectManager.Close() } - time.Sleep(1 * time.Second) - if recv != nil { - recv.Close() + if config.ReceiveManager != nil { + log.Print("Shutdown ReceiveManager...") + config.ReceiveManager.Close() } - sink.Close() - RemovePidfile(pidfile) - wg.Done() + if config.Router != nil { + log.Print("Shutdown Router...") + config.Router.Close() + } + if config.SinkManager != nil { + log.Print("Shutdown SinkManager...") + config.SinkManager.Close() + } + + // pidfile := config.ConfigFile.Pidfile + // RemovePidfile(pidfile) + // pidfile = config.CliArgs["pidfile"] + // RemovePidfile(pidfile) + config.Sync.Done() } // Register an interrupt handler for Ctrl+C and similar. At signal, // all collectors are closed -func prepare_shutdown(wg *sync.WaitGroup, config *GlobalConfig, sink sinks.SinkFuncs, recv receivers.ReceiverFuncs, pidfile string) { +func prepare_shutdown(config *RuntimeConfig) { sigs := make(chan os.Signal, 1) signal.Notify(sigs, os.Interrupt) - go func(wg *sync.WaitGroup) { + go func(config *RuntimeConfig) { <-sigs log.Print("Shutdown...") - shutdown(wg, config.Collectors, sink, recv, pidfile) - }(wg) + shutdown(config) + }(config) } func main() { - var config GlobalConfig - var wg sync.WaitGroup - var recv receivers.ReceiverFuncs = nil - var use_recv bool - use_recv = false - wg.Add(1) - host, err := os.Hostname() - if err != nil { - log.Print(err) - return - } - // Drop domain part of host name - host = strings.SplitN(host, `.`, 2)[0] - clicfg := ReadCli() - err = CreatePidfile(clicfg["pidfile"]) - err = SetLogging(clicfg["logfile"]) - if err != nil { - log.Print("Error setting up logging system to ", clicfg["logfile"], " on ", host) - return - } + var err error + use_recv := false + + rcfg := prepare_runcfg() + rcfg.CliArgs = ReadCli() // Load and check configuration - err = LoadConfiguration(clicfg["configfile"], &config) + err = LoadCentralConfiguration(rcfg.CliArgs["configfile"], &rcfg.ConfigFile) if err != nil { - log.Print("Error reading 
configuration file ", clicfg["configfile"]) + log.Print("Error reading configuration file ", rcfg.CliArgs["configfile"]) log.Print(err.Error()) return } - if config.Interval <= 0 || time.Duration(config.Interval)*time.Second <= 0 { + if rcfg.ConfigFile.Interval <= 0 || time.Duration(rcfg.ConfigFile.Interval)*time.Second <= 0 { log.Print("Configuration value 'interval' must be greater than zero") return } - if config.Duration <= 0 { + rcfg.Interval = time.Duration(rcfg.ConfigFile.Interval) * time.Second + if rcfg.ConfigFile.Duration <= 0 || time.Duration(rcfg.ConfigFile.Duration)*time.Second <= 0 { log.Print("Configuration value 'duration' must be greater than zero") return } - if len(config.Collectors) == 0 { - var keys []string - for k := range Collectors { - keys = append(keys, k) - } - log.Print("Configuration value 'collectors' does not contain any collector. Available: ", strings.Join(keys, ", ")) - return - } - for _, name := range config.Collectors { - if _, found := Collectors[name]; !found { - log.Print("Invalid collector '", name, "' in configuration") - return - } - } - if _, found := Sinks[config.Sink.Type]; !found { - log.Print("Invalid sink type '", config.Sink.Type, "' in configuration") - return - } - // Setup sink - sink := Sinks[config.Sink.Type] - err = sink.Init(config.Sink) + rcfg.Duration = time.Duration(rcfg.ConfigFile.Duration) * time.Second + + rcfg.Hostname, err = os.Hostname() if err != nil { - log.Print(err) + log.Print(err.Error()) return } - // Setup receiver - if len(config.Receiver.Type) > 0 && config.Receiver.Type != "none" { - if _, found := Receivers[config.Receiver.Type]; !found { - log.Print("Invalid receiver type '", config.Receiver.Type, "' in configuration") - return - } else { - recv = Receivers[config.Receiver.Type] - err = recv.Init(config.Receiver, sink) - if err == nil { - use_recv = true - } else { - log.Print(err) - } - } - } - - // Register interrupt handler - prepare_shutdown(&wg, &config, sink, recv, 
clicfg["pidfile"]) - - // Initialize all collectors - tmp := make([]string, 0) - for _, c := range config.Collectors { - col := Collectors[c] - conf, found := config.CollectConfigs[c] - if !found { - conf = json.RawMessage("") - } - err = col.Init([]byte(conf)) + // Drop domain part of host name + rcfg.Hostname = strings.SplitN(rcfg.Hostname, `.`, 2)[0] + // err = CreatePidfile(rcfg.CliArgs["pidfile"]) + // err = SetLogging(rcfg.CliArgs["logfile"]) + // if err != nil { + // log.Print("Error setting up logging system to ", rcfg.CliArgs["logfile"], " on ", rcfg.Hostname) + // return + // } + rcfg.Ticker = mct.NewTicker(rcfg.Interval) + if len(rcfg.ConfigFile.RouterConfigFile) > 0 { + rcfg.Router, err = mr.New(rcfg.Ticker, &rcfg.Sync, rcfg.ConfigFile.RouterConfigFile) if err != nil { - log.Print("SKIP ", col.Name(), " (", err.Error(), ")") - } else if !col.Initialized() { - log.Print("SKIP ", col.Name(), " (Not initialized)") - } else { - log.Print("Start ", col.Name()) - tmp = append(tmp, c) + log.Print(err.Error()) + return } } - config.Collectors = tmp - config.DefTags["hostname"] = host - - // Setup up ticker loop - if clicfg["once"] != "true" { - log.Print("Running loop every ", time.Duration(config.Interval)*time.Second) - } else { - log.Print("Running loop only once") + if len(rcfg.ConfigFile.SinkConfigFile) > 0 { + rcfg.SinkManager, err = sinks.New(&rcfg.Sync, rcfg.ConfigFile.SinkConfigFile) + if err != nil { + log.Print(err.Error()) + return + } + RouterToSinksChannel := make(chan lp.CCMetric) + rcfg.SinkManager.AddInput(RouterToSinksChannel) + rcfg.Router.AddOutput(RouterToSinksChannel) } - ticker := time.NewTicker(time.Duration(config.Interval) * time.Second) - done := make(chan bool) + if len(rcfg.ConfigFile.CollectorConfigFile) > 0 { + rcfg.CollectManager, err = collectors.New(rcfg.Ticker, rcfg.Duration, &rcfg.Sync, rcfg.ConfigFile.CollectorConfigFile) + if err != nil { + log.Print(err.Error()) + return + } + CollectToRouterChannel := make(chan 
lp.CCMetric) + rcfg.CollectManager.AddOutput(CollectToRouterChannel) + rcfg.Router.AddInput(CollectToRouterChannel) + } + if len(rcfg.ConfigFile.ReceiverConfigFile) > 0 { + rcfg.ReceiveManager, err = receivers.New(&rcfg.Sync, rcfg.ConfigFile.ReceiverConfigFile) + if err != nil { + log.Print(err.Error()) + return + } + ReceiveToRouterChannel := make(chan lp.CCMetric) + rcfg.ReceiveManager.AddOutput(ReceiveToRouterChannel) + rcfg.Router.AddInput(ReceiveToRouterChannel) + use_recv = true + } + prepare_shutdown(&rcfg) + rcfg.Sync.Add(1) + rcfg.Router.Start() + rcfg.SinkManager.Start() + rcfg.CollectManager.Start() - // Storage for all node metrics - tmpPoints := make([]lp.MutableMetric, 0) - - // Start receiver if use_recv { - recv.Start() + rcfg.ReceiveManager.Start() } - go func() { - for { - select { - case <-done: - return - case t := <-ticker.C: - - // Read all collectors are sort the results in the right - // storage locations - for _, c := range config.Collectors { - col := Collectors[c] - col.Read(time.Duration(config.Duration)*time.Second, &tmpPoints) - - for { - if len(tmpPoints) == 0 { - break - } - p := tmpPoints[0] - for k, v := range config.DefTags { - p.AddTag(k, v) - p.SetTime(t) - } - sink.Write(p) - tmpPoints = tmpPoints[1:] - } - } - - if err := sink.Flush(); err != nil { - log.Printf("sink error: %s\n", err) - } - if clicfg["once"] == "true" { - shutdown(&wg, config.Collectors, sink, recv, clicfg["pidfile"]) - return - } - } - } - }() - // Wait until receiving an interrupt - wg.Wait() + rcfg.Sync.Wait() } diff --git a/receivers.json b/receivers.json new file mode 100644 index 0000000..e368fc3 --- /dev/null +++ b/receivers.json @@ -0,0 +1,8 @@ +[ + { + "type": "nats", + "address": "nats://my-url", + "port" : "4222", + "database": "testcluster" + } +] diff --git a/receivers/README.md b/receivers/README.md index 7733a94..24425f2 100644 --- a/receivers/README.md +++ b/receivers/README.md @@ -1,35 +1,44 @@ -This folder contains the receivers for the 
cc-metric-collector. +# CCMetric receivers -# `metricReceiver.go` -The base class/configuration is located in `metricReceiver.go`. +This folder contains the ReceiveManager and receiver implementations for the cc-metric-collector. -# Receivers -* `natsReceiver.go`: Receives metrics from the Nats transport system in Influx line protocol encoding. The database name is used as subscription subject for the NATS messages. It uses https://github.com/nats-io/nats.go +# Configuration -# Installation -Nothing to do, all receivers are pure Go code - -# Receiver configuration +The configuration file for the receivers is a list of configurations. The `type` field in each specifies which receiver to initialize. ```json - "receiver": { +[ + { "type": "nats", - "address": "nats://my-url" + "address": "nats://my-url", "port" : "4222", "database": "testcluster" - }, + } +] ``` -## `nats` -The receiver connects to `address` and `port` and subscribes itself for all messages with topic `database`. The default port is `4222`. +## Type `nats` + +```json +{ + "type": "nats", + "address": "", + "port" : "", + "database": "" +} +``` + +The `nats` receiver subscribes to the topic `database` and listens on `address` and `port` for metrics in the InfluxDB line protocol. # Contributing own receivers A receiver contains three functions and is derived from the type `Receiver` (in `metricReceiver.go`): * `Init(config ReceiverConfig) error` * `Start() error` * `Close()` +* `Name() string` +* `SetSink(sink chan ccMetric.CCMetric)` The data structures should be set up in `Init()` like opening a file or server connection. The `Start()` function should either start a go routine or issue some other asynchronous mechanism for receiving metrics. The `Close()` function should tear down anything created in `Init()`. -Finally, the receiver needs to be registered in the `metric-collector.go`. There is a list of receivers called `Receivers` which is a map (string -> pointer to receiver). 
Add a new entry with a descriptive name and the new receiver. +Finally, the receiver needs to be registered in the `receiveManager.go`. There is a list of receivers called `AvailableReceivers` which is a map (`receiver_type_string` -> `pointer to Receiver interface`). Add a new entry with a descriptive name and the new receiver. diff --git a/receivers/metricReceiver.go b/receivers/metricReceiver.go index acdc455..2c74409 100644 --- a/receivers/metricReceiver.go +++ b/receivers/metricReceiver.go @@ -2,30 +2,41 @@ package receivers import ( // "time" - s "github.com/ClusterCockpit/cc-metric-collector/sinks" + lp "github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric" influx "github.com/influxdata/line-protocol" ) type ReceiverConfig struct { - Addr string `json:"address"` - Port string `json:"port"` - Database string `json:"database"` - Type string `json:"type"` + Addr string `json:"address"` + Port string `json:"port"` + Database string `json:"database"` + Organization string `json:"organization,omitempty"` + Type string `json:"type"` } -type Receiver struct { +type receiver struct { name string addr string port string database string organization string - sink s.SinkFuncs + sink chan lp.CCMetric } -type ReceiverFuncs interface { - Init(config ReceiverConfig, sink s.SinkFuncs) error +type Receiver interface { + Init(config ReceiverConfig) error Start() Close() + Name() string + SetSink(sink chan lp.CCMetric) +} + +func (r *receiver) Name() string { + return r.name +} + +func (r *receiver) SetSink(sink chan lp.CCMetric) { + r.sink = sink } func Tags2Map(metric influx.Metric) map[string]string { diff --git a/receivers/natsReceiver.go b/receivers/natsReceiver.go index 9d98f00..5cbe90d 100644 --- a/receivers/natsReceiver.go +++ b/receivers/natsReceiver.go @@ -2,56 +2,68 @@ package receivers import ( "errors" - s "github.com/ClusterCockpit/cc-metric-collector/sinks" - lp "github.com/influxdata/line-protocol" + "fmt" + lp 
"github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric" + influx "github.com/influxdata/line-protocol" nats "github.com/nats-io/nats.go" "log" "time" ) +type NatsReceiverConfig struct { + Addr string `json:"address"` + Port string `json:"port"` + Database string `json:"database"` +} + type NatsReceiver struct { - Receiver + receiver nc *nats.Conn - handler *lp.MetricHandler - parser *lp.Parser + handler *influx.MetricHandler + parser *influx.Parser + meta map[string]string + config ReceiverConfig } var DefaultTime = func() time.Time { return time.Unix(42, 0) } -func (r *NatsReceiver) Init(config ReceiverConfig, sink s.SinkFuncs) error { - if len(config.Addr) == 0 || - len(config.Port) == 0 || - len(config.Database) == 0 { +func (r *NatsReceiver) Init(config ReceiverConfig) error { + r.name = "NatsReceiver" + r.config = config + if len(r.config.Addr) == 0 || + len(r.config.Port) == 0 || + len(r.config.Database) == 0 { return errors.New("Not all configuration variables set required by NatsReceiver") } - r.addr = config.Addr + r.meta = map[string]string{"source": r.name} + r.addr = r.config.Addr if len(r.addr) == 0 { r.addr = nats.DefaultURL } - r.port = config.Port + r.port = r.config.Port if len(r.port) == 0 { r.port = "4222" } - log.Print("Init NATS Receiver") - nc, err := nats.Connect(r.addr) + log.Print("[NatsReceiver] INIT") + uri := fmt.Sprintf("%s:%s", r.addr, r.port) + nc, err := nats.Connect(uri) if err == nil { - r.database = config.Database - r.sink = sink + r.database = r.config.Database r.nc = nc } else { - log.Print(err) r.nc = nil + return err } - r.handler = lp.NewMetricHandler() - r.parser = lp.NewParser(r.handler) + r.handler = influx.NewMetricHandler() + r.parser = influx.NewParser(r.handler) r.parser.SetTimeFunc(DefaultTime) return err } func (r *NatsReceiver) Start() { - log.Print("Start NATS Receiver") + log.Print("[NatsReceiver] START") r.nc.Subscribe(r.database, r._NatsReceive) } @@ -59,9 +71,13 @@ func (r *NatsReceiver) 
_NatsReceive(m *nats.Msg) { metrics, err := r.parser.Parse(m.Data) if err == nil { for _, m := range metrics { - y, err := lp.New(m.Name(), Tags2Map(m), Fields2Map(m), m.Time()) - if err == nil { - r.sink.Write(y) + y := lp.FromInfluxMetric(m) + for k, v := range r.meta { + y.AddMeta(k, v) + } + //y, err := lp.New(m.Name(), Tags2Map(m), r.meta, Fields2Map(m), m.Time()) + if r.sink != nil { + r.sink <- y } } } @@ -69,7 +85,7 @@ func (r *NatsReceiver) _NatsReceive(m *nats.Msg) { func (r *NatsReceiver) Close() { if r.nc != nil { - log.Print("Close NATS Receiver") + log.Print("[NatsReceiver] CLOSE") r.nc.Close() } } diff --git a/receivers/receiveManager.go b/receivers/receiveManager.go new file mode 100644 index 0000000..62f70b3 --- /dev/null +++ b/receivers/receiveManager.go @@ -0,0 +1,153 @@ +package receivers + +import ( + "encoding/json" + lp "github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric" + "log" + "os" + "sync" +) + +var AvailableReceivers = map[string]Receiver{ + "nats": &NatsReceiver{}, +} + +type receiveManager struct { + inputs []Receiver + output chan lp.CCMetric + done chan bool + wg *sync.WaitGroup + config []ReceiverConfig +} + +type ReceiveManager interface { + Init(wg *sync.WaitGroup, receiverConfigFile string) error + AddInput(rawConfig json.RawMessage) error + AddOutput(output chan lp.CCMetric) + Start() + Close() +} + +func (rm *receiveManager) Init(wg *sync.WaitGroup, receiverConfigFile string) error { + rm.inputs = make([]Receiver, 0) + rm.output = nil + rm.done = make(chan bool) + rm.wg = wg + rm.config = make([]ReceiverConfig, 0) + configFile, err := os.Open(receiverConfigFile) + if err != nil { + log.Print(err.Error()) + return err + } + defer configFile.Close() + jsonParser := json.NewDecoder(configFile) + var rawConfigs []json.RawMessage + err = jsonParser.Decode(&rawConfigs) + if err != nil { + log.Print(err.Error()) + return err + } + for _, raw := range rawConfigs { + log.Print("[ReceiveManager] ", string(raw)) + 
rm.AddInput(raw) + // if _, found := AvailableReceivers[k.Type]; !found { + // log.Print("[ReceiveManager] SKIP Config specifies unknown receiver 'type': ", k.Type) + // continue + // } + // r := AvailableReceivers[k.Type] + // err = r.Init(k) + // if err != nil { + // log.Print("[ReceiveManager] SKIP Receiver ", k.Type, " cannot be initialized: ", err.Error()) + // continue + // } + // rm.inputs = append(rm.inputs, r) + } + return nil +} + +func (rm *receiveManager) Start() { + rm.wg.Add(1) + + for _, r := range rm.inputs { + log.Print("[ReceiveManager] START ", r.Name()) + r.Start() + } + log.Print("[ReceiveManager] STARTED\n") + // go func() { + // for { + //ReceiveManagerLoop: + // select { + // case <- rm.done: + // log.Print("ReceiveManager done\n") + // rm.wg.Done() + // break ReceiveManagerLoop + // default: + // for _, c := range rm.inputs { + //ReceiveManagerInputLoop: + // select { + // case <- rm.done: + // log.Print("ReceiveManager done\n") + // rm.wg.Done() + // break ReceiveManagerInputLoop + // case p := <- c: + // log.Print("ReceiveManager: ", p) + // rm.output <- p + // default: + // } + // } + // } + // } + // }() + // for _, r := range rm.inputs { + // r.Close() + // } +} + +func (rm *receiveManager) AddInput(rawConfig json.RawMessage) error { + var config ReceiverConfig + err := json.Unmarshal(rawConfig, &config) + if err != nil { + log.Print("[ReceiveManager] SKIP ", config.Type, " JSON config error: ", err.Error()) + log.Print(err.Error()) + return err + } + if _, found := AvailableReceivers[config.Type]; !found { + log.Print("[ReceiveManager] SKIP ", config.Type, " unknown receiver: ", err.Error()) + return err + } + r := AvailableReceivers[config.Type] + err = r.Init(config) + if err != nil { + log.Print("[ReceiveManager] SKIP ", r.Name(), " initialization failed: ", err.Error()) + return err + } + rm.inputs = append(rm.inputs, r) + rm.config = append(rm.config, config) + return nil +} + +func (rm *receiveManager) AddOutput(output chan 
lp.CCMetric) { + rm.output = output + for _, r := range rm.inputs { + r.SetSink(rm.output) + } +} + +func (rm *receiveManager) Close() { + for _, r := range rm.inputs { + log.Print("[ReceiveManager] CLOSE ", r.Name()) + r.Close() + } + rm.wg.Done() + log.Print("[ReceiveManager] CLOSE\n") + log.Print("[ReceiveManager] EXIT\n") +} + +func New(wg *sync.WaitGroup, receiverConfigFile string) (ReceiveManager, error) { + r := &receiveManager{} + err := r.Init(wg, receiverConfigFile) + if err != nil { + return nil, err + } + return r, err +} diff --git a/router.json b/router.json new file mode 100644 index 0000000..a9f8714 --- /dev/null +++ b/router.json @@ -0,0 +1,22 @@ +{ + "add_tags" : [ + { + "key" : "cluster", + "value" : "testcluster", + "if" : "*" + }, + { + "key" : "test", + "value" : "testing", + "if" : "name == 'temp_package_id_0'" + } + ], + "delete_tags" : [ + { + "key" : "unit", + "value" : "*", + "if" : "*" + } + ], + "interval_timestamp" : true +} diff --git a/sinks.json b/sinks.json new file mode 100644 index 0000000..d304018 --- /dev/null +++ b/sinks.json @@ -0,0 +1,6 @@ +[ + { + "type" : "stdout", + "meta_as_tags" : true + } +] diff --git a/sinks/README.md b/sinks/README.md index 66783c4..8fac8e5 100644 --- a/sinks/README.md +++ b/sinks/README.md @@ -1,65 +1,99 @@ -This folder contains the sinks for the cc-metric-collector. +# CCMetric sinks -# `metricSink.go` -The base class/configuration is located in `metricSink.go`. +This folder contains the SinkManager and sink implementations for the cc-metric-collector. -# Sinks -* `stdoutSink.go`: Writes all metrics to `stdout` in InfluxDB line protocol. The sink does not use https://github.com/influxdata/line-protocol to reduce the executed code for debugging -* `influxSink.go`: Writes all metrics to an InfluxDB database instance using a blocking writer. It uses https://github.com/influxdata/influxdb-client-go . 
Configuration for the server, port, ssl, password, database name and organisation are in the global configuration file. The 'password' is used for the token and the 'database' for the bucket. It uses the v2 API of Influx. -* `natsSink.go`: Sends all metrics to an NATS server using the InfluxDB line protocol as encoding. It uses https://github.com/nats-io/nats.go . Configuration for the server, port, user, password and database name are in the global configuration file. The database name is used as subject for the NATS messages. -* `httpSink.go`: Sends all metrics to an HTTP endpoint `http://:/` using a POST request. The body of the request will consist of lines in the InfluxDB line protocol. In case password is specified, that password is used as a JWT in the 'Authorization' header. +# Configuration -# Installation -Nothing to do, all sinks are pure Go code - -# Sink configuration +The configuration file for the sinks is a list of configurations. The `type` field in each specifies which sink to initialize. ```json - "sink": { - "user": "testuser", - "password": "testpass", - "host": "127.0.0.1", - "port": "9090", - "database": "testdb", - "organization": "testorg", - "ssl": false - "type": "stdout" +[ + { + "type" : "stdout", + "meta_as_tags" : false + }, + { + "type" : "http", + "host" : "localhost", + "port" : "4123", + "database" : "ccmetric", + "password" : "" } +] ``` -## `stdout` -When configuring `type = stdout`, all metrics are printed to stdout. No further configuration is required or touched, so you can leave your other-sink-config in there and just change the `type` for debugging purposes +This example initializes two sinks, the `stdout` sink printing all metrics to the STDOUT and the `http` sink with the given `host`, `port`, `database` and `password`. -## `influxdb` -The InfluxDB sink uses blocking write operations to write to an InfluxDB database using the v2 API. 
It uses the following configuration options: -* `host`: Hostname of the database instance -* `port`: Portnumber (as string) of the database -* `database`: Name of the database, called 'bucket' in InfluxDB v2 -* `organization`: The InfluxDB v2 API uses organizations to separate database instances running on the same host -* `ssl`: Boolean to activate SSL/TLS -* `user`: Although the v2 API uses API keys instead of username and password, this field can be used if the sink should authentificate with `username:password`. If you want to use an API key, leave this field empty. -* `password`: API key for the InfluxDB v2 API or password if `user` is set +If `meta_as_tags` is set, all meta information attached to CCMetric are printed out as tags. -## `nats` -* `host`: Hostname of the NATS server -* `port`: Portnumber (as string) of the NATS server -* `user`: Username for authentification in the NATS transport system -* `password`: Password for authentification in the NATS transport system +## Type `stdout` + +```json +{ + "type" : "stdout", + "meta_as_tags" : +} +``` + +The `stdout` sink dumps all metrics to the STDOUT. + +## Type `http` + +```json +{ + "type" : "http", + "host" : "", + "port" : "", + "database" : "", + "password" : "", + "meta_as_tags" : +} +``` +The sink uses POST requests to send metrics to `http://:/` using the JWT token as a JWT in the 'Authorization' header. + +## Type `nats` + +```json +{ + "type" : "nats", + "host" : "", + "port" : "", + "user" : "", + "password" : "", + "database" : "" + "meta_as_tags" : +} +``` + +This sink publishes the CCMetric in a NATS environment using `host`, `port`, `user` and `password` for connecting. The metrics are published using the topic `database`. + +## Type `influxdb` + +```json +{ + "type" : "influxdb", + "host" : "", + "port" : "", + "user" : "", + "password" : "", + "database" : "" + "organization": "", + "ssl" : , + "meta_as_tags" : +} +``` + +This sink submits the CCMetrics to an InfluxDB time-series database. 
It uses `host`, `port` and `ssl` for connecting. For authentification, it uses either `user:password` if `user` is set and only `password` as API key. The `organization` and `database` are used for writing to the correct database. -## `http` -* `host`: Hostname of the HTTP server -* `port`: Portnumber (as string) of the HTTP server -* `database`: Endpoint to write to. HTTP POST requests are performed on `http://:/` -* `password`: JSON Web token used for authentification # Contributing own sinks -A sink contains three functions and is derived from the type `Sink` (in `metricSink.go`): +A sink contains three functions and is derived from the type `Sink`: * `Init(config SinkConfig) error` -* `Write(measurement string, tags map[string]string, fields map[string]interface{}, t time.Time) error` +* `Write(point CCMetric) error` * `Flush() error` * `Close()` -The data structures should be set up in `Init()` like opening a file or server connection. The `Write()` function takes a measurement, tags, fields and a timestamp and writes/sends the data. For non-blocking sinks, the `Flush()` method tells the sink to drain its internal buffers. The `Close()` function should tear down anything created in `Init()`. +The data structures should be set up in `Init()` like opening a file or server connection. The `Write()` function writes/sends the data. For non-blocking sinks, the `Flush()` method tells the sink to drain its internal buffers. The `Close()` function should tear down anything created in `Init()`. -Finally, the sink needs to be registered in the `metric-collector.go`. There is a list of sinks called `Sinks` which is a map (sink_type_string -> pointer to sink). Add a new entry with a descriptive name and the new sink. +Finally, the sink needs to be registered in the `sinkManager.go`. There is a list of sinks called `AvailableSinks` which is a map (`sink_type_string` -> `pointer to sink interface`). Add a new entry with a descriptive name and the new sink. 
diff --git a/sinks/httpSink.go b/sinks/httpSink.go index e443ceb..25b0082 100644 --- a/sinks/httpSink.go +++ b/sinks/httpSink.go @@ -7,19 +7,21 @@ import ( "net/http" "time" - lp "github.com/influxdata/line-protocol" + lp "github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric" + influx "github.com/influxdata/line-protocol" ) type HttpSink struct { - Sink + sink client *http.Client url, jwt string - encoder *lp.Encoder + encoder *influx.Encoder buffer *bytes.Buffer } -func (s *HttpSink) Init(config SinkConfig) error { - if len(config.Host) == 0 || len(config.Port) == 0 { +func (s *HttpSink) Init(config sinkConfig) error { + s.name = "HttpSink" + if len(config.Host) == 0 || len(config.Port) == 0 || len(config.Database) == 0 { return errors.New("`host`, `port` and `database` config options required for TCP sink") } @@ -28,13 +30,13 @@ func (s *HttpSink) Init(config SinkConfig) error { s.port = config.Port s.jwt = config.Password s.buffer = &bytes.Buffer{} - s.encoder = lp.NewEncoder(s.buffer) + s.encoder = influx.NewEncoder(s.buffer) s.encoder.SetPrecision(time.Second) return nil } -func (s *HttpSink) Write(point lp.MutableMetric) error { +func (s *HttpSink) Write(point lp.CCMetric) error { _, err := s.encoder.Encode(point) return err } diff --git a/sinks/influxSink.go b/sinks/influxSink.go index 40e681f..dca1572 100644 --- a/sinks/influxSink.go +++ b/sinks/influxSink.go @@ -5,15 +5,14 @@ import ( "crypto/tls" "errors" "fmt" - + lp "github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric" influxdb2 "github.com/influxdata/influxdb-client-go/v2" influxdb2Api "github.com/influxdata/influxdb-client-go/v2/api" - lp "github.com/influxdata/line-protocol" "log" ) type InfluxSink struct { - Sink + sink client influxdb2.Client writeApi influxdb2Api.WriteAPIBlocking retPolicy string @@ -39,7 +38,8 @@ func (s *InfluxSink) connect() error { return nil } -func (s *InfluxSink) Init(config SinkConfig) error { +func (s *InfluxSink) Init(config sinkConfig) error { + 
s.name = "InfluxSink" if len(config.Host) == 0 || len(config.Port) == 0 || len(config.Database) == 0 || @@ -54,15 +54,21 @@ func (s *InfluxSink) Init(config SinkConfig) error { s.user = config.User s.password = config.Password s.ssl = config.SSL + s.meta_as_tags = config.MetaAsTags return s.connect() } -func (s *InfluxSink) Write(point lp.MutableMetric) error { +func (s *InfluxSink) Write(point lp.CCMetric) error { tags := map[string]string{} fields := map[string]interface{}{} for _, t := range point.TagList() { tags[t.Key] = t.Value } + if s.meta_as_tags { + for _, m := range point.MetaList() { + tags[m.Key] = m.Value + } + } for _, f := range point.FieldList() { fields[f.Key] = f.Value } diff --git a/sinks/metricSink.go b/sinks/metricSink.go index 182495a..25f66bb 100644 --- a/sinks/metricSink.go +++ b/sinks/metricSink.go @@ -2,21 +2,22 @@ package sinks import ( // "time" - lp "github.com/influxdata/line-protocol" + lp "github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric" ) -type SinkConfig struct { - Host string `json:"host"` - Port string `json:"port"` - Database string `json:"database"` - User string `json:"user"` - Password string `json:"password"` - Organization string `json:"organization"` +type sinkConfig struct { Type string `json:"type"` - SSL bool `json:"ssl"` + Host string `json:"host,omitempty"` + Port string `json:"port,omitempty"` + Database string `json:"database,omitempty"` + User string `json:"user,omitempty"` + Password string `json:"password,omitempty"` + Organization string `json:"organization,omitempty"` + SSL bool `json:"ssl,omitempty"` + MetaAsTags bool `json:"meta_as_tags,omitempty"` } -type Sink struct { +type sink struct { host string port string user string @@ -24,11 +25,18 @@ type Sink struct { database string organization string ssl bool + meta_as_tags bool + name string } -type SinkFuncs interface { - Init(config SinkConfig) error - Write(point lp.MutableMetric) error +type Sink interface { + Init(config sinkConfig) error 
+ Write(point lp.CCMetric) error Flush() error Close() + Name() string +} + +func (s *sink) Name() string { + return s.name } diff --git a/sinks/natsSink.go b/sinks/natsSink.go index 0df14f4..f9cd7eb 100644 --- a/sinks/natsSink.go +++ b/sinks/natsSink.go @@ -4,16 +4,17 @@ import ( "bytes" "errors" "fmt" - lp "github.com/influxdata/line-protocol" + lp "github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric" + influx "github.com/influxdata/line-protocol" nats "github.com/nats-io/nats.go" "log" "time" ) type NatsSink struct { - Sink + sink client *nats.Conn - encoder *lp.Encoder + encoder *influx.Encoder buffer *bytes.Buffer } @@ -31,7 +32,8 @@ func (s *NatsSink) connect() error { return nil } -func (s *NatsSink) Init(config SinkConfig) error { +func (s *NatsSink) Init(config sinkConfig) error { + s.name = "NatsSink" if len(config.Host) == 0 || len(config.Port) == 0 || len(config.Database) == 0 { @@ -46,40 +48,31 @@ func (s *NatsSink) Init(config SinkConfig) error { // Setup Influx line protocol s.buffer = &bytes.Buffer{} s.buffer.Grow(1025) - s.encoder = lp.NewEncoder(s.buffer) + s.encoder = influx.NewEncoder(s.buffer) s.encoder.SetPrecision(time.Second) s.encoder.SetMaxLineBytes(1024) // Setup infos for connection return s.connect() } -func (s *NatsSink) Write(point lp.MutableMetric) error { +func (s *NatsSink) Write(point lp.CCMetric) error { if s.client != nil { - // var tags map[string]string - // var fields map[string]interface{} - // for _, t := range point.TagList() { - // tags[t.Key] = t.Value - // } - // for _, f := range point.FieldList() { - // fields[f.Key] = f.Value - // } - // m, err := protocol.New(point.Name(), tags, fields, point.Time()) - // if err != nil { - // log.Print(err) - // return err - // } _, err := s.encoder.Encode(point) if err != nil { log.Print(err) return err } - s.client.Publish(s.database, s.buffer.Bytes()) - s.buffer.Reset() } return nil } func (s *NatsSink) Flush() error { + if s.client != nil { + if err := 
s.client.Publish(s.database, s.buffer.Bytes()); err != nil { + return err + } + s.buffer.Reset() + } return nil } diff --git a/sinks/sinkManager.go b/sinks/sinkManager.go new file mode 100644 index 0000000..beb0f32 --- /dev/null +++ b/sinks/sinkManager.go @@ -0,0 +1,141 @@ +package sinks + +import ( + "encoding/json" + "log" + "os" + "sync" + + lp "github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric" +) + +var AvailableSinks = map[string]Sink{ + "influxdb": &InfluxSink{}, + "stdout": &StdoutSink{}, + "nats": &NatsSink{}, + "http": &HttpSink{}, +} + +type sinkManager struct { + input chan lp.CCMetric + outputs []Sink + done chan bool + wg *sync.WaitGroup + config []sinkConfig +} + +type SinkManager interface { + Init(wg *sync.WaitGroup, sinkConfigFile string) error + AddInput(input chan lp.CCMetric) + AddOutput(config json.RawMessage) error + Start() + Close() +} + +func (sm *sinkManager) Init(wg *sync.WaitGroup, sinkConfigFile string) error { + sm.input = nil + sm.outputs = make([]Sink, 0) + sm.done = make(chan bool) + sm.wg = wg + sm.config = make([]sinkConfig, 0) + if len(sinkConfigFile) > 0 { + configFile, err := os.Open(sinkConfigFile) + if err != nil { + log.Print("[SinkManager] ", err.Error()) + return err + } + defer configFile.Close() + jsonParser := json.NewDecoder(configFile) + var rawConfigs []json.RawMessage + err = jsonParser.Decode(&rawConfigs) + if err != nil { + log.Print("[SinkManager] ", err.Error()) + return err + } + for _, raw := range rawConfigs { + err = sm.AddOutput(raw) + if err != nil { + continue + } + } + } + return nil +} + +func (sm *sinkManager) Start() { + sm.wg.Add(1) + batchcount := 20 + go func() { + for { + SinkManagerLoop: + select { + case <-sm.done: + for _, s := range sm.outputs { + s.Close() + } + log.Print("[SinkManager] DONE\n") + sm.wg.Done() + break SinkManagerLoop + case p := <-sm.input: + log.Print("[SinkManager] WRITE ", p) + for _, s := range sm.outputs { + s.Write(p) + } + if batchcount == 0 { + 
log.Print("[SinkManager] FLUSH") + for _, s := range sm.outputs { + s.Flush() + } + batchcount = 20 + } + batchcount-- + default: + } + } + log.Print("[SinkManager] EXIT\n") + }() + log.Print("[SinkManager] STARTED\n") +} + +func (sm *sinkManager) AddInput(input chan lp.CCMetric) { + sm.input = input +} + +func (sm *sinkManager) AddOutput(rawConfig json.RawMessage) error { + var err error + var config sinkConfig + if len(rawConfig) > 3 { + err = json.Unmarshal(rawConfig, &config) + if err != nil { + log.Print("[SinkManager] SKIP ", config.Type, " JSON config error: ", err.Error()) + return err + } + } + if _, found := AvailableSinks[config.Type]; !found { + log.Print("[SinkManager] SKIP ", config.Type, " unknown sink: ", err.Error()) + return err + } + s := AvailableSinks[config.Type] + err = s.Init(config) + if err != nil { + log.Print("[SinkManager] SKIP ", s.Name(), " initialization failed: ", err.Error()) + return err + } + sm.outputs = append(sm.outputs, s) + sm.config = append(sm.config, config) + return nil +} + +func (sm *sinkManager) Close() { + sm.done <- true + log.Print("[SinkManager] CLOSE") +} + +func New(wg *sync.WaitGroup, sinkConfigFile string) (SinkManager, error) { + sm := &sinkManager{} + err := sm.Init(wg, sinkConfigFile) + if err != nil { + return nil, err + } + return sm, err +} diff --git a/sinks/stdoutSink.go b/sinks/stdoutSink.go index 8016fcb..215239f 100644 --- a/sinks/stdoutSink.go +++ b/sinks/stdoutSink.go @@ -6,23 +6,30 @@ import ( "strings" // "time" - lp "github.com/influxdata/line-protocol" + lp "github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric" ) type StdoutSink struct { - Sink + sink } -func (s *StdoutSink) Init(config SinkConfig) error { +func (s *StdoutSink) Init(config sinkConfig) error { + s.name = "StdoutSink" + s.meta_as_tags = config.MetaAsTags return nil } -func (s *StdoutSink) Write(point lp.MutableMetric) error { +func (s *StdoutSink) Write(point lp.CCMetric) error { var tagsstr []string var fieldstr 
[]string for _, t := range point.TagList() { tagsstr = append(tagsstr, fmt.Sprintf("%s=%s", t.Key, t.Value)) } + if s.meta_as_tags { + for _, m := range point.MetaList() { + tagsstr = append(tagsstr, fmt.Sprintf("%s=%s", m.Key, m.Value)) + } + } for _, f := range point.FieldList() { switch f.Value.(type) { case float64: From 99aaece6c280548f6273ca20a992d43b14f50dc0 Mon Sep 17 00:00:00 2001 From: Thomas Roehl Date: Tue, 25 Jan 2022 15:46:41 +0100 Subject: [PATCH 024/174] Activate --once option and return proper exit Code with os.Exit() --- metric-collector.go | 31 ++++++++++++++++++++++--------- 1 file changed, 22 insertions(+), 9 deletions(-) diff --git a/metric-collector.go b/metric-collector.go index c071933..a205a4f 100644 --- a/metric-collector.go +++ b/metric-collector.go @@ -189,7 +189,7 @@ func prepare_shutdown(config *RuntimeConfig) { }(config) } -func main() { +func mainFunc() int { var err error use_recv := false @@ -201,23 +201,23 @@ func main() { if err != nil { log.Print("Error reading configuration file ", rcfg.CliArgs["configfile"]) log.Print(err.Error()) - return + return 1 } if rcfg.ConfigFile.Interval <= 0 || time.Duration(rcfg.ConfigFile.Interval)*time.Second <= 0 { log.Print("Configuration value 'interval' must be greater than zero") - return + return 1 } rcfg.Interval = time.Duration(rcfg.ConfigFile.Interval) * time.Second if rcfg.ConfigFile.Duration <= 0 || time.Duration(rcfg.ConfigFile.Duration)*time.Second <= 0 { log.Print("Configuration value 'duration' must be greater than zero") - return + return 1 } rcfg.Duration = time.Duration(rcfg.ConfigFile.Duration) * time.Second rcfg.Hostname, err = os.Hostname() if err != nil { log.Print(err.Error()) - return + return 1 } // Drop domain part of host name rcfg.Hostname = strings.SplitN(rcfg.Hostname, `.`, 2)[0] @@ -232,14 +232,14 @@ func main() { rcfg.Router, err = mr.New(rcfg.Ticker, &rcfg.Sync, rcfg.ConfigFile.RouterConfigFile) if err != nil { log.Print(err.Error()) - return + return 1 } } if 
len(rcfg.ConfigFile.SinkConfigFile) > 0 { rcfg.SinkManager, err = sinks.New(&rcfg.Sync, rcfg.ConfigFile.SinkConfigFile) if err != nil { log.Print(err.Error()) - return + return 1 } RouterToSinksChannel := make(chan lp.CCMetric) rcfg.SinkManager.AddInput(RouterToSinksChannel) @@ -249,7 +249,7 @@ func main() { rcfg.CollectManager, err = collectors.New(rcfg.Ticker, rcfg.Duration, &rcfg.Sync, rcfg.ConfigFile.CollectorConfigFile) if err != nil { log.Print(err.Error()) - return + return 1 } CollectToRouterChannel := make(chan lp.CCMetric) rcfg.CollectManager.AddOutput(CollectToRouterChannel) @@ -259,7 +259,7 @@ func main() { rcfg.ReceiveManager, err = receivers.New(&rcfg.Sync, rcfg.ConfigFile.ReceiverConfigFile) if err != nil { log.Print(err.Error()) - return + return 1 } ReceiveToRouterChannel := make(chan lp.CCMetric) rcfg.ReceiveManager.AddOutput(ReceiveToRouterChannel) @@ -276,6 +276,19 @@ func main() { rcfg.ReceiveManager.Start() } + // Wait until one tick has passed. This is a workaround + if rcfg.CliArgs["once"] == "true" { + var x int = (1.8 * float64(rcfg.ConfigFile.Interval)) + time.Sleep(time.Duration(int(x)) * time.Second) + shutdown(&rcfg) + } + // Wait until receiving an interrupt rcfg.Sync.Wait() + return 0 +} + +func main() { + exitCode := mainFunc() + os.Exit(exitCode) } From a40d1c954b36941737a35bf4b9dc424bfda04d12 Mon Sep 17 00:00:00 2001 From: Holger Obermaier <40787752+ho-ob@users.noreply.github.com> Date: Tue, 25 Jan 2022 16:33:23 +0100 Subject: [PATCH 025/174] Fix data type mismatch --- metric-collector.go | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/metric-collector.go b/metric-collector.go index a205a4f..b95b38f 100644 --- a/metric-collector.go +++ b/metric-collector.go @@ -17,7 +17,6 @@ import ( "sync" "time" - lp "github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric" mr "github.com/ClusterCockpit/cc-metric-collector/internal/metricRouter" mct 
"github.com/ClusterCockpit/cc-metric-collector/internal/multiChanTicker" @@ -278,7 +277,7 @@ func mainFunc() int { // Wait until one tick has passed. This is a workaround if rcfg.CliArgs["once"] == "true" { - var x int = (1.8 * float64(rcfg.ConfigFile.Interval)) + x := 1.8 * float64(rcfg.ConfigFile.Interval) time.Sleep(time.Duration(int(x)) * time.Second) shutdown(&rcfg) } From bafc6322e674d89e3565d13e03b003d9977c58f9 Mon Sep 17 00:00:00 2001 From: Thomas Roehl Date: Tue, 25 Jan 2022 16:40:02 +0100 Subject: [PATCH 026/174] Change to own Logger --- collectors/collectorManager.go | 23 ++++--- go.mod | 1 + internal/ccLogger/cclogger.go | 111 +++++++++++++++++++++++++++++++++ metric-collector.go | 37 +++++------ 4 files changed, 140 insertions(+), 32 deletions(-) create mode 100644 internal/ccLogger/cclogger.go diff --git a/collectors/collectorManager.go b/collectors/collectorManager.go index 9543431..73b2891 100644 --- a/collectors/collectorManager.go +++ b/collectors/collectorManager.go @@ -2,13 +2,13 @@ package collectors import ( "encoding/json" - "log" "os" "sync" "time" lp "github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric" mct "github.com/ClusterCockpit/cc-metric-collector/internal/multiChanTicker" + cclog "github.com/ClusterCockpit/cc-metric-collector/internal/ccLogger" ) var AvailableCollectors = map[string]MetricCollector{ @@ -58,29 +58,29 @@ func (cm *collectorManager) Init(ticker mct.MultiChanTicker, duration time.Durat cm.duration = duration configFile, err := os.Open(collectConfigFile) if err != nil { - log.Print(err.Error()) + cclog.Error(err.Error()) return err } defer configFile.Close() jsonParser := json.NewDecoder(configFile) err = jsonParser.Decode(&cm.config) if err != nil { - log.Print(err.Error()) + cclog.Error(err.Error()) return err } for k, cfg := range cm.config { - log.Print(k, " ", cfg) if _, found := AvailableCollectors[k]; !found { - log.Print("[CollectorManager] SKIP unknown collector ", k) + 
cclog.ComponentPrint("CollectorManager", "SKIP unknown collector ", k) continue } c := AvailableCollectors[k] err = c.Init(cfg) if err != nil { - log.Print("[CollectorManager] Collector ", k, "initialization failed: ", err.Error()) + cclog.ComponentPrint("CollectorManager", "Collector ", k, "initialization failed: ", err.Error()) continue } + cclog.ComponentDebug("CollectorManager", "Collector ", k, "initialized") cm.collectors = append(cm.collectors, c) } return nil @@ -99,7 +99,7 @@ func (cm *collectorManager) Start() { c.Close() } cm.wg.Done() - log.Print("[CollectorManager] DONE\n") + cclog.ComponentPrint("CollectorManager", "DONE") break CollectorManagerLoop case t := <-tick: for _, c := range cm.collectors { @@ -110,18 +110,17 @@ func (cm *collectorManager) Start() { c.Close() } cm.wg.Done() - log.Print("[CollectorManager] DONE\n") + cclog.ComponentPrint("CollectorManager", "DONE") break CollectorManagerInputLoop default: - log.Print("[CollectorManager] ", c.Name(), " ", t) + cclog.ComponentPrint("CollectorManager", c.Name(), " ", t) c.Read(cm.duration, cm.output) } } } } - log.Print("[CollectorManager] EXIT\n") }() - log.Print("[CollectorManager] STARTED\n") + cclog.ComponentPrint("CollectorManager", "STARTED") } func (cm *collectorManager) AddOutput(output chan lp.CCMetric) { @@ -130,7 +129,7 @@ func (cm *collectorManager) AddOutput(output chan lp.CCMetric) { func (cm *collectorManager) Close() { cm.done <- true - log.Print("[CollectorManager] CLOSE") + cclog.ComponentPrint("CollectorManager", "CLOSE") } func New(ticker mct.MultiChanTicker, duration time.Duration, wg *sync.WaitGroup, collectConfigFile string) (CollectorManager, error) { diff --git a/go.mod b/go.mod index be384b6..d20d431 100644 --- a/go.mod +++ b/go.mod @@ -9,5 +9,6 @@ require ( github.com/nats-io/nats.go v1.10.0 github.com/nats-io/nkeys v0.1.4 // indirect github.com/prometheus/client_golang v1.10.0 // indirect + golang.org/x/sys v0.0.0-20210309074719-68d13333faf2 
gopkg.in/Knetic/govaluate.v2 v2.3.0 ) diff --git a/internal/ccLogger/cclogger.go b/internal/ccLogger/cclogger.go new file mode 100644 index 0000000..ad5b986 --- /dev/null +++ b/internal/ccLogger/cclogger.go @@ -0,0 +1,111 @@ +package cclogger + +import ( + "fmt" + "runtime" + "os" + "log" +) + + +var ( + globalDebug = false + stdout = os.Stdout + stderr = os.Stderr + debugLog *log.Logger = nil + infoLog *log.Logger = nil + errorLog *log.Logger = nil + warnLog *log.Logger = nil + defaultLog *log.Logger = nil +) + +func initLogger() { + if debugLog == nil { + debugLog = log.New(stderr, "DEBUG", log.LstdFlags) + } + if infoLog == nil { + infoLog = log.New(stdout, "INFO", log.LstdFlags) + } + if errorLog == nil { + errorLog = log.New(stderr, "ERROR", log.LstdFlags) + } + if warnLog == nil { + warnLog = log.New(stderr, "WARN", log.LstdFlags) + } + if defaultLog == nil { + defaultLog = log.New(stdout, "", log.LstdFlags) + } +} + +func CCPrint(logger *log.Logger, e ... interface {}) { + if logger != nil { + logger.Print(e) + } +} + +func Print(e ... interface{}) { + CCPrint(defaultLog, e) +} + +func ComponentPrint(component string, e ... interface{}) { + CCPrint(defaultLog, fmt.Sprintf("[%s]", component), e) +} + +func Info(e ... interface{}) { + CCPrint(infoLog, e) +} + +func ComponentInfo(component string, e ... interface{}) { + CCPrint(infoLog, fmt.Sprintf("[%s]", component), e) +} + +func Debug(e ... interface{}) { + if globalDebug { + CCPrint(debugLog, e) + } +} + +func ComponentDebug(component string, e ... interface{}) { + if globalDebug { + CCPrint(debugLog, fmt.Sprintf("[%s]", component), e) + } +} + +func Error(e ... interface{}) { + _, fn, line, _ := runtime.Caller(1) + CCPrint(errorLog, fn, line, e) +} + +func ComponentError(component string, e ... 
interface{}) { + _, fn, line, _ := runtime.Caller(1) + CCPrint(errorLog, fmt.Sprintf("[%s]", component), fn, line, e) +} + +func SetDebug() { + globalDebug = true +} + + +func SetOutput(filename string) { + if filename == "stderr" { + if stderr != os.Stderr && stderr != os.Stdout { + stderr.Close() + } + stderr = os.Stderr + } else if filename == "stdout" { + if stderr != os.Stderr && stderr != os.Stdout { + stderr.Close() + } + stderr = os.Stdout + } else { + file, err := os.OpenFile(filename, os.O_APPEND|os.O_CREATE|os.O_WRONLY, 0600) + if err == nil { + defer file.Close() + stderr = file + } + } + debugLog = nil + errorLog = nil + warnLog = nil + initLogger() +} diff --git a/metric-collector.go b/metric-collector.go index a205a4f..56d9cab 100644 --- a/metric-collector.go +++ b/metric-collector.go @@ -3,8 +3,7 @@ package main import ( "encoding/json" "flag" - "fmt" - "log" +// "log" "os" "os/signal" "strings" @@ -17,7 +16,7 @@ import ( "sync" "time" - + cclog "github.com/ClusterCockpit/cc-metric-collector/internal/ccLogger" lp "github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric" mr "github.com/ClusterCockpit/cc-metric-collector/internal/metricRouter" mct "github.com/ClusterCockpit/cc-metric-collector/internal/multiChanTicker" @@ -37,7 +36,7 @@ func LoadCentralConfiguration(file string, config *CentralConfigFile) error { configFile, err := os.Open(file) defer configFile.Close() if err != nil { - fmt.Println(err.Error()) + cclog.Error(err.Error()) return err } jsonParser := json.NewDecoder(configFile) @@ -151,21 +150,21 @@ func ReadCli() map[string]string { // General shutdown function that gets executed in case of interrupt or graceful shutdown func shutdown(config *RuntimeConfig) { - log.Print("Shutdown...") + cclog.Info("Shutdown...") if config.CollectManager != nil { - log.Print("Shutdown CollectManager...") + cclog.Debug("Shutdown CollectManager...") config.CollectManager.Close() } if config.ReceiveManager != nil { - log.Print("Shutdown 
ReceiveManager...") + cclog.Debug("Shutdown ReceiveManager...") config.ReceiveManager.Close() } if config.Router != nil { - log.Print("Shutdown Router...") + cclog.Debug("Shutdown Router...") config.Router.Close() } if config.SinkManager != nil { - log.Print("Shutdown SinkManager...") + cclog.Debug("Shutdown SinkManager...") config.SinkManager.Close() } @@ -184,7 +183,6 @@ func prepare_shutdown(config *RuntimeConfig) { go func(config *RuntimeConfig) { <-sigs - log.Print("Shutdown...") shutdown(config) }(config) } @@ -199,24 +197,23 @@ func mainFunc() int { // Load and check configuration err = LoadCentralConfiguration(rcfg.CliArgs["configfile"], &rcfg.ConfigFile) if err != nil { - log.Print("Error reading configuration file ", rcfg.CliArgs["configfile"]) - log.Print(err.Error()) + cclog.Error("Error reading configuration file ", rcfg.CliArgs["configfile"], ": ", err.Error()) return 1 } if rcfg.ConfigFile.Interval <= 0 || time.Duration(rcfg.ConfigFile.Interval)*time.Second <= 0 { - log.Print("Configuration value 'interval' must be greater than zero") + cclog.Error("Configuration value 'interval' must be greater than zero") return 1 } rcfg.Interval = time.Duration(rcfg.ConfigFile.Interval) * time.Second if rcfg.ConfigFile.Duration <= 0 || time.Duration(rcfg.ConfigFile.Duration)*time.Second <= 0 { - log.Print("Configuration value 'duration' must be greater than zero") + cclog.Error("Configuration value 'duration' must be greater than zero") return 1 } rcfg.Duration = time.Duration(rcfg.ConfigFile.Duration) * time.Second rcfg.Hostname, err = os.Hostname() if err != nil { - log.Print(err.Error()) + cclog.Error(err.Error()) return 1 } // Drop domain part of host name @@ -231,14 +228,14 @@ func mainFunc() int { if len(rcfg.ConfigFile.RouterConfigFile) > 0 { rcfg.Router, err = mr.New(rcfg.Ticker, &rcfg.Sync, rcfg.ConfigFile.RouterConfigFile) if err != nil { - log.Print(err.Error()) + cclog.Error(err.Error()) return 1 } } if len(rcfg.ConfigFile.SinkConfigFile) > 0 { 
rcfg.SinkManager, err = sinks.New(&rcfg.Sync, rcfg.ConfigFile.SinkConfigFile) if err != nil { - log.Print(err.Error()) + cclog.Error(err.Error()) return 1 } RouterToSinksChannel := make(chan lp.CCMetric) @@ -248,7 +245,7 @@ func mainFunc() int { if len(rcfg.ConfigFile.CollectorConfigFile) > 0 { rcfg.CollectManager, err = collectors.New(rcfg.Ticker, rcfg.Duration, &rcfg.Sync, rcfg.ConfigFile.CollectorConfigFile) if err != nil { - log.Print(err.Error()) + cclog.Error(err.Error()) return 1 } CollectToRouterChannel := make(chan lp.CCMetric) @@ -258,7 +255,7 @@ func mainFunc() int { if len(rcfg.ConfigFile.ReceiverConfigFile) > 0 { rcfg.ReceiveManager, err = receivers.New(&rcfg.Sync, rcfg.ConfigFile.ReceiverConfigFile) if err != nil { - log.Print(err.Error()) + cclog.Error(err.Error()) return 1 } ReceiveToRouterChannel := make(chan lp.CCMetric) @@ -278,7 +275,7 @@ func mainFunc() int { // Wait until one tick has passed. This is a workaround if rcfg.CliArgs["once"] == "true" { - var x int = (1.8 * float64(rcfg.ConfigFile.Interval)) + var x float64 = (1.8 * float64(rcfg.ConfigFile.Interval)) time.Sleep(time.Duration(int(x)) * time.Second) shutdown(&rcfg) } From b4fde31626590589da79fff84a1666f07b6288b6 Mon Sep 17 00:00:00 2001 From: Holger Obermaier <40787752+ho-ob@users.noreply.github.com> Date: Tue, 25 Jan 2022 17:20:20 +0100 Subject: [PATCH 027/174] Add documentation --- collectors/metricCollector.go | 30 +++++++++++++++++++++--------- 1 file changed, 21 insertions(+), 9 deletions(-) diff --git a/collectors/metricCollector.go b/collectors/metricCollector.go index 6bc9047..3484dca 100644 --- a/collectors/metricCollector.go +++ b/collectors/metricCollector.go @@ -2,14 +2,15 @@ package collectors import ( "encoding/json" - "errors" - lp "github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric" - influx "github.com/influxdata/line-protocol" + "fmt" "io/ioutil" "log" "strconv" "strings" "time" + + lp "github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric" + 
influx "github.com/influxdata/line-protocol" ) type MetricCollector interface { @@ -21,12 +22,12 @@ type MetricCollector interface { } type metricCollector struct { - output chan lp.CCMetric - name string - init bool - meta map[string]string + name string + init bool + meta map[string]string } +// Name() returns the name of the metric collector func (c *metricCollector) Name() string { return c.name } @@ -35,10 +36,14 @@ func (c *metricCollector) setup() error { return nil } +// Initialized() indicates whether the metric collector has been initialized. func (c *metricCollector) Initialized() bool { - return c.init == true + return c.init } +// intArrayContains scans an array of ints if the value str is present in the array +// If the specified value is found, the corresponding array index is returned. +// The bool value is used to signal success or failure func intArrayContains(array []int, str int) (int, bool) { for i, a := range array { if a == str { @@ -48,6 +53,9 @@ func intArrayContains(array []int, str int) (int, bool) { return -1, false } +// stringArrayContains scans an array of strings if the value str is present in the array +// If the specified value is found, the corresponding array index is returned. 
+// The bool value is used to signal success or failure func stringArrayContains(array []string, str string) (int, bool) { for i, a := range array { if a == str { @@ -107,6 +115,7 @@ func CpuList() []int { return cpulist } +// Tags2Map stores a InfluxDB list of tags in a map of key value pairs func Tags2Map(metric influx.Metric) map[string]string { tags := make(map[string]string) for _, t := range metric.TagList() { @@ -115,6 +124,7 @@ func Tags2Map(metric influx.Metric) map[string]string { return tags } +// Fields2Map stores a InfluxDB list of fields in a map of key value pairs func Fields2Map(metric influx.Metric) map[string]interface{} { fields := make(map[string]interface{}) for _, f := range metric.FieldList() { @@ -123,11 +133,13 @@ func Fields2Map(metric influx.Metric) map[string]interface{} { return fields } +// RemoveFromStringList removes the string r from the array of strings s +// If r is not contained in the array an error is returned func RemoveFromStringList(s []string, r string) ([]string, error) { for i, item := range s { if r == item { return append(s[:i], s[i+1:]...), nil } } - return s, errors.New("No such string in list") + return s, fmt.Errorf("No such string in list") } From 2925ad9f402a56e1ba9d188131ae8795a03b407f Mon Sep 17 00:00:00 2001 From: Thomas Roehl Date: Tue, 25 Jan 2022 17:43:10 +0100 Subject: [PATCH 028/174] Use ccLogger anywhere --- collectors/collectorManager.go | 16 +++---- collectors/tempMetric.go | 4 +- internal/ccLogger/cclogger.go | 44 ++++++++++--------- internal/metricRouter/metricRouter.go | 25 +++++------ metric-collector.go | 11 +++++ receivers/natsReceiver.go | 9 ++-- receivers/receiveManager.go | 63 +++++---------------------- sinks/sinkManager.go | 25 ++++++----- 8 files changed, 85 insertions(+), 112 deletions(-) diff --git a/collectors/collectorManager.go b/collectors/collectorManager.go index 73b2891..88cfdf8 100644 --- a/collectors/collectorManager.go +++ b/collectors/collectorManager.go @@ -70,17 +70,17 @@ func 
(cm *collectorManager) Init(ticker mct.MultiChanTicker, duration time.Durat } for k, cfg := range cm.config { if _, found := AvailableCollectors[k]; !found { - cclog.ComponentPrint("CollectorManager", "SKIP unknown collector ", k) + cclog.ComponentError("CollectorManager", "SKIP unknown collector", k) continue } c := AvailableCollectors[k] err = c.Init(cfg) if err != nil { - cclog.ComponentPrint("CollectorManager", "Collector ", k, "initialization failed: ", err.Error()) + cclog.ComponentError("CollectorManager", "Collector", k, "initialization failed:", err.Error()) continue } - cclog.ComponentDebug("CollectorManager", "Collector ", k, "initialized") + cclog.ComponentDebug("CollectorManager", "ADD COLLECTOR", c.Name()) cm.collectors = append(cm.collectors, c) } return nil @@ -99,7 +99,7 @@ func (cm *collectorManager) Start() { c.Close() } cm.wg.Done() - cclog.ComponentPrint("CollectorManager", "DONE") + cclog.ComponentDebug("CollectorManager", "DONE") break CollectorManagerLoop case t := <-tick: for _, c := range cm.collectors { @@ -110,17 +110,17 @@ func (cm *collectorManager) Start() { c.Close() } cm.wg.Done() - cclog.ComponentPrint("CollectorManager", "DONE") + cclog.ComponentDebug("CollectorManager", "DONE") break CollectorManagerInputLoop default: - cclog.ComponentPrint("CollectorManager", c.Name(), " ", t) + cclog.ComponentDebug("CollectorManager", c.Name(), t) c.Read(cm.duration, cm.output) } } } } }() - cclog.ComponentPrint("CollectorManager", "STARTED") + cclog.ComponentDebug("CollectorManager", "STARTED") } func (cm *collectorManager) AddOutput(output chan lp.CCMetric) { @@ -129,7 +129,7 @@ func (cm *collectorManager) AddOutput(output chan lp.CCMetric) { func (cm *collectorManager) Close() { cm.done <- true - cclog.ComponentPrint("CollectorManager", "CLOSE") + cclog.ComponentDebug("CollectorManager", "CLOSE") } func New(ticker mct.MultiChanTicker, duration time.Duration, wg *sync.WaitGroup, collectConfigFile string) (CollectorManager, error) { diff --git 
a/collectors/tempMetric.go b/collectors/tempMetric.go index b73d582..caa726e 100644 --- a/collectors/tempMetric.go +++ b/collectors/tempMetric.go @@ -4,7 +4,7 @@ import ( "encoding/json" "fmt" "io/ioutil" - "log" + cclog "github.com/ClusterCockpit/cc-metric-collector/internal/ccLogger" "os" "path/filepath" "strconv" @@ -102,7 +102,7 @@ func (m *TempCollector) Read(interval time.Duration, output chan lp.CCMetric) { if err == nil { y, err := lp.New(strings.ToLower(mname), tags, m.meta, map[string]interface{}{"value": int(float64(x) / 1000)}, time.Now()) if err == nil { - log.Print("[", m.name, "] ", y) + cclog.ComponentDebug(m.name, y) output <- y } } diff --git a/internal/ccLogger/cclogger.go b/internal/ccLogger/cclogger.go index ad5b986..ee92376 100644 --- a/internal/ccLogger/cclogger.go +++ b/internal/ccLogger/cclogger.go @@ -21,68 +21,72 @@ var ( func initLogger() { if debugLog == nil { - debugLog = log.New(stderr, "DEBUG", log.LstdFlags) + debugLog = log.New(stderr, "DEBUG ", log.LstdFlags) } if infoLog == nil { - infoLog = log.New(stdout, "INFO", log.LstdFlags) + infoLog = log.New(stdout, "INFO ", log.LstdFlags) } if errorLog == nil { - errorLog = log.New(stderr, "ERROR", log.LstdFlags) + errorLog = log.New(stderr, "ERROR ", log.LstdFlags) } if warnLog == nil { - warnLog = log.New(stderr, "WARN", log.LstdFlags) + warnLog = log.New(stderr, "WARN ", log.LstdFlags) } if defaultLog == nil { defaultLog = log.New(stdout, "", log.LstdFlags) } } -func CCPrint(logger *log.Logger, e ... interface {}) { - if logger != nil { - logger.Print(e) - } -} - func Print(e ... interface{}) { - CCPrint(defaultLog, e) + initLogger() + defaultLog.Print(e) } func ComponentPrint(component string, e ... interface{}) { - CCPrint(defaultLog, fmt.Sprintf("[%s]", component), e) + initLogger() + defaultLog.Print(fmt.Sprintf("[%s] ", component), e) } func Info(e ... interface{}) { - CCPrint(infoLog, e) + initLogger() + infoLog.Print(e) } func ComponentInfo(component string, e ... 
interface{}) { - CCPrint(infoLog, fmt.Sprintf("[%s]", component), e) + initLogger() + infoLog.Print(fmt.Sprintf("[%s] ", component), e) } func Debug(e ... interface{}) { - if globalDebug { - CCPrint(debugLog, e) + initLogger() + if globalDebug == true { + debugLog.Print(e) } } func ComponentDebug(component string, e ... interface{}) { - if globalDebug { - CCPrint(debugLog, fmt.Sprintf("[%s]", component), e) + initLogger() + if globalDebug == true && debugLog != nil { + //CCComponentPrint(debugLog, component, e) + debugLog.Print(fmt.Sprintf("[%s] ", component), e) } } func Error(e ... interface{}) { + initLogger() _, fn, line, _ := runtime.Caller(1) - CCPrint(errorLog, fn, line, e) + errorLog.Print(fmt.Sprintf("[%s:%d] ", fn, line), e) } func ComponentError(component string, e ... interface{}) { + initLogger() _, fn, line, _ := runtime.Caller(1) - CCPrint(errorLog, fmt.Sprintf("[%s]", component), fn, line, e) + errorLog.Print(fmt.Sprintf("[%s|%s:%d] ", component, fn, line), e) } func SetDebug() { globalDebug = true + initLogger() } diff --git a/internal/metricRouter/metricRouter.go b/internal/metricRouter/metricRouter.go index 25b0dc2..5fd55ba 100644 --- a/internal/metricRouter/metricRouter.go +++ b/internal/metricRouter/metricRouter.go @@ -2,7 +2,7 @@ package metricRouter import ( "encoding/json" - "log" + cclog "github.com/ClusterCockpit/cc-metric-collector/internal/ccLogger" "os" "sync" "time" @@ -50,14 +50,14 @@ func (r *metricRouter) Init(ticker mct.MultiChanTicker, wg *sync.WaitGroup, rout r.ticker = ticker configFile, err := os.Open(routerConfigFile) if err != nil { - log.Print(err.Error()) + cclog.ComponentError("MetricRouter", err.Error()) return err } defer configFile.Close() jsonParser := json.NewDecoder(configFile) err = jsonParser.Decode(&r.config) if err != nil { - log.Print(err.Error()) + cclog.ComponentError("MetricRouter", err.Error()) return err } return nil @@ -79,7 +79,7 @@ func (r *metricRouter) StartTimer() { func (r *metricRouter) 
EvalCondition(Cond string, point lp.CCMetric) (bool, error) { expression, err := govaluate.NewEvaluableExpression(Cond) if err != nil { - log.Print(Cond, " = ", err.Error()) + cclog.ComponentDebug("MetricRouter", Cond, " = ", err.Error()) return false, err } params := make(map[string]interface{}) @@ -97,7 +97,7 @@ func (r *metricRouter) EvalCondition(Cond string, point lp.CCMetric) (bool, erro result, err := expression.Evaluate(params) if err != nil { - log.Print(Cond, " = ", err.Error()) + cclog.ComponentDebug("MetricRouter", Cond, " = ", err.Error()) return false, err } return bool(result.(bool)), err @@ -113,7 +113,7 @@ func (r *metricRouter) DoAddTags(point lp.CCMetric) { var err error conditionMatches, err = r.EvalCondition(m.Condition, point) if err != nil { - log.Print(err.Error()) + cclog.ComponentError("MetricRouter", err.Error()) conditionMatches = false } } @@ -133,7 +133,7 @@ func (r *metricRouter) DoDelTags(point lp.CCMetric) { var err error conditionMatches, err = r.EvalCondition(m.Condition, point) if err != nil { - log.Print(err.Error()) + cclog.ComponentError("MetricRouter", err.Error()) conditionMatches = false } } @@ -154,7 +154,7 @@ func (r *metricRouter) Start() { RouterLoop: select { case <-r.done: - log.Print("[MetricRouter] DONE\n") + cclog.ComponentDebug("MetricRouter", "DONE") r.wg.Done() break RouterLoop default: @@ -162,11 +162,11 @@ func (r *metricRouter) Start() { RouterInputLoop: select { case <-r.done: - log.Print("[MetricRouter] DONE\n") + cclog.ComponentDebug("MetricRouter", "DONE") r.wg.Done() break RouterInputLoop case p := <-c: - log.Print("[MetricRouter] FORWARD ", p) + cclog.ComponentDebug("MetricRouter", "FORWARD", p) r.DoAddTags(p) r.DoDelTags(p) if r.config.IntervalStamp { @@ -180,9 +180,8 @@ func (r *metricRouter) Start() { } } } - log.Print("[MetricRouter] EXIT\n") }() - log.Print("[MetricRouter] STARTED\n") + cclog.ComponentDebug("MetricRouter", "STARTED") } func (r *metricRouter) AddInput(input chan lp.CCMetric) { @@ 
-195,7 +194,7 @@ func (r *metricRouter) AddOutput(output chan lp.CCMetric) { func (r *metricRouter) Close() { r.done <- true - log.Print("[MetricRouter] CLOSE\n") + cclog.ComponentDebug("MetricRouter", "CLOSE") } func New(ticker mct.MultiChanTicker, wg *sync.WaitGroup, routerConfigFile string) (MetricRouter, error) { diff --git a/metric-collector.go b/metric-collector.go index 494bcbf..b3ad9d0 100644 --- a/metric-collector.go +++ b/metric-collector.go @@ -100,6 +100,7 @@ func ReadCli() map[string]string { logfile := flag.String("log", "stderr", "Path for logfile") pidfile := flag.String("pidfile", "/var/run/cc-metric-collector.pid", "Path for PID file") once := flag.Bool("once", false, "Run all collectors only once") + debug := flag.Bool("debug", false, "Activate debug output") flag.Parse() m = make(map[string]string) m["configfile"] = *cfg @@ -110,6 +111,12 @@ func ReadCli() map[string]string { } else { m["once"] = "false" } + if *debug { + m["debug"] = "true" + cclog.SetDebug() + } else { + m["debug"] = "false" + } return m } @@ -219,6 +226,10 @@ func mainFunc() int { // Drop domain part of host name rcfg.Hostname = strings.SplitN(rcfg.Hostname, `.`, 2)[0] // err = CreatePidfile(rcfg.CliArgs["pidfile"]) + + if rcfg.CliArgs["logfile"] != "stderr" { + cclog.SetOutput(rcfg.CliArgs["logfile"]) + } // err = SetLogging(rcfg.CliArgs["logfile"]) // if err != nil { // log.Print("Error setting up logging system to ", rcfg.CliArgs["logfile"], " on ", rcfg.Hostname) diff --git a/receivers/natsReceiver.go b/receivers/natsReceiver.go index 5cbe90d..853edf1 100644 --- a/receivers/natsReceiver.go +++ b/receivers/natsReceiver.go @@ -6,7 +6,7 @@ import ( lp "github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric" influx "github.com/influxdata/line-protocol" nats "github.com/nats-io/nats.go" - "log" + cclog "github.com/ClusterCockpit/cc-metric-collector/internal/ccLogger" "time" ) @@ -46,8 +46,8 @@ func (r *NatsReceiver) Init(config ReceiverConfig) error { if len(r.port) 
== 0 { r.port = "4222" } - log.Print("[NatsReceiver] INIT") uri := fmt.Sprintf("%s:%s", r.addr, r.port) + cclog.ComponentDebug("NatsReceiver", "INIT", uri) nc, err := nats.Connect(uri) if err == nil { r.database = r.config.Database @@ -63,7 +63,7 @@ func (r *NatsReceiver) Init(config ReceiverConfig) error { } func (r *NatsReceiver) Start() { - log.Print("[NatsReceiver] START") + cclog.ComponentDebug("NatsReceiver", "START") r.nc.Subscribe(r.database, r._NatsReceive) } @@ -75,7 +75,6 @@ func (r *NatsReceiver) _NatsReceive(m *nats.Msg) { for k, v := range r.meta { y.AddMeta(k, v) } - //y, err := lp.New(m.Name(), Tags2Map(m), r.meta, Fields2Map(m), m.Time()) if r.sink != nil { r.sink <- y } @@ -85,7 +84,7 @@ func (r *NatsReceiver) _NatsReceive(m *nats.Msg) { func (r *NatsReceiver) Close() { if r.nc != nil { - log.Print("[NatsReceiver] CLOSE") + cclog.ComponentDebug("NatsReceiver", "CLOSE") r.nc.Close() } } diff --git a/receivers/receiveManager.go b/receivers/receiveManager.go index 62f70b3..e6a2eee 100644 --- a/receivers/receiveManager.go +++ b/receivers/receiveManager.go @@ -3,7 +3,7 @@ package receivers import ( "encoding/json" lp "github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric" - "log" + cclog "github.com/ClusterCockpit/cc-metric-collector/internal/ccLogger" "os" "sync" ) @@ -36,7 +36,7 @@ func (rm *receiveManager) Init(wg *sync.WaitGroup, receiverConfigFile string) er rm.config = make([]ReceiverConfig, 0) configFile, err := os.Open(receiverConfigFile) if err != nil { - log.Print(err.Error()) + cclog.ComponentError("ReceiveManager", err.Error()) return err } defer configFile.Close() @@ -44,23 +44,11 @@ func (rm *receiveManager) Init(wg *sync.WaitGroup, receiverConfigFile string) er var rawConfigs []json.RawMessage err = jsonParser.Decode(&rawConfigs) if err != nil { - log.Print(err.Error()) + cclog.ComponentError("ReceiveManager", err.Error()) return err } for _, raw := range rawConfigs { - log.Print("[ReceiveManager] ", string(raw)) 
rm.AddInput(raw) - // if _, found := AvailableReceivers[k.Type]; !found { - // log.Print("[ReceiveManager] SKIP Config specifies unknown receiver 'type': ", k.Type) - // continue - // } - // r := AvailableReceivers[k.Type] - // err = r.Init(k) - // if err != nil { - // log.Print("[ReceiveManager] SKIP Receiver ", k.Type, " cannot be initialized: ", err.Error()) - // continue - // } - // rm.inputs = append(rm.inputs, r) } return nil } @@ -69,60 +57,32 @@ func (rm *receiveManager) Start() { rm.wg.Add(1) for _, r := range rm.inputs { - log.Print("[ReceiveManager] START ", r.Name()) + cclog.ComponentDebug("ReceiveManager", "START", r.Name()) r.Start() } - log.Print("[ReceiveManager] STARTED\n") - // go func() { - // for { - //ReceiveManagerLoop: - // select { - // case <- rm.done: - // log.Print("ReceiveManager done\n") - // rm.wg.Done() - // break ReceiveManagerLoop - // default: - // for _, c := range rm.inputs { - //ReceiveManagerInputLoop: - // select { - // case <- rm.done: - // log.Print("ReceiveManager done\n") - // rm.wg.Done() - // break ReceiveManagerInputLoop - // case p := <- c: - // log.Print("ReceiveManager: ", p) - // rm.output <- p - // default: - // } - // } - // } - // } - // }() - // for _, r := range rm.inputs { - // r.Close() - // } + cclog.ComponentDebug("ReceiveManager", "STARTED") } func (rm *receiveManager) AddInput(rawConfig json.RawMessage) error { var config ReceiverConfig err := json.Unmarshal(rawConfig, &config) if err != nil { - log.Print("[ReceiveManager] SKIP ", config.Type, " JSON config error: ", err.Error()) - log.Print(err.Error()) + cclog.ComponentError("ReceiveManager", "SKIP", config.Type, "JSON config error:", err.Error()) return err } if _, found := AvailableReceivers[config.Type]; !found { - log.Print("[ReceiveManager] SKIP ", config.Type, " unknown receiver: ", err.Error()) + cclog.ComponentError("ReceiveManager", "SKIP", config.Type, "unknown receiver:", err.Error()) return err } r := AvailableReceivers[config.Type] err = 
r.Init(config) if err != nil { - log.Print("[ReceiveManager] SKIP ", r.Name(), " initialization failed: ", err.Error()) + cclog.ComponentError("ReceiveManager", "SKIP", r.Name(), "initialization failed:", err.Error()) return err } rm.inputs = append(rm.inputs, r) rm.config = append(rm.config, config) + cclog.ComponentDebug("ReceiveManager", "ADD RECEIVER", r.Name()) return nil } @@ -135,12 +95,11 @@ func (rm *receiveManager) AddOutput(output chan lp.CCMetric) { func (rm *receiveManager) Close() { for _, r := range rm.inputs { - log.Print("[ReceiveManager] CLOSE ", r.Name()) + cclog.ComponentDebug("ReceiveManager", "CLOSE", r.Name()) r.Close() } rm.wg.Done() - log.Print("[ReceiveManager] CLOSE\n") - log.Print("[ReceiveManager] EXIT\n") + cclog.ComponentDebug("ReceiveManager", "CLOSE") } func New(wg *sync.WaitGroup, receiverConfigFile string) (ReceiveManager, error) { diff --git a/sinks/sinkManager.go b/sinks/sinkManager.go index beb0f32..b2d60dc 100644 --- a/sinks/sinkManager.go +++ b/sinks/sinkManager.go @@ -2,11 +2,12 @@ package sinks import ( "encoding/json" - "log" +// "log" "os" "sync" lp "github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric" + cclog "github.com/ClusterCockpit/cc-metric-collector/internal/ccLogger" ) var AvailableSinks = map[string]Sink{ @@ -41,7 +42,7 @@ func (sm *sinkManager) Init(wg *sync.WaitGroup, sinkConfigFile string) error { if len(sinkConfigFile) > 0 { configFile, err := os.Open(sinkConfigFile) if err != nil { - log.Print("[SinkManager] ", err.Error()) + cclog.ComponentError("SinkManager", err.Error()) return err } defer configFile.Close() @@ -49,7 +50,7 @@ func (sm *sinkManager) Init(wg *sync.WaitGroup, sinkConfigFile string) error { var rawConfigs []json.RawMessage err = jsonParser.Decode(&rawConfigs) if err != nil { - log.Print("[SinkManager] ", err.Error()) + cclog.ComponentError("SinkManager", err.Error()) return err } for _, raw := range rawConfigs { @@ -73,16 +74,16 @@ func (sm *sinkManager) Start() { for _, s := 
range sm.outputs { s.Close() } - log.Print("[SinkManager] DONE\n") + cclog.ComponentDebug("SinkManager", "DONE") sm.wg.Done() break SinkManagerLoop case p := <-sm.input: - log.Print("[SinkManager] WRITE ", p) + cclog.ComponentDebug("SinkManager", "WRITE", p) for _, s := range sm.outputs { s.Write(p) } if batchcount == 0 { - log.Print("[SinkManager] FLUSH") + cclog.ComponentDebug("SinkManager", "FLUSH") for _, s := range sm.outputs { s.Flush() } @@ -92,9 +93,8 @@ func (sm *sinkManager) Start() { default: } } - log.Print("[SinkManager] EXIT\n") }() - log.Print("[SinkManager] STARTED\n") + cclog.ComponentDebug("SinkManager", "STARTED") } func (sm *sinkManager) AddInput(input chan lp.CCMetric) { @@ -107,28 +107,29 @@ func (sm *sinkManager) AddOutput(rawConfig json.RawMessage) error { if len(rawConfig) > 3 { err = json.Unmarshal(rawConfig, &config) if err != nil { - log.Print("[SinkManager] SKIP ", config.Type, " JSON config error: ", err.Error()) + cclog.ComponentError("SinkManager", "SKIP", config.Type, "JSON config error:", err.Error()) return err } } if _, found := AvailableSinks[config.Type]; !found { - log.Print("[SinkManager] SKIP ", config.Type, " unknown sink: ", err.Error()) + cclog.ComponentError("SinkManager", "SKIP", config.Type, "unknown sink:", err.Error()) return err } s := AvailableSinks[config.Type] err = s.Init(config) if err != nil { - log.Print("[SinkManager] SKIP ", s.Name(), " initialization failed: ", err.Error()) + cclog.ComponentError("SinkManager", "SKIP", s.Name(), "initialization failed:", err.Error()) return err } sm.outputs = append(sm.outputs, s) sm.config = append(sm.config, config) + cclog.ComponentDebug("SinkManager", "ADD SINK", s.Name()) return nil } func (sm *sinkManager) Close() { sm.done <- true - log.Print("[SinkManager] CLOSE") + cclog.ComponentDebug("SinkManager", "CLOSE") } func New(wg *sync.WaitGroup, sinkConfigFile string) (SinkManager, error) { From 7f77cad0565e1242a54b923a439588e04fba340f Mon Sep 17 00:00:00 2001 From: 
Thomas Roehl Date: Tue, 25 Jan 2022 17:49:15 +0100 Subject: [PATCH 029/174] Don't wait too long in case of --once --- metric-collector.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/metric-collector.go b/metric-collector.go index b3ad9d0..25989ed 100644 --- a/metric-collector.go +++ b/metric-collector.go @@ -286,7 +286,7 @@ func mainFunc() int { // Wait until one tick has passed. This is a workaround if rcfg.CliArgs["once"] == "true" { - x := 1.8 * float64(rcfg.ConfigFile.Interval) + x := 1.2 * float64(rcfg.ConfigFile.Interval) time.Sleep(time.Duration(int(x)) * time.Second) shutdown(&rcfg) } From 9bd8a3a90b5a0a93a2ba1e542c6a2cfab74659e4 Mon Sep 17 00:00:00 2001 From: Holger Obermaier <40787752+ho-ob@users.noreply.github.com> Date: Wed, 26 Jan 2022 11:38:43 +0100 Subject: [PATCH 030/174] Add documentation --- internal/metricRouter/metricRouter.go | 46 ++++++++++++++++++--------- 1 file changed, 31 insertions(+), 15 deletions(-) diff --git a/internal/metricRouter/metricRouter.go b/internal/metricRouter/metricRouter.go index 5fd55ba..dc7703a 100644 --- a/internal/metricRouter/metricRouter.go +++ b/internal/metricRouter/metricRouter.go @@ -2,38 +2,42 @@ package metricRouter import ( "encoding/json" - cclog "github.com/ClusterCockpit/cc-metric-collector/internal/ccLogger" "os" "sync" "time" + cclog "github.com/ClusterCockpit/cc-metric-collector/internal/ccLogger" + lp "github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric" mct "github.com/ClusterCockpit/cc-metric-collector/internal/multiChanTicker" "gopkg.in/Knetic/govaluate.v2" ) +// Metric router tag configuration type metricRouterTagConfig struct { - Key string `json:"key"` - Value string `json:"value"` - Condition string `json:"if"` + Key string `json:"key"` // Tag name + Value string `json:"value"` // Tag value + Condition string `json:"if"` // Condition for adding or removing corresponding tag } +// Metric router configuration type metricRouterConfig struct { - AddTags 
[]metricRouterTagConfig `json:"add_tags"` - DelTags []metricRouterTagConfig `json:"delete_tags"` - IntervalStamp bool `json:"interval_timestamp"` + AddTags []metricRouterTagConfig `json:"add_tags"` // List of tags that are added when the condition is met + DelTags []metricRouterTagConfig `json:"delete_tags"` // List of tags that are removed when the condition is met + IntervalStamp bool `json:"interval_timestamp"` // Update timestamp periodically? } type metricRouter struct { - inputs []chan lp.CCMetric - outputs []chan lp.CCMetric - done chan bool + inputs []chan lp.CCMetric // List of all input channels + outputs []chan lp.CCMetric // List of all output channels + done chan bool // channel to finish stop metric router wg *sync.WaitGroup - timestamp time.Time + timestamp time.Time // timestamp ticker mct.MultiChanTicker config metricRouterConfig } +// MetricRouter access functions type MetricRouter interface { Init(ticker mct.MultiChanTicker, wg *sync.WaitGroup, routerConfigFile string) error AddInput(input chan lp.CCMetric) @@ -42,6 +46,12 @@ type MetricRouter interface { Close() } +// Init initializes a metric router by setting up: +// * input and output channels +// * done channel +// * wait group synchronization (from variable wg) +// * ticker (from variable ticker) +// * configuration (read from config file in variable routerConfigFile) func (r *metricRouter) Init(ticker mct.MultiChanTicker, wg *sync.WaitGroup, routerConfigFile string) error { r.inputs = make([]chan lp.CCMetric, 0) r.outputs = make([]chan lp.CCMetric, 0) @@ -63,25 +73,27 @@ func (r *metricRouter) Init(ticker mct.MultiChanTicker, wg *sync.WaitGroup, rout return nil } +// StartTimer starts a timer which updates timestamp periodically func (r *metricRouter) StartTimer() { m := make(chan time.Time) r.ticker.AddChannel(m) go func() { for { - select { - case t := <-m: - r.timestamp = t - } + t := <-m + r.timestamp = t } }() } +// EvalCondition evaluates condition Cond for metric data from point 
func (r *metricRouter) EvalCondition(Cond string, point lp.CCMetric) (bool, error) { expression, err := govaluate.NewEvaluableExpression(Cond) if err != nil { cclog.ComponentDebug("MetricRouter", Cond, " = ", err.Error()) return false, err } + + // Add metric name, tags, meta data, fields and timestamp to the parameter list params := make(map[string]interface{}) params["name"] = point.Name() for _, t := range point.TagList() { @@ -95,6 +107,7 @@ func (r *metricRouter) EvalCondition(Cond string, point lp.CCMetric) (bool, erro } params["timestamp"] = point.Time() + // evaluate condition result, err := expression.Evaluate(params) if err != nil { cclog.ComponentDebug("MetricRouter", Cond, " = ", err.Error()) @@ -103,6 +116,7 @@ func (r *metricRouter) EvalCondition(Cond string, point lp.CCMetric) (bool, erro return bool(result.(bool)), err } +// DoAddTags adds a tag when condition is fullfiled func (r *metricRouter) DoAddTags(point lp.CCMetric) { for _, m := range r.config.AddTags { var conditionMatches bool @@ -123,6 +137,7 @@ func (r *metricRouter) DoAddTags(point lp.CCMetric) { } } +// DoDelTags removes a tag when condition is fullfiled func (r *metricRouter) DoDelTags(point lp.CCMetric) { for _, m := range r.config.DelTags { var conditionMatches bool @@ -143,6 +158,7 @@ func (r *metricRouter) DoDelTags(point lp.CCMetric) { } } +// Start starts the metric router func (r *metricRouter) Start() { r.wg.Add(1) r.timestamp = time.Now() From 3d073080f83cd4f7716b1d8787fff24e9ee98562 Mon Sep 17 00:00:00 2001 From: Holger Obermaier <40787752+ho-ob@users.noreply.github.com> Date: Wed, 26 Jan 2022 12:08:40 +0100 Subject: [PATCH 031/174] Add documentation --- internal/metricRouter/metricRouter.go | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/internal/metricRouter/metricRouter.go b/internal/metricRouter/metricRouter.go index dc7703a..f76c31f 100644 --- a/internal/metricRouter/metricRouter.go +++ b/internal/metricRouter/metricRouter.go @@ -30,7 +30,7 @@ 
type metricRouterConfig struct { type metricRouter struct { inputs []chan lp.CCMetric // List of all input channels outputs []chan lp.CCMetric // List of all output channels - done chan bool // channel to finish stop metric router + done chan bool // channel to finish / stop metric router wg *sync.WaitGroup timestamp time.Time // timestamp ticker mct.MultiChanTicker @@ -200,19 +200,23 @@ func (r *metricRouter) Start() { cclog.ComponentDebug("MetricRouter", "STARTED") } +// AddInput adds a input channel to the metric router func (r *metricRouter) AddInput(input chan lp.CCMetric) { r.inputs = append(r.inputs, input) } +// AddOutput adds a output channel to the metric router func (r *metricRouter) AddOutput(output chan lp.CCMetric) { r.outputs = append(r.outputs, output) } +// Close finishes / stops the metric router func (r *metricRouter) Close() { r.done <- true cclog.ComponentDebug("MetricRouter", "CLOSE") } +// New creates a new initialized metric router func New(ticker mct.MultiChanTicker, wg *sync.WaitGroup, routerConfigFile string) (MetricRouter, error) { r := new(metricRouter) err := r.Init(ticker, wg, routerConfigFile) From c193b800830a3d7f1faa375616fcc2b4388d4c15 Mon Sep 17 00:00:00 2001 From: Holger Obermaier <40787752+ho-ob@users.noreply.github.com> Date: Wed, 26 Jan 2022 12:31:04 +0100 Subject: [PATCH 032/174] Add documentation --- collectors/collectorManager.go | 51 ++++++++++++++++++++++------------ 1 file changed, 34 insertions(+), 17 deletions(-) diff --git a/collectors/collectorManager.go b/collectors/collectorManager.go index 88cfdf8..192ef31 100644 --- a/collectors/collectorManager.go +++ b/collectors/collectorManager.go @@ -6,26 +6,27 @@ import ( "sync" "time" + cclog "github.com/ClusterCockpit/cc-metric-collector/internal/ccLogger" lp "github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric" mct "github.com/ClusterCockpit/cc-metric-collector/internal/multiChanTicker" - cclog 
"github.com/ClusterCockpit/cc-metric-collector/internal/ccLogger" ) +// Map of all available metric collectors var AvailableCollectors = map[string]MetricCollector{ - "likwid": &LikwidCollector{}, - "loadavg": &LoadavgCollector{}, - "memstat": &MemstatCollector{}, - "netstat": &NetstatCollector{}, - "ibstat": &InfinibandCollector{}, - "lustrestat": &LustreCollector{}, - "cpustat": &CpustatCollector{}, - "topprocs": &TopProcsCollector{}, - "nvidia": &NvidiaCollector{}, - "customcmd": &CustomCmdCollector{}, - "diskstat": &DiskstatCollector{}, - "tempstat": &TempCollector{}, - "ipmistat": &IpmiCollector{}, + "likwid": new(LikwidCollector), + "loadavg": new(LoadavgCollector), + "memstat": new(MemstatCollector), + "netstat": new(NetstatCollector), + "ibstat": new(InfinibandCollector), + "lustrestat": new(LustreCollector), + "cpustat": new(CpustatCollector), + "topprocs": new(TopProcsCollector), + "nvidia": new(NvidiaCollector), + "customcmd": new(CustomCmdCollector), + "diskstat": new(DiskstatCollector), + "tempstat": new(TempCollector), + "ipmistat": new(IpmiCollector), "gpfs": new(GpfsCollector), "cpufreq": new(CPUFreqCollector), "cpufreq_cpuinfo": new(CPUFreqCpuInfoCollector), @@ -34,14 +35,15 @@ var AvailableCollectors = map[string]MetricCollector{ type collectorManager struct { collectors []MetricCollector - output chan lp.CCMetric - done chan bool + output chan lp.CCMetric // List of all output channels + done chan bool // channel to finish / stop metric collector manager ticker mct.MultiChanTicker duration time.Duration wg *sync.WaitGroup config map[string]json.RawMessage } +// Metric collector access functions type CollectorManager interface { Init(ticker mct.MultiChanTicker, duration time.Duration, wg *sync.WaitGroup, collectConfigFile string) error AddOutput(output chan lp.CCMetric) @@ -49,6 +51,13 @@ type CollectorManager interface { Close() } +// Init initializes a new metric collector manager by setting up: +// * output channels +// * done channel +// * 
wait group synchronization (from variable wg) +// * ticker (from variable ticker) +// * configuration (read from config file in variable collectConfigFile) +// Initialization is done for all configured collectors func (cm *collectorManager) Init(ticker mct.MultiChanTicker, duration time.Duration, wg *sync.WaitGroup, collectConfigFile string) error { cm.collectors = make([]MetricCollector, 0) cm.output = nil @@ -56,6 +65,8 @@ func (cm *collectorManager) Init(ticker mct.MultiChanTicker, duration time.Durat cm.wg = wg cm.ticker = ticker cm.duration = duration + + // Read collector config file configFile, err := os.Open(collectConfigFile) if err != nil { cclog.Error(err.Error()) @@ -68,6 +79,8 @@ func (cm *collectorManager) Init(ticker mct.MultiChanTicker, duration time.Durat cclog.Error(err.Error()) return err } + + // Initialize configured collectors for k, cfg := range cm.config { if _, found := AvailableCollectors[k]; !found { cclog.ComponentError("CollectorManager", "SKIP unknown collector", k) @@ -86,6 +99,7 @@ func (cm *collectorManager) Init(ticker mct.MultiChanTicker, duration time.Durat return nil } +// Start starts the metric collector manager func (cm *collectorManager) Start() { cm.wg.Add(1) tick := make(chan time.Time) @@ -113,7 +127,7 @@ func (cm *collectorManager) Start() { cclog.ComponentDebug("CollectorManager", "DONE") break CollectorManagerInputLoop default: - cclog.ComponentDebug("CollectorManager", c.Name(), t) + cclog.ComponentDebug("CollectorManager", c.Name(), t) c.Read(cm.duration, cm.output) } } @@ -123,15 +137,18 @@ func (cm *collectorManager) Start() { cclog.ComponentDebug("CollectorManager", "STARTED") } +// AddOutput adds the output channel to the metric collector manager func (cm *collectorManager) AddOutput(output chan lp.CCMetric) { cm.output = output } +// Close finishes / stops the metric collector manager func (cm *collectorManager) Close() { cm.done <- true cclog.ComponentDebug("CollectorManager", "CLOSE") } +// New creates a new 
initialized metric collector manager func New(ticker mct.MultiChanTicker, duration time.Duration, wg *sync.WaitGroup, collectConfigFile string) (CollectorManager, error) { cm := &collectorManager{} err := cm.Init(ticker, duration, wg, collectConfigFile) From 09b753847982ceafc02c75cebd78163cb31ceb08 Mon Sep 17 00:00:00 2001 From: Holger Obermaier <40787752+ho-ob@users.noreply.github.com> Date: Wed, 26 Jan 2022 15:54:49 +0100 Subject: [PATCH 033/174] Avoid labels in collector manager loop --- collectors/collectorManager.go | 34 ++++++++++++++++++++-------------- collectors/memstatMetric.go | 3 ++- 2 files changed, 22 insertions(+), 15 deletions(-) diff --git a/collectors/collectorManager.go b/collectors/collectorManager.go index 192ef31..4aae041 100644 --- a/collectors/collectorManager.go +++ b/collectors/collectorManager.go @@ -104,29 +104,33 @@ func (cm *collectorManager) Start() { cm.wg.Add(1) tick := make(chan time.Time) cm.ticker.AddChannel(tick) + go func() { + // Collector manager is done + done := func() { + // close all metric collectors + for _, c := range cm.collectors { + c.Close() + } + cm.wg.Done() + cclog.ComponentDebug("CollectorManager", "DONE") + } + + // Wait for done signal or timer event for { - CollectorManagerLoop: select { case <-cm.done: - for _, c := range cm.collectors { - c.Close() - } - cm.wg.Done() - cclog.ComponentDebug("CollectorManager", "DONE") - break CollectorManagerLoop + done() + return case t := <-tick: for _, c := range cm.collectors { - CollectorManagerInputLoop: + // Wait for done signal or execute the collector select { case <-cm.done: - for _, c := range cm.collectors { - c.Close() - } - cm.wg.Done() - cclog.ComponentDebug("CollectorManager", "DONE") - break CollectorManagerInputLoop + done() + return default: + // Read metrics from collector c cclog.ComponentDebug("CollectorManager", c.Name(), t) c.Read(cm.duration, cm.output) } @@ -134,6 +138,8 @@ func (cm *collectorManager) Start() { } } }() + + // Collector manager is 
started cclog.ComponentDebug("CollectorManager", "STARTED") } diff --git a/collectors/memstatMetric.go b/collectors/memstatMetric.go index c83402c..b6ef855 100644 --- a/collectors/memstatMetric.go +++ b/collectors/memstatMetric.go @@ -9,6 +9,7 @@ import ( "strconv" "strings" "time" + lp "github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric" ) @@ -93,7 +94,7 @@ func (m *MemstatCollector) Read(interval time.Duration, output chan lp.CCMetric) for match, name := range m.matches { if _, exists := m.stats[match]; !exists { - err = errors.New(fmt.Sprintf("Parse error for %s : %s", match, name)) + err = fmt.Errorf("Parse error for %s : %s", match, name) log.Print(err) continue } From babd7a9af82dab8071204e3175188503780eec73 Mon Sep 17 00:00:00 2001 From: Thomas Roehl Date: Wed, 26 Jan 2022 16:52:56 +0100 Subject: [PATCH 034/174] Use non-blocking send at close --- collectors/collectorManager.go | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/collectors/collectorManager.go b/collectors/collectorManager.go index 4aae041..6140dbf 100644 --- a/collectors/collectorManager.go +++ b/collectors/collectorManager.go @@ -150,7 +150,10 @@ func (cm *collectorManager) AddOutput(output chan lp.CCMetric) { // Close finishes / stops the metric collector manager func (cm *collectorManager) Close() { - cm.done <- true + select { + case cm.done <- true: + default: + } cclog.ComponentDebug("CollectorManager", "CLOSE") } From 3fd77e6887cf368799f07a1cd6f35941a6e55d55 Mon Sep 17 00:00:00 2001 From: Thomas Roehl Date: Wed, 26 Jan 2022 16:54:51 +0100 Subject: [PATCH 035/174] Use non-blocking send at close, use common done function and remove default case --- sinks/sinkManager.go | 25 ++++++++++++++----------- 1 file changed, 14 insertions(+), 11 deletions(-) diff --git a/sinks/sinkManager.go b/sinks/sinkManager.go index b2d60dc..efcb5a0 100644 --- a/sinks/sinkManager.go +++ b/sinks/sinkManager.go @@ -2,12 +2,11 @@ package sinks import ( "encoding/json" -// "log" "os" 
"sync" - lp "github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric" cclog "github.com/ClusterCockpit/cc-metric-collector/internal/ccLogger" + lp "github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric" ) var AvailableSinks = map[string]Sink{ @@ -67,16 +66,18 @@ func (sm *sinkManager) Start() { sm.wg.Add(1) batchcount := 20 go func() { + done := func() { + for _, s := range sm.outputs { + s.Close() + } + cclog.ComponentDebug("SinkManager", "DONE") + sm.wg.Done() + } for { - SinkManagerLoop: select { case <-sm.done: - for _, s := range sm.outputs { - s.Close() - } - cclog.ComponentDebug("SinkManager", "DONE") - sm.wg.Done() - break SinkManagerLoop + done() + return case p := <-sm.input: cclog.ComponentDebug("SinkManager", "WRITE", p) for _, s := range sm.outputs { @@ -90,7 +91,6 @@ func (sm *sinkManager) Start() { batchcount = 20 } batchcount-- - default: } } }() @@ -128,7 +128,10 @@ func (sm *sinkManager) AddOutput(rawConfig json.RawMessage) error { } func (sm *sinkManager) Close() { - sm.done <- true + select { + case sm.done <- true: + default: + } cclog.ComponentDebug("SinkManager", "CLOSE") } From 5600cf1f5f27f5781c76032fe94db3acd30f7fa2 Mon Sep 17 00:00:00 2001 From: Thomas Roehl Date: Wed, 26 Jan 2022 17:08:53 +0100 Subject: [PATCH 036/174] Use two separate inputs for metric router to simplify management. 
Activate --logfile option and close MultiChanTicker explicitly --- internal/metricRouter/metricRouter.go | 99 +++++++++++++-------- internal/multiChanTicker/multiChanTicker.go | 23 ++++- metric-collector.go | 8 +- 3 files changed, 87 insertions(+), 43 deletions(-) diff --git a/internal/metricRouter/metricRouter.go b/internal/metricRouter/metricRouter.go index f76c31f..57ba708 100644 --- a/internal/metricRouter/metricRouter.go +++ b/internal/metricRouter/metricRouter.go @@ -28,19 +28,22 @@ type metricRouterConfig struct { } type metricRouter struct { - inputs []chan lp.CCMetric // List of all input channels - outputs []chan lp.CCMetric // List of all output channels - done chan bool // channel to finish / stop metric router - wg *sync.WaitGroup - timestamp time.Time // timestamp - ticker mct.MultiChanTicker - config metricRouterConfig + coll_input chan lp.CCMetric // Input channel from CollectorManager + recv_input chan lp.CCMetric // Input channel from ReceiveManager + outputs []chan lp.CCMetric // List of all output channels + done chan bool // channel to finish / stop metric router + wg *sync.WaitGroup + timestamp time.Time // timestamp + timerdone chan bool // channel to finish / stop timestamp updater + ticker mct.MultiChanTicker + config metricRouterConfig } // MetricRouter access functions type MetricRouter interface { Init(ticker mct.MultiChanTicker, wg *sync.WaitGroup, routerConfigFile string) error - AddInput(input chan lp.CCMetric) + AddCollectorInput(input chan lp.CCMetric) + AddReceiverInput(input chan lp.CCMetric) AddOutput(output chan lp.CCMetric) Start() Close() @@ -53,7 +56,6 @@ type MetricRouter interface { // * ticker (from variable ticker) // * configuration (read from config file in variable routerConfigFile) func (r *metricRouter) Init(ticker mct.MultiChanTicker, wg *sync.WaitGroup, routerConfigFile string) error { - r.inputs = make([]chan lp.CCMetric, 0) r.outputs = make([]chan lp.CCMetric, 0) r.done = make(chan bool) r.wg = wg @@ -77,12 
+79,19 @@ func (r *metricRouter) Init(ticker mct.MultiChanTicker, wg *sync.WaitGroup, rout func (r *metricRouter) StartTimer() { m := make(chan time.Time) r.ticker.AddChannel(m) + r.timerdone = make(chan bool) go func() { for { - t := <-m - r.timestamp = t + select { + case <-r.timerdone: + cclog.ComponentDebug("MetricRouter", "TIMER DONE") + return + case t := <-m: + r.timestamp = t + } } }() + cclog.ComponentDebug("MetricRouter", "TIMER START") } // EvalCondition evaluates condition Cond for metric data from point @@ -165,35 +174,35 @@ func (r *metricRouter) Start() { if r.config.IntervalStamp { r.StartTimer() } + done := func() { + r.wg.Done() + cclog.ComponentDebug("MetricRouter", "DONE") + } + forward := func(point lp.CCMetric) { + cclog.ComponentDebug("MetricRouter", "FORWARD", point) + r.DoAddTags(point) + r.DoDelTags(point) + for _, o := range r.outputs { + o <- point + } + } go func() { for { - RouterLoop: + // RouterLoop: select { case <-r.done: - cclog.ComponentDebug("MetricRouter", "DONE") - r.wg.Done() - break RouterLoop - default: - for _, c := range r.inputs { - RouterInputLoop: - select { - case <-r.done: - cclog.ComponentDebug("MetricRouter", "DONE") - r.wg.Done() - break RouterInputLoop - case p := <-c: - cclog.ComponentDebug("MetricRouter", "FORWARD", p) - r.DoAddTags(p) - r.DoDelTags(p) - if r.config.IntervalStamp { - p.SetTime(r.timestamp) - } - for _, o := range r.outputs { - o <- p - } - default: - } + done() + return + case p := <-r.coll_input: + if r.config.IntervalStamp { + p.SetTime(r.timestamp) } + forward(p) + case p := <-r.recv_input: + if r.config.IntervalStamp { + p.SetTime(r.timestamp) + } + forward(p) } } }() @@ -201,8 +210,12 @@ func (r *metricRouter) Start() { } // AddInput adds a input channel to the metric router -func (r *metricRouter) AddInput(input chan lp.CCMetric) { - r.inputs = append(r.inputs, input) +func (r *metricRouter) AddCollectorInput(input chan lp.CCMetric) { + r.coll_input = input +} + +func (r *metricRouter) 
AddReceiverInput(input chan lp.CCMetric) { + r.recv_input = input } // AddOutput adds a output channel to the metric router @@ -212,7 +225,17 @@ func (r *metricRouter) AddOutput(output chan lp.CCMetric) { // Close finishes / stops the metric router func (r *metricRouter) Close() { - r.done <- true + select { + case r.done <- true: + default: + } + if r.config.IntervalStamp { + cclog.ComponentDebug("MetricRouter", "TIMER CLOSE") + select { + case r.timerdone <- true: + default: + } + } cclog.ComponentDebug("MetricRouter", "CLOSE") } diff --git a/internal/multiChanTicker/multiChanTicker.go b/internal/multiChanTicker/multiChanTicker.go index f8139fa..37778ad 100644 --- a/internal/multiChanTicker/multiChanTicker.go +++ b/internal/multiChanTicker/multiChanTicker.go @@ -1,27 +1,43 @@ package multiChanTicker import ( + cclog "github.com/ClusterCockpit/cc-metric-collector/internal/ccLogger" "time" ) type multiChanTicker struct { ticker *time.Ticker channels []chan time.Time + done chan bool } type MultiChanTicker interface { Init(duration time.Duration) AddChannel(chan time.Time) + Close() } func (t *multiChanTicker) Init(duration time.Duration) { t.ticker = time.NewTicker(duration) + t.done = make(chan bool) go func() { + done := func() { + cclog.ComponentDebug("MultiChanTicker", "DONE") + } for { select { + case <-t.done: + done() + return case ts := <-t.ticker.C: + cclog.ComponentDebug("MultiChanTicker", "Tick", ts) for _, c := range t.channels { - c <- ts + select { + case <-t.done: + done() + return + case c <- ts: + } } } } @@ -32,6 +48,11 @@ func (t *multiChanTicker) AddChannel(channel chan time.Time) { t.channels = append(t.channels, channel) } +func (t *multiChanTicker) Close() { + t.done <- true + cclog.ComponentDebug("MultiChanTicker", "CLOSE") +} + func NewTicker(duration time.Duration) MultiChanTicker { t := &multiChanTicker{} t.Init(duration) diff --git a/metric-collector.go b/metric-collector.go index 25989ed..6a6c1b3 100644 --- a/metric-collector.go +++ 
b/metric-collector.go @@ -3,7 +3,6 @@ package main import ( "encoding/json" "flag" -// "log" "os" "os/signal" "strings" @@ -158,6 +157,7 @@ func ReadCli() map[string]string { // General shutdown function that gets executed in case of interrupt or graceful shutdown func shutdown(config *RuntimeConfig) { cclog.Info("Shutdown...") + config.Ticker.Close() if config.CollectManager != nil { cclog.Debug("Shutdown CollectManager...") config.CollectManager.Close() @@ -228,7 +228,7 @@ func mainFunc() int { // err = CreatePidfile(rcfg.CliArgs["pidfile"]) if rcfg.CliArgs["logfile"] != "stderr" { - cclog.SetOutput(rcfg.CliArgs["logfile"]) + cclog.SetOutput(rcfg.CliArgs["logfile"]) } // err = SetLogging(rcfg.CliArgs["logfile"]) // if err != nil { @@ -261,7 +261,7 @@ func mainFunc() int { } CollectToRouterChannel := make(chan lp.CCMetric) rcfg.CollectManager.AddOutput(CollectToRouterChannel) - rcfg.Router.AddInput(CollectToRouterChannel) + rcfg.Router.AddCollectorInput(CollectToRouterChannel) } if len(rcfg.ConfigFile.ReceiverConfigFile) > 0 { rcfg.ReceiveManager, err = receivers.New(&rcfg.Sync, rcfg.ConfigFile.ReceiverConfigFile) @@ -271,7 +271,7 @@ func mainFunc() int { } ReceiveToRouterChannel := make(chan lp.CCMetric) rcfg.ReceiveManager.AddOutput(ReceiveToRouterChannel) - rcfg.Router.AddInput(ReceiveToRouterChannel) + rcfg.Router.AddReceiverInput(ReceiveToRouterChannel) use_recv = true } prepare_shutdown(&rcfg) From 0a383a3789898b36b53d341769e32aa66a4e869b Mon Sep 17 00:00:00 2001 From: Thomas Roehl Date: Wed, 26 Jan 2022 17:09:20 +0100 Subject: [PATCH 037/174] Update CCLogger --- internal/ccLogger/cclogger.go | 160 +++++++++++++++++----------------- 1 file changed, 79 insertions(+), 81 deletions(-) diff --git a/internal/ccLogger/cclogger.go b/internal/ccLogger/cclogger.go index ee92376..38e7e6b 100644 --- a/internal/ccLogger/cclogger.go +++ b/internal/ccLogger/cclogger.go @@ -2,114 +2,112 @@ package cclogger import ( "fmt" - "runtime" - "os" "log" + "os" + "runtime" ) - var 
( - globalDebug = false - stdout = os.Stdout - stderr = os.Stderr - debugLog *log.Logger = nil - infoLog *log.Logger = nil - errorLog *log.Logger = nil - warnLog *log.Logger = nil - defaultLog *log.Logger = nil + globalDebug = false + stdout = os.Stdout + stderr = os.Stderr + debugLog *log.Logger = nil + infoLog *log.Logger = nil + errorLog *log.Logger = nil + warnLog *log.Logger = nil + defaultLog *log.Logger = nil ) func initLogger() { - if debugLog == nil { - debugLog = log.New(stderr, "DEBUG ", log.LstdFlags) - } - if infoLog == nil { - infoLog = log.New(stdout, "INFO ", log.LstdFlags) - } - if errorLog == nil { - errorLog = log.New(stderr, "ERROR ", log.LstdFlags) - } - if warnLog == nil { - warnLog = log.New(stderr, "WARN ", log.LstdFlags) - } - if defaultLog == nil { - defaultLog = log.New(stdout, "", log.LstdFlags) - } + if debugLog == nil { + debugLog = log.New(stderr, "DEBUG ", log.LstdFlags) + } + if infoLog == nil { + infoLog = log.New(stdout, "INFO ", log.LstdFlags) + } + if errorLog == nil { + errorLog = log.New(stderr, "ERROR ", log.LstdFlags) + } + if warnLog == nil { + warnLog = log.New(stderr, "WARN ", log.LstdFlags) + } + if defaultLog == nil { + defaultLog = log.New(stdout, "", log.LstdFlags) + } } -func Print(e ... interface{}) { - initLogger() - defaultLog.Print(e) +func Print(e ...interface{}) { + initLogger() + defaultLog.Print(e) } -func ComponentPrint(component string, e ... interface{}) { - initLogger() - defaultLog.Print(fmt.Sprintf("[%s] ", component), e) +func ComponentPrint(component string, e ...interface{}) { + initLogger() + defaultLog.Print(fmt.Sprintf("[%s] ", component), e) } -func Info(e ... interface{}) { - initLogger() - infoLog.Print(e) +func Info(e ...interface{}) { + initLogger() + infoLog.Print(e) } -func ComponentInfo(component string, e ... 
interface{}) { - initLogger() - infoLog.Print(fmt.Sprintf("[%s] ", component), e) +func ComponentInfo(component string, e ...interface{}) { + initLogger() + infoLog.Print(fmt.Sprintf("[%s] ", component), e) } -func Debug(e ... interface{}) { - initLogger() - if globalDebug == true { - debugLog.Print(e) - } +func Debug(e ...interface{}) { + initLogger() + if globalDebug == true { + debugLog.Print(e) + } } -func ComponentDebug(component string, e ... interface{}) { - initLogger() - if globalDebug == true && debugLog != nil { - //CCComponentPrint(debugLog, component, e) - debugLog.Print(fmt.Sprintf("[%s] ", component), e) - } +func ComponentDebug(component string, e ...interface{}) { + initLogger() + if globalDebug == true && debugLog != nil { + //CCComponentPrint(debugLog, component, e) + debugLog.Print(fmt.Sprintf("[%s] ", component), e) + } } -func Error(e ... interface{}) { - initLogger() - _, fn, line, _ := runtime.Caller(1) - errorLog.Print(fmt.Sprintf("[%s:%d] ", fn, line), e) +func Error(e ...interface{}) { + initLogger() + _, fn, line, _ := runtime.Caller(1) + errorLog.Print(fmt.Sprintf("[%s:%d] ", fn, line), e) } -func ComponentError(component string, e ... 
interface{}) { - initLogger() - _, fn, line, _ := runtime.Caller(1) - errorLog.Print(fmt.Sprintf("[%s|%s:%d] ", component, fn, line), e) +func ComponentError(component string, e ...interface{}) { + initLogger() + _, fn, line, _ := runtime.Caller(1) + errorLog.Print(fmt.Sprintf("[%s|%s:%d] ", component, fn, line), e) } func SetDebug() { - globalDebug = true - initLogger() + globalDebug = true + initLogger() } - func SetOutput(filename string) { - if filename == "stderr" { - if stderr != os.Stderr && stderr != os.Stdout { - stderr.Close() - } - stderr = os.Stderr - } else if filename == "stdout" { - if stderr != os.Stderr && stderr != os.Stdout { - stderr.Close() - } - stderr = os.Stdout - } else { - file, err := os.OpenFile(filename, os.O_APPEND|os.O_CREATE|os.O_WRONLY, 0600) - if err == nil { - defer file.Close() - stderr = file - } - } - debugLog = nil - errorLog = nil - warnLog = nil - initLogger() + if filename == "stderr" { + if stderr != os.Stderr && stderr != os.Stdout { + stderr.Close() + } + stderr = os.Stderr + } else if filename == "stdout" { + if stderr != os.Stderr && stderr != os.Stdout { + stderr.Close() + } + stderr = os.Stdout + } else { + file, err := os.OpenFile(filename, os.O_APPEND|os.O_CREATE|os.O_WRONLY, 0600) + if err == nil { + defer file.Close() + stderr = file + } + } + debugLog = nil + errorLog = nil + warnLog = nil + initLogger() } From 78834337b0343a0d31c19390be83267b6a516036 Mon Sep 17 00:00:00 2001 From: Thomas Roehl Date: Wed, 26 Jan 2022 18:37:59 +0100 Subject: [PATCH 038/174] Fix for documentation --- collectors/cpustatMetric.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/collectors/cpustatMetric.md b/collectors/cpustatMetric.md index 604445a..8122afe 100644 --- a/collectors/cpustatMetric.md +++ b/collectors/cpustatMetric.md @@ -1,7 +1,7 @@ ## `cpustat` collector ```json - "netstat": { + "cpustat": { "exclude_metrics": [ "cpu_idle" ] From 86e9b55bc979dae760f88e75afd7e9dee76c6a95 Mon Sep 17 00:00:00 2001 From: 
Thomas Roehl Date: Wed, 26 Jan 2022 18:41:25 +0100 Subject: [PATCH 039/174] Fix for documentation --- collectors/nvidiaMetric.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/collectors/nvidiaMetric.md b/collectors/nvidiaMetric.md index c774139..e2e08e5 100644 --- a/collectors/nvidiaMetric.md +++ b/collectors/nvidiaMetric.md @@ -2,7 +2,7 @@ ## `nvidia` collector ```json - "lustrestat": { + "nvidia": { "exclude_devices" : [ "0","1" ], From 76884c3380aaf1b1b5e83964c3d2e9224a0052e8 Mon Sep 17 00:00:00 2001 From: Thomas Roehl Date: Wed, 26 Jan 2022 18:45:23 +0100 Subject: [PATCH 040/174] Prefix Nvidia metrics with 'nv_' --- collectors/nvidiaMetric.go | 88 +++++++++++++++++++------------------- collectors/nvidiaMetric.md | 44 +++++++++---------- 2 files changed, 66 insertions(+), 66 deletions(-) diff --git a/collectors/nvidiaMetric.go b/collectors/nvidiaMetric.go index 6f5141a..1eff3be 100644 --- a/collectors/nvidiaMetric.go +++ b/collectors/nvidiaMetric.go @@ -73,13 +73,13 @@ func (m *NvidiaCollector) Read(interval time.Duration, output chan lp.CCMetric) util, ret := nvml.DeviceGetUtilizationRates(device) if ret == nvml.SUCCESS { - _, skip = stringArrayContains(m.config.ExcludeMetrics, "util") - y, err := lp.New("util", tags, m.meta, map[string]interface{}{"value": float64(util.Gpu)}, time.Now()) + _, skip = stringArrayContains(m.config.ExcludeMetrics, "nv_util") + y, err := lp.New("nv_util", tags, m.meta, map[string]interface{}{"value": float64(util.Gpu)}, time.Now()) if err == nil && !skip { output <- y } - _, skip = stringArrayContains(m.config.ExcludeMetrics, "mem_util") - y, err = lp.New("mem_util", tags, m.meta, map[string]interface{}{"value": float64(util.Memory)}, time.Now()) + _, skip = stringArrayContains(m.config.ExcludeMetrics, "nv_mem_util") + y, err = lp.New("nv_mem_util", tags, m.meta, map[string]interface{}{"value": float64(util.Memory)}, time.Now()) if err == nil && !skip { output <- y } @@ -88,15 +88,15 @@ func (m *NvidiaCollector) 
Read(interval time.Duration, output chan lp.CCMetric) meminfo, ret := nvml.DeviceGetMemoryInfo(device) if ret == nvml.SUCCESS { t := float64(meminfo.Total) / (1024 * 1024) - _, skip = stringArrayContains(m.config.ExcludeMetrics, "mem_total") - y, err := lp.New("mem_total", tags, m.meta, map[string]interface{}{"value": t}, time.Now()) + _, skip = stringArrayContains(m.config.ExcludeMetrics, "nv_mem_total") + y, err := lp.New("nv_mem_total", tags, m.meta, map[string]interface{}{"value": t}, time.Now()) if err == nil && !skip { y.AddMeta("unit", "MByte") output <- y } f := float64(meminfo.Used) / (1024 * 1024) - _, skip = stringArrayContains(m.config.ExcludeMetrics, "fb_memory") - y, err = lp.New("fb_memory", tags, m.meta, map[string]interface{}{"value": f}, time.Now()) + _, skip = stringArrayContains(m.config.ExcludeMetrics, "nv_fb_memory") + y, err = lp.New("nv_fb_memory", tags, m.meta, map[string]interface{}{"value": f}, time.Now()) if err == nil && !skip { y.AddMeta("unit", "MByte") output <- y @@ -105,8 +105,8 @@ func (m *NvidiaCollector) Read(interval time.Duration, output chan lp.CCMetric) temp, ret := nvml.DeviceGetTemperature(device, nvml.TEMPERATURE_GPU) if ret == nvml.SUCCESS { - _, skip = stringArrayContains(m.config.ExcludeMetrics, "temp") - y, err := lp.New("temp", tags, m.meta, map[string]interface{}{"value": float64(temp)}, time.Now()) + _, skip = stringArrayContains(m.config.ExcludeMetrics, "nv_temp") + y, err := lp.New("nv_temp", tags, m.meta, map[string]interface{}{"value": float64(temp)}, time.Now()) if err == nil && !skip { y.AddMeta("unit", "degC") output <- y @@ -115,8 +115,8 @@ func (m *NvidiaCollector) Read(interval time.Duration, output chan lp.CCMetric) fan, ret := nvml.DeviceGetFanSpeed(device) if ret == nvml.SUCCESS { - _, skip = stringArrayContains(m.config.ExcludeMetrics, "fan") - y, err := lp.New("fan", tags, m.meta, map[string]interface{}{"value": float64(fan)}, time.Now()) + _, skip = stringArrayContains(m.config.ExcludeMetrics, 
"nv_fan") + y, err := lp.New("nv_fan", tags, m.meta, map[string]interface{}{"value": float64(fan)}, time.Now()) if err == nil && !skip { output <- y } @@ -128,19 +128,19 @@ func (m *NvidiaCollector) Read(interval time.Duration, output chan lp.CCMetric) var err error switch ecc_pend { case nvml.FEATURE_DISABLED: - y, err = lp.New("ecc_mode", tags, m.meta, map[string]interface{}{"value": string("OFF")}, time.Now()) + y, err = lp.New("nv_ecc_mode", tags, m.meta, map[string]interface{}{"value": string("OFF")}, time.Now()) case nvml.FEATURE_ENABLED: - y, err = lp.New("ecc_mode", tags, m.meta, map[string]interface{}{"value": string("ON")}, time.Now()) + y, err = lp.New("nv_ecc_mode", tags, m.meta, map[string]interface{}{"value": string("ON")}, time.Now()) default: - y, err = lp.New("ecc_mode", tags, m.meta, map[string]interface{}{"value": string("UNKNOWN")}, time.Now()) + y, err = lp.New("nv_ecc_mode", tags, m.meta, map[string]interface{}{"value": string("UNKNOWN")}, time.Now()) } - _, skip = stringArrayContains(m.config.ExcludeMetrics, "ecc_mode") + _, skip = stringArrayContains(m.config.ExcludeMetrics, "nv_ecc_mode") if err == nil && !skip { output <- y } } else if ret == nvml.ERROR_NOT_SUPPORTED { - _, skip = stringArrayContains(m.config.ExcludeMetrics, "ecc_mode") - y, err := lp.New("ecc_mode", tags, m.meta, map[string]interface{}{"value": string("N/A")}, time.Now()) + _, skip = stringArrayContains(m.config.ExcludeMetrics, "nv_ecc_mode") + y, err := lp.New("nv_ecc_mode", tags, m.meta, map[string]interface{}{"value": string("N/A")}, time.Now()) if err == nil && !skip { output <- y } @@ -148,8 +148,8 @@ func (m *NvidiaCollector) Read(interval time.Duration, output chan lp.CCMetric) pstate, ret := nvml.DeviceGetPerformanceState(device) if ret == nvml.SUCCESS { - _, skip = stringArrayContains(m.config.ExcludeMetrics, "perf_state") - y, err := lp.New("perf_state", tags, m.meta, map[string]interface{}{"value": fmt.Sprintf("P%d", int(pstate))}, time.Now()) + _, skip = 
stringArrayContains(m.config.ExcludeMetrics, "nv_perf_state") + y, err := lp.New("nv_perf_state", tags, m.meta, map[string]interface{}{"value": fmt.Sprintf("P%d", int(pstate))}, time.Now()) if err == nil && !skip { output <- y } @@ -157,8 +157,8 @@ func (m *NvidiaCollector) Read(interval time.Duration, output chan lp.CCMetric) power, ret := nvml.DeviceGetPowerUsage(device) if ret == nvml.SUCCESS { - _, skip = stringArrayContains(m.config.ExcludeMetrics, "power_usage_report") - y, err := lp.New("power_usage_report", tags, m.meta, map[string]interface{}{"value": float64(power) / 1000}, time.Now()) + _, skip = stringArrayContains(m.config.ExcludeMetrics, "nv_power_usage_report") + y, err := lp.New("nv_power_usage_report", tags, m.meta, map[string]interface{}{"value": float64(power) / 1000}, time.Now()) if err == nil && !skip { output <- y } @@ -166,8 +166,8 @@ func (m *NvidiaCollector) Read(interval time.Duration, output chan lp.CCMetric) gclk, ret := nvml.DeviceGetClockInfo(device, nvml.CLOCK_GRAPHICS) if ret == nvml.SUCCESS { - _, skip = stringArrayContains(m.config.ExcludeMetrics, "graphics_clock_report") - y, err := lp.New("graphics_clock_report", tags, m.meta, map[string]interface{}{"value": float64(gclk)}, time.Now()) + _, skip = stringArrayContains(m.config.ExcludeMetrics, "nv_graphics_clock_report") + y, err := lp.New("nv_graphics_clock_report", tags, m.meta, map[string]interface{}{"value": float64(gclk)}, time.Now()) if err == nil && !skip { output <- y } @@ -175,8 +175,8 @@ func (m *NvidiaCollector) Read(interval time.Duration, output chan lp.CCMetric) smclk, ret := nvml.DeviceGetClockInfo(device, nvml.CLOCK_SM) if ret == nvml.SUCCESS { - _, skip = stringArrayContains(m.config.ExcludeMetrics, "sm_clock_report") - y, err := lp.New("sm_clock_report", tags, m.meta, map[string]interface{}{"value": float64(smclk)}, time.Now()) + _, skip = stringArrayContains(m.config.ExcludeMetrics, "nv_sm_clock_report") + y, err := lp.New("nv_sm_clock_report", tags, m.meta, 
map[string]interface{}{"value": float64(smclk)}, time.Now()) if err == nil && !skip { output <- y } @@ -184,8 +184,8 @@ func (m *NvidiaCollector) Read(interval time.Duration, output chan lp.CCMetric) memclk, ret := nvml.DeviceGetClockInfo(device, nvml.CLOCK_MEM) if ret == nvml.SUCCESS { - _, skip = stringArrayContains(m.config.ExcludeMetrics, "mem_clock_report") - y, err := lp.New("mem_clock_report", tags, m.meta, map[string]interface{}{"value": float64(memclk)}, time.Now()) + _, skip = stringArrayContains(m.config.ExcludeMetrics, "nv_mem_clock_report") + y, err := lp.New("nv_mem_clock_report", tags, m.meta, map[string]interface{}{"value": float64(memclk)}, time.Now()) if err == nil && !skip { output <- y } @@ -193,8 +193,8 @@ func (m *NvidiaCollector) Read(interval time.Duration, output chan lp.CCMetric) max_gclk, ret := nvml.DeviceGetMaxClockInfo(device, nvml.CLOCK_GRAPHICS) if ret == nvml.SUCCESS { - _, skip = stringArrayContains(m.config.ExcludeMetrics, "max_graphics_clock") - y, err := lp.New("max_graphics_clock", tags, m.meta, map[string]interface{}{"value": float64(max_gclk)}, time.Now()) + _, skip = stringArrayContains(m.config.ExcludeMetrics, "nv_max_graphics_clock") + y, err := lp.New("nv_max_graphics_clock", tags, m.meta, map[string]interface{}{"value": float64(max_gclk)}, time.Now()) if err == nil && !skip { output <- y } @@ -202,8 +202,8 @@ func (m *NvidiaCollector) Read(interval time.Duration, output chan lp.CCMetric) max_smclk, ret := nvml.DeviceGetClockInfo(device, nvml.CLOCK_SM) if ret == nvml.SUCCESS { - _, skip = stringArrayContains(m.config.ExcludeMetrics, "max_sm_clock") - y, err := lp.New("max_sm_clock", tags, m.meta, map[string]interface{}{"value": float64(max_smclk)}, time.Now()) + _, skip = stringArrayContains(m.config.ExcludeMetrics, "nv_max_sm_clock") + y, err := lp.New("nv_max_sm_clock", tags, m.meta, map[string]interface{}{"value": float64(max_smclk)}, time.Now()) if err == nil && !skip { output <- y } @@ -211,8 +211,8 @@ func (m 
*NvidiaCollector) Read(interval time.Duration, output chan lp.CCMetric) max_memclk, ret := nvml.DeviceGetClockInfo(device, nvml.CLOCK_MEM) if ret == nvml.SUCCESS { - _, skip = stringArrayContains(m.config.ExcludeMetrics, "max_mem_clock") - y, err := lp.New("max_mem_clock", tags, m.meta, map[string]interface{}{"value": float64(max_memclk)}, time.Now()) + _, skip = stringArrayContains(m.config.ExcludeMetrics, "nv_max_mem_clock") + y, err := lp.New("nv_max_mem_clock", tags, m.meta, map[string]interface{}{"value": float64(max_memclk)}, time.Now()) if err == nil && !skip { output <- y } @@ -220,8 +220,8 @@ func (m *NvidiaCollector) Read(interval time.Duration, output chan lp.CCMetric) ecc_db, ret := nvml.DeviceGetTotalEccErrors(device, 1, 1) if ret == nvml.SUCCESS { - _, skip = stringArrayContains(m.config.ExcludeMetrics, "ecc_db_error") - y, err := lp.New("ecc_db_error", tags, m.meta, map[string]interface{}{"value": float64(ecc_db)}, time.Now()) + _, skip = stringArrayContains(m.config.ExcludeMetrics, "nv_ecc_db_error") + y, err := lp.New("nv_ecc_db_error", tags, m.meta, map[string]interface{}{"value": float64(ecc_db)}, time.Now()) if err == nil && !skip { output <- y } @@ -229,8 +229,8 @@ func (m *NvidiaCollector) Read(interval time.Duration, output chan lp.CCMetric) ecc_sb, ret := nvml.DeviceGetTotalEccErrors(device, 0, 1) if ret == nvml.SUCCESS { - _, skip = stringArrayContains(m.config.ExcludeMetrics, "ecc_sb_error") - y, err := lp.New("ecc_sb_error", tags, m.meta, map[string]interface{}{"value": float64(ecc_sb)}, time.Now()) + _, skip = stringArrayContains(m.config.ExcludeMetrics, "nv_ecc_sb_error") + y, err := lp.New("nv_ecc_sb_error", tags, m.meta, map[string]interface{}{"value": float64(ecc_sb)}, time.Now()) if err == nil && !skip { output <- y } @@ -238,8 +238,8 @@ func (m *NvidiaCollector) Read(interval time.Duration, output chan lp.CCMetric) pwr_limit, ret := nvml.DeviceGetPowerManagementLimit(device) if ret == nvml.SUCCESS { - _, skip = 
stringArrayContains(m.config.ExcludeMetrics, "power_man_limit") - y, err := lp.New("power_man_limit", tags, m.meta, map[string]interface{}{"value": float64(pwr_limit)}, time.Now()) + _, skip = stringArrayContains(m.config.ExcludeMetrics, "nv_power_man_limit") + y, err := lp.New("nv_power_man_limit", tags, m.meta, map[string]interface{}{"value": float64(pwr_limit)}, time.Now()) if err == nil && !skip { output <- y } @@ -247,8 +247,8 @@ func (m *NvidiaCollector) Read(interval time.Duration, output chan lp.CCMetric) enc_util, _, ret := nvml.DeviceGetEncoderUtilization(device) if ret == nvml.SUCCESS { - _, skip = stringArrayContains(m.config.ExcludeMetrics, "encoder_util") - y, err := lp.New("encoder_util", tags, m.meta, map[string]interface{}{"value": float64(enc_util)}, time.Now()) + _, skip = stringArrayContains(m.config.ExcludeMetrics, "nv_encoder_util") + y, err := lp.New("nv_encoder_util", tags, m.meta, map[string]interface{}{"value": float64(enc_util)}, time.Now()) if err == nil && !skip { output <- y } @@ -256,8 +256,8 @@ func (m *NvidiaCollector) Read(interval time.Duration, output chan lp.CCMetric) dec_util, _, ret := nvml.DeviceGetDecoderUtilization(device) if ret == nvml.SUCCESS { - _, skip = stringArrayContains(m.config.ExcludeMetrics, "decoder_util") - y, err := lp.New("decoder_util", tags, m.meta, map[string]interface{}{"value": float64(dec_util)}, time.Now()) + _, skip = stringArrayContains(m.config.ExcludeMetrics, "nv_decoder_util") + y, err := lp.New("nv_decoder_util", tags, m.meta, map[string]interface{}{"value": float64(dec_util)}, time.Now()) if err == nil && !skip { output <- y } diff --git a/collectors/nvidiaMetric.md b/collectors/nvidiaMetric.md index e2e08e5..afe8b9e 100644 --- a/collectors/nvidiaMetric.md +++ b/collectors/nvidiaMetric.md @@ -7,33 +7,33 @@ "0","1" ], "exclude_metrics": [ - "fb_memory", - "fan" + "nv_fb_memory", + "nv_fan" ] } ``` Metrics: -* `util` -* `mem_util` -* `mem_total` -* `fb_memory` -* `temp` -* `fan` -* `ecc_mode` -* 
`perf_state` -* `power_usage_report` -* `graphics_clock_report` -* `sm_clock_report` -* `mem_clock_report` -* `max_graphics_clock` -* `max_sm_clock` -* `max_mem_clock` -* `ecc_db_error` -* `ecc_sb_error` -* `power_man_limit` -* `encoder_util` -* `decoder_util` +* `nv_util` +* `nv_mem_util` +* `nv_mem_total` +* `nv_fb_memory` +* `nv_temp` +* `nv_fan` +* `nv_ecc_mode` +* `nv_perf_state` +* `nv_power_usage_report` +* `nv_graphics_clock_report` +* `nv_sm_clock_report` +* `nv_mem_clock_report` +* `nv_max_graphics_clock` +* `nv_max_sm_clock` +* `nv_max_mem_clock` +* `nv_ecc_db_error` +* `nv_ecc_sb_error` +* `nv_power_man_limit` +* `nv_encoder_util` +* `nv_decoder_util` It uses a separate `type` in the metrics. The output metric looks like this: `,type=accelerator,type-id= value= ` From 7077452a5df2c3f42c4b714ca01afdcbcbd9b3b2 Mon Sep 17 00:00:00 2001 From: Holger Obermaier <40787752+ho-ob@users.noreply.github.com> Date: Wed, 26 Jan 2022 20:18:47 +0100 Subject: [PATCH 041/174] Split InfiniBand metric collector, one using /sys filesystem reads and one using perfquery. 
--- collectors/collectorManager.go | 35 ++-- collectors/infinibandMetric.go | 243 ++++++----------------- collectors/infinibandPerfQueryMetric.go | 250 ++++++++++++++++++++++++ 3 files changed, 332 insertions(+), 196 deletions(-) create mode 100644 collectors/infinibandPerfQueryMetric.go diff --git a/collectors/collectorManager.go b/collectors/collectorManager.go index 6140dbf..98b6115 100644 --- a/collectors/collectorManager.go +++ b/collectors/collectorManager.go @@ -14,23 +14,24 @@ import ( // Map of all available metric collectors var AvailableCollectors = map[string]MetricCollector{ - "likwid": new(LikwidCollector), - "loadavg": new(LoadavgCollector), - "memstat": new(MemstatCollector), - "netstat": new(NetstatCollector), - "ibstat": new(InfinibandCollector), - "lustrestat": new(LustreCollector), - "cpustat": new(CpustatCollector), - "topprocs": new(TopProcsCollector), - "nvidia": new(NvidiaCollector), - "customcmd": new(CustomCmdCollector), - "diskstat": new(DiskstatCollector), - "tempstat": new(TempCollector), - "ipmistat": new(IpmiCollector), - "gpfs": new(GpfsCollector), - "cpufreq": new(CPUFreqCollector), - "cpufreq_cpuinfo": new(CPUFreqCpuInfoCollector), - "nfsstat": new(NfsCollector), + "likwid": new(LikwidCollector), + "loadavg": new(LoadavgCollector), + "memstat": new(MemstatCollector), + "netstat": new(NetstatCollector), + "ibstat": new(InfinibandCollector), + "ibstat_perfquery": new(InfinibandPerfQueryCollector), + "lustrestat": new(LustreCollector), + "cpustat": new(CpustatCollector), + "topprocs": new(TopProcsCollector), + "nvidia": new(NvidiaCollector), + "customcmd": new(CustomCmdCollector), + "diskstat": new(DiskstatCollector), + "tempstat": new(TempCollector), + "ipmistat": new(IpmiCollector), + "gpfs": new(GpfsCollector), + "cpufreq": new(CPUFreqCollector), + "cpufreq_cpuinfo": new(CPUFreqCpuInfoCollector), + "nfsstat": new(NfsCollector), } type collectorManager struct { diff --git a/collectors/infinibandMetric.go 
b/collectors/infinibandMetric.go index af4e579..f506f37 100644 --- a/collectors/infinibandMetric.go +++ b/collectors/infinibandMetric.go @@ -3,10 +3,9 @@ package collectors import ( "fmt" "io/ioutil" - "log" - "os/exec" + lp "github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric" - // "os" + "encoding/json" "errors" "path/filepath" @@ -15,35 +14,25 @@ import ( "time" ) -const ( - IBBASEPATH = `/sys/class/infiniband/` - PERFQUERY = `/usr/sbin/perfquery` -) - -type InfinibandCollectorConfig struct { - ExcludeDevices []string `json:"exclude_devices,omitempty"` - PerfQueryPath string `json:"perfquery_path"` -} +const IB_BASEPATH = `/sys/class/infiniband/` type InfinibandCollector struct { metricCollector - tags map[string]string - lids map[string]map[string]string - config InfinibandCollectorConfig - use_perfquery bool + tags map[string]string + lids map[string]map[string]string + config struct { + ExcludeDevices []string `json:"exclude_devices,omitempty"` + } } func (m *InfinibandCollector) Help() { - fmt.Println("This collector includes all devices that can be found below ", IBBASEPATH) - fmt.Println("and where any of the ports provides a 'lid' file (glob ", IBBASEPATH, "//ports//lid).") + fmt.Println("This collector includes all devices that can be found below ", IB_BASEPATH) + fmt.Println("and where any of the ports provides a 'lid' file (glob ", IB_BASEPATH, "//ports//lid).") fmt.Println("The devices can be filtered with the 'exclude_devices' option in the configuration.") fmt.Println("For each found LIDs the collector calls the 'perfquery' command") - fmt.Println("The path to the 'perfquery' command can be configured with the 'perfquery_path' option") - fmt.Println("in the configuration") fmt.Println("") fmt.Println("Full configuration object:") fmt.Println("\"ibstat\" : {") - fmt.Println(" \"perfquery_path\" : \"path/to/perfquery\" # if omitted, it searches in $PATH") fmt.Println(" \"exclude_devices\" : [\"dev1\"]") fmt.Println("}") fmt.Println("") @@ 
-57,7 +46,6 @@ func (m *InfinibandCollector) Help() { func (m *InfinibandCollector) Init(config json.RawMessage) error { var err error m.name = "InfinibandCollector" - m.use_perfquery = false m.setup() m.meta = map[string]string{"source": m.name, "group": "Network"} m.tags = map[string]string{"type": "node"} @@ -67,19 +55,13 @@ func (m *InfinibandCollector) Init(config json.RawMessage) error { return err } } - if len(m.config.PerfQueryPath) == 0 { - path, err := exec.LookPath("perfquery") - if err == nil { - m.config.PerfQueryPath = path - } - } m.lids = make(map[string]map[string]string) - p := fmt.Sprintf("%s/*/ports/*/lid", string(IBBASEPATH)) + p := fmt.Sprintf("%s/*/ports/*/lid", string(IB_BASEPATH)) files, err := filepath.Glob(p) for _, f := range files { lid, err := ioutil.ReadFile(f) if err == nil { - plist := strings.Split(strings.Replace(f, string(IBBASEPATH), "", -1), "/") + plist := strings.Split(strings.Replace(f, string(IB_BASEPATH), "", -1), "/") skip := false for _, d := range m.config.ExcludeDevices { if d == plist[0] { @@ -93,152 +75,11 @@ func (m *InfinibandCollector) Init(config json.RawMessage) error { } } - for _, ports := range m.lids { - for port, lid := range ports { - args := fmt.Sprintf("-r %s %s 0xf000", lid, port) - command := exec.Command(m.config.PerfQueryPath, args) - command.Wait() - _, err := command.Output() - if err == nil { - m.use_perfquery = true - } - break - } - break + if len(m.lids) == 0 { + return errors.New("No usable IB devices") } - if len(m.lids) > 0 { - m.init = true - } else { - err = errors.New("No usable devices") - } - - return err -} - -func (m *InfinibandCollector) doPerfQuery(cmd string, dev string, lid string, port string, tags map[string]string, output chan lp.CCMetric) error { - - args := fmt.Sprintf("-r %s %s 0xf000", lid, port) - command := exec.Command(cmd, args) - command.Wait() - stdout, err := command.Output() - if err != nil { - log.Print(err) - return err - } - ll := strings.Split(string(stdout), 
"\n") - - for _, line := range ll { - if strings.HasPrefix(line, "PortRcvData") || strings.HasPrefix(line, "RcvData") { - lv := strings.Fields(line) - v, err := strconv.ParseFloat(lv[1], 64) - if err == nil { - y, err := lp.New("ib_recv", tags, m.meta, map[string]interface{}{"value": float64(v)}, time.Now()) - if err == nil { - output <- y - } - } - } - if strings.HasPrefix(line, "PortXmitData") || strings.HasPrefix(line, "XmtData") { - lv := strings.Fields(line) - v, err := strconv.ParseFloat(lv[1], 64) - if err == nil { - y, err := lp.New("ib_xmit", tags, m.meta, map[string]interface{}{"value": float64(v)}, time.Now()) - if err == nil { - output <- y - } - } - } - if strings.HasPrefix(line, "PortRcvPkts") || strings.HasPrefix(line, "RcvPkts") { - lv := strings.Fields(line) - v, err := strconv.ParseFloat(lv[1], 64) - if err == nil { - y, err := lp.New("ib_recv_pkts", tags, m.meta, map[string]interface{}{"value": float64(v)}, time.Now()) - if err == nil { - output <- y - } - } - } - if strings.HasPrefix(line, "PortXmitPkts") || strings.HasPrefix(line, "XmtPkts") { - lv := strings.Fields(line) - v, err := strconv.ParseFloat(lv[1], 64) - if err == nil { - y, err := lp.New("ib_xmit_pkts", tags, m.meta, map[string]interface{}{"value": float64(v)}, time.Now()) - if err == nil { - output <- y - } - } - } - if strings.HasPrefix(line, "PortRcvPkts") || strings.HasPrefix(line, "RcvPkts") { - lv := strings.Fields(line) - v, err := strconv.ParseFloat(lv[1], 64) - if err == nil { - y, err := lp.New("ib_recv_pkts", tags, m.meta, map[string]interface{}{"value": float64(v)}, time.Now()) - if err == nil { - output <- y - } - } - } - if strings.HasPrefix(line, "PortXmitPkts") || strings.HasPrefix(line, "XmtPkts") { - lv := strings.Fields(line) - v, err := strconv.ParseFloat(lv[1], 64) - if err == nil { - y, err := lp.New("ib_xmit_pkts", tags, m.meta, map[string]interface{}{"value": float64(v)}, time.Now()) - if err == nil { - output <- y - } - } - } - } - return nil -} - -func (m 
*InfinibandCollector) doSysfsRead(dev string, lid string, port string, tags map[string]string, output chan lp.CCMetric) error { - path := fmt.Sprintf("%s/%s/ports/%s/counters/", string(IBBASEPATH), dev, port) - buffer, err := ioutil.ReadFile(fmt.Sprintf("%s/port_rcv_data", path)) - if err == nil { - data := strings.Replace(string(buffer), "\n", "", -1) - v, err := strconv.ParseFloat(data, 64) - if err == nil { - y, err := lp.New("ib_recv", tags, m.meta, map[string]interface{}{"value": float64(v)}, time.Now()) - if err == nil { - output <- y - } - } - } - buffer, err = ioutil.ReadFile(fmt.Sprintf("%s/port_xmit_data", path)) - if err == nil { - data := strings.Replace(string(buffer), "\n", "", -1) - v, err := strconv.ParseFloat(data, 64) - if err == nil { - y, err := lp.New("ib_xmit", tags, m.meta, map[string]interface{}{"value": float64(v)}, time.Now()) - if err == nil { - output <- y - } - } - } - buffer, err = ioutil.ReadFile(fmt.Sprintf("%s/port_rcv_packets", path)) - if err == nil { - data := strings.Replace(string(buffer), "\n", "", -1) - v, err := strconv.ParseFloat(data, 64) - if err == nil { - y, err := lp.New("ib_recv_pkts", tags, m.meta, map[string]interface{}{"value": float64(v)}, time.Now()) - if err == nil { - output <- y - } - } - } - buffer, err = ioutil.ReadFile(fmt.Sprintf("%s/port_xmit_packets", path)) - if err == nil { - data := strings.Replace(string(buffer), "\n", "", -1) - v, err := strconv.ParseFloat(data, 64) - if err == nil { - y, err := lp.New("ib_xmit_pkts", tags, m.meta, map[string]interface{}{"value": float64(v)}, time.Now()) - if err == nil { - output <- y - } - } - } + m.init = true return nil } @@ -247,11 +88,55 @@ func (m *InfinibandCollector) Read(interval time.Duration, output chan lp.CCMetr if m.init { for dev, ports := range m.lids { for port, lid := range ports { - tags := map[string]string{"type": "node", "device": dev, "port": port} - if m.use_perfquery { - m.doPerfQuery(m.config.PerfQueryPath, dev, lid, port, tags, output) - 
} else { - m.doSysfsRead(dev, lid, port, tags, output) + tags := map[string]string{ + "type": "node", + "device": dev, + "port": port, + "lid": lid} + path := fmt.Sprintf("%s/%s/ports/%s/counters/", string(IB_BASEPATH), dev, port) + buffer, err := ioutil.ReadFile(fmt.Sprintf("%s/port_rcv_data", path)) + if err == nil { + data := strings.Replace(string(buffer), "\n", "", -1) + v, err := strconv.ParseFloat(data, 64) + if err == nil { + y, err := lp.New("ib_recv", tags, m.meta, map[string]interface{}{"value": float64(v)}, time.Now()) + if err == nil { + output <- y + } + } + } + buffer, err = ioutil.ReadFile(fmt.Sprintf("%s/port_xmit_data", path)) + if err == nil { + data := strings.Replace(string(buffer), "\n", "", -1) + v, err := strconv.ParseFloat(data, 64) + if err == nil { + y, err := lp.New("ib_xmit", tags, m.meta, map[string]interface{}{"value": float64(v)}, time.Now()) + if err == nil { + output <- y + } + } + } + buffer, err = ioutil.ReadFile(fmt.Sprintf("%s/port_rcv_packets", path)) + if err == nil { + data := strings.Replace(string(buffer), "\n", "", -1) + v, err := strconv.ParseFloat(data, 64) + if err == nil { + y, err := lp.New("ib_recv_pkts", tags, m.meta, map[string]interface{}{"value": float64(v)}, time.Now()) + if err == nil { + output <- y + } + } + } + buffer, err = ioutil.ReadFile(fmt.Sprintf("%s/port_xmit_packets", path)) + if err == nil { + data := strings.Replace(string(buffer), "\n", "", -1) + v, err := strconv.ParseFloat(data, 64) + if err == nil { + y, err := lp.New("ib_xmit_pkts", tags, m.meta, map[string]interface{}{"value": float64(v)}, time.Now()) + if err == nil { + output <- y + } + } } } } diff --git a/collectors/infinibandPerfQueryMetric.go b/collectors/infinibandPerfQueryMetric.go new file mode 100644 index 0000000..d8f7bf4 --- /dev/null +++ b/collectors/infinibandPerfQueryMetric.go @@ -0,0 +1,250 @@ +package collectors + +import ( + "fmt" + "io/ioutil" + "log" + "os/exec" + + lp 
"github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric" + + // "os" + "encoding/json" + "errors" + "path/filepath" + "strconv" + "strings" + "time" +) + +const PERFQUERY = `/usr/sbin/perfquery` + +type InfinibandPerfQueryCollector struct { + metricCollector + tags map[string]string + lids map[string]map[string]string + config struct { + ExcludeDevices []string `json:"exclude_devices,omitempty"` + PerfQueryPath string `json:"perfquery_path"` + } +} + +func (m *InfinibandPerfQueryCollector) Help() { + fmt.Println("This collector includes all devices that can be found below ", IB_BASEPATH) + fmt.Println("and where any of the ports provides a 'lid' file (glob ", IB_BASEPATH, "//ports//lid).") + fmt.Println("The devices can be filtered with the 'exclude_devices' option in the configuration.") + fmt.Println("For each found LIDs the collector calls the 'perfquery' command") + fmt.Println("The path to the 'perfquery' command can be configured with the 'perfquery_path' option") + fmt.Println("in the configuration") + fmt.Println("") + fmt.Println("Full configuration object:") + fmt.Println("\"ibstat\" : {") + fmt.Println(" \"perfquery_path\" : \"path/to/perfquery\" # if omitted, it searches in $PATH") + fmt.Println(" \"exclude_devices\" : [\"dev1\"]") + fmt.Println("}") + fmt.Println("") + fmt.Println("Metrics:") + fmt.Println("- ib_recv") + fmt.Println("- ib_xmit") + fmt.Println("- ib_recv_pkts") + fmt.Println("- ib_xmit_pkts") +} + +func (m *InfinibandPerfQueryCollector) Init(config json.RawMessage) error { + var err error + m.name = "InfinibandCollectorPerfQuery" + m.setup() + m.meta = map[string]string{"source": m.name, "group": "Network"} + m.tags = map[string]string{"type": "node"} + if len(config) > 0 { + err = json.Unmarshal(config, &m.config) + if err != nil { + return err + } + } + if len(m.config.PerfQueryPath) == 0 { + path, err := exec.LookPath("perfquery") + if err == nil { + m.config.PerfQueryPath = path + } + } + m.lids = 
make(map[string]map[string]string) + p := fmt.Sprintf("%s/*/ports/*/lid", string(IB_BASEPATH)) + files, err := filepath.Glob(p) + for _, f := range files { + lid, err := ioutil.ReadFile(f) + if err == nil { + plist := strings.Split(strings.Replace(f, string(IB_BASEPATH), "", -1), "/") + skip := false + for _, d := range m.config.ExcludeDevices { + if d == plist[0] { + skip = true + } + } + if !skip { + m.lids[plist[0]] = make(map[string]string) + m.lids[plist[0]][plist[2]] = string(lid) + } + } + } + + for _, ports := range m.lids { + for port, lid := range ports { + args := fmt.Sprintf("-r %s %s 0xf000", lid, port) + command := exec.Command(m.config.PerfQueryPath, args) + command.Wait() + _, err := command.Output() + if err != nil { + return fmt.Errorf("Failed to execute %s: %v", m.config.PerfQueryPath, err) + } + } + } + + if len(m.lids) == 0 { + return errors.New("No usable IB devices") + } + + m.init = true + return nil +} + +func (m *InfinibandPerfQueryCollector) doPerfQuery(cmd string, dev string, lid string, port string, tags map[string]string, output chan lp.CCMetric) error { + + args := fmt.Sprintf("-r %s %s 0xf000", lid, port) + command := exec.Command(cmd, args) + command.Wait() + stdout, err := command.Output() + if err != nil { + log.Print(err) + return err + } + ll := strings.Split(string(stdout), "\n") + + for _, line := range ll { + if strings.HasPrefix(line, "PortRcvData") || strings.HasPrefix(line, "RcvData") { + lv := strings.Fields(line) + v, err := strconv.ParseFloat(lv[1], 64) + if err == nil { + y, err := lp.New("ib_recv", tags, m.meta, map[string]interface{}{"value": float64(v)}, time.Now()) + if err == nil { + output <- y + } + } + } + if strings.HasPrefix(line, "PortXmitData") || strings.HasPrefix(line, "XmtData") { + lv := strings.Fields(line) + v, err := strconv.ParseFloat(lv[1], 64) + if err == nil { + y, err := lp.New("ib_xmit", tags, m.meta, map[string]interface{}{"value": float64(v)}, time.Now()) + if err == nil { + output <- y + } + 
} + } + if strings.HasPrefix(line, "PortRcvPkts") || strings.HasPrefix(line, "RcvPkts") { + lv := strings.Fields(line) + v, err := strconv.ParseFloat(lv[1], 64) + if err == nil { + y, err := lp.New("ib_recv_pkts", tags, m.meta, map[string]interface{}{"value": float64(v)}, time.Now()) + if err == nil { + output <- y + } + } + } + if strings.HasPrefix(line, "PortXmitPkts") || strings.HasPrefix(line, "XmtPkts") { + lv := strings.Fields(line) + v, err := strconv.ParseFloat(lv[1], 64) + if err == nil { + y, err := lp.New("ib_xmit_pkts", tags, m.meta, map[string]interface{}{"value": float64(v)}, time.Now()) + if err == nil { + output <- y + } + } + } + if strings.HasPrefix(line, "PortRcvPkts") || strings.HasPrefix(line, "RcvPkts") { + lv := strings.Fields(line) + v, err := strconv.ParseFloat(lv[1], 64) + if err == nil { + y, err := lp.New("ib_recv_pkts", tags, m.meta, map[string]interface{}{"value": float64(v)}, time.Now()) + if err == nil { + output <- y + } + } + } + if strings.HasPrefix(line, "PortXmitPkts") || strings.HasPrefix(line, "XmtPkts") { + lv := strings.Fields(line) + v, err := strconv.ParseFloat(lv[1], 64) + if err == nil { + y, err := lp.New("ib_xmit_pkts", tags, m.meta, map[string]interface{}{"value": float64(v)}, time.Now()) + if err == nil { + output <- y + } + } + } + } + return nil +} + +func (m *InfinibandPerfQueryCollector) Read(interval time.Duration, output chan lp.CCMetric) { + + if m.init { + for dev, ports := range m.lids { + for port, lid := range ports { + tags := map[string]string{ + "type": "node", + "device": dev, + "port": port, + "lid": lid} + path := fmt.Sprintf("%s/%s/ports/%s/counters/", string(IB_BASEPATH), dev, port) + buffer, err := ioutil.ReadFile(fmt.Sprintf("%s/port_rcv_data", path)) + if err == nil { + data := strings.Replace(string(buffer), "\n", "", -1) + v, err := strconv.ParseFloat(data, 64) + if err == nil { + y, err := lp.New("ib_recv", tags, m.meta, map[string]interface{}{"value": float64(v)}, time.Now()) + if err == nil 
{ + output <- y + } + } + } + buffer, err = ioutil.ReadFile(fmt.Sprintf("%s/port_xmit_data", path)) + if err == nil { + data := strings.Replace(string(buffer), "\n", "", -1) + v, err := strconv.ParseFloat(data, 64) + if err == nil { + y, err := lp.New("ib_xmit", tags, m.meta, map[string]interface{}{"value": float64(v)}, time.Now()) + if err == nil { + output <- y + } + } + } + buffer, err = ioutil.ReadFile(fmt.Sprintf("%s/port_rcv_packets", path)) + if err == nil { + data := strings.Replace(string(buffer), "\n", "", -1) + v, err := strconv.ParseFloat(data, 64) + if err == nil { + y, err := lp.New("ib_recv_pkts", tags, m.meta, map[string]interface{}{"value": float64(v)}, time.Now()) + if err == nil { + output <- y + } + } + } + buffer, err = ioutil.ReadFile(fmt.Sprintf("%s/port_xmit_packets", path)) + if err == nil { + data := strings.Replace(string(buffer), "\n", "", -1) + v, err := strconv.ParseFloat(data, 64) + if err == nil { + y, err := lp.New("ib_xmit_pkts", tags, m.meta, map[string]interface{}{"value": float64(v)}, time.Now()) + if err == nil { + output <- y + } + } + } + } + } + } +} + +func (m *InfinibandPerfQueryCollector) Close() { + m.init = false +} From e1d0aacd1efad81caf10f8a4727382ab2abab6b5 Mon Sep 17 00:00:00 2001 From: Holger Obermaier <40787752+ho-ob@users.noreply.github.com> Date: Thu, 27 Jan 2022 11:08:27 +0100 Subject: [PATCH 042/174] Moved as much work as possible to Init() --- collectors/infinibandMetric.go | 177 ++++++++++++++++++--------------- 1 file changed, 99 insertions(+), 78 deletions(-) diff --git a/collectors/infinibandMetric.go b/collectors/infinibandMetric.go index f506f37..6b4c882 100644 --- a/collectors/infinibandMetric.go +++ b/collectors/infinibandMetric.go @@ -2,12 +2,12 @@ package collectors import ( "fmt" - "io/ioutil" + "os" lp "github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric" + "golang.org/x/sys/unix" "encoding/json" - "errors" "path/filepath" "strconv" "strings" @@ -16,13 +16,20 @@ import ( const 
IB_BASEPATH = `/sys/class/infiniband/` +type InfinibandCollectorInfo struct { + LID string // IB local Identifier (LID) + device string // IB device + port string // IB device port + portCounterFiles map[string]string // mapping counter name -> file + tagSet map[string]string // corresponding tag list +} + type InfinibandCollector struct { metricCollector - tags map[string]string - lids map[string]map[string]string config struct { - ExcludeDevices []string `json:"exclude_devices,omitempty"` + ExcludeDevices []string `json:"exclude_devices,omitempty"` // IB device to exclude e.g. mlx5_0 } + info []InfinibandCollectorInfo } func (m *InfinibandCollector) Help() { @@ -43,99 +50,113 @@ func (m *InfinibandCollector) Help() { fmt.Println("- ib_xmit_pkts") } +// Init initializes the Infiniband collector by walking through files below IB_BASEPATH func (m *InfinibandCollector) Init(config json.RawMessage) error { var err error m.name = "InfinibandCollector" m.setup() - m.meta = map[string]string{"source": m.name, "group": "Network"} - m.tags = map[string]string{"type": "node"} + m.meta = map[string]string{ + "source": m.name, + "group": "Network", + } if len(config) > 0 { err = json.Unmarshal(config, &m.config) if err != nil { return err } } - m.lids = make(map[string]map[string]string) - p := fmt.Sprintf("%s/*/ports/*/lid", string(IB_BASEPATH)) - files, err := filepath.Glob(p) - for _, f := range files { - lid, err := ioutil.ReadFile(f) - if err == nil { - plist := strings.Split(strings.Replace(f, string(IB_BASEPATH), "", -1), "/") - skip := false - for _, d := range m.config.ExcludeDevices { - if d == plist[0] { - skip = true - } - } - if !skip { - m.lids[plist[0]] = make(map[string]string) - m.lids[plist[0]][plist[2]] = string(lid) - } - } + + // Loop for all InfiniBand directories + globPattern := filepath.Join(IB_BASEPATH, "*", "ports", "*") + ibDirs, err := filepath.Glob(globPattern) + if err != nil { + return fmt.Errorf("Unable to glob files with pattern %s: %v", 
globPattern, err) + } + if ibDirs == nil { + return fmt.Errorf("Unable to find any directories with pattern %s", globPattern) } - if len(m.lids) == 0 { - return errors.New("No usable IB devices") + for _, path := range ibDirs { + + // Skip, when no LID is assigned + LID, ok := readOneLine(path + "/lid") + if !ok || LID == "0x0" { + continue + } + + // Get device and port component + pathSplit := strings.Split(path, string(os.PathSeparator)) + device := pathSplit[4] + port := pathSplit[6] + + // Skip excluded devices + skip := false + for _, excludedDevice := range m.config.ExcludeDevices { + if excludedDevice == device { + skip = true + break + } + } + if skip { + continue + } + + // Check access to counter files + countersDir := filepath.Join(path, "counters") + portCounterFiles := map[string]string{ + "ib_recv": filepath.Join(countersDir, "port_rcv_data"), + "ib_xmit": filepath.Join(countersDir, "port_xmit_data"), + "ib_recv_pkts": filepath.Join(countersDir, "port_rcv_packets"), + "ib_xmit_pkts": filepath.Join(countersDir, "port_xmit_packets"), + } + for _, counterFile := range portCounterFiles { + err := unix.Access(counterFile, unix.R_OK) + if err != nil { + return fmt.Errorf("Unable to access %s: %v", counterFile, err) + } + } + + m.info = append(m.info, + InfinibandCollectorInfo{ + LID: LID, + device: device, + port: port, + portCounterFiles: portCounterFiles, + tagSet: map[string]string{ + "type": "node", + "device": device, + "port": port, + "lid": LID, + }, + }) + } + + if len(m.info) == 0 { + return fmt.Errorf("Found no IB devices") } m.init = true return nil } +// Read reads Infiniband counter files below IB_BASEPATH func (m *InfinibandCollector) Read(interval time.Duration, output chan lp.CCMetric) { - if m.init { - for dev, ports := range m.lids { - for port, lid := range ports { - tags := map[string]string{ - "type": "node", - "device": dev, - "port": port, - "lid": lid} - path := fmt.Sprintf("%s/%s/ports/%s/counters/", string(IB_BASEPATH), dev, port) 
- buffer, err := ioutil.ReadFile(fmt.Sprintf("%s/port_rcv_data", path)) - if err == nil { - data := strings.Replace(string(buffer), "\n", "", -1) - v, err := strconv.ParseFloat(data, 64) - if err == nil { - y, err := lp.New("ib_recv", tags, m.meta, map[string]interface{}{"value": float64(v)}, time.Now()) - if err == nil { - output <- y - } - } - } - buffer, err = ioutil.ReadFile(fmt.Sprintf("%s/port_xmit_data", path)) - if err == nil { - data := strings.Replace(string(buffer), "\n", "", -1) - v, err := strconv.ParseFloat(data, 64) - if err == nil { - y, err := lp.New("ib_xmit", tags, m.meta, map[string]interface{}{"value": float64(v)}, time.Now()) - if err == nil { - output <- y - } - } - } - buffer, err = ioutil.ReadFile(fmt.Sprintf("%s/port_rcv_packets", path)) - if err == nil { - data := strings.Replace(string(buffer), "\n", "", -1) - v, err := strconv.ParseFloat(data, 64) - if err == nil { - y, err := lp.New("ib_recv_pkts", tags, m.meta, map[string]interface{}{"value": float64(v)}, time.Now()) - if err == nil { - output <- y - } - } - } - buffer, err = ioutil.ReadFile(fmt.Sprintf("%s/port_xmit_packets", path)) - if err == nil { - data := strings.Replace(string(buffer), "\n", "", -1) - v, err := strconv.ParseFloat(data, 64) - if err == nil { - y, err := lp.New("ib_xmit_pkts", tags, m.meta, map[string]interface{}{"value": float64(v)}, time.Now()) - if err == nil { - output <- y - } + // Check if already initialized + if !m.init { + return + } + + now := time.Now() + for i := range m.info { + + // device info + info := &m.info[i] + for counterName, counterFile := range info.portCounterFiles { + if data, ok := readOneLine(counterFile); ok { + if v, err := strconv.ParseInt(data, 10, 64); err == nil { + if y, err := lp.New(counterName, info.tagSet, m.meta, map[string]interface{}{"value": v}, now); err == nil { + output <- y } } } From b9236dcc31d56d432038c10480a69171695a4ace Mon Sep 17 00:00:00 2001 From: Holger Obermaier <40787752+ho-ob@users.noreply.github.com> 
Date: Thu, 27 Jan 2022 17:43:00 +0100 Subject: [PATCH 043/174] Handle shutdown sequentially --- collectors/collectorManager.go | 5 +-- collectors/cpufreqMetric.go | 1 + internal/metricRouter/metricRouter.go | 12 ++---- internal/multiChanTicker/multiChanTicker.go | 5 ++- metric-collector.go | 43 +++++++++++---------- receivers/receiveManager.go | 5 ++- sinks/sinkManager.go | 8 ++-- 7 files changed, 37 insertions(+), 42 deletions(-) diff --git a/collectors/collectorManager.go b/collectors/collectorManager.go index 98b6115..0b2dfcc 100644 --- a/collectors/collectorManager.go +++ b/collectors/collectorManager.go @@ -151,11 +151,8 @@ func (cm *collectorManager) AddOutput(output chan lp.CCMetric) { // Close finishes / stops the metric collector manager func (cm *collectorManager) Close() { - select { - case cm.done <- true: - default: - } cclog.ComponentDebug("CollectorManager", "CLOSE") + cm.done <- true } // New creates a new initialized metric collector manager diff --git a/collectors/cpufreqMetric.go b/collectors/cpufreqMetric.go index 5febed9..f3309ff 100644 --- a/collectors/cpufreqMetric.go +++ b/collectors/cpufreqMetric.go @@ -10,6 +10,7 @@ import ( "strconv" "strings" "time" + lp "github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric" "golang.org/x/sys/unix" ) diff --git a/internal/metricRouter/metricRouter.go b/internal/metricRouter/metricRouter.go index 57ba708..6327d95 100644 --- a/internal/metricRouter/metricRouter.go +++ b/internal/metricRouter/metricRouter.go @@ -225,18 +225,12 @@ func (r *metricRouter) AddOutput(output chan lp.CCMetric) { // Close finishes / stops the metric router func (r *metricRouter) Close() { - select { - case r.done <- true: - default: - } + cclog.ComponentDebug("MetricRouter", "CLOSE") + r.done <- true if r.config.IntervalStamp { cclog.ComponentDebug("MetricRouter", "TIMER CLOSE") - select { - case r.timerdone <- true: - default: - } + r.timerdone <- true } - cclog.ComponentDebug("MetricRouter", "CLOSE") } // New creates a 
new initialized metric router diff --git a/internal/multiChanTicker/multiChanTicker.go b/internal/multiChanTicker/multiChanTicker.go index 37778ad..a9394ab 100644 --- a/internal/multiChanTicker/multiChanTicker.go +++ b/internal/multiChanTicker/multiChanTicker.go @@ -1,8 +1,9 @@ package multiChanTicker import ( - cclog "github.com/ClusterCockpit/cc-metric-collector/internal/ccLogger" "time" + + cclog "github.com/ClusterCockpit/cc-metric-collector/internal/ccLogger" ) type multiChanTicker struct { @@ -49,8 +50,8 @@ func (t *multiChanTicker) AddChannel(channel chan time.Time) { } func (t *multiChanTicker) Close() { - t.done <- true cclog.ComponentDebug("MultiChanTicker", "CLOSE") + t.done <- true } func NewTicker(duration time.Duration) MultiChanTicker { diff --git a/metric-collector.go b/metric-collector.go index 6a6c1b3..0cd368a 100644 --- a/metric-collector.go +++ b/metric-collector.go @@ -6,6 +6,7 @@ import ( "os" "os/signal" "strings" + "syscall" "github.com/ClusterCockpit/cc-metric-collector/collectors" "github.com/ClusterCockpit/cc-metric-collector/receivers" @@ -154,10 +155,19 @@ func ReadCli() map[string]string { // return nil //} -// General shutdown function that gets executed in case of interrupt or graceful shutdown -func shutdown(config *RuntimeConfig) { +// General shutdownHandler function that gets executed in case of interrupt or graceful shutdownHandler +func shutdownHandler(config *RuntimeConfig, shutdownSignal chan os.Signal) { + <-shutdownSignal + + // Remove shutdown handler + // every additional interrupt signal will stop without cleaning up + signal.Stop(shutdownSignal) + cclog.Info("Shutdown...") + + cclog.Debug("Shutdown Ticker...") config.Ticker.Close() + if config.CollectManager != nil { cclog.Debug("Shutdown CollectManager...") config.CollectManager.Close() @@ -182,18 +192,6 @@ func shutdown(config *RuntimeConfig) { config.Sync.Done() } -// Register an interrupt handler for Ctrl+C and similar. 
At signal, -// all collectors are closed -func prepare_shutdown(config *RuntimeConfig) { - sigs := make(chan os.Signal, 1) - signal.Notify(sigs, os.Interrupt) - - go func(config *RuntimeConfig) { - <-sigs - shutdown(config) - }(config) -} - func mainFunc() int { var err error use_recv := false @@ -249,7 +247,7 @@ func mainFunc() int { cclog.Error(err.Error()) return 1 } - RouterToSinksChannel := make(chan lp.CCMetric) + RouterToSinksChannel := make(chan lp.CCMetric, 200) rcfg.SinkManager.AddInput(RouterToSinksChannel) rcfg.Router.AddOutput(RouterToSinksChannel) } @@ -259,7 +257,7 @@ func mainFunc() int { cclog.Error(err.Error()) return 1 } - CollectToRouterChannel := make(chan lp.CCMetric) + CollectToRouterChannel := make(chan lp.CCMetric, 200) rcfg.CollectManager.AddOutput(CollectToRouterChannel) rcfg.Router.AddCollectorInput(CollectToRouterChannel) } @@ -269,12 +267,17 @@ func mainFunc() int { cclog.Error(err.Error()) return 1 } - ReceiveToRouterChannel := make(chan lp.CCMetric) + ReceiveToRouterChannel := make(chan lp.CCMetric, 200) rcfg.ReceiveManager.AddOutput(ReceiveToRouterChannel) rcfg.Router.AddReceiverInput(ReceiveToRouterChannel) use_recv = true } - prepare_shutdown(&rcfg) + + shutdownSignal := make(chan os.Signal, 1) + signal.Notify(shutdownSignal, os.Interrupt) + signal.Notify(shutdownSignal, syscall.SIGTERM) + go shutdownHandler(&rcfg, shutdownSignal) + rcfg.Sync.Add(1) rcfg.Router.Start() rcfg.SinkManager.Start() @@ -288,10 +291,10 @@ func mainFunc() int { if rcfg.CliArgs["once"] == "true" { x := 1.2 * float64(rcfg.ConfigFile.Interval) time.Sleep(time.Duration(int(x)) * time.Second) - shutdown(&rcfg) + shutdownSignal <- os.Interrupt } - // Wait until receiving an interrupt + // Wait until shutdownHandler is executed rcfg.Sync.Wait() return 0 } diff --git a/receivers/receiveManager.go b/receivers/receiveManager.go index e6a2eee..c570aa4 100644 --- a/receivers/receiveManager.go +++ b/receivers/receiveManager.go @@ -2,10 +2,11 @@ package receivers 
import ( "encoding/json" - lp "github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric" - cclog "github.com/ClusterCockpit/cc-metric-collector/internal/ccLogger" "os" "sync" + + cclog "github.com/ClusterCockpit/cc-metric-collector/internal/ccLogger" + lp "github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric" ) var AvailableReceivers = map[string]Receiver{ diff --git a/sinks/sinkManager.go b/sinks/sinkManager.go index efcb5a0..4be8313 100644 --- a/sinks/sinkManager.go +++ b/sinks/sinkManager.go @@ -68,10 +68,11 @@ func (sm *sinkManager) Start() { go func() { done := func() { for _, s := range sm.outputs { + s.Flush() s.Close() } - cclog.ComponentDebug("SinkManager", "DONE") sm.wg.Done() + cclog.ComponentDebug("SinkManager", "DONE") } for { select { @@ -128,11 +129,8 @@ func (sm *sinkManager) AddOutput(rawConfig json.RawMessage) error { } func (sm *sinkManager) Close() { - select { - case sm.done <- true: - default: - } cclog.ComponentDebug("SinkManager", "CLOSE") + sm.done <- true } func New(wg *sync.WaitGroup, sinkConfigFile string) (SinkManager, error) { From aea3e2c6b183a9bad5f428c30f178b7fea1fd5bd Mon Sep 17 00:00:00 2001 From: Holger Obermaier <40787752+ho-ob@users.noreply.github.com> Date: Thu, 27 Jan 2022 20:45:22 +0100 Subject: [PATCH 044/174] Place wait group Add() and Done() near to each other --- collectors/collectorManager.go | 4 +- internal/metricRouter/metricRouter.go | 8 ++- metric-collector.go | 86 +++++++++++++++------------ 3 files changed, 57 insertions(+), 41 deletions(-) diff --git a/collectors/collectorManager.go b/collectors/collectorManager.go index 0b2dfcc..7b0a9b7 100644 --- a/collectors/collectorManager.go +++ b/collectors/collectorManager.go @@ -102,18 +102,18 @@ func (cm *collectorManager) Init(ticker mct.MultiChanTicker, duration time.Durat // Start starts the metric collector manager func (cm *collectorManager) Start() { - cm.wg.Add(1) tick := make(chan time.Time) cm.ticker.AddChannel(tick) + cm.wg.Add(1) go func() { 
+ defer cm.wg.Done() // Collector manager is done done := func() { // close all metric collectors for _, c := range cm.collectors { c.Close() } - cm.wg.Done() cclog.ComponentDebug("CollectorManager", "DONE") } diff --git a/internal/metricRouter/metricRouter.go b/internal/metricRouter/metricRouter.go index 6327d95..a321aae 100644 --- a/internal/metricRouter/metricRouter.go +++ b/internal/metricRouter/metricRouter.go @@ -80,7 +80,10 @@ func (r *metricRouter) StartTimer() { m := make(chan time.Time) r.ticker.AddChannel(m) r.timerdone = make(chan bool) + + r.wg.Add(1) go func() { + defer r.wg.Done() for { select { case <-r.timerdone: @@ -169,13 +172,11 @@ func (r *metricRouter) DoDelTags(point lp.CCMetric) { // Start starts the metric router func (r *metricRouter) Start() { - r.wg.Add(1) r.timestamp = time.Now() if r.config.IntervalStamp { r.StartTimer() } done := func() { - r.wg.Done() cclog.ComponentDebug("MetricRouter", "DONE") } forward := func(point lp.CCMetric) { @@ -186,7 +187,10 @@ func (r *metricRouter) Start() { o <- point } } + + r.wg.Add(1) go func() { + defer r.wg.Done() for { // RouterLoop: select { diff --git a/metric-collector.go b/metric-collector.go index 0cd368a..3975b62 100644 --- a/metric-collector.go +++ b/metric-collector.go @@ -51,25 +51,16 @@ type RuntimeConfig struct { CliArgs map[string]string ConfigFile CentralConfigFile - Router mr.MetricRouter - CollectManager collectors.CollectorManager - SinkManager sinks.SinkManager - ReceiveManager receivers.ReceiveManager - Ticker mct.MultiChanTicker + MetricRouter mr.MetricRouter + CollectManager collectors.CollectorManager + SinkManager sinks.SinkManager + ReceiveManager receivers.ReceiveManager + MultiChanTicker mct.MultiChanTicker Channels []chan lp.CCMetric Sync sync.WaitGroup } -func prepare_runcfg() RuntimeConfig { - return RuntimeConfig{ - Router: nil, - CollectManager: nil, - SinkManager: nil, - ReceiveManager: nil, - } -} - //// Structure of the configuration file //type GlobalConfig struct 
{ // Sink sinks.SinkConfig `json:"sink"` @@ -157,8 +148,9 @@ func ReadCli() map[string]string { // General shutdownHandler function that gets executed in case of interrupt or graceful shutdownHandler func shutdownHandler(config *RuntimeConfig, shutdownSignal chan os.Signal) { - <-shutdownSignal + defer config.Sync.Done() + <-shutdownSignal // Remove shutdown handler // every additional interrupt signal will stop without cleaning up signal.Stop(shutdownSignal) @@ -166,7 +158,7 @@ func shutdownHandler(config *RuntimeConfig, shutdownSignal chan os.Signal) { cclog.Info("Shutdown...") cclog.Debug("Shutdown Ticker...") - config.Ticker.Close() + config.MultiChanTicker.Close() if config.CollectManager != nil { cclog.Debug("Shutdown CollectManager...") @@ -176,9 +168,9 @@ func shutdownHandler(config *RuntimeConfig, shutdownSignal chan os.Signal) { cclog.Debug("Shutdown ReceiveManager...") config.ReceiveManager.Close() } - if config.Router != nil { + if config.MetricRouter != nil { cclog.Debug("Shutdown Router...") - config.Router.Close() + config.MetricRouter.Close() } if config.SinkManager != nil { cclog.Debug("Shutdown SinkManager...") @@ -189,15 +181,20 @@ func shutdownHandler(config *RuntimeConfig, shutdownSignal chan os.Signal) { // RemovePidfile(pidfile) // pidfile = config.CliArgs["pidfile"] // RemovePidfile(pidfile) - config.Sync.Done() } func mainFunc() int { var err error use_recv := false - rcfg := prepare_runcfg() - rcfg.CliArgs = ReadCli() + // Initialize runtime configuration + rcfg := RuntimeConfig{ + MetricRouter: nil, + CollectManager: nil, + SinkManager: nil, + ReceiveManager: nil, + CliArgs: ReadCli(), + } // Load and check configuration err = LoadCentralConfiguration(rcfg.CliArgs["configfile"], &rcfg.ConfigFile) @@ -225,61 +222,75 @@ func mainFunc() int { rcfg.Hostname = strings.SplitN(rcfg.Hostname, `.`, 2)[0] // err = CreatePidfile(rcfg.CliArgs["pidfile"]) - if rcfg.CliArgs["logfile"] != "stderr" { - cclog.SetOutput(rcfg.CliArgs["logfile"]) + // Set 
log file + if logfile := rcfg.CliArgs["logfile"]; logfile != "stderr" { + cclog.SetOutput(logfile) } - // err = SetLogging(rcfg.CliArgs["logfile"]) - // if err != nil { - // log.Print("Error setting up logging system to ", rcfg.CliArgs["logfile"], " on ", rcfg.Hostname) - // return - // } - rcfg.Ticker = mct.NewTicker(rcfg.Interval) + + // Creat new multi channel ticker + rcfg.MultiChanTicker = mct.NewTicker(rcfg.Interval) + + // Create new metric router if len(rcfg.ConfigFile.RouterConfigFile) > 0 { - rcfg.Router, err = mr.New(rcfg.Ticker, &rcfg.Sync, rcfg.ConfigFile.RouterConfigFile) + rcfg.MetricRouter, err = mr.New(rcfg.MultiChanTicker, &rcfg.Sync, rcfg.ConfigFile.RouterConfigFile) if err != nil { cclog.Error(err.Error()) return 1 } } + + // Create new sink if len(rcfg.ConfigFile.SinkConfigFile) > 0 { rcfg.SinkManager, err = sinks.New(&rcfg.Sync, rcfg.ConfigFile.SinkConfigFile) if err != nil { cclog.Error(err.Error()) return 1 } + + // Connect metric router to sink manager RouterToSinksChannel := make(chan lp.CCMetric, 200) rcfg.SinkManager.AddInput(RouterToSinksChannel) - rcfg.Router.AddOutput(RouterToSinksChannel) + rcfg.MetricRouter.AddOutput(RouterToSinksChannel) } + + // Create new collector manager if len(rcfg.ConfigFile.CollectorConfigFile) > 0 { - rcfg.CollectManager, err = collectors.New(rcfg.Ticker, rcfg.Duration, &rcfg.Sync, rcfg.ConfigFile.CollectorConfigFile) + rcfg.CollectManager, err = collectors.New(rcfg.MultiChanTicker, rcfg.Duration, &rcfg.Sync, rcfg.ConfigFile.CollectorConfigFile) if err != nil { cclog.Error(err.Error()) return 1 } + + // Connect collector manager to metric router CollectToRouterChannel := make(chan lp.CCMetric, 200) rcfg.CollectManager.AddOutput(CollectToRouterChannel) - rcfg.Router.AddCollectorInput(CollectToRouterChannel) + rcfg.MetricRouter.AddCollectorInput(CollectToRouterChannel) } + + // Create new receive manager if len(rcfg.ConfigFile.ReceiverConfigFile) > 0 { rcfg.ReceiveManager, err = receivers.New(&rcfg.Sync, 
rcfg.ConfigFile.ReceiverConfigFile) if err != nil { cclog.Error(err.Error()) return 1 } + + // Connect receive manager to metric router ReceiveToRouterChannel := make(chan lp.CCMetric, 200) rcfg.ReceiveManager.AddOutput(ReceiveToRouterChannel) - rcfg.Router.AddReceiverInput(ReceiveToRouterChannel) + rcfg.MetricRouter.AddReceiverInput(ReceiveToRouterChannel) use_recv = true } + // Create shutdown handler shutdownSignal := make(chan os.Signal, 1) signal.Notify(shutdownSignal, os.Interrupt) signal.Notify(shutdownSignal, syscall.SIGTERM) + rcfg.Sync.Add(1) go shutdownHandler(&rcfg, shutdownSignal) - rcfg.Sync.Add(1) - rcfg.Router.Start() + // Start the managers + rcfg.MetricRouter.Start() rcfg.SinkManager.Start() rcfg.CollectManager.Start() @@ -294,8 +305,9 @@ func mainFunc() int { shutdownSignal <- os.Interrupt } - // Wait until shutdownHandler is executed + // Wait that all goroutines finish rcfg.Sync.Wait() + return 0 } From 82f5c1c5d010b1a7669f7489800d39b823d585a2 Mon Sep 17 00:00:00 2001 From: Holger Obermaier <40787752+ho-ob@users.noreply.github.com> Date: Fri, 28 Jan 2022 09:42:19 +0100 Subject: [PATCH 045/174] Minimum requirement go version 1.17 --- collectors/gpfsMetric.go | 8 +- go.mod | 25 ++- go.sum | 457 +++++++-------------------------------- 3 files changed, 94 insertions(+), 396 deletions(-) diff --git a/collectors/gpfsMetric.go b/collectors/gpfsMetric.go index f1d3d75..bc1852b 100644 --- a/collectors/gpfsMetric.go +++ b/collectors/gpfsMetric.go @@ -13,19 +13,18 @@ import ( "strconv" "strings" "time" + lp "github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric" ) type GpfsCollector struct { metricCollector tags map[string]string - config struct { Mmpmon string `json:"mmpmon"` } } - func (m *GpfsCollector) Init(config json.RawMessage) error { var err error m.name = "GpfsCollector" @@ -120,7 +119,6 @@ func (m *GpfsCollector) Read(interval time.Duration, output chan lp.CCMetric) { m.tags["filesystem"] = filesystem - // return code rc, err := 
strconv.Atoi(key_value["_rc_"]) if err != nil { @@ -132,8 +130,6 @@ func (m *GpfsCollector) Read(interval time.Duration, output chan lp.CCMetric) { continue } - /* requires go 1.17 - // unix epoch in microseconds timestampInt, err := strconv.ParseInt(key_value["_t_"]+key_value["_tu_"], 10, 64) timestamp := time.UnixMicro(timestampInt) if err != nil { @@ -142,8 +138,6 @@ func (m *GpfsCollector) Read(interval time.Duration, output chan lp.CCMetric) { key_value["_t_"]+key_value["_tu_"], err.Error()) continue } - */ - timestamp := time.Now() // bytes read bytesRead, err := strconv.ParseInt(key_value["_br_"], 10, 64) diff --git a/go.mod b/go.mod index d20d431..da4f3ea 100644 --- a/go.mod +++ b/go.mod @@ -1,14 +1,25 @@ module github.com/ClusterCockpit/cc-metric-collector -go 1.16 +go 1.17 require ( github.com/NVIDIA/go-nvml v0.11.1-0 - github.com/influxdata/influxdb-client-go/v2 v2.2.2 - github.com/influxdata/line-protocol v0.0.0-20210311194329-9aa0e372d097 - github.com/nats-io/nats.go v1.10.0 - github.com/nats-io/nkeys v0.1.4 // indirect - github.com/prometheus/client_golang v1.10.0 // indirect - golang.org/x/sys v0.0.0-20210309074719-68d13333faf2 + github.com/influxdata/influxdb-client-go/v2 v2.7.0 + github.com/influxdata/line-protocol v0.0.0-20210922203350-b1ad95c89adf + github.com/nats-io/nats.go v1.13.1-0.20211122170419-d7c1d78a50fc + golang.org/x/sys v0.0.0-20220114195835-da31bd327af9 gopkg.in/Knetic/govaluate.v2 v2.3.0 ) + +require ( + github.com/deepmap/oapi-codegen v1.8.2 // indirect + github.com/golang/protobuf v1.5.2 // indirect + github.com/nats-io/nats-server/v2 v2.7.0 // indirect + github.com/nats-io/nkeys v0.3.0 // indirect + github.com/nats-io/nuid v1.0.1 // indirect + github.com/pkg/errors v0.9.1 // indirect + golang.org/x/crypto v0.0.0-20220112180741-5e0467b6c7ce // indirect + golang.org/x/net v0.0.0-20211112202133-69e39bad7dc2 // indirect + google.golang.org/protobuf v1.27.1 // indirect + gopkg.in/yaml.v2 v2.3.0 // indirect +) diff --git a/go.sum 
b/go.sum index a6f98d7..311633a 100644 --- a/go.sum +++ b/go.sum @@ -1,449 +1,142 @@ -cloud.google.com/go v0.26.0/go.mod h1:aQUYkXzVsufM+DwF1aE+0xfcU+56JwCaLick0ClmMTw= -cloud.google.com/go v0.34.0/go.mod h1:aQUYkXzVsufM+DwF1aE+0xfcU+56JwCaLick0ClmMTw= -github.com/BurntSushi/toml v0.3.1/go.mod h1:xHWCNGjB5oqiDr8zfno3MHue2Ht5sIBksp03qcyfWMU= -github.com/Knetic/govaluate v3.0.1-0.20171022003610-9aa49832a739+incompatible/go.mod h1:r7JcOSlj0wfOMncg0iLm8Leh48TZaKVeNIfJntJ2wa0= github.com/NVIDIA/go-nvml v0.11.1-0 h1:XHSz3zZKC4NCP2ja1rI7++DXFhA+uDhdYa3MykCTGHY= github.com/NVIDIA/go-nvml v0.11.1-0/go.mod h1:hy7HYeQy335x6nEss0Ne3PYqleRa6Ct+VKD9RQ4nyFs= -github.com/Shopify/sarama v1.19.0/go.mod h1:FVkBWblsNy7DGZRfXLU0O9RCGt5g3g3yEuWXgklEdEo= -github.com/Shopify/toxiproxy v2.1.4+incompatible/go.mod h1:OXgGpZ6Cli1/URJOF1DMxUHB2q5Ap20/P/eIdh4G0pI= -github.com/VividCortex/gohistogram v1.0.0/go.mod h1:Pf5mBqqDxYaXu3hDrrU+w6nw50o/4+TcAqDqk/vUH7g= -github.com/afex/hystrix-go v0.0.0-20180502004556-fa1af6a1f4f5/go.mod h1:SkGFH1ia65gfNATL8TAiHDNxPzPdmEL5uirI2Uyuz6c= -github.com/alecthomas/template v0.0.0-20160405071501-a0175ee3bccc/go.mod h1:LOuyumcjzFXgccqObfd/Ljyb9UuFJ6TxHnclSeseNhc= -github.com/alecthomas/template v0.0.0-20190718012654-fb15b899a751/go.mod h1:LOuyumcjzFXgccqObfd/Ljyb9UuFJ6TxHnclSeseNhc= -github.com/alecthomas/units v0.0.0-20151022065526-2efee857e7cf/go.mod h1:ybxpYRFXyAe+OPACYpWeL0wqObRcbAqCMya13uyzqw0= -github.com/alecthomas/units v0.0.0-20190717042225-c3de453c63f4/go.mod h1:ybxpYRFXyAe+OPACYpWeL0wqObRcbAqCMya13uyzqw0= -github.com/alecthomas/units v0.0.0-20190924025748-f65c72e2690d/go.mod h1:rBZYJk541a8SKzHPHnH3zbiI+7dagKZ0cgpgrD7Fyho= -github.com/apache/thrift v0.12.0/go.mod h1:cp2SuWMxlEZw2r+iP2GNCdIi4C1qmUzdZFSVb+bacwQ= -github.com/apache/thrift v0.13.0/go.mod h1:cp2SuWMxlEZw2r+iP2GNCdIi4C1qmUzdZFSVb+bacwQ= -github.com/armon/circbuf v0.0.0-20150827004946-bbbad097214e/go.mod h1:3U/XgcO3hCbHZ8TKRvWD2dDTCfh9M9ya+I9JpbB7O8o= -github.com/armon/go-metrics 
v0.0.0-20180917152333-f0300d1749da/go.mod h1:Q73ZrmVTwzkszR9V5SSuryQ31EELlFMUz1kKyl939pY= -github.com/armon/go-radix v0.0.0-20180808171621-7fddfc383310/go.mod h1:ufUuZ+zHj4x4TnLV4JWEpy2hxWSpsRywHrMgIH9cCH8= -github.com/aryann/difflib v0.0.0-20170710044230-e206f873d14a/go.mod h1:DAHtR1m6lCRdSC2Tm3DSWRPvIPr6xNKyeHdqDQSQT+A= -github.com/aws/aws-lambda-go v1.13.3/go.mod h1:4UKl9IzQMoD+QF79YdCuzCwp8VbmG4VAQwij/eHl5CU= -github.com/aws/aws-sdk-go v1.27.0/go.mod h1:KmX6BPdI08NWTb3/sm4ZGu5ShLoqVDhKgpiN924inxo= -github.com/aws/aws-sdk-go-v2 v0.18.0/go.mod h1:JWVYvqSMppoMJC0x5wdwiImzgXTI9FuZwxzkQq9wy+g= -github.com/beorn7/perks v0.0.0-20180321164747-3a771d992973/go.mod h1:Dwedo/Wpr24TaqPxmxbtue+5NUziq4I4S80YR8gNf3Q= -github.com/beorn7/perks v1.0.0/go.mod h1:KWe93zE9D1o94FZ5RNwFwVgaQK1VOXiVxmqh+CedLV8= -github.com/beorn7/perks v1.0.1 h1:VlbKKnNfV8bJzeqoa4cOKqO6bYr3WgKZxO8Z16+hsOM= -github.com/beorn7/perks v1.0.1/go.mod h1:G2ZrVWU2WbWT9wwq4/hrbKbnv/1ERSJQ0ibhJ6rlkpw= -github.com/bgentry/speakeasy v0.1.0/go.mod h1:+zsyZBPWlz7T6j88CTgSN5bM796AkVf0kBD4zp0CCIs= -github.com/casbin/casbin/v2 v2.1.2/go.mod h1:YcPU1XXisHhLzuxH9coDNf2FbKpjGlbCg3n9yuLkIJQ= -github.com/cenkalti/backoff v2.2.1+incompatible/go.mod h1:90ReRw6GdpyfrHakVjL/QHaoyV4aDUVVkXQJJJ3NXXM= -github.com/census-instrumentation/opencensus-proto v0.2.1/go.mod h1:f6KPmirojxKA12rnyqOA5BBL4O983OfeGPqjHWSTneU= -github.com/cespare/xxhash/v2 v2.1.1 h1:6MnRN8NT7+YBpUIWxHtefFZOKTAPgGjpQSxqLNn0+qY= -github.com/cespare/xxhash/v2 v2.1.1/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs= -github.com/clbanning/x2j v0.0.0-20191024224557-825249438eec/go.mod h1:jMjuTZXRI4dUb/I5gc9Hdhagfvm9+RyrPryS/auMzxE= -github.com/client9/misspell v0.3.4/go.mod h1:qj6jICC3Q7zFZvVWo7KLAzC3yx5G7kyvSDkc90ppPyw= -github.com/cockroachdb/datadriven v0.0.0-20190809214429-80d97fb3cbaa/go.mod h1:zn76sxSg3SzpJ0PPJaLDCu+Bu0Lg3sKTORVIj19EIF8= -github.com/codahale/hdrhistogram v0.0.0-20161010025455-3a0bb77429bd/go.mod 
h1:sE/e/2PUdi/liOCUjSTXgM1o87ZssimdTWN964YiIeI= -github.com/coreos/go-semver v0.2.0/go.mod h1:nnelYz7RCh+5ahJtPPxZlU+153eP4D4r3EedlOD2RNk= -github.com/coreos/go-systemd v0.0.0-20180511133405-39ca1b05acc7/go.mod h1:F5haX7vjVVG0kc13fIWeqUViNPyEJxv/OmvnBo0Yme4= -github.com/coreos/pkg v0.0.0-20160727233714-3ac0863d7acf/go.mod h1:E3G3o1h8I7cfcXa63jLwjI0eiQQMgzzUDFVpN/nH/eA= -github.com/cpuguy83/go-md2man/v2 v2.0.0-20190314233015-f79a8a8ca69d/go.mod h1:maD7wRr/U5Z6m/iR4s+kqSMx2CaBsrgA7czyZG/E6dU= -github.com/creack/pty v1.1.7/go.mod h1:lj5s0c3V2DBrqTV7llrYr5NG6My20zk30Fl46Y7DoTY= github.com/cyberdelia/templates v0.0.0-20141128023046-ca7fffd4298c/go.mod h1:GyV+0YP4qX0UQ7r2MoYZ+AvYDp12OF5yg4q8rGnyNh4= github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= -github.com/deepmap/oapi-codegen v1.3.13 h1:9HKGCsdJqE4dnrQ8VerFS0/1ZOJPmAhN+g8xgp8y3K4= -github.com/deepmap/oapi-codegen v1.3.13/go.mod h1:WAmG5dWY8/PYHt4vKxlt90NsbHMAOCiteYKZMiIRfOo= +github.com/deepmap/oapi-codegen v1.8.2 h1:SegyeYGcdi0jLLrpbCMoJxnUUn8GBXHsvr4rbzjuhfU= +github.com/deepmap/oapi-codegen v1.8.2/go.mod h1:YLgSKSDv/bZQB7N4ws6luhozi3cEdRktEqrX88CvjIw= github.com/dgrijalva/jwt-go v3.2.0+incompatible/go.mod h1:E3ru+11k8xSBh+hMPgOLZmtrrCbhqsmaPHjLKYnJCaQ= -github.com/dustin/go-humanize v0.0.0-20171111073723-bb3d318650d4/go.mod h1:HtrtbFcZ19U5GC7JDqmcUSB87Iq5E25KnS6fMYU6eOk= -github.com/eapache/go-resiliency v1.1.0/go.mod h1:kFI+JgMyC7bLPUVY133qvEBtVayf5mFgVsvEsIPBvNs= -github.com/eapache/go-xerial-snappy v0.0.0-20180814174437-776d5712da21/go.mod h1:+020luEh2TKB4/GOp8oxxtq0Daoen/Cii55CzbTV6DU= -github.com/eapache/queue v1.1.0/go.mod h1:6eCeP0CKFpHLu8blIFXhExK/dRa7WDZfr6jVFPTqq+I= -github.com/edsrzf/mmap-go v1.0.0/go.mod h1:YO35OhQPt3KJa3ryjFM5Bs14WD66h8eGKpfaBNrHW5M= -github.com/envoyproxy/go-control-plane 
v0.6.9/go.mod h1:SBwIajubJHhxtWwsL9s8ss4safvEdbitLhGGK48rN6g= -github.com/envoyproxy/go-control-plane v0.9.1-0.20191026205805-5f8ba28d4473/go.mod h1:YTl/9mNaCwkRvm6d1a2C3ymFceY/DCBVvsKhRF0iEA4= -github.com/envoyproxy/protoc-gen-validate v0.1.0/go.mod h1:iSmxcyjqTsJpI2R4NaDN7+kN2VEUnK/pcBlmesArF7c= -github.com/fatih/color v1.7.0/go.mod h1:Zm6kSWBoL9eyXnKyktHP6abPY2pDugNf5KwzbycvMj4= -github.com/franela/goblin v0.0.0-20200105215937-c9ffbefa60db/go.mod h1:7dvUGVsVBjqR7JHJk0brhHOZYGmfBYOrK0ZhYMEtBr4= -github.com/franela/goreq v0.0.0-20171204163338-bcd34c9993f8/go.mod h1:ZhphrRTfi2rbfLwlschooIH4+wKKDR4Pdxhh+TRoA20= -github.com/fsnotify/fsnotify v1.4.7/go.mod h1:jwhsz4b93w/PPRr/qN1Yymfu8t87LnFCMoQvtojpjFo= -github.com/getkin/kin-openapi v0.13.0/go.mod h1:WGRs2ZMM1Q8LR1QBEwUxC6RJEfaBcD0s+pcEVXFuAjw= +github.com/getkin/kin-openapi v0.61.0/go.mod h1:7Yn5whZr5kJi6t+kShccXS8ae1APpYTW6yheSwk8Yi4= github.com/ghodss/yaml v1.0.0/go.mod h1:4dBDuWmgqj2HViK6kFavaiC9ZROes6MMH2rRYeMEF04= -github.com/go-chi/chi v4.0.2+incompatible/go.mod h1:eB3wogJHnLi3x/kFX2A+IbTBlXxmMeXJVKy9tTv1XzQ= -github.com/go-kit/kit v0.8.0/go.mod h1:xBxKIO96dXMWWy0MnWVtmwkA9/13aqxPnvrjFYMA2as= -github.com/go-kit/kit v0.9.0/go.mod h1:xBxKIO96dXMWWy0MnWVtmwkA9/13aqxPnvrjFYMA2as= -github.com/go-kit/kit v0.10.0/go.mod h1:xUsJbQ/Fp4kEt7AFgCuvyX4a71u8h9jB8tj/ORgOZ7o= -github.com/go-logfmt/logfmt v0.3.0/go.mod h1:Qt1PoO58o5twSAckw1HlFXLmHsOX5/0LbT9GBnD5lWE= -github.com/go-logfmt/logfmt v0.4.0/go.mod h1:3RMwSq7FuexP4Kalkev3ejPJsZTpXXBr9+V4qmtdjCk= -github.com/go-logfmt/logfmt v0.5.0/go.mod h1:wCYkCAKZfumFQihp8CzCvQ3paCTfi41vtzG1KdI/P7A= -github.com/go-sql-driver/mysql v1.4.0/go.mod h1:zAC/RDZ24gD3HViQzih4MyKcchzm+sOG5ZlKdlhCg5w= -github.com/go-stack/stack v1.8.0/go.mod h1:v0f6uXyyMGvRgIKkXu+yp6POWl0qKG85gN/melR3HDY= -github.com/gogo/googleapis v1.1.0/go.mod h1:gf4bu3Q80BeJ6H1S1vYPm8/ELATdvryBaNFGgqEef3s= -github.com/gogo/protobuf v1.1.1/go.mod h1:r8qH/GZQm5c6nD/R0oafs1akxWv10x8SbQlK7atdtwQ= -github.com/gogo/protobuf 
v1.2.0/go.mod h1:r8qH/GZQm5c6nD/R0oafs1akxWv10x8SbQlK7atdtwQ= -github.com/gogo/protobuf v1.2.1/go.mod h1:hp+jE20tsWTFYpLwKvXlhS1hjn+gTNwPg2I6zVXpSg4= -github.com/golang/glog v0.0.0-20160126235308-23def4e6c14b/go.mod h1:SBH7ygxi8pfUlaOkMMuAQtPIUF8ecWP5IEl/CR7VP2Q= -github.com/golang/groupcache v0.0.0-20160516000752-02826c3e7903/go.mod h1:cIg4eruTrX1D+g88fzRXU5OdNfaM+9IcxsU14FzY7Hc= -github.com/golang/groupcache v0.0.0-20190702054246-869f871628b6/go.mod h1:cIg4eruTrX1D+g88fzRXU5OdNfaM+9IcxsU14FzY7Hc= -github.com/golang/mock v1.1.1/go.mod h1:oTYuIxOrZwtPieC+H1uAHpcLFnEyAGVDL/k47Jfbm0A= -github.com/golang/protobuf v1.2.0/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U= -github.com/golang/protobuf v1.3.1/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U= -github.com/golang/protobuf v1.3.2/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U= +github.com/go-chi/chi/v5 v5.0.0/go.mod h1:BBug9lr0cqtdAhsu6R4AAdvufI0/XBzAQSsUqJpoZOs= +github.com/go-openapi/jsonpointer v0.19.5/go.mod h1:Pl9vOtqEWErmShwVjC8pYs9cog34VGT37dQOVbmoatg= +github.com/go-openapi/swag v0.19.5/go.mod h1:POnQmlKehdgb5mhVOsnJFsivZCEZ/vjK9gh66Z9tfKk= github.com/golang/protobuf v1.4.0-rc.1/go.mod h1:ceaxUfeHdC40wWswd/P6IGgMaK3YpKi5j83Wpe3EHw8= github.com/golang/protobuf v1.4.0-rc.1.0.20200221234624-67d41d38c208/go.mod h1:xKAWHe0F5eneWXFV3EuXVDTCmh+JuBKY0li0aMyXATA= github.com/golang/protobuf v1.4.0-rc.2/go.mod h1:LlEzMj4AhA7rCAGe4KMBDvJI+AwstrUpVNzEA03Pprs= github.com/golang/protobuf v1.4.0-rc.4.0.20200313231945-b860323f09d0/go.mod h1:WU3c8KckQ9AFe+yFwt9sWVRKCVIyN9cPHBJSNnbL67w= github.com/golang/protobuf v1.4.0/go.mod h1:jodUvKwWbYaEsadDk5Fwe5c77LiNKVO9IDvqG2KuDX0= github.com/golang/protobuf v1.4.2/go.mod h1:oDoupMAO8OvCJWAcko0GGGIgR6R6ocIYbsSw735rRwI= -github.com/golang/protobuf v1.4.3 h1:JjCZWpVbqXDqFVmTfYWEVTMIYrL/NPdPSCHPJ0T/raM= -github.com/golang/protobuf v1.4.3/go.mod h1:oDoupMAO8OvCJWAcko0GGGIgR6R6ocIYbsSw735rRwI= -github.com/golang/snappy v0.0.0-20180518054509-2e65f85255db/go.mod 
h1:/XxbfmMg8lxefKM7IXC3fBNl/7bRcc72aCRzEWrmP2Q= +github.com/golang/protobuf v1.5.0/go.mod h1:FsONVRAS9T7sI+LIUmWTfcYkHO4aIWwzhcaSAoJOfIk= +github.com/golang/protobuf v1.5.2 h1:ROPKBNFfQgOUMifHyP+KYbvpjbdoFNs+aK7DXlji0Tw= +github.com/golang/protobuf v1.5.2/go.mod h1:XVQd3VNwM+JqD3oG2Ue2ip4fOMUkwXdXDdiuN0vRsmY= +github.com/golang/snappy v0.0.3/go.mod h1:/XxbfmMg8lxefKM7IXC3fBNl/7bRcc72aCRzEWrmP2Q= github.com/golangci/lint-1 v0.0.0-20181222135242-d2cdd8c08219/go.mod h1:/X8TswGSh1pIozq4ZwCfxS0WA5JGXguxk94ar/4c87Y= -github.com/google/btree v0.0.0-20180813153112-4030bb1f1f0c/go.mod h1:lNA+9X1NB3Zf8V7Ke586lFgjr2dZNuvo3lPJSGZ5JPQ= -github.com/google/btree v1.0.0/go.mod h1:lNA+9X1NB3Zf8V7Ke586lFgjr2dZNuvo3lPJSGZ5JPQ= -github.com/google/go-cmp v0.2.0/go.mod h1:oXzfMopK8JAjlY9xF4vHSVASa0yLyX7SntLO5aqRK0M= github.com/google/go-cmp v0.3.0/go.mod h1:8QqcDgzrUqlUb/G2PQTWiueGozuR1884gddMywk6iLU= github.com/google/go-cmp v0.3.1/go.mod h1:8QqcDgzrUqlUb/G2PQTWiueGozuR1884gddMywk6iLU= github.com/google/go-cmp v0.4.0/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= -github.com/google/go-cmp v0.5.4/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= -github.com/google/gofuzz v1.0.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg= -github.com/google/renameio v0.1.0/go.mod h1:KWCgfxg9yswjAJkECMjeO8J8rahYeXnNhOm40UhjYkI= -github.com/google/uuid v1.0.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= -github.com/gopherjs/gopherjs v0.0.0-20181017120253-0766667cb4d1/go.mod h1:wJfORRmW1u3UXTncJ5qlYoELFm8eSnnEO6hX4iZ3EWY= -github.com/gorilla/context v1.1.1/go.mod h1:kBGZzfjB9CEq2AlWe17Uuf7NDRt0dE0s8S51q0aT7Yg= -github.com/gorilla/mux v1.6.2/go.mod h1:1lud6UwP+6orDFRuTfBEV8e9/aOM/c4fVVCaMa2zaAs= -github.com/gorilla/mux v1.7.3/go.mod h1:1lud6UwP+6orDFRuTfBEV8e9/aOM/c4fVVCaMa2zaAs= -github.com/gorilla/websocket v0.0.0-20170926233335-4201258b820c/go.mod h1:E7qHFY5m1UJ88s3WnNqhKjPHQ0heANvMoAMk2YaljkQ= -github.com/grpc-ecosystem/go-grpc-middleware 
v1.0.1-0.20190118093823-f849b5445de4/go.mod h1:FiyG127CGDf3tlThmgyCl78X/SZQqEOJBCDaAfeWzPs= -github.com/grpc-ecosystem/go-grpc-prometheus v1.2.0/go.mod h1:8NvIoxWQoOIhqOTXgfV/d3M/q6VIi02HzZEHgUlZvzk= -github.com/grpc-ecosystem/grpc-gateway v1.9.5/go.mod h1:vNeuVxBJEsws4ogUvrchl83t/GYV9WGTSLVdBhOQFDY= -github.com/hashicorp/consul/api v1.3.0/go.mod h1:MmDNSzIMUjNpY/mQ398R4bk2FnqQLoPndWW5VkKPlCE= -github.com/hashicorp/consul/sdk v0.3.0/go.mod h1:VKf9jXwCTEY1QZP2MOLRhb5i/I/ssyNV1vwHyQBF0x8= -github.com/hashicorp/errwrap v1.0.0/go.mod h1:YH+1FKiLXxHSkmPseP+kNlulaMuP3n2brvKWEqk/Jc4= -github.com/hashicorp/go-cleanhttp v0.5.1/go.mod h1:JpRdi6/HCYpAwUzNwuwqhbovhLtngrth3wmdIIUrZ80= -github.com/hashicorp/go-immutable-radix v1.0.0/go.mod h1:0y9vanUI8NX6FsYoO3zeMjhV/C5i9g4Q3DwcSNZ4P60= -github.com/hashicorp/go-msgpack v0.5.3/go.mod h1:ahLV/dePpqEmjfWmKiqvPkv/twdG7iPBM1vqhUKIvfM= -github.com/hashicorp/go-multierror v1.0.0/go.mod h1:dHtQlpGsu+cZNNAkkCN/P3hoUDHhCYQXV3UM06sGGrk= -github.com/hashicorp/go-rootcerts v1.0.0/go.mod h1:K6zTfqpRlCUIjkwsN4Z+hiSfzSTQa6eBIzfwKfwNnHU= -github.com/hashicorp/go-sockaddr v1.0.0/go.mod h1:7Xibr9yA9JjQq1JpNB2Vw7kxv8xerXegt+ozgdvDeDU= -github.com/hashicorp/go-syslog v1.0.0/go.mod h1:qPfqrKkXGihmCqbJM2mZgkZGvKG1dFdvsLplgctolz4= -github.com/hashicorp/go-uuid v1.0.0/go.mod h1:6SBZvOh/SIDV7/2o3Jml5SYk/TvGqwFJ/bN7x4byOro= -github.com/hashicorp/go-uuid v1.0.1/go.mod h1:6SBZvOh/SIDV7/2o3Jml5SYk/TvGqwFJ/bN7x4byOro= -github.com/hashicorp/go-version v1.2.0/go.mod h1:fltr4n8CU8Ke44wwGCBoEymUuxUHl09ZGVZPK5anwXA= -github.com/hashicorp/go.net v0.0.1/go.mod h1:hjKkEWcCURg++eb33jQU7oqQcI9XDCnUzHA0oac0k90= -github.com/hashicorp/golang-lru v0.5.0/go.mod h1:/m3WP610KZHVQ1SGc6re/UDhFvYD7pJ4Ao+sR/qLZy8= -github.com/hashicorp/golang-lru v0.5.1/go.mod h1:/m3WP610KZHVQ1SGc6re/UDhFvYD7pJ4Ao+sR/qLZy8= -github.com/hashicorp/logutils v1.0.0/go.mod h1:QIAnNjmIWmVIIkWDTG1z5v++HQmx9WQRO+LraFDTW64= -github.com/hashicorp/mdns v1.0.0/go.mod 
h1:tL+uN++7HEJ6SQLQ2/p+z2pH24WQKWjBPkE0mNTz8vQ= -github.com/hashicorp/memberlist v0.1.3/go.mod h1:ajVTdAv/9Im8oMAAj5G31PhhMCZJV2pPBoIllUwCN7I= -github.com/hashicorp/serf v0.8.2/go.mod h1:6hOLApaqBFA1NXqRQAsxw9QxuDEvNxSQRwA/JwenrHc= -github.com/hpcloud/tail v1.0.0/go.mod h1:ab1qPbhIpdTxEkNHXyeSf5vhxWSCs/tWer42PpOxQnU= -github.com/hudl/fargo v1.3.0/go.mod h1:y3CKSmjA+wD2gak7sUSXTAoopbhU08POFhmITJgmKTg= -github.com/inconshreveable/mousetrap v1.0.0/go.mod h1:PxqpIevigyE2G7u3NXJIT2ANytuPF1OarO4DADm73n8= -github.com/influxdata/influxdb-client-go v1.4.0 h1:+KavOkwhLClHFfYcJMHHnTL5CZQhXJzOm5IKHI9BqJk= -github.com/influxdata/influxdb-client-go/v2 v2.2.2 h1:O0CGIuIwQafvAxttAJ/VqMKfbWWn2Mt8rbOmaM2Zj4w= -github.com/influxdata/influxdb-client-go/v2 v2.2.2/go.mod h1:fa/d1lAdUHxuc1jedx30ZfNG573oQTQmUni3N6pcW+0= -github.com/influxdata/influxdb1-client v0.0.0-20191209144304-8bf82d3c094d/go.mod h1:qj24IKcXYK6Iy9ceXlo3Tc+vtHo9lIhSX5JddghvEPo= +github.com/google/go-cmp v0.5.5/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= +github.com/gorilla/mux v1.8.0/go.mod h1:DVbg23sWSpFRCP0SfiEN6jmj59UnW/n46BH5rLB71So= +github.com/influxdata/influxdb-client-go/v2 v2.7.0 h1:QgP5mlBE9sGnzplpnf96pr+p7uqlIlL4W2GAP3n+XZg= +github.com/influxdata/influxdb-client-go/v2 v2.7.0/go.mod h1:Y/0W1+TZir7ypoQZYd2IrnVOKB3Tq6oegAQeSVN/+EU= github.com/influxdata/line-protocol v0.0.0-20200327222509-2487e7298839/go.mod h1:xaLFMmpvUxqXtVkUJfg9QmT88cDaCJ3ZKgdZ78oO8Qo= -github.com/influxdata/line-protocol v0.0.0-20210311194329-9aa0e372d097 h1:vilfsDSy7TDxedi9gyBkMvAirat/oRcL0lFdJBf6tdM= -github.com/influxdata/line-protocol v0.0.0-20210311194329-9aa0e372d097/go.mod h1:xaLFMmpvUxqXtVkUJfg9QmT88cDaCJ3ZKgdZ78oO8Qo= -github.com/jmespath/go-jmespath v0.0.0-20180206201540-c2b33e8439af/go.mod h1:Nht3zPeWKUH0NzdCt2Blrr5ys8VGpn0CEB0cQHVjt7k= -github.com/jonboulle/clockwork v0.1.0/go.mod h1:Ii8DK3G1RaLaWxj9trq07+26W01tbo22gdxWY5EU2bo= -github.com/jpillora/backoff v1.0.0/go.mod h1:J/6gKK9jxlEcS3zixgDgUAsiuZ7yrSoa/FX5e0EB2j4= 
-github.com/json-iterator/go v1.1.6/go.mod h1:+SdeFBvtyEkXs7REEP0seUULqWtbJapLOCVDaaPEHmU= -github.com/json-iterator/go v1.1.7/go.mod h1:KdQUCv79m/52Kvf8AW2vK1V8akMuk1QjK/uOdHXbAo4= -github.com/json-iterator/go v1.1.8/go.mod h1:KdQUCv79m/52Kvf8AW2vK1V8akMuk1QjK/uOdHXbAo4= -github.com/json-iterator/go v1.1.10/go.mod h1:KdQUCv79m/52Kvf8AW2vK1V8akMuk1QjK/uOdHXbAo4= -github.com/jtolds/gls v4.20.0+incompatible/go.mod h1:QJZ7F/aHp+rZTRtaJ1ow/lLfFfVYBRgL+9YlvaHOwJU= -github.com/julienschmidt/httprouter v1.2.0/go.mod h1:SYymIcj16QtmaHHD7aYtjjsJG7VTCxuUUipMqKk8s4w= -github.com/julienschmidt/httprouter v1.3.0/go.mod h1:JR6WtHb+2LUe8TCKY3cZOxFyyO8IZAc4RVcycCCAKdM= -github.com/kisielk/errcheck v1.1.0/go.mod h1:EZBBE59ingxPouuu3KfxchcWSUPOHkagtvWXihfKN4Q= -github.com/kisielk/gotool v1.0.0/go.mod h1:XhKaO+MFFWcvkIS/tQcRk01m1F5IRFswLeQ+oQHNcck= -github.com/konsorten/go-windows-terminal-sequences v1.0.1/go.mod h1:T0+1ngSBFLxvqU3pZ+m/2kptfBszLMUkC4ZK/EgS/cQ= -github.com/konsorten/go-windows-terminal-sequences v1.0.3/go.mod h1:T0+1ngSBFLxvqU3pZ+m/2kptfBszLMUkC4ZK/EgS/cQ= -github.com/kr/logfmt v0.0.0-20140226030751-b84e30acd515/go.mod h1:+0opPa2QZZtGFBFZlji/RkVcI2GknAs/DXo4wKdlNEc= +github.com/influxdata/line-protocol v0.0.0-20210922203350-b1ad95c89adf h1:7JTmneyiNEwVBOHSjoMxiWAqB992atOeepeFYegn5RU= +github.com/influxdata/line-protocol v0.0.0-20210922203350-b1ad95c89adf/go.mod h1:xaLFMmpvUxqXtVkUJfg9QmT88cDaCJ3ZKgdZ78oO8Qo= +github.com/klauspost/compress v1.13.4 h1:0zhec2I8zGnjWcKyLl6i3gPqKANCCn5e9xmviEEeX6s= +github.com/klauspost/compress v1.13.4/go.mod h1:8dP1Hq4DHOhN9w426knH3Rhby4rFm6D8eO+e+Dq5Gzg= +github.com/kr/pretty v0.1.0 h1:L/CwN0zerZDmRFUapSPitk6f+Q3+0za1rQkzVuMiMFI= github.com/kr/pretty v0.1.0/go.mod h1:dAy3ld7l9f0ibDNOQOHHMYYIIbhfbHSm3C4ZsoJORNo= github.com/kr/pty v1.1.1/go.mod h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ= +github.com/kr/text v0.1.0 h1:45sCR5RtlFHMR4UwH9sdQ5TC8v0qDQCHnXt+kaKSTVE= github.com/kr/text v0.1.0/go.mod 
h1:4Jbv+DJW3UT/LiOwJeYQe1efqtUx/iVham/4vfdArNI= -github.com/labstack/echo/v4 v4.1.11/go.mod h1:i541M3Fj6f76NZtHSj7TXnyM8n2gaodfvfxNnFqi74g= +github.com/labstack/echo/v4 v4.2.1/go.mod h1:AA49e0DZ8kk5jTOOCKNuPR6oTnBS0dYiM4FW1e6jwpg= github.com/labstack/gommon v0.3.0/go.mod h1:MULnywXg0yavhxWKc+lOruYdAhDwPK9wf0OL7NoOu+k= -github.com/lightstep/lightstep-tracer-common/golang/gogo v0.0.0-20190605223551-bc2310a04743/go.mod h1:qklhhLq1aX+mtWk9cPHPzaBjWImj5ULL6C7HFJtXQMM= -github.com/lightstep/lightstep-tracer-go v0.18.1/go.mod h1:jlF1pusYV4pidLvZ+XD0UBX0ZE6WURAspgAczcDHrL4= -github.com/lyft/protoc-gen-validate v0.0.13/go.mod h1:XbGvPuh87YZc5TdIa2/I4pLk0QoUACkjt2znoq26NVQ= +github.com/mailru/easyjson v0.0.0-20190614124828-94de47d64c63/go.mod h1:C1wdFJiN94OJF2b5HbByQZoLdCWB1Yqtg26g4irojpc= +github.com/mailru/easyjson v0.0.0-20190626092158-b2ccc519800e/go.mod h1:C1wdFJiN94OJF2b5HbByQZoLdCWB1Yqtg26g4irojpc= github.com/matryer/moq v0.0.0-20190312154309-6cfb0558e1bd/go.mod h1:9ELz6aaclSIGnZBoaSLZ3NAl1VTufbOrXBPvtcy6WiQ= -github.com/mattn/go-colorable v0.0.9/go.mod h1:9vuHe8Xs5qXnSaW/c/ABM9alt+Vo+STaOChaDxuIBZU= github.com/mattn/go-colorable v0.1.2/go.mod h1:U0ppj6V5qS13XJ6of8GYAs25YV2eR4EVcfRqFIhoBtE= -github.com/mattn/go-colorable v0.1.4/go.mod h1:U0ppj6V5qS13XJ6of8GYAs25YV2eR4EVcfRqFIhoBtE= -github.com/mattn/go-isatty v0.0.3/go.mod h1:M+lRXTBqGeGNdLjl/ufCoiOlB5xdOkqRJdNxMWT7Zi4= -github.com/mattn/go-isatty v0.0.4/go.mod h1:M+lRXTBqGeGNdLjl/ufCoiOlB5xdOkqRJdNxMWT7Zi4= +github.com/mattn/go-colorable v0.1.7/go.mod h1:u6P/XSegPjTcexA+o6vUJrdnUu04hMope9wVRipJSqc= +github.com/mattn/go-colorable v0.1.8/go.mod h1:u6P/XSegPjTcexA+o6vUJrdnUu04hMope9wVRipJSqc= github.com/mattn/go-isatty v0.0.8/go.mod h1:Iq45c/XA43vh69/j3iqttzPXn0bhXyGjM0Hdxcsrc5s= github.com/mattn/go-isatty v0.0.9/go.mod h1:YNRxwqDuOph6SZLI9vUUz6OYw3QyUt7WiY2yME+cCiQ= -github.com/mattn/go-isatty v0.0.10/go.mod h1:qgIWMr58cqv1PHHyhnkY9lrL7etaEgOFcMEpPG5Rm84= -github.com/mattn/go-runewidth v0.0.2/go.mod 
h1:LwmH8dsx7+W8Uxz3IHJYH5QSwggIsqBzpuz5H//U1FU= -github.com/matttproud/golang_protobuf_extensions v1.0.1 h1:4hp9jkHxhMHkqkrB3Ix0jegS5sx/RkqARlsWZ6pIwiU= -github.com/matttproud/golang_protobuf_extensions v1.0.1/go.mod h1:D8He9yQNgCq6Z5Ld7szi9bcBfOoFv/3dc6xSMkL2PC0= -github.com/miekg/dns v1.0.14/go.mod h1:W1PPwlIAgtquWBMBEV9nkV9Cazfe8ScdGz/Lj7v3Nrg= -github.com/mitchellh/cli v1.0.0/go.mod h1:hNIlj7HEI86fIcpObd7a0FcrxTWetlwJDGcceTlRvqc= -github.com/mitchellh/go-homedir v1.0.0/go.mod h1:SfyaCUpYCn1Vlf4IUYiD9fPX4A5wJrkLzIz1N1q0pr0= -github.com/mitchellh/go-testing-interface v1.0.0/go.mod h1:kRemZodwjscx+RGhAo8eIhFbs2+BFgRtFPeD/KE+zxI= -github.com/mitchellh/gox v0.4.0/go.mod h1:Sd9lOJ0+aimLBi73mGofS1ycjY8lL3uZM3JPS42BGNg= -github.com/mitchellh/iochan v1.0.0/go.mod h1:JwYml1nuB7xOzsp52dPpHFffvOCDupsG0QubkSMEySY= -github.com/mitchellh/mapstructure v0.0.0-20160808181253-ca63d7c062ee/go.mod h1:FVVH3fgwuzCH5S8UJGiWEs2h04kUh9fWfEaFds41c1Y= -github.com/mitchellh/mapstructure v1.1.2/go.mod h1:FVVH3fgwuzCH5S8UJGiWEs2h04kUh9fWfEaFds41c1Y= -github.com/modern-go/concurrent v0.0.0-20180228061459-e0a39a4cb421/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q= -github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q= -github.com/modern-go/reflect2 v0.0.0-20180701023420-4b7aa43c6742/go.mod h1:bx2lNnkwVCuqBIxFjflWJWanXIb3RllmbCylyMrvgv0= -github.com/modern-go/reflect2 v1.0.1/go.mod h1:bx2lNnkwVCuqBIxFjflWJWanXIb3RllmbCylyMrvgv0= -github.com/mwitkow/go-conntrack v0.0.0-20161129095857-cc309e4a2223/go.mod h1:qRWi+5nqEBWmkhHvq77mSJWrCKwh8bxhgT7d/eI7P4U= -github.com/mwitkow/go-conntrack v0.0.0-20190716064945-2f068394615f/go.mod h1:qRWi+5nqEBWmkhHvq77mSJWrCKwh8bxhgT7d/eI7P4U= -github.com/nats-io/jwt v0.3.0/go.mod h1:fRYCDE99xlTsqUzISS1Bi75UBJ6ljOJQOAAu5VglpSg= -github.com/nats-io/jwt v0.3.2 h1:+RB5hMpXUUA2dfxuhBTEkMOrYmM+gKIZYS1KjSostMI= -github.com/nats-io/jwt v0.3.2/go.mod h1:/euKqTS1ZD+zzjYrY7pseZrTtWQSjujC7xjPc8wL6eU= 
-github.com/nats-io/nats-server/v2 v2.1.2/go.mod h1:Afk+wRZqkMQs/p45uXdrVLuab3gwv3Z8C4HTBu8GD/k= -github.com/nats-io/nats.go v1.9.1/go.mod h1:ZjDU1L/7fJ09jvUSRVBR2e7+RnLiiIQyqyzEE/Zbp4w= -github.com/nats-io/nats.go v1.10.0 h1:L8qnKaofSfNFbXg0C5F71LdjPRnmQwSsA4ukmkt1TvY= -github.com/nats-io/nats.go v1.10.0/go.mod h1:AjGArbfyR50+afOUotNX2Xs5SYHf+CoOa5HH1eEl2HE= -github.com/nats-io/nkeys v0.1.0/go.mod h1:xpnFELMwJABBLVhffcfd1MZx6VsNRFpEugbxziKVo7w= -github.com/nats-io/nkeys v0.1.3/go.mod h1:xpnFELMwJABBLVhffcfd1MZx6VsNRFpEugbxziKVo7w= -github.com/nats-io/nkeys v0.1.4 h1:aEsHIssIk6ETN5m2/MD8Y4B2X7FfXrBAUdkyRvbVYzA= -github.com/nats-io/nkeys v0.1.4/go.mod h1:XdZpAbhgyyODYqjTawOnIOI7VlbKSarI9Gfy1tqEu/s= +github.com/mattn/go-isatty v0.0.12/go.mod h1:cbi8OIDigv2wuxKPP5vlRcQ1OAZbq2CE4Kysco4FUpU= +github.com/minio/highwayhash v1.0.1 h1:dZ6IIu8Z14VlC0VpfKofAhCy74wu/Qb5gcn52yWoz/0= +github.com/minio/highwayhash v1.0.1/go.mod h1:BQskDq+xkJ12lmlUUi7U0M5Swg3EWR+dLTk+kldvVxY= +github.com/nats-io/jwt/v2 v2.2.1-0.20220113022732-58e87895b296 h1:vU9tpM3apjYlLLeY23zRWJ9Zktr5jp+mloR942LEOpY= +github.com/nats-io/jwt/v2 v2.2.1-0.20220113022732-58e87895b296/go.mod h1:0tqz9Hlu6bCBFLWAASKhE5vUA4c24L9KPUUgvwumE/k= +github.com/nats-io/nats-server/v2 v2.7.0 h1:UpqcAM93FI7AHlCyI2FD5QcV3QuHNCauQF2LBVU0238= +github.com/nats-io/nats-server/v2 v2.7.0/go.mod h1:cjxtMhZsZovK1XS2iiapCduR8HuqB/YpFamL0qntIcw= +github.com/nats-io/nats.go v1.13.1-0.20211122170419-d7c1d78a50fc h1:SHr4MUUZJ/fAC0uSm2OzWOJYsHpapmR86mpw7q1qPXU= +github.com/nats-io/nats.go v1.13.1-0.20211122170419-d7c1d78a50fc/go.mod h1:BPko4oXsySz4aSWeFgOHLZs3G4Jq4ZAyE6/zMCxRT6w= +github.com/nats-io/nkeys v0.3.0 h1:cgM5tL53EvYRU+2YLXIK0G2mJtK12Ft9oeooSZMA2G8= +github.com/nats-io/nkeys v0.3.0/go.mod h1:gvUNGjVcM2IPr5rCsRsC6Wb3Hr2CQAm08dsxtV6A5y4= github.com/nats-io/nuid v1.0.1 h1:5iA8DT8V7q8WK2EScv2padNa/rTESc1KdnPw4TC2paw= github.com/nats-io/nuid v1.0.1/go.mod h1:19wcPz3Ph3q0Jbyiqsd0kePYG7A95tJPxeL+1OSON2c= -github.com/oklog/oklog v0.3.2/go.mod 
h1:FCV+B7mhrz4o+ueLpx+KqkyXRGMWOYEvfiXtdGtbWGs= -github.com/oklog/run v1.0.0/go.mod h1:dlhp/R75TPv97u0XWUtDeV/lRKWPKSdTuV0TZvrmrQA= -github.com/olekukonko/tablewriter v0.0.0-20170122224234-a0225b3f23b5/go.mod h1:vsDQFd/mU46D+Z4whnwzcISnGGzXWMclvtLoiIKAKIo= -github.com/onsi/ginkgo v1.6.0/go.mod h1:lLunBs/Ym6LB5Z9jYTR76FiuTmxDTDusOGeTQH+WWjE= -github.com/onsi/ginkgo v1.7.0/go.mod h1:lLunBs/Ym6LB5Z9jYTR76FiuTmxDTDusOGeTQH+WWjE= -github.com/onsi/gomega v1.4.3/go.mod h1:ex+gbHU/CVuBBDIJjb2X0qEXbFg53c61hWP/1CpauHY= -github.com/op/go-logging v0.0.0-20160315200505-970db520ece7/go.mod h1:HzydrMdWErDVzsI23lYNej1Htcns9BCg93Dk0bBINWk= -github.com/opentracing-contrib/go-observer v0.0.0-20170622124052-a52f23424492/go.mod h1:Ngi6UdF0k5OKD5t5wlmGhe/EDKPoUM3BXZSSfIuJbis= -github.com/opentracing/basictracer-go v1.0.0/go.mod h1:QfBfYuafItcjQuMwinw9GhYKwFXS9KnPs5lxoYwgW74= -github.com/opentracing/opentracing-go v1.0.2/go.mod h1:UkNAQd3GIcIGf0SeVgPpRdFStlNbqXla1AfSYxPUl2o= -github.com/opentracing/opentracing-go v1.1.0/go.mod h1:UkNAQd3GIcIGf0SeVgPpRdFStlNbqXla1AfSYxPUl2o= -github.com/openzipkin-contrib/zipkin-go-opentracing v0.4.5/go.mod h1:/wsWhb9smxSfWAKL3wpBW7V8scJMt8N8gnaMCS9E/cA= -github.com/openzipkin/zipkin-go v0.1.6/go.mod h1:QgAqvLzwWbR/WpD4A3cGpPtJrZXNIiJc5AZX7/PBEpw= -github.com/openzipkin/zipkin-go v0.2.1/go.mod h1:NaW6tEwdmWMaCDZzg8sh+IBNOxHMPnhQw8ySjnjRyN4= -github.com/openzipkin/zipkin-go v0.2.2/go.mod h1:NaW6tEwdmWMaCDZzg8sh+IBNOxHMPnhQw8ySjnjRyN4= -github.com/pact-foundation/pact-go v1.0.4/go.mod h1:uExwJY4kCzNPcHRj+hCR/HBbOOIwwtUjcrb0b5/5kLM= -github.com/pascaldekloe/goe v0.0.0-20180627143212-57f6aae5913c/go.mod h1:lzWF7FIEvWOWxwDKqyGYQf6ZUaNfKdP144TG7ZOy1lc= -github.com/pborman/uuid v1.2.0/go.mod h1:X/NO0urCmaxf9VXbdlT7C2Yzkj2IKimNn4k+gtPdI/k= -github.com/performancecopilot/speed v3.0.0+incompatible/go.mod h1:/CLtqpZ5gBg1M9iaPbIdPPGyKcA8hKdoy6hAWba7Yac= -github.com/pierrec/lz4 v1.0.2-0.20190131084431-473cd7ce01a1/go.mod h1:3/3N9NVKO0jef7pBehbT1qWhCMrIgbYNnFAZCqQ5LRc= 
-github.com/pierrec/lz4 v2.0.5+incompatible/go.mod h1:pdkljMzZIN41W+lC3N2tnIh5sFi+IEE17M5jbnwPHcY= -github.com/pkg/errors v0.8.0/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0= github.com/pkg/errors v0.8.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0= github.com/pkg/errors v0.9.1 h1:FEBLx1zS214owpjy7qsBeixbURkuhQAwrK5UwLGTwt4= github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0= -github.com/pkg/profile v1.2.1/go.mod h1:hJw3o1OdXxsrSjjVksARp5W95eeEaEfptyVZyv6JUPA= +github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= -github.com/posener/complete v1.1.1/go.mod h1:em0nMJCgc9GFtwrmVmEMR/ZL6WyhyjMBndrE9hABlRI= -github.com/prometheus/client_golang v0.9.1/go.mod h1:7SWBe2y4D6OKWSNQJUaRYU/AaXPKyh/dDVn+NZz0KFw= -github.com/prometheus/client_golang v0.9.3-0.20190127221311-3c4408c8b829/go.mod h1:p2iRAGwDERtqlqzRXnrOVns+ignqQo//hLXqYxZYVNs= -github.com/prometheus/client_golang v1.0.0/go.mod h1:db9x61etRT2tGnBNRi70OPL5FsnadC4Ky3P0J6CfImo= -github.com/prometheus/client_golang v1.3.0/go.mod h1:hJaj2vgQTGQmVCsAACORcieXFeDPbaTKGT+JTgUa3og= -github.com/prometheus/client_golang v1.7.1/go.mod h1:PY5Wy2awLA44sXw4AOSfFBetzPP4j5+D6mVACh+pe2M= -github.com/prometheus/client_golang v1.10.0 h1:/o0BDeWzLWXNZ+4q5gXltUvaMpJqckTa+jTNoB+z4cg= -github.com/prometheus/client_golang v1.10.0/go.mod h1:WJM3cc3yu7XKBKa/I8WeZm+V3eltZnBwfENSU7mdogU= -github.com/prometheus/client_model v0.0.0-20180712105110-5c3871d89910/go.mod h1:MbSGuTsp3dbXC40dX6PRTWyKYBIrTGTE9sqQNg2J8bo= -github.com/prometheus/client_model v0.0.0-20190115171406-56726106282f/go.mod h1:MbSGuTsp3dbXC40dX6PRTWyKYBIrTGTE9sqQNg2J8bo= -github.com/prometheus/client_model v0.0.0-20190129233127-fd36f4220a90/go.mod h1:xMI15A0UPsDsEKsMN9yxemIoYk6Tm2C1GtYGdfGttqA= -github.com/prometheus/client_model v0.0.0-20190812154241-14fe0d1b01d4/go.mod 
h1:xMI15A0UPsDsEKsMN9yxemIoYk6Tm2C1GtYGdfGttqA= -github.com/prometheus/client_model v0.1.0/go.mod h1:xMI15A0UPsDsEKsMN9yxemIoYk6Tm2C1GtYGdfGttqA= -github.com/prometheus/client_model v0.2.0 h1:uq5h0d+GuxiXLJLNABMgp2qUWDPiLvgCzz2dUR+/W/M= -github.com/prometheus/client_model v0.2.0/go.mod h1:xMI15A0UPsDsEKsMN9yxemIoYk6Tm2C1GtYGdfGttqA= -github.com/prometheus/common v0.2.0/go.mod h1:TNfzLD0ON7rHzMJeJkieUDPYmFC7Snx/y86RQel1bk4= -github.com/prometheus/common v0.4.1/go.mod h1:TNfzLD0ON7rHzMJeJkieUDPYmFC7Snx/y86RQel1bk4= -github.com/prometheus/common v0.7.0/go.mod h1:DjGbpBbp5NYNiECxcL/VnbXCCaQpKd3tt26CguLLsqA= -github.com/prometheus/common v0.10.0/go.mod h1:Tlit/dnDKsSWFlCLTWaA1cyBgKHSMdTB80sz/V91rCo= -github.com/prometheus/common v0.18.0 h1:WCVKW7aL6LEe1uryfI9dnEc2ZqNB1Fn0ok930v0iL1Y= -github.com/prometheus/common v0.18.0/go.mod h1:U+gB1OBLb1lF3O42bTCL+FK18tX9Oar16Clt/msog/s= -github.com/prometheus/procfs v0.0.0-20181005140218-185b4288413d/go.mod h1:c3At6R/oaqEKCNdg8wHV1ftS6bRYblBhIjjI8uT2IGk= -github.com/prometheus/procfs v0.0.0-20190117184657-bf6a532e95b1/go.mod h1:c3At6R/oaqEKCNdg8wHV1ftS6bRYblBhIjjI8uT2IGk= -github.com/prometheus/procfs v0.0.2/go.mod h1:TjEm7ze935MbeOT/UhFTIMYKhuLP4wbCsTZCD3I8kEA= -github.com/prometheus/procfs v0.0.8/go.mod h1:7Qr8sr6344vo1JqZ6HhLceV9o3AJ1Ff+GxbHq6oeK9A= -github.com/prometheus/procfs v0.1.3/go.mod h1:lV6e/gmhEcM9IjHGsFOCxxuZ+z1YqCvr4OA4YeYWdaU= -github.com/prometheus/procfs v0.6.0 h1:mxy4L2jP6qMonqmq+aTtOx1ifVWUgG/TAmntgbh3xv4= -github.com/prometheus/procfs v0.6.0/go.mod h1:cz+aTbrPOrUb4q7XlbU9ygM+/jj0fzG6c1xBZuNvfVA= -github.com/rcrowley/go-metrics v0.0.0-20181016184325-3113b8401b8a/go.mod h1:bCqnVzQkZxMG4s8nGwiZ5l3QUCyqpo9Y+/ZMZ9VjZe4= -github.com/rogpeppe/fastuuid v0.0.0-20150106093220-6724a57986af/go.mod h1:XWv6SoW27p1b0cqNHllgS5HIMJraePCO15w5zCzIWYg= -github.com/rogpeppe/go-internal v1.3.0/go.mod h1:M8bDsm7K2OlrFYOpmOWEs/qY81heoFRclV5y23lUDJ4= -github.com/russross/blackfriday/v2 v2.0.1/go.mod 
h1:+Rmxgy9KzJVeS9/2gXHxylqXiyQDYRxCVz55jmeOWTM= -github.com/ryanuber/columnize v0.0.0-20160712163229-9b3edd62028f/go.mod h1:sm1tb6uqfes/u+d4ooFouqFdy9/2g9QGwK3SQygK0Ts= -github.com/samuel/go-zookeeper v0.0.0-20190923202752-2cc03de413da/go.mod h1:gi+0XIa01GRL2eRQVjQkKGqKF3SF9vZR/HnPullcV2E= -github.com/sean-/seed v0.0.0-20170313163322-e2103e2c3529/go.mod h1:DxrIzT+xaE7yg65j358z/aeFdxmN0P9QXhEzd20vsDc= -github.com/shurcooL/sanitized_anchor_name v1.0.0/go.mod h1:1NzhyTcUVG4SuEtjjoZeVRXNmyL/1OwPU0+IJeTBvfc= -github.com/sirupsen/logrus v1.2.0/go.mod h1:LxeOpSwHxABJmUn/MG1IvRgCAasNZTLOkJPxbbu5VWo= -github.com/sirupsen/logrus v1.4.2/go.mod h1:tLMulIdttU9McNUspp0xgXVQah82FyeX6MwdIuYE2rE= -github.com/sirupsen/logrus v1.6.0/go.mod h1:7uNnSEd1DgxDLC74fIahvMZmmYsHGZGEOFrfsX/uA88= -github.com/smartystreets/assertions v0.0.0-20180927180507-b2de0cb4f26d/go.mod h1:OnSkiWE9lh6wB0YB77sQom3nweQdgAjqCqsofrRNTgc= -github.com/smartystreets/goconvey v1.6.4/go.mod h1:syvi0/a8iFYH4r/RixwvyeAJjdLS9QV7WQ/tjFTllLA= -github.com/soheilhy/cmux v0.1.4/go.mod h1:IM3LyeVVIOuxMH7sFAkER9+bJ4dT7Ms6E4xg4kGIyLM= -github.com/sony/gobreaker v0.4.1/go.mod h1:ZKptC7FHNvhBz7dN2LGjPVBz2sZJmc0/PkyDJOjmxWY= -github.com/spf13/cobra v0.0.3/go.mod h1:1l0Ry5zgKvJasoi3XT1TypsSe7PqH0Sj9dhYf7v3XqQ= -github.com/spf13/pflag v1.0.1/go.mod h1:DYY7MBk1bdzusC3SYhjObp+wFpr4gzcvqqNjLnInEg4= -github.com/streadway/amqp v0.0.0-20190404075320-75d898a42a94/go.mod h1:AZpEONHx3DKn8O/DFsRAY58/XVQiIPMTMB1SddzLXVw= -github.com/streadway/amqp v0.0.0-20190827072141-edfb9018d271/go.mod h1:AZpEONHx3DKn8O/DFsRAY58/XVQiIPMTMB1SddzLXVw= -github.com/streadway/handy v0.0.0-20190108123426-d5acb3125c2a/go.mod h1:qNTQ5P5JnDBl6z3cMAg/SywNDC5ABu5ApDIw6lUbRmI= github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= -github.com/stretchr/objx v0.1.1/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= -github.com/stretchr/testify v1.2.2/go.mod h1:a8OnRcib4nhh0OaRAV+Yts87kKdq0PP7pXfy6kDkUVs= github.com/stretchr/testify 
v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI= github.com/stretchr/testify v1.4.0/go.mod h1:j7eGeouHqKxXV5pUuKE4zz7dFj8WfuZ+81PSLYec5m4= +github.com/stretchr/testify v1.5.1 h1:nOGnQDM7FYENwehXlg/kFVnos3rEvtKTjRvOWSzb6H4= github.com/stretchr/testify v1.5.1/go.mod h1:5W2xD1RspED5o8YsWQXVCued0rvSQ+mT+I5cxcmMvtA= -github.com/tmc/grpc-websocket-proxy v0.0.0-20170815181823-89b8d40f7ca8/go.mod h1:ncp9v5uamzpCO7NfCPTXjqaC+bZgJeR0sMTm6dMHP7U= -github.com/urfave/cli v1.20.0/go.mod h1:70zkFmudgCuE/ngEzBv17Jvp/497gISqfk5gWijbERA= -github.com/urfave/cli v1.22.1/go.mod h1:Gos4lmkARVdJ6EkW0WaNv/tZAAMe9V7XWyB60NtXRu0= github.com/valyala/bytebufferpool v1.0.0/go.mod h1:6bBcMArwyJ5K/AmCkWv1jt77kVWyCJ6HpOuEn7z0Csc= github.com/valyala/fasttemplate v1.0.1/go.mod h1:UQGH1tvbgY+Nz5t2n7tXsz52dQxojPUpymEIMZ47gx8= -github.com/valyala/fasttemplate v1.1.0/go.mod h1:UQGH1tvbgY+Nz5t2n7tXsz52dQxojPUpymEIMZ47gx8= -github.com/xiang90/probing v0.0.0-20190116061207-43a291ad63a2/go.mod h1:UETIi67q53MR2AWcXfiuqkDkRtnGDLqkBTpCHuJHxtU= -go.etcd.io/bbolt v1.3.3/go.mod h1:IbVyRI1SCnLcuJnV2u8VeU0CEYM7e686BmAb1XKL+uU= -go.etcd.io/etcd v0.0.0-20191023171146-3cf2f69b5738/go.mod h1:dnLIgRNXwCJa5e+c6mIZCrds/GIG4ncV9HhK5PX7jPg= -go.opencensus.io v0.20.1/go.mod h1:6WKK9ahsWS3RSO+PY9ZHZUfv2irvY6gN279GOPZjmmk= -go.opencensus.io v0.20.2/go.mod h1:6WKK9ahsWS3RSO+PY9ZHZUfv2irvY6gN279GOPZjmmk= -go.opencensus.io v0.22.2/go.mod h1:yxeiOL68Rb0Xd1ddK5vPZ/oVn4vY4Ynel7k9FzqtOIw= -go.uber.org/atomic v1.3.2/go.mod h1:gD2HeocX3+yG+ygLZcrzQJaqmWj9AIm7n08wl/qW/PE= -go.uber.org/atomic v1.5.0/go.mod h1:sABNBOSYdrvTF6hTgEIbc7YasKWGhgEQZyfxyTvoXHQ= -go.uber.org/multierr v1.1.0/go.mod h1:wR5kodmAFQ0UK8QlbwjlSNy0Z68gJhDJUG5sjR94q/0= -go.uber.org/multierr v1.3.0/go.mod h1:VgVr7evmIr6uPjLBxg28wmKNXyqE9akIJ5XnfpiKl+4= -go.uber.org/tools v0.0.0-20190618225709-2cfd321de3ee/go.mod h1:vJERXedbb3MVM5f9Ejo0C68/HhF8uaILCdgjnY+goOA= -go.uber.org/zap v1.10.0/go.mod h1:vwi/ZaCAaUcBkycHslxD9B2zi4UTXhF60s6SWpuDF0Q= -go.uber.org/zap 
v1.13.0/go.mod h1:zwrFLgMcdUuIBviXEYEH1YKNaOBnKXsx2IPda5bBwHM= -golang.org/x/crypto v0.0.0-20180904163835-0709b304e793/go.mod h1:6SG95UA2DQfeDnfUPMdvaQW0Q7yPrPDi9nlGo2tz2b4= -golang.org/x/crypto v0.0.0-20181029021203-45a5f77698d3/go.mod h1:6SG95UA2DQfeDnfUPMdvaQW0Q7yPrPDi9nlGo2tz2b4= +github.com/valyala/fasttemplate v1.2.1/go.mod h1:KHLXt3tVN2HBp8eijSv/kGJopbvo7S+qRAEEKiv+SiQ= golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= -golang.org/x/crypto v0.0.0-20190510104115-cbcb75029529/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI= -golang.org/x/crypto v0.0.0-20190701094942-4def268fd1a4/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI= -golang.org/x/crypto v0.0.0-20191011191535-87dc89f01550/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI= -golang.org/x/crypto v0.0.0-20191112222119-e1110fd1c708/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto= -golang.org/x/crypto v0.0.0-20200323165209-0ec3e9974c59 h1:3zb4D3T4G8jdExgVU/95+vQXfpEPiMdCaZgmGVxjNHM= -golang.org/x/crypto v0.0.0-20200323165209-0ec3e9974c59/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto= -golang.org/x/crypto v0.0.0-20200622213623-75b288015ac9 h1:psW17arqaxU48Z5kZ0CQnkZWQJsqcURM6tKiBApRjXI= golang.org/x/crypto v0.0.0-20200622213623-75b288015ac9/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto= -golang.org/x/exp v0.0.0-20190121172915-509febef88a4/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA= -golang.org/x/lint v0.0.0-20181026193005-c67002cb31c3/go.mod h1:UVdnD1Gm6xHRNCYTkRU2/jEulfH38KcIWyp/GAMgvoE= -golang.org/x/lint v0.0.0-20190227174305-5b3e6a55c961/go.mod h1:wehouNa3lNwaWXcvxsM5YxQ5yQlVC4a0KAMCusXpPoU= -golang.org/x/lint v0.0.0-20190301231843-5614ed5bae6f/go.mod h1:UVdnD1Gm6xHRNCYTkRU2/jEulfH38KcIWyp/GAMgvoE= -golang.org/x/lint v0.0.0-20190313153728-d0100b6bd8b3/go.mod h1:6SW0HCj/g11FgYtHlgUYUwCkIfeOF89ocIRzGO/8vkc= -golang.org/x/lint v0.0.0-20190930215403-16217165b5de/go.mod 
h1:6SW0HCj/g11FgYtHlgUYUwCkIfeOF89ocIRzGO/8vkc= -golang.org/x/mod v0.0.0-20190513183733-4bf6d317e70e/go.mod h1:mXi4GBBbnImb6dmsKGUJ2LatrhH/nqhxcFungHvyanc= -golang.org/x/mod v0.1.1-0.20191105210325-c90efee705ee/go.mod h1:QqPTAvyqsEbceGzBzNggFXnrqF1CaUcvgkdR5Ot7KZg= -golang.org/x/net v0.0.0-20180724234803-3673e40ba225/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= -golang.org/x/net v0.0.0-20180826012351-8a410e7b638d/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= -golang.org/x/net v0.0.0-20180906233101-161cd47e91fd/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= -golang.org/x/net v0.0.0-20181023162649-9b4f9f5ad519/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= -golang.org/x/net v0.0.0-20181114220301-adae6a3d119a/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= -golang.org/x/net v0.0.0-20181201002055-351d144fa1fc/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= -golang.org/x/net v0.0.0-20181220203305-927f97764cc3/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= -golang.org/x/net v0.0.0-20190108225652-1e06a53dbb7e/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= -golang.org/x/net v0.0.0-20190125091013-d26f9f9a57f3/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= -golang.org/x/net v0.0.0-20190213061140-3a22650c66bd/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= -golang.org/x/net v0.0.0-20190311183353-d8887717615a/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg= +golang.org/x/crypto v0.0.0-20200820211705-5c72a883971a/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto= +golang.org/x/crypto v0.0.0-20201221181555-eec23a3978ad/go.mod h1:jdWPYTVW3xRLrWPugEBEK3UY2ZEsg3UU495nc5E+M+I= +golang.org/x/crypto v0.0.0-20210314154223-e6e6c4f2bb5b/go.mod h1:T9bdIzuCu7OtxOm1hfPfRQxPLYneinmdGuTeoZ9dtd4= +golang.org/x/crypto v0.0.0-20220112180741-5e0467b6c7ce h1:Roh6XWxHFKrPgC/EQhVubSAGQ6Ozk6IdxHSzt1mR0EI= +golang.org/x/crypto v0.0.0-20220112180741-5e0467b6c7ce/go.mod 
h1:IxCIyHEi3zRg3s0A5j5BB6A9Jmi73HwBIUl50j+osU4= golang.org/x/net v0.0.0-20190404232315-eb5bcb51f2a3/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg= -golang.org/x/net v0.0.0-20190603091049-60506f45cf65/go.mod h1:HSz+uSET+XFnRR8LxR5pz3Of3rY3CfYBVs4xY44aLks= -golang.org/x/net v0.0.0-20190613194153-d28f0bde5980/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= -golang.org/x/net v0.0.0-20190813141303-74dc4d7220e7/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= -golang.org/x/net v0.0.0-20191112182307-2180aed22343 h1:00ohfJ4K98s3m6BGUoBd8nyfp4Yl0GoIKvw5abItTjI= -golang.org/x/net v0.0.0-20191112182307-2180aed22343/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= -golang.org/x/net v0.0.0-20200625001655-4c5254603344 h1:vGXIOMxbNfDTk/aXCmfdLgkrSV+Z2tcbze+pEc3v5W4= -golang.org/x/net v0.0.0-20200625001655-4c5254603344/go.mod h1:/O7V0waA8r7cgGh81Ro3o1hOxt32SMVPicZroKQ2sZA= -golang.org/x/oauth2 v0.0.0-20180821212333-d2e6202438be/go.mod h1:N/0e6XlmueqKjAGxoOufVs8QHGRruUQn6yWY3a++T0U= -golang.org/x/oauth2 v0.0.0-20190226205417-e64efc72b421/go.mod h1:gOpvHmFTYa4IltrdGE7lF6nIHvwfUNPOp7c8zoXwtLw= -golang.org/x/sync v0.0.0-20180314180146-1d60e4601c6f/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= -golang.org/x/sync v0.0.0-20181108010431-42b317875d0f/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= -golang.org/x/sync v0.0.0-20181221193216-37e7f081c4d4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= -golang.org/x/sync v0.0.0-20190227155943-e225da77a7e6/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/net v0.0.0-20200822124328-c89045814202/go.mod h1:/O7V0waA8r7cgGh81Ro3o1hOxt32SMVPicZroKQ2sZA= +golang.org/x/net v0.0.0-20210119194325-5f4716e94777/go.mod h1:m0MpNAwzfU5UDzcl9v0D8zg8gWTRqZa9RBIspLL5mdg= +golang.org/x/net v0.0.0-20210226172049-e18ecbb05110/go.mod h1:m0MpNAwzfU5UDzcl9v0D8zg8gWTRqZa9RBIspLL5mdg= 
+golang.org/x/net v0.0.0-20211112202133-69e39bad7dc2 h1:CIJ76btIcR3eFI5EgSo6k1qKw9KJexJuRLI9G7Hp5wE= +golang.org/x/net v0.0.0-20211112202133-69e39bad7dc2/go.mod h1:9nx3DQGgdP8bBQD5qxJ1jj9UTztislL4KSBs9R2vV5Y= golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= -golang.org/x/sync v0.0.0-20190911185100-cd5d95a43a6e/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= -golang.org/x/sync v0.0.0-20201207232520-09787c993a3a/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= -golang.org/x/sys v0.0.0-20180823144017-11551d06cbcc/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= -golang.org/x/sys v0.0.0-20180830151530-49385e6e1522/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= -golang.org/x/sys v0.0.0-20180905080454-ebe1bf3edb33/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= -golang.org/x/sys v0.0.0-20180909124046-d0be0721c37e/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= -golang.org/x/sys v0.0.0-20181026203630-95b1ffbd15a5/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= -golang.org/x/sys v0.0.0-20181107165924-66b7b1311ac8/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= -golang.org/x/sys v0.0.0-20181116152217-5ac8a444bdc5/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= -golang.org/x/sys v0.0.0-20181122145206-62eef0e2fa9b/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= +golang.org/x/sys v0.0.0-20190130150945-aca44879d564/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/sys v0.0.0-20190222072716-a9d3bda3a223/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= -golang.org/x/sys v0.0.0-20190422165155-953cdadca894/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= -golang.org/x/sys v0.0.0-20190502145724-3ef323f4f1fd/go.mod 
h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= -golang.org/x/sys v0.0.0-20190726091711-fc99dfbffb4e/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20190813064441-fde4db37ae7a/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= -golang.org/x/sys v0.0.0-20190826190057-c7b8b68b1456/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= -golang.org/x/sys v0.0.0-20191008105621-543471e840be/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= -golang.org/x/sys v0.0.0-20191115151921-52ab43148777/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= -golang.org/x/sys v0.0.0-20191220142924-d4481acd189f/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= -golang.org/x/sys v0.0.0-20200106162015-b016eb3dc98e/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20191026070338-33540a1f6037/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20200116001909-b77594299b42/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20200223170610-d5e6a3e2c0ae/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20200323222414-85ca7c5b95cd/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= -golang.org/x/sys v0.0.0-20200615200032-f1bc736245b1/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= -golang.org/x/sys v0.0.0-20200625212154-ddb9806d33ae/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20200826173525-f9321e4c35a6/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20210124154548-22da62e12c0c/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= -golang.org/x/sys v0.0.0-20210309074719-68d13333faf2 h1:46ULzRKLh1CwgRq2dC5SlBzEqqNCi8rreOZnNrbqcIY= -golang.org/x/sys v0.0.0-20210309074719-68d13333faf2/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys 
v0.0.0-20210423082822-04245dca01da/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20210615035016-665e8c7367d1/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.0.0-20220111092808-5a964db01320/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.0.0-20220114195835-da31bd327af9 h1:XfKQ4OlFl8okEOr5UvAqFRVj8pY/4yfcXrddB8qAbU0= +golang.org/x/sys v0.0.0-20220114195835-da31bd327af9/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/term v0.0.0-20201117132131-f5c789dd3221/go.mod h1:Nr5EML6q2oocZ2LXRh80K7BxOlk5/8JxuGnuhpl+muw= +golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo= golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= -golang.org/x/text v0.3.2/go.mod h1:bEr9sfX3Q8Zfm5fL9x+3itogRgK3+ptLWKqgva+5dAk= -golang.org/x/time v0.0.0-20180412165947-fbb02b2291d2/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ= -golang.org/x/time v0.0.0-20191024005414-555d28b269f0/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ= -golang.org/x/tools v0.0.0-20180221164845-07fd8470d635/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= -golang.org/x/tools v0.0.0-20180828015842-6cd1fcedba52/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= +golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= +golang.org/x/text v0.3.5/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= +golang.org/x/text v0.3.6/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= +golang.org/x/time v0.0.0-20201208040808-7e3f01d25324/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ= +golang.org/x/time v0.0.0-20210220033141-f8bda1e9f3ba/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ= +golang.org/x/time v0.0.0-20211116232009-f0f3c7e86c11 h1:GZokNIeuVkl3aZHJchRrr13WCsols02MLUcz1U9is6M= +golang.org/x/time v0.0.0-20211116232009-f0f3c7e86c11/go.mod 
h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ= golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= -golang.org/x/tools v0.0.0-20190114222345-bf090417da8b/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= -golang.org/x/tools v0.0.0-20190226205152-f727befe758c/go.mod h1:9Yl7xja0Znq3iFh3HoIrodX9oNMXvdceNzlUR8zjMvY= -golang.org/x/tools v0.0.0-20190311212946-11955173bddd/go.mod h1:LCzVGOaR6xXOjkQ3onu1FJEFr0SW1gC7cKk1uF8kGRs= -golang.org/x/tools v0.0.0-20190312170243-e65039ee4138/go.mod h1:LCzVGOaR6xXOjkQ3onu1FJEFr0SW1gC7cKk1uF8kGRs= -golang.org/x/tools v0.0.0-20190328211700-ab21143f2384/go.mod h1:LCzVGOaR6xXOjkQ3onu1FJEFr0SW1gC7cKk1uF8kGRs= -golang.org/x/tools v0.0.0-20190524140312-2c0ae7006135/go.mod h1:RgjU9mgBXZiqYHBnxXauZ1Gv1EHHAz9KjViQ78xBX0Q= -golang.org/x/tools v0.0.0-20190621195816-6e04913cbbac/go.mod h1:/rFqwRUd4F7ZHNgwSSTFct+R/Kf4OFW1sUzUTQQTgfc= -golang.org/x/tools v0.0.0-20191029041327-9cc4af7d6b2c/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= -golang.org/x/tools v0.0.0-20191029190741-b9c20aec41a5/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= golang.org/x/tools v0.0.0-20191125144606-a911d9008d1f/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= -golang.org/x/tools v0.0.0-20200103221440-774c71fcf114/go.mod h1:TB2adYChydJhpapKDTa4BR/hXlZSLoq2Wpct/0txZ28= golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= -golang.org/x/xerrors v0.0.0-20191011141410-1b5146add898/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= -google.golang.org/api v0.3.1/go.mod h1:6wY9I6uQWHQ8EM57III9mq/AjF+i8G65rmVagqKMtkk= -google.golang.org/appengine v1.1.0/go.mod h1:EbEs0AVv82hx2wNQdGPgUI5lhzA/G0D9YwlJXL52JkM= -google.golang.org/appengine v1.2.0/go.mod h1:xpcJRLb0r/rnEns0DIKYYv+WjYCduHsrkT7/EB5XEv4= -google.golang.org/appengine 
v1.4.0/go.mod h1:xpcJRLb0r/rnEns0DIKYYv+WjYCduHsrkT7/EB5XEv4= -google.golang.org/genproto v0.0.0-20180817151627-c66870c02cf8/go.mod h1:JiN7NxoALGmiZfu7CAH4rXhgtRTLTxftemlI0sWmxmc= -google.golang.org/genproto v0.0.0-20190307195333-5fe7a883aa19/go.mod h1:VzzqZJRnGkLBvHegQrXjBqPurQTc5/KpmUdxsrq26oE= -google.golang.org/genproto v0.0.0-20190425155659-357c62f0e4bb/go.mod h1:VzzqZJRnGkLBvHegQrXjBqPurQTc5/KpmUdxsrq26oE= -google.golang.org/genproto v0.0.0-20190530194941-fb225487d101/go.mod h1:z3L6/3dTEVtUr6QSP8miRzeRqwQOioJ9I66odjN4I7s= -google.golang.org/genproto v0.0.0-20190819201941-24fa4b261c55/go.mod h1:DMBHOl98Agz4BDEuKkezgsaosCRResVns1a3J2ZsMNc= -google.golang.org/grpc v1.17.0/go.mod h1:6QZJwpn2B+Zp71q/5VxRsJ6NXXVCE5NRUHRo+f3cWCs= -google.golang.org/grpc v1.19.0/go.mod h1:mqu4LbDTu4XGKhr4mRzUsmM4RtVoemTSY81AxZiDr8c= -google.golang.org/grpc v1.20.0/go.mod h1:chYK+tFQF0nDUGJgXMSgLCQk3phJEuONr2DCgLDdAQM= -google.golang.org/grpc v1.20.1/go.mod h1:10oTOabMzJvdu6/UiuZezV6QK5dSlG84ov/aaiqXj38= -google.golang.org/grpc v1.21.0/go.mod h1:oYelfM1adQP15Ek0mdvEgi9Df8B9CZIaU1084ijfRaM= -google.golang.org/grpc v1.22.1/go.mod h1:Y5yQAOtifL1yxbo5wqy6BxZv8vAUGQwXBOALyacEbxg= -google.golang.org/grpc v1.23.0/go.mod h1:Y5yQAOtifL1yxbo5wqy6BxZv8vAUGQwXBOALyacEbxg= -google.golang.org/grpc v1.23.1/go.mod h1:Y5yQAOtifL1yxbo5wqy6BxZv8vAUGQwXBOALyacEbxg= -google.golang.org/grpc v1.26.0/go.mod h1:qbnxyOmOxrQa7FizSgH+ReBfzJrCY1pSN7KXBS8abTk= google.golang.org/protobuf v0.0.0-20200109180630-ec00e32a8dfd/go.mod h1:DFci5gLYBciE7Vtevhsrf46CRTquxDuWsQurQQe4oz8= google.golang.org/protobuf v0.0.0-20200221191635-4d8936d0db64/go.mod h1:kwYJMbMJ01Woi6D6+Kah6886xMZcty6N08ah7+eCXa0= google.golang.org/protobuf v0.0.0-20200228230310-ab0ca4ff8a60/go.mod h1:cfTl7dwQJ+fmap5saPgwCLgHXTUD7jkjRqWcaiX5VyM= google.golang.org/protobuf v1.20.1-0.20200309200217-e05f789c0967/go.mod h1:A+miEFZTKqfCUM6K7xSMQL9OKL/b6hQv+e19PK+JZNE= google.golang.org/protobuf v1.21.0/go.mod h1:47Nbq4nVaFHyn7ilMalzfO3qCViNmqZ2kzikPIcrTAo= 
-google.golang.org/protobuf v1.23.0 h1:4MY060fB1DLGMB/7MBTLnwQUY6+F09GEiz6SsrNqyzM= google.golang.org/protobuf v1.23.0/go.mod h1:EGpADcykh3NcUnDUJcl1+ZksZNG86OlYog2l/sGQquU= +google.golang.org/protobuf v1.26.0-rc.1/go.mod h1:jlhhOSvTdKEhbULTjvd4ARK9grFBp09yW+WbY/TyQbw= +google.golang.org/protobuf v1.26.0/go.mod h1:9q0QmTI4eRPtz6boOQmLYwt+qCgq0jsYwAQnmE0givc= +google.golang.org/protobuf v1.27.1 h1:SnqbnDw1V7RiZcXPx5MEeqPv2s79L9i7BJUlG/+RurQ= +google.golang.org/protobuf v1.27.1/go.mod h1:9q0QmTI4eRPtz6boOQmLYwt+qCgq0jsYwAQnmE0givc= gopkg.in/Knetic/govaluate.v2 v2.3.0 h1:naJVc9CZlWA8rC8f5mvECJD7jreTrn7FvGXjBthkHJQ= gopkg.in/Knetic/govaluate.v2 v2.3.0/go.mod h1:NW0gr10J8s7aNghEg6uhdxiEaBvc0+8VgJjVViHUKp4= -gopkg.in/alecthomas/kingpin.v2 v2.2.6/go.mod h1:FMv+mEhP44yOT+4EoQTLFTRgOQ1FBLkstjWtayDeSgw= gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= +gopkg.in/check.v1 v1.0.0-20180628173108-788fd7840127 h1:qIbj1fsPNlZgppZ+VLlY7N33q108Sa+fhmuc+sWQYwY= gopkg.in/check.v1 v1.0.0-20180628173108-788fd7840127/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= -gopkg.in/check.v1 v1.0.0-20190902080502-41f04d3bba15/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= -gopkg.in/cheggaaa/pb.v1 v1.0.25/go.mod h1:V/YB90LKu/1FcN3WVnfiiE5oMCibMjukxqG/qStrOgw= -gopkg.in/errgo.v2 v2.1.0/go.mod h1:hNsd1EY+bozCKY1Ytp96fpM3vjJbqLJn88ws8XvfDNI= -gopkg.in/fsnotify.v1 v1.4.7/go.mod h1:Tz8NjZHkW78fSQdbUxIjBTcgA1z1m8ZHf0WmKUhAMys= -gopkg.in/gcfg.v1 v1.2.3/go.mod h1:yesOnuUOFQAhST5vPY4nbZsb/huCgGGXlipJsBn0b3o= -gopkg.in/resty.v1 v1.12.0/go.mod h1:mDo4pnntr5jdWRML875a/NmxYqAlA73dVijT2AXvQQo= -gopkg.in/tomb.v1 v1.0.0-20141024135613-dd632973f1e7/go.mod h1:dt/ZhP58zS4L8KSrWDmTeBkI65Dw0HsyUHuEVlX15mw= -gopkg.in/warnings.v0 v0.1.2/go.mod h1:jksf8JmL6Qr/oQM2OXTHunEvvTAsrWBLb6OOjuVWRNI= -gopkg.in/yaml.v2 v2.0.0-20170812160011-eb3733d160e7/go.mod h1:JAlM8MvJe8wmxCU4Bli9HhUf9+ttbYbLASfIpnQbh74= -gopkg.in/yaml.v2 v2.2.1/go.mod 
h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= gopkg.in/yaml.v2 v2.2.2/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= -gopkg.in/yaml.v2 v2.2.4/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= -gopkg.in/yaml.v2 v2.2.5/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= gopkg.in/yaml.v2 v2.3.0 h1:clyUAQHOM3G0M3f5vQj7LuJrETvjVot3Z5el9nffUtU= gopkg.in/yaml.v2 v2.3.0/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= -honnef.co/go/tools v0.0.0-20180728063816-88497007e858/go.mod h1:rf3lG4BRIbNafJWhAfAdb/ePZxsR/4RtNHQocxwk9r4= -honnef.co/go/tools v0.0.0-20190102054323-c2f93a96b099/go.mod h1:rf3lG4BRIbNafJWhAfAdb/ePZxsR/4RtNHQocxwk9r4= -honnef.co/go/tools v0.0.0-20190523083050-ea95bdfd59fc/go.mod h1:rf3lG4BRIbNafJWhAfAdb/ePZxsR/4RtNHQocxwk9r4= -honnef.co/go/tools v0.0.1-2019.2.3/go.mod h1:a3bituU0lyd329TUQxRnasdCoJDkEUEAqEt0JzvZhAg= -sigs.k8s.io/yaml v1.1.0/go.mod h1:UJmg0vDUVViEyp3mgSv9WPwZCDxu4rQW1olrI1uml+o= -sourcegraph.com/sourcegraph/appdash v0.0.0-20190731080439-ebfcffb1b5c0/go.mod h1:hI742Nqp5OhwiqlzhgfbWU4mW4yO10fP+LoT9WOswdU= From 4e408f9490cee6ed816a1ca83cdc203d78c79e65 Mon Sep 17 00:00:00 2001 From: Holger Obermaier <40787752+ho-ob@users.noreply.github.com> Date: Fri, 28 Jan 2022 15:16:58 +0100 Subject: [PATCH 046/174] Add documentation --- collectors/collectorManager.go | 39 +++++++++++----------- internal/metricRouter/metricRouter.go | 48 +++++++++++++++++---------- sinks/sinkManager.go | 41 +++++++++++++++++------ 3 files changed, 81 insertions(+), 47 deletions(-) diff --git a/collectors/collectorManager.go b/collectors/collectorManager.go index 7b0a9b7..52e91e7 100644 --- a/collectors/collectorManager.go +++ b/collectors/collectorManager.go @@ -34,17 +34,18 @@ var AvailableCollectors = map[string]MetricCollector{ "nfsstat": new(NfsCollector), } +// Metric collector manager data structure type collectorManager struct { - collectors []MetricCollector - output chan lp.CCMetric // List of all output channels - done chan bool // channel to 
finish / stop metric collector manager - ticker mct.MultiChanTicker - duration time.Duration - wg *sync.WaitGroup - config map[string]json.RawMessage + collectors []MetricCollector // List of metric collectors to use + output chan lp.CCMetric // Output channels + done chan bool // channel to finish / stop metric collector manager + ticker mct.MultiChanTicker // periodically ticking once each interval + duration time.Duration // duration (for metrics that measure over a given duration) + wg *sync.WaitGroup // wait group for all goroutines in cc-metric-collector + config map[string]json.RawMessage // json encoded config for collector manager } -// Metric collector access functions +// Metric collector manager access functions type CollectorManager interface { Init(ticker mct.MultiChanTicker, duration time.Duration, wg *sync.WaitGroup, collectConfigFile string) error AddOutput(output chan lp.CCMetric) @@ -53,9 +54,9 @@ type CollectorManager interface { } // Init initializes a new metric collector manager by setting up: -// * output channels +// * output channel // * done channel -// * wait group synchronization (from variable wg) +// * wait group synchronization for goroutines (from variable wg) // * ticker (from variable ticker) // * configuration (read from config file in variable collectConfigFile) // Initialization is done for all configured collectors @@ -82,20 +83,20 @@ func (cm *collectorManager) Init(ticker mct.MultiChanTicker, duration time.Durat } // Initialize configured collectors - for k, cfg := range cm.config { - if _, found := AvailableCollectors[k]; !found { - cclog.ComponentError("CollectorManager", "SKIP unknown collector", k) + for collectorName, collectorCfg := range cm.config { + if _, found := AvailableCollectors[collectorName]; !found { + cclog.ComponentError("CollectorManager", "SKIP unknown collector", collectorName) continue } - c := AvailableCollectors[k] + collector := AvailableCollectors[collectorName] - err = c.Init(cfg) + err = 
collector.Init(collectorCfg) if err != nil { - cclog.ComponentError("CollectorManager", "Collector", k, "initialization failed:", err.Error()) + cclog.ComponentError("CollectorManager", "Collector", collectorName, "initialization failed:", err.Error()) continue } - cclog.ComponentDebug("CollectorManager", "ADD COLLECTOR", c.Name()) - cm.collectors = append(cm.collectors, c) + cclog.ComponentDebug("CollectorManager", "ADD COLLECTOR", collector.Name()) + cm.collectors = append(cm.collectors, collector) } return nil } @@ -157,7 +158,7 @@ func (cm *collectorManager) Close() { // New creates a new initialized metric collector manager func New(ticker mct.MultiChanTicker, duration time.Duration, wg *sync.WaitGroup, collectConfigFile string) (CollectorManager, error) { - cm := &collectorManager{} + cm := new(collectorManager) err := cm.Init(ticker, duration, wg, collectConfigFile) if err != nil { return nil, err diff --git a/internal/metricRouter/metricRouter.go b/internal/metricRouter/metricRouter.go index a321aae..956ac11 100644 --- a/internal/metricRouter/metricRouter.go +++ b/internal/metricRouter/metricRouter.go @@ -24,19 +24,20 @@ type metricRouterTagConfig struct { type metricRouterConfig struct { AddTags []metricRouterTagConfig `json:"add_tags"` // List of tags that are added when the condition is met DelTags []metricRouterTagConfig `json:"delete_tags"` // List of tags that are removed when the condition is met - IntervalStamp bool `json:"interval_timestamp"` // Update timestamp periodically? + IntervalStamp bool `json:"interval_timestamp"` // Update timestamp periodically by ticker each interval? 
} +// Metric router data structure type metricRouter struct { - coll_input chan lp.CCMetric // Input channel from CollectorManager - recv_input chan lp.CCMetric // Input channel from ReceiveManager - outputs []chan lp.CCMetric // List of all output channels - done chan bool // channel to finish / stop metric router - wg *sync.WaitGroup - timestamp time.Time // timestamp - timerdone chan bool // channel to finish / stop timestamp updater - ticker mct.MultiChanTicker - config metricRouterConfig + coll_input chan lp.CCMetric // Input channel from CollectorManager + recv_input chan lp.CCMetric // Input channel from ReceiveManager + outputs []chan lp.CCMetric // List of all output channels + done chan bool // channel to finish / stop metric router + wg *sync.WaitGroup // wait group for all goroutines in cc-metric-collector + timestamp time.Time // timestamp periodically updated by ticker each interval + timerdone chan bool // channel to finish / stop timestamp updater + ticker mct.MultiChanTicker // periodically ticking once each interval + config metricRouterConfig // json encoded config for metric router } // MetricRouter access functions @@ -60,6 +61,8 @@ func (r *metricRouter) Init(ticker mct.MultiChanTicker, wg *sync.WaitGroup, rout r.done = make(chan bool) r.wg = wg r.ticker = ticker + + // Read metric router config file configFile, err := os.Open(routerConfigFile) if err != nil { cclog.ComponentError("MetricRouter", err.Error()) @@ -97,11 +100,11 @@ func (r *metricRouter) StartTimer() { cclog.ComponentDebug("MetricRouter", "TIMER START") } -// EvalCondition evaluates condition Cond for metric data from point -func (r *metricRouter) EvalCondition(Cond string, point lp.CCMetric) (bool, error) { - expression, err := govaluate.NewEvaluableExpression(Cond) +// EvalCondition evaluates condition cond for metric data from point +func (r *metricRouter) EvalCondition(cond string, point lp.CCMetric) (bool, error) { + expression, err := govaluate.NewEvaluableExpression(cond) 
if err != nil { - cclog.ComponentDebug("MetricRouter", Cond, " = ", err.Error()) + cclog.ComponentDebug("MetricRouter", cond, " = ", err.Error()) return false, err } @@ -122,7 +125,7 @@ func (r *metricRouter) EvalCondition(Cond string, point lp.CCMetric) (bool, erro // evaluate condition result, err := expression.Evaluate(params) if err != nil { - cclog.ComponentDebug("MetricRouter", Cond, " = ", err.Error()) + cclog.ComponentDebug("MetricRouter", cond, " = ", err.Error()) return false, err } return bool(result.(bool)), err @@ -172,13 +175,20 @@ func (r *metricRouter) DoDelTags(point lp.CCMetric) { // Start starts the metric router func (r *metricRouter) Start() { + + // start timer if configured r.timestamp = time.Now() if r.config.IntervalStamp { r.StartTimer() } + + // Router manager is done done := func() { cclog.ComponentDebug("MetricRouter", "DONE") } + + // Forward takes a received metric, adds or deletes tags + // and forwards it to the output channels forward := func(point lp.CCMetric) { cclog.ComponentDebug("MetricRouter", "FORWARD", point) r.DoAddTags(point) @@ -192,17 +202,20 @@ func (r *metricRouter) Start() { go func() { defer r.wg.Done() for { - // RouterLoop: select { case <-r.done: done() return + case p := <-r.coll_input: + // receive from metric collector if r.config.IntervalStamp { p.SetTime(r.timestamp) } forward(p) + case p := <-r.recv_input: + // receive from receive manager if r.config.IntervalStamp { p.SetTime(r.timestamp) } @@ -213,11 +226,12 @@ func (r *metricRouter) Start() { cclog.ComponentDebug("MetricRouter", "STARTED") } -// AddInput adds a input channel to the metric router +// AddCollectorInput adds a channel between metric collector and metric router func (r *metricRouter) AddCollectorInput(input chan lp.CCMetric) { r.coll_input = input } +// AddReceiverInput adds a channel between metric receiver and metric router func (r *metricRouter) AddReceiverInput(input chan lp.CCMetric) { r.recv_input = input } diff --git 
a/sinks/sinkManager.go b/sinks/sinkManager.go index 4be8313..b4b3dc5 100644 --- a/sinks/sinkManager.go +++ b/sinks/sinkManager.go @@ -9,21 +9,24 @@ import ( lp "github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric" ) +// Map of all available sinks var AvailableSinks = map[string]Sink{ - "influxdb": &InfluxSink{}, - "stdout": &StdoutSink{}, - "nats": &NatsSink{}, - "http": &HttpSink{}, + "influxdb": new(InfluxSink), + "stdout": new(StdoutSink), + "nats": new(NatsSink), + "http": new(HttpSink), } +// Metric collector manager data structure type sinkManager struct { - input chan lp.CCMetric - outputs []Sink - done chan bool - wg *sync.WaitGroup - config []sinkConfig + input chan lp.CCMetric // input channel + outputs []Sink // List of sinks to use + done chan bool // channel to finish / stop metric sink manager + wg *sync.WaitGroup // wait group for all goroutines in cc-metric-collector + config []sinkConfig // json encoded config for sink manager } +// Sink manager access functions type SinkManager interface { Init(wg *sync.WaitGroup, sinkConfigFile string) error AddInput(input chan lp.CCMetric) @@ -38,6 +41,8 @@ func (sm *sinkManager) Init(wg *sync.WaitGroup, sinkConfigFile string) error { sm.done = make(chan bool) sm.wg = wg sm.config = make([]sinkConfig, 0) + + // Read sink config file if len(sinkConfigFile) > 0 { configFile, err := os.Open(sinkConfigFile) if err != nil { @@ -63,27 +68,36 @@ func (sm *sinkManager) Init(wg *sync.WaitGroup, sinkConfigFile string) error { } func (sm *sinkManager) Start() { - sm.wg.Add(1) batchcount := 20 + + sm.wg.Add(1) go func() { + defer sm.wg.Done() + + // Sink manager is done done := func() { for _, s := range sm.outputs { s.Flush() s.Close() } - sm.wg.Done() + cclog.ComponentDebug("SinkManager", "DONE") } + for { select { case <-sm.done: done() return + case p := <-sm.input: + // Send received metric to all outputs cclog.ComponentDebug("SinkManager", "WRITE", p) for _, s := range sm.outputs { s.Write(p) } + + // 
Flush all outputs if batchcount == 0 { cclog.ComponentDebug("SinkManager", "FLUSH") for _, s := range sm.outputs { @@ -95,9 +109,12 @@ func (sm *sinkManager) Start() { } } }() + + // Sink manager is started cclog.ComponentDebug("SinkManager", "STARTED") } +// AddInput adds the input channel to the sink manager func (sm *sinkManager) AddInput(input chan lp.CCMetric) { sm.input = input } @@ -128,11 +145,13 @@ func (sm *sinkManager) AddOutput(rawConfig json.RawMessage) error { return nil } +// Close finishes / stops the sink manager func (sm *sinkManager) Close() { cclog.ComponentDebug("SinkManager", "CLOSE") sm.done <- true } +// New creates a new initialized sink manager func New(wg *sync.WaitGroup, sinkConfigFile string) (SinkManager, error) { sm := &sinkManager{} err := sm.Init(wg, sinkConfigFile) From d2e02ed36daf508d20fccd5f881fcd057c16003e Mon Sep 17 00:00:00 2001 From: Holger Obermaier <40787752+ho-ob@users.noreply.github.com> Date: Fri, 28 Jan 2022 19:31:27 +0100 Subject: [PATCH 047/174] Fix: Add missing hostname tag --- internal/metricRouter/metricRouter.go | 12 ++++++++++++ metric-collector.go | 9 --------- 2 files changed, 12 insertions(+), 9 deletions(-) diff --git a/internal/metricRouter/metricRouter.go b/internal/metricRouter/metricRouter.go index 956ac11..e75e77d 100644 --- a/internal/metricRouter/metricRouter.go +++ b/internal/metricRouter/metricRouter.go @@ -3,6 +3,7 @@ package metricRouter import ( "encoding/json" "os" + "strings" "sync" "time" @@ -29,6 +30,7 @@ type metricRouterConfig struct { // Metric router data structure type metricRouter struct { + hostname string // Hostname used in tags coll_input chan lp.CCMetric // Input channel from CollectorManager recv_input chan lp.CCMetric // Input channel from ReceiveManager outputs []chan lp.CCMetric // List of all output channels @@ -62,6 +64,15 @@ func (r *metricRouter) Init(ticker mct.MultiChanTicker, wg *sync.WaitGroup, rout r.wg = wg r.ticker = ticker + // Set hostname + hostname, err := 
os.Hostname() + if err != nil { + cclog.Error(err.Error()) + return err + } + // Drop domain part of host name + r.hostname = strings.SplitN(hostname, `.`, 2)[0] + // Read metric router config file configFile, err := os.Open(routerConfigFile) if err != nil { @@ -209,6 +220,7 @@ func (r *metricRouter) Start() { case p := <-r.coll_input: // receive from metric collector + p.AddTag("hostname", r.hostname) if r.config.IntervalStamp { p.SetTime(r.timestamp) } diff --git a/metric-collector.go b/metric-collector.go index 3975b62..b709512 100644 --- a/metric-collector.go +++ b/metric-collector.go @@ -5,7 +5,6 @@ import ( "flag" "os" "os/signal" - "strings" "syscall" "github.com/ClusterCockpit/cc-metric-collector/collectors" @@ -45,7 +44,6 @@ func LoadCentralConfiguration(file string, config *CentralConfigFile) error { } type RuntimeConfig struct { - Hostname string Interval time.Duration Duration time.Duration CliArgs map[string]string @@ -213,13 +211,6 @@ func mainFunc() int { } rcfg.Duration = time.Duration(rcfg.ConfigFile.Duration) * time.Second - rcfg.Hostname, err = os.Hostname() - if err != nil { - cclog.Error(err.Error()) - return 1 - } - // Drop domain part of host name - rcfg.Hostname = strings.SplitN(rcfg.Hostname, `.`, 2)[0] // err = CreatePidfile(rcfg.CliArgs["pidfile"]) // Set log file From 7316de281357f49ac81fc7cd68015863f2d85d20 Mon Sep 17 00:00:00 2001 From: Holger Obermaier <40787752+ho-ob@users.noreply.github.com> Date: Fri, 28 Jan 2022 19:49:46 +0100 Subject: [PATCH 048/174] Fix crash caused by: * not running a collector manager when collector manager config file is missing * not running a metric router when metric router config file is missing * not running a sink manager when sink manager config file is missing --- metric-collector.go | 65 ++++++++++++++++++++++++++------------------- 1 file changed, 37 insertions(+), 28 deletions(-) diff --git a/metric-collector.go b/metric-collector.go index b709512..8121141 100644 --- a/metric-collector.go +++ 
b/metric-collector.go @@ -211,6 +211,21 @@ func mainFunc() int { } rcfg.Duration = time.Duration(rcfg.ConfigFile.Duration) * time.Second + if len(rcfg.ConfigFile.RouterConfigFile) == 0 { + cclog.Error("Metric router configuration file must be set") + return 1 + } + + if len(rcfg.ConfigFile.SinkConfigFile) == 0 { + cclog.Error("Sink configuration file must be set") + return 1 + } + + if len(rcfg.ConfigFile.CollectorConfigFile) == 0 { + cclog.Error("Metric collector configuration file must be set") + return 1 + } + // err = CreatePidfile(rcfg.CliArgs["pidfile"]) // Set log file @@ -222,42 +237,36 @@ func mainFunc() int { rcfg.MultiChanTicker = mct.NewTicker(rcfg.Interval) // Create new metric router - if len(rcfg.ConfigFile.RouterConfigFile) > 0 { - rcfg.MetricRouter, err = mr.New(rcfg.MultiChanTicker, &rcfg.Sync, rcfg.ConfigFile.RouterConfigFile) - if err != nil { - cclog.Error(err.Error()) - return 1 - } + rcfg.MetricRouter, err = mr.New(rcfg.MultiChanTicker, &rcfg.Sync, rcfg.ConfigFile.RouterConfigFile) + if err != nil { + cclog.Error(err.Error()) + return 1 } // Create new sink - if len(rcfg.ConfigFile.SinkConfigFile) > 0 { - rcfg.SinkManager, err = sinks.New(&rcfg.Sync, rcfg.ConfigFile.SinkConfigFile) - if err != nil { - cclog.Error(err.Error()) - return 1 - } - - // Connect metric router to sink manager - RouterToSinksChannel := make(chan lp.CCMetric, 200) - rcfg.SinkManager.AddInput(RouterToSinksChannel) - rcfg.MetricRouter.AddOutput(RouterToSinksChannel) + rcfg.SinkManager, err = sinks.New(&rcfg.Sync, rcfg.ConfigFile.SinkConfigFile) + if err != nil { + cclog.Error(err.Error()) + return 1 } + // Connect metric router to sink manager + RouterToSinksChannel := make(chan lp.CCMetric, 200) + rcfg.SinkManager.AddInput(RouterToSinksChannel) + rcfg.MetricRouter.AddOutput(RouterToSinksChannel) + // Create new collector manager - if len(rcfg.ConfigFile.CollectorConfigFile) > 0 { - rcfg.CollectManager, err = collectors.New(rcfg.MultiChanTicker, rcfg.Duration, 
&rcfg.Sync, rcfg.ConfigFile.CollectorConfigFile) - if err != nil { - cclog.Error(err.Error()) - return 1 - } - - // Connect collector manager to metric router - CollectToRouterChannel := make(chan lp.CCMetric, 200) - rcfg.CollectManager.AddOutput(CollectToRouterChannel) - rcfg.MetricRouter.AddCollectorInput(CollectToRouterChannel) + rcfg.CollectManager, err = collectors.New(rcfg.MultiChanTicker, rcfg.Duration, &rcfg.Sync, rcfg.ConfigFile.CollectorConfigFile) + if err != nil { + cclog.Error(err.Error()) + return 1 } + // Connect collector manager to metric router + CollectToRouterChannel := make(chan lp.CCMetric, 200) + rcfg.CollectManager.AddOutput(CollectToRouterChannel) + rcfg.MetricRouter.AddCollectorInput(CollectToRouterChannel) + // Create new receive manager if len(rcfg.ConfigFile.ReceiverConfigFile) > 0 { rcfg.ReceiveManager, err = receivers.New(&rcfg.Sync, rcfg.ConfigFile.ReceiverConfigFile) From 8df58c051fa4c8d71424d8a90cf6bf5d66d86697 Mon Sep 17 00:00:00 2001 From: Holger Obermaier <40787752+ho-ob@users.noreply.github.com> Date: Sat, 29 Jan 2022 10:04:31 +0100 Subject: [PATCH 049/174] Lower minimum required golang version to 1.16. 
--- collectors/gpfsMetric.go | 15 +++++++++++---- go.mod | 9 +-------- 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/collectors/gpfsMetric.go b/collectors/gpfsMetric.go index bc1852b..53db1c2 100644 --- a/collectors/gpfsMetric.go +++ b/collectors/gpfsMetric.go @@ -130,14 +130,21 @@ func (m *GpfsCollector) Read(interval time.Duration, output chan lp.CCMetric) { continue } - timestampInt, err := strconv.ParseInt(key_value["_t_"]+key_value["_tu_"], 10, 64) - timestamp := time.UnixMicro(timestampInt) + sec, err := strconv.ParseInt(key_value["_t_"], 10, 64) if err != nil { fmt.Fprintf(os.Stderr, - "GpfsCollector.Read(): Failed to convert time stamp '%s': %s\n", - key_value["_t_"]+key_value["_tu_"], err.Error()) + "GpfsCollector.Read(): Failed to convert seconds to int '%s': %v\n", + key_value["_t_"], err) continue } + msec, err := strconv.ParseInt(key_value["_tu_"], 10, 64) + if err != nil { + fmt.Fprintf(os.Stderr, + "GpfsCollector.Read(): Failed to convert micro seconds to int '%s': %v\n", + key_value["_tu_"], err) + continue + } + timestamp := time.Unix(sec, msec*1000) // bytes read bytesRead, err := strconv.ParseInt(key_value["_br_"], 10, 64) diff --git a/go.mod b/go.mod index da4f3ea..0789f7e 100644 --- a/go.mod +++ b/go.mod @@ -1,6 +1,6 @@ module github.com/ClusterCockpit/cc-metric-collector -go 1.17 +go 1.16 require ( github.com/NVIDIA/go-nvml v0.11.1-0 @@ -12,14 +12,7 @@ require ( ) require ( - github.com/deepmap/oapi-codegen v1.8.2 // indirect github.com/golang/protobuf v1.5.2 // indirect github.com/nats-io/nats-server/v2 v2.7.0 // indirect - github.com/nats-io/nkeys v0.3.0 // indirect - github.com/nats-io/nuid v1.0.1 // indirect - github.com/pkg/errors v0.9.1 // indirect - golang.org/x/crypto v0.0.0-20220112180741-5e0467b6c7ce // indirect - golang.org/x/net v0.0.0-20211112202133-69e39bad7dc2 // indirect google.golang.org/protobuf v1.27.1 // indirect - gopkg.in/yaml.v2 v2.3.0 // indirect ) From 9e99e47d73ea2e9718e91f9717725cbe4c554b19 Mon Sep 
17 00:00:00 2001 From: Holger Obermaier <40787752+ho-ob@users.noreply.github.com> Date: Sun, 30 Jan 2022 12:08:33 +0100 Subject: [PATCH 050/174] Wait for close of done channel, to ensure manager finished. --- collectors/collectorManager.go | 3 +++ internal/metricRouter/metricRouter.go | 6 ++++++ internal/multiChanTicker/multiChanTicker.go | 3 +++ sinks/sinkManager.go | 3 +++ 4 files changed, 15 insertions(+) diff --git a/collectors/collectorManager.go b/collectors/collectorManager.go index 52e91e7..f91db20 100644 --- a/collectors/collectorManager.go +++ b/collectors/collectorManager.go @@ -115,6 +115,7 @@ func (cm *collectorManager) Start() { for _, c := range cm.collectors { c.Close() } + close(cm.done) cclog.ComponentDebug("CollectorManager", "DONE") } @@ -154,6 +155,8 @@ func (cm *collectorManager) AddOutput(output chan lp.CCMetric) { func (cm *collectorManager) Close() { cclog.ComponentDebug("CollectorManager", "CLOSE") cm.done <- true + // wait for close of channel cm.done + <-cm.done } // New creates a new initialized metric collector manager diff --git a/internal/metricRouter/metricRouter.go b/internal/metricRouter/metricRouter.go index e75e77d..96a2f05 100644 --- a/internal/metricRouter/metricRouter.go +++ b/internal/metricRouter/metricRouter.go @@ -101,6 +101,7 @@ func (r *metricRouter) StartTimer() { for { select { case <-r.timerdone: + close(r.timerdone) cclog.ComponentDebug("MetricRouter", "TIMER DONE") return case t := <-m: @@ -195,6 +196,7 @@ func (r *metricRouter) Start() { // Router manager is done done := func() { + close(r.done) cclog.ComponentDebug("MetricRouter", "DONE") } @@ -257,9 +259,13 @@ func (r *metricRouter) AddOutput(output chan lp.CCMetric) { func (r *metricRouter) Close() { cclog.ComponentDebug("MetricRouter", "CLOSE") r.done <- true + // wait for close of channel r.done + <-r.done if r.config.IntervalStamp { cclog.ComponentDebug("MetricRouter", "TIMER CLOSE") r.timerdone <- true + // wait for close of channel r.timerdone + 
<-r.timerdone } } diff --git a/internal/multiChanTicker/multiChanTicker.go b/internal/multiChanTicker/multiChanTicker.go index a9394ab..e0eca43 100644 --- a/internal/multiChanTicker/multiChanTicker.go +++ b/internal/multiChanTicker/multiChanTicker.go @@ -23,6 +23,7 @@ func (t *multiChanTicker) Init(duration time.Duration) { t.done = make(chan bool) go func() { done := func() { + close(t.done) cclog.ComponentDebug("MultiChanTicker", "DONE") } for { @@ -52,6 +53,8 @@ func (t *multiChanTicker) AddChannel(channel chan time.Time) { func (t *multiChanTicker) Close() { cclog.ComponentDebug("MultiChanTicker", "CLOSE") t.done <- true + // wait for close of channel t.done + <-t.done } func NewTicker(duration time.Duration) MultiChanTicker { diff --git a/sinks/sinkManager.go b/sinks/sinkManager.go index b4b3dc5..8d2872a 100644 --- a/sinks/sinkManager.go +++ b/sinks/sinkManager.go @@ -81,6 +81,7 @@ func (sm *sinkManager) Start() { s.Close() } + close(sm.done) cclog.ComponentDebug("SinkManager", "DONE") } @@ -149,6 +150,8 @@ func (sm *sinkManager) AddOutput(rawConfig json.RawMessage) error { func (sm *sinkManager) Close() { cclog.ComponentDebug("SinkManager", "CLOSE") sm.done <- true + // wait for close of channel sm.done + <-sm.done } // New creates a new initialized sink manager From 4541e50bea556036c666d48408da3b41495a3304 Mon Sep 17 00:00:00 2001 From: Thomas Roehl Date: Sun, 30 Jan 2022 14:29:25 +0100 Subject: [PATCH 051/174] Minor fixes in ccLogger --- internal/ccLogger/cclogger.go | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/internal/ccLogger/cclogger.go b/internal/ccLogger/cclogger.go index 38e7e6b..5135780 100644 --- a/internal/ccLogger/cclogger.go +++ b/internal/ccLogger/cclogger.go @@ -38,7 +38,7 @@ func initLogger() { func Print(e ...interface{}) { initLogger() - defaultLog.Print(e) + defaultLog.Print(e...) 
} func ComponentPrint(component string, e ...interface{}) { @@ -48,7 +48,7 @@ func ComponentPrint(component string, e ...interface{}) { func Info(e ...interface{}) { initLogger() - infoLog.Print(e) + infoLog.Print(e...) } func ComponentInfo(component string, e ...interface{}) { @@ -58,14 +58,14 @@ func ComponentInfo(component string, e ...interface{}) { func Debug(e ...interface{}) { initLogger() - if globalDebug == true { - debugLog.Print(e) + if globalDebug { + debugLog.Print(e...) } } func ComponentDebug(component string, e ...interface{}) { initLogger() - if globalDebug == true && debugLog != nil { + if globalDebug && debugLog != nil { //CCComponentPrint(debugLog, component, e) debugLog.Print(fmt.Sprintf("[%s] ", component), e) } From d3f56115411dd8f9812f9354b7243bfd51f494e6 Mon Sep 17 00:00:00 2001 From: Thomas Roehl Date: Sun, 30 Jan 2022 14:30:06 +0100 Subject: [PATCH 052/174] Add functions to get the fields of a CCMetric and export some more CCMetric functions --- internal/ccMetric/ccMetric.go | 37 ++++++++++++++++++++++++++++++++++- 1 file changed, 36 insertions(+), 1 deletion(-) diff --git a/internal/ccMetric/ccMetric.go b/internal/ccMetric/ccMetric.go index 6b6bda9..05f81ff 100644 --- a/internal/ccMetric/ccMetric.go +++ b/internal/ccMetric/ccMetric.go @@ -2,9 +2,10 @@ package ccmetric import ( "fmt" - lp "github.com/influxdata/line-protocol" // MIT license "sort" "time" + + lp "github.com/influxdata/line-protocol" // MIT license ) // Most functions are derived from github.com/influxdata/line-protocol/metric.go @@ -24,6 +25,11 @@ type CCMetric interface { AddMeta(key, value string) MetaList() []*lp.Tag RemoveTag(key string) + GetTag(key string) (string, bool) + GetMeta(key string) (string, bool) + GetField(key string) (interface{}, bool) + HasField(key string) bool + RemoveField(key string) } func (m *ccMetric) Meta() map[string]string { @@ -187,6 +193,35 @@ func (m *ccMetric) AddField(key string, value interface{}) { m.fields = append(m.fields, 
&lp.Field{Key: key, Value: convertField(value)}) } +func (m *ccMetric) GetField(key string) (interface{}, bool) { + for _, field := range m.fields { + if field.Key == key { + return field.Value, true + } + } + return "", false +} + +func (m *ccMetric) HasField(key string) bool { + for _, field := range m.fields { + if field.Key == key { + return true + } + } + return false +} + +func (m *ccMetric) RemoveField(key string) { + for i, field := range m.fields { + if field.Key == key { + copy(m.fields[i:], m.fields[i+1:]) + m.fields[len(m.fields)-1] = nil + m.fields = m.fields[:len(m.fields)-1] + return + } + } +} + func New( name string, tags map[string]string, From 6abbc5f77e452805afe645e63ed1385dc5f064a5 Mon Sep 17 00:00:00 2001 From: Thomas Gruber Date: Sun, 30 Jan 2022 14:54:36 +0100 Subject: [PATCH 053/174] Fix Github Actions (#18) * Fix config for Github Actions * Fix paths * Add CentOS Latest and AlmaLinux 8.5 to RPM action * Fix ID * Reduce min Go version to 1.16 and use time.Unix in gpfsMetric --- .github/ci-collectors.json | 6 ++++ .github/ci-config.json | 58 ++++------------------------------ .github/ci-receivers.json | 1 + .github/ci-router.json | 22 +++++++++++++ .github/ci-sinks.json | 6 ++++ .github/workflows/rpmbuild.yml | 40 ++++++++++++++++++++++- 6 files changed, 81 insertions(+), 52 deletions(-) create mode 100644 .github/ci-collectors.json create mode 100644 .github/ci-receivers.json create mode 100644 .github/ci-router.json create mode 100644 .github/ci-sinks.json diff --git a/.github/ci-collectors.json b/.github/ci-collectors.json new file mode 100644 index 0000000..3497fc0 --- /dev/null +++ b/.github/ci-collectors.json @@ -0,0 +1,6 @@ +{ + "tempstat": {}, + "diskstat": {}, + "memstat": {}, + "cpustat": {} +} diff --git a/.github/ci-config.json b/.github/ci-config.json index 402388d..15b2e6f 100644 --- a/.github/ci-config.json +++ b/.github/ci-config.json @@ -1,52 +1,8 @@ { - "sink": { - "user": "testuser", - "password": "testpass", - "host": 
"127.0.0.1", - "port": "9090", - "database": "testdb", - "organization": "testorg", - "type": "stdout" - }, - "interval": 3, - "duration": 1, - "collectors": [ - "tempstat", - "loadavg", - "memstat", - "netstat", - "ibstat", - "lustrestat", - "cpustat", - "topprocs", - "nvidia", - "diskstat", - "ipmistat", - "gpfs", - "cpufreq", - "cpufreq_cpuinfo" - ], - "default_tags": { - "cluster": "testcluster" - }, - "receiver": { - "type": "none" - }, - "collect_config": { - "topprocs": { - "num_procs": 2 - }, - "tempstat": { - "tag_override": { - "hwmon0": { - "type": "socket", - "type-id": "0" - }, - "hwmon1": { - "type": "socket", - "type-id": "1" - } - } - } - } -} \ No newline at end of file + "sinks": ".github/ci-sinks.json", + "collectors" : ".github/ci-collectors.json", + "receivers" : ".github/ci-receivers.json", + "router" : ".github/ci-router.json", + "interval": 5, + "duration": 1 +} diff --git a/.github/ci-receivers.json b/.github/ci-receivers.json new file mode 100644 index 0000000..fe51488 --- /dev/null +++ b/.github/ci-receivers.json @@ -0,0 +1 @@ +[] diff --git a/.github/ci-router.json b/.github/ci-router.json new file mode 100644 index 0000000..a9f8714 --- /dev/null +++ b/.github/ci-router.json @@ -0,0 +1,22 @@ +{ + "add_tags" : [ + { + "key" : "cluster", + "value" : "testcluster", + "if" : "*" + }, + { + "key" : "test", + "value" : "testing", + "if" : "name == 'temp_package_id_0'" + } + ], + "delete_tags" : [ + { + "key" : "unit", + "value" : "*", + "if" : "*" + } + ], + "interval_timestamp" : true +} diff --git a/.github/ci-sinks.json b/.github/ci-sinks.json new file mode 100644 index 0000000..d304018 --- /dev/null +++ b/.github/ci-sinks.json @@ -0,0 +1,6 @@ +[ + { + "type" : "stdout", + "meta_as_tags" : true + } +] diff --git a/.github/workflows/rpmbuild.yml b/.github/workflows/rpmbuild.yml index 3e121d0..a7aee22 100644 --- a/.github/workflows/rpmbuild.yml +++ b/.github/workflows/rpmbuild.yml @@ -2,7 +2,7 @@ name: Run RPM Build on: push jobs: - build: + 
build-centos8: runs-on: ubuntu-latest steps: - uses: actions/checkout@v2 @@ -21,3 +21,41 @@ jobs: with: name: cc-metric-collector SRPM CentOS8 path: ${{ steps.rpm.outputs.source_rpm_path }} + build-centos-latest: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v2 + - uses: TomTheBear/rpmbuild@centos_latest + id: rpm + name: Build RPM package on CentOS 'Latest' + with: + spec_file: "./scripts/cc-metric-collector.spec" + - name: Save RPM as artifact + uses: actions/upload-artifact@v1.0.0 + with: + name: cc-metric-collector RPM CentOS 'Latest' + path: ${{ steps.rpm.outputs.rpm_dir_path }} + - name: Save SRPM as artifact + uses: actions/upload-artifact@v1.0.0 + with: + name: cc-metric-collector SRPM CentOS 'Latest' + path: ${{ steps.rpm.outputs.source_rpm_path }} + build-alma-8_5: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v2 + - uses: TomTheBear/rpmbuild@alma8.5 + id: rpm + name: Build RPM package on AlmaLinux 8.5 + with: + spec_file: "./scripts/cc-metric-collector.spec" + - name: Save RPM as artifact + uses: actions/upload-artifact@v1.0.0 + with: + name: cc-metric-collector RPM AlmaLinux 8.5 + path: ${{ steps.rpm.outputs.rpm_dir_path }} + - name: Save SRPM as artifact + uses: actions/upload-artifact@v1.0.0 + with: + name: cc-metric-collector SRPM AlmaLinux 8.5 + path: ${{ steps.rpm.outputs.source_rpm_path }} From 11844d9d5db98d74ac82d36a59e980f8766c9121 Mon Sep 17 00:00:00 2001 From: Thomas Gruber Date: Sun, 30 Jan 2022 14:59:26 +0100 Subject: [PATCH 054/174] Add common topology module for MetricCollectors and MetricRouter (#20) --- internal/ccTopology/ccTopology.go | 277 ++++++++++++++++++++++++++++++ 1 file changed, 277 insertions(+) create mode 100644 internal/ccTopology/ccTopology.go diff --git a/internal/ccTopology/ccTopology.go b/internal/ccTopology/ccTopology.go new file mode 100644 index 0000000..8d53b05 --- /dev/null +++ b/internal/ccTopology/ccTopology.go @@ -0,0 +1,277 @@ +package ccTopology + +import ( + "fmt" + "io/ioutil" 
+ "log" + "os" + "path/filepath" + "strconv" + "strings" + + cclogger "github.com/ClusterCockpit/cc-metric-collector/internal/ccLogger" +) + +// intArrayContains scans an array of ints if the value str is present in the array +// If the specified value is found, the corresponding array index is returned. +// The bool value is used to signal success or failure +func intArrayContains(array []int, str int) (int, bool) { + for i, a := range array { + if a == str { + return i, true + } + } + return -1, false +} + +// stringArrayContains scans an array of strings if the value str is present in the array +// If the specified value is found, the corresponding array index is returned. +// The bool value is used to signal success or failure +// func stringArrayContains(array []string, str string) (int, bool) { +// for i, a := range array { +// if a == str { +// return i, true +// } +// } +// return -1, false +// } + +func SocketList() []int { + buffer, err := ioutil.ReadFile("/proc/cpuinfo") + if err != nil { + log.Print(err) + return nil + } + ll := strings.Split(string(buffer), "\n") + var packs []int + for _, line := range ll { + if strings.HasPrefix(line, "physical id") { + lv := strings.Fields(line) + id, err := strconv.ParseInt(lv[3], 10, 32) + if err != nil { + log.Print(err) + return packs + } + _, found := intArrayContains(packs, int(id)) + if !found { + packs = append(packs, int(id)) + } + } + } + return packs +} + +func CpuList() []int { + buffer, err := ioutil.ReadFile("/proc/cpuinfo") + if err != nil { + log.Print(err) + return nil + } + ll := strings.Split(string(buffer), "\n") + var cpulist []int + for _, line := range ll { + if strings.HasPrefix(line, "processor") { + lv := strings.Fields(line) + id, err := strconv.ParseInt(lv[2], 10, 32) + if err != nil { + log.Print(err) + return cpulist + } + _, found := intArrayContains(cpulist, int(id)) + if !found { + cpulist = append(cpulist, int(id)) + } + } + } + return cpulist +} + +type CpuEntry struct { + Cpuid 
int + SMT int + Core int + Socket int + Numadomain int + Die int +} + +func CpuData() []CpuEntry { + + fileToInt := func(path string) int { + buffer, err := ioutil.ReadFile(path) + if err != nil { + log.Print(err) + cclogger.ComponentError("ccTopology", "Reading", path, ":", err.Error()) + return -1 + } + sbuffer := strings.Replace(string(buffer), "\n", "", -1) + var id int64 + //_, err = fmt.Scanf("%d", sbuffer, &id) + id, err = strconv.ParseInt(sbuffer, 10, 32) + if err != nil { + cclogger.ComponentError("ccTopology", "Parsing", path, ":", sbuffer, err.Error()) + return -1 + } + return int(id) + } + getCore := func(basepath string) int { + return fileToInt(fmt.Sprintf("%s/core_id", basepath)) + } + + getSocket := func(basepath string) int { + return fileToInt(fmt.Sprintf("%s/physical_package_id", basepath)) + } + + getDie := func(basepath string) int { + return fileToInt(fmt.Sprintf("%s/die_id", basepath)) + } + + getSMT := func(cpuid int, basepath string) int { + buffer, err := ioutil.ReadFile(fmt.Sprintf("%s/thread_siblings_list", basepath)) + if err != nil { + log.Print(err) + } + threadlist := make([]int, 0) + sbuffer := strings.Replace(string(buffer), "\n", "", -1) + for _, x := range strings.Split(sbuffer, ",") { + id, err := strconv.ParseInt(x, 10, 32) + if err != nil { + log.Print(err) + } + threadlist = append(threadlist, int(id)) + } + for i, x := range threadlist { + if x == cpuid { + return i + } + } + return 1 + } + + getNumaDomain := func(basepath string) int { + files, err := filepath.Glob(fmt.Sprintf("%s/node*", basepath)) + if err != nil { + log.Print(err) + } + for _, f := range files { + finfo, err := os.Lstat(f) + if err == nil && (finfo.IsDir() || finfo.Mode()&os.ModeSymlink != 0) { + var id int + parts := strings.Split(f, "/") + _, err = fmt.Scanf("node%d", parts[len(parts)-1], &id) + if err == nil { + return id + } + } + } + return 0 + } + + clist := make([]CpuEntry, 0) + for _, c := range CpuList() { + clist = append(clist, CpuEntry{Cpuid: 
c}) + } + for _, centry := range clist { + centry.Socket = -1 + centry.Numadomain = -1 + centry.Die = -1 + centry.Core = -1 + // Set base directory for topology lookup + base := fmt.Sprintf("/sys/devices/system/cpu/cpu%d/topology", centry.Cpuid) + + // Lookup CPU core id + centry.Core = getCore(base) + + // Lookup CPU socket id + centry.Socket = getSocket(base) + + // Lookup CPU die id + centry.Die = getDie(base) + + // Lookup SMT thread id + centry.SMT = getSMT(centry.Cpuid, base) + + // Lookup NUMA domain id + centry.Numadomain = getNumaDomain(base) + + } + return clist +} + +type CpuInformation struct { + NumHWthreads int + SMTWidth int + NumSockets int + NumDies int + NumNumaDomains int +} + +func CpuInfo() CpuInformation { + var c CpuInformation + + smt := 0 + numa := 0 + die := 0 + socket := 0 + cdata := CpuData() + for _, d := range cdata { + if d.SMT > smt { + smt = d.SMT + } + if d.Numadomain > numa { + numa = d.Numadomain + } + if d.Die > die { + die = d.Die + } + if d.Socket > socket { + socket = d.Socket + } + } + c.NumNumaDomains = numa + 1 + c.SMTWidth = smt + 1 + c.NumDies = die + 1 + c.NumSockets = socket + 1 + c.NumHWthreads = len(cdata) + return c +} + +func GetCpuSocket(cpuid int) int { + cdata := CpuData() + for _, d := range cdata { + if d.Cpuid == cpuid { + return d.Socket + } + } + return -1 +} + +func GetCpuNumaDomain(cpuid int) int { + cdata := CpuData() + for _, d := range cdata { + if d.Cpuid == cpuid { + return d.Numadomain + } + } + return -1 +} + +func GetCpuDie(cpuid int) int { + cdata := CpuData() + for _, d := range cdata { + if d.Cpuid == cpuid { + return d.Die + } + } + return -1 +} + +func GetCpuCore(cpuid int) int { + cdata := CpuData() + for _, d := range cdata { + if d.Cpuid == cpuid { + return d.Core + } + } + return -1 +} From cf810b1c0c3c69d210647a36462f8b51609556e5 Mon Sep 17 00:00:00 2001 From: Thomas Gruber Date: Sun, 30 Jan 2022 15:03:21 +0100 Subject: [PATCH 055/174] Add Cache and Aggregator to MetricRouter (#21) * Add 
Cache and Aggregator to MetricRouter * Close done channel in MetricCache --- internal/metricRouter/metricAggregator.go | 291 ++++++++++++++ .../metricRouter/metricAggregatorFunctions.go | 376 ++++++++++++++++++ internal/metricRouter/metricCache.go | 176 ++++++++ internal/metricRouter/metricRouter.go | 55 ++- 4 files changed, 885 insertions(+), 13 deletions(-) create mode 100644 internal/metricRouter/metricAggregator.go create mode 100644 internal/metricRouter/metricAggregatorFunctions.go create mode 100644 internal/metricRouter/metricCache.go diff --git a/internal/metricRouter/metricAggregator.go b/internal/metricRouter/metricAggregator.go new file mode 100644 index 0000000..41c5276 --- /dev/null +++ b/internal/metricRouter/metricAggregator.go @@ -0,0 +1,291 @@ +package metricRouter + +import ( + "context" + "fmt" + "os" + "strings" + "time" + + cclog "github.com/ClusterCockpit/cc-metric-collector/internal/ccLogger" + + lp "github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric" + topo "github.com/ClusterCockpit/cc-metric-collector/internal/ccTopology" + + "github.com/PaesslerAG/gval" +) + +type metricAggregatorIntervalConfig struct { + Name string `json:"name"` // Metric name for the new metric + Function string `json:"function"` // Function to apply on the metric + Condition string `json:"if"` // Condition for applying function + Tags map[string]string `json:"tags"` // Tags for the new metric + Meta map[string]string `json:"meta"` // Meta information for the new metric + gvalCond gval.Evaluable + gvalFunc gval.Evaluable +} + +type metricAggregator struct { + functions []*metricAggregatorIntervalConfig + constants map[string]interface{} + language gval.Language + output chan lp.CCMetric +} + +type MetricAggregator interface { + AddAggregation(name, function, condition string, tags, meta map[string]string) error + DeleteAggregation(name string) error + Init(output chan lp.CCMetric) error + Eval(starttime time.Time, endtime time.Time, metrics []lp.CCMetric) 
+} + +var metricCacheLanguage = gval.NewLanguage( + gval.Base(), + gval.Function("sum", sumfunc), + gval.Function("min", minfunc), + gval.Function("avg", avgfunc), + gval.Function("mean", avgfunc), + gval.Function("max", maxfunc), + gval.Function("len", lenfunc), + gval.Function("median", medianfunc), + gval.InfixOperator("in", infunc), + gval.Function("match", matchfunc), + gval.Function("getCpuCore", getCpuCoreFunc), + gval.Function("getCpuSocket", getCpuSocketFunc), + gval.Function("getCpuNuma", getCpuNumaDomainFunc), + gval.Function("getCpuDie", getCpuDieFunc), + gval.Function("getSockCpuList", getCpuListOfSocketFunc), + gval.Function("getNumaCpuList", getCpuListOfNumaDomainFunc), + gval.Function("getDieCpuList", getCpuListOfDieFunc), + gval.Function("getCoreCpuList", getCpuListOfCoreFunc), + gval.Function("getCpuList", getCpuListOfNode), + gval.Function("getCpuListOfType", getCpuListOfType), +) + +func (c *metricAggregator) Init(output chan lp.CCMetric) error { + c.output = output + c.functions = make([]*metricAggregatorIntervalConfig, 0) + c.constants = make(map[string]interface{}) + + // add constants like hostname, numSockets, ... 
to constants list + // Set hostname + hostname, err := os.Hostname() + if err != nil { + cclog.Error(err.Error()) + return err + } + // Drop domain part of host name + c.constants["hostname"] = strings.SplitN(hostname, `.`, 2)[0] + cinfo := topo.CpuInfo() + c.constants["numHWThreads"] = cinfo.NumHWthreads + c.constants["numSockets"] = cinfo.NumSockets + c.constants["numNumaDomains"] = cinfo.NumNumaDomains + c.constants["numDies"] = cinfo.NumDies + c.constants["smtWidth"] = cinfo.SMTWidth + + c.language = gval.NewLanguage( + gval.Base(), + metricCacheLanguage, + ) + + // Example aggregation function + // var f metricCacheFunctionConfig + // f.Name = "temp_cores_avg" + // //f.Condition = `"temp_core_" in name` + // f.Condition = `match("temp_core_%d+", metric.Name())` + // f.Function = `avg(values)` + // f.Tags = map[string]string{"type": "node"} + // f.Meta = map[string]string{"group": "IPMI", "unit": "degC", "source": "TempCollector"} + // c.functions = append(c.functions, &f) + return nil +} + +func (c *metricAggregator) Eval(starttime time.Time, endtime time.Time, metrics []lp.CCMetric) { + vars := make(map[string]interface{}) + for k, v := range c.constants { + vars[k] = v + } + vars["starttime"] = starttime + vars["endtime"] = endtime + for _, f := range c.functions { + cclog.ComponentDebug("MetricCache", "COLLECT", f.Name, "COND", f.Condition) + values := make([]float64, 0) + matches := make([]lp.CCMetric, 0) + for _, m := range metrics { + vars["metric"] = m + //value, err := gval.Evaluate(f.Condition, vars, c.language) + value, err := f.gvalCond.EvalBool(context.Background(), vars) + if err != nil { + cclog.ComponentError("MetricCache", "COLLECT", f.Name, "COND", f.Condition, ":", err.Error()) + continue + } + if value { + v, valid := m.GetField("value") + if valid { + switch x := v.(type) { + case float64: + values = append(values, x) + case float32: + case int: + case int64: + values = append(values, float64(x)) + case bool: + if x { + values = 
append(values, float64(1.0)) + } else { + values = append(values, float64(0.0)) + } + default: + cclog.ComponentError("MetricCache", "COLLECT ADD VALUE", v, "FAILED") + } + } + matches = append(matches, m) + } + } + delete(vars, "metric") + cclog.ComponentDebug("MetricCache", "EVALUATE", f.Name, "METRICS", len(values), "CALC", f.Function) + vars["values"] = values + vars["metrics"] = matches + if len(values) > 0 { + value, err := gval.Evaluate(f.Function, vars, c.language) + if err != nil { + cclog.ComponentError("MetricCache", "EVALUATE", f.Name, "METRICS", len(values), "CALC", f.Function, ":", err.Error()) + break + } + + copy_tags := func(tags map[string]string, metrics []lp.CCMetric) map[string]string { + out := make(map[string]string) + for key, value := range tags { + switch value { + case "": + for _, m := range metrics { + v, err := m.GetTag(key) + if err { + out[key] = v + } + } + default: + out[key] = value + } + } + return out + } + copy_meta := func(meta map[string]string, metrics []lp.CCMetric) map[string]string { + out := make(map[string]string) + for key, value := range meta { + switch value { + case "": + for _, m := range metrics { + v, err := m.GetMeta(key) + if err { + out[key] = v + } + } + default: + out[key] = value + } + } + return out + } + tags := copy_tags(f.Tags, matches) + meta := copy_meta(f.Meta, matches) + + var m lp.CCMetric + switch t := value.(type) { + case float64: + m, err = lp.New(f.Name, tags, meta, map[string]interface{}{"value": t}, starttime) + case float32: + m, err = lp.New(f.Name, tags, meta, map[string]interface{}{"value": t}, starttime) + case int: + m, err = lp.New(f.Name, tags, meta, map[string]interface{}{"value": t}, starttime) + case int64: + m, err = lp.New(f.Name, tags, meta, map[string]interface{}{"value": t}, starttime) + case string: + m, err = lp.New(f.Name, tags, meta, map[string]interface{}{"value": t}, starttime) + default: + cclog.ComponentError("MetricCache", "Gval returned invalid type", t, "skipping 
metric", f.Name) + } + if err != nil { + cclog.ComponentError("MetricCache", "Cannot create metric from Gval result", value, ":", err.Error()) + } + cclog.ComponentDebug("MetricCache", "SEND", m) + select { + case c.output <- m: + default: + } + + } + } +} + +func (c *metricAggregator) AddAggregation(name, function, condition string, tags, meta map[string]string) error { + // Since "" cannot be used inside of JSON strings, we use '' and replace them here because gval does not like '' + // but wants "" + newfunc := strings.ReplaceAll(function, "'", "\"") + newcond := strings.ReplaceAll(condition, "'", "\"") + gvalCond, err := gval.Full(metricCacheLanguage).NewEvaluable(newcond) + if err != nil { + cclog.ComponentError("MetricAggregator", "Cannot add aggregation, invalid if condition", newcond, ":", err.Error()) + return err + } + gvalFunc, err := gval.Full(metricCacheLanguage).NewEvaluable(newfunc) + if err != nil { + cclog.ComponentError("MetricAggregator", "Cannot add aggregation, invalid function condition", newfunc, ":", err.Error()) + return err + } + for _, agg := range c.functions { + if agg.Name == name { + agg.Name = name + agg.Condition = newcond + agg.Function = newfunc + agg.Tags = tags + agg.Meta = meta + agg.gvalCond = gvalCond + agg.gvalFunc = gvalFunc + return nil + } + } + var agg metricAggregatorIntervalConfig + agg.Name = name + agg.Condition = newcond + agg.gvalCond = gvalCond + agg.Function = newfunc + agg.gvalFunc = gvalFunc + agg.Tags = tags + agg.Meta = meta + c.functions = append(c.functions, &agg) + return nil +} + +func (c *metricAggregator) DeleteAggregation(name string) error { + for i, agg := range c.functions { + if agg.Name == name { + copy(c.functions[i:], c.functions[i+1:]) + c.functions[len(c.functions)-1] = nil + c.functions = c.functions[:len(c.functions)-1] + return nil + } + } + return fmt.Errorf("no aggregation for metric name %s", name) +} + +func (c *metricAggregator) AddConstant(name string, value interface{}) { + 
c.constants[name] = value +} + +func (c *metricAggregator) DelConstant(name string) { + delete(c.constants, name) +} + +func (c *metricAggregator) AddFunction(name string, function func(args ...interface{}) (interface{}, error)) { + c.language = gval.NewLanguage(c.language, gval.Function(name, function)) +} + +func NewAggregator(output chan lp.CCMetric) (MetricAggregator, error) { + a := new(metricAggregator) + err := a.Init(output) + if err != nil { + return nil, err + } + return a, err +} diff --git a/internal/metricRouter/metricAggregatorFunctions.go b/internal/metricRouter/metricAggregatorFunctions.go new file mode 100644 index 0000000..4133a4b --- /dev/null +++ b/internal/metricRouter/metricAggregatorFunctions.go @@ -0,0 +1,376 @@ +package metricRouter + +import ( + "errors" + "fmt" + "math" + "regexp" + "sort" + "strings" + + cclog "github.com/ClusterCockpit/cc-metric-collector/internal/ccLogger" + topo "github.com/ClusterCockpit/cc-metric-collector/internal/ccTopology" +) + +/* + * Arithmetic functions on value arrays + */ + +// Sum up values +func sumfunc(args ...interface{}) (interface{}, error) { + s := 0.0 + values, ok := args[0].([]float64) + if ok { + cclog.ComponentDebug("MetricCache", "SUM FUNC START") + for _, x := range values { + s += x + } + cclog.ComponentDebug("MetricCache", "SUM FUNC END", s) + } else { + cclog.ComponentDebug("MetricCache", "SUM FUNC CAST FAILED") + } + return s, nil +} + +// Get the minimum value +func minfunc(args ...interface{}) (interface{}, error) { + var err error = nil + switch values := args[0].(type) { + case []float64: + var s float64 = math.MaxFloat64 + for _, x := range values { + if x < s { + s = x + } + } + return s, nil + case []float32: + var s float32 = math.MaxFloat32 + for _, x := range values { + if x < s { + s = x + } + } + return s, nil + case []int: + var s int = math.MaxInt + for _, x := range values { + if x < s { + s = x + } + } + return s, nil + case []int64: + var s int64 = math.MaxInt64 + for _, x 
:= range values { + if x < s { + s = x + } + } + return s, nil + case []int32: + var s int32 = math.MaxInt32 + for _, x := range values { + if x < s { + s = x + } + } + return s, nil + default: + err = errors.New("function 'min' only on list of values (float64, float32, int, int32, int64)") + } + + return 0.0, err +} + +// Get the average or mean value +func avgfunc(args ...interface{}) (interface{}, error) { + switch values := args[0].(type) { + case []float64: + var s float64 = 0 + for _, x := range values { + s += x + } + return s / float64(len(values)), nil + case []float32: + var s float32 = 0 + for _, x := range values { + s += x + } + return s / float32(len(values)), nil + case []int: + var s int = 0 + for _, x := range values { + s += x + } + return s / len(values), nil + case []int64: + var s int64 = 0 + for _, x := range values { + s += x + } + return s / int64(len(values)), nil + } + return 0.0, nil +} + +// Get the maximum value +func maxfunc(args ...interface{}) (interface{}, error) { + s := 0.0 + values, ok := args[0].([]float64) + if ok { + for _, x := range values { + if x > s { + s = x + } + } + } + return s, nil +} + +// Get the median value +func medianfunc(args ...interface{}) (interface{}, error) { + switch values := args[0].(type) { + case []float64: + sort.Float64s(values) + return values[len(values)/2], nil + // case []float32: + // sort.Float64s(values) + // return values[len(values)/2], nil + case []int: + sort.Ints(values) + return values[len(values)/2], nil + + // case []int64: + // sort.Ints(values) + // return values[len(values)/2], nil + // case []int32: + // sort.Ints(values) + // return values[len(values)/2], nil + } + return 0.0, errors.New("function 'median()' only on lists of type float64 and int") +} + +/* + * Get number of values in list. 
Returns always an int + */ + +func lenfunc(args ...interface{}) (interface{}, error) { + var err error = nil + var length int = 0 + switch values := args[0].(type) { + case []float64: + length = len(values) + case []float32: + length = len(values) + case []int: + length = len(values) + case []int64: + length = len(values) + case []int32: + length = len(values) + case float64: + err = errors.New("function 'len' can only be applied on arrays and strings") + case float32: + err = errors.New("function 'len' can only be applied on arrays and strings") + case int: + err = errors.New("function 'len' can only be applied on arrays and strings") + case int64: + err = errors.New("function 'len' can only be applied on arrays and strings") + case string: + length = len(values) + } + return length, err +} + +/* + * Check if a values is in a list + * In constrast to most of the other functions, this one is an infix operator for + * - substring matching: `"abc" in "abcdef"` -> true + * - substring matching with int casting: `3 in "abd3"` -> true + * - search for an int in an int list: `3 in getCpuList()` -> true (if you have more than 4 CPU hardware threads) + */ + +func infunc(a interface{}, b interface{}) (interface{}, error) { + switch match := a.(type) { + case string: + switch total := b.(type) { + case string: + return strings.Contains(total, match), nil + } + case int: + switch total := b.(type) { + case []int: + for _, x := range total { + if x == match { + return true, nil + } + } + case string: + smatch := fmt.Sprintf("%d", match) + return strings.Contains(total, smatch), nil + } + + } + return false, nil +} + +/* + * Regex matching of strings (metric name, tag keys, tag values, meta keys, meta values) + * Since we cannot use \ inside JSON strings without escaping, we use % instead for the + * format keys \d = %d, \w = %d, ... 
Not sure how to fix this + */ + +func matchfunc(args ...interface{}) (interface{}, error) { + switch match := args[0].(type) { + case string: + switch total := args[1].(type) { + case string: + smatch := strings.Replace(match, "%", "\\", -1) + regex, err := regexp.Compile(smatch) + if err != nil { + return false, err + } + s := regex.Find([]byte(total)) + return s != nil, nil + } + } + return false, nil +} + +/* + * System topology getter functions + */ + +// for a given cpuid, it returns the core id +func getCpuCoreFunc(args ...interface{}) (interface{}, error) { + switch cpuid := args[0].(type) { + case int: + return topo.GetCpuCore(cpuid), nil + } + return -1, errors.New("function 'getCpuCore' accepts only an 'int' cpuid") +} + +// for a given cpuid, it returns the socket id +func getCpuSocketFunc(args ...interface{}) (interface{}, error) { + switch cpuid := args[0].(type) { + case int: + return topo.GetCpuSocket(cpuid), nil + } + return -1, errors.New("function 'getCpuCore' accepts only an 'int' cpuid") +} + +// for a given cpuid, it returns the id of the NUMA node +func getCpuNumaDomainFunc(args ...interface{}) (interface{}, error) { + switch cpuid := args[0].(type) { + case int: + return topo.GetCpuNumaDomain(cpuid), nil + } + return -1, errors.New("function 'getCpuNuma' accepts only an 'int' cpuid") +} + +// for a given cpuid, it returns the id of the CPU die +func getCpuDieFunc(args ...interface{}) (interface{}, error) { + switch cpuid := args[0].(type) { + case int: + return topo.GetCpuDie(cpuid), nil + } + return -1, errors.New("function 'getCpuDie' accepts only an 'int' cpuid") +} + +// for a given core id, it returns the list of cpuids +func getCpuListOfCoreFunc(args ...interface{}) (interface{}, error) { + cpulist := make([]int, 0) + switch in := args[0].(type) { + case int: + for _, c := range topo.CpuData() { + if c.Core == in { + cpulist = append(cpulist, c.Cpuid) + } + } + } + return cpulist, nil +} + +// for a given socket id, it returns the list 
of cpuids +func getCpuListOfSocketFunc(args ...interface{}) (interface{}, error) { + cpulist := make([]int, 0) + switch in := args[0].(type) { + case int: + for _, c := range topo.CpuData() { + if c.Socket == in { + cpulist = append(cpulist, c.Cpuid) + } + } + } + return cpulist, nil +} + +// for a given id of a NUMA domain, it returns the list of cpuids +func getCpuListOfNumaDomainFunc(args ...interface{}) (interface{}, error) { + cpulist := make([]int, 0) + switch in := args[0].(type) { + case int: + for _, c := range topo.CpuData() { + if c.Numadomain == in { + cpulist = append(cpulist, c.Cpuid) + } + } + } + return cpulist, nil +} + +// for a given CPU die id, it returns the list of cpuids +func getCpuListOfDieFunc(args ...interface{}) (interface{}, error) { + cpulist := make([]int, 0) + switch in := args[0].(type) { + case int: + for _, c := range topo.CpuData() { + if c.Die == in { + cpulist = append(cpulist, c.Cpuid) + } + } + } + return cpulist, nil +} + +// wrapper function to get a list of all cpuids of the node +func getCpuListOfNode(args ...interface{}) (interface{}, error) { + return topo.CpuList(), nil +} + +// helper function to get the cpuid list for a CCMetric type tag set (type and type-id) +// since there is no access to the metric data in the function, is should be called like +// `getCpuListOfType()` +func getCpuListOfType(args ...interface{}) (interface{}, error) { + cpulist := make([]int, 0) + switch typ := args[0].(type) { + case string: + switch typ { + case "node": + return topo.CpuList(), nil + case "socket": + return getCpuListOfSocketFunc(args[1]) + case "numadomain": + return getCpuListOfNumaDomainFunc(args[1]) + case "core": + return getCpuListOfCoreFunc(args[1]) + case "cpu": + var cpu int + + switch id := args[1].(type) { + case string: + _, err := fmt.Scanf(id, "%d", &cpu) + if err == nil { + cpulist = append(cpulist, cpu) + } + case int: + cpulist = append(cpulist, id) + case int64: + cpulist = append(cpulist, int(id)) + } + + } + 
} + return cpulist, errors.New("no valid args type and type-id") +} diff --git a/internal/metricRouter/metricCache.go b/internal/metricRouter/metricCache.go new file mode 100644 index 0000000..1cfd8c3 --- /dev/null +++ b/internal/metricRouter/metricCache.go @@ -0,0 +1,176 @@ +package metricRouter + +import ( + "sync" + "time" + + cclog "github.com/ClusterCockpit/cc-metric-collector/internal/ccLogger" + + lp "github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric" + mct "github.com/ClusterCockpit/cc-metric-collector/internal/multiChanTicker" +) + +type metricCachePeriod struct { + startstamp time.Time + stopstamp time.Time + numMetrics int + sizeMetrics int + metrics []lp.CCMetric +} + +// Metric cache data structure +type metricCache struct { + numPeriods int + curPeriod int + intervals []*metricCachePeriod + wg *sync.WaitGroup + ticker mct.MultiChanTicker + tickchan chan time.Time + done chan bool + output chan lp.CCMetric + aggEngine MetricAggregator +} + +type MetricCache interface { + Init(output chan lp.CCMetric, ticker mct.MultiChanTicker, wg *sync.WaitGroup, numPeriods int) error + Start() + Add(metric lp.CCMetric) + GetPeriod(index int) (time.Time, time.Time, []lp.CCMetric) + AddAggregation(name, function, condition string, tags, meta map[string]string) error + DeleteAggregation(name string) error + Close() +} + +func (c *metricCache) Init(output chan lp.CCMetric, ticker mct.MultiChanTicker, wg *sync.WaitGroup, numPeriods int) error { + var err error = nil + c.done = make(chan bool) + c.wg = wg + c.ticker = ticker + c.numPeriods = numPeriods + c.output = output + c.intervals = make([]*metricCachePeriod, 0) + for i := 0; i < c.numPeriods+1; i++ { + p := new(metricCachePeriod) + p.numMetrics = 0 + p.sizeMetrics = 0 + p.metrics = make([]lp.CCMetric, 0) + c.intervals = append(c.intervals, p) + } + + // Create a new aggregation engine. 
No separate goroutine at the moment + // The code is executed by the MetricCache goroutine + c.aggEngine, err = NewAggregator(c.output) + if err != nil { + cclog.ComponentError("MetricCache", "Cannot create aggregator") + return err + } + + return nil +} + +// Start starts the metric cache +func (c *metricCache) Start() { + + c.tickchan = make(chan time.Time) + c.ticker.AddChannel(c.tickchan) + // Router cache is done + done := func() { + cclog.ComponentDebug("MetricCache", "DONE") + close(c.done) + } + + // Rotate cache interval + rotate := func(timestamp time.Time) int { + oldPeriod := c.curPeriod + c.curPeriod = oldPeriod + 1 + if c.curPeriod >= c.numPeriods { + c.curPeriod = 0 + } + c.intervals[oldPeriod].numMetrics = 0 + c.intervals[oldPeriod].stopstamp = timestamp + c.intervals[c.curPeriod].startstamp = timestamp + c.intervals[c.curPeriod].stopstamp = timestamp + return oldPeriod + } + + c.wg.Add(1) + go func() { + defer c.wg.Done() + for { + select { + case <-c.done: + done() + return + case tick := <-c.tickchan: + old := rotate(tick) + // Get the last period and evaluate aggregation metrics + starttime, endtime, metrics := c.GetPeriod(old) + if len(metrics) > 0 { + c.aggEngine.Eval(starttime, endtime, metrics) + } else { + // This message is also printed in the first interval after startup + cclog.ComponentDebug("MetricCache", "EMPTY INTERVAL?") + } + } + } + }() + cclog.ComponentDebug("MetricCache", "START") +} + +// Add a metric to the cache. 
The interval is defined by the global timer (rotate() in Start()) +// The intervals list is used as round-robin buffer and the metric list grows dynamically and +// to avoid reallocations +func (c *metricCache) Add(metric lp.CCMetric) { + if c.curPeriod >= 0 && c.curPeriod < c.numPeriods { + p := c.intervals[c.curPeriod] + if p.numMetrics < p.sizeMetrics { + p.metrics[p.numMetrics] = metric + p.numMetrics = p.numMetrics + 1 + p.stopstamp = metric.Time() + } else { + p.metrics = append(p.metrics, metric) + p.numMetrics = p.numMetrics + 1 + p.sizeMetrics = p.sizeMetrics + 1 + p.stopstamp = metric.Time() + } + } +} + +func (c *metricCache) AddAggregation(name, function, condition string, tags, meta map[string]string) error { + return c.aggEngine.AddAggregation(name, function, condition, tags, meta) +} + +func (c *metricCache) DeleteAggregation(name string) error { + return c.aggEngine.DeleteAggregation(name) +} + +// Get all metrics of a interval. The index is the difference to the current interval, so index=0 +// is the current one, index=1 the last interval and so on. Returns and empty array if a wrong index +// is given (negative index, index larger than configured number of total intervals, ...) 
+func (c *metricCache) GetPeriod(index int) (time.Time, time.Time, []lp.CCMetric) { + if index >= 0 && index < c.numPeriods { + pindex := c.curPeriod - index + if pindex < 0 { + pindex = c.numPeriods - pindex + } + if pindex >= 0 && pindex < c.numPeriods { + return c.intervals[pindex].startstamp, c.intervals[pindex].stopstamp, c.intervals[pindex].metrics + } + } + return time.Now(), time.Now(), make([]lp.CCMetric, 0) +} + +// Close finishes / stops the metric cache +func (c *metricCache) Close() { + cclog.ComponentDebug("MetricCache", "CLOSE") + c.done <- true +} + +func NewCache(output chan lp.CCMetric, ticker mct.MultiChanTicker, wg *sync.WaitGroup, numPeriods int) (MetricCache, error) { + c := new(metricCache) + err := c.Init(output, ticker, wg, numPeriods) + if err != nil { + return nil, err + } + return c, err +} diff --git a/internal/metricRouter/metricRouter.go b/internal/metricRouter/metricRouter.go index 96a2f05..870af02 100644 --- a/internal/metricRouter/metricRouter.go +++ b/internal/metricRouter/metricRouter.go @@ -23,23 +23,28 @@ type metricRouterTagConfig struct { // Metric router configuration type metricRouterConfig struct { - AddTags []metricRouterTagConfig `json:"add_tags"` // List of tags that are added when the condition is met - DelTags []metricRouterTagConfig `json:"delete_tags"` // List of tags that are removed when the condition is met - IntervalStamp bool `json:"interval_timestamp"` // Update timestamp periodically by ticker each interval? + AddTags []metricRouterTagConfig `json:"add_tags"` // List of tags that are added when the condition is met + DelTags []metricRouterTagConfig `json:"delete_tags"` // List of tags that are removed when the condition is met + IntervalAgg []metricAggregatorIntervalConfig `json:"interval_aggregates"` // List of aggregation function processed at the end of an interval + IntervalStamp bool `json:"interval_timestamp"` // Update timestamp periodically by ticker each interval? 
+ NumCacheIntervals int `json:"num_cache_intervals"` // Number of intervals of cached metrics for evaluation } // Metric router data structure type metricRouter struct { - hostname string // Hostname used in tags - coll_input chan lp.CCMetric // Input channel from CollectorManager - recv_input chan lp.CCMetric // Input channel from ReceiveManager - outputs []chan lp.CCMetric // List of all output channels - done chan bool // channel to finish / stop metric router - wg *sync.WaitGroup // wait group for all goroutines in cc-metric-collector - timestamp time.Time // timestamp periodically updated by ticker each interval - timerdone chan bool // channel to finish / stop timestamp updater - ticker mct.MultiChanTicker // periodically ticking once each interval - config metricRouterConfig // json encoded config for metric router + hostname string // Hostname used in tags + coll_input chan lp.CCMetric // Input channel from CollectorManager + recv_input chan lp.CCMetric // Input channel from ReceiveManager + cache_input chan lp.CCMetric // Input channel from MetricCache + outputs []chan lp.CCMetric // List of all output channels + done chan bool // channel to finish / stop metric router + wg *sync.WaitGroup // wait group for all goroutines in cc-metric-collector + timestamp time.Time // timestamp periodically updated by ticker each interval + timerdone chan bool // channel to finish / stop timestamp updater + ticker mct.MultiChanTicker // periodically ticking once each interval + config metricRouterConfig // json encoded config for metric router + cache MetricCache // pointer to MetricCache + cachewg sync.WaitGroup // wait group for MetricCache } // MetricRouter access functions @@ -61,6 +66,7 @@ type MetricRouter interface { func (r *metricRouter) Init(ticker mct.MultiChanTicker, wg *sync.WaitGroup, routerConfigFile string) error { r.outputs = make([]chan lp.CCMetric, 0) r.done = make(chan bool) + r.cache_input = make(chan lp.CCMetric) r.wg = wg r.ticker = ticker @@ -86,6 
+92,18 @@ func (r *metricRouter) Init(ticker mct.MultiChanTicker, wg *sync.WaitGroup, rout cclog.ComponentError("MetricRouter", err.Error()) return err } + numIntervals := r.config.NumCacheIntervals + if numIntervals <= 0 { + numIntervals = 1 + } + r.cache, err = NewCache(r.cache_input, r.ticker, &r.cachewg, numIntervals) + if err != nil { + cclog.ComponentError("MetricRouter", "MetricCache initialization failed:", err.Error()) + return err + } + for _, agg := range r.config.IntervalAgg { + r.cache.AddAggregation(agg.Name, agg.Function, agg.Condition, agg.Tags, agg.Meta) + } return nil } @@ -211,6 +229,9 @@ func (r *metricRouter) Start() { } } + // Start Metric Cache + r.cache.Start() + r.wg.Add(1) go func() { defer r.wg.Done() @@ -227,6 +248,7 @@ func (r *metricRouter) Start() { p.SetTime(r.timestamp) } forward(p) + r.cache.Add(p) case p := <-r.recv_input: // receive from receive manager @@ -234,6 +256,11 @@ func (r *metricRouter) Start() { p.SetTime(r.timestamp) } forward(p) + + case p := <-r.cache_input: + // receive from metric collector + p.AddTag("hostname", r.hostname) + forward(p) } } }() @@ -267,6 +294,8 @@ func (r *metricRouter) Close() { // wait for close of channel r.timerdone <-r.timerdone } + r.cache.Close() + r.cachewg.Wait() } // New creates a new initialized metric router From 18c5d0eb34878e71a9989319d4cd020787133ad5 Mon Sep 17 00:00:00 2001 From: Thomas Roehl Date: Sun, 30 Jan 2022 15:04:31 +0100 Subject: [PATCH 056/174] Add example interval aggregation to MetricRouter config for CI --- .github/ci-router.json | 55 +++++++++++++++++++++++++++--------------- 1 file changed, 35 insertions(+), 20 deletions(-) diff --git a/.github/ci-router.json b/.github/ci-router.json index a9f8714..0146768 100644 --- a/.github/ci-router.json +++ b/.github/ci-router.json @@ -1,22 +1,37 @@ { - "add_tags" : [ - { - "key" : "cluster", - "value" : "testcluster", - "if" : "*" - }, - { - "key" : "test", - "value" : "testing", - "if" : "name == 'temp_package_id_0'" - } - ], 
- "delete_tags" : [ - { - "key" : "unit", - "value" : "*", - "if" : "*" - } - ], - "interval_timestamp" : true + "add_tags": [ + { + "key": "cluster", + "value": "testcluster", + "if": "*" + }, + { + "key": "test", + "value": "testing", + "if": "name == 'temp_package_id_0'" + } + ], + "delete_tags": [ + { + "key": "unit", + "value": "*", + "if": "*" + } + ], + "interval_aggregates": [ + { + "name": "temp_cores_avg", + "function": "avg(values)", + "if": "match('temp_core_%d+', metric.Name())", + "tags": { + "type": "node" + }, + "meta": { + "group": "", + "unit": "", + "source": "MetricAggregator" + } + } + ], + "interval_timestamp": true } From 70ebd2f36d33143e770ba80758d3e8656b997fa6 Mon Sep 17 00:00:00 2001 From: Thomas Roehl Date: Sun, 30 Jan 2022 15:13:12 +0100 Subject: [PATCH 057/174] Add gval to go files --- go.mod | 1 + go.sum | 3 +++ 2 files changed, 4 insertions(+) diff --git a/go.mod b/go.mod index 0789f7e..130f5cc 100644 --- a/go.mod +++ b/go.mod @@ -12,6 +12,7 @@ require ( ) require ( + github.com/PaesslerAG/gval v1.1.2 github.com/golang/protobuf v1.5.2 // indirect github.com/nats-io/nats-server/v2 v2.7.0 // indirect google.golang.org/protobuf v1.27.1 // indirect diff --git a/go.sum b/go.sum index 311633a..44be790 100644 --- a/go.sum +++ b/go.sum @@ -1,5 +1,8 @@ github.com/NVIDIA/go-nvml v0.11.1-0 h1:XHSz3zZKC4NCP2ja1rI7++DXFhA+uDhdYa3MykCTGHY= github.com/NVIDIA/go-nvml v0.11.1-0/go.mod h1:hy7HYeQy335x6nEss0Ne3PYqleRa6Ct+VKD9RQ4nyFs= +github.com/PaesslerAG/gval v1.1.2 h1:EROKxV4/fAKWb0Qoj7NOxmHZA7gcpjOV9XgiRZMRCUU= +github.com/PaesslerAG/gval v1.1.2/go.mod h1:Fa8gfkCmUsELXgayr8sfL/sw+VzCVoa03dcOcR/if2w= +github.com/PaesslerAG/jsonpath v0.1.0/go.mod h1:4BzmtoM/PI8fPO4aQGIusjGxGir2BzcV0grWtFzq1Y8= github.com/cyberdelia/templates v0.0.0-20141128023046-ca7fffd4298c/go.mod h1:GyV+0YP4qX0UQ7r2MoYZ+AvYDp12OF5yg4q8rGnyNh4= github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/davecgh/go-spew v1.1.1 
h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= From df31df149b9d2139d21e35af87ad00b0c05e5c74 Mon Sep 17 00:00:00 2001 From: Thomas Roehl Date: Sun, 30 Jan 2022 15:16:46 +0100 Subject: [PATCH 058/174] Fix for missing math.MaxInt in go 1.16 --- internal/metricRouter/metricAggregatorFunctions.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/internal/metricRouter/metricAggregatorFunctions.go b/internal/metricRouter/metricAggregatorFunctions.go index 4133a4b..f00479d 100644 --- a/internal/metricRouter/metricAggregatorFunctions.go +++ b/internal/metricRouter/metricAggregatorFunctions.go @@ -53,7 +53,7 @@ func minfunc(args ...interface{}) (interface{}, error) { } return s, nil case []int: - var s int = math.MaxInt + var s int = int(math.MaxInt32) for _, x := range values { if x < s { s = x From d915bcc02d46e3a41b2d8abaf49e28855bcb1f9f Mon Sep 17 00:00:00 2001 From: Thomas Gruber Date: Sun, 30 Jan 2022 15:21:24 +0100 Subject: [PATCH 059/174] Add sink to add metrics to Ganglia through gmetric (#15) --- sinks/gangliaSink.go | 81 ++++++++++++++++++++++++++++++++++++++++++++ sinks/sinkManager.go | 1 + 2 files changed, 82 insertions(+) create mode 100644 sinks/gangliaSink.go diff --git a/sinks/gangliaSink.go b/sinks/gangliaSink.go new file mode 100644 index 0000000..1db11b6 --- /dev/null +++ b/sinks/gangliaSink.go @@ -0,0 +1,81 @@ +package sinks + +import ( + "fmt" + "strings" + "log" + // "time" + lp "github.com/influxdata/line-protocol" + "os/exec" +) + +const GMETRIC_EXEC = `gmetric` + +type GangliaSink struct { + Sink + gmetric_path string +} + +func (s *GangliaSink) Init(config SinkConfig) error { + p, err := exec.LookPath(string(GMETRIC_EXEC)) + if err == nil { + s.gmetric_path = p + } + return err +} + +func (s *GangliaSink) Write(point lp.MutableMetric) error { + var err error = nil + var tagsstr []string + var argstr []string + for _, t := range point.TagList() { + switch t.Key { + case "cluster": + argstr = append(argstr, 
fmt.Sprintf("--cluster=%s", t.Value)) + case "unit": + argstr = append(argstr, fmt.Sprintf("--units=%s", t.Value)) + case "group": + argstr = append(argstr, fmt.Sprintf("--group=%s", t.Value)) + default: + tagsstr = append(tagsstr, fmt.Sprintf("%s=%s", t.Key, t.Value)) + } + } + if len(tagsstr) > 0 { + argstr = append(argstr, fmt.Sprintf("--desc=%q", strings.Join(tagsstr, ","))) + } + argstr = append(argstr, fmt.Sprintf("--name=%s", point.Name())) + for _, f := range point.FieldList() { + if f.Key == "value" { + switch f.Value.(type) { + case float64: + argstr = append(argstr, fmt.Sprintf("--value=%v", f.Value.(float64))) + argstr = append(argstr, "--type=double") + case float32: + argstr = append(argstr, fmt.Sprintf("--value=%v", f.Value.(float32))) + argstr = append(argstr, "--type=float") + case int: + argstr = append(argstr, fmt.Sprintf("--value=%d", f.Value.(int))) + argstr = append(argstr, "--type=int32") + case int64: + argstr = append(argstr, fmt.Sprintf("--value=%d", f.Value.(int64))) + argstr = append(argstr, "--type=int32") + case string: + argstr = append(argstr, fmt.Sprintf("--value=%q", f.Value.(string))) + argstr = append(argstr, "--type=string") + } + } + } + log.Print(s.gmetric_path, " ", strings.Join(argstr, " ")) +// command := exec.Command(string(GMETRIC_EXEC), strings.Join(argstr, " ")) +// command.Wait() +// _, err := command.Output() + return err +} + +func (s *GangliaSink) Flush() error { + return nil +} + +func (s *GangliaSink) Close() { + return +} diff --git a/sinks/sinkManager.go b/sinks/sinkManager.go index 8d2872a..02421d3 100644 --- a/sinks/sinkManager.go +++ b/sinks/sinkManager.go @@ -15,6 +15,7 @@ var AvailableSinks = map[string]Sink{ "stdout": new(StdoutSink), "nats": new(NatsSink), "http": new(HttpSink), + "ganglia": new(GangliaSink), } // Metric collector manager data structure From 011218ab80c2781141c9c7d91876ce7008daaf5c Mon Sep 17 00:00:00 2001 From: Thomas Roehl Date: Sun, 30 Jan 2022 15:25:57 +0100 Subject: [PATCH 060/174] 
Adjust ganglia sink to CCMetric --- sinks/gangliaSink.go | 95 ++++++++++++++++++++++---------------------- 1 file changed, 48 insertions(+), 47 deletions(-) diff --git a/sinks/gangliaSink.go b/sinks/gangliaSink.go index 1db11b6..87506a0 100644 --- a/sinks/gangliaSink.go +++ b/sinks/gangliaSink.go @@ -2,11 +2,13 @@ package sinks import ( "fmt" + "log" "strings" - "log" + // "time" - lp "github.com/influxdata/line-protocol" "os/exec" + + lp "github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric" ) const GMETRIC_EXEC = `gmetric` @@ -16,59 +18,59 @@ type GangliaSink struct { gmetric_path string } -func (s *GangliaSink) Init(config SinkConfig) error { - p, err := exec.LookPath(string(GMETRIC_EXEC)) - if err == nil { - s.gmetric_path = p - } +func (s *GangliaSink) Init(config sinkConfig) error { + p, err := exec.LookPath(string(GMETRIC_EXEC)) + if err == nil { + s.gmetric_path = p + } return err } -func (s *GangliaSink) Write(point lp.MutableMetric) error { - var err error = nil - var tagsstr []string - var argstr []string - for _, t := range point.TagList() { - switch t.Key { - case "cluster": - argstr = append(argstr, fmt.Sprintf("--cluster=%s", t.Value)) - case "unit": - argstr = append(argstr, fmt.Sprintf("--units=%s", t.Value)) - case "group": - argstr = append(argstr, fmt.Sprintf("--group=%s", t.Value)) - default: - tagsstr = append(tagsstr, fmt.Sprintf("%s=%s", t.Key, t.Value)) - } +func (s *GangliaSink) Write(point lp.CCMetric) error { + var err error = nil + var tagsstr []string + var argstr []string + for _, t := range point.TagList() { + switch t.Key { + case "cluster": + argstr = append(argstr, fmt.Sprintf("--cluster=%s", t.Value)) + case "unit": + argstr = append(argstr, fmt.Sprintf("--units=%s", t.Value)) + case "group": + argstr = append(argstr, fmt.Sprintf("--group=%s", t.Value)) + default: + tagsstr = append(tagsstr, fmt.Sprintf("%s=%s", t.Key, t.Value)) + } } if len(tagsstr) > 0 { - argstr = append(argstr, fmt.Sprintf("--desc=%q", 
strings.Join(tagsstr, ","))) - } + argstr = append(argstr, fmt.Sprintf("--desc=%q", strings.Join(tagsstr, ","))) + } argstr = append(argstr, fmt.Sprintf("--name=%s", point.Name())) for _, f := range point.FieldList() { - if f.Key == "value" { - switch f.Value.(type) { - case float64: - argstr = append(argstr, fmt.Sprintf("--value=%v", f.Value.(float64))) - argstr = append(argstr, "--type=double") - case float32: - argstr = append(argstr, fmt.Sprintf("--value=%v", f.Value.(float32))) - argstr = append(argstr, "--type=float") - case int: - argstr = append(argstr, fmt.Sprintf("--value=%d", f.Value.(int))) - argstr = append(argstr, "--type=int32") - case int64: - argstr = append(argstr, fmt.Sprintf("--value=%d", f.Value.(int64))) - argstr = append(argstr, "--type=int32") - case string: - argstr = append(argstr, fmt.Sprintf("--value=%q", f.Value.(string))) - argstr = append(argstr, "--type=string") - } - } + if f.Key == "value" { + switch f.Value.(type) { + case float64: + argstr = append(argstr, fmt.Sprintf("--value=%v", f.Value.(float64))) + argstr = append(argstr, "--type=double") + case float32: + argstr = append(argstr, fmt.Sprintf("--value=%v", f.Value.(float32))) + argstr = append(argstr, "--type=float") + case int: + argstr = append(argstr, fmt.Sprintf("--value=%d", f.Value.(int))) + argstr = append(argstr, "--type=int32") + case int64: + argstr = append(argstr, fmt.Sprintf("--value=%d", f.Value.(int64))) + argstr = append(argstr, "--type=int32") + case string: + argstr = append(argstr, fmt.Sprintf("--value=%q", f.Value.(string))) + argstr = append(argstr, "--type=string") + } + } } log.Print(s.gmetric_path, " ", strings.Join(argstr, " ")) -// command := exec.Command(string(GMETRIC_EXEC), strings.Join(argstr, " ")) -// command.Wait() -// _, err := command.Output() + // command := exec.Command(string(GMETRIC_EXEC), strings.Join(argstr, " ")) + // command.Wait() + // _, err := command.Output() return err } @@ -77,5 +79,4 @@ func (s *GangliaSink) Flush() error { } 
func (s *GangliaSink) Close() { - return } From 3e329c3324d28761a9024d847d1239a53c1d8e82 Mon Sep 17 00:00:00 2001 From: Thomas Roehl Date: Sun, 30 Jan 2022 22:05:27 +0100 Subject: [PATCH 061/174] Move defer after checking error --- metric-collector.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/metric-collector.go b/metric-collector.go index 8121141..066fe3c 100644 --- a/metric-collector.go +++ b/metric-collector.go @@ -33,11 +33,11 @@ type CentralConfigFile struct { func LoadCentralConfiguration(file string, config *CentralConfigFile) error { configFile, err := os.Open(file) - defer configFile.Close() if err != nil { cclog.Error(err.Error()) return err } + defer configFile.Close() jsonParser := json.NewDecoder(configFile) err = jsonParser.Decode(config) return err From 1f55aa247faa1eb03d84fe4c340042cea880cfb2 Mon Sep 17 00:00:00 2001 From: Thomas Gruber Date: Mon, 31 Jan 2022 13:29:14 +0100 Subject: [PATCH 062/174] Run rpmbuild workflow only for new tags --- .github/workflows/rpmbuild.yml | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/.github/workflows/rpmbuild.yml b/.github/workflows/rpmbuild.yml index a7aee22..8d16e37 100644 --- a/.github/workflows/rpmbuild.yml +++ b/.github/workflows/rpmbuild.yml @@ -1,5 +1,8 @@ name: Run RPM Build -on: push +on: + push: + tags: + - '**' jobs: build-centos8: From fd3c7ed573739222182d7205b952869c8efd4a5b Mon Sep 17 00:00:00 2001 From: Holger Obermaier <40787752+ho-ob@users.noreply.github.com> Date: Mon, 31 Jan 2022 14:02:00 +0100 Subject: [PATCH 063/174] Add documentation --- internal/ccMetric/ccMetric.go | 85 +++++++++++++++++++++++++++-------- 1 file changed, 66 insertions(+), 19 deletions(-) diff --git a/internal/ccMetric/ccMetric.go b/internal/ccMetric/ccMetric.go index 05f81ff..ae7ab1b 100644 --- a/internal/ccMetric/ccMetric.go +++ b/internal/ccMetric/ccMetric.go @@ -11,27 +11,30 @@ import ( // Most functions are derived from github.com/influxdata/line-protocol/metric.go // The 
metric type is extended with an extra meta information list re-using the Tag // type. - +// +// See: https://docs.influxdata.com/influxdb/latest/reference/syntax/line-protocol/ type ccMetric struct { - name string - tags []*lp.Tag - fields []*lp.Field - tm time.Time - meta []*lp.Tag + name string // Measurement name + tags []*lp.Tag // ordered list of of tags + fields []*lp.Field // unordered list of of fields + tm time.Time // timestamp + meta []*lp.Tag // odered list of meta data tags } +// ccmetric access functions type CCMetric interface { - lp.MutableMetric - AddMeta(key, value string) - MetaList() []*lp.Tag - RemoveTag(key string) - GetTag(key string) (string, bool) - GetMeta(key string) (string, bool) - GetField(key string) (interface{}, bool) - HasField(key string) bool - RemoveField(key string) + lp.MutableMetric // SetTime, AddTag, AddField + AddMeta(key, value string) // Add a meta data tag + MetaList() []*lp.Tag // Returns the meta data list + RemoveTag(key string) // Remove a tag addressed by its key + GetTag(key string) (string, bool) // Get a tag addressed by its key + GetMeta(key string) (string, bool) // Get a meta data tab addressed by its key + GetField(key string) (interface{}, bool) // Get a field addressed by its key + HasField(key string) bool // Check if a field key is present + RemoveField(key string) // Remove a field addressed by its key } +// Meta returns the list of meta data tags as key-value mapping func (m *ccMetric) Meta() map[string]string { meta := make(map[string]string, len(m.meta)) for _, m := range m.meta { @@ -40,18 +43,22 @@ func (m *ccMetric) Meta() map[string]string { return meta } +// MetaList returns the list of meta data tags func (m *ccMetric) MetaList() []*lp.Tag { return m.meta } +// String implements the stringer interface for data type ccMetric func (m *ccMetric) String() string { return fmt.Sprintf("%s %v %v %v %d", m.name, m.Tags(), m.Meta(), m.Fields(), m.tm.UnixNano()) } +// Name returns the metric name func (m 
*ccMetric) Name() string { return m.name } +// Tags returns the the list of tags as key-value-mapping func (m *ccMetric) Tags() map[string]string { tags := make(map[string]string, len(m.tags)) for _, tag := range m.tags { @@ -60,10 +67,12 @@ func (m *ccMetric) Tags() map[string]string { return tags } +// TagList returns the list of tags func (m *ccMetric) TagList() []*lp.Tag { return m.tags } +// Fields returns the list of fields as key-value-mapping func (m *ccMetric) Fields() map[string]interface{} { fields := make(map[string]interface{}, len(m.fields)) for _, field := range m.fields { @@ -73,18 +82,22 @@ func (m *ccMetric) Fields() map[string]interface{} { return fields } +// FieldList returns the list of fields func (m *ccMetric) FieldList() []*lp.Field { return m.fields } +// Time returns timestamp func (m *ccMetric) Time() time.Time { return m.tm } +// SetTime sets the timestamp func (m *ccMetric) SetTime(t time.Time) { m.tm = t } +// HasTag checks if a tag with key equal to is present in the list of tags func (m *ccMetric) HasTag(key string) bool { for _, tag := range m.tags { if tag.Key == key { @@ -94,6 +107,7 @@ func (m *ccMetric) HasTag(key string) bool { return false } +// GetTag returns the tag with tag's key equal to func (m *ccMetric) GetTag(key string) (string, bool) { for _, tag := range m.tags { if tag.Key == key { @@ -103,6 +117,8 @@ func (m *ccMetric) GetTag(key string) (string, bool) { return "", false } +// RemoveTag removes the tag with tag's key equal to +// and keeps the tag list ordered by the keys func (m *ccMetric) RemoveTag(key string) { for i, tag := range m.tags { if tag.Key == key { @@ -114,6 +130,8 @@ func (m *ccMetric) RemoveTag(key string) { } } +// AddTag adds a tag (consisting of key and value) +// and keeps the tag list ordered by the keys func (m *ccMetric) AddTag(key, value string) { for i, tag := range m.tags { if key > tag.Key { @@ -134,6 +152,7 @@ func (m *ccMetric) AddTag(key, value string) { m.tags = append(m.tags, 
&lp.Tag{Key: key, Value: value}) } +// HasTag checks if a meta data tag with meta data's key equal to is present in the list of meta data tags func (m *ccMetric) HasMeta(key string) bool { for _, tag := range m.meta { if tag.Key == key { @@ -143,6 +162,7 @@ func (m *ccMetric) HasMeta(key string) bool { return false } +// GetMeta returns the meta data tag with meta data's key equal to func (m *ccMetric) GetMeta(key string) (string, bool) { for _, tag := range m.meta { if tag.Key == key { @@ -152,6 +172,8 @@ func (m *ccMetric) GetMeta(key string) (string, bool) { return "", false } +// RemoveMeta removes the meta data tag with tag's key equal to +// and keeps the meta data tag list ordered by the keys func (m *ccMetric) RemoveMeta(key string) { for i, tag := range m.meta { if tag.Key == key { @@ -163,6 +185,8 @@ func (m *ccMetric) RemoveMeta(key string) { } } +// AddMeta adds a meta data tag (consisting of key and value) +// and keeps the meta data list ordered by the keys func (m *ccMetric) AddMeta(key, value string) { for i, tag := range m.meta { if key > tag.Key { @@ -183,6 +207,7 @@ func (m *ccMetric) AddMeta(key, value string) { m.meta = append(m.meta, &lp.Tag{Key: key, Value: value}) } +// AddField adds a field (consisting of key and value) to the unordered list of fields func (m *ccMetric) AddField(key string, value interface{}) { for i, field := range m.fields { if key == field.Key { @@ -193,6 +218,7 @@ func (m *ccMetric) AddField(key string, value interface{}) { m.fields = append(m.fields, &lp.Field{Key: key, Value: convertField(value)}) } +// GetField returns the field with field's key equal to func (m *ccMetric) GetField(key string) (interface{}, bool) { for _, field := range m.fields { if field.Key == key { @@ -202,6 +228,7 @@ func (m *ccMetric) GetField(key string) (interface{}, bool) { return "", false } +// HasField checks if a field with field's key equal to is present in the list of fields func (m *ccMetric) HasField(key string) bool { for _, field 
:= range m.fields { if field.Key == key { @@ -211,6 +238,8 @@ func (m *ccMetric) HasField(key string) bool { return false } +// RemoveField removes the field with field's key equal to +// from the unordered list of fields func (m *ccMetric) RemoveField(key string) { for i, field := range m.fields { if field.Key == key { @@ -222,6 +251,7 @@ func (m *ccMetric) RemoveField(key string) { } } +// New creates a new measurement point func New( name string, tags map[string]string, @@ -237,6 +267,7 @@ func New( meta: nil, } + // Sorted list of tags if len(tags) > 0 { m.tags = make([]*lp.Tag, 0, len(tags)) for k, v := range tags { @@ -246,6 +277,7 @@ func New( sort.Slice(m.tags, func(i, j int) bool { return m.tags[i].Key < m.tags[j].Key }) } + // Sorted list of meta data tags if len(meta) > 0 { m.meta = make([]*lp.Tag, 0, len(meta)) for k, v := range meta { @@ -255,6 +287,7 @@ func New( sort.Slice(m.meta, func(i, j int) bool { return m.meta[i].Key < m.meta[j].Key }) } + // Unsorted list of fields if len(fields) > 0 { m.fields = make([]*lp.Field, 0, len(fields)) for k, v := range fields { @@ -269,6 +302,7 @@ func New( return m, nil } +// FromMetric copies the metric func FromMetric(other CCMetric) CCMetric { m := &ccMetric{ name: other.Name(), @@ -291,6 +325,7 @@ func FromMetric(other CCMetric) CCMetric { return m } +// FromInfluxMetric copies the influxDB line protocol metric func FromInfluxMetric(other lp.Metric) CCMetric { m := &ccMetric{ name: other.Name(), @@ -300,16 +335,28 @@ func FromInfluxMetric(other lp.Metric) CCMetric { tm: other.Time(), } - for i, tag := range other.TagList() { - m.tags[i] = &lp.Tag{Key: tag.Key, Value: tag.Value} + for i, otherTag := range other.TagList() { + m.tags[i] = &lp.Tag{ + Key: otherTag.Key, + Value: otherTag.Value, + } } - for i, field := range other.FieldList() { - m.fields[i] = &lp.Field{Key: field.Key, Value: field.Value} + for i, otherField := range other.FieldList() { + m.fields[i] = &lp.Field{ + Key: otherField.Key, + Value: 
otherField.Value, + } } return m } +// convertField converts data types of fields by the following schemata: +// *float32, *float64, float32, float64 -> float64 +// *int, *int8, *int16, *int32, *int64, int, int8, int16, int32, int64 -> int64 +// *uint, *uint8, *uint16, *uint32, *uint64, uint, uint8, uint16, uint32, uint64 -> uint64 +// *[]byte, *string, []byte, string -> string +// *bool, bool -> bool func convertField(v interface{}) interface{} { switch v := v.(type) { case float64: From 862630a21888d1fe860f9ec295453ffc3bcd4230 Mon Sep 17 00:00:00 2001 From: Thomas Gruber Date: Mon, 31 Jan 2022 14:42:19 +0100 Subject: [PATCH 064/174] Extend workflow to test Go 1.16 and 1.17 --- .github/workflows/runonce.yml | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/.github/workflows/runonce.yml b/.github/workflows/runonce.yml index 194710f..2a2cc8a 100644 --- a/.github/workflows/runonce.yml +++ b/.github/workflows/runonce.yml @@ -2,7 +2,7 @@ name: Run Test on: push jobs: - build: + build-1-17: runs-on: ubuntu-latest steps: - uses: actions/checkout@v2 @@ -18,3 +18,19 @@ jobs: - name: Run MetricCollector run: ./cc-metric-collector --once --config .github/ci-config.json + build-1-16: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v2 + + # See: https://github.com/marketplace/actions/setup-go-environment + - name: Setup Golang + uses: actions/setup-go@v2.1.5 + with: + go-version: '^1.16.7' # The version AlmaLinux 8.5 uses + + - name: Build MetricCollector + run: make + + - name: Run MetricCollector + run: ./cc-metric-collector --once --config .github/ci-config.json From 6ff6cb721959d7dd72e11aaa7585a7cb7142f642 Mon Sep 17 00:00:00 2001 From: Thomas Gruber Date: Tue, 1 Feb 2022 14:54:34 +0100 Subject: [PATCH 065/174] Change CCMetric's internal data structure (#22) * package ccmetric rewrite * Create deep copy in New() to avoid access conflicts * Renamed TagMap() -> Tags(), MetaMap() -> Meta Co-authored-by: Holger Obermaier 
<40787752+ho-ob@users.noreply.github.com> --- internal/ccMetric/ccMetric.go | 219 +++++++++----------------- internal/metricRouter/metricRouter.go | 8 +- sinks/gangliaSink.go | 12 +- sinks/influxSink.go | 11 +- 4 files changed, 92 insertions(+), 158 deletions(-) diff --git a/internal/ccMetric/ccMetric.go b/internal/ccMetric/ccMetric.go index ae7ab1b..20b9786 100644 --- a/internal/ccMetric/ccMetric.go +++ b/internal/ccMetric/ccMetric.go @@ -14,62 +14,73 @@ import ( // // See: https://docs.influxdata.com/influxdb/latest/reference/syntax/line-protocol/ type ccMetric struct { - name string // Measurement name - tags []*lp.Tag // ordered list of of tags - fields []*lp.Field // unordered list of of fields - tm time.Time // timestamp - meta []*lp.Tag // odered list of meta data tags + name string // Measurement name + meta map[string]string // map of meta data tags + tags map[string]string // map of of tags + fields []*lp.Field // unordered list of of fields + tm time.Time // timestamp } // ccmetric access functions type CCMetric interface { - lp.MutableMetric // SetTime, AddTag, AddField - AddMeta(key, value string) // Add a meta data tag - MetaList() []*lp.Tag // Returns the meta data list - RemoveTag(key string) // Remove a tag addressed by its key - GetTag(key string) (string, bool) // Get a tag addressed by its key - GetMeta(key string) (string, bool) // Get a meta data tab addressed by its key + lp.Metric // Time(), Name(), TagList(), FieldList() + + SetTime(t time.Time) + + Meta() map[string]string // Map of meta data tags + MetaList() []*lp.Tag // Ordered list of meta data + AddMeta(key, value string) // Add a meta data tag + GetMeta(key string) (string, bool) // Get a meta data tab addressed by its key + + Tags() map[string]string // Map of tags + AddTag(key, value string) // Add a tag + GetTag(key string) (string, bool) // Get a tag by its key + RemoveTag(key string) // Remove a tag by its key + GetField(key string) (interface{}, bool) // Get a field addressed 
by its key HasField(key string) bool // Check if a field key is present RemoveField(key string) // Remove a field addressed by its key } -// Meta returns the list of meta data tags as key-value mapping +// Meta returns the meta data tags as key-value mapping func (m *ccMetric) Meta() map[string]string { - meta := make(map[string]string, len(m.meta)) - for _, m := range m.meta { - meta[m.Key] = m.Value - } - return meta + return m.meta } -// MetaList returns the list of meta data tags +// MetaList returns the the list of meta data tags as sorted list of key value tags func (m *ccMetric) MetaList() []*lp.Tag { - return m.meta + + ml := make([]*lp.Tag, 0, len(m.meta)) + for key, value := range m.meta { + ml = append(ml, &lp.Tag{Key: key, Value: value}) + } + sort.Slice(ml, func(i, j int) bool { return ml[i].Key < ml[j].Key }) + return ml } // String implements the stringer interface for data type ccMetric func (m *ccMetric) String() string { - return fmt.Sprintf("%s %v %v %v %d", m.name, m.Tags(), m.Meta(), m.Fields(), m.tm.UnixNano()) + return fmt.Sprintf("%s %v %v %v %d", m.name, m.tags, m.meta, m.Fields(), m.tm.UnixNano()) } -// Name returns the metric name +// Name returns the measurement name func (m *ccMetric) Name() string { return m.name } // Tags returns the the list of tags as key-value-mapping func (m *ccMetric) Tags() map[string]string { - tags := make(map[string]string, len(m.tags)) - for _, tag := range m.tags { - tags[tag.Key] = tag.Value - } - return tags + return m.tags } -// TagList returns the list of tags +// TagList returns the the list of tags as sorted list of key value tags func (m *ccMetric) TagList() []*lp.Tag { - return m.tags + tl := make([]*lp.Tag, 0, len(m.tags)) + for key, value := range m.tags { + tl = append(tl, &lp.Tag{Key: key, Value: value}) + } + sort.Slice(tl, func(i, j int) bool { return tl[i].Key < tl[j].Key }) + return tl } // Fields returns the list of fields as key-value-mapping @@ -99,112 +110,50 @@ func (m *ccMetric) 
SetTime(t time.Time) { // HasTag checks if a tag with key equal to is present in the list of tags func (m *ccMetric) HasTag(key string) bool { - for _, tag := range m.tags { - if tag.Key == key { - return true - } - } - return false + _, ok := m.tags[key] + return ok } // GetTag returns the tag with tag's key equal to func (m *ccMetric) GetTag(key string) (string, bool) { - for _, tag := range m.tags { - if tag.Key == key { - return tag.Value, true - } - } - return "", false + value, ok := m.tags[key] + return value, ok } // RemoveTag removes the tag with tag's key equal to // and keeps the tag list ordered by the keys func (m *ccMetric) RemoveTag(key string) { - for i, tag := range m.tags { - if tag.Key == key { - copy(m.tags[i:], m.tags[i+1:]) - m.tags[len(m.tags)-1] = nil - m.tags = m.tags[:len(m.tags)-1] - return - } - } + delete(m.tags, key) } // AddTag adds a tag (consisting of key and value) // and keeps the tag list ordered by the keys func (m *ccMetric) AddTag(key, value string) { - for i, tag := range m.tags { - if key > tag.Key { - continue - } - - if key == tag.Key { - tag.Value = value - return - } - - m.tags = append(m.tags, nil) - copy(m.tags[i+1:], m.tags[i:]) - m.tags[i] = &lp.Tag{Key: key, Value: value} - return - } - - m.tags = append(m.tags, &lp.Tag{Key: key, Value: value}) + m.tags[key] = value } // HasTag checks if a meta data tag with meta data's key equal to is present in the list of meta data tags func (m *ccMetric) HasMeta(key string) bool { - for _, tag := range m.meta { - if tag.Key == key { - return true - } - } - return false + _, ok := m.meta[key] + return ok } // GetMeta returns the meta data tag with meta data's key equal to func (m *ccMetric) GetMeta(key string) (string, bool) { - for _, tag := range m.meta { - if tag.Key == key { - return tag.Value, true - } - } - return "", false + value, ok := m.meta[key] + return value, ok } // RemoveMeta removes the meta data tag with tag's key equal to // and keeps the meta data tag list 
ordered by the keys func (m *ccMetric) RemoveMeta(key string) { - for i, tag := range m.meta { - if tag.Key == key { - copy(m.meta[i:], m.meta[i+1:]) - m.meta[len(m.meta)-1] = nil - m.meta = m.meta[:len(m.meta)-1] - return - } - } + delete(m.meta, key) } // AddMeta adds a meta data tag (consisting of key and value) // and keeps the meta data list ordered by the keys func (m *ccMetric) AddMeta(key, value string) { - for i, tag := range m.meta { - if key > tag.Key { - continue - } - - if key == tag.Key { - tag.Value = value - return - } - - m.meta = append(m.meta, nil) - copy(m.meta[i+1:], m.meta[i:]) - m.meta[i] = &lp.Tag{Key: key, Value: value} - return - } - - m.meta = append(m.meta, &lp.Tag{Key: key, Value: value}) + m.meta[key] = value } // AddField adds a field (consisting of key and value) to the unordered list of fields @@ -261,62 +210,49 @@ func New( ) (CCMetric, error) { m := &ccMetric{ name: name, - tags: nil, - fields: nil, + tags: make(map[string]string, len(tags)), + meta: make(map[string]string, len(meta)), + fields: make([]*lp.Field, 0, len(fields)), tm: tm, - meta: nil, } - // Sorted list of tags - if len(tags) > 0 { - m.tags = make([]*lp.Tag, 0, len(tags)) - for k, v := range tags { - m.tags = append(m.tags, - &lp.Tag{Key: k, Value: v}) - } - sort.Slice(m.tags, func(i, j int) bool { return m.tags[i].Key < m.tags[j].Key }) + // deep copy tags + for k, v := range tags { + m.tags[k] = v } - // Sorted list of meta data tags - if len(meta) > 0 { - m.meta = make([]*lp.Tag, 0, len(meta)) - for k, v := range meta { - m.meta = append(m.meta, - &lp.Tag{Key: k, Value: v}) - } - sort.Slice(m.meta, func(i, j int) bool { return m.meta[i].Key < m.meta[j].Key }) + // deep copy meta data tags + for k, v := range meta { + m.meta[k] = v } // Unsorted list of fields - if len(fields) > 0 { - m.fields = make([]*lp.Field, 0, len(fields)) - for k, v := range fields { - v := convertField(v) - if v == nil { - continue - } - m.AddField(k, v) + for k, v := range fields { + v 
:= convertField(v) + if v == nil { + continue } + m.AddField(k, v) } return m, nil } // FromMetric copies the metric -func FromMetric(other CCMetric) CCMetric { +func FromMetric(other ccMetric) CCMetric { m := &ccMetric{ name: other.Name(), - tags: make([]*lp.Tag, len(other.TagList())), + tags: make(map[string]string), fields: make([]*lp.Field, len(other.FieldList())), - meta: make([]*lp.Tag, len(other.MetaList())), + meta: make(map[string]string), tm: other.Time(), } - for i, tag := range other.TagList() { - m.tags[i] = &lp.Tag{Key: tag.Key, Value: tag.Value} + for key, value := range other.Tags() { + m.tags[key] = value } - for i, s := range other.MetaList() { - m.meta[i] = &lp.Tag{Key: s.Key, Value: s.Value} + for key, value := range other.Meta() { + m.meta[key] = value } for i, field := range other.FieldList() { @@ -329,17 +265,14 @@ func FromMetric(other CCMetric) CCMetric { func FromInfluxMetric(other lp.Metric) CCMetric { m := &ccMetric{ name: other.Name(), - tags: make([]*lp.Tag, len(other.TagList())), + tags: make(map[string]string), fields: make([]*lp.Field, len(other.FieldList())), - meta: make([]*lp.Tag, 0), + meta: make(map[string]string), tm: other.Time(), } - for i, otherTag := range other.TagList() { - m.tags[i] = &lp.Tag{ - Key: otherTag.Key, - Value: otherTag.Value, - } + for _, otherTag := range other.TagList() { + m.tags[otherTag.Key] = otherTag.Value } for i, otherField := range other.FieldList() { diff --git a/internal/metricRouter/metricRouter.go b/internal/metricRouter/metricRouter.go index 870af02..83c14e7 100644 --- a/internal/metricRouter/metricRouter.go +++ b/internal/metricRouter/metricRouter.go @@ -141,11 +141,11 @@ func (r *metricRouter) EvalCondition(cond string, point lp.CCMetric) (bool, erro // Add metric name, tags, meta data, fields and timestamp to the parameter list params := make(map[string]interface{}) params["name"] = point.Name() - for _, t := range point.TagList() { - params[t.Key] = t.Value + for key, value := range 
point.Tags() { + params[key] = value } - for _, m := range point.MetaList() { - params[m.Key] = m.Value + for key, value := range point.Meta() { + params[key] = value } for _, f := range point.FieldList() { params[f.Key] = f.Value diff --git a/sinks/gangliaSink.go b/sinks/gangliaSink.go index 87506a0..3fd48e7 100644 --- a/sinks/gangliaSink.go +++ b/sinks/gangliaSink.go @@ -30,16 +30,16 @@ func (s *GangliaSink) Write(point lp.CCMetric) error { var err error = nil var tagsstr []string var argstr []string - for _, t := range point.TagList() { - switch t.Key { + for key, value := range point.Tags() { + switch key { case "cluster": - argstr = append(argstr, fmt.Sprintf("--cluster=%s", t.Value)) + argstr = append(argstr, fmt.Sprintf("--cluster=%s", value)) case "unit": - argstr = append(argstr, fmt.Sprintf("--units=%s", t.Value)) + argstr = append(argstr, fmt.Sprintf("--units=%s", value)) case "group": - argstr = append(argstr, fmt.Sprintf("--group=%s", t.Value)) + argstr = append(argstr, fmt.Sprintf("--group=%s", value)) default: - tagsstr = append(tagsstr, fmt.Sprintf("%s=%s", t.Key, t.Value)) + tagsstr = append(tagsstr, fmt.Sprintf("%s=%s", key, value)) } } if len(tagsstr) > 0 { diff --git a/sinks/influxSink.go b/sinks/influxSink.go index dca1572..7313490 100644 --- a/sinks/influxSink.go +++ b/sinks/influxSink.go @@ -5,10 +5,11 @@ import ( "crypto/tls" "errors" "fmt" + "log" + lp "github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric" influxdb2 "github.com/influxdata/influxdb-client-go/v2" influxdb2Api "github.com/influxdata/influxdb-client-go/v2/api" - "log" ) type InfluxSink struct { @@ -61,12 +62,12 @@ func (s *InfluxSink) Init(config sinkConfig) error { func (s *InfluxSink) Write(point lp.CCMetric) error { tags := map[string]string{} fields := map[string]interface{}{} - for _, t := range point.TagList() { - tags[t.Key] = t.Value + for key, value := range point.Tags() { + tags[key] = value } if s.meta_as_tags { - for _, m := range point.MetaList() { - 
tags[m.Key] = m.Value + for key, value := range point.Meta() { + tags[key] = value } } for _, f := range point.FieldList() { From e550226416579b7f6109775890ac4c1027c456d3 Mon Sep 17 00:00:00 2001 From: Thomas Roehl Date: Tue, 1 Feb 2022 16:01:31 +0100 Subject: [PATCH 066/174] Use gval in LikwidCollector --- collectors/likwidMetric.go | 378 ++++++++++++++++++++++++++++--------- 1 file changed, 289 insertions(+), 89 deletions(-) diff --git a/collectors/likwidMetric.go b/collectors/likwidMetric.go index 430a09b..e3be810 100644 --- a/collectors/likwidMetric.go +++ b/collectors/likwidMetric.go @@ -13,40 +13,59 @@ import ( "errors" "fmt" "io/ioutil" - "log" "math" "os" + "regexp" "strconv" "strings" "time" "unsafe" + + cclog "github.com/ClusterCockpit/cc-metric-collector/internal/ccLogger" lp "github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric" - "gopkg.in/Knetic/govaluate.v2" + topo "github.com/ClusterCockpit/cc-metric-collector/internal/ccTopology" + "github.com/PaesslerAG/gval" ) -type MetricScope int +type MetricScope string const ( METRIC_SCOPE_HWTHREAD = iota - METRIC_SCOPE_SOCKET + METRIC_SCOPE_CORE + METRIC_SCOPE_LLC METRIC_SCOPE_NUMA + METRIC_SCOPE_DIE + METRIC_SCOPE_SOCKET METRIC_SCOPE_NODE ) func (ms MetricScope) String() string { - return []string{"Head", "Shoulder", "Knee", "Toe"}[ms] + return string(ms) +} + +func (ms MetricScope) Granularity() int { + grans := []string{"hwthread", "core", "llc", "numadomain", "die", "socket", "node"} + for i, g := range grans { + if ms.String() == g { + return i + } + } + return -1 } type LikwidCollectorMetricConfig struct { - Name string `json:"name"` - Calc string `json:"calc"` - Scope MetricScope `json:"socket_scope"` - Publish bool `json:"publish"` + Name string `json:"name"` // Name of the metric + Calc string `json:"calc"` // Calculation for the metric using + Aggr string `json:"aggregation"` // if scope unequal to LIKWID metric scope, the values are combined (sum, min, max, mean or avg, median) + Scope 
MetricScope `json:"scope"` // scope for calculation. subscopes are aggregated using the 'aggregation' function + Publish bool `json:"publish"` + granulatity MetricScope } type LikwidCollectorEventsetConfig struct { - Events map[string]string `json:"events"` - Metrics []LikwidCollectorMetricConfig `json:"metrics"` + Events map[string]string `json:"events"` + granulatity map[string]MetricScope + Metrics []LikwidCollectorMetricConfig `json:"metrics"` } type LikwidCollectorConfig struct { @@ -67,13 +86,14 @@ type LikwidCollector struct { mresults map[int]map[int]map[string]float64 gmresults map[int]map[string]float64 basefreq float64 + running bool } type LikwidMetric struct { - name string - search string - socket_scope bool - group_idx int + name string + search string + scope MetricScope + group_idx int } func eventsToEventStr(events map[string]string) string { @@ -84,6 +104,21 @@ func eventsToEventStr(events map[string]string) string { return strings.Join(elist, ",") } +func getGranularity(counter, event string) MetricScope { + if strings.HasPrefix(counter, "PMC") || strings.HasPrefix(counter, "FIXC") { + return "hwthread" + } else if strings.Contains(counter, "BOX") || strings.Contains(counter, "DEV") { + return "socket" + } else if strings.HasPrefix(counter, "PWR") { + if event == "RAPL_CORE_ENERGY" { + return "hwthread" + } else { + return "socket" + } + } + return "unknown" +} + func getBaseFreq() float64 { var freq float64 = math.NaN() C.power_init(0) @@ -117,6 +152,53 @@ func getSocketCpus() map[C.int]int { return outmap } +func (m *LikwidCollector) CatchGvalPanic() { + if rerr := recover(); rerr != nil { + cclog.ComponentError(m.name, "Gval failed to calculate a metric", rerr) + m.init = false + } +} + +func (m *LikwidCollector) initGranularity() { + for _, evset := range m.config.Eventsets { + evset.granulatity = make(map[string]MetricScope) + for counter, event := range evset.Events { + gran := getGranularity(counter, event) + if gran.Granularity() >= 0 { 
+ evset.granulatity[counter] = gran + } + } + for i, metric := range evset.Metrics { + s := regexp.MustCompile("[+-/*()]").Split(metric.Calc, -1) + gran := MetricScope("hwthread") + evset.Metrics[i].granulatity = gran + for _, x := range s { + if _, ok := evset.Events[x]; ok { + if evset.granulatity[x].Granularity() > gran.Granularity() { + gran = evset.granulatity[x] + } + } + } + evset.Metrics[i].granulatity = gran + } + } + for i, metric := range m.config.Metrics { + s := regexp.MustCompile("[+-/*()]").Split(metric.Calc, -1) + gran := MetricScope("hwthread") + m.config.Metrics[i].granulatity = gran + for _, x := range s { + for _, evset := range m.config.Eventsets { + for _, m := range evset.Metrics { + if m.Name == x && m.granulatity.Granularity() > gran.Granularity() { + gran = m.granulatity + } + } + } + } + m.config.Metrics[i].granulatity = gran + } +} + func (m *LikwidCollector) Init(config json.RawMessage) error { var ret C.int m.name = "LikwidCollector" @@ -126,38 +208,70 @@ func (m *LikwidCollector) Init(config json.RawMessage) error { return err } } + m.initGranularity() + if m.config.ForceOverwrite { + os.Setenv("LIKWID_FORCE", "1") + } m.setup() - m.meta = map[string]string{"source": m.name, "group": "PerfCounter"} - cpulist := CpuList() - m.cpulist = make([]C.int, len(cpulist)) - slist := getSocketCpus() + // in some cases, gval causes a panic. We catch it with the handler and deactivate + // the collector (m.init = false). 
+ defer m.CatchGvalPanic() + m.meta = map[string]string{"source": m.name, "group": "PerfCounter"} + cpulist := topo.CpuList() + m.cpulist = make([]C.int, len(cpulist)) + + cclog.ComponentDebug(m.name, "Create maps for socket, numa, core and die metrics") m.sock2tid = make(map[int]int) - // m.numa2tid = make(map[int]int) + // m.numa2tid = make(map[int]int) + // m.core2tid = make(map[int]int) + // m.die2tid = make(map[int]int) for i, c := range cpulist { m.cpulist[i] = C.int(c) - if sid, found := slist[m.cpulist[i]]; found { - m.sock2tid[sid] = i - } + m.sock2tid[topo.GetCpuSocket(c)] = i + // m.numa2tid[topo.GetCpuNumaDomain(c)] = i + // m.core2tid[topo.GetCpuCore(c)] = i + // m.die2tid[topo.GetCpuDie(c)] = i } m.results = make(map[int]map[int]map[string]interface{}) m.mresults = make(map[int]map[int]map[string]float64) m.gmresults = make(map[int]map[string]float64) ret = C.topology_init() if ret != 0 { - return errors.New("Failed to initialize LIKWID topology") - } - if m.config.ForceOverwrite { - os.Setenv("LIKWID_FORCE", "1") + err := errors.New("failed to initialize LIKWID topology") + cclog.ComponentError(m.name, err.Error()) + return err } ret = C.perfmon_init(C.int(len(m.cpulist)), &m.cpulist[0]) if ret != 0 { C.topology_finalize() - return errors.New("Failed to initialize LIKWID topology") + err := errors.New("failed to initialize LIKWID topology") + cclog.ComponentError(m.name, err.Error()) + return err } + globalParams := make(map[string]interface{}) + globalParams["time"] = float64(1.0) + globalParams["inverseClock"] = float64(1.0) + for i, evset := range m.config.Eventsets { estr := eventsToEventStr(evset.Events) + params := make(map[string]interface{}) + params["time"] = float64(1.0) + params["inverseClock"] = float64(1.0) + for counter, _ := range evset.Events { + params[counter] = float64(1.0) + } + for _, metric := range evset.Metrics { + _, err := gval.Evaluate(metric.Calc, params, gval.Full()) + if err != nil { + cclog.ComponentError(m.name, 
"Calculation for metric", metric.Name, "failed:", err.Error()) + continue + } + if _, ok := globalParams[metric.Name]; !ok { + globalParams[metric.Name] = float64(1.0) + } + } cstr := C.CString(estr) gid := C.perfmon_addEventSet(cstr) if gid >= 0 { @@ -172,95 +286,173 @@ func (m *LikwidCollector) Init(config json.RawMessage) error { m.gmresults[tid] = make(map[string]float64) } } + for _, metric := range m.config.Metrics { + _, err := gval.Evaluate(metric.Calc, globalParams, gval.Full()) + if err != nil { + cclog.ComponentError(m.name, "Calculation for metric", metric.Name, "failed:", err.Error()) + continue + } + } if len(m.groups) == 0 { C.perfmon_finalize() C.topology_finalize() - return errors.New("No LIKWID performance group initialized") + err := errors.New("no LIKWID performance group initialized") + cclog.ComponentError(m.name, err.Error()) + return err } m.basefreq = getBaseFreq() m.init = true return nil } -func (m *LikwidCollector) Read(interval time.Duration, output chan lp.CCMetric) { - if !m.init { - return - } +func (m *LikwidCollector) takeMeasurement(group int, interval time.Duration) error { var ret C.int - - for i, gid := range m.groups { - evset := m.config.Eventsets[i] - ret = C.perfmon_setupCounters(gid) - if ret != 0 { - log.Print("Failed to setup performance group ", C.perfmon_getGroupName(gid)) - continue - } - ret = C.perfmon_startCounters() - if ret != 0 { - log.Print("Failed to start performance group ", C.perfmon_getGroupName(gid)) - continue - } - time.Sleep(interval) - ret = C.perfmon_stopCounters() - if ret != 0 { - log.Print("Failed to stop performance group ", C.perfmon_getGroupName(gid)) - continue - } - var eidx C.int - for tid := range m.cpulist { - for eidx = 0; int(eidx) < len(evset.Events); eidx++ { - ctr := C.perfmon_getCounterName(gid, eidx) - gctr := C.GoString(ctr) - res := C.perfmon_getLastResult(gid, eidx, C.int(tid)) - m.results[i][tid][gctr] = float64(res) - } - m.results[i][tid]["time"] = interval.Seconds() - 
m.results[i][tid]["inverseClock"] = float64(1.0 / m.basefreq) - for _, metric := range evset.Metrics { - expression, err := govaluate.NewEvaluableExpression(metric.Calc) - if err != nil { - log.Print(err.Error()) - continue - } - result, err := expression.Evaluate(m.results[i][tid]) - if err != nil { - log.Print(err.Error()) - continue - } - m.mresults[i][tid][metric.Name] = float64(result.(float64)) - } - } + gid := m.groups[group] + ret = C.perfmon_setupCounters(gid) + if ret != 0 { + gctr := C.GoString(C.perfmon_getGroupName(gid)) + err := fmt.Errorf("failed to setup performance group %s", gctr) + cclog.ComponentError(m.name, err.Error()) + return err } + ret = C.perfmon_startCounters() + if ret != 0 { + gctr := C.GoString(C.perfmon_getGroupName(gid)) + err := fmt.Errorf("failed to start performance group %s", gctr) + cclog.ComponentError(m.name, err.Error()) + return err + } + m.running = true + time.Sleep(interval) + m.running = false + ret = C.perfmon_stopCounters() + if ret != 0 { + gctr := C.GoString(C.perfmon_getGroupName(gid)) + err := fmt.Errorf("failed to stop performance group %s", gctr) + cclog.ComponentError(m.name, err.Error()) + return err + } + return nil +} - for _, metric := range m.config.Metrics { - for tid := range m.cpulist { - var params map[string]interface{} - expression, err := govaluate.NewEvaluableExpression(metric.Calc) +func (m *LikwidCollector) calcEventsetMetrics(group int, interval time.Duration) error { + var eidx C.int + evset := m.config.Eventsets[group] + gid := m.groups[group] + for tid := range m.cpulist { + for eidx = 0; int(eidx) < len(evset.Events); eidx++ { + ctr := C.perfmon_getCounterName(gid, eidx) + gctr := C.GoString(ctr) + res := C.perfmon_getLastResult(gid, eidx, C.int(tid)) + m.results[group][tid][gctr] = float64(res) + if m.results[group][tid][gctr] == 0 { + m.results[group][tid][gctr] = 1.0 + } + } + m.results[group][tid]["time"] = interval.Seconds() + m.results[group][tid]["inverseClock"] = float64(1.0 / 
m.basefreq) + for _, metric := range evset.Metrics { + value, err := gval.Evaluate(metric.Calc, m.results[group][tid], gval.Full()) if err != nil { - log.Print(err.Error()) + cclog.ComponentError(m.name, "Calculation for metric", metric.Name, "failed:", err.Error()) continue } - params = make(map[string]interface{}) + m.mresults[group][tid][metric.Name] = value.(float64) + } + } + return nil +} + +func (m *LikwidCollector) calcGlobalMetrics(interval time.Duration) error { + for _, metric := range m.config.Metrics { + for tid := range m.cpulist { + params := make(map[string]interface{}) for j := range m.groups { for mname, mres := range m.mresults[j][tid] { params[mname] = mres } } - result, err := expression.Evaluate(params) + value, err := gval.Evaluate(metric.Calc, params, gval.Full()) if err != nil { - log.Print(err.Error()) + cclog.ComponentError(m.name, "Calculation for metric", metric.Name, "failed:", err.Error()) continue } - m.gmresults[tid][metric.Name] = float64(result.(float64)) + m.gmresults[tid][metric.Name] = value.(float64) } } + return nil +} + +// func (m *LikwidCollector) calcResultMetrics(interval time.Duration) ([]lp.CCMetric, error) { +// var err error = nil +// metrics := make([]lp.CCMetric, 0) +// for i := range m.groups { +// evset := m.config.Eventsets[i] +// for _, metric := range evset.Metrics { +// log.Print(metric.Name, " ", metric.Scope, " ", metric.granulatity) +// if metric.Scope.Granularity() > metric.granulatity.Granularity() { +// log.Print("Different granularity wanted for ", metric.Name, ": ", metric.Scope, " vs ", metric.granulatity) +// var idlist []int +// idfunc := func(cpuid int) int { return cpuid } +// switch metric.Scope { +// case "socket": +// idlist = topo.SocketList() +// idfunc = topo.GetCpuSocket +// case "numa": +// idlist = topo.NumaNodeList() +// idfunc = topo.GetCpuNumaDomain +// case "core": +// idlist = topo.CoreList() +// idfunc = topo.GetCpuCore +// case "die": +// idlist = topo.DieList() +// idfunc = 
topo.GetCpuDie +// case "node": +// idlist = topo.CpuList() +// } +// for i := 0; i < num_results; i++ { + +// } +// } +// } +// } +// for _, metric := range m.config.Metrics { +// log.Print(metric.Name, " ", metric.Scope, " ", metric.granulatity) +// if metric.Scope.Granularity() > metric.granulatity.Granularity() { +// log.Print("Different granularity wanted for ", metric.Name, ": ", metric.Scope, " vs ", metric.granulatity) +// } +// } +// return metrics, err +// } + +func (m *LikwidCollector) Read(interval time.Duration, output chan lp.CCMetric) { + if !m.init { + return + } + defer m.CatchGvalPanic() + + for i, _ := range m.groups { + // measure event set 'i' for 'interval' seconds + err := m.takeMeasurement(i, interval) + if err != nil { + cclog.ComponentError(m.name, err.Error()) + continue + } + m.calcEventsetMetrics(i, interval) + } + + m.calcGlobalMetrics(interval) + + //metrics, err = m.calcResultMetrics(interval) + for i := range m.groups { evset := m.config.Eventsets[i] for _, metric := range evset.Metrics { + _, skip := stringArrayContains(m.config.ExcludeMetrics, metric.Name) if metric.Publish && !skip { - if metric.Scope.String() == "socket" { + if metric.Scope == "socket" { for sid, tid := range m.sock2tid { y, err := lp.New(metric.Name, map[string]string{"type": "socket", @@ -272,7 +464,7 @@ func (m *LikwidCollector) Read(interval time.Duration, output chan lp.CCMetric) output <- y } } - } else if metric.Scope.String() == "hwthread" { + } else if metric.Scope == "hwthread" { for tid, cpu := range m.cpulist { y, err := lp.New(metric.Name, map[string]string{"type": "cpu", @@ -291,7 +483,7 @@ func (m *LikwidCollector) Read(interval time.Duration, output chan lp.CCMetric) for _, metric := range m.config.Metrics { _, skip := stringArrayContains(m.config.ExcludeMetrics, metric.Name) if metric.Publish && !skip { - if metric.Scope.String() == "socket" { + if metric.Scope == "socket" { for sid, tid := range m.sock2tid { y, err := lp.New(metric.Name, 
map[string]string{"type": "socket", @@ -303,7 +495,7 @@ func (m *LikwidCollector) Read(interval time.Duration, output chan lp.CCMetric) output <- y } } - } else { + } else if metric.Scope == "hwthread" { for tid, cpu := range m.cpulist { y, err := lp.New(metric.Name, map[string]string{"type": "cpu", @@ -322,8 +514,16 @@ func (m *LikwidCollector) Read(interval time.Duration, output chan lp.CCMetric) func (m *LikwidCollector) Close() { if m.init { + cclog.ComponentDebug(m.name, "Closing ...") m.init = false + if m.running { + cclog.ComponentDebug(m.name, "Stopping counters") + C.perfmon_stopCounters() + } + cclog.ComponentDebug(m.name, "Finalize LIKWID perfmon module") C.perfmon_finalize() + cclog.ComponentDebug(m.name, "Finalize LIKWID topology module") C.topology_finalize() + cclog.ComponentDebug(m.name, "Closing done") } } From 8319d3de43f5f597ad13afe06a0db40a7360ea8a Mon Sep 17 00:00:00 2001 From: Thomas Roehl Date: Tue, 1 Feb 2022 18:26:54 +0100 Subject: [PATCH 067/174] Add some more helper functions to ccTopology --- internal/ccTopology/ccTopology.go | 141 +++++++++++++++++++++++++++--- 1 file changed, 129 insertions(+), 12 deletions(-) diff --git a/internal/ccTopology/ccTopology.go b/internal/ccTopology/ccTopology.go index 8d53b05..030b2f7 100644 --- a/internal/ccTopology/ccTopology.go +++ b/internal/ccTopology/ccTopology.go @@ -24,17 +24,23 @@ func intArrayContains(array []int, str int) (int, bool) { return -1, false } -// stringArrayContains scans an array of strings if the value str is present in the array -// If the specified value is found, the corresponding array index is returned. 
-// The bool value is used to signal success or failure -// func stringArrayContains(array []string, str string) (int, bool) { -// for i, a := range array { -// if a == str { -// return i, true -// } -// } -// return -1, false -// } +func fileToInt(path string) int { + buffer, err := ioutil.ReadFile(path) + if err != nil { + log.Print(err) + cclogger.ComponentError("ccTopology", "Reading", path, ":", err.Error()) + return -1 + } + sbuffer := strings.Replace(string(buffer), "\n", "", -1) + var id int64 + //_, err = fmt.Scanf("%d", sbuffer, &id) + id, err = strconv.ParseInt(sbuffer, 10, 32) + if err != nil { + cclogger.ComponentError("ccTopology", "Parsing", path, ":", sbuffer, err.Error()) + return -1 + } + return int(id) +} func SocketList() []int { buffer, err := ioutil.ReadFile("/proc/cpuinfo") @@ -68,7 +74,7 @@ func CpuList() []int { return nil } ll := strings.Split(string(buffer), "\n") - var cpulist []int + cpulist := make([]int, 0) for _, line := range ll { if strings.HasPrefix(line, "processor") { lv := strings.Fields(line) @@ -86,6 +92,67 @@ func CpuList() []int { return cpulist } +func CoreList() []int { + buffer, err := ioutil.ReadFile("/proc/cpuinfo") + if err != nil { + log.Print(err) + return nil + } + ll := strings.Split(string(buffer), "\n") + corelist := make([]int, 0) + for _, line := range ll { + if strings.HasPrefix(line, "core id") { + lv := strings.Fields(line) + id, err := strconv.ParseInt(lv[3], 10, 32) + if err != nil { + log.Print(err) + return corelist + } + _, found := intArrayContains(corelist, int(id)) + if !found { + corelist = append(corelist, int(id)) + } + } + } + return corelist +} + +func NumaNodeList() []int { + numalist := make([]int, 0) + files, err := filepath.Glob("/sys/devices/system/node/node*") + if err != nil { + log.Print(err) + } + for _, f := range files { + finfo, err := os.Lstat(f) + if err == nil && (finfo.IsDir() || finfo.Mode()&os.ModeSymlink != 0) { + var id int + parts := strings.Split(f, "/") + _, err = 
fmt.Scanf("node%d", parts[len(parts)-1], &id) + if err == nil { + _, found := intArrayContains(numalist, int(id)) + if !found { + numalist = append(numalist, int(id)) + } + } + } + } + return numalist +} + +func DieList() []int { + cpulist := CpuList() + dielist := make([]int, 0) + for _, c := range cpulist { + dieid := fileToInt(fmt.Sprintf("/sys/devices/system/cpu/cpu%d/topology/die_id", c)) + _, found := intArrayContains(dielist, int(dieid)) + if !found { + dielist = append(dielist, int(dieid)) + } + } + return dielist +} + type CpuEntry struct { Cpuid int SMT int @@ -203,6 +270,7 @@ type CpuInformation struct { SMTWidth int NumSockets int NumDies int + NumCores int NumNumaDomains int } @@ -213,6 +281,7 @@ func CpuInfo() CpuInformation { numa := 0 die := 0 socket := 0 + core := 0 cdata := CpuData() for _, d := range cdata { if d.SMT > smt { @@ -227,10 +296,14 @@ func CpuInfo() CpuInformation { if d.Socket > socket { socket = d.Socket } + if d.Core > core { + core = d.Core + } } c.NumNumaDomains = numa + 1 c.SMTWidth = smt + 1 c.NumDies = die + 1 + c.NumCores = core + 1 c.NumSockets = socket + 1 c.NumHWthreads = len(cdata) return c @@ -275,3 +348,47 @@ func GetCpuCore(cpuid int) int { } return -1 } + +func GetSocketCpus(socket int) []int { + all := CpuData() + cpulist := make([]int, 0) + for _, d := range all { + if d.Socket == socket { + cpulist = append(cpulist, d.Cpuid) + } + } + return cpulist +} + +func GetNumaDomainCpus(domain int) []int { + all := CpuData() + cpulist := make([]int, 0) + for _, d := range all { + if d.Numadomain == domain { + cpulist = append(cpulist, d.Cpuid) + } + } + return cpulist +} + +func GetDieCpus(die int) []int { + all := CpuData() + cpulist := make([]int, 0) + for _, d := range all { + if d.Die == die { + cpulist = append(cpulist, d.Cpuid) + } + } + return cpulist +} + +func GetCoreCpus(core int) []int { + all := CpuData() + cpulist := make([]int, 0) + for _, d := range all { + if d.Core == core { + cpulist = append(cpulist, 
d.Cpuid) + } + } + return cpulist +} From 64a12b80bb1e860c12835a6925319c300f8b251a Mon Sep 17 00:00:00 2001 From: Thomas Roehl Date: Tue, 1 Feb 2022 18:27:16 +0100 Subject: [PATCH 068/174] Add and export SetName() function for CCMetric --- internal/ccMetric/ccMetric.go | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/internal/ccMetric/ccMetric.go b/internal/ccMetric/ccMetric.go index 20b9786..9745e9d 100644 --- a/internal/ccMetric/ccMetric.go +++ b/internal/ccMetric/ccMetric.go @@ -25,6 +25,7 @@ type ccMetric struct { type CCMetric interface { lp.Metric // Time(), Name(), TagList(), FieldList() + SetName(name string) SetTime(t time.Time) Meta() map[string]string // Map of meta data tags @@ -68,6 +69,10 @@ func (m *ccMetric) Name() string { return m.name } +func (m *ccMetric) SetName(name string) { + m.name = name +} + // Tags returns the the list of tags as key-value-mapping func (m *ccMetric) Tags() map[string]string { return m.tags From a4bd1417861dc3556f4fb1cb5c25a398295627c1 Mon Sep 17 00:00:00 2001 From: Thomas Roehl Date: Tue, 1 Feb 2022 18:27:59 +0100 Subject: [PATCH 069/174] Use the MetricAggregator for all calculations in the MetricRouter --- internal/metricRouter/metricAggregator.go | 82 ++++++++++++++++++++++- internal/metricRouter/metricRouter.go | 76 +++++++++++++-------- 2 files changed, 130 insertions(+), 28 deletions(-) diff --git a/internal/metricRouter/metricAggregator.go b/internal/metricRouter/metricAggregator.go index 41c5276..e3303e4 100644 --- a/internal/metricRouter/metricAggregator.go +++ b/internal/metricRouter/metricAggregator.go @@ -3,6 +3,7 @@ package metricRouter import ( "context" "fmt" + "math" "os" "strings" "time" @@ -84,7 +85,7 @@ func (c *metricAggregator) Init(output chan lp.CCMetric) error { c.constants["smtWidth"] = cinfo.SMTWidth c.language = gval.NewLanguage( - gval.Base(), + gval.Full(), metricCacheLanguage, ) @@ -281,6 +282,85 @@ func (c *metricAggregator) AddFunction(name string, function func(args ...interf 
c.language = gval.NewLanguage(c.language, gval.Function(name, function)) } +func EvalBoolCondition(condition string, params map[string]interface{}) (bool, error) { + newcond := strings.ReplaceAll(condition, "'", "\"") + newcond = strings.ReplaceAll(newcond, "%", "\\") + language := gval.NewLanguage( + gval.Full(), + metricCacheLanguage, + ) + value, err := gval.Evaluate(newcond, params, language) + if err != nil { + return false, err + } + var endResult bool = false + err = nil + switch r := value.(type) { + case bool: + endResult = r + case float64: + if r != 0.0 { + endResult = true + } + case float32: + if r != 0.0 { + endResult = true + } + case int: + if r != 0 { + endResult = true + } + case int64: + if r != 0 { + endResult = true + } + case int32: + if r != 0 { + endResult = true + } + default: + err = fmt.Errorf("cannot evaluate '%s' to bool", newcond) + } + return endResult, err +} + +func EvalFloat64Condition(condition string, params map[string]interface{}) (float64, error) { + var endResult float64 = math.NaN() + newcond := strings.ReplaceAll(condition, "'", "\"") + newcond = strings.ReplaceAll(newcond, "%", "\\") + language := gval.NewLanguage( + gval.Full(), + metricCacheLanguage, + ) + value, err := gval.Evaluate(newcond, params, language) + if err != nil { + cclog.ComponentDebug("MetricRouter", condition, " = ", err.Error()) + return endResult, err + } + err = nil + switch r := value.(type) { + case bool: + if r { + endResult = 1.0 + } else { + endResult = 0.0 + } + case float64: + endResult = r + case float32: + endResult = float64(r) + case int: + endResult = float64(r) + case int64: + endResult = float64(r) + case int32: + endResult = float64(r) + default: + err = fmt.Errorf("cannot evaluate '%s' to float64", newcond) + } + return endResult, err +} + func NewAggregator(output chan lp.CCMetric) (MetricAggregator, error) { a := new(metricAggregator) err := a.Init(output) diff --git a/internal/metricRouter/metricRouter.go 
b/internal/metricRouter/metricRouter.go index 83c14e7..8ec7e06 100644 --- a/internal/metricRouter/metricRouter.go +++ b/internal/metricRouter/metricRouter.go @@ -11,7 +11,6 @@ import ( lp "github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric" mct "github.com/ClusterCockpit/cc-metric-collector/internal/multiChanTicker" - "gopkg.in/Knetic/govaluate.v2" ) // Metric router tag configuration @@ -26,8 +25,12 @@ type metricRouterConfig struct { AddTags []metricRouterTagConfig `json:"add_tags"` // List of tags that are added when the condition is met DelTags []metricRouterTagConfig `json:"delete_tags"` // List of tags that are removed when the condition is met IntervalAgg []metricAggregatorIntervalConfig `json:"interval_aggregates"` // List of aggregation function processed at the end of an interval + DropMetrics []string `json:"drop_metrics"` // List of metric names to drop. For fine-grained dropping use drop_metrics_if + DropMetricsIf []string `json:"drop_metrics_if"` // List of evaluatable terms to drop metrics + RenameMetrics map[string]string `json:"rename_metrics"` // Map to rename metric name from key to value IntervalStamp bool `json:"interval_timestamp"` // Update timestamp periodically by ticker each interval? 
NumCacheIntervals int `json:"num_cache_intervals"` // Number of intervals of cached metrics for evaluation + dropMetrics map[string]bool // Internal map for O(1) lookup } // Metric router data structure @@ -104,6 +107,10 @@ func (r *metricRouter) Init(ticker mct.MultiChanTicker, wg *sync.WaitGroup, rout for _, agg := range r.config.IntervalAgg { r.cache.AddAggregation(agg.Name, agg.Function, agg.Condition, agg.Tags, agg.Meta) } + r.config.dropMetrics = make(map[string]bool) + for _, mname := range r.config.DropMetrics { + r.config.dropMetrics[mname] = true + } return nil } @@ -130,16 +137,9 @@ func (r *metricRouter) StartTimer() { cclog.ComponentDebug("MetricRouter", "TIMER START") } -// EvalCondition evaluates condition cond for metric data from point -func (r *metricRouter) EvalCondition(cond string, point lp.CCMetric) (bool, error) { - expression, err := govaluate.NewEvaluableExpression(cond) - if err != nil { - cclog.ComponentDebug("MetricRouter", cond, " = ", err.Error()) - return false, err - } - - // Add metric name, tags, meta data, fields and timestamp to the parameter list +func getParamMap(point lp.CCMetric) map[string]interface{} { params := make(map[string]interface{}) + params["metric"] = point params["name"] = point.Name() for key, value := range point.Tags() { params[key] = value @@ -151,26 +151,19 @@ func (r *metricRouter) EvalCondition(cond string, point lp.CCMetric) (bool, erro params[f.Key] = f.Value } params["timestamp"] = point.Time() - - // evaluate condition - result, err := expression.Evaluate(params) - if err != nil { - cclog.ComponentDebug("MetricRouter", cond, " = ", err.Error()) - return false, err - } - return bool(result.(bool)), err + return params } // DoAddTags adds a tag when condition is fullfiled func (r *metricRouter) DoAddTags(point lp.CCMetric) { for _, m := range r.config.AddTags { - var conditionMatches bool + var conditionMatches bool = false if m.Condition == "*" { conditionMatches = true } else { var err error - 
conditionMatches, err = r.EvalCondition(m.Condition, point) + conditionMatches, err = EvalBoolCondition(m.Condition, getParamMap(point)) if err != nil { cclog.ComponentError("MetricRouter", err.Error()) conditionMatches = false @@ -185,13 +178,13 @@ func (r *metricRouter) DoAddTags(point lp.CCMetric) { // DoDelTags removes a tag when condition is fullfiled func (r *metricRouter) DoDelTags(point lp.CCMetric) { for _, m := range r.config.DelTags { - var conditionMatches bool + var conditionMatches bool = false if m.Condition == "*" { conditionMatches = true } else { var err error - conditionMatches, err = r.EvalCondition(m.Condition, point) + conditionMatches, err = EvalBoolCondition(m.Condition, getParamMap(point)) if err != nil { cclog.ComponentError("MetricRouter", err.Error()) conditionMatches = false @@ -203,9 +196,24 @@ func (r *metricRouter) DoDelTags(point lp.CCMetric) { } } +// Conditional test whether a metric should be dropped +func (r *metricRouter) dropMetric(point lp.CCMetric) bool { + // Simple drop check + if _, ok := r.config.dropMetrics[point.Name()]; ok { + return true + } + // Checking the dropping conditions + for _, m := range r.config.DropMetricsIf { + conditionMatches, err := EvalBoolCondition(m, getParamMap(point)) + if conditionMatches || err != nil { + return true + } + } + return false +} + // Start starts the metric router func (r *metricRouter) Start() { - // start timer if configured r.timestamp = time.Now() if r.config.IntervalStamp { @@ -224,6 +232,12 @@ func (r *metricRouter) Start() { cclog.ComponentDebug("MetricRouter", "FORWARD", point) r.DoAddTags(point) r.DoDelTags(point) + if new, ok := r.config.RenameMetrics[point.Name()]; ok { + point.SetName(new) + } + r.DoAddTags(point) + r.DoDelTags(point) + for _, o := range r.outputs { o <- point } @@ -247,7 +261,11 @@ func (r *metricRouter) Start() { if r.config.IntervalStamp { p.SetTime(r.timestamp) } - forward(p) + if !r.dropMetric(p) { + forward(p) + } + // even if the metric is 
dropped, it is stored in the cache for + // aggregations r.cache.Add(p) case p := <-r.recv_input: @@ -255,12 +273,16 @@ func (r *metricRouter) Start() { if r.config.IntervalStamp { p.SetTime(r.timestamp) } - forward(p) + if !r.dropMetric(p) { + forward(p) + } case p := <-r.cache_input: // receive from metric collector - p.AddTag("hostname", r.hostname) - forward(p) + if !r.dropMetric(p) { + p.AddTag("hostname", r.hostname) + forward(p) + } } } }() From af8654d3250e908078f60f6dcf5f25b1a905fb30 Mon Sep 17 00:00:00 2001 From: Thomas Roehl Date: Tue, 1 Feb 2022 18:28:20 +0100 Subject: [PATCH 070/174] Update MetricRouter README --- internal/metricRouter/README.md | 168 +++++++++++++++++++++++++++++++- 1 file changed, 163 insertions(+), 5 deletions(-) diff --git a/internal/metricRouter/README.md b/internal/metricRouter/README.md index a3aef16..17f336c 100644 --- a/internal/metricRouter/README.md +++ b/internal/metricRouter/README.md @@ -6,6 +6,7 @@ The CCMetric router sits in between the collectors and the sinks and can be used ```json { + "interval_timestamp" : true, "add_tags" : [ { "key" : "cluster", @@ -25,16 +26,52 @@ The CCMetric router sits in between the collectors and the sinks and can be used "if" : "*" } ], - "interval_timestamp" : true + "interval_aggregates" : [ + { + "name" : "temp_cores_avg", + "if" : "match('temp_core_%d+', metric.Name())", + "function" : "avg(values)", + "tags" : { + "type" : "node" + }, + "meta" : { + "group": "IPMI", + "unit": "degC", + "source": "TempCollector" + } + } + ], + "drop_metrics" : [ + "not_interesting_metric_at_all" + ], + "drop_metrics_if" : [ + "match('temp_core_%d+', metric.Name())" + ], + "rename_metrics" : { + "metric_12345" : "mymetric" + } } ``` There are three main options `add_tags`, `delete_tags` and `interval_timestamp`. `add_tags` and `delete_tags` are lists consisting of dicts with `key`, `value` and `if`. The `value` can be omitted in the `delete_tags` part as it only uses the `key` for removal. 
The `interval_timestamp` setting means that a unique timestamp is applied to all metrics traversing the router during an interval. +# The `interval_timestamp` option -# Conditional manipulation of tags +The collectors' `Read()` functions are not called simultaneously and therefore the metrics gathered in an interval can have different timestamps. If you want to avoid that and have a common timestamp (the beginning of the interval), set this option to `true` and the MetricRouter sets the time. -The `if` setting allows conditional testing of a single metric like in the example: +# The `rename_metrics` option +In the ClusterCockpit world we specified a set of standard metrics. Since some collectors determine the metric names based on files, execuables and libraries, they might change from system to system (or installation to installtion, OS to OS, ...). In order to get the common names, you can rename incoming metrics before sending them to the sink. If the metric name matches the `oldname`, it is changed to `newname` + +```json +{ + "oldname" : "newname", + "clock_mhz" : "clock" +} +``` + +# Conditional manipulation of tags (`add_tags` and `del_tags`) + +Common config format: ```json { "key" : "test", @@ -43,8 +80,129 @@ The `if` setting allows conditional testing of a single metric like in the examp } ``` -If the CCMetric name is equal to 'temp_package_id_0', it adds an additional tag `test=testing` to the metric. +## The `del_tags` option -In order to match all metrics, you can use `*`, so in order to add a flag per default, like the `cluster=testcluster` tag in the example. +The collectors are free to add whatever `key=value` pair to the metric tags (although the usage of tags should be minimized). If you want to delete a tag afterwards, you can do that. When the `if` condition matches on a metric, the `key` is removed from the metric's tags. + +If you want to remove a tag for all metrics, use the condition wildcard `*`. 
The `value` field can be omitted in the `del_tags` case. + +Never delete tags: +- `hostname` +- `type` +- `type-id` + +## The `add_tags` option + +In some cases, metrics should be tagged or an existing tag changed based on some condition. This can be done in the `add_tags` section. When the `if` condition evaluates to `true`, the tag `key` is added or gets changed to the new `value`. + +If the CCMetric name is equal to `temp_package_id_0`, it adds an additional tag `test=testing` to the metric. + +For this metric, a more useful example would be: + +```json +[ + { + "key" : "type", + "value" : "socket", + "if" : "name == 'temp_package_id_0'" + }, + { + "key" : "type-id", + "value" : "0", + "if" : "name == 'temp_package_id_0'" + }, +] +``` + +The metric `temp_package_id_0` corresponds to the tempature of the first CPU socket (=package). With the above configuration, the tags would reflect that because commonly the [TempCollector](../../collectors/tempMetric.md) submits only `node` metrics. + +In order to match all metrics, you can use `*`, so in order to add a flag per default. This is useful to attached system-specific tags like `cluster=testcluster`: + +```json +{ + "key" : "cluster", + "value" : "testcluster", + "if" : "*" +} +``` + +# Dropping metrics + +In some cases, you want to drop a metric and don't get it forwarded to the sinks. There are two options based on the required specification: +- Based only on the metric name -> `drop_metrics` section +- An evaluable condition with more overhead -> `drop_metrics_if` section + +## The `drop_metrics` section + +The argument is a list of metric names. No futher checks are performed, only a comparison of the metric name + +```json +{ + "drop_metrics" : [ + "drop_metric_1", + "drop_metric_2" + ] +} +``` + +The example drops all metrics with the name `drop_metric_1` and `drop_metric_2`. 
+ +## The `drop_metrics_if` section + +This option takes a list of evaluable conditions and performs them one after the other on **all** metrics incoming from the collectors and the metric cache (aka `interval_aggregates`). + +```json +{ + "drop_metrics_if" : [ + "match('drop_metric_%d+', name)", + "match('cpu', type) && type-id == 0" + ] +} +``` +The first line is comparable with the example in `drop_metrics`, it drops all metrics starting with `drop_metric_` and ending with a number. The second line drops all metrics of the first hardware thread (**not** recommended) +# Aggregate metric values of the current interval with the `interval_aggregates` option + +In some cases, you need to derive new metrics based on the metrics arriving during an interval. This can be done in the `interval_aggregates` section. The logic is similar to the other metric manipulation and filtering options. A cache stores all metrics that arrive during an interval. At the beginning of the *next* interval, the list of metrics is submitted to the MetricAggregator. It derives new metrics and submits them back to the MetricRouter, so they are sent in the next interval but have the timestamp of the previous interval beginning. + +```json +"interval_aggregates" : [ + { + "name" : "new_metric_name", + "if" : "match('sub_metric_%d+', metric.Name())", + "function" : "avg(values)", + "tags" : { + "key" : "value", + "type" : "node" + }, + "meta" : { + "key" : "value", + "group": "IPMI", + "unit": "", + } + } +] +``` + +The above configuration, collects all metric values for metrics evaluating `if` to `true`. Afterwards it calculates the average `avg` of the `values` (list of all metrics' field `value`) and creates a new CCMetric with the name `new_metric_name` and adds the tags in `tags` and the meta information in `meta`. The special value `` searches the input metrics and copies the value of the first match of `key` to the new CCMetric. 
+ +If you are not interested in the input metrics `sub_metric_%d+` at all, you can add the same condition used here to the `drop_metrics_if` section to drop them. + +Use cases for `interval_aggregates`: +- Combine multiple metrics of the a collector to a new one like the [MemstatCollector](../../collectors/memstatMetric.go) does it for `mem_used`)): +```json + { + "name" : "mem_used", + "if" : "source == 'MemstatCollector'", + "function" : "sum(mem_total) - (sum(mem_free) + sum(mem_buffers) + sum(mem_cached))", + "tags" : { + "type" : "node" + }, + "meta" : { + "group": "", + "unit": "", + "source": "" + } + } +``` From ed62e952ce98dab0dc8c9deb2567b3ab5534b155 Mon Sep 17 00:00:00 2001 From: Thomas Roehl Date: Wed, 2 Feb 2022 14:52:07 +0100 Subject: [PATCH 071/174] Use MetricAggregator to calculate metrics in LIKWID collector. --- collectors/likwidMetric.go | 401 ++++++++++++++++++------------------- 1 file changed, 199 insertions(+), 202 deletions(-) diff --git a/collectors/likwidMetric.go b/collectors/likwidMetric.go index e3be810..82e241d 100644 --- a/collectors/likwidMetric.go +++ b/collectors/likwidMetric.go @@ -24,7 +24,7 @@ import ( cclog "github.com/ClusterCockpit/cc-metric-collector/internal/ccLogger" lp "github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric" topo "github.com/ClusterCockpit/cc-metric-collector/internal/ccTopology" - "github.com/PaesslerAG/gval" + mr "github.com/ClusterCockpit/cc-metric-collector/internal/metricRouter" ) type MetricScope string @@ -43,16 +43,32 @@ func (ms MetricScope) String() string { return string(ms) } +func (ms MetricScope) Likwid() string { + LikwidDomains := map[string]string{ + "hwthread": "", + "core": "", + "llc": "C", + "numadomain": "M", + "die": "D", + "socket": "S", + "node": "N", + } + return LikwidDomains[string(ms)] +} + func (ms MetricScope) Granularity() int { - grans := []string{"hwthread", "core", "llc", "numadomain", "die", "socket", "node"} - for i, g := range grans { - if ms.String() == g { 
+ for i, g := range GetAllMetricScopes() { + if ms == g { return i } } return -1 } +func GetAllMetricScopes() []MetricScope { + return []MetricScope{"hwthread" /*, "core", "llc", "numadomain", "die",*/, "socket", "node"} +} + type LikwidCollectorMetricConfig struct { Name string `json:"name"` // Name of the metric Calc string `json:"calc"` // Calculation for the metric using @@ -77,16 +93,18 @@ type LikwidCollectorConfig struct { type LikwidCollector struct { metricCollector - cpulist []C.int - sock2tid map[int]int - metrics map[C.int]map[string]int - groups []C.int - config LikwidCollectorConfig - results map[int]map[int]map[string]interface{} - mresults map[int]map[int]map[string]float64 - gmresults map[int]map[string]float64 - basefreq float64 - running bool + cpulist []C.int + cpu2tid map[int]int + sock2tid map[int]int + scopeRespTids map[MetricScope]map[int]int + metrics map[C.int]map[string]int + groups []C.int + config LikwidCollectorConfig + results map[int]map[int]map[string]interface{} + mresults map[int]map[int]map[string]float64 + gmresults map[int]map[string]float64 + basefreq float64 + running bool } type LikwidMetric struct { @@ -138,28 +156,8 @@ func getBaseFreq() float64 { return freq } -func getSocketCpus() map[C.int]int { - slist := SocketList() - var cpu C.int - outmap := make(map[C.int]int) - for _, s := range slist { - t := C.CString(fmt.Sprintf("S%d", s)) - clen := C.cpustr_to_cpulist(t, &cpu, 1) - if int(clen) == 1 { - outmap[cpu] = s - } - } - return outmap -} - -func (m *LikwidCollector) CatchGvalPanic() { - if rerr := recover(); rerr != nil { - cclog.ComponentError(m.name, "Gval failed to calculate a metric", rerr) - m.init = false - } -} - func (m *LikwidCollector) initGranularity() { + splitRegex := regexp.MustCompile("[+-/*()]") for _, evset := range m.config.Eventsets { evset.granulatity = make(map[string]MetricScope) for counter, event := range evset.Events { @@ -169,7 +167,7 @@ func (m *LikwidCollector) initGranularity() { } } for 
i, metric := range evset.Metrics { - s := regexp.MustCompile("[+-/*()]").Split(metric.Calc, -1) + s := splitRegex.Split(metric.Calc, -1) gran := MetricScope("hwthread") evset.Metrics[i].granulatity = gran for _, x := range s { @@ -183,7 +181,7 @@ func (m *LikwidCollector) initGranularity() { } } for i, metric := range m.config.Metrics { - s := regexp.MustCompile("[+-/*()]").Split(metric.Calc, -1) + s := splitRegex.Split(metric.Calc, -1) gran := MetricScope("hwthread") m.config.Metrics[i].granulatity = gran for _, x := range s { @@ -199,6 +197,59 @@ func (m *LikwidCollector) initGranularity() { } } +type TopoResolveFunc func(cpuid int) int + +func (m *LikwidCollector) getResponsiblities() map[MetricScope]map[int]int { + get_cpus := func(scope MetricScope) map[int]int { + var slist []int + var cpu C.int + var input func(index int) string + switch scope { + case "node": + slist = []int{0} + input = func(index int) string { return "N:0" } + case "socket": + input = func(index int) string { return fmt.Sprintf("%s%d:0", scope.Likwid(), index) } + slist = topo.SocketList() + // case "numadomain": + // input = func(index int) string { return fmt.Sprintf("%s%d:0", scope.Likwid(), index) } + // slist = topo.NumaNodeList() + // cclog.Debug(scope, " ", input(0), " ", slist) + // case "die": + // input = func(index int) string { return fmt.Sprintf("%s%d:0", scope.Likwid(), index) } + // slist = topo.DieList() + // case "llc": + // input = fmt.Sprintf("%s%d:0", scope.Likwid(), s) + // slist = topo.LLCacheList() + case "hwthread": + input = func(index int) string { return fmt.Sprintf("%d", index) } + slist = topo.CpuList() + } + outmap := make(map[int]int) + for _, s := range slist { + t := C.CString(input(s)) + clen := C.cpustr_to_cpulist(t, &cpu, 1) + if int(clen) == 1 { + outmap[s] = m.cpu2tid[int(cpu)] + } else { + cclog.Error(fmt.Sprintf("Cannot determine responsible CPU for %s", input(s))) + outmap[s] = -1 + } + C.free(unsafe.Pointer(t)) + } + return outmap + } + + scopes 
:= GetAllMetricScopes() + complete := make(map[MetricScope]map[int]int) + for _, s := range scopes { + cclog.Debug("Start ", s) + complete[s] = get_cpus(s) + cclog.Debug("End ", s) + } + return complete +} + func (m *LikwidCollector) Init(config json.RawMessage) error { var ret C.int m.name = "LikwidCollector" @@ -208,40 +259,39 @@ func (m *LikwidCollector) Init(config json.RawMessage) error { return err } } - m.initGranularity() if m.config.ForceOverwrite { + cclog.ComponentDebug(m.name, "Set LIKWID_FORCE=1") os.Setenv("LIKWID_FORCE", "1") } m.setup() - // in some cases, gval causes a panic. We catch it with the handler and deactivate - // the collector (m.init = false). - defer m.CatchGvalPanic() m.meta = map[string]string{"source": m.name, "group": "PerfCounter"} + cclog.ComponentDebug(m.name, "Get cpulist and init maps and lists") cpulist := topo.CpuList() m.cpulist = make([]C.int, len(cpulist)) - - cclog.ComponentDebug(m.name, "Create maps for socket, numa, core and die metrics") - m.sock2tid = make(map[int]int) - // m.numa2tid = make(map[int]int) - // m.core2tid = make(map[int]int) - // m.die2tid = make(map[int]int) + m.cpu2tid = make(map[int]int) for i, c := range cpulist { m.cpulist[i] = C.int(c) - m.sock2tid[topo.GetCpuSocket(c)] = i - // m.numa2tid[topo.GetCpuNumaDomain(c)] = i - // m.core2tid[topo.GetCpuCore(c)] = i - // m.die2tid[topo.GetCpuDie(c)] = i + m.cpu2tid[c] = i + } m.results = make(map[int]map[int]map[string]interface{}) m.mresults = make(map[int]map[int]map[string]float64) m.gmresults = make(map[int]map[string]float64) + cclog.ComponentDebug(m.name, "initialize LIKWID topology") ret = C.topology_init() if ret != 0 { err := errors.New("failed to initialize LIKWID topology") cclog.ComponentError(m.name, err.Error()) return err } + + // Determine which counter works at which level. PMC*: hwthread, *BOX*: socket, ... 
+ m.initGranularity() + // Generate map for MetricScope -> scope_id (like socket id) -> responsible id (offset in cpulist) + m.scopeRespTids = m.getResponsiblities() + + cclog.ComponentDebug(m.name, "initialize LIKWID perfmon module") ret = C.perfmon_init(C.int(len(m.cpulist)), &m.cpulist[0]) if ret != 0 { C.topology_finalize() @@ -250,28 +300,33 @@ func (m *LikwidCollector) Init(config json.RawMessage) error { return err } + // This is for the global metrics computation test globalParams := make(map[string]interface{}) globalParams["time"] = float64(1.0) globalParams["inverseClock"] = float64(1.0) - + // While adding the events, we test the metrics whether they can be computed at all for i, evset := range m.config.Eventsets { estr := eventsToEventStr(evset.Events) + // Generate parameter list for the metric computing test params := make(map[string]interface{}) params["time"] = float64(1.0) params["inverseClock"] = float64(1.0) - for counter, _ := range evset.Events { + for counter := range evset.Events { params[counter] = float64(1.0) } for _, metric := range evset.Metrics { - _, err := gval.Evaluate(metric.Calc, params, gval.Full()) + // Try to evaluate the metric + _, err := mr.EvalFloat64Condition(metric.Calc, params) if err != nil { cclog.ComponentError(m.name, "Calculation for metric", metric.Name, "failed:", err.Error()) continue } + // If the metric is not in the parameter list for the global metrics, add it if _, ok := globalParams[metric.Name]; !ok { globalParams[metric.Name] = float64(1.0) } } + // Now we add the list of events to likwid cstr := C.CString(estr) gid := C.perfmon_addEventSet(cstr) if gid >= 0 { @@ -283,17 +338,21 @@ func (m *LikwidCollector) Init(config json.RawMessage) error { for tid := range m.cpulist { m.results[i][tid] = make(map[string]interface{}) m.mresults[i][tid] = make(map[string]float64) - m.gmresults[tid] = make(map[string]float64) + if i == 0 { + m.gmresults[tid] = make(map[string]float64) + } } } for _, metric := range 
m.config.Metrics { - _, err := gval.Evaluate(metric.Calc, globalParams, gval.Full()) + // Try to evaluate the global metric + _, err := mr.EvalFloat64Condition(metric.Calc, globalParams) if err != nil { cclog.ComponentError(m.name, "Calculation for metric", metric.Name, "failed:", err.Error()) continue } } + // If no event set could be added, shut down LikwidCollector if len(m.groups) == 0 { C.perfmon_finalize() C.topology_finalize() @@ -306,6 +365,7 @@ func (m *LikwidCollector) Init(config json.RawMessage) error { return nil } +// take a measurement for 'interval' seconds of event set index 'group' func (m *LikwidCollector) takeMeasurement(group int, interval time.Duration) error { var ret C.int gid := m.groups[group] @@ -336,101 +396,104 @@ func (m *LikwidCollector) takeMeasurement(group int, interval time.Duration) err return nil } -func (m *LikwidCollector) calcEventsetMetrics(group int, interval time.Duration) error { +// Get all measurement results for an event set, derive the metric values out of the measurement results and send it +func (m *LikwidCollector) calcEventsetMetrics(group int, interval time.Duration, output chan lp.CCMetric) error { var eidx C.int evset := m.config.Eventsets[group] gid := m.groups[group] - for tid := range m.cpulist { - for eidx = 0; int(eidx) < len(evset.Events); eidx++ { - ctr := C.perfmon_getCounterName(gid, eidx) - gctr := C.GoString(ctr) - res := C.perfmon_getLastResult(gid, eidx, C.int(tid)) - m.results[group][tid][gctr] = float64(res) - if m.results[group][tid][gctr] == 0 { - m.results[group][tid][gctr] = 1.0 + + // Go over events and get the results + for eidx = 0; int(eidx) < len(evset.Events); eidx++ { + ctr := C.perfmon_getCounterName(gid, eidx) + ev := C.perfmon_getEventName(gid, eidx) + gctr := C.GoString(ctr) + gev := C.GoString(ev) + // MetricScope for the counter (and if needed the event) + scope := getGranularity(gctr, gev) + // Get the map scope-id -> tids + // This way we read less counters like only the 
responsible hardware thread for a socket + scopemap := m.scopeRespTids[scope] + for _, tid := range scopemap { + if tid >= 0 { + m.results[group][tid]["time"] = interval.Seconds() + m.results[group][tid]["inverseClock"] = float64(1.0 / m.basefreq) + res := C.perfmon_getLastResult(gid, eidx, C.int(tid)) + m.results[group][tid][gctr] = float64(res) } } - m.results[group][tid]["time"] = interval.Seconds() - m.results[group][tid]["inverseClock"] = float64(1.0 / m.basefreq) - for _, metric := range evset.Metrics { - value, err := gval.Evaluate(metric.Calc, m.results[group][tid], gval.Full()) - if err != nil { - cclog.ComponentError(m.name, "Calculation for metric", metric.Name, "failed:", err.Error()) - continue - } - m.mresults[group][tid][metric.Name] = value.(float64) - } } - return nil -} -func (m *LikwidCollector) calcGlobalMetrics(interval time.Duration) error { - for _, metric := range m.config.Metrics { - for tid := range m.cpulist { - params := make(map[string]interface{}) - for j := range m.groups { - for mname, mres := range m.mresults[j][tid] { - params[mname] = mres + // Go over the event set metrics, derive the value out of the event:counter values and send it + for _, metric := range evset.Metrics { + // The metric scope is determined in the Init() function + // Get the map scope-id -> tids + scopemap := m.scopeRespTids[metric.Scope] + for domain, tid := range scopemap { + if tid >= 0 { + value, err := mr.EvalFloat64Condition(metric.Calc, m.results[group][tid]) + if err != nil { + cclog.ComponentError(m.name, "Calculation for metric", metric.Name, "failed:", err.Error()) + continue + } + m.mresults[group][tid][metric.Name] = value + // Now we have the result, send it with the proper tags + tags := map[string]string{"type": metric.Scope.String()} + if metric.Scope != "node" { + tags["type-id"] = fmt.Sprintf("%d", domain) + } + fields := map[string]interface{}{"value": value} + y, err := lp.New(metric.Name, tags, m.meta, fields, time.Now()) + if err == nil 
{ + output <- y } } - value, err := gval.Evaluate(metric.Calc, params, gval.Full()) - if err != nil { - cclog.ComponentError(m.name, "Calculation for metric", metric.Name, "failed:", err.Error()) - continue + } + } + + return nil +} + +// Go over the global metrics, derive the value out of the event sets' metric values and send it +func (m *LikwidCollector) calcGlobalMetrics(interval time.Duration, output chan lp.CCMetric) error { + for _, metric := range m.config.Metrics { + scopemap := m.scopeRespTids[metric.Scope] + for domain, tid := range scopemap { + if tid >= 0 { + // Here we generate parameter list + params := make(map[string]interface{}) + for j := range m.groups { + for mname, mres := range m.mresults[j][tid] { + params[mname] = mres + } + } + // Evaluate the metric + value, err := mr.EvalFloat64Condition(metric.Calc, params) + if err != nil { + cclog.ComponentError(m.name, "Calculation for metric", metric.Name, "failed:", err.Error()) + continue + } + m.gmresults[tid][metric.Name] = value + // Now we have the result, send it with the proper tags + tags := map[string]string{"type": metric.Scope.String()} + if metric.Scope != "node" { + tags["type-id"] = fmt.Sprintf("%d", domain) + } + fields := map[string]interface{}{"value": value} + y, err := lp.New(metric.Name, tags, m.meta, fields, time.Now()) + if err == nil { + output <- y + } } - m.gmresults[tid][metric.Name] = value.(float64) } } return nil } -// func (m *LikwidCollector) calcResultMetrics(interval time.Duration) ([]lp.CCMetric, error) { -// var err error = nil -// metrics := make([]lp.CCMetric, 0) -// for i := range m.groups { -// evset := m.config.Eventsets[i] -// for _, metric := range evset.Metrics { -// log.Print(metric.Name, " ", metric.Scope, " ", metric.granulatity) -// if metric.Scope.Granularity() > metric.granulatity.Granularity() { -// log.Print("Different granularity wanted for ", metric.Name, ": ", metric.Scope, " vs ", metric.granulatity) -// var idlist []int -// idfunc := 
func(cpuid int) int { return cpuid } -// switch metric.Scope { -// case "socket": -// idlist = topo.SocketList() -// idfunc = topo.GetCpuSocket -// case "numa": -// idlist = topo.NumaNodeList() -// idfunc = topo.GetCpuNumaDomain -// case "core": -// idlist = topo.CoreList() -// idfunc = topo.GetCpuCore -// case "die": -// idlist = topo.DieList() -// idfunc = topo.GetCpuDie -// case "node": -// idlist = topo.CpuList() -// } -// for i := 0; i < num_results; i++ { - -// } -// } -// } -// } -// for _, metric := range m.config.Metrics { -// log.Print(metric.Name, " ", metric.Scope, " ", metric.granulatity) -// if metric.Scope.Granularity() > metric.granulatity.Granularity() { -// log.Print("Different granularity wanted for ", metric.Name, ": ", metric.Scope, " vs ", metric.granulatity) -// } -// } -// return metrics, err -// } - +// main read function taking multiple measurement rounds, each 'interval' seconds long func (m *LikwidCollector) Read(interval time.Duration, output chan lp.CCMetric) { if !m.init { return } - defer m.CatchGvalPanic() for i, _ := range m.groups { // measure event set 'i' for 'interval' seconds @@ -439,77 +502,11 @@ func (m *LikwidCollector) Read(interval time.Duration, output chan lp.CCMetric) cclog.ComponentError(m.name, err.Error()) continue } - m.calcEventsetMetrics(i, interval) - } - - m.calcGlobalMetrics(interval) - - //metrics, err = m.calcResultMetrics(interval) - - for i := range m.groups { - evset := m.config.Eventsets[i] - for _, metric := range evset.Metrics { - - _, skip := stringArrayContains(m.config.ExcludeMetrics, metric.Name) - if metric.Publish && !skip { - if metric.Scope == "socket" { - for sid, tid := range m.sock2tid { - y, err := lp.New(metric.Name, - map[string]string{"type": "socket", - "type-id": fmt.Sprintf("%d", int(sid))}, - m.meta, - map[string]interface{}{"value": m.mresults[i][tid][metric.Name]}, - time.Now()) - if err == nil { - output <- y - } - } - } else if metric.Scope == "hwthread" { - for tid, cpu := range 
m.cpulist { - y, err := lp.New(metric.Name, - map[string]string{"type": "cpu", - "type-id": fmt.Sprintf("%d", int(cpu))}, - m.meta, - map[string]interface{}{"value": m.mresults[i][tid][metric.Name]}, - time.Now()) - if err == nil { - output <- y - } - } - } - } - } - } - for _, metric := range m.config.Metrics { - _, skip := stringArrayContains(m.config.ExcludeMetrics, metric.Name) - if metric.Publish && !skip { - if metric.Scope == "socket" { - for sid, tid := range m.sock2tid { - y, err := lp.New(metric.Name, - map[string]string{"type": "socket", - "type-id": fmt.Sprintf("%d", int(sid))}, - m.meta, - map[string]interface{}{"value": m.gmresults[tid][metric.Name]}, - time.Now()) - if err == nil { - output <- y - } - } - } else if metric.Scope == "hwthread" { - for tid, cpu := range m.cpulist { - y, err := lp.New(metric.Name, - map[string]string{"type": "cpu", - "type-id": fmt.Sprintf("%d", int(cpu))}, - m.meta, - map[string]interface{}{"value": m.gmresults[tid][metric.Name]}, - time.Now()) - if err == nil { - output <- y - } - } - } - } + // read measurements and derive event set metrics + m.calcEventsetMetrics(i, interval, output) } + // use the event set metrics to derive the global metrics + m.calcGlobalMetrics(interval, output) } func (m *LikwidCollector) Close() { From 2c13cecf133dc1f9c962cd9a24f25551d724584c Mon Sep 17 00:00:00 2001 From: Thomas Roehl Date: Wed, 2 Feb 2022 14:52:19 +0100 Subject: [PATCH 072/174] Fix link in MetricRouter README --- internal/metricRouter/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/internal/metricRouter/README.md b/internal/metricRouter/README.md index 17f336c..d239c64 100644 --- a/internal/metricRouter/README.md +++ b/internal/metricRouter/README.md @@ -190,7 +190,7 @@ The above configuration, collects all metric values for metrics evaluating `if` If you are not interested in the input metrics `sub_metric_%d+` at all, you can add the same condition used here to the `drop_metrics_if` section to 
drop them. Use cases for `interval_aggregates`: -- Combine multiple metrics of the a collector to a new one like the [MemstatCollector](../../collectors/memstatMetric.go) does it for `mem_used`)): +- Combine multiple metrics of the a collector to a new one like the [MemstatCollector](../../collectors/memstatMetric.md) does it for `mem_used`)): ```json { "name" : "mem_used", From 1222f7a32fa96d5e8861cb6956c81ef579ffd847 Mon Sep 17 00:00:00 2001 From: Thomas Roehl Date: Wed, 2 Feb 2022 15:30:14 +0100 Subject: [PATCH 073/174] Configuration option to disable MetricCache completely --- internal/metricRouter/README.md | 9 +++++++ internal/metricRouter/metricRouter.go | 34 +++++++++++++++------------ 2 files changed, 28 insertions(+), 15 deletions(-) diff --git a/internal/metricRouter/README.md b/internal/metricRouter/README.md index d239c64..9cd0d6c 100644 --- a/internal/metricRouter/README.md +++ b/internal/metricRouter/README.md @@ -6,6 +6,7 @@ The CCMetric router sits in between the collectors and the sinks and can be used ```json { + "num_cache_intervals" : 1, "interval_timestamp" : true, "add_tags" : [ { @@ -58,6 +59,12 @@ There are three main options `add_tags`, `delete_tags` and `interval_timestamp`. The collectors' `Read()` functions are not called simultaneously and therefore the metrics gathered in an interval can have different timestamps. If you want to avoid that and have a common timestamp (the beginning of the interval), set this option to `true` and the MetricRouter sets the time. +# The `num_cache_intervals` option + +If the MetricRouter should buffer metrics of intervals in a MetricCache, this option specifies the number of past intervals that should be kept. If `num_cache_intervals = 0`, the cache is disabled. With `num_cache_intervals = 1`, only the metrics of the last interval are buffered. + +A `num_cache_intervals > 0` is required to use the `interval_aggregates` option. 
+ # The `rename_metrics` option In the ClusterCockpit world we specified a set of standard metrics. Since some collectors determine the metric names based on files, execuables and libraries, they might change from system to system (or installation to installtion, OS to OS, ...). In order to get the common names, you can rename incoming metrics before sending them to the sink. If the metric name matches the `oldname`, it is changed to `newname` @@ -164,6 +171,8 @@ The first line is comparable with the example in `drop_metrics`, it drops all me # Aggregate metric values of the current interval with the `interval_aggregates` option +**Note:** `interval_aggregates` works only if `num_cache_intervals` > 0 + In some cases, you need to derive new metrics based on the metrics arriving during an interval. This can be done in the `interval_aggregates` section. The logic is similar to the other metric manipulation and filtering options. A cache stores all metrics that arrive during an interval. At the beginning of the *next* interval, the list of metrics is submitted to the MetricAggregator. It derives new metrics and submits them back to the MetricRouter, so they are sent in the next interval but have the timestamp of the previous interval beginning. 
```json diff --git a/internal/metricRouter/metricRouter.go b/internal/metricRouter/metricRouter.go index 8ec7e06..6d63e15 100644 --- a/internal/metricRouter/metricRouter.go +++ b/internal/metricRouter/metricRouter.go @@ -95,17 +95,15 @@ func (r *metricRouter) Init(ticker mct.MultiChanTicker, wg *sync.WaitGroup, rout cclog.ComponentError("MetricRouter", err.Error()) return err } - numIntervals := r.config.NumCacheIntervals - if numIntervals <= 0 { - numIntervals = 1 - } - r.cache, err = NewCache(r.cache_input, r.ticker, &r.cachewg, numIntervals) - if err != nil { - cclog.ComponentError("MetricRouter", "MetricCache initialization failed:", err.Error()) - return err - } - for _, agg := range r.config.IntervalAgg { - r.cache.AddAggregation(agg.Name, agg.Function, agg.Condition, agg.Tags, agg.Meta) + if r.config.NumCacheIntervals >= 0 { + r.cache, err = NewCache(r.cache_input, r.ticker, &r.cachewg, r.config.NumCacheIntervals) + if err != nil { + cclog.ComponentError("MetricRouter", "MetricCache initialization failed:", err.Error()) + return err + } + for _, agg := range r.config.IntervalAgg { + r.cache.AddAggregation(agg.Name, agg.Function, agg.Condition, agg.Tags, agg.Meta) + } } r.config.dropMetrics = make(map[string]bool) for _, mname := range r.config.DropMetrics { @@ -244,7 +242,9 @@ func (r *metricRouter) Start() { } // Start Metric Cache - r.cache.Start() + if r.config.NumCacheIntervals > 0 { + r.cache.Start() + } r.wg.Add(1) go func() { @@ -266,7 +266,9 @@ func (r *metricRouter) Start() { } // even if the metric is dropped, it is stored in the cache for // aggregations - r.cache.Add(p) + if r.config.NumCacheIntervals > 0 { + r.cache.Add(p) + } case p := <-r.recv_input: // receive from receive manager @@ -316,8 +318,10 @@ func (r *metricRouter) Close() { // wait for close of channel r.timerdone <-r.timerdone } - r.cache.Close() - r.cachewg.Wait() + if r.config.NumCacheIntervals > 0 { + r.cache.Close() + r.cachewg.Wait() + } } // New creates a new initialized 
metric router From 5bf538bf97cfde54b3859875f4098e7eec4c1a58 Mon Sep 17 00:00:00 2001 From: Thomas Roehl Date: Wed, 2 Feb 2022 16:40:20 +0100 Subject: [PATCH 074/174] Update LikwidCollector page --- collectors/likwidMetric.md | 38 +++++++++++++++++++++++++++++--------- 1 file changed, 29 insertions(+), 9 deletions(-) diff --git a/collectors/likwidMetric.md b/collectors/likwidMetric.md index 08b917f..9ec62e7 100644 --- a/collectors/likwidMetric.md +++ b/collectors/likwidMetric.md @@ -1,5 +1,25 @@ ## `likwid` collector + +The `likwid` collector is probably the most complicated collector. The LIKWID library is included as static library with *direct* access mode. The *direct* access mode is suitable if the daemon is executed by a root user. The static library does not contain the performance groups, so all information needs to be provided in the configuration. + +The `likwid` configuration consists of two parts, the "eventsets" and "globalmetrics": +- An event set list itself has two parts, the "events" and a set of derivable "metrics". Each of the "events" is a counter:event pair in LIKWID's syntax. The "metrics" are a list of formulas to derive the metric value from the measurements of the "events". Each metric has a name, the formula, a scope and a publish flag. A counter names can be used like variables in the formulas, so `PMC0+PMC1` sums the measurements for the both events configured in the counters `PMC0` and `PMC1`. The scope tells the Collector whether it is a metric for each hardware thread (`hwthread`) or each CPU socket (`socket`). The last one is the publishing flag. It tells the collector whether a metric should be sent to the router. +- The global metrics are metrics which require data from all event set measurements to be derived. The inputs are the metrics in the event sets. Similar to the metrics in the event sets, the global metrics are defined by a name, a formula, a scope and a publish flag. See event set metrics for details. 
The only difference is that there is no access to the raw event measurements anymore but only to the metrics. So, the idea is to derive a metric in the "eventsets" section and reuse it in the "globalmetrics" part. If you need a metric only for deriving the global metrics, disable forwarding of the event set metrics. + +### Available metric scopes + +Hardware performance counters are scattered all over the system nowadays. A counter coveres a specific part of the system. While there are hardware thread specific counter for CPU cycles, instructions and so on, some others are specific for a whole CPU socket/package. To address that, the collector provides the specification of a 'scope' for each metric. + +- `hwthread` : One metric per CPU hardware thread with the tags `"type" : "cpu"` and `"type-id" : "$hwthread_id"` +- `socket` : One metric per CPU socket/package with the tags `"type" : "socket"` and `"type-id" : "$socket_id"` + +**Note:** You cannot specify `socket` scope for a metric that is measured at `hwthread` scope, so some kind of expert knowledge or lookup work in the [Likwid Wiki](https://github.com/RRZE-HPC/likwid/wiki) is required. Get the scope of each counter from the *Architecture* pages and as soon as one counter in a metric is socket-specific, the whole metric is socket-specific. 
+ + +### Example configuration + + ```json "likwid": { "eventsets": [ @@ -20,25 +40,25 @@ { "name": "ipc", "calc": "PMC0/PMC1", - "socket_scope": false, + "scope": "hwthread", "publish": true }, { "name": "flops_any", "calc": "0.000001*PMC2/time", - "socket_scope": false, + "scope": "hwthread", "publish": true }, { "name": "clock_mhz", "calc": "0.000001*(FIXC1/FIXC2)/inverseClock", - "socket_scope": false, + "scope": "hwthread", "publish": true }, { "name": "mem1", "calc": "0.000001*(DFC0+DFC1+DFC2+DFC3)*64.0/time", - "socket_scope": true, + "scope": "socket", "publish": false } ] @@ -56,19 +76,19 @@ { "name": "pwr_core", "calc": "PWR0/time", - "socket_scope": false, + "scope": "socket", "publish": true }, { "name": "pwr_pkg", "calc": "PWR1/time", - "socket_scope": true, + "scope": "socket", "publish": true }, { "name": "mem2", "calc": "0.000001*(DFC0+DFC1+DFC2+DFC3)*64.0/time", - "socket_scope": true, + "scope": "socket", "publish": false } ] @@ -78,14 +98,14 @@ { "name": "mem_bw", "calc": "mem1+mem2", - "socket_scope": true, + "scope": "socket", "publish": true } ] } ``` -_Example config suitable for AMD Zen3_ +### How to get the eventsets and metrics from LIKWID The `likwid` collector reads hardware performance counters at a **hwthread** and **socket** level. The configuration looks quite complicated but it is basically copy&paste from [LIKWID's performance groups](https://github.com/RRZE-HPC/likwid/tree/master/groups). The collector made multiple iterations and tried to use the performance groups but it lacked flexibility. The current way of configuration provides most flexibility. 
From 6f399d5f081aa5514436126593f3d4b2cc77eed6 Mon Sep 17 00:00:00 2001 From: Thomas Roehl Date: Wed, 2 Feb 2022 16:46:35 +0100 Subject: [PATCH 075/174] Add scope guidelines in LikwidCollector page --- collectors/likwidMetric.md | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/collectors/likwidMetric.md b/collectors/likwidMetric.md index 9ec62e7..f8ac2d1 100644 --- a/collectors/likwidMetric.md +++ b/collectors/likwidMetric.md @@ -5,7 +5,7 @@ The `likwid` collector is probably the most complicated collector. The LIKWID li The `likwid` configuration consists of two parts, the "eventsets" and "globalmetrics": - An event set list itself has two parts, the "events" and a set of derivable "metrics". Each of the "events" is a counter:event pair in LIKWID's syntax. The "metrics" are a list of formulas to derive the metric value from the measurements of the "events". Each metric has a name, the formula, a scope and a publish flag. A counter names can be used like variables in the formulas, so `PMC0+PMC1` sums the measurements for the both events configured in the counters `PMC0` and `PMC1`. The scope tells the Collector whether it is a metric for each hardware thread (`hwthread`) or each CPU socket (`socket`). The last one is the publishing flag. It tells the collector whether a metric should be sent to the router. -- The global metrics are metrics which require data from all event set measurements to be derived. The inputs are the metrics in the event sets. Similar to the metrics in the event sets, the global metrics are defined by a name, a formula, a scope and a publish flag. See event set metrics for details. The only difference is that there is no access to the raw event measurements anymore but only to the metrics. So, the idea is to derive a metric in the "eventsets" section and reuse it in the "globalmetrics" part. If you need a metric only for deriving the global metrics, disable forwarding of the event set metrics. 
+- The global metrics are metrics which require data from all event set measurements to be derived. The inputs are the metrics in the event sets. Similar to the metrics in the event sets, the global metrics are defined by a name, a formula, a scope and a publish flag. See event set metrics for details. The only difference is that there is no access to the raw event measurements anymore but only to the metrics. So, the idea is to derive a metric in the "eventsets" section and reuse it in the "globalmetrics" part. If you need a metric only for deriving the global metrics, disable forwarding of the event set metrics. **Be aware** that the combination might be misleading because the "behavior" of a metric changes over time and the multiple measurements might count different computing phases. ### Available metric scopes @@ -16,6 +16,12 @@ Hardware performance counters are scattered all over the system nowadays. A coun **Note:** You cannot specify `socket` scope for a metric that is measured at `hwthread` scope, so some kind of expert knowledge or lookup work in the [Likwid Wiki](https://github.com/RRZE-HPC/likwid/wiki) is required. Get the scope of each counter from the *Architecture* pages and as soon as one counter in a metric is socket-specific, the whole metric is socket-specific. +As a guideline: +- All counters `FIXCx`, `PMCy` and `TMAz` have the scope `hwthread` +- All counters names containing `BOX` have the scope `socket` +- All `PWRx` counters have scope `socket`, except `"PWR1" : "RAPL_CORE_ENERGY"` has `hwthread` scope +- All `DFCx` counters have scope `socket` + ### Example configuration @@ -128,12 +134,9 @@ METRICS -> "metrics": [ IPC PMC0/PMC1 -> { -> "name" : "IPC", -> "calc" : "PMC0/PMC1", - -> "socket_scope": false, + -> "scope": "hwthread", -> "publish": true -> } -> ] ``` -The `socket_scope` option tells whether it is submitted per socket or per hwthread. If a metric is only used for internal calculations, you can set `publish = false`. 
- -Since some metrics can only be gathered in multiple measurements (like the memory bandwidth on AMD Zen3 chips), configure multiple eventsets like in the example config and use the `globalmetrics` section to combine them. **Be aware** that the combination might be misleading because the "behavior" of a metric changes over time and the multiple measurements might count different computing phases. From e59852be0305aa2fe12b2066be8e8940fd8096ed Mon Sep 17 00:00:00 2001 From: Thomas Roehl Date: Wed, 2 Feb 2022 16:55:15 +0100 Subject: [PATCH 076/174] Fix LikwidCollector, merge artifact causes problems --- collectors/likwidMetric.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/collectors/likwidMetric.go b/collectors/likwidMetric.go index 82e241d..54f025a 100644 --- a/collectors/likwidMetric.go +++ b/collectors/likwidMetric.go @@ -495,7 +495,7 @@ func (m *LikwidCollector) Read(interval time.Duration, output chan lp.CCMetric) return } - for i, _ := range m.groups { + for i := range m.groups { // measure event set 'i' for 'interval' seconds err := m.takeMeasurement(i, interval) if err != nil { From 2806b1e7cc4960c18a62c968593614d596ad0edf Mon Sep 17 00:00:00 2001 From: Thomas Roehl Date: Wed, 2 Feb 2022 17:14:29 +0100 Subject: [PATCH 077/174] Remove debugging artifacts --- collectors/likwidMetric.go | 2 -- 1 file changed, 2 deletions(-) diff --git a/collectors/likwidMetric.go b/collectors/likwidMetric.go index 54f025a..182cb72 100644 --- a/collectors/likwidMetric.go +++ b/collectors/likwidMetric.go @@ -243,9 +243,7 @@ func (m *LikwidCollector) getResponsiblities() map[MetricScope]map[int]int { scopes := GetAllMetricScopes() complete := make(map[MetricScope]map[int]int) for _, s := range scopes { - cclog.Debug("Start ", s) complete[s] = get_cpus(s) - cclog.Debug("End ", s) } return complete } From a0164830120e5625d181008ce5762a18a6e4b8d9 Mon Sep 17 00:00:00 2001 From: Holger Obermaier <40787752+ho-ob@users.noreply.github.com> Date: Thu, 3 Feb 2022 
15:02:13 +0100 Subject: [PATCH 078/174] Add NUMA metric collector. --- collectors/README.md | 5 ++ collectors/collectorManager.go | 1 + collectors/numastatsMetric.go | 126 +++++++++++++++++++++++++++++++++ 3 files changed, 132 insertions(+) create mode 100644 collectors/numastatsMetric.go diff --git a/collectors/README.md b/collectors/README.md index 1c3784e..5b650eb 100644 --- a/collectors/README.md +++ b/collectors/README.md @@ -71,6 +71,11 @@ type SampleCollector struct { } func (m *SampleCollector) Init(config json.RawMessage) error { + // Check if already initialized + if m.init { + return nil + } + m.name = "SampleCollector" m.setup() if len(config) > 0 { diff --git a/collectors/collectorManager.go b/collectors/collectorManager.go index f91db20..62ea4d2 100644 --- a/collectors/collectorManager.go +++ b/collectors/collectorManager.go @@ -32,6 +32,7 @@ var AvailableCollectors = map[string]MetricCollector{ "cpufreq": new(CPUFreqCollector), "cpufreq_cpuinfo": new(CPUFreqCpuInfoCollector), "nfsstat": new(NfsCollector), + "numastats": new(NUMAStatsCollector), } // Metric collector manager data structure diff --git a/collectors/numastatsMetric.go b/collectors/numastatsMetric.go new file mode 100644 index 0000000..e6f31a2 --- /dev/null +++ b/collectors/numastatsMetric.go @@ -0,0 +1,126 @@ +package collectors + +import ( + "bufio" + "encoding/json" + "fmt" + "log" + "os" + "path/filepath" + "strconv" + "strings" + "time" + + lp "github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric" +) + +// +// Numa policy hit/miss statistics +// +// numa_hit: +// A process wanted to allocate memory from this node, and succeeded. +// numa_miss: +// A process wanted to allocate memory from another node, +// but ended up with memory from this node. +// numa_foreign: +// A process wanted to allocate on this node, +// but ended up with memory from another node. +// local_node: +// A process ran on this node's CPU, +// and got memory from this node. 
+// other_node: +// A process ran on a different node's CPU +// and got memory from this node. +// interleave_hit: +// Interleaving wanted to allocate from this node +// and succeeded. +// +// See: https://www.kernel.org/doc/html/latest/admin-guide/numastat.html +// +type NUMAStatsCollectorTopolgy struct { + file string + tagSet map[string]string +} + +type NUMAStatsCollector struct { + metricCollector + topology []NUMAStatsCollectorTopolgy +} + +func (m *NUMAStatsCollector) Init(config json.RawMessage) error { + // Check if already initialized + if m.init { + return nil + } + + m.name = "NUMAStatsCollector" + m.setup() + m.meta = map[string]string{ + "source": m.name, + "group": "NUMA", + } + + // Loop for all NUMA node directories + baseDir := "/sys/devices/system/node" + globPattern := filepath.Join(baseDir, "node[0-9]*") + dirs, err := filepath.Glob(globPattern) + if err != nil { + return fmt.Errorf("unable to glob files with pattern %s", globPattern) + } + if dirs == nil { + return fmt.Errorf("unable to find any files with pattern %s", globPattern) + } + m.topology = make([]NUMAStatsCollectorTopolgy, 0, len(dirs)) + for _, dir := range dirs { + node := strings.TrimPrefix(dir, "/sys/devices/system/node/node") + file := filepath.Join(dir, "numastat") + m.topology = append(m.topology, + NUMAStatsCollectorTopolgy{ + file: file, + tagSet: map[string]string{"domain": node}, + }) + } + + m.init = true + return nil +} + +func (m *NUMAStatsCollector) Read(interval time.Duration, output chan lp.CCMetric) { + if !m.init { + return + } + + for i := range m.topology { + // Loop for all NUMA domains + t := &m.topology[i] + + now := time.Now() + file, err := os.Open(t.file) + if err != nil { + return + } + scanner := bufio.NewScanner(file) + for scanner.Scan() { + split := strings.Fields(scanner.Text()) + if len(split) != 2 { + continue + } + key := split[0] + value, err := strconv.ParseInt(split[1], 10, 64) + if err != nil { + log.Printf("failed to convert %s='%s' to int64: 
%v", key, split[1], err) + continue + } + y, err := lp.New("numastats_"+key, t.tagSet, m.meta, map[string]interface{}{"value": value}, now) + if err == nil { + output <- y + } + } + + file.Close() + } +} + +func (m *NUMAStatsCollector) Close() { + m.init = false +} From 92d4a9c2b9a07894f585f4fce9c8d0296f56da97 Mon Sep 17 00:00:00 2001 From: Thomas Gruber Date: Thu, 3 Feb 2022 16:52:55 +0100 Subject: [PATCH 079/174] Split MetricRouter and MetricAggregator (#24) * Split MetricRouter and MetricAggregator * Missing change in MetricCache * Add README for MetricAggregator --- collectors/likwidMetric.go | 18 ++++----- internal/metricAggregator/README.md | 38 +++++++++++++++++++ .../metricAggregator.go | 10 ++--- .../metricAggregatorFunctions.go | 2 +- internal/metricRouter/metricCache.go | 5 ++- internal/metricRouter/metricRouter.go | 25 ++++++------ 6 files changed, 69 insertions(+), 29 deletions(-) create mode 100644 internal/metricAggregator/README.md rename internal/{metricRouter => metricAggregator}/metricAggregator.go (98%) rename internal/{metricRouter => metricAggregator}/metricAggregatorFunctions.go (99%) diff --git a/collectors/likwidMetric.go b/collectors/likwidMetric.go index 182cb72..957c4aa 100644 --- a/collectors/likwidMetric.go +++ b/collectors/likwidMetric.go @@ -24,7 +24,7 @@ import ( cclog "github.com/ClusterCockpit/cc-metric-collector/internal/ccLogger" lp "github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric" topo "github.com/ClusterCockpit/cc-metric-collector/internal/ccTopology" - mr "github.com/ClusterCockpit/cc-metric-collector/internal/metricRouter" + agg "github.com/ClusterCockpit/cc-metric-collector/internal/metricAggregator" ) type MetricScope string @@ -70,10 +70,10 @@ func GetAllMetricScopes() []MetricScope { } type LikwidCollectorMetricConfig struct { - Name string `json:"name"` // Name of the metric - Calc string `json:"calc"` // Calculation for the metric using - Aggr string `json:"aggregation"` // if scope unequal to LIKWID 
metric scope, the values are combined (sum, min, max, mean or avg, median) - Scope MetricScope `json:"scope"` // scope for calculation. subscopes are aggregated using the 'aggregation' function + Name string `json:"name"` // Name of the metric + Calc string `json:"calc"` // Calculation for the metric using + //Aggr string `json:"aggregation"` // if scope unequal to LIKWID metric scope, the values are combined (sum, min, max, mean or avg, median) + Scope MetricScope `json:"scope"` // scope for calculation. subscopes are aggregated using the 'aggregation' function Publish bool `json:"publish"` granulatity MetricScope } @@ -314,7 +314,7 @@ func (m *LikwidCollector) Init(config json.RawMessage) error { } for _, metric := range evset.Metrics { // Try to evaluate the metric - _, err := mr.EvalFloat64Condition(metric.Calc, params) + _, err := agg.EvalFloat64Condition(metric.Calc, params) if err != nil { cclog.ComponentError(m.name, "Calculation for metric", metric.Name, "failed:", err.Error()) continue @@ -343,7 +343,7 @@ func (m *LikwidCollector) Init(config json.RawMessage) error { } for _, metric := range m.config.Metrics { // Try to evaluate the global metric - _, err := mr.EvalFloat64Condition(metric.Calc, globalParams) + _, err := agg.EvalFloat64Condition(metric.Calc, globalParams) if err != nil { cclog.ComponentError(m.name, "Calculation for metric", metric.Name, "failed:", err.Error()) continue @@ -428,7 +428,7 @@ func (m *LikwidCollector) calcEventsetMetrics(group int, interval time.Duration, scopemap := m.scopeRespTids[metric.Scope] for domain, tid := range scopemap { if tid >= 0 { - value, err := mr.EvalFloat64Condition(metric.Calc, m.results[group][tid]) + value, err := agg.EvalFloat64Condition(metric.Calc, m.results[group][tid]) if err != nil { cclog.ComponentError(m.name, "Calculation for metric", metric.Name, "failed:", err.Error()) continue @@ -465,7 +465,7 @@ func (m *LikwidCollector) calcGlobalMetrics(interval time.Duration, output chan } } // Evaluate 
the metric - value, err := mr.EvalFloat64Condition(metric.Calc, params) + value, err := agg.EvalFloat64Condition(metric.Calc, params) if err != nil { cclog.ComponentError(m.name, "Calculation for metric", metric.Name, "failed:", err.Error()) continue diff --git a/internal/metricAggregator/README.md b/internal/metricAggregator/README.md new file mode 100644 index 0000000..bc07663 --- /dev/null +++ b/internal/metricAggregator/README.md @@ -0,0 +1,38 @@ +# The MetricAggregator + +In some cases, further combination of metrics or raw values is required. For that strings like `foo + 1` with runtime dependent `foo` need to be evaluated. The MetricAggregator relies on the [`gval`](https://github.com/PaesslerAG/gval) Golang package to perform all expression evaluation. The `gval` package provides the basic arithmetic operations but the MetricAggregator defines additional ones. + +**Note**: To get an impression which expressions can be handled by `gval`, see its [README](https://github.com/PaesslerAG/gval/blob/master/README.md) + +## Simple expression evaluation + +For simple expression evaluation, the MetricAggregator provides two function for different use-cases: +- `EvalBoolCondition(expression string, params map[string]interface{}`: Used by the MetricRouter to match metrics like `metric.Name() == 'mymetric'` +- `EvalFloat64Condition(expression string, params map[string]interface{})`: Used by the MetricRouter and LikwidCollector to derive new values like `(PMC0+PMC1)/PMC3` + +## MetricAggregator extensions for `gval` + +The MetricAggregator provides these functions additional to the `Full` language in `gval`: +- `sum(array)`: Sum up values in an array like `sum(values)` +- `min(array)`: Get the minimum value in an array like `min(values)` +- `avg(array)`: Get the mean value in an array like `avg(values)` +- `mean(array)`: Get the mean value in an array like `mean(values)` +- `max(array)`: Get the maximum value in an array like `max(values)` +- `len(array)`: Get the length 
of an array like `len(values)` +- `median(array)`: Get the median value in an array like `mean(values)` +- `in`: Check existence in an array like `0 in getCpuList()` to check whether there is an entry `0`. Also substring matching works like `temp in metric.Name()` +- `match`: Regular-expression matching like `match('temp_cores_%d+', metric.Name())`. **Note** all `\` in an regex has to be replaced with `%` +- `getCpuCore(cpuid)`: For a CPU id, the the corresponding CPU core id like `getCpuCore(0)` +- `getCpuSocket(cpuid)`: For a CPU id, the the corresponding CPU socket id +- `getCpuNuma(cpuid)`: For a CPU id, the the corresponding NUMA domain id +- `getCpuDie(cpuid)`: For a CPU id, the the corresponding CPU die id +- `getSockCpuList(sockid)`: For a given CPU socket id, the list of CPU ids is returned like the CPUs on socket 1 `getSockCpuList(1)` +- `getNumaCpuList(numaid)`: For a given NUMA node id, the list of CPU ids is returned +- `getDieCpuList(dieid)`: For a given CPU die id, the list of CPU ids is returned +- `getCoreCpuList(coreid)`: For a given CPU core id, the list of CPU ids is returned +- `getCpuList`: Get the list of all CPUs + +## Limitations + +- Since the metrics are written in JSON files which do not allow `""` without proper escaping inside of JSON strings, you have to use `''` for strings. +- Since `\` is interpreted by JSON as escape character, it cannot be used in metrics. But it is required to write regular expressions. So instead of `/`, use `%` and the MetricAggregator replaces them after reading the JSON file. 
\ No newline at end of file diff --git a/internal/metricRouter/metricAggregator.go b/internal/metricAggregator/metricAggregator.go similarity index 98% rename from internal/metricRouter/metricAggregator.go rename to internal/metricAggregator/metricAggregator.go index e3303e4..a05f061 100644 --- a/internal/metricRouter/metricAggregator.go +++ b/internal/metricAggregator/metricAggregator.go @@ -1,4 +1,4 @@ -package metricRouter +package metricAggregator import ( "context" @@ -16,7 +16,7 @@ import ( "github.com/PaesslerAG/gval" ) -type metricAggregatorIntervalConfig struct { +type MetricAggregatorIntervalConfig struct { Name string `json:"name"` // Metric name for the new metric Function string `json:"function"` // Function to apply on the metric Condition string `json:"if"` // Condition for applying function @@ -27,7 +27,7 @@ type metricAggregatorIntervalConfig struct { } type metricAggregator struct { - functions []*metricAggregatorIntervalConfig + functions []*MetricAggregatorIntervalConfig constants map[string]interface{} language gval.Language output chan lp.CCMetric @@ -65,7 +65,7 @@ var metricCacheLanguage = gval.NewLanguage( func (c *metricAggregator) Init(output chan lp.CCMetric) error { c.output = output - c.functions = make([]*metricAggregatorIntervalConfig, 0) + c.functions = make([]*MetricAggregatorIntervalConfig, 0) c.constants = make(map[string]interface{}) // add constants like hostname, numSockets, ... 
to constants list @@ -246,7 +246,7 @@ func (c *metricAggregator) AddAggregation(name, function, condition string, tags return nil } } - var agg metricAggregatorIntervalConfig + var agg MetricAggregatorIntervalConfig agg.Name = name agg.Condition = newcond agg.gvalCond = gvalCond diff --git a/internal/metricRouter/metricAggregatorFunctions.go b/internal/metricAggregator/metricAggregatorFunctions.go similarity index 99% rename from internal/metricRouter/metricAggregatorFunctions.go rename to internal/metricAggregator/metricAggregatorFunctions.go index f00479d..1fbef65 100644 --- a/internal/metricRouter/metricAggregatorFunctions.go +++ b/internal/metricAggregator/metricAggregatorFunctions.go @@ -1,4 +1,4 @@ -package metricRouter +package metricAggregator import ( "errors" diff --git a/internal/metricRouter/metricCache.go b/internal/metricRouter/metricCache.go index 1cfd8c3..67522c9 100644 --- a/internal/metricRouter/metricCache.go +++ b/internal/metricRouter/metricCache.go @@ -7,6 +7,7 @@ import ( cclog "github.com/ClusterCockpit/cc-metric-collector/internal/ccLogger" lp "github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric" + agg "github.com/ClusterCockpit/cc-metric-collector/internal/metricAggregator" mct "github.com/ClusterCockpit/cc-metric-collector/internal/multiChanTicker" ) @@ -28,7 +29,7 @@ type metricCache struct { tickchan chan time.Time done chan bool output chan lp.CCMetric - aggEngine MetricAggregator + aggEngine agg.MetricAggregator } type MetricCache interface { @@ -59,7 +60,7 @@ func (c *metricCache) Init(output chan lp.CCMetric, ticker mct.MultiChanTicker, // Create a new aggregation engine. 
No separate goroutine at the moment // The code is executed by the MetricCache goroutine - c.aggEngine, err = NewAggregator(c.output) + c.aggEngine, err = agg.NewAggregator(c.output) if err != nil { cclog.ComponentError("MetricCache", "Cannot create aggregator") return err diff --git a/internal/metricRouter/metricRouter.go b/internal/metricRouter/metricRouter.go index 6d63e15..a31f2a6 100644 --- a/internal/metricRouter/metricRouter.go +++ b/internal/metricRouter/metricRouter.go @@ -10,6 +10,7 @@ import ( cclog "github.com/ClusterCockpit/cc-metric-collector/internal/ccLogger" lp "github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric" + agg "github.com/ClusterCockpit/cc-metric-collector/internal/metricAggregator" mct "github.com/ClusterCockpit/cc-metric-collector/internal/multiChanTicker" ) @@ -22,15 +23,15 @@ type metricRouterTagConfig struct { // Metric router configuration type metricRouterConfig struct { - AddTags []metricRouterTagConfig `json:"add_tags"` // List of tags that are added when the condition is met - DelTags []metricRouterTagConfig `json:"delete_tags"` // List of tags that are removed when the condition is met - IntervalAgg []metricAggregatorIntervalConfig `json:"interval_aggregates"` // List of aggregation function processed at the end of an interval - DropMetrics []string `json:"drop_metrics"` // List of metric names to drop. For fine-grained dropping use drop_metrics_if - DropMetricsIf []string `json:"drop_metrics_if"` // List of evaluatable terms to drop metrics - RenameMetrics map[string]string `json:"rename_metrics"` // Map to rename metric name from key to value - IntervalStamp bool `json:"interval_timestamp"` // Update timestamp periodically by ticker each interval? 
- NumCacheIntervals int `json:"num_cache_intervals"` // Number of intervals of cached metrics for evaluation - dropMetrics map[string]bool // Internal map for O(1) lookup + AddTags []metricRouterTagConfig `json:"add_tags"` // List of tags that are added when the condition is met + DelTags []metricRouterTagConfig `json:"delete_tags"` // List of tags that are removed when the condition is met + IntervalAgg []agg.MetricAggregatorIntervalConfig `json:"interval_aggregates"` // List of aggregation function processed at the end of an interval + DropMetrics []string `json:"drop_metrics"` // List of metric names to drop. For fine-grained dropping use drop_metrics_if + DropMetricsIf []string `json:"drop_metrics_if"` // List of evaluatable terms to drop metrics + RenameMetrics map[string]string `json:"rename_metrics"` // Map to rename metric name from key to value + IntervalStamp bool `json:"interval_timestamp"` // Update timestamp periodically by ticker each interval? + NumCacheIntervals int `json:"num_cache_intervals"` // Number of intervals of cached metrics for evaluation + dropMetrics map[string]bool // Internal map for O(1) lookup } // Metric router data structure @@ -161,7 +162,7 @@ func (r *metricRouter) DoAddTags(point lp.CCMetric) { conditionMatches = true } else { var err error - conditionMatches, err = EvalBoolCondition(m.Condition, getParamMap(point)) + conditionMatches, err = agg.EvalBoolCondition(m.Condition, getParamMap(point)) if err != nil { cclog.ComponentError("MetricRouter", err.Error()) conditionMatches = false @@ -182,7 +183,7 @@ func (r *metricRouter) DoDelTags(point lp.CCMetric) { conditionMatches = true } else { var err error - conditionMatches, err = EvalBoolCondition(m.Condition, getParamMap(point)) + conditionMatches, err = agg.EvalBoolCondition(m.Condition, getParamMap(point)) if err != nil { cclog.ComponentError("MetricRouter", err.Error()) conditionMatches = false @@ -202,7 +203,7 @@ func (r *metricRouter) dropMetric(point lp.CCMetric) bool { } 
// Checking the dropping conditions for _, m := range r.config.DropMetricsIf { - conditionMatches, err := EvalBoolCondition(m, getParamMap(point)) + conditionMatches, err := agg.EvalBoolCondition(m, getParamMap(point)) if conditionMatches || err != nil { return true } From db02c89683b41b74bf7811ec0dad14e1a9eeda44 Mon Sep 17 00:00:00 2001 From: Thomas Roehl Date: Thu, 3 Feb 2022 22:05:16 +0100 Subject: [PATCH 080/174] Update LustreCollector to use lctl. Sysfs version is commented out --- collectors/lustreMetric.go | 91 +++++++++++++++++++++++++++++--------- 1 file changed, 70 insertions(+), 21 deletions(-) diff --git a/collectors/lustreMetric.go b/collectors/lustreMetric.go index 3e248fa..99b371c 100644 --- a/collectors/lustreMetric.go +++ b/collectors/lustreMetric.go @@ -3,18 +3,21 @@ package collectors import ( "encoding/json" "errors" - "io/ioutil" - "log" + "fmt" + "os/exec" "strconv" "strings" "time" + lp "github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric" ) -const LUSTREFILE = `/proc/fs/lustre/llite/lnec-XXXXXX/stats` +const LUSTRE_SYSFS = `/sys/fs/lustre` +const LCTL_CMD = `lctl` +const LCTL_OPTION = `get_param` type LustreCollectorConfig struct { - Procfiles []string `json:"procfiles"` + LCtlCommand string `json:"lctl_command"` ExcludeMetrics []string `json:"exclude_metrics"` } @@ -24,6 +27,58 @@ type LustreCollector struct { matches map[string]map[string]int devices []string config LustreCollectorConfig + lctl string +} + +func (m *LustreCollector) getDevices() []string { + devices := make([]string, 0) + + // //Version reading devices from sysfs + // globPattern := filepath.Join(LUSTRE_SYSFS, "llite/*/stats") + // files, err := filepath.Glob(globPattern) + // if err != nil { + // return devices + // } + // for _, f := range files { + // pathlist := strings.Split(f, "/") + // devices = append(devices, pathlist[4]) + // } + + command := exec.Command(m.lctl, LCTL_OPTION, "llite.*.stats") + command.Wait() + stdout, err := command.Output() + if err 
!= nil { + return devices + } + for _, line := range strings.Split(string(stdout), "\n") { + if strings.HasPrefix(line, "llite") { + linefields := strings.Split(line, ".") + if len(linefields) > 2 { + devices = append(devices, linefields[1]) + } + } + } + return devices +} + +// //Version reading the stats data of a device from sysfs +// func (m *LustreCollector) getDeviceDataSysfs(device string) []string { +// llitedir := filepath.Join(LUSTRE_SYSFS, "llite") +// devdir := filepath.Join(llitedir, device) +// statsfile := filepath.Join(devdir, "stats") +// buffer, err := ioutil.ReadFile(statsfile) +// if err != nil { +// return make([]string, 0) +// } +// return strings.Split(string(buffer), "\n") +// } + +func (m *LustreCollector) getDeviceDataCommand(device string) []string { + statsfile := fmt.Sprintf("llite.%s.stats", device) + command := exec.Command(m.lctl, LCTL_OPTION, statsfile) + command.Wait() + stdout, _ := command.Output() + return strings.Split(string(stdout), "\n") } func (m *LustreCollector) Init(config json.RawMessage) error { @@ -46,19 +101,18 @@ func (m *LustreCollector) Init(config json.RawMessage) error { "getattr": {"getattr": 1}, "statfs": {"statfs": 1}, "inode_permission": {"inode_permission": 1}} - m.devices = make([]string, 0) - for _, p := range m.config.Procfiles { - _, err := ioutil.ReadFile(p) - if err == nil { - m.devices = append(m.devices, p) - } else { - log.Print(err.Error()) - continue + p, err := exec.LookPath(m.config.LCtlCommand) + if err != nil { + p, err = exec.LookPath(LCTL_CMD) + if err != nil { + return err } } + m.lctl = p + m.devices = m.getDevices() if len(m.devices) == 0 { - return errors.New("No metrics to collect") + return errors.New("no metrics to collect") } m.init = true return nil @@ -68,15 +122,10 @@ func (m *LustreCollector) Read(interval time.Duration, output chan lp.CCMetric) if !m.init { return } - for _, p := range m.devices { - buffer, err := ioutil.ReadFile(p) + for _, device := range m.devices { + stats 
:= m.getDeviceDataCommand(device) - if err != nil { - log.Print(err) - return - } - - for _, line := range strings.Split(string(buffer), "\n") { + for _, line := range stats { lf := strings.Fields(line) if len(lf) > 1 { for match, fields := range m.matches { From 02cd21abe24110867a5a1a1b75bceb288caf8af6 Mon Sep 17 00:00:00 2001 From: Thomas Roehl Date: Fri, 4 Feb 2022 12:39:25 +0100 Subject: [PATCH 081/174] HTTPS for HttpSink --- sinks/httpSink.go | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/sinks/httpSink.go b/sinks/httpSink.go index 25b0082..a703f82 100644 --- a/sinks/httpSink.go +++ b/sinks/httpSink.go @@ -26,7 +26,11 @@ func (s *HttpSink) Init(config sinkConfig) error { } s.client = &http.Client{} - s.url = fmt.Sprintf("http://%s:%s/%s", config.Host, config.Port, config.Database) + proto := "http" + if config.SSL { + proto = "https" + } + s.url = fmt.Sprintf("%s://%s:%s/%s", proto, config.Host, config.Port, config.Database) s.port = config.Port s.jwt = config.Password s.buffer = &bytes.Buffer{} From 66b9a25a88ba759f53d5d83a9bcb73119766076c Mon Sep 17 00:00:00 2001 From: Thomas Roehl Date: Fri, 4 Feb 2022 12:39:59 +0100 Subject: [PATCH 082/174] Prefix metrics from NetstatCollector with 'net' --- collectors/netstatMetric.go | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/collectors/netstatMetric.go b/collectors/netstatMetric.go index 86437ea..0ec94a4 100644 --- a/collectors/netstatMetric.go +++ b/collectors/netstatMetric.go @@ -7,6 +7,7 @@ import ( "strconv" "strings" "time" + lp "github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric" ) @@ -27,10 +28,10 @@ func (m *NetstatCollector) Init(config json.RawMessage) error { m.setup() m.meta = map[string]string{"source": m.name, "group": "Memory"} m.matches = map[int]string{ - 1: "bytes_in", - 9: "bytes_out", - 2: "pkts_in", - 10: "pkts_out", + 1: "net_bytes_in", + 9: "net_bytes_out", + 2: "net_pkts_in", + 10: "net_pkts_out", } if len(config) > 0 { err := 
json.Unmarshal(config, &m.config) From 76b69c59b4b70125938fbf08ab153e7292c6d060 Mon Sep 17 00:00:00 2001 From: Holger Obermaier <40787752+ho-ob@users.noreply.github.com> Date: Fri, 4 Feb 2022 14:42:42 +0100 Subject: [PATCH 083/174] Switched to cclog.ComponentError() for error reporting in Read() --- collectors/README.md | 6 +++ collectors/cpufreqCpuinfoMetric.go | 13 +++-- collectors/cpufreqMetric.go | 10 ++-- collectors/gpfsMetric.go | 79 ++++++++++++++++++------------ collectors/numastatsMetric.go | 13 +++-- 5 files changed, 79 insertions(+), 42 deletions(-) diff --git a/collectors/README.md b/collectors/README.md index 5b650eb..8423c95 100644 --- a/collectors/README.md +++ b/collectors/README.md @@ -96,6 +96,12 @@ func (m *SampleCollector) Read(interval time.Duration, output chan lp.CCMetric) } // tags for the metric, if type != node use proper type and type-id tags := map[string]string{"type" : "node"} + + x, err := GetMetric() + if err != nil { + cclog.ComponentError(m.name, fmt.Sprintf("Read(): %v", err)) + } + // Each metric has exactly one field: value ! 
value := map[string]interface{}{"value": int(x)} y, err := lp.New("sample_metric", tags, m.meta, value, time.Now()) diff --git a/collectors/cpufreqCpuinfoMetric.go b/collectors/cpufreqCpuinfoMetric.go index 9c91a50..c77a981 100644 --- a/collectors/cpufreqCpuinfoMetric.go +++ b/collectors/cpufreqCpuinfoMetric.go @@ -5,13 +5,13 @@ import ( "encoding/json" "fmt" - "log" "os" "strconv" "strings" "time" - lp "github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric" + cclog "github.com/ClusterCockpit/cc-metric-collector/internal/ccLogger" + lp "github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric" ) // @@ -151,7 +151,6 @@ func (m *CPUFreqCpuInfoCollector) Init(config json.RawMessage) error { return nil } - func (m *CPUFreqCpuInfoCollector) Read(interval time.Duration, output chan lp.CCMetric) { if !m.init { return @@ -159,7 +158,9 @@ func (m *CPUFreqCpuInfoCollector) Read(interval time.Duration, output chan lp.CC const cpuInfoFile = "/proc/cpuinfo" file, err := os.Open(cpuInfoFile) if err != nil { - log.Printf("Failed to open '%s': %v", cpuInfoFile, err) + cclog.ComponentError( + m.name, + fmt.Sprintf("Read(): Failed to open '%s': %v", cpuInfoFile, err)) return } defer file.Close() @@ -178,7 +179,9 @@ func (m *CPUFreqCpuInfoCollector) Read(interval time.Duration, output chan lp.CC if !t.isHT { value, err := strconv.ParseFloat(strings.TrimSpace(lineSplit[1]), 64) if err != nil { - log.Printf("Failed to convert cpu MHz to float: %v", err) + cclog.ComponentError( + m.name, + fmt.Sprintf("Read(): Failed to convert cpu MHz to float: %v", err)) return } y, err := lp.New("cpufreq", t.tagSet, m.meta, map[string]interface{}{"value": value}, now) diff --git a/collectors/cpufreqMetric.go b/collectors/cpufreqMetric.go index f3309ff..b464160 100644 --- a/collectors/cpufreqMetric.go +++ b/collectors/cpufreqMetric.go @@ -4,13 +4,13 @@ import ( "bufio" "encoding/json" "fmt" - "log" "os" "path/filepath" "strconv" "strings" "time" + cclog 
"github.com/ClusterCockpit/cc-metric-collector/internal/ccLogger" lp "github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric" "golang.org/x/sys/unix" ) @@ -200,12 +200,16 @@ func (m *CPUFreqCollector) Read(interval time.Duration, output chan lp.CCMetric) // Read current frequency line, ok := readOneLine(t.scalingCurFreqFile) if !ok { - log.Printf("CPUFreqCollector.Read(): Failed to read one line from file '%s'", t.scalingCurFreqFile) + cclog.ComponentError( + m.name, + fmt.Sprintf("Read(): Failed to read one line from file '%s'", t.scalingCurFreqFile)) continue } cpuFreq, err := strconv.Atoi(line) if err != nil { - log.Printf("CPUFreqCollector.Read(): Failed to convert CPU frequency '%s': %v", line, err) + cclog.ComponentError( + m.name, + fmt.Sprintf("Read(): Failed to convert CPU frequency '%s': %v", line, err)) continue } diff --git a/collectors/gpfsMetric.go b/collectors/gpfsMetric.go index 53db1c2..8055d4c 100644 --- a/collectors/gpfsMetric.go +++ b/collectors/gpfsMetric.go @@ -7,13 +7,13 @@ import ( "fmt" "io/ioutil" "log" - "os" "os/exec" "os/user" "strconv" "strings" "time" + cclog "github.com/ClusterCockpit/cc-metric-collector/internal/ccLogger" lp "github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric" ) @@ -86,12 +86,15 @@ func (m *GpfsCollector) Read(interval time.Duration, output chan lp.CCMetric) { cmd.Stderr = cmdStderr err := cmd.Run() if err != nil { - fmt.Fprintf(os.Stderr, "GpfsCollector.Read(): Failed to execute command \"%s\": %s\n", cmd.String(), err.Error()) - fmt.Fprintf(os.Stderr, "GpfsCollector.Read(): command exit code: \"%d\"\n", cmd.ProcessState.ExitCode()) - data, _ := ioutil.ReadAll(cmdStderr) - fmt.Fprintf(os.Stderr, "GpfsCollector.Read(): command stderr: \"%s\"\n", string(data)) - data, _ = ioutil.ReadAll(cmdStdout) - fmt.Fprintf(os.Stderr, "GpfsCollector.Read(): command stdout: \"%s\"\n", string(data)) + dataStdErr, _ := ioutil.ReadAll(cmdStderr) + dataStdOut, _ := ioutil.ReadAll(cmdStdout) + 
cclog.ComponentError( + m.name, + fmt.Sprintf("Read(): Failed to execute command \"%s\": %v\n", cmd.String(), err), + fmt.Sprintf("Read(): command exit code: \"%d\"\n", cmd.ProcessState.ExitCode()), + fmt.Sprintf("Read(): command stderr: \"%s\"\n", string(dataStdErr)), + fmt.Sprintf("Read(): command stdout: \"%s\"\n", string(dataStdOut)), + ) return } @@ -113,7 +116,9 @@ func (m *GpfsCollector) Read(interval time.Duration, output chan lp.CCMetric) { filesystem, ok := key_value["_fs_"] if !ok { - fmt.Fprintf(os.Stderr, "GpfsCollector.Read(): Failed to get filesystem name.\n") + cclog.ComponentError( + m.name, + "Read(): Failed to get filesystem name.") continue } @@ -122,26 +127,30 @@ func (m *GpfsCollector) Read(interval time.Duration, output chan lp.CCMetric) { // return code rc, err := strconv.Atoi(key_value["_rc_"]) if err != nil { - fmt.Fprintf(os.Stderr, "GpfsCollector.Read(): Failed to convert return code: %s\n", err.Error()) + cclog.ComponentError( + m.name, + fmt.Sprintf("Read(): Failed to convert return code '%s' to int: %v", key_value["_rc_"], err)) continue } if rc != 0 { - fmt.Fprintf(os.Stderr, "GpfsCollector.Read(): Filesystem %s not ok.", filesystem) + cclog.ComponentError( + m.name, + fmt.Sprintf("Read(): Filesystem %s not ok.", filesystem)) continue } sec, err := strconv.ParseInt(key_value["_t_"], 10, 64) if err != nil { - fmt.Fprintf(os.Stderr, - "GpfsCollector.Read(): Failed to convert seconds to int '%s': %v\n", - key_value["_t_"], err) + cclog.ComponentError( + m.name, + fmt.Sprintf("Read(): Failed to convert seconds '%s' to int: %v", key_value["_t_"], err)) continue } msec, err := strconv.ParseInt(key_value["_tu_"], 10, 64) if err != nil { - fmt.Fprintf(os.Stderr, - "GpfsCollector.Read(): Failed to convert micro seconds to int '%s': %v\n", - key_value["_tu_"], err) + cclog.ComponentError( + m.name, + fmt.Sprintf("Read(): Failed to convert micro seconds '%s' to int: %v", key_value["_tu_"], err)) continue } timestamp := time.Unix(sec, msec*1000) 
@@ -149,9 +158,9 @@ func (m *GpfsCollector) Read(interval time.Duration, output chan lp.CCMetric) { // bytes read bytesRead, err := strconv.ParseInt(key_value["_br_"], 10, 64) if err != nil { - fmt.Fprintf(os.Stderr, - "GpfsCollector.Read(): Failed to convert bytes read '%s': %s\n", - key_value["_br_"], err.Error()) + cclog.ComponentError( + m.name, + fmt.Sprintf("Read(): Failed to convert bytes read '%s' to int: %v", key_value["_br_"], err)) continue } @@ -163,9 +172,9 @@ func (m *GpfsCollector) Read(interval time.Duration, output chan lp.CCMetric) { // bytes written bytesWritten, err := strconv.ParseInt(key_value["_bw_"], 10, 64) if err != nil { - fmt.Fprintf(os.Stderr, - "GpfsCollector.Read(): Failed to convert bytes written '%s': %s\n", - key_value["_bw_"], err.Error()) + cclog.ComponentError( + m.name, + fmt.Sprintf("Read(): Failed to convert bytes written '%s' to int: %v", key_value["_bw_"], err)) continue } @@ -177,9 +186,9 @@ func (m *GpfsCollector) Read(interval time.Duration, output chan lp.CCMetric) { // number of opens numOpens, err := strconv.ParseInt(key_value["_oc_"], 10, 64) if err != nil { - fmt.Fprintf(os.Stderr, - "GpfsCollector.Read(): Failed to convert number of opens '%s': %s\n", - key_value["_oc_"], err.Error()) + cclog.ComponentError( + m.name, + fmt.Sprintf("Read(): Failed to convert number of opens '%s' to int: %v", key_value["_oc_"], err)) continue } y, err = lp.New("gpfs_num_opens", m.tags, m.meta, map[string]interface{}{"value": numOpens}, timestamp) @@ -190,7 +199,9 @@ func (m *GpfsCollector) Read(interval time.Duration, output chan lp.CCMetric) { // number of closes numCloses, err := strconv.ParseInt(key_value["_cc_"], 10, 64) if err != nil { - fmt.Fprintf(os.Stderr, "GpfsCollector.Read(): Failed to convert number of closes: %s\n", err.Error()) + cclog.ComponentError( + m.name, + fmt.Sprintf("Read(): Failed to convert number of closes: '%s' to int: %v", key_value["_cc_"], err)) continue } y, err = lp.New("gpfs_num_closes", m.tags, 
m.meta, map[string]interface{}{"value": numCloses}, timestamp) @@ -201,7 +212,9 @@ func (m *GpfsCollector) Read(interval time.Duration, output chan lp.CCMetric) { // number of reads numReads, err := strconv.ParseInt(key_value["_rdc_"], 10, 64) if err != nil { - fmt.Fprintf(os.Stderr, "GpfsCollector.Read(): Failed to convert number of reads: %s\n", err.Error()) + cclog.ComponentError( + m.name, + fmt.Sprintf("Read(): Failed to convert number of reads: '%s' to int: %v", key_value["_rdc_"], err)) continue } y, err = lp.New("gpfs_num_reads", m.tags, m.meta, map[string]interface{}{"value": numReads}, timestamp) @@ -212,7 +225,9 @@ func (m *GpfsCollector) Read(interval time.Duration, output chan lp.CCMetric) { // number of writes numWrites, err := strconv.ParseInt(key_value["_wc_"], 10, 64) if err != nil { - fmt.Fprintf(os.Stderr, "GpfsCollector.Read(): Failed to convert number of writes: %s\n", err.Error()) + cclog.ComponentError( + m.name, + fmt.Sprintf("Read(): Failed to convert number of writes: '%s' to int: %v", key_value["_wc_"], err)) continue } y, err = lp.New("gpfs_num_writes", m.tags, m.meta, map[string]interface{}{"value": numWrites}, timestamp) @@ -223,7 +238,9 @@ func (m *GpfsCollector) Read(interval time.Duration, output chan lp.CCMetric) { // number of read directories numReaddirs, err := strconv.ParseInt(key_value["_dir_"], 10, 64) if err != nil { - fmt.Fprintf(os.Stderr, "GpfsCollector.Read(): Failed to convert number of read directories: %s\n", err.Error()) + cclog.ComponentError( + m.name, + fmt.Sprintf("Read(): Failed to convert number of read directories: '%s' to int: %v", key_value["_dir_"], err)) continue } y, err = lp.New("gpfs_num_readdirs", m.tags, m.meta, map[string]interface{}{"value": numReaddirs}, timestamp) @@ -234,7 +251,9 @@ func (m *GpfsCollector) Read(interval time.Duration, output chan lp.CCMetric) { // Number of inode updates numInodeUpdates, err := strconv.ParseInt(key_value["_iu_"], 10, 64) if err != nil { - fmt.Fprintf(os.Stderr, 
"GpfsCollector.Read(): Failed to convert Number of inode updates: %s\n", err.Error()) + cclog.ComponentError( + m.name, + fmt.Sprintf("Read(): Failed to convert number of inode updates: '%s' to int: %v", key_value["_iu_"], err)) continue } y, err = lp.New("gpfs_num_inode_updates", m.tags, m.meta, map[string]interface{}{"value": numInodeUpdates}, timestamp) diff --git a/collectors/numastatsMetric.go b/collectors/numastatsMetric.go index e6f31a2..08bd51b 100644 --- a/collectors/numastatsMetric.go +++ b/collectors/numastatsMetric.go @@ -4,13 +4,13 @@ import ( "bufio" "encoding/json" "fmt" - "log" "os" "path/filepath" "strconv" "strings" "time" + cclog "github.com/ClusterCockpit/cc-metric-collector/internal/ccLogger" lp "github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric" ) @@ -65,10 +65,10 @@ func (m *NUMAStatsCollector) Init(config json.RawMessage) error { globPattern := filepath.Join(baseDir, "node[0-9]*") dirs, err := filepath.Glob(globPattern) if err != nil { - return fmt.Errorf("unable to glob files with pattern %s", globPattern) + return fmt.Errorf("unable to glob files with pattern '%s'", globPattern) } if dirs == nil { - return fmt.Errorf("unable to find any files with pattern %s", globPattern) + return fmt.Errorf("unable to find any files with pattern '%s'", globPattern) } m.topology = make([]NUMAStatsCollectorTopolgy, 0, len(dirs)) for _, dir := range dirs { @@ -97,6 +97,9 @@ func (m *NUMAStatsCollector) Read(interval time.Duration, output chan lp.CCMetri now := time.Now() file, err := os.Open(t.file) if err != nil { + cclog.ComponentError( + m.name, + fmt.Sprintf("Read(): Failed to open file '%s': %v", t.file, err)) return } scanner := bufio.NewScanner(file) @@ -108,7 +111,9 @@ func (m *NUMAStatsCollector) Read(interval time.Duration, output chan lp.CCMetri key := split[0] value, err := strconv.ParseInt(split[1], 10, 64) if err != nil { - log.Printf("failed to convert %s='%s' to int64: %v", key, split[1], err) + cclog.ComponentError( + m.name, 
+ fmt.Sprintf("Read(): Failed to convert %s='%s' to int64: %v", key, split[1], err)) continue } y, err := lp.New("numastats_"+key, t.tagSet, m.meta, map[string]interface{}{"value": value}, now) From f719f1915c483145e7898bd32bb520560051b3ec Mon Sep 17 00:00:00 2001 From: Holger Obermaier <40787752+ho-ob@users.noreply.github.com> Date: Fri, 4 Feb 2022 16:11:56 +0100 Subject: [PATCH 084/174] Add error handling --- collectors/loadavgMetric.go | 71 +++++++++++++++++++++++++++---------- 1 file changed, 53 insertions(+), 18 deletions(-) diff --git a/collectors/loadavgMetric.go b/collectors/loadavgMetric.go index 11c0e5e..a337d9b 100644 --- a/collectors/loadavgMetric.go +++ b/collectors/loadavgMetric.go @@ -2,14 +2,25 @@ package collectors import ( "encoding/json" + "fmt" "io/ioutil" "strconv" "strings" "time" + + cclog "github.com/ClusterCockpit/cc-metric-collector/internal/ccLogger" lp "github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric" ) -const LOADAVGFILE = `/proc/loadavg` +// +// LoadavgCollector collects: +// * load average of last 1, 5 & 15 minutes +// * number of processes currently runnable +// * total number of processes in system +// +// See: https://www.kernel.org/doc/html/latest/filesystems/proc.html +// +const LOADAVGFILE = "/proc/loadavg" type LoadavgCollectorConfig struct { ExcludeMetrics []string `json:"exclude_metrics,omitempty"` @@ -32,10 +43,17 @@ func (m *LoadavgCollector) Init(config json.RawMessage) error { return err } } - m.meta = map[string]string{"source": m.name, "group": "LOAD"} + m.meta = map[string]string{ + "source": m.name, + "group": "LOAD"} m.tags = map[string]string{"type": "node"} - m.load_matches = []string{"load_one", "load_five", "load_fifteen"} - m.proc_matches = []string{"proc_run", "proc_total"} + m.load_matches = []string{ + "load_one", + "load_five", + "load_fifteen"} + m.proc_matches = []string{ + "proc_run", + "proc_total"} m.init = true return nil } @@ -45,33 +63,50 @@ func (m *LoadavgCollector) Read(interval 
time.Duration, output chan lp.CCMetric) if !m.init { return } - buffer, err := ioutil.ReadFile(string(LOADAVGFILE)) - + buffer, err := ioutil.ReadFile(LOADAVGFILE) if err != nil { + if err != nil { + cclog.ComponentError( + m.name, + fmt.Sprintf("Read(): Failed to read file '%s': %v", LOADAVGFILE, err)) + } return } + now := time.Now() + // Load metrics ls := strings.Split(string(buffer), ` `) for i, name := range m.load_matches { x, err := strconv.ParseFloat(ls[i], 64) - if err == nil { - _, skip = stringArrayContains(m.config.ExcludeMetrics, name) - y, err := lp.New(name, m.tags, m.meta, map[string]interface{}{"value": float64(x)}, time.Now()) - if err == nil && !skip { - output <- y - } + if err != nil { + cclog.ComponentError( + m.name, + fmt.Sprintf("Read(): Failed to convert '%s' to float64: %v", ls[i], err)) + continue + } + _, skip = stringArrayContains(m.config.ExcludeMetrics, name) + y, err := lp.New(name, m.tags, m.meta, map[string]interface{}{"value": x}, now) + if err == nil && !skip { + output <- y } } + + // Process metrics lv := strings.Split(ls[3], `/`) for i, name := range m.proc_matches { x, err := strconv.ParseFloat(lv[i], 64) - if err == nil { - _, skip = stringArrayContains(m.config.ExcludeMetrics, name) - y, err := lp.New(name, m.tags, m.meta, map[string]interface{}{"value": float64(x)}, time.Now()) - if err == nil && !skip { - output <- y - } + if err != nil { + cclog.ComponentError( + m.name, + fmt.Sprintf("Read(): Failed to convert '%s' to float64: %v", lv[i], err)) + continue } + _, skip = stringArrayContains(m.config.ExcludeMetrics, name) + y, err := lp.New(name, m.tags, m.meta, map[string]interface{}{"value": x}, now) + if err == nil && !skip { + output <- y + } + } } From fdb58b0be27c98e61ab2cdb67f3ec88f4700e66b Mon Sep 17 00:00:00 2001 From: Thomas Gruber Date: Fri, 4 Feb 2022 18:12:24 +0100 Subject: [PATCH 085/174] Sink specific configuration maps (#25) * Use sink-specific configurations to have more flexibility. 
Adjust sample sink configuration files * Add documentation * Add links to individual sink readmes * Fix link in README * HTTPS for HttpSink * If no CPU die id available, use the socket id instead --- .github/ci-sinks.json | 6 +- internal/ccTopology/ccTopology.go | 5 +- sinks.json | 6 +- sinks/README.md | 126 ++++++++++++++---------------- sinks/gangliaSink.go | 65 +++++++++++++-- sinks/gangliaSink.md | 21 +++++ sinks/httpSink.go | 29 +++++-- sinks/httpSink.md | 27 +++++++ sinks/influxSink.go | 65 ++++++++------- sinks/influxSink.md | 32 ++++++++ sinks/metricSink.go | 25 ++---- sinks/natsSink.go | 65 ++++++++++----- sinks/natsSink.md | 28 +++++++ sinks/sinkManager.go | 50 ++++++------ sinks/stdoutSink.go | 44 +++++++++-- sinks/stdoutSink.md | 22 ++++++ 16 files changed, 431 insertions(+), 185 deletions(-) create mode 100644 sinks/gangliaSink.md create mode 100644 sinks/httpSink.md create mode 100644 sinks/influxSink.md create mode 100644 sinks/natsSink.md create mode 100644 sinks/stdoutSink.md diff --git a/.github/ci-sinks.json b/.github/ci-sinks.json index d304018..aa8ae80 100644 --- a/.github/ci-sinks.json +++ b/.github/ci-sinks.json @@ -1,6 +1,6 @@ -[ - { +{ + "testoutput" : { "type" : "stdout", "meta_as_tags" : true } -] +} diff --git a/internal/ccTopology/ccTopology.go b/internal/ccTopology/ccTopology.go index 030b2f7..6d8bfae 100644 --- a/internal/ccTopology/ccTopology.go +++ b/internal/ccTopology/ccTopology.go @@ -168,7 +168,7 @@ func CpuData() []CpuEntry { buffer, err := ioutil.ReadFile(path) if err != nil { log.Print(err) - cclogger.ComponentError("ccTopology", "Reading", path, ":", err.Error()) + //cclogger.ComponentError("ccTopology", "Reading", path, ":", err.Error()) return -1 } sbuffer := strings.Replace(string(buffer), "\n", "", -1) @@ -254,6 +254,9 @@ func CpuData() []CpuEntry { // Lookup CPU die id centry.Die = getDie(base) + if centry.Die < 0 { + centry.Die = centry.Socket + } // Lookup SMT thread id centry.SMT = getSMT(centry.Cpuid, base) diff 
--git a/sinks.json b/sinks.json index d304018..2fdae5a 100644 --- a/sinks.json +++ b/sinks.json @@ -1,6 +1,6 @@ -[ - { +{ + "mystdout" : { "type" : "stdout", "meta_as_tags" : true } -] +} diff --git a/sinks/README.md b/sinks/README.md index 8fac8e5..1690df9 100644 --- a/sinks/README.md +++ b/sinks/README.md @@ -2,17 +2,24 @@ This folder contains the SinkManager and sink implementations for the cc-metric-collector. +# Available sinks: +- [`stdout`](./stdoutSink.md): Print all metrics to `stdout`, `stderr` or a file +- [`http`](./httpSink.md): Send metrics to an HTTP server as POST requests +- [`influxdb`](./influxSink.md): Send metrics to an [InfluxDB](https://www.influxdata.com/products/influxdb/) database +- [`nats`](./natsSink.md): Publish metrics to the [NATS](https://nats.io/) network overlay system +- [`ganglia`](./gangliaSink.md): Publish metrics in the [Ganglia Monitoring System](http://ganglia.info/) + # Configuration The configuration file for the sinks is a list of configurations. The `type` field in each specifies which sink to initialize. ```json [ - { + "mystdout" : { "type" : "stdout", "meta_as_tags" : false }, - { + "metricstore" : { "type" : "http", "host" : "localhost", "port" : "4123", @@ -22,74 +29,12 @@ The configuration file for the sinks is a list of configurations. The `type` fie ] ``` -This example initializes two sinks, the `stdout` sink printing all metrics to the STDOUT and the `http` sink with the given `host`, `port`, `database` and `password`. - -If `meta_as_tags` is set, all meta information attached to CCMetric are printed out as tags. - -## Type `stdout` - -```json -{ - "type" : "stdout", - "meta_as_tags" : -} -``` - -The `stdout` sink dumps all metrics to the STDOUT. - -## Type `http` - -```json -{ - "type" : "http", - "host" : "", - "port" : "", - "database" : "", - "password" : "", - "meta_as_tags" : -} -``` -The sink uses POST requests to send metrics to `http://:/` using the JWT token as a JWT in the 'Authorization' header. 
- -## Type `nats` - -```json -{ - "type" : "nats", - "host" : "", - "port" : "", - "user" : "", - "password" : "", - "database" : "" - "meta_as_tags" : -} -``` - -This sink publishes the CCMetric in a NATS environment using `host`, `port`, `user` and `password` for connecting. The metrics are published using the topic `database`. - -## Type `influxdb` - -```json -{ - "type" : "influxdb", - "host" : "", - "port" : "", - "user" : "", - "password" : "", - "database" : "" - "organization": "", - "ssl" : , - "meta_as_tags" : -} -``` - -This sink submits the CCMetrics to an InfluxDB time-series database. It uses `host`, `port` and `ssl` for connecting. For authentification, it uses either `user:password` if `user` is set and only `password` as API key. The `organization` and `database` are used for writing to the correct database. # Contributing own sinks -A sink contains three functions and is derived from the type `Sink`: -* `Init(config SinkConfig) error` +A sink contains four functions and is derived from the type `sink`: +* `Init(config json.RawMessage) error` * `Write(point CCMetric) error` * `Flush() error` * `Close()` @@ -97,3 +42,52 @@ A sink contains three functions and is derived from the type `Sink`: The data structures should be set up in `Init()` like opening a file or server connection. The `Write()` function writes/sends the data. For non-blocking sinks, the `Flush()` method tells the sink to drain its internal buffers. The `Close()` function should tear down anything created in `Init()`. Finally, the sink needs to be registered in the `sinkManager.go`. There is a list of sinks called `AvailableSinks` which is a map (`sink_type_string` -> `pointer to sink interface`). Add a new entry with a descriptive name and the new sink. 
+ +## Sample sink + +```go +package sinks + +import ( + "encoding/json" + "log" + lp "github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric" +) + +type SampleSinkConfig struct { + defaultSinkConfig // defines JSON tags for 'name' and 'meta_as_tags' +} + +type SampleSink struct { + sink // declarate 'name' and 'meta_as_tags' + config StdoutSinkConfig // entry point to the SampleSinkConfig +} + +// Initialize the sink by giving it a name and reading in the config JSON +func (s *SampleSink) Init(config json.RawMessage) error { + s.name = "SampleSink" // Always specify a name here + // Read in the config JSON + if len(config) > 0 { + err := json.Unmarshal(config, &s.config) + if err != nil { + return err + } + } + return nil +} + +// Code to submit a single CCMetric to the sink +func (s *SampleSink) Write(point lp.CCMetric) error { + log.Print(point) + return nil +} + +// If the sink uses batched sends internally, you can tell to flush its buffers +func (s *SampleSink) Flush() error { + return nil +} + + +// Close sink: close network connection, close files, close libraries, ... 
+func (s *SampleSink) Close() {} +``` \ No newline at end of file diff --git a/sinks/gangliaSink.go b/sinks/gangliaSink.go index 3fd48e7..989e537 100644 --- a/sinks/gangliaSink.go +++ b/sinks/gangliaSink.go @@ -1,6 +1,8 @@ package sinks import ( + "encoding/json" + "errors" "fmt" "log" "strings" @@ -8,20 +10,49 @@ import ( // "time" "os/exec" + cclog "github.com/ClusterCockpit/cc-metric-collector/internal/ccLogger" lp "github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric" ) const GMETRIC_EXEC = `gmetric` -type GangliaSink struct { - Sink - gmetric_path string +type GangliaSinkConfig struct { + defaultSinkConfig + GmetricPath string `json:"gmetric_path"` + AddGangliaGroup bool `json:"add_ganglia_group"` } -func (s *GangliaSink) Init(config sinkConfig) error { - p, err := exec.LookPath(string(GMETRIC_EXEC)) - if err == nil { - s.gmetric_path = p +type GangliaSink struct { + sink + gmetric_path string + config GangliaSinkConfig +} + +func (s *GangliaSink) Init(config json.RawMessage) error { + var err error = nil + s.name = "GangliaSink" + if len(config) > 0 { + err := json.Unmarshal(config, &s.config) + if err != nil { + cclog.ComponentError(s.name, "Error reading config for", s.name, ":", err.Error()) + return err + } + } + s.gmetric_path = "" + if len(s.config.GmetricPath) > 0 { + p, err := exec.LookPath(s.config.GmetricPath) + if err == nil { + s.gmetric_path = p + } + } + if len(s.gmetric_path) == 0 { + p, err := exec.LookPath(string(GMETRIC_EXEC)) + if err == nil { + s.gmetric_path = p + } + } + if len(s.gmetric_path) == 0 { + err = errors.New("cannot find executable 'gmetric'") } return err } @@ -37,11 +68,29 @@ func (s *GangliaSink) Write(point lp.CCMetric) error { case "unit": argstr = append(argstr, fmt.Sprintf("--units=%s", value)) case "group": - argstr = append(argstr, fmt.Sprintf("--group=%s", value)) + if s.config.AddGangliaGroup { + argstr = append(argstr, fmt.Sprintf("--group=%s", value)) + } default: tagsstr = append(tagsstr, 
fmt.Sprintf("%s=%s", key, value)) } } + if s.config.MetaAsTags { + for key, value := range point.Meta() { + switch key { + case "cluster": + argstr = append(argstr, fmt.Sprintf("--cluster=%s", value)) + case "unit": + argstr = append(argstr, fmt.Sprintf("--units=%s", value)) + case "group": + if s.config.AddGangliaGroup { + argstr = append(argstr, fmt.Sprintf("--group=%s", value)) + } + default: + tagsstr = append(tagsstr, fmt.Sprintf("%s=%s", key, value)) + } + } + } if len(tagsstr) > 0 { argstr = append(argstr, fmt.Sprintf("--desc=%q", strings.Join(tagsstr, ","))) } diff --git a/sinks/gangliaSink.md b/sinks/gangliaSink.md new file mode 100644 index 0000000..9b77ac9 --- /dev/null +++ b/sinks/gangliaSink.md @@ -0,0 +1,21 @@ +## `ganglia` sink + +The `ganglia` sink uses the `gmetric` tool of the [Ganglia Monitoring System](http://ganglia.info/) to submit the metrics + +### Configuration structure + +```json +{ + "": { + "type": "ganglia", + "meta_as_tags" : true, + "gmetric_path" : "/path/to/gmetric", + "add_ganglia_group" : true + } +} +``` + +- `type`: makes the sink an `ganglia` sink +- `meta_as_tags`: print all meta information as tags in the output (optional) +- `gmetric_path`: Path to `gmetric` executable (optional). If not given, the sink searches in `$PATH` for `gmetric`. +- `add_ganglia_group`: Add `--group=X` based on meta information to the `gmetric` call. Some old versions of `gmetric` do not support the `--group` option. 
\ No newline at end of file diff --git a/sinks/httpSink.go b/sinks/httpSink.go index a703f82..3080faa 100644 --- a/sinks/httpSink.go +++ b/sinks/httpSink.go @@ -2,6 +2,7 @@ package sinks import ( "bytes" + "encoding/json" "errors" "fmt" "net/http" @@ -11,28 +12,44 @@ import ( influx "github.com/influxdata/line-protocol" ) +type HttpSinkConfig struct { + defaultSinkConfig + Host string `json:"host,omitempty"` + Port string `json:"port,omitempty"` + Database string `json:"database,omitempty"` + JWT string `json:"jwt,omitempty"` + SSL bool `json:"ssl,omitempty"` +} + type HttpSink struct { sink client *http.Client url, jwt string encoder *influx.Encoder buffer *bytes.Buffer + config HttpSinkConfig } -func (s *HttpSink) Init(config sinkConfig) error { +func (s *HttpSink) Init(config json.RawMessage) error { s.name = "HttpSink" - if len(config.Host) == 0 || len(config.Port) == 0 || len(config.Database) == 0 { + s.config.SSL = false + if len(config) > 0 { + err := json.Unmarshal(config, &s.config) + if err != nil { + return err + } + } + if len(s.config.Host) == 0 || len(s.config.Port) == 0 || len(s.config.Database) == 0 { return errors.New("`host`, `port` and `database` config options required for TCP sink") } s.client = &http.Client{} proto := "http" - if config.SSL { + if s.config.SSL { proto = "https" } - s.url = fmt.Sprintf("%s://%s:%s/%s", proto, config.Host, config.Port, config.Database) - s.port = config.Port - s.jwt = config.Password + s.url = fmt.Sprintf("%s://%s:%s/%s", proto, s.config.Host, s.config.Port, s.config.Database) + s.jwt = s.config.JWT s.buffer = &bytes.Buffer{} s.encoder = influx.NewEncoder(s.buffer) s.encoder.SetPrecision(time.Second) diff --git a/sinks/httpSink.md b/sinks/httpSink.md new file mode 100644 index 0000000..5440a82 --- /dev/null +++ b/sinks/httpSink.md @@ -0,0 +1,27 @@ +## `http` sink + +The `http` sink uses POST requests to a HTTP server to submit the metrics in the InfluxDB line-protocol format. 
It uses JSON web tokens for authentification. The sink creates batches of metrics before sending, to reduce the HTTP traffic. + +### Configuration structure + +```json +{ + "": { + "type": "http", + "meta_as_tags" : true, + "database" : "mymetrics", + "host": "dbhost.example.com", + "port": "4222", + "jwt" : "0x0000q231", + "ssl" : false + } +} +``` + +- `type`: makes the sink an `http` sink +- `meta_as_tags`: print all meta information as tags in the output (optional) +- `database`: All metrics are written to this bucket +- `host`: Hostname of the InfluxDB database server +- `port`: Portnumber (as string) of the InfluxDB database server +- `jwt`: JSON web tokens for authentification +- `ssl`: Activate SSL encryption \ No newline at end of file diff --git a/sinks/influxSink.go b/sinks/influxSink.go index 7313490..bb35349 100644 --- a/sinks/influxSink.go +++ b/sinks/influxSink.go @@ -3,6 +3,7 @@ package sinks import ( "context" "crypto/tls" + "encoding/json" "errors" "fmt" "log" @@ -12,50 +13,60 @@ import ( influxdb2Api "github.com/influxdata/influxdb-client-go/v2/api" ) +type InfluxSinkConfig struct { + defaultSinkConfig + Host string `json:"host,omitempty"` + Port string `json:"port,omitempty"` + Database string `json:"database,omitempty"` + User string `json:"user,omitempty"` + Password string `json:"password,omitempty"` + Organization string `json:"organization,omitempty"` + SSL bool `json:"ssl,omitempty"` + RetentionPol string `json:"retention_policy,omitempty"` +} + type InfluxSink struct { sink - client influxdb2.Client - writeApi influxdb2Api.WriteAPIBlocking - retPolicy string + client influxdb2.Client + writeApi influxdb2Api.WriteAPIBlocking + config InfluxSinkConfig } func (s *InfluxSink) connect() error { var auth string var uri string - if s.ssl { - uri = fmt.Sprintf("https://%s:%s", s.host, s.port) + if s.config.SSL { + uri = fmt.Sprintf("https://%s:%s", s.config.Host, s.config.Port) } else { - uri = fmt.Sprintf("http://%s:%s", s.host, s.port) + uri = 
fmt.Sprintf("http://%s:%s", s.config.Host, s.config.Port) } - if len(s.user) == 0 { - auth = s.password + if len(s.config.User) == 0 { + auth = s.config.Password } else { - auth = fmt.Sprintf("%s:%s", s.user, s.password) + auth = fmt.Sprintf("%s:%s", s.config.User, s.config.Password) } - log.Print("Using URI ", uri, " Org ", s.organization, " Bucket ", s.database) + log.Print("Using URI ", uri, " Org ", s.config.Organization, " Bucket ", s.config.Database) s.client = influxdb2.NewClientWithOptions(uri, auth, influxdb2.DefaultOptions().SetTLSConfig(&tls.Config{InsecureSkipVerify: true})) - s.writeApi = s.client.WriteAPIBlocking(s.organization, s.database) + s.writeApi = s.client.WriteAPIBlocking(s.config.Organization, s.config.Database) return nil } -func (s *InfluxSink) Init(config sinkConfig) error { +func (s *InfluxSink) Init(config json.RawMessage) error { s.name = "InfluxSink" - if len(config.Host) == 0 || - len(config.Port) == 0 || - len(config.Database) == 0 || - len(config.Organization) == 0 || - len(config.Password) == 0 { - return errors.New("Not all configuration variables set required by InfluxSink") + if len(config) > 0 { + err := json.Unmarshal(config, &s.config) + if err != nil { + return err + } + } + if len(s.config.Host) == 0 || + len(s.config.Port) == 0 || + len(s.config.Database) == 0 || + len(s.config.Organization) == 0 || + len(s.config.Password) == 0 { + return errors.New("not all configuration variables set required by InfluxSink") } - s.host = config.Host - s.port = config.Port - s.database = config.Database - s.organization = config.Organization - s.user = config.User - s.password = config.Password - s.ssl = config.SSL - s.meta_as_tags = config.MetaAsTags return s.connect() } @@ -65,7 +76,7 @@ func (s *InfluxSink) Write(point lp.CCMetric) error { for key, value := range point.Tags() { tags[key] = value } - if s.meta_as_tags { + if s.config.MetaAsTags { for key, value := range point.Meta() { tags[key] = value } diff --git 
a/sinks/influxSink.md b/sinks/influxSink.md new file mode 100644 index 0000000..2624034 --- /dev/null +++ b/sinks/influxSink.md @@ -0,0 +1,32 @@ +## `influxdb` sink + +The `influxdb` sink uses the official [InfluxDB golang client](https://pkg.go.dev/github.com/influxdata/influxdb-client-go/v2) to write the metrics to an InfluxDB database. It provides only support for V2 write endpoints (InfluxDB 1.8.0 or later). + + +### Configuration structure + +```json +{ + "": { + "type": "influxdb", + "meta_as_tags" : true, + "database" : "mymetrics", + "host": "dbhost.example.com", + "port": "4222", + "user": "exampleuser", + "password" : "examplepw", + "organization": "myorg", + "ssl": true, + } +} +``` + +- `type`: makes the sink an `influxdb` sink +- `meta_as_tags`: print all meta information as tags in the output (optional) +- `database`: All metrics are written to this bucket +- `host`: Hostname of the InfluxDB database server +- `port`: Portnumber (as string) of the InfluxDB database server +- `user`: Username for basic authentification +- `password`: Password for basic authentification +- `organization`: Organization in the InfluxDB +- `ssl`: Use SSL connection \ No newline at end of file diff --git a/sinks/metricSink.go b/sinks/metricSink.go index 25f66bb..d76f5f2 100644 --- a/sinks/metricSink.go +++ b/sinks/metricSink.go @@ -1,36 +1,23 @@ package sinks import ( - // "time" + "encoding/json" + lp "github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric" ) -type sinkConfig struct { - Type string `json:"type"` - Host string `json:"host,omitempty"` - Port string `json:"port,omitempty"` - Database string `json:"database,omitempty"` - User string `json:"user,omitempty"` - Password string `json:"password,omitempty"` - Organization string `json:"organization,omitempty"` - SSL bool `json:"ssl,omitempty"` - MetaAsTags bool `json:"meta_as_tags,omitempty"` +type defaultSinkConfig struct { + MetaAsTags bool `json:"meta_as_tags,omitempty"` + Type string `json:"type"` } 
type sink struct { - host string - port string - user string - password string - database string - organization string - ssl bool meta_as_tags bool name string } type Sink interface { - Init(config sinkConfig) error + Init(config json.RawMessage) error Write(point lp.CCMetric) error Flush() error Close() diff --git a/sinks/natsSink.go b/sinks/natsSink.go index f9cd7eb..37e8c2b 100644 --- a/sinks/natsSink.go +++ b/sinks/natsSink.go @@ -2,49 +2,71 @@ package sinks import ( "bytes" + "encoding/json" "errors" "fmt" + "time" + + cclog "github.com/ClusterCockpit/cc-metric-collector/internal/ccLogger" lp "github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric" influx "github.com/influxdata/line-protocol" nats "github.com/nats-io/nats.go" - "log" - "time" ) +type NatsSinkConfig struct { + defaultSinkConfig + Host string `json:"host,omitempty"` + Port string `json:"port,omitempty"` + Database string `json:"database,omitempty"` + User string `json:"user,omitempty"` + Password string `json:"password,omitempty"` +} + type NatsSink struct { sink client *nats.Conn encoder *influx.Encoder buffer *bytes.Buffer + config NatsSinkConfig } func (s *NatsSink) connect() error { - uinfo := nats.UserInfo(s.user, s.password) - uri := fmt.Sprintf("nats://%s:%s", s.host, s.port) - log.Print("Using URI ", uri) + var err error + var uinfo nats.Option = nil + var nc *nats.Conn + if len(s.config.User) > 0 && len(s.config.Password) > 0 { + uinfo = nats.UserInfo(s.config.User, s.config.Password) + } + uri := fmt.Sprintf("nats://%s:%s", s.config.Host, s.config.Port) + cclog.ComponentDebug(s.name, "Connect to", uri) s.client = nil - nc, err := nats.Connect(uri, uinfo) + if uinfo != nil { + nc, err = nats.Connect(uri, uinfo) + } else { + nc, err = nats.Connect(uri) + } if err != nil { - log.Fatal(err) + cclog.ComponentError(s.name, "Connect to", uri, "failed:", err.Error()) return err } s.client = nc return nil } -func (s *NatsSink) Init(config sinkConfig) error { +func (s *NatsSink) 
Init(config json.RawMessage) error { s.name = "NatsSink" - if len(config.Host) == 0 || - len(config.Port) == 0 || - len(config.Database) == 0 { - return errors.New("Not all configuration variables set required by NatsSink") + if len(config) > 0 { + err := json.Unmarshal(config, &s.config) + if err != nil { + cclog.ComponentError(s.name, "Error reading config for", s.name, ":", err.Error()) + return err + } + } + if len(s.config.Host) == 0 || + len(s.config.Port) == 0 || + len(s.config.Database) == 0 { + return errors.New("not all configuration variables set required by NatsSink") } - s.host = config.Host - s.port = config.Port - s.database = config.Database - s.organization = config.Organization - s.user = config.User - s.password = config.Password // Setup Influx line protocol s.buffer = &bytes.Buffer{} s.buffer.Grow(1025) @@ -59,7 +81,7 @@ func (s *NatsSink) Write(point lp.CCMetric) error { if s.client != nil { _, err := s.encoder.Encode(point) if err != nil { - log.Print(err) + cclog.ComponentError(s.name, "Write:", err.Error()) return err } } @@ -68,7 +90,8 @@ func (s *NatsSink) Write(point lp.CCMetric) error { func (s *NatsSink) Flush() error { if s.client != nil { - if err := s.client.Publish(s.database, s.buffer.Bytes()); err != nil { + if err := s.client.Publish(s.config.Database, s.buffer.Bytes()); err != nil { + cclog.ComponentError(s.name, "Flush:", err.Error()) return err } s.buffer.Reset() @@ -77,8 +100,8 @@ func (s *NatsSink) Flush() error { } func (s *NatsSink) Close() { - log.Print("Closing Nats connection") if s.client != nil { + cclog.ComponentDebug(s.name, "Close") s.client.Close() } } diff --git a/sinks/natsSink.md b/sinks/natsSink.md new file mode 100644 index 0000000..7a53f27 --- /dev/null +++ b/sinks/natsSink.md @@ -0,0 +1,28 @@ +## `nats` sink + +The `nats` sink publishes all metrics into a NATS network. 
The publishing key is the database name provided in the configuration file + + +### Configuration structure + +```json +{ + "": { + "type": "nats", + "meta_as_tags" : true, + "database" : "mymetrics", + "host": "dbhost.example.com", + "port": "4222", + "user": "exampleuser", + "password" : "examplepw" + } +} +``` + +- `type`: makes the sink a `nats` sink +- `meta_as_tags`: print all meta information as tags in the output (optional) +- `database`: All metrics are published with this subject +- `host`: Hostname of the NATS server +- `port`: Port number (as string) of the NATS server +- `user`: Username for basic authentication +- `password`: Password for basic authentication \ No newline at end of file diff --git a/sinks/sinkManager.go b/sinks/sinkManager.go index 02421d3..21c392f 100644 --- a/sinks/sinkManager.go +++ b/sinks/sinkManager.go @@ -2,6 +2,7 @@ package sinks import ( "encoding/json" + "fmt" "os" "sync" @@ -20,28 +21,26 @@ var AvailableSinks = map[string]Sink{ // Metric collector manager data structure type sinkManager struct { - input chan lp.CCMetric // input channel - outputs []Sink // List of sinks to use - done chan bool // channel to finish / stop metric sink manager - wg *sync.WaitGroup // wait group for all goroutines in cc-metric-collector - config []sinkConfig // json encoded config for sink manager + input chan lp.CCMetric // input channel + done chan bool // channel to finish / stop metric sink manager + wg *sync.WaitGroup // wait group for all goroutines in cc-metric-collector + sinks map[string]Sink // Mapping sink name to sink } // Sink manager access functions type SinkManager interface { Init(wg *sync.WaitGroup, sinkConfigFile string) error AddInput(input chan lp.CCMetric) - AddOutput(config json.RawMessage) error + AddOutput(name string, config json.RawMessage) error Start() Close() } func (sm *sinkManager) Init(wg *sync.WaitGroup, sinkConfigFile string) error { sm.input = nil - sm.outputs = make([]Sink, 0) sm.done = make(chan bool)
sm.wg = wg - sm.config = make([]sinkConfig, 0) + sm.sinks = make(map[string]Sink, 0) // Read sink config file if len(sinkConfigFile) > 0 { @@ -52,15 +51,16 @@ func (sm *sinkManager) Init(wg *sync.WaitGroup, sinkConfigFile string) error { } defer configFile.Close() jsonParser := json.NewDecoder(configFile) - var rawConfigs []json.RawMessage + var rawConfigs map[string]json.RawMessage err = jsonParser.Decode(&rawConfigs) if err != nil { cclog.ComponentError("SinkManager", err.Error()) return err } - for _, raw := range rawConfigs { - err = sm.AddOutput(raw) + for name, raw := range rawConfigs { + err = sm.AddOutput(name, raw) if err != nil { + cclog.ComponentError("SinkManager", err.Error()) continue } } @@ -77,7 +77,7 @@ func (sm *sinkManager) Start() { // Sink manager is done done := func() { - for _, s := range sm.outputs { + for _, s := range sm.sinks { s.Flush() s.Close() } @@ -95,14 +95,14 @@ func (sm *sinkManager) Start() { case p := <-sm.input: // Send received metric to all outputs cclog.ComponentDebug("SinkManager", "WRITE", p) - for _, s := range sm.outputs { + for _, s := range sm.sinks { s.Write(p) } // Flush all outputs if batchcount == 0 { cclog.ComponentDebug("SinkManager", "FLUSH") - for _, s := range sm.outputs { + for _, s := range sm.sinks { s.Flush() } batchcount = 20 @@ -121,29 +121,27 @@ func (sm *sinkManager) AddInput(input chan lp.CCMetric) { sm.input = input } -func (sm *sinkManager) AddOutput(rawConfig json.RawMessage) error { +func (sm *sinkManager) AddOutput(name string, rawConfig json.RawMessage) error { var err error - var config sinkConfig - if len(rawConfig) > 3 { - err = json.Unmarshal(rawConfig, &config) + var sinkConfig defaultSinkConfig + if len(rawConfig) > 0 { + err := json.Unmarshal(rawConfig, &sinkConfig) if err != nil { - cclog.ComponentError("SinkManager", "SKIP", config.Type, "JSON config error:", err.Error()) return err } } - if _, found := AvailableSinks[config.Type]; !found { - cclog.ComponentError("SinkManager", "SKIP", 
config.Type, "unknown sink:", err.Error()) + if _, found := AvailableSinks[sinkConfig.Type]; !found { + cclog.ComponentError("SinkManager", "SKIP", name, "unknown sink:", err.Error()) return err } - s := AvailableSinks[config.Type] - err = s.Init(config) + s := AvailableSinks[sinkConfig.Type] + err = s.Init(rawConfig) if err != nil { cclog.ComponentError("SinkManager", "SKIP", s.Name(), "initialization failed:", err.Error()) return err } - sm.outputs = append(sm.outputs, s) - sm.config = append(sm.config, config) - cclog.ComponentDebug("SinkManager", "ADD SINK", s.Name()) + sm.sinks[name] = s + cclog.ComponentDebug("SinkManager", "ADD SINK", s.Name(), "with name", fmt.Sprintf("'%s'", name)) return nil } diff --git a/sinks/stdoutSink.go b/sinks/stdoutSink.go index 215239f..2c9e710 100644 --- a/sinks/stdoutSink.go +++ b/sinks/stdoutSink.go @@ -1,21 +1,50 @@ package sinks import ( + "encoding/json" "fmt" "math" + "os" "strings" // "time" lp "github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric" ) -type StdoutSink struct { - sink +type StdoutSinkConfig struct { + defaultSinkConfig + Output string `json:"output_file,omitempty"` } -func (s *StdoutSink) Init(config sinkConfig) error { +type StdoutSink struct { + sink + output *os.File + config StdoutSinkConfig +} + +func (s *StdoutSink) Init(config json.RawMessage) error { s.name = "StdoutSink" - s.meta_as_tags = config.MetaAsTags + if len(config) > 0 { + err := json.Unmarshal(config, &s.config) + if err != nil { + return err + } + } + s.output = os.Stdout + if len(s.config.Output) > 0 { + if strings.ToLower(s.config.Output) == "stdout" { + s.output = os.Stdout + } else if strings.ToLower(s.config.Output) == "stderr" { + s.output = os.Stderr + } else { + f, err := os.OpenFile(s.config.Output, os.O_CREATE|os.O_WRONLY, os.FileMode(0600)) + if err != nil { + return err + } + s.output = f + } + } + s.meta_as_tags = s.config.MetaAsTags return nil } @@ -63,7 +92,12 @@ func (s *StdoutSink) Write(point lp.CCMetric) 
error { } func (s *StdoutSink) Flush() error { + s.output.Sync() return nil } -func (s *StdoutSink) Close() {} +func (s *StdoutSink) Close() { + if s.output != os.Stdout && s.output != os.Stderr { + s.output.Close() + } +} diff --git a/sinks/stdoutSink.md b/sinks/stdoutSink.md new file mode 100644 index 0000000..317ca3f --- /dev/null +++ b/sinks/stdoutSink.md @@ -0,0 +1,22 @@ +## `stdout` sink + +The `stdout` sink is the most simple sink provided by cc-metric-collector. It writes all metrics in InfluxDB line-protocol format to the configurable output file or the common special files `stdout` and `stderr`. + + +### Configuration structure + +```json +{ + "": { + "type": "stdout", + "meta_as_tags" : true, + "output_file" : "mylogfile.log" + } +} +``` + +- `type`: makes the sink a `stdout` sink +- `meta_as_tags`: print all meta information as tags in the output (optional) +- `output_file`: Write all data to the selected file (optional). There are two 'special' files: `stdout` and `stderr`. If this option is not provided, the default value is `stdout` + + From 9ab7a6424bb06100f5cedf92884c206e7aaff725 Mon Sep 17 00:00:00 2001 From: Holger Obermaier <40787752+ho-ob@users.noreply.github.com> Date: Fri, 4 Feb 2022 19:22:31 +0100 Subject: [PATCH 086/174] Moved check which metric to skip to Init() --- collectors/loadavgMetric.go | 34 +++++++++++++++++++++++----------- 1 file changed, 23 insertions(+), 11 deletions(-) diff --git a/collectors/loadavgMetric.go b/collectors/loadavgMetric.go index a337d9b..3859721 100644 --- a/collectors/loadavgMetric.go +++ b/collectors/loadavgMetric.go @@ -22,16 +22,16 @@ import ( // const LOADAVGFILE = "/proc/loadavg" -type LoadavgCollectorConfig struct { - ExcludeMetrics []string `json:"exclude_metrics,omitempty"` -} - type LoadavgCollector struct { metricCollector tags map[string]string load_matches []string + load_skips []bool proc_matches []string - config LoadavgCollectorConfig + proc_skips []bool + config struct { + ExcludeMetrics
[]string `json:"exclude_metrics,omitempty"` + } } func (m *LoadavgCollector) Init(config json.RawMessage) error { @@ -51,15 +51,23 @@ func (m *LoadavgCollector) Init(config json.RawMessage) error { "load_one", "load_five", "load_fifteen"} + m.load_skips = make([]bool, len(m.load_matches)) m.proc_matches = []string{ "proc_run", "proc_total"} + m.proc_skips = make([]bool, len(m.proc_matches)) + + for i, name := range m.load_matches { + _, m.load_skips[i] = stringArrayContains(m.config.ExcludeMetrics, name) + } + for i, name := range m.proc_matches { + _, m.proc_skips[i] = stringArrayContains(m.config.ExcludeMetrics, name) + } m.init = true return nil } func (m *LoadavgCollector) Read(interval time.Duration, output chan lp.CCMetric) { - var skip bool if !m.init { return } @@ -84,9 +92,11 @@ func (m *LoadavgCollector) Read(interval time.Duration, output chan lp.CCMetric) fmt.Sprintf("Read(): Failed to convert '%s' to float64: %v", ls[i], err)) continue } - _, skip = stringArrayContains(m.config.ExcludeMetrics, name) + if m.load_skips[i] { + continue + } y, err := lp.New(name, m.tags, m.meta, map[string]interface{}{"value": x}, now) - if err == nil && !skip { + if err == nil { output <- y } } @@ -94,16 +104,18 @@ func (m *LoadavgCollector) Read(interval time.Duration, output chan lp.CCMetric) // Process metrics lv := strings.Split(ls[3], `/`) for i, name := range m.proc_matches { - x, err := strconv.ParseFloat(lv[i], 64) + x, err := strconv.ParseInt(lv[i], 10, 64) if err != nil { cclog.ComponentError( m.name, fmt.Sprintf("Read(): Failed to convert '%s' to float64: %v", lv[i], err)) continue } - _, skip = stringArrayContains(m.config.ExcludeMetrics, name) + if m.proc_skips[i] { + continue + } y, err := lp.New(name, m.tags, m.meta, map[string]interface{}{"value": x}, now) - if err == nil && !skip { + if err == nil { output <- y } From 5ac3af895dee82a622b263cf529a4dbb96c42001 Mon Sep 17 00:00:00 2001 From: Holger Obermaier <40787752+ho-ob@users.noreply.github.com> Date: 
Mon, 7 Feb 2022 09:22:59 +0100 Subject: [PATCH 087/174] Moved documentation to markdown file --- collectors/infinibandMetric.go | 18 ------------------ collectors/infinibandMetric.md | 8 ++++++-- 2 files changed, 6 insertions(+), 20 deletions(-) diff --git a/collectors/infinibandMetric.go b/collectors/infinibandMetric.go index 6b4c882..29231af 100644 --- a/collectors/infinibandMetric.go +++ b/collectors/infinibandMetric.go @@ -32,24 +32,6 @@ type InfinibandCollector struct { info []InfinibandCollectorInfo } -func (m *InfinibandCollector) Help() { - fmt.Println("This collector includes all devices that can be found below ", IB_BASEPATH) - fmt.Println("and where any of the ports provides a 'lid' file (glob ", IB_BASEPATH, "//ports//lid).") - fmt.Println("The devices can be filtered with the 'exclude_devices' option in the configuration.") - fmt.Println("For each found LIDs the collector calls the 'perfquery' command") - fmt.Println("") - fmt.Println("Full configuration object:") - fmt.Println("\"ibstat\" : {") - fmt.Println(" \"exclude_devices\" : [\"dev1\"]") - fmt.Println("}") - fmt.Println("") - fmt.Println("Metrics:") - fmt.Println("- ib_recv") - fmt.Println("- ib_xmit") - fmt.Println("- ib_recv_pkts") - fmt.Println("- ib_xmit_pkts") -} - // Init initializes the Infiniband collector by walking through files below IB_BASEPATH func (m *InfinibandCollector) Init(config json.RawMessage) error { var err error diff --git a/collectors/infinibandMetric.md b/collectors/infinibandMetric.md index e9ba043..140ea54 100644 --- a/collectors/infinibandMetric.md +++ b/collectors/infinibandMetric.md @@ -3,17 +3,21 @@ ```json "ibstat": { - "perfquery_path" : "", "exclude_devices": [ "mlx4" ] } ``` -The `ibstat` collector reads either data through the `perfquery` command or the sysfs files below `/sys/class/infiniband/`. 
+The `ibstat` includes all devices that can be found below `/sys/class/infiniband/` +and where any of the ports provides a `lid` file (`/sys/class/infiniband//ports//lid`) +The devices can be filtered with the `exclude_devices` option in the configuration. +The collector reads data through the sysfs files below `/sys/class/infiniband/`. Metrics: * `ib_recv` * `ib_xmit` +* `ib_recv_pkts` +* `ib_xmit_pkts` The collector adds a `device` tag to all metrics From 79b25ddbee126f1949846b5268ecccf1dabaec4a Mon Sep 17 00:00:00 2001 From: Holger Obermaier <40787752+ho-ob@users.noreply.github.com> Date: Mon, 7 Feb 2022 09:46:19 +0100 Subject: [PATCH 088/174] Add markdown documentation for metric collector ibstat_perfquery --- collectors/infinibandMetric.go | 8 ++++++- collectors/infinibandMetric.md | 9 +++++--- collectors/infinibandPerfQueryMetric.go | 21 ------------------- collectors/infinibandPerfQueryMetric.md | 28 +++++++++++++++++++++++++ 4 files changed, 41 insertions(+), 25 deletions(-) create mode 100644 collectors/infinibandPerfQueryMetric.md diff --git a/collectors/infinibandMetric.go b/collectors/infinibandMetric.go index 29231af..d93fd7b 100644 --- a/collectors/infinibandMetric.go +++ b/collectors/infinibandMetric.go @@ -20,7 +20,7 @@ type InfinibandCollectorInfo struct { LID string // IB local Identifier (LID) device string // IB device port string // IB device port - portCounterFiles map[string]string // mapping counter name -> file + portCounterFiles map[string]string // mapping counter name -> sysfs file tagSet map[string]string // corresponding tag list } @@ -34,6 +34,12 @@ type InfinibandCollector struct { // Init initializes the Infiniband collector by walking through files below IB_BASEPATH func (m *InfinibandCollector) Init(config json.RawMessage) error { + + // Check if already initialized + if !m.init { + return nil + } + var err error m.name = "InfinibandCollector" m.setup() diff --git a/collectors/infinibandMetric.md b/collectors/infinibandMetric.md 
index 140ea54..579ed77 100644 --- a/collectors/infinibandMetric.md +++ b/collectors/infinibandMetric.md @@ -9,10 +9,13 @@ } ``` -The `ibstat` includes all devices that can be found below `/sys/class/infiniband/` -and where any of the ports provides a `lid` file (`/sys/class/infiniband//ports//lid`) +The `ibstat` collector includes all Infiniband devices that can be +found below `/sys/class/infiniband/` and where any of the ports provides a +LID file (`/sys/class/infiniband//ports//lid`) + The devices can be filtered with the `exclude_devices` option in the configuration. -The collector reads data through the sysfs files below `/sys/class/infiniband/`. + +For each found LID the collector reads data through the sysfs files below `/sys/class/infiniband/`. Metrics: * `ib_recv` diff --git a/collectors/infinibandPerfQueryMetric.go b/collectors/infinibandPerfQueryMetric.go index d8f7bf4..1a81d37 100644 --- a/collectors/infinibandPerfQueryMetric.go +++ b/collectors/infinibandPerfQueryMetric.go @@ -29,27 +29,6 @@ type InfinibandPerfQueryCollector struct { } } -func (m *InfinibandPerfQueryCollector) Help() { - fmt.Println("This collector includes all devices that can be found below ", IB_BASEPATH) - fmt.Println("and where any of the ports provides a 'lid' file (glob ", IB_BASEPATH, "//ports//lid).") - fmt.Println("The devices can be filtered with the 'exclude_devices' option in the configuration.") - fmt.Println("For each found LIDs the collector calls the 'perfquery' command") - fmt.Println("The path to the 'perfquery' command can be configured with the 'perfquery_path' option") - fmt.Println("in the configuration") - fmt.Println("") - fmt.Println("Full configuration object:") - fmt.Println("\"ibstat\" : {") - fmt.Println(" \"perfquery_path\" : \"path/to/perfquery\" # if omitted, it searches in $PATH") - fmt.Println(" \"exclude_devices\" : [\"dev1\"]") - fmt.Println("}") - fmt.Println("") - fmt.Println("Metrics:") - fmt.Println("- ib_recv") - fmt.Println("- ib_xmit") - 
fmt.Println("- ib_recv_pkts") - fmt.Println("- ib_xmit_pkts") -} - func (m *InfinibandPerfQueryCollector) Init(config json.RawMessage) error { var err error m.name = "InfinibandCollectorPerfQuery" diff --git a/collectors/infinibandPerfQueryMetric.md b/collectors/infinibandPerfQueryMetric.md new file mode 100644 index 0000000..2147963 --- /dev/null +++ b/collectors/infinibandPerfQueryMetric.md @@ -0,0 +1,28 @@ + +## `ibstat_perfquery` collector + +```json + "ibstat_perfquery": { + "perfquery_path": "/path/to/perfquery", + "exclude_devices": [ + "mlx4" + ] + } +``` + +The `ibstat_perfquery` collector includes all Infiniband devices that can be +found below `/sys/class/infiniband/` and where any of the ports provides a +LID file (`/sys/class/infiniband//ports//lid`) + +The devices can be filtered with the `exclude_devices` option in the configuration. + +For each found LID the collector calls the `perfquery` command. The path to the +`perfquery` command can be configured with the `perfquery_path` option in the configuration + +Metrics: +* `ib_recv` +* `ib_xmit` +* `ib_recv_pkts` +* `ib_xmit_pkts` + +The collector adds a `device` tag to all metrics From 3c10c6b340c85e6fe99b3f1b1bff491ff3833ef2 Mon Sep 17 00:00:00 2001 From: Holger Obermaier <40787752+ho-ob@users.noreply.github.com> Date: Mon, 7 Feb 2022 10:02:38 +0100 Subject: [PATCH 089/174] Add error handling to Read() --- collectors/infinibandMetric.go | 24 ++++++++++++++++++------ 1 file changed, 18 insertions(+), 6 deletions(-) diff --git a/collectors/infinibandMetric.go b/collectors/infinibandMetric.go index d93fd7b..63ae2ca 100644 --- a/collectors/infinibandMetric.go +++ b/collectors/infinibandMetric.go @@ -4,6 +4,7 @@ import ( "fmt" "os" + cclog "github.com/ClusterCockpit/cc-metric-collector/internal/ccLogger" lp "github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric" "golang.org/x/sys/unix" @@ -141,14 +142,25 @@ func (m *InfinibandCollector) Read(interval time.Duration, output chan lp.CCMetr // 
device info info := &m.info[i] for counterName, counterFile := range info.portCounterFiles { - if data, ok := readOneLine(counterFile); ok { - if v, err := strconv.ParseInt(data, 10, 64); err == nil { - if y, err := lp.New(counterName, info.tagSet, m.meta, map[string]interface{}{"value": v}, now); err == nil { - output <- y - } - } + data, ok := readOneLine(counterFile) + if !ok { + cclog.ComponentError( + m.name, + fmt.Sprintf("Read(): Failed to read one line from file '%s'", counterFile)) + continue + } + v, err := strconv.ParseInt(data, 10, 64) + if err != nil { + cclog.ComponentError( + m.name, + fmt.Sprintf("Read(): Failed to convert Infininiband metrice %s='%s' to int64: %v", counterName, data, err)) + continue + } + if y, err := lp.New(counterName, info.tagSet, m.meta, map[string]interface{}{"value": v}, now); err == nil { + output <- y } } + } } From 25c2ae4910906c95f1e2f9d96d5da2077bef6224 Mon Sep 17 00:00:00 2001 From: Holger Obermaier <40787752+ho-ob@users.noreply.github.com> Date: Mon, 7 Feb 2022 11:12:03 +0100 Subject: [PATCH 090/174] Avoid int -> int64 conversions --- collectors/README.md | 5 ++- collectors/cpufreqCpuinfoMetric.go | 39 +++++++++++--------- collectors/cpufreqMetric.go | 50 ++++++++++++++------------ collectors/gpfsMetric.go | 58 ++++++++++++++---------------- 4 files changed, 80 insertions(+), 72 deletions(-) diff --git a/collectors/README.md b/collectors/README.md index 8423c95..cabb74a 100644 --- a/collectors/README.md +++ b/collectors/README.md @@ -103,9 +103,8 @@ func (m *SampleCollector) Read(interval time.Duration, output chan lp.CCMetric) } // Each metric has exactly one field: value ! 
- value := map[string]interface{}{"value": int(x)} - y, err := lp.New("sample_metric", tags, m.meta, value, time.Now()) - if err == nil { + value := map[string]interface{}{"value": int64(x)} + if y, err := lp.New("sample_metric", tags, m.meta, value, time.Now()); err == nil { output <- y } } diff --git a/collectors/cpufreqCpuinfoMetric.go b/collectors/cpufreqCpuinfoMetric.go index c77a981..5d3d4b5 100644 --- a/collectors/cpufreqCpuinfoMetric.go +++ b/collectors/cpufreqCpuinfoMetric.go @@ -23,14 +23,14 @@ import ( type CPUFreqCpuInfoCollectorTopology struct { processor string // logical processor number (continuous, starting at 0) coreID string // socket local core ID - coreID_int int + coreID_int int64 physicalPackageID string // socket / package ID - physicalPackageID_int int + physicalPackageID_int int64 numPhysicalPackages string // number of sockets / packages - numPhysicalPackages_int int + numPhysicalPackages_int int64 isHT bool numNonHT string // number of non hyperthreading processors - numNonHT_int int + numNonHT_int int64 tagSet map[string]string } @@ -40,26 +40,32 @@ type CPUFreqCpuInfoCollector struct { } func (m *CPUFreqCpuInfoCollector) Init(config json.RawMessage) error { + // Check if already initialized + if m.init { + return nil + } + m.name = "CPUFreqCpuInfoCollector" m.meta = map[string]string{ "source": m.name, - "group": "cpufreq", + "group": "CPU", + "unit": "MHz", } const cpuInfoFile = "/proc/cpuinfo" file, err := os.Open(cpuInfoFile) if err != nil { - return fmt.Errorf("Failed to open '%s': %v", cpuInfoFile, err) + return fmt.Errorf("Failed to open file '%s': %v", cpuInfoFile, err) } defer file.Close() // Collect topology information from file cpuinfo foundFreq := false processor := "" - numNonHT_int := 0 + var numNonHT_int int64 = 0 coreID := "" physicalPackageID := "" - maxPhysicalPackageID := 0 + var maxPhysicalPackageID int64 = 0 m.topology = make([]CPUFreqCpuInfoCollectorTopology, 0) coreSeenBefore := make(map[string]bool) scanner := 
bufio.NewScanner(file) @@ -87,13 +93,13 @@ func (m *CPUFreqCpuInfoCollector) Init(config json.RawMessage) error { len(coreID) > 0 && len(physicalPackageID) > 0 { - coreID_int, err := strconv.Atoi(coreID) + coreID_int, err := strconv.ParseInt(coreID, 10, 64) if err != nil { - return fmt.Errorf("Unable to convert coreID to int: %v", err) + return fmt.Errorf("Unable to convert coreID '%s' to int64: %v", coreID, err) } - physicalPackageID_int, err := strconv.Atoi(physicalPackageID) + physicalPackageID_int, err := strconv.ParseInt(physicalPackageID, 10, 64) if err != nil { - return fmt.Errorf("Unable to convert physicalPackageID to int: %v", err) + return fmt.Errorf("Unable to convert physicalPackageID '%s' to int64: %v", physicalPackageID, err) } // increase maximun socket / package ID, when required @@ -152,15 +158,17 @@ func (m *CPUFreqCpuInfoCollector) Init(config json.RawMessage) error { } func (m *CPUFreqCpuInfoCollector) Read(interval time.Duration, output chan lp.CCMetric) { + // Check if already initialized if !m.init { return } + const cpuInfoFile = "/proc/cpuinfo" file, err := os.Open(cpuInfoFile) if err != nil { cclog.ComponentError( m.name, - fmt.Sprintf("Read(): Failed to open '%s': %v", cpuInfoFile, err)) + fmt.Sprintf("Read(): Failed to open file '%s': %v", cpuInfoFile, err)) return } defer file.Close() @@ -181,11 +189,10 @@ func (m *CPUFreqCpuInfoCollector) Read(interval time.Duration, output chan lp.CC if err != nil { cclog.ComponentError( m.name, - fmt.Sprintf("Read(): Failed to convert cpu MHz to float: %v", err)) + fmt.Sprintf("Read(): Failed to convert cpu MHz '%s' to float64: %v", lineSplit[1], err)) return } - y, err := lp.New("cpufreq", t.tagSet, m.meta, map[string]interface{}{"value": value}, now) - if err == nil { + if y, err := lp.New("cpufreq", t.tagSet, m.meta, map[string]interface{}{"value": value}, now); err == nil { output <- y } } diff --git a/collectors/cpufreqMetric.go b/collectors/cpufreqMetric.go index b464160..5f47ce5 100644 --- 
a/collectors/cpufreqMetric.go +++ b/collectors/cpufreqMetric.go @@ -35,14 +35,14 @@ func readOneLine(filename string) (text string, ok bool) { type CPUFreqCollectorTopology struct { processor string // logical processor number (continuous, starting at 0) coreID string // socket local core ID - coreID_int int + coreID_int int64 physicalPackageID string // socket / package ID - physicalPackageID_int int + physicalPackageID_int int64 numPhysicalPackages string // number of sockets / packages - numPhysicalPackages_int int + numPhysicalPackages_int int64 isHT bool numNonHT string // number of non hyperthreading processors - numNonHT_int int + numNonHT_int int64 scalingCurFreqFile string tagSet map[string]string } @@ -64,6 +64,11 @@ type CPUFreqCollector struct { } func (m *CPUFreqCollector) Init(config json.RawMessage) error { + // Check if already initialized + if m.init { + return nil + } + m.name = "CPUFreqCollector" m.setup() if len(config) > 0 { @@ -74,7 +79,8 @@ func (m *CPUFreqCollector) Init(config json.RawMessage) error { } m.meta = map[string]string{ "source": m.name, - "group": "CPU Frequency", + "group": "CPU", + "unit": "MHz", } // Loop for all CPU directories @@ -82,48 +88,48 @@ func (m *CPUFreqCollector) Init(config json.RawMessage) error { globPattern := filepath.Join(baseDir, "cpu[0-9]*") cpuDirs, err := filepath.Glob(globPattern) if err != nil { - return fmt.Errorf("CPUFreqCollector.Init() unable to glob files with pattern %s: %v", globPattern, err) + return fmt.Errorf("Unable to glob files with pattern '%s': %v", globPattern, err) } if cpuDirs == nil { - return fmt.Errorf("CPUFreqCollector.Init() unable to find any files with pattern %s", globPattern) + return fmt.Errorf("Unable to find any files with pattern '%s'", globPattern) } // Initialize CPU topology m.topology = make([]CPUFreqCollectorTopology, len(cpuDirs)) for _, cpuDir := range cpuDirs { processor := strings.TrimPrefix(cpuDir, "/sys/devices/system/cpu/cpu") - processor_int, err := 
strconv.Atoi(processor) + processor_int, err := strconv.ParseInt(processor, 10, 64) if err != nil { - return fmt.Errorf("CPUFreqCollector.Init() unable to convert cpuID to int: %v", err) + return fmt.Errorf("Unable to convert cpuID '%s' to int64: %v", processor, err) } // Read package ID physicalPackageIDFile := filepath.Join(cpuDir, "topology", "physical_package_id") physicalPackageID, ok := readOneLine(physicalPackageIDFile) if !ok { - return fmt.Errorf("CPUFreqCollector.Init() unable to read physical package ID from %s", physicalPackageIDFile) + return fmt.Errorf("Unable to read physical package ID from file '%s'", physicalPackageIDFile) } - physicalPackageID_int, err := strconv.Atoi(physicalPackageID) + physicalPackageID_int, err := strconv.ParseInt(physicalPackageID, 10, 64) if err != nil { - return fmt.Errorf("CPUFreqCollector.Init() unable to convert packageID to int: %v", err) + return fmt.Errorf("Unable to convert packageID '%s' to int64: %v", physicalPackageID, err) } // Read core ID coreIDFile := filepath.Join(cpuDir, "topology", "core_id") coreID, ok := readOneLine(coreIDFile) if !ok { - return fmt.Errorf("CPUFreqCollector.Init() unable to read core ID from %s", coreIDFile) + return fmt.Errorf("Unable to read core ID from file '%s'", coreIDFile) } - coreID_int, err := strconv.Atoi(coreID) + coreID_int, err := strconv.ParseInt(coreID, 10, 64) if err != nil { - return fmt.Errorf("CPUFreqCollector.Init() unable to convert coreID to int: %v", err) + return fmt.Errorf("Unable to convert coreID '%s' to int64: %v", coreID, err) } // Check access to current frequency file scalingCurFreqFile := filepath.Join(cpuDir, "cpufreq", "scaling_cur_freq") err = unix.Access(scalingCurFreqFile, unix.R_OK) if err != nil { - return fmt.Errorf("CPUFreqCollector.Init() unable to access %s: %v", scalingCurFreqFile, err) + return fmt.Errorf("Unable to access file '%s': %v", scalingCurFreqFile, err) } t := &m.topology[processor_int] @@ -146,8 +152,8 @@ func (m *CPUFreqCollector) 
Init(config json.RawMessage) error { } // number of non hyper thread cores and packages / sockets - numNonHT_int := 0 - maxPhysicalPackageID := 0 + var numNonHT_int int64 = 0 + var maxPhysicalPackageID int64 = 0 for i := range m.topology { t := &m.topology[i] @@ -184,6 +190,7 @@ func (m *CPUFreqCollector) Init(config json.RawMessage) error { } func (m *CPUFreqCollector) Read(interval time.Duration, output chan lp.CCMetric) { + // Check if already initialized if !m.init { return } @@ -205,16 +212,15 @@ func (m *CPUFreqCollector) Read(interval time.Duration, output chan lp.CCMetric) fmt.Sprintf("Read(): Failed to read one line from file '%s'", t.scalingCurFreqFile)) continue } - cpuFreq, err := strconv.Atoi(line) + cpuFreq, err := strconv.ParseInt(line, 10, 64) if err != nil { cclog.ComponentError( m.name, - fmt.Sprintf("Read(): Failed to convert CPU frequency '%s': %v", line, err)) + fmt.Sprintf("Read(): Failed to convert CPU frequency '%s' to int64: %v", line, err)) continue } - y, err := lp.New("cpufreq", t.tagSet, m.meta, map[string]interface{}{"value": cpuFreq}, now) - if err == nil { + if y, err := lp.New("cpufreq", t.tagSet, m.meta, map[string]interface{}{"value": cpuFreq}, now); err == nil { output <- y } } diff --git a/collectors/gpfsMetric.go b/collectors/gpfsMetric.go index 8055d4c..ffb2fac 100644 --- a/collectors/gpfsMetric.go +++ b/collectors/gpfsMetric.go @@ -26,6 +26,11 @@ type GpfsCollector struct { } func (m *GpfsCollector) Init(config json.RawMessage) error { + // Check if already initialized + if m.init { + return nil + } + var err error m.name = "GpfsCollector" m.setup() @@ -53,16 +58,16 @@ func (m *GpfsCollector) Init(config json.RawMessage) error { // GPFS / IBM Spectrum Scale file system statistics can only be queried by user root user, err := user.Current() if err != nil { - return fmt.Errorf("GpfsCollector.Init(): Failed to get current user: %v", err) + return fmt.Errorf("Failed to get current user: %v", err) } if user.Uid != "0" { - return 
fmt.Errorf("GpfsCollector.Init(): GPFS file system statistics can only be queried by user root") + return fmt.Errorf("GPFS file system statistics can only be queried by user root") } // Check if mmpmon is in executable search path _, err = exec.LookPath(m.config.Mmpmon) if err != nil { - return fmt.Errorf("GpfsCollector.Init(): Failed to find mmpmon binary '%s': %v", m.config.Mmpmon, err) + return fmt.Errorf("Failed to find mmpmon binary '%s': %v", m.config.Mmpmon, err) } m.init = true @@ -70,6 +75,7 @@ func (m *GpfsCollector) Init(config json.RawMessage) error { } func (m *GpfsCollector) Read(interval time.Duration, output chan lp.CCMetric) { + // Check if already initialized if !m.init { return } @@ -135,7 +141,7 @@ func (m *GpfsCollector) Read(interval time.Duration, output chan lp.CCMetric) { if rc != 0 { cclog.ComponentError( m.name, - fmt.Sprintf("Read(): Filesystem %s not ok.", filesystem)) + fmt.Sprintf("Read(): Filesystem '%s' is not ok.", filesystem)) continue } @@ -143,14 +149,14 @@ func (m *GpfsCollector) Read(interval time.Duration, output chan lp.CCMetric) { if err != nil { cclog.ComponentError( m.name, - fmt.Sprintf("Read(): Failed to convert seconds '%s' to int: %v", key_value["_t_"], err)) + fmt.Sprintf("Read(): Failed to convert seconds '%s' to int64: %v", key_value["_t_"], err)) continue } msec, err := strconv.ParseInt(key_value["_tu_"], 10, 64) if err != nil { cclog.ComponentError( m.name, - fmt.Sprintf("Read(): Failed to convert micro seconds '%s' to int: %v", key_value["_tu_"], err)) + fmt.Sprintf("Read(): Failed to convert micro seconds '%s' to int64: %v", key_value["_tu_"], err)) continue } timestamp := time.Unix(sec, msec*1000) @@ -160,12 +166,10 @@ func (m *GpfsCollector) Read(interval time.Duration, output chan lp.CCMetric) { if err != nil { cclog.ComponentError( m.name, - fmt.Sprintf("Read(): Failed to convert bytes read '%s' to int: %v", key_value["_br_"], err)) + fmt.Sprintf("Read(): Failed to convert bytes read '%s' to int64: %v", 
key_value["_br_"], err)) continue } - - y, err := lp.New("gpfs_bytes_read", m.tags, m.meta, map[string]interface{}{"value": bytesRead}, timestamp) - if err == nil { + if y, err := lp.New("gpfs_bytes_read", m.tags, m.meta, map[string]interface{}{"value": bytesRead}, timestamp); err == nil { output <- y } @@ -174,12 +178,10 @@ func (m *GpfsCollector) Read(interval time.Duration, output chan lp.CCMetric) { if err != nil { cclog.ComponentError( m.name, - fmt.Sprintf("Read(): Failed to convert bytes written '%s' to int: %v", key_value["_bw_"], err)) + fmt.Sprintf("Read(): Failed to convert bytes written '%s' to int64: %v", key_value["_bw_"], err)) continue } - - y, err = lp.New("gpfs_bytes_written", m.tags, m.meta, map[string]interface{}{"value": bytesWritten}, timestamp) - if err == nil { + if y, err := lp.New("gpfs_bytes_written", m.tags, m.meta, map[string]interface{}{"value": bytesWritten}, timestamp); err == nil { output <- y } @@ -188,11 +190,10 @@ func (m *GpfsCollector) Read(interval time.Duration, output chan lp.CCMetric) { if err != nil { cclog.ComponentError( m.name, - fmt.Sprintf("Read(): Failed to convert number of opens '%s' to int: %v", key_value["_oc_"], err)) + fmt.Sprintf("Read(): Failed to convert number of opens '%s' to int64: %v", key_value["_oc_"], err)) continue } - y, err = lp.New("gpfs_num_opens", m.tags, m.meta, map[string]interface{}{"value": numOpens}, timestamp) - if err == nil { + if y, err := lp.New("gpfs_num_opens", m.tags, m.meta, map[string]interface{}{"value": numOpens}, timestamp); err == nil { output <- y } @@ -201,11 +202,10 @@ func (m *GpfsCollector) Read(interval time.Duration, output chan lp.CCMetric) { if err != nil { cclog.ComponentError( m.name, - fmt.Sprintf("Read(): Failed to convert number of closes: '%s' to int: %v", key_value["_cc_"], err)) + fmt.Sprintf("Read(): Failed to convert number of closes: '%s' to int64: %v", key_value["_cc_"], err)) continue } - y, err = lp.New("gpfs_num_closes", m.tags, m.meta, 
map[string]interface{}{"value": numCloses}, timestamp) - if err == nil { + if y, err := lp.New("gpfs_num_closes", m.tags, m.meta, map[string]interface{}{"value": numCloses}, timestamp); err == nil { output <- y } @@ -214,11 +214,10 @@ func (m *GpfsCollector) Read(interval time.Duration, output chan lp.CCMetric) { if err != nil { cclog.ComponentError( m.name, - fmt.Sprintf("Read(): Failed to convert number of reads: '%s' to int: %v", key_value["_rdc_"], err)) + fmt.Sprintf("Read(): Failed to convert number of reads: '%s' to int64: %v", key_value["_rdc_"], err)) continue } - y, err = lp.New("gpfs_num_reads", m.tags, m.meta, map[string]interface{}{"value": numReads}, timestamp) - if err == nil { + if y, err := lp.New("gpfs_num_reads", m.tags, m.meta, map[string]interface{}{"value": numReads}, timestamp); err == nil { output <- y } @@ -227,11 +226,10 @@ func (m *GpfsCollector) Read(interval time.Duration, output chan lp.CCMetric) { if err != nil { cclog.ComponentError( m.name, - fmt.Sprintf("Read(): Failed to convert number of writes: '%s' to int: %v", key_value["_wc_"], err)) + fmt.Sprintf("Read(): Failed to convert number of writes: '%s' to int64: %v", key_value["_wc_"], err)) continue } - y, err = lp.New("gpfs_num_writes", m.tags, m.meta, map[string]interface{}{"value": numWrites}, timestamp) - if err == nil { + if y, err := lp.New("gpfs_num_writes", m.tags, m.meta, map[string]interface{}{"value": numWrites}, timestamp); err == nil { output <- y } @@ -240,11 +238,10 @@ func (m *GpfsCollector) Read(interval time.Duration, output chan lp.CCMetric) { if err != nil { cclog.ComponentError( m.name, - fmt.Sprintf("Read(): Failed to convert number of read directories: '%s' to int: %v", key_value["_dir_"], err)) + fmt.Sprintf("Read(): Failed to convert number of read directories: '%s' to int64: %v", key_value["_dir_"], err)) continue } - y, err = lp.New("gpfs_num_readdirs", m.tags, m.meta, map[string]interface{}{"value": numReaddirs}, timestamp) - if err == nil { + if y, err 
:= lp.New("gpfs_num_readdirs", m.tags, m.meta, map[string]interface{}{"value": numReaddirs}, timestamp); err == nil { output <- y } @@ -256,8 +253,7 @@ func (m *GpfsCollector) Read(interval time.Duration, output chan lp.CCMetric) { fmt.Sprintf("Read(): Failed to convert number of inode updates: '%s' to int: %v", key_value["_iu_"], err)) continue } - y, err = lp.New("gpfs_num_inode_updates", m.tags, m.meta, map[string]interface{}{"value": numInodeUpdates}, timestamp) - if err == nil { + if y, err := lp.New("gpfs_num_inode_updates", m.tags, m.meta, map[string]interface{}{"value": numInodeUpdates}, timestamp); err == nil { output <- y } } From a534f16685c1642f38dfc04e96bc359405718316 Mon Sep 17 00:00:00 2001 From: Holger Obermaier <40787752+ho-ob@users.noreply.github.com> Date: Mon, 7 Feb 2022 11:37:34 +0100 Subject: [PATCH 091/174] Add documentation for GPFS metric --- collectors/gpfsMetric.md | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) create mode 100644 collectors/gpfsMetric.md diff --git a/collectors/gpfsMetric.md b/collectors/gpfsMetric.md new file mode 100644 index 0000000..4a6a058 --- /dev/null +++ b/collectors/gpfsMetric.md @@ -0,0 +1,30 @@ +## `gpfs` collector + +```json + "gpfs": { + "mmpmon_path": "/path/to/mmpmon", + "exclude_filesystem": [ + "fs1" + ] + } +``` + +The `gpfs` collector uses the `mmpmon` command to read performance metrics for +GPFS / IBM Spectrum Scale filesystems. + +The reported filesystems can be filtered with the `exclude_filesystem` option +in the configuration. + +The path to the `mmpmon` command can be configured with the `mmpmon_path` option +in the configuration. 
+ +Metrics: +* `gpfs_bytes_read` +* `gpfs_bytes_written` +* `gpfs_num_opens` +* `gpfs_num_closes` +* `gpfs_num_reads` +* `gpfs_num_readdirs` +* `gpfs_num_inode_updates` + +The collector adds a `filesystem` tag to all metrics From 25bb395f020c2496f6cd696207eb8d4d1dd92750 Mon Sep 17 00:00:00 2001 From: Thomas Roehl Date: Mon, 7 Feb 2022 13:22:26 +0100 Subject: [PATCH 092/174] Fix for NumaDomain getter in ccTopology --- internal/ccTopology/ccTopology.go | 131 ++++++++++++++++++------------ 1 file changed, 78 insertions(+), 53 deletions(-) diff --git a/internal/ccTopology/ccTopology.go b/internal/ccTopology/ccTopology.go index 6d8bfae..958bb45 100644 --- a/internal/ccTopology/ccTopology.go +++ b/internal/ccTopology/ccTopology.go @@ -6,12 +6,17 @@ import ( "log" "os" "path/filepath" + "regexp" "strconv" "strings" cclogger "github.com/ClusterCockpit/cc-metric-collector/internal/ccLogger" ) +const SYSFS_NUMABASE = `/sys/devices/system/node` +const SYSFS_CPUBASE = `/sys/devices/system/cpu` +const PROCFS_CPUINFO = `/proc/cpuinfo` + // intArrayContains scans an array of ints if the value str is present in the array // If the specified value is found, the corresponding array index is returned. 
// The bool value is used to signal success or failure @@ -43,7 +48,7 @@ func fileToInt(path string) int { } func SocketList() []int { - buffer, err := ioutil.ReadFile("/proc/cpuinfo") + buffer, err := ioutil.ReadFile(string(PROCFS_CPUINFO)) if err != nil { log.Print(err) return nil @@ -68,7 +73,7 @@ func SocketList() []int { } func CpuList() []int { - buffer, err := ioutil.ReadFile("/proc/cpuinfo") + buffer, err := ioutil.ReadFile(string(PROCFS_CPUINFO)) if err != nil { log.Print(err) return nil @@ -93,7 +98,7 @@ func CpuList() []int { } func CoreList() []int { - buffer, err := ioutil.ReadFile("/proc/cpuinfo") + buffer, err := ioutil.ReadFile(string(PROCFS_CPUINFO)) if err != nil { log.Print(err) return nil @@ -118,36 +123,50 @@ func CoreList() []int { } func NumaNodeList() []int { - numalist := make([]int, 0) - files, err := filepath.Glob("/sys/devices/system/node/node*") + numaList := make([]int, 0) + globPath := filepath.Join(string(SYSFS_NUMABASE), "node*") + regexPath := filepath.Join(string(SYSFS_NUMABASE), "node(\\d+)") + regex := regexp.MustCompile(regexPath) + files, err := filepath.Glob(globPath) if err != nil { - log.Print(err) + cclogger.ComponentError("CCTopology", "NumaNodeList", err.Error()) } for _, f := range files { + if !regex.MatchString(f) { + continue + } finfo, err := os.Lstat(f) - if err == nil && (finfo.IsDir() || finfo.Mode()&os.ModeSymlink != 0) { - var id int - parts := strings.Split(f, "/") - _, err = fmt.Scanf("node%d", parts[len(parts)-1], &id) + if err != nil { + continue + } + if !finfo.IsDir() { + continue + } + matches := regex.FindStringSubmatch(f) + if len(matches) == 2 { + id, err := strconv.Atoi(matches[1]) if err == nil { - _, found := intArrayContains(numalist, int(id)) - if !found { - numalist = append(numalist, int(id)) + if _, found := intArrayContains(numaList, id); !found { + numaList = append(numaList, id) } } } + } - return numalist + return numaList } func DieList() []int { cpulist := CpuList() dielist := 
make([]int, 0) for _, c := range cpulist { - dieid := fileToInt(fmt.Sprintf("/sys/devices/system/cpu/cpu%d/topology/die_id", c)) - _, found := intArrayContains(dielist, int(dieid)) - if !found { - dielist = append(dielist, int(dieid)) + diepath := filepath.Join(string(SYSFS_CPUBASE), fmt.Sprintf("cpu%d", c), "topology/die_id") + dieid := fileToInt(diepath) + if dieid > 0 { + _, found := intArrayContains(dielist, int(dieid)) + if !found { + dielist = append(dielist, int(dieid)) + } } } return dielist @@ -196,14 +215,14 @@ func CpuData() []CpuEntry { getSMT := func(cpuid int, basepath string) int { buffer, err := ioutil.ReadFile(fmt.Sprintf("%s/thread_siblings_list", basepath)) if err != nil { - log.Print(err) + cclogger.ComponentError("CCTopology", "CpuData:getSMT", err.Error()) } threadlist := make([]int, 0) sbuffer := strings.Replace(string(buffer), "\n", "", -1) for _, x := range strings.Split(sbuffer, ",") { id, err := strconv.ParseInt(x, 10, 32) if err != nil { - log.Print(err) + cclogger.ComponentError("CCTopology", "CpuData:getSMT", err.Error()) } threadlist = append(threadlist, int(id)) } @@ -216,18 +235,22 @@ func CpuData() []CpuEntry { } getNumaDomain := func(basepath string) int { - files, err := filepath.Glob(fmt.Sprintf("%s/node*", basepath)) + globPath := filepath.Join(basepath, "node*") + regexPath := filepath.Join(basepath, "node(\\d+)") + regex := regexp.MustCompile(regexPath) + files, err := filepath.Glob(globPath) if err != nil { - log.Print(err) + cclogger.ComponentError("CCTopology", "CpuData:getNumaDomain", err.Error()) } for _, f := range files { finfo, err := os.Lstat(f) - if err == nil && (finfo.IsDir() || finfo.Mode()&os.ModeSymlink != 0) { - var id int - parts := strings.Split(f, "/") - _, err = fmt.Scanf("node%d", parts[len(parts)-1], &id) - if err == nil { - return id + if err == nil && finfo.IsDir() { + matches := regex.FindStringSubmatch(f) + if len(matches) == 2 { + id, err := strconv.Atoi(matches[1]) + if err == nil { + return id + } 
} } } @@ -244,22 +267,24 @@ func CpuData() []CpuEntry { centry.Die = -1 centry.Core = -1 // Set base directory for topology lookup - base := fmt.Sprintf("/sys/devices/system/cpu/cpu%d/topology", centry.Cpuid) + cpustr := fmt.Sprintf("cpu%d", centry.Cpuid) + base := filepath.Join("/sys/devices/system/cpu", cpustr) + topoBase := filepath.Join(base, "topology") // Lookup CPU core id - centry.Core = getCore(base) + centry.Core = getCore(topoBase) // Lookup CPU socket id - centry.Socket = getSocket(base) + centry.Socket = getSocket(topoBase) // Lookup CPU die id - centry.Die = getDie(base) + centry.Die = getDie(topoBase) if centry.Die < 0 { centry.Die = centry.Socket } // Lookup SMT thread id - centry.SMT = getSMT(centry.Cpuid, base) + centry.SMT = getSMT(centry.Cpuid, topoBase) // Lookup NUMA domain id centry.Numadomain = getNumaDomain(base) @@ -280,34 +305,34 @@ type CpuInformation struct { func CpuInfo() CpuInformation { var c CpuInformation - smt := 0 - numa := 0 - die := 0 - socket := 0 - core := 0 + smtList := make([]int, 0) + numaList := make([]int, 0) + dieList := make([]int, 0) + socketList := make([]int, 0) + coreList := make([]int, 0) cdata := CpuData() for _, d := range cdata { - if d.SMT > smt { - smt = d.SMT + if _, ok := intArrayContains(smtList, d.SMT); !ok { + smtList = append(smtList, d.SMT) } - if d.Numadomain > numa { - numa = d.Numadomain + if _, ok := intArrayContains(numaList, d.Numadomain); !ok { + numaList = append(numaList, d.Numadomain) } - if d.Die > die { - die = d.Die + if _, ok := intArrayContains(dieList, d.Die); !ok { + dieList = append(dieList, d.Die) } - if d.Socket > socket { - socket = d.Socket + if _, ok := intArrayContains(socketList, d.Socket); !ok { + socketList = append(socketList, d.Socket) } - if d.Core > core { - core = d.Core + if _, ok := intArrayContains(coreList, d.Core); !ok { + coreList = append(coreList, d.Core) } } - c.NumNumaDomains = numa + 1 - c.SMTWidth = smt + 1 - c.NumDies = die + 1 - c.NumCores = core + 1 - 
c.NumSockets = socket + 1 + c.NumNumaDomains = len(numaList) + c.SMTWidth = len(smtList) + c.NumDies = len(dieList) + c.NumCores = len(coreList) + c.NumSockets = len(socketList) c.NumHWthreads = len(cdata) return c } From 52458ce5a1b51e35efb8d85cd6c7cd777af53318 Mon Sep 17 00:00:00 2001 From: Thomas Roehl Date: Mon, 7 Feb 2022 13:27:35 +0100 Subject: [PATCH 093/174] Fix for LustreCollector. Check for root user --- collectors/lustreMetric.go | 139 ++++++++++++++++++++++++++----------- 1 file changed, 97 insertions(+), 42 deletions(-) diff --git a/collectors/lustreMetric.go b/collectors/lustreMetric.go index 99b371c..6d6fe26 100644 --- a/collectors/lustreMetric.go +++ b/collectors/lustreMetric.go @@ -5,10 +5,12 @@ import ( "errors" "fmt" "os/exec" + "os/user" "strconv" "strings" "time" + cclog "github.com/ClusterCockpit/cc-metric-collector/internal/ccLogger" lp "github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric" ) @@ -19,17 +21,26 @@ const LCTL_OPTION = `get_param` type LustreCollectorConfig struct { LCtlCommand string `json:"lctl_command"` ExcludeMetrics []string `json:"exclude_metrics"` + SendAllMetrics bool `json:"send_all_metrics"` } type LustreCollector struct { metricCollector tags map[string]string matches map[string]map[string]int - devices []string + stats map[string]map[string]int64 config LustreCollectorConfig lctl string } +func (m *LustreCollector) getDeviceDataCommand(device string) []string { + statsfile := fmt.Sprintf("llite.%s.stats", device) + command := exec.Command(m.lctl, LCTL_OPTION, statsfile) + command.Wait() + stdout, _ := command.Output() + return strings.Split(string(stdout), "\n") +} + func (m *LustreCollector) getDevices() []string { devices := make([]string, 0) @@ -44,13 +55,9 @@ func (m *LustreCollector) getDevices() []string { // devices = append(devices, pathlist[4]) // } - command := exec.Command(m.lctl, LCTL_OPTION, "llite.*.stats") - command.Wait() - stdout, err := command.Output() - if err != nil { - return devices - 
} - for _, line := range strings.Split(string(stdout), "\n") { + data := m.getDeviceDataCommand("*") + + for _, line := range data { if strings.HasPrefix(line, "llite") { linefields := strings.Split(line, ".") if len(linefields) > 2 { @@ -73,14 +80,6 @@ func (m *LustreCollector) getDevices() []string { // return strings.Split(string(buffer), "\n") // } -func (m *LustreCollector) getDeviceDataCommand(device string) []string { - statsfile := fmt.Sprintf("llite.%s.stats", device) - command := exec.Command(m.lctl, LCTL_OPTION, statsfile) - command.Wait() - stdout, _ := command.Output() - return strings.Split(string(stdout), "\n") -} - func (m *LustreCollector) Init(config json.RawMessage) error { var err error m.name = "LustreCollector" @@ -93,14 +92,42 @@ func (m *LustreCollector) Init(config json.RawMessage) error { m.setup() m.tags = map[string]string{"type": "node"} m.meta = map[string]string{"source": m.name, "group": "Lustre"} - m.matches = map[string]map[string]int{"read_bytes": {"read_bytes": 6, "read_requests": 1}, - "write_bytes": {"write_bytes": 6, "write_requests": 1}, - "open": {"open": 1}, - "close": {"close": 1}, - "setattr": {"setattr": 1}, - "getattr": {"getattr": 1}, - "statfs": {"statfs": 1}, - "inode_permission": {"inode_permission": 1}} + defmatches := map[string]map[string]int{ + "read_bytes": {"lustre_read_bytes": 6, "lustre_read_requests": 1}, + "write_bytes": {"lustre_write_bytes": 6, "lustre_write_requests": 1}, + "open": {"lustre_open": 1}, + "close": {"lustre_close": 1}, + "setattr": {"lustre_setattr": 1}, + "getattr": {"lustre_getattr": 1}, + "statfs": {"lustre_statfs": 1}, + "inode_permission": {"lustre_inode_permission": 1}} + + // Lustre file system statistics can only be queried by user root + user, err := user.Current() + if err != nil { + cclog.ComponentError(m.name, "Failed to get current user:", err.Error()) + return err + } + if user.Uid != "0" { + cclog.ComponentError(m.name, "Lustre file system statistics can only be queried by 
user root:", err.Error()) + return err + } + + m.matches = make(map[string]map[string]int) + for lineprefix, names := range defmatches { + for metricname, offset := range names { + _, skip := stringArrayContains(m.config.ExcludeMetrics, metricname) + if skip { + continue + } + if _, prefixExist := m.matches[lineprefix]; !prefixExist { + m.matches[lineprefix] = make(map[string]int) + } + if _, metricExist := m.matches[lineprefix][metricname]; !metricExist { + m.matches[lineprefix][metricname] = offset + } + } + } p, err := exec.LookPath(m.config.LCtlCommand) if err != nil { p, err = exec.LookPath(LCTL_CMD) @@ -110,10 +137,19 @@ func (m *LustreCollector) Init(config json.RawMessage) error { } m.lctl = p - m.devices = m.getDevices() - if len(m.devices) == 0 { + devices := m.getDevices() + if len(devices) == 0 { return errors.New("no metrics to collect") } + m.stats = make(map[string]map[string]int64) + for _, d := range devices { + m.stats[d] = make(map[string]int64) + for _, names := range m.matches { + for metricname := range names { + m.stats[d][metricname] = 0 + } + } + } m.init = true return nil } @@ -122,34 +158,53 @@ func (m *LustreCollector) Read(interval time.Duration, output chan lp.CCMetric) if !m.init { return } - for _, device := range m.devices { + for device, devData := range m.stats { stats := m.getDeviceDataCommand(device) + processed := []string{} for _, line := range stats { lf := strings.Fields(line) if len(lf) > 1 { - for match, fields := range m.matches { - if lf[0] == match { - for name, idx := range fields { - _, skip := stringArrayContains(m.config.ExcludeMetrics, name) - if skip { - continue + if fields, ok := m.matches[lf[0]]; ok { + for name, idx := range fields { + x, err := strconv.ParseInt(lf[idx], 0, 64) + if err != nil { + continue + } + value := x - devData[name] + devData[name] = x + if value < 0 { + value = 0 + } + y, err := lp.New(name, m.tags, m.meta, map[string]interface{}{"value": value}, time.Now()) + if err == nil { + 
y.AddTag("device", device) + if strings.Contains(name, "byte") { + y.AddMeta("unit", "Byte") } - x, err := strconv.ParseInt(lf[idx], 0, 64) - if err == nil { - y, err := lp.New(name, m.tags, m.meta, map[string]interface{}{"value": x}, time.Now()) - if err == nil { - if strings.Contains(name, "byte") { - y.AddMeta("unit", "Byte") - } - output <- y - } + output <- y + if m.config.SendAllMetrics { + processed = append(processed, name) } } } } } } + if m.config.SendAllMetrics { + for name := range devData { + if _, done := stringArrayContains(processed, name); !done { + y, err := lp.New(name, m.tags, m.meta, map[string]interface{}{"value": 0}, time.Now()) + if err == nil { + y.AddTag("device", device) + if strings.Contains(name, "byte") { + y.AddMeta("unit", "Byte") + } + output <- y + } + } + } + } } } From ead7117cadb86aae8023aa854d07f164a7353720 Mon Sep 17 00:00:00 2001 From: Holger Obermaier <40787752+ho-ob@users.noreply.github.com> Date: Mon, 7 Feb 2022 13:30:01 +0100 Subject: [PATCH 094/174] Add skip_filesystem configuration --- collectors.json | 10 +- collectors/gpfsMetric.go | 287 ++++++++++++++++++++------------------- 2 files changed, 159 insertions(+), 138 deletions(-) diff --git a/collectors.json b/collectors.json index df2fce3..563ff05 100644 --- a/collectors.json +++ b/collectors.json @@ -1,4 +1,12 @@ { + "cpufreq": {}, + "cpufreq_cpuinfo": {}, + "gpfs": { + "exclude_filesystem": [ "test_fs" ] + }, + "loadavg": { + "exclude_metrics": [ "proc_total" ] + } "tempstat": { "tag_override": { "hwmon0" : { @@ -10,6 +18,4 @@ "type-id" : "1" } } - } - } diff --git a/collectors/gpfsMetric.go b/collectors/gpfsMetric.go index ffb2fac..adbc7fb 100644 --- a/collectors/gpfsMetric.go +++ b/collectors/gpfsMetric.go @@ -21,8 +21,10 @@ type GpfsCollector struct { metricCollector tags map[string]string config struct { - Mmpmon string `json:"mmpmon"` + Mmpmon string `json:"mmpmon_path,omitempty"` + ExcludeFilesystem []string `json:"exclude_filesystem,omitempty"` } + skipFS 
map[string]struct{} } func (m *GpfsCollector) Init(config json.RawMessage) error { @@ -54,6 +56,10 @@ func (m *GpfsCollector) Init(config json.RawMessage) error { "type": "node", "filesystem": "", } + m.skipFS = make(map[string]struct{}) + for _, fs := range m.config.ExcludeFilesystem { + m.skipFS[fs] = struct{}{} + } // GPFS / IBM Spectrum Scale file system statistics can only be queried by user root user, err := user.Current() @@ -108,154 +114,163 @@ func (m *GpfsCollector) Read(interval time.Duration, output chan lp.CCMetric) { scanner := bufio.NewScanner(cmdStdout) for scanner.Scan() { lineSplit := strings.Fields(scanner.Text()) - if lineSplit[0] == "_fs_io_s_" { - key_value := make(map[string]string) - for i := 1; i < len(lineSplit); i += 2 { - key_value[lineSplit[i]] = lineSplit[i+1] - } - // Ignore keys: - // _n_: node IP address, - // _nn_: node name, - // _cl_: cluster name, - // _d_: number of disks + // Only process lines starting with _fs_io_s_ + if lineSplit[0] != "_fs_io_s_" { + continue + } - filesystem, ok := key_value["_fs_"] - if !ok { - cclog.ComponentError( - m.name, - "Read(): Failed to get filesystem name.") - continue - } + key_value := make(map[string]string) + for i := 1; i < len(lineSplit); i += 2 { + key_value[lineSplit[i]] = lineSplit[i+1] + } - m.tags["filesystem"] = filesystem + // Ignore keys: + // _n_: node IP address, + // _nn_: node name, + // _cl_: cluster name, + // _d_: number of disks - // return code - rc, err := strconv.Atoi(key_value["_rc_"]) - if err != nil { - cclog.ComponentError( - m.name, - fmt.Sprintf("Read(): Failed to convert return code '%s' to int: %v", key_value["_rc_"], err)) - continue - } - if rc != 0 { - cclog.ComponentError( - m.name, - fmt.Sprintf("Read(): Filesystem '%s' is not ok.", filesystem)) - continue - } + filesystem, ok := key_value["_fs_"] + if !ok { + cclog.ComponentError( + m.name, + "Read(): Failed to get filesystem name.") + continue + } - sec, err := strconv.ParseInt(key_value["_t_"], 10, 64) 
- if err != nil { - cclog.ComponentError( - m.name, - fmt.Sprintf("Read(): Failed to convert seconds '%s' to int64: %v", key_value["_t_"], err)) - continue - } - msec, err := strconv.ParseInt(key_value["_tu_"], 10, 64) - if err != nil { - cclog.ComponentError( - m.name, - fmt.Sprintf("Read(): Failed to convert micro seconds '%s' to int64: %v", key_value["_tu_"], err)) - continue - } - timestamp := time.Unix(sec, msec*1000) + // Skip excluded filesystems + if _, skip := m.skipFS[filesystem]; skip { + continue + } - // bytes read - bytesRead, err := strconv.ParseInt(key_value["_br_"], 10, 64) - if err != nil { - cclog.ComponentError( - m.name, - fmt.Sprintf("Read(): Failed to convert bytes read '%s' to int64: %v", key_value["_br_"], err)) - continue - } - if y, err := lp.New("gpfs_bytes_read", m.tags, m.meta, map[string]interface{}{"value": bytesRead}, timestamp); err == nil { - output <- y - } + m.tags["filesystem"] = filesystem - // bytes written - bytesWritten, err := strconv.ParseInt(key_value["_bw_"], 10, 64) - if err != nil { - cclog.ComponentError( - m.name, - fmt.Sprintf("Read(): Failed to convert bytes written '%s' to int64: %v", key_value["_bw_"], err)) - continue - } - if y, err := lp.New("gpfs_bytes_written", m.tags, m.meta, map[string]interface{}{"value": bytesWritten}, timestamp); err == nil { - output <- y - } + // return code + rc, err := strconv.Atoi(key_value["_rc_"]) + if err != nil { + cclog.ComponentError( + m.name, + fmt.Sprintf("Read(): Failed to convert return code '%s' to int: %v", key_value["_rc_"], err)) + continue + } + if rc != 0 { + cclog.ComponentError( + m.name, + fmt.Sprintf("Read(): Filesystem '%s' is not ok.", filesystem)) + continue + } - // number of opens - numOpens, err := strconv.ParseInt(key_value["_oc_"], 10, 64) - if err != nil { - cclog.ComponentError( - m.name, - fmt.Sprintf("Read(): Failed to convert number of opens '%s' to int64: %v", key_value["_oc_"], err)) - continue - } - if y, err := lp.New("gpfs_num_opens", m.tags, 
m.meta, map[string]interface{}{"value": numOpens}, timestamp); err == nil { - output <- y - } + sec, err := strconv.ParseInt(key_value["_t_"], 10, 64) + if err != nil { + cclog.ComponentError( + m.name, + fmt.Sprintf("Read(): Failed to convert seconds '%s' to int64: %v", key_value["_t_"], err)) + continue + } + msec, err := strconv.ParseInt(key_value["_tu_"], 10, 64) + if err != nil { + cclog.ComponentError( + m.name, + fmt.Sprintf("Read(): Failed to convert micro seconds '%s' to int64: %v", key_value["_tu_"], err)) + continue + } + timestamp := time.Unix(sec, msec*1000) - // number of closes - numCloses, err := strconv.ParseInt(key_value["_cc_"], 10, 64) - if err != nil { - cclog.ComponentError( - m.name, - fmt.Sprintf("Read(): Failed to convert number of closes: '%s' to int64: %v", key_value["_cc_"], err)) - continue - } - if y, err := lp.New("gpfs_num_closes", m.tags, m.meta, map[string]interface{}{"value": numCloses}, timestamp); err == nil { - output <- y - } + // bytes read + bytesRead, err := strconv.ParseInt(key_value["_br_"], 10, 64) + if err != nil { + cclog.ComponentError( + m.name, + fmt.Sprintf("Read(): Failed to convert bytes read '%s' to int64: %v", key_value["_br_"], err)) + continue + } + if y, err := lp.New("gpfs_bytes_read", m.tags, m.meta, map[string]interface{}{"value": bytesRead}, timestamp); err == nil { + output <- y + } - // number of reads - numReads, err := strconv.ParseInt(key_value["_rdc_"], 10, 64) - if err != nil { - cclog.ComponentError( - m.name, - fmt.Sprintf("Read(): Failed to convert number of reads: '%s' to int64: %v", key_value["_rdc_"], err)) - continue - } - if y, err := lp.New("gpfs_num_reads", m.tags, m.meta, map[string]interface{}{"value": numReads}, timestamp); err == nil { - output <- y - } + // bytes written + bytesWritten, err := strconv.ParseInt(key_value["_bw_"], 10, 64) + if err != nil { + cclog.ComponentError( + m.name, + fmt.Sprintf("Read(): Failed to convert bytes written '%s' to int64: %v", key_value["_bw_"], 
err)) + continue + } + if y, err := lp.New("gpfs_bytes_written", m.tags, m.meta, map[string]interface{}{"value": bytesWritten}, timestamp); err == nil { + output <- y + } - // number of writes - numWrites, err := strconv.ParseInt(key_value["_wc_"], 10, 64) - if err != nil { - cclog.ComponentError( - m.name, - fmt.Sprintf("Read(): Failed to convert number of writes: '%s' to int64: %v", key_value["_wc_"], err)) - continue - } - if y, err := lp.New("gpfs_num_writes", m.tags, m.meta, map[string]interface{}{"value": numWrites}, timestamp); err == nil { - output <- y - } + // number of opens + numOpens, err := strconv.ParseInt(key_value["_oc_"], 10, 64) + if err != nil { + cclog.ComponentError( + m.name, + fmt.Sprintf("Read(): Failed to convert number of opens '%s' to int64: %v", key_value["_oc_"], err)) + continue + } + if y, err := lp.New("gpfs_num_opens", m.tags, m.meta, map[string]interface{}{"value": numOpens}, timestamp); err == nil { + output <- y + } - // number of read directories - numReaddirs, err := strconv.ParseInt(key_value["_dir_"], 10, 64) - if err != nil { - cclog.ComponentError( - m.name, - fmt.Sprintf("Read(): Failed to convert number of read directories: '%s' to int64: %v", key_value["_dir_"], err)) - continue - } - if y, err := lp.New("gpfs_num_readdirs", m.tags, m.meta, map[string]interface{}{"value": numReaddirs}, timestamp); err == nil { - output <- y - } + // number of closes + numCloses, err := strconv.ParseInt(key_value["_cc_"], 10, 64) + if err != nil { + cclog.ComponentError( + m.name, + fmt.Sprintf("Read(): Failed to convert number of closes: '%s' to int64: %v", key_value["_cc_"], err)) + continue + } + if y, err := lp.New("gpfs_num_closes", m.tags, m.meta, map[string]interface{}{"value": numCloses}, timestamp); err == nil { + output <- y + } - // Number of inode updates - numInodeUpdates, err := strconv.ParseInt(key_value["_iu_"], 10, 64) - if err != nil { - cclog.ComponentError( - m.name, - fmt.Sprintf("Read(): Failed to convert number of 
inode updates: '%s' to int: %v", key_value["_iu_"], err)) - continue - } - if y, err := lp.New("gpfs_num_inode_updates", m.tags, m.meta, map[string]interface{}{"value": numInodeUpdates}, timestamp); err == nil { - output <- y - } + // number of reads + numReads, err := strconv.ParseInt(key_value["_rdc_"], 10, 64) + if err != nil { + cclog.ComponentError( + m.name, + fmt.Sprintf("Read(): Failed to convert number of reads: '%s' to int64: %v", key_value["_rdc_"], err)) + continue + } + if y, err := lp.New("gpfs_num_reads", m.tags, m.meta, map[string]interface{}{"value": numReads}, timestamp); err == nil { + output <- y + } + + // number of writes + numWrites, err := strconv.ParseInt(key_value["_wc_"], 10, 64) + if err != nil { + cclog.ComponentError( + m.name, + fmt.Sprintf("Read(): Failed to convert number of writes: '%s' to int64: %v", key_value["_wc_"], err)) + continue + } + if y, err := lp.New("gpfs_num_writes", m.tags, m.meta, map[string]interface{}{"value": numWrites}, timestamp); err == nil { + output <- y + } + + // number of read directories + numReaddirs, err := strconv.ParseInt(key_value["_dir_"], 10, 64) + if err != nil { + cclog.ComponentError( + m.name, + fmt.Sprintf("Read(): Failed to convert number of read directories: '%s' to int64: %v", key_value["_dir_"], err)) + continue + } + if y, err := lp.New("gpfs_num_readdirs", m.tags, m.meta, map[string]interface{}{"value": numReaddirs}, timestamp); err == nil { + output <- y + } + + // Number of inode updates + numInodeUpdates, err := strconv.ParseInt(key_value["_iu_"], 10, 64) + if err != nil { + cclog.ComponentError( + m.name, + fmt.Sprintf("Read(): Failed to convert number of inode updates: '%s' to int: %v", key_value["_iu_"], err)) + continue + } + if y, err := lp.New("gpfs_num_inode_updates", m.tags, m.meta, map[string]interface{}{"value": numInodeUpdates}, timestamp); err == nil { + output <- y } } } From 8a69f76093bd66fc3b9a7ad8a9431d0e9bdac1fd Mon Sep 17 00:00:00 2001 From: Holger Obermaier 
<40787752+ho-ob@users.noreply.github.com> Date: Mon, 7 Feb 2022 13:53:51 +0100 Subject: [PATCH 095/174] Removed PID file creation, as not required by systemd --- README.md | 2 -- metric-collector.go | 29 ----------------------------- scripts/cc-metric-collector.config | 3 --- scripts/cc-metric-collector.service | 6 +----- 4 files changed, 1 insertion(+), 39 deletions(-) diff --git a/README.md b/README.md index 158cc0c..9b628fc 100644 --- a/README.md +++ b/README.md @@ -54,8 +54,6 @@ Usage of metric-collector: Path for logfile (default "stderr") -once Run all collectors only once - -pidfile string - Path for PID file (default "/var/run/cc-metric-collector.pid") ``` diff --git a/metric-collector.go b/metric-collector.go index 066fe3c..e6388df 100644 --- a/metric-collector.go +++ b/metric-collector.go @@ -24,7 +24,6 @@ import ( type CentralConfigFile struct { Interval int `json:"interval"` Duration int `json:"duration"` - Pidfile string `json:"pidfile,omitempty"` CollectorConfigFile string `json:"collectors"` RouterConfigFile string `json:"router"` SinkConfigFile string `json:"sinks"` @@ -87,14 +86,12 @@ func ReadCli() map[string]string { var m map[string]string cfg := flag.String("config", "./config.json", "Path to configuration file") logfile := flag.String("log", "stderr", "Path for logfile") - pidfile := flag.String("pidfile", "/var/run/cc-metric-collector.pid", "Path for PID file") once := flag.Bool("once", false, "Run all collectors only once") debug := flag.Bool("debug", false, "Activate debug output") flag.Parse() m = make(map[string]string) m["configfile"] = *cfg m["logfile"] = *logfile - m["pidfile"] = *pidfile if *once { m["once"] = "true" } else { @@ -125,25 +122,6 @@ func ReadCli() map[string]string { // return nil //} -//func CreatePidfile(pidfile string) error { -// file, err := os.OpenFile(pidfile, os.O_CREATE|os.O_RDWR, 0600) -// if err != nil { -// log.Print(err) -// return err -// } -// file.Write([]byte(fmt.Sprintf("%d", os.Getpid()))) -// 
file.Close() -// return nil -//} - -//func RemovePidfile(pidfile string) error { -// info, err := os.Stat(pidfile) -// if !os.IsNotExist(err) && !info.IsDir() { -// os.Remove(pidfile) -// } -// return nil -//} - // General shutdownHandler function that gets executed in case of interrupt or graceful shutdownHandler func shutdownHandler(config *RuntimeConfig, shutdownSignal chan os.Signal) { defer config.Sync.Done() @@ -174,11 +152,6 @@ func shutdownHandler(config *RuntimeConfig, shutdownSignal chan os.Signal) { cclog.Debug("Shutdown SinkManager...") config.SinkManager.Close() } - - // pidfile := config.ConfigFile.Pidfile - // RemovePidfile(pidfile) - // pidfile = config.CliArgs["pidfile"] - // RemovePidfile(pidfile) } func mainFunc() int { @@ -226,8 +199,6 @@ func mainFunc() int { return 1 } - // err = CreatePidfile(rcfg.CliArgs["pidfile"]) - // Set log file if logfile := rcfg.CliArgs["logfile"]; logfile != "stderr" { cclog.SetOutput(logfile) diff --git a/scripts/cc-metric-collector.config b/scripts/cc-metric-collector.config index 4f98a30..3535ddf 100644 --- a/scripts/cc-metric-collector.config +++ b/scripts/cc-metric-collector.config @@ -15,6 +15,3 @@ CONF_DIR=/etc/cc-metric-collector CONF_FILE=/etc/cc-metric-collector/cc-metric-collector.json RESTART_ON_UPGRADE=true - -# Only used on systemd systems -PID_FILE_DIR=/var/run diff --git a/scripts/cc-metric-collector.service b/scripts/cc-metric-collector.service index c9ed318..d40580c 100644 --- a/scripts/cc-metric-collector.service +++ b/scripts/cc-metric-collector.service @@ -14,11 +14,7 @@ Restart=on-failure WorkingDirectory=/tmp RuntimeDirectory=cc-metric-collector RuntimeDirectoryMode=0750 -ExecStart=/usr/sbin/cc-metric-collector \ - --config=${CONF_FILE} \ - --pidfile=${PID_FILE_DIR}/cc-metric-collector.pid - - +ExecStart=/usr/sbin/cc-metric-collector --config=${CONF_FILE} LimitNOFILE=10000 TimeoutStopSec=20 UMask=0027 From ca081139dbc9f119633d55b497a178dedff8bdef Mon Sep 17 00:00:00 2001 From: Holger Obermaier 
<40787752+ho-ob@users.noreply.github.com> Date: Mon, 7 Feb 2022 14:17:59 +0100 Subject: [PATCH 096/174] Fixed JSON syntax --- collectors.json | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/collectors.json b/collectors.json index 563ff05..8a06608 100644 --- a/collectors.json +++ b/collectors.json @@ -6,7 +6,7 @@ }, "loadavg": { "exclude_metrics": [ "proc_total" ] - } + }, "tempstat": { "tag_override": { "hwmon0" : { @@ -18,4 +18,5 @@ "type-id" : "1" } } + } } From 0bd638f211e8a20a9ac5c268b94df03b150ff019 Mon Sep 17 00:00:00 2001 From: Thomas Roehl Date: Mon, 7 Feb 2022 15:42:25 +0100 Subject: [PATCH 097/174] Add script to help create configs for LikwidCollector --- scripts/likwid_perfgroup_to_cc_config.py | 83 ++++++++++++++++++++++++ 1 file changed, 83 insertions(+) create mode 100755 scripts/likwid_perfgroup_to_cc_config.py diff --git a/scripts/likwid_perfgroup_to_cc_config.py b/scripts/likwid_perfgroup_to_cc_config.py new file mode 100755 index 0000000..52959ed --- /dev/null +++ b/scripts/likwid_perfgroup_to_cc_config.py @@ -0,0 +1,83 @@ +#!/usr/bin/env python3 + +import os, os.path, sys, getopt, re, json + +def which(cmd): + ospath = os.environ.get("PATH", "") + for p in ospath.split(":"): + testcmd = os.path.join(p, cmd) + if os.access(testcmd, os.X_OK): + return testcmd + return None + +def group_to_json(groupfile): + gdata = [] + with open(groupfile, "r") as fp: + gdata = fp.read().strip().split("\n") + events = {} + metrics = [] + parse_events = False + parse_metrics = False + for line in gdata: + if line == "EVENTSET": + parse_events = True + parse_metrics = False + continue + if line == "METRICS": + parse_events = False + parse_metrics = True + continue + if len(line) == 0 or line.startswith("SHORT") or line == "LONG": + parse_events = False + parse_metrics = False + continue + if parse_events: + m = re.match("([\w\d]+)\s+([\w\d_]+)", line) + if m: + events[m.group(1)] = m.group(2) + if parse_metrics: + llist = re.split("\s+", line) + 
calc = llist[-1] + metric = " ".join(llist[:-1]) + scope = "hwthread" + if "BOX" in calc: + scope = "socket" + if "PWR" in calc: + scope = "socket" + + m = {"name" : metric, "calc": calc, "scope" : scope, "publish" : True} + metrics.append(m) + return {"events" : events, "metrics" : metrics} + +if len(sys.argv) != 3: + print("Usage: $0 ") + sys.exit(1) + + +arch = sys.argv[1] +group = sys.argv[2] + +ltopo = which("likwid-topology") +if not ltopo: + print("Cannot find LIKWID installation. Please add LIKWID bin folder to your PATH.") + sys.exit(1) + +bindir = os.path.dirname(ltopo) + +groupdir = os.path.normpath(os.path.join(bindir, "../share/likwid/perfgroups")) +if not os.path.exists(groupdir): + print("Cannot find LIKWID performance groups in default install location") + sys.exit(1) + +archdir = os.path.join(groupdir, arch) +if not os.path.exists(archdir): + print("Cannot find LIKWID performance groups for architecture {}".format(arch)) + sys.exit(1) + +groupfile = os.path.join(archdir, "{}.txt".format(group)) +if not os.path.exists(groupfile): + print("Cannot find LIKWID performance group {} for architecture {}".format(group, arch)) + sys.exit(1) + +gdata = group_to_json(groupfile) +print(json.dumps(gdata, sort_keys=True, indent=2)) From 5263a974d1858ef7837b6b4dfd7187a199fff40d Mon Sep 17 00:00:00 2001 From: Thomas Gruber Date: Mon, 7 Feb 2022 15:43:01 +0100 Subject: [PATCH 098/174] Split NfsCollector in Nfs3Collector and Nfs4Collector (#28) * Split NfsCollector in Nfs3Collector and Nfs4Collector * Add documentation --- collectors/README.md | 2 + collectors/collectorManager.go | 3 +- collectors/nfs3Metric.md | 39 +++++++++ collectors/nfs4Metric.md | 62 +++++++++++++++ collectors/nfsMetric.go | 141 ++++++++++++++++++++------------- 5 files changed, 189 insertions(+), 58 deletions(-) create mode 100644 collectors/nfs3Metric.md create mode 100644 collectors/nfs4Metric.md diff --git a/collectors/README.md b/collectors/README.md index cabb74a..a79fa03 100644 --- 
a/collectors/README.md +++ b/collectors/README.md @@ -29,6 +29,8 @@ In contrast to the configuration files for sinks and receivers, the collectors c * [`customcmd`](./customCmdMetric.md) * [`ipmistat`](./ipmiMetric.md) * [`topprocs`](./topprocsMetric.md) +* [`nfs3stat`](./nfs3Metric.md) +* [`nfs4stat`](./nfs4Metric.md) ## Todos diff --git a/collectors/collectorManager.go b/collectors/collectorManager.go index 62ea4d2..7918793 100644 --- a/collectors/collectorManager.go +++ b/collectors/collectorManager.go @@ -31,7 +31,8 @@ var AvailableCollectors = map[string]MetricCollector{ "gpfs": new(GpfsCollector), "cpufreq": new(CPUFreqCollector), "cpufreq_cpuinfo": new(CPUFreqCpuInfoCollector), - "nfsstat": new(NfsCollector), + "nfs3stat": new(Nfs3Collector), + "nfs4stat": new(Nfs4Collector), "numastats": new(NUMAStatsCollector), } diff --git a/collectors/nfs3Metric.md b/collectors/nfs3Metric.md new file mode 100644 index 0000000..63937ea --- /dev/null +++ b/collectors/nfs3Metric.md @@ -0,0 +1,39 @@ + +## `nfs3stat` collector + +```json + "nfs3stat": { + "nfsstat" : "/path/to/nfsstat", + "exclude_metrics": [ + "nfs3_total" + ] + } +``` + +The `nfs3stat` collector reads data from `nfsstat` command and outputs a handful **node** metrics. If a metric is not required, it can be excluded from forwarding it to the sink. There is currently no possibility to get the metrics per mount point. 
+ + +Metrics: +* `nfs3_total` +* `nfs3_null` +* `nfs3_getattr` +* `nfs3_setattr` +* `nfs3_lookup` +* `nfs3_access` +* `nfs3_readlink` +* `nfs3_read` +* `nfs3_write` +* `nfs3_create` +* `nfs3_mkdir` +* `nfs3_symlink` +* `nfs3_remove` +* `nfs3_rmdir` +* `nfs3_rename` +* `nfs3_link` +* `nfs3_readdir` +* `nfs3_readdirplus` +* `nfs3_fsstat` +* `nfs3_fsinfo` +* `nfs3_pathconf` +* `nfs3_commit` + diff --git a/collectors/nfs4Metric.md b/collectors/nfs4Metric.md new file mode 100644 index 0000000..71d9613 --- /dev/null +++ b/collectors/nfs4Metric.md @@ -0,0 +1,62 @@ + +## `nfs4stat` collector + +```json + "nfs4stat": { + "nfsstat" : "/path/to/nfsstat", + "exclude_metrics": [ + "nfs4_total" + ] + } +``` + +The `nfs4stat` collector reads data from `nfsstat` command and outputs a handful **node** metrics. If a metric is not required, it can be excluded from forwarding it to the sink. There is currently no possibility to get the metrics per mount point. + + +Metrics: +* `nfs4_total` +* `nfs4_null` +* `nfs4_read` +* `nfs4_write` +* `nfs4_commit` +* `nfs4_open` +* `nfs4_open_conf` +* `nfs4_open_noat` +* `nfs4_open_dgrd` +* `nfs4_close` +* `nfs4_setattr` +* `nfs4_fsinfo` +* `nfs4_renew` +* `nfs4_setclntid` +* `nfs4_confirm` +* `nfs4_lock` +* `nfs4_lockt` +* `nfs4_locku` +* `nfs4_access` +* `nfs4_getattr` +* `nfs4_lookup` +* `nfs4_lookup_root` +* `nfs4_remove` +* `nfs4_rename` +* `nfs4_link` +* `nfs4_symlink` +* `nfs4_create` +* `nfs4_pathconf` +* `nfs4_statfs` +* `nfs4_readlink` +* `nfs4_readdir` +* `nfs4_server_caps` +* `nfs4_delegreturn` +* `nfs4_getacl` +* `nfs4_setacl` +* `nfs4_rel_lkowner` +* `nfs4_exchange_id` +* `nfs4_create_session` +* `nfs4_destroy_session` +* `nfs4_sequence` +* `nfs4_get_lease_time` +* `nfs4_reclaim_comp` +* `nfs4_secinfo_no` +* `nfs4_bind_conn_to_ses` + + diff --git a/collectors/nfsMetric.go b/collectors/nfsMetric.go index 16a6d23..07e684d 100644 --- a/collectors/nfsMetric.go +++ b/collectors/nfsMetric.go @@ -14,23 +14,29 @@ import ( lp 
"github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric" ) +// First part contains the code for the general NfsCollector. +// Later, the general NfsCollector is more limited to Nfs3- and Nfs4Collector. + +const NFSSTAT_EXEC = `nfsstat` + type NfsCollectorData struct { current int64 last int64 } -type NfsCollector struct { +type nfsCollector struct { metricCollector - tags map[string]string - config struct { - Nfsutils string `json:"nfsutils"` + tags map[string]string + version string + config struct { + Nfsstats string `json:"nfsstat"` ExcludeMetrics []string `json:"exclude_metrics,omitempty"` } - data map[string]map[string]NfsCollectorData + data map[string]NfsCollectorData } -func (m *NfsCollector) initStats() error { - cmd := exec.Command(m.config.Nfsutils, "-l") +func (m *nfsCollector) initStats() error { + cmd := exec.Command(m.config.Nfsstats, `-l`) cmd.Wait() buffer, err := cmd.Output() if err == nil { @@ -39,17 +45,16 @@ func (m *NfsCollector) initStats() error { if len(lf) != 5 { continue } - if _, exist := m.data[lf[1]]; !exist { - m.data[lf[1]] = make(map[string]NfsCollectorData) - } - name := strings.Trim(lf[3], ":") - if _, exist := m.data[lf[1]][name]; !exist { - value, err := strconv.ParseInt(lf[4], 0, 64) - if err == nil { - x := m.data[lf[1]][name] - x.current = value - x.last = 0 - m.data[lf[1]][name] = x + if lf[1] == m.version { + name := strings.Trim(lf[3], ":") + if _, exist := m.data[name]; !exist { + value, err := strconv.ParseInt(lf[4], 0, 64) + if err == nil { + x := m.data[name] + x.current = value + x.last = 0 + m.data[name] = x + } } } } @@ -57,8 +62,8 @@ func (m *NfsCollector) initStats() error { return err } -func (m *NfsCollector) updateStats() error { - cmd := exec.Command(m.config.Nfsutils, "-l") +func (m *nfsCollector) updateStats() error { + cmd := exec.Command(m.config.Nfsstats, `-l`) cmd.Wait() buffer, err := cmd.Output() if err == nil { @@ -67,17 +72,16 @@ func (m *NfsCollector) updateStats() error { if len(lf) != 5 { 
continue } - if _, exist := m.data[lf[1]]; !exist { - m.data[lf[1]] = make(map[string]NfsCollectorData) - } - name := strings.Trim(lf[3], ":") - if _, exist := m.data[lf[1]][name]; exist { - value, err := strconv.ParseInt(lf[4], 0, 64) - if err == nil { - x := m.data[lf[1]][name] - x.last = x.current - x.current = value - m.data[lf[1]][name] = x + if lf[1] == m.version { + name := strings.Trim(lf[3], ":") + if _, exist := m.data[name]; exist { + value, err := strconv.ParseInt(lf[4], 0, 64) + if err == nil { + x := m.data[name] + x.last = x.current + x.current = value + m.data[name] = x + } } } } @@ -85,17 +89,11 @@ func (m *NfsCollector) updateStats() error { return err } -func (m *NfsCollector) Init(config json.RawMessage) error { - var err error - m.name = "NfsCollector" - m.setup() - - // Set default mmpmon binary - m.config.Nfsutils = "/usr/sbin/nfsstat" - +func (m *nfsCollector) MainInit(config json.RawMessage) error { + m.config.Nfsstats = string(NFSSTAT_EXEC) // Read JSON configuration if len(config) > 0 { - err = json.Unmarshal(config, &m.config) + err := json.Unmarshal(config, &m.config) if err != nil { log.Print(err.Error()) return err @@ -108,40 +106,69 @@ func (m *NfsCollector) Init(config json.RawMessage) error { m.tags = map[string]string{ "type": "node", } - // Check if mmpmon is in executable search path - _, err = exec.LookPath(m.config.Nfsutils) + // Check if nfsstat is in executable search path + _, err := exec.LookPath(m.config.Nfsstats) if err != nil { - return fmt.Errorf("NfsCollector.Init(): Failed to find nfsstat binary '%s': %v", m.config.Nfsutils, err) + return fmt.Errorf("NfsCollector.Init(): Failed to find nfsstat binary '%s': %v", m.config.Nfsstats, err) } - m.data = make(map[string]map[string]NfsCollectorData) + m.data = make(map[string]NfsCollectorData) m.initStats() m.init = true return nil } -func (m *NfsCollector) Read(interval time.Duration, output chan lp.CCMetric) { +func (m *nfsCollector) Read(interval time.Duration, output 
chan lp.CCMetric) { if !m.init { return } timestamp := time.Now() m.updateStats() + prefix := "" + switch m.version { + case "v3": + prefix = "nfs3" + case "v4": + prefix = "nfs4" + default: + prefix = "nfs" + } - for version, metrics := range m.data { - for name, data := range metrics { - if _, skip := stringArrayContains(m.config.ExcludeMetrics, name); skip { - continue - } - value := data.current - data.last - y, err := lp.New(fmt.Sprintf("nfs_%s", name), m.tags, m.meta, map[string]interface{}{"value": value}, timestamp) - if err == nil { - y.AddMeta("version", version) - output <- y - } + for name, data := range m.data { + if _, skip := stringArrayContains(m.config.ExcludeMetrics, name); skip { + continue + } + value := data.current - data.last + y, err := lp.New(fmt.Sprintf("%s_%s", prefix, name), m.tags, m.meta, map[string]interface{}{"value": value}, timestamp) + if err == nil { + y.AddMeta("version", m.version) + output <- y } } } -func (m *NfsCollector) Close() { +func (m *nfsCollector) Close() { m.init = false } + +type Nfs3Collector struct { + nfsCollector +} + +type Nfs4Collector struct { + nfsCollector +} + +func (m *Nfs3Collector) Init(config json.RawMessage) error { + m.name = "Nfs3Collector" + m.version = `v3` + m.setup() + return m.MainInit(config) +} + +func (m *Nfs4Collector) Init(config json.RawMessage) error { + m.name = "Nfs4Collector" + m.version = `v4` + m.setup() + return m.MainInit(config) +} From b19ae7a4db29a9f5e4bf6dd7a31652be26c8c11b Mon Sep 17 00:00:00 2001 From: Thomas Roehl Date: Mon, 7 Feb 2022 15:43:57 +0100 Subject: [PATCH 099/174] Fix initialization of InfinibandCollector --- collectors/infinibandMetric.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/collectors/infinibandMetric.go b/collectors/infinibandMetric.go index 63ae2ca..4c7615b 100644 --- a/collectors/infinibandMetric.go +++ b/collectors/infinibandMetric.go @@ -37,7 +37,7 @@ type InfinibandCollector struct { func (m *InfinibandCollector) Init(config 
json.RawMessage) error { // Check if already initialized - if !m.init { + if m.init { return nil } From d8ab3b0eb059e378d6480388af1027323a74cbbc Mon Sep 17 00:00:00 2001 From: Thomas Roehl Date: Mon, 7 Feb 2022 15:44:29 +0100 Subject: [PATCH 100/174] Use LookPath in IpmiCollector --- collectors/ipmiMetric.go | 31 +++++++++++++++++++------------ 1 file changed, 19 insertions(+), 12 deletions(-) diff --git a/collectors/ipmiMetric.go b/collectors/ipmiMetric.go index f4c5167..e59f407 100644 --- a/collectors/ipmiMetric.go +++ b/collectors/ipmiMetric.go @@ -9,11 +9,12 @@ import ( "strconv" "strings" "time" + lp "github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric" ) -const IPMITOOL_PATH = `/usr/bin/ipmitool` -const IPMISENSORS_PATH = `/usr/sbin/ipmi-sensors` +const IPMITOOL_PATH = `ipmitool` +const IPMISENSORS_PATH = `ipmi-sensors` type IpmiCollectorConfig struct { ExcludeDevices []string `json:"exclude_devices"` @@ -23,30 +24,36 @@ type IpmiCollectorConfig struct { type IpmiCollector struct { metricCollector - tags map[string]string - matches map[string]string - config IpmiCollectorConfig + //tags map[string]string + //matches map[string]string + config IpmiCollectorConfig + ipmitool string + ipmisensors string } func (m *IpmiCollector) Init(config json.RawMessage) error { m.name = "IpmiCollector" m.setup() m.meta = map[string]string{"source": m.name, "group": "IPMI"} + m.config.IpmitoolPath = string(IPMITOOL_PATH) + m.config.IpmisensorsPath = string(IPMISENSORS_PATH) + m.ipmitool = "" + m.ipmisensors = "" if len(config) > 0 { err := json.Unmarshal(config, &m.config) if err != nil { return err } } - _, err1 := os.Stat(m.config.IpmitoolPath) - _, err2 := os.Stat(m.config.IpmisensorsPath) - if err1 != nil { - m.config.IpmitoolPath = "" + p, err := exec.LookPath(m.config.IpmitoolPath) + if err == nil { + m.ipmitool = p } - if err2 != nil { - m.config.IpmisensorsPath = "" + p, err = exec.LookPath(m.config.IpmisensorsPath) + if err == nil { + m.ipmisensors = p } 
- if err1 != nil && err2 != nil { + if len(m.ipmitool) == 0 && len(m.ipmisensors) == 0 { return errors.New("No IPMI reader found") } m.init = true From 6dd95d6feddeb5174d2225fc4de9cc7daf4c091c Mon Sep 17 00:00:00 2001 From: Thomas Roehl Date: Mon, 7 Feb 2022 16:20:42 +0100 Subject: [PATCH 101/174] Export all ccMetric functions --- internal/ccMetric/ccMetric.go | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/internal/ccMetric/ccMetric.go b/internal/ccMetric/ccMetric.go index 9745e9d..0771a10 100644 --- a/internal/ccMetric/ccMetric.go +++ b/internal/ccMetric/ccMetric.go @@ -31,16 +31,22 @@ type CCMetric interface { Meta() map[string]string // Map of meta data tags MetaList() []*lp.Tag // Ordered list of meta data AddMeta(key, value string) // Add a meta data tag + HasMeta(key string) bool // Check a meta data tag GetMeta(key string) (string, bool) // Get a meta data tab addressed by its key + RemoveMeta(key string) // Remove a meta data tag by its key Tags() map[string]string // Map of tags + TagList() []*lp.Tag // Ordered list of tags AddTag(key, value string) // Add a tag GetTag(key string) (string, bool) // Get a tag by its key + HasTag(key string) bool // Check a tag RemoveTag(key string) // Remove a tag by its key GetField(key string) (interface{}, bool) // Get a field addressed by its key HasField(key string) bool // Check if a field key is present RemoveField(key string) // Remove a field addressed by its key + Fields() map[string]interface{} // Map of fields + FieldList() []*lp.Field // Ordered list of fields } // Meta returns the meta data tags as key-value mapping From d1e66201a61d4f09cd666a7d39161faaed044bd4 Mon Sep 17 00:00:00 2001 From: Thomas Gruber Date: Mon, 7 Feb 2022 16:51:46 +0100 Subject: [PATCH 102/174] Add non-blocking InfluxDB sink (#29) * Add non-blocking InfluxDB sink * Add configurable batch size --- sinks/influxAsyncSink.go | 119 +++++++++++++++++++++++++++++++++++++++ sinks/influxAsyncSink.md | 34 +++++++++++ sinks/influxSink.md | 
2 +- sinks/sinkManager.go | 11 ++-- 4 files changed, 160 insertions(+), 6 deletions(-) create mode 100644 sinks/influxAsyncSink.go create mode 100644 sinks/influxAsyncSink.md diff --git a/sinks/influxAsyncSink.go b/sinks/influxAsyncSink.go new file mode 100644 index 0000000..0763a4b --- /dev/null +++ b/sinks/influxAsyncSink.go @@ -0,0 +1,119 @@ +package sinks + +import ( + // "context" + "crypto/tls" + "encoding/json" + "errors" + "fmt" + + cclog "github.com/ClusterCockpit/cc-metric-collector/internal/ccLogger" + lp "github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric" + influxdb2 "github.com/influxdata/influxdb-client-go/v2" + influxdb2Api "github.com/influxdata/influxdb-client-go/v2/api" + "github.com/influxdata/influxdb-client-go/v2/api/write" +) + +type InfluxAsyncSinkConfig struct { + defaultSinkConfig + Host string `json:"host,omitempty"` + Port string `json:"port,omitempty"` + Database string `json:"database,omitempty"` + User string `json:"user,omitempty"` + Password string `json:"password,omitempty"` + Organization string `json:"organization,omitempty"` + SSL bool `json:"ssl,omitempty"` + RetentionPol string `json:"retention_policy,omitempty"` + BatchSize uint `json:"batch_size,omitempty"` +} + +type InfluxAsyncSink struct { + sink + client influxdb2.Client + writeApi influxdb2Api.WriteAPI + retPolicy string + errors <-chan error + config InfluxAsyncSinkConfig +} + +func (s *InfluxAsyncSink) connect() error { + var auth string + var uri string + if s.config.SSL { + uri = fmt.Sprintf("https://%s:%s", s.config.Host, s.config.Port) + } else { + uri = fmt.Sprintf("http://%s:%s", s.config.Host, s.config.Port) + } + if len(s.config.User) == 0 { + auth = s.config.Password + } else { + auth = fmt.Sprintf("%s:%s", s.config.User, s.config.Password) + } + cclog.ComponentDebug(s.name, "Using URI", uri, "Org", s.config.Organization, "Bucket", s.config.Database) + batch := s.config.BatchSize + if batch == 0 { + batch = 100 + } + s.client = 
influxdb2.NewClientWithOptions(uri, auth, + influxdb2.DefaultOptions().SetBatchSize(batch).SetTLSConfig(&tls.Config{ + InsecureSkipVerify: true, + })) + s.writeApi = s.client.WriteAPI(s.config.Organization, s.config.Database) + return nil +} + +func (s *InfluxAsyncSink) Init(config json.RawMessage) error { + s.name = "InfluxSink" + s.config.BatchSize = 100 + if len(config) > 0 { + err := json.Unmarshal(config, &s.config) + if err != nil { + return err + } + } + if len(s.config.Host) == 0 || + len(s.config.Port) == 0 || + len(s.config.Database) == 0 || + len(s.config.Organization) == 0 || + len(s.config.Password) == 0 { + return errors.New("not all configuration variables set required by InfluxAsyncSink") + } + err := s.connect() + s.errors = s.writeApi.Errors() + go func() { + for err := range s.errors { + cclog.ComponentError(s.name, err.Error()) + } + }() + return err +} + +func (s *InfluxAsyncSink) Write(point lp.CCMetric) error { + var p *write.Point + if s.config.MetaAsTags { + tags := map[string]string{} + for k, v := range point.Tags() { + tags[k] = v + } + for k, v := range point.Meta() { + tags[k] = v + } + p = influxdb2.NewPoint(point.Name(), tags, point.Fields(), point.Time()) + } else { + p = influxdb2.NewPoint(point.Name(), point.Tags(), point.Fields(), point.Time()) + } + + s.writeApi.WritePoint(p) + return nil +} + +func (s *InfluxAsyncSink) Flush() error { + s.writeApi.Flush() + return nil +} + +func (s *InfluxAsyncSink) Close() { + cclog.ComponentDebug(s.name, "Closing InfluxDB connection") + s.writeApi.Flush() + s.client.Close() +} diff --git a/sinks/influxAsyncSink.md b/sinks/influxAsyncSink.md new file mode 100644 index 0000000..286c93c --- /dev/null +++ b/sinks/influxAsyncSink.md @@ -0,0 +1,34 @@ +## `influxasync` sink + +The `influxasync` sink uses the official [InfluxDB golang client](https://pkg.go.dev/github.com/influxdata/influxdb-client-go/v2) to write the metrics to an InfluxDB database in a **non-blocking** fashion. 
It provides only support for V2 write endpoints (InfluxDB 1.8.0 or later). + + +### Configuration structure + +```json +{ + "": { + "type": "influxasync", + "meta_as_tags" : true, + "database" : "mymetrics", + "host": "dbhost.example.com", + "port": "4222", + "user": "exampleuser", + "password" : "examplepw", + "organization": "myorg", + "ssl": true, + "batch_size": 200, + } +} +``` + +- `type`: makes the sink an `influxdb` sink +- `meta_as_tags`: print all meta information as tags in the output (optional) +- `database`: All metrics are written to this bucket +- `host`: Hostname of the InfluxDB database server +- `port`: Portnumber (as string) of the InfluxDB database server +- `user`: Username for basic authentification +- `password`: Password for basic authentification +- `organization`: Organization in the InfluxDB +- `ssl`: Use SSL connection +- `batch_size`: batch up metrics internally, default 100 \ No newline at end of file diff --git a/sinks/influxSink.md b/sinks/influxSink.md index 2624034..bd0f576 100644 --- a/sinks/influxSink.md +++ b/sinks/influxSink.md @@ -1,6 +1,6 @@ ## `influxdb` sink -The `influxdb` sink uses the official [InfluxDB golang client](https://pkg.go.dev/github.com/influxdata/influxdb-client-go/v2) to write the metrics to an InfluxDB database. It provides only support for V2 write endpoints (InfluxDB 1.8.0 or later). +The `influxdb` sink uses the official [InfluxDB golang client](https://pkg.go.dev/github.com/influxdata/influxdb-client-go/v2) to write the metrics to an InfluxDB database in a **blocking** fashion. It provides only support for V2 write endpoints (InfluxDB 1.8.0 or later). 
### Configuration structure diff --git a/sinks/sinkManager.go b/sinks/sinkManager.go index 21c392f..09b4fc4 100644 --- a/sinks/sinkManager.go +++ b/sinks/sinkManager.go @@ -12,11 +12,12 @@ import ( // Map of all available sinks var AvailableSinks = map[string]Sink{ - "influxdb": new(InfluxSink), - "stdout": new(StdoutSink), - "nats": new(NatsSink), - "http": new(HttpSink), - "ganglia": new(GangliaSink), + "influxdb": new(InfluxSink), + "stdout": new(StdoutSink), + "nats": new(NatsSink), + "http": new(HttpSink), + "ganglia": new(GangliaSink), + "influxasync": new(InfluxAsyncSink), } // Metric collector manager data structure From 7182b339b9386de81b4fba5492483e056f467e32 Mon Sep 17 00:00:00 2001 From: Thomas Roehl Date: Mon, 7 Feb 2022 17:41:35 +0100 Subject: [PATCH 103/174] Respect the publish option in the LikwidCollector --- collectors/likwidMetric.go | 36 ++++++++++++++++++++---------------- 1 file changed, 20 insertions(+), 16 deletions(-) diff --git a/collectors/likwidMetric.go b/collectors/likwidMetric.go index 957c4aa..1c91c1c 100644 --- a/collectors/likwidMetric.go +++ b/collectors/likwidMetric.go @@ -435,14 +435,16 @@ func (m *LikwidCollector) calcEventsetMetrics(group int, interval time.Duration, } m.mresults[group][tid][metric.Name] = value // Now we have the result, send it with the proper tags - tags := map[string]string{"type": metric.Scope.String()} - if metric.Scope != "node" { - tags["type-id"] = fmt.Sprintf("%d", domain) - } - fields := map[string]interface{}{"value": value} - y, err := lp.New(metric.Name, tags, m.meta, fields, time.Now()) - if err == nil { - output <- y + if metric.Publish { + tags := map[string]string{"type": metric.Scope.String()} + if metric.Scope != "node" { + tags["type-id"] = fmt.Sprintf("%d", domain) + } + fields := map[string]interface{}{"value": value} + y, err := lp.New(metric.Name, tags, m.meta, fields, time.Now()) + if err == nil { + output <- y + } } } } @@ -472,14 +474,16 @@ func (m *LikwidCollector) 
calcGlobalMetrics(interval time.Duration, output chan } m.gmresults[tid][metric.Name] = value // Now we have the result, send it with the proper tags - tags := map[string]string{"type": metric.Scope.String()} - if metric.Scope != "node" { - tags["type-id"] = fmt.Sprintf("%d", domain) - } - fields := map[string]interface{}{"value": value} - y, err := lp.New(metric.Name, tags, m.meta, fields, time.Now()) - if err == nil { - output <- y + if metric.Publish { + tags := map[string]string{"type": metric.Scope.String()} + if metric.Scope != "node" { + tags["type-id"] = fmt.Sprintf("%d", domain) + } + fields := map[string]interface{}{"value": value} + y, err := lp.New(metric.Name, tags, m.meta, fields, time.Now()) + if err == nil { + output <- y + } } } } From 7b104ebe9080ebad42e3c352308bbc43520518f1 Mon Sep 17 00:00:00 2001 From: Holger Obermaier <40787752+ho-ob@users.noreply.github.com> Date: Mon, 7 Feb 2022 18:00:02 +0100 Subject: [PATCH 104/174] Use cclog.ComponentDebug. Avoid copying point.Fields() --- sinks/influxSink.go | 24 +++++++++++++----------- 1 file changed, 13 insertions(+), 11 deletions(-) diff --git a/sinks/influxSink.go b/sinks/influxSink.go index bb35349..fcfc32f 100644 --- a/sinks/influxSink.go +++ b/sinks/influxSink.go @@ -6,8 +6,8 @@ import ( "encoding/json" "errors" "fmt" - "log" + cclog "github.com/ClusterCockpit/cc-metric-collector/internal/ccLogger" lp "github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric" influxdb2 "github.com/influxdata/influxdb-client-go/v2" influxdb2Api "github.com/influxdata/influxdb-client-go/v2/api" @@ -45,9 +45,15 @@ func (s *InfluxSink) connect() error { } else { auth = fmt.Sprintf("%s:%s", s.config.User, s.config.Password) } - log.Print("Using URI ", uri, " Org ", s.config.Organization, " Bucket ", s.config.Database) - s.client = influxdb2.NewClientWithOptions(uri, auth, - influxdb2.DefaultOptions().SetTLSConfig(&tls.Config{InsecureSkipVerify: true})) + cclog.ComponentDebug(s.name, "Using URI", uri, "Org", 
s.config.Organization, "Bucket", s.config.Database) + s.client = + influxdb2.NewClientWithOptions( + uri, + auth, + influxdb2.DefaultOptions().SetTLSConfig( + &tls.Config{InsecureSkipVerify: true}, + ), + ) s.writeApi = s.client.WriteAPIBlocking(s.config.Organization, s.config.Database) return nil } @@ -71,8 +77,7 @@ func (s *InfluxSink) Init(config json.RawMessage) error { } func (s *InfluxSink) Write(point lp.CCMetric) error { - tags := map[string]string{} - fields := map[string]interface{}{} + tags := make(map[string]string) for key, value := range point.Tags() { tags[key] = value } @@ -81,10 +86,7 @@ func (s *InfluxSink) Write(point lp.CCMetric) error { tags[key] = value } } - for _, f := range point.FieldList() { - fields[f.Key] = f.Value - } - p := influxdb2.NewPoint(point.Name(), tags, fields, point.Time()) + p := influxdb2.NewPoint(point.Name(), tags, point.Fields(), point.Time()) err := s.writeApi.WritePoint(context.Background(), p) return err } @@ -94,6 +96,6 @@ func (s *InfluxSink) Flush() error { } func (s *InfluxSink) Close() { - log.Print("Closing InfluxDB connection") + cclog.ComponentDebug(s.name, "Closing InfluxDB connection") s.client.Close() } From a6bec61b1ec49ce61cac95f5fed092fd28a4faed Mon Sep 17 00:00:00 2001 From: Thomas Roehl Date: Mon, 7 Feb 2022 18:35:08 +0100 Subject: [PATCH 105/174] LikwidCollector: Filter out NaNs or set them to zero if 'nan_to_zero' option is set --- collectors/likwidMetric.go | 16 +++++++++++----- collectors/likwidMetric.md | 6 ++++++ 2 files changed, 17 insertions(+), 5 deletions(-) diff --git a/collectors/likwidMetric.go b/collectors/likwidMetric.go index 1c91c1c..5153300 100644 --- a/collectors/likwidMetric.go +++ b/collectors/likwidMetric.go @@ -86,9 +86,9 @@ type LikwidCollectorEventsetConfig struct { type LikwidCollectorConfig struct { Eventsets []LikwidCollectorEventsetConfig `json:"eventsets"` - Metrics []LikwidCollectorMetricConfig `json:"globalmetrics"` - ExcludeMetrics []string `json:"exclude_metrics"` - 
ForceOverwrite bool `json:"force_overwrite"` + Metrics []LikwidCollectorMetricConfig `json:"globalmetrics,omitempty"` + ForceOverwrite bool `json:"force_overwrite,omitempty"` + NanToZero bool `json:"nan_to_zero,omitempty"` } type LikwidCollector struct { @@ -434,8 +434,11 @@ func (m *LikwidCollector) calcEventsetMetrics(group int, interval time.Duration, continue } m.mresults[group][tid][metric.Name] = value + if m.config.NanToZero && math.IsNaN(value) { + value = 0.0 + } // Now we have the result, send it with the proper tags - if metric.Publish { + if metric.Publish && !math.IsNaN(value) { tags := map[string]string{"type": metric.Scope.String()} if metric.Scope != "node" { tags["type-id"] = fmt.Sprintf("%d", domain) @@ -473,8 +476,11 @@ func (m *LikwidCollector) calcGlobalMetrics(interval time.Duration, output chan continue } m.gmresults[tid][metric.Name] = value + if m.config.NanToZero && math.IsNaN(value) { + value = 0.0 + } // Now we have the result, send it with the proper tags - if metric.Publish { + if metric.Publish && !math.IsNaN(value) { tags := map[string]string{"type": metric.Scope.String()} if metric.Scope != "node" { tags["type-id"] = fmt.Sprintf("%d", domain) diff --git a/collectors/likwidMetric.md b/collectors/likwidMetric.md index f8ac2d1..c01bf11 100644 --- a/collectors/likwidMetric.md +++ b/collectors/likwidMetric.md @@ -7,6 +7,10 @@ The `likwid` configuration consists of two parts, the "eventsets" and "globalmet - An event set list itself has two parts, the "events" and a set of derivable "metrics". Each of the "events" is a counter:event pair in LIKWID's syntax. The "metrics" are a list of formulas to derive the metric value from the measurements of the "events". Each metric has a name, the formula, a scope and a publish flag. A counter names can be used like variables in the formulas, so `PMC0+PMC1` sums the measurements for the both events configured in the counters `PMC0` and `PMC1`. 
The scope tells the Collector whether it is a metric for each hardware thread (`hwthread`) or each CPU socket (`socket`). The last one is the publishing flag. It tells the collector whether a metric should be sent to the router. - The global metrics are metrics which require data from all event set measurements to be derived. The inputs are the metrics in the event sets. Similar to the metrics in the event sets, the global metrics are defined by a name, a formula, a scope and a publish flag. See event set metrics for details. The only difference is that there is no access to the raw event measurements anymore but only to the metrics. So, the idea is to derive a metric in the "eventsets" section and reuse it in the "globalmetrics" part. If you need a metric only for deriving the global metrics, disable forwarding of the event set metrics. **Be aware** that the combination might be misleading because the "behavior" of a metric changes over time and the multiple measurements might count different computing phases. +Additional options: +- `force_overwrite`: Same as setting `LIKWID_FORCE=1`. In case counters are already in-use, LIKWID overwrites their configuration to do its measurements +- `nan_to_zero`: In some cases, the calculations result in `NaN`. With this option, all `NaN` values are replaces with `0.0`. + ### Available metric scopes Hardware performance counters are scattered all over the system nowadays. A counter coveres a specific part of the system. While there are hardware thread specific counter for CPU cycles, instructions and so on, some others are specific for a whole CPU socket/package. To address that, the collector provides the specification of a 'scope' for each metric. 
@@ -28,6 +32,8 @@ As a guideline: ```json "likwid": { + "force_overwrite" : false, + "nan_to_zero" : false, "eventsets": [ { "events": { From 627163d4dfecd0c7524b78306b364b798f6be4bf Mon Sep 17 00:00:00 2001 From: Holger Obermaier <40787752+ho-ob@users.noreply.github.com> Date: Mon, 7 Feb 2022 21:13:57 +0100 Subject: [PATCH 106/174] Add method ToLineProtocol which generates influxDB line protocol for data type ccMetric --- internal/ccMetric/ccMetric.go | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/internal/ccMetric/ccMetric.go b/internal/ccMetric/ccMetric.go index 0771a10..24f1350 100644 --- a/internal/ccMetric/ccMetric.go +++ b/internal/ccMetric/ccMetric.go @@ -5,6 +5,8 @@ import ( "sort" "time" + influxdb2 "github.com/influxdata/influxdb-client-go/v2" + write "github.com/influxdata/influxdb-client-go/v2/api/write" lp "github.com/influxdata/line-protocol" // MIT license ) @@ -70,6 +72,21 @@ func (m *ccMetric) String() string { return fmt.Sprintf("%s %v %v %v %d", m.name, m.tags, m.meta, m.Fields(), m.tm.UnixNano()) } +// ToLineProtocol generates influxDB line protocol for data type ccMetric +func (m *ccMetric) ToLineProtocol(metaAsTags bool) string { + tags := make(map[string]string) + for key, value := range m.tags { + tags[key] = value + } + if metaAsTags { + for key, value := range m.meta { + tags[key] = value + } + } + p := influxdb2.NewPoint(m.name, tags, m.Fields(), m.tm) + return write.PointToLineProtocol(p, time.Nanosecond) +} + // Name returns the measurement name func (m *ccMetric) Name() string { return m.name From fe42e8bb95af34d6ca29ce889cc3247cf044148e Mon Sep 17 00:00:00 2001 From: Holger Obermaier <40787752+ho-ob@users.noreply.github.com> Date: Mon, 7 Feb 2022 22:41:31 +0100 Subject: [PATCH 107/174] Switch fields data type from []*lp.Field to map[string]interface{} --- internal/ccMetric/ccMetric.go | 86 ++++++++++++----------------------- 1 file changed, 29 insertions(+), 57 deletions(-) diff --git 
a/internal/ccMetric/ccMetric.go b/internal/ccMetric/ccMetric.go index 24f1350..ee2cce5 100644 --- a/internal/ccMetric/ccMetric.go +++ b/internal/ccMetric/ccMetric.go @@ -16,11 +16,11 @@ import ( // // See: https://docs.influxdata.com/influxdb/latest/reference/syntax/line-protocol/ type ccMetric struct { - name string // Measurement name - meta map[string]string // map of meta data tags - tags map[string]string // map of of tags - fields []*lp.Field // unordered list of of fields - tm time.Time // timestamp + name string // Measurement name + meta map[string]string // map of meta data tags + tags map[string]string // map of of tags + fields map[string]interface{} // map of of fields + tm time.Time // timestamp } // ccmetric access functions @@ -48,7 +48,6 @@ type CCMetric interface { HasField(key string) bool // Check if a field key is present RemoveField(key string) // Remove a field addressed by its key Fields() map[string]interface{} // Map of fields - FieldList() []*lp.Field // Ordered list of fields } // Meta returns the meta data tags as key-value mapping @@ -69,7 +68,7 @@ func (m *ccMetric) MetaList() []*lp.Tag { // String implements the stringer interface for data type ccMetric func (m *ccMetric) String() string { - return fmt.Sprintf("%s %v %v %v %d", m.name, m.tags, m.meta, m.Fields(), m.tm.UnixNano()) + return fmt.Sprintf("%s %v %v %v %d", m.name, m.tags, m.meta, m.fields, m.tm.UnixNano()) } // ToLineProtocol generates influxDB line protocol for data type ccMetric @@ -83,7 +82,7 @@ func (m *ccMetric) ToLineProtocol(metaAsTags bool) string { tags[key] = value } } - p := influxdb2.NewPoint(m.name, tags, m.Fields(), m.tm) + p := influxdb2.NewPoint(m.name, tags, m.fields, m.tm) return write.PointToLineProtocol(p, time.Nanosecond) } @@ -113,17 +112,16 @@ func (m *ccMetric) TagList() []*lp.Tag { // Fields returns the list of fields as key-value-mapping func (m *ccMetric) Fields() map[string]interface{} { - fields := make(map[string]interface{}, len(m.fields)) - 
for _, field := range m.fields { - fields[field.Key] = field.Value - } - - return fields + return m.fields } // FieldList returns the list of fields func (m *ccMetric) FieldList() []*lp.Field { - return m.fields + fieldList := make([]*lp.Field, 0, len(m.fields)) + for key, value := range m.fields { + fieldList = append(fieldList, &lp.Field{Key: key, Value: value}) + } + return fieldList } // Time returns timestamp @@ -186,46 +184,25 @@ func (m *ccMetric) AddMeta(key, value string) { // AddField adds a field (consisting of key and value) to the unordered list of fields func (m *ccMetric) AddField(key string, value interface{}) { - for i, field := range m.fields { - if key == field.Key { - m.fields[i] = &lp.Field{Key: key, Value: convertField(value)} - return - } - } - m.fields = append(m.fields, &lp.Field{Key: key, Value: convertField(value)}) + m.fields[key] = value } // GetField returns the field with field's key equal to func (m *ccMetric) GetField(key string) (interface{}, bool) { - for _, field := range m.fields { - if field.Key == key { - return field.Value, true - } - } - return "", false + v, ok := m.fields[key] + return v, ok } // HasField checks if a field with field's key equal to is present in the list of fields func (m *ccMetric) HasField(key string) bool { - for _, field := range m.fields { - if field.Key == key { - return true - } - } - return false + _, ok := m.fields[key] + return ok } // RemoveField removes the field with field's key equal to // from the unordered list of fields func (m *ccMetric) RemoveField(key string) { - for i, field := range m.fields { - if field.Key == key { - copy(m.fields[i:], m.fields[i+1:]) - m.fields[len(m.fields)-1] = nil - m.fields = m.fields[:len(m.fields)-1] - return - } - } + delete(m.fields, key) } // New creates a new measurement point @@ -240,7 +217,7 @@ func New( name: name, tags: make(map[string]string, len(tags)), meta: make(map[string]string, len(meta)), - fields: make([]*lp.Field, 0, len(fields)), + fields: 
make(map[string]interface{}, len(fields)), tm: tm, } @@ -260,7 +237,7 @@ func New( if v == nil { continue } - m.AddField(k, v) + m.fields[k] = v } return m, nil @@ -271,20 +248,19 @@ func FromMetric(other ccMetric) CCMetric { m := &ccMetric{ name: other.Name(), tags: make(map[string]string), - fields: make([]*lp.Field, len(other.FieldList())), meta: make(map[string]string), + fields: make(map[string]interface{}), tm: other.Time(), } - for key, value := range other.Tags() { + for key, value := range other.tags { m.tags[key] = value } - for key, value := range other.Meta() { + for key, value := range other.meta { m.meta[key] = value } - - for i, field := range other.FieldList() { - m.fields[i] = &lp.Field{Key: field.Key, Value: field.Value} + for key, value := range other.fields { + m.fields[key] = value } return m } @@ -294,20 +270,16 @@ func FromInfluxMetric(other lp.Metric) CCMetric { m := &ccMetric{ name: other.Name(), tags: make(map[string]string), - fields: make([]*lp.Field, len(other.FieldList())), meta: make(map[string]string), + fields: make(map[string]interface{}), tm: other.Time(), } for _, otherTag := range other.TagList() { m.tags[otherTag.Key] = otherTag.Value } - - for i, otherField := range other.FieldList() { - m.fields[i] = &lp.Field{ - Key: otherField.Key, - Value: otherField.Value, - } + for _, otherField := range other.FieldList() { + m.fields[otherField.Key] = otherField.Value } return m } From af051b5e7eeb5f0dfc102455c53ff5af2745e2d6 Mon Sep 17 00:00:00 2001 From: Holger Obermaier <40787752+ho-ob@users.noreply.github.com> Date: Mon, 7 Feb 2022 22:52:39 +0100 Subject: [PATCH 108/174] Replace FieldList() by Fields() --- internal/metricRouter/metricRouter.go | 4 ++-- sinks/stdoutSink.go | 24 ++++++++++++------------ 2 files changed, 14 insertions(+), 14 deletions(-) diff --git a/internal/metricRouter/metricRouter.go b/internal/metricRouter/metricRouter.go index a31f2a6..5b254f8 100644 --- a/internal/metricRouter/metricRouter.go +++ 
b/internal/metricRouter/metricRouter.go @@ -146,8 +146,8 @@ func getParamMap(point lp.CCMetric) map[string]interface{} { for key, value := range point.Meta() { params[key] = value } - for _, f := range point.FieldList() { - params[f.Key] = f.Value + for key, value := range point.Fields() { + params[key] = value } params["timestamp"] = point.Time() return params diff --git a/sinks/stdoutSink.go b/sinks/stdoutSink.go index 2c9e710..ee51d40 100644 --- a/sinks/stdoutSink.go +++ b/sinks/stdoutSink.go @@ -59,28 +59,28 @@ func (s *StdoutSink) Write(point lp.CCMetric) error { tagsstr = append(tagsstr, fmt.Sprintf("%s=%s", m.Key, m.Value)) } } - for _, f := range point.FieldList() { - switch f.Value.(type) { + for key, value := range point.Fields() { + switch value.(type) { case float64: - if !math.IsNaN(f.Value.(float64)) { - fieldstr = append(fieldstr, fmt.Sprintf("%s=%v", f.Key, f.Value.(float64))) + if !math.IsNaN(value.(float64)) { + fieldstr = append(fieldstr, fmt.Sprintf("%s=%v", key, value.(float64))) } else { - fieldstr = append(fieldstr, fmt.Sprintf("%s=0.0", f.Key)) + fieldstr = append(fieldstr, fmt.Sprintf("%s=0.0", key)) } case float32: - if !math.IsNaN(float64(f.Value.(float32))) { - fieldstr = append(fieldstr, fmt.Sprintf("%s=%v", f.Key, f.Value.(float32))) + if !math.IsNaN(float64(value.(float32))) { + fieldstr = append(fieldstr, fmt.Sprintf("%s=%v", key, value.(float32))) } else { - fieldstr = append(fieldstr, fmt.Sprintf("%s=0.0", f.Key)) + fieldstr = append(fieldstr, fmt.Sprintf("%s=0.0", key)) } case int: - fieldstr = append(fieldstr, fmt.Sprintf("%s=%d", f.Key, f.Value.(int))) + fieldstr = append(fieldstr, fmt.Sprintf("%s=%d", key, value.(int))) case int64: - fieldstr = append(fieldstr, fmt.Sprintf("%s=%d", f.Key, f.Value.(int64))) + fieldstr = append(fieldstr, fmt.Sprintf("%s=%d", key, value.(int64))) case string: - fieldstr = append(fieldstr, fmt.Sprintf("%s=%q", f.Key, f.Value.(string))) + fieldstr = append(fieldstr, fmt.Sprintf("%s=%q", key, 
value.(string))) default: - fieldstr = append(fieldstr, fmt.Sprintf("%s=%v", f.Key, f.Value)) + fieldstr = append(fieldstr, fmt.Sprintf("%s=%v", key, value)) } } if len(tagsstr) > 0 { From e1a7379c2ee0f4c4fa0ad0d25667859c2b0d3a21 Mon Sep 17 00:00:00 2001 From: Holger Obermaier <40787752+ho-ob@users.noreply.github.com> Date: Tue, 8 Feb 2022 09:31:08 +0100 Subject: [PATCH 109/174] Generate influxDB point for data type ccMetric --- internal/ccMetric/ccMetric.go | 78 +++++++++++++++++++++-------------- sinks/influxAsyncSink.go | 21 ++-------- sinks/influxSink.go | 18 +++----- 3 files changed, 57 insertions(+), 60 deletions(-) diff --git a/internal/ccMetric/ccMetric.go b/internal/ccMetric/ccMetric.go index ee2cce5..ad2b2b1 100644 --- a/internal/ccMetric/ccMetric.go +++ b/internal/ccMetric/ccMetric.go @@ -25,29 +25,33 @@ type ccMetric struct { // ccmetric access functions type CCMetric interface { - lp.Metric // Time(), Name(), TagList(), FieldList() + lp.Metric // Time(), Name(), TagList(), FieldList() + ToLineProtocol(metaAsTags bool) string // Generate influxDB line protocol for data type ccMetric + ToPoint(metaAsTags bool) *write.Point // Generate influxDB point for data type ccMetric - SetName(name string) - SetTime(t time.Time) + SetName(name string) // Set metric name + SetTime(t time.Time) // Set timestamp - Meta() map[string]string // Map of meta data tags - MetaList() []*lp.Tag // Ordered list of meta data - AddMeta(key, value string) // Add a meta data tag - HasMeta(key string) bool // Check a meta data tag - GetMeta(key string) (string, bool) // Get a meta data tab addressed by its key - RemoveMeta(key string) // Remove a meta data tag by its key + Tags() map[string]string // Map of tags + TagList() []*lp.Tag // Ordered list of tags + AddTag(key, value string) // Add a tag + GetTag(key string) (value string, ok bool) // Get a tag by its key + HasTag(key string) (ok bool) // Check a tag + RemoveTag(key string) // Remove a tag by its key - Tags() 
map[string]string // Map of tags - TagList() []*lp.Tag // Ordered list of tags - AddTag(key, value string) // Add a tag - GetTag(key string) (string, bool) // Get a tag by its key - HasTag(key string) bool // Check a tag - RemoveTag(key string) // Remove a tag by its key + Meta() map[string]string // Map of meta data tags + MetaList() []*lp.Tag // Ordered list of meta data + AddMeta(key, value string) // Add a meta data tag + GetMeta(key string) (value string, ok bool) // Get a meta data tab addressed by its key + HasMeta(key string) (ok bool) // Check a meta data tag + RemoveMeta(key string) // Remove a meta data tag by its key - GetField(key string) (interface{}, bool) // Get a field addressed by its key - HasField(key string) bool // Check if a field key is present - RemoveField(key string) // Remove a field addressed by its key - Fields() map[string]interface{} // Map of fields + Fields() map[string]interface{} // Map of fields + FieldList() []*lp.Field // Ordered list of fields + AddField(key string, value interface{}) // Add a field + GetField(key string) (value interface{}, ok bool) // Get a field addressed by its key + HasField(key string) (ok bool) // Check if a field key is present + RemoveField(key string) // Remove a field addressed by its key } // Meta returns the meta data tags as key-value mapping @@ -68,22 +72,34 @@ func (m *ccMetric) MetaList() []*lp.Tag { // String implements the stringer interface for data type ccMetric func (m *ccMetric) String() string { - return fmt.Sprintf("%s %v %v %v %d", m.name, m.tags, m.meta, m.fields, m.tm.UnixNano()) + return fmt.Sprintf("Name: %s, Tags: %+v, Meta: %+v, fields: %+v, Timestamp: %d", m.name, m.tags, m.meta, m.fields, m.tm.UnixNano()) +} + +// ToLineProtocol generates influxDB line protocol for data type ccMetric +func (m *ccMetric) ToPoint(metaAsTags bool) (p *write.Point) { + + if !metaAsTags { + p = influxdb2.NewPoint(m.name, m.tags, m.fields, m.tm) + } else { + tags := make(map[string]string, 
len(m.tags)+len(m.meta)) + for key, value := range m.tags { + tags[key] = value + } + for key, value := range m.meta { + tags[key] = value + } + p = influxdb2.NewPoint(m.name, tags, m.fields, m.tm) + } + return } // ToLineProtocol generates influxDB line protocol for data type ccMetric func (m *ccMetric) ToLineProtocol(metaAsTags bool) string { - tags := make(map[string]string) - for key, value := range m.tags { - tags[key] = value - } - if metaAsTags { - for key, value := range m.meta { - tags[key] = value - } - } - p := influxdb2.NewPoint(m.name, tags, m.fields, m.tm) - return write.PointToLineProtocol(p, time.Nanosecond) + + return write.PointToLineProtocol( + m.ToPoint(metaAsTags), + time.Nanosecond, + ) } // Name returns the measurement name diff --git a/sinks/influxAsyncSink.go b/sinks/influxAsyncSink.go index 0763a4b..3315456 100644 --- a/sinks/influxAsyncSink.go +++ b/sinks/influxAsyncSink.go @@ -2,6 +2,7 @@ package sinks import ( // "context" + "crypto/tls" "encoding/json" "errors" @@ -11,7 +12,6 @@ import ( lp "github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric" influxdb2 "github.com/influxdata/influxdb-client-go/v2" influxdb2Api "github.com/influxdata/influxdb-client-go/v2/api" - "github.com/influxdata/influxdb-client-go/v2/api/write" ) type InfluxAsyncSinkConfig struct { @@ -88,22 +88,9 @@ func (s *InfluxAsyncSink) Init(config json.RawMessage) error { return err } -func (s *InfluxAsyncSink) Write(point lp.CCMetric) error { - var p *write.Point - if s.config.MetaAsTags { - tags := map[string]string{} - for k, v := range point.Tags() { - tags[k] = v - } - for k, v := range point.Meta() { - tags[k] = v - } - p = influxdb2.NewPoint(point.Name(), tags, point.Fields(), point.Time()) - } else { - p = influxdb2.NewPoint(point.Name(), point.Tags(), point.Fields(), point.Time()) - } - - s.writeApi.WritePoint(p) +func (s *InfluxAsyncSink) Write(m lp.CCMetric) error { + s.writeApi.WritePoint( + m.ToPoint(s.config.MetaAsTags)) return nil } diff --git 
a/sinks/influxSink.go b/sinks/influxSink.go index fcfc32f..156f6eb 100644 --- a/sinks/influxSink.go +++ b/sinks/influxSink.go @@ -76,18 +76,12 @@ func (s *InfluxSink) Init(config json.RawMessage) error { return s.connect() } -func (s *InfluxSink) Write(point lp.CCMetric) error { - tags := make(map[string]string) - for key, value := range point.Tags() { - tags[key] = value - } - if s.config.MetaAsTags { - for key, value := range point.Meta() { - tags[key] = value - } - } - p := influxdb2.NewPoint(point.Name(), tags, point.Fields(), point.Time()) - err := s.writeApi.WritePoint(context.Background(), p) +func (s *InfluxSink) Write(m lp.CCMetric) error { + err := + s.writeApi.WritePoint( + context.Background(), + m.ToPoint(s.config.MetaAsTags), + ) return err } From 4e0782d66b9eb3ea221ec68e32703962d935ca58 Mon Sep 17 00:00:00 2001 From: Holger Obermaier <40787752+ho-ob@users.noreply.github.com> Date: Tue, 8 Feb 2022 10:58:53 +0100 Subject: [PATCH 110/174] Use FromInfluxMetric() to convert influx to cc metric --- collectors/customCmdMetric.go | 6 ++++-- collectors/metricCollector.go | 19 ------------------- 2 files changed, 4 insertions(+), 21 deletions(-) diff --git a/collectors/customCmdMetric.go b/collectors/customCmdMetric.go index ffe8b73..483d2ba 100644 --- a/collectors/customCmdMetric.go +++ b/collectors/customCmdMetric.go @@ -9,6 +9,7 @@ import ( "strings" "time" + ccmetric "github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric" lp "github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric" influx "github.com/influxdata/line-protocol" ) @@ -97,7 +98,8 @@ func (m *CustomCmdCollector) Read(interval time.Duration, output chan lp.CCMetri if skip { continue } - y, err := lp.New(c.Name(), Tags2Map(c), m.meta, Fields2Map(c), c.Time()) + + y := ccmetric.FromInfluxMetric(c) if err == nil { output <- y } @@ -119,7 +121,7 @@ func (m *CustomCmdCollector) Read(interval time.Duration, output chan lp.CCMetri if skip { continue } - y, err := lp.New(f.Name(), 
Tags2Map(f), m.meta, Fields2Map(f), f.Time()) + y := ccmetric.FromInfluxMetric(f) if err == nil { output <- y } diff --git a/collectors/metricCollector.go b/collectors/metricCollector.go index 3484dca..c71ae16 100644 --- a/collectors/metricCollector.go +++ b/collectors/metricCollector.go @@ -10,7 +10,6 @@ import ( "time" lp "github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric" - influx "github.com/influxdata/line-protocol" ) type MetricCollector interface { @@ -115,24 +114,6 @@ func CpuList() []int { return cpulist } -// Tags2Map stores a InfluxDB list of tags in a map of key value pairs -func Tags2Map(metric influx.Metric) map[string]string { - tags := make(map[string]string) - for _, t := range metric.TagList() { - tags[t.Key] = t.Value - } - return tags -} - -// Fields2Map stores a InfluxDB list of fields in a map of key value pairs -func Fields2Map(metric influx.Metric) map[string]interface{} { - fields := make(map[string]interface{}) - for _, f := range metric.FieldList() { - fields[f.Key] = f.Value - } - return fields -} - // RemoveFromStringList removes the string r from the array of strings s // If r is not contained in the array an error is returned func RemoveFromStringList(s []string, r string) ([]string, error) { From bfeee5511348aaf228a37d5424a719dd3ac5f076 Mon Sep 17 00:00:00 2001 From: Holger Obermaier <40787752+ho-ob@users.noreply.github.com> Date: Tue, 8 Feb 2022 11:00:25 +0100 Subject: [PATCH 111/174] Use FromInfluxMetric() to convert influx to cc metric --- receivers/metricReceiver.go | 17 ----------------- 1 file changed, 17 deletions(-) diff --git a/receivers/metricReceiver.go b/receivers/metricReceiver.go index 2c74409..50724b1 100644 --- a/receivers/metricReceiver.go +++ b/receivers/metricReceiver.go @@ -3,7 +3,6 @@ package receivers import ( // "time" lp "github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric" - influx "github.com/influxdata/line-protocol" ) type ReceiverConfig struct { @@ -38,19 +37,3 @@ func (r 
*receiver) Name() string { func (r *receiver) SetSink(sink chan lp.CCMetric) { r.sink = sink } - -func Tags2Map(metric influx.Metric) map[string]string { - tags := make(map[string]string) - for _, t := range metric.TagList() { - tags[t.Key] = t.Value - } - return tags -} - -func Fields2Map(metric influx.Metric) map[string]interface{} { - fields := make(map[string]interface{}) - for _, f := range metric.FieldList() { - fields[f.Key] = f.Value - } - return fields -} From d98b6783990efb5998e2e65da57f95b559b6bc0b Mon Sep 17 00:00:00 2001 From: Holger Obermaier <40787752+ho-ob@users.noreply.github.com> Date: Tue, 8 Feb 2022 11:05:07 +0100 Subject: [PATCH 112/174] Refactoring: Replace FieldList() -> Fields() --- sinks/gangliaSink.go | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/sinks/gangliaSink.go b/sinks/gangliaSink.go index 989e537..403d222 100644 --- a/sinks/gangliaSink.go +++ b/sinks/gangliaSink.go @@ -95,23 +95,23 @@ func (s *GangliaSink) Write(point lp.CCMetric) error { argstr = append(argstr, fmt.Sprintf("--desc=%q", strings.Join(tagsstr, ","))) } argstr = append(argstr, fmt.Sprintf("--name=%s", point.Name())) - for _, f := range point.FieldList() { - if f.Key == "value" { - switch f.Value.(type) { + for k, v := range point.Fields() { + if k == "value" { + switch value := v.(type) { case float64: - argstr = append(argstr, fmt.Sprintf("--value=%v", f.Value.(float64))) + argstr = append(argstr, fmt.Sprintf("--value=%v", value)) argstr = append(argstr, "--type=double") case float32: - argstr = append(argstr, fmt.Sprintf("--value=%v", f.Value.(float32))) + argstr = append(argstr, fmt.Sprintf("--value=%v", value)) argstr = append(argstr, "--type=float") case int: - argstr = append(argstr, fmt.Sprintf("--value=%d", f.Value.(int))) + argstr = append(argstr, fmt.Sprintf("--value=%d", value)) argstr = append(argstr, "--type=int32") case int64: - argstr = append(argstr, fmt.Sprintf("--value=%d", f.Value.(int64))) + argstr = 
append(argstr, fmt.Sprintf("--value=%d", value)) argstr = append(argstr, "--type=int32") case string: - argstr = append(argstr, fmt.Sprintf("--value=%q", f.Value.(string))) + argstr = append(argstr, fmt.Sprintf("--value=%q", value)) argstr = append(argstr, "--type=string") } } From 6d55c376bd3c253a7ba35284529c8fe316a803dd Mon Sep 17 00:00:00 2001 From: Holger Obermaier <40787752+ho-ob@users.noreply.github.com> Date: Tue, 8 Feb 2022 11:23:19 +0100 Subject: [PATCH 113/174] Refactoring: Remove all *List() functions from CCMetric --- internal/ccMetric/ccMetric.go | 38 +++-------------------------------- sinks/httpSink.go | 4 ++-- sinks/natsSink.go | 4 ++-- sinks/stdoutSink.go | 26 ++++++++++++------------ 4 files changed, 20 insertions(+), 52 deletions(-) diff --git a/internal/ccMetric/ccMetric.go b/internal/ccMetric/ccMetric.go index ad2b2b1..68f1886 100644 --- a/internal/ccMetric/ccMetric.go +++ b/internal/ccMetric/ccMetric.go @@ -2,7 +2,6 @@ package ccmetric import ( "fmt" - "sort" "time" influxdb2 "github.com/influxdata/influxdb-client-go/v2" @@ -25,29 +24,28 @@ type ccMetric struct { // ccmetric access functions type CCMetric interface { - lp.Metric // Time(), Name(), TagList(), FieldList() ToLineProtocol(metaAsTags bool) string // Generate influxDB line protocol for data type ccMetric ToPoint(metaAsTags bool) *write.Point // Generate influxDB point for data type ccMetric + Name() string // Get metric name SetName(name string) // Set metric name + + Time() time.Time // Get timestamp SetTime(t time.Time) // Set timestamp Tags() map[string]string // Map of tags - TagList() []*lp.Tag // Ordered list of tags AddTag(key, value string) // Add a tag GetTag(key string) (value string, ok bool) // Get a tag by its key HasTag(key string) (ok bool) // Check a tag RemoveTag(key string) // Remove a tag by its key Meta() map[string]string // Map of meta data tags - MetaList() []*lp.Tag // Ordered list of meta data AddMeta(key, value string) // Add a meta data tag GetMeta(key 
string) (value string, ok bool) // Get a meta data tab addressed by its key HasMeta(key string) (ok bool) // Check a meta data tag RemoveMeta(key string) // Remove a meta data tag by its key Fields() map[string]interface{} // Map of fields - FieldList() []*lp.Field // Ordered list of fields AddField(key string, value interface{}) // Add a field GetField(key string) (value interface{}, ok bool) // Get a field addressed by its key HasField(key string) (ok bool) // Check if a field key is present @@ -59,17 +57,6 @@ func (m *ccMetric) Meta() map[string]string { return m.meta } -// MetaList returns the the list of meta data tags as sorted list of key value tags -func (m *ccMetric) MetaList() []*lp.Tag { - - ml := make([]*lp.Tag, 0, len(m.meta)) - for key, value := range m.meta { - ml = append(ml, &lp.Tag{Key: key, Value: value}) - } - sort.Slice(ml, func(i, j int) bool { return ml[i].Key < ml[j].Key }) - return ml -} - // String implements the stringer interface for data type ccMetric func (m *ccMetric) String() string { return fmt.Sprintf("Name: %s, Tags: %+v, Meta: %+v, fields: %+v, Timestamp: %d", m.name, m.tags, m.meta, m.fields, m.tm.UnixNano()) @@ -116,30 +103,11 @@ func (m *ccMetric) Tags() map[string]string { return m.tags } -// TagList returns the the list of tags as sorted list of key value tags -func (m *ccMetric) TagList() []*lp.Tag { - tl := make([]*lp.Tag, 0, len(m.tags)) - for key, value := range m.tags { - tl = append(tl, &lp.Tag{Key: key, Value: value}) - } - sort.Slice(tl, func(i, j int) bool { return tl[i].Key < tl[j].Key }) - return tl -} - // Fields returns the list of fields as key-value-mapping func (m *ccMetric) Fields() map[string]interface{} { return m.fields } -// FieldList returns the list of fields -func (m *ccMetric) FieldList() []*lp.Field { - fieldList := make([]*lp.Field, 0, len(m.fields)) - for key, value := range m.fields { - fieldList = append(fieldList, &lp.Field{Key: key, Value: value}) - } - return fieldList -} - // Time returns 
timestamp func (m *ccMetric) Time() time.Time { return m.tm diff --git a/sinks/httpSink.go b/sinks/httpSink.go index 3080faa..fc7b450 100644 --- a/sinks/httpSink.go +++ b/sinks/httpSink.go @@ -57,8 +57,8 @@ func (s *HttpSink) Init(config json.RawMessage) error { return nil } -func (s *HttpSink) Write(point lp.CCMetric) error { - _, err := s.encoder.Encode(point) +func (s *HttpSink) Write(m lp.CCMetric) error { + _, err := s.encoder.Encode(m.ToPoint(s.config.MetaAsTags)) return err } diff --git a/sinks/natsSink.go b/sinks/natsSink.go index 37e8c2b..187157e 100644 --- a/sinks/natsSink.go +++ b/sinks/natsSink.go @@ -77,9 +77,9 @@ func (s *NatsSink) Init(config json.RawMessage) error { return s.connect() } -func (s *NatsSink) Write(point lp.CCMetric) error { +func (s *NatsSink) Write(m lp.CCMetric) error { if s.client != nil { - _, err := s.encoder.Encode(point) + _, err := s.encoder.Encode(m.ToPoint(s.config.MetaAsTags)) if err != nil { cclog.ComponentError(s.name, "Write:", err.Error()) return err diff --git a/sinks/stdoutSink.go b/sinks/stdoutSink.go index ee51d40..c085c13 100644 --- a/sinks/stdoutSink.go +++ b/sinks/stdoutSink.go @@ -51,34 +51,34 @@ func (s *StdoutSink) Init(config json.RawMessage) error { func (s *StdoutSink) Write(point lp.CCMetric) error { var tagsstr []string var fieldstr []string - for _, t := range point.TagList() { - tagsstr = append(tagsstr, fmt.Sprintf("%s=%s", t.Key, t.Value)) + for key, value := range point.Tags() { + tagsstr = append(tagsstr, fmt.Sprintf("%s=%s", key, value)) } if s.meta_as_tags { - for _, m := range point.MetaList() { - tagsstr = append(tagsstr, fmt.Sprintf("%s=%s", m.Key, m.Value)) + for key, value := range point.Meta() { + tagsstr = append(tagsstr, fmt.Sprintf("%s=%s", key, value)) } } - for key, value := range point.Fields() { - switch value.(type) { + for key, v := range point.Fields() { + switch value := v.(type) { case float64: - if !math.IsNaN(value.(float64)) { - fieldstr = append(fieldstr, fmt.Sprintf("%s=%v", 
key, value.(float64))) + if !math.IsNaN(value) { + fieldstr = append(fieldstr, fmt.Sprintf("%s=%v", key, v)) } else { fieldstr = append(fieldstr, fmt.Sprintf("%s=0.0", key)) } case float32: - if !math.IsNaN(float64(value.(float32))) { - fieldstr = append(fieldstr, fmt.Sprintf("%s=%v", key, value.(float32))) + if !math.IsNaN(float64(value)) { + fieldstr = append(fieldstr, fmt.Sprintf("%s=%v", key, v)) } else { fieldstr = append(fieldstr, fmt.Sprintf("%s=0.0", key)) } case int: - fieldstr = append(fieldstr, fmt.Sprintf("%s=%d", key, value.(int))) + fieldstr = append(fieldstr, fmt.Sprintf("%s=%d", key, v)) case int64: - fieldstr = append(fieldstr, fmt.Sprintf("%s=%d", key, value.(int64))) + fieldstr = append(fieldstr, fmt.Sprintf("%s=%d", key, v)) case string: - fieldstr = append(fieldstr, fmt.Sprintf("%s=%q", key, value.(string))) + fieldstr = append(fieldstr, fmt.Sprintf("%s=%q", key, v)) default: fieldstr = append(fieldstr, fmt.Sprintf("%s=%v", key, value)) } From c47ac2ebc31e93bd8e83040cc2849bd7c237169f Mon Sep 17 00:00:00 2001 From: Holger Obermaier <40787752+ho-ob@users.noreply.github.com> Date: Tue, 8 Feb 2022 12:22:56 +0100 Subject: [PATCH 114/174] Cleanup --- internal/ccMetric/ccMetric.go | 96 +++++++++++++++++------------------ 1 file changed, 47 insertions(+), 49 deletions(-) diff --git a/internal/ccMetric/ccMetric.go b/internal/ccMetric/ccMetric.go index 68f1886..1de325a 100644 --- a/internal/ccMetric/ccMetric.go +++ b/internal/ccMetric/ccMetric.go @@ -22,10 +22,10 @@ type ccMetric struct { tm time.Time // timestamp } -// ccmetric access functions +// ccMetric access functions type CCMetric interface { - ToLineProtocol(metaAsTags bool) string // Generate influxDB line protocol for data type ccMetric ToPoint(metaAsTags bool) *write.Point // Generate influxDB point for data type ccMetric + ToLineProtocol(metaAsTags bool) string // Generate influxDB line protocol for data type ccMetric Name() string // Get metric name SetName(name string) // Set metric name 
@@ -36,13 +36,13 @@ type CCMetric interface { Tags() map[string]string // Map of tags AddTag(key, value string) // Add a tag GetTag(key string) (value string, ok bool) // Get a tag by its key - HasTag(key string) (ok bool) // Check a tag + HasTag(key string) (ok bool) // Check if a tag key is present RemoveTag(key string) // Remove a tag by its key Meta() map[string]string // Map of meta data tags AddMeta(key, value string) // Add a meta data tag GetMeta(key string) (value string, ok bool) // Get a meta data tab addressed by its key - HasMeta(key string) (ok bool) // Check a meta data tag + HasMeta(key string) (ok bool) // Check if a meta data key is present RemoveMeta(key string) // Remove a meta data tag by its key Fields() map[string]interface{} // Map of fields @@ -52,14 +52,12 @@ type CCMetric interface { RemoveField(key string) // Remove a field addressed by its key } -// Meta returns the meta data tags as key-value mapping -func (m *ccMetric) Meta() map[string]string { - return m.meta -} - // String implements the stringer interface for data type ccMetric func (m *ccMetric) String() string { - return fmt.Sprintf("Name: %s, Tags: %+v, Meta: %+v, fields: %+v, Timestamp: %d", m.name, m.tags, m.meta, m.fields, m.tm.UnixNano()) + return fmt.Sprintf( + "Name: %s, Tags: %+v, Meta: %+v, fields: %+v, Timestamp: %d", + m.name, m.tags, m.meta, m.fields, m.tm.UnixNano(), + ) } // ToLineProtocol generates influxDB line protocol for data type ccMetric @@ -94,20 +92,11 @@ func (m *ccMetric) Name() string { return m.name } +// SetName sets the measurement name func (m *ccMetric) SetName(name string) { m.name = name } -// Tags returns the the list of tags as key-value-mapping -func (m *ccMetric) Tags() map[string]string { - return m.tags -} - -// Fields returns the list of fields as key-value-mapping -func (m *ccMetric) Fields() map[string]interface{} { - return m.fields -} - // Time returns timestamp func (m *ccMetric) Time() time.Time { return m.tm @@ -118,10 +107,14 @@ 
func (m *ccMetric) SetTime(t time.Time) { m.tm = t } -// HasTag checks if a tag with key equal to is present in the list of tags -func (m *ccMetric) HasTag(key string) bool { - _, ok := m.tags[key] - return ok +// Tags returns the the list of tags as key-value-mapping +func (m *ccMetric) Tags() map[string]string { + return m.tags +} + +// AddTag adds a tag (consisting of key and value) to the map of tags +func (m *ccMetric) AddTag(key, value string) { + m.tags[key] = value } // GetTag returns the tag with tag's key equal to @@ -130,22 +123,25 @@ func (m *ccMetric) GetTag(key string) (string, bool) { return value, ok } +// HasTag checks if a tag with key equal to is present in the list of tags +func (m *ccMetric) HasTag(key string) bool { + _, ok := m.tags[key] + return ok +} + // RemoveTag removes the tag with tag's key equal to -// and keeps the tag list ordered by the keys func (m *ccMetric) RemoveTag(key string) { delete(m.tags, key) } -// AddTag adds a tag (consisting of key and value) -// and keeps the tag list ordered by the keys -func (m *ccMetric) AddTag(key, value string) { - m.tags[key] = value +// Meta returns the meta data tags as key-value mapping +func (m *ccMetric) Meta() map[string]string { + return m.meta } -// HasTag checks if a meta data tag with meta data's key equal to is present in the list of meta data tags -func (m *ccMetric) HasMeta(key string) bool { - _, ok := m.meta[key] - return ok +// AddMeta adds a meta data tag (consisting of key and value) to the map of meta data tags +func (m *ccMetric) AddMeta(key, value string) { + m.meta[key] = value } // GetMeta returns the meta data tag with meta data's key equal to @@ -154,19 +150,23 @@ func (m *ccMetric) GetMeta(key string) (string, bool) { return value, ok } +// HasMeta checks if a meta data tag with meta data's key equal to is present in the map of meta data tags +func (m *ccMetric) HasMeta(key string) bool { + _, ok := m.meta[key] + return ok +} + // RemoveMeta removes the meta data tag 
with tag's key equal to -// and keeps the meta data tag list ordered by the keys func (m *ccMetric) RemoveMeta(key string) { delete(m.meta, key) } -// AddMeta adds a meta data tag (consisting of key and value) -// and keeps the meta data list ordered by the keys -func (m *ccMetric) AddMeta(key, value string) { - m.meta[key] = value +// Fields returns the list of fields as key-value-mapping +func (m *ccMetric) Fields() map[string]interface{} { + return m.fields } -// AddField adds a field (consisting of key and value) to the unordered list of fields +// AddField adds a field (consisting of key and value) to the map of fields func (m *ccMetric) AddField(key string, value interface{}) { m.fields[key] = value } @@ -177,14 +177,14 @@ func (m *ccMetric) GetField(key string) (interface{}, bool) { return v, ok } -// HasField checks if a field with field's key equal to is present in the list of fields +// HasField checks if a field with field's key equal to is present in the map of fields func (m *ccMetric) HasField(key string) bool { _, ok := m.fields[key] return ok } // RemoveField removes the field with field's key equal to -// from the unordered list of fields +// from the map of fields func (m *ccMetric) RemoveField(key string) { delete(m.fields, key) } @@ -205,17 +205,13 @@ func New( tm: tm, } - // deep copy tags + // deep copy tags, meta data tags and fields for k, v := range tags { m.tags[k] = v } - - // deep copy meta data tags for k, v := range meta { m.meta[k] = v } - - // Unsorted list of fields for k, v := range fields { v := convertField(v) if v == nil { @@ -231,12 +227,13 @@ func New( func FromMetric(other ccMetric) CCMetric { m := &ccMetric{ name: other.Name(), - tags: make(map[string]string), - meta: make(map[string]string), - fields: make(map[string]interface{}), + tags: make(map[string]string, len(other.tags)), + meta: make(map[string]string, len(other.meta)), + fields: make(map[string]interface{}, len(other.fields)), tm: other.Time(), } + // deep copy 
tags, meta data tags and fields for key, value := range other.tags { m.tags[key] = value } @@ -259,6 +256,7 @@ func FromInfluxMetric(other lp.Metric) CCMetric { tm: other.Time(), } + // deep copy tags and fields for _, otherTag := range other.TagList() { m.tags[otherTag.Key] = otherTag.Value } From e1cf68298923708851d761ed3f5ac12476f5e8aa Mon Sep 17 00:00:00 2001 From: Thomas Gruber Date: Tue, 8 Feb 2022 13:22:20 +0100 Subject: [PATCH 115/174] Add other collectors to README --- collectors/README.md | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/collectors/README.md b/collectors/README.md index a79fa03..558649e 100644 --- a/collectors/README.md +++ b/collectors/README.md @@ -22,8 +22,9 @@ In contrast to the configuration files for sinks and receivers, the collectors c * [`loadavg`](./loadavgMetric.md) * [`netstat`](./netstatMetric.md) * [`ibstat`](./infinibandMetric.md) +* [`ibstat_perfquery`](./infinibandPerfQueryMetric.md) * [`tempstat`](./tempMetric.md) -* [`lustre`](./lustreMetric.md) +* [`lustrestat`](./lustreMetric.md) * [`likwid`](./likwidMetric.md) * [`nvidia`](./nvidiaMetric.md) * [`customcmd`](./customCmdMetric.md) @@ -31,10 +32,14 @@ In contrast to the configuration files for sinks and receivers, the collectors c * [`topprocs`](./topprocsMetric.md) * [`nfs3stat`](./nfs3Metric.md) * [`nfs4stat`](./nfs4Metric.md) +* [`cpufreq`](./cpufreqMetric.md) +* [`cpufreq_cpuinfo`](./cpufreqCpuinfoMetric.md) +* [`numastat`](./numastatMetric.md) +* [`gpfs`](./gpfsMetric.md) +* [`ipmistat`](./ipmiMetric.md) ## Todos -* [ ] Exclude devices for `diskstat` collector * [ ] Aggreate metrics to higher topology entity (sum hwthread metrics to socket metric, ...). 
Needs to be configurable # Contributing own collectors From 377f85111adbafc950dba236a4982f0618fb8a24 Mon Sep 17 00:00:00 2001 From: Holger Obermaier <40787752+ho-ob@users.noreply.github.com> Date: Tue, 8 Feb 2022 13:38:18 +0100 Subject: [PATCH 116/174] Refactoring: Use ccmetric function ToLineProtocol() in stdout sink --- sinks/stdoutSink.go | 66 +++++++++++---------------------------------- 1 file changed, 15 insertions(+), 51 deletions(-) diff --git a/sinks/stdoutSink.go b/sinks/stdoutSink.go index c085c13..5d0761a 100644 --- a/sinks/stdoutSink.go +++ b/sinks/stdoutSink.go @@ -3,7 +3,6 @@ package sinks import ( "encoding/json" "fmt" - "math" "os" "strings" @@ -11,15 +10,13 @@ import ( lp "github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric" ) -type StdoutSinkConfig struct { - defaultSinkConfig - Output string `json:"output_file,omitempty"` -} - type StdoutSink struct { - sink + sink // meta_as_tags, name output *os.File - config StdoutSinkConfig + config struct { + defaultSinkConfig + Output string `json:"output_file,omitempty"` + } } func (s *StdoutSink) Init(config json.RawMessage) error { @@ -30,13 +27,15 @@ func (s *StdoutSink) Init(config json.RawMessage) error { return err } } + s.output = os.Stdout if len(s.config.Output) > 0 { - if strings.ToLower(s.config.Output) == "stdout" { + switch strings.ToLower(s.config.Output) { + case "stdout": s.output = os.Stdout - } else if strings.ToLower(s.config.Output) == "stderr" { + case "stderr": s.output = os.Stderr - } else { + default: f, err := os.OpenFile(s.config.Output, os.O_CREATE|os.O_WRONLY, os.FileMode(0600)) if err != nil { return err @@ -48,46 +47,11 @@ func (s *StdoutSink) Init(config json.RawMessage) error { return nil } -func (s *StdoutSink) Write(point lp.CCMetric) error { - var tagsstr []string - var fieldstr []string - for key, value := range point.Tags() { - tagsstr = append(tagsstr, fmt.Sprintf("%s=%s", key, value)) - } - if s.meta_as_tags { - for key, value := range point.Meta() { - 
tagsstr = append(tagsstr, fmt.Sprintf("%s=%s", key, value)) - } - } - for key, v := range point.Fields() { - switch value := v.(type) { - case float64: - if !math.IsNaN(value) { - fieldstr = append(fieldstr, fmt.Sprintf("%s=%v", key, v)) - } else { - fieldstr = append(fieldstr, fmt.Sprintf("%s=0.0", key)) - } - case float32: - if !math.IsNaN(float64(value)) { - fieldstr = append(fieldstr, fmt.Sprintf("%s=%v", key, v)) - } else { - fieldstr = append(fieldstr, fmt.Sprintf("%s=0.0", key)) - } - case int: - fieldstr = append(fieldstr, fmt.Sprintf("%s=%d", key, v)) - case int64: - fieldstr = append(fieldstr, fmt.Sprintf("%s=%d", key, v)) - case string: - fieldstr = append(fieldstr, fmt.Sprintf("%s=%q", key, v)) - default: - fieldstr = append(fieldstr, fmt.Sprintf("%s=%v", key, value)) - } - } - if len(tagsstr) > 0 { - fmt.Printf("%s,%s %s %d\n", point.Name(), strings.Join(tagsstr, ","), strings.Join(fieldstr, ","), point.Time().Unix()) - } else { - fmt.Printf("%s %s %d\n", point.Name(), strings.Join(fieldstr, ","), point.Time().Unix()) - } +func (s *StdoutSink) Write(m lp.CCMetric) error { + fmt.Fprint( + s.output, + m.ToLineProtocol(s.meta_as_tags), + ) return nil } From 006b9f91f688cd115bce6d1304746edf3f670a14 Mon Sep 17 00:00:00 2001 From: Thomas Roehl Date: Tue, 8 Feb 2022 13:39:58 +0100 Subject: [PATCH 117/174] Excluding NaN values in Likwid metrics from sending --- collectors/likwidMetric.go | 63 ++++++++++++++++++++++---------------- collectors/likwidMetric.md | 20 ++++++------ 2 files changed, 46 insertions(+), 37 deletions(-) diff --git a/collectors/likwidMetric.go b/collectors/likwidMetric.go index 5153300..3acd627 100644 --- a/collectors/likwidMetric.go +++ b/collectors/likwidMetric.go @@ -45,7 +45,7 @@ func (ms MetricScope) String() string { func (ms MetricScope) Likwid() string { LikwidDomains := map[string]string{ - "hwthread": "", + "cpu": "", "core": "", "llc": "C", "numadomain": "M", @@ -66,7 +66,7 @@ func (ms MetricScope) Granularity() int { } func 
GetAllMetricScopes() []MetricScope { - return []MetricScope{"hwthread" /*, "core", "llc", "numadomain", "die",*/, "socket", "node"} + return []MetricScope{"cpu" /*, "core", "llc", "numadomain", "die",*/, "socket", "node"} } type LikwidCollectorMetricConfig struct { @@ -124,12 +124,12 @@ func eventsToEventStr(events map[string]string) string { func getGranularity(counter, event string) MetricScope { if strings.HasPrefix(counter, "PMC") || strings.HasPrefix(counter, "FIXC") { - return "hwthread" + return "cpu" } else if strings.Contains(counter, "BOX") || strings.Contains(counter, "DEV") { return "socket" } else if strings.HasPrefix(counter, "PWR") { if event == "RAPL_CORE_ENERGY" { - return "hwthread" + return "cpu" } else { return "socket" } @@ -142,7 +142,7 @@ func getBaseFreq() float64 { C.power_init(0) info := C.get_powerInfo() if float64(info.baseFrequency) != 0 { - freq = float64(info.baseFrequency) + freq = float64(info.baseFrequency) * 1e3 } else { buffer, err := ioutil.ReadFile("/sys/devices/system/cpu/cpu0/cpufreq/bios_limit") if err == nil { @@ -168,7 +168,7 @@ func (m *LikwidCollector) initGranularity() { } for i, metric := range evset.Metrics { s := splitRegex.Split(metric.Calc, -1) - gran := MetricScope("hwthread") + gran := MetricScope("cpu") evset.Metrics[i].granulatity = gran for _, x := range s { if _, ok := evset.Events[x]; ok { @@ -182,7 +182,7 @@ func (m *LikwidCollector) initGranularity() { } for i, metric := range m.config.Metrics { s := splitRegex.Split(metric.Calc, -1) - gran := MetricScope("hwthread") + gran := MetricScope("cpu") m.config.Metrics[i].granulatity = gran for _, x := range s { for _, evset := range m.config.Eventsets { @@ -221,6 +221,9 @@ func (m *LikwidCollector) getResponsiblities() map[MetricScope]map[int]int { // case "llc": // input = fmt.Sprintf("%s%d:0", scope.Likwid(), s) // slist = topo.LLCacheList() + case "cpu": + input = func(index int) string { return fmt.Sprintf("%d", index) } + slist = topo.CpuList() case 
"hwthread": input = func(index int) string { return fmt.Sprintf("%d", index) } slist = topo.CpuList() @@ -284,7 +287,7 @@ func (m *LikwidCollector) Init(config json.RawMessage) error { return err } - // Determine which counter works at which level. PMC*: hwthread, *BOX*: socket, ... + // Determine which counter works at which level. PMC*: cpu, *BOX*: socket, ... m.initGranularity() // Generate map for MetricScope -> scope_id (like socket id) -> responsible id (offset in cpulist) m.scopeRespTids = m.getResponsiblities() @@ -359,6 +362,7 @@ func (m *LikwidCollector) Init(config json.RawMessage) error { return err } m.basefreq = getBaseFreq() + cclog.ComponentDebug(m.name, "BaseFreq", m.basefreq) m.init = true return nil } @@ -399,6 +403,7 @@ func (m *LikwidCollector) calcEventsetMetrics(group int, interval time.Duration, var eidx C.int evset := m.config.Eventsets[group] gid := m.groups[group] + invClock := float64(1.0 / m.basefreq) // Go over events and get the results for eidx = 0; int(eidx) < len(evset.Events); eidx++ { @@ -414,7 +419,7 @@ func (m *LikwidCollector) calcEventsetMetrics(group int, interval time.Duration, for _, tid := range scopemap { if tid >= 0 { m.results[group][tid]["time"] = interval.Seconds() - m.results[group][tid]["inverseClock"] = float64(1.0 / m.basefreq) + m.results[group][tid]["inverseClock"] = invClock res := C.perfmon_getLastResult(gid, eidx, C.int(tid)) m.results[group][tid][gctr] = float64(res) } @@ -438,15 +443,17 @@ func (m *LikwidCollector) calcEventsetMetrics(group int, interval time.Duration, value = 0.0 } // Now we have the result, send it with the proper tags - if metric.Publish && !math.IsNaN(value) { - tags := map[string]string{"type": metric.Scope.String()} - if metric.Scope != "node" { - tags["type-id"] = fmt.Sprintf("%d", domain) - } - fields := map[string]interface{}{"value": value} - y, err := lp.New(metric.Name, tags, m.meta, fields, time.Now()) - if err == nil { - output <- y + if !math.IsNaN(value) { + if 
metric.Publish { + tags := map[string]string{"type": metric.Scope.String()} + if metric.Scope != "node" { + tags["type-id"] = fmt.Sprintf("%d", domain) + } + fields := map[string]interface{}{"value": value} + y, err := lp.New(metric.Name, tags, m.meta, fields, time.Now()) + if err == nil { + output <- y + } } } } @@ -480,15 +487,17 @@ func (m *LikwidCollector) calcGlobalMetrics(interval time.Duration, output chan value = 0.0 } // Now we have the result, send it with the proper tags - if metric.Publish && !math.IsNaN(value) { - tags := map[string]string{"type": metric.Scope.String()} - if metric.Scope != "node" { - tags["type-id"] = fmt.Sprintf("%d", domain) - } - fields := map[string]interface{}{"value": value} - y, err := lp.New(metric.Name, tags, m.meta, fields, time.Now()) - if err == nil { - output <- y + if !math.IsNaN(value) { + if metric.Publish { + tags := map[string]string{"type": metric.Scope.String()} + if metric.Scope != "node" { + tags["type-id"] = fmt.Sprintf("%d", domain) + } + fields := map[string]interface{}{"value": value} + y, err := lp.New(metric.Name, tags, m.meta, fields, time.Now()) + if err == nil { + output <- y + } } } } diff --git a/collectors/likwidMetric.md b/collectors/likwidMetric.md index c01bf11..5c54bb6 100644 --- a/collectors/likwidMetric.md +++ b/collectors/likwidMetric.md @@ -4,7 +4,7 @@ The `likwid` collector is probably the most complicated collector. The LIKWID library is included as static library with *direct* access mode. The *direct* access mode is suitable if the daemon is executed by a root user. The static library does not contain the performance groups, so all information needs to be provided in the configuration. The `likwid` configuration consists of two parts, the "eventsets" and "globalmetrics": -- An event set list itself has two parts, the "events" and a set of derivable "metrics". Each of the "events" is a counter:event pair in LIKWID's syntax. 
The "metrics" are a list of formulas to derive the metric value from the measurements of the "events". Each metric has a name, the formula, a scope and a publish flag. A counter names can be used like variables in the formulas, so `PMC0+PMC1` sums the measurements for the both events configured in the counters `PMC0` and `PMC1`. The scope tells the Collector whether it is a metric for each hardware thread (`hwthread`) or each CPU socket (`socket`). The last one is the publishing flag. It tells the collector whether a metric should be sent to the router. +- An event set list itself has two parts, the "events" and a set of derivable "metrics". Each of the "events" is a counter:event pair in LIKWID's syntax. The "metrics" are a list of formulas to derive the metric value from the measurements of the "events". Each metric has a name, the formula, a scope and a publish flag. A counter names can be used like variables in the formulas, so `PMC0+PMC1` sums the measurements for the both events configured in the counters `PMC0` and `PMC1`. The scope tells the Collector whether it is a metric for each hardware thread (`cpu`) or each CPU socket (`socket`). The last one is the publishing flag. It tells the collector whether a metric should be sent to the router. - The global metrics are metrics which require data from all event set measurements to be derived. The inputs are the metrics in the event sets. Similar to the metrics in the event sets, the global metrics are defined by a name, a formula, a scope and a publish flag. See event set metrics for details. The only difference is that there is no access to the raw event measurements anymore but only to the metrics. So, the idea is to derive a metric in the "eventsets" section and reuse it in the "globalmetrics" part. If you need a metric only for deriving the global metrics, disable forwarding of the event set metrics. 
**Be aware** that the combination might be misleading because the "behavior" of a metric changes over time and the multiple measurements might count different computing phases. Additional options: @@ -15,15 +15,15 @@ Additional options: Hardware performance counters are scattered all over the system nowadays. A counter coveres a specific part of the system. While there are hardware thread specific counter for CPU cycles, instructions and so on, some others are specific for a whole CPU socket/package. To address that, the collector provides the specification of a 'scope' for each metric. -- `hwthread` : One metric per CPU hardware thread with the tags `"type" : "cpu"` and `"type-id" : "$hwthread_id"` +- `cpu` : One metric per CPU hardware thread with the tags `"type" : "cpu"` and `"type-id" : "$cpu_id"` - `socket` : One metric per CPU socket/package with the tags `"type" : "socket"` and `"type-id" : "$socket_id"` -**Note:** You cannot specify `socket` scope for a metric that is measured at `hwthread` scope, so some kind of expert knowledge or lookup work in the [Likwid Wiki](https://github.com/RRZE-HPC/likwid/wiki) is required. Get the scope of each counter from the *Architecture* pages and as soon as one counter in a metric is socket-specific, the whole metric is socket-specific. +**Note:** You cannot specify `socket` scope for a metric that is measured at `cpu` scope, so some kind of expert knowledge or lookup work in the [Likwid Wiki](https://github.com/RRZE-HPC/likwid/wiki) is required. Get the scope of each counter from the *Architecture* pages and as soon as one counter in a metric is socket-specific, the whole metric is socket-specific. 
As a guideline: -- All counters `FIXCx`, `PMCy` and `TMAz` have the scope `hwthread` +- All counters `FIXCx`, `PMCy` and `TMAz` have the scope `cpu` - All counters names containing `BOX` have the scope `socket` -- All `PWRx` counters have scope `socket`, except `"PWR1" : "RAPL_CORE_ENERGY"` has `hwthread` scope +- All `PWRx` counters have scope `socket`, except `"PWR1" : "RAPL_CORE_ENERGY"` has `cpu` scope - All `DFCx` counters have scope `socket` @@ -52,19 +52,19 @@ As a guideline: { "name": "ipc", "calc": "PMC0/PMC1", - "scope": "hwthread", + "scope": "cpu", "publish": true }, { "name": "flops_any", "calc": "0.000001*PMC2/time", - "scope": "hwthread", + "scope": "cpu", "publish": true }, { "name": "clock_mhz", "calc": "0.000001*(FIXC1/FIXC2)/inverseClock", - "scope": "hwthread", + "scope": "cpu", "publish": true }, { @@ -119,7 +119,7 @@ As a guideline: ### How to get the eventsets and metrics from LIKWID -The `likwid` collector reads hardware performance counters at a **hwthread** and **socket** level. The configuration looks quite complicated but it is basically copy&paste from [LIKWID's performance groups](https://github.com/RRZE-HPC/likwid/tree/master/groups). The collector made multiple iterations and tried to use the performance groups but it lacked flexibility. The current way of configuration provides most flexibility. +The `likwid` collector reads hardware performance counters at a **cpu** and **socket** level. The configuration looks quite complicated but it is basically copy&paste from [LIKWID's performance groups](https://github.com/RRZE-HPC/likwid/tree/master/groups). The collector made multiple iterations and tried to use the performance groups but it lacked flexibility. The current way of configuration provides most flexibility. The logic is as following: There are multiple eventsets, each consisting of a list of counters+events and a list of metrics. 
If you compare a common performance group with the example setting above, there is not much difference: ``` @@ -140,7 +140,7 @@ METRICS -> "metrics": [ IPC PMC0/PMC1 -> { -> "name" : "IPC", -> "calc" : "PMC0/PMC1", - -> "scope": "hwthread", + -> "scope": "cpu", -> "publish": true -> } -> ] From 9e73dcd43716ccfff2ebc2205d1c1c56f0bba434 Mon Sep 17 00:00:00 2001 From: Thomas Roehl Date: Tue, 8 Feb 2022 13:40:27 +0100 Subject: [PATCH 118/174] Fix type tag for numastat --- collectors/numastatsMetric.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/collectors/numastatsMetric.go b/collectors/numastatsMetric.go index 08bd51b..707768d 100644 --- a/collectors/numastatsMetric.go +++ b/collectors/numastatsMetric.go @@ -77,7 +77,7 @@ func (m *NUMAStatsCollector) Init(config json.RawMessage) error { m.topology = append(m.topology, NUMAStatsCollectorTopolgy{ file: file, - tagSet: map[string]string{"domain": node}, + tagSet: map[string]string{"memoryDomain": node}, }) } From 8c744617be4b95a65a221e6295f43ae74f4785d0 Mon Sep 17 00:00:00 2001 From: Thomas Roehl Date: Tue, 8 Feb 2022 13:45:41 +0100 Subject: [PATCH 119/174] Remove logging and enable command execution for GangliaSink --- sinks/gangliaSink.go | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/sinks/gangliaSink.go b/sinks/gangliaSink.go index 403d222..a586375 100644 --- a/sinks/gangliaSink.go +++ b/sinks/gangliaSink.go @@ -4,7 +4,6 @@ import ( "encoding/json" "errors" "fmt" - "log" "strings" // "time" @@ -116,10 +115,9 @@ func (s *GangliaSink) Write(point lp.CCMetric) error { } } } - log.Print(s.gmetric_path, " ", strings.Join(argstr, " ")) - // command := exec.Command(string(GMETRIC_EXEC), strings.Join(argstr, " ")) - // command.Wait() - // _, err := command.Output() + command := exec.Command(string(GMETRIC_EXEC), strings.Join(argstr, " ")) + command.Wait() + _, err = command.Output() return err } From cc86fc00a0e669552a16dbc6af2d5e4192250b8e Mon Sep 17 00:00:00 2001 From: 
Thomas Roehl Date: Tue, 8 Feb 2022 13:46:19 +0100 Subject: [PATCH 120/174] Add missing error check in InfiniBandPerfQueryMetric --- collectors/infinibandPerfQueryMetric.go | 3 +++ 1 file changed, 3 insertions(+) diff --git a/collectors/infinibandPerfQueryMetric.go b/collectors/infinibandPerfQueryMetric.go index 1a81d37..72f701f 100644 --- a/collectors/infinibandPerfQueryMetric.go +++ b/collectors/infinibandPerfQueryMetric.go @@ -50,6 +50,9 @@ func (m *InfinibandPerfQueryCollector) Init(config json.RawMessage) error { m.lids = make(map[string]map[string]string) p := fmt.Sprintf("%s/*/ports/*/lid", string(IB_BASEPATH)) files, err := filepath.Glob(p) + if err != nil { + return err + } for _, f := range files { lid, err := ioutil.ReadFile(f) if err == nil { From fcc25f7d30f3f385ec245949f8a3378de50e3b19 Mon Sep 17 00:00:00 2001 From: Thomas Roehl Date: Tue, 8 Feb 2022 13:46:44 +0100 Subject: [PATCH 121/174] Add collector documentation --- collectors/cpufreqCpuinfoMetric.md | 10 ++++++++++ collectors/cpufreqMetric.md | 11 +++++++++++ collectors/numastatsMetric.md | 15 +++++++++++++++ 3 files changed, 36 insertions(+) create mode 100644 collectors/cpufreqCpuinfoMetric.md create mode 100644 collectors/cpufreqMetric.md create mode 100644 collectors/numastatsMetric.md diff --git a/collectors/cpufreqCpuinfoMetric.md b/collectors/cpufreqCpuinfoMetric.md new file mode 100644 index 0000000..8b0216f --- /dev/null +++ b/collectors/cpufreqCpuinfoMetric.md @@ -0,0 +1,10 @@ + +## `cpufreq_cpuinfo` collector +```json + "cpufreq_cpuinfo": {} +``` + +The `cpufreq_cpuinfo` collector reads the clock frequency from `/proc/cpuinfo` and outputs a handful **cpu** metrics. 
+ +Metrics: +* `cpufreq` diff --git a/collectors/cpufreqMetric.md b/collectors/cpufreqMetric.md new file mode 100644 index 0000000..b62d16e --- /dev/null +++ b/collectors/cpufreqMetric.md @@ -0,0 +1,11 @@ +## `cpufreq_cpuinfo` collector +```json + "cpufreq": { + "exclude_metrics": [] + } +``` + +The `cpufreq` collector reads the clock frequency from `/sys/devices/system/cpu/cpu*/cpufreq` and outputs a handful **cpu** metrics. + +Metrics: +* `cpufreq` \ No newline at end of file diff --git a/collectors/numastatsMetric.md b/collectors/numastatsMetric.md new file mode 100644 index 0000000..8eb1a0c --- /dev/null +++ b/collectors/numastatsMetric.md @@ -0,0 +1,15 @@ + +## `numastat` collector +```json + "numastat": {} +``` + +The `numastat` collector reads data from `/sys/devices/system/node/node*/numastat` and outputs a handful **memoryDomain** metrics. See: https://www.kernel.org/doc/html/latest/admin-guide/numastat.html + +Metrics: +* `numastats_numa_hit`: A process wanted to allocate memory from this node, and succeeded. +* `numastats_numa_miss`: A process wanted to allocate memory from another node, but ended up with memory from this node. +* `numastats_numa_foreign`: A process wanted to allocate on this node, but ended up with memory from another node. +* `numastats_local_node`: A process ran on this node's CPU, and got memory from this node. +* `numastats_other_node`: A process ran on a different node's CPU, and got memory from this node. +* `numastats_interleave_hit`: Interleaving wanted to allocate from this node and succeeded. 
\ No newline at end of file From 1ea63332d3db5f7b885990605f0e81041a065ea1 Mon Sep 17 00:00:00 2001 From: Thomas Gruber Date: Tue, 8 Feb 2022 13:49:48 +0100 Subject: [PATCH 122/174] Update README.md --- collectors/README.md | 1 - 1 file changed, 1 deletion(-) diff --git a/collectors/README.md b/collectors/README.md index 558649e..393b200 100644 --- a/collectors/README.md +++ b/collectors/README.md @@ -36,7 +36,6 @@ In contrast to the configuration files for sinks and receivers, the collectors c * [`cpufreq_cpuinfo`](./cpufreqCpuinfoMetric.md) * [`numastat`](./numastatMetric.md) * [`gpfs`](./gpfsMetric.md) -* [`ipmistat`](./ipmiMetric.md) ## Todos From fec3c5981d2387a31c6c5eb3b8df72347c60f8db Mon Sep 17 00:00:00 2001 From: Thomas Roehl Date: Tue, 8 Feb 2022 18:04:08 +0100 Subject: [PATCH 123/174] Fix for gangliaSink --- sinks/gangliaSink.go | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/sinks/gangliaSink.go b/sinks/gangliaSink.go index a586375..f7ba30e 100644 --- a/sinks/gangliaSink.go +++ b/sinks/gangliaSink.go @@ -14,17 +14,20 @@ import ( ) const GMETRIC_EXEC = `gmetric` +const GMETRIC_CONFIG = `/etc/ganglia/gmond.conf` type GangliaSinkConfig struct { defaultSinkConfig - GmetricPath string `json:"gmetric_path"` - AddGangliaGroup bool `json:"add_ganglia_group"` + GmetricPath string `json:"gmetric_path,omitempty"` + GmetricConfig string `json:"gmetric_config,omitempty"` + AddGangliaGroup bool `json:"add_ganglia_group,omitempty"` } type GangliaSink struct { sink - gmetric_path string - config GangliaSinkConfig + gmetric_path string + gmetric_config string + config GangliaSinkConfig } func (s *GangliaSink) Init(config json.RawMessage) error { @@ -38,6 +41,7 @@ func (s *GangliaSink) Init(config json.RawMessage) error { } } s.gmetric_path = "" + s.gmetric_config = "" if len(s.config.GmetricPath) > 0 { p, err := exec.LookPath(s.config.GmetricPath) if err == nil { @@ -53,6 +57,9 @@ func (s *GangliaSink) Init(config json.RawMessage) 
error { if len(s.gmetric_path) == 0 { err = errors.New("cannot find executable 'gmetric'") } + if len(s.config.GmetricConfig) > 0 { + s.gmetric_config = s.config.GmetricConfig + } return err } @@ -93,6 +100,9 @@ func (s *GangliaSink) Write(point lp.CCMetric) error { if len(tagsstr) > 0 { argstr = append(argstr, fmt.Sprintf("--desc=%q", strings.Join(tagsstr, ","))) } + if len(s.gmetric_config) > 0 { + argstr = append(argstr, fmt.Sprintf("--conf=%s", s.gmetric_config)) + } argstr = append(argstr, fmt.Sprintf("--name=%s", point.Name())) for k, v := range point.Fields() { if k == "value" { @@ -115,7 +125,7 @@ func (s *GangliaSink) Write(point lp.CCMetric) error { } } } - command := exec.Command(string(GMETRIC_EXEC), strings.Join(argstr, " ")) + command := exec.Command(s.gmetric_path, argstr...) command.Wait() _, err = command.Output() return err From 7f78a5baf26f743a4d3e2549aefb956f9bbda267 Mon Sep 17 00:00:00 2001 From: Thomas Roehl Date: Tue, 8 Feb 2022 18:06:07 +0100 Subject: [PATCH 124/174] Add timeout options to httpSink --- sinks/httpSink.go | 54 +++++++++++++++++++++++++++++++++++------------ sinks/httpSink.md | 10 +++++++-- 2 files changed, 49 insertions(+), 15 deletions(-) diff --git a/sinks/httpSink.go b/sinks/httpSink.go index fc7b450..15d38fb 100644 --- a/sinks/httpSink.go +++ b/sinks/httpSink.go @@ -14,25 +14,34 @@ import ( type HttpSinkConfig struct { defaultSinkConfig - Host string `json:"host,omitempty"` - Port string `json:"port,omitempty"` - Database string `json:"database,omitempty"` - JWT string `json:"jwt,omitempty"` - SSL bool `json:"ssl,omitempty"` + Host string `json:"host,omitempty"` + Port string `json:"port,omitempty"` + Database string `json:"database,omitempty"` + JWT string `json:"jwt,omitempty"` + SSL bool `json:"ssl,omitempty"` + Timeout string `json:"timeout,omitempty"` + MaxIdleConns int `json:"max_idle_connections,omitempty"` + IdleConnTimeout string `json:"idle_connection_timeout,omitempty"` } type HttpSink struct { sink - client 
*http.Client - url, jwt string - encoder *influx.Encoder - buffer *bytes.Buffer - config HttpSinkConfig + client *http.Client + url, jwt string + encoder *influx.Encoder + buffer *bytes.Buffer + config HttpSinkConfig + maxIdleConns int + idleConnTimeout time.Duration + timeout time.Duration } func (s *HttpSink) Init(config json.RawMessage) error { s.name = "HttpSink" s.config.SSL = false + s.config.MaxIdleConns = 10 + s.config.IdleConnTimeout = "5s" + s.config.Timeout = "5s" if len(config) > 0 { err := json.Unmarshal(config, &s.config) if err != nil { @@ -42,8 +51,26 @@ func (s *HttpSink) Init(config json.RawMessage) error { if len(s.config.Host) == 0 || len(s.config.Port) == 0 || len(s.config.Database) == 0 { return errors.New("`host`, `port` and `database` config options required for TCP sink") } - - s.client = &http.Client{} + if s.config.MaxIdleConns > 0 { + s.maxIdleConns = s.config.MaxIdleConns + } + if len(s.config.IdleConnTimeout) > 0 { + t, err := time.ParseDuration(s.config.IdleConnTimeout) + if err == nil { + s.idleConnTimeout = t + } + } + if len(s.config.Timeout) > 0 { + t, err := time.ParseDuration(s.config.Timeout) + if err == nil { + s.timeout = t + } + } + tr := &http.Transport{ + MaxIdleConns: s.maxIdleConns, + IdleConnTimeout: s.idleConnTimeout, + } + s.client = &http.Client{Transport: tr, Timeout: s.timeout} proto := "http" if s.config.SSL { proto = "https" @@ -58,7 +85,8 @@ func (s *HttpSink) Init(config json.RawMessage) error { } func (s *HttpSink) Write(m lp.CCMetric) error { - _, err := s.encoder.Encode(m.ToPoint(s.config.MetaAsTags)) + p := m.ToPoint(s.config.MetaAsTags) + _, err := s.encoder.Encode(p) return err } diff --git a/sinks/httpSink.md b/sinks/httpSink.md index 5440a82..fe466e8 100644 --- a/sinks/httpSink.md +++ b/sinks/httpSink.md @@ -13,7 +13,10 @@ The `http` sink uses POST requests to a HTTP server to submit the metrics in the "host": "dbhost.example.com", "port": "4222", "jwt" : "0x0000q231", - "ssl" : false + "ssl" : false, + 
"timeout": "5s", + "max_idle_connections" : 10, + "idle_connection_timeout" : "5s" } } ``` @@ -24,4 +27,7 @@ The `http` sink uses POST requests to a HTTP server to submit the metrics in the - `host`: Hostname of the InfluxDB database server - `port`: Portnumber (as string) of the InfluxDB database server - `jwt`: JSON web tokens for authentification -- `ssl`: Activate SSL encryption \ No newline at end of file +- `ssl`: Activate SSL encryption +- `timeout`: General timeout for the HTTP client (default '5s') +- `max_idle_connections`: Maximally idle connections (default 10) +- `idle_connection_timeout`: Timeout for idle connections (default '5s') \ No newline at end of file From b4d7643c25999f0d93b736467e8372727bcf541e Mon Sep 17 00:00:00 2001 From: Holger Obermaier <40787752+ho-ob@users.noreply.github.com> Date: Wed, 9 Feb 2022 10:09:03 +0100 Subject: [PATCH 125/174] Add comments --- sinks/sinkManager.go | 50 +++++++++++++++++++++++++++----------------- 1 file changed, 31 insertions(+), 19 deletions(-) diff --git a/sinks/sinkManager.go b/sinks/sinkManager.go index 09b4fc4..b319eb4 100644 --- a/sinks/sinkManager.go +++ b/sinks/sinkManager.go @@ -37,38 +37,50 @@ type SinkManager interface { Close() } +// Init initializes the sink manager by: +// * Reading its configuration file +// * Adding the configured sinks and providing them with the corresponding config func (sm *sinkManager) Init(wg *sync.WaitGroup, sinkConfigFile string) error { sm.input = nil sm.done = make(chan bool) sm.wg = wg sm.sinks = make(map[string]Sink, 0) + if len(sinkConfigFile) == 0 { + return nil + } + // Read sink config file - if len(sinkConfigFile) > 0 { - configFile, err := os.Open(sinkConfigFile) + configFile, err := os.Open(sinkConfigFile) + if err != nil { + cclog.ComponentError("SinkManager", err.Error()) + return err + } + defer configFile.Close() + + // Parse config + jsonParser := json.NewDecoder(configFile) + var rawConfigs map[string]json.RawMessage + err = 
jsonParser.Decode(&rawConfigs) + if err != nil { + cclog.ComponentError("SinkManager", err.Error()) + return err + } + + // Start sinks + for name, raw := range rawConfigs { + err = sm.AddOutput(name, raw) if err != nil { cclog.ComponentError("SinkManager", err.Error()) - return err - } - defer configFile.Close() - jsonParser := json.NewDecoder(configFile) - var rawConfigs map[string]json.RawMessage - err = jsonParser.Decode(&rawConfigs) - if err != nil { - cclog.ComponentError("SinkManager", err.Error()) - return err - } - for name, raw := range rawConfigs { - err = sm.AddOutput(name, raw) - if err != nil { - cclog.ComponentError("SinkManager", err.Error()) - continue - } + continue } } + return nil } +// Start starts the sink managers background task, which +// distributes received metrics to the sinks func (sm *sinkManager) Start() { batchcount := 20 @@ -156,7 +168,7 @@ func (sm *sinkManager) Close() { // New creates a new initialized sink manager func New(wg *sync.WaitGroup, sinkConfigFile string) (SinkManager, error) { - sm := &sinkManager{} + sm := new(sinkManager) err := sm.Init(wg, sinkConfigFile) if err != nil { return nil, err From 1d299be3ea537c1597ef8f6df5557f5588d67a8d Mon Sep 17 00:00:00 2001 From: Holger Obermaier <40787752+ho-ob@users.noreply.github.com> Date: Wed, 9 Feb 2022 11:08:50 +0100 Subject: [PATCH 126/174] Add comments --- sinks/influxAsyncSink.go | 19 +++++++++++++------ sinks/influxSink.go | 17 +++++++++-------- 2 files changed, 22 insertions(+), 14 deletions(-) diff --git a/sinks/influxAsyncSink.go b/sinks/influxAsyncSink.go index 3315456..ba60799 100644 --- a/sinks/influxAsyncSink.go +++ b/sinks/influxAsyncSink.go @@ -1,8 +1,6 @@ package sinks import ( - // "context" - "crypto/tls" "encoding/json" "errors" @@ -54,10 +52,14 @@ func (s *InfluxAsyncSink) connect() error { if batch == 0 { batch = 100 } - s.client = influxdb2.NewClientWithOptions(uri, auth, - influxdb2.DefaultOptions().SetBatchSize(batch).SetTLSConfig(&tls.Config{ + 
clientOptions := influxdb2.DefaultOptions() + clientOptions.SetBatchSize(batch) + clientOptions.SetTLSConfig( + &tls.Config{ InsecureSkipVerify: true, - })) + }, + ) + s.client = influxdb2.NewClientWithOptions(uri, auth, clientOptions) s.writeApi = s.client.WriteAPI(s.config.Organization, s.config.Database) return nil } @@ -78,7 +80,11 @@ func (s *InfluxAsyncSink) Init(config json.RawMessage) error { len(s.config.Password) == 0 { return errors.New("not all configuration variables set required by InfluxAsyncSink") } + + // Connect to InfluxDB server err := s.connect() + + // Start background: Read from error channel s.errors = s.writeApi.Errors() go func() { for err := range s.errors { @@ -90,7 +96,8 @@ func (s *InfluxAsyncSink) Init(config json.RawMessage) error { func (s *InfluxAsyncSink) Write(m lp.CCMetric) error { s.writeApi.WritePoint( - m.ToPoint(s.config.MetaAsTags)) + m.ToPoint(s.config.MetaAsTags), + ) return nil } diff --git a/sinks/influxSink.go b/sinks/influxSink.go index 156f6eb..99304c0 100644 --- a/sinks/influxSink.go +++ b/sinks/influxSink.go @@ -46,14 +46,13 @@ func (s *InfluxSink) connect() error { auth = fmt.Sprintf("%s:%s", s.config.User, s.config.Password) } cclog.ComponentDebug(s.name, "Using URI", uri, "Org", s.config.Organization, "Bucket", s.config.Database) - s.client = - influxdb2.NewClientWithOptions( - uri, - auth, - influxdb2.DefaultOptions().SetTLSConfig( - &tls.Config{InsecureSkipVerify: true}, - ), - ) + clientOptions := influxdb2.DefaultOptions() + clientOptions.SetTLSConfig( + &tls.Config{ + InsecureSkipVerify: true, + }, + ) + s.client = influxdb2.NewClientWithOptions(uri, auth, clientOptions) s.writeApi = s.client.WriteAPIBlocking(s.config.Organization, s.config.Database) return nil } @@ -73,6 +72,8 @@ func (s *InfluxSink) Init(config json.RawMessage) error { len(s.config.Password) == 0 { return errors.New("not all configuration variables set required by InfluxSink") } + + // Connect to InfluxDB server return s.connect() } From 
a0e97d216a392fd1979611430e02ba8bf589d157 Mon Sep 17 00:00:00 2001 From: Holger Obermaier <40787752+ho-ob@users.noreply.github.com> Date: Wed, 9 Feb 2022 19:47:49 +0100 Subject: [PATCH 127/174] Move all flush operations to the sinks --- sinks/httpSink.go | 22 +++++++++++++++++++++- sinks/sinkManager.go | 13 ------------- 2 files changed, 21 insertions(+), 14 deletions(-) diff --git a/sinks/httpSink.go b/sinks/httpSink.go index 15d38fb..844de0b 100644 --- a/sinks/httpSink.go +++ b/sinks/httpSink.go @@ -22,6 +22,7 @@ type HttpSinkConfig struct { Timeout string `json:"timeout,omitempty"` MaxIdleConns int `json:"max_idle_connections,omitempty"` IdleConnTimeout string `json:"idle_connection_timeout,omitempty"` + BatchSize int `json:"batch_size,omitempty"` } type HttpSink struct { @@ -34,14 +35,19 @@ type HttpSink struct { maxIdleConns int idleConnTimeout time.Duration timeout time.Duration + batchCounter int } func (s *HttpSink) Init(config json.RawMessage) error { + // Set default values s.name = "HttpSink" s.config.SSL = false s.config.MaxIdleConns = 10 s.config.IdleConnTimeout = "5s" s.config.Timeout = "5s" + s.config.BatchSize = 20 + + // Read config if len(config) > 0 { err := json.Unmarshal(config, &s.config) if err != nil { @@ -87,27 +93,40 @@ func (s *HttpSink) Init(config json.RawMessage) error { func (s *HttpSink) Write(m lp.CCMetric) error { p := m.ToPoint(s.config.MetaAsTags) _, err := s.encoder.Encode(p) + + // Flush when received more metrics than batch size + s.batchCounter++ + if s.batchCounter > s.config.BatchSize { + s.Flush() + } return err } func (s *HttpSink) Flush() error { + // Create new request to send buffer req, err := http.NewRequest(http.MethodPost, s.url, s.buffer) if err != nil { return err } + // Set authorization header if len(s.jwt) != 0 { req.Header.Set("Authorization", fmt.Sprintf("Bearer %s", s.jwt)) } + // Send res, err := s.client.Do(req) + + // Clear buffer s.buffer.Reset() + // Handle error code if err != nil { return err } - if 
res.StatusCode != 200 { + // Handle status code + if res.StatusCode != http.StatusOK { return errors.New(res.Status) } @@ -115,5 +134,6 @@ func (s *HttpSink) Flush() error { } func (s *HttpSink) Close() { + s.Flush() s.client.CloseIdleConnections() } diff --git a/sinks/sinkManager.go b/sinks/sinkManager.go index b319eb4..80877c8 100644 --- a/sinks/sinkManager.go +++ b/sinks/sinkManager.go @@ -82,8 +82,6 @@ func (sm *sinkManager) Init(wg *sync.WaitGroup, sinkConfigFile string) error { // Start starts the sink managers background task, which // distributes received metrics to the sinks func (sm *sinkManager) Start() { - batchcount := 20 - sm.wg.Add(1) go func() { defer sm.wg.Done() @@ -91,7 +89,6 @@ func (sm *sinkManager) Start() { // Sink manager is done done := func() { for _, s := range sm.sinks { - s.Flush() s.Close() } @@ -111,16 +108,6 @@ func (sm *sinkManager) Start() { for _, s := range sm.sinks { s.Write(p) } - - // Flush all outputs - if batchcount == 0 { - cclog.ComponentDebug("SinkManager", "FLUSH") - for _, s := range sm.sinks { - s.Flush() - } - batchcount = 20 - } - batchcount-- } } }() From acf5db543e1a56f7aa1046cc1316674f17db743a Mon Sep 17 00:00:00 2001 From: Holger Obermaier <40787752+ho-ob@users.noreply.github.com> Date: Wed, 9 Feb 2022 23:22:54 +0100 Subject: [PATCH 128/174] Fix: Reset counter --- sinks/httpSink.go | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/sinks/httpSink.go b/sinks/httpSink.go index 844de0b..22ecfce 100644 --- a/sinks/httpSink.go +++ b/sinks/httpSink.go @@ -47,6 +47,9 @@ func (s *HttpSink) Init(config json.RawMessage) error { s.config.Timeout = "5s" s.config.BatchSize = 20 + // Reset counter + s.batchCounter = 0 + // Read config if len(config) > 0 { err := json.Unmarshal(config, &s.config) @@ -103,6 +106,14 @@ func (s *HttpSink) Write(m lp.CCMetric) error { } func (s *HttpSink) Flush() error { + // Do not flush empty buffer + if s.batchCounter == 0 { + return nil + } + + // Reset counter + s.batchCounter = 0 
+ // Create new request to send buffer req, err := http.NewRequest(http.MethodPost, s.url, s.buffer) if err != nil { From 82138df48e178fa76d8e788e1373c5d336e187e2 Mon Sep 17 00:00:00 2001 From: Holger Obermaier <40787752+ho-ob@users.noreply.github.com> Date: Thu, 10 Feb 2022 09:28:06 +0100 Subject: [PATCH 129/174] Refactor: Replace readOneLine() by ioutil.ReadFile() --- collectors/cpufreqMetric.go | 42 +++++++++++----------------------- collectors/infinibandMetric.go | 16 +++++++++---- 2 files changed, 24 insertions(+), 34 deletions(-) diff --git a/collectors/cpufreqMetric.go b/collectors/cpufreqMetric.go index 5f47ce5..da9f2d7 100644 --- a/collectors/cpufreqMetric.go +++ b/collectors/cpufreqMetric.go @@ -1,10 +1,9 @@ package collectors import ( - "bufio" "encoding/json" "fmt" - "os" + "io/ioutil" "path/filepath" "strconv" "strings" @@ -15,23 +14,6 @@ import ( "golang.org/x/sys/unix" ) -// -// readOneLine reads one line from a file. -// It returns ok when file was successfully read. -// In this case text contains the first line of the files contents. 
-// -func readOneLine(filename string) (text string, ok bool) { - file, err := os.Open(filename) - if err != nil { - return - } - defer file.Close() - scanner := bufio.NewScanner(file) - ok = scanner.Scan() - text = scanner.Text() - return -} - type CPUFreqCollectorTopology struct { processor string // logical processor number (continuous, starting at 0) coreID string // socket local core ID @@ -105,10 +87,11 @@ func (m *CPUFreqCollector) Init(config json.RawMessage) error { // Read package ID physicalPackageIDFile := filepath.Join(cpuDir, "topology", "physical_package_id") - physicalPackageID, ok := readOneLine(physicalPackageIDFile) - if !ok { - return fmt.Errorf("Unable to read physical package ID from file '%s'", physicalPackageIDFile) + line, err := ioutil.ReadFile(physicalPackageIDFile) + if err != nil { + return fmt.Errorf("Unable to read physical package ID from file '%s': %v", physicalPackageIDFile, err) } + physicalPackageID := strings.TrimSpace(string(line)) physicalPackageID_int, err := strconv.ParseInt(physicalPackageID, 10, 64) if err != nil { return fmt.Errorf("Unable to convert packageID '%s' to int64: %v", physicalPackageID, err) @@ -116,10 +99,11 @@ func (m *CPUFreqCollector) Init(config json.RawMessage) error { // Read core ID coreIDFile := filepath.Join(cpuDir, "topology", "core_id") - coreID, ok := readOneLine(coreIDFile) - if !ok { - return fmt.Errorf("Unable to read core ID from file '%s'", coreIDFile) + line, err = ioutil.ReadFile(coreIDFile) + if err != nil { + return fmt.Errorf("Unable to read core ID from file '%s': %v", coreIDFile, err) } + coreID := strings.TrimSpace(string(line)) coreID_int, err := strconv.ParseInt(coreID, 10, 64) if err != nil { return fmt.Errorf("Unable to convert coreID '%s' to int64: %v", coreID, err) @@ -205,14 +189,14 @@ func (m *CPUFreqCollector) Read(interval time.Duration, output chan lp.CCMetric) } // Read current frequency - line, ok := readOneLine(t.scalingCurFreqFile) - if !ok { + line, err := 
ioutil.ReadFile(t.scalingCurFreqFile) + if err != nil { cclog.ComponentError( m.name, - fmt.Sprintf("Read(): Failed to read one line from file '%s'", t.scalingCurFreqFile)) + fmt.Sprintf("Read(): Failed to read file '%s': %v", t.scalingCurFreqFile, err)) continue } - cpuFreq, err := strconv.ParseInt(line, 10, 64) + cpuFreq, err := strconv.ParseInt(strings.TrimSpace(string(line)), 10, 64) if err != nil { cclog.ComponentError( m.name, diff --git a/collectors/infinibandMetric.go b/collectors/infinibandMetric.go index 4c7615b..e5197de 100644 --- a/collectors/infinibandMetric.go +++ b/collectors/infinibandMetric.go @@ -2,6 +2,7 @@ package collectors import ( "fmt" + "io/ioutil" "os" cclog "github.com/ClusterCockpit/cc-metric-collector/internal/ccLogger" @@ -68,8 +69,12 @@ func (m *InfinibandCollector) Init(config json.RawMessage) error { for _, path := range ibDirs { // Skip, when no LID is assigned - LID, ok := readOneLine(path + "/lid") - if !ok || LID == "0x0" { + line, err := ioutil.ReadFile(filepath.Join(path, "lid")) + if err != nil { + continue + } + LID := strings.TrimSpace(string(line)) + if LID == "0x0" { continue } @@ -142,13 +147,14 @@ func (m *InfinibandCollector) Read(interval time.Duration, output chan lp.CCMetr // device info info := &m.info[i] for counterName, counterFile := range info.portCounterFiles { - data, ok := readOneLine(counterFile) - if !ok { + line, err := ioutil.ReadFile(counterFile) + if err != nil { cclog.ComponentError( m.name, - fmt.Sprintf("Read(): Failed to read one line from file '%s'", counterFile)) + fmt.Sprintf("Read(): Failed to read from file '%s': %v", counterFile, err)) continue } + data := strings.TrimSpace(string(line)) v, err := strconv.ParseInt(data, 10, 64) if err != nil { cclog.ComponentError( From 2aa8c812a63bc6f7439207876eca53c353879a5d Mon Sep 17 00:00:00 2001 From: Holger Obermaier <40787752+ho-ob@users.noreply.github.com> Date: Thu, 10 Feb 2022 09:43:02 +0100 Subject: [PATCH 130/174] Add config option flush_interval 
--- sinks/influxAsyncSink.go | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/sinks/influxAsyncSink.go b/sinks/influxAsyncSink.go index ba60799..20aa60c 100644 --- a/sinks/influxAsyncSink.go +++ b/sinks/influxAsyncSink.go @@ -22,7 +22,10 @@ type InfluxAsyncSinkConfig struct { Organization string `json:"organization,omitempty"` SSL bool `json:"ssl,omitempty"` RetentionPol string `json:"retention_policy,omitempty"` - BatchSize uint `json:"batch_size,omitempty"` + // Maximum number of points sent to server in single request. Default 5000 + BatchSize uint `json:"batch_size,omitempty"` + // Interval, in ms, in which is buffer flushed if it has not been already written (by reaching batch size) . Default 1000ms + FlushInterval uint `json:"flush_interval,omitempty"` } type InfluxAsyncSink struct { @@ -48,12 +51,13 @@ func (s *InfluxAsyncSink) connect() error { auth = fmt.Sprintf("%s:%s", s.config.User, s.config.Password) } cclog.ComponentDebug(s.name, "Using URI", uri, "Org", s.config.Organization, "Bucket", s.config.Database) - batch := s.config.BatchSize - if batch == 0 { - batch = 100 - } clientOptions := influxdb2.DefaultOptions() - clientOptions.SetBatchSize(batch) + if s.config.BatchSize != 0 { + clientOptions.SetBatchSize(s.config.BatchSize) + } + if s.config.FlushInterval != 0 { + clientOptions.SetFlushInterval(s.config.FlushInterval) + } clientOptions.SetTLSConfig( &tls.Config{ InsecureSkipVerify: true, @@ -66,7 +70,10 @@ func (s *InfluxAsyncSink) connect() error { func (s *InfluxAsyncSink) Init(config json.RawMessage) error { s.name = "InfluxSink" + + // Set default for maximum number of points sent to server in single request. 
s.config.BatchSize = 100 + if len(config) > 0 { err := json.Unmarshal(config, &s.config) if err != nil { From 442e512f2dfe0af9293fef5881905cdeba0ebb8e Mon Sep 17 00:00:00 2001 From: Lou Date: Thu, 10 Feb 2022 13:12:32 +0100 Subject: [PATCH 131/174] Automatically flush batched writes in the HTTP sink (#31) * Add error handling for Sink.Write * simplify HttpSink config * HttpSink: dynamically sized batches flushed after timer * fix panic if sink type does not exist --- sinks/httpSink.go | 93 +++++++++++++++++++++++++++----------------- sinks/httpSink.md | 20 ++++------ sinks/sinkManager.go | 6 ++- 3 files changed, 69 insertions(+), 50 deletions(-) diff --git a/sinks/httpSink.go b/sinks/httpSink.go index 22ecfce..ce46bab 100644 --- a/sinks/httpSink.go +++ b/sinks/httpSink.go @@ -6,49 +6,45 @@ import ( "errors" "fmt" "net/http" + "sync" "time" + cclog "github.com/ClusterCockpit/cc-metric-collector/internal/ccLogger" lp "github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric" influx "github.com/influxdata/line-protocol" ) type HttpSinkConfig struct { defaultSinkConfig - Host string `json:"host,omitempty"` - Port string `json:"port,omitempty"` - Database string `json:"database,omitempty"` + URL string `json:"url,omitempty"` JWT string `json:"jwt,omitempty"` - SSL bool `json:"ssl,omitempty"` Timeout string `json:"timeout,omitempty"` MaxIdleConns int `json:"max_idle_connections,omitempty"` IdleConnTimeout string `json:"idle_connection_timeout,omitempty"` - BatchSize int `json:"batch_size,omitempty"` + FlushDelay string `json:"flush_delay,omitempty"` } type HttpSink struct { sink client *http.Client - url, jwt string encoder *influx.Encoder + lock sync.Mutex // Flush() runs in another goroutine, so this lock has to protect the buffer buffer *bytes.Buffer + flushTimer *time.Timer config HttpSinkConfig maxIdleConns int idleConnTimeout time.Duration timeout time.Duration - batchCounter int + flushDelay time.Duration } func (s *HttpSink) Init(config json.RawMessage) 
error { // Set default values s.name = "HttpSink" - s.config.SSL = false s.config.MaxIdleConns = 10 s.config.IdleConnTimeout = "5s" s.config.Timeout = "5s" - s.config.BatchSize = 20 - - // Reset counter - s.batchCounter = 0 + s.config.FlushDelay = "1s" // Read config if len(config) > 0 { @@ -57,8 +53,8 @@ func (s *HttpSink) Init(config json.RawMessage) error { return err } } - if len(s.config.Host) == 0 || len(s.config.Port) == 0 || len(s.config.Database) == 0 { - return errors.New("`host`, `port` and `database` config options required for TCP sink") + if len(s.config.URL) == 0 { + return errors.New("`url` config option is required for HTTP sink") } if s.config.MaxIdleConns > 0 { s.maxIdleConns = s.config.MaxIdleConns @@ -75,17 +71,17 @@ func (s *HttpSink) Init(config json.RawMessage) error { s.timeout = t } } + if len(s.config.FlushDelay) > 0 { + t, err := time.ParseDuration(s.config.FlushDelay) + if err == nil { + s.flushDelay = t + } + } tr := &http.Transport{ MaxIdleConns: s.maxIdleConns, IdleConnTimeout: s.idleConnTimeout, } s.client = &http.Client{Transport: tr, Timeout: s.timeout} - proto := "http" - if s.config.SSL { - proto = "https" - } - s.url = fmt.Sprintf("%s://%s:%s/%s", proto, s.config.Host, s.config.Port, s.config.Database) - s.jwt = s.config.JWT s.buffer = &bytes.Buffer{} s.encoder = influx.NewEncoder(s.buffer) s.encoder.SetPrecision(time.Second) @@ -94,35 +90,57 @@ func (s *HttpSink) Init(config json.RawMessage) error { } func (s *HttpSink) Write(m lp.CCMetric) error { - p := m.ToPoint(s.config.MetaAsTags) - _, err := s.encoder.Encode(p) + if s.buffer.Len() == 0 && s.flushDelay != 0 { + // This is the first write since the last flush, start the flushTimer! 
+ if s.flushTimer != nil && s.flushTimer.Stop() { + cclog.ComponentDebug("HttpSink", "unexpected: the flushTimer was already running?") + } - // Flush when received more metrics than batch size - s.batchCounter++ - if s.batchCounter > s.config.BatchSize { - s.Flush() + // Run a batched flush for all lines that have arrived in the last second + s.flushTimer = time.AfterFunc(s.flushDelay, func() { + if err := s.Flush(); err != nil { + cclog.ComponentError("HttpSink", "flush failed:", err.Error()) + } + }) } + + p := m.ToPoint(s.config.MetaAsTags) + + s.lock.Lock() + _, err := s.encoder.Encode(p) + s.lock.Unlock() // defer does not work here as Flush() takes the lock as well + + if err != nil { + return err + } + + // Flush synchronously if "flush_delay" is zero + if s.flushDelay == 0 { + return s.Flush() + } + return err } func (s *HttpSink) Flush() error { + // buffer is read by client.Do, prevent concurrent modifications + s.lock.Lock() + defer s.lock.Unlock() + // Do not flush empty buffer - if s.batchCounter == 0 { + if s.buffer.Len() == 0 { return nil } - // Reset counter - s.batchCounter = 0 - // Create new request to send buffer - req, err := http.NewRequest(http.MethodPost, s.url, s.buffer) + req, err := http.NewRequest(http.MethodPost, s.config.URL, s.buffer) if err != nil { return err } // Set authorization header - if len(s.jwt) != 0 { - req.Header.Set("Authorization", fmt.Sprintf("Bearer %s", s.jwt)) + if len(s.config.JWT) != 0 { + req.Header.Set("Authorization", fmt.Sprintf("Bearer %s", s.config.JWT)) } // Send @@ -131,12 +149,12 @@ func (s *HttpSink) Flush() error { // Clear buffer s.buffer.Reset() - // Handle error code + // Handle transport/tcp errors if err != nil { return err } - // Handle status code + // Handle application errors if res.StatusCode != http.StatusOK { return errors.New(res.Status) } @@ -145,6 +163,9 @@ func (s *HttpSink) Flush() error { } func (s *HttpSink) Close() { - s.Flush() + s.flushTimer.Stop() + if err := s.Flush(); err != 
nil { + cclog.ComponentError("HttpSink", "flush failed:", err.Error()) + } s.client.CloseIdleConnections() } diff --git a/sinks/httpSink.md b/sinks/httpSink.md index fe466e8..23203a2 100644 --- a/sinks/httpSink.md +++ b/sinks/httpSink.md @@ -9,25 +9,21 @@ The `http` sink uses POST requests to a HTTP server to submit the metrics in the "": { "type": "http", "meta_as_tags" : true, - "database" : "mymetrics", - "host": "dbhost.example.com", - "port": "4222", - "jwt" : "0x0000q231", - "ssl" : false, + "url" : "https://my-monitoring.example.com:1234/api/write", + "jwt" : "blabla.blabla.blabla", "timeout": "5s", "max_idle_connections" : 10, - "idle_connection_timeout" : "5s" + "idle_connection_timeout" : "5s", + "flush_delay": "2s", } } ``` - `type`: makes the sink an `http` sink - `meta_as_tags`: print all meta information as tags in the output (optional) -- `database`: All metrics are written to this bucket -- `host`: Hostname of the InfluxDB database server -- `port`: Portnumber (as string) of the InfluxDB database server -- `jwt`: JSON web tokens for authentification -- `ssl`: Activate SSL encryption +- `url`: The full URL of the endpoint +- `jwt`: JSON web tokens for authentification (Using the *Bearer* scheme) - `timeout`: General timeout for the HTTP client (default '5s') - `max_idle_connections`: Maximally idle connections (default 10) -- `idle_connection_timeout`: Timeout for idle connections (default '5s') \ No newline at end of file +- `idle_connection_timeout`: Timeout for idle connections (default '5s') +- `flush_delay`: Batch all writes arriving in during this duration (default '1s', batching can be disabled by setting it to 0) diff --git a/sinks/sinkManager.go b/sinks/sinkManager.go index 80877c8..ff6e01d 100644 --- a/sinks/sinkManager.go +++ b/sinks/sinkManager.go @@ -106,7 +106,9 @@ func (sm *sinkManager) Start() { // Send received metric to all outputs cclog.ComponentDebug("SinkManager", "WRITE", p) for _, s := range sm.sinks { - s.Write(p) + if err := 
s.Write(p); err != nil { + cclog.ComponentError("SinkManager", "WRITE", s.Name(), "write failed:", err.Error()) + } } } } @@ -131,7 +133,7 @@ func (sm *sinkManager) AddOutput(name string, rawConfig json.RawMessage) error { } } if _, found := AvailableSinks[sinkConfig.Type]; !found { - cclog.ComponentError("SinkManager", "SKIP", name, "unknown sink:", err.Error()) + cclog.ComponentError("SinkManager", "SKIP", name, "unknown sink:", sinkConfig.Type) return err } s := AvailableSinks[sinkConfig.Type] From 184d60cc584741978d5e0555a1c2a8a5a9095948 Mon Sep 17 00:00:00 2001 From: Thomas Roehl Date: Thu, 10 Feb 2022 15:21:26 +0100 Subject: [PATCH 132/174] Locking in MetricCache --- internal/metricRouter/metricCache.go | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) diff --git a/internal/metricRouter/metricCache.go b/internal/metricRouter/metricCache.go index 67522c9..8886f47 100644 --- a/internal/metricRouter/metricCache.go +++ b/internal/metricRouter/metricCache.go @@ -23,6 +23,7 @@ type metricCachePeriod struct { type metricCache struct { numPeriods int curPeriod int + lock sync.Mutex intervals []*metricCachePeriod wg *sync.WaitGroup ticker mct.MultiChanTicker @@ -103,9 +104,11 @@ func (c *metricCache) Start() { done() return case tick := <-c.tickchan: + c.lock.Lock() old := rotate(tick) // Get the last period and evaluate aggregation metrics starttime, endtime, metrics := c.GetPeriod(old) + c.lock.Unlock() if len(metrics) > 0 { c.aggEngine.Eval(starttime, endtime, metrics) } else { @@ -123,6 +126,7 @@ func (c *metricCache) Start() { // to avoid reallocations func (c *metricCache) Add(metric lp.CCMetric) { if c.curPeriod >= 0 && c.curPeriod < c.numPeriods { + c.lock.Lock() p := c.intervals[c.curPeriod] if p.numMetrics < p.sizeMetrics { p.metrics[p.numMetrics] = metric @@ -134,6 +138,7 @@ func (c *metricCache) Add(metric lp.CCMetric) { p.sizeMetrics = p.sizeMetrics + 1 p.stopstamp = metric.Time() } + c.lock.Unlock() } } @@ -149,16 +154,26 @@ func 
(c *metricCache) DeleteAggregation(name string) error { // is the current one, index=1 the last interval and so on. Returns and empty array if a wrong index // is given (negative index, index larger than configured number of total intervals, ...) func (c *metricCache) GetPeriod(index int) (time.Time, time.Time, []lp.CCMetric) { + var start time.Time = time.Now() + var stop time.Time = time.Now() + var metrics []lp.CCMetric if index >= 0 && index < c.numPeriods { pindex := c.curPeriod - index if pindex < 0 { pindex = c.numPeriods - pindex } if pindex >= 0 && pindex < c.numPeriods { - return c.intervals[pindex].startstamp, c.intervals[pindex].stopstamp, c.intervals[pindex].metrics + start = c.intervals[pindex].startstamp + stop = c.intervals[pindex].stopstamp + metrics = c.intervals[pindex].metrics + //return c.intervals[pindex].startstamp, c.intervals[pindex].stopstamp, c.intervals[pindex].metrics + } else { + metrics = make([]lp.CCMetric, 0) } + } else { + metrics = make([]lp.CCMetric, 0) } - return time.Now(), time.Now(), make([]lp.CCMetric, 0) + return start, stop, metrics } // Close finishes / stops the metric cache From b15fdf72b9c677050f076529cadf00221b6aeb83 Mon Sep 17 00:00:00 2001 From: Thomas Roehl Date: Fri, 11 Feb 2022 14:20:06 +0100 Subject: [PATCH 133/174] Exclude metrics and devices in Init() for NvidiaCollector --- collectors/nvidiaMetric.go | 370 ++++++++++++++++++++++--------------- 1 file changed, 221 insertions(+), 149 deletions(-) diff --git a/collectors/nvidiaMetric.go b/collectors/nvidiaMetric.go index 1eff3be..374f7ee 100644 --- a/collectors/nvidiaMetric.go +++ b/collectors/nvidiaMetric.go @@ -6,6 +6,8 @@ import ( "fmt" "log" "time" + + cclog "github.com/ClusterCockpit/cc-metric-collector/internal/ccLogger" lp "github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric" "github.com/NVIDIA/go-nvml/pkg/nvml" ) @@ -13,12 +15,20 @@ import ( type NvidiaCollectorConfig struct { ExcludeMetrics []string `json:"exclude_metrics,omitempty"` 
ExcludeDevices []string `json:"exclude_devices,omitempty"` + AddPciInfoTag bool `json:"add_pci_info_tag,omitempty"` +} + +type NvidiaCollectorDevice struct { + device nvml.Device + excludeMetrics map[string]bool + tags map[string]string } type NvidiaCollector struct { metricCollector num_gpus int config NvidiaCollectorConfig + gpus []NvidiaCollectorDevice } func (m *NvidiaCollector) CatchPanic() { @@ -31,6 +41,7 @@ func (m *NvidiaCollector) CatchPanic() { func (m *NvidiaCollector) Init(config json.RawMessage) error { var err error m.name = "NvidiaCollector" + m.config.AddPciInfoTag = false m.setup() m.meta = map[string]string{"source": m.name, "group": "Nvidia"} if len(config) > 0 { @@ -44,13 +55,48 @@ func (m *NvidiaCollector) Init(config json.RawMessage) error { ret := nvml.Init() if ret != nvml.SUCCESS { err = errors.New(nvml.ErrorString(ret)) + cclog.ComponentError(m.name, "Unable to initialize NVML", err.Error()) return err } - m.num_gpus, ret = nvml.DeviceGetCount() + num_gpus, ret := nvml.DeviceGetCount() if ret != nvml.SUCCESS { err = errors.New(nvml.ErrorString(ret)) + cclog.ComponentError(m.name, "Unable to get device count", err.Error()) return err } + m.gpus = make([]NvidiaCollectorDevice, num_gpus) + idx := 0 + for i := 0; i < num_gpus && idx < num_gpus; i++ { + str_i := fmt.Sprintf("%d", i) + if _, skip := stringArrayContains(m.config.ExcludeDevices, str_i); skip { + continue + } + + device, ret := nvml.DeviceGetHandleByIndex(i) + if ret != nvml.SUCCESS { + err = errors.New(nvml.ErrorString(ret)) + cclog.ComponentError(m.name, "Unable to get device at index", i, ":", err.Error()) + return err + } + g := m.gpus[idx] + g.device = device + g.tags = map[string]string{"type": "accelerator", "type-id": str_i} + g.excludeMetrics = map[string]bool{} + for _, e := range m.config.ExcludeMetrics { + g.excludeMetrics[e] = true + } + if m.config.AddPciInfoTag { + pciinfo, ret := nvml.DeviceGetPciInfo(g.device) + if ret != nvml.SUCCESS { + err = 
errors.New(nvml.ErrorString(ret)) + cclog.ComponentError(m.name, "Unable to get pciInfo for device at index", i, ":", err.Error()) + return err + } + g.tags["pci_identifier"] = fmt.Sprintf("%08X:%02X:%02X.0", pciinfo.Domain, pciinfo.Bus, pciinfo.Device) + } + m.gpus[idx] = g + idx++ + } m.init = true return nil } @@ -59,207 +105,233 @@ func (m *NvidiaCollector) Read(interval time.Duration, output chan lp.CCMetric) if !m.init { return } - for i := 0; i < m.num_gpus; i++ { - device, ret := nvml.DeviceGetHandleByIndex(i) - if ret != nvml.SUCCESS { - log.Fatalf("Unable to get device at index %d: %v", i, nvml.ErrorString(ret)) - return - } - _, skip := stringArrayContains(m.config.ExcludeDevices, fmt.Sprintf("%d", i)) - if skip { - continue - } - tags := map[string]string{"type": "accelerator", "type-id": fmt.Sprintf("%d", i)} - util, ret := nvml.DeviceGetUtilizationRates(device) - if ret == nvml.SUCCESS { - _, skip = stringArrayContains(m.config.ExcludeMetrics, "nv_util") - y, err := lp.New("nv_util", tags, m.meta, map[string]interface{}{"value": float64(util.Gpu)}, time.Now()) - if err == nil && !skip { - output <- y + for _, device := range m.gpus { + + exclude := func(metric string) bool { + if _, ok := device.excludeMetrics[metric]; !ok { + return true } - _, skip = stringArrayContains(m.config.ExcludeMetrics, "nv_mem_util") - y, err = lp.New("nv_mem_util", tags, m.meta, map[string]interface{}{"value": float64(util.Memory)}, time.Now()) - if err == nil && !skip { - output <- y + return false + } + + ex_nv_util := exclude("nv_util") + ex_nv_mem_util := exclude("nv_mem_util") + if (!ex_nv_util) || (!ex_nv_mem_util) { + util, ret := nvml.DeviceGetUtilizationRates(device.device) + if ret == nvml.SUCCESS { + if !ex_nv_util { + y, err := lp.New("nv_util", device.tags, m.meta, map[string]interface{}{"value": float64(util.Gpu)}, time.Now()) + if err == nil { + output <- y + } + } + if !ex_nv_mem_util { + y, err := lp.New("nv_mem_util", device.tags, m.meta, 
map[string]interface{}{"value": float64(util.Memory)}, time.Now()) + if err == nil { + output <- y + } + } } } - meminfo, ret := nvml.DeviceGetMemoryInfo(device) - if ret == nvml.SUCCESS { - t := float64(meminfo.Total) / (1024 * 1024) - _, skip = stringArrayContains(m.config.ExcludeMetrics, "nv_mem_total") - y, err := lp.New("nv_mem_total", tags, m.meta, map[string]interface{}{"value": t}, time.Now()) - if err == nil && !skip { - y.AddMeta("unit", "MByte") - output <- y - } - f := float64(meminfo.Used) / (1024 * 1024) - _, skip = stringArrayContains(m.config.ExcludeMetrics, "nv_fb_memory") - y, err = lp.New("nv_fb_memory", tags, m.meta, map[string]interface{}{"value": f}, time.Now()) - if err == nil && !skip { - y.AddMeta("unit", "MByte") - output <- y + ex_nv_mem_total := exclude("nv_mem_total") + ex_nv_fb_memory := exclude("nv_fb_memory") + if (!ex_nv_mem_total) || (!ex_nv_fb_memory) { + meminfo, ret := nvml.DeviceGetMemoryInfo(device.device) + if ret == nvml.SUCCESS { + if !ex_nv_mem_total { + t := float64(meminfo.Total) / (1024 * 1024) + y, err := lp.New("nv_mem_total", device.tags, m.meta, map[string]interface{}{"value": t}, time.Now()) + if err == nil { + y.AddMeta("unit", "MByte") + output <- y + } + } + + if !ex_nv_fb_memory { + f := float64(meminfo.Used) / (1024 * 1024) + y, err := lp.New("nv_fb_memory", device.tags, m.meta, map[string]interface{}{"value": f}, time.Now()) + if err == nil { + y.AddMeta("unit", "MByte") + output <- y + } + } } } - temp, ret := nvml.DeviceGetTemperature(device, nvml.TEMPERATURE_GPU) - if ret == nvml.SUCCESS { - _, skip = stringArrayContains(m.config.ExcludeMetrics, "nv_temp") - y, err := lp.New("nv_temp", tags, m.meta, map[string]interface{}{"value": float64(temp)}, time.Now()) - if err == nil && !skip { - y.AddMeta("unit", "degC") - output <- y + if !exclude("nv_temp") { + temp, ret := nvml.DeviceGetTemperature(device.device, nvml.TEMPERATURE_GPU) + if ret == nvml.SUCCESS { + y, err := lp.New("nv_temp", device.tags, m.meta, 
map[string]interface{}{"value": float64(temp)}, time.Now()) + if err == nil { + y.AddMeta("unit", "degC") + output <- y + } } } - fan, ret := nvml.DeviceGetFanSpeed(device) - if ret == nvml.SUCCESS { - _, skip = stringArrayContains(m.config.ExcludeMetrics, "nv_fan") - y, err := lp.New("nv_fan", tags, m.meta, map[string]interface{}{"value": float64(fan)}, time.Now()) - if err == nil && !skip { - output <- y + if !exclude("nv_fan") { + fan, ret := nvml.DeviceGetFanSpeed(device.device) + if ret == nvml.SUCCESS { + y, err := lp.New("nv_fan", device.tags, m.meta, map[string]interface{}{"value": float64(fan)}, time.Now()) + if err == nil { + output <- y + } } } - _, ecc_pend, ret := nvml.DeviceGetEccMode(device) - if ret == nvml.SUCCESS { - var y lp.CCMetric - var err error - switch ecc_pend { - case nvml.FEATURE_DISABLED: - y, err = lp.New("nv_ecc_mode", tags, m.meta, map[string]interface{}{"value": string("OFF")}, time.Now()) - case nvml.FEATURE_ENABLED: - y, err = lp.New("nv_ecc_mode", tags, m.meta, map[string]interface{}{"value": string("ON")}, time.Now()) - default: - y, err = lp.New("nv_ecc_mode", tags, m.meta, map[string]interface{}{"value": string("UNKNOWN")}, time.Now()) - } - _, skip = stringArrayContains(m.config.ExcludeMetrics, "nv_ecc_mode") - if err == nil && !skip { - output <- y - } - } else if ret == nvml.ERROR_NOT_SUPPORTED { - _, skip = stringArrayContains(m.config.ExcludeMetrics, "nv_ecc_mode") - y, err := lp.New("nv_ecc_mode", tags, m.meta, map[string]interface{}{"value": string("N/A")}, time.Now()) - if err == nil && !skip { - output <- y + if !exclude("nv_ecc_mode") { + _, ecc_pend, ret := nvml.DeviceGetEccMode(device.device) + if ret == nvml.SUCCESS { + var y lp.CCMetric + var err error + switch ecc_pend { + case nvml.FEATURE_DISABLED: + y, err = lp.New("nv_ecc_mode", device.tags, m.meta, map[string]interface{}{"value": string("OFF")}, time.Now()) + case nvml.FEATURE_ENABLED: + y, err = lp.New("nv_ecc_mode", device.tags, m.meta, 
map[string]interface{}{"value": string("ON")}, time.Now()) + default: + y, err = lp.New("nv_ecc_mode", device.tags, m.meta, map[string]interface{}{"value": string("UNKNOWN")}, time.Now()) + } + if err == nil { + output <- y + } + } else if ret == nvml.ERROR_NOT_SUPPORTED { + y, err := lp.New("nv_ecc_mode", device.tags, m.meta, map[string]interface{}{"value": string("N/A")}, time.Now()) + if err == nil { + output <- y + } } } - pstate, ret := nvml.DeviceGetPerformanceState(device) - if ret == nvml.SUCCESS { - _, skip = stringArrayContains(m.config.ExcludeMetrics, "nv_perf_state") - y, err := lp.New("nv_perf_state", tags, m.meta, map[string]interface{}{"value": fmt.Sprintf("P%d", int(pstate))}, time.Now()) - if err == nil && !skip { - output <- y + if !exclude("nv_perf_state") { + pstate, ret := nvml.DeviceGetPerformanceState(device.device) + if ret == nvml.SUCCESS { + y, err := lp.New("nv_perf_state", device.tags, m.meta, map[string]interface{}{"value": fmt.Sprintf("P%d", int(pstate))}, time.Now()) + if err == nil { + output <- y + } } } - power, ret := nvml.DeviceGetPowerUsage(device) - if ret == nvml.SUCCESS { - _, skip = stringArrayContains(m.config.ExcludeMetrics, "nv_power_usage_report") - y, err := lp.New("nv_power_usage_report", tags, m.meta, map[string]interface{}{"value": float64(power) / 1000}, time.Now()) - if err == nil && !skip { - output <- y + if !exclude("nv_power_usage_report") { + power, ret := nvml.DeviceGetPowerUsage(device.device) + if ret == nvml.SUCCESS { + y, err := lp.New("nv_power_usage_report", device.tags, m.meta, map[string]interface{}{"value": float64(power) / 1000}, time.Now()) + if err == nil { + output <- y + } } } - gclk, ret := nvml.DeviceGetClockInfo(device, nvml.CLOCK_GRAPHICS) - if ret == nvml.SUCCESS { - _, skip = stringArrayContains(m.config.ExcludeMetrics, "nv_graphics_clock_report") - y, err := lp.New("nv_graphics_clock_report", tags, m.meta, map[string]interface{}{"value": float64(gclk)}, time.Now()) - if err == nil && 
!skip { - output <- y + if !exclude("nv_graphics_clock_report") { + gclk, ret := nvml.DeviceGetClockInfo(device.device, nvml.CLOCK_GRAPHICS) + if ret == nvml.SUCCESS { + y, err := lp.New("nv_graphics_clock_report", device.tags, m.meta, map[string]interface{}{"value": float64(gclk)}, time.Now()) + if err == nil { + output <- y + } } } - smclk, ret := nvml.DeviceGetClockInfo(device, nvml.CLOCK_SM) - if ret == nvml.SUCCESS { - _, skip = stringArrayContains(m.config.ExcludeMetrics, "nv_sm_clock_report") - y, err := lp.New("nv_sm_clock_report", tags, m.meta, map[string]interface{}{"value": float64(smclk)}, time.Now()) - if err == nil && !skip { - output <- y + if !exclude("nv_sm_clock_report") { + smclk, ret := nvml.DeviceGetClockInfo(device.device, nvml.CLOCK_SM) + if ret == nvml.SUCCESS { + y, err := lp.New("nv_sm_clock_report", device.tags, m.meta, map[string]interface{}{"value": float64(smclk)}, time.Now()) + if err == nil { + output <- y + } } } - memclk, ret := nvml.DeviceGetClockInfo(device, nvml.CLOCK_MEM) - if ret == nvml.SUCCESS { - _, skip = stringArrayContains(m.config.ExcludeMetrics, "nv_mem_clock_report") - y, err := lp.New("nv_mem_clock_report", tags, m.meta, map[string]interface{}{"value": float64(memclk)}, time.Now()) - if err == nil && !skip { - output <- y + if !exclude("nv_mem_clock_report") { + memclk, ret := nvml.DeviceGetClockInfo(device.device, nvml.CLOCK_MEM) + if ret == nvml.SUCCESS { + y, err := lp.New("nv_mem_clock_report", device.tags, m.meta, map[string]interface{}{"value": float64(memclk)}, time.Now()) + if err == nil { + output <- y + } } } - max_gclk, ret := nvml.DeviceGetMaxClockInfo(device, nvml.CLOCK_GRAPHICS) - if ret == nvml.SUCCESS { - _, skip = stringArrayContains(m.config.ExcludeMetrics, "nv_max_graphics_clock") - y, err := lp.New("nv_max_graphics_clock", tags, m.meta, map[string]interface{}{"value": float64(max_gclk)}, time.Now()) - if err == nil && !skip { - output <- y + if !exclude("nv_max_graphics_clock") { + max_gclk, ret 
:= nvml.DeviceGetMaxClockInfo(device.device, nvml.CLOCK_GRAPHICS) + if ret == nvml.SUCCESS { + y, err := lp.New("nv_max_graphics_clock", device.tags, m.meta, map[string]interface{}{"value": float64(max_gclk)}, time.Now()) + if err == nil { + output <- y + } } } - max_smclk, ret := nvml.DeviceGetClockInfo(device, nvml.CLOCK_SM) - if ret == nvml.SUCCESS { - _, skip = stringArrayContains(m.config.ExcludeMetrics, "nv_max_sm_clock") - y, err := lp.New("nv_max_sm_clock", tags, m.meta, map[string]interface{}{"value": float64(max_smclk)}, time.Now()) - if err == nil && !skip { - output <- y + if !exclude("nv_max_sm_clock") { + max_smclk, ret := nvml.DeviceGetClockInfo(device.device, nvml.CLOCK_SM) + if ret == nvml.SUCCESS { + y, err := lp.New("nv_max_sm_clock", device.tags, m.meta, map[string]interface{}{"value": float64(max_smclk)}, time.Now()) + if err == nil { + output <- y + } } } - max_memclk, ret := nvml.DeviceGetClockInfo(device, nvml.CLOCK_MEM) - if ret == nvml.SUCCESS { - _, skip = stringArrayContains(m.config.ExcludeMetrics, "nv_max_mem_clock") - y, err := lp.New("nv_max_mem_clock", tags, m.meta, map[string]interface{}{"value": float64(max_memclk)}, time.Now()) - if err == nil && !skip { - output <- y + if !exclude("nv_max_mem_clock") { + max_memclk, ret := nvml.DeviceGetClockInfo(device.device, nvml.CLOCK_MEM) + if ret == nvml.SUCCESS { + y, err := lp.New("nv_max_mem_clock", device.tags, m.meta, map[string]interface{}{"value": float64(max_memclk)}, time.Now()) + if err == nil { + output <- y + } } } - ecc_db, ret := nvml.DeviceGetTotalEccErrors(device, 1, 1) - if ret == nvml.SUCCESS { - _, skip = stringArrayContains(m.config.ExcludeMetrics, "nv_ecc_db_error") - y, err := lp.New("nv_ecc_db_error", tags, m.meta, map[string]interface{}{"value": float64(ecc_db)}, time.Now()) - if err == nil && !skip { - output <- y + if !exclude("nv_ecc_db_error") { + ecc_db, ret := nvml.DeviceGetTotalEccErrors(device.device, 1, 1) + if ret == nvml.SUCCESS { + y, err := 
lp.New("nv_ecc_db_error", device.tags, m.meta, map[string]interface{}{"value": float64(ecc_db)}, time.Now()) + if err == nil { + output <- y + } } } - ecc_sb, ret := nvml.DeviceGetTotalEccErrors(device, 0, 1) - if ret == nvml.SUCCESS { - _, skip = stringArrayContains(m.config.ExcludeMetrics, "nv_ecc_sb_error") - y, err := lp.New("nv_ecc_sb_error", tags, m.meta, map[string]interface{}{"value": float64(ecc_sb)}, time.Now()) - if err == nil && !skip { - output <- y + if !exclude("nv_ecc_sb_error") { + ecc_sb, ret := nvml.DeviceGetTotalEccErrors(device.device, 0, 1) + if ret == nvml.SUCCESS { + y, err := lp.New("nv_ecc_sb_error", device.tags, m.meta, map[string]interface{}{"value": float64(ecc_sb)}, time.Now()) + if err == nil { + output <- y + } } } - pwr_limit, ret := nvml.DeviceGetPowerManagementLimit(device) - if ret == nvml.SUCCESS { - _, skip = stringArrayContains(m.config.ExcludeMetrics, "nv_power_man_limit") - y, err := lp.New("nv_power_man_limit", tags, m.meta, map[string]interface{}{"value": float64(pwr_limit)}, time.Now()) - if err == nil && !skip { - output <- y + if !exclude("nv_power_man_limit") { + pwr_limit, ret := nvml.DeviceGetPowerManagementLimit(device.device) + if ret == nvml.SUCCESS { + y, err := lp.New("nv_power_man_limit", device.tags, m.meta, map[string]interface{}{"value": float64(pwr_limit)}, time.Now()) + if err == nil { + output <- y + } } } - enc_util, _, ret := nvml.DeviceGetEncoderUtilization(device) - if ret == nvml.SUCCESS { - _, skip = stringArrayContains(m.config.ExcludeMetrics, "nv_encoder_util") - y, err := lp.New("nv_encoder_util", tags, m.meta, map[string]interface{}{"value": float64(enc_util)}, time.Now()) - if err == nil && !skip { - output <- y + if !exclude("nv_encoder_util") { + enc_util, _, ret := nvml.DeviceGetEncoderUtilization(device.device) + if ret == nvml.SUCCESS { + y, err := lp.New("nv_encoder_util", device.tags, m.meta, map[string]interface{}{"value": float64(enc_util)}, time.Now()) + if err == nil { + output <- y 
+ } } } - dec_util, _, ret := nvml.DeviceGetDecoderUtilization(device) - if ret == nvml.SUCCESS { - _, skip = stringArrayContains(m.config.ExcludeMetrics, "nv_decoder_util") - y, err := lp.New("nv_decoder_util", tags, m.meta, map[string]interface{}{"value": float64(dec_util)}, time.Now()) - if err == nil && !skip { - output <- y + if !exclude("nv_decoder_util") { + dec_util, _, ret := nvml.DeviceGetDecoderUtilization(device.device) + if ret == nvml.SUCCESS { + y, err := lp.New("nv_decoder_util", device.tags, m.meta, map[string]interface{}{"value": float64(dec_util)}, time.Now()) + if err == nil { + output <- y + } } } } From dc1e4f28a05b9ee8c439cf967597e8a1bb0b47e9 Mon Sep 17 00:00:00 2001 From: Thomas Roehl Date: Fri, 11 Feb 2022 16:10:59 +0100 Subject: [PATCH 134/174] Check for Ganglia group explicitly in tags and meta info --- sinks/gangliaSink.go | 23 ++++++++++++++--------- 1 file changed, 14 insertions(+), 9 deletions(-) diff --git a/sinks/gangliaSink.go b/sinks/gangliaSink.go index f7ba30e..714c5a4 100644 --- a/sinks/gangliaSink.go +++ b/sinks/gangliaSink.go @@ -21,6 +21,7 @@ type GangliaSinkConfig struct { GmetricPath string `json:"gmetric_path,omitempty"` GmetricConfig string `json:"gmetric_config,omitempty"` AddGangliaGroup bool `json:"add_ganglia_group,omitempty"` + AddTagsAsDesc bool `json:"add_tags_as_desc,omitempty"` } type GangliaSink struct { @@ -33,6 +34,8 @@ type GangliaSink struct { func (s *GangliaSink) Init(config json.RawMessage) error { var err error = nil s.name = "GangliaSink" + s.config.AddTagsAsDesc = false + s.config.AddGangliaGroup = false if len(config) > 0 { err := json.Unmarshal(config, &s.config) if err != nil { @@ -67,16 +70,22 @@ func (s *GangliaSink) Write(point lp.CCMetric) error { var err error = nil var tagsstr []string var argstr []string + if s.config.AddGangliaGroup { + if point.HasTag("group") { + g, _ := point.GetTag("group") + argstr = append(argstr, fmt.Sprintf("--group=%s", g)) + } else if point.HasMeta("group") { + g, 
_ := point.GetMeta("group") + argstr = append(argstr, fmt.Sprintf("--group=%s", g)) + } + } + for key, value := range point.Tags() { switch key { case "cluster": argstr = append(argstr, fmt.Sprintf("--cluster=%s", value)) case "unit": argstr = append(argstr, fmt.Sprintf("--units=%s", value)) - case "group": - if s.config.AddGangliaGroup { - argstr = append(argstr, fmt.Sprintf("--group=%s", value)) - } default: tagsstr = append(tagsstr, fmt.Sprintf("%s=%s", key, value)) } @@ -88,16 +97,12 @@ func (s *GangliaSink) Write(point lp.CCMetric) error { argstr = append(argstr, fmt.Sprintf("--cluster=%s", value)) case "unit": argstr = append(argstr, fmt.Sprintf("--units=%s", value)) - case "group": - if s.config.AddGangliaGroup { - argstr = append(argstr, fmt.Sprintf("--group=%s", value)) - } default: tagsstr = append(tagsstr, fmt.Sprintf("%s=%s", key, value)) } } } - if len(tagsstr) > 0 { + if s.config.AddTagsAsDesc && len(tagsstr) > 0 { argstr = append(argstr, fmt.Sprintf("--desc=%q", strings.Join(tagsstr, ","))) } if len(s.gmetric_config) > 0 { From e4285f02c58896ace6b94c2c8ebd73291a691c33 Mon Sep 17 00:00:00 2001 From: Holger Obermaier <40787752+ho-ob@users.noreply.github.com> Date: Fri, 11 Feb 2022 15:34:12 +0100 Subject: [PATCH 135/174] Avoid one append --- sinks/gangliaSink.go | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/sinks/gangliaSink.go b/sinks/gangliaSink.go index 714c5a4..c53b11a 100644 --- a/sinks/gangliaSink.go +++ b/sinks/gangliaSink.go @@ -113,20 +113,20 @@ func (s *GangliaSink) Write(point lp.CCMetric) error { if k == "value" { switch value := v.(type) { case float64: - argstr = append(argstr, fmt.Sprintf("--value=%v", value)) - argstr = append(argstr, "--type=double") + argstr = append(argstr, + fmt.Sprintf("--value=%v", value), "--type=double") case float32: - argstr = append(argstr, fmt.Sprintf("--value=%v", value)) - argstr = append(argstr, "--type=float") + argstr = append(argstr, + fmt.Sprintf("--value=%v", 
value), "--type=float") case int: - argstr = append(argstr, fmt.Sprintf("--value=%d", value)) - argstr = append(argstr, "--type=int32") + argstr = append(argstr, + fmt.Sprintf("--value=%d", value), "--type=int32") case int64: - argstr = append(argstr, fmt.Sprintf("--value=%d", value)) - argstr = append(argstr, "--type=int32") + argstr = append(argstr, + fmt.Sprintf("--value=%d", value), "--type=int32") case string: - argstr = append(argstr, fmt.Sprintf("--value=%q", value)) - argstr = append(argstr, "--type=string") + argstr = append(argstr, + fmt.Sprintf("--value=%q", value), "--type=string") } } } From cfc5279958ffff8bed0ed7940aab6452db8e5a38 Mon Sep 17 00:00:00 2001 From: Holger Obermaier <40787752+ho-ob@users.noreply.github.com> Date: Fri, 11 Feb 2022 17:17:10 +0100 Subject: [PATCH 136/174] Move sensor detection to Init() --- collectors/tempMetric.go | 110 +++++++++++++++++---------------------- 1 file changed, 47 insertions(+), 63 deletions(-) diff --git a/collectors/tempMetric.go b/collectors/tempMetric.go index caa726e..e87603d 100644 --- a/collectors/tempMetric.go +++ b/collectors/tempMetric.go @@ -4,12 +4,12 @@ import ( "encoding/json" "fmt" "io/ioutil" - cclog "github.com/ClusterCockpit/cc-metric-collector/internal/ccLogger" - "os" "path/filepath" "strconv" "strings" "time" + + cclog "github.com/ClusterCockpit/cc-metric-collector/internal/ccLogger" lp "github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric" ) @@ -22,7 +22,8 @@ type TempCollectorConfig struct { type TempCollector struct { metricCollector - config TempCollectorConfig + config TempCollectorConfig + sensors map[string]string } func (m *TempCollector) Init(config json.RawMessage) error { @@ -35,79 +36,62 @@ func (m *TempCollector) Init(config json.RawMessage) error { return err } } + + // Find all temperature sensor files + m.sensors = make(map[string]string) + globPattern := filepath.Join(HWMON_PATH, "*", "temp*_input") + inputFiles, err := filepath.Glob(globPattern) + if err != nil 
{ + return fmt.Errorf("Unable to glob files with pattern '%s': %v", globPattern, err) + } + if inputFiles == nil { + return fmt.Errorf("Unable to find any files with pattern '%s'", globPattern) + } + + // Get sensor name for each temperature sensor file + for _, file := range inputFiles { + nameFile := strings.TrimSuffix(file, "_input") + "_label" + name, err := ioutil.ReadFile(nameFile) + if err != nil { + continue + } + metricName := strings.TrimSpace(string(name)) + metricName = strings.Replace(metricName, " ", "_", -1) + if !strings.Contains(metricName, "temp") { + metricName = "temp_" + metricName + } + metricName = strings.ToLower(metricName) + m.sensors[metricName] = file + } + m.init = true return nil } -func get_hwmon_sensors() (map[string]map[string]string, error) { - var folders []string - var sensors map[string]map[string]string - sensors = make(map[string]map[string]string) - err := filepath.Walk(HWMON_PATH, func(p string, info os.FileInfo, err error) error { - if info.IsDir() { - return nil - } - folders = append(folders, p) - return nil - }) - if err != nil { - return sensors, err - } +func (m *TempCollector) Read(interval time.Duration, output chan lp.CCMetric) { - for _, f := range folders { - sensors[f] = make(map[string]string) - myp := fmt.Sprintf("%s/", f) - err := filepath.Walk(myp, func(path string, info os.FileInfo, err error) error { - dir, fname := filepath.Split(path) - if strings.Contains(fname, "temp") && strings.Contains(fname, "_input") { - namefile := fmt.Sprintf("%s/%s", dir, strings.Replace(fname, "_input", "_label", -1)) - name, ierr := ioutil.ReadFile(namefile) - if ierr == nil { - sensors[f][strings.Replace(string(name), "\n", "", -1)] = path - } + for metricName, file := range m.sensors { + tags := map[string]string{"type": "node"} + for key, newtags := range m.config.TagOverride { + if strings.Contains(file, key) { + tags = newtags + break } - return nil - }) + } + buffer, err := ioutil.ReadFile(file) if err != nil { continue 
} - } - return sensors, nil -} - -func (m *TempCollector) Read(interval time.Duration, output chan lp.CCMetric) { - - sensors, err := get_hwmon_sensors() - if err != nil { - return - } - for _, files := range sensors { - for name, file := range files { - tags := map[string]string{"type": "node"} - for key, newtags := range m.config.TagOverride { - if strings.Contains(file, key) { - tags = newtags - break - } - } - mname := strings.Replace(name, " ", "_", -1) - if !strings.Contains(mname, "temp") { - mname = fmt.Sprintf("temp_%s", mname) - } - buffer, err := ioutil.ReadFile(string(file)) - if err != nil { - continue - } - x, err := strconv.ParseInt(strings.Replace(string(buffer), "\n", "", -1), 0, 64) + x, err := strconv.ParseInt(strings.TrimSpace(string(buffer)), 10, 64) + if err == nil { + y, err := lp.New(metricName, tags, m.meta, map[string]interface{}{"value": int(float64(x) / 1000)}, time.Now()) if err == nil { - y, err := lp.New(strings.ToLower(mname), tags, m.meta, map[string]interface{}{"value": int(float64(x) / 1000)}, time.Now()) - if err == nil { - cclog.ComponentDebug(m.name, y) - output <- y - } + cclog.ComponentDebug(m.name, y) + output <- y } } } + } func (m *TempCollector) Close() { From 23d13b2ceb03d9ca9160074cfc35b9f5f58ffe21 Mon Sep 17 00:00:00 2001 From: Thomas Roehl Date: Fri, 11 Feb 2022 18:09:39 +0100 Subject: [PATCH 137/174] Fix group for netstat collector --- collectors/netstatMetric.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/collectors/netstatMetric.go b/collectors/netstatMetric.go index 0ec94a4..eb47a69 100644 --- a/collectors/netstatMetric.go +++ b/collectors/netstatMetric.go @@ -26,7 +26,7 @@ type NetstatCollector struct { func (m *NetstatCollector) Init(config json.RawMessage) error { m.name = "NetstatCollector" m.setup() - m.meta = map[string]string{"source": m.name, "group": "Memory"} + m.meta = map[string]string{"source": m.name, "group": "network"} m.matches = map[int]string{ 1: "net_bytes_in", 9: 
"net_bytes_out", From bd246bdacf422c9bc3a66b9bdf51023250df0472 Mon Sep 17 00:00:00 2001 From: Thomas Roehl Date: Fri, 11 Feb 2022 18:18:10 +0100 Subject: [PATCH 138/174] Fix group for netstat collector --- collectors/netstatMetric.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/collectors/netstatMetric.go b/collectors/netstatMetric.go index eb47a69..3ca4cd3 100644 --- a/collectors/netstatMetric.go +++ b/collectors/netstatMetric.go @@ -26,7 +26,7 @@ type NetstatCollector struct { func (m *NetstatCollector) Init(config json.RawMessage) error { m.name = "NetstatCollector" m.setup() - m.meta = map[string]string{"source": m.name, "group": "network"} + m.meta = map[string]string{"source": m.name, "group": "Network"} m.matches = map[int]string{ 1: "net_bytes_in", 9: "net_bytes_out", From 6b12baff6ef9bfcd40c5a753b229436d9cc590b4 Mon Sep 17 00:00:00 2001 From: Holger Obermaier <40787752+ho-ob@users.noreply.github.com> Date: Sat, 12 Feb 2022 10:13:26 +0100 Subject: [PATCH 139/174] Use sensor name and sensor label as metric name --- collectors/tempMetric.go | 33 ++++++++++++++++++++++++++++----- 1 file changed, 28 insertions(+), 5 deletions(-) diff --git a/collectors/tempMetric.go b/collectors/tempMetric.go index e87603d..aea6d62 100644 --- a/collectors/tempMetric.go +++ b/collectors/tempMetric.go @@ -50,12 +50,29 @@ func (m *TempCollector) Init(config json.RawMessage) error { // Get sensor name for each temperature sensor file for _, file := range inputFiles { - nameFile := strings.TrimSuffix(file, "_input") + "_label" - name, err := ioutil.ReadFile(nameFile) - if err != nil { - continue + nameFile := filepath.Join(filepath.Dir(file), "name") + name := "" + n, err := ioutil.ReadFile(nameFile) + if err == nil { + name = strings.TrimSpace(string(n)) + } + labelFile := strings.TrimSuffix(file, "_input") + "_label" + label := "" + l, err := ioutil.ReadFile(labelFile) + if err == nil { + label = strings.TrimSpace(string(l)) + } + metricName := "" + switch { + 
case len(name) == 0 && len(label) == 0: + continue + case len(name) != 0 && len(label) != 0: + metricName = name + "_" + label + case len(name) != 0: + metricName = name + case len(label) != 0: + metricName = label } - metricName := strings.TrimSpace(string(name)) metricName = strings.Replace(metricName, " ", "_", -1) if !strings.Contains(metricName, "temp") { metricName = "temp_" + metricName @@ -64,6 +81,12 @@ func (m *TempCollector) Init(config json.RawMessage) error { m.sensors[metricName] = file } + // Empty sensors map + if len(m.sensors) == 0 { + return fmt.Errorf("No temperature sensors found") + } + + // Finished initialization m.init = true return nil } From 09b1ea130e57f28782821e3c24d912b17ab2c8cf Mon Sep 17 00:00:00 2001 From: Holger Obermaier <40787752+ho-ob@users.noreply.github.com> Date: Mon, 14 Feb 2022 10:46:05 +0100 Subject: [PATCH 140/174] Add error handling. Cleanup. --- collectors/tempMetric.go | 135 +++++++++++++++++++++++++++------------ 1 file changed, 93 insertions(+), 42 deletions(-) diff --git a/collectors/tempMetric.go b/collectors/tempMetric.go index aea6d62..bd26584 100644 --- a/collectors/tempMetric.go +++ b/collectors/tempMetric.go @@ -13,23 +13,38 @@ import ( lp "github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric" ) -const HWMON_PATH = `/sys/class/hwmon` +// See: https://www.kernel.org/doc/html/latest/hwmon/sysfs-interface.html +// /sys/class/hwmon/hwmon*/name -> coretemp +// /sys/class/hwmon/hwmon*/temp*_label -> Core 0 +// /sys/class/hwmon/hwmon*/temp*_input -> 27800 = 27.8°C +// /sys/class/hwmon/hwmon*/temp*_max -> 86000 = 86.0°C +// /sys/class/hwmon/hwmon*/temp*_crit -> 100000 = 100.0°C -type TempCollectorConfig struct { - ExcludeMetrics []string `json:"exclude_metrics"` - TagOverride map[string]map[string]string `json:"tag_override"` +type TempCollectorSensor struct { + name string + label string + metricName string // Default: name_label + file string + tags map[string]string } type TempCollector struct { 
metricCollector - config TempCollectorConfig - sensors map[string]string + config struct { + ExcludeMetrics []string `json:"exclude_metrics"` + TagOverride map[string]map[string]string `json:"tag_override"` + } + sensors []*TempCollectorSensor } func (m *TempCollector) Init(config json.RawMessage) error { + // Check if already initialized + if m.init { + return nil + } + m.name = "TempCollector" m.setup() - m.meta = map[string]string{"source": m.name, "group": "IPMI", "unit": "degC"} if len(config) > 0 { err := json.Unmarshal(config, &m.config) if err != nil { @@ -37,9 +52,16 @@ func (m *TempCollector) Init(config json.RawMessage) error { } } + m.meta = map[string]string{ + "source": m.name, + "group": "IPMI", + "unit": "degC", + } + + m.sensors = make([]*TempCollectorSensor, 0) + // Find all temperature sensor files - m.sensors = make(map[string]string) - globPattern := filepath.Join(HWMON_PATH, "*", "temp*_input") + globPattern := filepath.Join("/sys/class/hwmon", "*", "temp*_input") inputFiles, err := filepath.Glob(globPattern) if err != nil { return fmt.Errorf("Unable to glob files with pattern '%s': %v", globPattern, err) @@ -50,35 +72,57 @@ func (m *TempCollector) Init(config json.RawMessage) error { // Get sensor name for each temperature sensor file for _, file := range inputFiles { + sensor := new(TempCollectorSensor) + + // sensor name nameFile := filepath.Join(filepath.Dir(file), "name") - name := "" - n, err := ioutil.ReadFile(nameFile) + name, err := ioutil.ReadFile(nameFile) if err == nil { - name = strings.TrimSpace(string(n)) + sensor.name = strings.TrimSpace(string(name)) } + + // sensor label labelFile := strings.TrimSuffix(file, "_input") + "_label" - label := "" - l, err := ioutil.ReadFile(labelFile) + label, err := ioutil.ReadFile(labelFile) if err == nil { - label = strings.TrimSpace(string(l)) + sensor.label = strings.TrimSpace(string(label)) } - metricName := "" + + // sensor metric name switch { - case len(name) == 0 && len(label) == 0: + 
case len(sensor.name) == 0 && len(sensor.label) == 0: continue - case len(name) != 0 && len(label) != 0: - metricName = name + "_" + label - case len(name) != 0: - metricName = name - case len(label) != 0: - metricName = label + case len(sensor.name) != 0 && len(sensor.label) != 0: + sensor.metricName = sensor.name + "_" + sensor.label + case len(sensor.name) != 0: + sensor.metricName = sensor.name + case len(sensor.label) != 0: + sensor.metricName = sensor.label } - metricName = strings.Replace(metricName, " ", "_", -1) - if !strings.Contains(metricName, "temp") { - metricName = "temp_" + metricName + sensor.metricName = strings.Replace(sensor.metricName, " ", "_", -1) + // Add temperature prefix, if required + if !strings.Contains(sensor.metricName, "temp") { + sensor.metricName = "temp_" + sensor.metricName } - metricName = strings.ToLower(metricName) - m.sensors[metricName] = file + sensor.metricName = strings.ToLower(sensor.metricName) + + // Sensor file + sensor.file = file + + // Sensor tags + sensor.tags = map[string]string{ + "type": "node", + } + + // Apply tag override configuration + for key, newtags := range m.config.TagOverride { + if strings.Contains(sensor.file, key) { + sensor.tags = newtags + break + } + } + + m.sensors = append(m.sensors, sensor) } // Empty sensors map @@ -93,25 +137,32 @@ func (m *TempCollector) Init(config json.RawMessage) error { func (m *TempCollector) Read(interval time.Duration, output chan lp.CCMetric) { - for metricName, file := range m.sensors { - tags := map[string]string{"type": "node"} - for key, newtags := range m.config.TagOverride { - if strings.Contains(file, key) { - tags = newtags - break - } - } - buffer, err := ioutil.ReadFile(file) + for _, sensor := range m.sensors { + // Read sensor file + buffer, err := ioutil.ReadFile(sensor.file) if err != nil { + cclog.ComponentError( + m.name, + fmt.Sprintf("Read(): Failed to read file '%s': %v", sensor.file, err)) continue } x, err := 
strconv.ParseInt(strings.TrimSpace(string(buffer)), 10, 64) + if err != nil { + cclog.ComponentError( + m.name, + fmt.Sprintf("Read(): Failed to convert temperature '%s' to int64: %v", buffer, err)) + continue + } + x /= 1000 + y, err := lp.New( + sensor.metricName, + sensor.tags, + m.meta, + map[string]interface{}{"value": x}, + time.Now(), + ) if err == nil { - y, err := lp.New(metricName, tags, m.meta, map[string]interface{}{"value": int(float64(x) / 1000)}, time.Now()) - if err == nil { - cclog.ComponentDebug(m.name, y) - output <- y - } + output <- y } } From 342f09fabf2351d845627d069808a9e99b41f22c Mon Sep 17 00:00:00 2001 From: Holger Obermaier <40787752+ho-ob@users.noreply.github.com> Date: Mon, 14 Feb 2022 11:19:19 +0100 Subject: [PATCH 141/174] Cleanup --- collectors/cpufreqCpuinfoMetric.go | 47 +++++++++++++++++------------- 1 file changed, 26 insertions(+), 21 deletions(-) diff --git a/collectors/cpufreqCpuinfoMetric.go b/collectors/cpufreqCpuinfoMetric.go index 5d3d4b5..f527859 100644 --- a/collectors/cpufreqCpuinfoMetric.go +++ b/collectors/cpufreqCpuinfoMetric.go @@ -36,7 +36,7 @@ type CPUFreqCpuInfoCollectorTopology struct { type CPUFreqCpuInfoCollector struct { metricCollector - topology []CPUFreqCpuInfoCollectorTopology + topology []*CPUFreqCpuInfoCollectorTopology } func (m *CPUFreqCpuInfoCollector) Init(config json.RawMessage) error { @@ -45,6 +45,8 @@ func (m *CPUFreqCpuInfoCollector) Init(config json.RawMessage) error { return nil } + m.setup() + m.name = "CPUFreqCpuInfoCollector" m.meta = map[string]string{ "source": m.name, @@ -66,8 +68,10 @@ func (m *CPUFreqCpuInfoCollector) Init(config json.RawMessage) error { coreID := "" physicalPackageID := "" var maxPhysicalPackageID int64 = 0 - m.topology = make([]CPUFreqCpuInfoCollectorTopology, 0) + m.topology = make([]*CPUFreqCpuInfoCollectorTopology, 0) coreSeenBefore := make(map[string]bool) + + // Read cpuinfo file, line by line scanner := bufio.NewScanner(file) for scanner.Scan() { lineSplit := 
strings.Split(scanner.Text(), ":") @@ -93,39 +97,41 @@ func (m *CPUFreqCpuInfoCollector) Init(config json.RawMessage) error { len(coreID) > 0 && len(physicalPackageID) > 0 { - coreID_int, err := strconv.ParseInt(coreID, 10, 64) + topology := new(CPUFreqCpuInfoCollectorTopology) + + // Processor + topology.processor = processor + + // Core ID + topology.coreID = coreID + topology.coreID_int, err = strconv.ParseInt(coreID, 10, 64) if err != nil { return fmt.Errorf("Unable to convert coreID '%s' to int64: %v", coreID, err) } - physicalPackageID_int, err := strconv.ParseInt(physicalPackageID, 10, 64) + + // Physical package ID + topology.physicalPackageID = physicalPackageID + topology.physicalPackageID_int, err = strconv.ParseInt(physicalPackageID, 10, 64) if err != nil { return fmt.Errorf("Unable to convert physicalPackageID '%s' to int64: %v", physicalPackageID, err) } // increase maximun socket / package ID, when required - if physicalPackageID_int > maxPhysicalPackageID { - maxPhysicalPackageID = physicalPackageID_int + if topology.physicalPackageID_int > maxPhysicalPackageID { + maxPhysicalPackageID = topology.physicalPackageID_int } + // is hyperthread? 
globalID := physicalPackageID + ":" + coreID - isHT := coreSeenBefore[globalID] + topology.isHT = coreSeenBefore[globalID] coreSeenBefore[globalID] = true - if !isHT { + if !topology.isHT { // increase number on non hyper thread cores numNonHT_int++ } // store collected topology information - m.topology = append( - m.topology, - CPUFreqCpuInfoCollectorTopology{ - processor: processor, - coreID: coreID, - coreID_int: coreID_int, - physicalPackageID: physicalPackageID, - physicalPackageID_int: physicalPackageID_int, - isHT: isHT, - }) + m.topology = append(m.topology, topology) // reset topology information foundFreq = false @@ -138,8 +144,7 @@ func (m *CPUFreqCpuInfoCollector) Init(config json.RawMessage) error { numPhysicalPackageID_int := maxPhysicalPackageID + 1 numPhysicalPackageID := fmt.Sprint(numPhysicalPackageID_int) numNonHT := fmt.Sprint(numNonHT_int) - for i := range m.topology { - t := &m.topology[i] + for _, t := range m.topology { t.numPhysicalPackages = numPhysicalPackageID t.numPhysicalPackages_int = numPhysicalPackageID_int t.numNonHT = numNonHT @@ -183,7 +188,7 @@ func (m *CPUFreqCpuInfoCollector) Read(interval time.Duration, output chan lp.CC // frequency if key == "cpu MHz" { - t := &m.topology[processorCounter] + t := m.topology[processorCounter] if !t.isHT { value, err := strconv.ParseFloat(strings.TrimSpace(lineSplit[1]), 64) if err != nil { From 56363a66361523039832980cc18cc0c3e3b8492d Mon Sep 17 00:00:00 2001 From: Thomas Gruber Date: Mon, 14 Feb 2022 15:20:38 +0100 Subject: [PATCH 142/174] Update cc-metric-collector.spec --- scripts/cc-metric-collector.spec | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/scripts/cc-metric-collector.spec b/scripts/cc-metric-collector.spec index d63836c..5944b28 100644 --- a/scripts/cc-metric-collector.spec +++ b/scripts/cc-metric-collector.spec @@ -26,10 +26,14 @@ make %install -install -Dpm 0755 %{name} %{buildroot}%{_sbindir}/%{name} +install -Dpm 0750 %{name} 
%{buildroot}%{_sbindir}/%{name} install -Dpm 0600 config.json %{buildroot}%{_sysconfdir}/%{name}/%{name}.json -install -Dpm 644 scripts/%{name}.service %{buildroot}%{_unitdir}/%{name}.service -install -Dpm 600 scripts/%{name}.config %{buildroot}%{_sysconfdir}/default/%{name} +install -Dpm 0600 collectors.json %{buildroot}%{_sysconfdir}/%{name}/collectors.json +install -Dpm 0600 sinks.json %{buildroot}%{_sysconfdir}/%{name}/sinks.json +install -Dpm 0600 receivers.json %{buildroot}%{_sysconfdir}/%{name}/receivers.json +install -Dpm 0600 router.json %{buildroot}%{_sysconfdir}/%{name}/router.json +install -Dpm 0644 scripts/%{name}.service %{buildroot}%{_unitdir}/%{name}.service +install -Dpm 0600 scripts/%{name}.config %{buildroot}%{_sysconfdir}/default/%{name} %check @@ -46,9 +50,15 @@ install -Dpm 600 scripts/%{name}.config %{buildroot}%{_sysconfdir}/default/%{nam %{_sbindir}/%{name} %{_unitdir}/%{name}.service %{_sysconfdir}/default/%{name} -%config(noreplace) %{_sysconfdir}/%{name}/%{name}.json - +%attr(0600,root,root) %config(noreplace) %{_sysconfdir}/%{name}/%{name}.json +%attr(0600,root,root) %config(noreplace) %{_sysconfdir}/%{name}/collectors.json +%attr(0600,root,root) %config(noreplace) %{_sysconfdir}/%{name}/sinks.json +%attr(0600,root,root) %config(noreplace) %{_sysconfdir}/%{name}/receivers.json +%attr(0600,root,root) %config(noreplace) %{_sysconfdir}/%{name}/router.json %changelog +* Mon Feb 14 2022 Thomas Gruber - 0.2 +- Add component specific configuration files +- Add %attr to config files * Mon Nov 22 2021 Thomas Gruber - 0.1 - Initial spec file From 95a58c29e26ead22d1fb12a1f0422ff7538a008b Mon Sep 17 00:00:00 2001 From: Thomas Gruber Date: Mon, 14 Feb 2022 15:21:01 +0100 Subject: [PATCH 143/174] Set version in RPM spec file --- scripts/cc-metric-collector.spec | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/cc-metric-collector.spec b/scripts/cc-metric-collector.spec index 5944b28..65a9b55 100644 --- 
a/scripts/cc-metric-collector.spec +++ b/scripts/cc-metric-collector.spec @@ -1,5 +1,5 @@ Name: cc-metric-collector -Version: 0.1 +Version: 0.2 Release: 1%{?dist} Summary: Metric collection daemon from the ClusterCockpit suite From 247fb23de18d58b54f88246fbea4e019c99ae193 Mon Sep 17 00:00:00 2001 From: Thomas Roehl Date: Mon, 14 Feb 2022 18:12:50 +0100 Subject: [PATCH 144/174] Try to operate on multiple metrics if channels if filled --- internal/metricRouter/metricRouter.go | 73 +++++++++++++++++++-------- sinks/sinkManager.go | 31 ++++++++---- 2 files changed, 73 insertions(+), 31 deletions(-) diff --git a/internal/metricRouter/metricRouter.go b/internal/metricRouter/metricRouter.go index 5b254f8..c5ff0bd 100644 --- a/internal/metricRouter/metricRouter.go +++ b/internal/metricRouter/metricRouter.go @@ -14,6 +14,8 @@ import ( mct "github.com/ClusterCockpit/cc-metric-collector/internal/multiChanTicker" ) +const ROUTER_MAX_FORWARD = 50 + // Metric router tag configuration type metricRouterTagConfig struct { Key string `json:"key"` // Tag name @@ -49,6 +51,7 @@ type metricRouter struct { config metricRouterConfig // json encoded config for metric router cache MetricCache // pointer to MetricCache cachewg sync.WaitGroup // wait group for MetricCache + maxForward int // number of metrics to forward maximally in one iteration } // MetricRouter access functions @@ -73,6 +76,7 @@ func (r *metricRouter) Init(ticker mct.MultiChanTicker, wg *sync.WaitGroup, rout r.cache_input = make(chan lp.CCMetric) r.wg = wg r.ticker = ticker + r.maxForward = ROUTER_MAX_FORWARD // Set hostname hostname, err := os.Hostname() @@ -242,6 +246,43 @@ func (r *metricRouter) Start() { } } + // Foward message received from collector channel + coll_forward := func(p lp.CCMetric) { + // receive from metric collector + p.AddTag("hostname", r.hostname) + if r.config.IntervalStamp { + p.SetTime(r.timestamp) + } + if !r.dropMetric(p) { + forward(p) + } + // even if the metric is dropped, it is stored in 
the cache for + // aggregations + if r.config.NumCacheIntervals > 0 { + r.cache.Add(p) + } + } + + // Foward message received from receivers channel + recv_forward := func(p lp.CCMetric) { + // receive from receive manager + if r.config.IntervalStamp { + p.SetTime(r.timestamp) + } + if !r.dropMetric(p) { + forward(p) + } + } + + // Foward message received from cache channel + cache_forward := func(p lp.CCMetric) { + // receive from metric collector + if !r.dropMetric(p) { + p.AddTag("hostname", r.hostname) + forward(p) + } + } + // Start Metric Cache if r.config.NumCacheIntervals > 0 { r.cache.Start() @@ -250,6 +291,7 @@ func (r *metricRouter) Start() { r.wg.Add(1) go func() { defer r.wg.Done() + for { select { case <-r.done: @@ -257,34 +299,21 @@ func (r *metricRouter) Start() { return case p := <-r.coll_input: - // receive from metric collector - p.AddTag("hostname", r.hostname) - if r.config.IntervalStamp { - p.SetTime(r.timestamp) - } - if !r.dropMetric(p) { - forward(p) - } - // even if the metric is dropped, it is stored in the cache for - // aggregations - if r.config.NumCacheIntervals > 0 { - r.cache.Add(p) + coll_forward(p) + for i := 0; len(r.coll_input) > 0 && i < r.maxForward; i++ { + coll_forward(<-r.coll_input) } case p := <-r.recv_input: - // receive from receive manager - if r.config.IntervalStamp { - p.SetTime(r.timestamp) - } - if !r.dropMetric(p) { - forward(p) + recv_forward(p) + for i := 0; len(r.recv_input) > 0 && i < r.maxForward; i++ { + recv_forward(<-r.recv_input) } case p := <-r.cache_input: - // receive from metric collector - if !r.dropMetric(p) { - p.AddTag("hostname", r.hostname) - forward(p) + cache_forward(p) + for i := 0; len(r.cache_input) > 0 && i < r.maxForward; i++ { + cache_forward(<-r.cache_input) } } } diff --git a/sinks/sinkManager.go b/sinks/sinkManager.go index ff6e01d..e2d01a7 100644 --- a/sinks/sinkManager.go +++ b/sinks/sinkManager.go @@ -10,6 +10,8 @@ import ( lp 
"github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric" ) +const SINK_MAX_FORWARD = 50 + // Map of all available sinks var AvailableSinks = map[string]Sink{ "influxdb": new(InfluxSink), @@ -22,10 +24,11 @@ var AvailableSinks = map[string]Sink{ // Metric collector manager data structure type sinkManager struct { - input chan lp.CCMetric // input channel - done chan bool // channel to finish / stop metric sink manager - wg *sync.WaitGroup // wait group for all goroutines in cc-metric-collector - sinks map[string]Sink // Mapping sink name to sink + input chan lp.CCMetric // input channel + done chan bool // channel to finish / stop metric sink manager + wg *sync.WaitGroup // wait group for all goroutines in cc-metric-collector + sinks map[string]Sink // Mapping sink name to sink + maxForward int // number of metrics to write maximally in one iteration } // Sink manager access functions @@ -45,6 +48,7 @@ func (sm *sinkManager) Init(wg *sync.WaitGroup, sinkConfigFile string) error { sm.done = make(chan bool) sm.wg = wg sm.sinks = make(map[string]Sink, 0) + sm.maxForward = SINK_MAX_FORWARD if len(sinkConfigFile) == 0 { return nil @@ -97,12 +101,8 @@ func (sm *sinkManager) Start() { } for { - select { - case <-sm.done: - done() - return - case p := <-sm.input: + toTheSinks := func(p lp.CCMetric) { // Send received metric to all outputs cclog.ComponentDebug("SinkManager", "WRITE", p) for _, s := range sm.sinks { @@ -111,6 +111,19 @@ func (sm *sinkManager) Start() { } } } + + select { + case <-sm.done: + done() + return + + case p := <-sm.input: + toTheSinks(p) + for i := 0; len(sm.input) > 0 && i < sm.maxForward; i++ { + p := <-sm.input + toTheSinks(p) + } + } } }() From a3ad9d0cb0d58343a7d8da998b46964d37ae4ae7 Mon Sep 17 00:00:00 2001 From: Holger Obermaier <40787752+ho-ob@users.noreply.github.com> Date: Mon, 14 Feb 2022 18:53:37 +0100 Subject: [PATCH 145/174] Move toTheSinks out of for loop --- sinks/sinkManager.go | 17 ++++++++--------- 1 file changed, 8 
insertions(+), 9 deletions(-) diff --git a/sinks/sinkManager.go b/sinks/sinkManager.go index e2d01a7..bd243f4 100644 --- a/sinks/sinkManager.go +++ b/sinks/sinkManager.go @@ -100,18 +100,17 @@ func (sm *sinkManager) Start() { cclog.ComponentDebug("SinkManager", "DONE") } - for { - - toTheSinks := func(p lp.CCMetric) { - // Send received metric to all outputs - cclog.ComponentDebug("SinkManager", "WRITE", p) - for _, s := range sm.sinks { - if err := s.Write(p); err != nil { - cclog.ComponentError("SinkManager", "WRITE", s.Name(), "write failed:", err.Error()) - } + toTheSinks := func(p lp.CCMetric) { + // Send received metric to all outputs + cclog.ComponentDebug("SinkManager", "WRITE", p) + for _, s := range sm.sinks { + if err := s.Write(p); err != nil { + cclog.ComponentError("SinkManager", "WRITE", s.Name(), "write failed:", err.Error()) } } + } + for { select { case <-sm.done: done() From 5060497abd4ac05cefea43286bb75fd47b93d3ab Mon Sep 17 00:00:00 2001 From: Holger Obermaier <40787752+ho-ob@users.noreply.github.com> Date: Mon, 14 Feb 2022 22:14:06 +0100 Subject: [PATCH 146/174] Cleanup --- collectors.json | 1 + collectors/numastatsMetric.go | 16 ++++++++++++---- 2 files changed, 13 insertions(+), 4 deletions(-) diff --git a/collectors.json b/collectors.json index 8a06608..09731ab 100644 --- a/collectors.json +++ b/collectors.json @@ -1,4 +1,5 @@ { + "numastats": {}, "cpufreq": {}, "cpufreq_cpuinfo": {}, "gpfs": { diff --git a/collectors/numastatsMetric.go b/collectors/numastatsMetric.go index 707768d..52a2638 100644 --- a/collectors/numastatsMetric.go +++ b/collectors/numastatsMetric.go @@ -61,8 +61,8 @@ func (m *NUMAStatsCollector) Init(config json.RawMessage) error { } // Loop for all NUMA node directories - baseDir := "/sys/devices/system/node" - globPattern := filepath.Join(baseDir, "node[0-9]*") + base := "/sys/devices/system/node/node" + globPattern := base + "[0-9]*" dirs, err := filepath.Glob(globPattern) if err != nil { return fmt.Errorf("unable to 
glob files with pattern '%s'", globPattern) @@ -72,7 +72,7 @@ func (m *NUMAStatsCollector) Init(config json.RawMessage) error { } m.topology = make([]NUMAStatsCollectorTopolgy, 0, len(dirs)) for _, dir := range dirs { - node := strings.TrimPrefix(dir, "/sys/devices/system/node/node") + node := strings.TrimPrefix(dir, base) file := filepath.Join(dir, "numastat") m.topology = append(m.topology, NUMAStatsCollectorTopolgy{ @@ -103,6 +103,8 @@ func (m *NUMAStatsCollector) Read(interval time.Duration, output chan lp.CCMetri return } scanner := bufio.NewScanner(file) + + // Read line by line for scanner.Scan() { split := strings.Fields(scanner.Text()) if len(split) != 2 { @@ -116,7 +118,13 @@ func (m *NUMAStatsCollector) Read(interval time.Duration, output chan lp.CCMetri fmt.Sprintf("Read(): Failed to convert %s='%s' to int64: %v", key, split[1], err)) continue } - y, err := lp.New("numastats_"+key, t.tagSet, m.meta, map[string]interface{}{"value": value}, now) + y, err := lp.New( + "numastats_"+key, + t.tagSet, + m.meta, + map[string]interface{}{"value": value}, + now, + ) if err == nil { output <- y } From fcfb58c31c7c82038a36379a3e9f422f63953fc8 Mon Sep 17 00:00:00 2001 From: Holger Obermaier <40787752+ho-ob@users.noreply.github.com> Date: Tue, 15 Feb 2022 09:23:57 +0100 Subject: [PATCH 147/174] Use slice element of m.gpus without slice index --- collectors/nvidiaMetric.go | 44 +++++++++++++++++++++++++++++--------- 1 file changed, 34 insertions(+), 10 deletions(-) diff --git a/collectors/nvidiaMetric.go b/collectors/nvidiaMetric.go index 374f7ee..87c1d62 100644 --- a/collectors/nvidiaMetric.go +++ b/collectors/nvidiaMetric.go @@ -43,60 +43,84 @@ func (m *NvidiaCollector) Init(config json.RawMessage) error { m.name = "NvidiaCollector" m.config.AddPciInfoTag = false m.setup() - m.meta = map[string]string{"source": m.name, "group": "Nvidia"} if len(config) > 0 { err = json.Unmarshal(config, &m.config) if err != nil { return err } } + m.meta = map[string]string{ + 
"source": m.name, + "group": "Nvidia", + } + m.num_gpus = 0 defer m.CatchPanic() + + // Initialize NVIDIA Management Library (NVML) ret := nvml.Init() if ret != nvml.SUCCESS { err = errors.New(nvml.ErrorString(ret)) cclog.ComponentError(m.name, "Unable to initialize NVML", err.Error()) return err } + + // Number of NVIDIA GPUs num_gpus, ret := nvml.DeviceGetCount() if ret != nvml.SUCCESS { err = errors.New(nvml.ErrorString(ret)) cclog.ComponentError(m.name, "Unable to get device count", err.Error()) return err } + + // For all GPUs m.gpus = make([]NvidiaCollectorDevice, num_gpus) - idx := 0 - for i := 0; i < num_gpus && idx < num_gpus; i++ { + for i := 0; i < num_gpus; i++ { + g := &m.gpus[i] + + // Skip excluded devices str_i := fmt.Sprintf("%d", i) if _, skip := stringArrayContains(m.config.ExcludeDevices, str_i); skip { continue } + // Get device handle device, ret := nvml.DeviceGetHandleByIndex(i) if ret != nvml.SUCCESS { err = errors.New(nvml.ErrorString(ret)) cclog.ComponentError(m.name, "Unable to get device at index", i, ":", err.Error()) return err } - g := m.gpus[idx] g.device = device - g.tags = map[string]string{"type": "accelerator", "type-id": str_i} + + // Add tags + g.tags = map[string]string{ + "type": "accelerator", + "type-id": str_i, + } + + // Add excluded metrics g.excludeMetrics = map[string]bool{} for _, e := range m.config.ExcludeMetrics { g.excludeMetrics[e] = true } + + // Add PCI info as tag if m.config.AddPciInfoTag { - pciinfo, ret := nvml.DeviceGetPciInfo(g.device) + pciInfo, ret := nvml.DeviceGetPciInfo(g.device) if ret != nvml.SUCCESS { err = errors.New(nvml.ErrorString(ret)) - cclog.ComponentError(m.name, "Unable to get pciInfo for device at index", i, ":", err.Error()) + cclog.ComponentError(m.name, "Unable to get PCI info for device at index", i, ":", err.Error()) return err } - g.tags["pci_identifier"] = fmt.Sprintf("%08X:%02X:%02X.0", pciinfo.Domain, pciinfo.Bus, pciinfo.Device) + g.tags["pci_identifier"] = fmt.Sprintf( + 
"%08X:%02X:%02X.0", + pciInfo.Domain, + pciInfo.Bus, + pciInfo.Device) } - m.gpus[idx] = g - idx++ } + m.init = true return nil } From 14c9d6f7927ca008547cd292c896261c509627e1 Mon Sep 17 00:00:00 2001 From: Holger Obermaier <40787752+ho-ob@users.noreply.github.com> Date: Tue, 15 Feb 2022 09:47:24 +0100 Subject: [PATCH 148/174] Fixed: All nvidia metrics were excluded --- collectors/nvidiaMetric.go | 58 ++++++++++++++++---------------------- 1 file changed, 24 insertions(+), 34 deletions(-) diff --git a/collectors/nvidiaMetric.go b/collectors/nvidiaMetric.go index 87c1d62..27b921a 100644 --- a/collectors/nvidiaMetric.go +++ b/collectors/nvidiaMetric.go @@ -130,27 +130,19 @@ func (m *NvidiaCollector) Read(interval time.Duration, output chan lp.CCMetric) return } - for _, device := range m.gpus { + for i := range m.gpus { + device := &m.gpus[i] - exclude := func(metric string) bool { - if _, ok := device.excludeMetrics[metric]; !ok { - return true - } - return false - } - - ex_nv_util := exclude("nv_util") - ex_nv_mem_util := exclude("nv_mem_util") - if (!ex_nv_util) || (!ex_nv_mem_util) { + if !device.excludeMetrics["nv_util"] || !device.excludeMetrics["nv_mem_util"] { util, ret := nvml.DeviceGetUtilizationRates(device.device) if ret == nvml.SUCCESS { - if !ex_nv_util { + if !device.excludeMetrics["nv_util"] { y, err := lp.New("nv_util", device.tags, m.meta, map[string]interface{}{"value": float64(util.Gpu)}, time.Now()) if err == nil { output <- y } } - if !ex_nv_mem_util { + if !device.excludeMetrics["nv_mem_util"] { y, err := lp.New("nv_mem_util", device.tags, m.meta, map[string]interface{}{"value": float64(util.Memory)}, time.Now()) if err == nil { output <- y @@ -159,12 +151,10 @@ func (m *NvidiaCollector) Read(interval time.Duration, output chan lp.CCMetric) } } - ex_nv_mem_total := exclude("nv_mem_total") - ex_nv_fb_memory := exclude("nv_fb_memory") - if (!ex_nv_mem_total) || (!ex_nv_fb_memory) { + if !device.excludeMetrics["nv_mem_total"] || 
!device.excludeMetrics["nv_fb_memory"] { meminfo, ret := nvml.DeviceGetMemoryInfo(device.device) if ret == nvml.SUCCESS { - if !ex_nv_mem_total { + if !device.excludeMetrics["nv_mem_total"] { t := float64(meminfo.Total) / (1024 * 1024) y, err := lp.New("nv_mem_total", device.tags, m.meta, map[string]interface{}{"value": t}, time.Now()) if err == nil { @@ -173,7 +163,7 @@ func (m *NvidiaCollector) Read(interval time.Duration, output chan lp.CCMetric) } } - if !ex_nv_fb_memory { + if !device.excludeMetrics["nv_fb_memory"] { f := float64(meminfo.Used) / (1024 * 1024) y, err := lp.New("nv_fb_memory", device.tags, m.meta, map[string]interface{}{"value": f}, time.Now()) if err == nil { @@ -184,7 +174,7 @@ func (m *NvidiaCollector) Read(interval time.Duration, output chan lp.CCMetric) } } - if !exclude("nv_temp") { + if !device.excludeMetrics["nv_temp"] { temp, ret := nvml.DeviceGetTemperature(device.device, nvml.TEMPERATURE_GPU) if ret == nvml.SUCCESS { y, err := lp.New("nv_temp", device.tags, m.meta, map[string]interface{}{"value": float64(temp)}, time.Now()) @@ -195,7 +185,7 @@ func (m *NvidiaCollector) Read(interval time.Duration, output chan lp.CCMetric) } } - if !exclude("nv_fan") { + if !device.excludeMetrics["nv_fan"] { fan, ret := nvml.DeviceGetFanSpeed(device.device) if ret == nvml.SUCCESS { y, err := lp.New("nv_fan", device.tags, m.meta, map[string]interface{}{"value": float64(fan)}, time.Now()) @@ -205,7 +195,7 @@ func (m *NvidiaCollector) Read(interval time.Duration, output chan lp.CCMetric) } } - if !exclude("nv_ecc_mode") { + if !device.excludeMetrics["nv_ecc_mode"] { _, ecc_pend, ret := nvml.DeviceGetEccMode(device.device) if ret == nvml.SUCCESS { var y lp.CCMetric @@ -229,7 +219,7 @@ func (m *NvidiaCollector) Read(interval time.Duration, output chan lp.CCMetric) } } - if !exclude("nv_perf_state") { + if !device.excludeMetrics["nv_perf_state"] { pstate, ret := nvml.DeviceGetPerformanceState(device.device) if ret == nvml.SUCCESS { y, err := 
lp.New("nv_perf_state", device.tags, m.meta, map[string]interface{}{"value": fmt.Sprintf("P%d", int(pstate))}, time.Now()) @@ -239,7 +229,7 @@ func (m *NvidiaCollector) Read(interval time.Duration, output chan lp.CCMetric) } } - if !exclude("nv_power_usage_report") { + if !device.excludeMetrics["nv_power_usage_report"] { power, ret := nvml.DeviceGetPowerUsage(device.device) if ret == nvml.SUCCESS { y, err := lp.New("nv_power_usage_report", device.tags, m.meta, map[string]interface{}{"value": float64(power) / 1000}, time.Now()) @@ -249,7 +239,7 @@ func (m *NvidiaCollector) Read(interval time.Duration, output chan lp.CCMetric) } } - if !exclude("nv_graphics_clock_report") { + if !device.excludeMetrics["nv_graphics_clock_report"] { gclk, ret := nvml.DeviceGetClockInfo(device.device, nvml.CLOCK_GRAPHICS) if ret == nvml.SUCCESS { y, err := lp.New("nv_graphics_clock_report", device.tags, m.meta, map[string]interface{}{"value": float64(gclk)}, time.Now()) @@ -259,7 +249,7 @@ func (m *NvidiaCollector) Read(interval time.Duration, output chan lp.CCMetric) } } - if !exclude("nv_sm_clock_report") { + if !device.excludeMetrics["nv_sm_clock_report"] { smclk, ret := nvml.DeviceGetClockInfo(device.device, nvml.CLOCK_SM) if ret == nvml.SUCCESS { y, err := lp.New("nv_sm_clock_report", device.tags, m.meta, map[string]interface{}{"value": float64(smclk)}, time.Now()) @@ -269,7 +259,7 @@ func (m *NvidiaCollector) Read(interval time.Duration, output chan lp.CCMetric) } } - if !exclude("nv_mem_clock_report") { + if !device.excludeMetrics["nv_mem_clock_report"] { memclk, ret := nvml.DeviceGetClockInfo(device.device, nvml.CLOCK_MEM) if ret == nvml.SUCCESS { y, err := lp.New("nv_mem_clock_report", device.tags, m.meta, map[string]interface{}{"value": float64(memclk)}, time.Now()) @@ -279,7 +269,7 @@ func (m *NvidiaCollector) Read(interval time.Duration, output chan lp.CCMetric) } } - if !exclude("nv_max_graphics_clock") { + if !device.excludeMetrics["nv_max_graphics_clock"] { max_gclk, ret 
:= nvml.DeviceGetMaxClockInfo(device.device, nvml.CLOCK_GRAPHICS) if ret == nvml.SUCCESS { y, err := lp.New("nv_max_graphics_clock", device.tags, m.meta, map[string]interface{}{"value": float64(max_gclk)}, time.Now()) @@ -289,7 +279,7 @@ func (m *NvidiaCollector) Read(interval time.Duration, output chan lp.CCMetric) } } - if !exclude("nv_max_sm_clock") { + if !device.excludeMetrics["nv_max_sm_clock"] { max_smclk, ret := nvml.DeviceGetClockInfo(device.device, nvml.CLOCK_SM) if ret == nvml.SUCCESS { y, err := lp.New("nv_max_sm_clock", device.tags, m.meta, map[string]interface{}{"value": float64(max_smclk)}, time.Now()) @@ -299,7 +289,7 @@ func (m *NvidiaCollector) Read(interval time.Duration, output chan lp.CCMetric) } } - if !exclude("nv_max_mem_clock") { + if !device.excludeMetrics["nv_max_mem_clock"] { max_memclk, ret := nvml.DeviceGetClockInfo(device.device, nvml.CLOCK_MEM) if ret == nvml.SUCCESS { y, err := lp.New("nv_max_mem_clock", device.tags, m.meta, map[string]interface{}{"value": float64(max_memclk)}, time.Now()) @@ -309,7 +299,7 @@ func (m *NvidiaCollector) Read(interval time.Duration, output chan lp.CCMetric) } } - if !exclude("nv_ecc_db_error") { + if !device.excludeMetrics["nv_ecc_db_error"] { ecc_db, ret := nvml.DeviceGetTotalEccErrors(device.device, 1, 1) if ret == nvml.SUCCESS { y, err := lp.New("nv_ecc_db_error", device.tags, m.meta, map[string]interface{}{"value": float64(ecc_db)}, time.Now()) @@ -319,7 +309,7 @@ func (m *NvidiaCollector) Read(interval time.Duration, output chan lp.CCMetric) } } - if !exclude("nv_ecc_sb_error") { + if !device.excludeMetrics["nv_ecc_sb_error"] { ecc_sb, ret := nvml.DeviceGetTotalEccErrors(device.device, 0, 1) if ret == nvml.SUCCESS { y, err := lp.New("nv_ecc_sb_error", device.tags, m.meta, map[string]interface{}{"value": float64(ecc_sb)}, time.Now()) @@ -329,7 +319,7 @@ func (m *NvidiaCollector) Read(interval time.Duration, output chan lp.CCMetric) } } - if !exclude("nv_power_man_limit") { + if 
!device.excludeMetrics["nv_power_man_limit"] { pwr_limit, ret := nvml.DeviceGetPowerManagementLimit(device.device) if ret == nvml.SUCCESS { y, err := lp.New("nv_power_man_limit", device.tags, m.meta, map[string]interface{}{"value": float64(pwr_limit)}, time.Now()) @@ -339,7 +329,7 @@ func (m *NvidiaCollector) Read(interval time.Duration, output chan lp.CCMetric) } } - if !exclude("nv_encoder_util") { + if !device.excludeMetrics["nv_encoder_util"] { enc_util, _, ret := nvml.DeviceGetEncoderUtilization(device.device) if ret == nvml.SUCCESS { y, err := lp.New("nv_encoder_util", device.tags, m.meta, map[string]interface{}{"value": float64(enc_util)}, time.Now()) @@ -349,7 +339,7 @@ func (m *NvidiaCollector) Read(interval time.Duration, output chan lp.CCMetric) } } - if !exclude("nv_decoder_util") { + if !device.excludeMetrics["nv_decoder_util"] { dec_util, _, ret := nvml.DeviceGetDecoderUtilization(device.device) if ret == nvml.SUCCESS { y, err := lp.New("nv_decoder_util", device.tags, m.meta, map[string]interface{}{"value": float64(dec_util)}, time.Now()) From 01faa3b5317814dab94432fd9dd5d0647ba6d481 Mon Sep 17 00:00:00 2001 From: Holger Obermaier <40787752+ho-ob@users.noreply.github.com> Date: Tue, 15 Feb 2022 10:57:32 +0100 Subject: [PATCH 149/174] Add comments and units to all nvidia metrics --- collectors.json | 31 ++++---- collectors/nvidiaMetric.go | 150 ++++++++++++++++++++++++++++++++----- 2 files changed, 149 insertions(+), 32 deletions(-) diff --git a/collectors.json b/collectors.json index 09731ab..cbfc23d 100644 --- a/collectors.json +++ b/collectors.json @@ -1,23 +1,28 @@ { - "numastats": {}, "cpufreq": {}, "cpufreq_cpuinfo": {}, "gpfs": { - "exclude_filesystem": [ "test_fs" ] + "exclude_filesystem": [ + "test_fs" + ] }, "loadavg": { - "exclude_metrics": [ "proc_total" ] + "exclude_metrics": [ + "proc_total" + ] }, + "numastats": {}, + "nvidia": {}, "tempstat": { - "tag_override": { - "hwmon0" : { - "type" : "socket", - "type-id" : "0" - }, - "hwmon1" : { 
- "type" : "socket", - "type-id" : "1" + "tag_override": { + "hwmon0": { + "type": "socket", + "type-id": "0" + }, + "hwmon1": { + "type": "socket", + "type-id": "1" + } } - } } -} +} \ No newline at end of file diff --git a/collectors/nvidiaMetric.go b/collectors/nvidiaMetric.go index 27b921a..24f0855 100644 --- a/collectors/nvidiaMetric.go +++ b/collectors/nvidiaMetric.go @@ -134,17 +134,29 @@ func (m *NvidiaCollector) Read(interval time.Duration, output chan lp.CCMetric) device := &m.gpus[i] if !device.excludeMetrics["nv_util"] || !device.excludeMetrics["nv_mem_util"] { + // Retrieves the current utilization rates for the device's major subsystems. + // + // Available utilization rates + // * Gpu: Percent of time over the past sample period during which one or more kernels was executing on the GPU. + // * Memory: Percent of time over the past sample period during which global (device) memory was being read or written + // + // Note: + // * During driver initialization when ECC is enabled one can see high GPU and Memory Utilization readings. + // This is caused by ECC Memory Scrubbing mechanism that is performed during driver initialization. + // * On MIG-enabled GPUs, querying device utilization rates is not currently supported. 
util, ret := nvml.DeviceGetUtilizationRates(device.device) if ret == nvml.SUCCESS { if !device.excludeMetrics["nv_util"] { y, err := lp.New("nv_util", device.tags, m.meta, map[string]interface{}{"value": float64(util.Gpu)}, time.Now()) if err == nil { + y.AddMeta("unit", "%") output <- y } } if !device.excludeMetrics["nv_mem_util"] { y, err := lp.New("nv_mem_util", device.tags, m.meta, map[string]interface{}{"value": float64(util.Memory)}, time.Now()) if err == nil { + y.AddMeta("unit", "%") output <- y } } @@ -152,6 +164,20 @@ func (m *NvidiaCollector) Read(interval time.Duration, output chan lp.CCMetric) } if !device.excludeMetrics["nv_mem_total"] || !device.excludeMetrics["nv_fb_memory"] { + // Retrieves the amount of used, free and total memory available on the device, in bytes. + // + // Enabling ECC reduces the amount of total available memory, due to the extra required parity bits. + // + // The reported amount of used memory is equal to the sum of memory allocated by all active channels on the device. + // + // Available memory info: + // * Free: Unallocated FB memory (in bytes). + // * Total: Total installed FB memory (in bytes). + // * Used: Allocated FB memory (in bytes). Note that the driver/GPU always sets aside a small amount of memory for bookkeeping. + // + // Note: + // In MIG mode, if device handle is provided, the API returns aggregate information, only if the caller has appropriate privileges. + // Per-instance information can be queried by using specific MIG device handles. meminfo, ret := nvml.DeviceGetMemoryInfo(device.device) if ret == nvml.SUCCESS { if !device.excludeMetrics["nv_mem_total"] { @@ -175,6 +201,11 @@ func (m *NvidiaCollector) Read(interval time.Duration, output chan lp.CCMetric) } if !device.excludeMetrics["nv_temp"] { + // Retrieves the current temperature readings for the device, in degrees C. + // + // Available temperature sensors: + // * TEMPERATURE_GPU: Temperature sensor for the GPU die. 
+ // * NVML_TEMPERATURE_COUNT temp, ret := nvml.DeviceGetTemperature(device.device, nvml.TEMPERATURE_GPU) if ret == nvml.SUCCESS { y, err := lp.New("nv_temp", device.tags, m.meta, map[string]interface{}{"value": float64(temp)}, time.Now()) @@ -186,33 +217,50 @@ func (m *NvidiaCollector) Read(interval time.Duration, output chan lp.CCMetric) } if !device.excludeMetrics["nv_fan"] { + // Retrieves the intended operating speed of the device's fan. + // + // Note: The reported speed is the intended fan speed. + // If the fan is physically blocked and unable to spin, the output will not match the actual fan speed. + // + // For all discrete products with dedicated fans. + // + // The fan speed is expressed as a percentage of the product's maximum noise tolerance fan speed. + // This value may exceed 100% in certain cases. fan, ret := nvml.DeviceGetFanSpeed(device.device) if ret == nvml.SUCCESS { y, err := lp.New("nv_fan", device.tags, m.meta, map[string]interface{}{"value": float64(fan)}, time.Now()) if err == nil { + y.AddMeta("unit", "%") output <- y } } } if !device.excludeMetrics["nv_ecc_mode"] { + // Retrieves the current and pending ECC modes for the device. + // + // For Fermi or newer fully supported devices. Only applicable to devices with ECC. + // Requires NVML_INFOROM_ECC version 1.0 or higher. + // + // Changing ECC modes requires a reboot. + // The "pending" ECC mode refers to the target mode following the next reboot. 
_, ecc_pend, ret := nvml.DeviceGetEccMode(device.device) if ret == nvml.SUCCESS { var y lp.CCMetric var err error switch ecc_pend { case nvml.FEATURE_DISABLED: - y, err = lp.New("nv_ecc_mode", device.tags, m.meta, map[string]interface{}{"value": string("OFF")}, time.Now()) + y, err = lp.New("nv_ecc_mode", device.tags, m.meta, map[string]interface{}{"value": "OFF"}, time.Now()) case nvml.FEATURE_ENABLED: - y, err = lp.New("nv_ecc_mode", device.tags, m.meta, map[string]interface{}{"value": string("ON")}, time.Now()) + y, err = lp.New("nv_ecc_mode", device.tags, m.meta, map[string]interface{}{"value": "ON"}, time.Now()) default: - y, err = lp.New("nv_ecc_mode", device.tags, m.meta, map[string]interface{}{"value": string("UNKNOWN")}, time.Now()) + y, err = lp.New("nv_ecc_mode", device.tags, m.meta, map[string]interface{}{"value": "UNKNOWN"}, time.Now()) } if err == nil { output <- y } } else if ret == nvml.ERROR_NOT_SUPPORTED { - y, err := lp.New("nv_ecc_mode", device.tags, m.meta, map[string]interface{}{"value": string("N/A")}, time.Now()) + y, err := lp.New("nv_ecc_mode", device.tags, m.meta, map[string]interface{}{"value": "N/A"}, time.Now()) if err == nil { output <- y } @@ -220,9 +268,16 @@ func (m *NvidiaCollector) Read(interval time.Duration, output chan lp.CCMetric) } if !device.excludeMetrics["nv_perf_state"] { - pstate, ret := nvml.DeviceGetPerformanceState(device.device) + // Retrieves the current performance state for the device. + // + // Allowed PStates: + // 0: Maximum Performance. + // .. + // 15: Minimum Performance. + // 32: Unknown performance state. 
+ pState, ret := nvml.DeviceGetPerformanceState(device.device) if ret == nvml.SUCCESS { - y, err := lp.New("nv_perf_state", device.tags, m.meta, map[string]interface{}{"value": fmt.Sprintf("P%d", int(pstate))}, time.Now()) + y, err := lp.New("nv_perf_state", device.tags, m.meta, map[string]interface{}{"value": fmt.Sprintf("P%d", int(pState))}, time.Now()) if err == nil { output <- y } @@ -230,77 +285,115 @@ func (m *NvidiaCollector) Read(interval time.Duration, output chan lp.CCMetric) } if !device.excludeMetrics["nv_power_usage_report"] { + // Retrieves power usage for this GPU in milliwatts and its associated circuitry (e.g. memory) + // + // On Fermi and Kepler GPUs the reading is accurate to within +/- 5% of current power draw. + // + // It is only available if power management mode is supported power, ret := nvml.DeviceGetPowerUsage(device.device) if ret == nvml.SUCCESS { y, err := lp.New("nv_power_usage_report", device.tags, m.meta, map[string]interface{}{"value": float64(power) / 1000}, time.Now()) if err == nil { + y.AddMeta("unit", "watts") output <- y } } } + // Retrieves the current clock speeds for the device. + // + // Available clock information: + // * CLOCK_GRAPHICS: Graphics clock domain. + // * CLOCK_SM: Streaming Multiprocessor clock domain. + // * CLOCK_MEM: Memory clock domain. 
if !device.excludeMetrics["nv_graphics_clock_report"] { - gclk, ret := nvml.DeviceGetClockInfo(device.device, nvml.CLOCK_GRAPHICS) + graphicsClock, ret := nvml.DeviceGetClockInfo(device.device, nvml.CLOCK_GRAPHICS) if ret == nvml.SUCCESS { - y, err := lp.New("nv_graphics_clock_report", device.tags, m.meta, map[string]interface{}{"value": float64(gclk)}, time.Now()) + y, err := lp.New("nv_graphics_clock_report", device.tags, m.meta, map[string]interface{}{"value": float64(graphicsClock)}, time.Now()) if err == nil { + y.AddMeta("unit", "MHz") output <- y } } } if !device.excludeMetrics["nv_sm_clock_report"] { - smclk, ret := nvml.DeviceGetClockInfo(device.device, nvml.CLOCK_SM) + smCock, ret := nvml.DeviceGetClockInfo(device.device, nvml.CLOCK_SM) if ret == nvml.SUCCESS { - y, err := lp.New("nv_sm_clock_report", device.tags, m.meta, map[string]interface{}{"value": float64(smclk)}, time.Now()) + y, err := lp.New("nv_sm_clock_report", device.tags, m.meta, map[string]interface{}{"value": float64(smCock)}, time.Now()) if err == nil { + y.AddMeta("unit", "MHz") output <- y } } } if !device.excludeMetrics["nv_mem_clock_report"] { - memclk, ret := nvml.DeviceGetClockInfo(device.device, nvml.CLOCK_MEM) + memClock, ret := nvml.DeviceGetClockInfo(device.device, nvml.CLOCK_MEM) if ret == nvml.SUCCESS { - y, err := lp.New("nv_mem_clock_report", device.tags, m.meta, map[string]interface{}{"value": float64(memclk)}, time.Now()) + y, err := lp.New("nv_mem_clock_report", device.tags, m.meta, map[string]interface{}{"value": float64(memClock)}, time.Now()) if err == nil { + y.AddMeta("unit", "MHz") output <- y } } } + // Retrieves the maximum clock speeds for the device. + // + // Available clock information: + // * CLOCK_GRAPHICS: Graphics clock domain. + // * CLOCK_SM: Streaming multiprocessor clock domain. + // * CLOCK_MEM: Memory clock domain. + // * CLOCK_VIDEO: Video encoder/decoder clock domain. + // * CLOCK_COUNT: Count of clock types. 
+ // + // Note: + /// On GPUs from Fermi family current P0 clocks (reported by nvmlDeviceGetClockInfo) can differ from max clocks by few MHz. if !device.excludeMetrics["nv_max_graphics_clock"] { max_gclk, ret := nvml.DeviceGetMaxClockInfo(device.device, nvml.CLOCK_GRAPHICS) if ret == nvml.SUCCESS { y, err := lp.New("nv_max_graphics_clock", device.tags, m.meta, map[string]interface{}{"value": float64(max_gclk)}, time.Now()) if err == nil { + y.AddMeta("unit", "MHz") output <- y } } } if !device.excludeMetrics["nv_max_sm_clock"] { - max_smclk, ret := nvml.DeviceGetClockInfo(device.device, nvml.CLOCK_SM) + maxSmClock, ret := nvml.DeviceGetClockInfo(device.device, nvml.CLOCK_SM) if ret == nvml.SUCCESS { - y, err := lp.New("nv_max_sm_clock", device.tags, m.meta, map[string]interface{}{"value": float64(max_smclk)}, time.Now()) + y, err := lp.New("nv_max_sm_clock", device.tags, m.meta, map[string]interface{}{"value": float64(maxSmClock)}, time.Now()) if err == nil { + y.AddMeta("unit", "MHz") output <- y } } } if !device.excludeMetrics["nv_max_mem_clock"] { - max_memclk, ret := nvml.DeviceGetClockInfo(device.device, nvml.CLOCK_MEM) + maxMemClock, ret := nvml.DeviceGetClockInfo(device.device, nvml.CLOCK_MEM) if ret == nvml.SUCCESS { - y, err := lp.New("nv_max_mem_clock", device.tags, m.meta, map[string]interface{}{"value": float64(max_memclk)}, time.Now()) + y, err := lp.New("nv_max_mem_clock", device.tags, m.meta, map[string]interface{}{"value": float64(maxMemClock)}, time.Now()) if err == nil { + y.AddMeta("unit", "MHz") output <- y } } } if !device.excludeMetrics["nv_ecc_db_error"] { - ecc_db, ret := nvml.DeviceGetTotalEccErrors(device.device, 1, 1) + // Retrieves the total ECC error counts for the device. + // + // For Fermi or newer fully supported devices. + // Only applicable to devices with ECC. + // Requires NVML_INFOROM_ECC version 1.0 or higher. + // Requires ECC Mode to be enabled. 
+ // + // The total error count is the sum of errors across each of the separate memory systems, + // i.e. the total set of errors across the entire device. + ecc_db, ret := nvml.DeviceGetTotalEccErrors(device.device, nvml.MEMORY_ERROR_TYPE_UNCORRECTED, nvml.AGGREGATE_ECC) if ret == nvml.SUCCESS { y, err := lp.New("nv_ecc_db_error", device.tags, m.meta, map[string]interface{}{"value": float64(ecc_db)}, time.Now()) if err == nil { @@ -310,7 +403,7 @@ func (m *NvidiaCollector) Read(interval time.Duration, output chan lp.CCMetric) } if !device.excludeMetrics["nv_ecc_sb_error"] { - ecc_sb, ret := nvml.DeviceGetTotalEccErrors(device.device, 0, 1) + ecc_sb, ret := nvml.DeviceGetTotalEccErrors(device.device, nvml.MEMORY_ERROR_TYPE_CORRECTED, nvml.AGGREGATE_ECC) if ret == nvml.SUCCESS { y, err := lp.New("nv_ecc_sb_error", device.tags, m.meta, map[string]interface{}{"value": float64(ecc_sb)}, time.Now()) if err == nil { @@ -320,30 +413,49 @@ func (m *NvidiaCollector) Read(interval time.Duration, output chan lp.CCMetric) } if !device.excludeMetrics["nv_power_man_limit"] { + // Retrieves the power management limit associated with this device. + // + // For Fermi or newer fully supported devices. + // + // The power limit defines the upper boundary for the card's power draw. + // If the card's total power draw reaches this limit the power management algorithm kicks in. pwr_limit, ret := nvml.DeviceGetPowerManagementLimit(device.device) if ret == nvml.SUCCESS { - y, err := lp.New("nv_power_man_limit", device.tags, m.meta, map[string]interface{}{"value": float64(pwr_limit)}, time.Now()) + y, err := lp.New("nv_power_man_limit", device.tags, m.meta, map[string]interface{}{"value": float64(pwr_limit) / 1000}, time.Now()) if err == nil { + y.AddMeta("unit", "watts") output <- y } } } if !device.excludeMetrics["nv_encoder_util"] { + // Retrieves the current utilization and sampling size in microseconds for the Encoder + // + // For Kepler or newer fully supported devices. 
+ // + // Note: On MIG-enabled GPUs, querying encoder utilization is not currently supported. enc_util, _, ret := nvml.DeviceGetEncoderUtilization(device.device) if ret == nvml.SUCCESS { y, err := lp.New("nv_encoder_util", device.tags, m.meta, map[string]interface{}{"value": float64(enc_util)}, time.Now()) if err == nil { + y.AddMeta("unit", "%") output <- y } } } if !device.excludeMetrics["nv_decoder_util"] { + // Retrieves the current utilization and sampling size in microseconds for the Decoder + // + // For Kepler or newer fully supported devices. + // + // Note: On MIG-enabled GPUs, querying decoder utilization is not currently supported. dec_util, _, ret := nvml.DeviceGetDecoderUtilization(device.device) if ret == nvml.SUCCESS { y, err := lp.New("nv_decoder_util", device.tags, m.meta, map[string]interface{}{"value": float64(dec_util)}, time.Now()) if err == nil { + y.AddMeta("unit", "%") output <- y } } From 69b31e87e7da685d1ed0d88aad2b575b8421456d Mon Sep 17 00:00:00 2001 From: Holger Obermaier <40787752+ho-ob@users.noreply.github.com> Date: Tue, 15 Feb 2022 11:27:42 +0100 Subject: [PATCH 150/174] Fix: Start cache manager only when NumCacheIntervals > 0 --- internal/metricRouter/metricRouter.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/internal/metricRouter/metricRouter.go b/internal/metricRouter/metricRouter.go index c5ff0bd..e8c30cc 100644 --- a/internal/metricRouter/metricRouter.go +++ b/internal/metricRouter/metricRouter.go @@ -100,7 +100,7 @@ func (r *metricRouter) Init(ticker mct.MultiChanTicker, wg *sync.WaitGroup, rout cclog.ComponentError("MetricRouter", err.Error()) return err } - if r.config.NumCacheIntervals >= 0 { + if r.config.NumCacheIntervals > 0 { r.cache, err = NewCache(r.cache_input, r.ticker, &r.cachewg, r.config.NumCacheIntervals) if err != nil { cclog.ComponentError("MetricRouter", "MetricCache initialization failed:", err.Error()) From 2031f35d9bb86e7a90315e4b64550b199c4da553 Mon Sep 17 00:00:00 2001 From: 
Holger Obermaier <40787752+ho-ob@users.noreply.github.com> Date: Tue, 15 Feb 2022 11:36:17 +0100 Subject: [PATCH 151/174] Cleanup --- internal/metricRouter/metricRouter.go | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/internal/metricRouter/metricRouter.go b/internal/metricRouter/metricRouter.go index e8c30cc..88f5817 100644 --- a/internal/metricRouter/metricRouter.go +++ b/internal/metricRouter/metricRouter.go @@ -263,7 +263,7 @@ func (r *metricRouter) Start() { } } - // Foward message received from receivers channel + // Forward message received from receivers channel recv_forward := func(p lp.CCMetric) { // receive from receive manager if r.config.IntervalStamp { @@ -274,7 +274,7 @@ func (r *metricRouter) Start() { } } - // Foward message received from cache channel + // Forward message received from cache channel cache_forward := func(p lp.CCMetric) { // receive from metric collector if !r.dropMetric(p) { @@ -342,13 +342,18 @@ func (r *metricRouter) Close() { r.done <- true // wait for close of channel r.done <-r.done + + // stop timer if r.config.IntervalStamp { cclog.ComponentDebug("MetricRouter", "TIMER CLOSE") r.timerdone <- true // wait for close of channel r.timerdone <-r.timerdone } + + // stop metric cache if r.config.NumCacheIntervals > 0 { + cclog.ComponentDebug("MetricRouter", "CACHE CLOSE") r.cache.Close() r.cachewg.Wait() } From a8821b7ac5e9ab1fac82e1fa92cf983ed19b4705 Mon Sep 17 00:00:00 2001 From: Holger Obermaier <40787752+ho-ob@users.noreply.github.com> Date: Tue, 15 Feb 2022 12:39:54 +0100 Subject: [PATCH 152/174] Add comments --- internal/metricRouter/metricRouter.go | 25 +++++++++++++++++-------- 1 file changed, 17 insertions(+), 8 deletions(-) diff --git a/internal/metricRouter/metricRouter.go b/internal/metricRouter/metricRouter.go index 88f5817..90650ea 100644 --- a/internal/metricRouter/metricRouter.go +++ b/internal/metricRouter/metricRouter.go @@ -159,12 +159,13 @@ func getParamMap(point lp.CCMetric) 
map[string]interface{} { // DoAddTags adds a tag when condition is fullfiled func (r *metricRouter) DoAddTags(point lp.CCMetric) { + var conditionMatches bool for _, m := range r.config.AddTags { - var conditionMatches bool = false - if m.Condition == "*" { + // Condition is always matched conditionMatches = true } else { + // Evaluate condition var err error conditionMatches, err = agg.EvalBoolCondition(m.Condition, getParamMap(point)) if err != nil { @@ -180,12 +181,13 @@ func (r *metricRouter) DoAddTags(point lp.CCMetric) { // DoDelTags removes a tag when condition is fullfiled func (r *metricRouter) DoDelTags(point lp.CCMetric) { + var conditionMatches bool for _, m := range r.config.DelTags { - var conditionMatches bool = false - if m.Condition == "*" { + // Condition is always matched conditionMatches = true } else { + // Evaluate condition var err error conditionMatches, err = agg.EvalBoolCondition(m.Condition, getParamMap(point)) if err != nil { @@ -202,16 +204,23 @@ func (r *metricRouter) DoDelTags(point lp.CCMetric) { // Conditional test whether a metric should be dropped func (r *metricRouter) dropMetric(point lp.CCMetric) bool { // Simple drop check - if _, ok := r.config.dropMetrics[point.Name()]; ok { - return true + if conditionMatches, ok := r.config.dropMetrics[point.Name()]; ok { + return conditionMatches } + // Checking the dropping conditions for _, m := range r.config.DropMetricsIf { conditionMatches, err := agg.EvalBoolCondition(m, getParamMap(point)) - if conditionMatches || err != nil { - return true + if err != nil { + cclog.ComponentError("MetricRouter", err.Error()) + conditionMatches = false + } + if conditionMatches { + return conditionMatches } } + + // No dropping condition met return false } From 2f0e229936416350ba17e477f92d5f72ea00181b Mon Sep 17 00:00:00 2001 From: Holger Obermaier <40787752+ho-ob@users.noreply.github.com> Date: Tue, 15 Feb 2022 13:35:24 +0100 Subject: [PATCH 153/174] Simplify EvalBoolCondition --- 
internal/metricAggregator/metricAggregator.go | 28 +++++++------------ 1 file changed, 10 insertions(+), 18 deletions(-) diff --git a/internal/metricAggregator/metricAggregator.go b/internal/metricAggregator/metricAggregator.go index a05f061..f97859a 100644 --- a/internal/metricAggregator/metricAggregator.go +++ b/internal/metricAggregator/metricAggregator.go @@ -283,8 +283,10 @@ func (c *metricAggregator) AddFunction(name string, function func(args ...interf } func EvalBoolCondition(condition string, params map[string]interface{}) (bool, error) { - newcond := strings.ReplaceAll(condition, "'", "\"") - newcond = strings.ReplaceAll(newcond, "%", "\\") + newcond := + strings.ReplaceAll( + strings.ReplaceAll( + condition, "'", "\""), "%", "\\") language := gval.NewLanguage( gval.Full(), metricCacheLanguage, @@ -293,31 +295,21 @@ func EvalBoolCondition(condition string, params map[string]interface{}) (bool, e if err != nil { return false, err } - var endResult bool = false + endResult := false err = nil switch r := value.(type) { case bool: endResult = r case float64: - if r != 0.0 { - endResult = true - } + endResult = r != 0.0 case float32: - if r != 0.0 { - endResult = true - } + endResult = r != 0.0 case int: - if r != 0 { - endResult = true - } + endResult = r != 0 case int64: - if r != 0 { - endResult = true - } + endResult = r != 0 case int32: - if r != 0 { - endResult = true - } + endResult = r != 0 default: err = fmt.Errorf("cannot evaluate '%s' to bool", newcond) } From d6154ff35b59a718f553bf706b981e8780b69189 Mon Sep 17 00:00:00 2001 From: Holger Obermaier <40787752+ho-ob@users.noreply.github.com> Date: Tue, 15 Feb 2022 13:51:46 +0100 Subject: [PATCH 154/174] Simplification --- internal/metricAggregator/metricAggregator.go | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/internal/metricAggregator/metricAggregator.go b/internal/metricAggregator/metricAggregator.go index f97859a..6b9d531 100644 --- 
a/internal/metricAggregator/metricAggregator.go +++ b/internal/metricAggregator/metricAggregator.go @@ -246,15 +246,16 @@ func (c *metricAggregator) AddAggregation(name, function, condition string, tags return nil } } - var agg MetricAggregatorIntervalConfig - agg.Name = name - agg.Condition = newcond - agg.gvalCond = gvalCond - agg.Function = newfunc - agg.gvalFunc = gvalFunc - agg.Tags = tags - agg.Meta = meta - c.functions = append(c.functions, &agg) + agg := &MetricAggregatorIntervalConfig{ + Name: name, + Condition: newcond, + gvalCond: gvalCond, + Function: newfunc, + gvalFunc: gvalFunc, + Tags: tags, + Meta: meta, + } + c.functions = append(c.functions, agg) return nil } From 542520d2c01cd66fc2dd1945f740749e85085243 Mon Sep 17 00:00:00 2001 From: Holger Obermaier <40787752+ho-ob@users.noreply.github.com> Date: Tue, 15 Feb 2022 15:37:25 +0100 Subject: [PATCH 155/174] Refactoring: Use array of pointers --- collectors.json | 1 + collectors/infinibandMetric.go | 9 +++------ 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/collectors.json b/collectors.json index cbfc23d..669355c 100644 --- a/collectors.json +++ b/collectors.json @@ -6,6 +6,7 @@ "test_fs" ] }, + "ibstat": {}, "loadavg": { "exclude_metrics": [ "proc_total" diff --git a/collectors/infinibandMetric.go b/collectors/infinibandMetric.go index e5197de..ac79e0a 100644 --- a/collectors/infinibandMetric.go +++ b/collectors/infinibandMetric.go @@ -31,7 +31,7 @@ type InfinibandCollector struct { config struct { ExcludeDevices []string `json:"exclude_devices,omitempty"` // IB device to exclude e.g. 
mlx5_0 } - info []InfinibandCollectorInfo + info []*InfinibandCollectorInfo } // Init initializes the Infiniband collector by walking through files below IB_BASEPATH @@ -111,7 +111,7 @@ func (m *InfinibandCollector) Init(config json.RawMessage) error { } m.info = append(m.info, - InfinibandCollectorInfo{ + &InfinibandCollectorInfo{ LID: LID, device: device, port: port, @@ -142,10 +142,7 @@ func (m *InfinibandCollector) Read(interval time.Duration, output chan lp.CCMetr } now := time.Now() - for i := range m.info { - - // device info - info := &m.info[i] + for _, info := range m.info { for counterName, counterFile := range info.portCounterFiles { line, err := ioutil.ReadFile(counterFile) if err != nil { From 248c815a1c1b6a985f4bafbc5b8fb8b7a723ff11 Mon Sep 17 00:00:00 2001 From: Holger Obermaier <40787752+ho-ob@users.noreply.github.com> Date: Wed, 16 Feb 2022 08:40:57 +0100 Subject: [PATCH 156/174] Make created language persistant --- internal/metricAggregator/metricAggregator.go | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/internal/metricAggregator/metricAggregator.go b/internal/metricAggregator/metricAggregator.go index 6b9d531..b37b877 100644 --- a/internal/metricAggregator/metricAggregator.go +++ b/internal/metricAggregator/metricAggregator.go @@ -62,6 +62,10 @@ var metricCacheLanguage = gval.NewLanguage( gval.Function("getCpuList", getCpuListOfNode), gval.Function("getCpuListOfType", getCpuListOfType), ) +var language gval.Language = gval.NewLanguage( + gval.Full(), + metricCacheLanguage, +) func (c *metricAggregator) Init(output chan lp.CCMetric) error { c.output = output @@ -288,10 +292,6 @@ func EvalBoolCondition(condition string, params map[string]interface{}) (bool, e strings.ReplaceAll( strings.ReplaceAll( condition, "'", "\""), "%", "\\") - language := gval.NewLanguage( - gval.Full(), - metricCacheLanguage, - ) value, err := gval.Evaluate(newcond, params, language) if err != nil { return false, err @@ -321,10 +321,6 @@ 
func EvalFloat64Condition(condition string, params map[string]interface{}) (floa var endResult float64 = math.NaN() newcond := strings.ReplaceAll(condition, "'", "\"") newcond = strings.ReplaceAll(newcond, "%", "\\") - language := gval.NewLanguage( - gval.Full(), - metricCacheLanguage, - ) value, err := gval.Evaluate(newcond, params, language) if err != nil { cclog.ComponentDebug("MetricRouter", condition, " = ", err.Error()) From b44e2264963b591cdaa483c136fbd1d9a0243642 Mon Sep 17 00:00:00 2001 From: Holger Obermaier <40787752+ho-ob@users.noreply.github.com> Date: Wed, 16 Feb 2022 13:59:32 +0100 Subject: [PATCH 157/174] Add caching for condition evaluation --- internal/metricAggregator/metricAggregator.go | 82 +++++++------------ 1 file changed, 29 insertions(+), 53 deletions(-) diff --git a/internal/metricAggregator/metricAggregator.go b/internal/metricAggregator/metricAggregator.go index b37b877..5a94a83 100644 --- a/internal/metricAggregator/metricAggregator.go +++ b/internal/metricAggregator/metricAggregator.go @@ -66,6 +66,7 @@ var language gval.Language = gval.NewLanguage( gval.Full(), metricCacheLanguage, ) +var evaluables map[string]gval.Evaluable = make(map[string]gval.Evaluable) func (c *metricAggregator) Init(output chan lp.CCMetric) error { c.output = output @@ -288,66 +289,41 @@ func (c *metricAggregator) AddFunction(name string, function func(args ...interf } func EvalBoolCondition(condition string, params map[string]interface{}) (bool, error) { - newcond := - strings.ReplaceAll( + var evaluable gval.Evaluable + var ok bool + if evaluable, ok = evaluables[condition]; !ok { + newcond := strings.ReplaceAll( - condition, "'", "\""), "%", "\\") - value, err := gval.Evaluate(newcond, params, language) - if err != nil { - return false, err + strings.ReplaceAll( + condition, "'", "\""), "%", "\\") + var err error + evaluable, err = language.NewEvaluable(newcond) + if err != nil { + return false, err + } + evaluables[condition] = evaluable } - endResult := 
false - err = nil - switch r := value.(type) { - case bool: - endResult = r - case float64: - endResult = r != 0.0 - case float32: - endResult = r != 0.0 - case int: - endResult = r != 0 - case int64: - endResult = r != 0 - case int32: - endResult = r != 0 - default: - err = fmt.Errorf("cannot evaluate '%s' to bool", newcond) - } - return endResult, err + value, err := evaluable.EvalBool(context.Background(), params) + return value, err } func EvalFloat64Condition(condition string, params map[string]interface{}) (float64, error) { - var endResult float64 = math.NaN() - newcond := strings.ReplaceAll(condition, "'", "\"") - newcond = strings.ReplaceAll(newcond, "%", "\\") - value, err := gval.Evaluate(newcond, params, language) - if err != nil { - cclog.ComponentDebug("MetricRouter", condition, " = ", err.Error()) - return endResult, err - } - err = nil - switch r := value.(type) { - case bool: - if r { - endResult = 1.0 - } else { - endResult = 0.0 + var evaluable gval.Evaluable + var ok bool + if evaluable, ok = evaluables[condition]; !ok { + newcond := + strings.ReplaceAll( + strings.ReplaceAll( + condition, "'", "\""), "%", "\\") + var err error + evaluable, err = language.NewEvaluable(newcond) + if err != nil { + return math.NaN(), err } - case float64: - endResult = r - case float32: - endResult = float64(r) - case int: - endResult = float64(r) - case int64: - endResult = float64(r) - case int32: - endResult = float64(r) - default: - err = fmt.Errorf("cannot evaluate '%s' to float64", newcond) + evaluables[condition] = evaluable } - return endResult, err + value, err := evaluable.EvalFloat64(context.Background(), params) + return value, err } func NewAggregator(output chan lp.CCMetric) (MetricAggregator, error) { From 154b56000e62150b559b25bf43989c09dc6d915b Mon Sep 17 00:00:00 2001 From: Holger Obermaier <40787752+ho-ob@users.noreply.github.com> Date: Wed, 16 Feb 2022 14:30:11 +0100 Subject: [PATCH 158/174] Allow concurrent access to condition map --- 
internal/metricAggregator/metricAggregator.go | 30 +++++++++++++------ 1 file changed, 21 insertions(+), 9 deletions(-) diff --git a/internal/metricAggregator/metricAggregator.go b/internal/metricAggregator/metricAggregator.go index 5a94a83..f5c7ada 100644 --- a/internal/metricAggregator/metricAggregator.go +++ b/internal/metricAggregator/metricAggregator.go @@ -6,6 +6,7 @@ import ( "math" "os" "strings" + "sync" "time" cclog "github.com/ClusterCockpit/cc-metric-collector/internal/ccLogger" @@ -66,7 +67,12 @@ var language gval.Language = gval.NewLanguage( gval.Full(), metricCacheLanguage, ) -var evaluables map[string]gval.Evaluable = make(map[string]gval.Evaluable) +var evaluables = struct { + mapping map[string]gval.Evaluable + mutex sync.Mutex +}{ + mapping: make(map[string]gval.Evaluable), +} func (c *metricAggregator) Init(output chan lp.CCMetric) error { c.output = output @@ -289,9 +295,10 @@ func (c *metricAggregator) AddFunction(name string, function func(args ...interf } func EvalBoolCondition(condition string, params map[string]interface{}) (bool, error) { - var evaluable gval.Evaluable - var ok bool - if evaluable, ok = evaluables[condition]; !ok { + evaluables.mutex.Lock() + evaluable, ok := evaluables.mapping[condition] + evaluables.mutex.Unlock() + if !ok { newcond := strings.ReplaceAll( strings.ReplaceAll( @@ -301,16 +308,19 @@ func EvalBoolCondition(condition string, params map[string]interface{}) (bool, e if err != nil { return false, err } - evaluables[condition] = evaluable + evaluables.mutex.Lock() + evaluables.mapping[condition] = evaluable + evaluables.mutex.Unlock() } value, err := evaluable.EvalBool(context.Background(), params) return value, err } func EvalFloat64Condition(condition string, params map[string]interface{}) (float64, error) { - var evaluable gval.Evaluable - var ok bool - if evaluable, ok = evaluables[condition]; !ok { + evaluables.mutex.Lock() + evaluable, ok := evaluables.mapping[condition] + evaluables.mutex.Unlock() + if 
!ok { newcond := strings.ReplaceAll( strings.ReplaceAll( @@ -320,7 +330,9 @@ func EvalFloat64Condition(condition string, params map[string]interface{}) (floa if err != nil { return math.NaN(), err } - evaluables[condition] = evaluable + evaluables.mutex.Lock() + evaluables.mapping[condition] = evaluable + evaluables.mutex.Unlock() } value, err := evaluable.EvalFloat64(context.Background(), params) return value, err From e28c1fb30bff621c0d8118bbe3f75efa3561b80e Mon Sep 17 00:00:00 2001 From: Thomas Gruber Date: Wed, 16 Feb 2022 18:33:46 +0100 Subject: [PATCH 159/174] Ganglia sink using `libganglia.so` directly (#35) * Add sink directly using libganglia.so * Remove unneeded confuse header * add submodule init to build action * add submodule init to runonce action * add installation og ganglia to runonce * add installation of ganglia to runonce * add installation of ganglia to runonce * libconfuse not required * Remove ganglia submodule * Remove ganglia.h * Add Makefile to help creating the libganglia.so link * Fix cgo header * Rename new Ganglia sink to 'libgangliaSink' * Add documentation for libgangliaSink * Extend make buildsystem with find&symlink helper for libgangliaSink * Add metric renaming function * Add build tag 'ganglia' and create corresponding files --- .github/workflows/rpmbuild.yml | 2 + .github/workflows/runonce.yml | 12 +- .gitmodules | 4 + Makefile | 12 +- sinks/Makefile | 11 ++ sinks/README.md | 3 +- sinks/gangliaSink.go | 2 + sinks/gangliaSink_disabled.go | 31 ++++ sinks/libgangliaSink.go | 295 +++++++++++++++++++++++++++++++ sinks/libgangliaSink.md | 41 +++++ sinks/libgangliaSink_disabled.go | 29 +++ sinks/sinkManager.go | 1 + 12 files changed, 440 insertions(+), 3 deletions(-) create mode 100644 .gitmodules create mode 100644 sinks/Makefile create mode 100644 sinks/gangliaSink_disabled.go create mode 100644 sinks/libgangliaSink.go create mode 100644 sinks/libgangliaSink.md create mode 100644 sinks/libgangliaSink_disabled.go diff --git 
a/.github/workflows/rpmbuild.yml b/.github/workflows/rpmbuild.yml index 8d16e37..d9220a7 100644 --- a/.github/workflows/rpmbuild.yml +++ b/.github/workflows/rpmbuild.yml @@ -9,6 +9,8 @@ jobs: runs-on: ubuntu-latest steps: - uses: actions/checkout@v2 + with: + submodules: recursive - uses: TomTheBear/rpmbuild@master id: rpm name: Build RPM package on CentOS8 diff --git a/.github/workflows/runonce.yml b/.github/workflows/runonce.yml index 2a2cc8a..da5b86c 100644 --- a/.github/workflows/runonce.yml +++ b/.github/workflows/runonce.yml @@ -6,6 +6,8 @@ jobs: runs-on: ubuntu-latest steps: - uses: actions/checkout@v2 + with: + submodules: recursive # See: https://github.com/marketplace/actions/setup-go-environment - name: Setup Golang @@ -13,6 +15,9 @@ jobs: with: go-version: '^1.17.6' + - name: Setup Ganglia + run: sudo apt install ganglia-monitor libganglia1 + - name: Build MetricCollector run: make @@ -22,6 +27,8 @@ jobs: runs-on: ubuntu-latest steps: - uses: actions/checkout@v2 + with: + submodules: recursive # See: https://github.com/marketplace/actions/setup-go-environment - name: Setup Golang @@ -29,8 +36,11 @@ jobs: with: go-version: '^1.16.7' # The version AlmaLinux 8.5 uses + - name: Setup Ganglia + run: sudo apt install ganglia-monitor libganglia1 + - name: Build MetricCollector run: make - - name: Run MetricCollector + - name: Run MetricCollectorlibganglia1 run: ./cc-metric-collector --once --config .github/ci-config.json diff --git a/.gitmodules b/.gitmodules new file mode 100644 index 0000000..ef3fc5c --- /dev/null +++ b/.gitmodules @@ -0,0 +1,4 @@ +[submodule ".github/actions/rpmbuild-centos8-golang"] + path = .github/actions/rpmbuild-centos8-golang + url = https://github.com/naveenrajm7/rpmbuild.git + branch = centos8 diff --git a/Makefile b/Makefile index c9805eb..33fd515 100644 --- a/Makefile +++ b/Makefile @@ -5,6 +5,15 @@ GOSRC_SINKS := $(wildcard sinks/*.go) GOSRC_RECEIVERS := $(wildcard receivers/*.go) GOSRC_INTERNAL := $(wildcard internal/*/*.go) 
GOSRC := $(GOSRC_APP) $(GOSRC_COLLECTORS) $(GOSRC_SINKS) $(GOSRC_RECEIVERS) $(GOSRC_INTERNAL) +COMPONENT_DIRS := collectors \ + sinks \ + receivers \ + internal/metricRouter \ + internal/ccMetric \ + internal/metricAggregator \ + internal/ccLogger \ + internal/ccTopology \ + internal/multiChanTicker .PHONY: all @@ -12,12 +21,13 @@ all: $(APP) $(APP): $(GOSRC) make -C collectors + make -C sinks go get go build -o $(APP) $(GOSRC_APP) .PHONY: clean clean: - make -C collectors clean + @for COMP in $(COMPONENT_DIRS); do if [ -e $$COMP/Makefile ]; then make -C $$COMP clean; fi; done rm -f $(APP) .PHONY: fmt diff --git a/sinks/Makefile b/sinks/Makefile new file mode 100644 index 0000000..bc0c09d --- /dev/null +++ b/sinks/Makefile @@ -0,0 +1,11 @@ + +all: libganglia.so + +libganglia.so: + @find /usr -name "libganglia.so*" -exec ln -s {} libganglia.so \; + + +clean: + rm -f libganglia.so + +.PHONY: clean diff --git a/sinks/README.md b/sinks/README.md index 1690df9..8ff3743 100644 --- a/sinks/README.md +++ b/sinks/README.md @@ -7,7 +7,8 @@ This folder contains the SinkManager and sink implementations for the cc-metric- - [`http`](./httpSink.md): Send metrics to an HTTP server as POST requests - [`influxdb`](./influxSink.md): Send metrics to an [InfluxDB](https://www.influxdata.com/products/influxdb/) database - [`nats`](./natsSink.md): Publish metrics to the [NATS](https://nats.io/) network overlay system -- [`ganglia`](./gangliaSink.md): Publish metrics in the [Ganglia Monitoring System](http://ganglia.info/) +- [`ganglia`](./gangliaSink.md): Publish metrics in the [Ganglia Monitoring System](http://ganglia.info/) using the `gmetric` CLI tool +- [`libganglia`](./libgangliaSink.md): Publish metrics in the [Ganglia Monitoring System](http://ganglia.info/) directly using `libganglia.so` # Configuration diff --git a/sinks/gangliaSink.go b/sinks/gangliaSink.go index c53b11a..4a57d20 100644 --- a/sinks/gangliaSink.go +++ b/sinks/gangliaSink.go @@ -1,3 +1,5 @@ +//go:build ganglia 
+ package sinks import ( diff --git a/sinks/gangliaSink_disabled.go b/sinks/gangliaSink_disabled.go new file mode 100644 index 0000000..84156d1 --- /dev/null +++ b/sinks/gangliaSink_disabled.go @@ -0,0 +1,31 @@ +//go:build !ganglia + +package sinks + +import ( + "encoding/json" + "errors" + + // "time" + + lp "github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric" +) + +type GangliaSink struct { + sink +} + +func (s *GangliaSink) Init(config json.RawMessage) error { + return errors.New("sink 'ganglia' not implemented, rebuild with tag 'ganglia'") +} + +func (s *GangliaSink) Write(point lp.CCMetric) error { + return errors.New("sink 'ganglia' not implemented, rebuild with tag 'ganglia'") +} + +func (s *GangliaSink) Flush() error { + return errors.New("sink 'ganglia' not implemented, rebuild with tag 'ganglia'") +} + +func (s *GangliaSink) Close() { +} diff --git a/sinks/libgangliaSink.go b/sinks/libgangliaSink.go new file mode 100644 index 0000000..7e03494 --- /dev/null +++ b/sinks/libgangliaSink.go @@ -0,0 +1,295 @@ +//go:build ganglia + +package sinks + +/* +#cgo CFLAGS: -DGM_PROTOCOL_GUARD +#cgo LDFLAGS: -L. 
-lganglia +#include + +// This is a copy&paste snippet of ganglia.h (BSD-3 license) +// See https://github.com/ganglia/monitor-core +// for further information + +enum ganglia_slope { + GANGLIA_SLOPE_ZERO = 0, + GANGLIA_SLOPE_POSITIVE, + GANGLIA_SLOPE_NEGATIVE, + GANGLIA_SLOPE_BOTH, + GANGLIA_SLOPE_UNSPECIFIED, + GANGLIA_SLOPE_DERIVATIVE, + GANGLIA_SLOPE_LAST_LEGAL_VALUE=GANGLIA_SLOPE_DERIVATIVE +}; +typedef enum ganglia_slope ganglia_slope_t; + +typedef struct Ganglia_pool* Ganglia_pool; +typedef struct Ganglia_gmond_config* Ganglia_gmond_config; +typedef struct Ganglia_udp_send_channels* Ganglia_udp_send_channels; + +struct Ganglia_metric { + Ganglia_pool pool; + struct Ganglia_metadata_message *msg; + char *value; + void *extra; +}; +typedef struct Ganglia_metric * Ganglia_metric; + +#ifdef __cplusplus +extern "C" { +#endif + +Ganglia_gmond_config Ganglia_gmond_config_create(char *path, int fallback_to_default); +//void Ganglia_gmond_config_destroy(Ganglia_gmond_config config); + +Ganglia_udp_send_channels Ganglia_udp_send_channels_create(Ganglia_pool p, Ganglia_gmond_config config); +void Ganglia_udp_send_channels_destroy(Ganglia_udp_send_channels channels); + +int Ganglia_udp_send_message(Ganglia_udp_send_channels channels, char *buf, int len ); + +Ganglia_metric Ganglia_metric_create( Ganglia_pool parent_pool ); +int Ganglia_metric_set( Ganglia_metric gmetric, char *name, char *value, char *type, char *units, unsigned int slope, unsigned int tmax, unsigned int dmax); +int Ganglia_metric_send( Ganglia_metric gmetric, Ganglia_udp_send_channels send_channels ); +//int Ganglia_metadata_send( Ganglia_metric gmetric, Ganglia_udp_send_channels send_channels ); +//int Ganglia_metadata_send_real( Ganglia_metric gmetric, Ganglia_udp_send_channels send_channels, char *override_string ); +void Ganglia_metadata_add( Ganglia_metric gmetric, char *name, char *value ); +//int Ganglia_value_send( Ganglia_metric gmetric, Ganglia_udp_send_channels send_channels ); +void 
Ganglia_metric_destroy( Ganglia_metric gmetric ); + +Ganglia_pool Ganglia_pool_create( Ganglia_pool parent ); +void Ganglia_pool_destroy( Ganglia_pool pool ); + +//ganglia_slope_t cstr_to_slope(const char* str); +//const char* slope_to_cstr(unsigned int slope); + +#ifdef __cplusplus +} +#endif +*/ +import "C" + +import ( + "encoding/json" + "errors" + "fmt" + "strings" + "unsafe" + + lp "github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric" +) + +const GMOND_CONFIG_FILE = `/var/ganglia/gmond.conf` + +type LibgangliaSinkConfig struct { + defaultSinkConfig + GmondConfig string `json:"gmond_config,omitempty"` + AddGangliaGroup bool `json:"add_ganglia_group,omitempty"` + //AddTagsAsDesc bool `json:"add_tags_as_desc,omitempty"` + AddTypeToName bool `json:"add_type_to_name,omitempty"` + AddUnits bool `json:"add_units,omitempty"` + ClusterName string `json:"cluster_name,omitempty"` +} + +type LibgangliaSink struct { + sink + config LibgangliaSinkConfig + global_context C.Ganglia_pool + gmond_config C.Ganglia_gmond_config + send_channels C.Ganglia_udp_send_channels + cstrCache map[string]*C.char +} + +func gangliaMetricName(point lp.CCMetric) string { + name := point.Name() + metricType, typeOK := point.GetTag("type") + metricTid, tidOk := point.GetTag("type-id") + gangliaType := metricType + metricTid + if strings.Contains(name, metricType) && tidOk { + name = strings.Replace(name, metricType, gangliaType, -1) + } else if typeOK && tidOk { + name = metricType + metricTid + "_" + name + } else if point.HasTag("device") { + device, _ := point.GetTag("device") + name = name + "_" + device + } + + return name +} + +func (s *LibgangliaSink) Init(config json.RawMessage) error { + var err error = nil + s.name = "LibgangliaSink" + //s.config.AddTagsAsDesc = false + s.config.AddGangliaGroup = false + s.config.AddTypeToName = false + s.config.AddUnits = true + s.config.GmondConfig = string(GMOND_CONFIG_FILE) + if len(config) > 0 { + err = json.Unmarshal(config, 
&s.config) + if err != nil { + fmt.Println(s.name, "Error reading config for", s.name, ":", err.Error()) + return err + } + } + + // Set up cache for the C strings + s.cstrCache = make(map[string]*C.char) + // s.cstrCache["globals"] = C.CString("globals") + + // s.cstrCache["override_hostname"] = C.CString("override_hostname") + // s.cstrCache["override_ip"] = C.CString("override_ip") + + // Add some constant strings + s.cstrCache["GROUP"] = C.CString("GROUP") + s.cstrCache["CLUSTER"] = C.CString("CLUSTER") + s.cstrCache[""] = C.CString("") + + // Add cluster name for lookup in Write() + if len(s.config.ClusterName) > 0 { + s.cstrCache[s.config.ClusterName] = C.CString(s.config.ClusterName) + } + // Add supported types for later lookup in Write() + s.cstrCache["double"] = C.CString("double") + s.cstrCache["int32"] = C.CString("int32") + s.cstrCache["string"] = C.CString("string") + + // Create Ganglia pool + s.global_context = C.Ganglia_pool_create(nil) + // Load Ganglia configuration + s.cstrCache[s.config.GmondConfig] = C.CString(s.config.GmondConfig) + s.gmond_config = C.Ganglia_gmond_config_create(s.cstrCache[s.config.GmondConfig], 0) + //globals := C.cfg_getsec(gmond_config, s.cstrCache["globals"]) + //override_hostname := C.cfg_getstr(globals, s.cstrCache["override_hostname"]) + //override_ip := C.cfg_getstr(globals, s.cstrCache["override_ip"]) + + s.send_channels = C.Ganglia_udp_send_channels_create(s.global_context, s.gmond_config) + return nil +} + +func (s *LibgangliaSink) Write(point lp.CCMetric) error { + var err error = nil + var c_name *C.char + var c_value *C.char + var c_type *C.char + var c_unit *C.char + + // helper function for looking up C strings in the cache + lookup := func(key string) *C.char { + if _, exist := s.cstrCache[key]; !exist { + s.cstrCache[key] = C.CString(key) + } + return s.cstrCache[key] + } + + // Get metric name + if s.config.AddTypeToName { + c_name = lookup(gangliaMetricName(point)) + } else { + c_name = 
lookup(point.Name()) + } + + // Get the value C string and lookup the type string in the cache + value, ok := point.GetField("value") + if !ok { + return fmt.Errorf("metric %s has no 'value' field", point.Name()) + } + switch real := value.(type) { + case float64: + c_value = C.CString(fmt.Sprintf("%f", real)) + c_type = lookup("double") + case float32: + c_value = C.CString(fmt.Sprintf("%f", real)) + c_type = lookup("float") + case int64: + c_value = C.CString(fmt.Sprintf("%d", real)) + c_type = lookup("int32") + case int32: + c_value = C.CString(fmt.Sprintf("%d", real)) + c_type = lookup("int32") + case int: + c_value = C.CString(fmt.Sprintf("%d", real)) + c_type = lookup("int32") + case string: + c_value = C.CString(real) + c_type = lookup("string") + default: + return fmt.Errorf("metric %s has invalid 'value' type for %s", point.Name(), s.name) + } + + // Add unit + if s.config.AddUnits { + if tagunit, tagok := point.GetTag("unit"); tagok { + c_unit = lookup(tagunit) + } else if metaunit, metaok := point.GetMeta("unit"); metaok { + c_unit = lookup(metaunit) + } else { + c_unit = lookup("") + } + } else { + c_unit = lookup("") + } + + // Create a new Ganglia metric + gmetric := C.Ganglia_metric_create(s.global_context) + rval := C.int(0) + // Set name, value, type and unit in the Ganglia metric + // Since we don't have this information from the collectors, + // we assume that the metric value can go up and down (slope), + // and their is no maximum for 'dmax' and 'tmax' + rval = C.Ganglia_metric_set(gmetric, c_name, c_value, c_type, c_unit, C.GANGLIA_SLOPE_BOTH, 0, 0) + switch rval { + case 1: + C.free(unsafe.Pointer(c_value)) + return errors.New("invalid parameters") + case 2: + C.free(unsafe.Pointer(c_value)) + return errors.New("one of your parameters has an invalid character '\"'") + case 3: + C.free(unsafe.Pointer(c_value)) + return fmt.Errorf("the type parameter \"%s\" is not a valid type", C.GoString(c_type)) + case 4: + C.free(unsafe.Pointer(c_value)) + 
return fmt.Errorf("the value parameter \"%s\" does not represent a number", C.GoString(c_value)) + default: + } + + // Set the cluster name, otherwise it takes it from the configuration file + if len(s.config.ClusterName) > 0 { + C.Ganglia_metadata_add(gmetric, lookup("CLUSTER"), lookup(s.config.ClusterName)) + } + // Set the group metadata in the Ganglia metric if configured + if group, ok := point.GetMeta("group"); ok && s.config.AddGangliaGroup { + c_group := lookup(group) + C.Ganglia_metadata_add(gmetric, lookup("GROUP"), c_group) + } + + // Now we send the metric + // gmetric does provide some more options like description and other options + // but they are not provided by the collectors + rval = C.Ganglia_metric_send(gmetric, s.send_channels) + if rval != 0 { + err = fmt.Errorf("there was an error sending metric %s to %d of the send channels ", point.Name(), rval) + // fall through to use Ganglia_metric_destroy from common cleanup + } + // Cleanup Ganglia metric + C.Ganglia_metric_destroy(gmetric) + // Free the value C string, the only one not stored in the cache + C.free(unsafe.Pointer(c_value)) + return err +} + +func (s *LibgangliaSink) Flush() error { + return nil +} + +func (s *LibgangliaSink) Close() { + // Destroy Ganglia configuration struct + // (not done by gmetric, I thought I am more clever but no...) + //C.Ganglia_gmond_config_destroy(s.gmond_config) + // Destroy Ganglia pool + C.Ganglia_pool_destroy(s.global_context) + + // Cleanup C string cache + for _, cstr := range s.cstrCache { + C.free(unsafe.Pointer(cstr)) + } +} diff --git a/sinks/libgangliaSink.md b/sinks/libgangliaSink.md new file mode 100644 index 0000000..a0dede7 --- /dev/null +++ b/sinks/libgangliaSink.md @@ -0,0 +1,41 @@ +## `libganglia` sink + +The `libganglia` sink interacts directly with the library of the [Ganglia Monitoring System](http://ganglia.info/) to submit the metrics. Consequently, it needs to be installed on all nodes. 
But this is commonly the case if you want to use Ganglia, because it requires at least a node daemon (`gmond` or `ganglia-monitor`) to work. + +The `libganglia` sink has probably less overhead compared to the `ganglia` sink because it does not require any process generation but initializes the environment and UDP connections only once. + + +### Configuration structure + +```json +{ + "<name>": { + "type": "libganglia", + "gmond_config" : "/path/to/gmond.conf", + "cluster_name": "MyCluster", + "add_ganglia_group" : true, + "add_type_to_name": true, + "add_units" : true + } +} +``` + +- `type`: makes the sink a `libganglia` sink +- `meta_as_tags`: print all meta information as tags in the output (optional) +- `gmond_config`: Path to the Ganglia configuration file `gmond.conf` (default: `/etc/ganglia/gmond.conf`) +- `cluster_name`: Set a cluster name for the metric. If not set, it is taken from `gmond_config` +- `add_ganglia_group`: Add a Ganglia metric group based on meta information. Some old versions of `gmetric` do not support the `--group` option +- `add_type_to_name`: Ganglia commonly uses only node-level metrics but with cc-metric-collector, there are metrics for cpus, memory domains, CPU sockets and the whole node. In order to get a unique name for each of them, this option prefixes the metric name with `<type><type-id>_` or `device_` depending on the metric tags and meta information. For metrics of the whole node `type=node`, no prefix is added +- `add_units`: Add metric value unit if there is a `unit` entry in the metric tags or meta information + +### Ganglia Installation + +My development system is Ubuntu 20.04. To install the required libraries with `apt`: + +``` +$ sudo apt install libganglia1 +``` + +The `libganglia.so` gets installed in `/usr/lib`. The Ganglia headers `libganglia1-dev` are **not** required. + +I added a `Makefile` in the `sinks` subfolder that searches for the library in `/usr` and creates a symlink (`sinks/libganglia.so`) for running/building the cc-metric-collector. 
So just type `make` before running/building in the main folder or the `sinks` subfolder. \ No newline at end of file diff --git a/sinks/libgangliaSink_disabled.go b/sinks/libgangliaSink_disabled.go new file mode 100644 index 0000000..87ee75c --- /dev/null +++ b/sinks/libgangliaSink_disabled.go @@ -0,0 +1,29 @@ +//go:build !ganglia + +package sinks + +import ( + "encoding/json" + "errors" + + lp "github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric" +) + +type LibgangliaSink struct { + sink +} + +func (s *LibgangliaSink) Init(config json.RawMessage) error { + return errors.New("sink 'libganglia' not implemented, rebuild with tag 'ganglia'") +} + +func (s *LibgangliaSink) Write(point lp.CCMetric) error { + return errors.New("sink 'libganglia' not implemented, rebuild with tag 'ganglia'") +} + +func (s *LibgangliaSink) Flush() error { + return errors.New("sink 'ganglia' not implemented, rebuild with tag 'ganglia'") +} + +func (s *LibgangliaSink) Close() { +} diff --git a/sinks/sinkManager.go b/sinks/sinkManager.go index bd243f4..487e7ca 100644 --- a/sinks/sinkManager.go +++ b/sinks/sinkManager.go @@ -20,6 +20,7 @@ var AvailableSinks = map[string]Sink{ "http": new(HttpSink), "ganglia": new(GangliaSink), "influxasync": new(InfluxAsyncSink), + "libganglia": new(LibgangliaSink), } // Metric collector manager data structure From e5585eaaa01247e9aa8920ed93e9adbaf7a1aaa7 Mon Sep 17 00:00:00 2001 From: Thomas Roehl Date: Wed, 16 Feb 2022 19:07:00 +0100 Subject: [PATCH 160/174] Add additional lines for build tags --- sinks/gangliaSink.go | 1 + sinks/gangliaSink_disabled.go | 1 + sinks/libgangliaSink.go | 1 + sinks/libgangliaSink_disabled.go | 1 + 4 files changed, 4 insertions(+) diff --git a/sinks/gangliaSink.go b/sinks/gangliaSink.go index 4a57d20..2675d5c 100644 --- a/sinks/gangliaSink.go +++ b/sinks/gangliaSink.go @@ -1,4 +1,5 @@ //go:build ganglia +// +build ganglia package sinks diff --git a/sinks/gangliaSink_disabled.go b/sinks/gangliaSink_disabled.go index 
84156d1..836e8fb 100644 --- a/sinks/gangliaSink_disabled.go +++ b/sinks/gangliaSink_disabled.go @@ -1,4 +1,5 @@ //go:build !ganglia +// +build !ganglia package sinks diff --git a/sinks/libgangliaSink.go b/sinks/libgangliaSink.go index 7e03494..a128572 100644 --- a/sinks/libgangliaSink.go +++ b/sinks/libgangliaSink.go @@ -1,4 +1,5 @@ //go:build ganglia +// +build ganglia package sinks diff --git a/sinks/libgangliaSink_disabled.go b/sinks/libgangliaSink_disabled.go index 87ee75c..01886b9 100644 --- a/sinks/libgangliaSink_disabled.go +++ b/sinks/libgangliaSink_disabled.go @@ -1,4 +1,5 @@ //go:build !ganglia +// +build !ganglia package sinks From 7be6d2a3389dd3342ac7cc9496bdafd45ff45221 Mon Sep 17 00:00:00 2001 From: Thomas Roehl Date: Wed, 16 Feb 2022 23:23:07 +0100 Subject: [PATCH 161/174] Use only readable files when searching for libganglia.so --- sinks/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sinks/Makefile b/sinks/Makefile index bc0c09d..65d7e75 100644 --- a/sinks/Makefile +++ b/sinks/Makefile @@ -2,7 +2,7 @@ all: libganglia.so libganglia.so: - @find /usr -name "libganglia.so*" -exec ln -s {} libganglia.so \; + @find /usr -readable -name "libganglia.so*" -exec ln -sf {} libganglia.so \; clean: From 3f8a2cb9b85f0d49b2c7e1b6115cf075ea642427 Mon Sep 17 00:00:00 2001 From: Thomas Roehl Date: Thu, 17 Feb 2022 01:33:38 +0100 Subject: [PATCH 162/174] Fix default gmond config file path for libgangliaSink --- sinks/libgangliaSink.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sinks/libgangliaSink.go b/sinks/libgangliaSink.go index a128572..085f10c 100644 --- a/sinks/libgangliaSink.go +++ b/sinks/libgangliaSink.go @@ -78,7 +78,7 @@ import ( lp "github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric" ) -const GMOND_CONFIG_FILE = `/var/ganglia/gmond.conf` +const GMOND_CONFIG_FILE = `/etc/ganglia/gmond.conf` type LibgangliaSinkConfig struct { defaultSinkConfig From bb9c6be741b4d03b33dc0253a6c090d00049c99d Mon 
Sep 17 00:00:00 2001 From: Holger Obermaier <40787752+ho-ob@users.noreply.github.com> Date: Thu, 17 Feb 2022 08:03:55 +0100 Subject: [PATCH 163/174] Do not create libganglia link if libganglia is not installed --- sinks/Makefile | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/sinks/Makefile b/sinks/Makefile index 65d7e75..4274c4d 100644 --- a/sinks/Makefile +++ b/sinks/Makefile @@ -2,7 +2,9 @@ all: libganglia.so libganglia.so: - @find /usr -readable -name "libganglia.so*" -exec ln -sf {} libganglia.so \; + @find /usr -readable -name "libganglia.so*" -print0 | \ + xargs --null --no-run-if-empty --replace \ + ln --symbolic --verbose --force '{}' libganglia.so clean: From 873befb41f47426572cbd97346516625ae1ec92b Mon Sep 17 00:00:00 2001 From: Thomas Roehl Date: Thu, 17 Feb 2022 15:06:55 +0100 Subject: [PATCH 164/174] Add ganglia build tag to README --- README.md | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 9b628fc..5aa8806 100644 --- a/README.md +++ b/README.md @@ -39,14 +39,21 @@ See the component READMEs for their configuration: ``` $ git clone git@github.com:ClusterCockpit/cc-metric-collector.git $ make (downloads LIKWID, builds it as static library with 'direct' accessmode and copies all required files for the collector) -$ go get (requires at least golang 1.13) -$ go build metric-collector +$ go get (requires at least golang 1.16) +$ make tags +Available tags: +ganglia +[...] +$ make # calls go build (-tags ganglia,...) -o cc-metric-collector ``` +## `ganglia` build tag +If you want support for the [Ganglia Monitoring System](http://ganglia.info/), you have to add `-tags ganglia` to the build command line. This enables two metric sinks. 
One is using the command line application `gmetric` (see [`ganglia`](./sinks/gangliaSink.md) sink), the other one interacts directly with `libganglia` the main Ganglia library that is commonly installed on each compute node (see [`libganglia`](./sinks/libgangliaSink.md) sink). The later one requires configuration before building, so use `make` instead of `go build` directly. + # Running ``` -$ ./metric-collector --help +$ ./cc-metric-collector --help Usage of metric-collector: -config string Path to configuration file (default "./config.json") From 0152c0dc1e1c5da65711c42f3020805727911aa6 Mon Sep 17 00:00:00 2001 From: Thomas Gruber Date: Thu, 17 Feb 2022 15:46:06 +0100 Subject: [PATCH 165/174] Update CpustatCollector (#36) * Update cpustat collector * Update CpustatCollector to use percentages and add 'num_cpus' metric --- collectors/cpustatMetric.go | 137 ++++++++++++++++++++++++++---------- 1 file changed, 98 insertions(+), 39 deletions(-) diff --git a/collectors/cpustatMetric.go b/collectors/cpustatMetric.go index f517300..28ae002 100644 --- a/collectors/cpustatMetric.go +++ b/collectors/cpustatMetric.go @@ -1,12 +1,15 @@ package collectors import ( + "bufio" "encoding/json" "fmt" - "io/ioutil" + "os" "strconv" "strings" "time" + + cclog "github.com/ClusterCockpit/cc-metric-collector/internal/ccLogger" lp "github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric" ) @@ -18,72 +21,128 @@ type CpustatCollectorConfig struct { type CpustatCollector struct { metricCollector - config CpustatCollectorConfig + config CpustatCollectorConfig + matches map[string]int + cputags map[string]map[string]string + nodetags map[string]string + num_cpus_metric lp.CCMetric } func (m *CpustatCollector) Init(config json.RawMessage) error { m.name = "CpustatCollector" m.setup() - m.meta = map[string]string{"source": m.name, "group": "CPU"} + m.meta = map[string]string{"source": m.name, "group": "CPU", "unit": "Percent"} + m.nodetags = map[string]string{"type": "node"} if 
len(config) > 0 { err := json.Unmarshal(config, &m.config) if err != nil { return err } } + matches := map[string]int{ + "cpu_user": 1, + "cpu_nice": 2, + "cpu_system": 3, + "cpu_idle": 4, + "cpu_iowait": 5, + "cpu_irq": 6, + "cpu_softirq": 7, + "cpu_steal": 8, + "cpu_guest": 9, + "cpu_guest_nice": 10, + } + + m.matches = make(map[string]int) + for match, index := range matches { + doExclude := false + for _, exclude := range m.config.ExcludeMetrics { + if match == exclude { + doExclude = true + break + } + } + if !doExclude { + m.matches[match] = index + } + } + + // Check input file + file, err := os.Open(string(CPUSTATFILE)) + if err != nil { + cclog.ComponentError(m.name, err.Error()) + } + defer file.Close() + + // Pre-generate tags for all CPUs + num_cpus := 0 + m.cputags = make(map[string]map[string]string) + scanner := bufio.NewScanner(file) + for scanner.Scan() { + line := scanner.Text() + linefields := strings.Fields(line) + if strings.HasPrefix(linefields[0], "cpu") && strings.Compare(linefields[0], "cpu") != 0 { + cpustr := strings.TrimLeft(linefields[0], "cpu") + cpu, _ := strconv.Atoi(cpustr) + m.cputags[linefields[0]] = map[string]string{"type": "cpu", "type-id": fmt.Sprintf("%d", cpu)} + num_cpus++ + } + } m.init = true return nil } -func (c *CpustatCollector) parseStatLine(line string, cpu int, exclude []string, output chan lp.CCMetric) { - ls := strings.Fields(line) - matches := []string{"", "cpu_user", "cpu_nice", "cpu_system", "cpu_idle", "cpu_iowait", "cpu_irq", "cpu_softirq", "cpu_steal", "cpu_guest", "cpu_guest_nice"} - for _, ex := range exclude { - matches, _ = RemoveFromStringList(matches, ex) - } - - var tags map[string]string - if cpu < 0 { - tags = map[string]string{"type": "node"} - } else { - tags = map[string]string{"type": "cpu", "type-id": fmt.Sprintf("%d", cpu)} - } - for i, m := range matches { - if len(m) > 0 { - x, err := strconv.ParseInt(ls[i], 0, 64) +func (m *CpustatCollector) parseStatLine(linefields []string, tags 
map[string]string, output chan lp.CCMetric) { + values := make(map[string]float64) + total := 0.0 + for match, index := range m.matches { + if len(match) > 0 { + x, err := strconv.ParseInt(linefields[index], 0, 64) if err == nil { - y, err := lp.New(m, tags, c.meta, map[string]interface{}{"value": int(x)}, time.Now()) - if err == nil { - output <- y - } + values[match] = float64(x) + total += values[match] } } } + t := time.Now() + for name, value := range values { + y, err := lp.New(name, tags, m.meta, map[string]interface{}{"value": (value * 100.0) / total}, t) + if err == nil { + output <- y + } + } } func (m *CpustatCollector) Read(interval time.Duration, output chan lp.CCMetric) { if !m.init { return } - buffer, err := ioutil.ReadFile(string(CPUSTATFILE)) - + num_cpus := 0 + file, err := os.Open(string(CPUSTATFILE)) if err != nil { - return + cclog.ComponentError(m.name, err.Error()) + } + defer file.Close() + + scanner := bufio.NewScanner(file) + for scanner.Scan() { + line := scanner.Text() + linefields := strings.Fields(line) + if strings.Compare(linefields[0], "cpu") == 0 { + m.parseStatLine(linefields, m.nodetags, output) + } else if strings.HasPrefix(linefields[0], "cpu") { + m.parseStatLine(linefields, m.cputags[linefields[0]], output) + num_cpus++ + } } - ll := strings.Split(string(buffer), "\n") - for _, line := range ll { - if len(line) == 0 { - continue - } - ls := strings.Fields(line) - if strings.Compare(ls[0], "cpu") == 0 { - m.parseStatLine(line, -1, m.config.ExcludeMetrics, output) - } else if strings.HasPrefix(ls[0], "cpu") { - cpustr := strings.TrimLeft(ls[0], "cpu") - cpu, _ := strconv.Atoi(cpustr) - m.parseStatLine(line, cpu, m.config.ExcludeMetrics, output) - } + num_cpus_metric, err := lp.New("num_cpus", + m.nodetags, + m.meta, + map[string]interface{}{"value": int(num_cpus)}, + time.Now(), + ) + if err == nil { + output <- num_cpus_metric } } From 4e8ee592118b0e1e3821be848c9b5ed0d70e59df Mon Sep 17 00:00:00 2001 From: Thomas Roehl Date: 
Fri, 18 Feb 2022 02:25:23 +0100 Subject: [PATCH 166/174] Update NetstatCollector to derive bandwidths and use an include list --- collectors/netstatMetric.go | 129 ++++++++++++++++++++++++------------ collectors/netstatMetric.md | 14 ++-- 2 files changed, 94 insertions(+), 49 deletions(-) diff --git a/collectors/netstatMetric.go b/collectors/netstatMetric.go index 3ca4cd3..7eaa3cf 100644 --- a/collectors/netstatMetric.go +++ b/collectors/netstatMetric.go @@ -1,93 +1,138 @@ package collectors import ( + "bufio" "encoding/json" - "io/ioutil" - "log" + "errors" + "os" "strconv" "strings" "time" + cclog "github.com/ClusterCockpit/cc-metric-collector/internal/ccLogger" lp "github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric" ) const NETSTATFILE = `/proc/net/dev` type NetstatCollectorConfig struct { - ExcludeDevices []string `json:"exclude_devices"` + IncludeDevices []string `json:"include_devices"` +} + +type NetstatCollectorMetric struct { + index int + lastValue float64 } type NetstatCollector struct { metricCollector - config NetstatCollectorConfig - matches map[int]string + config NetstatCollectorConfig + matches map[string]map[string]NetstatCollectorMetric + devtags map[string]map[string]string + lastTimestamp time.Time } func (m *NetstatCollector) Init(config json.RawMessage) error { m.name = "NetstatCollector" m.setup() + m.lastTimestamp = time.Now() m.meta = map[string]string{"source": m.name, "group": "Network"} - m.matches = map[int]string{ - 1: "net_bytes_in", - 9: "net_bytes_out", - 2: "net_pkts_in", - 10: "net_pkts_out", + m.devtags = make(map[string]map[string]string) + nameIndexMap := map[string]int{ + "net_bytes_in": 1, + "net_pkts_in": 2, + "net_bytes_out": 9, + "net_pkts_out": 10, } + m.matches = make(map[string]map[string]NetstatCollectorMetric) if len(config) > 0 { err := json.Unmarshal(config, &m.config) if err != nil { - log.Print(err.Error()) + cclog.ComponentError(m.name, "Error reading config:", err.Error()) return err } } - _, err 
:= ioutil.ReadFile(string(NETSTATFILE)) - if err == nil { - m.init = true - } - return nil -} - -func (m *NetstatCollector) Read(interval time.Duration, output chan lp.CCMetric) { - data, err := ioutil.ReadFile(string(NETSTATFILE)) + file, err := os.Open(string(NETSTATFILE)) if err != nil { - log.Print(err.Error()) - return + cclog.ComponentError(m.name, err.Error()) + return err } + defer file.Close() - lines := strings.Split(string(data), "\n") - for _, l := range lines { + scanner := bufio.NewScanner(file) + for scanner.Scan() { + l := scanner.Text() if !strings.Contains(l, ":") { continue } f := strings.Fields(l) - dev := f[0][0 : len(f[0])-1] - cont := false - for _, d := range m.config.ExcludeDevices { - if d == dev { - cont = true + dev := strings.Trim(f[0], ": ") + if _, ok := stringArrayContains(m.config.IncludeDevices, dev); ok { + m.matches[dev] = make(map[string]NetstatCollectorMetric) + for name, idx := range nameIndexMap { + m.matches[dev][name] = NetstatCollectorMetric{ + index: idx, + lastValue: 0, + } } + m.devtags[dev] = map[string]string{"device": dev, "type": "node"} } - if cont { + } + if len(m.devtags) == 0 { + return errors.New("no devices to collector metrics found") + } + m.init = true + return nil +} + +func (m *NetstatCollector) Read(interval time.Duration, output chan lp.CCMetric) { + if !m.init { + return + } + now := time.Now() + file, err := os.Open(string(NETSTATFILE)) + if err != nil { + cclog.ComponentError(m.name, err.Error()) + return + } + defer file.Close() + tdiff := now.Sub(m.lastTimestamp) + + scanner := bufio.NewScanner(file) + for scanner.Scan() { + l := scanner.Text() + if !strings.Contains(l, ":") { continue } - tags := map[string]string{"device": dev, "type": "node"} - for i, name := range m.matches { - v, err := strconv.ParseInt(f[i], 10, 0) - if err == nil { - y, err := lp.New(name, tags, m.meta, map[string]interface{}{"value": int(float64(v) * 1.0e-3)}, time.Now()) + f := strings.Fields(l) + dev := strings.Trim(f[0], 
":") + + if devmetrics, ok := m.matches[dev]; ok { + for name, data := range devmetrics { + v, err := strconv.ParseFloat(f[data.index], 64) if err == nil { - switch { - case strings.Contains(name, "byte"): - y.AddMeta("unit", "Byte") - case strings.Contains(name, "pkt"): - y.AddMeta("unit", "Packets") + vdiff := v - data.lastValue + value := vdiff / tdiff.Seconds() + if data.lastValue == 0 { + value = 0 } - output <- y + data.lastValue = v + y, err := lp.New(name, m.devtags[dev], m.meta, map[string]interface{}{"value": value}, now) + if err == nil { + switch { + case strings.Contains(name, "byte"): + y.AddMeta("unit", "bytes/sec") + case strings.Contains(name, "pkt"): + y.AddMeta("unit", "packets/sec") + } + output <- y + } + devmetrics[name] = data } } } } - + m.lastTimestamp = time.Now() } func (m *NetstatCollector) Close() { diff --git a/collectors/netstatMetric.md b/collectors/netstatMetric.md index 34a48fd..90d8600 100644 --- a/collectors/netstatMetric.md +++ b/collectors/netstatMetric.md @@ -3,19 +3,19 @@ ```json "netstat": { - "exclude_devices": [ - "lo" + "include_devices": [ + "eth0" ] } ``` -The `netstat` collector reads data from `/proc/net/dev` and outputs a handful **node** metrics. If a device is not required, it can be excluded from forwarding it to the sink. Commonly the `lo` device should be excluded. +The `netstat` collector reads data from `/proc/net/dev` and outputs a handful **node** metrics. With the `include_devices` list you can specify which network devices should be measured. **Note**: Most other collectors use an _exclude_ list instead of an include list. Metrics: -* `bytes_in` -* `bytes_out` -* `pkts_in` -* `pkts_out` +* `net_bytes_in` (`unit=bytes/sec`) +* `net_bytes_out` (`unit=bytes/sec`) +* `net_pkts_in` (`unit=packets/sec`) +* `net_pkts_out` (`unit=packets/sec`) The device name is added as tag `device`. 
From e2f78fe1c0325ed64c3e0c0dc5592479b1ace2eb Mon Sep 17 00:00:00 2001 From: Thomas Roehl Date: Fri, 18 Feb 2022 11:41:15 +0100 Subject: [PATCH 167/174] Add linker flag -Wl,--unresolved-symbols=ignore-in-object-files to build without library. Remove build tags --- sinks/gangliaSink.go | 3 -- sinks/gangliaSink_disabled.go | 32 ----------------- sinks/libgangliaSink.go | 59 +++++++++++++++++++++++--------- sinks/libgangliaSink_disabled.go | 30 ---------------- 4 files changed, 43 insertions(+), 81 deletions(-) delete mode 100644 sinks/gangliaSink_disabled.go delete mode 100644 sinks/libgangliaSink_disabled.go diff --git a/sinks/gangliaSink.go b/sinks/gangliaSink.go index 2675d5c..c53b11a 100644 --- a/sinks/gangliaSink.go +++ b/sinks/gangliaSink.go @@ -1,6 +1,3 @@ -//go:build ganglia -// +build ganglia - package sinks import ( diff --git a/sinks/gangliaSink_disabled.go b/sinks/gangliaSink_disabled.go deleted file mode 100644 index 836e8fb..0000000 --- a/sinks/gangliaSink_disabled.go +++ /dev/null @@ -1,32 +0,0 @@ -//go:build !ganglia -// +build !ganglia - -package sinks - -import ( - "encoding/json" - "errors" - - // "time" - - lp "github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric" -) - -type GangliaSink struct { - sink -} - -func (s *GangliaSink) Init(config json.RawMessage) error { - return errors.New("sink 'ganglia' not implemented, rebuild with tag 'ganglia'") -} - -func (s *GangliaSink) Write(point lp.CCMetric) error { - return errors.New("sink 'ganglia' not implemented, rebuild with tag 'ganglia'") -} - -func (s *GangliaSink) Flush() error { - return errors.New("sink 'ganglia' not implemented, rebuild with tag 'ganglia'") -} - -func (s *GangliaSink) Close() { -} diff --git a/sinks/libgangliaSink.go b/sinks/libgangliaSink.go index 085f10c..f6baef9 100644 --- a/sinks/libgangliaSink.go +++ b/sinks/libgangliaSink.go @@ -1,11 +1,8 @@ -//go:build ganglia -// +build ganglia - package sinks /* #cgo CFLAGS: -DGM_PROTOCOL_GUARD -#cgo LDFLAGS: -L. 
-lganglia +#cgo LDFLAGS: -L. -lganglia -Wl,--unresolved-symbols=ignore-in-object-files #include // This is a copy&paste snippet of ganglia.h (BSD-3 license) @@ -76,18 +73,31 @@ import ( "unsafe" lp "github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric" + "github.com/NVIDIA/go-nvml/pkg/dl" ) -const GMOND_CONFIG_FILE = `/etc/ganglia/gmond.conf` +const ( + GANGLIA_LIB_NAME = "libganglia.so" + GANGLIA_LIB_DL_FLAGS = dl.RTLD_LAZY | dl.RTLD_GLOBAL + GMOND_CONFIG_FILE = `/etc/ganglia/gmond.conf` +) + +type LibgangliaSinkSpecialMetric struct { + MetricName string `json:"metric_name,omitempty"` + NewName string `json:"new_name,omitempty"` + Slope string `json:"slope,omitempty"` +} type LibgangliaSinkConfig struct { defaultSinkConfig - GmondConfig string `json:"gmond_config,omitempty"` - AddGangliaGroup bool `json:"add_ganglia_group,omitempty"` - //AddTagsAsDesc bool `json:"add_tags_as_desc,omitempty"` - AddTypeToName bool `json:"add_type_to_name,omitempty"` - AddUnits bool `json:"add_units,omitempty"` - ClusterName string `json:"cluster_name,omitempty"` + GangliaLib string `json:"libganglia_path,omitempty"` + GmondConfig string `json:"gmond_config,omitempty"` + AddGangliaGroup bool `json:"add_ganglia_group,omitempty"` + AddTypeToName bool `json:"add_type_to_name,omitempty"` + AddUnits bool `json:"add_units,omitempty"` + ClusterName string `json:"cluster_name,omitempty"` + SpecialMetrics map[string]LibgangliaSinkSpecialMetric `json:"rename_metrics,omitempty"` // Map to rename metric name from key to value + //AddTagsAsDesc bool `json:"add_tags_as_desc,omitempty"` } type LibgangliaSink struct { @@ -124,6 +134,7 @@ func (s *LibgangliaSink) Init(config json.RawMessage) error { s.config.AddTypeToName = false s.config.AddUnits = true s.config.GmondConfig = string(GMOND_CONFIG_FILE) + s.config.GangliaLib = string(GANGLIA_LIB_NAME) if len(config) > 0 { err = json.Unmarshal(config, &s.config) if err != nil { @@ -131,6 +142,10 @@ func (s *LibgangliaSink) Init(config 
json.RawMessage) error { return err } } + lib := dl.New(s.config.GangliaLib, GANGLIA_LIB_DL_FLAGS) + if lib == nil { + return fmt.Errorf("error instantiating DynamicLibrary for %s", s.config.GangliaLib) + } // Set up cache for the C strings s.cstrCache = make(map[string]*C.char) @@ -182,16 +197,17 @@ func (s *LibgangliaSink) Write(point lp.CCMetric) error { } // Get metric name + metricname := point.Name() if s.config.AddTypeToName { c_name = lookup(gangliaMetricName(point)) } else { - c_name = lookup(point.Name()) + c_name = lookup(metricname) } // Get the value C string and lookup the type string in the cache value, ok := point.GetField("value") if !ok { - return fmt.Errorf("metric %s has no 'value' field", point.Name()) + return fmt.Errorf("metric %s has no 'value' field", metricname) } switch real := value.(type) { case float64: @@ -229,14 +245,25 @@ func (s *LibgangliaSink) Write(point lp.CCMetric) error { c_unit = lookup("") } + // Determine the slope of the metric. Ganglia's own collector mostly use + // 'both' but the mem and swap total uses 'zero'. + slope_type := C.GANGLIA_SLOPE_BOTH + switch metricname { + case "mem_total": + slope_type = C.GANGLIA_SLOPE_ZERO + case "swap_total": + slope_type = C.GANGLIA_SLOPE_ZERO + } + // Create a new Ganglia metric gmetric := C.Ganglia_metric_create(s.global_context) - rval := C.int(0) // Set name, value, type and unit in the Ganglia metric // Since we don't have this information from the collectors, // we assume that the metric value can go up and down (slope), - // and their is no maximum for 'dmax' and 'tmax' - rval = C.Ganglia_metric_set(gmetric, c_name, c_value, c_type, c_unit, C.GANGLIA_SLOPE_BOTH, 0, 0) + // and there is no maximum for 'dmax' and 'tmax'. 
+ // Ganglia's collectors set 'tmax' but not 'dmax' + rval := C.int(0) + rval = C.Ganglia_metric_set(gmetric, c_name, c_value, c_type, c_unit, C.uint(slope_type), 0, 0) switch rval { case 1: C.free(unsafe.Pointer(c_value)) diff --git a/sinks/libgangliaSink_disabled.go b/sinks/libgangliaSink_disabled.go deleted file mode 100644 index 01886b9..0000000 --- a/sinks/libgangliaSink_disabled.go +++ /dev/null @@ -1,30 +0,0 @@ -//go:build !ganglia -// +build !ganglia - -package sinks - -import ( - "encoding/json" - "errors" - - lp "github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric" -) - -type LibgangliaSink struct { - sink -} - -func (s *LibgangliaSink) Init(config json.RawMessage) error { - return errors.New("sink 'libganglia' not implemented, rebuild with tag 'ganglia'") -} - -func (s *LibgangliaSink) Write(point lp.CCMetric) error { - return errors.New("sink 'libganglia' not implemented, rebuild with tag 'ganglia'") -} - -func (s *LibgangliaSink) Flush() error { - return errors.New("sink 'ganglia' not implemented, rebuild with tag 'ganglia'") -} - -func (s *LibgangliaSink) Close() { -} From d9a81501e5ff91e3ecd663a29f770145e90f3b54 Mon Sep 17 00:00:00 2001 From: Thomas Roehl Date: Fri, 18 Feb 2022 15:05:45 +0100 Subject: [PATCH 168/174] Use common functions in both Ganglia sinks --- sinks/gangliaCommon.go | 50 +++++++++++++++++++++++++++++++++++++++++ sinks/gangliaSink.go | 23 ++++++++++++++----- sinks/libgangliaSink.go | 29 +++++------------------- 3 files changed, 73 insertions(+), 29 deletions(-) create mode 100644 sinks/gangliaCommon.go diff --git a/sinks/gangliaCommon.go b/sinks/gangliaCommon.go new file mode 100644 index 0000000..b939f16 --- /dev/null +++ b/sinks/gangliaCommon.go @@ -0,0 +1,50 @@ +package sinks + +import ( + "strings" + + lp "github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric" +) + +func GangliaMetricName(point lp.CCMetric) string { + name := point.Name() + metricType, typeOK := point.GetTag("type") + metricTid, tidOk := 
point.GetTag("type-id") + gangliaType := metricType + metricTid + if strings.Contains(name, metricType) && tidOk { + name = strings.Replace(name, metricType, gangliaType, -1) + } else if typeOK && tidOk { + name = metricType + metricTid + "_" + name + } else if point.HasTag("device") { + device, _ := point.GetTag("device") + name = name + "_" + device + } + + return name +} + +func GangliaMetricRename(point lp.CCMetric) string { + name := point.Name() + if name == "mem_total" || name == "swap_total" { + return name + } else if name == "net_bytes_in" { + return "bytes_in" + } else if name == "net_bytes_out" { + return "bytes_out" + } else if name == "net_pkts_in" { + return "pkts_in" + } else if name == "net_pkts_out" { + return "pkts_out" + } else if name == "cpu_iowait" { + return "cpu_wio" + } + return name +} + +func GangliaSlopeType(point lp.CCMetric) uint { + name := point.Name() + if name == "mem_total" || name == "swap_total" { + return 0 + } + return 3 +} diff --git a/sinks/gangliaSink.go b/sinks/gangliaSink.go index c53b11a..fa95f43 100644 --- a/sinks/gangliaSink.go +++ b/sinks/gangliaSink.go @@ -22,6 +22,8 @@ type GangliaSinkConfig struct { GmetricConfig string `json:"gmetric_config,omitempty"` AddGangliaGroup bool `json:"add_ganglia_group,omitempty"` AddTagsAsDesc bool `json:"add_tags_as_desc,omitempty"` + ClusterName string `json:"cluster_name,omitempty"` + AddTypeToName bool `json:"add_type_to_name,omitempty"` } type GangliaSink struct { @@ -82,8 +84,6 @@ func (s *GangliaSink) Write(point lp.CCMetric) error { for key, value := range point.Tags() { switch key { - case "cluster": - argstr = append(argstr, fmt.Sprintf("--cluster=%s", value)) case "unit": argstr = append(argstr, fmt.Sprintf("--units=%s", value)) default: @@ -93,8 +93,6 @@ func (s *GangliaSink) Write(point lp.CCMetric) error { if s.config.MetaAsTags { for key, value := range point.Meta() { switch key { - case "cluster": - argstr = append(argstr, fmt.Sprintf("--cluster=%s", value)) case 
"unit": argstr = append(argstr, fmt.Sprintf("--units=%s", value)) default: @@ -102,13 +100,28 @@ func (s *GangliaSink) Write(point lp.CCMetric) error { } } } + if len(s.config.ClusterName) > 0 { + argstr = append(argstr, fmt.Sprintf("--cluster=%s", s.config.ClusterName)) + } if s.config.AddTagsAsDesc && len(tagsstr) > 0 { argstr = append(argstr, fmt.Sprintf("--desc=%q", strings.Join(tagsstr, ","))) } if len(s.gmetric_config) > 0 { argstr = append(argstr, fmt.Sprintf("--conf=%s", s.gmetric_config)) } - argstr = append(argstr, fmt.Sprintf("--name=%s", point.Name())) + name := GangliaMetricRename(point) + if s.config.AddTypeToName { + argstr = append(argstr, fmt.Sprintf("--name=%s", GangliaMetricName(point))) + } else { + argstr = append(argstr, fmt.Sprintf("--name=%s", name)) + } + slope := GangliaSlopeType(point) + slopeStr := "both" + if slope == 0 { + slopeStr = "zero" + } + argstr = append(argstr, fmt.Sprintf("--slope=%s", slopeStr)) + for k, v := range point.Fields() { if k == "value" { switch value := v.(type) { diff --git a/sinks/libgangliaSink.go b/sinks/libgangliaSink.go index f6baef9..ed19145 100644 --- a/sinks/libgangliaSink.go +++ b/sinks/libgangliaSink.go @@ -69,7 +69,6 @@ import ( "encoding/json" "errors" "fmt" - "strings" "unsafe" lp "github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric" @@ -109,23 +108,6 @@ type LibgangliaSink struct { cstrCache map[string]*C.char } -func gangliaMetricName(point lp.CCMetric) string { - name := point.Name() - metricType, typeOK := point.GetTag("type") - metricTid, tidOk := point.GetTag("type-id") - gangliaType := metricType + metricTid - if strings.Contains(name, metricType) && tidOk { - name = strings.Replace(name, metricType, gangliaType, -1) - } else if typeOK && tidOk { - name = metricType + metricTid + "_" + name - } else if point.HasTag("device") { - device, _ := point.GetTag("device") - name = name + "_" + device - } - - return name -} - func (s *LibgangliaSink) Init(config json.RawMessage) error { 
var err error = nil s.name = "LibgangliaSink" @@ -197,9 +179,9 @@ func (s *LibgangliaSink) Write(point lp.CCMetric) error { } // Get metric name - metricname := point.Name() + metricname := GangliaMetricRename(point) if s.config.AddTypeToName { - c_name = lookup(gangliaMetricName(point)) + c_name = lookup(GangliaMetricName(point)) } else { c_name = lookup(metricname) } @@ -247,11 +229,10 @@ func (s *LibgangliaSink) Write(point lp.CCMetric) error { // Determine the slope of the metric. Ganglia's own collector mostly use // 'both' but the mem and swap total uses 'zero'. + slope := GangliaSlopeType(point) slope_type := C.GANGLIA_SLOPE_BOTH - switch metricname { - case "mem_total": - slope_type = C.GANGLIA_SLOPE_ZERO - case "swap_total": + switch slope { + case 0: slope_type = C.GANGLIA_SLOPE_ZERO } From e8adf5b909500dd4af8e368db275dbee6313c274 Mon Sep 17 00:00:00 2001 From: Holger Obermaier Date: Fri, 18 Feb 2022 15:43:52 +0100 Subject: [PATCH 169/174] Create an empty libganglia.so stub if none exists --- sinks/Makefile | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/sinks/Makefile b/sinks/Makefile index 4274c4d..bd40f10 100644 --- a/sinks/Makefile +++ b/sinks/Makefile @@ -2,9 +2,10 @@ all: libganglia.so libganglia.so: - @find /usr -readable -name "libganglia.so*" -print0 | \ + @find /usr ! -readable -prune -o -type d ! -executable -prune -o -name "$@*" -print0 | \ xargs --null --no-run-if-empty --replace \ - ln --symbolic --verbose --force '{}' libganglia.so + ln --symbolic --verbose --force '{}' "$@" + @if [[ ! 
-e "$@" ]]; then touch "$@"; fi clean: From 635a75c64bb430d242f09f6648de223867f3cf32 Mon Sep 17 00:00:00 2001 From: Holger Obermaier <40787752+ho-ob@users.noreply.github.com> Date: Fri, 18 Feb 2022 16:56:41 +0100 Subject: [PATCH 170/174] Report maximum and critical temperature --- collectors.json | 2 ++ collectors/tempMetric.go | 75 +++++++++++++++++++++++++++++++++++----- 2 files changed, 69 insertions(+), 8 deletions(-) diff --git a/collectors.json b/collectors.json index 669355c..27ef822 100644 --- a/collectors.json +++ b/collectors.json @@ -15,6 +15,8 @@ "numastats": {}, "nvidia": {}, "tempstat": { + "report_max_temperature": true, + "report_critical_temperature": true, "tag_override": { "hwmon0": { "type": "socket", diff --git a/collectors/tempMetric.go b/collectors/tempMetric.go index bd26584..bbc5100 100644 --- a/collectors/tempMetric.go +++ b/collectors/tempMetric.go @@ -21,18 +21,24 @@ import ( // /sys/class/hwmon/hwmon*/temp*_crit -> 100000 = 100.0°C type TempCollectorSensor struct { - name string - label string - metricName string // Default: name_label - file string - tags map[string]string + name string + label string + metricName string // Default: name_label + file string + maxTempName string + maxTemp int64 + critTempName string + critTemp int64 + tags map[string]string } type TempCollector struct { metricCollector config struct { - ExcludeMetrics []string `json:"exclude_metrics"` - TagOverride map[string]map[string]string `json:"tag_override"` + ExcludeMetrics []string `json:"exclude_metrics"` + TagOverride map[string]map[string]string `json:"tag_override"` + ReportMaxTemp bool `json:"report_max_temperature"` + ReportCriticalTemp bool `json:"report_critical_temperature"` } sensors []*TempCollectorSensor } @@ -92,6 +98,9 @@ func (m *TempCollector) Init(config json.RawMessage) error { switch { case len(sensor.name) == 0 && len(sensor.label) == 0: continue + case sensor.name == "coretemp" && strings.HasPrefix(sensor.label, "Core ") || + sensor.name == 
"coretemp" && strings.HasPrefix(sensor.label, "Package id "): + sensor.metricName = "temp_" + sensor.label case len(sensor.name) != 0 && len(sensor.label) != 0: sensor.metricName = sensor.name + "_" + sensor.label case len(sensor.name) != 0: @@ -99,12 +108,12 @@ func (m *TempCollector) Init(config json.RawMessage) error { case len(sensor.label) != 0: sensor.metricName = sensor.label } + sensor.metricName = strings.ToLower(sensor.metricName) sensor.metricName = strings.Replace(sensor.metricName, " ", "_", -1) // Add temperature prefix, if required if !strings.Contains(sensor.metricName, "temp") { sensor.metricName = "temp_" + sensor.metricName } - sensor.metricName = strings.ToLower(sensor.metricName) // Sensor file sensor.file = file @@ -122,6 +131,28 @@ func (m *TempCollector) Init(config json.RawMessage) error { } } + // max temperature + if m.config.ReportMaxTemp { + maxTempFile := strings.TrimSuffix(file, "_input") + "_max" + if buffer, err := ioutil.ReadFile(maxTempFile); err == nil { + if x, err := strconv.ParseInt(strings.TrimSpace(string(buffer)), 10, 64); err == nil { + sensor.maxTempName = strings.Replace(sensor.metricName, "temp", "max_temp", 1) + sensor.maxTemp = x / 1000 + } + } + } + + // critical temperature + if m.config.ReportCriticalTemp { + criticalTempFile := strings.TrimSuffix(file, "_input") + "_crit" + if buffer, err := ioutil.ReadFile(criticalTempFile); err == nil { + if x, err := strconv.ParseInt(strings.TrimSpace(string(buffer)), 10, 64); err == nil { + sensor.critTempName = strings.Replace(sensor.metricName, "temp", "crit_temp", 1) + sensor.critTemp = x / 1000 + } + } + } + m.sensors = append(m.sensors, sensor) } @@ -164,6 +195,34 @@ func (m *TempCollector) Read(interval time.Duration, output chan lp.CCMetric) { if err == nil { output <- y } + + // max temperature + if m.config.ReportMaxTemp && sensor.maxTemp != 0 { + y, err := lp.New( + sensor.maxTempName, + sensor.tags, + m.meta, + map[string]interface{}{"value": sensor.maxTemp}, + 
time.Now(), + ) + if err == nil { + output <- y + } + } + + // critical temperature + if m.config.ReportCriticalTemp && sensor.critTemp != 0 { + y, err := lp.New( + sensor.critTempName, + sensor.tags, + m.meta, + map[string]interface{}{"value": sensor.critTemp}, + time.Now(), + ) + if err == nil { + output <- y + } + } } } From 65c3106af2ad99ffe1724d6c6a3e7da0b534a515 Mon Sep 17 00:00:00 2001 From: Holger Obermaier <40787752+ho-ob@users.noreply.github.com> Date: Fri, 18 Feb 2022 16:59:59 +0100 Subject: [PATCH 171/174] Remove tags for num cores and packages --- collectors/cpufreqCpuinfoMetric.go | 8 +++----- collectors/cpufreqMetric.go | 8 +++----- 2 files changed, 6 insertions(+), 10 deletions(-) diff --git a/collectors/cpufreqCpuinfoMetric.go b/collectors/cpufreqCpuinfoMetric.go index f527859..44a3b0c 100644 --- a/collectors/cpufreqCpuinfoMetric.go +++ b/collectors/cpufreqCpuinfoMetric.go @@ -150,11 +150,9 @@ func (m *CPUFreqCpuInfoCollector) Init(config json.RawMessage) error { t.numNonHT = numNonHT t.numNonHT_int = numNonHT_int t.tagSet = map[string]string{ - "type": "cpu", - "type-id": t.processor, - "num_core": t.numNonHT, - "package_id": t.physicalPackageID, - "num_package": t.numPhysicalPackages, + "type": "cpu", + "type-id": t.processor, + "package_id": t.physicalPackageID, } } diff --git a/collectors/cpufreqMetric.go b/collectors/cpufreqMetric.go index da9f2d7..5146baa 100644 --- a/collectors/cpufreqMetric.go +++ b/collectors/cpufreqMetric.go @@ -161,11 +161,9 @@ func (m *CPUFreqCollector) Init(config json.RawMessage) error { t.numNonHT = numNonHT t.numNonHT_int = numNonHT_int t.tagSet = map[string]string{ - "type": "cpu", - "type-id": t.processor, - "num_core": t.numNonHT, - "package_id": t.physicalPackageID, - "num_package": t.numPhysicalPackages, + "type": "cpu", + "type-id": t.processor, + "package_id": t.physicalPackageID, } } From 435528fa97ae5d2c15607a3c3efd38ad48ca0ba1 Mon Sep 17 00:00:00 2001 From: Thomas Gruber Date: Mon, 21 Feb 2022 12:44:26 
+0100 Subject: [PATCH 172/174] Split diskstat Collector (#38) * Split diskstats (free, total space) and iostats (reads, writes, ... * Add iostat Collector to CollectorManager --- collectors/README.md | 1 + collectors/collectorManager.go | 1 + collectors/diskstatMetric.go | 127 ++++++++++++++------------- collectors/diskstatMetric.md | 29 ++---- collectors/iostatMetric.go | 155 +++++++++++++++++++++++++++++++++ collectors/iostatMetric.md | 34 ++++++++ 6 files changed, 262 insertions(+), 85 deletions(-) create mode 100644 collectors/iostatMetric.go create mode 100644 collectors/iostatMetric.md diff --git a/collectors/README.md b/collectors/README.md index 393b200..00e0da7 100644 --- a/collectors/README.md +++ b/collectors/README.md @@ -18,6 +18,7 @@ In contrast to the configuration files for sinks and receivers, the collectors c * [`cpustat`](./cpustatMetric.md) * [`memstat`](./memstatMetric.md) +* [`iostat`](./iostatMetric.md) * [`diskstat`](./diskstatMetric.md) * [`loadavg`](./loadavgMetric.md) * [`netstat`](./netstatMetric.md) diff --git a/collectors/collectorManager.go b/collectors/collectorManager.go index 7918793..86b423e 100644 --- a/collectors/collectorManager.go +++ b/collectors/collectorManager.go @@ -25,6 +25,7 @@ var AvailableCollectors = map[string]MetricCollector{ "topprocs": new(TopProcsCollector), "nvidia": new(NvidiaCollector), "customcmd": new(CustomCmdCollector), + "iostat": new(IOstatCollector), "diskstat": new(DiskstatCollector), "tempstat": new(TempCollector), "ipmistat": new(IpmiCollector), diff --git a/collectors/diskstatMetric.go b/collectors/diskstatMetric.go index 50c41cd..819a1ab 100644 --- a/collectors/diskstatMetric.go +++ b/collectors/diskstatMetric.go @@ -1,18 +1,21 @@ package collectors import ( - "io/ioutil" - lp "github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric" - // "log" + "bufio" "encoding/json" - "errors" - "strconv" + "fmt" + "os" "strings" + "syscall" "time" + + cclog 
"github.com/ClusterCockpit/cc-metric-collector/internal/ccLogger" + lp "github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric" ) -const DISKSTATFILE = `/proc/diskstats` -const DISKSTAT_SYSFSPATH = `/sys/block` +// "log" + +const MOUNTFILE = `/proc/self/mounts` type DiskstatCollectorConfig struct { ExcludeMetrics []string `json:"exclude_metrics,omitempty"` @@ -20,93 +23,89 @@ type DiskstatCollectorConfig struct { type DiskstatCollector struct { metricCollector - matches map[int]string - config DiskstatCollectorConfig + //matches map[string]int + config IOstatCollectorConfig + //devices map[string]IOstatCollectorEntry } func (m *DiskstatCollector) Init(config json.RawMessage) error { - var err error m.name = "DiskstatCollector" m.meta = map[string]string{"source": m.name, "group": "Disk"} m.setup() if len(config) > 0 { - err = json.Unmarshal(config, &m.config) + err := json.Unmarshal(config, &m.config) if err != nil { return err } } - // https://www.kernel.org/doc/html/latest/admin-guide/iostats.html - matches := map[int]string{ - 3: "reads", - 4: "reads_merged", - 5: "read_sectors", - 6: "read_ms", - 7: "writes", - 8: "writes_merged", - 9: "writes_sectors", - 10: "writes_ms", - 11: "ioops", - 12: "ioops_ms", - 13: "ioops_weighted_ms", - 14: "discards", - 15: "discards_merged", - 16: "discards_sectors", - 17: "discards_ms", - 18: "flushes", - 19: "flushes_ms", + file, err := os.Open(string(MOUNTFILE)) + if err != nil { + cclog.ComponentError(m.name, err.Error()) + return err } - m.matches = make(map[int]string) - for k, v := range matches { - _, skip := stringArrayContains(m.config.ExcludeMetrics, v) - if !skip { - m.matches[k] = v - } - } - if len(m.matches) == 0 { - return errors.New("No metrics to collect") - } - _, err = ioutil.ReadFile(string(DISKSTATFILE)) - if err == nil { - m.init = true - } - return err + defer file.Close() + m.init = true + return nil } func (m *DiskstatCollector) Read(interval time.Duration, output chan lp.CCMetric) { - var lines 
[]string if !m.init { return } - buffer, err := ioutil.ReadFile(string(DISKSTATFILE)) + file, err := os.Open(string(MOUNTFILE)) if err != nil { + cclog.ComponentError(m.name, err.Error()) return } - lines = strings.Split(string(buffer), "\n") + defer file.Close() - for _, line := range lines { + part_max_used := uint64(0) + scanner := bufio.NewScanner(file) + for scanner.Scan() { + line := scanner.Text() if len(line) == 0 { continue } - f := strings.Fields(line) - if strings.Contains(f[2], "loop") { + if !strings.HasPrefix(line, "/dev") { continue } - tags := map[string]string{ - "device": f[2], - "type": "node", + linefields := strings.Fields(line) + if strings.Contains(linefields[0], "loop") { + continue } - for idx, name := range m.matches { - if idx < len(f) { - x, err := strconv.ParseInt(f[idx], 0, 64) - if err == nil { - y, err := lp.New(name, tags, m.meta, map[string]interface{}{"value": int(x)}, time.Now()) - if err == nil { - output <- y - } - } - } + if strings.Contains(linefields[1], "boot") { + continue } + path := strings.Replace(linefields[1], `\040`, " ", -1) + stat := syscall.Statfs_t{} + err := syscall.Statfs(path, &stat) + if err != nil { + fmt.Println(err.Error()) + return + } + tags := map[string]string{"type": "node", "device": linefields[0]} + total := (stat.Blocks * uint64(stat.Bsize)) / uint64(1000000000) + y, err := lp.New("disk_total", tags, m.meta, map[string]interface{}{"value": total}, time.Now()) + if err == nil { + y.AddMeta("unit", "GBytes") + output <- y + } + free := (stat.Bfree * uint64(stat.Bsize)) / uint64(1000000000) + y, err = lp.New("disk_free", tags, m.meta, map[string]interface{}{"value": free}, time.Now()) + if err == nil { + y.AddMeta("unit", "GBytes") + output <- y + } + perc := (100 * (total - free)) / total + if perc > part_max_used { + part_max_used = perc + } + } + y, err := lp.New("part_max_used", map[string]string{"type": "node"}, m.meta, map[string]interface{}{"value": part_max_used}, time.Now()) + if err == nil { 
+ y.AddMeta("unit", "percent") + output <- y } } diff --git a/collectors/diskstatMetric.md b/collectors/diskstatMetric.md index 1ac341d..a38f154 100644 --- a/collectors/diskstatMetric.md +++ b/collectors/diskstatMetric.md @@ -4,31 +4,18 @@ ```json "diskstat": { "exclude_metrics": [ - "read_ms" + "disk_total" ], } ``` -The `netstat` collector reads data from `/proc/net/dev` and outputs a handful **node** metrics. If a metric is not required, it can be excluded from forwarding it to the sink. +The `diskstat` collector reads data from `/proc/self/mounts` and outputs a handful **node** metrics. If a metric is not required, it can be excluded from forwarding it to the sink. -Metrics: -* `reads` -* `reads_merged` -* `read_sectors` -* `read_ms` -* `writes` -* `writes_merged` -* `writes_sectors` -* `writes_ms` -* `ioops` -* `ioops_ms` -* `ioops_weighted_ms` -* `discards` -* `discards_merged` -* `discards_sectors` -* `discards_ms` -* `flushes` -* `flushes_ms` +Metrics per device (with `device` tag): +* `disk_total` (unit `GBytes`) +* `disk_free` (unit `GBytes`) + +Global metrics: +* `part_max_used` (unit `percent`) -The device name is added as tag `device`. 
diff --git a/collectors/iostatMetric.go b/collectors/iostatMetric.go new file mode 100644 index 0000000..ca7f33c --- /dev/null +++ b/collectors/iostatMetric.go @@ -0,0 +1,155 @@ +package collectors + +import ( + "bufio" + "os" + + cclog "github.com/ClusterCockpit/cc-metric-collector/internal/ccLogger" + lp "github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric" + + // "log" + "encoding/json" + "errors" + "strconv" + "strings" + "time" +) + +const IOSTATFILE = `/proc/diskstats` +const IOSTAT_SYSFSPATH = `/sys/block` + +type IOstatCollectorConfig struct { + ExcludeMetrics []string `json:"exclude_metrics,omitempty"` +} + +type IOstatCollectorEntry struct { + lastValues map[string]int64 + tags map[string]string +} + +type IOstatCollector struct { + metricCollector + matches map[string]int + config IOstatCollectorConfig + devices map[string]IOstatCollectorEntry +} + +func (m *IOstatCollector) Init(config json.RawMessage) error { + var err error + m.name = "IOstatCollector" + m.meta = map[string]string{"source": m.name, "group": "Disk"} + m.setup() + if len(config) > 0 { + err = json.Unmarshal(config, &m.config) + if err != nil { + return err + } + } + // https://www.kernel.org/doc/html/latest/admin-guide/iostats.html + matches := map[string]int{ + "io_reads": 3, + "io_reads_merged": 4, + "io_read_sectors": 5, + "io_read_ms": 6, + "io_writes": 7, + "io_writes_merged": 8, + "io_writes_sectors": 9, + "io_writes_ms": 10, + "io_ioops": 11, + "io_ioops_ms": 12, + "io_ioops_weighted_ms": 13, + "io_discards": 14, + "io_discards_merged": 15, + "io_discards_sectors": 16, + "io_discards_ms": 17, + "io_flushes": 18, + "io_flushes_ms": 19, + } + m.devices = make(map[string]IOstatCollectorEntry) + m.matches = make(map[string]int) + for k, v := range matches { + if _, skip := stringArrayContains(m.config.ExcludeMetrics, k); !skip { + m.matches[k] = v + } + } + if len(m.matches) == 0 { + return errors.New("no metrics to collect") + } + file, err := os.Open(string(IOSTATFILE)) 
+ if err != nil { + cclog.ComponentError(m.name, err.Error()) + return err + } + defer file.Close() + + scanner := bufio.NewScanner(file) + for scanner.Scan() { + line := scanner.Text() + linefields := strings.Fields(line) + device := linefields[2] + if strings.Contains(device, "loop") { + continue + } + values := make(map[string]int64) + for m := range m.matches { + values[m] = 0 + } + m.devices[device] = IOstatCollectorEntry{ + tags: map[string]string{ + "device": linefields[2], + "type": "node", + }, + lastValues: values, + } + } + m.init = true + return err +} + +func (m *IOstatCollector) Read(interval time.Duration, output chan lp.CCMetric) { + if !m.init { + return + } + + file, err := os.Open(string(IOSTATFILE)) + if err != nil { + cclog.ComponentError(m.name, err.Error()) + return + } + defer file.Close() + + scanner := bufio.NewScanner(file) + for scanner.Scan() { + line := scanner.Text() + if len(line) == 0 { + continue + } + linefields := strings.Fields(line) + device := linefields[2] + if strings.Contains(device, "loop") { + continue + } + if _, ok := m.devices[device]; !ok { + continue + } + entry := m.devices[device] + for name, idx := range m.matches { + if idx < len(linefields) { + x, err := strconv.ParseInt(linefields[idx], 0, 64) + if err == nil { + diff := x - entry.lastValues[name] + y, err := lp.New(name, entry.tags, m.meta, map[string]interface{}{"value": int(diff)}, time.Now()) + if err == nil { + output <- y + } + } + entry.lastValues[name] = x + } + } + m.devices[device] = entry + } +} + +func (m *IOstatCollector) Close() { + m.init = false +} diff --git a/collectors/iostatMetric.md b/collectors/iostatMetric.md new file mode 100644 index 0000000..e3e8604 --- /dev/null +++ b/collectors/iostatMetric.md @@ -0,0 +1,34 @@ + +## `iostat` collector + +```json + "iostat": { + "exclude_metrics": [ + "read_ms" + ], + } +``` + +The `iostat` collector reads data from `/proc/diskstats` and outputs a handful **node** metrics. 
If a metric is not required, it can be excluded from forwarding it to the sink. + +Metrics: +* `io_reads` +* `io_reads_merged` +* `io_read_sectors` +* `io_read_ms` +* `io_writes` +* `io_writes_merged` +* `io_writes_sectors` +* `io_writes_ms` +* `io_ioops` +* `io_ioops_ms` +* `io_ioops_weighted_ms` +* `io_discards` +* `io_discards_merged` +* `io_discards_sectors` +* `io_discards_ms` +* `io_flushes` +* `io_flushes_ms` + +The device name is added as tag `device`. For more details, see https://www.kernel.org/doc/html/latest/admin-guide/iostats.html + From ea5b3bdbd63129e198a0208814502c8d7db2a95f Mon Sep 17 00:00:00 2001 From: Thomas Gruber Date: Mon, 21 Feb 2022 12:45:08 +0100 Subject: [PATCH 173/174] Use receiver names from config (#34) * Use common configuration format of 'receiver_name' -> 'receiver_config' * Adjust receiver configuration files --- .github/ci-receivers.json | 2 +- receivers.json | 6 ++--- receivers/metricReceiver.go | 17 +++++++----- receivers/natsReceiver.go | 53 ++++++++++++++++++++----------------- receivers/receiveManager.go | 20 +++++++------- 5 files changed, 52 insertions(+), 46 deletions(-) diff --git a/.github/ci-receivers.json b/.github/ci-receivers.json index fe51488..0967ef4 100644 --- a/.github/ci-receivers.json +++ b/.github/ci-receivers.json @@ -1 +1 @@ -[] +{} diff --git a/receivers.json b/receivers.json index e368fc3..a27f07d 100644 --- a/receivers.json +++ b/receivers.json @@ -1,8 +1,8 @@ -[ - { +{ + "natsrecv" : { "type": "nats", "address": "nats://my-url", "port" : "4222", "database": "testcluster" } -] +} diff --git a/receivers/metricReceiver.go b/receivers/metricReceiver.go index 50724b1..c712186 100644 --- a/receivers/metricReceiver.go +++ b/receivers/metricReceiver.go @@ -2,9 +2,15 @@ package receivers import ( // "time" + "encoding/json" + lp "github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric" ) +type defaultReceiverConfig struct { + Type string `json:"type"` +} + type ReceiverConfig struct { Addr string 
`json:"address"` Port string `json:"port"` @@ -14,16 +20,13 @@ type ReceiverConfig struct { } type receiver struct { - name string - addr string - port string - database string - organization string - sink chan lp.CCMetric + typename string + name string + sink chan lp.CCMetric } type Receiver interface { - Init(config ReceiverConfig) error + Init(name string, config json.RawMessage) error Start() Close() Name() string diff --git a/receivers/natsReceiver.go b/receivers/natsReceiver.go index 853edf1..dc96971 100644 --- a/receivers/natsReceiver.go +++ b/receivers/natsReceiver.go @@ -1,19 +1,22 @@ package receivers import ( + "encoding/json" "errors" "fmt" + "time" + + cclog "github.com/ClusterCockpit/cc-metric-collector/internal/ccLogger" lp "github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric" influx "github.com/influxdata/line-protocol" nats "github.com/nats-io/nats.go" - cclog "github.com/ClusterCockpit/cc-metric-collector/internal/ccLogger" - "time" ) type NatsReceiverConfig struct { - Addr string `json:"address"` - Port string `json:"port"` - Database string `json:"database"` + Type string `json:"type"` + Addr string `json:"address"` + Port string `json:"port"` + Subject string `json:"subject"` } type NatsReceiver struct { @@ -22,35 +25,35 @@ type NatsReceiver struct { handler *influx.MetricHandler parser *influx.Parser meta map[string]string - config ReceiverConfig + config NatsReceiverConfig } var DefaultTime = func() time.Time { return time.Unix(42, 0) } -func (r *NatsReceiver) Init(config ReceiverConfig) error { - r.name = "NatsReceiver" - r.config = config +func (r *NatsReceiver) Init(name string, config json.RawMessage) error { + r.typename = "NatsReceiver" + r.name = name + r.config.Addr = nats.DefaultURL + r.config.Port = "4222" + if len(config) > 0 { + err := json.Unmarshal(config, &r.config) + if err != nil { + cclog.ComponentError(r.name, "Error reading config:", err.Error()) + return err + } + } if len(r.config.Addr) == 0 || 
len(r.config.Port) == 0 || - len(r.config.Database) == 0 { - return errors.New("Not all configuration variables set required by NatsReceiver") + len(r.config.Subject) == 0 { + return errors.New("not all configuration variables set required by NatsReceiver") } r.meta = map[string]string{"source": r.name} - r.addr = r.config.Addr - if len(r.addr) == 0 { - r.addr = nats.DefaultURL - } - r.port = r.config.Port - if len(r.port) == 0 { - r.port = "4222" - } - uri := fmt.Sprintf("%s:%s", r.addr, r.port) - cclog.ComponentDebug("NatsReceiver", "INIT", uri) + uri := fmt.Sprintf("%s:%s", r.config.Addr, r.config.Port) + cclog.ComponentDebug(r.name, "INIT", uri, "Subject", r.config.Subject) nc, err := nats.Connect(uri) if err == nil { - r.database = r.config.Database r.nc = nc } else { r.nc = nil @@ -63,8 +66,8 @@ func (r *NatsReceiver) Init(config ReceiverConfig) error { } func (r *NatsReceiver) Start() { - cclog.ComponentDebug("NatsReceiver", "START") - r.nc.Subscribe(r.database, r._NatsReceive) + cclog.ComponentDebug(r.name, "START") + r.nc.Subscribe(r.config.Subject, r._NatsReceive) } func (r *NatsReceiver) _NatsReceive(m *nats.Msg) { @@ -84,7 +87,7 @@ func (r *NatsReceiver) _NatsReceive(m *nats.Msg) { func (r *NatsReceiver) Close() { if r.nc != nil { - cclog.ComponentDebug("NatsReceiver", "CLOSE") + cclog.ComponentDebug(r.name, "CLOSE") r.nc.Close() } } diff --git a/receivers/receiveManager.go b/receivers/receiveManager.go index c570aa4..7141170 100644 --- a/receivers/receiveManager.go +++ b/receivers/receiveManager.go @@ -18,12 +18,12 @@ type receiveManager struct { output chan lp.CCMetric done chan bool wg *sync.WaitGroup - config []ReceiverConfig + config []json.RawMessage } type ReceiveManager interface { Init(wg *sync.WaitGroup, receiverConfigFile string) error - AddInput(rawConfig json.RawMessage) error + AddInput(name string, rawConfig json.RawMessage) error AddOutput(output chan lp.CCMetric) Start() Close() @@ -34,7 +34,7 @@ func (rm *receiveManager) Init(wg 
*sync.WaitGroup, receiverConfigFile string) er rm.output = nil rm.done = make(chan bool) rm.wg = wg - rm.config = make([]ReceiverConfig, 0) + rm.config = make([]json.RawMessage, 0) configFile, err := os.Open(receiverConfigFile) if err != nil { cclog.ComponentError("ReceiveManager", err.Error()) @@ -42,14 +42,14 @@ func (rm *receiveManager) Init(wg *sync.WaitGroup, receiverConfigFile string) er } defer configFile.Close() jsonParser := json.NewDecoder(configFile) - var rawConfigs []json.RawMessage + var rawConfigs map[string]json.RawMessage err = jsonParser.Decode(&rawConfigs) if err != nil { cclog.ComponentError("ReceiveManager", err.Error()) return err } - for _, raw := range rawConfigs { - rm.AddInput(raw) + for name, raw := range rawConfigs { + rm.AddInput(name, raw) } return nil } @@ -64,8 +64,8 @@ func (rm *receiveManager) Start() { cclog.ComponentDebug("ReceiveManager", "STARTED") } -func (rm *receiveManager) AddInput(rawConfig json.RawMessage) error { - var config ReceiverConfig +func (rm *receiveManager) AddInput(name string, rawConfig json.RawMessage) error { + var config defaultReceiverConfig err := json.Unmarshal(rawConfig, &config) if err != nil { cclog.ComponentError("ReceiveManager", "SKIP", config.Type, "JSON config error:", err.Error()) @@ -76,13 +76,13 @@ func (rm *receiveManager) AddInput(rawConfig json.RawMessage) error { return err } r := AvailableReceivers[config.Type] - err = r.Init(config) + err = r.Init(name, rawConfig) if err != nil { cclog.ComponentError("ReceiveManager", "SKIP", r.Name(), "initialization failed:", err.Error()) return err } rm.inputs = append(rm.inputs, r) - rm.config = append(rm.config, config) + rm.config = append(rm.config, rawConfig) cclog.ComponentDebug("ReceiveManager", "ADD RECEIVER", r.Name()) return nil } From f683f2e6da296de696d393be71a7483be3ae841a Mon Sep 17 00:00:00 2001 From: Thomas Gruber Date: Mon, 21 Feb 2022 13:29:33 +0100 Subject: [PATCH 174/174] Dynamically load liblikwid (#40) * Check whether LIKWID 
library is present * Generalize nan_to_zero option to invalid_to_zero including +Inf,+Inf and NaN * Remove double error printing and return if measurements do not work --- collectors/likwidMetric.go | 35 ++++++++++++++++++++++++----------- collectors/likwidMetric.md | 2 +- 2 files changed, 25 insertions(+), 12 deletions(-) diff --git a/collectors/likwidMetric.go b/collectors/likwidMetric.go index 3acd627..8626d7c 100644 --- a/collectors/likwidMetric.go +++ b/collectors/likwidMetric.go @@ -2,7 +2,7 @@ package collectors /* #cgo CFLAGS: -I./likwid -#cgo LDFLAGS: -L./likwid -llikwid -llikwid-hwloc -lm +#cgo LDFLAGS: -L./likwid -llikwid -llikwid-hwloc -lm -Wl,--unresolved-symbols=ignore-in-object-files #include #include */ @@ -25,6 +25,7 @@ import ( lp "github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric" topo "github.com/ClusterCockpit/cc-metric-collector/internal/ccTopology" agg "github.com/ClusterCockpit/cc-metric-collector/internal/metricAggregator" + "github.com/NVIDIA/go-nvml/pkg/dl" ) type MetricScope string @@ -69,6 +70,11 @@ func GetAllMetricScopes() []MetricScope { return []MetricScope{"cpu" /*, "core", "llc", "numadomain", "die",*/, "socket", "node"} } +const ( + LIKWID_LIB_NAME = "liblikwid.so" + LIKWID_LIB_DL_FLAGS = dl.RTLD_LAZY | dl.RTLD_GLOBAL +) + type LikwidCollectorMetricConfig struct { Name string `json:"name"` // Name of the metric Calc string `json:"calc"` // Calculation for the metric using @@ -88,7 +94,7 @@ type LikwidCollectorConfig struct { Eventsets []LikwidCollectorEventsetConfig `json:"eventsets"` Metrics []LikwidCollectorMetricConfig `json:"globalmetrics,omitempty"` ForceOverwrite bool `json:"force_overwrite,omitempty"` - NanToZero bool `json:"nan_to_zero,omitempty"` + InvalidToZero bool `json:"invalid_to_zero,omitempty"` } type LikwidCollector struct { @@ -260,6 +266,10 @@ func (m *LikwidCollector) Init(config json.RawMessage) error { return err } } + lib := dl.New(LIKWID_LIB_NAME, LIKWID_LIB_DL_FLAGS) + if lib == nil { + 
return fmt.Errorf("error instantiating DynamicLibrary for %s", LIKWID_LIB_NAME) + } if m.config.ForceOverwrite { cclog.ComponentDebug(m.name, "Set LIKWID_FORCE=1") os.Setenv("LIKWID_FORCE", "1") @@ -374,15 +384,13 @@ func (m *LikwidCollector) takeMeasurement(group int, interval time.Duration) err ret = C.perfmon_setupCounters(gid) if ret != 0 { gctr := C.GoString(C.perfmon_getGroupName(gid)) - err := fmt.Errorf("failed to setup performance group %s", gctr) - cclog.ComponentError(m.name, err.Error()) + err := fmt.Errorf("failed to setup performance group %d (%s)", gid, gctr) return err } ret = C.perfmon_startCounters() if ret != 0 { gctr := C.GoString(C.perfmon_getGroupName(gid)) - err := fmt.Errorf("failed to start performance group %s", gctr) - cclog.ComponentError(m.name, err.Error()) + err := fmt.Errorf("failed to start performance group %d (%s)", gid, gctr) return err } m.running = true @@ -391,8 +399,7 @@ func (m *LikwidCollector) takeMeasurement(group int, interval time.Duration) err ret = C.perfmon_stopCounters() if ret != 0 { gctr := C.GoString(C.perfmon_getGroupName(gid)) - err := fmt.Errorf("failed to stop performance group %s", gctr) - cclog.ComponentError(m.name, err.Error()) + err := fmt.Errorf("failed to stop performance group %d (%s)", gid, gctr) return err } return nil @@ -439,7 +446,10 @@ func (m *LikwidCollector) calcEventsetMetrics(group int, interval time.Duration, continue } m.mresults[group][tid][metric.Name] = value - if m.config.NanToZero && math.IsNaN(value) { + if m.config.InvalidToZero && math.IsNaN(value) { + value = 0.0 + } + if m.config.InvalidToZero && math.IsInf(value, 0) { value = 0.0 } // Now we have the result, send it with the proper tags @@ -483,7 +493,10 @@ func (m *LikwidCollector) calcGlobalMetrics(interval time.Duration, output chan continue } m.gmresults[tid][metric.Name] = value - if m.config.NanToZero && math.IsNaN(value) { + if m.config.InvalidToZero && math.IsNaN(value) { + value = 0.0 + } + if m.config.InvalidToZero && 
math.IsInf(value, 0) {
 			value = 0.0
 		}
 		// Now we have the result, send it with the proper tags
@@ -517,7 +530,7 @@ func (m *LikwidCollector) Read(interval time.Duration, output chan lp.CCMetric)
 			err := m.takeMeasurement(i, interval)
 			if err != nil {
 				cclog.ComponentError(m.name, err.Error())
-				continue
+				return
 			}
 			// read measurements and derive event set metrics
 			m.calcEventsetMetrics(i, interval, output)
diff --git a/collectors/likwidMetric.md b/collectors/likwidMetric.md
index 5c54bb6..8b5dee2 100644
--- a/collectors/likwidMetric.md
+++ b/collectors/likwidMetric.md
@@ -9,7 +9,7 @@ The `likwid` configuration consists of two parts, the "eventsets" and "globalmet
 Additional options:
 - `force_overwrite`: Same as setting `LIKWID_FORCE=1`. In case counters are already in-use, LIKWID overwrites their configuration to do its measurements
-- `nan_to_zero`: In some cases, the calculations result in `NaN`. With this option, all `NaN` values are replaces with `0.0`.
+- `invalid_to_zero`: In some cases, the calculations result in `NaN` or `Inf`. With this option, all `NaN` and `Inf` values are replaced with `0.0`.
 
 ### Available metric scopes