From 830b7de9ea847e85c5fa30b94b673392bd66aa53 Mon Sep 17 00:00:00 2001 From: Thomas Roehl Date: Thu, 6 Jan 2022 15:26:51 +0100 Subject: [PATCH 01/49] Cast collector measurement duration to seconds. Thanks to KIT --- collectors/likwidMetric.go | 2 +- metric-collector.go | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/collectors/likwidMetric.go b/collectors/likwidMetric.go index 2fd1129..34e2364 100644 --- a/collectors/likwidMetric.go +++ b/collectors/likwidMetric.go @@ -200,7 +200,7 @@ func (m *LikwidCollector) Read(interval time.Duration, out *[]lp.MutableMetric) res := C.perfmon_getLastResult(gid, eidx, C.int(tid)) m.results[i][tid][gctr] = float64(res) } - m.results[i][tid]["time"] = float64(interval) + m.results[i][tid]["time"] = interval.Seconds() m.results[i][tid]["inverseClock"] = float64(1.0 / m.basefreq) for _, metric := range evset.Metrics { expression, err := govaluate.NewEvaluableExpression(metric.Calc) diff --git a/metric-collector.go b/metric-collector.go index f6c8f5c..fd3b556 100644 --- a/metric-collector.go +++ b/metric-collector.go @@ -281,7 +281,7 @@ func main() { // storage locations for _, c := range config.Collectors { col := Collectors[c] - col.Read(time.Duration(config.Duration), &tmpPoints) + col.Read(time.Duration(config.Duration)*time.Second, &tmpPoints) for { if len(tmpPoints) == 0 { From bd831060c72de96b19f19e6abf64b6e1a49fdcb9 Mon Sep 17 00:00:00 2001 From: Holger Obermaier <40787752+ho-ob@users.noreply.github.com> Date: Wed, 19 Jan 2022 10:26:38 +0100 Subject: [PATCH 02/49] Add IB metrics ib_recv_pkts and ib_xmit_pkts --- collectors/infinibandMetric.go | 44 ++++++++++++++++++++++++++++++++++ 1 file changed, 44 insertions(+) diff --git a/collectors/infinibandMetric.go b/collectors/infinibandMetric.go index 93725d1..54c974e 100644 --- a/collectors/infinibandMetric.go +++ b/collectors/infinibandMetric.go @@ -47,6 +47,8 @@ func (m *InfinibandCollector) Help() { fmt.Println("Metrics:") fmt.Println("- ib_recv") 
fmt.Println("- ib_xmit") + fmt.Println("- ib_recv_pkts") + fmt.Println("- ib_xmit_pkts") } func (m *InfinibandCollector) Init(config []byte) error { @@ -143,6 +145,26 @@ func DoPerfQuery(cmd string, dev string, lid string, port string, tags map[strin } } } + if strings.HasPrefix(line, "PortRcvPkts") || strings.HasPrefix(line, "RcvPkts") { + lv := strings.Fields(line) + v, err := strconv.ParseFloat(lv[1], 64) + if err == nil { + y, err := lp.New("ib_recv_pkts", tags, map[string]interface{}{"value": float64(v)}, time.Now()) + if err == nil { + *out = append(*out, y) + } + } + } + if strings.HasPrefix(line, "PortXmitPkts") || strings.HasPrefix(line, "XmtPkts") { + lv := strings.Fields(line) + v, err := strconv.ParseFloat(lv[1], 64) + if err == nil { + y, err := lp.New("ib_xmit_pkts", tags, map[string]interface{}{"value": float64(v)}, time.Now()) + if err == nil { + *out = append(*out, y) + } + } + } } return nil } @@ -171,6 +193,28 @@ func DoSysfsRead(dev string, lid string, port string, tags map[string]string, ou } } } + buffer, err = ioutil.ReadFile(fmt.Sprintf("%s/port_rcv_packets", path)) + if err == nil { + data := strings.Replace(string(buffer), "\n", "", -1) + v, err := strconv.ParseFloat(data, 64) + if err == nil { + y, err := lp.New("ib_recv_pkts", tags, map[string]interface{}{"value": float64(v)}, time.Now()) + if err == nil { + *out = append(*out, y) + } + } + } + buffer, err = ioutil.ReadFile(fmt.Sprintf("%s/port_xmit_packets", path)) + if err == nil { + data := strings.Replace(string(buffer), "\n", "", -1) + v, err := strconv.ParseFloat(data, 64) + if err == nil { + y, err := lp.New("ib_xmit_pkts", tags, map[string]interface{}{"value": float64(v)}, time.Now()) + if err == nil { + *out = append(*out, y) + } + } + } return nil } From a6cc914b9937219166143480e5038b9d34ac91bf Mon Sep 17 00:00:00 2001 From: Holger Obermaier <40787752+ho-ob@users.noreply.github.com> Date: Wed, 19 Jan 2022 14:25:24 +0100 Subject: [PATCH 03/49] Add GPFS / IBM Spectrum Scale 
collector --- collectors/gpfs.go | 298 ++++++++++++++++++++++++++++++++++++++++++++ metric-collector.go | 1 + 2 files changed, 299 insertions(+) create mode 100644 collectors/gpfs.go diff --git a/collectors/gpfs.go b/collectors/gpfs.go new file mode 100644 index 0000000..14398b4 --- /dev/null +++ b/collectors/gpfs.go @@ -0,0 +1,298 @@ +package collectors + +import ( + "bufio" + "bytes" + "encoding/json" + "fmt" + "io/ioutil" + "log" + "os" + "os/exec" + "os/user" + "strconv" + "strings" + "time" + + lp "github.com/influxdata/line-protocol" +) + +type GpfsCollectorConfig struct { + Mmpmon string `json:"mmpmon"` +} + +type GpfsCollector struct { + MetricCollector + config GpfsCollectorConfig +} + +func (m *GpfsCollector) Init(config []byte) error { + var err error + m.name = "GpfsCollector" + m.setup() + + // Set default mmpmon binary + m.config.Mmpmon = "/usr/lpp/mmfs/bin/mmpmon" + + // Read JSON configuration + if len(config) > 0 { + err = json.Unmarshal(config, &m.config) + if err != nil { + log.Print(err.Error()) + return err + } + } + + // GPFS / IBM Spectrum Scale file system statistics can only be queried by user root + user, err := user.Current() + if err != nil { + return fmt.Errorf("GpfsCollector.Init(): Failed to get current user: %v", err) + } + if user.Uid != "0" { + return fmt.Errorf("GpfsCollector.Init(): GPFS file system statistics can only be queried by user root") + } + + // Check if mmpmon is in executable search path + _, err = exec.LookPath(m.config.Mmpmon) + if err != nil { + return fmt.Errorf("GpfsCollector.Init(): Failed to find mmpmon binary '%s': %v", m.config.Mmpmon, err) + } + + m.init = true + return nil +} + +func (m *GpfsCollector) Read(interval time.Duration, out *[]lp.MutableMetric) { + if !m.init { + return + } + + // mmpmon: + // -p: generate output that can be parsed + // -s: suppress the prompt on input + // fs_io_s: Displays I/O statistics per mounted file system + cmd := exec.Command(m.config.Mmpmon, "-p", "-s") + cmd.Stdin = 
strings.NewReader("once fs_io_s\n") + cmdStdout := new(bytes.Buffer) + cmdStderr := new(bytes.Buffer) + cmd.Stdout = cmdStdout + cmd.Stderr = cmdStderr + err := cmd.Run() + if err != nil { + fmt.Fprintf(os.Stderr, "GpfsCollector.Read(): Failed to execute command \"%s\": %s\n", cmd.String(), err.Error()) + fmt.Fprintf(os.Stderr, "GpfsCollector.Read(): command exit code: \"%d\"\n", cmd.ProcessState.ExitCode()) + data, _ := ioutil.ReadAll(cmdStderr) + fmt.Fprintf(os.Stderr, "GpfsCollector.Read(): command stderr: \"%s\"\n", string(data)) + data, _ = ioutil.ReadAll(cmdStdout) + fmt.Fprintf(os.Stderr, "GpfsCollector.Read(): command stdout: \"%s\"\n", string(data)) + return + } + + // Read I/O statistics + scanner := bufio.NewScanner(cmdStdout) + for scanner.Scan() { + lineSplit := strings.Fields(scanner.Text()) + if lineSplit[0] == "_fs_io_s_" { + key_value := make(map[string]string) + for i := 1; i < len(lineSplit); i += 2 { + key_value[lineSplit[i]] = lineSplit[i+1] + } + + // Ignore keys: + // _n_: node IP address, + // _nn_: node name, + // _cl_: cluster name, + // _d_: number of disks + + filesystem, ok := key_value["_fs_"] + if !ok { + fmt.Fprintf(os.Stderr, "GpfsCollector.Read(): Failed to get filesystem name.\n") + continue + } + + // return code + rc, err := strconv.Atoi(key_value["_rc_"]) + if err != nil { + fmt.Fprintf(os.Stderr, "GpfsCollector.Read(): Failed to convert return code: %s\n", err.Error()) + continue + } + if rc != 0 { + fmt.Fprintf(os.Stderr, "GpfsCollector.Read(): Filesystem %s not ok.", filesystem) + continue + } + + // unix epoch in microseconds + timestampInt, err := strconv.ParseInt(key_value["_t_"]+key_value["_tu_"], 10, 64) + timestamp := time.UnixMicro(timestampInt) + if err != nil { + fmt.Fprintf(os.Stderr, + "GpfsCollector.Read(): Failed to convert time stamp '%s': %s\n", + key_value["_t_"]+key_value["_tu_"], err.Error()) + continue + } + + // bytes read + bytesRead, err := strconv.ParseInt(key_value["_br_"], 10, 64) + if err != nil { + 
fmt.Fprintf(os.Stderr, + "GpfsCollector.Read(): Failed to convert bytes read '%s': %s\n", + key_value["_br_"], err.Error()) + continue + } + y, err := lp.New( + "gpfs_bytes_read", + map[string]string{ + "filesystem": filesystem, + }, + map[string]interface{}{ + "value": bytesRead, + }, + timestamp) + if err == nil { + *out = append(*out, y) + } + + // bytes written + bytesWritten, err := strconv.ParseInt(key_value["_bw_"], 10, 64) + if err != nil { + fmt.Fprintf(os.Stderr, + "GpfsCollector.Read(): Failed to convert bytes written '%s': %s\n", + key_value["_bw_"], err.Error()) + continue + } + y, err = lp.New( + "gpfs_bytes_written", + map[string]string{ + "filesystem": filesystem, + }, + map[string]interface{}{ + "value": bytesWritten, + }, + timestamp) + if err == nil { + *out = append(*out, y) + } + + // number of opens + numOpens, err := strconv.ParseInt(key_value["_oc_"], 10, 64) + if err != nil { + fmt.Fprintf(os.Stderr, + "GpfsCollector.Read(): Failed to convert number of opens '%s': %s\n", + key_value["_oc_"], err.Error()) + continue + } + y, err = lp.New( + "gpfs_num_opens", + map[string]string{ + "filesystem": filesystem, + }, + map[string]interface{}{ + "value": numOpens, + }, + timestamp) + if err == nil { + *out = append(*out, y) + } + + // number of closes + numCloses, err := strconv.ParseInt(key_value["_cc_"], 10, 64) + if err != nil { + fmt.Fprintf(os.Stderr, "GpfsCollector.Read(): Failed to convert number of closes: %s\n", err.Error()) + continue + } + y, err = lp.New( + "gpfs_num_closes", + map[string]string{ + "filesystem": filesystem, + }, + map[string]interface{}{ + "value": numCloses, + }, + timestamp) + if err == nil { + *out = append(*out, y) + } + + // number of reads + numReads, err := strconv.ParseInt(key_value["_rdc_"], 10, 64) + if err != nil { + fmt.Fprintf(os.Stderr, "GpfsCollector.Read(): Failed to convert number of reads: %s\n", err.Error()) + continue + } + y, err = lp.New( + "gpfs_num_reads", + map[string]string{ + "filesystem": 
filesystem, + }, + map[string]interface{}{ + "value": numReads, + }, + timestamp) + if err == nil { + *out = append(*out, y) + } + + // number of writes + numWrites, err := strconv.ParseInt(key_value["_wc_"], 10, 64) + if err != nil { + fmt.Fprintf(os.Stderr, "GpfsCollector.Read(): Failed to convert number of writes: %s\n", err.Error()) + continue + } + y, err = lp.New( + "gpfs_num_writes", + map[string]string{ + "filesystem": filesystem, + }, + map[string]interface{}{ + "value": numWrites, + }, + timestamp) + if err == nil { + *out = append(*out, y) + } + + // number of read directories + numReaddirs, err := strconv.ParseInt(key_value["_dir_"], 10, 64) + if err != nil { + fmt.Fprintf(os.Stderr, "GpfsCollector.Read(): Failed to convert number of read directories: %s\n", err.Error()) + continue + } + y, err = lp.New( + "gpfs_num_readdirs", + map[string]string{ + "filesystem": filesystem, + }, + map[string]interface{}{ + "value": numReaddirs, + }, + timestamp) + if err == nil { + *out = append(*out, y) + } + + // Number of inode updates + numInodeUpdates, err := strconv.ParseInt(key_value["_iu_"], 10, 64) + if err != nil { + fmt.Fprintf(os.Stderr, "GpfsCollector.Read(): Failed to convert Number of inode updates: %s\n", err.Error()) + continue + } + y, err = lp.New( + "gpfs_num_inode_updates", + map[string]string{ + "filesystem": filesystem, + }, + map[string]interface{}{ + "value": numInodeUpdates, + }, + timestamp) + if err == nil { + *out = append(*out, y) + } + } + } +} + +func (m *GpfsCollector) Close() { + m.init = false + return +} diff --git a/metric-collector.go b/metric-collector.go index fd3b556..0b75675 100644 --- a/metric-collector.go +++ b/metric-collector.go @@ -32,6 +32,7 @@ var Collectors = map[string]collectors.MetricGetter{ "diskstat": &collectors.DiskstatCollector{}, "tempstat": &collectors.TempCollector{}, "ipmistat": &collectors.IpmiCollector{}, + "gpfs": &collectors.GpfsCollector{}, } var Sinks = map[string]sinks.SinkFuncs{ From 
38cba10fb682d55a187dd063de34146e2e9fe01b Mon Sep 17 00:00:00 2001 From: Holger Obermaier <40787752+ho-ob@users.noreply.github.com> Date: Wed, 19 Jan 2022 14:47:59 +0100 Subject: [PATCH 04/49] Fix to work with golang 1.16 --- collectors/gpfs.go | 3 +++ 1 file changed, 3 insertions(+) diff --git a/collectors/gpfs.go b/collectors/gpfs.go index 14398b4..db8a0d0 100644 --- a/collectors/gpfs.go +++ b/collectors/gpfs.go @@ -121,6 +121,7 @@ func (m *GpfsCollector) Read(interval time.Duration, out *[]lp.MutableMetric) { continue } + /* requires go 1.17 // unix epoch in microseconds timestampInt, err := strconv.ParseInt(key_value["_t_"]+key_value["_tu_"], 10, 64) timestamp := time.UnixMicro(timestampInt) @@ -130,6 +131,8 @@ func (m *GpfsCollector) Read(interval time.Duration, out *[]lp.MutableMetric) { key_value["_t_"]+key_value["_tu_"], err.Error()) continue } + */ + timestamp := time.Now() // bytes read bytesRead, err := strconv.ParseInt(key_value["_br_"], 10, 64) From c5e90247dff2ad0caf10c6c67694c0935c55aac2 Mon Sep 17 00:00:00 2001 From: Holger Obermaier <40787752+ho-ob@users.noreply.github.com> Date: Wed, 19 Jan 2022 15:35:12 +0100 Subject: [PATCH 05/49] Drop domain part of host name --- metric-collector.go | 2 ++ 1 file changed, 2 insertions(+) diff --git a/metric-collector.go b/metric-collector.go index 0b75675..04c221f 100644 --- a/metric-collector.go +++ b/metric-collector.go @@ -166,6 +166,8 @@ func main() { log.Print(err) return } + // Drop domain part of host name + host = strings.SplitN(host, `.`, 2)[0] clicfg := ReadCli() err = CreatePidfile(clicfg["pidfile"]) err = SetLogging(clicfg["logfile"]) From caebca5609b4aaa4289b9e50771fc0104c31f9db Mon Sep 17 00:00:00 2001 From: Holger Obermaier <40787752+ho-ob@users.noreply.github.com> Date: Wed, 19 Jan 2022 15:55:15 +0100 Subject: [PATCH 06/49] Updated to latest stable version of likwid --- collectors/Makefile | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/collectors/Makefile 
b/collectors/Makefile index ab47caa..0c637b5 100644 --- a/collectors/Makefile +++ b/collectors/Makefile @@ -8,9 +8,9 @@ ACCESSMODE = direct # if CENTRAL_INSTALL == true ####################################################################### # Path to central installation (if CENTRAL_INSTALL=true) -LIKWID_BASE=/apps/likwid/5.2.0 -# LIKWID version (should be same major version as central installation, 5.1.x) -LIKWID_VERSION = 5.2.0 +LIKWID_BASE=/apps/likwid/5.2.1 +# LIKWID version (should be same major version as central installation, 5.2.x) +LIKWID_VERSION = 5.2.1 ####################################################################### # if CENTRAL_INSTALL == false and ACCESSMODE == accessdaemon From f229f59dd5019fb0d166cf291a882c4ab7389ebf Mon Sep 17 00:00:00 2001 From: Holger Obermaier <40787752+ho-ob@users.noreply.github.com> Date: Wed, 19 Jan 2022 16:41:32 +0100 Subject: [PATCH 07/49] Define source code dependencies in Makefile --- Makefile | 21 ++++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) diff --git a/Makefile b/Makefile index e82685e..f49162e 100644 --- a/Makefile +++ b/Makefile @@ -1,20 +1,27 @@ APP = cc-metric-collector +GOSRC_APP := metric-collector.go +GOSRC_COLLECTORS := $(wildcard collectors/*.go) +GOSRC_SINKS := $(wildcard sinks/*.go) +GOSRC_RECEIVERS := $(wildcard receivers/*.go) +GOSRC := $(GOSRC_APP) $(GOSRC_COLLECTORS) $(GOSRC_SINKS) $(GOSRC_RECEIVERS) +.PHONY: all all: $(APP) -$(APP): metric-collector.go +$(APP): $(GOSRC) make -C collectors go get - go build -o $(APP) metric-collector.go + go build -o $(APP) $(GOSRC_APP) +.PHONY: clean clean: make -C collectors clean rm -f $(APP) +.PHONY: fmt fmt: - go fmt collectors/*.go - go fmt sinks/*.go - go fmt receivers/*.go - go fmt metric-collector.go + go fmt $(GOSRC_COLLECTORS) + go fmt $(GOSRC_SINKS) + go fmt $(GOSRC_RECEIVERS) + go fmt $(GOSRC_APP) -.PHONY: clean From 59bf28f6065e05742420ac6995804bb3c9a5db89 Mon Sep 17 00:00:00 2001 From: Holger Obermaier 
<40787752+ho-ob@users.noreply.github.com> Date: Thu, 20 Jan 2022 12:09:22 +0100 Subject: [PATCH 08/49] Add vet and staticcheck make targets --- Makefile | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/Makefile b/Makefile index f49162e..91a1200 100644 --- a/Makefile +++ b/Makefile @@ -25,3 +25,14 @@ fmt: go fmt $(GOSRC_RECEIVERS) go fmt $(GOSRC_APP) +# Examine Go source code and reports suspicious constructs +.PHONY: vet + go vet ./... + + +# Run linter for the Go programming language. +# Using static analysis, it finds bugs and performance issues, offers simplifications, and enforces style rules +.PHONY: staticcheck +staticcheck: + go install honnef.co/go/tools/cmd/staticcheck@latest + $$(go env GOPATH)/bin/staticcheck ./... From c23c197982327edc7cd10cdfee7826ef8374cbb5 Mon Sep 17 00:00:00 2001 From: Holger Obermaier <40787752+ho-ob@users.noreply.github.com> Date: Thu, 20 Jan 2022 12:13:50 +0100 Subject: [PATCH 09/49] Add vet and staticcheck make targets --- Makefile | 1 + 1 file changed, 1 insertion(+) diff --git a/Makefile b/Makefile index 91a1200..892bbcc 100644 --- a/Makefile +++ b/Makefile @@ -27,6 +27,7 @@ fmt: # Examine Go source code and reports suspicious constructs .PHONY: vet +vet: go vet ./... 
From 458f6dec086422a0ba114b2c2a8900f24131c29a Mon Sep 17 00:00:00 2001 From: Holger Obermaier <40787752+ho-ob@users.noreply.github.com> Date: Thu, 20 Jan 2022 12:38:52 +0100 Subject: [PATCH 10/49] Avoid go vet warning: struct field tag `json:"..., omitempty"` not compatible with reflect.StructTag.Get: suspicious space in struct tag value --- collectors/cpustatMetric.go | 2 +- collectors/diskstatMetric.go | 2 +- collectors/infinibandMetric.go | 2 +- collectors/loadavgMetric.go | 2 +- collectors/nvidiaMetric.go | 4 ++-- 5 files changed, 6 insertions(+), 6 deletions(-) diff --git a/collectors/cpustatMetric.go b/collectors/cpustatMetric.go index fe31c3c..9e44fa7 100644 --- a/collectors/cpustatMetric.go +++ b/collectors/cpustatMetric.go @@ -13,7 +13,7 @@ import ( const CPUSTATFILE = `/proc/stat` type CpustatCollectorConfig struct { - ExcludeMetrics []string `json:"exclude_metrics, omitempty"` + ExcludeMetrics []string `json:"exclude_metrics,omitempty"` } type CpustatCollector struct { diff --git a/collectors/diskstatMetric.go b/collectors/diskstatMetric.go index e2d2f25..5080ca2 100644 --- a/collectors/diskstatMetric.go +++ b/collectors/diskstatMetric.go @@ -15,7 +15,7 @@ const DISKSTATFILE = `/proc/diskstats` const DISKSTAT_SYSFSPATH = `/sys/block` type DiskstatCollectorConfig struct { - ExcludeMetrics []string `json:"exclude_metrics, omitempty"` + ExcludeMetrics []string `json:"exclude_metrics,omitempty"` } type DiskstatCollector struct { diff --git a/collectors/infinibandMetric.go b/collectors/infinibandMetric.go index 54c974e..a9552f7 100644 --- a/collectors/infinibandMetric.go +++ b/collectors/infinibandMetric.go @@ -20,7 +20,7 @@ const LIDFILE = `/sys/class/infiniband/mlx4_0/ports/1/lid` const PERFQUERY = `/usr/sbin/perfquery` type InfinibandCollectorConfig struct { - ExcludeDevices []string `json:"exclude_devices, omitempty"` + ExcludeDevices []string `json:"exclude_devices,omitempty"` PerfQueryPath string `json:"perfquery_path"` } diff --git 
a/collectors/loadavgMetric.go b/collectors/loadavgMetric.go index dbccf22..21cf350 100644 --- a/collectors/loadavgMetric.go +++ b/collectors/loadavgMetric.go @@ -12,7 +12,7 @@ import ( const LOADAVGFILE = `/proc/loadavg` type LoadavgCollectorConfig struct { - ExcludeMetrics []string `json:"exclude_metrics, omitempty"` + ExcludeMetrics []string `json:"exclude_metrics,omitempty"` } type LoadavgCollector struct { diff --git a/collectors/nvidiaMetric.go b/collectors/nvidiaMetric.go index 4597610..bd63e2c 100644 --- a/collectors/nvidiaMetric.go +++ b/collectors/nvidiaMetric.go @@ -11,8 +11,8 @@ import ( ) type NvidiaCollectorConfig struct { - ExcludeMetrics []string `json:"exclude_metrics, omitempty"` - ExcludeDevices []string `json:"exclude_devices, omitempty"` + ExcludeMetrics []string `json:"exclude_metrics,omitempty"` + ExcludeDevices []string `json:"exclude_devices,omitempty"` } type NvidiaCollector struct { From 3a9ea0042c8a6e2c78be4559401e30a5cc615949 Mon Sep 17 00:00:00 2001 From: Holger Obermaier <40787752+ho-ob@users.noreply.github.com> Date: Thu, 20 Jan 2022 16:32:10 +0100 Subject: [PATCH 11/49] Correct go syntax in README.md --- collectors/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/collectors/README.md b/collectors/README.md index b5ae4e1..df02dd6 100644 --- a/collectors/README.md +++ b/collectors/README.md @@ -339,7 +339,7 @@ func (m *SampleCollector) Read(interval time.Duration, out *[]lp.MutableMetric) return } // tags for the metric, if type != node use proper type and type-id - tags := map[string][string]{"type" : "node"} + tags := map[string]string{"type" : "node"} // Each metric has exactly one field: value ! 
value := map[string]interface{}{"value": int(x)} y, err := lp.New("sample_metric", tags, value, time.Now()) From 611ac0fcb0d2754dffe2234a1c5bd995e786be7b Mon Sep 17 00:00:00 2001 From: Holger Obermaier <40787752+ho-ob@users.noreply.github.com> Date: Fri, 21 Jan 2022 09:59:57 +0100 Subject: [PATCH 12/49] Add CPU frequency collector --- collectors/cpufreqMetric.go | 189 ++++++++++++++++++++++++++++++++++++ metric-collector.go | 12 ++- 2 files changed, 196 insertions(+), 5 deletions(-) create mode 100644 collectors/cpufreqMetric.go diff --git a/collectors/cpufreqMetric.go b/collectors/cpufreqMetric.go new file mode 100644 index 0000000..94f8f4a --- /dev/null +++ b/collectors/cpufreqMetric.go @@ -0,0 +1,189 @@ +package collectors + +import ( + "bufio" + "encoding/json" + "fmt" + "log" + "os" + "path/filepath" + "strconv" + "strings" + "time" + + lp "github.com/influxdata/line-protocol" + "golang.org/x/sys/unix" +) + +var warnLog *log.Logger = log.New(os.Stderr, "Warning: ", log.LstdFlags) + +// +// readOneLine reads one line from a file. +// It returns ok when file was successfully read. +// In this case text contains the first line of the files contents. 
+// +func readOneLine(filename string) (text string, ok bool) { + file, err := os.Open(filename) + if err != nil { + return + } + defer file.Close() + scanner := bufio.NewScanner(file) + ok = scanner.Scan() + text = scanner.Text() + return +} + +type CPUFreqCollectorCPU struct { + // coreID, packageID, num_cores, num_package + tagSet map[string]string + scalingCurFreqFile string +} + +// +// CPUFreqCollector +// a metric collector to measure the current frequency of the CPUs +// as obtained from the hardware (in KHz) +// Only measure on the first hyper thread +// +// See: https://www.kernel.org/doc/html/latest/admin-guide/pm/cpufreq.html +// +type CPUFreqCollector struct { + MetricCollector + config struct { + ExcludeMetrics []string `json:"exclude_metrics,omitempty"` + } + cpus []CPUFreqCollectorCPU +} + +func (m *CPUFreqCollector) Init(config []byte) error { + m.name = "CPUFreqCollector" + m.setup() + if len(config) > 0 { + err := json.Unmarshal(config, &m.config) + if err != nil { + return err + } + } + + // Initialize CPU list + m.cpus = make([]CPUFreqCollectorCPU, 0) + + // Loop for all CPU directories + baseDir := "/sys/devices/system/cpu" + globPattern := filepath.Join(baseDir, "cpu[0-9]*") + cpuDirs, err := filepath.Glob(globPattern) + if err != nil { + return fmt.Errorf("CPUFreqCollector.Init() unable to glob files with pattern %s: %v", globPattern, err) + } + if cpuDirs == nil { + return fmt.Errorf("CPUFreqCollector.Init() unable to find any files with pattern %s", globPattern) + } + + maxPackageID := 0 + maxCoreID := 0 + for _, cpuDir := range cpuDirs { + cpuID := strings.TrimPrefix(cpuDir, "/sys/devices/system/cpu/cpu") + + // Read thread sibling list + threadSiblingListFile := filepath.Join(cpuDir, "topology", "thread_siblings_list") + threadSiblingList, ok := readOneLine(threadSiblingListFile) + if !ok { + return fmt.Errorf("CPUFreqCollector.Init() unable to read thread siblings list from %s", threadSiblingListFile) + } + + // Read frequency only from 
first hardware thread + // Ignore Simultaneous Multithreading (SMT) / Hyper-Threading + if strings.Split(threadSiblingList, ",")[0] == cpuID { + // Read package ID + packageIDFile := filepath.Join(cpuDir, "topology", "physical_package_id") + packageID, ok := readOneLine(packageIDFile) + if !ok { + return fmt.Errorf("CPUFreqCollector.Init() unable to read physical package ID from %s", packageIDFile) + } + packageID_int, err := strconv.Atoi(packageID) + if err != nil { + return fmt.Errorf("CPUFreqCollector.Init() unable to convert packageID to int: %v", err) + } + + // Update maxPackageID + if packageID_int > maxPackageID { + maxPackageID = packageID_int + } + + // Read core ID + coreIDFile := filepath.Join(cpuDir, "topology", "core_id") + coreID, ok := readOneLine(coreIDFile) + if !ok { + return fmt.Errorf("CPUFreqCollector.Init() unable to read core ID from %s", coreIDFile) + } + coreID_int, err := strconv.Atoi(coreID) + if err != nil { + return fmt.Errorf("CPUFreqCollector.Init() unable to convert coreID to int: %v", err) + } + + // Update maxCoreID + if coreID_int > maxCoreID { + maxCoreID = coreID_int + } + + // Check access to current frequency file + scalingCurFreqFile := filepath.Join(cpuDir, "cpufreq", "scaling_cur_freq") + err = unix.Access(scalingCurFreqFile, unix.R_OK) + if err != nil { + return fmt.Errorf("CPUFreqCollector.Init() unable to access %s: %v", scalingCurFreqFile, err) + } + + m.cpus = append( + m.cpus, + CPUFreqCollectorCPU{ + tagSet: map[string]string{ + "coreID": strings.TrimSpace(coreID), + "packageID": strings.TrimSpace(packageID), + }, + scalingCurFreqFile: scalingCurFreqFile, + }) + } + } + + // Add num packages and num cores as tags + numPackages := strconv.Itoa(maxPackageID + 1) + numCores := strconv.Itoa(maxCoreID + 1) + for i := range m.cpus { + m.cpus[i].tagSet["num_core"] = numCores + m.cpus[i].tagSet["num_package"] = numPackages + } + + m.init = true + return nil +} + +func (m *CPUFreqCollector) Read(interval time.Duration, out 
*[]lp.MutableMetric) { + if !m.init { + return + } + + for _, cpu := range m.cpus { + // Read current frequency + line, ok := readOneLine(cpu.scalingCurFreqFile) + if !ok { + warnLog.Printf("CPUFreqCollector.Read(): Failed to read one line from file '%s'", cpu.scalingCurFreqFile) + continue + } + cpuFreq, err := strconv.Atoi(line) + if err != nil { + warnLog.Printf("CPUFreqCollector.Read(): Failed to convert CPU frequency '%s': %v", line, err) + continue + } + + value := map[string]interface{}{"value": cpuFreq} + y, err := lp.New("cpufreq", cpu.tagSet, value, time.Now()) + if err == nil { + *out = append(*out, y) + } + } +} + +func (m *CPUFreqCollector) Close() { + m.init = false +} diff --git a/metric-collector.go b/metric-collector.go index 04c221f..90f50c4 100644 --- a/metric-collector.go +++ b/metric-collector.go @@ -4,16 +4,17 @@ import ( "encoding/json" "flag" "fmt" - "github.com/ClusterCockpit/cc-metric-collector/collectors" - "github.com/ClusterCockpit/cc-metric-collector/receivers" - "github.com/ClusterCockpit/cc-metric-collector/sinks" - lp "github.com/influxdata/line-protocol" "log" "os" "os/signal" "strings" "sync" "time" + + "github.com/ClusterCockpit/cc-metric-collector/collectors" + "github.com/ClusterCockpit/cc-metric-collector/receivers" + "github.com/ClusterCockpit/cc-metric-collector/sinks" + lp "github.com/influxdata/line-protocol" ) // List of provided collectors. 
Which collector should be run can be @@ -32,7 +33,8 @@ var Collectors = map[string]collectors.MetricGetter{ "diskstat": &collectors.DiskstatCollector{}, "tempstat": &collectors.TempCollector{}, "ipmistat": &collectors.IpmiCollector{}, - "gpfs": &collectors.GpfsCollector{}, + "gpfs": new(collectors.GpfsCollector), + "cpufreq": new(collectors.CPUFreqCollector), } var Sinks = map[string]sinks.SinkFuncs{ From 72722eff31443b852ba0effc2937a7ecbadb4eaa Mon Sep 17 00:00:00 2001 From: Holger Obermaier <40787752+ho-ob@users.noreply.github.com> Date: Fri, 21 Jan 2022 14:35:52 +0100 Subject: [PATCH 13/49] Avoid staticcheck warning: redundant return statement --- collectors/cpustatMetric.go | 4 ++-- collectors/customCmdMetric.go | 4 ++-- collectors/diskstatMetric.go | 6 +++--- collectors/{gpfs.go => gpfsMetric.go} | 9 +++------ collectors/infinibandMetric.go | 5 +++-- collectors/ipmiMetric.go | 4 ++-- collectors/likwidMetric.go | 6 +++--- collectors/loadavgMetric.go | 4 ++-- collectors/lustreMetric.go | 4 ++-- collectors/memstatMetric.go | 4 ++-- collectors/netstatMetric.go | 4 ++-- collectors/nvidiaMetric.go | 6 +++--- collectors/tempMetric.go | 4 ++-- collectors/topprocsMetric.go | 4 ++-- sinks/stdoutSink.go | 4 +--- 15 files changed, 34 insertions(+), 38 deletions(-) rename collectors/{gpfs.go => gpfsMetric.go} (98%) diff --git a/collectors/cpustatMetric.go b/collectors/cpustatMetric.go index 9e44fa7..64b5842 100644 --- a/collectors/cpustatMetric.go +++ b/collectors/cpustatMetric.go @@ -3,11 +3,12 @@ package collectors import ( "encoding/json" "fmt" - lp "github.com/influxdata/line-protocol" "io/ioutil" "strconv" "strings" "time" + + lp "github.com/influxdata/line-protocol" ) const CPUSTATFILE = `/proc/stat` @@ -88,5 +89,4 @@ func (m *CpustatCollector) Read(interval time.Duration, out *[]lp.MutableMetric) func (m *CpustatCollector) Close() { m.init = false - return } diff --git a/collectors/customCmdMetric.go b/collectors/customCmdMetric.go index 547bb87..bbafc2d 100644 --- 
a/collectors/customCmdMetric.go +++ b/collectors/customCmdMetric.go @@ -3,12 +3,13 @@ package collectors import ( "encoding/json" "errors" - lp "github.com/influxdata/line-protocol" "io/ioutil" "log" "os/exec" "strings" "time" + + lp "github.com/influxdata/line-protocol" ) const CUSTOMCMDPATH = `/home/unrz139/Work/cc-metric-collector/collectors/custom` @@ -126,5 +127,4 @@ func (m *CustomCmdCollector) Read(interval time.Duration, out *[]lp.MutableMetri func (m *CustomCmdCollector) Close() { m.init = false - return } diff --git a/collectors/diskstatMetric.go b/collectors/diskstatMetric.go index 5080ca2..4cbd3c6 100644 --- a/collectors/diskstatMetric.go +++ b/collectors/diskstatMetric.go @@ -1,8 +1,10 @@ package collectors import ( - lp "github.com/influxdata/line-protocol" "io/ioutil" + + lp "github.com/influxdata/line-protocol" + // "log" "encoding/json" "errors" @@ -107,10 +109,8 @@ func (m *DiskstatCollector) Read(interval time.Duration, out *[]lp.MutableMetric } } } - return } func (m *DiskstatCollector) Close() { m.init = false - return } diff --git a/collectors/gpfs.go b/collectors/gpfsMetric.go similarity index 98% rename from collectors/gpfs.go rename to collectors/gpfsMetric.go index db8a0d0..fbf3a63 100644 --- a/collectors/gpfs.go +++ b/collectors/gpfsMetric.go @@ -17,13 +17,11 @@ import ( lp "github.com/influxdata/line-protocol" ) -type GpfsCollectorConfig struct { - Mmpmon string `json:"mmpmon"` -} - type GpfsCollector struct { MetricCollector - config GpfsCollectorConfig + config struct { + Mmpmon string `json:"mmpmon"` + } } func (m *GpfsCollector) Init(config []byte) error { @@ -297,5 +295,4 @@ func (m *GpfsCollector) Read(interval time.Duration, out *[]lp.MutableMetric) { func (m *GpfsCollector) Close() { m.init = false - return } diff --git a/collectors/infinibandMetric.go b/collectors/infinibandMetric.go index a9552f7..6e14251 100644 --- a/collectors/infinibandMetric.go +++ b/collectors/infinibandMetric.go @@ -2,10 +2,12 @@ package collectors import 
( "fmt" - lp "github.com/influxdata/line-protocol" "io/ioutil" "log" "os/exec" + + lp "github.com/influxdata/line-protocol" + // "os" "encoding/json" "errors" @@ -278,5 +280,4 @@ func (m *InfinibandCollector) Read(interval time.Duration, out *[]lp.MutableMetr func (m *InfinibandCollector) Close() { m.init = false - return } diff --git a/collectors/ipmiMetric.go b/collectors/ipmiMetric.go index d28a134..3179148 100644 --- a/collectors/ipmiMetric.go +++ b/collectors/ipmiMetric.go @@ -3,13 +3,14 @@ package collectors import ( "encoding/json" "errors" - lp "github.com/influxdata/line-protocol" "log" "os" "os/exec" "strconv" "strings" "time" + + lp "github.com/influxdata/line-protocol" ) const IPMITOOL_PATH = `/usr/bin/ipmitool` @@ -133,5 +134,4 @@ func (m *IpmiCollector) Read(interval time.Duration, out *[]lp.MutableMetric) { func (m *IpmiCollector) Close() { m.init = false - return } diff --git a/collectors/likwidMetric.go b/collectors/likwidMetric.go index 34e2364..454a593 100644 --- a/collectors/likwidMetric.go +++ b/collectors/likwidMetric.go @@ -12,8 +12,6 @@ import ( "encoding/json" "errors" "fmt" - lp "github.com/influxdata/line-protocol" - "gopkg.in/Knetic/govaluate.v2" "io/ioutil" "log" "math" @@ -22,6 +20,9 @@ import ( "strings" "time" "unsafe" + + lp "github.com/influxdata/line-protocol" + "gopkg.in/Knetic/govaluate.v2" ) type LikwidCollectorMetricConfig struct { @@ -303,5 +304,4 @@ func (m *LikwidCollector) Close() { C.perfmon_finalize() C.topology_finalize() } - return } diff --git a/collectors/loadavgMetric.go b/collectors/loadavgMetric.go index 21cf350..1ecaea5 100644 --- a/collectors/loadavgMetric.go +++ b/collectors/loadavgMetric.go @@ -2,11 +2,12 @@ package collectors import ( "encoding/json" - lp "github.com/influxdata/line-protocol" "io/ioutil" "strconv" "strings" "time" + + lp "github.com/influxdata/line-protocol" ) const LOADAVGFILE = `/proc/loadavg` @@ -76,5 +77,4 @@ func (m *LoadavgCollector) Read(interval time.Duration, out *[]lp.MutableMetric) 
func (m *LoadavgCollector) Close() { m.init = false - return } diff --git a/collectors/lustreMetric.go b/collectors/lustreMetric.go index e7bb7a6..d77ac09 100644 --- a/collectors/lustreMetric.go +++ b/collectors/lustreMetric.go @@ -3,12 +3,13 @@ package collectors import ( "encoding/json" "errors" - lp "github.com/influxdata/line-protocol" "io/ioutil" "log" "strconv" "strings" "time" + + lp "github.com/influxdata/line-protocol" ) const LUSTREFILE = `/proc/fs/lustre/llite/lnec-XXXXXX/stats` @@ -102,5 +103,4 @@ func (m *LustreCollector) Read(interval time.Duration, out *[]lp.MutableMetric) func (m *LustreCollector) Close() { m.init = false - return } diff --git a/collectors/memstatMetric.go b/collectors/memstatMetric.go index 91987bb..17db13e 100644 --- a/collectors/memstatMetric.go +++ b/collectors/memstatMetric.go @@ -4,12 +4,13 @@ import ( "encoding/json" "errors" "fmt" - lp "github.com/influxdata/line-protocol" "io/ioutil" "log" "strconv" "strings" "time" + + lp "github.com/influxdata/line-protocol" ) const MEMSTATFILE = `/proc/meminfo` @@ -125,5 +126,4 @@ func (m *MemstatCollector) Read(interval time.Duration, out *[]lp.MutableMetric) func (m *MemstatCollector) Close() { m.init = false - return } diff --git a/collectors/netstatMetric.go b/collectors/netstatMetric.go index 659b89f..a273de1 100644 --- a/collectors/netstatMetric.go +++ b/collectors/netstatMetric.go @@ -2,12 +2,13 @@ package collectors import ( "encoding/json" - lp "github.com/influxdata/line-protocol" "io/ioutil" "log" "strconv" "strings" "time" + + lp "github.com/influxdata/line-protocol" ) const NETSTATFILE = `/proc/net/dev` @@ -84,5 +85,4 @@ func (m *NetstatCollector) Read(interval time.Duration, out *[]lp.MutableMetric) func (m *NetstatCollector) Close() { m.init = false - return } diff --git a/collectors/nvidiaMetric.go b/collectors/nvidiaMetric.go index bd63e2c..31118c2 100644 --- a/collectors/nvidiaMetric.go +++ b/collectors/nvidiaMetric.go @@ -4,10 +4,11 @@ import ( "encoding/json" "errors" 
"fmt" - "github.com/NVIDIA/go-nvml/pkg/nvml" - lp "github.com/influxdata/line-protocol" "log" "time" + + "github.com/NVIDIA/go-nvml/pkg/nvml" + lp "github.com/influxdata/line-protocol" ) type NvidiaCollectorConfig struct { @@ -267,5 +268,4 @@ func (m *NvidiaCollector) Close() { nvml.Shutdown() m.init = false } - return } diff --git a/collectors/tempMetric.go b/collectors/tempMetric.go index 3665025..b074d78 100644 --- a/collectors/tempMetric.go +++ b/collectors/tempMetric.go @@ -3,13 +3,14 @@ package collectors import ( "encoding/json" "fmt" - lp "github.com/influxdata/line-protocol" "io/ioutil" "os" "path/filepath" "strconv" "strings" "time" + + lp "github.com/influxdata/line-protocol" ) const HWMON_PATH = `/sys/class/hwmon` @@ -105,5 +106,4 @@ func (m *TempCollector) Read(interval time.Duration, out *[]lp.MutableMetric) { func (m *TempCollector) Close() { m.init = false - return } diff --git a/collectors/topprocsMetric.go b/collectors/topprocsMetric.go index a1bf989..e1b31ee 100644 --- a/collectors/topprocsMetric.go +++ b/collectors/topprocsMetric.go @@ -4,11 +4,12 @@ import ( "encoding/json" "errors" "fmt" - lp "github.com/influxdata/line-protocol" "log" "os/exec" "strings" "time" + + lp "github.com/influxdata/line-protocol" ) const MAX_NUM_PROCS = 10 @@ -74,5 +75,4 @@ func (m *TopProcsCollector) Read(interval time.Duration, out *[]lp.MutableMetric func (m *TopProcsCollector) Close() { m.init = false - return } diff --git a/sinks/stdoutSink.go b/sinks/stdoutSink.go index 34561e0..8016fcb 100644 --- a/sinks/stdoutSink.go +++ b/sinks/stdoutSink.go @@ -59,6 +59,4 @@ func (s *StdoutSink) Flush() error { return nil } -func (s *StdoutSink) Close() { - return -} +func (s *StdoutSink) Close() {} From 5987901005d50e60a2a6a3f35c64bb78ef5f9068 Mon Sep 17 00:00:00 2001 From: Holger Obermaier <40787752+ho-ob@users.noreply.github.com> Date: Fri, 21 Jan 2022 15:20:53 +0100 Subject: [PATCH 14/49] Avoid staticcheck warning: unnecessary assignment to the blank identifier --- 
collectors/likwidMetric.go | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/collectors/likwidMetric.go b/collectors/likwidMetric.go index 454a593..45fe68c 100644 --- a/collectors/likwidMetric.go +++ b/collectors/likwidMetric.go @@ -152,7 +152,7 @@ func (m *LikwidCollector) Init(config []byte) error { C.free(unsafe.Pointer(cstr)) m.results[i] = make(map[int]map[string]interface{}) m.mresults[i] = make(map[int]map[string]float64) - for tid, _ := range m.cpulist { + for tid := range m.cpulist { m.results[i][tid] = make(map[string]interface{}) m.mresults[i][tid] = make(map[string]float64) m.gmresults[tid] = make(map[string]float64) @@ -194,7 +194,7 @@ func (m *LikwidCollector) Read(interval time.Duration, out *[]lp.MutableMetric) continue } var eidx C.int - for tid, _ := range m.cpulist { + for tid := range m.cpulist { for eidx = 0; int(eidx) < len(evset.Events); eidx++ { ctr := C.perfmon_getCounterName(gid, eidx) gctr := C.GoString(ctr) @@ -220,7 +220,7 @@ func (m *LikwidCollector) Read(interval time.Duration, out *[]lp.MutableMetric) } for _, metric := range m.config.Metrics { - for tid, _ := range m.cpulist { + for tid := range m.cpulist { var params map[string]interface{} expression, err := govaluate.NewEvaluableExpression(metric.Calc) if err != nil { @@ -228,7 +228,7 @@ func (m *LikwidCollector) Read(interval time.Duration, out *[]lp.MutableMetric) continue } params = make(map[string]interface{}) - for j, _ := range m.groups { + for j := range m.groups { for mname, mres := range m.mresults[j][tid] { params[mname] = mres } @@ -241,7 +241,7 @@ func (m *LikwidCollector) Read(interval time.Duration, out *[]lp.MutableMetric) m.gmresults[tid][metric.Name] = float64(result.(float64)) } } - for i, _ := range m.groups { + for i := range m.groups { evset := m.config.Eventsets[i] for _, metric := range evset.Metrics { _, skip := stringArrayContains(m.config.ExcludeMetrics, metric.Name) From bcce471b271cd70c371de1855184afebb6a7c585 Mon Sep 17 
00:00:00 2001 From: Holger Obermaier <40787752+ho-ob@users.noreply.github.com> Date: Mon, 24 Jan 2022 11:31:45 +0100 Subject: [PATCH 15/49] Simplified code --- collectors/cpufreqMetric.go | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/collectors/cpufreqMetric.go b/collectors/cpufreqMetric.go index 94f8f4a..f5a10bc 100644 --- a/collectors/cpufreqMetric.go +++ b/collectors/cpufreqMetric.go @@ -15,8 +15,6 @@ import ( "golang.org/x/sys/unix" ) -var warnLog *log.Logger = log.New(os.Stderr, "Warning: ", log.LstdFlags) - // // readOneLine reads one line from a file. // It returns ok when file was successfully read. @@ -138,7 +136,8 @@ func (m *CPUFreqCollector) Init(config []byte) error { m.cpus, CPUFreqCollectorCPU{ tagSet: map[string]string{ - "coreID": strings.TrimSpace(coreID), + "type": "cpu", + "type-id": strings.TrimSpace(coreID), "packageID": strings.TrimSpace(packageID), }, scalingCurFreqFile: scalingCurFreqFile, @@ -150,8 +149,9 @@ func (m *CPUFreqCollector) Init(config []byte) error { numPackages := strconv.Itoa(maxPackageID + 1) numCores := strconv.Itoa(maxCoreID + 1) for i := range m.cpus { - m.cpus[i].tagSet["num_core"] = numCores - m.cpus[i].tagSet["num_package"] = numPackages + c := &m.cpus[i] + c.tagSet["num_core"] = numCores + c.tagSet["num_package"] = numPackages } m.init = true @@ -163,21 +163,23 @@ func (m *CPUFreqCollector) Read(interval time.Duration, out *[]lp.MutableMetric) return } - for _, cpu := range m.cpus { + now := time.Now() + for i := range m.cpus { + cpu := &m.cpus[i] + // Read current frequency line, ok := readOneLine(cpu.scalingCurFreqFile) if !ok { - warnLog.Printf("CPUFreqCollector.Read(): Failed to read one line from file '%s'", cpu.scalingCurFreqFile) + log.Printf("CPUFreqCollector.Read(): Failed to read one line from file '%s'", cpu.scalingCurFreqFile) continue } cpuFreq, err := strconv.Atoi(line) if err != nil { - warnLog.Printf("CPUFreqCollector.Read(): Failed to convert CPU frequency 
'%s': %v", line, err) + log.Printf("CPUFreqCollector.Read(): Failed to convert CPU frequency '%s': %v", line, err) continue } - value := map[string]interface{}{"value": cpuFreq} - y, err := lp.New("cpufreq", cpu.tagSet, value, time.Now()) + y, err := lp.New("cpufreq", cpu.tagSet, map[string]interface{}{"value": cpuFreq}, now) if err == nil { *out = append(*out, y) } From f84f7de05c7a479a9d7782c8f9f485752401d061 Mon Sep 17 00:00:00 2001 From: Holger Obermaier <40787752+ho-ob@users.noreply.github.com> Date: Mon, 24 Jan 2022 13:10:33 +0100 Subject: [PATCH 16/49] Add CPUFreqCollectorCpuinfo a metric collector to measure the current frequency of the CPUs as obtained from /proc/cpuinfo Only measure on the first hyperthread --- collectors/cpufreqCpuinfoMetric.go | 176 +++++++++++++++++++++++++++++ metric-collector.go | 31 ++--- 2 files changed, 192 insertions(+), 15 deletions(-) create mode 100644 collectors/cpufreqCpuinfoMetric.go diff --git a/collectors/cpufreqCpuinfoMetric.go b/collectors/cpufreqCpuinfoMetric.go new file mode 100644 index 0000000..1658878 --- /dev/null +++ b/collectors/cpufreqCpuinfoMetric.go @@ -0,0 +1,176 @@ +package collectors + +import ( + "bufio" + "fmt" + "log" + "os" + "strconv" + "strings" + "time" + + lp "github.com/influxdata/line-protocol" +) + +// +// CPUFreqCollector +// a metric collector to measure the current frequency of the CPUs +// as obtained from /proc/cpuinfo +// Only measure on the first hyperthread +// +type CPUFreqCpuInfoCollectorTopology struct { + processor string // logical processor number (continuous, starting at 0) + coreID string // socket local core ID + physicalID string // socket / package ID + numPhysicalID string // number of sockets / packages + isHT bool + numNonHT string // number of non hyperthreading processors + tagSet map[string]string +} + +type CPUFreqCpuInfoCollector struct { + MetricCollector + topology []CPUFreqCpuInfoCollectorTopology +} + +func (m *CPUFreqCpuInfoCollector) Init(config []byte) error { + 
m.name = "CPUFreqCpuInfoCollector" + + const cpuInfoFile = "/proc/cpuinfo" + file, err := os.Open(cpuInfoFile) + if err != nil { + return fmt.Errorf("Failed to open '%s': %v", cpuInfoFile, err) + } + defer file.Close() + + // Collect topology information from file cpuinfo + foundFreq := false + processor := "" + numNonHT := 0 + coreID := "" + physicalID := "" + maxPhysicalID := 0 + m.topology = make([]CPUFreqCpuInfoCollectorTopology, 0) + coreSeenBefore := make(map[string]bool) + scanner := bufio.NewScanner(file) + for scanner.Scan() { + lineSplit := strings.Split(scanner.Text(), ":") + if len(lineSplit) == 2 { + key := strings.TrimSpace(lineSplit[0]) + value := strings.TrimSpace(lineSplit[1]) + switch key { + case "cpu MHz": + // frequency + foundFreq = true + case "processor": + processor = value + case "core id": + coreID = value + case "physical id": + physicalID = value + } + } + + // were all topology information collected? + if foundFreq && + len(processor) > 0 && + len(coreID) > 0 && + len(physicalID) > 0 { + + globalID := physicalID + ":" + coreID + isHT := coreSeenBefore[globalID] + coreSeenBefore[globalID] = true + if !isHT { + // increase number on non hyper thread cores + numNonHT++ + + // increase maximun socket / package ID, when required + physicalIDInt, err := strconv.Atoi(physicalID) + if err != nil { + return fmt.Errorf("Failed to convert physical id to int: %v", err) + } + if physicalIDInt > maxPhysicalID { + maxPhysicalID = physicalIDInt + } + } + + // store collected topology information + m.topology = append( + m.topology, + CPUFreqCpuInfoCollectorTopology{ + processor: processor, + coreID: coreID, + physicalID: physicalID, + isHT: isHT, + }) + + // reset topology information + foundFreq = false + processor = "" + coreID = "" + physicalID = "" + } + } + + numPhysicalID := fmt.Sprint(maxPhysicalID + 1) + numNonHTString := fmt.Sprint(numNonHT) + for i := range m.topology { + t := &m.topology[i] + t.numPhysicalID = numPhysicalID + t.numNonHT = 
numNonHTString + t.tagSet = map[string]string{ + "type": "cpu", + "type-id": t.processor, + "num_core": t.numNonHT, + "package_id": t.physicalID, + "num_package": t.numPhysicalID, + } + } + + m.init = true + return nil +} + +func (m *CPUFreqCpuInfoCollector) Read(interval time.Duration, out *[]lp.MutableMetric) { + if !m.init { + return + } + const cpuInfoFile = "/proc/cpuinfo" + file, err := os.Open(cpuInfoFile) + if err != nil { + log.Printf("Failed to open '%s': %v", cpuInfoFile, err) + return + } + defer file.Close() + + processorCounter := 0 + now := time.Now() + scanner := bufio.NewScanner(file) + for scanner.Scan() { + lineSplit := strings.Split(scanner.Text(), ":") + if len(lineSplit) == 2 { + key := strings.TrimSpace(lineSplit[0]) + + // frequency + if key == "cpu MHz" { + t := &m.topology[processorCounter] + if !t.isHT { + value, err := strconv.ParseFloat(strings.TrimSpace(lineSplit[1]), 64) + if err != nil { + log.Printf("Failed to convert cpu MHz to float: %v", err) + return + } + y, err := lp.New("cpufreq", t.tagSet, map[string]interface{}{"value": value}, now) + if err == nil { + *out = append(*out, y) + } + } + processorCounter++ + } + } + } +} + +func (m *CPUFreqCpuInfoCollector) Close() { + m.init = false +} diff --git a/metric-collector.go b/metric-collector.go index 90f50c4..02a2b21 100644 --- a/metric-collector.go +++ b/metric-collector.go @@ -20,21 +20,22 @@ import ( // List of provided collectors. Which collector should be run can be // configured at 'collectors' list in 'config.json'. 
var Collectors = map[string]collectors.MetricGetter{ - "likwid": &collectors.LikwidCollector{}, - "loadavg": &collectors.LoadavgCollector{}, - "memstat": &collectors.MemstatCollector{}, - "netstat": &collectors.NetstatCollector{}, - "ibstat": &collectors.InfinibandCollector{}, - "lustrestat": &collectors.LustreCollector{}, - "cpustat": &collectors.CpustatCollector{}, - "topprocs": &collectors.TopProcsCollector{}, - "nvidia": &collectors.NvidiaCollector{}, - "customcmd": &collectors.CustomCmdCollector{}, - "diskstat": &collectors.DiskstatCollector{}, - "tempstat": &collectors.TempCollector{}, - "ipmistat": &collectors.IpmiCollector{}, - "gpfs": new(collectors.GpfsCollector), - "cpufreq": new(collectors.CPUFreqCollector), + "likwid": &collectors.LikwidCollector{}, + "loadavg": &collectors.LoadavgCollector{}, + "memstat": &collectors.MemstatCollector{}, + "netstat": &collectors.NetstatCollector{}, + "ibstat": &collectors.InfinibandCollector{}, + "lustrestat": &collectors.LustreCollector{}, + "cpustat": &collectors.CpustatCollector{}, + "topprocs": &collectors.TopProcsCollector{}, + "nvidia": &collectors.NvidiaCollector{}, + "customcmd": &collectors.CustomCmdCollector{}, + "diskstat": &collectors.DiskstatCollector{}, + "tempstat": &collectors.TempCollector{}, + "ipmistat": &collectors.IpmiCollector{}, + "gpfs": new(collectors.GpfsCollector), + "cpufreq": new(collectors.CPUFreqCollector), + "cpufreq_cpuinfo": new(collectors.CPUFreqCpuInfoCollector), } var Sinks = map[string]sinks.SinkFuncs{ From f0a62152fd06f63bbfffc022a72f368969d7c816 Mon Sep 17 00:00:00 2001 From: Holger Obermaier <40787752+ho-ob@users.noreply.github.com> Date: Mon, 24 Jan 2022 15:55:15 +0100 Subject: [PATCH 17/49] Update GitHub actions --- .github/ci-config.json | 23 +++++++++++++---------- .github/workflows/runonce.yml | 5 ++++- 2 files changed, 17 insertions(+), 11 deletions(-) diff --git a/.github/ci-config.json b/.github/ci-config.json index b3fbff1..402388d 100644 --- a/.github/ci-config.json 
+++ b/.github/ci-config.json @@ -21,7 +21,10 @@ "topprocs", "nvidia", "diskstat", - "ipmistat" + "ipmistat", + "gpfs", + "cpufreq", + "cpufreq_cpuinfo" ], "default_tags": { "cluster": "testcluster" @@ -30,20 +33,20 @@ "type": "none" }, "collect_config": { - "topprocs" : { + "topprocs": { "num_procs": 2 - }, + }, "tempstat": { "tag_override": { - "hwmon0" : { - "type" : "socket", - "type-id" : "0" + "hwmon0": { + "type": "socket", + "type-id": "0" }, - "hwmon1" : { - "type" : "socket", - "type-id" : "1" + "hwmon1": { + "type": "socket", + "type-id": "1" } } } } -} +} \ No newline at end of file diff --git a/.github/workflows/runonce.yml b/.github/workflows/runonce.yml index 8efc70a..194710f 100644 --- a/.github/workflows/runonce.yml +++ b/.github/workflows/runonce.yml @@ -7,8 +7,11 @@ jobs: steps: - uses: actions/checkout@v2 + # See: https://github.com/marketplace/actions/setup-go-environment - name: Setup Golang - uses: actions/setup-go@v2.1.4 + uses: actions/setup-go@v2.1.5 + with: + go-version: '^1.17.6' - name: Build MetricCollector run: make From 9157fdbab2e3b53c7921e6cfcbe28227f2b7c913 Mon Sep 17 00:00:00 2001 From: Holger Obermaier <40787752+ho-ob@users.noreply.github.com> Date: Mon, 24 Jan 2022 20:22:08 +0100 Subject: [PATCH 18/49] Fixed topology detection --- collectors/cpufreqMetric.go | 175 +++++++++++++++++++++--------------- 1 file changed, 101 insertions(+), 74 deletions(-) diff --git a/collectors/cpufreqMetric.go b/collectors/cpufreqMetric.go index f5a10bc..ec42445 100644 --- a/collectors/cpufreqMetric.go +++ b/collectors/cpufreqMetric.go @@ -32,10 +32,19 @@ func readOneLine(filename string) (text string, ok bool) { return } -type CPUFreqCollectorCPU struct { - // coreID, packageID, num_cores, num_package - tagSet map[string]string +type CPUFreqCollectorTopology struct { + processor string // logical processor number (continuous, starting at 0) + coreID string // socket local core ID + coreID_int int + physicalID string // socket / package ID + 
physicalID_int int + numPhysicalID string // number of sockets / packages + numPhysicalID_int int + isHT bool + numNonHT string // number of non hyperthreading processors + numNonHT_int int scalingCurFreqFile string + tagSet map[string]string } // @@ -48,10 +57,10 @@ type CPUFreqCollectorCPU struct { // type CPUFreqCollector struct { MetricCollector - config struct { + topology []CPUFreqCollectorTopology + config struct { ExcludeMetrics []string `json:"exclude_metrics,omitempty"` } - cpus []CPUFreqCollectorCPU } func (m *CPUFreqCollector) Init(config []byte) error { @@ -64,9 +73,6 @@ func (m *CPUFreqCollector) Init(config []byte) error { } } - // Initialize CPU list - m.cpus = make([]CPUFreqCollectorCPU, 0) - // Loop for all CPU directories baseDir := "/sys/devices/system/cpu" globPattern := filepath.Join(baseDir, "cpu[0-9]*") @@ -78,82 +84,98 @@ func (m *CPUFreqCollector) Init(config []byte) error { return fmt.Errorf("CPUFreqCollector.Init() unable to find any files with pattern %s", globPattern) } - maxPackageID := 0 - maxCoreID := 0 + // Initialize CPU topology + m.topology = make([]CPUFreqCollectorTopology, len(cpuDirs)) for _, cpuDir := range cpuDirs { - cpuID := strings.TrimPrefix(cpuDir, "/sys/devices/system/cpu/cpu") + processor := strings.TrimPrefix(cpuDir, "/sys/devices/system/cpu/cpu") + processor_int, err := strconv.Atoi(processor) + if err != nil { + return fmt.Errorf("CPUFreqCollector.Init() unable to convert cpuID to int: %v", err) + } - // Read thread sibling list - threadSiblingListFile := filepath.Join(cpuDir, "topology", "thread_siblings_list") - threadSiblingList, ok := readOneLine(threadSiblingListFile) + // Read package ID + packageIDFile := filepath.Join(cpuDir, "topology", "physical_package_id") + packageID, ok := readOneLine(packageIDFile) if !ok { - return fmt.Errorf("CPUFreqCollector.Init() unable to read thread siblings list from %s", threadSiblingListFile) + return fmt.Errorf("CPUFreqCollector.Init() unable to read physical package ID 
from %s", packageIDFile) + } + packageID_int, err := strconv.Atoi(packageID) + if err != nil { + return fmt.Errorf("CPUFreqCollector.Init() unable to convert packageID to int: %v", err) } - // Read frequency only from first hardware thread - // Ignore Simultaneous Multithreading (SMT) / Hyper-Threading - if strings.Split(threadSiblingList, ",")[0] == cpuID { - // Read package ID - packageIDFile := filepath.Join(cpuDir, "topology", "physical_package_id") - packageID, ok := readOneLine(packageIDFile) - if !ok { - return fmt.Errorf("CPUFreqCollector.Init() unable to read physical package ID from %s", packageIDFile) - } - packageID_int, err := strconv.Atoi(packageID) - if err != nil { - return fmt.Errorf("CPUFreqCollector.Init() unable to convert packageID to int: %v", err) - } + // Read core ID + coreIDFile := filepath.Join(cpuDir, "topology", "core_id") + coreID, ok := readOneLine(coreIDFile) + if !ok { + return fmt.Errorf("CPUFreqCollector.Init() unable to read core ID from %s", coreIDFile) + } + coreID_int, err := strconv.Atoi(coreID) + if err != nil { + return fmt.Errorf("CPUFreqCollector.Init() unable to convert coreID to int: %v", err) + } - // Update maxPackageID - if packageID_int > maxPackageID { - maxPackageID = packageID_int - } + // Check access to current frequency file + scalingCurFreqFile := filepath.Join(cpuDir, "cpufreq", "scaling_cur_freq") + err = unix.Access(scalingCurFreqFile, unix.R_OK) + if err != nil { + return fmt.Errorf("CPUFreqCollector.Init() unable to access %s: %v", scalingCurFreqFile, err) + } - // Read core ID - coreIDFile := filepath.Join(cpuDir, "topology", "core_id") - coreID, ok := readOneLine(coreIDFile) - if !ok { - return fmt.Errorf("CPUFreqCollector.Init() unable to read core ID from %s", coreIDFile) - } - coreID_int, err := strconv.Atoi(coreID) - if err != nil { - return fmt.Errorf("CPUFreqCollector.Init() unable to convert coreID to int: %v", err) - } + t := &m.topology[processor_int] + t.processor = processor + t.physicalID = 
packageID + t.physicalID_int = packageID_int + t.coreID = coreID + t.coreID_int = coreID_int + t.scalingCurFreqFile = scalingCurFreqFile + } - // Update maxCoreID - if coreID_int > maxCoreID { - maxCoreID = coreID_int - } + // is processor a hyperthread? + coreSeenBefore := make(map[string]bool) + for i := range m.topology { + t := &m.topology[i] - // Check access to current frequency file - scalingCurFreqFile := filepath.Join(cpuDir, "cpufreq", "scaling_cur_freq") - err = unix.Access(scalingCurFreqFile, unix.R_OK) - if err != nil { - return fmt.Errorf("CPUFreqCollector.Init() unable to access %s: %v", scalingCurFreqFile, err) - } + globalID := t.physicalID + ":" + t.coreID + t.isHT = coreSeenBefore[globalID] + coreSeenBefore[globalID] = true + } - m.cpus = append( - m.cpus, - CPUFreqCollectorCPU{ - tagSet: map[string]string{ - "type": "cpu", - "type-id": strings.TrimSpace(coreID), - "packageID": strings.TrimSpace(packageID), - }, - scalingCurFreqFile: scalingCurFreqFile, - }) + // number of non hyper thread cores and packages / sockets + numNonHT_int := 0 + maxPhysicalID := 0 + for i := range m.topology { + t := &m.topology[i] + + // Update maxPackageID + if t.physicalID_int > maxPhysicalID { + maxPhysicalID = t.physicalID_int + } + + if !t.isHT { + numNonHT_int++ } } - // Add num packages and num cores as tags - numPackages := strconv.Itoa(maxPackageID + 1) - numCores := strconv.Itoa(maxCoreID + 1) - for i := range m.cpus { - c := &m.cpus[i] - c.tagSet["num_core"] = numCores - c.tagSet["num_package"] = numPackages + numPhysicalID_int := maxPhysicalID + 1 + numPhysicalID := fmt.Sprint(numPhysicalID_int) + numNonHT := fmt.Sprint(numNonHT_int) + for i := range m.topology { + t := &m.topology[i] + t.numPhysicalID = numPhysicalID + t.numPhysicalID_int = numPhysicalID_int + t.numNonHT = numNonHT + t.numNonHT_int = numNonHT_int + t.tagSet = map[string]string{ + "type": "cpu", + "type-id": t.processor, + "num_core": t.numNonHT, + "package_id": t.physicalID, + 
"num_package": t.numPhysicalID, + } } + fmt.Printf("%+v\n", m.topology) m.init = true return nil } @@ -164,13 +186,18 @@ func (m *CPUFreqCollector) Read(interval time.Duration, out *[]lp.MutableMetric) } now := time.Now() - for i := range m.cpus { - cpu := &m.cpus[i] + for i := range m.topology { + t := &m.topology[i] + + // skip hyperthreads + if t.isHT { + continue + } // Read current frequency - line, ok := readOneLine(cpu.scalingCurFreqFile) + line, ok := readOneLine(t.scalingCurFreqFile) if !ok { - log.Printf("CPUFreqCollector.Read(): Failed to read one line from file '%s'", cpu.scalingCurFreqFile) + log.Printf("CPUFreqCollector.Read(): Failed to read one line from file '%s'", t.scalingCurFreqFile) continue } cpuFreq, err := strconv.Atoi(line) @@ -179,7 +206,7 @@ func (m *CPUFreqCollector) Read(interval time.Duration, out *[]lp.MutableMetric) continue } - y, err := lp.New("cpufreq", cpu.tagSet, map[string]interface{}{"value": cpuFreq}, now) + y, err := lp.New("cpufreq", t.tagSet, map[string]interface{}{"value": cpuFreq}, now) if err == nil { *out = append(*out, y) } From 3d377760b86071af8b50d9134bc106a326970229 Mon Sep 17 00:00:00 2001 From: Holger Obermaier <40787752+ho-ob@users.noreply.github.com> Date: Mon, 24 Jan 2022 22:03:13 +0100 Subject: [PATCH 19/49] Refactoring --- collectors/cpufreqMetric.go | 51 ++++++++++++++++++------------------- 1 file changed, 25 insertions(+), 26 deletions(-) diff --git a/collectors/cpufreqMetric.go b/collectors/cpufreqMetric.go index ec42445..35e64ac 100644 --- a/collectors/cpufreqMetric.go +++ b/collectors/cpufreqMetric.go @@ -33,18 +33,18 @@ func readOneLine(filename string) (text string, ok bool) { } type CPUFreqCollectorTopology struct { - processor string // logical processor number (continuous, starting at 0) - coreID string // socket local core ID - coreID_int int - physicalID string // socket / package ID - physicalID_int int - numPhysicalID string // number of sockets / packages - numPhysicalID_int int - isHT bool - 
numNonHT string // number of non hyperthreading processors - numNonHT_int int - scalingCurFreqFile string - tagSet map[string]string + processor string // logical processor number (continuous, starting at 0) + coreID string // socket local core ID + coreID_int int + physicalPackageID string // socket / package ID + physicalPackageID_int int + numPhysicalPackages string // number of sockets / packages + numPhysicalPackages_int int + isHT bool + numNonHT string // number of non hyperthreading processors + numNonHT_int int + scalingCurFreqFile string + tagSet map[string]string } // @@ -94,12 +94,12 @@ func (m *CPUFreqCollector) Init(config []byte) error { } // Read package ID - packageIDFile := filepath.Join(cpuDir, "topology", "physical_package_id") - packageID, ok := readOneLine(packageIDFile) + physicalPackageIDFile := filepath.Join(cpuDir, "topology", "physical_package_id") + physicalPackageID, ok := readOneLine(physicalPackageIDFile) if !ok { - return fmt.Errorf("CPUFreqCollector.Init() unable to read physical package ID from %s", packageIDFile) + return fmt.Errorf("CPUFreqCollector.Init() unable to read physical package ID from %s", physicalPackageIDFile) } - packageID_int, err := strconv.Atoi(packageID) + physicalPackageID_int, err := strconv.Atoi(physicalPackageID) if err != nil { return fmt.Errorf("CPUFreqCollector.Init() unable to convert packageID to int: %v", err) } @@ -124,8 +124,8 @@ func (m *CPUFreqCollector) Init(config []byte) error { t := &m.topology[processor_int] t.processor = processor - t.physicalID = packageID - t.physicalID_int = packageID_int + t.physicalPackageID = physicalPackageID + t.physicalPackageID_int = physicalPackageID_int t.coreID = coreID t.coreID_int = coreID_int t.scalingCurFreqFile = scalingCurFreqFile @@ -136,7 +136,7 @@ func (m *CPUFreqCollector) Init(config []byte) error { for i := range m.topology { t := &m.topology[i] - globalID := t.physicalID + ":" + t.coreID + globalID := t.physicalPackageID + ":" + t.coreID t.isHT = 
coreSeenBefore[globalID] coreSeenBefore[globalID] = true } @@ -148,8 +148,8 @@ func (m *CPUFreqCollector) Init(config []byte) error { t := &m.topology[i] // Update maxPackageID - if t.physicalID_int > maxPhysicalID { - maxPhysicalID = t.physicalID_int + if t.physicalPackageID_int > maxPhysicalID { + maxPhysicalID = t.physicalPackageID_int } if !t.isHT { @@ -162,20 +162,19 @@ func (m *CPUFreqCollector) Init(config []byte) error { numNonHT := fmt.Sprint(numNonHT_int) for i := range m.topology { t := &m.topology[i] - t.numPhysicalID = numPhysicalID - t.numPhysicalID_int = numPhysicalID_int + t.numPhysicalPackages = numPhysicalID + t.numPhysicalPackages_int = numPhysicalID_int t.numNonHT = numNonHT t.numNonHT_int = numNonHT_int t.tagSet = map[string]string{ "type": "cpu", "type-id": t.processor, "num_core": t.numNonHT, - "package_id": t.physicalID, - "num_package": t.numPhysicalID, + "package_id": t.physicalPackageID, + "num_package": t.numPhysicalPackages, } } - fmt.Printf("%+v\n", m.topology) m.init = true return nil } From ae6ffd4974da4740a2f4cd22535e42de71a43538 Mon Sep 17 00:00:00 2001 From: Holger Obermaier <40787752+ho-ob@users.noreply.github.com> Date: Tue, 25 Jan 2022 09:47:24 +0100 Subject: [PATCH 20/49] Refactoring --- collectors/cpufreqCpuinfoMetric.go | 82 +++++++++++++++++------------- collectors/cpufreqMetric.go | 14 ++--- 2 files changed, 55 insertions(+), 41 deletions(-) diff --git a/collectors/cpufreqCpuinfoMetric.go b/collectors/cpufreqCpuinfoMetric.go index 1658878..e8cd0fc 100644 --- a/collectors/cpufreqCpuinfoMetric.go +++ b/collectors/cpufreqCpuinfoMetric.go @@ -19,13 +19,17 @@ import ( // Only measure on the first hyperthread // type CPUFreqCpuInfoCollectorTopology struct { - processor string // logical processor number (continuous, starting at 0) - coreID string // socket local core ID - physicalID string // socket / package ID - numPhysicalID string // number of sockets / packages - isHT bool - numNonHT string // number of non hyperthreading 
processors - tagSet map[string]string + processor string // logical processor number (continuous, starting at 0) + coreID string // socket local core ID + coreID_int int + physicalPackageID string // socket / package ID + physicalPackageID_int int + numPhysicalPackages string // number of sockets / packages + numPhysicalPackages_int int + isHT bool + numNonHT string // number of non hyperthreading processors + numNonHT_int int + tagSet map[string]string } type CPUFreqCpuInfoCollector struct { @@ -46,10 +50,10 @@ func (m *CPUFreqCpuInfoCollector) Init(config []byte) error { // Collect topology information from file cpuinfo foundFreq := false processor := "" - numNonHT := 0 + numNonHT_int := 0 coreID := "" - physicalID := "" - maxPhysicalID := 0 + physicalPackageID := "" + maxPhysicalPackageID := 0 m.topology = make([]CPUFreqCpuInfoCollectorTopology, 0) coreSeenBefore := make(map[string]bool) scanner := bufio.NewScanner(file) @@ -67,7 +71,7 @@ func (m *CPUFreqCpuInfoCollector) Init(config []byte) error { case "core id": coreID = value case "physical id": - physicalID = value + physicalPackageID = value } } @@ -75,55 +79,65 @@ func (m *CPUFreqCpuInfoCollector) Init(config []byte) error { if foundFreq && len(processor) > 0 && len(coreID) > 0 && - len(physicalID) > 0 { + len(physicalPackageID) > 0 { - globalID := physicalID + ":" + coreID + coreID_int, err := strconv.Atoi(coreID) + if err != nil { + return fmt.Errorf("Unable to convert coreID to int: %v", err) + } + physicalPackageID_int, err := strconv.Atoi(physicalPackageID) + if err != nil { + return fmt.Errorf("Unable to convert physicalPackageID to int: %v", err) + } + + // increase maximun socket / package ID, when required + if physicalPackageID_int > maxPhysicalPackageID { + maxPhysicalPackageID = physicalPackageID_int + } + + globalID := physicalPackageID + ":" + coreID isHT := coreSeenBefore[globalID] coreSeenBefore[globalID] = true if !isHT { // increase number on non hyper thread cores - numNonHT++ - - // 
increase maximun socket / package ID, when required - physicalIDInt, err := strconv.Atoi(physicalID) - if err != nil { - return fmt.Errorf("Failed to convert physical id to int: %v", err) - } - if physicalIDInt > maxPhysicalID { - maxPhysicalID = physicalIDInt - } + numNonHT_int++ } // store collected topology information m.topology = append( m.topology, CPUFreqCpuInfoCollectorTopology{ - processor: processor, - coreID: coreID, - physicalID: physicalID, - isHT: isHT, + processor: processor, + coreID: coreID, + coreID_int: coreID_int, + physicalPackageID: physicalPackageID, + physicalPackageID_int: physicalPackageID_int, + isHT: isHT, }) // reset topology information foundFreq = false processor = "" coreID = "" - physicalID = "" + physicalPackageID = "" } } - numPhysicalID := fmt.Sprint(maxPhysicalID + 1) - numNonHTString := fmt.Sprint(numNonHT) + numPhysicalPackageID_int := maxPhysicalPackageID + 1 + numPhysicalPackageID := fmt.Sprint(numPhysicalPackageID_int) + numNonHT := fmt.Sprint(numNonHT_int) for i := range m.topology { t := &m.topology[i] - t.numPhysicalID = numPhysicalID - t.numNonHT = numNonHTString + t.numPhysicalPackages = numPhysicalPackageID + t.numPhysicalPackages_int = numPhysicalPackageID_int + t.numNonHT = numNonHT + t.numNonHT_int = numNonHT_int t.tagSet = map[string]string{ "type": "cpu", "type-id": t.processor, "num_core": t.numNonHT, - "package_id": t.physicalID, - "num_package": t.numPhysicalID, + "package_id": t.physicalPackageID, + "num_package": t.numPhysicalPackages, } } diff --git a/collectors/cpufreqMetric.go b/collectors/cpufreqMetric.go index 35e64ac..fcab782 100644 --- a/collectors/cpufreqMetric.go +++ b/collectors/cpufreqMetric.go @@ -143,13 +143,13 @@ func (m *CPUFreqCollector) Init(config []byte) error { // number of non hyper thread cores and packages / sockets numNonHT_int := 0 - maxPhysicalID := 0 + maxPhysicalPackageID := 0 for i := range m.topology { t := &m.topology[i] // Update maxPackageID - if t.physicalPackageID_int > 
maxPhysicalID { - maxPhysicalID = t.physicalPackageID_int + if t.physicalPackageID_int > maxPhysicalPackageID { + maxPhysicalPackageID = t.physicalPackageID_int } if !t.isHT { @@ -157,13 +157,13 @@ func (m *CPUFreqCollector) Init(config []byte) error { } } - numPhysicalID_int := maxPhysicalID + 1 - numPhysicalID := fmt.Sprint(numPhysicalID_int) + numPhysicalPackageID_int := maxPhysicalPackageID + 1 + numPhysicalPackageID := fmt.Sprint(numPhysicalPackageID_int) numNonHT := fmt.Sprint(numNonHT_int) for i := range m.topology { t := &m.topology[i] - t.numPhysicalPackages = numPhysicalID - t.numPhysicalPackages_int = numPhysicalID_int + t.numPhysicalPackages = numPhysicalPackageID + t.numPhysicalPackages_int = numPhysicalPackageID_int t.numNonHT = numNonHT t.numNonHT_int = numNonHT_int t.tagSet = map[string]string{ From 9f8d3ddbd3f8f3f366c4207616d79b85b1d9717c Mon Sep 17 00:00:00 2001 From: Holger Obermaier <40787752+ho-ob@users.noreply.github.com> Date: Tue, 25 Jan 2022 10:32:08 +0100 Subject: [PATCH 21/49] Avoid vet warning: Println arg list ends with redundant newline --- collectors/infinibandMetric.go | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/collectors/infinibandMetric.go b/collectors/infinibandMetric.go index 6e14251..db7c129 100644 --- a/collectors/infinibandMetric.go +++ b/collectors/infinibandMetric.go @@ -17,9 +17,10 @@ import ( "time" ) -const IBBASEPATH = `/sys/class/infiniband/` -const LIDFILE = `/sys/class/infiniband/mlx4_0/ports/1/lid` -const PERFQUERY = `/usr/sbin/perfquery` +const ( + IBBASEPATH = `/sys/class/infiniband/` + PERFQUERY = `/usr/sbin/perfquery` +) type InfinibandCollectorConfig struct { ExcludeDevices []string `json:"exclude_devices,omitempty"` @@ -40,12 +41,14 @@ func (m *InfinibandCollector) Help() { fmt.Println("The devices can be filtered with the 'exclude_devices' option in the configuration.") fmt.Println("For each found LIDs the collector calls the 'perfquery' command") fmt.Println("The path to 
the 'perfquery' command can be configured with the 'perfquery_path' option") - fmt.Println("in the configuration\n") + fmt.Println("in the configuration") + fmt.Println("") fmt.Println("Full configuration object:") fmt.Println("\"ibstat\" : {") fmt.Println(" \"perfquery_path\" : \"path/to/perfquery\" # if omitted, it searches in $PATH") fmt.Println(" \"exclude_devices\" : [\"dev1\"]") - fmt.Println("}\n") + fmt.Println("}") + fmt.Println("") fmt.Println("Metrics:") fmt.Println("- ib_recv") fmt.Println("- ib_xmit") From d903fc6daaf89168439a212aa8d36b0b579ce355 Mon Sep 17 00:00:00 2001 From: Holger Obermaier <40787752+ho-ob@users.noreply.github.com> Date: Tue, 25 Jan 2022 11:12:06 +0100 Subject: [PATCH 22/49] Avoid vet warning struct field commands has json tag but is not exported --- collectors/customCmdMetric.go | 8 ++++---- collectors/lustreMetric.go | 4 ++-- collectors/topprocsMetric.go | 8 ++++---- 3 files changed, 10 insertions(+), 10 deletions(-) diff --git a/collectors/customCmdMetric.go b/collectors/customCmdMetric.go index bbafc2d..e11f4c7 100644 --- a/collectors/customCmdMetric.go +++ b/collectors/customCmdMetric.go @@ -15,8 +15,8 @@ import ( const CUSTOMCMDPATH = `/home/unrz139/Work/cc-metric-collector/collectors/custom` type CustomCmdCollectorConfig struct { - commands []string `json:"commands"` - files []string `json:"files"` + Commands []string `json:"commands"` + Files []string `json:"files"` ExcludeMetrics []string `json:"exclude_metrics"` } @@ -40,7 +40,7 @@ func (m *CustomCmdCollector) Init(config []byte) error { } } m.setup() - for _, c := range m.config.commands { + for _, c := range m.config.Commands { cmdfields := strings.Fields(c) command := exec.Command(cmdfields[0], strings.Join(cmdfields[1:], " ")) command.Wait() @@ -49,7 +49,7 @@ func (m *CustomCmdCollector) Init(config []byte) error { m.commands = append(m.commands, c) } } - for _, f := range m.config.files { + for _, f := range m.config.Files { _, err = ioutil.ReadFile(f) if err == nil { 
m.files = append(m.files, f) diff --git a/collectors/lustreMetric.go b/collectors/lustreMetric.go index d77ac09..8931f84 100644 --- a/collectors/lustreMetric.go +++ b/collectors/lustreMetric.go @@ -15,7 +15,7 @@ import ( const LUSTREFILE = `/proc/fs/lustre/llite/lnec-XXXXXX/stats` type LustreCollectorConfig struct { - procfiles []string `json:"procfiles"` + Procfiles []string `json:"procfiles"` ExcludeMetrics []string `json:"exclude_metrics"` } @@ -47,7 +47,7 @@ func (m *LustreCollector) Init(config []byte) error { "statfs": {"statfs": 1}, "inode_permission": {"inode_permission": 1}} m.devices = make([]string, 0) - for _, p := range m.config.procfiles { + for _, p := range m.config.Procfiles { _, err := ioutil.ReadFile(p) if err == nil { m.devices = append(m.devices, p) diff --git a/collectors/topprocsMetric.go b/collectors/topprocsMetric.go index e1b31ee..715b8c3 100644 --- a/collectors/topprocsMetric.go +++ b/collectors/topprocsMetric.go @@ -16,7 +16,7 @@ const MAX_NUM_PROCS = 10 const DEFAULT_NUM_PROCS = 2 type TopProcsCollectorConfig struct { - num_procs int `json:"num_procs"` + Num_procs int `json:"num_procs"` } type TopProcsCollector struct { @@ -35,9 +35,9 @@ func (m *TopProcsCollector) Init(config []byte) error { return err } } else { - m.config.num_procs = int(DEFAULT_NUM_PROCS) + m.config.Num_procs = int(DEFAULT_NUM_PROCS) } - if m.config.num_procs <= 0 || m.config.num_procs > MAX_NUM_PROCS { + if m.config.Num_procs <= 0 || m.config.Num_procs > MAX_NUM_PROCS { return errors.New(fmt.Sprintf("num_procs option must be set in 'topprocs' config (range: 1-%d)", MAX_NUM_PROCS)) } m.setup() @@ -64,7 +64,7 @@ func (m *TopProcsCollector) Read(interval time.Duration, out *[]lp.MutableMetric } lines := strings.Split(string(stdout), "\n") - for i := 1; i < m.config.num_procs+1; i++ { + for i := 1; i < m.config.Num_procs+1; i++ { name := fmt.Sprintf("topproc%d", i) y, err := lp.New(name, m.tags, map[string]interface{}{"value": string(lines[i])}, time.Now()) if err == 
nil { From db5b4e4f65fe21337e2c26dfe1a4a20aaabfa0e5 Mon Sep 17 00:00:00 2001 From: Holger Obermaier <40787752+ho-ob@users.noreply.github.com> Date: Fri, 28 Jan 2022 09:14:25 +0100 Subject: [PATCH 23/49] Add type=node to gpf metric tags --- collectors/gpfsMetric.go | 37 +++++++++++++------------------------ 1 file changed, 13 insertions(+), 24 deletions(-) diff --git a/collectors/gpfsMetric.go b/collectors/gpfsMetric.go index fbf3a63..f2e33c6 100644 --- a/collectors/gpfsMetric.go +++ b/collectors/gpfsMetric.go @@ -108,6 +108,11 @@ func (m *GpfsCollector) Read(interval time.Duration, out *[]lp.MutableMetric) { continue } + tagList := map[string]string{ + "type": "node", + "filesystem": filesystem, + } + // return code rc, err := strconv.Atoi(key_value["_rc_"]) if err != nil { @@ -142,9 +147,7 @@ func (m *GpfsCollector) Read(interval time.Duration, out *[]lp.MutableMetric) { } y, err := lp.New( "gpfs_bytes_read", - map[string]string{ - "filesystem": filesystem, - }, + tagList, map[string]interface{}{ "value": bytesRead, }, @@ -163,9 +166,7 @@ func (m *GpfsCollector) Read(interval time.Duration, out *[]lp.MutableMetric) { } y, err = lp.New( "gpfs_bytes_written", - map[string]string{ - "filesystem": filesystem, - }, + tagList, map[string]interface{}{ "value": bytesWritten, }, @@ -184,9 +185,7 @@ func (m *GpfsCollector) Read(interval time.Duration, out *[]lp.MutableMetric) { } y, err = lp.New( "gpfs_num_opens", - map[string]string{ - "filesystem": filesystem, - }, + tagList, map[string]interface{}{ "value": numOpens, }, @@ -203,9 +202,7 @@ func (m *GpfsCollector) Read(interval time.Duration, out *[]lp.MutableMetric) { } y, err = lp.New( "gpfs_num_closes", - map[string]string{ - "filesystem": filesystem, - }, + tagList, map[string]interface{}{ "value": numCloses, }, @@ -222,9 +219,7 @@ func (m *GpfsCollector) Read(interval time.Duration, out *[]lp.MutableMetric) { } y, err = lp.New( "gpfs_num_reads", - map[string]string{ - "filesystem": filesystem, - }, + tagList, 
map[string]interface{}{ "value": numReads, }, @@ -241,9 +236,7 @@ func (m *GpfsCollector) Read(interval time.Duration, out *[]lp.MutableMetric) { } y, err = lp.New( "gpfs_num_writes", - map[string]string{ - "filesystem": filesystem, - }, + tagList, map[string]interface{}{ "value": numWrites, }, @@ -260,9 +253,7 @@ func (m *GpfsCollector) Read(interval time.Duration, out *[]lp.MutableMetric) { } y, err = lp.New( "gpfs_num_readdirs", - map[string]string{ - "filesystem": filesystem, - }, + tagList, map[string]interface{}{ "value": numReaddirs, }, @@ -279,9 +270,7 @@ func (m *GpfsCollector) Read(interval time.Duration, out *[]lp.MutableMetric) { } y, err = lp.New( "gpfs_num_inode_updates", - map[string]string{ - "filesystem": filesystem, - }, + tagList, map[string]interface{}{ "value": numInodeUpdates, }, From d5ff5b83ce1bbbda388ec0d126ee5f62225bedb0 Mon Sep 17 00:00:00 2001 From: Holger Obermaier <40787752+ho-ob@users.noreply.github.com> Date: Thu, 3 Feb 2022 16:19:45 +0100 Subject: [PATCH 24/49] Add NUMA metric collector --- collectors/README.md | 5 ++ collectors/numastatsMetric.go | 121 ++++++++++++++++++++++++++++++++++ metric-collector.go | 1 + 3 files changed, 127 insertions(+) create mode 100644 collectors/numastatsMetric.go diff --git a/collectors/README.md b/collectors/README.md index df02dd6..14556c1 100644 --- a/collectors/README.md +++ b/collectors/README.md @@ -322,6 +322,11 @@ type SampleCollector struct { } func (m *SampleCollector) Init(config []byte) error { + // Check if already initialized + if m.init { + return nil + } + m.name = "SampleCollector" m.setup() if len(config) > 0 { diff --git a/collectors/numastatsMetric.go b/collectors/numastatsMetric.go new file mode 100644 index 0000000..1c58483 --- /dev/null +++ b/collectors/numastatsMetric.go @@ -0,0 +1,121 @@ +package collectors + +import ( + "bufio" + "fmt" + "log" + "os" + "path/filepath" + "strconv" + "strings" + "time" + + lp "github.com/influxdata/line-protocol" +) + +// +// Numa policy 
hit/miss statistics +// +// numa_hit: +// A process wanted to allocate memory from this node, and succeeded. +// numa_miss: +// A process wanted to allocate memory from another node, +// but ended up with memory from this node. +// numa_foreign: +// A process wanted to allocate on this node, +// but ended up with memory from another node. +// local_node: +// A process ran on this node's CPU, +// and got memory from this node. +// other_node: +// A process ran on a different node's CPU +// and got memory from this node. +// interleave_hit: +// Interleaving wanted to allocate from this node +// and succeeded. +// +// See: https://www.kernel.org/doc/html/latest/admin-guide/numastat.html +// +type NUMAStatsCollectorTopolgy struct { + file string + tagSet map[string]string +} + +type NUMAStatsCollector struct { + MetricCollector + topology []NUMAStatsCollectorTopolgy +} + +func (m *NUMAStatsCollector) Init(config []byte) error { + // Check if already initialized + if m.init { + return nil + } + + m.name = "NUMAStatsCollector" + m.setup() + + // Loop for all NUMA node directories + baseDir := "/sys/devices/system/node" + globPattern := filepath.Join(baseDir, "node[0-9]*") + dirs, err := filepath.Glob(globPattern) + if err != nil { + return fmt.Errorf("unable to glob files with pattern %s", globPattern) + } + if dirs == nil { + return fmt.Errorf("unable to find any files with pattern %s", globPattern) + } + m.topology = make([]NUMAStatsCollectorTopolgy, 0, len(dirs)) + for _, dir := range dirs { + node := strings.TrimPrefix(dir, "/sys/devices/system/node/node") + file := filepath.Join(dir, "numastat") + m.topology = append(m.topology, + NUMAStatsCollectorTopolgy{ + file: file, + tagSet: map[string]string{"domain": node}, + }) + } + + m.init = true + return nil +} + +func (m *NUMAStatsCollector) Read(interval time.Duration, out *[]lp.MutableMetric) { + if !m.init { + return + } + + for i := range m.topology { + // Loop for all NUMA domains + t := &m.topology[i] + + now := 
time.Now() + file, err := os.Open(t.file) + if err != nil { + return + } + scanner := bufio.NewScanner(file) + for scanner.Scan() { + split := strings.Fields(scanner.Text()) + if len(split) != 2 { + continue + } + key := split[0] + value, err := strconv.ParseInt(split[1], 10, 64) + if err != nil { + log.Printf("failed to convert %s='%s' to int64: %v", key, split[1], err) + continue + } + y, err := lp.New("numastats_"+key, t.tagSet, map[string]interface{}{"value": value}, now) + if err == nil { + *out = append(*out, y) + } + } + + file.Close() + } +} + +func (m *NUMAStatsCollector) Close() { + m.init = false +} diff --git a/metric-collector.go b/metric-collector.go index 02a2b21..d62b516 100644 --- a/metric-collector.go +++ b/metric-collector.go @@ -36,6 +36,7 @@ var Collectors = map[string]collectors.MetricGetter{ "gpfs": new(collectors.GpfsCollector), "cpufreq": new(collectors.CPUFreqCollector), "cpufreq_cpuinfo": new(collectors.CPUFreqCpuInfoCollector), + "numastats": new(collectors.NUMAStatsCollector), } var Sinks = map[string]sinks.SinkFuncs{ From eed9cd227ce0618038a4160a9c83c9ae966daddb Mon Sep 17 00:00:00 2001 From: Thomas Roehl Date: Mon, 21 Feb 2022 14:50:11 +0100 Subject: [PATCH 25/49] Remove doubled import and remove merge artifacts --- collectors/customCmdMetric.go | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/collectors/customCmdMetric.go b/collectors/customCmdMetric.go index db55ac0..e978c49 100644 --- a/collectors/customCmdMetric.go +++ b/collectors/customCmdMetric.go @@ -9,13 +9,8 @@ import ( "strings" "time" -<<<<<<< HEAD - lp "github.com/influxdata/line-protocol" -======= - ccmetric "github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric" lp "github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric" influx "github.com/influxdata/line-protocol" ->>>>>>> develop ) const CUSTOMCMDPATH = `/home/unrz139/Work/cc-metric-collector/collectors/custom` @@ -103,7 +98,7 @@ func (m *CustomCmdCollector) Read(interval 
time.Duration, output chan lp.CCMetri continue } - y := ccmetric.FromInfluxMetric(c) + y := lp.FromInfluxMetric(c) if err == nil { output <- y } @@ -125,7 +120,7 @@ func (m *CustomCmdCollector) Read(interval time.Duration, output chan lp.CCMetri if skip { continue } - y := ccmetric.FromInfluxMetric(f) + y := lp.FromInfluxMetric(f) if err == nil { output <- y } From 3ade75490c9de1cb1bf9a76e484b94de0cf5d2fe Mon Sep 17 00:00:00 2001 From: Thomas Roehl Date: Mon, 21 Feb 2022 15:09:42 +0100 Subject: [PATCH 26/49] Add RHEL UBI 8 to rpmbuild action --- .github/workflows/rpmbuild.yml | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/.github/workflows/rpmbuild.yml b/.github/workflows/rpmbuild.yml index d9220a7..8cb71e6 100644 --- a/.github/workflows/rpmbuild.yml +++ b/.github/workflows/rpmbuild.yml @@ -64,3 +64,22 @@ jobs: with: name: cc-metric-collector SRPM AlmaLinux 8.5 path: ${{ steps.rpm.outputs.source_rpm_path }} + build-rhel-ubi8: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v2 + - uses: TomTheBear/rpmbuild@rh-ubi8 + id: rpm + name: Build RPM package on Red Hat Universal Base Image 8 + with: + spec_file: "./scripts/cc-metric-collector.spec" + - name: Save RPM as artifact + uses: actions/upload-artifact@v1.0.0 + with: + name: cc-metric-collector RPM Red Hat Universal Base Image 8 + path: ${{ steps.rpm.outputs.rpm_dir_path }} + - name: Save SRPM as artifact + uses: actions/upload-artifact@v1.0.0 + with: + name: cc-metric-collector SRPM Red Hat Universal Base Image 8 + path: ${{ steps.rpm.outputs.source_rpm_path }} From 842395682cfdc6fe64c8070b7301665053965e00 Mon Sep 17 00:00:00 2001 From: Thomas Roehl Date: Mon, 21 Feb 2022 17:01:13 +0100 Subject: [PATCH 27/49] Remove -lganglia ldflag and check dl lib --- sinks/libgangliaSink.go | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/sinks/libgangliaSink.go b/sinks/libgangliaSink.go index ed19145..051bd03 100644 --- a/sinks/libgangliaSink.go +++ 
b/sinks/libgangliaSink.go @@ -2,7 +2,7 @@ package sinks /* #cgo CFLAGS: -DGM_PROTOCOL_GUARD -#cgo LDFLAGS: -L. -lganglia -Wl,--unresolved-symbols=ignore-in-object-files +#cgo LDFLAGS: -L. -Wl,--unresolved-symbols=ignore-in-object-files #include // This is a copy&paste snippet of ganglia.h (BSD-3 license) @@ -71,6 +71,7 @@ import ( "fmt" "unsafe" + cclog "github.com/ClusterCockpit/cc-metric-collector/internal/ccLogger" lp "github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric" "github.com/NVIDIA/go-nvml/pkg/dl" ) @@ -120,7 +121,7 @@ func (s *LibgangliaSink) Init(config json.RawMessage) error { if len(config) > 0 { err = json.Unmarshal(config, &s.config) if err != nil { - fmt.Println(s.name, "Error reading config for", s.name, ":", err.Error()) + cclog.ComponentError(s.name, "Error reading config:", err.Error()) return err } } @@ -128,6 +129,10 @@ func (s *LibgangliaSink) Init(config json.RawMessage) error { if lib == nil { return fmt.Errorf("error instantiating DynamicLibrary for %s", s.config.GangliaLib) } + err = lib.Open() + if err != nil { + return fmt.Errorf("error opening %s: %v", s.config.GangliaLib, err) + } // Set up cache for the C strings s.cstrCache = make(map[string]*C.char) From a1328b30e90cf4c0bbb0ae7de3225ca2f718088c Mon Sep 17 00:00:00 2001 From: Thomas Roehl Date: Mon, 21 Feb 2022 17:16:07 +0100 Subject: [PATCH 28/49] Remove CentOS8 from RPMBUILD action. 
End of Life --- .github/workflows/rpmbuild.yml | 40 ---------------------------------- 1 file changed, 40 deletions(-) diff --git a/.github/workflows/rpmbuild.yml b/.github/workflows/rpmbuild.yml index 8cb71e6..f0c0d09 100644 --- a/.github/workflows/rpmbuild.yml +++ b/.github/workflows/rpmbuild.yml @@ -5,46 +5,6 @@ on: - '**' jobs: - build-centos8: - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v2 - with: - submodules: recursive - - uses: TomTheBear/rpmbuild@master - id: rpm - name: Build RPM package on CentOS8 - with: - spec_file: "./scripts/cc-metric-collector.spec" - - name: Save RPM as artifact - uses: actions/upload-artifact@v1.0.0 - with: - name: cc-metric-collector RPM CentOS8 - path: ${{ steps.rpm.outputs.rpm_dir_path }} - - name: Save SRPM as artifact - uses: actions/upload-artifact@v1.0.0 - with: - name: cc-metric-collector SRPM CentOS8 - path: ${{ steps.rpm.outputs.source_rpm_path }} - build-centos-latest: - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v2 - - uses: TomTheBear/rpmbuild@centos_latest - id: rpm - name: Build RPM package on CentOS 'Latest' - with: - spec_file: "./scripts/cc-metric-collector.spec" - - name: Save RPM as artifact - uses: actions/upload-artifact@v1.0.0 - with: - name: cc-metric-collector RPM CentOS 'Latest' - path: ${{ steps.rpm.outputs.rpm_dir_path }} - - name: Save SRPM as artifact - uses: actions/upload-artifact@v1.0.0 - with: - name: cc-metric-collector SRPM CentOS 'Latest' - path: ${{ steps.rpm.outputs.source_rpm_path }} build-alma-8_5: runs-on: ubuntu-latest steps: From 54c693532f3a38ab67c1f3e82b2606bdb94c2be3 Mon Sep 17 00:00:00 2001 From: Holger Obermaier <40787752+ho-ob@users.noreply.github.com> Date: Mon, 21 Feb 2022 17:26:55 +0100 Subject: [PATCH 29/49] Do not create fake libganglia.so. 
libganglia.so is now loaded during runtime by dlopen and no longer required during link time --- sinks/Makefile | 2 -- 1 file changed, 2 deletions(-) diff --git a/sinks/Makefile b/sinks/Makefile index bd40f10..5cc3e84 100644 --- a/sinks/Makefile +++ b/sinks/Makefile @@ -5,8 +5,6 @@ libganglia.so: @find /usr ! -readable -prune -o -type d ! -executable -prune -o -name "$@*" -print0 | \ xargs --null --no-run-if-empty --replace \ ln --symbolic --verbose --force '{}' "$@" - @if [[ ! -e "$@" ]]; then touch "$@"; fi - clean: rm -f libganglia.so From afa831aff81b50d0bb8ed5aaf0da6dad36ff9659 Mon Sep 17 00:00:00 2001 From: Thomas Gruber Date: Mon, 21 Feb 2022 18:17:12 +0100 Subject: [PATCH 30/49] Update rpmbuild.yml --- .github/workflows/rpmbuild.yml | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/.github/workflows/rpmbuild.yml b/.github/workflows/rpmbuild.yml index f0c0d09..1326494 100644 --- a/.github/workflows/rpmbuild.yml +++ b/.github/workflows/rpmbuild.yml @@ -24,6 +24,12 @@ jobs: with: name: cc-metric-collector SRPM AlmaLinux 8.5 path: ${{ steps.rpm.outputs.source_rpm_path }} + - name: Release + uses: softprops/action-gh-release@v1 + with: + files: | + ${{ steps.rpm.outputs.source_rpm_path }} + ${{ steps.rpm.outputs.rpm_dir_path }} build-rhel-ubi8: runs-on: ubuntu-latest steps: @@ -43,3 +49,9 @@ jobs: with: name: cc-metric-collector SRPM Red Hat Universal Base Image 8 path: ${{ steps.rpm.outputs.source_rpm_path }} + - name: Release + uses: softprops/action-gh-release@v1 + with: + files: | + ${{ steps.rpm.outputs.source_rpm_path }} + ${{ steps.rpm.outputs.rpm_dir_path }} From 5cd7ac25419a18742f8d44b90937c6cc22b51bb2 Mon Sep 17 00:00:00 2001 From: Thomas Gruber Date: Mon, 21 Feb 2022 18:28:07 +0100 Subject: [PATCH 31/49] Update rpmbuild.yml --- .github/workflows/rpmbuild.yml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/rpmbuild.yml b/.github/workflows/rpmbuild.yml index 1326494..8479cd6 100644 --- 
a/.github/workflows/rpmbuild.yml +++ b/.github/workflows/rpmbuild.yml @@ -18,7 +18,7 @@ jobs: uses: actions/upload-artifact@v1.0.0 with: name: cc-metric-collector RPM AlmaLinux 8.5 - path: ${{ steps.rpm.outputs.rpm_dir_path }} + path: ${{ steps.rpm.outputs.rpm_path }} - name: Save SRPM as artifact uses: actions/upload-artifact@v1.0.0 with: @@ -29,7 +29,7 @@ jobs: with: files: | ${{ steps.rpm.outputs.source_rpm_path }} - ${{ steps.rpm.outputs.rpm_dir_path }} + ${{ steps.rpm.outputs.rpm_path }} build-rhel-ubi8: runs-on: ubuntu-latest steps: @@ -43,7 +43,7 @@ jobs: uses: actions/upload-artifact@v1.0.0 with: name: cc-metric-collector RPM Red Hat Universal Base Image 8 - path: ${{ steps.rpm.outputs.rpm_dir_path }} + path: ${{ steps.rpm.outputs.rpm_path }} - name: Save SRPM as artifact uses: actions/upload-artifact@v1.0.0 with: @@ -54,4 +54,4 @@ jobs: with: files: | ${{ steps.rpm.outputs.source_rpm_path }} - ${{ steps.rpm.outputs.rpm_dir_path }} + ${{ steps.rpm.outputs.rpm_path }} From 1425463a26e36ff7e5f017c8253afab15b8e2254 Mon Sep 17 00:00:00 2001 From: Thomas Roehl Date: Mon, 21 Feb 2022 18:49:39 +0100 Subject: [PATCH 32/49] Upload RPM not SRPM --- .github/workflows/rpmbuild.yml | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/.github/workflows/rpmbuild.yml b/.github/workflows/rpmbuild.yml index 8479cd6..db4fc49 100644 --- a/.github/workflows/rpmbuild.yml +++ b/.github/workflows/rpmbuild.yml @@ -27,9 +27,8 @@ jobs: - name: Release uses: softprops/action-gh-release@v1 with: - files: | - ${{ steps.rpm.outputs.source_rpm_path }} - ${{ steps.rpm.outputs.rpm_path }} + name: cc-metric-collector-${{github.ref}} + files: ${{ steps.rpm.outputs.rpm_path }} build-rhel-ubi8: runs-on: ubuntu-latest steps: @@ -52,6 +51,5 @@ jobs: - name: Release uses: softprops/action-gh-release@v1 with: - files: | - ${{ steps.rpm.outputs.source_rpm_path }} - ${{ steps.rpm.outputs.rpm_path }} + name: cc-metric-collector-${{github.ref}} + files: ${{ 
steps.rpm.outputs.rpm_path }} From 888292dbef1a17a627c9c4beeaa7d8fe6bb82ddc Mon Sep 17 00:00:00 2001 From: Thomas Roehl Date: Mon, 21 Feb 2022 18:53:03 +0100 Subject: [PATCH 33/49] Remove ganglia build tag documentation, not needed anymore --- README.md | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/README.md b/README.md index 5aa8806..14bacec 100644 --- a/README.md +++ b/README.md @@ -40,15 +40,9 @@ See the component READMEs for their configuration: $ git clone git@github.com:ClusterCockpit/cc-metric-collector.git $ make (downloads LIKWID, builds it as static library with 'direct' accessmode and copies all required files for the collector) $ go get (requires at least golang 1.16) -$ make tags -Available tags: -ganglia -[...] -$ make # calls go build (-tags ganglia,...) -o cc-metric-collector +$ make ``` -## `ganglia` build tag -If you want support for the [Ganglia Monitoring System](http://ganglia.info/), you have to add `-tags ganglia` to the build command line. This enables two metric sinks. One is using the command line application `gmetric` (see [`ganglia`](./sinks/gangliaSink.md) sink), the other one interacts directly with `libganglia` the main Ganglia library that is commonly installed on each compute node (see [`libganglia`](./sinks/libgangliaSink.md) sink). The later one requires configuration before building, so use `make` instead of `go build` directly. 
# Running From 5742721cbdb5d8af097e24f4981700425306b8b3 Mon Sep 17 00:00:00 2001 From: Thomas Roehl Date: Mon, 21 Feb 2022 18:58:13 +0100 Subject: [PATCH 34/49] Proper tag name in release --- .github/workflows/rpmbuild.yml | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/.github/workflows/rpmbuild.yml b/.github/workflows/rpmbuild.yml index db4fc49..9c6ae13 100644 --- a/.github/workflows/rpmbuild.yml +++ b/.github/workflows/rpmbuild.yml @@ -27,8 +27,10 @@ jobs: - name: Release uses: softprops/action-gh-release@v1 with: - name: cc-metric-collector-${{github.ref}} - files: ${{ steps.rpm.outputs.rpm_path }} + name: cc-metric-collector-${{github.ref_name}} + files: | + ${{ steps.rpm.outputs.source_rpm_path }} + ${{ steps.rpm.outputs.rpm_path }} build-rhel-ubi8: runs-on: ubuntu-latest steps: @@ -51,5 +53,6 @@ jobs: - name: Release uses: softprops/action-gh-release@v1 with: - name: cc-metric-collector-${{github.ref}} - files: ${{ steps.rpm.outputs.rpm_path }} + files: | + ${{ steps.rpm.outputs.source_rpm_path }} + ${{ steps.rpm.outputs.rpm_path }} From a97c705f4c2482449f4cc15198561d6db53da261 Mon Sep 17 00:00:00 2001 From: Holger Obermaier Date: Mon, 21 Feb 2022 20:53:55 +0100 Subject: [PATCH 35/49] Do not create link to libganglia.so. 
libganglia.so is now loaded during runtime by dlopen and no longer required during link time --- Makefile | 9 +++++++-- sinks/Makefile | 12 ------------ 2 files changed, 7 insertions(+), 14 deletions(-) delete mode 100644 sinks/Makefile diff --git a/Makefile b/Makefile index 33fd515..b32fb6b 100644 --- a/Makefile +++ b/Makefile @@ -21,13 +21,18 @@ all: $(APP) $(APP): $(GOSRC) make -C collectors - make -C sinks go get go build -o $(APP) $(GOSRC_APP) .PHONY: clean +.ONESHELL: clean: - @for COMP in $(COMPONENT_DIRS); do if [ -e $$COMP/Makefile ]; then make -C $$COMP clean; fi; done + @for COMP in $(COMPONENT_DIRS) + do + if [[ -e $$COMP/Makefile ]]; then + make -C $$COMP clean + fi + done rm -f $(APP) .PHONY: fmt diff --git a/sinks/Makefile b/sinks/Makefile deleted file mode 100644 index 5cc3e84..0000000 --- a/sinks/Makefile +++ /dev/null @@ -1,12 +0,0 @@ - -all: libganglia.so - -libganglia.so: - @find /usr ! -readable -prune -o -type d ! -executable -prune -o -name "$@*" -print0 | \ - xargs --null --no-run-if-empty --replace \ - ln --symbolic --verbose --force '{}' "$@" - -clean: - rm -f libganglia.so - -.PHONY: clean From 45714fe33774132a514295e6585d2b61ab8dca77 Mon Sep 17 00:00:00 2001 From: Thomas Gruber Date: Tue, 22 Feb 2022 15:09:12 +0100 Subject: [PATCH 36/49] Update README.md --- README.md | 32 ++++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) diff --git a/README.md b/README.md index 14bacec..d03464a 100644 --- a/README.md +++ b/README.md @@ -56,7 +56,39 @@ Usage of metric-collector: -once Run all collectors only once ``` +# Scenarios +The metric collector was designed with flexibility in mind, so it can be used in many scenarios. 
Here are a few: + +```mermaid +flowchart TD + subgraph a ["Cluster A"] + nodeA[NodeA with CC collector] + nodeB[NodeB with CC collector] + nodeC[NodeC with CC collector] + end + a --> db[(Database)] + db <--> ccweb("Webfrontend") +``` + +``` mermaid +flowchart TD + subgraph a [ClusterA] + nodeA[NodeA with CC collector] + nodeB[NodeB with CC collector] + nodeC[NodeC with CC collector] + end + subgraph b [ClusterB] + nodeD[NodeD with CC collector] + nodeE[NodeE with CC collector] + nodeF[NodeF with CC collector] + end + a --> ccrecv{"CC collector as receiver"} + b --> ccrecv + ccrecv --> db[("Database1")] + ccrecv -.-> db2[("Database2")] + db <-.-> ccweb("Webfrontend") +``` # Contributing The ClusterCockpit ecosystem is designed to be used by different HPC computing centers. Since configurations and setups differ between the centers, the centers likely have to put some work into the cc-metric-collector to gather all desired metrics. From b4cc6d54ea3816f467ab64913ccaa863b63b857c Mon Sep 17 00:00:00 2001 From: Thomas Gruber Date: Tue, 22 Feb 2022 15:10:27 +0100 Subject: [PATCH 37/49] Update README.md --- README.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/README.md b/README.md index d03464a..f819be2 100644 --- a/README.md +++ b/README.md @@ -74,11 +74,13 @@ flowchart TD ``` mermaid flowchart TD subgraph a [ClusterA] + direction LR nodeA[NodeA with CC collector] nodeB[NodeB with CC collector] nodeC[NodeC with CC collector] end subgraph b [ClusterB] + direction LR nodeD[NodeD with CC collector] nodeE[NodeE with CC collector] nodeF[NodeF with CC collector] From 66275ecf74608ab6b7b41c8b1a65694413a84b0d Mon Sep 17 00:00:00 2001 From: Thomas Roehl Date: Tue, 22 Feb 2022 15:50:49 +0100 Subject: [PATCH 38/49] DiskstatCollector: cast part_max_used metric to int --- collectors/diskstatMetric.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/collectors/diskstatMetric.go b/collectors/diskstatMetric.go index 819a1ab..16c70ba 100644 --- 
a/collectors/diskstatMetric.go +++ b/collectors/diskstatMetric.go @@ -102,7 +102,7 @@ func (m *DiskstatCollector) Read(interval time.Duration, output chan lp.CCMetric part_max_used = perc } } - y, err := lp.New("part_max_used", map[string]string{"type": "node"}, m.meta, map[string]interface{}{"value": part_max_used}, time.Now()) + y, err := lp.New("part_max_used", map[string]string{"type": "node"}, m.meta, map[string]interface{}{"value": int(part_max_used)}, time.Now()) if err == nil { y.AddMeta("unit", "percent") output <- y From 9cfbe10247f1512cf961f92b3ab02c7630222225 Mon Sep 17 00:00:00 2001 From: Thomas Roehl Date: Tue, 22 Feb 2022 15:51:08 +0100 Subject: [PATCH 39/49] Add uint types to GangliaSink and LibgangliaSink --- sinks/gangliaSink.go | 14 ++++++++++++++ sinks/libgangliaSink.go | 9 +++++++++ 2 files changed, 23 insertions(+) diff --git a/sinks/gangliaSink.go b/sinks/gangliaSink.go index fa95f43..ae53dd8 100644 --- a/sinks/gangliaSink.go +++ b/sinks/gangliaSink.go @@ -134,9 +134,21 @@ func (s *GangliaSink) Write(point lp.CCMetric) error { case int: argstr = append(argstr, fmt.Sprintf("--value=%d", value), "--type=int32") + case int32: + argstr = append(argstr, + fmt.Sprintf("--value=%d", value), "--type=int32") case int64: argstr = append(argstr, fmt.Sprintf("--value=%d", value), "--type=int32") + case uint: + argstr = append(argstr, + fmt.Sprintf("--value=%d", value), "--type=uint32") + case uint32: + argstr = append(argstr, + fmt.Sprintf("--value=%d", value), "--type=uint32") + case uint64: + argstr = append(argstr, + fmt.Sprintf("--value=%d", value), "--type=uint32") case string: argstr = append(argstr, fmt.Sprintf("--value=%q", value), "--type=string") @@ -155,3 +167,5 @@ func (s *GangliaSink) Flush() error { func (s *GangliaSink) Close() { } + +func NewGangliaSink() diff --git a/sinks/libgangliaSink.go b/sinks/libgangliaSink.go index 051bd03..7136e42 100644 --- a/sinks/libgangliaSink.go +++ b/sinks/libgangliaSink.go @@ -212,6 +212,15 @@ func (s 
*LibgangliaSink) Write(point lp.CCMetric) error { case int: c_value = C.CString(fmt.Sprintf("%d", real)) c_type = lookup("int32") + case uint64: + c_value = C.CString(fmt.Sprintf("%d", real)) + c_type = lookup("uint32") + case uint32: + c_value = C.CString(fmt.Sprintf("%d", real)) + c_type = lookup("uint32") + case uint: + c_value = C.CString(fmt.Sprintf("%d", real)) + c_type = lookup("uint32") case string: c_value = C.CString(real) c_type = lookup("string") From 18a226183c92cf10c59be029d1667761ba592f23 Mon Sep 17 00:00:00 2001 From: Thomas Roehl Date: Tue, 22 Feb 2022 16:15:25 +0100 Subject: [PATCH 40/49] Use new sink instances to allow multiple of same sink type --- sinks/gangliaSink.go | 10 +++++++--- sinks/httpSink.go | 10 ++++++++-- sinks/influxAsyncSink.go | 10 ++++++++-- sinks/influxSink.go | 10 ++++++++-- sinks/libgangliaSink.go | 10 ++++++++-- sinks/metricSink.go | 2 +- sinks/natsSink.go | 10 ++++++++-- sinks/sinkManager.go | 19 +++++++++---------- sinks/stdoutSink.go | 10 ++++++++-- 9 files changed, 65 insertions(+), 26 deletions(-) diff --git a/sinks/gangliaSink.go b/sinks/gangliaSink.go index ae53dd8..0c9459b 100644 --- a/sinks/gangliaSink.go +++ b/sinks/gangliaSink.go @@ -33,9 +33,9 @@ type GangliaSink struct { config GangliaSinkConfig } -func (s *GangliaSink) Init(config json.RawMessage) error { +func (s *GangliaSink) Init(name string, config json.RawMessage) error { var err error = nil - s.name = "GangliaSink" + s.name = fmt.Sprintf("GangliaSink(%s)", name) s.config.AddTagsAsDesc = false s.config.AddGangliaGroup = false if len(config) > 0 { @@ -168,4 +168,8 @@ func (s *GangliaSink) Flush() error { func (s *GangliaSink) Close() { } -func NewGangliaSink() +func NewGangliaSink(name string, config json.RawMessage) (Sink, error) { + s := new(GangliaSink) + s.Init(name, config) + return s, nil +} diff --git a/sinks/httpSink.go b/sinks/httpSink.go index ce46bab..41a5919 100644 --- a/sinks/httpSink.go +++ b/sinks/httpSink.go @@ -38,9 +38,9 @@ type HttpSink 
struct { flushDelay time.Duration } -func (s *HttpSink) Init(config json.RawMessage) error { +func (s *HttpSink) Init(name string, config json.RawMessage) error { // Set default values - s.name = "HttpSink" + s.name = fmt.Sprintf("HttpSink(%s)", name) s.config.MaxIdleConns = 10 s.config.IdleConnTimeout = "5s" s.config.Timeout = "5s" @@ -169,3 +169,9 @@ func (s *HttpSink) Close() { } s.client.CloseIdleConnections() } + +func NewHttpSink(name string, config json.RawMessage) (Sink, error) { + s := new(HttpSink) + s.Init(name, config) + return s, nil +} diff --git a/sinks/influxAsyncSink.go b/sinks/influxAsyncSink.go index 20aa60c..e2f8995 100644 --- a/sinks/influxAsyncSink.go +++ b/sinks/influxAsyncSink.go @@ -68,8 +68,8 @@ func (s *InfluxAsyncSink) connect() error { return nil } -func (s *InfluxAsyncSink) Init(config json.RawMessage) error { - s.name = "InfluxSink" +func (s *InfluxAsyncSink) Init(name string, config json.RawMessage) error { + s.name = fmt.Sprintf("InfluxSink(%s)", name) // Set default for maximum number of points sent to server in single request. 
s.config.BatchSize = 100 @@ -118,3 +118,9 @@ func (s *InfluxAsyncSink) Close() { s.writeApi.Flush() s.client.Close() } + +func NewInfluxAsyncSink(name string, config json.RawMessage) (Sink, error) { + s := new(InfluxAsyncSink) + s.Init(name, config) + return s, nil +} diff --git a/sinks/influxSink.go b/sinks/influxSink.go index 99304c0..f235054 100644 --- a/sinks/influxSink.go +++ b/sinks/influxSink.go @@ -57,8 +57,8 @@ func (s *InfluxSink) connect() error { return nil } -func (s *InfluxSink) Init(config json.RawMessage) error { - s.name = "InfluxSink" +func (s *InfluxSink) Init(name string, config json.RawMessage) error { + s.name = fmt.Sprintf("InfluxSink(%s)", name) if len(config) > 0 { err := json.Unmarshal(config, &s.config) if err != nil { @@ -94,3 +94,9 @@ func (s *InfluxSink) Close() { cclog.ComponentDebug(s.name, "Closing InfluxDB connection") s.client.Close() } + +func NewInfluxSink(name string, config json.RawMessage) (Sink, error) { + s := new(InfluxSink) + s.Init(name, config) + return s, nil +} diff --git a/sinks/libgangliaSink.go b/sinks/libgangliaSink.go index 7136e42..9a45df4 100644 --- a/sinks/libgangliaSink.go +++ b/sinks/libgangliaSink.go @@ -109,9 +109,9 @@ type LibgangliaSink struct { cstrCache map[string]*C.char } -func (s *LibgangliaSink) Init(config json.RawMessage) error { +func (s *LibgangliaSink) Init(name string, config json.RawMessage) error { var err error = nil - s.name = "LibgangliaSink" + s.name = fmt.Sprintf("LibgangliaSink(%s)", name) //s.config.AddTagsAsDesc = false s.config.AddGangliaGroup = false s.config.AddTypeToName = false @@ -316,3 +316,9 @@ func (s *LibgangliaSink) Close() { C.free(unsafe.Pointer(cstr)) } } + +func NewLibgangliaSink(name string, config json.RawMessage) (Sink, error) { + s := new(LibgangliaSink) + s.Init(name, config) + return s, nil +} diff --git a/sinks/metricSink.go b/sinks/metricSink.go index d76f5f2..4583a2c 100644 --- a/sinks/metricSink.go +++ b/sinks/metricSink.go @@ -17,7 +17,7 @@ type sink struct 
{ } type Sink interface { - Init(config json.RawMessage) error + Init(name string, config json.RawMessage) error Write(point lp.CCMetric) error Flush() error Close() diff --git a/sinks/natsSink.go b/sinks/natsSink.go index 187157e..6087da0 100644 --- a/sinks/natsSink.go +++ b/sinks/natsSink.go @@ -53,8 +53,8 @@ func (s *NatsSink) connect() error { return nil } -func (s *NatsSink) Init(config json.RawMessage) error { - s.name = "NatsSink" +func (s *NatsSink) Init(name string, config json.RawMessage) error { + s.name = fmt.Sprintf("NatsSink(%s)", name) if len(config) > 0 { err := json.Unmarshal(config, &s.config) if err != nil { @@ -105,3 +105,9 @@ func (s *NatsSink) Close() { s.client.Close() } } + +func NewNatsSink(name string, config json.RawMessage) (Sink, error) { + s := new(NatsSink) + s.Init(name, config) + return s, nil +} diff --git a/sinks/sinkManager.go b/sinks/sinkManager.go index 487e7ca..f531f5d 100644 --- a/sinks/sinkManager.go +++ b/sinks/sinkManager.go @@ -13,14 +13,14 @@ import ( const SINK_MAX_FORWARD = 50 // Map of all available sinks -var AvailableSinks = map[string]Sink{ - "influxdb": new(InfluxSink), - "stdout": new(StdoutSink), - "nats": new(NatsSink), - "http": new(HttpSink), - "ganglia": new(GangliaSink), - "influxasync": new(InfluxAsyncSink), - "libganglia": new(LibgangliaSink), +var AvailableSinks = map[string]func(name string, config json.RawMessage) (Sink, error){ + "ganglia": NewGangliaSink, + "libganglia": NewLibgangliaSink, + "stdout": NewStdoutSink, + "nats": NewNatsSink, + "influxdb": NewInfluxSink, + "influxasync": NewInfluxAsyncSink, + "http": NewHttpSink, } // Metric collector manager data structure @@ -149,8 +149,7 @@ func (sm *sinkManager) AddOutput(name string, rawConfig json.RawMessage) error { cclog.ComponentError("SinkManager", "SKIP", name, "unknown sink:", sinkConfig.Type) return err } - s := AvailableSinks[sinkConfig.Type] - err = s.Init(rawConfig) + s, err := AvailableSinks[sinkConfig.Type](name, rawConfig) if err != 
nil { cclog.ComponentError("SinkManager", "SKIP", s.Name(), "initialization failed:", err.Error()) return err diff --git a/sinks/stdoutSink.go b/sinks/stdoutSink.go index 5d0761a..d6c2e1b 100644 --- a/sinks/stdoutSink.go +++ b/sinks/stdoutSink.go @@ -19,8 +19,8 @@ type StdoutSink struct { } } -func (s *StdoutSink) Init(config json.RawMessage) error { - s.name = "StdoutSink" +func (s *StdoutSink) Init(name string, config json.RawMessage) error { + s.name = fmt.Sprintf("StdoutSink(%s)", name) if len(config) > 0 { err := json.Unmarshal(config, &s.config) if err != nil { @@ -65,3 +65,9 @@ func (s *StdoutSink) Close() { s.output.Close() } } + +func NewStdoutSink(name string, config json.RawMessage) (Sink, error) { + s := new(StdoutSink) + s.Init(name, config) + return s, nil +} From 24e12ccc57eeb71dc9aafb43cb21dec935eb0fdc Mon Sep 17 00:00:00 2001 From: Thomas Roehl Date: Tue, 22 Feb 2022 16:19:46 +0100 Subject: [PATCH 41/49] Update sink README and SampleSink --- sinks/README.md | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/sinks/README.md b/sinks/README.md index 8ff3743..65d2851 100644 --- a/sinks/README.md +++ b/sinks/README.md @@ -6,6 +6,7 @@ This folder contains the SinkManager and sink implementations for the cc-metric- - [`stdout`](./stdoutSink.md): Print all metrics to `stdout`, `stderr` or a file - [`http`](./httpSink.md): Send metrics to an HTTP server as POST requests - [`influxdb`](./influxSink.md): Send metrics to an [InfluxDB](https://www.influxdata.com/products/influxdb/) database +- [`influxasync`](./influxAsyncSink.md): Send metrics to an [InfluxDB](https://www.influxdata.com/products/influxdb/) database with non-blocking write API - [`nats`](./natsSink.md): Publish metrics to the [NATS](https://nats.io/) network overlay system - [`ganglia`](./gangliaSink.md): Publish metrics in the [Ganglia Monitoring System](http://ganglia.info/) using the `gmetric` CLI tool - [`libganglia`](./libgangliaSink.md): Publish metrics 
in the [Ganglia Monitoring System](http://ganglia.info/) directly using `libganglia.so` @@ -34,11 +35,12 @@ The configuration file for the sinks is a list of configurations. The `type` fie # Contributing own sinks -A sink contains four functions and is derived from the type `sink`: -* `Init(config json.RawMessage) error` +A sink contains five functions and is derived from the type `sink`: +* `Init(name string, config json.RawMessage) error` * `Write(point CCMetric) error` * `Flush() error` * `Close()` +* `New(name string, config json.RawMessage) (Sink, error)` (calls the `Init()` function) The data structures should be set up in `Init()` like opening a file or server connection. The `Write()` function writes/sends the data. For non-blocking sinks, the `Flush()` method tells the sink to drain its internal buffers. The `Close()` function should tear down anything created in `Init()`. @@ -65,8 +67,8 @@ type SampleSink struct { } // Initialize the sink by giving it a name and reading in the config JSON -func (s *SampleSink) Init(config json.RawMessage) error { - s.name = "SampleSink" // Always specify a name here +func (s *SampleSink) Init(name string, config json.RawMessage) error { + s.name = fmt.Sprintf("SampleSink(%s)", name) // Always specify a name here // Read in the config JSON if len(config) > 0 { err := json.Unmarshal(config, &s.config) @@ -91,4 +93,13 @@ func (s *SampleSink) Flush() error { // Close sink: close network connection, close files, close libraries, ... 
func (s *SampleSink) Close() {} + + +// New function to create a new instance of the sink +func NewSampleSink(name string, config json.RawMessage) (Sink, error) { + s := new(SampleSink) + err := s.Init(name, config) + return s, err +} + ``` \ No newline at end of file From 3598aed0909b033d728d14c55e424a1c5f7295ee Mon Sep 17 00:00:00 2001 From: Thomas Roehl Date: Tue, 22 Feb 2022 16:33:38 +0100 Subject: [PATCH 42/49] Use new receiver instances to allow multiple of same receiver type --- receivers/README.md | 22 ++++++++++------------ receivers/metricReceiver.go | 5 ++--- receivers/natsReceiver.go | 9 +++++++-- receivers/receiveManager.go | 7 +++---- 4 files changed, 22 insertions(+), 21 deletions(-) diff --git a/receivers/README.md b/receivers/README.md index 24425f2..49015d3 100644 --- a/receivers/README.md +++ b/receivers/README.md @@ -7,14 +7,11 @@ This folder contains the ReceiveManager and receiver implementations for the cc- The configuration file for the receivers is a list of configurations. The `type` field in each specifies which receiver to initialize. ```json -[ - { - "type": "nats", - "address": "nats://my-url", - "port" : "4222", - "database": "testcluster" +{ + "myreceivername" : { + } -] +} ``` @@ -25,20 +22,21 @@ The configuration file for the receivers is a list of configurations. The `type` "type": "nats", "address": "", "port" : "", - "database": "" + "subject": "" } ``` The `nats` receiver subscribes to the topic `database` and listens on `address` and `port` for metrics in the InfluxDB line protocol. 
# Contributing own receivers -A receiver contains three functions and is derived from the type `Receiver` (in `metricReceiver.go`): -* `Init(config ReceiverConfig) error` +A receiver contains a few functions and is derived from the type `Receiver` (in `metricReceiver.go`): +* `Init(name string, config json.RawMessage) error` * `Start() error` * `Close()` * `Name() string` -* `SetSink(sink chan ccMetric.CCMetric)` +* `SetSink(sink chan lp.CCMetric)` +* `New(name string, config json.RawMessage)` The data structures should be set up in `Init()` like opening a file or server connection. The `Start()` function should either start a go routine or issue some other asynchronous mechanism for receiving metrics. The `Close()` function should tear down anything created in `Init()`. -Finally, the receiver needs to be registered in the `receiveManager.go`. There is a list of receivers called `AvailableReceivers` which is a map (`receiver_type_string` -> `pointer to Receiver interface`). Add a new entry with a descriptive name and the new receiver. +Finally, the receiver needs to be registered in the `receiveManager.go`. There is a list of receivers called `AvailableReceivers` which is a map (`receiver_type_string` -> `pointer to NewReceiver function`). Add a new entry with a descriptive name and the new receiver. 
diff --git a/receivers/metricReceiver.go b/receivers/metricReceiver.go index c712186..e1a384a 100644 --- a/receivers/metricReceiver.go +++ b/receivers/metricReceiver.go @@ -20,9 +20,8 @@ type ReceiverConfig struct { } type receiver struct { - typename string - name string - sink chan lp.CCMetric + name string + sink chan lp.CCMetric } type Receiver interface { diff --git a/receivers/natsReceiver.go b/receivers/natsReceiver.go index dc96971..6114ecd 100644 --- a/receivers/natsReceiver.go +++ b/receivers/natsReceiver.go @@ -33,8 +33,7 @@ var DefaultTime = func() time.Time { } func (r *NatsReceiver) Init(name string, config json.RawMessage) error { - r.typename = "NatsReceiver" - r.name = name + r.name = fmt.Sprintf("NatsReceiver(%s)", name) r.config.Addr = nats.DefaultURL r.config.Port = "4222" if len(config) > 0 { @@ -91,3 +90,9 @@ func (r *NatsReceiver) Close() { r.nc.Close() } } + +func NewNatsReceiver(name string, config json.RawMessage) (Receiver, error) { + r := new(NatsReceiver) + err := r.Init(name, config) + return r, err +} diff --git a/receivers/receiveManager.go b/receivers/receiveManager.go index 7141170..7b1a8fe 100644 --- a/receivers/receiveManager.go +++ b/receivers/receiveManager.go @@ -9,8 +9,8 @@ import ( lp "github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric" ) -var AvailableReceivers = map[string]Receiver{ - "nats": &NatsReceiver{}, +var AvailableReceivers = map[string]func(name string, config json.RawMessage) (Receiver, error){ + "nats": NewNatsReceiver, } type receiveManager struct { @@ -75,8 +75,7 @@ func (rm *receiveManager) AddInput(name string, rawConfig json.RawMessage) error cclog.ComponentError("ReceiveManager", "SKIP", config.Type, "unknown receiver:", err.Error()) return err } - r := AvailableReceivers[config.Type] - err = r.Init(name, rawConfig) + r, err := AvailableReceivers[config.Type](name, rawConfig) if err != nil { cclog.ComponentError("ReceiveManager", "SKIP", r.Name(), "initialization failed:", err.Error()) return 
err From 6b6566b0aa04c0ce813baa766a3e267f62ccdbfe Mon Sep 17 00:00:00 2001 From: Thomas Roehl Date: Tue, 22 Feb 2022 17:46:17 +0100 Subject: [PATCH 43/49] Fix metric scope in likwid configuration script --- scripts/likwid_perfgroup_to_cc_config.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/likwid_perfgroup_to_cc_config.py b/scripts/likwid_perfgroup_to_cc_config.py index 52959ed..f1c3451 100755 --- a/scripts/likwid_perfgroup_to_cc_config.py +++ b/scripts/likwid_perfgroup_to_cc_config.py @@ -39,7 +39,7 @@ def group_to_json(groupfile): llist = re.split("\s+", line) calc = llist[-1] metric = " ".join(llist[:-1]) - scope = "hwthread" + scope = "cpu" if "BOX" in calc: scope = "socket" if "PWR" in calc: From d542f32baa62935ad8d18e1767823caaa4d7944e Mon Sep 17 00:00:00 2001 From: Thomas Roehl Date: Tue, 22 Feb 2022 17:46:44 +0100 Subject: [PATCH 44/49] Mention likwid config script in LikwidCollector README --- collectors/likwidMetric.md | 38 +++++++++++++++++++++++++++++++++++++- 1 file changed, 37 insertions(+), 1 deletion(-) diff --git a/collectors/likwidMetric.md b/collectors/likwidMetric.md index 8b5dee2..3ef51f3 100644 --- a/collectors/likwidMetric.md +++ b/collectors/likwidMetric.md @@ -4,7 +4,7 @@ The `likwid` collector is probably the most complicated collector. The LIKWID library is included as static library with *direct* access mode. The *direct* access mode is suitable if the daemon is executed by a root user. The static library does not contain the performance groups, so all information needs to be provided in the configuration. The `likwid` configuration consists of two parts, the "eventsets" and "globalmetrics": -- An event set list itself has two parts, the "events" and a set of derivable "metrics". Each of the "events" is a counter:event pair in LIKWID's syntax. The "metrics" are a list of formulas to derive the metric value from the measurements of the "events". 
Each metric has a name, the formula, a scope and a publish flag. A counter names can be used like variables in the formulas, so `PMC0+PMC1` sums the measurements for the both events configured in the counters `PMC0` and `PMC1`. The scope tells the Collector whether it is a metric for each hardware thread (`cpu`) or each CPU socket (`socket`). The last one is the publishing flag. It tells the collector whether a metric should be sent to the router. +- An event set list itself has two parts, the "events" and a set of derivable "metrics". Each of the "events" is a counter:event pair in LIKWID's syntax. The "metrics" are a list of formulas to derive the metric value from the measurements of the "events". Each metric has a name, the formula, a scope and a publish flag. Counter names can be used like variables in the formulas, so `PMC0+PMC1` sums the measurements for both events configured in the counters `PMC0` and `PMC1`. The scope tells the Collector whether it is a metric for each hardware thread (`cpu`) or each CPU socket (`socket`). The last one is the publishing flag. It tells the collector whether a metric should be sent to the router. - The global metrics are metrics which require data from all event set measurements to be derived. The inputs are the metrics in the event sets. Similar to the metrics in the event sets, the global metrics are defined by a name, a formula, a scope and a publish flag. See event set metrics for details. The only difference is that there is no access to the raw event measurements anymore but only to the metrics. So, the idea is to derive a metric in the "eventsets" section and reuse it in the "globalmetrics" part. If you need a metric only for deriving the global metrics, disable forwarding of the event set metrics. **Be aware** that the combination might be misleading because the "behavior" of a metric changes over time and the multiple measurements might count different computing phases.
Additional options: @@ -26,6 +26,42 @@ As a guideline: - All `PWRx` counters have scope `socket`, except `"PWR1" : "RAPL_CORE_ENERGY"` has `cpu` scope - All `DFCx` counters have scope `socket` +### Help with the configuration + +The configuration for the `likwid` collector is quite complicated. Most users don't use LIKWID with the event:counter notation but rely on the performance groups defined by the LIKWID team for each architecture. In order to help with the `likwid` collector configuration, we included a script `scripts/likwid_perfgroup_to_cc_config.py` that creates the configuration of an `eventset` from a performance group (using a LIKWID installation in `$PATH`): +``` +$ likwid-perfctr -i +[...] +short name: ICX +[...] +$ likwid-perfctr -a +[...] +MEM_DP +MEM +FLOPS_SP +CLOCK +[...] +$ scripts/likwid_perfgroup_to_cc_config.py ICX MEM_DP +{ + "events": { + "FIXC0": "INSTR_RETIRED_ANY", + "..." : "..." + }, + "metrics" : [ + { + "calc": "time", + "name": "Runtime (RDTSC) [s]", + "publish": true, + "scope": "cpu" + }, + { + "..." : "..." + } + ] +} +``` + +You can copy this JSON and add it to the `eventsets` list. If you specify multiple event sets, you can add globally derived metrics in the extra `globalmetrics` section with the metric names as variables.
### Example configuration From 73981527d3392408a3be52194e8dadd252e9406d Mon Sep 17 00:00:00 2001 From: Holger Obermaier Date: Wed, 23 Feb 2022 14:56:29 +0100 Subject: [PATCH 45/49] Refactor: Embed Init() into New() function --- sinks/gangliaSink.go | 66 ++++++++++------------ sinks/httpSink.go | 98 ++++++++++++++++----------------- sinks/influxAsyncSink.go | 75 ++++++++++++------------- sinks/influxSink.go | 40 +++++++------- sinks/libgangliaSink.go | 115 +++++++++++++++++++-------------------- sinks/metricSink.go | 3 - sinks/natsSink.go | 48 ++++++++-------- sinks/stdoutSink.go | 54 +++++++++--------- 8 files changed, 234 insertions(+), 265 deletions(-) diff --git a/sinks/gangliaSink.go b/sinks/gangliaSink.go index 0c9459b..8431011 100644 --- a/sinks/gangliaSink.go +++ b/sinks/gangliaSink.go @@ -33,41 +33,6 @@ type GangliaSink struct { config GangliaSinkConfig } -func (s *GangliaSink) Init(name string, config json.RawMessage) error { - var err error = nil - s.name = fmt.Sprintf("GangliaSink(%s)", name) - s.config.AddTagsAsDesc = false - s.config.AddGangliaGroup = false - if len(config) > 0 { - err := json.Unmarshal(config, &s.config) - if err != nil { - cclog.ComponentError(s.name, "Error reading config for", s.name, ":", err.Error()) - return err - } - } - s.gmetric_path = "" - s.gmetric_config = "" - if len(s.config.GmetricPath) > 0 { - p, err := exec.LookPath(s.config.GmetricPath) - if err == nil { - s.gmetric_path = p - } - } - if len(s.gmetric_path) == 0 { - p, err := exec.LookPath(string(GMETRIC_EXEC)) - if err == nil { - s.gmetric_path = p - } - } - if len(s.gmetric_path) == 0 { - err = errors.New("cannot find executable 'gmetric'") - } - if len(s.config.GmetricConfig) > 0 { - s.gmetric_config = s.config.GmetricConfig - } - return err -} - func (s *GangliaSink) Write(point lp.CCMetric) error { var err error = nil var tagsstr []string @@ -170,6 +135,35 @@ func (s *GangliaSink) Close() { func NewGangliaSink(name string, config json.RawMessage) (Sink, error) { 
s := new(GangliaSink) - s.Init(name, config) + s.name = fmt.Sprintf("GangliaSink(%s)", name) + s.config.AddTagsAsDesc = false + s.config.AddGangliaGroup = false + if len(config) > 0 { + err := json.Unmarshal(config, &s.config) + if err != nil { + cclog.ComponentError(s.name, "Error reading config for", s.name, ":", err.Error()) + return nil, err + } + } + s.gmetric_path = "" + s.gmetric_config = "" + if len(s.config.GmetricPath) > 0 { + p, err := exec.LookPath(s.config.GmetricPath) + if err == nil { + s.gmetric_path = p + } + } + if len(s.gmetric_path) == 0 { + p, err := exec.LookPath(string(GMETRIC_EXEC)) + if err == nil { + s.gmetric_path = p + } + } + if len(s.gmetric_path) == 0 { + return nil, errors.New("cannot find executable 'gmetric'") + } + if len(s.config.GmetricConfig) > 0 { + s.gmetric_config = s.config.GmetricConfig + } return s, nil } diff --git a/sinks/httpSink.go b/sinks/httpSink.go index 41a5919..c2dd2ea 100644 --- a/sinks/httpSink.go +++ b/sinks/httpSink.go @@ -38,57 +38,6 @@ type HttpSink struct { flushDelay time.Duration } -func (s *HttpSink) Init(name string, config json.RawMessage) error { - // Set default values - s.name = fmt.Sprintf("HttpSink(%s)", name) - s.config.MaxIdleConns = 10 - s.config.IdleConnTimeout = "5s" - s.config.Timeout = "5s" - s.config.FlushDelay = "1s" - - // Read config - if len(config) > 0 { - err := json.Unmarshal(config, &s.config) - if err != nil { - return err - } - } - if len(s.config.URL) == 0 { - return errors.New("`url` config option is required for HTTP sink") - } - if s.config.MaxIdleConns > 0 { - s.maxIdleConns = s.config.MaxIdleConns - } - if len(s.config.IdleConnTimeout) > 0 { - t, err := time.ParseDuration(s.config.IdleConnTimeout) - if err == nil { - s.idleConnTimeout = t - } - } - if len(s.config.Timeout) > 0 { - t, err := time.ParseDuration(s.config.Timeout) - if err == nil { - s.timeout = t - } - } - if len(s.config.FlushDelay) > 0 { - t, err := time.ParseDuration(s.config.FlushDelay) - if err == nil { 
- s.flushDelay = t - } - } - tr := &http.Transport{ - MaxIdleConns: s.maxIdleConns, - IdleConnTimeout: s.idleConnTimeout, - } - s.client = &http.Client{Transport: tr, Timeout: s.timeout} - s.buffer = &bytes.Buffer{} - s.encoder = influx.NewEncoder(s.buffer) - s.encoder.SetPrecision(time.Second) - - return nil -} - func (s *HttpSink) Write(m lp.CCMetric) error { if s.buffer.Len() == 0 && s.flushDelay != 0 { // This is the first write since the last flush, start the flushTimer! @@ -172,6 +121,51 @@ func (s *HttpSink) Close() { func NewHttpSink(name string, config json.RawMessage) (Sink, error) { s := new(HttpSink) - s.Init(name, config) + // Set default values + s.name = fmt.Sprintf("HttpSink(%s)", name) + s.config.MaxIdleConns = 10 + s.config.IdleConnTimeout = "5s" + s.config.Timeout = "5s" + s.config.FlushDelay = "1s" + + // Read config + if len(config) > 0 { + err := json.Unmarshal(config, &s.config) + if err != nil { + return nil, err + } + } + if len(s.config.URL) == 0 { + return nil, errors.New("`url` config option is required for HTTP sink") + } + if s.config.MaxIdleConns > 0 { + s.maxIdleConns = s.config.MaxIdleConns + } + if len(s.config.IdleConnTimeout) > 0 { + t, err := time.ParseDuration(s.config.IdleConnTimeout) + if err == nil { + s.idleConnTimeout = t + } + } + if len(s.config.Timeout) > 0 { + t, err := time.ParseDuration(s.config.Timeout) + if err == nil { + s.timeout = t + } + } + if len(s.config.FlushDelay) > 0 { + t, err := time.ParseDuration(s.config.FlushDelay) + if err == nil { + s.flushDelay = t + } + } + tr := &http.Transport{ + MaxIdleConns: s.maxIdleConns, + IdleConnTimeout: s.idleConnTimeout, + } + s.client = &http.Client{Transport: tr, Timeout: s.timeout} + s.buffer = &bytes.Buffer{} + s.encoder = influx.NewEncoder(s.buffer) + s.encoder.SetPrecision(time.Second) return s, nil } diff --git a/sinks/influxAsyncSink.go b/sinks/influxAsyncSink.go index e2f8995..81b7f78 100644 --- a/sinks/influxAsyncSink.go +++ b/sinks/influxAsyncSink.go @@ 
-30,11 +30,10 @@ type InfluxAsyncSinkConfig struct { type InfluxAsyncSink struct { sink - client influxdb2.Client - writeApi influxdb2Api.WriteAPI - retPolicy string - errors <-chan error - config InfluxAsyncSinkConfig + client influxdb2.Client + writeApi influxdb2Api.WriteAPI + errors <-chan error + config InfluxAsyncSinkConfig } func (s *InfluxAsyncSink) connect() error { @@ -68,39 +67,6 @@ func (s *InfluxAsyncSink) connect() error { return nil } -func (s *InfluxAsyncSink) Init(name string, config json.RawMessage) error { - s.name = fmt.Sprintf("InfluxSink(%s)", name) - - // Set default for maximum number of points sent to server in single request. - s.config.BatchSize = 100 - - if len(config) > 0 { - err := json.Unmarshal(config, &s.config) - if err != nil { - return err - } - } - if len(s.config.Host) == 0 || - len(s.config.Port) == 0 || - len(s.config.Database) == 0 || - len(s.config.Organization) == 0 || - len(s.config.Password) == 0 { - return errors.New("not all configuration variables set required by InfluxAsyncSink") - } - - // Connect to InfluxDB server - err := s.connect() - - // Start background: Read from error channel - s.errors = s.writeApi.Errors() - go func() { - for err := range s.errors { - cclog.ComponentError(s.name, err.Error()) - } - }() - return err -} - func (s *InfluxAsyncSink) Write(m lp.CCMetric) error { s.writeApi.WritePoint( m.ToPoint(s.config.MetaAsTags), @@ -121,6 +87,37 @@ func (s *InfluxAsyncSink) Close() { func NewInfluxAsyncSink(name string, config json.RawMessage) (Sink, error) { s := new(InfluxAsyncSink) - s.Init(name, config) + s.name = fmt.Sprintf("InfluxSink(%s)", name) + + // Set default for maximum number of points sent to server in single request. 
+ s.config.BatchSize = 100 + + if len(config) > 0 { + err := json.Unmarshal(config, &s.config) + if err != nil { + return nil, err + } + } + if len(s.config.Host) == 0 || + len(s.config.Port) == 0 || + len(s.config.Database) == 0 || + len(s.config.Organization) == 0 || + len(s.config.Password) == 0 { + return nil, errors.New("not all configuration variables set required by InfluxAsyncSink") + } + + // Connect to InfluxDB server + if err := s.connect(); err != nil { + return nil, fmt.Errorf("Unable to connect: %v", err) + } + + // Start background: Read from error channel + s.errors = s.writeApi.Errors() + go func() { + for err := range s.errors { + cclog.ComponentError(s.name, err.Error()) + } + }() + return s, nil } diff --git a/sinks/influxSink.go b/sinks/influxSink.go index f235054..d156585 100644 --- a/sinks/influxSink.go +++ b/sinks/influxSink.go @@ -57,26 +57,6 @@ func (s *InfluxSink) connect() error { return nil } -func (s *InfluxSink) Init(name string, config json.RawMessage) error { - s.name = fmt.Sprintf("InfluxSink(%s)", name) - if len(config) > 0 { - err := json.Unmarshal(config, &s.config) - if err != nil { - return err - } - } - if len(s.config.Host) == 0 || - len(s.config.Port) == 0 || - len(s.config.Database) == 0 || - len(s.config.Organization) == 0 || - len(s.config.Password) == 0 { - return errors.New("not all configuration variables set required by InfluxSink") - } - - // Connect to InfluxDB server - return s.connect() -} - func (s *InfluxSink) Write(m lp.CCMetric) error { err := s.writeApi.WritePoint( @@ -97,6 +77,24 @@ func (s *InfluxSink) Close() { func NewInfluxSink(name string, config json.RawMessage) (Sink, error) { s := new(InfluxSink) - s.Init(name, config) + s.name = fmt.Sprintf("InfluxSink(%s)", name) + if len(config) > 0 { + err := json.Unmarshal(config, &s.config) + if err != nil { + return nil, err + } + } + if len(s.config.Host) == 0 || + len(s.config.Port) == 0 || + len(s.config.Database) == 0 || + len(s.config.Organization) == 0 
|| + len(s.config.Password) == 0 { + return nil, errors.New("not all configuration variables set required by InfluxSink") + } + + // Connect to InfluxDB server + if err := s.connect(); err != nil { + return nil, fmt.Errorf("Unable to connect: %v", err) + } return s, nil } diff --git a/sinks/libgangliaSink.go b/sinks/libgangliaSink.go index 9a45df4..5fd1eb8 100644 --- a/sinks/libgangliaSink.go +++ b/sinks/libgangliaSink.go @@ -109,65 +109,6 @@ type LibgangliaSink struct { cstrCache map[string]*C.char } -func (s *LibgangliaSink) Init(name string, config json.RawMessage) error { - var err error = nil - s.name = fmt.Sprintf("LibgangliaSink(%s)", name) - //s.config.AddTagsAsDesc = false - s.config.AddGangliaGroup = false - s.config.AddTypeToName = false - s.config.AddUnits = true - s.config.GmondConfig = string(GMOND_CONFIG_FILE) - s.config.GangliaLib = string(GANGLIA_LIB_NAME) - if len(config) > 0 { - err = json.Unmarshal(config, &s.config) - if err != nil { - cclog.ComponentError(s.name, "Error reading config:", err.Error()) - return err - } - } - lib := dl.New(s.config.GangliaLib, GANGLIA_LIB_DL_FLAGS) - if lib == nil { - return fmt.Errorf("error instantiating DynamicLibrary for %s", s.config.GangliaLib) - } - err = lib.Open() - if err != nil { - return fmt.Errorf("error opening %s: %v", s.config.GangliaLib, err) - } - - // Set up cache for the C strings - s.cstrCache = make(map[string]*C.char) - // s.cstrCache["globals"] = C.CString("globals") - - // s.cstrCache["override_hostname"] = C.CString("override_hostname") - // s.cstrCache["override_ip"] = C.CString("override_ip") - - // Add some constant strings - s.cstrCache["GROUP"] = C.CString("GROUP") - s.cstrCache["CLUSTER"] = C.CString("CLUSTER") - s.cstrCache[""] = C.CString("") - - // Add cluster name for lookup in Write() - if len(s.config.ClusterName) > 0 { - s.cstrCache[s.config.ClusterName] = C.CString(s.config.ClusterName) - } - // Add supported types for later lookup in Write() - s.cstrCache["double"] = 
C.CString("double") - s.cstrCache["int32"] = C.CString("int32") - s.cstrCache["string"] = C.CString("string") - - // Create Ganglia pool - s.global_context = C.Ganglia_pool_create(nil) - // Load Ganglia configuration - s.cstrCache[s.config.GmondConfig] = C.CString(s.config.GmondConfig) - s.gmond_config = C.Ganglia_gmond_config_create(s.cstrCache[s.config.GmondConfig], 0) - //globals := C.cfg_getsec(gmond_config, s.cstrCache["globals"]) - //override_hostname := C.cfg_getstr(globals, s.cstrCache["override_hostname"]) - //override_ip := C.cfg_getstr(globals, s.cstrCache["override_ip"]) - - s.send_channels = C.Ganglia_udp_send_channels_create(s.global_context, s.gmond_config) - return nil -} - func (s *LibgangliaSink) Write(point lp.CCMetric) error { var err error = nil var c_name *C.char @@ -319,6 +260,60 @@ func (s *LibgangliaSink) Close() { func NewLibgangliaSink(name string, config json.RawMessage) (Sink, error) { s := new(LibgangliaSink) - s.Init(name, config) + var err error = nil + s.name = fmt.Sprintf("LibgangliaSink(%s)", name) + //s.config.AddTagsAsDesc = false + s.config.AddGangliaGroup = false + s.config.AddTypeToName = false + s.config.AddUnits = true + s.config.GmondConfig = string(GMOND_CONFIG_FILE) + s.config.GangliaLib = string(GANGLIA_LIB_NAME) + if len(config) > 0 { + err = json.Unmarshal(config, &s.config) + if err != nil { + cclog.ComponentError(s.name, "Error reading config:", err.Error()) + return nil, err + } + } + lib := dl.New(s.config.GangliaLib, GANGLIA_LIB_DL_FLAGS) + if lib == nil { + return nil, fmt.Errorf("error instantiating DynamicLibrary for %s", s.config.GangliaLib) + } + err = lib.Open() + if err != nil { + return nil, fmt.Errorf("error opening %s: %v", s.config.GangliaLib, err) + } + + // Set up cache for the C strings + s.cstrCache = make(map[string]*C.char) + // s.cstrCache["globals"] = C.CString("globals") + + // s.cstrCache["override_hostname"] = C.CString("override_hostname") + // s.cstrCache["override_ip"] = 
C.CString("override_ip") + + // Add some constant strings + s.cstrCache["GROUP"] = C.CString("GROUP") + s.cstrCache["CLUSTER"] = C.CString("CLUSTER") + s.cstrCache[""] = C.CString("") + + // Add cluster name for lookup in Write() + if len(s.config.ClusterName) > 0 { + s.cstrCache[s.config.ClusterName] = C.CString(s.config.ClusterName) + } + // Add supported types for later lookup in Write() + s.cstrCache["double"] = C.CString("double") + s.cstrCache["int32"] = C.CString("int32") + s.cstrCache["string"] = C.CString("string") + + // Create Ganglia pool + s.global_context = C.Ganglia_pool_create(nil) + // Load Ganglia configuration + s.cstrCache[s.config.GmondConfig] = C.CString(s.config.GmondConfig) + s.gmond_config = C.Ganglia_gmond_config_create(s.cstrCache[s.config.GmondConfig], 0) + //globals := C.cfg_getsec(gmond_config, s.cstrCache["globals"]) + //override_hostname := C.cfg_getstr(globals, s.cstrCache["override_hostname"]) + //override_ip := C.cfg_getstr(globals, s.cstrCache["override_ip"]) + + s.send_channels = C.Ganglia_udp_send_channels_create(s.global_context, s.gmond_config) return s, nil } diff --git a/sinks/metricSink.go b/sinks/metricSink.go index 4583a2c..8fe02d7 100644 --- a/sinks/metricSink.go +++ b/sinks/metricSink.go @@ -1,8 +1,6 @@ package sinks import ( - "encoding/json" - lp "github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric" ) @@ -17,7 +15,6 @@ type sink struct { } type Sink interface { - Init(name string, config json.RawMessage) error Write(point lp.CCMetric) error Flush() error Close() diff --git a/sinks/natsSink.go b/sinks/natsSink.go index 6087da0..0d7987e 100644 --- a/sinks/natsSink.go +++ b/sinks/natsSink.go @@ -53,30 +53,6 @@ func (s *NatsSink) connect() error { return nil } -func (s *NatsSink) Init(name string, config json.RawMessage) error { - s.name = fmt.Sprintf("NatsSink(%s)", name) - if len(config) > 0 { - err := json.Unmarshal(config, &s.config) - if err != nil { - cclog.ComponentError(s.name, "Error reading config 
for", s.name, ":", err.Error()) - return err - } - } - if len(s.config.Host) == 0 || - len(s.config.Port) == 0 || - len(s.config.Database) == 0 { - return errors.New("not all configuration variables set required by NatsSink") - } - // Setup Influx line protocol - s.buffer = &bytes.Buffer{} - s.buffer.Grow(1025) - s.encoder = influx.NewEncoder(s.buffer) - s.encoder.SetPrecision(time.Second) - s.encoder.SetMaxLineBytes(1024) - // Setup infos for connection - return s.connect() -} - func (s *NatsSink) Write(m lp.CCMetric) error { if s.client != nil { _, err := s.encoder.Encode(m.ToPoint(s.config.MetaAsTags)) @@ -108,6 +84,28 @@ func (s *NatsSink) Close() { func NewNatsSink(name string, config json.RawMessage) (Sink, error) { s := new(NatsSink) - s.Init(name, config) + s.name = fmt.Sprintf("NatsSink(%s)", name) + if len(config) > 0 { + err := json.Unmarshal(config, &s.config) + if err != nil { + cclog.ComponentError(s.name, "Error reading config for", s.name, ":", err.Error()) + return nil, err + } + } + if len(s.config.Host) == 0 || + len(s.config.Port) == 0 || + len(s.config.Database) == 0 { + return nil, errors.New("not all configuration variables set required by NatsSink") + } + // Setup Influx line protocol + s.buffer = &bytes.Buffer{} + s.buffer.Grow(1025) + s.encoder = influx.NewEncoder(s.buffer) + s.encoder.SetPrecision(time.Second) + s.encoder.SetMaxLineBytes(1024) + // Setup infos for connection + if err := s.connect(); err != nil { + return nil, fmt.Errorf("Unable to connect: %v", err) + } return s, nil } diff --git a/sinks/stdoutSink.go b/sinks/stdoutSink.go index d6c2e1b..acf2621 100644 --- a/sinks/stdoutSink.go +++ b/sinks/stdoutSink.go @@ -19,34 +19,6 @@ type StdoutSink struct { } } -func (s *StdoutSink) Init(name string, config json.RawMessage) error { - s.name = fmt.Sprintf("StdoutSink(%s)", name) - if len(config) > 0 { - err := json.Unmarshal(config, &s.config) - if err != nil { - return err - } - } - - s.output = os.Stdout - if len(s.config.Output) > 
0 { - switch strings.ToLower(s.config.Output) { - case "stdout": - s.output = os.Stdout - case "stderr": - s.output = os.Stderr - default: - f, err := os.OpenFile(s.config.Output, os.O_CREATE|os.O_WRONLY, os.FileMode(0600)) - if err != nil { - return err - } - s.output = f - } - } - s.meta_as_tags = s.config.MetaAsTags - return nil -} - func (s *StdoutSink) Write(m lp.CCMetric) error { fmt.Fprint( s.output, @@ -68,6 +40,30 @@ func (s *StdoutSink) Close() { func NewStdoutSink(name string, config json.RawMessage) (Sink, error) { s := new(StdoutSink) - s.Init(name, config) + s.name = fmt.Sprintf("StdoutSink(%s)", name) + if len(config) > 0 { + err := json.Unmarshal(config, &s.config) + if err != nil { + return nil, err + } + } + + s.output = os.Stdout + if len(s.config.Output) > 0 { + switch strings.ToLower(s.config.Output) { + case "stdout": + s.output = os.Stdout + case "stderr": + s.output = os.Stderr + default: + f, err := os.OpenFile(s.config.Output, os.O_CREATE|os.O_WRONLY, os.FileMode(0600)) + if err != nil { + return nil, err + } + s.output = f + } + } + s.meta_as_tags = s.config.MetaAsTags + return s, nil } From 2f363754700a134a8d2e32a8b7e82933890ea267 Mon Sep 17 00:00:00 2001 From: Holger Obermaier Date: Wed, 23 Feb 2022 15:15:17 +0100 Subject: [PATCH 46/49] Refactor: Embed Init() into New() function --- receivers/README.md | 2 -- receivers/metricReceiver.go | 4 --- receivers/natsReceiver.go | 62 +++++++++++++++++-------------------- 3 files changed, 28 insertions(+), 40 deletions(-) diff --git a/receivers/README.md b/receivers/README.md index 49015d3..808dc74 100644 --- a/receivers/README.md +++ b/receivers/README.md @@ -14,7 +14,6 @@ The configuration file for the receivers is a list of configurations. 
The `type` } ``` - ## Type `nats` ```json @@ -30,7 +29,6 @@ The `nats` receiver subscribes to the topic `database` and listens on `address` # Contributing own receivers A receiver contains a few functions and is derived from the type `Receiver` (in `metricReceiver.go`): -* `Init(name string, config json.RawMessage) error` * `Start() error` * `Close()` * `Name() string` diff --git a/receivers/metricReceiver.go b/receivers/metricReceiver.go index e1a384a..e133354 100644 --- a/receivers/metricReceiver.go +++ b/receivers/metricReceiver.go @@ -1,9 +1,6 @@ package receivers import ( - // "time" - "encoding/json" - lp "github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric" ) @@ -25,7 +22,6 @@ type receiver struct { } type Receiver interface { - Init(name string, config json.RawMessage) error Start() Close() Name() string diff --git a/receivers/natsReceiver.go b/receivers/natsReceiver.go index 6114ecd..1a5f47b 100644 --- a/receivers/natsReceiver.go +++ b/receivers/natsReceiver.go @@ -32,38 +32,6 @@ var DefaultTime = func() time.Time { return time.Unix(42, 0) } -func (r *NatsReceiver) Init(name string, config json.RawMessage) error { - r.name = fmt.Sprintf("NatsReceiver(%s)", name) - r.config.Addr = nats.DefaultURL - r.config.Port = "4222" - if len(config) > 0 { - err := json.Unmarshal(config, &r.config) - if err != nil { - cclog.ComponentError(r.name, "Error reading config:", err.Error()) - return err - } - } - if len(r.config.Addr) == 0 || - len(r.config.Port) == 0 || - len(r.config.Subject) == 0 { - return errors.New("not all configuration variables set required by NatsReceiver") - } - r.meta = map[string]string{"source": r.name} - uri := fmt.Sprintf("%s:%s", r.config.Addr, r.config.Port) - cclog.ComponentDebug(r.name, "INIT", uri, "Subject", r.config.Subject) - nc, err := nats.Connect(uri) - if err == nil { - r.nc = nc - } else { - r.nc = nil - return err - } - r.handler = influx.NewMetricHandler() - r.parser = influx.NewParser(r.handler) - 
r.parser.SetTimeFunc(DefaultTime) - return err -} - func (r *NatsReceiver) Start() { cclog.ComponentDebug(r.name, "START") r.nc.Subscribe(r.config.Subject, r._NatsReceive) @@ -93,6 +61,32 @@ func (r *NatsReceiver) Close() { func NewNatsReceiver(name string, config json.RawMessage) (Receiver, error) { r := new(NatsReceiver) - err := r.Init(name, config) - return r, err + r.name = fmt.Sprintf("NatsReceiver(%s)", name) + r.config.Addr = nats.DefaultURL + r.config.Port = "4222" + if len(config) > 0 { + err := json.Unmarshal(config, &r.config) + if err != nil { + cclog.ComponentError(r.name, "Error reading config:", err.Error()) + return nil, err + } + } + if len(r.config.Addr) == 0 || + len(r.config.Port) == 0 || + len(r.config.Subject) == 0 { + return nil, errors.New("not all configuration variables set required by NatsReceiver") + } + r.meta = map[string]string{"source": r.name} + uri := fmt.Sprintf("%s:%s", r.config.Addr, r.config.Port) + cclog.ComponentDebug(r.name, "NewNatsReceiver", uri, "Subject", r.config.Subject) + if nc, err := nats.Connect(uri); err == nil { + r.nc = nc + } else { + r.nc = nil + return nil, err + } + r.handler = influx.NewMetricHandler() + r.parser = influx.NewParser(r.handler) + r.parser.SetTimeFunc(DefaultTime) + return r, nil } From 2f044f4b586129da8f93db8814c3362547ac9a8e Mon Sep 17 00:00:00 2001 From: Holger Obermaier <40787752+ho-ob@users.noreply.github.com> Date: Wed, 23 Feb 2022 15:58:51 +0100 Subject: [PATCH 47/49] Fix: MetricReceiver uses uninitialized values, when initialization fails --- receivers/receiveManager.go | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/receivers/receiveManager.go b/receivers/receiveManager.go index 7b1a8fe..1c13026 100644 --- a/receivers/receiveManager.go +++ b/receivers/receiveManager.go @@ -30,11 +30,13 @@ type ReceiveManager interface { } func (rm *receiveManager) Init(wg *sync.WaitGroup, receiverConfigFile string) error { + // Initialize struct fields rm.inputs = 
make([]Receiver, 0) rm.output = nil rm.done = make(chan bool) rm.wg = wg rm.config = make([]json.RawMessage, 0) + configFile, err := os.Open(receiverConfigFile) if err != nil { cclog.ComponentError("ReceiveManager", err.Error()) @@ -51,6 +53,7 @@ func (rm *receiveManager) Init(wg *sync.WaitGroup, receiverConfigFile string) er for name, raw := range rawConfigs { rm.AddInput(name, raw) } + return nil } @@ -77,7 +80,7 @@ func (rm *receiveManager) AddInput(name string, rawConfig json.RawMessage) error } r, err := AvailableReceivers[config.Type](name, rawConfig) if err != nil { - cclog.ComponentError("ReceiveManager", "SKIP", r.Name(), "initialization failed:", err.Error()) + cclog.ComponentError("ReceiveManager", "SKIP", name, "initialization failed:", err.Error()) return err } rm.inputs = append(rm.inputs, r) From 16c03d2aa2e7b390ab5d31f054c688cec8c65a96 Mon Sep 17 00:00:00 2001 From: Thomas Gruber Date: Thu, 24 Feb 2022 18:22:20 +0100 Subject: [PATCH 48/49] Use Ganglia configuration (#44) * Copy all metric configurations from original Ganglia code * Use metric configurations from Ganglia for some metrics * Format value string also for known metrics --- sinks/gangliaCommon.go | 218 +++++++++++++++++++++++++++++++++++++++- sinks/gangliaSink.go | 96 +++++------------- sinks/libgangliaSink.go | 111 +++++++------------- 3 files changed, 279 insertions(+), 146 deletions(-) diff --git a/sinks/gangliaCommon.go b/sinks/gangliaCommon.go index b939f16..b2a1b2c 100644 --- a/sinks/gangliaCommon.go +++ b/sinks/gangliaCommon.go @@ -1,6 +1,7 @@ package sinks import ( + "fmt" "strings" lp "github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric" @@ -23,11 +24,8 @@ func GangliaMetricName(point lp.CCMetric) string { return name } -func GangliaMetricRename(point lp.CCMetric) string { - name := point.Name() - if name == "mem_total" || name == "swap_total" { - return name - } else if name == "net_bytes_in" { +func GangliaMetricRename(name string) string { + if name == 
"net_bytes_in" { return "bytes_in" } else if name == "net_bytes_out" { return "bytes_out" @@ -48,3 +46,213 @@ func GangliaSlopeType(point lp.CCMetric) uint { } return 3 } + +const DEFAULT_GANGLIA_METRIC_TMAX = 300 +const DEFAULT_GANGLIA_METRIC_SLOPE = "both" + +type GangliaMetric struct { + Name string + Type string + Slope string + Tmax int + Unit string +} + +type GangliaMetricGroup struct { + Name string + Metrics []GangliaMetric +} + +var CommonGangliaMetrics = []GangliaMetricGroup{ + { + Name: "memory", + Metrics: []GangliaMetric{ + {"mem_total", "float", "zero", 1200, "KB"}, + {"swap_total", "float", "zero", 1200, "KB"}, + {"mem_free", "float", "both", 180, "KB"}, + {"mem_shared", "float", "both", 180, "KB"}, + {"mem_buffers", "float", "both", 180, "KB"}, + {"mem_cached", "float", "both", 180, "KB"}, + {"swap_free", "float", "both", 180, "KB"}, + {"mem_sreclaimable", "float", "both", 180, "KB"}, + {"mem_slab", "float", "both", 180, "KB"}, + }, + }, + { + Name: "cpu", + Metrics: []GangliaMetric{ + {"cpu_num", "uint32", "zero", 1200, "CPUs"}, + {"cpu_speed", "uint32", "zero", 1200, "MHz"}, + {"cpu_user", "float", "both", 90, "%"}, + {"cpu_nice", "float", "both", 90, "%"}, + {"cpu_system", "float", "both", 90, "%"}, + {"cpu_idle", "float", "both", 3800, "%"}, + {"cpu_aidle", "float", "both", 90, "%"}, + {"cpu_wio", "float", "both", 90, "%"}, + {"cpu_intr", "float", "both", 90, "%"}, + {"cpu_sintr", "float", "both", 90, "%"}, + {"cpu_steal", "float", "both", 90, "%"}, + {"cpu_guest", "float", "both", 90, "%"}, + {"cpu_gnice", "float", "both", 90, "%"}, + }, + }, + { + Name: "load", + Metrics: []GangliaMetric{ + {"load_one", "float", "both", 70, ""}, + {"load_five", "float", "both", 325, ""}, + {"load_fifteen", "float", "both", 950, ""}, + }, + }, + { + Name: "disk", + Metrics: []GangliaMetric{ + {"disk_total", "double", "both", 1200, "GB"}, + {"disk_free", "double", "both", 180, "GB"}, + {"part_max_used", "float", "both", 180, "%"}, + }, + }, + { + Name: 
"network", + Metrics: []GangliaMetric{ + {"bytes_out", "float", "both", 300, "bytes/sec"}, + {"bytes_in", "float", "both", 300, "bytes/sec"}, + {"pkts_in", "float", "both", 300, "packets/sec"}, + {"pkts_out", "float", "both", 300, "packets/sec"}, + }, + }, + { + Name: "process", + Metrics: []GangliaMetric{ + {"proc_run", "uint32", "both", 950, ""}, + {"proc_total", "uint32", "both", 950, ""}, + }, + }, + { + Name: "system", + Metrics: []GangliaMetric{ + {"boottime", "uint32", "zero", 1200, "s"}, + {"sys_clock", "uint32", "zero", 1200, "s"}, + {"machine_type", "string", "zero", 1200, ""}, + {"os_name", "string", "zero", 1200, ""}, + {"os_release", "string", "zero", 1200, ""}, + {"mtu", "uint32", "both", 1200, ""}, + }, + }, +} + +type GangliaMetricConfig struct { + Type string + Slope string + Tmax int + Unit string + Group string + Value string +} + +func GetCommonGangliaConfig(point lp.CCMetric) GangliaMetricConfig { + mname := GangliaMetricRename(point.Name()) + for _, group := range CommonGangliaMetrics { + for _, metric := range group.Metrics { + if metric.Name == mname { + valueStr := "" + value, ok := point.GetField("value") + if ok { + switch real := value.(type) { + case float64: + valueStr = fmt.Sprintf("%f", real) + case float32: + valueStr = fmt.Sprintf("%f", real) + case int64: + valueStr = fmt.Sprintf("%d", real) + case int32: + valueStr = fmt.Sprintf("%d", real) + case int: + valueStr = fmt.Sprintf("%d", real) + case uint64: + valueStr = fmt.Sprintf("%d", real) + case uint32: + valueStr = fmt.Sprintf("%d", real) + case uint: + valueStr = fmt.Sprintf("%d", real) + case string: + valueStr = real + default: + } + } + return GangliaMetricConfig{ + Group: group.Name, + Type: metric.Type, + Slope: metric.Slope, + Tmax: metric.Tmax, + Unit: metric.Unit, + Value: valueStr, + } + } + } + } + return GangliaMetricConfig{ + Group: "", + Type: "", + Slope: "", + Tmax: 0, + Unit: "", + Value: "", + } +} + +func GetGangliaConfig(point lp.CCMetric) 
GangliaMetricConfig { + group := "" + if g, ok := point.GetMeta("group"); ok { + group = g + } + unit := "" + if u, ok := point.GetMeta("unit"); ok { + unit = u + } + valueType := "double" + valueStr := "" + value, ok := point.GetField("value") + if ok { + switch real := value.(type) { + case float64: + valueStr = fmt.Sprintf("%f", real) + valueType = "double" + case float32: + valueStr = fmt.Sprintf("%f", real) + valueType = "float" + case int64: + valueStr = fmt.Sprintf("%d", real) + valueType = "int32" + case int32: + valueStr = fmt.Sprintf("%d", real) + valueType = "int32" + case int: + valueStr = fmt.Sprintf("%d", real) + valueType = "int32" + case uint64: + valueStr = fmt.Sprintf("%d", real) + valueType = "uint32" + case uint32: + valueStr = fmt.Sprintf("%d", real) + valueType = "uint32" + case uint: + valueStr = fmt.Sprintf("%d", real) + valueType = "uint32" + case string: + valueStr = real + valueType = "string" + default: + valueType = "invalid" + } + } + + return GangliaMetricConfig{ + Group: group, + Type: valueType, + Slope: DEFAULT_GANGLIA_METRIC_SLOPE, + Tmax: DEFAULT_GANGLIA_METRIC_TMAX, + Unit: unit, + Value: valueStr, + } +} diff --git a/sinks/gangliaSink.go b/sinks/gangliaSink.go index 8431011..22096af 100644 --- a/sinks/gangliaSink.go +++ b/sinks/gangliaSink.go @@ -24,6 +24,7 @@ type GangliaSinkConfig struct { AddTagsAsDesc bool `json:"add_tags_as_desc,omitempty"` ClusterName string `json:"cluster_name,omitempty"` AddTypeToName bool `json:"add_type_to_name,omitempty"` + AddUnits bool `json:"add_units,omitempty"` } type GangliaSink struct { @@ -35,91 +36,48 @@ type GangliaSink struct { func (s *GangliaSink) Write(point lp.CCMetric) error { var err error = nil - var tagsstr []string + //var tagsstr []string var argstr []string - if s.config.AddGangliaGroup { - if point.HasTag("group") { - g, _ := point.GetTag("group") - argstr = append(argstr, fmt.Sprintf("--group=%s", g)) - } else if point.HasMeta("group") { - g, _ := point.GetMeta("group") - 
argstr = append(argstr, fmt.Sprintf("--group=%s", g)) - } + + // Get metric name + metricname := GangliaMetricRename(point.Name()) + + // Get metric config (type, value, ... in suitable format) + conf := GetCommonGangliaConfig(point) + if len(conf.Type) == 0 { + conf = GetGangliaConfig(point) + } + if len(conf.Type) == 0 { + return fmt.Errorf("metric %s has no 'value' field", metricname) } - for key, value := range point.Tags() { - switch key { - case "unit": - argstr = append(argstr, fmt.Sprintf("--units=%s", value)) - default: - tagsstr = append(tagsstr, fmt.Sprintf("%s=%s", key, value)) - } + if s.config.AddGangliaGroup { + argstr = append(argstr, fmt.Sprintf("--group=%s", conf.Group)) } - if s.config.MetaAsTags { - for key, value := range point.Meta() { - switch key { - case "unit": - argstr = append(argstr, fmt.Sprintf("--units=%s", value)) - default: - tagsstr = append(tagsstr, fmt.Sprintf("%s=%s", key, value)) - } - } + if s.config.AddUnits && len(conf.Unit) > 0 { + argstr = append(argstr, fmt.Sprintf("--units=%s", conf.Unit)) } + if len(s.config.ClusterName) > 0 { argstr = append(argstr, fmt.Sprintf("--cluster=%s", s.config.ClusterName)) } - if s.config.AddTagsAsDesc && len(tagsstr) > 0 { - argstr = append(argstr, fmt.Sprintf("--desc=%q", strings.Join(tagsstr, ","))) - } + // if s.config.AddTagsAsDesc && len(tagsstr) > 0 { + // argstr = append(argstr, fmt.Sprintf("--desc=%q", strings.Join(tagsstr, ","))) + // } if len(s.gmetric_config) > 0 { argstr = append(argstr, fmt.Sprintf("--conf=%s", s.gmetric_config)) } - name := GangliaMetricRename(point) if s.config.AddTypeToName { argstr = append(argstr, fmt.Sprintf("--name=%s", GangliaMetricName(point))) } else { - argstr = append(argstr, fmt.Sprintf("--name=%s", name)) + argstr = append(argstr, fmt.Sprintf("--name=%s", metricname)) } - slope := GangliaSlopeType(point) - slopeStr := "both" - if slope == 0 { - slopeStr = "zero" - } - argstr = append(argstr, fmt.Sprintf("--slope=%s", slopeStr)) + argstr = 
append(argstr, fmt.Sprintf("--slope=%s", conf.Slope)) + argstr = append(argstr, fmt.Sprintf("--value=%s", conf.Value)) + argstr = append(argstr, fmt.Sprintf("--type=%s", conf.Type)) + argstr = append(argstr, fmt.Sprintf("--tmax=%d", conf.Tmax)) - for k, v := range point.Fields() { - if k == "value" { - switch value := v.(type) { - case float64: - argstr = append(argstr, - fmt.Sprintf("--value=%v", value), "--type=double") - case float32: - argstr = append(argstr, - fmt.Sprintf("--value=%v", value), "--type=float") - case int: - argstr = append(argstr, - fmt.Sprintf("--value=%d", value), "--type=int32") - case int32: - argstr = append(argstr, - fmt.Sprintf("--value=%d", value), "--type=int32") - case int64: - argstr = append(argstr, - fmt.Sprintf("--value=%d", value), "--type=int32") - case uint: - argstr = append(argstr, - fmt.Sprintf("--value=%d", value), "--type=uint32") - case uint32: - argstr = append(argstr, - fmt.Sprintf("--value=%d", value), "--type=uint32") - case uint64: - argstr = append(argstr, - fmt.Sprintf("--value=%d", value), "--type=uint32") - case string: - argstr = append(argstr, - fmt.Sprintf("--value=%q", value), "--type=string") - } - } - } + cclog.ComponentDebug(s.name, s.gmetric_path, strings.Join(argstr, " ")) command := exec.Command(s.gmetric_path, argstr...) 
command.Wait() _, err = command.Output() diff --git a/sinks/libgangliaSink.go b/sinks/libgangliaSink.go index 5fd1eb8..1fc7863 100644 --- a/sinks/libgangliaSink.go +++ b/sinks/libgangliaSink.go @@ -82,21 +82,21 @@ const ( GMOND_CONFIG_FILE = `/etc/ganglia/gmond.conf` ) -type LibgangliaSinkSpecialMetric struct { - MetricName string `json:"metric_name,omitempty"` - NewName string `json:"new_name,omitempty"` - Slope string `json:"slope,omitempty"` -} +// type LibgangliaSinkSpecialMetric struct { +// MetricName string `json:"metric_name,omitempty"` +// NewName string `json:"new_name,omitempty"` +// Slope string `json:"slope,omitempty"` +// } type LibgangliaSinkConfig struct { defaultSinkConfig - GangliaLib string `json:"libganglia_path,omitempty"` - GmondConfig string `json:"gmond_config,omitempty"` - AddGangliaGroup bool `json:"add_ganglia_group,omitempty"` - AddTypeToName bool `json:"add_type_to_name,omitempty"` - AddUnits bool `json:"add_units,omitempty"` - ClusterName string `json:"cluster_name,omitempty"` - SpecialMetrics map[string]LibgangliaSinkSpecialMetric `json:"rename_metrics,omitempty"` // Map to rename metric name from key to value + GangliaLib string `json:"libganglia_path,omitempty"` + GmondConfig string `json:"gmond_config,omitempty"` + AddGangliaGroup bool `json:"add_ganglia_group,omitempty"` + AddTypeToName bool `json:"add_type_to_name,omitempty"` + AddUnits bool `json:"add_units,omitempty"` + ClusterName string `json:"cluster_name,omitempty"` + //SpecialMetrics map[string]LibgangliaSinkSpecialMetric `json:"rename_metrics,omitempty"` // Map to rename metric name from key to value //AddTagsAsDesc bool `json:"add_tags_as_desc,omitempty"` } @@ -125,81 +125,48 @@ func (s *LibgangliaSink) Write(point lp.CCMetric) error { } // Get metric name - metricname := GangliaMetricRename(point) - if s.config.AddTypeToName { - c_name = lookup(GangliaMetricName(point)) - } else { - c_name = lookup(metricname) - } + metricname := GangliaMetricRename(point.Name()) - // 
Get the value C string and lookup the type string in the cache - value, ok := point.GetField("value") - if !ok { + conf := GetCommonGangliaConfig(point) + if len(conf.Type) == 0 { + conf = GetGangliaConfig(point) + } + if len(conf.Type) == 0 { return fmt.Errorf("metric %s has no 'value' field", metricname) } - switch real := value.(type) { - case float64: - c_value = C.CString(fmt.Sprintf("%f", real)) - c_type = lookup("double") - case float32: - c_value = C.CString(fmt.Sprintf("%f", real)) - c_type = lookup("float") - case int64: - c_value = C.CString(fmt.Sprintf("%d", real)) - c_type = lookup("int32") - case int32: - c_value = C.CString(fmt.Sprintf("%d", real)) - c_type = lookup("int32") - case int: - c_value = C.CString(fmt.Sprintf("%d", real)) - c_type = lookup("int32") - case uint64: - c_value = C.CString(fmt.Sprintf("%d", real)) - c_type = lookup("uint32") - case uint32: - c_value = C.CString(fmt.Sprintf("%d", real)) - c_type = lookup("uint32") - case uint: - c_value = C.CString(fmt.Sprintf("%d", real)) - c_type = lookup("uint32") - case string: - c_value = C.CString(real) - c_type = lookup("string") - default: - return fmt.Errorf("metric %s has invalid 'value' type for %s", point.Name(), s.name) + + if s.config.AddTypeToName { + metricname = GangliaMetricName(point) } + c_value = C.CString(conf.Value) + c_type = lookup(conf.Type) + c_name = lookup(metricname) + // Add unit + unit := "" if s.config.AddUnits { - if tagunit, tagok := point.GetTag("unit"); tagok { - c_unit = lookup(tagunit) - } else if metaunit, metaok := point.GetMeta("unit"); metaok { - c_unit = lookup(metaunit) - } else { - c_unit = lookup("") - } - } else { - c_unit = lookup("") + unit = conf.Unit } + c_unit = lookup(unit) // Determine the slope of the metric. Ganglia's own collector mostly use // 'both' but the mem and swap total uses 'zero'. 
- slope := GangliaSlopeType(point) slope_type := C.GANGLIA_SLOPE_BOTH - switch slope { - case 0: + switch conf.Slope { + case "zero": slope_type = C.GANGLIA_SLOPE_ZERO + case "both": + slope_type = C.GANGLIA_SLOPE_BOTH } // Create a new Ganglia metric gmetric := C.Ganglia_metric_create(s.global_context) // Set name, value, type and unit in the Ganglia metric - // Since we don't have this information from the collectors, - // we assume that the metric value can go up and down (slope), - // and there is no maximum for 'dmax' and 'tmax'. - // Ganglia's collectors set 'tmax' but not 'dmax' + // The default slope_type is both directions, so up and down. Some metrics want 'zero' slope, probably constant. + // The 'tmax' value is by default 300. rval := C.int(0) - rval = C.Ganglia_metric_set(gmetric, c_name, c_value, c_type, c_unit, C.uint(slope_type), 0, 0) + rval = C.Ganglia_metric_set(gmetric, c_name, c_value, c_type, c_unit, C.uint(slope_type), C.uint(conf.Tmax), 0) switch rval { case 1: C.free(unsafe.Pointer(c_value)) @@ -209,10 +176,10 @@ func (s *LibgangliaSink) Write(point lp.CCMetric) error { return errors.New("one of your parameters has an invalid character '\"'") case 3: C.free(unsafe.Pointer(c_value)) - return fmt.Errorf("the type parameter \"%s\" is not a valid type", C.GoString(c_type)) + return fmt.Errorf("the type parameter \"%s\" is not a valid type", conf.Type) case 4: C.free(unsafe.Pointer(c_value)) - return fmt.Errorf("the value parameter \"%s\" does not represent a number", C.GoString(c_value)) + return fmt.Errorf("the value parameter \"%s\" does not represent a number", conf.Value) default: } @@ -221,8 +188,8 @@ func (s *LibgangliaSink) Write(point lp.CCMetric) error { C.Ganglia_metadata_add(gmetric, lookup("CLUSTER"), lookup(s.config.ClusterName)) } // Set the group metadata in the Ganglia metric if configured - if group, ok := point.GetMeta("group"); ok && s.config.AddGangliaGroup { - c_group := lookup(group) + if s.config.AddGangliaGroup { + 
c_group := lookup(conf.Group) C.Ganglia_metadata_add(gmetric, lookup("GROUP"), c_group) } From c8bca59de4ef254c919babe6f113ff2d9bd60c51 Mon Sep 17 00:00:00 2001 From: Thomas Gruber Date: Thu, 24 Feb 2022 18:27:05 +0100 Subject: [PATCH 49/49] Numa-aware memstat collector (#45) --- collectors/memstatMetric.go | 176 ++++++++++++++++++++++++------------ 1 file changed, 118 insertions(+), 58 deletions(-) diff --git a/collectors/memstatMetric.go b/collectors/memstatMetric.go index b6ef855..3998537 100644 --- a/collectors/memstatMetric.go +++ b/collectors/memstatMetric.go @@ -1,35 +1,76 @@ package collectors import ( + "bufio" "encoding/json" "errors" "fmt" - "io/ioutil" - "log" + "os" + "path/filepath" + "regexp" "strconv" "strings" "time" + cclog "github.com/ClusterCockpit/cc-metric-collector/internal/ccLogger" lp "github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric" ) -const MEMSTATFILE = `/proc/meminfo` +const MEMSTATFILE = "/proc/meminfo" +const NUMA_MEMSTAT_BASE = "/sys/devices/system/node" type MemstatCollectorConfig struct { ExcludeMetrics []string `json:"exclude_metrics"` + NodeStats bool `json:"node_stats,omitempty"` + NumaStats bool `json:"numa_stats,omitempty"` +} + +type MemstatCollectorNode struct { + file string + tags map[string]string } type MemstatCollector struct { metricCollector - stats map[string]int64 - tags map[string]string - matches map[string]string - config MemstatCollectorConfig + stats map[string]int64 + tags map[string]string + matches map[string]string + config MemstatCollectorConfig + nodefiles map[int]MemstatCollectorNode +} + +func getStats(filename string) map[string]float64 { + stats := make(map[string]float64) + file, err := os.Open(filename) + if err != nil { + cclog.Error(err.Error()) + } + defer file.Close() + + scanner := bufio.NewScanner(file) + for scanner.Scan() { + line := scanner.Text() + linefields := strings.Fields(line) + if len(linefields) == 3 { + v, err := strconv.ParseFloat(linefields[1], 64) + if err == 
nil { + stats[strings.Trim(linefields[0], ":")] = v + } + } else if len(linefields) == 5 { + v, err := strconv.ParseFloat(linefields[3], 64) + if err == nil { + stats[strings.Trim(linefields[0], ":")] = v + } + } + } + return stats } func (m *MemstatCollector) Init(config json.RawMessage) error { var err error m.name = "MemstatCollector" + m.config.NodeStats = true + m.config.NumaStats = false if len(config) > 0 { err = json.Unmarshal(config, &m.config) if err != nil { @@ -40,7 +81,8 @@ func (m *MemstatCollector) Init(config json.RawMessage) error { m.stats = make(map[string]int64) m.matches = make(map[string]string) m.tags = map[string]string{"type": "node"} - matches := map[string]string{`MemTotal`: "mem_total", + matches := map[string]string{ + "MemTotal": "mem_total", "SwapTotal": "swap_total", "SReclaimable": "mem_sreclaimable", "Slab": "mem_slab", @@ -48,7 +90,9 @@ func (m *MemstatCollector) Init(config json.RawMessage) error { "Buffers": "mem_buffers", "Cached": "mem_cached", "MemAvailable": "mem_available", - "SwapFree": "swap_free"} + "SwapFree": "swap_free", + "MemShared": "mem_shared", + } for k, v := range matches { _, skip := stringArrayContains(m.config.ExcludeMetrics, k) if !skip { @@ -56,13 +100,44 @@ func (m *MemstatCollector) Init(config json.RawMessage) error { } } if len(m.matches) == 0 { - return errors.New("No metrics to collect") + return errors.New("no metrics to collect") } m.setup() - _, err = ioutil.ReadFile(string(MEMSTATFILE)) - if err == nil { - m.init = true + + if m.config.NodeStats { + if stats := getStats(MEMSTATFILE); len(stats) == 0 { + return fmt.Errorf("cannot read data from file %s", MEMSTATFILE) + } } + + if m.config.NumaStats { + globPattern := filepath.Join(NUMA_MEMSTAT_BASE, "node[0-9]*", "meminfo") + regex := regexp.MustCompile(filepath.Join(NUMA_MEMSTAT_BASE, "node(\\d+)", "meminfo")) + files, err := filepath.Glob(globPattern) + if err == nil { + m.nodefiles = make(map[int]MemstatCollectorNode) + for _, f := range files 
{ + if stats := getStats(f); len(stats) == 0 { + return fmt.Errorf("cannot read data from file %s", f) + } + rematch := regex.FindStringSubmatch(f) + if len(rematch) == 2 { + id, err := strconv.Atoi(rematch[1]) + if err == nil { + f := MemstatCollectorNode{ + file: f, + tags: map[string]string{ + "type": "memoryDomain", + "type-id": fmt.Sprintf("%d", id), + }, + } + m.nodefiles[id] = f + } + } + } + } + } + m.init = true return err } @@ -71,56 +146,41 @@ func (m *MemstatCollector) Read(interval time.Duration, output chan lp.CCMetric) return } - buffer, err := ioutil.ReadFile(string(MEMSTATFILE)) - if err != nil { - log.Print(err) - return - } - - ll := strings.Split(string(buffer), "\n") - for _, line := range ll { - ls := strings.Split(line, `:`) - if len(ls) > 1 { - lv := strings.Fields(ls[1]) - m.stats[ls[0]], err = strconv.ParseInt(lv[0], 0, 64) + sendStats := func(stats map[string]float64, tags map[string]string) { + for match, name := range m.matches { + var value float64 = 0 + if v, ok := stats[match]; ok { + value = v + } + y, err := lp.New(name, tags, m.meta, map[string]interface{}{"value": value}, time.Now()) + if err == nil { + output <- y + } } - } - - if _, exists := m.stats[`MemTotal`]; !exists { - err = errors.New("Parse error") - log.Print(err) - return - } - - for match, name := range m.matches { - if _, exists := m.stats[match]; !exists { - err = fmt.Errorf("Parse error for %s : %s", match, name) - log.Print(err) - continue - } - y, err := lp.New(name, m.tags, m.meta, map[string]interface{}{"value": int(float64(m.stats[match]) * 1.0e-3)}, time.Now()) - if err == nil { - output <- y - } - } - - if _, free := m.stats[`MemFree`]; free { - if _, buffers := m.stats[`Buffers`]; buffers { - if _, cached := m.stats[`Cached`]; cached { - memUsed := m.stats[`MemTotal`] - (m.stats[`MemFree`] + m.stats[`Buffers`] + m.stats[`Cached`]) - _, skip := stringArrayContains(m.config.ExcludeMetrics, "mem_used") - y, err := lp.New("mem_used", m.tags, m.meta, 
map[string]interface{}{"value": int(float64(memUsed) * 1.0e-3)}, time.Now()) - if err == nil && !skip { - output <- y + if _, skip := stringArrayContains(m.config.ExcludeMetrics, "mem_used"); !skip { + if freeVal, free := stats["MemFree"]; free { + if bufVal, buffers := stats["Buffers"]; buffers { + if cacheVal, cached := stats["Cached"]; cached { + memUsed := stats["MemTotal"] - (freeVal + bufVal + cacheVal) + y, err := lp.New("mem_used", tags, m.meta, map[string]interface{}{"value": memUsed}, time.Now()) + if err == nil { + output <- y + } + } } } } } - if _, found := m.stats[`MemShared`]; found { - _, skip := stringArrayContains(m.config.ExcludeMetrics, "mem_shared") - y, err := lp.New("mem_shared", m.tags, m.meta, map[string]interface{}{"value": int(float64(m.stats[`MemShared`]) * 1.0e-3)}, time.Now()) - if err == nil && !skip { - output <- y + + if m.config.NodeStats { + nodestats := getStats(MEMSTATFILE) + sendStats(nodestats, m.tags) + } + + if m.config.NumaStats { + for _, nodeConf := range m.nodefiles { + stats := getStats(nodeConf.file) + sendStats(stats, nodeConf.tags) } } }