From 9ae0806aa9d5f0b0f0552e929aeceacdc0706be5 Mon Sep 17 00:00:00 2001
From: Thomas Gruber
Date: Mon, 10 Oct 2022 12:18:52 +0200
Subject: [PATCH] Add collector for monitoring the execution of cc-metric-collector itself (#81)

* Add collector to monitor execution of cc-metric-collector itself

* Register SelfCollector

* Fix import paths for moved packages
---
 collectors/collectorManager.go |   1 +
 collectors/selfMetric.go       | 119 ++++++++++++++++++++++++++++
 collectors/selfMetric.md       |  34 ++++++++
 3 files changed, 154 insertions(+)
 create mode 100644 collectors/selfMetric.go
 create mode 100644 collectors/selfMetric.md

diff --git a/collectors/collectorManager.go b/collectors/collectorManager.go
index 3b11e78..ea648ef 100644
--- a/collectors/collectorManager.go
+++ b/collectors/collectorManager.go
@@ -37,6 +37,7 @@ var AvailableCollectors = map[string]MetricCollector{
 	"beegfs_meta": new(BeegfsMetaCollector),
 	"beegfs_storage": new(BeegfsStorageCollector),
 	"rocm_smi": new(RocmSmiCollector),
+	"self": new(SelfCollector),
 	"schedstat": new(SchedstatCollector),
 }
 
diff --git a/collectors/selfMetric.go b/collectors/selfMetric.go
new file mode 100644
index 0000000..4fc95c0
--- /dev/null
+++ b/collectors/selfMetric.go
@@ -0,0 +1,119 @@
+package collectors
+
+import (
+	"encoding/json"
+	"runtime"
+	"syscall"
+	"time"
+
+	cclog "github.com/ClusterCockpit/cc-metric-collector/pkg/ccLogger"
+	lp "github.com/ClusterCockpit/cc-metric-collector/pkg/ccMetric"
+)
+
+// SelfCollectorConfig selects which groups of metrics the SelfCollector reads.
+// Every group is disabled unless it is enabled in the JSON configuration.
+type SelfCollectorConfig struct {
+	MemStats   bool `json:"read_mem_stats"`
+	GoRoutines bool `json:"read_goroutines"`
+	CgoCalls   bool `json:"read_cgo_calls"`
+	Rusage     bool `json:"read_rusage"`
+}
+
+// SelfCollector reports metrics about the cc-metric-collector process itself,
+// read from the runtime and syscall packages.
+type SelfCollector struct {
+	metricCollector
+	config SelfCollectorConfig // the configuration structure
+	meta   map[string]string   // default meta information
+	tags   map[string]string   // default tags
+}
+
+// Init sets the collector name, the default tags and meta information, and
+// decodes the optional JSON configuration. A decoding error is logged and
+// returned, and the collector is then not marked as initialized.
+func (m *SelfCollector) Init(config json.RawMessage) error {
+	m.name = "SelfCollector"
+	m.setup()
+	m.parallel = true
+	m.meta = map[string]string{"source": m.name, "group": "Self"}
+	m.tags = map[string]string{"type": "node"}
+	if len(config) > 0 {
+		if err := json.Unmarshal(config, &m.config); err != nil {
+			cclog.ComponentError(m.name, "Error reading config:", err.Error())
+			return err
+		}
+	}
+	m.init = true
+	return nil
+}
+
+// sendMetric creates the metric name with the collector's default tags and
+// meta information and the given value, and sends it to output. A non-empty
+// unit is attached as "unit" meta information. If the metric cannot be
+// created, the error is logged and nothing is sent.
+func (m *SelfCollector) sendMetric(output chan lp.CCMetric, name string, value interface{}, unit string, timestamp time.Time) {
+	y, err := lp.New(name, m.tags, m.meta, map[string]interface{}{"value": value}, timestamp)
+	if err != nil {
+		cclog.ComponentError(m.name, "Error creating metric", name+":", err.Error())
+		return
+	}
+	if unit != "" {
+		y.AddMeta("unit", unit)
+	}
+	output <- y
+}
+
+// Read sends one metric per value of every enabled metric group to output.
+// All metrics of one call carry the same timestamp. The interval argument is
+// not used. Nothing is sent when the collector is not initialized.
+func (m *SelfCollector) Read(interval time.Duration, output chan lp.CCMetric) {
+	// Without a successful Init the default tags and meta may be unset.
+	if !m.init {
+		return
+	}
+	timestamp := time.Now()
+
+	if m.config.MemStats {
+		var memstats runtime.MemStats
+		runtime.ReadMemStats(&memstats)
+
+		m.sendMetric(output, "total_alloc", memstats.TotalAlloc, "Bytes", timestamp)
+		m.sendMetric(output, "heap_alloc", memstats.HeapAlloc, "Bytes", timestamp)
+		m.sendMetric(output, "heap_sys", memstats.HeapSys, "Bytes", timestamp)
+		m.sendMetric(output, "heap_idle", memstats.HeapIdle, "Bytes", timestamp)
+		m.sendMetric(output, "heap_inuse", memstats.HeapInuse, "Bytes", timestamp)
+		m.sendMetric(output, "heap_released", memstats.HeapReleased, "Bytes", timestamp)
+		m.sendMetric(output, "heap_objects", memstats.HeapObjects, "", timestamp)
+	}
+	if m.config.GoRoutines {
+		m.sendMetric(output, "num_goroutines", runtime.NumGoroutine(), "", timestamp)
+	}
+	if m.config.CgoCalls {
+		m.sendMetric(output, "num_cgo_calls", runtime.NumCgoCall(), "", timestamp)
+	}
+	if m.config.Rusage {
+		var rusage syscall.Rusage
+		if err := syscall.Getrusage(syscall.RUSAGE_SELF, &rusage); err != nil {
+			cclog.ComponentError(m.name, "Error reading resource usage:", err.Error())
+			return
+		}
+		// Timeval.Unix returns seconds and nanoseconds; combine them into
+		// fractional seconds.
+		seconds := func(tv syscall.Timeval) float64 {
+			sec, nsec := tv.Unix()
+			return float64(sec) + float64(nsec)*1e-9
+		}
+		m.sendMetric(output, "rusage_user_time", seconds(rusage.Utime), "seconds", timestamp)
+		m.sendMetric(output, "rusage_system_time", seconds(rusage.Stime), "seconds", timestamp)
+		m.sendMetric(output, "rusage_vol_ctx_switch", rusage.Nvcsw, "", timestamp)
+		m.sendMetric(output, "rusage_invol_ctx_switch", rusage.Nivcsw, "", timestamp)
+		m.sendMetric(output, "rusage_signals", rusage.Nsignals, "", timestamp)
+		m.sendMetric(output, "rusage_major_pgfaults", rusage.Majflt, "", timestamp)
+		m.sendMetric(output, "rusage_minor_pgfaults", rusage.Minflt, "", timestamp)
+	}
+}
+
+// Close marks the collector as no longer initialized.
+func (m *SelfCollector) Close() {
+	m.init = false
+}
diff --git a/collectors/selfMetric.md b/collectors/selfMetric.md
new file mode 100644
index 0000000..ab8e50b
--- /dev/null
+++ b/collectors/selfMetric.md
@@ -0,0 +1,34 @@
+## `self` collector
+
+```json
+  "self": {
+    "read_mem_stats" : true,
+    "read_goroutines" : true,
+    "read_cgo_calls" : true,
+    "read_rusage" : true
+  }
+```
+
+The `self` collector reads the data from the `runtime` and
`syscall` packages, and thus monitors the execution of cc-metric-collector itself.
+
+Metrics:
+* If `read_mem_stats == true`:
+  * `total_alloc`: The metric reports cumulative bytes allocated for heap objects.
+  * `heap_alloc`: The metric reports bytes of allocated heap objects.
+  * `heap_sys`: The metric reports bytes of heap memory obtained from the OS.
+  * `heap_idle`: The metric reports bytes in idle (unused) spans.
+  * `heap_inuse`: The metric reports bytes in in-use spans.
+  * `heap_released`: The metric reports bytes of physical memory returned to the OS.
+  * `heap_objects`: The metric reports the number of allocated heap objects.
+* If `read_goroutines == true`:
+  * `num_goroutines`: The metric reports the number of goroutines that currently exist.
+* If `read_cgo_calls == true`:
+  * `num_cgo_calls`: The metric reports the number of cgo calls made by the current process.
+* If `read_rusage == true`:
+  * `rusage_user_time`: The metric reports the amount of time that this process has been scheduled in user mode.
+  * `rusage_system_time`: The metric reports the amount of time that this process has been scheduled in kernel mode.
+  * `rusage_vol_ctx_switch`: The metric reports the number of voluntary context switches.
+  * `rusage_invol_ctx_switch`: The metric reports the number of involuntary context switches.
+  * `rusage_signals`: The metric reports the number of signals received (this counter is not maintained on Linux, where it is always 0).
+  * `rusage_major_pgfaults`: The metric reports the number of major faults the process has made which have required loading a memory page from disk.
+  * `rusage_minor_pgfaults`: The metric reports the number of minor faults the process has made which have not required loading a memory page from disk.